def parse_txt(
    parser: SpacyBISTParser,
    txt_path: Union[str, PathLike],
    out_dir: Union[str, PathLike] = None,
    show_tok=True,
    show_doc=True,
):
    """Parse raw documents in the form of lines in a text file.

    Args:
        parser (SpacyBISTParser)
        txt_path (str or PathLike)
        out_dir (str or PathLike): If specified, the output will also be written to this path.
        show_tok (bool, optional): Specifies whether to include token text in output.
        show_doc (bool, optional): Specifies whether to include document text in output.

    Yields:
        CoreNLPDoc: the annotated document.
    """
    with open(txt_path, encoding="utf-8") as f:
        if out_dir:
            print("Writing parsed documents to {}".format(out_dir))
        for i, doc_text in enumerate(tqdm(f, total=line_count(txt_path), file=sys.stdout)):
            parsed_doc = parser.parse(doc_text.rstrip("\n"), show_tok, show_doc)
            if out_dir:
                out_path = Path(out_dir) / (str(i + 1) + ".json")
                with open(out_path, "w", encoding="utf-8") as doc_file:
                    doc_file.write(parsed_doc.pretty_json())
            yield parsed_doc
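# A minimal usage sketch for parse_txt, assuming an input file "reviews.txt"
# with one document per line and an existing output directory "parsed" (both
# names are illustrative assumptions, not from the original source):
from nlp_architect.pipelines.spacy_bist import SpacyBISTParser

parser = SpacyBISTParser()
for parsed_doc in parse_txt(parser, "reviews.txt", out_dir="parsed"):
    print(parsed_doc.pretty_json())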
def __init__(
    self,
    parse: bool = True,
    rerank_model: PathLike = None,
    asp_thresh: int = 3,
    op_thresh: int = 2,
    max_iter: int = 3,
):
    self.acquire_lexicon = AcquireTerms(asp_thresh, op_thresh, max_iter)
    if parse:
        from nlp_architect.pipelines.spacy_bist import SpacyBISTParser

        self.parser = SpacyBISTParser()
    else:
        self.parser = None

    if not rerank_model:
        print("using pre-trained reranking model")
        rerank_model = _download_pretrained_rerank_model(RERANK_MODEL_DEFAULT_PATH)

    download_unzip(*EMBEDDING_URL, EMBEDDING_PATH, license_msg="Glove word embeddings.")
    self.rerank = RerankTerms(
        vector_cache=True, rerank_model=rerank_model, emb_model_path=EMBEDDING_PATH
    )
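# A hedged construction sketch. TrainSentiment is assumed to be the enclosing
# class of the __init__ above (nlp-architect's ABSA training flow); the
# argument values shown are simply the defaults:
trainer = TrainSentiment(parse=True, rerank_model=None, asp_thresh=3, op_thresh=2, max_iter=3)
# trainer.parser now wraps SpacyBISTParser, and trainer.rerank holds the
# pre-trained reranking model downloaded during construction.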
def __init__(
    self,
    aspect_lex: Union[str, PathLike],
    opinion_lex: Union[str, PathLike, dict],
    parse: bool = True,
):
    """Inits SentimentInference with given aspect and opinion lexicons."""
    INFERENCE_OUT.mkdir(parents=True, exist_ok=True)
    self.opinion_lex = (
        opinion_lex if type(opinion_lex) is dict else load_opinion_lex(Path(opinion_lex))
    )
    self.aspect_lex = _load_aspect_lexicon(Path(aspect_lex))
    self.intensifier_lex = _read_lexicon_from_csv("IntensifiersLex.csv")
    self.negation_lex = _read_lexicon_from_csv("NegationSentLex.csv")
    if parse:
        from nlp_architect.pipelines.spacy_bist import SpacyBISTParser

        self.parser = SpacyBISTParser(spacy_model="en")
    else:
        self.parser = None
def run(
    self,
    aspect_lex: PathLike = None,
    opinion_lex: PathLike = None,
    data: PathLike = None,
    parsed_data: PathLike = None,
    inference_results: PathLike = None,
) -> Optional[pd.DataFrame]:
    opinions = load_opinion_lex(opinion_lex)
    if not opinions:
        raise ValueError("Empty opinion lexicon!")
    aspects = pd.read_csv(aspect_lex, header=None, encoding="utf-8")[0]
    if aspects.empty:
        raise ValueError("Empty aspect lexicon!")

    if inference_results:
        with open(inference_results, encoding="utf-8") as f:
            results = json.loads(f.read(), object_hook=SentimentDoc.decoder)
    elif data or parsed_data:
        inference = SentimentInference(aspect_lex, opinions, parse=False)
        parse = None
        if not parsed_data:  # source data is raw text, need to parse
            from nlp_architect.pipelines.spacy_bist import SpacyBISTParser

            parse = SpacyBISTParser().parse
        results = {}
        print("Running inference on data files... (Iterating data files)")
        data_source = parsed_data if parsed_data else data
        for file, doc in self._iterate_docs(data_source):
            parsed_doc = parse(doc) if parse else json.loads(doc, object_hook=CoreNLPDoc.decoder)
            sentiment_doc = inference.run(parsed_doc=parsed_doc)
            if sentiment_doc:
                results[file] = sentiment_doc
        with open(SENTIMENT_OUT / "inference_results.json", "w", encoding="utf-8") as f:
            json.dump(results, f, cls=SentimentDocEncoder, indent=4, sort_keys=True)
    else:
        print(
            "No input given. Please supply one of: "
            "data directory, parsed data directory, or inference results."
        )
        return None

    print("\nComputing statistics...")
    stats = self._compute_stats(results, aspects, opinions)
    print("Done.")
    return stats
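# A usage sketch for run(), assuming the enclosing class is the ABSA solution
# wrapper (e.g. SentimentSolution) and that the lexicon and data paths exist;
# all three file names below are illustrative assumptions:
solution = SentimentSolution()
stats = solution.run(
    aspect_lex="generated_aspect_lex.csv",
    opinion_lex="generated_opinion_lex_reranked.csv",
    data="data/raw_reviews",
)
print(stats)  # a pandas DataFrame of per-aspect statistics, or None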
class Fixtures:
    default_parser = SpacyBISTParser()

    ptb_pos_tags = {
        "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD",
        "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR",
        "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP",
        "VBZ", "WDT", "WP", "WP$", "WRB",
    }

    token_label_types = {
        "start": int,
        "len": int,
        "pos": str,
        "ner": str,
        "lemma": str,
        "gov": int,
        "rel": str,
    }
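# A hedged sketch of a test exercising these fixtures: parse a sample sentence
# and verify each token dict carries the expected label types and a valid PTB
# POS tag (the sample sentence is an illustrative assumption):
def test_token_labels_well_formed():
    parsed = Fixtures.default_parser.parse("The quick brown fox jumps over the lazy dog.")
    for sentence in parsed.sentences:
        for token in sentence:
            for label, label_type in Fixtures.token_label_types.items():
                assert isinstance(token[label], label_type)
            assert token["pos"] in Fixtures.ptb_pos_tags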
def __init__(
    self,
    aspect_lex: Union[str, PathLike],
    opinion_lex: Union[str, PathLike, dict],
    parse: bool = True,
    parser="spacy",
    spacy_model="en_core_web_sm",
):
    """Inits SentimentInference with given aspect and opinion lexicons."""
    INFERENCE_OUT.mkdir(parents=True, exist_ok=True)
    self.opinion_lex = (
        opinion_lex if type(opinion_lex) is dict else load_opinion_lex(Path(opinion_lex))
    )
    self.aspect_lex = _load_aspect_lexicon(Path(aspect_lex))
    self.intensifier_lex = _read_lexicon_from_csv("IntensifiersLex.csv")
    self.negation_lex = _read_lexicon_from_csv("NegationSentLex.csv")
    self.parser_name = parser

    if parse:
        if parser == "bist":
            from nlp_architect.pipelines.spacy_bist import SpacyBISTParser

            self.parser = SpacyBISTParser(spacy_model=spacy_model)
        elif parser == "spacy":
            from nlp_architect.utils.text import SpacyInstance

            disable = [
                "merge_noun_chunks",
                "ner",
                "entity_linker",
                "textcat",
                "entity_ruler",
                "sentencizer",
                "merge_entities",
            ]
            self.parser = SpacyInstance(
                model=spacy_model, disable=disable, ptb_pos=True, n_jobs=1
            )
    else:
        self.parser = None
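# A hedged construction sketch showing the two parser backends this __init__
# supports (the lexicon file names are illustrative assumptions):
inference = SentimentInference(
    aspect_lex="generated_aspect_lex.csv",
    opinion_lex="generated_opinion_lex_reranked.csv",
    parse=True,
    parser="bist",  # "spacy" selects the lighter SpacyInstance backend instead
    spacy_model="en_core_web_sm",
)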
class BistParserApi(AbstractApi):
    """Bist Parser API."""

    def __init__(self):
        self.model = None

    def load_model(self):
        """Load SpacyBISTParser model."""
        self.model = SpacyBISTParser()

    def inference(self, doc):
        """Parse according to SpacyBISTParser's model.

        Args:
            doc (str): the doc str

        Returns:
            CoreNLPDoc: the parser's response hosted in CoreNLPDoc object
        """
        return self.model.parse(doc)
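# A minimal usage sketch for the API wrapper:
api = BistParserApi()
api.load_model()  # instantiates SpacyBISTParser (may download pretrained models on first use)
core_nlp_doc = api.inference("The quick brown fox jumps over the lazy dog.")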
class SentimentInference(object):
    """Main class for sentiment inference execution.

    Attributes:
        opinion_lex: Opinion lexicon as output by TrainSentiment module.
        aspect_lex: Aspect lexicon as output by TrainSentiment module.
        intensifier_lex (dict): Pre-defined intensifier lexicon.
        negation_lex (dict): Pre-defined negation lexicon.
    """

    def __init__(
        self, aspect_lex: PathLike, opinion_lex: Union[PathLike, dict], parse: bool = True
    ):
        """Inits SentimentInference with given aspect and opinion lexicons."""
        INFERENCE_OUT.mkdir(parents=True, exist_ok=True)
        self.opinion_lex = (
            opinion_lex if type(opinion_lex) is dict else load_opinion_lex(opinion_lex)
        )
        self.aspect_lex = _load_aspect_lexicon(aspect_lex)
        self.intensifier_lex = _read_lexicon_from_csv('IntensifiersLex.csv')
        self.negation_lex = _read_lexicon_from_csv('NegationSentLex.csv')
        if parse:
            from nlp_architect.pipelines.spacy_bist import SpacyBISTParser

            self.parser = SpacyBISTParser()
        else:
            self.parser = None

    def run(self, doc: str = None, parsed_doc: CoreNLPDoc = None) -> SentimentDoc:
        """Run SentimentInference on a single document.

        Returns:
            The sentiment annotated document, which contains the detected events per sentence.
        """
        if not parsed_doc:
            if not self.parser:
                raise RuntimeError("Parser not initialized (try parse=True at init)")
            parsed_doc = self.parser.parse(doc)

        sentiment_doc = None
        for sentence in parsed_doc.sentences:
            events = []
            scores = []
            for aspect_row in self.aspect_lex:
                _, asp_events = self._extract_event(aspect_row, sentence)
                for asp_event in asp_events:
                    events.append(asp_event)
                    scores += [term.score for term in asp_event if term.type == TermType.ASPECT]

            if events:
                if not sentiment_doc:
                    sentiment_doc = SentimentDoc(parsed_doc.doc_text)
                sentiment_doc.sentences.append(
                    SentimentSentence(
                        sentence[0]['start'],
                        sentence[-1]['start'] + sentence[-1]['len'] - 1,
                        events,
                    )
                )
        return sentiment_doc

    def _extract_intensifier_terms(self, toks, sentiment_index, polarity, sentence):
        """Extract intensifier events from sentence."""
        count = 0
        terms = []
        for intens_i, intens in [(i, x) for i, x in enumerate(toks) if x in self.intensifier_lex]:
            if math.fabs(sentiment_index - intens_i) == 1:
                score = self.intensifier_lex[intens].score
                terms.append(
                    Term(
                        intens,
                        TermType.INTENSIFIER,
                        polarity,
                        score,
                        sentence[intens_i]['start'],
                        sentence[intens_i]['len'],
                    )
                )
                count += abs(score + float(INTENSIFIER_FACTOR))
        return count if count != 0 else 1, terms

    def _extract_neg_terms(self, toks: list, op_i: int, sentence: list) -> tuple:
        """Extract negation terms from sentence.

        Args:
            toks: Sentence text broken down to tokens (words).
            op_i: Index of opinion term in sentence.
            sentence: Parsed sentence.

        Returns:
            List of negation terms and their aggregated sign (positive or negative).
        """
        sign = 1
        terms = []
        gov_op_i = sentence[op_i]['gov']
        dep_op_indices = [sentence.index(x) for x in sentence if x['gov'] == op_i]
        for neg_i, negation in [(i, x) for i, x in enumerate(toks) if x in self.negation_lex]:
            position = self.negation_lex[negation].position
            dist = op_i - neg_i
            before = position == 'before' and (dist == 1 or neg_i in dep_op_indices)
            after = position == 'after' and (dist == -1 or neg_i == gov_op_i)
            both = position == 'both' and dist in (1, -1)
            if before or after or both:
                terms.append(
                    Term(
                        negation,
                        TermType.NEGATION,
                        Polarity.NEG,
                        self.negation_lex[negation].score,
                        sentence[toks.index(negation)]['start'],
                        sentence[toks.index(negation)]['len'],
                    )
                )
                sign *= self.negation_lex[negation].score
        return terms, sign

    def _extract_event(self, aspect_row: LexiconElement, parsed_sentence: list) -> tuple:
        """Extract opinion and aspect terms from sentence."""
        event = []
        sent_aspect_pair = None
        real_aspect_indices = _consolidate_aspects(aspect_row.term, parsed_sentence)
        aspect_key = aspect_row.term[0]
        for aspect_index_range in real_aspect_indices:
            for word_index in aspect_index_range:
                sent_aspect_pair, event = self._detect_opinion_aspect_events(
                    word_index, parsed_sentence, aspect_key, aspect_index_range
                )
                if sent_aspect_pair:
                    break
        return sent_aspect_pair, event

    @staticmethod
    def _modify_for_multiple_word(cur_tkn, parsed_sentence, index_range):
        """Modify multiple-word aspect token length and start index.

        Args:
            index_range: The index range of the multi-word aspect.

        Returns:
            The modified aspect token.
        """
        if len(index_range) >= 2:
            cur_tkn["start"] = parsed_sentence[index_range[0]]["start"]
            cur_tkn["len"] = len(parsed_sentence[index_range[0]]["text"])
            for i in index_range[1:]:
                cur_tkn["len"] = int(cur_tkn["len"]) + len(parsed_sentence[i]["text"]) + 1
        return cur_tkn

    def _detect_opinion_aspect_events(self, aspect_index, parsed_sent, aspect_key, index_range):
        """Extract opinion-aspect events from sentence.

        Args:
            aspect_index: Index of aspect in sentence.
            parsed_sent: Current sentence parse tree.
            aspect_key: Main aspect term, serves as key in aspect dict.
            index_range: The index range of the multi-word aspect.

        Returns:
            List of aspect-sentiment pairs, and list of events extracted.
        """
        all_pairs, events = [], []
        sentence_text_list = [x["text"] for x in parsed_sent]
        sentence_text = ' '.join(sentence_text_list)
        cur_aspect = parsed_sent[aspect_index]["text"]
        for tok in parsed_sent:
            aspect_op_pair = []
            terms = []
            pos = tok['pos']
            gov_i = tok['gov']
            gov = parsed_sent[gov_i]
            gov_text = gov['text']
            tok_text = tok['text']

            # is cur_tkn an aspect and gov an opinion?
            if tok_text.lower() == cur_aspect.lower() and parsed_sent.index(tok) == aspect_index \
                    and gov_text.lower() in self.opinion_lex and gov['pos'] not in VERB_POS:
                aspect_op_pair.append(
                    (self._modify_for_multiple_word(tok, parsed_sent, index_range), gov)
                )

            # is gov an aspect and cur_tkn an opinion?
            if gov_text.lower() == cur_aspect.lower() and gov_i == aspect_index \
                    and tok_text.lower() in self.opinion_lex and pos not in VERB_POS:
                aspect_op_pair.append(
                    (self._modify_for_multiple_word(gov, parsed_sent, index_range), tok)
                )

            # if an aspect token was found
            for aspect, opinion in aspect_op_pair:
                op_tok_i = parsed_sent.index(opinion)
                score = self.opinion_lex[opinion['text'].lower()].score
                neg_terms, sign = self._extract_neg_terms(
                    sentence_text_list, op_tok_i, parsed_sent
                )
                polarity = Polarity.POS if score * sign > 0 else Polarity.NEG
                intensifier_score, intensifier_terms = self._extract_intensifier_terms(
                    sentence_text_list, op_tok_i, polarity, parsed_sent
                )
                over_all_score = score * sign * intensifier_score
                terms.append(
                    Term(
                        aspect_key,
                        TermType.ASPECT,
                        polarity,
                        over_all_score,
                        aspect['start'],
                        aspect['len'],
                    )
                )
                terms.append(
                    Term(
                        opinion['text'],
                        TermType.OPINION,
                        polarity,
                        over_all_score,
                        opinion['start'],
                        opinion['len'],
                    )
                )
                if len(neg_terms) > 0:
                    terms = terms + neg_terms
                if len(intensifier_terms) > 0:
                    terms = terms + intensifier_terms
                all_pairs.append(
                    [aspect_key, opinion['text'], over_all_score, polarity, sentence_text]
                )
                events.append(terms)
        return all_pairs, events
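# A minimal end-to-end sketch for SentimentInference. The lexicon paths are
# illustrative assumptions, run() returns None when no opinion-aspect events
# are detected, and the accessors used below (.events, .text, .type,
# .polarity) are assumed from how SentimentSentence and Term are used above:
inference = SentimentInference("aspects.csv", "opinions.csv", parse=True)
sentiment_doc = inference.run(doc="The pizza was delicious but the service was awfully slow.")
if sentiment_doc:
    for sentence in sentiment_doc.sentences:
        for event in sentence.events:
            print([(term.text, term.type, term.polarity) for term in event])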
import datetime
import logging

import spacy
import neuralcoref

from nlp_architect.pipelines.spacy_bist import SpacyBISTParser

logging.basicConfig(level=logging.INFO)

nlp = spacy.load('en_core_web_sm')
# NOTE: SpacyBISTParser is a standalone text-to-CoreNLPDoc parser, not a spaCy
# pipeline component; replace_pipe expects a callable that takes and returns a
# spaCy Doc, so this call will fail at runtime unless the parser is wrapped.
nlp.replace_pipe("parser", SpacyBISTParser())
neuralcoref.add_to_pipe(nlp)

print('processing', datetime.datetime.utcnow())
sentence = (
    'However in recent times attempts at systematizing this relationship is imposed by the '
    'convergence brought about by technological change or data revolution which has enabled '
    'use of better observation devices that can be in citizen’s hands.'
)
doc = nlp(' '.join([sentence] * 3))  # the sample text repeats the same sentence three times
print('done', datetime.datetime.utcnow())

doc.to_json()
doc._.coref_clusters