def build_corpus(fn: str, title_col: str, body_col: str, model: Language) -> List[Doc]:
    df = load_data(fn, title_col, body_col)
    # df_combined = df.assign(title_body=df[f"{title_col}"] + df[f"{body_col}"])
    # docs = list(model.pipe(content for content in df_combined["title_body"]))
    # for the moment, just use titles until we figure out data cleaning of summaries
    docs = list(model.pipe(content for content in df["title"]))
    return docs
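# Hypothetical usage sketch for build_corpus above (not part of the original code):
# the model name, file path, and column names are assumptions for illustration only.
def _build_corpus_example() -> None:
    import spacy

    nlp = spacy.load("en_core_web_sm")  # any installed spaCy pipeline would do
    corpus = build_corpus("issues.csv", "title", "body", nlp)
    print(f"{len(corpus)} docs; first title: {corpus[0].text if corpus else 'n/a'}")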
def test_issue2564():
    """Test the tagger sets is_tagged correctly when used via Language.pipe."""
    nlp = Language()
    tagger = nlp.create_pipe("tagger")
    tagger.begin_training()  # initialise weights
    nlp.add_pipe(tagger)
    doc = nlp("hello world")
    assert doc.is_tagged
    docs = nlp.pipe(["hello", "world"])
    piped_doc = next(docs)
    assert piped_doc.is_tagged
def evaluate_textcat(cfg: Config, nlp: Language, val_data) -> Dict:
    # TODO: https://github.com/explosion/spaCy/pull/4664
    texts, golds = zip(*val_data)
    try:
        y = np.array(list(map(lambda x: goldcat_to_label(x["cats"]), golds)))
        docs = list(nlp.pipe(texts, batch_size=cfg.nbatch * 2))
        preds = np.array([doc._.get(TOP_LABEL) for doc in docs])
    except Exception:
        report_fail(val_data)
        raise
    return classification_report(y, preds, output_dict=True)
def test_issue2564():
    """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe."""
    nlp = Language()
    tagger = nlp.add_pipe("tagger")
    tagger.add_label("A")
    nlp.initialize()
    doc = nlp("hello world")
    assert doc.has_annotation("TAG")
    docs = nlp.pipe(["hello", "world"])
    piped_doc = next(docs)
    assert piped_doc.has_annotation("TAG")
def test_list_of_docs_pickles_efficiently():
    nlp = Language()
    for i in range(10000):
        _ = nlp.vocab[unicode_(i)]  # noqa: F841
    one_pickled = pickle.dumps(nlp("0"), -1)
    docs = list(nlp.pipe(unicode_(i) for i in range(100)))
    many_pickled = pickle.dumps(docs, -1)
    assert len(many_pickled) < (len(one_pickled) * 2)
    many_unpickled = pickle.loads(many_pickled)
    assert many_unpickled[0].text == "0"
    assert many_unpickled[-1].text == "99"
    assert len(many_unpickled) == 100
def _process_content_bearing_samples(
    model: Language, samples_to_pipe: List[Tuple[int, Text]]
) -> List[Tuple[int, Doc]]:
    """Sends content bearing training samples to SpaCy's pipe."""
    docs = [
        (to_pipe_sample[0], doc)
        for to_pipe_sample, doc in zip(
            samples_to_pipe,
            [
                doc
                for doc in model.pipe(
                    [txt for _, txt in samples_to_pipe], batch_size=50
                )
            ],
        )
    ]
    return docs
def get_entities(
    lang: Language, rows: List[T], getVal: Callable[[T], str] = None
) -> Iterable[Iterable[Entity]]:
    res = list(
        lang.pipe(
            [(getVal(r) if getVal is not None else r) or "" for r in rows],
            n_process=4,
        )
    )
    return map(
        lambda r: [
            Entity(e.text.strip(), e.label_, e.start_char, e.end_char)
            for e in r.ents
            if e.label_ not in EXCLUDE_LABELS
        ],
        res,
    )
def test_doc_gc():
    # If the Doc object is garbage collected, the spans won't be functional afterwards
    nlp = Language()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    spancat.add_label("PERSON")
    nlp.initialize()
    texts = [
        "Just a sentence.",
        "I like London and Berlin",
        "I like Berlin",
        "I eat ham.",
    ]
    all_spans = [doc.spans for doc in nlp.pipe(texts)]
    for text, spangroups in zip(texts, all_spans):
        assert isinstance(spangroups, SpanGroups)
        for key, spangroup in spangroups.items():
            assert isinstance(spangroup, SpanGroup)
            assert len(spangroup) > 0
            with pytest.raises(RuntimeError):
                span = spangroup[0]
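# Related sketch (an assumption, not part of the test above): the test shows that
# span groups collected from garbage-collected Docs raise RuntimeError; keeping the
# Doc objects themselves alive avoids the problem.
def _keep_docs_alive_example(nlp: Language) -> None:
    docs = list(nlp.pipe(["I like London and Berlin", "I eat ham."]))
    for doc in docs:  # docs stay referenced, so their spans remain usable
        for key, group in doc.spans.items():
            for span in group:
                print(key, span.text)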
def get_entities(
    lang: Language, rows: List[T], getVal: Callable[[T], Union[str, None]] = None
) -> Iterable[Iterable[Entity]]:
    def get_cleaned_txt(r: T):
        val = (getVal(r) if getVal is not None else r) or ""
        return clean_text(val)

    res: List[Any] = list(
        lang.pipe([get_cleaned_txt(r) for r in rows], n_process=4)
    )
    return map(
        lambda r: [
            Entity(e.text.strip(), e.label_, e.start_char, e.end_char)
            for e in r.ents
            if e.label_ not in EXCLUDE_LABELS
        ],
        res,
    )
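# Hypothetical call of the get_entities variant above (the pipeline name and rows
# are assumptions; Entity, clean_text and EXCLUDE_LABELS come from the host module).
def _get_entities_example() -> None:
    import spacy

    nlp = spacy.load("en_core_web_sm")
    rows = ["Alice visited Berlin in March.", None, "Bob works for Acme Corp."]
    for entities in get_entities(nlp, rows):
        print(entities)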
def __init__(
    self,
    nlp: Language,
    bands: Iterable[MusicBand],
    destination: Optional[str] = None,
):
    """
    :param nlp: The nlp pipeline used to tokenize the phrases.
    :param bands: The bands to match against.
    :param destination: The document extension where the entities should be stored.
        When it is missing, the entities are stored in ``doc.ents``.
    """
    docs = nlp.pipe((band.name for band in bands))
    self._matcher = PhraseMatcher(nlp.vocab)
    self._matcher.add("music_band", docs)
    if destination and not Doc.has_extension(destination):
        Doc.set_extension(destination, default=None)
    self._destination = destination
def _make_spacy_doc_from_text_chunks(text: str, lang: Language, chunk_size: int) -> Doc:
    text_chunks = (text[i : i + chunk_size] for i in range(0, len(text), chunk_size))
    return Doc.from_docs(list(lang.pipe(text_chunks)))
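# Sketch of how the chunking helper above might be used for texts that exceed
# nlp.max_length (hedged: the blank pipeline and chunk size are assumptions).
def _chunked_doc_example() -> None:
    import spacy

    nlp = spacy.blank("en")
    very_long_text = "one two three four five. " * 100_000
    doc = _make_spacy_doc_from_text_chunks(very_long_text, nlp, chunk_size=100_000)
    print(len(doc), "tokens")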
def build_corpus(fn: str, title_col: str, model: Language) -> List[Doc]:
    df = load_data(fn, title_col)
    docs = list(model.pipe(title for title in df[f"{title_col}"]))
    return docs
def get_predictions(nlp: Language, docs: List[dict]):
    from collections import Counter

    ner = nlp.get_pipe('ner')
    parses = list(nlp.pipe([t['text'] for t in docs]))
    beams = [
        ner.beam_parse([x], beam_width=16)[0]
        for x in tqdm(parses, desc="Predicting labels...")
    ]
    results = []
    # print(type(docs), type(parses), type(beams))
    # print(len(docs), len(parses), len(beams))
    items = zip(docs, parses, beams)
    for document, parse, beam in items:
        text = document['text']
        # if parse.ents:
        #     print("Entities:", text, parse.ents)
        # else:
        #     print("No entities found:", text, parse.ents)
        entities = ner.moves.get_beam_annot(beam)
        words = Counter()
        start_end = {}
        for (estart, eend, etype), v in sorted(entities.items(), key=lambda x: (x[1], x[0])):
            etype_str = parse.vocab.strings[etype]
            if (estart, eend) in start_end:
                print("Removing completely overlapping entry:", (estart, eend, etype_str))
                continue
            words[estart, eend, etype_str] = v
            start_end[estart, eend] = True
        words_items = sorted(words.items(), key=lambda x: (-x[1], x[0]))
        labels = []
        predicts = []
        unsure = 0.001
        # print(repr(text))
        max_per_type = Counter()
        for (estart, eend, etype), escore in words_items:
            cstart = parse[estart].idx
            if eend == len(parse):
                cend = len(text)
            else:
                cend = parse[eend].idx
                # cend = parse[eend-1].idx + len(parse[eend].text)
            # print(cstart, cend, estart, eend, f"'{parse[estart:eend]}', '{text[cstart:cend]}'", escore)
            # assert parse[estart:eend].text.strip() == text[cstart:cend].strip()
            unsure += 0.5 - abs(escore - 0.5)
            if escore > 0.01:  # 0.4 <= escore:
                max_per_type[etype] += 1
                if max_per_type[etype] < 100:
                    labels.append((cstart, cend, etype))
                    predicts.append(
                        (cstart, cend, parse[estart:eend].text, etype, escore)
                    )
        results.append({
            'document': document,
            'labels': labels,
            'unsure': unsure / len(text),
            'predicts': predicts,
        })
    return results
def get_date_matches(nlp: Language, df: pd.DataFrame, parameters: Dict[str, Any]) -> pd.DataFrame:
    # TODO: regex to configfile
    pattern1 = [
        # "%m/%d/%Y", "%m/%d/%y" or "%d/%m/%Y", "%d/%m/%y"
        {
            "TEXT": {
                "REGEX": r"^(?:(1[0-2]|0?[1-9])[.\-\/]{1}(3[01]|[12][0-9]|0?[1-9]))[.\-\/]{1}(?:[0-9]{2})?[0-9]{2}$|^(?:(3[01]|[12][0-9]|0?[1-9])[.\-\/]{1}(1[0-2]|0?[1-9]))[.\-\/]{1}(?:[0-9]{2})?[0-9]{2}$"
            }
        }
    ]
    pattern2 = [
        # "%Y/%m/%d"
        {
            "TEXT": {
                "REGEX": r"^(?:[1-9]{1}[0-9]{3})[.\-\/]{1}(?:(1[0-2]|0?[1-9])[.\-\/]{1}(3[01]|[12][0-9]|0?[1-9]))$"
            }
        }
    ]
    months = r"(Jan(uar(y)?)?|Feb(ruar(y)?)?|Mar(ch)?|Mär(z)?|Apr(il)?|Ma(y|i)|Jun(e|i)?|Jul(y|i)?|Aug(ust)?|Sep(tember)?|O(c|k)t(ober)?|Nov(ember)?|De(c|z)(ember)?)"
    pattern3 = [
        # "%d-%B-%Y", "%d-%b-%Y" (20-Jun-2020, 20-June-2020)
        {
            "TEXT": {
                "REGEX": fr"^(?:(3[01]|[12][0-9]|0?[1-9])[.\-\/]{{1}}({months}))[.\-\/]{{1}}(?:[0-9]{{2}})?[0-9]{{2}}$"
            }
        }
    ]
    matcher = Matcher(nlp.vocab)
    matcher.add("Date: (__/__/yyyy _/_/yy)", None, pattern1)
    matcher.add("Date: (yyyy/mm/dd)", None, pattern2)
    matcher.add("Date: (dd-Mon-yyyy)", None, pattern3)

    def parse_date(string: str, lang: str) -> datetime:
        from dateparser import parse

        date = parse(string, languages=[lang])
        if not date:
            date = parse(string)
        return date

    def get_date_matches_from_text(doc: Doc) -> str:
        result = []
        all_dates = []
        for i, (match_id, start, end) in enumerate(matcher(doc)):
            match_id_str = nlp.vocab.strings[match_id]
            match_string = doc[start:end].text
            match_date = parse_date(match_string, doc._.language['language'])
            text_left = doc[max(0, start - parameters['n_lefts']):max(0, end - 1)].text
            text_right = doc[end:min(len(doc), end + parameters['n_rights'])].text
            result.append({
                'date_position': i,
                'match_id': match_id_str,
                'match_string': match_string,
                'match_date': match_date,
                'text_left': text_left,
                'text_right': text_right,
            })
            all_dates.append(match_date)
        # get order of dates
        date_order = {
            date: i for i, date in enumerate(np.sort(pd.unique(all_dates)))
        }
        for r in result:
            r['date_order'] = date_order[r['match_date']]
            r['match_date'] = r['match_date'].strftime('%Y-%m-%d')
            # also add total number of found dates
            r['n_match_dates'] = len(all_dates)
        return json.dumps(result)

    # find matching date strings
    matches = []
    language = []
    for d in nlp.pipe(df['pdf_text'], disable=['tagger', 'ner']):
        matches.append(get_date_matches_from_text(d))
        language.append(d._.language)
    df['matches'] = pd.Series(matches)
    df['language'] = pd.Series(language)
    return df