Example #1
def build_corpus(fn: str, title_col: str, body_col: str,
                 model: Language) -> List[Doc]:

    df = load_data(fn, title_col, body_col)
    # df_combined = df.assign(title_body=df[f"{title_col}"] + df[f"{body_col}"])
    # docs = list(model.pipe(content for content in df_combined["title_body"]))

    # for the moment, just use titles until we figure out data cleaning of summaries
    docs = list(model.pipe(content for content in df[title_col]))
    return docs
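A minimal usage sketch, assuming load_data reads a CSV containing the named columns and that some installed spaCy pipeline is available; the file name, column names and pipeline name are placeholders:

import spacy

nlp = spacy.load("en_core_web_sm")  # assumption: any installed pipeline works here
corpus = build_corpus("issues.csv", title_col="title", body_col="body", model=nlp)
print(len(corpus), corpus[0].text)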
Example #2
def test_issue2564():
    """Test the tagger sets is_tagged correctly when used via Language.pipe."""
    nlp = Language()
    tagger = nlp.create_pipe("tagger")
    tagger.begin_training()  # initialise weights
    nlp.add_pipe(tagger)
    doc = nlp("hello world")
    assert doc.is_tagged
    docs = nlp.pipe(["hello", "world"])
    piped_doc = next(docs)
    assert piped_doc.is_tagged
Example #3
def evaluate_textcat(cfg: Config, nlp: Language, val_data) -> Dict:
    # TODO: https://github.com/explosion/spaCy/pull/4664
    texts, golds = zip(*val_data)
    try:
        y = np.array(list(map(lambda x: goldcat_to_label(x["cats"]), golds)))
        docs = list(nlp.pipe(texts, batch_size=cfg.nbatch * 2))
        preds = np.array([doc._.get(TOP_LABEL) for doc in docs])
    except Exception:
        report_fail(val_data)
        raise
    return classification_report(y, preds, output_dict=True)
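Config, goldcat_to_label, TOP_LABEL and report_fail are project-specific. The sketch below only illustrates the shape of val_data the function expects, i.e. (text, gold) pairs where gold carries a cats mapping; the labels and values are made up:

# Illustrative shape only; label names and scores are placeholders.
val_data = [
    ("great movie, loved it", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    ("terrible plot and acting", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]
texts, golds = zip(*val_data)  # the first thing evaluate_textcat does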
Example #4
def test_issue2564():
    """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe."""
    nlp = Language()
    tagger = nlp.add_pipe("tagger")
    tagger.add_label("A")
    nlp.initialize()
    doc = nlp("hello world")
    assert doc.has_annotation("TAG")
    docs = nlp.pipe(["hello", "world"])
    piped_doc = next(docs)
    assert piped_doc.has_annotation("TAG")
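Examples #2 and #4 are the same regression test before and after spaCy v3's API changes (create_pipe/begin_training/is_tagged versus add_pipe(name)/initialize/has_annotation). A short sketch of the piped path under v3, assuming the nlp object built in the test above:

# nlp.pipe yields Docs lazily; each one should carry TAG annotation from the tagger.
for doc in nlp.pipe(["hello", "world"], batch_size=2):
    print(doc.text, doc.has_annotation("TAG"), [t.tag_ for t in doc])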
Example #5
def test_list_of_docs_pickles_efficiently():
    nlp = Language()
    for i in range(10000):
        _ = nlp.vocab[unicode_(i)]  # noqa: F841
    one_pickled = pickle.dumps(nlp("0"), -1)
    docs = list(nlp.pipe(unicode_(i) for i in range(100)))
    many_pickled = pickle.dumps(docs, -1)
    assert len(many_pickled) < (len(one_pickled) * 2)
    many_unpickled = pickle.loads(many_pickled)
    assert many_unpickled[0].text == "0"
    assert many_unpickled[-1].text == "99"
    assert len(many_unpickled) == 100
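The size assertion holds because all 100 Docs share a single Vocab and pickle memoizes shared objects, so the (deliberately inflated) Vocab is stored only once in the joint pickle. A sketch of the contrast, reusing the docs list from the test above:

# Pickling Docs one by one repeats the shared Vocab in every payload; pickling the
# whole list stores it once, which is what the test's size bound relies on.
separate = sum(len(pickle.dumps(d, -1)) for d in docs)
together = len(pickle.dumps(docs, -1))
print(separate, together)  # 'together' is expected to be far smaller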
Example #6
def _process_content_bearing_samples(
        model: Language,
        samples_to_pipe: List[Tuple[int, Text]]) -> List[Tuple[int, Doc]]:
    """Sends content-bearing training samples to spaCy's pipe."""
    # Pipe only the texts, then pair each resulting Doc with its sample's index.
    docs = [(to_pipe_sample[0], doc) for to_pipe_sample, doc in zip(
        samples_to_pipe,
        model.pipe([txt for _, txt in samples_to_pipe], batch_size=50),
    )]
    return docs
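A hypothetical call, assuming a blank English pipeline; the indices and texts are placeholders:

import spacy

nlp = spacy.blank("en")
samples = [(0, "book a table for two"), (3, "play some jazz")]
indexed_docs = _process_content_bearing_samples(nlp, samples)
print([(idx, doc.text) for idx, doc in indexed_docs])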
Example #7
def get_entities(
        lang: Language,
        rows: List[T],
        getVal: Optional[Callable[[T], str]] = None) -> Iterable[Iterable[Entity]]:
    res = list(
        lang.pipe([(getVal(r) if getVal is not None else r) or ""
                   for r in rows],
                  n_process=4))
    return map(
        lambda r: [
            Entity(e.text.strip(), e.label_, e.start_char, e.end_char)
            for e in r.ents if e.label_ not in EXCLUDE_LABELS
        ], res)
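Entity, T and EXCLUDE_LABELS are defined elsewhere in the project. A hypothetical call, assuming a pipeline with an NER component; note that n_process=4 starts worker processes, so this is best run under an if __name__ == "__main__" guard:

import spacy

nlp = spacy.load("en_core_web_sm")  # assumption: any pipeline that sets doc.ents
rows = ["Apple opened a new store in Berlin.", "Nothing of interest here."]
for ents in get_entities(nlp, rows):
    print(ents)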
Example #8
def test_doc_gc():
    # If the Doc object is garbage collected, the spans won't be functional afterwards
    nlp = Language()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    spancat.add_label("PERSON")
    nlp.initialize()
    texts = ["Just a sentence.", "I like London and Berlin", "I like Berlin", "I eat ham."]
    all_spans = [doc.spans for doc in nlp.pipe(texts)]
    for text, spangroups in zip(texts, all_spans):
        assert isinstance(spangroups, SpanGroups)
        for key, spangroup in spangroups.items():
            assert isinstance(spangroup, SpanGroup)
            assert len(spangroup) > 0
            with pytest.raises(RuntimeError):
                span = spangroup[0]
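The RuntimeError is the expected outcome here: a SpanGroup keeps only a weak reference to its Doc, and the list comprehension above keeps the span groups while letting each Doc be garbage collected. A minimal sketch of the safe pattern, reusing nlp and texts from the test above and keeping the Docs alive:

docs = list(nlp.pipe(texts))  # holding the Docs keeps their span groups usable
for doc in docs:
    for key, spangroup in doc.spans.items():
        print(key, [span.text for span in spangroup])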
Example #9
def get_entities(
    lang: Language,
    rows: List[T],
    getVal: Optional[Callable[[T], Union[str, None]]] = None) -> Iterable[Iterable[Entity]]:
    def get_cleaned_txt(r: T):
        val = (getVal(r) if getVal is not None else r) or ""
        return clean_text(val)

    res: List[Any] = list(
        lang.pipe([get_cleaned_txt(r) for r in rows], n_process=4))
    return map(
        lambda r: [
            Entity(e.text.strip(), e.label_, e.start_char, e.end_char)
            for e in r.ents if e.label_ not in EXCLUDE_LABELS
        ], res)
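clean_text is project-specific; a hypothetical stand-in for experimenting with this function might simply normalize whitespace (this is not the project's implementation):

def clean_text(text: str) -> str:
    # Placeholder: collapse runs of whitespace only.
    return " ".join(text.split())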
Example #10
    def __init__(
        self,
        nlp: Language,
        bands: Iterable[MusicBand],
        destination: Optional[str] = None,
    ):
        """
        :param nlp: The nlp pipeline used to tokenize the phrases.
        :param bands: The bands to match against.
        :param destination: The document extension where the entities
            should be stored.  When it is missing, the entities are
            stored in ``doc.ents``.
        """
        docs = nlp.pipe((band.name for band in bands))
        self._matcher = PhraseMatcher(nlp.vocab)
        self._matcher.add("music_band", docs)

        if destination and not Doc.has_extension(destination):
            Doc.set_extension(destination, default=None)
        self._destination = destination
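The same pattern outside the class, for illustration; the band names and example sentence are placeholders:

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
matcher.add("music_band", list(nlp.pipe(["Radiohead", "Daft Punk"])))
doc = nlp("I saw Radiohead live in Berlin.")
print([(nlp.vocab.strings[match_id], doc[start:end].text)
       for match_id, start, end in matcher(doc)])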
Example #11
def _make_spacy_doc_from_text_chunks(text: str, lang: Language, chunk_size: int) -> Doc:
    text_chunks = (text[i : i + chunk_size] for i in range(0, len(text), chunk_size))
    return Doc.from_docs(list(lang.pipe(text_chunks)))
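A short usage sketch, assuming a blank pipeline. Note that fixed-size character chunks can split a token at a chunk boundary, which this helper presumably accepts in exchange for keeping each piped text small:

import spacy

nlp = spacy.blank("en")
doc = _make_spacy_doc_from_text_chunks("one two three four five", nlp, chunk_size=10)
print(doc.text, len(doc))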
Example #12
def build_corpus(fn: str, title_col: str, model: Language) -> List[Doc]:

    df = load_data(fn, title_col)
    docs = list(model.pipe(df[title_col]))

    return docs
Example #13
def get_predictions(nlp: Language, docs: List[dict]):
    from collections import Counter
    ner = nlp.get_pipe('ner')
    parses = list(nlp.pipe([t['text'] for t in docs]))
    beams = [
        ner.beam_parse([x], beam_width=16)[0]
        for x in tqdm(parses, desc="Predicting labels...")
    ]

    results = []
    # print(type(docs), type(parses), type(beams))
    # print(len(docs), len(parses), len(beams))
    items = zip(docs, parses, beams)
    for document, parse, beam in items:
        text = document['text']
        # if parse.ents:
        #     print("Entities:", text, parse.ents)
        # else:
        #     print("No entities found:", text, parse.ents)
        entities = ner.moves.get_beam_annot(beam)
        words = Counter()
        start_end = {}
        for (estart, eend, etype), v in sorted(entities.items(),
                                               key=lambda x: (x[1], x[0])):
            etype_str = parse.vocab.strings[etype]
            if (estart, eend) in start_end:
                print("Removing completely overlapping entry:",
                      (estart, eend, etype_str))
                continue
            words[estart, eend, etype_str] = v
            start_end[estart, eend] = True

        words_items = sorted(words.items(), key=lambda x: (-x[1], x[0]))
        labels = []
        predicts = []
        unsure = 0.001
        # print(repr(text))
        max_per_type = Counter()
        for (estart, eend, etype), escore in words_items:
            cstart = parse[estart].idx
            if eend == len(parse):
                cend = len(text)
            else:
                cend = parse[eend].idx
                # cend = parse[eend-1].idx + len(parse[eend].text)
            # print(cstart, cend, estart, eend, f"'{parse[estart:eend]}', '{text[cstart:cend]}'", escore)
            # assert parse[estart:eend].text.strip() == text[cstart:cend].strip()
            unsure += 0.5 - abs(escore - 0.5)
            if escore > 0.01:  # 0.4 <= escore:
                max_per_type[etype] += 1
                if max_per_type[etype] < 100:
                    labels.append((cstart, cend, etype))
                predicts.append(
                    (cstart, cend, parse[estart:eend].text, etype, escore))

        results.append({
            'document': document,
            'labels': labels,
            'unsure': unsure / len(text),
            'predicts': predicts,
        })

    return results
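A hypothetical call; get_predictions relies on the older beam API of the ner pipe (beam_parse and moves.get_beam_annot), so it assumes a pipeline from that era. The model name and document below are placeholders:

import spacy

nlp = spacy.load("en_core_web_sm")  # assumption: a pipeline exposing the beam NER API
documents = [{"text": "Barack Obama visited Paris in 2015."}]
results = get_predictions(nlp, documents)
print(results[0]["labels"], results[0]["unsure"])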
Example #14
def get_date_matches(nlp: Language, df: pd.DataFrame,
                     parameters: Dict[str, Any]) -> pd.DataFrame:
    # TODO: regex to configfile
    pattern1 = [  # "%m/%d/%Y", "%m/%d/%y" or "%d/%m/%Y", "%d/%m/%y"
        {
            "TEXT": {
                "REGEX":
                r"^(?:(1[0-2]|0?[1-9])[.\-\/]{1}(3[01]|[12][0-9]|0?[1-9]))[.\-\/]{1}(?:[0-9]{2})?[0-9]{2}$|^(?:(3[01]|[12][0-9]|0?[1-9])[.\-\/]{1}(1[0-2]|0?[1-9]))[.\-\/]{1}(?:[0-9]{2})?[0-9]{2}$"
            }
        }
    ]
    pattern2 = [  # "%Y/%m/%d"
        {
            "TEXT": {
                "REGEX":
                r"^(?:[1-9]{1}[0-9]{3})[.\-\/]{1}(?:(1[0-2]|0?[1-9])[.\-\/]{1}(3[01]|[12][0-9]|0?[1-9]))$"
            }
        }
    ]
    months = r"(Jan(uar(y)?)?|Feb(ruar(y)?)?|Mar(ch)?|Mär(z)?|Apr(il)?|Ma(y|i)|Jun(e|i)?|Jul(y|i)?|Aug(ust)?|Sep(tember)?|O(c|k)t(ober)?|Nov(ember)?|De(c|z)(ember)?)"
    pattern3 = [  # "%d-%B-%Y", "d-%b-%Y" (20-Jun-2020, 20-June-2020)
        {
            "TEXT": {
                "REGEX":
                fr"^(?:(3[01]|[12][0-9]|0?[1-9])[.\-\/]{{1}}({months}))[.\-\/]{{1}}(?:[0-9]{{2}})?[0-9]{{2}}$"
            }
        }
    ]

    matcher = Matcher(nlp.vocab)

    matcher.add("Date: (__/__/yyyy _/_/yy)", None, pattern1)
    matcher.add("Date: (yyyy/mm/dd)", None, pattern2)
    matcher.add("Date: (dd-Mon-yyyy)", None, pattern3)

    def parse_date(string: str, lang: str) -> datetime:
        from dateparser import parse
        date = parse(string, languages=[lang])
        if not date:
            date = parse(string)
        return date

    def get_date_matches_from_text(doc: Doc) -> str:
        result = []
        all_dates = []
        for i, (match_id, start, end) in enumerate(matcher(doc)):
            match_id_str = nlp.vocab.strings[match_id]
            match_string = doc[start:end].text
            match_date = parse_date(match_string, doc._.language['language'])
            text_left = doc[max(0, start - parameters['n_lefts']):max(0, end - 1)].text
            text_right = doc[end:min(len(doc), end + parameters['n_rights'])].text
            result.append({
                'date_position': i,
                'match_id': match_id_str,
                'match_string': match_string,
                'match_date': match_date,
                'text_left': text_left,
                'text_right': text_right,
            })
            all_dates.append(match_date)

        # get order of dates
        date_order = {
            date: i
            for i, date in enumerate(np.sort(pd.unique(all_dates)))
        }
        for r in result:
            r['date_order'] = date_order[r['match_date']]
            r['match_date'] = r['match_date'].strftime('%Y-%m-%d')
            # also add total number of found dates
            r['n_match_dates'] = len(all_dates)

        return json.dumps(result)

    # find matching date strings
    matches = []
    language = []
    for d in nlp.pipe(df['pdf_text'], disable=['tagger', 'ner']):
        matches.append(get_date_matches_from_text(d))
        language.append(d._.language)
    df['matches'] = pd.Series(matches)
    df['language'] = pd.Series(language)

    return df
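A hypothetical call; it assumes the pipeline sets a doc._.language extension (for example via a language-detection component), that dateparser is installed, and that the DataFrame has a pdf_text column:

parameters = {"n_lefts": 5, "n_rights": 5}
df = pd.DataFrame({"pdf_text": ["Invoice issued 12/31/2020, payment due by 20-Jan-2021."]})
df_out = get_date_matches(nlp, df, parameters)
print(df_out[["matches", "language"]].iloc[0])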