def preprocess_article(old_article, timeml_raw, nlp):
    """Build an ``Article`` from a raw article mapping and its TimeML markup.

    The tokens come from ``parse_timeml_doc`` (not from spaCy's tokenizer);
    spaCy is only used to tag, entity-label and parse that fixed tokenization.

    Args:
        old_article: Mapping describing the article. The keys ``"text"``,
            ``"time"``, ``"taxo"`` and ``"page"`` are read with ``[]`` and
            must be present; ``"title"`` and ``"id"`` are optional.
        timeml_raw: TimeML input handed unchanged to ``parse_timeml_doc``.
        nlp: spaCy pipeline exposing ``vocab``, ``tagger``, ``entity`` and
            ``parser``.

    Returns:
        The new ``Article``, or ``None`` when ``parse_timeml_doc`` returns
        ``None`` for the tokens.
    """
    tokens, time_values = parse_timeml_doc(timeml_raw)

    if tokens is None:
        return None

    # Run the pipeline components on the pre-tokenized document.
    doc = spacy.tokens.Doc(nlp.vocab, words=tokens)
    nlp.tagger(doc)
    nlp.entity(doc)
    nlp.parser(doc)

    # time_values is indexed by token position and supplies the last two
    # Token fields (presumably the time value and its type -- TODO confirm
    # against parse_timeml_doc).
    token_objects = [
        Token(
            token.orth_,
            token.lemma_,
            token.tag_,
            token.ent_type_,
            token.ent_iob_,
            token.dep_,
            token.head.i,
            time_values[token.i][0],
            time_values[token.i][1],
        )
        for token in doc
    ]

    # Parse the publication time once, before the loop: it is the same for
    # every sentence and is also needed for the title below, even when the
    # document yields no sentences at all.
    pub_time = arrow.get(old_article["time"])

    sentence_objects = []
    for sent in doc.sents:
        sent_tokens = token_objects[sent.start : sent.end]
        # A sentence takes the first truthy token time, or None if no token
        # in it has one.
        sent_time = next((tok.time for tok in sent_tokens if tok.time), None)
        sentence_objects.append(
            Sentence(str(sent), sent_tokens, pub_time, sent_time, None)
        )

    raw_title = old_article.get("title")
    if raw_title:
        title_object = preprocess_title(raw_title, pub_time, nlp)
    else:
        title_object = None

    return Article(
        title=raw_title,
        text=old_article["text"],
        time=old_article["time"],
        id=old_article.get("id"),
        taxo=old_article["taxo"],
        page=old_article["page"],
        sentences=sentence_objects,
        title_sentence=title_object,
    )
Exemple #2
0
def preprocess_title(title, pub_time, nlp):
    """Wrap an article title in a ``Sentence`` after running it through spaCy.

    Args:
        title: Raw title string; passed to ``nlp`` and stored as the
            sentence's first argument.
        pub_time: Publication time forwarded unchanged to ``Sentence``.
        nlp: Callable spaCy pipeline applied to ``title``.

    Returns:
        A ``Sentence`` whose tokens carry no time information (both
        time-related ``Token`` fields are ``None``).
    """
    parsed = nlp(title)
    title_tokens = [
        Token(
            tok.orth_,
            tok.lemma_,
            tok.tag_,
            tok.ent_type_,
            tok.ent_iob_,
            tok.dep_,
            tok.head.i,
            None,
            None,
        )
        for tok in parsed
    ]
    return Sentence(title, title_tokens, pub_time, None, None)