def preprocess_article(old_article, timeml_raw, nlp):
    """Build a processed Article from a raw article dict and its TimeML annotation.

    Parses the TimeML document into tokens plus per-token time values, runs the
    spaCy 1.x pipeline components over a pre-tokenized Doc, and wraps the result
    in project Token/Sentence/Article objects.

    Args:
        old_article: dict with at least "text", "time", "taxo", "page" keys,
            and optionally "title" and "id".
        timeml_raw: raw TimeML markup for the article text.
        nlp: a spaCy 1.x Language object (separate tagger/entity/parser calls).

    Returns:
        A new Article, or None when the TimeML document cannot be parsed.
    """
    tokens, time_values = parse_timeml_doc(timeml_raw)
    if tokens is None:
        # TimeML parse failed; caller is expected to skip this article.
        return None

    # Construct a Doc from the pre-tokenized TimeML tokens so token indices
    # stay aligned with time_values, then run each pipeline stage manually.
    doc = spacy.tokens.Doc(nlp.vocab, words=tokens)
    nlp.tagger(doc)
    nlp.entity(doc)
    nlp.parser(doc)

    token_objects = [
        Token(
            token.orth_,
            token.lemma_,
            token.tag_,
            token.ent_type_,
            token.ent_iob_,
            token.dep_,
            token.head.i,
            time_values[token.i][0],
            time_values[token.i][1],
        )
        for token in doc
    ]

    # BUG FIX: pub_time was previously assigned inside the sentence loop but
    # read after it, so an article whose Doc produced no sentences raised
    # NameError. It is loop-invariant — compute it once, up front.
    pub_time = arrow.get(old_article["time"])

    sentence_objects = []
    for sent in doc.sents:
        sent_tokens = token_objects[sent.start : sent.end]
        # First token-level time value in the sentence, if any.
        time = next((tok.time for tok in sent_tokens if tok.time), None)
        sentence_objects.append(Sentence(str(sent), sent_tokens, pub_time, time, None))

    raw_title = old_article.get("title")
    title_object = preprocess_title(raw_title, pub_time, nlp) if raw_title else None

    new_article = Article(
        title=raw_title,
        text=old_article["text"],
        time=old_article["time"],
        id=old_article.get("id"),
        taxo=old_article["taxo"],
        page=old_article["page"],
        sentences=sentence_objects,
        title_sentence=title_object,
    )
    return new_article
def preprocess_title(title, pub_time, nlp):
    """Run a title string through the full spaCy pipeline and wrap it.

    Args:
        title: the raw title text.
        pub_time: the article's publication time (arrow object).
        nlp: a spaCy Language object, called directly on the title.

    Returns:
        A Sentence covering the whole title; token-level time slots are None
        because titles carry no TimeML annotation.
    """
    doc = nlp(title)
    token_objects = [
        Token(
            tok.orth_,
            tok.lemma_,
            tok.tag_,
            tok.ent_type_,
            tok.ent_iob_,
            tok.dep_,
            tok.head.i,
            None,
            None,
        )
        for tok in doc
    ]
    return Sentence(title, token_objects, pub_time, None, None)