def terms_inference(sentences, terms_trie):
    """
    Build one parse tree per sentence, grouping recognised terms under
    'TERM' nodes.

    Each sentence is scanned left to right. At every position
    `_longest_matching_term` is asked for a term starting there; when it
    reports a positive length, the covered tokens are placed under a new
    'TERM' node and scanning resumes right after them, so matches never
    overlap. Otherwise the single token is attached directly to the
    sentence root.

    Args:
        sentences: shallow-parsed text (iterable of indexable, sliceable
            token sequences)
        terms_trie: trie of terms, passed through to
            `_longest_matching_term`

    Return:
        tuple of
        - list of 'S'-rooted ParentedTree objects, one per input sentence,
          in input order
        - defaultdict(list) mapping each term (the value returned by
          `name_to_term`) to the 'TERM' nodes created for it, in order of
          appearance
    """
    trees = []
    occurrences = defaultdict(list)

    for sentence in sentences:
        root = ParentedTree('S', [])
        position = 0

        while position < len(sentence):
            label, match_length = _longest_matching_term(
                sentence, position, terms_trie)

            if not match_length > 0:
                # No term starts here: keep the token as a direct child of
                # the sentence root and move on by one token.
                _append_word_token(root, sentence[position])
                position += 1
                continue

            # A term starts here: wrap its tokens in a dedicated node that
            # also remembers which term it stands for.
            node = ParentedTree('TERM', [])
            matched_term = name_to_term(label)
            node.term = matched_term
            occurrences[matched_term].append(node)

            match_end = position + match_length
            for token in sentence[position:match_end]:
                _append_word_token(node, token)

            root.append(node)
            position = match_end

        trees.append(root)

    return trees, occurrences