def predict(self, sentence, relation):
    """
    Predict the tags for each token in sentence.
    :param sentence: the sentence to tag with concept tags (see constants.(LEFT|RIGHT)_CONCEPT_TAG)
    :param relation: the current relation
    :return: a dictionary: {concept_tag : list of tokenized concept mentions}
             e.g. "Is the University of Rome in Rome?"
             {
                 "l": [["University", "of", "Rome"]],
                 "r": [["Rome"]]
             }
    """
    x = self.make_x(sentence, relation)
    X = [np.array(t) for t in zip(*[x])]
    Y_pred = self.model.predict(X)
    # drop the padding positions to recover the true (unpadded) sentence length
    no_pads = [t for t in x[0] if t != self.w2idx[c.PAD_TAG]]
    true_len = len(no_pads)
    word_list = [t.text for t in get_spacy_parser().tokenizer(sentence)]
    tag_sequence = self.to_tag_sequence(Y_pred[0])[:true_len]
    type2concepts = misc.merge_concept_tags(word_list, tag_sequence)
    return type2concepts
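# Illustrative usage sketch (the object name "tagger" and the way it is built are
# assumptions for the example and are not shown in this section; the relation
# value is a placeholder taken from c.RELATIONS):
#
#     tagger.predict("Is the University of Rome in Rome?", relation)
#     # -> {"l": [["University", "of", "Rome"]], "r": [["Rome"]]}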
def make_x(self, answer, relation):
    parser = get_spacy_parser()
    # older version with a question input layer, kept for reference:
    # question = datautils.clean_question(question)
    # q_parsed = parser(question)
    # q_vect = list(
    #     map(lambda t: self.w2idx[t.text] if t.text in self.w2idx else self.w2idx[c.UNK_TAG], q_parsed))
    # pad_length = self.MAX_LENGTH - len(q_vect)
    # nil_X = 0
    # x_question = (q_vect + pad_length * [nil_X])
    # x_question = np.array(x_question)
    answer = datautils.clean_question(answer)
    a_parsed = parser(answer)
    # index each token, falling back to the UNK index for out-of-vocabulary words
    a_vect = list(
        map(lambda t: self.w2idx[t.text] if t.text in self.w2idx else self.w2idx[c.UNK_TAG],
            a_parsed))
    # pad the indexed answer up to MAX_LENGTH
    pad_length = self.MAX_LENGTH - len(a_vect)
    nil_X = 0
    x_answer = a_vect + (pad_length * [nil_X])
    x_answer = np.array(x_answer)
    # one-hot encoding of the relation
    x_rel = datautils.to_ohenc(c.RELATIONS.index(relation), len(c.RELATIONS))
    return np.array([x_answer, x_rel])
def make_XY(self, sents_relations, concepts):
    """
    :param sents_relations: list of pairs (question, relation)
    :param concepts: list of pairs (c1, c2)
    :return: X, Y (see the comment before the return statement)
    """
    parser = get_spacy_parser()
    X, Y = [], []
    for (sent, relation), c_list in zip(sents_relations, concepts):
        x = self.make_x(sent, relation)
        question = datautils.clean_question(sent)
        q_parsed = parser(question)
        c1, c2 = datautils.clean_concept(c_list[0]), datautils.clean_concept(c_list[1])
        # find the character indexes of the concept mentions
        c1_idx = question.find(c1)
        assert c1_idx != -1
        assert question[c1_idx:c1_idx + len(c1)] == c1
        # mask the first mention before searching for the second one,
        # so the same span is not matched twice
        # print((question[:c1_idx] + "#" * len(c1) + question[c1_idx + len(c1):]))
        c2_idx = (question[:c1_idx] + "#" * len(c1) + question[c1_idx + len(c1):]).find(c2)
        if c2_idx != -1:
            assert question[c2_idx:c2_idx + len(c2)] == c2
        # iterate over the tokens of the question:
        # if a token falls inside a concept mention, it is tagged as a concept (left or right)
        tags = list(
            map(
                lambda t: datautils.to_ohenc(
                    c.entity_tags.index(c.LEFT_ENT_TAG), len(c.entity_tags))
                if (t.idx >= c1_idx and t.idx + len(t) <= c1_idx + len(c1))
                else datautils.to_ohenc(
                    c.entity_tags.index(c.RIGHT_ENT_TAG), len(c.entity_tags))
                if (c2_idx != -1 and t.idx >= c2_idx and t.idx + len(t) <= c2_idx + len(c2))
                else datautils.to_ohenc(
                    c.entity_tags.index(c.N_ENT_TAG), len(c.entity_tags)),
                q_parsed))
        nil_Y = datautils.to_ohenc(c.entity_tags.index(c.NIL_TAG), len(c.entity_tags))
        pad_length = self.MAX_LENGTH - len(tags)
        y = tags + (pad_length * [nil_Y])
        X.append(np.array(x))
        Y.append(np.array(y))
    X = [np.array(t) for t in zip(*X)]
    # at the end, X is a list of two arrays:
    # the 1st is the list of sentences (in indexed form),
    # the 2nd is the list of relation representations;
    # Y is a list of samples, each of them a list of tags
    return X, np.array(Y)
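# Minimal training sketch (illustrative only; the variable names and the fit call
# are assumptions — this section does not show how the model is built or compiled):
#
#     X, Y = recognizer.make_XY(sents_relations, concepts)
#     # X[0]: indexed, padded sentences; X[1]: one-hot relation vectors
#     # Y:    per-token one-hot entity tags, padded with the NIL tag
#     recognizer.model.fit(X, Y, epochs=10)  # assumes a compiled two-input Keras model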
def make_x(self, sent):
    parser = get_spacy_parser()
    question = datautils.clean_question(sent)
    # tokenize the question string
    q_parsed = parser(question)
    q_vect = list(
        map(lambda t: self.w2idx[t.text] if t.text in self.w2idx else self.w2idx[c.UNK_TAG],
            q_parsed))
    pad_length = self.MAX_LENGTH - len(q_vect)
    nil_X = 0
    x = q_vect + (pad_length * [nil_X])
    return np.array(x)
def make_vocab(self, sents):
    parser = get_spacy_parser()
    for s in sents:
        cleaned_s = datautils.clean_question(s)
        doc = parser.tokenizer(cleaned_s)
        for t in doc:
            # add both the surface form and its lowercased variant to the vocabulary
            if t.text not in self.w2idx:
                new_idx = len(self.w2idx)
                self.w2idx[t.text] = new_idx
                self.idx2w[new_idx] = t.text
            if t.text.lower() not in self.w2idx:
                new_idx = len(self.w2idx)
                self.w2idx[t.text.lower()] = new_idx
                self.idx2w[new_idx] = t.text.lower()
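# Illustrative example of the resulting mappings (actual contents depend on the
# corpus and on any special tags such as PAD/UNK that are added elsewhere):
#
#     recognizer.make_vocab(["Where was Dante born?"])
#     # w2idx might then contain e.g. {"Where": 0, "where": 1, "was": 2, ...}
#     # idx2w is the inverse mapping, index -> word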
def make_XY(self, questions_answers_relations, concepts):
    parser = get_spacy_parser()
    X, Y = [], []
    for (question, answer, relation), c_list in zip(questions_answers_relations, concepts):
        # x = self.make_x(question, answer, relation)
        x = self.make_x(answer, relation)
        answer = datautils.clean_question(answer)
        a_parsed = parser(answer)
        c1, c2 = datautils.clean_concept(c_list[0]), datautils.clean_concept(c_list[1])
        # the question input layer is an older version: you can ignore the question parts
        # assert c1_idx != -1
        # assert question[c1_idx:c1_idx + len(c1)] == c1
        # print((question[:c1_idx] + "#" * len(c1) + question[c1_idx + len(c1):]))
        # c2_idx = (question[:c1_idx] + "#" * len(c1) + question[c1_idx + len(c1):]).find(c2)
        c1_idx = answer.find(c1)
        c2_idx = answer.find(c2)
        # tag each token as a right/left concept if it falls inside the corresponding mention
        tags = list(
            map(
                lambda t: datautils.to_ohenc(
                    c.entity_tags.index(c.RIGHT_ENT_TAG), len(c.entity_tags))
                if (c2_idx != -1 and t.idx >= c2_idx and t.idx + len(t) <= c2_idx + len(c2))
                else datautils.to_ohenc(
                    c.entity_tags.index(c.LEFT_ENT_TAG), len(c.entity_tags))
                if (c1_idx != -1 and t.idx >= c1_idx and t.idx + len(t) <= c1_idx + len(c1))
                else datautils.to_ohenc(
                    c.entity_tags.index(c.N_ENT_TAG), len(c.entity_tags)),
                a_parsed))
        print(c1, ";", c2)
        print(a_parsed)
        print(self.to_tag_sequence(tags))
        nil_Y = datautils.to_ohenc(c.entity_tags.index(c.NIL_TAG), len(c.entity_tags))
        pad_length = self.MAX_LENGTH - len(tags)
        y = tags + (pad_length * [nil_Y])
        X.append(np.array(x))
        Y.append(np.array(y))
    X = [np.array(t) for t in zip(*X)]
    return X, np.array(Y)
def predict(self, answer, relation):
    """See the ConceptRecognizer classifier; the two predict methods behave in the same way."""
    x = self.make_x(answer, relation)
    X = [np.array(t) for t in zip(*[x])]
    Y_pred = self.model.predict(X)
    no_pads = [t for t in x[0] if t != self.w2idx[c.PAD_TAG]]
    true_len = len(no_pads)
    word_list = [t.text for t in get_spacy_parser().tokenizer(answer)]
    tag_sequence = self.to_tag_sequence(Y_pred[0])[:true_len]
    type2concepts = misc.merge_concept_tags(word_list, tag_sequence)
    print(type2concepts)
    return type2concepts
def make_x(self, sent, relation):
    parser = get_spacy_parser()
    question = datautils.clean_question(sent)
    q_parsed = parser(question)
    q_vect = list(
        map(lambda t: self.w2idx[t.text] if t.text in self.w2idx else self.w2idx[c.UNK_TAG],
            q_parsed))
    pad_length = self.MAX_LENGTH - len(q_vect)
    nil_X = 0
    x = q_vect + (pad_length * [nil_X])
    x = np.array(x)
    x_rel = datautils.to_ohenc(c.RELATIONS.index(relation), len(c.RELATIONS))
    return np.array([x, x_rel])
def find_relation_from_datadir(data_dir, relation_extractors):
    # Iterate over the whole dataset:
    # subdir is the integer representing the current subdirectory under analysis;
    # xml_path_list is the list of filepaths of the files in that subdirectory
    for subdir, xml_path_list in fman.get_docs_list_by_subdir(data_dir):
        # time measurement
        start_subdir = datetime.datetime.now()
        start = datetime.datetime.now()
        log_print(start, "Loading from subdir %03d" % subdir)
        cur_xml_list = []
        for xml_path in xml_path_list:
            try:
                # XML manager to parse the Wikipedia page in XML format
                cur_xml_man = fman.LxmlParser(xml_path)
            except Exception as e:
                log_print("Problem with " + xml_path)
                continue
            # read the first default number of sentences
            cur_sentences = list(
                cur_xml_man.get_sentences(num_of_sentences=conf.NUM_FIRST_WIKI_SENTENCES))
            # retrieve only the usable annotations, i.e. those relative to the retrieved sentences
            len_sentences = sum(len(sent) for sent in cur_sentences) + len(cur_sentences)
            cur_annotations = list(cur_xml_man.get_annotations_by_range(0, len_sentences))
            # for every RelationExtraction object (i.e. relation instance) retrieved, yield it
            for inst in analyze_xml(cur_sentences, cur_annotations,
                                    dep_parser.get_spacy_parser(), relation_extractors):
                log_print(xml_path)
                yield inst
        log_print(datetime.datetime.now(), "Total Time elapsed: ",
                  datetime.datetime.now() - start_subdir)
        log_print("-" * 50)
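# Illustrative driver sketch (the data directory, relation name and seed phrases
# are placeholders; the generator yields one relation instance at a time):
#
#     extractors = [RelationExtractor("born_in", ["was born in"])]  # hypothetical seeds
#     for instance in find_relation_from_datadir("data/wiki_xml/", extractors):
#         process(instance)  # hypothetical consumer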
def find_relation_from_filelist(filepath_list, relation_extractors):
    for xml_path in filepath_list:
        try:
            cur_xml_man = fman.LxmlParser(xml_path)
        except Exception as e:
            log_print("Problem with " + xml_path)
            continue
        cur_sentences = list(
            cur_xml_man.get_sentences(num_of_sentences=conf.NUM_FIRST_WIKI_SENTENCES))
        len_sentences = sum(len(sent) for sent in cur_sentences) + len(cur_sentences)
        cur_annotations = list(cur_xml_man.get_annotations_by_range(0, len_sentences))
        for inst in analyze_xml(cur_sentences, cur_annotations,
                                dep_parser.get_spacy_parser(), relation_extractors):
            log_print(xml_path)
            yield inst
def filterPair(self, p):
    parser = get_spacy_parser()
    return len(parser.tokenizer(p[0])) < MAX_LENGTH and \
           len(parser.tokenizer(p[1])) < MAX_LENGTH
def addSentence(self, sentence):
    parser = get_spacy_parser()
    for word in parser.tokenizer(sentence):
        self.addWord(word.text)
import utils.dependency_parser as dep_parser
import configurations as conf

parser = dep_parser.get_spacy_parser()


class RelationExtractor(object):
    """
    This class represents the extractor object, which computes everything that is
    needed to retrieve new relation instances.
    """

    def __init__(self, name, seeds, similarity_threshold=conf.DEFAULT_SIMILARITY_THRESHOLD):
        self.name = name
        # a list of strings
        self.initial_seeds = seeds
        # a float
        self.similarity_threshold = similarity_threshold
        # a list of tuples in the form (dep_parsed_rel_phrase, similarity_handicap), where:
        # - dep_parsed_rel_phrase is the object returned by the spaCy dependency parser
        #   over the fictitious sentence (obtained by adding "X" and "Y");
        # - similarity_handicap is 1.0 for root seeds; for second- or higher-level seeds,
        #   this value is exactly the similarity that brought them into the RelationExtractor.
        #   It is used to decrease the computed similarity of non-root seeds.
        self.parsed_seeds = [
            (next(parser("X " + relational_phrase + " Y").sents), 1.)
            for relational_phrase in seeds
        ]
        # a list of strings, containing the zero-level seeds and, as they are inserted, higher-level ones.
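# Illustrative construction (the relation name and seed phrases below are made-up
# examples, not taken from the project's configuration):
#
#     extractor = RelationExtractor("place_of_birth", ["was born in", "is a native of"])
#     # each seed is parsed as the fictitious sentence "X was born in Y", with handicap 1.0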
def simplify_ctx(ctx):
    """Dep-parse the context and find the shortest path between xxx and yyy;
    then enrich the path with auxiliary words (determiners and verb children of verbs)."""
    parser = get_spacy_parser()
    # print("=" * 50)
    # print(orig_ctx)
    # print(ctx)
    doc = parser(ctx)
    sents = list(doc.sents)
    try:
        assert len(sents) == 1
    except AssertionError:
        # the parser split the context into more than one sentence:
        # keep only the span between the two concept placeholders
        c1_idx = ctx.find(c.LEFT_CONCEPT_TAG)
        c2_idx = ctx.find(c.RIGHT_CONCEPT_TAG)
        sent = ctx[c1_idx:c2_idx + len(c.RIGHT_CONCEPT_TAG)]
        if sent == "":
            sent = ctx[c2_idx:c1_idx + len(c.LEFT_CONCEPT_TAG)]
        doc = parser(sent)
        sents = list(doc.sents)
        try:
            assert len(sents) == 1
        except AssertionError:
            return ""
    sent = sents[0]
    # locate the tokens holding the left and right concept placeholders
    xxx_tok = None
    yyy_tok = None
    for t in sent:
        if c.LEFT_CONCEPT_TAG in t.text:
            xxx_tok = t
            # print("xxx:", xxx_tok.text)
            break
    for t in sent:
        if c.RIGHT_CONCEPT_TAG in t.text:
            yyy_tok = t
            # print("yyy:", yyy_tok.text)
            break
    assert xxx_tok and yyy_tok
    if xxx_tok.i < yyy_tok.i:
        start = xxx_tok
        end = yyy_tok
    else:
        start = yyy_tok
        end = xxx_tok
    # shortest path between the two placeholders on the dependency graph
    G = gut.build_networkXGraph_from_spaCy_depGraph(sent)
    sh_path = gut.shortest_path(G, source=start, target=end)
    # print("Shortest path: ", sh_path)
    final_string = ""
    left_string = ""
    right_string = ""
    added_tokens = []
    for t in sh_path:
        left_children = t.lefts
        right_children = t.rights
        for child in left_children:
            if child in sh_path or child in added_tokens:
                continue
            if child.pos_ == "DET" or (t.pos_ == "VERB" and child.pos_ == "VERB"):
                left_string += " " + child.text
                added_tokens.append(child)
        for child in right_children:
            if child in sh_path or child in added_tokens:
                continue
            if child.pos_ == "DET" or (t.pos_ == "VERB" and child.pos_ == "VERB"):
                right_string += " " + child.text
                added_tokens.append(child)
        final_string += left_string + " " + t.text + right_string
        # print(repr(final_string), repr(left_string), repr(t.text), repr(right_string))
        left_string = ""
        right_string = ""
    res = final_string.strip() + " ."
    # res = build_string()
    # print(res)
    return res
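# Illustrative call (the context string is made up; the placeholders come from
# constants.LEFT_CONCEPT_TAG / RIGHT_CONCEPT_TAG, assumed here to be "xxx" / "yyy"):
#
#     simplify_ctx("xxx , a small town near the coast , is located in yyy .")
#     # -> a shortened context such as "xxx is located in yyy ." built from the
#     #    dependency shortest path plus determiners / verb children of verbs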