Example #1
    def predict(self, sentence, relation):
        """
        Predict the tags for each token in sentence.
        :param sentence: the sentence to tag with concept tags (see constants.(LEFT|RIGHT)_CONCEPT_TAG)
        :param relation: the current relation
        :return: a dictionary:
            {concept_tag : list of tokenized concept mentions}

            e.g.
            Is the University of Rome in Rome?
            {
                "l": [ ["University", "of", "Rome"] ],
                "r": [ ["Rome"] ]
             }

        """
        x = self.make_x(sentence, relation)
        X = [np.array(t) for t in zip(*[x])]
        Y_pred = self.model.predict(X)
        no_pads = [t for t in x[0] if t != self.w2idx[c.PAD_TAG]]
        true_len = len(no_pads)

        word_list = [t.text for t in get_spacy_parser().tokenizer(sentence)]
        tag_sequence = self.to_tag_sequence(Y_pred[0])[:true_len]

        type2concepts = misc.merge_concept_tags(word_list, tag_sequence)

        return type2concepts
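
A minimal, illustrative sketch of the grouping step that the docstring above describes, assuming misc.merge_concept_tags receives one tag per token (e.g. "l", "r", or a non-concept tag) and merges consecutive tokens that share a concept tag; the helper below is an assumption for illustration, not the project's actual implementation:

def merge_concept_tags(word_list, tag_sequence, concept_tags=("l", "r")):
    # group consecutive tokens sharing the same concept tag into one mention
    type2concepts = {tag: [] for tag in concept_tags}
    current_tag, current_mention = None, []
    for word, tag in zip(word_list, tag_sequence):
        if tag in concept_tags and tag == current_tag:
            current_mention.append(word)
            continue
        if current_mention:
            type2concepts[current_tag].append(current_mention)
        current_tag, current_mention = (tag, [word]) if tag in concept_tags else (None, [])
    if current_mention:
        type2concepts[current_tag].append(current_mention)
    return type2concepts

words = ["Is", "the", "University", "of", "Rome", "in", "Rome", "?"]
tags = ["n", "n", "l", "l", "l", "n", "r", "n"]
print(merge_concept_tags(words, tags))
# {'l': [['University', 'of', 'Rome']], 'r': [['Rome']]}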
Example #2
    def make_x(self, answer, relation):
        parser = get_spacy_parser()

        # question = datautils.clean_question(question)
        # q_parsed = parser(question)
        # q_vect = list(
        #     map(lambda t: self.w2idx[t.text] if t.text in self.w2idx else self.w2idx[c.UNK_TAG], q_parsed))
        # pad_length = self.MAX_LENGTH - len(q_vect)
        # nil_X = 0
        # x_question = (q_vect + (pad_length) * [nil_X])
        # x_question= np.array(x_question)

        answer = datautils.clean_question(answer)
        a_parsed = parser(answer)
        a_vect = list(
            map(
                lambda t: self.w2idx[t.text]
                if t.text in self.w2idx else self.w2idx[c.UNK_TAG], a_parsed))
        pad_length = self.MAX_LENGTH - len(a_vect)
        nil_X = 0
        x_answer = (a_vect + (pad_length) * [nil_X])
        x_answer = np.array(x_answer)

        x_rel = datautils.to_ohenc(c.RELATIONS.index(relation),
                                   len(c.RELATIONS))
        return np.array([x_answer, x_rel])
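
The one-hot helper datautils.to_ohenc is not shown in these examples; a minimal sketch of what it is assumed to do given the calls above (index and length in, a vector with a single 1 out), written as a stand-alone function for illustration:

import numpy as np

def to_ohenc(index, length):
    # one-hot vector of the given length, with a 1 at the given index
    vec = np.zeros(length, dtype=np.float32)
    vec[index] = 1.0
    return vec

print(to_ohenc(2, 5))  # [0. 0. 1. 0. 0.]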
Example #3
    def make_XY(self, sents_relations, concepts):
        """

        :param sents_relations: list of pairs (question, relations)
        :param concepts: list of pairs (c1, c2)
        :return:
        """
        parser = get_spacy_parser()
        X, Y = [], []
        for (sent, relation), c_list in zip(sents_relations, concepts):
            x = self.make_x(sent, relation)

            question = datautils.clean_question(sent)
            q_parsed = parser(question)
            c1, c2 = datautils.clean_concept(
                c_list[0]), datautils.clean_concept(c_list[1])

            # find the indexes of the concept mentions
            c1_idx = question.find(c1)
            assert c1_idx != -1
            assert question[c1_idx:c1_idx + len(c1)] == c1

            # print((question[:c1_idx] + "#"*len(c1) + question[c1_idx+len(c1):]))
            c2_idx = (question[:c1_idx] + "#" * len(c1) +
                      question[c1_idx + len(c1):]).find(c2)
            if c2_idx != -1:
                assert question[c2_idx:c2_idx + len(c2)] == c2

            # iterate over the tokens of the question:
            # if a token's span falls inside a concept mention, tag it as a left or right concept
            def tag_token(t):
                # token fully inside the first concept mention -> left entity tag
                if c1_idx <= t.idx and t.idx + len(t) <= c1_idx + len(c1):
                    tag = c.LEFT_ENT_TAG
                # token fully inside the second concept mention -> right entity tag
                elif c2_idx != -1 and c2_idx <= t.idx and t.idx + len(t) <= c2_idx + len(c2):
                    tag = c.RIGHT_ENT_TAG
                else:
                    tag = c.N_ENT_TAG
                return datautils.to_ohenc(c.entity_tags.index(tag), len(c.entity_tags))

            tags = [tag_token(t) for t in q_parsed]

            nil_Y = datautils.to_ohenc(c.entity_tags.index(c.NIL_TAG),
                                       len(c.entity_tags))

            pad_length = self.MAX_LENGTH - len(tags)
            y = (tags + ((pad_length) * [nil_Y]))

            X.append(np.array(x))
            Y.append(np.array(y))

        X = [np.array(t) for t in zip(*X)]
        # at the end, X is a list of two arrays:
        #     the 1st is the batch of sentences (in indexed form)
        #     the 2nd is the batch of relation representations
        # Y is a list of samples, each of them a list of tags
        return X, np.array(Y)
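
The comment above describes the zip(*X) step; a small self-contained sketch showing how that transposition turns a list of (indexed sentence, relation encoding) sample pairs into two batch arrays, one per model input (the shapes below are made up for illustration):

import numpy as np

# two fake samples: a padded index vector of length 10 and a 3-way relation one-hot
samples = [
    (np.zeros(10), np.array([1.0, 0.0, 0.0])),
    (np.ones(10), np.array([0.0, 1.0, 0.0])),
]
X = [np.array(t) for t in zip(*samples)]
print(X[0].shape, X[1].shape)  # (2, 10) (2, 3)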
Example #4
    def make_x(self, sent):
        parser = get_spacy_parser()
        question = datautils.clean_question(sent)

        # tokenize the question string
        q_parsed = parser(question)
        q_vect = list(map(lambda t: self.w2idx[t.text] if t.text in self.w2idx else self.w2idx[c.UNK_TAG], q_parsed))
        pad_length = self.MAX_LENGTH - len(q_vect)
        nil_X = 0
        x = (q_vect + (pad_length) * [nil_X])
        return np.array(x)
Example #5
    def make_vocab(self, sents):
        parser = get_spacy_parser()
        for s in sents:
            cleaned_s = datautils.clean_question(s)
            doc = parser.tokenizer(cleaned_s)
            for t in doc:
                if t.text not in self.w2idx:
                    new_idx = len(self.w2idx)
                    self.w2idx[t.text] = new_idx
                    self.idx2w[new_idx] = t.text
                if t.text.lower() not in self.w2idx:
                    new_idx = len(self.w2idx)
                    self.w2idx[t.text.lower()] = new_idx
                    self.idx2w[new_idx] = t.text.lower()
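
make_vocab only needs the tokenizer, not the full pipeline. A minimal sketch of what get_spacy_parser is assumed to return (a loaded spaCy pipeline; en_core_web_sm below is a placeholder model name) and of how its tokenizer splits a raw string into Token objects:

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed equivalent of get_spacy_parser()
doc = nlp.tokenizer("Is the University of Rome in Rome?")
print([t.text for t in doc])
# ['Is', 'the', 'University', 'of', 'Rome', 'in', 'Rome', '?']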
Example #6
    def make_XY(self, questions_answers_relations, concepts):
        parser = get_spacy_parser()
        X, Y = [], []
        for (question, answer,
             relation), c_list in zip(questions_answers_relations, concepts):
            # x = self.make_x(question, answer, relation)
            x = self.make_x(answer, relation)
            answer = datautils.clean_question(answer)
            a_parsed = parser(answer)
            c1, c2 = datautils.clean_concept(
                c_list[0]), datautils.clean_concept(c_list[1])

            # the question input layer comes from an older version: the question parts can be ignored
            # assert c1_idx != -1
            # assert question[c1_idx:c1_idx + len(c1)] == c1

            # print((question[:c1_idx] + "#"*len(c1) + question[c1_idx+len(c1):]))
            # c2_idx = (question[:c1_idx] + "#"*len(c1) + question[c1_idx+len(c1):]).find(c2)

            c1_idx = answer.find(c1)
            c2_idx = answer.find(c2)

            def tag_token(t):
                # token fully inside the second concept mention -> right entity tag
                if c2_idx != -1 and c2_idx <= t.idx and t.idx + len(t) <= c2_idx + len(c2):
                    tag = c.RIGHT_ENT_TAG
                # token fully inside the first concept mention -> left entity tag
                elif c1_idx != -1 and c1_idx <= t.idx and t.idx + len(t) <= c1_idx + len(c1):
                    tag = c.LEFT_ENT_TAG
                else:
                    tag = c.N_ENT_TAG
                return datautils.to_ohenc(c.entity_tags.index(tag), len(c.entity_tags))

            tags = [tag_token(t) for t in a_parsed]
            print(c1, ";", c2)
            print(a_parsed)
            print(self.to_tag_sequence(tags))

            nil_Y = datautils.to_ohenc(c.entity_tags.index(c.NIL_TAG),
                                       len(c.entity_tags))

            pad_length = self.MAX_LENGTH - len(tags)
            y = (tags + ((pad_length) * [nil_Y]))

            X.append(np.array(x))
            Y.append(np.array(y))

        X = [np.array(t) for t in zip(*X)]
        return X, np.array(Y)
Example #7
    def predict(self, answer, relation):
        """See the ConceptRecognizer classifier, they are similar"""
        x = self.make_x(answer, relation)
        X = [np.array(t) for t in zip(*[x])]
        Y_pred = self.model.predict(X)
        no_pads = [t for t in x[0] if t != self.w2idx[c.PAD_TAG]]
        true_len = len(no_pads)

        word_list = [t.text for t in get_spacy_parser().tokenizer(answer)]
        tag_sequence = self.to_tag_sequence(Y_pred[0])[:true_len]

        type2concepts = misc.merge_concept_tags(word_list, tag_sequence)
        print(type2concepts)

        return type2concepts
Example #8
    def make_x(self, sent, relation):
        parser = get_spacy_parser()
        question = datautils.clean_question(sent)
        q_parsed = parser(question)
        q_vect = list(
            map(
                lambda t: self.w2idx[t.text]
                if t.text in self.w2idx else self.w2idx[c.UNK_TAG], q_parsed))
        pad_length = self.MAX_LENGTH - len(q_vect)
        nil_X = 0
        x = (q_vect + (pad_length) * [nil_X])
        x = np.array(x)

        x_rel = datautils.to_ohenc(c.RELATIONS.index(relation),
                                   len(c.RELATIONS))
        return np.array([x, x_rel])
Example #9
def find_relation_from_datadir(data_dir, relation_extractors):
    # Iterate over the whole dataset.
    # subdir is the integer identifying the subdirectory currently under analysis;
    # xml_path_list is the list of filepaths of the files in that subdirectory
    for subdir, xml_path_list in fman.get_docs_list_by_subdir(data_dir):

        # time measurement
        start_subdir = datetime.datetime.now()

        start = datetime.datetime.now()
        log_print(start, "Loading from subdir %03d" % subdir)
        cur_xml_list = []

        for xml_path in xml_path_list:
            try:
                # XML manager to parse the Wikipedia page in XML format
                cur_xml_man = fman.LxmlParser(xml_path)
            except Exception as e:
                log_print("Problem with " + xml_path)
                continue

            # read the first default number of sentences.
            cur_sentences = list(
                cur_xml_man.get_sentences(
                    num_of_sentences=conf.NUM_FIRST_WIKI_SENTENCES))

            # retrieve only the usable annotations, i.e. those relative to the retrieved sentences
            len_sentences = sum(len(sent)
                                for sent in cur_sentences) + len(cur_sentences)
            cur_annotations = list(
                cur_xml_man.get_annotations_by_range(0, len_sentences))

            # yield every RelationExtraction object (i.e. relation instance) that is retrieved
            for inst in analyze_xml(cur_sentences, cur_annotations,
                                    dep_parser.get_spacy_parser(),
                                    relation_extractors):
                log_print(xml_path)
                yield inst

        log_print(datetime.datetime.now(), "Total Time elapsed: ",
                  datetime.datetime.now() - start_subdir)
        log_print("-" * 50)
Example #10
def find_relation_from_filelist(filepath_list, relation_extractors):
    for xml_path in filepath_list:

        try:
            cur_xml_man = fman.LxmlParser(xml_path)
        except Exception as e:
            log_print("Problem with " + xml_path)
            continue

        cur_sentences = list(
            cur_xml_man.get_sentences(
                num_of_sentences=conf.NUM_FIRST_WIKI_SENTENCES))
        len_sentences = sum(len(sent)
                            for sent in cur_sentences) + len(cur_sentences)
        cur_annotations = list(
            cur_xml_man.get_annotations_by_range(0, len_sentences))

        for inst in analyze_xml(cur_sentences, cur_annotations,
                                dep_parser.get_spacy_parser(),
                                relation_extractors):
            log_print(xml_path)
            yield inst
    pass
Example #11
    def filterPair(self, p):
        parser = get_spacy_parser()
        return len(parser.tokenizer(p[0])) < MAX_LENGTH and \
               len(parser.tokenizer(p[1])) < MAX_LENGTH
Example #12
    def addSentence(self, sentence):
        parser = get_spacy_parser()
        for word in parser.tokenizer(sentence):
            self.addWord(word.text)
Example #13
import utils.dependency_parser as dep_parser
import configurations as conf

parser = dep_parser.get_spacy_parser()


class RelationExtractor(object):
    """
	This class represent the extractor object which compute all it is needed for retrieve new relation instances.
	"""
    def __init__(self,
                 name,
                 seeds,
                 similarity_threshold=conf.DEFAULT_SIMILARITY_THRESHOLD):
        self.name = name
        # a list of strings
        self.initial_seeds = seeds
        # a float
        self.similarity_threshold = similarity_threshold
        # a list of tuples of the form (dep_parsed_rel_phrase, similarity_handicap), where:
        #   - dep_parsed_rel_phrase is the object returned by the spaCy dependency parser
        #     over the fictitious sentence (with "X" and "Y" added);
        #   - similarity_handicap is 1.0 for root seeds; for second- or higher-level seeds,
        #     it is exactly the similarity that brought them into the RelationExtractor.
        #     It is used to decrease the similarity computed against non-root seeds.
        self.parsed_seeds = [
            (next(parser("X " + relational_phrase + " Y").sents), 1.)
            for relational_phrase in seeds
        ]

        # a list of strings; it contains the zero-level seeds plus the higher-level ones as they are inserted.
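
A self-contained sketch of the seed parsing done in __init__ above: each relational phrase is wrapped in a fictitious "X ... Y" sentence, dependency-parsed, and paired with the 1.0 handicap reserved for root seeds (the seed phrases and the en_core_web_sm model name are placeholders):

import spacy

nlp = spacy.load("en_core_web_sm")
seeds = ["was born in", "is the capital of"]  # hypothetical relational phrases
parsed_seeds = [(next(nlp("X " + phrase + " Y").sents), 1.0) for phrase in seeds]
print([(span.text, handicap) for span, handicap in parsed_seeds])
# e.g. [('X was born in Y', 1.0), ('X is the capital of Y', 1.0)]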
Example #14
def simplify_ctx(ctx):
    """dep-parse the context and find the shortest path between xxx and yyy;
    then enrich the path with auxiliary words and others"""
    parser = get_spacy_parser()
    # print("="*50)
    # print(orig_ctx)
    # print(ctx)

    doc = parser(ctx)
    sents = list(doc.sents)
    try:
        assert len(sents) == 1
    except AssertionError:
        c1_idx = ctx.find(c.LEFT_CONCEPT_TAG)
        c2_idx = ctx.find(c.RIGHT_CONCEPT_TAG)
        sent = ctx[c1_idx:c2_idx + len(c.RIGHT_CONCEPT_TAG)]
        if sent == "":
            sent = ctx[c2_idx:c1_idx + len(c.LEFT_CONCEPT_TAG)]
        doc = parser(sent)
        sents = list(doc.sents)
        try:
            assert len(sents) == 1
        except AssertionError:
            return ""

    sent = sents[0]
    xxx_tok = None
    yyy_tok = None

    for t in sent:
        if (c.LEFT_CONCEPT_TAG in t.text):
            xxx_tok = t
            # print("xxx:", xxx_tok.text)
            break

    for t in sent:
        if (c.RIGHT_CONCEPT_TAG in t.text):
            yyy_tok = t
            # print("yyy:", yyy_tok.text)
            break

    assert xxx_tok and yyy_tok

    if xxx_tok.i < yyy_tok.i:
        start = xxx_tok
        end = yyy_tok
    else:
        start = yyy_tok
        end = xxx_tok

    G = gut.build_networkXGraph_from_spaCy_depGraph(sent)
    sh_path = gut.shortest_path(G, source=start, target=end)
    # print("Shortest path: ", sh_path)
    final_string = ""
    left_string = ""
    right_string = ""
    added_tokens = []
    for t in sh_path:
        left_children = t.lefts
        right_children = t.rights
        for child in left_children:
            if child in sh_path or child in added_tokens: continue
            if child.pos_ == "DET" or (t.pos_ == "VERB"
                                       and child.pos_ == "VERB"):
                left_string += " " + child.text
                added_tokens.append(child)
        for child in right_children:
            if child in sh_path or child in added_tokens: continue
            if child.pos_ == "DET" or (t.pos_ == "VERB"
                                       and child.pos_ == "VERB"):
                right_string += " " + child.text
                added_tokens.append(child)

        final_string += left_string + " " + t.text + right_string
        # print(repr(final_string), repr(left_string), repr(t.text), repr(right_string))
        left_string = ""
        right_string = ""

    res = final_string.strip() + " ."
    # res = build_string()
    # print(res)
    return res
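
gut.build_networkXGraph_from_spaCy_depGraph and gut.shortest_path come from the project's graph utilities; a minimal, self-contained sketch of the same idea, assuming the graph is simply the undirected set of head-child dependency arcs, built here with networkx directly:

import networkx as nx
import spacy

nlp = spacy.load("en_core_web_sm")
sent = next(nlp("xxx was born in yyy .").sents)

# undirected graph over the tokens: one edge per head -> child dependency arc
G = nx.Graph()
G.add_edges_from((tok, child) for tok in sent for child in tok.children)

xxx_tok = next(t for t in sent if "xxx" in t.text)
yyy_tok = next(t for t in sent if "yyy" in t.text)
path = nx.shortest_path(G, source=xxx_tok, target=yyy_tok)
print([t.text for t in path])  # e.g. ['xxx', 'born', 'in', 'yyy']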