Example #1
def align_yields(p1, p2, align_func=None):
    """finds the best alignment of words from two passages
    Note: this function is symetrical
    consider using reverse_mapping instead of calling it twice

    returns iterator of tuples (i,j)
            mapping from i - p1 positions
                    to j - aligned p2 positions"""
    positions1, positions2 = break2common_sentences(p1, p2)
    terminals1 = extract_terminals(p1)
    terminals2 = extract_terminals(p2)

    # map the words in each sentence to each other
    if len(positions1) == len(positions2):
        mapping = set()
        sentence_start1 = 0
        sentence_start2 = 0
        for i in range(len(positions1)):
            sentence1 = terminals1[sentence_start1:positions1[i]]
            sentence2 = terminals2[sentence_start2:positions2[i]]
            for (j, k) in align(sentence1, sentence2, False, align_func)[1]:
                if j != -1:
                    j += sentence_start1
                if k != -1:
                    k += sentence_start2
                mapping.add((j, k))

            sentence_start1 = positions1[i]
            sentence_start2 = positions2[i]
        return mapping
    else:
        print(
            "Error: number of sentences acquired from break2common_sentences does not match"
        )
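A minimal usage sketch (not part of the original example), assuming the align module from Example #9 and passages loaded with file2passage; the file names and import path are placeholders:

# Hedged sketch: load two UCCA passages and align their terminal positions.
from ucca.ioutil import file2passage  # assumed import path for file2passage (see Example #9)

import align  # the module holding align_yields / reverse_mapping in Example #9

p1 = file2passage("passage1.xml")  # placeholder file names
p2 = file2passage("passage2.xml")
word2word = align.align_yields(p1, p2)      # set of (i, j) position pairs; -1 marks an unaligned side
reverse = align.reverse_mapping(word2word)  # equivalent to align_yields(p2, p1)
for i, j in sorted(word2word):
    print(i, "->", j)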
Example #2
def align_yields(p1, p2):
    """finds the best alignment of words from two passages
    Note: this function is symetrical
    consider using reverse_mapping instead of calling it twice

    returns iterator of tuples (i,j)
            mapping from i - p1 positions 
                    to j - aligned p2 positions"""
    positions1, positions2 = break2common_sentences(p1, p2)
    terminals1 = extract_terminals(p1)
    terminals2 = extract_terminals(p2)

    # map the words in each sentence to each other
    if len(positions1) == len(positions2):
        mapping = set()
        sentence_start1 = 0
        sentence_start2 = 0
        for i in range(len(positions1)):
            sentence1 = terminals1[sentence_start1:positions1[i]]
            sentence2 = terminals2[sentence_start2:positions2[i]]
            for (j, k) in align(sentence1, sentence2, False)[1]:
                if j != -1:
                    j += sentence_start1
                if k != -1:
                    k += sentence_start2
                mapping.add((j, k))

            sentence_start1 = positions1[i]
            sentence_start2 = positions2[i]
        return mapping
    else:
        print("Error number of sentences aqquired from break2common_sentences does not match")
Example #3
def split(self, passage):
    ends = []
    ids = []
    tokens = []
    for terminal in extract_terminals(passage):
        tokens.append(terminal.text)
        sentence = " ".join(tokens)
        # if len(tokens) > max(map(len, map(str.split, sentence_to_index))):
        #     raise ValueError("Failed matching '%s'" % sentence)
        if self.index is not None and self.index < len(self.sentences) and \
                self.sentences[self.index].startswith(sentence):  # Try matching next sentence rather than shortest
            index = self.index if self.sentences[self.index] == sentence else None
        else:
            index = self.index = self.sentence_to_index.get(sentence)
        if index is not None:
            self.matched_indices.add(index)
            ends.append(terminal.position)
            ids.append(str(index))
            tokens = []
            self.index += 1
    return split_passage(passage,
                         ends,
                         ids=ids if self.enumerate else None,
                         suffix_format=self.suffix_format,
                         suffix_start=self.suffix_start)
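This method depends on splitter state that is not shown in the snippet. A minimal sketch of the attributes it reads and writes, inferred from the body above (the class name, constructor signature, and defaults are assumptions):

class SentenceSplitter:  # hypothetical class name
    def __init__(self, sentences, enumerate_ids=False,
                 suffix_format="%03d", suffix_start=0):  # defaults are placeholders
        self.sentences = sentences                       # target sentence strings, in order
        self.sentence_to_index = {s: i for i, s in enumerate(sentences)}
        self.index = None                                # index of the next expected sentence
        self.matched_indices = set()                     # indices matched so far
        self.enumerate = enumerate_ids                   # whether to pass ids to split_passage
        self.suffix_format = suffix_format
        self.suffix_start = suffix_start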
Example #4
def split(self, passage):
    ends = []
    ids = []
    token_lists = []
    for terminal in extract_terminals(passage):
        token_lists.append([])
        for terminals in token_lists if self.index is None else [token_lists[0]]:
            terminals.append(terminal)
            sentence = " ".join(t.text for t in terminals)
            if self.index is not None and self.index < len(self.sentences) and self.sentences[
                    self.index].startswith(sentence):  # Try matching next sentence rather than shortest
                index = self.index if self.sentences[self.index] == sentence else None
            else:
                indices = self.sentence_to_index.get(sentence)
                index = self.index = indices.pop(0) if indices else None
            if index is not None:
                self.matched_indices.add(index)
                last_end = terminals[0].position - 1
                if len(terminals) > 1 and last_end and last_end not in ends:
                    ends.append(last_end)
                ends.append(terminal.position)
                ids.append(str(index))
                token_lists = []
                self.index += 1
                break
    return split_passage(passage, ends, ids=ids if self.enumerate else None,
                         suffix_format=self.suffix_format, suffix_start=self.suffix_start)
Example #5
def get_lowest_fn(p):
    """ finds the FN that has terminals as children"""
    s = set()
    for term in extract_terminals(p):
        s.update([
            edge.parent for edge in term.incoming
            if is_foundational(edge.parent)
        ])
    return s
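A hedged usage sketch (not from the source): get_lowest_fn returns the set of foundational nodes that directly dominate terminals, so iterating over it gives the lowest UCCA units covering each word; the import path and file name are placeholders:

from ucca.ioutil import file2passage  # assumed import path (see Example #9)

passage = file2passage("passage.xml")  # placeholder file name
for node in get_lowest_fn(passage):
    print(node.ID, node)  # each node is a lowest foundational parent of some terminal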
Example #6
def split(passage, order):
    ends = []
    ids = []
    sentence = []
    for terminal in extract_terminals(passage):
        sentence.append(terminal.text)
        # if len(sentence) > max(map(len, map(str.split, order))):
        #     raise ValueError("Failed matching '%s'" % " ".join(sentence))
        index = order.get(" ".join(sentence))
        if index is not None:
            ends.append(terminal.position)
            ids.append(str(index))
            sentence = []
    return split_passage(passage, ends, ids=ids)
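A hedged usage sketch: order maps each sentence's space-joined token string to its index, and the passage is cut after the terminal that completes each matched sentence. The sentence strings and the passage variable are placeholders:

sentences = ["The cat sat on the mat .", "It purred ."]  # hypothetical tokenized sentences
order = {s: i for i, s in enumerate(sentences)}          # sentence text -> index
sub_passages = split(passage, order)  # passage: a UCCA passage containing both sentences;
                                      # assuming split_passage returns one sub-passage per match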
Example #7
def split(self, passage):
    ends = []
    ids = []
    token_lists = []
    for terminal in extract_terminals(passage):
        token_lists.append([])
        for terminals in token_lists if self.index is None else [token_lists[0]]:
            terminals.append(terminal)
            sentence = " ".join(t.text for t in terminals)
            if self.index is not None and self.index < len(self.sentences) and \
                    self.sentences[self.index].startswith(sentence):  # Try matching next sentence rather than shortest
                index = self.index if self.sentences[self.index] == sentence else None
            else:
                indices = self.sentence_to_index.get(sentence)
                index = self.index = indices.pop(0) if indices else None
            if index is not None:
                self.matched_indices.add(index)
                last_end = terminals[0].position - 1
                if len(terminals) > 1 and last_end and last_end not in ends:
                    ends.append(last_end)
                ends.append(terminal.position)
                ids.append(str(index))
                token_lists = []
                self.index += 1
                break
    return split_passage(passage,
                         ends,
                         ids=ids if self.enumerate else None,
                         suffix_format=self.suffix_format,
                         suffix_start=self.suffix_start)
Example #8
def get_lowest_fn(p):
    """ finds the FN that has terminals as children"""
    s = set()
    for term in extract_terminals(p):
        s.update([edge.parent for edge in term.incoming if is_foundational(edge.parent)])
    return s
Example #9
def main():
    print(
        align.align("what has is by the meaning of the word is",
                    "what is the men for the wk is are be"))

    # read xml files
    print("reading db xmls")
    p = []
    for filename in filenames:
        with open(add_path(filename), "rb") as fl:
            p += pickle.load(fl)[0]
        print(
            "read ", filename, " it starts with ",
            tuple(term.text for term in textutil.extract_terminals(
                convert.from_site(p[-1]))[:6]))
    # convert xml to passages
    p = list(map(convert.from_site, p))

    print("reading passage xmls")
    # read passage files
    for filename in passage_filenames:
        print("reading" + filename)
        if os.path.isfile(add_path(os.path.splitext(filename)[0] + ".pkl")):
            with open(add_path(os.path.splitext(filename)[0] + ".pkl"),
                      "rb") as fl:
                p.append(pickle.load(fl))
        else:
            p.append(file2passage(add_path(filename)))
            with open(add_path(os.path.splitext(filename)[0] + ".pkl"),
                      "wb") as fl:
                pickle.dump(p[-1], fl)
                print("dumping",
                      add_path(os.path.splitext(filename)[0] + ".pkl"))

    all_filenames = filenames + passage_filenames
    print("read ", all_filenames)
    word2word = align.align_yields(p[0], p[1])
    assert align.reverse_mapping(word2word) == align.align_yields(
        p[1], p[0]), "align_yields asymmetrical"

    # create similarity matrix
    sources = []
    goals = []
    names = []
    i = 0
    while i < len(p):
        names.append(all_filenames[i])
        sources.append(p[i])
        i += 1
        goals.append(p[i])
        i += 1
    chunksize = 1
    if len(goals) > 100:
        chunksize = int(len(goals) / POOL_SIZE / 10)
    print("multithreading with chunksize", chunksize)
    pool = Pool(POOL_SIZE)
    if r2s:
        results = pool.starmap(distances, zip(goals, sources, names),
                               chunksize)
    else:
        results = pool.starmap(distances, zip(sources, goals, names),
                               chunksize)
    print(results)
    pool.close()
    pool.join()
    sym_mat = []
    keys = []
    for row, key in results:
        keys.append(key)
        sym_mat.append(row)
    print("functions and matrix")
    print(funcs + keys)
    for item in sym_mat:
        print(item)
    print("overall token analysis")
    print(align.token_level_analysis(p))
    output_path = trial_name + "output.csv"
    with open(output_path, "w") as f:
        print("writing output to " + output_path)
        writer = csv.writer(f)
        writer.writerows(sym_mat)
    send_mail("*****@*****.**", "finished",
              os.path.abspath(output_path))
    return
Example #10
def ucca_mod(reference,
             candidate,
             reference_passage=None,
             candidate_passage=None,
             pos=False,
             **kwargs):
    """

    :param reference: reference sentence: string
    :param candidate: candidate sentence: string
    :param reference_passage: UCCA representation of reference sentence
    :param candidate_passage: UCCA representation of candidate sentence
    :param pos: Use POS instead of UCCA to determine core words. default: False
    :param kwargs: kwargs used in calibration (passed to calibrate_ucca_single),
                   including length_weight, scene_weight, edge_weight and node_weight
    :return: the weighted UCCA-MTE score
    """

    # return weight of a word based on its path tags
    def find_score(core_set: dict, tagchain: list):
        if tagchain[0] not in core_set:
            return 0
        return core_set[tagchain[0]]

    # extract word nodes from UCCA representations
    if reference_passage is None or candidate_passage is None:
        reference_passage, candidate_passage = tuple(
            ucca_parse_sentences([reference, candidate], 'models/ucca-bilstm'))

    if type(reference_passage) is NoSentence or type(
            candidate_passage) is NoSentence:
        return 0

    reference_terminals = [
        node for node in extract_terminals(reference_passage)
    ]
    candidate_terminals = [
        node for node in extract_terminals(candidate_passage)
    ]

    core_set = {
        'P': 1,
        'S': 1,
        'A': 1,
        'C': 1
    }  # semantic role tag set of semantic core words

    # define core POSs
    def good_pos(s: str):
        pos = ['V', 'N', 'PRP', 'WP']
        return any([s.startswith(p) for p in pos])

    # POS tagging
    if pos:
        reference_pos = pos_tag(
            [node.text for node in filter(lambda x: x, reference_terminals)])
        candidate_pos = pos_tag(
            [node.text for node in filter(lambda x: x, candidate_terminals)])
        for i in range(len(reference_terminals)):
            if reference_terminals[i] is None:
                reference_pos.insert(i, ("", ""))
        for i in range(len(candidate_terminals)):
            if candidate_terminals[i] is None:
                candidate_pos.insert(i, ("", ""))

    # find core words
    reference_cores = {}
    for i in range(len(reference_terminals)):
        if reference_terminals[i]:
            tags, parents = align.find_ancester(
                reference_terminals[i])  # get path tags
            if not pos:
                # determine core by UCCA tags
                if len(set(tags[0][0:1]) - core_set.keys()) == 0:
                    reference_cores[i] = (reference_terminals[i],
                                          find_score(core_set,
                                                     tags[0]), tags, parents)
            else:
                # determine core by POS tags
                if good_pos(reference_pos[i][1]):
                    reference_cores[i] = (reference_terminals[i], 1, tags,
                                          parents)
    candidate_cores = {}
    for i in range(len(candidate_terminals)):
        if candidate_terminals[i]:
            tags, parents = align.find_ancester(candidate_terminals[i])
            if not pos:
                if len(set(tags[0][0:1]) - core_set.keys()) == 0:
                    candidate_cores[i] = (candidate_terminals[i],
                                          find_score(core_set,
                                                     tags[0]), tags, parents)
            else:
                if good_pos(candidate_pos[i][1]):
                    candidate_cores[i] = (candidate_terminals[i], 1, tags,
                                          parents)

    # get stems of core words
    stemmer = PorterStemmer()
    reference_stems = Counter([
        stemmer.stem(core[0].text.lower())
        for core in reference_cores.values()
    ])
    candidate_stems = Counter([
        stemmer.stem(core[0].text.lower())
        for core in candidate_cores.values()
    ])

    # compute matching proportion
    reference_count = 0
    for k, v in reference_stems.items():
        reference_count += min(v, candidate_stems.get(k, 0))
    reference_core_score = reference_count / max(len(reference_cores), 1)
    candidate_count = 0
    for k, v in candidate_stems.items():
        candidate_count += min(v, reference_stems.get(k, 0))
    candidate_core_score = candidate_count / max(len(candidate_cores), 1)

    # compute F1
    if reference_core_score + candidate_core_score == 0:
        core_score = 0.5
    else:
        core_score = 2 * reference_core_score * candidate_core_score / (
            reference_core_score + candidate_core_score)

    # calibration
    core_score = calibrate_ucca_single(core_score, reference, candidate,
                                       reference_passage, candidate_passage,
                                       **kwargs)
    return core_score
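A hedged usage sketch: with reference_passage and candidate_passage left as None, ucca_mod parses both sentences itself using the 'models/ucca-bilstm' path hard-coded above; the sentences and weight values below are illustrative only:

score = ucca_mod("the cat sat on the mat",
                 "a cat is sitting on the mat",
                 pos=False,
                 length_weight=1.0, scene_weight=1.0,  # kwarg names come from the docstring;
                 edge_weight=1.0, node_weight=1.0)     # the values here are placeholders
print("UCCA-MTE core score:", score)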