def align_yields(p1, p2, align_func=None):
    """Finds the best alignment of words from two passages.

    Note: this function is symmetrical; consider using reverse_mapping
    instead of calling it twice.

    Returns an iterator of tuples (i, j) mapping from i - p1 positions
    to j - aligned p2 positions.
    """
    positions1, positions2 = break2common_sentences(p1, p2)
    terminals1 = extract_terminals(p1)
    terminals2 = extract_terminals(p2)
    # map the words in each sentence to each other
    if len(positions1) == len(positions2):
        mapping = set()
        sentence_start1 = 0
        sentence_start2 = 0
        for i in range(len(positions1)):
            sentence1 = terminals1[sentence_start1:positions1[i]]
            sentence2 = terminals2[sentence_start2:positions2[i]]
            for (j, k) in align(sentence1, sentence2, False, align_func)[1]:
                if j != -1:
                    j += sentence_start1
                if k != -1:
                    k += sentence_start2
                mapping.add((j, k))
            sentence_start1 = positions1[i]
            sentence_start2 = positions2[i]
        return mapping
    else:
        print("Error: number of sentences acquired from break2common_sentences does not match")
def align_yields(p1, p2):
    """Finds the best alignment of words from two passages.

    Note: this function is symmetrical; consider using reverse_mapping
    instead of calling it twice.

    Returns an iterator of tuples (i, j) mapping from i - p1 positions
    to j - aligned p2 positions.
    """
    positions1, positions2 = break2common_sentences(p1, p2)
    terminals1 = extract_terminals(p1)
    terminals2 = extract_terminals(p2)
    # map the words in each sentence to each other
    if len(positions1) == len(positions2):
        mapping = set()
        sentence_start1 = 0
        sentence_start2 = 0
        for i in range(len(positions1)):
            sentence1 = terminals1[sentence_start1:positions1[i]]
            sentence2 = terminals2[sentence_start2:positions2[i]]
            for (j, k) in align(sentence1, sentence2, False)[1]:
                if j != -1:
                    j += sentence_start1
                if k != -1:
                    k += sentence_start2
                mapping.add((j, k))
            sentence_start1 = positions1[i]
            sentence_start2 = positions2[i]
        return mapping
    else:
        print("Error: number of sentences acquired from break2common_sentences does not match")
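# Hypothetical usage sketch (passage_a and passage_b are assumed to be two parsed
# ucca Passage objects covering the same text):
word2word = align_yields(passage_a, passage_b)
for i, j in sorted(word2word):
    print("terminal", i, "in passage_a aligns with terminal", j, "in passage_b")
# The alignment is symmetrical, so the opposite direction is available without recomputing:
# reverse_mapping(word2word) == align_yields(passage_b, passage_a)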
def split(self, passage):
    ends = []
    ids = []
    tokens = []
    for terminal in extract_terminals(passage):
        tokens.append(terminal.text)
        sentence = " ".join(tokens)
        # if len(tokens) > max(map(len, map(str.split, sentence_to_index))):
        #     raise ValueError("Failed matching '%s'" % sentence)
        if self.index is not None and self.index < len(self.sentences) and \
                self.sentences[self.index].startswith(sentence):
            # Try matching next sentence rather than shortest
            index = self.index if self.sentences[self.index] == sentence else None
        else:
            index = self.index = self.sentence_to_index.get(sentence)
        if index is not None:
            self.matched_indices.add(index)
            ends.append(terminal.position)
            ids.append(str(index))
            tokens = []
            self.index += 1
    return split_passage(passage, ends, ids=ids if self.enumerate else None,
                         suffix_format=self.suffix_format, suffix_start=self.suffix_start)
def split(self, passage):
    ends = []
    ids = []
    token_lists = []
    for terminal in extract_terminals(passage):
        token_lists.append([])
        for terminals in token_lists if self.index is None else [token_lists[0]]:
            terminals.append(terminal)
            sentence = " ".join(t.text for t in terminals)
            if self.index is not None and self.index < len(self.sentences) and \
                    self.sentences[self.index].startswith(sentence):
                # Try matching next sentence rather than shortest
                index = self.index if self.sentences[self.index] == sentence else None
            else:
                indices = self.sentence_to_index.get(sentence)
                index = self.index = indices.pop(0) if indices else None
            if index is not None:
                self.matched_indices.add(index)
                last_end = terminals[0].position - 1
                if len(terminals) > 1 and last_end and last_end not in ends:
                    ends.append(last_end)
                ends.append(terminal.position)
                ids.append(str(index))
                token_lists = []
                self.index += 1
                break
    return split_passage(passage, ends, ids=ids if self.enumerate else None,
                         suffix_format=self.suffix_format, suffix_start=self.suffix_start)
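# Minimal constructor sketch consistent with the attributes the split() method above uses
# (self.sentences, self.sentence_to_index, self.index, self.matched_indices, self.enumerate,
# self.suffix_format, self.suffix_start). The class name and parameter names below are
# assumptions for illustration, not the original definition.
from collections import defaultdict

class SentenceSplitter:
    def __init__(self, sentences, enumerate_ids=False, suffix_format="%03d", suffix_start=0):
        self.sentences = sentences                  # target sentences, as space-joined token strings
        self.sentence_to_index = defaultdict(list)  # sentence text -> indices where it occurs
        for i, sentence in enumerate(sentences):
            self.sentence_to_index[sentence].append(i)
        self.index = None                           # index of the sentence currently being matched
        self.matched_indices = set()                # indices of sentences matched so far
        self.enumerate = enumerate_ids
        self.suffix_format = suffix_format
        self.suffix_start = suffix_start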
def get_lowest_fn(p):
    """Finds the FNs (foundational nodes) that have terminals as children."""
    s = set()
    for term in extract_terminals(p):
        s.update([edge.parent for edge in term.incoming if is_foundational(edge.parent)])
    return s
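# Hypothetical usage sketch (`passage` is assumed to be a parsed ucca Passage):
# collect the lowest foundational nodes that directly cover the passage's terminals.
lowest_nodes = get_lowest_fn(passage)
print(len(lowest_nodes), "lowest foundational nodes found")
for node in lowest_nodes:
    print(node)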
def split(passage, order):
    ends = []
    ids = []
    sentence = []
    for terminal in extract_terminals(passage):
        sentence.append(terminal.text)
        # if len(sentence) > max(map(len, map(str.split, order))):
        #     raise ValueError("Failed matching '%s'" % " ".join(sentence))
        index = order.get(" ".join(sentence))
        if index is not None:
            ends.append(terminal.position)
            ids.append(str(index))
            sentence = []
    return split_passage(passage, ends, ids=ids)
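# Hypothetical usage sketch: `order` maps each complete sentence (terminal texts joined
# with single spaces) to its index, so split() emits a passage boundary whenever the
# accumulated tokens spell out one of the target sentences. The sentences below are
# illustrative only.
target_sentences = ["John gave everything up .", "Why ?"]
order = {sentence: i for i, sentence in enumerate(target_sentences)}
# sub_passages = split(passage, order)   # `passage` is assumed to be a parsed ucca Passage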
def main():
    print(align.align("what has is by the meaning of the word is",
                      "what is the men for the wk is are be"))
    # read xml files
    print("reading db xmls")
    p = []
    for filename in filenames:
        with open(add_path(filename), "rb") as fl:
            p += pickle.load(fl)[0]
        print("read ", filename, " it starts with ",
              tuple(term.text for term in
                    textutil.extract_terminals(convert.from_site(p[-1]))[:6]))
    # convert xml to passages
    p = list(map(convert.from_site, p))
    print("reading passage xmls")
    # read passage files
    for filename in passage_filenames:
        print("reading " + filename)
        if os.path.isfile(add_path(os.path.splitext(filename)[0] + ".pkl")):
            with open(add_path(os.path.splitext(filename)[0] + ".pkl"), "rb") as fl:
                p.append(pickle.load(fl))
        else:
            p.append(file2passage(add_path(filename)))
            with open(add_path(os.path.splitext(filename)[0] + ".pkl"), "wb") as fl:
                pickle.dump(p[-1], fl)
            print("dumping", add_path(os.path.splitext(filename)[0] + ".pkl"))
    all_filenames = filenames + passage_filenames
    print("read ", all_filenames)
    word2word = align.align_yields(p[0], p[1])
    assert align.reverse_mapping(word2word) == align.align_yields(p[1], p[0]), \
        "align_yields asymmetrical"
    # create similarity matrix
    sources = []
    goals = []
    names = []
    i = 0
    while i < len(p):
        names.append(all_filenames[i])
        sources.append(p[i])
        i += 1
        goals.append(p[i])
        i += 1
    chunksize = 1
    if len(goals) > 100:
        chunksize = int(len(goals) / POOL_SIZE / 10)
    print("multithreading with chunksize", chunksize)
    pool = Pool(POOL_SIZE)
    if r2s:
        results = pool.starmap(distances, zip(goals, sources, names), chunksize)
    else:
        results = pool.starmap(distances, zip(sources, goals, names), chunksize)
    print(results)
    pool.close()
    pool.join()
    sym_mat = []
    keys = []
    for row, key in results:
        keys.append(key)
        sym_mat.append(row)
    print("functions and matrix")
    print(funcs + keys)
    for item in sym_mat:
        print(item)
    print("overall token analysis")
    print(align.token_level_analysis(p))
    output_path = trial_name + "output.csv"
    with open(output_path, "w") as f:
        print("writing output to " + output_path)
        writer = csv.writer(f)
        writer.writerows(sym_mat)
    send_mail("*****@*****.**", "finished", os.path.abspath(output_path))
    return
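# Sketch of the worker contract main() relies on: each starmap call invokes
# distances(source, goal, name) and expects back a (row, key) pair, where `row` holds one
# similarity value per scoring function and `key` labels the compared passage pair.
# The body below is a hypothetical placeholder, not the original implementation;
# `funcs` is assumed to be the module-level list of scoring functions printed by main().
def distances(source_passage, goal_passage, name):
    row = [func(source_passage, goal_passage) for func in funcs]
    return row, name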
def ucca_mod(reference, candidate, reference_passage=None, candidate_passage=None, pos=False, **kwargs):
    """
    :param reference: reference sentence: string
    :param candidate: candidate sentence: string
    :param reference_passage: UCCA representation of reference sentence
    :param candidate_passage: UCCA representation of candidate sentence
    :param pos: Use POS instead of UCCA to determine core words. default: False
    :param kwargs: kwargs used in calibration (call calibrate_ucca_single), including
        length_weight, scene_weight, edge_weight and node_weight
    :return: the weighted UCCA-MTE score
    """
    # return weight of a word based on its path tags
    def find_score(core_set: dict, tagchain: list):
        if tagchain[0] not in core_set:
            return 0
        return core_set[tagchain[0]]

    # extract word nodes from UCCA representations
    if reference_passage is None or candidate_passage is None:
        reference_passage, candidate_passage = tuple(
            ucca_parse_sentences([reference, candidate], 'models/ucca-bilstm'))
    if type(reference_passage) is NoSentence or type(candidate_passage) is NoSentence:
        return 0
    reference_terminals = [node for node in extract_terminals(reference_passage)]
    candidate_terminals = [node for node in extract_terminals(candidate_passage)]
    core_set = {'P': 1, 'S': 1, 'A': 1, 'C': 1}  # semantic role tag set of semantic core words

    # define core POSs
    def good_pos(s: str):
        pos = ['V', 'N', 'PRP', 'WP']
        return any([s.startswith(p) for p in pos])

    # POS tagging
    if pos:
        reference_pos = pos_tag([node.text for node in filter(lambda x: x, reference_terminals)])
        candidate_pos = pos_tag([node.text for node in filter(lambda x: x, candidate_terminals)])
        for i in range(len(reference_terminals)):
            if reference_terminals[i] is None:
                reference_pos.insert(i, ("", ""))
        for i in range(len(candidate_terminals)):
            if candidate_terminals[i] is None:
                candidate_pos.insert(i, ("", ""))

    # find core words
    reference_cores = {}
    for i in range(len(reference_terminals)):
        if reference_terminals[i]:
            tags, parents = align.find_ancester(reference_terminals[i])  # get path tags
            if not pos:  # determine core by UCCA tags
                if len(set(tags[0][0:1]) - core_set.keys()) == 0:
                    reference_cores[i] = (reference_terminals[i], find_score(core_set, tags[0]), tags, parents)
            else:  # determine core by POS tags
                if good_pos(reference_pos[i][1]):
                    reference_cores[i] = (reference_terminals[i], 1, tags, parents)
    candidate_cores = {}
    for i in range(len(candidate_terminals)):
        if candidate_terminals[i]:
            tags, parents = align.find_ancester(candidate_terminals[i])
            if not pos:
                if len(set(tags[0][0:1]) - core_set.keys()) == 0:
                    candidate_cores[i] = (candidate_terminals[i], find_score(core_set, tags[0]), tags, parents)
            else:
                if good_pos(candidate_pos[i][1]):
                    candidate_cores[i] = (candidate_terminals[i], 1, tags, parents)

    # get stems of core words
    stemmer = PorterStemmer()
    reference_stems = Counter([stemmer.stem(core[0].text.lower()) for core in reference_cores.values()])
    candidate_stems = Counter([stemmer.stem(core[0].text.lower()) for core in candidate_cores.values()])

    # compute matching proportion
    reference_count = 0
    for k, v in reference_stems.items():
        reference_count += min(v, candidate_stems.get(k, 0))
    reference_core_score = reference_count / max(len(reference_cores), 1)
    candidate_count = 0
    for k, v in candidate_stems.items():
        candidate_count += min(v, reference_stems.get(k, 0))
    candidate_core_score = candidate_count / max(len(candidate_cores), 1)

    # compute F1
    if reference_core_score + candidate_core_score == 0:
        core_score = 0.5
    else:
        core_score = 2 * reference_core_score * candidate_core_score / (
            reference_core_score + candidate_core_score)

    # calibration
    core_score = calibrate_ucca_single(core_score, reference, candidate,
                                       reference_passage, candidate_passage, **kwargs)
    return core_score
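# Hypothetical usage sketch: score a candidate translation against its reference.
# When the two passages are omitted, ucca_mod parses the sentences itself; precomputing
# them (e.g. with ucca_parse_sentences, as above) avoids re-parsing. Sentences are
# illustrative only.
reference = "The cat sat on the mat ."
candidate = "A cat was sitting on the mat ."
score = ucca_mod(reference, candidate)  # parses internally
# ref_p, cand_p = ucca_parse_sentences([reference, candidate], 'models/ucca-bilstm')
# score = ucca_mod(reference, candidate, reference_passage=ref_p, candidate_passage=cand_p)
print(score)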