def padding_leaves(tree):
    """Tag every leaf with its index and give each leaf a node of its own.

    The tree is modified in place in two passes:

    1. Leaf ``i`` is rewritten as ``"<i, zero-padded to 3 digits>||||<leaf>"``.
    2. Every leaf whose parent has more than one child is wrapped in a new
       ``Tree`` that carries the parent's label and holds only that leaf.

    Args:
        tree: A mutable tree exposing ``leaves()``, ``leaf_treeposition(i)``
            and tuple-position indexing (presumably an ``nltk.Tree``).

    Returns:
        None. ``tree`` is mutated.
    """
    # Leaf positions are computed once.  Neither pass changes the position of
    # any *other* leaf (prefixing keeps the slot; wrapping only deepens the
    # wrapped leaf), so the positions stay valid for the whole function.
    positions = [
        tree.leaf_treeposition(i) for i in range(len(tree.leaves()))
    ]

    for index, position in enumerate(positions):
        tree[position] = "{0:03}".format(index) + "||||" + tree[position]

    for position in positions:
        parent = tree[position[:-1]]
        if len(parent) > 1:
            tree[position] = Tree(parent.label(), [tree[position]])
def iornnFromTree(tree, vocabulary, grammarBased=False):
    """Recursively convert a parse tree into ``IORNN`` nodes and leaves.

    Args:
        tree: Parse tree exposing ``height()``, ``label()``, ``leaves()`` and
            iteration over children.
        vocabulary: Object with an ``index(word)`` method; words that cannot
            be looked up are mapped to index 0.
        grammarBased: When true, an inner node's category is the production
            ``"<label> -> <child labels>"``; otherwise it is
            ``'composition'``.

    Returns:
        An ``IORNN.Node`` for trees higher than 2, otherwise an
        ``IORNN.Leaf`` for the (lower-cased) word under the preterminal.
    """
    if tree.height() > 2:
        if grammarBased:
            cat = tree.label() + ' -> ' + ' '.join(
                child.label() for child in tree)
        else:
            cat = 'composition'
        children = [
            iornnFromTree(child, vocabulary, grammarBased) for child in tree
        ]
        return IORNN.Node(children, cat, 'tanh', 'tanh')

    # Preterminal node.
    words = tree.leaves()
    if len(words) == 1:
        word = words[0].lower()
    else:
        # Single-argument print with %-formatting produces the same output
        # on Python 2 and Python 3.
        print('Not exactly one leaf?! %s' % (tree,))
        word = 'UNK'

    try:
        index = vocabulary.index(word)
    except Exception:
        # Best-effort lookup: unknown words fall back to index 0.  Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        index = 0
    return IORNN.Leaf('word', index, 'tanh', word)
def tree_string_to_symbols(tree_string,
                           remove_root=True,
                           no_collapse=False,
                           **kwargs):
    """Return the leaves of a bracketed tree followed by its node labels.

    The string is parsed with ``tree_from_string``; labels are collected in
    breadth-first order and include preterminal labels.  ``remove_root``,
    ``no_collapse`` and ``kwargs`` are accepted but not used.

    Returns:
        list: ``tree.leaves()`` concatenated with the breadth-first labels.
    """
    root = tree_from_string(tree_string)
    words = root.leaves()

    symbols = []
    # A growing list with a read cursor acts as the FIFO for the traversal.
    frontier = [root]
    cursor = 0
    while cursor < len(frontier):
        current = frontier[cursor]
        cursor += 1
        symbols.append(current.label())
        # A preterminal has a single string child: nothing to expand.
        if len(current) == 1 and isinstance(current[0], str):
            continue
        frontier.extend(
            kid for kid in current if isinstance(kid, nltk.Tree))

    return words + symbols
def orderSentence(tree, printThings, order="mixed"):
    """Attach dependency children to each token and linearize the tree.

    Args:
        tree: A ``(constituency_tree, sentence)`` pair; ``sentence`` is a
            list of dicts with at least ``"dep"``, ``"head"`` (1-based) and
            ``"word"`` keys.
        printThings: Forwarded to ``orderSentenceRec``.
        order: Forwarded to ``orderSentenceRec`` as ``order=``.

    Returns:
        list: The ``linearized`` list that was handed to
        ``orderSentenceRec``.
    """
    linearized = []
    tree, sentence = tree

    # Record, on every head token, the indices of its dependents.
    for position, token in enumerate(sentence):
        if token["dep"] == "root":
            continue
        governor = sentence[token["head"] - 1]
        governor.setdefault("children", []).append(position)

    end, incoming, outgoing = numberSpans(tree, 0, sentence)
    assert len(incoming) == 1, incoming
    assert len(outgoing) == 0, outgoing
    if end != len(sentence):
        # Tree and dependency tokenisation disagree: dump both for debugging.
        print(tree.leaves())
        print([entry["word"] for entry in sentence])

    orderSentenceRec(tree, sentence, printThings, linearized, order=order)
    return linearized
def rnnFromTree(tree, vocabulary, wordReduction=False, grammarBased=False):
    """Recursively convert a parse tree into ``Node``/``Leaf`` objects.

    Args:
        tree: Parse tree exposing ``height()``, ``label()``, ``leaves()`` and
            iteration over children.
        vocabulary: Object with an ``index(word)`` method; words that cannot
            be looked up are mapped to index 0.
        wordReduction: When true, every leaf is wrapped in an extra unary
            ``Node`` that reduces high-dimensional words to the
            dimensionality of the inner representations.
        grammarBased: When true, categories are derived from the tree labels
            instead of the generic ``'composition'`` / ``'preterminal'``.

    Returns:
        A ``Node`` for inner trees (and for leaves when ``wordReduction`` is
        set), otherwise a ``Leaf``.
    """
    if tree.height() > 2:
        if grammarBased:
            cat = tree.label() + ' -> ' + ' '.join(
                child.label() for child in tree)
        else:
            cat = 'composition'
        # grammarBased must be forwarded; otherwise only the root would get a
        # grammar-based category and all descendants the generic one.
        children = [
            rnnFromTree(child, vocabulary, wordReduction, grammarBased)
            for child in tree
        ]
        return Node(children, cat, 'tanh')

    # Preterminal node.
    words = tree.leaves()
    if len(words) == 1:
        word = words[0]
    else:
        # Previously a no-op expression that left `word` unbound and made
        # the Leaf construction below fail.
        print('Not exactly one leaf?! %s' % (tree,))
        word = 'UNK'

    try:
        index = vocabulary.index(word)
    except Exception:
        # Best-effort lookup: unknown words fall back to index 0.
        index = 0
    leaf = Leaf('word', index, word)

    if not wordReduction:
        return leaf
    cat = tree.label() if grammarBased else 'preterminal'
    return Node([leaf], cat, 'tanh')
def words_in_tree(self, tree, token_sequence):
    """Check whether the token sequence appears contiguously in the tree.

    Leaves and tokens are each joined with single spaces and matched on
    whole-token boundaries, so ``["he", "cat"]`` does not match the leaves
    ``["the", "cat"]``.

    Args:
        tree: Object exposing ``leaves()`` that returns a list of strings.
        token_sequence: Iterable of objects with a ``token`` string
            attribute.

    Returns:
        bool: True if the sequence occurs in the leaves; an empty sequence
        is considered present.
    """
    tree_words = " ".join(tree.leaves())
    target = " ".join([token.token for token in token_sequence])
    if not target:
        # Keeps the historical result for an empty sequence.
        return True
    # Padding both sides with a space anchors the match at token boundaries.
    return (" " + target + " ") in (" " + tree_words + " ")
def tree_to_leave_pos_node_span(tree):
    """Split a tree into leaves, POS tags, phrase labels and phrase spans.

    Nodes are visited breadth-first.  A node with a single string child is a
    preterminal and contributes its label to ``pos_tags``; every other node
    is recorded together with the span returned by ``leaves2span``.

    Returns:
        tuple: ``(leaves, pos_tags, nodes, spans, tree_node_lst)`` where
        ``nodes`` holds the labels of ``tree_node_lst``.
    """
    leaves = tree.leaves()
    pos_tags, tree_node_lst, spans = [], [], []

    # A growing list with a read cursor acts as the FIFO for the traversal.
    frontier = [tree]
    cursor = 0
    while cursor < len(frontier):
        current = frontier[cursor]
        cursor += 1

        if len(current) <= 0:
            warnings.warn("[bft]: len(node) <= 0!! will cause error later")

        if len(current) == 1 and isinstance(current[0], str):
            pos_tags.append(current.label())
            continue

        tree_node_lst.append(current)
        spans.append(leaves2span(current.leaves(), leaves))
        frontier.extend(
            kid for kid in current if isinstance(kid, nltk.Tree))

    nodes = [phrase.label() for phrase in tree_node_lst]
    return leaves, pos_tags, nodes, spans, tree_node_lst
def tree_to_leave_pos_node_span_collapse(tree):
    """Like a breadth-first leaf/POS/phrase split, collapsing unary chains.

    Preterminals (one string child) contribute their label to ``pos_tags``.
    Any other node with exactly one ``nltk.Tree`` child is collapsed in
    place: it takes over the child's label and children, repeatedly, before
    being recorded with its ``leaves2span`` span.  The input tree is mutated.

    Returns:
        tuple: ``(leaves, pos_tags, nodes, spans, tree_node_lst)``.
    """
    leaves = tree.leaves()
    pos_tags, tree_node_lst, spans = [], [], []

    frontier = [tree]
    cursor = 0
    while cursor < len(frontier):
        current = frontier[cursor]
        cursor += 1

        if len(current) == 1 and isinstance(current[0], str):
            pos_tags.append(current.label())
            continue

        # Absorb unary children until the node branches or hits a word.
        while len(current) == 1 and isinstance(current[0], nltk.Tree):
            only_child = current[0]
            current.set_label(only_child.label())
            current[0:] = list(only_child)

        tree_node_lst.append(current)
        spans.append(leaves2span(current.leaves(), leaves))
        frontier.extend(
            kid for kid in current if isinstance(kid, nltk.Tree))

    nodes = [phrase.label() for phrase in tree_node_lst]
    return leaves, pos_tags, nodes, spans, tree_node_lst
def build_tree(tree, parent=None):
    """Mirror a binary parse tree as linked ``Node`` objects.

    A tree of length 1 becomes a leaf node carrying the first leaf as
    ``word``; any other tree becomes an inner node whose ``left`` and
    ``right`` are built from children 0 and 1 (further children are not
    read).

    Args:
        tree: Indexable tree exposing ``leaves()``.
        parent: Passed to the ``Node`` constructor.

    Returns:
        The ``Node`` created for ``tree``.
    """
    is_leaf = len(tree) == 1
    node = Node(parent)
    node.isLeaf = is_leaf
    if is_leaf:
        node.word = tree.leaves()[0]
        return node
    node.left = build_tree(tree[0], node)
    node.right = build_tree(tree[1], node)
    return node
def tree_to_leave_pos_node_span_collapse_v2(tree):
    """Breadth-first leaf/POS/phrase split with positional span bookkeeping.

    Unary chains of ``nltk.Tree`` nodes are collapsed in place (the input
    tree is mutated).  Spans are not looked up from the leaves; they are
    derived from a running ``start`` offset: the first visited node spans
    the whole sentence, each later node spans ``start`` to
    ``start + leaf_count - 1``, and ``start`` wraps to 0 once it reaches the
    sentence length.  The function also prints the result sizes, the tree
    and every ``[label]: span`` pair.

    Returns:
        tuple: ``(leaves, pos_tags, nodes, spans, tree_node_lst)``.
    """
    leaves = tree.leaves()
    len_leave = len(leaves)
    pos_tags, tree_node_lst, spans = [], [], []

    frontier = [tree]
    cursor = 0
    root_seen = False
    start = 0
    while cursor < len(frontier):
        node = frontier[cursor]
        cursor += 1

        # Absorb unary children until the node branches or hits a word.
        while len(node) == 1 and isinstance(node[0], nltk.Tree):
            only_child = node[0]
            node.set_label(only_child.label())
            node[0:] = list(only_child)

        width = len(node.leaves())
        if not root_seen:
            _span = [start, len_leave - 1]
            root_seen = True
        else:
            _span = [start, start + width - 1]
            start += width
            if start >= len_leave:
                # The current level is exhausted; restart at the left edge.
                start = 0

        if len(node) == 1 and isinstance(node[0], str):
            pos_tags.append(node.label())
            continue

        tree_node_lst.append(node)
        spans.append(_span)
        frontier.extend(kid for kid in node if isinstance(kid, nltk.Tree))

    nodes = [phrase.label() for phrase in tree_node_lst]
    print(f'{len(spans)}, {len(nodes)}')
    tree.pretty_print()
    for n, s in zip(nodes, spans):
        print(f'[{n}]: {s}')
    return leaves, pos_tags, nodes, spans, tree_node_lst
def orderSentence(tree, printThings):
    """Attach dependency children to each token and return the binarized tree.

    Args:
        tree: A ``(constituency_tree, sentence)`` pair; ``sentence`` is a
            list of dicts with at least ``"dep"``, ``"head"`` (1-based) and
            ``"word"`` keys.
        printThings: Forwarded to ``orderSentenceRec``.

    Returns:
        Whatever ``binarize`` returns for the result of
        ``orderSentenceRec``.
    """
    linearized = []
    tree, sentence = tree

    # Record, on every head token, the indices of its dependents.
    for position, token in enumerate(sentence):
        if token["dep"] == "root":
            continue
        governor = sentence[token["head"] - 1]
        governor.setdefault("children", []).append(position)

    end, incoming, outgoing = numberSpans(tree, 0, sentence)
    assert len(incoming) == 1, incoming
    assert len(outgoing) == 0, outgoing
    if end != len(sentence):
        # Tree and dependency tokenisation disagree: dump both for debugging.
        print(tree.leaves())
        print([entry["word"] for entry in sentence])

    ordered = orderSentenceRec(tree, sentence, printThings, linearized)
    return binarize(ordered)
def get_PTP(pair, parsed_sentences):
    """Return the phrase labels on the tree path between two mentions.

    The path runs from the first mention's leaf up to the lowest common
    ancestor of both leaves and down to the second mention's leaf.

    Args:
        pair: Object with ``first`` and ``second`` mentions; each mention
            provides ``offsets`` (``offsets[0]`` is the leaf index) and
            ``first.sentenceID`` selects the sentence.
        parsed_sentences: Mapping/sequence from sentence id to parse tree.

    Returns:
        list: The distinct node labels on the connecting path (unordered,
        because duplicates are removed through a set).
    """

    def node_label(node):
        # NLTK 3 exposes label(); older releases expose the `node` attribute.
        label = getattr(node, "label", None)
        return label() if callable(label) else node.node

    def walk(start, path):
        """Follow `path` from `start`; return (end node, labels visited)."""
        node = start
        labels = [node_label(node)]
        for step in path:
            node = node[step]
            if isinstance(node, nltk.tree.Tree):
                labels.append(node_label(node))
        return node, labels

    m1_index = pair.first.offsets[0]
    m2_index = pair.second.offsets[0]
    tree = parsed_sentences[pair.first.sentenceID]
    if m2_index >= len(tree.leaves()):
        m2_index -= 1

    path1 = list(tree.leaf_treeposition(m1_index))
    path2 = list(tree.leaf_treeposition(m2_index))

    shared = 0
    for step1, step2 in zip(path1, path2):
        if step1 != step2:
            break
        shared += 1
    # When both mentions are the same leaf the whole path is shared; stop one
    # step early so the common ancestor is a tree node, not the leaf string.
    shared = min(shared, len(path1) - 1, len(path2) - 1)

    ancestor = walk(tree, path1[:shared])[0]
    upward = walk(ancestor, path1[shared:])[1]
    downward = walk(ancestor, path2[shared:])[1]

    # `upward` is reversed so it reads leaf -> ancestor; `downward` already
    # reads ancestor -> leaf.  Both contain the ancestor label, so it is
    # dropped from `downward`.  (Reversing both lists, as before, compared the
    # ancestor with the second leaf's preterminal, and the resulting
    # ValueError was constructed but never raised, so None was returned.)
    upward.reverse()
    return list(set(upward + downward[1:]))
def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans):
    """Store one verb's argument structure for a parsed sentence.

    Args:
        tree: Parse tree of the sentence; must expose ``leaves()``.
        verb_head: Word index of the head word of the verb (e.g. the index
            of 'turn' for the verb 'turn on').
        verb_stem: Stored as-is on the instance.
        roleset: Stored as-is on the instance.
        tagged_spans: List of ``((start, end), tag)`` tuples giving the
            location and type of each argument and of the verb pieces.
    """
    # Word indices of the words that make up the verb; more than one index
    # for multi-word verbs (e.g. 'turn on').
    self.verb = []
    self.verb_head = verb_head
    self.verb_stem = verb_stem
    self.roleset = roleset
    # (argspan, argid) tuples; argspan is (start, end) and denotes
    # words[start:end].
    self.arguments = []
    self.tagged_spans = tagged_spans
    # The parse tree and the words of the sentence containing this instance.
    self.tree = tree
    self.words = tree.leaves()

    # Spans tagged as verb pieces extend self.verb; all others are arguments.
    for span, tag in tagged_spans:
        start, end = span
        if tag in ('V', 'C-V'):
            self.verb.extend(range(start, end))
        else:
            self.arguments.append(((start, end), tag))
def _get_max_depth(tree: tree.Tree, factor: str = 'right') -> float:
    """Return the tree's maximum "turn" count divided by its leaf count.

    The tree is first normalised with ``collapse_unary()`` and
    ``chomsky_normal_form(factor=factor)``.  For every leaf, the path to its
    parent is written as a digit string with a leading ``0``; the leaf's
    depth is the number of places where a ``0`` step is directly followed by
    a non-zero step.  The largest such count is divided by the number of
    leaves.

    NOTE: ``tree`` is modified in place by the two normalisation calls.

    Args:
        tree: Parse tree to measure (mutated).
        factor: Factoring direction handed to ``chomsky_normal_form``.

    Returns:
        float: Maximum depth normalised by the number of leaves.

    Raises:
        ValueError: If no leaf has a positive depth and the tree does not
            have exactly one leaf.
    """
    tree.collapse_unary()
    tree.chomsky_normal_form(factor=factor)
    leaf_positions = tree.treepositions('leaves')

    max_depth = 0
    for leaf_p in leaf_positions:
        p_str = '0' + ''.join(str(x) for x in leaf_p[:-1])
        max_depth = max(max_depth, len(re.findall(r'0[1-9]', p_str)))

    if max_depth == 0 and len(leaf_positions) != 1:
        # Only a single-leaf tree may legitimately have depth 0.  ValueError
        # is still caught by callers that catch Exception.
        raise ValueError(
            'could not determine depth; leaf positions: %r, tree: %s' %
            (leaf_positions, tree))

    # True division: the result is a ratio, not an integer depth.
    return max_depth / len(tree.leaves())
def tree_to_leave_pos_node_span_collapse_v3(tree):
    """Breadth-first leaf/POS/phrase split with spans read from the leaves.

    ``padding_leaves_wnum(leaves, tree)`` is called first; afterwards each
    phrase's span is ``[int(first leaf), int(last leaf)]`` of that phrase
    (presumably the helper rewrites the tree's leaves to their indices —
    confirm against its definition).  Unary chains of ``nltk.Tree`` nodes
    are collapsed in place, so the input tree is mutated.

    Returns:
        tuple: ``(leaves, pos_tags, nodes, spans, tree_node_lst)``.  When no
        phrase node was recorded, ``nodes`` is the root label and ``spans``
        covers the whole sentence.
    """
    leaves = tree.leaves()
    padding_leaves_wnum(leaves, tree)

    pos_tags, tree_node_lst, spans = [], [], []
    frontier = [tree]
    cursor = 0
    while cursor < len(frontier):
        node = frontier[cursor]
        cursor += 1

        # Absorb unary children until the node branches or hits a word.
        while len(node) == 1 and isinstance(node[0], nltk.Tree):
            only_child = node[0]
            node.set_label(only_child.label())
            node[0:] = list(only_child)

        if len(node) == 1 and isinstance(node[0], str):
            pos_tags.append(node.label())
            continue

        covered = node.leaves()
        tree_node_lst.append(node)
        spans.append([int(covered[0]), int(covered[-1])])
        frontier.extend(kid for kid in node if isinstance(kid, nltk.Tree))

    nodes = [phrase.label() for phrase in tree_node_lst]
    if not nodes:
        # Degenerate tree with no phrase nodes: report the root alone.
        nodes = [tree.label()]
        spans = [[0, len(leaves) - 1]]
    return leaves, pos_tags, nodes, spans, tree_node_lst
def orderSentenceRec(tree, sentence, printThings, linearized):
    """Recursively convert an annotated constituency tree into nested dicts.

    Args:
        tree: nltk tree node carrying ``start``, ``end``, ``incoming`` and
            ``outgoing`` attributes (presumably attached by ``numberSpans``
            before this is called -- confirm).
        sentence: List of token dicts with ``"word"``, ``"dep"`` and
            ``"head"`` keys.
        printThings: Only forwarded to the recursive calls.
        linearized: Only forwarded to the recursive calls.

    Returns:
        For a preterminal: ``{"word", "category", "children": None,
        "dependency": "NONE"}``, or ``None`` (implicitly) when the token is
        skipped as punctuation or a trace.  For an inner node:
        ``{"category", "children": [...], "dependency": "NONE"}``, where each
        child dict has its ``"dependency"`` overwritten with the relation
        found for it.

    Side effects:
        Prints diagnostics and, for certain subject relative clauses under
        SBAR, updates the globals ``totalCountRCs`` and
        ``totalCountObjectIsLast``.
    """
    global totalCountRCs
    global totalCountObjectIsLast
    label = tree.label()
    # Labels ending in a digit carry an index suffix; cut at the last "-".
    if label[-1] in "1234567890":
        label = label[:label.rfind("-")]
    children = [child for child in tree]
    if type(children[0]) != nltk.tree.Tree:
        # Preterminal: exactly one non-tree child is expected.
        assert all([type(x) != nltk.tree.Tree for x in children])
        assert len(list(children)) == 1, list(children)
        for c in children:
            # Punctuation, "-"-initial labels and "*-" traces yield no entry;
            # the function then falls through and returns None.
            if label in ["'", ":", "``", ",", "''", "#", ".", "-NONE-"
                         ] or label[0] == "-" or "*-" in c:
                continue
            word = sentence[tree.start]["word"]
            # Report tokens whose dependency word differs from the tree leaf.
            if word != c.lower().replace("\/", "/"):
                print(142, word, c.lower())
            return {
                "word": word,
                "category": label,
                "children": None,
                "dependency": "NONE"
            }
    else:
        assert all([type(x) == nltk.tree.Tree for x in children])
        children = [
            child for child in children if child.start < child.end
        ]  # remove children that consist of gaps or otherwise eliminated tokens
        # find which children seem to be dependents of which other children
        if True or model != "REAL_REAL":  # always taken; `model` is never read
            childDeps = [None for _ in children]
            childHeads = [None for _ in children]
            for i in range(len(children)):
                # Edges that enter this child and also enter the parent, i.e.
                # come from outside the parent constituent.
                incomingFromOutside = [
                    x for x in tree.incoming if x in children[i].incoming
                ]
                if len(incomingFromOutside) > 0:
                    # Element [1] of an edge is used as an index into
                    # `sentence`; the last matching edge wins.
                    childDeps[i] = sentence[incomingFromOutside[-1][1]]["dep"]
                    childHeads[i] = sentence[incomingFromOutside[-1]
                                             [1]]["head"]
                    if len(incomingFromOutside) > 1:
                        print("FROM OUTSIDE", [
                            sentence[incomingFromOutside[x][1]]["dep"]
                            for x in range(len(incomingFromOutside))
                        ])
                for j in range(len(children)):
                    if i == j:
                        continue
                    # Edges going from sibling j into child i.
                    incomingFromJ = [
                        x for x in children[i].incoming
                        if x in children[j].outgoing
                    ]
                    if len(incomingFromJ) > 0:
                        if len(incomingFromJ) > 1:
                            duplicateDeps = tuple([
                                sentence[incomingFromJ[x][1]]["dep"]
                                for x in range(len(incomingFromJ))
                            ])
                            if not (duplicateDeps == ("obj", "xcomp")):
                                print("INCOMING FROM NEIGHBOR", duplicateDeps)
                        # A sibling edge overrides an edge from outside.
                        childDeps[i] = sentence[incomingFromJ[-1][1]]["dep"]
                        childHeads[i] = sentence[incomingFromJ[-1][1]]["head"]
            assert None not in childDeps, (childDeps, children)
            keys = childDeps  # not used below
        childrenLinearized = children  # not used below
        childrenAsTrees = []
        for child, dependency in zip(children, childDeps):
            childrenAsTrees.append(
                orderSentenceRec(child, sentence, printThings, linearized))
            if childrenAsTrees[
                    -1] is None:  # this will happen for punctuation etc
                del childrenAsTrees[-1]
            else:
                childrenAsTrees[-1]["dependency"] = dependency
        # Statistics only: count subject relative clauses whose embedded VP
        # starts with a verb followed by an NP, and how often the VP has
        # exactly those two children.
        if label == "SBAR":
            if len(childrenAsTrees) > 1:
                if len(childrenAsTrees
                       ) == 2 and childrenAsTrees[0]["category"] in [
                           "IN", "WHNP"
                       ] and childrenAsTrees[1][
                           "category"] == "S" and childrenAsTrees[1][
                               "dependency"] == "acl:relcl":  # Relative clause
                    if childrenAsTrees[0][
                            "dependency"] == "nsubj":  # SUBJECT Relatives
                        if sentence[childHeads[1] -
                                    1]["dep"] in ["nsubj", "obj"]:
                            if sentence[childHeads[1] - 1]["dep"] == "nsubj":
                                leaves = [
                                    x for x in tree.leaves()
                                    if not (x.startswith("*T*")
                                            or x.startswith("*U*"))
                                ]  # not used below
                                # .index raises ValueError if the clause has
                                # no VP child.
                                firstVP = [
                                    x["category"]
                                    for x in childrenAsTrees[1]["children"]
                                ].index("VP")
                                childrenInTheVP = [
                                    x["category"] for x in childrenAsTrees[1]
                                    ["children"][firstVP]["children"]
                                ]
                                print("CHILDREN IN THE VP", childrenInTheVP)
                                if len(childrenInTheVP
                                       ) > 1 and childrenInTheVP[0].startswith(
                                           "VB"
                                       ) and childrenInTheVP[1] == "NP":
                                    totalCountRCs += 1
                                    totalCountObjectIsLast += (
                                        1 if len(childrenInTheVP) == 2 else 0)
                                    print(totalCountObjectIsLast /
                                          float(totalCountRCs),
                                          totalCountRCs)  # about 63%
        return {
            "category": label,
            "children": childrenAsTrees,
            "dependency": "NONE"
        }
def containsPhrase(tree, phrase):
    """Case-insensitively test whether ``phrase`` occurs in a tree's leaves.

    Plain strings never contain a phrase; for anything else the check is
    delegated to ``regex.containsPhrase`` with ``re.IGNORECASE``.

    Returns:
        ``False`` for ``str`` input, otherwise the delegate's result.
    """
    if not isinstance(tree, str):
        return regex.containsPhrase(phrase,
                                    tree.leaves(),
                                    flags=re.IGNORECASE)
    return False
def extractWords(tree):
    """Return the words of ``tree``, i.e. the result of ``tree.leaves()``."""
    words = tree.leaves()
    return words
def __getitem__(self, index):
    """Return the sentence stored at ``index`` as a space-joined string.

    ``self.data[index]`` is parsed with ``tree_from_string`` and the
    resulting tree's leaves are joined with single spaces.
    """
    parsed = tree_from_string(self.data[index])
    return " ".join(parsed.leaves())
def remap_chars(tree):
    """Replace leaves found in ``SPECIAL_CHAR_MBACK`` by their mapped value.

    The tree is modified in place; leaves that are not keys of
    ``SPECIAL_CHAR_MBACK`` are left untouched.

    Args:
        tree: Mutable tree exposing ``leaves()``, ``leaf_treeposition(i)``
            and tuple-position assignment.

    Returns:
        None.
    """
    # Take the leaves once: replacing one leaf never changes the value or
    # position of another, so re-reading them on every step is unnecessary.
    for index, leaf in enumerate(tree.leaves()):
        if leaf in SPECIAL_CHAR_MBACK:
            tree[tree.leaf_treeposition(index)] = SPECIAL_CHAR_MBACK[leaf]
def get_noun_chunk(tree):
    """Return the concatenated leaves of an ``NP`` tree.

    The leaves are joined without a separator.

    Returns:
        str for a tree labelled ``'NP'``, otherwise ``None``.
    """
    if tree.label() != 'NP':
        return None
    return ''.join(tree.leaves())
def get_noun_chunk(tree):
    """Return the noun chunk of ``tree`` as a list.

    Returns:
        list: A one-element list with the leaves joined without a separator
        when the tree is labelled ``"NP"``; otherwise an empty list.
    """
    if tree.label() == "NP":
        return [''.join(tree.leaves())]
    return []
def write_to_file(self, corpus_path, metadata_path, target_folder_path,
                  ranges, errorLog):
    """Write word-to-tree-leaf maps for every transcript that has trees.

    For each utterance, every word of ``utt.text_words()`` is paired with a
    list of ``(tree index, leaf index)`` tuples locating it in the
    utterance's trees (the list is empty for annotation tokens and for
    words that cannot be matched).  One file
    ``<target_folder_path>/Tree_map_<folder>.csv.text`` is opened per corpus
    sub-folder and receives one tab-separated line per stored utterance map
    (quoted file name, transcript index, word map, tree numbers).

    This is Python 2 code (print statements, ``raw_input``).

    Args:
        corpus_path: Forwarded to ``CorpusReader``.
        metadata_path: Forwarded to ``CorpusReader``.
        target_folder_path: Folder prefix of the output files.
        ranges: If truthy, only transcripts whose ``conversation_no`` is in
            it are processed.
        errorLog: If truthy, a path that is opened for writing and used as
            the error log.

    NOTE(review): the nesting below was reconstructed; places where it is
    ambiguous are marked and should be confirmed against the upstream file.
    NOTE(review): ``corpus_file.close()`` at the end fails if no transcript
    passed the filters (``corpus_file`` is still None), and the final
    ``errorLog.close()`` is guarded by ``is None`` only, so a falsy
    non-None ``errorLog`` (e.g. '') is not handled.
    """
    if errorLog:
        errorLog = open(errorLog, 'w')
    corpus = CorpusReader(corpus_path, metadata_path)
    # Iterate through all transcripts
    incorrectTrees = 0  # never incremented in this method
    folder = None
    corpus_file = None
    for trans in corpus.iter_transcripts():
        if not trans.has_pos():
            continue
        if ranges and not trans.conversation_no in ranges:
            continue
        # just look at transcripts WITH trees as complement to the
        # below models
        if not trans.has_trees():
            continue
        # The folder is the path component directly above the file name.
        end = trans.swda_filename.rfind("/")
        start = trans.swda_filename.rfind("/", 0, end)
        c_folder = trans.swda_filename[start + 1:end]
        if c_folder != folder:
            # for now splitting the maps by folder
            folder = c_folder
            if corpus_file:
                corpus_file.close()
            corpus_file = open(
                target_folder_path +
                "/Tree_map_{0}.csv.text".format(folder), 'w')
            wordTreeMapList = TreeMapCorpus(False, errorLog)
            print "new map for folder", folder
        translist = trans.utterances
        translength = len(translist)
        count = 0
        # iterating through transcript utterance by utterance
        # create list of tuples i.e. map from word to the index(ices)
        # (possibly multiple or null) of the relevant leaf/ves
        # of a given tree i.e. utt.tree[0].leaves[0] would be a pair (0,0))
        while count < translength:
            utt = trans.utterances[count]
            words = utt.text_words()
            wordTreeMap = []  # [((word), (List of LeafIndices))]
            forwardtrack = 0
            backtrack = 0
            continued = False
            if len(utt.trees) == 0 or utt.damsl_act_tag() == "x":
                # No tree to map to; the dummy entry is discarded because
                # nothing is stored for this utterance.
                wordTreeMap.append((utt, []))  # just dummy value
                count += 1
                continue
            # indices for which tree and leaf we're at:
            i = 0  # tree
            j = 0  # leaf
            # initialise pairs of trees and ptb pairs
            trees = []
            for l in range(0, len(utt.trees)):
                trees.append(
                    (utt.ptb_treenumbers[l], count, l, utt.trees[l]))
            origtrees = list(trees)
            origcount = count
            # overcoming the problem of previous utterances contributing
            # to the tree at this utterance, we need to add the words from
            # the previous utt add in all the words from previous utterance
            # with a dialogue act tag/or the same tree?
            # check that the last tree in the previous utterance
            # is the same as the previous one
            previousUttSame = trans.previous_utt_same_speaker(utt)
            lastTreeMap = None
            if previousUttSame:
                lastTreeMap = wordTreeMapList.get_treemap(
                    trans, previousUttSame)
                if ((not lastTreeMap) or (len(lastTreeMap) == 0) or
                        (len(lastTreeMap) == 1 and lastTreeMap[0][1] == [])):
                    # no usable tree map: keep searching backwards through
                    # the same speaker's earlier utterances
                    while previousUttSame and \
                        ((not lastTreeMap) or (len(lastTreeMap) == 0) or
                         (len(lastTreeMap) == 1 and
                          lastTreeMap[0][1] == [])):
                        previousUttSame = trans.previous_utt_same_speaker(
                            previousUttSame)  # go back one more
                        lastTreeMap = wordTreeMapList.get_treemap(
                            trans, previousUttSame)
                        if previousUttSame:
                            pass
                if not lastTreeMap:
                    pass
            if lastTreeMap and \
                (utt.damsl_act_tag() == "+" or
                 (len(lastTreeMap.treebank_numbers) > 0 and
                  lastTreeMap.treebank_numbers[-1] ==
                  utt.ptb_treenumbers[0])):
                continued = True
                # might have to backtrack
                # now checking for wrong trees
                lastPTB = lastTreeMap.treebank_numbers
                lastIndexes = lastTreeMap.transcript_numbers
                lastTreesTemp = lastTreeMap.get_trees(trans)
                lastTrees = []
                # NOTE(review): this loop reuses `i`, the tree index set to 0
                # above; afterwards `i` is len(lastPTB) - 1 -- confirm this
                # is intended.
                for i in range(0, len(lastPTB)):
                    lastTrees.append([
                        lastPTB[i], lastIndexes[i][0], lastIndexes[i][1],
                        lastTreesTemp[i]
                    ])
                if not (lastPTB[-1] == utt.ptb_treenumbers[0]):
                    if utt.ptb_treenumbers[0] - lastPTB[-1] > 1:
                        # backtrack and redo the antecedent
                        count = count - (count - lastIndexes[-1][0])
                        utt = previousUttSame
                        words = utt.text_words()
                        mytrees = []
                        for i in range(0, len(lastTrees) - 1):
                            mytrees.append(lastTrees[i])
                        trees = mytrees + [origtrees[0]]
                        backtrack = 1
                    # alternately, this utt's tree may be further back
                    # than its antecedent's, rare mistake
                    elif utt.ptb_treenumbers[0] < lastTrees[-1][0]:
                        # continue with this utterance and trees
                        # (if there are any), but replace its first
                        # tree with its antecedent's last one
                        forwardtrack = 1
                        trees = [lastTrees[-1]] + origtrees[1:]
                if backtrack != 1:  # we should have no match
                    found_treemap = False  # resetting
                    for t in range(len(lastTreeMap) - 1, -1, -1):
                        # if there is a leafIndices for the
                        # word being looked at, gets last mapped one
                        if len(lastTreeMap[t][1]) > 0:
                            # resume at the leaf after the last mapped one
                            j = lastTreeMap[t][1][-1][1] + 1
                            found_treemap = True
                            break
                    if not found_treemap:
                        pass
            possibleComment = False  # can have comments, flag
            mistranscribe = False
            LeafIndices = []  # possibly empty list of leaf indices
            word = words[0]
            # loop until no more words left to be matched in utterance
            while len(words) > 0:
                if not mistranscribe:
                    # strip punctuation before comparing with tree leaves
                    wordtest = re.sub(r"[\.\,\?\"\!]", "", word)
                    wordtest = wordtest.replace("(", "").replace(")", "")
                match = False
                LeafIndices = []  # possibly empty list of leaf indices
                if (possibleComment or word[0:1] in [
                        "{", "}", "-"
                ] or word in ["/", ".", ",", "]"] or wordtest == "" or any(
                    [
                        x in word for x in
                        ["<", ">", "*", "[", "+", "]]", "...", "#", "="]
                    ])):
                    # no tree equivalent for {D } type annotations
                    if (word[0:1] == "-" or
                        any([x in word for x in
                             ["*", "<<", "<+", "[[", "<"]])) \
                            and not possibleComment:
                        possibleComment = True
                    if possibleComment:
                        LeafIndices = []
                        match = True
                        if any([x in word for x in [">>", "]]", ">"]]) or \
                                word[0] == "-":  # turn off comment
                            possibleComment = False
                    # LeafIndices will be empty here
                    wordTreeMap.append((word, LeafIndices))
                    LeafIndices = []
                    match = True
                    del words[0]  # word is consumed, should always be one
                    if len(words) > 0:
                        word = words[0]
                        wordtest = re.sub(r"[\.\,\?\/\)\(\"\!]", "", word)
                        wordtest = wordtest.replace("(", "")
                        wordtest = wordtest.replace(")", "")
                    else:
                        break
                    continue  # carry on to next word without updating indices?
                else:
                    while i < len(trees):
                        tree = trees[i][3]
                        while j < len(tree.leaves()):
                            leaf = tree.leaves()[j]
                            breaker = False
                            # exact match
                            if wordtest == leaf or word == leaf:
                                LeafIndices.append((i, j))
                                wordTreeMap.append((word, LeafIndices))
                                del words[0]  # word is consumed
                                if len(words) > 0:
                                    word = words[0]  # next word
                                    wordtest = re.sub(
                                        r"[\.\,\?\/\)\(\"\!]", "", word)
                                    wordtest = wordtest.replace("(", "")
                                    wordtest = wordtest.replace(")", "")
                                LeafIndices = []
                                j += 1  # increment loop to next leaf
                                match = True
                                breaker = True
                                break
                            # partial match: the leaf is a piece of the word,
                            # so try to concatenate the following leaves
                            elif leaf in wordtest or \
                                    leaf in word and not leaf == ",":
                                testleaf = leaf
                                LeafIndices.append((i, j))
                                j += 1
                                for k in range(j, j + 3):  # 3 beyond
                                    if (k >= len(tree.leaves())):
                                        j = 0
                                        i += 1
                                        breaker = True
                                        break  # got to next tree
                                    if (testleaf + tree.leaves()[k]) \
                                            in wordtest or \
                                            (testleaf + tree.leaves()[k])\
                                            in word:
                                        testleaf += tree.leaves()[k]
                                        LeafIndices.append((i, k))
                                        j += 1
                                        # concatenation
                                        if testleaf == wordtest or \
                                                testleaf == word:  # word matched
                                            wordTreeMap.append(
                                                (word, LeafIndices))
                                            del words[0]  # remove word
                                            if len(words) > 0:
                                                word = words[0]
                                                wordtest = re.sub(
                                                    r"[\.\,\?\/\)\(\"\!]",
                                                    "", word)
                                                wordtest = wordtest.\
                                                    replace("(", "")
                                                wordtest = wordtest.\
                                                    replace(")", "")
                                            # reinitialise leaves
                                            LeafIndices = []
                                            j = k + 1
                                            match = True
                                            breaker = True
                                            break
                            # NOTE(review): this `else` is taken to belong to
                            # the if/elif chain above -- confirm.
                            else:
                                # otherwise go on
                                j += 1
                            if breaker:
                                break
                            if match:
                                break
                        if j >= len(tree.leaves()):
                            # leaves of this tree exhausted: next tree
                            j = 0
                            i += 1
                        if match:
                            break
                # could not match word! try mistranscriptions first:
                if not match:
                    if not mistranscribe:  # one final stab at matching!
                        mistranscribe = True
                        for pair in possibleMistranscription:
                            if pair[0] == wordtest:
                                wordtest = pair[1]
                                # restart from the last mapped leaf, if any
                                if len(wordTreeMap) > 0:
                                    if len(wordTreeMap[-1][1]) > 0:
                                        i = wordTreeMap[-1][1][-1][0]
                                        j = wordTreeMap[-1][1][-1][1]
                                    else:
                                        # go back to beginning of
                                        # tree search
                                        i = 0
                                        j = 0
                                else:
                                    i = 0  # go back to beginning
                                    j = 0
                                break  # matched
                    elif continued:
                        # possible lack of matching up of words in
                        # previous utterance same caller and same
                        # tree// not always within same tree!!
                        # NOTE(review): the "\ " inside the first literal
                        # looks like a former line continuation -- confirm
                        # the intended message text.
                        errormessage = "Possible bad start for \ CONTINUED UTT ''" + words[0] + "'' in file/utt: "\
                            + str(utt.swda_filename) + "\n " + utt.caller + \
                            "." + str(utt.utterance_index) + "." + \
                            str(utt.subutterance_index) + \
                            "POSSIBLE COMMENT = " + str(possibleComment)
                        if not errorLog is None:
                            errorLog.write(errormessage + "\n")
                        if backtrack == 1:
                            backtrack += 1
                        elif backtrack == 2:
                            # i.e. we've done two loops and
                            # still haven't found it, try the other way
                            count = origcount
                            utt = trans.utterances[count]
                            words = utt.text_words()
                            word = words[0]
                            trees = [lastTrees[-1]] + origtrees[1:]
                            backtrack += 1
                            # mistranscribe = False #TODO perhaps needed
                            wordTreeMap = []
                            # switch to forward track this is
                            # the only time we want to try
                            # from the previous mapped leaf in the
                            # other tree
                            foundTreemap = False
                            for t in range(len(lastTreeMap) - 1, -1, -1):
                                # backwards iteration through words
                                if len(lastTreeMap[t][1]) > 0:
                                    j = lastTreeMap[t][1][-1][1] + 1
                                    foundTreemap = True
                                    # break when last tree
                                    # mapped word from this caller is found
                                    break
                            if not foundTreemap:
                                j = 0
                            # NOTE(review): placed at branch level so that
                            # `j` survives the reset below -- confirm.
                            i = 0  # go back to first tree
                            continue
                        elif forwardtrack == 1:
                            forwardtrack += 1
                        elif forwardtrack == 2:
                            count = count - (count - lastIndexes[-1][0])
                            utt = previousUttSame
                            words = utt.text_words()
                            word = words[0]
                            mytrees = []
                            for i in range(0, len(lastTrees) - 1):
                                mytrees.append(lastTrees[i])
                            trees = mytrees + [origtrees[0]]
                            forwardtrack += 1
                            # mistranscribe = False #TODO maybe needed
                            wordTreeMap = []
                        elif forwardtrack == 3 or backtrack == 3:
                            # if this hasn't worked reset to old trees
                            count = origcount
                            utt = trans.utterances[count]
                            words = utt.text_words()
                            word = words[0]
                            trees = origtrees
                            forwardtrack = 0
                            backtrack = 0
                            # mistranscribe = False #TODO maybe needed
                            wordTreeMap = []
                        else:
                            pass
                        # unless forward tracking now,
                        # just go back to beginning
                        i = 0  # go back to beginning of tree search
                        j = 0
                    else:
                        # give up on this word: record it with no leaves
                        mistranscribe = False
                        LeafIndices = []
                        wordTreeMap.append((word, LeafIndices))
                        # NOTE(review): the "\ " inside the first literal
                        # looks like a former line continuation -- confirm.
                        errormessage = "WARNING: 440 no/partial tree \ mapping for ''" + words[0] + "'' in file/utt: "\
                            + str(utt.swda_filename) + " \n" + utt.caller\
                            + "." + str(utt.utterance_index) + "." + \
                            str(utt.subutterance_index) + \
                            "POSSIBLE COMMENT = " + str(possibleComment)
                        del words[0]  # remove word
                        if len(words) > 0:
                            word = words[0]
                            wordtest = re.sub(r"[\.\,\?\/\)\(\"\!]", "",
                                              word)
                            wordtest = wordtest.replace("(", "")
                            wordtest = wordtest.replace(")", "")
                        if errorLog:
                            errorLog.write("possible wrong tree mapping:" +
                                           errormessage + "\n")
                        # NOTE(review): blocks on stdin; whether this sits
                        # inside `if errorLog:` could not be determined.
                        raw_input()
            # end of while loop (words)
            mytreenumbers = []
            for treemap in trees:
                # the whole list but the tree
                mytreenumbers.append(treemap[:-1])
            if not len(utt.text_words()) == len(wordTreeMap):
                print "ERROR. uneven lengths!"
                print utt.text_words()
                print wordTreeMap
                print trans.swda_filename
                print utt.transcript_index
                raw_input()
                count += 1
                continue
            # add the treemap
            wordTreeMapList.append(trans.conversation_no,
                                   utt.transcript_index,
                                   tuple(mytreenumbers), tuple(wordTreeMap))
            count += 1
        # rewrite after each transcript
        filedict = defaultdict(str)
        for key in wordTreeMapList.keys():
            csv_string = '"' + str(list(wordTreeMapList[key])) + '"'
            mytreenumbers = wordTreeMapList[key].transcript_numbers
            myptbnumbers = wordTreeMapList[key].treebank_numbers
            tree_list_string = '"'
            for i in range(0, len(mytreenumbers)):
                treemap = [myptbnumbers[i]] + mytreenumbers[i]
                tree_list_string += str(treemap) + ";"
            # drop the trailing ";" and close the quote
            tree_list_string = tree_list_string[:-1] + '"'
            # keys have the form "<file name>:<transcript index>"
            filename = '"' + key[0:key.rfind(':')] + '"'
            transindex = key[key.rfind(':') + 1:]
            filedict[int(transindex)] = filename \
                + "\t" + transindex + '\t' + csv_string + "\t" \
                + tree_list_string + "\n"
        # lines are written in ascending transcript-index order
        for key in sorted(filedict.keys()):
            corpus_file.write(filedict[key])
        wordTreeMapList = TreeMapCorpus(False, errorLog)  # reset each time
    print "\n" + str(incorrectTrees) + " incorrect trees"
    corpus_file.close()
    if not errorLog is None:
        errorLog.close()