コード例 #1
0
def padding_leaves(tree):
    """Number every leaf and give each leaf a pre-terminal of its own.

    The tree is modified in place in two passes:

    1. Leaf ``i`` (left-to-right order) is replaced by the string
       ``"{i:03}" + "||||" + leaf``, i.e. a zero-padded three-digit index,
       four pipe characters and the original leaf text.
    2. Every leaf whose parent has more than one child is wrapped in a new
       ``Tree`` that carries the parent's label and contains only that leaf.

    Nothing is returned.
    """
    leaf_paths = [
        tree.leaf_treeposition(idx) for idx in range(len(tree.leaves()))
    ]
    for number, path in enumerate(leaf_paths):
        tree[path] = "{0:03}".format(number) + "||||" + tree[path]
    # Wrapping one leaf only pushes that leaf one level deeper; the paths of
    # the other leaves and the child count of every parent stay the same, so
    # the paths collected above remain valid throughout this pass.
    for path in leaf_paths:
        parent = tree[path[:-1]]
        if len(parent) > 1:
            tree[path] = Tree(parent.label(), [tree[path]])
コード例 #2
0
ファイル: naturalLogic.py プロジェクト: Veldhoen/thesis
def iornnFromTree(tree, vocabulary, grammarBased=False):
    """Recursively convert a parse tree into an IORNN network.

    Args:
        tree: parse (sub)tree offering ``height()``, ``label()``,
            ``leaves()`` and iteration over its children.
        vocabulary: sequence of known words; the position of a word is its
            index. Words that are not in it get index 0.
        grammarBased: when true, an inner node's category is the rule
            ``"<label> -> <child labels>"``; otherwise it is always
            ``'composition'``.

    Returns:
        An ``IORNN.Node`` for trees of height greater than 2, otherwise an
        ``IORNN.Leaf`` for the (lower-cased) word under the pre-terminal.
    """
    if tree.height() > 2:
        if grammarBased:
            cat = tree.label() + ' -> ' + ' '.join(
                [child.label() for child in tree])
        else:
            cat = 'composition'
        children = [
            iornnFromTree(child, vocabulary, grammarBased) for child in tree
        ]
        return IORNN.Node(children, cat, 'tanh', 'tanh')

    # Pre-terminal node: exactly one word is expected below it.
    words = tree.leaves()
    if len(words) == 1:
        word = words[0].lower()
    else:
        # Single-argument call form prints the same text under Python 2
        # and Python 3.
        print('Not exactly one leaf?! %s' % (tree,))
        word = 'UNK'
    try:
        index = vocabulary.index(word)
    except ValueError:
        # Out-of-vocabulary word: fall back to index 0. Only the "not found"
        # error is handled so that unrelated failures are not hidden.
        index = 0
    return IORNN.Leaf('word', index, 'tanh', word)
コード例 #3
0
def tree_string_to_symbols(tree_string,
                           remove_root=True,
                           no_collapse=False,
                           **kwargs):
    """Turn a tree string into its leaf tokens followed by its node labels.

    The string is parsed with ``tree_from_string`` and the resulting tree is
    walked breadth-first. The label of every visited node is collected; the
    children of a node whose only child is a plain string are not visited.

    ``remove_root``, ``no_collapse`` and ``**kwargs`` are accepted but not
    used by the current implementation.

    Returns:
        ``tree.leaves() + labels`` where ``labels`` is in breadth-first order.
    """
    root = tree_from_string(tree_string)
    labels = []
    pending = queue.Queue()
    pending.put(root)
    while not pending.empty():
        current = pending.get()
        labels.append(current.label())
        holds_single_word = len(current) == 1 and isinstance(current[0], str)
        if not holds_single_word:
            for child in current:
                if isinstance(child, nltk.Tree):
                    pending.put(child)
    return root.leaves() + labels
def orderSentence(tree, printThings, order="mixed"):
    """Attach dependency children to the sentence and linearize the tree.

    Args:
        tree: a ``(tree, sentence)`` pair. ``sentence`` is a list of
            per-word dicts with at least ``"dep"``, ``"head"`` (1-based) and
            ``"word"``; each head entry gets a ``"children"`` list of the
            0-based indices of its dependents (modified in place).
        printThings: passed through to ``orderSentenceRec``.
        order: passed through to ``orderSentenceRec`` as ``order=``.

    Returns:
        The ``linearized`` list that was handed to ``orderSentenceRec``.
    """
    global model
    tree, sentence = tree
    linearized = []
    for position, line in enumerate(sentence):
        if line["dep"] != "root":
            head_entry = sentence[line["head"] - 1]
            head_entry.setdefault("children", []).append(position)

    end, incoming, outgoing = numberSpans(tree, 0, sentence)
    assert len(incoming) == 1, incoming
    assert len(outgoing) == 0, outgoing
    if end != len(sentence):
        # The tree and the dependency annotation disagree on the number of
        # tokens; dump both for inspection.
        print(tree.leaves())
        print([entry["word"] for entry in sentence])
    orderSentenceRec(tree, sentence, printThings, linearized, order=order)
    return linearized
コード例 #5
0
ファイル: naturalLogic (2).py プロジェクト: Veldhoen/thesis
def rnnFromTree(tree, vocabulary, wordReduction=False, grammarBased=False):
    """Recursively convert a parse tree into an RNN of ``Node``/``Leaf``.

    Args:
        tree: parse (sub)tree offering ``height()``, ``label()``,
            ``leaves()`` and iteration over its children.
        vocabulary: sequence of known words; the position of a word is its
            index. Words that are not in it get index 0.
        wordReduction: when true, every word leaf is wrapped in an extra
            ``Node`` that reduces high-dimensional words to the
            dimensionality of the inner representations.
        grammarBased: when true, categories are derived from the tree labels
            (rule string for inner nodes, the pre-terminal label for the
            word-reduction node) instead of the fixed names
            ``'composition'`` / ``'preterminal'``.

    Returns:
        A ``Node`` for trees of height greater than 2; for a pre-terminal, a
        ``Leaf``, or a ``Node`` around it when ``wordReduction`` is set.
    """
    if tree.height() > 2:
        if grammarBased:
            cat = tree.label() + ' -> ' + ' '.join(
                [child.label() for child in tree])
        else:
            cat = 'composition'
        # grammarBased has to travel down with the recursion; otherwise every
        # node below the root silently falls back to the default categories.
        children = [
            rnnFromTree(child, vocabulary, wordReduction, grammarBased)
            for child in tree
        ]
        return Node(children, cat, 'tanh')

    # Pre-terminal node: exactly one word is expected below it.
    words = tree.leaves()
    if len(words) == 1:
        word = words[0]
    else:
        # Report the malformed pre-terminal and continue with a placeholder
        # word so that ``word`` is always bound below.
        print('Not exactly one leaf?! %s' % (tree,))
        word = 'UNK'
    try:
        index = vocabulary.index(word)
    except ValueError:
        # Out-of-vocabulary word: fall back to index 0.
        index = 0
    leaf = Leaf('word', index, word)

    if not wordReduction:
        return leaf
    cat = tree.label() if grammarBased else 'preterminal'
    return Node([leaf], cat, 'tanh')
コード例 #6
0
ファイル: corpus.py プロジェクト: meitals/coref-cs137
	def words_in_tree(self, tree, token_sequence):
		"""
			Report whether the token sequence occurs in the text of the tree.

			The leaves of the tree and the ``token`` attributes of the
			sequence are each joined with single spaces and compared with a
			plain substring test, so a match may begin or end inside a word.
		"""
		haystack = " ".join(tree.leaves())
		needle = " ".join(item.token for item in token_sequence)
		return needle in haystack
コード例 #7
0
def tree_to_leave_pos_node_span(tree):
    """Walk the tree breadth-first and collect leaves, POS tags and spans.

    A node whose only child is a plain string contributes its label to the
    POS tags and is not expanded. Every other node is recorded together with
    ``leaves2span(node.leaves(), leaves)``.

    Returns:
        ``(leaves, pos_tags, nodes, spans, tree_node_lst)`` where ``nodes``
        holds the labels of ``tree_node_lst`` and ``spans`` is aligned with
        it, both in breadth-first order.
    """
    leaves = tree.leaves()
    pos_tags, tree_node_lst, spans = [], [], []
    pending = queue.Queue()
    pending.put(tree)
    while not pending.empty():
        current = pending.get()
        if len(current) <= 0:
            warnings.warn("[bft]: len(node) <= 0!! will cause error later")
        if len(current) == 1 and isinstance(current[0], str):
            pos_tags.append(current.label())
            continue
        tree_node_lst.append(current)
        spans.append(leaves2span(current.leaves(), leaves))
        for child in current:
            if isinstance(child, nltk.Tree):
                pending.put(child)
    nodes = [recorded.label() for recorded in tree_node_lst]
    return leaves, pos_tags, nodes, spans, tree_node_lst
コード例 #8
0
def tree_to_leave_pos_node_span_collapse(tree):
    """Breadth-first collection of leaves, POS tags and spans with collapsing.

    Like a plain breadth-first walk, except that a node with a single subtree
    child is merged with that child in place: it takes over the child's label
    and children, repeatedly, until it has several children or a string child.

    The pre-terminal test runs before the collapsing, so a unary chain that
    collapses down to a single word is still recorded as a node rather than
    as a POS tag.

    Returns:
        ``(leaves, pos_tags, nodes, spans, tree_node_lst)``; ``spans`` come
        from ``leaves2span(node.leaves(), leaves)``.
    """
    leaves = tree.leaves()
    pos_tags, tree_node_lst, spans = [], [], []
    pending = queue.Queue()
    pending.put(tree)
    while not pending.empty():
        current = pending.get()
        if len(current) == 1 and isinstance(current[0], str):
            pos_tags.append(current.label())
            continue
        while len(current) == 1 and isinstance(current[0], nltk.Tree):
            only_child = current[0]
            current.set_label(only_child.label())
            current[0:] = list(only_child)
        tree_node_lst.append(current)
        spans.append(leaves2span(current.leaves(), leaves))
        for child in current:
            if isinstance(child, nltk.Tree):
                pending.put(child)
    nodes = [recorded.label() for recorded in tree_node_lst]
    return leaves, pos_tags, nodes, spans, tree_node_lst
コード例 #9
0
ファイル: loadTree.py プロジェクト: ankitp94/aspect-sentiment
def build_tree(tree, parent=None):
    """Recursively mirror ``tree`` as linked ``Node`` objects.

    A tree with exactly one child becomes a leaf node carrying the first
    leaf as ``word``; any other tree becomes an inner node whose ``left`` and
    ``right`` are built from ``tree[0]`` and ``tree[1]``.

    Returns:
        The ``Node`` created for ``tree``, constructed with ``parent``.
    """
    is_leaf = len(tree) == 1
    root = Node(parent)
    root.isLeaf = is_leaf
    if is_leaf:
        root.word = tree.leaves()[0]
    else:
        root.left = build_tree(tree[0], root)
        root.right = build_tree(tree[1], root)
    return root
コード例 #10
0
ファイル: loadTree.py プロジェクト: zxsted/DRM
def build_tree(tree, parent=None):
    """Convert ``tree`` into a binary structure of ``Node`` objects.

    Single-child trees yield a leaf node (``isLeaf`` true, ``word`` set to
    the first leaf). Everything else yields an inner node (``isLeaf`` false)
    with ``left`` / ``right`` built recursively from the first two children.

    Returns:
        The newly created ``Node`` whose constructor received ``parent``.
    """
    single_child = len(tree) == 1
    node = Node(parent)
    if single_child:
        node.isLeaf = True
        node.word = tree.leaves()[0]
        return node
    node.isLeaf = False
    node.left = build_tree(tree[0], node)
    node.right = build_tree(tree[1], node)
    return node
コード例 #11
0
def tree_to_leave_pos_node_span_collapse_v2(tree):
    """Breadth-first node/span extraction that derives spans from a counter.

    Unary chains of subtrees are collapsed in place. Instead of looking the
    leaves up, the span of each dequeued node is computed from a running
    ``start`` offset: the first node gets ``[0, n_leaves - 1]``; each later
    node gets ``[start, start + width - 1]`` where ``width`` is its number of
    leaves, after which ``start`` advances by ``width`` and wraps to 0 once it
    reaches the number of leaves.

    NOTE(review): the wrap-around presumes that the nodes dequeued between two
    wraps cover all leaves exactly once - confirm this holds when
    pre-terminals (which are not expanded) sit at different depths.

    The function also prints the span/node counts, the tree and every
    ``[label]: span`` pair.

    Returns:
        ``(leaves, pos_tags, nodes, spans, tree_node_lst)``.
    """
    leaves = tree.leaves()
    len_leave = len(leaves)
    pos_tags, tree_node_lst, spans = [], [], []
    pending = queue.Queue()
    pending.put(tree)
    level = 0
    start = 0
    while not pending.empty():
        node = pending.get()
        while len(node) == 1 and isinstance(node[0], nltk.Tree):
            only_child = node[0]
            node.set_label(only_child.label())
            node[0:] = list(only_child)
        width = len(node.leaves())
        if level == 0:
            # Root: spans the whole sentence.
            _span = [start, len_leave - 1]
            level += 1
        else:
            _span = [start, start + width - 1]
            start += width
            if start >= len_leave:
                # Offset ran past the last leaf: begin the next level.
                start = 0
                level += 1
        if len(node) == 1 and isinstance(node[0], str):
            pos_tags.append(node.label())
            continue
        tree_node_lst.append(node)
        spans.append(_span)
        for child in node:
            if isinstance(child, nltk.Tree):
                pending.put(child)
    nodes = [recorded.label() for recorded in tree_node_lst]
    print(f'{len(spans)}, {len(nodes)}')
    tree.pretty_print()
    for n, s in zip(nodes, spans):
        print(f'[{n}]: {s}')
    return leaves, pos_tags, nodes, spans, tree_node_lst
コード例 #12
0
def orderSentence(tree, printThings):
    """Attach dependency children to the sentence and return a binarized tree.

    Args:
        tree: a ``(tree, sentence)`` pair. ``sentence`` is a list of
            per-word dicts with at least ``"dep"``, ``"head"`` (1-based) and
            ``"word"``; each head entry gets a ``"children"`` list of the
            0-based indices of its dependents (modified in place).
        printThings: passed through to ``orderSentenceRec``.

    Returns:
        ``binarize(...)`` applied to the result of ``orderSentenceRec``.
    """
    global model
    tree, sentence = tree
    linearized = []
    for position, line in enumerate(sentence):
        if line["dep"] != "root":
            head_entry = sentence[line["head"] - 1]
            head_entry.setdefault("children", []).append(position)
    end, incoming, outgoing = numberSpans(tree, 0, sentence)
    assert len(incoming) == 1, incoming
    assert len(outgoing) == 0, outgoing
    if end != len(sentence):
        # Token counts of tree and dependency annotation differ; dump both.
        print(tree.leaves())
        print([entry["word"] for entry in sentence])
    ordered = orderSentenceRec(tree, sentence, printThings, linearized)
    return binarize(ordered)
コード例 #13
0
def get_PTP(pair,parsed_sentences):
    """Return the set of phrase labels on the tree path between two mentions.

    The path runs from the first mention's leaf up to the lowest node that
    both leaf paths share, and from there down to the second mention's leaf.

    Args:
        pair: object with ``first`` and ``second`` mentions; each mention
            provides ``offsets`` (its first element is used as a leaf index)
            and ``first`` also provides ``sentenceID``.
        parsed_sentences: mapping/sequence from sentence id to parse tree.
            Node labels are read through the ``.node`` attribute.

    Returns:
        A list with the distinct labels on the path (order is unspecified
        because it is produced from a ``set``).

    Raises:
        ValueError: if the upward and the downward half of the path do not
            meet in the same label.
    """
    m1_index = pair.first.offsets[0]
    m2_index = pair.second.offsets[0]
    tree = parsed_sentences[pair.first.sentenceID]
    # The second offset may point one past the last leaf; pull it back.
    if m2_index >= len(tree.leaves()):
        m2_index -= 1
    path1 = list(tree.leaf_treeposition(m1_index))
    path2 = list(tree.leaf_treeposition(m2_index))

    # Length of the common prefix of both leaf paths.
    n = 0
    for i, j in zip(path1, path2):
        if i != j:
            break
        n += 1
    share_path = path1[:n]
    sub_path1 = path1[n:]
    sub_path2 = path2[n:]

    def get_labels(stree, path):
        # Follow ``path`` from ``stree`` and collect the label of every
        # subtree on the way (leaves are skipped). The tree is only read, so
        # no copy is needed.
        subtree = stree
        labels = [subtree.node]
        for i in path:
            if isinstance(subtree[i], nltk.tree.Tree):
                subtree = subtree[i]
                labels.append(subtree.node)
        return subtree, labels

    subtree = get_labels(tree, share_path)[0]
    path1_labels = get_labels(subtree, sub_path1)[1]
    path2_labels = get_labels(subtree, sub_path2)[1]
    # path1 is walked upwards (mention -> common ancestor) and path2
    # downwards (common ancestor -> mention), so only path1 is reversed and
    # both halves meet at the common ancestor's label.
    path1_labels.reverse()
    if path1_labels[-1] == path2_labels[0]:
        return list(set(path1_labels + path2_labels[1:]))
    raise ValueError(
        "Path cannot connect:%s,%s" % (path1_labels, path2_labels))
コード例 #14
0
    def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans):
        """Store the annotation and split the tagged spans into verb/arguments.

        ``tagged_spans`` is iterated as ``((start, end), tag)`` items. Spans
        tagged ``'V'`` or ``'C-V'`` contribute the indices
        ``range(start, end)`` to ``self.verb``; every other span is appended
        to ``self.arguments`` as ``((start, end), tag)``.
        """
        # Word indices of the words that compose the verb whose arguments are
        # identified by this instance. Holds several indices for multi-word
        # verbs (e.g. 'turn on').
        self.verb = []

        # Word index of the head word of that verb; for 'turn on' this is
        # the index of 'turn'.
        self.verb_head = verb_head

        self.verb_stem = verb_stem

        self.roleset = roleset

        # (argspan, argid) tuples giving location and type of each argument;
        # argspan is a (start, end) tuple meaning words[start:end].
        self.arguments = []

        # (span, id) tuples for each argument as well as the verb pieces
        # that make up this instance.
        self.tagged_spans = tagged_spans

        # Parse tree of the sentence containing this instance.
        self.tree = tree

        # Words of the sentence containing this instance.
        self.words = tree.leaves()

        # Fill in self.verb and self.arguments.
        verb_tags = ('V', 'C-V')
        for (start, end), tag in tagged_spans:
            if tag not in verb_tags:
                self.arguments.append(((start, end), tag))
            else:
                self.verb.extend(range(start, end))
コード例 #15
0
def _get_max_depth(tree : tree.Tree, factor : str ='right') -> float:
    """Return the largest per-leaf "turn" count divided by the leaf count.

    The tree is modified in place: unary chains are collapsed and the tree is
    converted to Chomsky normal form using ``factor``.

    For every leaf the child indices on the path to its parent are written as
    a digit string with a leading ``'0'``; the leaf's depth is the number of
    places where a ``0`` is directly followed by a non-zero digit.

    Args:
        tree: the tree to measure (mutated, see above).
        factor: passed to ``chomsky_normal_form``.

    Returns:
        The maximum depth over all leaves divided by the number of leaves.
        The division is a true division, so the result is a float.

    Raises:
        ValueError: if no turn is found although the tree does not have
            exactly one leaf.
    """
    tree.collapse_unary()
    tree.chomsky_normal_form(factor=factor)

    leaf_positions = tree.treepositions('leaves')

    max_depth = 0
    for leaf_p in leaf_positions:
        p_str = '0' + ''.join(str(x) for x in leaf_p[:-1])
        max_depth = max(max_depth, len(re.findall('0[1-9]', p_str)))

    if max_depth == 0 and len(leaf_positions) != 1:
        # Sanity check: a tree without exactly one leaf should show at least
        # one turn. Carry the diagnostics in the exception instead of
        # printing them.
        raise ValueError(
            "no turn found in tree; leaf positions: %r, tree: %s"
            % (leaf_positions, tree))

    return max_depth / len(tree.leaves())
コード例 #16
0
def tree_to_leave_pos_node_span_collapse_v3(tree):
    """Breadth-first node/span extraction that reads spans from the leaves.

    ``padding_leaves_wnum(leaves, tree)`` is called first; afterwards the
    first and last leaf below a node are converted with ``int`` to form its
    span (presumably that helper rewrites each leaf to its position - confirm
    against its definition).

    Unary chains of subtrees are collapsed in place before a node is
    classified. A node whose only child is a string contributes its label to
    the POS tags and is not expanded; every other node is recorded with its
    span. If no node is recorded at all, the root label with the span
    ``[0, len(leaves) - 1]`` is returned instead.

    Returns:
        ``(leaves, pos_tags, nodes, spans, tree_node_lst)``; ``leaves`` is
        the list obtained before ``padding_leaves_wnum`` was called.
    """
    leaves = tree.leaves()
    padding_leaves_wnum(leaves, tree)
    pos_tags, tree_node_lst, spans = [], [], []
    pending = queue.Queue()
    pending.put(tree)
    while not pending.empty():
        current = pending.get()
        while len(current) == 1 and isinstance(current[0], nltk.Tree):
            only_child = current[0]
            current.set_label(only_child.label())
            current[0:] = list(only_child)
        if len(current) == 1 and isinstance(current[0], str):
            pos_tags.append(current.label())
            continue
        covered = current.leaves()
        tree_node_lst.append(current)
        spans.append([int(covered[0]), int(covered[-1])])
        for child in current:
            if isinstance(child, nltk.Tree):
                pending.put(child)
    nodes = [recorded.label() for recorded in tree_node_lst]
    if len(nodes) == 0:
        nodes = [tree.label()]
        spans = [[0, len(leaves) - 1]]
    return leaves, pos_tags, nodes, spans, tree_node_lst
コード例 #17
0
def orderSentenceRec(tree, sentence, printThings, linearized):
    """Recursively convert an annotated constituency tree into nested dicts.

    The node label is stripped of a trailing ``-<digits>`` index first.

    Word level (first child is not an ``nltk.tree.Tree``): exactly one child
    is asserted. If the label is one of the listed punctuation tags or
    ``-NONE-``, starts with ``-``, or the child contains ``"*-"``, the word is
    skipped and the function falls off the end, returning ``None``. Otherwise
    a dict with ``"word"`` (taken from ``sentence[tree.start]``),
    ``"category"``, ``"children": None`` and ``"dependency": "NONE"`` is
    returned.

    Phrase level: children with ``start >= end`` are dropped, a dependency
    label and head are determined for every remaining child from the
    ``incoming`` / ``outgoing`` attributes, the children are converted
    recursively (``None`` results are discarded) and a dict with
    ``"category"``, ``"children"`` and ``"dependency": "NONE"`` is returned.
    For ``SBAR`` nodes the function additionally updates and prints the global
    counters ``totalCountRCs`` / ``totalCountObjectIsLast``; this does not
    influence the returned value.

    NOTE(review): ``tree.start``, ``tree.end``, ``tree.incoming`` and
    ``tree.outgoing`` are not set here - presumably by ``numberSpans``;
    confirm. ``printThings`` and ``linearized`` are only passed on to the
    recursive calls.
    """
    global totalCountRCs
    global totalCountObjectIsLast

    label = tree.label()
    # Drop a trailing numeric index such as "NP-SBJ-1" -> "NP-SBJ".
    if label[-1] in "1234567890":
        label = label[:label.rfind("-")]
    children = [child for child in tree]
    if type(children[0]) != nltk.tree.Tree:
        # Word level: the single child is the token string.
        assert all([type(x) != nltk.tree.Tree for x in children])
        assert len(list(children)) == 1, list(children)
        for c in children:
            # Punctuation, empty elements and traces are skipped; with the
            # single child skipped the function returns None implicitly.
            if label in ["'", ":", "``", ",", "''", "#", ".", "-NONE-"
                         ] or label[0] == "-" or "*-" in c:
                continue
            word = sentence[tree.start]["word"]  #c.lower(), )
            # Diagnostic only: report when tree token and sentence word differ.
            if word != c.lower().replace("\/", "/"):
                print(142, word, c.lower())
            return {
                "word": word,
                "category": label,
                "children": None,
                "dependency": "NONE"
            }
    else:
        assert all([type(x) == nltk.tree.Tree for x in children])
        children = [
            child for child in children if child.start < child.end
        ]  # remove children that consist of gaps or otherwise eliminated tokens

        # find which children seem to be dependents of which other children
        # (the condition is always true because of the leading ``True or``)
        if True or model != "REAL_REAL":
            childDeps = [None for _ in children]
            childHeads = [None for _ in children]
            for i in range(len(children)):
                # Edges that enter this child from outside the current node.
                incomingFromOutside = [
                    x for x in tree.incoming if x in children[i].incoming
                ]
                if len(incomingFromOutside) > 0:
                    childDeps[i] = sentence[incomingFromOutside[-1][1]]["dep"]
                    childHeads[i] = sentence[incomingFromOutside[-1]
                                             [1]]["head"]

                    if len(incomingFromOutside) > 1:
                        print("FROM OUTSIDE", [
                            sentence[incomingFromOutside[x][1]]["dep"]
                            for x in range(len(incomingFromOutside))
                        ])
                # Edges that enter this child from one of its siblings; a
                # match here overwrites the values found above.
                for j in range(len(children)):
                    if i == j:
                        continue
                    incomingFromJ = [
                        x for x in children[i].incoming
                        if x in children[j].outgoing
                    ]
                    if len(incomingFromJ) > 0:
                        if len(incomingFromJ) > 1:
                            duplicateDeps = tuple([
                                sentence[incomingFromJ[x][1]]["dep"]
                                for x in range(len(incomingFromJ))
                            ])
                            if not (duplicateDeps == ("obj", "xcomp")):
                                print("INCOMING FROM NEIGHBOR", duplicateDeps)
                        childDeps[i] = sentence[incomingFromJ[-1][1]]["dep"]
                        childHeads[i] = sentence[incomingFromJ[-1][1]]["head"]
            assert None not in childDeps, (childDeps, children)

            # NOTE: ``keys`` and ``childrenLinearized`` are assigned but not
            # read anywhere in this function.
            keys = childDeps

            childrenLinearized = children

        childrenAsTrees = []
        for child, dependency in zip(children, childDeps):
            childrenAsTrees.append(
                orderSentenceRec(child, sentence, printThings, linearized))
            if childrenAsTrees[
                    -1] is None:  # this will happen for punctuation etc
                del childrenAsTrees[-1]
            else:
                childrenAsTrees[-1]["dependency"] = dependency
        # Statistics on relative clauses: only updates and prints the global
        # counters, the result dict below is unaffected.
        if label == "SBAR":
            if len(childrenAsTrees) > 1:
                if len(childrenAsTrees
                       ) == 2 and childrenAsTrees[0]["category"] in [
                           "IN", "WHNP"
                       ] and childrenAsTrees[1][
                           "category"] == "S" and childrenAsTrees[1][
                               "dependency"] == "acl:relcl":  # Relative clause
                    if childrenAsTrees[0][
                            "dependency"] == "nsubj":  # SUBJECT Relatives
                        if sentence[childHeads[1] -
                                    1]["dep"] in ["nsubj", "obj"]:
                            #     _ = 0
                            if sentence[childHeads[1] - 1]["dep"] == "nsubj":

                                # NOTE: ``leaves`` is only referenced by the
                                # commented-out diagnostics below.
                                leaves = [
                                    x for x in tree.leaves()
                                    if not (x.startswith("*T*")
                                            or x.startswith("*U*"))
                                ]
                                #                  print("WORDS       ", " ".join(leaves))
                                #                   print("CATEGORIES  ", zip([x["category"] for x in childrenAsTrees], [x["dependency"] for x in childrenAsTrees]))
                                #                    print("Position in matrix clause", sentence[childHeads[1]-1]["dep"], len(tree.leaves()))
                                #                      assert len(childrenAsTrees[1]["children"]) == 1, childrenAsTrees[1]["children"]
                                #                     print("CHILDREN IN THE RC", [x["category"] for x in childrenAsTrees[1]["children"]])
                                # ``.index("VP")`` raises ValueError when the
                                # clause has no VP child.
                                firstVP = [
                                    x["category"]
                                    for x in childrenAsTrees[1]["children"]
                                ].index("VP")
                                #                      print("First VP", firstVP)
                                childrenInTheVP = [
                                    x["category"] for x in childrenAsTrees[1]
                                    ["children"][firstVP]["children"]
                                ]
                                print("CHILDREN IN THE VP", childrenInTheVP)
                                if len(childrenInTheVP
                                       ) > 1 and childrenInTheVP[0].startswith(
                                           "VB"
                                       ) and childrenInTheVP[1] == "NP":

                                    #                 print("Embedded verb head", sentence[childHeads[0]-1])
                                    #                print("Is the last word of RC?", sentence[childHeads[0]-1]["word"] == leaves[-1])
                                    totalCountRCs += 1
                                    totalCountObjectIsLast += (
                                        1 if len(childrenInTheVP) == 2 else 0)
                                    print(totalCountObjectIsLast /
                                          float(totalCountRCs), totalCountRCs
                                          )  # about 63%
                                # What follows the relative clause?

            #   else:


#             print(childrenAsTrees)
#    print(tree.leaves())
#   print([x["category"] for x in childrenAsTrees])

        return {
            "category": label,
            "children": childrenAsTrees,
            "dependency": "NONE"
        }
コード例 #18
0
def containsPhrase(tree, phrase):
    """Report whether ``phrase`` occurs among the leaves of ``tree``.

    A plain string (a bare leaf) never contains a phrase. For anything else
    the decision is delegated to ``regex.containsPhrase`` with the tree's
    leaves and ``re.IGNORECASE``.
    """
    if not isinstance(tree, str):
        return regex.containsPhrase(phrase, tree.leaves(), flags=re.IGNORECASE)
    return False
コード例 #19
0
def extractWords(tree):
    """Return the words of ``tree``, i.e. whatever ``tree.leaves()`` yields."""
    words = tree.leaves()
    return words
コード例 #20
0
 def __getitem__(self, index):
     """Return the sentence at ``index`` as space-separated tree leaves.

     ``self.data[index]`` is parsed with ``tree_from_string`` and the leaves
     of the resulting tree are joined with single spaces.
     """
     parsed = tree_from_string(self.data[index])
     return " ".join(parsed.leaves())
コード例 #21
0
def remap_chars(tree):
    """Replace, in place, every leaf that is a key of ``SPECIAL_CHAR_MBACK``.

    Each leaf of ``tree`` that occurs as a key in ``SPECIAL_CHAR_MBACK`` is
    overwritten with the mapped value; all other leaves are left untouched.
    Nothing is returned.
    """
    leaf_count = len(tree.leaves())
    for idx in range(leaf_count):
        leaf = tree.leaves()[idx]
        if leaf in SPECIAL_CHAR_MBACK:
            tree[tree.leaf_treeposition(idx)] = SPECIAL_CHAR_MBACK[leaf]
コード例 #22
0
def get_noun_chunk(tree):
    """Return the text of ``tree`` if it is a noun phrase.

    Args:
        tree: node offering ``label()`` and ``leaves()``.

    Returns:
        The leaves concatenated without separator when the label is
        ``'NP'``; an empty string for any other label (previously this case
        raised ``UnboundLocalError`` because the result was never assigned).
    """
    if tree.label() != 'NP':
        return ''
    return ''.join(tree.leaves())
コード例 #23
0
def get_noun_chunk(tree):
    """Return the noun chunk of ``tree`` as a list.

    For a node labelled ``"NP"`` the list holds one string: the leaves
    concatenated without separator. For any other label the list is empty.
    """
    if tree.label() != "NP":
        return []
    return [''.join(tree.leaves())]
コード例 #24
0
    def write_to_file(self, corpus_path, metadata_path, target_folder_path,
                      ranges, errorLog):
        """Writes files to a target folder with the mappings
        from words in utterances to tree nodes in trees.
        """

        if errorLog:
            errorLog = open(errorLog, 'w')
        corpus = CorpusReader(corpus_path, metadata_path)
        # Iterate through all transcripts
        incorrectTrees = 0
        folder = None
        corpus_file = None

        for trans in corpus.iter_transcripts():

            # print "iterating",trans.conversation_no
            if not trans.has_pos():
                continue
            # print "has pos"
            if ranges and not trans.conversation_no in ranges:
                continue
            # print "in range"
            # just look at transcripts WITH trees as compliment to the
            # below models
            if not trans.has_trees():
                continue
            end = trans.swda_filename.rfind("/")
            start = trans.swda_filename.rfind("/", 0, end)
            c_folder = trans.swda_filename[start + 1:end]
            if c_folder != folder:
                # for now splitting the maps by folder
                folder = c_folder
                if corpus_file:
                    corpus_file.close()
                corpus_file = open(
                    target_folder_path +
                    "/Tree_map_{0}.csv.text".format(folder), 'w')
                wordTreeMapList = TreeMapCorpus(False, errorLog)
                print "new map for folder", folder

            translist = trans.utterances
            translength = len(translist)
            count = 0

            # iterating through transcript utterance by utterance
            # create list of tuples i.e. map from word to the index(ices)
            # (possibly multiple or null) of the relevant leaf/ves
            # of a given tree i.e. utt.tree[0].leaves[0] would be a pair (0,0))
            while count < translength:
                utt = trans.utterances[count]
                words = utt.text_words()
                wordTreeMap = []  # [((word), (List of LeafIndices))]
                forwardtrack = 0
                backtrack = 0
                continued = False
                # print "\n COUNT" + str(count)
                # print utt.damsl_act_tag()
                if len(utt.trees) == 0 or utt.damsl_act_tag() == "x":
                    wordTreeMap.append((utt, []))  # just dummy value
                    # errormessage = "WARNING: NO TREE for file/utt: " +\
                    # str(utt.swda_filename) + " " + utt.caller + "." +  \
                    # str(utt.utterance_index) + "." + \
                    #str(utt.subutterance_index) + " " + utt.text
                    # print(errormessage)
                    count += 1
                    continue
                    # raw_input()

                # indices for which tree and leaf we're at:
                i = 0  # tree
                j = 0  # leaf
                # initialise pairs of trees and ptb pairs
                trees = []
                for l in range(0, len(utt.trees)):
                    trees.append(
                        (utt.ptb_treenumbers[l], count, l, utt.trees[l]))
                # print "TREES = "
                # for tree in trees:
                #    print tree
                origtrees = list(trees)
                origcount = count
                # overcoming the problem of previous utterances contributing
                # to the tree at this utterance, we need to add the words from
                # the previous utt add in all the words from previous utterance
                # with a dialogue act tag/or the same tree?
                # check that the last tree in the previous utterance
                # is the same as the previous one
                previousUttSame = trans.previous_utt_same_speaker(utt)
                # print previousUttSame
                lastTreeMap = None
                if previousUttSame:
                    # print "search for previous full act utt
                    # for " + str(utt.swda_filename) +
                    # str(utt.transcript_index)
                    lastTreeMap = wordTreeMapList.get_treemap(
                        trans, previousUttSame)
                    if ((not lastTreeMap) or (len(lastTreeMap) == 0) or
                        (len(lastTreeMap) == 1 and lastTreeMap[0][1] == [])):
                        # print "no last tree map, backwards searching"
                        while previousUttSame and \
                            ((not lastTreeMap) or (len(lastTreeMap) == 0) or
                             (len(lastTreeMap) == 1 and lastTreeMap[0][1] == [])):
                            previousUttSame = trans.previous_utt_same_speaker(
                                previousUttSame)  # go back one more
                            lastTreeMap = wordTreeMapList.get_treemap(
                                trans, previousUttSame)
                            if previousUttSame:
                                pass
                                # print previousUttSame.transcript_index

                    if not lastTreeMap:
                        pass
                        # print "no last treemap found for:"
                        # print utt.swda_filename
                        # print utt.transcript_index

                if lastTreeMap and \
                        (utt.damsl_act_tag() == "+" or
                         (len(lastTreeMap.treebank_numbers) > 0
                          and lastTreeMap.treebank_numbers[-1] ==
                          utt.ptb_treenumbers[0])):
                    continued = True
                    # might have to backtrack
                    # now checking for wrong trees
                    lastPTB = lastTreeMap.treebank_numbers
                    lastIndexes = lastTreeMap.transcript_numbers
                    lastTreesTemp = lastTreeMap.get_trees(trans)
                    lastTrees = []
                    for i in range(0, len(lastPTB)):
                        lastTrees.append([
                            lastPTB[i], lastIndexes[i][0], lastIndexes[i][1],
                            lastTreesTemp[i]
                        ])
                    if not (lastPTB[-1] == utt.ptb_treenumbers[0]):
                        # print "not same, need to correct!"
                        # print words
                        # print trees
                        # print "last one"
                        # print previousUttSame.text_words()
                        # print lastTrees
                        if utt.ptb_treenumbers[0] - lastPTB[-1] > 1:
                            # backtrack and redo the antecedent
                            count = count - (count - lastIndexes[-1][0])
                            utt = previousUttSame
                            words = utt.text_words()
                            mytrees = []
                            for i in range(0, len(lastTrees) - 1):
                                mytrees.append(lastTrees[i])
                            trees = mytrees + [origtrees[0]]
                            # print "\n(1)backtrack to with new trees:"
                            backtrack = 1
                            # print utt.transcript_index
                            # print words
                            # print trees
                            # raw_input()
                        # alternately, this utt's tree may be further back
                        # than its antecdent's, rare mistake
                        elif utt.ptb_treenumbers[0] < lastTrees[-1][0]:
                            # continue with this utterance and trees
                            # (if there are any), but replace its first
                            # tree with its antecdents last one
                            forwardtrack = 1
                            trees = [lastTrees[-1]] + origtrees[1:]
                            # print "\n(2)replacing first one to lasttreemap's:"
                            # print words
                            # print trees
                            # raw_input()

                    if backtrack != 1:  # we should have no match
                        found_treemap = False
                        # resetting
                        # for t in wordTreeMapList.keys():
                        #        print t
                        #        print wordTreeMapList[t]
                        for t in range(len(lastTreeMap) - 1, -1, -1):
                            # print lastTreeMap[t][1]
                            # if there is a leafIndices for the
                            # word being looked at, gets last mapped one
                            if len(lastTreeMap[t][1]) > 0:
                                # print "last treemapping of last
                                # caller utterance =
                                # " + str(lastTreeMap[t][1][-1])
                                j = lastTreeMap[t][1][-1][1] + 1
                                found_treemap = True
                                # print "found last mapping, j -1 = " + str(j-1)
                                # raw_input()
                                break
                        if not found_treemap:
                            pass
                            # print "NO matched last TREEMAP found for \
                            # previous Utt Same Speaker of " + \
                            # str(trans.swda_filename) + " " + \
                            # str(utt.transcript_index)
                            # print lastTreeMap
                            # for tmap in wordTreeMapList.keys():
                            #    print tmap
                            #    print wordTreeMapList[tmap]
                            # raw_input()

                possibleComment = False  # can have comments, flag
                mistranscribe = False
                LeafIndices = []  # possibly empty list of leaf indices
                word = words[0]
                # loop until no more words left to be matched in utterance
                while len(words) > 0:
                    # print "top WORD:" + word
                    if not mistranscribe:
                        wordtest = re.sub(r"[\.\,\?\"\!]", "", word)
                        wordtest = wordtest.replace("(", "").replace(")", "")
                    match = False
                    LeafIndices = []  # possibly empty list of leaf indices
                    if (possibleComment or word[0:1] in [
                            "{", "}", "-"
                    ] or word in ["/", ".", ",", "]"] or wordtest == "" or any(
                        [
                            x in word for x in
                            ["<", ">", "*", "[", "+", "]]", "...", "#", "="]
                        ])):
                        # no tree equivalent for {D } type annotations
                        if (word[0:1] == "-" or
                                any([x in word for x in
                                     ["*", "<<", "<+", "[[", "<"]])) \
                                and not possibleComment:
                            possibleComment = True
                        if possibleComment:
                            #print("match COMMENT!:" + word)
                            # raw_input()
                            LeafIndices = []
                            match = True
                            #wordTreeMap.append((word, LeafIndices))
                            if any([x in word for x in [">>", "]]", ">"]]) or \
                                    word[0] == "-":  # turn off comment
                                possibleComment = False
                                #del words[0]
                        # LeafIndices will be an empty list here
                        wordTreeMap.append((word, LeafIndices))
                        LeafIndices = []
                        match = True
                        # print "match annotation!:" + word
                        del words[0]  # word is consumed, should always be one
                        if len(words) > 0:
                            word = words[0]
                            wordtest = re.sub(r"[\.\,\?\/\)\(\"\!]", "", word)
                            wordtest = wordtest.replace("(", "")
                            wordtest = wordtest.replace(")", "")
                        else:
                            break
                        continue
                        # carry on to next word without updating indices?
                    else:
                        while i < len(trees):
                            # print "i number of trees :" + str(len(utt.trees))
                            # print "i tree number :" + str(i)
                            # print "i loop word :" + word
                            tree = trees[i][3]
                            # print "looking at ptb number " + str(trees[i][0])
                            # print "looking at index number " \
                            #+ str(trees[i][1])+","+str(trees[i][2])
                            while j < len(tree.leaves()):
                                leaf = tree.leaves()[j]
                                # print "j number of leaves : " \
                                #+ str(len(tree.leaves()))
                                # print "j loop word : " + word
                                # print "j loop wordtest : " + wordtest
                                # print "j leaf : " + str(j) + " " + leaf
                                breaker = False
                                # exact match
                                if wordtest == leaf or word == leaf:
                                    LeafIndices.append((i, j))
                                    wordTreeMap.append((word, LeafIndices))
                                    # print("match!:" + word + " " + \
                                    # str(utt.swda_filename) + " " + \
                                    # utt.caller + "." +  \
                                    # str(utt.utterance_index) + \
                                    # "." + str(utt.subutterance_index))
                                    del words[0]  # word is consumed
                                    if len(words) > 0:
                                        word = words[0]  # next word
                                        wordtest = re.sub(
                                            r"[\.\,\?\/\)\(\"\!]", "", word)
                                        wordtest = wordtest.replace("(", "")
                                        wordtest = wordtest.replace(")", "")
                                    LeafIndices = []
                                    j += 1  # increment loop to next leaf
                                    match = True
                                    breaker = True
                                    # raw_input()
                                    break
                                elif leaf in wordtest or \
                                        leaf in word and not leaf == ",":
                                    testleaf = leaf
                                    LeafIndices.append((i, j))
                                    j += 1
                                    for k in range(j, j + 3):  # 3 beyond
                                        if (k >= len(tree.leaves())):
                                            j = 0
                                            i += 1
                                            #breaker = True
                                            breaker = True
                                            break  # got to next tree
                                        if (testleaf + tree.leaves()[k]) \
                                                in wordtest or (testleaf +
                                                                tree.leaves()[k])\
                                                in word:
                                            testleaf += tree.leaves()[k]
                                            LeafIndices.append((i, k))
                                            j += 1
                                            # concatenation
                                            if testleaf == wordtest or \
                                                    testleaf == word:  # word matched
                                                wordTreeMap.append(
                                                    (word, LeafIndices))
                                                del words[0]  # remove word
                                                # print "match!:" + word +\
                                                #str(utt.swda_filename) + " "\
                                                # + utt.caller + "." +  \
                                                # str(utt.utterance_index) +\
                                                # "." + \
                                                # str(utt.subutterance_index))
                                                if len(words) > 0:
                                                    word = words[0]
                                                    wordtest = re.sub(
                                                        r"[\.\,\?\/\)\(\"\!]",
                                                        "", word)
                                                    wordtest = wordtest.\
                                                        replace("(", "")
                                                    wordtest = wordtest.\
                                                        replace(")", "")
                                                # reinitialise leaves
                                                LeafIndices = []
                                                j = k + 1
                                                match = True
                                                breaker = True
                                                # raw_input()
                                                break
                                else:
                                    # otherwise go on
                                    j += 1
                                if breaker:
                                    break
                                if match:
                                    break
                            if j >= len(tree.leaves()):
                                j = 0
                                i += 1
                            if match:
                                break

                    # could not match word! try mistranscriptions first:
                    if not match:
                        if not mistranscribe:  # one final stab at matching!
                            mistranscribe = True
                            for pair in possibleMistranscription:
                                if pair[0] == wordtest:
                                    wordtest = pair[1]
                                    if len(wordTreeMap) > 0:
                                        if len(wordTreeMap[-1][1]) > 0:
                                            i = wordTreeMap[-1][1][-1][0]
                                            j = wordTreeMap[-1][1][-1][1]
                                        else:
                                            # go back to beginning of
                                            # tree search
                                            i = 0
                                            j = 0
                                    else:
                                        i = 0  # go back to beginning
                                        j = 0
                                    break  # matched
                        elif continued:
                            # possible lack of matching up of words in
                            # previous utterance same caller and same
                            # tree// not always within same tree!!
                            errormessage = "Possible bad start for \
                            CONTINUED UTT ''"                                              + words[0] + "'' in file/utt: "\
                                + str(utt.swda_filename) + "\n " + utt.caller + \
                                "." + str(utt.utterance_index) + "." + \
                                str(utt.subutterance_index) + \
                                "POSSIBLE COMMENT = " + str(possibleComment)
                            # print errormessage
                            if not errorLog is None:
                                errorLog.write(errormessage + "\n")
                            # raw_input()
                            if backtrack == 1:
                                backtrack += 1
                            elif backtrack == 2:
                                # i.e. we've done two loops and
                                # still haven't found it, try the other way
                                count = origcount
                                utt = trans.utterances[count]
                                words = utt.text_words()
                                word = words[0]
                                trees = [lastTrees[-1]] + origtrees[1:]
                                # print "\nSECOND PASS(2)replacing \
                                # first one to lasttreemap's:"
                                # print words
                                # print trees
                                backtrack += 1
                                # mistranscribe = False #TODO perhaps needed
                                wordTreeMap = []
                                # switch to forward track this is
                                # the only time we want to try
                                # from the previous mapped leaf in the
                                # other tree
                                foundTreemap = False
                                for t in range(len(lastTreeMap) - 1, -1, -1):
                                    # backwards iteration through words
                                    # print lastTreeMap[t][1]
                                    if len(lastTreeMap[t][1]) > 0:
                                        # print "last treemapping of last \
                                        # caller utterance = " + \
                                        # str(lastTreeMap[t][1][-1])
                                        j = lastTreeMap[t][1][-1][1] + 1
                                        foundTreemap = True
                                        # print "found last mapping, j = " \
                                        #+ str(j)
                                        # raw_input()
                                        # break when last tree
                                        # mapped word from this caller is found
                                        break
                                    if not foundTreemap:
                                        # print "NO matched last TREEMAP found\
                                        # for previous Utt Same Speaker of " + \
                                        # str(utt.swda_filename) + " " + \
                                        # utt.caller + "." +  \
                                        # str(utt.utterance_index) + "." +\
                                        #  str(utt.subutterance_index)
                                        j = 0
                                        # for tmap in wordTreeMapList.keys():
                                        #    print tmap
                                        #    print wordTreeMapList[tmap]
                                        # raw_input()
                                i = 0  # go back to first tree
                                continue
                            elif forwardtrack == 1:
                                forwardtrack += 1
                            elif forwardtrack == 2:
                                count = count - (count - lastIndexes[-1][0])
                                utt = previousUttSame
                                words = utt.text_words()
                                word = words[0]
                                mytrees = []
                                for i in range(0, len(lastTrees) - 1):
                                    mytrees.append(lastTrees[i])
                                trees = mytrees + [origtrees[0]]
                                # print "\nSECOND PASS(1)backtrack to \
                                # with new trees:"
                                # print utt.transcript_index
                                # print words
                                # print trees
                                forwardtrack += 1
                                # mistranscribe = False #TODO maybe needed
                                wordTreeMap = []
                                # raw_input()
                            elif forwardtrack == 3 or backtrack == 3:
                                # if this hasn't worked reset to old trees
                                # print "trying final reset"
                                count = origcount
                                utt = trans.utterances[count]
                                words = utt.text_words()
                                word = words[0]
                                trees = origtrees
                                forwardtrack = 0
                                backtrack = 0
                                # mistranscribe = False #TODO maybe needed
                                wordTreeMap = []
                                # raw_input()
                            else:
                                pass
                                # print "resetting search"
                                # raw_input()
                            # unless forward tracking now,
                            # just go back to beginning
                            i = 0  # go back to beginning of tree search
                            j = 0
                        else:
                            mistranscribe = False
                            LeafIndices = []
                            wordTreeMap.append((word, LeafIndices))
                            errormessage = "WARNING: 440 no/partial tree \
                            mapping for ''"                                            + words[0] + "'' in file/utt: "\
                                + str(utt.swda_filename) + " \n" + utt.caller\
                                + "." + str(utt.utterance_index) + "." + \
                                str(utt.subutterance_index) + \
                                "POSSIBLE COMMENT = " + str(possibleComment)
                            # print utt.text_words()
                            del words[0]  # remove word
                            # for trip in wordTreeMap:
                            #    print "t",trip
                            if len(words) > 0:
                                word = words[0]
                                wordtest = re.sub(r"[\.\,\?\/\)\(\"\!]", "",
                                                  word)
                                wordtest = wordtest.replace("(", "")
                                wordtest = wordtest.replace(")", "")
                            # print errormessage
                            if errorLog:
                                errorLog.write("possible wrong tree mapping:" +
                                               errormessage + "\n")
                            raw_input()
                # end of while loop (words)
                mytreenumbers = []
                for treemap in trees:
                    # the whole list but the tree
                    mytreenumbers.append(treemap[:-1])
                if not len(utt.text_words()) == len(wordTreeMap):
                    print "ERROR. uneven lengths!"
                    print utt.text_words()
                    print wordTreeMap
                    print trans.swda_filename
                    print utt.transcript_index
                    raw_input()
                    count += 1
                    continue
                # add the treemap
                wordTreeMapList.append(trans.conversation_no,
                                       utt.transcript_index,
                                       tuple(mytreenumbers),
                                       tuple(wordTreeMap))
                count += 1
            # rewrite after each transcript
            filedict = defaultdict(str)
            for key in wordTreeMapList.keys():
                csv_string = '"' + str(list(wordTreeMapList[key])) + '"'
                mytreenumbers = wordTreeMapList[key].transcript_numbers
                myptbnumbers = wordTreeMapList[key].treebank_numbers
                tree_list_string = '"'
                for i in range(0, len(mytreenumbers)):
                    treemap = [myptbnumbers[i]] + mytreenumbers[i]
                    tree_list_string += str(treemap) + ";"
                tree_list_string = tree_list_string[:-1] + '"'
                filename = '"' + key[0:key.rfind(':')] + '"'
                transindex = key[key.rfind(':') + 1:]
                filedict[int(transindex)] = filename \
                    + "\t" + transindex + '\t' + csv_string + "\t" \
                    + tree_list_string + "\n"
            for key in sorted(filedict.keys()):
                corpus_file.write(filedict[key])

            wordTreeMapList = TreeMapCorpus(False, errorLog)  # reset each time
        print "\n" + str(incorrectTrees) + " incorrect trees"
        corpus_file.close()
        if not errorLog is None:
            errorLog.close()