"clause": {},
}

# NOTE(review): this file is Python 2 (print statements, ur'' literals).
#for xmlfilename in xmlfiles:
#for xmlfilename in glob.glob('/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellCheckerCorrected/entrycorrected_*/0/0/0/entrycorrected_*.E*.dep.xml'):
# Driver loop: analyse every dependency-parse XML for corpus entry 394 only
# (the two commented-out globs above are earlier, corpus-wide runs).
for xmlfilename in glob.glob('/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellCheckerCorrected/entrycorrected_394/0/0/0/entrycorrected_*.E*.dep.xml'):

    #xmlfilename = "/home/nparslow/Documents/AutoCorrige/Corpora/" + xmlfilename
    print "analysing", xmlfilename
    # tok2finalforms / tok2lemmacats map token positions to analysed forms and
    # lemma/category info; verb2info carries per-verb features; the last pair
    # is the parse-weight information.
    tok2finalforms, tok2lemmacats, verb2info, trees, (weight, maxweight) = getFinalTokenFormsAndTreesAndWeight(xmlfilename)
    print
    print "verb2info"
    print verb2info
    print

    # Aggregate the per-verb features into summary counts.
    info  = classifyVerbs.classifyVerbs(verb2info)

    print "info"
    print info
    print
    # NOTE(review): the triple-quoted literal below is dead (commented-out)
    # code.  Its closing ''' is not visible before the next def in this chunk,
    # so the string appears unterminated here — verify against the full file.
    '''
    for verb, info in verb2info.items():

        if len(info) == 1:
            tense = "notense"
            if "tense" in info[0]:
                tense = info[0]["tense"]
            mood = "infinitive" # sometimes mood is not there
            if "mode" in info[0]:
                mood = info[0]["mode"]
            if mood == "indicative" and tense == "notense":
    def __init__(self, processedDepXMLFile, processedTokenFile, processedLogFile, processedMEltFile, debug=False):
        """Build a Sentence by loading one sentence's analysis files.

        processedDepXMLFile -- dependency-parse XML; read for wordforms,
            lemma/cat maps, parse weights, tree counts and verb info.
        processedTokenFile  -- raw tokeniser output; read first, and the only
            token source when the parse produced no wordforms.
        processedLogFile    -- parser log; only echoed in debug output here.
        processedMEltFile   -- MElt tagger output; aligned with the raw
            tokens for per-token tag confidences and diffs.
        debug -- when True, print intermediate structures to stdout.
        """

        if debug:
            print "new Sentence"
            print processedDepXMLFile
            print processedTokenFile
            print processedLogFile
            print processedMEltFile

        self.debug = debug

        # All analysis fields default to None so "not computed" stays
        # distinguishable from a computed zero/empty value.
        self.tokens = None
        self.rawtokens = None
        self.matchregex = None
        self.uniquetokens = None
        self.lemmaCats = None
        self.spellingcorrections = None
        self.weightperword = None
        self.minweight = None
        self.meltconfidences = None
        self.meltdiffs = None
        self.wordsbeforemainverb = None
        self.hasnomainverb = None
        # parse info
        self.parsed = None
        # verb info:
        self.vgroups = None # should be sum of verb entries for a particular axis projection, != no. of verbs as compounds count as one verb group
        self.vanalysis = None # will be a dictionary with property -> count
        self.vsingle = None
        self.vaux = None
        self.vcompound = None
        self.vindicative = None
        self.vconditional = None
        self.vsubjunctive = None
        self.vimperfect = None
        self.vfuture = None
        self.vpresent = None
        self.vnotense = None
        # clause info:
        self.crel = None
        self.cnom = None
        self.cacc = None
        self.cloc = None
        # tree info
        self.trees = None

        self.wordforms = []
        self.verbAnalysis = None
        self.totalverbgroups = None

        if os.path.isfile(processedTokenFile):
            self.rawtokens = readDepXMLFile.getTokensFromFile(processedTokenFile)

        # melttoks is a list of (tok, tag, prob)
        # NOTE(review): called even when the token file was missing, in which
        # case self.rawtokens is still None — confirm loadAndAlign copes.
        meltToks = getMEltInfo.loadAndAlign(processedMEltFile, self.rawtokens )

        if os.path.isfile(processedDepXMLFile):
            xmlinfo = readDepXMLFile.readDepXMLFile(processedDepXMLFile, self.debug)
            self.wordforms = xmlinfo["wordforms"]

            if debug:
                print "xmlinfo, tok to final forms:"
                print xmlinfo["tok2finalforms"]
                print "xmlinfo, tok to lemmacats:"
                print xmlinfo["tok2lemmacats"]
            self.setLemmaCats(xmlinfo["tok2lemmacats"])
            if debug:
                print "xmlinfo, lemmacats:"
                print self.lemmaCats
            self.weightperword = xmlinfo["weightperword"]
            self.minweight = xmlinfo["minweight"]
            self.wordsbeforemainverb = xmlinfo["wordsbeforemainverb"]
            # -1 is treated as the "no main verb found" sentinel
            self.hasnomainverb = 1 if xmlinfo["wordsbeforemainverb"] == -1 else 0
            # keep only the per-key tree counts, not the trees themselves
            self.trees = dict( (x,len(xmlinfo["trees"][x])) for x in xmlinfo["trees"])
            self.parsed = xmlinfo["parsemode"]
            #print "sent parsed", self.parsed

            self.tok2finalforms = xmlinfo["tok2finalforms"]

            #print "sent", xmlinfo["verb2info"]
            # classifyVerbs returns a (analysis, totalverbgroups) pair here
            self.verbAnalysis, self.totalverbgroups = classifyVerbs.classifyVerbs(xmlinfo["verb2info"])
            #print "sent", self.verbAnalysis

            self.calcSpellingCorrections()

            self.meltdiffs = getMEltInfo.getMEltDiffs(xmlinfo["tok2lemmacats"], meltToks, debug)

        if len(self.wordforms) > 0:
            # sentence was at least partially parsed
            #print "sorted word forms:"
            #print sorted(wordsforms, key=lambda x: x[2].split('_')[1])
            # sort by the start point of the token
            #re.sub(ur'E\d+F\d+\|', u'', x[3], flags=re.UNICODE)
            # Each wordform looks like (lemma, form, cluster, lex); the lex
            # string appears to encode token spans as ...F<pos>|<form>..., and
            # the two regexes pull out the observed forms and their integer
            # positions — TODO confirm the lex format against the dep XML.
            self.tokens = [Token(x[0], x[1], re.findall(ur'(?<=[F\d]\d\|)[^ ]+', x[3], flags=re.UNICODE),
                                     [int(x) for x in re.findall(ur'(?<=F)\d+(?=\|)', x[3], flags=re.UNICODE) ])
                               for x in sorted(self.wordforms, key=lambda x: int(x[2].split('_')[1]))]
            #sentence.forms = [x[1] for x in sorted(wordsforms, key=lambda x: x[2].split('_')[1])]
        else:
            # sentence wasn't parsed, so use the tokens file:
            #tokens = getTokensFromFile(processedTokenFile)
            print "using tokens***"
            self.tokens = [Token(None, None, self.rawtokens[i], i) for i in range(len(self.rawtokens))]

        if debug: print "sent tokens:", self.tokens

        # regex built from the raw tokens (presumably for matching this
        # sentence's text elsewhere — see makeRegexFromTokens)
        self.matchregex = makeRegexFromTokens(self.rawtokens, debug)
        if debug:
            print "obstokens:", self.rawtokens
            print "regex", self.matchregex



        # a missing or 'None' MElt probability is treated as full confidence
        self.meltconfidences = [float(x[2]) if (x[2] is not None and x[2] != 'None') else 1.0 for x in meltToks]
        if debug: print "sent melt confs:", self.meltconfidences
def getNextSentenceFromFiles( processedDepXMLFile, processedTokenFile, processedLogFile, debug=False ):
    """Build and return a Sentence from one sentence's analysis files;
    returns None when the dep XML file does not exist.

    NOTE(review): Sentence() is invoked below with no arguments, while the
    __init__ defined earlier in this file requires four file-path arguments —
    one of the two is likely stale; confirm which Sentence this resolves to.
    """
    #print processedSentenceFile
    sentence = None
    if os.path.isfile(processedDepXMLFile):
        sentence = Sentence()

        tree = ET.parse(processedDepXMLFile)
        # 'W' nodes are 'words' which can include multiple tokens, e.g. 'bien que' is one word
        # .iter for recursive, .findall for depth of 1
        # id the cluster then get the lex element from the cluster (we'll process it later)
        # Collect (lemma, form, cluster, lex) for every non-epsilon node.
        wordsforms = [(x.attrib['lemma'], x.attrib['form'], x.attrib['cluster'],
                       fixMixedEncodings(tree.findall("cluster[@id='" + x.attrib['cluster']+"']")[0].attrib["lex"]))
                      for x in tree.iter('node') if len(x.get('lemma'))>0 and x.get('lemma') != "_EPSILON"]
        print "wordsforms"
        print wordsforms
        # correct the encodings and remove epsilons

        tokens = getTokensFromFile(processedTokenFile)
        if len(wordsforms) > 0:
            # sentence was at least partially parsed
            #print "sorted word forms:"
            #print sorted(wordsforms, key=lambda x: x[2].split('_')[1])
            # sort by the start point of the token
            #re.sub(ur'E\d+F\d+\|', u'', x[3], flags=re.UNICODE)
            # The lex string appears to encode token spans as
            # ...F<pos>|<form>...; pull out the observed forms and their
            # integer positions — TODO confirm the lex format.
            sentence.tokens = [Token(x[0], x[1], re.findall(ur'(?<=[F\d]\d\|)[^ ]+', x[3], flags=re.UNICODE),
                                     [int(x) for x in re.findall(ur'(?<=F)\d+(?=\|)', x[3], flags=re.UNICODE) ])
                               for x in sorted(wordsforms, key=lambda x: int(x[2].split('_')[1]))]
            #sentence.forms = [x[1] for x in sorted(wordsforms, key=lambda x: x[2].split('_')[1])]
        else:
            # sentence wasn't parsed, so use the tokens file:
            #tokens = getTokensFromFile(processedTokenFile)
            print "using tokens***"
            sentence.tokens = [Token(None, None, tokens[i], i) for i in range(len(tokens))]

        #print sentence.tokens
        #print "obs forms:", [x.observedform for x in sentence.tokens]

        # NOTE(review): this re-reads the same dep XML a second time via
        # compareCorrectedCorpus instead of reusing 'tree' parsed above.
        tok2finalforms, tok2lemmacats, verb2info, trees, (weight, minweight) = \
            compareCorrectedCorpus.getFinalTokenFormsAndTreesAndWeight(processedDepXMLFile)
        sentence.setLemmaCats(tok2lemmacats)
        sentence.weightperword = weight
        sentence.minweight = minweight

        # NOTE(review): verbAnalysis is used below as a dict of counts, but the
        # __init__ earlier in this file unpacks classifyVerbs.classifyVerbs(...)
        # as a 2-tuple (analysis, totalverbgroups) — one call site is stale.
        verbAnalysis = classifyVerbs.classifyVerbs(verb2info)
        # Copy each aggregate count onto the sentence, defaulting to 0 when the
        # key is absent.  ("subjonctive" is presumably the exact key spelling
        # emitted by classifyVerbs — verify before "fixing" it.)
        sentence.vsingle = 0
        if "single" in verbAnalysis: sentence.vsingle += verbAnalysis["single"]
        sentence.vaux = 0
        if "aux" in verbAnalysis: sentence.vaux += verbAnalysis["aux"]
        sentence.vcompound = 0
        if "compound" in verbAnalysis: sentence.vcompound += verbAnalysis["compound"]
        sentence.vindicative = 0
        if "indicative" in verbAnalysis: sentence.vindicative += verbAnalysis["indicative"]
        sentence.vconditional = 0
        if "conditional" in verbAnalysis: sentence.vconditional += verbAnalysis["conditional"]
        sentence.vsubjunctive = 0
        if "subjonctive" in verbAnalysis: sentence.vsubjunctive += verbAnalysis["subjonctive"]
        sentence.vimperfect = 0
        if "imperfect" in verbAnalysis: sentence.vimperfect += verbAnalysis["imperfect"]
        sentence.vfuture = 0
        if "future" in verbAnalysis: sentence.vfuture += verbAnalysis["future"]
        sentence.vpresent = 0
        if "present" in verbAnalysis: sentence.vpresent += verbAnalysis["present"]
        sentence.vnotense = 0
        if "notense" in verbAnalysis: sentence.vnotense += verbAnalysis["notense"]
        # clause info:
        sentence.crel = 0
        if "rel" in verbAnalysis: sentence.crel += verbAnalysis["rel"]
        sentence.cnom = 0
        if "nom" in verbAnalysis: sentence.cnom += verbAnalysis["nom"]
        sentence.cacc = 0
        if "acc" in verbAnalysis: sentence.cacc += verbAnalysis["acc"]
        sentence.cloc = 0
        if "loc" in verbAnalysis: sentence.cloc += verbAnalysis["loc"]

        # Count sxpipe-level spelling changes: single-token final forms that
        # differ (case-insensitively) from the corresponding raw token.
        # tok2finalforms appears to be keyed from 1, hence the i+1 below.
        sxpipeSpellingChanges = 0
        for i in range(len(tokens)):
            # skip multitoken elements, too hard
            if len(tok2finalforms[i+1]) > 1 or len(tok2finalforms[i+1][0].split(' ')) > 1: continue
            if tok2finalforms[i+1][0][0] == "_": continue
            t = tokens[i].lower()
            f = tok2finalforms[i+1][0].lower()

            if t != f:
                print "spelling?", t, f, tok2finalforms[i+1]
                sxpipeSpellingChanges += 1
        sentence.spellingcorrections=sxpipeSpellingChanges
        # Dead code below: an earlier multi-token spelling comparison kept as
        # an unassigned string literal (note it contains garbled lines).
        '''
            t = joinTokens([tokens[i].lower()])
            f = joinTokens(tok2finalforms[i+1]).lower()

            print t, f
            if i > 0:
                # tok2finalforms first token is no. 1
                if tok2finalforms[i+1] == tok2finalforms[i]: continue # don't look if two same wordforms in a row
            if t != f:
                # check its not a multiwoprint "spelling?", t, f, tok2finalforms[i+1]
                    sxpipeSpellingChanges += 1rd thing
                tmpi = i+1
                isDouble = False
                while f.startswith(t) and tmpi < len(tokens) and len(t) < len(f):
                    t = joinTokens([t, tokens[tmpi]])
                    if t == f:
                        isDouble = True
                        break
                    tmpi += 1

                if not isDouble:
                    print "spelling?", t, f, tok2finalforms[i+1]
                    sxpipeSpellingChanges += 1
        '''


        # More dead experimental code retained as a string literal.
        '''
        for s_token, observedtoken in zip([sentence.tokens[0]] + [sentence.tokens[i] for i in range(1,len(sentence.tokens))
                                                                  if sentence.tokens[i-1].parseposition != \
                                                                     sentence.tokens[i].parseposition],
                                          getTokensFromFile(processedTokenFile)):
            # the tokens file overrules the depxml, e.g. in depxml you have \?
            if s_token.observedform != observedtoken:
                print "combining:", s_token.frmgform, s_token.observedform, observedtoken
                s_token.observedform = observedtoken
        '''
        #print words
        #print forms

        # we remove double entries from amalgams
        if debug: print "pre make regex:"
        '''
        print [(sentence.tokens[i].parseposition[0],
                sentence.tokens[i-1].parseposition[-1],
                sentence.tokens[i].observedform) for i in range(1, len(sentence.tokens))]
        print [sentence.tokens[0].observedform] +[sentence.tokens[i].observedform
                                   for i in range(1,len(sentence.tokens))
                                   if sentence.tokens[i].parseposition[0] > sentence.tokens[i-1].parseposition[-1]]
        '''
        # (two further dead-code string literals follow)
        '''
        obstokens = []
        obstokenpositions = []
        for token in sentence.tokens:
            for obstoken, obstokenposition in zip(token.observedform, token.parseposition):
                if obstokenposition not in obstokenpositions:
                    obstokens.append(obstoken)
                    obstokenpositions.append(obstokenposition)
        '''
        # deduplicated observed tokens drive the sentence-matching regex
        obstokens = sentence.setAndGetUniqueTokens()
        #print "obstokens", obstokens
        sentence.matchregex = makeRegexFromTokens(obstokens)
        if debug: print "obstokens:", obstokens
        '''
        sentence.matchregex = makeRegexFromTokens(
            [sentence.tokens[0].observedform] +[sentence.tokens[i].observedform
                                   for i in range(1,len(sentence.tokens))
                                   if sentence.tokens[i].parseposition[0] > sentence.tokens[i-1].parseposition[-1]])
        '''
    return sentence