def main():
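    # Compare the beginner / intermediate / advanced vocabulary lists against Lexique380
    # and report how the three lists overlap.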


    lexiquefile = u"/home/nparslow/Documents/AutoCorrige/tools/Lexique380/Bases+Scripts/Lexique380.txt"
    lex = {}
    queryLexique380.loadLexiqueToDict( lexiquefile, lex )

    vocabfiledir = "/home/nparslow/Documents/AutoCorrige/vocabulary/vocab_lists/vocabulaire_progressif_du_francais/"
    beginnerfilename = "001_beginnervocablist.txt"
    intermediatefilename = "002_intermediatevocablist.txt"
    advancedfilename = "003_advancedvocablist.txt"

    beginnerWords = getVocabFromFile(os.path.join(vocabfiledir, beginnerfilename))
    print "beginner vocab size", len(beginnerWords)
    checkWordQuality(beginnerWords, lex)

    intermediateWords = getVocabFromFile(os.path.join(vocabfiledir, intermediatefilename))
    print "intermed vocab size", len(intermediateWords)
    checkWordQuality(intermediateWords, lex)

    advancedWords = getVocabFromFile(os.path.join(vocabfiledir, advancedfilename))
    print "advanced vocab size", len(advancedWords)
    checkWordQuality(advancedWords, lex)

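    # Split the three lists into groups according to which levels each word appears in.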
    allintersected = set(beginnerWords).intersection(set(intermediateWords)).intersection(set(advancedWords))

    purebeginner = set(beginnerWords).difference(set(intermediateWords)).difference(allintersected)
    highbeginner = set(beginnerWords).intersection(set(intermediateWords)).difference(allintersected)
    pureintermediate = set(intermediateWords).difference(set(beginnerWords)).difference(set(advancedWords))
    highintermediate = set(advancedWords).intersection(set(intermediateWords)).difference(allintersected)
    highadvanced = set(advancedWords).difference(highintermediate).difference(allintersected)



    print "group sizes:"
    print "pure beginner     :", len(purebeginner)
    print "high beginner     :", len(highbeginner)
    print "all levels        :", len(allintersected)
    print "pure intermediate :", len(pureintermediate)
    print "high intermediate :", len(highintermediate)
    print "high advanced     :", len(highadvanced)
    print "total         :", len(purebeginner) + len(highbeginner) + len(allintersected) + len(highintermediate) + len(highadvanced)
    print "original total:", len(beginnerWords) + len(intermediateWords) + len(advancedWords)
    # the difference between the two totals is the overlap (words counted in more than one list)

    for samp in [purebeginner, highbeginner, allintersected, pureintermediate, highintermediate, highadvanced]:
        print
        print
        print "Next Sample!!!"
        for word in samp:
            print word
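

# Entry point for the feature extractor (invoked as textExtractor.py, per the usage
# message below): read an options file, compute the requested variables for each text
# and write the results to ARFF files.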
def main(argv):
    # todo: add a debug variable and an options-filename variable
    optionsfilename = "settings/optionsfile.txt"
    try:
        opts, args = getopt.getopt(argv,"hi:", ["in_options_file="])
    except getopt.GetoptError:
        print 'textExtractor.py -i <input options file>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'textExtractor.py -i <input options file>'
            sys.exit()
        elif opt in ("-i", "--in_options_file"):
            optionsfilename = arg 

    globalparams, variableparams = optionsFileReader.readOptionsFile(optionsfilename)
    filenames = allCorpusFiles( globalparams["origtextdir"] )
    print filenames
    # todo for the moment we just take the first element for these
    #meltDir = globalparams["melteddir"][0]
    #ddagDir = globalparams["ddageddir"][0]
    #frmgDir = globalparams["frmgeddir"][0]
    filenameResources = {
        "filename": None,
        "melteddir" : globalparams["melteddir"][0],
        "ddageddir" : globalparams["ddageddir"][0],
        "frmgeddir" : globalparams["frmgeddir"][0],
    }
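    # "filename" (and later "ddagfile") are filled in per input file in the loop below.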

    print globalparams
    outarffdir = globalparams["outdir"][0]
    corpusName = globalparams["corpusName"][0]
    headerInfo = globalparams["headerInfo"][0]
    lexiquefile = globalparams["lexiqueDict"][0]
    freq2ranksfile = globalparams["freq2ranks"][0]

    gensimModelFile = globalparams["word2vecmodel"][0]

    variableTypes = set([x[0] for x in variableparams])

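    # frequency-rank lookup (by lemma and category, judging by the name) used by the
    # frequency-based measures (PLex, S, altS, LFP)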
    lemmacat2freqrank = {}
    if "PLex" in variableTypes or "S" in variableTypes or "altS" in variableTypes or "LFP" in variableTypes:
        print "loading freq info"
        lemmacat2freqrank = calcPLex.loadLemmaCat2freqrank(freq2ranksfile)

    word2vecModel = None
    if "w2vct" in variableTypes:
        print "loading word2vec model"
        word2vecModel = gensim.models.Word2Vec.load(gensimModelFile)

    # placeholder values, only replaced if a bi-gram model is actually loaded below
    nGramDict, nmoGramDict, totalcounts = {}, {}, 1000000000000000
    if "bigramLogProb" in variableTypes:
        print "loading bi-gram model"
        ngramModelFile = globalparams["bigrammodel"][0]
        nGramDict, nmoGramDict, totalcounts = nGramModel.getNgramDicts(ngramModelFile)

    lexiqueDict = {}
    if "syllablesPerWord" in variableTypes:
        loadLexiqueToDict(lexiquefile, lexiqueDict)

    # create a list of variables for the arff file:
    variables = [ ("filename", "string")] + \
        [ (variableAndParamsToString(vname, params), "numeric") for vname,params in variableparams] + \
        [ ("level", "numeric")] # note that level must go last!

    for x in variables:
        print x

    #filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/C/Catja.txt"]
    #filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/A/Arvid.txt"]
    #filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/A/Amie4.txt"]
    #filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/B/Bror2.txt"]
    filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/C/Caroline.txt"]
    filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/E/Eddy.txt"] # just to test an E
    filenames = ["/home/pinot/alpage/nparslow/Documents/Corpora/CEFLE/E/Eddy.txt"] # just to test an E


    resources = {
        "lemmacat2freqrank": lemmacat2freqrank,
        "word2vecModel": word2vecModel,
        "nGramDict": nGramDict,
        "nmoGramDict": nmoGramDict,
        "nGramCounts": totalcounts,
        "lexiqueDict": lexiqueDict
    }
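    # resources shared by every text; passed to calcVariables() for each file below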

    outputRows = []


    allobservedtrees = set([])
    treespertext = []
    for filename in filenames:

        # we change the resource filename for each file:
        filenameResources["filename"] = filename
        baseFileName = os.path.basename(filename)
        baseFileName, extension = os.path.splitext(baseFileName)
        filenameResources["ddagfile"] = os.path.join(filenameResources["ddageddir"],
                                                         os.path.basename(baseFileName) + ".ddag")

        print
        print filename
        #fname = os.path.join(baseDir, filename)
        #if "CEFLE" in baseDir:
        #    fname = os.path.join(baseDir, filename[0], filename)

        #text = getDocumentProperties(frmgDir, meltDir, ddagDir, fname, word2vecModel,
        #                             (nGramDict, nmoGramDict, totalcounts), debug=False)
        #text = getDocumentProperties(filenameResources, variables, word2vecModel,
        #                             (nGramDict, nmoGramDict, totalcounts), debug=False)
        text = getDocumentProperties(filenameResources, variableparams, debug=False)

        text.calcVariables( resources )

        for i in range(len(variableparams)+1):
            #variable, params = variableparams[i]
            varlabel = variables[i]
            # squeeze the filename in first:
            value = filename
            if i > 0:
                value = text.variablevalues[i-1]
            print varlabel, "\t", str(value)
        print 'level', "\t", text.level

        allobservedtrees.update(text.trees.keys())
        treespertext.append( (set(text.trees.keys()), text.level) )
        #print "mwpw", text.getMeanWeightPerWord()
        outputRows.append( [baseFileName] + text.variablevalues + [text.level] )


    savetoArff(outarffdir, corpusName, headerInfo, variables, outputRows )
    savetoArff(outarffdir, corpusName + "class", headerInfo, variables, outputRows, levelAsClass=True )

    arfftreefile = "testtrees"
    #corpusName = "test"
    #headerInfo = "a test corpus\n of stuff"
    treevariables = list(allobservedtrees)
    #print allobservedtrees
    #print treespertext
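    # one row per text: a 0/1 indicator for each observed tree, with the level appended last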
    treeoutputRows = [[1 if x in trees else 0 for x in treevariables] + [level] for trees, level in treespertext]
    treevariables.append("level")
    #print "ntrees", len(treevariables)
    #for a,b in zip(treevariables, treeoutputRows):
    #    print a,len(b), b

    savetoTreeArff(outarffdir, arfftreefile, headerInfo, treevariables, treeoutputRows, levelAsClass=True )
                    sentence = []
                elif line[0] != "#":
                    tokennum, tokeninfo, nexttokennum = line.split('\t')
                    #print tokeninfo
                    # can have multiple tokens so start from the right:
                    token = re.search(ur'(?: )([^\}]+)$', tokeninfo, flags=re.UNICODE).groups()[0]
                    # some left over space:
                    token = token.strip()

                    # in case some empty tokens arrive here:
                    if len(token) > 0:
                        if token not in vocab2count:
                            sentence.append(token)
                            missingwords.add(token)

nativecorpus = "/home/nparslow/Documents/AutoCorrige/Corpora/tokenised/CORPUS_LITTAVANCE"
missed = {}
missingwords = set([])
analyse(nativecorpus, vocab2count, missed, missingwords)
print missed

lex = {}
queryLexique380.loadLexiqueToDict(u"/home/nparslow/Documents/AutoCorrige/tools/Lexique380/Bases+Scripts/Lexique380.txt", lex)

for mword in missingwords:
    if mword in lex:
        print mword, lex[mword]
    else:
        print "NOT IN LEXIQUE!", mword
print "not in lex:", len([x for x in missingwords if x not in lex]), " of ", len(missingwords)
def getDocumentProperties(corpus, filename, debug=False):

    text = Text([])
    sentenceNum = 1
    currentParagraph = Paragraph([])
    wordSyllableLengths = []
    wordCharacterLengths = []

    baseFileName, extension = os.path.splitext(filename)
    processedLogFile = os.path.basename(baseFileName) + ".log"

    currentSentence = None

    # get the learner level if it's known:
    text.level = getCorpusInfo.getCorpusInfo(baseFileName)

    #print corpus, processedLogFile
    print
    print "file:", filename
    parsinginfos = getLogFileInfo(os.path.join(corpus, processedLogFile))
    if debug: print "Num sentences:", len(parsinginfos)

    with codecs.open(filename, mode='r', encoding='utf8') as infile:
        #lastParagraphSentenceBreak = 0

        lineNumber = 0
        for line in infile:
            if debug:
                print "line:", line
            lineNumber += 1

            #print processedSentenceFile, os.path.isfile(processedSentenceFile)
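            # Align the raw line with the pre-segmented sentences: consume every sentence
            # that matches the start of the line; once the line no longer begins with the
            # expected sentence, the current paragraph is closed off below.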
            if currentSentence is None:
                currentSentence = getNextSentence(corpus, baseFileName, sentenceNum, debug=debug)
            currentSentenceUntested = True
            while currentSentenceUntested and currentSentence is not None:

                if debug:
                    print "sentence:", sentenceNum
                    print currentSentence.tokens
                    print "regex", currentSentence.matchregex

                if re.match(currentSentence.matchregex, line, flags=re.UNICODE):
                    # remove it from the line:
                    line = re.sub(ur'^' + currentSentence.matchregex, u'', line, flags=re.UNICODE)
                    if debug: print "newline:", line
                    sentenceNum += 1
                    # stock the current info and replace it with the next info
                    currentParagraph.sentences.append(currentSentence)
                    currentSentence = getNextSentence(corpus, baseFileName, sentenceNum)
                else:
                    currentSentenceUntested = False
                    if len(currentParagraph.sentences) > 0:
                    #if sentenceNum -1 > lastParagraphSentenceBreak :
                        text.paragraphs.append(currentParagraph)
                        currentParagraph = Paragraph([])
                        #paragraphLengths.append(sentenceNum-1-lastParagraphSentenceBreak)

                        #lastParagraphSentenceBreak = sentenceNum -1
                        #print "new paragraph:", paragraphLengths, lastParagraphSentenceBreak

                print "current para", len(currentParagraph.sentences), len(text.paragraphs)



    # wrap up the last paragraph (sentenceNum is one higher than the last observed sentence,
    # and unlike the in-loop case we do want a paragraph break after the last sentence)
    #if lastParagraphSentenceBreak < sentenceNum-1:
    if len(currentParagraph.sentences) > 0:
        text.paragraphs.append(currentParagraph)
        #paragraphLengths.append(sentenceNum-1-lastParagraphSentenceBreak)
        #print "final paragraph:", paragraphLengths
    paragraphLengths = [len(x.sentences) for x in text.paragraphs]
    print "all paragraphs:        ", paragraphLengths
    print "sum of para lengths:   ", sum(paragraphLengths)
    print "last real sentence:    ", sentenceNum -1
    print "expected no. sentences:", len(parsinginfos)
    if len(parsinginfos) != sum(paragraphLengths):
        print "PROBLEM: sentence count from the log file does not match the sum of paragraph lengths!"
    text.parsedok = 1.0*parsinginfos.count("ok")/len(parsinginfos)
    text.parsedrob = 1.0*parsinginfos.count("robust")/len(parsinginfos)
    text.parsedcorr = 1.0*parsinginfos.count("corrected")/len(parsinginfos)


    #else:
    #    print "Sentence numbers match :)"

    lexiqueDict = {}
    loadLexiqueToDict(u"/home/nparslow/Documents/AutoCorrige/tools/Lexique380/Bases+Scripts/Lexique380.txt",
                      lexiqueDict)
    #print type(lexiqueDict)
    text.addLexiqueInfo( lexiqueDict)

    text.setVocabularyMeasures()
    text.setVerbClauseInfo() # must be run after all the sentences to pass the info up

    return text