import os

import queryLexique380


def main():
    lexiquefile = u"/home/nparslow/Documents/AutoCorrige/tools/Lexique380/Bases+Scripts/Lexique380.txt"
    lex = {}
    queryLexique380.loadLexiqueToDict(lexiquefile, lex)

    vocabfiledir = "/home/nparslow/Documents/AutoCorrige/vocabulary/vocab_lists/vocabulaire_progressif_du_francais/"
    beginnerfilename = "001_beginnervocablist.txt"
    intermediatefilename = "002_intermediatevocablist.txt"
    advancedfilename = "003_advancedvocablist.txt"

    beginnerWords = getVocabFromFile(os.path.join(vocabfiledir, beginnerfilename))
    print "beginner vocab size", len(beginnerWords)
    checkWordQuality(beginnerWords, lex)

    intermediateWords = getVocabFromFile(os.path.join(vocabfiledir, intermediatefilename))
    print "intermed vocab size", len(intermediateWords)
    checkWordQuality(intermediateWords, lex)

    advancedWords = getVocabFromFile(os.path.join(vocabfiledir, advancedfilename))
    print "advanced vocab size", len(advancedWords)
    checkWordQuality(advancedWords, lex)

    # partition the three (overlapping) lists into disjoint difficulty bands:
    allintersected = set(beginnerWords).intersection(set(intermediateWords)).intersection(set(advancedWords))
    purebeginner = set(beginnerWords).difference(set(intermediateWords)).difference(allintersected)
    highbeginner = set(beginnerWords).intersection(set(intermediateWords)).difference(allintersected)
    pureintermediate = set(intermediateWords).difference(set(beginnerWords)).difference(set(advancedWords))
    highintermediate = set(advancedWords).intersection(set(intermediateWords)).difference(allintersected)
    highadvanced = set(advancedWords).difference(highintermediate).difference(allintersected)

    print "group sizes:"
    print "pure beginner :", len(purebeginner)
    print "high beginner :", len(highbeginner)
    print "all levels :", len(allintersected)
    print "pure intermediate :", len(pureintermediate)
    print "high intermediate :", len(highintermediate)
    print "high advanced :", len(highadvanced)
    # sum over all six disjoint groups:
    print "total :", len(purebeginner) + len(highbeginner) + len(allintersected) \
        + len(pureintermediate) + len(highintermediate) + len(highadvanced)
    # the three input lists overlap, so this raw total is larger than the one above:
    print "original total:", len(beginnerWords) + len(intermediateWords) + len(advancedWords)

    for samp in [purebeginner, highbeginner, allintersected,
                 pureintermediate, highintermediate, highadvanced]:
        print
        print
        print "Next Sample!!!"
        for word in samp:
            print word
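# --- Illustration (not part of the original script): a minimal, self-contained
# sketch of the set algebra used in main() above, on made-up toy word lists.
# Only the set operations mirror the real code; the words are invented.
def _demo_level_partition():
    beginner = set(["chat", "chien", "maison", "livre"])
    intermediate = set(["maison", "livre", "paysage"])
    advanced = set(["livre", "paysage", "ubiquite"])
    alllevels = beginner & intermediate & advanced        # {"livre"}
    purebeginner = beginner - intermediate - alllevels    # {"chat", "chien"}
    highbeginner = (beginner & intermediate) - alllevels  # {"maison"}
    print "all levels   :", sorted(alllevels)
    print "pure beginner:", sorted(purebeginner)
    print "high beginner:", sorted(highbeginner)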
import sys
import getopt

import gensim

import optionsFileReader
import calcPLex
import nGramModel


def main(argv):
    # todo: add a debug variable and an optionsfilename variable
    optionsfilename = "settings/optionsfile.txt"
    try:
        opts, args = getopt.getopt(argv, "hi:", ["in_options_file="])
    except getopt.GetoptError:
        print 'textExtractor.py -i <input options file>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'textExtractor.py -i <input options file>'
            sys.exit()
        elif opt in ("-i", "--in_options_file"):
            optionsfilename = arg

    globalparams, variableparams = optionsFileReader.readOptionsFile(optionsfilename)

    filenames = allCorpusFiles(globalparams["origtextdir"])
    print filenames

    # todo: for the moment we just take the first element for these
    filenameResources = {
        "filename": None,
        "melteddir": globalparams["melteddir"][0],
        "ddageddir": globalparams["ddageddir"][0],
        "frmgeddir": globalparams["frmgeddir"][0],
    }
    print globalparams
    outarffdir = globalparams["outdir"][0]
    corpusName = globalparams["corpusName"][0]
    headerInfo = globalparams["headerInfo"][0]
    lexiquefile = globalparams["lexiqueDict"][0]
    freq2ranksfile = globalparams["freq2ranks"][0]
    gensimModelFile = globalparams["word2vecmodel"][0]

    variableTypes = set([x[0] for x in variableparams])

    # only load the (expensive) resources that the requested variables need:
    lemmacat2freqrank = {}
    if "PLex" in variableTypes or "S" in variableTypes \
            or "altS" in variableTypes or "LFP" in variableTypes:
        print "loading freq info"
        lemmacat2freqrank = calcPLex.loadLemmaCat2freqrank(freq2ranksfile)

    word2vecModel = None
    if "w2vct" in variableTypes:
        print "loading word2vec model"
        word2vecModel = gensim.models.Word2Vec.load(gensimModelFile)

    nGramDict, nmoGramDict, totalcounts = {}, {}, 1000000000000000
    if "bigramLogProb" in variableTypes:
        print "loading bi-gram model"
        ngramModelFile = globalparams["bigrammodel"][0]
        nGramDict, nmoGramDict, totalcounts = nGramModel.getNgramDicts(ngramModelFile)

    lexiqueDict = {}
    if "syllablesPerWord" in variableTypes:
        loadLexiqueToDict(lexiquefile, lexiqueDict)

    # create a list of variables for the arff file:
    variables = [("filename", "string")] + \
                [(variableAndParamsToString(vname, params), "numeric")
                 for vname, params in variableparams] + \
                [("level", "numeric")]  # note that level must go last!
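# --- Assumption sketch: readOptionsFile() is defined elsewhere. Judging from
# the lookups in main() above, it returns (globalparams, variableparams) where
# globalparams maps each option name to a *list* of values (hence the [0]
# indexing) and variableparams is a list of (variableName, parameters) pairs.
# The option names below are copied from main(); the values are made up.
def _fake_readOptionsFile(optionsfilename):
    globalparams = {
        "origtextdir": ["corpus/orig"],
        "melteddir": ["corpus/melt"],
        "ddageddir": ["corpus/ddag"],
        "frmgeddir": ["corpus/frmg"],
        "outdir": ["out"],
        "corpusName": ["demo"],
        "headerInfo": ["a demo corpus"],
        "lexiqueDict": ["Lexique380.txt"],
        "freq2ranks": ["freq2ranks.txt"],
        "word2vecmodel": ["model.w2v"],
    }
    variableparams = [("syllablesPerWord", []), ("PLex", [1000])]
    return globalparams, variableparams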
    for x in variables:
        print x

    # debug overrides: test on a single file instead of the whole corpus
    #filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/C/Catja.txt"]
    #filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/A/Arvid.txt"]
    #filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/A/Amie4.txt"]
    #filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/B/Bror2.txt"]
    #filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/C/Caroline.txt"]
    #filenames = ["/home/nparslow/Documents/AutoCorrige/Corpora/CORPUS_CEFLE/E/Eddy.txt"]  # just to test an E
    filenames = ["/home/pinot/alpage/nparslow/Documents/Corpora/CEFLE/E/Eddy.txt"]  # just to test an E

    resources = {
        "lemmacat2freqrank": lemmacat2freqrank,
        "word2vecModel": word2vecModel,
        "nGramDict": nGramDict,
        "nmoGramDict": nmoGramDict,
        "nGramCounts": totalcounts,
        "lexiqueDict": lexiqueDict,
    }

    outputRows = []
    allobservedtrees = set([])
    treespertext = []
    for filename in filenames:
        # we change the resource filename with each round:
        filenameResources["filename"] = filename
        baseFileName = os.path.basename(filename)
        baseFileName, extension = os.path.splitext(baseFileName)
        filenameResources["ddagfile"] = os.path.join(filenameResources["ddageddir"],
                                                     baseFileName + ".ddag")
        print
        print filename

        text = getDocumentProperties(filenameResources, variableparams, debug=False)
        text.calcVariables(resources)

        for i in range(len(variableparams) + 1):
            varlabel = variables[i]
            # squeeze the filename in first:
            value = filename
            if i > 0:
                value = text.variablevalues[i - 1]
            print varlabel, "\t", str(value)
        print 'level', "\t", text.level

        allobservedtrees.update(text.trees.keys())
        treespertext.append((set(text.trees.keys()), text.level))

        outputRows.append([baseFileName] + text.variablevalues + [text.level])

    savetoArff(outarffdir, corpusName, headerInfo, variables, outputRows)
    savetoArff(outarffdir, corpusName + "class", headerInfo, variables, outputRows,
               levelAsClass=True)

    # one binary column per observed parse tree, plus the level as the last column:
    arfftreefile = "testtrees"
    treevariables = list(allobservedtrees)
    treeoutputRows = [[1 if x in trees else 0 for x in treevariables] + [level]
                      for trees, level in treespertext]
    treevariables.append("level")
    savetoTreeArff(outarffdir, arfftreefile, headerInfo, treevariables, treeoutputRows,
                   levelAsClass=True)
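# --- Assumption sketch: savetoArff() is defined elsewhere; this is a minimal
# stand-in showing the Weka ARFF layout the calls above imply (a comment
# header, one @ATTRIBUTE per variable, then comma-separated @DATA rows).
import codecs
import os

def _demo_savetoArff(outdir, name, headerInfo, variables, rows):
    with codecs.open(os.path.join(outdir, name + ".arff"), mode='w', encoding='utf8') as out:
        out.write(u"% " + headerInfo.replace("\n", "\n% ") + u"\n")
        out.write(u"@RELATION " + name + u"\n\n")
        for varname, vartype in variables:
            out.write(u"@ATTRIBUTE %s %s\n" % (varname, vartype))
        out.write(u"\n@DATA\n")
        for row in rows:
            out.write(u",".join([unicode(x) for x in row]) + u"\n")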
            # ... (fragment: earlier branches of this if/elif chain, inside the
            # per-line loop of analyse(), are not shown) ...
                sentence = []
            elif line[0] != "#":
                tokennum, tokeninfo, nexttokennum = line.split('\t')
                # the field can hold multiple {...} groups before the surface
                # token, so start from the right:
                token = re.search(ur'(?: )([^\}]+)$', tokeninfo, flags=re.UNICODE).groups()[0]
                # some leftover space:
                token = token.strip()
                # in case some empty tokens arrive here:
                if len(token) > 0:
                    if token not in vocab2count:
                        sentence.append(token)
                        missingwords.add(token)


nativecorpus = "/home/nparslow/Documents/AutoCorrige/Corpora/tokenised/CORPUS_LITTAVANCE"
missed = {}
missingwords = set([])
analyse(nativecorpus, vocab2count, missed, missingwords)
print missed

lex = {}
queryLexique380.loadLexiqueToDict(
    u"/home/nparslow/Documents/AutoCorrige/tools/Lexique380/Bases+Scripts/Lexique380.txt", lex)
for mword in missingwords:
    if mword in lex:
        print mword, lex[mword]
    else:
        print "NOT IN LEXIQUE!", mword
print "not in lex:", len([x for x in missingwords if x not in lex]), " of ", len(missingwords)
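# --- Illustration (made-up input): what the token-extraction regex in the
# fragment above does. Judging from the regex, the tokeninfo field can carry
# {...} groups before the surface token, so the search grabs the final
# space-separated chunk that contains no closing brace.
import re
_sample_tokeninfo = u"{au} a {au} le"
print re.search(ur'(?: )([^\}]+)$', _sample_tokeninfo, flags=re.UNICODE).groups()[0]  # -> le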
def getDocumentProperties(corpus, filename, debug=False):
    text = Text([])
    sentenceNum = 1
    currentParagraph = Paragraph([])
    wordSyllableLengths = []
    wordCharacterLengths = []
    baseFileName, extension = os.path.splitext(filename)
    processedLogFile = os.path.basename(baseFileName) + ".log"
    currentSentence = None

    # get the learner level if it's known:
    text.level = getCorpusInfo.getCorpusInfo(baseFileName)

    print
    print "file:", filename
    parsinginfos = getLogFileInfo(os.path.join(corpus, processedLogFile))
    if debug:
        print "Num sentences:", len(parsinginfos)

    with codecs.open(filename, mode='r', encoding='utf8') as infile:
        lineNumber = 0
        for line in infile:
            if debug:
                print "line:", line
            lineNumber += 1
            if currentSentence is None:
                currentSentence = getNextSentence(corpus, baseFileName, sentenceNum, debug=debug)
            currentSentenceUntested = True
            # consume as many parsed sentences as match the start of this line:
            while currentSentenceUntested and currentSentence is not None:
                if debug:
                    print "sentence:", sentenceNum
                    print currentSentence.tokens
                    print "regex", currentSentence.matchregex
                if re.match(currentSentence.matchregex, line, flags=re.UNICODE):
                    # remove it from the line:
                    line = re.sub(ur'^' + currentSentence.matchregex, u'', line, flags=re.UNICODE)
                    if debug:
                        print "newline:", line
                    sentenceNum += 1
                    # stock the current info and replace it with the next info:
                    currentParagraph.sentences.append(currentSentence)
                    currentSentence = getNextSentence(corpus, baseFileName, sentenceNum)
                else:
                    currentSentenceUntested = False
            # a line break ends the current paragraph:
            if len(currentParagraph.sentences) > 0:
                text.paragraphs.append(currentParagraph)
                currentParagraph = Paragraph([])
            print "current para", len(currentParagraph.sentences), len(text.paragraphs)

    # wrap up the last paragraph:
    if len(currentParagraph.sentences) > 0:
        text.paragraphs.append(currentParagraph)

    paragraphLengths = [len(x.sentences) for x in text.paragraphs]
    print "all paragraphs:         ", paragraphLengths
    print "sum of para lengths:    ", sum(paragraphLengths)
    print "last real sentence:     ", sentenceNum - 1
    print "expected no. sentences: ", len(parsinginfos)
    if len(parsinginfos) != sum(paragraphLengths):
        print "PROBLEM!!!!!"
    text.parsedok = 1.0 * parsinginfos.count("ok") / len(parsinginfos)
    text.parsedrob = 1.0 * parsinginfos.count("robust") / len(parsinginfos)
    text.parsedcorr = 1.0 * parsinginfos.count("corrected") / len(parsinginfos)

    lexiqueDict = {}
    loadLexiqueToDict(u"/home/nparslow/Documents/AutoCorrige/tools/Lexique380/Bases+Scripts/Lexique380.txt",
                      lexiqueDict)
    text.addLexiqueInfo(lexiqueDict)
    text.setVocabularyMeasures()
    text.setVerbClauseInfo()  # must be run after all the sentences to pass the info up
    return text
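# --- Illustration (made-up data): the matchregex consumption step above in
# isolation. Sentence objects and getNextSentence() are defined elsewhere;
# a bare regex stands in for one tokenised sentence's matchregex attribute.
import re
_line = u"Bonjour tout le monde. Il fait beau."
_matchregex = ur'Bonjour\s+tout\s+le\s+monde\s*\.'
if re.match(_matchregex, _line, flags=re.UNICODE):
    # the sentence matched at the start of the line, so strip it off and keep
    # scanning the remainder for the next sentence:
    _line = re.sub(ur'^' + _matchregex, u'', _line, flags=re.UNICODE)
print _line.strip()  # -> Il fait beau.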