def __init__(self, variables, resource2filename, sentences, debug=False):
    """Store the constructor arguments and reset every derived property.

    :param variables: sized iterable of indexable items; element ``[0]`` of
        each item is collected into ``self.variabletypes``.
    :param resource2filename: mapping that must provide the key
        ``"filename"``; that value is handed to
        ``getCorpusInfo.getCorpusInfo`` to obtain the student level.
    :param sentences: stored unchanged on the instance.
    :param debug: stored unchanged on the instance.
    """
    # constructor arguments, kept as given
    self.variables = variables
    self.resource2filename = resource2filename
    self.debug = debug
    self.sentences = sentences

    # student level (None if unknown)
    sourceName = resource2filename["filename"]
    self.level = getCorpusInfo.getCorpusInfo(sourceName)

    # derived properties start out empty
    self.nWords = None
    self.paragraphStarts = [0]  # the start position 0 is always there
    self.lemmacats = None
    self.vanalysis = None
    self.vgroups = None
    self.ddagSentences = None

    # prepare any of the required properties for particular variables:
    # the first element of every variable identifies its type
    self.variabletypes = {entry[0] for entry in variables}
    self.__prepareRequiredElements()

    # TODO: build the variable-name -> function table (requiredFuncs)

    # one slot per variable; the position in the list defines the variable
    slotCount = len(variables)
    self.variablevalues = [None] * slotCount
# Lexique table loaded when the caller does not supply its own path.
_DEFAULT_LEXIQUE_PATH = u"/home/nparslow/Documents/AutoCorrige/tools/Lexique380/Bases+Scripts/Lexique380.txt"


def getDocumentProperties(corpus, filename, debug=False, lexiquePath=None):
    """Build a Text for `filename` by matching its lines against parsed sentences.

    Each line of the file is consumed from the left by the ``matchregex`` of
    the successive sentences returned by ``getNextSentence``. Every line that
    yields at least one sentence becomes one Paragraph of the returned Text.

    :param corpus: directory that is joined with the ``.log`` file name and
        passed on to ``getNextSentence``.
    :param filename: path of the utf8 text file to read; its name without
        extension is used for the level, sentence and log lookups.
    :param debug: when true, print a trace of the lines, sentences and
        regexes while matching.
    :param lexiquePath: path handed to ``loadLexiqueToDict``; ``None`` (the
        default) selects the path this function has always used.
    :return: the Text, with ``level``, ``paragraphs``, ``parsedok``,
        ``parsedrob`` and ``parsedcorr`` set, and after ``addLexiqueInfo``,
        ``setVocabularyMeasures`` and ``setVerbClauseInfo`` have been called.

    The print calls use the single-argument parenthesised form on purpose: it
    prints exactly what the Python 2 print statement printed.
    """
    if lexiquePath is None:
        lexiquePath = _DEFAULT_LEXIQUE_PATH

    text = Text([])
    baseFileName = os.path.splitext(filename)[0]
    processedLogFile = os.path.basename(baseFileName) + ".log"

    # get the learner level if it's known:
    text.level = getCorpusInfo.getCorpusInfo(baseFileName)

    print("")
    print("file: %s" % (filename,))
    parsinginfos = getLogFileInfo(os.path.join(corpus, processedLogFile))
    if debug:
        print("Num sentences: %s" % (len(parsinginfos),))

    sentenceNum = 1
    currentSentence = None
    currentParagraph = Paragraph([])
    with codecs.open(filename, mode='r', encoding='utf8') as infile:
        for line in infile:
            if debug:
                print("line: %s" % (line,))
            if currentSentence is None:
                currentSentence = getNextSentence(corpus, baseFileName, sentenceNum, debug=debug)

            # keep consuming sentences for as long as they match the start of
            # what is left of the line
            while currentSentence is not None:
                if debug:
                    print("sentence: %s" % (sentenceNum,))
                    print(currentSentence.tokens)
                    print("regex %s" % (currentSentence.matchregex,))
                match = re.match(currentSentence.matchregex, line, flags=re.UNICODE)
                if match is None:
                    break
                # remove it from the line: slice at the end of the match
                # rather than re-running the pattern through re.sub, so only
                # the matched prefix can ever be dropped
                line = line[match.end():]
                if debug:
                    print("newline: %s" % (line,))
                sentenceNum += 1
                # stock the current info and replace it with the next info
                currentParagraph.sentences.append(currentSentence)
                # NOTE(review): debug is not forwarded here, unlike the first
                # lookup above - confirm whether that is intentional
                currentSentence = getNextSentence(corpus, baseFileName, sentenceNum)

            # a line that produced at least one sentence closes a paragraph
            if currentParagraph.sentences:
                text.paragraphs.append(currentParagraph)
                currentParagraph = Paragraph([])
                print("current para %s %s" % (len(currentParagraph.sentences), len(text.paragraphs)))

    # wrap up the last paragraph if it still holds unsaved sentences
    if currentParagraph.sentences:
        text.paragraphs.append(currentParagraph)

    paragraphLengths = [len(x.sentences) for x in text.paragraphs]
    print("all paragraphs:  %s" % (paragraphLengths,))
    print("sum of para lengths:  %s" % (sum(paragraphLengths),))
    # sentenceNum is one higher than the last sentence actually matched
    print("last real sentence:  %s" % (sentenceNum - 1,))
    print("expected no. sentences: %s" % (len(parsinginfos),))
    if len(parsinginfos) != sum(paragraphLengths):
        print("PROBLEM!!!!!")

    # share of sentences per parse status; an empty log gives 0.0 for each
    # instead of raising ZeroDivisionError
    numInfos = len(parsinginfos)
    if numInfos:
        text.parsedok = 1.0 * parsinginfos.count("ok") / numInfos
        text.parsedrob = 1.0 * parsinginfos.count("robust") / numInfos
        text.parsedcorr = 1.0 * parsinginfos.count("corrected") / numInfos
    else:
        text.parsedok = 0.0
        text.parsedrob = 0.0
        text.parsedcorr = 0.0

    lexiqueDict = {}
    loadLexiqueToDict(lexiquePath, lexiqueDict)
    text.addLexiqueInfo(lexiqueDict)
    text.setVocabularyMeasures()
    text.setVerbClauseInfo()  # must be run after all the sentences to pass the info up
    return text