def vectorSpaceInit(self, questions, colName):
    """Rebuild the ``colName`` collection from the given questions.

    ``questions`` is rewound, any existing ``colName`` collection is
    dropped through ``DBStore.dropColl``, and one document is inserted per
    question. Each stored document carries the question's ``qID`` and
    ``question`` text plus the codes found for its ``coarse`` and ``fine``
    labels in the class-level ``coarseClassCode`` / ``fineClassCode``
    lookup tables.

    questions -- iterable with a ``rewind()`` method whose items expose
                 the keys 'qID', 'question', 'coarse' and 'fine'.
    colName   -- name of the collection to drop and refill.
    """
    # The code tables are read from the class, not from the instance.
    owner = self.__class__

    questions.rewind()
    DBStore.dropColl(colName)
    print('Creating vector space')

    for entry in questions:
        document = {
            'qID': entry['qID'],
            'question': entry['question'],
            'CoarseCode': owner.coarseClassCode[entry['coarse']],
            'FineCode': owner.fineClassCode[entry['fine']],
        }
        DBStore.getDB()[colName].insert(document)

    print('Done')
def insertFile2Database(self, colName='', training=True, parse=False):
    """Load a labelled question file into the ``'raw' + colName`` collection.

    Every line of the label file is expected to look like
    ``<coarse>:<fine> <question>``. For each such line the question is
    tokenized, POS-tagged and lemmatized, its text is appended to a raw
    text file, and a document is inserted with the keys ``qID``,
    ``question``, ``coarse``, ``fine`` (stored as ``"<coarse>:<fine>"``),
    ``lemma``, ``tagged``, ``tokenized`` and ``pair`` (token -> lemma).
    ``qID`` is the 1-based position of the question among the stored ones.

    colName  -- dataset name used to build the collection name and the
                file names; when empty, a message is printed and nothing
                else happens (no collection is dropped, no file is
                created).
    training -- True reads ``train_<colName>.label`` under
                ``DBStore.trainingRoot``; False reads
                ``TREC_<colName>.label`` under ``DBStore.testingRoot``.
    parse    -- when True, ``self.parseQuestion`` is called with the raw
                text file and the parsed output file name once the raw
                file has been written and closed.

    Returns None. Lines that do not match the expected format are
    reported and skipped instead of aborting the import.
    """
    # Validate first: dropping the collection and truncating the raw file
    # are destructive and must not happen for an unusable name.
    if colName == '':
        print('Filename is needed')
        return

    # NOTE(review): the path strings mix "\\" and "/" separators; they are
    # kept as-is because other code may rely on these exact file names.
    if training:
        rawFileName = DBStore.trainingRoot + "\\raw_" + colName + ".txt"
        labelFileName = DBStore.trainingRoot + '/train_' + colName + '.label'
    else:
        rawFileName = DBStore.testingRoot + "\\raw_" + colName + ".txt"
        labelFileName = DBStore.testingRoot + '/TREC_' + colName + '.label'
    outputFileName = DBStore.commonRoot + "\\parsed_" + colName + ".txt"

    lemmatizer = WordNetLemmatizer()
    DBStore.dropColl('raw' + colName)
    collection = DBStore.getDB()['raw' + colName]

    print('Beginning insertion of ' + 'raw' + colName)
    start = time.time()

    # Compiled once; neither pattern depends on the line being processed.
    linePattern = re.compile(r"(?P<coarse>\w+):(?P<fine>\w+) (?P<question>.+)")
    dotPattern = re.compile(r'\.')

    questionCount = 0
    # Context managers close both files even if tagging or the insert fails.
    with open(labelFileName, 'r') as labelFile:
        with open(rawFileName, 'w') as rawFile:
            for line in labelFile:
                match = linePattern.match(line)
                if match is None:
                    # Blank or malformed line: there is nothing to store.
                    print('Skipping unparseable line: ' + repr(line))
                    continue

                print(questionCount)
                questionCount += 1

                coarse = match.group('coarse')
                question = match.group('question')
                print(question)

                # Drop pure-punctuation tokens and strip '.' from the rest.
                # Presumably needed because tokens become document keys
                # below -- TODO confirm.
                tokenizeWords = [dotPattern.sub('', word)
                                 for word in word_tokenize(question)
                                 if word not in string.punctuation]
                # A dict keeps a single tag per distinct token.
                taggedWords = dict(pos_tag(tokenizeWords))
                lemmatizedQuestion = [
                    lemmatizer.lemmatize(word, pos=self.replace(taggedWords[word]))
                    for word in tokenizeWords
                ]
                pairQuestion = dict(zip(tokenizeWords, lemmatizedQuestion))

                rawFile.write(question + '\n')
                collection.insert({
                    "qID": questionCount,
                    "question": question,
                    "coarse": coarse,
                    "fine": coarse + ":" + match.group("fine"),
                    "lemma": lemmatizedQuestion,
                    "tagged": taggedWords,
                    "tokenized": tokenizeWords,
                    "pair": pairQuestion,
                })

    # The raw file is closed at this point, so it is safe to hand it over.
    if parse:
        self.parseQuestion(rawFileName, outputFileName)

    total = time.time() - start
    print('End of insertion with total time ' + str(total))