    def vectorSpaceInit(self, questions, colName):
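        """Rebuild the vector-space collection `colName` from a cursor of
        question documents (as produced by insertFile2Database below); each
        coarse/fine label string is mapped to its numeric class code via the
        class-level coarseClassCode/fineClassCode tables.
        """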
        questions.rewind()
        DBStore.dropColl(colName)
        print 'Creating vector space'
        for question in questions:
            DBStore.getDB()[colName].insert({
                'qID': question['qID'],
                'question': question['question'],
                # map the label strings to the numeric class codes defined on the class
                'CoarseCode': self.__class__.coarseClassCode[question['coarse']],
                'FineCode': self.__class__.fineClassCode[question['fine']]
            })
        print 'Done'
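    # Typical usage (illustrative collection names; the cursor comes from a
    # collection written by insertFile2Database):
    #   questions = DBStore.getDB()['raw5500'].find()
    #   self.vectorSpaceInit(questions, 'train5500')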
    def insertFile2Database(self,colName='',training=True,parse=False):
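        """Parse a TREC-style '<coarse>:<fine> <question>' label file into the
        'raw<colName>' MongoDB collection.

        Each question is tokenized, POS-tagged and WordNet-lemmatized before
        insertion; the raw question text is also written to a .txt file so it
        can optionally be run through parseQuestion afterwards. Assumes
        module-level imports of re, string, time, nltk's word_tokenize and
        pos_tag, WordNetLemmatizer, and the DBStore helper.
        """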
#        posTagger = POSTagger()
#        posTagger.loadTagger()
#        tagger = posTagger.getTagger()
        if colName == '':
            print 'Filename is needed'
            return
        lemmatizer = WordNetLemmatizer()
        DBStore.dropColl('raw' + colName)
        collection = DBStore.getDB()['raw' + colName]
        if training:
            rawFileName = DBStore.trainingRoot + '/raw_' + colName + '.txt'
            labelFileName = DBStore.trainingRoot + '/train_' + colName + '.label'
        else:
            rawFileName = DBStore.testingRoot + '/raw_' + colName + '.txt'
            labelFileName = DBStore.testingRoot + '/TREC_' + colName + '.label'
        outputFileName = DBStore.commonRoot + '/parsed_' + colName + '.txt'
        rawFile = open(rawFileName, 'w')  # raw question text, later fed to parseQuestion
        print 'Beginning insertion of raw' + colName
        start = time.time()
        labelFile = open(labelFileName, 'r')  # renamed to avoid shadowing the built-ins 'file' and 'input'
        pattern = re.compile(r"(?P<coarse>\w+):(?P<fine>\w+) (?P<question>.+)")
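        # Each label line looks like, e.g.:
        #   DESC:manner How did serfdom develop in and then leave Russia ?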
        p = re.compile(r'\.')  # strips '.' inside tokens, e.g. 'U.S.' -> 'US'
        p2 = re.compile(r'(1|2)\d\d\d')  # pattern for grouping years (currently unused)
        i = 0
        for line in labelFile:
            i = i + 1
            match = pattern.match(line)
            if match is None:
                continue  # skip malformed lines instead of crashing on .group()
            print i
            print match.group('question')
            tokenizeWords = word_tokenize(match.group('question'))
            tokenizeWords = [p.sub('', word) for word in tokenizeWords if word not in string.punctuation]
            taggedWords = dict(pos_tag(tokenizeWords))  # note: duplicate tokens keep only their last tag
            # self.replace() is expected to map Penn Treebank tags to WordNet POS tags
            lemmatizedQuestion = [lemmatizer.lemmatize(word, pos=self.replace(taggedWords[word])) for word in tokenizeWords]
            pairQuestion = dict(zip(tokenizeWords, lemmatizedQuestion))  # token -> lemma
            rawFile.write(match.group('question') + '\n')
                collection.insert({"qID":i,
                                   "question": match.group('question'),
                                   "coarse":match.group("coarse"),
                                   "fine":match.group("coarse")+":"+match.group("fine"),
                                   "lemma":lemmatizedQuestion,
                                   "tagged":taggedWords,
                                   "tokenized":tokenizeWords,
                                   "pair":pairQuestion
                                        })
        rawFile.close()
        labelFile.close()
        if parse:
            self.parseQuestion(rawFileName, outputFileName)
        total = time.time() - start
        print 'End of insertion with total time ' + str(total)
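    # Typical usage (illustrative colName values, matching the file naming
    # 'train_<colName>.label' and 'TREC_<colName>.label' above):
    #   self.insertFile2Database('5500', training=True)
    #   self.insertFile2Database('10', training=False)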