Ejemplo n.º 1
0
 def _extrParamW2V(self, model, reprList, ques):
     """
     Extract parameter candidates from a question using a word2vec model.

     Every word of the parsed question that is in the model vocabulary is
     compared (via ``model.similarity``) with the representative words of
     ``reprList`` that are also in the vocabulary. The similarities are
     combined according to ``self.W2VCalcMethod`` ('max' or 'avg'), and the
     word is kept when the combined score is strictly above ``self.THR``.

     Args:
         model: word2vec model exposing ``vocab`` and ``similarity``.
         reprList: representative words to compare each question word with.
         ques: the question text.

     Returns:
         List of ``(word, score)`` tuples, in question order. Empty when no
         representative word is known to the model.

     Raises:
         Exception: if ``model`` is None.
         ValueError: if ``self.W2VCalcMethod`` is neither 'max' nor 'avg'.
     """
     if model is None:
         raise Exception("W2VM Not Found")
     calcMethod = self.W2VCalcMethod
     if calcMethod not in ('max', 'avg'):
         # Fail with a clear message instead of an unbound-variable error.
         raise ValueError(
             "Unknown W2VCalcMethod: {!r}".format(calcMethod))

     parsedQues = ut.parseSentence(ques)
     res = []

     # The representatives do not depend on the question word, so normalise
     # and filter them once instead of once per word.
     knownReprs = [
         r for r in (ut.replNum(w) for w in reprList) if r in model.vocab
     ]
     if not knownReprs:
         # Nothing to compare against: max()/avg of an empty list would crash.
         return res

     for word in parsedQues.split(' '):
         replWord = ut.replNum(word)
         if replWord not in model.vocab:
             continue
         scores = [model.similarity(r, replWord) for r in knownReprs]
         if calcMethod == 'max':
             score = max(scores)
         else:
             score = sum(scores) / len(scores)
         # Single-argument print() behaves the same on Python 2 and 3.
         print('{} : {}'.format(word.encode('utf-8'), score))
         if score > self.THR:
             res.append((word, score))
     return res
Ejemplo n.º 2
0
 def _buildIndvW2VM(self, cat, corpus):
     """
     Train the word2vec model of a single category from its corpus.

     Each corpus entry is parsed, number-normalised and split on single
     spaces; the resulting token lists are used to train a model that is
     stored in ``self.indvW2VM[cat]``. ``self.save()`` is called afterwards.
     """
     tokenized = []
     for entry in corpus:
         normalized = ut.replNum(ut.parseSentence(entry))
         tokenized.append(normalized.split(' '))

     w2vOptions = dict(min_count=1, size=100, workers=12)
     self.indvW2VM[cat] = gensim.models.Word2Vec(tokenized, **w2vOptions)
     self.save()
Ejemplo n.º 3
0
 def _buildAllW2VM(self, allCorpus):
     """
     Train the shared word2vec model from the corpora of all categories.

     NOTE: indvW2V builds one model per category, whereas allW2V is shared
     among the categories, so it does not have to be rebuilt often.

     Every entry of every category is parsed, number-normalised and split on
     single spaces. The trained model is stored in ``self.allW2VM`` and
     ``self.save()`` is called afterwards.
     """
     tokenized = [
         ut.replNum(ut.parseSentence(entry)).split(' ')
         for entries in allCorpus.values()
         for entry in entries
     ]
     self.allW2VM = gensim.models.Word2Vec(
         tokenized, min_count=1, size=100, workers=12)
     self.save()
Ejemplo n.º 4
0
    def build(self, allCorpus):
        """
        Build the classifier model from a corpus.

        Every category is truncated to the size of the smallest one so the
        classes are balanced. The questions are shuffled, vectorised with a
        TF-IDF uni/bigram vectorizer, and the ``C`` parameter of a LinearSVC
        is chosen by grid search. A calibrated classifier is then fitted with
        that ``C``. The results are stored in ``self.categories``,
        ``self.vectorizer`` and ``self.clfModel``, and ``self.save()`` is
        called at the end.

        Args:
            allCorpus: mapping of category -> list of questions.

        Raises:
            ValueError: if ``allCorpus`` has no category, or if any category
                has no question.
        """
        # Materialise the keys so that they can be indexed later on,
        # whether keys() returns a list or a view.
        categories = list(allCorpus)
        if not categories:
            raise ValueError('allCorpus must contain at least one category')

        # NOTE: categories hold different amounts of questions, so each one
        # is cut down to the size of the smallest category.
        cntPerCat = min(len(allCorpus[cat]) for cat in categories)
        if cntPerCat == 0:
            raise ValueError(
                'every category needs at least one question to build a model')

        # Pair every question with the index of its category, then shuffle
        # the pairs together to build a better model.
        combined = [(ques, idx)
                    for idx, cat in enumerate(categories)
                    for ques in allCorpus[cat][0:cntPerCat]]
        random.shuffle(combined)
        quesList = [ques for ques, _ in combined]
        Ylist = [idx for _, idx in combined]

        self.categories = categories
        # We use TfidfVectorizer with unigrams and bigrams
        self.vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        Xlist = self.vectorizer.fit_transform(
            [ut.replNum(ut.parseSentence(x)) for x in quesList])
        print('build prepared')

        # Search the best C parameter
        svc_param = {'C': np.logspace(-2, 0, 20)}
        print('build start!')
        gs_svc = GridSearchCV(LinearSVC(), svc_param, cv=5, n_jobs=8)
        gs_svc.fit(Xlist, Ylist)
        print(gs_svc.best_params_)
        print('score : ' + str(gs_svc.best_score_))
        print('make model using C parameter...')
        svm = LinearSVC(C=gs_svc.best_params_['C'])
        # The estimator is passed positionally because its keyword name
        # differs between scikit-learn versions.
        self.clfModel = CalibratedClassifierCV(svm)

        # Build model
        self.clfModel.fit(Xlist, Ylist)
        # Save model
        self.save()
Ejemplo n.º 5
0
    def predict(self, ques):
        """
        Predict the category of a question.

        Args:
            ques: the question; a byte string is decoded as UTF-8 first.

        Returns:
            List of ``(category, probability)`` tuples sorted by descending
            probability.

        Raises:
            Exception: if the vectorizer, categories or classifier model
                have not been set yet.
        """
        # isinstance() also accepts subclasses, and `bytes` exists on both
        # Python 2 (alias of str) and Python 3.
        if isinstance(ques, bytes):
            ques = ques.decode('utf-8')
        if (self.vectorizer is None or self.categories is None
                or self.clfModel is None):
            raise Exception('contextClf Not built yet')

        parsedQues = ut.parseSentence(ques)
        testX = self.vectorizer.transform([ut.replNum(parsedQues)])

        # Predict the probabilities; only one question was passed, so only
        # the first row is used.
        probs = self.clfModel.predict_proba(testX)[0]

        # Pair each category with its probability (column i is taken to
        # belong to self.categories[i]) and sort by descending probability.
        res = list(zip(self.categories, probs))
        return sorted(res, key=operator.itemgetter(1), reverse=True)