Ejemplo n.º 1
0
class Factory(object):
    def __init__(self, config):
        self.config = config
        self.analyzer = Analyzer(self.config)
        self.classify = Classify(config)

    @staticmethod
    def get_all_column_data(file):
        """
        Combine all column data into a single feature matrix
        :param file:
        :return:
        """
        # Get all the feature matrices
        title_matrix, response_vector = f.analyze_column_data(file, 'title')
        abstract_matrix, response_vector = f.analyze_column_data(
            file, 'abstract')
        claims_matrix, response_vector = f.analyze_column_data(file, 'claims')

        # Get them all together
        feature_matrix = hstack([title_matrix, abstract_matrix])
        feature_matrix = hstack([feature_matrix, claims_matrix])
        return feature_matrix, response_vector

    def analyze_column_data(self, filename, column_name):
        """
        Create the feature model and matrix for the abstract column
        :param filename:
        :return:
        """
        self.analyzer.load_patent_data(filename)
        self.analyzer.extract_data(column_name)
        n_grams = 1
        self.analyzer.extract_features(n_grams, column_name)
        return self.analyzer.feature_matrix, self.analyzer.response

    def compute_heuristics(self, filename, column_name):
        """
        Figure out what words make up the groups in the shit
        :param filename:
        :return:
        """
        self.analyze_column_data(filename, column_name)
        self.analyzer.heuristics(column_name)

    def full_train(self):
        """
        GET THE CLASSIFIER TRAINED
        :return:
        """
        # self.classify.feature_selection()
        self.classify.classifier_selection()
        # self.classify.optimize_classifier()
        self.classify.train()
        self.classify.save_classifier()

    def evaluate(self, title, abstract, claims):
        """
        Predict group of a single entry
        :param abstract:
        :return:
        """
        self.analyzer.load_model('title')
        title_vector = self.analyzer.transform([title])
        self.analyzer.load_model('abstract')
        abstract_vector = self.analyzer.transform([abstract])
        self.analyzer.load_model('claims')
        claims_vector = self.analyzer.transform([claims])

        feature_vector = hstack([title_vector, abstract_vector])
        feature_vector = hstack([feature_vector, claims_vector])

        return feature_vector

    def predict(self, feature_vector):
        """
        Predict class based on feature vector input
        :param feature_vector:
        :return:
        """
        group = self.classify.predict(feature_vector)
        return group
Ejemplo n.º 2
0
#-*- coding:utf-8 -*-
from classify import Classify
import numpy as np

import sys
reload(sys)
sys.setdefaultencoding("utf8")

if __name__ == "__main__":

    X_train = np.array([
        u"我想听张学友的歌", u"周杰伦的龙卷风", u"鹿晗有什么歌好听", u"姚明打篮球好厉害", u"张继科会打乒乓球",
        u"詹姆士是体育明星"
    ])
    Y_train = np.array([1, 1, 1, 2, 2, 2])
    Test_data = [u"我想听薛之谦的演员", "邓亚萍是体育明星", "刘翔是体育明星"]
    Model = Classify()
    Model.load_W2V_Model("word2vec.model")
    Model.train(X_train, Y_train)
    Model.predict(Test_data)

    Model.save_NBmodel("NB.model")
    del Model

    NBmodel_test = Classify()
    NBmodel_test.load_NBmodel("NB.model")
    NBmodel_test.predict(Test_data)
    del NBmodel_test