Code example #1
import pickle
import re

import config  # project configuration module (provides basePath and microblog)
# Filter is a project-local preprocessing helper; its import path is not shown in the source.


class extractor(object):
    def __init__(self):
        self.path = config.basePath + config.microblog
        self.SentiModel = self.load_obj("_model")
        self.ch2 = self.load_obj("_feature_selector")
        self.vectorizer = self.load_obj("_vectorizer")
        self.filter = Filter()

    def load_obj(self, name):
        with open(self.path + name + '.pkl', 'rb') as f:
            return pickle.load(f)

    def getSentimentScore(self, message):
        # Blend the trained model's prediction with the emoticon scores that
        # Filter annotates in-line as "token#(+x.x)#".
        message = self.filter.process(message)
        emo = re.findall(r"(\w+)#\(([+-]*\d+\.\d+)\)#", message.decode("utf-8"))
        emoScore = 0.0
        for e in emo:
            emoScore += float(e[1])
        # print(emo)
        # remove emo info
        onlyEMO = 0
        message = re.sub(r"\w+#\([+-]*\d+\.\d+\)#", " ",
                         message.decode("utf-8"))
        if len(message.strip()) == 0:
            onlyEMO = 1
        vec = self.vectorizer.transform([message.encode("utf-8")])
        Tvec = self.ch2.transform(vec)
        predScore = self.SentiModel.predict(Tvec)

        # Vary weights if wanted (default - 0.35,0.65)
        if onlyEMO == 1:
            predScore = emoScore
        elif len(emo) >= 1:
            predScore = 0.35 * predScore + 0.65 * emoScore
        return float(predScore)
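A minimal usage sketch for the extractor above, with illustrative numbers only: if the model predicts 0.4 and the emoticons contribute +1.0, the blended score is 0.35 * 0.4 + 0.65 * 1.0 = 0.79. The pickle file names, the config values, and the emoticon annotation added by Filter are assumptions drawn from the code.

# Hypothetical usage; assumes _model.pkl, _vectorizer.pkl and _feature_selector.pkl
# exist under config.basePath + config.microblog.
e = extractor()
print(e.getSentimentScore("markets looking strong today :)"))
# e.g. 0.79 when the model predicts 0.4 and the emoticon score is +1.0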
Code example #2

class extractor(object):
	def __init__(self):
		self.path = config.basePath +config.microblog
		self.SentiModel = self.load_obj("_model")
		self.ch2 = self.load_obj("_feature_selector")
		self.vectorizer = self.load_obj("_vectorizer")
		self.filter = Filter()

	def load_obj(self,name):
		with open(self.path + name + '.pkl', 'rb') as f:
			return pickle.load(f)

	def getSentimentScore(self,message):
		message = self.filter.process(message)
		emo = re.findall(r"(\w+)#\(([+-]*\d+\.\d+)\)#",message.decode("utf-8"))
		emoScore = 0.0
		for e in emo:
			emoScore += float(e[1])
		# print(emo)
		# remove emo info
		onlyEMO = 0
		message = re.sub(r"\w+#\([+-]*\d+\.\d+\)#"," ",message.decode("utf-8"))
		if len(message.strip()) == 0:
			onlyEMO = 1
		vec = self.vectorizer.transform([message.encode("utf-8")])
		Tvec = self.ch2.transform(vec)
		predScore = self.SentiModel.predict(Tvec)

		# Vary weights if wanted (default - 0.35,0.65)
		if onlyEMO == 1:
			predScore = emoScore
		elif len(emo) >= 1:
			predScore = 0.35 * predScore + 0.65 * emoScore
		return float(predScore)
Code example #3
class extractor(object):

	def __init__(self):
		self.path = config.basePath +config.microblog
		self.SentiModel = self.load_obj("_model")
		self.ch2 = self.load_obj("_feature_selector")
		self.vectorizer = self.load_obj("_vectorizer")
		self.filter = Filter()

	def load_obj(self,name):
		with open(self.path + name + '.pkl', 'rb') as f:
			return pickle.load(f)

	def simpleProcess(self,text):
		# Light normalisation: lowercase, strip URLs, drop @/# markers, tokenize.
		# Url_RE and tokenize are project-local helpers.
		text = text.lower().strip()
		line = re.sub(Url_RE,"",text)
		line = re.sub(r"[@#]","",line)
		line = u" ".join(tokenize(line))
		return line

	def getSentimentScore(self,message):

		vec = self.vectorizer.transform([self.simpleProcess(message)])
		Tvec = self.ch2.transform(vec)
		plainScore = self.SentiModel.predict(Tvec)

		# Emoticon-aware scoring, as in the extractor above.
		message = self.filter.process(message)
		emo = re.findall(r"(\w+)#\(([+-]*\d+\.\d+)\)#",message.decode("utf-8"))
		emoScore = 0.0
		for e in emo:
			emoScore += float(e[1])
		# remove emo info
		onlyEMO = 0
		message = re.sub(r"\w+#\([+-]*\d+\.\d+\)#"," ",message.decode("utf-8"))
		if len(message.strip()) == 0:
			onlyEMO = 1
		vec = self.vectorizer.transform([message])
		Tvec = self.ch2.transform(vec)
		predScore = self.SentiModel.predict(Tvec)

		# Vary weights if wanted
		if onlyEMO == 1:
			predScore = emoScore
		elif len(emo) >= 1:
			predScore = 0.20 * predScore + 0.30 * plainScore + 0.50 * emoScore
		else:
			predScore = 0.35 * predScore + 0.65 * plainScore

		return float(predScore)
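This variant also scores a lightly normalized copy of the text (plainScore) and blends three signals when emoticons are present. A worked example of the weighting, with illustrative numbers only:

# Illustrative numbers for the three-way blend above (not real model output).
pred_score, plain_score, emo_score = 0.2, 0.6, 1.0
blended = 0.20 * pred_score + 0.30 * plain_score + 0.50 * emo_score
print(blended)  # 0.72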
Code example #4
(same code as code example #3 above)
Code example #5
import pickle

import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import cross_validation  # old scikit-learn API (pre-0.18)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import LinearSVR
from sklearn.utils.extmath import density
# Filter is a project-local preprocessing helper; its import path is not shown in the source.


class TextLearner(object):
    def __init__(self,data_path,model_path = "./",name = ""):
        self.name = name
        self.data_path = data_path
        self.model_path = model_path
        self.DesignMatrix = []
        self.TestMatrix = []
        self.X_train = []
        self.y_train = [] # not only train but general purpose too
        self.X_test = []
        self.y_test  = []
        self.y_pred = []
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None
        self.F = Filter()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.DesignMatrix = []
        self.TestMatrix = []
        self.X_train = []
        self.y_train = []
        self.X_test = []
        self.y_test  = []
        self.y_pred = []
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None
        self.F = None

    def addModelDetails(self,model_p,name = ""):
        self.name = name
        self.model_path = model_p


    def load_data(self,TrTe = 0):               #TrTe => 0-Train  1-Test # returns the dimensions of vectors
        with open( self.data_path, 'rb') as f:
            if TrTe == 0:
                self.DesignMatrix = pickle.load(f)
                return len(self.DesignMatrix[1])
            if TrTe == 1:
                self.TestMatrix = pickle.load(f)
                return len(self.TestMatrix[1])

    def clearOld(self):
        self.X_train = []
        self.y_train = []
        self.X_test = []
        self.y_test  = []
        self.y_pred = []
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None


    def process(self,text,default = 0):
        if default == 0:
            text = text.strip().lower().encode("utf-8")
        else:
            text = self.F.process(text)
        return text


    def loadXY(self,TrTe = 0,feature_index = 0,label_index = 1):     #TrTe => 0-Train  1-Test
        if TrTe == 0:
            for i in self.DesignMatrix:
                self.X_train.append(self.process(i[feature_index]))
                self.y_train.append(i[label_index])
            self.X_train = np.array(self.X_train)
            self.y_train = np.array(self.y_train)

        elif TrTe == 1:
            for i in self.TestMatrix:
                self.X_test.append(self.process(i[feature_index]))
                self.y_test.append(i[label_index])
            self.X_test = np.array(self.X_test)
            self.y_test = np.array(self.y_test)


    def featurizeXY(self,only_train = 1):      # Extracts Features
        sw = ['a', 'across', 'am', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 'been', 'being', 'but', 'by', 'can', 'could', 'did', 'do', 'does', 'each', 'for', 'from', 'had', 'has', 'have', 'in', 'into', 'is', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'of', 'on', 'or', 'that', "that's", 'thats', 'the', 'there', "there's", 'theres', 'these', 'this', 'those', 'to', 'under', 'until', 'up', 'were', 'will', 'with', 'would']
        self.vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words=sw)
        self.X_train = self.vectorizer.fit_transform(self.X_train)
        self.feature_names = self.vectorizer.get_feature_names()
        if only_train == 0:
            self.X_test = self.vectorizer.transform(self.X_test)


    def reduceDimension(self,only_train = 1, percent = 50):      # Reduce dimensions / select the k best features
        n_samples, n_features = self.X_train.shape
        k = int(n_features * (percent / 100.0))  # float division so percent < 100 does not truncate k to 0

        self.chi2 = SelectKBest(chi2, k=k)
        self.X_train = self.chi2.fit_transform(self.X_train, self.y_train)
        self.feature_names = [self.feature_names[i] for i in self.chi2.get_support(indices=True)]
        self.feature_names = np.asarray(self.feature_names)
        if only_train == 0:
            self.X_test = self.chi2.transform(self.X_test)


    def trainModel(self,Model = "default"):
        if Model == "default":
            self.mlModel = LinearSVR(loss='squared_epsilon_insensitive',dual=False, tol=1e-3)
        else:
            self.mlModel = Model
        self.mlModel.fit(self.X_train, self.y_train)


    def testModel(self,approx = 1):        # returns score ONLY
        self.y_pred = np.array(self.mlModel.predict(self.X_test))

        if approx == 1:
            ### To convert real valued results to binary for scoring
            temp = []
            for y in self.y_pred:
                if y > 0.0:
                    temp.append(1.0)
                else:
                    temp.append(-1.0)
            self.y_pred = temp

        return metrics.accuracy_score(self.y_test, self.y_pred)


    def getReport(self,save = 1, get_top_words = 0):       # returns report
        report = ""
        if get_top_words == 1:
            if hasattr(self.mlModel, 'coef_'):
                    report += "Dimensionality: " + str(self.mlModel.coef_.shape[1])
                    report += "\nDensity: " +  str(density(self.mlModel.coef_))

                    rank = np.argsort(self.mlModel.coef_[0])
                    top10 = rank[-20:]
                    bottom10 = rank[:20]
                    report += "\n\nTop 10 keywords: "
                    report += "\nPositive: " + (" ".join(self.feature_names[top10]))
                    report += "\nNegative: " + (" ".join(self.feature_names[bottom10]))

        score = metrics.accuracy_score(self.y_test, self.y_pred)
        report += "\n\nAccuracy: " + str(score)
        report += "\nClassification report: "
        report += "\n\n" + str(metrics.classification_report(self.y_test, self.y_pred,target_names=["Negative","Positive"]))
        report += "\nConfusion matrix: "
        report += "\n\n" + str(metrics.confusion_matrix(self.y_test, self.y_pred)) + "\n\n"

        if save == 1:
            with open(self.model_path + "report.txt", "w") as text_file:
                text_file.write(report)

        return report


    def crossVal(self,folds = 5, dim_red = 50,full_iter = 0, save = 1):        # returns report # Caution: resets train and test X,y
        skf = cross_validation.StratifiedKFold(self.y_train, n_folds = folds,shuffle=True)
        print(skf)
        master_report = ""

        X_copy = self.X_train
        y_copy = self.y_train

        for train_index, test_index in skf:
            self.X_train, self.X_test = X_copy[train_index], X_copy[test_index]
            self.y_train, self.y_test = y_copy[train_index], y_copy[test_index]
            self.featurizeXY(0)
            self.reduceDimension(0,dim_red)
            self.trainModel()
            self.testModel()
            master_report += self.getReport(save = 0,get_top_words = 0)
            if full_iter == 1:
                continue
            else:
                break

        if save == 1:
            with open(self.model_path + "master_report.txt", "w") as text_file:
                text_file.write(master_report)

        return master_report


    def save_obj(self,obj, name ):
        with open(self.model_path + name + '.pkl', 'wb') as f:
            pickle.dump(obj, f,  protocol=2)


    def saveModel(self):        # saves in model path
        self.save_obj(self.mlModel, self.name + "_model")
        self.save_obj(self.vectorizer, self.name + "_vectorizer")
        self.save_obj(self.chi2, self.name + "_feature_selector")


    def plot(self):
        '''
        Beta: plot the decision boundary of the trained model
        (not working; X_train is a sparse, high-dimensional TF-IDF matrix).
        '''

        h = .02  # step size in the mesh
        # create a mesh to plot in
        x_min, x_max = self.X_train[:, 0].min() - 1, self.X_train[:, 0].max() + 1
        y_min, y_max = self.X_train[:, 1].min() - 1, self.X_train[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),np.arange(y_min, y_max, h))

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        Z = self.mlModel.predict(np.c_[xx.ravel(), yy.ravel()])

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        plt.contour(xx, yy, Z, cmap=plt.cm.Paired)

        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.xticks(())
        plt.yticks(())
        plt.title(self.name)
        plt.savefig(self.model_path + 'plot.png')
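A minimal end-to-end sketch of driving the TextLearner class above. The pickle path, the (text, label) record layout with labels of +1.0/-1.0, and the model name are assumptions; the call sequence follows the methods defined in the class.

# Hypothetical driver; paths, record layout and labels (+1.0 / -1.0) are assumed.
with TextLearner("./data/train.pkl", model_path="./models/", name="micro") as tl:
    tl.load_data(TrTe=0)                      # load the pickled design matrix
    tl.loadXY(TrTe=0, feature_index=0, label_index=1)
    report = tl.crossVal(folds=5, dim_red=50, full_iter=1)  # featurizes and evaluates per fold
    print(report)
    tl.saveModel()  # writes micro_model / micro_vectorizer / micro_feature_selector .pkl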
Code example #6
class TextLearner(object):
    def __init__(self, data_path, model_path="./", name=""):
        self.name = name
        self.data_path = data_path
        self.model_path = model_path
        self.DesignMatrix = []
        self.TestMatrix = []
        self.X_train = []
        self.y_train = []  # not only train but general purpose too
        self.X_test = []
        self.y_test = []
        self.y_pred = []
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None
        self.F = Filter()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.DesignMatrix = []
        self.TestMatrix = []
        self.X_train = []
        self.y_train = []
        self.X_test = []
        self.y_test = []
        self.y_pred = []
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None
        self.F = None

    def addModelDetails(self, model_p, name=""):
        self.name = name
        self.model_path = model_p

    def load_data(
            self,
            TrTe=0
    ):  #TrTe => 0-Train  1-Test # returns the dimensions of vectors
        with open(self.data_path, 'rb') as f:
            if TrTe == 0:
                self.DesignMatrix = pickle.load(f)
                return len(self.DesignMatrix[1])
            if TrTe == 1:
                self.TestMatrix = pickle.load(f)
                return len(self.TestMatrix[1])

    def clearOld(self):
        self.X_train = []
        self.y_train = []
        self.X_test = []
        self.y_test = []
        self.y_pred = []
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None

    def process(self, text, default=0):
        if default == 0:
            text = text.strip().lower().encode("utf-8")
        else:
            text = self.F.process(text)
        return text

    def loadXY(self,
               TrTe=0,
               feature_index=0,
               label_index=1):  #TrTe => 0-Train  1-Test
        if TrTe == 0:
            for i in self.DesignMatrix:
                self.X_train.append(self.process(i[feature_index]))
                self.y_train.append(i[label_index])
            self.X_train = np.array(self.X_train)
            self.y_train = np.array(self.y_train)

        elif TrTe == 1:
            for i in self.TestMatrix:
                self.X_test.append(self.process(i[feature_index]))
                self.y_test.append(i[label_index])
            self.X_test = np.array(self.X_test)
            self.y_test = np.array(self.y_test)

    def featurizeXY(self, only_train=1):  # Extracts Features
        sw = [
            'a', 'across', 'am', 'an', 'and', 'any', 'are', 'as', 'at', 'be',
            'been', 'being', 'but', 'by', 'can', 'could', 'did', 'do', 'does',
            'each', 'for', 'from', 'had', 'has', 'have', 'in', 'into', 'is',
            "isn't", 'it', "it'd", "it'll", "it's", 'its', 'of', 'on', 'or',
            'that', "that's", 'thats', 'the', 'there', "there's", 'theres',
            'these', 'this', 'those', 'to', 'under', 'until', 'up', 'were',
            'will', 'with', 'would'
        ]
        self.vectorizer = TfidfVectorizer(sublinear_tf=True,
                                          max_df=0.5,
                                          stop_words=sw)
        self.X_train = self.vectorizer.fit_transform(self.X_train)
        self.feature_names = self.vectorizer.get_feature_names()
        if only_train == 0:
            self.X_test = self.vectorizer.transform(self.X_test)

    def reduceDimension(
            self,
            only_train=1,
            percent=50):  # Reduce dimensions / select the k best features
        n_samples, n_features = self.X_train.shape
        k = int(n_features * (percent / 100.0))  # float division so percent < 100 does not truncate k to 0

        self.chi2 = SelectKBest(chi2, k=k)
        self.X_train = self.chi2.fit_transform(self.X_train, self.y_train)
        self.feature_names = [
            self.feature_names[i] for i in self.chi2.get_support(indices=True)
        ]
        self.feature_names = np.asarray(self.feature_names)
        if only_train == 0:
            self.X_test = self.chi2.transform(self.X_test)

    def trainModel(self, Model="default"):
        if Model == "default":
            self.mlModel = LinearSVR(loss='squared_epsilon_insensitive',
                                     dual=False,
                                     tol=1e-3)
        else:
            self.mlModel = Model
        self.mlModel.fit(self.X_train, self.y_train)

    def testModel(self, approx=1):  # returns score ONLY
        self.y_pred = np.array(self.mlModel.predict(self.X_test))

        if approx == 1:
            ### To convert real valued results to binary for scoring
            temp = []
            for y in self.y_pred:
                if y > 0.0:
                    temp.append(1.0)
                else:
                    temp.append(-1.0)
            self.y_pred = temp

        return metrics.accuracy_score(self.y_test, self.y_pred)

    def getReport(self, save=1, get_top_words=0):  # returns report
        report = ""
        if get_top_words == 1:
            if hasattr(self.mlModel, 'coef_'):
                report += "Dimensionality: " + str(self.mlModel.coef_.shape[1])
                report += "\nDensity: " + str(density(self.mlModel.coef_))

                rank = np.argsort(self.mlModel.coef_[0])
                top20 = rank[-20:]
                bottom20 = rank[:20]
                report += "\n\nTop 20 keywords: "
                report += "\nPositive: " + (" ".join(
                    self.feature_names[top20]))
                report += "\nNegative: " + (" ".join(
                    self.feature_names[bottom20]))

        score = metrics.accuracy_score(self.y_test, self.y_pred)
        report += "\n\nAccuracy: " + str(score)
        report += "\nClassification report: "
        report += "\n\n" + str(
            metrics.classification_report(
                self.y_test,
                self.y_pred,
                target_names=["Negative", "Positive"]))
        report += "\nConfusion matrix: "
        report += "\n\n" + str(
            metrics.confusion_matrix(self.y_test, self.y_pred)) + "\n\n"

        if save == 1:
            with open(self.model_path + "report.txt", "w") as text_file:
                text_file.write(report)

        return report

    def crossVal(
            self,
            folds=5,
            dim_red=50,
            full_iter=0,
            save=1):  # returns report # Caution: resets train and test X,y
        skf = cross_validation.StratifiedKFold(self.y_train,
                                               n_folds=folds,
                                               shuffle=True)
        print(skf)
        master_report = ""

        X_copy = self.X_train
        y_copy = self.y_train

        for train_index, test_index in skf:
            self.X_train, self.X_test = X_copy[train_index], X_copy[test_index]
            self.y_train, self.y_test = y_copy[train_index], y_copy[test_index]
            self.featurizeXY(0)
            self.reduceDimension(0, dim_red)
            self.trainModel()
            self.testModel()
            master_report += self.getReport(save=0, get_top_words=0)
            if full_iter == 1:
                continue
            else:
                break

        if save == 1:
            with open(self.model_path + "master_report.txt", "w") as text_file:
                text_file.write(master_report)

        return master_report

    def save_obj(self, obj, name):
        with open(self.model_path + name + '.pkl', 'wb') as f:
            pickle.dump(obj, f, protocol=2)

    def saveModel(self):  # saves in model path
        self.save_obj(self.mlModel, self.name + "_model")
        self.save_obj(self.vectorizer, self.name + "_vectorizer")
        self.save_obj(self.chi2, self.name + "_feature_selector")

    def plot(self):
        '''
        Beta: plot the decision boundary of the trained model
        (not working; X_train is a sparse, high-dimensional TF-IDF matrix).
        '''

        h = .02  # step size in the mesh
        # create a mesh to plot in
        x_min, x_max = self.X_train[:, 0].min() - 1, self.X_train[:,
                                                                  0].max() + 1
        y_min, y_max = self.X_train[:, 1].min() - 1, self.X_train[:,
                                                                  1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        Z = self.mlModel.predict(np.c_[xx.ravel(), yy.ravel()])

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        plt.contour(xx, yy, Z, cmap=plt.cm.Paired)

        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.xticks(())
        plt.yticks(())
        plt.title(self.name)
        plt.savefig(self.model_path + 'plot.png')