import pandas as pd

import preprocessing  # project module providing postToWord()


def main():
    # read data
    print("reading data")
    data = pd.read_json('../Dataset/100/Data.json', encoding="utf8")

    # clean and parse data
    print("cleaning and parsing data\n")
    cleanData = []
    for i in range(0, len(data)):
        cleanData.append(preprocessing.postToWord(data["text"][i]))
        if i % 1000 == 0:
            print("Post %d of %d...\n" % (i, len(data)))

    # top unigrams before removing stop words
    common_words = get_top_n_words(data['text'], 20, 1)
    df1 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
    df1['ReviewText'] = df1['ReviewText'].apply(persianEncoding)
    plotBarChart(tuple(df1['ReviewText']), list(df1['count']),
                 "top unigrams before removing stop words", "Words", "Counts")

    # top unigrams after removing stop words
    common_words = get_top_n_words(cleanData, 40, 1)
    df2 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
    df2['ReviewText'] = df2['ReviewText'].apply(persianEncoding)
    plotBarChart(tuple(df2['ReviewText']), list(df2['count']),
                 "top unigrams after removing stop words", "Words", "Counts")

    # top bigrams before removing stop words (uses the raw text, not cleanData)
    common_words = get_top_n_words(data['text'], 20, 2)
    df3 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
    df3['ReviewText'] = df3['ReviewText'].apply(persianEncoding)
    plotBarChart(tuple(df3['ReviewText']), list(df3['count']),
                 "top bigrams before removing stop words", "Words", "Counts")

    # top bigrams after removing stop words
    common_words = get_top_n_words(cleanData, 20, 2)
    df4 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
    df4['ReviewText'] = df4['ReviewText'].apply(persianEncoding)
    plotBarChart(tuple(df4['ReviewText']), list(df4['count']),
                 "top bigrams after removing stop words", "Words", "Counts")
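# main() relies on project helpers that are not shown here: get_top_n_words(),
# persianEncoding(), and plotBarChart(). Below is a minimal sketch of
# get_top_n_words(), assuming the usual CountVectorizer-based n-gram counter;
# the project's actual implementation may differ.
from sklearn.feature_extraction.text import CountVectorizer


def get_top_n_words(corpus, n, ngram):
    """Return the n most frequent ngram-grams in corpus as (term, count) pairs."""
    vec = CountVectorizer(ngram_range=(ngram, ngram)).fit(corpus)
    bag_of_words = vec.transform(corpus)      # document-term matrix
    sum_words = bag_of_words.sum(axis=0)      # total count per term
    words_freq = [(term, sum_words[0, idx]) for term, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda x: x[1], reverse=True)
    return words_freq[:n]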
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
# sklearn.cross_validation was removed; use sklearn.model_selection instead:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from time import time

print("Cleaning and parsing telegram posts...\n")
cleanData = []
Data = pd.read_json('../DataSet/100/Data.json', encoding="utf8")
for i in range(0, len(Data)):
    cleanData.append(preprocessing.postToWord(Data["text"][i]))
    if i % 1000 == 0:
        print("Post %d of %d...\n" % (i, len(Data)))
print("**************************")
print("cleanData Ok...")
print("**************************")

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag-of-words tool.
vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
                             stop_words=None, max_features=5000)
# Bigram variant:
# vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
#                              stop_words=None, max_features=5000, ngram_range=(2, 2))
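# A minimal sketch of how the imports above are presumably used next:
# vectorize the cleaned posts and cross-validate the two classifiers.
# Using Data['class'] as the label is an assumption carried over from the
# outlier-removal block further down.
dataFeatures = vectorizer.fit_transform(cleanData).toarray()
labels = Data['class']

for name, clf in [("DecisionTree", DecisionTreeClassifier()),
                  ("SVM", svm.SVC())]:
    t0 = time()
    scores = cross_val_score(clf, dataFeatures, labels, cv=5, scoring='accuracy')
    print("%s: mean accuracy %.3f (+/- %.3f), %.1fs"
          % (name, scores.mean(), scores.std(), time() - t0))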
from time import time
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

print("Cleaning and parsing the training-set telegram posts...\n")
cleanTrain = []
train = pd.read_json('../DataSet/80_20/LabeledTrainedData.json', encoding="utf8")
for i in range(0, len(train)):
    cleanTrain.append(preprocessing.postToWord(train["text"][i]))
    if i % 1000 == 0:
        print("Post %d of %d...\n" % (i, len(train)))
print("**************************")
print("cleanTrain Ok...")
print("**************************")

# Earlier experiments used CountVectorizer, scikit-learn's bag-of-words tool:
# vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
#                              stop_words=None, max_features=5000)
# vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
#                              stop_words=None, max_features=5000, ngram_range=(2, 2))
# And now testing the TF-IDF vectorizer:
vectorizer = TfidfVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
                             stop_words=None, max_features=5000)
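# A hedged sketch of the likely next step: fit MultinomialNB on the TF-IDF
# features and score it with the imported metrics. The test-set path
# 'LabeledTestData.json' and the 'class' label column are assumptions, not
# confirmed by this file.
trainFeatures = vectorizer.fit_transform(cleanTrain).toarray()

t0 = time()
nb = MultinomialNB()
nb.fit(trainFeatures, train['class'])  # 'class' column assumed
print("training took %.1fs" % (time() - t0))

test = pd.read_json('../DataSet/80_20/LabeledTestData.json', encoding="utf8")  # assumed path
cleanTest = [preprocessing.postToWord(t) for t in test["text"]]
testFeatures = vectorizer.transform(cleanTest).toarray()  # transform only; no refit

pred = nb.predict(testFeatures)
print("accuracy:", accuracy_score(test['class'], pred))
print("F1 (macro):", f1_score(test['class'], pred, average='macro'))
print("confusion matrix:\n", confusion_matrix(test['class'], pred))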
import numpy as np
from scipy import stats

vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
                             stop_words=None, max_features=5000)

# threshold for the z-score outlier filter
threshold = 30

politicalData = Data[Data['class'] != 0]
cleanPolitical = []

# clean the political posts
print("cleaning political data\n")
for index in Data.index[Data['class'] != 0].tolist():
    cleanPolitical.append(preprocessing.postToWord(politicalData["text"][index]))
    if index % 1000 == 0:
        print("Post %d ...\n" % index)

# transform the political posts into feature vectors
politicalDataFeatures = vectorizer.fit_transform(cleanPolitical)
# convert the result to a NumPy array
politicalDataFeatures = politicalDataFeatures.toarray()
print("political data before removing outliers: ", politicalDataFeatures.shape, "\n")

# drop any post whose term counts lie more than `threshold` standard
# deviations from the column mean
post_df = pd.DataFrame(politicalDataFeatures)
z = np.abs(stats.zscore(post_df))
political_o = post_df[(z < threshold).all(axis=1)]
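# The natural follow-up (an assumption; the source cuts off here): report how
# many posts survive the z-score filter, mirroring the "before" print above.
print("political data after removing outliers: ", political_o.shape, "\n")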