Example no. 1
import pandas as pd

# preprocessing, get_top_n_words, persianEncoding and plotBarChart are
# project-local helpers; a hedged sketch of get_top_n_words follows this
# function.


def main():
    # reading data 
    print("reading data")
    data = pd.read_json('../Dataset/100/Data.json', encoding="utf8")
    
    # cleaning and parsing data
    print("cleaning and parsing data \n")
    cleanData = []
    for i in range(len(data)):
        cleanData.append(preprocessing.postToWord(data["text"][i]))
        if i % 1000 == 0:
            print("Post %d of %d...\n" % (i, len(data)))
            # print(cleanData[i])

    # top unigrams before removing stop words
    common_words = get_top_n_words(data['text'], 20, 1)
    df1 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
    df1['ReviewText'] = [persianEncoding(w) for w in df1['ReviewText']]
    plotBarChart(tuple(df1['ReviewText']), list(df1['count']),
                 "top unigrams before removing stop words", "Words", "Counts")

    # top unigrams after removing stop words
    common_words = get_top_n_words(cleanData, 40, 1)
    df2 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
    df2['ReviewText'] = [persianEncoding(w) for w in df2['ReviewText']]
    plotBarChart(tuple(df2['ReviewText']), list(df2['count']),
                 "top unigrams after removing stop words", "Words", "Counts")

    # top bigrams before removing stop words
    common_words = get_top_n_words(data['text'], 20, 2)
    df3 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
    df3['ReviewText'] = [persianEncoding(w) for w in df3['ReviewText']]
    plotBarChart(tuple(df3['ReviewText']), list(df3['count']),
                 "top bigrams before removing stop words", "Words", "Counts")
    
    # top bigrams after removing stop words
    common_words = get_top_n_words(cleanData, 20, 2)
    df4 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
    df4['ReviewText'] = [persianEncoding(w) for w in df4['ReviewText']]
    plotBarChart(tuple(df4['ReviewText']), list(df4['count']),
                 "top bigrams after removing stop words", "Words", "Counts")
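
# The helpers above are project-local and not shown. As a rough guide, a
# minimal get_top_n_words can be sketched with scikit-learn's CountVectorizer.
# The (corpus, n, ngram) signature is inferred from the calls above; the body
# is an assumption, not the project's actual implementation.
from sklearn.feature_extraction.text import CountVectorizer


def get_top_n_words(corpus, n, ngram):
    # Fit a bag-of-ngrams model over the corpus, then count every term.
    vec = CountVectorizer(ngram_range=(ngram, ngram)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    # Total occurrences of each term across all documents (1 x vocab matrix).
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx])
                  for word, idx in vec.vocabulary_.items()]
    # Return the n most frequent (term, count) pairs.
    return sorted(words_freq, key=lambda x: x[1], reverse=True)[:n]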
Example no. 2

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
# from sklearn.cross_validation import cross_val_score, cross_val_predict  # removed in newer scikit-learn
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from time import time

print("Cleaning and parsing telegram post...\n")
cleanData = []
Data = pd.read_json('../DataSet/100/Data.json', encoding="utf8")

for i in range(0, len(Data)):
    cleanData.append(preprocessing.postToWord(Data["text"][i]))
    if (i % 1000 == 0):
        print("Post %d of %d...\n" % (i, len(Data)))

print("**************************")
print("cleanData Ok...")
print("**************************")

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)
# vectorizer = CountVectorizer(analyzer = "word",tokenizer = None,preprocessor = None,stop_words = None,max_features = 5000 , ngram_range=(2,2))
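
# A minimal sketch of how the vectorizer and the classifiers imported above
# might be wired together. This block is an illustration added here, not part
# of the original script; it assumes Data carries a 'class' label column (the
# column name is borrowed from the outlier-removal section further below).
dataFeatures = vectorizer.fit_transform(cleanData).toarray()
labels = Data['class']

clf = DecisionTreeClassifier()  # svm.SVC() would be a drop-in alternative
t0 = time()
scores = cross_val_score(clf, dataFeatures, labels, cv=5)
print("DecisionTree 5-fold accuracy: %0.3f (+/- %0.3f), computed in %.1fs"
      % (scores.mean(), scores.std() * 2, time() - t0))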
Example no. 3
from time import time
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score



print("Cleaning and parsing the training set telegram post...\n")
cleanTrain = []
train = pd.read_json('../DataSet/80_20/LabeledTrainedData.json' , encoding="utf8")

for i in range(0,len(train)):
    cleanTrain.append(preprocessing.postToWord(train["text"][i]))
    if (i % 1000 == 0):
        print("Post %d of %d...\n" % (i, len(train)))
        # print(cleanTrain[i])

print("**************************")
print("cleanTrain Ok...")
print("**************************")

# Initialize the "CountVectorizer" object, which is scikit-learn's
# # bag of words tool.
# vectorizer = CountVectorizer(analyzer = "word",tokenizer = None,preprocessor = None,stop_words = None,max_features = 5000)
# vectorizer = CountVectorizer(analyzer = "word",tokenizer = None,preprocessor = None,stop_words = None,max_features = 5000 , ngram_range=(2,2))
# And now testing TFIDF vectorizer:
vectorizer = TfidfVectorizer(analyzer = "word",tokenizer = None,preprocessor = None,stop_words = None,max_features = 5000)
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)
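
# Hedged sketch, not in the original script: fit MultinomialNB on the TF-IDF
# features and score it with the metrics imported above. The 'class' label
# column is an assumption borrowed from the outlier-removal section below, and
# scoring on the training set itself is only a sanity check.
trainFeatures = vectorizer.fit_transform(cleanTrain)

t0 = time()
nb = MultinomialNB()
nb.fit(trainFeatures, train['class'])
predictions = nb.predict(trainFeatures)
print("fit + predict in %.2fs" % (time() - t0))
print("accuracy  :", accuracy_score(train['class'], predictions))
print("f1 (macro):", f1_score(train['class'], predictions, average='macro'))
print(confusion_matrix(train['class'], predictions))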

import numpy as np
from scipy import stats

# threshold for z-score outlier filtering
threshold = 30

politicalData = Data[Data['class'] != 0]
cleanPolitical = []

# cleaning political data
print("cleaning political data\n")
for index in Data.index[Data['class'] != 0].tolist():
    cleanPolitical.append(
        preprocessing.postToWord(politicalData["text"][index]))
    if index % 1000 == 0:
        print("Post %d ...\n" % index)

# transform political data into feature vectors
politicalDataFeatures = vectorizer.fit_transform(cleanPolitical)
# convert the result to a NumPy array
politicalDataFeatures = politicalDataFeatures.toarray()

print("political data befor removing outlier: ", politicalDataFeatures.shape,
      "\n")

post_df = pd.DataFrame(politicalDataFeatures)
# Per-column z-scores; a row is kept only if every feature's |z| is under the
# threshold.
z = np.abs(stats.zscore(post_df))
political_o = post_df[(z < threshold).all(axis=1)]
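
# Small addition for symmetry with the "before" print above; 'political_o' is
# the filtered feature matrix produced by the z-score step.
print("political data after removing outliers: ", political_o.shape, "\n")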