def predict_category(X, y, filename):
    # NOTE: `le` (the fitted LabelEncoder) and `naive_bayes_a` (the Naive
    # Bayes smoothing alpha) are assumed to be set at module level.
    df_test = dcsv.import_from_csv(filename)
    X_test_id = df_test[['Id', 'Title', 'Content']]
    # Merge title and content into a single text field
    f = lambda x: x['Title'] + ' ' + x['Content']
    X_test = X_test_id.apply(f, 1)
    X_train = X
    Y_train = y

    vectorizer = CountVectorizer(stop_words='english')
    transformer = TfidfTransformer()
    clf = MultinomialNB(alpha=naive_bayes_a)
    pipeline = Pipeline([('vect', vectorizer),
                         ('tfidf', transformer),
                         ('clf', clf)])

    # Simple pipeline fit on the train set
    pipeline.fit(X_train, Y_train)

    # Predict the test set
    predicted = pipeline.predict(X_test)

    # Collect the ids from the test set and the labels from the prediction
    ID = []
    category = []
    for row in X_test_id.iterrows():
        index, data = row
        ID.append(data['Id'])
    id_dic = {'ID': ID}
    for pred in predicted:
        category.append(le.inverse_transform(pred))
    category_dic = {'Predicted Category': category}

    # Finally merge them into one dictionary for export
    out_dic = {}
    out_dic.update(id_dic)
    out_dic.update(category_dic)

    # Write the result to the csv
    print("Exporting predicted category to csv")
    dcsv.export_to_csv_categories("./data/testSet_categories.csv", out_dic)
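# Example usage (a minimal sketch, not part of the module): it assumes a
# train CSV with "Title", "Content" and "Category" columns, and that the
# module-level `le` and `naive_bayes_a` used above are set first. The file
# paths and the alpha value are placeholders.
#
#   df = dcsv.import_from_csv("./data/train_set.csv")
#   X = df[['Title', 'Content']].apply(lambda x: x['Title'] + ' ' + x['Content'], 1)
#   le = preprocessing.LabelEncoder()
#   y = le.fit_transform(df["Category"])
#   naive_bayes_a = 0.05
#   predict_category(X, y, "./data/test_set.csv")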
user_input = int(raw_input("Enter the number again: "))
print
if user_input == 0:
    print("Program exits...")
    sys.exit()
elif user_input == 1:
    print("LDA features selected...")
elif user_input == 2:
    print("LDA features + ex1 features selected...")
else:
    print("Category prediction selected...")
print
print("#" * 60)

df = dcsv.import_from_csv(sys.argv[1])
print("Preprocessing starting...\n")

# Merge the content with the title, so the classifier can also make
# use of the information in the title
X = df[['Title', 'Content']]
f = lambda x: x['Title'] + ' ' + x['Content']
X = X.apply(f, 1)

le = preprocessing.LabelEncoder()
le.fit(df["Category"])
y = le.transform(df["Category"])

# If the user asked for a prediction on the test set
if user_input == 3:
    dp.predict_category(X, y, 1000, le, sys.argv[2])

print("*" * 60)
print
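# Example invocation (a sketch; the script and file names are placeholders
# for whatever this module is actually called and wherever the data lives):
#   $ python main.py ./data/train_set.csv ./data/test_set.csv
# sys.argv[1] is the train CSV; sys.argv[2] is the test CSV handed to
# dp.predict_category when option 3 is chosen at the prompt.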
        # update the progress bar
        sys.stdout.write("#")
        sys.stdout.flush()

        total_sum = 0
        cluster_index = "Cluster" + str(cluster + 1)
        formated_results[cluster_index] = {}
        # Count how many members of this cluster fall into each category
        for vector in vectors:
            total_sum += 1
            category = category_list[np.where(X_train == vector)[0][0]]
            try:
                formated_results[cluster_index][category] += 1
            except KeyError:
                formated_results[cluster_index][category] = 1
        # Turn the counts into fractions of the cluster size
        for category in formated_results[cluster_index]:
            formated_results[cluster_index][category] = round(
                formated_results[cluster_index][category] / float(total_sum), 2)
    print
    print("Generating results finished.")
    return formated_results


# The main of the program starts here
if __name__ == "__main__":
    print(" Clustering with K-Means Program starts...")
    print('=' * 60)
    dataset = dcsv.import_from_csv(sys.argv[1])
    X_train = init_vector(dataset)
    centers, clusters = find_centers(X_train, 5)  # In this example K=5
    print('-' * 60)
    formated_results = generate_formated_results(dataset, X_train, clusters)
    dcsv.export_to_csv_cluster('./data/clustering_KMeans.csv', formated_results)
    print('=' * 60)
    print(" Clustering with K-Means Program ends...")
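# For illustration, the exported formated_results maps every cluster to the
# fraction of its members per category, e.g. (category names and values here
# are made up):
#   {'Cluster1': {'Politics': 0.67, 'Film': 0.33},
#    'Cluster2': {'Football': 1.00},
#    ...}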
def predict_category(X, y, k, le, filename):
    print("Predict the category with SGD Classifier and K = %d ..." % k)
    df_test = dcsv.import_from_csv(filename)
    X_test_id = df_test[['Id', 'Title', 'Content']]
    # Merge title and content into a single text field
    f = lambda x: x['Title'] + ' ' + x['Content']
    X_test = X_test_id.apply(f, 1)
    X_train = X
    Y_train = y

    vectorizer = CountVectorizer(stop_words='english', tokenizer=dff.text_preprocessor)
    transformer = TfidfTransformer()
    clf = SGDClassifier(loss='modified_huber', alpha=0.0001)

    ###################### Preprocess the train set first ##################
    print("LDA features for the Train set")
    print
    # Convert the docs to a list whose elements are lists of tokens
    corpus_train = dff.corpus_tokenizer(X_train)
    # Create the Gen-Sim dictionary (similar to an SKLearn vectorizer)
    dictionary_train = corpora.Dictionary(corpus_train)
    # Create the Gen-Sim bag-of-words corpus using the dictionary
    corpus_train = [dictionary_train.doc2bow(text) for text in corpus_train]
    x_train_lda = dff.LDA_processing(corpus_train, dictionary_train, k)

    print("Transforms in Train set")
    print
    x_train_vect = vectorizer.fit_transform(X_train)
    x_train_tfidf = transformer.fit_transform(x_train_vect)
    # Merge the tf-idf and the LDA features together
    x_train_merged = sparse.hstack((x_train_tfidf, x_train_lda), format='csr')

    # NOTE: `k_fold` is assumed to be a module-level CV splitter
    grid_search = GridSearchCV(clf, {}, cv=k_fold, n_jobs=-1)
    # Simple fit
    grid_search.fit(x_train_merged, Y_train)

    ####################### TEST PREDICTION ###########################
    # The test set also has to be converted, in order to match the train
    # set, so the same procedure is followed
    print("LDA features for the Test set")
    print
    corpus_test = dff.corpus_tokenizer(X_test)
    dictionary_test = corpora.Dictionary(corpus_test)
    corpus_test = [dictionary_test.doc2bow(text) for text in corpus_test]
    x_test_lda = dff.LDA_processing(corpus_test, dictionary_test, k)

    print("Transforms in Test set")
    print
    x_test_vect = vectorizer.transform(X_test)
    x_test_tfidf = transformer.transform(x_test_vect)
    x_test_merged = sparse.hstack((x_test_tfidf, x_test_lda), format='csr')

    print("Starting the prediction")
    print
    # Predict the categories
    predicted = grid_search.predict(x_test_merged)

    # Collect the ids from the test set and the labels from the prediction
    ID = []
    category = []
    for row in X_test_id.iterrows():
        index, data = row
        ID.append(data['Id'])
    id_dic = {'ID': ID}
    for pred in predicted:
        category.append(le.inverse_transform(pred))
    category_dic = {'Predicted Category': category}

    # Finally merge them into one dictionary for export
    out_dic = {}
    out_dic.update(id_dic)
    out_dic.update(category_dic)

    # Write the result to the csv
    print("Exporting predicted category to csv")
    dcsv.export_to_csv_categories("./data/testSet_categories.csv", out_dic)
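# For reference, a minimal sketch of what dff.LDA_processing is assumed to
# do (a hypothetical stand-in, NOT the real helper in dff): train a gensim
# LDA model on the bag-of-words corpus and return a dense (n_docs, k) matrix
# of per-document topic weights, which can then be hstack-ed with the tf-idf
# features as above.
def _lda_processing_sketch(corpus, dictionary, k):
    from gensim.models import LdaModel
    import numpy as np
    # Fit an LDA model with k topics over the supplied dictionary
    lda = LdaModel(corpus, num_topics=k, id2word=dictionary)
    features = np.zeros((len(corpus), k))
    for i, doc in enumerate(corpus):
        # minimum_probability=0.0 makes gensim report every topic's weight
        for topic, weight in lda.get_document_topics(doc, minimum_probability=0.0):
            features[i, topic] = weight
    return features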