コード例 #1
0
def predict_category(X, y, filename):
    """Train a Naive Bayes text classifier and export test predictions.

    Fits a CountVectorizer -> TfidfTransformer -> MultinomialNB pipeline on
    the training data, predicts a category for every row of the test file
    and hands the result to ``dcsv.export_to_csv_categories`` with the
    target path ``./data/testSet_categories.csv``.

    Relies on two names defined outside this function: ``naive_bayes_a``
    (passed as ``alpha`` to ``MultinomialNB``) and ``le`` (a fitted label
    encoder, presumably the one that produced ``y`` -- confirm at the
    call site).

    Args:
        X: training documents, one text per sample.
        y: encoded training labels aligned with ``X``.
        filename: path of the test CSV; it must provide the ``Id``,
            ``Title`` and ``Content`` columns.
    """
    df_test = dcsv.import_from_csv(filename)
    X_test_id = df_test[['Id', 'Title', 'Content']]
    # Merge title and content so the title words also contribute features.
    X_test = X_test_id.apply(
        lambda row: row['Title'] + ' ' + row['Content'], axis=1)

    pipeline = Pipeline([
        ('vect', CountVectorizer(stop_words='english')),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB(alpha=naive_bayes_a)),
    ])
    # Simple pipeline fit on the training set
    pipeline.fit(X, y)
    # Predict the test set
    predicted = pipeline.predict(X_test)

    # Ids of the test rows, in the same order as the predictions.
    ids = X_test_id['Id'].tolist()
    # Decode all predictions in one call: inverse_transform expects a 1-d
    # array, and newer scikit-learn releases reject a bare scalar.
    categories = list(le.inverse_transform(predicted))

    # Gather both columns in a dictionary for export
    out_dic = {'ID': ids, 'Predicted Category': categories}
    # Append the result to the csv
    print("Exporting predicted category to csv")
    dcsv.export_to_csv_categories("./data/testSet_categories.csv", out_dic)
コード例 #2
0
		user_input = int(raw_input("Enter the number again:  "))

	print
	if (user_input==0):
		print("Program exits...")
		sys.exit()
	elif (user_input==1):
		print("LDA features selected...")
	elif (user_input==2):
		print("LDA features + ex1 features selected...")
	else:
		print("Category prediction selected...")
	print
	print("#"*60)

	df=dcsv.import_from_csv(sys.argv[1])

	print("Preprocessing starting...\n")
	#merge content with title, in order to make use of the title help
	X=df[['Title','Content']]
	f=lambda x: x['Title']  + ' '+ x['Content']
	X=X.apply(f, 1)
	le=preprocessing.LabelEncoder()
	le.fit(df["Category"])
	y=le.transform(df["Category"])
	
	# if the user wants to make a prediction test
	if user_input==3:
		dp.predict_category(X,y,1000,le,sys.argv[2])
		print("*"*60)
		print
コード例 #3
0
		# update the bar
		sys.stdout.write("#")
		sys.stdout.flush()
		total_sum = 0
		cluster_index = "Cluster" + str(cluster + 1)
		formated_results[cluster_index] = {}
		for vector in vectors:
			total_sum += 1
			category = category_list[np.where(X_train == vector)[0][0]]
			try:
				formated_results[cluster_index][category] += 1
			except KeyError:
				formated_results[cluster_index][category] = 1
		for category in formated_results[cluster_index]:
			formated_results[cluster_index][category] = round(formated_results[cluster_index][category] / float(total_sum), 2)
	print
	print"Genarating results finished."
	return formated_results

# The main of the program starts here #
# Usage: the first command-line argument is the path of the dataset CSV.
if __name__ == "__main__":
	# Single-argument parenthesised print gives the same output under the
	# Python 2 print statement and the Python 3 print function.
	print("   Clustering with K-Means Program starts...")
	print('=' * 60)
	dataset = dcsv.import_from_csv(sys.argv[1])
	X_train = init_vector(dataset)
	centers, clusters = find_centers(X_train, 5)  # In this example K=5
	print('-' * 60)
	formated_results = generate_formated_results(dataset, X_train, clusters)
	dcsv.export_to_csv_cluster('./data/clustering_KMeans.csv', formated_results)
	print('=' * 60)
	print("   Clustering with K-Means Program ends...")
コード例 #4
0
def predict_category(X, y, k, le, filename):
    """Train an SGD classifier on TF-IDF + LDA features and export predictions.

    The training documents are turned into TF-IDF features and ``k`` LDA
    features, the two are stacked column-wise, and an ``SGDClassifier`` is
    fitted through ``GridSearchCV`` (empty parameter grid, so it only
    cross-validates and refits). The test file goes through the same steps
    and the decoded predictions are handed to
    ``dcsv.export_to_csv_categories`` with the target path
    ``./data/testSet_categories.csv``.

    Relies on ``k_fold`` (passed as ``cv`` to ``GridSearchCV``) being
    defined outside this function.

    Args:
        X: training documents, one text per sample.
        y: encoded training labels aligned with ``X``.
        k: value forwarded to ``dff.LDA_processing`` (reported as "K" in
            the progress message).
        le: fitted label encoder used to decode the predicted labels.
        filename: path of the test CSV; it must provide the ``Id``,
            ``Title`` and ``Content`` columns.
    """
    print("Predict the category with SGD Classifier and K = %d ..." % k)
    df_test = dcsv.import_from_csv(filename)
    X_test_id = df_test[['Id', 'Title', 'Content']]
    # Merge title and content so the title words also contribute features.
    X_test = X_test_id.apply(
        lambda row: row['Title'] + ' ' + row['Content'], axis=1)

    X_train = X
    Y_train = y
    vectorizer = CountVectorizer(stop_words='english',
                                 tokenizer=dff.text_preprocessor)
    transformer = TfidfTransformer()

    clf = SGDClassifier(loss='modified_huber', alpha=0.0001)

    ###################### Preprocess the train set first ##################
    print("LDA features for the Train set")
    # print("") emits the blank separator line under both the Python 2
    # print statement and the print function (a bare ``print`` is a no-op
    # with the latter).
    print("")
    # Convert docs to a list where each element is a list of tokens
    tokens_train = dff.corpus_tokenizer(X_train)
    # Create the Gen-Sim dictionary (similar to the SKLearn vectorizer)
    dictionary_train = corpora.Dictionary(tokens_train)
    # Create the Gen-Sim bag-of-words corpus using the dictionary
    corpus_train = [dictionary_train.doc2bow(text) for text in tokens_train]

    x_train_lda = dff.LDA_processing(corpus_train, dictionary_train, k)

    print("Transforms in Train set")
    print("")
    x_train_vect = vectorizer.fit_transform(X_train)
    x_train_tfidf = transformer.fit_transform(x_train_vect)
    # Merge the TF-IDF and LDA features together
    x_train_merged = sparse.hstack((x_train_tfidf, x_train_lda), format='csr')

    grid_search = GridSearchCV(clf, {}, cv=k_fold, n_jobs=-1)
    # Simple fit
    grid_search.fit(x_train_merged, Y_train)

    ####################### TEST PREDICTION ###########################
    # The test set has to be converted as well in order to match the train
    # set, so the same procedure is followed.
    # NOTE(review): the test set gets its own dictionary and its own
    # LDA_processing call. If LDA_processing fits a fresh model on each
    # call, the LDA columns of train and test may not describe the same
    # topics -- confirm against dff.LDA_processing.
    print("LDA features for the Test set")
    print("")
    # Convert docs to a list where each element is a list of tokens
    tokens_test = dff.corpus_tokenizer(X_test)
    # Create the Gen-Sim dictionary (similar to the SKLearn vectorizer)
    dictionary_test = corpora.Dictionary(tokens_test)
    # Create the Gen-Sim bag-of-words corpus using the dictionary
    corpus_test = [dictionary_test.doc2bow(text) for text in tokens_test]

    x_test_lda = dff.LDA_processing(corpus_test, dictionary_test, k)

    print("Transforms in Test set")
    print("")
    # Reuse the vectorizer/transformer fitted on the train set
    x_test_vect = vectorizer.transform(X_test)
    x_test_tfidf = transformer.transform(x_test_vect)

    x_test_merged = sparse.hstack((x_test_tfidf, x_test_lda), format='csr')
    print("Starting the prediction")
    print("")
    # Predict the categories
    predicted = grid_search.predict(x_test_merged)

    # Ids of the test rows, in the same order as the predictions.
    ids = X_test_id['Id'].tolist()
    # Decode all predictions in one call: inverse_transform expects a 1-d
    # array, and newer scikit-learn releases reject a bare scalar.
    categories = list(le.inverse_transform(predicted))

    # Gather both columns in a dictionary for export
    out_dic = {'ID': ids, 'Predicted Category': categories}
    # Append the result to the csv
    print("Exporting predicted category to csv")
    dcsv.export_to_csv_categories("./data/testSet_categories.csv", out_dic)