def main(): """ main function to make prediction use random forest :return: """ train = pd.read_csv("/path/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3) test = pd.read_csv("/path/testData.tsv", header=0, delimiter="\t", quoting=3) modelName = "/path/Word2VectforNLPTraining" model = Word2Vec.load(modelName) print("Processing training data...") cleaned_training_data = processData.clean_data(train) trainingDataFV = getAvgFeatureVecs(cleaned_training_data,model) print("Processing test data...") cleaned_test_data = processData.clean_data(test) testDataFV = getAvgFeatureVecs(cleaned_test_data,model) n_estimators = 100 result = randomForestClassifier.rfClassifer(n_estimators, trainingDataFV, train["sentiment"],testDataFV) output = pd.DataFrame(data={"id": test["id"], "sentiment": result}) output.to_csv("Word2Vec_AvgVecPredict.csv", index=False, quoting=3)
import time

import numpy as np
import pandas as pd
from gensim.models import Word2Vec

import kMeans                  # local module: k-means clustering wrapper
import processData             # local module: review-cleaning utilities
import randomForestClassifier  # local module: random forest wrapper


def main():
    """
    Main function: cluster the Word2Vec vocabulary with k-means and classify
    reviews with a random forest over bag-of-centroids features.
    :return: None
    """
    modelName = "Word2VectforNLPTraining"
    model = Word2Vec.load(modelName)
    # model.init_sims(replace=True)
    word_vectors = model.syn0  # old gensim attribute: one row per vocabulary word
    # print(word_vectors[0])

    # Aim for an average of 5 words per cluster
    num_clusters = int(word_vectors.shape[0] / 5)
    # print("number of clusters: {}".format(num_clusters))
    # input("Press enter to continue:")

    print("Clustering...")
    startTime = time.time()
    cluster_index = kMeans.kmeans(num_clusters, word_vectors)
    endTime = time.time()
    print("Time taken for clustering: {} seconds".format(endTime - startTime))

    # Create a word/index dictionary mapping each vocabulary word to a cluster number.
    # zip() makes an iterator that aggregates elements from each of the iterables.
    index_word_map = dict(zip(model.index2word, cluster_index))

    def create_bag_of_centroids(reviewData):
        """
        Assign each word in the review to a centroid and count occurrences.
        Returns a numpy array of dimension num_clusters; each count serves
        as one feature for classification.
        :param reviewData: list of word tokens from one cleaned review
        :return: numpy array of per-cluster counts
        """
        featureVector = np.zeros(num_clusters, dtype=float)
        for word in reviewData:
            if word in index_word_map:
                index = index_word_map[word]
                featureVector[index] += 1
        return featureVector

    train = pd.read_csv("/path/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
    test = pd.read_csv("/path/testData.tsv", header=0, delimiter="\t", quoting=3)

    trainingDataFV = np.zeros((train["review"].size, num_clusters), dtype=float)
    testDataFV = np.zeros((test["review"].size, num_clusters), dtype=float)

    print("Processing training data...")
    cleaned_training_data = processData.clean_data(train)
    for counter, review in enumerate(cleaned_training_data):
        trainingDataFV[counter] = create_bag_of_centroids(review)

    print("Processing test data...")
    cleaned_test_data = processData.clean_data(test)
    for counter, review in enumerate(cleaned_test_data):
        testDataFV[counter] = create_bag_of_centroids(review)

    n_estimators = 100
    result = randomForestClassifier.rfClassifer(n_estimators, trainingDataFV,
                                                train["sentiment"], testDataFV)

    output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
    output.to_csv("Doc2Vec_Clustering.csv", index=False, quoting=3)
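# The local helpers kMeans.kmeans and randomForestClassifier.rfClassifer are
# imported above but their modules are not shown. Minimal sketches of what they
# might wrap, assuming scikit-learn; the signatures match the call sites above,
# but the bodies are assumptions, not the project's confirmed implementations.

from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier


def kmeans(num_clusters, word_vectors):
    """Cluster word vectors with k-means; return the cluster index of each word."""
    clusterer = KMeans(n_clusters=num_clusters)
    return clusterer.fit_predict(word_vectors)  # one cluster label per vocabulary word


def rfClassifer(n_estimators, trainingData, labels, testData):
    """Fit a random forest on training features and predict labels for test features."""
    forest = RandomForestClassifier(n_estimators=n_estimators)
    forest.fit(trainingData, labels)
    return forest.predict(testData)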