Ejemplo n.º 1
0
def gather_sentiment_data_set():
	"""
	Load the labelled and unlabelled review sets and dump both to CSV.

	The four labelled folders under ``SentimentDataSet`` (train/test x
	pos/neg) are read with ``review2pddataframe``, passing 1 for ``pos`` and
	0 for ``neg``; the resulting frames are concatenated and shuffled. The
	``train/unsup`` folder is read with -1 and shuffled as well. After
	``mc.move_to_data_location()`` both frames are written to
	'all_labelled_data_before_processing.csv' and
	'all_unlabelled_data_before_processing.csv'.

	:return: labelled_data, unlabelled_data
	"""
	mc.move_to_main_location()

	def data_set_dir(*parts):
		# Absolute path of a SentimentDataSet sub-folder, resolved against
		# the current directory at the moment of the call.
		return os.path.abspath(os.path.join(os.path.curdir, "SentimentDataSet", *parts))

	pos_train_dir = data_set_dir("train", "pos")
	neg_train_dir = data_set_dir("train", "neg")
	pos_test_dir = data_set_dir("test", "pos")
	neg_test_dir = data_set_dir("test", "neg")

	pos_train = review2pddataframe(pos_train_dir, 1)
	neg_train = review2pddataframe(neg_train_dir, 0)
	pos_test = review2pddataframe(pos_test_dir, 1)
	neg_test = review2pddataframe(neg_test_dir, 0)

	# Fixed concatenation order; the rows are shuffled immediately afterwards.
	labelled_data = pd.concat([neg_test, neg_train, pos_test, pos_train])
	labelled_data = labelled_data.sample(frac=1).reset_index(drop=True)

	unlabelled_dir = data_set_dir("train", "unsup")
	unlabelled_data = review2pddataframe(unlabelled_dir, -1)
	unlabelled_data = unlabelled_data.sample(frac=1).reset_index(drop=True)

	mc.move_to_data_location()
	for frame, csv_name in (
		(labelled_data, 'all_labelled_data_before_processing.csv'),
		(unlabelled_data, 'all_unlabelled_data_before_processing.csv'),
	):
		frame.to_csv(csv_name, encoding="utf-8", index=False)

	return labelled_data, unlabelled_data
Ejemplo n.º 2
0
def preprocessing_reviews_in_df_to_words(data_before_processing, filename):
	"""
	Turn every review into a list of lower-cased words and save the result.

	Each entry of the 'review' column is stripped of every character that is
	not matched by ``[a-zA-Z1-9\\s]``, lower-cased and split on whitespace.
	After ``mc.move_to_data_location()`` the list of word lists is handed to
	``write_list_of_lists_to_csv`` together with ``filename``.

	:param data_before_processing: frame with a 'review' column of strings
	:param filename: forwarded unchanged to write_list_of_lists_to_csv
	:return: processed_reviews (list with one list of words per review)
	"""
	# Raw string: "\s" in a normal literal is an invalid escape sequence.
	# NOTE(review): the class keeps digits 1-9 but drops "0" -- possibly meant
	# to be 0-9; left unchanged so existing processed data stays comparable.
	not_allowed = re.compile(r'[^a-zA-Z1-9\s]')

	# Series.tolist() replaces .as_matrix().tolist(); as_matrix() no longer
	# exists in pandas >= 1.0.
	reviews = data_before_processing['review'].tolist()

	# One substitution per review gives the same text as cleaning it
	# character by character and joining the pieces again.
	processed_reviews = [not_allowed.sub('', review).lower().split() for review in reviews]

	mc.move_to_data_location()
	write_list_of_lists_to_csv(processed_reviews, filename)
	return processed_reviews
Ejemplo n.º 3
0
def preprocessing_reviews_in_df_to_sentences(data_before_processing, filename):
	"""
	Split every review into sentences and every sentence into words.

	Reviews from the 'review' column are stripped and split into sentences
	with the tokenizer loaded from 'tokenizers/punkt/english.pickle'. Each
	sentence is then stripped of every character not matched by
	``[a-zA-Z1-9\\s]``, lower-cased and split on whitespace. The sentences of
	all reviews are returned as one flat list, in review order, and are also
	handed to ``write_list_of_lists_to_csv`` after
	``mc.move_to_data_location()``.

	:param data_before_processing: frame with a 'review' column of strings
	:param filename: forwarded unchanged to write_list_of_lists_to_csv
	:return: sentences (python list of word lists; a sentence that contains
		no allowed characters yields an empty list)
	"""
	tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

	# Raw string: "\s" in a normal literal is an invalid escape sequence.
	# NOTE(review): the class keeps digits 1-9 but drops "0" -- possibly meant
	# to be 0-9; left unchanged so existing processed data stays comparable.
	not_allowed = re.compile(r'[^a-zA-Z1-9\s]')

	# Series.tolist() replaces .as_matrix().tolist(); as_matrix() no longer
	# exists in pandas >= 1.0.
	reviews = data_before_processing['review'].tolist()

	sentences = [
		not_allowed.sub('', sentence).lower().split()
		for review in reviews
		for sentence in tokenizer.tokenize(review.strip())
	]

	mc.move_to_data_location()
	write_list_of_lists_to_csv(sentences, filename)
	return sentences
Ejemplo n.º 4
0
def preprocessing_reviews_in_df(set_df, max_seq_length, filename):
	"""
	Clean the reviews of a frame, encode its labels and save it as CSV.

	Only the 'review' and 'sentiment' columns are kept. Each review is
	stripped of every character not matched by ``[a-zA-Z1-9\\s]``,
	lower-cased, split on whitespace and cut to at most ``max_seq_length``
	words. Each sentiment becomes ``[1, 0]`` when it equals 1 and ``[0, 1]``
	otherwise. After ``mc.move_to_data_location()`` the frame is written to
	``filename + '.csv'`` without the index.

	:param set_df: frame with 'review' (strings) and 'sentiment' columns
	:param max_seq_length: maximum number of words kept per review
	:param filename: output file name without the '.csv' extension
	:return: set_df (the processed two-column frame)
	"""
	# Raw string: "\s" in a normal literal is an invalid escape sequence.
	# NOTE(review): the class keeps digits 1-9 but drops "0" -- possibly meant
	# to be 0-9; left unchanged so existing processed data stays comparable.
	not_allowed = re.compile(r'[^a-zA-Z1-9\s]')

	# Known issue kept from the original note ("BUG when reading from csv --
	# quotes"): presumably the list-valued cells are stored as quoted strings
	# and do not come back as lists when the CSV is read again -- confirm.
	# The single-iteration loop only exists to display a progress bar.
	for _ in tnrange(1, desc='processing'):
		# Explicit copy so the column assignments below act on an independent
		# frame instead of a slice of the caller's frame.
		set_df = set_df[['review', 'sentiment']].copy()
		set_df.info()
		set_df['review'] = set_df['review'].apply(
			lambda text: not_allowed.sub('', text).lower().split()[:max_seq_length])
		set_df['sentiment'] = set_df['sentiment'].apply(
			lambda label: [1, 0] if label == 1 else [0, 1])
		mc.move_to_data_location()
		# Was "set_df.to_(...)", which raises AttributeError.
		set_df.to_csv(filename + '.csv', index=False)
	return set_df
Ejemplo n.º 5
0
def convert_pd_words_reviews_to_np_ids_matrix(reviews_df, maxSeqLength,
                                              wordsList, set_name):
    """
    Convert tokenised reviews into a matrix of word ids and save it.

    Row ``i`` of the matrix holds, for each of the first ``maxSeqLength``
    words of review ``i``, the position of that word in ``wordsList``.
    Words missing from ``wordsList`` and unused trailing cells hold
    ``len(wordsList)``. After ``mc.move_to_data_location()`` the matrix is
    saved as ``set_name + '_ids_matrix'`` and the 'sentiment' column as
    ``set_name + '_sentiment_labels'`` with ``np.save``; finally
    ``mc.move_to_main_location()`` is called.

    :param reviews_df: frame with 'review' (iterables of words) and
        'sentiment' columns
    :param maxSeqLength: number of columns of the matrix; longer reviews
        are truncated
    :param wordsList: list of known words; the index of a word is its id
    :param set_name: prefix of the two saved files
    :return: ids_matrix, sentiment_set
    """
    number_of_rev = len(reviews_df)
    id_of_unknown = len(wordsList)
    ids_matrix = np.full((number_of_rev, maxSeqLength),
                         id_of_unknown,
                         dtype='int32')

    # .values replaces .as_matrix(), which no longer exists in pandas >= 1.0.
    rev_set = reviews_df['review'].values
    sentiment_set = reviews_df['sentiment'].values

    # Build the word -> id lookup once instead of scanning wordsList for
    # every word. setdefault keeps the first position of a duplicated word,
    # which is what list.index() returned.
    word_to_id = {}
    for position, known_word in enumerate(wordsList):
        word_to_id.setdefault(known_word, position)

    for row in tnrange(len(rev_set), desc='processing review'):
        # zip with range() stops after maxSeqLength words, so an over-long
        # review is truncated instead of raising IndexError.
        for col, word in zip(range(maxSeqLength), rev_set[row]):
            ids_matrix[row, col] = word_to_id.get(word, id_of_unknown)

    mc.move_to_data_location()
    np.save(set_name + '_ids_matrix', ids_matrix)
    np.save(set_name + '_sentiment_labels', sentiment_set)
    mc.move_to_main_location()
    return ids_matrix, sentiment_set
Ejemplo n.º 6
0
            batch_predictions = sess.run(predictions, {
                X: x_test_batch,
                dropout_keep_prob: 1.0
            })
            all_predictions = np.concatenate(
                [all_predictions, batch_predictions])
            counter += 1

        # Print accuracy if y_test is defined
        correct_predictions = float(sum(all_predictions == y_test))
        print("Total number of test examples: {}".format(len(y_test)))
        print('Accuracy: {:g}'.format(correct_predictions /
                                      float(len(y_test))))

        # Save the evaluation to a csv
        mc.move_to_data_location()
        # TODO: update this file name once a new model is trained and new data is prepared
        test_set = pd.read_csv('test_processed_set.csv')
        x_raw = test_set['review']
        predictions_human_readable = np.column_stack(
            (np.array(x_raw), all_predictions))
        # without dots
        out_path = os.path.join(FLAGS.checkpoint_dir, "prediction.csv")
        print("Saving evaluation to  {0}".format(out_path))
        with open(out_path, 'w') as f:
            csv.writer(f).writerows(predictions_human_readable)

        acc_path = os.path.join(FLAGS.checkpoint_dir, "accuracy.csv")
        print("Saving accurancy to  {0}".format(acc_path))
        with open(acc_path, 'w') as f:
            f.write('{}'.format(correct_predictions / float(len(y_test))))
Ejemplo n.º 7
0
def load_ids_matrix_and_sentiment(x_filename, y_filename):
    """
    Load a saved ids matrix and its sentiment labels.

    Calls ``mc.move_to_data_location()`` first, then loads both files with
    ``np.load``. The labels are rebuilt with ``np.array(Y.tolist())``.

    :param x_filename: file holding the ids matrix
    :param y_filename: file holding the sentiment labels
    :return: X, Y
    """
    mc.move_to_data_location()
    X = np.load(x_filename)
    # The labels appear to be saved as an object array of Python lists, which
    # np.load rejects unless pickling is allowed (default since NumPy 1.16.3).
    # NOTE(security): unpickling executes arbitrary code -- only load label
    # files produced by this project, never files from an untrusted source.
    Y = np.load(y_filename, allow_pickle=True)
    # Presumably turns an object array of equal-length lists into a regular
    # 2-D array -- confirm against the saved label format.
    Y = np.array(Y.tolist())
    return X, Y