def predict():
    """Prompt for a review, run it through the trained model, and print the predicted star rating."""
    pred_text = input("Please enter a review in English: ")
    contractions = get_contractions()
    pred_text = utils.clean_text(pred_text, contractions)
    pred_seq = tokenizer.text_to_sequence(pred_text, pred=True)
    # Tile the single sequence so it fills a full batch
    pred_seq = np.tile(pred_seq, (args.batch_size, 1))

    checkpoint = "./saves/best_model.ckpt"
    all_preds = []
    with tf.Session(graph=train_graph) as sess:
        saver = tf.train.Saver()
        # Load the trained model
        saver.restore(sess, checkpoint)
        state = sess.run(graph.initial_state)
        # Dropout stays active (keep_prob < 1), so the tiled copies can yield
        # differing predictions for the majority vote below
        feed = {graph.input_data: pred_seq,
                graph.keep_prob: args.keep_prob,
                graph.initial_state: state}
        preds = sess.run(graph.predictions, feed_dict=feed)
        for i in range(len(preds)):
            all_preds.append(preds[i, :])

    all_preds = np.asarray(all_preds)
    y_predictions = np.argmax(all_preds, axis=1)
    # Majority vote across the batch copies
    counts = np.bincount(y_predictions)
    print("\nYou rated the restaurant: " + str(np.argmax(counts)) + " stars!")
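
# A self-contained sketch of the majority-vote step above: np.bincount tallies
# how often each class index appears across the batch copies, and np.argmax
# returns the most frequent one. Toy values, not real model output.
import numpy as np

votes = np.array([4, 4, 3, 4, 5])  # per-copy argmax predictions (toy data)
counts = np.bincount(votes)        # array([0, 0, 0, 1, 3, 1])
print(np.argmax(counts))           # -> 4, the majority class
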
def process_reviews(bus_file='./data/dataset/business.json', rev_file='./data/dataset/review.json'):
    """
    Initialize the review preprocessing pipeline: expand contractions of the text,
    then perform text cleaning.

    :param bus_file: Type string, path to the business JSON file
    :param rev_file: Type string, path to the reviews JSON file
    :return: (revs_list, stars_list) of cleaned review texts and their star ratings
    """
    assert isinstance(bus_file, str)
    assert isinstance(rev_file, str)

    # Collect the IDs of all food-related businesses (a set makes lookups O(1))
    restId = set()
    for line in open(bus_file, 'r'):
        data = json.loads(line)
        if data['categories'] and ('Restaurants' in data['categories'] or 'Food' in data['categories']):
            restId.add(data['business_id'])
    print("There are %d restaurants" % (len(restId)))

    contractions = get_contractions()
    revs_list = []
    stars_list = []
    nolang = []
    k = 0  # Count of processed reviews

    for line in open(rev_file, 'r'):  # encoding='utf-8'
        if k >= args.num_reviews:
            break
        data = json.loads(line)
        text = data['text']
        star = data['stars']
        ID = data['business_id']
        if text is None or star is None:
            continue
        if ID not in restId:
            continue
        # Keep English reviews only
        try:
            if detect(text) == 'en':
                revs_list.append(utils.clean_text(text, contractions))
                stars_list.append(star)
                k += 1
                # Notify for every 5000 reviews
                if len(revs_list) % 5000 == 0:
                    print("Currently processed %d reviews" % len(revs_list))
        except ValueError:
            nolang.append(text)
            print("Detected text with no language! Now at: %d" % len(nolang))

    print("Length of Reviews:\t" + str(len(revs_list)) + "\tLength of Stars:\t" + str(len(stars_list)))
    return revs_list, stars_list
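
# Self-contained illustration of the language gate used above, assuming the
# `langdetect` package (detect(text) returns an ISO code and raises
# LangDetectException when no language can be inferred from the input):
from langdetect import detect, LangDetectException

for s in ["The pasta was fantastic!", "La comida estuvo deliciosa.", "!!!"]:
    try:
        print(s, "->", detect(s))
    except LangDetectException:
        print(s, "-> no detectable language")
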
def main():
    # filepath is expected to be defined at module scope (see the script below)
    hamlet = read_file(filepath)
    hamlet_cleaned = clean_text(hamlet)
    # Earlier counter variants, kept for reference:
    # hamlet_wordcount = wordcount(hamlet_cleaned)
    # hamlet_wordcount = wordcount_counter(hamlet_cleaned)
    # hamlet_wordcount = wordcount_dd(hamlet_cleaned)
    hamlet_wordcount = word_count_err_handling(hamlet_cleaned)
    # print(hamlet_wordcount.most_common(50))
    print(hamlet_wordcount)
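
# word_count_err_handling is not shown in this snippet; a plausible sketch,
# using a plain dict with try/except instead of collections.Counter, might
# look like this (illustrative guess, not the author's implementation):
def word_count_err_handling_sketch(words):
    counts = {}
    for word in words:
        try:
            counts[word] += 1   # raises KeyError the first time a word appears
        except KeyError:
            counts[word] = 1
    return counts

print(word_count_err_handling_sketch("to be or not to be".split()))
# -> {'to': 2, 'be': 2, 'or': 1, 'not': 1}
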
def ProcessData(self):
    """
    Runs the DataProcessing pipeline and creates a dataframe of cleaned
    reviews and associated rating labels.

    Params: None
    Returns: DataFrame with 'reviews' and 'overall' columns
    """
    columns = ["summary", "reviewText", "overall"]
    df = loadData("../Dataset/raw_training_set", columns)
    # Split reviews by star rating (1-5), then rebalance the classes
    split_reviews = [filterReview(df, "overall", i) for i in range(1, 6)]
    df = self.balanceReviews(split_reviews, columns)
    # Concatenate summary and body, then clean the combined text
    df["reviews"] = df["summary"] + " " + df["reviewText"]
    df["reviews"] = df["reviews"].apply(lambda x: clean_text(x))
    # Shuffle the rows and reset the index
    df = df.sample(frac=1).reset_index(drop=True)
    return df[["reviews", "overall"]]
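
# Self-contained demo of the shuffle idiom above: sample(frac=1) draws every
# row in random order and reset_index(drop=True) discards the old index so
# the frame looks freshly built. Toy data only.
import pandas as pd

toy = pd.DataFrame({"reviews": ["great food", "terrible", "okay"], "overall": [5, 1, 3]})
print(toy.sample(frac=1).reset_index(drop=True))
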
from utilities import read_file, clean_text, wordcount

filepath = "D:\\py101\\notebooks\\hamlet.txt"

hamlet = read_file(filepath)
hamlet_cleaned = clean_text(hamlet)
hamlet_wordcount = wordcount(hamlet_cleaned)
print(hamlet_wordcount)

def add_text_clean_col_to_df(df):
    # 'traducciones' (Spanish for "translations") holds the raw text to clean
    df['text_clean'] = df['traducciones'].apply(lambda x: funcs.clean_text(x))
    return df
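
# Toy usage of add_text_clean_col_to_df; `funcs.clean_text` belongs to this
# project, so a minimal stand-in is defined here purely to make the example run:
import pandas as pd

class funcs:  # stand-in for the project's real `funcs` module
    @staticmethod
    def clean_text(text):
        return text.strip().lower()

df = pd.DataFrame({"traducciones": ["  Hello World  ", "Good MORNING"]})
print(add_text_clean_col_to_df(df))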