Example #1
import numpy as np
import tensorflow as tf

def predict():
    pred_text = input("Please enter a review in English: ")
    contractions = get_contractions()
    pred_text = utils.clean_text(pred_text, contractions)
    pred_seq = tokenizer.text_to_sequence(pred_text, pred=True)
    pred_seq = np.tile(pred_seq, (args.batch_size, 1))

    with tf.Session(graph=train_graph) as sess:
        checkpoint = "./saves/best_model.ckpt"
        all_preds = []
        saver = tf.train.Saver()
        # Load the trained model weights from the checkpoint
        saver.restore(sess, checkpoint)
        state = sess.run(graph.initial_state)
        feed = {
            graph.input_data: pred_seq,
            graph.keep_prob: args.keep_prob,  # dropout stays on: the tiled copies then vote like a Monte Carlo ensemble
            graph.initial_state: state
        }

        preds = sess.run(graph.predictions, feed_dict=feed)
        for i in range(len(preds)):
            all_preds.append(preds[i, :])
    all_preds = np.asarray(all_preds)
    y_predictions = np.argmax(all_preds, axis=1)
    counts = np.bincount(y_predictions)  # majority vote over the tiled batch copies
    print("\nYou rated the restaurant: " + str(np.argmax(counts)) + " stars!")
Example #2
import json
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def process_reviews(bus_file='./data/dataset/business.json',
                    rev_file='./data/dataset/review.json'):
    """
    Function will initialize the review preprocessing pipeline. It will expand contractions of text
    and then perform text cleaning
    :param bus_file: Type string, path to business json file
    :param rev_file: Type string, path to reviews json file
    :return:
    """
    assert isinstance(bus_file, str)
    assert isinstance(rev_file, str)

    restId = set()  # a set gives O(1) membership checks in the loop below
    with open(bus_file, 'r') as f:
        for line in f:
            data = json.loads(line)
            # 'categories' can be null in the Yelp dataset
            if data['categories'] and ('Restaurants' in data['categories'] or
                                       'Food' in data['categories']):
                restId.add(data['business_id'])
    print("There are %d restaurants" % (len(restId)))

    contractions = get_contractions()

    revs_list = []
    stars_list = []
    k = 0  # count of English reviews kept
    nolang = []  # reviews whose language could not be detected
    with open(rev_file, 'r', encoding='utf-8') as f:
        for line in f:
            if k >= args.num_reviews:
                break
            data = json.loads(line)
            text = data['text']
            star = data['stars']
            ID = data['business_id']
            if text is None:
                continue
            if star is None:
                continue
            if ID not in restId:
                continue
            # Keep English reviews only
            try:
                if detect(text) == 'en':
                    revs_list.append(utils.clean_text(text, contractions))
                    stars_list.append(star)
                    k += 1
                    # Notify for every 5000 reviews
                    if len(revs_list) % 5000 == 0:
                        print("Currently processed %d reviews" % len(revs_list))
            except LangDetectException:
                # langdetect raises this when it finds no language features
                nolang.append(text)
                print("Detected text with no language! Now at: %d" % len(nolang))
    print("Length of Reviews:\t" + str(len(revs_list)) + "\nLength of Stars:\t" +
          str(len(stars_list)))
    return revs_list, stars_list
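The lists returned here feed the tokenizer and training step that predict() relies on. A hedged usage sketch that persists the cleaned corpus (the pickle step and file name are assumptions, not part of the original pipeline):

import pickle

revs_list, stars_list = process_reviews()
# Save the cleaned corpus so training does not re-run language detection.
with open('./data/cleaned_reviews.pkl', 'wb') as f:
    pickle.dump({'reviews': revs_list, 'stars': stars_list}, f)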
Example #3
def main():
    hamlet = read_file(filepath)

    hamlet_cleaned = clean_text(hamlet)

    # Alternative word-count implementations (sketched after this example):
    # hamlet_wordcount = wordcount(hamlet_cleaned)
    # hamlet_wordcount = wordcount_counter(hamlet_cleaned)
    # hamlet_wordcount = wordcount_dd(hamlet_cleaned)
    hamlet_wordcount = word_count_err_handling(hamlet_cleaned)

    # print(hamlet_wordcount.most_common(50))  # only a Counter supports most_common()
    print(hamlet_wordcount)
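main() switches between several word-count implementations. A hedged sketch of the three alternatives referenced by name (the names come from the example; the bodies are plausible implementations, not the original code):

from collections import Counter, defaultdict

def wordcount_counter(words):
    # Counter does the tallying in one call.
    return Counter(words)

def wordcount_dd(words):
    # defaultdict(int) avoids the missing-key check.
    counts = defaultdict(int)
    for word in words:
        counts[word] += 1
    return counts

def word_count_err_handling(words):
    # Plain dict, using try/except for the first occurrence of each word.
    counts = {}
    for word in words:
        try:
            counts[word] += 1
        except KeyError:
            counts[word] = 1
    return counts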
Example #4
def ProcessData(self):
    """
    Runs the DataProcessing pipeline: builds a dataframe of cleaned
    reviews and their associated rating labels.

    Params:
        None
    Returns:
        DataFrame with 'reviews' (cleaned text) and 'overall' (rating) columns
    """
    columns = ["summary", "reviewText", "overall"]
    df = loadData("../Dataset/raw_training_set", columns)
    # One sub-frame per star rating (1-5), then balance the classes.
    split_reviews = [filterReview(df, "overall", i) for i in range(1, 6)]
    df = self.balanceReviews(split_reviews, columns)
    df["reviews"] = df["summary"] + " " + df["reviewText"]
    df["reviews"] = df["reviews"].apply(clean_text)
    # Shuffle the rows and reset the index.
    df = df.sample(frac=1).reset_index(drop=True)
    return df[["reviews", "overall"]]
Example #5
from utilities import read_file, clean_text, wordcount

filepath = "D:\\py101\\notebooks\\hamlet.txt"

hamlet = read_file(filepath)

hamlet_cleaned = clean_text(hamlet)

hamlet_wordcount = wordcount(hamlet_cleaned)

print(hamlet_wordcount)
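The utilities module imported here is not shown. A minimal sketch of the three helpers, assuming clean_text lower-cases, strips punctuation, and tokenizes (these bodies are assumptions):

# utilities.py -- hypothetical implementations of the imported helpers
import string
from collections import Counter

def read_file(filepath):
    # Read the whole file as one string.
    with open(filepath, 'r', encoding='utf-8') as f:
        return f.read()

def clean_text(text):
    # Lower-case, drop punctuation, and split into words.
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    return text.split()

def wordcount(words):
    # Tally occurrences of each word.
    return Counter(words)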
Example #6
def add_text_clean_col_to_df(df):
    # 'traducciones' holds the translated review text; clean it into a new column.
    df['text_clean'] = df['traducciones'].apply(funcs.clean_text)

    return df
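A quick usage sketch with a toy frame (the pandas import and the funcs module are assumed available):

import pandas as pd

df = pd.DataFrame({'traducciones': ["Great food!!", "Terrible service..."]})
df = add_text_clean_col_to_df(df)
print(df[['traducciones', 'text_clean']])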