コード例 #1
0
ファイル: w2v.py プロジェクト: yflyzhang/bigData
def w2v_train(test=True):
    """Train a 200-dimensional Word2Vec model and write it to disk twice.

    The corpus path and both output paths are resolved through
    ``get_constant`` with the given ``test`` flag. The trained model is
    stored once with ``save`` and once in the textual (non-binary)
    word2vec format.

    Args:
        test: Flag forwarded unchanged to every ``get_constant`` lookup.
    """
    corpus_path, native_path, text_path = [
        get_constant(key, test)
        for key in ('W2V_SRC_PATH', 'W2V_MODEL_PATH', 'W2V_MODEL_TXT_PATH')
    ]

    # MySentences only receives the path; the original author describes it
    # as a memory-friendly iterator over the corpus.
    w2v = gensim.models.Word2Vec(MySentences(corpus_path), size=200)

    w2v.save(native_path)
    w2v.save_word2vec_format(text_path, binary=False)
コード例 #2
0
ファイル: functions.py プロジェクト: yflyzhang/bigData
def scrape(stop_at=None, train=0.85, test=True):
    """Extract the rating and cleaned review text from the raw review dump.

    Every line of the file at ``DATA_PATH`` is parsed as one JSON object.
    Its ``reviewText`` is passed through ``del_punctuations``, lower-cased
    and passed through ``del_stopwords``. The resulting
    ``{"overall", "reviewText"}`` record is written as a JSON line to the
    training file while the running line count is within
    ``train * total_num_reviews``, and to the validation file afterwards.
    The cleaned text alone is also written, one review per line, to
    ``W2V_SRC_PATH``. Progress is printed as a percentage.

    Args:
        stop_at: Progress percentage at which to stop early. A falsy value
            (the default ``None``) processes the whole file.
        train: Fraction of the reviews that goes to the training file.
        test: Flag forwarded unchanged to every ``get_constant`` lookup.

    Returns:
        None.
    """
    HUNDRED = 100

    data_path = get_constant('DATA_PATH', test)
    train_path = get_constant('TRAIN_PATH', test)
    validate_path = get_constant('VALIDATE_PATH', test)
    reviews_path = get_constant('W2V_SRC_PATH', test)

    total_num_reviews = num_of_reviews(data_path)
    # With fewer than 100 reviews the step would be 0 and the modulo in the
    # loop would raise ZeroDivisionError, so report at least every review.
    unit_percent_reviews = max(1, int(total_num_reviews / HUNDRED))
    train_limit = train * total_num_reviews

    progress = 0

    # The with-block closes all four files even if a line fails to parse.
    with open(data_path, 'r') as data_file, \
            open(train_path, 'w') as train_file, \
            open(validate_path, 'w') as validate_file, \
            open(reviews_path, 'w') as reviews_file:
        for count, line in enumerate(data_file, 1):
            data = json.loads(line.strip())

            review = del_punctuations(data['reviewText']).lower()
            review = del_stopwords(review)

            record = json.dumps({"overall": data['overall'],
                                 "reviewText": review})
            target = train_file if count <= train_limit else validate_file
            target.write(record + '\n')

            reviews_file.write(review + '\n')

            if count % unit_percent_reviews == 0:
                progress = int(count * HUNDRED / float(total_num_reviews))
                print('{}%'.format(progress))
            elif count == total_num_reviews:
                print('100%')

            if stop_at and progress == stop_at:
                break

    return
コード例 #3
0
ファイル: tf_training.py プロジェクト: craigstar/bigData
def test_cascade(mode=myconstants.Mode.MODE_MEAN, classification=None):
    """Count correct predictions for cascaded and for direct classification.

    For every validation review two predictions are made:

    * cascade: starting from ``classification``, ``predict`` is called and
      its result is turned into a new class grouping by ``form_classes``
      until a single class is left; a hit is counted when the first rating
      of that class equals the true rating.
    * normal: one ``predict`` call over ``myconstants.CLASSES``; a hit is
      counted when the first element of the prediction equals the true
      rating.

    The true rating is taken as the position of the largest entry in the
    review's rating vector, plus one.

    Args:
        mode: One of ``MODE_PCA``, ``MODE_MEAN`` or ``MODE_D2V`` from
            ``myconstants.Mode``; selects the model and vector dimension.
        classification: Initial class grouping for the cascade. A value
            must be supplied: the default ``None`` fails at ``len()`` as
            soon as there is at least one validation review.

    Returns:
        Tuple ``(accuracy_normal, accuracy_cascade)``; each is a dict
        mapping the ratings 1..5 to the number of correct predictions.

    Raises:
        ValueError: If ``mode`` is not one of the three supported modes.
    """
    # Imported once here; it used to be re-imported on every iteration.
    import operator

    test = False

    accuracy_path = get_constant('ACCURACY_PATH', test)
    w2v_model_path = get_constant('W2V_MODEL_PATH', test)
    d2v_model_path = get_constant('D2V_MODEL_PATH', test)

    # Load model
    if mode == myconstants.Mode.MODE_PCA:
        vec_dim = myconstants.Mode.PCA_COMPONENTS * myconstants.W2V_DIM
        model = gensim.models.Word2Vec.load(w2v_model_path)
    elif mode == myconstants.Mode.MODE_MEAN:
        vec_dim = myconstants.W2V_DIM
        model = gensim.models.Word2Vec.load(w2v_model_path)
    elif mode == myconstants.Mode.MODE_D2V:
        vec_dim = myconstants.D2V_DIM
        model = gensim.models.Doc2Vec.load(d2v_model_path)
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on ``model``.
        raise ValueError('unsupported mode: {!r}'.format(mode))

    accuracy_ratings, accuracy_reviews = get_validate_data(
        accuracy_path, model, mode)

    accuracy_normal = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
    accuracy_cascade = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
    largest_entry = operator.itemgetter(1)

    for idx, reviews in enumerate(accuracy_reviews):
        rating, _ = max(enumerate(accuracy_ratings[idx]), key=largest_entry)
        rating += 1

        # Narrow the candidate classes until only one is left.
        temp_classes = classification
        while len(temp_classes) > 1:
            p = predict(reviews, temp_classes, vec_dim, mode)
            temp_classes = form_classes(p)

        if temp_classes[0][0] == rating:
            accuracy_cascade[temp_classes[0][0]] += 1

        p = predict(reviews, myconstants.CLASSES, vec_dim, mode)
        if p[0] == rating:
            accuracy_normal[p[0]] += 1

        print(idx)

    print('{} {}'.format(accuracy_normal, accuracy_cascade))
    return accuracy_normal, accuracy_cascade
コード例 #4
0
def _clean_sentence(sentence):
    """Strip punctuation, lower-case and remove stop words from a sentence."""
    return ml.del_stopwords(ml.del_punctuations(sentence).lower())


def predict(sentence):
    """Interactively predict and print the rating class of one review.

    The sentence is cleaned with ``_clean_sentence``; while nothing is left
    after cleaning, the user is prompted for another sentence. The cleaned
    sentence is turned into a vector with ``sentence_mean``, multiplied
    with the saved weight matrix, offset by the saved bias, and the entry
    of ``myconstants.CLASSES`` at the arg-max position is printed.

    NOTE(review): ``mode`` is not a parameter and is not assigned in this
    function; presumably it is a module-level global - confirm.

    Args:
        sentence: Raw review text.

    Returns:
        None. The result is printed. If the saved weights cannot be read,
        a hint is printed and the function returns without predicting.

    Raises:
        ValueError: If ``mode`` is not one of the three supported modes.
    """
    print('your review is : %s' % sentence)
    sentence = _clean_sentence(sentence)

    while sentence == '':
        sentence = raw_input("Meaningless sentence! Please enter another sentence :")
        print('your review is : %s' % sentence)
        sentence = _clean_sentence(sentence)

    print('your key words for vector are : %s' % sentence)

    # load model
    test = False
    w2v_model_path = get_constant('W2V_MODEL_PATH', test)
    d2v_model_path = get_constant('D2V_MODEL_PATH', test)

    if mode == myconstants.Mode.MODE_PCA:
        vec_dim = myconstants.Mode.PCA_COMPONENTS * myconstants.W2V_DIM
        model = gensim.models.Word2Vec.load(w2v_model_path)
    elif mode == myconstants.Mode.MODE_MEAN:
        vec_dim = myconstants.W2V_DIM
        model = gensim.models.Word2Vec.load(w2v_model_path)
    elif mode == myconstants.Mode.MODE_D2V:
        vec_dim = myconstants.D2V_DIM
        model = gensim.models.Doc2Vec.load(d2v_model_path)
    else:
        raise ValueError('unsupported mode: {!r}'.format(mode))

    vector = np.array(sentence_mean(sentence, model))
    vector.shape = [1, vec_dim]

    try:
        weights = np.load(get_save_name(myconstants.NUMPY_W, mode))
        bias = np.load(get_save_name(myconstants.NUMPY_B, mode))
    except IOError:
        # The bare except used to print this hint and then crash with a
        # NameError on W; stop here instead.
        print('you have not trained your model yet!')
        return

    x = tf.Variable(vector)
    W = tf.Variable(weights)
    b = tf.Variable(bias)
    y = tf.argmax(tf.nn.softmax(tf.matmul(x, W) + b), 1)

    sess = tf.Session()
    try:
        sess.run(tf.initialize_all_variables())
        result = sess.run(y)
    finally:
        # Release the session's resources even if evaluation fails.
        sess.close()

    total_class = myconstants.CLASSES
    # ``result`` holds one arg-max per input row and there is exactly one
    # row, so take element 0 as a plain int instead of indexing with the
    # array itself.
    print('predicted rating :  {}'.format(total_class[int(result[0])]))
    print('')
コード例 #5
0
def d2v_train(test=True, combine=False, epochs=5, alpha=0.025,
              alpha_step=0.002):
    """Train a Doc2Vec model with a manually stepped learning rate.

    The model is built with ``alpha == min_alpha`` so the rate stays fixed
    within one training pass. After each pass the rate is lowered by
    ``alpha_step`` and the model is written to ``D2V_MODEL_PATH``, each
    save overwriting the previous one. With ``epochs=0`` nothing is saved.

    Args:
        test: Flag forwarded unchanged to every ``get_constant`` lookup.
        combine: When true, the file at ``ACCURACY_PATH`` is added as a
            second source (tagged ``'TEST'``, ``'JSON'``) next to the
            training file.
        epochs: Number of training passes. The default matches the
            previously hard-coded value.
        alpha: Initial learning rate. The default matches the previously
            hard-coded value.
        alpha_step: Amount subtracted from the learning rate after each
            pass. The default matches the previously hard-coded value.
    """
    train_path = get_constant('D2V_SRC_PATH', test)
    accuracy_path = get_constant('ACCURACY_PATH', test)
    model_path = get_constant('D2V_MODEL_PATH', test)

    sources = [[train_path, 'TRAIN']]
    if combine:
        sources.append([accuracy_path, 'TEST', 'JSON'])
    sentences = LabeledLineSentence(sources)

    model = Doc2Vec(alpha=alpha, min_alpha=alpha)  # use fixed learning rate
    model.build_vocab(sentences)
    for _ in range(epochs):
        model.train(sentences)
        model.alpha -= alpha_step  # decrease the learning rate
        model.min_alpha = model.alpha  # fix the learning rate, no decay

        model.save(model_path)
コード例 #6
0
ファイル: tf_training.py プロジェクト: craigstar/bigData
def test_similarity(stop_at=None,
                    max_count=2,
                    test=True,
                    mode=myconstants.Mode.MODE_PCA):
    """Collect sentence representations per rating and print a comparison.

    The training file is read line by line; every line is parsed as JSON
    and its ``reviewText`` is converted with ``sentence_pca`` or
    ``sentence_mean``, depending on ``mode``. The representation is handed
    to ``form_sen_list`` together with the review's ``overall`` rating. As
    soon as ``form_sen_list`` reports that the collection is full, or
    ``stop_at`` lines have been read, ``print_comparison`` is called, the
    line count is printed and reading stops.

    Args:
        stop_at: Line count at which to stop; a falsy value disables it.
        max_count: Forwarded to ``form_sen_list`` and ``print_comparison``.
        test: Flag used to look up ``TRAIN_PATH``. The Word2Vec model is
            always loaded with ``test=False``.
        mode: ``MODE_PCA`` or ``MODE_MEAN`` from ``myconstants.Mode``.

    Returns:
        None.

    Raises:
        ValueError: If ``mode`` is neither ``MODE_PCA`` nor ``MODE_MEAN``.
    """
    if mode == myconstants.Mode.MODE_PCA:
        represent = sentence_pca
    elif mode == myconstants.Mode.MODE_MEAN:
        represent = sentence_mean
    else:
        # Any other mode used to hit an unbound (or stale) sen_represent
        # inside the loop.
        raise ValueError('unsupported mode: {!r}'.format(mode))

    sentence_path = get_constant('TRAIN_PATH', test)
    model_path = get_constant('W2V_MODEL_PATH', test=False)

    model = gensim.models.Word2Vec.load(model_path)

    sen_list = {i: [] for i in range(1, 6)}

    # The with-block closes the file on every exit path, including break.
    with open(sentence_path, 'r') as train_file:
        for count, line in enumerate(train_file, 1):
            data = json.loads(line.strip())

            current_rating = data['overall']
            sen_represent = represent(data['reviewText'], model)

            sen_list, full_flag = form_sen_list(sen_list, sen_represent,
                                                current_rating, max_count)

            if full_flag or (stop_at and stop_at == count):
                print_comparison(sen_list, max_count)
                print(count)
                break

    return
コード例 #7
0
ファイル: tf_training.py プロジェクト: craigstar/bigData
def next_batch(size, test, model, mode):
    """Draw a random training batch of sentence vectors and rating vectors.

    ``size`` line indices are drawn at random from ``CHOICES_TEST`` (when
    ``test`` is true) or ``CHOICES``. Each selected line of the training
    file is parsed as JSON and its ``reviewText`` is converted according to
    ``mode``. When the conversion yields ``None``, another random line is
    tried until a representation is obtained.

    Args:
        size: Number of samples in the batch.
        test: Selects the index pool and is forwarded to ``get_constant``.
        model: Model handed to the sentence-representation helper.
        mode: ``MODE_PCA``, ``MODE_MEAN`` or ``MODE_D2V`` from
            ``myconstants.Mode``.

    Returns:
        A two-element list ``[representations, ratings]`` of NumPy arrays;
        the ratings are the ``one_hot`` encoding of each line's
        ``overall`` field.

    Raises:
        ValueError: If ``mode`` is not one of the three supported modes
            (this used to loop forever).
    """
    from random import choice

    supported_modes = (myconstants.Mode.MODE_PCA,
                       myconstants.Mode.MODE_MEAN,
                       myconstants.Mode.MODE_D2V)
    if mode not in supported_modes:
        raise ValueError('unsupported mode: {!r}'.format(mode))

    choices = CHOICES_TEST if test else CHOICES
    picks = [choice(choices) for _ in range(size)]

    sentence_path = get_constant('TRAIN_PATH', test)
    with open(sentence_path, 'r') as train_file:
        lines = train_file.readlines()

    mean_batch = []
    rating_batch = []

    for i in picks:
        current_i = i
        sen_represent = None

        # Identity test: ``== None`` on a NumPy array compares element-wise
        # and cannot be used as a loop condition.
        while sen_represent is None:
            data = json.loads(lines[current_i])

            rating = data['overall']
            sentence = data['reviewText']

            if mode == myconstants.Mode.MODE_PCA:
                sen_represent = sentence_pca(sentence, model)
            elif mode == myconstants.Mode.MODE_MEAN:
                sen_represent = sentence_mean(sentence, model)
            else:
                sen_represent = sentence_d2v(sentence, current_i, model)

            # Fallback index, only used if this line gave no representation.
            current_i = choice(choices)

        mean_batch.append(sen_represent)
        rating_batch.append(one_hot(rating))

    return [np.array(mean_batch), np.array(rating_batch)]
コード例 #8
0
ファイル: tf_training.py プロジェクト: craigstar/bigData
def tf_train(test=True,
             learning_rate=0.5,
             mode=myconstants.Mode.MODE_MEAN,
             loops=1000):
    """Train a softmax classifier on sentence vectors and save its weights.

    A single linear layer ``y = x W + b`` is trained with gradient descent
    on the softmax cross-entropy, using batches of 100 samples from
    ``next_batch``. Every tenth loop the accuracy on the validation data is
    printed and appended to a result file whose name is built from the
    result directory, the mode name and the ratings of every class in
    ``myconstants.CLASSES``. If saved weights exist they are used as the
    starting point. After training, ``W`` and ``b`` are stored with
    ``np.save``.

    Args:
        test: Flag forwarded to ``get_constant`` and ``next_batch``.
        learning_rate: Step size of the gradient-descent optimizer.
        mode: ``MODE_PCA``, ``MODE_MEAN`` or ``MODE_D2V`` from
            ``myconstants.Mode``; selects the model and vector dimension.
        loops: Number of training steps.

    Returns:
        None.

    Raises:
        ValueError: If ``mode`` is not one of the three supported modes.
    """
    import os

    accuracy_path = get_constant('ACCURACY_PATH', test)
    w2v_model_path = get_constant('W2V_MODEL_PATH', test)
    d2v_model_path = get_constant('D2V_MODEL_PATH', test)
    result_folder = get_constant('RESULT_DIR', test)

    # <result dir><mode name>_<ratings of class 1>_<ratings of class 2>....txt
    class_suffix = ''.join(
        '_' + ''.join(str(rating) for rating in cla)
        for cla in myconstants.CLASSES)
    result_path = (result_folder + myconstants.Mode.MODE_NAME[mode] +
                   class_suffix + '.txt')

    # Load model
    if mode == myconstants.Mode.MODE_PCA:
        vec_dim = myconstants.Mode.PCA_COMPONENTS * myconstants.W2V_DIM
        model = gensim.models.Word2Vec.load(w2v_model_path)
    elif mode == myconstants.Mode.MODE_MEAN:
        vec_dim = myconstants.W2V_DIM
        model = gensim.models.Word2Vec.load(w2v_model_path)
    elif mode == myconstants.Mode.MODE_D2V:
        vec_dim = myconstants.D2V_DIM
        model = gensim.models.Doc2Vec.load(d2v_model_path)
    else:
        raise ValueError('unsupported mode: {!r}'.format(mode))

    accuracy_ratings, accuracy_reviews = get_validate_data(
        accuracy_path, model, mode)

    # Runs are appended to an existing result file, separated by blank lines.
    appending = os.path.isfile(result_path)

    # tf training graph
    graph = tf.Graph()
    with open(result_path, 'a' if appending else 'w') as result_file, \
            graph.as_default():
        if appending:
            result_file.write('\n\n')

        sess = tf.InteractiveSession()
        try:
            try:
                initial_w = np.load(get_save_name(myconstants.NUMPY_W, mode))
                initial_b = np.load(get_save_name(myconstants.NUMPY_B, mode))
            except (IOError, ValueError):
                # Missing or unreadable weight files: start from random
                # values. Other errors now propagate instead of being
                # silently swallowed.
                initial_w = tf.truncated_normal([vec_dim, TOTAL_CLASSES])
                initial_b = tf.truncated_normal([TOTAL_CLASSES])
                print('New {}: {}'.format(myconstants.Mode.MODE_NAME[mode],
                                          myconstants.CLASSES))
            else:
                print('Exist {}: {}'.format(myconstants.Mode.MODE_NAME[mode],
                                            myconstants.CLASSES))

            W = tf.Variable(initial_w)
            b = tf.Variable(initial_b)

            x = tf.placeholder(tf.float32, [None, vec_dim])
            y = tf.matmul(x, W) + b

            y_ = tf.placeholder(tf.float32, [None, TOTAL_CLASSES])

            # Keyword arguments: the positional order differs between
            # TensorFlow releases and newer ones reject positional calls.
            cross_entropy = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_))
            train_step = tf.train.GradientDescentOptimizer(
                learning_rate).minimize(cross_entropy)

            correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

            sess.run(tf.initialize_all_variables())

            for step in range(1, loops + 1):
                mean_batch, rating_batch = next_batch(100, test, model, mode)

                sess.run(train_step,
                         feed_dict={x: mean_batch, y_: rating_batch})

                if step % 10 == 0:
                    print_value = "percent test accuracy {}, ".format(
                        accuracy.eval(feed_dict={
                            x: accuracy_reviews,
                            y_: accuracy_ratings
                        }))
                    print_value += 'loop: {}'.format(step)
                    result_file.write(print_value + '\n')
                    print(print_value)

            print('{}: {}'.format(myconstants.Mode.MODE_NAME[mode],
                                  myconstants.CLASSES))
            np.save(get_save_name(myconstants.NUMPY_W, mode), sess.run(W))
            np.save(get_save_name(myconstants.NUMPY_B, mode), sess.run(b))
        finally:
            # Closing also removes the session from the default-session
            # stack, so repeated calls do not pile up sessions.
            sess.close()
    return