Example #1
import numpy as np
from tensorflow.keras import preprocessing  # assumed source of pad_sequences; adjust to the project's Keras setup
from preprocess import get_train_data


def review_to_tensor(review_list, word2index, max_sentence_length,
                     max_review_length):
    """
    As the result, each review will be composed of max_rev_len sentences. If the original review is longer than that,
    we truncate it, and if shorter than that, we append empty sentences to it. And each sentence will be composed of
    sent_length words. If the original sentence is longer than that, we truncate it, and if shorter, we append the word
    of 'UNK' to it. Also, we keep track of the actual number of sentences each review contains.
    :param review_list:
    :param word2index
    :param max_sentence_length:
    :param max_review_length:
    :return: [batch_size, max_review_length, max_sentence_length]
    """
    batch_size = len(review_list)
    review_tensor_list = np.zeros(
        (batch_size, max_review_length, max_sentence_length), dtype=np.int32)
    review_lens = []
    for index, review in enumerate(review_list):
        review_tensor = get_train_data(review, word2index)
        review_tensor = preprocessing.sequence.pad_sequences(
            review_tensor,
            maxlen=max_sentence_length,
            padding="post",
            truncating="post",
            value=0)
        # actual number of sentences in this review, capped at max_review_length
        review_lens.append(min(review_tensor.shape[0], max_review_length))
        review_tensor = preprocessing.sequence.pad_sequences(
            [review_tensor],
            maxlen=max_review_length,
            padding="post",
            truncating="post",
            value=np.zeros(max_sentence_length))[0]
        review_tensor_list[index] = review_tensor

    return review_tensor_list, np.array(review_lens)
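A minimal usage sketch, not from the original snippet: it assumes preprocess.get_train_data(review, word2index) returns one list of word indices per sentence, and the toy vocabulary and reviews below are hypothetical; it only checks the padded output shapes.

if __name__ == "__main__":
    # hypothetical toy inputs, purely to illustrate the expected shapes
    word2index = {"UNK": 0, "good": 1, "movie": 2, "bad": 3}
    reviews = ["good movie . bad movie .", "bad movie ."]
    batch, lens = review_to_tensor(reviews, word2index,
                                   max_sentence_length=10,
                                   max_review_length=15)
    print(batch.shape)  # (2, 15, 10)
    print(lens)         # sentences per review, capped at max_review_length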
Example #2
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import torch.utils.data as Data
import torch.nn.functional as F
from preprocess import get_train_data

device = 'cuda' if torch.cuda.is_available() else 'cpu'  # fall back to CPU when no GPU is available
BATCH_SIZE = 20000

print('--getting training data --')
print('device: ', device)
npx, npy = get_train_data()
print('size: ', npx.shape, npy.shape)

x = torch.from_numpy(npx)
y = torch.from_numpy(npy)
x = x.type(torch.float)
y = y.type(torch.float)
x = x.to(device)
y = y.to(device)

torch_dataset = Data.TensorDataset(x, y)
loader = Data.DataLoader(
    dataset=torch_dataset,  # torch TensorDataset format
    batch_size=BATCH_SIZE,  # mini batch size
    shuffle=True,  # random shuffle for training
    num_workers=0,  # subprocesses for loading data
)
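A follow-up sketch, not part of the original snippet, showing how the loader would typically be consumed; the linear model, optimizer and MSE loss are placeholders, and it assumes npx is a 2-D [samples, features] array with one float target per sample in npy.

model = nn.Linear(npx.shape[1], 1).to(device)  # placeholder model, for illustration only
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

for epoch in range(3):
    for step, (batch_x, batch_y) in enumerate(loader):
        optimizer.zero_grad()
        loss = loss_fn(model(batch_x).squeeze(-1), batch_y)
        loss.backward()
        optimizer.step()
    print('epoch %d  loss %.4f' % (epoch, loss.item()))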
Example #3
import preprocess
import train
import predict

if __name__ == "__main__":
    print("preprocess Start")
    preprocess.get_train_data()
    preprocess.get_test_data()

    print("train Start")
    train.full_train()

    print("predict Start")
    predict.past_predict()
Example #4
import codecs
import string
import numpy as np
import os
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import TweetTokenizer

sys.path.append("../")
from preprocess import get_train_data, clean_host_texts

data = "../data/"
train_file = data + "train_noduplicates.csv"
train_hosts, y_train = get_train_data(train_file)

# Loading the textual content of a set of web pages for each host into the dictionary "text".
# The encoding parameter is required since the majority of our text is French.
file_names = os.listdir("../text/text")
splitting_text = "__________________________________________________________________"
file_name_format = "#.txt"
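The loading loop that the comment above describes is not included in the snippet; the sketch below shows one way it might look, assuming one file per host named "<host_id>.txt" and a latin-1 encoding (both are assumptions, not confirmed by the original).

text = dict()
for filename in file_names:
    host_id = filename.split(".")[0]
    # codecs.open lets us pass the encoding explicitly for the French pages
    with codecs.open(os.path.join("../text/text", filename),
                     encoding="latin-1", errors="ignore") as f:
        text[host_id] = f.read().replace("\n", " ")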


def new_f_out(input_file, file_num):
    """
    Create the output file for each subtext of the original, bigger text.

    Input:
        - input_file: the file name (str) of the text we are splitting
        - file_num: index of the subtext
    """
Example #5
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from preprocess import get_train_data, get_test_data
from dtree import DecisionTree

x_train, y_train = get_train_data()
x_test, y_test = get_test_data()
n_attr = np.size(x_train, axis=1)


def learn_depths():  # training decision trees of different depths
    train_acc = np.zeros(n_attr)
    test_acc = np.zeros(n_attr)
    for depth in range(n_attr):
        dtree = DecisionTree(x_train, y_train, max_depth=depth)
        dtree.fit()
        train_acc[depth] = dtree.accuracy(x_train, y_train)
        test_acc[depth] = dtree.accuracy(x_test, y_test)
    df = pd.DataFrame({
        'depth': range(1, n_attr + 1),
        'Train accuracy': train_acc,
        'Test accuracy': test_acc
    })
    # df.to_csv('res/acc.csv')
    return df


def plot_acc(df):
    plt.plot('depth', 'Train accuracy', data=df, label='Train accuracy')
    plt.plot('depth', 'Test accuracy', data=df, label='Test accuracy')
    plt.legend()
    plt.show()
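A short usage note, not in the original snippet: the two helpers above are typically run back to back to sweep over tree depths and plot the resulting accuracy curves.

if __name__ == "__main__":
    acc_df = learn_depths()
    plot_acc(acc_df)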
Example #6
# Imports assumed from the full original script (the snippet starts mid-file,
# after mon4..mon7 have been loaded):
import heapq
from copy import deepcopy as dc
from sklearn.linear_model import Perceptron

import preprocess as pp

# training set: train_d aka x, (mon4, mon5); train_t aka y, (mon6)
# testing set: test_d aka x, (mon4, mon5, mon6); test_t aka y, (mon7)
train_d = dc(mon4)
train_d.extend(mon5)
test_d = dc(train_d)
test_d.extend(mon6)
train_t = dc(mon6)
test_t = dc(mon7)

# processing data
train_d = pp.process_activity(train_d)
train_t = pp.process_activity(mon6)
train_d = pp.normalization(train_d)

train_x, train_y = pp.get_train_data(train_d, train_t)
test_d = pp.process_activity(test_d)
test_d = pp.normalization(test_d)
test_t = pp.process_activity(test_t)

# using a perceptron to train the model
pcpt = Perceptron()
pcpt.fit(train_x, train_y)

# keeping the 2000 predictions with the highest decision scores
result = heapq.nlargest(2000, test_d, key=lambda x: pcpt.decision_function(test_d[x]))

# calculating the quality of the result
precision, recall, f1 = pp.get_comments(result, test_t)
print("Precision rate: %f\nRecall rate: %f\nF1: %f\n" % (precision, recall, f1))