Example #1
    def process(self):
        tok = Tokenizer()

        # consider entire corpus as text (train + test text columns)
        if self.test_csv:
            text = list(self.df.loc[:, self.text_cols].values) + list(
                self.test_df.loc[:, self.text_cols].values)
        else:
            text = list(self.df.loc[:, self.text_cols].values)

        self.tokens = [tok.tokenizer(x) for x in text]
        self.vocab = Vocab.create(self.tokens, self.max_vocab, self.min_freq)

        self.ntokens = [self.vocab.numericalize(t) for t in self.tokens]

        # only full training
        if self.valid_pct == 0 and self.test_csv is None:
            self.trn_ds = (self.ntokens, self.df.loc[:,
                                                     self.label_cols].values)
            self.vld_ds = ([], [])
            self.tst_ds = ([], [])

        # holdout
        elif self.valid_pct > 0 and self.test_csv is None:
            self.trn_ds = (self.ntokens[self.cut:],
                           self.df.loc[:, self.label_cols].values[self.cut:])
            self.vld_ds = (self.ntokens[:self.cut],
                           self.df.loc[:, self.label_cols].values[:self.cut])
            self.tst_ds = ([], [])

        # holdout and test prediction
        elif self.valid_pct > 0 and self.test_csv is not None:
            self.trn_tokens = self.ntokens[:len(self.df)]
            self.tst_ds = (self.ntokens[len(self.df):], [])

            trn_tokens = self.trn_tokens[self.cut:]
            vld_tokens = self.trn_tokens[:self.cut]

            self.trn_ds = (trn_tokens,
                           self.df.loc[:, self.label_cols].values[self.cut:])
            self.vld_ds = (vld_tokens,
                           self.df.loc[:, self.label_cols].values[:self.cut])

        # full training and test prediction
        else:
            self.trn_ds = (self.ntokens[:len(self.df)],
                           self.df.loc[:, self.label_cols].values)
            self.vld_ds = ([], [])
            self.tst_ds = (self.ntokens[len(self.df):], [])

        return self.vocab, self.trn_ds, self.vld_ds, self.tst_ds
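The tuples returned by process() are plain (token-id lists, label array) pairs, so padding and batching happen downstream. A minimal usage sketch, assuming a hypothetical pad_batch helper and a pad id of 0 (neither is part of the class above):

import numpy as np

def pad_batch(token_lists, pad_id=0):
    # Right-pad variable-length id lists into one dense int matrix.
    max_len = max(len(t) for t in token_lists)
    batch = np.full((len(token_lists), max_len), pad_id, dtype=np.int64)
    for i, toks in enumerate(token_lists):
        batch[i, :len(toks)] = toks
    return batch

# vocab, trn_ds, vld_ds, tst_ds = TextData(...).process()   # hypothetical constructor
# x_train = pad_batch(trn_ds[0])
# y_train = np.asarray(trn_ds[1])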
    def __init__(self, dataset_name, tokenizer, is_train:bool, data_path, read_data_func=None, is_to_tokens=True):
        self.is_train = is_train
        self.dataset_name = dataset_name
        self.labels_num = config_data[dataset_name].labels_num
        self.data = []
        self.labels = []
        self.data_token = []
        self.data_seq = []
        self.labels_tensor = []
        self.vocab = None
        self.tokenizer = tokenizer if tokenizer else Tokenizer('normal', remove_stop_words=False)
        self.maxlen = None
        if isinstance(data_path, str):
            data_path = [data_path]
        for path in data_path:
            if read_data_func is not None:
                td, tl = read_data_func(path)
            else:
                td, tl = read_standard_data(path)
            self.data += td
            self.labels += tl

        if is_to_tokens:
            self.data2token()
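data2token() is referenced above but not shown; presumably it fills self.data_token from the raw strings. A sketch of that method, under the assumption that the custom Tokenizer instance is callable on a single string and returns a token list:

    def data2token(self):
        # Assumption: self.tokenizer(sentence) -> list of token strings.
        self.data_token = [self.tokenizer(sentence) for sentence in self.data]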
n_h = 1000  ## number of hidden nodes in encoder

n_d = 1000  ## number of hidden nodes in decoder
n_y = dim_word

stochastic = False
verbose = 1

## tokenize text, change to matrix

text = []
with open("data/TED2013.raw.en") as f:
    for line in f:
        text.append(line)
        #text.append(korean_morph(line))
input = Tokenizer(n_words_x)
input.fit_on_texts(text)
seq = input.texts_to_sequences(text, n_sentence, n_maxlen)

n_words_x = input.nb_words
'''
text=[]
with open("data/TED2013.raw.en") as f:
    for line in f:
        text.append(line)

output=Tokenizer(n_words)
output.fit_on_texts(text)
'''
# Source and target share a single tokenizer (and therefore one vocabulary);
# the separate output tokenizer is kept only as the commented-out block above.
output = input
#targets=output.texts_to_sequences(text,n_sentence,n_maxlen)
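The Tokenizer used in these seq2seq snippets is not the stock Keras class (its texts_to_sequences takes extra n_sentence/n_maxlen arguments), but the same fit, numericalize, and pad flow can be sketched with the standard Keras preprocessing utilities; the vocabulary cap and maxlen below are placeholder values, not taken from the snippet:

from tensorflow.keras.preprocessing.text import Tokenizer as KerasTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

texts = ["this is a ted talk .", "another sentence ."]   # stand-in for the TED2013 lines

tok = KerasTokenizer(num_words=20000)   # cap the vocabulary size
tok.fit_on_texts(texts)                 # build the word -> index mapping
seq = tok.texts_to_sequences(texts)     # lists of word indices
seq = pad_sequences(seq, maxlen=30)     # dense (n_sentences, maxlen) matrix

vocab_size = len(tok.word_index)        # analogous to input.nb_words above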
Example #4
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
import matplotlib.pyplot as plt
from preprocess import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
import reading as rd

#   Perceptron on TF-IDF features

score = 0.0
vectorizer_tfidf = TfidfVectorizer(sublinear_tf=True,
                                   max_df=0.5,
                                   tokenizer=Tokenizer(),
                                   lowercase=True,
                                   strip_accents='unicode',
                                   stop_words='english',
                                   ngram_range=(1, 3))

for i in range(0, 5):
    Xtrain, Xtest, y_train, y_test = train_test_split(
        rd.dataset, rd.target, test_size=0.2)

    X_train = vectorizer_tfidf.fit_transform(Xtrain)
    X_test = vectorizer_tfidf.transform(Xtest)

    clf = Perceptron(max_iter=50)

    clf.fit(X_train, y_train)

    pred = clf.predict(X_test)
    # accumulate accuracy over the 5 random splits (score is otherwise unused)
    score += metrics.accuracy_score(y_test, pred)

print("mean accuracy:", score / 5)
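For reference, the same 5 random 80/20 splits can be written more compactly with a Pipeline and cross_val_score; this is an alternative sketch, not part of the original script (rd.dataset and rd.target come from the reading module imported above):

from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(vectorizer_tfidf, Perceptron(max_iter=50))
cv = ShuffleSplit(n_splits=5, test_size=0.2)
scores = cross_val_score(pipe, rd.dataset, rd.target, cv=cv, scoring="accuracy")
print("mean accuracy:", scores.mean())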
from sklearn.feature_extraction.text import HashingVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split
import reading as rd
from preprocess import Tokenizer

vectorizer_hash = HashingVectorizer(tokenizer=Tokenizer(),
                                    lowercase=True,
                                    strip_accents='unicode',
                                    stop_words='english',
                                    ngram_range=(1, 3))

score = 0.0

for i in range(0, 5):
    Xtrain, Xtest, y_train, y_test = train_test_split(
        rd.dataset, rd.target, test_size=0.2)

    X_train = vectorizer_hash.transform(Xtrain)
    X_test = vectorizer_hash.transform(Xtest)

    # Ridge classifier (regularized least squares, solved with LSQR)
    clf = RidgeClassifier(tol=1e-2, solver="lsqr")

    clf.fit(X_train, y_train)

    pred = clf.predict(X_test)
    print(metrics.confusion_matrix(y_test, pred))
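Unlike the TfidfVectorizer in the previous example, HashingVectorizer is stateless: tokens are hashed straight into a fixed-width feature space, so there is nothing to fit and transform alone handles both splits, which is why the loop above never calls fit_transform. A minimal standalone illustration (the n_features value is just a placeholder):

from sklearn.feature_extraction.text import HashingVectorizer

hv = HashingVectorizer(n_features=2**18)   # fixed output width, no stored vocabulary
X = hv.transform(["hashing needs no fit",
                  "so unseen text is handled the same way"])
print(X.shape)                             # (2, 262144) sparse matrix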
n_d = 1000  ## number of hidden nodes in decoder
n_y = dim_word

stochastic = False
verbose = 1

## tokenize text, change to matrix

text = []
with open("data/TED2013.raw.en") as f:
    for line in f:
        text.append(line)
        #text.append(korean_morph(line))
input = Tokenizer(n_words)
input.fit_on_texts(text)
seq = input.texts_to_sequences(text, n_sentence, n_maxlen)

n_words_x = input.nb_words

# Note: the target-side tokenizer below is fit on the same English file;
# the commented-out korean_morph() call above hints at the intended split.
text = []
with open("data/TED2013.raw.en") as f:
    for line in f:
        text.append(line)

output = Tokenizer(n_words)
output.fit_on_texts(text)
targets = output.texts_to_sequences(text, n_sentence, n_maxlen)

n_words_y = output.nb_words
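The commented-out korean_morph() call suggests this was meant to tokenize a parallel English/Korean corpus with one tokenizer per side; a hedged sketch of that variant follows (the Korean file name and the korean_morph helper are assumptions, not taken from the snippet):

# Assumed bilingual variant; "data/TED2013.raw.kr" and korean_morph()
# are not defined in the snippet above.
src_text, tgt_text = [], []
with open("data/TED2013.raw.en") as f_en, open("data/TED2013.raw.kr") as f_kr:
    for en_line, kr_line in zip(f_en, f_kr):
        src_text.append(en_line)
        tgt_text.append(korean_morph(kr_line))   # morphological segmentation for Korean

input = Tokenizer(n_words)
input.fit_on_texts(src_text)
seq = input.texts_to_sequences(src_text, n_sentence, n_maxlen)

output = Tokenizer(n_words)
output.fit_on_texts(tgt_text)
targets = output.texts_to_sequences(tgt_text, n_sentence, n_maxlen)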