def process(self):
    tok = Tokenizer()
    # consider the entire corpus as text (train + test text columns)
    if self.test_csv:
        text = list(self.df.loc[:, self.text_cols].values) + list(
            self.test_df.loc[:, self.text_cols].values)
    else:
        text = list(self.df.loc[:, self.text_cols].values)
    self.tokens = [tok.tokenizer(x) for x in text]
    self.vocab = Vocab.create(self.tokens, self.max_vocab, self.min_freq)
    self.ntokens = [self.vocab.numericalize(t) for t in self.tokens]
    # only full training
    if self.valid_pct == 0 and self.test_csv is None:
        self.trn_ds = (self.ntokens, self.df.loc[:, self.label_cols].values)
        self.vld_ds = ([], [])
        self.tst_ds = ([], [])
    # holdout
    elif self.valid_pct > 0 and self.test_csv is None:
        self.trn_ds = (self.ntokens[self.cut:],
                       self.df.loc[:, self.label_cols].values[self.cut:])
        self.vld_ds = (self.ntokens[:self.cut],
                       self.df.loc[:, self.label_cols].values[:self.cut])
        self.tst_ds = ([], [])
    # holdout and test prediction
    elif self.valid_pct > 0 and self.test_csv is not None:
        self.trn_tokens = self.ntokens[:len(self.df)]
        self.tst_ds = (self.ntokens[len(self.df):], [])
        trn_tokens = self.trn_tokens[self.cut:]
        vld_tokens = self.trn_tokens[:self.cut]
        self.trn_ds = (trn_tokens, self.df.loc[:, self.label_cols].values[self.cut:])
        self.vld_ds = (vld_tokens, self.df.loc[:, self.label_cols].values[:self.cut])
    # full training and test prediction
    else:
        self.trn_ds = (self.ntokens[:len(self.df)],
                       self.df.loc[:, self.label_cols].values)
        self.vld_ds = ([], [])
        self.tst_ds = (self.ntokens[len(self.df):], [])
    return self.vocab, self.trn_ds, self.vld_ds, self.tst_ds
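# A minimal, self-contained sketch (not part of the original class) of the
# cut-based holdout split used above: the first `cut` examples become the
# validation set and the rest the training set. The derivation of `cut` from
# valid_pct and the toy variable names are illustrative assumptions.
toy_tokens = [[1, 2], [3], [4, 5, 6], [7], [8, 9]]
toy_labels = [0, 1, 0, 1, 0]
valid_pct = 0.2
cut = int(valid_pct * len(toy_tokens))          # cut = 1
trn_ds = (toy_tokens[cut:], toy_labels[cut:])   # 4 training examples
vld_ds = (toy_tokens[:cut], toy_labels[:cut])   # 1 validation example
print(len(trn_ds[0]), len(vld_ds[0]))           # -> 4 1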
def __init__(self, dataset_name, tokenizer, is_train: bool, data_path,
             read_data_func=None, is_to_tokens=True):
    self.is_train = is_train
    self.dataset_name = dataset_name
    self.labels_num = config_data[dataset_name].labels_num
    self.data = []
    self.labels = []
    self.data_token = []
    self.data_seq = []
    self.labels_tensor = []
    self.vocab = None
    self.tokenizer = tokenizer if tokenizer else Tokenizer('normal', remove_stop_words=False)
    self.maxlen = None
    if isinstance(data_path, str):
        data_path = [data_path]
    for path in data_path:
        if read_data_func is not None:
            td, tl = read_data_func(path)
        else:
            td, tl = read_standard_data(path)
        self.data += td
        self.labels += tl
    if is_to_tokens:
        self.data2token()
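# A hypothetical read_data_func for the constructor above, shown only to
# illustrate the expected contract: given a path, return a list of texts and a
# parallel list of labels. The tab-separated "label<TAB>text" layout and the
# integer labels are assumptions, not the format used by read_standard_data.
def read_tsv_data(path):
    texts, labels = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            label, text = line.rstrip('\n').split('\t', 1)
            labels.append(int(label))
            texts.append(text)
    return texts, labels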
n_h = 1000  ## number of hidden nodes in encoder
n_d = 1000  ## number of hidden nodes in decoder
n_y = dim_word
stochastic = False
verbose = 1

## tokenize text, change to matrix
text = []
with open("data/TED2013.raw.en") as f:
    for line in f:
        text.append(line)
        # text.append(korean_morph(line))

input = Tokenizer(n_words_x)
input.fit_on_texts(text)
seq = input.texts_to_sequences(text, n_sentence, n_maxlen)
n_words_x = input.nb_words

'''
text = []
with open("data/TED2013.raw.en") as f:
    for line in f:
        text.append(line)
output = Tokenizer(n_words)
output.fit_on_texts(text)
'''
output = input
# targets = output.texts_to_sequences(text, n_sentence, n_maxlen)
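# The Tokenizer above appears to be a project-specific class (its
# texts_to_sequences takes n_sentence / n_maxlen and the vocabulary size is
# read from .nb_words). A rough equivalent with the standard Keras
# preprocessing API, shown only as an assumption about what the custom class
# does, would be:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

texts = ["this is a sentence .", "this is another sentence ."]
tok = Tokenizer(num_words=20000)        # keep at most 20000 distinct words
tok.fit_on_texts(texts)                 # build the word -> index mapping
seqs = tok.texts_to_sequences(texts)    # lists of word indices
seq = pad_sequences(seqs, maxlen=50)    # pad/truncate to a fixed length
vocab_size = min(20000, len(tok.word_index) + 1)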
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from preprocess import Tokenizer
import reading as rd

# Perceptron on TF-IDF features
score = 0.0
vectorizer_tfidf = TfidfVectorizer(sublinear_tf=True,
                                   max_df=0.5,
                                   tokenizer=Tokenizer(),
                                   lowercase=True,
                                   strip_accents='unicode',
                                   stop_words='english',
                                   ngram_range=(1, 3))

for i in range(0, 5):
    Xtrain, Xtest, y_train, y_test = train_test_split(
        rd.dataset, rd.target, test_size=0.2)
    X_train = vectorizer_tfidf.fit_transform(Xtrain)
    X_test = vectorizer_tfidf.transform(Xtest)
    clf = Perceptron(max_iter=50)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
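# A hedged follow-up (assumption, not original code): `score` is initialized
# above but never updated in the shown loop; the likely intent is
# score += metrics.accuracy_score(y_test, pred) per split, reporting the mean
# afterwards. Continuing from the last split produced by the loop above:
print(metrics.accuracy_score(y_test, pred))
print(metrics.classification_report(y_test, pred))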
from sklearn.feature_extraction.text import HashingVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split
import reading as rd
from preprocess import Tokenizer

vectorizer_hash = HashingVectorizer(tokenizer=Tokenizer(),
                                    lowercase=True,
                                    strip_accents='unicode',
                                    stop_words='english',
                                    ngram_range=(1, 3))
score = 0.0
for i in range(0, 5):
    Xtrain, Xtest, y_train, y_test = train_test_split(
        rd.dataset, rd.target, test_size=0.2)
    X_train = vectorizer_hash.transform(Xtrain)
    X_test = vectorizer_hash.transform(Xtest)

    # ridge classifier on hashed features (least-squares solver)
    clf = RidgeClassifier(tol=1e-2, solver="lsqr")
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    print(metrics.confusion_matrix(y_test, pred))
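# Design note (added): unlike TfidfVectorizer, HashingVectorizer is stateless,
# which is why the snippet calls transform() on both splits without a fit step;
# feature indices come from hashing each token, so no vocabulary is learned
# from the training data. A tiny self-contained check:
from sklearn.feature_extraction.text import HashingVectorizer
hv = HashingVectorizer(n_features=2 ** 10)
X = hv.transform(["no fit call was needed for this text"])
print(X.shape)   # (1, 1024)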
n_d = 1000  ## number of hidden nodes in decoder
n_y = dim_word
stochastic = False
verbose = 1

## tokenize text, change to matrix
text = []
with open("data/TED2013.raw.en") as f:
    for line in f:
        text.append(line)
        # text.append(korean_morph(line))

input = Tokenizer(n_words)
input.fit_on_texts(text)
seq = input.texts_to_sequences(text, n_sentence, n_maxlen)
n_words_x = input.nb_words

text = []
with open("data/TED2013.raw.en") as f:
    for line in f:
        text.append(line)

output = Tokenizer(n_words)
output.fit_on_texts(text)
targets = output.texts_to_sequences(text, n_sentence, n_maxlen)
n_words_y = output.nb_words
n_d = 1000  ## number of hidden nodes in decoder
n_y = dim_word
stochastic = False
verbose = 1

## tokenize text, change to matrix
text = []
with open("data/TED2013.raw.en") as f:
    for line in f:
        text.append(line)
        # text.append(korean_morph(line))

input = Tokenizer(n_words_x)
input.fit_on_texts(text)
seq = input.texts_to_sequences(text, n_sentence, n_maxlen)
n_words_x = input.nb_words

'''
text = []
with open("data/TED2013.raw.en") as f:
    for line in f:
        text.append(line)
output = Tokenizer(n_words)
output.fit_on_texts(text)
'''
output = input
# targets = output.texts_to_sequences(text, n_sentence, n_maxlen)