def run(self):
    self.analyzer = SentimentIntensityAnalyzer()
    self.output().makedirs()
    data = dataset.Dataset().load_named(self.dataset)
    data_sent = self.calc_sent(data, self.dataset)
    np.save('cache/sentiment/{}.npy'.format(self.dataset), data_sent)
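# calc_sent is not shown in this section; below is a minimal sketch of one
# plausible implementation, assuming the VADER analyzer created above and
# scoring each pair as the absolute difference of the four polarity components
# (neg, neu, pos, compound). The helper name and the use of the raw question
# columns are assumptions for illustration, not the original implementation.
import numpy as np

def calc_sent_sketch(data, analyzer):
    keys = ['neg', 'neu', 'pos', 'compound']
    out = np.zeros((data.shape[0], len(keys)))
    for i, (q1, q2) in enumerate(zip(data.question1_raw, data.question2_raw)):
        s1 = analyzer.polarity_scores(q1)
        s2 = analyzer.polarity_scores(q2)
        out[i] = [abs(s1[k] - s2[k]) for k in keys]
    return out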
def run(self):
    self.output().makedirs()
    kvecs = gensim.models.KeyedVectors.load_word2vec_format(w2v_file)
    data = dataset.Dataset().load_named(self.dataset)
    dists = np.zeros(data.shape[0])
    i = 0
    #def wmd(row):
    #    return kvecs.wmdistance(row.question1_raw, row.question2_raw)
    #dists = dask.dataframe.from_pandas(data, npartitions=16, sort=False).apply(wmd).compute().values
    for q1, q2 in tqdm(zip(data.question1_raw, data.question2_raw),
                       total=data.question1_raw.shape[0],
                       desc='Computing %s WMD' % self.dataset):
        dists[i] = kvecs.wmdistance(q1, q2)
        i += 1
    np.save('cache/functional-distance/%s-%s.tmp.npy' % (self.name, self.dataset), dists)
    os.rename('cache/functional-distance/%s-%s.tmp.npy' % (self.name, self.dataset),
              'cache/functional-distance/%s-%s.npy' % (self.name, self.dataset))
def requires(self):
    yield dataset.Dataset()
    yield tfidf_matrix.TFIDFFeature()
    yield question_vectors.QuestionVector()
    yield distances.AllDistances()
    yield shared_entites.SharedEntities()
    yield wordmat_distance.WordMatDistance()
def vectorize_dataset(self, dataset_name):
    examples = dataset.Dataset().load_named(dataset_name)
    # Stack question 1 on top of question 2, vectorize everything in one pass,
    # then split the result back into its two halves.
    all_examples = pandas.concat(
        [examples.question1_tokens, examples.question2_tokens])
    all_vecs = np.vstack(all_examples.progress_apply(
        self.vectorize_sent)).astype(np.float16)
    return all_vecs[:all_vecs.shape[0] // 2], all_vecs[all_vecs.shape[0] // 2:]
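# vectorize_sent is referenced above but not shown; here is a minimal sketch
# under the assumption that a sentence vector is the mean of its word vectors,
# using the spaCy 1.x loader seen elsewhere in this section. The function name,
# the mean pooling, and the 300-d fallback are illustrative assumptions.
import numpy as np
from spacy.en import English

_nlp = English()  # same loader style as used in the Keras data-prep task below

def vectorize_sent_sketch(tokens):
    vecs = [_nlp.vocab[t].vector for t in tokens if _nlp.vocab[t].has_vector]
    if not vecs:
        return np.zeros(300, dtype=np.float32)  # 300-d vectors assumed
    return np.mean(vecs, axis=0)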
def testicles(self):
    X = self._load_named('train')
    y = dataset.Dataset().load_named('train').is_duplicate.values
    cls = lightgbm.LGBMClassifier(num_leaves=512, n_estimators=500)
    cls.fit(X.values, y)

    X_test = self._load_named('valid').values
    y_test = dataset.Dataset().load_named('valid').is_duplicate.values
    y_pred = cls.predict_proba(X_test)[:, 1]
    scoring = core.score_data(y_test, y_pred)
    importances = pandas.Series(cls.feature_importances_, index=X.columns)
    print(scoring)
    print(importances)
    with self.output().open('w') as f:
        f.write("Score: {:f}\n".format(scoring))
        f.write(str(importances))
def load_data(self, subset):
    X1 = feature_collection.FeatureCollection().load_named(subset)
    X2 = count_matrix.CountFeature.load_dataset(subset)
    y = dataset.Dataset().load_named(subset).is_duplicate.values
    res = sp.hstack([X1.values, X2])
    cols = list(X1.columns) + ['count.%d' % i for i in range(X2.shape[1])]
    return res, y, cols
def valid(self):
    pred = self.predict('valid')
    print(colors.green | "prediction sample...")
    print(colors.green | str(pred.head()))
    y = dataset.Dataset().load()[2]
    loss = core.score_data(y.is_duplicate, pred)
    print(colors.green | "Performance: " + str(loss))
    return pred
def valid(self):
    pred = self.pred_simple_target('valid')
    print(colors.green | "prediction sample...")
    print(colors.green | str(pred.head()))
    y = dataset.Dataset().load()[2]
    weights = core.weights[y.is_duplicate.values]
    loss = metrics.log_loss(y.is_duplicate, pred.is_duplicate,
                            sample_weight=weights)
    print(colors.green | "Performance: " + str(loss))
    return pred
def run(self):
    X = np.load('cache/sentiment/valid.npy')
    y = dataset.Dataset().load_named('valid').is_duplicate.values.astype(int)
    X_test = np.load('cache/sentiment/merge.npy')
    y_test = dataset.Dataset().load_named('merge').is_duplicate.values.astype(int)

    summary_cls = ensemble.ExtraTreesClassifier(n_estimators=200, n_jobs=-1)
    summary_cls.fit(X, y)
    perf = summary_cls.predict_proba(X_test)
    importances = pandas.Series(summary_cls.feature_importances_,
                                index=['neg', 'neu', 'pos', 'compound'])
    score = metrics.log_loss(y_test, perf)
    print(score)
    print(importances)
    with self.output().open('w') as f:
        f.write(str(score))
        f.write('\n')
        f.write(str(importances))
def complete(self):
    # The chunked per-file check for the test subset is currently disabled by
    # the `if False and ...` guard; only the single-file check below runs.
    if False and self.data_subset == 'test':
        if not os.path.exists('cache/vw_data/test_0.svm'):
            return False
        test_size = dataset.Dataset().load_test().shape[0]
        target_ixs = self.test_target_indexes(test_size)
        for target_ix in target_ixs:
            if not os.path.exists('cache/vw_data/test_%d.svm' % target_ix):
                return False
        return True
    else:
        return os.path.exists('cache/vw_data/%s.svm' % self.data_subset)
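# test_target_indexes and max_size are used here and in the tasks below but are
# not defined in this section; a minimal sketch consistent with that usage
# would step through the test set in blocks of max_size rows. The name and
# signature below are assumptions for illustration.
def test_target_indexes_sketch(n_rows, max_size):
    # e.g. n_rows=10, max_size=4 -> [0, 4, 8]
    return list(range(0, n_rows, max_size))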
def run(self):
    self.output().makedirs()
    data = dataset.Dataset().load_named(self.dataset)
    dists = np.zeros(data.shape[0])
    i = 0
    for _, row in tqdm(data.iterrows(),
                       desc='%s distance %s' % (self.dataset, self.name),
                       total=data.shape[0]):
        dists[i] = self.dist_fn(row.question1_tokens, row.question2_tokens)
        i += 1
    np.save('cache/functional-distance/%s-%s.tmp.npy' % (self.name, self.dataset), dists)
    os.rename('cache/functional-distance/%s-%s.tmp.npy' % (self.name, self.dataset),
              'cache/functional-distance/%s-%s.npy' % (self.name, self.dataset))
def run(self):
    train_data, _, _ = dataset.Dataset().load()
    vocab_count = Counter()
    for sent in tqdm.tqdm(train_data.question1_tokens,
                          desc='Counting questions one',
                          total=train_data.shape[0]):
        for tok in sent:
            vocab_count[tok] += 1
    for sent in tqdm.tqdm(train_data.question2_tokens,
                          desc='Counting questions two',
                          total=train_data.shape[0]):
        for tok in sent:
            vocab_count[tok] += 1
    vocab_counts = pandas.Series(vocab_count)
    self.output().makedirs()
    vocab_counts.to_msgpack(self.output().path)
def test(self):
    test_size = dataset.Dataset().load_test().shape[0]
    test_tasks = lightgbm.SVMData.test_target_indexes(test_size)
    print(colors.green & colors.bold |
          "Predicting test values, this takes a long time...")
    for target_ix in tqdm(test_tasks, desc='Predicting'):
        with open('cache/xgb/test.conf', 'w') as f:
            f.write(self.test_conf % (target_ix, target_ix))
        local[self.xgb_path]['cache/xgb/test.conf'] & FG
    preds = []
    for target_ix in tqdm(test_tasks, desc='Reading results file'):
        pred = pandas.read_csv('./cache/xgb/test_preds_%d.csv' % target_ix,
                               names=['is_duplicate'])
        pred.index = pandas.Series(
            np.arange(target_ix,
                      min(test_size, target_ix + lightgbm.SVMData.max_size)),
            name='test_id')
        preds.append(pred)
    preds = pandas.concat(preds, 0)
    return preds
def run(self):
    def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
        # Clean the text, with the option to remove stopwords and to stem words.
        # Convert words to lower case and split them
        text = text.lower().split()
        text = " ".join(text)

        # Clean the text
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"can't", "cannot ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)

        # Optionally, shorten words to their stems
        if stem_words:
            text = text.split()
            stemmer = SnowballStemmer('english')
            stemmed_words = [stemmer.stem(word) for word in text]
            text = " ".join(stemmed_words)

        # Return the cleaned text as a single string
        return text

    train_texts_1 = []
    train_texts_2 = []
    train_labels = []
    train_dataset = dataset.Dataset().load_named('train')
    for _, row in tqdm(train_dataset.iterrows(), total=train_dataset.shape[0]):
        train_texts_1.append(text_to_wordlist(row.question1_raw))
        train_texts_2.append(text_to_wordlist(row.question2_raw))
        train_labels.append(row.is_duplicate)
    print('Found %s texts in train.csv' % len(train_texts_1))

    valid_texts_1 = []
    valid_texts_2 = []
    valid_labels = []
    valid_dataset = dataset.Dataset().load_named('valid')
    for _, row in tqdm(valid_dataset.iterrows(), total=valid_dataset.shape[0]):
        valid_texts_1.append(text_to_wordlist(row.question1_raw))
        valid_texts_2.append(text_to_wordlist(row.question2_raw))
        valid_labels.append(row.is_duplicate)
    print('Found %s texts in valid.csv' % len(valid_texts_1))

    merge_texts_1 = []
    merge_texts_2 = []
    merge_labels = []
    merge_dataset = dataset.Dataset().load_named('merge')
    for _, row in tqdm(merge_dataset.iterrows(), total=merge_dataset.shape[0]):
        merge_texts_1.append(text_to_wordlist(row.question1_raw))
        merge_texts_2.append(text_to_wordlist(row.question2_raw))
        merge_labels.append(row.is_duplicate)
    print('Found %s texts in merge.csv' % len(merge_texts_1))

    test_texts_1 = []
    test_texts_2 = []
    test_dataset = dataset.Dataset().load_named('test')
    for _, row in tqdm(test_dataset.iterrows(), total=test_dataset.shape[0]):
        test_texts_1.append(text_to_wordlist(row.question1_raw))
        test_texts_2.append(text_to_wordlist(row.question2_raw))
    print('Found %s texts in test.csv' % len(test_texts_1))

    tokenizer = Tokenizer(num_words=self.max_nb_words)
    tokenizer.fit_on_texts(train_texts_1 + valid_texts_1 + merge_texts_1 +
                           test_texts_1 + train_texts_2 + valid_texts_2 +
                           merge_texts_2 + test_texts_2)
    train_sequences_1 = tokenizer.texts_to_sequences(train_texts_1)
    train_sequences_2 = tokenizer.texts_to_sequences(train_texts_2)
    valid_sequences_1 = tokenizer.texts_to_sequences(valid_texts_1)
    valid_sequences_2 = tokenizer.texts_to_sequences(valid_texts_2)
    merge_sequences_1 = tokenizer.texts_to_sequences(merge_texts_1)
    merge_sequences_2 = tokenizer.texts_to_sequences(merge_texts_2)
    test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
    test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)

    word_index = tokenizer.word_index
    print('Found %s unique tokens' % len(word_index))

    train_data_1 = pad_sequences(train_sequences_1, maxlen=self.MAX_SEQUENCE_LENGTH)
    train_data_2 = pad_sequences(train_sequences_2, maxlen=self.MAX_SEQUENCE_LENGTH)
    valid_data_1 = pad_sequences(valid_sequences_1, maxlen=self.MAX_SEQUENCE_LENGTH)
    valid_data_2 = pad_sequences(valid_sequences_2, maxlen=self.MAX_SEQUENCE_LENGTH)
    merge_data_1 = pad_sequences(merge_sequences_1, maxlen=self.MAX_SEQUENCE_LENGTH)
    merge_data_2 = pad_sequences(merge_sequences_2, maxlen=self.MAX_SEQUENCE_LENGTH)
    test_data_1 = pad_sequences(test_sequences_1, maxlen=self.MAX_SEQUENCE_LENGTH)
    test_data_2 = pad_sequences(test_sequences_2, maxlen=self.MAX_SEQUENCE_LENGTH)
    train_labels = np.array(train_labels)
    valid_labels = np.array(valid_labels)
    merge_labels = np.array(merge_labels)

    English = spacy.en.English()
    embedding_matrix = np.zeros((self.max_nb_words, 300))
    for word, i in word_index.items():
        if i >= self.max_nb_words:
            # word_index covers the full vocabulary; skip indices that do not
            # fit in the embedding matrix instead of indexing past its end.
            continue
        lex = English.vocab[word]
        if not lex.is_oov:
            embedding_matrix[i] = lex.vector
    print('Null word embeddings: %d' %
          np.sum(np.sum(embedding_matrix, axis=1) == 0))

    self.output().makedirs()
    np.save('cache/kaggledata/train_1.npy', train_data_1)
    np.save('cache/kaggledata/train_2.npy', train_data_2)
    np.save('cache/kaggledata/train_labels.npy', train_labels)
    np.save('cache/kaggledata/valid_1.npy', valid_data_1)
    np.save('cache/kaggledata/valid_2.npy', valid_data_2)
    np.save('cache/kaggledata/valid_labels.npy', valid_labels)
    np.save('cache/kaggledata/merge_1.npy', merge_data_1)
    np.save('cache/kaggledata/merge_2.npy', merge_data_2)
    np.save('cache/kaggledata/merge_labels.npy', merge_labels)
    np.save('cache/kaggledata/test_1.npy', test_data_1)
    np.save('cache/kaggledata/test_2.npy', test_data_2)
    np.save('cache/kaggledata/embedding.npy', embedding_matrix)
    with self.output().open('w'):
        pass
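# A minimal sketch of how the cached arrays above might be consumed downstream,
# assuming a Keras Embedding layer initialised from the saved matrix; the layer
# configuration is an illustrative assumption, not the original model code.
import numpy as np
from keras.layers import Embedding

embedding_matrix = np.load('cache/kaggledata/embedding.npy')
embedding_layer = Embedding(embedding_matrix.shape[0],   # vocabulary size
                            embedding_matrix.shape[1],   # vector dimension (300)
                            weights=[embedding_matrix],
                            trainable=False)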
def run(self):
    assert self.data_subset in {'train', 'test', 'merge', 'valid'}
    if self.data_subset in {'train', 'valid', 'merge'}:
        ix = {'train': 0, 'merge': 1, 'valid': 2}[self.data_subset]
        #vecs = tfidf_matrix.TFIDFFeature.load_dataset(self.data_subset)
        vecs = count_matrix.CountFeature.load_dataset(self.data_subset)
        qvecs = question_vectors.QuestionVector().load_named(self.data_subset)
        dvecs = distances.AllDistances().load()[ix]
        evecs = shared_entites.SharedEntities().load_named(self.data_subset)
        wmvecs = wordmat_distance.WordMatDistance().load_named(self.data_subset)
        labels = dataset.Dataset().load()[ix].is_duplicate.values
    else:
        #vecs = tfidf_matrix.TFIDFFeature.load_dataset('test')
        vecs = count_matrix.CountFeature.load_dataset('test')
        qvecs = question_vectors.QuestionVector().load_named('test')
        dvecs = distances.AllDistances().load_named('test')
        evecs = shared_entites.SharedEntities().load_named('test')
        wmvecs = wordmat_distance.WordMatDistance().load_named('test')
        labels = np.zeros(qvecs.shape[0], dtype='uint8')

    qvec_offset = 1
    dvec_offset = qvecs.shape[1]
    evec_offset = dvec_offset + dvecs.shape[1]
    wmvec_offset = evec_offset + evecs.shape[1]
    vecs_offset = wmvec_offset + wmvecs.shape[1]

    def write_row(i, f):
        row = vecs[i]
        qvec = qvecs[i].copy()
        dvec = dvecs[i].copy()
        evec = evecs[i].copy()
        wmvec = wmvecs[i].copy()
        label = labels[i] * 2 - 1
        qvec[np.isnan(qvec)] = -1
        dvec[np.isnan(dvec)] = -1
        evec[np.isnan(evec)] = -1
        wmvec[np.isnan(wmvec)] = -1
        qvec_entries = ' '.join('%d:%.2f' % ix_v for ix_v in enumerate(qvec))
        dvec_entries = ' '.join('%d:%.2f' % ix_v for ix_v in enumerate(dvec))
        evec_entries = ' '.join('%d:%.2f' % ix_v for ix_v in enumerate(evec))
        wmvec_entries = ' '.join('%d:%.2f' % ix_v for ix_v in enumerate(wmvec))
        entries = " ".join("%d:%.2f" % (ind + vecs_offset, data)
                           for ind, data in zip(row.indices, row.data))
        f.write("%d %f |Q %s |D %s |E %s |M %s |W %s\n" %
                (label, core.weights[label], qvec_entries, dvec_entries,
                 evec_entries, wmvec_entries, entries))

    os.makedirs('cache/vw_data', exist_ok=True)
    if False:
        for start_ix in tqdm(self.test_target_indexes(vecs.shape[0])):
            with open('cache/vw_data/test_%d_tmp.svm' % start_ix, 'w') as f:
                for i in range(start_ix,
                               min(start_ix + self.max_size, vecs.shape[0])):
                    write_row(i, f)
            os.rename('cache/vw_data/test_%d_tmp.svm' % start_ix,
                      'cache/vw_data/test_%d.svm' % start_ix)
    else:
        with open('cache/vw_data/%s_tmp.svm' % self.data_subset, 'w') as f:
            for i in tqdm(range(qvecs.shape[0]),
                          desc='writing %s data' % self.data_subset):
                write_row(i, f)
        os.rename('cache/vw_data/%s_tmp.svm' % self.data_subset,
                  'cache/vw_data/%s.svm' % self.data_subset)
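# The files written above use Vowpal Wabbit's text format (label, weight, and
# the Q/D/E/M/W namespaces). A hypothetical training invocation, using plumbum
# as elsewhere in this section, might look like the sketch below; the vw flags
# and model path are illustrative assumptions, not taken from the original code.
from plumbum import local, FG

vw = local['vw']
(vw['--loss_function', 'logistic', '--link', 'logistic',
    '-d', 'cache/vw_data/train.svm',
    '-f', 'cache/vw_data/model.vw'] & FG)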
def requires(self):
    yield dataset.Dataset()
    yield count_matrix.CountFeature()
    yield feature_collection.FeatureCollection()
def load_data(self, subset):
    X = feature_collection.FeatureCollection().load_named(subset)
    y = dataset.Dataset().load_named(subset).is_duplicate.values
    cols = X.columns
    return X.values, y, cols
def requires(self):
    yield dataset.Dataset()
    yield vocab.Vocab()
def requires(self):
    yield lightgbm.TrainSVMData()
    yield lightgbm.ValidSVMData()
    yield lightgbm.MergeSVMData()
    yield lightgbm.TestSVMData()
    yield dataset.Dataset()
def requires(self):
    yield dataset.Dataset()
def requires(self):
    yield count_matrix.CountFeature()
    yield dataset.Dataset()
def requires(self):
    return dataset.Dataset()
def run(self):
    self.output().makedirs()
    train_data = dataset.Dataset().load_named('train')
    merge_data = dataset.Dataset().load_named('merge')
    valid_data = dataset.Dataset().load_named('valid')
    test_data = dataset.Dataset().load_named('test')
    all_questions = pandas.concat([
        train_data.question1_raw, train_data.question2_raw,
        merge_data.question1_raw, merge_data.question2_raw,
        valid_data.question1_raw, valid_data.question2_raw,
        test_data.question1_raw, test_data.question2_raw,
    ])
    question_freq = all_questions.value_counts().to_dict()

    train_feature = pandas.DataFrame({
        'freq1': train_data.question1_raw.apply(question_freq.get),
        'freq2': train_data.question2_raw.apply(question_freq.get)
    })
    train_feature['qfreq_diff'] = np.abs(train_feature.freq1 - train_feature.freq2)
    np.save('cache/question-freq/train.npy', train_feature.values)

    merge_feature = pandas.DataFrame({
        'freq1': merge_data.question1_raw.apply(question_freq.get),
        'freq2': merge_data.question2_raw.apply(question_freq.get)
    })
    merge_feature['qfreq_diff'] = np.abs(merge_feature.freq1 - merge_feature.freq2)
    np.save('cache/question-freq/merge.npy', merge_feature.values)

    valid_feature = pandas.DataFrame({
        'freq1': valid_data.question1_raw.apply(question_freq.get),
        'freq2': valid_data.question2_raw.apply(question_freq.get)
    })
    valid_feature['qfreq_diff'] = np.abs(valid_feature.freq1 - valid_feature.freq2)
    np.save('cache/question-freq/valid.npy', valid_feature.values)

    test_feature = pandas.DataFrame({
        'freq1': test_data.question1_raw.apply(question_freq.get),
        'freq2': test_data.question2_raw.apply(question_freq.get)
    })
    test_feature['qfreq_diff'] = np.abs(test_feature.freq1 - test_feature.freq2)
    np.save('cache/question-freq/test.npy', test_feature.values)

    with self.output().open('w') as f:
        f.write(str(train_feature.groupby(train_data.is_duplicate).qfreq_diff.mean()))
def requires(self):
    yield TrainVWData()
    yield ValidVWData()
    yield MergeVWData()
    yield TestVWData()
    yield dataset.Dataset()