Example #1
    def run(self):
        self.analyzer = SentimentIntensityAnalyzer()
        self.output().makedirs()

        data = dataset.Dataset().load_named(self.dataset)
        data_sent = self.calc_sent(data, self.dataset)
        np.save('cache/sentiment/{}.npy'.format(self.dataset), data_sent)
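The calc_sent helper invoked above is not shown in this listing. Below is a hypothetical sketch of it, assuming it scores each question with VADER's polarity_scores and reduces the four components ('neg', 'neu', 'pos', 'compound' -- the same four names Example #9 later uses) to one row per pair; the reduction by absolute difference and the use of the raw question columns are guesses, not the original implementation. It also assumes numpy (np) and tqdm are imported at module level.

    def calc_sent(self, data, name):
        def scores(text):
            s = self.analyzer.polarity_scores(text)
            return np.array([s['neg'], s['neu'], s['pos'], s['compound']])

        rows = []
        for q1, q2 in tqdm(zip(data.question1_raw, data.question2_raw),
                           total=data.shape[0],
                           desc='Scoring %s sentiment' % name):
            # Absolute difference of the two questions' scores (assumption).
            rows.append(np.abs(scores(q1) - scores(q2)))
        return np.vstack(rows)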
Example #2
        def run(self):
            self.output().makedirs()
            kvecs = gensim.models.KeyedVectors.load_word2vec_format(w2v_file)
            data = dataset.Dataset().load_named(self.dataset)
            dists = np.zeros(data.shape[0])
            i = 0

            #def wmd(row):
            #    return kvecs.wmdistance(row.question1_raw, row.question2_raw)
            #dists = dask.dataframe.from_pandas(data, npartitions=16, sort=False).apply(wmd).compute().values

            for q1, q2 in tqdm(zip(data.question1_raw, data.question2_raw),
                               total=data.question1_raw.shape[0],
                               desc='Computing %s WMD' % self.dataset):
                dists[i] = kvecs.wmdistance(q1, q2)
                i += 1

            np.save(
                'cache/functional-distance/%s-%s.tmp.npy' %
                (self.name, self.dataset), dists)
            os.rename(
                'cache/functional-distance/%s-%s.tmp.npy' %
                (self.name, self.dataset),
                'cache/functional-distance/%s-%s.npy' %
                (self.name, self.dataset))
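Examples #2 and #11 both save to a .tmp.npy file and then os.rename it into place, so the cached array only appears under its final name once it is fully written. A small helper capturing the same pattern (the function name is mine, not from the source):

    import os
    import numpy as np

    def atomic_np_save(path, array):
        """Write `array` to `path` via a temp file plus os.rename, so a
        reader never sees a partially written cache file."""
        assert path.endswith('.npy')
        tmp_path = path[:-len('.npy')] + '.tmp.npy'
        np.save(tmp_path, array)
        os.rename(tmp_path, path)  # atomic when both paths share a filesystem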
Example #3
 def requires(self):
     yield dataset.Dataset()
     yield tfidf_matrix.TFIDFFeature()
     yield question_vectors.QuestionVector()
     yield distances.AllDistances()
     yield shared_entites.SharedEntities()
     yield wordmat_distance.WordMatDistance()
Example #4
 def vectorize_dataset(self, dataset_name):
     examples = dataset.Dataset().load_named(dataset_name)
     all_examples = pandas.concat(
         [examples.question1_tokens, examples.question2_tokens])
     all_vecs = np.vstack(all_examples.progress_apply(
         self.vectorize_sent)).astype(np.float16)
     half = all_vecs.shape[0] // 2
     return all_vecs[:half], all_vecs[half:]
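The vectorize_sent method applied above is not part of this listing. A plausible sketch, assuming the tokens are plain strings and the class holds a gensim KeyedVectors lookup in self.kvecs (both the attribute name and the mean-pooling strategy are assumptions):

    def vectorize_sent(self, tokens):
        # Mean-pool the in-vocabulary word vectors; fall back to zeros when
        # every token is out of vocabulary.
        vecs = [self.kvecs[tok] for tok in tokens if tok in self.kvecs]
        if not vecs:
            return np.zeros(self.kvecs.vector_size, dtype=np.float32)
        return np.mean(vecs, axis=0)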
Example #5
    def testicles(self):
        X = self._load_named('train')
        y = dataset.Dataset().load_named('train').is_duplicate.values

        cls = lightgbm.LGBMClassifier(num_leaves=512, n_estimators=500)
        cls.fit(X.values, y)
        X_test = self._load_named('valid').values
        y_test = dataset.Dataset().load_named('valid').is_duplicate.values
        y_pred = cls.predict_proba(X_test)[:, 1]

        scoring = core.score_data(y_test, y_pred)
        importances = pandas.Series(cls.feature_importances_, index=X.columns)
        print(scoring)
        print(importances)
        with self.output().open('w') as f:
            f.write("Score: {:f}\n".format(scoring))
            f.write(str(importances))
Example #6
    def load_data(self, subset):
        X1 = feature_collection.FeatureCollection().load_named(subset)
        X2 = count_matrix.CountFeature.load_dataset(subset)
        y = dataset.Dataset().load_named(subset).is_duplicate.values

        res = sp.hstack([X1.values, X2])
        cols = list(X1.columns) + ['count.%d' % i for i in range(X2.shape[1])]
        return res, y, cols
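The tuple returned here lines up with what the classifier in Example #5 consumes; a minimal usage sketch that simply reuses Example #5's hyperparameters (assuming lightgbm here refers to the PyPI package, not the local module of the same name used elsewhere in this listing):

    X_train, y_train, cols = self.load_data('train')
    X_valid, y_valid, _ = self.load_data('valid')

    cls = lightgbm.LGBMClassifier(num_leaves=512, n_estimators=500)
    cls.fit(X_train.tocsr(), y_train)          # sp.hstack returns a COO matrix
    y_pred = cls.predict_proba(X_valid.tocsr())[:, 1]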
Example #7
    def valid(self):
        pred = self.predict('valid')
        print(colors.green | "prediction sample...")
        print(colors.green | str(pred.head()))
        y = dataset.Dataset().load()[2]
        loss = core.score_data(y.is_duplicate, pred)
        print(colors.green | "Performance: " + str(loss))

        return pred
Example #8
    def valid(self):
        pred = self.pred_simple_target('valid')
        print(colors.green | "prediction sample...")
        print(colors.green | str(pred.head()))
        y = dataset.Dataset().load()[2]
        weights = core.weights[y.is_duplicate.values]
        loss = metrics.log_loss(y.is_duplicate,
                                pred.is_duplicate,
                                sample_weight=weights)
        print(colors.green | "Performance: " + str(loss))

        return pred
Example #9
    def run(self):
        X = np.load('cache/sentiment/valid.npy')
        y = dataset.Dataset().load_named('valid').is_duplicate.values.astype(
            int)
        X_test = np.load('cache/sentiment/merge.npy')
        y_test = dataset.Dataset().load_named(
            'merge').is_duplicate.values.astype(int)
        summary_cls = ensemble.ExtraTreesClassifier(n_estimators=200,
                                                    n_jobs=-1)
        summary_cls.fit(X, y)
        perf = summary_cls.predict_proba(X_test)
        importances = pandas.Series(summary_cls.feature_importances_,
                                    index=['neg', 'neu', 'pos', 'compound'])
        score = metrics.log_loss(y_test, perf)

        print(score)
        print(importances)
        with self.output().open('w') as f:
            f.write(str(score))
            f.write('\n')
            f.write(str(importances))
Example #10
 def complete(self):
     # The 'False and' short-circuit disables the chunked per-file check below.
     if False and self.data_subset == 'test':
         if not os.path.exists('cache/vw_data/test_0.svm'):
             return False
         test_size = dataset.Dataset().load_test().shape[0]
         target_ixs = self.test_target_indexes(test_size)
         for target_ix in target_ixs:
             if not os.path.exists('cache/vw_data/test_%d.svm' % target_ix):
                 return False
         return True
     else:
         return os.path.exists('cache/vw_data/%s.svm' % self.data_subset)
Example #11
 def run(self):
     self.output().makedirs()
     data = dataset.Dataset().load_named(self.dataset)
     dists = np.zeros(data.shape[0])
     i = 0
     for _, row in tqdm(data.iterrows(),
                        desc='%s distance %s' % (self.dataset, self.name),
                        total=data.shape[0]):
         dists[i] = self.dist_fn(row.question1_tokens, row.question2_tokens)
         i += 1
     np.save(
         'cache/functional-distance/%s-%s.tmp.npy' %
         (self.name, self.dataset), dists)
     os.rename(
         'cache/functional-distance/%s-%s.tmp.npy' %
         (self.name, self.dataset),
         'cache/functional-distance/%s-%s.npy' % (self.name, self.dataset))
Example #12
    def run(self):
        train_data, _, _ = dataset.Dataset().load()
        vocab_count = Counter()
        for sent in tqdm.tqdm(train_data.question1_tokens,
                              desc='Counting questions one',
                              total=train_data.shape[0]):
            for tok in sent:
                vocab_count[tok] += 1

        for sent in tqdm.tqdm(train_data.question2_tokens,
                              desc='Counting questions two',
                              total=train_data.shape[0]):
            for tok in sent:
                vocab_count[tok] += 1

        vocab_counts = pandas.Series(vocab_count)
        self.output().makedirs()
        vocab_counts.to_msgpack(self.output().path)
Example #13
    def test(self):
        test_size = dataset.Dataset().load_test().shape[0]
        test_tasks = lightgbm.SVMData.test_target_indexes(test_size)
        print(colors.green & colors.bold
              | "Predicting test values, this takes a long time...")
        for target_ix in tqdm(test_tasks, desc='Predicting'):
            with open('cache/xgb/test.conf', 'w') as f:
                f.write(self.test_conf % (target_ix, target_ix))
            local[self.xgb_path]['cache/xgb/test.conf'] & FG

        preds = []
        for target_ix in tqdm(test_tasks, desc='Reading results file'):
            pred = pandas.read_csv('./cache/xgb/test_preds_%d.csv' % target_ix,
                                   names=['is_duplicate'])
            pred.index = pandas.Series(np.arange(
                target_ix, min(test_size,
                               target_ix + lightgbm.SVMData.max_size)),
                                       name='test_id')
            preds.append(pred)
        preds = pandas.concat(preds, 0)
        return preds
Example #14
    def run(self):
        def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
            # Clean the text, with the option to remove stopwords and to stem words.

            # Convert words to lower case and split them
            text = text.lower().split()

            text = " ".join(text)

            # Clean the text
            text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
            text = re.sub(r"what's", "what is ", text)
            text = re.sub(r"\'s", " ", text)
            text = re.sub(r"\'ve", " have ", text)
            text = re.sub(r"can't", "cannot ", text)
            text = re.sub(r"n't", " not ", text)
            text = re.sub(r"i'm", "i am ", text)
            text = re.sub(r"\'re", " are ", text)
            text = re.sub(r"\'d", " would ", text)
            text = re.sub(r"\'ll", " will ", text)
            text = re.sub(r",", " ", text)
            text = re.sub(r"\.", " ", text)
            text = re.sub(r"!", " ! ", text)
            text = re.sub(r"\/", " ", text)
            text = re.sub(r"\^", " ^ ", text)
            text = re.sub(r"\+", " + ", text)
            text = re.sub(r"\-", " - ", text)
            text = re.sub(r"\=", " = ", text)
            text = re.sub(r"'", " ", text)
            text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
            text = re.sub(r":", " : ", text)
            text = re.sub(r" e g ", " eg ", text)
            text = re.sub(r" b g ", " bg ", text)
            text = re.sub(r" u s ", " american ", text)
            text = re.sub(r"\0s", "0", text)
            text = re.sub(r" 9 11 ", "911", text)
            text = re.sub(r"e - mail", "email", text)
            text = re.sub(r"j k", "jk", text)
            text = re.sub(r"\s{2,}", " ", text)

            # Optionally, shorten words to their stems
            if stem_words:
                text = text.split()
                stemmer = SnowballStemmer('english')
                stemmed_words = [stemmer.stem(word) for word in text]
                text = " ".join(stemmed_words)

            # Return a list of words
            return (text)

        train_texts_1 = []
        train_texts_2 = []
        train_labels = []

        train_dataset = dataset.Dataset().load_named('train')
        for _, row in tqdm(train_dataset.iterrows(),
                           total=train_dataset.shape[0]):
            train_texts_1.append(text_to_wordlist(row.question1_raw))
            train_texts_2.append(text_to_wordlist(row.question2_raw))
            train_labels.append(row.is_duplicate)
        print('Found %s texts in train.csv' % len(train_texts_1))

        valid_texts_1 = []
        valid_texts_2 = []
        valid_labels = []

        valid_dataset = dataset.Dataset().load_named('valid')
        for _, row in tqdm(valid_dataset.iterrows(),
                           total=valid_dataset.shape[0]):
            valid_texts_1.append(text_to_wordlist(row.question1_raw))
            valid_texts_2.append(text_to_wordlist(row.question2_raw))
            valid_labels.append(row.is_duplicate)
        print('Found %s texts in valid.csv' % len(valid_texts_1))

        merge_texts_1 = []
        merge_texts_2 = []
        merge_labels = []

        merge_dataset = dataset.Dataset().load_named('merge')
        for _, row in tqdm(merge_dataset.iterrows(),
                           total=merge_dataset.shape[0]):
            merge_texts_1.append(text_to_wordlist(row.question1_raw))
            merge_texts_2.append(text_to_wordlist(row.question2_raw))
            merge_labels.append(row.is_duplicate)
        print('Found %s texts in merge.csv' % len(merge_texts_1))

        test_texts_1 = []
        test_texts_2 = []

        test_dataset = dataset.Dataset().load_named('test')
        for _, row in tqdm(test_dataset.iterrows(),
                           total=test_dataset.shape[0]):
            test_texts_1.append(text_to_wordlist(row.question1_raw))
            test_texts_2.append(text_to_wordlist(row.question2_raw))
        print('Found %s texts in test.csv' % len(test_texts_1))

        tokenizer = Tokenizer(num_words=self.max_nb_words)
        tokenizer.fit_on_texts(train_texts_1 + valid_texts_1 + merge_texts_1 +
                               test_texts_1 + train_texts_2 + valid_texts_2 +
                               merge_texts_2 + test_texts_2)

        train_sequences_1 = tokenizer.texts_to_sequences(train_texts_1)
        train_sequences_2 = tokenizer.texts_to_sequences(train_texts_2)
        valid_sequences_1 = tokenizer.texts_to_sequences(valid_texts_1)
        valid_sequences_2 = tokenizer.texts_to_sequences(valid_texts_2)
        merge_sequences_1 = tokenizer.texts_to_sequences(merge_texts_1)
        merge_sequences_2 = tokenizer.texts_to_sequences(merge_texts_2)
        test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
        test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)

        word_index = tokenizer.word_index
        print('Found %s unique tokens' % len(word_index))

        train_data_1 = pad_sequences(train_sequences_1,
                                     maxlen=self.MAX_SEQUENCE_LENGTH)
        train_data_2 = pad_sequences(train_sequences_2,
                                     maxlen=self.MAX_SEQUENCE_LENGTH)
        valid_data_1 = pad_sequences(valid_sequences_1,
                                     maxlen=self.MAX_SEQUENCE_LENGTH)
        valid_data_2 = pad_sequences(valid_sequences_2,
                                     maxlen=self.MAX_SEQUENCE_LENGTH)
        merge_data_1 = pad_sequences(merge_sequences_1,
                                     maxlen=self.MAX_SEQUENCE_LENGTH)
        merge_data_2 = pad_sequences(merge_sequences_2,
                                     maxlen=self.MAX_SEQUENCE_LENGTH)
        test_data_1 = pad_sequences(test_sequences_1,
                                    maxlen=self.MAX_SEQUENCE_LENGTH)
        test_data_2 = pad_sequences(test_sequences_2,
                                    maxlen=self.MAX_SEQUENCE_LENGTH)

        train_labels = np.array(train_labels)
        valid_labels = np.array(valid_labels)
        merge_labels = np.array(merge_labels)

        English = spacy.en.English()
        embedding_matrix = np.zeros((self.max_nb_words, 300))
        for word, i in word_index.items():
            # Tokenizer.word_index covers the full fitted vocabulary, not just
            # the top num_words, so skip indices past the embedding matrix.
            if i >= self.max_nb_words:
                continue
            lex = English.vocab[word]
            if not lex.is_oov:
                embedding_matrix[i] = lex.vector
        print('Null word embeddings: %d' %
              np.sum(np.sum(embedding_matrix, axis=1) == 0))

        self.output().makedirs()
        np.save('cache/kaggledata/train_1.npy', train_data_1)
        np.save('cache/kaggledata/train_2.npy', train_data_2)
        np.save('cache/kaggledata/train_labels.npy', train_labels)

        np.save('cache/kaggledata/valid_1.npy', valid_data_1)
        np.save('cache/kaggledata/valid_2.npy', valid_data_2)
        np.save('cache/kaggledata/valid_labels.npy', valid_labels)

        np.save('cache/kaggledata/merge_1.npy', merge_data_1)
        np.save('cache/kaggledata/merge_2.npy', merge_data_2)
        np.save('cache/kaggledata/merge_labels.npy', merge_labels)

        np.save('cache/kaggledata/test_1.npy', test_data_1)
        np.save('cache/kaggledata/test_2.npy', test_data_2)

        np.save('cache/kaggledata/embedding.npy', embedding_matrix)

        with self.output().open('w'):
            pass
Example #15
    def run(self):
        assert self.data_subset in {'train', 'test', 'merge', 'valid'}
        if self.data_subset in {'train', 'valid', 'merge'}:
            ix = {'train': 0, 'merge': 1, 'valid': 2}[self.data_subset]
            #vecs = tfidf_matrix.TFIDFFeature.load_dataset(self.data_subset)
            vecs = count_matrix.CountFeature.load_dataset(self.data_subset)
            qvecs = question_vectors.QuestionVector().load_named(
                self.data_subset)
            dvecs = distances.AllDistances().load()[ix]
            evecs = shared_entites.SharedEntities().load_named(
                self.data_subset)
            wmvecs = wordmat_distance.WordMatDistance().load_named(
                self.data_subset)
            labels = dataset.Dataset().load()[ix].is_duplicate.values
        else:
            #vecs = tfidf_matrix.TFIDFFeature.load_dataset('test')
            vecs = count_matrix.CountFeature.load_dataset('test')
            qvecs = question_vectors.QuestionVector().load_named('test')
            dvecs = distances.AllDistances().load_named('test')
            evecs = shared_entites.SharedEntities().load_named('test')
            wmvecs = wordmat_distance.WordMatDistance().load_named('test')
            labels = np.zeros(qvecs.shape[0], dtype='uint8')

        qvec_offset = 1
        dvec_offset = qvecs.shape[1]
        evec_offset = dvec_offset + dvecs.shape[1]
        wmvec_offset = evec_offset + evecs.shape[1]
        vecs_offset = wmvec_offset + wmvecs.shape[1]

        def write_row(i, f):
            row = vecs[i]
            qvec = qvecs[i].copy()
            dvec = dvecs[i].copy()
            evec = evecs[i].copy()
            wmvec = wmvecs[i].copy()
            label = labels[i] * 2 - 1
            qvec[np.isnan(qvec)] = -1
            dvec[np.isnan(dvec)] = -1
            evec[np.isnan(evec)] = -1
            wmvec[np.isnan(wmvec)] = -1

            qvec_entries = ' '.join('%d:%.2f' % ix_v
                                    for ix_v in enumerate(qvec))
            dvec_entries = ' '.join('%d:%.2f' % ix_v
                                    for ix_v in enumerate(dvec))
            evec_entries = ' '.join('%d:%.2f' % ix_v
                                    for ix_v in enumerate(evec))
            wmvec_entries = ' '.join('%d:%.2f' % ix_v
                                     for ix_v in enumerate(wmvec))
            entries = " ".join(("%d:%.2f" % (ind + vecs_offset, data)
                                for ind, data in zip(row.indices, row.data)))
            f.write("%d %f |Q %s |D %s |E %s |M %s |W %s\n" %
                    (label, core.weights[label], qvec_entries, dvec_entries,
                     evec_entries, wmvec_entries, entries))

        os.makedirs('cache/vw_data', exist_ok=True)
        # The 'if False' toggle disables the chunked test-set writer below, so
        # only the else branch runs.
        if False:
            for start_ix in tqdm(self.test_target_indexes(vecs.shape[0])):
                with open('cache/vw_data/test_%d_tmp.svm' % start_ix,
                          'w') as f:
                    for i in range(
                            start_ix,
                            min(start_ix + self.max_size, vecs.shape[0])):
                        write_row(i, f)
                os.rename('cache/vw_data/test_%d_tmp.svm' % start_ix,
                          'cache/vw_data/test_%d.svm' % start_ix)
        else:
            with open('cache/vw_data/%s_tmp.svm' % self.data_subset, 'w') as f:
                for i in tqdm(range(qvecs.shape[0]),
                              desc='writing %s data' % self.data_subset):
                    write_row(i, f)
            os.rename('cache/vw_data/%s_tmp.svm' % self.data_subset,
                      'cache/vw_data/%s.svm' % self.data_subset)
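Despite the .svm extension, the lines written by write_row are in Vowpal Wabbit input format: a {-1, +1} label, a per-example weight, then one namespace per feature block (Q question vectors, D distances, E shared entities, M word-matrix distances, W the count matrix). An emitted line therefore looks roughly like this (values invented for illustration):

    1 1.000000 |Q 0:0.12 1:-1.00 |D 0:3.40 1:0.87 |E 0:0.00 |M 0:0.75 |W 1203:1.00 2210:2.00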
Example #16
 def requires(self):
     yield dataset.Dataset()
     yield count_matrix.CountFeature()
     yield feature_collection.FeatureCollection()
Example #17
 def load_data(self, subset):
     X = feature_collection.FeatureCollection().load_named(subset)
     y = dataset.Dataset().load_named(subset).is_duplicate.values
     cols = X.columns
     return X.values, y, cols
Example #18
 def requires(self):
     yield dataset.Dataset()
     yield vocab.Vocab()
Example #19
 def requires(self):
     yield lightgbm.TrainSVMData()
     yield lightgbm.ValidSVMData()
     yield lightgbm.MergeSVMData()
     yield lightgbm.TestSVMData()
     yield dataset.Dataset()
Example #20
 def requires(self):
     yield dataset.Dataset()
Example #21
 def requires(self):
     yield count_matrix.CountFeature()
     yield dataset.Dataset()
Example #22
 def requires(self):
     return dataset.Dataset()
Example #23
    def run(self):
        self.output().makedirs()
        train_data = dataset.Dataset().load_named('train')
        merge_data = dataset.Dataset().load_named('merge')
        valid_data = dataset.Dataset().load_named('valid')
        test_data = dataset.Dataset().load_named('test')
        all_questions = pandas.concat([
            train_data.question1_raw,
            train_data.question2_raw,
            merge_data.question1_raw,
            merge_data.question2_raw,
            valid_data.question1_raw,
            valid_data.question2_raw,
            test_data.question1_raw,
            test_data.question2_raw,
        ])

        question_freq = all_questions.value_counts().to_dict()

        train_feature = pandas.DataFrame({
            'freq1':
            train_data.question1_raw.apply(question_freq.get),
            'freq2':
            train_data.question2_raw.apply(question_freq.get)
        })
        train_feature['qfreq_diff'] = np.abs(train_feature.freq1 -
                                             train_feature.freq2)
        np.save('cache/question-freq/train.npy', train_feature.values)

        merge_feature = pandas.DataFrame({
            'freq1':
            merge_data.question1_raw.apply(question_freq.get),
            'freq2':
            merge_data.question2_raw.apply(question_freq.get)
        })
        merge_feature['qfreq_diff'] = np.abs(merge_feature.freq1 -
                                             merge_feature.freq2)
        np.save('cache/question-freq/merge.npy', merge_feature.values)

        valid_feature = pandas.DataFrame({
            'freq1':
            valid_data.question1_raw.apply(question_freq.get),
            'freq2':
            valid_data.question2_raw.apply(question_freq.get)
        })
        valid_feature['qfreq_diff'] = np.abs(valid_feature.freq1 -
                                             valid_feature.freq2)
        np.save('cache/question-freq/valid.npy', valid_feature.values)

        test_feature = pandas.DataFrame({
            'freq1':
            test_data.question1_raw.apply(question_freq.get),
            'freq2':
            test_data.question2_raw.apply(question_freq.get)
        })
        test_feature['qfreq_diff'] = np.abs(test_feature.freq1 -
                                            test_feature.freq2)
        np.save('cache/question-freq/test.npy', test_feature.values)

        with self.output().open('w') as f:
            f.write(
                str(
                    train_feature.groupby(
                        train_data.is_duplicate).qfreq_diff.mean()))
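The arrays saved by this task can be reloaded downstream as three-column feature blocks; a minimal reader sketch whose column names simply mirror the DataFrame built above:

    feats = np.load('cache/question-freq/train.npy')
    train_freq = pandas.DataFrame(feats,
                                  columns=['freq1', 'freq2', 'qfreq_diff'])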
Example #24
 def requires(self):
     yield TrainVWData()
     yield ValidVWData()
     yield MergeVWData()
     yield TestVWData()
     yield dataset.Dataset()