    def pipeline(self, X, pos_tag_x):
        # Build a padded (documents, segments, features) tensor from sliding
        # word/POS chunks of each text.
        X = [preprocessor.process_text(x) for x in X]

        X_word_chunks = word_chunks(X, n=300, process=True, sliding=True)
        # NOTE: computed here but not used further in this method.
        X_char_chunks = char_chunks(X, n=2000, sliding=True)
        X_pos_chunks = word_chunks(pos_tag_x,
                                   n=300,
                                   process=True,
                                   sliding=True)

        max_segments = 20
        lexical_features = sequence.pad_sequences(lexical(X_word_chunks),
                                                  maxlen=max_segments)
        stop_word_features = sequence.pad_sequences(
            self.ngrams.get_stop_words(X_word_chunks), maxlen=max_segments)
        function_word_features = sequence.pad_sequences(
            self.ngrams.get_function_words(X_word_chunks), maxlen=max_segments)
        pos_tag_features = sequence.pad_sequences(
            self.ngrams.get_pos_tags(X_pos_chunks), maxlen=max_segments)
        word_frequency = sequence.pad_sequences(
            wf.average_word_frequency(X_word_chunks), maxlen=max_segments)
        readability_features = sequence.pad_sequences(
            readability(X_word_chunks), maxlen=max_segments)

        return np.concatenate([
            lexical_features, stop_word_features, function_word_features,
            pos_tag_features, word_frequency, readability_features
        ], axis=2)
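# The pad-then-stack step above converts ragged per-chunk feature lists into a
# fixed (documents, segments, features) tensor. A minimal self-contained
# sketch, assuming keras-style pad_sequences; the toy feature lists below are
# illustrative, not real lexical()/readability() output:
import numpy as np
from tensorflow.keras.preprocessing import sequence

# Two documents with 2 and 3 chunks; one 4-dim feature vector per chunk.
group_a = [[[0.1, 0.2, 0.3, 0.4]] * 2, [[0.5, 0.6, 0.7, 0.8]] * 3]
# A second feature group with 2-dim vectors per chunk.
group_b = [[[1.0, 1.1]] * 2, [[1.2, 1.3]] * 3]

max_segments = 20
# pad_sequences left-pads every document to max_segments chunks; dtype='float32'
# keeps real-valued features from being truncated to the default int32.
a = sequence.pad_sequences(group_a, maxlen=max_segments, dtype='float32')
b = sequence.pad_sequences(group_b, maxlen=max_segments, dtype='float32')

# axis=2 concatenation stacks the feature groups chunk-by-chunk.
stacked = np.concatenate([a, b], axis=2)
print(stacked.shape)  # (2, 20, 6)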
# Example #2
    def predict(self, test_x):
        predictions = []

        test_x_preprocessed = [preprocessor.process_text(x) for x in test_x]
        test_word_chunks = word_chunks(test_x,
                                       **self.params['word_chunk_params'])
        test_word_chunks_preprocessed = word_chunks(test_x_preprocessed,
                                                    chunks=3,
                                                    process=True,
                                                    sliding=True)

        cnt = 0
        if self.params['use_raw_text_models']:
            for model in self.raw_text_models:
                print("Predicting via LightGBM ...")
                test_probs = model.predict_proba(test_x).tolist()
                predictions.append(test_probs)
                if self.params['output_probabilities_test']:
                    res = pd.DataFrame(np.array(test_probs))
                    res.to_csv('model_{0}_test.csv'.format(cnt))
                    cnt += 1
        else:
            print("No")

        print("Computed word chunks")

        for (transformer, apply_on_word_chunks, preprocess, scaler,
             predictors), model_weights in zip(self.stack,
                                               self.global_model_weights):
            if preprocess:
                raw_test = test_word_chunks_preprocessed if apply_on_word_chunks else test_x_preprocessed
            else:
                raw_test = test_word_chunks if apply_on_word_chunks else test_x

            data_transformed = scaler.transform(transformer(raw_test))

            local_predictions = [
                predictor.predict_proba(data_transformed).tolist()
                for predictor in predictors
            ]
            predictions.append(
                self.weight_local_predictions(local_predictions,
                                              model_weights))

            if self.params['output_probabilities_test']:
                # Dump the probabilities of the model group just appended.
                res = pd.DataFrame(np.array(predictions[-1]))
                res.to_csv('model_{0}_test.csv'.format(cnt))
                cnt += 1

        if self.params['meta_learner']:
            prediction_converted = self.convert_to_meta_input(predictions)

            return self.meta_learner.predict(prediction_converted)
        else:
            predictions_prob = [[sum(y) for y in zip(*x)]
                                for x in zip(*predictions)]

            return [x[0] < x[1] for x in predictions_prob]
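# When no meta-learner is configured, predict() above falls back to a plain
# soft vote: predictions is indexed as [model][sample][class], per-class
# probabilities are summed across models, and class 1 wins when its total is
# larger. A minimal sketch with made-up numbers:
model_a = [[0.75, 0.25], [0.5, 0.5]]   # per-sample [P(class 0), P(class 1)]
model_b = [[0.5, 0.5], [0.25, 0.75]]
predictions = [model_a, model_b]

predictions_prob = [[sum(y) for y in zip(*x)] for x in zip(*predictions)]
labels = [x[0] < x[1] for x in predictions_prob]
print(predictions_prob)  # [[1.25, 0.75], [0.75, 1.25]]
print(labels)            # [False, True]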
def main(cutoff=None, persist=False):
    train_x, train_y, train_positions, train_file_names = get_data(
        main_dir=TRAINING_DIR)

    if cutoff:
        train_x = train_x[:cutoff]
        train_y = train_y[:cutoff]
        train_positions = train_positions[:cutoff]

    df = pd.DataFrame(data={'label': train_y, 'pos': train_positions})

    pos_tag_x = [NGrams.to_pos_tags(x) for x in train_x]
    ngrams = NGrams(train_x, pos_tag_x)

    X = [preprocessor.process_text(x) for x in train_x]

    X_word_chunks = word_chunks(X, n=300, process=True)
    X_pos_chunks = word_chunks(pos_tag_x, n=300, process=True)

    fmap = {
        'lexical_features': lexical(X_word_chunks),
        'stop_word_features': ngrams.get_stop_words(X_word_chunks),
        'function_word_features': ngrams.get_function_words(X_word_chunks),
        'pos_tag_features': ngrams.get_pos_tags(X_pos_chunks),
        'process_tag_features': processed_tags(X_word_chunks),
        'word_frequency': wf.average_word_frequency(X_word_chunks),
        'readability_features': readability(X_word_chunks)
    }

    for key, feature in fmap.items():
        df[key] = feature

    if persist:
        df.to_csv(TRAIN_CSV_FILE)
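# The fmap loop above attaches each feature list as a DataFrame column, one
# entry per training document, next to 'label' and 'pos'. A self-contained
# sketch with stand-in values (the real columns hold per-chunk feature lists):
import pandas as pd

train_y = [0, 1]
train_positions = [[], [120]]
fmap = {'word_frequency': [[3.1, 2.8], [4.0, 3.5, 3.2]]}

df = pd.DataFrame(data={'label': train_y, 'pos': train_positions})
for key, feature in fmap.items():
    df[key] = feature  # object column: one list per document
print(df)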
# Example #4
def main(cutoff=None, persist=True):
    train_x, train_y, train_positions, train_file_names = get_data(
        #main_dir=TRAINING_DIR,
        external_file=TRAINING_EXTERNAL_FILE
    )

    if cutoff:
        train_x = train_x[:cutoff]
        train_y = train_y[:cutoff]
        train_positions = train_positions[:cutoff]

    pos_tag_x = [NGrams.to_pos_tags(x) for x in train_x]
    ngrams = NGrams(train_x, pos_tag_x)

    X = [preprocessor.process_text(x) for x in train_x]

    X_word_chunks = word_chunks(X, n=300, process=True, sliding=True)
    #print('Word', max([len(s) for s in X_word_chunks]))
    # NOTE: computed here but not used further in this function.
    X_char_chunks = char_chunks(X, n=2000, sliding=True)
    #print('Char', max([len(s) for s in X_char_chunks]))

    X_pos_chunks = word_chunks(pos_tag_x, n=300, process=True, sliding=True)

    max_segments = 20

    lexical_features = sequence.pad_sequences(lexical(X_word_chunks), maxlen=max_segments)
    stop_word_features = sequence.pad_sequences(ngrams.get_stop_words(X_word_chunks), maxlen=max_segments)
    function_word_features = sequence.pad_sequences(ngrams.get_function_words(X_word_chunks), maxlen=max_segments)
    pos_tag_features = sequence.pad_sequences(ngrams.get_pos_tags(X_pos_chunks), maxlen=max_segments)
    word_frequency = sequence.pad_sequences(wf.average_word_frequency(X_word_chunks), maxlen=max_segments)
    readability_features = sequence.pad_sequences(readability(X_word_chunks), maxlen=max_segments)

    # lexical_features = lexical(X_word_chunks)
    # stop_word_features = ngrams.get_stop_words(X_word_chunks)
    # function_word_features = ngrams.get_function_words(X_word_chunks)
    # pos_tag_features = ngrams.get_pos_tags(X_pos_chunks)
    # process_tag_features = processed_tags(X_word_chunks)
    # word_frequency = wf.average_word_frequency(X_word_chunks)
    # readability_features = readability(X_word_chunks)
    # tfidf = ngrams.get_word_tfidf(X_word_chunks)


    X = np.concatenate([
        lexical_features, stop_word_features, function_word_features,
        pos_tag_features, word_frequency, readability_features
    ], axis=2)

    print(X.shape)

    if persist:
        np.save(TRAIN_X_FILE, X)
        np.save(TRAIN_Y_FILE, train_y)
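# word_chunks() itself is not shown in these snippets. A hedged guess at a
# minimal sliding word-chunker consistent with the calls above (n words per
# chunk; the 50% overlap stride is an assumption):
def sliding_word_chunks(text, n=300, stride=None):
    words = text.split()
    stride = stride or n // 2
    starts = range(0, max(len(words) - n, 0) + 1, stride)
    return [' '.join(words[i:i + n]) for i in starts] or [text]

print(len(sliding_word_chunks('word ' * 900, n=300)))  # 5 overlapping chunks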
# Example #5
    def predict(self, test_x):
        test_x_preprocessed = [preprocessor.process_text(x) for x in test_x]

        predictions = []

        test_word_chunks = word_chunks(test_x,
                                       **self.params['word_chunk_params'])
        test_word_chunks_preprocessed = word_chunks(test_x_preprocessed,
                                                    chunks=3,
                                                    process=True,
                                                    sliding=True)

        print("Computed word chunks")

        for (transformer, apply_on_word_chunks, preprocess, scaler,
             predictors), model_weights in zip(self.stack,
                                               self.global_model_weights):
            if preprocess:
                raw_test = test_word_chunks_preprocessed if apply_on_word_chunks else test_x_preprocessed
            else:
                raw_test = test_word_chunks if apply_on_word_chunks else test_x

            data_transformed = scaler.transform(transformer(raw_test))

            local_predictions = [
                predictor.predict_proba(data_transformed).tolist()
                for predictor in predictors
            ]

            predictions.append(
                self.weight_local_predictions(local_predictions,
                                              model_weights))

        if self.params['meta_learner']:
            prediction_converted = self.convert_to_meta_input(predictions)

            return self.meta_learner.predict(prediction_converted)
        else:
            predictions_prob = [[sum(y) for y in zip(*x)]
                                for x in zip(*predictions)]

            return [x[0] < x[1] for x in predictions_prob]
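# weight_local_predictions() is not shown. A hedged sketch consistent with how
# model_weights are produced in stack_fit_predict (an accuracy-weighted soft
# vote across the predictors of one stack entry); this is an assumption, not
# the original implementation:
def weight_local_predictions(local_predictions, model_weights):
    # local_predictions: [model][sample][class]; model_weights sum to 1.
    return [[sum(w * p for w, p in zip(model_weights, class_probs))
             for class_probs in zip(*sample_probs)]
            for sample_probs in zip(*local_predictions)]

print(weight_local_predictions(
    [[[0.75, 0.25]], [[0.25, 0.75]]], [0.5, 0.5]))  # [[0.5, 0.5]]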
# Example #6
    def pipeline(self, X, pos_tag_x, fit_scalers):
        # NOTE: `fit_scalers` is accepted but unused; this method scales with
        # minmax_scale below instead.
        feature_names = []

        X = [preprocessor.process_text(x) for x in X]

        X_word_chunks = word_chunks(X, chunks=4, process=True, sliding=True)
        X_pos_chunks = word_chunks(pos_tag_x,
                                   chunks=4,
                                   process=True,
                                   sliding=True)

        lexical_features = max_diff(lexical(X_word_chunks, feature_names))

        stop_word_features = max_diff(
            self.ngrams.get_stop_words(X_word_chunks, feature_names))

        function_word_features = max_diff(
            self.ngrams.get_function_words(X_word_chunks, feature_names))

        pos_tag_features = max_diff(
            self.ngrams.get_pos_tags(X_pos_chunks, feature_names))

        process_tag_features = max_diff(
            processed_tags(X_word_chunks, feature_names))

        word_frequency = max_diff(
            wf.average_word_frequency(X_word_chunks, feature_names))
        readability_features = max_diff(
            readability(X_word_chunks, feature_names=feature_names))
        #tfidf = max_diff(self.ngrams.get_word_tfidf(X_word_chunks, feature_names))
        num_par = num_paragraphs(X, feature_names)

        X = np.concatenate(
            (lexical_features, stop_word_features, function_word_features,
             pos_tag_features, process_tag_features, word_frequency,
             readability_features, num_par),
            axis=1)

        X = minmax_scale(X)

        return X, feature_names
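# max_diff() is not defined in these snippets; a plausible reading, given the
# style-change setting, is that it collapses each document's per-chunk feature
# sequence into the largest per-feature spread across chunks. A numpy sketch
# under that assumption:
import numpy as np

def max_diff(per_doc_chunks):
    # per_doc_chunks: [doc][chunk][feature] -> (docs, features) spread matrix
    return np.array([np.ptp(np.asarray(chunks), axis=0)
                     for chunks in per_doc_chunks])

print(max_diff([[[1.0, 5.0], [3.0, 2.0]]]))  # [[2. 3.]]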
def main(cutoff=10000, persist=True):
    train_x, train_y, train_positions, train_file_names = get_data(
        external_file=TRAINING_EXTERNAL_FILE
    )

    if cutoff:
        train_x = train_x[:cutoff]
        train_y = train_y[:cutoff]
        train_positions = train_positions[:cutoff]

    pos_tag_x = [NGrams.to_pos_tags(x) for x in train_x]
    ngrams = NGrams(train_x, pos_tag_x)

    X = [preprocessor.process_text(x) for x in train_x]

    X_word_chunks = word_chunks(X, n=300, process=True)
    X_pos_chunks = word_chunks(pos_tag_x, n=300, process=True)

    max_segments = 10

    lexical_features = sequence.pad_sequences(lexical(X_word_chunks), maxlen=max_segments)
    stop_word_features = sequence.pad_sequences(ngrams.get_stop_words(X_word_chunks), maxlen=max_segments)
    function_word_features = sequence.pad_sequences(ngrams.get_function_words(X_word_chunks), maxlen=max_segments)
    pos_tag_features = sequence.pad_sequences(ngrams.get_pos_tags(X_pos_chunks), maxlen=max_segments)
    process_tag_features = sequence.pad_sequences(processed_tags(X_word_chunks), maxlen=max_segments)
    word_frequency = sequence.pad_sequences(wf.average_word_frequency(X_word_chunks), maxlen=max_segments)
    readability_features = sequence.pad_sequences(readability(X_word_chunks), maxlen=max_segments)

    print(type(lexical_features))

    X = np.concatenate([
        lexical_features, stop_word_features, function_word_features,
        pos_tag_features, process_tag_features, word_frequency,
        readability_features
    ], axis=2)

    if persist:
        np.save(TRAIN_X_FILE, X)
        np.save(TRAIN_Y_FILE, train_y)
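# The persist branch writes the stacked tensor and labels with np.save so a
# later training run can np.load them instead of recomputing every feature.
# Minimal round-trip sketch (file names here are illustrative):
import numpy as np

X = np.zeros((4, 10, 6), dtype=np.float32)
train_y = np.array([0, 1, 1, 0])
np.save('train_x.npy', X)
np.save('train_y.npy', train_y)

X_loaded = np.load('train_x.npy')
y_loaded = np.load('train_y.npy')
assert X_loaded.shape == (4, 10, 6) and (y_loaded == train_y).all()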
# Example #8
    def stack_fit_predict(self,
                          train_x,
                          train_y,
                          test_x=None,
                          test_y=None,
                          test_additional=None):
        predictions = []

        train_x_preprocessed = [preprocessor.process_text(x) for x in train_x]
        train_word_chunks = word_chunks(train_x,
                                        **self.params['word_chunk_params'])
        train_word_chunks_preprocessed = word_chunks(train_x_preprocessed,
                                                     chunks=3,
                                                     process=True,
                                                     sliding=True)

        if test_x:
            test_size = len(test_x)

        cnt = 0
        if self.params['use_raw_text_models']:
            for model in self.raw_text_models:
                if self.params['fit_with_train'] and test_additional:
                    print("Fitting LightGBM ...")
                    model.fit_with_test(train_x, train_y, [], test_additional)
                else:
                    print("Fitting LightGBM without train")
                    model.fit(train_x, train_y, [])
                if test_x:
                    predictions.append(model.predict_proba(test_x).tolist())
                elif self.params['output_probabilities_train']:
                    res = pd.DataFrame(model.predict_proba(train_x))
                    res.to_csv("model_{0}_train.csv".format(cnt))
                    cnt += 1
        else:
            print("No")

        test_word_chunks = None
        if test_x:
            test_x_preprocessed = [
                preprocessor.process_text(x) for x in test_x
            ]
            test_word_chunks = word_chunks(test_x,
                                           **self.params['word_chunk_params'])
            test_word_chunks_preprocessed = word_chunks(test_x_preprocessed,
                                                        chunks=3,
                                                        process=True,
                                                        sliding=True)
        print("Computed word chunks")

        for transformer, apply_on_word_chunks, preprocess, scaler, predictors in self.stack:
            if preprocess:
                raw_data = train_word_chunks_preprocessed if apply_on_word_chunks else train_x_preprocessed
            else:
                raw_data = train_word_chunks if apply_on_word_chunks else train_x

            data_transformed_zero = scaler.fit_transform(transformer(raw_data))

            if test_x:
                if preprocess:
                    raw_test = test_word_chunks_preprocessed if apply_on_word_chunks else test_x_preprocessed
                else:
                    raw_test = test_word_chunks if apply_on_word_chunks else test_x

                data_transformed_meta = scaler.transform(transformer(raw_test))

            local_predictions = []
            for predictor in predictors:
                predictor.fit(data_transformed_zero, train_y)

                if test_x:
                    local_predictions.append(
                        predictor.predict_proba(
                            data_transformed_meta).tolist())
                if self.params['output_probabilities_train']:
                    res = pd.DataFrame(
                        predictor.predict_proba(data_transformed_zero))
                    res.to_csv("model_{0}_train.csv".format(cnt))
                    cnt += 1

            if test_x:
                scores = []
                for model_predictions in local_predictions:
                    acc = [(prob[0] < prob[1]) == truth
                           for prob, truth in zip(model_predictions, test_y)
                           ].count(True) / test_size
                    scores.append(acc)

                sum_scores = sum(scores)
                model_weights = [x / sum_scores for x in scores]

                print('Model weights: ', model_weights)
                self.global_model_weights.append(model_weights)

                predictions.append(
                    self.weight_local_predictions(local_predictions,
                                                  model_weights))

        if test_x:
            return predictions
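# The model weights appended above are held-out accuracies normalized to sum
# to one, so stronger predictors dominate the soft vote. Tiny sketch with
# made-up scores:
scores = [0.75, 0.5, 0.75]
sum_scores = sum(scores)
model_weights = [x / sum_scores for x in scores]
print(model_weights)  # [0.375, 0.25, 0.375]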
# Example #9
    def stack_fit_predict(self, train_x, train_y, test_x=None, test_y=None):
        train_x_preprocessed = [preprocessor.process_text(x) for x in train_x]
        train_word_chunks = word_chunks(train_x,
                                        **self.params['word_chunk_params'])
        train_word_chunks_preprocessed = word_chunks(train_x_preprocessed,
                                                     chunks=3,
                                                     process=True,
                                                     sliding=True)

        test_word_chunks = None
        if test_x:
            test_x_preprocessed = [
                preprocessor.process_text(x) for x in test_x
            ]
            test_word_chunks = word_chunks(test_x,
                                           **self.params['word_chunk_params'])
            test_word_chunks_preprocessed = word_chunks(test_x_preprocessed,
                                                        chunks=3,
                                                        process=True,
                                                        sliding=True)
        print("Computed word chunks")

        predictions = []

        if test_x:
            test_size = len(test_x)

        for transformer, apply_on_word_chunks, preprocess, scaler, predictors in self.stack:
            if preprocess:
                raw_data = train_word_chunks_preprocessed if apply_on_word_chunks else train_x_preprocessed
            else:
                raw_data = train_word_chunks if apply_on_word_chunks else train_x

            data_transformed_zero = scaler.fit_transform(transformer(raw_data))

            if test_x:
                if preprocess:
                    raw_test = test_word_chunks_preprocessed if apply_on_word_chunks else test_x_preprocessed
                else:
                    raw_test = test_word_chunks if apply_on_word_chunks else test_x

                data_transformed_meta = scaler.transform(transformer(raw_test))

            local_predictions = []
            for predictor in predictors:
                predictor.fit(data_transformed_zero, train_y)

                if test_x:
                    local_predictions.append(
                        predictor.predict_proba(
                            data_transformed_meta).tolist())

            if test_x:
                scores = []
                for model_predictions in local_predictions:
                    acc = [(prob[0] < prob[1]) == truth
                           for prob, truth in zip(model_predictions, test_y)
                           ].count(True) / test_size
                    scores.append(acc)

                sum_scores = sum(scores)
                model_weights = [x / sum_scores for x in scores]

                print('Model weights: ', model_weights)
                self.global_model_weights.append(model_weights)

                predictions.append(
                    self.weight_local_predictions(local_predictions,
                                                  model_weights))

        if test_x:
            return predictions
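# Accuracy per predictor, as computed in the loop above: a sample counts as
# correct when (prob[0] < prob[1]) matches the boolean truth label. Sketch
# with made-up probabilities:
model_predictions = [[0.25, 0.75], [0.75, 0.25], [0.25, 0.75], [0.5, 0.5]]
test_y = [True, False, False, True]
test_size = len(test_y)
acc = [(prob[0] < prob[1]) == truth
       for prob, truth in zip(model_predictions, test_y)].count(True) / test_size
print(acc)  # 0.5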