Example #1
    def fit(self, train_x, train_y, train_positions):
        pos_tag_x = [NGrams.to_pos_tags(x) for x in train_x]
        self.ngrams = NGrams(train_x, pos_tag_x)

        self.train_x, names = self.pipeline(train_x,
                                            pos_tag_x,
                                            fit_scalers=True)

        self.print_feature_importance(self.train_x, train_y, names)
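
Note: pipeline() is not shown in this example. A minimal sketch of the fit_scalers pattern it implies (an assumption, using scikit-learn's StandardScaler): scaling statistics are learned on the training data only and reused unchanged at prediction time.

from sklearn.preprocessing import StandardScaler

class FeaturePipeline:
    """Hypothetical sketch; the real pipeline() also assembles the
    n-gram and POS-tag features built above."""

    def transform(self, features, fit_scalers=False):
        if fit_scalers:
            self.scaler = StandardScaler()
            # Learn mean/variance from the training data only.
            return self.scaler.fit_transform(features)
        # Reuse the training-time statistics on unseen data.
        return self.scaler.transform(features)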
Example #2
    def fit(self, train_x, train_y, train_positions):
        pos_tag_x = [NGrams.to_pos_tags(x) for x in train_x]
        self.ngrams = NGrams(train_x, pos_tag_x)

        if self.params['train_from_file']:
            # Load cached features instead of rebuilding the pipeline.
            print("Loading features from file...")
            X = np.load(TRAIN_X_FILE)
            train_y = np.load(TRAIN_Y_FILE)
        else:
            X = self.pipeline(train_x, pos_tag_x)

        self.model = self.get_model((X.shape[1], X.shape[2]))
        print('Fitting LSTM model...')

        self.model.fit(X, np.array(train_y), **self.params['lstm_params'])
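
get_model() is not shown either. A plausible minimal version (an assumption, using the Keras Sequential API), where input_shape=(X.shape[1], X.shape[2]) is (timesteps, features):

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

def get_model(input_shape):
    # input_shape = (timesteps, features); layer sizes are hypothetical.
    model = Sequential([
        LSTM(64, input_shape=input_shape),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model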
Example #3
    def predict(self, test_x):
        pos_tag_x = [NGrams.to_pos_tags(x) for x in test_x]
        test_x, _ = self.pipeline(test_x, pos_tag_x, fit_scalers=False)

        predictions = self.model.predict(test_x,
                                         **self.params['mlp']['predict'])

        return predictions.argmax(axis=-1)
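
Here model.predict returns one probability per class for each sample, so argmax(axis=-1) picks the index of the most likely class:

import numpy as np

probs = np.array([[0.1, 0.9],   # most likely class: 1
                  [0.8, 0.2]])  # most likely class: 0
print(probs.argmax(axis=-1))    # [1 0]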
Example #4
def main(cutoff=None, persist=False):
    train_x, train_y, train_positions, train_file_names = get_data(
        main_dir=TRAINING_DIR)

    if cutoff:
        train_x = train_x[:cutoff]
        train_y = train_y[:cutoff]
        train_positions = train_positions[:cutoff]

    df = pd.DataFrame(data={'label': train_y, 'pos': train_positions})

    pos_tag_x = [NGrams.to_pos_tags(x) for x in train_x]
    ngrams = NGrams(train_x, pos_tag_x)

    X = [preprocessor.process_text(x) for x in train_x]

    X_word_chunks = word_chunks(X, n=300, process=True)
    X_pos_chunks = word_chunks(pos_tag_x, n=300, process=True)

    fmap = {
        'lexical_features': lexical(X_word_chunks),
        'stop_word_features': ngrams.get_stop_words(X_word_chunks),
        'function_word_features': ngrams.get_function_words(X_word_chunks),
        'pos_tag_features': ngrams.get_pos_tags(X_pos_chunks),
        'process_tag_features': processed_tags(X_word_chunks),
        'word_frequency': wf.average_word_frequency(X_word_chunks),
        'readability_features': readability(X_word_chunks)
    }

    for key, feature in fmap.items():
        df[key] = feature

    if persist:
        df.to_csv(TRAIN_CSV_FILE)
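
word_chunks() is not defined in this snippet. A minimal sketch consistent with how it is called (an assumption; the process and sliding options are ignored here): each text is split into consecutive n-word segments.

def word_chunks_sketch(texts, n=300):
    chunked = []
    for text in texts:
        words = text.split()
        # Consecutive, non-overlapping n-word segments.
        chunked.append([' '.join(words[i:i + n])
                        for i in range(0, len(words), n)])
    return chunked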
Example #5
def main(cutoff=None, persist=True):
    train_x, train_y, train_positions, train_file_names = get_data(
        external_file=TRAINING_EXTERNAL_FILE
    )

    if cutoff:
        train_x = train_x[:cutoff]
        train_y = train_y[:cutoff]
        train_positions = train_positions[:cutoff]

    pos_tag_x = [NGrams.to_pos_tags(x) for x in train_x]
    ngrams = NGrams(train_x, pos_tag_x)

    X = [preprocessor.process_text(x) for x in train_x]

    X_word_chunks = word_chunks(X, n=300, process=True, sliding=True)
    X_char_chunks = char_chunks(X, n=2000, sliding=True)
    X_pos_chunks = word_chunks(pos_tag_x, n=300, process=True, sliding=True)

    max_segments = 20

    lexical_features = sequence.pad_sequences(lexical(X_word_chunks), maxlen=max_segments)
    stop_word_features = sequence.pad_sequences(ngrams.get_stop_words(X_word_chunks), maxlen=max_segments)
    function_word_features = sequence.pad_sequences(ngrams.get_function_words(X_word_chunks), maxlen=max_segments)
    pos_tag_features = sequence.pad_sequences(ngrams.get_pos_tags(X_pos_chunks), maxlen=max_segments)
    word_frequency = sequence.pad_sequences(wf.average_word_frequency(X_word_chunks), maxlen=max_segments)
    readability_features = sequence.pad_sequences(readability(X_word_chunks), maxlen=max_segments)

    X = np.concatenate([lexical_features, stop_word_features,
                        function_word_features, pos_tag_features,
                        word_frequency,
                        readability_features], axis=2)

    print(X.shape)

    if persist:
        np.save(TRAIN_X_FILE, X)
        np.save(TRAIN_Y_FILE, train_y)
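
pad_sequences(..., maxlen=max_segments) gives every document exactly max_segments chunk rows: short documents are zero-padded and long ones truncated (both from the front, by Keras defaults). For example:

from tensorflow.keras.preprocessing import sequence

segments = [[1, 2], [1, 2, 3, 4]]
print(sequence.pad_sequences(segments, maxlen=3))
# [[0 1 2]
#  [2 3 4]]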
Example #6
def main(cutoff=10000, persist=True):
    train_x, train_y, train_positions, train_file_names = get_data(
        external_file=TRAINING_EXTERNAL_FILE
    )

    if cutoff:
        train_x = train_x[:cutoff]
        train_y = train_y[:cutoff]
        train_positions = train_positions[:cutoff]

    pos_tag_x = [NGrams.to_pos_tags(x) for x in train_x]
    ngrams = NGrams(train_x, pos_tag_x)

    X = [preprocessor.process_text(x) for x in train_x]

    X_word_chunks = word_chunks(X, n=300, process=True)
    X_pos_chunks = word_chunks(pos_tag_x, n=300, process=True)

    max_segments = 10

    lexical_features = sequence.pad_sequences(lexical(X_word_chunks), maxlen=max_segments)
    stop_word_features = sequence.pad_sequences(ngrams.get_stop_words(X_word_chunks), maxlen=max_segments)
    function_word_features = sequence.pad_sequences(ngrams.get_function_words(X_word_chunks), maxlen=max_segments)
    pos_tag_features = sequence.pad_sequences(ngrams.get_pos_tags(X_pos_chunks), maxlen=max_segments)
    process_tag_features = sequence.pad_sequences(processed_tags(X_word_chunks), maxlen=max_segments)
    word_frequency = sequence.pad_sequences(wf.average_word_frequency(X_word_chunks), maxlen=max_segments)
    readability_features = sequence.pad_sequences(readability(X_word_chunks), maxlen=max_segments)

    X = np.concatenate([lexical_features, stop_word_features,
                        function_word_features, pos_tag_features,
                        process_tag_features, word_frequency,
                        readability_features], axis=2)

    if persist:
        np.save(TRAIN_X_FILE, X)
        np.save(TRAIN_Y_FILE, train_y)
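
Concatenating on axis=2 stacks each feature block side by side per segment, so X ends up with shape (documents, max_segments, total_feature_count):

import numpy as np

a = np.zeros((100, 10, 3))  # e.g. 3 lexical features per segment
b = np.zeros((100, 10, 5))  # e.g. 5 readability features per segment
print(np.concatenate([a, b], axis=2).shape)  # (100, 10, 8)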
Example #7
    def predict(self, test_x):
        pos_tag_x = [NGrams.to_pos_tags(x) for x in test_x]
        test_x = self.pipeline(test_x, pos_tag_x)

        predictions = self.model.predict_classes(test_x)
        return predictions.flatten()
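
predict_classes only exists on older Keras Sequential models (it was removed in TensorFlow 2.6). A drop-in equivalent for newer versions:

import numpy as np

def predict_classes_compat(model, x):
    probs = model.predict(x)
    if probs.shape[-1] > 1:
        return np.argmax(probs, axis=-1)            # softmax output
    return (probs > 0.5).astype('int32').flatten()  # sigmoid output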
Example #8
    def predict(self, test_x):
        pos_tag_x = [NGrams.to_pos_tags(x) for x in test_x]
        test_x, _ = self.pipeline(test_x, pos_tag_x, fit_scalers=False)

        return self.model.predict(test_x).tolist()