# Ejemplo n.º 1
def load_text_features(train, test, helper_data_path, tf_idf=True):
    """Merge pre-computed NLP features (and optionally TF-IDF/SVD features)
    into the train and test frames.

    Args:
        train: DataFrame with an 'item_id' column.
        test: DataFrame with an 'item_id' column, or None to skip test.
        helper_data_path: directory holding the helper feature files.
        tf_idf: when True, also concatenate the TF-IDF SVD features
            (row-aligned by position, hence axis=1 concat).

    Returns:
        (train, test) with the text feature columns added.
    """
    print('Loading text features...')

    # Columns selected from the pre-computed NLP pickles.
    nlp_columns = [
        'item_id', 'title_word_count',
        'description_non_regular_chars_ratio',
        'description_word_count', 'merged_params_word_count',
        'description_sentence_count',
        'description_words/sentence_ratio',
        'title_capital_letters_ratio',
        'description_capital_letters_ratio',
        'title_non_regular_chars_ratio',
        'title_num_of_newrow_char',
        'description_num_of_newrow_char', 'title_num_adj',
        'title_num_nouns', 'title_adj_to_len_ratio',
        'title_noun_to_len_ratio', 'description_num_adj',
        'description_num_nouns', 'description_adj_to_len_ratio',
        'description_noun_to_len_ratio',
        'title_first_noun_stemmed', 'title_second_noun_stemmed',
        'title_third_noun_stemmed',
        'description_first_noun_stemmed',
        'description_second_noun_stemmed',
        'description_third_noun_stemmed',
        'title_first_adj_stemmed', 'title_second_adj_stemmed',
        'title_third_adj_stemmed', 'description_first_adj_stemmed',
        'description_second_adj_stemmed',
        'description_third_adj_stemmed', 'title_sentiment',
        'description_sentiment'
    ]

    # Stephan's nlp
    def get_df(filename):
        """Read every pickle inside the zip and stack the selected columns.

        Returns None when the archive contains no members.
        """
        frames = []
        with zipfile.ZipFile(path.join(helper_data_path, filename),
                             'r') as zip_ref:
            for name in zip_ref.namelist():
                frames.append(pd.read_pickle(zip_ref.open(name))[nlp_columns])
        # Concatenate once at the end — concatenating inside the loop
        # re-copies the accumulated frame each iteration (quadratic).
        return pd.concat(frames) if frames else None

    train = train.merge(get_df('train_NLP_enriched.zip'),
                        on='item_id',
                        how='left')
    if test is not None:
        test = test.merge(get_df('test_NLP_enriched.zip'),
                          on='item_id',
                          how='left')

    # tf-idf
    if tf_idf:
        print('loading tfidf features...')
        tfidf_df = load_df(helper_data_path, 'train_tfidf_svd.csv.gz')
        train = pd.concat([train, tfidf_df], axis=1)
        if test is not None:
            tfidf_df = load_df(helper_data_path, 'test_tfidf_svd.csv.gz')
            test = pd.concat([test, tfidf_df], axis=1)

    print('Done loading text features.')
    gc.collect()
    return train, test
# Ejemplo n.º 2
def add_aggregated_features(train, test, helper_data_path):
    """Attach the pre-computed aggregated features to both frames.

    Loads 'aggregated_features.csv.gz' from helper_data_path once and
    merges it into train and test via add_aggregated_features_inner.
    Returns the enriched (train, test) pair.
    """
    agg_df = load_df(helper_data_path, 'aggregated_features.csv.gz')
    enriched_train = add_aggregated_features_inner(train, agg_df)
    enriched_test = add_aggregated_features_inner(test, agg_df)
    return enriched_train, enriched_test
# Ejemplo n.º 3
def load_text_features(train, test, helper_data_path, tf_idf=True):
    """Merge pre-computed NLP features, optional TF-IDF/SVD features, and
    derived text-count features into the train and test frames.

    Args:
        train: DataFrame with 'item_id', 'title' and 'description' columns.
        test: DataFrame with the same columns, or None to skip test.
        helper_data_path: directory holding the helper feature files.
        tf_idf: when True, also concatenate the TF-IDF SVD features
            (row-aligned by position, hence axis=1 concat).

    Returns:
        (train, test) with the text feature columns added.
    """
    print('Loading text features...')

    # Columns selected from the pre-computed NLP pickles.
    nlp_columns = [
        'item_id', 'title_word_count',
        'description_non_regular_chars_ratio',
        'description_word_count', 'merged_params_word_count',
        'description_sentence_count',
        'description_words/sentence_ratio',
        'title_capital_letters_ratio',
        'description_capital_letters_ratio',
        'title_non_regular_chars_ratio',
        'title_num_of_newrow_char',
        'description_num_of_newrow_char', 'title_num_adj',
        'title_num_nouns', 'title_adj_to_len_ratio',
        'title_noun_to_len_ratio', 'description_num_adj',
        'description_num_nouns', 'description_adj_to_len_ratio',
        'description_noun_to_len_ratio',
        'title_first_noun_stemmed', 'title_second_noun_stemmed',
        'title_third_noun_stemmed',
        'description_first_noun_stemmed',
        'description_second_noun_stemmed',
        'description_third_noun_stemmed',
        'title_first_adj_stemmed', 'title_second_adj_stemmed',
        'title_third_adj_stemmed', 'description_first_adj_stemmed',
        'description_second_adj_stemmed',
        'description_third_adj_stemmed', 'title_sentiment',
        'description_sentiment'
    ]

    # Stephan's nlp
    def get_df(filename):
        """Read every pickle inside the zip and stack the selected columns.

        Returns None when the archive contains no members.
        """
        frames = []
        with zipfile.ZipFile(path.join(helper_data_path, filename),
                             'r') as zip_ref:
            for name in zip_ref.namelist():
                frames.append(pd.read_pickle(zip_ref.open(name))[nlp_columns])
        # Concatenate once at the end — concatenating inside the loop
        # re-copies the accumulated frame each iteration (quadratic).
        return pd.concat(frames) if frames else None

    train = train.merge(get_df('train_NLP_enriched.zip'),
                        on='item_id',
                        how='left')
    if test is not None:
        test = test.merge(get_df('test_NLP_enriched.zip'),
                          on='item_id',
                          how='left')

    # tf-idf
    if tf_idf:
        print('loading tfidf features...')
        tfidf_df = load_df(helper_data_path, 'train_tfidf_svd.csv.gz')
        train = pd.concat([train, tfidf_df], axis=1)
        if test is not None:
            tfidf_df = load_df(helper_data_path, 'test_tfidf_svd.csv.gz')
            test = pd.concat([test, tfidf_df], axis=1)

    def more_text_count_features(df):
        """Add unique-word counts, punctuation count and unique-word ratios.

        Requires 'title_word_count' / 'description_word_count' columns
        merged in above by get_df.
        """
        punctuation = set(string.punctuation)
        for col in ['description', 'title']:
            df['num_unique_words_' + col] = df[col].apply(
                lambda text: len(set(text.split())))

        df['num_desc_punct'] = df['description'].apply(
            lambda text: sum(1 for ch in text if ch in punctuation))

        df['words_vs_unique_title'] = df['num_unique_words_title'] / df[
            'title_word_count'] * 100
        df['words_vs_unique_description'] = df[
            'num_unique_words_description'] / df['description_word_count'] * 100
        return df

    train = more_text_count_features(train)
    # BUG FIX: the original called more_text_count_features(test)
    # unconditionally and crashed when test is None; guard it like every
    # other use of `test` in this function.
    if test is not None:
        test = more_text_count_features(test)

    print('Done loading text features.')
    gc.collect()
    return train, test