Example no. 1
def main(full = False):
    vectorizer = make_union(
        #on_field('user', Tfidf(max_features=100000 , token_pattern='\w+')), #100000
        on_field('name', Tfidf(max_features=15000 , token_pattern='\w+')), #100000
        on_field('text', Tfidf(max_features=60000, token_pattern='\w+', ngram_range=(1, 2))), #100000
        on_field(['region', 'city', 'price_cut', 'item_seq_number_cut', 'image_top_1',
                  'param_1', 'param_2', 'param_3', 'user_type', 'user'],
                 FunctionTransformer(to_records, validate=False), DictVectorizer()),
        n_jobs=4)
    y_scaler = StandardScaler()
    with timer('process train'):
        train, valid, test, y_train, y_valid, trndex, tstdex = load_data(full)
        y_train = y_train.values
        X_train = vectorizer.fit_transform(preprocess(train)).astype(np.float32)
        print(f'X_train: {X_train.shape} of {X_train.dtype}')
        del train
    with timer('process valid'):
        X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)
        if full:
            X_test = vectorizer.transform(preprocess(test)).astype(np.float32)
    with ThreadPool(processes=4) as pool:
        if full:
            Xb_train, Xb_test = [x.astype(bool).astype(np.float32) for x in [X_train, X_test]]
            xs = [[Xb_train, Xb_test], [X_train, X_test]] * 2
        else:
            Xb_train, Xb_valid = [x.astype(bool).astype(np.float32) for x in [X_train, X_valid]]
            xs = [[Xb_train, Xb_valid], [X_train, X_valid]] * 2
        y_pred = np.mean(pool.map(partial(fit_predict, y_train=y_train), xs), axis=0)
    if not full:
        print('Valid RMSE: {:.4f}'.format(np.sqrt(metrics.mean_squared_error(y_valid.values, y_pred))) )
        return y_pred, trndex
    return y_pred, tstdex
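Most of the examples in this collection call a handful of helpers (timer, on_field, to_records, fit_predict) without defining them, and preprocess/load_data are dataset-specific. The following is a minimal, hedged sketch of what the shared helpers are assumed to look like; the fit_predict body here is a simplified Ridge stand-in rather than the small neural network used in the original kernels.

# Hedged sketch of the shared helpers assumed by these examples (not the original definitions).
import time
from contextlib import contextmanager
from operator import itemgetter

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer


@contextmanager
def timer(name):
    # Print how long the wrapped block took.
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')


def on_field(field, *vec):
    # Select one column (or a list of columns) from the input DataFrame, then apply the given transformers.
    return make_pipeline(FunctionTransformer(itemgetter(field), validate=False), *vec)


def to_records(df: pd.DataFrame):
    # DictVectorizer expects a list of {column: value} dicts.
    return df.to_dict(orient='records')


def fit_predict(xs, y_train):
    # Simplified stand-in: the original code trains a small MLP here.
    # xs is an (X_train, X_valid) pair; returns predictions for X_valid.
    X_train, X_valid = xs
    model = Ridge(alpha=1.0)
    model.fit(X_train, np.asarray(y_train).ravel())
    return model.predict(X_valid)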
Example no. 2
def main():
    vectorizer = make_union(
        on_field('title', Tfidf(max_features=100000, token_pattern='\w+', ngram_range=(1, 2))),
        on_field('text', Tfidf(max_features=100000, token_pattern='\w+', ngram_range=(1, 2))),
        on_field(['price', 'user_type', 'image_top_1', 'city', 'region'],
                 FunctionTransformer(to_records, validate=False), DictVectorizer()),
        n_jobs=4)
    y_scaler = StandardScaler()
    with timer('process train'):
        train = pd.read_csv('../input/train.csv', parse_dates = ["activation_date"])
        train = train[train['deal_probability'] > 0.5].reset_index(drop=True)
        cv = KFold(n_splits=20, shuffle=True, random_state=42)
        train_ids, valid_ids = next(cv.split(train))
        train, valid = train.iloc[train_ids], train.iloc[valid_ids]
        y_train = y_scaler.fit_transform(np.log1p(train['deal_probability'].values.reshape(-1, 1)))
        X_train = vectorizer.fit_transform(preprocess(train)).astype(np.float32)
        print(f'X_train: {X_train.shape} of {X_train.dtype}')
        del train
    with timer('process valid'):
        X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)
    with ThreadPool(processes=4) as pool:
        Xb_train, Xb_valid = [x.astype(bool).astype(np.float32) for x in [X_train, X_valid]]
        xs = [[Xb_train, Xb_valid], [X_train, X_valid]] * 2
        y_pred = np.mean(pool.map(partial(fit_predict, y_train=y_train), xs), axis=0)
    y_pred = np.expm1(y_scaler.inverse_transform(y_pred.reshape(-1, 1))[:, 0])
    print('Valid RMSLE: {:.4f}'.format(np.sqrt(mean_squared_log_error(valid['deal_probability'], y_pred))))
Example no. 3
def init_preprocesses():
    vectorizer = make_union(
        on_field('name',       Tfidf(max_features=15000, token_pattern='\w+')),  # 100000
        on_field('all_titles', Tfidf(max_features=80000, token_pattern='\w+')),  # 100000
        #on_field('user_categories', Tfidf(max_features=10000, token_pattern='\w+')),  # 100000
        on_field('text',       Tfidf(max_features=60000, token_pattern='\w+', ngram_range=(1, 2))),  # 100000
        on_field(['region', 'city', 'price_cut', 'item_seq_number_cut', 'image_top_1', 'user_avg_price_cut',
                  'param_1', 'param_2', 'param_3', 'user_type', 'user', 'user_ad_ct', 'usercat_avg_price_cut', 'usercat_ad_ct'],
                 FunctionTransformer(to_records, validate=False), DictVectorizer()),
        n_jobs=4)
    y_scaler = StandardScaler()
    return vectorizer, y_scaler
Example no. 4
def evaluate(train):
    vectorizer = make_union(make_pipeline(
        PandasSelector("name"),
        Tfidf(max_features=100000, token_pattern=r"\w+"),
    ),
                            make_pipeline(
                                PandasSelector("text"),
                                Tfidf(max_features=100000,
                                      token_pattern=r"\w+",
                                      ngram_range=(1, 2)),
                            ),
                            make_pipeline(
                                PandasSelector(
                                    ["shipping", "item_condition_id"],
                                    records=True), DictVectorizer()),
                            n_jobs=4)
    y_scaler = StandardScaler()

    with timer("process train"):
        train = train[train["price"] > 0].reset_index(drop=True)
        cv = KFold(n_splits=20, shuffle=True, random_state=42)
        train_ids, valid_ids = next(cv.split(train))
        train, valid = train.iloc[train_ids], train.iloc[valid_ids]
        y_train = y_scaler.fit_transform(
            np.log1p(train["price"].values.reshape(-1, 1)))
        X_train = vectorizer.fit_transform(preprocess(train)).astype(
            np.float32)
        print(f"X_train: {X_train.shape} of {X_train.dtype}")
        del train

    with timer("process valid"):
        X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)

    with ThreadPool(processes=4) as pool:
        Xb_train, Xb_valid = [
            x.astype(bool).astype(np.float32) for x in [X_train, X_valid]
        ]
        xs = [[Xb_train, Xb_valid], [X_train, X_valid]] * 2
        y_pred = np.mean(pool.map(partial(fit_predict, y_train=y_train), xs),
                         axis=0)

    y_pred = np.expm1(y_scaler.inverse_transform(y_pred.reshape(-1, 1))[:, 0])
    print("Valid RMSLE: {:.4f}".format(
        np.sqrt(mean_squared_log_error(valid["price"], y_pred))))
Example no. 5
def train():
    vectorizer = make_union(
        on_field('title', Tfidf(max_features=1000000, min_df=5,
                                # token_pattern='[\w\?,\.;:\(\)\[\]]+',
                                token_pattern='\w+',
                                stop_words=stops, ngram_range=(1, 2),
                                # lowercase=True,
                                # smooth_idf=False
                                )),
        on_field('description', Tfidf(max_features=1000000, min_df=5,
                                      lowercase=True,
                                      # token_pattern='[\w\?,\.;:\(\)\[\]]+',
                                      token_pattern='\w+',
                                      ngram_range=(1, 2),
                                      # stop_words=stops,
                                      # smooth_idf=False
                                      )
                 ),
        n_jobs=1)
    df = pd.DataFrame()
    list_size = []
    #
    #
    for path in ['../input/train.csv', '../input/test.csv', '../input/train_active.csv', '../input/test_active.csv']:
        _df = pd.read_csv(path,
                          usecols=['title',
                                   'description',
                                   'price', 'item_seq_number',
                                   'parent_category_name',
                                   'region', 'city', 'category_name', 'param_1', 'param_2', 'param_3', 'user_type'
                                   ])
        list_size.append(_df.shape[0])
        df = pd.concat([df, _df], axis=0, ignore_index=True)
    data = vectorizer.fit_transform(preprocess(df)).astype(np.float32)
    X_train = data[:list_size[0], :]
    X_test = data[list_size[0]:list_size[0] + list_size[1], :]
    with open('train_tfidf_all.pkl', 'wb') as f:
        pickle.dump(X_train, f, -1)
    with open('test_tfidf_all.pkl', 'wb') as f:
        pickle.dump(X_test, f, -1)

    with open('vectorizer_tfidf_all.pkl', 'wb') as f:
        pickle.dump(vectorizer, f, -1)
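For reference, a short sketch of how the pickled artifacts written above would be read back in a later step (same file names; this loading code is not part of the original snippet):

import pickle

# Reload the vectorized train/test matrices and the fitted vectorizer dumped by train().
with open('train_tfidf_all.pkl', 'rb') as f:
    X_train = pickle.load(f)
with open('test_tfidf_all.pkl', 'rb') as f:
    X_test = pickle.load(f)
with open('vectorizer_tfidf_all.pkl', 'rb') as f:
    vectorizer = pickle.load(f)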
Example no. 6
def main():

    vectorizer = make_union(
        on_field('name', Tfidf(max_features=100000, token_pattern='\w+')),
        on_field(
            'text',
            Tfidf(max_features=100000, token_pattern='\w+',
                  ngram_range=(1, 2))),
        on_field(['shipping', 'item_condition_id'],
                 FunctionTransformer(to_records, validate=False),
                 DictVectorizer()))

    y_scaler = StandardScaler()

    train = pd.read_table(TRAIN_FILE_MERCARI)

    train = train[train['price'] > 0].reset_index(drop=True)
    cv = KFold(n_splits=20, shuffle=True, random_state=42)
    train_ids, valid_ids = next(cv.split(train))
    train, valid = train.iloc[train_ids], train.iloc[valid_ids]

    y_train = y_scaler.fit_transform(
        np.log1p(train['price'].values.reshape(-1, 1)))

    X_train = vectorizer.fit_transform(preprocess(train)).astype(np.float32)

    print(f'X_train: {X_train.shape} of {X_train.dtype}')
    del train

    X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)

    with ThreadPool(processes=8) as pool:
        Xb_train, Xb_valid = [
            x.astype(bool).astype(np.float32) for x in [X_train, X_valid]
        ]
        xs = [[Xb_train, Xb_valid], [X_train, X_valid]] * 2
        y_pred = np.mean(pool.map(partial(fit_predict, y_train=y_train), xs),
                         axis=0)

    y_pred = np.expm1(y_scaler.inverse_transform(y_pred.reshape(-1, 1))[:, 0])

    print('Valid RMSLE: {:.4f}'.format(
        np.sqrt(mean_squared_log_error(valid['price'], y_pred))))
Example no. 7
def create_vectorizers(train):
    train = preprocess(train)
    name_vectorizer = Tfidf(max_features=100000,
                            lowercase=False,
                            encoding="ascii",
                            decode_error="strict",
                            analyzer="word")
    name_vectorizer.fit(train["name"])
    text_vectorizer = Tfidf(max_features=100000,
                            ngram_range=(1, 1),
                            lowercase=False,
                            encoding="ascii",
                            decode_error="strict",
                            analyzer="word")
    text_vectorizer.fit(train["text"])
    valid_records = to_records(train[["shipping", "item_condition_id"]])
    dict_vectorizer = DictVectorizer()
    dict_vectorizer.fit(valid_records)
    return name_vectorizer, text_vectorizer, dict_vectorizer
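Unlike the make_union examples above, create_vectorizers returns three independently fitted vectorizers, so the caller has to stack their outputs itself. A self-contained sketch of that step on toy data (scipy.sparse.hstack and the toy frame are assumptions; the original caller is not shown):

import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf

frame = pd.DataFrame({
    'name': ['red shoes', 'blue bag'],
    'text': ['nice red shoes size 8', 'leather blue bag barely used'],
    'shipping': [1, 0],
    'item_condition_id': [3, 1],
})

# Fit each vectorizer separately, mirroring create_vectorizers().
name_vec = Tfidf(max_features=100000, lowercase=False, analyzer='word').fit(frame['name'])
text_vec = Tfidf(max_features=100000, lowercase=False, analyzer='word').fit(frame['text'])
dict_vec = DictVectorizer().fit(frame[['shipping', 'item_condition_id']].to_dict('records'))

# Transform and stack the three blocks into one sparse feature matrix.
X = hstack([
    name_vec.transform(frame['name']),
    text_vec.transform(frame['text']),
    dict_vec.transform(frame[['shipping', 'item_condition_id']].to_dict('records')),
]).tocsr()
print(X.shape)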
Example no. 8
def _make_tfidf_NB_clf(self, **cfg):
    max_f = cfg.get('max_features', 1200)
    max_df = cfg.get('max_df', 0.7)
    sublin = cfg.get('sublin', True)
    vectorizer = Tfidf(stop_words='english',
                       norm='l2',
                       max_df=max_df,
                       max_features=max_f,
                       sublinear_tf=sublin)
    model = MNB()
    clf = Pipeline(steps=[('v', vectorizer), ('nb', model)])
    return clf
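A minimal end-to-end sketch of the same TF-IDF plus naive Bayes pipeline on toy data, assuming MNB aliases sklearn's MultinomialNB (the class that owns _make_tfidf_NB_clf is omitted):

from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.pipeline import Pipeline

# Same steps and defaults as _make_tfidf_NB_clf with an empty cfg.
clf = Pipeline(steps=[
    ('v', Tfidf(stop_words='english', norm='l2', max_df=0.7,
                max_features=1200, sublinear_tf=True)),
    ('nb', MNB()),
])
texts = ['great film loved it', 'terrible plot and boring acting',
         'wonderful movie', 'awful waste of time']
labels = [1, 0, 1, 0]
clf.fit(texts, labels)
print(clf.predict(['boring movie', 'loved the acting']))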
Example no. 9
def main():
    stopWords = stopwords.words('russian')
    vectorizer = make_union(
        on_field('title',
                 Tfidf(max_features=100000,
                       stop_words=stopWords)),  #  token_pattern='\w+',
        on_field(
            'text',
            Tfidf(max_features=100000,
                  ngram_range=(1, 2),
                  stop_words=stopWords)),
        on_field('param', Tfidf(max_features=1000, stop_words=stopWords)),
        # on_field(['shipping', 'item_condition_id'],
        #          FunctionTransformer(to_records, validate=False), DictVectorizer()),
        n_jobs=4)
    y_scaler = StandardScaler()
    with timer('process train'):
        train = pd.read_csv('../data/train.csv')
        cv = KFold(n_splits=20, shuffle=True, random_state=42)
        train_ids, valid_ids = next(cv.split(train))
        train, valid = train.iloc[train_ids], train.iloc[valid_ids]
        # y_train = y_scaler.fit_transform(np.log1p(train['deal_probability'].values.reshape(-1, 1)))
        y_train = y_scaler.fit_transform(
            train['deal_probability'].values.reshape(-1, 1))
        X_train = vectorizer.fit_transform(preprocess(train)).astype(
            np.float32)
        print(f'X_train: {X_train.shape} of {X_train.dtype}')
        del train
    with timer('process valid'):
        X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)
    with ThreadPool(processes=4) as pool:
        Xb_train, Xb_valid = [
            x.astype(bool).astype(np.float32) for x in [X_train, X_valid]
        ]
        xs = [[Xb_train, Xb_valid], [X_train, X_valid]] * 2
        y_pred = np.mean(pool.map(partial(fit_predict, y_train=y_train), xs),
                         axis=0)
    y_pred = y_scaler.inverse_transform(y_pred.reshape(-1, 1))[:, 0]
    print('Valid RMSLE: {:.4f}'.format(
        np.sqrt(mean_squared_log_error(valid['deal_probability'], y_pred))))
Example no. 10
def main():
    vectorizer = make_union(on_field('name', Tfidf(max_features=1000)),
                            on_field('text', Tfidf(max_features=1000)),
                            on_field(['#friends', '#followers', '#favorites'],
                                     FunctionTransformer(to_records,
                                                         validate=False),
                                     DictVectorizer()),
                            n_jobs=4)
    y_scaler = StandardScaler()
    with timer('process train'):
        train, label, __, __ = load_data()
        train['label'] = label
        #train = train.sample(10000)

        train, valid = train_test_split(train, test_size=0.2, random_state=123)

        y_train = y_scaler.fit_transform(
            np.log1p(train['label'].values.reshape(-1, 1)))
        X_train = vectorizer.fit_transform(preprocess(train)).astype(
            np.float32)
        print(f'X_train: {X_train.shape} of {X_train.dtype}')

        del train
    with timer('process valid'):
        X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)

    with timer('fit predict'):
        Xb_train, Xb_valid = [
            x.astype(bool).astype(np.float32) for x in [X_train, X_valid]
        ]
        xs = [[Xb_train, Xb_valid], [X_train, X_valid]]
        #y_pred0 = fit_predict(xs[0], y_train=y_train)
        y_pred1 = fit_predict(xs[1], y_train=y_train)
        y_pred = y_pred1

        y_pred = np.expm1(
            y_scaler.inverse_transform(y_pred.reshape(-1, 1))[:, 0])
        print('Valid MSLE: {:.4f}'.format(
            mean_squared_log_error(valid['label'],
                                   np.where(y_pred < 0, 0, y_pred))))
Example no. 11
def lr_model():
    steps = [("vec", Count(max_features=20000, ngram_range=(1, 3))),
             ("tfidf", Tfidf()),
             ("clf",
              LR(penalty="l2",
                 solver="saga",
                 multi_class="auto",
                 random_state=42,
                 max_iter=10000,
                 n_jobs=-1,
                 C=9.930234540337622,
                 tol=0.0007738614855921237))]
    return Pipeline(steps=steps)
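A hedged usage sketch of lr_model on toy data; Count, Tfidf and LR are assumed to alias CountVectorizer, TfidfTransformer and LogisticRegression in the original module:

from sklearn.feature_extraction.text import CountVectorizer as Count, TfidfTransformer as Tfidf
from sklearn.linear_model import LogisticRegression as LR
from sklearn.pipeline import Pipeline

clf = lr_model()  # the pipeline defined above
clf.fit(['good product', 'bad quality', 'great value', 'poor quality item'],
        [1, 0, 1, 0])
print(clf.predict(['good value']))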
Example no. 12
def main1():
    vectorizer = make_union(
        on_field(
            'text',
            Tfidf(max_features=300000, token_pattern='\w+',
                  ngram_range=(1, 2))),
        on_field(['len_text'], FunctionTransformer(to_records, validate=False),
                 DictVectorizer()))

    with timer('process train'):
        if os.path.exists(resource_path + 'dataset.pik'):
            with open(resource_path + 'dataset.pik', 'rb') as f:
                trainX, testX, trainY = pickle.load(f)
        else:
            train = pd.read_csv(data_path + 'train.csv')
            train['Discuss'] = train['Discuss'].apply(
                lambda x: ' '.join(jieba.cut(x)))

            test = pd.read_csv(data_path + 'test.csv')
            test['Discuss'] = test['Discuss'].apply(
                lambda x: ' '.join(jieba.cut(x)))

            train = train[train['Score'] > 0].reset_index(drop=True)
            trainY = train['Score'].values
            trainX = vectorizer.fit_transform(get_dataset_x(train)).astype(
                np.float32)
            testX = vectorizer.transform(get_dataset_x(test)).astype(
                np.float32)

            sk = SelectKBest(chi2, k=100000)
            trainX = sk.fit_transform(trainX, trainY)
            testX = sk.transform(testX)

            with open(resource_path + 'dataset.pik', 'wb') as f:
                pickle.dump((trainX, testX, trainY), f)

        print(f'trainX: {trainX.shape} of {trainX.dtype} with {type(trainX)}')
        print(f'testX: {testX.shape} of {testX.dtype} with {type(testX)}')

        #pred=model_lgb(trainX,testX,trainY)
        pred = model_svm(trainX, testX, trainY)
        store_result(pred)
Example no. 13
    return [doc_names[docs_indices[0, x]] for x in range(num_items)]


def find_images(docs):
    images = []
    for d in docs:
        filename = d.split("/")[-1].split(".")[0]
        img_name = "/".join(filename.split("_")) + ".jpg"
        images.append(img_name)
    return images


if __name__ == "__main__":
    docs = os.listdir(docs_folder)
    doc_names = [os.path.join(docs_folder, x) for x in docs]
    words = {
        str(x): x
        for x in range(sum([codebook_size, codebook_size, codebook_size]))
    }

    print("Training model")
    model = Tfidf(input='filename', ngram_range=(1, 2), vocabulary=words)
    matrix = model.fit_transform(doc_names)

    pickle.dump((doc_names, matrix), open(tfidf_model, "wb"))

    test_words = [12, 23, 1323, 234, 214, 1224, 1532]
    docs_match = find_matching_docs(matrix, test_words, doc_names)
    imgs = find_images(docs_match)
    pdb.set_trace()
Example no. 14
def train():
    russian_stop = set(stopwords.words('russian'))
    vectorizer = make_union(on_field(
        'title',
        Tfidf(max_features=1000000,
              min_df=5,
              token_pattern='\w+',
              stop_words=russian_stop,
              ngram_range=(1, 2))),
                            on_field(
                                'description',
                                Tfidf(max_features=1000000,
                                      min_df=5,
                                      token_pattern='\w+',
                                      ngram_range=(1, 2),
                                      stop_words=russian_stop)),
                            on_field(['item_seq_number'],
                                     FunctionTransformer(to_records,
                                                         validate=False),
                                     DictVectorizer()),
                            FunctionTransformer(itemgetter(['price']),
                                                validate=False),
                            n_jobs=4)
    df = pd.read_csv('../input/train.csv',
                     usecols=[
                         'title', 'description', 'price', 'item_seq_number',
                         'parent_category_name', 'region', 'city',
                         'category_name', 'param_1', 'param_2', 'param_3',
                         'user_type', 'deal_probability', 'activation_date'
                     ])
    print('load end')
    target = df['deal_probability']
    X_train = vectorizer.fit_transform(preprocess(df)).astype(np.float32)
    with open('train_nn.pkl', 'wb') as f:
        pickle.dump(X_train, f, -1)
    """
    with open('train_nn.pkl', 'rb') as f:
        X_train = pickle.load(f)
    """

    with open('vectorizer.pkl', 'wb') as f:
        pickle.dump(vectorizer, f, -1)

    y_train = target.values
    metric = 'val_loss'
    mode = 'min'
    callbacks = [
        EarlyStopping(monitor=metric,
                      patience=10,
                      verbose=1,
                      min_delta=1e-6,
                      mode=mode),
        ReduceLROnPlateau(monitor=metric,
                          factor=0.1,
                          patience=2,
                          verbose=1,
                          epsilon=1e-4,
                          mode=mode),
        ModelCheckpoint(monitor=metric,
                        filepath='weights/best_weights.hdf5',
                        save_best_only=True,
                        save_weights_only=True,
                        mode=mode),
        TensorBoard(log_dir='logs'),
        LoggingCallback()
    ]
    model = get_model(X_train.shape[1])

    train = df['activation_date'] < '2017-03-25'
    test = df['activation_date'] >= '2017-03-25'

    trn_x = X_train[[i for i in range(X_train.shape[0]) if train[i]]]
    val_x = X_train[[i for i in range(X_train.shape[0]) if test[i]]]

    trn_y = y_train[train]
    val_y = y_train[test]

    model.fit(x=trn_x,
              y=trn_y,
              validation_data=(val_x, val_y),
              batch_size=2**11,
              epochs=1000,
              callbacks=callbacks)
Example no. 15
cv = KFold(n_splits=20, shuffle=True, random_state=42)

train_ids, valid_ids = next(cv.split(data))

train = data.iloc[train_ids]
valid = data.iloc[valid_ids]

del data

y_train = y_scaler.fit_transform(np.log1p(train['price'].values.reshape(-1,
                                                                        1)))

train = preprocess(train)

name_tfidf = Tfidf(max_features=100000, token_pattern='\w+')

name_p: Pipeline = make_pipeline(
    FunctionTransformer(itemgetter('name'), validate=False), name_tfidf)

text_tfidf = Tfidf(max_features=100000,
                   token_pattern='\w+',
                   ngram_range=(1, 2))

text_p = make_pipeline(FunctionTransformer(itemgetter('text'), validate=False),
                       text_tfidf)

shipping_p = make_pipeline(
    FunctionTransformer(itemgetter(['shipping']), validate=False),
    FunctionTransformer(to_records, validate=False), DictVectorizer())
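The excerpt stops before the three per-field pipelines are combined; a hedged sketch of the usual continuation, mirroring the make_union calls in the other examples:

# Hypothetical continuation: union the per-field pipelines and vectorize the preprocessed frame.
from sklearn.pipeline import make_union

vectorizer = make_union(name_p, text_p, shipping_p, n_jobs=1)
X_train = vectorizer.fit_transform(train).astype(np.float32)
print(f'X_train: {X_train.shape} of {X_train.dtype}')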
Example no. 16
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline, make_union, Pipeline
from sklearn.feature_extraction import DictVectorizer
from operator import itemgetter
import pandas as pd

class Vectorizer():
    def __init__(self):
        self.vectorizer = None

    def on_field(self, f: str, *vec):
        return make_pipeline(FunctionTransformer(itemgetter(f), validate=False), *vec)

    def to_records(self, df: pd.DataFrame):
        return df.to_dict(orient='records')
    
    def tfidf_vectorizer(self, title_feat=100000, description_feat=500000):
        self.vectorizer = make_union(
                                    self.on_field("title", Tfidf(max_features=title_feat, token_pattern="\w+")),
                                    self.on_field("description", Tfidf(max_features=description_feat, token_pattern="\w+", ngram_range=(1, 2))),
                                    self.on_field(['shipping', 'status'],
                                    FunctionTransformer(self.to_records, validate=False), DictVectorizer())
                                    )
        return self.vectorizer
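A hedged usage sketch on a toy frame with the four columns that tfidf_vectorizer() selects:

# Hypothetical usage of the Vectorizer class above (toy data).
df = pd.DataFrame({
    'title': ['red shoes', 'blue bag'],
    'description': ['nice red shoes size 8', 'leather blue bag barely used'],
    'shipping': [1, 0],
    'status': ['new', 'used'],
})
vec = Vectorizer().tfidf_vectorizer(title_feat=100, description_feat=200)
X = vec.fit_transform(df)
print(X.shape)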
Example no. 17
def main(evalation=True):
    # make_union stacks the individual feature blocks into one matrix
    vectorizer = make_union(
        # Take the 'name' column and apply Tfidf, keeping the most frequent terms; \w+ matches runs of letters, digits and underscores
        #on_field('name', Tfidf(max_features=1000, token_pattern='\w+')),
        # Take the 'text' column, also Tfidf, but with unigram and bigram features
        on_field('text', Tfidf(max_features=300000, token_pattern='\w+', ngram_range=(1, 2))),
        on_field(['len_discuss'],FunctionTransformer(to_records,validate=False),DictVectorizer()),
        #on_field(['shipping', 'item_condition_id'],
                 #FunctionTransformer(to_records, validate=False), DictVectorizer()),
        n_jobs=1)
    y_scaler = StandardScaler()
    with timer('process train'):
        train = pd.read_csv(data_path+'train_split.csv')
        train['len_discuss']=train['Discuss'].apply(lambda x:len(x))

        train['Discuss']=train['Discuss'].apply(lambda x:' '.join(jieba.cut(x)))

        test=pd.read_csv(data_path+"dev_split.csv")
        test['len_discuss']=test['Discuss'].apply(lambda x:len(x))
        test['Discuss']=test['Discuss'].apply(lambda x:' '.join(jieba.cut(x)))
        y_true=None
        if evalation:
            y_true=test['Score'].values

##################### noun
        # print('load noun set...')
        #
        # if os.path.exists(resource_path+'noun_set.pik'):
        #     with open(resource_path+'noun_set.pik','rb') as f:
        #         noun_set=pickle.load(f)
        #         # noun_set=filter_noun(noun_set)
        # else:
        #     noun_set=get_nouns(train['Discuss'].values)
        #     with open(resource_path+'noun_set.pik','wb') as f:
        #         pickle.dump(noun_set,f)
        #     # noun_set=filter_noun(noun_set)
        #
        # print(f'noun size:{len(noun_set)}')
#######################

###################### keyword
        print('load keyword set...')

        if os.path.exists(resource_path+'keyword_set.pik'):
            with open(resource_path+'keyword_set.pik','rb') as f:
                keyword_set=pickle.load(f)
        else:
            keyword_set=get_keywords(train['Discuss'].values)
            with open(resource_path+'keyword_set.pik','wb') as f:
                pickle.dump(keyword_set,f)

        print(f'keyword size:{len(keyword_set)}')
######################

        train = train[train['Score'] > 0].reset_index(drop=True)  # keep only rows where the score is greater than 0
        # cv = KFold(n_splits=10, shuffle=True, random_state=42)  # 10-fold split
        # train_ids, valid_ids = next(cv.split(train))
        # valid=train.iloc[valid_ids]
        # train=train.iloc[train_ids]
        y_train_start=train['Score'].values
        y_train=y_scaler.fit_transform(train['Score'].values.reshape(-1,1))
        X_train = vectorizer.fit_transform(preprocess(train)).astype(np.float32)
        X_test=vectorizer.transform(preprocess(test)).astype(np.float32)

        #y_test=valid['Score']

        sk=SelectKBest(chi2,k=100000)
        X_train=sk.fit_transform(X_train,y_train_start)
        X_test=sk.transform(X_test)

        print(f'X_train: {X_train.shape} of {X_train.dtype}')
        print(f'X_test: {X_test.shape} of {X_test.dtype}')
        # del train
    # with timer('process valid'):
    #     X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)
    with ThreadPool(processes=6) as pool:
        Xb_train, Xb_valid = [x.astype(bool).astype(np.float32) for x in [X_train, X_test]]

############################### noun
        # vec=CountVectorizer(binary=True,tokenizer=seg_sentence)
        # vec.fit(noun_set)
        # Xn_train,Xn_valid=[vec.transform(x) for x in [train['Discuss'].values,test['Discuss'].values]]
##################################

############################# keyword
        if os.path.exists(resource_path+'resource_train.pik'):
            with open(resource_path+'resource_train.pik','rb') as f:
                Xk_train,Xk_valid=pickle.load(f)
        else:
            vec=CountVectorizer(binary=True,tokenizer=seg_sentence)
            vec.fit(keyword_set)
            Xk_train,Xk_valid=[vec.transform(x) for x in [train['Discuss'].values,test['Discuss'].values]]
            with open(resource_path+'resource_train.pik','wb') as f:
                pickle.dump([Xk_train,Xk_valid],f)
#############################

############################

#### the Xn_train, Xn_valid from the noun block would go below

##############

############# concatenation inside the feature matrices
        # Xb_a_train=np.concatenate([Xb_train,Xk_train],axis=1)
        # Xb_a_valid=np.concatenate([Xb_valid,Xk_valid],axis=1)
        # X_a_train=np.concatenate([X_train,Xk_train],axis=1)
        # X_a_test=np.concatenate([X_test,Xk_valid],axis=1)
        xs = [[Xb_train, Xb_valid], [X_train, X_test], [Xk_train, Xk_valid]] * 2  # duplicated once; Xb encodes word presence/absence, X carries the tf-idf weights
############## passed as separate inputs at training time
        # xs = [[Xb_train, Xb_valid], [X_train, X_test], [Xk_train, Xk_valid]] * 2  # duplicated once; Xb encodes word presence/absence, X carries the tf-idf weights

###############

        print(len(xs),len(xs[0]))
        #print(len(xs[1]))
        xx = pool.map(partial(fit_predict, y_train=y_train), xs)  # np.mean below averages the predictions from the multiple runs
        print(len(xx))
        y_pred = np.mean(xx,axis=0)
        y_pred = y_scaler.inverse_transform(y_pred.reshape(-1, 1))[:, 0]
    # print(y_pred)

    pre=[]
    for i in y_pred:
        if i>4.7:
            pre.append(5)
        else:
            pre.append(i)

    if evalation and y_true is not None:
        print('the score is :',evaluate(y_true,pre))

    result=pd.DataFrame({'ID':test.Id,'Discuss':test.Discuss,'Score':pre})
    result.to_csv('MLP_simple_jieba_stopword_chibest.csv',header=None,index=None)
Example no. 18
def create_text_pipeline():
    l = []
    l.append(('td_text', Tfidf(max_features=200000, token_pattern='\w+', ngram_range=(1, 3))))

    return Pipeline(l)
class ToDict:
        # Assumed class header: the excerpt starts mid-class, and ToDict is used in vec_num below.
        def __init__(self, cols):
            self.cols = cols

        def fit(self, X, y=None):
            # stateless transformer
            return self

        def transform(self, X):
            # assumes X is a DataFrame
            Xdict = X.to_dict('records')
            return Xdict

vec_name = FeatureUnion([
        ('continuous', Pipeline([
            ('extract', ColumnExtractor('name')),
            ('tfidf', Tfidf(max_features=100000, token_pattern='\w+'))
        ])),
    ])
vec_text = Pipeline([
            ('extract', ColumnExtractor('text')),
            ('tfidf', Tfidf(max_features=100000, token_pattern='\w+', ngram_range=(1, 2)))
        ])
vec_num =  Pipeline([
            ('extract', ColumnExtractor(['shipping', 'item_condition_id'])),
            ('ToDict', ToDict(['shipping', 'item_condition_id'])),
            ('DictVectorizer', DictVectorizer())
        ])

y_scaler = StandardScaler()

def load_data():
    ...  # body not included in this excerpt
def main():
    vectorizer = make_union(on_field(
        'title', Tfidf(max_features=100000, token_pattern='\w+')),
                            on_field(
                                'text',
                                Tfidf(max_features=100000,
                                      token_pattern='\w+',
                                      ngram_range=(1, 2))),
                            on_field(['price', 'user_type', 'image_top_1'],
                                     FunctionTransformer(to_records,
                                                         validate=False),
                                     DictVectorizer()),
                            n_jobs=4)
    y_scaler = StandardScaler()
    with timer('process train'):
        train = pd.read_csv('../input/train.csv',
                            parse_dates=["activation_date"])
        #train = train[train['deal_probability'] > 0].reset_index(drop=True)
        cv = KFold(n_splits=20, shuffle=True, random_state=42)
        train_ids, valid_ids = next(cv.split(train))
        train, valid = train.iloc[train_ids], train.iloc[valid_ids]
        y_train = y_scaler.fit_transform(
            np.log1p(train['deal_probability'].values.reshape(-1, 1)))
        X_train = vectorizer.fit_transform(preprocess(train)).astype(
            np.float32)
        print('X_train: {} of {}'.format(X_train.shape, X_train.dtype))
        del train
        print('read test data ...')
        test = pd.read_csv('../input/test.csv',
                           parse_dates=["activation_date"])
        del test['image']

    with timer('process valid'):
        X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)
        # TODO
        X_test = vectorizer.transform(preprocess(test))

    with ThreadPool(processes=4) as pool:
        # Xb_train, Xb_valid = [x.astype(np.bool).astype(np.float32) for x in [X_train, X_valid]]
        Xb_train, Xb_valid, Xb_test = [
            x.astype(bool) for x in [X_train, X_valid, X_test]
        ]
        xs = [[Xb_train, Xb_valid], [X_train, X_valid]] * 2
        # TODO
        xs_test = [[Xb_train, Xb_test], [X_train, X_test]]

        y_pred = np.mean(pool.map(partial(fit_predict, y_train=y_train), xs),
                         axis=0)

        # TODO
        y_pred_test = np.mean(pool.map(partial(fit_predict, y_train=y_train),
                                       xs_test),
                              axis=0)
    print("Start to join...")
    pool.join()

    y_pred = y_scaler.inverse_transform(y_pred.reshape(-1, 1))[:, 0]
    print('Valid RMSLE: {:.4f}'.format(
        np.sqrt(mean_squared_log_error(valid['deal_probability'], y_pred))))
    # TODO
    y_pred_test = y_scaler.inverse_transform(y_pred_test.reshape(-1, 1))[:, 0]
    pool.close()
    sub = pd.read_csv('../input/sample_submission.csv')
    sub['deal_probability'] = y_pred_test
    sub.to_csv('sub3.csv', index=False)
    print('all done!')
Example no. 21
###########add target encoding (mean)
for cf in categorical_features_tobe[1:]:
    new_f = "{}_dl".format(cf)
    temp = train[[cf,"deal_probability"]].groupby(cf).mean().reset_index().rename(columns={"deal_probability": new_f})
    df = df.merge(temp, how="left", on=cf)
    df[new_f] = np.log1p(df[new_f])
    df[new_f] = df[new_f].fillna(df[new_f].mean())
    features.append(new_f)
    del temp
gc.collect()

###### text features 
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf
from scipy.sparse import hstack, csr_matrix

tfidf = Tfidf(ngram_range=(1,2), max_features=20000, sublinear_tf=True)
textfeats = ["description", "title"]
for cols in textfeats:
    df[cols] = df[cols].astype(str) 
    df[cols] = df[cols].astype(str).fillna('missing') # FILL NA
    df[cols] = df[cols].str.lower() # Lowercase all text, so that capitalized words don't get treated differently
    df[cols + '_num_words'] = df[cols].apply(lambda comment: len(comment.split())) # Count number of Words
    df[cols + '_num_unique_words'] = df[cols].apply(lambda comment: len(set(w for w in comment.split())))
    df[cols + '_words_vs_unique'] = df[cols+'_num_unique_words'] / df[cols+'_num_words'] * 100 # Unique words as a percentage of all words
    features.extend([cols + '_num_words',cols + '_num_unique_words', cols + '_words_vs_unique'])
    
df["text"] = df["title"].astype(str) + " " + df["description"].astype(str)
x = tfidf.fit_transform(df["text"]) 
features.extend(categorical_features)

######### add aggregated features
Example no. 22
def train():

    vectorizer = make_union(
        on_field(
            'title',
            Tfidf(ngram_range=(3, 3),
                  analyzer='char',
                  max_features=1000000,
                  min_df=5)),
        on_field(
            'description',
            Tfidf(ngram_range=(3, 3),
                  analyzer='char',
                  max_features=1000000,
                  min_df=5)),
        on_field(['item_seq_number'],
                 FunctionTransformer(to_records, validate=False),
                 DictVectorizer()),
        FunctionTransformer(itemgetter(['price']), validate=False),
        # n_jobs=4
    )

    df = pd.read_csv('../input/train.csv',
                     usecols=[
                         'title', 'description', 'price', 'item_seq_number',
                         'parent_category_name', 'region', 'city',
                         'category_name', 'param_1', 'param_2', 'param_3',
                         'user_type', 'deal_probability'
                     ])
    print('load end')
    target = df['deal_probability']
    X_train = vectorizer.fit_transform(preprocess(df)).astype(np.float32)
    with open('train_nn_chargram.pkl', 'wb') as f:
        pickle.dump(X_train, f, -1)
    with open('vectorizer_chargram.pkl', 'wb') as f:
        pickle.dump(vectorizer, f, -1)

    y_train = target.values
    metric = 'val_loss'
    mode = 'min'

    callbacks = [
        EarlyStopping(monitor=metric,
                      patience=10,
                      verbose=1,
                      min_delta=1e-6,
                      mode=mode),
        ReduceLROnPlateau(monitor=metric,
                          factor=0.1,
                          patience=2,
                          verbose=1,
                          epsilon=1e-4,
                          mode=mode),
        ModelCheckpoint(monitor=metric,
                        filepath='weights/best_weights_chargram.hdf5',
                        save_best_only=True,
                        save_weights_only=True,
                        mode=mode),
        TensorBoard(log_dir='logs'),
        LoggingCallback()
    ]

    model = get_model(X_train.shape[1])
    cv = KFold(n_splits=5, shuffle=True, random_state=871)
    for train, test in cv.split(X_train, y_train):
        trn_x = X_train[train, :]
        val_x = X_train[test, :]
        trn_y = y_train[train]
        val_y = y_train[test]
        break
    model.fit(x=trn_x,
              y=trn_y,
              validation_data=(val_x, val_y),
              batch_size=2**11,
              epochs=1000,
              callbacks=callbacks)
Example no. 23
                    best_model = ks.models.clone_model(model)
                    best_model.set_weights(model.get_weights())
        res = dict(pred_valid=pred_valid_best,
                   pred_test=best_model.predict(X_test))
        return res


class_names = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]

## from https://www.kaggle.com/tunguz/logistic-regression-with-words-and-char-n-grams
word_vectorizer = Tfidf(sublinear_tf=True,
                        strip_accents='unicode',
                        analyzer='word',
                        token_pattern=r'\w{1,}',
                        stop_words='english',
                        ngram_range=(1, 2),
                        max_features=20000)
char_vectorizer = Tfidf(sublinear_tf=True,
                        strip_accents='unicode',
                        analyzer='char',
                        stop_words='english',
                        ngram_range=(2, 6),
                        max_features=30000)
vectorizer = make_union(on_field('comment_text', word_vectorizer),
                        on_field('comment_text', char_vectorizer),
                        n_jobs=4)
with timer('process train'):
    train = pd.read_csv('../input/train.csv')
    cv = KFold(n_splits=20, shuffle=True, random_state=42)
Example no. 24
content = []
for newspaper in newspapers:
    c.execute(
        u'select title, body from {} where title IS NOT NULL and body IS NOT NULL and date >= "{}" and date <= "{}";'
        .format(newspaper, init_date, final_date))
    content += [row[0] + row[1] for row in c]

conn.close()

# Common words (stop words)
fp = codecs.open(stopwords, "r", encoding="utf-8")
data = fp.read()
fp.close()

if 'spanish' in stopwords:
    aux = data.split('\r\n')
elif 'english' in stopwords:
    aux = data.split('\n')

words = [a.lower() for a in aux]
"""
# Entrenamiento de la valorizacion tfidf
"""
tfidf = Tfidf(min_df = 2, max_df = 0.95, \
              stop_words = words, \
              ngram_range = (1,1))

tfidf.fit(content)

pk.dump(tfidf, open('idf.pk', 'wb'))
Example no. 25
def main():
    vectorizer = make_union(
        on_field(
            'description',
            Tfidf(max_features=100000,
                  stop_words=sw,
                  token_pattern='\w+',
                  norm='l2',
                  min_df=3,
                  sublinear_tf=True,
                  smooth_idf=False,
                  ngram_range=(1, 2))),  #max_df=0.3,
        on_field(
            'title',
            Tfidf(max_features=100000,
                  stop_words=sw,
                  token_pattern='\w+',
                  norm='l2',
                  min_df=3,
                  sublinear_tf=True,
                  smooth_idf=False,
                  ngram_range=(1, 2))),
        on_field([
            'image_top_1', 'region', 'category_name', 'parent_category_name',
            'user_type'
        ], FunctionTransformer(to_records, validate=False), DictVectorizer()),
        n_jobs=1)
    with timer('reading data '):
        dtypes = {
            'category_name': 'category',
            'parent_category_name': 'category',
            'region': 'category',
            'item_seq_number': 'uint32',
            'user_type': 'category',
            'image_top_1': 'category',
            'price': 'float32',
            'deal_probability': 'float32'
        }
        train = pd.read_csv(path + 'train.csv', dtype=dtypes)
        test = pd.read_csv(path + 'test.csv', dtype=dtypes)
    with timer('add new features'):
        cat_cols = [
            'image_top_1', 'region', 'city', 'parent_category_name',
            'category_name', 'param_1', 'param_2', 'param_3', 'user_type'
        ]
        num_cols = ['price', 'deal_probability']
        for c in cat_cols:
            for c2 in num_cols:
                enc = train.groupby(c)[c2].agg(['mean']).astype(
                    np.float32).reset_index()
                enc.columns = [
                    '_'.join([str(c), str(c2), str(c3)]) if c3 != c else c
                    for c3 in enc.columns
                ]
                train = pd.merge(train, enc, how='left', on=c)
                test = pd.merge(test, enc, how='left', on=c)
        del (enc)
    with timer('process train'):
        cv = KFold(n_splits=20, shuffle=True, random_state=42)
        train_ids, valid_ids = next(cv.split(train))
        train, valid = train.iloc[train_ids], train.iloc[valid_ids]
        y_train = train['deal_probability'].values
        X_train = vectorizer.fit_transform(preprocess(train)).astype(
            np.float32)
        print(f'X_train: {X_train.shape} of {X_train.dtype}')
        del train
    with timer('process valid'):
        X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)
    gc.collect()
    print('train shape', X_train.shape)
    print('valid shape', X_valid.shape)
    with timer('process test'):
        X_test = vectorizer.transform(preprocess(test)).astype(np.float32)
        del test
        gc.collect()
    print('test shape', X_test.shape)

    valid_length = X_valid.shape[0]
    X_valid = vstack([X_valid, X_test])
    del (X_test)
    gc.collect()
    xs = [x.astype(bool).astype(np.float32) for x in [X_train, X_valid]]
    del (X_train, X_valid)
    gc.collect()
    y_pred = fit_predict(xs, y_train=y_train)
    test_pred = y_pred[valid_length:]
    y_pred = y_pred[:valid_length]
    print('Valid RMSE: {:.4f}'.format(
        np.sqrt(mean_squared_error(valid['deal_probability'], y_pred))))
    submission = pd.read_csv(path + 'test.csv', usecols=["item_id"])
    submission["deal_probability"] = test_pred
    submission['deal_probability'].clip(0.0, 1.0, inplace=True)
    submission.to_csv(subpath + "MLP_V15.csv", index=False)
Example no. 26
c = conn.cursor()

content = []
c.execute('select title, body from lanacion where title IS NOT NULL;')

for row in c:
    try:
        content.append(row[0] + row[1])
    except:
        content.append(row[0])

c.execute('select title, body from pagina12 where title IS NOT NULL;')

for row in c:
    try:
        content.append(row[0] + row[1])
    except:
        content.append(row[0])

conn.close()
"""
Entrenamiento de la valorizacion tfidf
"""
m = 0.00
while m <= 0.95:

    tfidf = Tfidf(min_df=m, max_df=m + 0.05, ngram_range=(1, 2))
    x_tfidf = tfidf.fit_transform(content)
    print(m, x_tfidf.shape[1])
    m += 0.05
Example no. 27
    logger = get_logger(exp=args.exp)
    with timer("Load Data", logger):
        loader = DataLoader()

    with timer("tokenize", logger):
        loader.tokenize(tokenizer, {
            "stopwords": get_stopwords(),
            "include_verb": True
        })

    train, test = loader.load()
    X = train["tokenized"].fillna("")
    X_test = test["tokenized"].fillna("")
    y = train["label"].values
    y_test = test["label"].values

    with timer("vectorize", logger):
        tv = Tfidf(max_features=20000, ngram_range=(1, 3))
        X = tv.fit_transform(X)
        X_test = tv.transform(X_test)

    with timer("optimize", logger):
        study = optuna.create_study()
        study.optimize(optimal_params,
                       n_trials=args.ntrial,
                       n_jobs=args.n_jobs)

    logger.info(f"Best params: {study.best_params}")
    logger.info(f"Best value: {study.best_value}")
    logger.info(f"Best trial: {study.best_trial}")
Example no. 28
                      optimizer=ks.optimizers.Adam(lr=3e-3))
        for i in range(3):
            with timer(f'epoch {i + 1}'):
                model.fit(x=X_train,
                          y=y_train,
                          batch_size=2**(11 + i),
                          epochs=1,
                          verbose=0)

        y_pred = model.predict(X_test)[:, 0]
        return y_pred


stopWords = stopwords.words('russian')
vectorizer = make_union(
    on_field('title', Tfidf(max_features=100000,
                            stop_words=stopWords)),  # token_pattern='\w+',
    on_field(
        'text',
        Tfidf(max_features=100000, ngram_range=(1, 2), stop_words=stopWords)),
    on_field('param', Tfidf(max_features=1000, stop_words=stopWords)),
    # on_field(['shipping', 'item_condition_id'],
    #          FunctionTransformer(to_records, validate=False), DictVectorizer()),
    n_jobs=4)
y_scaler = StandardScaler()
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
cv = KFold(n_splits=5, shuffle=True, random_state=42)
train_ids, valid_ids = next(cv.split(train))
train, valid = train.iloc[train_ids], train.iloc[valid_ids]
# y_train = y_scaler.fit_transform(np.log1p(train['deal_probability'].values.reshape(-1, 1)))
y_train = y_scaler.fit_transform(train['deal_probability'].values.reshape(-1, 1))
def tfidf_fabric(x=1):
    return Tfidf(max_features=15000, token_pattern='\w+', ngram_range=(1, x))
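A hedged usage sketch of tfidf_fabric; Tfidf is assumed to alias sklearn's TfidfVectorizer, as in the imports of the earlier examples:

from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf

bigram_vectorizer = tfidf_fabric(x=2)  # unigrams and bigrams, capped at 15000 features
X = bigram_vectorizer.fit_transform(['red shoes size 8', 'leather blue bag barely used'])
print(X.shape)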