Example #1
def analyze_mel(train_df, test_df):
    print('analyze_mel')
    feature_names = mel_spectral_features(return_fnames=True)
    train_df[feature_names] = train_df['fname'].progress_apply(
        mel_spectral_features, root=train_root_trimmed)
    test_df[feature_names] = test_df['fname'].progress_apply(
        mel_spectral_features, root=test_root_trimmed)

    train_df.to_csv('../data/train_mel.csv', index=False, float_format='%.4f')
    test_df.to_csv('../data/test_mel.csv', index=False, float_format='%.4f')

    return reduce_mem_usage(train_df), reduce_mem_usage(test_df)
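The multi-column assignment above only works if mel_spectral_features returns a pandas Series per file (and the bare call with return_fnames=True yields the column names), and progress_apply requires tqdm's pandas integration to be registered. A minimal sketch under those assumptions; the extractor body and feature names below are illustrative, not the original implementation:

import numpy as np
import pandas as pd
from tqdm import tqdm

tqdm.pandas()  # registers Series.progress_apply

def mel_spectral_features(fname=None, root=None, return_fnames=False):
    # hypothetical extractor: return the column names when asked,
    # otherwise a Series so df[feature_names] = ... fans out into columns
    names = ['mel_mean', 'mel_std', 'mel_min', 'mel_max']
    if return_fnames:
        return names
    mel = np.random.rand(128, 64)  # placeholder for the real mel spectrogram of root/fname
    return pd.Series([mel.mean(), mel.std(), mel.min(), mel.max()], index=names)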
Example #2
def trim(train_df, test_df):
    print('trim')
    trim_cols = [
        'length_int', 'length_final', 'ratio_int', 'ratio_final',
        'mean_max_splits', 'std_max_splits', 'mean_len_splits',
        'std_len_splits', 'cnt_splits'
    ]
    train_df[trim_cols] = train_df['fname'].progress_apply(trim_silence,
                                                           root=train_root)

    train_df.to_csv('../data/train_2.csv', index=False, float_format='%.4f')

    test_df[trim_cols] = test_df['fname'].progress_apply(trim_silence,
                                                         root=test_root)

    test_df.to_csv('../data/test_2.csv', index=False, float_format='%.4f')

    return reduce_mem_usage(train_df), reduce_mem_usage(test_df)
Example #3
def extract_segment_feature(train, test):
    train_files = train.fname.values
    train_features = extract_features(train_files, train_root_trimmed)

    test_files = test.fname.values
    test_features = extract_features(test_files, test_root_trimmed)

    train = train.merge(train_features, on='fname', how='left')
    test = test.merge(test_features, on='fname', how='left')

    train.to_csv('../data/train_seg.csv', index=False, float_format='%.4f')
    test.to_csv('../data/test_seg.csv', index=False, float_format='%.4f')

    return reduce_mem_usage(train), reduce_mem_usage(test)
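The left-merges assume extract_features returns one row per file, keyed by an 'fname' column, with the segment statistics as the remaining columns. A hedged sketch of that contract; the statistics chosen here are illustrative:

import os
import numpy as np
import pandas as pd
from scipy.io import wavfile

def extract_features(files, root):
    # hypothetical per-file segment features, keyed by 'fname' so the merge works
    rows = []
    for fname in files:
        rate, data = wavfile.read(os.path.join(root, fname))
        data = data.astype(np.float32)
        rows.append({'fname': fname,
                     'seg_mean': data.mean(),
                     'seg_std': data.std(),
                     'seg_seconds': len(data) / rate})
    return pd.DataFrame(rows)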
Example #4
def get_crossvalid_data(frm, to):
    #test_path = '../input/test.csv'
    #train_path = '../input/train.csv'

    test_path = '../input/imgtop_test.csv'
    train_path = '../input/imgtop_train.csv'

    testing = pd.read_csv(test_path,
                          skiprows=range(1, frm),
                          nrows=to - frm,
                          index_col="item_id",
                          parse_dates=["activation_date"])
    testdex = testing.index
    len_test = len(testing)

    tot_filename = '/media/extend/cache/total_{}_{}.csv'.format(frm, to)
    tot_yname = '/media/extend/cache/total_y_{}_{}.csv'.format(frm, to)
    if os.path.exists(tot_filename) and os.path.exists(tot_yname):
        print('load cached csv')
        #df = pd.read_feather(tot_filename).set_index("item_id")
        #y = pd.read_feather(tot_yname).set_index("item_id").deal_probability.copy()
        df = pd.read_csv(tot_filename).set_index("item_id")
        y = pd.read_csv(tot_yname).set_index("item_id").deal_probability.copy()

        len_train = to - frm
    else:
        training = pd.read_csv(train_path,
                               skiprows=range(1, frm),
                               nrows=to - frm,
                               index_col="item_id",
                               parse_dates=["activation_date"])
        len_train = len(training)

        y = training.deal_probability.copy()
        training.drop("deal_probability", axis=1, inplace=True)
        #y.reset_index().to_feather(tot_yname)
        y.reset_index().to_csv(tot_yname)

        print('Train shape: {} Rows, {} Columns'.format(*training.shape))
        print('Test shape: {} Rows, {} Columns'.format(*testing.shape))

        df = pd.concat([training, testing], axis=0)
        del training, testing

    predictors = []
    y, df, ready_df, tfvocab, predictors, len_train, categorical =  \
        preparTotalData(y, df, predictors, len_train, len_test, frm, to, tot_filename)

    #none_categorical = [x for x in df.columns if x not in categorical]

    df = df[predictors]
    df = kaggle_util.reduce_mem_usage(df)
    print(df.info())

    tfvocab = df.columns.tolist() + tfvocab
    testing = hstack([csr_matrix(df[len_train:].values), ready_df[len_train:]])

    # note: tfvocab appears twice in the returned tuple
    return df, y, testing, ready_df, tfvocab, predictors, len_train, categorical, tfvocab, testdex
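Only the test block is pre-stacked here; a caller is expected to assemble the matching train matrix from df and ready_df in the same way. A usage sketch under that assumption (the frm/to range is just an illustration):

from scipy.sparse import csr_matrix, hstack

(df, y, testing, ready_df, tfvocab,
 predictors, len_train, categorical, _, testdex) = get_crossvalid_data(0, 100000)

# dense engineered features + TF-IDF block for the training rows,
# mirroring how `testing` was built inside get_crossvalid_data
X_train = hstack([csr_matrix(df[:len_train].values), ready_df[:len_train]]).tocsr()
X_test = testing.tocsr()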
Example #5
def basic_analyze(train_df, test_df, train_root, test_root):
    print('basic_analyze')
    train_df[[
        'length', 'data_mean', 'data_min', 'data_max', 'data_std', 'data_rms',
        'skewness', 'kurtosis'
    ]] = train_df['fname'].progress_apply(wavfile_stats, root=train_root)
    test_df[[
        'length', 'data_mean', 'data_min', 'data_max', 'data_std', 'data_rms',
        'skewness', 'kurtosis'
    ]] = test_df['fname'].progress_apply(wavfile_stats, root=test_root)

    train_df['rms_std'] = train_df['data_rms'] / train_df['data_std']
    test_df['rms_std'] = test_df['data_rms'] / test_df['data_std']

    train_df['max_min'] = train_df['data_max'] / train_df['data_min']
    test_df['max_min'] = test_df['data_max'] / test_df['data_min']

    train_df.to_csv('../data/train_1.csv', index=False, float_format='%.4f')
    test_df.to_csv('../data/test_1.csv', index=False, float_format='%.4f')

    return reduce_mem_usage(train_df), reduce_mem_usage(test_df)
Example #6
def keras_train_transform(dataset):
    print('transform...')
    dataset, txt_stats = deal_text_feature(dataset)

    for key in dict_encoder.keys():
        #print(key)
        dataset[key] = dict_encoder[key].transform(dataset[key])

    dataset = kaggle_util.reduce_mem_usage(dataset)
    print("Transform on test function completed.")

    dataset = num_log(dataset)

    return dataset, txt_stats
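dict_encoder is a module-level mapping that must already hold one fitted encoder per categorical column before this transform runs. A hedged sketch of how it could be built; the source file and column list are illustrative, not the original fitting code:

import pandas as pd
from sklearn import preprocessing

train = pd.read_csv('../input/train.csv')  # illustrative source for fitting
dict_encoder = {}
for key in ['region', 'city', 'category_name', 'user_type']:  # illustrative columns
    enc = preprocessing.LabelEncoder()
    enc.fit(train[key].astype(str))  # fit on train so transform() later sees known labels
    dict_encoder[key] = enc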
Example #7
def preparTotalData(y, df, predictors, len_train, len_test, frm, to,
                    tot_filename):

    y, df, predictors, len_train, categorical, textfeats = preparBaseData(
        y, df, predictors, len_train, len_test, frm, to, tot_filename)

    print("\n[TF-IDF] Term Frequency Inverse Document Frequency Stage")
    russian_stop = set(stopwords.words('russian'))

    tfidf_para = {
        "stop_words": russian_stop,
        "analyzer": 'word',
        "token_pattern": r'\w{1,}',
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": 'l2',
        #"min_df":5,
        #"max_df":.9,
        "smooth_idf": False
    }

    def get_col(col_name):
        return lambda x: x[col_name]

    vectorizer = FeatureUnion([
        ('description',
         TfidfVectorizer(ngram_range=(1, 2),
                         max_features=17000,
                         **tfidf_para,
                         preprocessor=get_col('description'))),
        (
            'title',
            TfidfVectorizer(
                ngram_range=(1, 2),
                **tfidf_para,
                #max_features=7000,
                preprocessor=get_col('title')))
    ])

    start_vect = time.time()
    #vectorizer.fit(df.loc[traindex,:].to_dict('records'))
    vectorizer.fit(df[:len_train].to_dict('records'))
    ready_df = vectorizer.transform(df.to_dict('records'))
    tfvocab = vectorizer.get_feature_names()
    print("Vectorization Runtime: %0.2f Minutes" %
          ((time.time() - start_vect) / 60))

    # Drop Text Cols
    df.drop(textfeats, axis=1, inplace=True)

    #from sklearn.metrics import mean_squared_error
    from math import sqrt

    kf = KFold(len_train, n_folds=NFOLDS, shuffle=True, random_state=SEED)
    ridge_params = {
        'alpha': 30.0,
        'fit_intercept': True,
        'normalize': False,
        'copy_X': True,
        'max_iter': None,
        'tol': 0.001,
        'solver': 'auto',
        'random_state': SEED
    }
    ridge = SklearnWrapper(clf=Ridge, seed=SEED, params=ridge_params)
    ridge_oof_train, ridge_oof_test = get_oof(ridge, ready_df[:len_train], y,
                                              ready_df[len_train:], len_train,
                                              len_test, kf)
    #rms = sqrt(mean_squared_error(y, ridge_oof_train))
    ridge_preds = np.concatenate([ridge_oof_train, ridge_oof_test])
    df['ridge_preds'] = ridge_preds
    predictors.append('ridge_preds')

    df = kaggle_util.reduce_mem_usage(df)
    return y, df, ready_df, tfvocab, predictors, len_train, categorical
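SklearnWrapper and get_oof are the usual out-of-fold stacking helpers: for each fold the ridge model is fit on the remaining folds and predicts the held-out rows, and the test predictions are averaged across folds, so 'ridge_preds' never contains a prediction made on the model's own training rows. A minimal sketch of get_oof under the assumption that the wrapper exposes train()/predict(); this is not necessarily the repository's exact helper:

import numpy as np

def get_oof(clf, x_train, y, x_test, len_train, len_test, kf):
    # out-of-fold predictions for the train rows, fold-averaged predictions for the test rows
    oof_train = np.zeros(len_train)
    test_fold_preds = []
    for train_idx, valid_idx in kf:
        clf.train(x_train[train_idx], np.asarray(y)[train_idx])
        oof_train[valid_idx] = clf.predict(x_train[valid_idx])
        test_fold_preds.append(clf.predict(x_test))
    return oof_train, np.mean(test_fold_preds, axis=0)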
Example #8
def preparBaseData(y, df, predictors, len_train, len_test, frm, to,
                   tot_filename):
    changed = False
    """
    geo_cols = ['latitude', 'longitude', 
                'lat_lon_hdbscan_cluster_05_03', 'lat_lon_hdbscan_cluster_10_03', 
                'lat_lon_hdbscan_cluster_20_03']
    """
    geo_cols = ['latitude', 'longitude']
    if 'longitude' not in df.columns:
        geo = pd.read_csv('../input/avito_region_city_features.csv')[
            geo_cols + ['region', 'city']]
        df = df.reset_index().merge(geo, how='left',
                                    on=["region", "city"]).set_index('item_id')
        changed = True
    predictors += geo_cols

    if 'reg_dense' not in df.columns:
        regional = pd.read_csv('../input/regional.csv', index_col=0)
        regional.index = regional.index.str.lower()

        df['region'] = df['region'].apply(lambda x: region_map[x])
        df['region'] = df['region'].str.lower()
        df["reg_dense"] = df['region'].apply(
            lambda x: regional.loc[x, "Density_of_region(km2)"])
        df["rural"] = df['region'].apply(lambda x: regional.loc[x, "Rural_%"])
        df["reg_Time_zone"] = df['region'].apply(
            lambda x: regional.loc[x, "Time_zone"])
        df["reg_Population"] = df['region'].apply(
            lambda x: regional.loc[x, "Total_population"])
        df["reg_Urban"] = df['region'].apply(
            lambda x: regional.loc[x, "Urban%"])
        changed = True
    predictors += [
        'reg_dense', 'rural', 'reg_Time_zone', 'reg_Population', 'reg_Urban'
    ]

    if 'avg_days_up_user' not in df.columns:
        train_features = pd.read_csv('../input/aggregated_features.csv')
        df = df.reset_index().merge(train_features, on=['user_id'],
                                    how='left').set_index('item_id')
        df['avg_days_up_user'].fillna(0, inplace=True)
        df['avg_times_up_user'].fillna(0, inplace=True)
        df['n_user_items'].fillna(0, inplace=True)
    predictors += ['avg_days_up_user', 'avg_times_up_user', 'n_user_items']

    #df, timechanged = calcTimeDelta(df, frm, to, predictors)
    #changed |= timechanged

    gc.collect()
    print('\nAll Data shape: {} Rows, {} Columns'.format(*df.shape))

    if 'population' not in df.columns:
        print('merge population')
        population = pd.read_csv('../input/city_population.csv')
        df = df.reset_index().merge(population, how='left',
                                    on='city').set_index('item_id')
        changed = True
    predictors.append('population')

    print("Feature Engineering")
    df["price"] = np.log(df["price"] + 0.001)
    df["price"].fillna(-999, inplace=True)
    df["image_top_1"].fillna(-999, inplace=True)
    df["population"].fillna(df["population"].min(), inplace=True)
    df["population"] = np.log(df["population"] + 0.001)
    df['reg_Population'] = np.log(df["reg_Population"] + 0.001)
    df['reg_dense'] = np.log(df["reg_dense"] + 0.001)

    if 'activation_date' in df.columns:
        df.loc[df['activation_date'] > '2017-04-18',
               'activation_date'] = np.datetime64('2017-04-18')

        print("\nCreate Time Variables")
        dt = df['activation_date'].dt
        df["Weekday"] = dt.weekday.astype(np.uint8)
        #df["Weekd of Year"] = dt.week.astype(np.uint8)
        #df["DayofMonth"] = dt.day.astype(np.uint8)
        #df["DayofYear"] = dt.dayofyear.astype(np.uint16)
        #df["Month"] = dt.month.astype(np.uint8)

        del (dt)
        gc.collect()

    #predictors += ["Weekday", "Weekd of Year", "DayofMonth", "Month", "price", "item_seq_number"]
    predictors += ["Weekday", "price", "item_seq_number"]

    if 'whratio' not in df.columns:
        df_imgatt = pd.read_csv('../input/df_imgatt.csv')
        df = df.reset_index().merge(df_imgatt, how='left',
                                    on=["image"]).set_index('item_id')
        changed = True

    img_atts = [
        'whratio', 'laplacian', 'colorfull', 'brightness', 'median', 'rms',
        'stddev', 'resnet_conf', 'xception_conf', 'inception_conf'
    ]
    predictors += img_atts
    #df, imgchanged = calcImgAtt(df, predictors)
    #changed |= imgchanged

    # Create Validation Index and Remove Dead Variables
    #training_index = df.loc[df.activation_date<=pd.to_datetime('2017-04-07')].index
    #validation_index = df.loc[df.activation_date>=pd.to_datetime('2017-04-08')].index

    print("\nEncode Variables")
    categorical = [
        "user_id",
        "region",
        "city",
        "parent_category_name",
        "category_name",
        "user_type",
        "image_top_1",
        "param_1",
        "param_2",
        "param_3",
        'reg_Time_zone',
    ]
    predictors += categorical
    print("Encoding :", categorical)

    # Encoder:
    lbl = preprocessing.LabelEncoder()
    for col in tqdm(categorical):
        df[col] = df[col].fillna('Unknown')
        df[col] = lbl.fit_transform(df[col].astype(str))
        if col == 'user_id':
            df[col] = df[col].astype(np.uint32)
        else:
            df[col] = df[col].astype(np.uint16)
        #print('max {} {}'.format(col, df[col].max()))

    # Feature Engineering

    count = lambda l1, l2: sum([1 for x in l1 if x in l2])
    count_digit = lambda s: sum(c.isdigit() for c in s)
    count_num = lambda s: sum(c.isnumeric() for c in s.split())

    # Meta Text Features
    df['desc_punc'] = df['description'].apply(
        lambda x: len([c for c in str(x) if c in string.punctuation]))
    textfeats = ["description", "title"]
    for cols in tqdm(textfeats):
        df[cols] = df[cols].fillna('nicapotato').astype(str)  # FILL NA before casting to str
        df[cols] = df[cols].str.lower()  # lowercase so capitalized words are not treated differently
        #df[cols] = df[cols].apply(lambda x: cleanName(x))

        att_name = cols + '_num_chars'
        predictors.append(att_name)
        if att_name not in df.columns:
            df[att_name] = df[cols].apply(len).astype(
                np.uint16)  # Count number of Characters
            changed |= True

        att_name = cols + '_num_words'
        predictors.append(att_name)
        if att_name not in df.columns:
            df[att_name] = df[cols].apply(
                lambda comment: len(comment.split())).astype(
                    np.uint16)  # Count number of Words
            changed |= True

        att_name = cols + '_num_unique_words'
        predictors.append(att_name)
        if att_name not in df.columns:
            df[att_name] = df[cols].apply(
                lambda comment: len(set(w for w in comment.split()))).astype(
                    np.uint16)
            changed |= True

        att_name = cols + '_words_vs_unique'
        predictors.append(att_name)
        if att_name not in df.columns:
            df[att_name] = (df[cols + '_num_unique_words'] /
                            df[cols + '_num_words'] * 100).astype(
                                np.float32)  # Count Unique Words
            changed |= True

        att_name = cols + '_punctuation'
        predictors.append(att_name)
        if att_name not in df.columns:
            df[att_name] = df[cols].apply(
                count, args=(string.punctuation, )).astype(np.uint16)
            changed |= True

        att_name = cols + '_digit'
        predictors.append(att_name)
        if att_name not in df.columns:
            df[att_name] = df[cols].apply(count_digit).astype(np.uint16)
            changed |= True

        att_name = cols + '_num'
        predictors.append(att_name)
        if att_name not in df.columns:
            df[att_name] = df[cols].apply(count_num).astype(np.uint16)
            changed |= True

        att_name = cols + '_num_letters'
        predictors.append(att_name)
        if att_name not in df.columns:
            df[att_name] = df[cols].apply(lambda comment: len(comment)).astype(
                np.uint16)
            changed |= True

    #df['description_num_letters'] = df['description_num_letters'] + 1
    #df['description_num_words'] = df['description_num_words'] + 1
    df['title_desc_len_ratio'] = df['title_num_letters'] / df[
        'description_num_letters']
    df['desc_num_ratio'] = df['description_num'] / df['description_num_words']
    predictors += ['title_desc_len_ratio', 'desc_num_ratio']

    df = parse_att.checkDrop_Bulk(df, ["activation_date", "image"])

    feature_list = [
        (['city', 'category_name', 'param_1'], ['count', 'nunique']),
        (['category_name', 'param_1', 'price'], ['count', 'zscore']),
        (['user_id', 'price'], ['count']),
        (['user_id', 'category_name', 'param_1', 'price'], ['count']),
        (['city', 'category_name', 'param_1', 'price'], ['count', 'zscore']),
        (['category_name', 'param_1', 'param_2',
          'description_num_chars'], ['zscore']),
        (['category_name', 'param_1', 'param_2',
          'description_num_words'], ['zscore']),
        ([
            'category_name', 'param_1', 'param_2',
            'description_num_unique_words'
        ], ['zscore']),
        ([
            'category_name', 'param_1', 'param_2',
            'description_words_vs_unique'
        ], ['zscore']),
        (['category_name', 'param_1', 'param_2',
          'description_punctuation'], ['zscore']),
        (['category_name', 'param_1', 'param_2',
          'description_digit'], ['zscore']),
        (['category_name', 'param_1', 'param_2',
          'description_num'], ['zscore']),
        (['category_name', 'param_1', 'param_2',
          'title_num_chars'], ['zscore']),
        (['category_name', 'param_1', 'param_2',
          'title_num_words'], ['zscore']),
        (['category_name', 'param_1', 'param_2',
          'title_num_unique_words'], ['zscore']),
        (['category_name', 'param_1', 'param_2',
          'title_words_vs_unique'], ['zscore']),
        (['category_name', 'param_1', 'param_2',
          'title_punctuation'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'title_digit'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'title_num'], ['zscore']),
        (['category_name', 'param_1', 'param_2',
          'title_desc_len_ratio'], ['zscore']),
        (['category_name', 'param_1', 'param_2',
          'desc_num_ratio'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'whratio'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'laplacian'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'colorfull'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'brightness'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'median'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'rms'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'stddev'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'resnet_conf'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'xception_conf'], ['zscore']),
        (['category_name', 'param_1', 'param_2',
          'inception_conf'], ['zscore']),
    ]

    for (selcol, how) in tqdm(feature_list):
        print('{} {}'.format(selcol, how))
        df, sub_changed = parse_att.calcGroupFeatureBulk(
            df, selcol, how, frm, to, predictors)
        changed |= sub_changed

    for col in df.columns:
        if 'zscore' in col:
            df[col].fillna(0, inplace=True)
            df[col].replace(np.Inf, 0, inplace=True)
            df[col].replace(-np.Inf, 0, inplace=True)
            df[col] = df[col].clip(lower=-4, upper=4)

    df = kaggle_util.reduce_mem_usage(df)

    if not changed:
        print('df not changed')
    else:
        print('df changed, save...')
        #df.reset_index().to_feather(tot_filename)
        df.reset_index().to_csv(tot_filename)

    old_num = len(predictors)
    predictors = list(set(predictors))
    print('unique feature num from [{}] to [{}]'.format(
        old_num, len(predictors)))
    return y, df, predictors, len_train, categorical, textfeats
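parse_att.calcGroupFeatureBulk is assumed to add group aggregates named after the selected columns; for the 'zscore' entries that means standardizing the last column within the group defined by the preceding ones, which is why the loop afterwards replaces NaN/Inf and clips the result to [-4, 4]. A hedged stand-in for the zscore mode only:

def group_zscore(df, group_cols, value_col):
    # hypothetical stand-in for the 'zscore' mode of parse_att.calcGroupFeatureBulk:
    # standardize value_col within each group defined by group_cols
    grp = df.groupby(group_cols)[value_col]
    name = '_'.join(group_cols + [value_col, 'zscore'])
    df[name] = (df[value_col] - grp.transform('mean')) / grp.transform('std')
    return df, name

# e.g. df, newcol = group_zscore(df, ['category_name', 'param_1'], 'price')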
Example #9
def statistic_features():
    argucnt = 5
    train = kaggle_util.reduce_mem_usage(
        pd.read_csv('../data/train_mel.csv', nrows=nrows))
    test = kaggle_util.reduce_mem_usage(
        pd.read_csv('../data/test_mel.csv', nrows=nrows))
    #y = pd.get_dummies(train.label)
    y_label = np.load('../cache/train_y.npy')
    if DEBUG:
        y_label = y_label[:nrows]

    LABELS = list(train.label.unique())
    n_categories = len(LABELS)

    train = train.drop(['fname', 'label', 'manually_verified'], axis=1)
    feature_names = list(test.drop(['fname', 'label'], axis=1).columns.values)
    test = test.drop(['fname', 'label'], axis=1)

    argucnt = 2
    mixup = 3
    mixup_alpha = 1
    train = train[:len(y_label) * argucnt]

    y = np.copy(y_label)
    for i in range(argucnt):
        y = np.vstack([y, y_label])

    y_label = [np.argmax(row) for row in y_label]
    len_y = len(y_label)
    print(len_y, train.shape)
    #exit()
    PREDICTION_FOLDER = '../result/predictions/lgb'
    if not os.path.exists(PREDICTION_FOLDER):
        os.mkdir(PREDICTION_FOLDER)

    cvscores = []
    skf = StratifiedKFold(y_label, n_folds=nfold)
    for i, (train_split, val_split) in enumerate(skf):
        train_split = np.hstack([(train_split + len_y * k)
                                 for k in range(argucnt)])
        X_train = train.iloc[train_split].values
        y_train = y[train_split]
        #y_train = [np.argmax(row) for row in y[train_split]]

        if mixup > 0:
            x_train_sub = None
            y_train_sub = None
            for j in range(mixup):
                print('mixup round {}/{}'.format(j + 1, mixup))
                x_tmp, y_tmp = mixup_all(X_train, y_train, mixup_alpha)
                x_train_sub = x_tmp if x_train_sub is None else np.vstack(
                    (x_train_sub, x_tmp))
                y_train_sub = y_tmp if y_train_sub is None else np.vstack(
                    (y_train_sub, y_tmp))
            X_train = x_train_sub
            y_train = y_train_sub

        y_train = [np.argmax(row) for row in y_train]

        #exit()
        X_valid = train.iloc[val_split].values
        #y_valid = y[val_split]
        y_valid = [np.argmax(row) for row in y[val_split]]

        print(X_train.shape, X_valid.shape)
        #print(feature_names)

        d_train = lgb.Dataset(X_train,
                              label=y_train,
                              feature_name=feature_names)
        d_valid = lgb.Dataset(X_valid,
                              label=y_valid,
                              feature_name=feature_names)

        params = {
            'boosting_type': 'gbdt',
            'objective': 'multiclass',
            'metric': 'multi_logloss',
            'max_depth': 5,
            'num_leaves': 9,
            'learning_rate': 0.005,
            'feature_fraction': 0.85,
            'bagging_fraction': 0.85,
            'bagging_freq': 2,
            'num_threads': 8,
            'lambda_l2': 1.0,
            'min_gain_to_split': 0,
            'num_class': n_categories
        }

        clf = lgb.train(params,
                        d_train,
                        num_boost_round=nround,
                        valid_sets=d_valid,
                        verbose_eval=200,
                        early_stopping_rounds=100)
        p = clf.predict(X_valid, num_iteration=clf.best_iteration)

        #predictions = [list(np.argsort(p[i])[::-1][:3]) for i in range(len(p))]
        #actual = [[i] for i in y_valid]
        #valid_score = mapk(actual, predictions, k=3)
        valid_score = get_valid_score(y_valid, p)
        print("Score = {:.4f}".format(valid_score))
        cvscores.append(valid_score)

        f, ax = plt.subplots(figsize=[7, 100])
        lgb.plot_importance(clf, max_num_features=200, ax=ax)
        plt.title("Light GBM Feature Importance")
        plt.savefig('feature_import.png', bbox_inches='tight')

        pre_test = clf.predict(test, num_iteration=clf.best_iteration)
        savepath = "/p{}.npy"
        savepath = savepath.format(i)
        np.save(PREDICTION_FOLDER + savepath, pre_test)

    cvmean = np.mean(cvscores)
    cvstd = np.std(cvscores)
    print('mean {0:.3f} std {1:.3f}'.format(cvmean, cvstd))
    actual_prefix = '{:.3f}'.format(cvmean)
    ensemble(LABELS, nfold, [PREDICTION_FOLDER], actual_prefix, 'lgb', False)
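mixup_all is assumed to be standard mixup applied to the tabular feature rows: each synthetic sample is a convex combination of two real rows, with a weight drawn from Beta(mixup_alpha, mixup_alpha), which is why y_train is kept one-hot through the mixup loop and only collapsed with argmax afterwards. A minimal sketch under that assumption:

import numpy as np

def mixup_all(x, y, alpha=1.0):
    # hypothetical mixup for tabular data: blend each row with a random partner row
    lam = np.random.beta(alpha, alpha, size=(len(x), 1))
    perm = np.random.permutation(len(x))
    x_mix = lam * x + (1 - lam) * x[perm]
    y_mix = lam * y + (1 - lam) * y[perm]
    return x_mix, y_mix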
Example #10
    for col in emb_cols:
        dict_emb_max[col] = train[col].max() + 2

    num_cols = get_numcols(train, emb_cols)

    for col in num_cols:
        train[col].fillna(0, inplace=True)
        train[col].replace(np.Inf, 0, inplace=True)
        train[col].replace(-np.Inf, 0, inplace=True)
        if 'zscore' in col:
            train[col] = train[col].clip(lower=-4, upper=4)

    scaler = MinMaxScaler()
    train[num_cols] = scaler.fit_transform(train[num_cols])
    train = kaggle_util.reduce_mem_usage(train)

    print('num_cols:')
    print(num_cols)

    train['title_description'] = (train['title'] + " " +
                                  train['description']).astype(str)
    print("Start Tokenization.....")
    tokenizer = kaggle_util.get_text_tokenizer(train, 'title_description',
                                               max_words_title_description)

    train['seq_description'] = tokenizer.texts_to_sequences(
        train.description.str.lower())
    train['seq_title'] = tokenizer.texts_to_sequences(train.title.str.lower())

    EMBEDDING_FILE1 = '../input/wiki.ru.vec'
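kaggle_util.get_text_tokenizer is assumed to wrap the standard Keras Tokenizer fit on the combined title/description text; a hedged sketch of an equivalent, plus the padding step the integer sequences typically need before an embedding layer:

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

def get_text_tokenizer(df, col, num_words):
    # hypothetical equivalent of kaggle_util.get_text_tokenizer
    tok = Tokenizer(num_words=num_words)
    tok.fit_on_texts(df[col].astype(str).str.lower())
    return tok

# the variable-length sequences are usually padded to fixed widths, e.g.
# X_desc = pad_sequences(train['seq_description'], maxlen=100)
# X_title = pad_sequences(train['seq_title'], maxlen=30)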
Example #11
plt.switch_backend('agg')
import kaggle_util
from util import *
from xgboost import XGBClassifier

DEBUG = 0
nfold = 10
nround = 50000
if DEBUG:
    nfold = 2
    nround = 5

nrows = None if not DEBUG else 1000

if __name__ == "__main__":
    train = kaggle_util.reduce_mem_usage(
        pd.read_csv('../data/train_mel.csv', nrows=nrows))
    test = kaggle_util.reduce_mem_usage(
        pd.read_csv('../data/test_mel.csv', nrows=nrows))
    #y = pd.get_dummies(train.label)
    y = np.load('../cache/train_y.npy')
    if DEBUG:
        y = y[:nrows]

    LABELS = list(train.label.unique())
    n_categories = len(LABELS)

    train = train.drop(['fname', 'label', 'manually_verified'], axis=1)
    feature_names = list(test.drop(['fname', 'label'], axis=1).columns.values)
    test = test.drop(['fname', 'label'], axis=1).values

    #labels = y.columns.values