def analyze_mel(train_df, test_df):
    print('analyze_mel')
    feature_names = mel_spectral_features(return_fnames=True)
    train_df[feature_names] = train_df['fname'].progress_apply(
        mel_spectral_features, root=train_root_trimmed)
    test_df[feature_names] = test_df['fname'].progress_apply(
        mel_spectral_features, root=test_root_trimmed)

    train_df.to_csv('../data/train_mel.csv', index=False, float_format='%.4f')
    test_df.to_csv('../data/test_mel.csv', index=False, float_format='%.4f')
    return reduce_mem_usage(train_df), reduce_mem_usage(test_df)
def trim(train_df, test_df):
    print('trim')
    split_cols = ['length_int', 'length_final', 'ratio_int', 'ratio_final',
                  'mean_max_splits', 'std_max_splits', 'mean_len_splits',
                  'std_len_splits', 'cnt_splits']

    train_df[split_cols] = \
        train_df['fname'].progress_apply(trim_silence, root=train_root)
    train_df.to_csv('../data/train_2.csv', index=False, float_format='%.4f')

    test_df[split_cols] = \
        test_df['fname'].progress_apply(trim_silence, root=test_root)
    test_df.to_csv('../data/test_2.csv', index=False, float_format='%.4f')
    return reduce_mem_usage(train_df), reduce_mem_usage(test_df)
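# trim_silence is defined elsewhere in the repo; the sketch below is only a
# hedged illustration (not the author's implementation) of how such a helper
# could be built on librosa.effects.trim / librosa.effects.split. The top_db
# threshold and the exact meaning of the returned fields are assumptions.
import os
import numpy as np
import pandas as pd
import librosa

def trim_silence_sketch(fname, root, top_db=40):
    y, sr = librosa.load(os.path.join(root, fname), sr=None)
    n = max(len(y), 1)
    # Length after trimming leading/trailing silence only ("intermediate").
    y_int, _ = librosa.effects.trim(y, top_db=top_db)
    # Non-silent intervals across the whole clip ("final").
    intervals = librosa.effects.split(y, top_db=top_db)
    lengths = [end - start for start, end in intervals]
    maxima = [np.abs(y[start:end]).max() for start, end in intervals]
    length_final = int(np.sum(lengths))
    return pd.Series({
        'length_int': len(y_int),
        'length_final': length_final,
        'ratio_int': len(y_int) / n,
        'ratio_final': length_final / n,
        'mean_max_splits': float(np.mean(maxima)) if maxima else 0.0,
        'std_max_splits': float(np.std(maxima)) if maxima else 0.0,
        'mean_len_splits': float(np.mean(lengths)) if lengths else 0.0,
        'std_len_splits': float(np.std(lengths)) if lengths else 0.0,
        'cnt_splits': len(intervals),
    })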
def extract_segment_feature(train, test):
    train_files = train.fname.values
    train_features = extract_features(train_files, train_root_trimmed)
    test_files = test.fname.values
    test_features = extract_features(test_files, test_root_trimmed)

    train = train.merge(train_features, on='fname', how='left')
    test = test.merge(test_features, on='fname', how='left')

    train.to_csv('../data/train_seg.csv', index=False, float_format='%.4f')
    test.to_csv('../data/test_seg.csv', index=False, float_format='%.4f')
    return reduce_mem_usage(train), reduce_mem_usage(test)
def get_crossvalid_data(frm, to):
    #test_path = '../input/test.csv'
    #train_path = '../input/train.csv'
    test_path = '../input/imgtop_test.csv'
    train_path = '../input/imgtop_train.csv'

    testing = pd.read_csv(test_path,
                          skiprows=range(1, frm),
                          nrows=to - frm,
                          index_col="item_id",
                          parse_dates=["activation_date"])
    testdex = testing.index
    len_test = len(testing)

    tot_filename = '/media/extend/cache/total_{}_{}.csv'.format(frm, to)
    tot_yname = '/media/extend/cache/total_y_{}_{}.csv'.format(frm, to)

    if os.path.exists(tot_filename) and os.path.exists(tot_yname):
        print('load from cache')
        #df = pd.read_feather(tot_filename).set_index("item_id")
        #y = pd.read_feather(tot_yname).set_index("item_id").deal_probability.copy()
        df = pd.read_csv(tot_filename).set_index("item_id")
        y = pd.read_csv(tot_yname).set_index("item_id").deal_probability.copy()
        len_train = to - frm
    else:
        training = pd.read_csv(train_path,
                               skiprows=range(1, frm),
                               nrows=to - frm,
                               index_col="item_id",
                               parse_dates=["activation_date"])
        len_train = len(training)
        y = training.deal_probability.copy()
        training.drop("deal_probability", axis=1, inplace=True)
        #y.reset_index().to_feather(tot_yname)
        y.reset_index().to_csv(tot_yname)

        print('Train shape: {} Rows, {} Columns'.format(*training.shape))
        print('Test shape: {} Rows, {} Columns'.format(*testing.shape))

        df = pd.concat([training, testing], axis=0)
        del training, testing

    predictors = []
    y, df, ready_df, tfvocab, predictors, len_train, categorical = \
        preparTotalData(y, df, predictors, len_train, len_test, frm, to, tot_filename)

    #none_categorical = [x for x in df.columns if x not in categorical]
    df = df[predictors]
    df = kaggle_util.reduce_mem_usage(df)
    print(df.info())

    tfvocab = df.columns.tolist() + tfvocab
    testing = hstack([csr_matrix(df[len_train:].values), ready_df[len_train:]])

    return df, y, testing, ready_df, tfvocab, predictors, len_train, categorical, tfvocab, testdex
def basic_analyze(train_df, test_df, train_root, test_root):
    print('basic_analyze')
    stat_cols = ['length', 'data_mean', 'data_min', 'data_max', 'data_std',
                 'data_rms', 'skewness', 'kurtosis']
    train_df[stat_cols] = train_df['fname'].progress_apply(wavfile_stats, root=train_root)
    test_df[stat_cols] = test_df['fname'].progress_apply(wavfile_stats, root=test_root)

    train_df['rms_std'] = train_df['data_rms'] / train_df['data_std']
    test_df['rms_std'] = test_df['data_rms'] / test_df['data_std']
    train_df['max_min'] = train_df['data_max'] / train_df['data_min']
    test_df['max_min'] = test_df['data_max'] / test_df['data_min']

    train_df.to_csv('../data/train_1.csv', index=False, float_format='%.4f')
    test_df.to_csv('../data/test_1.csv', index=False, float_format='%.4f')
    return reduce_mem_usage(train_df), reduce_mem_usage(test_df)
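# wavfile_stats lives elsewhere in the repo; this is only a hedged sketch of how
# the per-clip statistics used above could be computed with scipy. The int16
# normalization and the exact field definitions are assumptions.
import os
import numpy as np
import pandas as pd
from scipy.io import wavfile
from scipy.stats import skew, kurtosis

def wavfile_stats_sketch(fname, root):
    # Read raw PCM samples and normalize assuming 16-bit audio.
    rate, data = wavfile.read(os.path.join(root, fname))
    data = data.astype(np.float32) / 32768.0
    return pd.Series({
        'length': len(data),
        'data_mean': data.mean(),
        'data_min': data.min(),
        'data_max': data.max(),
        'data_std': data.std(),
        'data_rms': np.sqrt(np.mean(data ** 2)),
        'skewness': skew(data),
        'kurtosis': kurtosis(data),
    })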
def keras_train_transform(dataset):
    print('transform...')
    dataset, txt_stats = deal_text_feature(dataset)

    for key in dict_encoder.keys():
        #print(key)
        dataset[key] = dict_encoder[key].transform(dataset[key])

    dataset = kaggle_util.reduce_mem_usage(dataset)
    print("Transform completed.")
    dataset = num_log(dataset)
    return dataset, txt_stats
def preparTotalData(y, df, predictors, len_train, len_test, frm, to, tot_filename):
    y, df, predictors, len_train, categorical, textfeats = preparBaseData(
        y, df, predictors, len_train, len_test, frm, to, tot_filename)

    print("\n[TF-IDF] Term Frequency Inverse Document Frequency Stage")
    russian_stop = set(stopwords.words('russian'))
    tfidf_para = {
        "stop_words": russian_stop,
        "analyzer": 'word',
        "token_pattern": r'\w{1,}',
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": 'l2',
        #"min_df": 5,
        #"max_df": .9,
        "smooth_idf": False
    }

    def get_col(col_name):
        return lambda x: x[col_name]

    vectorizer = FeatureUnion([
        ('description',
         TfidfVectorizer(ngram_range=(1, 2),
                         max_features=17000,
                         **tfidf_para,
                         preprocessor=get_col('description'))),
        ('title',
         TfidfVectorizer(ngram_range=(1, 2),
                         **tfidf_para,
                         #max_features=7000,
                         preprocessor=get_col('title')))
    ])

    start_vect = time.time()
    #vectorizer.fit(df.loc[traindex, :].to_dict('records'))
    vectorizer.fit(df[:len_train].to_dict('records'))
    ready_df = vectorizer.transform(df.to_dict('records'))
    tfvocab = vectorizer.get_feature_names()
    print("Vectorization Runtime: %0.2f Minutes" % ((time.time() - start_vect) / 60))

    # Drop text columns now that they are vectorized.
    df.drop(textfeats, axis=1, inplace=True)

    #from sklearn.metrics import mean_squared_error
    from math import sqrt

    kf = KFold(len_train, n_folds=NFOLDS, shuffle=True, random_state=SEED)
    ridge_params = {
        'alpha': 30.0,
        'fit_intercept': True,
        'normalize': False,
        'copy_X': True,
        'max_iter': None,
        'tol': 0.001,
        'solver': 'auto',
        'random_state': SEED
    }
    ridge = SklearnWrapper(clf=Ridge, seed=SEED, params=ridge_params)
    ridge_oof_train, ridge_oof_test = get_oof(ridge, ready_df[:len_train], y,
                                              ready_df[len_train:], len_train,
                                              len_test, kf)
    #rms = sqrt(mean_squared_error(y, ridge_oof_train))

    ridge_preds = np.concatenate([ridge_oof_train, ridge_oof_test])
    df['ridge_preds'] = ridge_preds
    predictors.append('ridge_preds')

    df = kaggle_util.reduce_mem_usage(df)
    return y, df, ready_df, tfvocab, predictors, len_train, categorical
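# SklearnWrapper and get_oof are defined elsewhere in the repo; the sketch below
# (using the modern sklearn.model_selection.KFold API instead of the legacy
# cross_validation.KFold used above) only illustrates the standard out-of-fold
# stacking pattern behind ridge_preds: every training row is predicted by a
# model that never saw it, and test rows receive the average over folds.
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

def get_oof_sketch(x_train, y_train, x_test, n_folds=5, seed=42):
    y_train = np.asarray(y_train)
    oof_train = np.zeros(x_train.shape[0])
    oof_test_folds = np.zeros((n_folds, x_test.shape[0]))
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for fold, (trn_idx, val_idx) in enumerate(kf.split(np.arange(x_train.shape[0]))):
        clf = Ridge(alpha=30.0, random_state=seed)
        clf.fit(x_train[trn_idx], y_train[trn_idx])
        oof_train[val_idx] = clf.predict(x_train[val_idx])  # out-of-fold train preds
        oof_test_folds[fold] = clf.predict(x_test)          # per-fold test preds
    return oof_train, oof_test_folds.mean(axis=0)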
def preparBaseData(y, df, predictors, len_train, len_test, frm, to, tot_filename):
    changed = False

    """
    geo_cols = ['latitude', 'longitude',
                'lat_lon_hdbscan_cluster_05_03',
                'lat_lon_hdbscan_cluster_10_03',
                'lat_lon_hdbscan_cluster_20_03']
    """
    geo_cols = ['latitude', 'longitude']
    if 'longitude' not in df.columns:
        geo = pd.read_csv('../input/avito_region_city_features.csv')[
            geo_cols + ['region', 'city']]
        df = df.reset_index().merge(geo, how='left',
                                    on=["region", "city"]).set_index('item_id')
        changed = True
    predictors += geo_cols

    if 'reg_dense' not in df.columns:
        regional = pd.read_csv('../input/regional.csv', index_col=0)
        regional.index = regional.index.str.lower()
        df['region'] = df['region'].apply(lambda x: region_map[x])
        df['region'] = df['region'].str.lower()
        df["reg_dense"] = df['region'].apply(
            lambda x: regional.loc[x, "Density_of_region(km2)"])
        df["rural"] = df['region'].apply(lambda x: regional.loc[x, "Rural_%"])
        df["reg_Time_zone"] = df['region'].apply(
            lambda x: regional.loc[x, "Time_zone"])
        df["reg_Population"] = df['region'].apply(
            lambda x: regional.loc[x, "Total_population"])
        df["reg_Urban"] = df['region'].apply(
            lambda x: regional.loc[x, "Urban%"])
        changed = True
    predictors += ['reg_dense', 'rural', 'reg_Time_zone', 'reg_Population', 'reg_Urban']

    if 'avg_days_up_user' not in df.columns:
        train_features = pd.read_csv('../input/aggregated_features.csv')
        df = df.reset_index().merge(train_features, on=['user_id'],
                                    how='left').set_index('item_id')
        df['avg_days_up_user'].fillna(0, inplace=True)
        df['avg_times_up_user'].fillna(0, inplace=True)
        df['n_user_items'].fillna(0, inplace=True)
    predictors += ['avg_days_up_user', 'avg_times_up_user', 'n_user_items']

    #df, timechanged = calcTimeDelta(df, frm, to, predictors)
    #changed |= timechanged

    gc.collect()
    print('\nAll Data shape: {} Rows, {} Columns'.format(*df.shape))

    if 'population' not in df.columns:
        print('merge population')
        population = pd.read_csv('../input/city_population.csv')
        df = df.reset_index().merge(population, how='left',
                                    on='city').set_index('item_id')
        changed = True
    predictors.append('population')

    print("Feature Engineering")
    df["price"] = np.log(df["price"] + 0.001)
    df["price"].fillna(-999, inplace=True)
    df["image_top_1"].fillna(-999, inplace=True)
    df["population"].fillna(df["population"].min(), inplace=True)
    df["population"] = np.log(df["population"] + 0.001)
    df['reg_Population'] = np.log(df["reg_Population"] + 0.001)
    df['reg_dense'] = np.log(df["reg_dense"] + 0.001)

    if 'activation_date' in df.columns:
        df.loc[df['activation_date'] > '2017-04-18',
               'activation_date'] = np.datetime64('2017-04-18')

        print("\nCreate Time Variables")
        dt = df['activation_date'].dt
        df["Weekday"] = dt.weekday.astype(np.uint8)
        #df["Weekd of Year"] = dt.week.astype(np.uint8)
        #df["DayofMonth"] = dt.day.astype(np.uint8)
        #df["DayofYear"] = dt.dayofyear.astype(np.uint16)
        #df["Month"] = dt.month.astype(np.uint8)
        del dt
        gc.collect()

    #predictors += ["Weekday", "Weekd of Year", "DayofMonth", "Month", "price", "item_seq_number"]
    predictors += ["Weekday", "price", "item_seq_number"]

    if 'whratio' not in df.columns:
        df_imgatt = pd.read_csv('../input/df_imgatt.csv')
        df = df.reset_index().merge(df_imgatt, how='left',
                                    on=["image"]).set_index('item_id')
        changed = True
    img_atts = ['whratio', 'laplacian', 'colorfull', 'brightness', 'median',
                'rms', 'stddev', 'resnet_conf', 'xception_conf', 'inception_conf']
    predictors += img_atts

    #df, imgchanged = calcImgAtt(df, predictors)
    #changed |= imgchanged

    # Create Validation Index and Remove Dead Variables
    #training_index = df.loc[df.activation_date <= pd.to_datetime('2017-04-07')].index
    #validation_index = df.loc[df.activation_date >= pd.to_datetime('2017-04-08')].index

    print("\nEncode Variables")
    categorical = [
        "user_id", "region", "city", "parent_category_name", "category_name",
        "user_type", "image_top_1", "param_1", "param_2", "param_3",
        'reg_Time_zone',
    ]
    predictors += categorical
    print("Encoding :", categorical)

    # Encoder:
    lbl = preprocessing.LabelEncoder()
    for col in tqdm(categorical):
        df[col] = df[col].fillna('Unknown')
        df[col] = lbl.fit_transform(df[col].astype(str))
        if col == 'user_id':
            df[col] = df[col].astype(np.uint32)
        else:
            df[col] = df[col].astype(np.uint16)
        #print('max {} {}'.format(col, df[col].max()))

    # Feature Engineering
    count = lambda l1, l2: sum([1 for x in l1 if x in l2])
    count_digit = lambda s: sum(c.isdigit() for c in s)
    count_num = lambda s: sum(c.isnumeric() for c in s.split())

    # Meta Text Features
    df['desc_punc'] = df['description'].apply(
        lambda x: len([c for c in str(x) if c in string.punctuation]))

    textfeats = ["description", "title"]
    for cols in tqdm(textfeats):
        df[cols] = df[cols].astype(str).fillna('nicapotato')  # fill NA
        df[cols] = df[cols].str.lower()  # lowercase so capitalized words aren't treated differently
        #df[cols] = df[cols].apply(lambda x: cleanName(x))

        att_name = cols + '_num_chars'
        predictors.append(att_name)
        if att_name not in df.columns:
            df[att_name] = df[cols].apply(len).astype(np.uint16)  # number of characters
            changed |= True

        att_name = cols + '_num_words'
        predictors.append(att_name)
        if att_name not in df.columns:
            df[att_name] = df[cols].apply(
                lambda comment: len(comment.split())).astype(np.uint16)  # number of words
            changed |= True

        att_name = cols + '_num_unique_words'
        predictors.append(att_name)
        if att_name not in df.columns:
            df[att_name] = df[cols].apply(
                lambda comment: len(set(w for w in comment.split()))).astype(np.uint16)
            changed |= True

        att_name = cols + '_words_vs_unique'
        predictors.append(att_name)
        if att_name not in df.columns:
            df[att_name] = (df[cols + '_num_unique_words'] /
                            df[cols + '_num_words'] * 100).astype(np.float32)  # share of unique words
            changed |= True

        att_name = cols + '_punctuation'
        predictors.append(att_name)
        if att_name not in df.columns:
            df[att_name] = df[cols].apply(
                count, args=(string.punctuation,)).astype(np.uint16)
            changed |= True

        att_name = cols + '_digit'
        predictors.append(att_name)
        if att_name not in df.columns:
            df[att_name] = df[cols].apply(count_digit).astype(np.uint16)
            changed |= True

        att_name = cols + '_num'
        predictors.append(att_name)
        if att_name not in df.columns:
            df[att_name] = df[cols].apply(count_num).astype(np.uint16)
            changed |= True

        att_name = cols + '_num_letters'
        predictors.append(att_name)
        if att_name not in df.columns:
            df[att_name] = df[cols].apply(lambda comment: len(comment)).astype(np.uint16)
            changed |= True

    #df['description_num_letters'] = df['description_num_letters'] + 1
    #df['description_num_words'] = df['description_num_words'] + 1

    df['title_desc_len_ratio'] = df['title_num_letters'] / df['description_num_letters']
    df['desc_num_ratio'] = df['description_num'] / df['description_num_words']
    predictors += ['title_desc_len_ratio', 'desc_num_ratio']

    df = parse_att.checkDrop_Bulk(df, ["activation_date", "image"])

    feature_list = [
        (['city', 'category_name', 'param_1'], ['count', 'nunique']),
        (['category_name', 'param_1', 'price'], ['count', 'zscore']),
        (['user_id', 'price'], ['count']),
        (['user_id', 'category_name', 'param_1', 'price'], ['count']),
        (['city', 'category_name', 'param_1', 'price'], ['count', 'zscore']),
        (['category_name', 'param_1', 'param_2', 'description_num_chars'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'description_num_words'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'description_num_unique_words'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'description_words_vs_unique'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'description_punctuation'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'description_digit'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'description_num'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'title_num_chars'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'title_num_words'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'title_num_unique_words'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'title_words_vs_unique'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'title_punctuation'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'title_digit'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'title_num'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'title_desc_len_ratio'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'desc_num_ratio'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'whratio'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'laplacian'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'colorfull'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'brightness'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'median'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'rms'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'stddev'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'resnet_conf'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'xception_conf'], ['zscore']),
        (['category_name', 'param_1', 'param_2', 'inception_conf'], ['zscore']),
    ]
    for (selcol, how) in tqdm(feature_list):
        print('{} {}'.format(selcol, how))
        df, sub_changed = parse_att.calcGroupFeatureBulk(
            df, selcol, how, frm, to, predictors)
        changed |= sub_changed

    # Replace missing/infinite group z-scores and clip them to [-4, 4].
    for col in df.columns:
        if 'zscore' in col:
            df[col].fillna(0, inplace=True)
            df[col].replace(np.Inf, 0, inplace=True)
            df[col].replace(-np.Inf, 0, inplace=True)
            df.loc[df[col] < -4, col] = -4
            df.loc[df[col] > 4, col] = 4

    df = kaggle_util.reduce_mem_usage(df)

    if not changed:
        print('df not changed')
    else:
        print('df changed, save...')
        #df.reset_index().to_feather(tot_filename)
        df.reset_index().to_csv(tot_filename)

    old_num = len(predictors)
    predictors = list(set(predictors))
    print('unique feature num from [{}] to [{}]'.format(old_num, len(predictors)))

    return y, df, predictors, len_train, categorical, textfeats
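# parse_att.calcGroupFeatureBulk is defined elsewhere; the sketch below only
# illustrates the kind of group statistic it appears to produce for one
# (group columns, target column) pair with the 'zscore' option: the target is
# standardized within each group. Column naming and exact behavior are assumptions.
import numpy as np
import pandas as pd

def group_zscore_sketch(df, selcol):
    group_cols, target = selcol[:-1], selcol[-1]
    name = '_'.join(selcol) + '_zscore'
    grp = df.groupby(group_cols)[target]
    # Degenerate groups yield NaN, which the caller replaces with 0 and clips to [-4, 4].
    df[name] = (df[target] - grp.transform('mean')) / grp.transform('std')
    return df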
def statistic_features():
    argucnt = 5
    train = kaggle_util.reduce_mem_usage(
        pd.read_csv('../data/train_mel.csv', nrows=nrows))
    test = kaggle_util.reduce_mem_usage(
        pd.read_csv('../data/test_mel.csv', nrows=nrows))

    #y = pd.get_dummies(train.label)
    y_label = np.load('../cache/train_y.npy')
    if DEBUG:
        y_label = y_label[:nrows]

    LABELS = list(train.label.unique())
    n_categories = len(LABELS)

    train = train.drop(['fname', 'label', 'manually_verified'], axis=1)
    feature_names = list(test.drop(['fname', 'label'], axis=1).columns.values)
    test = test.drop(['fname', 'label'], axis=1)

    argucnt = 2
    mixup = 3
    mixup_alpha = 1

    train = train[:len(y_label) * argucnt]
    y = np.copy(y_label)
    for i in range(argucnt):
        y = np.vstack([y, y_label])

    y_label = [np.argmax(row) for row in y_label]
    len_y = len(y_label)
    print(len_y, train.shape)
    #exit()

    PREDICTION_FOLDER = '../result/predictions/lgb'
    if not os.path.exists(PREDICTION_FOLDER):
        os.mkdir(PREDICTION_FOLDER)

    cvscores = []
    skf = StratifiedKFold(y_label, n_folds=nfold)
    for i, (train_split, val_split) in enumerate(skf):
        # Replicate the fold indices for each augmented copy of the training set.
        train_split = np.hstack([(train_split + len_y * rep) for rep in range(argucnt)])
        X_train = train.iloc[train_split].values
        y_train = y[train_split]
        #y_train = [np.argmax(row) for row in y[train_split]]

        if mixup > 0:
            x_train_sub = None
            y_train_sub = None
            for j in range(mixup):
                print('mixup', mixup)
                x_tmp, y_tmp = mixup_all(X_train, y_train, mixup_alpha)
                x_train_sub = x_tmp if x_train_sub is None else np.vstack(
                    (x_train_sub, x_tmp))
                y_train_sub = y_tmp if y_train_sub is None else np.vstack(
                    (y_train_sub, y_tmp))
            X_train = x_train_sub
            y_train = y_train_sub

        y_train = [np.argmax(row) for row in y_train]
        #exit()

        X_valid = train.iloc[val_split].values
        #y_valid = y[val_split]
        y_valid = [np.argmax(row) for row in y[val_split]]
        print(X_train.shape, X_valid.shape)
        #print(feature_names)

        d_train = lgb.Dataset(X_train, label=y_train, feature_name=feature_names)
        d_valid = lgb.Dataset(X_valid, label=y_valid, feature_name=feature_names)

        params = {
            'boosting_type': 'gbdt',
            'objective': 'multiclass',
            'metric': 'multi_logloss',
            'max_depth': 5,
            'num_leaves': 9,
            'learning_rate': 0.005,
            'feature_fraction': 0.85,
            'bagging_fraction': 0.85,
            'bagging_freq': 2,
            'num_threads': 8,
            'lambda_l2': 1.0,
            'min_gain_to_split': 0,
            'num_class': n_categories
        }
        clf = lgb.train(params,
                        d_train,
                        num_boost_round=nround,
                        valid_sets=d_valid,
                        verbose_eval=200,
                        early_stopping_rounds=100)

        p = clf.predict(X_valid, num_iteration=clf.best_iteration)
        #predictions = [list(np.argsort(p[i])[::-1][:3]) for i in range(len(p))]
        #actual = [[i] for i in y_valid]
        #valid_score = mapk(actual, predictions, k=3)
        valid_score = get_valid_score(y_valid, p)
        print("Score = {:.4f}".format(valid_score))
        cvscores.append(valid_score)

        f, ax = plt.subplots(figsize=[7, 100])
        lgb.plot_importance(clf, max_num_features=200, ax=ax)
        plt.title("Light GBM Feature Importance")
        plt.savefig('feature_import.png', bbox_inches='tight')

        pre_test = clf.predict(test, num_iteration=clf.best_iteration)
        savepath = "/p{}.npy".format(i)
        np.save(PREDICTION_FOLDER + savepath, pre_test)

    cvmean = np.mean(cvscores)
    cvstd = np.std(cvscores)
    print('mean {0:.3f} std {1:.3f}'.format(cvmean, cvstd))

    actual_prefix = '{:.3f}'.format(cvmean)
    ensemble(LABELS, nfold, [PREDICTION_FOLDER], actual_prefix, 'lgb', False)
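# mixup_all is defined elsewhere in the repo; this sketch only illustrates the
# standard mixup augmentation it appears to apply to the tabular features:
# convex combinations of shuffled sample pairs with weights drawn from a
# Beta(alpha, alpha) distribution. The signature and details are assumptions.
import numpy as np

def mixup_all_sketch(x, y, alpha=1.0):
    lam = np.random.beta(alpha, alpha, size=len(x))
    perm = np.random.permutation(len(x))
    x_mix = lam[:, None] * x + (1.0 - lam)[:, None] * x[perm]
    y_mix = lam[:, None] * y + (1.0 - lam)[:, None] * y[perm]  # soft one-hot labels
    return x_mix, y_mix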
for col in emb_cols:
    dict_emb_max[col] = train[col].max() + 2

num_cols = get_numcols(train, emb_cols)
for col in num_cols:
    train[col].fillna(0, inplace=True)
    train[col].replace(np.Inf, 0, inplace=True)
    train[col].replace(-np.Inf, 0, inplace=True)
    if 'zscore' in col:
        train.loc[train[col] < -4, col] = -4
        train.loc[train[col] > 4, col] = 4

scaler = MinMaxScaler()
train[num_cols] = scaler.fit_transform(train[num_cols])
train = kaggle_util.reduce_mem_usage(train)
print('num_cols:')
print(num_cols)

train['title_description'] = (train['title'] + " " + train['description']).astype(str)

print("Start Tokenization.....")
tokenizer = kaggle_util.get_text_tokenizer(train, 'title_description',
                                           max_words_title_description)
train['seq_description'] = tokenizer.texts_to_sequences(
    train.description.str.lower())
train['seq_title'] = tokenizer.texts_to_sequences(train.title.str.lower())

EMBEDDING_FILE1 = '../input/wiki.ru.vec'
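# kaggle_util.get_text_tokenizer is defined elsewhere; below is only a minimal
# sketch of what such a helper could look like with the Keras Tokenizer API.
# The column/limit arguments mirror the call above, but the real implementation
# may differ.
from keras.preprocessing.text import Tokenizer

def get_text_tokenizer_sketch(df, col, num_words):
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(df[col].astype(str).str.lower())
    return tokenizer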
import matplotlib.pyplot as plt
plt.switch_backend('agg')

import numpy as np
import pandas as pd

import kaggle_util
from util import *
from xgboost import XGBClassifier

DEBUG = 0
nfold = 10
nround = 50000
if DEBUG:
    nfold = 2
    nround = 5
nrows = None if not DEBUG else 1000

if __name__ == "__main__":
    train = kaggle_util.reduce_mem_usage(
        pd.read_csv('../data/train_mel.csv', nrows=nrows))
    test = kaggle_util.reduce_mem_usage(
        pd.read_csv('../data/test_mel.csv', nrows=nrows))

    #y = pd.get_dummies(train.label)
    y = np.load('../cache/train_y.npy')
    if DEBUG:
        y = y[:nrows]

    LABELS = list(train.label.unique())
    n_categories = len(LABELS)

    train = train.drop(['fname', 'label', 'manually_verified'], axis=1)
    feature_names = list(test.drop(['fname', 'label'], axis=1).columns.values)
    test = test.drop(['fname', 'label'], axis=1).values
    #labels = y.columns.values