def lr_with_fs():
    """
    Submission: lr_with_fs_0620_02.csv
    E_val: <missing>
    E_in: 0.856252488379
    E_out: 0.8552577388980213
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rfe = util.fetch(util.cache_path('feature_selection.RFE.21'))
    X_pruned = rfe.transform(X_scaled)

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X_new, y)
    print(auc_score(clf, X_new, y))

    to_submission(Pipeline([('scale_raw', raw_scaler),
                            ('rfe', rfe),
                            ('scale_new', new_scaler),
                            ('lr', clf)]), 'lr_with_fs_0620_02')
def main():
    if os.path.isfile(cache_path(_ARGS.name)):
        if _ARGS.name == 'clean':
            # c, d = load_cache('run/' + _ARGS.name)
            # documents = df['tokens'].to_list()
            # dump_cache((c, d, documents), 'run/' + _ARGS.name)
            pipline(None)
            return
        else:
            df = load_cache(_ARGS.name)
    else:
        if _ARGS.name == 'clean':
            dfs = list()
            for i in range(6):
                path = f"./dev/data/clean{i}_covid19.xlsx"
                if os.path.isfile(cache_path(f'clean{i}')):
                    part = load_cache(f'clean{i}')
                else:
                    part = read(path)
                    dump_cache(part, f'clean{i}')
                dfs.append(part)
            df = concat(dfs, ignore_index=True)
        else:
            path = f"./dev/data/{_ARGS.name}_covid19.xlsx"
            df = read(path)
        dump_cache(df, _ARGS.name)
    # logging.disable(level=logging.INFO)
    pipline(df)
    return
def sgd():
    """
    Submission: sgd_0620_03.csv
    E_val: 0.863628
    E_in: 0.854373
    E_out:
    """
    from sklearn.linear_model import SGDClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import StratifiedKFold

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rfe = util.fetch(util.cache_path('feature_selection.RFE.21'))
    X_pruned = rfe.transform(X_scaled)

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    sgd = SGDClassifier(n_iter=50, n_jobs=-1)
    params = {
        'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge',
                 'perceptron', 'squared_loss', 'huber',
                 'epsilon_insensitive', 'squared_epsilon_insensitive']
    }
    grid = GridSearchCV(sgd, param_grid=params, cv=StratifiedKFold(y, 5),
                        scoring='roc_auc', n_jobs=-1)
    grid.fit(X_new, y)

    logger.debug('Best score (E_val): %f', grid.best_score_)
    sgd = grid.best_estimator_
    logger.debug('E_in: %f', auc_score(sgd, X_new, y))

    to_submission(Pipeline([('scale_raw', raw_scaler),
                            ('rfe', rfe),
                            ('scale_new', new_scaler),
                            ('sgd', sgd)]), 'sgd_0620_03')
def lr():
    """
    Submission: lr_0618.csv
    E_val: <missing>
    E_in: <missing>
    E_out: 0.8119110960575004
    """
    from sklearn.linear_model import LogisticRegressionCV

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X, y)
    print(auc_score(clf, X, y))
    to_submission(clf, 'lr_0618_xxx')
def load_test():
    """
    Load dataset for testing.

    Returns
    -------
    X: numpy ndarray, shape: (num_of_enrollments, num_of_features)
        Rows of features.
    """
    pkl_path = util.cache_path('test_X')
    if os.path.exists(pkl_path):
        X = util.fetch(pkl_path)
    else:
        enroll_set = np.sort(util.load_enrollment_test()['enrollment_id'])
        # log = util.load_logs()
        # base_date = log['time'].max().to_datetime()
        base_date = datetime(2014, 8, 1, 22, 0, 47)
        X = None
        for f in MODELING['features']:
            X_ = f(enroll_set, base_date)
            if X is None:
                X = X_
            else:
                X = np.c_[X, X_]
        util.dump(X, pkl_path)
    return X
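# Note on the feature assembly above: np.c_ stacks the per-feature blocks
# column-wise, so blocks of shape (n, a) and (n, b) become one (n, a + b)
# matrix. A tiny self-contained illustration (not part of the pipeline):
def _feature_concat_example():
    import numpy as np
    a = np.ones((3, 2))
    b = np.zeros((3, 1))
    return np.c_[a, b].shape  # (3, 3)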
def pipline(data: DataFrame):
    if os.path.isfile(cache_path('run/' + _ARGS.name)):
        corpus, dictionary, documents = load_cache('run/' + _ARGS.name)
    elif data is not None:
        documents = data['tokens'].to_list()
        # Create a dictionary representation of the documents.
        dictionary = Dictionary(documents)
        # Filter out words that occur in fewer than 20 documents or in more
        # than 50% of the documents.
        dictionary.filter_extremes(no_below=20, no_above=0.5)
        # Remove stop words.
        bad_ids = [dictionary.token2id[t] for t in STOP_WORDS
                   if t in dictionary.token2id]
        dictionary.filter_tokens(bad_ids=bad_ids)
        # Bag-of-words representation of the documents.
        corpus = [dictionary.doc2bow(doc) for doc in documents]
        dump_cache((corpus, dictionary, documents), 'run/' + _ARGS.name)
    else:
        raise ValueError('Cache does not exist and no data was passed in.')

    _ = dictionary[0]  # This is only to "load" the dictionary.
    output('Number of unique tokens: ', len(dictionary))
    output('Number of documents: ', len(corpus))

    # test = get_model(6, corpus, dictionary.id2token)
    topic_range = tuple(int(s.strip()) for s in _ARGS.range.split(','))
    kwargs = dict(id2word=dictionary.id2token, chunksize=len(corpus),
                  passes=_ARGS.passes, alpha='auto', eta='auto',
                  eval_every=1, iterations=_ARGS.iterations, random_state=123)

    if len(corpus) < 1e6:
        # Train the models in parallel.
        pool = Pool(_ARGS.pool_size)
        result_dict = dict()
        for k in range(*topic_range):
            result_dict[k] = pool.apply_async(get_model, (corpus, k, kwargs))
        result_dict = {k: v.get() for k, v in result_dict.items()}
        pool.close()  # Close the pool once the child processes have finished.
        pool.join()
        output(f"Searched range{topic_range}")
        # The coherence computation is itself multi-process, so it has to run
        # serially here.
        for k, (model, ids) in result_dict.items():
            eval_and_write(data, k, documents, dictionary, corpus, model, ids)
    else:
        # kwargs['alpha'] = 'symmetric'
        kwargs['chunksize'] = len(corpus) // 8 // _ARGS.pool_size + 1
        # kwargs['batch'] = True
        for k in range(*topic_range, 2):  # Use a coarser grid for large data.
            # model = LdaMulticore(corpus, k, workers=_ARGS.pool_size, **kwargs)
            model = LdaModel(corpus, k, **kwargs)
            ids = save_and_inference(model, corpus, k, kwargs['chunksize'])
            # result_dict[k] = (model, ids)  # Not enough memory (~4M sentences).
            eval_and_write(None, k, documents, dictionary, corpus, model, ids)
            del model, ids
            gc.collect()

    output(f"===> {_ARGS.name} complete. \n")
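# For reference, a minimal sketch of how the topic range above is interpreted
# (the "5,50" string is a hypothetical value of _ARGS.range, not taken from
# the original code): the comma-separated argument becomes a (start, stop)
# tuple, which range(*topic_range) expands into the searched topic counts, and
# the large-corpus branch coarsens with an extra step of 2.
def _topic_range_example():
    arg = "5,50"  # hypothetical value of _ARGS.range
    topic_range = tuple(int(s.strip()) for s in arg.split(','))
    ks_small = list(range(*topic_range))     # 5, 6, ..., 49 (small corpus)
    ks_large = list(range(*topic_range, 2))  # 5, 7, ..., 49 (large corpus)
    return ks_small, ks_large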
def dt():
    """
    Submission: dt_0620_05.csv
    E_val: 0.820972
    E_in: 0.835177
    E_out:
    Comment: {'max_depth': 5}
    """
    from sklearn.tree import DecisionTreeClassifier, export_graphviz

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    dt = DecisionTreeClassifier(max_depth=5, class_weight='auto')
    dt.fit(X, y)
    export_graphviz(dt, 'tree.dot')
    logger.debug('E_in: %f', auc_score(dt, X, y))

    to_submission(dt, 'dt_0620_05')
def lr_with_scale():
    """
    Submission: lr_with_scale_0620_04.csv
    E_val: <missing>
    E_in: 0.857351105162
    E_out: 0.854097855439904
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X_scaled, y)
    print(auc_score(clf, X_scaled, y))

    to_submission(Pipeline([('scale_raw', raw_scaler),
                            ('lr', clf)]), 'lr_with_scale_0620_04')
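# A note on the submission pipelines used throughout this module: every step
# is already fitted on the training features, so the assembled Pipeline can be
# used for prediction directly, without calling fit() again. A minimal sketch
# of the assumed usage (to_submission's internals are not shown in this file,
# so this is an illustration, not its actual implementation):
def _predict_with_fitted_pipeline_example(pipeline, X_test):
    # Each transformer's transform() runs in order, then the final
    # estimator's predict_proba() yields the dropout probability.
    return pipeline.predict_proba(X_test)[:, 1]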
def dropout_history(enrollment_set, base_date):
    X_pkl_path = util.cache_path('dropout_history_before_%s' %
                                 base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(X_pkl_path):
        return util.fetch(X_pkl_path)

    logger = logging.getLogger('dropout_history')
    n_proc = par.cpu_count()

    pkl_path = util.cache_path('Dropout_count_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        logger.debug('load from cache')
        Dropout_count = util.fetch(pkl_path)
    else:
        logger.debug('preparing datasets')
        Enroll_all = util.load_enrollments()
        Log = util.load_logs()
        Log = Log[Log['time'] <= base_date]
        Log_enroll_ids = pd.DataFrame(np.unique(Log['enrollment_id']),
                                      columns=['enrollment_id'])
        logger.debug('datasets prepared')

        params = []
        enroll_ids = []
        for i, df in Log.groupby(['enrollment_id']):
            params.append(df)
            enroll_ids.append(i)
        pool = par.Pool(processes=min(n_proc, len(params)))
        enroll_dropout_count = dict(
            zip(enroll_ids, pool.map(__get_dropout_feature__, params)))
        pool.close()
        pool.join()

        enroll_dropout_count = pd.Series(enroll_dropout_count,
                                         name='dropout_count')
        enroll_dropout_count.index.name = 'enrollment_id'
        enroll_dropout_count = enroll_dropout_count.reset_index()

        Enroll_counted = pd.merge(Enroll_all, enroll_dropout_count,
                                  how='left', on=['enrollment_id'])
        Dropout_count = pd.merge(Log_enroll_ids, Enroll_counted, how='left',
                                 on=['enrollment_id'])
        util.dump(Dropout_count, pkl_path)

    Dgb = Dropout_count.groupby('username')
    total_dropout = Dgb.agg({'dropout_count': np.sum}).reset_index()\
        .rename(columns={'dropout_count': 'total_dropout'})
    avg_dropout = Dgb.agg({'dropout_count': np.average}).reset_index()\
        .rename(columns={'dropout_count': 'avg_dropout'})
    drop_courses = Dgb.agg(
        {'dropout_count': lambda x: len([i for i in x if i > 0])})\
        .reset_index().rename(columns={'dropout_count': 'drop_courses'})
    course_count = Dgb.agg({'dropout_count': len}).reset_index()\
        .rename(columns={'dropout_count': 'course_count'})

    Dropout_count = pd.merge(Dropout_count, total_dropout, how='left',
                             on=['username'])
    Dropout_count = pd.merge(Dropout_count, avg_dropout, how='left',
                             on=['username'])
    Dropout_count = pd.merge(Dropout_count, drop_courses, how='left',
                             on=['username'])
    Dropout_count = pd.merge(Dropout_count, course_count, how='left',
                             on=['username'])
    Dropout_count['drop_ratio'] = (Dropout_count['drop_courses'] /
                                   Dropout_count['course_count'])

    Enroll = Enroll_all.set_index('enrollment_id').ix[enrollment_set]\
        .reset_index()
    X = pd.merge(Enroll, Dropout_count, how='left', on=['enrollment_id'])\
        .as_matrix(columns=['dropout_count', 'total_dropout', 'avg_dropout',
                            'drop_courses', 'course_count', 'drop_ratio'])
    logger.debug('dropout history, has nan: %s, shape: %s',
                 np.any(np.isnan(X)), repr(X.shape))

    util.dump(X, X_pkl_path)
    return X
if __name__ == '__main__':
    import glob
    if sys.argv[1] == 'clean':
        cached_files = glob.glob(util.cache_path('train_X*.pkl'))
        cached_files += glob.glob(util.cache_path('train_X*.pklz'))
        cached_files += glob.glob(util.cache_path('train_X*.pkl.gz'))
        cached_files += glob.glob(util.cache_path('train_y*.pkl'))
        cached_files += glob.glob(util.cache_path('train_y*.pklz'))
        cached_files += glob.glob(util.cache_path('train_y*.pkl.gz'))
        cached_files += glob.glob(util.cache_path('test_X*.pkl'))
        cached_files += glob.glob(util.cache_path('test_X*.pklz'))
        cached_files += glob.glob(util.cache_path('test_X*.pkl.gz'))
        for path in cached_files:
            os.remove(path)
    elif sys.argv[1] == 'gen':
        X, y = load_train(cache_only=True)
        print('X.shape: %d x %d' % X.shape)
        print('y.shape: %d' % y.shape)
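# Typical command-line usage of the block above (the module name is only a
# placeholder for however this file is actually invoked):
#   python <this_module>.py clean   # remove all cached train/test feature pickles
#   python <this_module>.py gen     # build and cache features for every period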
def svc_1():
    """
    Submission: svc_1_0620_01.csv
    E_val: 0.866856950449
    E_in: 0.855948
    E_out: 0.8546898189645258
    """
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import LinearSVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFE
    from sklearn.grid_search import RandomizedSearchCV
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.linear_model import LogisticRegression
    from scipy.stats import expon

    logger.debug('svc_1')

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rfe = RFE(estimator=LogisticRegression(class_weight='auto'), step=1,
              n_features_to_select=21)
    rfe.fit(X_scaled, y)
    util.dump(rfe, util.cache_path('feature_selection.RFE.21'))
    X_pruned = rfe.transform(X_scaled)

    logger.debug('Features selected.')

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    svc = LinearSVC(dual=False, class_weight='auto')
    rs = RandomizedSearchCV(svc, n_iter=50, scoring='roc_auc', n_jobs=-1,
                            cv=StratifiedKFold(y, 5),
                            param_distributions={'C': expon()})
    rs.fit(X_new, y)

    logger.debug('Got best SVC.')
    logger.debug('Grid scores: %s', rs.grid_scores_)
    logger.debug('Best score (E_val): %s', rs.best_score_)
    logger.debug('Best params: %s', rs.best_params_)

    svc = rs.best_estimator_
    util.dump(svc, util.cache_path('new_data.SVC'))

    isotonic = CalibratedClassifierCV(svc, cv=StratifiedKFold(y, 5),
                                      method='isotonic')
    isotonic.fit(X_new, y)
    util.dump(isotonic,
              util.cache_path('new_data.CalibratedClassifierCV.isotonic'))
    logger.debug('Got best isotonic CalibratedClassifier.')
    logger.debug('E_in (isotonic): %f', auc_score(isotonic, X_new, y))

    to_submission(Pipeline([('scale_raw', raw_scaler),
                            ('rfe', rfe),
                            ('scale_new', new_scaler),
                            ('svc', isotonic)]), 'svc_1_0620_01')
def load_train(earlist_base_date=None, depth=1, cache_only=False):
    """
    Load dataset for training and validating.

    *NOTE*  If you need a validating set, you SHOULD split from training set
    by yourself.

    Parameters
    ----------
    earlist_base_date: datetime, None by default
        Base date won't be smaller than earlist_base_date.

    depth: int, 1 by default
        Maximum moves of time window.

    cache_only: bool, False by default
        Cache data of every period, do not return full spanned data.

    Returns
    -------
    X: numpy ndarray, shape: (num_of_enrollments, num_of_features)
        Rows of features. It is the features of all time if cache_only is
        True.

    y: numpy ndarray, shape: (num_of_enrollments,)
        Vector of labels. It is the labels of all time if cache_only is True.
    """
    logger = logging.getLogger('load_train')

    enroll_ids = np.sort(util.load_enrollment_train()['enrollment_id'])
    log = util.load_logs()[['enrollment_id', 'time']]
    # base_date = log['time'].max().to_datetime()
    base_date = datetime(2014, 8, 1, 22, 0, 47)

    logger.debug('load features before %s', base_date)

    pkl_X_path = util.cache_path('train_X_before_%s' %
                                 base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    pkl_y_path = util.cache_path('train_y_before_%s' %
                                 base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_X_path) and os.path.exists(pkl_y_path):
        logger.debug('fetch cached')
        X = util.fetch(pkl_X_path)
        y = util.fetch(pkl_y_path)
    else:
        X, _ = __load_dataset__(enroll_ids, log, base_date)
        y_with_id = util.load_val_y()
        if not np.all(y_with_id[:, 0] == enroll_ids):
            logger.fatal('something wrong with enroll_ids')
            raise RuntimeError('something wrong with enroll_ids')
        y = y_with_id[:, 1]

        util.dump(X, pkl_X_path)
        util.dump(y, pkl_y_path)

    # base_date = log['time'].max().to_datetime() - timedelta(days=10)
    base_date = datetime(2014, 7, 22, 22, 0, 47)
    Dw = timedelta(days=7)
    enroll_ids = __enroll_ids_with_log__(enroll_ids, log, base_date)
    for _ in range(depth - 1):
        if enroll_ids.size <= 0:
            break
        if earlist_base_date is not None and base_date < earlist_base_date:
            break

        logger.debug('load features before %s', base_date)

        # get instances and labels
        pkl_X_path = util.cache_path('train_X_before_%s' %
                                     base_date.strftime('%Y-%m-%d_%H-%M-%S'))
        pkl_y_path = util.cache_path('train_y_before_%s' %
                                     base_date.strftime('%Y-%m-%d_%H-%M-%S'))
        if os.path.exists(pkl_X_path) and os.path.exists(pkl_y_path):
            logger.debug('fetch cached')
            X_temp = util.fetch(pkl_X_path)
            y_temp = util.fetch(pkl_y_path)
        else:
            X_temp, y_temp = __load_dataset__(enroll_ids, log, base_date)

            util.dump(X_temp, pkl_X_path)
            util.dump(y_temp, pkl_y_path)

        # update instances and labels
        if not cache_only:
            X = np.r_[X, X_temp]
            y = np.append(y, y_temp)

        # update base_date and enroll_ids
        base_date -= Dw
        enroll_ids = __enroll_ids_with_log__(enroll_ids, log, base_date)

    return X, y
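# The docstring above notes that a validating set has to be split off by the
# caller. A minimal sketch of one way to do that (not part of the original
# pipeline; the 80/20 ratio and the use of train_test_split from the same
# deprecated sklearn module family imported elsewhere in this file are
# assumptions):
def _split_train_val_example():
    from sklearn.cross_validation import train_test_split

    X, y = load_train()
    # Hold out 20% of the spanned training data for validation.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=0)
    return X_train, X_val, y_train, y_val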
def source_event_counter(enrollment_set, base_date):
    """
    Counts the source-event pairs.

    Features
    --------
    """
    X_pkl_path = util.cache_path('source_event_counter_before_%s' %
                                 base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(X_pkl_path):
        return util.fetch(X_pkl_path)

    logger = logging.getLogger('source_event_counter')
    logger.debug('preparing datasets')

    Enroll_all = util.load_enrollments()

    pkl_path = util.cache_path('Log_all_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        Log = util.fetch(pkl_path)
    else:
        Log = util.load_logs()
        Log = Log[Log['time'] <= base_date]
        Log['source_event'] = Log['source'] + '-' + Log['event']
        Log['day_diff'] = (base_date - Log['time']).dt.days
        Log['week_diff'] = Log['day_diff'] // 7
        Log['event_count'] = 1
        util.dump(Log, pkl_path)

    Log_counted = Log.groupby(['enrollment_id', 'source_event', 'week_diff'])\
        .agg({'event_count': np.sum}).reset_index()

    logger.debug('datasets prepared')

    Enroll = Enroll_all.set_index('enrollment_id').ix[enrollment_set]\
        .reset_index()

    n_proc = par.cpu_count()

    pkl_path = util.cache_path('event_count_by_eid_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        event_count_by_eid = util.fetch(pkl_path)
    else:
        params = []
        eids = []
        for eid, df in pd.merge(Enroll_all, Log_counted,
                                on=['enrollment_id'])\
                .groupby(['enrollment_id']):
            params.append(df)
            eids.append(eid)
        pool = par.Pool(processes=min(n_proc, len(params)))
        event_count_by_eid = dict(
            zip(eids, pool.map(__get_counting_feature__, params)))
        pool.close()
        pool.join()
        util.dump(event_count_by_eid, pkl_path)

    X0 = np.array([event_count_by_eid[i] for i in Enroll['enrollment_id']])
    logger.debug('source-event pairs counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X0)), repr(X0.shape))

    pkl_path = util.cache_path('D_full_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        D_full = util.fetch(pkl_path)
    else:
        D_full = pd.merge(Enroll_all, Log, on=['enrollment_id'])
        util.dump(D_full, pkl_path)

    pkl_path = util.cache_path('user_wn_courses_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        user_wn_courses = util.fetch(pkl_path)
    else:
        user_wn_courses = {}
        for u, df in D_full.groupby(['username']):
            x = []
            for wn in __week_span__:
                x.append(len(df[df['week_diff'] == wn]['course_id'].unique()))
            user_wn_courses[u] = x
        util.dump(user_wn_courses, pkl_path)

    X1 = np.array([user_wn_courses[u] for u in Enroll['username']])
    logger.debug('courses by user counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X1)), repr(X1.shape))

    pkl_path = util.cache_path('course_population_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        course_population = util.fetch(pkl_path)
    else:
        course_population = {}
        for c, df in D_full.groupby(['course_id']):
            course_population[c] = len(df['username'].unique())
        util.dump(course_population, pkl_path)

    X2 = np.array([course_population.get(c, 0) for c in Enroll['course_id']])
    logger.debug('course population counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X2)), repr(X2.shape))

    pkl_path = util.cache_path('course_dropout_count_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        course_dropout_count = util.fetch(pkl_path)
    else:
        course_dropout_count = course_population.copy()
        for c, df in D_full[D_full['day_diff'] < 10].groupby(['course_id']):
            course_dropout_count[c] -= len(df['username'].unique())
        util.dump(course_dropout_count, pkl_path)

    X3 = np.array([course_dropout_count.get(c, 0)
                   for c in Enroll['course_id']])
    logger.debug('course dropout counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X3)), repr(X3.shape))

    pkl_path = util.cache_path('user_ops_count_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        user_ops_count = util.fetch(pkl_path)
    else:
        user_ops_on_all_courses = D_full.groupby(
            ['username', 'source_event', 'week_diff'])\
            .agg({'event_count': np.sum}).reset_index()
        params = []
        users = []
        for u, df in user_ops_on_all_courses.groupby(['username']):
            params.append(df)
            users.append(u)
        pool = par.Pool(processes=min(n_proc, len(params)))
        user_ops_count = dict(
            zip(users, pool.map(__get_counting_feature__, params)))
        pool.close()
        pool.join()
        util.dump(user_ops_count, pkl_path)

    X4 = X0 / [user_ops_count[u] for u in Enroll['username']]
    X4[np.isnan(X4)] = 0
    logger.debug('ratio of user ops on all courses, has nan: %s, shape: %s',
                 np.any(np.isnan(X4)), repr(X4.shape))

    pkl_path = util.cache_path('course_ops_count_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        course_ops_count = util.fetch(pkl_path)
    else:
        course_ops_of_all_users = D_full.groupby(
            ['course_id', 'source_event', 'week_diff'])\
            .agg({'event_count': np.sum}).reset_index()
        params = []
        courses = []
        for c, df in course_ops_of_all_users.groupby(['course_id']):
            params.append(df)
            courses.append(c)
        pool = par.Pool(processes=min(n_proc, len(params)))
        course_ops_count = dict(
            zip(courses, pool.map(__get_counting_feature__, params)))
        pool.close()
        pool.join()
        util.dump(course_ops_count, pkl_path)

    X5 = X0 / [course_ops_count[c] for c in Enroll['course_id']]
    X5[np.isnan(X5)] = 0
    logger.debug('ratio of courses ops of all users, has nan: %s, shape: %s',
                 np.any(np.isnan(X5)), repr(X5.shape))

    X6 = np.array([course_dropout_count.get(c, 0) / course_population.get(c, 1)
                   for c in Enroll['course_id']])
    logger.debug('dropout ratio of courses, has nan: %s, shape: %s',
                 np.any(np.isnan(X6)), repr(X6.shape))

    Obj = util.load_object()
    Obj = Obj[Obj['start'] <= base_date]
    course_time = {}
    for c, df in Obj.groupby(['course_id']):
        start_time = np.min(df['start'])
        update_time = np.max(df['start'])
        course_time[c] = [(base_date - start_time).days,
                          (base_date - update_time).days]

    avg_start_days = np.average([t[0] for _, t in course_time.items()])
    avg_update_days = np.average([t[1] for _, t in course_time.items()])
    default_case = [avg_start_days, avg_update_days]

    X7 = np.array([course_time.get(c, default_case)[0]
                   for c in Enroll['course_id']])
    logger.debug('days from course first update, has nan: %s, shape: %s',
                 np.any(np.isnan(X7)), repr(X7.shape))

    X8 = np.array([course_time.get(c, default_case)[1]
                   for c in Enroll['course_id']])
    logger.debug('days from course last update, has nan: %s, shape: %s',
                 np.any(np.isnan(X8)), repr(X8.shape))

    user_ops_time = pd.merge(Enroll, Log, how='left', on=['enrollment_id'])\
        .groupby(['enrollment_id']).agg({'day_diff': [np.min, np.max]})\
        .fillna(0)
    X9 = np.array(user_ops_time['day_diff']['amin'])
    logger.debug('days from user last op, has nan: %s, shape: %s',
                 np.any(np.isnan(X9)), repr(X9.shape))
    X10 = np.array(user_ops_time['day_diff']['amax'])
    logger.debug('days from user first op, has nan: %s, shape: %s',
                 np.any(np.isnan(X10)), repr(X10.shape))

    X11 = X7 - X10
    logger.debug('days from course first update to user first op, '
                 'has nan: %s, shape: %s',
                 np.any(np.isnan(X11)), repr(X11.shape))

    X = np.c_[X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11]

    util.dump(X, X_pkl_path)
    return X
def read(path) -> DataFrame:
    def _clean(row):
        text = URL_REGEX.sub('', row.contents)
        # A forwarded post in the expected format.
        if row.is_forward and '//@' in text:
            if text.startswith('//@'):
                # A bare forward: fall back to the content of the original post.
                try:
                    text = FORWARD_CONTENT.findall(text)[-1]
                    i = FORWARD_SPLIT.match(text).regs[0][1]
                    text = text[i:]
                except IndexError:
                    # TODO: could be handled via the Weibo API instead.
                    text = text.replace('//@', '')
            else:
                # Otherwise keep only the newly added content.
                text = text[:text.find('//@')]
        return text

    temp_name = os.path.basename(path).replace('.xlsx', '')
    if os.path.isfile(cache_path(temp_name)):
        data, texts = load_cache(temp_name)
    else:
        output(f"===> Reading from <{path}>.")
        data: DataFrame = read_excel(path)  # .iloc[:280]
        # Keep only the 4 columns we need, drop missing values, and
        # extract the date.
        data = data[['contents', 'time', 'id', 'is_forward']]\
            .dropna().reset_index()
        data['date'] = data['time'].apply(lambda s: s[:10])
        data['contents'] = data['contents'].astype(str)
        # Pre-process the text.
        texts = data.apply(_clean, axis=1).to_list()
        dump_cache((data, texts), temp_name)
    output(f"===> got {len(data)} rows from <{path}>.")

    # Parse the GPU IDs.
    ltp_ids = [i.strip() for i in _ARGS.ltpIDS.split(',')]
    skep_ids = [i.strip() for i in _ARGS.skepIDS.split(',')]

    # Set up the process pool, manager, and data queues: one process for
    # tokenization, the others for SKEP input preparation and SKEP inference.
    pool = Pool(1 + len(ltp_ids) + len(skep_ids))
    manager = Manager()
    feature_queue = manager.Queue(16 * len(skep_ids))
    result_queue = manager.Queue(16 * len(skep_ids))

    # Kick off the asynchronous tasks.
    pool.apply_async(skep_producer, (feature_queue, texts, 16, len(skep_ids)))
    tokens = dict()
    for i, (s, p) in zip(ltp_ids,
                         generate_batch(texts, len(texts) // len(ltp_ids) + 1)):
        tokens[(s.start, s.stop)] = pool.apply_async(ltp_tokenzier, (p, 192, i))
    for i in skep_ids:
        pool.apply_async(skep_consumer, (feature_queue, result_queue, i))

    # Collect the results.
    scores, counter = zeros(len(texts)), 1
    while True:
        _slice, array = result_queue.get()
        # print(_slice)
        if array is None:
            if counter < len(skep_ids):
                counter += 1
            else:
                break
        else:
            scores[_slice] = array

    data['tokens'] = None
    for s, t in tokens.items():
        data['tokens'].update(Series(t.get(), range(*s)))
    data['sentiment_score'] = scores

    pool.close()
    pool.join()
    return data[['date', 'tokens', 'id', 'sentiment_score']]