def lr_with_fs():
    """
    Fit a cross-validated logistic regression on RFE-pruned features.

    The cached training matrix is standardized, reduced with the cached RFE
    selector, standardized again and fed to ``LogisticRegressionCV``
    (10 folds, ROC AUC scoring).  The in-sample AUC is printed and the
    fitted preprocessing + classifier chain is passed to ``to_submission``.

    Submission: lr_with_fs_0620_02.csv
    E_val: <missing>
    E_in: 0.856252488379
    E_out: 0.8552577388980213
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    train_X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    train_y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    # First standardization: on the raw feature matrix.
    raw_scaler = StandardScaler()
    scaled = raw_scaler.fit(train_X).transform(train_X)

    # Feature pruning with the previously cached selector.
    rfe = util.fetch(util.cache_path('feature_selection.RFE.21'))
    pruned = rfe.transform(scaled)

    # Second standardization: on the surviving columns only.
    new_scaler = StandardScaler()
    prepared = new_scaler.fit(pruned).transform(pruned)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(prepared, train_y)
    print(auc_score(clf, prepared, train_y))

    steps = [
        ('scale_raw', raw_scaler),
        ('rfe', rfe),
        ('scale_new', new_scaler),
        ('lr', clf),
    ]
    to_submission(Pipeline(steps), 'lr_with_fs_0620_02')
def lr_with_fs():
    """
    Logistic regression with feature selection.

    Pipeline: StandardScaler -> cached RFE selector -> StandardScaler ->
    LogisticRegressionCV (cv=10, scoring='roc_auc').  Prints the in-sample
    AUC and submits the fitted pipeline.

    Submission: lr_with_fs_0620_02.csv
    E_val: <missing>
    E_in: 0.856252488379
    E_out: 0.8552577388980213
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    features = util.fetch(
        util.cache_path('train_X_before_2014-08-01_22-00-47'))
    labels = util.fetch(
        util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(features)
    features_scaled = raw_scaler.transform(features)

    rfe = util.fetch(util.cache_path('feature_selection.RFE.21'))
    features_pruned = rfe.transform(features_scaled)

    new_scaler = StandardScaler()
    new_scaler.fit(features_pruned)
    features_final = new_scaler.transform(features_pruned)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(features_final, labels)
    print(auc_score(clf, features_final, labels))

    pipeline = Pipeline([('scale_raw', raw_scaler),
                         ('rfe', rfe),
                         ('scale_new', new_scaler),
                         ('lr', clf)])
    to_submission(pipeline, 'lr_with_fs_0620_02')
def send_sms(phone, text):
    """Send `text` as an SMS to `phone` through the sms.ru HTTP endpoint.

    The text is UTF-8 encoded before sending.  A ``from`` field is added to
    the request only when ``config`` defines ``SMS_FROM``.
    """
    payload = dict(api_id=config.SMS_ID, to=phone, text=text.encode('utf-8'))
    if hasattr(config, 'SMS_FROM'):
        payload['from'] = config.SMS_FROM

    util.fetch('http://sms.ru/sms/send', payload)
    logging.info('Sent an SMS to %s' % phone)
def submit(id, url):
    """
    Send the server a link to a finished album.xml.

    The link must be reachable from outside, because the server will
    request it.
    """
    # A single parenthesized argument prints identically under Python 2 and 3.
    print("Submitting item %u (%s)" % (id, url))

    endpoint = 'http://' + settings['host'] + '/upload/queue'
    payload = {
        'id': id,
        'url': url,
        'signature': sign(url),
    }
    fetch(endpoint, payload)
def sgd():
    """
    Grid-search the loss function of an SGD classifier on RFE-pruned features.

    The cached training data is scaled, pruned with the cached RFE selector
    and scaled again.  A 5-fold stratified grid search (ROC AUC) picks the
    loss; the best estimator is logged and submitted as part of the full
    preprocessing pipeline.

    Submission: sgd_0620_03.csv
    E_val: 0.863628
    E_in: 0.854373
    E_out:
    """
    from sklearn.linear_model import SGDClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import StratifiedKFold

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rfe = util.fetch(util.cache_path('feature_selection.RFE.21'))
    X_pruned = rfe.transform(X_scaled)

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    # Every candidate loss is tried once by the grid search.
    candidate_losses = [
        'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron',
        'squared_loss', 'huber', 'epsilon_insensitive',
        'squared_epsilon_insensitive'
    ]
    search = GridSearchCV(SGDClassifier(n_iter=50, n_jobs=-1),
                          param_grid={'loss': candidate_losses},
                          cv=StratifiedKFold(y, 5),
                          scoring='roc_auc',
                          n_jobs=-1)
    search.fit(X_new, y)
    logger.debug('Best score (E_val): %f', search.best_score_)

    best = search.best_estimator_
    logger.debug('E_in: %f', auc_score(best, X_new, y))

    steps = [('scale_raw', raw_scaler), ('rfe', rfe),
             ('scale_new', new_scaler), ('sgd', best)]
    to_submission(Pipeline(steps), 'sgd_0620_03')
def lr():
    """
    Baseline: cross-validated logistic regression on the raw cached features.

    Fits ``LogisticRegressionCV`` (cv=10, scoring='roc_auc'), prints the
    in-sample AUC and submits the fitted classifier.

    Submission: lr_0618.csv
    E_val: <missing>
    E_in: <missing>
    E_out: 0.8119110960575004
    """
    from sklearn.linear_model import LogisticRegressionCV

    train_X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    train_y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    model = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    model.fit(train_X, train_y)

    in_sample_auc = auc_score(model, train_X, train_y)
    print(in_sample_auc)

    to_submission(model, 'lr_0618_xxx')
def load_test():
    """
    Load dataset for testing.

    The feature matrix is read from the 'test_X' cache file when it exists;
    otherwise it is assembled by calling every extractor in
    ``MODELING['features']`` and joining their outputs column-wise, and the
    result is written to the cache.

    Returns
    -------
    X: numpy ndarray, shape: (num_of_enrollments, num_of_features)
    Rows of features.
    """
    pkl_path = util.cache_path('test_X')

    # Cache hit: nothing to compute.
    if os.path.exists(pkl_path):
        return util.fetch(pkl_path)

    enroll_set = np.sort(util.load_enrollment_test()['enrollment_id'])
    # log = util.load_logs()
    # base_date = log['time'].max().to_datetime()
    base_date = datetime(2014, 8, 1, 22, 0, 47)

    X = None
    for extractor in MODELING['features']:
        columns = extractor(enroll_set, base_date)
        # First extractor seeds the matrix; later ones are appended as columns.
        X = columns if X is None else np.c_[X, columns]

    util.dump(X, pkl_path)
    return X
def main():
    """Poll Telegram's getUpdates once and dispatch every update to a worker.

    ``check_time_args`` is always scheduled.  Message updates go to
    ``RouteMessage(...).route_update`` and callback-query updates to
    ``route_callback_query``.  The global ``update_id`` is advanced past the
    last update received.  The function sleeps for ``config.sleep`` before
    returning, also when the response could not be parsed.
    """
    global update_id, misc, plugins, database

    url = "https://api.telegram.org/bot{}/getUpdates?offset={}".format(
        misc['token'], update_id)
    response = util.fetch(url, misc['session'])

    executor = concurrent.futures.ThreadPoolExecutor(
        max_workers=config.workers)  # See docs
    executor.submit(check_time_args)

    try:
        payload = response.json()
    except (AttributeError, ValueError) as err:
        print("Error parsing Telegram response: {}\nResponse: {}".format(
            err, response))
        time.sleep(config.sleep)
        return

    if payload['ok'] and payload['result']:  # Response ok and contains results
        updates = payload['result']
        update_id = updates[-1]['update_id'] + 1
        for update in updates:  # Loop through result
            # NOTE(review): run_extension(update) is called right here, in
            # this thread, and its return value is what gets submitted -
            # confirm that this is intended.
            executor.submit(run_extension(update))
            if 'message' in update:  # For message updates
                router = RouteMessage(update['message'], misc, plugins,
                                      database)
                executor.submit(router.route_update)
            elif 'callback_query' in update:  # For callback query updates
                executor.submit(route_callback_query,
                                update['callback_query'], database, plugins,
                                misc)
    elif not payload['ok']:
        print('Response not OK\nResponse: {}'.format(payload))

    # returns immediately, sub processes will close by themselves
    executor.shutdown(wait=False)
    time.sleep(config.sleep)  # Sleep for time defined in config
def get_similarities():
    """Return similarities between the requested article and recent articles.

    The article id is read from the ``article`` query argument.  Scores that
    ``util.fetch_with`` already returns are reused.  For the remaining
    articles the score is computed with the model, stored through
    ``util.insert_similarity`` and appended to the response.
    """
    global model
    check_model()

    comparison = int(flask.request.args.get("article"))
    ctoken = util.tokenize([util.fetch(comparison)[1]])

    recent_articles = util.fetch_top_100(comparison)
    recent_ids = [article[0] for article in recent_articles]
    founds, not_founds = util.fetch_with(comparison, recent_ids)

    # Start from the scores that are already known ...
    result_ids = [hit[0] for hit in founds]
    similarities = [hit[1] for hit in founds]

    # ... then compute and persist the missing ones.
    missing = [article for article in recent_articles
               if article[0] in not_founds]
    for article in missing:
        other_tokens = util.tokenize([article[1]])[0]
        sim = model.docvecs.similarity_unseen_docs(model, ctoken[0],
                                                   other_tokens)
        util.insert_similarity(comparison, article[0], sim.item())
        result_ids.append(article[0])
        similarities.append(str(sim))

    return flask.jsonify(
        result="ok",
        articleIds=result_ids,
        similarities=similarities
    )
def main():
    """Poll Telegram's getUpdates once and hand every message to RouteMessage.

    Advances the global ``update_id`` past the last update received.  Each
    message is submitted to a thread pool; messages older than three minutes
    are submitted with ``check_db_only=True``.  Updates that carry no
    ``'message'`` key are skipped.  The function sleeps for ``config.sleep``
    before returning (5 seconds when the response could not be parsed).
    """
    global update_id, misc, plugins, database
    url = "https://api.telegram.org/bot{}/getUpdates?offset={}".format(
        misc['token'], update_id)
    response = util.fetch(url, misc['session'])
    try:
        response = response.json()
    except (AttributeError, ValueError):
        # AttributeError: fetch returned something without .json().
        # ValueError: the body is not valid JSON (JSON decoders raise
        # ValueError or a subclass of it).
        print("Error parsing Telegram response\nResponse: {}".format(response))
        time.sleep(5)
        return
    if response['ok'] and response['result']:  # Response ok and contains results
        update_id = response['result'][-1]['update_id'] + 1
        executor = concurrent.futures.ThreadPoolExecutor(
            max_workers=config.workers)  # See docs
        for result in response['result']:  # Loop through result
            if 'message' not in result:
                # Not a message update; indexing it would raise KeyError
                # after update_id has already been advanced.
                continue
            message = result['message']
            # check if message is older than 3 min
            if_old = int(time.time()) - int(message['date']) >= 180
            executor.submit(RouteMessage, message, misc, plugins, database,
                            check_db_only=if_old)
        # returns immediately, sub processes will close by themselves
        executor.shutdown(wait=False)
    elif not response['ok']:  # Response not ok
        print('Response not OK\nResponse: {}'.format(response))
    time.sleep(config.sleep)  # Sleep for time defined in config
def sgd():
    """
    SGD classifier with the loss chosen by grid search.

    Preprocessing: StandardScaler -> cached RFE selector -> StandardScaler.
    Model selection: GridSearchCV over the ``loss`` parameter with 5-fold
    stratified CV scored by ROC AUC.  Logs E_val and E_in, then submits the
    fitted pipeline.

    Submission: sgd_0620_03.csv
    E_val: 0.863628
    E_in: 0.854373
    E_out:
    """
    from sklearn.linear_model import SGDClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import StratifiedKFold

    features = util.fetch(
        util.cache_path('train_X_before_2014-08-01_22-00-47'))
    labels = util.fetch(
        util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    features_scaled = raw_scaler.fit(features).transform(features)

    rfe = util.fetch(util.cache_path('feature_selection.RFE.21'))
    features_pruned = rfe.transform(features_scaled)

    new_scaler = StandardScaler()
    features_final = new_scaler.fit(features_pruned).transform(features_pruned)

    base_model = SGDClassifier(n_iter=50, n_jobs=-1)
    loss_grid = {
        'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge',
                 'perceptron', 'squared_loss', 'huber',
                 'epsilon_insensitive', 'squared_epsilon_insensitive']
    }
    folds = StratifiedKFold(labels, 5)
    grid = GridSearchCV(base_model, param_grid=loss_grid, cv=folds,
                        scoring='roc_auc', n_jobs=-1)
    grid.fit(features_final, labels)

    logger.debug('Best score (E_val): %f', grid.best_score_)
    winner = grid.best_estimator_
    logger.debug('E_in: %f', auc_score(winner, features_final, labels))

    to_submission(
        Pipeline([('scale_raw', raw_scaler),
                  ('rfe', rfe),
                  ('scale_new', new_scaler),
                  ('sgd', winner)]),
        'sgd_0620_03')
def get_similarity():
    """Return the model similarity of two articles as a JSON response.

    The article ids are read from the ``article1`` and ``article2`` query
    arguments.  The second element of each fetched article is tokenized, and
    the similarity is returned as a string.
    """
    global model
    check_model()

    args = flask.request.args
    first_id = args.get("article1")
    second_id = args.get("article2")

    first = util.fetch(first_id)
    second = util.fetch(second_id)

    tokens = util.tokenize([first[1], second[1]])
    score = model.docvecs.similarity_unseen_docs(model, tokens[0], tokens[1])

    return flask.jsonify(
        result="ok",
        similarity=str(score)
    )
def _handle_timeline_notification(self, data):
    """Handle timeline notification.

    For the first user action of type 'REPLY': fetch the timeline item, use
    its text as a BoardGameGeek search query, look up the first search hit
    and insert a new timeline card with that game's thumbnail, name and
    description.  If the item has attachments, the first one is downloaded
    and re-attached to the new card.  Other action types are only logged.

    NOTE(review): the statement nesting below was reconstructed from a
    whitespace-collapsed source - confirm against the original file.
    """
    for user_action in data.get('userActions', []):
        if user_action.get('type') == 'REPLY':
            # Fetch the timeline item.
            item = self.mirror_service.timeline().get(id=data['itemId']).execute()
            attachments = item.get('attachments', [])
            media = None
            if attachments:
                # Get the first attachment on that timeline item and do stuff with it.
                attachment = self.mirror_service.timeline().attachments().get(
                    itemId=data['itemId'],
                    attachmentId=attachments[0]['id']).execute()
                resp, content = self.mirror_service._http.request(
                    attachment['contentUrl'])
                if resp.status == 200:
                    media = MediaIoBaseUpload(
                        io.BytesIO(content), attachment['contentType'],
                        resumable=True)
                else:
                    logging.info('Unable to retrieve attachment: %s', resp.status)
            # Search BoardGameGeek using the (URL-quoted) text of the item.
            bgg_resp = util.fetch('http://boardgamegeek.com/xmlapi2/search?query=%s' % urllib.quote(item.get('text','')))
            xmldoc = minidom.parseString( bgg_resp['data'] )
            itemlist = xmldoc.getElementsByTagName( "item" )
            # Only the first hit is used; indexing [0] fails when the search
            # returned no <item> element.
            item_id = itemlist[0].attributes[ 'id' ].value;
            # Fetch the details of that hit.
            bgg_resp = util.fetch("http://boardgamegeek.com/xmlapi2/thing?id=%s" % item_id )
            xmldoc = minidom.parseString( bgg_resp['data'] )
            itemlist = xmldoc.getElementsByTagName( "name" )
            bgg_name = itemlist[0].attributes[ 'value' ].value;
            itemlist = xmldoc.getElementsByTagName("thumbnail")
            bgg_thumbnail = itemlist[0].firstChild.data
            itemlist = xmldoc.getElementsByTagName("description")
            bgg_description = itemlist[0].firstChild.data
            # Log the card markup (this logged variant has an empty style attribute).
            logging.info( "<article>\n<figure>\n <img src=\"%s\">\n</figure>\n<section>\n<div style=\"\">\n<p class=\"yellow\">%s</sub></p>\n<p>%s\n</section>\n</article>\n" % (bgg_thumbnail, bgg_name, bgg_description) )
            body = {
                'html': "<article>\n<figure>\n <img src=\"%s\">\n</figure>\n<section>\n<div style=\"text-auto-size\">\n<p class=\"yellow\">%s</sub></p>\n<p>%s\n</section>\n</article>\n" % (bgg_thumbnail, bgg_name, bgg_description),
                'notification': { 'level' : "DEFAULT" }
            }
            # media stays None when there was no attachment or its download failed.
            self.mirror_service.timeline().insert(
                body=body, media_body=media).execute()
            # Only handle the first successful action.
            break
        else:
            logging.info(
                "I don't know what to do with this notification: %s",
                user_action)
def get_me(misc):
    """Call the bot API's getMe method and return the ``result`` payload.

    When the response is not ok, a hint and the raw response are printed and
    the process exits through ``sys.exit()``.
    """
    url = "{}{}getMe".format(misc['base_url'], misc['token'])
    reply = util.fetch(url, misc['session']).json()

    # Guard clause: bail out on an unsuccessful response.
    if not reply['ok']:
        print("There seems to be an error :(\nCheck your API key and connection to the internet")
        print(reply)
        sys.exit()

    return reply['result']
def update_challenges():
    """Fetch the challenge list and queue one update task per challenge.

    Every ``h5`` element of the page fetched from '/' is treated as one
    challenge.  The 'challenge_tasks' counter is incremented by the number
    of challenges before the tasks are enqueued.
    """
    logging.info('update_challenges()')

    page = BeautifulSoup(fetch('/'))
    headings = page.findAll('h5')

    pending = increment('challenge_tasks', len(headings))
    logging.info('init challenge_tasks = %d' % pending)

    for heading in headings:
        # The handle is the last path segment of the heading's link.
        handle = heading.a['href'].rsplit('/', 1)[-1]
        taskqueue.add(url='/challenges/' + handle)
def dt():
    """
    Depth-limited decision tree on the raw cached features.

    Fits the tree, exports it to 'tree.dot', logs the in-sample AUC and
    submits the classifier.

    Submission: dt_0620_05.csv
    E_val: 0.820972
    E_in: 0.835177
    E_out:
    Comment: {'max_depth': 5}
    """
    from sklearn.tree import DecisionTreeClassifier, export_graphviz

    train_X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    train_y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    tree = DecisionTreeClassifier(max_depth=5, class_weight='auto')
    tree.fit(train_X, train_y)

    # Dump the fitted tree for inspection.
    export_graphviz(tree, 'tree.dot')

    logger.debug('E_in: %f', auc_score(tree, train_X, train_y))
    to_submission(tree, 'dt_0620_05')
def get_me(misc):
    """Query getMe and return its ``result``; exit the process on failure.

    On a response whose ``ok`` field is falsy, an explanatory message and
    the response itself are printed before ``sys.exit()`` is called.
    """
    endpoint = "{}{}getMe".format(misc['base_url'], misc['token'])
    answer = util.fetch(endpoint, misc['session']).json()

    if answer['ok']:
        return answer['result']

    print(
        "There seems to be an error :(\nCheck your API key and connection to the internet"
    )
    print(answer)
    sys.exit()
def lr_with_scale():
    """
    Cross-validated logistic regression on standardized features.

    Scales the cached training matrix, fits ``LogisticRegressionCV``
    (cv=10, scoring='roc_auc'), prints the in-sample AUC and submits the
    scaler + classifier pipeline.

    Submission: lr_with_scale_0620_04.csv
    E_val: <missing>
    E_in: 0.857351105162
    E_out: 0.854097855439904
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    train_X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    train_y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    standardized = raw_scaler.fit(train_X).transform(train_X)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(standardized, train_y)
    print(auc_score(clf, standardized, train_y))

    steps = [('scale_raw', raw_scaler), ('lr', clf)]
    to_submission(Pipeline(steps), 'lr_with_scale_0620_04')
def geturl(): f = open('urlTest.txt','r') urls = f.readlines() f.close() for i in range(0,len(urls)): test_url = urls[i].strip().split(',') hostname = urlparse(test_url[0]).netloc path = urlparse(test_url[0]).path + '?' + urlparse(test_url[0]).query res = util.fetch(hostname, path) if not(util.assertEqual(res.status, int(test_url[1]))): # util.reportEmail('http status error:' + status) print 'failed', test_url[0] else: print 'success', test_url[0]
def update_challenge(handle):
    """Scrape one challenge page and store its title and active golfers.

    After the ``Challenge`` record is written, the 'challenge_tasks' counter
    is decremented.  When it reaches zero a '/top' task is enqueued.
    """
    logging.info('update_challenge(%s)' % handle)

    soup = BeautifulSoup(fetch('challenges/' + handle))
    title = soup.findAll('h3')[1].text

    # Golfer entries are the h6 elements under the parent of the last h5;
    # only the part after the final '@' of each entry's text is kept.
    golfer_rows = soup.findAll('h5')[-1].parent.findAll('h6')
    golfers = []
    for entry in golfer_rows:
        golfers.append(entry.text.split('@')[-1])

    record = Challenge(key_name=handle, handle=handle, title=title,
                       active_golfers=golfers)
    record.put()
    logging.info('updated Challenge(%s, %s) with %d golfers'
                 % (handle, title, len(golfers)))

    remaining = increment('challenge_tasks', -1)
    logging.info('challenge_tasks = %d' % remaining)
    if remaining == 0:
        taskqueue.add(url='/top')
def main():
    """Poll Telegram's getUpdates once and hand every message to RouteMessage.

    Advances the global ``update_id`` past the last update received.  Each
    message is submitted to a thread pool; messages older than three minutes
    are submitted with ``check_db_only=True``.  Updates that carry no
    ``'message'`` key are skipped.  The function sleeps for ``config.sleep``
    before returning (5 seconds when the response could not be parsed).
    """
    global update_id, misc, plugins, database
    url = "https://api.telegram.org/bot{}/getUpdates?offset={}".format(misc['token'], update_id)
    response = util.fetch(url, misc['session'])
    try:
        response = response.json()
    except (AttributeError, ValueError):
        # AttributeError: fetch returned something without .json().
        # ValueError: the body is not valid JSON (JSON decoders raise
        # ValueError or a subclass of it).
        print("Error parsing Telegram response\nResponse: {}".format(response))
        time.sleep(5)
        return
    if response['ok'] and response['result']:  # Response ok and contains results
        update_id = response['result'][-1]['update_id'] + 1
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=config.workers)  # See docs
        for result in response['result']:  # Loop through result
            if 'message' not in result:
                # Not a message update; indexing it would raise KeyError
                # after update_id has already been advanced.
                continue
            message = result['message']
            if_old = int(time.time()) - int(message['date']) >= 180  # check if message is older than 3 min
            executor.submit(RouteMessage, message, misc, plugins, database, check_db_only=if_old)
        executor.shutdown(wait=False)  # returns immediately, sub processes will close by themselves
    elif not response['ok']:  # Response not ok
        print('Response not OK\nResponse: {}'.format(response))
    time.sleep(config.sleep)  # Sleep for time defined in config
def get_update(self):
    """Get new messages and send them to ``self.route_message``.

    Polls getUpdates once, advances ``self.update_id`` past the last update
    and submits each sufficiently recent message, wrapped in
    ``TelegramApi``, to a thread pool.  Updates without a ``'message'`` key
    are skipped.  Sleeps for ``self.config.sleep`` before returning
    (5 seconds when the response could not be parsed).
    """
    url = "{}{}getUpdates?offset={}".format(self.misc['base_url'], self.misc['token'], self.update_id)
    response = util.fetch(url, self.misc['session'])
    try:
        response = response.json()
    except (AttributeError, ValueError):
        # AttributeError: fetch returned something without .json().
        # ValueError: the body is not valid JSON.
        time.sleep(5)
        return
    if response['ok']:
        try:
            self.update_id = response['result'][-1]['update_id'] + 1
        except IndexError:
            # Empty result list: nothing new to process.
            time.sleep(self.config.sleep)
            return
        # Leaving the with-block waits for all submitted work to finish.
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as e:
            for i in response['result']:
                if 'message' not in i:
                    # Not a message update; indexing it would raise KeyError.
                    continue
                # NOTE(review): 180000 looks like a milliseconds threshold -
                # confirm the unit of self.time and of the 'date' field.
                if self.time - int(i['message']['date']) <= 180000:
                    e.submit(self.route_message, TelegramApi(i['message'], self.misc))
        time.sleep(self.config.sleep)
    else:
        print('Error fetching new messages:\nCode: {}'.format(response['error_code']))
        time.sleep(self.config.sleep)
def dropout_history(enrollment_set, base_date):
    """Build per-enrollment dropout-history features as of `base_date`.

    Two cache levels are used, both keyed by `base_date`: the final feature
    matrix ('dropout_history_before_...') and the intermediate per-enrollment
    table ('Dropout_count_before_...').

    Parameters
    ----------
    enrollment_set: enrollment ids that select and order the returned rows.
    base_date: datetime; only log entries with ``time <= base_date`` are used.

    Returns
    -------
    X: matrix with the columns 'dropout_count', 'total_dropout',
    'avg_dropout', 'drop_courses', 'course_count' and 'drop_ratio', one row
    per entry of `enrollment_set`.
    """
    stamp = base_date.strftime('%Y-%m-%d_%H-%M-%S')
    X_pkl_path = util.cache_path('dropout_history_before_%s' % stamp)
    if os.path.exists(X_pkl_path):
        return util.fetch(X_pkl_path)

    logger = logging.getLogger('dropout_history')
    n_proc = par.cpu_count()

    # The enrollment table is needed on both paths below.  It used to be
    # loaded only on the cache-miss path, which made the final merge fail
    # with NameError whenever Dropout_count came from the cache.
    Enroll_all = util.load_enrollments()

    pkl_path = util.cache_path('Dropout_count_before_%s' % stamp)
    if os.path.exists(pkl_path):
        logger.debug('load from cache')
        Dropout_count = util.fetch(pkl_path)
    else:
        logger.debug('preparing datasets')
        Log = util.load_logs()
        Log = Log[Log['time'] <= base_date]
        Log_enroll_ids = pd.DataFrame(np.unique(Log['enrollment_id']),
                                      columns=['enrollment_id'])
        logger.debug('datasets prepared')

        # One task per enrollment: that enrollment's slice of the log.
        params = []
        enroll_ids = []
        for i, df in Log.groupby(['enrollment_id']):
            params.append(df)
            enroll_ids.append(i)

        pool = par.Pool(processes=min(n_proc, len(params)))
        enroll_dropout_count = dict(
            zip(enroll_ids, pool.map(__get_dropout_feature__, params)))
        pool.close()
        pool.join()

        enroll_dropout_count = pd.Series(enroll_dropout_count,
                                         name='dropout_count')
        enroll_dropout_count.index.name = 'enrollment_id'
        enroll_dropout_count = enroll_dropout_count.reset_index()

        Enroll_counted = pd.merge(Enroll_all, enroll_dropout_count,
                                  how='left', on=['enrollment_id'])
        Dropout_count = pd.merge(Log_enroll_ids, Enroll_counted, how='left',
                                 on=['enrollment_id'])
        util.dump(Dropout_count, pkl_path)

    # Per-user aggregates over all of the user's enrollments.
    Dgb = Dropout_count.groupby('username')
    total_dropout = Dgb.agg({
        'dropout_count': np.sum
    }).reset_index().rename(columns={'dropout_count': 'total_dropout'})
    avg_dropout = Dgb.agg({
        'dropout_count': np.average
    }).reset_index().rename(columns={'dropout_count': 'avg_dropout'})
    drop_courses = Dgb.agg(
        {'dropout_count': lambda x: len([i for i in x if i > 0])})\
        .reset_index().rename(columns={'dropout_count': 'drop_courses'})
    course_count = Dgb.agg({
        'dropout_count': len
    }).reset_index().rename(columns={'dropout_count': 'course_count'})

    Dropout_count = pd.merge(Dropout_count, total_dropout, how='left',
                             on=['username'])
    Dropout_count = pd.merge(Dropout_count, avg_dropout, how='left',
                             on=['username'])
    Dropout_count = pd.merge(Dropout_count, drop_courses, how='left',
                             on=['username'])
    Dropout_count = pd.merge(Dropout_count, course_count, how='left',
                             on=['username'])

    Dropout_count['drop_ratio'] = (Dropout_count['drop_courses'] /
                                   Dropout_count['course_count'])

    # Select and order the rows by the requested enrollment ids.
    Enroll = Enroll_all.set_index('enrollment_id').ix[enrollment_set]\
        .reset_index()

    X = pd.merge(Enroll, Dropout_count, how='left', on=['enrollment_id'])\
        .as_matrix(columns=['dropout_count', 'total_dropout', 'avg_dropout',
                            'drop_courses', 'course_count', 'drop_ratio'])

    logger.debug('dropout history, has nan: %s, shape: %s',
                 np.any(np.isnan(X)), repr(X.shape))

    util.dump(X, X_pkl_path)
    return X
# Blocked matrix-factorization driver.
#
# Initializes U and V through initUV, then for up to `steps` iterations walks
# a grid of user/movie index ranges of width `split`, collects the ratings of
# each range with fetch(), packs them with pack() and updates U, V through
# matrix_factorization().  After every iteration the train and test RMSE are
# computed; the elapsed times (y1) and test errors (y2) are written to text
# files at the end.
#
# alpha, beta, blocks (beyond computing `split`) and dataset are not used in
# the visible body.
#
# NOTE(review): the statement nesting below was reconstructed from a
# whitespace-collapsed source - confirm against the original file.
def factorize(users, movies, ratings, test_users, test_movies, test_ratings, blocks=1, latent=10, steps=10, gpu_steps=2, alpha=0.0002, beta=0.01, delta=0.01, rmse_repeat_count=3, debug=2, dataset=''):
    U, V = initUV( np.max(users)-np.min(users)+1, latent, np.max(movies)-np.min(movies)+1)
    U = np.array(U)
    V = np.array(V)
    # Width of one index range and the number of ranges per axis.
    size = max(np.max(users)+1, np.max(movies)+1)
    split = int(size/blocks)
    us = int(math.ceil( np.float(np.max(users))/split ) )
    vs = int(math.ceil( np.float(np.max(movies))/split ) )
    if debug>1:
        print("Total splits : ",split, us, vs, us*vs)
        print("U, V shapes :", U.shape, V.shape)
    start_time=time.clock()
    # y1: elapsed time per step, y2: test RMSE per step.
    y1, y2 = [], []
    # count: consecutive steps with an unchanged error; error: previous step's error.
    count, error = 0, 100
    for k in range(steps):
        if debug>1:
            print("Step : ", k)
        u1, v1 = 0, 0
        t4 = time.clock()
        for i in range(us):
            # Inclusive user range [u1, u2], clamped to the largest user id.
            u1 = i*split
            if np.max(users) < u1:
                u1 = int(np.max(users))
            u2 = ((i+1)*split - 1)
            if np.max(users) < u2:
                u2 = int(np.max(users))
            stemp = 0
            UU, MM, RR = [], [], []
            ulimits = [0]
            for j in range(vs):
                # The user range is shifted by one for every movie range j.
                xtemp = int((i+stemp)%us)
                print("i, j, ii, jj ", i, j, xtemp, j)
                u1 = xtemp*split
                if np.max(users) < u1:
                    u1 = int(np.max(users))
                u2 = ((xtemp+1)*split - 1)
                if np.max(users) < u2:
                    u2 = int(np.max(users))
                # Inclusive movie range [v1, v2], clamped to the largest movie id.
                v1 = j*split
                if np.max(movies) < v1:
                    v1 = int(np.max(movies))
                v2 = (j+1)*split -1
                if np.max(movies) < v2:
                    v2 = int(np.max(movies))
                print("Processing split : " , i , j, u1, u2, v1, v2)
                uu, mm, rr = fetch(u1,u2, v1,v2, users,movies,ratings)
                # Ranges without ratings are not packed.
                if(len(uu)!=0 and len(mm)!=0):
                    UU,MM,RR, ulimits = pack(UU,MM,RR, uu,mm,rr, ulimits)
                stemp+=1
            U, V = matrix_factorization(UU,MM,RR, U,V, ulimits,np.min(users), np.min(movies))
        t5 = time.clock()
        if debug>1:
            print(" Step time taken : ", round(t5-t4,2))
        y1.append(round(t5-start_time,3))
        train_rmse = rmse(users, movies, ratings, U, V)
        test_rmse = rmse(test_users, test_movies, test_ratings, U, V)
        print("Train error:", round(train_rmse, 3) , " Test error:", round(test_rmse,3) )
        y2.append(round(test_rmse,3) )
        step_error=round(test_rmse,4)
        # Stop when the error is below delta, when it got worse, or when it
        # stayed the same for more than rmse_repeat_count steps.
        if step_error < delta:
            break
        elif error<step_error :
            break
        elif rmse_repeat_count<count:
            break
        elif step_error==error:
            count=count+1
        else:
            count = 0
        error=step_error
    np.savetxt('blocks_'+str(gpu_steps)+'iterations_y2.txt', y2, fmt='%.3f')
    np.savetxt('blocks_'+str(gpu_steps)+'iterations_y1.txt', y1, fmt='%.3f')
def svc_1():
    """
    Linear SVC on RFE-selected features with isotonic probability calibration.

    Steps: standardize -> fit and cache an RFE selector (21 features, logistic
    regression as estimator) -> standardize again -> randomized search over
    ``C`` for a ``LinearSVC`` (50 draws, 5-fold stratified CV, ROC AUC) ->
    isotonic calibration of the best SVC -> submission of the whole pipeline.
    The selector, the best SVC and the calibrated classifier are dumped to
    the cache.

    Submission: svc_1_0620_01.csv
    E_val: 0.866856950449
    E_in: 0.855948
    E_out: 0.8546898189645258
    """
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import LinearSVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFE
    from sklearn.grid_search import RandomizedSearchCV
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.linear_model import LogisticRegression
    from scipy.stats import expon

    logger.debug('svc_1')

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    X_scaled = raw_scaler.fit(X).transform(X)

    # Feature selection; the fitted selector is cached for other models.
    selector_base = LogisticRegression(class_weight='auto')
    rfe = RFE(estimator=selector_base, step=1, n_features_to_select=21)
    rfe.fit(X_scaled, y)
    util.dump(rfe, util.cache_path('feature_selection.RFE.21'))
    X_pruned = rfe.transform(X_scaled)
    logger.debug('Features selected.')

    new_scaler = StandardScaler()
    X_new = new_scaler.fit(X_pruned).transform(X_pruned)

    # Randomized search over the regularization strength.
    search = RandomizedSearchCV(LinearSVC(dual=False, class_weight='auto'),
                                n_iter=50,
                                scoring='roc_auc',
                                n_jobs=-1,
                                cv=StratifiedKFold(y, 5),
                                param_distributions={'C': expon()})
    search.fit(X_new, y)
    logger.debug('Got best SVC.')
    logger.debug('Grid scores: %s', search.grid_scores_)
    logger.debug('Best score (E_val): %s', search.best_score_)
    logger.debug('Best params: %s', search.best_params_)

    best_svc = search.best_estimator_
    util.dump(best_svc, util.cache_path('new_data.SVC'))

    # Calibration of the best SVC.
    isotonic = CalibratedClassifierCV(best_svc, cv=StratifiedKFold(y, 5),
                                      method='isotonic')
    isotonic.fit(X_new, y)
    util.dump(isotonic,
              util.cache_path('new_data.CalibratedClassifierCV.isotonic'))
    logger.debug('Got best isotonic CalibratedClassifier.')
    logger.debug('E_in (isotonic): %f', auc_score(isotonic, X_new, y))

    steps = [('scale_raw', raw_scaler), ('rfe', rfe),
             ('scale_new', new_scaler), ('svc', isotonic)]
    to_submission(Pipeline(steps), 'svc_1_0620_01')
# Blocked matrix-factorization driver that starts one thread per rating block.
#
# For up to `steps` iterations the user/movie index space is walked in ranges
# of width `split`; for every range that contains ratings a thread running
# block_factorization is started on the matching slices of U and V.  After
# each iteration the overall error is recorded and gpu_steps is rescaled by
# the ratio of the current error to the initial one.  The elapsed times (y1)
# and errors (y2) are written to text files at the end.
#
# NOTE(review): U, V and R are never assigned in this function (the
# initialisation is commented out); they must exist as globals - confirm.
# NOTE(review): the statement nesting below was reconstructed from a
# whitespace-collapsed source - confirm against the original file.
def factorize(users, movies, ratings, test_users, test_movies, test_ratings, blocks=1, latent=10, steps=10, gpu_steps=2, alpha=0.00001, beta=0.01, delta=0.01, rmse_repeat_count=3, debug=2, dataset=''):
    #U, V = np.ones((R.shape[0], latent)), np.ones((latent, R.shape[1]))
    # Width of one index range and the number of ranges per axis.
    size = max(np.max(users)+1, np.max(movies)+1)
    split = int(size/blocks)
    us = int(math.ceil( np.float(np.max(users))/split ) )
    vs = int(math.ceil( np.float(np.max(movies))/split ) )
    if debug>1:
        print("Total splits : ",split, us, vs, us*vs)
        print("U, V shapes :", U.shape, V.shape)
    start_time=time.clock()
    # y1: elapsed time per step, y2: error per step.
    y1, y2 = [], []
    count = 0
    # Error before any update; used below to rescale gpu_steps.
    flag1 = error(R, U, V,0, R.shape[1], 0, R.shape[0])
    for k in range(steps):
        if debug>1:
            print("Step : ", k)
        rmse = 0
        u1, v1 = 0, 0
        t4 = time.clock()
        for i in range(us):
            # Inclusive user range [u1, u2], clamped to the largest user id.
            u1 = i*split
            if np.max(users) < u1:
                u1 = int(np.max(users))
            u2 = ((i+1)*split - 1)
            if np.max(users) < u2:
                u2 = int(np.max(users))
            stemp = 0
            # One thread slot per movie range.
            # NOTE(review): the slots are reset for every i, so the wait loop
            # below never sees a previous thread, and no thread is joined
            # before the error is computed - confirm that this is intended.
            tpool = [None]*vs
            for j in range(vs):
                # The user range is shifted by one for every movie range j.
                xtemp = int((i+stemp)%us)
                print("i, j, ii, jj ", i, j, xtemp, j)
                u1 = xtemp*split
                if np.max(users) < u1:
                    u1 = int(np.max(users))
                u2 = ((xtemp+1)*split - 1)
                if np.max(users) < u2:
                    u2 = int(np.max(users))
                # Inclusive movie range [v1, v2], clamped to the largest movie id.
                v1 = j*split
                if np.max(movies) < v1:
                    v1 = int(np.max(movies))
                v2 = (j+1)*split -1
                if np.max(movies) < v2:
                    v2 = int(np.max(movies))
                #print("Processing split : " , i , j, u1, u2, v1, v2)
                uu, mm, rr = fetch(u1, u2, v1, v2, users, movies, ratings)
                if debug>1:
                    print("Shapes of uu,mm,rr :", uu.shape, mm.shape, rr.shape)
                t6 = time.clock()
                # Slices of the factor matrices that belong to this block.
                P, Q = U[u1:u2+1, 0:latent], V[0:latent, v1:v2+1]
                if debug>1:
                    print("P Q shapes : " , P.shape, Q.shape)
                t7 = time.clock()
                if debug>1:
                    print("Length of uu,mm ", len(uu), len(mm), u2-u1+1, v2-v1+1, P.shape, Q.shape)
                # Blocks without ratings are skipped.
                if(len(uu)!=0 and len(mm)!=0):
                    t = tpool[j]
                    if t is not None:
                        while t.isAlive():
                            print('waiting for the thread ...')
                            time.sleep(5)
                    t = threading.Thread(target=block_factorization, args=(P,Q,R, u1, u2, v1, v2, gpu_steps))
                    tpool[j] = t
                    t.start()
                t8 = time.clock()
                stemp+=1
        t5 = time.clock()
        if debug>1:
            print(" Step time taken : ", round(t5-t4,2))
        y1.append(round(t5-start_time,3))
        # temporarily commented out to work out of train data set alone
        test_rmse = error(R, U, V,0, R.shape[1], 0, R.shape[0]) #e(U, V , test_users, test_movies, test_ratings, min(split, max(np.max(test_users), np.max(test_movies))), latent=latent, debug=debug)
        print("Step error :", round(test_rmse,3) )
        y2.append(round(test_rmse,3) )
        flag=round(test_rmse,4)
        # Scale the inner step count by the current error relative to the initial one.
        gpu_steps = int(gpu_steps*flag/flag1)
        #if flag < delta:
        #    break
        #elif flag1<flag :
        #    break
        #elif rmse_repeat_count<count:
        #    break
        #elif flag==flag1:
        #    count=count+1
        #else:
        #    count = 0
        #flag1=flag
    np.savetxt(str(blocks*blocks)+'blocks_'+str(gpu_steps)+'iterations_y2.txt', y2, fmt='%.3f')
    np.savetxt(str(blocks*blocks)+'blocks_'+str(gpu_steps)+'iterations_y1.txt', y1, fmt='%.3f')
def svc_1():
    """
    Calibrated linear SVC trained on 21 RFE-selected features.

    The function fits and caches the RFE selector, tunes ``C`` of a
    ``LinearSVC`` by randomized search (ROC AUC, 5-fold stratified CV),
    calibrates the best model isotonically, caches both models and submits
    the full preprocessing + model pipeline.

    Submission: svc_1_0620_01.csv
    E_val: 0.866856950449
    E_in: 0.855948
    E_out: 0.8546898189645258
    """
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import LinearSVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFE
    from sklearn.grid_search import RandomizedSearchCV
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.linear_model import LogisticRegression
    from scipy.stats import expon

    logger.debug('svc_1')

    features = util.fetch(
        util.cache_path('train_X_before_2014-08-01_22-00-47'))
    labels = util.fetch(
        util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(features)
    features_scaled = raw_scaler.transform(features)

    rfe = RFE(estimator=LogisticRegression(class_weight='auto'),
              step=1,
              n_features_to_select=21)
    rfe.fit(features_scaled, labels)
    util.dump(rfe, util.cache_path('feature_selection.RFE.21'))

    features_pruned = rfe.transform(features_scaled)
    logger.debug('Features selected.')

    new_scaler = StandardScaler()
    new_scaler.fit(features_pruned)
    features_final = new_scaler.transform(features_pruned)

    candidate = LinearSVC(dual=False, class_weight='auto')
    rs = RandomizedSearchCV(candidate,
                            n_iter=50,
                            scoring='roc_auc',
                            n_jobs=-1,
                            cv=StratifiedKFold(labels, 5),
                            param_distributions={'C': expon()})
    rs.fit(features_final, labels)

    logger.debug('Got best SVC.')
    logger.debug('Grid scores: %s', rs.grid_scores_)
    logger.debug('Best score (E_val): %s', rs.best_score_)
    logger.debug('Best params: %s', rs.best_params_)

    tuned = rs.best_estimator_
    util.dump(tuned, util.cache_path('new_data.SVC'))

    isotonic = CalibratedClassifierCV(tuned,
                                      cv=StratifiedKFold(labels, 5),
                                      method='isotonic')
    isotonic.fit(features_final, labels)
    util.dump(isotonic,
              util.cache_path('new_data.CalibratedClassifierCV.isotonic'))

    logger.debug('Got best isotonic CalibratedClassifier.')
    logger.debug('E_in (isotonic): %f',
                 auc_score(isotonic, features_final, labels))

    to_submission(
        Pipeline([('scale_raw', raw_scaler),
                  ('rfe', rfe),
                  ('scale_new', new_scaler),
                  ('svc', isotonic)]),
        'svc_1_0620_01')
def load_train(earlist_base_date=None, depth=1, cache_only=False):
    """
    Load dataset for training and validating.

    The newest period (base date 2014-08-01 22:00:47) is always loaded.  Up
    to ``depth - 1`` earlier periods follow, starting at 2014-07-22 22:00:47
    and stepping back seven days at a time.  Every period is cached on disk
    under a name derived from its base date.

    *NOTE* If you need a validating set, you SHOULD split from training set
    by yourself.

    Parameters
    ----------
    earlist_base_date: datetime, None by default
    Base date won't be smaller than earlist_base_date.

    depth: int, 1 by default
    Maximum moves of time window.

    cache_only: bool, False by default
    Cache data of every period, do not return full spanned data.

    Returns
    -------
    X: numpy ndarray, shape: (num_of_enrollments, num_of_features)
    Rows of features. It is the features of all time if cache_only is True.

    y: numpy ndarray, shape: (num_of_enrollments,)
    Vector of labels. It is the labels of all time if cache_only is True.
    """
    logger = logging.getLogger('load_train')

    def cache_files(date):
        # Cache file locations (features, labels) of the period ending at `date`.
        stamp = date.strftime('%Y-%m-%d_%H-%M-%S')
        return (util.cache_path('train_X_before_%s' % stamp),
                util.cache_path('train_y_before_%s' % stamp))

    enroll_ids = np.sort(util.load_enrollment_train()['enrollment_id'])
    log = util.load_logs()[['enrollment_id', 'time']]

    # ---- newest period -------------------------------------------------
    # base_date = log['time'].max().to_datetime()
    base_date = datetime(2014, 8, 1, 22, 0, 47)
    logger.debug('load features before %s', base_date)

    pkl_X_path, pkl_y_path = cache_files(base_date)
    if os.path.exists(pkl_X_path) and os.path.exists(pkl_y_path):
        logger.debug('fetch cached')
        X = util.fetch(pkl_X_path)
        y = util.fetch(pkl_y_path)
    else:
        X, _ = __load_dataset__(enroll_ids, log, base_date)
        y_with_id = util.load_val_y()
        # The label rows must line up with the sorted enrollment ids.
        if not np.all(y_with_id[:, 0] == enroll_ids):
            logger.fatal('something wrong with enroll_ids')
            raise RuntimeError('something wrong with enroll_ids')
        y = y_with_id[:, 1]
        util.dump(X, pkl_X_path)
        util.dump(y, pkl_y_path)

    # ---- earlier periods -----------------------------------------------
    # base_date = log['time'].max().to_datetime() - timedelta(days=10)
    base_date = datetime(2014, 7, 22, 22, 0, 47)
    Dw = timedelta(days=7)
    enroll_ids = __enroll_ids_with_log__(enroll_ids, log, base_date)

    for _ in range(depth - 1):
        out_of_ids = enroll_ids.size <= 0
        if out_of_ids:
            break
        too_early = (earlist_base_date is not None
                     and base_date < earlist_base_date)
        if too_early:
            break

        logger.debug('load features before %s', base_date)

        # get instances and labels
        pkl_X_path, pkl_y_path = cache_files(base_date)
        if os.path.exists(pkl_X_path) and os.path.exists(pkl_y_path):
            logger.debug('fetch cached')
            X_temp = util.fetch(pkl_X_path)
            y_temp = util.fetch(pkl_y_path)
        else:
            X_temp, y_temp = __load_dataset__(enroll_ids, log, base_date)
            util.dump(X_temp, pkl_X_path)
            util.dump(y_temp, pkl_y_path)

        # update instances and labels
        if not cache_only:
            X = np.r_[X, X_temp]
            y = np.append(y, y_temp)

        # update base_date and enroll_ids
        base_date -= Dw
        enroll_ids = __enroll_ids_with_log__(enroll_ids, log, base_date)

    return X, y
# Compute the RMSE of the factorization U, V over the given ratings.
#
# The user/movie index space is walked in ranges of width `split`.  For each
# range the matching slices of U and V and the ratings returned by fetch()
# are copied to the GPU and the `rmse` kernel fills two per-slot buffers
# (ex_gpu: float32, ey_gpu: int32).  From them a count (num) and a squared
# error term (mse) are derived and folded into a running mean.  Returns the
# square root of that mean.
#
# NOTE(review): the statement nesting below was reconstructed from a
# whitespace-collapsed source - confirm against the original file.
def mf_rmse(U, V, users, movies, ratings, split, latent=30, debug=1):
    # Number of index ranges per axis.
    us = int(math.ceil(np.float(np.max(users)) / split))
    vs = int(math.ceil(np.float(np.max(movies)) / split))
    u1, v1 = 0, 0
    # error: running mean of the squared error; totnum / totmse: running totals.
    error = 0.0
    totnum = 0
    totmse = 0.0
    t4 = time.clock()
    for i in range(us):
        # Inclusive user range [u1, u2], clamped to the largest user id.
        u1 = i * split
        if np.max(users) < u1:
            u1 = int(np.max(users))
        u2 = ((i + 1) * split - 1)
        if np.max(users) < u2:
            u2 = int(np.max(users))
        for j in range(vs):
            # Inclusive movie range [v1, v2], clamped to the largest movie id.
            v1 = j * split
            if np.max(movies) < v1:
                v1 = int(np.max(movies))
            v2 = (j + 1) * split - 1
            if np.max(movies) < v2:
                v2 = int(np.max(movies))
            if debug > 1:
                print("Processing split : ", i, j, u1, u2, v1, v2)
            uu, mm, rr = fetch(u1, u2, v1, v2, users, movies, ratings)
            if debug > 1:
                print("Shapes of uu,mm,rr :", uu.shape, mm.shape, rr.shape)
            t6 = time.clock()
            # Factor slices for this block, flattened to float32 column vectors.
            P, Q = U[u1:u2 + 1, 0:latent], V[0:latent, v1:v2 + 1]
            P = P.reshape(P.shape[0] * P.shape[1], 1).astype(np.float32)
            Q = Q.reshape(Q.shape[0] * Q.shape[1], 1).astype(np.float32)
            tools.clear_context_caches()
            a_gpu = gpuarray.to_gpu(P)
            b_gpu = gpuarray.to_gpu(Q)
            t7 = time.clock()
            u_gpu = gpuarray.to_gpu(uu)
            v_gpu = gpuarray.to_gpu(mm)
            r_gpu = gpuarray.to_gpu(rr)
            # 3072 = 16*16 threads per block times 3*4 blocks of the launch
            # below - presumably one accumulator slot per thread (confirm).
            ex_gpu = gpuarray.zeros((3072, 1), np.float32)
            ey_gpu = gpuarray.zeros((3072, 1), np.int32)
            # Blocks without ratings contribute nothing.
            if len(uu) > 0:
                rmse(a_gpu, b_gpu, u_gpu, v_gpu, r_gpu, ex_gpu, ey_gpu,
                     np.int32(u2 - u1 + 1), np.int32(latent),
                     np.int32(v2 - v1 + 1), np.int32(u1), np.int32(u2),
                     np.int32(v1), np.int32(v2), np.int32(len(uu)),
                     np.int32(len(mm)), block=(16, 16, 1), grid=(3, 4))
                ex = ex_gpu.get()
                ey = ey_gpu.get()
                num = np.sum(ey)
                mse = np.sum(np.dot(ex.T, ey))
                # Fold this block into the running mean, weighted by counts.
                temp = np.float((totnum + num))
                error = error * (totnum / temp) + (mse / temp)
                totnum += num
                totmse += mse
                if debug > 1:
                    print(" mse , error ", totmse, mse, mse / num, error, num, len(uu))
            t8 = time.clock()
    return np.sqrt(error)
def factorize(users, movies, ratings, test_users, test_movies, test_ratings,
              blocks=1, latent=10, steps=10, gpu_steps=1, alpha=0.000001,
              beta=0.01, delta=0.01, rmse_repeat_count=3, debug=2,
              dataset=''):
    """Factorize the rating matrix ``R`` tile by tile and log the progress.

    ``R`` is not a parameter: it is read from the enclosing module scope.
    ``U`` (``R.shape[0]`` x ``latent``) and ``V`` (``latent`` x
    ``R.shape[1]``) start as all-ones.  In each of ``steps`` passes every
    non-empty tile is handed to ``block_factorization`` and the returned
    factors are written back into ``U``/``V``.  After each pass the elapsed
    time and the error over the whole of ``R`` are recorded; both series are
    finally written to text files in the current working directory.

    Parameters
    ----------
    users, movies, ratings : arrays handed to ``fetch`` to find out which
        tiles contain ratings.
    test_users, test_movies, test_ratings : currently unused (the per-step
        error is hardcoded to the training matrix ``R``).
    blocks : number of tiles per axis; the tile edge is ``size // blocks``.
    latent : number of latent factors.
    steps : number of passes over all tiles.
    gpu_steps : forwarded to ``block_factorization`` as ``steps``.
    alpha, beta, dataset : unused in this function.
    delta, rmse_repeat_count : only referenced by the disabled
        early-stopping code at the end of the step loop.
    debug : verbosity; values above 1 print per-tile diagnostics.

    Returns
    -------
    None.  Results are the two ``*_y1.txt`` / ``*_y2.txt`` files.
    """
    U, V = np.ones((R.shape[0], latent)), np.ones((latent, R.shape[1]))

    # Loop invariants: the original recomputed np.max() for every tile.
    user_max = np.max(users)
    movie_max = np.max(movies)
    last_user = int(user_max)
    last_movie = int(movie_max)

    size = max(user_max + 1, movie_max + 1)
    split = int(size / blocks)
    # ``np.float`` was only an alias of the builtin ``float`` and no longer
    # exists in current NumPy releases.
    # NOTE(review): ceil(max / split) rather than ceil((max + 1) / split);
    # when max is an exact multiple of split the last index appears to fall
    # outside every tile -- confirm.
    us = int(math.ceil(float(user_max) / split))
    vs = int(math.ceil(float(movie_max) / split))
    if debug > 1:
        print("Total splits : ", split, us, vs, us * vs)
        print("U, V shapes :", U.shape, V.shape)

    # ``time.clock`` is gone from current Python 3; ``perf_counter`` is the
    # documented replacement for measuring elapsed time.
    start_time = time.perf_counter()
    y1, y2 = [], []          # per-step elapsed seconds / per-step error
    flag1, count = 1000, 0   # state for the disabled early stopping below

    for k in range(steps):
        if debug > 1:
            print("Step : ", k)
        tile_error_sum = 0
        t4 = time.perf_counter()
        for i in range(us):
            # Inclusive user bounds of the tile, clamped to the largest id.
            u1 = min(i * split, last_user)
            u2 = min((i + 1) * split - 1, last_user)
            for j in range(vs):
                # Inclusive movie bounds of the tile, clamped likewise.
                v1 = min(j * split, last_movie)
                v2 = min((j + 1) * split - 1, last_movie)

                uu, mm, rr = fetch(u1, u2, v1, v2, users, movies, ratings)
                if debug > 1:
                    print("Shapes of uu,mm,rr :", uu.shape, mm.shape, rr.shape)
                t6 = time.perf_counter()
                P, Q = U[u1:u2 + 1, 0:latent], V[0:latent, v1:v2 + 1]
                if debug > 1:
                    print("P Q shapes : ", P.shape, Q.shape)
                t7 = time.perf_counter()
                if debug > 1:
                    print("Length of uu,mm ", len(uu), len(mm), u2 - u1 + 1,
                          v2 - v1 + 1, P.shape, Q.shape)

                # Only tiles that actually contain ratings are factorized.
                if len(uu) != 0 and len(mm) != 0:
                    P, Q = block_factorization(P, Q, R, u1, u2, v1, v2,
                                               steps=gpu_steps)
                    t8 = time.perf_counter()
                    if debug > 1:
                        print("Shape of P, Q :", P.shape, Q.shape)
                    # Write the updated tile factors back into U and V.
                    U[u1:u2 + 1, 0:latent] = P.reshape((u2 - u1 + 1, latent))
                    V[0:latent, v1:v2 + 1] = Q.reshape((latent, v2 - v1 + 1))
                    t9 = time.perf_counter()
                    if debug > 1:
                        print("Timer :", round(t7 - t6, 2), round(t8 - t7, 2),
                              round(t9 - t8, 2))
                    temp = error(R, P, Q, u1, u2, v1, v2)
                    tile_error_sum += temp
                    if debug > 1:
                        print("Completed processing : ", i, j,
                              round(tile_error_sum, 3))
        t5 = time.perf_counter()
        if debug > 1:
            print(" Step time taken : ", round(t5 - t4, 2))
        y1.append(round(t5 - start_time, 3))

        # Temporarily hardcoded to work with the training data alone; the
        # test-set call used to be:
        #   e(U, V, test_users, test_movies, test_ratings,
        #     min(split, max(np.max(test_users), np.max(test_movies))),
        #     latent=latent, debug=debug)
        # NOTE(review): the tile calls above pass the user bounds first, yet
        # here R.shape[1] comes before R.shape[0] -- the two bounds look
        # transposed; confirm against ``error``.
        test_rmse = error(R, U, V, 0, R.shape[1], 0, R.shape[0])
        print("Step error :", round(test_rmse, 3))
        y2.append(round(test_rmse, 3))
        flag = round(test_rmse, 4)
        # Un-comment for early convergence stopping:
        # if flag < delta:
        #     break
        # elif flag1 < flag:
        #     break
        # elif rmse_repeat_count < count:
        #     break
        # elif flag == flag1:
        #     count = count + 1
        # else:
        #     count = 0
        # flag1 = flag

    prefix = str(blocks * blocks) + 'blocks_' + str(gpu_steps) + 'iterations_'
    np.savetxt(prefix + 'y2.txt', y2, fmt='%.3f')
    np.savetxt(prefix + 'y1.txt', y1, fmt='%.3f')
return servers[i] else: server = "ss13" return servers[server] while True: try: server = get_server() print(server) if len(server) == 5: try: if server[4] == "fetch": status = util.fetch(server[2], server[3], "status") elif server[4] == "http": status = requests.get(server[2]).json() #print(status) if server[0] in ["Baystation 12"]: details = status["map"]+" | "+str(status["players"])+" players" elif server[0] in ["Goonstation #2","Goonstation RP #1", "BeeStation", "FTL13", "Station Bagil", "Station Terry", "Station Sybil", "Citadel Station"]: details = status["map_name"]+" | "+str(status["players"])+" players" if server[0] in ["Goonstation #2","Goonstation RP #1"]: if status["shuttle_time"] != 'welp' and status["shuttle_time"] != '600': rp.set_activity(state=server[0],details=details,large_text=server[0],large_image=server[1], start=int(time.time())-int(status["elapsed"]), end=int(time.time())+int(status["shuttle_time"])) else: rp.set_activity(state=server[0],details=details,large_text=server[0],large_image=server[1], start=int(time.time())-int(status["elapsed"])) elif server[0] in ["BeeStation", "FTL13", "Station Bagil", "Station Terry", "Station Sybil", "Citadel Station"]:
def factorize(users, movies, ratings, test_users, test_movies, test_ratings,
              blocks=1, latent=30, steps=10, block_steps=1, alpha=0.00001,
              beta=0.01, delta=0.01, rmse_repeat_count=3, debug=2,
              dataset=''):
    """Factorize the rating matrix with one worker thread per tile.

    The module-level ``U`` and ``V`` are (re)initialised through ``initUV``
    and then refined for at most ``steps`` passes.  Each pass walks ``us``
    strata; stratum ``i`` pairs column tile ``j`` with row tile
    ``(i + j) % us`` and runs ``block_factorization`` for every non-empty
    pair in its own thread.  The return value of ``block_factorization`` is
    discarded, so it is expected to update the factors in place (through
    the ``P``/``Q`` slices of ``U``/``V`` it receives, or the globals).

    All threads of a stratum are joined before the next stratum starts and
    before the step error is measured.  The original only tried to poll a
    per-column slot that had just been reset to ``None``, so it never
    actually waited and the error was read while workers were still
    writing.

    Parameters
    ----------
    users, movies, ratings : training triplets; also used to build the
        dense rating matrix passed to the workers.
    test_users, test_movies, test_ratings : forwarded to ``rmse`` after
        every pass.
    blocks : number of tiles per axis; the tile edge is ``size // blocks``.
    latent : number of latent factors.
    steps : maximum number of passes.
    block_steps : forwarded to ``block_factorization``.
    alpha, beta, dataset : unused in this function.
    delta : stop once the rounded step error drops below this value.
    rmse_repeat_count : stop once the rounded error has repeated more than
        this many times in a row.
    debug : verbosity; values above 1 print per-tile diagnostics.

    Returns
    -------
    None.  Results are the global ``U``/``V`` and the ``*_y1.txt`` /
    ``*_y2.txt`` files written to the current working directory.
    """
    global U, V

    # Loop invariants: the original recomputed np.max() for every tile.
    user_max = np.max(users)
    movie_max = np.max(movies)
    last_user = int(user_max)
    last_movie = int(movie_max)

    U, V = initUV(user_max + 1, latent, movie_max + 1)
    R = csr_matrix((ratings, (users, movies))).todense()

    size = max(user_max + 1, movie_max + 1)
    split = int(size / blocks)
    # ``np.float`` was only an alias of the builtin ``float`` and no longer
    # exists in current NumPy releases.
    # NOTE(review): ceil(max / split) rather than ceil((max + 1) / split);
    # when max is an exact multiple of split the last index appears to fall
    # outside every tile -- confirm.
    us = int(math.ceil(float(user_max) / split))
    vs = int(math.ceil(float(movie_max) / split))
    if debug > 1:
        print("Total splits : ", split, us, vs, us * vs)
        print("U, V shapes :", U.shape, V.shape)

    # ``time.clock`` is gone from current Python 3; ``perf_counter`` is the
    # documented replacement for measuring elapsed time.
    start_time = time.perf_counter()
    y1, y2 = [], []               # per-step elapsed seconds / per-step RMSE
    count, last_error = 0, 100    # early-stopping state

    for k in range(steps):
        if debug > 1:
            print("Step : ", k)
        t4 = time.perf_counter()
        for i in range(us):
            workers = []
            for j in range(vs):
                # Row tile paired with column tile j in this stratum.
                xtemp = int((i + j) % us)
                if debug > 1:
                    print("i, j, ii, jj ", i, j, xtemp, j)

                # Inclusive tile bounds, clamped to the largest index.
                u1 = min(xtemp * split, last_user)
                u2 = min((xtemp + 1) * split - 1, last_user)
                v1 = min(j * split, last_movie)
                v2 = min((j + 1) * split - 1, last_movie)
                if debug > 1:
                    print("Processing split : ", i, j, u1, u2, v1, v2)

                uu, mm, rr = fetch(u1, u2, v1, v2, users, movies, ratings)
                if debug > 1:
                    print("Shapes of uu,mm,rr :", uu.shape, mm.shape, rr.shape)
                P, Q = U[u1:u2 + 1, 0:latent], V[v1:v2 + 1, 0:latent]
                if debug > 1:
                    print("P Q shapes : ", P.shape, Q.shape)
                    print("Length of uu,mm ", len(uu), len(mm), u2 - u1 + 1,
                          v2 - v1 + 1, P.shape, Q.shape)

                # Only tiles that actually contain ratings get a worker.
                if len(uu) != 0 and len(mm) != 0:
                    worker = threading.Thread(
                        target=block_factorization,
                        args=(P, Q, R, u1, u2, v1, v2, block_steps))
                    workers.append(worker)
                    worker.start()

            # Barrier: finish this stratum before touching the next one.
            # NOTE(review): when vs > us two column tiles of one stratum map
            # to the same row tile and still run concurrently -- confirm
            # that us == vs is guaranteed by the callers.
            for worker in workers:
                worker.join()

        t5 = time.perf_counter()
        if debug > 1:
            print(" Step time taken : ", round(t5 - t4, 2))
        y1.append(round(t5 - start_time, 3))

        test_rmse = rmse(test_users, test_movies, test_ratings, U, V)
        print("Step error :", round(test_rmse, 3))
        y2.append(round(test_rmse, 3))

        # Early stopping: converged, diverging, or stalled for too long.
        step_error = round(test_rmse, 4)
        if (step_error < delta or last_error < step_error
                or rmse_repeat_count < count):
            break
        if last_error == step_error:
            count = count + 1
        else:
            count = 0
        last_error = step_error

    np.savetxt(str(blocks * blocks) + 'blocks_y2.txt', y2, fmt='%.3f')
    np.savetxt(str(blocks * blocks) + 'blocks_y1.txt', y1, fmt='%.3f')
def _counting_features_by_key(frame, key, n_proc):
    """Map each distinct ``key`` value of ``frame`` to its counting feature.

    The frame is split by ``key`` and ``__get_counting_feature__`` is applied
    to every sub-frame in a process pool of at most ``n_proc`` workers.
    """
    labels, groups = [], []
    # Group by a scalar key: a one-element list makes recent pandas yield
    # 1-tuples as group labels, which would break the dict lookups later on.
    for label, df in frame.groupby(key):
        labels.append(label)
        groups.append(df)
    if not groups:
        # Pool(processes=0) raises ValueError; nothing to compute anyway.
        return {}
    pool = par.Pool(processes=min(n_proc, len(groups)))
    try:
        features = pool.map(__get_counting_feature__, groups)
    finally:
        # Release the workers even when a task raised.
        pool.close()
        pool.join()
    return dict(zip(labels, features))


def source_event_counter(enrollment_set, base_date):
    """
    Counts the source-event pairs.

    Every intermediate result, and the final matrix, is cached through
    ``util.cache_path`` / ``util.dump`` under a name that embeds
    ``base_date``; existing cache files are loaded instead of recomputed.

    Parameters
    ----------
    enrollment_set : iterable of enrollment ids
        One output row is produced per id, in iteration order.
    base_date : datetime
        Only logs and objects dated at or before it are considered.

    Features
    --------
    X0  : counting feature of the enrollment (``__get_counting_feature__``)
    X1  : number of distinct courses of the user, per week of
          ``__week_span__``
    X2  : course population (distinct users of the course)
    X3  : course dropout count (population minus users seen in the last
          10 days)
    X4  : X0 divided by the user's counting feature over all courses
    X5  : X0 divided by the course's counting feature over all users
    X6  : dropout ratio of the course (X3 / X2)
    X7  : days from course first update
    X8  : days from course last update
    X9  : days from user last op
    X10 : days from user first op
    X11 : X7 - X10

    Returns
    -------
    numpy array holding the column-wise concatenation of X0 .. X11.
    """
    stamp = base_date.strftime('%Y-%m-%d_%H-%M-%S')

    X_pkl_path = util.cache_path('source_event_counter_before_%s' % stamp)
    if os.path.exists(X_pkl_path):
        return util.fetch(X_pkl_path)

    logger = logging.getLogger('source_event_counter')
    logger.debug('preparing datasets')

    Enroll_all = util.load_enrollments()

    pkl_path = util.cache_path('Log_all_before_%s' % stamp)
    if os.path.exists(pkl_path):
        Log = util.fetch(pkl_path)
    else:
        Log = util.load_logs()
        # Copy the filtered frame so the column assignments below do not
        # write into a view of the loaded logs.
        Log = Log[Log['time'] <= base_date].copy()
        Log['source_event'] = Log['source'] + '-' + Log['event']
        Log['day_diff'] = (base_date - Log['time']).dt.days
        Log['week_diff'] = Log['day_diff'] // 7
        Log['event_count'] = 1
        util.dump(Log, pkl_path)

    logger.debug('datasets prepared')

    # ``.ix`` no longer exists in pandas; ``.loc`` is the label-based
    # equivalent.  ``list(...)`` because recent pandas rejects set indexers.
    Enroll = Enroll_all.set_index('enrollment_id')\
        .loc[list(enrollment_set)].reset_index()
    enroll_ids = Enroll['enrollment_id'].values

    n_proc = par.cpu_count()

    # --- X0: counting feature per enrollment -----------------------------
    pkl_path = util.cache_path('event_count_by_eid_before_%s' % stamp)
    if os.path.exists(pkl_path):
        event_count_by_eid = util.fetch(pkl_path)
    else:
        # Only needed on a cache miss, so it is aggregated here.
        Log_counted = Log.groupby(
            ['enrollment_id', 'source_event', 'week_diff'])\
            .agg({'event_count': 'sum'}).reset_index()
        event_count_by_eid = _counting_features_by_key(
            pd.merge(Enroll_all, Log_counted, on=['enrollment_id']),
            'enrollment_id', n_proc)
        util.dump(event_count_by_eid, pkl_path)

    X0 = np.array([event_count_by_eid[i] for i in enroll_ids])

    logger.debug('source-event pairs counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X0)), repr(X0.shape))

    pkl_path = util.cache_path('D_full_before_%s' % stamp)
    if os.path.exists(pkl_path):
        D_full = util.fetch(pkl_path)
    else:
        D_full = pd.merge(Enroll_all, Log, on=['enrollment_id'])
        util.dump(D_full, pkl_path)

    # --- X1: distinct courses per user and week --------------------------
    pkl_path = util.cache_path('user_wn_courses_before_%s' % stamp)
    if os.path.exists(pkl_path):
        user_wn_courses = util.fetch(pkl_path)
    else:
        user_wn_courses = {}
        for u, df in D_full.groupby('username'):
            user_wn_courses[u] = [
                len(df[df['week_diff'] == wn]['course_id'].unique())
                for wn in __week_span__
            ]
        util.dump(user_wn_courses, pkl_path)

    X1 = np.array([user_wn_courses[u] for u in Enroll['username']])

    logger.debug('courses by user counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X1)), repr(X1.shape))

    # --- X2: course population -------------------------------------------
    pkl_path = util.cache_path('course_population_before_%s' % stamp)
    if os.path.exists(pkl_path):
        course_population = util.fetch(pkl_path)
    else:
        course_population = {}
        for c, df in D_full.groupby('course_id'):
            course_population[c] = len(df['username'].unique())
        util.dump(course_population, pkl_path)

    X2 = np.array([course_population.get(c, 0) for c in Enroll['course_id']])

    logger.debug('course population counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X2)), repr(X2.shape))

    # --- X3: course dropout count ----------------------------------------
    pkl_path = util.cache_path('course_dropout_count_before_%s' % stamp)
    if os.path.exists(pkl_path):
        course_dropout_count = util.fetch(pkl_path)
    else:
        # Start from the population and subtract the users that were still
        # active during the last 10 days before base_date.
        course_dropout_count = course_population.copy()
        recent = D_full[D_full['day_diff'] < 10]
        for c, df in recent.groupby('course_id'):
            course_dropout_count[c] -= len(df['username'].unique())
        util.dump(course_dropout_count, pkl_path)

    X3 = np.array(
        [course_dropout_count.get(c, 0) for c in Enroll['course_id']])

    logger.debug('course dropout counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X3)), repr(X3.shape))

    # --- X4: share of the user's ops on all courses ----------------------
    pkl_path = util.cache_path('user_ops_count_before_%s' % stamp)
    if os.path.exists(pkl_path):
        user_ops_count = util.fetch(pkl_path)
    else:
        user_ops_on_all_courses = D_full.groupby(
            ['username', 'source_event', 'week_diff'])\
            .agg({'event_count': 'sum'}).reset_index()
        user_ops_count = _counting_features_by_key(
            user_ops_on_all_courses, 'username', n_proc)
        util.dump(user_ops_count, pkl_path)

    X4 = X0 / np.array([user_ops_count[u] for u in Enroll['username']])
    X4[np.isnan(X4)] = 0  # 0 / 0 -> no activity at all

    logger.debug('ratio of user ops on all courses, has nan: %s, shape: %s',
                 np.any(np.isnan(X4)), repr(X4.shape))

    # --- X5: share of the course's ops of all users ----------------------
    pkl_path = util.cache_path('course_ops_count_before_%s' % stamp)
    if os.path.exists(pkl_path):
        course_ops_count = util.fetch(pkl_path)
    else:
        course_ops_of_all_users = D_full.groupby(
            ['course_id', 'source_event', 'week_diff'])\
            .agg({'event_count': 'sum'}).reset_index()
        course_ops_count = _counting_features_by_key(
            course_ops_of_all_users, 'course_id', n_proc)
        util.dump(course_ops_count, pkl_path)

    X5 = X0 / np.array([course_ops_count[c] for c in Enroll['course_id']])
    X5[np.isnan(X5)] = 0  # 0 / 0 -> no activity at all

    logger.debug('ratio of courses ops of all users, has nan: %s, shape: %s',
                 np.any(np.isnan(X5)), repr(X5.shape))

    # --- X6: dropout ratio -----------------------------------------------
    # float() keeps this a true division even where ``/`` on two ints
    # truncates (Python 2 without ``from __future__ import division``).
    X6 = np.array([
        course_dropout_count.get(c, 0) / float(course_population.get(c, 1))
        for c in Enroll['course_id']
    ])

    logger.debug('dropout ratio of courses, has nan: %s, shape: %s',
                 np.any(np.isnan(X6)), repr(X6.shape))

    # --- X7 / X8: course age ---------------------------------------------
    Obj = util.load_object()
    Obj = Obj[Obj['start'] <= base_date]
    course_time = {}
    for c, df in Obj.groupby('course_id'):
        start_time = df['start'].min()
        update_time = df['start'].max()
        course_time[c] = [(base_date - start_time).days,
                          (base_date - update_time).days]

    # Courses without any object fall back to the average over all courses.
    avg_start_days = np.average([t[0] for t in course_time.values()])
    avg_update_days = np.average([t[1] for t in course_time.values()])
    default_case = [avg_start_days, avg_update_days]

    X7 = np.array(
        [course_time.get(c, default_case)[0] for c in Enroll['course_id']])

    logger.debug('days from course first update, has nan: %s, shape: %s',
                 np.any(np.isnan(X7)), repr(X7.shape))

    X8 = np.array(
        [course_time.get(c, default_case)[1] for c in Enroll['course_id']])

    logger.debug('days from course last update, has nan: %s, shape: %s',
                 np.any(np.isnan(X8)), repr(X8.shape))

    # --- X9 / X10: user activity span ------------------------------------
    # The left merge keeps enrollments without logs (NaN -> 0).  groupby
    # sorts by enrollment_id, so the result is re-aligned to the row order
    # of ``Enroll`` that every other feature column uses.
    day_diff_by_eid = pd.merge(Enroll, Log, how='left', on=['enrollment_id'])\
        .groupby('enrollment_id')['day_diff']

    X9 = np.array(day_diff_by_eid.min().fillna(0).reindex(enroll_ids))

    logger.debug('days from user last op, has nan: %s, shape: %s',
                 np.any(np.isnan(X9)), repr(X9.shape))

    X10 = np.array(day_diff_by_eid.max().fillna(0).reindex(enroll_ids))

    logger.debug('days from user first op, has nan: %s, shape: %s',
                 np.any(np.isnan(X10)), repr(X10.shape))

    X11 = X7 - X10

    logger.debug(
        'days from course first update to user first op, has nan: %s'
        ', shape: %s', np.any(np.isnan(X11)), repr(X11.shape))

    X = np.c_[X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11]
    util.dump(X, X_pkl_path)

    return X