def run_with_target(label, target, data_key, model_fn, kf, train_key=None, eval_fn=None):
    if is_in_cache(label + '_' + target):
        return load_cache(label + '_' + target)[0]
    else:
        print('-')
        print_step('Training ' + target)
        if train_key is None:
            train, test = get_data()
        else:
            train, test = load_cache(train_key)
        post_train, post_test = load_cache(data_key)
        if isinstance(post_train, pd.DataFrame):
            post_train = post_train.values
            post_test = post_test.values

        train_y = train[target]
        cv_scores = []
        pred_full_test = 0
        pred_train = np.zeros(train.shape[0])
        i = 1
        if isinstance(kf, StratifiedKFold):
            fold_splits = kf.split(post_train, train_y)
        else:
            fold_splits = kf.split(post_train)
        for dev_index, val_index in fold_splits:
            print_step('Started ' + label + ' ' + target + ' fold ' + str(i))
            dev_X, val_X = post_train[dev_index], post_train[val_index]
            dev_y, val_y = train_y[dev_index], train_y[val_index]
            pred_val_y, pred_test_y = model_fn(dev_X, dev_y, val_X, val_y, post_test,
                                               target, dev_index, val_index)
            pred_full_test = pred_full_test + pred_test_y
            pred_train[val_index] = pred_val_y
            cv_score = eval_fn(val_y, pred_val_y)
            cv_scores.append(cv_score)
            print_step(label + ' ' + target + ' cv score ' + str(i) + ' : ' + str(cv_score))
            i += 1
        print_step(label + ' ' + target + ' cv scores : ' + str(cv_scores))
        mean_cv_score = np.mean(cv_scores)
        print_step(label + ' ' + target + ' mean cv score : ' + str(mean_cv_score))
        # Average the accumulated test predictions over the actual number of folds
        pred_full_test = pred_full_test / float(len(cv_scores))
        results = {'label': label,
                   'target': target,
                   'train': pred_train,
                   'test': pred_full_test,
                   'cv': cv_scores}
        save_in_cache(label + '_' + target, results, None)
        return results
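# --------------------------------------------------------------------------
# Illustrative usage sketch only (not part of the pipeline): it shows the
# shape of the fold-level model function run_with_target expects and how the
# helper might be invoked. The label 'ridge_fe', the cache key 'fe_data', and
# the helper names ridge_model / rmse are hypothetical.
# --------------------------------------------------------------------------
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

def ridge_model(dev_X, dev_y, val_X, val_y, test_X, target, dev_index, val_index):
    # Fit on the dev split and return (validation predictions, test predictions),
    # matching the signature run_with_target calls on every fold.
    model = Ridge(alpha=1.0)
    model.fit(dev_X, dev_y)
    return model.predict(val_X), model.predict(test_X)

def rmse(actual, predicted):
    return np.sqrt(mean_squared_error(actual, predicted))

kf = KFold(n_splits=5, shuffle=True, random_state=2017)
results = run_with_target('ridge_fe', 'deal_probability', 'fe_data',
                          ridge_model, kf, eval_fn=rmse)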
def index():
    if cache.exists():
        worklists = cache.get_data()
    else:
        worklists = generate_worklist(50)
        cache.write_data(worklists)
        write_file_worklist(worklists)
    context = {
        "worklist_address": config("WORKLIST_ADDRESS"),
        "worklist_port": config("WORKLIST_PORT"),
        "calling_ae_title": config("CALLING_AE_TITLE"),
        "called_ae_title": config("CALLED_AE_TITLE"),
        "worklists": worklists,
    }
    return render_template("index.html", **context)
def urls():
    col1 = session['col1']
    col2 = session['col2']
    key = request.args.get('f')
    url_type = request.args.get('type')
    # session['url_types'][0] value must always be the default one, meaning no filters enabled
    session['url_types'] = ['All pages', 'bgg', 'bgc', 'g', 's', 'bgr']
    if not url_type:
        url_type = session['url_type'] = session['url_types'][0]
    urls_data = get_data(col1, col2, key=key, urls=url_type)
    pages = urls_data.pages
    return render_template('urls.html', pages=pages, url_types=session['url_types'],
                           url_type=url_type, key=key)
def index():
    session['cols'] = [x.date for x in db.query(Collection).all()]
    session['cols'].sort()
    session['cols'].reverse()
    cols = session['cols']
    if not session.get('col1'):
        col1 = session['col1'] = session['cols'][0]
        col2 = session['col2'] = session['cols'][1]
    else:
        col1 = session['col1']
        col2 = session['col2']
    if request.method == 'POST':
        col1 = session['col1'] = request.form.get('col1')
        col2 = session['col2'] = request.form.get('col2')
    main_data = get_data(col1, col2)
    keys = main_data.keys()
    return render_template('index.html', data=main_data, keys=keys)
def _ondemand(syms, t0, t1, col='a'):
    """\
    Get price data with the following policy:

    - use offline data if db/cache has it
    - on demand download data from online sources if db/cache does not have the data required
    - save downloaded data to db/cache
    - return price data

    Args:
        syms (list of str): a list of stock symbols, e.g. ['SPY', 'XLK', 'XLF']
        t0 (str): start date datestr, e.g. '2016-01-01'
        t1 (str): last date datestr, e.g. '2016-06-06'
        col (str): a character in colnames_yf.keys()

    Returns:
        A dataframe with price data
    """
    from cache import get_data
    colname = colnames_db[col]
    data = get_data(syms, t0, t1, colname)
    df = pd.DataFrame(data)
    df = df[list(syms)]  # keep syms in original order
    return df
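# Illustrative usage sketch only: the symbols and dates are examples, and the
# assumption that 'a' maps to an adjusted-price column in colnames_db comes
# from the default above, not from anything else in this excerpt.
prices = _ondemand(['SPY', 'XLK', 'XLF'], '2016-01-01', '2016-06-06', col='a')
print(prices.head())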
def data(self):
    value = cache.get_data(self.title)
    if value is None:
        value = super(WikiPage, self).data
        cache.set_data(self.title, value)
    return value
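# Minimal stand-in sketch of the cache interface the data property assumes
# (a plain in-process dict; the project's real cache backend is not shown in
# this excerpt, so this is illustrative only).
class _DictCache(object):
    def __init__(self):
        self._store = {}

    def get_data(self, key):
        # Return the cached value, or None on a miss so the caller recomputes.
        return self._store.get(key)

    def set_data(self, key, value):
        # Remember the freshly computed value for subsequent lookups.
        self._store[key] = value

# cache = _DictCache()  # e.g. how the module-level `cache` object could be backed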
import re
import string

import pandas as pd
import numpy as np

from nltk.corpus import stopwords

from utils import print_step, bin_and_ohe_data
from cache import get_data, is_in_cache, load_cache, save_in_cache


print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

if not is_in_cache('data_with_fe'):
    print('~~~~~~~~~~~~')
    print_step('Merging')
    merge = pd.concat([train, test])

    print('~~~~~~~~~~~~~~~~~~~')
    print_step('Imputation 1/7')
print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score)) EMBEDDING_FILE = 'cache/crawl/crawl-300d-2M.vec' max_features = 30000 maxlen = 100 embed_size = 300 epochs = 4 batch_size = 32 predict_batch_size = 1024 if not is_in_cache('lvl1_double-gru'): train_df, test_df = get_data() classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'] X_train = train_df['comment_text'].fillna('peterhurford').values y_train = train_df[classes].values X_test = test_df['comment_text'].fillna('peterhurford').values print_step('Tokenizing data...') tokenizer = Tokenizer(num_words=max_features) tokenizer.fit_on_texts(list(X_train) + list(X_test)) x_train = tokenizer.texts_to_sequences(X_train) x_test = tokenizer.texts_to_sequences(X_test) print(len(x_train), 'train sequences') print(len(x_test), 'test sequences') print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int))) print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))
def run_nn_model(label, model, max_features, maxlen, epochs, batch_size, predict_batch_size):
    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    for embedding_name, embedding_file in EMBEDDING_FILES.items():
        if is_in_cache(label + '_' + embedding_name):
            print_step('Already trained ' + label + '_' + embedding_name + '! Skipping...')
        else:
            train_df, test_df = get_data()

            print_step('Loading embed ' + embedding_name + '...')
            embed_size = EMBED_SIZE_LOOKUP[embedding_name]
            x_train, x_test, embedding_matrix = tokenize_and_embed(
                train_df, test_df, embedding_file, max_features, maxlen, embed_size, embedding_name)
            y_train = train_df[classes].values

            print_step('Build model...')
            # `model` is the model-building function passed in; keep it callable across
            # embeddings by storing the built instance separately, and snapshot the fresh
            # weights under the same path the fold loop reloads from.
            nn_model = model(max_features, maxlen, embed_size, embedding_matrix)
            nn_model.save_weights('cache/' + label + '_' + embedding_name + '-model-weights.h5')

            print_step('Making KFold for CV')
            kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)

            i = 1
            cv_scores = []
            pred_train = np.zeros((train_df.shape[0], 6))
            pred_full_test = np.zeros((test_df.shape[0], 6))
            for dev_index, val_index in kf.split(x_train, y_train[:, 0]):
                print_step('Started fold ' + str(i))
                # Reset to the untrained weights before fitting this fold
                nn_model.load_weights('cache/' + label + '_' + embedding_name + '-model-weights.h5')
                dev_X, val_X = x_train[dev_index], x_train[val_index]
                dev_y, val_y = y_train[dev_index, :], y_train[val_index, :]
                RocAuc = RocAucEvaluation(validation_data=(val_X, val_y), interval=1)
                nn_model.fit(dev_X, dev_y, batch_size=batch_size, epochs=epochs,
                             validation_data=(val_X, val_y), callbacks=[RocAuc])
                val_pred = nn_model.predict(val_X, batch_size=predict_batch_size, verbose=1)
                pred_train[val_index, :] = val_pred
                test_pred = nn_model.predict(x_test, batch_size=predict_batch_size, verbose=1)
                pred_full_test = pred_full_test + test_pred
                cv_score = [roc_auc_score(val_y[:, j], val_pred[:, j]) for j in range(6)]
                print_step('Fold ' + str(i) + ' done')
                pprint(list(zip(classes, cv_score)))
                cv_scores.append(cv_score)
                i += 1
            print_step('All folds done!')
            print('CV scores')
            pprint(list(zip(classes, np.mean(cv_scores, axis=0))))
            mean_cv_score = np.mean(np.mean(cv_scores, axis=0))
            print('mean cv score : ' + str(mean_cv_score))
            pred_full_test = pred_full_test / 5.

            # Use one column naming scheme for both the cache and the submission below
            for k, classx in enumerate(classes):
                train_df[label + '_' + embedding_name + '_' + classx] = pred_train[:, k]
                test_df[label + '_' + embedding_name + '_' + classx] = pred_full_test[:, k]

            print('~~~~~~~~~~~~~~~~~~')
            print_step('Cache Level 1')
            save_in_cache('lvl1_' + label + '_' + embedding_name, train_df, test_df)
            print_step('Done!')

            print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
            print_step('Prepping submission file')
            submission = pd.DataFrame()
            submission['id'] = test_df['id']
            submission['toxic'] = test_df[label + '_' + embedding_name + '_toxic']
            submission['severe_toxic'] = test_df[label + '_' + embedding_name + '_severe_toxic']
            submission['obscene'] = test_df[label + '_' + embedding_name + '_obscene']
            submission['threat'] = test_df[label + '_' + embedding_name + '_threat']
            submission['insult'] = test_df[label + '_' + embedding_name + '_insult']
            submission['identity_hate'] = test_df[label + '_' + embedding_name + '_identity_hate']
            submission.to_csv('submit/submit_lvl1_' + label + '_' + embedding_name + '.csv', index=False)
            print_step('Done')
def create_course_objects(tables: List[BeautifulSoup]) -> Tuple[Course, Instructor]:
    course = Course()
    instructor = Instructor()
    for i, table in enumerate(tables[:max_tables]):
        for row in table.findAll('tr'):
            if row.has_attr('bgcolor'):
                continue
            cells = [list(cell.stripped_strings) for cell in row.findAll('td')]
            if i == table_type['GENERAL_INFO']:
                log.info("parsing through course info")
                course.sln = cells[0][0] if cells[0] else None
                if cells[1]:
                    course_tokens = cells[1][0].split()
                    course.department = course_tokens[0]
                    course.number = course_tokens[1]
                course.section = cells[2][0] if cells[2] else None
                course.type = cells[3][0] if cells[3] else None
                # TODO: Add way to handle fractional credits (i.e., 2.5)
                if len(cells) > 7:
                    credit_tokens = cells[5][0].strip().split('-') if cells[5] else []
                    if len(credit_tokens) > 1:
                        course.lower_credits = credit_tokens[0]
                        course.upper_credits = credit_tokens[1]
                    else:
                        course.lower_credits = credit_tokens[0]
                        course.upper_credits = credit_tokens[0]
                    course.name = cells[6][0]
                    gen_ed_marker = cells[7][0] if cells[7] else None
                else:
                    credit_tokens = cells[4][0].strip().split('-') if cells[4] else []
                    if len(credit_tokens) > 1:
                        course.lower_credits = credit_tokens[0]
                        course.upper_credits = credit_tokens[1]
                    else:
                        course.lower_credits = credit_tokens[0]
                        course.upper_credits = credit_tokens[0]
                    course.name = cells[5][0] if cells[5] else None
                    gen_ed_marker = cells[6][0] if cells[6] else None
                log.info(gen_ed_marker)
                if gen_ed_marker:
                    gen_eds = gen_ed_marker.split(",")  # QSR,NW --> [QSR, NW]
                    for gen_ed in gen_eds:
                        if gen_ed in course.general_education:
                            course.general_education[gen_ed] = True
            elif i == table_type['ENROLLMENT']:
                log.info("parsing through course info (enrollment)")
                course.current_size = cells[0][0] if cells[0] else None
                course.max_size = cells[1][0] if cells[1] else None
                if len(cells) > 4 and cells[4][0] == 'Entry Code required':
                    course.add_code_required = True
            elif i == table_type['MEETINGS']:
                log.info("parsing through meeting times")
                log.info(cells)
                # If there is more than one meeting location:
                # Ex: TTh 08:45-09:45 UW1 121 GUNNERSON,KIM N.
                #     TTh 09:45-10:50 UW2 131 GUNNERSON,KIM N.
                # meeting_days: [TTh, TTh]
                # start_times: [08:45, 09:45]
                # end_times: [09:45, 10:50]
                # rooms: [UW1 121, UW2 131]
                if cells[0] and cells[0][0] != 'To be arranged':
                    meeting_days = cells[0]
                    start_times = [time_range.split('-')[0].replace('\u00a0', ' ')
                                   for time_range in cells[1]]
                    end_times = [time_range.split('-')[1].replace('\u00a0', ' ')
                                 for time_range in cells[1]]
                    rooms = [room.replace('\u00a0', ' ') for room in cells[2]]
                    for days, start_time, end_time, room in zip(meeting_days, start_times,
                                                                end_times, rooms):
                        room_tokens = room.split()
                        if len(room_tokens) == 1:
                            room_building = room_tokens[0]
                            room_number = None
                        else:
                            room_building, room_number = room_tokens
                        new_meeting = {
                            "room_building": room_building,
                            "room_number": room_number,
                            "meeting_days": days,
                            "start_time": start_time,
                            "end_time": end_time
                        }
                        course.meetings.append(new_meeting)
                    instructor_name = cells[3][0] if cells[3] else None
                    log.info(f"instructor name: {instructor_name}")
                    instructor_tokens = instructor_name.split(',') if instructor_name else []
                    if len(instructor_tokens) > 1:
                        instructor.first_name = instructor_tokens[1]
                        instructor.last_name = instructor_tokens[0]
                        log.info(f"split instructor name: {instructor_tokens}")
                        first_name_tokens = instructor.first_name.split(' ')
                        log.info(f"first name: {first_name_tokens}")
                        if len(first_name_tokens) > 1:
                            instructor.first_name = first_name_tokens[0]
                            instructor.middle_name = first_name_tokens[1]
                        else:
                            instructor.middle_name = ""
                        log.info(f"{instructor.first_name}, {instructor.middle_name}, {instructor.last_name}")
                        log.info("retrieving data for instructor email and phone number")
                        data = get_data(instructor.first_name, instructor.last_name)
                        if data and not data.get('error'):
                            instructor.email = data['teacher'][0]['email']
                            instructor.phone_number = data['teacher'][0]['phone']
            elif i == table_type['NOTES']:
                log.info("Retrieving course description...")
                log.info(cells)
                lines = cells[0]
                course.description = "\n".join([line if line else "" for line in lines])
                break
    log.info("Done collecting course information and instructor information.")
    return course, instructor
        'lambda_l2': 1
    }
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=rounds_lookup[label],
                      valid_sets=watchlist,
                      verbose_eval=100)
    print(model.feature_importance())
    pred_test_y = model.predict(test_X)
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2


if not is_in_cache('convai_with_fe'):
    print_step('Importing base data')
    train_base, test_base = get_data()

    print_step('Importing ConvAI data')
    train, test = load_cache('convai_data')

    print_step('Importing FE')
    train_fe, test_fe = load_cache('fe_lgb_data')

    print_step('Merging')
    train_fe['id'] = train_base['id']
    test_fe['id'] = test_base['id']
    train_ = pd.merge(train_fe, train, on='id')
    test_ = pd.merge(test_fe, test, on='id')
    del train_base
    del test_base
    del train_fe
import numpy as np

from scipy.optimize import minimize
from sklearn.metrics import roc_auc_score

from cache import get_data, load_cache


def auc_func(weights):
    final_prediction = 0
    for weight, prediction in zip(weights, blend_train):
        final_prediction += weight * prediction
    return 1 - roc_auc_score(y_train, final_prediction)


base_train, base_test = get_data()
train, test = load_cache('lvl3_all_mix')

labels = ['toxic', 'severe_toxic', 'obscene', 'insult', 'threat', 'identity_hate']
for label in labels:
    y_train = base_train[label]
    print('\n Finding Blending Weights for ' + label + '...')
    blend_train = np.array([train['lvl2_all_lgb_' + label].rank().values,
                            train['lvl2_all_xgb_' + label].rank().values,
                            train['final-cnn_' + label].rank().values,
                            train['lvl2_all_rf_' + label].rank().values])
    blend_test = np.array([test['lvl2_all_lgb_' + label].rank().values,
                           test['lvl2_all_xgb_' + label].rank().values,
                           test['final-cnn_' + label].rank().values,
                           test['lvl2_all_rf_' + label].rank().values])
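    # The excerpt ends before the optimizer call; a sketch of how the imported
    # `minimize` could be applied to auc_func inside this loop. The SLSQP
    # settings below (equal starting weights, weights bounded to [0, 1] and
    # constrained to sum to 1) are assumptions, not the repo's confirmed choice.
    starting_values = [1. / len(blend_train)] * len(blend_train)
    constraints = ({'type': 'eq', 'fun': lambda w: 1 - sum(w)},)
    bounds = [(0., 1.)] * len(blend_train)
    res = minimize(auc_func, starting_values, method='SLSQP',
                   bounds=bounds, constraints=constraints)
    print('Best AUC for ' + label + ': ' + str(1 - res.fun))
    print('Best weights for ' + label + ': ' + str(res.x))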
def runRidge(train_X, train_y, test_X, test_y, test_X2, params):
    model = Ridge(**params)
    print_step('Fit Ridge')
    model.fit(train_X, train_y)
    print_step('Ridge Predict 1/2')
    pred_test_y = model.predict(test_X)
    print_step('Ridge Predict 2/2')
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

if (not is_in_cache('tfidf_ridges') or not is_in_cache('titlecat_tfidf')
        or not is_in_cache('text_tfidf') or not is_in_cache('text_char_tfidf')):
    print('~~~~~~~~~~~~~~~~~~~~')
    print_step('Title TFIDF 1/2')
    tfidf = TfidfVectorizer(ngram_range=(1, 1),