def main(argv):
    global CACHE
    dest_dir = '/home/killbots/killbots.net/random/'
    if len(argv) > 1:
        dest_dir = argv[1]
    CACHE = os.path.join(dest_dir, CACHE)
    data = cache.load_cache(CACHE)

    latest_update = get_latest_update()
    url_suffix = latest_update[0]
    time = latest_update[1:]
    places = ['Kita Ibaraki City', 'Takahagi City', 'Daigo Town', 'KEK',
              'AIST (3F)', 'AIST (Carpark)']

    try:
        data = get_levels(url_suffix, data)
    except:
        traceback.print_exc()
    try:
        data = get_kek(data)
    except:
        traceback.print_exc()
    try:
        data = get_aist(data)
    except:
        traceback.print_exc()

    cache.save_cache(data, CACHE)
    plot_data(places, dest_dir)
def main():
    logging.basicConfig(level=logging.DEBUG)

    # Parser and args
    parser = create_parser()
    args = parser.parse_args()

    # Setup resources and dirs
    dest = open(args.out, 'w')
    res_dir = os.path.split(os.path.abspath(__file__))[0]
    template = open(os.path.join(res_dir, 'template.html'), 'r').read()
    output = HTMLOutput(dest, template)
    cache_dir = os.path.split(args.out)[0]

    # Use cache
    dbs = [FilmwebDatabase()]
    if not args.force:
        cache = load_cache(cache_dir, args.out)
        if cache:
            logging.info("using cache file")
            dbs = cache

    # Get movies
    movies = find_movies_info(args.dirs, dbs, output, '-rating')

    # Histogram?
    if args.histogram:
        path = os.path.join(cache_dir, '.movierank-histogram.png')
        histogram(movies, path)
        output.add_extra('histogram', path)

    # Finish
    store_cache(cache_dir, dbs, suffix=args.out)
    output.flush()

    # Run browser?
    if args.run:
        subprocess.Popen(["xdg-open", args.out],
                         stderr=subprocess.STDOUT,
                         stdout=subprocess.PIPE)
def compute_count():
    """Computes the stats"""
    global JxStatsArray, JxPartitionLists, NoType
    JxCache = load_cache()
    try:
        Query = """select cards.factId, cards.id, cards.reps, cards.interval
            from cards, cards as mCards
            where mCards.modified>%s and cards.factId=mCards.factId
            group by cards.id order by cards.factId""" % JxCache['TimeCached']
        JxStatsArray = JxCache['Stats']
        JxPartitionLists = JxCache['Partitions']
        NoType = JxCache['NoType']
    except:
        Query = """select factId, id, reps, interval from cards
            order by factId"""
        NoType = 0

    # known/seen/in deck
    for (Type, List) in JxStatsMap.iteritems():
        for (k, Map) in enumerate(List):
            for (Key, String) in Map.Order + [('Other', 'Other')]:
                if k != 1:
                    JxStatsArray[(Type, k, Key)] = (0, 0, 0, len(
                        [Item for (Item, Value) in Map.Dict.iteritems() if Value == Key]))
                elif Type == 'Word':
                    JxStatsArray[(Type, k, Key)] = (0, 0, 0, sum(
                        [Jx_Word_Occurences[Item] for (Item, Value) in Map.Dict.iteritems() if Value == Key]))
                else:
                    JxStatsArray[(Type, k, Key)] = (0, 0, 0, sum(
                        [Jx_Kanji_Occurences[Item] for (Item, Value) in Map.Dict.iteritems() if Value == Key]))
                for Label in ['Known', 'Seen', 'InDeck']:
                    JxPartitionLists[(Type, k, Key, Label)] = []

    # we compute known/seen/in deck/total stats for each value of each map and each type
    Rows = mw.deck.s.all(Query)
    CardState = []
    Length = len(Rows)
    Index = 0
    while Index < Length:
        (FactId, CardId, CardRep, Interval) = Rows[Index]
        # set the card's status
        if Interval > 21 and CardRep:
            CardState.append(0)
        elif CardRep:
            CardState.append(1)
        else:
            CardState.append(2)
        Index += 1
        if Index == Length:
            # we have finished parsing the entries: flush the last fact and break
            JxFlushFactStats(CardState, CardId)
            break
        elif FactId == Rows[Index][0]:
            # same fact: does nothing, but kept explicit for speed because it happens a lot
            pass
        else:
            # fact change: flush the status change
            JxFlushFactStats(CardState, CardId)
            CardState = []

    # now cache the updated stats
    JxCache['Stats'] = JxStatsArray
    JxCache['Partitions'] = JxPartitionLists
    JxCache['NoType'] = NoType
    JxCache['TimeCached'] = time()
    # among the few things that could corrupt the cache:
    # new entries in the database before the cache was saved... sigh...
    save_cache(JxCache)
#!/usr/bin/env python
import datetime

import cache

f = open('previous.txt', 'r')
lines = f.readlines()[2:]
f.close()

dest = cache.load_cache('fukushima.dat')

for l in lines:
    cells = l.rstrip().split('\t')
    print cells, ' | ',
    ts = datetime.datetime.strptime(cells[1], '%m/%d/%Y %H:%M:%S')
    cells = cells[3:]
    print cells
    for ii, c in enumerate(cells[:7]):
        try:
            float(c.strip())
        except ValueError:
            continue
        dest.set_value(ts, ii, c.strip())
    if len(cells) >= 8:
        dest.set_value(ts, 8, cells[7].strip())

cache.save_cache(dest, 'fukushima.dat')
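# Added sketch: the `cache` helper module used above (and in the other scripts in
# this collection that call cache.load_cache / cache.save_cache with a file path)
# is not shown here. A minimal pickle-backed version with the same
# load_cache(path) / save_cache(obj, path) signatures could look like the
# following; this is an assumption, not the original module.
import os
import pickle


def load_cache(path):
    """Return the previously saved object, or None if no cache file exists yet."""
    if not os.path.exists(path):
        return None
    with open(path, 'rb') as fh:
        return pickle.load(fh)


def save_cache(obj, path):
    """Pickle `obj` to `path`, overwriting any existing cache file."""
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)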
image_chunks = list(chunks(image_files, 1000))
print_step('Chunked images into %d groups with %d images per group' %
           (len(image_chunks), len(image_chunks[0])))

pool = mp.ProcessingPool(n_nodes, maxtasksperchild=500)
print_step('Starting a jobs server with %d nodes' % n_nodes)
res = pool.map(lambda dat: get_img_data(dat[0], dat[1]), enumerate(image_chunks))
pool.close()
pool.join()
pool.terminate()
pool.restart()

print('~~~~~~~~~~~~~~~~')
print_step('Merging 1/9')
dfs = pool.map(lambda c: load_cache('img_data_' + str(c)), range(len(image_chunks)))
print_step('Merging 2/9')
dfs = map(lambda x: x[0], dfs)
print_step('Merging 3/9')
merge = pd.concat(dfs)
print_step('Merging 4/9')
train, test = get_data()
print_step('Merging 5/9')
merge['img_path'] = merge['img_path'].apply(lambda x: x.replace('test_jpg/', ''))
print_step('Merging 6/9')
merge['img_path'] = merge['img_path'].apply(lambda x: x.replace('train_jpg/', ''))
print_step('Merging 7/9')
merge['img_path'] = merge['img_path'].apply(lambda x: x.replace('.jpg', ''))
        'random_state': 16,
        'verbose': 2
    }
    model = RandomForestClassifier(**params)
    model.fit(train_X, train_y)
    pred_test_y = model.predict_proba(test_X)[:, 1]
    pred_test_y2 = model.predict_proba(test_X2)[:, 1]
    pred_test_y = minmax_scale(pd.Series(pred_test_y).rank().values)  # Rank transform
    pred_test_y2 = minmax_scale(pd.Series(pred_test_y2).rank().values)
    return pred_test_y, pred_test_y2


if not is_in_cache('lvl2_all'):
    print_step('Importing 1/21: LRs')
    lr_train, lr_test = load_cache('lvl1_lr')
    print_step('Importing 2/21: FE')
    train_fe, test_fe = load_cache('fe_lgb_data')
    print_step('Importing 3/21: Sparse LGBs')
    lgb_train, lgb_test = load_cache('lvl1_sparse_lgb')
    print_step('Importing 4/21: FE LGB')
    fe_lgb_train, fe_lgb_test = load_cache('lvl1_fe_lgb')
    print_step('Importing 5/21: Sparse FE LGB')
    sfe_lgb_train, sfe_lgb_test = load_cache('lvl1_sparse_fe_lgb')
    print_step('Importing 6/21: FM')
    fm_train, fm_test = load_cache('lvl1_fm')
    print_step('Importing 7/21: Ridge')
    ridge_train, ridge_test = load_cache('lvl1_ridge')
    print_step('Importing 8/21: GRU')
    gru_train, gru_test = load_cache('lvl1_gru')
    print_step('Importing 9/21: GRU2')
def runChainedFM(train_X, train_y, test_X, test_y, test_X2, label, dev_index, val_index):
    print_step('Loading Lvl1')
    lvl1_train, lvl1_test = load_cache('lvl1_fm')
    # Binarize the level-1 FM predictions (threshold 0.5) for every label except the
    # one currently being modeled, and convert them to sparse matrices.
    lvl1_train = csr_matrix(
        pd.concat([lvl1_train[c].apply(lambda x: 0 if x < 0.5 else 1)
                   for c in lvl1_train.columns if 'fm_' in c and c != label],
                  axis=1).values)
    lvl1_test = csr_matrix(
        pd.concat([lvl1_test[c].apply(lambda x: 0 if x < 0.5 else 1)
                   for c in lvl1_test.columns if 'fm_' in c and c != label],
                  axis=1).values)
    print_step('Merging 1/3')
    lvl1_valid = lvl1_train[val_index]
    lvl1_train = lvl1_train[dev_index]
    train_X = csr_matrix(hstack([train_X, lvl1_train]))
    print_step('Merging 2/3')
    test_X = csr_matrix(hstack([test_X, lvl1_valid]))
    print_step('Merging 3/3')
    test_X2 = csr_matrix(hstack([test_X2, lvl1_test]))
    print_step('Modeling')
    class_weights = {'toxic': 1.0,
                     'severe_toxic': 0.2,
                     'obscene': 1.0,
                     'threat': 0.1,
                     'insult': 0.8,
                     'identity_hate': 0.2}
    model = FM_FTRL(alpha=0.02, beta=0.01, L1=0.00001, L2=30.0, D=train_X.shape[1],
                    alpha_fm=0.1, L2_fm=0.5, init_fm=0.01, weight_fm=50.0, D_fm=200,
                    e_noise=0.0, iters=3, inv_link="identity", e_clip=1.0,
                    threads=4, use_avx=1, verbose=1)
    train_weight = np.array([1.0 if x == 1 else class_weights[label] for x in train_y])
    model.fit(train_X, train_y, train_weight, reset=False)
    pred_test_y = sigmoid(model.predict(test_X))
    pred_test_y2 = sigmoid(model.predict(test_X2))
    return pred_test_y, pred_test_y2
print('~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 1/11')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

if not is_in_cache('deep_text_feats4'):
    print('~~~~~~~~~~~~~~~~~~~~~~~')
    print_step('Importing Data 2/11')
    tfidf_train, tfidf_test = load_cache('titlecat_tfidf')
    print_step('Importing Data 3/11')
    tfidf_train2, tfidf_test2 = load_cache('text_tfidf')
    print_step('Importing Data 4/11')
    tfidf_train3, tfidf_test3 = load_cache('text_char_tfidf')
    print_step('Importing Data 5/11')
    train = hstack((tfidf_train, tfidf_train2, tfidf_train3)).tocsr()
    print_step('Importing Data 6/11')
    test = hstack((tfidf_test, tfidf_test2, tfidf_test3)).tocsr()
    print(train.shape)
    print(test.shape)
print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 1/15')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 2/15')
train_fe, test_fe = load_cache('data_with_fe')

print_step('Importing Data 3/15')
train_ridge, test_ridge = load_cache('tfidf_ridges')
drops = [c for c in train_ridge.columns if 'svd' in c or 'tfidf' in c]
train_ridge.drop(drops, axis=1, inplace=True)
test_ridge.drop(drops, axis=1, inplace=True)
train_ = train_ridge
test_ = test_ridge

print_step('Importing Data 4/15')
train_['parent_category_name'] = train_fe['parent_category_name']
test_['parent_category_name'] = test_fe['parent_category_name']
train_['price'] = train_fe['price']
test_['price'] = test_fe['price']
    tfidf = TfidfVectorizer(ngram_range=(1, 2),
                            max_features=300000,
                            min_df=2,
                            max_df=0.8,
                            binary=True,
                            encoding='KOI8-R')
    tfidf_train = tfidf.fit_transform(train['titlecat'])
    print(tfidf_train.shape)
    print_step('Titlecat TFIDF 3/3')
    tfidf_test = tfidf.transform(test['titlecat'])
    print(tfidf_test.shape)
    print_step('Saving to cache...')
    save_in_cache('titlecat_tfidf', tfidf_train, tfidf_test)
else:
    print_step('Loading from cache...')
    tfidf_train, tfidf_test = load_cache('titlecat_tfidf')

print_step('Titlecat Stats 1/6')
train['titlecat_tfidf_sum'] = tfidf_train.sum(axis=1)
print_step('Titlecat Stats 2/6')
train['titlecat_tfidf_mean'] = tfidf_train.mean(axis=1)
print_step('Titlecat Stats 3/6')
train['titlecat_tfidf_nnz'] = tfidf_train.getnnz(axis=1)
print_step('Titlecat Stats 4/6')
test['titlecat_tfidf_sum'] = tfidf_test.sum(axis=1)
print_step('Titlecat Stats 5/6')
test['titlecat_tfidf_mean'] = tfidf_test.mean(axis=1)
print_step('Titlecat Stats 6/6')
test['titlecat_tfidf_nnz'] = tfidf_test.getnnz(axis=1)

print_step('Titlecat SVD 1/4')
    toxic = toxic.drop('worker_id', axis=1).groupby('rev_id').mean().reset_index()
    print_step('Processing 7/9')
    toxic = toxic_comments[['rev_id', 'comment']].merge(toxic, on='rev_id').drop('rev_id', axis=1)
    print_step('Processing 8/9')
    toxic['toxicity_label'] = toxic['toxicity'].apply(lambda x: 1 if x > 0.1 else 0)
    toxic['comment_text'] = toxic['comment']
    toxic.drop('comment', axis=1, inplace=True)
    print_step('Processing 9/9')
    save_in_cache('extra_data_toxic', toxic, test)
else:
    attack, test = load_cache('extra_data_attack')
    toxic, test = load_cache('extra_data_toxic')

print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Making KFold for CV')
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)
kf_for_regression = KFold(n_splits=5, shuffle=True, random_state=2017)

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Run TFIDF WORD-CHAR UNION 1/2')
if is_in_cache('tfidf_char_union_extra_data_attack'):
    post_train, post_test = load_cache('tfidf_char_union_extra_data_attack')
else:
    TFIDF_UNION1.update({'train': attack, 'test': test})
    post_trainw, post_testw = run_tfidf(**TFIDF_UNION1)
    TFIDF_UNION2.update({'train': attack, 'test': test})
    wordbatch_test = wb.transform(test['titlecat'])
    print(wordbatch_test.shape)
    del wb
    gc.collect()
    print_step('Titlecat Wordbatch 4/5')
    mask = np.where(wordbatch_train.getnnz(axis=0) > 3)[0]
    wordbatch_train = wordbatch_train[:, mask]
    print(wordbatch_train.shape)
    print_step('Titlecat Wordbatch 5/5')
    wordbatch_test = wordbatch_test[:, mask]
    print(wordbatch_test.shape)
    print_step('Saving to cache...')
    save_in_cache('titlecat_wordbatch', wordbatch_train, wordbatch_test)
else:
    print_step('Loading from cache...')
    wordbatch_train, wordbatch_test = load_cache('titlecat_wordbatch')

print('~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Text Wordbatch 1/5')
train['desc'] = train['title'].fillna('') + ' ' + train['description'].fillna('')
test['desc'] = test['title'].fillna('') + ' ' + test['description'].fillna('')
if not is_in_cache('text_wordbatch'):
    print_step('Text Wordbatch 2/5')
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {"hash_ngrams": 2,
                                                  "hash_ngrams_weights": [1.0, 1.0],
                                                  "hash_size": 2 ** 28,
                                                  "norm": "l2",
                                                  "tf": 1.0,
                                                  "idf": None}),
                             procs=8)
    wb.dictionary_freeze = True
    wordbatch_train = wb.fit_transform(train['desc'].fillna(''))
print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 1/19')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 2/19')
train_fe, test_fe = load_cache('data_with_fe')
print_step('Importing Data 3/19')
train_ridge, test_ridge = load_cache('tfidf_ridges')
print_step('Importing Data 4/19 1/2')
train_fe = pd.concat([train_fe, train_ridge], axis=1)
print_step('Importing Data 4/19 2/2')
test_fe = pd.concat([test_fe, test_ridge], axis=1)
print_step('Importing Data 5/19 1/3')
train_base_lgb, test_base_lgb = load_cache('base_lgb')
print_step('Importing Data 5/19 2/3')
train_fe['base_lgb'] = train_base_lgb['base_lgb']
print_step('Importing Data 5/19 3/3')
test_fe['base_lgb'] = test_base_lgb['base_lgb']
cat_bins = list(map(lambda s: s.replace('/', '-'), list(set(train['cat_bin'].values))))

n_cpu = mp.cpu_count()
n_nodes = max(n_cpu - 3, 2)
print('Starting a jobs server with %d nodes' % n_nodes)
pool = mp.ProcessingPool(n_nodes, maxtasksperchild=500)
res = pool.map(run_ridge_on_cat_bin, cat_bins)
pool.close()
pool.join()
pool.terminate()
pool.restart()

print('~~~~~~~~~~~~~~~~')
print_step('Merging 1/5')
pool = mp.ProcessingPool(n_nodes, maxtasksperchild=500)
dfs = pool.map(lambda c: load_cache('cat_bin_ridges_' + c), cat_bins)
pool.close()
pool.join()
pool.terminate()
pool.restart()
print_step('Merging 2/5')
train_dfs = map(lambda x: x[0], dfs)
test_dfs = map(lambda x: x[1], dfs)
print_step('Merging 3/5')
train_df = pd.concat(train_dfs)
test_df = pd.concat(test_dfs)
print_step('Merging 4/5')
train_ridge = train.merge(train_df, on='item_id')
print_step('Merging 5/5')
test_ridge = test.merge(test_df, on='item_id')
def update_stats_cache():
    global JxStateArray, JxStatsMap
    JxProfile('Load Cache')
    JxCache = load_cache()
    JxProfile('Load Cache ended')
    try:
        query = """
        select cards.factId, reviewHistory.cardId, reviewHistory.time,
        reviewHistory.lastInterval, reviewHistory.nextInterval, reviewHistory.ease
        from reviewHistory, cards
        where cards.id = reviewHistory.cardId and cards.modified>%s
        order by cards.factId, reviewHistory.cardId, reviewHistory.time""" % JxCache['TimeCached']
        JxStateArray = JxCache['StateArray']
    except:
        query = """
        select cards.factId, reviewHistory.cardId, reviewHistory.time,
        reviewHistory.lastInterval, reviewHistory.nextInterval, reviewHistory.ease
        from reviewHistory, cards
        where cards.id = reviewHistory.cardId
        order by cards.factId, reviewHistory.cardId, reviewHistory.time"""
        JxStateArray = {}
    rows = mw.deck.s.all(query)
    JxProfile("Query ended")

    length = len(rows)
    index = 0
    JxCardState = []
    JxCardStateArray = []
    StatusStart = 0
    # We will initialize other stuff on the fly!
    while index < length:
        # 0: FactId, 1: CardId, 2: Time, 3: lastInterval, 4: nextInterval, 5: ease
        (FactId, CardId, Time, Interval, NextInterval, Ease) = rows[index]
        # First, build the list of days on which status changes happened for this card (+ - + - + - ...)
        if Interval <= 21 and NextInterval > 21:
            # Card status change
            Day = int(Time / 86400.0)
            JxCardState.append(Day)
            if StatusStart == 0:
                StatusStart = 1
        elif Interval > 21 and Ease == 1:
            # Card status change
            Day = int(Time / 86400.0)
            JxCardState.append(Day)
            if StatusStart == 0:
                StatusStart = -1
        index += 1
        if index == length:
            # We have finished parsing the entries: flush the last fact and break.
            JxCardStateArray.append((StatusStart, JxCardState[:]))
            flush_facts(JxCardStateArray, CardId)
            break
        elif CardId == rows[index][1]:
            # Same card: does nothing, but kept explicit for speed because it happens a lot.
            pass
        elif FactId != rows[index][0]:
            # Fact change: happens a bit less often than a card change, but it has to be
            # checked here or it won't be caught. Flush the status change.
            JxCardStateArray.append((StatusStart, JxCardState[:]))
            flush_facts(JxCardStateArray, CardId)
            JxCardState = []
            JxCardStateArray = []
            StatusStart = 0
        else:
            # Card change: happens just a little more often than a fact change (if the deck
            # has more than 3 card models). Store and init the card status change.
            JxCardStateArray.append((StatusStart, JxCardState[:]))
            JxCardState = []
            StatusStart = 0
    JxProfile("NewAlgorythm Ends")

    # let's partition the deck now
    # try:
    #     query = """select id, factId, interval, reps from cards where modified>%s order by factId""" % JxCache['TimeCached']
    # except:
    query = """select id, factId, interval, reps from cards order by factId"""
    rows = mw.deck.s.all(query)

    # Build a list of facts with all associated cards and their state (Known/Seen),
    # and produce the equivalent list for facts.
    TempFacts = {}

    def munge_row(x):
        if x[2] > 21:
            y = (x[0], 1)    # Known
        elif x[3] > 0:
            y = (x[0], -1)   # Seen
        else:
            y = (x[0], 0)    # In deck
        try:
            TempFacts[x[1]].append(y)
        except KeyError:
            TempFacts[x[1]] = [y]
    map(munge_row, rows)

    # Now update the fact list to include the fact state.
    def partition(x):
        L = zip(*x[1])[1]
        if not any(L):
            Facts[x[0]] = (2, x[1])  # In deck
        elif sum(L) >= 0:
            Facts[x[0]] = (0, x[1])  # Known
        else:
            Facts[x[0]] = (1, x[1])  # Seen
    map(partition, TempFacts.iteritems())
    JxProfile(str(len(filter(lambda x: x[0] == 0, Facts.values()))) + " " +
              str(len(filter(lambda x: x[0] == 1, Facts.values()))) + " " +
              str(len(filter(lambda x: x[0] == 2, Facts.values()))))

    # now cache the updated graphs
    JxCache['StateArray'] = JxStateArray
    JxCache['TimeCached'] = time.time()
    # among the few things that could corrupt the cache:
    # new entries in the database before the cache was saved... sigh...
    save_cache(JxCache)
    JxProfile("Saving Cache")
import numpy as np
from scipy.optimize import minimize
from sklearn.metrics import roc_auc_score

from cache import get_data, load_cache


def auc_func(weights):
    final_prediction = 0
    for weight, prediction in zip(weights, blend_train):
        final_prediction += weight * prediction
    return 1 - roc_auc_score(y_train, final_prediction)


base_train, base_test = get_data()
train, test = load_cache('lvl3_all_mix')

labels = ['toxic', 'severe_toxic', 'obscene', 'insult', 'threat', 'identity_hate']
for label in labels:
    y_train = base_train[label]
    print('\n Finding Blending Weights for ' + label + '...')
    blend_train = np.array([train['lvl2_all_lgb_' + label].rank().values,
                            train['lvl2_all_xgb_' + label].rank().values,
                            train['final-cnn_' + label].rank().values,
                            train['lvl2_all_rf_' + label].rank().values])
    blend_test = np.array([test['lvl2_all_lgb_' + label].rank().values,
                           test['lvl2_all_xgb_' + label].rank().values,
                           test['final-cnn_' + label].rank().values,
                           test['lvl2_all_rf_' + label].rank().values])
    res_list = []
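    # Added sketch (not part of the original snippet, which stops at `res_list = []`):
    # one typical way to search for blending weights with `auc_func` and scipy's
    # `minimize`. The SLSQP method, the random restarts, the bounds, and the
    # sum-to-one constraint are assumptions, not the original code.
    for _ in range(5):
        starting_values = np.random.uniform(size=blend_train.shape[0])
        bounds = [(0.0, 1.0)] * blend_train.shape[0]
        constraints = [{'type': 'eq', 'fun': lambda w: 1.0 - np.sum(w)}]
        res = minimize(auc_func, starting_values, method='SLSQP',
                       bounds=bounds, constraints=constraints)
        res_list.append((res.fun, res.x))
    best_score, best_weights = min(res_list, key=lambda r: r[0])
    print('Best AUC: %.5f  weights: %s' % (1 - best_score, best_weights))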
    print(i)
    print_step('Predict Val 1/2')
    pred_val_y = model.predict(val_X)
    print_step('Predict Test 2/2')
    pred_test_y = model.predict(test_X)
    return pred_val_y, pred_test_y


print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()
target = train['deal_probability']
test_id = test['item_id']

print_step('Importing Data 2/15')
train_ridge, test_ridge = load_cache('tfidf_ridges')
drops = [c for c in train_ridge.columns if 'svd' in c or 'tfidf' in c]
train_ridge.drop(drops, axis=1, inplace=True)
test_ridge.drop(drops, axis=1, inplace=True)
train_ = train_ridge
test_ = test_ridge

print_step('Importing Data 1/5')
train_['deal_probability'] = target
train_['item_id'] = train['item_id']
test_['item_id'] = test['item_id']

print_step('Importing Data 3/15 1/3')
train_base_lgb, test_base_lgb = load_cache('base_lgb')
print_step('Importing Data 3/15 2/3')
train_['base_lgb'] = train_base_lgb['base_lgb']