Example 1
def main(argv):
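    """Refresh the cached readings and regenerate the plots.

    Loads the cache, pulls the latest data from each source (ignoring
    per-source failures), saves the cache back to disk, and plots the
    listed sites into dest_dir.
    """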
    global CACHE
    dest_dir = '/home/killbots/killbots.net/random/'
    if len(argv) > 1:
        dest_dir = argv[1]
    CACHE = os.path.join(dest_dir, CACHE)

    data = cache.load_cache(CACHE)
    latest_update = get_latest_update()
    url_suffix = latest_update[0]
    time = latest_update[1:]
    places = ['Kita Ibaraki City', 'Takahagi City', 'Daigo Town', 'KEK',
            'AIST (3F)', 'AIST (Carpark)']
    try:
        data = get_levels(url_suffix, data)
    except Exception:
        traceback.print_exc()
    try:
        data = get_kek(data)
    except Exception:
        traceback.print_exc()
    try:
        data = get_aist(data)
    except Exception:
        traceback.print_exc()
    cache.save_cache(data, CACHE)
    plot_data(places, dest_dir)
Example 2
def main():
    logging.basicConfig(level=logging.DEBUG)
    # Parser and args
    parser = create_parser()
    args = parser.parse_args()
    # Setup resources and dirs
    dest = open(args.out, 'w')
    res_dir = os.path.split(os.path.abspath(__file__))[0]
    with open(os.path.join(res_dir, 'template.html'), 'r') as template_file:
        template = template_file.read()
    output = HTMLOutput(dest, template)
    cache_dir = os.path.split(args.out)[0]
    # Use cache
    dbs = [FilmwebDatabase()]
    if not args.force:
        cache = load_cache(cache_dir, args.out)
        if cache:
            logging.info("using cache file")
            dbs = cache

    # Get movies
    movies = find_movies_info(args.dirs, dbs, output, '-rating')

    # Histogram?
    if args.histogram:
        path = os.path.join(cache_dir, '.movierank-histogram.png')
        histogram(movies, path)
        output.add_extra('histogram', path)

    # Finish
    store_cache(cache_dir, dbs, suffix=args.out)
    output.flush()

    # Run browser?
    if args.run:
        subprocess.Popen(["xdg-open", args.out],
                         stderr=subprocess.STDOUT,
                         stdout=subprocess.PIPE)
Example 3
def compute_count(): 
    """Computes the stats"""
    
    global JxStatsArray,JxPartitionLists,NoType
    
    JxCache = load_cache()
    try:
        Query = """select cards.factId, cards.id, cards.reps, cards.interval from cards, 
	cards as mCards where mCards.modified>%s and cards.factId=mCards.factId 
	group by cards.id order by cards.factId""" % JxCache['TimeCached']
        JxStatsArray = JxCache['Stats']
        JxPartitionLists = JxCache['Partitions']
        NoType = JxCache['NoType']
    except Exception:
        # Cache miss or stale cache: rebuild the stats from scratch.
        Query = """select factId, id, reps, interval from cards order by factId"""
        NoType = 0  # known/seen/in deck
        for (Type, List) in JxStatsMap.iteritems():
            for (k, Map) in enumerate(List):
                for (Key, String) in Map.Order + [('Other', 'Other')]:
                    if k != 1:
                        JxStatsArray[(Type, k, Key)] = (0, 0, 0,
                            len([Item for (Item, Value) in Map.Dict.iteritems() if Value == Key]))
                    elif Type == 'Word':
                        JxStatsArray[(Type, k, Key)] = (0, 0, 0,
                            sum([Jx_Word_Occurences[Item]
                                 for (Item, Value) in Map.Dict.iteritems() if Value == Key]))
                    else:
                        JxStatsArray[(Type, k, Key)] = (0, 0, 0,
                            sum([Jx_Kanji_Occurences[Item]
                                 for (Item, Value) in Map.Dict.iteritems() if Value == Key]))
                    for Label in ['Known', 'Seen', 'InDeck']:
                        JxPartitionLists[(Type, k, Key, Label)] = []

    # we compute known/seen/in deck/total stats for each value of each map and each type
    Rows = mw.deck.s.all(Query)  
    CardState = []
    Length = len(Rows)
    Index = 0
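    # Walk the rows (ordered by factId): classify each card as mature, seen,
    # or unseen, and flush the accumulated stats whenever the fact changes.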
    while Index < Length:
        (FactId,CardId,CardRep,Interval) = Rows[Index]
        # set the card's status                       
        if Interval > 21 and CardRep:
            CardState.append(0)
        elif CardRep:
            CardState.append(1)
        else:
            CardState.append(2)
        Index += 1
        if Index == Length:
            # we have finished parsing the entries: flush the last fact and break
            JxFlushFactStats(CardState, CardId)
            break
        elif FactId == Rows[Index][0]:
            # Same Fact : Though it does nothing, we put this here for speed purposes because it happens a lot.
            pass
        else:                        
            # Fact change
            JxFlushFactStats(CardState,CardId)
            CardState = []
	    
    # now cache the updated stats  
    JxCache['Stats'] = JxStatsArray
    JxCache['Partitions'] = JxPartitionLists
    JxCache['NoType'] = NoType
    JxCache['TimeCached'] = time() # among the few things that could corrupt the cache:
    # new entries in the database before the cache was saved...sigh...
    save_cache(JxCache)
Example 4
#!/usr/bin/env python

import datetime

import cache


with open('previous.txt', 'r') as f:
    lines = f.readlines()[2:]

dest = cache.load_cache('fukushima.dat')
for l in lines:
    cells = l.rstrip().split('\t')
    print(cells, ' | ', end='')
    ts = datetime.datetime.strptime(cells[1], '%m/%d/%Y %H:%M:%S')
    cells = cells[3:]
    print(cells)
    for ii, c in enumerate(cells[:7]):
        try:
            float(c.strip())
        except ValueError:
            continue
        dest.set_value(ts, ii, c.strip())
    if len(cells) >= 8:
        dest.set_value(ts, 8, cells[7].strip())
cache.save_cache(dest, 'fukushima.dat')

image_chunks = list(chunks(image_files, 1000))
print_step('Chunked images into %d groups with %d images per group' %
           (len(image_chunks), len(image_chunks[0])))

pool = mp.ProcessingPool(n_nodes, maxtasksperchild=500)
print_step('Starting a jobs server with %d nodes' % n_nodes)
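# Run get_img_data on each image chunk in a separate worker; each chunk's
# output is read back from the cache ('img_data_<n>') and merged below.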
res = pool.map(lambda dat: get_img_data(dat[0], dat[1]),
               enumerate(image_chunks))
pool.close()
pool.join()
pool.terminate()
pool.restart()

print('~~~~~~~~~~~~~~~~')
print_step('Merging 1/9')
dfs = pool.map(lambda c: load_cache('img_data_' + str(c)),
               range(len(image_chunks)))
print_step('Merging 2/9')
dfs = map(lambda x: x[0], dfs)
print_step('Merging 3/9')
merge = pd.concat(dfs)
print_step('Merging 4/9')
train, test = get_data()
print_step('Merging 5/9')
merge['img_path'] = merge['img_path'].apply(
    lambda x: x.replace('test_jpg/', ''))
print_step('Merging 6/9')
merge['img_path'] = merge['img_path'].apply(
    lambda x: x.replace('train_jpg/', ''))
print_step('Merging 7/9')
merge['img_path'] = merge['img_path'].apply(lambda x: x.replace('.jpg', ''))
Example 6
        'random_state': 16,
        'verbose': 2
    }
    model = RandomForestClassifier(**params)
    model.fit(train_X, train_y)
    pred_test_y = model.predict_proba(test_X)[:, 1]
    pred_test_y2 = model.predict_proba(test_X2)[:, 1]
    pred_test_y = minmax_scale(
        pd.Series(pred_test_y).rank().values)  # Rank transform
    pred_test_y2 = minmax_scale(pd.Series(pred_test_y2).rank().values)
    return pred_test_y, pred_test_y2


if not is_in_cache('lvl2_all'):
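    # Level-2 stack: pull each level-1 model's train/test predictions from the cache.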
    print_step('Importing 1/21: LRs')
    lr_train, lr_test = load_cache('lvl1_lr')
    print_step('Importing 2/21: FE')
    train_fe, test_fe = load_cache('fe_lgb_data')
    print_step('Importing 3/21: Sparse LGBs')
    lgb_train, lgb_test = load_cache('lvl1_sparse_lgb')
    print_step('Importing 4/21: FE LGB')
    fe_lgb_train, fe_lgb_test = load_cache('lvl1_fe_lgb')
    print_step('Importing 5/21: Sparse FE LGB')
    sfe_lgb_train, sfe_lgb_test = load_cache('lvl1_sparse_fe_lgb')
    print_step('Importing 6/21: FM')
    fm_train, fm_test = load_cache('lvl1_fm')
    print_step('Importing 7/21: Ridge')
    ridge_train, ridge_test = load_cache('lvl1_ridge')
    print_step('Importing 8/21: GRU')
    gru_train, gru_test = load_cache('lvl1_gru')
    print_step('Importing 9/21: GRU2')
Example 7
def runChainedFM(train_X, train_y, test_X, test_y, test_X2, label, dev_index,
                 val_index):
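    # Binarize the level-1 FM predictions at 0.5, append them to the sparse
    # feature matrices, then fit a class-weighted FM_FTRL model.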
    print_step('Loading Lvl1')
    lvl1_train, lvl1_test = load_cache('lvl1_fm')
    lvl1_train = csr_matrix(
        pd.concat([
            lvl1_train[c].apply(lambda x: 0 if x < 0.5 else 1)
            for c in lvl1_train.columns if 'fm_' in c and c != label
        ],
                  axis=1).values)
    lvl1_test = csr_matrix(
        pd.concat([
            lvl1_test[c].apply(lambda x: 0 if x < 0.5 else 1)
            for c in lvl1_test.columns if 'fm_' in c and c != label
        ],
                  axis=1).values)
    print_step('Merging 1/3')
    lvl1_valid = lvl1_train[val_index]
    lvl1_train = lvl1_train[dev_index]
    train_X = csr_matrix(hstack([train_X, lvl1_train]))
    print_step('Merging 2/3')
    test_X = csr_matrix(hstack([test_X, lvl1_valid]))
    print_step('Merging 3/3')
    test_X2 = csr_matrix(hstack([test_X2, lvl1_test]))

    print_step('Modeling')
    class_weights = {
        'toxic': 1.0,
        'severe_toxic': 0.2,
        'obscene': 1.0,
        'threat': 0.1,
        'insult': 0.8,
        'identity_hate': 0.2
    }
    model = FM_FTRL(alpha=0.02,
                    beta=0.01,
                    L1=0.00001,
                    L2=30.0,
                    D=train_X.shape[1],
                    alpha_fm=0.1,
                    L2_fm=0.5,
                    init_fm=0.01,
                    weight_fm=50.0,
                    D_fm=200,
                    e_noise=0.0,
                    iters=3,
                    inv_link="identity",
                    e_clip=1.0,
                    threads=4,
                    use_avx=1,
                    verbose=1)
    train_weight = np.array(
        [1.0 if x == 1 else class_weights[label] for x in train_y])
    model.fit(train_X, train_y, train_weight, reset=False)
    pred_test_y = sigmoid(model.predict(test_X))
    pred_test_y2 = sigmoid(model.predict(test_X2))
    return pred_test_y, pred_test_y2
print('~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 1/11')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

if not is_in_cache('deep_text_feats4'):
    print('~~~~~~~~~~~~~~~~~~~~~~~')
    print_step('Importing Data 2/11')
    tfidf_train, tfidf_test = load_cache('titlecat_tfidf')

    print_step('Importing Data 3/11')
    tfidf_train2, tfidf_test2 = load_cache('text_tfidf')

    print_step('Importing Data 4/11')
    tfidf_train3, tfidf_test3 = load_cache('text_char_tfidf')


    print_step('Importing Data 5/11')
    train = hstack((tfidf_train, tfidf_train2, tfidf_train3)).tocsr()
    print_step('Importing Data 6/11')
    test = hstack((tfidf_test, tfidf_test2, tfidf_test3)).tocsr()
    print(train.shape)
    print(test.shape)
Example 9
print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 1/15')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 2/15')
train_fe, test_fe = load_cache('data_with_fe')

print_step('Importing Data 3/15')
train_ridge, test_ridge = load_cache('tfidf_ridges')
drops = [c for c in train_ridge.columns if 'svd' in c or 'tfidf' in c]
train_ridge.drop(drops, axis=1, inplace=True)
test_ridge.drop(drops, axis=1, inplace=True)
train_ = train_ridge
test_ = test_ridge

print_step('Importing Data 4/15')
train_['parent_category_name'] = train_fe['parent_category_name']
test_['parent_category_name'] = test_fe['parent_category_name']
train_['price'] = train_fe['price']
test_['price'] = test_fe['price']
Example 10
#!/usr/bin/env python

import datetime

import cache

with open('previous.txt', 'r') as f:
    lines = f.readlines()[2:]

dest = cache.load_cache('fukushima.dat')
for l in lines:
    cells = l.rstrip().split('\t')
    print(cells, ' | ', end='')
    ts = datetime.datetime.strptime(cells[1], '%m/%d/%Y %H:%M:%S')
    cells = cells[3:]
    print(cells)
    for ii, c in enumerate(cells[:7]):
        try:
            float(c.strip())
        except ValueError:
            continue
        dest.set_value(ts, ii, c.strip())
    if len(cells) >= 8:
        dest.set_value(ts, 8, cells[7].strip())
cache.save_cache(dest, 'fukushima.dat')
Example 11
        tfidf = TfidfVectorizer(ngram_range=(1, 2),
                                max_features=300000,
                                min_df=2,
                                max_df=0.8,
                                binary=True,
                                encoding='KOI8-R')
        tfidf_train = tfidf.fit_transform(train['titlecat'])
        print(tfidf_train.shape)
        print_step('Titlecat TFIDF 3/3')
        tfidf_test = tfidf.transform(test['titlecat'])
        print(tfidf_test.shape)
        print_step('Saving to cache...')
        save_in_cache('titlecat_tfidf', tfidf_train, tfidf_test)
    else:
        print_step('Loading from cache...')
        tfidf_train, tfidf_test = load_cache('titlecat_tfidf')

    print_step('Titlecat Stats 1/6')
    train['titlecat_tfidf_sum'] = tfidf_train.sum(axis=1)
    print_step('Titlecat Stats 2/6')
    train['titlecat_tfidf_mean'] = tfidf_train.mean(axis=1)
    print_step('Titlecat Stats 3/6')
    train['titlecat_tfidf_nnz'] = tfidf_train.getnnz(axis=1)
    print_step('Titlecat Stats 4/6')
    test['titlecat_tfidf_sum'] = tfidf_test.sum(axis=1)
    print_step('Titlecat Stats 5/6')
    test['titlecat_tfidf_mean'] = tfidf_test.mean(axis=1)
    print_step('Titlecat Stats 6/6')
    test['titlecat_tfidf_nnz'] = tfidf_test.getnnz(axis=1)

    print_step('Titlecat SVD 1/4')
    toxic = toxic.drop('worker_id',
                       axis=1).groupby('rev_id').mean().reset_index()
    print_step('Processing 7/9')
    toxic = toxic_comments[['rev_id',
                            'comment']].merge(toxic,
                                              on='rev_id').drop('rev_id',
                                                                axis=1)
    print_step('Processing 8/9')
    toxic['toxicity_label'] = toxic['toxicity'].apply(lambda x: 1
                                                      if x > 0.1 else 0)
    toxic['comment_text'] = toxic['comment']
    toxic.drop('comment', axis=1, inplace=True)
    print_step('Processing 9/9')
    save_in_cache('extra_data_toxic', toxic, test)
else:
    attack, test = load_cache('extra_data_attack')
    toxic, test = load_cache('extra_data_toxic')

print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Making KFold for CV')
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)
kf_for_regression = KFold(n_splits=5, shuffle=True, random_state=2017)

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Run TFIDF WORD-CHAR UNION 1/2')
if is_in_cache('tfidf_char_union_extra_data_attack'):
    post_train, post_test = load_cache('tfidf_char_union_extra_data_attack')
else:
    TFIDF_UNION1.update({'train': attack, 'test': test})
    post_trainw, post_testw = run_tfidf(**TFIDF_UNION1)
    TFIDF_UNION2.update({'train': attack, 'test': test})
Example 13
        wordbatch_test = wb.transform(test['titlecat'])
        print(wordbatch_test.shape)
        del wb
        gc.collect()
        print_step('Titlecat Wordbatch 4/5')
        mask = np.where(wordbatch_train.getnnz(axis=0) > 3)[0]
        wordbatch_train = wordbatch_train[:, mask]
        print(wordbatch_train.shape)
        print_step('Titlecat Wordbatch 5/5')
        wordbatch_test = wordbatch_test[:, mask]
        print(wordbatch_test.shape)
        print_step('Saving to cache...')
        save_in_cache('titlecat_wordbatch', wordbatch_train, wordbatch_test)
    else:
        print_step('Loading from cache...')
        wordbatch_train, wordbatch_test = load_cache('titlecat_wordbatch')

    print('~~~~~~~~~~~~~~~~~~~~~~~')
    print_step('Text Wordbatch 1/5')
    train['desc'] = train['title'].fillna('') + ' ' + train['description'].fillna('')
    test['desc'] = test['title'].fillna('') + ' ' + test['description'].fillna('')
    if not is_in_cache('text_wordbatch'):
        print_step('Text Wordbatch 2/5')
        wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2,
                                                                      "hash_ngrams_weights": [1.0, 1.0],
                                                                      "hash_size": 2 ** 28,
                                                                      "norm": "l2",
                                                                      "tf": 1.0,
                                                                      "idf": None}), procs=8)
        wb.dictionary_freeze = True
        wordbatch_train = wb.fit_transform(train['desc'].fillna(''))
Example 14
print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 1/19')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data 2/19')
train_fe, test_fe = load_cache('data_with_fe')

print_step('Importing Data 2/15')
train_ridge, test_ridge = load_cache('tfidf_ridges')
print_step('Importing Data 4/15 3/4')
train_fe = pd.concat([train_fe, train_ridge], axis=1)
print_step('Importing Data 4/15 4/4')
test_fe = pd.concat([test_fe, test_ridge], axis=1)

print_step('Importing Data 3/15 1/3')
train_base_lgb, test_base_lgb = load_cache('base_lgb')
print_step('Importing Data 3/15 2/3')
train_fe['base_lgb'] = train_base_lgb['base_lgb']
print_step('Importing Data 3/15 3/3')
test_fe['base_lgb'] = test_base_lgb['base_lgb']
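# Run run_ridge_on_cat_bin for each category bin in parallel; the per-bin
# results are cached ('cat_bin_ridges_<bin>') and merged back in below.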
cat_bins = list(
    map(lambda s: s.replace('/', '-'), list(set(train['cat_bin'].values))))
n_cpu = mp.cpu_count()
n_nodes = max(n_cpu - 3, 2)
print('Starting a jobs server with %d nodes' % n_nodes)
pool = mp.ProcessingPool(n_nodes, maxtasksperchild=500)
res = pool.map(run_ridge_on_cat_bin, cat_bins)
pool.close()
pool.join()
pool.terminate()
pool.restart()

print('~~~~~~~~~~~~~~~~')
print_step('Merging 1/5')
pool = mp.ProcessingPool(n_nodes, maxtasksperchild=500)
dfs = pool.map(lambda c: load_cache('cat_bin_ridges_' + c), cat_bins)
pool.close()
pool.join()
pool.terminate()
pool.restart()
print_step('Merging 2/5')
train_dfs = map(lambda x: x[0], dfs)
test_dfs = map(lambda x: x[1], dfs)
print_step('Merging 3/5')
train_df = pd.concat(train_dfs)
test_df = pd.concat(test_dfs)
print_step('Merging 4/5')
train_ridge = train.merge(train_df, on='item_id')
print_step('Merging 5/5')
test_ridge = test.merge(test_df, on='item_id')
Example 16
def update_stats_cache():
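    """Rebuild the per-day card status history from the review log,
    partition facts into Known/Seen/InDeck, and refresh the cache."""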
    global JxStateArray,JxStatsMap
    JxProfile('Load Cache')
    JxCache = load_cache()
    JxProfile('Load Cache ended')
    try:
        query = """
	select cards.factId,reviewHistory.cardId, reviewHistory.time, reviewHistory.lastInterval, reviewHistory.nextInterval, reviewHistory.ease 
	from reviewHistory,cards where cards.id = reviewHistory.cardId and cards.modified>%s 
	order by cards.factId,reviewHistory.cardId,reviewHistory.time""" % JxCache['TimeCached']
        JxStateArray = JxCache['StateArray']
    except Exception:
        # Cache miss or stale cache: rebuild from the full review history.
        query = """
	select cards.factId ,reviewHistory.cardId, reviewHistory.time, reviewHistory.lastInterval, reviewHistory.nextInterval, reviewHistory.ease 
	from reviewHistory,cards where cards.id = reviewHistory.cardId order by cards.factId ,reviewHistory.cardId,reviewHistory.time"""
        JxStateArray = {}
    rows = mw.deck.s.all(query)	
    JxProfile("Query ended")

    length = len(rows)
    index = 0
    JxCardState = []
    JxCardStateArray = []
    StatusStart = 0
    # We will initialize other stuff on the fly !
    while index < length:
        # 0:FactId 1:CardId, 2:Time, 3: lastInterval, 4: next interval, 5:ease
        (FactId,CardId,Time,Interval,NextInterval,Ease) = rows[index]                  
        # first, we have to build a list of the days where status changes happened for this card (+ - + - + - ...)
        if Interval <= 21 and NextInterval > 21:
            #Card Status Change
            Day = int(Time / 86400.0)
            JxCardState.append(Day)
            if StatusStart == 0:
                StatusStart = 1
        elif Interval > 21 and Ease == 1:
            #Card Status Change
            Day = int(Time / 86400.0)
            JxCardState.append(Day)
            if StatusStart == 0:
                StatusStart = -1		
        index += 1
        if index == length:
            # we have finished parsing the entries: flush the last fact and break
            JxCardStateArray.append((StatusStart, JxCardState[:]))
            flush_facts(JxCardStateArray, CardId)
            break
        elif CardId == rows[index][1]:
            # Same Card : Though it does nothing, we put this here for speed purposes because it happens a lot.
            pass
        elif FactId != rows[index][0]:                        
            # Fact change : Though it happens a bit less often than cardId change, we have to put it there or it won't be caught, flush the status change.
            JxCardStateArray.append((StatusStart,JxCardState[:]))
            flush_facts(JxCardStateArray,CardId)
            JxCardState = []
            JxCardStateArray = []
            StatusStart = 0
        else:
            # A cardId change happens slightly more often than a fact change
            # (if the deck has more than 3 card models). Store and reset the
            # card status change.
            JxCardStateArray.append((StatusStart, JxCardState[:]))
            JxCardState = []
            StatusStart = 0

    JxProfile("NewAlgorithm Ends")
    
    
    # let's partition the deck now
    #try:
        #query = """select id, factId, interval, reps from cards where modified>%s order by factId""" % dJxCache['TimeCached']
    #except:
    query = """select id, factId, interval, reps from cards order by factId"""

    rows = mw.deck.s.all(query)
    # let's create a list of Facts with all associated cards and their state : Known/Seen and produce the equivalent list for facts
    
    TempFacts = {}
    def munge_row(x):
        if x[2] > 21:
            y = (x[0], 1)   # Known
        elif x[3] > 0:
            y = (x[0], -1)  # Seen
        else:
            y = (x[0], 0)   # In Deck
        try:
            TempFacts[x[1]].append(y)
        except KeyError:
            TempFacts[x[1]] = [y]
    map(munge_row, rows)
    
    # now update the fact list to include the fact state 
    def partition(x):
        L = zip(*x[1])[1]
        if not any(L):
            Facts[x[0]] = (2, x[1])  # InDeck
        elif sum(L) >= 0:
            Facts[x[0]] = (0, x[1])  # Known
        else:
            Facts[x[0]] = (1, x[1])  # Seen
    map(partition, TempFacts.iteritems())
    JxProfile(str(len(filter(lambda x: x[0] == 0, Facts.values()))) + " " +
              str(len(filter(lambda x: x[0] == 1, Facts.values()))) + " " +
              str(len(filter(lambda x: x[0] == 2, Facts.values()))))


    
    
    # now cache the updated graphs
    JxCache['StateArray'] = JxStateArray
    JxCache['TimeCached'] = time.time() # among the few things that could corrupt the cache : 
    # new entries in the database before the cache was saved...sigh...
    save_cache(JxCache)
    JxProfile("Saving Cache")
Example 17
import numpy as np
from scipy.optimize import minimize
from sklearn.metrics import roc_auc_score

from cache import get_data, load_cache


def auc_func(weights):
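    # Objective for scipy.optimize.minimize: weight-blend the rank-transformed
    # predictions and return 1 - AUC, so minimizing it maximizes AUC on y_train.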
    final_prediction = 0
    for weight, prediction in zip(weights, blend_train):
        final_prediction += weight * prediction
    return 1 - roc_auc_score(y_train, final_prediction)


base_train, base_test = get_data()
train, test = load_cache('lvl3_all_mix')
labels = ['toxic', 'severe_toxic', 'obscene', 'insult', 'threat', 'identity_hate']

for label in labels:
    y_train = base_train[label]
    print('\n Finding Blending Weights for ' + label + '...')
    blend_train = np.array([train['lvl2_all_lgb_' + label].rank().values,
                            train['lvl2_all_xgb_' + label].rank().values,
                            train['final-cnn_' + label].rank().values,
                            train['lvl2_all_rf_' + label].rank().values])
    blend_test = np.array([test['lvl2_all_lgb_' + label].rank().values,
                           test['lvl2_all_xgb_' + label].rank().values,
                           test['final-cnn_' + label].rank().values,
                           test['lvl2_all_rf_' + label].rank().values])

    res_list = []
Example 18
        print(i)
    print_step('Predict Val 1/2')
    pred_val_y = model.predict(val_X)
    print_step('Predict Test 2/2')
    pred_test_y = model.predict(test_X)
    return pred_val_y, pred_test_y


print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()
target = train['deal_probability']
test_id = test['item_id']

print_step('Importing Data 2/15')
train_ridge, test_ridge = load_cache('tfidf_ridges')
drops = [c for c in train_ridge.columns if 'svd' in c or 'tfidf' in c]
train_ridge.drop(drops, axis=1, inplace=True)
test_ridge.drop(drops, axis=1, inplace=True)
train_ = train_ridge
test_ = test_ridge

print_step('Importing Data 1/5')
train_['deal_probability'] = target
train_['item_id'] = train['item_id']
test_['item_id'] = test['item_id']

print_step('Importing Data 3/15 1/3')
train_base_lgb, test_base_lgb = load_cache('base_lgb')
print_step('Importing Data 3/15 2/3')
train_['base_lgb'] = train_base_lgb['base_lgb']