Example no. 1
def run_with_target(label,
                    target,
                    data_key,
                    model_fn,
                    kf,
                    train_key=None,
                    eval_fn=None):
    if is_in_cache(label + '_' + target):
        return load_cache(label + '_' + target)[0]
    else:
        print('-')
        print_step('Training ' + target)
        if train_key is None:
            train, test = get_data()
        else:
            train, test = load_cache(train_key)
        post_train, post_test = load_cache(data_key)
        if isinstance(post_train, pd.DataFrame):
            post_train = post_train.values
            post_test = post_test.values

        train_y = train[target]
        cv_scores = []
        pred_full_test = 0
        pred_train = np.zeros(train.shape[0])
        i = 1

        if isinstance(kf, StratifiedKFold):
            fold_splits = kf.split(post_train, train_y)
        else:
            fold_splits = kf.split(post_train)

        for dev_index, val_index in fold_splits:
            print_step('Started ' + label + ' ' + target + ' fold ' + str(i))
            dev_X, val_X = post_train[dev_index], post_train[val_index]
            dev_y, val_y = train_y[dev_index], train_y[val_index]
            pred_val_y, pred_test_y = model_fn(dev_X, dev_y, val_X, val_y,
                                               post_test, target, dev_index,
                                               val_index)
            pred_full_test = pred_full_test + pred_test_y
            pred_train[val_index] = pred_val_y
            cv_score = eval_fn(val_y, pred_val_y)
            cv_scores.append(cv_score)
            print_step(label + ' ' + target + ' cv score ' + str(i) + ' : ' +
                       str(cv_score))
            i += 1
        print_step(label + ' ' + target + ' cv scores : ' + str(cv_scores))
        mean_cv_score = np.mean(cv_scores)
        print_step(label + ' ' + target + ' mean cv score : ' +
                   str(mean_cv_score))
        pred_full_test = pred_full_test / kf.get_n_splits()  # average over all folds
        results = {
            'label': label,
            'target': target,
            'train': pred_train,
            'test': pred_full_test,
            'cv': cv_scores
        }
        save_in_cache(label + '_' + target, results, None)
        return results
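For reference, a minimal usage sketch for run_with_target (an illustration, not from the source: the label, the Ridge-based model function, and the RMSE eval function are assumptions; 'deal_probability' and 'fe_lgb_data' echo keys that appear in later examples):

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

def run_ridge_sketch(dev_X, dev_y, val_X, val_y, test_X, target,
                     dev_index, val_index):
    # Hypothetical model_fn: fit on the dev fold, return predictions for
    # the validation fold and for the full test set.
    model = Ridge()
    model.fit(dev_X, dev_y)
    return model.predict(val_X), model.predict(test_X)

kf = KFold(n_splits=5, shuffle=True, random_state=2017)
results = run_with_target('ridge_sketch', 'deal_probability', 'fe_lgb_data',
                          run_ridge_sketch, kf,
                          eval_fn=lambda y, p: mean_squared_error(y, p) ** 0.5)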
Example no. 2
def index():
    if cache.exists():
        worklists = cache.get_data()
    else:
        worklists = generate_worklist(50)
        cache.write_data(worklists)
        write_file_worklist(worklists)

    context = {
        "worklist_address": config("WORKLIST_ADDRESS"),
        "worklist_port": config("WORKLIST_PORT"),
        "calling_ae_title": config("CALLING_AE_TITLE"),
        "called_ae_title": config("CALLED_AE_TITLE"),
        "worklists": worklists,
    }
    return render_template("index.html", **context)
Example no. 3
def urls():

    col1 = session['col1']
    col2 = session['col2']
    key = request.args.get('f')
    url_type = request.args.get('type')

    # session['url_types'][0] value must always be the default one, meaning no filters enabled
    session['url_types'] = ['All pages', 'bgg', 'bgc', 'g', 's', 'bgr']

    if not url_type:
        url_type = session['url_type'] = session['url_types'][0]

    urls_data = get_data(col1, col2, key=key, urls=url_type)
    pages = urls_data.pages

    return render_template('urls.html',
                           pages=pages,
                           url_types=session['url_types'],
                           url_type=url_type,
                           key=key)
Example no. 4
def index():

    session['cols'] = sorted((x.date for x in db.query(Collection).all()),
                             reverse=True)

    if not session.get('col1'):
        col1 = session['col1'] = session['cols'][0]
        col2 = session['col2'] = session['cols'][1]
    else:
        col1 = session['col1']
        col2 = session['col2']

    if request.method == 'POST':
        col1 = session['col1'] = request.form.get('col1')
        col2 = session['col2'] = request.form.get('col2')

    main_data = get_data(col1, col2)
    keys = main_data.keys()

    return render_template('index.html', data=main_data, keys=keys)
Example no. 5
def _ondemand(syms, t0, t1, col='a'):
    """\
    Get price data with the following policy:
        - use offline data if db/cache has it
        - on demand download data from online sources if db/cache does not have the data required
        - save downloaded data to db/cache
        - return price data

    Args:
        syms (list of str) : a list of stock symbols, e.g. ['SPY', 'XLK', 'XLF']
        t0 (str)           : start date datestr, e.g. '2016-01-01'
        t1 (str)           : last date datestr,  e.g. '2016-06-06'
        col (str)          : a character in colnames_yf.keys()

    Returns:
        A dataframe with price data
    """

    from cache import get_data
    colname = colnames_db[col]
    data = get_data(syms, t0, t1, colname)
    df = pd.DataFrame(data)
    df = df[list(syms)]  # keep syms in original order
    return df
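The docstring above describes a cache-aside policy: serve offline data when available, download and persist only what is missing. A minimal self-contained sketch of that policy, using an in-memory dict as the store and a caller-supplied download function (every name here is hypothetical; the project's actual cache module is not shown in the source):

import pandas as pd

_price_store = {}  # hypothetical stand-in for the real db/cache layer

def get_data_sketch(syms, t0, t1, colname, download_fn):
    # Serve offline data when present; download only the missing symbols,
    # then persist them so the next call is fully offline.
    missing = [s for s in syms if (s, t0, t1, colname) not in _price_store]
    if missing:
        fresh = download_fn(missing, t0, t1, colname)  # dict: sym -> series
        for s in missing:
            _price_store[(s, t0, t1, colname)] = fresh[s]
    return pd.DataFrame({s: _price_store[(s, t0, t1, colname)] for s in syms})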
Example no. 6
def data(self):
    value = cache.get_data(self.title)
    if value is None:
        value = super(WikiPage, self).data
        cache.set_data(self.title, value)
    return value
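This is the same read-through caching as the is_in_cache/load_cache guard in Example no. 1, written as a memoizing property backed by an external cache. For comparison, a hedged sketch of the per-instance variant using functools.cached_property (Python 3.8+; the placeholder lookup is an assumption, not the WikiPage implementation):

import functools

class Page:
    def __init__(self, title):
        self.title = title

    @functools.cached_property
    def data(self):
        # Computed once per instance, then stored on the instance; unlike
        # the external cache above, this memo does not outlive the object.
        return 'rendered:' + self.title  # placeholder for the real lookup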
Example no. 7
import re
import string

import pandas as pd
import numpy as np

from nltk.corpus import stopwords

from utils import print_step, bin_and_ohe_data
from cache import get_data, is_in_cache, load_cache, save_in_cache


print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

if not is_in_cache('data_with_fe'):
    print('~~~~~~~~~~~~')
    print_step('Merging')
    merge = pd.concat([train, test])

    print('~~~~~~~~~~~~~~~~~~~')
    print_step('Imputation 1/7')
Example no. 8
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


EMBEDDING_FILE = 'cache/crawl/crawl-300d-2M.vec'


max_features = 30000
maxlen = 100
embed_size = 300
epochs = 4
batch_size = 32
predict_batch_size = 1024


if not is_in_cache('lvl1_double-gru'):
    train_df, test_df = get_data()

    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    X_train = train_df['comment_text'].fillna('peterhurford').values
    y_train = train_df[classes].values
    X_test = test_df['comment_text'].fillna('peterhurford').values

    print_step('Tokenizing data...')
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    x_train = tokenizer.texts_to_sequences(X_train)
    x_test = tokenizer.texts_to_sequences(X_test)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')
    print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))
Example no. 9
def run_nn_model(label, model, max_features, maxlen, epochs, batch_size,
                 predict_batch_size):
    classes = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    for embedding_name, embedding_file in EMBEDDING_FILES.items():
        if is_in_cache(label + '_' + embedding_name):
            print_step('Already trained ' + label + '_' + embedding_name +
                       '! Skipping...')
        else:
            train_df, test_df = get_data()

            print_step('Loading embed ' + embedding_name + '...')
            embed_size = EMBED_SIZE_LOOKUP[embedding_name]
            x_train, x_test, embedding_matrix = tokenize_and_embed(
                train_df, test_df, embedding_file, max_features, maxlen,
                embed_size, embedding_name)
            y_train = train_df[classes].values

            print_step('Build model...')
            # Build under a new name so the `model` factory argument is not
            # shadowed; save the fresh weights to the same path reloaded below.
            nn_model = model(max_features, maxlen, embed_size, embedding_matrix)
            nn_model.save_weights('cache/' + label + '_' + embedding_name +
                                  '-model-weights.h5')

            print_step('Making KFold for CV')
            kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)

            i = 1
            cv_scores = []
            pred_train = np.zeros((train_df.shape[0], 6))
            pred_full_test = np.zeros((test_df.shape[0], 6))
            for dev_index, val_index in kf.split(x_train, y_train[:, 0]):
                print_step('Started fold ' + str(i))
                nn_model.load_weights('cache/' + label + '_' + embedding_name +
                                      '-model-weights.h5')
                dev_X, val_X = x_train[dev_index], x_train[val_index]
                dev_y, val_y = y_train[dev_index, :], y_train[val_index, :]
                RocAuc = RocAucEvaluation(validation_data=(val_X, val_y),
                                          interval=1)
                nn_model.fit(dev_X,
                             dev_y,
                             batch_size=batch_size,
                             epochs=epochs,
                             validation_data=(val_X, val_y),
                             callbacks=[RocAuc])
                val_pred = nn_model.predict(val_X,
                                            batch_size=predict_batch_size,
                                            verbose=1)
                pred_train[val_index, :] = val_pred
                test_pred = nn_model.predict(x_test,
                                             batch_size=predict_batch_size,
                                             verbose=1)
                pred_full_test = pred_full_test + test_pred
                cv_score = [
                    roc_auc_score(val_y[:, j], val_pred[:, j])
                    for j in range(6)
                ]
                print_step('Fold ' + str(i) + ' done')
                pprint(list(zip(classes, cv_score)))
                cv_scores.append(cv_score)
                i += 1
            print_step('All folds done!')
            print('CV scores')
            pprint(list(zip(classes, np.mean(cv_scores, axis=0))))
            mean_cv_score = np.mean(np.mean(cv_scores, axis=0))
            print('mean cv score : ' + str(mean_cv_score))
            pred_full_test = pred_full_test / 5.
            for k, classx in enumerate(classes):
                # Column names must match the submission lookups below
                train_df[label + '_' + embedding_name + '_' + classx] = pred_train[:, k]
                test_df[label + '_' + embedding_name + '_' + classx] = pred_full_test[:, k]

            print('~~~~~~~~~~~~~~~~~~')
            print_step('Cache Level 1')
            save_in_cache('lvl1_' + label + '_' + embedding_name, train_df,
                          test_df)
            print_step('Done!')

            print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
            print_step('Prepping submission file')
            submission = pd.DataFrame()
            submission['id'] = test_df['id']
            for classx in classes:
                submission[classx] = test_df[label + '_' + embedding_name +
                                             '_' + classx]
            submission.to_csv('submit/submit_lvl1_' + label + '_' +
                              embedding_name + '.csv',
                              index=False)
            print_step('Done')
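A hedged sketch of how run_nn_model is presumably driven; build_double_gru is a hypothetical builder returning a compiled Keras model, and the hyperparameters simply echo the constants from Example no. 8:

# Assumed driver call; build_double_gru(max_features, maxlen, embed_size,
# embedding_matrix) would return a compiled Keras model.
run_nn_model('double-gru', build_double_gru,
             max_features=30000, maxlen=100,
             epochs=4, batch_size=32, predict_batch_size=1024)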
Example no. 10
def create_course_objects(
        tables: List[BeautifulSoup]) -> Tuple[Course, Instructor]:
    course = Course()
    instructor = Instructor()
    for i, table in enumerate(tables[:max_tables]):
        for row in table.findAll('tr'):
            if row.has_attr('bgcolor'):
                continue
            cells = [list(cell.stripped_strings) for cell in row.findAll('td')]

            if i == table_type['GENERAL_INFO']:
                log.info("parsing through course info")
                course.sln = cells[0][0] if cells[0] else None
                if cells[1]:
                    course_tokens = cells[1][0].split()
                    course.department = course_tokens[0]
                    course.number = course_tokens[1]
                course.section = cells[2][0] if cells[2] else None
                course.type = cells[3][0] if cells[3] else None

                # TODO: Add a way to handle fractional credits (e.g., 2.5)
                if len(cells) > 7:
                    credit_tokens = cells[5][0].strip().split(
                        '-') if cells[5] else []
                    if len(credit_tokens) > 1:
                        course.lower_credits = credit_tokens[0]
                        course.upper_credits = credit_tokens[1]
                    else:
                        course.lower_credits = credit_tokens[0]
                        course.upper_credits = credit_tokens[0]

                    course.name = cells[6][0]
                    gen_ed_marker = cells[7][0] if cells[7] else None

                else:
                    credit_tokens = cells[4][0].strip().split(
                        '-') if cells[4] else []
                    if len(credit_tokens) > 1:
                        course.lower_credits = credit_tokens[0]
                        course.upper_credits = credit_tokens[1]
                    else:
                        course.lower_credits = credit_tokens[0]
                        course.upper_credits = credit_tokens[0]

                    course.name = cells[5][0] if cells[5] else None
                    gen_ed_marker = cells[6][0] if cells[6] else None

                log.info(gen_ed_marker)
                if gen_ed_marker:
                    gen_eds = gen_ed_marker.split(",")  # QSR,NW --> [QSR, NW]
                    for gen_ed in gen_eds:
                        if gen_ed in course.general_education:
                            course.general_education[gen_ed] = True
            elif i == table_type['ENROLLMENT']:
                log.info("parsing through course info (enrollment)")

                course.current_size = cells[0][0] if cells[0] else None
                course.max_size = cells[1][0] if cells[1] else None
                if len(cells) > 4 and cells[4][0] == 'Entry Code required':
                    course.add_code_required = True

            elif i == table_type['MEETINGS']:
                log.info("parsing through meeting times")
                log.info(cells)
                # If there is more than one meeting location:
                # Ex: TTh   08:45-09:45     UW1 121	GUNNERSON,KIM N.
                #     TTh   09:45-10:50	    UW2 131 GUNNERSON,KIM N.
                # meeting_days: [TTh, TTh]
                # start_times: [08:45, 09:45]
                # end_times: [09:45, 10:50]
                # rooms: [UW1 121, UW2 131]
                if cells[0] and cells[0][0] != 'To be arranged':
                    meeting_days = cells[0]

                    start_times = [
                        time_range.split('-')[0].replace('\u00a0', ' ')
                        for time_range in cells[1]
                    ]
                    end_times = [
                        time_range.split('-')[1].replace('\u00a0', ' ')
                        for time_range in cells[1]
                    ]
                    rooms = [room.replace('\u00a0', ' ') for room in cells[2]]

                    for days, start_time, end_time, room in zip(
                            meeting_days, start_times, end_times, rooms):
                        room_tokens = room.split()
                        if len(room_tokens) == 1:
                            room_building = room_tokens[0]
                            room_number = None
                        else:
                            room_building, room_number = room_tokens

                        new_meeting = {
                            "room_building": room_building,
                            "room_number": room_number,
                            "meeting_days": days,
                            "start_time": start_time,
                            "end_time": end_time
                        }
                        course.meetings.append(new_meeting)

                    instructor_name = cells[3][0] if cells[3] else None
                    log.info(f"instructor name: {instructor_name}")
                    instructor_tokens = (instructor_name.split(',')
                                         if instructor_name else [])
                    if len(instructor_tokens) > 1:
                        instructor.first_name = instructor_tokens[1]
                        instructor.last_name = instructor_tokens[0]
                    log.info(f"split instructor name: {instructor_tokens}")
                    first_name_tokens = instructor.first_name.split(' ')
                    log.info(f"first name: {first_name_tokens}")

                    if len(first_name_tokens) > 1:
                        instructor.first_name = first_name_tokens[0]
                        instructor.middle_name = first_name_tokens[1]
                    else:
                        instructor.middle_name = ""
                    log.info(
                        f"{instructor.first_name}, {instructor.middle_name}, {instructor.last_name}"
                    )
                    log.info(
                        "retrieving data for instructor email and phone number"
                    )

                    data = get_data(instructor.first_name,
                                    instructor.last_name)

                    if data and not data.get('error'):
                        instructor.email = data['teacher'][0]['email']
                        instructor.phone_number = data['teacher'][0]['phone']

            elif i == table_type['NOTES']:
                log.info("Retrieving course description...")
                log.info(cells)
                lines = cells[0]
                course.description = "\n".join(
                    [line if line else "" for line in lines])
            break
    log.info("Done collecting course information and instructor information.")
    return course, instructor
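A hedged usage sketch for create_course_objects (the input file name and the table extraction are assumptions; the real caller is not shown in the source):

from bs4 import BeautifulSoup

# Hypothetical driver: pull the per-course tables out of a saved
# time-schedule page and hand them to the parser above.
with open('course_page.html') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')
tables = soup.findAll('table')
course, instructor = create_course_objects(tables)
print(course.sln, course.name, instructor.last_name)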
Example no. 11
        'lambda_l2': 1
    }
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=rounds_lookup[label],
                      valid_sets=watchlist,
                      verbose_eval=100)
    print(model.feature_importance())
    pred_test_y = model.predict(test_X)
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2


if not is_in_cache('convai_with_fe'):
    print_step('Importing base data')
    train_base, test_base = get_data()

    print_step('Importing ConvAI data')
    train, test = load_cache('convai_data')

    print_step('Importing FE')
    train_fe, test_fe = load_cache('fe_lgb_data')

    print_step('Merging')
    train_fe['id'] = train_base['id']
    test_fe['id'] = test_base['id']
    train_ = pd.merge(train_fe, train, on='id')
    test_ = pd.merge(test_fe, test, on='id')
    del train_base
    del test_base
    del train_fe
Example no. 12
import numpy as np

from scipy.optimize import minimize
from sklearn.metrics import roc_auc_score

from cache import get_data, load_cache


def auc_func(weights):
    final_prediction = 0
    for weight, prediction in zip(weights, blend_train):
        final_prediction += weight * prediction
    return 1 - roc_auc_score(y_train, final_prediction)


base_train, base_test = get_data()
train, test = load_cache('lvl3_all_mix')
labels = ['toxic', 'severe_toxic', 'obscene', 'insult', 'threat', 'identity_hate']

for label in labels:
    y_train = base_train[label]
    print('\n Finding Blending Weights for ' + label + '...')
    blend_train = np.array([train['lvl2_all_lgb_' + label].rank().values,
                            train['lvl2_all_xgb_' + label].rank().values,
                            train['final-cnn_' + label].rank().values,
                            train['lvl2_all_rf_' + label].rank().values])
    blend_test = np.array([test['lvl2_all_lgb_' + label].rank().values,
                           test['lvl2_all_xgb_' + label].rank().values,
                           test['final-cnn_' + label].rank().values,
                           test['lvl2_all_rf_' + label].rank().values])
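The snippet ends before minimize is called; below is a sketch of the typical continuation of this loop, assuming equal starting weights, non-negativity bounds, and an SLSQP solver (all assumptions):

    # Assumed continuation: auc_func returns 1 - AUC, so minimizing it
    # maximizes AUC; constrain weights to be non-negative and sum to 1.
    starting_values = [1.0 / len(blend_train)] * len(blend_train)
    res = minimize(auc_func, starting_values, method='SLSQP',
                   bounds=[(0.0, 1.0)] * len(blend_train),
                   constraints=({'type': 'eq', 'fun': lambda w: 1.0 - sum(w)},))
    print('Best weights for ' + label + ': ' + str(res.x))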
Example no. 13

def runRidge(train_X, train_y, test_X, test_y, test_X2, params):
    model = Ridge(**params)
    print_step('Fit Ridge')
    model.fit(train_X, train_y)
    print_step('Ridge Predict 1/2')
    pred_test_y = model.predict(test_X)
    print_step('Ridge Predict 2/2')
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()

print('~~~~~~~~~~~~~~~')
print_step('Subsetting')
target = train['deal_probability']
train_id = train['item_id']
test_id = test['item_id']
train.drop(['deal_probability', 'item_id'], axis=1, inplace=True)
test.drop(['item_id'], axis=1, inplace=True)

if (not is_in_cache('tfidf_ridges') or not is_in_cache('titlecat_tfidf')
        or not is_in_cache('text_tfidf')
        or not is_in_cache('text_char_tfidf')):
    print('~~~~~~~~~~~~~~~~~~~~')
    print_step('Title TFIDF 1/2')
    tfidf = TfidfVectorizer(ngram_range=(1, 1),