Example #1
def convert(ori, des, feats):
    df_ori = utils.load_df(ori)
    for f in feats:
        tmp = utils.load_df(config.feat + 'm3_' + f)
        print(f)
        df_ori = pd.concat(
            [df_ori, tmp.drop(['session_id', 'impressions'], axis=1)], axis=1)
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori,des)
Example #2
def convert(ori, des, feat):
    df_ori = utils.load_df(ori)
    print(df_ori.shape)
    df_feat = utils.load_df(config.feat + feat)
    df_ori = df_ori.merge(df_feat,
                          on=['session_id', 'impressions'],
                          how='left')
    print(df_ori.shape)
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
Example #3
def convert(ori, des, prefix):
    df_ori = utils.load_df(ori)
    print(df_ori.shape)
    for feat in feats:
        df_feat = utils.load_df(config.model + prefix +
                                '%s.csv' % feat).rename(
                                    columns={'target': feat})
        df_ori = df_ori.merge(df_feat[['session_id', 'impressions', feat]],
                              on=['session_id', 'impressions'],
                              how='left')
        print(df_ori.shape)
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
Example #4
def convert(ori, des, sample):
    tr = utils.load_df(ori)
    print(tr.shape)
    tr_out = tr[['session_id', 'impressions']]
    dfs = utils.load_df(sample)
    dfs['impr_rank'] = dfs.groupby(['session_id', 'step']).cumcount().values
    print(dfs.head())
    tr_out = cate_encoding.cate_num_stat(dfs, tr_out,
                                         ['session_id', 'impressions'],
                                         'impr_rank', ['min', 'max', 'median'])
    tr_out.columns = tr_out.columns.astype(str)
    print(tr_out.head())
    utils.save_df(tr_out, des)
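The call to cate_encoding.cate_num_stat above comes from the project's own helper module; as a rough, hedged sketch (an assumption, not the project's implementation), the min/max/median aggregation it is asked for could be reproduced with plain pandas like this, with illustrative output column names:

import pandas as pd

def impr_rank_stats(dfs, tr_out):
    # group the sampled rows by the key columns and aggregate the rank column
    stats = (dfs.groupby(['session_id', 'impressions'])['impr_rank']
                .agg(['min', 'max', 'median'])
                .add_prefix('impr_rank_')
                .reset_index())
    # merge the statistics back onto the output frame, mirroring the call above
    return tr_out.merge(stats, on=['session_id', 'impressions'], how='left')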
Example #5
def convert(ori, des, feats):
    df_ori = utils.load_df(ori)
    print(df_ori.shape)
    for feat in feats:
        df_feat = utils.load_df(config.feat + feat)
        df_ori = df_ori.merge(df_feat,
                              on=['session_id', 'impressions'],
                              how='left')
        print(df_ori.shape)
        del df_feat
        gc.collect()
    df_ori = utils.reduce_mem(df_ori)
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
Example #6
def build_sampler(model, dataset, split, dt):
    params = dict(sd_peds=0, sd_speed=0)
    traj_list = load_trajectories(
        ut.load_df(ut.get_dataset_split_path(dataset, split, is_train=True)))
    df = ut.load_df(
        ut.get_dataset_split_path(dataset, 'split_1.0_0', is_train=True))
    if model.is_sd_s:
        params['sd_speed'] = calculate_sd_speed(df, dt)
    mu_peds, sd_peds = calculate_mu_sd_n_peds(df)
    params['mu_peds'] = mu_peds
    if model.is_sd_p:
        params['sd_peds'] = sd_peds
    r = 4.0
    sampler = Sampler(traj_list, r=r, dt=dt, **params)
    return sampler
Example #7
def get_data(adate):
    '''
    Download data from the internet and save it to a file.

    params: adate          str - the update date

    return: df on success, or None if adate is not present in df
      - df             pandas dataframe - as u.COLUMNS_WORLD['en']

    side effects: on success, save df as a csv file
    '''

    # load dataframe from the internet
    url = SOURCE
    fn = FNAME.format(adate)
    df = u.load_df(url, pd.read_csv)

    # check whether updated data is available
    check_date = datetime.strptime(adate, u.D_FMT).date().strftime(u.D_FMT2)
    bflag = df['dateRep'].str.contains(check_date).any()
    if bflag:
        df.to_csv(Path(u.DIR_DATA) / fn)
    else:
        df = None

    return df
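A hedged usage sketch for get_data, modelled on how main() calls it in Example #30 below; u.D_FMT is assumed to match the date format expected by the source:

from datetime import datetime

adate = datetime.now().date().strftime(u.D_FMT)  # assumes u.D_FMT is the expected format
df = get_data(adate)
if df is None:
    print('data from the Internet not updated for', adate)
else:
    print(df.shape)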
Example #8
def main(afile):
    # national trend
    #df = pd.read_csv(Path(u.DIR_DATA) / afile)
    #df = shape_data(df)
    #make_plot(df, afile, 'it')
    #make_plot(df, afile, 'en')

    #regional trend
    #df = u.load_df(Path(u.DIR_DATA) / afile, pd.read_csv, u.COLUMNS_RITALY['it'], encoding='utf-8')
    #df = shape_data(df)
    #make_rplot(df, afile, xlabel='data', ylabel='numero totale di casi', title=f'Covid-19: andamento temporale per le {N_MOST_HITTED} regioni più colpite', lang = 'it' )
    #make_rplot(df, afile, xlabel='date', ylabel='total number of cases', title=f'Covid-19: temporal trend for the {N_MOST_HITTED} most hitted regions', lang = 'en' )

    df = u.load_df(Path(u.DIR_DATA) / afile,
                   pd.read_csv,
                   u.COLUMNS_ITALY['it'],
                   encoding='utf-8')
    df = shape_data(df)
    make_national(
        df,
        'nuovi_positivi',
        afile,
        xlabel='data',
        ylabel='numero giornaliero di nuovi casi',
        title=
        f'Covid-19: andamento temporale giornaliero di nuovi positivi (nazionale)',
        lang='it')
    make_national(df,
                  'nuovi_positivi',
                  afile,
                  xlabel='date',
                  ylabel='daily number of new cases',
                  title=f'Covid-19: time trend of daily (national) new cases',
                  lang='en')
Example #9
def load_and_log_params(cli_params):
    cli_params = AttributeDict(cli_params)
    if cli_params.get('load_from'):
        p = load_df(cli_params.load_from, 'params').to_dict()[0]
        p = AttributeDict(p)
        for key in cli_params.keys():
            if key not in p:
                p[key] = None
        new_params = cli_params
        loaded = True
    else:
        p = cli_params
        new_params = {}
        loaded = False

        # Default dseed to seed unless dseed was specified explicitly
        if p.get('dseed') is None and p.get('seed') is not None:
            p['dseed'] = p['seed']

    logger.info('== COMMAND LINE ==')
    logger.info(' '.join(sys.argv))

    logger.info('== PARAMETERS ==')
    for k, v in p.items():
        if new_params.get(k) is not None:
            p[k] = new_params[k]
            replace_str = "<- " + str(new_params.get(k))
        else:
            replace_str = ""
        logger.info(" {!s:20s}: {!s:<20s} {}".format(k, v, replace_str))
    return p, loaded
Example #10
def load_and_log_params(cli_params):
    cli_params = AttributeDict(cli_params)
    if cli_params.get('load_from'):
        p = load_df(cli_params.load_from, 'params').to_dict()[0]
        p = AttributeDict(p)
        for key in cli_params.iterkeys():
            if key not in p:
                p[key] = None
        new_params = cli_params
        loaded = True
    else:
        p = cli_params
        new_params = {}
        loaded = False

        # Default dseed to seed unless dseed was specified explicitly
        if p.get('dseed') is None and p.get('seed') is not None:
            p['dseed'] = p['seed']

    logger.info('== COMMAND LINE ==')
    logger.info(' '.join(sys.argv))

    logger.info('== PARAMETERS ==')
    for k, v in p.iteritems():
        if new_params.get(k) is not None:
            p[k] = new_params[k]
            replace_str = "<- " + str(new_params.get(k))
        else:
            replace_str = ""
        logger.info(" {:20}: {:<20} {}".format(k, v, replace_str))
    return p, loaded
Example #11
def fit(self, dataset, dirty_train):
    dirty_raw_path = utils.get_dir(dataset, 'raw', 'raw.csv')
    clean_raw_path = utils.get_dir(dataset, 'raw',
                                   'inconsistency_clean_raw.csv')
    if not os.path.exists(clean_raw_path):
        print(
            "Must provide clean version of raw data for cleaning inconsistency"
        )
        sys.exit(1)
    dirty_raw = utils.load_df(dataset, dirty_raw_path)
    clean_raw = utils.load_df(dataset, clean_raw_path)
    N, m = dirty_raw.shape
    dirty_raw = dirty_raw.values
    clean_raw = clean_raw.values
    mask = (dirty_raw != clean_raw)
    dirty = dirty_raw[mask]
    clean = clean_raw[mask]
    self.incon_dict = dict(zip(dirty, clean))
Example #12
def fit(self, dataset, dirty_train):
    index_train_path = utils.get_dir(dataset, 'raw', 'idx_train.csv')
    index_test_path = utils.get_dir(dataset, 'raw', 'idx_test.csv')
    index_train = pd.read_csv(index_train_path).values.reshape(-1)
    index_test = pd.read_csv(index_test_path).values.reshape(-1)
    clean_path = utils.get_dir(dataset, 'raw', 'mislabel_clean_raw.csv')
    clean = utils.load_df(dataset, clean_path)
    self.clean_train = clean.loc[index_train, :]
    self.clean_test = clean.loc[index_test, :]
Example #13
def get_n_tracks(self, n_steps, dataset, split):
    if self.is_large:
        n_tracks = 500 if dataset not in ['univ'] else 100
    else:
        df = ut.load_df(
            ut.get_dataset_split_path(dataset, split, is_train=True))
        n_frames = df['t'].unique().size
        n_tracks = n_frames // (n_steps + 1)  # n_steps -> n_steps+1 positions
    return n_tracks
Example #14
def load_data(dataset, train_path, test_path_list):
    """Load and split data into features and label.

    Args: 
        dataset (dict): dataset dict in config
        train_path (string): path to the training set
        test_path_list (list): list of paths to the test sets (the missing-value and outlier scenarios have multiple test sets)
    """
    # load data
    train = utils.load_df(dataset, train_path)
    test_list = [
        utils.load_df(dataset, test_dir) for test_dir in test_path_list
    ]

    # split X, y
    label = dataset['label']
    features = [v for v in train.columns if not v == label]
    X_train, y_train = train.loc[:, features], train.loc[:, label]
    X_test_list = [test.loc[:, features] for test in test_list]
    y_test_list = [test.loc[:, label] for test in test_list]

    return X_train, y_train, X_test_list, y_test_list
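A short usage sketch for load_data; the dataset dict and file paths below are hypothetical placeholders, not values from the project's config:

dataset = {'label': 'target'}                      # hypothetical dataset dict
X_train, y_train, X_test_list, y_test_list = load_data(
    dataset,
    'train.csv',                                   # hypothetical training path
    ['test_missing.csv', 'test_outlier.csv'])      # hypothetical test paths
print(X_train.shape, len(X_test_list))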
Example #15
def load_features(path):
    """
    Load the pickle that was generated during the transformation step
    :param path: path where the pickle is located
    :return:
    """
    print("Opening feature engineering pickle from output path")
    output_path = os.path.join(path, "output", "fe_df.pkl")

    # Recuperar el pickle
    incidentes_pkl = load_df(output_path)
    print("Feature Engineering pickle successfully retrieved.")

    return incidentes_pkl
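A minimal usage sketch, assuming an earlier step wrote the feature-engineering pickle to <path>/output/fe_df.pkl (the project root below is illustrative):

fe_df = load_features('/path/to/project')  # illustrative path
print(type(fe_df))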
Example #16
def convert(ori, des, feat):
    df_ori = utils.load_df(ori)
    print(df_ori.shape)
    for c in cols:
        df_ori = cate_encoding.cate_num_rank(df_ori, ['session_id'],
                                             c,
                                             ascending=True,
                                             show_agg=True)
    df_ori = df_ori.reset_index(drop=True)
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
    utils.save_df(
        df_ori[[
            'session_id', 'impressions', 'session_id_by_prices_rank',
            'session_id_by_ctr_rank', 'session_id_by_last_ts_sub_max_rank'
        ]], feat)
Example #17
def load_and_log_params(cli_params):
    cli_params = AttributeDict(cli_params)
    # If the load_from argument is present, the parameters are read from a file (HDF format)
    if cli_params.get('load_from'):
        # load_from value + 'params' makes up the full lookup key
        # string => dict
        p = load_df(cli_params.load_from, 'params').to_dict()[0]
        # dict => AttributeDict
        p = AttributeDict(p)

        for key in cli_params.iterkeys():
            if key not in p:
                p[key] = None
        new_params = cli_params
        loaded = True

    # If there is no load_from argument, just wrap cli_params directly
    else:
        p = cli_params
        new_params = {}
        loaded = False

        # Default dseed to seed unless dseed was specified explicitly
        # (when dseed is empty and seed is not, copy seed into dseed)
        if p.get('dseed') is None and p.get('seed') is not None:
            p['dseed'] = p['seed']

    # logging
    logger.info('== COMMAND LINE ==')
    logger.info(' '.join(sys.argv))

    logger.info('== PARAMETERS ==')
    for k, v in p.iteritems():
        if new_params.get(k) is not None:
            p[k] = new_params[k]
            replace_str = "<- " + str(new_params.get(k))
        else:
            replace_str = ""
        logger.info(" {:20}: {:<20} {}".format(k, v, replace_str))
    return p, loaded
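A small usage sketch of the fallback behaviour commented above, assuming AttributeDict, load_df and logger are available as in the example: without load_from, dseed inherits seed.

params, loaded = load_and_log_params({'seed': 123, 'dseed': None})
# loaded is False and params['dseed'] == 123, because dseed defaulted to seed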
Example #18
def inject(dataset):
    """ Inject mislabels
        Args:
            dataset (dict): dataset dict in config
    """
    # create saving folder
    major_save_dir = utils.makedirs(
        [config.data_dir, dataset["data_dir"] + "_major", 'raw'])
    minor_save_dir = utils.makedirs(
        [config.data_dir, dataset["data_dir"] + "_minor", 'raw'])
    uniform_save_dir = utils.makedirs(
        [config.data_dir, dataset["data_dir"] + "_uniform", 'raw'])

    # load clean data
    clean_path = utils.get_dir(dataset, 'raw', 'raw.csv')
    clean = utils.load_df(dataset, clean_path)
    clean = clean.dropna().reset_index(drop=True)

    major_clean_path = os.path.join(major_save_dir, 'mislabel_clean_raw.csv')
    minor_clean_path = os.path.join(minor_save_dir, 'mislabel_clean_raw.csv')
    uniform_clean_path = os.path.join(uniform_save_dir,
                                      'mislabel_clean_raw.csv')
    clean.to_csv(major_clean_path, index=False)
    clean.to_csv(minor_clean_path, index=False)
    clean.to_csv(uniform_clean_path, index=False)

    label = dataset['label']

    # uniform flip
    uniform = uniform_class_noise(clean, label)
    # pairwise flip
    major, minor = pairwise_class_noise(clean, label)

    major_raw_path = os.path.join(major_save_dir, 'raw.csv')
    minor_raw_path = os.path.join(minor_save_dir, 'raw.csv')
    uniform_raw_path = os.path.join(uniform_save_dir, 'raw.csv')

    major.to_csv(major_raw_path, index=False)
    minor.to_csv(minor_raw_path, index=False)
    uniform.to_csv(uniform_raw_path, index=False)
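uniform_class_noise and pairwise_class_noise are project-specific helpers; as a rough illustration of the uniform case only, here is a minimal sketch that flips each label to a different class with a fixed probability (the flip rate and function name are assumptions, not the project's implementation):

import numpy as np
import pandas as pd

def uniform_flip(df, label, p=0.1, seed=0):
    # flip each label to a different class with probability p (illustrative only)
    rng = np.random.RandomState(seed)
    out = df.reset_index(drop=True)
    classes = out[label].unique()
    flip = rng.rand(len(out)) < p
    for i in np.where(flip)[0]:
        others = classes[classes != out.at[i, label]]
        out.at[i, label] = rng.choice(others)
    return out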
Example #19
def extract(sample, ori, feat):
    nrows = None
    df = pd.read_csv(
        sample,
        nrows=nrows,
        usecols=['session_id', 'step', 'reference', 'impressions'])
    print(df.head())
    df_ori = utils.load_df(ori)
    print(df_ori.head())
    df = df.merge(df_ori[['session_id', 'step']].drop_duplicates(),
                  on='session_id',
                  how='left')
    print(df.head())
    df = df[df.step_x < df.step_y]

    tmp = df.drop_duplicates(subset=['session_id', 'step_x'])
    df_clk = tmp.groupby(['session_id',
                          'reference'])['step_x'].agg('count').reset_index()
    print(df_clk.head())
    df_clk.rename(columns={
        'reference': 'impressions',
        'step_x': 'item_sid_clk_cnt'
    },
                  inplace=True)
    df_impr = df.groupby(['session_id',
                          'impressions'])['step_x'].agg('count').reset_index()
    print(df_impr.head())
    df_impr.rename(columns={'step_x': 'item_sid_impr_cnt'}, inplace=True)

    df_out = df_ori[['session_id', 'impressions']]
    df_out = df_out.merge(df_clk, on=['session_id', 'impressions'], how='left')
    df_out = df_out.merge(df_impr,
                          on=['session_id', 'impressions'],
                          how='left')
    print(df_out.head())
    df_out.columns = df_out.columns.astype(str)
    utils.save_df(df_out, feat)
Example #20
import sys

import pandas as pd  # needed for pd.concat below

import utils
import cate_encoding
import config

def extract(df_ori, des):
    print(df_ori.shape)
    df_ori = df_ori.merge(df, on = ['session_id'], how = 'left')
    df_ori['last_act_gap'] = df_ori['timestamp'] - df_ori['timestamp_x']
    df_ori.drop(['timestamp','timestamp_x'],axis=1,inplace=True)
    print(df_ori.head(10))
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)

nrows = None
tr = utils.load_df(config.data+'train.csv',nrows=nrows)
te = utils.load_df(config.data+'test.csv',nrows=nrows)

df = pd.concat([tr,te])
df = df[['session_id','timestamp','step','action_type','reference']]
trs = utils.load_df(config.feat+'m3_tr_0.ftr')
tes = utils.load_df(config.feat+'m3_te_0.ftr')
df_sample = pd.concat([trs,tes])
df_sample = df_sample[['session_id','timestamp']].drop_duplicates()
df = df.merge(df_sample,on='session_id',how='left')
print(df.head(10))
df = df[df.timestamp_x < df.timestamp_y]

df = df[['session_id','timestamp_x','action_type']].drop_duplicates(subset=['session_id'],keep='last')
df = cate_encoding.label_encode(df,'action_type')
df.rename(columns={'action_type':'sid_last_act'},inplace=True)
Example #21
                    type=str,
                    default="none")

parser.add_argument("-G",
                    help="gravitational constant for the simulation",
                    default=1,
                    type=float)

args = parser.parse_args()
"""/arguments"""
"""adjust spark settings"""
spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.caseSensitive", "true")
"""load data"""
df_t0 = utils.load_df(args.input,
                      schema=schemas.clust,
                      part="id",
                      limit=args.limit)
"""setup simulation"""
methods = {
    "eul1": IntergratorEuler(args.dt, args.G),
    "eul2": IntergratorEuler2(args.dt, args.G),
    "rk4": IntegratorRungeKutta4(args.dt, args.G),
    "vlf": IntegratorLeapfrog(args.dt, args.G),
}

nameStr = utils.clean_str(
    spark.conf.get("spark.app.name")) + "-" + spark.conf.get("spark.app.id")
sopts = utils.SaveOptions(os.path.join(args.outputDir, nameStr),
                          fformat=args.f,
                          compression=args.comp,
                          header="true")
Example #22
# -*-coding:utf-8-*-
# @Time         :2020/3/6/9:58
# @Author       :Lwf
# @Email        :S
# @File         :predict.py
# imports
from pandas import DataFrame
import joblib
import numpy as np

from utils import load_df


# Generate the submission file in the required format
def create_submission(ids, predictions, filename='submission.csv'):
    submissions = np.concatenate(
        (ids.reshape(len(ids), 1), predictions.reshape(len(predictions), 1)),
        axis=1)
    df = DataFrame(submissions)
    df.to_csv(filename, header=['id', 'click'], index=False)


classifier = joblib.load('classifier.pkl')
test_data_df = load_df('csv/test.csv', training=False)
print(test_data_df)
ids = test_data_df.values[0:, 0]
print(ids)
predictions = classifier.predict(test_data_df.values[0:, 1:])
print(predictions)
create_submission(ids, predictions)
Example #23
from pandas import DataFrame
from sklearn.externals import joblib
import numpy as np

from utils import load_df


def create_submission(ids, predictions, filename='../out/submission_unit.csv'):
    submissions = np.concatenate(
        (ids.reshape(len(ids), 1), predictions.reshape(len(predictions), 1)),
        axis=1)
    df = DataFrame(submissions)
    df.to_csv(filename, header=['id', 'click'], index=False)


print("*******************test starting***************************")

classifier = joblib.load('../out/model/classifier_unit.pkl')
# test_data_df = load_df('csv/test', training=False)
test_data_df = load_df('../datasets/test/test', training=False)
ids = test_data_df.values[0:, 0]

print(ids)

predictions = classifier.predict(test_data_df.values[0:, 1:])
create_submission(ids, predictions)

print("*******************test end***************************")
Example #24
def dump_feat(ori, des):
    df = utils.load_df(ori)
    df = df[cols + ['session_id', 'impressions']]
    df.columns = re_cols + ['session_id', 'impressions']
    print(df.shape)
    utils.save_df(df, des)
Example #25
    print("AUC: ", metrics.roc_auc_score(true_values, predicted_values))
    print("Confusion Matrix: ",
          +metrics.confusion_matrix(true_values, predicted_values))
    print(metrics.classification_report(true_values, predicted_values))


# Fit the classifier
def classify(classifier_class, train_input, train_targets):
    classifier_object = classifier_class()
    classifier_object.fit(train_input, train_targets)
    return classifier_object


# Save the model to disk
def save_model(clf):
    joblib.dump(clf, 'classifier.pkl')


train_data = load_df('csv/train_small.csv').values
# print(train_data)
X_train, X_test, y_train, y_test = train_test_split(train_data[0::, 1::],
                                                    train_data[0::, 0],
                                                    test_size=0.3,
                                                    random_state=0)
# print(X_train)
# print(y_train)
classifier = classify(LogisticRegression, X_train, y_train)
predictions = classifier.predict(X_test)
print_metrics(y_test, predictions)
save_model(classifier)
Example #26
def main(afile=p.FN):
    # NOTE: re-enable the following lines
    if BACKUP:
        backup()

    if DOWNLOAD:
        dt = datetime.now()
        rc = get_data(dt.date().strftime(DATEURL_FMT))
        if rc == -1:
            print("error downloading today's data")
            sys.exit(-1)

    #import pdb; pdb.set_trace()

    #df = pd.read_csv(Path(u.DIR_DATA) / afile)
    df = u.load_df(
        Path(u.DIR_DATA) / afile, pd.read_csv, u.COLUMNS_ITALY['it'])

    df = p.shape_data(df)
    p.make_plot(df, afile, 'it')
    p.make_plot(df, afile, 'en')
    # END NOTE

    p.make_national(
        df,
        'nuovi_positivi',
        afile,
        xlabel='data',
        ylabel='numero giornaliero di nuovi casi',
        title=
        f'Covid-19: andamento temporale giornaliero di nuovi positivi (nazionale)',
        lang='it')
    p.make_national(
        df,
        'nuovi_positivi',
        afile,
        xlabel='date',
        ylabel='daily number of new cases',
        title=f'Covid-19: time trend of daily (national) new cases',
        lang='en')

    rdf = u.load_df(
        Path(u.DIR_DATA) / h.FN, pd.read_csv, u.COLUMNS_RITALY['it'])
    rdf = h.shape_data(rdf)

    #p.make_rplot(rdf)
    p.make_rplot(
        rdf,
        h.FN,
        xlabel='data',
        ylabel='numero totale di casi',
        title=
        f'Covid-19: andamento temporale per le {p.N_MOST_HITTED} regioni più colpite',
        lang='it')
    p.make_rplot(
        rdf,
        h.FN,
        xlabel='date',
        ylabel='total number of cases',
        title=
        f'Covid-19: temporal trend for the {p.N_MOST_HITTED} most hitted regions',
        lang='en')

    h.make_histogram(rdf, h.FN, 'it')
    h.make_histogram(rdf, h.FN, 'en')

    article = make_article(FN_IT_TEMPLATE, df, rdf, 'it', TAB_TITLE)
    article = make_article(FN_EN_TEMPLATE, df, rdf, 'en', TAB_EN_TITLE)

    # NOTE: re-enable the following lines
    if u.ENABLE_LDFA:
        to_ldfa()
    if u.ENABLE_PRODUCTION:
        to_production()
Example #27
from utils import load_df


def print_metrics(true_values, predicted_values):
    print "Accuracy: ", metrics.accuracy_score(true_values, predicted_values)
    print "AUC: ", metrics.roc_auc_score(true_values, predicted_values)
    print "Confusion Matrix: ", + metrics.confusion_matrix(true_values, predicted_values)
    print metrics.classification_report(true_values, predicted_values)


def classify(classifier_class, train_input, train_targets):
    classifier_object = classifier_class()
    classifier_object.fit(train_input, train_targets)
    return classifier_object


def save_model(clf):
    joblib.dump(clf, 'classifier.pkl')


train_data = load_df('csv/train_small.csv').values

X_train, X_test, y_train, y_test = train_test_split(train_data[0::, 1::], train_data[0::, 0],
                                                    test_size=0.3, random_state=0)

classifier = classify(LogisticRegression, X_train, y_train)
predictions = classifier.predict(X_test)
print_metrics(y_test, predictions)
save_model(classifier)

Example #28
    tr.drop(['current_filters', 'reference', 'action_type'],
            axis=1,
            inplace=True)
    te.drop(['current_filters', 'reference', 'action_type', 'target'],
            axis=1,
            inplace=True)
    utils.save_df(te, config.data + 'm3_te.ftr')
    return tr, te


def gen_tr_click(df):
    df = df[['session_id',
             'reference']].drop_duplicates(subset='session_id',
                                           keep='last').reset_index(drop=True)
    print(df.shape)
    df = df[pd.notnull(df.reference)].reset_index(drop=True)
    print(df.shape)
    utils.save_df(df, config.data + 'm3_tr_click.ftr')


if __name__ == '__main__':
    nrow = None
    train = utils.load_df(config.data + 'sample_train.csv', nrows=nrow)
    test = utils.load_df(config.data + 'sample_test.csv', nrows=nrow)
    df = pd.concat([train, test]).reset_index(drop=True)
    tr1 = gen_train_sample(train)
    tr2, te = get_test_sample(test)
    tr = pd.concat([tr1, tr2]).reset_index(drop=True)
    utils.save_df(tr1, config.data + 'm3_tr.ftr')
    gen_tr_click(df)
Example #29
from pandas import DataFrame
from sklearn.externals import joblib
import numpy as np

from utils import load_df


def create_submission(ids, predictions, filename='submission.csv'):
    submissions = np.concatenate((ids.reshape(len(ids), 1), predictions.reshape(len(predictions), 1)), axis=1)
    df = DataFrame(submissions)
    df.to_csv(filename, header=['id', 'click'], index=False)

classifier = joblib.load('classifier.pkl')
test_data_df = load_df('csv/test', training=False)
ids = test_data_df.values[0:, 0]
predictions = classifier.predict(test_data_df.values[0:, 1:])
create_submission(ids, predictions)




Example #30
def main(adate):
    if u.ENABLE_DEBUG:
        breakpoint(
            header=f'world.py, main({adate}), break before download data')
    # get data
    if DOWNLOAD:
        df = get_data(adate)
        if df is None:  # note: `df == None` would be ambiguous when df is a DataFrame
            print('data from Internet not updated. run interrupted')
            sys.exit(1)
    else:
        df = u.load_df(u.DIR_DATA + '/' + FNAME.format(adate),
                       pd.read_csv,
                       u.COLUMNS_WORLD['en'],
                       encoding='cp1250')
    df = world_shape(df)

    df_world = df.copy()

    df_eu = modify_by_area(df)
    # make three graphs:
    #    the 10 hardest-hit countries
    make_spagetti(df,
                  datetime.strptime('2020-01-14', u.D_FMT).date(),
                  Path(u.DIR_WIMG) /
                  (os.path.splitext(FNAME.format(adate))[0] + '_1-10.png'),
                  title=TITLE10.format('2020-01-14'),
                  ss=0,
                  se=10)

    #    from 3rd to 10th of the 10 hardest-hit countries
    make_spagetti(df,
                  datetime.strptime('2020-02-20', u.D_FMT).date(),
                  Path(u.DIR_WIMG) /
                  (os.path.splitext(FNAME.format(adate))[0] + '_3-10.png'),
                  title=TITLE09.format('2020-02-20'),
                  ss=2,
                  se=10)

    #    from 1st to 10th of the 10 hardest-hit EU countries
    make_spagetti(df_eu,
                  datetime.strptime('2020-02-20', u.D_FMT).date(),
                  Path(u.DIR_WIMG) /
                  (os.path.splitext(FNAME.format(adate))[0] + '_eu_1-10.png'),
                  title=TITLEEU.format('2020-02-20'),
                  ss=0,
                  se=10)

    # make articles
    make_article(u.DIR_TEMPLATE + '/' + FN_IT_TEMPLATE, adate, df, df_world)
    make_article(u.DIR_TEMPLATE + '/' + FN_EN_TEMPLATE, adate, df, df_world)

    # make summary about all world and save it to disk
    df2 = world_get_sum_on_last_day(df_world)
    columns = [
        'date', 'cases', 'death', 'death/cases', 'cases/population',
        'death/population', 'country'
    ]
    df2.to_csv(Path(u.DIR_DATA) / FSNAME.format(adate),
               columns=columns,
               index=False,
               float_format='%.6f')

    df.to_csv(
        Path(u.DIR_DATA) / FEUNAME,
        index=False,
    )

    # copy results to ldfa filesystem and to production
    if u.ENABLE_LDFA:
        to_ldfa(adate)
    if u.ENABLE_PRODUCTION:
        to_production(adate)
Example #31
output_file_path = '/'.join(output_filepath.split('/')[:-1])
output_file_name = output_filepath.split('/')[-1]
try:
    os.mkdir(output_file_path)
except OSError:
    print('Failed to create output directory.')
else:
    print('Successfully created directory.')

# Get connection engine
engine = postgres_connector(
    database_host,
    5432,
    "intern_task",
    "candidate",
    "dcard-data-intern-2020"
)
df = load_df(engine, mode='test')
learn = load_learner(file_path, file_name, test=TabularList.from_df(df))
preds = learn.get_preds(ds_type=DatasetType.Test)[1].numpy()
final_df = pd.DataFrame({'post_key': df['post_key'], 'is_trending': preds})
final_df.to_csv('predictions.csv', header=True, index=False)


df['is_trending'] = df['like_count_36_hour'] >= 1000
df.is_trending = df.is_trending.astype(int)
y_true = df['is_trending'].to_numpy()
y_pred = preds
print('f1_score:')
print(f1_score(y_true, y_pred, average='macro'))
Example #32
    print(df_ori.shape)
    df_ori = df_ori.merge(df_sid, on=['session_id'], how='left')
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)


rows = None
tr = pd.read_csv(
    config.data + 'train.csv',
    usecols=['session_id', 'action_type', 'reference', 'impressions'],
    nrows=rows)
print(tr.shape)
te = pd.read_csv(
    config.data + 'test.csv',
    usecols=['session_id', 'action_type', 'reference', 'impressions'],
    nrows=rows)
print(te.shape)
df = pd.concat([tr, te])
print(df.shape)

df = df[df.action_type == 'clickout item']
df_sid = extract_list_cnt(df)
print(df_sid.head())

trs = utils.load_df(config.feat + 'm3_tr_0.ftr')
tes = utils.load_df(config.feat + 'm3_te_0.ftr')
tr_out = trs[['session_id', 'impressions']]
te_out = tes[['session_id', 'impressions']]
extract(tr_out, config.feat + 'm3_tr_imprlist_feat2.ftr')
extract(te_out, config.feat + 'm3_te_imprlist_feat2.ftr')
Example #33
def get_reciprocal_ranks(ps):
    """Calculate reciprocal ranks for recommendations."""
    mask = ps.reference == np.array(ps.impressions)

    if mask.sum() == 1:
        rranks = generate_rranks_range(0, len(ps.impressions))
        return np.array(rranks)[mask].min()
    else:
        return 0.0


def generate_rranks_range(start, end):
    """Generate reciprocal ranks for a given list length."""

    return 1.0 / (np.arange(start, end) + 1)


tr = pd.read_csv(config.model + '%s/tr_pred.csv' % sys.argv[1])

tr = tr.sort_values(by='target', ascending=False).reset_index(drop=True)
tr = tr.groupby(['session_id'])['impressions'].apply(list).reset_index()
print(tr.head())
print(tr.shape)

tr_click = utils.load_df(config.data + 'm3_tr_click.ftr')
tr = tr.merge(tr_click, on='session_id')
print(tr.shape)
print(tr.head())
tr['score'] = tr.apply(get_reciprocal_ranks, axis=1)
print(tr.score.mean())
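A quick worked check of the reciprocal-rank logic above, with illustrative values: a clicked item in position 3 of a 5-item impression list scores 1/3, and an item that was never shown scores 0.

import numpy as np
import pandas as pd

row = pd.Series({'reference': 'b', 'impressions': ['x', 'a', 'b', 'c', 'd']})
assert abs(get_reciprocal_ranks(row) - 1.0 / 3) < 1e-9   # clicked item is 3rd -> 1/3

row_miss = pd.Series({'reference': 'z', 'impressions': ['x', 'a', 'b', 'c', 'd']})
assert get_reciprocal_ranks(row_miss) == 0.0             # clicked item not shown -> 0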
Example #34
args = parser.parse_args()
"""/arguments"""

G = 1
TOLERANCE = 1e-04
res = []
data = [
    'c_0500.csv', 'c_0700.csv', 'c_0600.csv', 'c_1000.csv', 'c_0900.csv',
    'c_1200.csv', 'c_1100.csv', 'c_1500.csv', 'c_0300.csv', 'c_1800.csv',
    'c_1300.csv', 'c_0800.csv', 'c_1700.csv', 'c_0200.csv', 'c_0100.csv',
    'c_0400.csv', 'c_0000.csv', 'c_1600.csv', 'c_1400.csv'
]

for fname in data:
    df = utils.load_df(os.path.join(args.input, fname),
                       schema=schemas.clust,
                       part="id")
    e = cluster.calc_E(df)
    diff = abs(e - (-0.25))
    res.append([
        fname,
        e,
        -0.25,
        diff,
    ])

sc = SparkContext.getOrCreate()
res = sc.parallelize(res).toDF(schema=schemas.E_test_res)

utils.save_df(res, "E_TEST", args.outputDir, fformat="csv")