Ejemplo n.º 1
0
def main():
    """Run the EPC pipeline end-to-end: configure, fetch, clean, store, verify."""
    print("Reading configuration")

    cfg = read_config()

    print("Fetching data")

    raw = get_epc_data(cfg.epc_data_url, True)

    print("Cleaning data")

    cleaned = clean_data(raw, cfg.data_columns)

    print("Writing data")

    # Wait for postgres to start ... should be handled at the infra level
    time.sleep(5)

    db = Postgres(cfg)
    db.write_data(cleaned, cfg.postgres_table_name)

    print("Verifying data")

    # Echo what was persisted as a basic sanity check.
    print(db.read_data())

    print("Complete")
Ejemplo n.º 2
0
def create_dataset(fold=True):
    """Build per-prompt train/val/test TSV files from the ASAP essay data.

    Run once. With ``fold=True`` writes five K-fold splits under
    ``asap/fold_<n>/``; otherwise writes a single 80/10/10 split under
    ``asap/``.
    """
    essays = pd.read_csv('asap/training_set_rel3.fixed.tsv.zip',
                         sep='\t',
                         encoding='latin1')
    essays['essay'] = convert_to_ascii(essays['essay'])
    essays['essay'] = clean_data(essays['essay'])

    def dump(frame, path):
        # One split -> one tab-separated file, row index dropped.
        frame.to_csv(path, sep='\t', index=False)

    for prompt_id in range(1, 9):
        subset = essays[essays['essay_set'] == prompt_id].reset_index(
            drop=True)
        print(subset.head())

        if fold:
            splitter = KFold(n_splits=5, shuffle=True, random_state=420)
            for fold_no, (train_idx, held_idx) in enumerate(
                    splitter.split(subset), start=1):
                # First half of the held-out indices -> val, second -> test.
                half = len(held_idx) // 2
                val_idx = held_idx[:half]
                test_idx = held_idx[half:]
                print(len(train_idx), len(val_idx), len(test_idx))

                fold_dir = utils.mkpath('asap/fold_{}/'.format(fold_no))
                dump(subset.loc[train_idx],
                     fold_dir + 'prompt_{}_train.tsv'.format(prompt_id))
                dump(subset.loc[val_idx],
                     fold_dir + 'prompt_{}_val.tsv'.format(prompt_id))
                dump(subset.loc[test_idx],
                     fold_dir + 'prompt_{}_test.tsv'.format(prompt_id))
        else:
            train, held = train_test_split(subset,
                                           test_size=0.2,
                                           random_state=420,
                                           shuffle=False)
            half = len(held) // 2
            val = held[:half]
            test = held[half:]
            out_dir = utils.mkpath('asap/')
            print(len(train), len(val), len(test))
            dump(train, out_dir + 'prompt_{}_train.tsv'.format(prompt_id))
            dump(val, out_dir + 'prompt_{}_val.tsv'.format(prompt_id))
            dump(test, out_dir + 'prompt_{}_test.tsv'.format(prompt_id))
Ejemplo n.º 3
0
                    choices=['standard', 'no', 'no_var'])
# Remaining CLI options for this training script (parser is built above).
parser.add_argument('--loss_fusion', action='store_true')
parser.add_argument('--dropout', type=float, default=0.1)
parser.add_argument('--bound_opts_relu', type=str, default='zero-lb')

args = parser.parse_args()

# TensorBoard writer plus a file log handler, both rooted at args.dir.
writer = SummaryWriter(os.path.join(args.dir, 'log'), flush_secs=10)
file_handler = logging.FileHandler(os.path.join(args.dir, 'log/train.log'))
file_handler.setFormatter(
    logging.Formatter('%(levelname)-8s %(asctime)-12s %(message)s'))
logger.addHandler(file_handler)

# Load the four dataset splits. NOTE(review): `clean_data` is defined
# elsewhere — presumably it filters/normalizes examples for robust
# training; confirm against its definition.
data_train_all_nodes, data_train, data_dev, data_test = load_data(args.data)
if args.robust:
    data_dev, data_test = clean_data(data_dev), clean_data(data_test)
if args.auto_test:
    # Deterministically sample 10 test examples for a quick automated run.
    random.seed(args.seed)
    random.shuffle(data_test)
    data_test = data_test[:10]
    # The 10 sampled examples must fit into a single batch.
    assert args.batch_size >= 10
logger.info('Dataset sizes: {}/{}/{}/{}'.format(len(data_train_all_nodes),
                                                len(data_train), len(data_dev),
                                                len(data_test)))

# Seed every RNG source (python, numpy, torch CPU + all CUDA devices)
# for reproducibility.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)

dummy_embeddings = torch.zeros(1,