Beispiel #1
0
def main():
    """Clean the input dataset, impute missing values with KNN and save the results.

    Steps:
      1. Parse the command-line arguments.
      2. Fetch the dataset named ``--input_data_name`` from ``run.input_datasets``
         and clean it with ``clean_data``.
      3. Fit a distance-weighted ``KNNImputer`` on every column except
         ``default_status`` and fill the missing values.
      4. Dump the fitted imputer to ``outputs/knnimputer.joblib``.
      5. When ``--output_data`` is given, write the imputed frame there via
         ``write_output``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_data_name", type=str, help="Dataset name as registered for use")
    parser.add_argument("--dropped_columns", type=str, help="Columns to be dropped. To be inputed one after the other.")
    parser.add_argument("--threshold", type=float, help="Percentage of missing values dropped for dropped columns")
    parser.add_argument("--output_data", type=str, help="Output cleansed data.")

    # KNNImputer args
    parser.add_argument("--n_neighbors", type=int, default=5, help="Number of neighbors for KNNImputer")
    args = parser.parse_args()

    # Get dataset by name
    data = run.input_datasets[args.input_data_name]
    data_df = data.to_pandas_dataframe()
    # --dropped_columns arrives as a string and is parsed as a Python literal.
    clean_df = clean_data(data_df, threshold=args.threshold, dropped_columns=literal_eval(args.dropped_columns))
    print("Shape of cleaned dataset:\n", clean_df.shape)

    # Pop default_status so that it is not included in KNNImputer model
    y = clean_df.pop("default_status")
    imputer = KNNImputer(n_neighbors=args.n_neighbors, weights="distance", add_indicator=False)
    imputed_df = DataFrame(imputer.fit_transform(clean_df), columns=clean_df.columns)
    # imputed_df is built from a bare array and therefore carries a fresh
    # RangeIndex. Re-attach the target by position: assigning the Series
    # itself would align on index labels and scatter NaNs whenever the cleaned
    # frame's index is not 0..n-1.
    imputed_df["default_status"] = y.to_numpy()
    print("Fitted KNNImputer. Filled missing values.")

    # Dump model artifact
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(imputer, "outputs/knnimputer.joblib")
    print("Saved KNNImputer artifact.")

    if args.output_data is not None:
        write_output(imputed_df, path=args.output_data, filename="/cleaned.parquet")
Beispiel #2
0
def main():
    """Run the pre-trained classifier over the test split and write binary predictions.

    Loads the weights stored as ``<epoch>.model`` in ``epoch_dir``, scores every
    batch yielded by the test loader, binarises the sigmoid outputs at
    ``threshold`` and hands the filenames and predictions to ``write_output``.
    """
    # Load pre-trained model
    model = Classifier(num_chan=num_chan).to(device)
    checkpoint_path = os.path.join(epoch_dir, str(epoch) + '.model')
    # map_location lets a checkpoint saved on another device load onto `device`.
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    # Inference only: put layers such as dropout/batch-norm into eval behaviour.
    model.eval()

    # Create dataset loader over the 'test' split
    dataset = testDL(data_dir, 'test')
    dataloader = data.DataLoader(dataset=dataset,
                                 batch_size=batch_size,
                                 shuffle=True,
                                 num_workers=4)

    out_arr = []
    fname_arr = []
    # No gradients are needed for scoring; skipping them saves memory and time.
    with torch.no_grad():
        for dat in dataloader:
            pics = dat[0].to(device).float()
            fnames = dat[1]
            out = torch.sigmoid(model(pics)).cpu().numpy()
            out[out >= threshold] = 1
            out[out < threshold] = 0
            # Filenames and outputs are collected in the same iteration, so
            # the two lists stay aligned regardless of batch order.
            fname_arr.extend(fnames)
            out_arr.extend(out)
    write_output(fname_arr, out_arr, results_fname)
Beispiel #3
0
    def post(self):
        """Create a new user account from the request parameters.

        Reads ``alias``, ``username``, ``mail``, ``password`` and ``icon``.
        Replies through ``utils.write_output`` with:
          * 502 and a message when alias, username or password is empty;
          * 503 and a message when the username is already taken;
          * ``SUCCESS`` and the stored entity id (as a string) otherwise.
        """
        request = self.request
        name = request.get('alias')
        acct = request.get('username')
        mail = request.get('mail')
        pw = request.get('password')
        icon = request.get('icon')

        status = SUCCESS
        json_query_data = ''

        if name == '' or acct == '' or pw == '':
            status, json_query_data = 502, 'Missing alias, username or password'
        elif utils.exists(User, [['username', acct]]):
            status, json_query_data = 503, 'Already existing account'
        else:
            # Build the User entity, attach the derived credentials and
            # answer with the id of the stored entity.
            newuser = User()
            newuser.name = name
            newuser.username = acct
            newuser.mail = mail
            newuser.icon = icon
            credentials = Credentials()
            credentials.set_dk(pw)
            newuser.password = credentials
            json_query_data = str(newuser.put().id())

        utils.write_output(self, json_query_data, status)
def run_inputs(event, input_file, output_file, best_so_far_file):
    """Solve the instance in ``input_file`` and write the solution to ``output_file``.

    ``event`` and ``best_so_far_file`` are accepted but not used by this function.
    """
    print("\nBeginning " + input_file)
    parsed = utils.read_input(input_file)
    # The two leading counts are returned by the reader but not needed here.
    _num_wizards, _num_constraints, wizards, constraints = parsed
    solution = backtrack_solve(wizards, constraints)
    print("\nFound Solution")
    print(solution)
    utils.write_output(output_file, solution)
Beispiel #5
0
    def parse_arguments(self, arguments):
        """XOR the input bytes with the key bytes and write the result as bytes."""
        payload = read_input(arguments.input, use_bytes=True)
        key = read_input(arguments.key, use_bytes=True)

        write_output(self.xor_bytes(payload, key),
                     arguments.output,
                     use_bytes=True)
Beispiel #6
0
def main():
    """Correlate the two annotation files, write the result and print it."""
    paths = ("./asset/bonazzi.it.test.data.txt",
             "./asset/toscano.it.test.data.txt")
    # Read both files first, then extract the words from each reading.
    readings = [utils.read_file(path) for path in paths]
    eval_uno, eval_due = [utils.extract_word(reading) for reading in readings]

    annotation = init_annotation(eval_uno, eval_due)
    correlation = compute_correlations(annotation)
    utils.write_output(eval_uno, eval_due, correlation)
    print(correlation)
Beispiel #7
0
def test_anneal():
    """Anneal the module-level graph ``g``, plot input and result, and verify the written output."""
    graph = g
    annealed, score = annealing.anneal(graph, 120000, 1, 1, 0.0004,
                                       print_energy=True)
    print(score)
    print("C: ", utils.cost_fn(annealed))

    # Input graph is drawn on whatever figure is current.
    nx.draw(utils.mat_to_nx(graph))
    plt.savefig("/tmp/g.png")

    # The annealed result gets a figure of its own.
    plt.figure()
    nx.draw(utils.mat_to_nx(annealed))
    plt.savefig("/tmp/gres.png")

    result_path = "/tmp/res.txt"
    utils.write_output(annealed, graph, result_path)
    assert utils.verify_in_out(graph, result_path)
Beispiel #8
0
 def post(self):
     """Send an e-mail built from the ``to``, ``subject`` and ``body`` request parameters.

     Replies with 510 and a message when the recipient or subject is empty;
     otherwise replies with whatever ``utils.send_email`` returns.
     """
     status, json_query_data = SUCCESS, ''
     request = self.request
     mailto = request.get('to')
     mailsubject = request.get('subject')
     mailbody = request.get('body')

     if mailto != '' and mailsubject != '':
         status, json_query_data = utils.send_email(mailto, mailsubject,
                                                    mailbody)
     else:
         status, json_query_data = 510, 'Missing adress or subject'

     utils.write_output(self, json_query_data, status)
Beispiel #9
0
def test_initial():
    """Build the initial annealing state for one input, plot it and verify the written output."""

    def _plot(matrix, target):
        # Each drawing goes onto a fresh figure before being saved.
        plt.figure()
        nx.draw(utils.mat_to_nx(matrix))
        plt.savefig(target)

    graph = utils.read_input("inputs/large-14.in")
    print("Running")
    _plot(graph, "/tmp/G.png")

    initial_state = annealing.initial_fn(graph)
    print("Done with cost ", utils.cost_fn(initial_state))
    _plot(initial_state, "/tmp/tree.png")

    result_path = "/tmp/res.txt"
    utils.write_output(initial_state, graph, result_path)
    assert utils.verify_in_out(graph, result_path)
Beispiel #10
0
def search(input_path, output_path):
    """Find every listed word in the grid and write the word -> coordinates map.

    ``parse_input(input_path)`` supplies the word list and the grid. Every
    cell holding a word's first letter is tried as a starting point with
    ``find_word``; when several starting points succeed, the last one scanned
    (row-major order) is kept. The resulting dict is passed to
    ``write_output`` together with ``output_path``.

    An empty grid or an empty word yields no match instead of an IndexError.
    """
    words, grid = parse_input(input_path)
    max_row = len(grid)
    # An empty grid has no first row to measure.
    max_col = len(grid[0]) if max_row > 0 else 0

    solution = {}
    for word in words:
        if not word:
            # No first letter to anchor the search on.
            continue
        first_letter = word[0]
        for r in range(max_row):
            for c in range(max_col):
                if grid[r][c] == first_letter:
                    coords = find_word(r, c, max_row, max_col, word, grid)
                    if coords:
                        solution[word] = coords

    write_output(solution, output_path)
Beispiel #11
0
def save_changes():
    """Write the current routes to a timestamped CSV and pickle them next to it.

    When no routes are loaded yet, ``setup_input(inputs_plotroutes)`` is called
    first. The CSV goes through ``write_output``; the pickled copy is written
    to ``output//obj_files_<timestamp>.obj``.
    """
    global routes, geocodes

    # Identity check: None is a singleton, and == may be overloaded.
    if routes is None:
        setup_input(inputs_plotroutes)

    output_filename = datetime.now().strftime(
        "output//results_%Y-%m-%d_%H%M%S.csv")
    write_output(None, output_filename, routes)

    object_output_filename = datetime.now().strftime(
        "output//obj_files_%Y-%m-%d_%H%M%S.obj")
    # The context manager closes the file even if pickling fails.
    with open(object_output_filename, "wb") as saving_objects:
        pickle.dump(routes, saving_objects)
Beispiel #12
0
def full_run():
    """Run the complete routing pipeline and persist the combined routes.

    The same two-phase procedure is executed twice, first for special ed and
    then for magnet routing: quick parameter search with ``vary_params``
    followed by ``permutation_approach``. Both route lists are concatenated,
    summarised on stdout, written to a timestamped CSV via ``write_output``
    and pickled to a timestamped ``.obj`` file.

    Returns:
        The combined list of special-ed and magnet routes.
    """
    global start_time, start_time_orig, working_on_sped, run_finished
    run_finished = False
    setup_map_data(constants.FILENAMES[3])
    #First, try to find good parameters by doing quick runs that
    #don't do improvement procedures or bus assignment.
    setup_parameters(constants.FILENAMES[6], True)
    working_on_sped = True
    start_time = process_time()
    start_time_orig = process_time()
    print("Searching for good algorithm parameters for special ed.")
    vary_params(True, minutes=min(20, constants.MINUTES_PER_SEGMENT / 2))
    print("Special ed parameters chosen. Beginning routing")
    start_time = process_time()
    sped_routes = permutation_approach(True,
                                       minutes=constants.MINUTES_PER_SEGMENT *
                                       3 / 2)
    print("Special ed routing finished.")
    setup_parameters(constants.FILENAMES[6], False)
    working_on_sped = False
    start_time = process_time()
    start_time_orig = process_time()
    print("Searching for good algorithm parameters for magnet routing.")
    vary_params(False, minutes=min(20, constants.MINUTES_PER_SEGMENT / 2))
    print("Magnet parameters chosen. Beginning routing")
    start_time = process_time()
    magnet_routes = permutation_approach(
        False, minutes=constants.MINUTES_PER_SEGMENT * 3 / 2)
    print("Magnet routing finished.")
    all_routes = sped_routes + magnet_routes
    print("Final number of magnet routes: " + str(len(magnet_routes)))
    print("Mean student travel time of magnet routes: " +
          str(mstt(magnet_routes)) + " minutes")
    print("Final number of special ed routes: " + str(len(sped_routes)))
    print("Mean student travel time of special ed routes: " +
          str(mstt(sped_routes)) + " minutes")

    output_filename = datetime.now().strftime(
        "output//results_%Y-%m-%d_%H%M%S.csv")
    write_output(constants.FILENAMES[0], output_filename, all_routes)

    object_output_filename = datetime.now().strftime(
        "output//obj_files_%Y-%m-%d_%H%M%S.obj")
    # The context manager closes the file even if pickling fails.
    with open(object_output_filename, "wb") as saving_objects:
        pickle.dump(all_routes, saving_objects)
    run_finished = True
    return all_routes
Beispiel #13
0
def main(args) -> None:
    """
    Main pipeline for analysing barseq counts.

    Builds the run from ``args``, sets up file logging, reads the barcodes,
    counts them in every sequencing file (forward and reverse entries per
    sample) and writes the results.
    """
    # Set up the run and its output directories.
    runner = Run(args)
    make_barseq_directories(runner)

    # Mirror log records into the run's log file (overwritten on each run).
    fh = logging.FileHandler(runner.log, mode="w")
    log_format = logging.Formatter(
        "%(asctime)s - %(levelname)s - %(module)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M")
    fh.setFormatter(log_format)
    logger.addHandler(fh)
    logger.info("***** Starting barseq *****")

    # Read the barcodes with the current reader; the earlier
    # read_barcodes(runner.barcodes) call is no longer used.
    logger.info(f"Reading in barcodes from {runner.barcodes.name}")
    barcodes = read_barcodes_new(runner.barcodes)

    # Process each sequencing file in sorted order.
    for seq_file in sorted(os.listdir(runner.sequences)):
        if seq_file.endswith(".DS_Store"):
            continue
        sample = format_filename(seq_file)
        logger.info(f"Counting Barcodes in {sample}")
        # Each sample gets its own forward and reverse copy of the barcodes.
        sample_keys = [sample + '_F', sample + '_R']
        for key in sample_keys:
            runner.sample_dict[key] = deepcopy(barcodes)
        # Counting runs with the sequences folder as working directory.
        with Cd(runner.sequences):
            count_barcodes(seq_file, runner.sample_dict, sample_keys)

    # Write to output
    logger.info(f"Writing results to {runner.path}")
    write_output(runner.sample_dict, barcodes, runner)

    # Confirm completion of barseq
    logger.info("***** barseq is complete! *****")
Beispiel #14
0
def main_loop():
    """Poll for incoming PDF files forever and run the pipeline on each batch.

    Every cycle scans for input files; when some are found they are moved to
    the working folder, processed one by one with ``run_pipeline``, and their
    results written to the output folder. A failure on one file is logged and
    does not stop the batch; a failure of the whole cycle is logged and
    re-raised only when ``utils.enable_debug`` is set. Cycles are paced by the
    ``sensor.schedule`` setting (seconds).

    Only ``Exception`` is caught, so ``KeyboardInterrupt`` and ``SystemExit``
    terminate the loop instead of being logged and swallowed.
    """
    model = initialize_ai_model()

    while True:
        begin = time.time()
        try:
            input_pdf_files = utils.sensor.scan()
            if input_pdf_files:
                batch_id = datetime.now().strftime("%Y%m%d%H%M%S")
                file_names = ", ".join(file.name for file in input_pdf_files)
                logger.info(
                    f'Starting batch "{batch_id}" with '
                    f'{len(input_pdf_files)} files: "{file_names}"'
                )

                working_pdf_files = utils.move_to_working_folder(
                    input_pdf_files)
                for file in working_pdf_files:
                    logger.info(f'Reading "{file.name}"')
                    try:
                        result = run_pipeline(model, file)
                        output_file = utils.move_to_output_folder(
                            batch_id, file)
                        utils.write_output(output_file, result)
                        utils.log_master_record(
                            file.name, result['SE_output'][-1]['page_number'])
                    except Exception as e:
                        # Best effort per file: log and continue with the next.
                        logger.exception(
                            f'Error while processing file "{file}" due to \n{e}'
                        )

                if not utils.enable_debug:
                    utils.delete_cache()
                os.system(f'chmod -R a+rw {utils.Folder.output}')
                logger.info(f'Batch "{batch_id}" finished.')

        except Exception as e:
            logger.exception(e)
            if utils.enable_debug:
                raise

        # Sleep for the remainder of the scheduled interval. Done after the
        # try block (not in `finally`) so a propagating exception or an
        # interrupt is not delayed by the sleep.
        sleep_time = int(
            utils.settings['sensor']['schedule']) - (time.time() - begin)
        time.sleep(max(sleep_time, 0))
Beispiel #15
0
 def post(self):
     """Set a new password for an existing account.

     Replies with 506 when username or password is empty, 505 when no account
     matches the username, otherwise ``SUCCESS`` and the stored entity id.
     """
     request = self.request
     acct = request.get('username')
     pw = request.get('password')
     status, json_query_data = SUCCESS, ''

     if acct == '' or pw == '':
         status, json_query_data = 506, 'Missing username or password'
     else:
         matches = User.query(User.username == acct).fetch(limit=1)
         if matches:
             user = matches[0]
             credentials = Credentials()
             credentials.set_dk(pw)
             user.password = credentials
             json_query_data = str(user.put().id())
         else:
             status, json_query_data = 505, 'Unexisting account'

     utils.write_output(self, json_query_data, status)
    def parse_arguments(self, arguments):
        """Run the requested action on the input text and write the result.

        ``arguments.lang`` selects the alphabet ('en' or 'ru'; any other value
        leaves ``self.alphabet`` untouched). ``arguments.action`` names the
        handler: the method ``<action>_action`` is looked up on ``self`` and
        called with the input text and ``arguments``.

        Raises:
            NotImplementedError: when no ``<action>_action`` method exists.
        """
        if arguments.lang == 'en':
            self.alphabet = Alphabet(en_letters)
        elif arguments.lang == 'ru':
            self.alphabet = Alphabet(ru_letters)

        # Resolve the handler before touching any file so that an unsupported
        # action is reported immediately.
        try:
            action = getattr(self, arguments.action + '_action')
        except AttributeError:
            # Trailing space after "have" keeps the two literals from fusing
            # into "haveto".
            raise NotImplementedError(
                "Action {0} is not implemented. You have "
                "to add method {0}_action".format(arguments.action))

        # read input
        input_text = read_input(arguments.input)

        # process input
        output_text = action(input_text, arguments)

        # write to output file
        write_output(output_text, arguments.output)
Beispiel #17
0
def gen_complete_test_set(tfc_reader):
    """
    Generate the Complete Test Set for gates having all
    positive controls and no constant line or garbage
    (NCT and GT library), and write it to ``cts.txt``.

    Every 0/1 input pattern of the circuit's length is pushed through the
    gates in order. A pattern is written (in its final, post-gate state) when,
    just before applying some gate with at least two fields, every line
    value equals 1.
    """
    rev_non_tfc(tfc_reader)

    length = get_length()
    tfc_gates = get_tfc_gates()

    # The context manager closes the file even if a gate operation fails.
    with open("cts.txt", 'w') as output_writter:
        output_writter.write(str("||"))

        # Iterate lazily instead of materialising all 2**length patterns.
        for pattern in itertools.product([0, 1], repeat=length):
            Flag = 0

            olist = initalize_output_dict(pattern)

            for gate in tfc_gates:
                fields = gate.split()

                # Check if all values are 1 or not
                if all(x == 1 for x in olist.values()):
                    if len(fields) >= 2:
                        Flag = 1

                olist = gate_operation(fields, olist)

            if Flag == 1:
                write_output(output_writter, olist)
Beispiel #18
0
def runfile(infile, outfile, score_to_beat=1e99):
    """Anneal the graph in ``infile`` and write the result when it beats ``score_to_beat``.

    The annealing parameters are picked from ``params`` by the first of
    "small", "medium", "large" found in the file name; unknown names fall
    back to the "medium" set after printing a warning.

    Returns:
        The score reported by the annealer.
    """
    G = utils.read_input(infile)

    for size in ("small", "medium", "large"):
        if size in infile:
            break
    else:
        print("NOT A RECOGNIZED FILETYPE!!")
        size = "medium"

    iters, ps, pp, scale = params[size]
    result, score = annealing.anneal(G, iters, ps, pp, scale,
                                     print_energy=False)

    # Only overwrite the output when this run improved on the given score.
    if score < score_to_beat:
        utils.write_output(result, G, outfile)
    return score
Beispiel #19
0
    def post(self):
        """Reset the password of an account and mail the new one to its owner.

        Replies through ``utils.write_output`` with:
          * 507 when the username is empty;
          * 505 when no account matches;
          * 508 when the account has no e-mail address;
          * otherwise the status returned by ``utils.send_email`` together
            with the stored entity id, or the mailer's message on failure.

        The generated password is never written to the log.
        """
        acct = self.request.get('username')
        json_query_data = ''
        status = SUCCESS
        if acct == '':
            json_query_data = 'Missing username'
            status = 507
        if status == SUCCESS:
            results = User.query(User.username == acct).fetch(limit=1)
            if len(results) == 0:
                json_query_data = 'Unexisting account'
                status = 505
        if status == SUCCESS:
            user = results[0]
            mailto = user.mail
            # Treat a missing (None) address like an empty one.
            if not mailto:
                json_query_data = 'Account without email. Password cannot be reset'
                status = 508
        if status == SUCCESS:
            # NOTE(review): confirm utils.id_generator draws from a
            # cryptographically secure source; it is not visible here.
            pw = utils.id_generator(PASSWORD_DEFAULT_LENGTH)
            c = Credentials()
            c.set_dk(pw)
            name = user.name
            user.password = c
            pkey = user.put()
            json_query_data = str(pkey.id())
            with open('html/mail_reset.html', 'r') as myfile:
                body_template = myfile.read().replace('\n', '')
            # Only the template is logged; the formatted body contains the
            # plaintext password and must stay out of the logs.
            logging.info(body_template)
            body = body_template.format(name, acct, pw)

            # NOTE(review): another handler in this file unpacks
            # utils.send_email as (status, message); confirm which order the
            # helper actually returns.
            (ret, status) = utils.send_email(mailto, "Password reset", body,
                                             True)
            if status != SUCCESS:
                json_query_data = ret
        utils.write_output(self, json_query_data, status)
Beispiel #20
0
    def post(self):
        """Authenticate a user by username and password.

        Replies through ``utils.write_output`` with:
          * 501 when username or password is empty;
          * 502 and a generic message when the account is unknown or the
            password does not verify;
          * ``SUCCESS`` and the formatted user record otherwise.

        Both failure causes share one code path, including the delay, so the
        response time does not reveal whether the account exists.
        """
        status = SUCCESS
        json_query_data = ''
        acct = self.request.get('username')
        pw = self.request.get('password')
        if pw == '' or acct == '':
            status = 501
            json_query_data = 'Missing username or password'
        else:
            results = User.query(User.username == acct).fetch(limit=1)
            if len(results) > 0 and results[0].password.verify(pw):
                json_query_data = utils.json_formatter(results)
            else:
                status = 502
                json_query_data = 'Invalid username or password'  # no hints to hackers
                time.sleep(3)  # slow down guessing, for every kind of failure

        utils.write_output(self, json_query_data, status)
Beispiel #21
0
 def post(self):
     """Update alias, mail and icon of an existing account.

     Replies with 504 when alias or username is empty, 505 when no account
     matches the username, otherwise ``SUCCESS`` and the stored entity id.
     """
     request = self.request
     name = request.get('alias')
     acct = request.get('username')
     mail = request.get('mail')
     icon = request.get('icon')
     status, json_query_data = SUCCESS, ''

     if acct == '' or name == '':
         status, json_query_data = 504, 'Missing alias or username'
     else:
         matches = User.query(User.username == acct).fetch(limit=1)
         if matches:
             user = matches[0]
             user.name = name
             user.mail = mail
             user.icon = icon
             json_query_data = str(user.put().id())
         else:
             status, json_query_data = 505, 'Unexisting account'

     utils.write_output(self, json_query_data, status)
Beispiel #22
0
def gen_level_wise_output(tfc_reader):
    """
    Generate the level-wise output for gates having all
    positive controls and no constant line or garbage
    (NCT and GT library), and write it to ``lwo.txt``.

    For every 0/1 input pattern one line is produced: a leading ``||``, the
    input pattern itself, then the circuit state after each gate in turn.
    """
    gen_non_tfc(tfc_reader)

    length = get_length()
    tfc_gates = get_tfc_gates()

    # The context manager closes the file even if a gate operation fails.
    with open("lwo.txt", 'w') as output_writter:
        # Iterate lazily instead of materialising all 2**length test patterns.
        for pattern in itertools.product([0, 1], repeat=length):
            output_writter.write(str("||"))
            olist = initalize_output_dict(pattern)

            # Printing Input Pattern
            write_output(output_writter, olist)

            for gate in tfc_gates:
                olist = gate_operation(gate.split(), olist)

                # Printing Level-Wise-Output
                write_output(output_writter, olist)

            output_writter.write(str("\n"))
Beispiel #23
0
# Early stopping is currently switched off. To re-enable it, build
# EarlyStopping(monitor='val_rmse', min_delta=0.01, patience=7, verbose=1)
# and pass it to model.fit through `callbacks=[...]`.

# Order in which the categorical inputs are concatenated; the same order is
# used for training and for prediction.
_CAT_INPUT_KEYS = ("cat_data", "parent_data", "region_data", "city_data",
                   "image_data", "user_data", "day_data")


def _assemble_inputs(cont, padded):
    """Concatenate categorical inputs, the other features and the padded text."""
    inputs = cont['cat_d'][_CAT_INPUT_KEYS[0]]
    for key in _CAT_INPUT_KEYS[1:]:
        inputs = inputs + cont['cat_d'][key]
    return inputs + cont["other_feat"] + [padded]


target = np.array(train.deal_probability)

# Epoch count is fixed at 3 here rather than taken from conf.modelling.num_epochs.
hist = model.fit(_assemble_inputs(train_cont, padded_words),
                 target,
                 batch_size=conf.modelling.batch_size,
                 epochs=3,
                 validation_split=0.1,
                 shuffle=True,
                 verbose=2)

preds = model.predict(_assemble_inputs(test_cont, padded_test))

from utils import write_output, plot_history

plot_history(hist)
write_output(preds, conf)
Beispiel #24
0
 def save(self):
     """Store the node pool and members in the community record and write it out."""
     community = self._community
     community['node_pool'] = self._node_pool
     community['members'] = self._community_members
     utils.write_output(community, self._write_path)
Beispiel #25
0
def main():
    """Train an XGBoost model or score new data with a saved one, then write predictions.

    Command-line flags:
      --train       train (and checkpoint) instead of scoring
      --inputfile   file name inside the ``data`` directory
      --outputfile  file name inside the ``output`` directory

    Training cleans the input, performs a stratified train/validation/test
    split, fits the preprocessing and the tuned model, checkpoints both under
    ``model`` and logs the AUC of each split. Scoring loads those checkpoints
    and predicts on the cleaned input. In both modes the ids and predicted
    probabilities are handed to ``write_output``.
    """
    # -----------------------------------------------------------------------------------
    # Adjustable Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--train', action = 'store_true', help = 'training or scoring')
    parser.add_argument(
        '--inputfile', type = str, help = 'input data file name')
    parser.add_argument(
        '--outputfile', type = str, help = 'output prediction file name')
    args = parser.parse_args()

    # directory for the input data and output prediction:
    DATA_DIR = 'data'
    OUTPUT_DIR = 'output'

    # columns used:
    CAT_COLS = ['Auction', 'Transmission', 'WheelType', 'Nationality',
                'Size', 'TopThreeAmericanName', 'IsOnlineSale']
    NUM_COLS = ['VehicleAge', 'VehOdo', 'VehBCost', 'WarrantyCost',
                'MMRCurrentAuctionAveragePrice', 'MMRAcquisitionAuctionAveragePrice',
                'MMRCurrentAuctionCleanPrice', 'MMRAcquisitionAuctionCleanPrice',
                'MMRCurrentRetailAveragePrice', 'MMRAcquisitionRetailAveragePrice',
                'MMRCurrentRetailCleanPrice', 'MMRAcquisitonRetailCleanPrice']
    DATE_COLS = ['PurchDate']
    LABEL_COL = 'IsBadBuy'
    IDS_COL = 'RefId'

    # current time for computing recency feature
    NOW = '2010-12-31'

    # modeling step:
    # model checkpoint for future scoring
    MODEL_DIR = 'model'
    CHECKPOINT_XGB = 'xgb.pkl'
    CHECKPOINT_PREPROCESS = 'preprocess.pkl'

    # parameters that are only relevant for the training stage, not scoring
    if args.train:
        # number of cross validation and hyperparameter settings to try
        CV = 10
        N_ITER = 5
        MODEL_RANDOM_STATE = 4321

        # train/validation stratified split
        VAL_SIZE = 0.1
        TEST_SIZE = 0.1
        SPLIT_RANDOM_STATE = 1234

    # -----------------------------------------------------------------------------------
    logger.info('preprocessing')
    checkpoint_preprocess = os.path.join(MODEL_DIR, CHECKPOINT_PREPROCESS)
    checkpoint_xgb = os.path.join(MODEL_DIR, CHECKPOINT_XGB)
    input_path = os.path.join(DATA_DIR, args.inputfile)

    if args.train:
        data = clean(input_path, NOW, CAT_COLS, NUM_COLS, DATE_COLS, IDS_COL, LABEL_COL)
        ids = data[IDS_COL].values
        label = data[LABEL_COL].values
        data = data.drop([IDS_COL, LABEL_COL], axis = 1)

        # train/test split twice to achieve train/validation/test three way split
        df_train, df_test, y_train, y_test, ids_train, ids_test = train_test_split(
            data, label, ids, test_size = TEST_SIZE,
            random_state = SPLIT_RANDOM_STATE, stratify = label)

        df_train, df_val, y_train, y_val, ids_train, ids_val = train_test_split(
            df_train, y_train, ids_train, test_size = VAL_SIZE,
            random_state = SPLIT_RANDOM_STATE, stratify = y_train)

        # obtain finalized columns: everything that is not categorical
        num_cols_cleaned = list(SortedSet(df_train.columns) - SortedSet(CAT_COLS))
        preprocess = Preprocesser(num_cols = num_cols_cleaned, cat_cols = CAT_COLS)
        X_train = preprocess.fit_transform(df_train)
        X_val = preprocess.transform(df_val)
        X_test = preprocess.transform(df_test)

        logger.info('modeling')
        eval_set = [(X_train, y_train), (X_val, y_val)]
        xgb_tuned = build_xgb(N_ITER, CV, MODEL_RANDOM_STATE, eval_set)
        xgb_tuned.fit(X_train, y_train)

        # exist_ok avoids the check-then-create race of isdir() + mkdir()
        os.makedirs(MODEL_DIR, exist_ok = True)

        dump(preprocess, checkpoint_preprocess)
        dump(xgb_tuned, checkpoint_xgb)

        # model evaluation metric reporting
        y_pred = []
        xgb_best = xgb_tuned.best_estimator_
        zipped = zip(
            ('train', 'validation', 'test'),
            (X_train, X_val, X_test),
            (y_train, y_val, y_test))
        for name, X, y in zipped:
            xgb_pred = xgb_best.predict_proba(
                X, ntree_limit = xgb_best.best_ntree_limit)[:, 1]
            score = round(roc_auc_score(y, xgb_pred), 3)
            logger.info('{} AUC: {}'.format(name, score))
            y_pred.append(xgb_pred)

        # stack ids and predictions in the same train/validation/test order
        ids = np.hstack((ids_train, ids_val, ids_test))
        y_pred = np.hstack(y_pred)
    else:
        data = clean(input_path, NOW, CAT_COLS, NUM_COLS, DATE_COLS, IDS_COL)
        ids = data[IDS_COL].values
        data = data.drop(IDS_COL, axis = 1)

        logger.info('scoring')
        preprocess = load(checkpoint_preprocess)
        xgb_tuned = load(checkpoint_xgb)
        X = preprocess.transform(data)
        xgb_best = xgb_tuned.best_estimator_
        y_pred = xgb_best.predict_proba(
            X, ntree_limit = xgb_best.best_ntree_limit)[:, 1]

    # exist_ok avoids the check-then-create race of isdir() + mkdir()
    os.makedirs(OUTPUT_DIR, exist_ok = True)

    output_path = os.path.join(OUTPUT_DIR, args.outputfile)
    write_output(ids, IDS_COL, y_pred, LABEL_COL, output_path)
Beispiel #26
0
  test_data = random.sample(train_data, int(len(train_data)*.1)) 
  train_data = [post for post in train_data if post not in test_data] 

# Convert to an array so that the fold index arrays below can be used for
# fancy indexing.
train_data = np.array(train_data)
# initialize model
model = pizza_model.PizzaModel(params)

# train 
#model.train(train_data)

# test 
#predictions = model.test(test_data)

# K-fold testing: every sample is predicted exactly once, by a model trained
# on the other folds.
# NOTE(review): `cross_validation.KFold(n, n_folds=...)` looks like the legacy
# scikit-learn interface - confirm the pinned library version.
kf = cross_validation.KFold(len(train_data), n_folds=10)
predictions = np.zeros(len(train_data))
for train_index, test_index in kf:
  train_fold, test_fold = train_data[train_index], train_data[test_index]
  model.train(train_fold)
  predictions[test_index] = model.test(test_fold)

desired = utils.get_labels_from_post_list(train_data)

# evaluate
if not params.TESTING:
  #desired = utils.get_labels_from_post_list(test_data)
  # Python 2 print statement: this script does not run under Python 3 as is.
  print classification_report(desired, predictions)
else:
  # write output to file
  # NOTE(review): `predictions` holds the K-fold predictions for train_data,
  # yet it is paired with test_data here - confirm this is intended.
  utils.write_output(test_data, predictions) 
Beispiel #27
0
def main():
    """Evaluate a trained sequence tagger on the NER input file and write its predictions.

    Reads tokens and tags from ``args.input``, maps them to ids with the
    pickled vocabularies from ``args.vocab_path``, extracts the hand-crafted
    features, rebuilds the model selected by ``args.model`` / ``args.crafted``,
    restores its weights from ``args.checkpoint``, runs it in 'eval' mode and
    passes the predictions to ``utils.write_output``.
    """
    global cuda
    cuda = torch.cuda.is_available()
    if cuda:
        # Propagate the CUDA flag to the modules that consult it.
        train_sequence.cuda = cuda
        sequence_tagger.cuda = cuda
        utils.cuda = cuda
        train_sequence_crafted.cuda = cuda

    if args.crafted:
        this_train_sequence = train_sequence_crafted
    else:
        this_train_sequence = train_sequence

    utils.log('start reading ner file ')
    (token_list, tag_list,
     raw_token_list) = utils.prepare_data(args.input, True)
    # NOTE(review): unpickling executes code from the file; only load vocab
    # files from a trusted source.
    with open(args.vocab_path, 'rb') as vocab_file:
        vocabs = pickle.load(vocab_file)

    # One id array per sentence, mapping every tag through the label dictionary.
    y_dict = vocabs['y_dict']
    y = [np.array([y_dict[tag] for tag in tags]) for tags in tag_list]

    x = tdh.build_input_data(token_list, vocabs['vocabulary'])

    #extract crafted features
    train_data = utils.get_data_with_pos_tag(raw_token_list, tag_list)
    features = utils.extract_features(train_data, vocabs['uptl'],
                                      vocabs['treatment_suffix'],
                                      vocabs['disease_suffix'], vocabs['dis'])

    ds_data = {'x': x, 'y': y, 'z': features}

    ds = sequence_dataset.sequence_dataset(
        '.',
        'test',
        ds_data,
        word_counts=vocabs['word_counts'],
        vocabulary_inv=vocabs['vocabulary_inv'],
        crafted_features=args.crafted)

    # Sequence lengths (capped at 999999) handed to the batch sampler.
    sample_lengths = [min(999999, len(sample[0])) for sample in ds]
    val_loader = DataLoader(ds,
                            batch_sampler=data_samplers.BatchSampler(
                                sample_lengths, 256, shuffle=False),
                            num_workers=4)

    vocab_size = ds.vocab_size
    embedding_init = vocabs['embedding_init']
    embedding_init = embedding_init[:vocab_size]
    if args.model == 'bilstm':
        if args.crafted:
            model = sequence_tagger.BilstmSequenceTaggerCraftedFeatures(
                len(vocabs['y_dict']),
                vocab_size,
                embedding_size=embedding_init.shape[1],
                hidden_size=args.hidden_size,
                intermediate_size=args.intermediate_size,
                embedding_init=embedding_init,
                crafted_features_size=args.num_crafted)
            criterion = nn.CrossEntropyLoss()
            if cuda:
                criterion.cuda()
            #
            my_loss_fn = lambda x, y, z, m: utils.std_loss_fn_crafted(
                x, y, z, m, criterion)
        else:
            model = sequence_tagger.BilstmSequenceTagger(
                len(vocabs['y_dict']),
                vocab_size,
                embedding_size=embedding_init.shape[1],
                hidden_size=args.hidden_size,
                intermediate_size=args.intermediate_size,
                embedding_init=embedding_init)
            criterion = nn.CrossEntropyLoss()
            if cuda:
                criterion.cuda()
            #
            my_loss_fn = lambda x, y, m: utils.std_loss_fn(x, y, m, criterion)

    else:
        # Any model name other than 'bilstm' selects the BiLSTM-CRF tagger.
        model = sequence_tagger.BilstmCRFSequenceTagger(
            len(vocabs['y_dict']),
            vocab_size,
            embedding_size=embedding_init.shape[1],
            hidden_size=args.hidden_size,
            intermediate_size=args.intermediate_size,
            embedding_init=embedding_init)
        my_loss_fn = utils.lstm_crf_neg_log_likelihood_loss1

    checkpoint = torch.load(args.checkpoint)
    model.load_state_dict(checkpoint['model'])
    # Only the predictions are needed; the first two return values are unused.
    _, _, all_pred = this_train_sequence.compute_sequence(-1,
                                                          model,
                                                          my_loss_fn,
                                                          val_loader,
                                                          None,
                                                          'eval',
                                                          None,
                                                          None, [],
                                                          return_preds=True)
    utils.write_output(all_pred, raw_token_list, vocabs['y_dict_inv'],
                       args.output)
        # print progress only ten times, expensive because of get_score
        if cur_step % (num_steps // 10) == 0:
            progress_printer.print(
                cur_step, get_score(vehicle_to_rides, bonus, num_steps))

    return vehicle_to_rides


def to_string(vehicle_to_rides):
    """Render the vehicle -> rides assignment as a list of output lines.

    One line is produced per key of ``vehicle_to_rides``, in iteration order.
    Each line is the number of rides assigned to that vehicle, a single
    space, and then the ``number`` attribute of every assigned ride joined by
    spaces (so a vehicle without rides yields ``"0 "``).
    """

    def describe(assigned):
        # Ride numbers in assignment order, space separated.
        numbers = ' '.join(str(ride.number) for ride in assigned)
        return f"{len(assigned)} {numbers}"

    return [describe(vehicle_to_rides[vehicle]) for vehicle in vehicle_to_rides]


if __name__ == '__main__':
    # Script entry point: solve every listed problem instance in turn and
    # write one solution per instance.
    instances = [
        'a_example', 'b_should_be_easy', 'c_no_hurry', 'd_metropolis',
        'e_high_bonus'
    ]
    for instance in instances:
        # "\033[95m" / "\033[0m" are ANSI escape sequences wrapping the heading.
        print(f'\n\033[95msolving instance {instance}:\033[0m')
        # Only num_vehicles, rides, bonus and num_steps are used below; the
        # remaining parsed values are not referenced in this block.
        rows, columns, num_vehicles, num_rides, bonus, num_steps, rides = utils.read_input(
            instance, get_input)
        vehicle_to_rides = get_greedy_solution2(num_vehicles, rides, bonus,
                                                num_steps)
        # Serialize the assignment to text lines before handing it to the writer.
        utils.write_output(instance, to_string(vehicle_to_rides))
def get_initial_allocation(cache_importance, video_sizes, num_caches, cache_size):
    """Greedily build a starting allocation of videos to caches.

    Caches are visited in the iteration order of ``cache_importance`` and the
    candidate videos of each cache in the order they are listed. A video is
    placed in the first cache that still has room for it, and each video is
    placed in at most one cache.

    Args:
        cache_importance: mapping ``cache -> iterable of (video, score)``
            pairs; only the video element of each pair is used here.
        video_sizes: indexable by video, giving the size of each video.
        num_caches: number of caches; caches are identified as
            ``0 .. num_caches - 1``.
        cache_size: capacity of every cache.

    Returns:
        dict mapping every cache id in ``range(num_caches)`` to the set of
        videos stored in it (possibly empty).
    """
    allocation = {c: set() for c in range(num_caches)}
    space_left = {c: cache_size for c in range(num_caches)}
    # A set gives O(1) membership tests; the previous list made every lookup
    # linear in the number of videos already placed.
    already_cached = set()
    for cache in cache_importance:
        for video, _ in cache_importance[cache]:
            size = video_sizes[video]
            if size <= space_left[cache] and video not in already_cached:
                already_cached.add(video)
                allocation[cache].add(video)
                space_left[cache] -= size
    return allocation


def to_string(allocation):
    """Render a cache allocation as a list of output lines.

    The first line is the number of cache descriptions that follow. Each
    following line is a cache id and then the videos stored in that cache,
    all separated by single spaces.

    Only caches that actually hold at least one video are described, so the
    count on the first line always equals the number of description lines
    (previously empty caches were listed too, contradicting the header and
    producing lines with a trailing space).

    Args:
        allocation: mapping ``cache -> collection of videos``.

    Returns:
        list of strings, one per output line.
    """
    used_caches = [cache for cache in allocation if allocation[cache]]
    lines = [f"{len(used_caches)}"]
    lines.extend(
        f"{cache} {' '.join(map(str, allocation[cache]))}"
        for cache in used_caches
    )
    return lines


if __name__ == '__main__':
    # Script entry point: solve every listed problem instance in turn and
    # write one solution per instance.
    instances = ['example', 'me_at_the_zoo', 'videos_worth_spreading', 'trending_today', 'kittens']

    for instance in instances:
        # "\033[95m" / "\033[0m" are ANSI escape sequences wrapping the heading.
        print(f'\n\033[95msolving instance {instance}:\033[0m')
        latencies, requests, video_sizes, cache_size, num_caches = utils.read_input(instance, get_input)
        # Per-cache ranking of candidate videos, consumed by the local search.
        cache_importance = get_cache_importance(latencies, requests, video_sizes, num_caches)
        allocation = solve_by_local_search(cache_importance, latencies, requests, video_sizes, cache_size, num_caches)
        # Serialize the allocation to text lines before handing it to the writer.
        utils.write_output(instance, to_string(allocation))
Beispiel #30
0
def populate_graph(graph, node1, node2):
    """Add the directed edge ``node1 -> node2`` to an adjacency mapping.

    ``graph`` maps a node to the list of its successors. The mapping is
    updated in place (creating the successor list on first use) and is also
    returned for convenience.
    """
    graph.setdefault(node1, []).append(node2)
    return graph


def calc(lines):
    """Count the nodes that lie on exactly one of the YOU and SAN paths.

    Every input line is scanned for runs of upper-case letters and digits;
    the first two tokens found are taken as an edge ``first -> second`` and
    added to an adjacency mapping via ``populate_graph``.

    Args:
        lines: iterable of text lines, each holding at least two tokens.

    Returns:
        The size of the symmetric difference between the node sets of the
        paths returned by ``find_path(graph, "YOU", "COM", [])`` and
        ``find_path(graph, "SAN", "COM", [])``.

    Raises:
        IndexError: if a line contains fewer than two tokens.
    """
    token_re = re.compile(r"[A-Z0-9]+")
    graph = {}

    for line in lines:
        tokens = token_re.findall(line.strip())
        graph = populate_graph(graph, tokens[0], tokens[1])

    # NOTE(review): find_path is defined outside this block; it is assumed to
    # return an iterable of hashable nodes -- confirm against its definition.
    you_path = find_path(graph, "YOU", "COM", [])
    san_path = find_path(graph, "SAN", "COM", [])

    # Nodes shared by both paths cancel out; only the differing parts remain.
    return len(set(you_path) ^ set(san_path))


if __name__ == '__main__':
    # Script entry point: read the input lines, compute the answer, then
    # write it out and pass it to the result checker.
    lines = read_input()
    # The result is converted to a string before being written and checked.
    result = str(calc(lines))
    write_output(result)
    check_result(result)
Beispiel #31
0
def main():
    """Train the model or score a data file, then write the predictions.

    Command-line flags:
        --train       when set, run the training pipeline; otherwise score
                      with previously saved checkpoints.
        --inputfile   name of the input file inside the ``data`` directory.
        --outputfile  name of the prediction file inside ``output``.

    Training: clean the data, split it three ways (train / validation /
    test), fit the preprocessor and the tuned XGBoost search, save both
    checkpoints under ``model`` and log the AUC of each split.
    Scoring: clean the data, load both checkpoints and predict.

    In both modes the predicted positive-class probabilities are written,
    together with the row ids, to the output path.
    """
    # -----------------------------------------------------------------------------------
    # Adjustable Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--train',
                        action='store_true',
                        help='training or scoring')
    parser.add_argument('--inputfile', type=str, help='input data file name')
    parser.add_argument('--outputfile',
                        type=str,
                        help='output prediction file name')
    args = parser.parse_args()

    # directory for the input data and output prediction:
    DATA_DIR = 'data'
    OUTPUT_DIR = 'output'

    # columns used:
    CAT_COLS = [
        'Auction', 'Transmission', 'WheelType', 'Nationality', 'Size',
        'TopThreeAmericanName', 'IsOnlineSale'
    ]
    NUM_COLS = [
        'VehicleAge', 'VehOdo', 'VehBCost', 'WarrantyCost',
        'MMRCurrentAuctionAveragePrice', 'MMRAcquisitionAuctionAveragePrice',
        'MMRCurrentAuctionCleanPrice', 'MMRAcquisitionAuctionCleanPrice',
        'MMRCurrentRetailAveragePrice', 'MMRAcquisitionRetailAveragePrice',
        'MMRCurrentRetailCleanPrice', 'MMRAcquisitonRetailCleanPrice'
    ]
    DATE_COLS = ['PurchDate']
    LABEL_COL = 'IsBadBuy'
    IDS_COL = 'RefId'

    # current time for computing recency feature
    NOW = '2010-12-31'

    # modeling step:
    # model checkpoint for future scoring
    MODEL_DIR = 'model'
    CHECKPOINT_PREPROCESS = os.path.join(MODEL_DIR, 'preprocess.pkl')
    CHECKPOINT_XGB = os.path.join(MODEL_DIR, 'xgb.pkl')

    # parameters that are only relevant for the training stage, not scoring
    if args.train:
        # number of cross validation folds and hyperparameter settings to try
        CV = 10
        N_ITER = 5
        MODEL_RANDOM_STATE = 4321

        # train/validation stratified split
        VAL_SIZE = 0.1
        TEST_SIZE = 0.1
        SPLIT_RANDOM_STATE = 1234

    # -----------------------------------------------------------------------------------
    logger.info('preprocessing')
    input_path = os.path.join(DATA_DIR, args.inputfile)
    if args.train:
        data = clean(input_path, NOW, CAT_COLS, NUM_COLS, DATE_COLS, IDS_COL,
                     LABEL_COL)
        # ids and labels are pulled out so they are not used as features
        ids = data[IDS_COL].values
        label = data[LABEL_COL].values
        data = data.drop([IDS_COL, LABEL_COL], axis=1)

        # train/test split twice to achieve train/validation/test three way split
        df_train, df_test, y_train, y_test, ids_train, ids_test = train_test_split(
            data,
            label,
            ids,
            test_size=TEST_SIZE,
            random_state=SPLIT_RANDOM_STATE,
            stratify=label)

        df_train, df_val, y_train, y_val, ids_train, ids_val = train_test_split(
            df_train,
            y_train,
            ids_train,
            test_size=VAL_SIZE,
            random_state=SPLIT_RANDOM_STATE,
            stratify=y_train)

        # obtain finalized columns: every remaining column that is not
        # categorical is treated as numeric
        num_cols_cleaned = list(
            SortedSet(df_train.columns) - SortedSet(CAT_COLS))
        preprocess = Preprocesser(num_cols=num_cols_cleaned, cat_cols=CAT_COLS)
        # the preprocessor is fitted on the training split only
        X_train = preprocess.fit_transform(df_train)
        X_val = preprocess.transform(df_val)
        X_test = preprocess.transform(df_test)

        logger.info('modeling')
        eval_set = [(X_train, y_train), (X_val, y_val)]
        xgb_tuned = build_xgb(N_ITER, CV, MODEL_RANDOM_STATE, eval_set)
        xgb_tuned.fit(X_train, y_train)

        # exist_ok avoids the check-then-create race of isdir() + mkdir()
        os.makedirs(MODEL_DIR, exist_ok=True)

        dump(preprocess, CHECKPOINT_PREPROCESS)
        dump(xgb_tuned, CHECKPOINT_XGB)

        # model evaluation metric reporting
        y_pred = []
        xgb_best = xgb_tuned.best_estimator_
        zipped = zip(('train', 'validation', 'test'), (X_train, X_val, X_test),
                     (y_train, y_val, y_test))
        for name, X, y in zipped:
            # column 1 of predict_proba is used as the positive-class score
            xgb_pred = xgb_best.predict_proba(
                X, ntree_limit=xgb_best.best_ntree_limit)[:, 1]
            score = round(roc_auc_score(y, xgb_pred), 3)
            logger.info('{} AUC: {}'.format(name, score))
            y_pred.append(xgb_pred)

        # ids are stacked in the same train/validation/test order as the
        # predictions so that rows stay aligned in the output
        ids = np.hstack((ids_train, ids_val, ids_test))
        y_pred = np.hstack(y_pred)
    else:
        # no label column is passed when scoring
        data = clean(input_path, NOW, CAT_COLS, NUM_COLS, DATE_COLS, IDS_COL)
        ids = data[IDS_COL].values
        data = data.drop(IDS_COL, axis=1)

        logger.info('scoring')
        # NOTE(review): the checkpoints are ".pkl" files; if `load` unpickles
        # them, only load checkpoints from a trusted source.
        preprocess = load(CHECKPOINT_PREPROCESS)
        xgb_tuned = load(CHECKPOINT_XGB)
        X = preprocess.transform(data)
        xgb_best = xgb_tuned.best_estimator_
        y_pred = xgb_best.predict_proba(
            X, ntree_limit=xgb_best.best_ntree_limit)[:, 1]

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    output_path = os.path.join(OUTPUT_DIR, args.outputfile)
    write_output(ids, IDS_COL, y_pred, LABEL_COL, output_path)
Beispiel #32
0
    if args.max_level is None:
        max_level = class_tree.get_height()
    else:
        max_level = args.max_level

    wstc = WSTC(input_shape=x.shape,
                class_tree=class_tree,
                max_level=max_level,
                sup_source=args.sup_source,
                y=y,
                vocab_sz=vocab_sz,
                word_embedding_dim=word_embedding_dim,
                block_thre=args.gamma,
                block_level=args.block_level)

    total_counts = sum(word_counts[ele] for ele in word_counts)
    total_counts -= word_counts[vocabulary_inv_list[0]]
    background_array = np.zeros(vocab_sz)
    for i in range(1, vocab_sz):
        background_array[i] = word_counts[vocabulary_inv[i]] / total_counts

    for level in range(max_level):
        y_pred = proceed_level(x, sequences, wstc, args, pretrain_epochs,
                               self_lr, decay, update_interval, delta,
                               class_tree, level, expand_num, background_array,
                               max_doc_length, max_sent_length, len_avg,
                               len_std, beta, alpha, vocabulary_inv,
                               common_words)
    write_output(y_pred, perm, class_tree, './' + args.dataset)
    compute_metrics(y_pred, y)
    model = CrossEntropyClassifier(nclasses)
    history = model.fit(Xtrain, Ytrain, best, lr)
elif classifier == 'svm_ovo':
    K = svm_kernel.build_K(Xtrain)

    if do_validation:
        model = KernelSVMOneVsOneClassifier(nclasses, svm_kernel)
        model.fit(Xtrain, Ytrain, C, validation, K=K, check=True)

    if do_prediction:
        model = KernelSVMOneVsOneClassifier(nclasses, svm_kernel)
        model.fit(Xtrain, Ytrain, C, K=K)
elif classifier == 'svm_ova':
    K = svm_kernel.build_K(Xtrain)

    if do_validation:
        model = KernelSVMOneVsAllClassifier(nclasses, svm_kernel)
        model.fit(Xtrain, Ytrain, C, validation, K=K, check=True)

    if do_prediction:
        model = KernelSVMOneVsAllClassifier(nclasses, svm_kernel)
        model.fit(Xtrain, Ytrain, C, K=K)
else:
    raise Exception("Unknown classifier")

if do_prediction:
    print("Predicting on test data")
    Ytest = model.predict(Xtest)
    write_output(Ytest, 'results/Yte_' + output_suffix + '.csv')