def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_data_name", type=str,
                        help="Dataset name as registered for use")
    parser.add_argument("--dropped_columns", type=str,
                        help="Columns to be dropped, input one after the other.")
    parser.add_argument("--threshold", type=float,
                        help="Percentage of missing values above which a column is dropped")
    parser.add_argument("--output_data", type=str, help="Output cleansed data.")
    # KNNImputer args
    parser.add_argument("--n_neighbors", type=int, default=5,
                        help="Number of neighbors for KNNImputer")
    args = parser.parse_args()

    # Get dataset by name
    data = run.input_datasets[args.input_data_name]
    data_df = data.to_pandas_dataframe()
    clean_df = clean_data(data_df, threshold=args.threshold,
                          dropped_columns=literal_eval(args.dropped_columns))
    print("Shape of cleaned dataset:\n", clean_df.shape)

    # Pop default_status so that it is not included in the KNNImputer model
    y = clean_df.pop("default_status")
    imputer = KNNImputer(n_neighbors=args.n_neighbors, weights="distance",
                         add_indicator=False)
    imputed_df = DataFrame(imputer.fit_transform(clean_df), columns=clean_df.columns)
    imputed_df["default_status"] = y
    print("Fitted KNNImputer. Filled missing values.")

    # Dump model artifact
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(imputer, "outputs/knnimputer.joblib")
    print("Saved KNNImputer artifact.")

    if args.output_data is not None:
        write_output(imputed_df, path=args.output_data, filename="/cleaned.parquet")
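# `write_output` itself is not shown in this snippet. A minimal sketch, assuming
# it writes the DataFrame as parquet under the given output mount; the signature
# mirrors the call above, but everything inside the body is an assumption.
import os
from pandas import DataFrame

def write_output(df: DataFrame, path: str, filename: str) -> None:
    os.makedirs(path, exist_ok=True)  # the output mount may not exist yet
    df.to_parquet(path + filename)    # e.g. "<path>/cleaned.parquet"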
def main():
    # Load pre-trained model
    model = Classifier(num_chan=num_chan).to(device)
    model.load_state_dict(
        torch.load(os.path.join(epoch_dir, str(epoch) + '.model')))
    model.eval()  # disable dropout/batch-norm updates during inference

    # Create dataset loader over the whole test set
    dataset = testDL(data_dir, 'test')
    dataloader = data.DataLoader(dataset=dataset, batch_size=batch_size,
                                 shuffle=True, num_workers=4)

    out_arr = []
    fname_arr = []
    data_iter = iter(dataloader)
    for idx, dat in enumerate(data_iter):
        pics = dat[0].to(device).float()
        fnames = dat[1]
        out = torch.sigmoid(model(pics)).detach().cpu().numpy()
        out[out >= threshold] = 1
        out[out < threshold] = 0
        fname_arr += [*fnames]
        out_arr += [*out]

    write_output(fname_arr, out_arr, results_fname)
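# `write_output` is not shown here. A minimal sketch, assuming it pairs each
# filename with its thresholded prediction row in a CSV; the column layout and
# the file format are assumptions.
import csv

def write_output(fname_arr, out_arr, results_fname):
    with open(results_fname, 'w', newline='') as f:
        writer = csv.writer(f)
        for fname, out in zip(fname_arr, out_arr):
            writer.writerow([fname, *out])  # one row per image: name, labels...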
def post(self):
    name = self.request.get('alias')
    acct = self.request.get('username')
    mail = self.request.get('mail')
    pw = self.request.get('password')
    icon = self.request.get('icon')
    status = SUCCESS
    json_query_data = ''
    if pw == '' or acct == '' or name == '':
        status = 502
        json_query_data = 'Missing alias, username or password'
    elif utils.exists(User, [['username', acct]]):
        status = 503
        json_query_data = 'Already existing account'
    else:
        # Create the User object and return its key
        newuser = User()
        newuser.name = name
        newuser.username = acct
        newuser.mail = mail
        newuser.icon = icon
        c = Credentials()
        c.set_dk(pw)
        newuser.password = c
        pkey = newuser.put()
        json_query_data = str(pkey.id())
    utils.write_output(self, json_query_data, status)
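# `utils.write_output(self, json_query_data, status)` is shared by the handlers
# in this collection but not shown. A minimal sketch, assuming it serializes the
# payload and status code into the webapp2 response; the JSON field names are
# assumptions.
import json

def write_output(handler, json_query_data, status):
    handler.response.headers['Content-Type'] = 'application/json'
    handler.response.write(json.dumps({'status': status,
                                       'data': json_query_data}))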
def run_inputs(event, input_file, output_file, best_so_far_file):
    print("\nBeginning " + input_file)
    num_wizards, num_constraints, wizards, constraints = utils.read_input(input_file)
    solution = backtrack_solve(wizards, constraints)
    print("\nFound Solution")
    print(solution)
    utils.write_output(output_file, solution)
def parse_arguments(self, arguments):
    input_bytes = read_input(arguments.input, use_bytes=True)
    key_bytes = read_input(arguments.key, use_bytes=True)
    # process input
    xored_bytes = self.xor_bytes(input_bytes, key_bytes)
    write_output(xored_bytes, arguments.output, use_bytes=True)
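# `self.xor_bytes` is not shown. A minimal sketch, assuming a repeating-key XOR
# in which the key cycles when it is shorter than the input.
from itertools import cycle

def xor_bytes(input_bytes: bytes, key_bytes: bytes) -> bytes:
    """XOR each input byte with the corresponding key byte, cycling the key."""
    if not key_bytes:
        raise ValueError("key must not be empty")
    return bytes(b ^ k for b, k in zip(input_bytes, cycle(key_bytes)))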
def main():
    path_uno = "./asset/bonazzi.it.test.data.txt"
    path_due = "./asset/toscano.it.test.data.txt"
    reading_uno = utils.read_file(path_uno)
    reading_due = utils.read_file(path_due)
    eval_uno = utils.extract_word(reading_uno)
    eval_due = utils.extract_word(reading_due)
    correlation = compute_correlations(init_annotation(eval_uno, eval_due))
    utils.write_output(eval_uno, eval_due, correlation)
    print(correlation)
def test_anneal():
    G = g
    result, score = annealing.anneal(G, 120000, 1, 1, 0.0004, print_energy=True)
    print(score)
    print("C: ", utils.cost_fn(result))
    nx.draw(utils.mat_to_nx(G))
    plt.savefig("/tmp/g.png")
    plt.figure()
    nx.draw(utils.mat_to_nx(result))
    plt.savefig("/tmp/gres.png")
    utils.write_output(result, G, "/tmp/res.txt")
    assert utils.verify_in_out(G, "/tmp/res.txt")
def post(self):
    json_query_data = ''
    status = SUCCESS
    mailto = self.request.get('to')
    mailsubject = self.request.get('subject')
    mailbody = self.request.get('body')
    if mailto == '' or mailsubject == '':
        json_query_data = 'Missing address or subject'
        status = 510
    else:
        (status, json_query_data) = utils.send_email(mailto, mailsubject, mailbody)
    utils.write_output(self, json_query_data, status)
def test_initial():
    G = utils.read_input("inputs/large-14.in")
    print("Running")
    plt.figure()
    nx.draw(utils.mat_to_nx(G))
    plt.savefig("/tmp/G.png")
    state = annealing.initial_fn(G)
    cost = utils.cost_fn(state)
    print("Done with cost ", cost)
    plt.figure()
    nx.draw(utils.mat_to_nx(state))
    plt.savefig("/tmp/tree.png")
    utils.write_output(state, G, "/tmp/res.txt")
    assert utils.verify_in_out(G, "/tmp/res.txt")
def search(input_path, output_path):
    words, grid = parse_input(input_path)
    max_row = len(grid)
    max_col = len(grid[0])
    solution = {}
    for word in words:
        for r in range(max_row):
            for c in range(max_col):
                if grid[r][c] == word[0]:
                    coords = find_word(r, c, max_row, max_col, word, grid)
                    if coords:
                        solution[word] = coords
    write_output(solution, output_path)
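# `find_word` is not shown above. A minimal sketch, assuming the word must lie
# on a straight line in one of the eight compass directions starting at (r, c);
# the return convention (list of coordinates, or None) is an assumption.
def find_word(r, c, max_row, max_col, word, grid):
    directions = [(-1, -1), (-1, 0), (-1, 1),
                  (0, -1),           (0, 1),
                  (1, -1),  (1, 0),  (1, 1)]
    for dr, dc in directions:
        coords = []
        rr, cc = r, c
        for ch in word:
            # stop this direction on the first out-of-bounds or mismatched cell
            if not (0 <= rr < max_row and 0 <= cc < max_col) or grid[rr][cc] != ch:
                coords = None
                break
            coords.append((rr, cc))
            rr, cc = rr + dr, cc + dc
        if coords:
            return coords
    return None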
def save_changes():
    global routes, geocodes
    if routes is None:
        setup_input(inputs_plotroutes)
    output_filename = datetime.now().strftime(
        "output//results_%Y-%m-%d_%H%M%S.csv")
    write_output(None, output_filename, routes)
    object_output_filename = datetime.now().strftime(
        "output//obj_files_%Y-%m-%d_%H%M%S.obj")
    with open(object_output_filename, "wb") as saving_objects:
        pickle.dump(routes, saving_objects)
def full_run():
    global start_time, start_time_orig, working_on_sped, run_finished
    run_finished = False
    setup_map_data(constants.FILENAMES[3])

    # First, try to find good parameters by doing quick runs that
    # don't do improvement procedures or bus assignment.
    setup_parameters(constants.FILENAMES[6], True)
    working_on_sped = True
    start_time = process_time()
    start_time_orig = process_time()
    print("Searching for good algorithm parameters for special ed.")
    vary_params(True, minutes=min(20, constants.MINUTES_PER_SEGMENT / 2))
    print("Special ed parameters chosen. Beginning routing.")
    start_time = process_time()
    sped_routes = permutation_approach(
        True, minutes=constants.MINUTES_PER_SEGMENT * 3 / 2)
    print("Special ed routing finished.")

    setup_parameters(constants.FILENAMES[6], False)
    working_on_sped = False
    start_time = process_time()
    start_time_orig = process_time()
    print("Searching for good algorithm parameters for magnet routing.")
    vary_params(False, minutes=min(20, constants.MINUTES_PER_SEGMENT / 2))
    print("Magnet parameters chosen. Beginning routing.")
    start_time = process_time()
    magnet_routes = permutation_approach(
        False, minutes=constants.MINUTES_PER_SEGMENT * 3 / 2)
    print("Magnet routing finished.")

    all_routes = sped_routes + magnet_routes
    print("Final number of magnet routes: " + str(len(magnet_routes)))
    print("Mean student travel time of magnet routes: " +
          str(mstt(magnet_routes)) + " minutes")
    print("Final number of special ed routes: " + str(len(sped_routes)))
    print("Mean student travel time of special ed routes: " +
          str(mstt(sped_routes)) + " minutes")

    output_filename = datetime.now().strftime(
        "output//results_%Y-%m-%d_%H%M%S.csv")
    write_output(constants.FILENAMES[0], output_filename, all_routes)
    object_output_filename = datetime.now().strftime(
        "output//obj_files_%Y-%m-%d_%H%M%S.obj")
    with open(object_output_filename, "wb") as saving_objects:
        pickle.dump(all_routes, saving_objects)
    run_finished = True
    return all_routes
def main(args) -> None:
    """
    Main pipeline to analyze barseq counts.
    """
    # Create a folder, named after the experiment, to hold the log file
    # and barcode counts; errors out if the folder already exists.
    runner = Run(args)
    make_barseq_directories(runner)

    # Add file handler
    fh = logging.FileHandler(runner.log, mode="w")  # create a log file
    fh.setFormatter(logging.Formatter(
        "%(asctime)s - %(levelname)s - %(module)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M"))
    logger.addHandler(fh)
    logger.info("***** Starting barseq *****")

    # Read barcodes from the fasta file
    logger.info(f"Reading in barcodes from {runner.barcodes.name}")
    # barcodes = read_barcodes(runner.barcodes)  # old implementation
    barcodes = read_barcodes_new(runner.barcodes)

    # Process each sequencing file
    seq_files_list = sorted(os.listdir(runner.sequences))
    for seq_file in seq_files_list:
        if not seq_file.endswith(".DS_Store"):
            sample = format_filename(seq_file)
            logger.info(f"Counting barcodes in {sample}")
            runner.sample_dict[sample + '_F'] = deepcopy(barcodes)
            runner.sample_dict[sample + '_R'] = deepcopy(barcodes)
            # Change cwd
            with Cd(runner.sequences):
                count_barcodes(seq_file, runner.sample_dict,
                               [sample + '_F', sample + '_R'])

    # Write to output
    logger.info(f"Writing results to {runner.path}")
    write_output(runner.sample_dict, barcodes, runner)

    # Confirm completion of barseq
    logger.info("***** barseq is complete! *****")
def main_loop():
    model = initialize_ai_model()
    while True:
        begin = time.time()
        try:
            input_pdf_files = utils.sensor.scan()
            if input_pdf_files:
                batch_id = f'{datetime.now().strftime("%Y%m%d%H%M%S")}'
                logger.info(
                    f'Starting batch "{batch_id}" with '
                    f'{len(input_pdf_files)} files: '
                    f'"{", ".join([file.name for file in input_pdf_files])}"')
                working_pdf_files = utils.move_to_working_folder(input_pdf_files)
                for file in working_pdf_files:
                    logger.info(f'Reading "{file.name}"')
                    try:
                        result = run_pipeline(model, file)
                        output_file = utils.move_to_output_folder(batch_id, file)
                        utils.write_output(output_file, result)
                        utils.log_master_record(
                            file.name, result['SE_output'][-1]['page_number'])
                    except BaseException as e:
                        logger.exception(
                            f'Error while processing file "{file}":\n{e}')
                if not utils.enable_debug:
                    utils.delete_cache()
                os.system(f'chmod -R a+rw {utils.Folder.output}')
                logger.info(f'Batch "{batch_id}" finished.')
        except BaseException as e:
            logger.exception(e)
            if utils.enable_debug:
                raise
        finally:
            # sleep for whatever remains of the scheduled interval
            sleep_time = int(
                utils.settings['sensor']['schedule']) - (time.time() - begin)
            time.sleep(sleep_time if sleep_time > 0 else 0)
def post(self):
    acct = self.request.get('username')
    pw = self.request.get('password')
    json_query_data = ''
    status = SUCCESS
    if acct == '' or pw == '':
        json_query_data = 'Missing username or password'
        status = 506
    else:
        results = User.query(User.username == acct).fetch(limit=1)
        if len(results) == 0:
            json_query_data = 'Nonexistent account'
            status = 505
        else:
            user = results[0]
            c = Credentials()
            c.set_dk(pw)
            user.password = c
            pkey = user.put()
            json_query_data = str(pkey.id())
    utils.write_output(self, json_query_data, status)
def parse_arguments(self, arguments):
    if arguments.lang == 'en':
        self.alphabet = Alphabet(en_letters)
    elif arguments.lang == 'ru':
        self.alphabet = Alphabet(ru_letters)
    # read input
    input_text = read_input(arguments.input)
    # process input
    try:
        action = getattr(self, arguments.action + '_action')
    except AttributeError:
        raise NotImplementedError(
            "Action {0} is not implemented. You have to add a method "
            "{0}_action".format(arguments.action))
    output_text = action(input_text, arguments)
    # write to output file
    write_output(output_text, arguments.output)
def gen_complete_test_set(tfc_reader):
    """
    Generates the complete test set for gates having all positive controls
    and no constant or garbage lines, for the NCT and GT libraries.
    """
    rev_non_tfc(tfc_reader)
    length = get_length()
    # tp -- test patterns
    tp = list(itertools.product([0, 1], repeat=length))
    tfc_gates = get_tfc_gates()
    output_writer = open("cts.txt", 'w')
    output_writer.write("||")
    for pattern in tp:
        flag = 0
        olist = initalize_output_dict(pattern)
        for gate in tfc_gates:
            gate = gate.split()
            gate_len = len(gate)
            # Check whether all line values are 1
            if all(x == 1 for x in olist.values()):
                if gate_len >= 2:
                    flag = 1
            olist = gate_operation(gate, olist)
        if flag == 1:
            write_output(output_writer, olist)
    output_writer.seek(0, 0)
    output_writer.close()
    return
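# `gate_operation` (used above and in the level-wise variant below) is not
# shown. A minimal sketch, assuming an all-positive-control Toffoli-style gate
# whose last listed line is the target and the rest are controls; the .tfc
# token layout and the olist key/value convention are assumptions.
def gate_operation(gate, olist):
    controls, target = gate[1:-1], gate[-1]   # gate[0] is the gate name, e.g. 't3'
    if all(olist[c] == 1 for c in controls):
        olist[target] ^= 1                    # flip the target line
    return olist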
def runfile(infile, outfile, score_to_beat=1e99):
    G = utils.read_input(infile)
    # print("Processing", infile)
    if "small" in infile:
        param = params["small"]
    elif "medium" in infile:
        param = params["medium"]
    elif "large" in infile:
        param = params["large"]
    else:
        print("NOT A RECOGNIZED FILETYPE!!")
        param = params["medium"]
    iters, ps, pp, scale = param
    result, score = annealing.anneal(G, iters, ps, pp, scale, print_energy=False)
    # print(f"- {score}")
    if score < score_to_beat:
        utils.write_output(result, G, outfile)
        # assert utils.verify_in_out(G, outfile)  # TODO REMOVE, just while fixing bugs
    # os.remove(infile)
    return score
def post(self):
    acct = self.request.get('username')
    # pw = self.request.get('password')
    json_query_data = ''
    status = SUCCESS
    if acct == '':
        json_query_data = 'Missing username'
        status = 507
    if status == SUCCESS:
        results = User.query(User.username == acct).fetch(limit=1)
        if len(results) == 0:
            json_query_data = 'Nonexistent account'
            status = 505
    if status == SUCCESS:
        user = results[0]
        mailto = user.mail
        if len(mailto) == 0:
            json_query_data = 'Account without email. Password cannot be reset'
            status = 508
    if status == SUCCESS:
        pw = utils.id_generator(PASSWORD_DEFAULT_LENGTH)
        c = Credentials()
        c.set_dk(pw)
        name = user.name
        user.password = c
        pkey = user.put()
        json_query_data = str(pkey.id())
        with open('html/mail_reset.html', 'r') as myfile:
            subject = myfile.read().replace('\n', '')
        logging.info(subject)
        subject = subject.format(name, acct, pw)
        logging.info(subject)
        (ret, status) = utils.send_email(mailto, "Password reset", subject, True)
        if status != SUCCESS:
            json_query_data = ret
    utils.write_output(self, json_query_data, status)
def post(self):
    status = SUCCESS
    json_query_data = ''
    acct = self.request.get('username')
    pw = self.request.get('password')
    if pw == '' or acct == '':
        status = 501
        json_query_data = 'Missing username or password'
    else:
        results = User.query(User.username == acct).fetch(limit=1)
        if len(results) > 0:
            c = results[0].password
            if not c.verify(pw):
                status = 502
                json_query_data = 'Invalid username or password'  # no hints to hackers
            else:
                json_query_data = utils.json_formatter(results)
        else:
            status = 502
            json_query_data = 'Invalid username or password'  # no hints to hackers
            time.sleep(3)  # hackers can wait...
    utils.write_output(self, json_query_data, status)
def post(self):
    name = self.request.get('alias')
    acct = self.request.get('username')
    mail = self.request.get('mail')
    icon = self.request.get('icon')
    json_query_data = ''
    status = SUCCESS
    if acct == '' or name == '':
        json_query_data = 'Missing alias or username'
        status = 504
    else:
        results = User.query(User.username == acct).fetch(limit=1)
        if len(results) == 0:
            json_query_data = 'Nonexistent account'
            status = 505
        else:
            user = results[0]
            user.name = name
            user.mail = mail
            user.icon = icon
            pkey = user.put()
            json_query_data = str(pkey.id())
    utils.write_output(self, json_query_data, status)
def gen_level_wise_output(tfc_reader):
    """
    Generates the level-wise output for gates having all positive controls
    and no constant or garbage lines, for the NCT and GT libraries.
    """
    gen_non_tfc(tfc_reader)
    length = get_length()
    # tp -- test patterns
    tp = list(itertools.product([0, 1], repeat=length))
    tfc_gates = get_tfc_gates()
    output_writer = open("lwo.txt", 'w')
    for pattern in tp:
        output_writer.write("||")
        olist = initalize_output_dict(pattern)
        # Print the input pattern
        write_output(output_writer, olist)
        for gate in tfc_gates:
            gate = gate.split()
            olist = gate_operation(gate, olist)
            # Print the level-wise output
            write_output(output_writer, olist)
        output_writer.write("\n")
    output_writer.seek(0, 0)
    output_writer.close()
    return
# early_stopping = EarlyStopping(monitor='val_rmse', min_delta=0.01, patience=7, verbose=1)
# callbacks_list = [early_stopping]
target = np.array(train.deal_probability)
hist = model.fit(
    train_cont['cat_d']['cat_data'] + train_cont['cat_d']['parent_data'] +
    train_cont['cat_d']['region_data'] + train_cont['cat_d']['city_data'] +
    train_cont['cat_d']['image_data'] + train_cont['cat_d']['user_data'] +
    train_cont['cat_d']['day_data'] + train_cont['other_feat'] +
    [padded_words],
    target,
    batch_size=conf.modelling.batch_size,
    epochs=3,  # conf.modelling.num_epochs
    validation_split=0.1,
    shuffle=True,
    verbose=2,
    # callbacks=callbacks_list
)

preds = model.predict(
    test_cont['cat_d']['cat_data'] + test_cont['cat_d']['parent_data'] +
    test_cont['cat_d']['region_data'] + test_cont['cat_d']['city_data'] +
    test_cont['cat_d']['image_data'] + test_cont['cat_d']['user_data'] +
    test_cont['cat_d']['day_data'] + test_cont['other_feat'] +
    [padded_test],
)

from utils import write_output, plot_history

plot_history(hist)
write_output(preds, conf)
def save(self):
    self._community['node_pool'] = self._node_pool
    self._community['members'] = self._community_members
    utils.write_output(self._community, self._write_path)
def main():
    # -------------------------------------------------------------------------
    # Adjustable Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', action='store_true',
                        help='training or scoring')
    parser.add_argument('--inputfile', type=str, help='input data file name')
    parser.add_argument('--outputfile', type=str,
                        help='output prediction file name')
    args = parser.parse_args()

    # directories for the input data and output prediction:
    DATA_DIR = 'data'
    OUTPUT_DIR = 'output'

    # columns used:
    CAT_COLS = ['Auction', 'Transmission', 'WheelType', 'Nationality', 'Size',
                'TopThreeAmericanName', 'IsOnlineSale']
    NUM_COLS = ['VehicleAge', 'VehOdo', 'VehBCost', 'WarrantyCost',
                'MMRCurrentAuctionAveragePrice', 'MMRAcquisitionAuctionAveragePrice',
                'MMRCurrentAuctionCleanPrice', 'MMRAcquisitionAuctionCleanPrice',
                'MMRCurrentRetailAveragePrice', 'MMRAcquisitionRetailAveragePrice',
                'MMRCurrentRetailCleanPrice', 'MMRAcquisitonRetailCleanPrice']
    DATE_COLS = ['PurchDate']
    LABEL_COL = 'IsBadBuy'
    IDS_COL = 'RefId'

    # current time for computing the recency feature
    NOW = '2010-12-31'

    # modeling step:
    # model checkpoints for future scoring
    MODEL_DIR = 'model'
    CHECKPOINT_XGB = 'xgb.pkl'
    CHECKPOINT_PREPROCESS = 'preprocess.pkl'

    # parameters only relevant for the training stage, not scoring
    if args.train:
        # number of cross-validation folds and hyperparameter settings to try
        CV = 10
        N_ITER = 5
        MODEL_RANDOM_STATE = 4321

        # train/validation stratified split
        VAL_SIZE = 0.1
        TEST_SIZE = 0.1
        SPLIT_RANDOM_STATE = 1234
    # -------------------------------------------------------------------------

    logger.info('preprocessing')
    checkpoint_preprocess = os.path.join(MODEL_DIR, CHECKPOINT_PREPROCESS)
    checkpoint_xgb = os.path.join(MODEL_DIR, CHECKPOINT_XGB)
    input_path = os.path.join(DATA_DIR, args.inputfile)
    if args.train:
        data = clean(input_path, NOW, CAT_COLS, NUM_COLS,
                     DATE_COLS, IDS_COL, LABEL_COL)
        ids = data[IDS_COL].values
        label = data[LABEL_COL].values
        data = data.drop([IDS_COL, LABEL_COL], axis=1)

        # train/test split twice to achieve a train/validation/test three-way split
        df_train, df_test, y_train, y_test, ids_train, ids_test = train_test_split(
            data, label, ids,
            test_size=TEST_SIZE,
            random_state=SPLIT_RANDOM_STATE,
            stratify=label)
        df_train, df_val, y_train, y_val, ids_train, ids_val = train_test_split(
            df_train, y_train, ids_train,
            test_size=VAL_SIZE,
            random_state=SPLIT_RANDOM_STATE,
            stratify=y_train)

        # obtain finalized columns
        num_cols_cleaned = list(SortedSet(df_train.columns) - SortedSet(CAT_COLS))
        preprocess = Preprocesser(num_cols=num_cols_cleaned, cat_cols=CAT_COLS)
        X_train = preprocess.fit_transform(df_train)
        X_val = preprocess.transform(df_val)
        X_test = preprocess.transform(df_test)

        logger.info('modeling')
        eval_set = [(X_train, y_train), (X_val, y_val)]
        xgb_tuned = build_xgb(N_ITER, CV, MODEL_RANDOM_STATE, eval_set)
        xgb_tuned.fit(X_train, y_train)
        if not os.path.isdir(MODEL_DIR):
            os.mkdir(MODEL_DIR)
        dump(preprocess, checkpoint_preprocess)
        dump(xgb_tuned, checkpoint_xgb)

        # model evaluation metric reporting
        y_pred = []
        xgb_best = xgb_tuned.best_estimator_
        zipped = zip(('train', 'validation', 'test'),
                     (X_train, X_val, X_test),
                     (y_train, y_val, y_test))
        for name, X, y in zipped:
            xgb_pred = xgb_best.predict_proba(
                X, ntree_limit=xgb_best.best_ntree_limit)[:, 1]
            score = round(roc_auc_score(y, xgb_pred), 3)
            logger.info('{} AUC: {}'.format(name, score))
            y_pred.append(xgb_pred)

        ids = np.hstack((ids_train, ids_val, ids_test))
        y_pred = np.hstack(y_pred)
    else:
        data = clean(input_path, NOW, CAT_COLS, NUM_COLS, DATE_COLS, IDS_COL)
        ids = data[IDS_COL].values
        data = data.drop(IDS_COL, axis=1)

        logger.info('scoring')
        preprocess = load(checkpoint_preprocess)
        xgb_tuned = load(checkpoint_xgb)
        X = preprocess.transform(data)
        xgb_best = xgb_tuned.best_estimator_
        y_pred = xgb_best.predict_proba(
            X, ntree_limit=xgb_best.best_ntree_limit)[:, 1]

    if not os.path.isdir(OUTPUT_DIR):
        os.mkdir(OUTPUT_DIR)
    output_path = os.path.join(OUTPUT_DIR, args.outputfile)
    write_output(ids, IDS_COL, y_pred, LABEL_COL, output_path)
test_data = random.sample(train_data, int(len(train_data) * .1))
train_data = [post for post in train_data if post not in test_data]
train_data = np.array(train_data)

# initialize model
model = pizza_model.PizzaModel(params)

# train
# model.train(train_data)
# test
# predictions = model.test(test_data)

# K-fold testing
kf = cross_validation.KFold(len(train_data), n_folds=10)
predictions = np.zeros(len(train_data))
for train_index, test_index in kf:
    train_fold, test_fold = train_data[train_index], train_data[test_index]
    model.train(train_fold)
    predictions[test_index] = model.test(test_fold)
desired = utils.get_labels_from_post_list(train_data)

# evaluate
if not params.TESTING:
    # desired = utils.get_labels_from_post_list(test_data)
    print(classification_report(desired, predictions))
else:
    # write output to file
    utils.write_output(test_data, predictions)
def main():
    global cuda
    cuda = torch.cuda.is_available()
    if cuda:
        train_sequence.cuda = cuda
        sequence_tagger.cuda = cuda
        utils.cuda = cuda
        train_sequence_crafted.cuda = cuda

    if args.crafted:
        this_train_sequence = train_sequence_crafted
    else:
        this_train_sequence = train_sequence

    utils.log('start reading ner file')
    (token_list, tag_list, raw_token_list) = utils.prepare_data(args.input, True)
    vocabs = pickle.load(open(args.vocab_path, 'rb'))
    y = list(map(lambda x: np.array(list(map(lambda y: vocabs['y_dict'][y], x))),
                 tag_list))
    x = tdh.build_input_data(token_list, vocabs['vocabulary'])

    # extract crafted features
    train_data = utils.get_data_with_pos_tag(raw_token_list, tag_list)
    features = utils.extract_features(train_data, vocabs['uptl'],
                                      vocabs['treatment_suffix'],
                                      vocabs['disease_suffix'], vocabs['dis'])
    ds_data = {'x': x, 'y': y, 'z': features}
    ds = sequence_dataset.sequence_dataset(
        '.', 'test', ds_data,
        word_counts=vocabs['word_counts'],
        vocabulary_inv=vocabs['vocabulary_inv'],
        crafted_features=args.crafted)
    val_loader = DataLoader(
        ds,
        batch_sampler=data_samplers.BatchSampler(
            list(map(lambda x: min(999999, len(x[0])), ds)), 256, shuffle=False),
        num_workers=4)
    vocab_size = ds.vocab_size
    embedding_init = vocabs['embedding_init']
    embedding_init = embedding_init[:vocab_size]

    if args.model == 'bilstm':
        if args.crafted:
            model = sequence_tagger.BilstmSequenceTaggerCraftedFeatures(
                len(vocabs['y_dict']), vocab_size,
                embedding_size=embedding_init.shape[1],
                hidden_size=args.hidden_size,
                intermediate_size=args.intermediate_size,
                embedding_init=embedding_init,
                crafted_features_size=args.num_crafted)
            criterion = nn.CrossEntropyLoss()
            if cuda:
                criterion.cuda()
            my_loss_fn = lambda x, y, z, m: utils.std_loss_fn_crafted(
                x, y, z, m, criterion)
        else:
            model = sequence_tagger.BilstmSequenceTagger(
                len(vocabs['y_dict']), vocab_size,
                embedding_size=embedding_init.shape[1],
                hidden_size=args.hidden_size,
                intermediate_size=args.intermediate_size,
                embedding_init=embedding_init)
            criterion = nn.CrossEntropyLoss()
            if cuda:
                criterion.cuda()
            my_loss_fn = lambda x, y, m: utils.std_loss_fn(x, y, m, criterion)
    else:
        model = sequence_tagger.BilstmCRFSequenceTagger(
            len(vocabs['y_dict']), vocab_size,
            embedding_size=embedding_init.shape[1],
            hidden_size=args.hidden_size,
            intermediate_size=args.intermediate_size,
            embedding_init=embedding_init)
        my_loss_fn = utils.lstm_crf_neg_log_likelihood_loss1

    checkpoint = torch.load(args.checkpoint)
    model.load_state_dict(checkpoint['model'])
    rec, i, all_pred = this_train_sequence.compute_sequence(
        -1, model, my_loss_fn, val_loader, None, 'eval', None, None, [],
        return_preds=True)
    utils.write_output(all_pred, raw_token_list, vocabs['y_dict_inv'], args.output)
        # print progress only ten times; expensive because of get_score
        if cur_step % (num_steps // 10) == 0:
            progress_printer.print(
                cur_step, get_score(vehicle_to_rides, bonus, num_steps))
    return vehicle_to_rides


def to_string(vehicle_to_rides):
    lines = []
    for vehicle in vehicle_to_rides:
        rides = vehicle_to_rides[vehicle]
        lines.extend(
            [f"{len(rides)} {' '.join(map(lambda r: str(r.number), rides))}"])
    return lines


if __name__ == '__main__':
    instances = ['a_example', 'b_should_be_easy', 'c_no_hurry',
                 'd_metropolis', 'e_high_bonus']
    for instance in instances:
        print(f'\n\033[95msolving instance {instance}:\033[0m')
        rows, columns, num_vehicles, num_rides, bonus, num_steps, rides = \
            utils.read_input(instance, get_input)
        vehicle_to_rides = get_greedy_solution2(num_vehicles, rides, bonus,
                                                num_steps)
        utils.write_output(instance, to_string(vehicle_to_rides))
def get_initial_allocation(cache_importance, video_sizes, num_caches, cache_size):
    allocation = {c: set() for c in range(num_caches)}
    space_left = {c: cache_size for c in range(num_caches)}
    already_cached = set()  # set membership keeps the inner check O(1)
    for cache in cache_importance:
        for video, _ in cache_importance[cache]:
            if video_sizes[video] <= space_left[cache] and video not in already_cached:
                already_cached.add(video)
                allocation[cache].add(video)
                space_left[cache] -= video_sizes[video]
    return allocation


def to_string(allocation):
    used_caches = {cache for cache in allocation if allocation[cache]}
    lines = [f"{len(used_caches)}"]
    for cache in allocation:
        lines.extend([f"{cache} {' '.join(map(str, allocation[cache]))}"])
    return lines


if __name__ == '__main__':
    instances = ['example', 'me_at_the_zoo', 'videos_worth_spreading',
                 'trending_today', 'kittens']
    for instance in instances:
        print(f'\n\033[95msolving instance {instance}:\033[0m')
        latencies, requests, video_sizes, cache_size, num_caches = \
            utils.read_input(instance, get_input)
        cache_importance = get_cache_importance(latencies, requests,
                                                video_sizes, num_caches)
        allocation = solve_by_local_search(cache_importance, latencies, requests,
                                           video_sizes, cache_size, num_caches)
        utils.write_output(instance, to_string(allocation))
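# `utils.write_output(instance, lines)` is shared by the instance-driven
# snippets above but not shown. A minimal sketch, assuming it joins the
# formatted lines and writes them to a file named after the instance; the
# ".out" suffix is an assumption.
def write_output(instance, lines):
    with open(instance + '.out', 'w') as f:
        f.write('\n'.join(lines) + '\n')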
def populate_graph(graph, node1, node2):
    if node1 in graph:
        graph[node1].append(node2)
    else:
        graph[node1] = [node2]
    return graph


def calc(lines):
    parser = re.compile("[A-Z0-9]+")
    values = [tuple(parser.findall(line.strip())) for line in lines]
    graph = {}
    nodes = []
    for value in values:
        nodes.append(value[1])
        graph = populate_graph(graph, value[0], value[1])
    you_path = find_path(graph, "YOU", "COM", [])
    san_path = find_path(graph, "SAN", "COM", [])
    xor = set(you_path) ^ set(san_path)
    return len(xor)


if __name__ == '__main__':
    lines = read_input()
    result = str(calc(lines))
    write_output(result)
    check_result(result)
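# `find_path` is not shown above. A minimal recursive depth-first sketch,
# assuming it returns the list of nodes from `start` to `end` (the traversal
# direction over this adjacency map and the return convention are assumptions).
def find_path(graph, start, end, path):
    path = path + [start]
    if start == end:
        return path
    for node in graph.get(start, []):
        if node not in path:  # avoid revisiting nodes already on this path
            result = find_path(graph, node, end, path)
            if result:
                return result
    return None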
if args.max_level is None:
    max_level = class_tree.get_height()
else:
    max_level = args.max_level

wstc = WSTC(input_shape=x.shape,
            class_tree=class_tree,
            max_level=max_level,
            sup_source=args.sup_source,
            y=y,
            vocab_sz=vocab_sz,
            word_embedding_dim=word_embedding_dim,
            block_thre=args.gamma,
            block_level=args.block_level)

total_counts = sum(word_counts[ele] for ele in word_counts)
total_counts -= word_counts[vocabulary_inv_list[0]]
background_array = np.zeros(vocab_sz)
for i in range(1, vocab_sz):
    background_array[i] = word_counts[vocabulary_inv[i]] / total_counts

for level in range(max_level):
    y_pred = proceed_level(x, sequences, wstc, args, pretrain_epochs, self_lr,
                           decay, update_interval, delta, class_tree, level,
                           expand_num, background_array, max_doc_length,
                           max_sent_length, len_avg, len_std, beta, alpha,
                           vocabulary_inv, common_words)
    write_output(y_pred, perm, class_tree, './' + args.dataset)
compute_metrics(y_pred, y)
    model = CrossEntropyClassifier(nclasses)
    history = model.fit(Xtrain, Ytrain, best, lr)
elif classifier == 'svm_ovo':
    K = svm_kernel.build_K(Xtrain)
    if do_validation:
        model = KernelSVMOneVsOneClassifier(nclasses, svm_kernel)
        model.fit(Xtrain, Ytrain, C, validation, K=K, check=True)
    if do_prediction:
        model = KernelSVMOneVsOneClassifier(nclasses, svm_kernel)
        model.fit(Xtrain, Ytrain, C, K=K)
elif classifier == 'svm_ova':
    K = svm_kernel.build_K(Xtrain)
    if do_validation:
        model = KernelSVMOneVsAllClassifier(nclasses, svm_kernel)
        model.fit(Xtrain, Ytrain, C, validation, K=K, check=True)
    if do_prediction:
        model = KernelSVMOneVsAllClassifier(nclasses, svm_kernel)
        model.fit(Xtrain, Ytrain, C, K=K)
else:
    raise Exception("Unknown classifier")

if do_prediction:
    print("Predicting on test data")
    Ytest = model.predict(Xtest)
    write_output(Ytest, 'results/Yte_' + output_suffix + '.csv')