def table_tokenizer(table, uncase=True):
    textTable = []
    # maxLen = 0
    for text in tqdm(table, file=sys.stdout):
        text = text_cleaner(text, uncase=uncase)
        textTable.append(text)
    return textTable
def filter_entailment(args):
    with open(args.output_file, 'w') as outfile, open(args.input_file, 'r') as infile:
        print("Writing to {} from {}".format(args.output_file, args.input_file))
        line_tqdm = tqdm(infile, dynamic_ncols=True)
        filtered_entailment = {}
        # Filter by entailment score
        for line in line_tqdm:
            qa_json = json.loads(line)
            if qa_json['score'] > args.min_entail_score:
                filtered_entailment.setdefault(qa_json["id"], []).append(qa_json)
            # line_tqdm.set_postfix(hits=num_q)
        num_q = 0
        for qid, qa_list in filtered_entailment.items():
            print('\r q', num_q, end="")
            # Filter number of entailment texts
            if len(qa_list) <= args.max_entail_docs:
                continue
            qa_list = sorted(qa_list, key=lambda qa: qa['score'], reverse=True)
            qa_list = qa_list[:args.max_entail_docs]
            # Write to outfile
            output_dict = create_output_dict(qa_list)
            outfile.write(json.dumps(output_dict) + '\n')
            num_q += 1
        print()
def main():
    ''' Runs this script. '''
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'output_folder',
        type=str,
        help='Path to the root folder where data will be stored.')
    parser.add_argument('--num',
                        dest='num_cars',
                        type=int,
                        required=True,
                        help='Amount of cars to crawl.')
    parser.add_argument(
        '--from-cheapest',
        action='store_true',
        required=True,
        help='If set, crawls the cars starting from the cheapest one, '
        'otherwise from the most expensive one.')
    parser.add_argument('--save-every',
                        type=int,
                        default=100,
                        help='Interval between two security saves.')
    parser.add_argument(
        '--max-pages',
        type=int,
        default=10000,
        help='Sets the maximum number of pages to crawl. Avoids infinite loops.')
    args = parser.parse_args([
        '/home/tom/second_hand_cars_data',
        '--num', '1000',
        '--from-cheapest',
        '--save-every', '200',
        '--max-pages', '1000',
    ])

    img_folder = os.path.join(args.output_folder, 'imgs')
    sort_value = from_most_expensive_code
    if args.from_cheapest:
        # NOTE: both branches currently use the "most expensive" sort code.
        sort_value = from_most_expensive_code

    with CrawlerStatus(status_folder=args.output_folder) as status:
        for page_id in tqdm(range(args.max_pages)):
            page_url = base_url.format(page_id, sort_value)
            try:
                crawl_page(page_url, status, img_folder, args.save_every)
            except Exception:
                # Could not finish crawling this page, go to the next one.
                logging.error('Unable to crawl page: {}'.format(page_url))
                continue
            if status.size >= args.num_cars:
                # Collected enough cars: quit crawling.
                break
def text_cleaner(table):
    textTable = []
    maxLen = 0
    for text in tqdm(table, file=sys.stdout):
        text = get_words(text)
        textTable.append(text)
        if len(text) > maxLen:
            maxLen = len(text)
    return textTable, maxLen
def add_retrieved_text(qa_file, output_file):
    with open(output_file, 'w') as output_handle, open(qa_file, 'r') as qa_handle:
        print("Writing to {} from {}".format(output_file, qa_file))
        line_tqdm = tqdm(qa_handle, dynamic_ncols=True)
        for line in line_tqdm:
            json_line = json.loads(line)
            num_hits = 0
            for output_dict in add_hits_to_qajson(json_line):
                output_handle.write(json.dumps(output_dict) + "\n")
                num_hits += 1
            line_tqdm.set_postfix(hits=num_hits)
def csv_processing(path, test=False):
    texts_1 = []
    texts_2 = []
    labels = []
    test_ids = []
    with codecs.open(path, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=',')
        header = next(reader)
        if not test:
            for values in tqdm(reader):
                texts_1.append(text_to_wordlist(values[3]))
                texts_2.append(text_to_wordlist(values[4]))
                labels.append(int(values[5]))
            return texts_1, texts_2, labels
        else:
            for values in tqdm(reader):
                texts_1.append(text_to_wordlist(values[1]))
                texts_2.append(text_to_wordlist(values[2]))
                test_ids.append(values[0])
            return texts_1, texts_2, test_ids
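# A minimal usage sketch for csv_processing above, assuming a Quora-style
# train.csv/test.csv column layout and a text_to_wordlist helper in scope.
# The file paths below are placeholders.
train_q1, train_q2, train_labels = csv_processing('data/train.csv')
test_q1, test_q2, test_ids = csv_processing('data/test.csv', test=True)
print(len(train_q1), 'training pairs,', len(test_ids), 'test rows')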
def add_retrieved_text(qa_file, output_file, query_mode='common'):
    print("Query mode is {}".format(query_mode))
    with open(output_file, 'w') as output_handle, open(qa_file, 'r') as qa_handle:
        print("Writing to {} from {}".format(output_file, qa_file))
        line_tqdm = tqdm(qa_handle, dynamic_ncols=True)
        # read json
        for line in line_tqdm:
            json_line = json.loads(line)
            qa_json = add_hits_to_qajson(json_line, query_mode)
            output_handle.write(json.dumps(qa_json) + "\n")
def reformulate_query(self, qa_file, output_file):
    with open(output_file, 'w') as reform_qa, open(qa_file, 'r') as origin_qa:
        print("Writing to {} from {}".format(output_file, qa_file))
        line_tqdm = tqdm(origin_qa, dynamic_ncols=True)
        for line in line_tqdm:
            json_line = json.loads(line)
            num_reform = 0
            for output_dict in self.reform_query_to_qajson(json_line):
                reform_qa.write(json.dumps(output_dict) + "\n")
                num_reform += 1
            line_tqdm.set_postfix(hits=num_reform)
def load_emb(vocab):
    print("Reading pre-trained embeddings")
    embeddings = np.random.normal(0, 0.01, (len(vocab['w2i']), 300))
    with open("/home/data/glove/glove.840B.300d.txt", "r") as embed_in:
        line_tqdm = tqdm(embed_in, dynamic_ncols=True)
        for idx, line in enumerate(line_tqdm):
            row = line.split()
            if len(row) != 301:
                continue
            if row[0] in vocab['w2i']:
                embeddings[vocab['w2i'][row[0]], :] = np.asarray(
                    [float(v) for v in row[1:]])
    embeddings[vocab['w2i']['<pad>']] = np.zeros((1, 300))
    return embeddings
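# A minimal usage sketch for load_emb above. The tiny vocabulary below is a
# placeholder; the real 'w2i' mapping comes from the surrounding pipeline, and
# the GloVe path is hard-coded inside load_emb.
vocab = {'w2i': {'<pad>': 0, 'the': 1, 'cat': 2}}
embeddings = load_emb(vocab)   # shape: (len(vocab['w2i']), 300)
print(embeddings.shape)        # the '<pad>' row is reset to all zeros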
def generate_pkl_nis(pred_dets_hdf5, best_binary_file, out_dir, file_name):
    pred_dets = h5py.File(pred_dets_hdf5, 'r')
    binary_file = h5py.File(best_binary_file, 'r')
    print(pred_dets_hdf5)
    print(best_binary_file)
    assert len(pred_dets.keys()) == 4539
    print(len(binary_file))
    hoi_list = io.load_json_object("data/vcoco/annotations/hoi_list_234.json")
    hoi_dict = {int(hoi["id"]) - 1: hoi for hoi in hoi_list}
    result_list = []
    for global_id in tqdm(pred_dets.keys()):
        image_id = int(global_id.split("_")[1])
        start_end_ids = pred_dets[global_id]['start_end_ids']
        assert len(start_end_ids) == 234
        for hoi_id in range(234):
            start_id, end_id = pred_dets[global_id]['start_end_ids'][int(hoi_id)]
            if start_id == end_id:
                continue
            for j in range(start_id, end_id):
                hoi_dets = pred_dets[global_id]['human_obj_boxes_scores'][j]
                inter_score = binary_file[global_id]["binary_score_data"][j]
                final_score = hoi_dets[8] * inter_score * hoi_dets[9]
                person_boxes = hoi_dets[:4].tolist()
                per_image_dict = {}
                per_image_dict["image_id"] = image_id
                per_image_dict["person_box"] = person_boxes
                action = hoi_dict[hoi_id]["verb"]
                role = hoi_dict[hoi_id]["role"]
                per_image_dict[action + "_" + role] = [
                    hoi_dets[4], hoi_dets[5], hoi_dets[6], hoi_dets[7],
                    final_score
                ]
                result_list.append(per_image_dict)
    io.dump_pickle_object(result_list,
                          os.path.join(out_dir, file_name + ".pkl"))
def w2vEmbdReader(embd_path, reVocab, embd_dim):
    logger.info(' getting pre-trained embedding from file... ')
    logger.info(' embedding length: %i dim: %i ' % (len(reVocab), embd_dim))
    embd_matrix = np.zeros((len(reVocab), embd_dim))
    with open(embd_path, 'r', encoding='utf8') as fhd:
        idx = 1  # leave the 1st (padding) row all zeros
        for line in tqdm(fhd, total=len(reVocab)):
            elem = line.strip().split(' ')
            assert len(elem) == embd_dim + 1, \
                'Incorrect Embedding Dimension, expect %d but got %d ' % (
                    embd_dim, len(elem) - 1)
            w2vec = np.asarray(elem[1:], dtype='float32')
            embd_matrix[idx] = w2vec
            idx += 1
    return embd_matrix
def test(model, test_para, test_relation, test_label, loss_function=None, pred_flag=False):
    total_loss = []
    model.eval()
    pred = []
    N = len(test_para)
    test_size = 16
    line_tqdm = tqdm(range(N // test_size + 1), dynamic_ncols=True)
    for i in line_tqdm:
        para_test = test_para[i * test_size:min((i + 1) * test_size, N)]
        relation_test = test_relation[i * test_size:min((i + 1) * test_size, N)]
        label_test = test_label[i * test_size:min((i + 1) * test_size, N)]
        score = model(para_test, relation_test)
        target = Variable(torch.LongTensor(label_test))
        if model.use_cuda:
            target = target.cuda()
        if loss_function is not None:
            loss = loss_function(score, target)
            total_loss.extend(loss.data.cpu().numpy().tolist())
        pred.extend(torch.argmax(score, dim=-1).cpu().tolist())
    acc, precision, recall, f1 = getScores(pred, test_label)
    if loss_function is not None:
        print("\t\tLoss: {:0.5f}".format(sum(total_loss) / len(total_loss)))
    print("\t\tAccuracy: {:0.5f}".format(acc))
    print("\t\tPrecision: {:0.5f}".format(precision))
    print("\t\tRecall: {:0.5f}".format(recall))
    print("\t\tF1: {:0.5f}".format(f1))
    if loss_function is not None:
        out = (acc, precision, recall, f1, sum(total_loss) / len(total_loss))
    else:
        out = (acc, precision, recall, f1)
    if pred_flag:
        out = (pred, ) + out
    return out
def download(self, url, output_path):
    # Streaming, so we can iterate over the response.
    r = requests.get(url, stream=True)
    # Total size in bytes.
    total_size = int(r.headers.get('content-length', 0))
    block_size = 1024
    wrote = 0
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'wb') as f:
        for data in tqdm(r.iter_content(block_size),
                         total=math.ceil(total_size / block_size),
                         unit='KB',
                         unit_scale=True):
            wrote = wrote + len(data)
            f.write(data)
    if total_size != 0 and wrote != total_size:
        raise ConnectionError("ERROR, something went wrong")
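# A standalone sketch of the same streaming-download-with-tqdm pattern, shown
# as a free function so it can run outside the class above. The URL and output
# path in the commented call are placeholders.
import math
import os

import requests
from tqdm import tqdm


def download_file(url, output_path, block_size=1024):
    r = requests.get(url, stream=True)
    total_size = int(r.headers.get('content-length', 0))
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    with open(output_path, 'wb') as f:
        # One progress tick per chunk read from the response.
        for chunk in tqdm(r.iter_content(block_size),
                          total=math.ceil(total_size / block_size),
                          unit='KB',
                          unit_scale=True):
            f.write(chunk)


# download_file('https://example.com/archive.zip', 'downloads/archive.zip')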
def tokenizeIt(table, clean=False, addHead=None):
    tokenizedTable = []
    maxLen = 0
    for text in tqdm(table, file=sys.stdout):
        if clean:
            # text = stripTagsAndUris(text)
            text = word_tokenize(get_words(text))
            if addHead is not None:
                text = [addHead] + text
            tokenizedTable.append(text)
            if len(text) > maxLen:
                maxLen = len(text)
        else:
            text = str(text).split(' ')
            if addHead is not None:
                text = [addHead] + text
            tokenizedTable.append(text)
            if len(text) > maxLen:
                maxLen = len(text)
    return tokenizedTable, maxLen
def add_retrieved_text(args):
    es_search = EsSearch(es_client="node008",
                         max_hits_retrieved=args.num_retrieve * 2,
                         min_hit_length=5,
                         max_hit_length=100,
                         max_hits_per_choice=args.num_retrieve)
    with open(args.output_file, 'w') as output_handle, open(args.input_file, 'r') as qa_handle:
        print("Writing to {} from {}".format(args.output_file, args.input_file))
        line_tqdm = tqdm(qa_handle, dynamic_ncols=True)
        for line in line_tqdm:
            json_line = json.loads(line)
            num_hits = 0
            for output_dict in add_hits_to_qajson(es_search, json_line, args.num_retrieve):
                output_handle.write(json.dumps(output_dict) + "\n")
                num_hits += 1
            line_tqdm.set_postfix(hits=num_hits)
def run_exp(configs, n_jobs=1):
    datasets = get_datasets(configs['data'])
    exp_results = []
    for dataset, data in datasets.items():
        for encoder_config in tqdm(configs['encoder'],
                                   'Process: {}'.format(dataset)):
            transformer_name, transformer = init_transformer(encoder_config, data)
            exps = make_exp(data, transformer, configs)
            exp_results.append({
                'dataset': dataset,
                'transformer': transformer_name,
                'metrics': Parallel(n_jobs=n_jobs, prefer='threads')(
                    delayed(run_one_exp)(exp) for exp in exps)
            })
    return exp_results
                               pin_memory=True,
                               drop_last=False,
                               listen='*:%d' % (args.port + 3),
                               timeout=600)
testloader2 = RemoteDataLoader(augmented_dataset_t,
                               batch_size=1,
                               shuffle=False,
                               pin_memory=True,
                               drop_last=False,
                               listen='*:%d' % (args.port + 4),
                               timeout=600)

logger.info(f'---- use data cache @ {cache_prefix} ---- ')
trainloader = CachedDataLoader(trainloader, tag=f'{cache_prefix}_tr', cache_max=MAX_CACHE_EPOCH)
# testloader1 = CachedDataLoader(testloader1, tag=f'{cache_prefix}_ts1', cache_max=CachedDataLoader.NO_SEPARATE_EPOCH)
# validloader = CachedDataLoader(validloader, tag=f'{cache_prefix}_ts2', cache_max=CachedDataLoader.NO_SEPARATE_EPOCH)
# testloadera = CachedDataLoader(testloadera, tag=f'{cache_prefix}_tsa', cache_max=CachedDataLoader.NO_SEPARATE_EPOCH)
# testloader2 = CachedDataLoader(testloader2, tag=f'{cache_prefix}_ts3', cache_max=CachedDataLoader.NO_SEPARATE_EPOCH)

trainloader = tqdm(trainloader)
for epoch in range(args.epoch):
    cnt = 0
    start_t = time.time()
    for _ in tqdm(trainloader, desc="%04d" % epoch):
        cnt += 1
        if cnt > 38400:
            break
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)

    # Return a list of words
    return text


if load_train_test_pkl == '':
    texts_1 = []
    texts_2 = []
    labels = []
    with codecs.open(TRAIN_DATA_FILE, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=',')
        header = next(reader)
        for values in tqdm(reader):
            texts_1.append(text_to_wordlist(values[3]))
            texts_2.append(text_to_wordlist(values[4]))
            labels.append(int(values[5]))
    print('Found %s texts in train.csv' % len(texts_1))

    test_texts_1 = []
    test_texts_2 = []
    test_ids = []
    with codecs.open(TEST_DATA_FILE, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=',')
        header = next(reader)
        for values in tqdm(reader):
            test_texts_1.append(text_to_wordlist(values[1]))
            test_texts_2.append(text_to_wordlist(values[2]))
            test_ids.append(values[0])
def main(args):
    print("Load Data")
    print(args.train_data, args.dev_data)
    files = {'train': args.train_data, 'dev': args.dev_data}

    model_name = 'gpt2'
    tokenizer = GPT2Tokenizer.from_pretrained(model_name, pad_token='<PAD>')
    # add tokens for precondition generation
    tokenizer.add_tokens([
        '<sep>', '<event>', '</event>', '<pre>', '</pre>', '<eos>', '[BLANK]'
    ])
    encdec = GPT2LMHeadModel.from_pretrained(model_name)
    encdec.resize_token_embeddings(len(tokenizer))

    # dataset load
    dataset = load_data(files, max_len=args.max_sequence_length, eos='<eos>')

    if args.load_model is not None:
        model = torch.load(args.load_model)
    else:
        model = Model(tokenizer, encdec)
    if model.use_cuda:
        model.cuda()

    data_input, gen_seed, target, target_weights = prepare(dataset, tokenizer)

    # Set a path for saving the model
    save_model_path = os.path.join(args.save_model_path, args.experiment)
    if not os.path.exists(save_model_path):
        os.makedirs(save_model_path)

    # Optimizer
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, eps=1e-8)

    n_params = sum([np.prod(p.size()) for p in model.parameters()])
    print("#parameters: {}".format(n_params))

    N = len(data_input['train'])
    print(N // args.batch_size)
    best_dev_loss = 9999
    for epoch in range(1, args.epochs + 1):
        print("Epoch {}:".format(epoch))
        start_time = time.time()
        batch_idxs = np.random.permutation(N // args.batch_size + 1)
        line_tqdm = tqdm(batch_idxs, dynamic_ncols=True)
        total_loss = []
        model.train()
        for batch_idx in line_tqdm:
            enc_input = data_input['train'][batch_idx * args.batch_size:min(
                (batch_idx + 1) * args.batch_size, N)]
            tmp = gen_seed['train'][batch_idx * args.batch_size:min(
                (batch_idx + 1) * args.batch_size, N)]
            event_lens = [len(s) for s in tmp]
            if len(enc_input) == 0:
                continue
            model.zero_grad()
            loss = model(enc_input, copy.deepcopy(enc_input), event_lens)
            total_loss.append(loss.data.cpu().numpy().tolist())
            loss.backward()
            optimizer.step()
        gc.collect()
        torch.cuda.empty_cache()
        end_time = time.time()
        print("Time elapsed: {:.3f}".format(end_time - start_time))
        print("Loss: {}".format(sum(total_loss) / len(total_loss)))

        model.eval()
        with torch.no_grad():
            for set_info in ['train', 'dev']:
                NN = len(data_input[set_info])
                total_loss = []
                for idx in range(NN // args.batch_size):
                    enc_input = data_input[set_info][idx * args.batch_size:min(
                        (idx + 1) * args.batch_size, NN)]
                    tmp = gen_seed[set_info][idx * args.batch_size:min(
                        (idx + 1) * args.batch_size, NN)]
                    event_lens = [len(s) for s in tmp]
                    if len(enc_input) == 0:
                        continue
                    loss = model(enc_input, copy.deepcopy(enc_input), event_lens)
                    total_loss.append(loss.data.cpu().numpy().tolist())
                loss = sum(total_loss) / len(total_loss)
                print("Test on {} set:".format(set_info))
                print("\tLoss: {}".format(loss))
                if set_info == 'dev':
                    if best_dev_loss > loss:
                        best_dev_loss = loss
                        torch.save(model, os.path.join(save_model_path, "DevBest.pt"))
            for d, t in zip(gen_seed['dev'][:10], target['dev'][:10]):
                sent = model.generate(d)
                print("Target Event: ", tokenizer.decode(d))
                print("Generated Precondition: ", sent)
                print("Reference: ", tokenizer.decode(t))
    return
def create_feature_map(img_map: dict, model: FeatureExtractor):
    feature_map = {}
    for img_index, img_path in tqdm(list(img_map.items()),
                                    desc="Extracting features.."):
        img = Image.open(img_path)
        feature_map[img_index] = model.extract_features(img)
    return feature_map
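# A minimal usage sketch for create_feature_map above, assuming a
# FeatureExtractor with an extract_features(img) method as used there.
# The image directory and the extractor construction are placeholders.
import os

img_dir = 'data/images'
img_map = {os.path.splitext(name)[0]: os.path.join(img_dir, name)
           for name in os.listdir(img_dir)}
extractor = FeatureExtractor()  # construction details depend on the project
features = create_feature_map(img_map, extractor)
print(len(features), 'feature vectors extracted')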
else:
    ts.insert(corrupt_idx,
              lambda img: PIL.Image.fromarray(
                  corrupt(np.array(img), corrupt_level, None, int(corrupt_type))))
transform_test = transforms.Compose(ts)

testset = ImageNet(root='/data/public/rw/datasets/imagenet-pytorch', split='val',
                   transform=transform_test)
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
for _ in range(1):
    sss = sss.split(list(range(len(testset))), testset.targets)
    train_idx, valid_idx = next(sss)
testset = Subset(testset, valid_idx)

testloader = torch.utils.data.DataLoader(testset,
                                         batch_size=args.test_batch,
                                         shuffle=False,
                                         num_workers=32,
                                         pin_memory=True,
                                         drop_last=False)

metric = Accumulator()
dl_test = tqdm(testloader)
data_id = 0
tta_rule_cnt = [0] * tta_num
for data, label in dl_test:
    data = data.view(-1, data.shape[-3], data.shape[-2], data.shape[-1])
    data = data.cuda()
    with torch.no_grad():
        preds = model_target(data)
        preds = torch.softmax(preds, dim=1)
        preds = preds.view(len(label), -1, preds.shape[-1])

        preds_merged = torch.mean(preds, dim=1)        # simple averaging
        # TODO : weighted average mean?
        # preds_merged = torch.max(preds, dim=1)[0]    # simple maximum peak
def main():
    parser = argparse.ArgumentParser(
        description='XGB with Handcrafted Features')
    parser.add_argument('--save',
                        type=str,
                        default='XGB_leaky',
                        help='save_file_names')
    args = parser.parse_args()

    timestr = time.strftime("%Y%m%d-%H%M%S-")
    output_dir = '../output/' + time.strftime("%m%d")
    # mkdir(output_dir)

    print("Reading train features...")
    df_train = pd.read_csv(train_feature, encoding="ISO-8859-1")
    X_train_ab = df_train.iloc[:, 2:]
    # X_train_ab = X_train_ab.drop('euclidean_distance', axis=1)
    # X_train_ab = X_train_ab.drop('jaccard_distance', axis=1)

    print("Reading train material...")
    df_train = pd.read_csv(train_file)
    df_train = df_train.fillna(' ')
    print("Reading test material...")
    df_test = pd.read_csv(test_file)

    ques = pd.concat([df_train[['question1', 'question2']],
                      df_test[['question1', 'question2']]],
                     axis=0).reset_index(drop='index')
    q_dict = defaultdict(set)
    for i in tqdm(range(ques.shape[0])):
        q_dict[ques.question1[i]].add(ques.question2[i])
        q_dict[ques.question2[i]].add(ques.question1[i])

    def q1_freq(row):
        return len(q_dict[row['question1']])

    def q2_freq(row):
        return len(q_dict[row['question2']])

    def q1_q2_intersect(row):
        return len(
            set(q_dict[row['question1']]).intersection(
                set(q_dict[row['question2']])))

    df_train['q1_q2_intersect'] = df_train.apply(q1_q2_intersect, axis=1, raw=True)
    df_train['q1_freq'] = df_train.apply(q1_freq, axis=1, raw=True)
    df_train['q2_freq'] = df_train.apply(q2_freq, axis=1, raw=True)

    df_test['q1_q2_intersect'] = df_test.apply(q1_q2_intersect, axis=1, raw=True)
    df_test['q1_freq'] = df_test.apply(q1_freq, axis=1, raw=True)
    df_test['q2_freq'] = df_test.apply(q2_freq, axis=1, raw=True)

    test_leaky = df_test.loc[:, ['q1_q2_intersect', 'q1_freq', 'q2_freq']]
    del df_test
    train_leaky = df_train.loc[:, ['q1_q2_intersect', 'q1_freq', 'q2_freq']]

    # explore
    stops = set(stopwords.words("english"))
    df_train['question1'] = df_train['question1'].map(
        lambda x: str(x).lower().split())
    df_train['question2'] = df_train['question2'].map(
        lambda x: str(x).lower().split())
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist())
    words = [x for y in train_qs for x in y]
    counts = Counter(words)
    weights = {word: get_weight(count) for word, count in counts.items()}

    print('Building Features')
    X_train = build_features(df_train, stops, weights)
    X_train = pd.concat((X_train, X_train_ab, train_leaky), axis=1)
    y_train = df_train['is_duplicate'].values

    df_train1 = pd.read_csv(train_file)
    X_train1 = pd.concat((df_train1, X_train), axis=1)
    X_train1.to_csv(output_dir + '/' + timestr + 'train_extra_features.csv',
                    index=False)
    del df_train1, X_train1
    del df_train, X_train_ab, train_leaky
    print('Dumped train extra features to file ' + timestr +
          'train_extra_features.csv')

    X_train, X_valid, y_train, y_valid = train_test_split(X_train,
                                                          y_train,
                                                          test_size=0.1,
                                                          random_state=4242)

    # Up/Down sampling
    print("Train Sampling...")
    pos_train = X_train[y_train == 1]
    neg_train = X_train[y_train == 0]
    X_train = pd.concat(
        (neg_train, pos_train.iloc[:int(0.8 * len(pos_train))], neg_train))
    y_train = np.array([0] * neg_train.shape[0] +
                       [1] * pos_train.iloc[:int(0.8 * len(pos_train))].shape[0] +
                       [0] * neg_train.shape[0])
    print(np.mean(y_train))
    del pos_train, neg_train

    print("Valid Sampling...")
    pos_valid = X_valid[y_valid == 1]
    neg_valid = X_valid[y_valid == 0]
    X_valid = pd.concat(
        (neg_valid, pos_valid.iloc[:int(0.8 * len(pos_valid))], neg_valid))
    y_valid = np.array([0] * neg_valid.shape[0] +
                       [1] * pos_valid.iloc[:int(0.8 * len(pos_valid))].shape[0] +
                       [0] * neg_valid.shape[0])
    print(np.mean(y_valid))
    del pos_valid, neg_valid

    params = {}
    params['objective'] = 'binary:logistic'
    params['eval_metric'] = 'logloss'
    params['eta'] = 0.02
    params['max_depth'] = 7
    params['subsample'] = 0.6
    params['base_score'] = 0.2
    # params['scale_pos_weight'] = 0.2

    print("DMatrix...")
    d_train = xgb.DMatrix(X_train, label=y_train)
    d_valid = xgb.DMatrix(X_valid, label=y_valid)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    print("XGBoost training...")
    bst = xgb.train(params,
                    d_train,
                    2500,
                    watchlist,
                    early_stopping_rounds=50,
                    verbose_eval=50)
    print(log_loss(y_valid, bst.predict(d_valid)))
    bst.save_model(output_dir + '/' + timestr + args.save + '.mdl')

    print('Building Test Features')
    df_test = pd.read_csv(test_feature, encoding="ISO-8859-1")
    x_test_ab = df_test.iloc[:, 2:]
    # x_test_ab = x_test_ab.drop('euclidean_distance', axis=1)
    # x_test_ab = x_test_ab.drop('jaccard_distance', axis=1)

    df_test = pd.read_csv(test_file)
    df_test = df_test.fillna(' ')
    df_test['question1'] = df_test['question1'].map(
        lambda x: str(x).lower().split())
    df_test['question2'] = df_test['question2'].map(
        lambda x: str(x).lower().split())

    x_test = build_features(df_test, stops, weights)
    x_test = pd.concat((x_test, x_test_ab, test_leaky), axis=1)
    del x_test_ab, test_leaky

    df_test1 = pd.read_csv(test_file)
    x_test1 = pd.concat((df_test1, x_test), axis=1)
    x_test1.to_csv(output_dir + '/' + timestr + 'test_extra_features.csv',
                   index=False)
    del df_test1, x_test1
    print('Dumped test extra features to file ' + timestr +
          'test_extra_features.csv')

    d_test = xgb.DMatrix(x_test)
    p_test = bst.predict(d_test)
    sub = pd.DataFrame()
    sub['test_id'] = df_test['test_id']
    sub['is_duplicate'] = p_test
    sub.to_csv(output_dir + '/' + timestr + args.save + '.csv', index=False)
    print('Dumped inference to file ' + timestr + args.save + '.csv')
    print('Finished.')
    return (x_start, int(x_start + size))


if __name__ == '__main__':
    args = argsProcessor()
    dir = args.dataPath
    if not os.path.isdir(args.outputFiles):
        os.mkdir(args.outputFiles)

    import csv
    with open(args.outputFiles + 'gt.csv', 'a') as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=',',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        for folder in tqdm(os.listdir(dir)):
            a = 0
            # print(str(folder))
            if os.path.isdir(dir + "/" + folder):
                for file in tqdm(os.listdir(dir + "/" + folder)):
                    images_dir = dir + "/" + folder + "/" + file
                    if os.path.isdir(images_dir):
                        list_gt = []
                        tree = ET.parse(images_dir + "/" + file + ".gt")
                        root = tree.getroot()
                        for a in root.iter("frame"):
                            list_gt.append(a)
                        # print(list_gt)
                        for image in os.listdir(images_dir):
'''
Created on Jun 1, 2017

@author: tonyq
'''
import csv

import pandas as pd
from tqdm import tqdm

train = pd.read_csv("../output/0604/20170604-165432-XGB_leaky.csv")
totalen = len(train.is_duplicate)
print('Total size: ', totalen)
fulllist = zip(train.test_id, train.is_duplicate)
length = len(train.is_duplicate)
del train

with open("../output/0604/20170604-165432-XGB_leaky.clean.csv", "w", encoding='utf8') as fwrt:
    writer_sub = csv.writer(fwrt)
    writer_sub.writerow(['test_id', 'is_duplicate'])
    for (theid, dup) in tqdm(fulllist, total=length):
        writer_sub.writerow([theid, dup])
def train(args):
    out_dir = os.path.join(args.logdir, args.experiment)

    # setup tensorboard logging
    if args.tensorboard_logging:
        writer = SummaryWriter(out_dir)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    # load data
    data = load_data('../data/peko_all.jsonl')
    del data['test']

    # load transformer tokenizer, model
    model_name = 'xlnet-base-cased'
    tokenizer = XLNetTokenizer.from_pretrained(model_name, pad_token='<PAD>')
    tokenizer.padding_side = "right"
    encoder = XLNetModel.from_pretrained(model_name)

    # apply tokenizer to data and re-align the token indices
    paragraphs = {}
    relations = {}
    labels = {}
    for set_info, raw_data in data.items():
        paragraphs[set_info], relations[set_info], labels[set_info] = prepare(
            raw_data, tokenizer)

    # model instantiation
    embedding_dim = 768
    model = Model(tokenizer, encoder, embedding_dim, 2)
    if model.use_cuda:
        model.cuda()

    # batchify
    batch_data = batchify(paragraphs['train'], relations['train'],
                          labels['train'], args.batch_size)

    weight = torch.FloatTensor([
        sum(labels['train']) / (len(labels['train']) - sum(labels['train'])),
        1.
    ])
    if model.use_cuda:
        weight = weight.cuda()
    loss_function = nn.NLLLoss(weight=weight, reduction='none')

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=1e-8)

    n_params = sum([np.prod(p.size()) for p in model.parameters()])
    print("#parameters: {}".format(n_params))

    dev_best = 0
    # train the model
    N = len(batch_data)
    for epoch in range(1, args.epochs + 1):
        print("Epoch {}:".format(epoch))
        start_time = time.time()
        total_loss = []
        batch_idxs = np.random.permutation(N)
        line_tqdm = tqdm(batch_idxs, dynamic_ncols=True)
        model.train()
        for batch_idx in line_tqdm:
            para, relation, label = batch_data[batch_idx]
            model.zero_grad()
            score = model(para, relation)
            target = torch.LongTensor(label)
            target = Variable(target)
            if model.use_cuda:
                target = target.cuda()
            loss = loss_function(score, target)
            total_loss.extend(loss.data.cpu().numpy().tolist())
            loss.mean().backward()
            optimizer.step()
        gc.collect()
        torch.cuda.empty_cache()
        end_time = time.time()
        print("Time Elapsed: {:.3f}".format(end_time - start_time))
        print("train Loss: {}".format(sum(total_loss) / len(total_loss)))
        if args.tensorboard_logging:
            writer.add_histogram("losses", np.asarray(total_loss), epoch, bins='auto')
            writer.add_scalar("TRAIN/loss", sum(total_loss) / len(total_loss), epoch)

        for set_info in ['train', 'dev']:
            print("Test on {} set".format(set_info))
            with torch.no_grad():
                acc, precision, recall, f1, loss = test(
                    model, paragraphs[set_info], relations[set_info],
                    labels[set_info], loss_function)
            if args.tensorboard_logging:
                writer.add_scalar("{}/Accuracy".format(set_info.upper()), acc, epoch)
                writer.add_scalar("{}/Precision".format(set_info.upper()), precision, epoch)
                writer.add_scalar("{}/Recall".format(set_info.upper()), recall, epoch)
                writer.add_scalar("{}/F1".format(set_info.upper()), f1, epoch)
                if set_info == 'dev':
                    writer.add_scalar("{}/Loss".format(set_info.upper()), loss, epoch)
            if set_info == 'dev':
                if f1 > dev_best:
                    print("Save Model...\n")
                    torch.save(model, os.path.join(out_dir, 'baseline_best_model.pt'))
                    best_acc = acc
                    best_precision = precision
                    best_recall = recall
                    dev_best = f1

    print("Best Result:")
    print("\tAccuracy: {:0.5f}".format(best_acc))
    print("\tPrecision: {:0.5f}".format(best_precision))
    print("\tRecall: {:0.5f}".format(best_recall))
    print("\tF1: {:0.5f}".format(dev_best))
    return
def run_bert_predict(input_data, pb_path):
    label_list = get_bert_labels()
    cat_to_id = []
    id_to_cat = []
    # id_to_cat, cat_to_id = read_labels(label_path)
    # _, vocab = read_vocab(vocab_path)
    # contents, y_test_cls = get_encoded_texts_and_labels(input_data, vocab, seq_length, cat_to_id)
    global lines
    lines = []
    batch_size_test = 64
    with open(input_data, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    contents = []
    y_test_cls = []
    y_label_cls = []
    for line in lines:
        contents.append(line.split('\t')[0])
        y_label_cls.append(line.split('\t')[1])
    for item in y_label_cls:
        y_test_cls.append(label_list.index(item.strip()))

    with tf.Graph().as_default():
        graph = tf.GraphDef()
        with open(pb_path, "rb") as f:
            graph.ParseFromString(f.read())
        tf.import_graph_def(graph, name="")
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            input_ids = sess.graph.get_tensor_by_name("input_ids:0")
            input_mask = sess.graph.get_tensor_by_name("input_mask:0")
            seg_ids = sess.graph.get_tensor_by_name("segment_ids:0")
            output_tensor_name = sess.graph.get_tensor_by_name("loss/Softmax:0")
            # for line in test_data:
            #     prob = sess.run(output_tensor_name,
            #                     feed_dict={input_ids: np.reshape([line.input_ids], [1, FLAGS.max_seq_length]),
            #                                input_mask: np.reshape([line.input_mask], [1, FLAGS.max_seq_length]),
            #                                seg_ids: line.seg_ids})
            #     label_id = sess.run(tf.argmax(tf.nn.softmax(prob[0], name='softmax')))
            #     label = label_list[label_id]
            #     print("BERT class_id:{}, label: {}, prob:{}".format(label_id, label, prob[0][label_id]))
            #     return prob[0]
            y_pred_cls = []
            for x_batch, y_batch in tqdm(
                    batch_iter_x_y(contents, y_test_cls, batch_size_test)):
                x_batch = process_unsgetext_for_batch(x_batch)
                feed_dict = {
                    input_ids:
                    np.reshape([i.input_ids for i in x_batch[:]],
                               [batch_size_test, seq_length]),
                    input_mask:
                    np.reshape([i.input_mask for i in x_batch[:]],
                               [batch_size_test, seq_length]),
                    seg_ids:
                    np.reshape([i.seg_ids for i in x_batch[:]],
                               [batch_size_test, seq_length])
                }
                # argmax over the class dimension of the [batch, num_classes] softmax output
                y_pred_cls.extend(
                    np.argmax(sess.run(output_tensor_name, feed_dict=feed_dict),
                              axis=1))
            print(y_pred_cls)

    print('===writing log report ======')
    log_dir = os.path.join('.', 'bert-logs')
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    log_path = os.path.join(log_dir, 'result.log')
    f = open(log_path, 'w', encoding='utf-8')
    with open(input_data, 'r', encoding='utf-8') as f_in:
        testdata = f_in.readlines()
    for i in tqdm(range(len(y_test_cls))):
        is_success = 'pass' if (y_pred_cls[i] == y_test_cls[i]) else 'fail'
        f.write(
            str(testdata[i].strip()) + '\t' + id_to_cat[y_pred_cls[i]] + '\t' +
            is_success + "\n")
    f.close()

    print('=====testing=====')
    target_idx = set(list(set(y_test_cls)) + list(set(y_pred_cls)))
    # map classification index into class name
    target_names = [cat_to_id.get(x_batch) for x_batch in target_idx]
    print(
        metrics.classification_report(y_test_cls,
                                      y_pred_cls,
                                      target_names=target_names,
                                      digits=4))
def run_epoch(loader, model, criterion, optimizer, epoch, tag):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    end = time.time()
    if optimizer:
        current_lr = get_learning_rate(optimizer)[0]
    else:
        current_lr = None

    tqdm_disable = bool(os.environ.get('TASK_NAME', ''))  # for KakaoBrain
    loader = tqdm(loader, disable=tqdm_disable)
    loader.set_description('[%s %04d/%04d]' % (tag, epoch, args.epochs))

    for i, (input, target) in enumerate(loader):
        # measure data loading time
        data_time.update(time.time() - end)

        input, target = input.cuda(), target.cuda()
        output = model(input)
        loss = criterion(output, target)

        # measure accuracy and record loss
        losses.update(loss.item(), input.size(0))
        if len(target.size()) == 1:
            err1, err5 = accuracy(output.data, target, topk=(1, 5))
            top1.update(err1.item(), input.size(0))
            top5.update(err5.item(), input.size(0))

        if optimizer:
            # compute gradient and do SGD step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        else:
            del loss, output

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        loader.set_postfix(lr=current_lr,
                           batch_time=batch_time.avg,
                           data_time=data_time.avg,
                           loss=losses.avg,
                           top1=top1.avg,
                           top5=top5.avg)

    if tqdm_disable:
        print('[%s %03d/%03d] %s' % (tag, epoch, args.epochs,
                                     dict(lr=current_lr,
                                          batch_time=batch_time.avg,
                                          data_time=data_time.avg,
                                          loss=losses.avg,
                                          top1=top1.avg,
                                          top5=top5.avg)))

    return top1.avg, top5.avg, losses.avg
def train(self, load_model=False, model_path=None):
    if load_model:
        if model_path is not None:
            self.load_weights(model_path)

    ## Training utterances
    all_input_ids, all_input_len, all_label_ids = convert_examples_to_features(
        self.train_examples, self.label_list, args.max_seq_length,
        self.tokenizer, args.max_turn_length)
    print('all input ids size: ', all_input_ids.size())
    num_train_batches = all_input_ids.size(0)
    num_train_steps = int(num_train_batches / args.train_batch_size /
                          args.gradient_accumulation_steps * args.num_train_epochs)

    logger.info("***** training *****")
    logger.info("  Num examples = %d", len(self.train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    all_input_ids, all_input_len, all_label_ids = all_input_ids.to(DEVICE), all_input_len.to(
        DEVICE), all_label_ids.to(DEVICE)

    train_data = TensorDataset(all_input_ids, all_input_len, all_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    all_input_ids_dev, all_input_len_dev, all_label_ids_dev = convert_examples_to_features(
        self.dev_examples, self.label_list, args.max_seq_length,
        self.tokenizer, args.max_turn_length)

    logger.info("***** validation *****")
    logger.info("  Num examples = %d", len(self.dev_examples))
    logger.info("  Batch size = %d", args.dev_batch_size)

    all_input_ids_dev, all_input_len_dev, all_label_ids_dev = \
        all_input_ids_dev.to(DEVICE), all_input_len_dev.to(DEVICE), all_label_ids_dev.to(DEVICE)

    dev_data = TensorDataset(all_input_ids_dev, all_input_len_dev, all_label_ids_dev)
    dev_sampler = SequentialSampler(dev_data)
    dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=args.dev_batch_size)

    logger.info("Loaded data!")

    if args.fp16:
        self.sumbt_model.half()
    self.sumbt_model.to(DEVICE)

    # ## Get domain-slot-type embeddings
    # slot_token_ids, slot_len = \
    #     get_label_embedding(self.processor.target_slot, args.max_label_length, self.tokenizer, DEVICE)
    # # for slot_idx, slot_str in zip(slot_token_ids, self.processor.target_slot):
    # #     self.idx2slot[slot_idx] = slot_str

    # ## Get slot-value embeddings
    # label_token_ids, label_len = [], []
    # for slot_idx, labels in zip(slot_token_ids, self.label_list):
    #     # self.idx2value[slot_idx] = {}
    #     token_ids, lens = get_label_embedding(labels, args.max_label_length, self.tokenizer, DEVICE)
    #     label_token_ids.append(token_ids)
    #     label_len.append(lens)
    #     # for label, token_id in zip(labels, token_ids):
    #     #     self.idx2value[slot_idx][token_id] = label

    # logger.info('embeddings prepared')

    # if USE_CUDA and N_GPU > 1:
    #     self.sumbt_model.module.initialize_slot_value_lookup(label_token_ids, slot_token_ids)
    # else:
    #     self.sumbt_model.initialize_slot_value_lookup(label_token_ids, slot_token_ids)

    def get_optimizer_grouped_parameters(model):
        param_optimizer = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01, 'lr': args.learning_rate},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0, 'lr': args.learning_rate},
        ]
        return optimizer_grouped_parameters

    if not USE_CUDA or N_GPU == 1:
        optimizer_grouped_parameters = get_optimizer_grouped_parameters(self.sumbt_model)
    else:
        optimizer_grouped_parameters = get_optimizer_grouped_parameters(self.sumbt_model.module)

    t_total = num_train_steps

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.fp16_loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.fp16_loss_scale)
    else:
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=args.warmup_proportion * t_total,
                                                num_training_steps=t_total)
    logger.info(optimizer)

    # Training code
    ###############################################################################
    print(torch.cuda.memory_allocated())
    logger.info("Training...")

    global_step = 0
    last_update = None
    best_loss = None
    model = self.sumbt_model
    if not args.do_not_use_tensorboard:
        summary_writer = None
    else:
        summary_writer = SummaryWriter("./tensorboard_summary/logs_1214/")

    for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
        # Train
        model.train()
        tr_loss = 0
        nb_tr_examples = 0
        nb_tr_steps = 0

        for step, batch in enumerate(tqdm(train_dataloader)):
            batch = tuple(t.to(DEVICE) for t in batch)
            input_ids, input_len, label_ids = batch
            # print(input_ids.size())

            # Forward
            if N_GPU == 1:
                loss, loss_slot, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU)
            else:
                loss, _, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU)
                # average over multiple GPUs
                loss = loss.mean()
                acc = acc.mean()
                acc_slot = acc_slot.mean(0)

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            # Backward
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            # tensorboard logging
            if summary_writer is not None:
                summary_writer.add_scalar("Epoch", epoch, global_step)
                summary_writer.add_scalar("Train/Loss", loss, global_step)
                summary_writer.add_scalar("Train/JointAcc", acc, global_step)
                if N_GPU == 1:
                    for i, slot in enumerate(self.processor.target_slot):
                        summary_writer.add_scalar("Train/Loss_%s" % slot.replace(' ', '_'),
                                                  loss_slot[i], global_step)
                        summary_writer.add_scalar("Train/Acc_%s" % slot.replace(' ', '_'),
                                                  acc_slot[i], global_step)

            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # modify learning rate with the special warm-up BERT uses
                lr_this_step = args.learning_rate * warmup_linear(global_step / t_total,
                                                                  args.warmup_proportion)
                if summary_writer is not None:
                    summary_writer.add_scalar("Train/LearningRate", lr_this_step, global_step)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                if scheduler is not None:
                    torch.nn.utils.clip_grad_norm_(optimizer_grouped_parameters, 1.0)
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()
                optimizer.zero_grad()
                global_step += 1

        # Perform evaluation on validation dataset
        model.eval()
        dev_loss = 0
        dev_acc = 0
        dev_loss_slot, dev_acc_slot = None, None
        nb_dev_examples, nb_dev_steps = 0, 0

        for step, batch in enumerate(tqdm(dev_dataloader, desc="Validation")):
            batch = tuple(t.to(DEVICE) for t in batch)
            input_ids, input_len, label_ids = batch
            if input_ids.dim() == 2:
                input_ids = input_ids.unsqueeze(0)
                input_len = input_len.unsqueeze(0)
                label_ids = label_ids.unsqueeze(0)

            with torch.no_grad():
                if N_GPU == 1:
                    loss, loss_slot, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU)
                else:
                    loss, _, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU)
                    # average over multiple GPUs
                    loss = loss.mean()
                    acc = acc.mean()
                    acc_slot = acc_slot.mean(0)

            num_valid_turn = torch.sum(label_ids[:, :, 0].view(-1) > -1, 0).item()
            dev_loss += loss.item() * num_valid_turn
            dev_acc += acc.item() * num_valid_turn

            if N_GPU == 1:
                if dev_loss_slot is None:
                    dev_loss_slot = [l * num_valid_turn for l in loss_slot]
                    dev_acc_slot = acc_slot * num_valid_turn
                else:
                    for i, l in enumerate(loss_slot):
                        dev_loss_slot[i] = dev_loss_slot[i] + l * num_valid_turn
                    dev_acc_slot += acc_slot * num_valid_turn

            nb_dev_examples += num_valid_turn

        dev_loss = dev_loss / nb_dev_examples
        dev_acc = dev_acc / nb_dev_examples

        if N_GPU == 1:
            dev_acc_slot = dev_acc_slot / nb_dev_examples

        # tensorboard logging
        if summary_writer is not None:
            summary_writer.add_scalar("Validate/Loss", dev_loss, global_step)
            summary_writer.add_scalar("Validate/Acc", dev_acc, global_step)
            if N_GPU == 1:
                for i, slot in enumerate(self.processor.target_slot):
                    summary_writer.add_scalar("Validate/Loss_%s" % slot.replace(' ', '_'),
                                              dev_loss_slot[i] / nb_dev_examples, global_step)
                    summary_writer.add_scalar("Validate/Acc_%s" % slot.replace(' ', '_'),
                                              dev_acc_slot[i], global_step)

        dev_loss = round(dev_loss, 6)

        output_model_file = os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "pytorch_model.bin")

        if last_update is None or dev_loss < best_loss:
            last_update = epoch
            best_loss = dev_loss
            best_acc = dev_acc
            if not USE_CUDA or N_GPU == 1:
                torch.save(model.state_dict(), output_model_file)
            else:
                torch.save(model.module.state_dict(), output_model_file)

            logger.info(
                "*** Model Updated: Epoch=%d, Validation Loss=%.6f, Validation Acc=%.6f, global_step=%d ***" % (
                    last_update, best_loss, best_acc, global_step))
        else:
            logger.info(
                "*** Model NOT Updated: Epoch=%d, Validation Loss=%.6f, Validation Acc=%.6f, global_step=%d ***" % (
                    epoch, dev_loss, dev_acc, global_step))

        if last_update + args.patience <= epoch:
            break
'''
Created on Apr 23, 2017

@author: tonyq
'''
import pickle as pkl

from tqdm import tqdm

with open('contencVocab.pkl', 'rb') as vocab_file:
    trainSet = pkl.load(vocab_file)
with open('testVocab.pkl', 'rb') as vocab_file:
    testSet = pkl.load(vocab_file)

contentSet = trainSet.union(testSet)

with open('../dsk16g/glove.840B.300d.txt', 'r', encoding='utf8') as fhd:
    with open('../dsk16g/glove.840B.quoraVocab.300d.txt', 'w', encoding='utf8') as fwrt:
        for line in tqdm(fhd):
            if line.strip().split(' ')[0] in contentSet:
                fwrt.write(line)
def test(self, mode='dev',
         model_path=os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "pytorch_model.bin")):
    '''Testing function of TRADE (to be added)'''
    # Evaluation
    self.load_weights(model_path)

    if mode == 'test':
        eval_examples = self.dev_examples
    elif mode == 'dev':
        eval_examples = self.test_examples

    all_input_ids, all_input_len, all_label_ids = convert_examples_to_features(
        eval_examples, self.label_list, args.max_seq_length, self.tokenizer,
        args.max_turn_length)
    all_input_ids, all_input_len, all_label_ids = all_input_ids.to(DEVICE), all_input_len.to(
        DEVICE), all_label_ids.to(DEVICE)
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", args.dev_batch_size)

    eval_data = TensorDataset(all_input_ids, all_input_len, all_label_ids)

    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.dev_batch_size)

    model = self.sumbt_model
    eval_loss, eval_accuracy = 0, 0
    eval_loss_slot, eval_acc_slot = None, None
    nb_eval_steps, nb_eval_examples = 0, 0

    accuracies = {'joint7': 0, 'slot7': 0, 'joint5': 0, 'slot5': 0,
                  'joint_rest': 0, 'slot_rest': 0,
                  'num_turn': 0, 'num_slot7': 0, 'num_slot5': 0, 'num_slot_rest': 0}

    for input_ids, input_len, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
        # if input_ids.dim() == 2:
        #     input_ids = input_ids.unsqueeze(0)
        #     input_len = input_len.unsqueeze(0)
        #     label_ids = label_ids.unsqueeze(0)

        with torch.no_grad():
            if not USE_CUDA or N_GPU == 1:
                loss, loss_slot, acc, acc_slot, pred_slot = model(input_ids, input_len, label_ids, 1)
            else:
                loss, _, acc, acc_slot, pred_slot = model(input_ids, input_len, label_ids, N_GPU)

        nbatch = label_ids.size(0)
        nslot = pred_slot.size(3)
        pred_slot = pred_slot.view(nbatch, -1, nslot)

        accuracies = eval_all_accs(pred_slot, label_ids, accuracies)

        nb_eval_ex = (label_ids[:, :, 0].view(-1) != -1).sum().item()
        nb_eval_examples += nb_eval_ex
        nb_eval_steps += 1

        if not USE_CUDA or N_GPU == 1:
            eval_loss += loss.item() * nb_eval_ex
            eval_accuracy += acc.item() * nb_eval_ex
            if eval_loss_slot is None:
                eval_loss_slot = [l * nb_eval_ex for l in loss_slot]
                eval_acc_slot = acc_slot * nb_eval_ex
            else:
                for i, l in enumerate(loss_slot):
                    eval_loss_slot[i] = eval_loss_slot[i] + l * nb_eval_ex
                eval_acc_slot += acc_slot * nb_eval_ex
        else:
            eval_loss += sum(loss) * nb_eval_ex
            eval_accuracy += sum(acc) * nb_eval_ex

    # exit(1)
    eval_loss = eval_loss / nb_eval_examples
    eval_accuracy = eval_accuracy / nb_eval_examples
    if not USE_CUDA or N_GPU == 1:
        eval_acc_slot = eval_acc_slot / nb_eval_examples

    loss = None

    if not USE_CUDA or N_GPU == 1:
        result = {
            # 'num': '\t'.join([str(x) for x in model.num_labels]),
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'loss': loss,
            'eval_loss_slot': '\t'.join([str(val / nb_eval_examples) for val in eval_loss_slot]),
            'eval_acc_slot': '\t'.join([str((val).item()) for val in eval_acc_slot]),
        }
    else:
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'loss': loss
        }

    out_file_name = 'eval_results'
    # if TARGET_SLOT == 'all':
    #     out_file_name += '_all'
    output_eval_file = os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "%s.txt" % out_file_name)

    if not USE_CUDA or N_GPU == 1:
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    out_file_name = 'eval_all_accuracies'
    with open(os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "%s.txt" % out_file_name), 'w') as f:
        s = '{:^22s}:{:^22s}:{:^22s}:{:^22s}:{:^22s}:{:^22s}'.format(
            'joint acc (7 domain)', 'slot acc (7 domain)',
            'joint acc (5 domain)', 'slot acc (5 domain)',
            'joint restaurant', 'slot acc restaurant')
        f.write(s + '\n')
        print(s)
        s = '{:^22.5f}:{:^22.5f}:{:^22.5f}:{:^22.5f}:{:^22.5f}:{:^22.5f}'.format(
            (accuracies['joint7'] / accuracies['num_turn']).item(),
            (accuracies['slot7'] / accuracies['num_slot7']).item(),
            (accuracies['joint5'] / accuracies['num_turn']).item(),
            (accuracies['slot5'] / accuracies['num_slot5']).item(),
            (accuracies['joint_rest'] / accuracies['num_turn']).item(),
            (accuracies['slot_rest'] / accuracies['num_slot_rest']).item())
        f.write(s + '\n')
        print(s)