def main():
    txt_path = './data.txt'
    txt_list = load_txt(txt_path)
    data_path = './dataset.txt'
    dataset_txt = load_dataset(data_path)
    fin_result_list = []
    # process each segmented sentence in turn
    print('{} sentences to do.'.format(len(txt_list)))
    for idx, txt_line in enumerate(txt_list, start=1):
        result_list = []
        seg_list = seg_words(txt_line)
        print('Sentence index : {}.'.format(idx))
        for seg_word in tqdm.tqdm(list(set(seg_list))):
            if len(seg_word) <= 1:
                continue
            result = judge_txt(seg_word, dataset_txt)
            result_list.append((seg_word, result))
            # print('{} : {}'.format(seg_word, result))
        fin_result_list.extend(result_list)
    write_txt(fin_result_list, './result.txt')

    result_list = load_txt('./result.txt')
    result_list = [
        result.replace('(', '').replace(')', '').replace(' ', '').split(',')
        for result in result_list
    ]
    top_list = pridict(result_list, 10)
    print(top_list)
def get_train_test_data(train_label_fea_file, test_label_fea_file):
    train_label_fea = load_txt(train_label_fea_file)[1:]
    test_label_fea = load_txt(test_label_fea_file)[1:]
    all_labels = []
    # train
    train_doc_wordID = {}
    train_label = {}
    train_feature_str = {}
    for i, line in enumerate(train_label_fea):
        labels_str, feature_str = line.split(' ', 1)
        # label
        labels_str = labels_str.split(',')
        try:
            labels = [int(label) for label in labels_str]
        except ValueError:
            continue
        all_labels.append(labels)
        train_label[i] = labels
        # feature
        train_feature_str[i] = feature_str
        # word
        word_tfidf = {}
        for pair in feature_str.split(' '):
            word, tfidf = pair.split(':')
            word_tfidf[int(word)] = float(tfidf)
        train_doc_wordID[i] = word_tfidf
    # test
    test_doc_wordID = {}
    test_label = {}
    test_feature_str = {}
    for i, line in enumerate(test_label_fea):
        labels_str, feature_str = line.split(' ', 1)
        # label
        labels_str = labels_str.split(',')
        try:
            labels = [int(label) for label in labels_str]
        except ValueError:
            continue
        test_label[i] = labels
        # feature
        test_feature_str[i] = feature_str
        # word
        word_tfidf = {}
        for pair in feature_str.split(' '):
            word, tfidf = pair.split(':')
            word_tfidf[int(word)] = float(tfidf)
        test_doc_wordID[i] = word_tfidf

    all_labels = np.unique(np.concatenate(all_labels)).tolist()
    dump_pickle(train_feature_str, data_source_path + 'train_feature.pkl')
    dump_pickle(test_feature_str, data_source_path + 'test_feature.pkl')
    dump_pickle(train_doc_wordID, data_source_path + 'train_doc_wordID.pkl')
    dump_pickle(train_label, data_source_path + 'train_label.pkl')
    dump_pickle(test_doc_wordID, data_source_path + 'test_doc_wordID.pkl')
    dump_pickle(test_label, data_source_path + 'test_label.pkl')
    return all_labels, train_doc_wordID, train_label, test_doc_wordID, test_label, train_feature_str, test_feature_str
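# Example of the expected input line format, inferred from the parsing above (illustrative
# only, not taken from the original dataset): comma-separated integer labels, a space, then
# space-separated word_id:tfidf pairs.
#   "5,17,102 3:0.124 78:0.056 901:0.310"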
def create_labels_words(path=path_dataset + "semcor+omsti.gold.key.txt"):
    ###############################################################################
    # Given the gold file, this function creates a dictionary of labels
    # (BabelNet id, WordNet domain, lexname) for each ambiguous word
    #
    # Input:
    #    path: path of the gold file
    #
    # Output:
    #    dict_sensekey: dictionary of labels
    ###############################################################################

    sense_keys = [sensekey.split() for sensekey in utils.load_txt(path)]
    dict_sensekey = {}
    for list_info in sense_keys:
        # take the synset from the sense key
        synset = wn.lemma_from_key(list_info[1]).synset()
        # take the WordNet id from the sense key
        wn_id = "wn:" + str(synset.offset()).zfill(8) + synset.pos()
        bn_id = Wordnet_match[wn_id]
        try:
            dict_sensekey[list_info[0]] = [
                bn_id, Wndomain_match[bn_id], lexname_match[bn_id]
            ]
        # assign the factotum label to every word that has no WordNet domain label
        except KeyError:
            dict_sensekey[list_info[0]] = [
                bn_id, "factotum", lexname_match[bn_id]
            ]

    return dict_sensekey
def load(cls, filename):
    lines = utils.load_txt(filename)
    p1 = lines[0]
    m = int(lines[1])
    n = int(lines[2])
    alpha = lines[3:3 + m]
    a = lines[3 + m:3 + m + n]
    b = lines[3 + m + n:3 + m + n + n]
    return cls(p1, alpha, a, b)
def create_labels_words(path_input, save_gt=False):
    ###############################################################################
    # Given the gold file, this function creates a dictionary of labels
    # (BabelNet id, WordNet domain, lexname) for each ambiguous word and,
    # if save_gt is True, it also saves the ground truth
    #
    # Input:
    #    path_input: path of the gold file
    #    save_gt: if True, the ground truth is saved
    #
    # Output:
    #    dict_sensekey: dictionary of labels (None is returned when save_gt is True)
    ###############################################################################

    sense_keys = [sensekey.split() for sensekey in utils.load_txt(path_input)]
    dict_sensekey = {}
    for list_info in sense_keys:
        # take the synset from the sense key
        synset = wn.lemma_from_key(list_info[1]).synset()
        # take the WordNet id from the sense key
        wn_id = "wn:" + str(synset.offset()).zfill(8) + synset.pos()
        bn_id = Wordnet_match[wn_id]
        try:
            dict_sensekey[list_info[0]] = [
                bn_id, Wndomain_match[bn_id], lexname_match[bn_id]
            ]
        # assign the factotum label to every word that has no WordNet domain label
        except KeyError:
            dict_sensekey[list_info[0]] = [
                bn_id, "factotum", lexname_match[bn_id]
            ]

    # save the ground truth
    if save_gt:
        name_words = list(dict_sensekey.keys())
        gt_words = list(dict_sensekey.values())
        combined = list(zip(name_words, gt_words))

        utils.save_txt(
            list(map(lambda word: word[0] + " " + word[1][0], combined)),
            "../resources/Test/fine_grained_gt.txt")
        utils.save_txt(
            list(map(lambda word: word[0] + " " + word[1][1], combined)),
            "../resources/Test/wndomain_gt.txt")
        utils.save_txt(
            list(map(lambda word: word[0] + " " + word[1][2], combined)),
            "../resources/Test/lexname_gt.txt")
        return None

    return dict_sensekey
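# Hedged usage sketch (not part of the original module): the gold-key path below is an
# assumption for illustration only; it just mirrors the naming used elsewhere in the project.
if __name__ == "__main__":
    sensekey_labels = create_labels_words("../resources/semcor+omsti.gold.key.txt", save_gt=False)
    print("{} sense-tagged instances labelled".format(len(sensekey_labels)))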
def load_dataset(dataset_path):
    # load the small hand-written corpus
    dataset_txt = []
    dataset_list = load_txt(dataset_path)
    for line in dataset_list:
        temp_list = line.split('|')
        for i in temp_list:
            if len(i) >= 2:
                dataset_txt.append(i)
    return dataset_txt
def __init__(self, model_path, device, pretrained_model_path, model_name,
             sent_embd_path, word_embd_path='',
             stopword_path='./lib/hit_stopwords.txt', num_classes=2):
    # super().__init__()
    self.device = device
    self.model = load_model(model_path, device, pretrained_model_path,
                            model_name, num_classes)
    print('Load classification model {}'.format(model_path))
    self.stopword_lst = load_txt(stopword_path)
    # self.w2i = pickle.load(open(os.path.join(embd_path, 'w2i.pkl'), 'rb'))
    # self.i2w = pickle.load(open(os.path.join(embd_path, 'i2w.pkl'), 'rb'))
    # self.embd_matrix = np.load(os.path.join(embd_path, 'i2w.pkl'))
    self.sent_embd = hub.load(sent_embd_path)
    # spacy.prefer_gpu()
    if word_embd_path:
        self.nlp = spacy.load(word_embd_path)
    else:
        self.nlp = spacy.load('zh_core_web_lg')
    print('Textfooler init success!')
def load(save_path, max_size, skip=0):
    code = load_txt(save_path + 'code.txt', skip)
    print('loading code finished')
    parent_matrix = load_txt(save_path + 'parent_matrix.txt', skip).view(-1, max_size, max_size)
    print('loading parent_matrix finished')
    brother_matrix = load_txt(save_path + 'brother_matrix.txt', skip).view(-1, max_size, max_size)
    print('loading brother_matrix finished')
    rel_par_ids = load_txt(save_path + 'relative_parents.txt', skip).view(-1, max_size, max_size)
    print('loading rel_par_ids finished')
    rel_bro_ids = load_txt(save_path + 'relative_brothers.txt', skip).view(-1, max_size, max_size)
    print('loading rel_bro_ids finished')
    comments = load_txt(save_path + 'comments.txt', skip)
    print('loading comments finished')
    return code, parent_matrix, brother_matrix, rel_par_ids, rel_bro_ids, comments
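# Hedged usage sketch: the directory and max_size value below are assumptions for
# illustration only, not values from the original project.
if __name__ == "__main__":
    code, parent_m, brother_m, rel_par, rel_bro, comments = load('./processed/', max_size=100)
    print(len(code), 'code sequences loaded')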
def prepare_ans_conditional_data(
    data_file,
    out_dir,
    out_prefix,
    n_ans_per_txt=10,
    use_no_ans=False,
    use_only_no_ans=False,
):
    """Given a text file, extract possible answer candidates for each line.

    Will generate n_ans_per_txt instances for each line in txt.
    """
    txt_w_ans_file = f"{out_dir}/{out_prefix}_w_{n_ans_per_txt}ans.txt"
    txt_file = f"{out_dir}/{out_prefix}.txt"
    ans_file = f"{out_dir}/{out_prefix}_{n_ans_per_txt}ans.txt"

    print(f"Preparing answer conditional question generation data for {data_file}")
    if use_only_no_ans:
        print("\twith ONLY NO_ANS!")
    elif use_no_ans:
        print("\twith NO_ANS option!")
    else:
        print("\twithout NO_ANS option!")

    all_txts = load_txt(data_file)
    print("Extracting entities...")
    all_anss = extract_ans(all_txts)
    print("\tDone!")
    print(f"\tMin ans count: {min(len(a) for a in all_anss)}")
    print(f"\tMax ans count: {max(len(a) for a in all_anss)}")

    print("Writing...")
    txts_w_ans = list()
    all_txt = list()
    all_ans = list()
    for txt, anss in zip(all_txts, all_anss):
        if use_only_no_ans:
            anss = [NO_ANS_TOK] * n_ans_per_txt
        elif use_no_ans:
            if len(anss) > n_ans_per_txt - 1:
                anss = random.sample(anss, k=n_ans_per_txt - 1)
            anss += [NO_ANS_TOK] * (n_ans_per_txt - len(anss))
            assert NO_ANS_TOK in anss, ipdb.set_trace()
        else:
            if len(anss) < n_ans_per_txt:
                extra_anss = random.choices(anss, k=n_ans_per_txt - len(anss))
                anss += extra_anss
            if len(anss) > n_ans_per_txt:
                anss = random.sample(anss, n_ans_per_txt)
            assert len(anss) == n_ans_per_txt, ipdb.set_trace()
        for ans in anss:
            txts_w_ans.append(f"{txt} {ANS_TOK} {ans}")
            all_txt.append(txt)
            all_ans.append(ans)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    with open(txt_w_ans_file, 'w') as out_fh:
        for txt in txts_w_ans:
            out_fh.write(f'{txt}\n')
    with open(txt_file, 'w') as out_fh:
        for txt in all_txt:
            out_fh.write(f'{txt}\n')
    with open(ans_file, 'w') as out_fh:
        for ans in all_ans:
            out_fh.write(f'{ans}\n')
    print("\tDone!")
    print(f"\tWrote {len(txts_w_ans)} sentences to {txt_w_ans_file}")
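# Hedged usage sketch: the file and directory names below are assumptions, not the authors'
# defaults; they only illustrate how the arguments fit together.
if __name__ == "__main__":
    prepare_ans_conditional_data(
        data_file="data/gen_summaries.txt",
        out_dir="data/qa_inputs",
        out_prefix="gen",
        n_ans_per_txt=10,
        use_no_ans=True,
    )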
from main import *
import config
import utils
import visited
import globalvar as gl

gl._init_globalar()
config._init()
v = utils.load_txt(config.get_value("filename"))

for i in range(30):
    print("run...", i)
    config._init()
    visited._init_visited_set()
    filename = config.get_value("filename")
    main(data=v,
         wmin=config.get_value("wmin"),
         wmax=config.get_value("wmax"),
         pop_size=config.get_value("pop_size"),
         max_evaluations=config.get_value("max_evaluations"),
         display=True)
    if config.get_value("SHOW_CONVERGENCE_RATE"):
        utils.show_convergence_rate()
    if config.get_value("SHOW_SWARM_DISTRIBUTION"):
        utils.show_swarm_distribution()
    print("Best_SUM= ", gl.get_value("gbest_sum"))
def create_input_Train():
    ###############################################################################
    # This function creates the training input sentences, the labels and the
    # sentences with the candidate synsets
    #
    # Input:
    #    None
    #
    # Output:
    #    : list with sentences, labels and candidate synsets
    #    : list with ids of the candidate synsets
    ###############################################################################

    # list of padded sentences (string sentences)
    X_train = [
        sentence.split() + [''] * (MAX_LENGTH - len(sentence.split()))
        if len(sentence.split()) <= MAX_LENGTH else sentence.split()[:MAX_LENGTH]
        for sentence in utils.load_txt("../resources/Train/semcor_omsti_string.txt")
    ]

    # list of padded sentences (ids sentences)
    X_train_int = [
        list(map(int, sentence.split()))
        for sentence in utils.load_txt("../resources/Train/semcor_omsti.txt")
    ]
    X_train_int = pad_sequences(X_train_int, truncating='pre', padding='post', maxlen=MAX_LENGTH)

    # lists of padded labels
    Y_POS = [
        list(map(int, label.split()))
        for label in utils.load_txt("../resources/Train/POS.txt")
    ]
    Y_POS = pad_sequences(Y_POS, truncating='pre', padding='post', maxlen=MAX_LENGTH)

    Y_wndomain = [
        list(map(int, label.split()))
        for label in utils.load_txt("../resources/Train/Wndomain.txt")
    ]
    Y_wndomain = pad_sequences(Y_wndomain, truncating='pre', padding='post', maxlen=MAX_LENGTH)

    Y_lex = [
        list(map(int, label.split()))
        for label in utils.load_txt("../resources/Train/lexnames.txt")
    ]
    Y_lex = pad_sequences(Y_lex, truncating='pre', padding='post', maxlen=MAX_LENGTH)

    # lists of padded candidate synsets sentences
    X_candidate_wndomain, ids_candidates_wndomain = cs.create_candidates(path_train_json, type_=1)
    X_candidate_wndomain = pad_sequences(X_candidate_wndomain, truncating='pre', padding='post', maxlen=MAX_LENGTH)

    X_candidate_lexname, ids_candidates_lexname = cs.create_candidates(path_train_json, type_=2)
    X_candidate_lexname = pad_sequences(X_candidate_lexname, truncating='pre', padding='post', maxlen=MAX_LENGTH)

    return [
        X_train, X_train_int, Y_wndomain, Y_lex, Y_POS, X_candidate_wndomain,
        X_candidate_lexname
    ], [ids_candidates_wndomain, ids_candidates_lexname]
def aggregate_questions_from_txt(out_dir,
                                 src_txt_file,
                                 gen_txt_file,
                                 gen_qst_file,
                                 gen_prob_file=None,
                                 gen_ans_file=None,
                                 gen_prd_file=None,
                                 src_w_trg_txt_file=None,
                                 use_all_qsts=False,
                                 use_act_anss=False,
                                 use_exp_anss=False,
                                 n_gen_qsts=10,
                                 n_ans=10,
                                 n_qsts=20):
    """Extract questions generated from src, trg, and gen with the corresponding field
    from fseq logs (one log/txt) and write to jsonl. Each fseq log should have the txt
    field as 'source' (S) and the questions as generated 'hypotheses' (H).

    args:
        - src_txt_file: txt file of source inputs (e.g. articles for summarization)
        - src_w_trg_txt_file (optional): special src inputs with the trg re-appended for XSUM
        - gen_txt_file: txt file of model-generated targets (e.g. summaries for summarization)
        - gen_qst_file: txt file of questions generated conditioned on src/gen
        - gen_prob_file: txt file of {src/gen} question probabilities according to QG model
        - gen_prd_file (optional): txt file of answers predicted by QA model on src/gen_qst_file
        - n_ans: the number of answer candidates per text
        - n_gen_qsts: the number of questions generated per (text, answer) pair
        - n_qsts: the number of questions to use for each example
        - use_all_qsts: use all questions
        - use_act_anss: filter out [NO_ANS] questions
        - use_exp_anss: filter out questions where prediction doesn't match expected answer
    """
    assert not (use_exp_anss and (gen_ans_file is None)), \
        "Trying to use expected answers, but not provided any!"
    assert not (use_act_anss and (gen_ans_file is None)), \
        "Trying to use predicted answers, but not provided expected answers!"
    assert not (use_act_anss and (gen_prd_file is None)), \
        "Trying to use predicted answers, but not provided any!"

    files = {
        "src": {
            "txt": src_txt_file
        },
        "gen": {
            "txt": gen_txt_file,
            "qst": gen_qst_file,
            "prb": gen_prob_file,
            "ans": gen_ans_file,
            "prd": gen_prd_file
        },
    }

    # the number of original examples (not counting answer candidates)
    n_exs = None
    # number of total generated questions per example (across answer candidates and generated questions)
    n_qsts_per_ex = n_ans * n_gen_qsts

    # load all data
    all_txts, all_qsts = {}, {}
    for txt_fld, field_files in files.items():
        txts = load_txt(field_files["txt"])
        all_txts[txt_fld] = txts
        if txt_fld == "src" and src_w_trg_txt_file is not None:
            txts = load_txt(src_w_trg_txt_file)
            all_txts["src_w_trg"] = txts
        if n_exs is None:
            # infer number of examples
            n_exs = len(txts)
        else:
            assert len(txts) == n_exs, \
                f"Different numbers of txts detected! Expected {n_exs} but found {len(txts)} for {txt_fld}."
        # load questions, probabilities, (expected) answers only based on generation
        if txt_fld != "gen":
            continue
        qsts = load_txt(field_files["qst"])
        prbs = [float(f) for f in load_txt(field_files["prb"])]  # if field_files["prb"] is not None else list()
        anss = load_txt(field_files["ans"]) if use_exp_anss else []
        # optionally load QA model predictions
        if use_act_anss:
            raw_prds = json.load(open(field_files["prd"]))
            prds = [raw_prds[str(i)] for i in range(len(raw_prds))]
        else:
            prds = list()
        all_qsts[txt_fld] = (qsts, prbs, anss, prds)

    print(f"Formatting QA data for {n_exs} examples, filtering {n_qsts_per_ex} questions per example to {n_qsts}")

    # build the data then write out a SQuAD format file
    # dummy iterator in case we want to condition questions on something else outside of gen
    for qst_src in all_qsts:
        qsts, prbs, anss, prds = all_qsts[qst_src]
        all_clean_qsts = []

        # Filter questions
        # Extract questions assuming there's a constant number per example and in order
        for i in tqdm(range(n_exs), desc="Filtering questions"):
            cand_qsts = qsts[(i * n_qsts_per_ex):((i + 1) * n_qsts_per_ex)]
            cand_prbs = prbs[(i * n_qsts_per_ex):((i + 1) * n_qsts_per_ex)]
            cand_anss = anss[(i * n_ans):((i + 1) * n_ans)] if anss else None
            cand_prds = prds[(i * n_qsts_per_ex):((i + 1) * n_qsts_per_ex)] if prds else None
            if not use_all_qsts:
                ret = filter_qsts(cand_qsts,
                                  n_qsts,
                                  prbs=cand_prbs,
                                  reverse_prob=False,
                                  exp_anss=cand_anss,
                                  act_anss=cand_prds)
            else:
                ret = {
                    'qsts': cand_qsts,
                    'n_clean_qsts': len(cand_qsts),
                    'n_qsts_w_ans': None,
                    'n_qsts_w_match_ans': None,
                }
            clean_qsts = ret['qsts']
            for qst in clean_qsts:
                assert not isinstance(qst, list), "List instead of string detected!"
            all_clean_qsts.append(clean_qsts)

        # Construct data in SQuAD-like format, using both src (article) and gen (model generation) as context
        for txt_fld in all_txts:
            if use_all_qsts and txt_fld != "gen":
                # case where we want to get answers for all our questions and we want to just
                # use the generations to do that, assuming we generated from generations
                continue
            txts = all_txts[txt_fld]
            raw_data = {}
            for i in tqdm(range(n_exs), desc="Formatting data"):
                if len(txts) == len(qsts):
                    txt = txts[i * n_ans].split()
                elif len(txts) < len(qsts):
                    assert len(qsts) / len(txts) == n_qsts_per_ex, \
                        f"Expected constant number of questions ({n_qsts_per_ex}) per example! Found {len(qsts)} total questions for {len(txts)} examples"
                    txt = txts[i].split()
                else:
                    raise IndexError("Number of questions should be weakly greater than number of examples!")
                clean_qsts = all_clean_qsts[i]
                raw_data[i] = {txt_fld: txt, "hypotheses": clean_qsts}

            data = format_squad(raw_data, context=txt_fld, ctx_split=True)
            out_file = f"{out_dir}/{txt_fld}.json"
            print(f"Writing to {out_file}")
            json.dump(data, open(out_file, "w", encoding="utf-8"))
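# Hedged usage sketch for the aggregation step above: every path below is an assumption used
# purely to show which file goes with which argument, not a path from the original project.
if __name__ == "__main__":
    aggregate_questions_from_txt(
        out_dir="qa_data",
        src_txt_file="qa_data/src.txt",
        gen_txt_file="qa_data/gen.txt",
        gen_qst_file="qa_data/gen_qsts.txt",
        gen_prob_file="qa_data/gen_probs.txt",
        n_ans=10,
        n_gen_qsts=10,
        n_qsts=20,
    )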
import numpy as np
import os
import PIL
import torch
import torchvision
from PIL import Image
from torch.utils.data import Subset
from torchvision import datasets

from utils import load_txt

corruptions = load_txt('./src/corruptions.txt')


class CIFAR10C(datasets.VisionDataset):
    def __init__(self, root: str, name: str,
                 transform=None, target_transform=None):
        assert name in corruptions
        super(CIFAR10C, self).__init__(
            root, transform=transform,
            target_transform=target_transform
        )
        data_path = os.path.join(root, name + '.npy')
        target_path = os.path.join(root, 'labels.npy')

        self.data = np.load(data_path)
        self.targets = np.load(target_path)

    def __getitem__(self, index):
        img, targets = self.data[index], self.targets[index]
        # the original snippet was truncated here; the rest follows the standard
        # torchvision dataset pattern: wrap the array in a PIL image and apply transforms
        img = Image.fromarray(img)
        if self.transform is not None:
            img = self.transform(img)
        if self.target_transform is not None:
            targets = self.target_transform(targets)
        return img, targets

    def __len__(self):
        return len(self.data)
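# Hedged usage sketch for CIFAR10C: the root directory is an assumption; 'gaussian_noise' is
# one of the standard CIFAR-10-C corruption names and is expected to appear in corruptions.txt.
if __name__ == "__main__":
    transform = torchvision.transforms.ToTensor()
    dataset = CIFAR10C(root='./data/CIFAR-10-C', name='gaussian_noise', transform=transform)
    loader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=False)
    images, labels = next(iter(loader))
    print(images.shape, labels.shape)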
def get_train_test_data(vocab, asin_map_file, text_data_file, train_asin_map_file,
                        test_asin_map_file, train_asin_label_fea_file, test_asin_label_fea_file):
    vocab = load_pickle('../material/' + vocab)
    vocab_dict = dict(zip(vocab, range(len(vocab))))
    asin_map = load_pickle(asin_map_file)
    asin_map_dict = dict(zip(asin_map, range(len(asin_map))))
    text_data = load_pickle(text_data_file)
    #
    train_asin_label_fea = load_txt(train_asin_label_fea_file)[1:]
    test_asin_label_fea = load_txt(test_asin_label_fea_file)[1:]
    # remove all replicas in both the train and test asin maps
    train_asin_map = get_asin_from_map_file(train_asin_map_file)
    test_asin_map = get_asin_from_map_file(test_asin_map_file)
    train_asin_map_dict = dict(zip(train_asin_map, range(len(train_asin_map))))
    test_asin_map_dict = dict(zip(test_asin_map, range(len(test_asin_map))))
    #
    train_rep = [item for item, count in collections.Counter(train_asin_map).items() if count > 1]
    test_rep = [item for item, count in collections.Counter(test_asin_map).items() if count > 1]
    train_asin = set(train_asin_map) - set(train_rep)
    test_asin = set(test_asin_map) - set(test_rep)
    # remove asins from test data which appear in train data
    test_asin = test_asin - train_asin
    train_asin = list(train_asin)
    test_asin = list(test_asin)

    # train
    print 'get train data'
    all_labels = []
    train_doc_wordID = {}
    train_label = {}
    train_feature = {}
    k = 0
    for t in train_asin:
        k = k + 1
        if k % 10000 == 0:
            print k
        try:
            # ind = asin_map.index(t)
            ind = asin_map_dict[t]
        except KeyError:
            continue
        # train_data[ind] = clean_str(text_data[ind])
        text = clean_str(text_data[ind])
        wordID = get_wordID_from_vocab_dict_for_raw_text(text, vocab_dict)
        if not wordID:
            continue
        train_doc_wordID[ind] = wordID
        #
        # line = train_asin_label_fea[train_asin_map.index(t)]
        line = train_asin_label_fea[train_asin_map_dict[t]]
        labels_str, feature_str = line.split(' ', 1)
        labels = [int(label) for label in labels_str.split(',')]
        all_labels.append(labels)
        train_label[ind] = labels
        train_feature[ind] = feature_str

    # test
    print 'get test data'
    test_doc_wordID = {}
    test_label = {}
    test_feature = {}
    k = 0
    for t in test_asin:
        k = k + 1
        if k % 10000 == 0:
            print k
        try:
            ind = asin_map_dict[t]
            # ind = asin_map.index(t)
        except KeyError:
            continue
        # test_data[ind] = clean_str(text_data[ind])
        text = clean_str(text_data[ind])
        wordID = get_wordID_from_vocab_dict_for_raw_text(text, vocab_dict)
        if not wordID:
            continue
        test_doc_wordID[ind] = wordID
        #
        # line = test_asin_label_fea[test_asin_map.index(t)]
        line = test_asin_label_fea[test_asin_map_dict[t]]
        labels_str, feature_str = line.split(' ', 1)
        labels = [int(label) for label in labels_str.split(',')]
        test_label[ind] = labels
        test_feature[ind] = feature_str

    all_labels = np.unique(np.concatenate(all_labels)).tolist()
    print 'dump train/test data'
    dump_pickle(train_feature, data_source_path + 'train_feature.pkl')
    dump_pickle(test_feature, data_source_path + 'test_feature.pkl')
    dump_pickle(train_doc_wordID, data_source_path + 'train_doc_wordID.pkl')
    dump_pickle(train_label, data_source_path + 'train_label.pkl')
    dump_pickle(test_doc_wordID, data_source_path + 'test_doc_wordID.pkl')
    dump_pickle(test_label, data_source_path + 'test_label.pkl')
    return all_labels, train_doc_wordID, train_label, test_doc_wordID, test_label, train_feature, test_feature
def __init__(self, split, exp_dict, root):
    self.transform_function = ut.bgrNormalize()
    self.collate_fn = ut.collate_fn_0_4
    self.split = split

    self.img_names = []
    self.mask_names = []
    self.cls_names = []

    # train
    train_img_names = ut.load_txt(root + "/ImageSets/Segmentation/train.txt")
    val_img_names = ut.load_txt(root + "/ImageSets/Segmentation/val.txt")
    assert len(train_img_names) == 1464
    assert len(val_img_names) == 1449

    if split == 'train':
        for name in train_img_names:
            name = name.replace("\n", "")
            name_img = os.path.join(root, 'JPEGImages/' + name + '.jpg')
            name_mask = os.path.join(root, 'SegmentationObject/' + name + '.png')
            name_cls = os.path.join(root, 'SegmentationClass/' + name + '.png')

            self.img_names += [name_img]
            self.mask_names += [name_mask]
            self.cls_names += [name_cls]

        self.img_names.sort()
        self.cls_names.sort()
        self.mask_names.sort()

        self.img_names = np.array(self.img_names)
        self.cls_names = np.array(self.cls_names)
        self.mask_names = np.array(self.mask_names)

    elif split in ['val']:
        for k, name in enumerate(val_img_names):
            name = name.replace("\n", "")
            name_img = os.path.join(root, 'JPEGImages/' + name + '.jpg')
            name_mask = os.path.join(root, 'SegmentationObject/' + name + '.png')
            name_cls = os.path.join(root, 'SegmentationClass/' + name + '.png')

            assert os.path.exists(name_img)
            assert os.path.exists(name_mask)
            assert os.path.exists(name_cls)

            self.img_names += [name_img]
            self.mask_names += [name_mask]
            self.cls_names += [name_cls]

    self.n_classes = 21
    self.ignore_index = 255
    self.exp_dict = exp_dict

    if split == "val":
        annList_path = "./datasets/annotations/val_gt_annList.json"
        self.annList_path = annList_path

        self.sm_proposal_dict = ut.load_json("./datasets/proposal_dict.json")
        self.prm_point_dict = ut.load_json("./datasets/prm_point_dict.json")
import pprint
import torch
import torchvision
from glob import glob

import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms, datasets
from tqdm import tqdm

from utils import load_txt, accuracy, create_barplot, get_fname, AverageMeter
from models.resnet import ResNet56
from dataset import CIFAR10C

CORRUPTIONS = load_txt('./src/corruptions.txt')
MEAN = [0.49139968, 0.48215841, 0.44653091]
STD = [0.24703223, 0.24348513, 0.26158784]


def main(opt, weight_path: str):
    device = torch.device(opt.gpu_id)

    # model
    if opt.arch == 'resnet56':
        model = ResNet56()
    else:
        raise ValueError()
    try:
        model.load_state_dict(torch.load(weight_path, map_location='cpu'))
def get_candidate_labels(path, out_path, type, format):
    train_titles_file = path + 'train_map.txt'
    test_titles_file = path + 'test_map.txt'
    # train_candidate_label_file = path + type + '_candidate/candidate_train.mat'
    # test_candidate_label_file = path + type + '_candidate/candidate_test.mat'
    # label_index_file = path + 'label_dict.pkl'
    index_label_file = path + 'all_labels.pkl'

    train_titles = load_txt(train_titles_file)
    test_titles = load_txt(test_titles_file)
    index_label = load_pickle(index_label_file)

    train_candidate_labels = {}
    test_candidate_labels = {}

    if format == 'mat':
        train_candidate_label_file = path + type + '_candidate/candidate_train.mat'
        test_candidate_label_file = path + type + '_candidate/candidate_test.mat'
        train_candidate_all = sio.loadmat(train_candidate_label_file)['candidate_train']
        test_candidate_all = sio.loadmat(test_candidate_label_file)['candidate_test']

        for i in xrange(len(train_titles)):
            pid = int(train_titles[i].strip())
            candidate_label_index = train_candidate_all[i]
            candidate_labels = [index_label[ind] for ind in candidate_label_index]
            train_candidate_labels[pid] = candidate_labels

        for i in xrange(len(test_titles)):
            pid = int(test_titles[i].strip())
            candidate_label_index = test_candidate_all[i]
            candidate_labels = [index_label[ind] for ind in candidate_label_index]
            test_candidate_labels[pid] = candidate_labels

    elif format == 'txt':
        train_candidate_label_file = path + type + '_candidate/train_score_mat.txt'
        test_candidate_label_file = path + type + '_candidate/test_score_mat.txt'
        train_candidate_all = load_txt(train_candidate_label_file)[1:]
        test_candidate_all = load_txt(test_candidate_label_file)[1:]

        for i in xrange(len(train_titles)):
            pid = int(train_titles[i].strip())
            candidate_label_line = train_candidate_all[i].strip()
            temp = []
            for l_s in candidate_label_line.split(' '):
                l_, s_ = l_s.split(':')
                ll = index_label[int(l_)]
                temp.append((ll, float(s_)))
            sorted_temp = sorted(temp, key=lambda e: e[1], reverse=True)
            train_candidate_labels[pid] = dict(sorted_temp[:50])
            # candidate_label_score = {}
            # for l_s in candidate_label_line.split(' ')[:30]:
            #     l_, s_ = l_s.split(':')
            #     ll = index_label[int(l_)]
            #     candidate_label_score[ll] = float(s_)
            # train_candidate_labels[pid] = candidate_label_score

        for i in xrange(len(test_titles)):
            pid = int(test_titles[i].strip())
            candidate_label_line = test_candidate_all[i].strip()
            temp = []
            for l_s in candidate_label_line.split(' '):
                l_, s_ = l_s.split(':')
                ll = index_label[int(l_)]
                temp.append((ll, float(s_)))
            sorted_temp = sorted(temp, key=lambda e: e[1], reverse=True)
            test_candidate_labels[pid] = dict(sorted_temp[:50])
            # candidate_label_score = {}
            # for l_s in candidate_label_line.split(' ')[:30]:
            #     l_, s_ = l_s.split(':')
            #     ll = index_label[int(l_)]
            #     candidate_label_score[ll] = float(s_)
            # test_candidate_labels[pid] = candidate_label_score

    dump_pickle(train_candidate_labels, out_path + type + '_candidate/train_candidate_label.pkl')
    dump_pickle(test_candidate_labels, out_path + type + '_candidate/test_candidate_label.pkl')
def __init__(self):
    # register custom words with jieba and load the stop-word set
    for d in load_txt('./cut/my_word.txt'):
        jieba.add_word(d)
    self.stop_dict = set(load_txt('./cut/my_stop_word.txt'))
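# All of the snippets above depend on a `load_txt` helper from a local utils module that is not
# shown here. Below is a minimal sketch of what such a helper commonly looks like; it is an
# assumption, not the original implementation (note that the code/AST variant above clearly
# returns a tensor with a .view method, so its load_txt must differ from this plain-text one).
def load_txt_sketch(path):
    """Read a text file and return its lines with trailing newlines stripped."""
    with open(path, 'r', encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]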