def save_doc_covid(self):
    utils.save_dict(self.document_posting_covid,
                    "doc_posting_covid" + str(self.doc_posting_covid_counter),
                    self.config.get_out_path())
    self.doc_posting_covid_counter += 1
    self.document_posting_covid = {}
def seg_stats(path):
    summary_dict = {
        "wt_seg_size": [],
        "tc_seg_size": [],
        "et_seg_size": [],
        "image_size": []
    }
    file_list = []
    for folder in os.listdir(path):
        print(folder)
        file = os.path.join(path, folder, folder + "_seg.nii.gz")
        file_list.append(file)
    for i, file in enumerate(file_list, start=1):
        print("Doing:", i, "/", len(file_list))
        summary_dict = explore.process_all(file, summary_dict)
    utils.save_dict(summary_dict)
    print("END")
    return 0
def saveEvalResults(generator, cfg, obj_mapping, hoi_mapping, evalData=None):
    my_output_path = cfg.results_path + 'hoi' + cfg.my_results_dir + '/res/' + generator.data_type + generator.approach + '/'
    path = cfg.part_results_path + cfg.dataset + "/hoi" + cfg.my_results_dir
    mode = generator.data_type
    if not os.path.exists(path):
        path = path[:-1]
    path += '/'
    nb_empty = 0
    if evalData is None:
        evalData = []
        for batchidx, (imageID, imageMeta) in enumerate(generator.imagesMeta.items()):
            if (batchidx + 1) % (max(1, generator.nb_batches // 100)) == 0 or batchidx == 1 or (batchidx + 1) == generator.nb_batches:
                utils.update_progress_new(batchidx + 1, generator.nb_batches, imageID)
            if os.path.exists(my_output_path + imageID + '.pkl'):
                data = utils.load_obj(my_output_path + imageID)
                if data is not None and len(data) > 0:
                    evalData.extend(data)
                else:
                    nb_empty += 1
    evalData = cp.copy(evalData)
    mAP, AP, _ = metrics.computeHOImAP(evalData, generator.imagesMeta, obj_mapping, hoi_mapping, cfg)
    saveMeta = {'mAP': mAP, 'zAP': AP.tolist(), 'nb_empties': nb_empty}
    utils.save_dict(saveMeta, path + mode + '_mAP')
    print('mAP', mode, mAP)
    print('empties', nb_empty)
def get_subreddit_entry_info_thread(sub_list):
    import os
    from utils import load_dict, save_dict
    subreddits_dlist = []
    #log('**** thread running:'+repr(sub_list))
    if os.path.exists(subredditsPickle):
        #log('****file exists ' + repr( subredditsPickle ))
        subreddits_dlist = load_dict(subredditsPickle)
        #for e in subreddits_dlist: log(e.get('entry_name'))
        #log( pprint.pformat(subreddits_dlist, indent=1) )
    #log('****------before for -------- ' + repr(sub_list ))
    for subreddit in sub_list:
        #remove old instance of subreddit
        #log('****processing ' + repr( subreddit ))
        subreddits_dlist = [x for x in subreddits_dlist if x.get('entry_name') != subreddit.lower()]
        #log('getting sub info')
        sub_info = get_subreddit_info(subreddit)
        log(' retrieved subreddit info ' + repr(sub_info))
        if sub_info:
            subreddits_dlist.append(sub_info)
    save_dict(subreddits_dlist, subredditsPickle)
def get_subreddit_entry_info_thread(sub_list):
    from utils import load_dict, save_dict, get_domain_icon, setting_entry_is_domain
    global subreddits_dlist
    #subreddits_dlist=[]
    if not subreddits_dlist:
        if os.path.exists(subredditsPickle):
            subreddits_dlist = load_dict(subredditsPickle)
    for subreddit in sub_list:
        subreddit = subreddit.lower().strip()
        subreddits_dlist = [x for x in subreddits_dlist if x.get('entry_name', '') != subreddit]
        domain = setting_entry_is_domain(subreddit)
        if domain:
            log(' getting domain info ' + domain)
            sub_info = get_domain_icon(subreddit, domain)
        else:
            log(' getting sub info ' + subreddit)
            sub_info = get_subreddit_info(subreddit)
        log(' retrieved subreddit info ' + repr(sub_info))
        if sub_info:
            subreddits_dlist.append(sub_info)
    save_dict(subreddits_dlist, subredditsPickle)
def prepare_data(video_dir, save_dir='datas', frame_gap=10, frame_size=224, frames_per_group=16):
    # "video-group" dictionary
    frame_info = dict()
    # extract frames from each class directory
    for lidx, label in enumerate(os.listdir(video_dir)):
        # check data directory; if it does not exist, create it
        save_class_dir = os.path.join(save_dir, label)
        if not os.path.exists(save_class_dir):
            os.makedirs(save_class_dir)
            print('create directory: {}'.format(save_class_dir))
        # read from each video
        for vidx, video in enumerate(os.listdir(os.path.join(video_dir, label))):
            # read video
            cap = cv2.VideoCapture(os.path.join(video_dir, label, video))
            frame_count = 0
            while cap.isOpened():
                _, frame = cap.read()
                # save an image every "frame_gap" frames
                if frame is not None:
                    frame_count += 1
                    if frame_count % frame_gap == 1:
                        group = frame_count // (frame_gap * frames_per_group)
                        index = (frame_count // frame_gap) % frames_per_group
                        im = cv2.resize(frame, (frame_size, frame_size))
                        im = im[:, 56:168, :]
                        im = cv2.resize(im, (frame_size, frame_size))
                        cv2.imwrite(os.path.join(save_class_dir, 'video{}_group{}_index{}.jpg'.format(vidx, group, index)), im)
                if (cv2.waitKey(1) & 0xFF == ord('q')) or frame is None:
                    break
            # When everything is done, release the capture
            cap.release()
            print('process video: {}, frames: {}, frame_gap: {}, groups: {}'.format(
                os.path.join(video_dir, label, video), frame_count, frame_gap, group))
            frame_info[os.path.join(save_class_dir, 'video{}@{}'.format(vidx, lidx))] = frame_count // (frame_gap * frames_per_group)
    # save dictionary
    save_dict('model/frame_info_{}.pkl'.format(frames_per_group), frame_info)
def performance(parameter_name, values, config=CONFIG):
    save_folder = generate_folder_name()
    print("\n\nPerformance test for parameter '{}' with values {}".format(parameter_name, values))
    results = np.empty((len(values), 2), dtype=float)
    drop = np.empty((len(values), 2), dtype=float)
    for index, v in enumerate(values):
        print("Run {} with value {} on GPU".format(index + 1, v))
        results[index, 0], drop[index, 0] = run((parameter_name, v), ('gpu', True))
        utils.clear_current_line()
        # print("Run {} with value {} on CPU".format(index + 1, v))
        # results[index, 1], drop[index, 1] = run((parameter_name, v), ('gpu', False))
        # utils.clear_current_line()
    # print("Results: \n{}".format(results))
    print("Number of dropped solutions: \n " + "GPU: " + format(drop[:, 0]))
    gpu_results = tuple(results[:, 0])
    # cpu_results = tuple(results[:, 1])
    cpu_results = 0
    exp_name = tuple(values)
    utils.save_dict(config, save_folder, 'config_' + parameter_name + '.txt')
    acoc_plotter.plot_bar_graph(gpu_results, cpu_results, exp_name, save=True, show=True, save_folder=SAVE_DIR)
def main():
    args = params()
    tag2id_path = os.path.join(args["output_path"], args["tag2id"])
    if not os.path.exists(args["output_path"]):
        os.makedirs(args["output_path"])
    if not os.path.exists(args["pb_path"]):
        os.makedirs(args["pb_path"])
    tag2id = {"体育": 0, "健康": 1, "军事": 2, "教育": 3, "汽车": 4}
    max_len = args["max_len"]
    batch_size = args["batch_size"]
    epoch = args["epoch"]
    # load data
    data, label = load_data(args["data_file"], tag2id)
    logger.info("total data size: {}".format(len(data)))
    logger.info("total label size: {}".format(len(label)))
    # shuffle the data randomly
    data, label = random_shuffle(data, label)
    # save tag2id
    save_dict(tag2id, tag2id_path)
    # label encoder
    total_label = label_encoder(label, len(tag2id))
    # split into train and dev data
    train_data, dev_data, train_label, dev_label = train_test_split(data, total_label, test_size=0.2)
    logger.info("train data size: {}".format(len(train_data)))
    logger.info("dev data size: {}".format(len(dev_data)))
    # bert tokenizer
    tokenizer = get_tokenizer()
    # tokenizer = get_roberta_tokenizer()
    # prepare the model inputs
    train_x, train_y = create_inputs_targets(train_data, train_label, max_len, tokenizer)
    dev_x, dev_y = create_inputs_targets(dev_data, dev_label, max_len, tokenizer)
    # create the BERT model
    # model = create_model(len(tag2id))
    model = create_model(args["bert_model_name"], len(tag2id))
    # model.summary()
    model.fit(train_x, train_y, epochs=epoch, verbose=1, batch_size=batch_size,
              validation_data=(dev_x, dev_y), validation_batch_size=batch_size)  # , validation_split=0.1
    # save model weights
    model_path = os.path.join(args["output_path"], "classification_model.h5")
    model.save_weights(model_path, overwrite=True)
    # save pb model
    tf.keras.models.save_model(model, args["pb_path"], save_format="tf", overwrite=True)
def collect_dispersion_from_earthsr_and_save(nside, options):
    data_dispersion_file_fund = utils.load(options['global_folder'] + 'disp_vconly.input_code_earthsr')
    data_dispersion = [{} for i in range(0, options['nb_modes'][1])]
    list_modes_side = [{} for j in range(0, options['nb_modes'][1])]
    for nmode in range(0, options['nb_modes'][1]):
        list_modes_side[nmode]['loc'] = np.where(data_dispersion_file_fund[:, 0] == nmode)
        freq_domain = 0
        if list_modes_side[nmode]['loc'][0].size > 0:
            data_dispersion[nmode]['period'] = data_dispersion_file_fund[list_modes_side[nmode]['loc'][0], 1]
            data_dispersion[nmode]['cphi'] = data_dispersion_file_fund[list_modes_side[nmode]['loc'][0], 2]
            data_dispersion[nmode]['cg'] = data_dispersion_file_fund[list_modes_side[nmode]['loc'][0], 3]
            data_dispersion[nmode]['QR'] = data_dispersion_file_fund[list_modes_side[nmode]['loc'][0], 4]
        ## Add nan for periods where 1st mode has not been calculated
        if nmode > 0 and list_modes_side[nmode]['loc'][0].size > 0:
            cpt = len(data_dispersion[nmode]['period']) - 1
            save_cphi = data_dispersion[nmode]['cphi'][-1] * 0. + np.inf
            save_cg = data_dispersion[nmode]['cg'][-1] * 0. + np.inf
            save_QR = data_dispersion[nmode]['QR'][-1] * 0. + np.inf
            while data_dispersion[nmode]['period'][-1] < data_dispersion[0]['period'][-1]:
                cpt += 1
                data_dispersion[nmode]['period'] = np.concatenate([data_dispersion[nmode]['period'],
                                                                   [data_dispersion[0]['period'][cpt]]])
                data_dispersion[nmode]['cphi'] = np.concatenate([data_dispersion[nmode]['cphi'], [save_cphi]])
                data_dispersion[nmode]['cg'] = np.concatenate([data_dispersion[nmode]['cg'], [save_cg]])
                data_dispersion[nmode]['QR'] = np.concatenate([data_dispersion[nmode]['QR'], [save_QR]])
    ## Save with name "current_struct" to be consistent with resonance_eigen
    current_struct = data_dispersion
    for nmode in range(0, len(current_struct)):
        if len(current_struct[nmode]) > 0:
            current_struct[nmode]['fks'] = 1. / current_struct[nmode]['period']
    utils.save_dict(current_struct, options['global_folder'] + 'PARAM_dispersion.mat')
    return current_struct
def get_subreddit_entry_info_thread(sub_list):
    from utils import load_dict, save_dict, get_domain_icon, setting_entry_is_domain
    from domains import ClassYoutube
    global subreddits_dlist
    #subreddits_dlist=[]
    #log('**** thread running:'+repr(sub_list))
    if not subreddits_dlist:
        if os.path.exists(subredditsPickle):
            #log('****file exists ' + repr( subredditsPickle ))
            subreddits_dlist = load_dict(subredditsPickle)
            #for e in subreddits_dlist: log(e.get('entry_name'))
            #log( pprint.pformat(subreddits_dlist, indent=1) )
    #log('****------before for -------- ' + repr(sub_list ))
    for subreddit in sub_list:
        #handle link shortcuts
        if subreddit.startswith('https://'):
            entry_in_file = subreddit
            without_alias = re.sub(r"[\(\[].*?[\)\]]", "", entry_in_file)
            yt = ClassYoutube(without_alias)
            url_type, id_ = yt.get_video_channel_user_or_playlist_id_from_url(without_alias)
            if url_type == 'channel':
                sub_info = yt.get_channel_info(id_, entry_name=entry_in_file)
            else:
                #this part not used, right now only youtube channels are supported.
                log(' getting link info: entry_in_file=%s without_alias=%s' % (repr(entry_in_file), repr(without_alias)))
                sub_info = get_domain_icon(entry_in_file, None, without_alias)
        else:
            subreddit = subreddit.lower().strip()
            #remove old instance of subreddit
            #log('****processing ' + repr( subreddit ))
            subreddits_dlist = [x for x in subreddits_dlist if x.get('entry_name', '') != subreddit]
            domain = setting_entry_is_domain(subreddit)
            if domain:
                log(' getting domain info ' + domain)
                sub_info = get_domain_icon(subreddit, domain)  #icon="http://%s/favicon.ico"%domain
            else:
                log(' getting sub info ' + subreddit)
                sub_info = get_subreddit_info(subreddit)
        log(' retrieved subreddit info ' + repr(sub_info))
        if sub_info:
            subreddits_dlist.append(sub_info)
    save_dict(subreddits_dlist, subredditsPickle)
def prepare_data(im_size, data_dir, extra_data_dir=False):
    # create directory if it does not exist
    if not os.path.exists('data/images'):
        os.makedirs('data/images')
    # prepare self data
    prepare_data_self(im_size, data_dir)
    # if extra data is used, add the KDEF data into the list
    if extra_data_dir:
        prepare_data_kdef(im_size, extra_data_dir, 70)
    # save name-label dictionary
    print 'Use extra data: {}, #persons: {}, #emotions: {}'.format(
        extra_data_dir, len(name_dict.keys()), len(emos_dict.keys()))
    save_dict('model/name_dict.txt', name_dict)
    save_dict('model/emotion_dict.txt', emos_dict)
def get_dataset_stats(limit):
    DATA_STATS_FILE = P.STATS_FOLDER + '/cifar10_' + str(limit) + '.pt'
    MEAN_KEY = 'mean'
    STD_KEY = 'std'
    ZCA_KEY = 'zca'

    # Try to load statistics from file
    stats = utils.load_dict(DATA_STATS_FILE)
    if stats is None:
        # Stats file does not exist --> compute statistics
        print("Computing statistics on dataset[0:" + str(limit) + "] (this might take a while)")
        # Load CIFAR10 dataset
        cifar10 = CIFAR10(root=P.DATA_FOLDER, train=True, download=True)
        X = cifar10.data[0:limit]  # X is M x N (M = limit: samples, N = 3072: variables per dataset sample)
        # Normalize the data to [0, 1] range
        X = X / 255.
        # Compute mean and st. dev. and normalize the data to zero mean and unit variance
        mean = X.mean(axis=(0, 1, 2), keepdims=True)
        std = X.std(axis=(0, 1, 2), keepdims=True)
        X = (X - mean) / std
        # Transpose image tensor dimensions to put the channel dimension in pos. 1, as expected by pytorch
        X = X.transpose(0, 3, 1, 2)
        # Reshape image tensors into vectors of size 32*32*3=3072
        X = X.reshape(limit, -1)
        # Compute ZCA matrix
        cov = np.cov(X, rowvar=False)
        U, S, V = np.linalg.svd(cov)
        SMOOTHING_CONST = 1e-1
        zca = np.dot(U, np.dot(np.diag(1.0 / np.sqrt(S + SMOOTHING_CONST)), U.T))
        # Save statistics
        stats = {
            MEAN_KEY: mean.squeeze().tolist(),
            STD_KEY: std.squeeze().tolist(),
            ZCA_KEY: torch.from_numpy(zca).float()
        }
        utils.save_dict(stats, DATA_STATS_FILE)
        print("Statistics computed and saved")
    return stats[MEAN_KEY], stats[STD_KEY], stats[ZCA_KEY]
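# Hedged usage sketch (not part of the original project): the statistics returned by
# get_dataset_stats can feed a torchvision Normalize transform, and the ZCA matrix can
# whiten flattened, already-standardized image vectors. The helper name and defaults
# below are illustrative assumptions only.
import numpy as np
import torchvision.transforms as T


def build_cifar10_preprocessing(limit=10000):
    mean, std, zca = get_dataset_stats(limit)
    # Per-channel normalization for image tensors
    normalize = T.Compose([T.ToTensor(), T.Normalize(mean=mean, std=std)])

    def zca_whiten(flat_images):
        # flat_images: (M, 3072) array, already zero mean / unit variance
        return np.dot(flat_images, zca.numpy())

    return normalize, zca_whiten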
def benchmark(parameter_name, values, config=CONFIG):
    save_folder = generate_folder_name()
    print("\n\nBenchmark for parameter '{}' with values {}".format(parameter_name, values))
    results = np.empty((len(values), 2), dtype=float)
    for index, v in enumerate(values):
        print("Run {} with value {} on GPU".format(index + 1, v))
        results[index, 0] = run((parameter_name, v), ('gpu', True))
        utils.clear_current_line()
        print("Run {} with value {} on CPU".format(index + 1, v))
        results[index, 1] = run((parameter_name, v), ('gpu', False))
        utils.clear_current_line()
    result_str = "Results: \n{}".format(results)
    print(result_str)
    utils.save_dict(config, save_folder, 'config_' + parameter_name + '.txt')
    utils.save_string_to_file(result_str, save_folder, 'results.txt')
def saveEvalResults(evalData, generator, cfg, obj_mapping):
    save_path = cfg.results_path + "rpn" + cfg.my_results_dir + '/'
    mode = generator.data_type
    if not os.path.exists(save_path):
        save_path = save_path[:-1]
    save_path += '/'
    utils.save_dict(evalData, save_path + mode + '_res')
    # evalData = utils.load_dict(save_path+mode+'_res')
    AR, R5, IoU = metrics.computeRPNAR(evalData, generator.imagesMeta, obj_mapping, cfg)
    saveMeta = {'AR': AR, 'R5': R5, 'IoU': IoU.tolist()}
    utils.save_dict(saveMeta, save_path + mode + '_mAP')
    print('R5', mode, R5)
    print('AR', mode, AR)
    return IoU
def calculate_revenue():
    series_dict = utils.get_dict('series_dict')
    revenue_days = 60
    revenue_dict = {}
    for key in series_dict:
        value_list = series_dict[key]
        code_df = utils.read(key)
        for date_string in value_list:
            print key
            print date_string
            series_date = (datetime.datetime.strptime(date_string, '%Y-%m-%d')).date()
            series_df = code_df[code_df['date'].isin([date_string])]
            series_close = float(series_df['close'])
            revenue_date = series_date
            for days in range(revenue_days):
                revenue_list = revenue_dict.get(str(days), [])
                revenue_date = revenue_date + datetime.timedelta(days=1)
                revenue_date_string = revenue_date.strftime('%Y-%m-%d')
                revenue_df = code_df[code_df['date'].isin([revenue_date_string])]
                if revenue_df.empty:
                    for nextday in range(10):
                        revenue_date = revenue_date + datetime.timedelta(days=1)
                        revenue_date_string = revenue_date.strftime('%Y-%m-%d')
                        revenue_df = code_df[code_df['date'].isin([revenue_date_string])]
                        if not revenue_df.empty:
                            break
                if revenue_df.empty:
                    continue
                revenue_close = float(revenue_df['close'])
                revenue = revenue_close / series_close
                revenue_list.append(revenue)
                revenue_dict[str(days)] = revenue_list
                # print days
                # print revenue_date
                # print revenue
    utils.save_dict(revenue_dict, 'revenue_dict')
def saveEvalResults(generator, cfg):
    my_output_path = cfg.results_path + 'det' + cfg.my_results_dir + '/res/' + generator.data_type + '/'
    evalData = []
    nb_empty = 0
    for batchidx, (imageID, imageMeta) in enumerate(generator.imagesMeta.items()):
        if os.path.exists(my_output_path + str(imageID) + '.pkl'):
            data = utils.load_obj(my_output_path + imageID)
            if data is not None and len(data) > 0:
                evalData.extend(data)
            else:
                nb_empty += 1
    path = cfg.results_path + "det" + cfg.my_results_dir + '/'
    mode = generator.data_type
    utils.save_dict(evalData, path + mode + '_res')
    print('nb_empty', nb_empty)
def gen_p(name, start_year, end_year, minimum_mpg=15, minimum_g=30, verbose=False):
    """Given a filename and a range of seasons, create a pickle file of a dictionary
    containing all the players meeting the minutes-per-game and games-played requirements

    Args:
        name (str): The name of the pickle file to be saved
        start_year (int): start of data query
        end_year (int): end of data query
            ### Note: For a range from the 2009-2010 season to the 2013-2014 season,
            start_year would be 2010 and end_year would be 2014
        minimum_mpg (int): minimum minutes per game for someone to be included in this list
        minimum_g (int): minimum games for someone to be included in this list
        verbose (bool): when True, print out when the function moves on to the next year

    Returns:
        None

    Todo:
        * Is there a way to not make the arguments of this the exact same as those of the above function?
    """
    x = get_player_names(start_year, end_year, minimum_mpg, minimum_g, verbose)
    utils.save_dict(x, name)
def load_problems():
    identifier = 1001
    utils.load_dict(problem_dict, 'problem_dict')
    while True:
        try:
            problem_id = str(identifier)
            if problem_id not in problem_dict.keys():
                print identifier
                problem_html = utils.get_html("problem", problem_id=problem_id)
                if problem_html == 'ERROR':
                    break
                tmp_dict = dict()
                tmp_dict["problem_id"] = problem_id
                tmp_dict["level"] = utils.get_level_from_html(problem_html)
                tmp_dict["category"] = utils.get_category_from_html(problem_html)
                tmp_dict["title"] = utils.get_title_from_html(problem_html)
                problem_dict[problem_id] = tmp_dict
            identifier += 1
        except KeyboardInterrupt:
            break
    utils.save_dict(problem_dict, 'problem_dict')
def get_docs_vocab(filenames, treshhold, stop_lemms, path_to_save_vocab=None, path_to_save_docs=None):
    vectorizer = CountVectorizer(input=u'filename',
                                 encoding=u'utf-8',
                                 lowercase=True,
                                 preprocessor=lemmer,  # None
                                 tokenizer=None,
                                 token_pattern=u'(?u)[A-zА-я\-]{2,}',
                                 stop_words=stop_lemms,
                                 analyzer=u'word',
                                 max_df=treshhold,
                                 min_df=0.0,
                                 binary=False,  # True
                                 )
    vectorizer.fit(filenames)
    del_meaningless_words(vectorizer.vocabulary_)
    if path_to_save_vocab:
        save_dict(vectorizer.vocabulary_, path_to_save_vocab)
    docs = get_docs(vectorizer, filenames)
    if path_to_save_docs:
        with open(path_to_save_docs, 'wb') as f:
            pickle.dump(docs, f)
    return docs, vectorizer.vocabulary_
def run_engine(config):
    """
    :return:
    """
    number_of_documents = 0
    sum_of_doc_lengths = 0

    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem)
    indexer = Indexer(config, glove_dict)
    # documents_list = r.read_file(file_name=config.get__corpusPath())
    parquet_documents_list = r.read_folder(config.get__corpusPath())
    for parquet_file in parquet_documents_list:
        documents_list = r.read_file(file_name=parquet_file)
        # Iterate over every document in the file
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = p.parse_doc(document)
            if parsed_document is None:
                continue
            number_of_documents += 1
            sum_of_doc_lengths += parsed_document.doc_length
            # index the document data
            indexer.add_new_doc(parsed_document)
    # save the last posting file after the indexer has finished adding documents
    indexer.save_postings()
    if len(indexer.doc_posting_dict) > 0:
        indexer.save_doc_posting()
    utils.save_dict(indexer.document_dict, "documents_dict", config.get_out_path())
    if len(indexer.document_posting_covid) > 0:
        indexer.save_doc_covid()
    indexer.delete_dict_after_saving()
    # merge posting files
    indexer.merge_chunks()
    utils.save_dict(indexer.inverted_idx, "inverted_idx", config.get_out_path())
    details = {'number_of_documents': number_of_documents,
               "avg_length_per_doc": sum_of_doc_lengths / number_of_documents}
    utils.save_dict(details, 'details', config.get_out_path())
clean_unpacked = clean_unpacked.squeeze().cpu().detach().numpy()
lycon.save(args.result_dir + 'png/clean/' + filename[:-4] + '.png',
           (clean_unpacked * 255).astype(np.uint8))

noisy_packed = raw_noisy
noisy_packed = noisy_packed[:, padh[j] // 2:-padh[j] // 2, padw[j] // 2:-padw[j] // 2]  ## RGGB channels
noisy_unpacked = utils.unpack_raw(noisy_packed.unsqueeze(0))  ## Rearrange RGGB channels into Bayer pattern
noisy_unpacked = noisy_unpacked.squeeze().cpu().detach().numpy()
lycon.save(args.result_dir + 'png/noisy/' + filename[:-4] + '.png',
           (noisy_unpacked * 255).astype(np.uint8))

variance_packed = variance[:, padh[j] // 2:-padh[j] // 2, padw[j] // 2:-padw[j] // 2]  ## RGGB channels

#import pdb;pdb.set_trace()
dict_ = {}
dict_['clean'] = clean_packed.cpu().detach().numpy()  ## (4 x H/2 x W/2)
dict_['noisy'] = noisy_packed.cpu().detach().numpy()  ## (4 x H/2 x W/2)
dict_['variance'] = variance_packed.cpu().detach().numpy()  ## (4 x H/2 x W/2)
dict_['shot_noise'] = shot_noise.cpu().detach().numpy()
dict_['read_noise'] = read_noise.cpu().detach().numpy()
utils.save_dict(dict_, args.result_dir + 'pkl/' + filename[:-4] + '.pkl')
def get_best_model(self, param_grid, scoring='accuracy', n_jobs=1, verbose=1, save=False):
    """
    Takes data and model information and returns a dictionary of metrics on the
    best estimator for each data set via grid search

    Params:
    - param_grid: (dict or list of dicts) values to perform grid search over
    - scoring: str, optional (default='accuracy')
        specifies how to rank each estimator
    - n_jobs: int, optional (default=1)
        number of cores to run training on. -1 includes all cores
    - verbose: int, optional (default=1)
        specifies if output messages should be provided
    - save: bool, optional (default=False)
        flag for saving metric dict

    Returns:
    - clf: gridsearch object with best performance
    """
    # Make sure proper data was passed in
    try:
        assert type(param_grid) in [list, set, tuple, dict]
    except AssertionError:
        raise ValueError('Unexpected data type passed in for param_grid')
    try:
        if type(param_grid) is not dict:
            assert type(param_grid[0]) == dict
    except AssertionError:
        raise ValueError('Unexpected data type passed in for param_grid')

    self.metric_dict = dict()
    clf = GridSearchCV(estimator=self.estimator, param_grid=param_grid, cv=5,
                       n_jobs=n_jobs, verbose=verbose, scoring=scoring)
    X, y = self.data[:, 1:], self.data[:, :1]  # Treats first column as label

    for i in range(3):  # Completes 3 trials
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000, shuffle=True)
        clf.fit(X_train, y_train.ravel())  # Fit training data to model

        # Gather training set metrics
        y_train_pred = clf.predict(X_train)
        acc_train = accuracy_score(y_train, y_train_pred)
        precision_train, recall_train, f1_train, _ = precision_recall_fscore_support(y_train, y_train_pred)

        # Gather testing set metrics
        y_test_pred = clf.predict(X_test)  # Predict test values using best parameters from classifier
        acc_test = accuracy_score(y_test, y_test_pred)  # Get accuracy for predictions
        precision_test, recall_test, f1_test, _ = precision_recall_fscore_support(y_test, y_test_pred)

        # Save metrics to dict for further analysis
        self.metric_dict[(self.data_name, i)] = {
            'acc_test': acc_test,
            'acc_train': acc_train,
            'precision_test': precision_test,
            'precision_train': precision_train,
            'recall_test': recall_test,
            'recall_train': recall_train,
            'f1_test': f1_test,
            'f1_train': f1_train,
            'model': clf,
            'cv_results': clf.cv_results_
        }  # Add metrics to dict for analysis

        if save:  # Save checkpoint results in case of hardware failure
            loc_str = self.estimator.__class__.__name__  # this just gets clf type (eg SVC, LogisticRegression, etc)
            # Check if the output path already exists, and make it if not
            save_dir = os.path.join('..', 'checkpoints', f'{loc_str}')
            if not os.path.isdir(save_dir):
                print(f'Creating {loc_str} directory now')
                os.mkdir(os.path.join('..', 'checkpoints', loc_str))
            save_path = os.path.join(save_dir, f'{loc_str}_{self.data_name}_{i}.pkl')
            save_dict(self.metric_dict, save_path)
    return clf
            rels.append(prsBB)
        if rels:
            imagesMeta[imageID.split('.')[0]] = {'imageName': imageID, 'objects': rels}
    # print(mlk)
    return imagesMeta


if __name__ == "__main__":
    # metaData = sio.loadmat(url + 'anno.mat', struct_as_record=False, squeeze_me=True)
    bbData = sio.loadmat(url + 'anno_bbox.mat', struct_as_record=False, squeeze_me=True)
    # actions = bbData['list_action']
    # trainYMatrix = metaData['anno_train']
    bbDataTrain = bbData['bbox_train']

    cfg = basic_config()
    cfg = set_config(cfg)
    cfg.dataset = 'HICO'
    cfg.get_data_path()
    cfg.get_results_paths()

    labels = utils.load_dict(cfg.data_path + 'labels')

    print("Extract meta data")
    tmpTrainMeta = extractMetaData(bbDataTrain, labels)
    utils.save_dict(tmpTrainMeta, url + 'train_objs')
    data_set = data_manager.load_data_set(conf.data_set)
    X = data_set.data
    y = data_set.target
    class_indices = list(set(y))

    # Split data into training and testing set
    if conf.training_test_split:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    else:
        X_train = X_test = X
        y_train = y_test = y

    clf = acoc.PolyACO(X.shape[1], class_indices, save_folder=SAVE_FOLDER)
    clf.train(X_train, y_train, start_time=conf.start_time)
    predictions = clf.evaluate(X_test)
    return acoc.compute_score(predictions, y_test)


if __name__ == "__main__":
    utils.save_dict(CLASSIFIER_CONFIG, parent_folder=SAVE_FOLDER, file_name='config.json')
    scores = []
    runs = 1
    result_str = ''
    start_time = time()
    for i in range(runs):
        scores.append(run(start_time=start_time))
        print("\nRun {}/{} score: {:.4f}".format(i + 1, runs, scores[-1]))
    result_str = ','.join([str(s) for s in scores]) + "\nAverage score with {}-fold cross validation: {:.5f}".format(runs, sum(scores) / runs)
    utils.save_string_to_file(result_str, parent_folder=SAVE_FOLDER, file_name='result.txt')
    print("\n" + result_str)
for j, c in enumerate(configurations):
    results[j].append(run(**c))

mean_results = np.array(results).mean(1).tolist()


def np_list_to_csv_string(npl):
    return ",".join(list(map(lambda f: "{:.4f}".format(f), npl)))


csv = []
for arr in mean_results:
    csv.append(np_list_to_csv_string(arr))

utils.save_object(mean_results, SAVE_FOLDER, 'results')
utils.save_string_to_file("\n".join(csv), SAVE_FOLDER, 'results.csv')
utils.save_dict(CLASSIFIER_CONFIG, SAVE_FOLDER, 'config.json')

data = np.array(mean_results)
x = range(data.shape[1])
fig, ax = plt.subplots()
plotter.hide_top_and_right_axis(ax)
ax.yaxis.grid(color='gray')
ax.set_xlabel('Time (seconds)')
ax.set_ylabel('Best polygon solution')
ax.set_prop_cycle(cycler('color', ['c', 'm', 'y', 'k', 'r', 'g', 'b']))
lines = []
for i in range(len(configurations)):
    lines.append(ax.plot(x, data[i], label=labels[i]))
filenames = sorted(os.path.join(path_to_dir, file)
                   for file in os.listdir(path_to_dir) if file[-4:] == '.txt')
path_to_save_vocab = '../data/vocabulary.txt'
path_to_save_docs = '../data/docs'

vectorizer = CountVectorizer(input=u'filename',
                             encoding=u'utf-8',
                             lowercase=True,
                             preprocessor=lemmer,  # None
                             tokenizer=None,
                             token_pattern=u'(?u)[A-zА-я\-]{2,}',
                             stop_words=stop_lemms,
                             analyzer=u'word',
                             max_df=treshhold,
                             min_df=0.0,
                             binary=False,  # True
                             )
vectorizer.fit(filenames)
del_meaningless_words(vectorizer.vocabulary_)
save_dict(vectorizer.vocabulary_, '../data/vocabulary.txt')

transform = vectorizer.transform(filenames)
with open('../data/transforms', 'wb') as f:
    pickle.dump(transform, f)

docs = get_docs(vectorizer, filenames)
with open('../data/docs', 'wb') as f:
    pickle.dump(docs, f)
cm_rnn = test_model(model, test_dataloader, device)
acc = accuracy(cm_rnn)
f1_score = f1_score_mean(cm_rnn)
precision, recall = precision_recall_mean(cm_rnn)

hyper_parameters = {
    'epochs': num_epochs,
    'batch_size': batch_size,
    'lr': learning_rate,
    'dropout': dropout
}
results = {
    'confusion_matrix': cm_rnn.tolist(),
    'accuracy': acc,
    'f1 score': f1_score,
    'precision': precision,
    'recall': recall
}
save_dict(results, f'experiments/{model_name}/results.txt')
save_dict(hyper_parameters, f'experiments/{model_name}/hyper.txt')

model_path = f'experiments/{model_name}/{model_name}.pt'
torch.save(model.state_dict(), model_path)

print('f1 score', f1_score)
print(model_name, 'done')
raw_gt = raw_gt.squeeze(0).cpu().detach()
tile_output[:, Y_lower // 2:Y_upper // 2, X_lower // 2:X_upper // 2] = raw_gt[:, :size_Y // 2, :size_X // 2]

#### Unpadding and saving
print(f'output shape={tile_output.shape}')
clean_packed = tile_output[:, :, :]  ## RGGB channels (4 x H/2 x W/2)
clean_unpacked = utils.unpack_raw(clean_packed.unsqueeze(0))  ## Rearrange RGGB channels into Bayer pattern
clean_unpacked = clean_unpacked.squeeze().cpu().detach().numpy()

try:
    print(os.path.join(args.result_dir, 'png', filename[:-4] + '.png'))
    # lycon.save(os.path.join(args.result_dir, 'png', filename[:-4]+'.png'),(clean_unpacked*255).astype(np.uint8))
    cv2.imwrite(os.path.join(args.result_dir, 'png', filename[:-4] + '.png'),
                (clean_unpacked * 255).astype(np.uint8))
    # cv2.imwrite(args.result_dir+'png/clean/'+filename[:-4]+'.png',(clean_unpacked*255).astype(np.uint8))
except cv2.error as e:
    print(filename)
    print(clean_packed)

#import pdb;pdb.set_trace()
dict_ = {}
dict_['raw'] = clean_packed.cpu().detach().numpy()  ## (4 x H/2 x W/2)
utils.save_dict(dict_, os.path.join(args.result_dir, 'pkl', filename[:-4] + '.pkl'))
# gc.collect()
def test(config, dbg_img_label_dict=None, dbg_mode=False, export_output=True, dbg_size=10, dbg_img_indices=[], calc_stability=True): # ============= Experiment Folder============= assets_dir = os.path.join(config['log_dir'], config['name']) log_dir = os.path.join(assets_dir, 'log') ckpt_dir = os.path.join(assets_dir, 'ckpt_dir') sample_dir = os.path.join(assets_dir, 'sample') # Whether this is for saving the results for substitutability metric or the regular testing process. # If only for substitutability, we skip saving large arrays and additional multiple random outputs to avoid OOM calc_substitutability = config['calc_substitutability'] if calc_substitutability: substitutability_attr = config['substitutability_attr'] test_dir = os.path.join(assets_dir, 'test', 'substitutability_input') substitutability_exported_img_label_dict = os.path.join( test_dir, '{}_dims_{}_clss_{}.txt'.format(substitutability_attr, config['w_dim'], config['num_bins'])) substitutability_label_scaler = config['num_bins'] - 1 exported_dict = {} substitutability_classifier_config = config[ 'substitutability_classifier_config'] _cls_config = yaml.load(open(config['classifier_config'])) substitutability_img_subset = _cls_config['train'] substitutability_img_label_dict = _cls_config['image_label_dict'] _edited_cls_config = deepcopy(_cls_config) _edited_cls_config['image_dir'] = os.path.join(test_dir, 'images') if not os.path.exists(_edited_cls_config['image_dir']): os.makedirs(_edited_cls_config['image_dir']) _edited_cls_config[ 'image_label_dict'] = substitutability_exported_img_label_dict _edited_cls_config['train'] = os.path.join(test_dir, 'train_ids.npy') _edited_cls_config['test'] = '' # skips evaluating on test _edited_cls_config['log_dir'] = test_dir _edited_cls_config['ckpt_dir_continue'] = '' save_config_dict(_edited_cls_config, substitutability_classifier_config) else: test_dir = os.path.join(assets_dir, 'test') # ============= Experiment Parameters ============= ckpt_dir_cls = config['cls_experiment'] if 'evaluation_batch_size' in config.keys(): BATCH_SIZE = config['evaluation_batch_size'] else: BATCH_SIZE = config['batch_size'] channels = config['num_channel'] input_size = config['input_size'] NUMS_CLASS_cls = config['num_class'] NUMS_CLASS = config['num_bins'] MU_CLUSTER = config['mu_cluster'] VAR_CLUSTER = config['var_cluster'] TRAVERSAL_N_SIGMA = config['traversal_n_sigma'] STEP_SIZE = 2 * TRAVERSAL_N_SIGMA * VAR_CLUSTER / (NUMS_CLASS - 1) OFFSET = MU_CLUSTER - TRAVERSAL_N_SIGMA * VAR_CLUSTER metrics_stability_nx = config['metrics_stability_nx'] metrics_stability_var = config['metrics_stability_var'] target_class = config['target_class'] ckpt_dir_continue = ckpt_dir if dbg_img_label_dict is not None: image_label_dict = dbg_img_label_dict elif calc_substitutability: image_label_dict = substitutability_img_label_dict else: image_label_dict = config['image_label_dict'] # CSVAE parameters beta1 = config['beta1'] beta2 = config['beta2'] beta3 = config['beta3'] beta4 = config['beta4'] beta5 = config['beta5'] z_dim = config['z_dim'] w_dim = config['w_dim'] if dbg_mode: num_samples = dbg_size else: num_samples = config['count_to_save'] dataset = config['dataset'] if dataset == 'CelebA': my_data_loader = ImageLabelLoader(input_size=128) pretrained_classifier = celeba_classifier EncoderZ = EncoderZ_128 EncoderW = EncoderW_128 DecoderX = DecoderX_128 DecoderY = DecoderY_128 elif dataset == 'shapes': if calc_substitutability: my_data_loader = ShapesLoader() else: # my_data_loader = ShapesLoader() # for efficiency, 
let's just load as many samples as we need my_data_loader = ShapesLoader( dbg_mode=True, dbg_size=num_samples, dbg_image_label_dict=image_label_dict, dbg_img_indices=dbg_img_indices) dbg_mode = True pretrained_classifier = shapes_classifier EncoderZ = EncoderZ_64 EncoderW = EncoderW_64 DecoderX = DecoderX_64 DecoderY = DecoderY_64 elif dataset == 'CelebA64' or dataset == 'dermatology': my_data_loader = ImageLabelLoader(input_size=64) pretrained_classifier = celeba_classifier EncoderZ = EncoderZ_64 EncoderW = EncoderW_64 DecoderX = DecoderX_64 DecoderY = DecoderY_64 elif dataset == 'synthderm': my_data_loader = ImageLabelLoader(input_size=64) pretrained_classifier = celeba_classifier EncoderZ = EncoderZ_64 EncoderW = EncoderW_64 DecoderX = DecoderX_64 DecoderY = DecoderY_64 # ============= Data ============= try: categories, file_names_dict = read_data_file(image_label_dict) except: print("Problem in reading input data file : ", image_label_dict) sys.exit() if calc_substitutability: data = np.load(substitutability_img_subset) num_samples = len(data) elif dbg_mode and dataset == 'shapes': data = np.array([str(ind) for ind in my_data_loader.tmp_list]) else: if len(dbg_img_indices) > 0: data = np.asarray(dbg_img_indices) else: data = np.asarray(list(file_names_dict.keys())) print("The classification categories are: ") print(categories) print('The size of the test set: ', data.shape[0]) # ============= placeholder ============= x_source = tf.placeholder(tf.float32, [None, input_size, input_size, channels], name='x_source') y_s = tf.placeholder(tf.int32, [None, NUMS_CLASS_cls], name='y_s') y_source = y_s[:, NUMS_CLASS_cls - 1] train_phase = tf.placeholder(tf.bool, name='train_phase') y_target = tf.placeholder(tf.int32, [None, w_dim], name='y_target') # between 0 and NUMS_CLASS generation_dim = w_dim # ============= CSVAE ============= encoder_z = EncoderZ('encoder_z') encoder_w = EncoderW('encoder_w') decoder_x = DecoderX('decoder_x') decoder_y = DecoderY('decoder_y') # encode x to get mean, log variance, and samples from the latent subspace Z mu_z, logvar_z, z = encoder_z(x_source, z_dim) # encode x and y to get mean, log variance, and samples from the latent subspace W mu_w, logvar_w, w = encoder_w(x_source, y_source, w_dim) # pass samples of z and w to get predictions of x pred_x = decoder_x(tf.concat([w, z], axis=-1)) # get predicted labels based only on the latent subspace Z pred_y = decoder_y(z, NUMS_CLASS_cls) # Create a single image based on y_target target_w = STEP_SIZE * tf.cast(y_target, dtype=tf.float32) + OFFSET fake_target_img = decoder_x(tf.concat([target_w, z], axis=-1)) # ============= pre-trained classifier ============= real_img_cls_logit_pretrained, real_img_cls_prediction = pretrained_classifier( x_source, NUMS_CLASS_cls, reuse=False, name='classifier') fake_recon_cls_logit_pretrained, fake_recon_cls_prediction = pretrained_classifier( pred_x, NUMS_CLASS_cls, reuse=True) fake_img_cls_logit_pretrained, fake_img_cls_prediction = pretrained_classifier( fake_target_img, NUMS_CLASS_cls, reuse=True) # ============= predicted probabilities ============= fake_target_p_tensor = tf.reduce_max(tf.cast(y_target, tf.float32) * 1.0 / float(NUMS_CLASS - 1), axis=1) # ============= session ============= sess = tf.Session() sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() # ============= Checkpoints ============= print(" [*] Reading checkpoint...") ckpt = tf.train.get_checkpoint_state(ckpt_dir_continue) if ckpt and ckpt.model_checkpoint_path: ckpt_name = 
os.path.basename(ckpt.model_checkpoint_path) saver.restore(sess, os.path.join(ckpt_dir_continue, ckpt_name)) print(ckpt_dir_continue, ckpt_name) print("Successful checkpoint upload") else: print("Failed checkpoint load") sys.exit() # ============= load pre-trained classifier checkpoint ============= class_vars = [ var for var in slim.get_variables_to_restore() if 'classifier' in var.name ] name_to_var_map_local = {var.op.name: var for var in class_vars} temp_saver = tf.train.Saver(var_list=name_to_var_map_local) ckpt = tf.train.get_checkpoint_state(ckpt_dir_cls) ckpt_name = os.path.basename(ckpt.model_checkpoint_path) temp_saver.restore(sess, os.path.join(ckpt_dir_cls, ckpt_name)) print("Classifier checkpoint loaded.................") print(ckpt_dir_cls, ckpt_name) # ============= Testing ============= def _save_output_array(name, values): np.save(os.path.join(test_dir, '{}.npy'.format(name)), values) if not calc_substitutability: names = np.empty([num_samples], dtype=object) real_imgs = np.empty([num_samples, input_size, input_size, channels]) fake_t_imgs = np.empty([ num_samples, generation_dim, NUMS_CLASS, input_size, input_size, channels ]) fake_s_recon_imgs = np.empty([ num_samples, generation_dim, NUMS_CLASS, input_size, input_size, channels ]) real_ps = np.empty( [num_samples, generation_dim, NUMS_CLASS, NUMS_CLASS_cls]) recon_ps = np.empty( [num_samples, generation_dim, NUMS_CLASS, NUMS_CLASS_cls]) fake_target_ps = np.empty([num_samples, generation_dim, NUMS_CLASS]) fake_ps = np.empty( [num_samples, generation_dim, NUMS_CLASS, NUMS_CLASS_cls]) # For stability metric stability_fake_t_imgs = np.empty([ num_samples, metrics_stability_nx, generation_dim, NUMS_CLASS, input_size, input_size, channels ]) stability_fake_s_recon_imgs = np.empty([ num_samples, metrics_stability_nx, generation_dim, NUMS_CLASS, input_size, input_size, channels ]) stability_recon_ps = np.empty([ num_samples, metrics_stability_nx, generation_dim, NUMS_CLASS, NUMS_CLASS_cls ]) stability_fake_ps = np.empty([ num_samples, metrics_stability_nx, generation_dim, NUMS_CLASS, NUMS_CLASS_cls ]) arrs_to_save = [ 'names', 'real_imgs', 'fake_t_imgs', 'fake_s_recon_imgs', 'real_ps', 'recon_ps', 'fake_target_ps', 'fake_ps', 'stability_fake_t_imgs', 'stability_fake_s_recon_imgs', 'stability_recon_ps', 'stability_fake_ps' ] np.random.shuffle(data) data = data[0:num_samples] for i in range(math.ceil(data.shape[0] / BATCH_SIZE)): image_paths = data[i * BATCH_SIZE:(i + 1) * BATCH_SIZE] # num_seed_imgs is either BATCH_SIZE # or if the number of samples is not divisible by BATCH_SIZE a smaller value num_seed_imgs = np.shape(image_paths)[0] img, _labels = my_data_loader.load_images_and_labels( image_paths, config['image_dir'], 1, file_names_dict, channels, do_center_crop=True) img_repeat = np.repeat(img, NUMS_CLASS * generation_dim, 0) labels = np.repeat(_labels, NUMS_CLASS * generation_dim, 0) labels = labels.ravel() labels = np.eye(NUMS_CLASS_cls)[labels.astype(int)] _dim_bin_arr = np.zeros((generation_dim * NUMS_CLASS, generation_dim)) for _gen_dim in range(generation_dim): _start = _gen_dim * NUMS_CLASS _end = (_gen_dim + 1) * NUMS_CLASS _dim_bin_arr_sub = np.zeros((NUMS_CLASS, generation_dim)) _dim_bin_arr_sub[:, _gen_dim] = np.asarray(range(NUMS_CLASS)) _dim_bin_arr[_start:_end, :] = _dim_bin_arr_sub target_labels = np.tile( _dim_bin_arr, (num_seed_imgs, 1)) # [num_seed_imgs * w_dim * NUMS_CLASS, w_dim] # target_labels = np.tile( # np.repeat(np.expand_dims(np.asarray(range(NUMS_CLASS)), axis=1), generation_dim, axis=1), # 
(num_seed_imgs*generation_dim, 1)) # [num_seed_imgs * w_dim * NUMS_CLASS, w_dim] my_feed_dict = { y_target: target_labels, x_source: img_repeat, train_phase: False, y_s: labels } fake_t_img, fake_s_recon_img, real_p, recon_p, fake_target_p, fake_p = sess.run( [ fake_target_img, pred_x, real_img_cls_prediction, fake_recon_cls_prediction, fake_target_p_tensor, fake_img_cls_prediction ], feed_dict=my_feed_dict) print('{} / {}'.format(i + 1, math.ceil(data.shape[0] / BATCH_SIZE))) _num_cur_samples = len(image_paths) if calc_substitutability: _ind_generation_dim = np.random.randint(low=0, high=generation_dim, size=_num_cur_samples) reshaped_imgs = np.reshape( fake_t_img, (_num_cur_samples, generation_dim, NUMS_CLASS, input_size, input_size, channels)) sub_exported_dict = save_batch_images( reshaped_imgs, image_paths, _ind_generation_dim, _labels, substitutability_label_scaler, _edited_cls_config['image_dir'], has_extension=(dataset != 'shapes')) exported_dict.update(sub_exported_dict) else: start_ind = i * BATCH_SIZE end_ind = start_ind + _num_cur_samples names[start_ind:end_ind] = np.asarray(image_paths) if calc_stability: for j in range(metrics_stability_nx): noisy_img = img + np.random.normal( loc=0.0, scale=metrics_stability_var, size=np.shape(img)) stability_img_repeat = np.repeat( noisy_img, NUMS_CLASS * generation_dim, 0) stability_feed_dict = { y_target: target_labels, x_source: stability_img_repeat, train_phase: False, y_s: labels } _stability_fake_t_img, _stability_fake_s_recon_img, _stability_recon_p, _stability_fake_p = sess.run( [ fake_target_img, pred_x, fake_recon_cls_prediction, fake_img_cls_prediction ], feed_dict=stability_feed_dict) stability_fake_t_imgs[start_ind:end_ind, j] = np.reshape( _stability_fake_t_img, (_num_cur_samples, generation_dim, NUMS_CLASS, input_size, input_size, channels)) stability_fake_s_recon_imgs[ start_ind:end_ind, j] = np.reshape( _stability_fake_s_recon_img, (_num_cur_samples, generation_dim, NUMS_CLASS, input_size, input_size, channels)) stability_recon_ps[start_ind:end_ind, j] = np.reshape( _stability_recon_p, (_num_cur_samples, generation_dim, NUMS_CLASS, NUMS_CLASS_cls)) stability_fake_ps[start_ind:end_ind, j] = np.reshape( _stability_fake_p, (_num_cur_samples, generation_dim, NUMS_CLASS, NUMS_CLASS_cls)) real_imgs[start_ind:end_ind] = img fake_t_imgs[start_ind:end_ind] = np.reshape( fake_t_img, (_num_cur_samples, generation_dim, NUMS_CLASS, input_size, input_size, channels)) fake_s_recon_imgs[start_ind:end_ind] = np.reshape( fake_s_recon_img, (_num_cur_samples, generation_dim, NUMS_CLASS, input_size, input_size, channels)) real_ps[start_ind:end_ind] = np.reshape( real_p, (_num_cur_samples, generation_dim, NUMS_CLASS, NUMS_CLASS_cls)) recon_ps[start_ind:end_ind] = np.reshape( recon_p, (_num_cur_samples, generation_dim, NUMS_CLASS, NUMS_CLASS_cls)) fake_target_ps[start_ind:end_ind] = np.reshape( fake_target_p, (_num_cur_samples, generation_dim, NUMS_CLASS)) fake_ps[start_ind:end_ind] = np.reshape( fake_p, (_num_cur_samples, generation_dim, NUMS_CLASS, NUMS_CLASS_cls)) output_dict = {} if calc_substitutability: save_dict(exported_dict, substitutability_exported_img_label_dict, substitutability_attr) np.save(_edited_cls_config['train'], np.asarray(list(exported_dict.keys()))) # retrain the classifier with the new generated images tf.reset_default_graph() train_classif(config['substitutability_classifier_config']) else: if export_output: for arr_name in arrs_to_save: _save_output_array(arr_name, eval(arr_name)) for arr_name in arrs_to_save: 
output_dict.update({arr_name: eval(arr_name)}) return output_dict
if __name__ == "__main__":
    # metaData = sio.loadmat(url + 'anno.mat', struct_as_record=False, squeeze_me=True)
    bbData = sio.loadmat(url + 'anno_bbox.mat', struct_as_record=False, squeeze_me=True)
    # actions = bbData['list_action']
    # trainYMatrix = metaData['anno_train']
    bbDataTrain = bbData['bbox_train']

    cfg = basic_config()
    cfg = set_config(cfg)
    cfg.dataset = 'HICO'
    cfg.get_data_path()
    cfg.get_results_paths()

    labels = utils.load_dict(cfg.data_path + 'labels')

    print("Extract meta data")
    tmpTrainMeta = extractMetaData(bbDataTrain)
    print("Combine similar BBs")
    newTrainMeta = combineSimilarBBs(tmpTrainMeta, labels, 0.4)
    newTrainMetaID = list(newTrainMeta.keys())
    newTrainMetaID.sort()

    # imagesID = imagesID[6490:7000]
    # images = pp.loadImages(imagesID, imagesMeta, url+"images/train2015/")
    # [dataXP, dataXB, dataY, dataMeta] = pp.getData(imagesID, imagesMeta, images, (224,244))
    # trainYMatrix = pp.getMatrixLabels(len(actions), dataY)

    utils.save_dict(tmpTrainMeta, url + 'HICO_train_GT')
    utils.save_dict(newTrainMeta, url + 'HICO_train_P')

    # sampleMeta = imagesMeta[imagesID[0]]
    # i = 0
    # pdata.drawImages(imagesID[i*9:(i+1)*9], imagesMeta, url+'images/train2015/', False)
def doccano2SA(doccano_file, save_ext_dir, save_cls_dir, splits=[0.8, 0.9], is_shuffle=True):
    """
    @Description: Convert a doccano file to the data format which is suitable to input to this Application.
    @Param doccano_file: The annotated file exported from the doccano labeling platform.
    @Param save_ext_dir: The directory of ext data that you want to save.
    @Param save_cls_dir: The directory of cls data that you want to save.
    @Param splits: Whether to split the doccano file into train/dev/test; note: only []/ len(splits)==2 accepted.
    @Param is_shuffle: Whether to shuffle the data.
    """
    if not os.path.exists(doccano_file):
        raise ValueError("Please input the correct path of doccano file.")
    if not os.path.exists(save_ext_dir):
        os.makedirs(save_ext_dir)
    if not os.path.exists(save_cls_dir):
        os.makedirs(save_cls_dir)
    if len(splits) != 0 and len(splits) != 2:
        raise ValueError("Only []/ len(splits)==2 accepted for splits.")
    if splits and (splits[0] >= splits[1] or splits[0] >= 1.0 or splits[1] >= 1.0 or splits[0] <= 0. or splits[1] <= 0):
        raise ValueError("Please set correct splits, the element in it should be in (0,1), and splits[1]>splits[0].")

    def label_ext_with_label_term(ext_label, start, end, tag):
        if tag == "Opinion":
            b_tag = "B-Opinion"
            i_tag = "I-Opinion"
        else:
            b_tag = "B-Aspect"
            i_tag = "I-Aspect"
        ext_label[start] = b_tag
        for i in range(start + 1, end):
            ext_label[i] = i_tag

    ext_examples, cls_examples = [], []
    with open(doccano_file, "r", encoding="utf-8") as f:
        raw_examples = f.readlines()
    # start to label for ext and cls data
    for line in raw_examples:
        items = json.loads(line)
        text, label_terms = items["data"], items["label"]
        # label ext data with label_terms
        ext_label = ["O"] * len(text)
        aspect_mapper = {}
        for label_term in label_terms:
            start, end, tag = label_term
            label_ext_with_label_term(ext_label, start, end, tag)
            if tag == "Pos-Aspect":
                aspect_mapper[text[start:end]] = "1"
            elif tag == "Neg-Aspect":
                aspect_mapper[text[start:end]] = "0"
        ext_examples.append((text, " ".join(ext_label)))
        # label cls data
        aps = decoding(text, ext_label)
        for ap in aps:
            aspect, opinions = ap[0], list(set(ap[1:]))
            if aspect not in aspect_mapper:
                continue
            aspect_text = concate_aspect_and_opinion(text, aspect, opinions)
            cls_examples.append((aspect_mapper[aspect], aspect_text, text))

    # index for saving data
    ext_idx = np.arange(len(ext_examples))
    cls_idx = np.arange(len(cls_examples))
    if is_shuffle:
        ext_idx = np.random.permutation(ext_idx)
        cls_idx = np.random.permutation(cls_idx)

    if len(splits) == 0:
        # save ext data
        save_ext_path = os.path.join(save_ext_dir, "doccano.txt")
        save_examples(ext_examples, save_ext_path, ext_idx)
        print(f"\next: save data to {save_ext_path}.")
        # save cls data
        save_cls_path = os.path.join(save_cls_dir, "doccano.txt")
        save_examples(cls_examples, save_cls_path, cls_idx)
        print(f"\ncls: save data to {save_cls_path}.")
    else:
        # save ext data
        eth1, eth2 = int(len(ext_examples) * splits[0]), int(len(ext_examples) * splits[1])
        save_ext_train_path = os.path.join(save_ext_dir, "train.txt")
        save_ext_dev_path = os.path.join(save_ext_dir, "dev.txt")
        save_ext_test_path = os.path.join(save_ext_dir, "test.txt")
        save_examples(ext_examples, save_ext_train_path, ext_idx[:eth1])
        save_examples(ext_examples, save_ext_dev_path, ext_idx[eth1:eth2])
        save_examples(ext_examples, save_ext_test_path, ext_idx[eth2:])
        print(f"\next: save train data to {save_ext_train_path}.")
        print(f"ext: save dev data to {save_ext_dev_path}.")
        print(f"ext: save test data to {save_ext_test_path}.")
        # save cls data
        cth1, cth2 = int(len(cls_examples) * splits[0]), int(len(cls_examples) * splits[1])
        save_cls_train_path = os.path.join(save_cls_dir, "train.txt")
        save_cls_dev_path = os.path.join(save_cls_dir, "dev.txt")
        save_cls_test_path = os.path.join(save_cls_dir, "test.txt")
        save_examples(cls_examples, save_cls_train_path, cls_idx[:cth1])
        save_examples(cls_examples, save_cls_dev_path, cls_idx[cth1:cth2])
        save_examples(cls_examples, save_cls_test_path, cls_idx[cth2:])
        print(f"\ncls: save train data to {save_cls_train_path}.")
        print(f"cls: save dev data to {save_cls_dev_path}.")
        print(f"cls: save test data to {save_cls_test_path}.")

    # save ext dict
    ext_dict_path = os.path.join(save_ext_dir, "label.dict")
    cls_dict_path = os.path.join(save_cls_dir, "label.dict")
    save_dict(ext_dict_path, "ext")
    save_dict(cls_dict_path, "cls")
    print(f"\next: save dict to {ext_dict_path}.")
    print(f"cls: save dict to {cls_dict_path}.")
def save_model(self, path_dir):
    if not os.path.exists(path_dir):
        os.makedirs(path_dir)
    torch.save(self.model.state_dict(), path_dir + '/model.pt')
    save_dict(path_dir, self.config)
def save_doc_posting(self):
    utils.save_dict(self.doc_posting_dict,
                    "doc_posting" + str(self.doc_posting_counter),
                    self.config.get_out_path())
    self.doc_posting_counter += 1
    self.doc_posting_dict = {}
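# The snippets above come from different projects, and their save_dict/load_dict helpers
# do not share one signature: some take (obj, name, out_path), others (path, obj),
# (obj, path), or keyword folder/file_name arguments. As a hedged illustration only, a
# minimal pickle-based pair matching the common (obj, path) pattern could look like the
# sketch below; it is an assumption, not the implementation used by any project above.
import os
import pickle


def save_dict(obj, path):
    """Serialize `obj` to `path` with pickle, creating parent folders if needed."""
    parent = os.path.dirname(path)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)


def load_dict(path):
    """Return the unpickled object at `path`, or None if the file does not exist."""
    if not os.path.exists(path):
        return None
    with open(path, 'rb') as f:
        return pickle.load(f)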