def __init__(self, config, run_id=None, verbosity=0):
    """
    Class to parse the configuration JSON file. Handles hyperparameters for
    training, initialization of modules, checkpoint saving and the logging
    module.

    :param config: Dict containing configurations.
    :param run_id: Unique identifier for training processes. The current
        timestamp is used as default.
    :param verbosity: Logging verbosity, default 0.
    """
    # 1\ Define the config and run_id.
    self._config = config
    self._run_id = str(run_id)
    if run_id is None:  # `is 'None'` never matched the None default
        self._run_id = datetime.now().strftime(r'%m%d_%H%M%S')
    self._verbosity = verbosity

    # 2\ Set _result_dir, _checkpoint_dir and _summary_dir where checkpoints
    # and logs will be saved.
    save_dir = './result/'
    self._result_dir = os.path.join(save_dir, self.config['name'] + '/',
                                    self._run_id + '/')
    self._checkpoint_dir = os.path.join(save_dir, self.config['name'] + '/',
                                        self._run_id + '/', 'checkpoints/')
    self._summary_dir = os.path.join(save_dir, self.config['name'] + '/',
                                     self._run_id + '/', 'logs/')

    # 3\ Create the directories for saving checkpoints and logs.
    create_dirs([self.result_dir, self._checkpoint_dir, self.summary_dir])

    # 4\ Save the config file used for this run to the result dir.
    write_json(self.config, os.path.join(self.result_dir, 'config.json'))
    self.config['trainer']['args']['verbosity'] = verbosity
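Several snippets in this section call read_json, write_json, and create_dirs without defining them. A minimal sketch of what such helpers could look like follows; this is an assumption, not any of these projects' actual utility modules, and note that the argument order of write_json varies between snippets (some pass the content first, others the file path first).

# Hypothetical I/O helpers assumed by the snippets in this section.
import json
import os
from collections import OrderedDict


def read_json(fname):
    # Load a JSON file, preserving key order.
    with open(fname, 'rt') as handle:
        return json.load(handle, object_hook=OrderedDict)


def write_json(content, fname):
    # Pretty-print a dict to a JSON file (content-first argument order).
    with open(fname, 'wt') as handle:
        json.dump(content, handle, indent=4)


def create_dirs(dirs):
    # Create every directory in the list, ignoring ones that already exist.
    for d in dirs:
        os.makedirs(d, exist_ok=True)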
def execute(self, conf_path: str, input_path: str, output_path: str, on_adls: bool):
    """
    Clean a list of JSON files and write them to the desired location.

    Args:
        conf_path: File path of the params.json
        input_path: Folder path to read the raw files from
        output_path: Folder path to write the files to
        on_adls: If the data are on the Azure Data Lake, set to True so the
            correct package is used

    Returns:
        The list of parsed files; the cleaned data are also written directly
        to the desired location.
    """
    self.load_params(conf_path)
    self.data_lake = uts.connect_to_data_lake_store(self.params) if on_adls else None
    res = []
    for file in self.params.get("json"):
        json_file_name = "{}.json".format(file)
        read_path = path.join(input_path, json_file_name)
        self.logger.info("Reading and parsing JSON from: {}".format(read_path))
        data = uts.read_json(read_path, self.data_lake, advanced_parsing=True)
        write_path = path.join(output_path, json_file_name)
        self.logger.info("Writing the parsed JSON to: {}".format(write_path))
        uts.write_json(data, write_path, self.data_lake)
        res.append(data)
    return res
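A hypothetical call to execute() for local (non-ADLS) data might look like the sketch below; the JsonCleaner class name and the paths are made up for illustration.

cleaner = JsonCleaner()  # hypothetical class exposing execute()
parsed = cleaner.execute(
    conf_path="conf/params.json",   # lists the files to clean under the "json" key
    input_path="data/raw",
    output_path="data/clean",
    on_adls=False,                  # read/write locally instead of Azure Data Lake Store
)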
def start_spiders():
    new_product_list = []
    start_time = time.strftime("%d.%m.%Y-%H.%M")

    # cel.ro spider
    cel = Cel(start_time)
    new_product_list.extend(cel.start_requests())

    # emag.ro spider
    emag = Emag(start_time)
    new_product_list.extend(emag.start_requests())

    # pcgarage.ro spider (currently disabled)
    pcgarage = Pcgarage(start_time)
    # new_product_list.extend(pcgarage.start_requests())

    # altex.ro spider (currently disabled)
    altex = Altex(start_time)
    # new_product_list.extend(altex.start_requests())

    # ceasboutique.ro spider (currently disabled)
    ceasboutique = Ceasboutique(start_time)
    # new_product_list.extend(ceasboutique.start_requests())

    # compare new data with old data
    get_new_deals(new_product_list)

    # write new data
    if new_product_list:
        write_json(start_time, new_product_list)
def _extract_1stframe(self, dir_path, json_path, relabel):
    if osp.exists(json_path):
        print("=> {} generated before, awesome!".format(json_path))
        split = read_json(json_path)
        return split['tracklets']

    print("=> Automatically generating split (might take a while for the "
          "first time, have a coffee)")
    pdirs = glob.glob(osp.join(dir_path, '*'))  # avoid .DS_Store
    print("Processing {} with {} person identities".format(dir_path, len(pdirs)))

    pid_container = set()
    for pdir in pdirs:
        pid = int(osp.basename(pdir))
        pid_container.add(pid)
    pid2label = {pid: label for label, pid in enumerate(pid_container)}

    tracklets = []
    for pdir in pdirs:
        pid = int(osp.basename(pdir))
        if relabel:
            pid = pid2label[pid]
        tdirs = glob.glob(osp.join(pdir, '*'))
        for tdir in tdirs:
            raw_img_paths = glob.glob(osp.join(tdir, '*.jpg'))
            num_imgs = len(raw_img_paths)
            if num_imgs < self.min_seq_len:
                continue

            img_paths = []
            for img_idx in range(num_imgs):
                # some tracklets start from 0002 instead of 0001
                img_idx_name = 'F' + str(img_idx + 1).zfill(4)
                res = glob.glob(osp.join(tdir, '*' + img_idx_name + '*.jpg'))
                if len(res) == 0:
                    print("Warn: index name {} in {} is missing, jump to next"
                          .format(img_idx_name, tdir))
                    continue
                img_paths.append(res[0])

            img_name = osp.basename(img_paths[0])
            if img_name.find('_') == -1:
                # old naming format: 0001C6F0099X30823.jpg
                camid = int(img_name[5]) - 1
            else:
                # new naming format: 0001_C6_F0099_X30823.jpg
                camid = int(img_name[6]) - 1
            img_paths = tuple(img_paths)
            # keep only the first frame of each tracklet
            tracklets.append((img_paths[0], pid, camid))

    print("Saving split to {}".format(json_path))
    split_dict = {'tracklets': tracklets}
    write_json(split_dict, json_path)
    return tracklets
def __init__(self, config, resume: bool, model, loss_function, optim):
    """
    Base class for model trainers. Responsibilities:
        - set up CUDA and data parallelism
        - set up the model and the optimizer
        - load hyperparameters
        - save and load model checkpoints

    Args:
        config: configuration file
        resume: whether this experiment resumes from the most recent checkpoint
        model: the model
        loss_function: the loss function
        optim: the optimizer
    """
    self.n_gpu = config["n_gpu"]
    self.dev = self._prepare_device(self.n_gpu, use_cudnn=config["use_cudnn"])

    self.model = model.to(self.dev)
    if self.n_gpu > 1:
        self.model = torch.nn.DataParallel(self.model,
                                           device_ids=list(range(self.n_gpu)))

    self.optimizer = optim
    self.loss_function = loss_function

    self.epochs = config["trainer"]["epochs"]
    self.save_period = config["trainer"]["save_period"]
    self.start_epoch = 1   # not a config entry; reset when resume == True
    self.best_score = 0.0  # not a config entry

    self.save_location = Path(config["save_location"])
    self.root_dir = self.save_location / config["name"]
    self.checkpoints_dir = self.root_dir / "checkpoints"
    self.tensorboardX_logs_dir = self.root_dir / "logs"
    self._prepare_empty_dir([
        self.save_location, self.root_dir, self.checkpoints_dir,
        self.tensorboardX_logs_dir
    ], resume)

    self.viz = TensorboardXWriter(self.tensorboardX_logs_dir.as_posix())
    self.visualize_metrics_period = config["visualize_metrics_period"]
    self.viz.writer.add_text(
        "Configuration",
        "```\n" + json.dumps(config, indent=2, sort_keys=False) + "\n```",
        global_step=1)
    self.viz.writer.add_text("Description", config["description"], global_step=1)

    if resume:
        self._resume_checkpoint()

    print("Model, optimizer, hyperparameters and directories initialized. "
          "The configuration used in this experiment is:")
    print(json.dumps(config, indent=2, sort_keys=False))

    config_save_path = os.path.join(self.root_dir, "config.json")
    write_json(config, config_save_path)

    self._print_networks([self.model])
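_prepare_device() is referenced above but not shown; a plausible sketch matching the call signature (n_gpu, use_cudnn) follows. This is an assumption about the helper, not the original implementation.

import torch


def _prepare_device(self, n_gpu, use_cudnn=True):
    # Toggle cuDNN and pick the primary device; fall back to CPU when no
    # usable GPU is requested or available. (Assumed behavior.)
    torch.backends.cudnn.enabled = use_cudnn
    if n_gpu > 0 and not torch.cuda.is_available():
        print("Warning: no GPU available, running on CPU instead.")
        n_gpu = 0
    return torch.device("cuda:0" if n_gpu > 0 else "cpu")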
def __init__(self, args, options=""): """ - class to parse configuration json file. Handles hyperparameters for training, initializations of modules, checkpoint saving and logging module. input: args: Dict containing configurations, hyperparameters for training. contents of `parameters.json` file for example. options: Dict keychain:value, specifying position values to be replaced from config dict. """ # parse default and custom cli options for opt in options: args.add_argument(*opt.flags, default=None, type=opt.type) args = args.parse_args() self.cfg_fname = Path(args.config) # load json file as python dictionary config = read_json(self.cfg_fname) config["src_data"] = args.src_data config["tgt_data"] = args.tgt_data config["src_data_prefix"] = args.src_data_prefix config["tgt_data_prefix"] = args.tgt_data_prefix # load config file and apply custom cli options self._config = _update_config(config, options, args) # set save directory where trained embedding and log will be saved save_dir_name = args.save_name if args.save_name else config[ "src_data_prefix"] + "_" + config["tgt_data_prefix"] save_dir = Path(args.save) / save_dir_name timestamp = datetime.now().strftime(r'%m%d_%H%M%S') exper_name = self.config['name'] print(f"Result will be saved in {save_dir}") self._save_dir = save_dir / 'best' / exper_name / timestamp self._log_dir = save_dir / 'log' / exper_name / timestamp self.save_dir.mkdir(parents=True, exist_ok=True) self.log_dir.mkdir(parents=True, exist_ok=True) # save updated config file to the checkpoint dir write_json(self.config, self.save_dir / "parameters.json") # configure logging module setup_logging(self.log_dir) self.log_levels = { 0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG }
def execute():
    missing_posters = {"posters": [], "collections": []}
    missing_posters = add_posters_for_new_videos(video_type="movies",
                                                 missing_posters=missing_posters)
    missing_posters = add_posters_for_collections(missing_posters=missing_posters)
    upload_new_posters(poster_type="movies")
    utils.write_json("missing_posters", missing_posters)
def main():
    #############################################################################
    # 0.
    #
    # Check if the tmp folder exists, otherwise create it
    check_create_folder(settings.tmp_dir)

    # Build the list with countries and states
    admin_areas = get_aa_list()

    for chart in settings.charts:
        ind_source = (settings.src_auxiliary + str(settings.current_edition) +
                      '-' + str(chart["id"]) + '.csv')

        global_avg = False
        # Calculate the global average for this chart
        if "global_average" in chart and chart["global_average"]:
            global_avg = get_avg(chart, ind_source)

        for aa in admin_areas:
            iso = aa.lower()
            for lang in settings.langs:
                # Initialize the dict that will be written to JSON
                json_data = {
                    "name": iso,
                    "iso": iso,
                    "meta": {
                        "title": chart["title"][lang],
                        "label-x": chart["labelx"][lang],
                        "label-y": chart["labely"][lang]
                    },
                    "data": []
                }

                for serie in chart["series"]:
                    if serie["id"] == 'country':
                        # If we're dealing with a country, use the country
                        # name as the label of the serie
                        serie_name = aa
                    else:
                        serie_name = serie["name"][lang]

                    # Initialize the object for the serie
                    serie_to_append = {"name": serie_name, "id": serie["id"], "values": []}

                    # Add a note to the serie
                    if chart["note"]:
                        serie_to_append["note"] = add_note(serie, ind_source, aa)

                    # Generate the actual data
                    serie_to_append["values"] = chart['function'](serie, ind_source, lang,
                                                                  aa, chart["years"], global_avg)
                    json_data["data"].append(serie_to_append)

                # Write the data to a JSON file
                file_path = (settings.exp_aux_json).format(lang=lang,
                                                           indicator=chart["export"], aa=iso)
                write_json(file_path, json_data)

    # Fully remove the temp directory
    clean_dir(settings.tmp_dir, True)

    print("All done. The auxiliary data has been prepared for use on global-climatescope.org.")
def __init__(self, config=None, resume=None, modification=None, run_id=None):
    """
    Class to parse the configuration JSON file. Handles hyperparameters for
    training, initialization of modules, checkpoint saving and the logging
    module.

    :param config: Dict containing configurations and hyperparameters for
        training (the contents of a `config.json` file, for example).
    :param resume: String, path to the checkpoint being loaded.
    :param modification: Dict keychain:value, specifying position values to
        be replaced in the config dict.
    :param run_id: Unique identifier for training processes. Used to save
        checkpoints and the training log. Timestamp is used as default.
    """
    if config is None:
        config_path = UTILS_DIR / 'config.json'
        config = read_json(config_path)

    # load config file and apply modification
    self._config = _update_config(config, modification)
    self.resume = resume

    # set save_dir where trained model and log will be saved.
    save_dir = ROOT_DIR / self.config['trainer']['save_dir']

    # set the data_dir
    self.data_dir = ROOT_DIR / self.config['data_loader']['args']['data_dir']

    exper_name = self.config['name']
    if run_id is None:  # use timestamp as default run-id
        run_id = datetime.now().strftime(r'%m%d_%H%M%S')
    self._save_dir = save_dir / 'models' / exper_name / run_id
    self._log_dir = save_dir / 'log' / exper_name / run_id

    # make directories for saving checkpoints and log.
    exist_ok = run_id == ''
    self.save_dir.mkdir(parents=True, exist_ok=exist_ok)
    self.log_dir.mkdir(parents=True, exist_ok=exist_ok)

    # save updated config file to the checkpoint dir
    write_json(self.config, self.save_dir / 'config.json')

    # configure logging module
    # setup_logging(self.log_dir)
    self.log_levels = {
        0: logging.WARNING,
        1: logging.INFO,
        2: logging.DEBUG
    }
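Hypothetical usage of this parser class: the ConfigParser name is assumed, and the ';'-separated keychain shown mirrors a common convention for _update_config-style helpers (an assumption, since _update_config is not shown here).

config = ConfigParser(
    modification={'optimizer;args;lr': 1e-4},  # assumed keychain syntax
    run_id='debug_run',
)
print(config.save_dir)  # e.g. <save_dir>/models/<exper_name>/debug_run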
def _prepare_split(self):
    if not osp.exists(self.split_path):
        print("Creating splits")
        mat_split_data = np.load(self.split_mat_path)['ls_set']

        num_splits = mat_split_data.shape[0]
        num_total_ids = mat_split_data.shape[1]
        assert num_splits == 10
        assert num_total_ids == 300
        num_ids_each = num_total_ids // 2  # integer division; a float index would fail

        # pids in mat_split_data are indices, so we need to transform them
        # to real pids
        person_cam1_dirs = os.listdir(self.cam_1_path)
        person_cam2_dirs = os.listdir(self.cam_2_path)

        # make sure persons in one camera view can be found in the other camera view
        assert set(person_cam1_dirs) == set(person_cam2_dirs)

        splits = []
        for i_split in range(num_splits):
            # first 50% for testing and the remaining for training,
            # following Wang et al. ECCV'14.
            train_idxs = sorted(list(mat_split_data[i_split, num_ids_each:]))
            test_idxs = sorted(list(mat_split_data[i_split, :num_ids_each]))
            train_idxs = [int(i) - 1 for i in train_idxs]
            test_idxs = [int(i) - 1 for i in test_idxs]

            # transform pids to person dir names
            train_dirs = [person_cam1_dirs[i] for i in train_idxs]
            test_dirs = [person_cam1_dirs[i] for i in test_idxs]

            split = {'train': train_dirs, 'test': test_dirs}
            splits.append(split)

        print("Totally {} splits are created, following Wang et al. ECCV'14"
              .format(len(splits)))
        print("Split file is saved to {}".format(self.split_path))
        write_json(splits, self.split_path)

    print("Splits created")
def __init__(self, annot_path, video_id_path, metadata_path, fps, window_size, out_path):
    '''
    Given videos, we create segments (of frames) and their corresponding
    labels. A segment is a start/end frame number pair (for a video) and the
    label is whether compression occurs in that segment. We use the
    second-based annotations of the videos to calculate the label.

    :param annot_path: str, path to the cpr annotations
    :param video_id_path: str, path for the video ids by train/val/test splits
    :param metadata_path: str, path to the metadata of the videos
    :param fps: int, fps of the frames the videos were converted to
    :param window_size: int, num of frames in a sliding window
    :param out_path: str, path to output the segments and labels json
    '''
    self.fps = fps
    self.window_size = window_size
    self.annot_json = read_json(annot_path)
    video_id_by_split = read_json(video_id_path)
    self.metadata_path = read_json(metadata_path)  # holds the parsed metadata

    # store each split here
    all_data = {}

    # loop through each data split
    for split_type in video_id_by_split.keys():
        video_id_list = video_id_by_split[split_type]  # retrieve the video ids
        segments, labels = self._create_segments_labels(video_id_list)  # create the segments/labels
        data = {'segments': segments, 'labels': labels}  # store both in a dict
        all_data[split_type] = data  # store for the entire split

    # write all to disk
    out_path = os.path.join(out_path, 'segments_and_labels.json')
    write_json(all_data, out_path, indent=None)
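The heavy lifting happens in _create_segments_labels(), which is not shown. The sketch below illustrates one way second-based annotations could be turned into frame-window labels under the stated fps and window_size; it is an assumption about the approach, not the original code.

def label_windows(num_frames, annots_sec, fps, window_size):
    # Slide a fixed-size window over the frames and mark it positive when it
    # overlaps any annotated compression interval (given in seconds).
    segments, labels = [], []
    for start in range(0, num_frames - window_size + 1, window_size):
        end = start + window_size
        overlaps = any(
            start < a_end * fps and end > a_start * fps
            for a_start, a_end in annots_sec
        )
        segments.append((start, end))
        labels.append(int(overlaps))
    return segments, labels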
def execute(): """ Compare Trakt list with Plex collections to find missing videos. """ missing_videos = {} plex = PlexServer(settings.PLEX_URL, settings.PLEX_TOKEN) sections_by_type = utils.get_sections_by_type(plex=plex) for section_title in sections_by_type["movies"]: section_config = utils.open_trakt_json("movies") section = plex.library.section(section_title) trakt_videos = section_config.keys() for trakt_video in trakt_videos: all_collections = section_config[trakt_video].get( "collections", None) if all_collections: collections = [ collection for collection in all_collections if collection not in settings.IGNORE_MISSING_VIDEOS ] if collections: if not any(x for x in section.all() if "{title} ({year})".format( title=x.title, year=x.year) == trakt_video): for collection in collections: print( "Missing '{title}' for collection '{collection}'" .format(title=trakt_video, collection=collection)) try: missing_videos[collection].append(trakt_video) except KeyError: missing_videos[collection] = [trakt_video] utils.write_json("missing_videos", missing_videos)
def __init__(self, config, resume=None, modification=None, run_id=None):
    """
    Class to parse the configuration JSON file. Handles hyperparameters for
    training, initialization of modules, checkpoint saving and the logging
    module.

    :param config: Dict containing configurations and hyperparameters for
        training (the contents of a `config.json` file, for example).
    :param resume: String, path to the checkpoint being loaded.
    :param modification: Dict keychain:value, specifying position values to
        be replaced in the config dict.
    :param run_id: Unique identifier for training processes. Used to save
        checkpoints and the training log. Timestamp is used as default.
    """
    # load config file and apply modification
    self._config = _update_config(config, modification)
    self.resume = resume

    # set save_dir where trained model and log will be saved.
    save_dir = Path(self.config["trainer"]["save_dir"])

    exper_name = self.config["name"]
    if run_id is None:  # use timestamp as default run-id
        run_id = datetime.now().strftime(r"%m%d_%H%M%S")
    self._save_dir = save_dir / "checkpoints" / exper_name / run_id
    self._log_dir = save_dir / "logs" / exper_name / run_id
    self._tensorboard_dir = save_dir / "runs" / exper_name / run_id

    # make directories for saving checkpoints, logs, and tensorboard files.
    exist_ok = run_id == ""
    self.save_dir.mkdir(parents=True, exist_ok=exist_ok)
    self.log_dir.mkdir(parents=True, exist_ok=exist_ok)
    self.tensorboard_dir.mkdir(parents=True, exist_ok=exist_ok)

    # save updated config file to the checkpoint dir
    write_json(self.config, self.save_dir / "config.json")

    # configure logging module
    setup_logging(self.log_dir)
    self.log_levels = {
        0: logging.WARNING,
        1: logging.INFO,
        2: logging.DEBUG
    }
def main():
    #############################################################################
    # 0.
    #
    # Check if the tmp folder exists, otherwise create it
    if check_dir(settings.tmp_dir):
        sys.exit(0)
    else:
        os.makedirs(settings.tmp_dir)

    # Run some checks on the source folder with core data.
    if not get_years():
        # Is there anything in the source folder to begin with?
        print("We were not able to find a XLSX file with core data in the folder: "
              "%s. Make sure this folder contains at least one XLSX file named "
              "after the year (eg. 2014.xlsx). Check the readme for more info "
              "about the required structure of these files.\n"
              "Quitting..." % (settings.src_core))
        sys.exit(0)

    # Provide feedback that the script only processes XLSX files with properly
    # formatted filenames. (eg. 2014.xlsx)
    fn_pattern = re.compile('^20[0-9]{2}$')
    for f in os.listdir(settings.src_core):
        fn = os.path.splitext(f)[0]
        ext = os.path.splitext(f)[-1].lower()
        path = os.path.join(settings.src_core, fn)
        if not os.path.isdir(path):
            # Only check files
            if ext == ".xlsx":
                if not fn_pattern.match(fn):
                    print("The XLSX file %s doesn't have a properly formatted year as "
                          "filename and will be ignored." % (f))
            else:
                print("The script only processes XLSX files. %s will be ignored." % (f))

    print("Loading the core and meta data...")

    # Build the different sets of admin areas with things we have to loop over.
    countries = build_set('country', 'type', 'iso', settings.src_meta_aa)
    states = build_set('state', 'type', 'iso', settings.src_meta_aa)
    admin_areas = countries | states

    # Build sets for the variables we loop over
    global index_param
    index_param = build_set('param', 'type', 'id', settings.src_meta_index)
    index_score = build_set('score', 'type', 'id', settings.src_meta_index)
    sp = list(index_score | index_param)

    # Build the set of years we're interested in
    global years
    years = get_years()
    global current_yr
    current_yr = max(years)

    # Read in the files with meta-data and set the scope to global
    global df_meta_aa
    df_meta_aa = pd.read_csv(settings.src_meta_aa, index_col='iso')
    global df_meta_index
    df_meta_index = pd.read_csv(settings.src_meta_index, index_col='id')

    #############################################################################
    # 1. Store the relevant core data in one DF (df_full)
    #
    # Output: df_full
    #
    #               2014            2015
    # iso  ind      value   data    value   data
    # AR   0        1.2420  NaN     1.2235  NaN
    #      1.01     0.1802  78.17   0.1795  75.16
    # ...

    first_yr = True
    for yr in years:
        # All core data files are named after the year of the edition
        fn = settings.src_core + yr + '.xlsx'
        df_yr = pd.DataFrame()
        for sheet in settings.core_data_sheets:
            # Build an index to parse only the relevant columns
            cols_index = build_col_index(fn, sheet)
            # Read Excel (parsing only relevant cols)
            df_sheet = pd.read_excel(fn, sheet, parse_cols=cols_index)
            # Ensure that the iso codes don't contain strange characters. They
            # can only contain letters, numbers and hyphens. (eg. CN, CN-65 or IN-MP)
            df_sheet['iso'].replace(to_replace='[^a-zA-Z0-9-]', value='',
                                    inplace=True, regex=True)
            # Append each sheet to a dataframe holding the data for that year
            df_yr = df_yr.append(df_sheet)

        # Set the index of the DF to the ISO code and ID of the indicator
        df_yr.set_index(['iso', 'id'], inplace=True)
        # Make sure the index is sorted so the slicing works well
        df_yr.sortlevel(inplace=True)
        # Rename the column 'score' to 'value'
        df_yr.rename(columns={'score': 'value'}, inplace=True)

        # Add an extra level to the hierarchy of the columns (multi-index)
        # containing an indication of the year.
        # Create a list that repeats the year for each column
        c = [yr] * len(df_yr.columns)
        # Add a level to the cols
        df_yr.columns = [c, df_yr.columns]

        if first_yr:
            # If it's the first year, we initialize the full DataFrame
            df_full = df_yr
            first_yr = False
        else:
            # Every subsequent year will have to be merged into df_full
            df_full = pd.merge(df_full, df_yr, how='outer',
                               left_index=True, right_index=True)
    df_full.sortlevel(axis=1, inplace=True)

    #############################################################################
    # 2. CSV downloads
    #
    # For all the CSV exports, prepare a dataframe that combines the data with
    # the meta.

    print("Building the CSV files for the download section...")

    # For the CSV, we're only interested in the value column of each year
    df_full_csv = df_full.loc[:, (slice(None), 'value')]
    df_full_csv.columns = df_full_csv.columns.get_level_values(0)

    # The full DF is a multi-index. Since the meta-files have a single index,
    # it is necessary to reset the indexes before joining on the column.
    df_full_csv = df_full_csv.reset_index()
    df_meta_aa_csv = df_meta_aa.reset_index()
    df_meta_index_csv = df_meta_index.reset_index()

    # Merge the country meta
    df_full_csv = pd.merge(df_full_csv, df_meta_aa_csv, on='iso')
    # Merge the index meta data
    df_full_csv = pd.merge(df_full_csv, df_meta_index_csv, on='id',
                           suffixes=('_aa', '_var'))

    # Re-index the DF on iso & id and make sure it's sorted
    df_full_csv.set_index(['iso', 'id'], inplace=True)
    df_full_csv.sortlevel(inplace=True)

    # 2.0 Export the full dataset to CSV
    for lang in settings.langs:
        # Build a list with the meta-data that needs to be included
        columns = ['name:' + lang + '_aa', 'name:' + lang + '_var', 'type_var']
        columns = columns + list(years)
        file_path = (settings.exp_full_csv).format(lang=lang)
        df_full_csv.loc[slice(None), columns].to_csv(file_path, encoding='UTF-8', index=False)

    # 2.1 Generate the main CSV files
    # Slice the DF to only contain the score and parameters for the current year.
    df_main_csv = df_full_csv.loc[(slice(None), sp), :]
    for lang in settings.langs:
        # Pivot the DF and export it
        file_path = (settings.exp_current_csv).format(lang=lang, yr=current_yr)
        pivot_df(df_main_csv, 'name:' + lang + '_aa', 'name:' + lang + '_var',
                 current_yr).to_csv(file_path, encoding='UTF-8')

    # 2.3 Generate the country + state CSV files
    for aa in admin_areas:
        # Select the data of this admin area
        df_aa_csv = df_full_csv.loc[(aa, slice(None)), :]
        for lang in settings.langs:
            # Include the name of the var, its type and the years
            columns = ['name:' + lang + '_var', 'type_var'] + list(years)
            # Select the proper columns and generate the CSV
            file_path = (settings.exp_aa_csv).format(lang=lang, aa=aa.lower())
            df_aa_csv.loc[slice(None), columns].to_csv(file_path, encoding='UTF-8', index=False)

    #############################################################################
    # 3. Calculate the rankings
    #
    # Output: df_full
    #
    #               2014                       2015
    #               value   data   gr   sr     value   data   gr   sr
    # iso  id
    # AR   0        1.2420  NaN    13   NaN    1.2235  NaN    12   NaN
    #      1.01     0.1802  73.1   5    NaN    0.1795  75.8   6    NaN
    # ...

    print("Calculating the ranking...")

    # 3.0 Prepare the structure
    # Add placeholder cols with NaN that can be updated later with df.update()
    for year in years:
        for rank in ('gr', 'sr'):
            df_full[(year, rank)] = np.nan
    # Make sure it's sorted
    df_full.sortlevel(axis=1, inplace=True)

    # 3.1 Global rank
    # The global rank (gr) is a rank of all the COUNTRIES in the project
    df_full = get_rank(countries, df_full, 'gr')

    # 3.3 State rank
    # The state rank ('sr') ranks the STATES of a particular country
    for country in countries:
        # Check if there are any states or provinces for this country
        cs = build_set(country, 'country', 'iso', settings.src_meta_aa)
        if cs:
            df_full = get_rank(cs, df_full, 'sr')

    #############################################################################
    # 4. JSON api

    print("Building the JSON files for the API...")

    # 4.1 Generate the main JSON file
    for lang in settings.langs:
        # The JSON will contain a list with dicts
        json_data = []
        # Loop over the countries list
        for country in countries:
            country_data = build_json_aa(country, df_full, lang, historic=True)
            # Sort the list of states / provinces
            if country_data['states']:
                country_data['states'] = sorted(country_data['states'],
                                                key=lambda k: k['name'])
            json_data.append(country_data)
        # Sort the list of countries by name
        sorted_data = sorted(json_data, key=lambda k: k['name'])
        # Write the list to a JSON file
        file_path = (settings.exp_core).format(lang=lang)
        write_json(file_path, sorted_data)

    # 4.3 Generate the country + state JSON files
    for aa in admin_areas:
        for lang in settings.langs:
            # Get the data for this admin area in a dict
            json_data = build_json_aa(aa, df_full, lang, indicators=True, historic=True)
            # Write the dict to a JSON file
            file_path = (settings.exp_aa).format(lang=lang, aa=aa.lower())
            write_json(file_path, json_data)

    # Fully remove the temp directory
    clean_dir(settings.tmp_dir, True)

    print("All done. The data has been prepared for use on global-climatescope.org.")
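get_rank() is referenced above but not shown. A minimal sketch, assuming it ranks the 'value' of each indicator within the given set of admin areas, per year, and fills the 'gr'/'sr' placeholder columns prepared in step 3.0; this is an assumption, not the project's actual implementation.

def get_rank(areas, df, rank_col):
    for yr in years:
        # Slice the values of the given areas for this year
        values = df.loc[(list(areas), slice(None)), (yr, 'value')]
        # Rank per indicator id (level 'id' of the index), highest value first
        ranks = values.groupby(level='id').rank(ascending=False)
        # Write the ranks into the placeholder column, aligned on the index
        df.loc[(list(areas), slice(None)), (yr, rank_col)] = ranks
    return df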
def save_file(self, content, fname):
    write_json(content, self.save_dir / fname)
def eval_search_cuhk(
    gallery_dataset,
    query_dataset,
    gallery_dets,
    gallery_feats,
    query_box_feats,
    query_dets,
    query_feats,
    k1=10,
    k2=3,
    det_thresh=0.5,
    cbgm=False,
    gallery_size=100,
):
    """
    gallery_dataset/query_dataset: an instance of BaseDataset
    gallery_dets (list of ndarray): n_det x [x1, y1, x2, y2, score] per image
    gallery_feats (list of ndarray): n_det x D features per image
    query_box_feats (list of ndarray): D dimensional features per query image
    det_thresh (float): filter out gallery detections whose scores are below this
    gallery_size (int): gallery size [-1, 50, 100, 500, 1000, 2000, 4000];
        -1 for using the full set
    """
    assert len(gallery_dataset) == len(gallery_dets)
    assert len(gallery_dataset) == len(gallery_feats)
    assert len(query_dataset) == len(query_box_feats)

    use_full_set = gallery_size == -1
    fname = "TestG{}".format(gallery_size if not use_full_set else 50)
    protoc = loadmat(osp.join(gallery_dataset.root, "annotation/test/train_test", fname + ".mat"))
    protoc = protoc[fname].squeeze()

    # mapping from gallery image to (det, feat)
    annos = gallery_dataset.annotations
    name_to_det_feat = {}
    for anno, det, feat in zip(annos, gallery_dets, gallery_feats):
        name = anno["img_name"]
        if len(det) > 0:
            scores = det[:, 4].ravel()
            inds = np.where(scores >= det_thresh)[0]
            if len(inds) > 0:
                name_to_det_feat[name] = (det[inds], feat[inds])

    aps = []
    accs = []
    topk = [1, 5, 10]
    ret = {"image_root": gallery_dataset.img_prefix, "results": []}
    for i in range(len(query_dataset)):
        y_true, y_score = [], []
        imgs, rois = [], []
        count_gt, count_tp = 0, 0

        # get L2-normalized feature vector
        feat_q = query_box_feats[i].ravel()

        # ignore the query image
        query_imname = str(protoc["Query"][i]["imname"][0, 0][0])
        query_roi = protoc["Query"][i]["idlocate"][0, 0][0].astype(np.int32)
        query_roi[2:] += query_roi[:2]
        query_gt = []
        tested = set([query_imname])

        name2sim = {}
        name2gt = {}
        sims = []
        imgs_cbgm = []
        # 1. Go through the gallery samples defined by the protocol
        for item in protoc["Gallery"][i].squeeze():
            gallery_imname = str(item[0][0])
            # some contain the query (gt not empty), some not
            gt = item[1][0].astype(np.int32)
            count_gt += gt.size > 0
            # compute distance between query and gallery dets
            if gallery_imname not in name_to_det_feat:
                continue
            det, feat_g = name_to_det_feat[gallery_imname]
            # no detection in this gallery image, skip it
            if det.shape[0] == 0:
                continue
            # get L2-normalized feature matrix NxD
            assert feat_g.size == np.prod(feat_g.shape[:2])
            feat_g = feat_g.reshape(feat_g.shape[:2])
            # compute cosine similarities
            sim = feat_g.dot(feat_q).ravel()

            if gallery_imname in name2sim:
                continue
            name2sim[gallery_imname] = sim
            name2gt[gallery_imname] = gt
            sims.extend(list(sim))
            imgs_cbgm.extend([gallery_imname] * len(sim))
        # 2. Go through the remaining gallery images if using the full set
        if use_full_set:
            # TODO: support CBGM when using full set
            for gallery_imname in gallery_dataset.imgs:
                if gallery_imname in tested:
                    continue
                if gallery_imname not in name_to_det_feat:
                    continue
                det, feat_g = name_to_det_feat[gallery_imname]
                # get L2-normalized feature matrix NxD
                assert feat_g.size == np.prod(feat_g.shape[:2])
                feat_g = feat_g.reshape(feat_g.shape[:2])
                # compute cosine similarities
                sim = feat_g.dot(feat_q).ravel()
                # guaranteed no target query in these gallery images
                label = np.zeros(len(sim), dtype=np.int32)
                y_true.extend(list(label))
                y_score.extend(list(sim))
                imgs.extend([gallery_imname] * len(sim))
                rois.extend(list(det))

        if cbgm:
            # -------- Context Bipartite Graph Matching (CBGM) -------- #
            sims = np.array(sims)
            imgs_cbgm = np.array(imgs_cbgm)
            # only process the top-k1 gallery images for efficiency
            inds = np.argsort(sims)[-k1:]
            imgs_cbgm = set(imgs_cbgm[inds])
            for img in imgs_cbgm:
                sim = name2sim[img]
                det, feat_g = name_to_det_feat[img]
                # only regard the people with top-k2 detection confidence
                # in the query image as context information
                qboxes = query_dets[i][:k2]
                qfeats = query_feats[i][:k2]
                assert (query_roi - qboxes[0][:4]).sum() <= 0.001, \
                    "query_roi must be the first one in qboxes"

                # build the bipartite graph and run the Kuhn-Munkres (K-M)
                # algorithm to find the best match
                graph = []
                for indx_i, pfeat in enumerate(qfeats):
                    for indx_j, gfeat in enumerate(feat_g):
                        graph.append((indx_i, indx_j, (pfeat * gfeat).sum()))
                km_res, max_val = run_kuhn_munkres(graph)

                # revise the similarity between the query person and its match
                for indx_i, indx_j, _ in km_res:
                    # 0 denotes the query roi
                    if indx_i == 0:
                        sim[indx_j] = max_val
                        break

        for gallery_imname, sim in name2sim.items():
            gt = name2gt[gallery_imname]
            det, feat_g = name_to_det_feat[gallery_imname]
            # assign label for each det
            label = np.zeros(len(sim), dtype=np.int32)
            if gt.size > 0:
                w, h = gt[2], gt[3]
                gt[2:] += gt[:2]
                query_gt.append({"img": str(gallery_imname), "roi": list(map(float, list(gt)))})
                iou_thresh = min(0.5, (w * h * 1.0) / ((w + 10) * (h + 10)))
                inds = np.argsort(sim)[::-1]
                sim = sim[inds]
                det = det[inds]
                # only set the first matched det as true positive
                for j, roi in enumerate(det[:, :4]):
                    if _compute_iou(roi, gt) >= iou_thresh:
                        label[j] = 1
                        count_tp += 1
                        break
            y_true.extend(list(label))
            y_score.extend(list(sim))
            imgs.extend([gallery_imname] * len(sim))
            rois.extend(list(det))
            tested.add(gallery_imname)

        # 3. Compute AP for this query (need to scale by the recall rate)
        y_score = np.asarray(y_score)
        y_true = np.asarray(y_true)
        assert count_tp <= count_gt
        recall_rate = count_tp * 1.0 / count_gt
        ap = 0 if count_tp == 0 else average_precision_score(y_true, y_score) * recall_rate
        aps.append(ap)
        inds = np.argsort(y_score)[::-1]
        y_score = y_score[inds]
        y_true = y_true[inds]
        accs.append([min(1, sum(y_true[:k])) for k in topk])
        # 4. Save result for JSON dump
        new_entry = {
            "query_img": str(query_imname),
            "query_roi": list(map(float, list(query_roi))),
            "query_gt": query_gt,
            "gallery": [],
        }
        # only record wrong results
        if int(y_true[0]):
            continue
        # only save top-10 predictions
        for k in range(10):
            new_entry["gallery"].append(
                {
                    "img": str(imgs[inds[k]]),
                    "roi": list(map(float, list(rois[inds[k]]))),
                    "score": float(y_score[k]),
                    "correct": int(y_true[k]),
                }
            )
        ret["results"].append(new_entry)

    print("search ranking:")
    print("  mAP = {:.2%}".format(np.mean(aps)))
    accs = np.mean(accs, axis=0)
    for i, k in enumerate(topk):
        print("  top-{:2d} = {:.2%}".format(k, accs[i]))

    write_json(ret, "vis/results.json")

    ret["mAP"] = np.mean(aps)
    ret["accs"] = accs
    return ret
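The matching above relies on _compute_iou(); a standard intersection-over-union for [x1, y1, x2, y2] boxes is sketched below (an assumption, not necessarily the repository's exact implementation).

def _compute_iou(box_a, box_b):
    # Intersection rectangle.
    x1 = max(box_a[0], box_b[0])
    y1 = max(box_a[1], box_b[1])
    x2 = min(box_a[2], box_b[2])
    y2 = min(box_a[3], box_b[3])
    inter = max(0, x2 - x1) * max(0, y2 - y1)
    # Union = sum of the two areas minus the overlap.
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return inter / (area_a + area_b - inter)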
def train_classifier(clf, itr_train, itr_valid, params):
    """Train a classifier.

    Args:
        clf (classifier): a classifier we wish to train.
        itr_train (Iterator): an iterator over training data.
        itr_valid (Iterator): an iterator over validation data.
        params (dict): flags for training.
    """
    # Dump the parameters we used to a JSON file.
    params_file = os.path.join(params['results_dir'], 'params.json')
    utils.write_json(params_file, params)

    run_avg_len = params['run_avg_len']
    max_steps = params['max_steps_train']
    write_freq = params['write_freq']
    # RALoss is an object which tracks the running average of a loss.
    ra_loss = RALoss('loss', run_avg_len)
    ra_error = RALoss('error', run_avg_len)
    ra_trainloss = RALoss('train-loss', run_avg_len)
    ra_trainerr = RALoss('train-err', run_avg_len)

    min_val_loss = sys.maxsize
    min_val_step = 0
    opt = tf.compat.v1.train.AdamOptimizer(learning_rate=params['lr'])
    finished_training = False
    start_printing = 0
    for i in range(max_steps):
        batch_x, batch_y = itr_train.next()
        with tf.GradientTape() as tape:
            train_loss, train_err = clf.get_loss(batch_x, batch_y)
            mean_train_loss = tf.reduce_mean(train_loss)

        val_batch_x, val_batch_y = itr_valid.next()
        valid_loss, valid_err = clf.get_loss(val_batch_x, val_batch_y)
        loss_list = [ra_loss, ra_error, ra_trainloss, ra_trainerr]
        losses = zip(loss_list, [
            tf.reduce_mean(l)
            for l in (valid_loss, valid_err, train_loss, train_err)
        ])
        utils.update_losses(losses)

        grads = tape.gradient(mean_train_loss, clf.weights)
        opt.apply_gradients(zip(grads, clf.weights))
        utils.print_losses(loss_list, i)

        curr_ra_loss = ra_loss.get_value()
        if (curr_ra_loss < min_val_loss and
                i - min_val_step > params['patience'] / 10) or i == max_steps - 1:
            # Early stopping: stop training when the validation loss stops
            # decreasing. The second condition ensures we don't checkpoint
            # every step early on.
            min_val_loss = curr_ra_loss
            min_val_step = i
            save_path, ckpt = utils.checkpoint_model(clf, params['ckptdir'])
            logging.info('Step {:d}: Checkpointed to {}'.format(i, save_path))
        if i - min_val_step > params['patience'] or i == max_steps - 1:
            ckpt.restore(save_path)
            finished_training = True
            logging.info('Best validation loss was {:.3f} at step {:d}'
                         ' - stopping training'.format(min_val_loss, min_val_step))
        if i % write_freq == 0 or finished_training:
            utils.write_losses_to_log(loss_list, range(start_printing, i + 1),
                                      params['logdir'])
            start_printing = i + 1
            utils.plot_losses(params['figdir'], loss_list)
            logging.info('Step {:d}: Wrote losses and plots'.format(i))
        if finished_training:
            break
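RALoss is used above but not defined. A minimal sketch consistent with how it is used (constructed with a name and a window length, updated through utils.update_losses, read via get_value()) could be the following; it is an assumption, not the original class.

import collections


class RALoss:
    """Running average of a scalar loss over the last `window` updates (assumed)."""

    def __init__(self, name, window):
        self.name = name
        self.history = collections.deque(maxlen=window)

    def update(self, value):
        # Push the newest value; the deque drops the oldest automatically.
        self.history.append(float(value))

    def get_value(self):
        # Average over the window; return a large sentinel before any update
        # so early-stopping comparisons still behave.
        if not self.history:
            return float('inf')
        return sum(self.history) / len(self.history)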