Code Example #1
File: configer.py  Project: kaczmarj/Bioinfor-DeepATT
    def __init__(self, config, run_id=None, verbosity=0):
        """
        Class to parse the configuration json file. Handles hyper-parameters for training, initialization of modules,
        checkpoint saving and the logging module.
        :param config: Dict containing configurations.
        :param run_id: Unique identifier for the training process. A timestamp is used by default.
        :param verbosity: default 0.
        """
        # 1\ Define the config and run_id.
        self._config = config
        if run_id is None:
            # use timestamp as the default run id
            self._run_id = datetime.now().strftime(r'%m%d_%H%M%S')
        else:
            self._run_id = str(run_id)
        self._verbosity = verbosity

        # 2\ Set _result_dir, _checkpoint_dir and _summary_dir where checkpoints and logs will be saved.
        save_dir = './result/'
        self._result_dir = os.path.join(save_dir, self.config['name'] + '/',
                                        self._run_id + '/')
        self._checkpoint_dir = os.path.join(save_dir,
                                            self.config['name'] + '/',
                                            self._run_id + '/', 'checkpoints/')
        self._summary_dir = os.path.join(save_dir, self.config['name'] + '/',
                                         self._run_id + '/', 'logs/')

        # 3\ Create directory for saving checkpoints and log.
        create_dirs([self.result_dir, self._checkpoint_dir, self.summary_dir])

        # 4\ Save the config file to the result dir.
        write_json(self.config, os.path.join(self.result_dir, 'config.json'))
        self.config['trainer']['args']['verbosity'] = verbosity
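
All of the examples on this page call project-local `write_json`/`read_json` helpers whose exact signatures differ between projects (some take `(content, fname)`, others `(fname, content)`). As a point of reference, here is a minimal sketch of what such helpers typically look like, using only the standard library; the `OrderedDict` hook and the indentation level are assumptions, not taken from any specific project above.

import json
from collections import OrderedDict
from pathlib import Path


def read_json(fname):
    # Parse a JSON file, preserving key order.
    fname = Path(fname)
    with fname.open('rt') as handle:
        return json.load(handle, object_hook=OrderedDict)


def write_json(content, fname):
    # Serialize `content` to `fname` with readable indentation.
    fname = Path(fname)
    with fname.open('wt') as handle:
        json.dump(content, handle, indent=4, sort_keys=False)
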
Code Example #2
    def execute(self, conf_path: str, input_path: str, output_path: str,
                on_adls: bool):
        """
        Clean a list of JSON files and writing them
        Args:
            conf_path: File path of the params.json
            input_path: Folder path to read raw files
            output_path: Folder path to write files
            on_adls: If the data are on the Azure Data Lake set true to use the correct package

        Returns: Nothing the data are directly write at the desired location

        """
        self.load_params(conf_path)
        self.data_lake = uts.connect_to_data_lake_store(
            self.params) if on_adls else None

        res = []
        for file in self.params.get("json"):
            json_file_name = "{}.json".format(file)
            read_path = path.join(input_path, json_file_name)
            self.logger.info(
                "Reading and parsing JSON from: {}".format(read_path))
            data = uts.read_json(read_path,
                                 self.data_lake,
                                 advanced_parsing=True)

            write_path = path.join(output_path, json_file_name)
            self.logger.info(
                "Writing the parsed JSON to: {}".format(write_path))
            uts.write_json(data, write_path, self.data_lake)
            res.append(data)

        return res
Code Example #3
def start_spiders():
    new_product_list = []
    start_time = time.strftime("%d.%m.%Y-%H.%M")

    # cel.ro spider
    cel = Cel(start_time)
    new_product_list.extend(cel.start_requests())

    # emag.ro spider
    emag = Emag(start_time)
    new_product_list.extend(emag.start_requests())

    # pcgarage.ro spider
    pcgarage = Pcgarage(start_time)
    #new_product_list.extend(pcgarage.start_requests())

    # altex.ro spider
    altex = Altex(start_time)
    #new_product_list.extend(altex.start_requests())

    # ceasboutique.ro spider
    ceasboutique = Ceasboutique(start_time)
    #new_product_list.extend(ceasboutique.start_requests())

    # compare new data with old data
    get_new_deals(new_product_list)
    # write new data
    if new_product_list:
        write_json(start_time, new_product_list)
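
The comparison step (`get_new_deals`) is not shown above; the sketch below is only an assumption of how such a comparison might work, with a hypothetical `url`/`price` product structure and a hypothetical `latest.json` file holding the previous run.

import json


def get_new_deals(new_product_list, old_path="latest.json"):
    # Hypothetical: load the previous run and report products whose price dropped.
    try:
        with open(old_path) as f:
            old_by_url = {p["url"]: p for p in json.load(f)}
    except FileNotFoundError:
        old_by_url = {}

    deals = []
    for product in new_product_list:
        old = old_by_url.get(product["url"])
        if old and product["price"] < old["price"]:
            deals.append(product)
    return deals
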
Code Example #4
File: data_manager.py  Project: Chezacar/AmurTiger2.0
    def _extract_1stframe(self, dir_path, json_path, relabel):
        if osp.exists(json_path):
            print("=> {} generated before, awesome!".format(json_path))
            split = read_json(json_path)
            return split['tracklets']

        print(
            "=> Automatically generating split (might take a while for the first time, have a coffee)"
        )
        pdirs = glob.glob(osp.join(dir_path, '*'))  # avoid .DS_Store
        print("Processing {} with {} person identities".format(
            dir_path, len(pdirs)))

        pid_container = set()
        for pdir in pdirs:
            pid = int(osp.basename(pdir))
            pid_container.add(pid)
        pid2label = {pid: label for label, pid in enumerate(pid_container)}

        tracklets = []
        for pdir in pdirs:
            pid = int(osp.basename(pdir))
            if relabel: pid = pid2label[pid]
            tdirs = glob.glob(osp.join(pdir, '*'))
            for tdir in tdirs:
                raw_img_paths = glob.glob(osp.join(tdir, '*.jpg'))
                num_imgs = len(raw_img_paths)

                if num_imgs < self.min_seq_len:
                    continue

                img_paths = []
                for img_idx in range(num_imgs):
                    # some tracklet starts from 0002 instead of 0001
                    img_idx_name = 'F' + str(img_idx + 1).zfill(4)
                    res = glob.glob(
                        osp.join(tdir, '*' + img_idx_name + '*.jpg'))
                    if len(res) == 0:
                        print(
                            "Warn: index name {} in {} is missing, jump to next"
                            .format(img_idx_name, tdir))
                        continue
                    img_paths.append(res[0])
                img_name = osp.basename(img_paths[0])
                if img_name.find('_') == -1:
                    # old naming format: 0001C6F0099X30823.jpg
                    camid = int(img_name[5]) - 1
                else:
                    # new naming format: 0001_C6_F0099_X30823.jpg
                    camid = int(img_name[6]) - 1
                img_paths = tuple(img_paths)
                tracklets.append((img_paths[0], pid, camid))

        print("Saving split to {}".format(json_path))
        split_dict = {
            'tracklets': tracklets,
        }
        write_json(split_dict, json_path)

        return tracklets
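
The method above follows a common cache-the-split-as-JSON pattern: return the parsed file if it already exists, otherwise build the split once and persist it. Below is a minimal, generic sketch of that pattern using the standard library; `build_fn` is a placeholder standing in for the directory walk above.

import json
import os.path as osp


def load_or_build_split(json_path, build_fn):
    # Reuse a previously generated split if it is already on disk.
    if osp.exists(json_path):
        with open(json_path) as f:
            return json.load(f)["tracklets"]

    # Otherwise build it once and cache it for the next run.
    tracklets = build_fn()
    with open(json_path, "w") as f:
        json.dump({"tracklets": tracklets}, f)
    return tracklets
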
Code Example #5
    def __init__(self, config, resume: bool, model, loss_function, optim):
        """
        构建模型训练器的基类,包含以下功能:
            - 初始化 CUDA 与并行
            - 初始化 模型与优化器
            - 导入参数
            - 存储模型断点,加载模型断点
        Args:
            config: 配置文件
            resume: 本次实验是否接最近一次的断点继续运行
            model: 模型
            optim: 优化器
        """

        self.n_gpu = config["n_gpu"]
        self.dev = self._prepare_device(self.n_gpu,
                                        use_cudnn=config["use_cudnn"])

        self.model = model.to(self.dev)
        if self.n_gpu > 1:
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=list(
                                                   range(self.n_gpu)))

        self.optimizer = optim
        self.loss_function = loss_function
        self.epochs = config["trainer"]["epochs"]
        self.save_period = config["trainer"]["save_period"]
        self.start_epoch = 1  # not a config option; reset when resume == True
        self.best_score = 0.0  # not a config option
        self.save_location = Path(config["save_location"])
        self.root_dir = self.save_location / config["name"]
        self.checkpoints_dir = self.root_dir / "checkpoints"
        self.tensorboardX_logs_dir = self.root_dir / "logs"
        self._prepare_empty_dir([
            self.save_location, self.root_dir, self.checkpoints_dir,
            self.tensorboardX_logs_dir
        ], resume)
        self.viz = TensorboardXWriter(self.tensorboardX_logs_dir.as_posix())
        self.visualize_metrics_period = config["visualize_metrics_period"]
        self.viz.writer.add_text(
            "Configuration",
            "```\n" + json.dumps(config, indent=2, sort_keys=False) + "\n```",
            global_step=1)
        self.viz.writer.add_text("Description",
                                 config["description"],
                                 global_step=1)

        if resume:
            self._resume_checkpoint()

        print("模型,优化器,参数,目录初始化完毕,本实验中使用的配置信息如下:")
        print(json.dumps(config, indent=2, sort_keys=False))

        config_save_path = os.path.join(self.root_dir, "config.json")
        write_json(config, config_save_path)
        self._print_networks([self.model])
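
`_prepare_device` is not shown above; here is a minimal sketch of what it might do, assuming it falls back to the CPU when no GPU is available and toggles cuDNN via the config flag (the warning text and exact behaviour are assumptions).

import torch


def _prepare_device(n_gpu_requested, use_cudnn=True):
    # Sketch: fall back to CPU when no GPU is available or none was requested.
    n_gpu_available = torch.cuda.device_count()
    if n_gpu_requested > 0 and n_gpu_available == 0:
        print("Warning: no GPU available, running on CPU instead.")
        n_gpu_requested = 0
    torch.backends.cudnn.enabled = use_cudnn
    device = torch.device("cuda:0" if n_gpu_requested > 0 else "cpu")
    return device
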
Code Example #6
    def __init__(self, args, options=""):
        """
        Class to parse the configuration json file. Handles hyperparameters for training, initialization of modules,
        checkpoint saving and the logging module.

        input:
            args: argparse.ArgumentParser pre-populated with the default arguments, e.g. the path to the `parameters.json` config file.
            options: iterable of custom CLI options (flags, type), specifying positions in the config dict whose values are to be replaced.
        """
        # parse default and custom cli options
        for opt in options:
            args.add_argument(*opt.flags, default=None, type=opt.type)
        args = args.parse_args()

        self.cfg_fname = Path(args.config)

        # load json file as python dictionary
        config = read_json(self.cfg_fname)

        config["src_data"] = args.src_data
        config["tgt_data"] = args.tgt_data

        config["src_data_prefix"] = args.src_data_prefix
        config["tgt_data_prefix"] = args.tgt_data_prefix

        # load config file and apply custom cli options
        self._config = _update_config(config, options, args)

        # set save directory where trained embedding and log will be saved
        save_dir_name = args.save_name if args.save_name else config[
            "src_data_prefix"] + "_" + config["tgt_data_prefix"]
        save_dir = Path(args.save) / save_dir_name

        timestamp = datetime.now().strftime(r'%m%d_%H%M%S')

        exper_name = self.config['name']

        print(f"Result will be saved in {save_dir}")

        self._save_dir = save_dir / 'best' / exper_name / timestamp
        self._log_dir = save_dir / 'log' / exper_name / timestamp

        self.save_dir.mkdir(parents=True, exist_ok=True)
        self.log_dir.mkdir(parents=True, exist_ok=True)

        # save updated config file to the checkpoint dir
        write_json(self.config, self.save_dir / "parameters.json")

        # configure logging module
        setup_logging(self.log_dir)
        self.log_levels = {
            0: logging.WARNING,
            1: logging.INFO,
            2: logging.DEBUG
        }
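
The `log_levels` mapping above is typically consumed by a small accessor that turns the integer verbosity into an actual logging level; a sketch of such an accessor follows (the method name and message are assumptions).

import logging


def get_logger(name, verbosity=2):
    log_levels = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG}
    assert verbosity in log_levels, \
        "verbosity option {} is invalid. Valid options are {}.".format(
            verbosity, list(log_levels.keys()))
    logger = logging.getLogger(name)
    logger.setLevel(log_levels[verbosity])
    return logger
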
Code Example #7
File: update_posters.py  Project: zb140/flexy
def execute():
    missing_posters = {"posters": [], "collections": []}

    missing_posters = add_posters_for_new_videos(
        video_type="movies", missing_posters=missing_posters)
    missing_posters = add_posters_for_collections(
        missing_posters=missing_posters)

    upload_new_posters(poster_type="movies")

    utils.write_json("missing_posters", missing_posters)
Code Example #8
def main():

  #############################################################################
  # 0.
  #

  # Check if tmp folder exists, otherwise create it
  check_create_folder(settings.tmp_dir)
  
  # Build the list with countries and states
  admin_areas = get_aa_list()

  for chart in settings.charts:
    ind_source = settings.src_auxiliary + str(settings.current_edition) + '-' + str(chart["id"]) + '.csv'
      
    global_avg = False
    # Calculate the global average for this chart    
    if "global_average" in chart and chart["global_average"]:
      global_avg = get_avg(chart, ind_source)
    
    for aa in admin_areas:
      iso = aa.lower()
      for lang in settings.langs:
        # Initialize the array that will be written to JSON
        json_data = {"name": iso, "iso": iso, "meta": {"title": chart["title"][lang], "label-x": chart["labelx"][lang], "label-y": chart["labely"][lang]}, "data": []}

        for serie in chart["series"]:
          if serie["id"] == 'country':
            # If we're dealing with a country, use the country name as label of serie
            serie_name = aa
          else:
            serie_name = serie["name"][lang]

          # Initialize the object for the serie    
          serie_to_append = {"name": serie_name, "id": serie["id"], "values": []}

          # Add a note to the serie
          if chart["note"]:
            serie_to_append["note"] = add_note(serie, ind_source, aa)

          # Generate the actual data
          serie_to_append["values"] = chart['function'](serie, ind_source, lang, aa, chart["years"],global_avg)

          json_data["data"].append(serie_to_append)

        # Write the list to a JSON file
        file_path = (settings.exp_aux_json).format(lang=lang,indicator=chart["export"],aa=iso)
        write_json(file_path, json_data)
  
  # Fully remove the temp directory
  clean_dir(settings.tmp_dir, True)

  print "All done. The auxiliary data has been prepared for use on global-climatescope.org."
Code Example #9
File: parse_config.py  Project: erytheis/BattLeDIM
    def __init__(self,
                 config=None,
                 resume=None,
                 modification=None,
                 run_id=None):
        """
        Class to parse the configuration json file. Handles hyperparameters for training, initialization of modules,
        checkpoint saving and the logging module.
        :param config: Dict containing configurations and hyperparameters for training, e.g. the contents of `config.json`.
        :param resume: String, path to the checkpoint being loaded.
        :param modification: Dict keychain:value, specifying positions in the config dict whose values are to be replaced.
        :param run_id: Unique identifier for the training process. Used to save checkpoints and training logs. A timestamp is used by default.
        """

        if config is None:
            config_path = UTILS_DIR / 'config.json'
            config = read_json(config_path)

        # load config file and apply modification
        self._config = _update_config(config, modification)
        self.resume = resume

        # set save_dir where trained model and log will be saved.
        save_dir = ROOT_DIR / self.config['trainer']['save_dir']

        # set the data_dir
        self.data_dir = ROOT_DIR / self.config['data_loader']['args'][
            'data_dir']

        exper_name = self.config['name']
        if run_id is None:  # use timestamp as default run-id
            run_id = datetime.now().strftime(r'%m%d_%H%M%S')
        self._save_dir = save_dir / 'models' / exper_name / run_id
        self._log_dir = save_dir / 'log' / exper_name / run_id

        # make directory for saving checkpoints and log.
        exist_ok = run_id == ''
        self.save_dir.mkdir(parents=True, exist_ok=exist_ok)
        self.log_dir.mkdir(parents=True, exist_ok=exist_ok)

        # save updated config file to the checkpoint dir
        write_json(self.config, self.save_dir / 'config.json')

        # configure logging module
        # setup_logging(self.log_dir)
        self.log_levels = {
            0: logging.WARNING,
            1: logging.INFO,
            2: logging.DEBUG
        }
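
A usage sketch of the directory layout this parser produces (`save_dir / 'models' / exper_name / run_id`, plus a parallel `log` tree), with an invented config; this only illustrates the convention and is not code from the project.

from datetime import datetime
from pathlib import Path

# Hypothetical minimal config, mirroring the keys read above.
config = {"name": "demo_experiment", "trainer": {"save_dir": "saved/"}}

run_id = datetime.now().strftime(r"%m%d_%H%M%S")
save_dir = Path(config["trainer"]["save_dir"]) / "models" / config["name"] / run_id
log_dir = Path(config["trainer"]["save_dir"]) / "log" / config["name"] / run_id

save_dir.mkdir(parents=True, exist_ok=False)
log_dir.mkdir(parents=True, exist_ok=False)
print(f"checkpoints -> {save_dir}\nlogs -> {log_dir}")
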
Code Example #10
def main():

  #############################################################################
  # 0.
  #

  # Check if tmp folder exists, otherwise create it
  check_create_folder(settings.tmp_dir)
  
  # Build the list with countries and states
  admin_areas = get_aa_list()

  for chart in settings.charts:
    ind_source = settings.src_auxiliary + str(settings.current_edition) + '-' + str(chart["id"]) + '.csv'
      
    global_avg = False
    # Calculate the global average for this chart    
    if "global_average" in chart and chart["global_average"]:
      global_avg = get_avg(chart, ind_source)
    
    for aa in admin_areas:
      iso = aa.lower()
      for lang in settings.langs:
        # Initialize the array that will be written to JSON
        json_data = {"name": iso, "iso": iso, "meta": {"title": chart["title"][lang], "label-x": chart["labelx"][lang], "label-y": chart["labely"][lang]}, "data": []}

        for serie in chart["series"]:
          if serie["id"] == 'country':
            # If we're dealing with a country, use the country name as label of serie
            serie_name = aa
          else:
            serie_name = serie["name"][lang]

          # Initialize the object for the serie    
          serie_to_append = {"name": serie_name, "id": serie["id"], "values": []}

          # Generate the actual data
          serie_to_append["values"] = chart['function'](serie, ind_source, lang, aa, chart["years"],global_avg)

          json_data["data"].append(serie_to_append)

        # Write the list to a JSON file
        file_path = (settings.exp_aux_json).format(lang=lang,indicator=chart["export"],aa=iso)
        write_json(file_path, json_data)
  
  # Fully remove the temp directory
  clean_dir(settings.tmp_dir, True)

  print "All done. The auxiliary data has been prepared for use on global-climatescope.org."
Code Example #11
File: data_manager.py  Project: Chezacar/AmurTiger2.0
    def _prepare_split(self):
        if not osp.exists(self.split_path):
            print("Creating splits")
            mat_split_data = np.load(self.split_mat_path)['ls_set']

            num_splits = mat_split_data.shape[0]
            num_total_ids = mat_split_data.shape[1]
            assert num_splits == 10
            assert num_total_ids == 300
            num_ids_each = num_total_ids // 2  # integer division, used as a slice index below

            # pids in mat_split_data are indices, so we need to transform them
            # to real pids
            person_cam1_dirs = os.listdir(self.cam_1_path)
            person_cam2_dirs = os.listdir(self.cam_2_path)

            # make sure persons in one camera view can be found in the other camera view
            assert set(person_cam1_dirs) == set(person_cam2_dirs)

            splits = []
            for i_split in range(num_splits):
                # first 50% for testing and the remaining for training, following Wang et al. ECCV'14.
                train_idxs = sorted(
                    list(mat_split_data[i_split, num_ids_each:]))
                test_idxs = sorted(list(
                    mat_split_data[i_split, :num_ids_each]))

                train_idxs = [int(i) - 1 for i in train_idxs]
                test_idxs = [int(i) - 1 for i in test_idxs]

                # transform pids to person dir names
                train_dirs = [person_cam1_dirs[i] for i in train_idxs]
                test_dirs = [person_cam1_dirs[i] for i in test_idxs]

                split = {'train': train_dirs, 'test': test_dirs}
                splits.append(split)

            print(
                "Totally {} splits are created, following Wang et al. ECCV'14".
                format(len(splits)))
            print("Split file is saved to {}".format(self.split_path))
            write_json(splits, self.split_path)

        print("Splits created")
Code Example #12
    def __init__(self, annot_path, video_id_path, metadata_path, fps,
                 window_size, out_path):
        '''

        Given videos, we create segments (of frames) and their corresponding labels.
        A segment is a pair of start/end frame numbers (for a video) and its label is whether
        compression occurs in that segment.  We use the annotations (in seconds) of the videos
        to calculate the label.

        :param annot_path: str, path to the cpr annotations
        :param video_id_path: str, path to the video ids by train/val/test splits
        :param metadata_path: str, path to the metadata of the videos
        :param fps: int, fps at which the videos were converted to frames
        :param window_size: int, number of frames in a sliding window
        :param out_path: str, path to output the segments and labels json

        '''

        self.fps = fps
        self.window_size = window_size

        self.annot_json = read_json(annot_path)
        video_id_by_split = read_json(video_id_path)
        self.metadata_path = read_json(metadata_path)

        # store each split here
        all_data = {}

        # loop thru each data split
        for split_type in video_id_by_split.keys():
            video_id_list = video_id_by_split[
                split_type]  # retrieve a video id
            segments, labels = self._create_segments_labels(
                video_id_list)  # create the segments/labels
            data = {
                'segments': segments,
                'labels': labels
            }  # store both in a dict
            all_data[split_type] = data  # store for entire video

        # write all to disk
        out_path = os.path.join(out_path, 'segments_and_labels.json')
        write_json(all_data, out_path, indent=None)
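
`_create_segments_labels` is not shown above; the sketch below is an assumption of how seconds-based annotations could be turned into frame windows and labels, marking a window positive when it overlaps any annotated interval.

def create_segments_labels(num_frames, fps, window_size, annotations_sec):
    # annotations_sec: hypothetical list of (start_sec, end_sec) CPR intervals.
    annotated_frames = [(int(s * fps), int(e * fps)) for s, e in annotations_sec]

    segments, labels = [], []
    for start in range(0, num_frames - window_size + 1, window_size):
        end = start + window_size
        overlaps = any(start < a_end and end > a_start
                       for a_start, a_end in annotated_frames)
        segments.append((start, end))
        labels.append(int(overlaps))
    return segments, labels


# e.g. a 10 s video at 30 fps, 64-frame windows, CPR annotated between 2 s and 5 s
segs, labs = create_segments_labels(300, 30, 64, [(2.0, 5.0)])
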
Code Example #13
def execute():
    """
    Compare Trakt list with Plex collections to find missing videos.
    """
    missing_videos = {}

    plex = PlexServer(settings.PLEX_URL, settings.PLEX_TOKEN)

    sections_by_type = utils.get_sections_by_type(plex=plex)

    for section_title in sections_by_type["movies"]:
        section_config = utils.open_trakt_json("movies")
        section = plex.library.section(section_title)

        trakt_videos = section_config.keys()

        for trakt_video in trakt_videos:
            all_collections = section_config[trakt_video].get(
                "collections", None)
            if all_collections:
                collections = [
                    collection for collection in all_collections
                    if collection not in settings.IGNORE_MISSING_VIDEOS
                ]

                if collections:
                    if not any(x for x in section.all()
                               if "{title} ({year})".format(
                                   title=x.title, year=x.year) == trakt_video):

                        for collection in collections:
                            print(
                                "Missing '{title}' for collection '{collection}'"
                                .format(title=trakt_video,
                                        collection=collection))
                            try:
                                missing_videos[collection].append(trakt_video)
                            except KeyError:
                                missing_videos[collection] = [trakt_video]

    utils.write_json("missing_videos", missing_videos)
Code Example #14
File: parse_config.py  Project: wangah/deeplabv3-plus
    def __init__(self, config, resume=None, modification=None, run_id=None):
        """
        Class to parse the configuration json file. Handles hyperparameters for training, initialization of modules,
        checkpoint saving and the logging module.
        :param config: Dict containing configurations and hyperparameters for training, e.g. the contents of `config.json`.
        :param resume: String, path to the checkpoint being loaded.
        :param modification: Dict keychain:value, specifying positions in the config dict whose values are to be replaced.
        :param run_id: Unique identifier for the training process. Used to save checkpoints and training logs. A timestamp is used by default.
        """
        # load config file and apply modification
        self._config = _update_config(config, modification)
        self.resume = resume

        # set save_dir where trained model and log will be saved.
        save_dir = Path(self.config["trainer"]["save_dir"])

        exper_name = self.config["name"]
        if run_id is None:  # use timestamp as default run-id
            run_id = datetime.now().strftime(r"%m%d_%H%M%S")
        self._save_dir = save_dir / "checkpoints" / exper_name / run_id
        self._log_dir = save_dir / "logs" / exper_name / run_id
        self._tensorboard_dir = save_dir / "runs" / exper_name / run_id

        # make directory for saving checkpoints, logs, and tensorboard files.
        exist_ok = run_id == ""
        self.save_dir.mkdir(parents=True, exist_ok=exist_ok)
        self.log_dir.mkdir(parents=True, exist_ok=exist_ok)
        self.tensorboard_dir.mkdir(parents=True, exist_ok=exist_ok)

        # save updated config file to the checkpoint dir
        write_json(self.config, self.save_dir / "config.json")

        # configure logging module
        setup_logging(self.log_dir)
        self.log_levels = {
            0: logging.WARNING,
            1: logging.INFO,
            2: logging.DEBUG
        }
Code Example #15
def main():

  #############################################################################
  # 0.
  #

  # Check if tmp folder exists, otherwise create it
  if check_dir(settings.tmp_dir) == True:
    sys.exit(0)
  else:
    os.makedirs(settings.tmp_dir)

  # Run some checks on the source folder with core data.
  if not get_years():
    # Is there anything in the source folder to begin with?
    print "We were not able to find a XLSX file with core data in the folder: "\
          "%s. Make sure this folder contains at least one XLSX file named "\
          "after the year (eg. 2014.xlsx). Check the readme for more info "\
          "about the required structure of these files.\n"\
          "Quiting..." % (settings.src_core)
    sys.exit(0)

  # Provide feedback that the script only processes XLSX files with properly
  # formatted filenames. (eg. 2014.xlsx)
  fn_pattern = re.compile('^20[0-9]{2}$')
  for f in os.listdir(settings.src_core):
    fn = os.path.splitext(f)[0]
    ext = os.path.splitext(f)[-1].lower()
    path = os.path.join(settings.src_core, fn)
    
    if not os.path.isdir(path):
      # Only check files
      if ext == ".xlsx":
        if not fn_pattern.match(fn):
          print "The XLSX file %s doesn't have a properly formatted year as "\
                "filename and will be ignored." % (f)
      else:
        print "The script only processes XLSX files. %s will be ignored." % (f)


  print "Loading the core and meta data..."

  # Build the different sets of admin areas with things we have to loop over.
  countries = build_set('country','type','iso',settings.src_meta_aa)
  states = build_set('state','type','iso',settings.src_meta_aa)
  admin_areas = countries | states
  
  # Build sets for the variables we loop over
  global index_param
  index_param = build_set('param','type','id',settings.src_meta_index)
  index_score = build_set('score','type','id',settings.src_meta_index)
  sp = list(index_score | index_param)

  # Build set for the years we're interested in
  global years
  years = get_years()
  global current_yr
  current_yr = max(years)


  # Read in the files with meta-data and set the scope to global
  global df_meta_aa
  df_meta_aa = pd.read_csv(settings.src_meta_aa,index_col='iso')
  global df_meta_index
  df_meta_index = pd.read_csv(settings.src_meta_index,index_col='id')


  #############################################################################
  # 1. Store the relevant core data in one DF (df_full)
  #
  #
  # Output: df_full
  #
  #             2014            2015
  # iso   ind   value   data    value   data
  # AR    0     1.2420  NaN     1.2235  NaN
  #       1.01  0.1802  78.17   0.1795  75.16
  # ...


  first_yr = True

  for yr in years:
    # All core data files are named after the year of the edition
    fn = settings.src_core + yr + '.xlsx'

    df_yr = pd.DataFrame()
    for sheet in settings.core_data_sheets:
      
      # Build an index to parse only the relevant columns
      cols_index = build_col_index(fn,sheet)

      # Read Excel (parsing only relevant cols)
      df_sheet = pd.read_excel(fn,sheet,parse_cols=cols_index)

      # Ensure that the iso codes don't contain strange characters. They can only
      # contain letters, numbers and hyphens. (eg. CN, CN-65 or IN-MP)
      df_sheet['iso'].replace(to_replace='[^a-zA-Z0-9-]', value='',inplace=True,regex=True) 

      # Append each sheet to a dataframe holding the data for that year
      df_yr = df_yr.append(df_sheet)

    # Set the index of the DF to the ISO code and ID of the indicator
    df_yr.set_index(['iso','id'],inplace=True)
    # Make sure the index is sorted so the slicing works well
    df_yr.sortlevel(inplace=True)

    # Rename the column 'score' to value
    df_yr.rename(columns={'score':'value'}, inplace=True)

    
    # Add an extra level in the hierarchy of the columns (Mutli-index)
    # containing an indication of the year

    # Create list that repeats 'value' for the amount of years available
    c = [yr] * len(df_yr.columns)
    # Add a level to the cols
    df_yr.columns = [c, df_yr.columns]

    if first_yr:
      # If it's the first year, we initialize the full DataFrame
      df_full = df_yr
      first_yr = False
    else:
      # Every subsequent year will have to be merged into df_full
      df_full = pd.merge(df_full,df_yr,how='outer',left_index=True,right_index=True)

  df_full.sortlevel(axis=1,inplace=True)

  #############################################################################
  # 2. CSV downloads
  #
  # For all the CSV exports, prepare a dataframe that combines the data with
  # the meta.

  print "Building the CSV files for the download section..."

  # For the CSV, we're only interested in the value column of each year
  df_full_csv = df_full.loc[:,(slice(None),'value')]
  df_full_csv.columns = df_full_csv.columns.get_level_values(0)

  # The full DF is a multi-index. Since the meta-files have a single index,
  # it is necessary to reset the indexes before joining on the column.
  df_full_csv = df_full_csv.reset_index()
  df_meta_aa_csv = df_meta_aa.reset_index()
  df_meta_index_csv = df_meta_index.reset_index()

  # Merge the country meta
  df_full_csv = pd.merge(df_full_csv,df_meta_aa_csv,on='iso')

  # Merge the index meta data
  df_full_csv = pd.merge(df_full_csv,df_meta_index_csv,on='id',suffixes=('_aa','_var'))

  # Re-index the DF on iso & id  and make sure it's sorted
  df_full_csv.set_index(['iso','id'],inplace=True)
  df_full_csv.sortlevel(inplace=True)

  # 2.0 Export the full dataset to CSV

  for lang in settings.langs:
    # Build a list with the meta-data that needs to be included
    columns = ['name:' + lang + '_aa','name:' + lang + '_var','type_var']
    columns = columns + list(years)

    file_path = (settings.exp_full_csv).format(lang=lang)
    df_full_csv.loc[slice(None),columns].to_csv(file_path,encoding='UTF-8',index=False)
  

  # 2.1 Generate the main CSV files

  # Slice the DF to only contain the score and parameters for the current year.
  df_main_csv = df_full_csv.loc[(slice(None),sp),:]

  for lang in settings.langs:
    # Pivot the DF and export it
    file_path = (settings.exp_current_csv).format(lang=lang, yr=current_yr)
    pivot_df(df_main_csv,'name:' + lang + '_aa','name:' + lang + '_var',current_yr).to_csv(file_path,encoding='UTF-8')


  # 2.3 Generate the country + state CSV files
  for aa in admin_areas:
    # Select the data of this admin area
    df_aa_csv = df_full_csv.loc[(aa,slice(None)),:]
    for lang in settings.langs:
      # Include the name of the var, its type and the years
      columns = ['name:' + lang + '_var','type_var'] + list(years)

      # Select the proper columns and generate the CSV
      file_path = (settings.exp_aa_csv).format(lang = lang, aa = aa.lower())
      df_aa_csv.loc[slice(None),columns].to_csv(file_path,encoding='UTF-8',index=False)


  #############################################################################
  # 3. Calculate the rankings
  #
  #
  # Output: df_full
  #
  #             2014                    2015
  #             value   data  gr  sr    value  data  gr  sr
  # iso   id
  # AR    0     1.2420  NaN   13  NaN   1.2235 NaN   12  NaN
  #       1.01  0.1802  73.1  5   NaN   0.1795 75.8  6   NaN
  # ...


  print "Calculating the ranking..."

  # 3.0 Prepare the structure
  # Add placeholder cols with NaN that can be updated later with df.update()
  for year in years:
    for rank in ('gr', 'sr'):
      df_full[(year,rank)] = np.nan
  # Make sure its sorted
  df_full.sortlevel(axis=1,inplace=True)

 
  # 3.1 Global rank
  # The global rank (gr) is a rank of all the COUNTRIES in the project
  df_full = get_rank(countries,df_full,'gr')


  # 3.3 State rank
  # The state rank ('sr') ranks the STATES of a particular country
  for country in countries:
    # Check if there are any states or provinces for this country
    cs = build_set(country,'country','iso',settings.src_meta_aa)
    if cs:
      df_full = get_rank(cs,df_full,'sr')


  #############################################################################
  # 4. JSON api
  #

  print "Building the JSON files for the API..."

  # 4.1 Generate the main JSON file
  for lang in settings.langs:
    # The JSON will contain a list with dicts
    json_data = []
    
    # Loop over the countries list
    for country in countries:
      country_data = build_json_aa(country,df_full,lang, historic=True)
      # Sort the list of states / provinces
      if country_data['states']:
        country_data['states'] = sorted(country_data['states'], key=lambda k: k['name'])
      json_data.append(country_data)

    # Sort the list of countries by name
    sorted_data = sorted(json_data, key=lambda k: k['name'])

    # Write the list to a JSON file
    file_path = (settings.exp_core).format(lang=lang)
    write_json(file_path, sorted_data)


  # 4.3 Generate the country + state JSON files
  for aa in admin_areas:
    for lang in settings.langs:
      # Get the data for this admin area in a dict
      json_data = build_json_aa(aa,df_full,lang,indicators=True,historic=True)

      # Write the dict to a JSON file
      file_path = (settings.exp_aa).format(lang=lang,aa=aa.lower())
      write_json(file_path, json_data)


  # Fully remove the temp directory
  clean_dir(settings.tmp_dir , True)

  print "All done. The data has been prepared for use on global-climatescope.org."
Code Example #16
    def save_file(self, content, fname):

        write_json(content, self.save_dir / fname)
Code Example #17
def eval_search_cuhk(
    gallery_dataset,
    query_dataset,
    gallery_dets,
    gallery_feats,
    query_box_feats,
    query_dets,
    query_feats,
    k1=10,
    k2=3,
    det_thresh=0.5,
    cbgm=False,
    gallery_size=100,
):
    """
    gallery_dataset/query_dataset: an instance of BaseDataset
    gallery_det (list of ndarray): n_det x [x1, x2, y1, y2, score] per image
    gallery_feat (list of ndarray): n_det x D features per image
    query_feat (list of ndarray): D dimensional features per query image
    det_thresh (float): filter out gallery detections whose scores are below this
    gallery_size (int): gallery size [-1, 50, 100, 500, 1000, 2000, 4000]
                        -1 for using full set
    """
    assert len(gallery_dataset) == len(gallery_dets)
    assert len(gallery_dataset) == len(gallery_feats)
    assert len(query_dataset) == len(query_box_feats)

    use_full_set = gallery_size == -1
    fname = "TestG{}".format(gallery_size if not use_full_set else 50)
    protoc = loadmat(osp.join(gallery_dataset.root, "annotation/test/train_test", fname + ".mat"))
    protoc = protoc[fname].squeeze()

    # mapping from gallery image to (det, feat)
    annos = gallery_dataset.annotations
    name_to_det_feat = {}
    for anno, det, feat in zip(annos, gallery_dets, gallery_feats):
        name = anno["img_name"]
        if det != []:
            scores = det[:, 4].ravel()
            inds = np.where(scores >= det_thresh)[0]
            if len(inds) > 0:
                name_to_det_feat[name] = (det[inds], feat[inds])

    aps = []
    accs = []
    topk = [1, 5, 10]
    ret = {"image_root": gallery_dataset.img_prefix, "results": []}
    for i in range(len(query_dataset)):
        y_true, y_score = [], []
        imgs, rois = [], []
        count_gt, count_tp = 0, 0
        # get L2-normalized feature vector
        feat_q = query_box_feats[i].ravel()
        # ignore the query image
        query_imname = str(protoc["Query"][i]["imname"][0, 0][0])
        query_roi = protoc["Query"][i]["idlocate"][0, 0][0].astype(np.int32)
        query_roi[2:] += query_roi[:2]
        query_gt = []
        tested = set([query_imname])

        name2sim = {}
        name2gt = {}
        sims = []
        imgs_cbgm = []
        # 1. Go through the gallery samples defined by the protocol
        for item in protoc["Gallery"][i].squeeze():
            gallery_imname = str(item[0][0])
            # some contain the query (gt not empty), some not
            gt = item[1][0].astype(np.int32)
            count_gt += gt.size > 0
            # compute distance between query and gallery dets
            if gallery_imname not in name_to_det_feat:
                continue
            det, feat_g = name_to_det_feat[gallery_imname]
            # no detection in this gallery, skip it
            if det.shape[0] == 0:
                continue
            # get L2-normalized feature matrix NxD
            assert feat_g.size == np.prod(feat_g.shape[:2])
            feat_g = feat_g.reshape(feat_g.shape[:2])
            # compute cosine similarities
            sim = feat_g.dot(feat_q).ravel()

            if gallery_imname in name2sim:
                continue
            name2sim[gallery_imname] = sim
            name2gt[gallery_imname] = gt
            sims.extend(list(sim))
            imgs_cbgm.extend([gallery_imname] * len(sim))
        # 2. Go through the remaining gallery images if using full set
        if use_full_set:
            # TODO: support CBGM when using full set
            for gallery_imname in gallery_dataset.imgs:
                if gallery_imname in tested:
                    continue
                if gallery_imname not in name_to_det_feat:
                    continue
                det, feat_g = name_to_det_feat[gallery_imname]
                # get L2-normalized feature matrix NxD
                assert feat_g.size == np.prod(feat_g.shape[:2])
                feat_g = feat_g.reshape(feat_g.shape[:2])
                # compute cosine similarities
                sim = feat_g.dot(feat_q).ravel()
                # guaranteed no target query in these gallery images
                label = np.zeros(len(sim), dtype=np.int32)
                y_true.extend(list(label))
                y_score.extend(list(sim))
                imgs.extend([gallery_imname] * len(sim))
                rois.extend(list(det))

        if cbgm:
            # -------- Context Bipartite Graph Matching (CBGM) ------- #
            sims = np.array(sims)
            imgs_cbgm = np.array(imgs_cbgm)
            # only process the top-k1 gallery images for efficiency
            inds = np.argsort(sims)[-k1:]
            imgs_cbgm = set(imgs_cbgm[inds])
            for img in imgs_cbgm:
                sim = name2sim[img]
                det, feat_g = name_to_det_feat[img]
                # only regard the people with top-k2 detection confidence
                # in the query image as context information
                qboxes = query_dets[i][:k2]
                qfeats = query_feats[i][:k2]
                assert (
                    query_roi - qboxes[0][:4]
                ).sum() <= 0.001, "query_roi must be the first one in qboxes"

                # build the bipartite graph and run Kuhn-Munkres (K-M) algorithm
                # to find the best match
                graph = []
                for indx_i, pfeat in enumerate(qfeats):
                    for indx_j, gfeat in enumerate(feat_g):
                        graph.append((indx_i, indx_j, (pfeat * gfeat).sum()))
                km_res, max_val = run_kuhn_munkres(graph)

                # revise the similarity between query person and its matching
                for indx_i, indx_j, _ in km_res:
                    # 0 denotes the query roi
                    if indx_i == 0:
                        sim[indx_j] = max_val
                        break
        for gallery_imname, sim in name2sim.items():
            gt = name2gt[gallery_imname]
            det, feat_g = name_to_det_feat[gallery_imname]
            # assign label for each det
            label = np.zeros(len(sim), dtype=np.int32)
            if gt.size > 0:
                w, h = gt[2], gt[3]
                gt[2:] += gt[:2]
                query_gt.append({"img": str(gallery_imname), "roi": list(map(float, list(gt)))})
                iou_thresh = min(0.5, (w * h * 1.0) / ((w + 10) * (h + 10)))
                inds = np.argsort(sim)[::-1]
                sim = sim[inds]
                det = det[inds]
                # only set the first matched det as true positive
                for j, roi in enumerate(det[:, :4]):
                    if _compute_iou(roi, gt) >= iou_thresh:
                        label[j] = 1
                        count_tp += 1
                        break
            y_true.extend(list(label))
            y_score.extend(list(sim))
            imgs.extend([gallery_imname] * len(sim))
            rois.extend(list(det))
            tested.add(gallery_imname)
        # 3. Compute AP for this query (need to scale by recall rate)
        y_score = np.asarray(y_score)
        y_true = np.asarray(y_true)
        assert count_tp <= count_gt
        recall_rate = count_tp * 1.0 / count_gt
        ap = 0 if count_tp == 0 else average_precision_score(y_true, y_score) * recall_rate
        aps.append(ap)
        inds = np.argsort(y_score)[::-1]
        y_score = y_score[inds]
        y_true = y_true[inds]
        accs.append([min(1, sum(y_true[:k])) for k in topk])
        # 4. Save result for JSON dump
        new_entry = {
            "query_img": str(query_imname),
            "query_roi": list(map(float, list(query_roi))),
            "query_gt": query_gt,
            "gallery": [],
        }
        # only record wrong results
        if int(y_true[0]):
            continue
        # only save top-10 predictions
        for k in range(10):
            new_entry["gallery"].append(
                {
                    "img": str(imgs[inds[k]]),
                    "roi": list(map(float, list(rois[inds[k]]))),
                    "score": float(y_score[k]),
                    "correct": int(y_true[k]),
                }
            )
        ret["results"].append(new_entry)

    print("search ranking:")
    print("  mAP = {:.2%}".format(np.mean(aps)))
    accs = np.mean(accs, axis=0)
    for i, k in enumerate(topk):
        print("  top-{:2d} = {:.2%}".format(k, accs[i]))

    write_json(ret, "vis/results.json")

    ret["mAP"] = np.mean(aps)
    ret["accs"] = accs
    return ret
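
The similarity computation above relies on the features being L2-normalized, so a plain dot product is already a cosine similarity; here is a tiny numpy sketch of that identity, mirroring the `feat_g.dot(feat_q)` call.

import numpy as np

rng = np.random.default_rng(0)
feat_q = rng.normal(size=256)
feat_g = rng.normal(size=(5, 256))      # 5 gallery detections, 256-D each

# L2-normalize the query vector and the gallery matrix row-wise
feat_q /= np.linalg.norm(feat_q)
feat_g /= np.linalg.norm(feat_g, axis=1, keepdims=True)

# dot products of unit vectors are cosine similarities, as in feat_g.dot(feat_q)
sim = feat_g.dot(feat_q).ravel()
print(sim)
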
Code Example #18
def train_classifier(clf, itr_train, itr_valid, params):
    """Train a classifier.

  Args:
    clf (classifier): a classifier we wish to train.
    itr_train (Iterator): an iterator over training data.
    itr_valid (Iterator): an iterator over validation data.
    params (dict): flags for training.

  """
    # Dump the parameters we used to a JSON file.
    params_file = os.path.join(params['results_dir'], 'params.json')
    utils.write_json(params_file, params)

    run_avg_len = params['run_avg_len']
    max_steps = params['max_steps_train']
    write_freq = params['write_freq']

    # RALoss is an object which tracks the running average of a loss.
    ra_loss = RALoss('loss', run_avg_len)
    ra_error = RALoss('error', run_avg_len)
    ra_trainloss = RALoss('train-loss', run_avg_len)
    ra_trainerr = RALoss('train-err', run_avg_len)

    min_val_loss = sys.maxsize
    min_val_step = 0
    opt = tf.compat.v1.train.AdamOptimizer(learning_rate=params['lr'])
    finished_training = False
    start_printing = 0
    for i in range(max_steps):
        batch_x, batch_y = itr_train.next()
        with tf.GradientTape() as tape:
            train_loss, train_err = clf.get_loss(batch_x, batch_y)
            mean_train_loss = tf.reduce_mean(train_loss)

        val_batch_x, val_batch_y = itr_valid.next()
        valid_loss, valid_err = clf.get_loss(val_batch_x, val_batch_y)
        loss_list = [ra_loss, ra_error, ra_trainloss, ra_trainerr]
        losses = zip(loss_list, [
            tf.reduce_mean(l)
            for l in (valid_loss, valid_err, train_loss, train_err)
        ])
        utils.update_losses(losses)

        grads = tape.gradient(mean_train_loss, clf.weights)
        opt.apply_gradients(zip(grads, clf.weights))

        utils.print_losses(loss_list, i)
        curr_ra_loss = ra_loss.get_value()
        if (curr_ra_loss < min_val_loss and \
                i - min_val_step > params['patience'] / 10) \
                or i == max_steps - 1:
            # Early stopping: stop training when validation loss stops decreasing.
            # The second condition ensures we don't checkpoint every step early on.
            min_val_loss = curr_ra_loss
            min_val_step = i
            save_path, ckpt = utils.checkpoint_model(clf, params['ckptdir'])
            logging.info('Step {:d}: Checkpointed to {}'.format(i, save_path))

        if i - min_val_step > params['patience'] or i == max_steps - 1:
            ckpt.restore(save_path)
            finished_training = True
            logging.info('Best validation loss was {:.3f} at step {:d}'
                         ' - stopping training'.format(min_val_loss,
                                                       min_val_step))

        if i % write_freq == 0 or finished_training:
            # import pdb; pdb.set_trace()
            utils.write_losses_to_log(loss_list, range(start_printing, i + 1),
                                      params['logdir'])
            start_printing = i + 1
            utils.plot_losses(params['figdir'], loss_list)
            logging.info('Step {:d}: Wrote losses and plots'.format(i))

        if finished_training:
            break
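
The checkpoint/patience logic above boils down to a small skeleton: checkpoint whenever the (running-average) validation loss improves, and stop once it has not improved for `patience` steps; a minimal sketch with an invented loss sequence.

import sys

patience = 3
min_val_loss, min_val_step = sys.maxsize, 0
val_losses = [0.9, 0.8, 0.75, 0.76, 0.77, 0.78, 0.79]  # dummy validation losses

for step, loss in enumerate(val_losses):
    if loss < min_val_loss:
        min_val_loss, min_val_step = loss, step
        print(f"step {step}: checkpoint (val loss {loss:.2f})")
    if step - min_val_step > patience:
        print(f"stopping: no improvement since step {min_val_step}")
        break
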