Example #1
def conceptnet_preprocess_pipeline(_conceptnet_path,
                                   num_workers=20,
                                   check_exist=True):
    _conceptnet_en_path = _conceptnet_path + "_en.csv"
    _lemma_dict_out_path = _conceptnet_en_path + "_lemma_dict.json"  # lemma to a list of []
    _conceptnet_id2idx_out_path = _conceptnet_en_path + "_id2idx.txt"
    _conceptnet_rel2idx_out_path = _conceptnet_en_path + "_rel2idx.txt"
    _clean_lemma2tags_out_path = _conceptnet_en_path + "_clean_lemma2tags.json"
    _path_list = [
        _conceptnet_en_path, _lemma_dict_out_path, _conceptnet_id2idx_out_path,
        _conceptnet_rel2idx_out_path, _clean_lemma2tags_out_path
    ]

    if not (check_exist and all(file_exists(_path) for _path in _path_list)):
        if not file_exists(_conceptnet_en_path):
            clean_non_english_item(_conceptnet_path, _conceptnet_en_path)
        build_conceptnet_en_map_dump(_conceptnet_en_path, num_workers,
                                     _lemma_dict_out_path)
        build_conceptnet_id2idx(_conceptnet_en_path,
                                _conceptnet_id2idx_out_path)
        build_conceptnet_rel2idx(_conceptnet_en_path,
                                 _conceptnet_rel2idx_out_path)
        build_clean_lemma2tags(_conceptnet_en_path,
                               _clean_lemma2tags_out_path,
                               num_parallels=num_workers)
    return _path_list
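A minimal usage sketch of the pipeline above (the ConceptNet dump path below is hypothetical):

paths = conceptnet_preprocess_pipeline("data/conceptnet-assertions-5.7.0.csv",
                                       num_workers=8)
en_csv_path, lemma_dict_path, id2idx_path, rel2idx_path, lemma2tags_path = paths

Example #2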
def check_supplied_arguments(soft_match):
    # check whether particular command-line arguments were provided
    if soft_match:
        # if the soft_match argument was provided, verify it points to an existing file and save it to the global config
        if cm.file_exists(Path(soft_match)):
            gc.CONFIG_CUSTOM_SOFT_MATCH = str(Path(soft_match))
    process_download_inquiries()
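Example #3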
    def load_file(self, filepath=''):
        if len(str(filepath).strip()) == 0:
            filepath = self.filepath

        if cm.file_exists(filepath):
            with open(filepath) as json_file:
                self.json_data = json.load(json_file)
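Example #4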
    def __init__(self, cfg_path):
        self.loaded = False
        self.cfg_path = cfg_path
        self.cfg = {}

        if cm.file_exists(cfg_path):
            with open(cfg_path, 'r') as ymlfile:
                self.cfg = yaml.safe_load(ymlfile)
            # self.prj_wrkdir = os.path.dirname(os.path.abspath(cfg_path))
            self.loaded = True
        else:
            self.cfg = None
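Example #5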
    def __init__(self, cfg_path=None, cfg_content_dict=None):
        self.loaded = False

        if cfg_path and cm.file_exists(cfg_path):
            with open(cfg_path, 'r') as ymlfile:
                self.cfg = yaml.safe_load(ymlfile)  # safe_load avoids arbitrary object construction
            # self.prj_wrkdir = os.path.dirname(os.path.abspath(study_cfg_path))
            self.loaded = True
        else:
            if cfg_content_dict:
                self.cfg = cfg_content_dict
                self.loaded = True
            else:
                self.cfg = None
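A brief usage sketch of this constructor, assuming the enclosing class is the ConfigData wrapper seen in Example #6 (file name and keys are hypothetical):

cfg = ConfigData(cfg_path="study.yaml")                     # load from a YAML file
cfg_inline = ConfigData(cfg_content_dict={"assay": "rna"})  # or wrap an in-memory dict
if cfg.loaded:
    print(cfg.cfg)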
Example #6
    def prepare_form(self, form_name):
        forms_location = Path(gc.SUBMISSION_FORMS_DIR + '/' + form_name + '/' +
                              self.req_obj.project)
        # identify paths for json and config (yaml) files
        fl_path_json_common = forms_location / (form_name + '.json')
        fl_path_json_assay = forms_location / (
            form_name + '_' + str(self.req_obj.assay).lower() + '.json')
        fl_path_json_schema = forms_location / (form_name + '_schema.json')
        fl_path_cfg_common = forms_location / (form_name + '.yaml')

        # fl_path_json_common = Path(gc.SUBMISSION_FORMS_DIR + '/' + form_name + '/' + form_name + '.json')
        # fl_path_json_assay = Path(gc.SUBMISSION_FORMS_DIR + '/' + form_name + '/' + form_name + '_' +
        #                           str(self.req_obj.assay).lower() + '.json')
        # fl_path_json_schema = Path(gc.SUBMISSION_FORMS_DIR + '/' + form_name + '/' + form_name + '_schema.json')
        # fl_path_cfg_common = Path(gc.SUBMISSION_FORMS_DIR + '/' + form_name + '/' + form_name + '.yaml')

        # check the value assigned to the current request's data_source_forms_assignment
        # and select assay config file accordingly
        if self.req_obj.data_source_forms_assignment == 'file':
            fl_path_cfg_assay = forms_location / (
                form_name + '_' + str(self.req_obj.assay).lower() + '.yaml')
        elif self.req_obj.data_source_forms_assignment == 'db':
            fl_path_cfg_assay = forms_location / (
                form_name + '_' + str(self.req_obj.assay).lower() + '_db.yaml')
        else:  # any other value falls back to the non-db (file) assay config as the default
            fl_path_cfg_assay = forms_location / (
                form_name + '_' + str(self.req_obj.assay).lower() + '.yaml')

        # check if assay specific json exists; if yes - use it, if not - use common one
        if cm.file_exists(fl_path_json_assay):
            fl_path_json = fl_path_json_assay
        else:
            fl_path_json = fl_path_json_common

        # load json and config files
        self.fl_json = FileJson(fl_path_json, self.req_obj.error,
                                self.req_obj.logger)
        self.fl_json_schema = FileJson(fl_path_json_schema, self.req_obj.error,
                                       self.req_obj.logger)
        self.fl_cfg_common = ConfigData(fl_path_cfg_common)
        self.fl_cfg_assay = ConfigData(fl_path_cfg_assay)
        # self.fl_cfg_dict = ConfigData(gc.CONFIG_FILE_DICTIONARY)

        # print(self.fl_json.json_data)
        # loop through all json keys and fill those with associated data
        self.get_json_keys(self.fl_json.json_data)
        # print(self.fl_json.json_data)

        # validate final json file against json schema (if present)
        self.validate_json(self.fl_json, self.fl_json_schema)
Example #7
    def get_column_values(self,
                          col_number,
                          header_row_number=None,
                          exclude_header=None):
        # set up default parameters; compare against None so explicitly
        # passed 0 / False values are preserved
        if header_row_number is None:
            header_row_number = 0
        if exclude_header is None:
            exclude_header = True

        col_values = []
        # adjust passed numbers to the 0-based numbering
        # col_number = col_number - 1
        # header_row_number = header_row_number - 1
        if cm.file_exists(self.filepath):
            self.logger.debug('Loading column #{} from file "{}"'.format(
                col_number, self.filepath))

            with xlrd.open_workbook(self.filepath) as wb:
                sheet = self.get_wksheet_name(wb)
                if sheet:
                    sheet.cell_value(0, 0)
                    if 0 <= col_number < sheet.ncols:  # valid 0-based column index
                        for i in range(sheet.nrows):
                            if i < header_row_number:
                                # skip all rows before the header
                                pass
                            elif i == header_row_number and exclude_header:
                                pass
                            else:
                                cell = sheet.cell(i, col_number)
                                cell_value = self.validate_cell_value(cell, wb)
                                col_values.append(cell_value)
                else:
                    col_values = None
                    # self.loaded = False
                    # return col_values
        else:
            # no file found
            _str = 'Loading content of the file "{}" failed since the file does not appear to exist.' \
                .format(self.filepath)
            self.error.add_error(_str)
            self.logger.error(_str)

            col_values = None

        return col_values
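A short usage sketch (the file-wrapper instance is illustrative; col_number and header_row_number are 0-based):

# values of the third column, skipping the header in row 0
values = request_file.get_column_values(2, header_row_number=0, exclude_header=True)
if values is None:
    print("column could not be loaded")

Example #8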
    def __init__(self, cfg_path, cfg_dict=None):
        self.loaded = False

        if cfg_dict is None:
            if cm.file_exists(cfg_path):
                with open(cfg_path, 'r') as ymlfile:
                    self.cfg = yaml.safe_load(ymlfile)
                # self.prj_wrkdir = os.path.dirname(os.path.abspath(cfg_path))
                self.loaded = True
            else:
                self.cfg = None
                # self.prj_wrkdir = None
        elif isinstance(cfg_dict, dict):
            self.cfg = cfg_dict
            self.loaded = True
        else:
            self.cfg = None
Example #9
    def set_value(self, value, yaml_path, delim=None):
        if not delim:
            delim = '/'

        out = False

        path_elems = yaml_path.split(delim)
        if not self.cfg:
            self.cfg = {}
        upd_item = self.cfg
        num_items = len(path_elems)
        cnt = 0

        for el in path_elems:
            cnt += 1
            if upd_item and el in upd_item:
                try:
                    if cnt < num_items:
                        if not upd_item[el]:
                            upd_item[el] = {}
                        upd_item = upd_item[el]
                    else:
                        upd_item[el] = value
                        out = True
                except Exception:
                    out = False
                    break
            else:
                if cnt < num_items:
                    upd_item[el] = {}
                    upd_item = upd_item[el]
                else:
                    upd_item[el] = value
                    out = True

        # self.cfg = upd_item
        if cm.file_exists(self.cfg_path):
            with open(self.cfg_path, 'w') as yaml_file:
                yaml_file.write(yaml.dump(self.cfg, default_flow_style=False))

        return out
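A usage sketch for set_value: the yaml_path is split on the delimiter and missing intermediate dictionaries are created on the way down (key names hypothetical):

cfg.set_value("localhost", "database/host")       # writes cfg.cfg["database"]["host"]
cfg.set_value(5432, "database.port", delim=".")   # same idea with a custom delimiter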
Example #10
    def get_file_content(self):
        if not self.logger:
            loc_log = logging.getLogger(StudyConfig.study_logger_name)
        else:
            loc_log = self.logger

        if not self.lineList:
            if cm.file_exists(self.filepath):
                loc_log.debug('Loading file content of "{}"'.format(
                    self.filepath))
                with open(self.filepath, "r") as fl:
                    self.lineList = [line.rstrip('\n') for line in fl]
                    self.loaded = True
            else:
                _str = 'Loading content of the file "{}" failed since the file does not appear to exist.'.format(
                    self.filepath)
                self.error.add_error(_str)
                loc_log.error(_str)
                self.lineList = None
                self.loaded = False
        return self.lineList
Example #11
    def evaluate_single_model(self, logsdir):
        metrics_file = join([logsdir, 'metrics.list'])
        assert file_exists(
            metrics_file), "Metrics file {} doesn't exist!".format(
                metrics_file)

        with open(metrics_file, 'rb') as f:
            metrics = pickle.load(f)

        p_b, p_l, p_s, r_b, g_b, g_l, m_b = zip(*metrics.values())
        pred_bboxes, pred_labels, pred_scores, refn_bboxes, gt_bboxes, gt_labels, mtsm_bboxes = self.get_list_of_metrics(
            [p_b, p_l, p_s, r_b, g_b, g_l, m_b])

        for iou_thresh in np.linspace(0.5, 1.0, 11):
            detector_stats = eval_detection_voc(pred_bboxes,
                                                pred_labels,
                                                pred_scores,
                                                gt_bboxes,
                                                gt_labels,
                                                iou_thresh=iou_thresh)
            bbox_align_stats = eval_detection_voc(refn_bboxes,
                                                  pred_labels,
                                                  pred_scores,
                                                  gt_bboxes,
                                                  gt_labels,
                                                  iou_thresh=iou_thresh)
            straddling_stats = eval_detection_voc(mtsm_bboxes,
                                                  pred_labels,
                                                  pred_scores,
                                                  gt_bboxes,
                                                  gt_labels,
                                                  iou_thresh=iou_thresh)

            sstr = self.pprint(detector_stats, bbox_align_stats,
                               straddling_stats, iou_thresh)
            with open(
                    join([logsdir, 'metrics_{:.2f}.table'.format(iou_thresh)]),
                    'w') as f:
                f.write(sstr)
Example #12
    def get_file_content(self):
        if not self.lineList:
            if cm.file_exists(self.filepath):
                self.logger.info('Loading file content of "{}"'.format(
                    self.filepath))

                with xlrd.open_workbook(self.filepath) as wb:
                    sheet = self.get_wksheet_name(wb)
                    if not sheet:
                        self.lineList = None
                        self.loaded = False
                        return self.lineList

                sheet.cell_value(0, 0)

                self.lineList = []  # start from an empty list before appending rows
                for i in range(sheet.nrows):
                    ln = []
                    for j in range(sheet.ncols):
                        # print(sheet.cell_value(i, j))
                        # ln.append('"' + sheet.cell_value(i,j) + '"')
                        cell = sheet.cell(i, j)
                        cell_value = self.validate_cell_value(cell, wb)
                        ln.append(cell_value)

                    self.lineList.append(ln)

                wb.unload_sheet(sheet.name)
                self.loaded = True
            else:
                _str = 'Loading content of the file "{}" failed since the file does not appear to exist.'.format(
                    self.filepath)
                self.error.add_error(_str)
                self.logger.error(_str)

                self.lineList = None
                self.loaded = False
        return self.lineList
Example #13
    def get_file_content(self):
        if not self.columnlist:
            if cm.file_exists(self.filepath):
                self.logger.debug('Loading file content of "{}"'.format(
                    self.filepath))

                with xlrd.open_workbook(self.filepath) as wb:
                    if not self.sheet_name or len(self.sheet_name) == 0:
                        # by default retrieve the first sheet in the excel file
                        sheet = wb.sheet_by_index(0)
                    else:
                        # if sheet name was provided
                        sheets = wb.sheet_names()  # get list of all sheets
                        if self.sheet_name in sheets:
                            # if given sheet name in the list of available sheets, load the sheet
                            sheet = wb.sheet_by_name(self.sheet_name)
                        else:
                            # report an error if given sheet name not in the list of available sheets
                            _str = (
                                'Given worksheet name "{}" was not found in the file "{}". '
                                'Verify that the worksheet name exists in the file.'
                            ).format(self.sheet_name, self.filepath)
                            self.error.add_error(_str)
                            self.logger.error(_str)

                            self.lineList = None
                            self.loaded = False
                            return self.lineList

                sheet.cell_value(0, 0)

                self.columnlist = []  # ensure the column list starts empty
                lines = [
                ]  # will hold content of the request file as an array of arrays (rows)
                for i in range(sheet.ncols):
                    column = []
                    for j in range(sheet.nrows):
                        if i == 0:
                            lines.append(
                                []
                            )  # adds an array for each new row in the request file

                        # print(sheet.cell_value(i, j))
                        cell = sheet.cell(j, i)
                        cell_value = cell.value
                        # take care of number and dates received from Excel and converted to float by default
                        if cell.ctype == 2 and int(cell_value) == cell_value:
                            # the key is integer
                            cell_value = str(int(cell_value))
                        elif cell.ctype == 2:
                            # the key is float
                            cell_value = str(cell_value)
                        # convert date back to human readable date format
                        # print ('cell_value = {}'.format(cell_value))
                        if cell.ctype == 3:
                            cell_value_date = xlrd.xldate_as_datetime(
                                cell_value, wb.datemode)
                            cell_value = cell_value_date.strftime(
                                "%Y-%m-%directory")
                        column.append(
                            cell_value
                        )  # adds value to the current column array
                        lines[j].append(
                            '"' + str(cell_value) + '"'
                        )  # adds value in "csv" format for a current row

                    # self.columnlist.append(','.join(column))
                    self.columnlist.append(
                        column)  # adds a column to a list of columns

                # populate lineList property
                self.lineList = []
                for ln in lines:
                    self.lineList.append(','.join(ln))

                wb.unload_sheet(sheet.name)

                # load passed request parameters (by columns)
                self.get_request_parameters()

                # validate provided information
                self.logger.info(
                    'Validating provided request parameters. project: "{}", bulk location: "{}", '
                    'assay: "{}", db_center_code_or_id: "{}",'
                    'Sub-Aliquots: "{}"'.format(self.project,
                                                self.bulk_location, self.assay,
                                                self.center,
                                                self.sub_aliquots))
                self.validate_request_params()

                if self.error.exist():
                    # report that errors exist
                    self.loaded = False
                    # print(self.error.count)
                    # print(self.error.get_errors_to_str())
                    _str = 'Errors ({}) were identified while validating the request. \nError(s): {}'.format(
                        self.error.count, self.error.get_errors_to_str())
                else:
                    self.loaded = True
                    _str = 'Request parameters were successfully validated - no errors found.'
                self.logger.info(_str)

                # combine Experiment_id out of request parameters
                if self.center_code and len(self.center_code.strip()) > 0:
                    # use center code if available
                    self.experiment_id = "_".join(
                        [self.project, self.center_code, self.assay])
                else:
                    # use provided value for the center column from request, if center_code is not available
                    self.experiment_id = "_".join(
                        [self.project, self.center, self.assay])

            else:
                _str = 'Loading content of the file "{}" failed since the file does not appear to exist.'.format(
                    self.filepath)
                self.error.add_error(_str)
                self.logger.error(_str)

                self.columnlist = None
                self.lineList = None
                self.loaded = False
        return self.lineList
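Example #14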
    def get_file_content(self):
        if not self.columns_arr or not self.lines_arr:
            self.columns_arr = []
            self.lines_arr = []
            if cm.file_exists(self.filepath):
                self.logger.debug('Loading file content of "{}"'.format(
                    self.filepath))

                with xlrd.open_workbook(self.filepath) as wb:
                    if not self.sheet_name or len(self.sheet_name) == 0:
                        # by default retrieve the first sheet in the excel file
                        sheet = wb.sheet_by_index(0)
                    else:
                        # if sheet name was provided
                        sheets = wb.sheet_names()  # get list of all sheets
                        if self.sheet_name in sheets:
                            # if given sheet name in the list of available sheets, load the sheet
                            sheet = wb.sheet_by_name(self.sheet_name)
                        else:
                            # report an error if given sheet name not in the list of available sheets
                            _str = (
                                'Given worksheet name "{}" was not found in the file "{}". '
                                'Verify that the worksheet name exists in the file.'
                            ).format(self.sheet_name, self.filepath)
                            self.error.add_error(_str)
                            self.logger.error(_str)

                            self.lines_arr = None
                            self.loaded = False
                            return self.lines_arr

                sheet.cell_value(0, 0)

                lines = [
                ]  # will hold content of the inquiry file as an array of arrays (rows)
                columns = []
                for i in range(sheet.ncols):
                    column = []
                    for j in range(sheet.nrows):
                        if i == 0:
                            lines.append(
                                []
                            )  # adds an array for each new row in the inquiry file

                        # print(sheet.cell_value(i, j))
                        cell = sheet.cell(j, i)
                        cell_value = cell.value
                        # take care of number and dates received from Excel and converted to float by default
                        if cell.ctype == 2 and int(cell_value) == cell_value:
                            # the key is integer
                            cell_value = str(int(cell_value))
                        elif cell.ctype == 2:
                            # the key is float
                            cell_value = str(cell_value)
                        # convert date back to human readable date format
                        # print ('cell_value = {}'.format(cell_value))
                        if cell.ctype == 3:
                            cell_value_date = xlrd.xldate_as_datetime(
                                cell_value, wb.datemode)
                            cell_value = cell_value_date.strftime(
                                "%Y-%m-%directory")
                        column.append(
                            cell_value
                        )  # adds value to the current column array
                        # lines[j].append('"' + cell_value + '"')  # adds value in "csv" format for a current row
                        lines[j].append(cell_value)

                    # self.columns_arr.append(','.join(column))
                    columns.append(
                        column)  # adds a column to a list of columns

                # populate lines_arr and columns_arr properties
                self.lines_arr = lines
                self.columns_arr = columns

                # populate lineList value as required for the base class
                self.lineList = []
                for ln in lines:
                    self.lineList.append(','.join(str(_e) for _e in ln))  # join the row's cell values, not the characters of str(ln)

                wb.unload_sheet(sheet.name)

                # perform validation of the current inquiry file
                self.validate_inquiry_file()

                if self.error.exist():
                    # report that errors exist
                    self.loaded = False
                    # print(self.error.count)
                    # print(self.error.get_errors_to_str())
                    _str = 'Errors ({}) were identified while validating the inquiry. \nError(s): {}'.format(
                        self.error.count, self.error.get_errors_to_str())
                else:
                    self.loaded = True

            else:
                _str = 'Loading content of the file "{}" failed since the file does not appear to exist.'.format(
                    self.filepath)
                self.error.add_error(_str)
                self.logger.error(_str)

                self.columns_arr = None
                self.lines_arr = None
                self.loaded = False
        return self.lineList
Example #15
    def verify_config_stamp_file(self, file_path):
        if not cm.file_exists(file_path):
            # if the file is not present, create an empty stamp file
            with open(file_path, "w+"):
                pass
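An equivalent, more idiomatic sketch using pathlib (same behavior: create the stamp file only when it does not exist):

from pathlib import Path

if not cm.file_exists(file_path):
    Path(file_path).touch()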
Example #16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_type_list", type=str, default="omcs,arc")
    parser.add_argument("--num_workers", type=int, default=4)
    parser.add_argument("--cache_dir", type=str, default=None)
    parser.add_argument("--k_hop", type=int, default=3)
    parser.add_argument("--max_num_nodes", type=int, default=1024)
    parser.add_argument("--disable_stop_ctk", action="store_true")
    parser.add_argument("--disable_nb", action="store_true")
    args = parser.parse_args()

    data_type_list = args.data_type_list.split(",")
    num_workers = args.num_workers
    cache_dir = args.cache_dir or index_sent_cache_dir
    k_hop = args.k_hop
    max_num_nodes = args.max_num_nodes
    disable_stop_ctk = args.disable_stop_ctk
    disable_nb = args.disable_nb
    data_type_list = [_e for _e in ["gen", "omcs", "arc", "wikipedia"] if _e in data_type_list]
    ctk_list, cid_list, ctk2idx, cid2idx, cididx2ctkidx, ctkidx2cididxs = load_conceptnet()
    rel_list, rel2idx, cg, cididx2neighbor = load_conceptnet_graph(cid_list, cid2idx)

    part_idxs = [0, ]
    sent_index_offset_list = []
    for _data_type in data_type_list:
        _offset_list = load_sent_index_offset(_data_type, cache_dir)
        sent_index_offset_list.extend(_offset_list)
        part_idxs.append(len(sent_index_offset_list))

    # read all sentences to count concept-token frequencies (unless disabled below)
    if disable_stop_ctk:
        print("disable_stop_ctk!!!!!")
    else:
        print("reading all sent to count ctkidx2freq")
        ctkidx2freq_path = join(cache_dir, "cn_ctkidx2freq.pkl")
        if file_exists(ctkidx2freq_path):
            print("\tfound file, loading")
            ctkidx2freq = load_pickle(ctkidx2freq_path)
        else:
            print("\tnot found file, building")
            def _processor_ctkidx2freq(_sent_index_offset_list, _with_sent_index=False):
                local_ctkidx2freq = [0 for _ in range(len(ctk_list))]

                if _with_sent_index:
                    _iterator = tqdm(_sent_index_offset_list)
                else:
                    _iterator = enumerate(tqdm(_sent_index_offset_list))

                for _idx_sent, _sent_index_offset in _iterator:
                    _data_type = get_data_type(_idx_sent, part_idxs, data_type_list)
                    if _data_type != "gen":
                        _sent_data = load_sent_from_shard(_sent_index_offset, cache_dir, _data_type)
                        _tk2spans = _sent_data[2]
                        for _tk in _tk2spans:
                            local_ctkidx2freq[ctk2idx[_tk]] += 1
                return local_ctkidx2freq
            if num_workers == 1:
                ctkidx2freq = _processor_ctkidx2freq(sent_index_offset_list)
            else:
                sent_index_offset_list_with_index = list((_idx, _e) for _idx, _e in enumerate(sent_index_offset_list))
                local_ctkidx2freq_list = multiprocessing_map(
                    _processor_ctkidx2freq, dict_args_list=[
                        {"_sent_index_offset_list": _d, "_with_sent_index": True}
                        for _d in split_to_lists(sent_index_offset_list_with_index, num_workers)
                    ], num_parallels=num_workers
                )
                ctkidx2freq = [sum(_ll[_ctkidx] for _ll in local_ctkidx2freq_list) for _ctkidx in range(len(ctk_list))]
            save_pickle(ctkidx2freq, ctkidx2freq_path)
        print("\tDone")

        # sorting
        print("Getting stop ctk")
        sorted_ctkidx_freq_pairs = sorted(
            [(_ctkidx, _freq) for _ctkidx, _freq in enumerate(ctkidx2freq) if _freq > 0],
            key=lambda _e: _e[1], reverse=True)
        sorted_ctkidx_list, _ = [list(_e) for _e in zip(*sorted_ctkidx_freq_pairs)]
        save_pickle(sorted_ctkidx_list, join(cache_dir, stop_ctkidx_list_file_name))
        save_list_to_file([ctk_list[_ctkidx] for _ctkidx in sorted_ctkidx_list],
                          join(cache_dir, stop_ctk_list_file_name))
        print("\tDone")

    # find k-hop neighborhoods for every ConceptNet concept
    def _processor(_cididx_list):
        _local_res_list = []
        for _ct_cididx in tqdm(_cididx_list):
            _node_explored = set([_ct_cididx])
            _node_save = [[_ct_cididx], ] + [[] for _ in range(k_hop)]
            _node_buffer = [(_ct_cididx, 0)]
            while len(_node_buffer) > 0:
                _node_cididx, _prev_depth = _node_buffer.pop(0)
                if _prev_depth == k_hop:
                    continue
                _cur_depth = _prev_depth + 1
                _neighbors = cididx2neighbor[_node_cididx]
                # shuffle keys
                _nb_cididxs = list(_neighbors.keys())
                random.shuffle(_nb_cididxs)
                for _nb_cididx in _nb_cididxs:
                    _attr = _neighbors[_nb_cididx]
                    if _nb_cididx in _node_explored:
                        continue
                    _node_explored.add(_nb_cididx)
                    _node_buffer.append((_nb_cididx, _cur_depth))
                    if rel_list[_attr["relation"]] not in REDUNDANT_RELATIONS:  # remove REDUNDANT_RELATIONS
                        _node_save[_cur_depth].append(_nb_cididx)
                        if sum(len(_e) for _e in _node_save) > max_num_nodes:
                            _node_buffer = []
                            break

            _local_res_list.append(_node_save)
        return _local_res_list

    if disable_nb:
        print("disable_nb!!!!!")
    else:
        print("Getting neighbors")
        proc_buffer = []
        wfp_nb = open(join(cache_dir, neighbor_cididxs_file_name), "w", encoding="utf-8")
        nb_offsets = []
        for _ctkidx in tqdm(range(len(cid_list)), total=len(cid_list)):
            proc_buffer.append(_ctkidx)
            if len(proc_buffer) == num_workers * 10000 or _ctkidx == (len(cid_list)-1):
                if num_workers == 1:
                    _res_list = _processor(proc_buffer)
                else:
                    _res_list = combine_from_lists(
                        multiprocessing_map(
                            _processor, dict_args_list=[
                                {"_cididx_list": _d} for _d in split_to_lists(proc_buffer, num_parallels=num_workers)
                            ], num_parallels=num_workers
                        ), ordered=True
                    )
                assert len(_res_list) == len(proc_buffer)
                for _elem in _res_list:
                    nb_offsets.append(wfp_nb.tell())
                    _dump_str = json.dumps(_elem) + "\n"  # text mode already translates newlines; os.linesep would double "\r" on Windows
                    wfp_nb.write(_dump_str)
                proc_buffer = []
        wfp_nb.close()
        save_pickle(nb_offsets, join(cache_dir, neighbor_cididxs_offset_file_name))
        print("\tDone")