def conceptnet_preprocess_pipeline(_conceptnet_path, num_workers=20, check_exist=True):
    _conceptnet_en_path = _conceptnet_path + "_en.csv"
    _lemma_dict_out_path = _conceptnet_en_path + "_lemma_dict.json"  # lemma to a list of []
    _conceptnet_id2idx_out_path = _conceptnet_en_path + "_id2idx.txt"
    _conceptnet_rel2idx_out_path = _conceptnet_en_path + "_rel2idx.txt"
    _clean_lemma2tags_out_path = _conceptnet_en_path + "_clean_lemma2tags.json"
    _path_list = [
        _conceptnet_en_path, _lemma_dict_out_path, _conceptnet_id2idx_out_path,
        _conceptnet_rel2idx_out_path, _clean_lemma2tags_out_path,
    ]
    if not (check_exist and all(file_exists(_path) for _path in _path_list)):
        if not file_exists(_conceptnet_en_path):
            clean_non_english_item(_conceptnet_path, _conceptnet_en_path)
        build_conceptnet_en_map_dump(_conceptnet_en_path, num_workers, _lemma_dict_out_path)
        build_conceptnet_id2idx(_conceptnet_en_path, _conceptnet_id2idx_out_path)
        build_conceptnet_rel2idx(_conceptnet_en_path, _conceptnet_rel2idx_out_path)
        build_clean_lemma2tags(_conceptnet_en_path, _clean_lemma2tags_out_path, num_parallels=num_workers)
    return _path_list
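# Hedged usage sketch for the pipeline above; the input path is hypothetical
# and should point at a raw ConceptNet assertions CSV:
#   _paths = conceptnet_preprocess_pipeline("conceptnet-assertions.csv", num_workers=8)
#   _en_csv, _lemma_dict, _id2idx, _rel2idx, _lemma2tags = _paths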
def check_supplied_arguments(soft_match):
    # check whether particular command-line arguments were provided
    if soft_match:
        # if the soft_match argument was provided, verify that it points to an
        # existing file and save it to the global config variable
        if cm.file_exists(Path(soft_match)):
            gc.CONFIG_CUSTOM_SOFT_MATCH = str(Path(soft_match))
    process_download_inquiries()
def load_file(self, filepath=''):
    if len(str(filepath).strip()) == 0:
        filepath = self.filepath
    if cm.file_exists(filepath):
        with open(filepath) as json_file:
            self.json_data = json.load(json_file)
def __init__(self, cfg_path):
    self.loaded = False
    self.cfg_path = cfg_path
    self.cfg = {}
    if cm.file_exists(cfg_path):
        with open(cfg_path, 'r') as ymlfile:
            self.cfg = yaml.safe_load(ymlfile)
        # self.prj_wrkdir = os.path.dirname(os.path.abspath(cfg_path))
        self.loaded = True
    else:
        self.cfg = None
def __init__(self, cfg_path=None, cfg_content_dict=None):
    self.loaded = False
    if cfg_path and cm.file_exists(cfg_path):
        with open(cfg_path, 'r') as ymlfile:
            # yaml.load without an explicit Loader is deprecated and unsafe;
            # safe_load matches the sibling constructors
            self.cfg = yaml.safe_load(ymlfile)
        # self.prj_wrkdir = os.path.dirname(os.path.abspath(study_cfg_path))
        self.loaded = True
    else:
        if cfg_content_dict:
            self.cfg = cfg_content_dict
            self.loaded = True
        else:
            self.cfg = None
def prepare_form(self, form_name):
    forms_location = Path(gc.SUBMISSION_FORMS_DIR + '/' + form_name + '/' + self.req_obj.project)
    # identify paths for json and config (yaml) files
    fl_path_json_common = forms_location / (form_name + '.json')
    fl_path_json_assay = forms_location / (form_name + '_' + str(self.req_obj.assay).lower() + '.json')
    fl_path_json_schema = forms_location / (form_name + '_schema.json')
    fl_path_cfg_common = forms_location / (form_name + '.yaml')

    # check the value assigned to the current request's data_source_forms_assignment
    # and select the assay config file accordingly
    if self.req_obj.data_source_forms_assignment == 'file':
        fl_path_cfg_assay = forms_location / (form_name + '_' + str(self.req_obj.assay).lower() + '.yaml')
    elif self.req_obj.data_source_forms_assignment == 'db':
        fl_path_cfg_assay = forms_location / (form_name + '_' + str(self.req_obj.assay).lower() + '_db.yaml')
    else:
        # any other data_source_forms_assignment value is treated as the default assignment
        fl_path_cfg_assay = forms_location / (form_name + '_' + str(self.req_obj.assay).lower() + '.yaml')

    # check if an assay-specific json exists; if yes - use it, if not - use the common one
    if cm.file_exists(fl_path_json_assay):
        fl_path_json = fl_path_json_assay
    else:
        fl_path_json = fl_path_json_common

    # load json and config files
    self.fl_json = FileJson(fl_path_json, self.req_obj.error, self.req_obj.logger)
    self.fl_json_schema = FileJson(fl_path_json_schema, self.req_obj.error, self.req_obj.logger)
    self.fl_cfg_common = ConfigData(fl_path_cfg_common)
    self.fl_cfg_assay = ConfigData(fl_path_cfg_assay)
    # self.fl_cfg_dict = ConfigData(gc.CONFIG_FILE_DICTIONARY)

    # loop through all json keys and fill those with associated data
    self.get_json_keys(self.fl_json.json_data)

    # validate final json file against the json schema (if present)
    self.validate_json(self.fl_json, self.fl_json_schema)
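# Hedged sketch of the assay-config selection rule implemented in prepare_form
# above; pick_assay_cfg and its parameter names are illustrative, not part of
# the codebase:
def pick_assay_cfg(forms_location, form_name, assay, data_source):
    base = form_name + '_' + str(assay).lower()
    # only an explicit 'db' assignment selects the *_db.yaml variant;
    # 'file' and any other value fall back to the plain yaml config
    suffix = '_db.yaml' if data_source == 'db' else '.yaml'
    return forms_location / (base + suffix)

# e.g. pick_assay_cfg(Path('forms/demo'), 'demo', 'RNA', 'db') -> forms/demo/demo_rna_db.yaml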
def get_column_values(self, col_number, header_row_number=None, exclude_header=None):
    # set up default parameters; "is None" checks keep explicitly passed
    # falsy values (0, False) from being silently overridden
    if header_row_number is None:
        header_row_number = 0
    if exclude_header is None:
        exclude_header = True
    col_values = []
    # adjust passed numbers to the 0-based numbering
    # col_number = col_number - 1
    # header_row_number = header_row_number - 1
    if cm.file_exists(self.filepath):
        self.logger.debug('Loading column #{} from file "{}"'.format(col_number, self.filepath))
        with xlrd.open_workbook(self.filepath) as wb:
            sheet = self.get_wksheet_name(wb)
            if sheet:
                sheet.cell_value(0, 0)
                if sheet.ncols >= col_number >= 0:
                    for i in range(sheet.nrows):
                        if i < header_row_number:
                            # skip all rows before the header
                            pass
                        elif i == header_row_number and exclude_header:
                            pass
                        else:
                            cell = sheet.cell(i, col_number)
                            cell_value = self.validate_cell_value(cell, wb)
                            col_values.append(cell_value)
                else:
                    col_values = None
    else:
        # no file found
        _str = 'Loading content of the file "{}" failed since the file does not appear to exist.'.format(
            self.filepath)
        self.error.add_error(_str)
        self.logger.error(_str)
        col_values = None
    return col_values
def __init__(self, cfg_path, cfg_dict=None):
    self.loaded = False
    if cfg_dict is None:
        if cm.file_exists(cfg_path):
            with open(cfg_path, 'r') as ymlfile:
                self.cfg = yaml.safe_load(ymlfile)
            # self.prj_wrkdir = os.path.dirname(os.path.abspath(cfg_path))
            self.loaded = True
        else:
            self.cfg = None
            # self.prj_wrkdir = None
    elif isinstance(cfg_dict, dict):
        self.cfg = cfg_dict
        self.loaded = True
    else:
        self.cfg = None
def set_value(self, value, yaml_path, delim=None):
    if not delim:
        delim = '/'
    out = False
    path_elems = yaml_path.split(delim)
    if not self.cfg:
        self.cfg = {}
    upd_item = self.cfg
    num_items = len(path_elems)
    cnt = 0
    for el in path_elems:
        cnt += 1
        if upd_item and el in upd_item:
            try:
                if cnt < num_items:
                    if not upd_item[el]:
                        upd_item[el] = {}
                    upd_item = upd_item[el]
                else:
                    upd_item[el] = value
                    out = True
            except Exception:
                out = False
                break
        else:
            if cnt < num_items:
                upd_item[el] = {}
                upd_item = upd_item[el]
            else:
                upd_item[el] = value
                out = True
    # persist the updated config only if the backing file already exists
    if cm.file_exists(self.cfg_path):
        with open(self.cfg_path, 'w') as yaml_file:
            yaml_file.write(yaml.dump(self.cfg, default_flow_style=False))
    return out
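# Standalone, hedged sketch of the nested-update idea behind set_value above;
# set_nested is an illustrative name and skips the class bookkeeping and the
# yaml dump:
def set_nested(cfg, path, value, delim='/'):
    keys = path.split(delim)
    node = cfg
    for key in keys[:-1]:
        # create (or descend into) intermediate dicts along the path
        if not isinstance(node.get(key), dict):
            node[key] = {}
        node = node[key]
    node[keys[-1]] = value
    return cfg

# set_nested({}, 'db/conn/host', 'localhost')
# -> {'db': {'conn': {'host': 'localhost'}}}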
def get_file_content(self):
    if not self.logger:
        loc_log = logging.getLogger(StudyConfig.study_logger_name)
    else:
        loc_log = self.logger
    if not self.lineList:
        if cm.file_exists(self.filepath):
            loc_log.debug('Loading file content of "{}"'.format(self.filepath))
            with open(self.filepath, "r") as fl:
                self.lineList = [line.rstrip('\n') for line in fl]
            self.loaded = True
        else:
            _str = 'Loading content of the file "{}" failed since the file does not appear to exist.'.format(
                self.filepath)
            self.error.add_error(_str)
            loc_log.error(_str)
            self.lineList = None
            self.loaded = False
    return self.lineList
def evaluate_single_model(self, logsdir):  # fixed typo: was "evaulate_single_model"
    metrics_file = join([logsdir, 'metrics.list'])
    assert file_exists(metrics_file), "Metrics file {} doesn't exist!".format(metrics_file)
    with open(metrics_file, 'rb') as f:
        metrics = pickle.load(f)
    p_b, p_l, p_s, r_b, g_b, g_l, m_b = zip(*metrics.values())
    pred_bboxes, pred_labels, pred_scores, refn_bboxes, gt_bboxes, gt_labels, mtsm_bboxes = \
        self.get_list_of_metrics([p_b, p_l, p_s, r_b, g_b, g_l, m_b])
    for iou_thresh in np.linspace(0.5, 1.0, 11):
        detector_stats = eval_detection_voc(pred_bboxes, pred_labels, pred_scores,
                                            gt_bboxes, gt_labels, iou_thresh=iou_thresh)
        bbox_align_stats = eval_detection_voc(refn_bboxes, pred_labels, pred_scores,
                                              gt_bboxes, gt_labels, iou_thresh=iou_thresh)
        straddling_stats = eval_detection_voc(mtsm_bboxes, pred_labels, pred_scores,
                                              gt_bboxes, gt_labels, iou_thresh=iou_thresh)
        sstr = self.pprint(detector_stats, bbox_align_stats, straddling_stats, iou_thresh)
        with open(join([logsdir, 'metrics_{:.2f}.table'.format(iou_thresh)]), 'w') as f:
            f.write(sstr)
def get_file_content(self):
    if not self.lineList:
        if cm.file_exists(self.filepath):
            self.logger.info('Loading file content of "{}"'.format(self.filepath))
            with xlrd.open_workbook(self.filepath) as wb:
                sheet = self.get_wksheet_name(wb)
                if not sheet:
                    self.lineList = None
                    self.loaded = False
                    return self.lineList
                sheet.cell_value(0, 0)
                for i in range(sheet.nrows):
                    ln = []
                    for j in range(sheet.ncols):
                        cell = sheet.cell(i, j)
                        cell_value = self.validate_cell_value(cell, wb)
                        ln.append(cell_value)
                    self.lineList.append(ln)
                wb.unload_sheet(sheet.name)
            self.loaded = True
        else:
            _str = 'Loading content of the file "{}" failed since the file does not appear to exist.'.format(
                self.filepath)
            self.error.add_error(_str)
            self.logger.error(_str)
            self.lineList = None
            self.loaded = False
    return self.lineList
def get_file_content(self):
    if not self.columnlist:
        if cm.file_exists(self.filepath):
            self.logger.debug('Loading file content of "{}"'.format(self.filepath))
            with xlrd.open_workbook(self.filepath) as wb:
                if not self.sheet_name or len(self.sheet_name) == 0:
                    # by default retrieve the first sheet in the excel file
                    sheet = wb.sheet_by_index(0)
                else:
                    # a sheet name was provided
                    sheets = wb.sheet_names()  # get list of all sheets
                    if self.sheet_name in sheets:
                        # if the given sheet name is among the available sheets, load it
                        sheet = wb.sheet_by_name(self.sheet_name)
                    else:
                        # report an error if the given sheet name is not among the available sheets
                        _str = ('Given worksheet name "{}" was not found in the file "{}". '
                                'Verify that the worksheet name exists in the file.').format(
                            self.sheet_name, self.filepath)
                        self.error.add_error(_str)
                        self.logger.error(_str)
                        self.lineList = None
                        self.loaded = False
                        return self.lineList
                sheet.cell_value(0, 0)
                lines = []  # will hold content of the request file as an array of arrays (rows)
                for i in range(sheet.ncols):
                    column = []
                    for j in range(sheet.nrows):
                        if i == 0:
                            lines.append([])  # adds an array for each new row in the request file
                        cell = sheet.cell(j, i)
                        cell_value = cell.value
                        # take care of numbers and dates received from Excel and converted to float by default
                        if cell.ctype == 2 and int(cell_value) == cell_value:
                            # the value is an integer
                            cell_value = str(int(cell_value))
                        elif cell.ctype == 2:
                            # the value is a float
                            cell_value = str(cell_value)
                        # convert date back to a human-readable format
                        if cell.ctype == 3:
                            cell_value_date = xlrd.xldate_as_datetime(cell_value, wb.datemode)
                            cell_value = cell_value_date.strftime("%Y-%m-%d")  # fixed garbled "%Y-%m-%directory"
                        column.append(cell_value)  # adds value to the current column array
                        lines[j].append('"' + str(cell_value) + '"')  # adds value in "csv" format for the current row
                    self.columnlist.append(column)  # adds a column to the list of columns
                # populate lineList property
                self.lineList = []
                for ln in lines:
                    self.lineList.append(','.join(ln))
                wb.unload_sheet(sheet.name)
            # load passed request parameters (by columns)
            self.get_request_parameters()
            # validate provided information
            self.logger.info(
                'Validating provided request parameters. project: "{}", bulk location: "{}", '
                'assay: "{}", db_center_code_or_id: "{}", Sub-Aliquots: "{}"'.format(
                    self.project, self.bulk_location, self.assay, self.center, self.sub_aliquots))
            self.validate_request_params()
            if self.error.exist():
                # report that errors exist
                self.loaded = False
                _str = 'Errors ({}) were identified during validation of the request. \nError(s): {}'.format(
                    self.error.count, self.error.get_errors_to_str())
            else:
                self.loaded = True
                _str = 'Request parameters were successfully validated - no errors found.'
            self.logger.info(_str)
            # combine Experiment_id out of request parameters
            if self.center_code and len(self.center_code.strip()) > 0:
                # use center code if available
                self.experiment_id = "_".join([self.project, self.center_code, self.assay])
            else:
                # use the provided value of the center column from the request if center_code is not available
                self.experiment_id = "_".join([self.project, self.center, self.assay])
        else:
            _str = 'Loading content of the file "{}" failed since the file does not appear to exist.'.format(
                self.filepath)
            self.error.add_error(_str)
            self.logger.error(_str)
            self.columnlist = None
            self.lineList = None
            self.loaded = False
    return self.lineList
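# Minimal, self-contained sketch of the xlrd cell normalization used in the
# Excel readers above (normalize_cell is an illustrative name; the xlrd
# constants and xldate_as_datetime are the real API):
import xlrd

def normalize_cell(cell, workbook):
    value = cell.value
    if cell.ctype == xlrd.XL_CELL_NUMBER:
        # Excel hands every number back as a float; keep integers readable
        value = str(int(value)) if int(value) == value else str(value)
    elif cell.ctype == xlrd.XL_CELL_DATE:
        # dates are also floats; convert using the workbook's date mode
        value = xlrd.xldate_as_datetime(value, workbook.datemode).strftime("%Y-%m-%d")
    return value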
def get_file_content(self):
    if not self.columns_arr or not self.lines_arr:
        self.columns_arr = []
        self.lines_arr = []
        if cm.file_exists(self.filepath):
            self.logger.debug('Loading file content of "{}"'.format(self.filepath))
            with xlrd.open_workbook(self.filepath) as wb:
                if not self.sheet_name or len(self.sheet_name) == 0:
                    # by default retrieve the first sheet in the excel file
                    sheet = wb.sheet_by_index(0)
                else:
                    # a sheet name was provided
                    sheets = wb.sheet_names()  # get list of all sheets
                    if self.sheet_name in sheets:
                        # if the given sheet name is among the available sheets, load it
                        sheet = wb.sheet_by_name(self.sheet_name)
                    else:
                        # report an error if the given sheet name is not among the available sheets
                        _str = ('Given worksheet name "{}" was not found in the file "{}". '
                                'Verify that the worksheet name exists in the file.').format(
                            self.sheet_name, self.filepath)
                        self.error.add_error(_str)
                        self.logger.error(_str)
                        self.lines_arr = None
                        self.loaded = False
                        return self.lines_arr
                sheet.cell_value(0, 0)
                lines = []  # will hold content of the inquiry file as an array of arrays (rows)
                columns = []
                for i in range(sheet.ncols):
                    column = []
                    for j in range(sheet.nrows):
                        if i == 0:
                            lines.append([])  # adds an array for each new row in the inquiry file
                        cell = sheet.cell(j, i)
                        cell_value = cell.value
                        # take care of numbers and dates received from Excel and converted to float by default
                        if cell.ctype == 2 and int(cell_value) == cell_value:
                            # the value is an integer
                            cell_value = str(int(cell_value))
                        elif cell.ctype == 2:
                            # the value is a float
                            cell_value = str(cell_value)
                        # convert date back to a human-readable format
                        if cell.ctype == 3:
                            cell_value_date = xlrd.xldate_as_datetime(cell_value, wb.datemode)
                            cell_value = cell_value_date.strftime("%Y-%m-%d")  # fixed garbled "%Y-%m-%directory"
                        column.append(cell_value)  # adds value to the current column array
                        lines[j].append(cell_value)  # adds value for the current row
                    columns.append(column)  # adds a column to the list of columns
                # populate lines_arr and columns_arr properties
                self.lines_arr = lines
                self.columns_arr = columns
                # populate lineList value as required by the base class;
                # fixed: was ','.join(str(ln)), which joined the characters of the list's repr
                self.lineList = []
                for ln in lines:
                    self.lineList.append(','.join(str(_v) for _v in ln))
                wb.unload_sheet(sheet.name)
            # perform validation of the current inquiry file
            self.validate_inquiry_file()
            if self.error.exist():
                # report that errors exist
                self.loaded = False
                _str = 'Errors ({}) were identified during validation of the inquiry. \nError(s): {}'.format(
                    self.error.count, self.error.get_errors_to_str())
            else:
                self.loaded = True
        else:
            _str = 'Loading content of the file "{}" failed since the file does not appear to exist.'.format(
                self.filepath)
            self.error.add_error(_str)
            self.logger.error(_str)
            self.columns_arr = None
            self.lines_arr = None
            self.loaded = False
    return self.lineList
def verify_config_stamp_file(self, file_path):
    if not cm.file_exists(file_path):
        # if the file is not present, create an empty stamp file;
        # fixed: "f.close" without parentheses never closed the handle
        with open(file_path, "w+"):
            pass
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_type_list", type=str, default="omcs,arc")
    parser.add_argument("--num_workers", type=int, default=4)
    parser.add_argument("--cache_dir", type=str, default=None)
    parser.add_argument("--k_hop", type=int, default=3)
    parser.add_argument("--max_num_nodes", type=int, default=1024)
    parser.add_argument("--disable_stop_ctk", action="store_true")
    parser.add_argument("--disable_nb", action="store_true")
    args = parser.parse_args()

    data_type_list = args.data_type_list.split(",")
    num_workers = args.num_workers
    cache_dir = args.cache_dir or index_sent_cache_dir
    k_hop = args.k_hop
    max_num_nodes = args.max_num_nodes
    disable_stop_ctk = args.disable_stop_ctk
    disable_nb = args.disable_nb

    data_type_list = [_e for _e in ["gen", "omcs", "arc", "wikipedia"] if _e in data_type_list]

    ctk_list, cid_list, ctk2idx, cid2idx, cididx2ctkidx, ctkidx2cididxs = load_conceptnet()
    rel_list, rel2idx, cg, cididx2neighbor = load_conceptnet_graph(cid_list, cid2idx)

    part_idxs = [0, ]
    sent_index_offset_list = []
    for _data_type in data_type_list:
        _offset_list = load_sent_index_offset(_data_type, cache_dir)
        sent_index_offset_list.extend(_offset_list)
        part_idxs.append(len(sent_index_offset_list))

    # read all sentences to count concept-token frequencies
    if disable_stop_ctk:
        print("disable_stop_ctk!!!!!")
    else:
        print("reading all sent to count ctkidx2freq")
        ctkidx2freq_path = join(cache_dir, "cn_ctkidx2freq.pkl")
        if file_exists(ctkidx2freq_path):
            print("\tfound file, loading")
            ctkidx2freq = load_pickle(ctkidx2freq_path)
        else:
            print("\tnot found file, building")

            def _processor_ctkidx2freq(_sent_index_offset_list, _with_sent_index=False):
                local_ctkidx2freq = [0 for _ in range(len(ctk_list))]
                if _with_sent_index:
                    _iterator = tqdm(_sent_index_offset_list)
                else:
                    _iterator = enumerate(tqdm(_sent_index_offset_list))
                for _idx_sent, _sent_index_offset in _iterator:
                    _data_type = get_data_type(_idx_sent, part_idxs, data_type_list)
                    if _data_type != "gen":
                        _sent_data = load_sent_from_shard(_sent_index_offset, cache_dir, _data_type)
                        _tk2spans = _sent_data[2]
                        for _tk in _tk2spans:
                            local_ctkidx2freq[ctk2idx[_tk]] += 1
                return local_ctkidx2freq

            if num_workers == 1:
                ctkidx2freq = _processor_ctkidx2freq(sent_index_offset_list)
            else:
                sent_index_offset_list_with_index = list(
                    (_idx, _e) for _idx, _e in enumerate(sent_index_offset_list))
                local_ctkidx2freq_list = multiprocessing_map(
                    _processor_ctkidx2freq,
                    dict_args_list=[
                        {"_sent_index_offset_list": _d, "_with_sent_index": True}
                        for _d in split_to_lists(sent_index_offset_list_with_index, num_workers)
                    ],
                    num_parallels=num_workers
                )
                ctkidx2freq = [
                    sum(_ll[_ctkidx] for _ll in local_ctkidx2freq_list)
                    for _ctkidx in range(len(ctk_list))
                ]
            save_pickle(ctkidx2freq, ctkidx2freq_path)
            print("\tDone")

        # sort concept tokens by frequency to derive the stop-token list
        print("Getting stop ctk")
        sorted_ctkidx_freq_pairs = sorted(
            [(_ctkidx, _freq) for _ctkidx, _freq in enumerate(ctkidx2freq) if _freq > 0],
            key=lambda _e: _e[1], reverse=True)
        sorted_ctkidx_list, _ = [list(_e) for _e in zip(*sorted_ctkidx_freq_pairs)]
        save_pickle(sorted_ctkidx_list, join(cache_dir, stop_ctkidx_list_file_name))
        save_list_to_file([ctk_list[_ctkidx] for _ctkidx in sorted_ctkidx_list],
                          join(cache_dir, stop_ctk_list_file_name))
        print("\tDone")

    # find k-hop neighborhoods for each concept
    def _processor(_cididx_list):
        _local_res_list = []
        for _ct_cididx in tqdm(_cididx_list):
            _node_explored = set([_ct_cididx])
            _node_save = [[_ct_cididx], ] + [[] for _ in range(k_hop)]
            _node_buffer = [(_ct_cididx, 0)]
            while len(_node_buffer) > 0:
                _node_cididx, _prev_depth = _node_buffer.pop(0)
                if _prev_depth == k_hop:
                    continue
                _cur_depth = _prev_depth + 1
                _neighbors = cididx2neighbor[_node_cididx]
                # shuffle keys
                _nb_cididxs = list(_neighbors.keys())
                random.shuffle(_nb_cididxs)
                for _nb_cididx in _nb_cididxs:
                    _attr = _neighbors[_nb_cididx]
                    if _nb_cididx in _node_explored:
                        continue
                    _node_explored.add(_nb_cididx)
                    _node_buffer.append((_nb_cididx, _cur_depth))
                    if rel_list[_attr["relation"]] not in REDUNDANT_RELATIONS:
                        # remove REDUNDANT_RELATIONS
                        _node_save[_cur_depth].append(_nb_cididx)
                    if sum(len(_e) for _e in _node_save) > max_num_nodes:
                        _node_buffer = []
                        break
            _local_res_list.append(_node_save)
        return _local_res_list

    if disable_nb:
        print("disable_nb!!!!!")
    else:
        print("Getting neighbors")
        proc_buffer = []
        wfp_nb = open(join(cache_dir, neighbor_cididxs_file_name), "w", encoding="utf-8")
        nb_offsets = []
        for _ctkidx in tqdm(range(len(cid_list)), total=len(cid_list)):
            proc_buffer.append(_ctkidx)
            if len(proc_buffer) == num_workers * 10000 or _ctkidx == (len(cid_list) - 1):
                if num_workers == 1:
                    _res_list = _processor(proc_buffer)
                else:
                    _res_list = combine_from_lists(
                        multiprocessing_map(
                            _processor,
                            dict_args_list=[
                                {"_cididx_list": _d}
                                for _d in split_to_lists(proc_buffer, num_parallels=num_workers)
                            ],
                            num_parallels=num_workers
                        ),
                        ordered=True
                    )
                assert len(_res_list) == len(proc_buffer)
                for _elem in _res_list:
                    nb_offsets.append(wfp_nb.tell())
                    _dump_str = json.dumps(_elem) + os.linesep
                    wfp_nb.write(_dump_str)
                proc_buffer = []
        wfp_nb.close()
        save_pickle(nb_offsets, join(cache_dir, neighbor_cididxs_offset_file_name))
        print("\tDone")
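# Self-contained, hedged sketch of the capped k-hop BFS performed by _processor
# above; the toy adjacency dict and the k_hop_neighbors name are illustrative:
from collections import deque

def k_hop_neighbors(graph, start, k_hop, max_num_nodes):
    explored = {start}
    per_depth = [[start]] + [[] for _ in range(k_hop)]  # nodes grouped by hop distance
    queue = deque([(start, 0)])
    while queue:
        node, depth = queue.popleft()
        if depth == k_hop:
            continue  # do not expand past the hop limit
        for nb in graph.get(node, []):
            if nb in explored:
                continue
            explored.add(nb)
            queue.append((nb, depth + 1))
            per_depth[depth + 1].append(nb)
            if sum(len(level) for level in per_depth) > max_num_nodes:
                return per_depth  # stop early once the node budget is exceeded
    return per_depth

# k_hop_neighbors({0: [1, 2], 1: [3]}, start=0, k_hop=2, max_num_nodes=10)
# -> [[0], [1, 2], [3]]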