def parse_organism(org, in_path=cs.STRING_PATH, out_path=cs.JSON_PATH, check=True):
    """Build an ``organism.Organism`` from the parsed PPI JSON files of ``org``.

    The node and edge JSON files live in ``out_path``.  When ``check`` is true
    and both files are already there they are reused; otherwise the protein
    links file found in ``in_path`` is parsed via ``parse_organism_ppi``.

    Args:
        org: organism identifier, used as the prefix of every file name.
        in_path: directory containing ``<org>.protein.links.v10.5.txt``.
        out_path: directory containing the parsed node/edge JSON files.
        check: when false, the existence check is skipped and the links file
            is always parsed again.

    Returns:
        ``organism.Organism`` constructed from the node and edge file paths.
    """
    links_file_name = '{}.protein.links.v10.5.txt'.format(org)
    nodes_file_name = '{}_parsed_nodes.json'.format(org)
    edges_file_name = '{}_parsed_edges.json'.format(org)

    links_file = utils.join_path(in_path, links_file_name)
    nodes_file = utils.join_path(out_path, nodes_file_name)
    edges_file = utils.join_path(out_path, edges_file_name)

    # The cache is only consulted when the caller asked for the check.
    reuse_parsed = check and utils.files_exist(
        [nodes_file_name, edges_file_name], out_path)

    if not reuse_parsed:
        utils.print_log(('parsing ppi information of {}').format(org))
        parse_organism_ppi(org, links_file, nodes_file, edges_file)
        utils.print_log(('ppi parsing finished for {}').format(org))
    else:
        utils.print_log('using existing parsed jsons for {}'.format(org))

    return organism.Organism(
        nodes_file=nodes_file,
        edges_file=edges_file,
        org_id=org,
    )
def __init__(self, opt, mode="train"):
    """Load the raw splits, optional video features and the vocabulary.

    Args:
        opt: option object.  Read here: ``train_path``, ``test_path``,
            ``valid_path``, ``vcpt_path``, ``vid_feat_flag``,
            ``vid_feat_path``, ``h5driver``, ``glove_path``, ``normalize_v``,
            ``with_ts``, ``word2idx_path``, ``idx2word_path``,
            ``vocab_embedding_path``, ``embedding_size`` and
            ``word_count_threshold``.
        mode: stored as ``self.mode`` before ``get_cur_dict()`` is called
            (presumably selects the active split -- verify in
            ``get_cur_dict``).
    """
    # Raw annotations of all three splits plus the visual-concept pickle.
    self.raw_train = load_json(opt.train_path)
    self.raw_test = load_json(opt.test_path)
    self.raw_valid = load_json(opt.valid_path)
    self.vcpt_dict = load_pickle(opt.vcpt_path)

    # The video-feature HDF5 file is opened only when the flag is set.
    self.vfeat_load = opt.vid_feat_flag
    if self.vfeat_load:
        self.vid_h5 = h5py.File(opt.vid_feat_path, "r", driver=opt.h5driver)

    self.glove_embedding_path = opt.glove_path
    self.normalize_v = opt.normalize_v
    self.with_ts = opt.with_ts
    self.mode = mode
    self.cur_data_dict = self.get_cur_dict()

    # Word embedding / vocabulary settings; the vocabulary starts with the
    # three special tokens and ``offset`` records how many there are.
    self.word2idx_path = opt.word2idx_path
    self.idx2word_path = opt.idx2word_path
    self.vocab_embedding_path = opt.vocab_embedding_path
    self.embedding_dim = opt.embedding_size
    special_tokens = ("<pad>", "<unk>", "<eos>")
    self.word2idx = {token: index for index, token in enumerate(special_tokens)}
    self.idx2word = dict(enumerate(special_tokens))
    self.offset = len(self.word2idx)
    self.bert_tokenizer = BertTokenizer.from_pretrained(
        'bert-base-uncased', max_len=300)

    # Entry keys: only the subtitle key depends on ``with_ts``.
    subtitle_key = "located_sub_text" if self.with_ts else "sub_text"
    self.text_keys = ["q", "a0", "a1", "a2", "a3", "a4", subtitle_key]
    self.vcpt_key = "vcpt"
    self.label_key = "answer_idx"
    self.qid_key = "qid"
    self.vid_name_key = "vid_name"
    self.located_frm_key = "located_frame"

    # Sanity check on the first validation entry; "vcpt" is exempt.
    expected_keys = self.text_keys + [
        self.vcpt_key, self.qid_key, self.vid_name_key
    ]
    for key in expected_keys:
        if key != "vcpt":
            assert key in self.raw_valid[0].keys()

    # Load the cached vocabulary when all three files exist, else build it.
    cache_files = [
        self.word2idx_path, self.idx2word_path, self.vocab_embedding_path
    ]
    if files_exist(cache_files):
        print("\nLoading cache ...")
        self.word2idx = load_pickle(self.word2idx_path)
        self.idx2word = load_pickle(self.idx2word_path)
        self.vocab_embedding = load_pickle(self.vocab_embedding_path)
    else:
        print("\nNo cache founded.")
        self.build_word_vocabulary(
            word_count_threshold=opt.word_count_threshold)
def __init__(self, opt, mode="train"):
    """Load annotations, feature stores and the vocabulary.

    Args:
        opt: option object.  Read here: ``train_path``, ``valid_path``,
            ``sub_path``, ``input_streams``, ``vfeat_type``,
            ``qa_bert_path``, ``sub_bert_path``, ``vfeat_path``,
            ``h5driver``, ``vcpt_path``, ``debug``, ``glove_path``,
            ``num_region``, ``use_sup_att``, ``att_iou_thd``,
            ``frm_cnt_path`` and ``word2idx_path``.
        mode: stored as ``self.mode``; any value other than ``"train"`` sets
            ``self.is_eval``.
    """
    self.opt = opt
    self.is_eval = mode != "train"

    # The test split is not loaded by this constructor.
    self.raw_train = load_json(opt.train_path)
    self.raw_valid = load_json(opt.valid_path)
    self.sub_data = load_json(opt.sub_path)

    self.sub_flag = "sub" in opt.input_streams
    self.vfeat_flag = "vfeat" in opt.input_streams
    self.vfeat_type = opt.vfeat_type

    # HDF5 stores; the original author's notes say the QA store is keyed by
    # "qid + key" and the subtitle store by "vid_name" (not verifiable here).
    h5_driver = opt.h5driver
    self.qa_bert_h5 = h5py.File(opt.qa_bert_path, "r", driver=h5_driver)
    if self.sub_flag:
        self.sub_bert_h5 = h5py.File(opt.sub_bert_path, "r", driver=h5_driver)
    if self.vfeat_flag:
        self.vid_h5 = h5py.File(opt.vfeat_path, "r", driver=h5_driver)

    # Visual concepts are needed whenever visual features are used.
    self.vcpt_flag = "vcpt" in opt.input_streams or self.vfeat_flag
    if self.vcpt_flag:
        vcpt_loader = (load_pickle if opt.vcpt_path.endswith(".pickle")
                       else load_json)
        self.vcpt_dict = vcpt_loader(opt.vcpt_path)
        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # source; the debug filter is placed here because it needs vcpt_dict.
        if opt.debug:
            self.raw_train = filter_list_dicts(
                self.raw_train, "vid_name", self.vcpt_dict.keys())
            self.raw_valid = filter_list_dicts(
                self.raw_valid, "vid_name", self.vcpt_dict.keys())
            print("number of training/valid",
                  len(self.raw_train), len(self.raw_valid))

    self.glove_embedding_path = opt.glove_path
    self.mode = mode
    self.num_region = opt.num_region
    self.use_sup_att = opt.use_sup_att
    self.att_iou_thd = opt.att_iou_thd
    self.cur_data_dict = self.get_cur_dict()

    # Marked "tmp" by the original author.
    self.frm_cnt_path = opt.frm_cnt_path
    self.frm_cnt_dict = load_json(self.frm_cnt_path)

    # Vocabulary: start from the three special tokens, then either build a
    # new vocabulary or load the cached word2idx JSON.
    self.word2idx_path = opt.word2idx_path
    self.embedding_dim = 300
    special_tokens = ("<pad>", "<unk>", "<eos>")
    self.word2idx = {token: index for index, token in enumerate(special_tokens)}
    self.idx2word = dict(enumerate(special_tokens))
    self.offset = len(self.word2idx)

    text_keys = ["a0", "a1", "a2", "a3", "a4", "q", "sub_text"]
    if files_exist([self.word2idx_path]):
        print("\nLoading cache ...")
        self.word2idx = load_json(self.word2idx_path)
    else:
        print("\nNo cache founded.")
        self.build_word_vocabulary(text_keys, word_count_threshold=2)
    # NOTE(review): assumed to run after both branches; confirm against the
    # original layout that this was not part of the cache-loading branch only.
    self.idx2word = {index: word for word, index in self.word2idx.items()}
def check_initial_files(org, in_path=cs.STRING_PATH):
    """Log whether the input files needed for ``org`` exist in ``in_path``.

    The required files are ``<org>.protein.links.v10.5.txt`` and
    ``<org>.protein.sequences.v10.5.fa``.  The outcome is only reported
    through ``utils.print_log`` (with ``mode='err'`` on failure); nothing is
    returned.

    Args:
        org: organism identifier used as the file-name prefix.
        in_path: directory that should contain the files.
    """
    # Suffixes of the initial files needed for code execution.
    required_suffixes = [
        '.protein.links.v10.5.txt',
        '.protein.sequences.v10.5.fa',
    ]
    expected_files = [org + suffix for suffix in required_suffixes]

    if not utils.files_exist(expected_files, in_path):
        failure = ('the initial files for {} not found').format(org)
        utils.print_log(failure, mode='err')
        return

    utils.print_log(('initial files check passed for {}').format(org))
def tf_records_preprocessing(n_in: int,
                             n_out: int,
                             raw_data_path: str,
                             tf_records_name: str,
                             feature_cols: Union[List[int], List[List[int]]],
                             label_col: int,
                             factor_type: FactorType,
                             train_size_pct: float = .8):
    """Create the train/test TFRecord files for ``tf_records_name`` if missing.

    When both record files already exist nothing is regenerated and a hint is
    printed instead.

    Args:
        n_in: forwarded to ``to_supervised`` (presumably the input window
            length -- verify there).
        n_out: forwarded to ``to_supervised`` (presumably the output window
            length -- verify there).
        raw_data_path: CSV file read with ``read_data_from_csv``.
        tf_records_name: name used to derive the record paths and passed to
            ``data_to_tf_records``.
        feature_cols: feature column selection forwarded to ``to_supervised``.
        label_col: label column forwarded to ``to_supervised``.
        factor_type: selector passed to ``get_data_by_type``.
        train_size_pct: split ratio forwarded to ``split_data``.
    """
    train_path, test_path = generate_tf_records_path(tf_records_name)
    if files_exist([train_path, test_path]):
        print(
            'files already exist, if new records pls rename, or delete current records if updated'
        )
        return

    # CSV -> rows of the requested factor type -> train/test split.
    selected = get_data_by_type(read_data_from_csv(raw_data_path),
                                factor_type=factor_type)
    train_rows, test_rows = split_data(selected, train_size_pct=train_size_pct)

    # Turn each split into (features, labels).
    # NOTE(review): the test split is also converted with is_train=True;
    # confirm this is intended.
    train_x, train_y = to_supervised(train_rows, n_in, n_out,
                                     label_col=label_col,
                                     feature_cols=feature_cols,
                                     is_train=True)
    test_x, test_y = to_supervised(test_rows, n_in, n_out,
                                   label_col=label_col,
                                   feature_cols=feature_cols,
                                   is_train=True)

    # Labels are written as returned; a transform_labels step existed only as
    # commented-out code in the original.
    data_to_tf_records(train_x, train_y, test_x, test_y, tf_records_name)
def get_data(self):
    """Populate the dataset from the processed-file cache or by processing.

    If ``self._processed_file`` exists and regeneration is not forced, the
    cached tuple ``(data, relatives, order, hier, set_id)`` is loaded with
    ``torch.load`` and ``self._num_features`` is taken from the first entry's
    ``'features'`` tensor.  When the cached ``hier``/``set_id`` do not match
    ``self._hierarchical``/``self._set_id`` the cache is considered stale:
    ``self._force_regenerate`` is set to ``True`` and the data is rebuilt.

    In every case ``self._build_pairs()`` is called exactly once at the end
    of the path taken.
    """
    if files_exist([self._processed_file]) and not self._force_regenerate:
        print("Loading pre-processed {} data...".format(self._set_id))
        # NOTE(review): torch.load unpickles the file; this assumes the
        # cache was produced locally by this code and is trusted.
        self.data, self.relatives, self.order, hier, set_id = torch.load(
            self._processed_file)
        self._num_features = self.data[0]['features'].size()[0]
        print("Done loading.")

        if hier == self._hierarchical and set_id == self._set_id:
            self._build_pairs()
            return

        print(
            "Loaded data metadata differs to current specs, regenerating data..."
        )
        # Kept as a persistent side effect, as before; regeneration now
        # simply falls through instead of re-entering get_data().
        self._force_regenerate = True

    print("Processing {} data...".format(self._set_id))
    self.pre_process()
    self._build_pairs()
    print("Done processing.")
def tf_records_preprocessing(n_in: int,
                             n_out: int,
                             raw_data_path: str,
                             tf_records_name: str,
                             feature_cols: Union[List[int], List[List[int]]]):
    """Create the train/test TFRecord files for ``tf_records_name`` if missing.

    When both record files already exist nothing is regenerated and
    ``'files already exist'`` is printed.

    :param n_in: forwarded to ``to_supervised`` (presumably the input window
        length -- verify there)
    :param n_out: forwarded to ``to_supervised`` (presumably the output
        window length -- verify there)
    :param raw_data_path: CSV file read with ``read_data_from_csv``
    :param tf_records_name: name used to derive the record paths and passed
        to ``data_to_tf_records``
    :param feature_cols: feature column selection forwarded to
        ``to_supervised``; the label column is fixed to 0
    :return: None
    """
    train_path, test_path = generate_tf_records_path(tf_records_name)
    if files_exist([train_path, test_path]):
        print('files already exist')
        return

    # CSV -> train/test split.
    train_rows, test_rows = split_data(read_data_from_csv(raw_data_path))

    # Each split -> (features, labels); only the train split is flagged as
    # training data.
    train_x, train_y = to_supervised(train_rows, n_in, n_out,
                                     feature_cols=feature_cols,
                                     label_col=0,
                                     is_train=True)
    test_x, test_y = to_supervised(test_rows, n_in, n_out,
                                   feature_cols=feature_cols,
                                   label_col=0,
                                   is_train=False)

    # Write the final train and test data to TFRecord.
    data_to_tf_records(train_x, train_y, test_x, test_y, tf_records_name)
def __init__(self, opt, mode="train"):
    """Load annotations, feature stores, vocabulary and eval object ids.

    Args:
        opt: option object.  Read here: ``train_path``, ``test_path``,
            ``valid_path``, ``sub_path``, ``input_streams``, ``vfeat_type``,
            ``qa_bert_path``, ``sub_bert_path``, ``vfeat_path``,
            ``h5driver``, ``vcpt_path``, ``debug``, ``glove_path``,
            ``num_region``, ``use_sup_att``, ``att_iou_thd``,
            ``frm_cnt_path``, ``word2idx_path`` and
            ``eval_object_vocab_path``.
        mode: stored as ``self.mode``; ``"test"`` sets ``self.inference``
            (inference mode, no ground-truth annotations per the original
            author's note).
    """
    self.opt = opt
    self.inference = mode == "test"

    # The test split is optional and only loaded when a path is given.
    self.raw_train = load_json(opt.train_path)
    if opt.test_path:
        self.raw_test = load_json(opt.test_path)
    self.raw_valid = load_json(opt.valid_path)
    self.sub_data = load_json(opt.sub_path)

    self.sub_flag = "sub" in opt.input_streams
    self.vfeat_flag = "vfeat" in opt.input_streams
    self.vfeat_type = opt.vfeat_type

    # HDF5 stores; the original author's notes say the QA store is keyed by
    # "qid + key" and the subtitle store by "vid_name" (not verifiable here).
    h5_driver = opt.h5driver
    self.qa_bert_h5 = h5py.File(opt.qa_bert_path, "r", driver=h5_driver)
    if self.sub_flag:
        self.sub_bert_h5 = h5py.File(opt.sub_bert_path, "r", driver=h5_driver)
    if self.vfeat_flag:
        self.vid_h5 = h5py.File(opt.vfeat_path, "r", driver=h5_driver)

    # Visual concepts are needed whenever visual features are used.
    self.vcpt_flag = "vcpt" in opt.input_streams or self.vfeat_flag
    if self.vcpt_flag:
        vcpt_loader = (load_pickle if opt.vcpt_path.endswith(".pickle")
                       else load_json)
        self.vcpt_dict = vcpt_loader(opt.vcpt_path)
        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # source; the debug filter is placed here because it needs vcpt_dict.
        if opt.debug:
            self.raw_train = filter_list_dicts(
                self.raw_train, "vid_name", self.vcpt_dict.keys())
            self.raw_valid = filter_list_dicts(
                self.raw_valid, "vid_name", self.vcpt_dict.keys())
            if opt.test_path:
                self.raw_test = filter_list_dicts(
                    self.raw_test, "vid_name", self.vcpt_dict.keys())
            print("number of training/valid",
                  len(self.raw_train), len(self.raw_valid))

    self.glove_embedding_path = opt.glove_path
    self.mode = mode
    self.num_region = opt.num_region
    self.use_sup_att = opt.use_sup_att
    self.att_iou_thd = opt.att_iou_thd
    self.cur_data_dict = self.get_cur_dict()

    # Marked "tmp" by the original author.
    self.frm_cnt_path = opt.frm_cnt_path
    self.frm_cnt_dict = load_json(self.frm_cnt_path)

    # Vocabulary: start from the three special tokens.
    self.word2idx_path = opt.word2idx_path
    self.embedding_dim = 300
    special_tokens = ("<pad>", "<unk>", "<eos>")
    self.word2idx = {token: index for index, token in enumerate(special_tokens)}
    self.idx2word = dict(enumerate(special_tokens))
    self.offset = len(self.word2idx)

    # NOTE(review): hard-coded absolute path; consider exposing it via opt.
    self.vcpt_emb_dict = load_json(
        "/home/data/tvqa_plus_stage_features/test.json")

    # Build a new vocabulary when the word2idx JSON is missing, otherwise
    # load the cached one.
    text_keys = ["a0", "a1", "a2", "a3", "a4", "q", "sub_text"]
    if files_exist([self.word2idx_path]):
        print("\nLoading cache ...")
        self.word2idx = load_json(self.word2idx_path)
    else:
        print("\nNo cache founded.")
        self.build_word_vocabulary(text_keys, word_count_threshold=2)
    # NOTE(review): assumed to run after both branches; confirm against the
    # original layout that this was not part of the cache-loading branch only.
    self.idx2word = {index: word for word, index in self.word2idx.items()}

    # Ids of the evaluation object words; unknown words map to "<unk>".
    self.eval_object_vocab = load_json(opt.eval_object_vocab_path)
    self.eval_object_word_ids = [
        self.word2idx[word if word in self.word2idx else "<unk>"]
        for word in self.eval_object_vocab
    ]
def __init__(self, opt, mode="train"):
    """Load the raw splits, feature stores, tokenizer and vocabulary.

    Args:
        opt: option object, also kept as ``self.opt``.  Read here:
            ``train_path``, ``test_path``, ``valid_path``, ``vid_feat_flag``,
            ``input_streams``, ``regional_topk``, ``reg_feat_path``,
            ``my_vcpt``, ``vcpt_path``, ``vid_feat_path``, ``bert``,
            ``glove_path``, ``normalize_v``, ``with_ts``, ``embedding_size``,
            ``word2idx_path``, ``idx2word_path``, ``vocab_embedding_path``
            and ``word_count_threshold``.
        mode: stored as ``self.mode`` before ``get_cur_dict()`` is called.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source; the spots marked below should be confirmed against the original.
    """
    self.raw_train = load_json(opt.train_path)
    self.raw_test = load_json(opt.test_path)
    self.raw_valid = load_json(opt.valid_path)
    self.vfeat_load = opt.vid_feat_flag
    self.reg_flag = "regional" in opt.input_streams
    self.regtopk_flag = (-1 != opt.regional_topk)

    # Options are useful to access later on.
    self.opt = opt

    # Regional features.
    if self.reg_flag:
        self.reg_h5 = h5py.File(opt.reg_feat_path, "r", driver=None)

    # Visual concepts: either the provided pickle or a class list read from
    # a vocabulary text file.
    if opt.my_vcpt:
        with open(
                '/home/jumperkables/regional_stuff/faster-rcnn.pytorch/data/pretrained_model/objects_vocab.txt',
                'r') as vocab_file:
            vocab_lines = vocab_file.readlines()
        class_names = np.append(np.asarray(['__background__']),
                                np.asarray(vocab_lines))
        self.vcpt_classes = [name.strip('\n') for name in class_names]
        # Even without regional features the regional file is still needed
        # for these classes.
        if not self.reg_flag:
            self.reg_h5 = h5py.File(opt.reg_feat_path, "r", driver=None)
    else:
        self.vcpt_dict = load_pickle(opt.vcpt_path)

    # Video features (opened with driver=None rather than opt.h5driver).
    if self.vfeat_load:
        self.vid_h5 = h5py.File(opt.vid_feat_path, "r", driver=None)

    # Motion features.
    # NOTE(review): assumed to be opened unconditionally; confirm it was not
    # nested under the video-feature flag.
    self.c3d_h5 = h5py.File(os.path.expanduser(
        "~/kable_management/data/tvqa/motion_features/fixed_tvqa_c3d_fc6_features.hdf5"
    ), "r", driver=None)

    # Every recognised BERT setting uses the same pretrained tokenizer.
    if opt.bert in ("default", "multi_choice", "qa"):
        self.bert_tokeniser = BertTokenizer.from_pretrained(
            'bert-base-uncased')

    self.glove_embedding_path = opt.glove_path
    self.normalize_v = opt.normalize_v
    self.with_ts = opt.with_ts
    self.mode = mode
    self.cur_data_dict = self.get_cur_dict()

    # Word embedding / vocabulary settings.
    self.embedding_dim = opt.embedding_size
    if opt.bert is None:
        # NOTE(review): all six vocabulary attributes are assumed to belong
        # to this branch; confirm against the original layout.
        self.word2idx_path = opt.word2idx_path
        self.idx2word_path = opt.idx2word_path
        self.vocab_embedding_path = opt.vocab_embedding_path
        special_tokens = ("<pad>", "<unk>", "<eos>")
        self.word2idx = {
            token: index for index, token in enumerate(special_tokens)
        }
        self.idx2word = dict(enumerate(special_tokens))
        self.offset = len(self.word2idx)

    # Entry keys: only the subtitle key depends on ``with_ts``.
    subtitle_key = "located_sub_text" if self.with_ts else "sub_text"
    self.text_keys = ["q", "a0", "a1", "a2", "a3", "a4", subtitle_key]
    self.vcpt_key = "vcpt"
    self.label_key = "answer_idx"
    self.qid_key = "qid"
    self.vid_name_key = "vid_name"
    self.located_frm_key = "located_frame"

    # Sanity check on the first validation entry; "vcpt" is exempt.
    expected_keys = self.text_keys + [
        self.vcpt_key, self.qid_key, self.vid_name_key
    ]
    for key in expected_keys:
        if key != "vcpt":
            assert key in list(self.raw_valid[0].keys())

    # Without BERT, load the cached vocabulary when all three files exist,
    # otherwise build it.
    if opt.bert is None:
        cache_files = [
            self.word2idx_path, self.idx2word_path, self.vocab_embedding_path
        ]
        if files_exist(cache_files):
            print("\nLoading cache ...")
            self.word2idx = load_pickle(self.word2idx_path)
            self.idx2word = load_pickle(self.idx2word_path)
            self.vocab_embedding = load_pickle(self.vocab_embedding_path)
        else:
            print("\nNo cache founded.")
            self.build_word_vocabulary(
                word_count_threshold=opt.word_count_threshold)