Example 1
def parse_organism(org,
                   in_path=cs.STRING_PATH,
                   out_path=cs.JSON_PATH,
                   check=True):
    ppi_name = '{}.protein.links.v10.5.txt'.format(org)
    node_name = '{}_parsed_nodes.json'.format(org)
    edge_name = '{}_parsed_edges.json'.format(org)

    ppi_path = utils.join_path(in_path, ppi_name)
    node_path = utils.join_path(out_path, node_name)
    edge_path = utils.join_path(out_path, edge_name)

    if check and utils.files_exist([node_name, edge_name], out_path):
        message = 'using existing parsed jsons for {}'.format(org)
        utils.print_log(message)

    else:
        message = 'parsing ppi information of {}'.format(org)
        utils.print_log(message)

        parse_organism_ppi(org, ppi_path, node_path, edge_path)

        message = 'ppi parsing finished for {}'.format(org)
        utils.print_log(message)

    return organism.Organism(nodes_file=node_path,
                             edges_file=edge_path,
                             org_id=org)
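
All of the examples on this page call a small files_exist helper (and, in Example 1, a utils.join_path wrapper) that the excerpts never define. A minimal sketch of what such helpers could look like, supporting both call styles seen here (a list of full paths, or bare names plus a directory); the real project code may differ:

import os


def join_path(*parts):
    # thin wrapper over os.path.join, matching how utils.join_path is used
    return os.path.join(*parts)


def files_exist(file_names, path=None):
    # True only if every listed file is present; when a directory is given,
    # bare names are resolved relative to it first
    if path is not None:
        file_names = [os.path.join(path, name) for name in file_names]
    return all(os.path.isfile(name) for name in file_names)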
Example 2
    def __init__(self, opt, mode="train"):
        self.raw_train = load_json(opt.train_path)
        self.raw_test = load_json(opt.test_path)
        self.raw_valid = load_json(opt.valid_path)
        self.vcpt_dict = load_pickle(opt.vcpt_path)
        self.vfeat_load = opt.vid_feat_flag
        if self.vfeat_load:
            self.vid_h5 = h5py.File(opt.vid_feat_path,
                                    "r",
                                    driver=opt.h5driver)
        self.glove_embedding_path = opt.glove_path
        self.normalize_v = opt.normalize_v
        self.with_ts = opt.with_ts
        self.mode = mode
        self.cur_data_dict = self.get_cur_dict()

        # set word embedding / vocabulary
        self.word2idx_path = opt.word2idx_path
        self.idx2word_path = opt.idx2word_path
        self.vocab_embedding_path = opt.vocab_embedding_path
        self.embedding_dim = opt.embedding_size
        self.word2idx = {"<pad>": 0, "<unk>": 1, "<eos>": 2}
        self.idx2word = {0: "<pad>", 1: "<unk>", 2: "<eos>"}
        self.offset = len(self.word2idx)

        self.bert_tokenizer = BertTokenizer.from_pretrained(
            'bert-base-uncased', max_len=300)

        # set entry keys
        if self.with_ts:
            self.text_keys = [
                "q", "a0", "a1", "a2", "a3", "a4", "located_sub_text"
            ]
        else:
            self.text_keys = ["q", "a0", "a1", "a2", "a3", "a4", "sub_text"]
        self.vcpt_key = "vcpt"
        self.label_key = "answer_idx"
        self.qid_key = "qid"
        self.vid_name_key = "vid_name"
        self.located_frm_key = "located_frame"
        for k in self.text_keys + [
                self.vcpt_key, self.qid_key, self.vid_name_key
        ]:
            if k == "vcpt":
                continue
            assert k in self.raw_valid[0]

        # build/load vocabulary
        if not files_exist([
                self.word2idx_path, self.idx2word_path,
                self.vocab_embedding_path
        ]):
            print("\nNo cache founded.")
            self.build_word_vocabulary(
                word_count_threshold=opt.word_count_threshold)
        else:
            print("\nLoading cache ...")
            self.word2idx = load_pickle(self.word2idx_path)
            self.idx2word = load_pickle(self.idx2word_path)
            self.vocab_embedding = load_pickle(self.vocab_embedding_path)
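
Examples 2, 3, 8 and 9 also lean on load_json and load_pickle utilities. Their behavior is easy to infer from usage; a plausible minimal version (assumed, not taken from the original repositories):

import json
import pickle


def load_json(file_path):
    # parse a JSON file into Python objects (lists of dicts in these examples)
    with open(file_path, "r") as f:
        return json.load(f)


def load_pickle(file_path):
    # unpickle a binary file, e.g. the visual-concept dictionaries above
    with open(file_path, "rb") as f:
        return pickle.load(f)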
Example 3
    def __init__(self, opt, mode="train"):
        self.opt = opt
        self.is_eval = mode != "train"  # are we running in eval mode
        self.raw_train = load_json(opt.train_path)
        # self.raw_test = load_json(opt.test_path)
        self.raw_valid = load_json(opt.valid_path)
        self.sub_data = load_json(opt.sub_path)
        self.sub_flag = "sub" in opt.input_streams
        self.vfeat_flag = "vfeat" in opt.input_streams
        self.vfeat_type = opt.vfeat_type
        self.qa_bert_h5 = h5py.File(opt.qa_bert_path, "r",
                                    driver=opt.h5driver)  # qid + key
        if self.sub_flag:
            self.sub_bert_h5 = h5py.File(opt.sub_bert_path,
                                         "r",
                                         driver=opt.h5driver)  # vid_name
        if self.vfeat_flag:
            self.vid_h5 = h5py.File(opt.vfeat_path, "r",
                                    driver=opt.h5driver)  # add core
        self.vcpt_flag = "vcpt" in opt.input_streams or self.vfeat_flag  # vfeat requires vcpt
        if self.vcpt_flag:
            self.vcpt_dict = load_pickle(opt.vcpt_path) if opt.vcpt_path.endswith(".pickle") \
                else load_json(opt.vcpt_path)
            if opt.debug:
                self.raw_train = filter_list_dicts(self.raw_train, "vid_name",
                                                   self.vcpt_dict.keys())
                self.raw_valid = filter_list_dicts(self.raw_valid, "vid_name",
                                                   self.vcpt_dict.keys())
                # self.raw_test = filter_list_dicts(self.raw_test, "vid_name", self.vcpt_dict.keys())
                print("number of training/valid", len(self.raw_train),
                      len(self.raw_valid))
        self.glove_embedding_path = opt.glove_path
        self.mode = mode
        self.num_region = opt.num_region
        self.use_sup_att = opt.use_sup_att
        self.att_iou_thd = opt.att_iou_thd
        self.cur_data_dict = self.get_cur_dict()

        # tmp
        self.frm_cnt_path = opt.frm_cnt_path
        self.frm_cnt_dict = load_json(self.frm_cnt_path)

        # build/load vocabulary
        self.word2idx_path = opt.word2idx_path
        self.embedding_dim = 300
        self.word2idx = {"<pad>": 0, "<unk>": 1, "<eos>": 2}
        self.idx2word = {0: "<pad>", 1: "<unk>", 2: "<eos>"}
        self.offset = len(self.word2idx)
        text_keys = ["a0", "a1", "a2", "a3", "a4", "q", "sub_text"]
        if not files_exist([self.word2idx_path]):
            print("\nNo cache founded.")
            self.build_word_vocabulary(text_keys, word_count_threshold=2)
        else:
            print("\nLoading cache ...")
            # self.word2idx = load_pickle(self.word2idx_path)
            self.word2idx = load_json(self.word2idx_path)
        self.idx2word = {i: w for w, i in self.word2idx.items()}
Example 4
def check_initial_files(org, in_path=cs.STRING_PATH):
    # the list of initial files needed for code execution
    check_list = [
        '.protein.links.v10.5.txt',
        '.protein.sequences.v10.5.fa',
    ]

    file_names = [org + x for x in check_list]

    if utils.files_exist(file_names, in_path):
        message = 'initial files check passed for {}'.format(org)
        utils.print_log(message)

    else:
        err = 'initial files for {} not found'.format(org)
        utils.print_log(err, mode='err')
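
A hypothetical invocation: STRING database files are named after NCBI taxonomy ids, so org would be a string like '9606' (human), matching file names such as 9606.protein.links.v10.5.txt:

check_initial_files('9606')
human = parse_organism('9606')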
Example 5
def tf_records_preprocessing(n_in: int,
                             n_out: int,
                             raw_data_path: str,
                             tf_records_name: str,
                             feature_cols: Union[List[int], List[List[int]]],
                             label_col: int,
                             factor_type: FactorType,
                             train_size_pct: float = .8):
    train_path, test_path = generate_tf_records_path(tf_records_name)

    if not files_exist([train_path, test_path]):
        # read from csv
        d = read_data_from_csv(raw_data_path)
        # select factor type
        d = get_data_by_type(d, factor_type=factor_type)
        # split train & test
        raw_trn_data, raw_tst_data = split_data(d,
                                                train_size_pct=train_size_pct)
        # split train/test-x/y
        trn_fea, trn_lbl = to_supervised(raw_trn_data,
                                         n_in,
                                         n_out,
                                         label_col=label_col,
                                         feature_cols=feature_cols,
                                         is_train=True)

        tst_fea, tst_lbl = to_supervised(raw_tst_data,
                                         n_in,
                                         n_out,
                                         label_col=label_col,
                                         feature_cols=feature_cols,
                                         is_train=False)  # test split

        # trn_lbl = transform_labels(trn_lbl)
        # tst_lbl = transform_labels(tst_lbl)

        data_to_tf_records(trn_fea, trn_lbl, tst_fea, tst_lbl, tf_records_name)
    else:
        print('files already exist; rename tf_records_name to create new '
              'records, or delete the current files to regenerate them')
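
A hypothetical call, only to make the argument shapes concrete; the CSV path, record name, column indices and FactorType member below are all made up:

tf_records_preprocessing(n_in=24,
                         n_out=1,
                         raw_data_path='data/prices.csv',     # hypothetical path
                         tf_records_name='prices_24in_1out',  # hypothetical name
                         feature_cols=[0, 1, 2],
                         label_col=0,
                         factor_type=FactorType.DAILY)        # hypothetical member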
Example 6
    def get_data(self):
        if files_exist([self._processed_file]) and not self._force_regenerate:
            print("Loading pre-processed {} data...".format(self._set_id))
            self.data, self.relatives, self.order, hier, set_id = torch.load(
                self._processed_file)
            self._num_features = self.data[0]['features'].size()[0]
            print("Done loading.")
            if hier != self._hierarchical or set_id != self._set_id:
                print(
                    "Loaded data metadata differs from current specs, regenerating data..."
                )
                self._force_regenerate = True
                self.get_data()
            else:
                self._build_pairs()
        else:
            print("Processing {} data...".format(self._set_id))
            self.pre_process()
            self._build_pairs()
            print("Done processing.")
Example 7
def tf_records_preprocessing(n_in: int, n_out: int, raw_data_path: str,
                             tf_records_name: str,
                             feature_cols: Union[List[int], List[List[int]]]):
    """

    :param n_in:
    :param n_out:
    :param raw_data_path:
    :param tf_records_name:
    :param feature_cols:
    :return:
    """

    train_path, test_path = generate_tf_records_path(tf_records_name)

    if not files_exist([train_path, test_path]):
        # read from csv
        d = read_data_from_csv(raw_data_path)
        # split train & test
        raw_trn_data, raw_tst_data = split_data(d)
        # split train/test-x/y
        trn_fea, trn_lbl = to_supervised(raw_trn_data,
                                         n_in,
                                         n_out,
                                         feature_cols=feature_cols,
                                         label_col=0,
                                         is_train=True)
        tst_fea, tst_lbl = to_supervised(raw_tst_data,
                                         n_in,
                                         n_out,
                                         feature_cols=feature_cols,
                                         label_col=0,
                                         is_train=False)
        # write final train & test data to TFRecord
        data_to_tf_records(trn_fea, trn_lbl, tst_fea, tst_lbl, tf_records_name)
    else:
        print('files already exist')
Example 8
    def __init__(self, opt, mode="train"):
        self.opt = opt
        self.inference = mode == "test"  # inference mode, no GT annotations
        self.raw_train = load_json(opt.train_path)
        if opt.test_path:
            self.raw_test = load_json(opt.test_path)
        self.raw_valid = load_json(opt.valid_path)
        self.sub_data = load_json(opt.sub_path)
        self.sub_flag = "sub" in opt.input_streams

        self.vfeat_flag = "vfeat" in opt.input_streams
        self.vfeat_type = opt.vfeat_type

        self.qa_bert_h5 = h5py.File(opt.qa_bert_path, "r",
                                    driver=opt.h5driver)  # qid + key
        if self.sub_flag:
            self.sub_bert_h5 = h5py.File(opt.sub_bert_path,
                                         "r",
                                         driver=opt.h5driver)  # vid_name

        if self.vfeat_flag:
            self.vid_h5 = h5py.File(opt.vfeat_path, "r",
                                    driver=opt.h5driver)  # add core
        self.vcpt_flag = "vcpt" in opt.input_streams or self.vfeat_flag  # if vfeat, must vcpt
        if self.vcpt_flag:
            self.vcpt_dict = load_pickle(opt.vcpt_path) if opt.vcpt_path.endswith(".pickle") \
                else load_json(opt.vcpt_path)
            if opt.debug:
                self.raw_train = filter_list_dicts(self.raw_train, "vid_name",
                                                   self.vcpt_dict.keys())
                self.raw_valid = filter_list_dicts(self.raw_valid, "vid_name",
                                                   self.vcpt_dict.keys())
                if opt.test_path:
                    self.raw_test = filter_list_dicts(self.raw_test,
                                                      "vid_name",
                                                      self.vcpt_dict.keys())
                print("number of training/valid", len(self.raw_train),
                      len(self.raw_valid))

        self.glove_embedding_path = opt.glove_path
        self.mode = mode
        self.num_region = opt.num_region
        self.use_sup_att = opt.use_sup_att
        self.att_iou_thd = opt.att_iou_thd
        self.cur_data_dict = self.get_cur_dict()
        # self.vcpt_mtx_dict = load_pickle(opt.vcpt_mtx_path)

        # tmp
        self.frm_cnt_path = opt.frm_cnt_path
        self.frm_cnt_dict = load_json(self.frm_cnt_path)

        # build/load vocabulary
        self.word2idx_path = opt.word2idx_path
        self.embedding_dim = 300
        # build initialized vocabulary
        self.word2idx = {"<pad>": 0, "<unk>": 1, "<eos>": 2}
        self.idx2word = {0: "<pad>", 1: "<unk>", 2: "<eos>"}
        self.offset = len(self.word2idx)
        self.vcpt_emb_dict = load_json(
            "/home/data/tvqa_plus_stage_features/test.json")
        text_keys = ["a0", "a1", "a2", "a3", "a4", "q", "sub_text"]

        # Build new vocabulary if word2idx.json not found
        if not files_exist([self.word2idx_path]):
            print("\nNo cache founded.")
            self.build_word_vocabulary(text_keys, word_count_threshold=2)
        # Use the existing word2idx.json
        else:
            print("\nLoading cache ...")
            self.word2idx = load_json(self.word2idx_path)
        self.idx2word = {i: w for w, i in self.word2idx.items()}

        self.eval_object_vocab = load_json(opt.eval_object_vocab_path)
        self.eval_object_word_ids = [
            self.word2idx.get(e, self.word2idx["<unk>"])
            for e in self.eval_object_vocab
        ]
Example 9
    def __init__(self, opt, mode="train"):
        self.raw_train = load_json(opt.train_path)
        self.raw_test = load_json(opt.test_path)
        self.raw_valid = load_json(opt.valid_path)
        self.vfeat_load = opt.vid_feat_flag
        self.reg_flag = "regional" in opt.input_streams
        self.regtopk_flag = opt.regional_topk != -1

        # Options are useful to access
        self.opt = opt

        # Regional features loading
        if self.reg_flag:
            self.reg_h5 = h5py.File(opt.reg_feat_path, "r", driver=None)
        # Visual concept loaded
        if not opt.my_vcpt:  # Their visual concepts
            self.vcpt_dict = load_pickle(opt.vcpt_path)
        else:
            # Load the visual concept classes extracted locally
            with open(
                    '/home/jumperkables/regional_stuff/faster-rcnn.pytorch/data/pretrained_model/objects_vocab.txt',
                    'r') as f:
                data = f.readlines()
            pascal_classes = np.asarray(['__background__'])
            pascal_classes = np.append(pascal_classes, np.asarray(data))
            self.vcpt_classes = [x.strip('\n') for x in pascal_classes]
            # even without regional features, this file is still needed for
            # the locally extracted classes
            if not self.reg_flag:
                self.reg_h5 = h5py.File(opt.reg_feat_path, "r", driver=None)

        # Video features
        if self.vfeat_load:
            self.vid_h5 = h5py.File(opt.vid_feat_path, "r",
                                    driver=None)  # instead of opt.h5driver

        # Motion features
        self.c3d_h5 = h5py.File(os.path.expanduser(
            "~/kable_management/data/tvqa/motion_features/fixed_tvqa_c3d_fc6_features.hdf5"
        ),
                                "r",
                                driver=None)

        # Get the BERT tokenizer ready; all three modes currently use the
        # same pretrained tokenizer
        if opt.bert in ("default", "multi_choice", "qa"):
            self.bert_tokeniser = BertTokenizer.from_pretrained(
                'bert-base-uncased')

        self.glove_embedding_path = opt.glove_path
        self.normalize_v = opt.normalize_v
        self.with_ts = opt.with_ts
        self.mode = mode
        self.cur_data_dict = self.get_cur_dict()

        # set word embedding / vocabulary
        self.embedding_dim = opt.embedding_size
        if opt.bert is None:
            self.word2idx_path = opt.word2idx_path
            self.idx2word_path = opt.idx2word_path
            self.vocab_embedding_path = opt.vocab_embedding_path
            self.word2idx = {"<pad>": 0, "<unk>": 1, "<eos>": 2}
            self.idx2word = {0: "<pad>", 1: "<unk>", 2: "<eos>"}
            self.offset = len(self.word2idx)

        # set entry keys
        if self.with_ts:
            self.text_keys = [
                "q", "a0", "a1", "a2", "a3", "a4", "located_sub_text"
            ]
        else:
            self.text_keys = ["q", "a0", "a1", "a2", "a3", "a4", "sub_text"]
        self.vcpt_key = "vcpt"
        self.label_key = "answer_idx"
        self.qid_key = "qid"
        self.vid_name_key = "vid_name"
        self.located_frm_key = "located_frame"
        for k in self.text_keys + [
                self.vcpt_key, self.qid_key, self.vid_name_key
        ]:
            if k == "vcpt":
                continue
            assert k in self.raw_valid[0]

        # build/load vocabulary
        if opt.bert is None:
            if not files_exist([
                    self.word2idx_path, self.idx2word_path,
                    self.vocab_embedding_path
            ]):
                print("\nNo cache founded.")
                self.build_word_vocabulary(
                    word_count_threshold=opt.word_count_threshold)
            else:
                print("\nLoading cache ...")
                self.word2idx = load_pickle(self.word2idx_path)
                self.idx2word = load_pickle(self.idx2word_path)
                self.vocab_embedding = load_pickle(self.vocab_embedding_path)
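
Examples 2, 3, 8 and 9 are all __init__ methods of Dataset-style classes. A hypothetical construction call; the class name, the option parser, and the non-default mode string are all assumptions:

opt = parse_args()                          # hypothetical option parser
train_set = TVQADataset(opt, mode="train")  # hypothetical class name
valid_set = TVQADataset(opt, mode="valid")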