Example 1
    def parse_graph_to_hin(self,
                           first_graph,
                           second_graph,
                           third_graph=None,
                           first_mapping_file='ec2compound.pkl',
                           second_mapping_file=None,
                           hin_file='hin.pkl',
                           ospath='objectset',
                           display_params: bool = True):
        if display_params:
            self.__print_arguments()
            time.sleep(2)
        print('\t>> Building a multi-modal graph...')
        logger.info('\t>> Building a multi-modal graph...')
        hin = self.__compose_graphs(first_graph=first_graph,
                                    second_graph=second_graph,
                                    first_adjaceny_matrix=first_mapping_file,
                                    third_graph=third_graph,
                                    second_adjaceny_matrix=second_mapping_file)
        if self.remove_isolates:
            print(
                '\t\t--> Removing {0:d} isolated nodes from the multi-modal graph...'
                .format(len(list(nx.isolates(hin)))))
            logger.info(
                '\t\t--> Removing {0:d} isolated nodes from the multi-modal graph...'
                .format(len(list(nx.isolates(hin)))))
            hin.remove_nodes_from(list(nx.isolates(hin)))

        save_data(data=hin,
                  file_name=hin_file,
                  save_path=ospath,
                  tag='heterogeneous information network',
                  mode='w+b')
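
A minimal, self-contained sketch of the isolated-node cleanup used above, on a toy NetworkX graph (the node names are made up):

import networkx as nx

g = nx.Graph()
g.add_edges_from([('EC-1.1.1.1', 'CPD-1'), ('CPD-1', 'PWY-101')])
g.add_node('ORPHAN')                       # an isolated node
print(list(nx.isolates(g)))                # ['ORPHAN']
g.remove_nodes_from(list(nx.isolates(g)))  # same call as in parse_graph_to_hin
print(g.number_of_nodes())                 # 3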
Example 2
def synthesize_report(X, sample_ids, y_pred, y_dict_ids, y_common_name, component_dict, labels_components,
                      y_pred_score=None,
                      batch_size=30, num_jobs=1, rsfolder="Results", rspath="../.", dspath="../.", file_name='labels'):
    if y_pred is None:
        raise Exception("Please provide two matrices as numpy matrix format: "
                        "(num_samples, num_labels), representing pathway scores "
                        "and the status of prediction as binary values.")

    num_samples = len(sample_ids)
    main_folder_path = os.path.join(rspath, rsfolder)
    list_batches = np.arange(start=0, stop=num_samples, step=batch_size)
    parallel = Parallel(n_jobs=num_jobs, verbose=0)

    # Delete the previous main folder and recreate a new one
    create_remove_dir(folder_path=main_folder_path)
    if y_pred_score is not None:
        results = parallel(delayed(__synthesize_report)(X[batch:batch + batch_size],
                                                        sample_ids[batch:batch + batch_size],
                                                        y_pred_score[batch:batch + batch_size],
                                                        y_pred[batch:batch + batch_size],
                                                        y_dict_ids, y_common_name, component_dict,
                                                        labels_components, main_folder_path, batch_idx,
                                                        len(list_batches))
                           for batch_idx, batch in enumerate(list_batches))
    else:
        results = parallel(delayed(__synthesize_report)(X[batch:batch + batch_size],
                                                        sample_ids[batch:batch + batch_size],
                                                        y_pred_score, y_pred[batch:batch + batch_size],
                                                        y_dict_ids, y_common_name, component_dict,
                                                        labels_components, main_folder_path, batch_idx,
                                                        len(list_batches))
                           for batch_idx, batch in enumerate(list_batches))
    desc = '\t\t--> Synthesizing pathway reports {0:.4f}%...'.format(100)
    print(desc)
    y = list(zip(*results))
    y = [item for lst in y for item in lst]
    print('\t\t--> Storing predictions (label) to: {0:s}'.format(file_name + '_labels.pkl'))
    save_data(data=y, file_name=file_name + '_labels.pkl', save_path=dspath, mode="wb",
              print_tag=False)
    y_dict_ids = dict((y_id, y_idx) for y_idx, y_id in y_dict_ids.items())
    y_csr = np.zeros((len(y), len(y_dict_ids.keys())))
    for idx, lst in enumerate(y):
        for item in lst:
            if item in y_dict_ids:
                y_csr[idx, y_dict_ids[item]] = 1
    print('\t\t--> Storing predictions (label index) to: {0:s}'.format(file_name + '_y.pkl'))
    save_data(data=lil_matrix(y_csr), file_name=file_name + "_y.pkl", save_path=dspath, mode="wb",
              print_tag=False)
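
For reference, a toy run of the label-index binarization at the end of synthesize_report; the label ids and their column mapping below are hypothetical:

import numpy as np
from scipy.sparse import lil_matrix

y = [['PWY-101', 'PWY-7'], ['PWY-7']]                 # per-sample predicted label ids
y_dict_ids = {'PWY-101': 0, 'PWY-7': 1, 'PWY-3': 2}   # label id -> column index
y_csr = np.zeros((len(y), len(y_dict_ids)))
for idx, lst in enumerate(y):
    for item in lst:
        if item in y_dict_ids:
            y_csr[idx, y_dict_ids[item]] = 1
print(lil_matrix(y_csr).toarray())
# [[1. 1. 0.]
#  [0. 1. 0.]]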
Example 3
    def generate_walks(self,
                       constraint_type,
                       just_type,
                       just_memory_size,
                       use_metapath_scheme,
                       metapath_scheme='ECTCE',
                       burn_in_phase: int = 10,
                       burn_in_input_size: float = 0.5,
                       hin='hin.pkl',
                       save_file_name='hin',
                       ospath='objectset',
                       dspath='dataset',
                       display_params: bool = True):
        if burn_in_phase < 0:
            burn_in_phase = 1
        self.burn_in_phase = burn_in_phase
        if burn_in_input_size < 0 or burn_in_input_size > 1:
            burn_in_input_size = 0.1
        self.burn_in_input_size = burn_in_input_size
        if use_metapath_scheme:
            if metapath_scheme is not None and metapath_scheme.strip() != '':
                self.__check_metapath_validity(metapath_scheme=metapath_scheme)
                hin.metapath_scheme = metapath_scheme
            else:
                desc = '\n\t   --> Error: Please provide a metapath scheme...'
                logger.warning(desc)
                raise Exception(desc)
        else:
            hin.metapath_scheme = None
            metapath_scheme = None

        if display_params:
            self.__print_arguments(
                use_metapath_scheme='Use a metapath scheme: {0}'.format(
                    use_metapath_scheme),
                metapath_scheme='The specified metapath scheme: {0}'.format(
                    metapath_scheme),
                constraint_type='Use node type: {0}'.format(constraint_type),
                just_type='Use JUST algorithm: {0}'.format(just_type),
                burn_in_phase='Burn in phase count: {0}'.format(
                    self.burn_in_phase),
                burn_in_input_size=
                'Subsampling size of the number of walks and length for burn in phase: {0}'
                .format(self.burn_in_input_size))
            time.sleep(2)

        init_node_prob, type2index, type2prob = self.__init_probability(hin)
        hin.type2index = type2index
        hin.type2prob = type2prob
        hin.trans_metapath_scheme = use_metapath_scheme
        hin.trans_constraint_type = constraint_type
        hin.trans_just_type = just_type
        hin.q = self.q
        hin.p = self.p
        hin.learning_rate = self.learning_rate
        hin.num_walks = self.num_walks
        hin.walk_length = self.walk_length

        print('\t>> Calculate initial transition probabilities...')
        logger.info('\t>> Calculate initial transition probabilities...')
        N = hin.number_of_nodes()
        trans_prob = lil_matrix((N, N))
        for curr_node, curr_node_data in hin.nodes(data=True):
            neigh_curr_node = np.array(
                [hin.nodes[edge[1]]['mapped_idx'] for edge in hin.edges(curr_node)],
                dtype=int)
            trans_prob[curr_node_data['mapped_idx'], neigh_curr_node] = 1
        trans_prob = lil_matrix(trans_prob.multiply(1 / trans_prob.sum(1)))

        print('\t>> Calculate transition probabilities...')
        logger.info('\t>> Calculate transition probabilities...')
        for burn_in_count in np.arange(start=1, stop=burn_in_phase + 1):
            desc = '\t\t## Burn in phase {0} (out of {1})...{2}'.format(
                burn_in_count, burn_in_phase, 20 * ' ')
            print(desc)
            for node_idx, node_data in enumerate(hin.nodes(data=True)):
                trans_prob = self._walks_per_node(
                    node_idx=node_idx,
                    node_curr=node_data[0],
                    node_curr_data=node_data[1],
                    hin=hin,
                    just_memory_size=just_memory_size,
                    trans_prob=trans_prob,
                    burn_in_phase=True)
        node_prob = trans_prob.T.dot(init_node_prob)
        results = node_prob.sum()
        node_prob = node_prob.multiply(1 / results)
        hin.trans_prob = trans_prob
        for node in hin.nodes(data=True):
            attrs = {node[0]: {'weight': node_prob[node[1]['mapped_idx']]}}
            nx.set_node_attributes(hin, attrs)
        save_data(data=hin,
                  file_name=save_file_name + '.pkl',
                  save_path=ospath,
                  tag='heterogeneous information network',
                  mode='wb')

        print('\t>> Generate walks...')
        logger.info('\t>> Generate walks...')
        save_file_name = 'X_' + save_file_name + '.txt'
        if os.path.exists(os.path.join(dspath, save_file_name)):
            os.remove(path=os.path.join(dspath, save_file_name))
        pool = Pool(processes=self.num_jobs)
        results = [
            pool.apply_async(self._walks_per_node,
                             args=(node_idx, node_data[0], node_data[1], hin,
                                   just_memory_size, trans_prob, dspath,
                                   save_file_name, False))
            for node_idx, node_data in enumerate(hin.nodes(data=True))
        ]
        output = [p.get() for p in results]
        desc = '\t\t## Stored generated walks to: {0}'.format(save_file_name)
        print(desc)
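
The initial transition matrix built above is simply a row-normalized adjacency matrix; a small sketch of that normalization on a hypothetical 3-node graph:

from scipy.sparse import lil_matrix

trans_prob = lil_matrix((3, 3))
trans_prob[0, [1, 2]] = 1     # node 0 links to nodes 1 and 2
trans_prob[1, 0] = 1
trans_prob[2, 0] = 1
trans_prob = lil_matrix(trans_prob.multiply(1 / trans_prob.sum(1)))
print(trans_prob.toarray())   # every row sums to 1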
Example 4
    def _walks_per_node(self,
                        node_idx,
                        node_curr,
                        node_curr_data,
                        hin,
                        just_memory_size,
                        trans_prob,
                        dspath=".",
                        save_file_name=".",
                        burn_in_phase=False):
        if len(list(hin.neighbors(node_curr))) == 0:
            desc = '\t\t\t--> Extracted walks for {0:.4f}% of nodes...'.format(
                ((node_idx + 1) / hin.number_of_nodes()) * 100)
            print(desc, end="\r")
            if burn_in_phase:
                return trans_prob
            else:
                return
        if hin.trans_metapath_scheme:
            metapath_scheme = None
            if node_curr_data['type'] in hin.metapath_scheme:
                frequent_scheme = hin.metapath_scheme[:-1] * 2
                idx = str(frequent_scheme).index(node_curr_data['type'])
                metapath_scheme = frequent_scheme[idx:idx +
                                                  len(hin.metapath_scheme) - 1]
                metapath_scheme = metapath_scheme * (self.walk_length //
                                                     len(metapath_scheme))
            if metapath_scheme is None:
                if burn_in_phase:
                    return trans_prob
                else:
                    return
            if node_curr_data['type'] != metapath_scheme[0]:
                desc = '\t\t\t--> Extracted walks for {0:.4f}% of nodes...'.format(
                    ((node_idx + 1) / hin.number_of_nodes()) * 100)
                print(desc, end="\r")
                if burn_in_phase:
                    return trans_prob
                else:
                    return
        walk_length = self.walk_length
        num_walks = self.num_walks + 1
        if burn_in_phase:
            num_walks = int(self.num_walks * self.burn_in_input_size)
            walk_length = int(self.walk_length * self.burn_in_input_size)
            if num_walks < 0:
                num_walks = 10
            if walk_length < 0:
                walk_length = 10
        for curr_walk in np.arange(start=1, stop=num_walks):
            X = [node_curr_data['mapped_idx']]
            prev_node = [node_curr]
            curr_node = node_curr
            curr_node_data = node_curr_data
            # The size of memory to hold the nodes types
            q_hist = collections.deque(maxlen=just_memory_size)
            q_hist.extend(node_curr_data['type'])
            for curr_length in np.arange(start=1, stop=walk_length):
                if curr_length > 1:
                    list_neigh_idx_prev_node = [
                        hin.nodes[edge[1]]['mapped_idx']
                        for edge in hin.edges(prev_node[-2])
                    ]
                    prev_node_idx = X[-2]
                else:
                    list_neigh_idx_prev_node = [
                        hin.nodes[edge[1]]['mapped_idx']
                        for edge in hin.edges(prev_node[-1])
                    ]
                    prev_node_idx = X[-1]

                if hin.trans_metapath_scheme:
                    neigh_curr_node = [
                        (edge[1], edge[2])
                        for edge in hin.edges(curr_node, data='weight')
                        if hin.nodes[
                            edge[1]]['type'] == metapath_scheme[curr_length]
                    ]
                    if len(neigh_curr_node) == 0:
                        neigh_curr_node = [
                            (edge[1], edge[2])
                            for edge in hin.edges(curr_node, data='weight')
                            if hin.nodes[edge[1]]['type'] == metapath_scheme[
                                curr_length - 1]
                        ]
                else:
                    neigh_curr_node = [
                        (edge[1], edge[2])
                        for edge in hin.edges(curr_node, data='weight')
                    ]

                list_neigh_curr_node = np.array(
                    [node[0] for node in neigh_curr_node])
                neigh_type_curr_node = np.array(
                    [hin.nodes[v]['type'] for v in list_neigh_curr_node])
                neigh_idx_curr_node = np.array([
                    hin.nodes[node]['mapped_idx']
                    for node in list_neigh_curr_node
                ])

                # Retrieve weights of nodes (usually set to 1.) at the start of burn in phase;
                # otherwise, retrieve the previous transition probabilities.
                trans_from_curr_node = trans_prob[
                    X[-1], neigh_idx_curr_node].toarray()[0]

                if hin.trans_constraint_type or hin.trans_just_type:
                    # Compute the transition probability based on types of the current node's neighbours.
                    # We further smooth the transition probabilities by adding EPSILON to weights of current
                    # node, next node and current node type.
                    trans_node_type = [
                        self.__alpha(
                            next_node=hin.nodes[next_node]['mapped_idx'],
                            next_node_type=neigh_type_curr_node[idx],
                            curr_node_type=curr_node_data['type'],
                            weight_curr_node=len(list(
                                hin.neighbors(next_node))) + EPSILON,
                            weight_curr_node_type=sum(
                                neigh_type_curr_node == curr_node_data['type'])
                            + EPSILON,
                            weight_next_node_type=sum(
                                neigh_type_curr_node == hin.nodes[next_node]
                                ['type']) + EPSILON,
                            explore_layer=hin.trans_just_type,
                            constraint_type=True)
                        for idx, next_node in enumerate(list_neigh_curr_node)
                    ]
                    trans_node_type = np.multiply(trans_node_type,
                                                  trans_from_curr_node)

                    if hin.trans_just_type and not hin.trans_metapath_scheme:
                        if len(q_hist) == just_memory_size:
                            available_types = set(q_hist)
                            for t in available_types:
                                # Explore within a layer more frequently as suggested by JUST; however,
                                # the JUST algorithm is modified to explore a wider range when the memory
                                # size in q_hist is larger than the number of node types.
                                # Note, when q == p then we recover the JUST algorithm.
                                if hin.q != hin.p:
                                    weight_decay = 1 / q_hist.count(t)
                                    if q_hist.count(t) == int(
                                            just_memory_size * hin.q):
                                        weight_decay = -q_hist.count(t)
                                else:
                                    weight_decay = -q_hist.count(t)
                                tmp = trans_node_type[neigh_type_curr_node ==
                                                      t] * np.exp(weight_decay)
                                trans_node_type[neigh_type_curr_node ==
                                                t] = tmp
                    trans_node_type = trans_node_type / np.sum(trans_node_type)
                    node_type = np.random.choice(neigh_type_curr_node,
                                                 size=1,
                                                 p=trans_node_type)

                    # Include only those nodes that have the same chosen type.
                    list_neigh_curr_node = [
                        (edge[1], edge[2])
                        for edge in hin.edges(curr_node, data='weight')
                        if hin.nodes[edge[1]]['type'] == node_type
                    ]
                    neigh_idx_curr_node = np.array([
                        hin.nodes[node[0]]['mapped_idx']
                        for node in list_neigh_curr_node
                    ])

                    # Retrieve weights of nodes (usually set to 1.) at the start of burn in phase;
                    # otherwise, retrieve the previous transition probabilities.
                    trans_from_curr_node = trans_prob[
                        X[-1], neigh_idx_curr_node].toarray()[0]
                    list_neigh_curr_node = np.array(
                        [node[0] for node in list_neigh_curr_node])

                # Compute the transition probability of the current node's neighbours based on the chosen type.
                # We further smooth the transition probabilities by adding EPSILON to weights of current node.
                trans_prob_next_node = [
                    self.__alpha(
                        next_node=hin.nodes[next_node]['mapped_idx'],
                        prev_node=prev_node_idx,
                        neighbours_prev_node=list_neigh_idx_prev_node,
                        weight_curr_node=len(list(hin.neighbors(next_node))) +
                        EPSILON) for next_node in list_neigh_curr_node
                ]
                trans_prob_next_node = np.multiply(trans_prob_next_node,
                                                   trans_from_curr_node)
                trans_prob_next_node = trans_prob_next_node / np.sum(
                    trans_prob_next_node)
                next_node = np.random.choice(neigh_idx_curr_node,
                                             1,
                                             p=trans_prob_next_node)[0]

                # If the transition probability is not computed then initialize it with the most recent
                # estimation; otherwise update the existing one.
                tmp = trans_prob_next_node[neigh_idx_curr_node == next_node]
                trans_prob[X[-1], next_node] = trans_prob[
                    X[-1], next_node] + tmp * self.learning_rate
                curr_node = list_neigh_curr_node[neigh_idx_curr_node ==
                                                 next_node][0]
                curr_node_data = hin.nodes[curr_node]
                # Store the sequence of simulated walks and the nodes in q_hist up to the predefined memory size.
                X = X + [next_node]
                prev_node = prev_node + [curr_node]
                q_hist.extend(hin.nodes[curr_node]['type'])
            # Save the generated instances into the .txt file
            if not burn_in_phase:
                X = '\t'.join([str(v) for v in X])
                save_data(data=X + '\n',
                          file_name=save_file_name,
                          save_path=dspath,
                          mode='a',
                          w_string=True,
                          print_tag=False)
                desc = '\t\t\t--> Extracted walks for {0:.4f}% of nodes...'.format(
                    ((node_idx + 1) / hin.number_of_nodes()) * 100)
                print(desc, end="\r")
        if burn_in_phase:
            return trans_prob
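
One step of the walk in isolation: after the weights are normalized, the next node is drawn from the current node's neighbours with np.random.choice, as in _walks_per_node above (the indices and weights here are hypothetical):

import numpy as np

neigh_idx_curr_node = np.array([4, 7, 9])           # mapped indices of the neighbours
trans_prob_next_node = np.array([0.2, 0.5, 0.1])    # unnormalized transition weights
trans_prob_next_node = trans_prob_next_node / np.sum(trans_prob_next_node)
next_node = np.random.choice(neigh_idx_curr_node, 1, p=trans_prob_next_node)[0]
print(next_node)   # 4, 7, or 9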
Example 5
def __train(arg):
    # Setup the number of operations to employ
    steps = 1
    # Whether to display parameters at every operation
    display_params = True

    ##########################################################################################################
    ######################                        PREPROCESSING                         ######################
    ##########################################################################################################

    if arg.define_bags:
        print("\n{0})- Construct bags_labels centroids...".format(steps))
        steps = steps + 1

        # load a hin file
        hin = load_data(file_name=arg.hin_name,
                        load_path=arg.mdpath,
                        tag="heterogeneous information network")
        node2idx_path2vec = dict(
            (node[0], node[1]["mapped_idx"]) for node in hin.nodes(data=True))
        # map pathways indices of vocab to path2vec pathways indices
        vocab = load_data(file_name=arg.vocab_name,
                          load_path=arg.dspath,
                          tag="vocabulary")
        idxvocab = np.array(
            [idx for idx, v in vocab.items() if v in node2idx_path2vec])
        del hin

        # define pathways 2 bags_labels
        phi = np.load(file=os.path.join(arg.mdpath, arg.bag_phi_name))
        phi = phi[phi.files[0]]
        bags_labels = np.argsort(-phi)
        bags_labels = bags_labels[:, :arg.top_k]
        labels_distr_idx = np.array(
            [[pathway for pathway in bag if pathway in idxvocab]
             for bag in bags_labels])
        bags_labels = preprocessing.MultiLabelBinarizer().fit_transform(
            labels_distr_idx)
        labels_distr_idx = [[
            list(idxvocab).index(label_idx) for label_idx in bag_idx
        ] for bag_idx in labels_distr_idx]

        # get trimmed phi distributions
        phi = -np.sort(-phi)
        phi = phi[:, :arg.top_k]

        # calculate correlation
        sigma = np.load(file=os.path.join(arg.mdpath, arg.bag_sigma_name))
        sigma = sigma[sigma.files[0]]
        sigma[sigma < 0] = EPSILON
        C = np.diag(np.sqrt(np.diag(sigma)))
        C_inv = np.linalg.inv(C)
        rho = np.dot(np.dot(C_inv, sigma), C_inv)
        min_rho = np.min(rho)
        max_rho = np.max(rho)
        rho = rho - min_rho
        rho = rho / (max_rho - min_rho)

        # extracting pathway features
        path2vec_features = np.load(
            file=os.path.join(arg.mdpath, arg.features_name))
        path2vec_features = path2vec_features[path2vec_features.files[0]]
        pathways_idx = np.array([
            node2idx_path2vec[v] for idx, v in vocab.items()
            if v in node2idx_path2vec
        ])
        features = path2vec_features[pathways_idx, :]
        features = features / np.linalg.norm(features, axis=1)[:, np.newaxis]

        # get centroids of bags_labels
        C = np.dot(bags_labels, features) / \
            np.sum(bags_labels, axis=1)[:, np.newaxis]
        C = arg.alpha * C

        # save files
        np.savez(os.path.join(arg.dspath, arg.file_name + "_exp_phi_trim.npz"),
                 phi)
        np.savez(os.path.join(arg.dspath, arg.file_name + "_rho.npz"), rho)
        np.savez(os.path.join(arg.dspath, arg.file_name + "_features.npz"),
                 features)
        np.savez(os.path.join(arg.dspath, arg.file_name + "_bag_centroid.npz"),
                 C)
        save_data(data=bags_labels,
                  file_name=arg.file_name + "_bag_pathway.pkl",
                  save_path=arg.dspath,
                  tag="bags_labels with associated pathways",
                  mode="wb")
        save_data(data=idxvocab,
                  file_name=arg.file_name + "_idxvocab.pkl",
                  save_path=arg.dspath,
                  tag="pathway ids to pathway features ids",
                  mode="wb")
        save_data(data=labels_distr_idx,
                  file_name=arg.file_name + "_labels_distr_idx.pkl",
                  save_path=arg.dspath,
                  tag="bags labels batch_idx with associated pathways",
                  mode="wb")
        print("\t>> Done...")

    if arg.recover_max_bags:
        print("\n{0})- Recover maximum expected bags_labels...".format(steps))
        steps = steps + 1

        # load files
        features = np.load(file=os.path.join(arg.dspath, arg.file_name +
                                             "_features.npz"))
        features = features[features.files[0]]
        C = np.load(file=os.path.join(arg.dspath, arg.file_name +
                                      "_bag_centroid.npz"))
        C = C[C.files[0]]
        bags_labels = load_data(file_name=arg.file_name + "_bag_pathway.pkl",
                                load_path=arg.dspath,
                                tag="bags_labels with associated pathways")
        idxvocab = load_data(file_name=arg.file_name + "_idxvocab.pkl",
                             load_path=arg.dspath,
                             tag="pathway ids to pathway features ids")
        y = load_data(file_name=arg.y_name, load_path=arg.dspath, tag="y")
        y_Bag = np.zeros((y.shape[0], C.shape[0]), dtype=int)

        for s_idx, sample in enumerate(y):
            desc = "\t>> Recovering maximum number of bags_labels: {0:.2f}%...".format(
                ((s_idx + 1) / y.shape[0]) * 100)
            if (s_idx + 1) != y.shape[0]:
                print(desc, end="\r")
            if (s_idx + 1) == y.shape[0]:
                print(desc)
            pathways = np.zeros(len(list(idxvocab)), dtype=int)
            for ptwy_idx in sample.rows[0]:
                if ptwy_idx in idxvocab:
                    pathways[list(idxvocab).index(ptwy_idx)] = 1
            pathways = np.diag(pathways)
            # Mask the feature rows to the pathways present in this sample, keeping
            # the shared features matrix unchanged for the remaining samples.
            sample_features = pathways @ features
            sample_bag_features = np.dot(bags_labels, sample_features) / np.sum(
                bags_labels, axis=1)[:, np.newaxis]
            sample_bag_features = arg.alpha * sample_bag_features
            np.nan_to_num(sample_bag_features, copy=False)
            cos = cosine_distances(C, sample_bag_features) / 2
            cos = np.diag(cos)
            B_idx = np.argwhere(cos > arg.v_cos)
            B_idx = B_idx.reshape((B_idx.shape[0], ))
            y_Bag[s_idx, B_idx] = 1

        # save dataset with maximum bags_labels
        save_data(data=lil_matrix(y_Bag),
                  file_name=arg.file_name + "_B.pkl",
                  save_path=arg.dspath,
                  mode="wb",
                  tag="bags to labels data")
        print("\t>> Done...")

    ##########################################################################################################
    ######################                            TRAIN                             ######################
    ##########################################################################################################

    if arg.train:
        print("\n{0})- Training {1} dataset using reMap model...".format(
            steps, arg.y_name))
        steps = steps + 1

        # load files
        print("\t>> Loading files...")
        y_Bag = load_data(file_name=arg.yB_name, load_path=arg.dspath, tag="B")

        # randomly assign bags
        if arg.random_allocation:
            num_samples = y_Bag.shape[0]
            y_Bag = y_Bag.toarray()
            for bag_idx in np.arange(y_Bag.shape[1]):
                if np.sum(y_Bag[:, bag_idx]) == num_samples:
                    y_Bag[:, bag_idx] = np.random.binomial(
                        1, arg.theta_bern, num_samples)
            y_Bag[y_Bag == 0] = -1
            y_Bag = lil_matrix(y_Bag)
            # save dataset with maximum bags_labels
            save_data(data=lil_matrix(y_Bag),
                      file_name=arg.model_name + "_B.pkl",
                      save_path=arg.dspath,
                      mode="wb",
                      tag="bags to labels data")
        else:
            features = np.load(
                file=os.path.join(arg.dspath, arg.features_name))
            features = features[features.files[0]]
            C = np.load(file=os.path.join(arg.dspath, arg.bag_centroid_name))
            C = C[C.files[0]]
            rho = np.load(file=os.path.join(arg.dspath, arg.rho_name))
            rho = rho[rho.files[0]]
            bags_labels = load_data(file_name=arg.bags_labels,
                                    load_path=arg.dspath,
                                    tag="bags_labels with associated pathways")
            X = load_data(file_name=arg.X_name, load_path=arg.dspath, tag="X")
            y = load_data(file_name=arg.y_name, load_path=arg.dspath, tag="y")
            model = reMap(alpha=arg.alpha,
                          binarize_input_feature=arg.binarize_input_feature,
                          fit_intercept=arg.fit_intercept,
                          decision_threshold=arg.decision_threshold,
                          learning_type=arg.learning_type,
                          lr=arg.lr,
                          lr0=arg.lr0,
                          forgetting_rate=arg.forgetting_rate,
                          delay_factor=arg.delay_factor,
                          max_sampling=arg.max_sampling,
                          subsample_input_size=arg.ssample_input_size,
                          subsample_labels_size=arg.ssample_label_size,
                          cost_subsample_size=arg.calc_subsample_size,
                          min_bags=arg.min_bags,
                          max_bags=arg.max_bags,
                          score_strategy=arg.score_strategy,
                          loss_threshold=arg.loss_threshold,
                          early_stop=arg.early_stop,
                          pi=arg.pi,
                          calc_bag_cost=arg.calc_bag_cost,
                          calc_label_cost=arg.calc_label_cost,
                          calc_total_cost=arg.calc_total_cost,
                          varomega=arg.varomega,
                          varrho=arg.varrho,
                          min_negatives_ratio=arg.min_negatives_ratio,
                          lambdas=arg.lambdas,
                          label_bag_sim=arg.label_bag_sim,
                          label_closeness_sim=arg.label_closeness_sim,
                          corr_bag_sim=arg.corr_bag_sim,
                          corr_label_sim=arg.corr_label_sim,
                          corr_input_sim=arg.corr_input_sim,
                          batch=arg.batch,
                          num_epochs=arg.num_epochs,
                          num_jobs=arg.num_jobs,
                          display_interval=arg.display_interval,
                          shuffle=arg.shuffle,
                          random_state=arg.random_state,
                          log_path=arg.logpath)
            model.fit(X=X,
                      y=y,
                      y_Bag=y_Bag,
                      bags_labels=bags_labels,
                      bags_correlation=rho,
                      label_features=features,
                      centroids=C,
                      model_name=arg.model_name,
                      model_path=arg.mdpath,
                      result_path=arg.rspath,
                      snapshot_history=arg.snapshot_history,
                      display_params=display_params)

    ##########################################################################################################
    ######################                           TRANSFORM                          ######################
    ##########################################################################################################

    if arg.transform:
        print("\n{0})- Predicting dataset using a pre-trained reMap model...".
              format(steps))

        # load files
        print("\t>> Loading files...")
        features = np.load(file=os.path.join(arg.dspath, arg.features_name))
        features = features[features.files[0]]
        C = np.load(file=os.path.join(arg.dspath, arg.bag_centroid_name))
        C = C[C.files[0]]
        rho = np.load(file=os.path.join(arg.dspath, arg.rho_name))
        rho = rho[rho.files[0]]
        bags_labels = load_data(file_name=arg.bags_labels,
                                load_path=arg.dspath,
                                tag="bags_labels with associated pathways")

        # load data
        X = load_data(file_name=arg.X_name, load_path=arg.dspath, tag="X")
        y = load_data(file_name=arg.y_name, load_path=arg.dspath, tag="y")
        model = load_data(file_name=arg.model_name + ".pkl",
                          load_path=arg.mdpath,
                          tag="reMap model")

        print("\t>> Predict bags...")
        y_Bag = model.transform(X=X,
                                y=y,
                                bags_labels=bags_labels,
                                bags_correlation=rho,
                                label_features=features,
                                centroids=C,
                                subsample_labels_size=arg.ssample_label_size,
                                max_sampling=arg.max_sampling,
                                snapshot_history=arg.snapshot_history,
                                decision_threshold=arg.decision_threshold,
                                batch_size=arg.batch,
                                num_jobs=arg.num_jobs,
                                file_name=arg.file_name,
                                result_path=arg.rspath)
        # save dataset with maximum bags_labels
        save_data(data=lil_matrix(y_Bag),
                  file_name=arg.file_name + "_B.pkl",
                  save_path=arg.dspath,
                  mode="wb",
                  tag="bags to labels data")
Example 6
    def fit(self, X, M=None, features=None, model_name='cbt', model_path="../../model", result_path=".",
            display_params: bool = True):

        if X is None:
            raise Exception("Please provide a dataset.")
        assert X.shape[1] == self.num_features
        X = self.__check_non_neg_array(X, "SparseCorrelatedBagPathway.fit")

        if not self.collapse2ctm:
            if features is not None:
                assert X.shape[1] == features.shape[0]
            else:
                features = np.ones((self.num_features, 20))
            features = features / np.linalg.norm(features, axis=1)[:, np.newaxis]

        # collect properties from data
        self.__init_latent_variables()
        num_samples = int(X.shape[0] * self.subsample_input_size)
        list_batches = np.arange(start=0, stop=num_samples, step=self.batch)

        if display_params:
            self.__print_arguments()
            time.sleep(2)

        if not self.collapse2ctm:
            if M is not None:
                assert M.shape == X.shape
                omega = M + self.xi_vec
            else:
                omega = np.zeros((X.shape[0], self.num_features)) + self.xi_vec
            omega = omega / np.sum(omega, axis=1)[:, np.newaxis]

        cost_file_name = model_name + "_cost.txt"
        save_data('', file_name=cost_file_name, save_path=result_path, mode='w', w_string=True, print_tag=False)

        print('\t>> Training by SOAP model...')
        logger.info('\t>> Training by SOAP model...')
        n_epochs = self.num_epochs + 1
        old_bound = np.inf

        timeref = time.time()

        for epoch in np.arange(start=1, stop=n_epochs):
            desc = '\t   {0:d})- Epoch count ({0:d}/{1:d})...'.format(epoch, n_epochs - 1)
            print(desc)
            logger.info(desc)

            learning_rate = np.power((epoch + self.delay_factor), -self.forgetting_rate)

            # Subsample dataset
            idx = np.random.choice(X.shape[0], num_samples, False)
            start_epoch = time.time()

            # E-step
            if not self.collapse2ctm:
                sstats, tmp = self.__batch_e_step(X=X[idx, :], omega=omega[idx, :], features=features,
                                                  list_batches=list_batches)
            else:
                sstats, tmp = self.__batch_e_step(X=X[idx, :], omega=None, features=features,
                                                  list_batches=list_batches)
            del tmp

            # M-step
            self.__m_step(sstats=sstats, learning_rate=learning_rate, num_samples=num_samples)

            end_epoch = time.time()

            self.is_fit = True

            # Compute approx bound
            if not self.collapse2ctm:
                new_bound = self.perplexity(X=X[idx, :], M=omega[idx, :], features=features, sstats=sstats)
            else:
                new_bound = self.perplexity(X=X[idx, :], M=M, features=features, sstats=sstats)

            print('\t\t## Epoch {0} took {1} seconds...'.format(epoch, round(end_epoch - start_epoch, 3)))
            logger.info('\t\t## Epoch {0} took {1} seconds...'.format(epoch, round(end_epoch - start_epoch, 3)))
            data = str(epoch) + '\t' + str(round(end_epoch - start_epoch, 3)) + '\t' + str(new_bound) + '\n'
            save_data(data=data, file_name=cost_file_name, save_path=result_path, mode='a', w_string=True,
                      print_tag=False)
            # Save model parameters at the display interval (and on the first and last epoch)
            if (epoch % self.display_interval) == 0 or epoch == 1 or epoch == n_epochs - 1:
                print('\t\t  --> New cost: {0:.4f}; Old cost: {1:.4f}'.format(new_bound, old_bound))
                logger.info('\t\t  --> New cost: {0:.4f}; Old cost: {1:.4f}'.format(new_bound, old_bound))
                if new_bound <= old_bound or epoch == n_epochs - 1:
                    phi_file_name = model_name + '_exp_phi.npz'
                    sigma_file_name = model_name + '_sigma.npz'
                    mu_file_name = model_name + '_mu.npz'
                    model_file_name = model_name + '.pkl'
                    if epoch == n_epochs - 1:
                        phi_file_name = model_name + '_exp_phi_final.npz'
                        sigma_file_name = model_name + '_sigma_final.npz'
                        mu_file_name = model_name + '_mu_final.npz'
                        model_file_name = model_name + '_final.pkl'

                    print('\t\t  --> Storing the SOAP phi to: {0:s}'.format(phi_file_name))
                    logger.info('\t\t  --> Storing the SOAP phi to: {0:s}'.format(phi_file_name))
                    np.savez(os.path.join(model_path, phi_file_name), self.phi)

                    print('\t\t  --> Storing the SOAP sigma to: {0:s}'.format(sigma_file_name))
                    logger.info('\t\t  --> Storing the SOAP sigma to: {0:s}'.format(sigma_file_name))
                    np.savez(os.path.join(model_path, sigma_file_name), self.sigma)

                    print('\t\t  --> Storing the SOAP mu to: {0:s}'.format(mu_file_name))
                    logger.info('\t\t  --> Storing the SOAP mu to: {0:s}'.format(mu_file_name))
                    np.savez(os.path.join(model_path, mu_file_name), self.mu)

                    print('\t\t  --> Storing the SOAP model to: {0:s}'.format(model_file_name))
                    logger.info('\t\t  --> Storing the SOAP model to: {0:s}'.format(model_file_name))
                    save_data(data=copy.copy(self), file_name=model_file_name, save_path=model_path, mode="wb",
                              print_tag=False)
                    old_bound = new_bound
        print('\t  --> Training consumed %.2f minutes' % (round((time.time() - timeref) / 60., 3)))
        logger.info('\t  --> Training consumed %.2f minutes' % (round((time.time() - timeref) / 60., 3)))
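
The per-epoch step size above follows the usual decaying schedule learning_rate = (epoch + delay_factor) ** (-forgetting_rate); a quick look at the first few values with hypothetical parameters:

import numpy as np

delay_factor, forgetting_rate = 1.0, 0.9
for epoch in np.arange(start=1, stop=6):
    learning_rate = np.power(epoch + delay_factor, -forgetting_rate)
    print(int(epoch), round(float(learning_rate), 4))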
Example 7
File: ctm.py Project: hallamlab/cbt
    def fit(self,
            X,
            model_name='CTM',
            model_path="../../model",
            result_path=".",
            display_params: bool = True):
        if X is None:
            raise Exception("Please provide a dataset.")
        assert X.shape[1] == self.num_features

        X = self.__check_non_neg_array(X, "CorrelatedTopicModel.fit")

        # collect properties from data
        self.__init_latent_variables()
        num_samples = int(X.shape[0] * self.subsample_input_size)
        list_batches = np.arange(start=0, stop=num_samples, step=self.batch)

        if display_params:
            self.__print_arguments()
            time.sleep(2)

        cost_file_name = model_name + "_cost.txt"
        save_data('',
                  file_name=cost_file_name,
                  save_path=result_path,
                  mode='w',
                  w_string=True,
                  print_tag=False)

        print('\t>> Training by CTM model...')
        logger.info('\t>> Training by CTM model...')
        n_epochs = self.num_epochs + 1
        old_bound = np.inf

        timeref = time.time()

        for epoch in np.arange(start=1, stop=n_epochs):
            desc = '\t   {0:d})- Epoch count ({0:d}/{1:d})...'.format(
                epoch, n_epochs - 1)
            print(desc)
            logger.info(desc)

            learning_rate = np.power((epoch + self.delay_factor),
                                     -self.forgetting_rate)

            # Subsample dataset
            idx = np.random.choice(X.shape[0], num_samples, False)
            start_epoch = time.time()

            # E-step
            sstats, tmp = self.__batch_e_step(X=X[idx, :],
                                              list_batches=list_batches)
            del tmp

            # M-step
            self.__m_step(sstats=sstats,
                          learning_rate=learning_rate,
                          num_samples=num_samples)

            end_epoch = time.time()

            self.is_fit = True

            # Compute predictive perplexity
            new_bound = self.perplexity(X=X[idx, :],
                                        sstats=sstats["phi_sstats"])

            print('\t\t## Epoch {0} took {1} seconds...'.format(
                epoch, round(end_epoch - start_epoch, 3)))
            logger.info('\t\t## Epoch {0} took {1} seconds...'.format(
                epoch, round(end_epoch - start_epoch, 3)))
            data = str(epoch) + '\t' + str(round(
                end_epoch - start_epoch, 3)) + '\t' + str(new_bound) + '\n'
            save_data(data=data,
                      file_name=cost_file_name,
                      save_path=result_path,
                      mode='a',
                      w_string=True,
                      print_tag=False)
            # Save model parameters at the display interval (and on the first and last epoch)
            if (epoch % self.display_interval
                ) == 0 or epoch == 1 or epoch == n_epochs - 1:
                print('\t\t  --> New cost: {0:.4f}; Old cost: {1:.4f}'.format(
                    new_bound, old_bound))
                logger.info(
                    '\t\t  --> New cost: {0:.4f}; Old cost: {1:.4f}'.format(
                        new_bound, old_bound))

                if new_bound <= old_bound or epoch == n_epochs - 1:
                    omega_file_name = model_name + '_exp_omega.npz'
                    sigma_file_name = model_name + '_sigma.npz'
                    mu_file_name = model_name + '_mu.npz'
                    model_file_name = model_name + '.pkl'
                    if epoch == n_epochs - 1:
                        omega_file_name = model_name + '_exp_omega_final.npz'
                        sigma_file_name = model_name + '_sigma_final.npz'
                        mu_file_name = model_name + '_mu_final.npz'
                        model_file_name = model_name + '_final.pkl'

                    print('\t\t  --> Storing the CTM omega to: {0:s}'.format(
                        omega_file_name))
                    logger.info(
                        '\t\t  --> Storing the CTM omega to: {0:s}'.format(
                            omega_file_name))
                    np.savez(os.path.join(model_path, omega_file_name),
                             self.omega)

                    print('\t\t  --> Storing the CTM sigma to: {0:s}'.format(
                        sigma_file_name))
                    logger.info(
                        '\t\t  --> Storing the CTM sigma to: {0:s}'.format(
                            sigma_file_name))
                    np.savez(os.path.join(model_path, sigma_file_name),
                             self.sigma)

                    print('\t\t  --> Storing the CTM mu to: {0:s}'.format(
                        mu_file_name))
                    logger.info(
                        '\t\t  --> Storing the CTM mu to: {0:s}'.format(
                            mu_file_name))
                    np.savez(os.path.join(model_path, mu_file_name), self.mu)

                    print('\t\t  --> Storing the CTM model to: {0:s}'.format(
                        model_file_name))
                    logger.info(
                        '\t\t  --> Storing the CTM model to: {0:s}'.format(
                            model_file_name))
                    save_data(data=copy.copy(self),
                              file_name=model_file_name,
                              save_path=model_path,
                              mode="wb",
                              print_tag=False)
                    old_bound = new_bound
        print('\t  --> Training consumed %.2f minutes' % (round(
            (time.time() - timeref) / 60., 3)))
        logger.info('\t  --> Training consumed %.2f minutes' % (round(
            (time.time() - timeref) / 60., 3)))
Example 8
def __train(arg):
    # Setup the number of operations to employ
    steps = 1
    # Whether to display parameters at every operation
    display_params = True

    if arg.preprocess_dataset:
        print('\n{0})- Preprocessing dataset...'.format(steps))
        steps = steps + 1

        print('\t>> Loading files...')
        # load a biocyc file
        data_object = load_data(file_name=arg.object_name, load_path=arg.ospath, tag='the biocyc object')
        # extract pathway ids
        pathway_dict = data_object["pathway_id"]
        ec_dict = data_object["ec_id"]
        del data_object

        # load a hin file
        hin = load_data(file_name=arg.hin_name, load_path=arg.ospath,
                        tag='heterogeneous information network')
        # get path2vec mapping
        node2idx_path2vec = dict((node[0], node[1]['mapped_idx'])
                                 for node in hin.nodes(data=True))
        # get pathway2ec mapping
        node2idx_pathway2ec = [node[0] for node in hin.nodes(data=True)]
        Adj = nx.adjacency_matrix(G=hin)
        del hin

        # load pathway2ec mapping matrix
        pathway2ec_idx = load_data(file_name=arg.pathway2ec_idx_name, load_path=arg.ospath)
        path2vec_features = np.load(file=os.path.join(arg.mdpath, arg.features_name))

        # extracting pathway and ec features
        labels_components = load_data(file_name=arg.pathway2ec_name, load_path=arg.ospath, tag='M')
        path2vec_features = path2vec_features[path2vec_features.files[0]]
        pathways_idx = np.array([node2idx_path2vec[v] for v, idx in pathway_dict.items()
                                 if v in node2idx_path2vec])
        P = path2vec_features[pathways_idx, :]
        tmp = [idx for v, idx in ec_dict.items() if v in node2idx_pathway2ec]
        ec_idx = np.array([idx for idx in tmp if len(np.where(pathway2ec_idx == idx)[0]) > 0])
        E = path2vec_features[ec_idx, :]

        # constrain the feature space to [0, 1] to avoid negative values
        min_rho = np.min(P)
        max_rho = np.max(P)
        P = P - min_rho
        P = P / (max_rho - min_rho)
        P = P / np.linalg.norm(P, axis=1)[:, np.newaxis]
        min_rho = np.min(E)
        max_rho = np.max(E)
        E = E - min_rho
        E = E / (max_rho - min_rho)
        E = E / np.linalg.norm(E, axis=1)[:, np.newaxis]

        # building A and B matrices
        Adj.setdiag(0)
        A = Adj[pathways_idx[:, None], pathways_idx]
        A = A / A.sum(1)
        A = np.nan_to_num(A)
        B = Adj[ec_idx[:, None], ec_idx]
        B = B / B.sum(1)
        B = np.nan_to_num(B)

        ## train size
        if arg.ssample_input_size < 1:
            # add white noise to M
            train_size = labels_components.shape[0] * arg.ssample_input_size
            idx = np.random.choice(a=np.arange(labels_components.shape[0]), size=int(train_size), replace=False)
            labels_components = labels_components.toarray()
            labels_components[idx] = np.zeros((idx.shape[0], labels_components.shape[1]))
        if arg.white_links:
            if arg.ssample_input_size < 1:
                # add white noise to A
                train_size = A.shape[0] * arg.ssample_input_size
                idx = np.random.choice(a=np.arange(A.shape[0]), size=int(train_size), replace=False)
                A = lil_matrix(A).toarray()
                tmp = np.zeros((idx.shape[0], A.shape[0]))
                A[idx] = tmp
                A[:, idx] = tmp.T
                # add white noise to B
                train_size = B.shape[0] * arg.ssample_input_size
                idx = np.random.choice(a=np.arange(B.shape[0]), size=int(train_size), replace=False)
                B = lil_matrix(B).toarray()
                tmp = np.zeros((idx.shape[0], B.shape[0]))
                B[idx] = tmp
                B[:, idx] = tmp.T

        # save files
        print('\t>> Saving files...')
        save_data(data=lil_matrix(labels_components), file_name=arg.M_name, save_path=arg.dspath, tag="M", mode="wb")
        save_data(data=lil_matrix(P), file_name=arg.P_name, save_path=arg.dspath, tag="P", mode="wb")
        save_data(data=lil_matrix(E), file_name=arg.E_name, save_path=arg.dspath, tag="E", mode="wb")
        save_data(data=lil_matrix(A), file_name=arg.A_name, save_path=arg.dspath, tag="A", mode="wb")
        save_data(data=lil_matrix(B), file_name=arg.B_name, save_path=arg.dspath, tag="B", mode="wb")
        print('\t>> Done...')

    ##########################################################################################################
    ######################                     TRAIN USING triUMPF                      ######################
    ##########################################################################################################

    if arg.train:
        print('\n{0})- Training {1} dataset using triUMPF model...'.format(steps, arg.y_name))
        steps = steps + 1

        # load files
        print('\t>> Loading files...')
        labels_components, W, H, P, E, A, B, X, y = None, None, None, None, None, None, None, None, None

        if arg.no_decomposition:
            W = load_data(file_name=arg.W_name, load_path=arg.mdpath, tag='W')
            H = load_data(file_name=arg.H_name, load_path=arg.mdpath, tag='H')
        else:
            labels_components = load_data(file_name=arg.M_name, load_path=arg.dspath, tag='M')
        if arg.fit_features:
            P = load_data(file_name=arg.P_name, load_path=arg.dspath, tag='P')
            E = load_data(file_name=arg.E_name, load_path=arg.dspath, tag='E')
        if arg.fit_comm:
            if not arg.fit_features:
                P = load_data(file_name=arg.P_name, load_path=arg.dspath, tag='P')
                E = load_data(file_name=arg.E_name, load_path=arg.dspath, tag='E')
            X = load_data(file_name=arg.X_name, load_path=arg.dspath, tag='X')
            y = load_data(file_name=arg.y_name, load_path=arg.dspath, tag='y')
            A = load_data(file_name=arg.A_name, load_path=arg.dspath, tag='A')
            B = load_data(file_name=arg.B_name, load_path=arg.dspath, tag='B')

        model = triUMPF(num_components=arg.num_components, num_communities_p=arg.num_communities_p,
                        num_communities_e=arg.num_communities_e, proxy_order_p=arg.proxy_order_p,
                        proxy_order_e=arg.proxy_order_e, mu_omega=arg.mu_omega, mu_gamma=arg.mu_gamma,
                        fit_features=arg.fit_features, fit_comm=arg.fit_comm, fit_pure_comm=arg.fit_pure_comm,
                        normalize_input_feature=arg.normalize_input_feature,
                        binarize_input_feature=arg.binarize_input_feature,
                        use_external_features=arg.use_external_features, cutting_point=arg.cutting_point,
                        fit_intercept=arg.fit_intercept, alpha=arg.alpha, beta=arg.beta, rho=arg.rho,
                        lambdas=arg.lambdas, eps=arg.eps, early_stop=arg.early_stop, penalty=arg.penalty,
                        alpha_elastic=arg.alpha_elastic, l1_ratio=arg.l1_ratio, loss_threshold=arg.loss_threshold,
                        decision_threshold=arg.decision_threshold, subsample_input_size=arg.ssample_input_size,
                        subsample_labels_size=arg.ssample_label_size, learning_type=arg.learning_type, lr=arg.lr,
                        lr0=arg.lr0, delay_factor=arg.delay_factor, forgetting_rate=arg.forgetting_rate,
                        batch=arg.batch, max_inner_iter=arg.max_inner_iter, num_epochs=arg.num_epochs,
                        num_jobs=arg.num_jobs, display_interval=arg.display_interval, shuffle=arg.shuffle,
                        random_state=arg.random_state, log_path=arg.logpath)
        model.fit(M=labels_components, W=W, H=H, X=X, y=y, P=P, E=E, A=A, B=B, model_name=arg.model_name,
                  model_path=arg.mdpath, result_path=arg.rspath, display_params=display_params)

    ##########################################################################################################
    ######################                    PREDICT USING triUMPF                     ######################
    ##########################################################################################################

    if arg.predict:
        print('\n{0})- Predicting using a pre-trained triUMPF model...'.format(steps))
        if arg.pathway_report:
            print('\t>> Loading biocyc object...')
            # load a biocyc file
            data_object = load_data(file_name=arg.object_name, load_path=arg.ospath, tag='the biocyc object',
                                    print_tag=False)
            pathway_dict = data_object["pathway_id"]
            pathway_common_names = dict((pidx, data_object['processed_kb']['metacyc'][5][pid][0][1])
                                        for pid, pidx in pathway_dict.items()
                                        if pid in data_object['processed_kb']['metacyc'][5])
            ec_dict = data_object['ec_id']
            del data_object
            pathway_dict = dict((idx, id) for id, idx in pathway_dict.items())
            ec_dict = dict((idx, id) for id, idx in ec_dict.items())
            labels_components = load_data(file_name=arg.pathway2ec_name, load_path=arg.ospath, tag='M')
            print('\t>> Loading label to component mapping file object...')
            pathway2ec_idx = load_data(file_name=arg.pathway2ec_idx_name, load_path=arg.ospath, print_tag=False)
            pathway2ec_idx = list(pathway2ec_idx)
            tmp = list(ec_dict.keys())
            ec_dict = dict((idx, ec_dict[tmp.index(ec)]) for idx, ec in enumerate(pathway2ec_idx))
            if arg.extract_pf:
                X, sample_ids = parse_files(ec_dict=ec_dict, input_folder=arg.dsfolder, rsfolder=arg.rsfolder,
                                            rspath=arg.rspath, num_jobs=arg.num_jobs)
                print('\t>> Storing X and sample_ids...')
                save_data(data=X, file_name=arg.file_name + '_X.pkl', save_path=arg.dspath,
                          tag='the pf dataset (X)', mode='w+b', print_tag=False)
                save_data(data=sample_ids, file_name=arg.file_name + '_ids.pkl', save_path=arg.dspath,
                          tag='samples ids', mode='w+b', print_tag=False)
                if arg.build_features:
                    # load a hin file
                    print('\t>> Loading heterogeneous information network file...')
                    hin = load_data(file_name=arg.hin_name, load_path=arg.ospath,
                                    tag='heterogeneous information network',
                                    print_tag=False)
                    # get pathway2ec mapping
                    node2idx_pathway2ec = [node[0] for node in hin.nodes(data=True)]
                    del hin
                    print('\t>> Loading path2vec_features file...')
                    path2vec_features = np.load(file=os.path.join(arg.mdpath, arg.features_name))
                    __build_features(X=X, pathwat_dict=pathway_dict, ec_dict=ec_dict,
                                     labels_components=labels_components,
                                     node2idx_pathway2ec=node2idx_pathway2ec,
                                     path2vec_features=path2vec_features,
                                     file_name=arg.file_name, dspath=arg.dspath,
                                     batch_size=arg.batch, num_jobs=arg.num_jobs)
        # load files
        print('\t>> Loading necessary files...')
        X = load_data(file_name=arg.X_name, load_path=arg.dspath, tag="X")
        sample_ids = np.arange(X.shape[0])
        if arg.samples_ids in os.listdir(arg.dspath):
            sample_ids = load_data(file_name=arg.samples_ids, load_path=arg.dspath, tag="samples ids")

        # load model
        model = load_data(file_name=arg.model_name + '.pkl', load_path=arg.mdpath, tag='triUMPF model')

        # predict
        y_pred = model.predict(X=X.toarray(), estimate_prob=False, apply_t_criterion=arg.apply_tcriterion,
                               adaptive_beta=arg.adaptive_beta, decision_threshold=arg.decision_threshold,
                               top_k=arg.top_k, batch_size=arg.batch, num_jobs=arg.num_jobs)
        # labels prediction score
        y_pred_score = model.predict(X=X.toarray(), estimate_prob=True, apply_t_criterion=arg.apply_tcriterion,
                                     adaptive_beta=arg.adaptive_beta, decision_threshold=arg.decision_threshold,
                                     top_k=arg.top_k, batch_size=arg.batch, num_jobs=arg.num_jobs)

        if arg.pathway_report:
            print('\t>> Synthesizing pathway reports...')
            synthesize_report(X=X[:, :arg.cutting_point], sample_ids=sample_ids,
                              y_pred=y_pred, y_dict_ids=pathway_dict, y_common_name=pathway_common_names,
                              component_dict=ec_dict, labels_components=labels_components, y_pred_score=y_pred_score,
                              batch_size=arg.batch, num_jobs=arg.num_jobs, rsfolder=arg.rsfolder, rspath=arg.rspath,
                              dspath=arg.dspath, file_name=arg.file_name + '_triumpf')
        else:
            print('\t>> Storing predictions (label index) to: {0:s}'.format(arg.file_name + '_triumpf_y.pkl'))
            save_data(data=y_pred, file_name=arg.file_name + "_triumpf_y.pkl", save_path=arg.dspath,
                      mode="wb", print_tag=False)
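A minimal sketch of how the stored predictions above might be consumed afterwards. It assumes save_data pickled y_pred in binary mode and that pathway_dict maps column indices to pathway identifiers, as in the code above; load_predicted_pathways is a hypothetical helper, not part of triUMPF.

import os
import pickle


def load_predicted_pathways(dspath, file_name, pathway_dict):
    # Read the pickled (num_samples, num_labels) prediction matrix back in.
    with open(os.path.join(dspath, file_name + "_triumpf_y.pkl"), "rb") as f_in:
        y_pred = pickle.load(f_in)
    # Accept either a dense array or a scipy sparse matrix.
    y_pred = y_pred.toarray() if hasattr(y_pred, "toarray") else y_pred
    # Map the positive columns of each row to pathway identifiers.
    return [[pathway_dict[idx] for idx in row.nonzero()[0]] for row in y_pred]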
Example no. 9
def __build_features(X, pathwat_dict, ec_dict, labels_components, node2idx_pathway2ec, path2vec_features, file_name,
                     dspath, batch_size=100, num_jobs=1):
    tmp = lil_matrix.copy(X)
    print('\t>> Building abundance and coverage features...')
    list_batches = np.arange(start=0, stop=tmp.shape[0], step=batch_size)
    total_progress = len(list_batches) * len(pathwat_dict.keys())
    parallel = Parallel(n_jobs=num_jobs, verbose=0)
    results = parallel(delayed(compute_abd_cov)(tmp[batch:batch + batch_size],
                                                labels_components, pathwat_dict,
                                                None, batch_idx, total_progress)
                       for batch_idx, batch in enumerate(list_batches))
    desc = '\t\t--> Building {0:.4f}%...'.format(100)
    print(desc)
    abd, cov = zip(*results)
    abd = np.vstack(abd)
    cov = np.vstack(cov)
    del results
    abd = preprocessing.normalize(abd)
    print('\t>> Use pathway2vec EC features...')
    path2vec_features = path2vec_features[path2vec_features.files[0]]
    path2vec_features = path2vec_features / np.linalg.norm(path2vec_features, axis=1)[:, np.newaxis]
    ec_features = [idx for idx, v in ec_dict.items() if v in node2idx_pathway2ec]
    path2vec_features = path2vec_features[ec_features, :]
    ec_features = [np.mean(path2vec_features[row.rows[0]] * np.array(row.data[0])[:, None], axis=0)
                   for idx, row in enumerate(X)]
    save_data(data=lil_matrix(ec_features), file_name=file_name + "_Xp.pkl", save_path=dspath, mode="wb",
              tag="transformed instances to ec features")
    X = lil_matrix(hstack((tmp, ec_features)))
    save_data(data=X, file_name=file_name + "_Xe.pkl", save_path=dspath, mode="wb",
              tag="concatenated ec features with instances")
    X = lil_matrix(hstack((tmp, abd)))
    save_data(data=X, file_name=file_name + "_Xa.pkl", save_path=dspath, mode="wb",
              tag="concatenated abundance features with instances")
    X = lil_matrix(hstack((tmp, cov)))
    save_data(data=X, file_name=file_name + "_Xc.pkl", save_path=dspath, mode="wb",
              tag="concatenated coverage features with instances")
    X = lil_matrix(hstack((tmp, ec_features)))
    X = lil_matrix(hstack((X, abd)))
    save_data(data=X, file_name=file_name + "_Xea.pkl", save_path=dspath, mode="wb",
              tag="concatenated ec and abundance features with instances")
    X = lil_matrix(hstack((tmp, ec_features)))
    X = lil_matrix(hstack((X, cov)))
    save_data(data=X, file_name=file_name + "_Xec.pkl", save_path=dspath, mode="wb",
              tag="concatenated ec and coverage features with instances")
    X = lil_matrix(hstack((tmp, ec_features)))
    X = lil_matrix(hstack((X, abd)))
    X = lil_matrix(hstack((X, cov)))
    save_data(data=X, file_name=file_name + "_Xm.pkl", save_path=dspath, mode="wb",
              tag="concatenated ec, abundance, and coverage features features with instances")
Example no. 10
def score(y_true,
          y_pred,
          item_lst,
          six_db=False,
          A=1,
          B=1,
          C=1,
          top_k=150,
          mode='a',
          file_name='results.txt',
          save_path=''):
    idx_lst = [1]
    if six_db:
        item_lst = [
            'AraCyc', 'EcoCyc', 'HumanCyc', 'LeishCyc', 'TrypanoCyc',
            'YeastCyc'
        ]
        if y_true.shape[0] == 4:
            item_lst = ['AraCyc', 'EcoCyc', 'HumanCyc', 'YeastCyc']
        idx_lst = [idx for idx in np.arange(len(item_lst))]
    print('\t>> Scores are saved to {0:s}...'.format(str(file_name)))
    for i, idx in enumerate(idx_lst):
        y = y_true
        y_hat = y_pred
        if six_db:
            y = y_true[idx]
            y_hat = y_pred[idx]
            y = y.reshape((1, y.shape[0]))
            y_hat = np.reshape(y_hat, (1, len(y_hat)))
            save_data(data='*** Scores for {0:s}...\n'.format(str(
                item_lst[i])),
                      file_name=file_name,
                      save_path=save_path,
                      mode=mode,
                      w_string=True,
                      print_tag=False)
        else:
            save_data(data='*** Scores for {0:s}...\n'.format(item_lst[i]),
                      file_name=file_name,
                      save_path=save_path,
                      mode='w',
                      w_string=True,
                      print_tag=False)
        ce_samples = coverage_error(y, y_hat)
        save_data(
            data='\t\t1)- Coverage error score: {0:.4f}\n'.format(ce_samples),
            file_name=file_name,
            save_path=save_path,
            mode=mode,
            w_string=True,
            print_tag=False)

        lrl_samples = label_ranking_loss(y, y_hat)
        save_data(
            data='\t\t2)- Ranking loss score: {0:.4f}\n'.format(lrl_samples),
            file_name=file_name,
            save_path=save_path,
            mode=mode,
            w_string=True,
            print_tag=False)

        lrap_samples = label_ranking_average_precision_score(y, y_hat)
        save_data(
            data='\t\t3)- Label ranking average precision score: {0:.4f}\n'.
            format(lrap_samples),
            file_name=file_name,
            save_path=save_path,
            mode=mode,
            w_string=True,
            print_tag=False)

        if not np.array_equal(y_pred, y_pred.astype(bool)):
            top_k = y_true.shape[1] if top_k > y_true.shape[1] else top_k
            psp_samples = psp(y_prob=y_hat,
                              y_true=y,
                              A=A,
                              B=B,
                              C=C,
                              top_k=top_k)
            save_data(
                data='\t\t4)- Propensity Scored Precision at {0}: {1:.4f}\n'.
                format(top_k, psp_samples),
                file_name=file_name,
                save_path=save_path,
                mode=mode,
                w_string=True,
                print_tag=False)

            ndcg_samples = psndcg(y_prob=y_hat,
                                  y_true=y,
                                  A=A,
                                  B=B,
                                  C=C,
                                  top_k=top_k)
            save_data(
                data='\t\t5)- Propensity Scored nDCG at {0}: {1:.4f}\n'.format(
                    top_k, ndcg_samples),
                file_name=file_name,
                save_path=save_path,
                mode=mode,
                w_string=True,
                print_tag=False)
            continue

        hl_samples = hamming_loss(y, y_hat)
        save_data(
            data='\t\t4)- Hamming-Loss score: {0:.4f}\n'.format(hl_samples),
            file_name=file_name,
            save_path=save_path,
            mode=mode,
            w_string=True,
            print_tag=False)

        pr_samples_average = precision_score(y, y_hat, average='samples')
        pr_samples_micro = precision_score(y, y_hat, average='micro')
        pr_samples_macro = precision_score(y, y_hat, average='macro')
        save_data(data='\t\t5)- Precision...\n',
                  file_name=file_name,
                  save_path=save_path,
                  mode=mode,
                  w_string=True,
                  print_tag=False)
        save_data(data='\t\t\t--> Average sample precision: {0:.4f}\n'.format(
            pr_samples_average),
                  file_name=file_name,
                  save_path=save_path,
                  mode=mode,
                  w_string=True,
                  print_tag=False)
        save_data(data='\t\t\t--> Micro precision: {0:.4f}\n'.format(
            pr_samples_micro),
                  file_name=file_name,
                  save_path=save_path,
                  mode=mode,
                  w_string=True,
                  print_tag=False)
        save_data(data='\t\t\t--> Macro precision: {0:.4f}\n'.format(
            pr_samples_macro),
                  file_name=file_name,
                  save_path=save_path,
                  mode=mode,
                  w_string=True,
                  print_tag=False)

        rc_samples_average = recall_score(y, y_hat, average='samples')
        rc_samples_micro = recall_score(y, y_hat, average='micro')
        rc_samples_macro = recall_score(y, y_hat, average='macro')
        save_data(data='\t\t6)- Recall...\n',
                  file_name=file_name,
                  save_path=save_path,
                  mode=mode,
                  w_string=True,
                  print_tag=False)
        save_data(data='\t\t\t--> Average sample recall: {0:.4f}\n'.format(
            rc_samples_average),
                  file_name=file_name,
                  save_path=save_path,
                  mode=mode,
                  w_string=True,
                  print_tag=False)
        save_data(
            data='\t\t\t--> Micro recall: {0:.4f}\n'.format(rc_samples_micro),
            file_name=file_name,
            save_path=save_path,
            mode=mode,
            w_string=True,
            print_tag=False)
        save_data(
            data='\t\t\t--> Macro recall: {0:.4f}\n'.format(rc_samples_macro),
            file_name=file_name,
            save_path=save_path,
            mode=mode,
            w_string=True,
            print_tag=False)

        f1_samples_average = f1_score(y, y_hat, average='samples')
        f1_samples_micro = f1_score(y, y_hat, average='micro')
        f1_samples_macro = f1_score(y, y_hat, average='macro')
        save_data(data='\t\t7)- F1-score...\n',
                  file_name=file_name,
                  save_path=save_path,
                  mode=mode,
                  w_string=True,
                  print_tag=False)
        save_data(data='\t\t\t--> Average sample f1-score: {0:.4f}\n'.format(
            f1_samples_average),
                  file_name=file_name,
                  save_path=save_path,
                  mode=mode,
                  w_string=True,
                  print_tag=False)
        save_data(data='\t\t\t--> Micro f1-score: {0:.4f}\n'.format(
            f1_samples_micro),
                  file_name=file_name,
                  save_path=save_path,
                  mode=mode,
                  w_string=True,
                  print_tag=False)
        save_data(data='\t\t\t--> Macro f1-score: {0:.4f}\n'.format(
            f1_samples_macro),
                  file_name=file_name,
                  save_path=save_path,
                  mode=mode,
                  w_string=True,
                  print_tag=False)

        js_score_samples = jaccard_score(y, y_hat, average='samples')
        js_score_micro = jaccard_score(y, y_hat, average='micro')
        js_score_macro = jaccard_score(y, y_hat, average='macro')
        js_score_weighted = jaccard_score(y, y_hat, average='weighted')
        save_data(data='\t\t8)- Jaccard score...\n',
                  file_name=file_name,
                  save_path=save_path,
                  mode=mode,
                  w_string=True,
                  print_tag=False)
        save_data(data='\t\t\t--> Jaccard score (samples): {0:.4f}\n'.format(
            js_score_samples),
                  file_name=file_name,
                  save_path=save_path,
                  mode=mode,
                  w_string=True,
                  print_tag=False)
        save_data(data='\t\t\t--> Jaccard score (micro): {0:.4f}\n'.format(
            js_score_micro),
                  file_name=file_name,
                  save_path=save_path,
                  mode=mode,
                  w_string=True,
                  print_tag=False)
        save_data(data='\t\t\t--> Jaccard score (macro): {0:.4f}\n'.format(
            js_score_macro),
                  file_name=file_name,
                  save_path=save_path,
                  mode=mode,
                  w_string=True,
                  print_tag=False)
        save_data(data='\t\t\t--> Jaccard score (weighted): {0:.4f}\n'.format(
            js_score_weighted),
                  file_name=file_name,
                  save_path=save_path,
                  mode=mode,
                  w_string=True,
                  print_tag=False)

        tn, fp, fn, tp = confusion_matrix(y.flatten(), y_hat.flatten()).ravel()
        save_data(data='\t\t9)- Confusion matrix...\n',
                  file_name=file_name,
                  save_path=save_path,
                  mode=mode,
                  w_string=True,
                  print_tag=False)
        save_data(data='\t\t\t--> True positive: {0}\n'.format(tp),
                  file_name=file_name,
                  save_path=save_path,
                  mode=mode,
                  w_string=True,
                  print_tag=False)
        save_data(data='\t\t\t--> True negative: {0}\n'.format(tn),
                  file_name=file_name,
                  save_path=save_path,
                  mode=mode,
                  w_string=True,
                  print_tag=False)
        save_data(data='\t\t\t--> False positive: {0}\n'.format(fp),
                  file_name=file_name,
                  save_path=save_path,
                  mode=mode,
                  w_string=True,
                  print_tag=False)
        save_data(data='\t\t\t--> False negative: {0}\n'.format(fn),
                  file_name=file_name,
                  save_path=save_path,
                  mode=mode,
                  w_string=True,
                  print_tag=False)
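A minimal, hypothetical call to score(): two random binary multi-label matrices of shape (num_samples, num_labels) scored against each other, assuming the metric imports used inside score (coverage_error, hamming_loss, precision_score, etc.) and the save_data helper are available as in the original module. Results land in toy_results.txt under the current directory.

import numpy as np

y_true_toy = np.random.randint(0, 2, size=(8, 20))
y_pred_toy = np.random.randint(0, 2, size=(8, 20))
score(y_true=y_true_toy, y_pred=y_pred_toy, item_lst=['toy_dataset'], six_db=False,
      top_k=10, mode='a', file_name='toy_results.txt', save_path='.')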
Example no. 11
def __train(arg):
    # Setup the number of operations to employ
    steps = 1
    # Whether to display parameters at every operation
    display_params = True

    ##########################################################################################################
    ######################                  PREPROCESSING DATASET                       ######################
    ##########################################################################################################

    if arg.preprocess_dataset:
        print('\n{0})- Preprocessing dataset...'.format(steps))
        steps = steps + 1
        print('\t>> Loading files...')
        X = load_data(file_name=arg.X_name, load_path=arg.dspath, tag="instances")
        X = X[:, :arg.cutting_point]

        # load a biocyc file
        data_object = load_data(file_name=arg.object_name, load_path=arg.ospath, tag='the biocyc object')
        ec_dict = data_object["ec_id"]
        pathway_dict = data_object["pathway_id"]
        del data_object

        pathway_dict = dict((idx, id) for id, idx in pathway_dict.items())
        ec_dict = dict((idx, id) for id, idx in ec_dict.items())
        labels_components = load_data(file_name=arg.pathway2ec_name, load_path=arg.ospath, tag='M')
        print('\t>> Loading label to component mapping file object...')
        pathway2ec_idx = load_data(file_name=arg.pathway2ec_idx_name, load_path=arg.ospath, print_tag=False)
        pathway2ec_idx = list(pathway2ec_idx)
        tmp = list(ec_dict.keys())
        ec_dict = dict((idx, ec_dict[tmp.index(ec)]) for idx, ec in enumerate(pathway2ec_idx))

        # load path2vec features
        path2vec_features = np.load(file=os.path.join(arg.ospath, arg.features_name))

        # load a hin file
        hin = load_data(file_name=arg.hin_name, load_path=arg.ospath, tag='heterogeneous information network')
        # get pathway2ec mapping
        node2idx_pathway2ec = [node[0] for node in hin.nodes(data=True)]
        del hin

        __build_features(X=X, pathwat_dict=pathway_dict, ec_dict=ec_dict, labels_components=labels_components,
                         node2idx_pathway2ec=node2idx_pathway2ec,
                         path2vec_features=path2vec_features, file_name=arg.file_name, dspath=arg.dspath,
                         batch_size=arg.batch, num_jobs=arg.num_jobs)

    ##########################################################################################################
    ######################                            TRAIN                             ######################
    ##########################################################################################################

    if arg.train:
        print(
            '\n{0})- Training {1} dataset using leADS model...'.format(steps, arg.X_name))
        steps = steps + 1

        # load files
        print('\t>> Loading files...')
        X = load_data(file_name=arg.X_name, load_path=arg.dspath, tag="X")
        y = load_data(file_name=arg.y_name, load_path=arg.dspath, tag="y")
        y_Bags = None
        bags_labels = None
        label_features = None
        centroids = None

        if not arg.train_labels:
            y_Bags = load_data(file_name=arg.yB_name, load_path=arg.dspath, tag="B")
            bags_labels = load_data(file_name=arg.bags_labels, load_path=arg.ospath,
                                    tag="bags_labels with associated pathways")
            label_features = load_data(file_name=arg.features_name, load_path=arg.ospath, tag="features")
            centroids = np.load(file=os.path.join(arg.ospath, arg.centroids))
            centroids = centroids[centroids.files[0]]

        A = None
        if arg.fuse_weight:
            A = load_item_features(file_name=os.path.join(arg.ospath, arg.similarity_name), use_components=False)
        if arg.train_selected_sample:
            if os.path.exists(os.path.join(arg.rspath, arg.samples_ids)):
                sample_ids = load_data(file_name=arg.samples_ids, load_path=arg.rspath, tag="selected samples")
                sample_ids = np.array(sample_ids)
                X = X[sample_ids, :]
                y = y[sample_ids, :]
                if not arg.train_labels:
                    y_Bags = y_Bags[sample_ids, :]
            else:
                print('\t\t No sample ids file was provided...')

        model = leADS(alpha=arg.alpha, binarize_input_feature=arg.binarize_input_feature,
                      normalize_input_feature=arg.normalize_input_feature,
                      use_external_features=arg.use_external_features,
                      cutting_point=arg.cutting_point, fit_intercept=arg.fit_intercept,
                      decision_threshold=arg.decision_threshold, subsample_input_size=arg.ssample_input_size,
                      subsample_labels_size=arg.ssample_label_size, calc_ads=arg.calc_ads,
                      acquisition_type=arg.acquisition_type, top_k=arg.top_k, ads_percent=arg.ads_percent,
                      advanced_subsampling=arg.advanced_subsampling, tol_labels_iter=arg.tol_labels_iter,
                      cost_subsample_size=arg.calc_subsample_size, calc_label_cost=arg.calc_label_cost,
                      calc_bag_cost=arg.calc_bag_cost, calc_total_cost=arg.calc_total_cost,
                      label_uncertainty_type=arg.label_uncertainty_type, label_bag_sim=arg.label_bag_sim,
                      label_closeness_sim=arg.label_closeness_sim, corr_bag_sim=arg.corr_bag_sim,
                      corr_label_sim=arg.corr_label_sim, corr_input_sim=arg.corr_input_sim, penalty=arg.penalty,
                      alpha_elastic=arg.alpha_elastic, l1_ratio=arg.l1_ratio, sigma=arg.sigma,
                      fuse_weight=arg.fuse_weight, lambdas=arg.lambdas, loss_threshold=arg.loss_threshold,
                      early_stop=arg.early_stop, learning_type=arg.learning_type, lr=arg.lr, lr0=arg.lr0,
                      delay_factor=arg.delay_factor, forgetting_rate=arg.forgetting_rate, num_models=arg.num_models,
                      batch=arg.batch, max_inner_iter=arg.max_inner_iter, num_epochs=arg.num_epochs,
                      num_jobs=arg.num_jobs, display_interval=arg.display_interval, shuffle=arg.shuffle,
                      random_state=arg.random_state, log_path=arg.logpath)
        model.fit(X=X, y=y, y_Bag=y_Bags, bags_labels=bags_labels, label_features=label_features, centroids=centroids,
                  A=A, model_name=arg.model_name, model_path=arg.mdpath, result_path=arg.rspath,
                  display_params=display_params)

    ##########################################################################################################
    ######################                           EVALUATE                           ######################
    ##########################################################################################################

    if arg.evaluate:
        print('\n{0})- Evaluating leADS model...'.format(steps))
        steps = steps + 1

        # load files
        print('\t>> Loading files...')
        X = load_data(file_name=arg.X_name, load_path=arg.dspath, tag="X")
        bags_labels = None
        label_features = None
        centroids = None
        if not arg.pred_bags:
            y = load_data(file_name=arg.y_name, load_path=arg.dspath, tag="y")
        if arg.pred_bags:
            y_Bags = load_data(file_name=arg.yB_name, load_path=arg.dspath, tag="B")

        # load model
        model = load_data(file_name=arg.model_name + '.pkl', load_path=arg.mdpath, tag='leADS')

        if model.learn_bags:
            bags_labels = load_data(file_name=arg.bags_labels, load_path=arg.dspath,
                                    tag="bags_labels with associated pathways")
        if model.label_uncertainty_type == "dependent":
            label_features = load_data(file_name=arg.features_name, load_path=arg.dspath, tag="features")
            centroids = np.load(file=os.path.join(arg.dspath, arg.centroids))
            centroids = centroids[centroids.files[0]]

        # labels prediction score
        y_pred_Bags, y_pred = model.predict(X=X, bags_labels=bags_labels, label_features=label_features,
                                            centroids=centroids,
                                            estimate_prob=arg.estimate_prob, pred_bags=arg.pred_bags,
                                            pred_labels=arg.pred_labels,
                                            build_up=arg.build_up, pref_rank=arg.pref_rank, top_k_rank=arg.top_k_rank,
                                            subsample_labels_size=arg.ssample_label_size, soft_voting=arg.soft_voting,
                                            apply_t_criterion=arg.apply_tcriterion, adaptive_beta=arg.adaptive_beta,
                                            decision_threshold=arg.decision_threshold, batch_size=arg.batch,
                                            num_jobs=arg.num_jobs)

        file_name = arg.file_name + '_scores.txt'
        if arg.pred_bags:
            score(y_true=y_Bags.toarray(), y_pred=y_pred_Bags.toarray(), item_lst=['biocyc_bags'],
                  six_db=False, top_k=arg.top_k, mode='a', file_name=file_name, save_path=arg.rspath)
        if arg.pred_labels:
            if arg.dsname == 'golden':
                score(y_true=y.toarray(), y_pred=y_pred.toarray(), item_lst=[arg.dsname], six_db=True,
                      top_k=arg.top_k, mode='a', file_name=file_name, save_path=arg.rspath)
            else:
                score(y_true=y.toarray(), y_pred=y_pred.toarray(), item_lst=[arg.dsname], six_db=False,
                      top_k=arg.top_k, mode='a', file_name=file_name, save_path=arg.rspath)

    ##########################################################################################################
    ######################                            PREDICT                           ######################
    ##########################################################################################################

    if arg.predict:
        print('\n{0})- Predicting dataset using a pre-trained leADS model...'.format(steps))
        if arg.pathway_report or arg.extract_pf:
            print('\t>> Loading biocyc object...')
            # load a biocyc file
            data_object = load_data(file_name=arg.object_name, load_path=arg.ospath, tag='the biocyc object',
                                    print_tag=False)
            pathway_dict = data_object["pathway_id"]
            pathway_common_names = dict((pidx, data_object['processed_kb']['metacyc'][5][pid][0][1])
                                        for pid, pidx in pathway_dict.items()
                                        if pid in data_object['processed_kb']['metacyc'][5])
            ec_dict = data_object['ec_id']
            del data_object
            pathway_dict = dict((idx, id) for id, idx in pathway_dict.items())
            ec_dict = dict((idx, id) for id, idx in ec_dict.items())
            labels_components = load_data(file_name=arg.pathway2ec_name, load_path=arg.ospath, tag='M')
            print('\t>> Loading label to component mapping file object...')
            pathway2ec_idx = load_data(file_name=arg.pathway2ec_idx_name, load_path=arg.ospath, print_tag=False)
            pathway2ec_idx = list(pathway2ec_idx)
            tmp = list(ec_dict.keys())
            ec_dict = dict((idx, ec_dict[tmp.index(ec)]) for idx, ec in enumerate(pathway2ec_idx))
            if arg.extract_pf:
                X, sample_ids = parse_files(ec_dict=ec_dict, ds_folder=arg.dsfolder, dspath=arg.dspath,
                                            rspath=arg.rspath, num_jobs=arg.num_jobs)
                print('\t>> Storing X and sample_ids...')
                save_data(data=X, file_name=arg.file_name + '_X.pkl', save_path=arg.dspath,
                          tag='the pf dataset (X)', mode='w+b', print_tag=False)
                save_data(data=sample_ids, file_name=arg.file_name + '_ids.pkl', save_path=arg.dspath,
                          tag='samples ids', mode='w+b', print_tag=False)
                print('\t>> Loading heterogeneous information network file...')
                hin = load_data(file_name=arg.hin_name, load_path=arg.ospath,
                                tag='heterogeneous information network',
                                print_tag=False)
                # get pathway2ec mapping
                node2idx_pathway2ec = [node[0] for node in hin.nodes(data=True)]
                del hin
                print('\t>> Loading path2vec_features file...')
                path2vec_features = np.load(file=os.path.join(arg.ospath, arg.features_name))
                __build_features(X=X, pathwat_dict=pathway_dict, ec_dict=ec_dict,
                                 labels_components=labels_components,
                                 node2idx_pathway2ec=node2idx_pathway2ec,
                                 path2vec_features=path2vec_features,
                                 file_name=arg.file_name, dspath=arg.dspath,
                                 batch_size=arg.batch, num_jobs=arg.num_jobs)

        # load files
        print('\t>> Loading necessary files...')
        X = load_data(file_name=arg.X_name, load_path=arg.dspath, tag="X")
        tmp = lil_matrix.copy(X)
        bags_labels = None
        label_features = None
        centroids = None

        # load model
        model = load_data(file_name=arg.model_name + '.pkl', load_path=arg.mdpath, tag='leADS')

        if model.learn_bags:
            bags_labels = load_data(file_name=arg.bags_labels, load_path=arg.ospath,
                                    tag="bags_labels with associated pathways")
        if model.label_uncertainty_type == "dependent":
            label_features = load_data(file_name=arg.features_name, load_path=arg.ospath, tag="features")
            centroids = np.load(file=os.path.join(arg.ospath, arg.centroids))
            centroids = centroids[centroids.files[0]]

        # predict
        y_pred_Bags, y_pred = model.predict(X=X, bags_labels=bags_labels, label_features=label_features,
                                            centroids=centroids,
                                            estimate_prob=False, pred_bags=arg.pred_bags, pred_labels=arg.pred_labels,
                                            build_up=arg.build_up, pref_rank=arg.pref_rank, top_k_rank=arg.top_k_rank,
                                            subsample_labels_size=arg.ssample_label_size, soft_voting=arg.soft_voting,
                                            apply_t_criterion=arg.apply_tcriterion, adaptive_beta=arg.adaptive_beta,
                                            decision_threshold=arg.decision_threshold, batch_size=arg.batch,
                                            num_jobs=arg.num_jobs)
        # labels prediction score
        y_pred_Bags_score, y_pred_score = model.predict(X=X, bags_labels=bags_labels, label_features=label_features,
                                                        centroids=centroids, estimate_prob=True,
                                                        pred_bags=arg.pred_bags,
                                                        pred_labels=arg.pred_labels, build_up=arg.build_up,
                                                        pref_rank=arg.pref_rank, top_k_rank=arg.top_k_rank,
                                                        subsample_labels_size=arg.ssample_label_size,
                                                        soft_voting=arg.soft_voting,
                                                        apply_t_criterion=arg.apply_tcriterion,
                                                        adaptive_beta=arg.adaptive_beta,
                                                        decision_threshold=arg.decision_threshold,
                                                        batch_size=arg.batch, num_jobs=arg.num_jobs)
        if arg.pathway_report:
            print('\t>> Synthesizing pathway reports...')
            X = tmp
            sample_ids = np.arange(X.shape[0])
            if arg.extract_pf:
                sample_ids = load_data(file_name=arg.file_name + "_ids.pkl", load_path=arg.dspath, tag="samples ids")
            else:
                if arg.samples_ids is not None:
                    if arg.samples_ids in os.listdir(arg.dspath):
                        sample_ids = load_data(file_name=arg.samples_ids, load_path=arg.dspath, tag="samples ids")
            synthesize_report(X=X[:, :arg.cutting_point], sample_ids=sample_ids, y_pred=y_pred, y_dict_ids=pathway_dict,
                              y_common_name=pathway_common_names, component_dict=ec_dict,
                              labels_components=labels_components, y_pred_score=y_pred_score, batch_size=arg.batch,
                              num_jobs=arg.num_jobs, rspath=arg.rspath, dspath=arg.dspath, file_name=arg.file_name)
        else:
            print('\t>> Storing predictions (label index) to: {0:s}'.format(arg.file_name + '_y_leads.pkl'))
            save_data(data=y_pred, file_name=arg.file_name + "_y_leads.pkl", save_path=arg.dspath,
                      mode="wb", print_tag=False)
            if arg.pred_bags:
                print('\t>> Storing predictions (bag index) to: {0:s}'.format(
                    arg.file_name + '_yBags_leads.pkl'))
                save_data(data=y_pred_Bags, file_name=arg.file_name + "_yBags_leads.pkl", save_path=arg.dspath,
                          mode="wb", print_tag=False)
Example no. 12
    def __fit_by_tf(self, X, node_id, node_probability, index2type, type2index, type2prob, model_name, model_path,
                    result_path):
        ## Build layers for path2vec
        print('\t>> Building: path2vec layers...')
        logger.info('\t>> Building: path2vec layers...')
        timeref = time.time()
        center_node_holder, context_node_holder, negative_samples_holder, loss = self.__build_tf_place_holders(
            node_probability=node_probability)
        ## Optimization function for path2vec
        optimizer = self.__optimizer(center_node_holder, context_node_holder, negative_samples_holder)

        print('\t\t## Building layers consumed %.2f minutes' % (round((time.time() - timeref) / 60., 3)))
        logger.info('\t\t## Building layers consumed %.2f minutes' % (round((time.time() - timeref) / 60., 3)))
        print('\t>> Training path2vec...')
        logger.info('\t>> Training path2vec...')
        old_cost = np.inf
        timeref = time.time()
        cost_file_name = model_name + "_cost.txt"
        save_data('', file_name=cost_file_name, save_path=result_path, mode='w', w_string=True, print_tag=False)
        merged = tf.summary.merge_all()
        saver = tf.train.Saver(max_to_keep=self.num_models)
        config = tf.ConfigProto(intra_op_parallelism_threads=0,
                                inter_op_parallelism_threads=0,
                                allow_soft_placement=True)
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            writer = tf.summary.FileWriter(self.log_path, sess.graph)
            # Define metadata variable.
            run_metadata = tf.RunMetadata()
            for epoch in np.arange(start=1, stop=self.num_epochs + 1):
                desc = '\t   {0:d})- Epoch count ({0:d}/{1:d})...'.format(epoch, self.num_epochs)
                print(desc)
                logger.info(desc)
                self.__shffule(X=X)
                list_batches = np.arange(start=0, stop=len(X), step=self.batch)
                epoch_timeref = time.time()
                new_cost = 0.0
                for idx, batch in enumerate(list_batches):
                    total_samples = (idx + 1) / len(list_batches)
                    desc = '\t       --> Learning: {0:.4f}% ...'.format(total_samples * 100)
                    logger.info(desc)
                    if (idx + 1) != len(list_batches):
                        print(desc, end="\r")
                    if (idx + 1) == len(list_batches):
                        print(desc)
                    ## Generate batch negative samples
                    center_nodes, context_nodes = self.__generate_batch(X=X[batch:batch + self.batch])
                    negative_nodes = self.__get_negative_samples(center_nodes=center_nodes, node_id=node_id,
                                                                 node_probability=node_probability,
                                                                 index2type=index2type, type2index=type2index,
                                                                 type2probs=type2prob)
                    batch_X_size = self.batch
                    if self.batch > 150000:
                        batch_X_size = 10000

                    list_batch_X = np.arange(start=0, stop=center_nodes.shape[0], step=batch_X_size)
                    for b_idx, batch_X_idx in enumerate(list_batch_X):
                        center_batch = center_nodes[batch_X_idx:batch_X_idx + batch_X_size]
                        context_batch = context_nodes[batch_X_idx:batch_X_idx + batch_X_size]
                        negative_batch = negative_nodes[batch_X_idx:batch_X_idx + batch_X_size]
                        for inner_iterations in np.arange(self.max_inner_iter):
                            feed_dict = {center_node_holder: center_batch,
                                         context_node_holder: context_batch,
                                         negative_samples_holder: negative_batch}
                            # We perform one update step by evaluating the optimizer op (including it
                            # in the list of returned values for session.run()).
                            # Also, evaluate the merged op to get all summaries from the returned
                            # "summary" variable. Feed metadata variable to session for visualizing
                            # the graph in TensorBoard.
                            loss_batch, _, summary_str = sess.run([loss, optimizer, merged],
                                                                  feed_dict=feed_dict,
                                                                  run_metadata=run_metadata)
                            writer.add_summary(summary_str, inner_iterations)
                            loss_batch /= center_batch.shape[0]
                            new_cost += loss_batch / self.max_inner_iter
                    new_cost /= len(list_batch_X)
                new_cost /= len(list_batches)
                new_cost = new_cost * -1
                self.is_fit = True
                print('\t\t  ## Epoch {0} took {1} seconds...'.format(epoch, round(time.time() - epoch_timeref, 3)))
                logger.info(
                    '\t\t  ## Epoch {0} took {1} seconds...'.format(epoch, round(time.time() - epoch_timeref, 3)))
                data = str(epoch) + '\t' + str(round(time.time() - epoch_timeref, 3)) + '\t' + str(new_cost) + '\n'
                save_data(data=data, file_name=cost_file_name, save_path=result_path, mode='a', w_string=True,
                          print_tag=False)
                # Save models parameters based on test frequencies
                if (epoch % self.display_interval) == 0 or epoch == 1 or epoch == self.num_epochs:
                    print('\t\t  --> New cost: {0:.4f}; Old cost: {1:.4f}'.format(new_cost, old_cost))
                    logger.info('\t\t  --> New cost: {0:.4f}; Old cost: {1:.4f}'.format(new_cost, old_cost))
                    if new_cost < old_cost or epoch == self.num_epochs:
                        old_cost = new_cost
                        tag_final_file = "_tf.ckpt"
                        tag_final_embeddings = "_tf_embeddings.npz"
                        if epoch == self.num_epochs:
                            tag_final_file = "_final_tf.ckpt"
                            tag_final_embeddings = "_final_tf_embeddings.npz"

                        print('\t\t  --> Storing the path2vec model to: {0:s}'.format(model_name + tag_final_file))
                        logger.info(
                            '\t\t  --> Storing the path2vec model to: {0:s}'.format(model_name + tag_final_file))
                        saver.save(sess, os.path.join(model_path, model_name + tag_final_file))

                        print('\t\t  --> Storing the path2vec node embeddings as numpy array to: {0:s}'.format(
                            model_name + tag_final_embeddings))
                        logger.info('\t\t  --> Storing the path2vec node embeddings as numpy array to: {0:s}'.format(
                            model_name + tag_final_embeddings))
                        model_embeddings = tf.get_default_graph()
                        model_embeddings = model_embeddings.get_tensor_by_name("embeddings/embedding_matrix:0")
                        # Create a configuration for visualizing embeddings with the selected_pathways in TensorBoard.
                        # TODO: comment this
                        config = projector.ProjectorConfig()
                        embedding_conf = config.embeddings.add()
                        embedding_conf.tensor_name = model_embeddings.name
                        ##
                        model_embeddings = sess.run(model_embeddings)
                        np.savez(os.path.join(model_path, model_name + tag_final_embeddings), model_embeddings)
                        # TODO: comment this
                        embedding_conf.metadata_path = os.path.join(model_path, model_name + '_metadata.tsv')
                        projector.visualize_embeddings(writer, config)
            writer.close()
            print('\t  --> Training consumed %.2f minutes' % (round((time.time() - timeref) / 60., 3)))
            logger.info('\t  --> Training consumed %.2f minutes' % (round((time.time() - timeref) / 60., 3)))
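For reference, a NumPy sketch (an assumption about the objective, not the actual TensorFlow loss built above) of the skip-gram negative-sampling cost that __fit_by_tf optimizes per (center, context, negatives) triple: maximize log sigmoid(u_ctx . v_center) plus the sum over negatives of log sigmoid(-u_neg . v_center), i.e. minimize its negation.

import numpy as np

def sgns_loss(center_vec, context_vec, negative_vecs):
    # Skip-gram with negative sampling for a single training triple.
    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
    positive = np.log(sigmoid(context_vec @ center_vec))
    negative = np.sum(np.log(sigmoid(-negative_vecs @ center_vec)))
    return -(positive + negative)  # minimized during training

rng = np.random.default_rng(0)
print(sgns_loss(rng.normal(size=16), rng.normal(size=16), rng.normal(size=(5, 16))))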
Example no. 13
    def __fit_by_word2vec(self, X, type2index, model_name, model_path, result_path):
        '''
        Learn embeddings by optimizing the Skipgram objective using SGD.
        '''
        old_cost = np.inf
        timeref = time.time()
        cost_file_name = model_name + "_word2vec_cost.txt"
        save_data('', file_name=cost_file_name, save_path=result_path, mode='w', w_string=True, print_tag=False)
        print('\t>> Training by word2vec model...')
        logger.info('\t>> Training by word2vec model...')
        model = word2vec.Word2Vec(size=self.embedding_dimension, window=self.window_size, min_count=0,
                                  sg=1, workers=self.num_jobs, negative=self.num_negative_samples,
                                  compute_loss=True)
        print('\t>> Building vocabulary...')
        logger.info('\t>> Building vocabulary...')
        model.build_vocab(X)
        n_epochs = self.num_epochs + 1
        if self.constraint_type:
            n_epochs = self.num_epochs + 2
            node_type = [t for t, nodes in type2index.items()]
            list_type = list()
            for items, t in enumerate(node_type):
                list_type.append([str(node) for node in type2index[t] if str(node) in model])
        for epoch in np.arange(start=1, stop=n_epochs):
            desc = '\t   {0:d})- Epoch count ({0:d}/{1:d})...'.format(epoch, n_epochs - 1)
            print(desc)
            logger.info(desc)
            self.__shffule(X=X)
            list_batches = np.arange(start=0, stop=len(X), step=self.batch)
            epoch_timeref = time.time()
            new_cost = 0.0
            for idx, batch in enumerate(list_batches):
                desc = '\t       --> Learning: {0:.2f}% ...'.format(((idx + 1) / len(list_batches)) * 100)
                logger.info(desc)
                if (idx + 1) != len(list_batches):
                    print(desc, end="\r")
                if (idx + 1) == len(list_batches):
                    print(desc)
                model.train(X[batch:batch + self.batch], total_examples=len(X[batch:batch + self.batch]),
                            epochs=self.max_inner_iter, compute_loss=True)
                if self.constraint_type:
                    for items in list_type:
                        emb = model[items]
                        denominator = np.sum(np.triu(np.dot(emb, emb.T), 1))
                        emb = emb / denominator
                        for i, node in enumerate(items):
                            model.wv.syn0[model.wv.vocab[node].index] = emb[i]
                new_cost += model.get_latest_training_loss() / len(list_batches)
                new_cost /= self.max_inner_iter
            if self.constraint_type and epoch == 1:
                continue
            self.is_fit = True
            print('\t\t  ## Epoch {0} took {1} seconds...'.format(epoch, round(time.time() - epoch_timeref, 3)))
            logger.info(
                '\t\t  ## Epoch {0} took {1} seconds...'.format(epoch, round(time.time() - epoch_timeref, 3)))
            data = str(epoch) + '\t' + str(round(time.time() - epoch_timeref, 3)) + '\t' + str(new_cost) + '\n'
            save_data(data=data, file_name=cost_file_name, save_path=result_path, mode='a', w_string=True,
                      print_tag=False)
            # Save models parameters based on test frequencies
            if (epoch % self.display_interval) == 0 or epoch == 1 or epoch == n_epochs - 1:
                print('\t\t  --> New cost: {0:.4f}; Old cost: {1:.4f}'.format(new_cost, old_cost))
                logger.info('\t\t  --> New cost: {0:.4f}; Old cost: {1:.4f}'.format(new_cost, old_cost))

                if new_cost < old_cost or epoch == n_epochs - 1:
                    old_cost = new_cost

                    tag_final_file = "_word2vec.ckpt"
                    tag_final_embeddings = "_word2vec_embeddings.npz"
                    if epoch == n_epochs - 1:
                        tag_final_file = "_final_word2vec.ckpt"
                        tag_final_embeddings = "_final_word2vec_embeddings.npz"

                    print('\t\t  --> Storing the path2vec model to: {0:s}'.format(model_name + tag_final_file))
                    logger.info('\t\t  --> Storing the path2vec model to: {0:s}'.format(model_name + tag_final_file))
                    model.wv.save_word2vec_format(os.path.join(model_path, model_name + tag_final_file))

                    print('\t\t  --> Storing the path2vec node embeddings as numpy array to: {0:s}'.format(
                        model_name + tag_final_embeddings))
                    logger.info('\t\t  --> Storing the path2vec node embeddings as numpy array to: {0:s}'.format(
                        model_name + tag_final_embeddings))
                    model_embeddings = np.zeros((self.node_size, self.embedding_dimension), dtype=np.float32)
                    for v_idx in np.arange(self.node_size):
                        if str(v_idx) in model.wv.vocab:
                            model_embeddings[v_idx] = model[str(v_idx)]
                    np.savez(os.path.join(model_path, model_name + tag_final_embeddings), model_embeddings)

        print('\t  --> Training consumed %.2f minutes' % (round((time.time() - timeref) / 60., 3)))
        logger.info('\t  --> Training consumed %.2f minutes' % (round((time.time() - timeref) / 60., 3)))
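A standalone toy run of the same skip-gram setup used in __fit_by_word2vec, on a handful of synthetic node walks. It assumes gensim 3.x, where the constructor takes size= and the loss is read via get_latest_training_loss(); newer gensim releases renamed size to vector_size.

from gensim.models import word2vec

walks = [['1', '4', '2', '7'], ['3', '1', '5', '2'], ['7', '2', '4', '1']]
model = word2vec.Word2Vec(size=16, window=2, min_count=0, sg=1,
                          workers=1, negative=3, compute_loss=True)
model.build_vocab(walks)
model.train(walks, total_examples=len(walks), epochs=5)
print(model.wv['1'].shape)               # (16,)
print(model.get_latest_training_loss())  # cumulative skip-gram loss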
Example no. 14
def __train(arg):
    # Setup the number of operations to employ
    steps = 1
    # Whether to display parameters at every operation
    display_params = True

    ##########################################################################################################
    ######################                            TRAIN                             ######################
    ##########################################################################################################

    if arg.train:
        print('\t>> Loading files...')
        dictionary = load_data(file_name=arg.vocab_name,
                               load_path=arg.dspath,
                               tag="dictionary",
                               print_tag=False)
        X = load_data(file_name=arg.X_name,
                      load_path=arg.dspath,
                      tag="X",
                      print_tag=False)
        M = None
        features = None
        if arg.use_supplement:
            M = load_data(file_name=arg.M_name,
                          load_path=arg.dspath,
                          tag="supplementary components")
            M = M.toarray()
        if arg.use_features:
            features = load_data(file_name=arg.features_name,
                                 load_path=arg.dspath,
                                 tag="features")

        if arg.soap:
            print('\n{0})- Training using SOAP model...'.format(steps))
            steps = steps + 1
            model_name = 'soap_' + arg.model_name
            model = SOAP(vocab=dictionary.token2id,
                         num_components=arg.num_components,
                         alpha_mu=arg.alpha_mu,
                         alpha_sigma=arg.alpha_sigma,
                         alpha_phi=arg.alpha_phi,
                         gamma=arg.gamma,
                         kappa=arg.kappa,
                         xi=arg.xi,
                         varpi=arg.varpi,
                         optimization_method=arg.opt_method,
                         cost_threshold=arg.cost_threshold,
                         component_threshold=arg.component_threshold,
                         max_sampling=arg.max_sampling,
                         subsample_input_size=arg.subsample_input_size,
                         batch=arg.batch,
                         num_epochs=arg.num_epochs,
                         max_inner_iter=arg.max_inner_iter,
                         top_k=arg.top_k,
                         collapse2ctm=arg.collapse2ctm,
                         use_features=arg.use_features,
                         num_jobs=arg.num_jobs,
                         display_interval=arg.display_interval,
                         shuffle=arg.shuffle,
                         forgetting_rate=arg.forgetting_rate,
                         delay_factor=arg.delay_factor,
                         random_state=arg.random_state,
                         log_path=arg.logpath)
            model.fit(X=X,
                      M=M,
                      features=features,
                      model_name=model_name,
                      model_path=arg.mdpath,
                      result_path=arg.rspath,
                      display_params=display_params)

        if arg.spreat:
            print('\n{0})- Training using SPREAT model...'.format(steps))
            steps = steps + 1
            model_name = 'spreat_' + arg.model_name
            model = SPREAT(vocab=dictionary.token2id,
                           num_components=arg.num_components,
                           alpha_mu=arg.alpha_mu,
                           alpha_sigma=arg.alpha_sigma,
                           alpha_phi=arg.alpha_phi,
                           gamma=arg.gamma,
                           kappa=arg.kappa,
                           xi=arg.xi,
                           varpi=arg.varpi,
                           optimization_method=arg.opt_method,
                           cost_threshold=arg.cost_threshold,
                           component_threshold=arg.component_threshold,
                           max_sampling=arg.max_sampling,
                           subsample_input_size=arg.subsample_input_size,
                           batch=arg.batch,
                           num_epochs=arg.num_epochs,
                           max_inner_iter=arg.max_inner_iter,
                           top_k=arg.top_k,
                           collapse2ctm=arg.collapse2ctm,
                           use_features=arg.use_features,
                           num_jobs=arg.num_jobs,
                           display_interval=arg.display_interval,
                           shuffle=arg.shuffle,
                           forgetting_rate=arg.forgetting_rate,
                           delay_factor=arg.delay_factor,
                           random_state=arg.random_state,
                           log_path=arg.logpath)
            model.fit(X=X,
                      M=M,
                      features=features,
                      model_name=model_name,
                      model_path=arg.mdpath,
                      result_path=arg.rspath,
                      display_params=display_params)

        if arg.ctm:
            print('\n{0})- Training using CTM model...'.format(steps))
            steps = steps + 1
            model_name = 'ctm_' + arg.model_name
            model = CTM(vocab=dictionary.token2id,
                        num_components=arg.num_components,
                        alpha_mu=arg.alpha_mu,
                        alpha_sigma=arg.alpha_sigma,
                        alpha_beta=arg.alpha_phi,
                        optimization_method=arg.opt_method,
                        cost_threshold=arg.cost_threshold,
                        component_threshold=arg.component_threshold,
                        subsample_input_size=arg.subsample_input_size,
                        batch=arg.batch,
                        num_epochs=arg.num_epochs,
                        max_inner_iter=arg.max_inner_iter,
                        num_jobs=arg.num_jobs,
                        display_interval=arg.display_interval,
                        shuffle=arg.shuffle,
                        forgetting_rate=arg.forgetting_rate,
                        delay_factor=arg.delay_factor,
                        random_state=arg.random_state,
                        log_path=arg.logpath)
            model.fit(X=X,
                      model_name=model_name,
                      model_path=arg.mdpath,
                      result_path=arg.rspath,
                      display_params=display_params)

        if arg.lda:
            print(
                '\n{0})- Training using LDA (sklearn) model...'.format(steps))
            steps = steps + 1
            model_name = 'sklda_' + arg.model_name
            model = skLDA(n_components=arg.num_components,
                          learning_method='batch',
                          learning_decay=arg.delay_factor,
                          learning_offset=arg.forgetting_rate,
                          max_iter=1,
                          batch_size=arg.batch,
                          evaluate_every=arg.display_interval,
                          perp_tol=arg.cost_threshold,
                          mean_change_tol=arg.component_threshold,
                          max_doc_update_iter=arg.max_inner_iter,
                          n_jobs=arg.num_jobs,
                          verbose=0,
                          random_state=arg.random_state)
            print('\t>> Training by LDA model...')
            n_epochs = arg.num_epochs + 1
            old_bound = np.inf
            num_samples = int(X.shape[0] * arg.subsample_input_size)
            list_batches = np.arange(start=0, stop=num_samples, step=arg.batch)
            cost_file_name = model_name + "_cost.txt"
            save_data('',
                      file_name=cost_file_name,
                      save_path=arg.rspath,
                      mode='w',
                      w_string=True,
                      print_tag=False)
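            # Each epoch samples a fraction (subsample_input_size) of the rows
            # in X without replacement and streams them to partial_fit in
            # mini-batches of size arg.batch; per-epoch timing and cost are
            # appended to <model_name>_cost.txt.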
            for epoch in np.arange(start=1, stop=n_epochs):
                desc = '\t   {0:d})- Epoch count ({0:d}/{1:d})...'.format(
                    epoch, n_epochs - 1)
                print(desc)
                idx = np.random.choice(X.shape[0], num_samples, False)
                start_epoch = time.time()
                X_tmp = X[idx, :]
                for bidx, batch in enumerate(list_batches):
                    desc = '\t       --> Training: {0:.2f}%...'.format(
                        ((bidx + 1) / len(list_batches)) * 100)
                    if (bidx + 1) != len(list_batches):
                        print(desc, end="\r")
                    if (bidx + 1) == len(list_batches):
                        print(desc)
                    model.partial_fit(X=X_tmp[batch:batch + arg.batch])
                end_epoch = time.time()
                new_bound = -model.score(X=X_tmp) / X.shape[1]
                new_bound = np.log(new_bound)
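                # The epoch cost is the negative per-feature variational score
                # (model.score returns an approximate log-likelihood),
                # log-transformed; the model is checkpointed below whenever
                # this cost does not increase, and always on the last epoch.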
                print('\t\t  ## Epoch {0} took {1} seconds...'.format(
                    epoch, round(end_epoch - start_epoch, 3)))
                data = '{0}\t{1}\t{2}\n'.format(
                    epoch, round(end_epoch - start_epoch, 3), new_bound)
                save_data(data=data,
                          file_name=cost_file_name,
                          save_path=arg.rspath,
                          mode='a',
                          w_string=True,
                          print_tag=False)
                print('\t\t  --> New cost: {0:.4f}; Old cost: {1:.4f}'.format(
                    new_bound, old_bound))
                if new_bound <= old_bound or epoch == n_epochs - 1:
                    print('\t\t  --> Storing the LDA phi to: {0:s}'.format(
                        model_name + '_phi.npz'))
                    np.savez(os.path.join(arg.mdpath, model_name + '_phi.npz'),
                             model.components_)
                    print(
                        '\t\t  --> Storing the LDA (sklearn) model to: {0:s}'.
                        format(model_name + '.pkl'))
                    save_data(data=model,
                              file_name=model_name + '.pkl',
                              save_path=arg.mdpath,
                              mode="wb",
                              print_tag=False)
                    if epoch == n_epochs - 1:
                        print('\t\t  --> Storing the LDA phi to: {0:s}'.format(
                            model_name + '_phi_final.npz'))
                        np.savez(
                            os.path.join(arg.mdpath,
                                         model_name + '_phi_final.npz'),
                            model.components_)
                        print(
                            '\t\t  --> Storing the LDA (sklearn) model to: {0:s}'
                            .format(model_name + '_final.pkl'))
                        save_data(data=model,
                                  file_name=model_name + '_final.pkl',
                                  save_path=arg.mdpath,
                                  mode="wb",
                                  print_tag=False)
                    old_bound = new_bound
        display_params = False

    ##########################################################################################################
    ######################                           EVALUATE                           ######################
    ##########################################################################################################

    if arg.evaluate:
        print('\t>> Loading files...')
        dictionary = load_data(file_name=arg.vocab_name,
                               load_path=arg.dspath,
                               tag="vocabulary",
                               print_tag=False)
        X = load_data(file_name=arg.X_name,
                      load_path=arg.dspath,
                      tag="X",
                      print_tag=False)
        corpus = load_data(file_name=arg.text_name,
                           load_path=arg.dspath,
                           tag="X (a list of strings)",
                           print_tag=False)
        data = [[dictionary[i] for i, j in item] for item in corpus]
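        # corpus is assumed to be a gensim bag-of-words corpus (lists of
        # (token_id, count) pairs), so data recovers the raw token lists that
        # CoherenceModel needs for the text-based measures (c_v, c_uci, c_npmi).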

        M = None
        features = None
        if arg.use_supplement:
            M = load_data(file_name=arg.M_name,
                          load_path=arg.dspath,
                          tag="supplementary components")
            M = M.toarray()
        if arg.use_features:
            features = load_data(file_name=arg.features_name,
                                 load_path=arg.dspath,
                                 tag="features")

        if arg.soap:
            print('\n{0})- Evaluating SOAP model...'.format(steps))
            steps = steps + 1
            model_name = 'soap_' + arg.model_name + '.pkl'
            file_name = 'soap_' + arg.model_name + '_score.txt'
            print('\t>> Loading SOAP model...')
            model = load_data(file_name=model_name,
                              load_path=arg.mdpath,
                              tag='SOAP model',
                              print_tag=False)
            score = model.predictive_distribution(X=X,
                                                  M=M,
                                                  features=features,
                                                  cal_average=arg.cal_average,
                                                  batch_size=arg.batch,
                                                  num_jobs=arg.num_jobs)
            print("\t>> Average log predictive score: {0:.4f}".format(score))
            save_data(data="# Average log predictive score: {0:.10f}\n".format(
                score),
                      file_name=file_name,
                      save_path=arg.rspath,
                      tag="log predictive score",
                      mode='w',
                      w_string=True,
                      print_tag=False)
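            # Take the top_k highest-weight vocabulary entries per component
            # (rows of phi) and score them as topics with gensim's
            # CoherenceModel under several coherence measures.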
            components = np.argsort(-model.phi)[:, :arg.top_k]
            components = [[dictionary[i] for i in item] for item in components]
            for cr in ['u_mass', 'c_v', 'c_uci', 'c_npmi']:
                cm = CoherenceModel(texts=data,
                                    topics=components,
                                    corpus=corpus,
                                    dictionary=dictionary,
                                    coherence=cr)
                coherence = cm.get_coherence()
                print("\t>> Average coherence ({0}) score: {1:.4f}".format(
                    cr, coherence))
                save_data(
                    data="# Average coherence ({0}) score: {1:.4f}\n".format(
                        cr, coherence),
                    file_name=file_name,
                    save_path=arg.rspath,
                    tag="coherence score",
                    mode='a',
                    w_string=True,
                    print_tag=False)

        if arg.spreat:
            print('\n{0})- Evaluating SPREAT model...'.format(steps))
            steps = steps + 1
            model_name = 'spreat_' + arg.model_name + '.pkl'
            file_name = 'spreat_' + arg.model_name + '_score.txt'
            print('\t>> Loading SPREAT model...')
            model = load_data(file_name=model_name,
                              load_path=arg.mdpath,
                              tag='SPREAT model',
                              print_tag=False)
            score = model.predictive_distribution(X=X,
                                                  M=M,
                                                  features=features,
                                                  cal_average=arg.cal_average,
                                                  batch_size=arg.batch,
                                                  num_jobs=arg.num_jobs)
            print("\t>> Average log predictive score: {0:.4f}".format(score))
            save_data(data="# Average log predictive score: {0:.10f}\n".format(
                score),
                      file_name=file_name,
                      save_path=arg.rspath,
                      tag="log predictive score",
                      mode='w',
                      w_string=True,
                      print_tag=False)
            components = np.argsort(-model.phi)[:, :arg.top_k]
            components = [[dictionary[i] for i in item] for item in components]
            for cr in ['u_mass', 'c_v', 'c_uci', 'c_npmi']:
                cm = CoherenceModel(texts=data,
                                    topics=components,
                                    corpus=corpus,
                                    dictionary=dictionary,
                                    coherence=cr)
                coherence = cm.get_coherence()
                print("\t>> Average coherence ({0}) score: {1:.4f}".format(
                    cr, coherence))
                save_data(
                    data="# Average coherence ({0}) score: {1:.4f}\n".format(
                        cr, coherence),
                    file_name=file_name,
                    save_path=arg.rspath,
                    tag="coherence score",
                    mode='a',
                    w_string=True,
                    print_tag=False)

        if arg.ctm:
            print('\n{0})- Evaluating CTM model...'.format(steps))
            steps = steps + 1
            model_name = 'ctm_' + arg.model_name + '.pkl'
            file_name = 'ctm_' + arg.model_name + '_score.txt'
            print('\t>> Loading CTM model...')
            model = load_data(file_name=model_name,
                              load_path=arg.mdpath,
                              tag='CTM model',
                              print_tag=False)
            score = model.predictive_distribution(X=X,
                                                  cal_average=arg.cal_average,
                                                  batch_size=arg.batch,
                                                  num_jobs=arg.num_jobs)
            print("\t>> Average log predictive score: {0:.4f}".format(score))
            save_data(data="# Average log predictive score: {0:.10f}\n".format(
                score),
                      file_name=file_name,
                      save_path=arg.rspath,
                      tag="log predictive score",
                      mode='w',
                      w_string=True,
                      print_tag=False)
            components = np.argsort(-model.omega)[:, :arg.top_k]
            components = [[dictionary[i] for i in item] for item in components]
            for cr in ['u_mass', 'c_v', 'c_uci', 'c_npmi']:
                cm = CoherenceModel(texts=data,
                                    topics=components,
                                    corpus=corpus,
                                    dictionary=dictionary,
                                    coherence=cr)
                coherence = cm.get_coherence()
                print("\t>> Average coherence ({0}) score: {1:.4f}".format(
                    cr, coherence))
                save_data(
                    data="# Average coherence ({0}) score: {1:.4f}\n".format(
                        cr, coherence),
                    file_name=file_name,
                    save_path=arg.rspath,
                    tag="coherence score",
                    mode='a',
                    w_string=True,
                    print_tag=False)

        if arg.lda:
            print('\n{0})- Evaluating LDA model...'.format(steps))
            steps = steps + 1
            model_name = 'sklda_' + arg.model_name + '.pkl'
            file_name = 'sklda_' + arg.model_name + '_score.txt'
            print('\t>> Loading LDA model...')
            model = load_data(file_name=model_name,
                              load_path=arg.mdpath,
                              tag='LDA model',
                              print_tag=False)
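            # Row-normalize components_ into per-topic word distributions,
            # then approximate a per-sample predictive score by weighting each
            # document's observed features with its inferred topic mixture.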
            model.components_ /= model.components_.sum(1)[:, np.newaxis]
            component_distribution = model.transform(X=X)
            score = 0.0
            for idx in np.arange(X.shape[0]):
                feature_idx = X[idx].indices
                temp = np.multiply(component_distribution[idx][:, np.newaxis],
                                   model.components_[:, feature_idx])
                score += np.sum(temp)
            if arg.cal_average:
                score = score / X.shape[0]
            # use the builtin float; the np.float alias was removed in recent NumPy
            score = np.log(score + np.finfo(float).eps)
            print("\t>> Average log predictive score: {0:.4f}".format(score))
            save_data(data="# Average log predictive score: {0:.10f}\n".format(
                score),
                      file_name=file_name,
                      save_path=arg.rspath,
                      tag="log predictive score",
                      mode='w',
                      w_string=True,
                      print_tag=False)
            components = np.argsort(-model.components_)[:, :arg.top_k]
            components = [[dictionary[i] for i in item] for item in components]
            for cr in ['u_mass', 'c_v', 'c_uci', 'c_npmi']:
                cm = CoherenceModel(texts=data,
                                    topics=components,
                                    corpus=corpus,
                                    dictionary=dictionary,
                                    coherence=cr)
                coherence = cm.get_coherence()
                print("\t>> Average coherence ({0}) score: {1:.4f}".format(
                    cr, coherence))
                save_data(
                    data="# Average coherence ({0}) score: {1:.4f}\n".format(
                        cr, coherence),
                    file_name=file_name,
                    save_path=arg.rspath,
                    tag="coherence score",
                    mode='a',
                    w_string=True,
                    print_tag=False)

    ##########################################################################################################
    ######################                           TRANSFORM                          ######################
    ##########################################################################################################

    if arg.transform:
        print('\t>> Loading files...')
        X = load_data(file_name=arg.X_name,
                      load_path=arg.dspath,
                      tag="X",
                      print_tag=False)

        M = None
        features = None
        if arg.use_supplement:
            M = load_data(file_name=arg.M_name,
                          load_path=arg.dspath,
                          tag="supplementary components")
            M = M.toarray()
        if arg.use_features:
            features = load_data(file_name=arg.features_name,
                                 load_path=arg.dspath,
                                 tag="features")

        if arg.soap:
            print('\n{0})- Transforming {1} using a pre-trained SOAP model...'.
                  format(steps, arg.X_name))
            steps = steps + 1
            model_name = 'soap_' + arg.model_name + '.pkl'
            file_name = 'soap_' + arg.file_name + '.pkl'
            print('\t>> Loading SOAP model...')
            model = load_data(file_name=model_name,
                              load_path=arg.mdpath,
                              tag='SOAP model',
                              print_tag=False)
            X = model.transform(X=X,
                                M=M,
                                features=features,
                                batch_size=arg.batch,
                                num_jobs=arg.num_jobs)
            save_data(data=X,
                      file_name=file_name,
                      save_path=arg.dspath,
                      tag="transformed X",
                      mode='wb',
                      print_tag=True)

        if arg.spreat:
            print(
                '\n{0})- Transforming {1} using a pre-trained SPREAT model...'.
                format(steps, arg.X_name))
            steps = steps + 1
            model_name = 'spreat_' + arg.model_name + '.pkl'
            file_name = 'spreat_' + arg.file_name + '.pkl'
            print('\t>> Loading SPREAT model...')
            model = load_data(file_name=model_name,
                              load_path=arg.mdpath,
                              tag='SPREAT model',
                              print_tag=False)
            X = model.transform(X=X,
                                M=M,
                                features=features,
                                batch_size=arg.batch,
                                num_jobs=arg.num_jobs)
            save_data(data=X,
                      file_name=file_name,
                      save_path=arg.dspath,
                      tag="transformed X",
                      mode='wb',
                      print_tag=True)

        if arg.ctm:
            print('\n{0})- Transforming {1} using a pre-trained CTM model...'.
                  format(steps, arg.X_name))
            steps = steps + 1
            model_name = 'ctm_' + arg.model_name + '.pkl'
            file_name = 'ctm_' + arg.file_name + '.pkl'
            print('\t>> Loading CTM model...')
            model = load_data(file_name=model_name,
                              load_path=arg.mdpath,
                              tag='CTM model',
                              print_tag=False)
            X = model.transform(X=X,
                                batch_size=arg.batch,
                                num_jobs=arg.num_jobs)
            save_data(data=X,
                      file_name=file_name,
                      save_path=arg.dspath,
                      tag="transformed X",
                      mode='wb',
                      print_tag=True)

        if arg.lda:
            print('\n{0})- Transforming {1} using a pre-trained LDA model...'.
                  format(steps, arg.X_name))
            steps = steps + 1
            model_name = 'sklda_' + arg.model_name + '.pkl'
            file_name = 'sklda_' + arg.file_name + '.pkl'
            print('\t>> Loading LDA model...')
            model = load_data(file_name=model_name,
                              load_path=arg.mdpath,
                              tag='LDA model',
                              print_tag=False)
            X = model.transform(X=X)
            save_data(data=X,
                      file_name=file_name,
                      save_path=arg.dspath,
                      tag="transformed X",
                      mode='wb',
                      print_tag=True)