def redis_set_helper(key, data, pipe, npz = False):
	with BytesIO() as b:
		if npz:
			save_npz(b, data)
		else:
			np.save(b, data)
		pipe.set(key, b.getvalue())
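A minimal counterpart sketch for reading such a value back (an assumption, not part of the original snippet: it uses a plain `redis.Redis` client `r` rather than a pipeline; both `load_npz` and `np.load` accept the `BytesIO` wrapper as a file-like object):

from io import BytesIO

import numpy as np
from scipy.sparse import load_npz

def redis_get_helper(key, r, npz=False):
	# r is assumed to be a redis.Redis client; get() returns the raw bytes written above, or None
	raw = r.get(key)
	if raw is None:
		return None
	with BytesIO(raw) as b:
		# load_npz for sparse matrices, np.load for dense arrays (mirrors the npz flag above)
		return load_npz(b) if npz else np.load(b)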
Example No. 2
def _save_and_load(matrix):
    fd, tmpfile = tempfile.mkstemp(suffix='.npz')
    os.close(fd)
    try:
        save_npz(tmpfile, matrix)
        loaded_matrix = load_npz(tmpfile)
    finally:
        os.remove(tmpfile)
    return loaded_matrix
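A quick, hedged usage check for the helper above (it relies on `_save_and_load` and its imports; the 5x5 identity matrix is an arbitrary test input):

import numpy as np
from scipy.sparse import identity

m = identity(5, format='csr')       # arbitrary small sparse test matrix
m2 = _save_and_load(m)
assert np.array_equal(m.toarray(), m2.toarray())  # round trip preserves the contents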
Example No. 3
def transferG2ADJ():
    G = json_graph.node_link_graph(json.load(open("reddit/reddit-G.json")))
    feat_id_map = json.load(open("reddit/reddit-id_map.json"))
    feat_id_map = {id: val for id, val in feat_id_map.items()}
    numNode = len(feat_id_map)
    # adj = np.zeros((numNode, numNode))  # dense adjacency (unused; replaced by the sparse construction below)
    newEdges0 = [feat_id_map[edge[0]] for edge in G.edges()]
    newEdges1 = [feat_id_map[edge[1]] for edge in G.edges()]

    # for edge in G.edges():
    #     adj[feat_id_map[edge[0]], feat_id_map[edge[1]]] = 1
    adj = sp.csr_matrix((np.ones((len(newEdges0),)), (newEdges0, newEdges1)), shape=(numNode, numNode))
    sp.save_npz("reddit_adj.npz", adj)
Example No. 4
def main():
    args, kwargs = get_args()

    result = preprocess(args.data, format=args.format, kwargs=kwargs,
                        col_order=args.col_order, k_cores=args.k_cores,
                        save_map=args.save_map, output=args.output,
                        user_map=args.user_map, item_map=args.item_map,
                        train_size=args.train_size, dtype=args.dtype,
                        debug=args.debug, timestamp=args.timestamp)

    if args.save_map:
        json_dump(result['user_map'], os.path.join(args.output, args.user_map))
        json_dump(result['item_map'], os.path.join(args.output, args.item_map))

    if args.timestamp:
        save_npz(os.path.join(args.output, args.timestamp_file), result['timestamp'])

    save_npz(os.path.join(args.output, args.train_file), result['train'])
    save_npz(os.path.join(args.output, args.test_file), result['test'])
Example No. 5
    def save_similarity(self, name_file, compressed=False):
        sps.save_npz(name_file, self.s, compressed)
Example No. 6
        for i, w in enumerate(line_as_idx):
          # keep count
          k += 1
          if k % 10000 == 0:
            print("%s/%s" % (k, num_tokens))

          start = max(0, i - context_size)
          end   = min(len(line_as_idx), i + context_size)
          for c in line_as_idx[start:i]:
            wc_counts[w, c] += 1
          for c in line_as_idx[i+1:end]:
            wc_counts[w, c] += 1
  print("Finished counting")

  save_npz('pmi_counts_%s.npz' % V, csr_matrix(wc_counts))

else:
  wc_counts = load_npz('pmi_counts_%s.npz' % V)


# context counts get raised ^ 0.75
c_counts = wc_counts.sum(axis=0).A.flatten() ** 0.75
c_probs = c_counts / c_counts.sum()
c_probs = c_probs.reshape(1, V)


# PMI(w, c) = #(w, c) / #(w) / p(c)
# pmi = wc_counts / wc_counts.sum(axis=1) / c_probs # works only if numpy arrays
pmi = wc_counts.multiply(1.0 / wc_counts.sum(axis=1) / c_probs).tocsr()
# this operation changes it to a coo_matrix
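A self-contained toy version of the PMI step above (the 4x4 count matrix and V=4 are made up for illustration): `.multiply` broadcasts the row totals and context probabilities elementwise while keeping the result sparse, whereas the plain `/` in the commented-out line only works on dense numpy arrays.

import numpy as np
from scipy.sparse import csr_matrix

V = 4
wc_counts = csr_matrix(np.array([
  [0., 2., 1., 0.],
  [2., 0., 0., 1.],
  [1., 0., 0., 3.],
  [0., 1., 3., 0.],
]))

c_counts = wc_counts.sum(axis=0).A.flatten() ** 0.75   # smoothed context counts
c_probs = (c_counts / c_counts.sum()).reshape(1, V)

# elementwise multiply keeps sparsity; the result comes back as coo, so convert to csr
pmi = wc_counts.multiply(1.0 / wc_counts.sum(axis=1) / c_probs).tocsr()
print(np.round(pmi.toarray(), 3))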
Example No. 7
    def create_local_clustering(self, overwrite, r_thresh, min_region_size=80):
        """
        API for performing any of a variety of clustering routines available
         through NiLearn.
        """
        import os.path as op
        from scipy.sparse import save_npz, load_npz
        from nilearn.regions import connected_regions

        try:
            conn_comps = connected_regions(
                self._clust_mask_corr_img,
                extract_type="connected_components",
                min_region_size=min_region_size,
            )
            self._conn_comps = conn_comps[0]
            self.num_conn_comps = len(conn_comps[1])
        except BaseException:
            try:
                raise ValueError("Clustering mask is empty!")
            except ValueError:
                import sys
                sys.exit(1)

        if not self._conn_comps:
            if np.sum(np.asarray(self._clust_mask_corr_img.dataobj)) == 0:
                try:
                    raise ValueError("Clustering mask is empty!")
                except ValueError:
                    import sys
                    sys.exit(1)
            else:
                self._conn_comps = self._clust_mask_corr_img
                self.num_conn_comps = 1
        print(f"Detected {self.num_conn_comps} connected components in "
              f"clustering mask with a mininimum region "
              f"size of {min_region_size}")
        if (self.clust_type == "complete" or self.clust_type == "average"
                or self.clust_type == "single"):
            if self.num_conn_comps > 1:
                try:
                    raise ValueError(
                        "Clustering method unstable with spatial constrainsts "
                        "applied to multiple connected components.")
                except ValueError:
                    import sys
                    sys.exit(0)

        if (self.clust_type == "ward"
                and self.num_conn_comps > 1) or self.clust_type == "ncut":
            if self.k < self.num_conn_comps:
                try:
                    raise ValueError(
                        "k must minimally be greater than the total number of "
                        "connected components in "
                        "the mask in the case of agglomerative clustering.")
                except ValueError:
                    import sys
                    sys.exit(0)

            if self.local_corr == "tcorr" or self.local_corr == "scorr":
                self._local_conn_mat_path = (f"{self.uatlas.split('.nii')[0]}_"
                                             f"{self.local_corr}_conn.npz")

                if (not op.isfile(
                        self._local_conn_mat_path)) or (overwrite is True):
                    from pynets.fmri.clustools import (
                        make_local_connectivity_tcorr,
                        make_local_connectivity_scorr,
                    )

                    if self.local_corr == "tcorr":
                        self._local_conn = make_local_connectivity_tcorr(
                            self._func_img,
                            self._clust_mask_corr_img,
                            thresh=r_thresh)
                    elif self.local_corr == "scorr":
                        self._local_conn = make_local_connectivity_scorr(
                            self._func_img,
                            self._clust_mask_corr_img,
                            thresh=r_thresh)
                    else:
                        try:
                            raise ValueError(
                                "Local connectivity type not available")
                        except ValueError:
                            import sys
                            sys.exit(0)
                    print(
                        f"Saving spatially constrained connectivity structure"
                        f" to: {self._local_conn_mat_path}")
                    save_npz(self._local_conn_mat_path, self._local_conn)
                elif op.isfile(self._local_conn_mat_path):
                    self._local_conn = load_npz(self._local_conn_mat_path)
            elif self.local_corr == "allcorr":
                if self.clust_type == "ncut":
                    try:
                        raise ValueError(
                            "Must select either `tcorr` or `scorr` local "
                            "connectivity option if you are using "
                            "`ncut` clustering method")
                    except ValueError:
                        import sys
                        sys.exit(0)

                self._local_conn = "auto"
            else:
                try:
                    raise ValueError(
                        "Local connectivity method not recognized. Only tcorr,"
                        " scorr, and auto are currently "
                        "supported")
                except ValueError:
                    import sys
                    sys.exit(0)
        else:
            self._local_conn = "auto"
        return
Example No. 8
    def save(self):
        if self.save_file is not None:
            if self.sparse:
                ss.save_npz(self.save_file, self.matrix)
            else:
                self.matrix.tofile(self.save_file)
Example No. 9
# 	elif item == 'neutral':
# 		l.append(0.0)
# lab_test=l

#print type(fvec_test)
#print 'convert test to sparse'
#_test_arr = csr_matrix(fvec_test)
#print X_test_arr.data.nbytes
#sparse.save_npz("test/testmatrix.npz", X_test_arr)
#X_test_arr = sparse.load_npz("test/testmatrix.npz")
#print 'converted'

print('convert train to sparse')
X_train_arr = csr_matrix(fvec_train)
print(X_train_arr.data.nbytes)
sparse.save_npz("train/trainmatrix.npz", X_train_arr)
#X_train_arr = sparse.load_npz("train/trainmatrix.npz") #load saved sparse matrix
print('converted')
'''
#train_valid_test split
fvec_train, fvec_tes, lab_train, lab_te = train_test_split(X_train_arr, lab_train, test_size=0.0, random_state=1)
fvec_test, o, lab_test, o = train_test_split(X_test_arr, lab_test, test_size=0.0, random_state=1)



#convert to np arrays
X_train_arr=np.array(fvec_train).astype(float)
X_test_arr=np.array(fvec_test).astype(float)
y_train_arr = np.array(lab_train).astype(float)
y_test_arr = np.array(lab_test).astype(float)
Example No. 10
#exec(open("mkmatslowly.py").read())
import numpy as np
import scipy.sparse as ss
import json

# (from args) get start and end indices
index = 0
start = 0
end = 1

f = open("names.txt")
trk = [line[:-1] for line in f]
f.close()

mat = ss.csr_matrix((0,len(trk)), dtype=np.int8)
slc = start * 1000

for i in range(start, end):
	f = open(f"mpd.slice.{slc}-{slc+999}.json")
	j = json.load(f)
	f.close()
	m = np.zeros(shape=(1000,len(trk)), dtype=np.int8)
	for row in range(1000):
		pl = j["playlists"][row]
		t = (x["track_uri"] for x in pl["tracks"])
		m[row] = [int(x in t) for x in trk]
	slc += 1000
	mat = ss.vstack((mat, ss.csr_matrix(m)))
ss.save_npz("mat%02d.npz" % index, mat)
Example No. 11
def save_proximity(ds, radius, A):

    logger.info("Saving proximity matrix...")
    fname = os.path.join(ds.a.data_path, "proximity_radius_%s_%s.npz" %(str(radius), ds.a.brain_mask))
    save_npz(fname, A.tocoo())
    return
Example No. 12
def load_net_ann_datasets(out_dir, taxon, dataset, input_settings,
                          alg_settings, uniprot_taxon_file, **kwargs):
    sparse_net_file = "%s/%s-net.npz" % (out_dir, taxon)
    node2idx_file = sparse_net_file + "-node-ids.txt"
    swsn_weights_file = sparse_net_file + "-swsn-weights.txt"
    sparse_ann_file = "%s/ann.npz" % (out_dir)
    if not kwargs.get('forcenet') and \
            (os.path.isfile(sparse_net_file) and os.path.isfile(node2idx_file)) and \
            os.path.isfile(sparse_ann_file):
        print("Reading network from %s" % (sparse_net_file))
        W = sp.load_npz(sparse_net_file)
        print("\t%d nodes and %d edges" % (W.shape[0], len(W.data) / 2))
        print("Reading node names from %s" % (node2idx_file))
        prots = utils.readItemList(node2idx_file, 1)
        new_net_obj = setup.Sparse_Networks(W, prots)
        if os.path.isfile(swsn_weights_file):
            print("Reading swsn weights file %s" % (swsn_weights_file))
            weights = [
                float(w) for w in utils.readItemList(swsn_weights_file, 1)
            ]
            # also load the original networks to get the edge weights for the STRING networks
            net_obj = run_eval_algs.setup_net(input_settings['input_dir'],
                                              dataset, **kwargs)
            net_obj.swsn_weights = weights
        else:
            net_obj = new_net_obj
        print("\nReading annotation matrix from %s" % (sparse_ann_file))
        loaded_data = np.load(sparse_ann_file, allow_pickle=True)
        dag_matrix = setup.make_csr_from_components(loaded_data['arr_0'])
        ann_matrix = setup.make_csr_from_components(loaded_data['arr_1'])
        goids, prots = loaded_data['arr_2'], loaded_data['arr_3']
        ann_obj = setup.Sparse_Annotations(dag_matrix, ann_matrix, goids,
                                           prots)
        species_to_uniprot_idx = eval_loso.get_uniprot_species(
            uniprot_taxon_file, ann_obj)
        # TODO eval ann obj
        eval_ann_obj = None
    else:
        # load the network
        # TODO if a subset of the network was run, need to get that subset
        net_obj, ann_obj, eval_ann_obj = run_eval_algs.setup_dataset(
            dataset, input_settings['input_dir'], alg_settings, **kwargs)
        species_to_uniprot_idx = eval_loso.get_uniprot_species(
            uniprot_taxon_file, ann_obj)
        new_net_obj = net_obj
        # run SWSN if needed
        #if net_obj.multi_net:
        # TODO if LOSO was run, need to leave out the taxon for edge weights to be accurate
        if taxon is not None:
            if kwargs.get('limit_to_taxons_file'):
                # limit the network to the specified species
                # read in the specified taxons from the file
                _, net_taxons = eval_loso.get_selected_species(
                    species_to_uniprot_idx, kwargs['limit_to_taxons_file'])
                net_taxon_prots = net_exp.get_taxon_prots(
                    net_obj.nodes, net_taxons, species_to_uniprot_idx)
                net_obj, ann_obj = net_exp.limit_to_taxons(net_taxon_prots,
                                                           net_obj=net_obj,
                                                           ann_obj=ann_obj,
                                                           **kwargs)
            # leave out the annotations for this taxon ID
            train_ann_mat, test_ann_mat, sp_goterms = eval_loso.leave_out_taxon(
                taxon,
                ann_obj,
                species_to_uniprot_idx,
                eval_ann_obj=eval_ann_obj,
                **kwargs)
            taxon_prots = net_exp.get_taxon_prots(net_obj.nodes, [taxon],
                                                  species_to_uniprot_idx)
            new_net_obj = net_exp.limit_net_to_target_taxon(
                train_ann_mat, taxon_prots, net_obj, ann_obj, **kwargs)
            W = new_net_obj.W
        #    else:
        #        W, _ = net_obj.weight_SWSN(ann_obj.ann_matrix)
        #        #new_net_obj =
        else:
            W = net_obj.W
        print("\twriting sparse matrix to %s" % (sparse_net_file))
        sp.save_npz(sparse_net_file, W)
        print("\twriting node2idx labels to %s" % (node2idx_file))
        with open(node2idx_file, 'w') as out:
            out.write(''.join([
                "%s\t%d\n" % (prot, i) for i, prot in enumerate(net_obj.nodes)
            ]))
        if net_obj.multi_net:
            print("\twriting swsn weights file to %s" % (swsn_weights_file))
            with open(swsn_weights_file, 'w') as out:
                out.write('\n'.join([str(w)
                                     for w in new_net_obj.swsn_weights]) +
                          '\n')
                net_obj.swsn_weights = new_net_obj.swsn_weights
        # now store them to a file
        print("\twriting sparse annotations to %s" % (sparse_ann_file))
        # store all the data in the same file
        dag_matrix_data = setup.get_csr_components(ann_obj.dag_matrix)
        ann_matrix_data = setup.get_csr_components(ann_obj.ann_matrix)
        #np.savez_compressed(
        #    sparse_ann_file, dag_matrix_data=dag_matrix_data,
        #    ann_matrix_data=ann_matrix_data, goids=goids, prots=prots)
        np.savez_compressed(sparse_ann_file, dag_matrix_data, ann_matrix_data,
                            ann_obj.goids, ann_obj.prots)
    return net_obj, new_net_obj, ann_obj, eval_ann_obj, species_to_uniprot_idx
Example No. 13
    def test_construct_sparse_matrix(self):
        """Construct a sparse matrices of the horse racing dataset.

        What will be saved after running this suite
        -------------------------------------------
        """
        df = pd.read_csv(PATH2FEATURES)
        df = df.astype({GROUP_KEY: str})
        unique_hnames = df['hname'].unique()
        hname2ind = pd.get_dummies(unique_hnames)
        invalid_rids = df[df['odds'] == 0][GROUP_KEY].unique()
        context_cols = ['n_presi', 'n_avgsi4', 'n_disavgsi', 'n_goavgsi',
                        'w2c', 'eps', 'draw', 'newdis',
                        'jnowin', 'jwinper', 'jst1miss']

        # [START Construct Competitor & Entity Index Vector]
        n_horses = len(unique_hnames)
        indexing_features = sparse.coo_matrix((0, 2 * n_horses), dtype=np.int8)
        grouped = df[~df[GROUP_KEY].isin(invalid_rids)].groupby(GROUP_KEY)

        pbar = tqdm(total=len(grouped))
        pbar.set_description('Constructing Indexing Matrix...')
        for idx, (_, rdata) in enumerate(grouped):
            entries = rdata['hname']
            n_entries = len(entries)
            val = np.ones(np.power(n_entries, 2))  # shape = (n_entries ** 2, )
            row = np.array([np.repeat(i, n_entries) for i in np.arange(0, n_entries)]).flatten()  # shape = (n_entries ** 2, )
            entry_indices = np.array([hname2ind[hname2ind[hname] == 1].index[0] for hname in entries])

            # [START Obtain Column Indices]
            col = []
            for jdx, entry_index in enumerate(entry_indices):
                _copy = entry_indices.copy()
                combination_indices = np.delete(entry_indices, jdx)
                entity_index = entry_index + n_horses  # entity & entry
                this_col = np.append(combination_indices, entity_index)
                col += this_col.tolist()
            col = np.array(col)
            # [END Obtain Column Indices]

            index_matrix = sparse.coo_matrix((val, (row, col)), shape=(n_entries, 2 * n_horses), dtype=np.int8)

            # [START Assertion]
            if idx == len(grouped) - 1:
                for hname in entries:
                    nonzeros = index_matrix.toarray().nonzero()
                    target_ind = hname2ind[hname2ind[hname] == 1].index[0]
                    _index = nonzeros[0][n_entries - 1]
                    ind_as_entity = index_matrix.toarray().nonzero()[1][_index]
                    self.assertEqual(target_ind, ind_as_entity - n_horses)
            # [END Assertion]

            indexing_features = sparse.vstack([indexing_features, index_matrix])  # TODO the greater idx is, the slower...
            pbar.update(1)
        pbar.close()
        # [END Construct Competitor & Entity Index Vector]

        # [START Construct Context Vector & Target]
        n_train_rows = 0  # ← MAX Train data index
        n_contexts = len(context_cols)
        context_features = sparse.coo_matrix((0, n_contexts))
        target_series = []
        raceid_series = []
        pbar2 = tqdm(total=len(grouped))
        pbar2.set_description('Constructing Context Matrix ...')
        for idx, (_, rdata) in enumerate(grouped):
            context_matrix = sparse.coo_matrix(rdata[context_cols].values)
            context_features = sparse.vstack([context_features, context_matrix])
            target_series += rdata[TARGET_KEY].values.tolist()
            raceid_series += rdata[GROUP_KEY].values.tolist()
            # [START Get Train Test Split Index]
            if idx == np.round(TRAIN_SIZE * len(grouped)):
                n_train_rows, _ = context_features.shape
            # [END Get Train Test Split Index]
            pbar2.update(1)
        pbar2.close()
        # [END Construct Context Vector & Target]


        # Finally, concat indexing_features & context_features
        features = sparse.hstack((indexing_features, context_features))
        target_series = np.asarray(target_series)

        # Display Stats
        print('---' * 20)
        print('Summary')
        print(f'+ The number of races: {len(grouped)}')
        print(f'+ The number of horses: {n_horses}')
        print('Indexing Matrix Stats')
        print(f'+ Shape of sparse matrix: {indexing_features.shape}')
        print(f'+ The number of nonzero elems: {indexing_features.nnz}')
        print('Context Matrix Stats')
        print(f'+ Shape of sparse matrix: {context_features.shape}')
        print('Train Test Split')
        print(f'+ Maximum Train data row index: {n_train_rows}')
        print('---' * 20)

        # Unit Testing
        self.assertEqual(indexing_features.dtype, np.int8)
        self.assertEqual(indexing_features.shape[0], context_features.shape[0])
        self.assertEqual(features.shape[1], indexing_features.shape[1] + context_features.shape[1])
        self.assertEqual(features.shape[0], len(target_series))
        self.assertEqual(len(target_series), len(raceid_series))

        # Save the features & targets
        sparse.save_npz(FEATURES_OUTPUT, features)
        np.save(TARGETS_OUTPUT, target_series)
        with open(RACEIDS_OUTPUT, mode='wb') as fp:
            pickle.dump(raceid_series, fp)
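Regarding the `TODO the greater idx is, the slower...` comment in the loop above: a hedged sketch of an alternative that collects the per-race blocks in a list and stacks them once at the end. `build_index_matrix` is a hypothetical helper standing in for the per-race construction done in that loop; `grouped` is the same groupby object as in the test.

import numpy as np
from scipy import sparse

blocks = []
for _, rdata in grouped:
    # build_index_matrix is hypothetical: it stands in for the per-race coo_matrix construction above
    blocks.append(build_index_matrix(rdata))
# one vstack at the end copies the data once; repeated vstack inside the loop re-copies everything each iteration
indexing_features = sparse.vstack(blocks, format='coo', dtype=np.int8)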
Example No. 14
    def chromosome_coverage_read_counts(self, gene_overlap_dat, chrom_gene_df,
                                        chrom_exon_df, chrom):
        """
        Determine per-chromosome reads coverage and per-gene read counts from an RNA-seq experiment in
        a way that properly considers ambiguous reads - if a (paired) read falls entirely within the
        exonic regions of a *single* gene, only then does the read contribute to read count and coverage.
        The cigar scores from single and paired reads are parsed according to cigar_segment_bounds.

        1. Saves a compressed coverage array to self.save_dir for genes with no overlap with any other
         gene (a.k.a. "isolated genes"), with filename 'chrom_coverage_[sample_id]_[chrom].npz'
        2. Saves a dictionary of {gene_name: 1-d numpy gene coverage arrays (concatenated exonic regions)}
         to a serialized pickle file for all genes whose exonic regions overlap with other genes (a.k.a. "overlap genes"),
         with filename 'overlap_coverage_[sample_id]_[chrom].pkl'
        3. Saves read counts to self.save_dir with filename 'read_counts_[sample_id]_[chrom].csv'

        NOTE: if the required chromosome coverage files and read count file *already* exist prior to any coverage/read count
        calculations, Degnorm will default to using those files. This will only happen if a user either moves
        coverage and read count files from a prior Degnorm pipeline run to the appropriate chromosome directories
        of the target output directory, or if they re-use a Degnorm pipeline run's output directory. This is *NOT*
        the same as using a warm-start directory. A warm-start skips coverage/read count calculations entirely,
        assuming a prior Degnorm run successfully parsed all coverage/read counts.

        :param chrom_gene_df: pandas.DataFrame with `chr`, `gene`, `gene_start`, and `gene_end` columns
        that delineate the start and end position of a gene's transcript on a chromosome, must be
        subset to the chromosome in study.
        :param gene_overlap_dat: dictionary with keys 'isolated_genes' and 'overlap_genes' detailing
        groups of genes that do not overlap with others and then groups of genes that share any overlap.
        See gene_processing.get_gene_overlap_structure function.
        :param chrom_exon_df: pandas.DataFrame with `chr`, `gene`, `start`, `end` columns that delineate
        the start and end positions of exons on a gene.
        :param chrom: str chromosome name
        :return: None. Coverage and read count files are written to self.save_dir.
        """
        # First, load this chromosome's reads.
        if self.verbose:
            logging.info(
                'SAMPLE {0}, CHR {1} -- begin loading reads from {2}'.format(
                    self.sample_id, chrom, self.filename))

        # assess how many genes we have.
        n_genes = chrom_gene_df.shape[0]

        # gene_overlap_dat data check: ensure that number isolated genes + number overlapping genes
        # equals number of genes in genes DataFrame.
        n_isolated_genes, n_overlap_genes = 0, 0
        if gene_overlap_dat['isolated_genes']:
            n_isolated_genes = len(gene_overlap_dat['isolated_genes'])

        if gene_overlap_dat['overlap_genes']:
            n_overlap_genes = np.sum(
                [len(x) for x in gene_overlap_dat['overlap_genes']])

        if n_isolated_genes + n_overlap_genes != n_genes:
            raise ValueError(
                'number of genes contained in gene_overlap_dat does not match that of chrom_gene_df.'
            )

        # create filepaths to non-overlapping read coverage, overlapping read coverage, read count files.
        chrom_cov_file = os.path.join(
            self.save_dir,
            'chrom_coverage_' + self.sample_id + '_' + str(chrom) + '.npz')
        ol_cov_file = os.path.join(
            self.save_dir,
            'overlap_coverage_' + self.sample_id + '_' + str(chrom) + '.pkl')
        count_file = os.path.join(
            self.save_dir,
            'read_counts_' + self.sample_id + '_' + str(chrom) + '.csv')

        # if all required coverage, read count files are present, e.g. created from a previous run attempt,
        # then skip all calculations and default to the existing files. Addresses issue #30.
        if ((n_isolated_genes > 0 and os.path.isfile(chrom_cov_file)) or n_isolated_genes == 0) \
            and ((n_overlap_genes > 0 and os.path.isfile(ol_cov_file)) or n_overlap_genes == 0) \
            and (os.path.isfile(count_file)):

            if self.verbose:
                logging.info("""SAMPLE {0}, CHR {1} -- WARNING... All coverage and read count files already present:
                {0}
                {1}
                {2}
                Defaulting to these files; skipping coverage and read count calculations."""\
                             .format(chrom_cov_file, ol_cov_file, count_file))

            return None

        # initialize read counts.
        read_count_dict = {gene: 0 for gene in chrom_gene_df.gene}

        # set pandas.options.mode.chained_assignment = None to avoid SettingWithCopyWarnings
        set_option('mode.chained_assignment', None)

        # ---------------------------------------------------------------------- #
        # Step 1. Load chromosome's reads and index them.
        # ---------------------------------------------------------------------- #
        reads_df = self.load_chromosome_reads(chrom)

        if self.verbose:
            logging.info(
                'SAMPLE {0}, CHR {1} -- reads successfully loaded. shape = {2}'
                .format(self.sample_id, chrom, reads_df.shape))

        # append end position to reads based on cigar score.
        reads_df['end_pos'] = reads_df['pos'] + reads_df['cigar'].apply(
            lambda x: sum([int(k)
                           for k, v in re.findall(r'(\d+)([A-Z]?)', x)]))

        # assign row number to read ID column.
        reads_df['read_id'] = range(reads_df.shape[0])

        # easy win: drop reads whose start position is < minimum start position of a gene,
        # and drop reads whose end position is > maximum end position of a gene
        min_gene_start = chrom_gene_df.gene_start.min() - 1
        max_gene_end = chrom_gene_df.gene_end.max() - 1
        reads_df = reads_df[(reads_df.pos >= (min_gene_start))
                            & (reads_df.end_pos <= (max_gene_end))]

        # If working with paired reads,
        # ensure that we've sequestered paired reads (eliminate any query names only occurring once).
        if self.paired:
            qname_counts = reads_df.qname_unpaired.value_counts()
            paired_occ_reads = qname_counts[qname_counts ==
                                            2].index.values.tolist()
            reads_df = reads_df[reads_df.qname_unpaired.isin(paired_occ_reads)]

        # ---------------------------------------------------------------------- #
        # Step 2. Drop reads that don't fully fall within union of all exons.
        # ---------------------------------------------------------------------- #
        chrom_len = self.header[self.header.chr == chrom].length.iloc[0]
        tscript_vec = np.ones(
            [chrom_len], dtype=int)  # large vector, will delete after using.

        # build binary 0/1 exon/intron indicator vector.
        # Need to account for exon data being 1-indexed, tscript_vec is 0-indexed, but
        # exon end positions are inclusive.
        exon_starts = chrom_exon_df.start.values - 1
        exon_ends = chrom_exon_df.end.values
        for i in range(len(exon_starts)):
            tscript_vec[exon_starts[i]:exon_ends[i]] = 0

        del exon_starts, exon_ends
        gc.collect()

        # store read_ids of reads to drop, and initialize dropped read count.
        drop_reads = list()

        # store read match region bounds, so that we only parse CIGAR strings once.
        read_bounds = list()

        # use values array, faster access.
        dat = reads_df[['cigar', 'pos', 'read_id']].values

        # for paired reads, perform special parsing of CIGAR strings to avoid double-counting of overlap regions.
        if self.paired:
            for ii in np.arange(1, dat.shape[0], 2):

                # obtain read region bounds.
                bounds_1 = cigar_segment_bounds(dat[ii - 1, 0],
                                                start=dat[ii - 1, 1])
                bounds_2 = cigar_segment_bounds(dat[ii, 0], start=dat[ii, 1])

                # leverage nature of alignments of paired reads to find disjoint coverage ranges.
                min_bounds_1, max_bounds_1 = min(bounds_1), max(bounds_1)
                min_bounds_2, max_bounds_2 = min(bounds_2), max(bounds_2)

                if max_bounds_2 >= max_bounds_1:
                    bounds_2 = [
                        max_bounds_1 + 1 if j <= max_bounds_1 else j
                        for j in bounds_2
                    ]
                else:
                    bounds_2 = [
                        min_bounds_1 - 1 if j >= min_bounds_1 else j
                        for j in bounds_2
                    ]
                    bounds_2.sort()

                # aggregate read pair's bounds.
                bounds = bounds_1 + bounds_2

                # iterate over match regions. If a single region is not fully contained
                # within exon regions, drop the pair.
                drop_read = False
                for j in np.arange(1, len(bounds), step=2):

                    # check whether matching regions on tscript_vec are fully contained within exonic regions.
                    # note that right-bounds are inclusive.
                    if np.sum(
                            tscript_vec[(bounds[j - 1]):(bounds[j] + 1)]) > 0:
                        drop_read = True

                # append read id to set of read indices to drop (if appropriate).
                if drop_read:
                    drop_reads.extend([dat[ii - 1, 2], dat[ii, 2]])

                # otherwise, append match region bounds list. Note: endpoints of regions are inclusive.
                else:
                    read_bounds.append(bounds)

        # for single-read RNA-Seq experiments, we do not need such special consideration.
        else:
            for ii in np.arange(dat.shape[0]):
                # obtain read regions bounds.
                bounds = cigar_segment_bounds(dat[ii, 0], start=dat[ii, 1])

                # iterate over match regions. If a single region is not fully contained
                # within exon regions, drop the read.
                drop_read = False
                for j in np.arange(1, len(bounds), step=2):

                    if np.sum(
                            tscript_vec[(bounds[j - 1]):(bounds[j] + 1)]) > 0:
                        drop_read = True

                # append read id to set of read indices to drop (if appropriate).
                if drop_read:
                    drop_reads.append(dat[ii, 2])

                # otherwise, append match region bounds list. Note: endpoints of regions are inclusive.
                else:
                    read_bounds.append(bounds)

        # drop reads that don't fully intersect exonic regions.
        if drop_reads:
            reads_df = reads_df[~reads_df.read_id.isin(drop_reads)]

        if self.paired:
            # if paired reads, don't actually need .1 and .2 constituent reads anymore.
            # So to save time + memory, take every other read.
            reads_df = reads_df.iloc[np.arange(1, reads_df.shape[0], step=2)]

        # add parsed match region bounds to reads!
        reads_df['bounds'] = read_bounds

        # delete objs, attempt to save on memory.
        del tscript_vec, drop_reads, dat, read_bounds
        gc.collect()

        # ---------------------------------------------------------------------- #
        # Step 3. Compute coverage, reads across groups of mutually overlapping genes.
        # (This is costly from a time perspective. Should constitute
        #  coverage, read count calculations for ~ 10-20% of genes.)
        # ---------------------------------------------------------------------- #

        # display summary statistics around rate of gene intersection.
        if self.verbose:
            logging.info(
                'SAMPLE {0}, CHR {1} -- overlap genes = {2} / {3}.'.format(
                    self.sample_id, chrom, n_overlap_genes, n_genes))
            logging.info(
                'SAMPLE {0}, CHR {1} -- begin overlap gene group reads processing.'
                .format(self.sample_id, chrom))

        # for genes in a group of overlapping genes, compute read coverage + count.
        if n_overlap_genes > 0:

            ol_cov_dict = dict()

            # iterate over groups of overlapping genes.
            for ol_genes in gene_overlap_dat['overlap_genes']:

                ol_gene_df = chrom_gene_df[chrom_gene_df.gene.isin(ol_genes)]
                ol_gene_group_start = ol_gene_df.gene_start.min() - 1
                ol_gene_group_end = ol_gene_df.gene_end.max() - 1

                ol_gene_starts = list()
                gene_exon_bounds = list()
                transcript_idx = list(
                )  # list of 1-d np.arrays, each holding one overlapping gene's exon positioning.

                # obtain exon regions for each gene in overlap group.
                # Exon starts/ends are 1-indexed, change them to be 0-indexed.
                for ol_gene in ol_genes:
                    ol_gene_exon_df = chrom_exon_df[chrom_exon_df.gene ==
                                                    ol_gene]

                    # store gene starts for constructing per-gene coverage vectors.
                    # 0-index gene starts/ends.
                    ol_gene_start = ol_gene_exon_df.gene_start.iloc[0] - 1
                    ol_gene_end = ol_gene_exon_df.gene_end.iloc[0] - 1
                    ol_gene_starts.append(ol_gene_start)

                    # initialize gene coverage vector for each gene in overlap group.
                    ol_cov_dict[ol_gene] = np.zeros(
                        [ol_gene_end - ol_gene_start + 1], dtype=int)

                    # save gene exon positioning, for determining which reads captured by which genes.
                    # 0-index exon positions, and include gene end positioning.
                    e_starts, e_ends = np.sort(
                        ol_gene_exon_df.start.values) - 1, np.sort(
                            ol_gene_exon_df.end.values)
                    gene_exon_bounds += [[
                        [e_starts[j], e_ends[j]] for j in range(len(e_starts))
                    ]]  # list of list of lists, includes exon end pos.
                    transcript_idx.append(
                        np.unique(
                            fill_in_bounds(flatten_2d(gene_exon_bounds[-1])))
                    )  # transcript vector is 0-indexed, includes exon end pos.

                # drop things we don't need any more.
                del ol_gene_df, ol_gene_exon_df, e_starts, e_ends

                # storage for reads to drop.
                drop_reads = list()

                # subset reads to those that start and end within scope of this bloc of overlapping genes.
                ol_reads_dat = reads_df[(reads_df.pos >= (ol_gene_group_start))
                                        & (reads_df.end_pos <=
                                           (ol_gene_group_end))][[
                                               'bounds', 'read_id'
                                           ]].values

                # for single-read RNA-Seq experiments, we do not need such special consideration.
                for i in range(ol_reads_dat.shape[0]):

                    # obtain read regions bounds.
                    read_bounds, read_id = ol_reads_dat[i, :]

                    # find genes that fully include this read. Everything is 0-indexed.
                    caught_genes = self.determine_full_inclusion(
                        read_bounds, gene_exon_bounds=gene_exon_bounds)

                    # Ambiguous read determination logic:
                    # - if paired reads lie fully within 0 or 2+ genes, do not use the reads pair and drop them.
                    # - if read lies fully within a single gene:
                    #    - do not drop it.
                    #    - if the caught gene is the current gene being analyzed, use the read. O/w do not.
                    n_caught_genes = len(caught_genes)

                    # if only one gene captures read, use the read and identify capturing gene for
                    # incrementing count, but drop it from consideration later (it's been accounted for).
                    # if the only full intersection is with a single gene, increment coverage and read count
                    # for that gene, and drop read.
                    # Note: need to restart coverage calculations relative to gene's start position.
                    if n_caught_genes == 1:
                        drop_read = True
                        read_gene = ol_genes[caught_genes[0]]
                        read_gene_start = ol_gene_starts[caught_genes[0]]
                        read_idx = fill_in_bounds(
                            read_bounds, endpoint=True) - read_gene_start - 1
                        ol_cov_dict[read_gene][read_idx] += 1
                        read_count_dict[read_gene] += 1

                    # if no gene fully captures the read, do not use read *but do not drop it*,
                    # for the possibility that some isolated gene captures the read later on.
                    elif n_caught_genes == 0:
                        drop_read = False

                    # if > 1 gene fully captures the read,
                    # do not use read and drop it from consideration.
                    else:
                        drop_read = True

                    # if need be, add read to list of reads to be dropped.
                    if drop_read:
                        drop_reads.append(read_id)

                # drop ambiguous reads from larger set of chromosome reads,
                # should speed up gene-read searches in the future.
                if drop_reads:
                    reads_df = reads_df[~reads_df.read_id.isin(drop_reads)]

                    del drop_reads

                # pare down coverage vectors for genes in overlap group to their concatenated exon regions.
                for i in range(len(ol_genes)):
                    ol_gene = ol_genes[i]
                    ol_cov_dict[ol_gene] = ol_cov_dict[ol_gene][
                        transcript_idx[i] - ol_gene_starts[i]]

            # ---------------------------------------------------------------------- #
            # Step 3.5: save overlapping genes' coverage vectors.
            # overlapping gene coverage vector dict ->> pkl file.
            # ---------------------------------------------------------------------- #
            if self.verbose:
                logging.info(
                    'SAMPLE {0}, CHR {1} -- saving overlapping gene coverage vectors.'
                    .format(self.sample_id, chrom))

            # dump overlapping genes' coverage matrices.
            with open(ol_cov_file, 'wb') as f:
                pkl.dump(ol_cov_dict, f)

            # free up some memory -- delete groups of intersecting genes, etc.
            del ol_reads_dat, ol_cov_dict, transcript_idx, gene_exon_bounds
            gc.collect()

            if self.verbose:
                logging.info(
                    'SAMPLE {0}, CHR {1} -- overlapping gene reads processing successful.'
                    .format(self.sample_id, chrom))

        # ---------------------------------------------------------------------- #
        # Step 4. Compute coverage, reads for individual isolated genes.
        # ---------------------------------------------------------------------- #
        if n_isolated_genes > 0:

            if self.verbose:
                logging.info(
                    'SAMPLE {0}, CHR {1} -- begin isolated gene reads processing.'
                    .format(self.sample_id, chrom))

            # reduce chrom_gene_df to remaining genes
            chrom_gene_df = chrom_gene_df[chrom_gene_df.gene.isin(
                gene_overlap_dat['isolated_genes'])]

            # run same inclusion/exclusion transcript test but on the isolated genes.
            tscript_vec = np.ones([chrom_len], dtype=int)

            # identify regions of chromosome covered by isolated genes.
            # change gene starts/ends to 0-indexed to match 0-indexed tscript_vec array, but
            # gene ends are inclusive.
            gene_starts = chrom_gene_df.gene_start.values - 1
            gene_ends = chrom_gene_df.gene_end.values
            for i in range(len(gene_starts)):
                tscript_vec[gene_starts[i]:gene_ends[i]] = 0

            # identify reads that do not fall within an isolated gene's (start, end).
            drop_reads = list()
            dat = reads_df[['pos', 'end_pos', 'read_id']].values
            for i in range(dat.shape[0]):
                read_start, read_end, read_id = dat[i, :]

                # remember to include read end position. reads are 0-indexed.
                if np.sum(tscript_vec[read_start:(read_end + 1)]) > 0:
                    drop_reads.append(read_id)

            # drop memory hogs.
            del dat, gene_starts, gene_ends, tscript_vec

            # drop reads that do not lie completely within area covered by isolated genes.
            if drop_reads:
                reads_df = reads_df[~reads_df.read_id.isin(drop_reads)]

            del drop_reads
            gc.collect()

            # (a precaution) only continue if we have any reads intersecting isolated genes.
            if not reads_df.empty:

                # initialize chromosome coverage array.
                cov_vec = np.zeros([chrom_len], dtype=int)

                # ---------------------------------------------------------------------- #
                # Step 4.5.1: join genes on reads data
                # so that each read is tied to a gene, for read counting purposes.
                # ---------------------------------------------------------------------- #

                # 0-index gene_starts, gene_ends because reads are 0-indexed.
                chrom_gene_df.loc[:, ['gene_start', 'gene_end']] -= 1

                # add IntervalIndex index to chromosome gene data.
                chrom_gene_df.index = IntervalIndex.from_arrays(
                    chrom_gene_df.gene_start,
                    right=chrom_gene_df.gene_end,
                    closed='both')

                try:
                    reads_df['gene'] = chrom_gene_df.loc[
                        reads_df.pos].gene.values

                # if there remains at least one read that doesn't land within a gene span,
                # try another sweep to remove reads not within gene regions.
                except KeyError:

                    # outline valid read start positions along transcript.
                    tscript_vec = np.ones([chrom_len], dtype=int)

                    for i in range(chrom_gene_df.shape[0]):
                        left = chrom_gene_df.index[i].left
                        right = chrom_gene_df.index[i].right + 1
                        tscript_vec[left:right] = 0

                    # iterate over reads, checking whether read start position falls within
                    # a [gene_start, gene_end] region.
                    drop_reads = list()
                    for i in range(reads_df.shape[0]):
                        if tscript_vec[reads_df.pos.iloc[i]] != 0:
                            drop_reads.append(reads_df.read_id.iloc[i])

                    # drop reads that do not start within valid [gene_start, gene_end] regions.
                    if drop_reads:
                        reads_df = reads_df[~reads_df.read_id.isin(drop_reads)]

                    del tscript_vec, drop_reads
                    gc.collect()

                    # subset reads to reads w/ valid read ID, then join with interval index again.
                    reads_df['gene'] = chrom_gene_df.loc[
                        reads_df.pos].gene.values

                # loop over reads for isolated genes, incrementing read count and coverage.
                dat = reads_df[['bounds', 'gene']].values
                for i in range(dat.shape[0]):
                    bounds, gene = dat[i, :]

                    # reads are already 0-indexed.
                    read_idx = fill_in_bounds(bounds, endpoint=True)

                    # increment coverage and read count.
                    cov_vec[read_idx] += 1
                    read_count_dict[gene] += 1

                # ---------------------------------------------------------------------- #
                # Step 4.5.2: save chromosome coverage vector.
                # chromosome coverage vector ->> compressed csr sparse matrix
                # ---------------------------------------------------------------------- #
                if self.verbose:
                    logging.info(
                        'SAMPLE {0}, CHR {1} -- saving csr-compressed chrom coverage array.'
                        .format(self.sample_id, chrom))

                # save coverage vector as a compressed-sparse row matrix.
                sparse.save_npz(chrom_cov_file,
                                matrix=sparse.csr_matrix(cov_vec))

                # drop large data objects.
                del cov_vec, dat, reads_df

            # drop remaining large data objects.
            del chrom_gene_df, chrom_exon_df
            gc.collect()

            if self.verbose:
                logging.info(
                    'SAMPLE {0}, CHR {1} -- isolated gene reads processing successful.'
                    .format(self.sample_id, chrom))

        # ---------------------------------------------------------------------- #
        # Step 5. Save read counts.
        # chromosome read counts ->> .csv file
        # ---------------------------------------------------------------------- #
        # construct read count DataFrame from read count dictionary.
        read_count_df = DataFrame({
            'gene':
            list(read_count_dict.keys()),
            self.sample_id:
            list(read_count_dict.values())
        })

        del read_count_dict
        gc.collect()

        if self.verbose:
            logging.info(
                'SAMPLE {0}, CHR {1} -- mean per-gene read count: {2:.4}'.
                format(self.sample_id, chrom,
                       read_count_df[self.sample_id].mean()))
            logging.info('SAMPLE {0}, CHR {1} -- saving read counts.'.format(
                self.sample_id, chrom))

        # save sample's chromosome read counts to .csv for joining later.
        read_count_df.to_csv(count_file, index=False)
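For reference, a small sketch (not part of the class above) of how a saved per-chromosome coverage file could be read back into a flat array; the filename pattern mirrors chrom_cov_file above, and save_dir, sample_id and chrom are placeholders:

import os
from scipy import sparse

def load_chrom_coverage(save_dir, sample_id, chrom):
    # the coverage vector was stored as a 1 x chrom_len csr matrix via sparse.save_npz
    cov_file = os.path.join(save_dir, 'chrom_coverage_' + str(sample_id) + '_' + str(chrom) + '.npz')
    return sparse.load_npz(cov_file).toarray().ravel()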
Example No. 15
        help=
        "Select the level at which labels in the same clusters are combined.",
    )
    parser.add_argument(
        "--bin-size",
        type=int,
        default=2,
        help="How many labels to group for each synthetic composite label.",
    )
    args = parser.parse_args()

    data = args.data
    bin_size = args.bin_size
    combine_level = args.level
    folder = f"./dataset/xmc-base/{data}/"
    out_folder = f"./dataset-binned/{data}/"
    C = cluster_chain(f"./model/{data}/ranker/**/C.npz", combine_level)
    os.makedirs(out_folder, exist_ok=True)

    ytr = sps.load_npz(folder + "Y.trn.npz")
    yte = sps.load_npz(folder + "Y.tst.npz")

    mapper, new_nr_labels = combine_from_cluster(C, bin_size)
    new_ytr = combine_Y(mapper, new_nr_labels, ytr)
    new_yte = combine_Y(mapper, new_nr_labels, yte)
    invert_mapper = inversion_mapper(mapper)
    with open(out_folder + "mapper.pkl", "wb") as writer:
        pkl.dump(invert_mapper, writer)
    sps.save_npz(out_folder + "Y.trn.npz", new_ytr)
    sps.save_npz(out_folder + "Y.tst.npz", new_yte)
Example No. 16
def main():
    global args
    args = Params()

    if args.use_alexnet:
        print("Using pre-trained alexnet")
        model = models.alexnet(pretrained=True)
        model.classifier[6] = nn.Linear(4096, args.num_classes)
    else:
        print("Using pre-trained inception_v3")
        # inception is changed to accept variable size inputs
        model = inception_v3(pretrained=True)
        model.fc = nn.Linear(2048, args.num_classes)
        model.aux_logits = False

    model = model.cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            args.best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # data loading code
    train_dataset = birdsnap_loader.BS(args.data_root,
                                       args.meta_data,
                                       split_name='train',
                                       im_size_crop=args.im_size_crop,
                                       im_size_resize=args.im_size_resize,
                                       is_train=True)
    val_dataset = birdsnap_loader.BS(args.data_root,
                                     args.meta_data,
                                     split_name=args.split_name,
                                     im_size_crop=args.im_size_crop,
                                     im_size_resize=args.im_size_resize,
                                     is_train=False)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        prec1, preds, im_ids, feats = validate(val_loader, model, criterion,
                                               True, True)
        # write predictions to file
        if args.save_preds:
            # save dense
            #np.save(args.op_file_name, feats)

            # save sparse
            feats[feats < 0.000001] = 0.0
            sp = sparse.csr_matrix(feats)
            sparse.save_npz(args.op_file_name + '_sparse', sp)

            # with open(args.op_file_name, 'w') as opfile:
            #     opfile.write('id,predicted\n')
            #     for ii in range(len(im_ids)):
            #         opfile.write(str(im_ids[ii]) + ',' + ' '.join(str(x) for x in preds[ii,:])+'\n')
        return

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion, False)
        is_better = prec1 > args.best_prec1
        # remember best Acc@1 and save checkpoint
        args.best_prec1 = max(prec1, args.best_prec1)
        model_state = {
            'epoch': epoch + 1,
            #'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': args.best_prec1,
            'optimizer': optimizer.state_dict()
        }

        torch.save(model_state, args.model_path + 'checkpoint.pth.tar')
        if is_better:
            print('\t* Saving new best model')
            torch.save(model_state, args.model_path + 'model_best.pth.tar')
Example No. 17
def prepare_kddcup10(data_name, min_interactions_per_user, kc_col_name,
					 remove_nan_skills, drop_duplicates=True):
	"""Preprocess KDD Cup 2010 datasets.

	Arguments:
	data_name -- "bridge_algebra06" or "algebra05"
	min_interactions_per_user -- minimum number of interactions per student
	kc_col_name -- Skills id column
	remove_nan_skills -- if True, remove interactions with no skill tag
	drop_duplicates -- if True, drop duplicates from dataset

	Outputs:
	df -- preprocessed KDD Cup 2010 dataset (pandas DataFrame)
	Q_mat -- corresponding q-matrix (item-skill relationships sparse array)
	"""
	folder_path = os.path.join("data", data_name)
	df = pd.read_csv(folder_path + "/data.txt", delimiter='\t').rename(columns={
		'Anon Student Id': 'user_id',
		'Problem Name': 'pb_id',
		'Step Name': 'step_id',
		kc_col_name: 'kc_id',
		'First Transaction Time': 'timestamp',
		'Correct First Attempt': 'correct'
	})[['user_id', 'pb_id', 'step_id', 'correct', 'timestamp', 'kc_id']]
	df["timestamp"] = pd.to_datetime(df["timestamp"])
	df["timestamp"] = df["timestamp"] - df["timestamp"].min()
	#df["timestamp"] = df["timestamp"].apply(lambda x: x.total_seconds() / (3600*24))
	df["timestamp"] = df["timestamp"].apply(lambda x: x.total_seconds()).astype(np.int64)
	df.sort_values(by="timestamp",inplace=True)
	df.reset_index(inplace=True,drop=True)
	df = df.groupby("user_id").filter(lambda x: len(x) >= min_interactions_per_user)

	# Create variables
	df["item_id"] = df["pb_id"]+":"+df["step_id"]
	df = df[['user_id', 'item_id', 'kc_id', 'correct', 'timestamp']]

	if drop_duplicates:
		df.drop_duplicates(subset=["user_id", "item_id", "timestamp"], inplace=True)
	
	if remove_nan_skills:
		df = df[~df["kc_id"].isnull()]
	else:
		df.ix[df["kc_id"].isnull(), "kc_id"] = 'NaN'

	# Create list of KCs
	listOfKC = []
	for kc_raw in df["kc_id"].unique():
		for elt in kc_raw.split('~~'):
			listOfKC.append(elt)
	listOfKC = np.unique(listOfKC)

	dict1_kc = {}
	dict2_kc = {}
	for k, v in enumerate(listOfKC):
		dict1_kc[v] = k
		dict2_kc[k] = v

	# Transform ids into numeric
	df["item_id"] = np.unique(df["item_id"], return_inverse=True)[1]
	df["user_id"] = np.unique(df["user_id"], return_inverse=True)[1]

	df.reset_index(inplace=True, drop=True) # Add unique identifier of the row
	df["inter_id"] = df.index

	# Build Q-matrix
	Q_mat = np.zeros((len(df["item_id"].unique()), len(listOfKC)))
	item_skill = np.array(df[["item_id","kc_id"]])
	for i in range(len(item_skill)):
		splitted_kc = item_skill[i,1].split('~~')
		for kc in splitted_kc:
			Q_mat[item_skill[i,0],dict1_kc[kc]] = 1

	df = df[['user_id', 'item_id', 'timestamp', 'correct', 'inter_id']]
	df = df[df.correct.isin([0,1])] # Remove potential continuous outcomes
	df['correct'] = df['correct'].astype(np.int32) # Cast outcome as int32
	
	# Save data
	sparse.save_npz(folder_path + "/q_mat.npz", sparse.csr_matrix(Q_mat))
	df.to_csv(folder_path + "/preprocessed_data.csv", sep="\t", index=False)

	return df, Q_mat
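
# A hedged usage sketch; the parameter values and the "KC(Default)" skill column
# name are illustrative assumptions, so check the header of the files you downloaded:
df, Q_mat = prepare_kddcup10(data_name="algebra05",
                             min_interactions_per_user=10,
                             kc_col_name="KC(Default)",
                             remove_nan_skills=True)
Q_loaded = sparse.load_npz(os.path.join("data", "algebra05", "q_mat.npz"))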
Ejemplo n.º 18
0
            temp = gmean(scipy.sparse.vstack(temp).todense() + 1e-15, axis=0)

        temp[temp < 1e-6] = 0

        return md5, csr_matrix(temp)

    gc.collect()

    print('[{}] Loading predictions...'.format(str(datetime.datetime.now())))
    ms = [scipy.sparse.load_npz(str(x).replace('csv', 'npz')) for x in tqdm(tta_file_names)]

    result = Parallel(n_jobs=12)(delayed(get_probs)(md5) for md5 in common_md5s)

    # result = [get_probs(i) for i in tqdm(file_names.index)]

    print('[{}] Unzipping...'.format(str(datetime.datetime.now())))

    pred_md5_list, probs = zip(*result)

    probs = vstack(probs)

    labels = pd.DataFrame({'md5': pred_md5_list})

    print('[{}] Saving labels...'.format(str(datetime.datetime.now())))

    labels.to_csv(str(model_path / (args.average_type + '_{model_type}_md5_list.csv'.format(model_type=args.model_type))), index=False)

    print('[{}] Saving predictions...'.format(str(datetime.datetime.now())))

    save_npz(str(model_path / (args.average_type + '_{model_type}_probs.npz'.format(model_type=args.model_type))), probs)
Ejemplo n.º 19
0
    elif average_type == 'gmean':
        temp = gmean(scipy.sparse.vstack(temp).todense() + 1e-15, axis=0)

    temp[temp < 1e-6] = 0

    return file_to_md5[image_name], csr_matrix(temp)


result_path = Path('data') / 'prediction' / 'global'
result_path.mkdir(exist_ok=True, parents=True)

result = Parallel(n_jobs=12)(delayed(get_probs)(i) for i in file_names.index)
#
# result = [get_probs(i) for i in tqdm(file_names.index)]

print('[{}] Unzipping...'.format(str(datetime.datetime.now())))

pred_md5_list, probs = zip(*result)

probs = vstack(probs)

labels = pd.DataFrame({'md5': pred_md5_list})

print('[{}] Saving labels...'.format(str(datetime.datetime.now())))

labels.to_csv(str(result_path / (average_type + '_last_md5_list.csv')), index=False)

print('[{}] Saving predictions...'.format(str(datetime.datetime.now())))

save_npz(str(result_path / (average_type + '_last_probs.npz')), probs)
Ejemplo n.º 20
0
                                     '{}naive'.format(args.partition))
    try:
        os.mkdir(partition_dataset)
    except FileExistsError:
        pass

    chunk_size = int(len(train_nid) / args.partition)
    for pid in range(args.partition):
        start_ofst = chunk_size * pid
        if pid == args.partition - 1:
            end_ofst = len(train_nid)
        else:
            end_ofst = start_ofst + chunk_size
        part_nid = train_nid[start_ofst:end_ofst]
        subadj, sub2fullid, subtrainid = get_sub_graph(dgl_g, part_nid,
                                                       args.num_hops)
        sublabel = labels[sub2fullid[subtrainid]]
        # files
        subadj_file = os.path.join(partition_dataset,
                                   'subadj_{}.npz'.format(str(pid)))
        sub_trainid_file = os.path.join(partition_dataset,
                                        'sub_trainid_{}.npy'.format(str(pid)))
        sub_train2full_file = os.path.join(
            partition_dataset, 'sub_train2fullid_{}.npy'.format(str(pid)))
        sub_label_file = os.path.join(partition_dataset,
                                      'sub_label_{}.npy'.format(str(pid)))
        spsp.save_npz(subadj_file, subadj)
        np.save(sub_trainid_file, subtrainid)
        np.save(sub_train2full_file, sub2fullid)
        np.save(sub_label_file, sublabel)
Ejemplo n.º 21
0
def _save_and_load(matrix):        
    with tempfile.NamedTemporaryFile(suffix='.npz') as file:
        file = file.name
        save_npz(file, matrix)
        loaded_matrix = load_npz(file)
    return loaded_matrix
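
# A hedged variant (assumes `os` is imported): on Windows a NamedTemporaryFile cannot
# be reopened by name while the handle is still open, so a TemporaryDirectory gives a
# more portable round-trip:
def _save_and_load_portable(matrix):
    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, 'matrix.npz')
        save_npz(path, matrix)
        return load_npz(path)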
Ejemplo n.º 22
0
                    # keep count
                    k += 1
                    if k % 10000 == 0:
                        print("%s/%s" % (k, num_tokens))

                    start = max(0, i - context_size)
                    end = min(len(line_as_idx), i + context_size)
                    for c in line_as_idx[start:i]:
                        wc_counts[w, c] += 1
                    for c in line_as_idx[i + 1:end]:
                        wc_counts[w, c] += 1

    time_cost = round((time() - t0) / 60, 2)
    print(f"Finished counting, time cost: {time_cost} mins")

    save_npz(pmi_path, csr_matrix(wc_counts))

else:
    wc_counts = load_npz(pmi_path)

# context counts get raised ^ 0.75
c_counts = wc_counts.sum(axis=0).A.flatten()**0.75
c_probs = c_counts / c_counts.sum()
c_probs = c_probs.reshape(1, V)

# PMI(w, c) = #(w, c) / #(w) / p(c)
# pmi = wc_counts / wc_counts.sum(axis=1) / c_probs # works only if numpy arrays
pmi = wc_counts.multiply(1.0 / wc_counts.sum(axis=1) / c_probs).tocsr()
# this operation changes it to a coo_matrix
# which doesn't have functions we need, e.g log1p()
# so convert it back to a csr
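
# A hedged continuation sketch (an assumption about the next step, not part of the
# original snippet; assumes numpy is imported as np): take the log over the stored
# entries only and clip negatives, which yields the usual positive PMI (PPMI) matrix.
ppmi = pmi.copy()
ppmi.data = np.log(ppmi.data)
ppmi.data[ppmi.data < 0] = 0
ppmi.eliminate_zeros()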
Ejemplo n.º 23
0
count = 0
def update_train(row):
  global count
  count += 1
  if count % 100000 == 0:
    print("processed: %.3f" % (float(count)/cutoff))

  i = int(row.userId)
  j = int(row.movie_idx)
  A[i,j] = row.rating
df_train.apply(update_train, axis=1)

# mask, to tell us which entries exist and which do not
A = A.tocsr()
mask = (A > 0)
save_npz("Atrain.npz", A)

# test ratings dictionary
A_test = lil_matrix((N, M))
print("Calling: update_test")
count = 0
def update_test(row):
  global count
  count += 1
  if count % 100000 == 0:
    print("processed: %.3f" % (float(count)/len(df_test)))

  i = int(row.userId)
  j = int(row.movie_idx)
  A_test[i,j] = row.rating
df_test.apply(update_test, axis=1)
Ejemplo n.º 24
0
#---------------------------------------------------------------------------
#
if __name__ == '__main__':
    data_file, until_idx = grabArguments()

    print('Loading Data...')
    data_filename, data_file_ext = path.splitext(data_file)
    if data_file_ext == '.npz':
        data = sparse.load_npz(data_file)
        is_sparse = True
    else:
        data = np.load(data_file, allow_pickle=True)
        is_sparse = False
    print(f'input shape: {data.shape}')

    print('Truncating Data...')
    if is_sparse:
        data = data.tocsr()[:until_idx]
    else:
        data = data[:until_idx]
    print(f'output shape: {data.shape}')

    print('Saving Data...')
    output_file = data_filename + '_truncated'

    if is_sparse:
        output_file += data_file_ext
        sparse.save_npz(output_file, data)
    else:
        np.save(output_file, data)
Ejemplo n.º 25
0
def data_final():
    data = pd.read_csv(data_path + "mergedDataFillna.csv")
    #print (data)
    # A quirk in the data: some users list two marriage statuses (and quite a few do). These cases are rare, so it should not matter much.
    one_hot_feature = [
        'LBS', 'age', 'carrier', 'consumptionAbility', 'education', 'gender',
        'house', 'os', 'ct', 'marriageStatus', 'advertiserId', 'campaignId',
        'creativeId', 'adCategoryId', 'productId', 'productType'
    ]
    # These features hold values like "598 872 2602 2964 1189 631 5606 5719 5859 5708 ...", e.g. interest tags: a user may have a long list of them, so they need special handling.
    vector_feature = [
        'appIdAction', 'appIdInstall', 'interest1', 'interest2', 'interest3',
        'interest4', 'interest5', 'kw1', 'kw2', 'kw3', 'topic1', 'topic2',
        'topic3'
    ]

    # This block maps the messy categorical features to integer codes 0, 1, 2, ... (marriage status is fine too: five or six categories at most, and multiple coexisting statuses are not a big problem).
    for feature in one_hot_feature:
        try:
            data[feature] = LabelEncoder().fit_transform(
                data[feature].apply(int))
        except:
            data[feature] = LabelEncoder().fit_transform(data[feature])

    train = data[data.label != -1]
    train_y = train.pop('label')  # pop removes the 'label' column from train and returns it
    test = data[data.label == -1]
    #print (test)
    res = test[['aid', 'uid']]
    test = test.drop('label', axis=1)
    # train_x=train[one_hot_feature]
    # test_x=test[one_hot_feature]
    # train_x["creativeSize"]=train["creativeSize"]
    # test_x["creativeSize"] = test["creativeSize"]
    train_x = train[[
        'creativeSize'
    ]]  # creativeSize was not processed above, so pull it out separately for later concatenation; [[ ]] returns a DataFrame, [ ] a Series
    test_x = test[['creativeSize']]

    # LightGBM does not strictly need one-hot encoding, but with this sparse storage there is no way to locate the original columns in LightGBM, so do one-hot anyway
    enc = OneHotEncoder()
    for feature in one_hot_feature:
        enc.fit(data[feature].values.reshape(-1, 1))
        train_a = enc.transform(train[feature].values.reshape(-1, 1))
        test_a = enc.transform(test[feature].values.reshape(-1, 1))
        train_x = sparse.hstack((train_x, train_a))  # concatenate sparse matrices
        test_x = sparse.hstack((test_x, test_a))
    print("one-hot prepared!")
    # CountVectorizer features; CountVectorizer is essentially a beefed-up version of one-hot encoding
    cv = CountVectorizer()
    for feature in vector_feature:
        #print (data[feature])
        #print (data[feature].dtypes)
        cv.fit(
            data[feature]
        )  # Lesson learned: fitting train and test separately leads to mismatched categories (e.g. an interest tag present in test but not in train) that need extra handling; fitting on the combined data avoids that entirely
        train_a = cv.transform(train[feature])
        #print (train_a)    # from this output, CountVectorizer already yields a sparse matrix
        test_a = cv.transform(test[feature])
        train_x = sparse.hstack((train_x, train_a))
        test_x = sparse.hstack((test_x, test_a))  # stored as a sparse matrix, which LightGBM can consume directly
    print("cv prepared!")
    #print (train_x)
    #print (train_y)
    #print (test_x)
    #print (train_x)
    sparse.save_npz(data_path + "train_x.npz", train_x)
    sparse.save_npz(data_path + "test_x.npz", test_x)
    train_y.to_csv(data_path + "train_y.csv", index=False)
    res.to_csv(data_path + "res.csv", index=False)
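
# A hedged follow-up sketch (not part of the original function): LightGBM accepts
# scipy sparse CSR input directly, so the saved features can be used as-is. Whether
# train_y.csv contains a header row depends on the pandas version that wrote it.
import lightgbm as lgb

def train_lgb():
    train_x = sparse.load_npz(data_path + "train_x.npz").tocsr()
    train_y = pd.read_csv(data_path + "train_y.csv").iloc[:, 0].values
    dtrain = lgb.Dataset(train_x, label=train_y)
    params = {"objective": "binary", "metric": "auc"}  # illustrative parameters
    return lgb.train(params, dtrain, num_boost_round=100)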
Ejemplo n.º 27
0
    Prepare sparse features
    '''
    X = {}
    X['users'] = onehotize(df['user'], config['nb_users'])
    X['items'] = onehotize(df['item'], config['nb_items'])
    if 'skill' in df:
        X['skills'] = onehotize(df['skill'], config['nb_skills'])
        X['wins'] = X['skills'].copy()
        X['wins'].data = df['wins']
        X['fails'] = X['skills'].copy()
        X['fails'].data = df['fails']
    X_train = hstack([X[agent] for agent in active_features]).tocsr()
    y_train = df['correct'].values
    return X_train, y_train


df = pd.read_csv('data.csv')
with open('config.yml') as f:
    config = yaml.safe_load(f)
print('Configuration', config)
X, y = df_to_sparse(df, config, active_features)
print(df.head())
if options.dataset == 'dummy':
    print(X.todense())

save_npz('X-{:s}.npz'.format(features_suffix), X)
np.save('y-{:s}.npy'.format(features_suffix), y)
print(
    'Successfully created X-{:s}.npz and y-{:s}.npy in data/{} folder'.format(
        features_suffix, features_suffix, options.dataset))
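
# A hedged usage sketch: the saved design matrix and labels can be reloaded and used
# with any estimator that accepts scipy sparse input (file names mirror those above).
from scipy.sparse import load_npz
from sklearn.linear_model import LogisticRegression

X_loaded = load_npz('X-{:s}.npz'.format(features_suffix))
y_loaded = np.load('y-{:s}.npy'.format(features_suffix))
clf = LogisticRegression(max_iter=1000).fit(X_loaded, y_loaded)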
Ejemplo n.º 28
0
def create_affinity(X,
                    knn,
                    scale=None,
                    alg="annoy",
                    savepath=None,
                    W_path=None):
    N, D = X.shape
    if W_path is not None:
        if W_path.endswith('.mat'):
            W = sio.loadmat(W_path)['W']
        elif W_path.endswith('.npz'):
            W = sparse.load_npz(W_path)
    else:

        print('Compute Affinity ')
        start_time = timeit.default_timer()
        if alg == "flann":
            print('with Flann')
            flann = FLANN()
            knnind, dist = flann.nn(X,
                                    X,
                                    knn,
                                    algorithm="kdtree",
                                    target_precision=0.9,
                                    cores=5)
            # knnind = knnind[:,1:]
        # elif alg == "annoy":
        #     print('with annoy')
        #     ann = AnnoyIndex(D, metric='euclidean')
        #     for i, x_ in enumerate(X):
        #         ann.add_item(i, x_)
        #     ann.build(50)
        #     knnind = np.empty((N, knn))
        #     dist = np.empty((N, knn))
        #     for i in range(len(X)):
        #         nn_i = ann.get_nns_by_item(i, knn, include_distances=True)
        #         knnind[i,:] = np.array(nn_i[0])
        #         dist[i,:] = np.array(nn_i[1])
        else:
            nbrs = NearestNeighbors(n_neighbors=knn).fit(X)
            dist, knnind = nbrs.kneighbors(X)

        row = np.repeat(range(N), knn - 1)
        col = knnind[:, 1:].flatten()
        if scale is None:
            data = np.ones(X.shape[0] * (knn - 1))
        else:
            data = np.exp((-dist[:, 1:]**2) / (2 * scale**2)).flatten()

        W = sparse.csc_matrix((data, (row, col)), shape=(N, N), dtype=np.float64)
        # W = (W + W.transpose(copy=True)) /2
        elapsed = timeit.default_timer() - start_time
        print(elapsed)

        if isinstance(savepath, str):
            if savepath.endswith('.npz'):
                sparse.save_npz(savepath, W)
            elif savepath.endswith('.mat'):
                sio.savemat(savepath, {'W': W})

    return W
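
# A hedged usage sketch (the data and parameter values are illustrative; the
# symmetrization mirrors the commented-out line inside the function):
X_demo = np.random.rand(200, 16)
W_demo = create_affinity(X_demo, knn=10, scale=1.0, savepath="affinity_knn10.npz")
W_sym = (W_demo + W_demo.transpose()) / 2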
Ejemplo n.º 29
0
def main():
    # get args
    args = TransformerMatcher.get_args_and_set_logger()["args"]

    # do_train and save model
    if args.do_train:
        # setup output_dir
        if os.path.exists(args.output_dir) and os.listdir(
                args.output_dir
        ) and args.do_train and not args.overwrite_output_dir:
            raise ValueError(
                "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
                .format(args.output_dir))
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)

        # load data
        with open(args.trn_feat_path, "rb") as fin:
            X_trn = pickle.load(fin)
        C_trn = smat.load_npz(args.trn_label_path)

        # prepare transformer pretrained models
        TransformerMatcher.set_device(args)
        matcher = TransformerMatcher(num_clusters=C_trn.shape[1])
        matcher.prepare_model(args)

        # train
        matcher.train(args, X_trn, C_trn)
        if args.local_rank in [-1, 0]:
            matcher.save_model(args)

    # do_eval on test set and save prediction output
    if args.do_eval:
        # we only support multigpu mode but not distributed mode
        assert args.local_rank == -1

        # load data
        with open(args.trn_feat_path, "rb") as fin:
            X_trn = pickle.load(fin)
        with open(args.tst_feat_path, "rb") as fin:
            X_tst = pickle.load(fin)
        C_trn = smat.load_npz(args.trn_label_path)
        C_tst = smat.load_npz(args.tst_label_path)

        # load fine-tuned model in the args.output_dir
        TransformerMatcher.set_device(args)
        matcher = TransformerMatcher(num_clusters=C_trn.shape[1])
        args.model_type = args.model_type.lower()
        config_class, model_class, tokenizer_class = MODEL_CLASSES[
            args.model_type]
        matcher.config = config_class.from_pretrained(args.output_dir)
        matcher.config.output_hidden_states = True
        model = model_class.from_pretrained(args.output_dir,
                                            config=matcher.config)
        model.to(args.device)
        matcher.model = model

        # predict
        trn_loss, trn_metrics, C_trn_pred, trn_embeddings = matcher.predict(
            args, X_trn, C_trn, topk=args.only_topk, get_hidden=True)
        tst_loss, tst_metrics, C_tst_pred, tst_embeddings = matcher.predict(
            args, X_tst, C_tst, topk=args.only_topk, get_hidden=True)
        logger.info("| matcher_trn_prec {}".format(" ".join(
            "{:4.2f}".format(100 * v) for v in trn_metrics.prec)))
        logger.info("| matcher_trn_recl {}".format(" ".join(
            "{:4.2f}".format(100 * v) for v in trn_metrics.recall)))
        logger.info("| matcher_tst_prec {}".format(" ".join(
            "{:4.2f}".format(100 * v) for v in tst_metrics.prec)))
        logger.info("| matcher_tst_recl {}".format(" ".join(
            "{:4.2f}".format(100 * v) for v in tst_metrics.recall)))

        # save C_trn_pred.npz and trn_embedding.npy
        trn_csr_codes = rf_util.smat_util.sorted_csr(C_trn_pred,
                                                     only_topk=args.only_topk)
        trn_csr_codes = transform_prediction(trn_csr_codes,
                                             transform="lpsvm-l2")
        csr_codes_path = os.path.join(args.output_dir, "C_trn_pred.npz")
        smat.save_npz(csr_codes_path, trn_csr_codes)
        embedding_path = os.path.join(args.output_dir, "trn_embeddings.npy")
        np.save(embedding_path, trn_embeddings)

        # save C_eval_pred.npz and tst_embedding.npy
        tst_csr_codes = rf_util.smat_util.sorted_csr(C_tst_pred,
                                                     only_topk=args.only_topk)
        tst_csr_codes = transform_prediction(tst_csr_codes,
                                             transform="lpsvm-l2")
        csr_codes_path = os.path.join(args.output_dir, "C_tst_pred.npz")
        smat.save_npz(csr_codes_path, tst_csr_codes)
        embedding_path = os.path.join(args.output_dir, "tst_embeddings.npy")
        np.save(embedding_path, tst_embeddings)
Ejemplo n.º 30
0
def save_term_by_document_matrix(matrix):
    filepath = os.path.join(os.getcwd(), "results",
                            "term_by_document_matrix.npz")
    sparse.save_npz(filepath, matrix)
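
# A hedged companion loader (assumed to mirror the path convention used above):
def load_term_by_document_matrix():
    filepath = os.path.join(os.getcwd(), "results",
                            "term_by_document_matrix.npz")
    return sparse.load_npz(filepath)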
Ejemplo n.º 31
0
def h_construction(outfile, WithRTPrediction_, WithCosineH_, initRT_tuple, initRT_width,\
     V_mat, W_mat, noise_number, globalparam_list, iso_maxnumber, gaussian_width, cos_cutoff, output_file):
    gc.disable()
    V_shape, W_shape = V_mat.shape, W_mat.shape
    Hpeak_mean = 1 * W_shape[0] / W_shape[1]  #proofed per row
    if WithRTPrediction_:
        if W_shape[1] - noise_number != len(initRT_tuple):
            print('Warning: W_shape is wrong')
            exit()
        H_mat, initRT_correct = h_prediction(initRT_tuple, initRT_width, \
                 Hpeak_mean, noise_number, globalparam_list, gaussian_width)
        # H_mat = H_mat.todense()
    else:
        H_mat = h_constant(V_shape, W_shape, Hpeak_mean, noise_number)
        initRT_correct = None

    if WithCosineH_:
        H_size_rowcut = 250
        V_mat = V_mat.tocsr()
        W_mat = W_mat.tocsc()
        H_mat = H_mat.tocsr()

        if H_mat.shape[0] > H_size_rowcut:
            H_mat_list, H_cosmat_list = [], []
            for row in np.arange(0, H_mat.shape[0], H_size_rowcut):
                if row + H_size_rowcut >= H_mat.shape[0]:
                    row_end = H_mat.shape[0]
                else:
                    row_end = row + H_size_rowcut
                H_mat_out, H_cosmatdub_out = h_cos(V_mat, W_mat[:,
                                                                row:row_end],
                                                   H_mat[row:row_end, :],
                                                   iso_maxnumber, cos_cutoff)
                H_mat_list.append(H_mat_out)
                del (H_mat_out)
                H_cosmat_list.append(H_cosmatdub_out)
                del (H_cosmatdub_out)

            H_cosmat = sparse.vstack(H_cosmat_list)
            H_cosmat = H_cosmat.tocoo()
            del (H_cosmat_list)
            H_mat = sparse.vstack(H_mat_list)
            del (H_mat_list)
        else:
            H_mat, H_cosmat = h_cos(V_mat, W_mat, H_mat, iso_maxnumber,
                                    cos_cutoff)
    else:
        H_cosmat = None

    if noise_number > 0:
        noise_mat = np.zeros((noise_number, H_mat.shape[1]))
        noise_mat += Hpeak_mean / H_mat.shape[1]
        try:
            H_mat = sparse.vstack([H_mat, noise_mat])
            H_mat.tocoo()
        except:
            H_mat = np.vstack([H_mat, noise_mat])
            # change to sparse if better for memory
            if np.count_nonzero(H_mat) * 3 < H_mat.size:
                H_mat = sparse.coo_matrix(H_mat)
    sparse.save_npz(str(output_file) + '_Hmat_init.npz', H_mat)
    return H_mat, initRT_correct, H_cosmat
Ejemplo n.º 32
0
def weight_SWSN(ann_matrix,
                sparse_nets=None,
                normalized_nets=None,
                net_names=None,
                out_file=None,
                nodes=None,
                verbose=False):
    """ 
    *normalized_nets*: list of networks stored as scipy sparse matrices. Should already be normalized
    """
    # UPDATED: normalize the networks
    if sparse_nets is not None:
        print("Normalizing the networks")
        normalized_nets = []
        for net in sparse_nets:
            normalized_nets.append(_net_normalize(net))
    elif normalized_nets is None:
        print("No networks given. Nothing to do")
        return None, 0
    if len(normalized_nets) == 1:
        print("Only one network given to weight_SWSN. Nothing to do.")
        total_time = 0
        return sparse_nets[0], total_time
    if verbose:
        print("Removing rows with 0 annotations/positives")
        utils.print_memory_usage()
    # remove rows with 0 annotations/positives
    empty_rows = []
    for i in range(ann_matrix.shape[0]):
        pos, neg = alg_utils.get_term_pos_neg(ann_matrix, i)
        # the combineWeightsSWSN method doesn't seem to
        # work if there's only 1 positive
        if len(pos) <= 1 or len(neg) <= 1:
            empty_rows.append(i)
    # don't modify the original annotation matrix to keep the rows matching the GO ids
    curr_ann_mat = delete_rows_csr(ann_matrix.tocsr(), empty_rows)

    if verbose:
        utils.print_memory_usage()
    print("Weighting networks for %d different terms" %
          (curr_ann_mat.shape[0]))
    print("Running simultaneous weights with specific negatives")
    start_time = time.process_time()
    alpha, indices = combineNetworksSWSN(curr_ann_mat,
                                         normalized_nets,
                                         verbose=verbose)
    # print out the computed weights for each network
    if net_names is not None:
        print("network weights:")
        #print("\tnetworks chosen: %s" % (', '.join([net_names[i] for i in indices])))
        weights = defaultdict(int)
        for i in range(len(alpha)):
            weights[net_names[indices[i]]] = alpha[i]
        weights_table = ["%0.3e" % weights[net] for net in net_names]
        print('\t'.join(net_names))
        print('\t'.join(weights_table))

    # now add the networks together with the alpha weight applied
    weights_list = [0] * len(normalized_nets)
    weights_list[indices[0]] = alpha[0]
    combined_network = alpha[0] * normalized_nets[indices[0]]
    for i in range(1, len(alpha)):
        combined_network += alpha[i] * normalized_nets[indices[i]]
        weights_list[indices[i]] = alpha[i]
    total_time = time.process_time() - start_time

    if out_file is not None:
        # replace the .txt if present
        out_file = out_file.replace('.txt', '.npz')
        utils.checkDir(os.path.dirname(out_file))
        print("\twriting combined network to %s" % (out_file))
        sp.save_npz(out_file, combined_network)
        # also write the node ids so it's easier to access
        # TODO figure out a better way to store this
        node2idx_file = out_file + "-node-ids.txt"
        print("\twriting node ids to %s" % (node2idx_file))
        with open(node2idx_file, 'w') as out:
            out.write(''.join("%s\t%s\n" % (n, i)
                              for i, n in enumerate(nodes)))

        # write the alpha/weight of the networks as well
        net_weight_file = out_file + "-net-weights.txt"
        print("\twriting network weights to %s" % (net_weight_file))
        with open(net_weight_file, 'w') as out:
            out.write(''.join("%s\t%s\n" % (net_names[idx], str(alpha[i]))
                              for i, idx in enumerate(indices)))

    return combined_network, total_time, weights_list
Ejemplo n.º 33
0
def preprocess(pvalue_thr=1e-200, cancer_type='BRCA'):
    ##############################################################
    ####### Select pseudogene and coding genes
    ##############################################################
    print("Select pseudogene and coding genes")
    gencode = read_gtf("../data/raw_data/gencode.v29.annotation.gtf")
    gencode = gencode[gencode['feature'] == 'gene']

    # select pseudogenes
    pseudogene = gencode[gencode['gene_type'].isin([
        "transcribed_unprocessed_pseudogene",
        "transcribed_processed_pseudogene", "translated_processed_pseudogene"
    ])]
    pseudogene = pseudogene.drop([
        "score", "strand", "frame", "level", "tag", "exon_number", "exon_id",
        "ont", "protein_id", "ccdsid", "transcript_support_level",
        "havana_transcript", "havana_gene", "source", "transcript_id",
        "transcript_name", "transcript_type"
    ],
                                 axis=1)
    pseudogene.drop_duplicates(subset=['gene_name'],
                               keep='first',
                               inplace=True)

    print("Pseudogene number: ", pseudogene.shape[0])

    # select coding genes
    coding = gencode[gencode['gene_type'] == 'protein_coding']
    coding = coding.drop([
        "score", "strand", "frame", "level", "tag", "exon_number", "exon_id",
        "ont", "protein_id", "ccdsid", "transcript_support_level",
        "havana_transcript", "havana_gene", "source", "transcript_id",
        "transcript_name", "transcript_type"
    ],
                         axis=1)
    coding.drop_duplicates(subset=['gene_name'], keep='first', inplace=True)

    print("coding gene number: ", coding.shape[0])

    ##############################################################
    ####### generate genome sequence
    ##############################################################
    print("generate genome sequence")
    with open("../data/raw_data/GRCh38.primary_assembly.genome.fa") as f:
        data = f.readlines()

    chr_seq_map = dict()
    i = 0
    while i < len(data):
        if data[i][0] == ">":
            key = data[i].split(" ")[0]
            j = 1
            temp = []
            while (i + j) < len(data):
                if data[i + j][0] != ">":
                    temp.append(data[i + j][:-1])
                    j = j + 1
                else:
                    break
            value = "".join(temp)
            chr_seq_map[key] = value
            i = i + j

    chr_seq_mapping = dict()
    for key, value in chr_seq_map.items():
        if key[:4] == '>chr':
            chr_seq_mapping[key[1:]] = value

    def func(x):
        temp = chr_seq_mapping[x['seqname']]
        return temp[x['start']:x['end']]

    pseudogene['sequence'] = pseudogene.apply(func, axis=1)
    coding['sequence'] = coding.apply(func, axis=1)
    all_genes = pd.concat([pseudogene, coding], ignore_index=True, sort=False)  # DataFrame.append was removed in pandas 2.x
    all_genes.drop_duplicates(subset=['gene_name'], keep='first', inplace=True)

    ##################################################################
    ######## choose final pseudogene and coding candidates

    pseudo_list = all_genes[
        all_genes['gene_type'] != 'protein_coding']['gene_name'].values
    coding_list = all_genes[all_genes['gene_type'] ==
                            'protein_coding']['gene_name'].values

    # build similarity network
    print("filtering blast results")

    similarity_res = pd.read_csv("../data/raw_data/blast_similarity.csv",
                                 names=['query', 'target', 'evalue'])
    similarity_res = similarity_res[similarity_res['evalue'] < pvalue_thr]

    # delete self-self pairs
    similarity_res = similarity_res[(similarity_res['query'] !=
                                     similarity_res['target'])]
    # Only select pseudogene as the query
    similarity_select = similarity_res[similarity_res['query'].isin(
        pseudo_list)]
    # select corresonding coding genes
    similarity_candidate = set(
        similarity_select['target'].unique()) | set(pseudo_list)
    # filter by the candidates
    similarity_final = similarity_res[similarity_res['query'].isin(
        similarity_candidate)]
    similarity_final = similarity_final[similarity_final['target'].isin(
        similarity_final['query'].unique())]
    final_similarity_candidate = np.array(
        list(
            set(similarity_final['query'].unique())
            | set(similarity_final['target'].unique())))

    # this is all the genes we will use in our model
    final_all_genes = all_genes[all_genes['gene_name'].isin(
        final_similarity_candidate)]
    final_all_genes.index = range(len(final_all_genes))
    all_genes_mapping = dict(
        zip(final_all_genes['gene_name'], range(len(final_all_genes))))
    print("Dataset size: ", final_all_genes.shape[0])

    ##################################################################
    ###################### Build Networks ############################

    print("build similarity adj matrix")
    similarity_final['query_id'] = similarity_final['query'].apply(
        lambda x: all_genes_mapping[x])
    similarity_final['target_id'] = similarity_final['target'].apply(
        lambda x: all_genes_mapping[x])
    adj_simi = np.zeros((len(all_genes_mapping), len(all_genes_mapping)))
    for i, row in tqdm(similarity_final.iterrows()):
        adj_simi[row['query_id']][row['target_id']] = 1
    sAdj_simi = sparse.csr_matrix(adj_simi)
    sparse.save_npz("../data/final_input/adj_simi.npz", sAdj_simi)

    print("build TCGA co-expression network")
    df = pd.read_csv('../data/raw_data/TCGA_' + cancer_type + '.csv',
                     names=[
                         "index", "id", "name", "geneSymbol",
                         "MedianExpValueTumor", "MedianExpValueNormal",
                         "log_aveExpValueTumor", "log_aveExpValueNormal",
                         "expValuesTumor", "expValuesNormal",
                         "log_expValuesTumor", "log_expValuesNormal", "paired"
                     ],
                     index_col=["index"],
                     skipinitialspace=True)
    expression_new = df[['geneSymbol', 'log_expValuesTumor']]
    # only select genes that are included in the pseudogene and coding gene list
    all_genes_name = final_all_genes['gene_name'].values
    expression_selected = expression_new[expression_new['geneSymbol'].isin(
        all_genes_name)]
    expression_selected = expression_selected.reset_index(drop=True)

    # build expression networks
    expression_pairs = build_expression_network(expression_selected)
    final_co_expression = defaultdict(list)
    for key, value in tqdm(expression_pairs.items()):
        temp = [(all_genes_mapping[x[0]], x[1]) for x in value]
        final_co_expression[all_genes_mapping[key]] = temp
    adj_co_expression = np.zeros(
        (len(all_genes_mapping), len(all_genes_mapping)))
    for key, value in tqdm(final_co_expression.items()):
        for x in value:
            adj_co_expression[key][x[0]] = 1
    sAdj_co = sparse.csr_matrix(adj_co_expression)
    sparse.save_npz("../data/final_input/adj_TCGA_" + cancer_type + ".npz",
                    sAdj_co)

    print("generate node2vec embeddings for co-expression network")
    G_coexp = nx.from_scipy_sparse_matrix(sAdj_co)
    node2vec_coexp = Node2Vec(G_coexp,
                              dimensions=256,
                              walk_length=15,
                              num_walks=150,
                              workers=28)
    model_coexp = node2vec_coexp.fit(window=10, min_count=1, batch_words=4)
    model_coexp.wv.save_word2vec_format("../data/final_input/node2vec_TCGA_" +
                                        cancer_type + ".txt")

    print("build ppi and genetic interaction network")
    biogrid = pd.read_table("../data/raw_data/BIOGRID-ALL-3.5.173.tab2.txt")
    biogrid = biogrid[(biogrid['Organism Interactor A'] == 9606)
                      & (biogrid['Organism Interactor B'] == 9606)]
    biogrid = biogrid[[
        '#BioGRID Interaction ID', 'Entrez Gene Interactor A',
        'Entrez Gene Interactor B', 'Official Symbol Interactor A',
        'Official Symbol Interactor B', 'Experimental System',
        'Experimental System Type'
    ]]
    biogrid = biogrid[(biogrid['Official Symbol Interactor A'].isin(
        final_all_genes['gene_name'].unique()))
                      & (biogrid['Official Symbol Interactor B'].isin(
                          final_all_genes['gene_name'].unique()))]
    biogrid['query_id'] = biogrid['Official Symbol Interactor A'].apply(
        lambda x: all_genes_mapping[x])
    biogrid['target_id'] = biogrid['Official Symbol Interactor B'].apply(
        lambda x: all_genes_mapping[x])

    adj_ppi = np.zeros((len(all_genes_mapping), len(all_genes_mapping)))
    for i, row in tqdm(biogrid.iterrows()):
        adj_ppi[row['query_id']][row['target_id']] = 1
    sAdj_ppi = sparse.csr_matrix(adj_ppi)
    sparse.save_npz("../data/final_input/adj_ppi.npz", sAdj_ppi)

    print(
        "generate node2vec embeddings for PPI and genetic interaction network")
    G_ppi = nx.from_scipy_sparse_matrix(sAdj_ppi)
    node2vec_ppi = Node2Vec(G_ppi,
                            dimensions=256,
                            walk_length=15,
                            num_walks=150,
                            workers=28)
    model_ppi = node2vec_ppi.fit(window=10, min_count=1, batch_words=4)
    model_ppi.wv.save_word2vec_format("../data/final_input/node2vec_ppi.txt")

    ##########################################################
    ############ Generate feature dataframe ##################

    print("Get GO labels for both pseudogenes and coding genes")
    goa = pd.read_csv("../data/raw_data/goa_human.gaf",
                      sep="\t",
                      skiprows=31,
                      names=[
                          'DB', 'DB Object ID', 'DB Object Symbol',
                          'Qualifier', 'GO', 'reference', 'Evidence',
                          'with form', 'Aspect', 'DB Object Name', 'Synonym',
                          'type', 'Taxon', 'Date', 'Assigned by', 'extension',
                          'Gene product form ID'
                      ])
    goa = goa[goa['DB Object Symbol'].isin(final_all_genes['gene_name'])]
    goa_F = goa[goa['Aspect'] == 'F']
    goa_P = goa[goa['Aspect'] == 'P']
    goa_C = goa[goa['Aspect'] == 'C']
    final_all_genes['MF'] = final_all_genes['gene_name'].apply(
        lambda x: list(goa_F[goa_F['DB Object Symbol'] == x]['GO']))
    final_all_genes['BP'] = final_all_genes['gene_name'].apply(
        lambda x: list(goa_P[goa_P['DB Object Symbol'] == x]['GO']))
    final_all_genes['CC'] = final_all_genes['gene_name'].apply(
        lambda x: list(goa_C[goa_C['DB Object Symbol'] == x]['GO']))

    from go_anchestor import get_gene_ontology, get_anchestors

    go = get_gene_ontology()
    BIOLOGICAL_PROCESS = 'GO:0008150'
    MOLECULAR_FUNCTION = 'GO:0003674'
    CELLULAR_COMPONENT = 'GO:0005575'

    new_cc = []
    new_mf = []
    new_bp = []

    for i, row in final_all_genes.iterrows():
        labels = row['CC']
        temp = set([])
        for x in labels:
            temp = temp | get_anchestors(go, x)
        temp.discard(CELLULAR_COMPONENT)
        new_cc.append(list(temp))

        labels = row['MF']
        temp = set([])
        for x in labels:
            temp = temp | get_anchestors(go, x)
        temp.discard(MOLECULAR_FUNCTION)
        new_mf.append(list(temp))

        labels = row['BP']
        temp = set([])
        for x in labels:
            temp = temp | get_anchestors(go, x)
        temp.discard(BIOLOGICAL_PROCESS)
        new_bp.append(list(temp))

    final_all_genes['cc'] = new_cc
    final_all_genes['mf'] = new_mf
    final_all_genes['bp'] = new_bp

    mf_items = [item for sublist in final_all_genes['mf'] for item in sublist]
    mf_unique_elements, mf_counts_elements = np.unique(mf_items,
                                                       return_counts=True)

    bp_items = [item for sublist in final_all_genes['bp'] for item in sublist]
    bp_unique_elements, bp_counts_elements = np.unique(bp_items,
                                                       return_counts=True)

    cc_items = [item for sublist in final_all_genes['cc'] for item in sublist]
    cc_unique_elements, cc_counts_elements = np.unique(cc_items,
                                                       return_counts=True)

    mf_list = mf_unique_elements[np.where(mf_counts_elements > 25)]
    cc_list = cc_unique_elements[np.where(cc_counts_elements > 25)]
    bp_list = bp_unique_elements[np.where(bp_counts_elements > 250)]

    print("CC:", len(cc_list))
    print("MF:", len(mf_list))
    print("BP:", len(bp_list))

    temp_mf = final_all_genes['mf'].apply(
        lambda x: list(set(x) & set(mf_list)))
    final_all_genes['temp_mf'] = temp_mf
    temp_cc = final_all_genes['cc'].apply(
        lambda x: list(set(x) & set(cc_list)))
    final_all_genes['temp_cc'] = temp_cc
    temp_bp = final_all_genes['bp'].apply(
        lambda x: list(set(x) & set(bp_list)))
    final_all_genes['temp_bp'] = temp_bp

    mf_dict = dict(zip(list(mf_list), range(len(mf_list))))
    cc_dict = dict(zip(list(cc_list), range(len(cc_list))))
    bp_dict = dict(zip(list(bp_list), range(len(bp_list))))
    mf_encoding = [[0] * len(mf_dict) for i in range(len(final_all_genes))]
    cc_encoding = [[0] * len(cc_dict) for i in range(len(final_all_genes))]
    bp_encoding = [[0] * len(bp_dict) for i in range(len(final_all_genes))]

    for i, row in final_all_genes.iterrows():
        for x in row['temp_mf']:
            mf_encoding[i][mf_dict[x]] = 1
        for x in row['temp_cc']:
            cc_encoding[i][cc_dict[x]] = 1
        for x in row['temp_bp']:
            bp_encoding[i][bp_dict[x]] = 1

    final_all_genes['cc_label'] = cc_encoding
    final_all_genes['mf_label'] = mf_encoding
    final_all_genes['bp_label'] = bp_encoding

    final_all_genes.rename(columns={
        "temp_mf": "filter_mf",
        "temp_bp": "filter_bp",
        "temp_cc": "filter_cc"
    },
                           inplace=True)
    final_all_genes.drop(columns=['MF', 'CC', 'BP', 'mf', 'cc', 'bp'],
                         inplace=True)

    with open("../data/final_input/mf_list.txt", "w") as f:
        for x in list(mf_list):
            f.write(x + "\n")

    with open("../data/final_input/cc_list.txt", "w") as f:
        for x in list(cc_list):
            f.write(x + "\n")

    with open("../data/final_input/bp_list.txt", "w") as f:
        for x in list(bp_list):
            f.write(x + "\n")

    print("Add microRNA interactions as features")
    miRNA = pd.read_excel("../data/raw_data/miRNA.xlsx")
    miRNA = miRNA[miRNA['Target Gene'].isin(
        final_all_genes['gene_name'].unique())]
    selected_miRNA = miRNA['miRNA'].value_counts().index[(
        miRNA['miRNA'].value_counts() > 250)]
    miRNA = miRNA[miRNA['miRNA'].isin(selected_miRNA)]

    micro_mapping = dict(zip(list(selected_miRNA), range(len(selected_miRNA))))
    micro_encoding = []
    for i, row in tqdm(final_all_genes.iterrows()):
        cur_mir = miRNA[miRNA['Target Gene'] == row['gene_name']]['miRNA']
        temp_encoding = [0] * len(selected_miRNA)
        for x in cur_mir:
            temp_encoding[micro_mapping[x]] = 1
        micro_encoding.append(temp_encoding)

    final_all_genes['microRNA_250'] = micro_encoding

    with open("../data/final_input/microRNA_list.txt", "w") as f:
        for x in list(selected_miRNA):
            f.write(x + "\n")

    print("Add GTEx median expression profiles")
    GTEx = pd.read_csv(
        "../data/raw_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct",
        skiprows=2,
        sep='\t')
    final_all_genes['gene_id'] = final_all_genes['gene_id'].apply(
        lambda x: x.split(".")[0])
    GTEx['Name'] = GTEx['Name'].apply(lambda x: x.split(".")[0])
    GTEx_new = pd.DataFrame({
        'gene_id': GTEx['Name'],
        'expression': GTEx.iloc[:, 2:].values.tolist()
    })
    GTEx_new.drop_duplicates(subset=['gene_id'], keep='first', inplace=True)
    final_all_genes = pd.merge(final_all_genes,
                               GTEx_new,
                               on='gene_id',
                               how='left')
    final_all_genes['expression'] = final_all_genes['expression'].apply(
        lambda d: d if isinstance(d, list) else [0.0] * 54)

    def reshape(features):
        return np.hstack(features).reshape((len(features), len(features[0])))

    print("generate GTEx node2vec features...")
    expression = reshape(final_all_genes['expression'].values).T
    cor_matrix, pval = spearmanr(expression, nan_policy='omit')
    cor_matrix = np.nan_to_num(cor_matrix, nan=0.0)  # replace NaNs from constant columns with 0

    thr = 0.9
    adj_coexp_GTEx = np.zeros(
        (final_all_genes.shape[0], final_all_genes.shape[0]))
    adj_coexp_GTEx[cor_matrix > thr] = 1
    adj_coexp_GTEx = sparse.csr_matrix(adj_coexp_GTEx)
    sparse.save_npz("../data/final_input/adj_GTEx.npz", adj_coexp_GTEx)

    print("generate node2vec embeddings for GTEx co-expression network")
    G_coexp = nx.from_scipy_sparse_matrix(adj_coexp_GTEx)
    node2vec_coexp = Node2Vec(G_coexp,
                              dimensions=256,
                              walk_length=15,
                              num_walks=150,
                              workers=28)
    model_coexp = node2vec_coexp.fit(window=10, min_count=1, batch_words=4)
    model_coexp.wv.save_word2vec_format(
        "../data/final_input/node2vec_GTEx.txt")

    print("Saving feature dataframe")
    final_all_genes.to_pickle("../data/final_input/features_all.pkl")
    features_input = final_all_genes.loc[:, [
        'gene_id', 'gene_name', 'gene_type', 'cc_label', 'mf_label',
        'bp_label', 'microRNA_250', 'expression'
    ]]
    features_input.to_pickle("../data/final_input/features.pkl")
Ejemplo n.º 34
0
def gen_linear_term():
    from sklearn.preprocessing import OneHotEncoder

    context_features = \
        ['sitesetID', 'positionType', 'connectionType', 'telecomsOperator', 'hour', 'hour_weight', 'is_pref_cat']
    user_features = \
        ['age', 'gender', 'education', 'marriageStatus', 'haveBaby', 'hometown', 'residence', 'user_activity',
         'cat_pref']
    ad_features = \
        ['advertiserID', 'appPlatform', 'appCategory', 'app_popularity']

    # load dataset
    dataset = pd.read_hdf(path_intermediate_dataset + hdf_dataset)

    # y
    y = dataset['label'].values
    # save
    np.save(path_modeling_dataset + npy_y, y)
    # manually free memory
    del y
    gc.collect()

    # one-hot context
    enc_context = OneHotEncoder()
    context_csc = enc_context.fit_transform(
        dataset[context_features].values).tocsc()
    # save
    save_npz(path_modeling_dataset + npz_context, context_csc)
    # manually free memory
    del context_csc
    gc.collect()

    # one-hot user
    enc_user = OneHotEncoder()
    user_csc = enc_user.fit_transform(dataset[user_features].values).tocsc()
    # save
    save_npz(path_modeling_dataset + npz_user, user_csc)
    # manually free memory
    del user_csc
    gc.collect()

    # one-hot ad
    enc_ad = OneHotEncoder()
    ad_csc = enc_ad.fit_transform(dataset[ad_features].values).tocsc()
    # save
    save_npz(path_modeling_dataset + npz_ad, ad_csc)
    # manually free memory
    del ad_csc
    gc.collect()

    # release dataset
    del dataset
    gc.collect()

    # load testset_ol
    testset_ol = pd.read_hdf(path_intermediate_dataset + hdf_testset_ol)

    # one-hot context
    context_csc_test_ol = enc_context.transform(
        testset_ol[context_features].values).tocsc()
    # save
    save_npz(path_modeling_dataset + npz_context_test_ol, context_csc_test_ol)
    # manually free memory
    del context_csc_test_ol
    gc.collect()

    # one-hot user
    user_csc_test_ol = enc_user.transform(
        testset_ol[user_features].values).tocsc()
    # save
    save_npz(path_modeling_dataset + npz_user_test_ol, user_csc_test_ol)
    # manually free memory
    del user_csc_test_ol
    gc.collect()

    # one-hot ad
    ad_csc_test_ol = enc_ad.transform(testset_ol[ad_features].values).tocsc()
    # save
    save_npz(path_modeling_dataset + npz_ad_test_ol, ad_csc_test_ol)
    # manually free memory
    del ad_csc_test_ol
    gc.collect()

    # release testset_ol
    del testset_ol
    gc.collect()
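
# A hedged follow-up sketch: the saved one-hot blocks can be reloaded and assembled
# into one design matrix (the path/file-name constants are the same ones used above).
from scipy.sparse import load_npz, hstack

def load_linear_term():
    context = load_npz(path_modeling_dataset + npz_context)
    user = load_npz(path_modeling_dataset + npz_user)
    ad = load_npz(path_modeling_dataset + npz_ad)
    return hstack((context, user, ad), format='csr')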
Ejemplo n.º 35
0
    rowAr = [0 for j in range(len(line))]
    a = csr_matrix((np.ones(len(line)), (rowAr, line)), shape=(1, numFeat))
    if testMat is None: testMat = a
    else: testMat = vstack([testMat, a])
    i += 1
    if i % 100 == 0:
        print(i)

a = clf.predict(testMat)
wrong = 0
missed_bully = 0
for i in range(len(testLabels)):
    if a[i] - testLabels[i] != 0:  #wrong prediction
        wrong += 1
    if a[i] == 0 and testLabels[i] == 1:  #missed a bullying incidence
        missed_bully += 1
print('Fraction of wrong guesses on test set: ' +
      str(float(wrong) / len(testLabels)))
print('Fraction of missed bullying on test set: ' +
      str(float(missed_bully) / len(testLabels)))

#Final SVM update for simulator
ultraFinal = vstack([finalMat, devMat, testMat])
finalLabels = trainLabels + devLabels + testLabels
clf = svm.NuSVC(.05, probability=True)
clf.fit(ultraFinal, finalLabels)
joblib.dump(clf, 'model.pkl')
save_npz('master_convo.npz', ultraFinal)  #for updating during simulation
with open('master_labels.txt', 'wb') as f:
    pickle.dump(finalLabels, f)
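
# A hedged reload sketch for the simulator (mirrors the artifacts written above;
# assumes load_npz is imported alongside save_npz):
clf_loaded = joblib.load('model.pkl')
master_convo = load_npz('master_convo.npz')
with open('master_labels.txt', 'rb') as f:
    master_labels = pickle.load(f)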
Ejemplo n.º 36
0
def main():
    """Create the model and start the evaluation process."""
    args = get_arguments()
    num_steps = file_len(os.path.join(args.img_path, args.data_list))
    # Create queue coordinator.
    coord = tf.train.Coordinator()

    # Load reader.
    with tf.name_scope("create_inputs"):
        reader = ImageReader(
            os.path.join(args.img_path, "texture"),
            os.path.join(args.img_path, args.data_list),
            None,  # No defined input size.
            False,  # No random scale.
            False,  # No random mirror.
            255,
            IMG_MEAN,
            coord,
        )
        image, label = reader.image, reader.label
        title = reader.queue[0]
    image_batch, label_batch = (
        tf.expand_dims(image, axis=0),
        tf.expand_dims(label, axis=0),
    )  # Add one batch dimension.

    # Create network.
    net = DeepLabResNetModel(
        {"data": image_batch}, is_training=False, num_classes=args.num_classes
    )

    # Which variables to load.
    restore_var = tf.global_variables()

    # Predictions.
    raw_output = net.layers["fc1_voc12"]
    before_argmax = tf.image.resize_bilinear(raw_output, tf.shape(image_batch)[1:3,])
    raw_output_up = tf.argmax(before_argmax, dimension=3)
    pred = tf.expand_dims(raw_output_up, axis=3)
    hw_only = pred[0, :, :, 0]
    class_0 = tf.where(tf.equal(hw_only, 0))
    class_1 = tf.where(tf.equal(hw_only, 1))
    class_2 = tf.where(tf.equal(hw_only, 2))
    class_3 = tf.where(tf.equal(hw_only, 3))
    class_4 = tf.where(tf.equal(hw_only, 4))
    class_5 = tf.where(tf.equal(hw_only, 5))
    class_6 = tf.where(tf.equal(hw_only, 6))

    # Set up TF session and initialize variables.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()

    sess.run(init)

    # Load weights.
    loader = tf.train.Saver(var_list=restore_var)
    load(loader, sess, args.model_weights)

    # Start queue threads.
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    start_time = time.time()
    os.makedirs(os.path.join(args.img_path, args.body_dir), exist_ok=True)
    os.makedirs(os.path.join(args.img_path, args.vis_dir), exist_ok=True)

    # write the header
    rois_file = os.path.join(args.img_path, "rois.csv")
    if os.path.isfile(rois_file):
        print(f"The rois file {rois_file} already exists...")
        ans = None
        while all(ans != choice for choice in ("a", "o", "q")):
            ans = input("Do you want to (a)ppend, (o)verwrite, or (q)uit? ")
        if ans == "o":
            print("Overwriting existing rois file...")
            write_header(rois_file)
        elif ans == "q":
            sys.exit(1)
    else:
        write_header(rois_file)

    # Perform inference.
    t = trange(num_steps, desc="Inference progress", unit="img")
    for step in t:
        # run through the model
        jpg_path, c0, c1, c2, c3, c4, c5, c6, raw_output_up_ = sess.run(
            [
                title,
                class_0,
                class_1,
                class_2,
                class_3,
                class_4,
                class_5,
                class_6,
                raw_output_up,
            ]
        )

        # fname is used below by the ROI and visualization steps as well, so compute
        # it for every image, not only when the body segmentation is saved
        fname = os.path.splitext(os.path.basename(str(jpg_path)))[0]

        # == First, save the body segmentation ==
        if not args.no_body:
            # convert to a 2D compressed matrix, because we have a lot of 0's for the
            # background
            compressed = sparse.csr_matrix(np.squeeze(raw_output_up_))
            out = os.path.join(args.img_path, args.body_dir, fname)
            sparse.save_npz(out, compressed)

        # == Next, save the ROIs ==
        if not args.no_rois:
            img_id = extract_nums_only(fname)
            for c in (c0, c1, c2, c3, c4, c5, c6):
                try:
                    min_x = np.min(c[:, 1])
                except ValueError:
                    min_x = None
                try:
                    min_y = np.min(c[:, 0])
                except ValueError:
                    min_y = None
                try:
                    max_x = np.max(c[:, 1])
                except ValueError:
                    max_x = None
                try:
                    max_y = np.max(c[:, 0])
                except ValueError:
                    max_y = None
                # write out the stuff
                with open(rois_file, "a") as f:
                    f.write(
                        ",".join(
                            (img_id, str(min_x), str(min_y), str(max_x), str(max_y), "\n")
                        )
                    )

        # Save an image of the mask for our own reference every 1000 steps
        if not args.no_vis and step % args.visualize_step == 0:
            preds = np.expand_dims(raw_output_up_, axis=3)
            msk = decode_labels(preds, num_classes=args.num_classes)
            # the mask
            im = Image.fromarray(msk[0])
            # # Save the mask separately
            # jpg_path = str(jpg_path).split('/')[-1].split('.')[0]
            # out = os.path.join(args.vis_dir, jpg_path + '.png')
            # im.save(out)
            # Save the mask with background
            img_orig = Image.open(jpg_path)
            # create the final result using the mask and the original
            img = np.array(im) * 0.9 + np.array(img_orig) * 0.7
            # clip surpassed colors
            img[img > 255] = 255
            img = Image.fromarray(np.uint8(img))
            out = os.path.join(args.img_path, args.vis_dir, fname + ".png")
            img.save(out)
            # # print('Image processed {}.png'.format(jpg_path))
        t.set_description("Finished " + fname)

    total_time = time.time() - start_time
    print(f"The output files have been saved to {args.img_path}/{args.body_dir}")
    print(f"It took {total_time / num_steps} sec on each image.")
Ejemplo n.º 37
0
    def _adjacency(self, adj_path: str) -> NoReturn:
        """
        Create self.adj_mats and self.degrees.

        Parameters
        ----------
        adj_path : str
            Try to use drug-drug adjacency matrices saved in adj_path.
            If this is not possible, calculate it and save in adj_path.

        Notes
        -----
        self.adj_mats : Dict[Tuple[int, int], List[sp.csr_matrix]]
            From edge type to list of adjacency matrices for each edge class
            (e.g. (1, 1): list of drug-drug adjacency matrices for each se class).
            In our case all matrix in adj_mats are symmetric.
        self.degrees : Dict[int, List[int]]
            Number of connections for each node (0: genes, 1: drugs).

        """
        gene_gene_adj = nx.adjacency_matrix(self.gene_net)
        # Number of connections for each gene
        gene_degrees = np.array(gene_gene_adj.sum(axis=0)).squeeze()

        drug_gene_adj = create_adj_matrix(
            a_item2b_item=self.stitch2proteins,
            ordered_list_a_item=self.ordered_list_of_drugs,
            ordered_list_b_item=self.ordered_list_of_proteins)

        gene_drug_adj = drug_gene_adj.transpose(copy=True)

        num_se = len(self.ordered_list_of_se)
        drug_drug_adj_list = []
        try:
            print("Try to load drug-drug adjacency matrices from file.")
            if len(os.listdir(adj_path)) < num_se:
                raise IOError('Not all drug-drug adjacency matrices are saved')
            for i in range(num_se):
                drug_drug_adj_list.append(
                    sp.load_npz(adj_path +
                                '/sparse_matrix%04d.npz' % i).tocsr())
        except IOError:
            print('Calculate drug-drug adjacency matrices')
            drug_drug_adj_list = create_combo_adj(
                combo_a_item2b_item=self.combo2se,
                combo_a_item2a_item=self.combo2stitch,
                ordered_list_a_item=self.ordered_list_of_drugs,
                ordered_list_b_item=self.ordered_list_of_se)
            print("Saving matrices to file")
            for i in range(len(drug_drug_adj_list)):
                sp.save_npz(f'{adj_path}/sparse_matrix{i:04d}.npz',
                            drug_drug_adj_list[i].tocoo())
        # Number of connections for each drug
        drug_degrees_list = [
            np.array(drug_adj.sum(axis=0)).squeeze()
            for drug_adj in drug_drug_adj_list
        ]
        self.adj_mats = {
            (0, 0): [gene_gene_adj],
            (0, 1): [gene_drug_adj],
            (1, 0): [drug_gene_adj],
            (1, 1): drug_drug_adj_list,
        }
        self.degrees = {
            0: [gene_degrees],
            1: drug_degrees_list,
        }
Ejemplo n.º 38
0
      for d in range(np.size(T.starts)):
        II = np.arange(T.starts[d]-1,T.ends[d]-1,1,dtype='int')
        ni = T.ni[II]
        ti = T.date[II]
        mi = T.monthi[II]

        for i in np.arange(np.size(ni)-1) :
          j = i+dm
          if j < np.size(ni) and j>0 :
            if np.absolute(ti[j]-ti[i]).astype(int) == M.tau :
              if ni[i] > -9900 and ni[j] > -9900: #PROBLEM: so far no solution for crossing paths between regions as defined at starts. Buoys crossing the boundary will be rejected.
                J.indi[mi[i]][tel[mi[i]]] = ni[i]
                J.indj[mi[i]][tel[mi[i]]] = ni[j]
                tel[mi[i]] += 1
      M.nCross = [[] for _ in np.arange(M.nt) ]
      J.indicrop = [[] for _ in np.arange(M.nt) ]
      J.indjcrop = [[] for _ in np.arange(M.nt) ]

      #delete empty entries in indi and indj
      for m in np.arange(M.nt):
        J.indicrop[m] = J.indi[m][0:tel[m]]
        J.indjcrop[m] = J.indj[m][0:tel[m]]

        #sparse matrix per timestep with every origin and destination as coordinates in matrix, with all possible locations M.nc on i and j axis. 
        sparseV = np.ones(np.shape(J.indicrop[m]))
        sparseI = J.indicrop[m]
        sparseJ = J.indjcrop[m]
        M.nCross[m] = sparse.coo_matrix((sparseV,(sparseI,sparseJ)),shape=(M.nc[-1],M.nc[-1]))
        sparse.save_npz(os.path.join('rawMatrices/',str(types[tp]) + 'transitmatrix_' + M.dir + '_raw_' + str(f) + '_monthindex' + str(m) + '.npz'), M.nCross[m])
  print("--- %s seconds ---" % (time.time() - start_time))
Ejemplo n.º 39
0
train = pd.read_csv(Settings.train_cleaned_file_path)
test = pd.read_csv(Settings.test_cleaned_file_path)
# features = generate_doc_vec()

pd.set_option('display.max_rows', 20000)
train['comment_text'].fillna('null', inplace=True)
test['comment_text'].fillna('null', inplace=True)
merge = pd.concat([train.iloc[:, 0:2], test.iloc[:, 0:2]])
corpus = merge.comment_text

tfidf_word = TfidfVectorizer(ngram_range=(1, 2), strip_accents="unicode", min_df=3, max_df=0.95, use_idf=True,
                             smooth_idf=True, sublinear_tf=True, analyzer='word', max_features=10000)

# tfidf_word.fit(corpus)
word_tfidf_vec = tfidf_word.fit_transform(corpus)

tfidf_char = TfidfVectorizer(ngram_range=(4, 6), strip_accents="unicode", analyzer='char', sublinear_tf=True,
                             use_idf=True, smooth_idf=True, max_features=20000)
# tfidf_char.fit(corpus)
char_tfidf_vec = tfidf_char.fit_transform(corpus)

# final_VSM = sparse.hstack((word_tfidf_vec, char_tfidf_vec, features), format='csr')

final_VSM = sparse.hstack((word_tfidf_vec, char_tfidf_vec), format='csr')
sparse.save_npz(features_file, final_VSM)

elapsed = time.time() - start
print("time for generate VSM: ", elapsed, "\n")


Ejemplo n.º 40
0
 def to_disk(self, file_path):
     file_name, ext = os.path.splitext(file_path)
     save_npz(file_path, self.raw_data)
     with open(file_name + ".voc", "wb") as vocab_file:
         pickle.dump(self.vectorizer, vocab_file)
     self.identifiers.to_pickle(file_name + ".pkl")
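 # Hedged counterpart sketch (not in the source): a loader that mirrors
 # to_disk. It assumes pandas as pd, pickle and scipy.sparse.load_npz are
 # imported in this module, and that cls(...) accepts these three objects;
 # adapt the constructor call to the real class.
 @classmethod
 def from_disk(cls, file_path):
     file_name, _ = os.path.splitext(file_path)
     raw_data = load_npz(file_path)
     with open(file_name + ".voc", "rb") as vocab_file:
         vectorizer = pickle.load(vocab_file)
     identifiers = pd.read_pickle(file_name + ".pkl")
     return cls(raw_data, vectorizer, identifiers)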
Ejemplo n.º 41
0
        compute_correlation(events, 250, num_threads, chunk_size, threshold, n_classes_hist)
    diff_time = time.time() - start
    print("Execution finished in: " + str(diff_time))
    tiempos[i] = diff_time

print("Max: " + str(tiempos.max()) + " Mean: " + str(tiempos.mean()) +
      " Min: " + str(tiempos.min()))

xcm_pos = csr_matrix(xcm_pos, dtype=np.float32)
xclags_pos = csr_matrix(xclags_pos, dtype=np.int32)
xcm_neg = csr_matrix(xcm_neg, dtype=np.float32)
xclags_neg = csr_matrix(xclags_neg, dtype=np.int32)
max_hist = csr_matrix(max_hist, dtype=np.int32)
min_hist = csr_matrix(min_hist, dtype=np.int32)

save_npz('out_xcm_pos.npz', xcm_pos, compressed=True)
save_npz('out_xcl_pos.npz', xclags_pos, compressed=True)
save_npz('out_xcm_neg.npz', xcm_neg, compressed=True)
save_npz('out_xcl_neg.npz', xclags_neg, compressed=True)
save_npz('out_xmax_hist.npz', max_hist, compressed=True)
save_npz('out_xmin_hist.npz', min_hist, compressed=True)

# xcm_dense = xcm_pos.toarray()
# print(xcm_dense.shape)

#np.savez_compressed('output_matrices', xcm_pos=xcm_pos, xcl_pos=xclags_pos, xcm_neg=xcm_neg, xcl_neg=xclags_neg)

# np.savetxt('xcm_v1_neg.txt', xcm_neg, delimiter='\t', fmt='%6.3f')
#
# np.savetxt('xcl_v1_neg.txt', xclags_neg, delimiter='\t', fmt='%6.0f')
#
Ejemplo n.º 42
0
Archivo: als.py Proyecto: freygit/36
 def save(self, vector_data_file):
     sparse.save_npz("%s-user" % vector_data_file, self._user_features)
     sparse.save_npz("%s-item" % vector_data_file, self._item_features)
Ejemplo n.º 43
0
def save_sp(folder, name, M):
    return sp.save_npz(folder+name+'_sp.npz', M.tocsr())
Ejemplo n.º 44
0
    def __init__(self, TOKENIZED_CORPUS, UNIQUE_WORD_LIST, EMBED_SIZE,
                 CONTEXT_SIZE, X_MAX, ALPHA, TOTAL_PROCESS_NUM):
        """
            This method initialize GloVeClass with given parameters.
        
            Args:
                TOKENIZED_CORPUS(list) : list of all words in a corpus
                UNIQUE_WORD_LIST(ndarray) : list of all unique word
                EMBED_SIZE : the size of vector 
                CONTEXT_SIZE : context window size
                X_MAX : maximun x size
                ALPHA : ALPHA
                TOTAL_PROCESS_NUM : TOTAL_PROCESS_NUM
        """
        super(GloVeClass, self).__init__()

        print("[Initialization Start]")
        self.TOKENIZED_CORPUS = TOKENIZED_CORPUS
        self.UNIQUE_WORD_LIST = UNIQUE_WORD_LIST
        self.CONTEXT_SIZE = CONTEXT_SIZE
        self.EMBED_SIZE = EMBED_SIZE
        self.X_MAX = X_MAX
        self.ALPHA = ALPHA
        self.word_to_index = {
            word: index
            for index, word in enumerate(self.UNIQUE_WORD_LIST)
        }
        self.index_to_word = {
            index: word
            for index, word in enumerate(self.UNIQUE_WORD_LIST)
        }
        self.TOKENIZED_CORPUS_SIZE = len(self.TOKENIZED_CORPUS)
        self.UNIQUE_WORD_SIZE = len(self.UNIQUE_WORD_LIST)

        self.in_embed = nn.Embedding(self.UNIQUE_WORD_SIZE, self.EMBED_SIZE)
        self.in_embed.weight = xavier_normal(self.in_embed.weight)
        self.in_bias = nn.Embedding(self.UNIQUE_WORD_SIZE, 1)
        self.in_bias.weight = xavier_normal(self.in_bias.weight)
        self.out_embed = nn.Embedding(self.UNIQUE_WORD_SIZE, self.EMBED_SIZE)
        self.out_embed.weight = xavier_normal(self.out_embed.weight)
        self.out_bias = nn.Embedding(self.UNIQUE_WORD_SIZE, 1)
        self.out_bias.weight = xavier_normal(self.out_bias.weight)

        self.word_embeddings_array = None
        self.word_u_candidate = np.arange(self.UNIQUE_WORD_SIZE)
        self.word_v_candidate = np.arange(self.UNIQUE_WORD_SIZE)

        self.total_process_num = TOTAL_PROCESS_NUM
        if TOTAL_PROCESS_NUM:
            print("Build co-occurence matrix with multiprocess")
            print("TOTAL_PROCESS_NUM : ", TOTAL_PROCESS_NUM)
            queue = mp.Queue()
            ps = list()
            for i in range(self.total_process_num):
                ps.append(
                    mp.Process(target=self.build_sub_co_occurence_matrix,
                               args=(queue, i)))
            for p in ps:
                p.start()
            # Collect the results from each worker via the queue
            for i in range(self.total_process_num):
                if i:
                    col += queue.get()  # blocks until a value is available if the queue is empty
                else:
                    col = queue.get()
            for p in ps:
                p.terminate()
            col = np.array(col, dtype=np.int64)
            self.co_occurence_matrix = coo_matrix(
                (np.ones(col.size, dtype=np.int64),
                 (np.zeros(col.size, dtype=np.int64), col)),
                shape=(1,
                       int((self.UNIQUE_WORD_SIZE *
                            (self.UNIQUE_WORD_SIZE + 1)) / 2)),
                dtype=np.int64)
            print("Done")
            tries = 10
            while tries:
                try:
                    print("SAVE co_occurence_matrix")
                    # scipy.io.mmwrite('model/co_occurence_matrix.mtx', self.co_occurence_matrix)
                    save_npz('model/co_occurence_matrix.npz',
                             self.co_occurence_matrix)
                    print("Done")
                except IOError as e:
                    print("IOError happened")
                    error = e
                    tries -= 1
                else:
                    break
            if not tries:
                print("Fail to saving matrix due to IOError")
                raise error
        else:
            print("Load co-occurence matrix")
            # self.co_occurence_matrix = scipy.io.mmread('model/co_occurence_matrix.mtx')
            self.co_occurence_matrix = load_npz(
                'model/co_occurence_matrix.npz')
            print("Done")
        self.co_occurence_matrix = self.co_occurence_matrix.todense()
        print("[Initialization Done]")
Ejemplo n.º 45
0
    print(aug)

    if args.mode == 'val':
        val_df = pd.read_csv(str(data_path / 'val4_df.csv'))
        preds, labels = predict(model, val_df['file_name'].apply(Path).values, batch_size, aug=aug)
        target_file_name = args.model_type + '_test_' + str(aug)

    elif args.mode == 'test':
        test_hashes = pd.read_csv(str(data_path / 'test_hashes.csv'))
        train_hashes = pd.read_csv(str(data_path / 'train_hashes.csv'))
        test_hashes = test_hashes.drop_duplicates('md5')
        test_hashes = test_hashes[~test_hashes['md5'].isin(set(train_hashes['md5'].unique()))]
        bad_md5 = ['d704b9555801285eedb04213a02fdc41', '35e7e038fe2ec215f63bdb5e4b739524']

        hashes = test_hashes['file_name'].apply(lambda x: data_path.parent / x, 1).values

        preds, labels = predict(model, hashes, batch_size, aug=aug, transform=transform)

        target_file_name = args.model_type + '_test_' + str(aug)

    labels = pd.DataFrame(labels, columns=['file_name'])

    print('[{}] Saving labels...'.format(str(datetime.datetime.now())))

    labels.to_csv(str(model_path / (target_file_name + '.csv')), index=False)

    print('[{}] Saving predictions...'.format(str(datetime.datetime.now())))

    save_npz(str(model_path / (target_file_name + '.npz')), preds)
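# Hedged follow-up sketch (not in the original script): read the saved
# predictions and labels back, e.g. to average predictions over several
# augmentations. It assumes the same model_path / target_file_name values and
# that load_npz is available from scipy.sparse alongside save_npz.
from scipy.sparse import load_npz

preds_loaded = load_npz(str(model_path / (target_file_name + '.npz')))
labels_loaded = pd.read_csv(str(model_path / (target_file_name + '.csv')))
print('loaded predictions:', preds_loaded.shape, 'labels:', len(labels_loaded))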