Example #1
def lisa_deseq(args):

    log = Log(target=sys.stderr, verbose=args.verbose)
    lisa = FromGenes(args.species,
                     **extract_kwargs(args, INSTANTIATION_KWARGS),
                     log=log)

    up_genes, down_genes = parse_deseq_file(args.deseq_file,
                                            lfc_cutoff=args.lfc_cutoff,
                                            pval_cutoff=args.pval_cutoff,
                                            sep=args.sep)

    results_summary = []
    all_passed = True
    for prefix, query_list in zip(['up-regulated', 'down-regulated'],
                                  [up_genes, down_genes]):

        with log.section('Modeling {}:'.format(str(prefix))):
            try:
                results, metadata = lisa.predict(
                    query_list, **extract_kwargs(args, PREDICTION_KWARGS))

                top_TFs_unique = save_and_get_top_TFs(args, prefix, results,
                                                      metadata)

                results_summary.append((prefix, top_TFs_unique))

            except AssertionError as err:
                all_passed = False
                log.append('ERROR: ' + str(err))

    print_results_multi(results_summary)

    if not all_passed:
        raise MultiError('One or more genelists raised an error')
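
`parse_deseq_file` is not shown in these examples. Below is a minimal sketch of what it plausibly does, splitting a DESeq2 results table into up- and down-regulated gene lists; the column names (`gene`, `log2FoldChange`, `padj`) and the helper body are assumptions, not the package's actual implementation:

import pandas as pd

def parse_deseq_file_sketch(path, lfc_cutoff=1.0, pval_cutoff=0.05, sep='\t'):
    # hypothetical reconstruction; DESeq2 column names are assumed
    table = pd.read_csv(path, sep=sep)
    significant = table[table['padj'] <= pval_cutoff]
    up = significant[significant['log2FoldChange'] >= lfc_cutoff]
    down = significant[significant['log2FoldChange'] <= -lfc_cutoff]
    return up['gene'].tolist(), down['gene'].tolist()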
Example #2
    def __init__(self,
                 species,
                 window_size=1000,
                 download_if_not_exists=True,
                 make_new=False,
                 log=None,
                 path=None,
                 load_genes=True):

        self.species = species
        self.window_size = int(window_size)

        if log is None:
            self.log = Log()
        else:
            self.log = log

        if path is None:
            self.path = self.get_dataset_path(self.species, self.window_size)
        else:
            self.path = path

        if make_new:
            h5.File(self.path, 'w').close()
        elif not os.path.isfile(self.path):
            if download_if_not_exists and path is None:
                self.download_data()
            else:
                h5.File(self.path, 'w').close()

        #___ LOAD GENE DATA FROM PACKAGE _____
        self.genome = self.load_genome(self.species, self.window_size)

        if load_genes:
            self.load_genes()
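
A minimal usage sketch, assuming this constructor belongs to the DataInterface class shown in Example #8; with the defaults, the dataset path is resolved under the package directory and the data is downloaded on first use:

# assumes DataInterface is importable from the package (see Example #8)
data = DataInterface('hg38', window_size=1000)
print(data.path)         # resolved HDF5 dataset path
print(len(data.genome))  # number of genome windows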
Example #3
def lisa_multi(args):

    log = Log(target=sys.stderr, verbose=args.verbose)
    lisa = FromGenes(args.species,
                     **extract_kwargs(args, INSTANTIATION_KWARGS),
                     log=log)

    query_dict = {
        os.path.basename(query.name): query.readlines()
        for query in args.query_lists
    }

    results_summary = []
    all_passed = True
    for query_name, query_list in query_dict.items():

        with log.section('Modeling {}:'.format(str(query_name))):
            try:
                results, metadata = lisa.predict(
                    query_list, **extract_kwargs(args, PREDICTION_KWARGS))

                top_TFs_unique = save_and_get_top_TFs(args, query_name,
                                                      results, metadata)

                results_summary.append((query_name, top_TFs_unique))

            except AssertionError as err:
                all_passed = False
                log.append('ERROR: ' + str(err))

    print_results_multi(results_summary)

    if not all_passed:
        raise MultiError('One or more genelists raised an error')
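
`args.query_lists` must be a list of open file handles, since the handler reads `.name` and `.readlines()` from each entry. A plausible argparse wiring, shown as an assumption rather than the package's actual CLI definition:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('query_lists', type=argparse.FileType('r'), nargs='+',
                    help='one or more gene-list files, one gene per line')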
Example #4
    def convert_bigwig(cls, bigwig, species, log=None):

        if log is None:
            log = Log()

        genome = DataInterface.load_genome(species, cls.window_size)
        coverage_array = np.zeros(len(genome))

        log.append('Converting BigWig file to coverage array ...')

        bar = LoadingBar('Progress', len(genome) // 1000 + 1, cold_start=True)

        # open the BigWig before the try block so the finally clause
        # never references an unbound handle if opening fails
        coverage_bw = bw.open(bigwig)

        try:
            log.append(bar, update_line=True)

            for i, window in enumerate(genome.list_windows()):

                if window.chromosome in coverage_bw.chroms():
                    mean_coverage = coverage_bw.stats(*window.to_tuple())[0]
                    coverage_array[i] = mean_coverage

                if i % 1000 == 0:
                    log.append(bar, update_line=True)

            return np.nan_to_num(coverage_array)

        finally:
            coverage_bw.close()
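
The `bw` module here is presumably pyBigWig: `stats(chrom, start, end)` returns a list of summary statistics (the interval mean by default), which is why the code takes element `[0]`. The calls used above, in isolation:

import pyBigWig

bigwig = pyBigWig.open('signal.bw')      # hypothetical file path
print(bigwig.chroms())                   # e.g. {'chr1': 248956422, ...}
print(bigwig.stats('chr1', 0, 1000)[0])  # mean coverage over the interval
bigwig.close()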
Example #5
def main(species, motif_bed, window_size, gamma_threshold=0.95):

    genome = DataInterface.load_genome(species, window_size)

    log = Log(target=stderr)

    factor_name = None
    window_nums, scores = [], []

    with gzip.open(motif_bed, 'rb') as f:

        bed = f.readlines()

        bar = LoadingBar('Binning {} motif hits'.format(str(len(bed))),
                         len(bed),
                         cold_start=True)

        for i, line in enumerate(bed):

            chrom, start, end, factor, relscore, log_pval, strand = line.decode(
                'utf-8').strip().split('\t')

            if i == 0:
                factor_name = factor

            try:
                hit_windows = genome.get_region_windows(
                    Region(chrom, start, end))
                window_nums.extend(hit_windows)

                scores.extend([float(log_pval) / 100] * len(hit_windows))

            except BadRegionError:
                pass

            log.append(bar, update_line=True)

    log.append('')

    log.append('Done')

    # the coo -> csc round trip sums scores for duplicate window indices,
    # which the raw (data, indices, indptr) constructor leaves un-merged
    hits = sparse.csc_matrix((scores, window_nums, [0, len(window_nums)]),
                             shape=(len(genome), 1)).tocoo().tocsc()

    sample_hit_scores = np.random.choice(np.array(hits.todense()).reshape(-1),
                                         size=10000)

    min_bin_score = gamma(*gamma.fit(sample_hit_scores)).ppf(gamma_threshold)

    hit_indices = hits.indices[(hits.data >= min_bin_score) & (hits.data > 0)]

    return hit_indices, factor_name
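
The final step fits a gamma distribution to a random sample of the binned scores and keeps only windows scoring above the `gamma_threshold` quantile (the 95th percentile by default). The same logic in isolation, on synthetic scores:

import numpy as np
from scipy.stats import gamma

scores = np.random.gamma(2.0, 1.0, size=10000)  # stand-in for sampled hit scores
params = gamma.fit(scores)                      # returns (shape, loc, scale)
min_bin_score = gamma(*params).ppf(0.95)        # score at the 95th percentile
print((scores >= min_bin_score).sum(), 'scores pass the cutoff')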
Example #6
    def convert_bigwig(cls, bigwig, species, bigwig_cmd_path):

        log = Log()

        genome = DataInterface.load_genome(species, cls.window_size)
        coverage_array = np.zeros(len(genome))

        log.append('Converting BigWig file to coverage array ...')

        if not os.path.exists(cls._get_genome_bin_path(species)):
            log.append('Writing bins ...')
            cls._write_genome_bins(species)

        # create the temp file before the try block so the finally clause
        # never references an unbound name if creation fails
        temp = tempfile.NamedTemporaryFile('w', delete=False)
        temp.close()

        try:
            process = subprocess.run(
                [bigwig_cmd_path, bigwig,
                 cls._get_genome_bin_path(species), temp.name],
                capture_output=True)

            if process.returncode == 0:

                with open(temp.name, 'r') as cmd_output:
                    for line in cmd_output:
                        fields = line.strip().split('\t')
                        coverage_array[int(fields[0])] = float(fields[4])

                return coverage_array

            else:
                raise AssertionError(process.stderr.decode('utf-8'))
        finally:
            os.remove(temp.name)
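
This variant shells out to an external binary instead of reading the BigWig in Python. Parsing `fields[0]` as the window index and `fields[4]` as the coverage value matches the tab-separated output of UCSC's bigWigAverageOverBed (columns: name, size, covered, sum, mean0, mean), though that identification is inferred from the code, not stated in it. A sketch with a hypothetical output line:

# hypothetical output line: name  size  covered  sum  mean0  mean
line = '42\t1000\t1000\t512.0\t0.512\t0.512'
fields = line.strip().split('\t')
window_index, mean_coverage = int(fields[0]), float(fields[4])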
Example #7
    def using_bigwig(cls,
                     species,
                     query_genes,
                     bigwig_path,
                     rp_map='enhanced_10K',
                     isd_method='chipseq',
                     background_list=[],
                     background_strategy='all',
                     num_background_genes=3000,
                     seed=2556,
                     verbose=4,
                     log=None):
        '''
*classmethod*
**lisa.FromCoverage.using_bigwig** (species, query_genes, bigwig_path, rp_map = 'enhanced_10K', isd_method = 'chipseq', background_list = [], background_strategy = 'all', num_background_genes = 3000, seed = 2556, verbose = 4, log = None)

    Run LISA FromCoverage test using a bigwig coverage file.

    Parameters:
        species: {'hg38', 'mm10'}

        query_genes (list):
            Genes of interest, in either Symbol or RefSeqID format. Must provide between 20 and 500 genes.
        bigwig_path (str):
            Path to bigwig file

    Returns:
        results (lisa.core.utils.LISA_Results): 
            With each key representing a table column, sorted by "summary_p_value" field. The dictionary can be passed directly to the pandas constructor: ``results_df = pd.DataFrame(results.to_dict())``.
        metadata (dict): 
            Test metadata. Includes query genes provided and background genes that were selected.
        '''

        if log is None:
            log = Log()

        coverage_array = cls.convert_bigwig(bigwig_path, species, log=log)

        return cls(species, coverage_array, rp_map=rp_map,
                   isd_method=isd_method, verbose=verbose, log=log)\
            .predict(query_genes,
                     background_list=background_list,
                     background_strategy=background_strategy,
                     num_background_genes=num_background_genes,
                     seed=seed)
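
A usage sketch following the docstring above; the gene list and BigWig path are placeholders:

import pandas as pd
from lisa import FromCoverage  # import path taken from the docstring

query_genes = ['GATA3', 'FOXA1', 'ESR1']  # placeholder; supply 20-500 genes
results, metadata = FromCoverage.using_bigwig('hg38', query_genes, 'signal.bw')

# per the docstring, results converts straight to a DataFrame
results_df = pd.DataFrame(results.to_dict())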
Example #8
class DataInterface:

    _config = h5_config
    data_path = os.path.join(PACKAGE_PATH, 'data')

    @classmethod
    def get_window_bedfile_str(cls, species, window_size):

        genome = cls.load_genome(species, window_size)

        window_strs = []
        for i, window in enumerate(genome.list_windows()):
            window_strs.append(str(window) + '\t' + str(i))

        return '\n'.join(window_strs)

    @classmethod
    def get_metadata_headers(cls, technology):
        return cls._config.get('metadata', technology + '_headers').split(',')

    @classmethod
    def get_dataset_url(cls, species, window_size):
        return cls._config.get('lisa_params', 'h5_path').format(
            path=cls._config.get('cistrome', 'data_url'),
            species=species,
            version=REQURED_DATASET_VERSION,
            window=str(window_size))

    @classmethod
    def get_dataset_path(cls, species, window_size):
        return cls._config.get('lisa_params', 'h5_path').format(
            path=cls.data_path,
            species=species,
            version=REQURED_DATASET_VERSION,
            window=str(window_size))

    #___ DATASET DOWNLOADING ____
    @classmethod
    def fetch_from_cistrome(cls, species, window_size):

        dataset_url = cls.get_dataset_url(species, window_size)

        if not os.path.isdir(cls.data_path):
            os.mkdir(cls.data_path)

        filename, _ = request.urlretrieve(dataset_url)
        os.rename(filename, cls.get_dataset_path(species, window_size))

    @classmethod
    def load_genome(cls, species, window_size):
        return genome_tools.Genome.from_file(
            cls._config.get('genome', 'genome').format(
                package_path=PACKAGE_PATH, species=species),
            window_size=window_size)

    def __init__(self,
                 species,
                 window_size=1000,
                 download_if_not_exists=True,
                 make_new=False,
                 log=None,
                 path=None,
                 load_genes=True):

        self.species = species
        self.window_size = int(window_size)

        if log is None:
            self.log = Log()
        else:
            self.log = log

        if path is None:
            self.path = self.get_dataset_path(self.species, self.window_size)
        else:
            self.path = path

        if make_new:
            h5.File(self.path, 'w').close()
        elif not os.path.isfile(self.path):
            if download_if_not_exists and path is None:
                self.download_data()
            else:
                h5.File(self.path, 'w').close()

        #___ LOAD GENE DATA FROM PACKAGE _____
        self.genome = self.load_genome(self.species, self.window_size)

        if load_genes:
            self.load_genes()

    def load_genes(self):
        self.log.append('Loading gene info ...')
        self.genes = gene_selection.GeneSet.from_refseq(
            self._config.get('genome', 'genes').format(
                package_path=PACKAGE_PATH, species=self.species),
            self.genome)

        self.gene_loc_set = genome_tools.RegionSet(
            [gene.get_tss_region() for gene in self.genes], self.genome)

        self.rp_map_locs = np.array(
            [r.annotation.get_location() for r in self.gene_loc_set.regions])

    def get_install_path(self):
        return self.data_path

    def get_windows(self):
        return '\n'.join(str(r) for r in self.genome.list_windows())

    # ____ RP MAP DATA _____

    @staticmethod
    def _make_basic_rp_map(gene_loc_set, region_set, decay):

        distance_matrix = gene_loc_set.map_intersects(
            region_set,
            lambda x, y: x.get_genomic_distance(y),
            slop_distance=5 * decay)

        distance_matrix.data = np.power(2, -distance_matrix.data / decay)

        return distance_matrix.tocsr()

    def _make_enhanced_rp_map(self, gene_loc_set, region_set, decay):

        #make regions x exons map and exons x genes map
        try:
            indptr, indices, exons = [0], [], []
            for locus in gene_loc_set.regions:
                new_exons = locus.annotation.get_exon_regions()
                exons.extend(new_exons)
                indices.extend(range(indptr[-1], indptr[-1] + len(new_exons)))
                indptr.append(indptr[-1] + len(new_exons))

            exon_gene_map = sparse.csc_matrix(
                (np.ones(len(exons)), indices, indptr),
                shape=(len(exons), len(gene_loc_set.regions)))

            exons = genome_tools.RegionSet(exons, self.genome)
            region_exon_map = region_set.map_intersects(
                exons,
                distance_function=lambda x, y: x.overlaps(
                    y, min_overlap_proportion=0.4),
                slop_distance=0)  #REGIONS X EXONS

            # np.bool is removed in modern NumPy; use the builtin bool
            region_exon_map = region_exon_map.dot(exon_gene_map).astype(bool)

            not_exon_promoter = 1 - region_exon_map.sum(axis=1).astype(bool)

            basic_rp_map = self._make_basic_rp_map(gene_loc_set, region_set,
                                                   decay)

            enhanced_rp_map = basic_rp_map.transpose().multiply(
                not_exon_promoter) + region_exon_map

            return enhanced_rp_map.transpose()

        except Exception as err:
            # log and re-raise; the locals referenced by the old debugging
            # return may not be bound if the failure happened early
            print(repr(err))
            raise

    def build_binned_rp_map(self, style, rp_decay):

        region_set = genome_tools.RegionSet(list(self.genome.list_windows()),
                                            self.genome)

        if style == 'basic':
            return self._make_basic_rp_map(self.gene_loc_set, region_set,
                                           rp_decay)
        elif style == 'enhanced':
            return self._make_enhanced_rp_map(self.gene_loc_set, region_set,
                                              rp_decay)
        else:
            raise NotImplementedError('Unknown rp_map style: ' + str(style))

    @staticmethod
    def set_attributes(dataset, attr_dict):
        for key, value in attr_dict.items():
            dataset.attrs[key] = value

    def get_rp_map_shape(self):
        return (len(self.genes), len(self.genome))

    def add_rp_map(self, style, rp_map):

        assert(rp_map.shape == self.get_rp_map_shape()), \
            'RP map must be of shape (num genes, num bins): ' + str(self.get_rp_map_shape())

        rp_map_path = self._config.get('rp_map', 'rp_map').format(style=style)

        rp_map = rp_map.tocsr()

        with h5.File(self.path, 'a') as data:

            if rp_map_path in data:
                del data[rp_map_path]

            group = data.create_group(rp_map_path)

            group.create_dataset('indptr',
                                 data=rp_map.indptr,
                                 dtype=np.int32,
                                 compression=COMPRESSION)
            group.create_dataset('indices',
                                 data=rp_map.indices,
                                 dtype=np.int32,
                                 compression=COMPRESSION)
            group.create_dataset('data',
                                 data=rp_map.data,
                                 dtype=np.float32,
                                 compression=COMPRESSION)

            self.set_attributes(group, dict(shape=rp_map.shape))

    def get_rp_maps(self):

        try:
            with h5.File(self.path, 'r') as data:
                return list(data['rp_maps'].keys())
        except KeyError:
            return []

    def get_rp_map(self, style):

        rp_map_path = self._config.get('rp_map', 'rp_map').format(style=style)

        with h5.File(self.path, 'r') as data:

            try:
                group = data[rp_map_path]

                rp_map = sparse.csr_matrix(
                    (group['data'][...], group['indices'][...],
                     group['indptr'][...]),
                    shape=group.attrs['shape'])
            except KeyError:
                raise DatasetNotFoundError(rp_map_path)

        return rp_map

    #___ BIN PROJECTION _____

    def check_bin_map_unique(self, bin_map):
        return len(np.unique(bin_map)) == len(bin_map)

    def project_indices(self, indices, bin_map):

        input_hits = sparse.csc_matrix(
            (np.ones_like(indices), indices, [0, len(indices)]))

        input_hits = self.project_sparse_matrix(input_hits, bin_map, None)

        return input_hits.tocoo().row

    @staticmethod
    def project_array(arr, bin_map, num_bins):
        #assert(check_bin_map_unique(bin_map[:,0]) and check_bin_map_unique(bin_map[:,1])), 'To project array, bin_map must have all one-to-one mappings'
        new_arr = np.zeros(num_bins)

        new_arr[bin_map[:, 1]] = arr[bin_map[:, 0]]

        return new_arr

    @staticmethod
    def project_sparse_matrix(input_hits, bin_map, num_bins, binarize=False):

        index_converted = input_hits.tocsc()[bin_map[:, 0], :].tocoo()

        input_hits = sparse.coo_matrix(
            (index_converted.data,
             (bin_map[index_converted.row, 1], index_converted.col)),
            shape=(num_bins, input_hits.shape[1])
            if num_bins is not None else None).tocsr()

        if binarize:
            input_hits.data = np.ones_like(input_hits.data)

        return input_hits

    #___ BINDING FACTOR DATA _____
    def get_factor_hit_path(self, technology, dataset_id):
        return self._config.get('factor_binding',
                                'hits').format(technology=technology,
                                               dataset_id=dataset_id)

    def get_factor_score_path(self, technology, dataset_id):
        return self._config.get('factor_binding',
                                'scores').format(technology=technology,
                                                 dataset_id=dataset_id)

    def get_metadata(self, attributes, technology, dataset_id):
        return {
            dataset_id: {
                key: attributes[key]
                for key in self.get_metadata_headers(technology)
            }
        }

    def transpose_metadata(self, metadata, technology):

        headers = self.get_metadata_headers(technology)
        sample_ids = list(metadata.keys())

        return {
            'sample_id': sample_ids,
            **{
                key: [metadata[sample][key] for sample in sample_ids]
                for key in headers
            }
        }

    def add_binding_data(self,
                         technology,
                         dataset_id,
                         hit_bins,
                         hit_scores=None,
                         **metadata):

        hits_path = self.get_factor_hit_path(technology, dataset_id)
        scores_path = self.get_factor_score_path(technology, dataset_id)

        with h5.File(self.path, 'a') as data:
            if hits_path in data:
                del data[hits_path]

            hits = data.create_dataset(hits_path,
                                       data=np.array(hit_bins),
                                       dtype=np.int32,
                                       compression=COMPRESSION)

            if hit_scores is not None:
                assert (len(hit_bins) == len(hit_scores))
                data.create_dataset(scores_path,
                                    data=np.array(hit_scores),
                                    dtype=np.float64,
                                    compression=COMPRESSION)

            self.set_attributes(hits, metadata)

    def get_binding_dataset(self, technology, dataset_id):

        metadata_headers = self.get_metadata_headers(technology)

        with h5.File(self.path, 'r') as data:

            factor_dataset_path = self.get_factor_hit_path(
                technology, dataset_id)
            scores_path = self.get_factor_score_path(technology, dataset_id)

            try:
                hit_bins = np.array(data[factor_dataset_path][...])

                attributes = data[factor_dataset_path].attrs

                if scores_path in data:
                    scores = np.array(data[scores_path][...])
                else:
                    scores = np.ones_like(hit_bins)

            except KeyError:
                raise DatasetNotFoundError(factor_dataset_path)

            metadata = self.get_metadata(attributes, technology, dataset_id)

        return hit_bins, scores, metadata

    def get_binding_data(self, technology):

        with h5.File(self.path, 'r') as data:

            dataset_ids = list(data[self._config.get(
                'factor_binding',
                'root').format(technology=technology)].keys())

            indices = []
            scores = []
            metadata = dict()
            for dataset_id in dataset_ids:
                hit_bins, hit_scores, sample_meta = self.get_binding_dataset(
                    technology, dataset_id)

                metadata.update(sample_meta)
                indices.append(hit_bins)
                scores.append(hit_scores)

        hits_matrix = indices_list_to_sparse_array(indices, len(self.genome),
                                                   scores)

        return hits_matrix.transpose(), np.array(
            dataset_ids), self.transpose_metadata(metadata, technology)

    def remove_binding_dataset(self, technology, dataset_id):

        factor_dataset_path = self.get_factor_hit_path(technology, dataset_id)

        with h5.File(self.path, 'a') as data:
            del data[factor_dataset_path]

    def list_binding_datasets(self, technology):

        try:
            with h5.File(self.path, 'r') as data:

                dataset_ids = list(data[self._config.get(
                    'factor_binding',
                    'root').format(technology=technology)].keys())

            return dataset_ids

        except KeyError:
            return []

    #____ PROFILE DATA _____
    def add_profile_data(self,
                         technology,
                         dataset_id,
                         profile,
                         rp_maps,
                         rp_map_styles,
                         norm_depth=1e5,
                         **metadata):

        assert (len(rp_maps) == len(rp_map_styles))

        profile_path = self._config.get('profiles', 'profile').format(
            technology=technology, dataset_id=dataset_id)

        profile = np.array(profile)
        if len(profile.shape) == 1:
            profile = profile[:, np.newaxis]
        assert (len(profile.shape) == 2)
        assert (profile.shape[0] == self.genome.num_windows_in_genome())

        if norm_depth is not None:
            profile = profile / profile.sum() * norm_depth

        with h5.File(self.path, 'a') as data:

            if profile_path in data:
                del data[profile_path]

            hits = data.create_dataset(profile_path,
                                       data=profile,
                                       dtype=np.float16,
                                       compression=COMPRESSION)
            self.set_attributes(hits, metadata)

            for rp_map, style in zip(rp_maps, rp_map_styles):

                rp_matrix_path = self._config.get(
                    'profiles', 'rp_matrix_col').format(technology=technology,
                                                        style=style,
                                                        dataset_id=dataset_id)

                if rp_matrix_path in data:
                    del data[rp_matrix_path]

                rp_matrix_col = data.create_dataset(rp_matrix_path,
                                                    data=rp_map.dot(profile),
                                                    dtype=np.float32,
                                                    compression=COMPRESSION)
                self.set_attributes(rp_matrix_col, metadata)

    def remove_profile(self, technology, dataset_id):

        profile_path = self._config.get('profiles', 'profile').format(
            technology=technology, dataset_id=dataset_id)

        with h5.File(self.path, 'a') as data:
            del data[profile_path]

            for style in self.get_rp_maps():
                rp_matrix_col_path = self._config.get(
                    'profiles', 'rp_matrix_col').format(technology=technology,
                                                        style=style,
                                                        dataset_id=dataset_id)
                del data[rp_matrix_col_path]

    def get_profile(self, technology, dataset_id):

        profile_path = self._config.get('profiles', 'profile').format(
            technology=technology, dataset_id=dataset_id)

        with h5.File(self.path, 'r') as data:

            try:
                profile = np.array(data[profile_path][...])

                attributes = data[profile_path].attrs
            except KeyError:
                raise DatasetNotFoundError(profile_path)

            metadata = self.get_metadata(attributes, technology, dataset_id)

        return profile, metadata

    def list_profiles(self, technology):

        profiles_dir = self._config.get('profiles',
                                        'root').format(technology=technology)

        try:
            with h5.File(self.path, 'r') as data:

                dataset_ids = list(data[profiles_dir].keys())

            return dataset_ids
        except KeyError:
            return []

    def get_rp_matrix(self, technology, style):

        with h5.File(self.path, 'r') as data:

            rp_matrix_dir = self._config.get('profiles', 'rp_matrix').format(
                technology=technology, style=style)

            dataset_ids = list(data[rp_matrix_dir].keys())

            slices = []
            for _id in dataset_ids:
                slices.append(np.array(data[rp_matrix_dir][_id][...]))

        return np.concatenate(slices, axis=1), np.array(dataset_ids)

    def download_data(self):

        with self.log.section('Grabbing {} data (~15 minutes):'.format(
                self.species)):

            self.log.append('Downloading from database ...')

            try:
                self.fetch_from_cistrome(self.species, self.window_size)
            except error.URLError as err:
                raise AssertionError(
                    'ERROR: Cannot connect to cistrome.org for data (usually due to security settings on some servers)!\nView github pages for manual dataset install instructions.'
                ) from err

            self.log.append('Done')
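
The weighting in `_make_basic_rp_map` (`np.power(2, -distance / decay)`) halves a region's regulatory-potential contribution for every `decay` base pairs of distance from a gene's TSS, which is presumably what the 10K in the 'enhanced_10K' style refers to. A worked example:

import numpy as np

decay = 10000  # assumed decay distance in base pairs
for distance in [0, 10000, 20000, 50000]:
    weight = np.power(2, -distance / decay)
    print(distance, round(weight, 4))  # 1.0, 0.5, 0.25, 0.0312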