Exemple #1
0
def lisa_multi(args):
    """Run a LISA prediction for every query gene list and print a summary.

    Each query file is read in full up front and keyed by the basename of
    its ``name``. The lists are then modeled one at a time; a list that
    raises ``AssertionError`` is logged and skipped so the remaining lists
    still run.

    Args:
        args: Parsed options object. This function reads ``verbose``,
            ``species`` and ``query_lists`` (objects exposing ``name`` and
            ``readlines``), and also hands ``args`` to ``extract_kwargs``
            and ``save_and_get_top_TFs``.

    Raises:
        MultiError: After every list has been processed and the summary has
            been printed, if at least one list raised ``AssertionError``.
    """
    log = Log(target=sys.stderr, verbose=args.verbose)
    instantiation_kwargs = extract_kwargs(args, INSTANTIATION_KWARGS)
    lisa = FromGenes(args.species, **instantiation_kwargs, log=log)

    # Read every query file before modeling starts.
    query_dict = {}
    for handle in args.query_lists:
        query_dict[os.path.basename(handle.name)] = handle.readlines()

    encountered_error = False
    results_summary = []
    for name, genes in query_dict.items():
        with log.section('Modeling {}:'.format(str(name))):
            try:
                prediction_kwargs = extract_kwargs(args, PREDICTION_KWARGS)
                results, metadata = lisa.predict(genes, **prediction_kwargs)
                summary_row = (
                    name,
                    save_and_get_top_TFs(args, name, results, metadata),
                )
                results_summary.append(summary_row)
            except AssertionError as err:
                # Record the failure but keep going with the other lists.
                encountered_error = True
                log.append('ERROR: ' + str(err))

    print_results_multi(results_summary)

    if encountered_error:
        raise MultiError('One or more genelists raised an error')
Exemple #2
0
def lisa_deseq(args):
    """Run LISA on the up- and down-regulated genes parsed from a DESeq file.

    The DESeq file is split into two gene lists by ``parse_deseq_file``;
    each list is modeled in turn under the labels ``'up-regulated'`` and
    ``'down-regulated'``. A list that raises ``AssertionError`` is logged
    and skipped so the other list still runs.

    Args:
        args: Parsed options object. This function reads ``verbose``,
            ``species``, ``deseq_file``, ``lfc_cutoff``, ``pval_cutoff``
            and ``sep``, and also hands ``args`` to ``extract_kwargs`` and
            ``save_and_get_top_TFs``.

    Raises:
        MultiError: After both lists have been processed and the summary
            has been printed, if either raised ``AssertionError``.
    """
    log = Log(target=sys.stderr, verbose=args.verbose)
    instantiation_kwargs = extract_kwargs(args, INSTANTIATION_KWARGS)
    lisa = FromGenes(args.species, **instantiation_kwargs, log=log)

    up_genes, down_genes = parse_deseq_file(
        args.deseq_file,
        lfc_cutoff=args.lfc_cutoff,
        pval_cutoff=args.pval_cutoff,
        sep=args.sep,
    )
    labelled_lists = (
        ('up-regulated', up_genes),
        ('down-regulated', down_genes),
    )

    encountered_error = False
    results_summary = []
    for label, genes in labelled_lists:
        with log.section('Modeling {}:'.format(str(label))):
            try:
                prediction_kwargs = extract_kwargs(args, PREDICTION_KWARGS)
                results, metadata = lisa.predict(genes, **prediction_kwargs)
                summary_row = (
                    label,
                    save_and_get_top_TFs(args, label, results, metadata),
                )
                results_summary.append(summary_row)
            except AssertionError as err:
                # Record the failure but still model the remaining list.
                encountered_error = True
                log.append('ERROR: ' + str(err))

    print_results_multi(results_summary)

    if encountered_error:
        raise MultiError('One or more genelists raised an error')
Exemple #3
0
    def convert_bigwig(cls, bigwig, species, bigwig_cmd_path):
        """Convert a BigWig file into a per-window coverage array.

        Runs the external program at ``bigwig_cmd_path`` with the BigWig
        file, the genome bins file and a temporary output path, then reads
        the output back into a NumPy array with one entry per genome window.

        Args:
            bigwig: Path to the BigWig file, passed to the external command.
            species: Species identifier used to load the genome and to
                locate (or write) the genome bins file.
            bigwig_cmd_path: Path of the executable to run.

        Returns:
            numpy.ndarray: Array of length ``len(genome)``; windows absent
            from the command output stay at 0.

        Raises:
            AssertionError: If the command exits with a non-zero status; the
                message is the command's decoded stderr.
        """
        log = Log()

        genome = DataInterface.load_genome(species, cls.window_size)
        coverage_array = np.zeros(len(genome))

        log.append('Converting BigWig file to coverage array ...')

        bins_path = cls._get_genome_bin_path(species)
        if not os.path.exists(bins_path):
            log.append('Writing bins ...')
            cls._write_genome_bins(species)

        # Create the temp file BEFORE entering the try block: if creation
        # fails there is nothing to clean up, and the ``finally`` clause must
        # not trip over an unbound ``temp`` and mask the real error.
        temp = tempfile.NamedTemporaryFile('w', delete=False)
        try:
            # Only the path is needed; the external command writes the file.
            temp.close()

            process = subprocess.run(
                [bigwig_cmd_path, bigwig, bins_path, temp.name],
                capture_output=True)

            if process.returncode != 0:
                raise AssertionError(process.stderr.decode('utf-8'))

            with open(temp.name, 'r') as cmd_output:
                for line in cmd_output:
                    fields = line.strip().split('\t')
                    # Column 0 is used as the window index and column 4 as
                    # the value stored for that window.
                    coverage_array[int(fields[0])] = float(fields[4])

            return coverage_array
        finally:
            os.remove(temp.name)
Exemple #4
0
    def __init__(self,
                 species,
                 window_size=1000,
                 download_if_not_exists=True,
                 make_new=False,
                 log=None,
                 path=None,
                 load_genes=True):
        """Set up the interface and make sure its backing data file exists.

        Args:
            species: Species identifier; stored and used to resolve the
                default dataset path and to load the genome.
            window_size: Genome window size; coerced with ``int``.
            download_if_not_exists: If the data file is missing and no
                explicit ``path`` was given, call ``download_data``.
            make_new: If true, always (re)create the file by opening it in
                ``'w'`` mode.
            log: Logger to use; a fresh ``Log()`` is created when ``None``.
            path: Explicit data file path; when ``None`` the path comes
                from ``get_dataset_path``.
            load_genes: If true, call ``self.load_genes()`` at the end.
        """
        self.species = species
        self.window_size = int(window_size)
        self.log = Log() if log is None else log
        self.path = (self.get_dataset_path(self.species, self.window_size)
                     if path is None else path)

        # The file-existence check is skipped entirely when make_new is set.
        dataset_missing = not make_new and not os.path.isfile(self.path)
        may_download = download_if_not_exists and path is None

        if dataset_missing and may_download:
            self.download_data()
        elif make_new or dataset_missing:
            # Opening in 'w' mode and closing leaves a new empty file.
            h5.File(self.path, 'w').close()

        #___ LOAD GENE DATA FROM PACKAGE _____
        self.genome = self.load_genome(self.species, self.window_size)

        if load_genes:
            self.load_genes()
Exemple #5
0
    def convert_bigwig(cls, bigwig, species, log=None):
        """Convert a BigWig file into a per-window coverage array.

        Opens ``bigwig`` through the ``bw`` module and, for every genome
        window whose chromosome is present in the file, stores the first
        value returned by ``stats`` for that window.

        Args:
            bigwig: BigWig file to open with ``bw.open``.
            species: Species identifier used to load the genome.
            log: Logger to report progress to; a fresh ``Log()`` is created
                when ``None``.

        Returns:
            numpy.ndarray: Array of length ``len(genome)`` passed through
            ``np.nan_to_num``; windows on chromosomes absent from the file
            stay at 0.
        """
        if log is None:
            log = Log()

        genome = DataInterface.load_genome(species, cls.window_size)
        coverage_array = np.zeros(len(genome))

        log.append('Converting BigWig file to coverage array ...')

        bar = LoadingBar('Progress', len(genome) // 1000 + 1, cold_start=True)

        # Open BEFORE the try block: if opening fails there is no handle to
        # close, and the ``finally`` clause must not hit an unbound name and
        # mask the original error.
        coverage_bw = bw.open(bigwig)
        try:
            log.append(bar, update_line=True)

            # The chromosome listing does not change while iterating, so
            # fetch it once instead of once per window.
            chromosomes = coverage_bw.chroms()

            for i, window in enumerate(genome.list_windows()):

                if window.chromosome in chromosomes:
                    mean_coverage = coverage_bw.stats(*window.to_tuple())[0]
                    coverage_array[i] = mean_coverage

                # Advance the progress bar once every 1000 windows.
                if i % 1000 == 0:
                    log.append(bar, update_line=True)

            # NOTE(review): presumably windows without data come back as
            # None/NaN from stats(); nan_to_num turns NaN into 0 -- confirm.
            return np.nan_to_num(coverage_array)

        finally:
            coverage_bw.close()
Exemple #6
0
def main(species, motif_bed, window_size, gamma_threshold=0.95):
    """Bin motif hits into genome windows and keep the high-scoring windows.

    Reads a gzip-compressed, tab-separated file with seven columns per line
    (chrom, start, end, factor, relscore, log_pval, strand), maps every hit
    to the genome windows it falls in, and scores each window entry with
    ``log_pval / 100``. A gamma distribution is fitted to 10000 randomly
    sampled per-window scores and its ``gamma_threshold`` quantile is used
    as the cutoff.

    Args:
        species: Species identifier used to load the genome.
        motif_bed: Path to the gzip-compressed motif hits file.
        window_size: Genome window size used to load the genome.
        gamma_threshold: Quantile of the fitted gamma distribution used as
            the minimum window score.

    Returns:
        tuple: ``(hit_indices, factor_name)`` -- the indices of windows whose
        score is positive and at least the cutoff, and the factor column of
        the first line (``None`` when the file has no lines).
    """
    genome = DataInterface.load_genome(species, window_size)

    log = Log(target=stderr)

    factor_name = None
    window_nums = []
    scores = []

    with gzip.open(motif_bed, 'rb') as bed_file:

        records = bed_file.readlines()
        num_records = len(records)

        bar = LoadingBar('Binning {} motif hits'.format(str(num_records)),
                         num_records,
                         cold_start=True)

        for index, raw_line in enumerate(records):

            columns = raw_line.decode('utf-8').strip().split('\t')
            chrom, start, end, factor, relscore, log_pval, strand = columns

            # The factor name is taken from the first record only.
            if index == 0:
                factor_name = factor

            try:
                hit_windows = genome.get_region_windows(
                    Region(chrom, start, end))
                window_nums.extend(hit_windows)

                window_score = float(log_pval) / 100
                scores.extend([window_score] * len(hit_windows))

            except BadRegionError:
                # Hits that raise BadRegionError are ignored.
                pass

            log.append(bar, update_line=True)

    log.append('')

    log.append('Done')

    # Single-column sparse matrix with one row per genome window.
    column_pointer = [0, len(window_nums)]
    raw_hits = sparse.csc_matrix((scores, window_nums, column_pointer),
                                 shape=(len(genome), 1))
    # NOTE(review): the COO round-trip presumably consolidates (sums)
    # duplicate window entries -- confirm against SciPy's conversion docs.
    hits = raw_hits.tocoo().tocsc()

    dense_scores = np.array(hits.todense()).reshape(-1)
    sample_hit_scores = np.random.choice(dense_scores, size=10000)

    fitted_gamma = gamma(*gamma.fit(sample_hit_scores))
    min_bin_score = fitted_gamma.ppf(gamma_threshold)

    passes_cutoff = (hits.data >= min_bin_score) & (hits.data > 0)
    hit_indices = hits.indices[passes_cutoff]

    return hit_indices, factor_name
Exemple #7
0
    def using_bigwig(cls,
                     species,
                     query_genes,
                     bigwig_path,
                     rp_map='enhanced_10K',
                     isd_method='chipseq',
                     background_list=[],
                     background_strategy='all',
                     num_background_genes=3000,
                     seed=2556,
                     verbose=4,
                     log=None):
        '''
*classmethod*
**lisa.FromCoverage.using_bigwig** (species, query_genes, bigwig_path, rp_map = 'basic', rp_decay = 10000, isd_method = 'chipseq', background_list = [], background_strategy = 'all', num_background_genes = 3000, seed = 2556, header = False, verbose = 4, log = None)

    Run LISA FromCoverage test using a bigwig coverage file.

    Parameters:
        species: {'hg38', 'mm10'}

        query_genes (list): 
            Genes-of-interest, in either Symbol of RefSeqID format. Must provide between 20 to 500 genes.
        bigwig_path (str): 
            Path to bigwig file

    Returns:
        results (lisa.core.utils.LISA_Results): 
            With each key representing a table column, sorted by "summary_p_value" field. The dictionary can be passed directly to a the pandas constructor: ``results_df = pd.DataFrame(results.to_dict())``.
        metadata (dict): 
            Test metadata. Includes query genes provided and background genes that were selected.
        '''

        if log is None:
            log = Log()

        coverage_array = cls.convert_bigwig(bigwig_path, species, log=log)

        return cls(species, coverage_array, rp_map = rp_map, isd_method=isd_method, verbose=verbose, log=log)\
            .predict(query_genes, background_list=background_list, background_strategy=background_strategy, num_background_genes=num_background_genes,
            seed=seed)