Code Example #1
def main(species,
         window_size,
         motif_metadata,
         bin_sorted_hits,
         group_loci=100000):
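    # stream motif hits (pre-sorted by bin number) into the binding dataset,
    # flushing a sparse matrix chunk roughly every `group_loci` bins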

    motif_metadata = pd.read_csv(motif_metadata, sep='\t', header=None)
    motif_metadata.columns = ['dataset_id', 'factor', 'source']
    motif_metadata = motif_metadata.set_index('dataset_id')
    motif_metadata = motif_metadata.drop_duplicates()

    data = DataInterface(species,
                         window_size=window_size,
                         download_if_not_exists=False,
                         make_new=False,
                         load_genes=False)


    data.create_binding_dataset(TECHNOLOGY, motif_metadata.index.values,
                                **motif_metadata.to_dict('list'))

    id_to_idx_map = dict(
        zip(data.list_binding_datasets(TECHNOLOGY),
            np.arange(len(data.list_binding_datasets(TECHNOLOGY)))))

    current_pos = 0
    last_added_chunk = 0
    i = 0
    rows, cols, scores = [], [], []

    with open(bin_sorted_hits, 'r') as f:

        for line in f:
            motif_id, bin_num, score = line.strip().split()

            bin_num = int(bin_num)

            if bin_num < current_pos:
                raise Exception('Input file not sorted!')
            elif bin_num > current_pos and i >= group_loci:
                print('Adding matrix segment ...')
                # pin the chunk shape so appended segments stay aligned: rows
                # cover every bin since the last flush, columns every dataset
                matrix_form = sparse.coo_matrix(
                    (scores, (rows, cols)),
                    shape=(bin_num - last_added_chunk, len(id_to_idx_map))).tocsr()
                data.append_csr(TECHNOLOGY, matrix_form)
                last_added_chunk = bin_num
                i = 0
                rows, cols, scores = [], [], []

            tf_idx = id_to_idx_map[motif_id]
            rows.append(bin_num - last_added_chunk)
            cols.append(tf_idx)
            scores.append(int(score))
            current_pos = bin_num
            i += 1

        if len(rows) > 0:
            # flush the final partial chunk with the same column width
            matrix_form = sparse.coo_matrix(
                (scores, (rows, cols)),
                shape=(max(rows) + 1, len(id_to_idx_map))).tocsr()
            data.append_csr(TECHNOLOGY, matrix_form)
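
The loader above raises if its input file is not sorted by bin number. A minimal sketch of preparing such a file from the three-column hit rows that Code Example #8 writes; sort_hits_by_bin and its arguments are illustrative helpers, not part of lisa2:

def sort_hits_by_bin(hit_files, out_path):
    # hypothetical helper: gather per-dataset hit rows (motif_id, bin, score,
    # as written by Code Example #8) and emit them sorted by bin number
    hits = []
    for path in hit_files:
        with open(path) as f:
            for line in f:
                motif_id, bin_num, score = line.split()
                hits.append((int(bin_num), motif_id, score))

    hits.sort()  # ascending bin order, as the loader requires

    with open(out_path, 'w') as out:
        for bin_num, motif_id, score in hits:
            print(motif_id, bin_num, score, sep='\t', file=out)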
Code Example #2
    @classmethod
    def convert_bigwig(cls, bigwig, species, bigwig_cmd_path):

        log = Log()

        genome = DataInterface.load_genome(species, cls.window_size)
        coverage_array = np.zeros(len(genome))

        log.append('Converting BigWig file to coverage array ...')

        if not os.path.exists(cls._get_genome_bin_path(species)):
            log.append('Writing bins ...')
            cls._write_genome_bins(species)

        # create the temp file before entering the try block so the finally
        # clause never refers to an undefined name
        temp = tempfile.NamedTemporaryFile('w', delete=False)
        temp.close()

        try:

            # run the external averaging command over the genome bins; its
            # output is parsed below as field 0 = bin index, field 4 = mean
            # coverage
            process = subprocess.run(
                [bigwig_cmd_path, bigwig,
                 cls._get_genome_bin_path(species), temp.name],
                capture_output=True)

            if process.returncode == 0:

                with open(temp.name, 'r') as cmd_output:
                    for line in cmd_output:
                        fields = line.strip().split('\t')
                        coverage_array[int(fields[0])] = float(fields[4])
                    
                return coverage_array
            
            else:
                raise AssertionError(process.stderr.decode('utf-8'))
        finally:
            os.remove(temp.name)
Code Example #3
File: coverage_test.py Project: liulab-dfci/lisa2
    @classmethod
    def convert_bigwig(cls, bigwig, species, log=None):

        if log is None:
            log = Log()

        genome = DataInterface.load_genome(species, cls.window_size)
        coverage_array = np.zeros(len(genome))

        log.append('Converting BigWig file to coverage array ...')

        bar = LoadingBar('Progress', len(genome) // 1000 + 1, cold_start=True)

        # open the BigWig before entering the try block so the finally clause
        # always has a valid handle to close
        coverage_bw = bw.open(bigwig)

        try:

            log.append(bar, update_line=True)

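            # average the BigWig signal over every genome window; windows on
            # chromosomes missing from the BigWig keep zero coverage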
            for i, window in enumerate(genome.list_windows()):

                if window.chromosome in coverage_bw.chroms():
                    mean_coverage = coverage_bw.stats(*window.to_tuple())[0]
                    coverage_array[i] = mean_coverage

                if i % 1000 == 0:
                    log.append(bar, update_line=True)

            return np.nan_to_num(coverage_array)

        finally:
            coverage_bw.close()
Code Example #4
def main(species, window_size, cistrome_metadata, motif_metadata, index_files):

    cistrome_metadata = pd.read_csv(cistrome_metadata,
                                    sep='\t').set_index('DCid')
    cistrome_metadata.index = cistrome_metadata.index.astype(str)
    motif_metadata = pd.read_csv(motif_metadata, sep='\t', header=None)
    motif_metadata.columns = ['dataset_id', 'factor', 'source']
    motif_metadata = motif_metadata.set_index('dataset_id')
    motif_metadata = motif_metadata.drop_duplicates()

    data = DataInterface(species,
                         window_size=window_size,
                         download_if_not_exists=False,
                         make_new=False,
                         load_genes=False)

    dataset_ids = motif_metadata.index.values

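    # register the 'Motif' binding dataset for these ids (Code Example #6
    # then adds per-dataset hit indices)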
    data.create_binding_dataset('Motif', dataset_ids)
Code Example #5
def main(species, motif_bed, window_size, gamma_threshold=0.95):

    genome = DataInterface.load_genome(species, window_size)

    log = Log(target=stderr)

    factor_name = None
    window_nums, scores = [], []

    with gzip.open(motif_bed, 'rb') as f:

        bed = f.readlines()

        bar = LoadingBar('Binning {} motif hits'.format(str(len(bed))),
                         len(bed),
                         cold_start=True)

        for i, line in enumerate(bed):

            chrom, start, end, factor, relscore, log_pval, strand = line.decode(
                'utf-8').strip().split('\t')

            if i == 0:
                factor_name = factor

            try:
                hit_windows = genome.get_region_windows(
                    Region(chrom, start, end))
                window_nums.extend(hit_windows)

                scores.extend([float(log_pval) / 100] * len(hit_windows))

            except BadRegionError:
                pass

            log.append(bar, update_line=True)

    log.append('')

    log.append('Done')

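    # assemble a one-column sparse matrix of window scores; the COO round
    # trip sums the scores of windows hit more than once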
    hits = sparse.csc_matrix((scores, window_nums, [0, len(window_nums)]),
                             shape=(len(genome), 1)).tocoo().tocsc()

    sample_hit_scores = np.random.choice(np.array(hits.todense()).reshape(-1),
                                         size=10000)

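    # fit a gamma distribution to a random sample of window scores and use
    # its `gamma_threshold` quantile as the minimum hit score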
    min_bin_score = gamma(*gamma.fit(sample_hit_scores)).ppf(gamma_threshold)

    hit_indices = hits.indices[(hits.data >= min_bin_score) & (hits.data > 0)]

    return hit_indices, factor_name
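
The gamma cutoff above can be sanity-checked in isolation. A self-contained sketch on synthetic scores (not lisa2 data), assuming only numpy and scipy:

import numpy as np
from scipy.stats import gamma

rng = np.random.default_rng(0)
scores = rng.gamma(shape=2.0, scale=1.5, size=10000)  # synthetic stand-ins

# gamma.fit returns (shape, loc, scale); freezing the fitted distribution and
# taking its 0.95 quantile mirrors the gamma_threshold cutoff used above
cutoff = gamma(*gamma.fit(scores)).ppf(0.95)
print(round(cutoff, 2), (scores >= cutoff).mean())  # ~5% of scores survive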
Code Example #6
def main(species, window_size, cistrome_metadata, motif_metadata, index_files):

    cistrome_metadata = pd.read_csv(cistrome_metadata,
                                    sep='\t').set_index('DCid')
    cistrome_metadata.index = cistrome_metadata.index.astype(str)
    motif_metadata = pd.read_csv(motif_metadata, sep='\t', header=None)
    motif_metadata.columns = ['dataset_id', 'factor', 'source']
    motif_metadata = motif_metadata.set_index('dataset_id')
    motif_metadata = motif_metadata.drop_duplicates()

    data = DataInterface(species,
                         window_size=window_size,
                         download_if_not_exists=False,
                         make_new=False,
                         load_genes=False)

    for index_file in index_files:

        with open(index_file, 'r') as f:
            hit_bins = np.array([int(ind.strip()) for ind in f.readlines()])

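        # index files are named "<technology>_<dataset_id>.<extension>";
        # recover both parts from the basename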
        technology, dataset_id = os.path.basename(index_file).split('_')

        dataset_id = '.'.join(dataset_id.split('.')[:-1])

        metadata_headers = data.get_metadata_headers(technology)

        if technology == 'Motifs':
            meta_dict = motif_metadata.loc[dataset_id,
                                           metadata_headers].to_dict()
            meta_dict['source'] = 'jaspar'
        else:
            meta_dict = cistrome_metadata.loc[dataset_id,
                                              metadata_headers].to_dict()

        data.add_binding_data(technology, dataset_id, hit_bins, **meta_dict)
Code Example #7
def main(species, window_size, path):

    region_fields = parse_bedfile(path, header=False)

    regions = [Region(*r) for r in region_fields]

    genome = DataInterface.load_genome(species, window_size)

    indices = []

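    # collect the indices of every genome window each region overlaps,
    # skipping regions that fall outside the indexed genome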
    for region in regions:
        try:
            windows = genome.get_region_windows(region)
            indices.extend(windows)
        except BadRegionError:
            pass

    return list(set(indices))
Code Example #8
def main(*,species, motif_bed, window_size, dataset_id, output):

    genome = DataInterface.load_genome(species, window_size)

    factor_name = None
    window_nums, scores = [], []

    # fixed score cutoff: JASPAR bed scores store -log10(p-value) scaled by
    # 100 (compare Code Example #5), so 430 keeps hits with p < 10^-4.3
    pval_cutoff = 430

    with open(output, 'w') as o:

        with gzip.open(motif_bed, 'rb') as bed:

            for i, line in enumerate(bed):
                
                chrom, start, end, factor, relscore, log_pval, strand = line.decode('utf-8').strip().split('\t')
                
                if i == 0:
                    factor_name = factor
                    print('Binning {} motifs with pval cutoff of {} ...'.format(factor_name.upper(), str(pval_cutoff)), file=stderr)

                neg_log10_pval = int(log_pval)

                if neg_log10_pval >= pval_cutoff:

                    try:
                        hit_windows = genome.get_region_windows(Region(chrom, start, end))
                        
                        for hit_window in hit_windows:
                            print(dataset_id, hit_window, neg_log10_pval, sep='\t', file=o)

                    except BadRegionError:
                        pass
    
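    # hit rows went to the output file; emit this dataset's metadata row
    # (dataset_id, factor, source) on stdout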
    print(dataset_id, factor_name.upper(), 'JASPAR', sep='\t')
Code Example #9
    @classmethod
    def _write_genome_bins(cls, species):

        bedstr = DataInterface.get_window_bedfile_str(species, cls.window_size)

        # _get_genome_bin_path takes only the species (see Code Example #2)
        with open(cls._get_genome_bin_path(species), 'w') as bed:
            bed.write(bedstr)
Code Example #10
File: append_profiles.py Project: liulab-dfci/lisa2
def main(args):

    cistrome_metadata = pd.read_csv(args.cistrome_metadata,
                                    sep='\t').set_index('DCid')
    cistrome_metadata.index = cistrome_metadata.index.astype(str)

    data = DataInterface(args.species,
                         window_size=args.window_size,
                         download_if_not_exists=False,
                         make_new=False,
                         load_genes=True)

    rp_map_styles = data.get_rp_maps()

    if len(rp_map_styles) == 0:

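        # no RP maps cached yet: build both styles and store them for reuse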
        basic_rp_map = data.build_binned_rp_map('basic', 10000)
        enhanced_rp_map = data.build_binned_rp_map('enhanced', 10000)

        data.add_rp_map('basic_10K', basic_rp_map)
        data.add_rp_map('enhanced_10K', enhanced_rp_map)

    else:

        basic_rp_map = data.get_rp_map('basic_10K')
        enhanced_rp_map = data.get_rp_map('enhanced_10K')

    for arr_name in args.coverage_arrays:

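        # each array holds per-window coverage (see the convert_bigwig
        # examples above), saved as "<technology>_<dataset_id>.<ext>"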
        coverage_array = np.load(arr_name)

        technology, dataset_id = os.path.basename(arr_name).split('_')

        dataset_id = '.'.join(dataset_id.split('.')[:-1])

        metadata_headers = data.get_metadata_headers(technology)

        meta_dict = cistrome_metadata.loc[dataset_id,
                                          metadata_headers].to_dict()

        data.add_profile_data(technology, dataset_id, coverage_array,
                              [basic_rp_map, enhanced_rp_map],
                              ['basic_10K', 'enhanced_10K'], **meta_dict)