def sequentialJaccardBed():
    sequential_time = time.time()
    bed_pairs = jaccardBed()
    jaccard_scores = []
    error_bed_pairs = []

    for pair in bed_pairs:
        try:
            file1 = pr.read_bed(pair[0])
            file2 = pr.read_bed(pair[1])
            jaccard_score = file1.stats.jaccard(file2)
            jaccard_scores.append(jaccard_score)
        except Exception as error:
            print('error')
            print(error)
            # Record the offending file paths; file1/file2 may be unbound if read_bed failed.
            error_bed_pairs.append([pair[0], pair[1]])

    print('scores')
    print(jaccard_scores)
    print('errors')
    print(error_bed_pairs)

    print('Sequentially calculating jaccard scores --- %.2f seconds ---' %
          (time.time() - sequential_time))
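# A minimal usage sketch of the jaccard computation used above; the BED paths
# below are hypothetical placeholders.
import pyranges as pr

gr_a = pr.read_bed("peaks_a.bed")
gr_b = pr.read_bed("peaks_b.bed")
# Jaccard statistic between the two interval sets, as in the loop body above.
score = gr_a.stats.jaccard(gr_b)
print(score)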
Example 2
    def calc_windowed_seg_sites(self, chrom=0, L=1e3, filt_rec=True, mask=None):
        """Calculate windowed estimates of segregating sites.

        Arguments:
            * chrom: identifier for the chromosome
            * L: length (in bp) of each window / independent locus
            * filt_rec: filter out sites with no change in recombination position
            * mask: BED file of regions to mask out

        """
        assert self.chrom_pos_dict is not None
        phys_pos = self.chrom_physpos_dict[chrom]
        rec_pos = self.chrom_pos_dict[chrom]
        weights = self.chrom_weight_dict[chrom]
        if filt_rec:
            diff = np.abs(rec_pos[:-1] - rec_pos[1:])
            idx = np.where(diff != 0)[0]
            phys_pos = phys_pos[idx]
            rec_pos = rec_pos[idx]
            weights = weights[idx]
        if mask is not None:
            phys_pos = phys_pos.astype(np.float64)
            df_mask = pyranges.read_bed(mask)
            df_pos = PyRanges(chromosomes=chrom, starts=phys_pos, ends=(phys_pos + 1))
            cov_sites = df_pos.coverage(df_mask)
            sites_idx = np.array(cov_sites.FractionOverlaps.astype(np.float32))
            idx = np.where(sites_idx > 0.0)[0]
            phys_pos[idx] = np.nan
        # 1. Setup the bins for the analysis
        bins = np.arange(np.nanmin(phys_pos), np.nanmax(phys_pos), L)
        windowed_vars, bin_edges = np.histogram(
            phys_pos[~np.isnan(phys_pos)],
            bins=bins,
            weights=weights[~np.isnan(phys_pos)],
        )
        bin_edges = bin_edges.astype(np.uint32)
        # Interpolate the midpoints of the recombination bins
        f = interpolate.interp1d(phys_pos, rec_pos)
        midpts = bin_edges[:-1] + (bin_edges[1:] - bin_edges[:-1]) / 2
        rec_midpts = f(midpts)
        # Calculate the weightings from the mask as needed ...
        mask_weights = np.ones(rec_midpts.size)
        if mask is not None:
            # Mask must be a bedfile
            df_windows = PyRanges(
                chromosomes=chrom, starts=bin_edges[:-1], ends=bin_edges[1:]
            )
            df_mask = pyranges.read_bed(mask)
            cov = df_windows.coverage(df_mask)
            mask_weights = np.array(cov.FractionOverlaps.astype(np.float32))
            # Set the mask weights to scale up the fraction that may be missing!
            mask_weights = 1.0 / (1.0 - mask_weights)
            mask_weights[np.isinf(mask_weights)] = np.nan

        # Stacking all of the data to make sure that we can use it later on
        tot_data = np.vstack([windowed_vars, bin_edges[1:], rec_midpts, mask_weights])
        self.chrom_total_dict[chrom] = tot_data
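# A small self-contained sketch of the mask-weighting step above: windows are
# intersected with a mask PyRanges and the overlap fraction becomes an inverse
# coverage weight. Coordinates below are made up for illustration.
import numpy as np
from pyranges import PyRanges

windows = PyRanges(chromosomes="1", starts=[0, 1000, 2000], ends=[1000, 2000, 3000])
mask = PyRanges(chromosomes="1", starts=[500], ends=[1500])

cov = windows.coverage(mask)  # adds NumberOverlaps / FractionOverlaps columns
frac = np.array(cov.FractionOverlaps.astype(np.float32))

weights = 1.0 / (1.0 - frac)         # scale up for the masked-out fraction
weights[np.isinf(weights)] = np.nan  # fully masked windows become NaN
print(weights)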
Example 3
def countContexts(fastaFilePath, whiteListBed=None, blackListBed=None):
    debug(f"Starting to count contexts of nucleotides in {fastaFilePath}")

    triNucCounts = defaultdict(int)
    diNucCounts = defaultdict(int)
    # open the fastaFile
    with FastaFile(fastaFilePath) as fastaFile:

        # if we do not have a whitelist to start out, we make one from the fasta, which includes
        # everything
        if whiteListBed is None:
            wlObj = from_dict(
                {
                    "Chromosome": fastaFile.references,
                    "Start": [1] * fastaFile.nreferences,
                    "End": fastaFile.lengths,
                }
            )
        else:
            # we cast this to string, because pyranges wants string and we use the Path type
            wlObj = read_bed(str(whiteListBed))
            wlObj = wlObj.merge()

        # if we have a blacklist, we subtract it from the whitelist; otherwise we leave the
        # whitelist as it is
        if blackListBed is not None:
            # we cast this to string, because pyranges wants string and we use the Path type
            blObj = read_bed(str(blackListBed))
            blObj = blObj.merge()
            wlObj = wlObj.subtract(blObj)
            # no need to merge again here, as subtracting only leaves fewer ranges than before

        # while we could use the get_fasta function from pyranges, it needs another
        # dependency (pyfaidx) and is slower (from my preliminary testing)
        # iterate over all chromosomes and each of their ranges
        for chr, df in wlObj:
            # iterrows has to return the index, even though we don't use it
            for idx, region in df.iterrows():
                seq = fastaFile.fetch(
                    reference=chr, start=region["Start"], end=region["End"]
                )

                for i in range(len(seq) - 2):
                    diNucCounts[seq[i : i + 2]] += 1
                    triNucCounts[seq[i : i + 3]] += 1
            debug(f"contect frequency analysis complete for chromsome {chr}")

    return (diNucCounts, triNucCounts)
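# A hedged usage sketch for countContexts; the file names are placeholders and
# the whitelist/blacklist BEDs are optional.
di_counts, tri_counts = countContexts(
    "genome.fa",
    whiteListBed="callable_regions.bed",
    blackListBed="repeats.bed",
)
print(di_counts["CG"], tri_counts["ACG"])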
def exonOverlap(args, df):

    exon_frame = pr.read_bed(args.exons)
    exon_overlap = PyRanges(df).join(exon_frame).drop(like="_b")

    if exon_overlap.df.empty:
        exon_calls = pd.DataFrame()
    else:
        exon_calls = exon_overlap.df.drop(
            columns=['Chromosome', 'Start', 'End']).rename(
                columns={
                    'Name': 'gene',
                    'Score': 'OMIM_syndrome'
                }).drop_duplicates()
        # if args.genelist:
        #     #gene_list = pd.read_csv(args.genelist, sep='\t', names=['Gene'], header=None)
        exon_calls = exon_calls.merge(args.genelist, on=['gene'], how='left')
        exon_calls.fillna(value={'score': 0, 'normalized_score': 0}, inplace=True)
        exon_calls = exon_calls.sort_values(by='score', ascending=False)

    return exon_calls
def geneOverlapTransloInv(args, sample_start, sample_end, sample_frame):

    gene_frame = pr.read_bed(args.genes)
    gene_start = PyRanges(sample_start).join(gene_frame[["Name", "Score"
                                                         ]]).drop(like="_b")
    gene_end = PyRanges(sample_end).join(gene_frame[["Name",
                                                     "Score"]]).drop(like="_b")

    if gene_start.df.empty and gene_end.df.empty:
        sample_frame['Name'] = sample_frame['Name2'] = sample_frame[
            'Score'] = sample_frame['Score2'] = 'None'
    elif gene_start.df.empty:
        sample_frame = gene_end.df.rename(columns={
            'Name': 'Name2',
            'Score': 'Score2'
        }).filter(items=['SmapEntryID', 'Name2', 'Score2']).drop_duplicates().merge(
            sample_frame, on=['SmapEntryID'], how='right')
        sample_frame['Name'] = sample_frame['Score'] = 'None'
    elif gene_end.df.empty:
        sample_frame = gene_start.df.filter(
            items=['SmapEntryID', 'Name', 'Score']).drop_duplicates().merge(
                sample_frame, on=['SmapEntryID'], how='right')
        sample_frame['Name2'] = sample_frame['Score2'] = 'None'
    else:
        sample_frame = gene_start.df.filter(
            items=['SmapEntryID', 'Name', 'Score']).drop_duplicates().merge(
                sample_frame, on=['SmapEntryID'], how='right')
        sample_frame = sample_frame.merge(gene_end.df.rename(columns={
            'Name': 'Name2',
            'Score': 'Score2'
        }).filter(items=['SmapEntryID', 'Name2', 'Score2']),
                                          on=['SmapEntryID'],
                                          how='left')

    return sample_frame
Example 6
    def _read_intervals(gtf_path=None,
                        bed_path=None,
                        pranges=None,
                        intervals=None,
                        interval_attrs=None,
                        duplicate_attr=False):
        alternatives = [bed_path, pranges, intervals, gtf_path]
        if sum(i is not None for i in alternatives) != 1:
            raise ValueError('only one of `gtf_path`, `bed_path`, `pranges`, '
                             'or `intervals` should be given as input.')
        if gtf_path:
            import pyranges
            pranges = pyranges.read_gtf(gtf_path,
                                        duplicate_attr=duplicate_attr)

        elif bed_path:
            import pyranges
            pranges = pyranges.read_bed(bed_path)

        elif intervals:
            if interval_attrs is not None:
                raise ValueError(
                    '`interval_attrs` is not valid with `intervals`')

            pranges = intervals_to_pyranges(intervals)

        return pranges
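# A minimal call sketch for the dispatcher above, assuming it is exposed as a
# static helper; exactly one interval source may be given (paths hypothetical).
gr_from_bed = _read_intervals(bed_path="regions.bed")
gr_from_gtf = _read_intervals(gtf_path="annotation.gtf", duplicate_attr=True)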
def polyAClusterDetected(fastaPath, infile, gene_bed, out_suffix, threads):
    # Read the gene_bed annotation
    try:
        os.mkdir(out_suffix)
    except FileExistsError:
        logger.warning(f"{out_suffix} already exists!")
    gene_model = pr.read_bed(gene_bed, as_df=True)
    gene_model = gene_model.set_index(["Name"])

    results = []
    with ProcessPoolExecutor(max_workers=threads) as e:
        for gene_id in gene_model.index:
            if gene_model.at[gene_id, "Chromosome"] not in {"Mt", "Pt"}:
                results.append(e.submit(get_three_end, infile, gene_id, gene_model))

    o_polya_cluster = open(f"{out_suffix}polya_cluster.bed", "w")
    o_polya_cluster_summit = open(f"{out_suffix}polya_cluster.summit.bed", "w")
    o_major_polya_cluster = open(f"{out_suffix}major_polya_cluster.bed", "w")
    o_major_polya_cluster_summit = open(
        f"{out_suffix}major_polya_cluster_summit.bed", "w"
    )
    o_last_polya_cluster = open(f"{out_suffix}last_polya_cluster.bed", "w")
    o_last_polya_cluster_summit = open(
        f"{out_suffix}last_polya_cluster_summit.bed", "w"
    )

    for res in results:
        result = res.result()
        if result is not None:
            (
                polya_cluster,
                polya_cluster_summit,
                polya_cluster_major,
                polya_cluster_summit_major,
                polya_cluster_last,
                polya_cluster_summit_last,
            ) = result
            for item in polya_cluster:
                o_polya_cluster.write(item)
            for item in polya_cluster_summit:
                o_polya_cluster_summit.write(item)

            o_major_polya_cluster.write(polya_cluster_major)
            o_major_polya_cluster_summit.write(polya_cluster_summit_major)
            o_last_polya_cluster.write(polya_cluster_last)
            o_last_polya_cluster_summit.write(polya_cluster_summit_last)

    o_polya_cluster.close()
    o_polya_cluster_summit.close()
    o_major_polya_cluster.close()
    o_major_polya_cluster_summit.close()
    o_last_polya_cluster.close()
    o_last_polya_cluster_summit.close()

    filterPAC(
        fastaPath,
        f"{out_suffix}polya_cluster.bed",
        f"{out_suffix}polya_cluster.summit.bed",
        f"{out_suffix}polya_cluster.filtered.bed",
    )
def main():
    # I/O directories
    input_data_dir = "input_data/"
    output_data_dir = "output_matrix/"

    working_dir = os.getcwd()  # Sets the working directory

    input_dir = os.path.join(
        working_dir,
        input_data_dir)  # get the path to the data input directory
    output_dir = os.path.join(
        working_dir, output_data_dir
    )  # Sets the directory where all the saved outputs will be stored
    if not os.path.exists(input_dir):
        os.makedirs(input_dir)  # create input directory if not already present
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # DATA INPUT =========================================
    # genes_lengths_path = os.path.join(input_dir,"gene_lengths.csv")
    genes_lengths_path = os.path.join(
        working_dir, "gene_lengths.csv"
    )  # path to the file containing each gene's ID and the corresponding gene length
    genes = pd.read_csv(genes_lengths_path)
    # remove an unneeded column from the "genes" dataframe
    # and rename the remaining columns
    genes = genes.drop(genes.columns[0], axis=1)
    genes = genes.rename(columns={
        genes.columns[0]: "GeneID",
        genes.columns[1]: "GeneLength"
    })

    bed_table_FP_path_list = [
        os.path.abspath(os.path.join(input_dir, f)) for f in listdir(input_dir)
        if isfile(os.path.join(input_dir, f)) if f.endswith(".bed")
    ]  # list all the .bed file paths in input_dir

    # execute the coverage function for each .bed file
    for bed_table_FP_path in bed_table_FP_path_list:
        bed_file_name = Path(os.path.basename(bed_table_FP_path)).stem
        coverage_matrix_csv_path = os.path.join(
            output_dir, bed_file_name + "_matrix_coverage.csv")
        matrix_01_csv_path = os.path.join(output_dir,
                                          bed_file_name + "_matrix_01.csv")

        ###############################
        #### BED FILE FOOTPRINTS ######
        ###############################

        # load the .bed file containing the IDs of genes with at least one footprint (FP), plus the mapping start and end positions
        bed_table_FP = pr.read_bed(bed_table_FP_path, as_df=True)
        bed_table_FP_reduced = bed_table_FP.iloc[:, [0, 1, 2]]

        me = MatricesExtractor(bed_table_FP_reduced, genes)
        # extract the matrices
        matrix_01 = me.extract_matrices(
            addRand=m_addRand, areReadsRandomized=m_areReadsRandomized)

        # Exports the dataFrames into CSV files
        matrix_01.to_csv(matrix_01_csv_path, index=True)
Example 9
def parse_bed(bed, window):
    logging.info("Parsing BED file")
    gr = pr.read_bed(bed)[window.chromosome, window.begin:window.end]
    df = gr.unstrand().df
    df = df.drop(columns=["Chromosome", "Score", "Strand"], errors='ignore')
    if "Name" not in df.columns:
        df["Name"] = "noname"
    return df.itertuples(index=False, name=None)
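# A short sketch of the pyranges slicing used above: read a BED file and subset
# it to a chromosome window (path and coordinates are hypothetical).
import pyranges as pr

gr = pr.read_bed("annotations.bed")
sub = gr["chr1", 1_000_000:2_000_000]   # chromosome + coordinate slice
print(sub.unstrand().df.head())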
def multithreadedJaccardBed(*bed_pairs):
    # bedSortedPairs = jaccardBed()
    jaccard_scores = dict()
    error_bed_pairs = []
    try:
        file1 = pr.read_bed(bed_pairs[0][0])
        file2 = pr.read_bed(bed_pairs[0][1])
        jaccard_score = file1.stats.jaccard(file2)
        scored_pair = (bed_pairs[0][0], bed_pairs[0][1])
        jaccard_scores[str(scored_pair)] = jaccard_score
    except Exception as error:
        print('error')
        print(error)
        # Record the offending file paths; file1/file2 may be unbound if read_bed failed.
        error_bed_pairs.append([bed_pairs[0][0], bed_pairs[0][1]])

    return jaccard_scores, error_bed_pairs
Example 11
    def __iter__(self) -> Iterable[Variant]:
        import pyranges as pr

        gr = pr.read_bed(self.bed_file)
        gr = gr.merge(strand=False).sort()

        for interval in pyranges_to_intervals(gr):
            yield from self.combination_variants(interval, self.variant_type)
Example 12
def tidy_chromosome_column(polya_bed_path=None):
    '''
    Read in a polyA BED file, add a 'chr' prefix to the Chromosome column,
    and return a PyRanges object.
    '''
    bed_df = pyr.read_bed(f=polya_bed_path, as_df=True)
    bed_df['Chromosome'] = 'chr' + bed_df['Chromosome'].astype(str)

    return pyr.PyRanges(bed_df)
Example 13
def filterPAC(fastaPath, bedPath, bedSummitPath, filterPolyASitePath):
    genomeFa = pyfastx.Fasta(fastaPath)
    polyAClusterBed = pr.read_bed(bedSummitPath, True)
    polyAClusterBed["seq"] = polyAClusterBed.apply(
        lambda x: genomeFa[x["Chromosome"]][x["Start"] - 10 : x["End"] + 10].seq, axis=1
    )
    polyAClusterBed["seqLength"] = polyAClusterBed["seq"].map(len)
    polyAClusterBed["Ratio"] = (
        polyAClusterBed.apply(
            lambda x: x["seq"].count("A")
            if x["Strand"] == "+"
            else x["seq"].count("T"),
            axis=1,
        )
        / polyAClusterBed["seqLength"]
    )
    usePolyASite = polyAClusterBed.query("Ratio <= 0.5")["Name"]
    polyAClusterRawRangeBed = pr.read_bed(bedPath, True)
    polyAClusterPassedRangeBed = polyAClusterRawRangeBed.query("Name in @usePolyASite")
    polyAClusterPassedRangeBed.to_csv(
        filterPolyASitePath, sep="\t", header=None, index=None
    )
Example 14
def main():
    args = parse_args()
    if len(args.trf) != 2:
        exit(f'ERROR: Expected 2 trf files, got {len(args.trf)}: {args.trf}')
    if len(args.cov) != 2:
        exit(f'ERROR: Expected 2 cov files, got {len(args.cov)}: {args.cov}')

    # Parse inputs as pandas data frame (df) or pyranges (pr) objects
    strling_df = parse_bed(args.strling)

    trf_hap1_df = parse_bed(args.trf[0])
    trf_hap2_df = parse_bed(args.trf[1])

    cov_hap1_pr = pr.read_bed(args.cov[0])
    cov_hap2_pr = pr.read_bed(args.cov[1])

    # Annotate strling calls with corresponding PacBio calls
    strling_df = match_variants(strling_df, trf_hap1_df, trf_hap2_df,
                                args.slop)

    # Annotate strling calls with coverage overlap
    strling_df = annotate_cov(strling_df, cov_hap1_pr, cov_hap2_pr)

    strling_df.to_csv(args.out, sep='\t', index=False)
Example 15
def f2():
    """
    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # | Chromosome   |     Start |       End | Name       |     Score | Strand       |
    >>> # | (category)   |   (int32) |   (int32) | (object)   |   (int64) | (category)   |
    >>> # |--------------+-----------+-----------+------------+-----------+--------------|
    >>> # | chr1         |         1 |         2 | a          |         0 | +            |
    >>> # | chr1         |         6 |         7 | b          |         0 | -            |
    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # Stranded PyRanges object has 2 rows and 6 columns from 1 chromosomes.
    >>> # For printing, the PyRanges was sorted on Chromosome and Strand.
    """

    full_path = get_example_path("f2.bed")

    return pr.read_bed(full_path)
Example 16
 def get_bed_files(self):
     bed_table_FP_path_list = [
         os.path.abspath(os.path.join(self.input_bed_files_dir, f))
         for f in listdir(self.input_bed_files_dir)
         if isfile(os.path.join(self.input_bed_files_dir, f))
         if f.endswith(".bed")
     ]  # list all the file names in input_dir
     bed_table_FPs = [{
         "bed_file": pr.read_bed(f, as_df=True),
         "bed_file_name": Path(os.path.basename(f)).stem
     } for f in bed_table_FP_path_list]
     bed_table_FPs_reduced = [{
         "bed_file": f["bed_file"].iloc[:, [0, 1, 2]],
         "bed_file_name": f["bed_file_name"]
     } for f in bed_table_FPs]
     return bed_table_FPs_reduced
Example 17
def f1():
    """
    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # | Chromosome   |     Start |       End | Name       |     Score | Strand       |
    >>> # | (category)   |   (int32) |   (int32) | (object)   |   (int64) | (category)   |
    >>> # |--------------+-----------+-----------+------------+-----------+--------------|
    >>> # | chr1         |         3 |         6 | interval1  |         0 | +            |
    >>> # | chr1         |         8 |         9 | interval3  |         0 | +            |
    >>> # | chr1         |         5 |         7 | interval2  |         0 | -            |
    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # Stranded PyRanges object has 3 rows and 6 columns from 1 chromosomes.
    >>> # For printing, the PyRanges was sorted on Chromosome and Strand.
    """

    full_path = get_example_path("f1.bed")

    return pr.read_bed(full_path)
Example 18
def parseReadApaInfo(apaClusterPath, inBamPath, geneTag, expressionInfo):
    """
    parse APA information, and classify each read into corresponding PAC
    """
    apaCluster = pr.read_bed(apaClusterPath, True)
    apaCluster["Name"] = apaCluster["Name"] + "_APA"
    apaCluster["geneName"] = apaCluster["Name"].str.split("_").str[0]
    apaCluster = apaCluster.reindex(["geneName", "Name", "Start", "End"], axis=1)
    apaClusterDict = defaultdict(lambda: {})
    for line in apaCluster.itertuples():
        apaClusterDict[line.geneName][line.Name] = portion.closedopen(
            line.Start, line.End
        )
    readsApaInfo = {}

    with pysam.AlignmentFile(inBamPath) as inBam:
        i = 0
        for read in inBam:
            i += 1
            readGene = read.get_tag(geneTag)
            geneApaInfo = apaClusterDict.get(readGene)
            if geneApaInfo is None:
                readApaName = f"{readGene}_N_APA"
            else:
                if read.is_reverse:
                    readEndPos = read.positions[0]
                else:
                    readEndPos = read.positions[-1]

                readApaName = f"{readGene}_N_APA"
                for apaName, apaSpan in geneApaInfo.items():
                    if readEndPos in apaSpan:
                        readApaName = apaName
                        break
            readsApaInfo[read.qname] = readApaName
            if i % 100000 == 0:
                logger.info(f"{i} reads processed")

    readsApaInfo = pd.Series(readsApaInfo)
    useUmi = list(set(readsApaInfo.index) & set(expressionInfo.index))
    expressionInfo = pd.concat([expressionInfo, readsApaInfo])
    expressionInfo = expressionInfo.loc[expressionInfo.index.isin(useUmi)]
    return expressionInfo
Example 19
def vcf_to_pyranges(vcf, tmpfile):
    '''
    create a bed-file containing the variant locations and ids
    '''

    bedtool = BedTool((Interval(record.chrom,
                                record.pos - 1,
                                record.pos - 1 + len(record.ref),
                                name=record.id) for record in vcf.fetch()
                       if not len(record.alts) > 1))
    bedtool.moveto(tmpfile)

    try:
        pr = pyranges.read_bed(tmpfile)
        del bedtool
        os.remove(tmpfile)
    except Exception as e:
        os.remove(tmpfile)
        raise e

    return pr
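# A hedged call sketch for vcf_to_pyranges; pysam is an assumption here (any
# object whose .fetch() yields records with .chrom/.pos/.ref/.id/.alts works),
# and the VCF must be bgzipped and indexed so that fetch() can iterate it.
import pysam

vcf = pysam.VariantFile("variants.vcf.gz")            # hypothetical path
gr = vcf_to_pyranges(vcf, tmpfile="variants.tmp.bed")
print(gr.df.head())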
Example 20
def lojs_overlap(feature_files, compare_pr):
    """
    Function to run left outer join in features to all_regions_file

    Args:
            :param feature_files: list of paths to file to run intersection with all_regions_file
            :param compare_pr: pyranges object containing all regions of interest. Should have column
                'idx'. Added in function epitome.functions.bed2Pyranges.

    :return arr: array same size as the number of genomic regions in all_regions_file
    """

    if len(feature_files) == 0:
        logger.warning("lojs_overlap received no feature files; returning all zeros")
        return np.zeros(len(compare_pr))

    #### Number of files that must share a consensus ####
    if len(feature_files) <= 2:
        n = 1  # if there are 1-2 files just include all
    elif len(feature_files) >= 3 and len(feature_files) <= 7:
        n = 2
    else:
        n = int(len(feature_files) / 4)  # in 25% of files

    # Very slow: concatenate all bed files and only take regions with n overlap
    group_pr = pr.concat([pr.read_bed(i).merge() for i in feature_files])
    group_pr = group_pr.merge(count=True).df
    group_pr = group_pr[group_pr['Count'] >= n]

    # Remove the count column
    group_pr.drop('Count', inplace=True, axis=1)

    type_ = (compare_pr.Start.dtype == 'int64')
    pr1 = pr.PyRanges(group_pr, int64=type_)

    intersected = compare_pr.count_overlaps(pr1)
    arr = intersected.df.sort_values(by='idx')['NumberOverlaps'].values
    arr[arr > 0] = 1
    return arr
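# A compact sketch of the consensus step above: concatenate per-file intervals,
# merge with a count, and keep regions supported by at least n files
# (the file list and threshold are hypothetical).
import pyranges as pr

feature_files = ["rep1.bed", "rep2.bed", "rep3.bed"]
n = 2

merged = pr.concat([pr.read_bed(f).merge() for f in feature_files])
counted = merged.merge(count=True).df        # adds a 'Count' column
consensus = counted[counted["Count"] >= n].drop(columns="Count")
print(consensus.head())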
def main(infile, gene_bed, out_suffix, threads):
    # Read the gene_bed annotation
    gene_model = pr.read_bed(gene_bed, as_df=True)
    gene_model = gene_model.set_index(['Name'])

    results = []
    with ProcessPoolExecutor(max_workers=threads) as e:
        for gene_id in gene_model.index:
            if gene_model.at[gene_id, 'Chromosome'] not in {'Mt', 'Pt'}:
                results.append(e.submit(get_three_end, infile, gene_id, gene_model))
    

    o_polya_cluster = open(f'{out_suffix}.polya_cluster.bed', 'w')
    o_polya_cluster_summit = open(f'{out_suffix}.polya_cluster.summit.bed', 'w')
    o_major_polya_cluster = open(f'{out_suffix}.major_polya_cluster.bed', 'w')
    o_major_polya_cluster_summit = open(f'{out_suffix}.major_polya_cluster_summit.bed', 'w')
    o_last_polya_cluster = open(f'{out_suffix}.last_polya_cluster.bed', 'w')
    o_last_polya_cluster_summit = open(f'{out_suffix}.last_polya_cluster_summit.bed', 'w')

    for res in results:
        result = res.result()
        if result is not None:
            (
                polya_cluster,
                polya_cluster_summit,
                polya_cluster_major,
                polya_cluster_summit_major,
                polya_cluster_last,
                polya_cluster_summit_last,
            ) = result
            for item in polya_cluster:
                o_polya_cluster.write(item)
            for item in polya_cluster_summit:
                o_polya_cluster_summit.write(item)
                
            o_major_polya_cluster.write(polya_cluster_major)
            o_major_polya_cluster_summit.write(polya_cluster_summit_major)
            o_last_polya_cluster.write(polya_cluster_last)
            o_last_polya_cluster_summit.write(polya_cluster_summit_last)
            
    o_polya_cluster.close()
    o_polya_cluster_summit.close()
    o_major_polya_cluster.close()
    o_major_polya_cluster_summit.close()
    o_last_polya_cluster.close()
    o_last_polya_cluster_summit.close()
Example 22
def parse_bed_files(bed_files):
    """Creates PyRanges objects from the BED files."""

    # Skip if no BED files are provided
    if len(bed_files) == 0:
        return

    # Load BED files
    beds = [pr.read_bed(b) for b in bed_files]

    # Check that all BED files have the first four columns
    for bed_file, bed in zip(bed_files, beds):
        assert "Name" in bed.columns, f"Name (column 4) missing from {bed_file}."

    # Concatenate BED files and only keep Name column
    bed = pr.concat(beds)
    bed = bed.unstrand()
    bed = bed[["Name"]]

    # Ensure unique names
    assert bed.Name.is_unique, "Names (column 4) not unique across BED files."

    return bed
Example 23
def chipseq_background():
    """
    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # | Chromosome   | Start     | End       | Name       | Score     | Strand       |
    >>> # | (category)   | (int32)   | (int32)   | (object)   | (int64)   | (category)   |
    >>> # |--------------+-----------+-----------+------------+-----------+--------------|
    >>> # | chr1         | 39036822  | 39036847  | U0         | 0         | +            |
    >>> # | chr1         | 224145989 | 224146014 | U0         | 0         | +            |
    >>> # | chr1         | 167802964 | 167802989 | U0         | 0         | +            |
    >>> # | chr1         | 69101066  | 69101091  | U0         | 0         | +            |
    >>> # | ...          | ...       | ...       | ...        | ...       | ...          |
    >>> # | chrY         | 11936866  | 11936891  | U0         | 0         | -            |
    >>> # | chrY         | 10629111  | 10629136  | U0         | 0         | -            |
    >>> # | chrY         | 10632456  | 10632481  | U0         | 0         | -            |
    >>> # | chrY         | 11918814  | 11918839  | U0         | 0         | -            |
    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # Stranded PyRanges object has 10,000 rows and 6 columns from 25 chromosomes.
    >>> # For printing, the PyRanges was sorted on Chromosome and Strand.
    """

    full_path = get_example_path("chipseq_background.bed")

    return pr.read_bed(full_path)
Example 24
def chipseq():
    """
    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # | Chromosome   | Start     | End       | Name       | Score     | Strand       |
    >>> # | (category)   | (int32)   | (int32)   | (object)   | (int64)   | (category)   |
    >>> # |--------------+-----------+-----------+------------+-----------+--------------|
    >>> # | chr1         | 212609534 | 212609559 | U0         | 0         | +            |
    >>> # | chr1         | 169887529 | 169887554 | U0         | 0         | +            |
    >>> # | chr1         | 216711011 | 216711036 | U0         | 0         | +            |
    >>> # | chr1         | 144227079 | 144227104 | U0         | 0         | +            |
    >>> # | ...          | ...       | ...       | ...        | ...       | ...          |
    >>> # | chrY         | 15224235  | 15224260  | U0         | 0         | -            |
    >>> # | chrY         | 13517892  | 13517917  | U0         | 0         | -            |
    >>> # | chrY         | 8010951   | 8010976   | U0         | 0         | -            |
    >>> # | chrY         | 7405376   | 7405401   | U0         | 0         | -            |
    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # Stranded PyRanges object has 10,000 rows and 6 columns from 24 chromosomes.
    >>> # For printing, the PyRanges was sorted on Chromosome and Strand.
    """

    full_path = get_example_path("chipseq.bed")

    return pr.read_bed(full_path)
Example 25
def chromsizes():
    """
    >>> # +--------------+-----------+-----------+
    >>> # | Chromosome   | Start     | End       |
    >>> # | (category)   | (int32)   | (int32)   |
    >>> # |--------------+-----------+-----------|
    >>> # | chr1         | 0         | 249250621 |
    >>> # | chr2         | 0         | 243199373 |
    >>> # | chr3         | 0         | 198022430 |
    >>> # | chr4         | 0         | 191154276 |
    >>> # | ...          | ...       | ...       |
    >>> # | chrY         | 0         | 59373566  |
    >>> # | chrX         | 0         | 155270560 |
    >>> # | chrM         | 0         | 16571     |
    >>> # | chr22        | 0         | 51304566  |
    >>> # +--------------+-----------+-----------+
    >>> # Unstranded PyRanges object has 25 rows and 3 columns from 25 chromosomes.
    >>> # For printing, the PyRanges was sorted on Chromosome.
    """

    full_path = get_example_path("chromsizes.bed")

    return pr.read_bed(full_path)
Example 26
def exons():
    """
    >>> # +--------------+-----------+-----------+----------------------------------------+-----------+--------------+
    >>> # | Chromosome   | Start     | End       | Name                                   | Score     | Strand       |
    >>> # | (category)   | (int32)   | (int32)   | (object)                               | (int64)   | (category)   |
    >>> # |--------------+-----------+-----------+----------------------------------------+-----------+--------------|
    >>> # | chrX         | 135721701 | 135721963 | NR_038462_exon_0_0_chrX_135721702_f    | 0         | +            |
    >>> # | chrX         | 135574120 | 135574598 | NM_001727_exon_2_0_chrX_135574121_f    | 0         | +            |
    >>> # | chrX         | 47868945  | 47869126  | NM_205856_exon_4_0_chrX_47868946_f     | 0         | +            |
    >>> # | chrX         | 77294333  | 77294480  | NM_000052_exon_17_0_chrX_77294334_f    | 0         | +            |
    >>> # | ...          | ...       | ...       | ...                                    | ...       | ...          |
    >>> # | chrY         | 15409586  | 15409728  | NR_047633_exon_3_0_chrY_15409587_r     | 0         | -            |
    >>> # | chrY         | 15478146  | 15478273  | NR_047634_exon_18_0_chrY_15478147_r    | 0         | -            |
    >>> # | chrY         | 15360258  | 15361762  | NR_047601_exon_0_0_chrY_15360259_r     | 0         | -            |
    >>> # | chrY         | 15467254  | 15467278  | NM_001258270_exon_13_0_chrY_15467255_r | 0         | -            |
    >>> # +--------------+-----------+-----------+----------------------------------------+-----------+--------------+
    >>> # Stranded PyRanges object has 1,000 rows and 6 columns from 2 chromosomes.
    >>> # For printing, the PyRanges was sorted on Chromosome and Strand.
    """

    full_path = get_example_path("exons.bed")

    return pr.read_bed(full_path)
Example 27
def aorta():
    """
    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # | Chromosome   | Start     | End       | Name       | Score     | Strand       |
    >>> # | (category)   | (int32)   | (int32)   | (object)   | (int64)   | (category)   |
    >>> # |--------------+-----------+-----------+------------+-----------+--------------|
    >>> # | chr1         | 9939      | 10138     | H3K27me3   | 7         | +            |
    >>> # | chr1         | 9953      | 10152     | H3K27me3   | 5         | +            |
    >>> # | chr1         | 10024     | 10223     | H3K27me3   | 1         | +            |
    >>> # | chr1         | 10246     | 10445     | H3K27me3   | 4         | +            |
    >>> # | ...          | ...       | ...       | ...        | ...       | ...          |
    >>> # | chr1         | 9978      | 10177     | H3K27me3   | 7         | -            |
    >>> # | chr1         | 10001     | 10200     | H3K27me3   | 5         | -            |
    >>> # | chr1         | 10127     | 10326     | H3K27me3   | 1         | -            |
    >>> # | chr1         | 10241     | 10440     | H3K27me3   | 6         | -            |
    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # Stranded PyRanges object has 11 rows and 6 columns from 1 chromosomes.
    >>> # For printing, the PyRanges was sorted on Chromosome and Strand.
    """

    full_path = get_example_path("aorta.bed")

    return pr.read_bed(full_path)
Example 28
def aorta2():
    """
    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # | Chromosome   | Start     | End       | Name       | Score     | Strand       |
    >>> # | (category)   | (int32)   | (int32)   | (object)   | (int64)   | (category)   |
    >>> # |--------------+-----------+-----------+------------+-----------+--------------|
    >>> # | chr1         | 10073     | 10272     | Input      | 1         | +            |
    >>> # | chr1         | 10280     | 10479     | Input      | 1         | +            |
    >>> # | chr1         | 16056     | 16255     | Input      | 1         | +            |
    >>> # | chr1         | 16064     | 16263     | Input      | 1         | +            |
    >>> # | ...          | ...       | ...       | ...        | ...       | ...          |
    >>> # | chr1         | 10079     | 10278     | Input      | 1         | -            |
    >>> # | chr1         | 10082     | 10281     | Input      | 1         | -            |
    >>> # | chr1         | 10149     | 10348     | Input      | 1         | -            |
    >>> # | chr1         | 19958     | 20157     | Input      | 1         | -            |
    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # Stranded PyRanges object has 10 rows and 6 columns from 1 chromosomes.
    >>> # For printing, the PyRanges was sorted on Chromosome and Strand.
    """

    full_path = get_example_path("aorta2.bed")

    return pr.read_bed(full_path)
Example 29
def test_pyranges_to_intervals():
    pranges = pyranges.read_gtf(gtf_file)
    intervals = list(
        pyranges_to_intervals(pranges,
                              interval_attrs=[
                                  'gene_id', 'gene_name', 'transcript_id',
                                  'exon_id'
                              ]))

    assert len(intervals) == 5
    assert intervals[4].attrs['gene_id'] == 'ENSG00000012048'
    assert intervals[4].attrs['gene_name'] == 'BRCA1'
    assert intervals[4].attrs['transcript_id'] == 'ENST00000357654'
    assert intervals[4].attrs['exon_id'] == 'ENSE00003510592'

    pranges = pyranges.read_bed(example_intervals_bed)
    intervals = list(pyranges_to_intervals(pranges))

    assert len(intervals) == 4
    assert intervals[0].start == 2

    assert pranges.Chromosome.tolist() == ['chr1'] * 4
    assert pranges.Start.tolist() == [2, 2, 2, 602]
    assert pranges.End.tolist() == [1000, 5000, 1002, 604]
Example 30
 def dfunc(self, *args, **kwargs):
     if not self.initialized:
         self.data = pyranges.read_bed(self.input_path)
         self.initialized = True
     return func(self, *args, **kwargs)