Example 1
def expected_result_single_chromosome(chipseq_dataset):

    return pr.PyRanges(chipseq_dataset["chr2"].df.drop_duplicates())
Example 2
 def get_reference_ranges(self):
     chromosomes = self.get_chromosomes()
     starts = np.repeat(0, len(chromosomes))
     ends = self.get_chromosome_lengths(chromosomes)
     return pr.PyRanges(chromosomes=chromosomes, starts=starts, ends=ends)
Example 3
def pyrange_apply(function, self, other, **kwargs):

    nparams = get_n_args(function)
    nb_cpu = kwargs.get("nb_cpu", 1)

    if nb_cpu > 1:
        import ray
        with suppress_stdout_stderr():
            ray.init(num_cpus=nb_cpu, ignore_reinit_error=True)

    function, get, _merge_dfs = get_multithreaded_funcs(function,
                                                        nb_cpu=nb_cpu)

    strandedness = kwargs["strandedness"]

    other_strand = {"+": "-", "-": "+"}
    same_strand = {"+": "+", "-": "-"}

    assert strandedness in ["same", "opposite", False, None]

    if strandedness == "opposite":
        strand_dict = other_strand
    else:
        strand_dict = same_strand

    if strandedness:
        assert self.stranded and other.stranded, \
            "Can only do stranded operations when both PyRanges contain strand info"

    results = []

    items = natsorted(self.dfs.items())
    keys = natsorted(self.dfs.keys())

    if strandedness:

        for (c, s), df in items:

            os = strand_dict[s]

            if (c, os) not in other.keys() or len(other[c, os].values()) == 0:
                odf = pd.DataFrame(columns="Chromosome Start End".split())
            else:
                odf = other[c, os].values()[0]

            df, odf = make_binary_sparse(kwargs, df, odf)
            result = call_f(function, nparams, df, odf, kwargs)

            results.append(result)

    else:

        if self.stranded and not other.stranded:

            for (c, s), df in items:

                if c not in other.chromosomes:
                    odf = pd.DataFrame(columns="Chromosome Start End".split())
                else:
                    odf = other.dfs[c]

                df, odf = make_binary_sparse(kwargs, df, odf)
                result = call_f(function, nparams, df, odf, kwargs)
                results.append(result)

        elif not self.stranded and other.stranded:

            for c, df in items:

                if c not in other.chromosomes:
                    odf = pd.DataFrame(columns="Chromosome Start End".split())
                else:
                    odf1 = other[c, "+"].df
                    odf2 = other[c, "-"].df

                    odf = _merge_dfs.remote(odf1, odf2)

                df, odf = make_binary_sparse(kwargs, df, odf)

                result = call_f(function, nparams, df, odf, kwargs)
                results.append(result)

        elif self.stranded and other.stranded:

            for (c, s), df in items:

                if c not in other.chromosomes:
                    odfs = []
                else:
                    odfs = other[c].values()

                if len(odfs) == 2:
                    odf = _merge_dfs.remote(*odfs)
                elif len(odfs) == 1:
                    odf = odfs[0]
                else:
                    odf = pd.DataFrame(columns="Chromosome Start End".split())

                df, odf = make_binary_sparse(kwargs, df, odf)

                result = call_f(function, nparams, df, odf, kwargs)
                results.append(result)

        else:

            for c, df in items:
                if c not in other.chromosomes:
                    odf = pd.DataFrame(columns="Chromosome Start End".split())
                else:
                    odf = other.dfs[c]

                df, odf = make_binary_sparse(kwargs, df, odf)

                result = call_f(function, nparams, df, odf, kwargs)
                results.append(result)

    results = get(results)

    results = process_results(results, keys)

    if nb_cpu > 1:
        ray.shutdown()

    return results
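
# A minimal, self-contained sketch (illustrative only, not part of the module)
# of the strand-matching rule used in pyrange_apply above: each (chromosome,
# strand) key of `self` is paired with the `other` key carrying the same or
# the opposite strand.
other_strand = {"+": "-", "-": "+"}
same_strand = {"+": "+", "-": "-"}

def partner_key(key, strandedness):
    c, s = key
    strand_dict = other_strand if strandedness == "opposite" else same_strand
    return (c, strand_dict[s])

print(partner_key(("chr1", "+"), "same"))      # ('chr1', '+')
print(partner_key(("chr1", "+"), "opposite"))  # ('chr1', '-')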
Example 4
    def introns(self, by="gene", nb_cpu=1):
        """Return the introns.

        Parameters
        ----------
        by : str, {"gene", "transcript"}, default "gene"
            Whether to find introns per gene or transcript.

        nb_cpu : int, default 1

            How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple.
            Will only lead to speedups on large datasets.

        See Also
        --------
        pyranges.genomicfeatures.GenomicFeaturesMethods.tss : return the transcription start sites

        Examples
        --------

        >>> gr = pr.data.ensembl_gtf()
        >>> gr
        +--------------+------------+--------------+-----------+-----------+------------+--------------+------------+------------------------------------+-------+
        | Chromosome   | Source     | Feature      | Start     | End       | Score      | Strand       | Frame      | gene_biotype                       | +19   |
        | (category)   | (object)   | (category)   | (int32)   | (int32)   | (object)   | (category)   | (object)   | (object)                           | ...   |
        |--------------+------------+--------------+-----------+-----------+------------+--------------+------------+------------------------------------+-------|
        | 1            | havana     | gene         | 11868     | 14409     | .          | +            | .          | transcribed_unprocessed_pseudogene | ...   |
        | 1            | havana     | transcript   | 11868     | 14409     | .          | +            | .          | transcribed_unprocessed_pseudogene | ...   |
        | 1            | havana     | exon         | 11868     | 12227     | .          | +            | .          | transcribed_unprocessed_pseudogene | ...   |
        | 1            | havana     | exon         | 12612     | 12721     | .          | +            | .          | transcribed_unprocessed_pseudogene | ...   |
        | ...          | ...        | ...          | ...       | ...       | ...        | ...          | ...        | ...                                | ...   |
        | 1            | havana     | gene         | 1173055   | 1179555   | .          | -            | .          | lncRNA                             | ...   |
        | 1            | havana     | transcript   | 1173055   | 1179555   | .          | -            | .          | lncRNA                             | ...   |
        | 1            | havana     | exon         | 1179364   | 1179555   | .          | -            | .          | lncRNA                             | ...   |
        | 1            | havana     | exon         | 1173055   | 1176396   | .          | -            | .          | lncRNA                             | ...   |
        +--------------+------------+--------------+-----------+-----------+------------+--------------+------------+------------------------------------+-------+
        Stranded PyRanges object has 2,446 rows and 28 columns from 1 chromosomes.
        For printing, the PyRanges was sorted on Chromosome and Strand.
        19 hidden columns: gene_id, gene_name, gene_source, gene_version, tag, transcript_biotype, transcript_id, transcript_name, transcript_source, transcript_support_level, ... (+ 9 more.)

        >>> gr.features.introns(by="gene")
        +--------------+----------------+------------+-----------+-----------+------------+--------------+------------+-------+
        | Chromosome   | Source         | Feature    | Start     | End       | Score      | Strand       | Frame      | +20   |
        | (object)     | (object)       | (object)   | (int32)   | (int32)   | (object)   | (category)   | (object)   | ...   |
        |--------------+----------------+------------+-----------+-----------+------------+--------------+------------+-------|
        | 1            | ensembl_havana | intron     | 1173926   | 1174265   | .          | +            | .          | ...   |
        | 1            | ensembl_havana | intron     | 1174321   | 1174423   | .          | +            | .          | ...   |
        | 1            | ensembl_havana | intron     | 1174489   | 1174520   | .          | +            | .          | ...   |
        | 1            | ensembl_havana | intron     | 1175034   | 1179188   | .          | +            | .          | ...   |
        | ...          | ...            | ...        | ...       | ...       | ...        | ...          | ...        | ...   |
        | 1            | havana         | intron     | 874591    | 875046    | .          | -            | .          | ...   |
        | 1            | havana         | intron     | 875155    | 875525    | .          | -            | .          | ...   |
        | 1            | havana         | intron     | 875625    | 876526    | .          | -            | .          | ...   |
        | 1            | havana         | intron     | 876611    | 876754    | .          | -            | .          | ...   |
        +--------------+----------------+------------+-----------+-----------+------------+--------------+------------+-------+
        Stranded PyRanges object has 311 rows and 28 columns from 1 chromosomes.
        For printing, the PyRanges was sorted on Chromosome and Strand.
        20 hidden columns: gene_biotype, gene_id, gene_name, gene_source, gene_version, tag, transcript_biotype, transcript_id, transcript_name, ... (+ 11 more.)

        >>> gr.features.introns(by="transcript")
        +--------------+----------------+------------+-----------+-----------+------------+--------------+------------+----------------------------------+-------+
        | Chromosome   | Source         | Feature    | Start     | End       | Score      | Strand       | Frame      | gene_biotype                     | +19   |
        | (object)     | (object)       | (object)   | (int32)   | (int32)   | (object)   | (category)   | (object)   | (object)                         | ...   |
        |--------------+----------------+------------+-----------+-----------+------------+--------------+------------+----------------------------------+-------|
        | 1            | havana         | intron     | 818202    | 818722    | .          | +            | .          | lncRNA                           | ...   |
        | 1            | ensembl_havana | intron     | 960800    | 961292    | .          | +            | .          | protein_coding                   | ...   |
        | 1            | ensembl_havana | intron     | 961552    | 961628    | .          | +            | .          | protein_coding                   | ...   |
        | 1            | ensembl_havana | intron     | 961750    | 961825    | .          | +            | .          | protein_coding                   | ...   |
        | ...          | ...            | ...        | ...       | ...       | ...        | ...          | ...        | ...                              | ...   |
        | 1            | havana         | intron     | 732207    | 732980    | .          | -            | .          | transcribed_processed_pseudogene | ...   |
        | 1            | havana_tagene  | intron     | 168165    | 169048    | .          | -            | .          | lncRNA                           | ...   |
        | 1            | havana_tagene  | intron     | 165942    | 167958    | .          | -            | .          | lncRNA                           | ...   |
        | 1            | havana_tagene  | intron     | 168165    | 169048    | .          | -            | .          | lncRNA                           | ...   |
        +--------------+----------------+------------+-----------+-----------+------------+--------------+------------+----------------------------------+-------+
        Stranded PyRanges object has 1,043 rows and 28 columns from 1 chromosomes.
        For printing, the PyRanges was sorted on Chromosome and Strand.
        19 hidden columns: gene_id, gene_name, gene_source, gene_version, tag, transcript_biotype, transcript_id, transcript_name, transcript_source, transcript_support_level, ... (+ 9 more.)
        """

        kwargs = {"by": by, "nb_cpu": nb_cpu}
        kwargs = pr.pyranges.fill_kwargs(kwargs)

        assert by in ["gene", "transcript"]

        id_column = by_to_id[by]
        gr = self.pr.sort(id_column)

        if not len(gr):
            return pr.PyRanges()

        exons = gr.subset(lambda df: df.Feature == "exon")
        exons = exons.merge(by=id_column)

        by_gr = gr.subset(lambda df: df.Feature == by)

        result = pyrange_apply(_introns2, by_gr, exons, **kwargs)

        return pr.PyRanges(result)
Example 5
chromosomes = {}
for i in list(range(1, 23))+['X', 'Y']:
    with open(file_path / 'chromosomes' / ('chr' + str(i) + '.txt')) as f:
        chromosomes[str(i)] = f.read()


##Use GFF3 to annotate variants
##ftp://ftp.ensembl.org/pub/grch37/current/gff3/homo_sapiens/
gff = pd.read_csv(file_path / 'Homo_sapiens.GRCh37.87.gff3',
                  sep='\t',
                  comment='#',  # skip GFF3 header and metadata lines
                  names=['chr', 'unknown', 'gene_part', 'start', 'end', 'unknown2', 'strand', 'unknown3', 'gene_info'],
                  usecols=['chr', 'gene_part', 'start', 'end', 'gene_info'],
                  low_memory=False)


gff_cds = gff.loc[(gff['gene_part'] == 'CDS') & gff['chr'].isin(chromosomes),
                  ['chr', 'start', 'end', 'gene_info']]
gff_cds_pr = pr.PyRanges(
    gff_cds.astype({'start': int, 'end': int}).rename(
        columns={'chr': 'Chromosome', 'start': 'Start', 'end': 'End'})).merge()
gff_exon = gff.loc[(gff['gene_part'] == 'exon') & gff['chr'].isin(chromosomes),
                   ['chr', 'start', 'end', 'gene_info']]
gff_exon_pr = pr.PyRanges(
    gff_exon.astype({'start': int, 'end': int}).rename(
        columns={'chr': 'Chromosome', 'start': 'Start', 'end': 'End'})).merge()
del gff

##make index column for merging
tcga_maf['index'] = tcga_maf.index.values

maf_pr = pr.PyRanges(tcga_maf.loc[:, ['Chromosome', 'Start_Position', 'End_Position', 'index']].rename(columns={'Start_Position': 'Start', 'End_Position': 'End'}))

##use the genie 7.0 panels: https://www.synapse.org/#!Synapse:syn21551261
genie = pd.read_csv(file_path / 'genomic_information.txt', sep='\t', low_memory=False)
panels = genie.SEQ_ASSAY_ID.unique()
panel_df = pd.DataFrame(data=panels, columns=['Panel'])


total_sizes = []
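
# Toy sketch of what the .merge() calls above do (illustrative data, not from
# the GFF): overlapping records are collapsed into a flat interval set.
import pandas as pd
import pyranges as pr

toy_cds = pr.PyRanges(pd.DataFrame({'Chromosome': ['1', '1', '1'],
                                    'Start': [10, 15, 40],
                                    'End': [20, 30, 50]}))
print(toy_cds.merge())  # [10, 30) and [40, 50): the overlapping pair collapses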
Example 6
def range_overlap(user_bed_path, guide_loc_path, output_name, upstream,
                  downstream, sort_by, de_novo, cloning_strategy):

    #note: if de_novo = True, then sort_by is automatically set to be "mismatch_score"
    import sys
    import pyranges as pr
    import pandas as pd
    from targetsite_to_primers import revcomp, startG, cloning_parameters, targetsite_to_primers

    #######################
    # read in the bed file from the user (right now this will be a file of TSS's for a specific gene list from Mina's function)
    #######################

    user_bed = pd.read_csv(
        user_bed_path, sep='\t', header=None
    )  # important: this assumes that the bed file has no column names

    # the first three columns must be named 'Chromosome', 'Start', 'End' for a
    # PyRanges object, so rename them here
    column_names_user = [str(c) for c in user_bed.columns]
    column_names_user[0:6] = [
        'Chromosome', 'Start', 'End', 'Gene', '5', 'Strand'
    ]
    user_bed.columns = column_names_user

    # iterate over the rows and shift the start/end positions in the user bed
    # file by the upstream and downstream arguments; also record the original
    # start column in a list to later determine the distance from the TSS.
    # Near the beginning of a chromosome the shift can give a negative
    # coordinate, so those starts are clamped to zero.
    user_bed_start = []
    for index, row in user_bed.iterrows():
        if user_bed.at[index, 'Strand'] == '-':
            user_bed_start.append(user_bed.at[index, 'Start'])
            if user_bed.at[index, 'Start'] < downstream:
                user_bed.at[index, 'Start'] = 0
                user_bed.at[index, 'End'] += upstream
            else:
                user_bed.at[index, 'End'] += upstream
                user_bed.at[index, 'Start'] -= downstream
        else:
            user_bed_start.append(user_bed.at[index, 'End'])
            if user_bed.at[index, 'Start'] < upstream:
                user_bed.at[index, 'Start'] = 0
                user_bed.at[index, 'End'] += downstream
            else:
                user_bed.at[index, 'Start'] -= upstream
                user_bed.at[index, 'End'] += downstream

    user_bed = user_bed.assign(Original_start=user_bed_start)

    user_bed_pyR = pr.PyRanges(
        user_bed
    )  # convert the panda df to a pyRanges object which is required for the overlap function
    #user_bed_pyR_merge = user_bed_pyR.merge() # collapsing overlapping ranges would lose the gene column, so we skip it

    ##############
    # read in the guides already determined for human genome
    ###############

    guide_locs = pd.read_csv(
        guide_loc_path, sep='\t'
    )  # important: this assumes the guide table has column names; change this if that isn't true!
    column_names_gloc = [str(c) for c in guide_locs.columns]
    column_names_gloc[0:3] = ['Chromosome', 'Start', 'End']
    guide_locs.columns = column_names_gloc

    # the scores are not in the best form, e.g. "score(perc)", so we need to
    # split them into separate numeric columns for sorting
    guide_locs_noNaN = guide_locs.fillna(0)
    if not de_novo:
        guide_locs_noNaN[['Doench2016_perc', 'Doench2016_score'
                          ]] = guide_locs_noNaN.fusi.str.split('%',
                                                               expand=True)
        guide_locs_noNaN['Doench2016_score'] = guide_locs_noNaN[
            'Doench2016_score'].str.replace(r"[\(\)]", "", regex=True)
        guide_locs_noNaN[[
            'Moreno_Matos_perc', 'Moreno_Matos_score'
        ]] = guide_locs_noNaN.crisprScan.str.split('%', expand=True)
        guide_locs_noNaN['Moreno_Matos_score'] = guide_locs_noNaN[
            'Moreno_Matos_score'].str.replace(r"[\(\)]", "", regex=True)
        guide_locs_noNaN[['MIT_specificity']] = guide_locs_noNaN[['scoreDesc']]
        guide_locs_noNaN['MIT_specificity'] = guide_locs_noNaN[
            'MIT_specificity'].str.replace(r"[A-Za-z\s\.\-]+", "0", regex=True)
        # to sort the scores, we need to make the columns numeric
        guide_locs_noNaN[[
            "Doench2016_perc", "Doench2016_score", "Moreno_Matos_perc",
            "Moreno_Matos_score", "MIT_specificity"
        ]] = guide_locs_noNaN[[
            "Doench2016_perc", "Doench2016_score", "Moreno_Matos_perc",
            "Moreno_Matos_score", "MIT_specificity"
        ]].apply(pd.to_numeric)

        # remove unnecessary columns
        guide_locs_noNaN_select = guide_locs_noNaN.iloc[:, [
            0, 1, 2, 3, 4, 5, 6, 7, 11, 20, 21, 22, 23, 24
        ]]

        # check whether the scoring entry from the user is valid; if not, fall
        # back to the default and report what was done
        if sort_by not in [
                'Doench2016_perc', 'Doench2016_score', 'Moreno_Matos_perc',
                'Moreno_Matos_score', 'MIT_specificity'
        ]:
            print(
                "sort_by must be one of 'Doench2016_perc', 'Doench2016_score', "
                "'Moreno_Matos_perc', 'Moreno_Matos_score' or 'MIT_specificity'; "
                "falling back to the default ('Doench2016_perc')."
            )
            sort_by = "Doench2016_perc"

        # convert the guide panda to a pyRanges table
        guide_locs_pyR = pr.PyRanges(guide_locs_noNaN_select)

    else:
        # if de_novo is true, sort by the precomputed mismatch_score column
        sort_by = "mismatch_score"
        guide_locs_pyR = pr.PyRanges(
            guide_locs_noNaN)  # convert the guide panda to a pyRanges table

    class NoOverlapError(Exception):
        pass

    if len(guide_locs_pyR.overlap(user_bed_pyR)) == 0:
        raise NoOverlapError(
            "There are no overlaps between the user supplied ranges and the gRNAs!"
        )

    guide_locs_pyR_overlap = guide_locs_pyR.overlap(
        user_bed_pyR)  # run the pyRanges overlap function

    # convert the PyRanges object back to a pandas DataFrame so it is easier
    # to manipulate (the .df accessor concatenates all chromosome tables)
    guide_locs_pyR_overlap_df = guide_locs_pyR_overlap.df

    # iterate over the rows of the overlap table and take the start value of
    # each row. Compare that value with the start and end of each row in the
    # original user bed to see whether it falls within that row's range, so we
    # can pull the gene and the original TSS start site for that gene. These
    # values are collected in lists used below to add new columns to the table.
    gene_list = []
    user_bed_start_ol = []
    for index_ol, row_ol in guide_locs_pyR_overlap_df.iterrows():
        for index_ub, row_ub in user_bed.iterrows():
            if row_ol['Chromosome'] == row_ub['Chromosome']:
                if row_ol['Start'] in range(
                        row_ub['Start'],
                        row_ub['End'] + 1) or row_ol['End'] in range(
                            row_ub['Start'], row_ub['End'] + 1):
                    gene_list.append(row_ub['Gene'])
                    user_bed_start_ol.append(row_ub['Original_start'])

    # add in the list of genes and the list of the original start sites that were just compiled, and then sort by gene followed by score
    guide_locs_pyR_overlap_df = guide_locs_pyR_overlap_df.assign(
        Gene=gene_list)
    guide_locs_pyR_overlap_df = guide_locs_pyR_overlap_df.assign(
        Original_Start=user_bed_start_ol)
    guide_locs_pyR_overlap_df_sort = guide_locs_pyR_overlap_df.sort_values(
        ["Gene", sort_by], ascending=[True, False])

    # this for loop both calculates the distance of the end of each guide from the TSS of the gene for that row and also compiles the lists of each primer
    primer1_list = []
    primer2_list = []
    distance_from_tss = []
    for index, row in guide_locs_pyR_overlap_df_sort.iterrows():
        # get a list of distances from the end of each guide to the original TSS
        distance_temp = guide_locs_pyR_overlap_df_sort.at[
            index, 'End'] - guide_locs_pyR_overlap_df_sort.at[index,
                                                              'Original_Start']
        distance_from_tss.append(distance_temp)

        # use Mina's functions to get lists of each primer
        gRNA = guide_locs_pyR_overlap_df_sort.at[index, 'guideSeq']
        (Grequired, p1specs, p2specs) = cloning_parameters(cloning_strategy)
        (primer1, primer2) = targetsite_to_primers(gRNA, cloning_strategy,
                                                   Grequired, p1specs, p2specs)
        primer1_list.append(primer1)
        primer2_list.append(primer2)

    # add the various columns to the df that we populated in the above for loop
    guide_locs_pyR_overlap_df_sort = guide_locs_pyR_overlap_df_sort.assign(
        GuideEnd_to_TSS=distance_from_tss)
    guide_locs_pyR_overlap_df_sort = guide_locs_pyR_overlap_df_sort.drop(
        columns=["Original_Start"])
    if not de_novo:
        guide_locs_pyR_overlap_df_sort = guide_locs_pyR_overlap_df_sort.drop(
            columns=["name", "score", "thickStart",
                     "thickEnd"])  # clean up some unnecessary columns

    guide_locs_pyR_overlap_df_sort = guide_locs_pyR_overlap_df_sort.assign(
        Primer1=primer1_list)
    guide_locs_pyR_overlap_df_sort = guide_locs_pyR_overlap_df_sort.assign(
        Primer2=primer2_list)
    col_name_p1 = 'Primer1 - ' + cloning_strategy
    col_name_p2 = 'Primer2 - ' + cloning_strategy

    # rename the primer columns so that it tells you the strategy you used
    guide_locs_pyR_overlap_df_sort = guide_locs_pyR_overlap_df_sort.rename(
        columns={
            "Primer1": col_name_p1,
            "Primer2": col_name_p2
        })

    guide_locs_pyR_overlap_df_sort.to_csv((output_name + '.txt'),
                                          sep='\t',
                                          header=True,
                                          index=False)

    return guide_locs_pyR_overlap_df_sort
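
# Standalone sketch of the strand-aware window extension at the top of this
# function (toy coordinates): '+' features extend `upstream` to the left and
# `downstream` to the right, '-' features the reverse, and starts are clamped
# at zero so ranges never run past the chromosome start.
import pandas as pd

upstream, downstream = 100, 50
bed = pd.DataFrame({'Start': [30, 500], 'End': [40, 510], 'Strand': ['+', '-']})
for i, row in bed.iterrows():
    if row.Strand == '-':
        bed.at[i, 'End'] += upstream
        bed.at[i, 'Start'] = max(0, row.Start - downstream)
    else:
        bed.at[i, 'End'] += downstream
        bed.at[i, 'Start'] = max(0, row.Start - upstream)
print(bed)  # the first Start is clamped to 0 rather than going negative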
Example 7
    def k_nearest(self, other, k=1, **kwargs):

        from pyranges.methods.k_nearest import _nearest
        from sorted_nearest import get_all_ties, get_different_ties

        kwargs = fill_kwargs(kwargs)
        kwargs["stranded"] = self.stranded and other.stranded

        overlap = kwargs.get("overlap", True)
        ties = kwargs.get("ties", False)

        self = pr.PyRanges({k: v.copy() for k, v in self.dfs.items()})

        try:  # k may be an array of per-interval values
            k = k.values
        except AttributeError:  # k is a plain integer
            pass

        self.__k__ = k
        self.__IX__ = np.arange(len(self))

        dfs = pyrange_apply(_nearest, self, other, **kwargs)

        nearest = PyRanges(dfs)

        if not overlap:
            result = nearest
        else:
            from collections import defaultdict
            overlap_kwargs = {k: v for k, v in kwargs.items()}
            overlap_kwargs["how"] = defaultdict(lambda: None, {
                "first": "first",
                "last": "last"
            })[kwargs.get("ties")]
            overlaps = self.join(other, **overlap_kwargs)
            overlaps.Distance = 0

            result = pr.concat([overlaps, nearest])

        if not len(result):
            return pr.PyRanges()

        new_result = {}
        if ties in ["first", "last"]:

            for c, df in result:

                df = df.sort_values(["__IX__", "Distance"])
                grpby = df.groupby("__k__", sort=False)
                dfs = []
                for k, kdf in grpby:
                    grpby2 = kdf.groupby("__IX__", sort=False)
                    _df = grpby2.head(k)
                    dfs.append(_df)

                if dfs:
                    new_result[c] = pd.concat(dfs)
        elif ties == "different" or not ties:
            for c, df in result:

                if df.empty:
                    continue
                dfs = []

                df = df.sort_values(["__IX__", "Distance"])
                grpby = df.groupby("__k__", sort=False)

                # for each query index, keep hits until we have k,
                # then keep all further hits at the same distance
                for k, kdf in grpby:
                    if ties:
                        lx = get_different_ties(
                            kdf.index.values, kdf.__IX__.values,
                            kdf.Distance.astype(np.int64).values, k)
                    else:
                        lx = get_all_ties(kdf.index.values, kdf.__IX__.values,
                                          kdf.Distance.astype(np.int64).values,
                                          k)
                    _df = kdf.reindex(lx)
                    dfs.append(_df)

                if dfs:
                    new_result[c] = pd.concat(dfs)

        result = pr.PyRanges(new_result)

        if not result.__IX__.is_monotonic_increasing:
            result = result.sort("__IX__")

        result = result.drop(like="__IX__|__k__")

        self = self.drop(like="__k__|__IX__")

        def prev_to_neg(df, kwargs):

            strand = df.Strand.iloc[0] if "Strand" in df else "+"

            suffix = kwargs["suffix"]

            bools = df["End" + suffix] < df.Start
            if not strand == "+":
                bools = ~bools

            df.loc[bools, "Distance"] = -df.loc[bools, "Distance"]
            return df

        result = result.apply(prev_to_neg, suffix=kwargs["suffix"])

        return result
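
# Usage sketch, assuming this method is exposed as PyRanges.k_nearest (as in
# released pyranges): Distance is 0 for overlapping pairs and negative when
# the hit lies behind the query on its strand (see prev_to_neg above).
import pyranges as pr

a = pr.PyRanges(chromosomes="chr1", starts=[1, 15, 200], ends=[10, 20, 2000])
b = pr.PyRanges(chromosomes="chr1", starts=[11, 20, 50], ends=[16, 22, 100])
print(a.k_nearest(b, k=2))  # the two nearest b-intervals per a-interval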
Example 8
from kipoiseq.extractors.vcf_matching import variants_to_pyranges, \
    pyranges_to_intervals, intervals_to_pyranges, BaseVariantMatcher, \
    SingleVariantMatcher, MultiVariantsMatcher

intervals = [
    Interval('chr1', 1, 10, strand='+'),
    Interval('chr1', 23, 30, strand='-')
]

variants = [
    Variant('chr1', 4, 'T', 'C'),
    Variant('chr1', 5, 'A', 'GA'),
    Variant('chr1', 25, 'AACG', 'GA')
]

pr = pyranges.PyRanges(chromosomes='chr1',
                       starts=[1, 23, 5],
                       ends=[10, 30, 50],
                       strands=['+', '-', '.'])


def test_variants_to_pyranges():
    vcf = MultiSampleVCF(vcf_file)
    variants = list(vcf)
    df = variants_to_pyranges(variants).df
    assert df.shape[0] == len(variants)

    v = df.iloc[0]
    assert v.Chromosome == 'chr1'
    assert v.Start == 3
    assert v.End == 4
    assert v.variant.ref == 'T'
    assert v.variant.alt == 'C'
Example 9
def aggregate_genes(adata: AnnData,
                    genes: PyRanges,
                    agg_layers: Iterable = None,
                    agg_var: Iterable = None) -> AnnData:
    """ Aggregate copy number by gene to create gene CN matrix

    Currently only does segment width weighted mean aggregation.

    Parameters
    ----------
    adata : AnnData
        copy number data
    genes : PyRanges
        gene data
    agg_layers : List, optional
        list of layers to aggregate, by default None, all layers
    agg_var : List, optional
        list of var columns to aggregate, by default None, all numeric columns

    Returns
    -------
    AnnData
        aggregated gene copy number
    """

    if agg_layers is None:
        agg_layers = adata.layers.keys()
    agg_layers = set(agg_layers)

    if agg_var is None:
        agg_var = set(
            adata.var.select_dtypes(
                include=np.number).columns.to_list()) - set(
                    ['chr', 'start', 'end'])
    agg_var = set(agg_var)

    bins = pr.PyRanges(adata.var.reset_index().rename(columns={
        'chr': 'Chromosome',
        'start': 'Start',
        'end': 'End',
    })[['Chromosome', 'Start', 'End', 'bin']])

    intersect_1 = genes.intersect(bins)
    intersect_2 = bins.intersect(genes)

    intersect = pd.merge(intersect_1.as_df(), intersect_2.as_df())
    intersect['segment_width'] = intersect['End'] - intersect['Start']

    X = _segment_width_weighted_mean_matrix(adata.to_df(), intersect)

    layer_data = {}
    for layer_name in agg_layers:
        layer_data[layer_name] = _segment_width_weighted_mean_matrix(
            adata.to_df(layer=layer_name), intersect)

    var = _segment_width_weighted_mean_var(adata.var[agg_var], intersect)

    gene_data = genes.as_df().drop(
        ['Chromosome', 'Start', 'End'],
        axis=1).drop_duplicates().set_index('gene_id')
    var = var.merge(gene_data, left_index=True, right_index=True, how='left')

    adata = ad.AnnData(
        X,
        obs=adata.obs,
        var=var,
        layers=layer_data,
    )

    return adata
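
# The helper _segment_width_weighted_mean_matrix is not shown here; this is a
# minimal numpy sketch of the segment-width weighted mean it is documented as
# computing, for a gene spanning two copy-number segments.
import numpy as np

widths = np.array([80.0, 20.0])
copy_number = np.array([2.0, 4.0])
print((copy_number * widths).sum() / widths.sum())  # 2.4: wide segments dominate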
Example 10
# Load genome
genome = load_genome(args.genome, upper=False)

print(f"Loading annotation from {args.gff}")
# Load annotation
annotation = pr.read_gff3(args.gff)

genes = annotation[annotation.Feature == 'gene'].merge(strand=False)
exons = annotation[annotation.Feature == 'exon'].merge(strand=False)

tmp = index.index.to_frame(index=False)
tmp['Start'] = tmp.pos - args.radius
tmp['End'] = tmp.pos + args.radius + 2
tmp['Chromosome'] = tmp.seqid
windows = pr.PyRanges(tmp)

coverage = windows.coverage(exons)
tmp = coverage.df.set_index(['seqid', 'pos']).reindex(index.index)
index['exon_overlap'] = tmp.FractionOverlaps.round(2)

coverage = windows.coverage(genes)
tmp = coverage.df.set_index(['seqid', 'pos']).reindex(index.index)
index['gene_overlap'] = tmp.FractionOverlaps.round(2)

# Filter exonic sites
if args.exon:
    index = index[index.exon_overlap > 0]
    print(f"{len(index)} exonic sites.")

# Filter chromosomal sites
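
# Toy illustration of the coverage() calls above: each window is annotated
# with NumberOverlaps and FractionOverlaps columns (the latter is what this
# script stores as exon_overlap/gene_overlap).
import pyranges as pr

windows_toy = pr.PyRanges(chromosomes="chr1", starts=[0, 100], ends=[100, 200])
exons_toy = pr.PyRanges(chromosomes="chr1", starts=[50], ends=[150])
print(windows_toy.coverage(exons_toy))  # both windows are half covered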
Example 11
    def plot_coverage_fractions(self, max_depth=None, bins=100, tile_size=10,
                                boundaries_of_interest=[10, 100], **kwargs):
        """
        Prepare ROC-like plot showing distribution of coverage across genome.

        This method prepares a plot that aims to visualise the fraction of
        the genome that is reflected at different levels of coverage. The
        genome is split into windows of size tile_size and coverage averaged
        across these windows. The maximum depth is calculated and the
        coverages are binned into `bins` equally spaced bins. The data is then
        prepared so that 100% of the genome is covered at 0 depth, falling to
        0% of the genome at max_depth+1.

        Parameters
        ----------
        max_depth: int, optional
            The maximum depth to render within the plot. If None, the
            maximum observed mean coverage + 1 is used.
        bins: int
            The number of bins to present in the plot. This is really an
            aesthetic thing only.
        tile_size: int
            The tile size to summarise depths at. The default is 10.
        boundaries_of_interest: list of ints
            This will direct the plotting of lines to show the positions where
            specific coverages are reached. The default is [10, 100].
        **kwargs
            can provide a number of possible options to the Flounder class in
            the background that may be used to alter the plot dimensions,
            bokeh tools and plot rendering options.

        Returns
        -------
        bokeh image plot

        """
        (plot_width, plot_height, plot_type, plot_tools) = self.handle_kwargs(
            ["plot_width", "plot_height", "plot_type", "plot_tools"], **kwargs)

        coverage = self.get_coverage(tile_size=tile_size)
        if self.bam_b is not None:
            coverage = pd.merge(self.get_coverage(tile_size=tile_size).df,
                                self.get_coverage(tile_size=tile_size, plot_bam_b=True).df,
                                on=["Chromosome", "Start", "End"])
            coverage['MeanCoverage'] = coverage[["MeanCoverage_x", "MeanCoverage_y"]].sum(axis=1)
            # and convert back to pyranges ...
            coverage = pr.PyRanges(coverage)

        if max_depth is None:
            max_depth = coverage.MeanCoverage.max() + 1
            print("max_depth set to {}".format(max_depth))
        boundaries = np.linspace(
            0, max_depth, num=bins, endpoint=True, retstep=False)
        assignments = np.digitize(coverage.MeanCoverage, boundaries)
        cov_data = pd.DataFrame({"assignment": assignments,
                                 "bases": (coverage.End - coverage.Start)})
        cov_data = cov_data.groupby("assignment").agg({"assignment": "first",
                                                       "bases": np.sum})
        # add the missing values
        cov_data = cov_data.reindex(
            pd.Index(
                pd.Series(boundaries).index)
            ).reset_index().drop(["assignment"], axis=1).fillna(0)
        # add the cumulative sum to the data
        cov_data['frac'] = cov_data.bases.sum() - np.cumsum(cov_data.bases)
        # prepare the cumsum as percentage
        cov_data['perc'] = cov_data.frac / cov_data.bases.sum() * 100

        plot = figure(
            title='Plot showing % of genome covered at different depths',
            x_axis_label='Depth of coverage (X)',
            y_axis_label='Percentage of genome covered (%)',
            background_fill_color="lightgrey",
            plot_width=plot_width, plot_height=plot_height, tools=plot_tools)

        for b in boundaries_of_interest:
            # b is coverage - get corresponding perc
            bases = coverage[coverage.MeanCoverage >= b].lengths().sum()
            perc = bases / cov_data.bases.sum() * 100
            legend = "{}X".format(b)
            plot.line(
                [0, b, b], [perc, perc, 0], line_width=2, line_color='red')
            plot.add_layout(Label(x=b, y=perc, text=legend, text_color='red'))

        plot.step(boundaries, cov_data.perc, line_width=2, mode="before")
        return self.handle_output(plot, plot_type)
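
# Numeric toy version (illustrative values) of the binning and reverse
# cumulative sum performed above, for five 10-base tiles.
import numpy as np
import pandas as pd

mean_cov = np.array([0, 5, 5, 12, 30])   # per-tile mean coverage
bases = np.array([10, 10, 10, 10, 10])   # bases per tile
boundaries = np.linspace(0, 31, num=5)
assignment = np.digitize(mean_cov, boundaries)
per_bin = pd.DataFrame({"assignment": assignment,
                        "bases": bases}).groupby("assignment")["bases"].sum()
frac = bases.sum() - per_bin.cumsum()
print(100 * frac / bases.sum())  # % of genome covered beyond each depth bin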
Example 12
]).size().to_frame().reset_index().pivot(index='var_str',
                                         columns='SEQ_ASSAY_ID',
                                         values=0)

# get counts of sample per assay
panel_counts = sample.groupby('SEQ_ASSAY_ID').size()

# get unique mutations
var_uniq = maf.loc[~idx_germline, [
    'var_str', 'Hugo_Symbol', 'Chromosome', 'Start_Position', 'End_Position',
    'Reference_Allele', 'Tumor_Seq_Allele2'
]].drop_duplicates().set_index('var_str')
# make PyRanges for unique variants, include index (var_str) for setting values in the original df
var_pr = pr.PyRanges(var_uniq.reset_index()[[
    'Chromosome', 'Start_Position', 'End_Position', 'var_str'
]].rename(columns={
    'Start_Position': 'Start',
    'End_Position': 'End'
}))

# find overlap of variants in various bed files and add to unique variant table
for bed_name in beds.keys():
    var_uniq[bed_name] = False
    var_uniq.loc[var_pr.overlap(
        pr.PyRanges(chromosomes=beds[bed_name]['Chromosome'],
                    starts=beds[bed_name]['Start_Position'],
                    ends=beds[bed_name]['End_Position'])).var_str,
                 bed_name] = True

# set NaNs to true zeros where the panel covers the variant position
sample_counts.values[(sample_counts.isna() & var_uniq.iloc[:, 6:]).values] = 0
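
# The overlap-flagging pattern above, reduced to a self-contained toy:
# overlap() keeps the query intervals hitting the panel, and their var_str
# values index straight back into the variant table.
import pandas as pd
import pyranges as pr

toy_var = pd.DataFrame({'var_str': ['v1', 'v2'],
                        'Chromosome': ['chr1', 'chr1'],
                        'Start': [5, 100],
                        'End': [6, 101]})
toy_var_pr = pr.PyRanges(toy_var)
toy_panel = pr.PyRanges(chromosomes=['chr1'], starts=[0], ends=[50])

covered = pd.Series(False, index=toy_var['var_str'])
covered[toy_var_pr.overlap(toy_panel).var_str] = True
print(covered)  # v1 True, v2 False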
Example 13
def gr2():

    return pr.PyRanges(chromosomes="chr1",
                       starts=[11, 11, 20, 20, 50],
                       ends=[16, 20, 21, 22, 100])
Example 14
def gr():

    return pr.PyRanges(chromosomes="chr1",
                       starts=[1, 15, 200],
                       ends=[10, 20, 2000])
Example 15
def createBigWigs(coverageDir, designDF, normMitoCov):

    # generate bedgraphs and bigwigs
    outDirRaw = Path(coverageDir / "raw-coverages")
    outDirNorm = Path(coverageDir / "norm-coverages")
    if not outDirRaw.is_dir():
        outDirRaw.mkdir()
    if not outDirNorm.is_dir():
        outDirNorm.mkdir()

    rawDesignList = []
    normDesignList = []
    for name, group in normMitoCov.groupby("Sample"):

        designEntry = designDF.query('Sample == @name')
        individual = designEntry['Individual ID'].to_list()[0]
        sampleName = designEntry['Sample'].to_list()[0]
        condition = str(designEntry['Condition'].to_list()[0])

        outGroup = group.copy().reset_index(drop=True)
        outGroup['Start'] = outGroup['Offset']
        outGroup['End'] = outGroup['Start'] + 1
        outGroupNorm = outGroup.copy()

        if not ((group['Depth'] == 0).all() or
                (group['Forward_Depth'] == 0).all() or
                (group['Reverse_Depth'] == 0).all()):

            outNameRaw = name + "-raw"
            outNameNorm = name + "-norm"

            # only outputting bigwigs for now.

            outGroupRawAll = outGroup.loc[:, [
                "Chromosome", "Start", "End", "Depth", "Forward_Depth",
                "Reverse_Depth", "Forward_Starts", "Forward_Ends",
                "Reverse_Starts", "Reverse_Ends"
            ]]

            outGroupRawAll['LogFC_Diff_Strand'] = np.log2(
                (outGroupRawAll['Forward_Depth'] + 1) /
                (outGroupRawAll['Reverse_Depth'] + 1))
            if ((outGroupRawAll['LogFC_Diff_Strand'] == 0).all()):
                continue
            #outGroup.to_csv(outDirRaw / (outNameRaw + ".bg"), sep="\t", index=False, header=False)

            outGroupNorm = outGroupNorm.loc[:, [
                "Chromosome", "Start", "End", "Norm Depth"
            ]]
            #outGroupNorm.to_csv(outDirNorm / (outNameNorm + ".bg"), sep="\t", index=False, header=False)

            tempPyRangesRaw = pr.PyRanges(outGroupRawAll)

            outRawString = str(outDirRaw / (outNameRaw + ".bw"))
            outRawStringFor = str(outDirRaw / (outNameRaw + ".F1R2.bw"))
            outRawStringRev = str(outDirRaw / (outNameRaw + ".F2R1.bw"))

            outRawStringForStart = str(outDirRaw /
                                       (outNameRaw + ".F1R2-Start.bw"))
            outRawStringRevStart = str(outDirRaw /
                                       (outNameRaw + ".F2R1-Start.bw"))

            outRawStringForEnd = str(outDirRaw / (outNameRaw + ".F1R2-End.bw"))
            outRawStringRevEnd = str(outDirRaw / (outNameRaw + ".F2R1-End.bw"))

            outRawStringDiff = str(outDirRaw / (outNameRaw + ".StrandDiff.bw"))

            tempPyRangesRaw.to_bigwig(path=outRawString, value_col="Depth")
            rawDesignList.append({
                'TRACK_ID': outNameRaw + ".bw",
                'INDIVIDUAL_ID': individual,
                'SAMPLE_ID': sampleName,
                'Condition': condition,
                'Strand': "Both",
                'Condition-Strand': 'NA'
            })
            tempPyRangesRaw.to_bigwig(path=outRawStringFor,
                                      value_col="Forward_Depth")
            rawDesignList.append({
                'TRACK_ID': outNameRaw + ".F1R2.bw",
                'INDIVIDUAL_ID': individual,
                'SAMPLE_ID': sampleName,
                'Condition': condition,
                'Strand': "Forward",
                "Condition-Strand": condition + "-Forward"
            })

            tempPyRangesRaw.to_bigwig(path=outRawStringRev,
                                      value_col="Reverse_Depth")
            rawDesignList.append({
                'TRACK_ID': outNameRaw + ".F2R1.bw",
                'INDIVIDUAL_ID': individual,
                'SAMPLE_ID': sampleName,
                'Condition': condition,
                'Strand': "Reverse",
                'Condition-Strand': condition + "-Reverse"
            })

            tempPyRangesRaw.to_bigwig(path=outRawStringForStart,
                                      value_col="Forward_Starts")
            rawDesignList.append({
                'TRACK_ID': outNameRaw + ".F1R2-Start.bw",
                'INDIVIDUAL_ID': individual,
                'SAMPLE_ID': sampleName,
                'Condition': condition,
                'Strand': "Forward",
                "Condition-Strand": condition + "-For-Starts"
            })

            tempPyRangesRaw.to_bigwig(path=outRawStringRevStart,
                                      value_col="Reverse_Starts")
            rawDesignList.append({
                'TRACK_ID': outNameRaw + ".F2R1-Start.bw",
                'INDIVIDUAL_ID': individual,
                'SAMPLE_ID': sampleName,
                'Condition': condition,
                'Strand': "Reverse",
                'Condition-Strand': condition + "-Rev-Starts"
            })

            tempPyRangesRaw.to_bigwig(path=outRawStringForEnd,
                                      value_col="Forward_Ends")
            rawDesignList.append({
                'TRACK_ID': outNameRaw + ".F1R2-End.bw",
                'INDIVIDUAL_ID': individual,
                'SAMPLE_ID': sampleName,
                'Condition': condition,
                'Strand': "Forward",
                "Condition-Strand": condition + "-For-Ends"
            })

            tempPyRangesRaw.to_bigwig(path=outRawStringRevEnd,
                                      value_col="Reverse_Ends")
            rawDesignList.append({
                'TRACK_ID': outNameRaw + ".F2R1-End.bw",
                'INDIVIDUAL_ID': individual,
                'SAMPLE_ID': sampleName,
                'Condition': condition,
                'Strand': "Reverse",
                'Condition-Strand': condition + "-Rev-Ends"
            })

            tempPyRangesRaw.to_bigwig(path=outRawStringDiff,
                                      value_col="LogFC_Diff_Strand")
            rawDesignList.append({
                'TRACK_ID': outNameRaw + ".StrandDiff.bw",
                'INDIVIDUAL_ID': individual,
                'SAMPLE_ID': sampleName,
                'Condition': condition,
                'Strand': "LogFC",
                'Condition-Strand': "NA"
            })

            tempPyRangesNorm = pr.PyRanges(outGroupNorm)
            outNormString = str(outDirNorm / (outNameNorm + ".bw"))
            tempPyRangesNorm.to_bigwig(path=outNormString,
                                       value_col="Norm Depth")
            normDesignList.append({
                'TRACK_ID': outNameNorm + ".bw",
                'INDIVIDUAL_ID': individual,
                'SAMPLE_ID': sampleName,
                'Condition': condition,
                'Strand': "Both"
            })

    outRawDesignDF = pd.DataFrame(rawDesignList)
    outRawDesignDF['DATA_Type'] = "Expression"
    outRawDesignDF.loc[:,["TRACK_ID", "DATA_Type", "INDIVIDUAL_ID", "SAMPLE_ID", "Condition","Strand","Condition-Strand"]]\
                             .to_csv(outDirRaw / "raw-attributes.txt", sep="\t", index=None)

    outNormDesignDF = pd.DataFrame(normDesignList)
    outNormDesignDF['DATA_Type'] = "Expression"

    outNormDesignDF.loc[:,["TRACK_ID", "DATA_Type", "INDIVIDUAL_ID", "SAMPLE_ID", "Condition","Strand"]]\
                             .to_csv(outDirNorm / "norm-attributes.txt", sep="\t", index=None)
    return
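
# Quick numeric check (illustrative values) of the StrandDiff formula used
# above: a log2 fold change with a +1 pseudocount so zero-depth positions
# stay finite.
import numpy as np

fwd = np.array([10, 0, 3])
rev = np.array([10, 5, 0])
print(np.log2((fwd + 1) / (rev + 1)))  # [ 0.    -2.585  2.   ]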
Example 16
def read_bigwig(f, as_df=False):
    """Read bigwig files.

    Parameters
    ----------
    f : str

        Path to bw file.

    as_df : bool, default False

        Whether to return as pandas DataFrame instead of PyRanges.

    Examples
    --------

    >>> f = pr.get_example_path("bw.bw")
    >>> gr = pr.read_bigwig(f)
    >>> gr
    """

    try:
        import pyBigWig
    except ModuleNotFoundError:
        print(
            "pyBigWig must be installed to read bigwigs. Use `conda install -c bioconda pybigwig` or `pip install pyBigWig` to install it."
        )
        import sys
        sys.exit(1)

    from natsort import natsorted

    bw = pyBigWig.open(f)

    size = int(1e5)
    chromosomes = bw.chroms()

    dfs = {}

    for chromosome in natsorted(chromosomes):
        outstarts = []
        outends = []
        outvalues = []

        length = chromosomes[chromosome]

        starts = list(range(0, length, size))
        ends = list(range(size, length + size, size))
        ends[-1] = length
        for start, end in zip(starts, ends):
            intervals = bw.intervals(chromosome, start, end)
            if intervals is not None:
                for s, e, v in intervals:
                    outstarts.append(s)
                    outends.append(e)
                    outvalues.append(v)

        outstarts = pd.Series(outstarts)
        outends = pd.Series(outends)
        outvalues = pd.Series(outvalues)
        dfs[chromosome] = pd.DataFrame({
            "Chromosome": chromosome,
            "Start": outstarts,
            "End": outends,
            "Value": outvalues
        })

    gr = pr.PyRanges(dfs)

    if as_df:
        return gr.df

    return gr
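
# The per-chromosome chunking above in isolation: half-open windows of `size`
# bases with the final end clipped to the chromosome length, so the last
# query never overshoots the chromosome.
size, length = 100_000, 250_001
starts = list(range(0, length, size))
ends = list(range(size, length + size, size))
ends[-1] = length
print(list(zip(starts, ends)))  # [(0, 100000), (100000, 200000), (200000, 250001)]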
Example 17
def pyrange_apply(function, self, other, **kwargs):

    strandedness = kwargs["strandedness"]

    other_strand = {"+": "-", "-": "+"}
    same_strand = {"+": "+", "-": "-"}

    assert strandedness in ["same", "opposite", False, None]

    if strandedness == "opposite":
        strand_dict = other_strand
    else:
        strand_dict = same_strand

    if strandedness:
        assert self.stranded and other.stranded, \
            "Can only do stranded operations when both PyRanges contain strand info"

    results = []

    items = natsorted(self.dfs.items())
    keys = natsorted(self.dfs.keys())

    if strandedness:

        for (c, s), df in items:

            os = strand_dict[s]

            if (c, os) not in other.keys() or len(other[c, os].values()) == 0:
                odf = pd.DataFrame(columns="Chromosome Start End".split())
            else:
                odf = other[c, os].values()[0]

            df, odf = make_binary_sparse(kwargs, df, odf)

            result = call_f(function, df, odf, kwargs)
            results.append(result)

    else:

        if self.stranded and not other.stranded:

            for (c, s), df in items:

                if c not in other.chromosomes:
                    odf = pd.DataFrame(columns="Chromosome Start End".split())
                else:
                    odf = other.dfs[c]

                df, odf = make_binary_sparse(kwargs, df, odf)
                result = call_f(function, df, odf, kwargs)
                results.append(result)

        elif not self.stranded and other.stranded:

            for c, df in items:

                if c not in other.chromosomes:
                    odf = pd.DataFrame(columns="Chromosome Start End".split())
                else:
                    odf1 = other[c, "+"].df
                    odf2 = other[c, "-"].df
                    odf = merge_dfs.remote(odf1, odf2)

                df, odf = make_binary_sparse(kwargs, df, odf)

                result = call_f(function, df, odf, kwargs)
                results.append(result)

        elif self.stranded and other.stranded:

            for (c, s), df in items:

                if c not in other.chromosomes:
                    odfs = []
                else:
                    odfs = other[c].values()

                if len(odfs) == 2:
                    odf = merge_dfs.remote(*odfs)
                elif len(odfs) == 1:
                    odf = odfs[0]
                else:
                    odf = pd.DataFrame(columns="Chromosome Start End".split())

                df, odf = make_binary_sparse(kwargs, df, odf)

                result = call_f(function, df, odf, kwargs)
                results.append(result)

        else:

            for c, df in items:
                if c not in other.chromosomes:
                    odf = pd.DataFrame(columns="Chromosome Start End".split())
                else:
                    odf = other.dfs[c]

                df, odf = make_binary_sparse(kwargs, df, odf)

                result = call_f(function, df, odf, kwargs)
                results.append(result)

    results = ray.get(results)

    results = process_results(results, keys)

    return results
Example 18
all_data["confident_unmod"] = ((all_data.coverage_rep1 >= min_coverage) &
                               (all_data.coverage_rep2 >= min_coverage) &
                               (all_data.percentage_rep1 <= max_canonical_percentage) &
                               (all_data.percentage_rep2 <= max_canonical_percentage))
all_data["confident_mod"] = ((all_data.coverage_rep1 >= min_coverage) &
                             (all_data.coverage_rep2 >= min_coverage) &
                             (all_data.percentage_rep1 >= min_mod_percentage) &
                             (all_data.percentage_rep2 >= min_mod_percentage))
all_data["confident_cpg_mod"] = all_data["confident_mod"] & all_data["cpg"]
all_data['Start'] = all_data['start'] - delta
all_data['End'] = all_data['start'] + delta
all_data['find'] = "C"

##########################################################################################
# confident mC mods
##########################################################################################
# get interval data for non mod positions
excluded_intervals = pr.PyRanges(
    all_data[~all_data.confident_unmod
             & ~all_data.confident_mod]).merge(strand=True)
included_intervals = pr.PyRanges(all_data[all_data["confident_mod"]])
all_mod_intervals = included_intervals.subtract(excluded_intervals,
                                                nb_cpu=nb_cpu).df
# isolated mods
all_mod_intervals = all_mod_intervals[(all_mod_intervals.End -
                                       all_mod_intervals.Start) == (delta * 2)]
all_mod_intervals["midpoint"] = all_mod_intervals.Start + delta
# get bases for each position (all should be C but this is worth a double check)
all_mod_intervals['find'] = np.vectorize(get_base)(
    all_mod_intervals['Chromosome'], all_mod_intervals['midpoint'],
    all_mod_intervals['Strand'])
all_mod_intervals['replace'] = "M"
all_mod_intervals['ambig'] = "Y"
all_mod_intervals = all_mod_intervals[all_mod_intervals["find"] != "N"]
Example 19
def main():
	gc_resolution = 0.2
	options = get_options()
	window_size = options.window_size
	bin_size = options.bin_size
	
	red_coef = options.smoothing_coefficient


	adata = sc.read(options.input_file[0])
	data_mat = sp.csc_matrix(adata.X) #we are going to perform lots of column slicing, CSC may be better
	for f in options.input_file[1:]:
		adata = sc.read(f)
		data_mat = data_mat + sp.csc_matrix(adata.X)
	if not options.keep_bg:
		data_mat = data_mat[:-1]
			
	gcContent = pr.read_bed(options.gcContent)
	gcContent.gcCount = gcContent.Name
	gcContent = gcContent.drop('Score').drop('Name')
	
	bin_df = pd.DataFrame([x.replace(':', '\t').replace('-', '\t').split() for x in adata.var.index], columns=['Chromosome', 'Start', 'End'])
	bin_df.loc[:, 'data_idx'] = np.arange(len(bin_df))
	bin_df = pr.PyRanges(bin_df)
	
	chrom_list = gcContent.Chromosome.cat.categories
	
	raw_gc = []
	nbin = 0
	for _chr in chrom_list:
		chrom_size = gcContent[_chr].End.max()
		for r_start in np.arange(0, chrom_size, window_size):
			r_end = r_start + window_size
			intv_len = window_size
			if r_end > chrom_size:
				intv_len = chrom_size - r_start
			try:
				_gc = gcContent[_chr, r_start:r_end].gcCount.sum() / intv_len
			except IndexError:
				_gc = 0.0
			raw_gc.append([_chr, r_start, r_end, _gc, nbin])		
			nbin += 1

	raw_gc = pr.PyRanges(pd.DataFrame(raw_gc, columns = ['Chromosome', 'Start', 'End', 'gcContent', 'binidx']))
	coords = [x.replace(':','\t').replace('-', '\t').split() for x in adata.var_names]
	coords = pd.DataFrame(coords, columns = ['Chromosome', 'Start', 'End'])             
	coords.loc[:, 'binidx'] = coords.index
	coords = pr.PyRanges(coords)
	raw_cna = np.zeros((len(raw_gc), data_mat.shape[0]))
	for _chr, df in raw_gc:
		idxs = coords[_chr].binidx.values
		sidxs = raw_gc[_chr].binidx.values		
		_data = data_mat[:, idxs].toarray()
		l_bins = window_size // bin_size
		pad_size = l_bins - (_data.shape[1] % l_bins)
		if pad_size < l_bins:
			_data = np.concatenate([_data, np.zeros((_data.shape[0], pad_size))], axis=1)
		n_bins = _data.shape[1] // l_bins
		_data = _data.reshape((_data.shape[0], n_bins, l_bins))
		raw_cna[sidxs] = _data.sum(axis=2).T
#		for entry in df.values:
#			try:
#				_v = data_mat[:, bin_df[entry[0], entry[1]:entry[2]].data_idx].sum(axis=1).ravel()
#			except IndexError:
#				_v = 0	
#			raw_cna[entry[4]] = _v

	coverage = adata.obs.values.ravel()
	raw_cna = np.array(raw_cna) + 0.5 # add pseudocounts
	if options.no_gc:
		M_raw = np.mean(raw_cna, axis=0)

	raw_gc.gc_bin = np.digitize(raw_gc.gcContent.values, bins=np.arange(0, 1, gc_resolution))
	cna_ratio = np.zeros_like(raw_cna)
	for idx in np.arange(len(raw_cna)):
		if options.no_gc:
			cna_ratio[idx] = raw_cna[idx] / M_raw
		else:
			idxs = np.setdiff1d(np.where(raw_gc.gc_bin.values == raw_gc.gc_bin.values[idx])[0], [idx])
			np.random.shuffle(idxs)
			idxs = idxs[:100]
			cna_ratio[idx] = raw_cna[idx] / np.mean(raw_cna[idxs], axis=0)
		
	cna_size = np.sum([gcContent[_chr].End.max() // window_size + 1 for _chr in chrom_list])
	if options.smooth:
		cna_calls = np.zeros((cna_size, data_mat.shape[0]))
		for _chr in chrom_list:
			chrom_size = gcContent[_chr].End.max()
			idxs = raw_gc[_chr].binidx.values
			D = cna_ratio[idxs]
			_odd = False
			if D.shape[0] % 2 == 1:
				D = np.concatenate([D, np.zeros(D.shape[1])[None]], axis=0)
				_odd = True
				#pad one 0
			D[D > options.trim_max] = options.trim_max
			cW = pywt.wavedec(D, 'haar', axis=0, mode='constant')
			for cX in range(1, min(len(cW) - 1, red_coef + 1)):
				cW[-cX] = np.zeros_like(cW[-cX])
			R = pywt.waverec(cW, 'haar', axis=0, mode='constant')
			if _odd:
				R = R[:-1]
			
			cna_calls[idxs] = R
	else:
		cna_calls = cna_ratio

	idx = ["%s:%d-%d" % (x[0], x[1], x[2]) for x in raw_gc.df.sort_values('binidx').values]
	cols = adata.obs.index
	if not options.keep_bg:
		cols = cols[:-1]
	pd.DataFrame(cna_calls, index=idx, columns=cols).to_pickle("%s_raw_calls.pickle" % options.prefix)
#	f_corr = 2 / np.median(cna_calls) #assuming diploids
#	cna_calls = np.round(f_corr * cna_calls)
#	cna_calls[cna_calls > options.trim_max] = options.trim_max
	cna_calls = np.digitize(cna_calls, bins=[x / 2 for x in range(options.trim_max)])
	pd.DataFrame(cna_calls, index=idx, columns=cols).to_pickle("%s_cna_calls.pickle" % options.prefix)
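The GC correction above boils down to: digitize the windows into GC bins, then divide each window's counts by the mean of up to 100 other windows drawn from the same bin. A minimal sketch of that step with made-up data (assumes every GC bin contains more than one window):

import numpy as np

rng = np.random.default_rng(0)
gc = rng.uniform(0.3, 0.6, size=50)            # GC fraction per window
counts = rng.poisson(20, size=(50, 3)) + 0.5   # windows x cells, with pseudocounts
gc_bin = np.digitize(gc, bins=np.arange(0, 1, 0.2))

ratio = np.zeros_like(counts)
for i in range(len(counts)):
    peers = np.setdiff1d(np.where(gc_bin == gc_bin[i])[0], [i])
    rng.shuffle(peers)                          # sample up to 100 peers
    ratio[i] = counts[i] / counts[peers[:100]].mean(axis=0)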
Example no. 20
        except Exception as e:
            print("Error on line {}".format(lineno), file=sys.stderr)
            print(e, file=sys.stderr)
            sys.exit(1)

ERV_elem_gp = [erv.span() for erv in elements.values()]

erv_df = pd.DataFrame({
    'Chromosome': [e.chrom for e in ERV_elem_gp],
    'Start': [e.start for e in ERV_elem_gp],
    'End': [e.end for e in ERV_elem_gp],
    'Strand': [e.strand for e in ERV_elem_gp],
    'ID': [e.id for e in elements.values()],
    'Struct': [e.meta_str() for e in elements.values()]
})
ERVs = pr.PyRanges(erv_df)


def update_pr(changed_id, removed_id):
    global ERVs
    #print("{}\t{}".format(changed_id, removed_id))
    new_elem = elements[changed_id].span().pr()
    new_elem.ID = changed_id
    new_elem.Struct = elements[changed_id].meta_str()
    #print(new_elem)
    print("Merging {} into {}".format(removed_id, changed_id))
    ERVs = pr.concat([
        pr.PyRanges(
            ERVs.df.loc[~ERVs.df['ID'].isin([changed_id, removed_id])]),
        new_elem
    ])
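The replace-by-ID pattern in update_pr (drop the affected rows from the underlying frame, then pr.concat the updated element back in) can be exercised on toy data; a hedged sketch with hypothetical ranges:

import pandas as pd
import pyranges as pr

gr = pr.PyRanges(pd.DataFrame({
    "Chromosome": ["chr1", "chr1"], "Start": [0, 100],
    "End": [50, 150], "ID": ["a", "b"]}))
merged = pr.PyRanges(pd.DataFrame(
    {"Chromosome": ["chr1"], "Start": [0], "End": [150], "ID": ["a"]}))

gr = pr.concat([pr.PyRanges(gr.df.loc[~gr.df["ID"].isin(["a", "b"])]), merged])
print(gr)  # rows a and b replaced by the single merged element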
Example no. 21
def export_to_cooler(
    contact_table,
    output_prefix,
    cooler_resolution,
    fragment_table,
    chromsizes,
    query,
    query_columns=None,
    by_haplotype=False,
):

    results = []
    if query_columns:
        columns = query_columns[:]
    else:
        columns = []
    columns.extend(["align1_fragment_id", "align2_fragment_id"])
    if by_haplotype:
        columns.extend(["align1_haplotype", "align2_haplotype"])
    contact_df = dd.read_parquet(contact_table,
                                 engine=PQ_ENGINE,
                                 version=PQ_VERSION,
                                 columns=columns,
                                 index=False)
    if query:
        contact_df = contact_df.query(query)

    chrom_dict = pd.read_csv(chromsizes,
                             sep="\t",
                             header=None,
                             names=["chrom", "size"],
                             index_col=["chrom"],
                             squeeze=True)
    # create even-width bins using cooler
    bins_df = binnify(chrom_dict, cooler_resolution)
    bins_df.index.name = "bin_id"
    # convert to ranges for overlap
    bins = pr.PyRanges(bins_df.reset_index().rename(columns={
        "start": "Start",
        "end": "End",
        "chrom": "Chromosome"
    }))

    fragment_df = dd.read_parquet(fragment_table,
                                  engine=PQ_ENGINE,
                                  version=PQ_VERSION).compute()
    midpoint_df = pr.PyRanges(
        fragment_df.reset_index()[[
            "chrom", "start", "end", "fragment_id"
        ]].assign(start=lambda x: ((x.start + x.end) * 0.5).round(0).astype(
            int)).eval("end = start + 1").rename(columns={
                "chrom": "Chromosome",
                "start": "Start",
                "end": "End"
            }))
    # use a pyranges join to assign fragments to bins
    fragment_to_bin = midpoint_df.join(
        bins, how="left").df[["fragment_id", "bin_id"]]
    fragment_to_bin = fragment_to_bin.set_index(
        "fragment_id").sort_index()  # .astype(np.uint32)
    nulls = fragment_to_bin["bin_id"] == -1
    if nulls.any():
        logger.warning(
            "Some fragments did not overlap bins, removing from analysis:\n{}".
            format(fragment_to_bin[nulls].join(fragment_df)))
        fragment_to_bin = fragment_to_bin[~nulls]

    # use a join to assign each end of a contact to a bin
    binned_contacts = (contact_df.merge(
        fragment_to_bin,
        how="inner",
        right_index=True,
        left_on="align1_fragment_id").merge(
            fragment_to_bin,
            how="inner",
            right_index=True,
            left_on="align2_fragment_id",
            suffixes=[None, "_2"]).rename(columns={
                "bin_id": "bin1_id",
                "bin_id_2": "bin2_id"
            }))

    if not by_haplotype:
        cooler_path = output_prefix + ".cool"
        # group size == number of contacts per bin_pair
        pixels = binned_contacts.groupby(
            ["bin1_id",
             "bin2_id"]).size().rename("count").astype(np.int32).reset_index()
        create_cooler(cooler_path,
                      bins_df,
                      pixels,
                      ordered=True,
                      symmetric_upper=True,
                      ensure_sorted=True)
        c = Cooler(cooler_path)
        logger.info(f"Created cooler: {c.info}")
        results.append(cooler_path)
    else:
        tmp_parquet = output_prefix + ".tmp.pq"
        pixels = (
            # create a key to group by haplotype pair; order of haplotypes doesn't matter
            binned_contacts.assign(
                hap_key=lambda x: x[["align1_haplotype", "align2_haplotype"]
                                    ].apply(lambda y: "{}_{}".format(*sorted(
                                        y)).replace("-1", "nohap"),
                                            axis=1,
                                            meta="object")
            ).groupby(["hap_key", "bin1_id",
                       "bin2_id"]).size().rename("count").astype(
                           np.int32
                       ).reset_index().astype({"hap_key": "category"}))

        # save to a temporary parquet file; this might not be necessary,
        # but we want to avoid the whole contact matrix hitting memory
        pixels.to_parquet(
            tmp_parquet,
            write_metadata_file=True,
            partition_on=["hap_key"],
            write_index=False,
            engine=PQ_ENGINE,
            version=PQ_VERSION,
        )

        pixels = dd.read_parquet(tmp_parquet,
                                 engine=PQ_ENGINE,
                                 version=PQ_VERSION,
                                 columns=["hap_key"],
                                 index=False)
        hap_keys = pixels["hap_key"].unique().compute()
        # create a cooler for each haplotype pair
        for hap_key in hap_keys:
            cooler_path = f"{output_prefix}.{hap_key}.cool"
            pixels = dd.read_parquet(
                tmp_parquet,
                filters=[("hap_key", "==", hap_key)],
                index=False,
                engine=PQ_ENGINE,
                version=PQ_VERSION,
                columns=["bin1_id", "bin2_id", "count"],
            )
            create_cooler(cooler_path,
                          bins_df,
                          pixels,
                          ordered=True,
                          symmetric_upper=True,
                          ensure_sorted=True)
            c = Cooler(cooler_path)
            logger.info(f"Created cooler: {c.info}")
            results.append(cooler_path)

        shutil.rmtree(tmp_parquet)

    return results
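A toy version of the fragment-to-bin step used above, assuming cooler's binnify helper is importable as shown (fragment coordinates are made up): each fragment is collapsed to a 1 bp midpoint, so a left join against the bins yields exactly one bin_id per fragment.

import pandas as pd
import pyranges as pr
from cooler.util import binnify

bins_df = binnify(pd.Series({"chr1": 1000}), 250)
bins_df.index.name = "bin_id"
bins = pr.PyRanges(bins_df.reset_index().rename(columns={
    "chrom": "Chromosome", "start": "Start", "end": "End"}))

frags = pd.DataFrame({"Chromosome": ["chr1"], "Start": [90], "End": [210],
                      "fragment_id": [7]})
mid = (frags.Start + frags.End) // 2
midpoints = pr.PyRanges(frags.assign(Start=mid, End=mid + 1))
print(midpoints.join(bins, how="left").df[["fragment_id", "bin_id"]])
# fragment 7 lands in bin_id 0, i.e. the bin covering [0, 250)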
Example no. 22
    def test_score_matrix_combines_indices(self):
        # issue where value_counts() was not sorting on the index,
        # causing predictions to be combined incorrectly and returning preds > 1

        # Create dummy data
        # make 500 regions that do not overlap the Dataset
        start = np.repeat(np.arange(0, 100), 5)
        start = np.concatenate([start, [200, 1100, 1700]])

        end = np.repeat(np.arange(20, 120), 5)
        end = np.concatenate([end, [900, 1500, 2100]])

        regions_dict = {
            'Chromosome': ['chr1'] * len(start),
            'Start': start,
            'End': end,
            'idx': np.arange(0, start.shape[0])
        }  # only indices 500-502 have data

        regions_pr = pr.from_dict(regions_dict)
        # have to cast to int64
        regions = pr.PyRanges(regions_pr.df, int64=True)

        targets = ['CTCF']
        ds = EpitomeDataset(targets=targets,
                            cells=['PC-9', 'Panc1', 'IMR-90', 'H1'],
                            min_cells_per_target=2)

        # set predictions to 1s so means could be greater than 1 if done wrong
        preds = np.ones((1, 10, 1))

        conversionObject = RegionConversion(ds.regions, regions)

        results = conversionObject.merge(preds, axis=1)

        masked = np.ma.array(results, mask=np.isnan(results))
        assert (np.all(masked <= 1))

        # Error case where there are nans before true values:
        # the 1st region on chr1 has no overlap with the dataset, while the
        # second region on chr2 has multiple (2) overlaps
        start = [30000, 200]
        end = [30100, 900]
        regions_dict = {
            'Chromosome': ['chr1', 'chr2'],
            'Start': start,
            'End': end,
            'idx': [0, 1]
        }

        regions_pr = pr.from_dict(regions_dict)
        # have to cast to int64
        regions = pr.PyRanges(regions_pr.df, int64=True)

        conversionObject = RegionConversion(ds.regions, regions)

        preds = np.ones((1, 4, 1))

        results = conversionObject.merge(preds, axis=1)
        masked = np.ma.array(results, mask=np.isnan(results))
        assert (np.all(masked <= 1))
Example no. 23
def tile_genome(genome, tile_size, tile_last=False):
    """Create a tiled genome.

    Parameters
    ----------
    genome : dict or PyRanges

        Dict or PyRanges describing the lengths of the chromosomes.

    tile_size : int
        Length of the tiles.

    tile_last : bool, default False

        Use genome length as end of last tile.

    See Also
    --------

    pyranges.PyRanges.tile : split intervals into adjacent non-overlapping tiles.

    Examples
    --------

    >>> chromsizes = pr.data.chromsizes()
    >>> chromsizes
    +--------------+-----------+-----------+
    | Chromosome   | Start     | End       |
    | (category)   | (int32)   | (int32)   |
    |--------------+-----------+-----------|
    | chr1         | 0         | 249250621 |
    | chr2         | 0         | 243199373 |
    | chr3         | 0         | 198022430 |
    | chr4         | 0         | 191154276 |
    | ...          | ...       | ...       |
    | chr22        | 0         | 51304566  |
    | chrM         | 0         | 16571     |
    | chrX         | 0         | 155270560 |
    | chrY         | 0         | 59373566  |
    +--------------+-----------+-----------+
    Unstranded PyRanges object has 25 rows and 3 columns from 25 chromosomes.
    For printing, the PyRanges was sorted on Chromosome.

    >>> pr.gf.tile_genome(chromsizes, int(1e6))
    +--------------+-----------+-----------+
    | Chromosome   | Start     | End       |
    | (category)   | (int32)   | (int32)   |
    |--------------+-----------+-----------|
    | chr1         | 0         | 1000000   |
    | chr1         | 1000000   | 2000000   |
    | chr1         | 2000000   | 3000000   |
    | chr1         | 3000000   | 4000000   |
    | ...          | ...       | ...       |
    | chrY         | 56000000  | 57000000  |
    | chrY         | 57000000  | 58000000  |
    | chrY         | 58000000  | 59000000  |
    | chrY         | 59000000  | 59373566  |
    +--------------+-----------+-----------+
    Unstranded PyRanges object has 3,114 rows and 3 columns from 25 chromosomes.
    For printing, the PyRanges was sorted on Chromosome.
    """

    if isinstance(genome, dict):
        chromosomes, ends = list(genome.keys()), list(genome.values())
        df = pd.DataFrame({"Chromosome": chromosomes, "Start": 0, "End": ends})
        genome = pr.PyRanges(df)

    gr = genome.tile(tile_size)

    if not tile_last:
        gr = gr.apply(_last_tile, sizes=genome)

    return gr
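A hedged usage sketch with a plain dict of toy chromosome lengths:

import pyranges as pr

tiles = pr.gf.tile_genome({"chr1": 950}, 300)
print(tiles)
# expected tiles: [0, 300), [300, 600), [600, 900), [900, 950);
# with the default tile_last=False the final tile is clipped to the
# chromosome end, matching the chrY rows in the doctest above.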
Example no. 24
    def __getitem__(self, val):

        if isinstance(val, int):
            values = getlocs(self.runs, self.values,
                             np.array([val], dtype=np.int64))
            return values[0]
        elif isinstance(val, slice):
            end = val.stop or np.sum(self.runs)
            start = val.start or 0
            runs, values = getitem(self.runs, self.values, start, end)
            return Rle(runs, values)
        elif isinstance(val, pd.DataFrame):
            intype = val.dtypes["Start"]
            val = val["Start End".split()].astype(np.int64)
            ids, starts, ends, runs, values = getitems(self.runs, self.values,
                                                       val.Start.values,
                                                       val.End.values)

            df = pd.DataFrame({
                "Start": starts,
                "End": ends,
                "ID": ids,
                "Run": runs,
                "Value": values
            }).astype({
                "Start": intype,
                "End": intype
            })
            return df
        elif "PyRanges" in str(type(
                val)):  # hack to avoid isinstance(key, pr.PyRanges) so that we
            # do not need a dep on PyRanges in this library
            import pyranges as pr
            val = val.drop().df
            if val.empty:
                return pd.DataFrame(
                    columns="Chromosome Start End ID Run Value".split())

            chromosome = val.Chromosome.iloc[0]

            intype = val.dtypes["Start"]

            if "Strand" in val:
                strand = val.Strand.iloc[0]
            else:
                strand = None

            val = val["Start End".split()].astype(np.int64)
            ids, starts, ends, runs, values = getitems(self.runs, self.values,
                                                       val.Start.values,
                                                       val.End.values)

            df = pd.DataFrame({
                "Chromosome": chromosome,
                "Start": starts,
                "End": ends,
                "ID": ids,
                "Run": runs,
                "Value": values
            }).astype({
                "Start": intype,
                "End": intype
            })

            if strand:
                df.insert(3, "Strand", strand)

            return pr.PyRanges(df)

        else:
            locs = np.sort(np.array(val, dtype=np.int64))
            values = getlocs(self.runs, self.values, locs)
            return values
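The integer branch above amounts to a cumulative-sum search over the runs; a standalone sketch of that logic (independent of this library's getlocs):

import numpy as np

runs = np.array([3, 2, 5])        # covers positions 0..9
values = np.array([0.0, 1.0, 0.5])

def rle_value_at(pos):
    ends = np.cumsum(runs)        # run end positions: [3, 5, 10]
    return values[np.searchsorted(ends, pos, side="right")]

assert rle_value_at(0) == 0.0     # inside the first run
assert rle_value_at(3) == 1.0     # first position of the second run
assert rle_value_at(9) == 0.5     # last position of the last run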
Example no. 25
def augment_annotation(bam, ranges):

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        def extract_annot(row):
            # bam_data['reference_start'] >= 155179779
            # start_data = bam_data[bam_data['reference_start'] >= 155179779]

            # TODO: There is something FUBAR in the start_data calculation

            bam_data = bam.get_sam_annotation(row.Chromosome, row.Start,
                                              row.End)

            if bam_data is None:
                return (0,) * 14  # one zero per annotation column

            start_data = bam_data.loc[(
                (bam_data.reference_start + bam_data.reference_length <=
                 row.End) & (bam_data.strand == "+") |
                ((bam_data.strand == "-") &
                 (bam_data.reference_start >= row.Start)))]
            #start_data = bam_data[bam_data['reference_start'] >= row.Start]

            # rstart - the number of reads that start within the given interval
            rstart = len(start_data)
            # basesstart - the number of bases contained within rstart
            bases_start = start_data.reference_length.sum()
            # meanreadlen - mean read length for any reads within this interval
            mean_read_len = bam_data.reference_length.mean()
            # startreadlen - mean read length for reads that start within interval
            start_read_len = start_data.reference_length.mean()
            # strandp
            strand_p = (bam_data.strand == '+').sum()
            # strandn
            strand_n = (bam_data.strand == '-').sum()
            # mapq - mapq for reads starting in segment
            mapq = (-10 * log10(
                (10**(start_data.mapping_quality / -10)).mean()))
            # map0 - mapq for reads overlapping the segment
            map0 = (-10 * log10((10**(bam_data.mapping_quality / -10)).mean()))
            # readq - per read q score for reads starting in segment
            readq = (-10 * log10(
                (10**(start_data.mapped_read_q / -10)).mean()))
            # read0 - per read q score for reads overlapping segment
            read0 = (-10 * log10((10**(bam_data.mapped_read_q / -10)).mean()))
            # nm - this is the #NM mismatch count; reads starting in segment
            nm = start_data.nm.sum()
            # cigar_del
            cigar_d = start_data.cigar_d.sum()
            # cigar_ins
            cigar_i = start_data.cigar_i.sum()
            # cigar_mapped
            cigar_m = start_data.cigar_m.sum()
            ##### and some local sequence context annotations

            # gccount

            # ncount

            return rstart, bases_start, mean_read_len, start_read_len, \
                   strand_p, strand_n, mapq, map0, readq, read0, nm, cigar_m, \
                   cigar_i, cigar_d

        tqdm.pandas()
        df_data = ranges.df

        df_data[[
            'rstart', 'bases_start', 'mean_read_len', 'start_read_len',
            'strand_p', 'strand_n', 'mapq', 'map0', 'readq', 'read0', 'nm',
            'cigar_m', 'cigar_i', 'cigar_d'
        ]] = df_data.progress_apply(extract_annot,
                                    axis=1,
                                    result_type='expand')
        return pr.PyRanges(df_data)
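The mapq/readq aggregates above convert Phred scores to error probabilities, average those, and re-encode, rather than averaging the scores directly. A quick worked check with made-up qualities:

from math import log10

quals = [20, 30, 40]                          # Phred: P(err) = 10 ** (-q / 10)
mean_p = sum(10 ** (q / -10) for q in quals) / len(quals)
print(-10 * log10(mean_p))                    # ~24.32, not the naive mean of 30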
Example no. 26
    def __init__(self,
                 vcf_filename=None,
                 ref_build=None,
                 patient_id=None,
                 has_tabix=False,
                 conv_region_filename=None,
                 conv_region_dict=None,
                 region_studied_filename=None,
                 nocall_filename=None,
                 ratio_ad_dp=0.99):

        super(Converter, self).__init__()
        if not vcf_filename:
            raise Exception('You must provide vcf_filename')
        if not ref_build or ref_build not in ["GRCh37", "GRCh38"]:
            raise Exception(
                'You must provide build number ("GRCh37" or "GRCh38")')
        if nocall_filename and not region_studied_filename:
            raise Exception(
                "Please also provide region_studied_filename when nocall_filename is provided"
            )
        self.vcf_filename = vcf_filename
        try:
            self._vcf_reader = vcf.Reader(filename=vcf_filename)
        except FileNotFoundError:
            raise
        except Exception:
            self._generate_exception("Please provide a valid 'vcf_filename'")
        if not patient_id:
            patient_id = self._vcf_reader.samples[0]
        if nocall_filename:
            try:
                self.nocall_region = pyranges.read_bed(nocall_filename)
            except FileNotFoundError:
                raise
            except Exception:
                self._generate_exception(
                    "Please provide a valid 'nocall_filename'")
        else:
            self.nocall_region = pyranges.PyRanges()
        if conv_region_filename:
            try:
                self.conversion_region = pyranges.read_bed(
                    conv_region_filename)
            except FileNotFoundError:
                raise
            except Exception:
                self._generate_exception(
                    "Please provide valid 'conv_region_filename'")
        elif conv_region_dict:
            try:
                self._fix_conv_region_zero_based(conv_region_dict)
                self.conversion_region = pyranges.from_dict(conv_region_dict)
            except FileNotFoundError:
                raise
            except Exception:
                self._generate_exception(
                    "Please provide valid 'conv_region_dict'")
        else:
            self.conversion_region = None
        if region_studied_filename:
            try:
                self.region_studied = pyranges.read_bed(
                    region_studied_filename)
            except FileNotFoundError:
                raise
            except Exception:
                self._generate_exception(
                    "Please provide valid 'region_studied_filename'")
        else:
            self.region_studied = None

        if not _Utilities.validate_has_tabix(has_tabix):
            raise Exception("Please provide a valid 'has_tabix'")

        if not _Utilities.validate_ratio_ad_dp(ratio_ad_dp):
            raise Exception("Please provide a valid 'ratio_ad_dp'")

        self.ratio_ad_dp = ratio_ad_dp
        self.has_tabix = has_tabix
        self.patient_id = patient_id
        self.ref_build = ref_build
        self.nocall_filename = nocall_filename
        self.conv_region_filename = conv_region_filename
        general_logger.info("Converter class instantiated successfully")
Example no. 27
def cluster2pyranges(clusters, cluster_size_min, cluster_size_max, normalise=True):
    '''Convert clusters to a PyRanges (BED-like) object.

    Notes:
        chrom, start, end, strand, cluster, exon, intron, repeat, read_type

    Args:
        clusters: collection of Cluster objects holding the positions (reads)
        cluster_size_min (int): smallest cluster size to keep
        cluster_size_max (int): largest cluster size to keep
        normalise (bool): if True, weight each read by 2.0 / number of DPM reads
    '''
    chrom = list()
    start = list()
    end = list()
    strand = list()
    scores = list()
    barcodes = list()
    exon = list()
    intron = list()
    repeat = list()
    read_type = list()

    unq_barcodes = set()
    for br, cluster in clusters.get_items():
        # make sure barcodes are unique
        count = 0
        while br in unq_barcodes:
            br = br + '_' + str(count)
            count += 1
        unq_barcodes.add(br)

        cs = cluster.size()
        csd = cluster.size('DPM')
        if normalise:
            score = 2.0 / len(csd)
        else:
            score = 1

        if cs >= cluster_size_min and cs <= cluster_size_max:
            for position in cluster:
                chromosome = position._chromosome if position._chromosome.startswith('chr') else 'custom'
                if chromosome == 'custom':
                    assert position._type == 'RPM', 'DPM is not aligned to custom'

                chrom.append(chromosome)
                start.append(position._start_coordinate)
                end.append(position._end_coordinate)
                strand.append(position._strand)              
                barcodes.append(br)
                scores.append(score)
                read_type.append(position._type)
                #annotation
                features = position._feature.split(';')
                fs = classify_feature(features)
                exon.append(fs.get('exon', 'NA'))
                intron.append(fs.get('intron', 'NA'))
                repeat.append(fs.get('repeat', 'NA'))

    # Convert to pyranges
    r_df = pd.DataFrame({'Chromosome': chrom, 'Start': start, 'End': end, 
                        'Strand': strand, 'Name': barcodes, 'Score': scores, 'exon': exon, 
                        'intron': intron, 'repeat': repeat, 'read_type': read_type})

    gr = pr.PyRanges(r_df)
    
    return gr
Example no. 28
def find_genes(adata,
               gtf_file_name,
               path='',
               extension=5000,
               key_added='gene_name',
               feature_coordinates=None,
               copy=True):
    """
    Given a gtf file, you can match the feature of the AnnData object (stored in adata.var_names or
    in a var annotation) to genes.
    The feature/variable annotation has to be written as chr1:20000-20500 or chr1_20000_20500.
    the corresponding gene (if any) will be sotred in a var annotation 
    It extend the search to match a gene to an window of + and - extensions size (5kb
    for example).

    Paramters
    ---------
    

    Return
    ------
    if copy == True: 
    
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.

    """

    # load the gtf file
    start = time.time()
    gtf_file = []
    with open(gtf_file_name) as f:
        for line in f:
            if line[0] != '#':
                gtf_file.append(line)
    gtf_file = pd.DataFrame([l.split('\t') for l in gtf_file])
    gtf_file.columns = [
        'Chromosome', 'source', 'gene_type', 'Start', 'End', 'NA', 'Strand',
        'NA2', 'extra_info'
    ]

    del gtf_file['NA'], gtf_file['NA2']

    # extract the variable names
    feature_annot = adata.var
    if feature_coordinates is None:
        feature_names = adata.var_names.tolist()
    else:
        feature_names = adata.var[feature_coordinates]

    # format the feature name
    start_feature = []
    end_feature = []
    chrom_feature = []
    for feature in feature_names:
        if ':' in feature:  # feature given as a chr:start-end (GRanges-style) coordinate
            feature2 = feature.split(':')
            w = [int(x) for x in feature2[1].split('-')]
        else:
            feature2 = feature.split('_')
            w = [int(x) for x in feature2[1:]]
        chrom_feature.append(feature2[0][3:])
        start_feature.append(w[0])
        end_feature.append(w[1])

    adata.var['Index'] = range(0, len(chrom_feature))
    adata.var['Chromosome'] = chrom_feature
    adata.var['Start'] = start_feature
    adata.var['End'] = end_feature
    adata.var['name_feature'] = adata.var_names.tolist()
    #adata.var['chrom_feature'] = chrom_feature
    adata.var['start_ext'] = [x - extension for x in start_feature]
    adata.var['end_ext'] = [x + extension for x in end_feature]
    #adata.var['Strand'] = len(end_feature)*['+']

    # match the feature with
    gtf = pr.PyRanges(gtf_file)
    del gtf_file
    adata_var = pr.PyRanges(
        chromosomes=adata.var.loc[:, 'Chromosome'],
        # strands=adata.var.loc[:, 'strand_feature'],
        starts=adata.var.loc[:, 'start_ext'],
        ends=adata.var.loc[:, 'end_ext'])

    merge = gtf.join(adata_var, suffix="_ext")
    merge = merge.dfs
    overlap3 = pd.concat([merge[key] for key in merge.keys()])
    overlap3['Index'] = overlap3.index
    overlap4 = overlap3.sort_values(
        ['Chromosome', 'Start_ext', 'End_ext', 'Index'])

    #print(time.time()-start)

    adata.var = adata.var.sort_values(['Chromosome', 'start_ext', 'end_ext'])
    adata_var = pr.PyRanges(adata.var)
    tot_gene_annot = []
    for chrom in list(set(adata.var['Chromosome'])):
        index_gtf = 0
        #next_index = 0
        #curr_adata = adata_var[chrom].df
        overlap3 = pd.concat(
            [merge[key] for key in [(chrom, '+'), (chrom, '-')]])
        overlap3['Index'] = overlap3.index
        overlap3 = overlap3.sort_values(
            ['Chromosome', 'Start_ext', 'End_ext', 'Index'])
        overlap_chrom = overlap3['Start_ext'].tolist()
        #for line_adata in curr_adata[['start_ext']].iterrows():
        j = 0
        for line_adata in adata_var[chrom].df[['start_ext']].iterrows():
            gene_annot = []
            for start_gtf in overlap_chrom[index_gtf:]:
                if start_gtf == line_adata[1][0]:
                    gene_annot.append(overlap3.iloc[index_gtf])
                    index_gtf += 1
                    continue
                else:
                    #index_gtf = next_index
                    break

            if gene_annot == []:
                tot_gene_annot.append(('NA'))
            else:
                tot_gene_annot.append(tuple(gene_annot))

            if j == 100:
                print(j, time.time() - start)
                j = 0
            else:
                j += 1

        #print(chrom, time.time()-start)

    adata.var[key_added] = tot_gene_annot
    adata.var = adata.var.sort_values(['Index'])
    #print(time.time()-start)

    adata.var['gene_infos'] = adata.var['gene_name']

    all_gene_names = []
    for line in tot_gene_annot:
        if line == 'NA':
            all_gene_names.append(['NA'])
        else:
            curr_gene_name = []
            for element in line:
                info_gene = element['extra_info'][:-1].split(';')
                for n in info_gene:
                    if 'gene_name' in n:
                        n = n[:-1].split(' "')
                        curr_gene_name.append(n[-1])

            all_gene_names.append(list(set(curr_gene_name)))

    return (tot_gene_annot, overlap4)
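The core matching step (extend each feature by `extension`, then join against the gene annotation) can be tried on toy coordinates; a hedged sketch with a hypothetical gene:

import pandas as pd
import pyranges as pr

genes = pr.PyRanges(pd.DataFrame({
    "Chromosome": ["1"], "Start": [12000], "End": [18000],
    "gene_name": ["GENE_A"]}))
extension = 5000
features = pr.PyRanges(chromosomes=["1"], starts=[20000 - extension],
                       ends=[20500 + extension])
print(genes.join(features, suffix="_ext").df)  # GENE_A overlaps the window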
Example no. 29
# analyzed intervals
ANALYZED_INTERVALS = "./test_files/analyzed_intervals.interval_list"

# gCNV test callset resources
GCNV_CALLSET_TEST_VCF = "./test_files/GCNV_SAMPLE_1.vcf"
GCNV_CALLSET_TEST_VALUES = [('1', 1001, 3000, EventType.DEL, 2, 60),
                            ('1', 3001, 10000, EventType.REF, 4, 100),
                            ('2', 4001, 5000, EventType.DUP, 1, 100),
                            ('2', 6001, 7000, EventType.REF, 1, 20)]
GCNV_CALLSET_SAMPLE_NAME = "SAMPLE_1"
GCNV_CALLSET_TEST_DF = pd.DataFrame(GCNV_CALLSET_TEST_VALUES,
                                    columns=Callset.CALLSET_COLUMNS)
GCNV_CALLSET_TEST_DF = GCNV_CALLSET_TEST_DF.astype(
    Callset.CALLSET_COLUMN_TYPES)
GCNV_CALLSET_TEST_PYRANGE_EXPECTED = pr.PyRanges(GCNV_CALLSET_TEST_DF)

# Truth test callset resources
TRUTH_CALLSET_TEST_BED = "./test_files/truth.bed"
TRUTH_CALLSET_VALUES = [('1', 501, 4500, 'DEL_chr1_1', EventType.DEL,
                         frozenset(['SAMPLE_0', 'SAMPLE_1',
                                    'SAMPLE_2']), 1.0, 3),
                        ('1', 7001, 10000, 'DEL_chr1_2', EventType.DEL,
                         frozenset(['SAMPLE_0']), 1. / 3, 2),
                        ('2', 1001, 3000, 'DUP_chr2_1', EventType.DUP,
                         frozenset(['SAMPLE_0']), 1. / 3, 0),
                        ('2', 4001, 7000, 'DUP_chr2_2', EventType.DUP,
                         frozenset(['SAMPLE_0', 'SAMPLE_1']), 2. / 3, 2),
                        ('2', 11001, 12000, 'DUP_chr_2_3', EventType.DUP,
                         frozenset(['SAMPLE_0', 'SAMPLE_2']), 2. / 3, 1),
                        ('2', 13001, 16000, 'DEL_chr_2_4', EventType.DEL,
Example no. 30
def simple_m1():

    m1 = pr.PyRanges(chromosomes=[1, 1, 1, 1], starts=[0, 20, 30, 50], ends=[5, 25, 35, 55])
    m1.calls = [1, 2, 3, 1]
    m1.methylated = [0, 2, 0, 1]
    return m1