Example 1
    def test_score_peak_file(self):
        test_similarity_peak_file = tempfile.NamedTemporaryFile(delete=False)
        test_regions_peak_file = tempfile.NamedTemporaryFile(delete=False)

        # Create dummy data
        similarity_dict = {
            'Chromosome': ['chr1', 'chr1', 'chr6'],
            'Start': [200, 400, 1100],
            'End': [220, 440, 1150]
        }
        regions_dict = {
            'Chromosome': ['chr1', 'chr1'],
            'Start': [210, 410],
            'End': [215, 415]
        }
        similarity_pr = pr.from_dict(similarity_dict)
        regions_pr = pr.from_dict(regions_dict)

        # Write to temp bed file
        similarity_pr.to_bed(test_similarity_peak_file.name)
        regions_pr.to_bed(test_regions_peak_file.name)

        test_similarity_peak_file.flush()
        test_regions_peak_file.flush()

        preds = self.model.score_peak_file([test_similarity_peak_file.name],
                                           test_regions_peak_file.name)

        test_regions_peak_file.close()
        test_similarity_peak_file.close()

        assert (preds.shape[0] == len(regions_pr))
Example 2
def coverage(intervals, features, feature_name, fun=sum, details=True):
    # For each genomic bin, count the features overlapping it (strand-aware)
    # and append the count to the GFF-style "attributes" column(s).
    columns_attributes = ["attributes"] + (["attributes_details"] if details else [])
    columns_group = [
        "bin_start", "bin_end", "bin_strand", "gene_chrom", "gene_name",
        "gene_strand", "gene_start", "gene_end", "gene_region_start",
        "gene_region_end"
    ]
    columns_return = columns_group + columns_attributes
    columns_preserve = list(set(columns_return) & set(intervals.columns))

    intervals_pr = pr.from_dict({
        **{
            'Chromosome': intervals["gene_chrom"],
            'Start': intervals["bin_start"] - 1,
            'End': intervals["bin_end"] + 1,
            'Strand': intervals["bin_strand"]
        },
        **intervals
    })
    features_pr = pr.from_dict({
        **{
            'Chromosome': features["feature_chrom"],
            'Start': features["feature_start"],
            'End': features["feature_end"],
            'Strand': features["feature_strand"],
            'feature_name': features["feature_name"]
        },
        **features
    })

    overlaps = intervals_pr.join(features_pr, how=False,
                                 strandedness="same").as_df()
    overlaps["hit"] = True
    coverage = overlaps.groupby(columns_group,
                                as_index=False).aggregate({'hit': fun})

    results = intervals[columns_preserve].merge(coverage,
                                                how="left",
                                                on=columns_group)
    if details:
        coverage_details = (
            overlaps
            .drop_duplicates(columns_group + ["feature_name", "hit"])
            .groupby(columns_group, as_index=False)
            .agg({'feature_name': ','.join})
        )
        results = results.merge(coverage_details, how="left", on=columns_group)
        detail_value = feature_name + "=" + results["feature_name"].fillna("")
        results["attributes_details"] = (
            results["attributes"] + "; " + detail_value
            if "attributes_details" in results
            else detail_value
        )

    hit_value = feature_name + "=" + results["hit"].fillna(0).astype(str)
    results["attributes"] = (
        results["attributes"] + "; " + hit_value
        if "attributes" in results
        else hit_value
    )

    return results[columns_return]
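
The column contract for coverage() is implicit in columns_group above, so a minimal, hypothetical invocation may help; every name and coordinate below is made up:

import pandas as pd

# one gene bin plus one overlapping feature, using the column names the
# function expects
intervals = pd.DataFrame({
    "gene_chrom": ["chr1"], "gene_name": ["geneA"], "gene_strand": ["+"],
    "gene_start": [100], "gene_end": [500],
    "gene_region_start": [100], "gene_region_end": [500],
    "bin_start": [101], "bin_end": [200], "bin_strand": ["+"],
    "attributes": ["gene_id=geneA"],
})
features = pd.DataFrame({
    "feature_chrom": ["chr1"], "feature_start": [120], "feature_end": [180],
    "feature_strand": ["+"], "feature_name": ["peak1"],
})

annotated = coverage(intervals, features, feature_name="peaks")
# annotated["attributes"] ends with "; peaks=1";
# annotated["attributes_details"] is "peaks=peak1"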
Example 3
def find_overlaps(event1, event2, combi):
    # Build one PyRanges per event set, dropping positions duplicated within
    # a set; the except guards against malformed input.
    complete_event1 = pd.DataFrame(event1)
    complete_event2 = pd.DataFrame(event2)
    thisdict = {
        combi[0]: complete_event1.to_dict(),
        combi[1]: complete_event2.to_dict()
    }

    try:
        grs = {
            n: pr.from_dict(s).drop_duplicate_positions(keep=False)
            for n, s in thisdict.items()
        }
    except ValueError:
        return False

    # An interval counted once by each event set (row sum == 2) is shared.
    counts = pr.count_overlaps(grs)
    for _, row in counts.df[[combi[0], combi[1]]].iterrows():
        if row.sum() == 2:
            return True
    return False
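
A minimal sketch (made-up coordinates) of the pattern used above: pr.count_overlaps() takes a dict of PyRanges and returns, for each disjoint sub-interval, how many ranges of every input cover it, so a row summing to 2 means both event sets hit the same region:

import pyranges as pr

grs = {
    "toolA": pr.from_dict({"Chromosome": ["chr1"], "Start": [100], "End": [200]}),
    "toolB": pr.from_dict({"Chromosome": ["chr1"], "Start": [150], "End": [250]}),
}
counts = pr.count_overlaps(grs)
# the shared sub-interval 150-200 is covered once by each tool
print(((counts.df["toolA"] + counts.df["toolB"]) == 2).any())  # True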
Example 4
    def test_score_matrix_missing_data(self):
        # if there is a region in the regions file that does not overlap anything
        # in the training data, it should return NaN for that region

        regions_peak_file = tempfile.NamedTemporaryFile(delete=False)

        # Create dummy data
        regions_dict = {
            'Chromosome': ['chr1', 'chr1'],
            'Start': [50, 10000],
            'End': [150, 10400]
        }

        regions_pr = pr.from_dict(regions_dict)

        # Write to tmp bed file
        regions_pr.to_bed(regions_peak_file.name)
        regions_peak_file.flush()

        accessibility_peak_matrix = np.random.uniform(low=0.,
                                                      high=1.,
                                                      size=(4, 2))

        results = self.model.score_matrix(accessibility_peak_matrix,
                                          regions_peak_file.name)

        assert np.all(np.isnan(results[:, 0, :]))
Example 5
    def test_score_matrix(self):

        regions_peak_file = tempfile.NamedTemporaryFile(delete=False)

        # Create dummy data
        regions_dict = {
            'Chromosome': ['chr1', 'chr1'],
            'Start': [10000, 30000],
            'End': [10300, 31200]
        }
        regions_pr = pr.from_dict(regions_dict)

        # Write to tmp bed file
        regions_pr.to_bed(regions_peak_file.name)
        regions_peak_file.flush()

        accessibility_peak_matrix = np.random.uniform(low=0.,
                                                      high=1.,
                                                      size=(4, 2))

        results = self.model.score_matrix(accessibility_peak_matrix,
                                          regions_peak_file.name)

        assert (results.shape == (4, 2, 1))
        masked = np.ma.array(results, mask=np.isnan(results))
        assert (np.all(masked <= 1))
Example 6
def pr_window_thin(df, window: int, chroms: list = None):
    # Thin to one row per fixed-size window: keep, per chromosome, the entry
    # closest to the window midpoint.
    if not chroms:
        chroms = df.Chromosome.unique()

    out = pd.DataFrame()
    for ii in chroms:
        pr_chr = pr.PyRanges(df[df.Chromosome == ii])
        if not pr_chr:
            continue
        # Create pyranges object with midpoint of window
        gr = pr.from_dict({
            "Chromosome": [ii],
            "Start": [pr_chr.Start.min()],
            "End": [pr_chr.End.max()]
        })
        gr_window = gr.window(window)
        gr_mid = gr_window.copy()
        gr_mid.Start = (
            gr_mid.End -
            (gr_mid.End - gr_mid.Start) / 2).round(0).astype('int32')
        gr_mid.End = gr_mid.Start + 1

        # Find distance between SNPs and midpoint
        pr_nearest = pr_chr.nearest(gr_mid)
        nearest = pr_nearest.df

        # Group by window and find SNP nearest to midpoint
        idx = nearest.groupby(['Start_b', 'End_b']).Distance.idxmin()
        thin = nearest.loc[idx]
        out = pd.concat([out, thin], ignore_index=True)
    return out
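
A hypothetical call with made-up SNP positions, thinning to at most one row per 1 kb window:

import pandas as pd

snps = pd.DataFrame({
    "Chromosome": ["chr1"] * 4,
    "Start": [100, 150, 900, 1500],
    "End": [101, 151, 901, 1501],
})
thinned = pr_window_thin(snps, window=1000)  # one row kept per window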
Example 7
def test_intersect_overlapping_frames():
    frame_1 = pr.from_dict({
        "Chromosome": [1, 2],
        "Start": [1, 10],
        "End": [4, 15]
    })
    frame_2 = pr.from_dict({
        "Chromosome": [1, 2],
        "Start": [2, 16],
        "End": [7, 18]
    })

    expected_result = pr.from_dict({
        "Chromosome": [1],
        "Start": [2],
        "End": [4]
    })
    intersection = intersect([frame_1, frame_2])

    assert_frame_equal(intersection.df,
                       expected_result.df,
                       check_dtype=False,
                       check_categorical=False)
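
The intersect() helper under test is not shown on this page; a plausible sketch, assuming it simply folds PyRanges' built-in intersect over the list (not necessarily the project's actual implementation):

from functools import reduce

def intersect(frames):
    # pairwise-intersect all PyRanges, keeping only the shared sub-intervals
    return reduce(lambda acc, gr: acc.intersect(gr), frames)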
Example 8
def countContexts(fastaFilePath, whiteListBed=None, blackListBed=None):
    debug(f"Starting to count contexts of nucleotides in {fastaFilePath}")

    triNucCounts = defaultdict(int)
    diNucCounts = defaultdict(int)
    # open the fastaFile
    with FastaFile(fastaFilePath) as fastaFile:

        # if we do not have a whitelist to start out, we make one from the fasta, which includes
        # everything
        if whiteListBed is None:
            wlObj = from_dict(
                {
                    "Chromosome": fastaFile.references,
                    "Start": [1] * fastaFile.nreferences,
                    "End": fastaFile.lengths,
                }
            )
        else:
            # we cast this to string, because pyranges wants string and we use the Path type
            wlObj = read_bed(str(whiteListBed))
            wlObj = wlObj.merge()

        # if we have a blacklist, we subtract that from the whitelist; otherwise
        # we leave it as it is
        if blackListBed is not None:
            # we cast this to string, because pyranges wants string and we use the Path type
            blObj = read_bed(str(blackListBed))
            blObj = blObj.merge()
            wlObj = wlObj.subtract(blObj)
            # shouldn't need to merge again here, as we only have fewer ranges than before

        # while we could use the get_fasta function from pyranges, it needs another
        # dependency (pyfaidx) and is slower (from my preliminary testing)
        # iterate over all chromosomes and each of the ranges
        for chr, df in wlObj:
            # iterrows has to return the index, even though we don't use it
            for idx, region in df.iterrows():
                seq = fastaFile.fetch(
                    reference=chr, start=region["Start"], end=region["End"]
                )

                for i in range(len(seq) - 2):
                    diNucCounts[seq[i : i + 2]] += 1
                    triNucCounts[seq[i : i + 3]] += 1
            debug(f"contect frequency analysis complete for chromsome {chr}")

    return (diNucCounts, triNucCounts)
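
A hypothetical call; 'genome.fa' is a made-up path to an indexed FASTA, and the counts come back as plain defaultdicts:

dinuc_counts, trinuc_counts = countContexts("genome.fa")
print(dinuc_counts["CG"], trinuc_counts["CGA"])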
Example 9
    def test_score_whole_genome(self):

        test_similarity_peak_file = tempfile.NamedTemporaryFile(delete=False)
        file_prefix = tempfile.NamedTemporaryFile(delete=False)
        file_prefix_name = file_prefix.name

        # Create dummy data
        similarity_dict = {
            'Chromosome': ['chr7', 'chr7', 'chr8'],
            'Start': [200, 400, 1100],
            'End': [220, 440, 1150]
        }
        similarity_pr = pr.from_dict(similarity_dict)

        # Write to temp bed file
        similarity_pr.to_bed(test_similarity_peak_file.name)
        test_similarity_peak_file.flush()

        self.model.score_whole_genome([test_similarity_peak_file.name],
                                      file_prefix_name,
                                      chrs=['chr7', 'chr8'])

        test_similarity_peak_file.close()

        # load in scores
        loaded = np.load(file_prefix_name + ".npz", allow_pickle=True)

        file_prefix.close()
        assert 'preds' in loaded.keys() and 'names' in loaded.keys()

        preds = loaded['preds']
        names = loaded['names']
        assert preds.shape == (200, 4)
        assert names.shape[0] == 4  # chr, start, end, CTCF
        assert np.all(preds[:100, 0] == 'chr7')
        assert np.all(preds[100:, 0] == 'chr8')
Example 10
def make_plots(target_paths):
    unified = defaultdict(lambda: defaultdict(list))
    tools = [
        "asgal", "aspli", "eventpointer", "irfinder", "majiq", "sgseq",
        "spladder", "whippet"
    ]
    evs = []
    #def unified_upset():
    for file in target_paths:
        if "unified.out" in file:
            # if len(file.split("/")[-1].split(".")) > 2:
            #     tool = file.split("/")[-1].split(".")[1].split("_")[-2]
            # else:
            #     tool = file.split("/")[-1].split(".")[0].split("_")[-2]
            tool = tools[np.where(list(map(lambda x: x in file, tools)))[0][0]]
            if "_filtered" in tool:
                tool = tool.replace("_filtered", "")

            try:
                tmp = pd.read_csv(file, sep="\t")
            except EmptyDataError as e:
                print(f"This file returns the following error: {file}")
                print(e)
                continue

            tmp = tmp.dropna()
            for ev in tmp.event_type.unique():
                events = tmp[tmp['event_type'] == ev]
                org = events['chr'].copy(deep=False)

                events.loc[:, 'chr'] = ["chr" + str(x) for x in org]
                events.columns = [
                    "Chromosome", "gene", "id", "strand", "event_type",
                    "count", "Start", "End"
                ]
                unified[ev][tool].append(events.to_dict('list'))
                if ev not in evs:
                    evs.append(ev)

    allcomb = dict()
    for ev, X in unified.items():

        if ev == 'MEE' or ev == 'MES':
            realcount = pd.DataFrame(
                columns=["Chromosome", "Start", "End"] + tools)
            for combi in it.combinations(X.keys(), 2):
                if len(combi) < 2:
                    df1 = pd.DataFrame(X[combi[0]][0]).reset_index()
                    row1 = create_row([combi[0]])
                    for _ in range(df1.shape[0]):
                        realcount.loc[realcount.shape[0]] = ["chr", 0, 0
                                                             ] + row1
                    continue

                df1 = pd.DataFrame(X[combi[0]][0]).reset_index()
                df1['index'] = df1.index
                df2 = pd.DataFrame(X[combi[1]][0]).reset_index()
                df2['index'] = df2.index

                merged1 = expand_coord(df1)
                merged2 = expand_coord(df2)

                matched_index = []
                for mergin in merged1.keys():
                    mergin0_res = find_all_overlaps(mergin, merged1, merged2,
                                                    combi)

                    if any(mergin0_res):
                        row1 = create_row([combi[0], combi[1]], tools)
                        # keep track of which event index was found overlapping,
                        # so that it is not counted twice
                        matched_index.append(np.where(mergin0_res)[0][0])
                    else:
                        row1 = create_row([combi[0]], tools)

                    realcount.loc[realcount.shape[0]] = ["chr", 0, 0] + row1

                # add the events in tool 2 that don't have overlaps
                row2 = create_row([combi[1]], tools)
                for _ in range(len(merged2) - len(matched_index)):
                    realcount.loc[realcount.shape[0]] = ["chr", 0, 0] + row2

        else:
            grs = {
                n: pr.from_dict(s[0]).drop_duplicate_positions(keep=False)
                for n, s in X.items()
            }
            counts = pr.count_overlaps(grs)
            countdf = counts.df

            # check if there are tools left out of the overlap table
            missed = [
                tools[x] for x in np.where(~np.isin(tools, countdf.columns))[0]
            ]
            if len(missed) > 0:
                for x in missed:
                    countdf[x] = 0
            realcount = countdf[["Chromosome", "Start", "End"] + tools]

        for row in realcount.itertuples():
            tmp = list(row[4:])
            tmp = [1 if x > 1 else x for x in tmp]
            binkey = ''.join([str(x) for x in tmp])
            if np.sum(tmp) == 0:
                continue
            if binkey not in allcomb:
                allcomb[binkey] = {e: 0 for e in evs}

            allcomb[binkey][ev] += 1

    forplot = pd.DataFrame(columns=tools + evs)

    for n, j in allcomb.items():
        thisrow = [bool(int(x)) for x in n] + list(j.values())
        forplot.loc[forplot.shape[0]] = thisrow

    forplot = forplot.set_index(tools)
    return forplot
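
Since the returned frame is indexed by per-tool booleans, it is presumably meant for an UpSet-style plot; a hypothetical follow-up (the upsetplot dependency, the glob pattern, and the "SE" event type are all assumptions):

import glob
import matplotlib.pyplot as plt
from upsetplot import plot  # assumed plotting dependency

forplot = make_plots(glob.glob("results/**/*unified.out*", recursive=True))
plot(forplot["SE"])  # hypothetical event-type column
plt.show()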
Example 11
    def __init__(self, vcf_filename=None, ref_build=None, patient_id=None, has_tabix=False, conv_region_filename=None, conv_region_dict=None, region_studied_filename=None, nocall_filename=None):
        """ 
        Create a new Converter Object to convert a VCF file.

        Parameters
        ----------
        vcf_filename : str (Required)
            Path to text-based or bgzipped VCF file containing variants to be converted into FHIR format.
            Valid path and filename without whitespace. VCF file must conform to VCF Version 4.1 or later. 
            FORMAT.GT must be present. Multi-sample VCFs are allowed, but only the first sample will be converted.
        ref_build : str (Required)
            Genome Reference Consortium genome assembly to which variants in the VCF were called. Must be one of 'GRCh37' or 'GRCh38'.
        patient_id : str (Optional)
            Patient whose VCF file is being processed. Alphanumeric string without whitespace. Default value is the first sample name.
        has_tabix : bool (Optional)
            Set to True if a tabix index exists for the VCF. The tabix file must have the same name as the VCF file, with a '.tbi' extension,
            and must be in the same folder. Default value is False.
        conv_region_filename : str (Optional)
            Path to conversion region BED file. Subset of the VCF file to be converted into FHIR. If absent, the entire VCF file is converted. Must be a valid BED file.
        conv_region_dict : dict (Optional)
            Conversion region can also be provided as a dict; it is ignored if 'conv_region_filename' is provided.
            Format: {"Chromosome": ["chr1", "chr2"], "Start": [100, 200], "End": [150, 201]}
        region_studied_filename : str (Optional)
            Path to region studied bed file. Subset of patient's genome that was studied in the generation of the VCF file. If present, only studied regions are converted. Must be a valid BED file, with first 3 columns: <chr> <start> <stop>.
        nocall_filename : str (Optional)
            Path to no call bed file. Subset of studied region that is deemed noncallable. If present, only studied regions minus noncallable regions are converted. Must be a valid BED file, with first 3 columns: <chr> <start> <stop>.

        Returns
        -------
        Object
            An instance of Converter that converts the VCF file.

        Examples
        --------

        """
        super(Converter, self).__init__()
        if not (vcf_filename):
            raise Exception('You must provide vcf_filename')
        if not ref_build or ref_build not in ["GRCh37", "GRCh38"]:
            raise Exception('You must provide build number ("GRCh37" or "GRCh38")')
        if nocall_filename and not region_studied_filename:
            raise Exception("Please also provide region_studied_filename when nocall_filename is provided")
        self.vcf_filename = vcf_filename
        try:
            self._vcf_reader = vcf.Reader(filename=vcf_filename)
        except FileNotFoundError:
            raise
        except:
            self._generate_exception("Please provide valid  'vcf_filename'")
        if not patient_id:
            patient_id = self._vcf_reader.samples[0]
        if nocall_filename:
            try:
                self.nocall_region = pyranges.read_bed(nocall_filename)
            except FileNotFoundError:
                raise
            except:
                self._generate_exception("Please provide valid  'nocall_filename'")
        else:
            self.nocall_region = pyranges.PyRanges()
        if conv_region_filename:
            try:
                self.conversion_region = pyranges.read_bed(conv_region_filename)
            except FileNotFoundError:
                raise
            except:
                self._generate_exception( "Please provide valid 'conv_region_filename'")
        elif conv_region_dict:      
            try:
                self._fix_conv_region_zero_based(conv_region_dict)
                self.conversion_region = pyranges.from_dict(conv_region_dict)
            except FileNotFoundError:
                raise
            except:
                self._generate_exception("Please provide valid 'conv_region_dict'")
        else:
            self.conversion_region = None         
        if region_studied_filename:
            try:
                self.region_studied = pyranges.read_bed(region_studied_filename)
            except FileNotFoundError:
                raise
            except:
                self._generate_exception("Please provide valid 'region_studied_filename'")
        else:
            self.region_studied = None
        self.has_tabix = has_tabix
        self.patient_id = patient_id
        self.ref_build = ref_build
        self.nocall_filename = nocall_filename
        self.conv_region_filename = conv_region_filename
        general_logger.info("Converter class instantiated successfully")
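
A hypothetical instantiation; the file path and region dict below are made up:

converter = Converter(
    vcf_filename="patient.vcf.gz",
    ref_build="GRCh38",
    has_tabix=True,
    conv_region_dict={"Chromosome": ["chr1"], "Start": [100], "End": [150]},
)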
Example 12
    def __init__(
            self, vcf_filename=None, ref_build=None, patient_id=None,
            has_tabix=False, conv_region_filename=None, conv_region_dict=None,
            annotation_filename=None, region_studied_filename=None,
            nocall_filename=None, ratio_ad_dp=0.99,
            genomic_source_class='somatic'):

        super(Converter, self).__init__()
        if not (vcf_filename):
            raise Exception('You must provide vcf_filename')
        if not ref_build or ref_build not in ["GRCh37", "GRCh38"]:
            raise Exception(
                'You must provide build number ("GRCh37" or "GRCh38")')
        if nocall_filename and not region_studied_filename:
            raise Exception(
                ("Please also provide region_studied_filename " +
                 "when nocall_filename is provided"))
        self.vcf_filename = vcf_filename
        try:
            self._vcf_reader = vcf.Reader(filename=vcf_filename)
        except FileNotFoundError:
            raise
        except BaseException:
            self._generate_exception("Please provide valid  'vcf_filename'")
        if not patient_id:
            patient_id = self._vcf_reader.samples[0]
        if nocall_filename:
            try:
                self.nocall_region = pyranges.read_bed(nocall_filename)
            except FileNotFoundError:
                raise
            except BaseException:
                self._generate_exception(
                    "Please provide valid  'nocall_filename'")
        else:
            self.nocall_region = pyranges.PyRanges()
        if conv_region_filename:
            try:
                self.conversion_region = pyranges.read_bed(
                    conv_region_filename)
            except FileNotFoundError:
                raise
            except BaseException:
                self._generate_exception(
                    "Please provide valid 'conv_region_filename'")
        elif conv_region_dict:
            try:
                self.conversion_region = pyranges.from_dict(conv_region_dict)
            except FileNotFoundError:
                raise
            except BaseException:
                self._generate_exception(
                    "Please provide valid 'conv_region_dict'")
        else:
            self.conversion_region = None
        self.annotation_filename = annotation_filename
        if self.annotation_filename is None:
            self.annotations = None
        else:
            try:
                self.annotations = pd.read_csv(
                            self.annotation_filename,
                            names=[
                                'CHROM', 'POS', 'REF', 'ALT', 'gene',
                                'transcriptRefSeq', 'cHGVS', 'proteinRefSeq',
                                'pHGVS', 'clinSig', 'phenotype'
                            ],
                            sep='\t'
                        )
            except FileNotFoundError:
                raise
            except BaseException:
                self._generate_exception(
                        "Please provide valid 'annotation_filename'"
                    )
        if region_studied_filename:
            try:
                self.region_studied = pyranges.read_bed(
                    region_studied_filename)
            except FileNotFoundError:
                raise
            except BaseException:
                self._generate_exception(
                    "Please provide valid 'region_studied_filename'")
        else:
            self.region_studied = None

        if not validate_has_tabix(has_tabix):
            raise Exception("Please provide a valid 'has_tabix'")

        if not validate_ratio_ad_dp(ratio_ad_dp):
            raise Exception("Please provide a valid 'ratio_ad_dp'")

        if genomic_source_class.title() not in Genomic_Source_Class.set_():
            raise Exception(
                ("Please provide a valid Genomic Source Class " +
                 "('germline' or 'somatic' or 'mixed')"))

        self.ratio_ad_dp = ratio_ad_dp
        self.has_tabix = has_tabix
        self.patient_id = patient_id
        self.ref_build = ref_build
        self.nocall_filename = nocall_filename
        self.conv_region_filename = conv_region_filename
        self.genomic_source_class = genomic_source_class.title()
        general_logger.info("Converter class instantiated successfully")
Example 13
    def __init__(self,
                 vcf_filename=None,
                 ref_build=None,
                 patient_id=None,
                 has_tabix=False,
                 conv_region_filename=None,
                 conv_region_dict=None,
                 region_studied_filename=None,
                 nocall_filename=None,
                 ratio_ad_dp=0.99):

        super(Converter, self).__init__()
        if not (vcf_filename):
            raise Exception('You must provide vcf_filename')
        if not ref_build or ref_build not in ["GRCh37", "GRCh38"]:
            raise Exception(
                'You must provide build number ("GRCh37" or "GRCh38")')
        if nocall_filename and not region_studied_filename:
            raise Exception(
                "Please also provide region_studied_filename when nocall_filename is provided"
            )
        self.vcf_filename = vcf_filename
        try:
            self._vcf_reader = vcf.Reader(filename=vcf_filename)
        except FileNotFoundError:
            raise
        except:
            self._generate_exception("Please provide valid  'vcf_filename'")
        if not patient_id:
            patient_id = self._vcf_reader.samples[0]
        if nocall_filename:
            try:
                self.nocall_region = pyranges.read_bed(nocall_filename)
            except FileNotFoundError:
                raise
            except:
                self._generate_exception(
                    "Please provide valid  'nocall_filename'")
        else:
            self.nocall_region = pyranges.PyRanges()
        if conv_region_filename:
            try:
                self.conversion_region = pyranges.read_bed(
                    conv_region_filename)
            except FileNotFoundError:
                raise
            except:
                self._generate_exception(
                    "Please provide valid 'conv_region_filename'")
        elif conv_region_dict:
            try:
                self._fix_conv_region_zero_based(conv_region_dict)
                self.conversion_region = pyranges.from_dict(conv_region_dict)
            except FileNotFoundError:
                raise
            except:
                self._generate_exception(
                    "Please provide valid 'conv_region_dict'")
        else:
            self.conversion_region = None
        if region_studied_filename:
            try:
                self.region_studied = pyranges.read_bed(
                    region_studied_filename)
            except FileNotFoundError:
                raise
            except:
                self._generate_exception(
                    "Please provide valid 'region_studied_filename'")
        else:
            self.region_studied = None

        if not _Utilities.validate_has_tabix(has_tabix):
            raise Exception("Please provide a valid 'has_tabix'")

        if not _Utilities.validate_ratio_ad_dp(ratio_ad_dp):
            raise Exception("Please provide a valid 'ratio_ad_dp'")

        self.ratio_ad_dp = ratio_ad_dp
        self.has_tabix = has_tabix
        self.patient_id = patient_id
        self.ref_build = ref_build
        self.nocall_filename = nocall_filename
        self.conv_region_filename = conv_region_filename
        general_logger.info("Converter class instantiated successfully")
Example 14
    def test_score_matrix_combines_indices(self):
        # issue where value_counts() was not sorting on the index,
        # causing predictions to be combined incorrectly and returning preds > 1

        # Create dummy data
        # make 500 regions that do not overlap the Dataset
        start = np.repeat(np.arange(0, 100), 5)
        start = np.concatenate([start, [200, 1100, 1700]])

        end = np.repeat(np.arange(20, 120), 5)
        end = np.concatenate([end, [900, 1500, 2100]])

        regions_dict = {
            'Chromosome': ['chr1'] * len(start),
            'Start': start,
            'End': end,
            'idx': np.arange(0, start.shape[0])
        }  # only indices 500-502 have data

        regions_pr = pr.from_dict(regions_dict)
        # have to cast to int64
        regions = pr.PyRanges(regions_pr.df, int64=True)

        targets = ['CTCF']
        ds = EpitomeDataset(targets=targets,
                            cells=['PC-9', 'Panc1', 'IMR-90', 'H1'],
                            min_cells_per_target=2)

        # set predictions to 1s so means could be greater than 1 if done wrong
        preds = np.ones((1, 10, 1))

        conversionObject = RegionConversion(ds.regions, regions)

        results = conversionObject.merge(preds, axis=1)

        masked = np.ma.array(results, mask=np.isnan(results))
        assert (np.all(masked <= 1))

        # Error case where there are NaNs before true values:
        # the 1st region on chr1 has no overlap with the dataset, while the
        # second region on chr2 has multiple (2) overlaps
        start = [30000, 200]
        end = [30100, 900]
        regions_dict = {
            'Chromosome': ['chr1', 'chr2'],
            'Start': start,
            'End': end,
            'idx': [0, 1]
        }

        regions_pr = pr.from_dict(regions_dict)
        # have to cast to int64
        regions = pr.PyRanges(regions_pr.df, int64=True)

        conversionObject = RegionConversion(ds.regions, regions)

        preds = np.ones((1, 4, 1))

        results = conversionObject.merge(preds, axis=1)
        masked = np.ma.array(results, mask=np.isnan(results))
        assert (np.all(masked <= 1))
Example 15
    # print(type(indexes_larger_than_ref))
    if len(indexes_larger_than_ref) > 0:
        max_index = np.min(indexes_larger_than_ref)
        positions = positions[:max_index]
    start = positions - delta
    end = positions + delta
    chromosomes.extend([chromosome] * (2 * len(positions)))
    strands.extend((["+"] * len(positions)) + (["-"] * len(positions)))
    starts.extend(list(np.append(start, start)))
    ends.extend(list(np.append(end, end)))
    print(chromosome)

# random positions as pyranges
gr2 = pr.from_dict({
    "Chromosome": chromosomes,
    "Strand": strands,
    "Start": starts,
    "End": ends
})

all_cpg_unmod_intervals_pr = pr.PyRanges(all_cpg_unmod_intervals)
# remove unconfident ranges from randomly created positions
canonical_positions = gr2.subtract(possible_mod_intervals)
# remove cut off canonical ranges
canonical_positions = canonical_positions[(
    canonical_positions.End - canonical_positions.Start) == (delta * 2)]
canonical_positions = canonical_positions.df
# recreate original position for left over data
canonical_positions["midpoint"] = canonical_positions.Start + delta
# get bases for each position (all should be C but this is worth a double check)
canonical_positions['find'] = np.vectorize(get_base)(
    canonical_positions['Chromosome'], canonical_positions['midpoint'],
Example 16
    def __init__(self,
                 vcf_filename=None,
                 ref_build=None,
                 patient_id=None,
                 has_tabix=False,
                 conv_region_filename=None,
                 conv_region_dict=None,
                 region_studied_filename=None,
                 ratio_ad_dp=0.99,
                 source_class='germline',
                 seed=1000,
                 annotation_filename=None):

        super(Converter, self).__init__()
        if not (vcf_filename):
            raise Exception('You must provide vcf_filename')
        if not ref_build or ref_build not in ["GRCh37", "GRCh38"]:
            raise Exception(
                'You must provide build number ("GRCh37" or "GRCh38")')
        self.vcf_filename = vcf_filename
        try:
            # keep the sample names before materializing the records into a
            # list, since a plain list has no .samples attribute
            reader = vcf.Reader(filename=vcf_filename)
            self._vcf_samples = reader.samples
            self._vcf_reader = list(reader)
        except FileNotFoundError:
            raise
        except BaseException:
            self._generate_exception("Please provide valid 'vcf_filename'")
        self.annotation_filename = annotation_filename
        if self.annotation_filename is None:
            self.annotations = 'Not Supplied'
        else:
            try:
                self.annotations = pd.read_csv(self.annotation_filename,
                                               names=[
                                                   'CHROM', 'POS', 'REF',
                                                   'ALT', 'transcriptRefSeq',
                                                   'cHGVS', 'proteinRefSeq',
                                                   'pHGVS', 'clinSig',
                                                   'phenotype'
                                               ],
                                               sep='\t')
                if len(self.annotations) == 0:
                    self.annotations = None
            except FileNotFoundError:
                raise
            except BaseException:
                self._generate_exception(
                    "Please provide valid 'annotation_filename'")
        if not patient_id:
            patient_id = self._vcf_samples[0]
        if conv_region_filename:
            try:
                self.conversion_region = pyranges.read_bed(
                    conv_region_filename)
            except FileNotFoundError:
                raise
            except BaseException:
                self._generate_exception(
                    "Please provide valid 'conv_region_filename'")
        elif conv_region_dict:
            try:
                self.conversion_region = pyranges.from_dict(conv_region_dict)
            except FileNotFoundError:
                raise
            except BaseException:
                self._generate_exception(
                    "Please provide valid 'conv_region_dict'")
        else:
            self.conversion_region = None
        if region_studied_filename:
            try:
                self.region_studied = pyranges.read_bed(
                    region_studied_filename)
            except FileNotFoundError:
                raise
            except BaseException:
                self._generate_exception(
                    "Please provide valid 'region_studied_filename'")
        else:
            self.region_studied = None

        if not validate_has_tabix(has_tabix):
            raise Exception("Please provide a valid 'has_tabix'")

        if not validate_ratio_ad_dp(ratio_ad_dp):
            raise Exception("Please provide a valid 'ratio_ad_dp'")

        if not validate_seed(seed):
            raise Exception("Please provide a valid seed")

        if source_class not in ["germline", "somatic"]:
            raise Exception(
                'Please provide a valid Source Class ("germline" or "somatic")'
            )

        self.ratio_ad_dp = ratio_ad_dp
        self.has_tabix = has_tabix
        self.patient_id = patient_id
        self.ref_build = ref_build
        self.conv_region_filename = conv_region_filename
        if source_class == 'germline':
            self.source_class = 'LA6683-2^Germline^LN'
        elif source_class == 'somatic':
            self.source_class = 'LA6684-0^Somatic^LN'
        self.seed = seed
        general_logger.info("Converter class instantiated successfully")