def test_score_peak_file(self):
    """Score a regions BED file against one similarity BED file.

    The model must return one prediction row per region. Both BED inputs
    are written to temporary files that are removed once scoring is done.
    """
    import os

    # Create dummy data
    similarity_dict = {
        'Chromosome': ['chr1', 'chr1', 'chr6'],
        'Start': [200, 400, 1100],
        'End': [220, 440, 1150]
    }
    regions_dict = {
        'Chromosome': ['chr1', 'chr1'],
        'Start': [210, 410],
        'End': [215, 415]
    }
    similarity_pr = pr.from_dict(similarity_dict)
    regions_pr = pr.from_dict(regions_dict)

    # delete=False keeps the path on disk after close(); only the names
    # are used below, so the handles are closed straight away.
    test_similarity_peak_file = tempfile.NamedTemporaryFile(delete=False)
    test_regions_peak_file = tempfile.NamedTemporaryFile(delete=False)
    try:
        test_similarity_peak_file.close()
        test_regions_peak_file.close()

        # Write to temp bed file
        similarity_pr.to_bed(test_similarity_peak_file.name)
        regions_pr.to_bed(test_regions_peak_file.name)

        preds = self.model.score_peak_file(
            [test_similarity_peak_file.name], test_regions_peak_file.name)
    finally:
        # Previously these files were left behind on every test run.
        os.remove(test_similarity_peak_file.name)
        os.remove(test_regions_peak_file.name)

    assert (preds.shape[0] == len(regions_pr))
def coverage(intervals, features, feature_name, fun=sum, details=True):
    """Annotate interval bins with the features that overlap them.

    Parameters
    ----------
    intervals : DataFrame
        Bins to annotate; read columns include ``gene_chrom``, ``bin_start``,
        ``bin_end`` and ``bin_strand``.
    features : DataFrame
        Features to overlap; read columns are ``feature_chrom``,
        ``feature_start``, ``feature_end``, ``feature_strand`` and
        ``feature_name``.
    feature_name : str
        Label written in front of ``=`` in the attribute strings.
    fun : callable
        Aggregator applied to the per-overlap ``hit`` flags of each bin.
    details : bool
        When true, also produce ``attributes_details`` listing the
        comma-joined names of the overlapping features.

    Returns
    -------
    DataFrame
        The grouping columns followed by ``attributes`` (and
        ``attributes_details`` when ``details`` is true).
    """
    group_cols = [
        "bin_start", "bin_end", "bin_strand", "gene_chrom", "gene_name",
        "gene_strand", "gene_start", "gene_end", "gene_region_start",
        "gene_region_end"
    ]
    attribute_cols = ["attributes"]
    if details:
        attribute_cols = attribute_cols + ["attributes_details"]
    return_cols = group_cols + attribute_cols
    kept_cols = list(set(return_cols) & set(intervals.columns))

    # Each bin is widened by one position on both sides before joining.
    interval_coords = {
        'Chromosome': intervals["gene_chrom"],
        'Start': intervals["bin_start"] - 1,
        'End': intervals["bin_end"] + 1,
        'Strand': intervals["bin_strand"]
    }
    intervals_pr = pr.from_dict({**interval_coords, **intervals})

    feature_coords = {
        'Chromosome': features["feature_chrom"],
        'Start': features["feature_start"],
        'End': features["feature_end"],
        'Strand': features["feature_strand"],
        'feature_name': features["feature_name"]
    }
    features_pr = pr.from_dict({**feature_coords, **features})

    joined = intervals_pr.join(features_pr, how=False, strandedness="same")
    overlaps = joined.as_df()
    overlaps["hit"] = True

    hits_per_bin = overlaps.groupby(
        group_cols, as_index=False).aggregate({'hit': fun})
    results = intervals[kept_cols].merge(
        hits_per_bin, how="left", on=group_cols)

    if details:
        distinct_hits = overlaps.drop_duplicates(
            group_cols + ["feature_name", "hit"])
        names_per_bin = distinct_hits.groupby(
            group_cols, as_index=False).agg({'feature_name': ','.join})
        results = results.merge(names_per_bin, how="left", on=group_cols)

        # NOTE(review): the existing-column branch extends "attributes",
        # not "attributes_details" -- confirm this is intended.
        if "attributes_details" in results:
            results["attributes_details"] = (
                results["attributes"] + "; " + feature_name + "="
                + results["feature_name"].fillna(""))
        else:
            results["attributes_details"] = (
                feature_name + "=" + results["feature_name"].fillna(""))

    if "attributes" in results:
        results["attributes"] = (
            results["attributes"] + "; " + feature_name + "="
            + results["hit"].fillna(0).astype(str))
    else:
        results["attributes"] = (
            feature_name + "=" + results["hit"].fillna(0).astype(str))

    return results[return_cols]
def find_overlaps(event1, event2, combi):
    """Report whether two events overlap according to pyranges.

    ``event1`` and ``event2`` are turned into range sets keyed by
    ``combi[0]`` and ``combi[1]``. The result is True when at least one row
    of the overlap-count table has the two keys' counts summing to 2.
    Returns False when the range sets cannot be built (``ValueError``).
    """
    columns_by_name = {
        combi[0]: pd.DataFrame(event1).to_dict(),
        combi[1]: pd.DataFrame(event2).to_dict()
    }
    try:
        ranges_by_name = {
            name: pr.from_dict(columns).drop_duplicate_positions(keep=False)
            for name, columns in columns_by_name.items()
        }
    except ValueError:
        return False

    count_table = pr.count_overlaps(ranges_by_name).df
    pair_counts = count_table[[combi[0], combi[1]]]
    return any(row.sum() == 2 for _, row in pair_counts.iterrows())
def test_score_matrix_missing_data(self):
    """Scores for the first region (index 0) must all be NaN.

    Per the original test comment, this covers a region in the regions
    file that does not overlap anything in the training data.
    """
    import os

    # Create dummy data
    regions_dict = {
        'Chromosome': ['chr1', 'chr1'],
        'Start': [50, 10000],
        'End': [150, 10400]
    }
    regions_pr = pr.from_dict(regions_dict)
    accessilibility_peak_matrix = np.random.uniform(
        low=0., high=1., size=(4, 2))

    # Only the path is needed; the handle is closed before pyranges writes.
    regions_peak_file = tempfile.NamedTemporaryFile(delete=False)
    try:
        regions_peak_file.close()

        # Write to tmp bed file
        regions_pr.to_bed(regions_peak_file.name)

        results = self.model.score_matrix(accessilibility_peak_matrix,
                                          regions_peak_file.name)
    finally:
        # The file used to be left open and on disk after the test.
        os.remove(regions_peak_file.name)

    assert np.all(np.isnan(results[:, 0, :]))
def test_score_matrix(self):
    """Score a 4x2 accessibility matrix against two regions.

    The result must have shape (4, 2, 1) and every non-NaN score must be
    at most 1.
    """
    import os

    # Create dummy data
    regions_dict = {
        'Chromosome': ['chr1', 'chr1'],
        'Start': [10000, 30000],
        'End': [10300, 31200]
    }
    regions_pr = pr.from_dict(regions_dict)
    accessilibility_peak_matrix = np.random.uniform(
        low=0., high=1., size=(4, 2))

    # Only the path is needed; the handle is closed before pyranges writes.
    regions_peak_file = tempfile.NamedTemporaryFile(delete=False)
    try:
        regions_peak_file.close()

        # Write to tmp bed file
        regions_pr.to_bed(regions_peak_file.name)

        results = self.model.score_matrix(accessilibility_peak_matrix,
                                          regions_peak_file.name)
    finally:
        # The file used to be left open and on disk after the test.
        os.remove(regions_peak_file.name)

    assert (results.shape == (4, 2, 1))
    # NaN entries are masked out before checking the upper bound.
    masked = np.ma.array(results, mask=np.isnan(results))
    assert (np.all(masked <= 1))
def pr_window_thin(df, window: int, chroms: list = None):
    """Thin ranges to one per window, keeping the one nearest the midpoint.

    For each chromosome, the span from the smallest ``Start`` to the
    largest ``End`` is tiled into windows of size ``window``. Every range
    is matched to its nearest window midpoint, and for each midpoint only
    the range with the smallest distance is kept.

    Parameters
    ----------
    df : DataFrame
        Ranges with at least ``Chromosome``, ``Start`` and ``End`` columns.
    window : int
        Window size passed to ``PyRanges.window``.
    chroms : list, optional
        Chromosomes to process. When empty or None, every chromosome
        present in ``df`` is used.

    Returns
    -------
    DataFrame
        The retained rows of the nearest-join table (including the
        ``Start_b``, ``End_b`` and ``Distance`` columns), or an empty
        DataFrame when nothing was processed.
    """
    if not chroms:
        chroms = df.Chromosome.unique()

    # Collect per-chromosome results and concatenate once at the end.
    # Growing a frame with pd.concat inside the loop copies all earlier
    # rows each time and relies on pandas' deprecated handling of empty
    # frames in concat.
    thinned = []
    for chrom in chroms:
        pr_chr = pr.PyRanges(df[df.Chromosome == chrom])
        if not pr_chr:
            continue

        # Create pyranges object with midpoint of window
        gr = pr.from_dict({
            "Chromosome": [chrom],
            "Start": pr_chr.Start.min(),
            "End": pr_chr.End.max()
        })
        gr_mid = gr.window(window).copy()
        gr_mid.Start = (
            gr_mid.End - (gr_mid.End - gr_mid.Start) / 2
        ).round(0).astype('int32')
        gr_mid.End = gr_mid.Start + 1

        # Find distance between SNPs and midpoint. The table is
        # round-tripped through CSV, as before, to get a plain DataFrame.
        pr_nearest = pr_chr.nearest(gr_mid)
        nearest = pd.read_csv(io.StringIO(pr_nearest.to_csv()))

        # Group by window and find SNP nearest to midpoint
        idx = nearest.groupby(['Start_b', 'End_b']).Distance.idxmin()
        thinned.append(nearest.loc[idx])

    if not thinned:
        return pd.DataFrame()
    return pd.concat(thinned, ignore_index=True)
def test_intersect_overlapping_frames():
    """Intersecting two frames keeps only the span they share.

    The chromosome-1 intervals (1-4 and 2-7) share 2-4; the chromosome-2
    intervals (10-15 and 16-18) do not overlap.
    """
    first = pr.from_dict({
        "Chromosome": [1, 2],
        "Start": [1, 10],
        "End": [4, 15]
    })
    second = pr.from_dict({
        "Chromosome": [1, 2],
        "Start": [2, 16],
        "End": [7, 18]
    })
    expected = pr.from_dict({
        "Chromosome": [1],
        "Start": [2],
        "End": [4]
    })

    observed = intersect([first, second])

    assert_frame_equal(
        observed.df,
        expected.df,
        check_dtype=False,
        check_categorical=False,
    )
def countContexts(fastaFilePath, whiteListBed=None, blackListBed=None):
    """Count di- and trinucleotide contexts in a FASTA file.

    Parameters
    ----------
    fastaFilePath
        Path of the FASTA file to scan.
    whiteListBed
        Optional BED file restricting the regions that are counted. When
        None, every reference sequence is counted in full.
    blackListBed
        Optional BED file whose regions are subtracted from the whitelist.

    Returns
    -------
    tuple
        ``(diNucCounts, triNucCounts)``, two ``defaultdict(int)`` mapping
        each observed 2-mer / 3-mer to its count.
    """
    debug(f"Starting to count contexts of nucleotides in {fastaFilePath}")
    triNucCounts = defaultdict(int)
    diNucCounts = defaultdict(int)

    # open the fastaFile
    with FastaFile(fastaFilePath) as fastaFile:
        # if we do not have a whitelist to start out, we make one from the
        # fasta, which includes everything
        if whiteListBed is None:
            wlObj = from_dict(
                {
                    "Chromosome": fastaFile.references,
                    # fetch() below takes 0-based, half-open coordinates,
                    # so a whole contig is [0, length). Starting at 1
                    # dropped the first base of every reference.
                    "Start": [0] * fastaFile.nreferences,
                    "End": fastaFile.lengths,
                }
            )
        else:
            # we cast this to string, because pyranges wants string and we
            # use the Path type
            wlObj = read_bed(str(whiteListBed))
            wlObj = wlObj.merge()

        # if we have a blacklist, we subtract that from the whitelist,
        # otherwise we leave it how it is
        if blackListBed is not None:
            # we cast this to string, because pyranges wants string and we
            # use the Path type
            blObj = read_bed(str(blackListBed))
            blObj = blObj.merge()
            # no need to merge again: subtracting only removes ranges
            wlObj = wlObj.subtract(blObj)

        # while we could use the get_fasta function from pyranges, it needs
        # another dependency (pyfaidx) and is slower (from preliminary
        # testing), so we iterate over all chromosomes and their ranges
        for chrom, df in wlObj:
            for start, end in zip(df["Start"], df["End"]):
                seq = fastaFile.fetch(
                    reference=chrom, start=int(start), end=int(end)
                )
                # NOTE(review): both counters stop at len(seq) - 2, so the
                # last dinucleotide of each region is not counted --
                # confirm whether that is intended.
                for i in range(len(seq) - 2):
                    diNucCounts[seq[i : i + 2]] += 1
                    triNucCounts[seq[i : i + 3]] += 1
            debug(f"contect frequency analysis complete for chromsome {chrom}")

    return (diNucCounts, triNucCounts)
def test_score_whole_genome(self):
    """Score chr7 and chr8 genome-wide and check the saved ``.npz``.

    The archive must contain ``preds`` with shape (200, 4) and ``names``
    with 4 entries. The first 100 rows must be on chr7 and the rest on
    chr8. All temporary files, including the generated archive, are
    removed afterwards.
    """
    import os

    # Create dummy data
    similarity_dict = {
        'Chromosome': ['chr7', 'chr7', 'chr8'],
        'Start': [200, 400, 1100],
        'End': [220, 440, 1150]
    }
    similarity_pr = pr.from_dict(similarity_dict)

    # Only the paths are needed, so both handles are closed immediately.
    test_similarity_peak_file = tempfile.NamedTemporaryFile(delete=False)
    file_prefix = tempfile.NamedTemporaryFile(delete=False)
    file_prefix_name = file_prefix.name
    scores_path = file_prefix_name + ".npz"
    try:
        test_similarity_peak_file.close()
        file_prefix.close()

        # Write to temp bed file
        similarity_pr.to_bed(test_similarity_peak_file.name)

        self.model.score_whole_genome([test_similarity_peak_file.name],
                                      file_prefix_name,
                                      chrs=['chr7', 'chr8'])

        # load in scores; the context manager closes the archive handle
        with np.load(scores_path, allow_pickle=True) as loaded:
            assert 'preds' in loaded.keys() and 'names' in loaded.keys()
            preds = loaded['preds']
            names = loaded['names']
    finally:
        # Previously both temp files and the .npz were left on disk.
        for path in (test_similarity_peak_file.name, file_prefix_name,
                     scores_path):
            if os.path.exists(path):
                os.remove(path)

    assert preds.shape == (200, 4)
    assert names.shape[0] == 4  # chr, start, end, CTCF
    assert np.all(preds[:100, 0] == 'chr7')
    assert np.all(preds[100:, 0] == 'chr8')
def make_plots(target_paths):
    """Build a tool-membership table of events from ``unified.out`` files.

    Each path containing ``"unified.out"`` is read as a tab-separated
    table and attributed to the first entry of ``tools`` whose name occurs
    in the path. Events are grouped by ``event_type`` and tool. For every
    event type a per-row indicator of which tools report it is derived,
    and the number of rows per indicator pattern and event type is
    counted.

    Parameters
    ----------
    target_paths : iterable of str
        Candidate file paths; entries without "unified.out" are ignored.

    Returns
    -------
    DataFrame
        One row per observed tool combination, indexed by the boolean
        tool columns, with one count column per event type.
    """
    # event type -> tool -> list of column dicts
    unified = defaultdict(lambda: defaultdict(list))
    tools = [
        "asgal", "aspli", "eventpointer", "irfinder", "majiq", "sgseq",
        "spladder", "whippet"
    ]
    # event types in first-seen order
    evs = []
    for file in target_paths:
        if "unified.out" in file:
            # The tool is the first known tool name found in the path.
            tool = tools[np.where(list(map(lambda x: x in file, tools)))[0][0]]
            if "_filtered" in tool:
                tool = tool.replace("_filtered", "")
            try:
                tmp = pd.read_csv(file, sep="\t")
            except EmptyDataError as e:
                # Empty input files are reported and skipped.
                print(f"This file returns the following error: {file}")
                print(e)
                continue
            tmp = tmp.dropna()
            for ev in tmp.event_type.unique():
                events = tmp[tmp['event_type'] == ev]
                org = events['chr'].copy(deep=False)
                # Prefix chromosome names with "chr".
                events.loc[:, ['chr']] = list(map(lambda x: "chr" + str(x), org))
                # Rename columns positionally to the pyranges convention.
                events.columns = [
                    "Chromosome", "gene", "id", "strand", "event_type",
                    "count", "Start", "End"
                ]
                unified[ev][tool].append(events.to_dict('list'))
                if ev not in set(evs):
                    evs.append(ev)
    # tool-indicator string -> event type -> count
    allcomb = dict()
    for ev, X in unified.items():
        if ev == 'MEE' or ev == 'MES':
            # These event types are compared pairwise between tools using
            # the helper functions expand_coord / find_all_overlaps.
            # NOTE(review): the first column name below looks garbled
            # (stray CSS text); columns are only read positionally later,
            # but confirm the intended name.
            realcount = pd.DataFrame(
                columns=[" font-size: 1rem;Chromosome", "Start", "End"] + tools)
            for combi in it.combinations(X.keys(), 2):
                # NOTE(review): combinations(..., 2) always yields pairs,
                # so this branch looks unreachable -- confirm.
                if len(combi) < 2:
                    df1 = pd.DataFrame(X[combi[0]][0]).reset_index()
                    row1 = create_row([combi[0]])
                    for _ in range(df1.shape[0]):
                        realcount.loc[realcount.shape[0]] = ["chr", 0, 0
                                                             ] + row1
                    continue
                df1 = pd.DataFrame(X[combi[0]][0]).reset_index()
                df1['index'] = df1.index
                df2 = pd.DataFrame(X[combi[1]][0]).reset_index()
                df2['index'] = df2.index
                merged1 = expand_coord(df1)
                merged2 = expand_coord(df2)
                matched_index = []
                for mergin in merged1.keys():
                    mergin0_res = find_all_overlaps(mergin, merged1, merged2,
                                                    combi)
                    if any(mergin0_res):
                        row1 = create_row([combi[0], combi[1]], tools)
                        # Remember which event of the second tool matched
                        # so it is not counted again below.
                        matched_index.append(
                            np.where(mergin0_res)[0][0]
                        )
                    else:
                        row1 = create_row([combi[0]], tools)
                    # Placeholder coordinates: only the tool columns are
                    # read from these rows afterwards.
                    realcount.loc[realcount.shape[0]] = ["chr", 0, 0] + row1
                # add the events in tool 2 that don't have overlaps
                row2 = create_row([combi[1]], tools)
                for _ in range(len(merged2) - len(matched_index)):
                    realcount.loc[realcount.shape[0]] = ["chr", 0, 0] + row2
        else:
            # Other event types: count overlaps across all tools at once,
            # using only the first table stored for each tool.
            grs = {
                n: pr.from_dict(s[0]).drop_duplicate_positions(keep=False)
                for n, s in X.items()
            }
            counts = pr.count_overlaps(grs)
            countdf = counts.df
            # check if there are tools left out; they get a zero column
            missed = [
                tools[x]
                for x in np.where(np.isin(tools, countdf.columns) == False)[0]
            ]
            if len(missed) > 0:
                for x in missed:
                    countdf[x] = 0
            realcount = countdf[["Chromosome", "Start", "End"] + tools]
        for row in realcount.itertuples():
            # row[0] is the index and row[1:4] the coordinates; the rest
            # are the per-tool counts, clipped to 0/1.
            tmp = list(row[4:])
            tmp = [1 if x > 1 else x for x in tmp]
            binkey = ''.join([str(x) for x in tmp])
            if np.sum(tmp) == 0:
                continue
            else:
                if binkey not in set(allcomb.keys()):
                    allcomb.setdefault(binkey, {})
                    for e in evs:
                        allcomb[binkey].setdefault(e, 0)
                allcomb[binkey][ev] += 1
    forplot = pd.DataFrame(columns=tools + evs)
    for n, j in allcomb.items():
        # One boolean per tool (from the indicator string) plus the counts.
        thisrow = [bool(int(x)) for x in n] + list(j.values())
        forplot.loc[forplot.shape[0]] = thisrow
    forplot = forplot.set_index(tools)
    return forplot
def __init__(self, vcf_filename=None, ref_build=None, patient_id=None,
             has_tabix=False, conv_region_filename=None,
             conv_region_dict=None, region_studied_filename=None,
             nocall_filename=None):
    """
    Create a new Converter Object to convert a VCF file.

    Parameters
    ----------
    vcf_filename : str (Required)
        Path to text-based or bgzipped VCF file containing variants to be
        converted into FHIR format. Valid path and filename without
        whitespace. VCF file must conform to VCF Version 4.1 or later.
        FORMAT.GT must be present. Multi-sample VCFs are allowed, but only
        the first sample will be converted.
    ref_build : str (Required)
        Genome Reference Consortium genome assembly to which variants in
        the VCF were called. Must be one of 'GRCh37' or 'GRCh38'.
    patient_id : str (Optional)
        Patient whose VCF file is being processed. Alphanumeric string
        without whitespace. Default value is the first sample name.
    has_tabix : bool (Optional)
        If a tabix file exists for the VCF then set it to True. The tabix
        file should have the same name as the VCF file, with a '.tbi'
        extension, and must be in the same folder. Default value is False.
    conv_region_filename : str (Optional)
        Path to conversion region bed file. Subset of the VCF file to be
        converted into FHIR. If absent, the entire VCF file is converted.
        Must be a valid BED file.
    conv_region_dict : dict (Optional)
        Conversion region can also be provided using a dict. It is ignored
        if 'conv_region_filename' is provided. Format:
        {"Chromosome": ["chr1", "chr2"], "Start": [100, 200],
        "End": [150, 201]}
    region_studied_filename : str (Optional)
        Path to region studied bed file. Subset of the patient's genome
        that was studied in the generation of the VCF file. If present,
        only studied regions are converted. Must be a valid BED file, with
        first 3 columns: <chr> <start> <stop>.
    nocall_filename : str (Optional)
        Path to no call bed file. Subset of studied region that is deemed
        noncallable. If present, only studied regions minus noncallable
        regions are converted. Must be a valid BED file, with first 3
        columns: <chr> <start> <stop>.

    Returns
    -------
    Object
        An instance of Converter that helps to convert a VCF file.

    Raises
    ------
    Exception
        If 'vcf_filename' is missing, 'ref_build' is not one of the
        supported builds, or 'nocall_filename' is given without
        'region_studied_filename'.
    FileNotFoundError
        Re-raised unchanged when an input file does not exist.
    """
    super(Converter, self).__init__()
    if not (vcf_filename):
        raise Exception('You must provide vcf_filename')
    if not ref_build or ref_build not in ["GRCh37", "GRCh38"]:
        raise Exception(
            'You must provide build number ("GRCh37" or "GRCh38")')
    if nocall_filename and not region_studied_filename:
        raise Exception(
            "Please also provide region_studied_filename when nocall_filename is provided")
    self.vcf_filename = vcf_filename

    # In every reader below a missing file propagates as-is, while any
    # other failure is reported through self._generate_exception. Catching
    # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
    # pass through untouched.
    try:
        self._vcf_reader = vcf.Reader(filename=vcf_filename)
    except FileNotFoundError:
        raise
    except Exception:
        self._generate_exception("Please provide valid 'vcf_filename'")

    if not patient_id:
        patient_id = self._vcf_reader.samples[0]

    if nocall_filename:
        try:
            self.nocall_region = pyranges.read_bed(nocall_filename)
        except FileNotFoundError:
            raise
        except Exception:
            self._generate_exception(
                "Please provide valid 'nocall_filename'")
    else:
        self.nocall_region = pyranges.PyRanges()

    # The file takes precedence over the dict when both are supplied.
    if conv_region_filename:
        try:
            self.conversion_region = pyranges.read_bed(conv_region_filename)
        except FileNotFoundError:
            raise
        except Exception:
            self._generate_exception(
                "Please provide valid 'conv_region_filename'")
    elif conv_region_dict:
        try:
            self._fix_conv_region_zero_based(conv_region_dict)
            self.conversion_region = pyranges.from_dict(conv_region_dict)
        except FileNotFoundError:
            raise
        except Exception:
            self._generate_exception(
                "Please provide valid 'conv_region_dict'")
    else:
        self.conversion_region = None

    if region_studied_filename:
        try:
            self.region_studied = pyranges.read_bed(region_studied_filename)
        except FileNotFoundError:
            raise
        except Exception:
            self._generate_exception(
                "Please provide valid 'region_studied_filename'")
    else:
        self.region_studied = None

    self.has_tabix = has_tabix
    self.patient_id = patient_id
    self.ref_build = ref_build
    self.nocall_filename = nocall_filename
    self.conv_region_filename = conv_region_filename
    general_logger.info("Converter class instantiated successfully")
def __init__(
        self,
        vcf_filename=None,
        ref_build=None,
        patient_id=None,
        has_tabix=False,
        conv_region_filename=None,
        conv_region_dict=None,
        annotation_filename=None,
        region_studied_filename=None,
        nocall_filename=None,
        ratio_ad_dp=0.99,
        genomic_source_class='somatic'):
    """Validate the inputs and load every file needed for a conversion.

    Parameters
    ----------
    vcf_filename : str
        Path of the VCF file to convert (required).
    ref_build : str
        Reference build, 'GRCh37' or 'GRCh38' (required).
    patient_id : str, optional
        Defaults to the first sample name of the VCF.
    has_tabix : bool, optional
        Checked with ``validate_has_tabix``.
    conv_region_filename : str, optional
        BED file with the conversion region; takes precedence over
        ``conv_region_dict``.
    conv_region_dict : dict, optional
        Conversion region as a dict accepted by ``pyranges.from_dict``.
    annotation_filename : str, optional
        Tab-separated, header-less annotation file.
    region_studied_filename : str, optional
        BED file with the studied region.
    nocall_filename : str, optional
        BED file with the no-call region; requires
        ``region_studied_filename``.
    ratio_ad_dp : float, optional
        Checked with ``validate_ratio_ad_dp``.
    genomic_source_class : str, optional
        'germline', 'somatic' or 'mixed' (case-insensitive); stored
        title-cased.

    Raises
    ------
    Exception
        For missing or invalid arguments.
    FileNotFoundError
        Re-raised unchanged when an input file does not exist.
    """
    super(Converter, self).__init__()
    if not (vcf_filename):
        raise Exception('You must provide vcf_filename')
    if not ref_build or ref_build not in ["GRCh37", "GRCh38"]:
        raise Exception(
            'You must provide build number ("GRCh37" or "GRCh38")')
    if nocall_filename and not region_studied_filename:
        raise Exception(
            ("Please also provide region_studied_filename " +
             "when nocall_filename is provided"))
    self.vcf_filename = vcf_filename

    # In every reader below a missing file propagates as-is, while any
    # other failure is reported through self._generate_exception. Catching
    # Exception rather than BaseException lets KeyboardInterrupt and
    # SystemExit pass through untouched.
    try:
        self._vcf_reader = vcf.Reader(filename=vcf_filename)
    except FileNotFoundError:
        raise
    except Exception:
        self._generate_exception("Please provide valid 'vcf_filename'")

    if not patient_id:
        patient_id = self._vcf_reader.samples[0]

    if nocall_filename:
        try:
            self.nocall_region = pyranges.read_bed(nocall_filename)
        except FileNotFoundError:
            raise
        except Exception:
            self._generate_exception(
                "Please provide valid 'nocall_filename'")
    else:
        self.nocall_region = pyranges.PyRanges()

    # The file takes precedence over the dict when both are supplied.
    if conv_region_filename:
        try:
            self.conversion_region = pyranges.read_bed(
                conv_region_filename)
        except FileNotFoundError:
            raise
        except Exception:
            self._generate_exception(
                "Please provide valid 'conv_region_filename'")
    elif conv_region_dict:
        try:
            self.conversion_region = pyranges.from_dict(conv_region_dict)
        except FileNotFoundError:
            raise
        except Exception:
            self._generate_exception(
                "Please provide valid 'conv_region_dict'")
    else:
        self.conversion_region = None

    self.annotation_filename = annotation_filename
    if self.annotation_filename is None:
        self.annotations = None
    else:
        try:
            self.annotations = pd.read_csv(
                self.annotation_filename,
                names=[
                    'CHROM', 'POS', 'REF', 'ALT', 'gene',
                    'transcriptRefSeq', 'cHGVS', 'proteinRefSeq',
                    'pHGVS', 'clinSig', 'phenotype'
                ],
                sep='\t'
            )
        except FileNotFoundError:
            raise
        except Exception:
            self._generate_exception(
                "Please provide valid 'annotation_filename'"
            )

    if region_studied_filename:
        try:
            self.region_studied = pyranges.read_bed(
                region_studied_filename)
        except FileNotFoundError:
            raise
        except Exception:
            self._generate_exception(
                "Please provide valid 'region_studied_filename'")
    else:
        self.region_studied = None

    if not validate_has_tabix(has_tabix):
        raise Exception("Please provide a valid 'has_tabix'")
    if not validate_ratio_ad_dp(ratio_ad_dp):
        raise Exception("Please provide a valid 'ratio_ad_dp'")
    if genomic_source_class.title() not in Genomic_Source_Class.set_():
        raise Exception(
            ("Please provide a valid Genomic Source Class " +
             "('germline' or 'somatic' or 'mixed')"))

    self.ratio_ad_dp = ratio_ad_dp
    self.has_tabix = has_tabix
    self.patient_id = patient_id
    self.ref_build = ref_build
    self.nocall_filename = nocall_filename
    self.conv_region_filename = conv_region_filename
    self.genomic_source_class = genomic_source_class.title()
    general_logger.info("Converter class instantiated successfully")
def __init__(self,
             vcf_filename=None,
             ref_build=None,
             patient_id=None,
             has_tabix=False,
             conv_region_filename=None,
             conv_region_dict=None,
             region_studied_filename=None,
             nocall_filename=None,
             ratio_ad_dp=0.99):
    """Validate the inputs and load every file needed for a conversion.

    Parameters
    ----------
    vcf_filename : str
        Path of the VCF file to convert (required).
    ref_build : str
        Reference build, 'GRCh37' or 'GRCh38' (required).
    patient_id : str, optional
        Defaults to the first sample name of the VCF.
    has_tabix : bool, optional
        Checked with ``_Utilities.validate_has_tabix``.
    conv_region_filename : str, optional
        BED file with the conversion region; takes precedence over
        ``conv_region_dict``.
    conv_region_dict : dict, optional
        Conversion region as a dict; it is passed through
        ``self._fix_conv_region_zero_based`` before being loaded.
    region_studied_filename : str, optional
        BED file with the studied region.
    nocall_filename : str, optional
        BED file with the no-call region; requires
        ``region_studied_filename``.
    ratio_ad_dp : float, optional
        Checked with ``_Utilities.validate_ratio_ad_dp``.

    Raises
    ------
    Exception
        For missing or invalid arguments.
    FileNotFoundError
        Re-raised unchanged when an input file does not exist.
    """
    super(Converter, self).__init__()
    if not (vcf_filename):
        raise Exception('You must provide vcf_filename')
    if not ref_build or ref_build not in ["GRCh37", "GRCh38"]:
        raise Exception(
            'You must provide build number ("GRCh37" or "GRCh38")')
    if nocall_filename and not region_studied_filename:
        raise Exception(
            "Please also provide region_studied_filename when nocall_filename is provided"
        )
    self.vcf_filename = vcf_filename

    # In every reader below a missing file propagates as-is, while any
    # other failure is reported through self._generate_exception. Catching
    # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
    # pass through untouched.
    try:
        self._vcf_reader = vcf.Reader(filename=vcf_filename)
    except FileNotFoundError:
        raise
    except Exception:
        self._generate_exception("Please provide valid 'vcf_filename'")

    if not patient_id:
        patient_id = self._vcf_reader.samples[0]

    if nocall_filename:
        try:
            self.nocall_region = pyranges.read_bed(nocall_filename)
        except FileNotFoundError:
            raise
        except Exception:
            self._generate_exception(
                "Please provide valid 'nocall_filename'")
    else:
        self.nocall_region = pyranges.PyRanges()

    # The file takes precedence over the dict when both are supplied.
    if conv_region_filename:
        try:
            self.conversion_region = pyranges.read_bed(
                conv_region_filename)
        except FileNotFoundError:
            raise
        except Exception:
            self._generate_exception(
                "Please provide valid 'conv_region_filename'")
    elif conv_region_dict:
        try:
            self._fix_conv_region_zero_based(conv_region_dict)
            self.conversion_region = pyranges.from_dict(conv_region_dict)
        except FileNotFoundError:
            raise
        except Exception:
            self._generate_exception(
                "Please provide valid 'conv_region_dict'")
    else:
        self.conversion_region = None

    if region_studied_filename:
        try:
            self.region_studied = pyranges.read_bed(
                region_studied_filename)
        except FileNotFoundError:
            raise
        except Exception:
            self._generate_exception(
                "Please provide valid 'region_studied_filename'")
    else:
        self.region_studied = None

    if not _Utilities.validate_has_tabix(has_tabix):
        raise Exception("Please provide a valid 'has_tabix'")
    if not _Utilities.validate_ratio_ad_dp(ratio_ad_dp):
        raise Exception("Please provide a valid 'ratio_ad_dp'")

    self.ratio_ad_dp = ratio_ad_dp
    self.has_tabix = has_tabix
    self.patient_id = patient_id
    self.ref_build = ref_build
    self.nocall_filename = nocall_filename
    self.conv_region_filename = conv_region_filename
    general_logger.info("Converter class instantiated successfully")
def test_score_matrix_combines_indices(self):
    """Merged predictions must never exceed 1.

    Regression test for an issue where value_counts() was not sorting on
    the index, causing predictions to be combined incorrectly and
    returning preds > 1.
    """
    def to_int64_ranges(regions_dict):
        # have to cast to int64
        return pr.PyRanges(pr.from_dict(regions_dict).df, int64=True)

    def assert_merged_at_most_one(dataset, regions, preds):
        converter = RegionConversion(dataset.regions, regions)
        merged = converter.merge(preds, axis=1)
        # NaN entries are masked out before checking the upper bound.
        without_nans = np.ma.array(merged, mask=np.isnan(merged))
        assert (np.all(without_nans <= 1))

    # Create dummy data
    # make 500 regions that do not overlap the Dataset
    starts = np.concatenate(
        [np.repeat(np.arange(0, 100), 5), [200, 1100, 1700]])
    ends = np.concatenate(
        [np.repeat(np.arange(20, 120), 5), [900, 1500, 2100]])
    # only indices 500-502 have data
    many_regions = to_int64_ranges({
        'Chromosome': ['chr1'] * len(starts),
        'Start': starts,
        'End': ends,
        'idx': np.arange(0, starts.shape[0])
    })

    ds = EpitomeDataset(targets=['CTCF'],
                        cells=['PC-9', 'Panc1', 'IMR-90', 'H1'],
                        min_cells_per_target=2)

    # set predictions to 1s so means could be greater than 1 if done wrong
    assert_merged_at_most_one(ds, many_regions, np.ones((1, 10, 1)))

    # Error case where there are nans before true values:
    # the 1st region on chr1 has no overlap with the dataset, while the
    # second region on chr2 has multiple (2) overlaps
    two_regions = to_int64_ranges({
        'Chromosome': ['chr1', 'chr2'],
        'Start': [30000, 200],
        'End': [30100, 900],
        'idx': [0, 1]
    })
    assert_merged_at_most_one(ds, two_regions, np.ones((1, 4, 1)))
# print(type(indexes_larger_than_ref)) if len(indexes_larger_than_ref) > 0: max_index = np.min(indexes_larger_than_ref) positions = positions[:max_index] start = positions - delta end = positions + delta chromosomes.extend([chromosome] * (2 * len(positions))) strands.extend((["+"] * len(positions)) + (["-"] * len(positions))) starts.extend(list(np.append(start, start))) ends.extend(list(np.append(end, end))) print(chromosome) # random positions as pyranges gr2 = pr.from_dict({ "Chromosome": chromosomes, "Strand": strands, "Start": starts, "End": ends }) all_cpg_unmod_intervals_pr = pr.PyRanges(all_cpg_unmod_intervals) # remove unconfident ranges from randomly created positions canonical_positions = gr2.subtract(possible_mod_intervals) # remove cut off canonical ranges canonical_positions = canonical_positions[( canonical_positions.End - canonical_positions.Start) == (delta * 2)] canonical_positions = canonical_positions.df # recreate original position for left over data canonical_positions["midpoint"] = canonical_positions.Start + delta # get bases for each position (all should be C but this is worth a double check) canonical_positions['find'] = np.vectorize(get_base)( canonical_positions['Chromosome'], canonical_positions['midpoint'],
def __init__(self,
             vcf_filename=None,
             ref_build=None,
             patient_id=None,
             has_tabix=False,
             conv_region_filename=None,
             conv_region_dict=None,
             region_studied_filename=None,
             ratio_ad_dp=0.99,
             source_class='germline',
             seed=1000,
             annotation_filename=None):
    """Validate the inputs and load every file needed for a conversion.

    Parameters
    ----------
    vcf_filename : str
        Path of the VCF file to convert (required). All records are read
        into memory as a list.
    ref_build : str
        Reference build, 'GRCh37' or 'GRCh38' (required).
    patient_id : str, optional
        Defaults to the first sample name of the VCF.
    has_tabix : bool, optional
        Checked with ``validate_has_tabix``.
    conv_region_filename : str, optional
        BED file with the conversion region; takes precedence over
        ``conv_region_dict``.
    conv_region_dict : dict, optional
        Conversion region as a dict accepted by ``pyranges.from_dict``.
    region_studied_filename : str, optional
        BED file with the studied region.
    ratio_ad_dp : float, optional
        Checked with ``validate_ratio_ad_dp``.
    source_class : str, optional
        'germline' or 'somatic'; stored as the matching coded string.
    seed : int, optional
        Checked with ``validate_seed``.
    annotation_filename : str, optional
        Tab-separated, header-less annotation file.

    Raises
    ------
    Exception
        For missing or invalid arguments.
    FileNotFoundError
        Re-raised unchanged when an input file does not exist.
    """
    super(Converter, self).__init__()
    if not (vcf_filename):
        raise Exception('You must provide vcf_filename')
    if not ref_build or ref_build not in ["GRCh37", "GRCh38"]:
        raise Exception(
            'You must provide build number ("GRCh37" or "GRCh38")')
    self.vcf_filename = vcf_filename

    # In every reader below a missing file propagates as-is, while any
    # other failure is reported through self._generate_exception. Catching
    # Exception rather than BaseException lets KeyboardInterrupt and
    # SystemExit pass through untouched.
    try:
        reader = vcf.Reader(filename=vcf_filename)
        # Capture the sample names before materialising the records: the
        # resulting list has no `.samples` attribute, so the old lookup
        # `self._vcf_reader.samples[0]` always raised AttributeError.
        sample_names = reader.samples
        self._vcf_reader = list(reader)
    except FileNotFoundError:
        raise
    except Exception:
        self._generate_exception("Please provide valid 'vcf_filename'")

    self.annotation_filename = annotation_filename
    if self.annotation_filename is None:
        self.annotations = 'Not Supplied'
    else:
        try:
            self.annotations = pd.read_csv(self.annotation_filename,
                                           names=[
                                               'CHROM', 'POS', 'REF',
                                               'ALT', 'transcriptRefSeq',
                                               'cHGVS', 'proteinRefSeq',
                                               'pHGVS', 'clinSig',
                                               'phenotype'
                                           ],
                                           sep='\t')
            # An annotation file without rows is stored as None.
            if len(self.annotations) == 0:
                self.annotations = None
        except FileNotFoundError:
            raise
        except Exception:
            self._generate_exception(
                "Please provide valid 'annotation_filename'")

    if not patient_id:
        patient_id = sample_names[0]

    # The file takes precedence over the dict when both are supplied.
    if conv_region_filename:
        try:
            self.conversion_region = pyranges.read_bed(
                conv_region_filename)
        except FileNotFoundError:
            raise
        except Exception:
            self._generate_exception(
                "Please provide valid 'conv_region_filename'")
    elif conv_region_dict:
        try:
            self.conversion_region = pyranges.from_dict(conv_region_dict)
        except FileNotFoundError:
            raise
        except Exception:
            self._generate_exception(
                "Please provide valid 'conv_region_dict'")
    else:
        self.conversion_region = None

    if region_studied_filename:
        try:
            self.region_studied = pyranges.read_bed(
                region_studied_filename)
        except FileNotFoundError:
            raise
        except Exception:
            self._generate_exception(
                "Please provide valid 'region_studied_filename'")
    else:
        self.region_studied = None

    if not validate_has_tabix(has_tabix):
        raise Exception("Please provide a valid 'has_tabix'")
    if not validate_ratio_ad_dp(ratio_ad_dp):
        raise Exception("Please provide a valid 'ratio_ad_dp'")
    if not validate_seed(seed):
        raise Exception("Please provide a valid seed")
    if source_class not in ["germline", "somatic"]:
        raise Exception(
            'Please provide a valid Source Class ("germline" or "somatic")'
        )

    self.ratio_ad_dp = ratio_ad_dp
    self.has_tabix = has_tabix
    self.patient_id = patient_id
    self.ref_build = ref_build
    self.conv_region_filename = conv_region_filename
    if source_class == 'germline':
        self.source_class = 'LA6683-2^Germline^LN'
    elif source_class == 'somatic':
        self.source_class = 'LA6684-0^Somatic^LN'
    self.seed = seed
    general_logger.info("Converter class instantiated successfully")