def test_output_bed_predict_denseout(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    # generate loss
    #
    # resolution < stepsize
    inputs = Array("x", numpy.random.random((7, 10)))
    outputs = Array('y', numpy.random.random((7, 4)),
                    conditions=['c1', 'c2', 'c3', 'c4'])

    bwm = get_janggu(inputs, outputs)

    data_path = pkg_resources.resource_filename('janggu',
                                                'resources/10regions.bed')

    gi = GenomicIndexer.create_from_file(data_path, binsize=200, stepsize=200)

    dummy_eval = Scorer('pred', lambda p: [0.1] * len(p),
                        exporter=ExportBed(gindexer=gi, resolution=200),
                        conditions=['c1', 'c2', 'c3', 'c4'])

    bwm.predict(inputs, callbacks=[dummy_eval])

    file_ = os.path.join(tmpdir.strpath, 'evaluation', bwm.name, 'pred.{}.bed')
    for cond in ['c1', 'c2', 'c3', 'c4']:
        assert os.path.exists(file_.format(cond))

    bed = BedTool(file_.format('c1'))
    nreg = 0
    for reg in bed:
        numpy.testing.assert_equal(float(reg.score), 0.1)
        nreg += 1
    assert nreg == 7, 'There should be 7 regions in the bed file.'
def process(self):
    all_sites = pd.read_csv(self.sites_file, usecols=['chr', 'coordinate'])
    all_sites = get_winid.convert_chr_to_num(all_sites)
    chrs = np.sort(all_sites['chr'].unique())
    all_sites_closest = []
    for chr in chrs:
        print('processing sites on chr ' + str(chr))
        chr_file = self.data_dir + 'chr' + str(chr) + '.tsv'
        if not os.path.exists(self.data_dir + 'chr1.tsv'):
            self.split_by_chr()
        # copy the query result so the column assignments below do not
        # trigger a SettingWithCopyWarning on a view of all_sites
        chr_sites = all_sites.query('chr==@chr').copy()
        chr_sites['coordinate'] = chr_sites['coordinate'].astype('i8')
        chr_sites['end'] = chr_sites['coordinate'] + 1
        chr_sites = BedTool([tuple(x[1]) for x in chr_sites.iterrows()])
        chr_sites_closest = chr_sites.closest(chr_file, d=True,
                                              nonamecheck=True)
        for row in chr_sites_closest:
            all_sites_closest.extend([[row[0], row[1], row[6], row[7]]])
        del chr_sites_closest
        del chr_sites
        gc.collect()
    all_sites_closest = pd.DataFrame(
        all_sites_closest,
        columns=['chr', 'coordinate', 'score', 'distance_to_nearest_DANN'])
    all_sites_closest = all_sites_closest.groupby(
        ['chr', 'coordinate']).apply(self.mean_max).reset_index()
    with pd.HDFStore(self.additional_feature_file, 'a') as h5s:
        h5s['DANN'] = all_sites_closest
def cell_scaling_factors_fragments(fragmentfile, selected_barcodes=None):
    """Counts fragments per cell barcode.

    Parameters
    ----------
    fragmentfile : str
        Input fragments file.
    selected_barcodes : set or None
        If given, only barcodes contained in this set are counted.

    Returns
    -------
    pd.Series
        Series containing the fragment counts per barcode.
    """
    barcodecount = Counter()
    bed = BedTool(fragmentfile)
    for region in bed:
        bct = region.name
        if selected_barcodes is not None:
            if bct not in selected_barcodes:
                continue
        barcodecount[bct] += 1
    return pd.Series(barcodecount)
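# A minimal usage sketch for cell_scaling_factors_fragments. The file name
# 'fragments.tsv.gz' and the barcode whitelist are hypothetical; a fragments
# file is BED-like with the cell barcode in the name (4th) column.
whitelist = {'AAACGAAAGCGC-1', 'AAACGAAAGGGT-1'}
counts = cell_scaling_factors_fragments('fragments.tsv.gz',
                                        selected_barcodes=whitelist)
print(counts.sort_values(ascending=False).head())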
def vcf_to_df_worker(arg):
    """ Convert CANVAS vcf to a dict, single thread """
    canvasvcf, exonbed, i = arg
    logging.debug("Working on job {}: {}".format(i, canvasvcf))
    samplekey = op.basename(canvasvcf).split(".")[0].rsplit('_', 1)[0]
    d = {'SampleKey': samplekey}

    exons = BedTool(exonbed)
    cn = parse_segments(canvasvcf)
    overlaps = exons.intersect(cn, wao=True)
    gcn_store = {}
    for ov in overlaps:
        # Example of ov.fields:
        # [u'chr1', u'11868', u'12227', u'ENSG00000223972.5',
        #  u'ENST00000456328.2', u'transcribed_unprocessed_pseudogene',
        #  u'DDX11L1', u'.', u'-1', u'-1', u'.', u'0']
        gene_name = "|".join((ov.fields[6], ov.fields[3], ov.fields[5]))
        if gene_name not in gcn_store:
            gcn_store[gene_name] = defaultdict(int)

        cn = ov.fields[-2]
        if cn == ".":
            continue
        cn = int(cn)
        if cn > 10:
            cn = 10
        amt = int(ov.fields[-1])
        gcn_store[gene_name][cn] += amt

    for k, v in sorted(gcn_store.items()):
        v_mean, v_median = counter_mean_and_median(v)
        d[k + ".avgcn"] = v_mean
        d[k + ".medcn"] = v_median
    cleanup()
    return d
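# A minimal sketch of driving vcf_to_df_worker in parallel. The VCF and exon
# BED file names are hypothetical; each job is a (vcf, exonbed, index) tuple
# as the worker expects.
from multiprocessing import Pool

vcfs = ['sampleA_S1.vcf.gz', 'sampleB_S2.vcf.gz']
jobs = [(vcf, 'exons.bed', i) for i, vcf in enumerate(vcfs)]
with Pool(2) as pool:
    records = pool.map(vcf_to_df_worker, jobs)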
def Cluster2ExonSkipping(Cluster):
    '''
    Yields skipped-exon (SE) events from junction clusters of size three,
    labelled by whether the alternative region overlaps an m6A peak.
    '''
    for k1, v1 in Cluster.items():
        if len(v1) == 3:
            bed3list = []
            for k2, v2 in v1.items():
                bed3list.append(Bed(k2))
            bed3list.sort(key=sortbycoordinate)
            [bed1, bed2, bed3] = bed3list
            # the middle junction spans the skipped exon; its coordinate
            # string is the key back into v1
            longer = bed2
            longer = longer.chr + "\t" + str(longer.start) + "\t" + str(
                longer.end)
            if bed1.start == bed2.start and bed2.end == bed3.end \
                    and bed1.end < bed3.start:
                alternative = BedTool("\t".join([
                    bed1.chr, str(bed1.end), str(bed3.start),
                    k1 + "alternative", "0", bed1.strand
                ]), from_string=True)
                if len(alternative.intersect(m6A_bed)) >= 1:
                    yield "SE", "m6A", k1, v1[longer]
                else:
                    yield "SE", "nom6A", k1, v1[longer]
def find_shared_peaks(multipath, maxd):
    # Split by chromosome
    chr2bedtools = defaultdict(list)
    for intervals in [BedTool(x) for x in multipath]:
        temp_d = defaultdict(list)
        for interval in intervals:
            temp_d[interval.chrom].append(interval)
        for chrom, local_intervals in temp_d.items():
            chr2bedtools[chrom].append(local_intervals)

    bedtools_list = [
        x[1] for x in sorted(chr2bedtools.items(), key=lambda x: x[0])
    ]

    stat_total_counts = []
    res_total = []
    for bedtools_chr in bedtools_list:
        res, stat_counts = _find_shared_peaks_chromosome(bedtools_chr, maxd)
        res_total.extend(res)
        stat_total_counts.extend(stat_counts)
    return res_total, stat_total_counts
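# A minimal usage sketch for find_shared_peaks, assuming three replicate peak
# files (names are hypothetical) and a maximum distance of 50 bp for peaks to
# be considered shared.
peak_files = ['rep1_peaks.bed', 'rep2_peaks.bed', 'rep3_peaks.bed']
shared, counts = find_shared_peaks(peak_files, maxd=50)
print(len(shared), 'shared peak groups')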
def make_annot_files(args, bed_for_annot):
    print('making annot file')
    df_bim = pd.read_csv(args.bimfile,
                         delim_whitespace=True,
                         usecols=[0, 1, 2, 3],
                         names=['CHR', 'SNP', 'CM', 'BP'])
    iter_bim = [['chr' + str(x1), x2 - 1, x2]
                for (x1, x2) in np.array(df_bim[['CHR', 'BP']])]
    bimbed = BedTool(iter_bim)
    annotbed = bimbed.intersect(bed_for_annot)
    bp = [x.start + 1 for x in annotbed]
    df_int = pd.DataFrame({'BP': bp, args.annot_name: 1})
    df_annot = pd.merge(df_bim, df_int, how='left', on='BP')
    df_annot.fillna(0, inplace=True)
    df_annot = df_annot[[args.annot_name]].astype(int)
    if args.annot_file.endswith('.gz'):
        with gzip.open(args.annot_file, 'wb') as f:
            df_annot.to_csv(f, sep="\t", index=False)
    else:
        df_annot.to_csv(args.annot_file, sep="\t", index=False)
def makeBamFilterShortRNA(inFile, outprefix, genome):
    '''
    bed file as input
    first sort
    then bedtobam
    index the bam file finally
    '''
    bamFolder = '/'.join(outprefix.split('/')[:-1]) + '/bamFiles'
    tempBam = bamFolder + '/' + outprefix.split('/')[-1] + \
        '.filtered.sorted.bam'
    index_name = tempBam + '.bai'
    small_RNA = '/Users/wckdouglas/plasmaDNA/reference/smallRNA.bed'
    # the original referenced an undefined name `Tool`; assuming it aliased
    # pybedtools.BedTool
    small_RNA_bed = BedTool(small_RNA)
    if not os.path.isfile(tempBam):
        print('Making %s ' % tempBam)
        BedTool(inFile)\
            .sort()\
            .to_bam(g=genome)\
            .intersect(b=small_RNA_bed, v=True, f=0.8, r=True, s=True)\
            .saveas(tempBam)
        if os.path.isfile(index_name):
            os.remove(index_name)
        index = pysam.index(tempBam)
    else:
        print('Used existing bamfile: %s' % tempBam)
    return tempBam
def main(): """ annotate a file with the neearest features in another. """ p = argparse.ArgumentParser(description=__doc__, prog=sys.argv[0]) p.add_argument("-a", dest="a", help="file to annotate") p.add_argument("-b", dest="b", help="file with annotations") p.add_argument("--upstream", dest="upstream", type=int, default=None, help="distance upstream of [a] to look for [b]") p.add_argument("--downstream", dest="downstream", type=int, default=None, help="distance downstream of [a] to look for [b]") p.add_argument("--report-distance", dest="report_distance", default=False, help="report the distance, not just the genes", action="store_true") args = p.parse_args() if (args.a is None or args.b is None): sys.exit(not p.print_help()) c = add_closest(args.a, args.b) b = BedTool(args.b) # TODO: support --report-distance for up/downstream. if args.upstream: c = add_xstream(c, b, args.upstream, "up", args.report_distance) if args.downstream: c = add_xstream(c, b, args.downstream, "down", args.report_distance) for row in c.sort(): print(row)
def main():
    usage_text = """usage: %(prog)s [options] PopulationCoveredROI VCF ...

    takes a population covered ROI file and a vcf file containing the
    mutation calls from all the samples in the population to calculate
    the mutation rate for all ROIs"""
    parser = argparse.ArgumentParser(description=usage_text)
    parser.add_argument('--version', action='version', version='%(prog)s 0.1')
    parser.add_argument("-o", dest="outFile", default="stdout",
                        help="output file name")
    parser.add_argument(dest='ROI', metavar='ROI_file', type=str,
                        help='population covered ROI file name')
    parser.add_argument(dest='vcf', metavar='VCF_file', type=str,
                        help='VCF file containing mutation calls for all the '
                             'samples in the population')
    # parser.add_argument("--bed", dest="bedFiles", metavar='sample1.bed',
    #                     type=str, nargs='+', help="bed filenames")
    args = parser.parse_args()

    ROIBed = BedTool(args.ROI)
    result = intersectMutationRegardlessOfMutationType(args.vcf)
    mutation_count = intersectBedsAndRetNumMutations(result[0], ROIBed,
                                                     result[1])
    num_samples = result[2]
    track_length = ROIBed.sort().total_coverage()
    # cast the numerator before dividing so the rate stays a float even with
    # integer inputs
    print("\n\n" + str(float(mutation_count) / (num_samples * track_length)) +
          "\n\n")
def load_bed_data_sc(genome, positive_windows, use_meta, use_gencode,
                     input_dir, is_sorted, big_wig_list, num_pos,
                     input_scATAC_dir, chrom=None):
    bed_filtered = positive_windows
    print('Generating test data iterator')
    bigwig_names, bigwig_files_list = load_bigwigs_sc(
        [input_dir], num_pos, big_wig_list, input_scATAC_dir)
    bigwig_files = bigwig_files_list[0]
    if use_meta:
        meta_names, meta_list = load_meta([input_dir])
        meta = meta_list[0]
    else:
        meta = []
        meta_names = None
    shift = 0
    if use_gencode:
        cpg_bed = BedTool('resources/cpgisland.bed.gz')
        cds_bed = BedTool('resources/wgEncodeGencodeBasicV19.cds.merged.bed.gz')
        intron_bed = BedTool('resources/wgEncodeGencodeBasicV19.intron.merged.bed.gz')
        promoter_bed = BedTool('resources/wgEncodeGencodeBasicV19.promoter.merged.bed.gz')
        utr5_bed = BedTool('resources/wgEncodeGencodeBasicV19.utr5.merged.bed.gz')
        utr3_bed = BedTool('resources/wgEncodeGencodeBasicV19.utr3.merged.bed.gz')

        peaks_cpg_bedgraph = bed_filtered.intersect(cpg_bed, wa=True, c=True)
        peaks_cds_bedgraph = bed_filtered.intersect(cds_bed, wa=True, c=True)
        peaks_intron_bedgraph = bed_filtered.intersect(intron_bed, wa=True, c=True)
        peaks_promoter_bedgraph = bed_filtered.intersect(promoter_bed, wa=True, c=True)
        peaks_utr5_bedgraph = bed_filtered.intersect(utr5_bed, wa=True, c=True)
        peaks_utr3_bedgraph = bed_filtered.intersect(utr3_bed, wa=True, c=True)

        # itertools.izip in the original is Python 2 only; zip is the
        # Python 3 equivalent
        data_bed = [(window.chrom, window.start, window.stop, 0, bigwig_files,
                     np.append(meta, np.array([cpg.count, cds.count,
                                               intron.count, promoter.count,
                                               utr5.count, utr3.count],
                                              dtype=bool)))
                    for window, cpg, cds, intron, promoter, utr5, utr3 in
                    zip(bed_filtered, peaks_cpg_bedgraph, peaks_cds_bedgraph,
                        peaks_intron_bedgraph, peaks_promoter_bedgraph,
                        peaks_utr5_bedgraph, peaks_utr3_bedgraph)]
    else:
        data_bed = [(window.chrom, window.start, window.stop, shift,
                     bigwig_files, meta) for window in bed_filtered]

    from data_iter_scFAN import DataIterator
    bigwig_rc_order = get_bigwig_rc_order(bigwig_names)
    datagen_bed = DataIterator(data_bed, genome, 100, L, bigwig_rc_order,
                               shuffle=False)
    return bigwig_names, datagen_bed
if not os.path.exists(args.output):
    # We will create a new directory for the TF
    os.makedirs(args.output)

# Format the TF argument to only have the TF name, in case the full ID was
# given as input
tf_list = []
for tf in args.transcription_factors:
    if 'var.' in tf:
        tf_list.append('.'.join(tf.replace(' ', '').split('.')[-2:]))
    else:
        tf_list.append(tf.replace(' ', '').split('.')[-1])

# Get the motif hits of the TF of interest
motif_hits = [
    x.split('\t')[:4]
    for x in open(args.motif_hit_file).read().strip().split('\n')
    if x.split('\t')[3].split('.')[-1] in tf_list
]
print(len(motif_hits), 'motif hits found')

# Intersect the filtered motif hits with the differential peaks
motif_hits_bed = BedTool('\n'.join(['\t'.join(x) for x in motif_hits]),
                         from_string=True)
diff_peak_filtered = motif_hits_bed.intersect(args.differential_peaks)
print(len(str(diff_peak_filtered).strip().split('\n')),
      'of motif hits intersect with a differential peak')
open(args.output + '/' + '_'.join(tf_list) + '_Hits.bed',
     'w').write(str(diff_peak_filtered))
from snakemake import shell
from pybedtools import BedTool

# truncate the sorted output files to chromosome so no ends are out of bounds.
spp_trunc = snakemake.output.spp_sorted_tr
spp_sorted = snakemake.input.spp_sorted
macs2_trunc = snakemake.output.macs2_sorted_tr
macs2_sorted = snakemake.input.macs2_sorted
spp2_trunc = snakemake.output.spp2_sorted_tr
spp2_sorted = snakemake.input.spp2_sorted
genome = snakemake.input.genome

BedTool(spp_sorted).truncate_to_chrom('dm6').saveas(spp_trunc)
BedTool(spp2_sorted).truncate_to_chrom('dm6').saveas(spp2_trunc)
BedTool(macs2_sorted).truncate_to_chrom('dm6').saveas(macs2_trunc)
def generateBackgroundForRegionTest(rna):
    target = BedTool('../H3K27me3/peaks_for_tdf/plus_' + rna + '_peaks.bed')
    hg19 = BedTool('../hg19/allChr.bed')
    hg19.subtract(target, output='../H3K27me3/region_test/bg_' + rna + ".bed")
help="Length of the segment right upstream") parser.add_argument('--outdir', nargs='?', required=True, type=str, help="Path to the output directory") parser.add_argument( '--ylim', nargs='?', default=False, const=True, type=bool, help="If set, plots will be cut from the bottom up to the lowest bar") args = parser.parse_args() transcripts = BedTool(args.transcripts) phages = BedTool(args.phages) phaged_transcripts = [ transcripts.intersect(b=phages, u=True, f=0.5), transcripts.intersect(b=phages, v=True, f=0.5) ] ############################################################################################################ ### TSS count section tss_counts_list = [] for ptr in phaged_transcripts: temp_dict = defaultdict(list) for transcript in ptr: temp_dict[transcript.name].append(int( transcript.attrs['tss_variants']))
def make_bed_from_gff(gff: str, up_offset: int = 2000,
                      valid_ids: List[str] = None, flavour: str = 'body'):
    """Create pybedtools object for genes from a GFF file.
    Gene coordinates are promoter extended"""
    try:
        from pybedtools import BedTool
    except ImportError:
        raise ImportError(
            "pybedtools is not installed. Check out this link to install"
            " https://daler.github.io/pybedtools/main.html#install-via-conda")

    out = []
    ignored_genes = 0
    unknown_ids = 0
    if valid_ids is not None:
        valid_ids = {x: None for x in valid_ids}
    with open(gff) as h:
        # Skip the first 5 lines, which are expected to be comment lines
        for i in range(5):
            l = next(h)
            if l[0] != '#':
                # logging calls do not accept a flush keyword
                logger.warning(f"line num {i} is not comment line")
        for l in tqdm(h):
            c = l.split('\t')
            if c[2] != 'gene':
                continue
            a = [x.split(' ') for x in c[8].rstrip('\n').split('; ')]
            a = {x: y.strip('"') for x, y in a}
            if 'gene_id' not in a:
                unknown_ids += 1
                continue
            if valid_ids is not None and a['gene_id'] not in valid_ids:
                ignored_genes += 1
                continue
            # Fetch start and end coordinate
            s, e = int(c[3]), int(c[4])
            if flavour == 'body':
                if c[6] == '+':
                    s = s - up_offset
                    s = max(s, 0)
                elif c[6] == '-':
                    e = e + up_offset
                else:
                    raise ValueError('ERROR: Unsupported symbol for strand')
            elif flavour == 'promoter':
                if c[6] == '+':
                    e = s + up_offset
                    s = s - up_offset
                    s = max(s, 0)
                elif c[6] == '-':
                    s = e - up_offset
                    s = max(s, 0)
                    e = e + up_offset
                else:
                    raise ValueError('ERROR: Unsupported symbol for strand')
            else:
                raise ValueError(
                    'ERROR: `flavour` can either be `body` or `promoter`')
            if c[0].startswith('chr'):
                chrom = c[0]
            else:
                chrom = f'chr{c[0]}'
            if 'gene_name' in a:
                gn = a['gene_name']
            else:
                gn = a['gene_id']
            o = '\t'.join([chrom, str(s), str(e), a['gene_id'], gn, c[6]])
            out.append(o)
    logger.info(f"{len(out)} genes found in the GFF file")
    logger.info(
        f"{ignored_genes} genes were ignored as they were not present in "
        f"the valid_ids")
    logger.info(
        f"{unknown_ids} genes were ignored as they did not have gene_id column")
    return BedTool('\n'.join(out), from_string=True)
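# A minimal usage sketch for make_bed_from_gff; the GFF path is hypothetical.
# With flavour='promoter' each record becomes a window of 2 * up_offset
# centred on the gene's transcription start site.
promoters = make_bed_from_gff('genes.gff', up_offset=2000, flavour='promoter')
promoters.head()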
# fp = '/Users/phi/data_local/databases/annotree/pfam_archaea/*'
fp = '/Users/phi/data_local/databases/annotree/pfam_bacteria/*'

tagged_data = []
corpus = {}

with open('/Users/phi/tmp/corpus.annotree.train.txt', 'w+') as out:
    for file in tqdm(glob(fp)):
        # TODO: skip overlap removal and deduplication for now, we need
        # strand info for this
        # Rich Hickey would be proud ...
        # dom = deduplicate(remove_overlap(load_domains(file, fmt='pfamscan')))
        result = load_domains(file, fmt='pfamscan')
        dom = BedTool(list(result.values()))
        # make sure Pfam ID is truncated: PF00815.1 -> PF00815
        seq = create_domain_sequence(dom, keep_unknown=True,
                                     fmt_fn=lambda x: x.split('.')[0])
        text = list(seq.values())
        # str.strip() removes a *set of characters*, not a suffix, so use
        # replace() to drop the '_pfam.tsv' ending without eating leading or
        # trailing letters of the genome name, e.g.
        # UBA9934_pfam.tsv
        # GB_GCA_001790445.1_pfam.tsv
        # RS_GCF_000012865.1_pfam.tsv
        genome = os.path.basename(file).replace('_pfam.tsv', '')
        if 'UBA' not in genome:
            genome = '_'.join(genome.split('_')[1:])
parser.add_argument('--outdir', required=True, nargs='?', type=str,
                    help="Path to the output directory")
args = parser.parse_args()


def check_interval(interval, mincov):
    return all(
        [float(x) > mincov for x in interval.attrs['topcoverage'].split(",")])


for path in get_only_files(args.path):
    if path.endswith('gff') or path.endswith('bed'):
        bedtool = BedTool(path)
        if len(bedtool):
            with open(os.path.join(args.outdir, os.path.basename(path)),
                      'w') as f:
                for interval in bedtool:
                    if check_interval(interval, args.mincov):
                        center = int(interval.name)
                        f.write(
                            str(
                                Interval(interval.chrom,
                                         center - args.flank,
                                         center + args.flank,
                                         name=interval.name,
                                         strand=interval.strand,
                                         score=interval.attrs['topcoverage'])))
def generate_rdf_content(self):
    """Generate RDF content of the BED file

    Yields
    ------
    Graph
        RDF content
    """
    bedfile = BedTool(self.path)
    count = 0
    attribute_list = []

    total_lines = sum(1 for line in open(self.path))
    row_number = 0

    entity_type = self.namespace_data[self.format_uri(self.entity_name,
                                                      remove_space=True)]

    for feature in bedfile:
        # Percent
        row_number += 1
        self.graph_chunk.percent = row_number * 100 / total_lines

        # Entity
        if feature.name != '.':
            entity_label = feature.name
        else:
            entity_label = "{}_{}".format(self.entity_name, str(count))
            count += 1
        entity = self.namespace_entity[self.format_uri(entity_label)]

        self.graph_chunk.add((entity, rdflib.RDF.type, entity_type))
        self.graph_chunk.add(
            (entity, rdflib.RDFS.label, rdflib.Literal(entity_label)))

        # Faldo
        faldo_reference = None
        faldo_strand = None
        faldo_start = None
        faldo_end = None

        # Chromosome
        self.category_values["reference"] = {feature.chrom, }
        relation = self.namespace_data[self.format_uri("reference")]
        attribute = self.namespace_data[self.format_uri(feature.chrom)]
        faldo_reference = attribute
        self.faldo_abstraction["reference"] = relation
        self.graph_chunk.add((entity, relation, attribute))

        if "reference" not in attribute_list:
            attribute_list.append("reference")
            self.attribute_abstraction.append({
                "uri": self.namespace_data[self.format_uri("reference")],
                "label": rdflib.Literal("reference"),
                "type": [
                    self.namespace_internal[self.format_uri(
                        "AskomicsCategory")],
                    rdflib.OWL.ObjectProperty
                ],
                "domain": entity_type,
                "range": self.namespace_data[self.format_uri(
                    "{}Category".format("reference"))],
                "values": [feature.chrom]
            })
        else:
            # add the value
            for at in self.attribute_abstraction:
                if at["uri"] == self.namespace_data[self.format_uri(
                        "reference")] \
                        and at["domain"] == entity_type \
                        and feature.chrom not in at["values"]:
                    at["values"].append(feature.chrom)

        # Start
        relation = self.namespace_data[self.format_uri("start")]
        attribute = rdflib.Literal(
            self.convert_type(feature.start + 1))  # +1 because bed is 0 based
        faldo_start = attribute
        self.faldo_abstraction["start"] = relation
        self.graph_chunk.add((entity, relation, attribute))

        if "start" not in attribute_list:
            attribute_list.append("start")
            self.attribute_abstraction.append({
                "uri": self.namespace_data[self.format_uri("start")],
                "label": rdflib.Literal("start"),
                "type": [rdflib.OWL.DatatypeProperty],
                "domain": entity_type,
                "range": rdflib.XSD.decimal
            })

        # End
        relation = self.namespace_data[self.format_uri("end")]
        attribute = rdflib.Literal(self.convert_type(feature.end))
        faldo_end = attribute
        self.faldo_abstraction["end"] = relation
        self.graph_chunk.add((entity, relation, attribute))

        if "end" not in attribute_list:
            attribute_list.append("end")
            self.attribute_abstraction.append({
                "uri": self.namespace_data[self.format_uri("end")],
                "label": rdflib.Literal("end"),
                "type": [rdflib.OWL.DatatypeProperty],
                "domain": entity_type,
                "range": rdflib.XSD.decimal
            })

        # Strand
        strand = False
        strand_type = None
        if feature.strand == "+":
            self.category_values["strand"] = {"+", }
            relation = self.namespace_data[self.format_uri("strand")]
            attribute = self.namespace_data[self.format_uri("+")]
            faldo_strand = self.get_faldo_strand("+")
            self.faldo_abstraction["strand"] = relation
            self.graph_chunk.add((entity, relation, attribute))
            strand = True
            strand_type = "+"
        elif feature.strand == "-":
            self.category_values["strand"] = {"-", }
            relation = self.namespace_data[self.format_uri("strand")]
            attribute = self.namespace_data[self.format_uri("-")]
            faldo_strand = self.get_faldo_strand("-")
self.faldo_abstraction["strand"] = relation self.graph_chunk.add((entity, relation, attribute)) strand = True strand_type = "-" else: self.category_values["strand"] = { ".", } relation = self.namespace_data[self.format_uri("strand")] attribute = self.namespace_data[self.format_uri(".")] faldo_strand = self.get_faldo_strand(".") self.faldo_abstraction["strand"] = relation self.graph_chunk.add((entity, relation, attribute)) strand = True strand_type = "." if strand: if ("strand", strand_type) not in attribute_list: attribute_list.append(("strand", strand_type)) self.attribute_abstraction.append({ "uri": self.namespace_data[self.format_uri("strand")], "label": rdflib.Literal("strand"), "type": [ self.namespace_internal[self.format_uri( "AskomicsCategory")], rdflib.OWL.ObjectProperty ], "domain": entity_type, "range": self.namespace_data[self.format_uri( "{}Category".format("strand"))], "values": [strand_type] }) # Score if feature.score != '.': relation = self.namespace_data[self.format_uri("score")] attribute = rdflib.Literal(self.convert_type(feature.score)) self.graph_chunk.add((entity, relation, attribute)) if "score" not in attribute_list: attribute_list.append("score") self.attribute_abstraction.append({ "uri": self.namespace_data[self.format_uri("score")], "label": rdflib.Literal("score"), "type": [rdflib.OWL.DatatypeProperty], "domain": entity_type, "range": rdflib.XSD.decimal }) location = BNode() begin = BNode() end = BNode() self.graph_chunk.add((entity, self.faldo.location, location)) self.graph_chunk.add( (location, rdflib.RDF.type, self.faldo.region)) self.graph_chunk.add((location, self.faldo.begin, begin)) self.graph_chunk.add((location, self.faldo.end, end)) self.graph_chunk.add( (begin, rdflib.RDF.type, self.faldo.ExactPosition)) self.graph_chunk.add((begin, self.faldo.position, faldo_start)) self.graph_chunk.add( (end, rdflib.RDF.type, self.faldo.ExactPosition)) self.graph_chunk.add((end, self.faldo.position, faldo_end)) self.graph_chunk.add( (begin, self.faldo.reference, faldo_reference)) self.graph_chunk.add((end, self.faldo.reference, faldo_reference)) if faldo_strand: self.graph_chunk.add((begin, rdflib.RDF.type, faldo_strand)) self.graph_chunk.add((end, rdflib.RDF.type, faldo_strand)) yield
def getListOfBlackZones(chrom):
    blackList = BedTool('../wgEncodeDacMapabilityConsensusExcludable.bed')
    blackListChrom = blackList.filter(lambda b: b.chrom == chrom)
    return [(i.start, i.end) for i in blackListChrom]
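# A minimal usage sketch; chromosome naming follows whatever convention the
# blacklist BED file uses ('chr1' for the ENCODE hg19 blacklist).
black_zones = getListOfBlackZones('chr1')
print(len(black_zones), 'excluded intervals on chr1')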
    '''
    return g.filter(featuretype_filter, featuretype).saveas().fn


def get_attribute(g, attribute):
    genes = []
    for feature in g:
        try:
            genes.append(feature[attribute])
        except AttributeError:
            genes.append('')
    return genes


print('loading bedfile and extracting exons...')
g = BedTool(tx_info)
exons = BedTool(subset_featuretypes(g, 'exon'))

print('validating and sorting records...')
exons = exons.remove_invalid().sort()

print('extracting attributes...')
exon_pd = pd.DataFrame([(e['chrom'], e['start'], e['end'], e['strand'])
                        for e in exons],
                       columns=['chrom', 'exonStarts', 'exonEnds', 'strand'])
exon_pd['exonStarts'] = exon_pd['exonStarts'].map(str)
exon_pd['exonEnds'] = exon_pd['exonEnds'].map(str)
exon_pd['transcript'] = get_attribute(exons, 'transcript_id')
exon_pd['gene'] = get_attribute(exons, 'gene_name')
exon_pd = exon_pd[exon_pd.gene != '']
def _make_target_bed(self, bed_fpath, work_dir, output_dir, is_debug,
                     padding=None, fai_fpath=None, genome=None,
                     reannotate=False):
    clean_target_bed_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
    if not can_reuse(clean_target_bed_fpath, bed_fpath):
        debug()
        debug('Cleaning target BED file...')
        bed = BedTool(bed_fpath)
        if bed.field_count() > 4:
            bed = bed.cut(range(4))
        bed = bed \
            .filter(lambda x: x.chrom and
                    not any(x.chrom.startswith(e)
                            for e in ['#', ' ', 'track', 'browser'])) \
            .remove_invalid()
        with file_transaction(work_dir, clean_target_bed_fpath) as tx:
            bed.saveas(tx)
        debug('Saved to ' + clean_target_bed_fpath)
        verify_file(clean_target_bed_fpath, is_critical=True)

    sort_target_bed_fpath = intermediate_fname(work_dir,
                                               clean_target_bed_fpath,
                                               'sorted')
    if not can_reuse(sort_target_bed_fpath, clean_target_bed_fpath):
        debug()
        debug('Sorting target BED file...')
        sort_target_bed_fpath = sort_bed(
            clean_target_bed_fpath,
            output_bed_fpath=sort_target_bed_fpath,
            fai_fpath=fai_fpath)
        debug('Saved to ' + sort_target_bed_fpath)
        verify_file(sort_target_bed_fpath, is_critical=True)

    if genome in ebl.SUPPORTED_GENOMES:
        ann_target_bed_fpath = intermediate_fname(work_dir,
                                                  sort_target_bed_fpath,
                                                  'ann_plus_features')
        if not can_reuse(ann_target_bed_fpath, sort_target_bed_fpath):
            debug()
            if BedTool(sort_target_bed_fpath).field_count() == 3 \
                    or reannotate:
                debug('Annotating target BED file and collecting overlapping '
                      'genome features')
                overlap_with_features(sort_target_bed_fpath,
                                      ann_target_bed_fpath,
                                      work_dir=work_dir,
                                      genome=genome,
                                      extended=True,
                                      reannotate=reannotate,
                                      only_canonical=True)
            else:
                debug('Overlapping with genomic features:')
                overlap_with_features(sort_target_bed_fpath,
                                      ann_target_bed_fpath,
                                      work_dir=work_dir,
                                      genome=genome,
                                      extended=True,
                                      only_canonical=True)
            debug('Saved to ' + ann_target_bed_fpath)
            verify_file(ann_target_bed_fpath, is_critical=True)
    else:
        ann_target_bed_fpath = sort_target_bed_fpath

    final_clean_target_bed_fpath = intermediate_fname(work_dir,
                                                      ann_target_bed_fpath,
                                                      'clean')
    if not can_reuse(final_clean_target_bed_fpath, ann_target_bed_fpath):
        bed = BedTool(ann_target_bed_fpath).remove_invalid()
        with file_transaction(work_dir, final_clean_target_bed_fpath) as tx:
            bed.saveas(tx)
    verify_file(final_clean_target_bed_fpath, is_critical=True)

    self.bed_fpath = final_clean_target_bed_fpath
    self.bed = BedTool(self.bed_fpath)

    self.capture_bed_fpath = add_suffix(
        join(output_dir, basename(bed_fpath)), 'clean_sorted_ann')
    if not can_reuse(self.capture_bed_fpath, self.bed_fpath):
        with file_transaction(work_dir, self.capture_bed_fpath) as tx:
            self.get_capture_bed().saveas(tx)

    gene_key_set, gene_key_list = get_genes_from_bed(bed_fpath)
    self.gene_keys_set = gene_key_set
    self.gene_keys_list = gene_key_list
    self.regions_num = self.get_capture_bed().count()

    self._make_qualimap_bed(work_dir)
    if padding:
        self._make_padded_bed(work_dir, fai_fpath, padding)
def build_capsules(capsule_choice, overlap, bin_len, ma, include_last,
                   min_capsule_len, custom_capsule_file, gsea_superset,
                   tissue, gene_context, use_set, number_sets,
                   limited_capsule_names_file, cpg_arr=None, sort_caps=True):
    capsules, finalcpgs, capsule_names = [], [], []
    annotation_file = annotations450

    if 'genomic_binned' in capsule_choice:
        overlap = int(overlap * bin_len)
        genome_file = hg19
        gname = os.path.basename(genome_file).split('.')[0]
        overlap_file = '{}.{}.overlap.{}.bed'.format(gname, bin_len, overlap)
        if not os.path.exists(overlap_file):
            BedTool(genome_file).makewindows(
                g=genome_file, w=bin_len,
                s=bin_len - overlap).saveas(overlap_file)
        print(annotation_file, overlap_file)
        final_modules, modulecpgs, module_names = get_binned_modules(
            ma=ma, a=annotation_file, b=overlap_file,
            include_last=include_last, min_capsule_len=min_capsule_len)
        print('LEN_MODULES', len(final_modules))
        capsules.extend(final_modules)
        finalcpgs.extend(modulecpgs)
        capsule_names.extend(module_names)

    if 'custom_bed' in capsule_choice:
        final_modules, modulecpgs, module_names = get_binned_modules(
            ma=ma, a=annotation_file, b=custom_capsule_file,
            include_last=include_last, min_capsule_len=min_capsule_len)
        capsules.extend(final_modules)
        finalcpgs.extend(modulecpgs)
        capsule_names.extend(module_names)

    if 'custom_set' in capsule_choice:
        final_modules, modulecpgs, module_names = return_custom_capsules(
            ma=ma, capsule_file=custom_capsule_file, capsule_sets=['all'],
            min_capsule_len=min_capsule_len, include_last=include_last)
        capsules.extend(final_modules)
        finalcpgs.extend(modulecpgs)
        capsule_names.extend(module_names)

    if np.intersect1d(CAPSULES, capsule_choice).tolist() or isinstance(
            cpg_arr, pd.DataFrame):
        final_modules, modulecpgs, module_names = return_final_capsules(
            methyl_array=ma,
            capsule_choice=capsule_choice
            if not isinstance(cpg_arr, pd.DataFrame) else None,
            min_capsule_len=min_capsule_len,
            collection=gsea_superset,
            tissue=tissue,
            n_top_sets=number_sets,
            limited_capsule_names_file=limited_capsule_names_file,
            gsea_superset=gsea_superset,
            cpg_arr=cpg_arr,
            sort_caps=sort_caps)
        capsules.extend(final_modules)
        finalcpgs.extend(modulecpgs)
        capsule_names.extend(module_names)

    # if 0:
    #     selected_sets = np.intersect1d(
    #         ['UCSC_RefGene_Name', 'UCSC_RefGene_Accession',
    #          'UCSC_RefGene_Group', 'UCSC_CpG_Islands_Name',
    #          'Relation_to_UCSC_CpG_Island', 'Phantom', 'DMR', 'Enhancer',
    #          'HMM_Island', 'Regulatory_Feature_Name',
    #          'Regulatory_Feature_Group', 'DHS'], capsule_choice).tolist()
    #     if selected_sets:
    #         final_modules, modulecpgs, module_names = return_custom_capsules(
    #             ma=ma, capsule_file=selected_caps_file,
    #             capsule_sets=selected_sets, min_capsule_len=min_capsule_len,
    #             include_last=include_last,
    #             limited_capsule_names_file=limited_capsule_names_file)
    #         capsules.extend(final_modules)
    #         finalcpgs.extend(modulecpgs)
    #         capsule_names.extend(module_names)
    #
    #     gsea_bool = (("GSEA" in capsule_choice and gsea_superset)
    #                  or 'all_gene_sets' in capsule_choice)
    #
    #     if gsea_bool:
    #         final_modules, modulecpgs, module_names = return_gsea_capsules(
    #             ma=ma, tissue=tissue, context_on=gene_context,
    #             use_set=use_set, gsea_superset=gsea_superset,
    #             n_top_sets=number_sets, min_capsule_len=min_capsule_len,
    #             all_genes=('all_gene_sets' in capsule_choice),
    #             limited_capsule_names_file=limited_capsule_names_file)
    #         capsules.extend(final_modules)
    #         finalcpgs.extend(modulecpgs)
    #         capsule_names.extend(module_names)

    final_modules = capsules
    modulecpgs = list(set(finalcpgs))
    module_names = capsule_names

    # if limited_capsule_names_file and not (selected_sets or gsea_bool):
    #     with open(limited_capsule_names_file) as f:
    #         limited_capsule_names = f.read().replace('\n', ' ').split()
    #     capsules = []
    #     capsule_names = []
    #     for i in range(len(module_names)):
    #         if module_names[i] in limited_capsule_names:
    #             capsule_names.append(module_names[i])
    #             capsules.append(final_modules[i])
    #
    #     modulecpgs = list(set(list(reduce(lambda x, y: x + y, capsules))))
    #     final_modules = capsules
    #     module_names = capsule_names

    print("{} modules, {} cpgs, {} module names, {} missing".format(
        len(final_modules), len(modulecpgs), len(module_names),
        ma.beta.isnull().sum().sum()))

    return final_modules, modulecpgs, module_names
    ]))
    return [
        len(het_index_unique),
        len(ch_index_unique),
        len(hom_index_unique),
        total_ac
    ]


# Make list of all SNPs across all genes present in snpfile
allsnplist = makesnplist(options.snpfilename)

# Make a hashtable with keys as each SNP, storing a list of indices of
# carriers for that SNP
count_table = {}

# Open vcf file
vcffile = BedTool(options.vcffilename)
if options.bedfilename is not None:
    bed = BedTool(options.bedfilename)
    vcffile_temp = vcffile.intersect(bed)
else:
    if chrformat == "chr":
        dummy_bed = BedTool('chr1000 100000000 100000001', from_string=True)
    else:
        dummy_bed = BedTool('1000 100000000 100000001', from_string=True)
    vcffile_temp = vcffile.subtract(dummy_bed)

for line_vcf1 in open(vcffile_temp.fn):
    line_vcf = line_vcf1.rstrip().split('\t')
    if line_vcf[0][0] != "#" and ("," not in line_vcf[4]):
        if not (options.passfilter and line_vcf[6] != "PASS"):
            if options.snpformat == "VCFID":
def predict_variant_effect(self,  # pylint: disable=too-many-locals
                           bioseq, variants, conditions, output_folder,
                           condition_filter=None, batch_size=None):
    """Evaluates the performance.

    Parameters
    ----------
    bioseq : :code:`Bioseq`
        Input sequence containing the reference genome.
    variants : str
        File name of a VCF file containing the variants under study.
    conditions : list(str)
        Condition labels for each output prediction.
    output_folder : str
        The method produces an hdf5 and a bed file as output.
        The bed-file contains the variant positions while the hdf5 file
        contains the reference and alternative variant scores for each
        output feature.
    condition_filter : str or None
        Regular expression filter on which conditions should be evaluated.
        If None, all output conditions will be returned.
    batch_size : int, None.
        Batch size. If None, a batch_size of 128 is used.

    Returns
    -------
    tuple:
        Tuple containing the output filenames: an hdf5 and a bed file.

    Examples
    --------

    .. code-block:: python

      # Evaluate all variants and all conditions (outputs)
      model.predict_variant_effect(DATA, VARIANTS, CONDITIONS,
                                   'vcfoutput')

      # Evaluate all variants and a subset of conditions
      # (Ctcf output labels)
      model.predict_variant_effect(DATA, LABELS, CONDITIONS,
                                   'vcfoutput_subset',
                                   condition_filter='Ctcf')

    """
    if batch_size is None:
        batch_size = 128

    if len(self.kerasmodel.inputs) > 1:
        raise ValueError('Only one input layer supported for this operation.')
    binsize = self.kerasmodel.layers[0].input_shape[1] + \
        bioseq.garray.order - 1

    if not bioseq.garray._full_genome_stored:
        raise ValueError('Incompatible Bioseq: '
                         'Bioseq must be loaded with store_whole_genome=True.')

    # the network might output arbitrarily many outputs.
    # With the filter option it is possible to
    # restrict the analysis to certain features.
    if condition_filter is None:
        conditions = [(idx, cond) for idx, cond in enumerate(conditions)]
    else:
        conditions = [(idx, cond) for idx, cond in enumerate(conditions)
                      if hasattr(re.search(condition_filter, cond), 'start')]

    icond = [el[0] for el in conditions]

    local_model = self.kerasmodel

    if len(conditions) != self.kerasmodel.output_shape[-1]:
        raise ValueError("The number of conditions does not match with the "
                         "number of network output units.")

    # get number of variants
    variantsstream = VariantStreamer(bioseq, variants, binsize, batch_size)
    nvariants = variantsstream.get_variant_count()

    h5file = h5py.File(os.path.join(output_folder, 'scores.hdf5'), 'w')

    h5file.create_dataset('labels', (len(conditions),),
                          dtype=h5py.special_dtype(vlen=str),
                          data=np.array([c[-1] for c in conditions],
                                        dtype=h5py.special_dtype(vlen=str)))

    refscore = h5file.create_dataset('refscore',
                                     (nvariants, len(conditions)),
                                     dtype='float16')
    altscore = h5file.create_dataset('altscore',
                                     (nvariants, len(conditions)),
                                     dtype='float16')
    diffscore = h5file.create_dataset('diffscore',
                                      (nvariants, len(conditions)),
                                      dtype='float16')
    logodds = h5file.create_dataset('logoddsscore',
                                    (nvariants, len(conditions)),
                                    dtype='float16')

    bar = Bar('Parsing {}: '.format(variants),
              max=int(np.ceil(nvariants / float(batch_size))))

    chromlist = []
    poslist = []
    vnamelist = []
    reflist = []
    altlist = []
    ibatch = 0

    # read variants file
    for names, chroms, poss, ra, aa, reference, alternative in \
            variantsstream.flow():
        bar.next()
        if reference.shape[0] <= 0:
            # reached the end of the file
            break
        ref_score = local_model.predict_on_batch(reference)
        alt_score = local_model.predict_on_batch(alternative)

        chromlist += chroms
        poslist += poss
        vnamelist += names
        reflist += ra
        altlist += aa

        refscore[ibatch:(ibatch + ref_score.shape[0])] = \
            ref_score[:, icond].astype('float16')
        altscore[ibatch:(ibatch + ref_score.shape[0])] = \
            alt_score[:, icond].astype('float16')
        diffscore[ibatch:(ibatch + ref_score.shape[0])] = \
            alt_score[:, icond].astype('float16') - \
            ref_score[:, icond].astype('float16')
        logodds[ibatch:(ibatch + ref_score.shape[0])] = \
            np.log(alt_score[:, icond].astype('float16') /
                   ref_score[:, icond].astype('float16') + 1e-7)

        ibatch += ref_score.shape[0]

    # form large string
    BedTool('\n'.join(['{} {} {} {}_{}>{}'.format(chrom, start, start + 1,
                                                  name, ref, alt)
                       for chrom, start, name, ref, alt in
                       zip(chromlist, poslist, vnamelist,
                           reflist, altlist)]),
            from_string=True).saveas(os.path.join(output_folder,
                                                  'snps.bed.gz'))

    bar.finish()
    h5file.close()
    return (os.path.join(output_folder, 'scores.hdf5'),
            os.path.join(output_folder, 'snps.bed.gz'))
        tempd[a[0]] = float(a[3])
    return tempd


names2diff = defaultdict(list)
for path in args.diff:
    for k, v in readdiff(path).items():
        names2diff[k].append(v)
names2diff = dict([x for x in names2diff.items() if len(x[1]) == ldiff])

### READ peak intensities
names2intensities = {}
for interval in BedTool(args.path):
    distance = int(interval.attrs['start_gene_distance'])
    if distance < args.distance:
        genename = interval.attrs['start_gene']
        intensity = [
            float(x) if x != 'None' else 0
            for x in interval.attrs['maxcov'].split(",")
        ]
        lint = len(intensity)
        names2intensities[genename] = intensity

### CORRELATE peak intensities to differential gene expression
diff2timepoints = {0: '0h', 1: '0.5h', 2: '4h'}
int2timepoints = {
    0: 'pre',
    1: '0h',
print('Current working directory is:' + os.getcwd())
print('\n')

# generate bed file names
bed_names = [f.replace('bam', 'bed') for f in datafiles]
size_selected_small = [f.replace('bam', 'small.bed') for f in datafiles]
size_selected_med = [f.replace('bam', 'med.bed') for f in datafiles]
size_selected_big = [f.replace('bam', 'big.bed') for f in datafiles]

# generate file names for length analysis
lengths_names = [f.replace('bam', 'lengths') for f in datafiles]

# generate bed files with bam_to_bed tool (makes bed12 format)
for i in range(len(datafiles)):
    temp_bed = BedTool(datafiles[i]).bam_to_bed(bedpe=True).to_dataframe()
    # need to strip out start and end position of whole insert
    # (bed12 is both reads); the default column names actually represent
    # <chrom>, <start of insert>, <end of insert>
    temp_bed_stripped = temp_bed.iloc[:, [0, 1, 5]].sort_values(
        by=['chrom', 'start', 'strand'])
    # calculate insert size as column 4 and save file with bed_name
    temp_bed_stripped['length'] = (temp_bed_stripped['strand'] -
                                   temp_bed_stripped['start'])
    temp_bed_stripped.to_csv(bed_names[i], sep="\t", header=False,
                             index=False)
    # analyze lengths of inserts
    temp_lengths = temp_bed_stripped.groupby(by=['length'])['length'].count()
def test_dna_dims_order_1_from_subset_dataframe(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    order = 1

    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_merged = os.path.join(data_path, 'sample.gtf')
    refgenome = os.path.join(data_path, 'sample_genome.fa')

    roi = pandas.read_csv(bed_merged,
                          sep='\t', header=None,
                          usecols=[0, 2, 3, 4, 5, 6], skiprows=2,
                          names=['chrom', 'name', 'start', 'end',
                                 'score', 'strand'])
    roi.start -= 1
    print(roi)

    data = Bioseq.create_from_refgenome('train',
                                        refgenome=refgenome,
                                        roi=roi,
                                        storage='ndarray',
                                        store_whole_genome=True,
                                        order=order)

    np.testing.assert_equal(data[0], data[data.gindexer[0]])
    assert len(data.garray.handle) == 2

    # for order 1
    assert len(data) == 2
    assert data.shape == (2, 10000, 1, 4)
    assert data[:].sum() == 20000

    roi = BedTool(bed_merged)
    data = Bioseq.create_from_refgenome('train',
                                        refgenome=refgenome,
                                        roi=roi,
                                        storage='ndarray',
                                        store_whole_genome=True,
                                        order=order)

    np.testing.assert_equal(data[0], data[data.gindexer[0]])
    assert len(data.garray.handle) == 2

    # for order 1
    assert len(data) == 2
    assert data.shape == (2, 10000, 1, 4)
    assert data[:].sum() == 20000

    roi = [iv for iv in BedTool(bed_merged)]
    data = Bioseq.create_from_refgenome('train',
                                        refgenome=refgenome,
                                        roi=roi,
                                        storage='ndarray',
                                        store_whole_genome=True,
                                        order=order)

    np.testing.assert_equal(data[0], data[data.gindexer[0]])
    assert len(data.garray.handle) == 2

    # for order 1
    assert len(data) == 2
    assert data.shape == (2, 10000, 1, 4)
    assert data[:].sum() == 20000
def main(ME_centric, bed12, U2_GTAG_5_file, U2_GTAG_3_file, phylop, ME_len,
         ME_DB=False):
    n = 100
    min_intron_length = 80

    if phylop != "NA":
        phylop_bw = pyBigWig.open(phylop)

    U2_GTAG_5 = PWM_to_dict(U2_GTAG_5_file)
    U2_GTAG_3 = PWM_to_dict(U2_GTAG_3_file)

    U2_GTAG_5_max_score = 0
    U2_GTAG_3_max_score = 0

    for index in range(13):
        U2_GTAG_5_max_score += max(U2_GTAG_5['A'][index],
                                   U2_GTAG_5['C'][index],
                                   U2_GTAG_5['T'][index],
                                   U2_GTAG_5['G'][index])

    for index in range(17):
        U2_GTAG_3_max_score += max(U2_GTAG_3['A'][index],
                                   U2_GTAG_3['C'][index],
                                   U2_GTAG_3['T'][index],
                                   U2_GTAG_3['G'][index])

    TOTAL_U2_max_score = U2_GTAG_5_max_score + U2_GTAG_3_max_score

    found_ME = set([])
    ME_chroms = set([])

    for row in csv.reader(open(ME_centric), delimiter='\t'):
        (ME, transcript, sum_total_coverage, total_SJs, total_coverages,
         len_micro_exon_seq_found, micro_exon_seq_found,
         total_number_of_micro_exons_matches, U2_scores, mean_conservations,
         P_MEs, total_ME) = row
        ME_strand, ME_start, ME_end = ME.split("_")[-3:]
        ME_chrom = "_".join(ME.split("_")[:-3])
        found_ME.add(ME)
        ME_chroms.add(ME_chrom)

    introns = set([])
    # a microexon can be derived from more than one transcript. The idea is
    # to collapse the transcripts
    non_detected_ME = defaultdict(list)

    SJ_start_seqs = {}
    SJ_end_seqs = {}

    for row in csv.reader(open(bed12), delimiter='\t'):
        blocksizes = list(map(int, row[10].strip(",").split(",")))
        qstarts = list(map(int, row[11].strip(",").split(",")))
        start = int(row[1])
        end = int(row[2])
        strand = row[5]
        bn = int(row[9])
        chrom = row[0]
        transcript = row[3]

        f_seq = ""
        r_seq = ""

        if chrom in Genome:
            for q1, q2, b1 in zip(qstarts, qstarts[1:], blocksizes):
                istart = start + q1 + b1
                iend = start + q2
                SJ_ID = transcript + str(istart)
                intron = " ".join(
                    [chrom, str(istart), str(iend), "SJ", "0", strand])
                # if chrom in ME_chroms:
                introns.add(intron)

                # Indexing tag library
                estart = start + q1
                eend = start + q1 + b1
                f_seq += str(Genome[chrom][estart:eend])

                if (chrom, eend) in SJ_start_seqs:
                    # compare lengths, not the string against an int, when
                    # deciding whether to keep the longer flanking sequence
                    if len(f_seq[-100:]) > len(SJ_start_seqs[(chrom, eend)]):
                        SJ_start_seqs[(chrom, eend)] = f_seq[-100:]
                else:
                    SJ_start_seqs[(chrom, eend)] = f_seq[-100:]

            for q1, b1 in zip(qstarts[::-1], blocksizes[::-1]):
                estart = start + q1
                eend = start + q1 + b1
                r_seq = str(Genome[chrom][estart:eend]) + r_seq

                if (chrom, estart) in SJ_end_seqs:
                    if len(r_seq[:100]) > len(SJ_end_seqs[(chrom, estart)]):
                        SJ_end_seqs[(chrom, estart)] = r_seq[:100]
                else:
                    SJ_end_seqs[(chrom, estart)] = r_seq[:100]

            for q1, q2, q3, b1, b2, b3 in zip(qstarts, qstarts[1:],
                                              qstarts[2:], blocksizes,
                                              blocksizes[1:],
                                              blocksizes[2:]):
                estart = start + q2
                eend = start + q2 + b2
                elength = eend - estart
                exon = "_".join([chrom, strand, str(estart), str(eend)])

                SJ_start = start + q1 + b1
                SJ_end = start + q3

                ME_intron = " ".join(
                    [chrom, str(SJ_start), str(SJ_end), "SJ", "0", strand])

                dn = Genome[chrom][(estart - 2):estart] + \
                    Genome[chrom][eend:(eend + 2)]

                if strand == "-":
                    dn = dn.reverse_complement()
                dn = str(dn).upper()

                if elength <= ME_len and dn == "AGGT" \
                        and exon not in found_ME:
                    # if chrom in ME_chroms:
                    introns.add(ME_intron)
                    non_detected_ME[(chrom, estart, eend, strand,
                                     elength)].append(transcript)

    ##### Microexon database ######

    if ME_DB != False:
        for row in csv.reader(open(ME_DB), delimiter='\t'):
            if len(row) == 12:
                blocksizes = list(map(int, row[10].strip(",").split(",")))
                qstarts = list(map(int, row[11].strip(",").split(",")))
                start = int(row[1])
                end = int(row[2])
                strand = row[5]
                bn = int(row[9])
                chrom = row[0]

                if chrom in Genome:
                    for q1, q2, q3, b1, b2, b3 in zip(
                            qstarts, qstarts[1:], qstarts[2:],
                            blocksizes, blocksizes[1:], blocksizes[2:]):
                        estart = start + q2
                        eend = start + q2 + b2
                        elength = eend - estart
                        exon = "_".join(
                            [chrom, strand, str(estart), str(eend)])
                        transcript = row[3]

                        SJ_start = start + q1 + b1
                        SJ_end = start + q3

                        ME_intron = " ".join([
                            chrom, str(SJ_start), str(SJ_end), "SJ", "0",
                            strand
                        ])

                        dn = Genome[chrom][(estart - 2):estart] + \
                            Genome[chrom][eend:(eend + 2)]

                        if strand == "-":
                            dn = dn.reverse_complement()
                        dn = str(dn).upper()

                        if elength <= ME_len and dn == "AGGT" \
                                and exon not in found_ME:
                            # introns.add(ME_intron)
                            non_detected_ME[(chrom, estart, eend, strand,
                                             elength)].append(transcript)

    introns_str = "\n".join(list(introns))
    intron_bed = BedTool(introns_str, from_string=True)
    intron_bed = intron_bed.sort()

    TOTAL_SJ_starts = set([])
    TOTAL_SJ_ends = set([])

    with open('data/ME_canonical_SJ_tags.DB.fa', 'w') as out_tags, \
            open('data/DB.ME_centric', 'w') as out_ME_centric:

        for i in non_detected_ME.items():
            ME_info, transcripts = i
            chrom, estart, eend, strand, elength = ME_info
            transcript = transcripts[0]

            ME = "_".join([chrom, strand, str(estart), str(eend)])

            # NOTE: dn and exon here still hold the last values assigned in
            # the loops above
            if elength <= ME_len and dn == "AGGT" and exon not in found_ME:
                if phylop == "NA":
                    mean_conservation = 0
                else:
                    try:
                        mean_conservation = phylop_bw.stats(
                            chrom, estart - 2, eend + 2, type="mean")[0]
                    except RuntimeError:
                        mean_conservation = 0
                    if mean_conservation == None:
                        mean_conservation = 0

                ME5 = str(Genome[chrom][estart - 14:estart + 3]).upper()
                ME3 = str(Genome[chrom][eend - 3:eend + 10]).upper()

                micro_exon_seq_found = str(
                    Genome[chrom][estart:eend]).upper()

                if strand == "-":
                    ME5 = str(Genome[chrom][eend - 3:eend + 14]
                              .reverse_complement()).upper()
                    ME3 = str(Genome[chrom][estart - 10:estart + 3]
                              .reverse_complement()).upper()
                    micro_exon_seq_found = str(
                        Genome[chrom][estart:eend]
                        .reverse_complement()).upper()

                U2_score = 0

                i = 0
                for N in ME5:
                    U2_score += U2_GTAG_3[N][i]
                    i += 1

                i = 0
                for N in ME3:
                    U2_score += U2_GTAG_5[N][i]
                    i += 1

                U2_score = percent(U2_score, TOTAL_U2_max_score)

                ME_bed = BedTool(" ".join(
                    [chrom, str(estart), str(eend - 1), "ME", "0", strand]),
                    from_string=True)

                SJs_bed = intron_bed.intersect(ME_bed,
                                               wa=True,
                                               s=True,
                                               F=1,
                                               nonamecheck=True)

                SJs = set([])
                SJ_starts = []
                SJ_ends = []

                if len(SJs_bed) != 0:
                    for sj in SJs_bed:
                        SJ_chrom, SJ_start, SJ_end, ID, score, SJ_strand = \
                            str(sj).strip("\n").split("\t")
                        SJ = SJ_chrom + ":" + SJ_start + SJ_strand + SJ_end
                        SJ_starts.append(int(SJ_start))
                        SJ_ends.append(int(SJ_end))
                        SJs.add(SJ)

                        TOTAL_SJ_starts.add((chrom, SJ_start))
                        TOTAL_SJ_ends.add((chrom, SJ_end))

                        ### TAG creation
                        UP_TAG = SJ_start_seqs[(SJ_chrom, int(SJ_start))]
                        DOWN_TAG = SJ_end_seqs[(SJ_chrom, int(SJ_end))]
                        ME_TAG = UP_TAG + Genome[chrom][estart:eend] + \
                            DOWN_TAG
                        tag_pos = "_".join(
                            map(str, [len(UP_TAG), micro_exon_seq_found,
                                      len(DOWN_TAG)]))

                        if strand == "-":
                            ME_TAG = ME_TAG.reverse_complement()
                            tag_pos = "_".join(
                                map(str, [len(UP_TAG), micro_exon_seq_found,
                                          len(DOWN_TAG)][::-1]))

                        ME_TAG = str(ME_TAG).upper()
                        ME_TAG_ID = chrom + ":" + "".join(
                            [str(estart), strand, str(eend)])

                        out_tags.write(
                            ">" + "|".join([SJ, transcript, tag_pos]) + "\n")
                        out_tags.write(ME_TAG + "\n")

                    total_SJs = ",".join(SJs)

                    min_intron_seq = str(
                        Genome[chrom][max(SJ_starts):min(SJ_ends)]).upper()

                    if strand == "-":
                        min_intron_seq = str(
                            Genome[chrom][max(SJ_starts):min(SJ_ends)]
                            .reverse_complement()).upper()

                    total_number_of_micro_exons_matches = \
                        min_intron_seq.count(
                            "AG" + micro_exon_seq_found + "GT")

                    P_ME = 1 - (
                        1 - (float(1) /
                             float(4 ** len(micro_exon_seq_found) + 4))
                    ) ** (len(min_intron_seq) -
                          (len(micro_exon_seq_found) + 4))

                    info = (ME, transcript, 0, total_SJs, 0, elength,
                            micro_exon_seq_found,
                            total_number_of_micro_exons_matches, U2_score,
                            mean_conservation, P_ME,
                            "|".join(map(str,
                                         [ME, U2_score, mean_conservation])))

                    out_ME_centric.write("\t".join(map(str, info)) + "\n")
def test_janggu_variant_streamer_order_1_revcomp(tmpdir):
    """Test the VariantStreamer with a reverse-strand annotation."""
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    data_path = pkg_resources.resource_filename('janggu', 'resources/')

    order = 1
    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')

    dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
                                       storage='ndarray',
                                       binsize=50,
                                       store_whole_genome=True,
                                       order=order)

    annot = BedTool([Interval('chr2', 110, 130, '-')])

    # even binsize
    vcf = VariantStreamer(dna, vcffile, binsize=10, batch_size=1)
    it_vcf = iter(vcf.flow())

    next(it_vcf)  # C to T
    # assert names[0] == 'refmismatch'
    # np.testing.assert_equal(reference, alternative)
    # np.testing.assert_equal(alternative[0, 4, 0, :], np.array([0, 1, 0, 0]))

    next(it_vcf)  # C to T
    # np.testing.assert_equal(reference[0, 4, 0, :], np.array([0, 1, 0, 0]))
    # np.testing.assert_equal(alternative[0, 4, 0, :], np.array([0, 0, 0, 1]))

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)  # T to C
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    # np.testing.assert_equal(reference[0, 4, 0, :], np.array([0, 0, 0, 1]))
    # np.testing.assert_equal(alternative[0, 4, 0, :], np.array([0, 1, 0, 0]))

    # even binsize, with reverse-strand annotation
    vcf = VariantStreamer(dna, vcffile, binsize=10, batch_size=1,
                          annotation=annot)
    it_vcf = iter(vcf.flow())
    next(it_vcf)  # C to T
    next(it_vcf)  # C to T
    names, chroms, poss, ra, aa, reference2, alternative2 = next(it_vcf)  # T to C
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    np.testing.assert_equal(reference, reference2[:, ::-1, :, ::-1])
    np.testing.assert_equal(alternative, alternative2[:, ::-1, :, ::-1])