def test_bed12tobed6_last(self):
    '''bed12tobed6(whichExon=last) should return correct BED6 records for know cases'''
    # Each known case pairs a BED12 record with the expected last exon.
    for bed12, expected in self.known_bed12tobed6_last:
        exons = bedparse.bedline(bed12).bed12tobed6(whichExon="last", appendExN=True)
        self.assertEqual(exons[0], bedparse.bedline(expected[0]))
def prom(args):
    """Print the promoter region of every record in the input BED file."""
    with args.bedfile as tsvfile:
        for raw in tsvfile:
            record = bedline(raw.split('\t'))
            # Strand-aware unless --unstranded was given.
            promoter = record.promoter(up=args.up, down=args.down,
                                       strand=(not args.unstranded))
            promoter.print()
        tsvfile.close()
def test_bed12tobed6(self):
    '''bed12tobed6() should return correct BED6 records for know cases'''
    bed12, expected_raw = self.known_bed12tobed6
    # Parse every expected BED6 record, then compare the whole list at once.
    expected = [bedparse.bedline(rec) for rec in expected_raw]
    self.assertEqual(bedparse.bedline(bed12).bed12tobed6(appendExN=True), expected)
def test_CDSs(self):
    '''cds() should return correct CDSs for know cases'''
    for bed12, cds_record in self.known_CDSs:
        self.assertEqual(bedparse.bedline(bed12).cds(), bedparse.bedline(cds_record))
    # CDS-only records must be skipped when ignoreCDSonly is set.
    for bed12, _ in self.known_CDS_ignoreCDSonly:
        self.assertIsNone(bedparse.bedline(bed12).cds(ignoreCDSonly=True))
def test_3pUTRs(self):
    '''threePutr should return correct UTR for know cases'''
    for bed12, utr_record in self.known_3pUTRs:
        got = bedparse.bedline(bed12).utr(which=3)
        # A None expectation means the record has no 3' UTR at all.
        expected = None if utr_record is None else bedparse.bedline(utr_record)
        self.assertEqual(got, expected)
def test_introns(self):
    '''introns should return correct introns for know cases'''
    for bed12, intron_record in self.known_introns:
        got = bedparse.bedline(bed12).introns()
        if intron_record is None:
            # Single-exon transcripts have no introns.
            self.assertIsNone(got)
        else:
            self.assertEqual(got, bedparse.bedline(intron_record))
def cds(args):
    """Print the CDS of every record in the input BED file, skipping records without one."""
    with args.bedfile as tsvfile:
        for raw in tsvfile:
            cds_record = bedline(raw.split('\t')).cds(ignoreCDSonly=args.ignoreCDSonly)
            if cds_record:
                cds_record.print()
        tsvfile.close()
def join(args):
    """Join a BED file with an annotation table on the BED name field (column 4).

    The annotation file is keyed on its column ``args.column`` (1-based); the
    remaining annotation columns are appended, tab-separated, to each matching
    BED record (one output line per match). Records without a match are
    printed as-is, or with ``args.empty`` appended when it is non-empty,
    unless ``args.noUnmatched`` is set, in which case they are skipped.

    Raises BEDexception if the annotation file cannot be opened or if any
    annotation line is too short to contain the key column.
    """
    col = args.column - 1
    annot = {}
    try:
        annotation = open(args.annotation)
    except OSError:
        # BUGFIX: a bare 'except:' here also swallowed KeyboardInterrupt and
        # unrelated errors; only I/O failures mean "file not valid".
        raise BEDexception("Annotation file not valid")
    # 'with' guarantees the file is closed even if a short line raises below.
    with annotation:
        for line in csv.reader(annotation, delimiter=args.separator):
            if len(line) <= col:
                raise BEDexception(
                    "Some lines don't contain the annotation column")
            # Key on the join column; store all the other fields.
            annot.setdefault(line[col], []).append(line[0:col] + line[col + 1:])
    with args.bedfile as tsvfile:
        for line in tsvfile:
            line = line.split('\t')
            if not args.noUnmatched or line[3] in annot:
                record = bedline(line)
                if record:
                    # get() avoids polluting annot with empty lists for
                    # unmatched names (the original used setdefault).
                    matches = annot.get(record.name, [])
                    if not matches:
                        if args.empty == '':
                            record.print()
                        else:
                            record.print(end='')
                            print('', args.empty, sep="\t")
                    else:
                        for extra_fields in matches:
                            record.print(end='')
                            print('', *extra_fields, sep='\t')
        tsvfile.close()
def test_tx2genome(self):
    '''tx2genome should return correct coordinates for known cases'''
    for tx_record, good_cases, bad_coords in self.known_tx2genome:
        transcript = bedparse.bedline(tx_record)
        # Valid transcript coordinates map to the expected genomic positions.
        for tx_coord, genome_coord in good_cases:
            self.assertEqual(transcript.tx2genome(tx_coord), genome_coord)
        # Out-of-range coordinates must raise.
        for bad in bad_coords:
            self.assertRaises(bedparse.BEDexception, transcript.tx2genome, bad)
def convertChr(args):
    """Translate the chromosome name of every record and print the converted lines."""
    with args.bedfile as tsvfile:
        for raw in tsvfile:
            converted = bedline(raw.split('\t')).translateChr(
                assembly=args.assembly,
                target=args.target,
                suppress=args.suppressMissing,
                ignore=args.allowMissing,
                patches=args.patches)
            # translateChr may return a falsy value for suppressed records.
            if converted:
                converted.print()
        tsvfile.close()
def bed12tobed6(args):
    """Convert each BED12 record of the input into BED6 exon records.

    With --keepIntrons (only valid together with --whichExon all) intron
    records are printed after the exons, with "_ExonN" name suffixes rewritten
    to "_IntronN".

    Raises BEDexception when --keepIntrons is combined with a --whichExon
    value other than "all".
    """
    # BUGFIX: the original used `args.whichExon is not "all"`. 'is' compares
    # object identity, not equality; string literals are not guaranteed to be
    # interned, so the check could silently pass for equal strings
    # (CPython emits a SyntaxWarning for this since 3.8).
    if args.whichExon != "all" and args.keepIntrons:
        raise BEDexception(
            "--keepIntrons is only allowed with --whichExon all")
    with args.bedfile as tsvfile:
        for line in tsvfile:
            tx = bedline(line.split('\t'))
            exon_list = tx.bed12tobed6(appendExN=args.appendExN,
                                       whichExon=args.whichExon)
            for el in exon_list:
                el.print()
            if args.keepIntrons:
                # Rename "_ExonN" suffixes to "_IntronN" on intron records.
                nameSub = re.compile("_Exon([0-9]+)")
                for el in tx.introns().bed12tobed6(appendExN=args.appendExN):
                    el.name = nameSub.sub(r"_Intron\1", el.name)
                    el.print()
        tsvfile.close()
def validateFormat(args):
    """Validate that the input is well-formed BED, printing each validated line.

    With --fixSeparators, leading/trailing whitespace is stripped and any run
    of whitespace is collapsed to a single tab before validation.

    Raises BEDexception (reporting the 1-based line number) on the first
    malformed record.
    """
    with args.bedfile as tsvfile:
        for n, line in enumerate(tsvfile):
            if args.fixSeparators:
                line = re.sub(r'^\s+', '', line)
                line = re.sub(r'\s+', '\t', line)
                line = re.sub(r'\s+$', '', line)
            try:
                validatedLine = bedline(line.split('\t'))
            except BEDexception as formatException:
                # Chain the original exception for easier debugging. The
                # tsvfile.close() that used to follow this raise was
                # unreachable dead code and has been removed.
                raise BEDexception(
                    "\nThis doesn't appear to be a valid BED file. There was an error at line %s:\n\t\"%s\""
                    % (n + 1, formatException)) from formatException
            else:
                validatedLine.print()
        tsvfile.close()
def __calculate_results(self, adjust=True):
    """Assemble per-position p-value results into a pandas DataFrame.

    Iterates over (ref_id, positions) pairs from self, keeping positions that
    carry a "txComp" entry. If the object was initialised with a BED
    annotation (self._bed_fn), genomic coordinates (chr, strand, genomicPos)
    are added for every position. When *adjust* is True, each p-value column
    is corrected for multiple testing (Benjamini-Hochberg).

    Raises NanocomporeError if the BED file cannot be opened or does not
    cover every reference in self.ref_id_list.
    """
    # Collect all pvalue results in a dataframe
    l = []
    for ref_id, ref_pos_list in self:
        for pos, pos_dict in enumerate(ref_pos_list):
            if "txComp" in pos_dict:
                row_dict = OrderedDict()
                row_dict["ref_id"] = ref_id
                row_dict["pos"] = pos
                row_dict["ref_kmer"] = pos_dict["ref_kmer"]
                for test in self._metadata["pvalue_tests"]:
                    if test in pos_dict["txComp"]:
                        pval = pos_dict["txComp"][test]
                        row_dict[test] = pval
                l.append(row_dict)
    df = pd.DataFrame(l)
    if self._bed_fn:
        bed_annot = {}
        try:
            with open(self._bed_fn) as tsvfile:
                for line in tsvfile:
                    record_name = line.split('\t')[3]
                    if record_name in self.ref_id_list:
                        bed_annot[record_name] = bedline(line.split('\t'))
        except OSError:
            # BUGFIX: the original bare 'except:' also reported BED parsing
            # errors (and even KeyboardInterrupt) as "Can't open BED file";
            # only I/O failures should produce that message.
            raise NanocomporeError("Can't open BED file")
        if len(bed_annot) != len(self.ref_id_list):
            raise NanocomporeError("Some references are missing from the BED file provided")
        df['genomicPos'] = df.apply(lambda row: bed_annot[row['ref_id']].tx2genome(coord=row['pos'], stranded=True), axis=1)
        # This is very inefficient. We should get chr and strand only once
        # per transcript, ideally when writing the BED file
        df['chr'] = df.apply(lambda row: bed_annot[row['ref_id']].chr, axis=1)
        df['strand'] = df.apply(lambda row: bed_annot[row['ref_id']].strand, axis=1)
        df = df[['ref_id', 'pos', 'chr', 'strand', 'genomicPos', 'ref_kmer'] + self._metadata["pvalue_tests"]]
    else:
        df = df[['ref_id', 'pos', 'ref_kmer'] + self._metadata["pvalue_tests"]]
    if adjust:
        for col in self._metadata["pvalue_tests"]:
            df[col] = self.__multipletests_filter_nan(df[col], method="fdr_bh")
    return df
def gtf2bed(gtf, extra=[''], filterKey="transcript_biotype", filterType=[''], transcript_feature_name="transcript"):
    """Convert a GTF stream to BED12, printing one line per transcript.

    gtf -- an open, iterable GTF file object (closed before output is printed).
    extra -- list of GTF attribute names to append as extra output columns;
             the default [''] sentinel means "none". (Note: mutable defaults
             are kept for backward compatibility; they are never mutated.)
    filterKey/filterType -- keep only transcripts whose filterKey attribute
             is in filterType; [''] means "no filtering".
    transcript_feature_name -- feature type (column 3) marking transcript lines.

    Raises BEDexception for unrecognized strands or exons/CDS features whose
    strand disagrees with their parent transcript.
    """
    transcripts = dict()   # txName -> growing BED12 field list
    exons = dict()         # txName -> list of [start, length] pairs
    cds = dict()           # txName -> [cdsStart, cdsEnd] (1-based GTF coords)
    extrainfo = dict()     # txName -> {attribute: value}
    gtfReader = csv.reader((row for row in gtf if not row.startswith('#')), delimiter="\t")
    for line in gtfReader:
        # Store all transcript lines
        if (line[2] == transcript_feature_name):
            txName = re.sub('.*transcript_id "([^"]+)";.*', "\\1", line[8])
            if (line[6] != "+" and line[6] != "-"):
                raise BEDexception("Transcript with unrecognized strand: " + txName)
            # Start-1 converts from 1-based to 0-based
            transcripts[txName] = [
                line[0], int(line[3]) - 1, int(line[4]), txName, 0, line[6]
            ]
            # Parse the extra fields
            if (extra != ['']):
                for field in extra:
                    extrainfo.setdefault(txName, dict())[field] = re.sub(
                        '.*' + field + ' "?([^"]+)"?;.*', "\\1", line[8])
                    # If no substitution occured, set the extra field to '.'
                    if (extrainfo[txName][field] == line[8]):
                        extrainfo[txName][field] = "."
            if filterType != [''] and filterKey not in extra:
                extrainfo.setdefault(txName, dict())[filterKey] = re.sub(
                    '.*' + filterKey + ' "?([^"]+)"?;.*', "\\1", line[8])
                # If no substitution occured, set the extra field to '.'
                if (extrainfo[txName][filterKey] == line[8]):
                    # BUGFIX: this wrote extrainfo[txName][field], using a
                    # stale (or undefined, when extra==['']) loop variable
                    # from the block above instead of filterKey.
                    extrainfo[txName][filterKey] = "."
        # Parse exon lines
        if (line[2] == 'exon'):
            txName = re.sub('.*transcript_id "([^"]+)";.*', "\\1", line[8])
            if (line[6] != transcripts[txName][5]):
                raise BEDexception(
                    "Exon has different strand from parent transcript: " + txName)
            start = int(line[3]) - 1
            length = int(line[4]) - int(line[3])
            exons.setdefault(txName, []).append([start, length])
        # Start CDS, start and stop codons
        # Any of these features extends the CDS
        if (line[2] in ['CDS', 'start_codon', 'stop_codon']):
            txName = re.sub('.*transcript_id "([^"]+)";.*', "\\1", line[8])
            if (line[6] != transcripts[txName][5]):
                raise BEDexception(
                    ("%s has different strand from parent transcript: %s" % line[2], txName))
            start = int(line[3])
            stop = int(line[4])
            if (txName not in cds.keys()):
                cds[txName] = [start, stop]
            else:
                # Is the current start/stop are up/down compared to
                # the previous one, update cds.
                if (start < cds[txName][0]):
                    cds[txName][0] = start
                if (stop > cds[txName][1]):
                    cds[txName][1] = stop
    gtf.close()
    for transcript in transcripts.keys():
        if (filterType != ['']):
            if extrainfo[transcript][filterKey] not in filterType:
                continue
        if (transcript in cds.keys()):
            # Convert the CDS start back to 0-based.
            cdsStart = int(cds[transcript][0]) - 1
            cdsEnd = int(cds[transcript][1])
        else:
            # No CDS: thickStart == thickEnd == transcript end (non-coding).
            cdsStart = transcripts[transcript][2]
            cdsEnd = transcripts[transcript][2]
        transcripts[transcript].append(cdsStart)
        transcripts[transcript].append(cdsEnd)
        transcripts[transcript].append('0')
        # Add the number of exons in field 10
        transcripts[transcript].append(len(exons[transcript]))
        # Sort the [start, length] pairs by start position
        exons[transcript].sort(key=lambda x: x[0])
        starts = ""
        lens = ""
        for exon in exons[transcript]:
            starts = starts + str(exon[0] - int(transcripts[transcript][1])) + ","
            lens = lens + str(exon[1] + 1) + ","
        transcripts[transcript].append(lens)
        transcripts[transcript].append(starts)
        # Convert to bedline for format validation
        out = list()
        bed = bedline(transcripts[transcript])
        for key in bed._bedline__fields[:bed.bedType]:
            # Restore the trailing commas that bedline normalises away.
            if key == "exLengths":
                bed.exLengths += ","
            if key == "exStarts":
                bed.exStarts += ","
            out.append(bed.__dict__[key])
        if (extra != ['']):
            for field in extra:
                out.append(extrainfo[bed.name][field])
        print(*out, sep="\t")
def test_promoterUnstranded(self):
    '''promoters() should return correct promotersUnstranded with known input'''
    for bed12, expected in self.known_promotersUnstranded:
        got = bedparse.bedline(bed12).promoter(strand=0)
        self.assertEqual(got, bedparse.bedline(expected))
def fiveP(args):
    """Print the 5' UTR of every record in the input BED file, skipping records without one."""
    with args.bedfile as tsvfile:
        for raw in tsvfile:
            five_utr = bedline(raw.split('\t')).utr(which=5)
            if five_utr:
                five_utr.print()
        tsvfile.close()
def introns(args):
    """Print the intron record of every transcript in the input BED file that has introns."""
    with args.bedfile as tsvfile:
        for raw in tsvfile:
            intron_record = bedline(raw.split('\t')).introns()
            # Single-exon transcripts yield no introns; skip them.
            if intron_record:
                intron_record.print()
        tsvfile.close()
def save_to_bed(self, output_fn=None, bedgraph=False, pvalue_field=None, pvalue_thr=0.01, span=5, convert=None, assembly=None, title=None):
    """
    Save the position of significant positions in the genome space in BED6 or BEDGRAPH format.
    The resulting file can be used in a genome browser to visualise significant genomic locations.
    The option is only available if `SampCompDB` if initialised with a BED file containing genome annotations.
    * output_fn
        Path to file where to write the data
    * bedgraph
        save file in bedgraph format instead of bed
    * pvalue_field
        specifies what column to use as BED score (field 5, as -log10)
    * pvalue_thr
        only report positions with pvalue<=thr
    * span
        The size of each BED feature. If size=5 (default) features correspond to kmers.
        If size=1 features correspond to the first base of each kmer.
    * convert
        one of 'ensembl_to_ucsc' or 'ucsc_to_ensembl". Convert chromosome named between Ensembl and Ucsc conventions
    * assembly
        required if convert is used. One of "hg38" or "mm10"
    """
    if self._bed_fn is None:
        raise NanocomporeError("In order to generate a BED file SampCompDB needs to be initialised with a transcriptome BED")
    if span < 1:
        raise NanocomporeError("span has to be >=1")
    if span != 5 and bedgraph:
        raise NanocomporeError("Span is ignored when generating bedGraph files")
    if pvalue_field not in self.results:
        raise NanocomporeError(("The field '%s' is not in the results" % pvalue_field))
    if "results" not in self.__dict__:
        raise NanocomporeError("It looks like there's not results slot in SampCompDB")
    if convert not in [None, "ensembl_to_ucsc", "ucsc_to_ensembl"]:
        raise NanocomporeError("Convert value not valid")
    if convert is not None and assembly is None:
        raise NanocomporeError("The assembly argument is required in order to do the conversion. Choose one of 'hg38' or 'mm10'")
    with open(output_fn, "w") as bed_file:
        if title is not None:
            if not bedgraph:
                bed_file.write('track type=bed name="%s" description="%s"\n' % (title, pvalue_field))
            else:
                bed_file.write('track type=bedGraph name="%s" description="%s"\n' % (title, pvalue_field))
        Record = namedtuple('Record', ['chr', 'genomicPos', 'ref_id', 'strand', 'ref_kmer', pvalue_field])
        for record in self.results[list(Record._fields)].itertuples(index=False, name="Record"):
            pvalue = getattr(record, pvalue_field)
            # NaN -> 0; clamp tiny p-values to the smallest representable
            # float before taking -log10 to avoid math domain errors.
            if np.isnan(pvalue):
                pvalue = 0
            elif pvalue < sys.float_info.min:
                pvalue = -log(sys.float_info.min, 10)
            else:
                pvalue = -log(pvalue, 10)
            if not bedgraph and pvalue >= -log(pvalue_thr, 10):
                # On the minus strand the kmer spans backwards from genomicPos.
                if record.strand == "+":
                    line = bedline([record.chr, record.genomicPos, record.genomicPos + span, "%s_%s" % (record.ref_id, record.ref_kmer), pvalue, record.strand])
                else:
                    line = bedline([record.chr, record.genomicPos - (span - 1), record.genomicPos + 1, "%s_%s" % (record.ref_id, record.ref_kmer), pvalue, record.strand])
                # BUGFIX: the original used `convert is "ensembl_to_ucsc"`;
                # 'is' compares identity, not equality, so the conversion
                # could silently never run. Use '==' for string comparison.
                if convert == "ensembl_to_ucsc":
                    line = line.translateChr(assembly=assembly, target="ucsc", patches=True)
                elif convert == "ucsc_to_ensembl":
                    line = line.translateChr(assembly=assembly, target="ens", patches=True)
                bed_file.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (line.chr, line.start, line.end, line.name, line.score, line.strand))
            elif bedgraph:
                if record.strand == "+":
                    line = bedline([record.chr, record.genomicPos + 2, record.genomicPos + 3, "%s_%s" % (record.ref_id, record.ref_kmer), pvalue, record.strand])
                else:
                    line = bedline([record.chr, record.genomicPos - 2, record.genomicPos - 1, "%s_%s" % (record.ref_id, record.ref_kmer), pvalue, record.strand])
                # BUGFIX: same 'is' -> '==' fix as above.
                if convert == "ensembl_to_ucsc":
                    line = line.translateChr(assembly=assembly, target="ucsc", patches=True)
                elif convert == "ucsc_to_ensembl":
                    line = line.translateChr(assembly=assembly, target="ens", patches=True)
                bed_file.write("%s\t%s\t%s\t%s\n" % (line.chr, line.start, line.end, line.score))
def test_promoter100(self):
    '''promoters() should return correct promoters100 with known input'''
    for bed12, expected in self.known_promoters100:
        got = bedparse.bedline(bed12).promoter(up=100, down=100)
        self.assertEqual(got, bedparse.bedline(expected))