Ejemplo n.º 1
0
 def test_bed12tobed6_last(self):
     '''Check bed12tobed6(whichExon="last") against the known fixtures.

     Each fixture pairs a BED12 input with a sequence whose first item
     describes the BED6 record expected as the first returned element.
     '''
     for bed12, expected_records in self.known_bed12tobed6_last:
         expected = bedparse.bedline(expected_records[0])
         exons = bedparse.bedline(bed12).bed12tobed6(appendExN=True,
                                                     whichExon="last")
         self.assertEqual(exons[0], expected)
Ejemplo n.º 2
0
def prom(args):
    """Print the promoter of every record read from ``args.bedfile``.

    Each line is split on tabs, parsed with ``bedline`` and its promoter is
    computed with ``promoter(up=args.up, down=args.down, strand=...)``, where
    ``strand`` is true unless ``args.unstranded`` is set. The result is
    printed.

    The ``with`` statement closes the input when the loop ends, so no
    explicit ``close()`` call is needed afterwards.
    """
    stranded = not args.unstranded
    with args.bedfile as tsvfile:
        for line in tsvfile:
            record = bedline(line.split('\t'))
            record.promoter(up=args.up, down=args.down,
                            strand=stranded).print()
Ejemplo n.º 3
0
 def test_bed12tobed6(self):
     '''Check bed12tobed6(appendExN=True) against the known fixture.'''
     bed12, expected_fields = self.known_bed12tobed6
     expected = [bedparse.bedline(fields) for fields in expected_fields]
     exons = bedparse.bedline(bed12).bed12tobed6(appendExN=True)
     self.assertEqual(exons, expected)
Ejemplo n.º 4
0
 def test_CDSs(self):
     '''Check cds() on known cases, and that ignoreCDSonly=True gives None.'''
     for bed12, expected_cds in self.known_CDSs:
         observed = bedparse.bedline(bed12).cds()
         self.assertEqual(observed, bedparse.bedline(expected_cds))
     # The second element of these fixtures is not used: the expected
     # result is always None.
     for bed12, _ in self.known_CDS_ignoreCDSonly:
         observed = bedparse.bedline(bed12).cds(ignoreCDSonly=True)
         self.assertEqual(observed, None)
Ejemplo n.º 5
0
 def test_3pUTRs(self):
     '''Check utr(which=3) against the known 3' UTR fixtures.

     A fixture whose expected value is None asserts that None is returned.
     '''
     for bed12, expected_fields in self.known_3pUTRs:
         observed = bedparse.bedline(bed12).utr(which=3)
         expected = (None if expected_fields is None
                     else bedparse.bedline(expected_fields))
         self.assertEqual(observed, expected)
Ejemplo n.º 6
0
 def test_introns(self):
     '''Check introns() against the known intron fixtures.

     A fixture whose expected value is None asserts that None is returned.
     '''
     for bed12, expected_fields in self.known_introns:
         observed = bedparse.bedline(bed12).introns()
         expected = (None if expected_fields is None
                     else bedparse.bedline(expected_fields))
         self.assertEqual(observed, expected)
Ejemplo n.º 7
0
def cds(args):
    """Print the CDS of every record read from ``args.bedfile``.

    Each line is split on tabs, parsed with ``bedline`` and passed through
    ``cds(ignoreCDSonly=args.ignoreCDSonly)``. Falsy results are skipped;
    the others are printed.

    The ``with`` statement closes the input when the loop ends, so no
    explicit ``close()`` call is needed afterwards.
    """
    with args.bedfile as tsvfile:
        for line in tsvfile:
            record = bedline(line.split('\t'))
            coding = record.cds(ignoreCDSonly=args.ignoreCDSonly)
            if coding:
                coding.print()
Ejemplo n.º 8
0
def join(args):
    """Append annotation columns to the BED records read from ``args.bedfile``.

    The annotation file ``args.annotation`` is read with ``csv.reader`` using
    ``args.separator`` as delimiter. Its 1-based column ``args.column`` is the
    key matched against the BED name; the remaining columns are the values.

    For every annotation line matching a record, the record is printed
    followed by that line's remaining columns. A record without a match is
    printed unchanged, or followed by ``args.empty`` when that is not the
    empty string. When ``args.noUnmatched`` is set, lines whose fourth field
    is not an annotation key are skipped before being parsed.

    Raises:
        BEDexception: if the annotation file cannot be opened, or one of its
            lines is too short to contain the annotation column.
    """
    col = args.column - 1
    annot = dict()
    try:
        annotation = open(args.annotation)
    except (OSError, TypeError, ValueError) as err:
        # Narrower than a bare except: KeyboardInterrupt and SystemExit are
        # no longer converted, and the original cause stays attached.
        raise BEDexception("Annotation file not valid") from err
    # The context manager closes the handle even when a short line raises.
    with annotation:
        for fields in csv.reader(annotation, delimiter=args.separator):
            if len(fields) <= col:
                raise BEDexception(
                    "Some lines don't contain the annotation column")
            annot.setdefault(fields[col], []).append(
                fields[:col] + fields[col + 1:])

    with args.bedfile as tsvfile:
        for line in tsvfile:
            fields = line.split('\t')
            # csv.reader strips line terminators from the annotation keys,
            # but str.split leaves the newline on the last BED field, so a
            # BED4 name would never match unless it is stripped here.
            if args.noUnmatched and fields[3].rstrip('\r\n') not in annot:
                continue
            record = bedline(fields)
            if not record:
                continue
            # .get() instead of setdefault(): a lookup should not add empty
            # entries to the annotation table.
            matches = annot.get(record.name, [])
            if not matches:
                if args.empty == '':
                    record.print()
                else:
                    record.print(end='')
                    print('', args.empty, sep="\t")
                continue
            for extraColumns in matches:
                record.print(end='')
                print('', *extraColumns, sep='\t')
Ejemplo n.º 9
0
 def test_tx2genome(self):
     '''Check tx2genome() on valid coordinates and that invalid ones raise.'''
     for fields, valid_cases, invalid_coords in self.known_tx2genome:
         transcript = bedparse.bedline(fields)
         for tx_coord, expected_genome_coord in valid_cases:
             self.assertEqual(transcript.tx2genome(tx_coord),
                              expected_genome_coord)
         for bad_coord in invalid_coords:
             with self.assertRaises(bedparse.BEDexception):
                 transcript.tx2genome(bad_coord)
Ejemplo n.º 10
0
def convertChr(args):
    """Print every record of ``args.bedfile`` after ``translateChr``.

    Each line is split on tabs, parsed with ``bedline`` and passed to
    ``translateChr`` with ``args.assembly``, ``args.target``,
    ``args.suppressMissing`` (as ``suppress``), ``args.allowMissing`` (as
    ``ignore``) and ``args.patches``. Falsy results are not printed.

    The ``with`` statement closes the input when the loop ends, so no
    explicit ``close()`` call is needed afterwards.
    """
    with args.bedfile as tsvfile:
        for line in tsvfile:
            translatedLine = bedline(line.split('\t')).translateChr(
                assembly=args.assembly,
                target=args.target,
                suppress=args.suppressMissing,
                ignore=args.allowMissing,
                patches=args.patches)
            if translatedLine:
                translatedLine.print()
Ejemplo n.º 11
0
def bed12tobed6(args):
    """Print every BED12 record of ``args.bedfile`` as BED6 records.

    For each input line the records returned by
    ``bed12tobed6(appendExN=args.appendExN, whichExon=args.whichExon)`` are
    printed. When ``args.keepIntrons`` is set, the record's introns are also
    converted and printed, with "_Exon<N>" in their names rewritten to
    "_Intron<N>".

    Raises:
        BEDexception: if ``args.keepIntrons`` is set while
            ``args.whichExon`` is not "all".
    """
    # Compare by value: ``is not`` tests object identity, which does not
    # reliably hold for two equal strings.
    if args.whichExon != "all" and args.keepIntrons:
        raise BEDexception(
            "--keepIntrons is only allowed with --whichExon all")
    # Compiled once rather than once per input line.
    nameSub = re.compile("_Exon([0-9]+)")
    with args.bedfile as tsvfile:
        for line in tsvfile:
            tx = bedline(line.split('\t'))
            for el in tx.bed12tobed6(appendExN=args.appendExN,
                                     whichExon=args.whichExon):
                el.print()
            if not args.keepIntrons:
                continue
            intronRecord = tx.introns()
            # introns() can return None (the test fixtures include such
            # cases); there is then nothing to convert.
            if not intronRecord:
                continue
            for el in intronRecord.bed12tobed6(appendExN=args.appendExN):
                el.name = nameSub.sub(r"_Intron\1", el.name)
                el.print()
Ejemplo n.º 12
0
def validateFormat(args):
    """Validate every line of ``args.bedfile`` and print the parsed records.

    When ``args.fixSeparators`` is set, leading and trailing whitespace is
    removed and every remaining run of whitespace is replaced by one tab
    before the line is parsed with ``bedline``.

    Raises:
        BEDexception: on the first line that ``bedline`` rejects; the message
            carries the 1-based line number and the original error, which is
            also attached as the exception cause.
    """
    with args.bedfile as tsvfile:
        for lineNumber, line in enumerate(tsvfile, start=1):
            if args.fixSeparators:
                line = re.sub(r'^\s+', '', line)
                line = re.sub(r'\s+', '\t', line)
                line = re.sub(r'\s+$', '', line)
            try:
                validatedLine = bedline(line.split('\t'))
            except BEDexception as formatException:
                # The with statement closes the file while the exception
                # propagates, so no explicit close() is needed here.
                raise BEDexception(
                    "\nThis doesn't appear to be a valid BED file. There was an error at line %s:\n\t\"%s\""
                    % (lineNumber, formatException)) from formatException
            validatedLine.print()
Ejemplo n.º 13
0
    def __calculate_results(self, adjust=True):
        """
        Collect the per-position p-values of all references in a DataFrame.

        Iterating over `self` yields `(ref_id, ref_pos_list)` pairs. Every
        position whose dict has a "txComp" entry contributes one row with the
        reference id, the position index, its "ref_kmer" and the value of each
        test listed in `self._metadata["pvalue_tests"]` that is present.

        When `self._bed_fn` is set, the BED file is read and the columns
        "chr", "strand" and "genomicPos" are added from the record whose name
        (4th BED field) equals the row's reference id.
        * adjust
            If True, every p-value column is replaced by the output of
            `__multipletests_filter_nan` called with method="fdr_bh".

        Raises NanocomporeError if the BED file cannot be read or parsed, or
        if the number of matched BED records differs from the number of
        reference ids.
        """
        pvalue_tests = self._metadata["pvalue_tests"]

        # Collect all pvalue results in a dataframe
        rows = []
        for ref_id, ref_pos_list in self:
            for pos, pos_dict in enumerate(ref_pos_list):
                if "txComp" not in pos_dict:
                    continue
                row_dict = OrderedDict()
                row_dict["ref_id"] = ref_id
                row_dict["pos"] = pos
                row_dict["ref_kmer"] = pos_dict["ref_kmer"]
                tx_comp = pos_dict["txComp"]
                for test in pvalue_tests:
                    if test in tx_comp:
                        row_dict[test] = tx_comp[test]
                rows.append(row_dict)
        df = pd.DataFrame(rows)

        if self._bed_fn:
            # A set makes the per-line membership test O(1)
            wanted_ids = set(self.ref_id_list)
            bed_annot = {}
            try:
                with open(self._bed_fn) as tsvfile:
                    for line in tsvfile:
                        fields = line.split('\t')
                        record_name = fields[3]
                        if record_name in wanted_ids:
                            bed_annot[record_name] = bedline(fields)
            except Exception as E:
                # Still reported as NanocomporeError for callers, but without
                # trapping KeyboardInterrupt/SystemExit, and with the real
                # cause (I/O or parsing error) attached.
                raise NanocomporeError("Can't open BED file") from E
            if len(bed_annot) != len(self.ref_id_list):
                raise NanocomporeError("Some references are missing from the BED file provided")

            df['genomicPos'] = df.apply(lambda row: bed_annot[row['ref_id']].tx2genome(coord=row['pos'], stranded=True), axis=1)
            # This is very inefficient. We should get chr and strand only once per transcript, ideally when writing the BED file
            df['chr'] = df.apply(lambda row: bed_annot[row['ref_id']].chr, axis=1)
            df['strand'] = df.apply(lambda row: bed_annot[row['ref_id']].strand, axis=1)
            df = df[['ref_id', 'pos', 'chr', 'strand', 'genomicPos', 'ref_kmer'] + pvalue_tests]
        else:
            df = df[['ref_id', 'pos', 'ref_kmer'] + pvalue_tests]

        if adjust:
            for col in pvalue_tests:
                df[col] = self.__multipletests_filter_nan(df[col], method="fdr_bh")
        return df
Ejemplo n.º 14
0
def gtf2bed(gtf,
            extra=[''],
            filterKey="transcript_biotype",
            filterType=[''],
            transcript_feature_name="transcript"):
    """Convert a GTF annotation to BED12, printing one line per transcript.

    Args:
        gtf: open text handle on the GTF file. Lines starting with '#' are
            skipped. The handle is closed once it has been read.
        extra: names of GTF attributes to append as extra output columns
            ('.' when a transcript lacks the attribute). ``['']`` disables
            extra columns. The default list is never mutated.
        filterKey: name of the GTF attribute used for filtering.
        filterType: accepted values of ``filterKey``; transcripts with any
            other value are not printed. ``['']`` disables filtering.
        transcript_feature_name: value of the third GTF column that marks a
            transcript line.

    Raises:
        BEDexception: if a transcript strand is neither '+' nor '-', or if
            an exon, CDS, start_codon or stop_codon line has a different
            strand from its transcript.
    """
    txIdPattern = re.compile('.*transcript_id "([^"]+)";.*')

    def attribute(name, attributes):
        """Return the value of GTF attribute *name*, or '.' when absent."""
        value = re.sub('.*' + name + ' "?([^"]+)"?;.*', "\\1", attributes)
        # re.sub returns its input unchanged when the pattern did not match
        return "." if value == attributes else value

    transcripts = dict()
    exons = dict()
    cds = dict()
    extrainfo = dict()
    gtfReader = csv.reader((row for row in gtf if not row.startswith('#')),
                           delimiter="\t")
    for line in gtfReader:
        # Store all transcript lines
        if line[2] == transcript_feature_name:
            txName = txIdPattern.sub("\\1", line[8])
            if line[6] not in ("+", "-"):
                raise BEDexception("Transcript with unrecognized strand: " +
                                   txName)
            # Start-1 converts from 1-based to 0-based
            transcripts[txName] = [
                line[0],
                int(line[3]) - 1,
                int(line[4]), txName, 0, line[6]
            ]
            # Parse the extra fields
            if extra != ['']:
                for field in extra:
                    extrainfo.setdefault(txName, dict())[field] = attribute(
                        field, line[8])
            # The filter key is stored under its own name. The original code
            # wrote the '.' placeholder to the loop variable of the block
            # above, which is unbound when no extra fields are requested.
            if filterType != [''] and filterKey not in extra:
                extrainfo.setdefault(txName, dict())[filterKey] = attribute(
                    filterKey, line[8])
        # Parse exon lines
        if line[2] == 'exon':
            txName = txIdPattern.sub("\\1", line[8])
            if line[6] != transcripts[txName][5]:
                raise BEDexception(
                    "Exon has different strand from parent transcript: " +
                    txName)
            start = int(line[3]) - 1
            length = int(line[4]) - int(line[3])
            exons.setdefault(txName, []).append([start, length])

        # Start CDS, start and stop codons
        # Any of these features extends the CDS
        if line[2] in ['CDS', 'start_codon', 'stop_codon']:
            txName = txIdPattern.sub("\\1", line[8])
            if line[6] != transcripts[txName][5]:
                # Both values go in one tuple; formatting with only line[2]
                # raised TypeError instead of the intended error.
                raise BEDexception(
                    "%s has different strand from parent transcript: %s" %
                    (line[2], txName))
            start = int(line[3])
            stop = int(line[4])
            if txName not in cds:
                cds[txName] = [start, stop]
            else:
                # Widen the stored CDS when this feature starts earlier or
                # ends later than what was seen so far.
                if start < cds[txName][0]:
                    cds[txName][0] = start
                if stop > cds[txName][1]:
                    cds[txName][1] = stop
    gtf.close()

    for transcript, fields in transcripts.items():
        if filterType != [''] and \
                extrainfo[transcript][filterKey] not in filterType:
            continue
        if transcript in cds:
            cdsStart = int(cds[transcript][0]) - 1
            cdsEnd = int(cds[transcript][1])
        else:
            # No CDS: both coordinates are set to the transcript end
            cdsStart = fields[2]
            cdsEnd = fields[2]

        fields.append(cdsStart)
        fields.append(cdsEnd)
        fields.append('0')
        # Add the number of exons in field 10
        txExons = exons[transcript]
        fields.append(len(txExons))
        # Sort the [start, length] pairs by start position
        txExons.sort(key=lambda x: x[0])
        txStart = int(fields[1])
        # Exon starts are relative to the transcript start; the stored
        # length is end - start in 1-based coordinates, hence the +1.
        starts = "".join(str(exon[0] - txStart) + "," for exon in txExons)
        lens = "".join(str(exon[1] + 1) + "," for exon in txExons)
        fields.append(lens)
        fields.append(starts)
        # Convert to bedline for format validation
        out = list()
        bed = bedline(fields)
        for key in bed._bedline__fields[:bed.bedType]:
            # NOTE(review): presumably bedline drops the trailing comma of
            # these two fields and it is restored here -- confirm.
            if key == "exLengths":
                bed.exLengths += ","
            if key == "exStarts":
                bed.exStarts += ","
            out.append(bed.__dict__[key])
        if extra != ['']:
            for field in extra:
                out.append(extrainfo[bed.name][field])
        print(*out, sep="\t")
Ejemplo n.º 15
0
 def test_promoterUnstranded(self):
     '''Check promoter(strand=0) against the known unstranded promoters.'''
     for bed_fields, expected_fields in self.known_promotersUnstranded:
         observed = bedparse.bedline(bed_fields).promoter(strand=0)
         expected = bedparse.bedline(expected_fields)
         self.assertEqual(observed, expected)
Ejemplo n.º 16
0
def fiveP(args):
    """Print ``utr(which=5)`` for every record read from ``args.bedfile``.

    Each line is split on tabs and parsed with ``bedline``. Falsy results
    are skipped; the others are printed.

    The ``with`` statement closes the input when the loop ends, so no
    explicit ``close()`` call is needed afterwards.
    """
    with args.bedfile as tsvfile:
        for line in tsvfile:
            utr = bedline(line.split('\t')).utr(which=5)
            if utr:
                utr.print()
Ejemplo n.º 17
0
def introns(args):
    """Print ``introns()`` for every record read from ``args.bedfile``.

    Each line is split on tabs and parsed with ``bedline``. Falsy results
    are skipped; the others are printed.

    The ``with`` statement closes the input when the loop ends, so no
    explicit ``close()`` call is needed afterwards.
    """
    with args.bedfile as tsvfile:
        for line in tsvfile:
            # Named differently from this function to avoid shadowing it.
            intronRecord = bedline(line.split('\t')).introns()
            if intronRecord:
                intronRecord.print()
Ejemplo n.º 18
0
    def save_to_bed(self, output_fn=None, bedgraph=False, pvalue_field=None, pvalue_thr=0.01, span=5, convert=None, assembly=None, title=None):
        """
        Save the position of significant positions in the genome space in BED6 or BEDGRAPH format.
        The resulting file can be used in a genome browser to visualise significant genomic locations.
        The option is only available if `SampCompDB` is initialised with a BED file containing genome annotations.
        * output_fn
            Path to file where to write the data
        * bedgraph
            Save the file in bedGraph format instead of BED. Every position is written
            (pvalue_thr is not applied) as a one-base interval.
        * pvalue_field
            Specifies what column to use as BED score (field 5, as -log10).
            NaN values are written with a score of 0.
        * pvalue_thr
            BED output only: report positions with pvalue<=thr
        * span
            The size of each BED feature. Must be >=1, and must be left at 5 when bedgraph is True.
        * convert
            One of 'ensembl_to_ucsc' or 'ucsc_to_ensembl'. Convert chromosome names between Ensembl and UCSC conventions
        * assembly
            Required if convert is used. One of "hg38" or "mm10"
        * title
            If given, a track header line with this name is written first.

        Raises NanocomporeError if any of the arguments is not valid.
        """
        if self._bed_fn is None:
            raise NanocomporeError("In order to generate a BED file SampCompDB needs to be initialised with a transcriptome BED")
        if span < 1:
            raise NanocomporeError("span has to be >=1")
        if span != 5 and bedgraph:
            raise NanocomporeError("Span is ignored when generating bedGraph files")
        # Check that the results slot exists before looking anything up in
        # it, otherwise a missing slot surfaces as an AttributeError.
        if "results" not in self.__dict__:
            raise NanocomporeError("It looks like there's not results slot in SampCompDB")
        if pvalue_field not in self.results:
            raise NanocomporeError(("The field '%s' is not in the results" % pvalue_field))
        if convert not in [None, "ensembl_to_ucsc", "ucsc_to_ensembl"]:
            raise NanocomporeError("Convert value not valid")
        if convert is not None and assembly is None:
            raise NanocomporeError("The assembly argument is required in order to do the conversion. Choose one of 'hg38' or 'mm10' ")

        # Map the conversion name to the translateChr target. The lookup is
        # by value: `convert is "..."` compared object identity, which is
        # not guaranteed to hold for equal strings.
        target = {"ensembl_to_ucsc": "ucsc", "ucsc_to_ensembl": "ens"}.get(convert)
        min_float = sys.float_info.min

        with open(output_fn, "w") as bed_file:
            if title is not None:
                track_type = "bedGraph" if bedgraph else "bed"
                bed_file.write('track type=%s name="%s" description="%s"\n' % (track_type, title, pvalue_field))

            Record = namedtuple('Record', ['chr', 'genomicPos', 'ref_id','strand', 'ref_kmer', pvalue_field ])
            for record in self.results[ list(Record._fields) ].itertuples(index=False, name="Record"):
                pvalue = getattr(record, pvalue_field)
                if np.isnan(pvalue):
                    score = 0
                elif pvalue < min_float:
                    # Clamp so that log() is never called on 0 or a subnormal
                    score = -log(min_float, 10)
                else:
                    score = -log(pvalue, 10)

                pos = record.genomicPos
                if bedgraph:
                    if record.strand == "+":
                        start, end = pos+2, pos+3
                    else:
                        start, end = pos-2, pos-1
                elif score >= -log(pvalue_thr, 10):
                    if record.strand == "+":
                        start, end = pos, pos+span
                    else:
                        start, end = pos-(span-1), pos+1
                else:
                    # BED output: position below the significance threshold
                    continue

                line = bedline([record.chr, start, end, "%s_%s" % (record.ref_id, record.ref_kmer), score, record.strand])
                if target is not None:
                    line = line.translateChr(assembly=assembly, target=target, patches=True)

                if bedgraph:
                    bed_file.write("%s\t%s\t%s\t%s\n" % (line.chr, line.start, line.end, line.score))
                else:
                    bed_file.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (line.chr, line.start, line.end, line.name, line.score, line.strand))
Ejemplo n.º 19
0
 def test_promoter100(self):
     '''Check promoter(up=100, down=100) against the known promoters.'''
     for bed_fields, expected_fields in self.known_promoters100:
         observed = bedparse.bedline(bed_fields).promoter(up=100, down=100)
         expected = bedparse.bedline(expected_fields)
         self.assertEqual(observed, expected)