Ejemplo n.º 1
0
def parse(fileName, junctions):
    """Feed every alignment found in a PSL file to ``construct``.

    Args:
        fileName: Path of the PSL file to read.
        junctions: Object handed unchanged to ``construct`` for each
            alignment (presumably the junction collection it fills in;
            verify against ``construct``).

    Returns:
        None.
    """
    with open(fileName) as pslHandle:
        # 'track' is the second argument expected by psl_parser.read.
        for record in psl_parser.read(pslHandle, 'track'):
            fields = record.attrib
            starts = fields['tStarts']
            sizes = fields['blockSizes']
            construct(fields['tName'], starts, sizes, junctions)
Ejemplo n.º 2
0
def main(options, args):
    """Assemble gene models from PSL alignments and write them to disk.

    The stages run in this order, each announced on stderr:

    1. Parse the alignments and cluster their exons (``construct``).
    2. Merge clusters (``mergeClusters``).
    3. Clean up linked exons (``cleanUpLinkedExons``).
    4. Modify the right end of each transcript (``findLongestEnd``).
    5. Construct transcripts as exon paths (``buildPaths``).
    6. Build an ``Isoform`` for every path and flag redundant ones
       (``findRedundantSequence``).
    7. Search each non-redundant isoform for an ORF (``findORF``) and
       collect DNA, mRNA and protein sequence records.
    8. Write a BED file (``writeBEDFile``) and three FASTA files.

    Args:
        options: Parsed command-line options.  The attributes read here are
            ``infile`` (PSL alignment file), ``genome`` (path handed to
            ``seqdb.SequenceFileDB``), ``basename`` (prefix of every output
            file) and ``minimumUTRLength`` (forwarded to
            ``cleanUpLinkedExons``).
        args: Positional command-line arguments; not used.

    Returns:
        None.  Results are written to ``<basename>.dnas.fa``,
        ``<basename>.mrnas.fa``, ``<basename>.proteins.fa`` and whatever
        ``writeBEDFile`` creates from ``basename``.
    """
    exons = {}
    clusters = {}
    newClusterID = 0
    clusterConnections = {}
    linkedExons = {}
    exonPositions = {}
    endExons = {}
    singleton = 0

    print >> sys.stderr, 'Minimum UTR length = ', options.minimumUTRLength
    print >> sys.stderr, 'Parsing and clustering exons..'
    # The ``with`` block guarantees the alignment file is closed once all
    # records have been consumed.
    with open(options.infile) as infile:
        for n, alnObj in enumerate(psl_parser.read(infile, 'track')):
            tStarts = alnObj.attrib['tStarts']
            blockSizes = alnObj.attrib['blockSizes']

            # An alignment made of a single block counts as a singleton.
            if len(blockSizes) == 1:
                singleton += 1

            tName = alnObj.attrib['tName']
            newClusterID = construct(tName, tStarts, blockSizes, exons,
                                     clusters, newClusterID,
                                     clusterConnections, linkedExons,
                                     exonPositions, endExons)
            if n % 1000 == 0:
                print >> sys.stderr, '...', n

    print >> sys.stderr, 'Total singletons = ', singleton

    # Keys of ``exons`` unpack into two items; only the first (the
    # reference name) is needed for the per-reference tally.
    sumExons = {}
    for ref, _end in exons:
        sumExons[ref] = sumExons.get(ref, 0) + 1
    for ref in sorted(sumExons):
        print >> sys.stderr, '\t%s has %d exon(s).' % (ref, sumExons[ref])

    print >> sys.stderr, '\nTotal %d cluster(s) found.' % len(clusters)

    print >> sys.stderr, '\nMerging clusters..'
    mergedClusters = mergeClusters(clusters, clusterConnections)
    print >> sys.stderr, '\nCleaning up..'
    ignored = set()
    for cl in mergedClusters:
        cleanUpLinkedExons(mergedClusters[cl], linkedExons, exonPositions,
                           ignored, options.minimumUTRLength)

    print >> sys.stderr, 'Modifying the right end of each transcript..'
    for cl in mergedClusters:
        findLongestEnd(mergedClusters[cl], linkedExons, endExons,
                       exonPositions, ignored)

    print >> sys.stderr, '\nConstructing transcripts..'
    allPaths = {}
    visited = set()
    for n, cl in enumerate(mergedClusters):
        txExons = sorted(mergedClusters[cl])
        allPaths[cl] = buildPaths(linkedExons, txExons, allPaths, ignored,
                                  visited)
        if n > 0 and n % 1000 == 0:
            print >> sys.stderr, '... %d built..' % n

    genome = seqdb.SequenceFileDB(options.genome, verbose=False)

    # Create isoform objects from allPaths, grouped as
    # allGenes[chrom][geneID] -> list of isoforms.
    print >> sys.stderr, '\nBuilding gene models..'
    allGenes = {}
    for geneCount, (chrom, geneID) in enumerate(allPaths, 1):
        for isoformID, isoExons in enumerate(allPaths[(chrom, geneID)]):
            isoform = Isoform(chrom, geneID, isoformID, isoExons, genome)
            allGenes.setdefault(chrom, {}).setdefault(geneID, []) \
                    .append(isoform)

        # Report once per gene rather than once per isoform of that gene.
        if geneCount % 1000 == 0:
            print >> sys.stderr, '...', geneCount

    print >> sys.stderr, '\nRemoving redundant sequences..'
    findRedundantSequence(allGenes)

    # Create sequence records for the DNA, RNA and protein sequences.
    isoformDNASeqs = []
    isoformProteinSeqs = []
    isoformRNASeqs = []
    totalGenes = 0
    transcriptCount = 0  # isoforms visited so far, drives progress output
    for chrom in allGenes:
        for geneID in allGenes[chrom]:
            totalGenes += 1
            # Non-redundant isoforms are renumbered consecutively from 0.
            isoformID = 0
            for isoform in allGenes[chrom][geneID]:
                if not isoform.redundant:
                    isoform.isoformID = isoformID
                    isoformName = '%s:%d.%d' % (chrom, geneID,
                                                isoform.isoformID)
                    isoformDNASeqs.append(SeqRecord(isoform.dnaSeq,
                                                    id=isoformName))

                    # Search for an ORF in non-redundant sequences only.
                    print >> sys.stderr, 'searching ORF: %s:%d.%d' \
                                            % (chrom, geneID, isoformID)
                    findORF(isoform)

                    # Protein and mRNA records are kept only when
                    # ``isoform.frame`` is truthy after the ORF search.
                    if isoform.frame:
                        isoformProteinSeqs.append(
                            SeqRecord(isoform.proteinSeq, id=isoformName))
                        isoformRNASeqs.append(
                            SeqRecord(isoform.mrnaSeq, id=isoformName))
                    isoformID += 1

                transcriptCount += 1
                if transcriptCount % 1000 == 0:
                    print >> sys.stderr, '...', transcriptCount, \
                                            'transcripts done.'

    # The count must be interpolated with ``%``; passing it as a second
    # print argument would emit a literal "%d".
    print >> sys.stderr, 'Total genes = %d\n\n' % totalGenes
    print >> sys.stderr, 'Writing gene models to file...'
    writeBEDFile(allGenes, options.basename)
    print >> sys.stderr, 'Writing DNA sequences to file...'
    SeqIO.write(isoformDNASeqs, options.basename + '.dnas.fa', 'fasta')
    print >> sys.stderr, 'Writing RNA sequences to file...'
    SeqIO.write(isoformRNASeqs, options.basename + '.mrnas.fa', 'fasta')
    print >> sys.stderr, 'Writing protein sequences to file...'
    SeqIO.write(isoformProteinSeqs, options.basename + '.proteins.fa', 'fasta')
def main(options, args):
    """Assemble gene models from PSL alignments and write them to disk.

    The stages run in this order, each announced on stderr:

    1. Parse the alignments and cluster their exons (``construct``).
    2. Merge clusters (``mergeClusters``).
    3. Clean up linked exons (``cleanUpLinkedExons``).
    4. Modify the right end of each transcript (``findLongestEnd``).
    5. Construct transcripts as exon paths (``buildPaths``).
    6. Build an ``Isoform`` for every path and flag redundant ones
       (``findRedundantSequence``).
    7. Search each non-redundant isoform for an ORF (``findORF``) and
       collect DNA, mRNA and protein sequence records.
    8. Write a BED file (``writeBEDFile``) and three FASTA files.

    Args:
        options: Parsed command-line options.  The attributes read here are
            ``infile`` (PSL alignment file), ``genome`` (path handed to
            ``seqdb.SequenceFileDB``), ``basename`` (prefix of every output
            file) and ``minimumUTRLength`` (forwarded to
            ``cleanUpLinkedExons``).
        args: Positional command-line arguments; not used.

    Returns:
        None.  Results are written to ``<basename>.dnas.fa``,
        ``<basename>.mrnas.fa``, ``<basename>.proteins.fa`` and whatever
        ``writeBEDFile`` creates from ``basename``.
    """
    exons = {}
    clusters = {}
    newClusterID = 0
    clusterConnections = {}
    linkedExons = {}
    exonPositions = {}
    endExons = {}
    singleton = 0

    print >> sys.stderr, 'Minimum UTR length = ', options.minimumUTRLength
    print >> sys.stderr, 'Parsing and clustering exons..'
    # The ``with`` block guarantees the alignment file is closed once all
    # records have been consumed.
    with open(options.infile) as infile:
        for n, alnObj in enumerate(psl_parser.read(infile, 'track')):
            tStarts = alnObj.attrib['tStarts']
            blockSizes = alnObj.attrib['blockSizes']

            # An alignment made of a single block counts as a singleton.
            if len(blockSizes) == 1:
                singleton += 1

            tName = alnObj.attrib['tName']
            newClusterID = construct(tName, tStarts, blockSizes, exons,
                                     clusters, newClusterID,
                                     clusterConnections, linkedExons,
                                     exonPositions, endExons)
            if n % 1000 == 0:
                print >> sys.stderr, '...', n

    print >> sys.stderr, 'Total singletons = ', singleton

    # Keys of ``exons`` unpack into two items; only the first (the
    # reference name) is needed for the per-reference tally.
    sumExons = {}
    for ref, _end in exons:
        sumExons[ref] = sumExons.get(ref, 0) + 1
    for ref in sorted(sumExons):
        print >> sys.stderr, '\t%s has %d exon(s).' % (ref, sumExons[ref])

    print >> sys.stderr, '\nTotal %d cluster(s) found.' % len(clusters)

    print >> sys.stderr, '\nMerging clusters..'
    mergedClusters = mergeClusters(clusters, clusterConnections)
    print >> sys.stderr, '\nCleaning up..'
    ignored = set()
    for cl in mergedClusters:
        cleanUpLinkedExons(mergedClusters[cl], linkedExons, exonPositions,
                           ignored, options.minimumUTRLength)

    print >> sys.stderr, 'Modifying the right end of each transcript..'
    for cl in mergedClusters:
        findLongestEnd(mergedClusters[cl], linkedExons, endExons,
                       exonPositions, ignored)

    print >> sys.stderr, '\nConstructing transcripts..'
    allPaths = {}
    visited = set()
    for n, cl in enumerate(mergedClusters):
        txExons = sorted(mergedClusters[cl])
        allPaths[cl] = buildPaths(linkedExons, txExons, allPaths, ignored,
                                  visited)
        if n > 0 and n % 1000 == 0:
            print >> sys.stderr, '... %d built..' % n

    genome = seqdb.SequenceFileDB(options.genome, verbose=False)

    # Create isoform objects from allPaths, grouped as
    # allGenes[chrom][geneID] -> list of isoforms.
    print >> sys.stderr, '\nBuilding gene models..'
    allGenes = {}
    for geneCount, (chrom, geneID) in enumerate(allPaths, 1):
        for isoformID, isoExons in enumerate(allPaths[(chrom, geneID)]):
            isoform = Isoform(chrom, geneID, isoformID, isoExons, genome)
            allGenes.setdefault(chrom, {}).setdefault(geneID, []) \
                    .append(isoform)

        # Report once per gene rather than once per isoform of that gene.
        if geneCount % 1000 == 0:
            print >> sys.stderr, '...', geneCount

    print >> sys.stderr, '\nRemoving redundant sequences..'
    findRedundantSequence(allGenes)

    # Create sequence records for the DNA, RNA and protein sequences.
    isoformDNASeqs = []
    isoformProteinSeqs = []
    isoformRNASeqs = []
    totalGenes = 0
    transcriptCount = 0  # isoforms visited so far, drives progress output
    for chrom in allGenes:
        for geneID in allGenes[chrom]:
            totalGenes += 1
            # Non-redundant isoforms are renumbered consecutively from 0.
            isoformID = 0
            for isoform in allGenes[chrom][geneID]:
                if not isoform.redundant:
                    isoform.isoformID = isoformID
                    isoformName = '%s:%d.%d' % (chrom, geneID,
                                                isoform.isoformID)
                    isoformDNASeqs.append(SeqRecord(isoform.dnaSeq,
                                                    id=isoformName))

                    # Search for an ORF in non-redundant sequences only.
                    print >> sys.stderr, 'searching ORF: %s:%d.%d' \
                                            % (chrom, geneID, isoformID)
                    findORF(isoform)

                    # Protein and mRNA records are kept only when
                    # ``isoform.frame`` is truthy after the ORF search.
                    if isoform.frame:
                        isoformProteinSeqs.append(
                            SeqRecord(isoform.proteinSeq, id=isoformName))
                        isoformRNASeqs.append(
                            SeqRecord(isoform.mrnaSeq, id=isoformName))
                    isoformID += 1

                transcriptCount += 1
                if transcriptCount % 1000 == 0:
                    print >> sys.stderr, '...', transcriptCount, \
                                            'transcripts done.'

    # The count must be interpolated with ``%``; passing it as a second
    # print argument would emit a literal "%d".
    print >> sys.stderr, 'Total genes = %d\n\n' % totalGenes
    print >> sys.stderr, 'Writing gene models to file...'
    writeBEDFile(allGenes, options.basename)
    print >> sys.stderr, 'Writing DNA sequences to file...'
    SeqIO.write(isoformDNASeqs, options.basename + '.dnas.fa', 'fasta')
    print >> sys.stderr, 'Writing RNA sequences to file...'
    SeqIO.write(isoformRNASeqs, options.basename + '.mrnas.fa', 'fasta')
    print >> sys.stderr, 'Writing protein sequences to file...'
    SeqIO.write(isoformProteinSeqs, options.basename + '.proteins.fa', 'fasta')
Ejemplo n.º 4
0
                        if exon_end == intron_start:
                            as_exons = [exon_start]
                            coverage = len([alread for alread in samfile.fetch(chr, exon_start, exon_end)])
                            print '>%d\t%d\t%d' % (exon_start, exon_end, coverage)
                            break

                for intron_end in junctions[chr][intron_start]:   # es = intron end site
                    ex_start = intron_end               # intron end site = exon start site
                    for ex_end in exons[chr][ex_start]:
                        coverage = len([alread for alread in samfile.fetch(chr, ex_start , ex_end)])
                        print '%d\t%d\t%d' % (ex_start, ex_end, coverage)
                        as_exons.append(ex_end)
                as_exons.sort()
                print '%d-%d' % (as_exons[0], as_exons[-1])

if __name__ == '__main__':
    # Command line: argv[1] = PSL alignment file, argv[2] = file opened
    # with pysam in 'rb' mode, argv[3] = SNP file.
    fname = sys.argv[1]
    # Only the coverage analyses that are commented out below use samfile.
    samfile = pysam.Samfile(sys.argv[2], 'rb')
    SNP_file = sys.argv[3]  # read from argv but not used in this section

    comment = 'track'
    # Load every alignment up front; the ``with`` block closes the PSL file
    # as soon as the parser has been fully consumed.
    with open(fname) as psl_file:
        all_transcripts = list(psl_parser.read(psl_file, comment))

    exons, junctions = construct(all_transcripts)
    #AS_coverage(exons, junctions, samfile)
    #cassette_exon(exons, junctions, samfile)
    export_junctions(junctions)