Beispiel #1
0
def checkGenes(chromosomes, exon_records, chromosome_load_fxn, cdnas, id_fxn):
    """Rebuild every gene's coding sequence and compare it with ``cdnas``.

    For each entry of ``chromosomes`` the chromosome data is obtained from
    ``chromosome_load_fxn``.  Every record group in ``exon_records`` whose
    first record lies on that chromosome is sorted by ``exon_start``,
    assembled with ``buildCodingSequence`` and, for strand ``'-1'``,
    reverse-complemented.  The result is compared against
    ``cdnas[id_fxn(sentinel_rec)]``:

    * equal lengths: any difference counts as an error;
    * assembled sequence exactly 3 longer: its last 3 characters are ignored
      for the comparison (presumably a trailing stop codon -- confirm);
    * any other length difference: counted as an error and a length error.

    A ``KeyError`` while looking up the reference is counted as a
    "no comparison" case and is not added to the error count.  A summary is
    printed per chromosome and once overall; nothing is returned.
    """
    grand_genes = 0
    grand_errors = 0
    for chrom in chromosomes:
        print("# Chromosome %s" % (chrom,))
        chrom_seq = chromosome_load_fxn(chrom)
        gene_count = 0
        error_count = 0
        length_errors = 0
        missing_refs = 0
        # These two categories are never incremented in this function; they
        # are kept so the printed report keeps its layout.
        bad_translations = 0
        different_translations = 0
        for (_pep_id, exon_map) in exon_records.items():
            exons = list(exon_map.values())
            if exons[0].chromosome != chrom:
                continue
            exons.sort(key=lambda rec: rec.exon_start)
            (built, sentinel) = buildCodingSequence(exons, chrom_seq, 0)
            if exons[0].strand == '-1':
                # Negative strand: the assembled sequence is reverse-complemented.
                built = translate.reverseComplement(built)
            gene_count += 1
            try:
                reference = cdnas[id_fxn(sentinel)]
                surplus = len(built) - len(reference)
                if surplus == 0:
                    mismatch = built != reference
                elif surplus == 3:
                    mismatch = built[:-3] != reference
                else:
                    mismatch = True
                    length_errors += 1
                if mismatch:
                    error_count += 1
            except KeyError:
                missing_refs += 1
        print("# Processed %d genes with %d errors" % (gene_count, error_count))
        print("#     %d length errors" % (length_errors,))
        print("#     %d no comparison errors" % (missing_refs,))
        print("#     %d bad translation errors" % (bad_translations,))
        print("#     %d different translation errors" % (different_translations,))
        grand_errors += error_count
        grand_genes += gene_count
    print("# Processed %d genes with %d errors total" % (grand_genes, grand_errors))
Beispiel #2
0
def checkGenes(chromosomes, exon_records, chromosome_load_fxn, cdnas, id_fxn):
	"""Rebuild every gene's coding sequence and compare it with ``cdnas``.

	For each entry of ``chromosomes`` the chromosome data is obtained from
	``chromosome_load_fxn``.  Every record group in ``exon_records`` whose
	first record lies on that chromosome is sorted by ``exon_start``,
	assembled with ``buildCodingSequence`` and, for strand ``'-1'``,
	reverse-complemented.  The result is compared against
	``cdnas[id_fxn(sentinel_rec)]``:

	* equal lengths: any difference counts as an error;
	* assembled sequence exactly 3 longer: its last 3 characters are ignored
	  for the comparison (presumably a trailing stop codon -- confirm);
	* any other length difference: counted as an error and a length error.

	A ``KeyError`` while looking up the reference is counted as a
	"no comparison" case and is not added to the error count.  A summary is
	printed per chromosome and once overall; nothing is returned.
	"""
	grand_genes = 0
	grand_errors = 0
	for chrom in chromosomes:
		print("# Chromosome %s" % (chrom,))
		chrom_seq = chromosome_load_fxn(chrom)
		gene_count = 0
		error_count = 0
		length_errors = 0
		missing_refs = 0
		# These two categories are never incremented in this function; they
		# are kept so the printed report keeps its layout.
		bad_translations = 0
		different_translations = 0
		for (_pep_id, exon_map) in exon_records.items():
			exons = list(exon_map.values())
			if exons[0].chromosome != chrom:
				continue
			exons.sort(key=lambda rec: rec.exon_start)
			(built, sentinel) = buildCodingSequence(exons, chrom_seq, 0)
			if exons[0].strand == '-1':
				# Negative strand: the assembled sequence is reverse-complemented.
				built = translate.reverseComplement(built)
			gene_count += 1
			try:
				reference = cdnas[id_fxn(sentinel)]
				surplus = len(built) - len(reference)
				if surplus == 0:
					mismatch = built != reference
				elif surplus == 3:
					mismatch = built[:-3] != reference
				else:
					mismatch = True
					length_errors += 1
				if mismatch:
					error_count += 1
			except KeyError:
				missing_refs += 1
		print("# Processed %d genes with %d errors" % (gene_count, error_count))
		print("#     %d length errors" % (length_errors,))
		print("#     %d no comparison errors" % (missing_refs,))
		print("#     %d bad translation errors" % (bad_translations,))
		print("#     %d different translation errors" % (different_translations,))
		grand_errors += error_count
		grand_genes += gene_count
	print("# Processed %d genes with %d errors total" % (grand_genes, grand_errors))
Beispiel #3
0
	def getSequence(self, raw_seq, n_bases_upstream=0, n_bases_downstream=0):
		"""Assemble this object's sequence from its exons.

		``self.exons`` is sorted in place by ``coding_start``.  Each exon
		contributes ``pullCodingSequence(raw_seq, before, after)``; the first
		exon is asked for extra bases before it and the last exon for extra
		bases after it (a single exon gets both), every other flank is 0.

		On strand ``'-'`` the upstream/downstream counts are exchanged before
		the exons are pulled, and the concatenated result is passed through
		``translate.reverseComplement``.
		"""
		self.exons.sort(key=lambda exon: exon.coding_start)
		if self.strand == '-':
			# Downstream is really upstream on the negative strand.
			lead, trail = n_bases_downstream, n_bases_upstream
		else:
			lead, trail = n_bases_upstream, n_bases_downstream
		final_index = len(self.exons) - 1
		assembled = ''
		for (position, exon) in enumerate(self.exons):
			before = lead if position == 0 else 0
			after = trail if position == final_index else 0
			assembled += exon.pullCodingSequence(raw_seq, before, after)
		if self.strand == '-':
			assembled = translate.reverseComplement(assembled)
		return assembled
Beispiel #4
0
 def getSequence(self, raw_seq, n_bases_upstream=0, n_bases_downstream=0):
     """Assemble this object's sequence from its exons.

     ``self.exons`` is sorted in place by ``coding_start``.  Each exon
     contributes ``pullCodingSequence(raw_seq, before, after)``; the first
     exon is asked for extra bases before it and the last exon for extra
     bases after it (a single exon gets both), every other flank is 0.

     On strand ``'-'`` the upstream/downstream counts are exchanged before
     the exons are pulled, and the concatenated result is passed through
     ``translate.reverseComplement``.
     """
     self.exons.sort(key=lambda exon: exon.coding_start)
     if self.strand == '-':
         # Downstream is really upstream on the negative strand.
         lead, trail = n_bases_downstream, n_bases_upstream
     else:
         lead, trail = n_bases_upstream, n_bases_downstream
     final_index = len(self.exons) - 1
     assembled = ''
     for (position, exon) in enumerate(self.exons):
         before = lead if position == 0 else 0
         after = trail if position == final_index else 0
         assembled += exon.pullCodingSequence(raw_seq, before, after)
     if self.strand == '-':
         assembled = translate.reverseComplement(assembled)
     return assembled
Beispiel #5
0
	def getStrandAlts(self, strand):
		"""Return ``self.alt`` as read on ``strand``.

		For strand ``'-'`` a new list with the reverse complement of every
		entry of ``self.alt`` is returned; for any other value ``self.alt``
		itself is returned unchanged.
		"""
		if strand != '-':
			return self.alt
		return [translate.reverseComplement(allele) for allele in self.alt]
Beispiel #6
0
	def getStrandRef(self, strand):
		"""Return ``self.ref`` as read on ``strand``.

		For strand ``'-'`` the reverse complement of ``self.ref`` is returned;
		for any other value ``self.ref`` is returned unchanged.
		"""
		if strand != '-':
			return self.ref
		return translate.reverseComplement(self.ref)
Beispiel #7
0
 def getStrandAlts(self, strand):
     """Return ``self.alt`` as read on ``strand``.

     For strand ``'-'`` a new list with the reverse complement of every
     entry of ``self.alt`` is returned; for any other value ``self.alt``
     itself is returned unchanged.
     """
     if strand != '-':
         return self.alt
     return [translate.reverseComplement(allele) for allele in self.alt]
Beispiel #8
0
 def getStrandRef(self, strand):
     """Return ``self.ref`` as read on ``strand``.

     For strand ``'-'`` the reverse complement of ``self.ref`` is returned;
     for any other value ``self.ref`` is returned unchanged.
     """
     if strand != '-':
         return self.ref
     return translate.reverseComplement(self.ref)
Beispiel #9
0
 def test002(self):
     """reverse complement

     Checks a mixed-case sequence against its expected reverse complement
     and that applying ``reverseComplement`` twice gives back the input.
     """
     s = "ATGCatgc"
     flipped = translate.reverseComplement(s)
     # assertEqual reports both values on failure; assertTrue(a == b) only
     # reports "False is not true".
     self.assertEqual(flipped, "gcatGCAT")
     self.assertEqual(translate.reverseComplement(flipped), s)
Beispiel #10
0
def writeGenes(chromosomes, exon_records, chromosome_load_fxn, id_fxn, exclude_nt_from_boundary, outfile):
	"""Write the assembled coding sequence of every gene to ``outfile``.

	For each entry of ``chromosomes`` the chromosome data is obtained from
	``chromosome_load_fxn``.  Every record group in ``exon_records`` whose
	first record lies on that chromosome is sorted by ``exon_start``,
	assembled with ``buildCodingSequence(recs, chrseq,
	exclude_nt_from_boundary)`` and, for strand ``'-1'``,
	reverse-complemented.  Accepted sequences are written as
	``">%s\\n%s\\n" % (id_fxn(sentinel_rec), seq)``.

	A sequence is rejected, and counted as an error, when

	* its length is not a multiple of 3 (length error), or
	* ``exclude_nt_from_boundary == 0`` and ``translate.translate`` raises
	  ``translate.BioUtilsError`` (bad translation error).

	A summary is printed per chromosome and once overall; nothing is
	returned and ``outfile`` is left open.
	"""
	total_genes = 0
	total_errors = 0
	for chrom in chromosomes:
		print("# Chromosome %s" % (chrom,))
		# Load this chromosome's data
		chrseq = chromosome_load_fxn(chrom)
		n_genes = 0
		n_errors = 0
		n_wrong_length = 0
		n_bad_translation = 0
		for (_pepid, recdict) in exon_records.items():
			recs = list(recdict.values())
			if recs[0].chromosome != chrom:
				continue
			recs.sort(key=lambda e: e.exon_start)
			(seq, sentinel_rec) = buildCodingSequence(recs, chrseq, exclude_nt_from_boundary)
			# Reverse-complement the sequence if it's on the negative strand
			if recs[0].strand == '-1':
				seq = translate.reverseComplement(seq)
			n_genes += 1
			if len(seq) % 3 != 0:
				n_errors += 1
				n_wrong_length += 1
				continue
			# The translation check is only applied when no bases were
			# excluded from the boundaries.
			if exclude_nt_from_boundary == 0:
				try:
					translate.translate(seq)
				except translate.BioUtilsError:
					# Count the failure so the report below reflects the
					# sequences that were dropped instead of always showing 0.
					n_errors += 1
					n_bad_translation += 1
					continue
			# Translation is good... write it.
			outfile.write(">%s\n%s\n" % (id_fxn(sentinel_rec), seq))

		print("# Processed %d genes with %d errors" % (n_genes, n_errors))
		print("#     %d length errors" % (n_wrong_length,))
		print("#     %d bad translation errors" % (n_bad_translation,))
		total_errors += n_errors
		total_genes += n_genes
	print("# Wrote %d genes with %d errors total" % (total_genes, total_errors))
Beispiel #11
0
def writeStats(chromosomes, peptide_records, bounds, chromosome_load_fxn, outfile):
	"""Write one tab-separated line of intron/coding statistics per gene.

	A header line is written to ``outfile`` and echoed to standard output.
	For each entry of ``chromosomes`` the chromosome data is obtained from
	``chromosome_load_fxn``; every record group in ``peptide_records`` whose
	first record lies on that chromosome is sorted by ``exon_start`` and its
	coding and intron sequences are built with ``buildCodingSequence`` and
	``buildIntronSequence`` (both reverse-complemented for strand ``'-1'``).

	Columns, in order: gene, peptide and transcript IDs, chromosome, intron
	length, number of records with ``coding_end > coding_start``, coding
	fraction, three intron composition values from ``cai`` (``getGC``,
	``getDinucleotideIndex(..., 'GC')``, ``getContent(..., 'GT')``), coding
	length, and one ``1 - coding_outside / total_coding`` value per entry of
	``bounds`` as returned by ``codingOutsideBoundary``.  Values that cannot
	be computed are written as ``NA``.

	``outfile`` is flushed after every chromosome and closed at the end.
	Nothing is returned.
	"""
	import sys

	header = "gene\tpep\ttrans\tchr\tlen.intron\tn.exons\tfrac.coding\tgc.intron\tgcind.intron\tgt.intron\tlen.coding\tfrac.%s\n" % \
				  ('\tfrac.'.join([str(b) for b in bounds]),)
	outfile.write(header)
	# header already ends with a newline, so it is echoed verbatim.
	sys.stdout.write(header)
	total_genes = 0
	total_errors = 0
	for chrom in chromosomes:
		print("# Chromosome %s" % (chrom,))
		# Load this chromosome's data
		chrseq = chromosome_load_fxn(chrom)
		n_genes = 0
		# None of the error categories is incremented in this function; they
		# are kept so the printed report keeps its layout.
		n_errors = 0
		n_wrong_length = 0
		n_bad_translation = 0
		n_different_translation = 0
		for (_pepid, recdict) in peptide_records.items():
			recs = list(recdict.values())
			if recs[0].chromosome != chrom:
				continue
			recs.sort(key=lambda e: e.exon_start)
			(seq, sentinel_rec) = buildCodingSequence(recs, chrseq, 0)
			(intron_seq, _sentinel_rec_intron) = buildIntronSequence(recs, chrseq)
			# Reverse-complement the sequences if they're on the negative strand
			if recs[0].strand == '-1':
				seq = translate.reverseComplement(seq)
				intron_seq = translate.reverseComplement(intron_seq)
			n_genes += 1

			intron_length = len(intron_seq)
			if intron_length > 0:
				gc_intron = '%1.4f' % cai.getGC(intron_seq)
				frac_coding = '%1.4f' % (len(seq) / (len(seq) + float(intron_length)),)
				gcind_intron = '%1.4f' % cai.getDinucleotideIndex(intron_seq, 'GC')
				gt_intron = '%1.4f' % cai.getContent(intron_seq, 'GT')
			else:
				gc_intron = 'NA'
				gcind_intron = 'NA'
				# Must be set here too: otherwise an intron-less gene either
				# raises a NameError or reuses the previous gene's value.
				gt_intron = 'NA'
				frac_coding = '1.0' if len(seq) > 0 else 'NA'
			num_coding_exons = len([xr for xr in recs if xr.coding_end > xr.coding_start])
			fracs_inside = []
			for bound in bounds:
				(coding_outside, total_coding) = codingOutsideBoundary(recs, bound)
				if total_coding > 0:
					frac_inside_bound = "%1.4f" % (1 - float(coding_outside) / total_coding,)
				else:
					frac_inside_bound = "NA"
				fracs_inside.append(frac_inside_bound)
			line = "%s\t%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%s\n" % \
				   (sentinel_rec.gene_ID, sentinel_rec.peptide_ID, sentinel_rec.transcript_ID, sentinel_rec.chromosome, intron_length, num_coding_exons, frac_coding, gc_intron, gcind_intron, gt_intron, len(seq), '\t'.join(fracs_inside))
			outfile.write(line)

		outfile.flush()
		print("# Processed %d genes with %d errors" % (n_genes, n_errors))
		print("#     %d length errors" % (n_wrong_length,))
		print("#     %d bad translation errors" % (n_bad_translation,))
		print("#     %d different translation errors" % (n_different_translation,))
		total_errors += n_errors
		total_genes += n_genes
		# Drop the reference to this chromosome's data before the next load.
		chrseq = None
	print("# Processed %d genes with %d errors total" % (total_genes, total_errors))
	outfile.close()
Beispiel #12
0
 def test002(self):
     """reverse complement

     Checks a mixed-case sequence against its expected reverse complement
     and that applying ``reverseComplement`` twice gives back the input.
     """
     s = 'ATGCatgc'
     flipped = translate.reverseComplement(s)
     # assertEqual reports both values on failure; assertTrue(a == b) only
     # reports "False is not true".
     self.assertEqual(flipped, 'gcatGCAT')
     self.assertEqual(translate.reverseComplement(flipped), s)
Beispiel #13
0
def writeGenes(chromosomes, exon_records, chromosome_load_fxn, id_fxn,
               exclude_nt_from_boundary, outfile):
    """Write the assembled coding sequence of every gene to ``outfile``.

    For each entry of ``chromosomes`` the chromosome data is obtained from
    ``chromosome_load_fxn``.  Every record group in ``exon_records`` whose
    first record lies on that chromosome is sorted by ``exon_start``,
    assembled with ``buildCodingSequence(recs, chrseq,
    exclude_nt_from_boundary)`` and, for strand ``'-1'``,
    reverse-complemented.  Accepted sequences are written as
    ``">%s\\n%s\\n" % (id_fxn(sentinel_rec), seq)``.

    A sequence is rejected, and counted as an error, when

    * its length is not a multiple of 3 (length error), or
    * ``exclude_nt_from_boundary == 0`` and ``translate.translate`` raises
      ``translate.BioUtilsError`` (bad translation error).

    A summary is printed per chromosome and once overall; nothing is
    returned and ``outfile`` is left open.
    """
    total_genes = 0
    total_errors = 0
    for chrom in chromosomes:
        print("# Chromosome %s" % (chrom,))
        # Load this chromosome's data
        chrseq = chromosome_load_fxn(chrom)
        n_genes = 0
        n_errors = 0
        n_wrong_length = 0
        n_bad_translation = 0
        for (_pepid, recdict) in exon_records.items():
            recs = list(recdict.values())
            if recs[0].chromosome != chrom:
                continue
            recs.sort(key=lambda e: e.exon_start)
            (seq, sentinel_rec) = buildCodingSequence(
                recs, chrseq, exclude_nt_from_boundary)
            # Reverse-complement the sequence if it's on the negative strand
            if recs[0].strand == '-1':
                seq = translate.reverseComplement(seq)
            n_genes += 1
            if len(seq) % 3 != 0:
                n_errors += 1
                n_wrong_length += 1
                continue
            # The translation check is only applied when no bases were
            # excluded from the boundaries.
            if exclude_nt_from_boundary == 0:
                try:
                    translate.translate(seq)
                except translate.BioUtilsError:
                    # Count the failure so the report below reflects the
                    # sequences that were dropped instead of always showing 0.
                    n_errors += 1
                    n_bad_translation += 1
                    continue
            # Translation is good... write it.
            outfile.write(">%s\n%s\n" % (id_fxn(sentinel_rec), seq))

        print("# Processed %d genes with %d errors" % (n_genes, n_errors))
        print("#     %d length errors" % (n_wrong_length,))
        print("#     %d bad translation errors" % (n_bad_translation,))
        total_errors += n_errors
        total_genes += n_genes
    print("# Wrote %d genes with %d errors total" % (total_genes, total_errors))
Beispiel #14
0
def writeStats(chromosomes, peptide_records, bounds, chromosome_load_fxn,
               outfile):
    """Write one tab-separated line of intron/coding statistics per gene.

    A header line is written to ``outfile`` and echoed to standard output.
    For each entry of ``chromosomes`` the chromosome data is obtained from
    ``chromosome_load_fxn``; every record group in ``peptide_records`` whose
    first record lies on that chromosome is sorted by ``exon_start`` and its
    coding and intron sequences are built with ``buildCodingSequence`` and
    ``buildIntronSequence`` (both reverse-complemented for strand ``'-1'``).

    Columns, in order: gene, peptide and transcript IDs, chromosome, intron
    length, number of records with ``coding_end > coding_start``, coding
    fraction, three intron composition values from ``cai`` (``getGC``,
    ``getDinucleotideIndex(..., 'GC')``, ``getContent(..., 'GT')``), coding
    length, and one ``1 - coding_outside / total_coding`` value per entry of
    ``bounds`` as returned by ``codingOutsideBoundary``.  Values that cannot
    be computed are written as ``NA``.

    ``outfile`` is flushed after every chromosome and closed at the end.
    Nothing is returned.
    """
    import sys

    header = "gene\tpep\ttrans\tchr\tlen.intron\tn.exons\tfrac.coding\tgc.intron\tgcind.intron\tgt.intron\tlen.coding\tfrac.%s\n" % \
         ('\tfrac.'.join([str(b) for b in bounds]),)
    outfile.write(header)
    # header already ends with a newline, so it is echoed verbatim.
    sys.stdout.write(header)
    total_genes = 0
    total_errors = 0
    for chrom in chromosomes:
        print("# Chromosome %s" % (chrom,))
        # Load this chromosome's data
        chrseq = chromosome_load_fxn(chrom)
        n_genes = 0
        # None of the error categories is incremented in this function; they
        # are kept so the printed report keeps its layout.
        n_errors = 0
        n_wrong_length = 0
        n_bad_translation = 0
        n_different_translation = 0
        for (_pepid, recdict) in peptide_records.items():
            recs = list(recdict.values())
            if recs[0].chromosome != chrom:
                continue
            recs.sort(key=lambda e: e.exon_start)
            (seq, sentinel_rec) = buildCodingSequence(recs, chrseq, 0)
            (intron_seq,
             _sentinel_rec_intron) = buildIntronSequence(recs, chrseq)
            # Reverse-complement the sequences if they're on the negative strand
            if recs[0].strand == '-1':
                seq = translate.reverseComplement(seq)
                intron_seq = translate.reverseComplement(intron_seq)
            n_genes += 1

            intron_length = len(intron_seq)
            if intron_length > 0:
                gc_intron = '%1.4f' % cai.getGC(intron_seq)
                frac_coding = '%1.4f' % (
                    len(seq) / (len(seq) + float(intron_length)), )
                gcind_intron = '%1.4f' % cai.getDinucleotideIndex(
                    intron_seq, 'GC')
                gt_intron = '%1.4f' % cai.getContent(intron_seq, 'GT')
            else:
                gc_intron = 'NA'
                gcind_intron = 'NA'
                # Must be set here too: otherwise an intron-less gene either
                # raises a NameError or reuses the previous gene's value.
                gt_intron = 'NA'
                frac_coding = '1.0' if len(seq) > 0 else 'NA'
            num_coding_exons = len(
                [xr for xr in recs if xr.coding_end > xr.coding_start])
            fracs_inside = []
            for bound in bounds:
                (coding_outside,
                 total_coding) = codingOutsideBoundary(recs, bound)
                if total_coding > 0:
                    frac_inside_bound = "%1.4f" % (
                        1 - float(coding_outside) / total_coding, )
                else:
                    frac_inside_bound = "NA"
                fracs_inside.append(frac_inside_bound)
            line = "%s\t%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%s\n" % \
                (sentinel_rec.gene_ID, sentinel_rec.peptide_ID, sentinel_rec.transcript_ID, sentinel_rec.chromosome, intron_length, num_coding_exons, frac_coding, gc_intron, gcind_intron, gt_intron, len(seq), '\t'.join(fracs_inside))
            outfile.write(line)

        outfile.flush()
        print("# Processed %d genes with %d errors" % (n_genes, n_errors))
        print("#     %d length errors" % (n_wrong_length,))
        print("#     %d bad translation errors" % (n_bad_translation,))
        print("#     %d different translation errors" % (
            n_different_translation,))
        total_errors += n_errors
        total_genes += n_genes
        # Drop the reference to this chromosome's data before the next load.
        chrseq = None
    print("# Processed %d genes with %d errors total" % (total_genes,
                                                         total_errors))
    outfile.close()