def __main__():
    """Tabulate per-intron site counts and write the intron site lists.

    Command line: ``<annotation> <divergence pickle> [-f FILE] [-n FILE]``.

    For every intron returned by ``gene.makeIntrons()`` a tab-separated row
    (gene name, intron index, intron length, sites with data, divergence
    count) is printed to stdout.  Every site that has data is also written
    as ``scaffold<TAB>position`` to the first-intron file (``-f``, default
    ``first_introns.sites``) when the intron index is 0, otherwise to the
    non-first-intron file (``-n``, default ``nonfirst_introns.sites``).
    Sites whose divergence flag is ``None`` are reported on stderr and
    skipped.
    """
    # check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()
    if len(sys.argv) < 3:
        usage()

    f = "first_introns.sites"
    n = "nonfirst_introns.sites"
    try:
        opts, _args = getopt.getopt(sys.argv[3:], "f:n:")
    except getopt.GetoptError:
        usage()  # presumably exits; otherwise `opts` is unbound below
    for opt, arg in opts:
        if opt == "-f":
            f = arg
        elif opt == "-n":
            n = arg
        else:
            print("Unrecognized option: " + opt + "\n")
            usage()

    # Context managers guarantee the site lists are flushed and every handle
    # is closed, even if parsing fails part-way through.
    with open(f, "w") as first, open(n, "w") as nonfirst, \
            open(sys.argv[1], "rb") as annot_file:
        annot = annotation.Reader(annot_file)
        # NOTE: pickle.load can execute arbitrary code; only pass trusted
        # divergence files on the command line.
        with open(sys.argv[2], "rb") as div_file:
            divergence = pickle.load(div_file)

        print("Gene\tIntronNum\tIntronLen\tSitesTot\tDiverged")
        for gene in annot:
            introns = gene.makeIntrons()
            for j, intron in enumerate(introns):
                # intron index 0 is the first intron
                sites_out = first if j == 0 else nonfirst
                siteCount = 0
                divCount = 0
                # intron bounds are treated as inclusive
                for i in range(intron[0], intron[1] + 1):
                    div = divergence[i][2]  # get the flag indicating divergence
                    if div is None:
                        sys.stderr.write("NO DATA: " + str(i) + "\n")
                        continue
                    siteCount += 1
                    # NOTE(review): a falsy flag is what gets counted here, as
                    # in the original -- confirm the flag's polarity.
                    if not div:
                        divCount += 1
                    sites_out.write(gene.scaf + "\t" + str(i) + "\n")
                # gene_name intron_number intron_length site_count divergence_count
                print(gene.name + "\t" + str(j) + "\t" +
                      str(intron[1] - intron[0] + 1) + "\t" +
                      str(siteCount) + "\t" + str(divCount))
def processAnnotation(con):
    """Load every gene from the annotation file into the ``genes`` table.

    The annotation path is taken from ``sys.argv[2]``.  Each gene becomes
    one row of (name, scaffold, direction, start, stop, expression), with
    expression initialised to 0.

    Args:
        con: Database connection (or cursor) exposing ``executemany`` with
            qmark-style (``?``) placeholders.
    """
    genes = []
    with open(sys.argv[2], 'rb') as annot_file:
        for gene in annotation.Reader(annot_file):
            gene.sortExons()
            # executemany binds one parameter sequence per row.  The previous
            # code built a pre-formatted string (with an invalid "%\s"
            # conversion), which cannot be bound to six placeholders.
            genes.append((gene.name, gene.scaf, gene.direction,
                          gene.start, gene.end, 0))
    con.executemany(
        "INSERT INTO genes (name, scaffold, direction, start, stop, expression) VALUES (?, ?, ?, ?, ?, ?)",
        genes)
def __main__():
    """Write one FASTA per annotated gene from a VCF (or pileup) file.

    Command line: ``<annotation> <vcf-or-pileup> [options]``; options are
    handled by ``processArgs``.  The module-level flag ``_r`` selects the
    pileup reader (single sample named ``"outgroup"``) instead of the VCF
    reader, ``_i`` is the ``Dels`` threshold above which a site is treated
    as a possible indel, and ``_v`` enables verbose messages on stderr.

    Records are walked in file order and matched against the current
    gene's exon list; the logic assumes both inputs are sorted by position.
    Exon positions that have no record are filled with ``N`` through
    ``Fasta.fillRemainder``.
    """
    if len(sys.argv) == 1:
        details()
        sys.exit()
    processArgs(3)

    # Context managers make sure both inputs are closed on every exit path.
    with open(sys.argv[1], 'rb') as annotation_file, \
            open(sys.argv[2], 'rb') as variant_file:
        # initialize my parsers
        annotation_reader = annotation.Reader(annotation_file)
        if not _r:
            reader = vcf.Reader(variant_file)
            names = reader.samples
        else:
            reader = pileup.Reader(variant_file)
            names = ["outgroup"]
        annotation_iter = iter(annotation_reader)

        # pull my first gene
        next_gene = getNextGene(annotation_iter)
        if next_gene is None:  # No genes = done
            sys.stderr.write("No valid genes in annotation. Exiting.\n")
            sys.exit(0)
        exons = next_gene.exons
        last_site = -1  # -1 means no site has been seen in the current exon
        myFasta = Fasta(names)

        for record in reader:
            indel = 0
            # if we're out of exons get the next gene
            if not exons:
                myFasta.writeFasta(next_gene.name, next_gene.start,
                                   next_gene.end, next_gene.direction)
                next_gene = getNextGene(annotation_iter)
                while next_gene is not None and next_gene.start < record.POS:
                    sys.stderr.write("Gene missed, read in gene at POS = " +
                                     str(record.POS) + "\n\t" +
                                     str(next_gene) + "\n")
                    next_gene = getNextGene(annotation_iter)
                if next_gene is None:  # no more genes
                    break
                exons = next_gene.exons
                myFasta = Fasta(names)  # reset our fasta data

            # if this site has some probability of having an INDEL
            if not _r and 'Dels' in record.INFO and record.INFO['Dels'] > _i:
                indel = 1
            # TODO check reference for deletions

            if record.POS < next_gene.start:
                # we haven't reached the next gene yet
                continue

            if record.POS >= exons[0][0]:
                # we're past the beginning of the next exon
                missed_middle = last_site != -1 and last_site != record.POS - 1
                missed_start = last_site == -1 and exons[0][0] != record.POS
                if missed_middle or missed_start:
                    # fill in any missing sites with N
                    # WARNING: if an exon starts right after another ended
                    # then this will N the first base in the new exon
                    num = myFasta.fillRemainder(
                        max(last_site, exons[0][0]), record.POS)
                    if _v and num > 0:
                        sys.stderr.write(
                            "'N'ed " + str(num) +
                            " sites because we missed the beginning of an exon, or something in the middle\n\t"
                            + str(record))
                if record.POS <= exons[0][1]:
                    # we're within the current exon
                    processSite(record, myFasta, indel, next_gene.direction)
                    last_site = record.POS
                else:
                    # this exon is done: fill in any missing sites with N
                    num = myFasta.fillRemainder(
                        max(last_site, exons[0][0]), exons[0][1])
                    if _v and num > 0:
                        sys.stderr.write(
                            "'N'ed " + str(num) +
                            " sites because we have passed the end of the exon\n\t"
                            + str(record))
                    last_site = -1  # we finished this exon
                    exons.pop(0)

        if next_gene is not None:
            # finish the last gene with N's
            for exon in exons:
                # The original passed the whole exon as the end coordinate;
                # every other call passes integer (start, end) positions, so
                # mirror the in-loop "exon is done" call here.
                myFasta.fillRemainder(max(last_site, exon[0]), exon[1])
            myFasta.writeFasta(next_gene.name, next_gene.start,
                               next_gene.end, next_gene.direction)
def __main__():
    """Write per-gene FASTA output for every sample in a summary file.

    Command line: ``<summary> <annotation> [options]``; options are handled
    by ``processArgs``.  Two sequences are accumulated per sample; for each
    site inside the current exon the genotype decides which of REF/ALT is
    appended to each (``N`` for any other genotype).  When a gene's last
    exon is consumed the sequences are handed to ``outputFasta`` and the
    next gene is fetched.  Gaps between consecutive sites are delegated to
    ``addNs``.  The logic assumes both files are sorted the same way.
    """
    # check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()
    if len(sys.argv) < 3:
        usage()
    processArgs(3)

    # Context managers make sure both inputs are closed on every exit path.
    with open(sys.argv[1], "rb") as summary_file, \
            open(sys.argv[2], "rb") as annotation_file:
        summaryReader = summary.Reader(summary_file)
        annotationReader = annotation.Reader(annotation_file)
        annotIter = iter(annotationReader)

        myGene = getNextGene(annotIter)
        if myGene is None:
            sys.stderr.write("No genes listed in annotation. Exiting.\n")
            sys.exit(0)
        if len(summaryReader.summary.Samples) == 0:
            sys.stderr.write(
                "No individual genotype info in this Summary file. Cannot make fastas.\n"
            )
            sys.exit(0)

        # one pair of growing sequences per sample
        seqs = {}
        for samp in summaryReader.summary.Samples:
            seqs[samp] = [[], []]
        doneScafs = []
        # invert the Genotypes mapping so genotype descriptions can be looked up
        genoDict = {v: k for k, v in summaryReader.summary.Genotypes.items()}
        pScaf = ""
        pSite = 0

        # read the infile file...
        for site in summaryReader:
            if site.CHROM != myGene.scaf:
                # Skip genes that sit on scaffolds we have already passed.
                # The None check prevents an AttributeError when the
                # annotation runs out while skipping.
                while myGene is not None and myGene.scaf in doneScafs:
                    sys.stderr.write(
                        "Missed gene %s on scaf %s. Make sure file is sorted correctly.\n"
                        % (myGene.name, myGene.scaf))
                    myGene = getNextGene(annotIter)
                    for samp in summaryReader.summary.Samples:
                        seqs[samp] = [[], []]
                if myGene is None:
                    break
                if site.CHROM != myGene.scaf:
                    # haven't reached this scaf yet, skip along in the summary
                    continue

            if pScaf == "" or pScaf != site.CHROM:
                doneScafs.append(pScaf)
                pScaf = site.CHROM
                pSite = 0

            myGene = addNs(seqs, pSite, site.POS, myGene, annotIter)
            if myGene is None:
                # ran out of genes
                break

            if site.POS < myGene.exons[0][0]:
                pass
            elif site.POS <= myGene.exons[0][1]:
                for samp, geno in site.Genotypes.items():
                    if geno == genoDict['heterozygote']:
                        seqs[samp][0].append(site.REF)
                        seqs[samp][1].append(site.ALT)
                    elif geno == genoDict['homozygote reference']:
                        seqs[samp][0].append(site.REF)
                        seqs[samp][1].append(site.REF)
                    elif geno == genoDict['homozygote alternate']:
                        seqs[samp][0].append(site.ALT)
                        seqs[samp][1].append(site.ALT)
                    else:
                        seqs[samp][0].append("N")
                        seqs[samp][1].append("N")

            if site.POS >= myGene.exons[0][1]:
                # reached (or passed) the end of the current exon
                myGene.exons.pop(0)
                if not myGene.exons:
                    # print the seqs to a file
                    outputFasta(myGene, seqs)
                    # grab new gene
                    myGene = getNextGene(annotIter)
                    # check it
                    if myGene is None:
                        break
                    # reset seqs
                    for samp in summaryReader.summary.Samples:
                        seqs[samp] = [[], []]
            pSite = site.POS