Esempio n. 1
0
def __main__():
    """Tally site and divergence counts per intron and list the intron sites.

    Command line: ``<annotation file> <divergence pickle> [-f FILE] [-n FILE]``.

    * ``sys.argv[1]`` is read through ``annotation.Reader``.
    * ``sys.argv[2]`` is a pickle that is indexed by position; element ``[2]``
      of each entry is used as the divergence flag (``None`` means no data).
    * ``-f`` / ``-n`` override the output file names for sites in first
      introns and in all other introns respectively.

    One tab-separated row per intron is printed to stdout; positions without
    data are reported on stderr and excluded from the counts and site files.
    """
    # check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()

    # NOTE(review): usage() is presumably expected to terminate the process;
    # it is defined outside this block -- confirm.
    if len(sys.argv) < 3:
        usage()

    f = "first_introns.sites"
    n = "nonfirst_introns.sites"

    try:
        opts, _ = getopt.getopt(sys.argv[3:], "f:n:")
    except getopt.GetoptError:
        usage()

    for opt, arg in opts:
        if opt == "-f":
            f = arg
        elif opt == "-n":
            n = arg
        else:
            print("Unrecognized option: " + opt + "\n")
            usage()

    # Context managers guarantee the site files are flushed and every handle
    # is closed, even if processing fails part-way through.
    with open(f, "w") as first, open(n, "w") as nonfirst, \
            open(sys.argv[1], "rb") as annot_file:
        annot = annotation.Reader(annot_file)

        # SECURITY NOTE: unpickling can execute arbitrary code; only run this
        # on divergence files from a trusted source.
        with open(sys.argv[2], "rb") as div_file:
            divergence = pickle.load(div_file)

        print("Gene\tIntronNum\tIntronLen\tSitesTot\tDiverged")
        for gene in annot:
            #print gene_name intron_number intron_length site_count divergence_count
            introns = gene.makeIntrons()
            for j, intron in enumerate(introns):
                # sites of the first intron (j == 0) go to one file, all
                # other introns share the second file
                out = first if j == 0 else nonfirst
                siteCount = 0
                divCount = 0
                # intron bounds are treated as inclusive on both ends
                for i in range(intron[0], intron[1] + 1):
                    div = divergence[i][2]  #get the flag indicating divergence
                    if div is None:
                        sys.stderr.write("NO DATA: " + str(i) + "\n")
                        continue
                    siteCount += 1
                    # NOTE(review): the counter reported as "Diverged" is
                    # incremented when the flag is falsy -- confirm the
                    # polarity of the stored flag.
                    if not div:
                        divCount += 1
                    out.write(gene.scaf + "\t" + str(i) + "\n")

                print(gene.name + "\t" + str(j) + "\t" +
                      str(intron[1] - intron[0] + 1) + "\t" + str(siteCount) +
                      "\t" + str(divCount))
Esempio n. 2
0
def processAnnotation(con):
    """Load every gene of the annotation file into the ``genes`` table.

    The annotation is read from ``sys.argv[2]`` through ``annotation.Reader``.
    Each gene has its exons sorted and is then inserted as one row with the
    columns (name, scaffold, direction, start, stop, expression), where
    ``expression`` is initialised to 0.

    Args:
        con: object exposing ``executemany(sql, rows)`` with ``?`` (qmark)
            placeholders -- presumably a DB-API connection or cursor such as
            sqlite3; confirm against the caller.
    """
    rows = []
    with open(sys.argv[2], 'rb') as annot_file:
        for gene in annotation.Reader(annot_file):
            gene.sortExons()
            # One parameter tuple per row: the statement below has six "?"
            # placeholders, so the driver needs six separate values rather
            # than a single pre-formatted SQL fragment. Binding the values
            # also avoids any manual quoting of the gene fields.
            rows.append((gene.name, gene.scaf, gene.direction, gene.start,
                         gene.end, 0))
    con.executemany(
        "INSERT INTO genes (name, scaffold, direction, start, stop, expression) VALUES (?, ?, ?, ?, ?, ?)",
        rows)
Esempio n. 3
0
def __main__():
    """Build per-gene exon sequences from a sorted VCF or pileup file.

    ``sys.argv[1]`` is the annotation file and ``sys.argv[2]`` the site file.
    The site file is parsed with ``vcf.Reader`` when the module-level flag
    ``_r`` is false and with ``pileup.Reader`` (single sample "outgroup")
    when it is true. ``_i`` is the threshold applied to the VCF ``Dels``
    INFO field and ``_v`` enables verbose messages on stderr; all three are
    presumably set by ``processArgs`` -- confirm.

    Genes and site records are consumed in lock-step: each record is matched
    against the first remaining exon of the current gene, gaps are filled
    through ``Fasta.fillRemainder`` and a finished gene is emitted with
    ``Fasta.writeFasta``.
    """
    if len(sys.argv) == 1:
        details()
        sys.exit()

    processArgs(3)

    # Both inputs are closed on every exit path, including sys.exit().
    with open(sys.argv[1], 'rb') as annotation_file, \
            open(sys.argv[2], 'rb') as sites_file:
        #initialize my parsers
        annotation_reader = annotation.Reader(annotation_file)
        if not _r:
            reader = vcf.Reader(sites_file)
            names = reader.samples
        else:
            reader = pileup.Reader(sites_file)
            names = ["outgroup"]

        annotation_iter = annotation_reader.__iter__()

        #pull my first gene
        next_gene = getNextGene(annotation_iter)

        if next_gene is None:  #No genes = done
            sys.stderr.write("No valid genes in annotation. Exiting.\n")
            sys.exit(0)

        exons = next_gene.exons
        last_site = -1  # -1 means "no site seen yet in the current exon"
        myFasta = Fasta(names)

        for record in reader:
            indel = 0
            #if we're out of exons get the next gene
            if not exons:
                myFasta.writeFasta(next_gene.name, next_gene.start,
                                   next_gene.end, next_gene.direction)
                next_gene = getNextGene(annotation_iter)
                # skip genes that start before the current record
                while next_gene is not None and next_gene.start < record.POS:
                    sys.stderr.write("Gene missed, read in gene at POS = " +
                                     str(record.POS) + "\n\t" +
                                     str(next_gene) + "\n")
                    next_gene = getNextGene(annotation_iter)

                if next_gene is None:  #no more genes
                    break
                exons = next_gene.exons

                myFasta = Fasta(names)  #reset our fasta data

            #if this site has some probability of having an INDEL
            if not _r and 'Dels' in record.INFO and record.INFO['Dels'] > _i:
                indel = 1
                #TODO check reference for deletions

            if record.POS < next_gene.start:  #if we haven't reached the next gene yet
                continue

            if record.POS >= exons[0][0]:  #if we're past the beginning of the next exon
                if (last_site != -1 and last_site != record.POS - 1) or (
                        last_site == -1 and exons[0][0] != record.POS):
                    #if we missed some sites in the middle
                    #OR
                    #if we missed sites at the beginning of a gene
                    num = myFasta.fillRemainder(
                        max(last_site, exons[0][0]),
                        record.POS)  #fill in any missing sites with N
                    #WARNING: if an exon starts right after another ended then this will N the first base in the new exon
                    if _v and num > 0:
                        sys.stderr.write(
                            "'N'ed " + str(num) +
                            " sites because we missed the beginning of an exon, or something in the middle\n\t"
                            + str(record))
                if record.POS <= exons[0][1]:  #if we're within the current exon
                    processSite(record, myFasta, indel, next_gene.direction)
                    last_site = record.POS
                else:  #this exon is done
                    num = myFasta.fillRemainder(
                        max(last_site, exons[0][0]),
                        exons[0][1])  #fill in any missing sites with N
                    if _v and num > 0:
                        sys.stderr.write(
                            "'N'ed " + str(num) +
                            " sites because we have passed the end of the exon\n\t"
                            + str(record))
                    last_site = -1  #we finished this exon
                    exons.pop(0)

        if next_gene is not None:  #finish the last gene with N's
            for exon in exons:
                # Pad each leftover exon exactly like the in-loop "exon is
                # done" branch: from the last site seen (or the exon start)
                # up to the exon end. The previous code passed the whole
                # exon pair as the end position.
                myFasta.fillRemainder(max(last_site, exon[0]), exon[1])
                last_site = -1  # later exons have no sites at all
            myFasta.writeFasta(next_gene.name, next_gene.start,
                               next_gene.end, next_gene.direction)
Esempio n. 4
0
def __main__():
    """Write per-gene FASTA sequences from a summary file and an annotation.

    ``sys.argv[1]`` is parsed with ``summary.Reader`` and ``sys.argv[2]`` with
    ``annotation.Reader``. For every sample two sequences are accumulated
    (``seqs[sample][0]`` and ``seqs[sample][1]``); at each site inside the
    current exon the genotype decides which of REF / ALT / "N" is appended to
    each of them. When a gene has no exons left its sequences are handed to
    ``outputFasta`` and the next gene is fetched.

    Both inputs are expected to be sorted consistently; genes on scaffolds
    that have already been passed are reported on stderr and skipped.
    """
    #check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()

    # NOTE(review): usage() is presumably expected to terminate the process;
    # it is defined outside this block -- confirm.
    if len(sys.argv) < 3:
        usage()

    processArgs(3)

    # Both inputs are closed on every exit path, including sys.exit().
    with open(sys.argv[1], "rb") as summaryFile, \
            open(sys.argv[2], "rb") as annotationFile:
        summaryReader = summary.Reader(summaryFile)

        annotationReader = annotation.Reader(annotationFile)
        annotIter = annotationReader.__iter__()

        myGene = getNextGene(annotIter)

        if myGene is None:
            sys.stderr.write("No genes listed in annotation. Exiting.\n")
            sys.exit(0)

        if len(summaryReader.summary.Samples) == 0:
            sys.stderr.write(
                "No individual genotype info in this Summary file. Cannot make fastas.\n"
            )
            sys.exit(0)

        # two sequences per sample, one per allele slot
        seqs = {samp: [[], []] for samp in summaryReader.summary.Samples}

        # scaffolds the summary has already moved past; only used for
        # membership tests, hence a set
        doneScafs = set()

        # invert the Genotypes mapping so it can be looked up by description
        genoDict = {v: k for k, v in summaryReader.summary.Genotypes.items()}
        pScaf = ""
        pSite = 0
        #read the infile file...
        for site in summaryReader:
            if site.CHROM != myGene.scaf:
                # The annotation may run out while skipping missed genes, so
                # myGene has to be checked before it is dereferenced.
                while myGene is not None and myGene.scaf in doneScafs:
                    sys.stderr.write(
                        "Missed gene %s on scaf %s. Make sure file is sorted correctly.\n"
                        % (myGene.name, myGene.scaf))
                    myGene = getNextGene(annotIter)

                if myGene is None:  # ran out of genes
                    break

                for samp in summaryReader.summary.Samples:
                    seqs[samp] = [[], []]

                if site.CHROM != myGene.scaf:  #haven't reached this scaf yet, skip along in the summary
                    continue

            if pScaf == "" or pScaf != site.CHROM:
                # entering a new scaffold: remember the previous one as done
                doneScafs.add(pScaf)
                pScaf = site.CHROM
                pSite = 0

            myGene = addNs(seqs, pSite, site.POS, myGene, annotIter)

            if myGene is None:
                break

            # only sites inside the current (first remaining) exon contribute
            if myGene.exons[0][0] <= site.POS <= myGene.exons[0][1]:
                for samp, geno in site.Genotypes.items():
                    if geno == genoDict['heterozygote']:
                        seqs[samp][0].append(site.REF)
                        seqs[samp][1].append(site.ALT)
                    elif geno == genoDict['homozygote reference']:
                        seqs[samp][0].append(site.REF)
                        seqs[samp][1].append(site.REF)
                    elif geno == genoDict['homozygote alternate']:
                        seqs[samp][0].append(site.ALT)
                        seqs[samp][1].append(site.ALT)
                    else:
                        seqs[samp][0].append("N")
                        seqs[samp][1].append("N")

            # at or past the exon end: this exon is finished
            if site.POS >= myGene.exons[0][1]:
                myGene.exons.pop(0)

            if not myGene.exons:
                #print the seqs to a file
                outputFasta(myGene, seqs)
                #grab new gene
                myGene = getNextGene(annotIter)
                #check it
                if myGene is None:
                    break
                #reset seqs
                for samp in summaryReader.summary.Samples:
                    seqs[samp] = [[], []]

            pSite = site.POS