Beispiel #1
0
def readfiles(indir):
    """Read every 'fa' file found in indir, keyed by sample name.

    The files are listed with iseqlib.getfiles(indir, 'fa') and each one is
    parsed with readfile().

    Args:
        indir: directory that holds the input files.

    Returns:
        dict mapping the sample name (the file name without its extension)
        to whatever readfile() returns for that file.
    """
    ext = 'fa'
    sample2freqs = {}
    for filename in iseqlib.getfiles(indir, ext):
        # Remove the extension as a suffix. str.rstrip(ext) treats its
        # argument as a set of characters, so it only produced the stem
        # because the '.' separator happened to stop it, and the follow-up
        # rstrip('.') also removed any extra trailing dots from the stem.
        sample = os.path.splitext(filename)[0]
        sample2freqs[sample] = readfile(os.path.join(indir, filename))
    return sample2freqs
Beispiel #2
0
def main():
    """Process the per-VJ 'xml' files of options.indir and write the reports.

    Steps, in order:
      1. Parse the command line (iseqlib.initOptions() plus addOptions()).
      2. For every 'xml' file in options.indir, read it with readNcbiXml()
         and hand the result to separateClusters() together with the four
         "<outbasename>-1.txt" .. "<outbasename>-4.txt" output handles.
      3. Write "<outbasename>-summary.txt" with the thresholds used and one
         "type<TAB>count" line per entry of type2count.
      4. For every entry of type2clones, write a fasta-like file (name given
         by getFaFilename()) with one record per (header, vj) pair.

    type2count and type2clones are created empty here and only passed to
    separateClusters(); presumably that function populates them.

    Raises:
        Exits through parser.error() when options.minExpFreq > 0 but
        options.sample2total was not given.
    """
    parser = iseqlib.initOptions()
    addOptions(parser)
    options, args = parser.parse_args()

    if options.minExpFreq > 0 and not options.sample2total:
        parser.error("--sample2total is required as --minExpandedFreq > 0\n")
    if options.sample2total:
        # The raw option value is replaced by what readSample2total() returns.
        options.sample2total = readSample2total(options.sample2total)

    ext = 'xml'
    infiles = iseqlib.getfiles(options.indir, ext)
    outbasename = os.path.join(options.outdir, options.basename)

    type2count = {}
    type2clones = {}
    # 'with' guarantees the four report files are closed even when reading
    # or clustering one of the inputs raises.
    with open("%s-1.txt" % outbasename, 'w') as fh1, \
            open("%s-2.txt" % outbasename, 'w') as fh2, \
            open("%s-3.txt" % outbasename, 'w') as fh3, \
            open("%s-4.txt" % outbasename, 'w') as fh4:
        for infilename in infiles:
            infile = os.path.join(options.indir, infilename)
            # Drop the extension as a suffix; rstrip(ext) would strip a set
            # of characters instead.
            vjname = os.path.splitext(infilename)[0]
            clone2hits = readNcbiXml(infile, options)
            separateClusters(clone2hits, options.sample2total,
                             options.minExpFreq, options.minExpSize,
                             options.minExpClones, options.minMotifClones,
                             vjname, fh1, fh2, fh3, fh4,
                             type2count, type2clones)

    # Summary stats:
    summaryFile = "%s-summary.txt" % outbasename
    with open(summaryFile, 'w') as fh:
        fh.write("#min similarity: %f\n"
                 "#min length: %d\n"
                 "#min size to be called expanded clones: %d\n"
                 "#min number of expanded clones required for type 1: %d\n"
                 "#min number of clones supported a single motif for type 2: %d\n"
                 % (options.minPos, options.minLen, options.minExpSize,
                    options.minExpClones, options.minMotifClones))
        for clonetype, count in type2count.items():
            fh.write("%s\t%d\n" % (clonetype, count))

    # Print type2clones:
    v2seq = {}
    if options.vfile:
        v2seq = readVfile(options.vfile)
    for clonetype, header2vj2seq in type2clones.items():
        outfile = getFaFilename(clonetype, outbasename)
        with open(outfile, 'w') as f:
            f.write("#%s\n" % clonetype)
            for header, vj2seq in header2vj2seq.items():
                # Parse the clone info once per original header. Previously
                # 'header' was overwritten inside the vj loop, so every vj
                # after the first was parsed from the already reformatted
                # header instead of the original one.
                sample, _, size = getCloneInfo(";".join(header.split(";")[:3]))
                # NOTE(review): lstrip('as') removes every leading 'a'/'s'
                # character, not just a literal "as" prefix - confirm that
                # this is intended for all sample names.
                sample = sample.lstrip('as')
                for vj, seq in vj2seq.items():
                    # The record header carries the sequence as stored in
                    # vj2seq, i.e. before any v2seq sequence is prepended.
                    f.write(">%s;%s;%d;%s\n" % (sample, seq, size, vj))

                    v = vj.split(".")[0]
                    if v in v2seq:
                        seq = v2seq[v] + seq
                    f.write("%s\n" % seq)
def main():
    """Process the per-VJ 'txt' files of options.indir and write the reports.

    Steps, in order:
      1. Parse the command line (iseqlib.initOptions() plus addOptions()).
      2. For every 'txt' file in options.indir, read it with
         readBlastclustOutfiles() and hand the clusters to
         separateClusters() together with the five "<outbasename>-1.txt" ..
         "<outbasename>-5.txt" output handles. The (total, exp) pairs it
         returns are summed over all files.
      3. Write "<outbasename>-summary.txt" with the thresholds used, the two
         totals and one line per entry of type2count, in sorted key order.
      4. Read the optional options.vfile and every 'fa' file of
         options.fadir with readFaFile().
      5. For every entry of type2clusters, write "<outbasename>-<type>.fa"
         listing the clusters by decreasing totalReads with one record per
         clone.

    type2count and type2clusters are created empty here and only passed to
    separateClusters(); presumably that function populates them.

    Raises:
        Exits through parser.error() when options.minExpFreq > 0 but
        options.sample2total was not given.
    """
    parser = iseqlib.initOptions()
    addOptions(parser)
    options, args = parser.parse_args()

    if options.minExpFreq > 0 and not options.sample2total:
        parser.error("--sample2total is required as --minExpandedFreq > 0\n")
    if options.sample2total:
        # The raw option value is replaced by what readSample2total() returns.
        options.sample2total = readSample2total(options.sample2total)

    ext = 'txt'
    infiles = iseqlib.getfiles(options.indir, ext)
    outbasename = os.path.join(options.outdir, options.basename)

    type2count = {}
    type2clusters = {}
    totalseq = 0
    expseq = 0
    # 'with' guarantees the five report files are closed even when reading
    # or clustering one of the inputs raises.
    with open("%s-1.txt" % outbasename, 'w') as fh1, \
            open("%s-2.txt" % outbasename, 'w') as fh2, \
            open("%s-3.txt" % outbasename, 'w') as fh3, \
            open("%s-4.txt" % outbasename, 'w') as fh4, \
            open("%s-5.txt" % outbasename, 'w') as fh5:
        for infilename in infiles:
            infile = os.path.join(options.indir, infilename)
            # Drop the extension as a suffix; rstrip(ext) would strip a set
            # of characters instead.
            vjname = os.path.splitext(infilename)[0]
            clusters = readBlastclustOutfiles(infile, vjname, options)
            total, exp = separateClusters(clusters, options.minExpFreq,
                                          options.minExpSize,
                                          options.minExpClones,
                                          fh1, fh2, fh3, fh4, fh5,
                                          type2count, type2clusters)
            totalseq += total
            expseq += exp

    # Summary stats:
    summaryFile = "%s-summary.txt" % outbasename
    with open(summaryFile, 'w') as fh:
        fh.write("#min size to be called expanded clones: %d\n"
                 "#min number of expanded clones required for type 1: %d\n"
                 % (options.minExpSize, options.minExpClones))
        id2info = clusterId2info()
        fh.write("Total number of clones: %d\n" % totalseq)
        fh.write("Total number of expanded clones: %d\n" % expseq)
        for clustertype in sorted(type2count):
            fh.write("%d\t%s\t%d\n" % (clustertype, id2info[clustertype],
                                       type2count[clustertype]))

    # Print type2clusters:

    # Read v files:
    v2seq = {}
    if options.vfile:
        v2seq = readFaFile(options.vfile)

    # Read fasta files:
    ext = 'fa'
    vj2id2seq = {}
    for fafilename in iseqlib.getfiles(options.fadir, ext):
        vjname = os.path.splitext(fafilename)[0]
        vj2id2seq[vjname] = readFaFile(os.path.join(options.fadir, fafilename))

    for clustertype, clusters in type2clusters.items():
        outfile = "%s-%d.fa" % (outbasename, clustertype)
        with open(outfile, 'w') as f:
            f.write("#Type: %s\n" % clustertype)
            # Clusters with the most reads come first.
            ranked = sorted(clusters, key=lambda c: c.totalReads, reverse=True)
            for cluster in ranked:
                f.write("#Cluster %s\n" % cluster.getDesc())
                id2seq = vj2id2seq[cluster.vj]
                v = cluster.vj.split(".")[0]
                vseq = v2seq[v] if v in v2seq else ""

                for clone in cluster.clones:
                    seq = id2seq[clone.desc]
                    # Skip the first len(vseq) characters; presumably the
                    # fasta sequences start with the v sequence - confirm.
                    cdr3seq = seq[len(vseq):]
                    f.write(">%s;%s;%d;%s\n" % (clone.sample, cdr3seq,
                                                clone.size, cluster.vj))
                    f.write("%s\n" % seq)