Esempio n. 1
0
def main():
    """Write an all-against-all sample similarity matrix.

    Parses the command line, loads the samples found in ``options.indir``
    and writes a tab-separated square matrix to
    ``<options.outdir>/similarity.txt``: a header row with the sorted sample
    names, then one row per sample holding the value returned by
    ``pairwiseSimilarity`` for every pair of samples (each sample is also
    compared with itself).
    """
    parser = iseqlib.initOptions()
    addOptions(parser)

    options, args = parser.parse_args()

    # Read mode handed to iseqlib.readfiles; its meaning is defined there.
    mode = 1
    samples = iseqlib.readfiles(options.indir, options.mincount, mode)
    sample2aa2v2j = iseqlib.getsample2aa2v2j(samples)

    #### MORISITA-HORN SIMILARITY INDEX ####
    names = sorted([s.name for s in samples])
    # If two samples share a name, the later one wins (same as a plain loop).
    name2sample = dict((s.name, s) for s in samples)

    outfile = os.path.join(options.outdir, "similarity.txt")
    # 'with' guarantees the file is closed even if a similarity
    # computation raises part-way through the matrix.
    with open(outfile, 'w') as f:
        f.write("\t%s\n" % '\t'.join(names))
        for rowName in names:
            s1 = name2sample[rowName]
            aa2v2j1 = sample2aa2v2j[s1.name]
            f.write("%s" % rowName)
            for colName in names:
                s2 = name2sample[colName]
                aa2v2j2 = sample2aa2v2j[s2.name]
                similarity = pairwiseSimilarity(s1, s2, aa2v2j1, aa2v2j2)
                f.write("\t%f" % similarity)
            f.write("\n")
Esempio n. 2
0
def main():
    """Read the table in ``--infile`` and write it out via ``getTab``.

    The column names and the sample-to-row mapping returned by ``readfile``
    are passed, together with the ``--outfile`` path, to ``getTab``.
    """
    parser = iseqlib.initOptions()
    # (short flag, long flag, destination attribute, help text)
    optionSpecs = (
        ('-i', '--infile', 'infile', 'Input file'),
        ('-o', '--outfile', 'outfile', 'Output file'),
    )
    for shortFlag, longFlag, dest, helpText in optionSpecs:
        parser.add_option(shortFlag, longFlag, dest=dest, help=helpText)

    options, _ = parser.parse_args()
    colnames, sample2row = readfile(options.infile)
    getTab(options.outfile, colnames, sample2row)
Esempio n. 3
0
def main():
    """Command-line driver for the V/J gene usage analyses.

    After option parsing and validation it reads the samples, determines
    the genes shared by all samples and the selected subset of those, and
    then runs whichever analyses were requested: PCA (``options.pca``),
    t-tests (``options.ttest``), V and J usage plots (unless
    ``options.noplot``) and per-sample VJ plots (``options.vj``).
    """
    parser = immunoseqLib.initOptions()
    initOptions(parser)
    immunoseqLib.initPlotOptions(parser)
    options, args = parser.parse_args()
    checkOptions(args, options, parser)
    immunoseqLib.checkPlotOptions(options, parser)

    samples, stds = readfiles(options.indir, options.noheader)
    # Genes that are present in all samples
    type2intersectGenes = getIntersect(samples)
    # Only the genes of interest among those present in all samples
    type2selectedGenes = getSelectedGenes(type2intersectGenes, options.vs, options.js)

    group2samples = readGroup2samples(options.group2samples)

    # Intersect VJ (adds intersectVJusage, normIntersectVJusage)
    if options.ttest or options.vj or options.pca:
        intersectVJ(samples, type2selectedGenes)

    # PCA: one transformed matrix for V usage, one for VJ usage
    if options.pca:
        for usageType in ('v', 'vj'):
            pcaOut = os.path.join(options.outdir, "%sPca" % usageType)
            getPcaTransformedMatrix(samples, group2samples, type2intersectGenes, options.vjs, usageType, options.abs, pcaOut, options)

    # Student's t test
    if options.ttest:
        ttestOut = os.path.join(options.outdir, "ttest.txt")
        ttests(samples, group2samples, ttestOut, type2selectedGenes, options.vjs, options.ttestTargetGroup, options.abs, options.pval)

    # HACK: put the samples into the user-requested order; samples that are
    # not named in the order are appended afterwards in their original order.
    # NOTE(review): options.sampleOrder is iterated element by element, so it
    # is presumably already a list of names (split elsewhere) - confirm.
    sortedSamples = samples
    if options.sampleOrder:
        sortedSamples = []
        for wantedName in options.sampleOrder:
            sortedSamples.extend([smp for smp in samples if smp.name == wantedName])
        for smp in samples:
            if smp not in sortedSamples:
                sortedSamples.append(smp)

    if not options.noplot:
        for geneType in ('v', 'j'):
            drawUsageDist(sortedSamples, stds, options, geneType, type2selectedGenes[geneType])

    if options.vj:
        minvj, maxvj = getMinMaxVJusage(samples, options.abs)
        for smp in samples:
            drawVJ(smp, type2selectedGenes['v'], type2selectedGenes['j'], options, minvj, maxvj)
def main():
    """Read NCBI XML hits and print them as a TeX table.

    ``options.keywords`` selects the keyword groups: ``'-'`` uses the
    defaults from ``getDefaultKeywords``, any other non-empty value is read
    with ``iseqlib.readGroup2samples``, and an empty value leaves the
    mapping empty.
    """
    parser = iseqlib.initOptions()
    addOptions(parser)
    options, args = parser.parse_args()

    # key = keywordGroup, val = list of keywords
    if options.keywords == '-':
        group2keywords = getDefaultKeywords()
    elif options.keywords:
        # The reverse (keyword -> group) mapping is not needed here.
        group2keywords, _ = iseqlib.readGroup2samples(options.keywords)
    else:
        group2keywords = {}

    clones, clone2hits = readNcbiXml(options.infile, options.minPos, options.minNumSamples, options.minLen)
    printTexTab(clones, clone2hits, group2keywords, options, options.outdir)
Esempio n. 5
0
def main():
    """Parse the options and run the jobTree pipeline rooted at ``Setup``.

    Raises:
        RuntimeError: if ``startJobTree`` reports a non-zero number of
            failed jobs.
    """
    parser = iseqlib.initOptions()
    Stack.addJobTreeOptions(parser)
    iseqlib.initPlotOptions(parser)
    addOptions(parser)

    options, args = parser.parse_args()
    iseqlib.checkPlotOptions(options, parser)

    # startJobTree's return value is used as the count of failed jobs.
    failedJobs = Stack(Setup(options)).startJobTree(options)
    if failedJobs:
        raise RuntimeError("The jobtree contains %d failed jobs.\n" % failedJobs)
Esempio n. 6
0
def main():
    """Compute statistics from an aa-to-nt sequence file and a tsv directory.

    The J-gene options hold file paths on the command line; when set they
    are replaced on ``options`` by the parsed contents before the tsv files
    are read. The results of ``readInfile`` and ``readTsvFiles`` are passed
    to ``getStats`` together with the output file path.
    """
    # Both J-gene options share the same help text.
    extendHelp = 'Required argument if would like to extend the 3\' end of sequences to get the full CDR3'

    parser = iseqlib.initOptions()
    parser.add_option('-i', '--infile', dest='infile',
                      help='amino acid to nucleotide sequences file. Format: ')
    parser.add_option('-t', '--tsvdir', dest='tsvdir',
                      help='Directory of tsv files')
    parser.add_option('-o', '--outfile', dest='outfile',
                      help='Output file')
    parser.add_option('--jgenes', dest='jgenes',
                      default='/hive/users/nknguyen/immuno/imgt/TRB/jgenes.txt',
                      help=extendHelp)
    parser.add_option('--jgeneToPos', dest='jgene2pos',
                      default='/hive/users/nknguyen/immuno/imgt/TRB/jmappedF.txt',
                      help=extendHelp)

    options, _ = parser.parse_args()

    # Swap each J-gene file path for the data parsed from it.
    jgenesPath = options.jgenes
    if jgenesPath:
        options.jgenes = readGeneFile(jgenesPath)
    jgene2posPath = options.jgene2pos
    if jgene2posPath:
        options.jgene2pos = readGene2pos(jgene2posPath)

    getStats(readInfile(options.infile),
             readTsvFiles(options.tsvdir, options),
             options.outfile)
Esempio n. 7
0
def main():
    """Build LaTeX tables of overlap statistics from a directory of files.

    Every regular file directly inside ``--indir`` is parsed with
    ``readFile``. Depending on ``--mode`` (a comma-separated list), mode
    ``1`` writes the table for one statistic across cutoffs via
    ``getCutoffsLatexTab`` and mode ``2`` writes the tables for all
    statistics at the requested cutoffs via ``getAllStatsLatexTabs``.

    Raises:
        ValueError: if ``--indir`` is given but is not a directory.
    A missing ``--indir`` is reported through ``parser.error``.
    """
    parser = iseqlib.initOptions()
    iseqlib.initPlotOptions(parser)
    parser.add_option('-i', '--indir', dest='indir')
    parser.add_option('-o', '--outdir', dest='outdir', default='.')

    parser.add_option('-m', '--mode', dest='mode', default='1,2', type='string', help='Specify how you would like to aggregate the data. Should be a comma seperated list of any of the valid choices: [1, 2]. Mode 1 is one statistics of interest across different cutoffs. Mode 2 is different statistics for 1 cutoff.')

    parser.add_option('-s', '--stats_type', dest='stats_type', default='readAvr', help='Option for mode 1. Specify which overlapping statistics of interest to print out. Default = %default. Valid values are [clone1, clone2, cloneAvr, read1, read2, readAvr], where clone# is percentage of clones in sample # that passed each cutoff and are also in the other sample; cloneAvr is average of clone1 and clone2; read# is percentage of reads in clones of sample # that passed cutoff and also in the other sample; readAvr: average of read1 and read2')
    parser.add_option('-c', '--cutoffs', dest='cutoffs', default='all', help='Option for mode 2. Comma separated list of cutoffs of interest. Default=%default')
    parser.add_option('-a', '--sampleOrder', dest='sampleOrder')
    parser.add_option('-l', '--latex', dest='latex', action='store_true', default=False)
    parser.add_option('-g', '--groupToSamples', dest='group2samples')

    options, args = parser.parse_args()
    iseqlib.checkPlotOptions(options, parser)

    if options.sampleOrder:
        options.sampleOrder = options.sampleOrder.split(',')

    indir = options.indir
    # --indir has no default; without this check a missing option reaches
    # os.path.isdir(None) and fails with an unhelpful TypeError.
    if indir is None:
        parser.error("Input directory (-i/--indir) is required\n")
    if not os.path.isdir(indir):
        raise ValueError("Input directory %s is not a directory\n" % indir)
    options.mode = options.mode.split(',')
    options.cutoffs = options.cutoffs.split(',')

    # Parse every regular file directly under indir; sub-directories are skipped.
    exps = []
    for fname in os.listdir(indir):
        path = os.path.join(indir, fname)
        if os.path.isdir(path):
            continue
        exps.append(readFile(path))

    if options.group2samples:
        options.group2samples = readGroup2Samples(options.group2samples)

    orfile = os.path.join(options.outdir, "overlapReads-%s.tex" % options.stats_type)
    if '1' in options.mode:
        getCutoffsLatexTab(exps, orfile, options.stats_type, options.sampleOrder)

    if '2' in options.mode:
        getAllStatsLatexTabs(exps, options.outdir, options.cutoffs, options.sampleOrder, options.latex)
Esempio n. 8
0
def main():
    """Plot the table in ``--infile`` and/or convert it to a LaTeX table.

    Plots are drawn unless ``--noplot`` is given. With ``--table`` a LaTeX
    table is written to ``<outdir>/<name>.tex``, where ``<name>`` is the
    input file's base name up to its first dot.
    """
    parser = iseqlib.initOptions()
    parser.add_option('-i', '--infile', dest='infile', default='-',
                      help='Input file. Default is stdin')
    parser.add_option('-o', '--outdir', dest='outdir', default='.',
                      help='Output directory. Default is current directory')
    parser.add_option('--noplot', dest='noplot', action='store_true', default=False,
                      help='If specified, do not draw plot')
    parser.add_option('-t', '--table', dest='table', action='store_true', default=False,
                      help='If specified, make latex table')

    iseqlib.initPlotOptions(parser)
    options, _ = parser.parse_args()
    iseqlib.checkPlotOptions(options, parser)

    rowname2row, index2colname = readfile(options.infile)

    wantPlots = not options.noplot
    if wantPlots:
        drawPlots(options, options.outdir, rowname2row, index2colname)

    if options.table:
        # Base name of the input file, cut at its first dot
        stem = os.path.basename(options.infile).partition('.')[0]
        texfile = os.path.join(options.outdir, stem + ".tex")
        getLatexTab(texfile, rowname2row, index2colname)
def main():
    """Read NCBI XML hits and print them as both a tab and a TeX table.

    ``options.keywords`` selects the keyword groups: ``"-"`` uses the
    defaults from ``getDefaultKeywords``, any other non-empty value is read
    with ``iseqlib.readGroup2samples``. If ``options.sample2host`` is set,
    the mapping read by ``readSample2host`` is passed to ``readNcbiXml``.
    Both tables use ``<options.outdir>/<options.basename>`` as output base.
    """
    parser = iseqlib.initOptions()
    addOptions(parser)
    options, args = parser.parse_args()

    # key = keywordGroup, val = list of keywords
    if options.keywords == "-":
        group2keywords = getDefaultKeywords()
    elif options.keywords:
        # The reverse (keyword -> group) mapping is not needed here.
        group2keywords, _ = iseqlib.readGroup2samples(options.keywords)
    else:
        group2keywords = {}

    if options.sample2host:
        group2sample2host = readSample2host(options.sample2host)
    else:
        group2sample2host = {}

    clones, clone2hits = readNcbiXml(options, group2sample2host)
    outbasename = os.path.join(options.outdir, options.basename)
    for printer in (printTab, printTexTab):
        printer(clones, clone2hits, group2keywords, options, outbasename)
Esempio n. 10
0
def main():
    """Produce the requested shared-sequence outputs for a set of samples.

    Depending on the options this draws the distribution
    (``options.drawdist``), writes fasta files into
    ``<outdir>/fasta-atleast<minsam>sams`` (``options.fasta``) and/or
    prints the clone matrices (``options.clonematrix``).
    """
    parser = iseqlib.initOptions()
    iseqlib.initPlotOptions(parser)
    addOptions(parser)

    options, args = parser.parse_args()
    iseqlib.checkPlotOptions(options, parser)

    samples = readfiles(options.indir, options.count, options.translate)

    if options.drawdist:
        drawDist(samples, options)

    if options.fasta:
        faoutdir = os.path.join(options.outdir, "fasta-atleast%dsams" % options.minsam)
        # Create the directory (and any missing parents) directly instead of
        # via a shell "mkdir -p" string, which breaks on paths containing
        # spaces or shell metacharacters.
        try:
            os.makedirs(faoutdir)
        except OSError:
            # Like "mkdir -p", an already existing directory is not an error.
            if not os.path.isdir(faoutdir):
                raise
        filterByNumSampleAll(samples, options.minsam, faoutdir, options.freq)

    if options.clonematrix:
        printCloneMatrixAll(options.outdir, samples, options.minsam, options.freqTransform)
Esempio n. 11
0
def main():
    """Separate the clones of each XML input file into categories.

    For every file returned by ``iseqlib.getfiles(options.indir, 'xml')``
    the hits are read with ``readNcbiXml`` and handed to
    ``separateClusters``, which receives the four per-category output files
    ``<outbasename>-1.txt`` .. ``<outbasename>-4.txt`` and fills
    ``type2count`` / ``type2clones``. Afterwards a summary file
    ``<outbasename>-summary.txt`` and one fasta file per category (named by
    ``getFaFilename``) are written. ``<outbasename>`` is
    ``<options.outdir>/<options.basename>``.
    """
    parser = iseqlib.initOptions()
    addOptions(parser)
    options, args = parser.parse_args()

    if options.minExpFreq > 0 and not options.sample2total:
        parser.error("--sample2total is required as --minExpandedFreq > 0\n")
    if options.sample2total:
        options.sample2total = readSample2total(options.sample2total)

    ext = 'xml'
    infiles = iseqlib.getfiles(options.indir, ext)
    outbasename = os.path.join(options.outdir, options.basename)

    type2count = {}
    type2clones = {}
    # One output file per category; closed in 'finally' so the handles are
    # released even if reading or separating one of the inputs fails.
    fhs = [open("%s-%d.txt" % (outbasename, i), 'w') for i in (1, 2, 3, 4)]
    try:
        for fname in infiles:
            infile = os.path.join(options.indir, fname)
            # NOTE(review): rstrip() strips a character set, not a suffix;
            # this yields the name without ".xml" as long as the file name
            # ends in "." + ext - confirm iseqlib.getfiles guarantees that.
            vjname = fname.rstrip(ext).rstrip('.')
            clone2hits = readNcbiXml(infile, options)
            separateClusters(clone2hits, options.sample2total, options.minExpFreq, options.minExpSize, options.minExpClones, options.minMotifClones, vjname, fhs[0], fhs[1], fhs[2], fhs[3], type2count, type2clones)
    finally:
        for fh in fhs:
            fh.close()

    #summary stats:
    summaryFile = "%s-summary.txt" % outbasename
    with open(summaryFile, 'w') as fh:
        fh.write("#min similarity: %f\n#min length: %d\n#min size to be called expanded clones: %d\n#min number of expanded clones required for type 1: %d\n#min number of clones supported a single motif for type 2: %d\n" % (options.minPos, options.minLen, options.minExpSize, options.minExpClones, options.minMotifClones))
        for cloneType, count in type2count.iteritems():
            fh.write("%s\t%d\n" % (cloneType, count))

    #Print type2clones:
    v2seq = {}
    if options.vfile:
        v2seq = readVfile(options.vfile)
    for cloneType, header2vj2seq in type2clones.iteritems():
        outfile = getFaFilename(cloneType, outbasename)
        with open(outfile, 'w') as f:
            f.write("#%s\n" % cloneType)
            for header, vj2seq in header2vj2seq.iteritems():
                for vj, seq in vj2seq.iteritems():
                    # Always parse the ORIGINAL clone header. The reformatted
                    # header gets its own name so that a second V-J entry of
                    # the same clone is not parsed from already rewritten text.
                    sample, cloneId, size = getCloneInfo(";".join(header.split(";")[:3]))
                    # NOTE(review): lstrip('as') removes any leading 'a'/'s'
                    # characters, not just a literal "as" prefix - confirm
                    # this is the intended sample-name clean-up.
                    faHeader = "%s;%s;%d;%s" % (sample.lstrip('as'), seq, size, vj)
                    f.write(">%s\n" % faHeader)

                    # Prepend the V sequence when one is known for this V gene
                    v = vj.split(".")[0]
                    if v in v2seq:
                        seq = v2seq[v] + seq
                    f.write("%s\n" % seq)
Esempio n. 12
0
def main():
    """Convert the ``*.tsv`` files of a directory to fasta and/or SNP reports.

    For every file in ``--indir`` whose second dot-separated name component
    is ``tsv`` the sequences are read with ``readFile``. Unless
    ``--noFasta`` is given they are printed to ``<outdir>/<sample>.fa``;
    with ``--snp`` the SNPs are written to ``<outdir>/<sample>.snp.txt`` and,
    with ``--filterBySnpNoise``, the noise-filtered reads go to
    ``<outdir>/<sample>-filtered.fa``. Normalized counts are used unless
    ``--raw`` is given or some file read so far lacked them.
    """
    parser = iseqlib.initOptions()
    parser.add_option('-i', '--indir', dest='indir')
    parser.add_option('-o', '--outdir', dest='outdir')
    parser.add_option('-r', '--raw', dest='raw', default=False, action='store_true', help='If specified, use raw counts instead of normalized counts. Default=%default')
    parser.add_option('-l', '--long', dest='long', default=False, action='store_true', help='If specified, print inframe (nt/aa) sequences instead of just the CDR3 parts. Default=%default')
    parser.add_option('-n', '--nucleotide', dest='nuc', default=False, action='store_true', help='If specified, print nucleotide sequences instead of amino acid sequences. Default=amino acid')
    parser.add_option('-p', '--productive', dest='productive', default=False, action="store_true", help="If specified, only prints out productive sequences. Default is to print all sequences")
    parser.add_option('-b', '--nonProductive', dest='nonProductive', default=False, action="store_true", help="If specified, only prints out non-productive sequences. Default is to print all.")
    parser.add_option('-v', '--uniqv', dest='uniqv', default=False, action="store_true", help="If specified, filter out sequences that mapped to multiple V genes")
    parser.add_option('-j', '--uniqj', dest='uniqj', default=False, action="store_true", help="If specified, filter out sequences that mapped to multiple J genes")
    parser.add_option('-s', '--snp', dest='snp', default=False, action="store_true", help="If specified, get SNPs")
    parser.add_option('--filterBySnpNoise', dest='filterBySnpNoise', default=False, action="store_true", help="If specified, print out fasta files that exclude errorneous reads. Errorneous reads are those that contain alternative alleles different from the reference genes with low frequency, and therefore more likely to be sequencing noise.")
    parser.add_option('--noFasta', dest='noFasta', default=False, action="store_true", help="If specified, don't convert to fasta files")
    parser.add_option('--jgenes', dest='jgenes', default='/hive/users/nknguyen/immuno/imgt/TRB/jgenes.txt', help='Required argument if would like to extend the 3\' end of sequences to get the full CDR3')
    parser.add_option('--jgeneToPos', dest='jgene2pos', default='/hive/users/nknguyen/immuno/imgt/TRB/jmappedF.txt', help='Required argument if would like to extend the 3\' end of sequences to get the full CDR3')
    parser.add_option('--vgenes', dest='vgenes', default='/hive/users/nknguyen/immuno/imgt/TRB/vgenes.txt', help='Required argument if would like to check for SNPs in the V region')
    parser.add_option('--vgeneToPos', dest='vgene2pos', default='/hive/users/nknguyen/immuno/imgt/TRB/vmapped312.txt', help='Required argument if would like to check for SNPs in the V region')
    parser.add_option('--mapToRef', dest='mapToRef', default='/hive/users/nknguyen/immuno/imgt/TRB/genesToHg18.txt', help='If specified, map the SNPs back to the reference')
    parser.add_option('--noMapToRef', dest='noMapToRef', action='store_true', default=False)
    parser.add_option('--minAlleleScore', dest='minAlleleScore', type='int', default=100, help='Minimum reads required for an allele to be called. Default=%default')
    parser.add_option('--minAlleleFreq', dest='minAlleleFreq', type='float', default=1.0, help='Minimum percentage required for an allele to be called. Default=%default')

    options, _ = parser.parse_args()

    # jgenes and jgene2pos must be given together (both set or both unset)
    if bool(options.jgenes) != bool(options.jgene2pos):
        parser.error('jgenes and jgene2pos are required if would like to extend the 3 prime end to get the full CDR3 sequences.\n')
    if options.jgenes and options.nonProductive:
        parser.error('Cannot extend the CDR3 sequences of non-productive sequences.\n')
    if options.nonProductive and not options.nuc:
        parser.error('No amino acid options is available for non-productive sequences. Please choose the nucleotide options.\n')

    # Replace each gene file path on 'options' by the data parsed from it
    geneReaders = (
        ('jgenes', readGeneFile),
        ('jgene2pos', readGene2pos),
        ('vgenes', readGeneFile),
        ('vgene2pos', readGene2pos),
    )
    for attr, reader in geneReaders:
        path = getattr(options, attr)
        if path:
            setattr(options, attr, reader(path))

    # --noMapToRef switches off the reference mapping, overriding --mapToRef
    if options.noMapToRef:
        options.mapToRef = None
    elif options.mapToRef:
        options.gene2offset = readGene2ref(options.mapToRef)

    indir = options.indir
    outdir = options.outdir
    useNorm = not options.raw

    for fname in os.listdir(indir):
        # Only handle files named <sample>.tsv[...]
        parts = fname.split('.')
        if len(parts) < 2 or parts[1] != 'tsv':
            continue
        sample = parts[0]

        seqs, hasNorm = readFile(os.path.join(indir, fname))
        if useNorm and not hasNorm:
            # Once one file lacks normalized values, raw counts are used
            # for this file and all files processed after it.
            useNorm = False
            sys.stderr.write("Some sample/sequence does not have normalized frequency or normalized count. Using raw count instead\n")

        if not options.noFasta:
            fafile = os.path.join(outdir, "%s.fa" % sample)
            printSeqs(seqs, fafile, useNorm, options)

        if not options.snp:
            continue

        snpoutfile = os.path.join(outdir, "%s.snp.txt" % sample)
        gene2pos2snp, gene2count, gene2pos2noise = findSnps(seqs, useNorm, options)
        printSnps(snpoutfile, gene2pos2snp, options)

        if options.filterBySnpNoise:
            noisefile = os.path.join(outdir, "%s-noise.txt" % sample)
            newseqs = filterNoiseReads(seqs, gene2pos2noise, noisefile, options)
            filteredfile = os.path.join(outdir, "%s-filtered.fa" % sample)
            printSeqs(newseqs, filteredfile, useNorm, options)
def main():
    """Separate blastclust clusters into categories and report them.

    For every file returned by ``iseqlib.getfiles(options.indir, 'txt')``
    the clusters are read with ``readBlastclustOutfiles`` and handed to
    ``separateClusters``, which receives the five per-category output files
    ``<outbasename>-1.txt`` .. ``<outbasename>-5.txt`` and fills
    ``type2count`` / ``type2clusters``. Afterwards a summary file
    ``<outbasename>-summary.txt`` and one fasta file
    ``<outbasename>-<type>.fa`` per category are written, using the
    sequences found in the fasta files of ``options.fadir``.
    ``<outbasename>`` is ``<options.outdir>/<options.basename>``.
    """
    parser = iseqlib.initOptions()
    addOptions(parser)
    options, args = parser.parse_args()

    if options.minExpFreq > 0 and not options.sample2total:
        parser.error("--sample2total is required as --minExpandedFreq > 0\n")
    if options.sample2total:
        options.sample2total = readSample2total(options.sample2total)

    ext = 'txt'
    infiles = iseqlib.getfiles(options.indir, ext)
    outbasename = os.path.join(options.outdir, options.basename)

    type2count = {}
    type2clusters = {}
    totalseq = 0
    expseq = 0
    # One output file per category; closed in 'finally' so the handles are
    # released even if reading or separating one of the inputs fails.
    fhs = [open("%s-%d.txt" % (outbasename, i), 'w') for i in (1, 2, 3, 4, 5)]
    try:
        for fname in infiles:
            infile = os.path.join(options.indir, fname)
            # NOTE(review): rstrip() strips a character set, not a suffix;
            # this yields the name without ".txt" as long as the file name
            # ends in "." + ext - confirm iseqlib.getfiles guarantees that.
            vjname = fname.rstrip(ext).rstrip('.')
            clusters = readBlastclustOutfiles(infile, vjname, options)
            total, exp = separateClusters(clusters, options.minExpFreq, options.minExpSize, options.minExpClones, fhs[0], fhs[1], fhs[2], fhs[3], fhs[4], type2count, type2clusters)
            totalseq += total
            expseq += exp
    finally:
        for fh in fhs:
            fh.close()

    #summary stats:
    summaryFile = "%s-summary.txt" % outbasename
    with open(summaryFile, 'w') as fh:
        fh.write("#min size to be called expanded clones: %d\n#min number of expanded clones required for type 1: %d\n" % (options.minExpSize, options.minExpClones))
        id2info = clusterId2info()
        fh.write("Total number of clones: %d\n" % totalseq)
        fh.write("Total number of expanded clones: %d\n" % expseq)
        for clusterType in sorted(type2count):
            fh.write("%d\t%s\t%d\n" % (clusterType, id2info[clusterType], type2count[clusterType]))

    #Read v files:
    v2seq = {}
    if options.vfile:
        v2seq = readFaFile(options.vfile)

    #Read fasta files (one per V-J combination, keyed by the file's base name):
    faext = 'fa'
    vj2id2seq = {}
    for fname in iseqlib.getfiles(options.fadir, faext):
        infile = os.path.join(options.fadir, fname)
        vjname = fname.rstrip(faext).rstrip('.')
        vj2id2seq[vjname] = readFaFile(infile)

    #Print type2clusters, largest clusters (by totalReads) first:
    for clusterType, clusters in type2clusters.iteritems():
        outfile = "%s-%d.fa" % (outbasename, clusterType)
        with open(outfile, 'w') as f:
            f.write("#Type: %s\n" % clusterType)
            for cluster in sorted(clusters, key=lambda c: c.totalReads, reverse=True):
                f.write("#Cluster %s\n" % cluster.getDesc())
                id2seq = vj2id2seq[cluster.vj]
                v = cluster.vj.split(".")[0]
                vseq = v2seq[v] if v in v2seq else ""

                for clone in cluster.clones:
                    seq = id2seq[clone.desc]
                    # The header carries what follows the first len(vseq)
                    # characters; the full sequence is written below it.
                    cdr3seq = seq[len(vseq):]
                    header = "%s;%s;%d;%s" % (clone.sample, cdr3seq, clone.size, cluster.vj)
                    f.write(">%s\n" % header)
                    f.write("%s\n" % seq)