def partition_VJ(inhandle,basename):
    """Split an IMGT stream into one output file per V/J gene combination.

    Allele numbers are ignored: everything from the first '*' onward in
    ``chain.v`` and ``chain.j`` is dropped before the two names are passed
    through ``seqtools.cleanup_id`` and joined with '_'.

    Parameters:
        inhandle: input handed to ``vdj.parse_imgt``.
        basename: prefix of every output file; files are named
            ``<basename>.<vj_id>.imgt``.

    Returns:
        A list with the names of the files that were written.
    """
    def vj_id_no_allele(chain):
        return seqtools.cleanup_id(chain.v.split('*')[0]) + '_' + seqtools.cleanup_id(chain.j.split('*')[0])

    def outname(basename,vj_id):
        return "%s.%s.imgt" % (basename,vj_id)

    outhandles = {}
    try:
        for chain in vdj.parse_imgt(inhandle):
            curr_vj_id = vj_id_no_allele(chain)
            # Open each output file exactly once.  Testing membership (rather
            # than catching KeyError around the write) means a KeyError raised
            # while rendering the chain can no longer re-open, and thereby
            # truncate, a file that already holds earlier chains.
            if curr_vj_id not in outhandles:
                outhandles[curr_vj_id] = open(outname(basename,curr_vj_id),'w')
            outhandles[curr_vj_id].write(str(chain) + '\n')
    finally:
        # Close every handle even when parsing or writing fails part-way.
        for outhandle in outhandles.values():
            outhandle.close()

    return [outname(basename,vj_id) for vj_id in outhandles]
def partition_VJ(inhandle, basename):
    """Write the chains of an IMGT stream into one file per V/J pair.

    Allele numbers are ignored: the part of ``chain.v`` / ``chain.j`` from
    the first '*' onward is discarded, each remaining name is passed through
    ``seqtools.cleanup_id``, and the two are joined with '_'.

    Parameters:
        inhandle: input handed to ``vdj.parse_imgt``.
        basename: prefix of the output files, which are named
            ``<basename>.<vj_id>.imgt``.

    Returns:
        A list with the names of the files that were written.
    """
    def vj_id_no_allele(chain):
        return seqtools.cleanup_id(
            chain.v.split('*')[0]) + '_' + seqtools.cleanup_id(
                chain.j.split('*')[0])

    def outname(basename, vj_id):
        return "%s.%s.imgt" % (basename, vj_id)

    outhandles = {}
    try:
        for chain in vdj.parse_imgt(inhandle):
            curr_vj_id = vj_id_no_allele(chain)
            # An explicit membership test keeps the "first time we see this
            # V/J pair" case separate from the write itself, so a KeyError
            # raised while formatting the chain cannot re-open (and truncate)
            # an output file that is already in use.
            if curr_vj_id not in outhandles:
                outhandles[curr_vj_id] = open(
                    outname(basename, curr_vj_id), 'w')
            outhandles[curr_vj_id].write(str(chain) + '\n')
    finally:
        # Release all handles even if the loop above raised.
        for outhandle in outhandles.values():
            outhandle.close()

    return [outname(basename, vj_id) for vj_id in outhandles]
#! /usr/bin/env python

# Copy to the output only those chains that carry both a `v` and a `j`
# attribute.
#
# Positional arguments: [input [output]].  A missing input means stdin and a
# missing output means stdout.

import sys
import optparse

import vdj
import vdj.pipeline

parser = optparse.OptionParser()
(options, args) = parser.parse_args()

if len(args) == 2:
    inhandle = open(args[0],'r')
    outhandle = open(args[1],'w')
elif len(args) == 1:
    inhandle = open(args[0],'r')
    outhandle = sys.stdout
elif len(args) == 0:
    inhandle = sys.stdin
    outhandle = sys.stdout
else:
    # Without this branch extra arguments left `inhandle` undefined and the
    # loop below failed with an unrelated NameError.
    raise Exception("Wrong number of arguments.")

for chain in vdj.parse_imgt(inhandle):
    if hasattr(chain,'v') and hasattr(chain,'j'):
        outhandle.write(str(chain) + '\n')
# 1. SIZE SELECTION
# Reads whose length lies within [min_size, max_size] are wrapped in
# vdj.ImmuneChain and written to a size-tagged .imgt file under work_dir.
log("Performing size selection on reads...")
min_size = int(params['min_size'])
max_size = int(params['max_size'])
size_suffix = '.size%i-%i' % (min_size, max_size)
size_selected_file = join(work_dir, basename + size_suffix + '.imgt')
with open(size_selected_file, 'w') as outhandle:
    for seq in SeqIO.parse(params['input_fasta'], 'fasta', generic_dna):
        if not (min_size <= len(seq) <= max_size):
            continue
        chain = vdj.ImmuneChain(seq)
        outhandle.write(str(chain) + '\n')
log("finished\n")

# 2. SPLIT INTO PARTS
# The size-selected file is re-read and handed to iterator2parts together
# with the target path prefix and the configured packet size.
log("Splitting input into small parts...")
size_selected_chains = vdj.parse_imgt(size_selected_file)
parts_prefix = join(work_dir, 'parts/size_selected.imgt')
parts = vdj.pipeline.iterator2parts(size_selected_chains,
                                    parts_prefix,
                                    int(params['packet_size']))
log("finished\n")

# 3-7. BARCODE ID, CODING STRAND, ISOTYPE ID, VDJ CLASSIFICATION, TRANSLATION via LSF
# The command is a shell pipeline assembled stage by stage; the same
# --locus flags are passed to both coding_strand.py and align_vdj.py.
log("Setting up LSF command...\n")
locus_flags = []
for locus in params['locus']:
    locus_flags.append(' --locus %s' % locus)
locus_options = ' '.join(locus_flags)

cmd_stages = []
# 3. BARCODE IDENTIFICATION
cmd_stages.append('barcode_id.py --barcodes %s ' % params['barcode_fasta'])
# 4. CODING STRAND
cmd_stages.append(' | coding_strand.py' + locus_options)
# 5. ISOTYPE ID (heavy chain only)
if 'IGH' in params['locus']:
    cmd_stages.append(' | isotype_id.py --IGHC %s' % params['isotype_fasta'])
# 6. VDJ CLASSIFICATION
cmd_stages.append(' | align_vdj.py' + locus_options)
# 7. TRANSLATION
cmd_stages.append(' | translate_chains.py')
cmd = ''.join(cmd_stages)
# Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import subprocess import vdj import vdj.analysis # 1. Split V and J and only put in unique sequences. Sort by lineage-weighted abundance v_counts = pyutils.nesteddict() j_counts = pyutils.nesteddict() for chain in vdj.parse_imgt(): v_feature_list = [chain.__getattribute__('V-REGION').qualifiers['gene'][0],chain.v_seq] v_counts.nested_add(v_feature_list) j_feature_list = [chain.__getattribute__('J-REGION').qualifiers['gene'][0],chain.j_seq] j_counts.nested_add(j_feature_list) for tup in v_counts.walk(): (keylist,val) = (tup[:-1],tup[-1]) v_counts.nested_assign(keylist,len(val)) for tup in j_counts.walk(): (keylist,val) = (tup[:-1],tup[-1]) j_counts.nested_assign(keylist,len(val)) for key in v_counts:
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Convert IMGT-formatted chains to FASTA records of the form ">id\nseq".
# Positional arguments: [input [output]]; stdin/stdout are used for
# whichever is missing.

import sys
import optparse

import vdj

parser = optparse.OptionParser()
(options, args) = parser.parse_args()

# Reject surplus arguments before any file is opened.
if len(args) > 2:
    raise Exception("Wrong number of arguments.")

inhandle = open(args[0], 'r') if len(args) >= 1 else sys.stdin
outhandle = open(args[1], 'w') if len(args) == 2 else sys.stdout

for chain in vdj.parse_imgt(inhandle):
    # chain.format('fasta') is deliberately not used here: it emits
    # chain.description in the header instead of chain.id.
    outhandle.write(">%s\n%s" % (chain.id, chain.seq) + "\n")
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Feed the chains of an IMGT input (file argument or stdin) to
# vdj.pipeline.iterator2parts and print each item it returns on its own line.

import sys
import optparse

import vdj
import vdj.pipeline

parser = optparse.OptionParser()
parser.add_option('-b', '--basename')
parser.add_option('-p', '--packetsize', type='int')
(options, args) = parser.parse_args()

# At most one positional argument (the input file) is accepted.
if len(args) > 1:
    raise Exception("Too many arguments.")

inhandle = open(args[0], 'r') if args else sys.stdin

chain_iter = vdj.parse_imgt(inhandle)
parts = vdj.pipeline.iterator2parts(chain_iter,
                                    options.basename,
                                    options.packetsize)

for part in parts:
    sys.stdout.write(str(part) + '\n')
#! /usr/bin/env python

# Insert every chain of an IMGT input into a MongoDB collection, printing a
# running count to stdout every 1000 chains.

import sys
import argparse

import pymongo

import vdj
import vdj.mongo

argparser = argparse.ArgumentParser(description=None)
argparser.add_argument('-d','--db',required=True)
argparser.add_argument('-c','--collection',default='chains')
argparser.add_argument('-i','--input')
# argparser.add_argument('--option',dest='xxx',action='store_const',default=5)
args = argparser.parse_args()

inputfile = args.input

db = vdj.mongo.connect_to_spleen(connect_to=args.db)
chains = db[args.collection]

for (i,chain) in enumerate(vdj.parse_imgt(inputfile)):
    # Progress marker on every 1000th chain (including the very first).
    if not i % 1000:
        sys.stdout.write("%i "%i)
        sys.stdout.flush()
    encoded = vdj.mongo.encode_chain(chain)
    chains.insert(encoded)
def imgt2fasta(inhandle,outhandle):
    """Write each chain parsed from `inhandle` to `outhandle` as FASTA.

    Chains come from ``vdj.parse_imgt(inhandle)``; the text written for each
    one is whatever ``chain.format('fasta')`` returns.
    """
    fasta_records = (chain.format('fasta') for chain in vdj.parse_imgt(inhandle))
    for record in fasta_records:
        outhandle.write(record)
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Emit one FASTA record (chain id + junction nucleotide sequence) per chain,
# grouped by junction and ordered from the most to the least frequent
# junction.

import sys
import argparse
from collections import defaultdict

import vdj

argparser = argparse.ArgumentParser(description=None)
argparser.add_argument('input',nargs='?',type=argparse.FileType('r'),default=sys.stdin)
argparser.add_argument('output',nargs='?',type=argparse.FileType('w'),default=sys.stdout)
args = argparser.parse_args()

# read in all the junctions: junction sequence -> ids of the chains having it
junctions = defaultdict(list)
for chain in vdj.parse_imgt(args.input):
    try:
        junctions[chain.junction_nt].append(chain.id)
    except AttributeError:
        # chains lacking the needed attribute are ignored
        continue

# Largest groups first; sorted() is stable, so ties keep dictionary order.
ranked = sorted(junctions.items(), key=lambda item: len(item[1]), reverse=True)
for (junction, ids) in ranked:
    for id_ in ids:
        args.output.write(">%s\n%s\n" % (id_, junction))
# Size selection: reads whose length falls inside [min_size, max_size] are
# wrapped in vdj.ImmuneChain and written to a size-tagged .imgt file.
log("Performing size selection on reads...")
min_size = int(params['min_size'])
max_size = int(params['max_size'])
size_selected_name = basename + '.size%i-%i' % (min_size,max_size) + '.imgt'
size_selected_file = join(work_dir,size_selected_name)
with open(size_selected_file,'w') as outhandle:
    for seq in SeqIO.parse(params['input_fasta'],'fasta',generic_dna):
        if min_size <= len(seq) <= max_size:
            chain = vdj.ImmuneChain(seq)
            outhandle.write(str(chain) + '\n')
log("finished\n")

# 2. SPLIT INTO PARTS
# Re-read the size-selected file and pass it to iterator2parts along with
# the target path prefix and the configured packet size.
log("Splitting input into small parts...")
selected_chains = vdj.parse_imgt(size_selected_file)
parts_target = join(work_dir,'parts/size_selected.imgt')
parts = vdj.pipeline.iterator2parts(selected_chains,parts_target,int(params['packet_size']))
log("finished\n")

# 3-7. BARCODE ID, CODING STRAND, ISOTYPE ID, VDJ CLASSIFICATION, TRANSLATION via LSF
# Each stage is one element of a shell pipeline; coding_strand.py and
# align_vdj.py both receive the same --locus flags.
log("Setting up LSF command...\n")
locus_flags = []
for locus in params['locus']:
    locus_flags.append(' --locus %s' % locus)
locus_options = ' '.join(locus_flags)

pipeline_pieces = [
    'barcode_id.py --barcodes %s ' % params['barcode_fasta'],  # 3. BARCODE IDENTIFICATION
    ' | coding_strand.py' + locus_options,                      # 4. CODING STRAND
]
if 'IGH' in params['locus']:
    # 5. ISOTYPE ID (heavy chain only)
    pipeline_pieces.append(' | isotype_id.py --IGHC %s' % params['isotype_fasta'])
pipeline_pieces.append(' | align_vdj.py' + locus_options)       # 6. VDJ CLASSIFICATION
pipeline_pieces.append(' | translate_chains.py')                # 7. TRANSLATION
cmd = ''.join(pipeline_pieces)
def imgt2fasta(inhandle, outhandle):
    """Convert an IMGT stream to FASTA.

    Every chain yielded by ``vdj.parse_imgt(inhandle)`` is rendered with
    ``chain.format('fasta')`` and the result is written to `outhandle`.
    """
    parsed_chains = vdj.parse_imgt(inhandle)
    for parsed in parsed_chains:
        fasta_text = parsed.format('fasta')
        outhandle.write(fasta_text)
#! /usr/bin/env python
# Copyright 2014 Uri Laserson
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Write every chain of an IMGT file to its own file,
# <output_dir>/<cleaned chain id>.imgt.  A later chain whose cleaned id
# equals an earlier one overwrites that earlier file.

import os
import argparse

import vdj
from pyutils import cleanup_id

argparser = argparse.ArgumentParser(description=None)
argparser.add_argument('input_file')
# nargs='?' makes the positional optional; without it argparse always
# demands a value and the current-directory default could never apply.
argparser.add_argument('output_dir',nargs='?',default=os.getcwd())
args = argparser.parse_args()

for chain in vdj.parse_imgt(args.input_file):
    output_file = os.path.join(args.output_dir,'%s.imgt' % cleanup_id(chain.id))
    with open(output_file,'w') as op:
        op.write(str(chain) + '\n')
# Copyright 2014 Uri Laserson
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Split an IMGT file into one file per chain, named
# <output_dir>/<cleaned chain id>.imgt.  Chains whose cleaned ids collide
# overwrite one another; only the last one survives.

import os
import argparse

import vdj
from pyutils import cleanup_id

argparser = argparse.ArgumentParser(description=None)
argparser.add_argument('input_file')
# The positional must be declared optional (nargs='?') for its default to
# take effect; otherwise argparse rejects invocations that omit it.
argparser.add_argument('output_dir', nargs='?', default=os.getcwd())
args = argparser.parse_args()

for chain in vdj.parse_imgt(args.input_file):
    output_file = os.path.join(args.output_dir,
                               '%s.imgt' % cleanup_id(chain.id))
    with open(output_file, 'w') as op:
        op.write(str(chain) + '\n')
def imgt2countdict(inhandle,features,count='read'):
    """Build a count dictionary from an IMGT input.

    Parses `inhandle` with ``vdj.parse_imgt`` and forwards the resulting
    chains, together with `features` and `count`, to ``iterator2countdict``,
    whose return value is passed back unchanged.
    """
    parsed_chains = vdj.parse_imgt(inhandle)
    return iterator2countdict(parsed_chains,features,count)
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Pass the chains of an IMGT input (file argument or stdin) to
# vdj.pipeline.iterator2parts and print every item it returns, one per line.

import sys
import optparse

import vdj
import vdj.pipeline

parser = optparse.OptionParser()
parser.add_option('-b','--basename')
parser.add_option('-p','--packetsize',type='int')
(options, args) = parser.parse_args()

# Only zero or one positional argument (the input file) is allowed.
if len(args) > 1:
    raise Exception("Too many arguments.")

if args:
    inhandle = open(args[0],'r')
else:
    inhandle = sys.stdin

chain_source = vdj.parse_imgt(inhandle)
parts = vdj.pipeline.iterator2parts(chain_source,options.basename,options.packetsize)

for part in parts:
    sys.stdout.write(str(part) + '\n')
(options,args) = option_parser.parse_args()

# Positional arguments: input [output].  The input must be a real file
# because it is read twice (once to count chains, once to parse them).
if len(args) == 2:
    inhandle = open(args[0],'r')
    outhandle = open(args[1],'w')
elif len(args) == 1:
    inhandle = open(args[0],'r')
    outhandle = sys.stdout
elif len(args) == 0:
    raise ValueError("must provide at least an input file")
else:
    # Previously fell through and left inhandle/outhandle undefined.
    raise ValueError("too many arguments")

# determine the total number of chains in the file by counting the lines that
# start with "ID".  This is done in Python rather than through a shell
# pipeline so that file names containing spaces or shell metacharacters are
# handled safely.
with open(args[0],'r') as counthandle:
    total_chains = sum(1 for line in counthandle if line.startswith('ID'))

# check if subsampling should include the entire file
if options.num >= total_chains:
    warnings.warn("Subsampling level is greater than or equal to number of chains in file: printing whole file to output.")
    for chain in vdj.parse_imgt(inhandle):
        outhandle.write(str(chain) + '\n')
else:
    # choose a random set of indices to select for the output file
    random.seed()
    idxs = sorted(random.sample(xrange(total_chains),options.num))

    # Walk the sorted indices with an iterator instead of repeatedly popping
    # the head of the list, which costs O(n) per selected chain.
    pending = iter(idxs)
    next_idx = next(pending,None)
    for (i,chain) in enumerate(vdj.parse_imgt(inhandle)):
        if next_idx is None:
            break   # every selected chain has been written
        if i == next_idx:
            outhandle.write(str(chain) + '\n')
            next_idx = next(pending,None)
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Group chain ids by junction nucleotide sequence and write one FASTA record
# per chain (header: chain id, body: junction), most frequent junction first.

import sys
import argparse
from collections import defaultdict

import vdj

argparser = argparse.ArgumentParser(description=None)
argparser.add_argument("input",
                       nargs="?",
                       type=argparse.FileType("r"),
                       default=sys.stdin)
argparser.add_argument("output",
                       nargs="?",
                       type=argparse.FileType("w"),
                       default=sys.stdout)
args = argparser.parse_args()

# read in all the junctions: junction sequence -> list of chain ids
junctions = defaultdict(list)
for chain in vdj.parse_imgt(args.input):
    try:
        junctions[chain.junction_nt].append(chain.id)
    except AttributeError:
        # skip chains that lack the needed attribute
        continue


def _group_size(junction):
    # number of chains sharing this junction
    return len(junctions[junction])


for junction in sorted(junctions, key=_group_size, reverse=True):
    for id_ in junctions[junction]:
        args.output.write(">%s\n%s\n" % (id_, junction))
# Load the chains of an IMGT input into a MongoDB collection.  With
# --padding N > 0 each document is inserted with a temporary '__padding__'
# field of N '0' characters, which is removed from all documents at the end.

import sys
import argparse

import pymongo

import vdj
import vdj.mongo

argparser = argparse.ArgumentParser(description=None)
argparser.add_argument('-d','--db',required=True)
argparser.add_argument('-c','--collection',required=True)
argparser.add_argument('-i','--input',nargs='?',type=argparse.FileType('r'),default=sys.stdin)
argparser.add_argument('-p','--padding',type=int,default=0)
# argparser.add_argument('--option',dest='xxx',action='store_const',default=5)
args = argparser.parse_args()

inputfile = args.input

db = vdj.mongo.connect_to_lymph(connect_to=args.db)
chains = db[args.collection]

use_padding = args.padding > 0

for (i,chain) in enumerate(vdj.parse_imgt(inputfile)):
    # Progress marker on every 1000th chain (including the very first).
    if not i % 1000:
        sys.stdout.write("%i "%i)
        sys.stdout.flush()
    doc = vdj.mongo.encode_chain(chain)
    if use_padding:
        doc['__padding__'] = '0' * args.padding
    chains.insert(doc, safe=True)

# delete padding from elements
if use_padding:
    unset_padding = {"$unset" : {"__padding__" : 1}}
    chains.update({}, unset_padding, upsert=False, safe=True, multi=True)