Exemple #1
0
def partition_VJ(inhandle,basename):
    """Split an IMGT stream into one file per V/J gene combination.

    Every chain yielded by vdj.parse_imgt(inhandle) is appended to the file
    "<basename>.<V>_<J>.imgt".  The V and J identifiers are truncated at the
    first '*' (dropping the allele number) and passed through
    seqtools.cleanup_id before being used in the file name.

    Returns the list of output file names that were opened.
    """
    # ignores allele numbers
    def vj_id_no_allele(chain):
        v_gene = chain.v.split('*')[0]
        j_gene = chain.j.split('*')[0]
        return seqtools.cleanup_id(v_gene) + '_' + seqtools.cleanup_id(j_gene)
    
    def outname(basename,vj_id):
        return "%s.%s.imgt" % (basename,vj_id)
    
    outhandles = {}
    try:
        for chain in vdj.parse_imgt(inhandle):
            curr_vj_id = vj_id_no_allele(chain)
            # Open each output file exactly once.  An explicit membership
            # test (rather than catching KeyError around the write) ensures a
            # KeyError raised while formatting a chain can never reopen, and
            # thereby truncate, an existing output file.
            if curr_vj_id not in outhandles:
                outhandles[curr_vj_id] = open( outname(basename,curr_vj_id), 'w' )
            outhandles[curr_vj_id].write(str(chain) + '\n')
    finally:
        # Close everything that was opened, even if parsing or writing failed.
        for outhandle in outhandles.values():
            outhandle.close()
    
    return [outname(basename,vj_id) for vj_id in outhandles]
Exemple #2
0
def partition_VJ(inhandle, basename):
    """Write the chains of an IMGT stream into per-V/J-combination files.

    Each chain produced by vdj.parse_imgt(inhandle) goes to
    "<basename>.<V>_<J>.imgt", where V and J are the chain's ``v`` and ``j``
    values cut at the first '*' (allele number removed) and cleaned with
    seqtools.cleanup_id.

    Returns a list with the name of every output file that was opened.
    """
    # ignores allele numbers
    def vj_id_no_allele(chain):
        v_gene = chain.v.split('*')[0]
        j_gene = chain.j.split('*')[0]
        return seqtools.cleanup_id(v_gene) + '_' + seqtools.cleanup_id(j_gene)

    def outname(basename, vj_id):
        return "%s.%s.imgt" % (basename, vj_id)

    outhandles = {}
    try:
        for chain in vdj.parse_imgt(inhandle):
            curr_vj_id = vj_id_no_allele(chain)
            # Only a missing handle triggers open(); catching KeyError around
            # the write itself could truncate an already-written file if the
            # chain's formatting raised KeyError.
            if curr_vj_id not in outhandles:
                outhandles[curr_vj_id] = open(
                    outname(basename, curr_vj_id), 'w')
            outhandles[curr_vj_id].write(str(chain) + '\n')
    finally:
        # Release the file handles on both the success and the error path.
        for outhandle in outhandles.values():
            outhandle.close()

    return [outname(basename, vj_id) for vj_id in outhandles]
Exemple #3
0
#! /usr/bin/env python

import sys
import optparse

import vdj
import vdj.pipeline

parser = optparse.OptionParser()
(options, args) = parser.parse_args()

# Positional arguments: [input [output]]; missing ones default to
# stdin / stdout.
if len(args) == 2:
    inhandle = open(args[0],'r')
    outhandle = open(args[1],'w')
elif len(args) == 1:
    inhandle = open(args[0],'r')
    outhandle = sys.stdout
elif len(args) == 0:
    inhandle = sys.stdin
    outhandle = sys.stdout
else:
    # Without this branch inhandle stays unbound and the loop below fails
    # with an unrelated NameError.
    raise Exception("Wrong number of arguments.")

# Copy through only the chains that have both a 'v' and a 'j' attribute.
for chain in vdj.parse_imgt(inhandle):
    if hasattr(chain,'v') and hasattr(chain,'j'):
        outhandle.write(str(chain) + '\n')
Exemple #4
0
# 1. SIZE SELECTION
# Reads whose length lies within [min_size, max_size] are wrapped in an
# ImmuneChain and written to the size-selected file.
log("Performing size selection on reads...")
min_size = int(params['min_size'])
max_size = int(params['max_size'])
size_suffix = '.size%i-%i' % (min_size, max_size)
size_selected_file = join(work_dir, basename + size_suffix + '.imgt')
with open(size_selected_file, 'w') as outhandle:
    for seq in SeqIO.parse(params['input_fasta'], 'fasta', generic_dna):
        if min_size <= len(seq) <= max_size:
            chain = vdj.ImmuneChain(seq)
            outhandle.write(str(chain) + '\n')
log("finished\n")

# 2. SPLIT INTO PARTS
log("Splitting input into small parts...")
parts = vdj.pipeline.iterator2parts(
    vdj.parse_imgt(size_selected_file),
    join(work_dir, 'parts/size_selected.imgt'),
    int(params['packet_size']))
log("finished\n")

# 3-7. BARCODE ID, CODING STRAND, ISOTYPE ID, VDJ CLASSIFICATION, TRANSLATION via LSF
log("Setting up LSF command...\n")
locus_options = ' '.join([' --locus %s' % locus for locus in params['locus']])
# The stages are collected in order and chained with shell pipes.
pipeline_stages = []
# 3. BARCODE IDENTIFICATION
pipeline_stages.append('barcode_id.py --barcodes %s ' % params['barcode_fasta'])
# 4. CODING STRAND
pipeline_stages.append('coding_strand.py' + locus_options)
# 5. ISOTYPE ID (heavy chain only)
if 'IGH' in params['locus']:
    pipeline_stages.append('isotype_id.py --IGHC %s' % params['isotype_fasta'])
# 6. VDJ CLASSIFICATION
pipeline_stages.append('align_vdj.py' + locus_options)
# 7. TRANSLATION
pipeline_stages.append('translate_chains.py')
cmd = ' | '.join(pipeline_stages)
Exemple #5
0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import subprocess

import vdj
import vdj.analysis

# 1. Split V and J and only put in unique sequences.  Sort by lineage-weighted abundance

# Accumulate one [gene name, segment sequence] key path per chain, separately
# for the V region and the J region.
# NOTE(review): pyutils is not imported in the visible part of this file --
# confirm it is in scope.
v_counts = pyutils.nesteddict()
j_counts = pyutils.nesteddict()
# NOTE(review): parse_imgt() is called without an input argument here --
# confirm that is intended.
for chain in vdj.parse_imgt():
    # 'V-REGION' contains a hyphen, so it cannot be spelled as a plain
    # attribute access; the first entry of its 'gene' qualifier is the key.
    v_feature_list = [chain.__getattribute__('V-REGION').qualifiers['gene'][0],chain.v_seq]
    v_counts.nested_add(v_feature_list)
    
    # Same key path construction for the J region.
    j_feature_list = [chain.__getattribute__('J-REGION').qualifiers['gene'][0],chain.j_seq]
    j_counts.nested_add(j_feature_list)

# Each walked tuple is split into its key path (all but the last element) and
# its value (the last element); the value stored at that key path is then
# replaced by its length.
for tup in v_counts.walk():
    (keylist,val) = (tup[:-1],tup[-1])
    v_counts.nested_assign(keylist,len(val))

# Same length-replacement pass for the J counts.
for tup in j_counts.walk():
    (keylist,val) = (tup[:-1],tup[-1])
    j_counts.nested_assign(keylist,len(val))

for key in v_counts:
Exemple #6
0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import optparse

import vdj

parser = optparse.OptionParser()
(options, args) = parser.parse_args()

# Positional arguments: [input [output]]; anything not given falls back to
# the corresponding standard stream.
num_args = len(args)
if num_args > 2:
    raise Exception("Wrong number of arguments.")
inhandle = open(args[0], 'r') if num_args >= 1 else sys.stdin
outhandle = open(args[1], 'w') if num_args == 2 else sys.stdout

# Emit one FASTA record per chain.  The record is formatted by hand because
# chain.format('fasta') puts chain.description, not chain.id, in the header.
for chain in vdj.parse_imgt(inhandle):
    outhandle.write(">%s\n%s\n" % (chain.id, chain.seq))
Exemple #7
0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import optparse

import vdj
import vdj.pipeline

parser = optparse.OptionParser()
parser.add_option('-b', '--basename')
parser.add_option('-p', '--packetsize', type='int')
(options, args) = parser.parse_args()

# At most one positional argument (the input file); default is stdin.
if len(args) > 1:
    raise Exception("Too many arguments.")
inhandle = open(args[0], 'r') if len(args) == 1 else sys.stdin

# Hand the parsed chains to iterator2parts together with the requested
# basename and packet size, then list whatever it returned, one per line.
parts = vdj.pipeline.iterator2parts(
    vdj.parse_imgt(inhandle),
    options.basename,
    options.packetsize)

for part in parts:
    print(part)
#! /usr/bin/env python

import sys
import argparse

import pymongo

import vdj
import vdj.mongo

argparser = argparse.ArgumentParser(description=None)
argparser.add_argument('-d', '--db', required=True)
argparser.add_argument('-c', '--collection', default='chains')
argparser.add_argument('-i', '--input')
args = argparser.parse_args()

inputfile = args.input
db = vdj.mongo.connect_to_spleen(connect_to=args.db)
chains = db[args.collection]

# Insert every parsed chain into the collection, printing the running index
# every 1000 chains as a progress indicator.
for (i, chain) in enumerate(vdj.parse_imgt(inputfile)):
    if not i % 1000:
        sys.stdout.write("%i " % i)
        sys.stdout.flush()
    encoded = vdj.mongo.encode_chain(chain)
    chains.insert(encoded)
Exemple #9
0
def imgt2fasta(inhandle,outhandle):
    """Write each chain parsed from inhandle to outhandle in 'fasta' format."""
    for record in vdj.parse_imgt(inhandle):
        fasta_text = record.format('fasta')
        outhandle.write(fasta_text)
Exemple #10
0
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import argparse
from collections import defaultdict

import vdj

argparser = argparse.ArgumentParser(description=None)
argparser.add_argument('input', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
argparser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout)
args = argparser.parse_args()

# Group the chain ids by junction nucleotide sequence.
junctions = defaultdict(list)
for chain in vdj.parse_imgt(args.input):
    try:
        junctions[chain.junction_nt].append(chain.id)
    except AttributeError:
        # Chains missing the needed attribute are skipped on purpose.
        continue

# Emit the junctions shared by the most chains first; every chain id gets its
# own FASTA-style record carrying the junction sequence.
by_abundance = sorted(junctions, key=lambda seq: len(junctions[seq]), reverse=True)
for junction in by_abundance:
    for id_ in junctions[junction]:
        args.output.write(">%s\n%s\n" % (id_, junction))
# Reads whose length lies within [min_size, max_size] are wrapped in an
# ImmuneChain and written to the size-selected file.
log("Performing size selection on reads...")
min_size = int(params['min_size'])
max_size = int(params['max_size'])
size_suffix = '.size%i-%i' % (min_size,max_size)
size_selected_file = join(work_dir,basename + size_suffix + '.imgt')
with open(size_selected_file,'w') as outhandle:
    for seq in SeqIO.parse(params['input_fasta'],'fasta',generic_dna):
        if min_size <= len(seq) <= max_size:
            chain = vdj.ImmuneChain(seq)
            outhandle.write(str(chain) + '\n')
log("finished\n")



# 2. SPLIT INTO PARTS
log("Splitting input into small parts...")
parts = vdj.pipeline.iterator2parts(
    vdj.parse_imgt(size_selected_file),
    join(work_dir,'parts/size_selected.imgt'),
    int(params['packet_size']))
log("finished\n")



# 3-7. BARCODE ID, CODING STRAND, ISOTYPE ID, VDJ CLASSIFICATION, TRANSLATION via LSF
log("Setting up LSF command...\n")
locus_options = ' '.join([' --locus %s' % locus for locus in params['locus']])
# The stages are collected in order and chained with shell pipes.
pipeline_stages = []
# 3. BARCODE IDENTIFICATION
pipeline_stages.append('barcode_id.py --barcodes %s ' % params['barcode_fasta'])
# 4. CODING STRAND
pipeline_stages.append('coding_strand.py' + locus_options)
# 5. ISOTYPE ID (heavy chain only)
if 'IGH' in params['locus']:
    pipeline_stages.append('isotype_id.py --IGHC %s' % params['isotype_fasta'])
# 6. VDJ CLASSIFICATION
pipeline_stages.append('align_vdj.py' + locus_options)
# 7. TRANSLATION
pipeline_stages.append('translate_chains.py')
cmd = ' | '.join(pipeline_stages)
Exemple #12
0
def imgt2fasta(inhandle, outhandle):
    """Write each chain parsed from inhandle to outhandle in 'fasta' format."""
    for record in vdj.parse_imgt(inhandle):
        fasta_text = record.format('fasta')
        outhandle.write(fasta_text)
Exemple #13
0
#! /usr/bin/env python
# Copyright 2014 Uri Laserson
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import argparse

import vdj
from pyutils import cleanup_id

argparser = argparse.ArgumentParser(description=None)
argparser.add_argument('input_file')
# nargs='?' is what makes this positional optional; without it argparse
# still requires the argument and the default is never used.
argparser.add_argument('output_dir',nargs='?',default=os.getcwd())
args = argparser.parse_args()

# Write every chain to its own file, named after the cleaned-up chain id,
# inside the output directory.
for chain in vdj.parse_imgt(args.input_file):
    output_file = os.path.join(args.output_dir,'%s.imgt' % cleanup_id(chain.id))
    with open(output_file,'w') as op:
        op.write(str(chain) + '\n')
Exemple #14
0
# Copyright 2014 Uri Laserson
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import argparse

import vdj
from pyutils import cleanup_id

argparser = argparse.ArgumentParser(description=None)
argparser.add_argument('input_file')
# nargs='?' is what makes this positional optional; without it argparse
# still requires the argument and the default is never used.
argparser.add_argument('output_dir', nargs='?', default=os.getcwd())
args = argparser.parse_args()

# Write every chain to its own file, named after the cleaned-up chain id,
# inside the output directory.
for chain in vdj.parse_imgt(args.input_file):
    output_file = os.path.join(args.output_dir,
                               '%s.imgt' % cleanup_id(chain.id))
    with open(output_file, 'w') as op:
        op.write(str(chain) + '\n')
Exemple #15
0
def imgt2countdict(inhandle,features,count='read'):
    """Return iterator2countdict() applied to the chains parsed from inhandle.

    `features` and `count` are forwarded to iterator2countdict unchanged.
    """
    chain_iter = vdj.parse_imgt(inhandle)
    return iterator2countdict(chain_iter,features,count)
Exemple #16
0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import optparse

import vdj
import vdj.pipeline

parser = optparse.OptionParser()
parser.add_option('-b','--basename')
parser.add_option('-p','--packetsize',type='int')
(options, args) = parser.parse_args()

# At most one positional argument (the input file); default is stdin.
if len(args) > 1:
    raise Exception("Too many arguments.")
inhandle = open(args[0],'r') if len(args) == 1 else sys.stdin

# Hand the parsed chains to iterator2parts together with the requested
# basename and packet size, then list whatever it returned, one per line.
parts = vdj.pipeline.iterator2parts(
    vdj.parse_imgt(inhandle),
    options.basename,
    options.packetsize)

for part in parts:
    print(part)
Exemple #17
0
(options,args) = option_parser.parse_args()

# Positional arguments: input [output].  The input must be a named file
# because it is read twice: once to count the chains, once to parse them.
if len(args) == 2:
    inhandle = open(args[0],'r')
    outhandle = open(args[1],'w')
elif len(args) == 1:
    inhandle = open(args[0],'r')
    outhandle = sys.stdout
elif len(args) == 0:
    raise ValueError("must provide at least an input file")
else:
    # Without this branch inhandle/outhandle stay unbound and the script
    # fails later with an unrelated NameError.
    raise ValueError("too many arguments")

# determine the total number of chains in the file: one line starting with
# "ID" per chain.  Counted in Python instead of a shell pipeline so that file
# names containing spaces or shell metacharacters are handled safely.
with open(args[0],'r') as counthandle:
    total_chains = sum(1 for line in counthandle if line.startswith('ID'))

# check if subsampling should include the entire file
if options.num >= total_chains:
    warnings.warn("Subsampling level is greater than or equal to number of chains in file: printing whole file to output.")
    for chain in vdj.parse_imgt(inhandle):
        outhandle.write(str(chain) + '\n')
else:
    # choose a random set of indices to select for the output file
    random.seed()
    selected = set(random.sample(xrange(total_chains),options.num))
    remaining = len(selected)
    for (i,chain) in enumerate(vdj.parse_imgt(inhandle)):
        # stop parsing as soon as every selected chain has been written
        if remaining == 0:
            break
        if i in selected:
            outhandle.write(str(chain) + '\n')
            remaining -= 1
Exemple #18
0
def imgt2countdict(inhandle,features,count='read'):
    """Return iterator2countdict() applied to the chains parsed from inhandle.

    `features` and `count` are forwarded to iterator2countdict unchanged.
    """
    chain_iter = vdj.parse_imgt(inhandle)
    return iterator2countdict(chain_iter,features,count)
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import argparse
from collections import defaultdict

import vdj

argparser = argparse.ArgumentParser(description=None)
argparser.add_argument("input", nargs="?", type=argparse.FileType("r"), default=sys.stdin)
argparser.add_argument("output", nargs="?", type=argparse.FileType("w"), default=sys.stdout)
args = argparser.parse_args()

# Group the chain ids by junction nucleotide sequence.
junctions = defaultdict(list)
for chain in vdj.parse_imgt(args.input):
    try:
        junctions[chain.junction_nt].append(chain.id)
    except AttributeError:
        # Chains missing the needed attribute are skipped on purpose.
        continue

# Emit the junctions shared by the most chains first; every chain id gets its
# own FASTA-style record carrying the junction sequence.
by_abundance = sorted(junctions, key=lambda seq: len(junctions[seq]), reverse=True)
for junction in by_abundance:
    for id_ in junctions[junction]:
        args.output.write(">%s\n%s\n" % (id_, junction))
import sys
import argparse

import pymongo

import vdj
import vdj.mongo

argparser = argparse.ArgumentParser(description=None)
argparser.add_argument('-d', '--db', required=True)
argparser.add_argument('-c', '--collection', required=True)
argparser.add_argument('-i', '--input', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
argparser.add_argument('-p', '--padding', type=int, default=0)
args = argparser.parse_args()

inputfile = args.input
db = vdj.mongo.connect_to_lymph(connect_to=args.db)
chains = db[args.collection]
use_padding = args.padding > 0

# Insert every parsed chain, printing the running index every 1000 chains.
# When padding is requested each document temporarily carries a
# '__padding__' string of that many '0' characters.
for (i, chain) in enumerate(vdj.parse_imgt(inputfile)):
    if not i % 1000:
        sys.stdout.write("%i " % i)
        sys.stdout.flush()
    doc = vdj.mongo.encode_chain(chain)
    if use_padding:
        doc['__padding__'] = '0' * args.padding
    chains.insert(doc, safe=True)

# delete padding from elements
if use_padding:
    chains.update({}, {"$unset" : {"__padding__" : 1}}, upsert=False, safe=True, multi=True)