Esempio n. 1
0
def write_airr(adata: AnnData, filename: Union[str, Path]) -> None:
    """Export :term:`IR` data to :term:`AIRR` Rearrangement `tsv` format.

    All cells obtained from `adata` must expose the same set of fields; the
    fields of the first cell define the columns of the output file. If there
    are no cells at all, the file is created with the writer's default fields.

    Parameters
    ----------
    adata
        annotated data matrix
    filename
        destination filename
    """
    airr_cells = to_airr_cells(adata)

    # Only the lookup of the first cell may legitimately fail (no cells at
    # all). Keeping the `try` this narrow ensures that an IndexError raised
    # while evaluating `.fields` is not mistaken for "empty output".
    try:
        first_cell = airr_cells[0]
    except IndexError:
        # case of an empty output file
        fields = None
    else:
        fields = first_cell.fields
        for tmp_cell in airr_cells[1:]:
            assert tmp_cell.fields == fields, "All rows of adata have the same fields."

    writer = airr.create_rearrangement(filename, fields=fields)
    try:
        for tmp_cell in airr_cells:
            for chain in tmp_cell.to_airr_records():
                # workaround for AIRR library writing out int field as floats
                # (if it happens to be a float)
                for f in chain:
                    if RearrangementSchema.type(f) == "integer":
                        chain[f] = int(chain[f])
                writer.write(chain)
    finally:
        # always release the output handle, even if a record fails to convert
        writer.close()
def _write_gene_usage(path, gene_counts, found):
	"""Write a `gene / count / percent` usage table to `path`.

	`gene_counts` maps a gene name to the number of reads assigned to it.
	Percentages are computed relative to `found`. The module-level `sep`
	is used as the column delimiter.
	"""
	with open(path, 'w') as handle:
		writer = csv.writer(handle, delimiter = sep)
		writer.writerow(["gene", "count", "percent"])
		for key in sorted(gene_counts.keys()):
			writer.writerow( [ key, gene_counts[key], "%4.2f" % (gene_counts[key] / float(found) * 100) ] )


def main():
	"""Parse the blast chunks, then merge the partial results into final outputs.

	Steps, in order:
	  1. run `parse_blast.py` on every chunk, either as an SGE array job
	     (`--cluster`) or through a local process pool (`--threads`);
	  2. concatenate the partial top-hit tables and rearrangement files,
	     writing the nt/aa fasta files as each record is read;
	  3. write gene usage tables and a summary log;
	  4. optionally launch scripts 1.4 / 1.5 (`--runClustering`,
	     `--runCellStatistics`);
	  5. delete intermediate files unless `--noclean` is set.

	Relies on module-level state (`arguments`, `prj_tree`, `prj_name`, `sep`,
	`qsub`, `SCRIPT_FOLDER`, `callParser`) defined elsewhere in the file.
	"""

	if not glob.glob("%s/%s_*.fasta" % (prj_tree.jgene, prj_name)):
		sys.exit("No jBlast output found!\n")

	maxFiles = len( glob.glob("%s/%s_*.fasta" % (prj_tree.vgene, prj_name)) )

	print( "curating junction and 3' end..." )

	if arguments['--cluster']:
		#assemble the complete parser command line before terminating it, so that
		#  optional flags end up on the command's line rather than after its newline
		command = "NUM=`printf \"%s\" $SGE_TASK_ID`\n%s/annotate/parse_blast.py --jmotif '%s' --nterm %s --chunk $NUM" % \
					( "%03d", SCRIPT_FOLDER, arguments['--jmotif'], arguments['--nterm'] )
		if arguments['--noFallBack']:
			command += " --noFallBack"
		command += "\n"

		#stdout goes to parse.o*, stderr to parse.e* (hence -o and -e)
		with open("%s/parse.sh"%prj_tree.jgene, 'w') as pbs:
			pbs.write( "#!/bin/bash\n#$ -N parse-%s\n#$ -l h_vmem=2G\n#$ -cwd\n#$ -o %s/parse.o$JOB_ID.$SGE_TASK_ID\n#$ -e %s/parse.e$JOB_ID.$SGE_TASK_ID\n\n%s\n" % (prj_name, prj_tree.annotate, prj_tree.annotate, command) )
		subprocess.call([qsub, '-sync', 'y', '-t', "1-%d"%maxFiles, "%s/parse.sh"%prj_tree.jgene])

	else: #do it locally
		parse_pool = Pool(arguments['--threads'])
		parse_pool.map(callParser, range(1,maxFiles+1))
		parse_pool.close()
		parse_pool.join()

	#ok, now collect all of the partial outputs and merge them
	print( "collecting information...")

	#open fasta outputs
	allV_aa	     = open( "%s/%s_allV.fa"	 % (prj_tree.aa, prj_name), "w" )
	allV_nt	     = open( "%s/%s_allV.fa"	 % (prj_tree.nt, prj_name), "w" )

	allJ_aa	     = open( "%s/%s_allJ.fa"	 % (prj_tree.aa, prj_name), "w" )
	allJ_nt	     = open( "%s/%s_allJ.fa"	 % (prj_tree.nt, prj_name), "w" )

	vj_aa	     = open( "%s/%s_goodVJ.fa"	 % (prj_tree.aa, prj_name), "w" )
	vj_nt	     = open( "%s/%s_goodVJ.fa"	 % (prj_tree.nt, prj_name), "w" )

	good_cdr3_aa = open( "%s/%s_goodCDR3.fa" % (prj_tree.aa, prj_name), "w" )
	good_cdr3_nt = open( "%s/%s_goodCDR3.fa" % (prj_tree.nt, prj_name), "w" )

	all_cdr3_aa  = open( "%s/%s_allCDR3.fa"	 % (prj_tree.aa, prj_name), "w" )
	all_cdr3_nt  = open( "%s/%s_allCDR3.fa"	 % (prj_tree.nt, prj_name), "w" )

	#also open final rearrangements tsv
	seq_stats = airr.create_rearrangement( "%s/%s_rearrangements.tsv"%(prj_tree.tables, prj_name), fields=['vj_in_frame','stop_codon','locus','c_call','junction_length','source_file','source_id','duplicate_count','length_raw','length_trimmed','indels','status','blast_identity','consensus_count','cell_id'])

	#everything that has to be closed once merging is over; the rearrangements
	#  writer is included so the tsv is complete on disk before 1.4/1.5 are called
	outputs = [ allV_aa, allV_nt, allJ_aa, allJ_nt, vj_aa, vj_nt,
				good_cdr3_aa, good_cdr3_nt, all_cdr3_aa, all_cdr3_nt, seq_stats ]

	#initiate overall counters
	raw_count, total = 0, 0
	counts = {'good':0,'nonproductive':0,'indel':0,'noCDR3':0,'stop':0,'noV':0,'noJ':0,'missingNterm':0,'chimera':0}

	dict_jcounts = Counter()
	dict_ccounts = Counter()
	dict_dcounts = Counter()

	#partial C / D top-hit tables are merged only if the first chunk's file exists
	c = os.path.isfile("%s/%s_C_001.txt" % (prj_tree.jgene, prj_name))
	d = os.path.isfile("%s/%s_D_001.txt" % (prj_tree.jgene, prj_name))

	#metadata fields copied to the fasta def line, in output order
	def_line_fields = [ 'v_call', 'd_call', 'j_call', 'locus', 'c_call', 'status',
						'junction_length', 'junction', 'junction_aa',
						'duplicate_count', 'consensus_count', 'cell_id' ]

	try:
		#iterate over subset rearrangement files and combine
		#include generating fasta output as appropriate
		for f_ind in range(1, maxFiles+1):

			#merge partial blast hit tables
			for wanted, table_tag, partial_tag in ( (True, "jgerm", "jtophit"), (d, "dgerm", "dtophit"), (c, "cgerm", "ctophit") ):
				if not wanted:
					continue
				with open( "%s/%s_%s_tophit.txt" % (prj_tree.tables, prj_name, table_tag), "a") as table:
					with open( "%s/%s_%03d.txt" % (prj_tree.jgene, partial_tag, f_ind), "r" ) as partial:
						table.write(partial.read())

			#go through partial rearrangements files
			for r in airr.read_rearrangement( "%s/rearrangements_%03d.tsv"%(prj_tree.internal, f_ind) ):

				seq_stats.write( r )

				#count j/d/c gene usages, each in its own counter and only
				#  when that particular call is present
				for field, tally in ( ('j_call', dict_jcounts), ('d_call', dict_dcounts), ('c_call', dict_ccounts) ):
					if r[field]:
						tally[ r[field].split(",")[0] ] += 1

				#count statuses
				counts[ r['status'] ] += 1
				total += 1
				raw_count = int( r['sequence_id'] ) #technically, this undercounts if the last one
													# isn't in the `correct_length` interval, but I
													# don't have a better solution that isn't super
													# kludgy right now

				#ok, now do sequence output
				# start by collecting metadata for fasta def line
				def_line = ">%s" % r['sequence_id']
				for field in def_line_fields:
					if not r[field] == '':
						def_line += " %s=%s" % (field, r[field])

				#work our way up the hierarchy, putting sequences in the appropriate files
				ungapped = re.sub( "-", "", r['sequence_alignment']) #reintroduces any frameshift errors in translation
																	 #  this has always been the behavior, but I wonder
																	 #  if I should change/update now that I am using
																	 #  proper alignments.

				if not r['status'] in ['noV', 'missingNterm', "chimera"]:
					translated = Seq.Seq(ungapped).translate()
					allV_nt.write( "%s\n%s\n" % (def_line, ungapped) )
					allV_aa.write( "%s\n%s\n" % (def_line, translated) )

					if not r['status'] == 'noJ':
						allJ_nt.write( "%s\n%s\n" % (def_line, ungapped) )
						allJ_aa.write( "%s\n%s\n" % (def_line, translated) )

						if not r['status'] == 'noCDR3':
							all_cdr3_nt.write( "%s\n%s\n" % (def_line, r['junction']) )
							all_cdr3_aa.write( "%s\n%s\n" % (def_line, r['junction_aa']) )

							if r['status'] == "good":
								vj_nt.write( "%s\n%s\n" % (def_line, ungapped) )
								vj_aa.write( "%s\n%s\n" % (def_line, translated) )
								good_cdr3_nt.write( "%s\n%s\n" % (def_line, r['junction']) )
								good_cdr3_aa.write( "%s\n%s\n" % (def_line, r['junction_aa']) )
	finally:
		#close outputs
		for out in outputs:
			out.close()

	#useful number
	found = total - counts['noV'] - counts['noJ'] - counts['chimera']

	#print out some statistics
	_write_gene_usage( "%s/%s_jgerm_stat.txt" %(prj_tree.tables, prj_name), dict_jcounts, found )

	if len(dict_ccounts) > 0:
		_write_gene_usage( "%s/%s_cgerm_stat.txt" %(prj_tree.tables, prj_name), dict_ccounts, found )

	if len(dict_dcounts) > 0:
		_write_gene_usage( "%s/%s_dgerm_stat.txt" %(prj_tree.tables, prj_name), dict_dcounts, found )

	message = "\nTotal raw reads: %d\nCorrect Length: %d\nV assigned: %d\nJ assigned: %d\nCDR3 assigned: %d\nIn-frame junction: %d\nNo indels: %d\nContinuous ORF with no stop codons: %d\n\n"  % \
								(raw_count, total, total-counts['noV']-counts['chimera'], found, found-counts['noCDR3'], found-counts['noCDR3']-counts['nonproductive'], found-counts['noCDR3']-counts['nonproductive']-counts['indel'], counts['good'])
	print( message )
	with open("%s/finalize_blast.log"%prj_tree.logs, "w") as handle:
		handle.write(message)

	# call 1.4 or 1.5 if requested
	if arguments['--runClustering']:
		cmd = "%s/annotate/1.4-cluster_sequences.py" % SCRIPT_FOLDER
		for opt in [ '--file', '--min1', '--min2', '--id', '--maxgaps', '--rearrangements', '--save' ]:
			if arguments[opt] is not None:
				cmd += " %s '%s'" % (opt, arguments[opt])
		if arguments['--runCellStatistics']:
			cmd += " --runCellStatistics"
		print( "Calling 1.4 with command line: %s" % cmd )
		os.system( cmd )
	elif arguments['--runCellStatistics']:
		cmd = "%s/annotate/1.5-single_cell_statistics.py" % SCRIPT_FOLDER
		for opt in [ '--rearrangements', '--save' ]:
			if arguments[opt] is not None:
				cmd += " %s '%s'" % (opt, arguments[opt])
		print( "Calling 1.5 with command line: %s" % cmd )
		os.system( cmd )

	#clean up!!
	oldFiles = glob.glob("%s/*txt"%prj_tree.vgene) + glob.glob("%s/*fasta"%prj_tree.vgene) +  glob.glob("%s/*txt"%prj_tree.jgene) + glob.glob("%s/*fasta"%prj_tree.jgene) + glob.glob("%s/*tsv"%prj_tree.jgene) + glob.glob("%s/lookup*"%prj_tree.internal)
	if len(oldFiles) > 0 and not arguments['--noclean']:
		for old in oldFiles:
			os.remove(old)
Esempio n. 3
0
def _percent_to_identity(percent):
	"""Turn a percent-divergence string (e.g. "2.5%") into an identity fraction formatted to 3 decimals ("0.975")."""
	return "%.3f" % ( 1 - float(re.sub("%","",percent))/100 )


def _guess_locus(v_call):
	"""Guess the locus from substrings of the V gene name; returns None if nothing matches."""
	if any( x in v_call for x in ["HV", "VH", "Vh", "vh", "heavy", "Heavy", "HEAVY"] ):
		return "IGH"
	if any( x in v_call for x in ["LV", "VL", "Vl", "vl", "lambda", "Lambda", "LAMBDA"] ):
		return "IGL"
	if any( x in v_call for x in ["KV", "VK", "Vk", "vk", "kappa", "Kappa", "KAPPA"] ):
		return "IGK"
	return None


def main():
	"""Convert a legacy `<project>_all_seq_stats.txt` table into an AIRR rearrangements tsv.

	Raw reads are taken from every fasta/fastq file in the current directory,
	trimmed sequences and junctions from the project's `allJ` / `allCDR3`
	nucleotide fasta files. Rows whose status is `wrong_length`, or whose raw
	read cannot be located, are skipped (the latter with a message on stderr).
	The finished file is validated and the program exits with an error
	message if validation fails.

	Relies on module-level `prj_tree` and `prj_name`.
	"""

	airrFile = airr.create_rearrangement( "%s/%s_rearrangements.tsv"%(prj_tree.tables, prj_name), fields=['vj_in_frame','stop_codon','locus','c_call','junction_length','source_file','source_id','length_raw','length_trimmed','indels','status','blast_identity','cluster_count','v_identity'])

	#try to vacuum up all possible raw sequences and hope it doesn't kill memory
	raw_seqs  = defaultdict( dict )
	file_list = glob.glob("*.fa") + glob.glob("*.fas") + glob.glob("*.fst") + glob.glob("*.fasta") + glob.glob("*.fna") + glob.glob("*.fq") + glob.glob("*.fastq")
	for myseq, myqual, file_name in generate_read_fasta_folder( file_list ):
		raw_seqs[file_name][myseq.seq_id] = myseq.seq

	#get trimmed sequences
	trim_seqs = load_fastas( "%s/%s_allJ.fa"%(prj_tree.nt, prj_name) )

	#get nt junctions
	junc_seqs = load_fastas( "%s/%s_allCDR3.fa"%(prj_tree.nt, prj_name) )

	#do the conversion
	with open( "%s/%s_all_seq_stats.txt"%(prj_tree.tables, prj_name), "r" ) as handle:
		oldFile = csv.reader( handle, delimiter="\t" )
		header = next(oldFile)
		for row in oldFile:
			status = row[11]
			if status == "wrong_length":
				continue

			if row[1] not in raw_seqs:
				sys.stderr.write("Couldn't find raw sequence file %s, %s will be dropped from converted file.\n"%(row[1],row[0]))
				continue
			elif row[2] not in raw_seqs[row[1]]:
				sys.stderr.write("Couldn't find raw sequence %s in file %s; %s will be dropped from converted file.\n"%(row[2],row[1],row[0]))
				continue

			r = dict()

			r['sequence']		= raw_seqs[ row[1] ][ row[2] ]
			r['sequence_alignment'] = str( trim_seqs.get( row[0], SeqRecord(seq="") ).seq )
			r['junction']		= str( junc_seqs.get( row[0], SeqRecord(seq="") ).seq )

			r['sequence_id']	= row[0]
			r['source_file']	= row[1]
			r['source_id']		= row[2]
			r['length_raw']		= row[3]
			if not row[4] == "NA":
				r['length_trimmed']  = row[4]
			if not row[5] == "NA":
				r['v_call']	     = row[5]
			if row[6] not in ["NA", "not_found"]:
				r['d_call']	     = row[6]
			if not row[7] == "NA":
				r['j_call']	     = row[7]
			if not row[9] == "NA":
				r['indels']	     = row[9]
			if not row[10] == "NA":
				r['stop_codon']	     = row[10]
			r['status']		     = status
			if not row[12] == "NA":
				r['blast_identity']  = _percent_to_identity(row[12])
			if not row[13] == "NA":
				r['junction_length'] = int(row[13])+6
			if not row[15] == "NA":
				r['junction_aa']     = row[15]

			#optional trailing columns: only read an index after checking that
			#  both the header and this row are long enough to contain it
			if len(header) > 16 and len(row) > 16:
				if header[16]=="Unique":
					if row[16] == "T":
						r['status']	   = "unique"
						if len(row) > 17:
							r['cluster_count'] = row[17]
					if len(row) > 18 and not row[18]=="NA":
						r['v_identity']	   = _percent_to_identity(row[18])
				elif header[16] == "V_div" and not row[16]=="NA":
					r['v_identity']		   = _percent_to_identity(row[16])

			#figure out in-frame/productive from the original status column
			#  (row[11]); row[10] is the stop codon flag and never holds these values
			if status == "good":
				r['vj_in_frame'] = "T"
				r['productive']	 = "T"
			elif status == "stop":
				r['vj_in_frame'] = "T"
				r['productive']	 = "F"
			elif status == "nonproductive":
				r['vj_in_frame'] = "F"
				r['productive']	 = "F"
			elif status == "indel":
				r['productive']	 = "F"

			#figure out locus
			locus = _guess_locus(row[5])
			if locus is not None:
				r['locus'] = locus

			airrFile.write(r)

	airrFile.close()
	valid = airr.validate_rearrangement( "%s/%s_rearrangements.tsv"%(prj_tree.tables, prj_name) )
	if not valid:
		sys.exit( "ERROR: something went wrong, %s/%s_rearrangements.tsv failed validation!"%(prj_tree.tables, prj_name) )
Esempio n. 4
0
def _percent_to_identity(percent):
    """Turn a percent-divergence string (e.g. "2.5%") into an identity fraction formatted to 3 decimals ("0.975")."""
    return "%.3f" % (1 - float(re.sub("%", "", percent)) / 100)


def _guess_locus(v_call):
    """Guess the locus from substrings of the V gene name; returns None if nothing matches."""
    if any(x in v_call for x in
           ["HV", "VH", "Vh", "vh", "heavy", "Heavy", "HEAVY"]):
        return "IGH"
    if any(x in v_call for x in
           ["LV", "VL", "Vl", "vl", "lambda", "Lambda", "LAMBDA"]):
        return "IGL"
    if any(x in v_call for x in
           ["KV", "VK", "Vk", "vk", "kappa", "Kappa", "KAPPA"]):
        return "IGK"
    return None


def main():
    """Convert a legacy `<project>_all_seq_stats.txt` table into an AIRR rearrangements tsv.

    Raw reads are taken from every fasta/fastq file in the current directory,
    trimmed sequences and junctions from the project's `allJ` / `allCDR3`
    nucleotide fasta files. Rows whose status is `wrong_length`, or whose raw
    read cannot be located, are skipped (the latter with a message on stderr).
    The finished file is validated and the program exits with an error
    message if validation fails.

    Relies on module-level `prj_tree` and `prj_name`.
    """

    airrFile = airr.create_rearrangement(
        "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name),
        fields=[
            'vj_in_frame', 'stop_codon', 'locus', 'c_call', 'junction_length',
            'source_file', 'source_id', 'length_raw', 'length_trimmed',
            'indels', 'status', 'blast_identity', 'cluster_count', 'v_identity'
        ])

    #try to vacuum up all possible raw sequences and hope it doesn't kill memory
    raw_seqs = defaultdict(dict)
    file_list = glob.glob("*.fa") + glob.glob("*.fas") + glob.glob(
        "*.fst") + glob.glob("*.fasta") + glob.glob("*.fna") + glob.glob(
            "*.fq") + glob.glob("*.fastq")
    for myseq, myqual, file_name in generate_read_fasta_folder(file_list):
        raw_seqs[file_name][myseq.seq_id] = myseq.seq

    #get trimmed sequences
    trim_seqs = load_fastas("%s/%s_allJ.fa" % (prj_tree.nt, prj_name))

    #get nt junctions
    junc_seqs = load_fastas("%s/%s_allCDR3.fa" % (prj_tree.nt, prj_name))

    #do the conversion
    with open("%s/%s_all_seq_stats.txt" % (prj_tree.tables, prj_name),
              "r") as handle:
        oldFile = csv.reader(handle, delimiter="\t")
        header = next(oldFile)
        for row in oldFile:
            status = row[11]
            if status == "wrong_length":
                continue

            if row[1] not in raw_seqs:
                sys.stderr.write(
                    "Couldn't find raw sequence file %s, %s will be dropped from converted file.\n"
                    % (row[1], row[0]))
                continue
            elif row[2] not in raw_seqs[row[1]]:
                sys.stderr.write(
                    "Couldn't find raw sequence %s in file %s; %s will be dropped from converted file.\n"
                    % (row[2], row[1], row[0]))
                continue

            r = dict()

            r['sequence'] = raw_seqs[row[1]][row[2]]
            r['sequence_alignment'] = str(
                trim_seqs.get(row[0], SeqRecord(seq="")).seq)
            r['junction'] = str(junc_seqs.get(row[0], SeqRecord(seq="")).seq)

            r['sequence_id'] = row[0]
            r['source_file'] = row[1]
            r['source_id'] = row[2]
            r['length_raw'] = row[3]
            if not row[4] == "NA":
                r['length_trimmed'] = row[4]
            if not row[5] == "NA":
                r['v_call'] = row[5]
            if row[6] not in ["NA", "not_found"]:
                r['d_call'] = row[6]
            if not row[7] == "NA":
                r['j_call'] = row[7]
            if not row[9] == "NA":
                r['indels'] = row[9]
            if not row[10] == "NA":
                r['stop_codon'] = row[10]
            r['status'] = status
            if not row[12] == "NA":
                r['blast_identity'] = _percent_to_identity(row[12])
            if not row[13] == "NA":
                r['junction_length'] = int(row[13]) + 6
            if not row[15] == "NA":
                r['junction_aa'] = row[15]

            # Optional trailing columns: only read an index after checking
            # that both the header and this row are long enough to hold it.
            if len(header) > 16 and len(row) > 16:
                if header[16] == "Unique":
                    if row[16] == "T":
                        r['status'] = "unique"
                        if len(row) > 17:
                            r['cluster_count'] = row[17]
                    if len(row) > 18 and not row[18] == "NA":
                        r['v_identity'] = _percent_to_identity(row[18])
                elif header[16] == "V_div" and not row[16] == "NA":
                    r['v_identity'] = _percent_to_identity(row[16])

            # Figure out in-frame/productive from the original status column
            # (row[11]); row[10] is the stop codon flag and never holds
            # these values.
            if status == "good":
                r['vj_in_frame'] = "T"
                r['productive'] = "T"
            elif status == "stop":
                r['vj_in_frame'] = "T"
                r['productive'] = "F"
            elif status == "nonproductive":
                r['vj_in_frame'] = "F"
                r['productive'] = "F"
            elif status == "indel":
                r['productive'] = "F"

            #figure out locus
            locus = _guess_locus(row[5])
            if locus is not None:
                r['locus'] = locus

            airrFile.write(r)

    airrFile.close()
    valid = airr.validate_rearrangement("%s/%s_rearrangements.tsv" %
                                        (prj_tree.tables, prj_name))
    if not valid:
        sys.exit(
            "ERROR: something went wrong, %s/%s_rearrangements.tsv failed validation!"
            % (prj_tree.tables, prj_name))
Esempio n. 5
0
    query['size'] = 1000
    query['from'] = 0

    cnt = 0
    while True:
        # send the request
        resp = requests.post(host_url + '/rearrangement', json = query)
        data = resp.json()
        rearrangements = data['Rearrangement']

        # Open a file for writing the rearrangements. We do this here
        # because we need to know the full set of fields being
        # returned from the data repository, otherwise by default only
        # the required fields will be written to the file.
        if first:
            out_file = airr.create_rearrangement('rearrangements.tsv', fields=rearrangements[0].keys())
            first = False

        # save the rearrangements to a file
        for row in rearrangements:
            out_file.write(row)

        # stop when downloaded at most 10,000 rearrangements or if the
        # response doesn't return the full amount, which indicates no more
        # data. If you wanted to download all rearrangements, keep
        # looping until zero rearrangements are returned from the query.
        cnt += len(rearrangements)
        if cnt >= 10000 or len(rearrangements) < 1000:
            break

        # Need to update the from parameter to get the next chunk
Esempio n. 6
0
def main():
	"""Parse the V/J (and optional D/C) blast results of one chunk into AIRR records.

	The chunk is selected by `arguments['--chunk']`. For every sequence in the
	chunk's fasta file one record is written to
	`rearrangements_<chunk>.tsv`, with a `status` of `noV`, `noJ`, `noCDR3`,
	`nonproductive`, `indel`, `stop`, `missingNterm` or `good`.

	Relies on module-level state defined elsewhere in the file: `arguments`,
	`prj_tree`, `prj_name`, `sep`, `PARSED_BLAST_HEADER`, `dict_v`, `dict_j`,
	`get_top_hits` and `find_cdr3_borders`.
	"""

	print( "Processing chunk %s..." % arguments['--chunk'])

	#get raw seq stats from temp table
	# NOTE(review): this handle, and the ones opened for the top-hit writers
	#  below, are never explicitly closed in this function
	raw = csv.reader(open("%s/lookup_%s.txt" % (prj_tree.internal, arguments['--chunk']),'r'), delimiter=sep)

	# NOTE(review): raw_count, noV and noJ are only incremented and f_ind is
	#  never used below; only total, found and counts reach the final message
	raw_count, total, found, noV, noJ, f_ind = 0, 0, 0, 0, 0, 1
	counts = {'good':0,'nonproductive':0,'indel':0,'noCDR3':0,'stop':0}
	#the missingNterm status can only be assigned in "discard" mode (see below)
	if arguments['--nterm'] == "discard":
		counts["missingNterm"]=0

	#J top hits for this chunk; the writer is handed to get_top_hits below
	writer = csv.writer(open("%s/jtophit_%s.txt" %(prj_tree.jgene, arguments['--chunk']), "w"), delimiter = sep)
	writer.writerow(PARSED_BLAST_HEADER)
	dict_jcounts = dict()
	dict_ccounts = dict()
	dict_dcounts = dict()
		
	#constant region hits are processed only if a C blast table exists for this chunk
	c = False
	if os.path.isfile("%s/%s_C_%s.txt" % (prj_tree.jgene, prj_name, arguments['--chunk'])):
		c = True
		cWriter = csv.writer(open("%s/ctophit_%s.txt" %(prj_tree.jgene, arguments['--chunk']), "w"), delimiter = sep)
		cWriter.writerow(PARSED_BLAST_HEADER)

	#same for D gene hits
	d = False
	if os.path.isfile("%s/%s_D_%s.txt" % (prj_tree.jgene, prj_name, arguments['--chunk'])):
		d = True
		dWriter = csv.writer(open("%s/dtophit_%s.txt" %(prj_tree.jgene, arguments['--chunk']), "w"), delimiter = sep)
		dWriter.writerow(PARSED_BLAST_HEADER)


	#per-chunk AIRR output
	seq_stats = airr.create_rearrangement( "%s/rearrangements_%s.tsv"%(prj_tree.internal, arguments['--chunk']), fields=['vj_in_frame','stop_codon','locus','c_call','junction_length','source_file','source_id','duplicate_count','length_raw','length_trimmed','indels','status','blast_identity','consensus_count','cell_id'])

	# presumably get_top_hits returns (best hit per read, other equally good germlines
	#  per read, per-gene counts) -- inferred from how the results are used below; verify
	# NOTE(review): dict_vcounts is not used afterwards
	dict_vgerm_aln, dict_other_vgerms, dict_vcounts = get_top_hits("%s/%s_%s.txt"%(prj_tree.vgene, prj_name, arguments['--chunk']) )
	dict_jgerm_aln, dict_other_jgerms, dict_jcounts = get_top_hits("%s/%s_%s.txt"%(prj_tree.jgene, prj_name, arguments['--chunk']), topHitWriter=writer, dict_germ_count=dict_jcounts )

	if c:
		#C hits are constrained per read by the query end of its J hit
		minCStartPos = dict( [ (x, dict_jgerm_aln[x].qend) for x in dict_jgerm_aln.keys() ] )
		dict_cgerm_aln, dict_other_cgerms, dict_ccounts = get_top_hits("%s/%s_C_%s.txt"%(prj_tree.jgene, prj_name, arguments['--chunk']), topHitWriter=cWriter, dict_germ_count=dict_ccounts, minQStart=minCStartPos )

	if d:
		#D hits are constrained per read by the query start of its J hit
		maxDEndPos = dict( [ (x, dict_jgerm_aln[x].qstart) for x in dict_jgerm_aln.keys() ] )
		dict_dgerm_aln, dict_other_dgerms, dict_dcounts = get_top_hits("%s/%s_D_%s.txt"%(prj_tree.jgene, prj_name, arguments['--chunk']), topHitWriter=dWriter, dict_germ_count=dict_dcounts, maxQEnd=maxDEndPos )

	for entry in SeqIO.parse( "%s/%s_%s.fasta" % (prj_tree.vgene, prj_name, arguments['--chunk']), "fasta"):
		total += 1

		raw_stats = next(raw)
		raw_count += 1
			
		#advance the lookup table until it is in sync with the fasta entry
		while not entry.id == raw_stats[0]:
			#we found a read that did not meet the length cut-off
			raw_stats = next(raw)
			raw_count += 1

				
		#fields that are filled in regardless of the outcome
		rearrangement = dict()
		rearrangement['sequence_id'] = raw_stats[0]
		rearrangement['source_file'] = raw_stats[1]
		rearrangement['source_id']   = raw_stats[2]
		rearrangement['length_raw']  = raw_stats[3]
		rearrangement['sequence']    = str(entry.seq)

		if not raw_stats[4] == "NA":
			rearrangement['duplicate_count'] = raw_stats[4]
		if not raw_stats[5] == "NA":
			rearrangement['consensus_count'] = raw_stats[5]
		if not raw_stats[6] == "NA":
			rearrangement['cell_id'] = raw_stats[6]
				
		if not entry.id in dict_vgerm_aln:
			#no V hit: nothing more can be annotated
			noV+=1
			rearrangement['status'] = 'noV'
			seq_stats.write(rearrangement)
		elif not entry.id in dict_jgerm_aln:
			#V hit but no J hit: report the V-aligned part of the read only
			noJ+=1
			myV = dict_vgerm_aln[entry.id]
			entry.seq = entry.seq[ myV.qstart - 1 : myV.qend ]
			if (myV.strand == 'minus'):
				entry.seq = entry.seq.reverse_complement()
				rearrangement['rev_comp']       = "T"
			else:
				rearrangement['rev_comp']       = "F"
			myVgenes = ",".join( [myV.sid] + dict_other_vgerms.get(entry.id,[]) )
			
			#locus is guessed from substrings of the V gene name
			vlocus = ""
			if any( x in myV.sid for x in ["HV", "VH", "Vh", "vh", "heavy", "Heavy", "HEAVY"] ):
				vlocus = "IGH"
			elif any( x in myV.sid for x in ["LV", "VL", "Vl", "vl", "lambda", "Lambda", "LAMBDA"] ):
				vlocus = "IGL"
			elif any( x in myV.sid for x in ["KV", "VK", "Vk", "vk", "kappa", "Kappa", "KAPPA"] ):
				vlocus = "IGK"

			rearrangement['v_call'] = myVgenes
			rearrangement['locus']  = vlocus
			rearrangement['productive'] = "F"
			rearrangement['status'] = 'noJ'
			rearrangement['sequence_alignment'] = str(entry.seq)
			seq_stats.write(rearrangement)

		else:
				
			#both V and J were found
			found += 1
			myV = dict_vgerm_aln[entry.id]
			myJ = dict_jgerm_aln[entry.id]
			added5 = 0        #number of bases prepended 5' of the V alignment (set in the --nterm branches)
			productive = "T"  #reported as vj_in_frame
			indel = "F"
			stop = "F"
			cdr3 = True
			
			#get actual V(D)J sequence
			v_len = myV.qend - (myV.qstart-1) #need to use qstart and qend instead of alignment to account for gaps

			#try to recover 3' of J
			# i.e. if the J alignment stops before the end of the germline J and the read
			#  is long enough, extend vdj_len by the missing germline bases
			if myJ.send < len(dict_j[myJ.sid].seq) and \
				 ( (myV.strand == "plus" and myV.qstart + v_len + myJ.qend + (len(dict_j[myJ.sid].seq)-myJ.send) <= len(entry.seq)) or \
					(myV.strand == "minus" and myV.qend - (v_len + myJ.qend + (len(dict_j[myJ.sid].seq)-myJ.send)) >= 0) ):
					vdj_len = v_len + myJ.qend + (len(dict_j[myJ.sid].seq) - myJ.send)
			else:
				vdj_len = v_len + myJ.qend

			#const_seq is whatever follows the V(D)J region in read orientation; it must be
			#  taken before entry.seq is trimmed below
			const_seq = ""
			if (myV.strand == 'plus'):
				const_seq = str( entry.seq[myV.qstart+vdj_len-1 : ] )
				if myV.sstart > 1:
					#alignment does not start at the first germline base: handle per --nterm
					if arguments['--nterm'] == "extend":
						#keep read bases upstream of the alignment, as many as are available
						if myV.qstart >= myV.sstart:
							entry.seq = entry.seq[ myV.qstart - myV.sstart : myV.qstart + vdj_len - 1 ]
							added5 = myV.sstart - 1
						else:
							entry.seq = entry.seq[  : myV.qstart + vdj_len - 1 ]
							added5 = myV.qstart - 1
					elif arguments['--nterm'] == "germline":
						#fill the missing 5' end with germline sequence
						entry.seq = dict_v[myV.sid].seq[ 0 : myV.sstart-1 ] + entry.seq[ myV.qstart - 1 : myV.qstart + vdj_len - 1 ]
						added5 = myV.sstart - 1
					else:
								entry.seq = entry.seq[ myV.qstart - 1 : myV.qstart + vdj_len - 1 ]

				else: #blast found full V gene
					entry.seq = entry.seq[ myV.qstart - 1 : myV.qstart + vdj_len - 1 ]

			else: #minus strand
				const_seq = str( entry.seq[ : myV.qend-vdj_len].reverse_complement() )
				if myV.send > 1:
					#mirror image of the plus strand handling above
					if arguments['--nterm'] == "extend":
						if len(entry.seq)-myV.qend >= myV.send-1:
							entry.seq = entry.seq[ myV.qend - vdj_len : myV.qend+myV.send-1 ].reverse_complement()
							added5 = myV.send - 1
						else:
							#added5 has to be computed before entry.seq is replaced
							added5 = len(entry.seq) - myV.qend
							entry.seq = entry.seq[ myV.qend - vdj_len :  ].reverse_complement()
					elif arguments['--nterm'] == "germline":
						entry.seq = dict_v[myV.sid].seq[ 0 : myV.send-1 ] + entry.seq[ myV.qend - vdj_len : myV.qend ].reverse_complement()
						added5 = myV.send - 1
					else:
						entry.seq = entry.seq[ myV.qend - vdj_len : myV.qend ].reverse_complement()

				else: #blast found full V gene
					entry.seq = entry.seq[ myV.qend - vdj_len : myV.qend ].reverse_complement()

			#get CDR3 boundaries
			# the borders are used below as offsets into the sequence after the added 5' bases
			cdr3_start,cdr3_end,WF_motif = find_cdr3_borders(myV.sid,str(dict_v[myV.sid].seq), v_len, min(myV.sstart, myV.send), max(myV.sstart, myV.send), str(dict_j[myJ.sid].seq), myJ.sstart, myJ.qstart, myJ.gaps, str(entry.seq[ added5 : ])) #min and max statments take care of switching possible minus strand hit
			cdr3_seq = entry.seq[ added5+cdr3_start : added5+cdr3_end ]

			#push the sequence into frame for translation, if need be
			v_frame = ( min([myV.sstart, myV.send]) - added5 ) % 3
			five_prime_add = (v_frame-1) % 3
			entry.seq = 'N' * five_prime_add + entry.seq 

			#prevent BioPython errors by trimming to last full codon
			#if (len(entry.seq) % 3) > 0:
			# entry.seq = entry.seq [ : -1 * (len(entry.seq) % 3) ]

			#check for stop codons
			if '*' in entry.seq.translate():
				stop = "T"

			#check for in-frame junction
			if len(cdr3_seq) % 3 != 0:
				productive = "F"
			else: #even if recombination looks ok, might be (sequencing) indels in V and/or J
				j_frame = 3 - ( ( WF_motif - myJ.sstart ) % 3 ) #j genes start in different frames, so calculate based on position of conserved W/F found by the cdr3 subroutine above
				frame_shift = (v_len + myJ.qstart + added5 - 1) % 3

				if (v_frame + frame_shift) % 3 != j_frame % 3:
					indel = "T" 
				else:
					#use blast gaps to detect frame shift in-dels
					#most of these have stop codons or other sequence problems, but we'll catch a few extra this way
					if (abs(myV.send-myV.sstart)-(myV.qend-myV.qstart)) % 3 != 0 or ((myJ.send-myJ.sstart)-(myJ.qend-myJ.qstart)) % 3 != 0:
						indel = "T"

			#make sure cdr3 boundaries make sense
			if (cdr3_end<=cdr3_start or cdr3_end>vdj_len or cdr3_start<0):
				cdr3 = False

			#the first problem found, in this order of precedence, becomes the status
			status = "good"
			if not cdr3:
				status = "noCDR3"
			elif productive == "F":
				status = "nonproductive"
			elif indel == "T":
				status = "indel"
			elif stop == "T":
				status = "stop"
			elif arguments['--nterm'] == "discard" and min(myV.sstart,myV.send) > 1:
				status = "missingNterm"

			#add germline assignments to fasta description and write to disk
			myVgenes = ",".join( [myV.sid] + dict_other_vgerms.get(entry.id,[]) )
			myJgenes = ",".join( [myJ.sid] + dict_other_jgerms.get(entry.id,[]) )
				
			myDgenes = ""
			if d:
				if entry.id in dict_dgerm_aln:
					myDgenes = ",".join( [dict_dgerm_aln[entry.id].sid] + dict_other_dgerms.get(entry.id,[]) )

			#isotype: use the C blast hit if there is one; otherwise (unless --noFallBack)
			#  guess it from the first bases of the sequence following the J gene.
			#  re.match anchors at the start of const_seq. The two fallback chains are identical.
			myCgenes = ""
			if c:
				if entry.id in dict_cgerm_aln:
					myCgenes = ",".join( [dict_cgerm_aln[entry.id].sid] + dict_other_cgerms.get(entry.id,[]) )
				elif not arguments['--noFallBack']:
					if re.match("C[CT]", const_seq):
						myCgenes = "IGHG" #could also be IgE, but I'm assuming that's rare
					elif re.match("GGA", const_seq):
						myCgenes = "IGHM"
					elif re.match("CAT", const_seq):
						myCgenes = "IGHA"
					elif re.match("CAC", const_seq):
						myCgenes = "IGHD"
			elif not arguments['--noFallBack']:
				if re.match("C[CT]", const_seq):
					myCgenes = "IGHG" #could also be IgE, but I'm assuming that's rare
				elif re.match("GGA", const_seq):
					myCgenes = "IGHM"
				elif re.match("CAT", const_seq):
					myCgenes = "IGHA"
				elif re.match("CAC", const_seq):
					myCgenes = "IGHD"
						
			#locus is guessed from substrings of the V gene name
			vlocus = ""
			if any( x in myV.sid for x in ["HV", "VH", "Vh", "vh", "heavy", "Heavy", "HEAVY"] ):
				vlocus = "IGH"
			elif any( x in myV.sid for x in ["LV", "VL", "Vl", "vl", "lambda", "Lambda", "LAMBDA"] ):
				vlocus = "IGL"
			elif any( x in myV.sid for x in ["KV", "VK", "Vk", "vk", "kappa", "Kappa", "KAPPA"] ):
				vlocus = "IGK"
					
			#do AIRR output
			if myV.strand == "plus":
				rearrangement['rev_comp']       = "F"
			else:
				rearrangement['rev_comp']       = "T"
			#only sequences with status "good" are reported as productive
			if status == "good":
				rearrangement['productive']     = "T"
			else:
				rearrangement['productive']     = "F"
			rearrangement['vj_in_frame']        = productive
			rearrangement['stop_codon']         = stop
			rearrangement['locus']              = vlocus
			rearrangement['v_call']             = myVgenes
			rearrangement['j_call']             = myJgenes
			rearrangement['d_call']             = myDgenes
			rearrangement['c_call']             = myCgenes
			rearrangement['sequence_alignment'] = str(entry.seq)
			# NOTE(review): junction and junction_aa are stored as sequence objects, not str,
			#  unlike sequence_alignment above -- confirm the writer serializes them as intended
			rearrangement['junction']           = cdr3_seq
			rearrangement['junction_aa']        = cdr3_seq.translate()
			rearrangement['junction_length']    = len(cdr3_seq)
			rearrangement['length_trimmed']     = len(entry.seq)
			rearrangement['indels']             = indel
			rearrangement['status']             = status
			rearrangement['blast_identity']     = "%.3f" % (myV.identity/100.0)
				
			seq_stats.write(rearrangement)

			#only reads with both V and J contribute to the status counts
			counts[status] += 1

	print( "chunk %s: %d done, found %d; %d good..." %(arguments['--chunk'], total, found, counts['good']) )

	seq_stats.close()
Esempio n. 7
0
# Read an existing rearrangements file and dump its field lists, followed
# by every row.
data = airr.read_rearrangement('toy_data.tsv')
print(data.fields)
print(data.external_fields)
for r in data:
    print(r)

# Create a new rearrangements file with an intermediate parser
# Technically, the parser tool should be reading the VDJ rearrangements
# output file, parsing it, then writing the row data.
print('*****')
print('*****')
print('Create new rearrangements file.')
print('*****')
print('*****')
# The input is opened again rather than reusing the reader that was
# iterated above; the new file is created with the same field list.
data = airr.read_rearrangement('toy_data.tsv')
newd = airr.create_rearrangement('my_data.tsv', fields=data.fields)
print(newd.fields)
print(newd.external_fields)
for r in data:
    newd.write(r)
newd.close()

# Read the copy back and print it, for comparison with the first dump.
data = airr.read_rearrangement('my_data.tsv')
print(data.fields)
print(data.external_fields)
for r in data:
    print(r)

# create a derived rearrangements file with additional annotation
print('*****')
print('*****')
Esempio n. 8
0
def _write_germ_stats(path, germ_counts, found, delimiter):
	"""Write a gene / count / percent table for one germline gene type.

	path        - destination file; overwritten if it already exists
	germ_counts - mapping of gene name to its hit count
	found       - denominator of the percent column
	delimiter   - column separator handed to csv.writer

	The percent column is written as 0.00 when `found` is zero, so an input
	with J hits but no read carrying both V and J no longer raises
	ZeroDivisionError.
	"""
	with open(path, 'w') as handle:
		stat_writer = csv.writer(handle, delimiter = delimiter)
		stat_writer.writerow(["gene", "count", "percent"])
		for key in sorted(germ_counts.keys()):
			percent = germ_counts[key] / float(found) * 100 if found > 0 else 0.0
			stat_writer.writerow([ key, germ_counts[key], "%4.2f" % percent ])


def main():
	"""Combine V and J (and optional C/D) blast results into final output files.

	Relies on names defined elsewhere in the file: prj_tree, prj_name,
	arguments, sep, dict_v, dict_j, PARSED_BLAST_HEADER, get_top_hits and
	find_cdr3_borders.

	Steps, in order:
	  * exit if no "<jgene>/<prj_name>_*.fasta" file exists;
	  * open the FASTA outputs, the top-hit tables and the AIRR
	    rearrangements file;
	  * walk the numbered chunks "<vgene>/<prj_name>_NNN.fasta" for as long as
	    the next one exists and, for every sequence, record it as noV, noJ or
	    a full V(D)J hit (trimmed, put in frame, CDR3 located, status set);
	  * write per-gene statistics tables and a summary log;
	  * delete the intermediate blast files unless --noclean was given.
	"""

	if not glob.glob("%s/%s_*.fasta" % (prj_tree.jgene, prj_name)):
		sys.exit("No jBlast output found!\n")
	
	print( "curating junction and 3' end..." )


	allV_aa	     = open ("%s/%s_allV.fa"	 % (prj_tree.aa, prj_name), "w" )
	allV_nt	     = open( "%s/%s_allV.fa"	 % (prj_tree.nt, prj_name), "w" )

	allJ_aa	     = open( "%s/%s_allJ.fa"	 % (prj_tree.aa, prj_name), "w" )
	allJ_nt	     = open( "%s/%s_allJ.fa"	 % (prj_tree.nt, prj_name), "w" )

	vj_aa	     = open( "%s/%s_goodVJ.fa"	 % (prj_tree.aa, prj_name), "w" )
	vj_nt	     = open( "%s/%s_goodVJ.fa"	 % (prj_tree.nt, prj_name), "w" )

	good_cdr3_aa = open( "%s/%s_goodCDR3.fa" % (prj_tree.aa, prj_name), "w" )
	good_cdr3_nt = open( "%s/%s_goodCDR3.fa" % (prj_tree.nt, prj_name), "w" )

	all_cdr3_nt  = open( "%s/%s_allCDR3.fa"	 % (prj_tree.nt, prj_name), "w" )

	#every plain file handle opened by this function is tracked here so that it
	#	can be closed (and flushed) once the main loop is done
	open_handles = [ allV_aa, allV_nt, allJ_aa, allJ_nt, vj_aa, vj_nt, good_cdr3_aa, good_cdr3_nt, all_cdr3_nt ]


	#get raw seq stats from temp table
	raw_handle = open("%s/id_lookup.txt" % prj_tree.internal,'r')
	open_handles.append(raw_handle)
	raw = csv.reader(raw_handle, delimiter=sep)


	raw_count, total, found, noV, noJ, f_ind  = 0, 0, 0, 0, 0, 1
	counts = {'good':0,'nonproductive':0,'indel':0,'noCDR3':0,'stop':0}
	if arguments['--nterm'] == "discard":
		counts["missingNterm"]=0

	j_tophit_handle = open("%s/%s_jgerm_tophit.txt" %(prj_tree.tables, prj_name), "w")
	open_handles.append(j_tophit_handle)
	writer = csv.writer(j_tophit_handle, delimiter = sep)
	writer.writerow(PARSED_BLAST_HEADER)
	dict_jcounts = dict()
	dict_ccounts = dict()
	dict_dcounts = dict()
		
	#constant region hits are optional; their presence is detected from the first chunk
	c = False
	if os.path.isfile("%s/%s_C_001.txt" % (prj_tree.jgene, prj_name)):
		c = True
		c_tophit_handle = open("%s/%s_cgerm_tophit.txt" %(prj_tree.tables, prj_name), "w")
		open_handles.append(c_tophit_handle)
		cWriter = csv.writer(c_tophit_handle, delimiter = sep)
		cWriter.writerow(PARSED_BLAST_HEADER)

	#same for D gene hits
	d = False
	if os.path.isfile("%s/%s_D_001.txt" % (prj_tree.jgene, prj_name)):
		d = True
		d_tophit_handle = open("%s/%s_dgerm_tophit.txt" %(prj_tree.tables, prj_name), "w")
		open_handles.append(d_tophit_handle)
		dWriter = csv.writer(d_tophit_handle, delimiter = sep)
		dWriter.writerow(PARSED_BLAST_HEADER)


	seq_stats = airr.create_rearrangement( "%s/%s_rearrangements.tsv"%(prj_tree.tables, prj_name), fields=['vj_in_frame','stop_codon','locus','c_call','junction_length','source_file','source_id','duplicate_count','length_raw','length_trimmed','indels','status','blast_identity'])

	
	#process chunk f_ind = 1, 2, 3... until the next numbered fasta file is missing
	while os.path.isfile("%s/%s_%03d.fasta" % (prj_tree.vgene, prj_name, f_ind)):

		dict_vgerm_aln, dict_other_vgerms, dict_vcounts	 =  get_top_hits("%s/%s_%03d.txt"%(prj_tree.vgene, prj_name, f_ind) )
		dict_jgerm_aln, dict_other_jgerms, dict_jcounts	 =  get_top_hits("%s/%s_%03d.txt"%(prj_tree.jgene, prj_name, f_ind), topHitWriter=writer, dict_germ_count=dict_jcounts )

		if c:
			#passed as minQStart: the end of each read's J hit on the query
			minCStartPos = { x: dict_jgerm_aln[x].qend for x in dict_jgerm_aln }
			dict_cgerm_aln, dict_other_cgerms, dict_ccounts	 =  get_top_hits("%s/%s_C_%03d.txt"%(prj_tree.jgene, prj_name, f_ind), topHitWriter=cWriter, dict_germ_count=dict_ccounts, minQStart=minCStartPos )

		if d:
			#passed as maxQEnd: the start of each read's J hit on the query
			maxDEndPos = { x: dict_jgerm_aln[x].qstart for x in dict_jgerm_aln }
			dict_dgerm_aln, dict_other_dgerms, dict_dcounts	 =  get_top_hits("%s/%s_D_%03d.txt"%(prj_tree.jgene, prj_name, f_ind), topHitWriter=dWriter, dict_germ_count=dict_dcounts, maxQEnd=maxDEndPos )

		for entry in SeqIO.parse( "%s/%s_%03d.fasta" % (prj_tree.vgene, prj_name, f_ind), "fasta"):
			total += 1

			raw_stats = next(raw)
			raw_count += 1
			
			while not entry.id == raw_stats[0]:
				#we found a read that did not meet the length cut-off
				raw_stats = next(raw)
				raw_count += 1

				
			#lookup columns as used here: 0 = sequence id, 1 = source file, 2 = source id,
			#	3 = duplicate count (or "NA"), 4 = raw length
			rearrangement = dict()
			rearrangement['sequence_id']     = raw_stats[0]
			rearrangement['source_file']     = raw_stats[1]
			rearrangement['source_id']       = raw_stats[2]
			rearrangement['length_raw']      = raw_stats[4]
			rearrangement['sequence']        = str(entry.seq)

			if not raw_stats[3] == "NA":
				rearrangement['duplicate_count'] = raw_stats[3]
				entry.description = "duplicate_count=%s" % raw_stats[3]
			else:
				entry.description = "" #just in case
				
			if not entry.id in dict_vgerm_aln:
				noV+=1
				rearrangement['status']	= 'noV'
				seq_stats.write(rearrangement)
			elif not entry.id in dict_jgerm_aln:
				noJ+=1
				myV = dict_vgerm_aln[entry.id]
				if (myV.strand == 'plus'):
					entry.seq = entry.seq[ myV.qstart - 1 :	 ]							
				else:
					entry.seq = entry.seq[	: myV.qend ].reverse_complement()
				myVgenes = ",".join( [myV.sid] + dict_other_vgerms.get(entry.id,[]) )
				entry.description += " v_call=%s status=noJ" % (myVgenes)
				allV_nt.write(">%s %s\n%s\n" %(entry.id, entry.description, entry.seq))

				#prevent BioPython errors
				if (len(entry.seq) % 3) > 0:
					entry.seq = entry.seq [ :  -1 * (len(entry.seq) % 3) ]
				allV_aa.write(">%s %s\n%s\n" %(entry.id, entry.description, entry.seq.translate()))

				rearrangement['v_call'] = myVgenes
				rearrangement['status']	= 'noJ'
				seq_stats.write(rearrangement)

			else:
				
				found += 1
				myV = dict_vgerm_aln[entry.id]
				myJ = dict_jgerm_aln[entry.id]
				added5 = 0          #bases kept/added 5' of the aligned V start
				productive = "T"
				indel = "F"
				stop = "F"
				cdr3 = True
				
				#get actual V(D)J sequence
				v_len	= myV.qend - (myV.qstart-1) #need to use qstart and qend instead of alignment to account for gaps

				#try to recover 3' of J
				if myJ.send < len(dict_j[myJ.sid].seq) and \
				   ( (myV.strand == "plus" and myV.qstart + v_len + myJ.qend + (len(dict_j[myJ.sid].seq)-myJ.send) <= len(entry.seq)) or \
				     (myV.strand == "minus" and myV.qend - (v_len + myJ.qend + (len(dict_j[myJ.sid].seq)-myJ.send)) >= 0) ):
					vdj_len = v_len + myJ.qend + (len(dict_j[myJ.sid].seq) - myJ.send)
				else:
					vdj_len = v_len + myJ.qend
					
				if (myV.strand == 'plus'):
					if myV.sstart > 1:
						#V hit does not start at germline position 1: --nterm decides how to fill the 5' end
						if arguments['--nterm'] == "extend":
							if myV.qstart >= myV.sstart:
								entry.seq = entry.seq[ myV.qstart - myV.sstart : myV.qstart + vdj_len - 1 ]
								added5 = myV.sstart - 1
							else:
								entry.seq = entry.seq[	: myV.qstart + vdj_len - 1 ]
								added5 = myV.qstart - 1
						elif arguments['--nterm'] == "germline":
							entry.seq = dict_v[myV.sid].seq[ 0 : myV.sstart-1 ] + entry.seq[ myV.qstart - 1 : myV.qstart + vdj_len - 1 ]
							added5 = myV.sstart - 1
						else:
							entry.seq = entry.seq[ myV.qstart - 1 : myV.qstart + vdj_len - 1 ]

					else: #blast found full V gene
						entry.seq = entry.seq[ myV.qstart - 1 : myV.qstart + vdj_len - 1 ]

				else: #minus strand
					if myV.send > 1:
						if arguments['--nterm'] == "extend":
							if len(entry.seq)-myV.qend >= myV.send-1:
								entry.seq = entry.seq[ myV.qend - vdj_len : myV.qend+myV.send-1 ].reverse_complement()
								added5 = myV.send - 1
							else:
								#added5 must be computed before entry.seq is replaced
								added5 = len(entry.seq) - myV.qend
								entry.seq = entry.seq[	myV.qend - vdj_len :  ].reverse_complement()
						elif arguments['--nterm'] == "germline":
							entry.seq = dict_v[myV.sid].seq[ 0 : myV.send-1 ] + entry.seq[	myV.qend - vdj_len : myV.qend ].reverse_complement()
							added5 = myV.send - 1
						else:
							entry.seq = entry.seq[	myV.qend - vdj_len : myV.qend ].reverse_complement()

					else: #blast found full V gene
						entry.seq = entry.seq[ myV.qend - vdj_len : myV.qend ].reverse_complement()

				#get CDR3 boundaries
				cdr3_start,cdr3_end,WF_motif = find_cdr3_borders(myV.sid,str(dict_v[myV.sid].seq), v_len, min(myV.sstart, myV.send), max(myV.sstart, myV.send), str(dict_j[myJ.sid].seq), myJ.sstart, myJ.qstart, myJ.gaps, str(entry.seq[ added5 :  ])) #min and max statments take care of switching possible minus strand hit
				cdr3_seq = entry.seq[ added5+cdr3_start : added5+cdr3_end ]

				#push the sequence into frame for translation, if need be
				v_frame = ( min([myV.sstart, myV.send]) - added5 ) % 3
				five_prime_add = (v_frame-1) % 3
				entry.seq = 'N' * five_prime_add + entry.seq 

				#check for stop codons
				if '*' in entry.seq.translate():
					stop = "T"

				#check for in-frame junction
				if len(cdr3_seq) % 3 != 0:
					productive = "F"
				else: #even if recombination looks ok, might be (sequencing) indels in V and/or J
					j_frame = 3 - ( ( WF_motif - myJ.sstart ) % 3 ) #j genes start in different frames, so calculate based on position of conserved W/F found by the cdr3 subroutine above
					frame_shift = (v_len + myJ.qstart + added5 - 1) % 3

					if (v_frame + frame_shift) % 3 != j_frame % 3:
						indel = "T"   #for gDNA we would probably want to distinguish between an out-of-frame recombination and sequencing in-dels in V or J
								#but that can be ambiguous and for cDNA we can assume that it's all sequencing in-del anyway, even in CDR3.
					else:
						#use blast gaps to detect frame shift in-dels
						#most of these have stop codons or other sequence problems, but we'll catch a few extra this way
						if (abs(myV.send-myV.sstart)-(myV.qend-myV.qstart)) % 3 != 0 or ((myJ.send-myJ.sstart)-(myJ.qend-myJ.qstart)) % 3 != 0:
							indel = "T"

				#make sure cdr3 boundaries make sense
				if (cdr3_end<=cdr3_start or cdr3_end>vdj_len or cdr3_start<0):
					cdr3 = False

				#first failed check wins, in this order of precedence
				status = "good"
				if not cdr3:
					status = "noCDR3"
				elif productive == "F":
					status = "nonproductive"
				elif indel == "T":
					status = "indel"
				elif stop == "T":
					status = "stop"
				elif arguments['--nterm'] == "discard" and min(myV.sstart,myV.send) > 1:
					status = "missingNterm"

				#add germline assignments to fasta description and write to disk
				myVgenes = ",".join( [myV.sid] + dict_other_vgerms.get(entry.id,[]) )
				myJgenes = ",".join( [myJ.sid] + dict_other_jgerms.get(entry.id,[]) )
				
				myDgenes = ""
				if d:
					if entry.id in dict_dgerm_aln:
						myDgenes = ",".join( [dict_dgerm_aln[entry.id].sid] + dict_other_dgerms.get(entry.id,[]) )

				myCgenes = ""
				if c:
					if entry.id in dict_cgerm_aln:
						myCgenes = ",".join( [dict_cgerm_aln[entry.id].sid] + dict_other_cgerms.get(entry.id,[]) )

				#guess the locus from substrings of the V gene name (case-sensitive list)
				vlocus = ""
				if any( x in myV.sid for x in ["HV", "VH", "Vh", "vh", "heavy", "Heavy", "HEAVY"] ):
					vlocus = "IGH"
				elif any( x in myV.sid for x in ["LV", "VL", "Vl", "vl", "lambda", "Lambda", "LAMBDA"] ):
					vlocus = "IGL"
				elif any( x in myV.sid for x in ["KV", "VK", "Vk", "vk", "kappa", "Kappa", "KAPPA"] ):
					vlocus = "IGK"
					
				entry.description += " v_call=%s" % myVgenes
				if myDgenes != "":
					entry.description += " d_call=%s" % myDgenes
				entry.description += " j_call=%s" % myJgenes
				if myCgenes != "":
					entry.description += " c_call=%s" % myCgenes
				if vlocus != "":
					entry.description += " locus=%s" % vlocus
				entry.description += " status=%s blast_identity=%.3f junction_length=%d junction=%s junction_aa=%s" % ( status, myV.identity/100.0, len(cdr3_seq), cdr3_seq, cdr3_seq.translate() )

				allV_nt.write(">%s %s\n%s\n" %(entry.id, entry.description, entry.seq))
				allV_aa.write(">%s %s\n%s\n" %(entry.id, entry.description, entry.seq.translate()))

				allJ_nt.write(">%s %s\n%s\n" %(entry.id, entry.description, entry.seq))
				allJ_aa.write(">%s %s\n%s\n" %(entry.id, entry.description, entry.seq.translate()))

				if status == "good":

					vj_nt.write(">%s %s\n%s\n" %(entry.id, entry.description, entry.seq))
					vj_aa.write(">%s %s\n%s\n" %(entry.id, entry.description, entry.seq.translate()))

					good_cdr3_nt.write(">%s %s\n%s\n" %(entry.id, entry.description, cdr3_seq))
					good_cdr3_aa.write(">%s %s\n%s\n" %(entry.id, entry.description, cdr3_seq.translate()))

					all_cdr3_nt.write(">%s %s\n%s\n" %(entry.id, entry.description, cdr3_seq))

				elif cdr3:
					#CDR3 but not "good"
					all_cdr3_nt.write(">%s %s\n%s\n" %(entry.id, entry.description, cdr3_seq))


				#do AIRR output
				if myV.strand == "plus":
					rearrangement['rev_comp']   = "F"
				else:
					rearrangement['rev_comp']   = "T"
				if status == "good":
					rearrangement['productive'] = "T"
				else:
					rearrangement['productive'] = "F"
				rearrangement['vj_in_frame']	    = productive
				rearrangement['stop_codon']	    = stop
				rearrangement['locus']		    = vlocus
				rearrangement['v_call']		    = myVgenes
				rearrangement['j_call']		    = myJgenes
				rearrangement['d_call']	            = myDgenes
				rearrangement['c_call']	            = myCgenes
				rearrangement['sequence_alignment'] = str(entry.seq)
				rearrangement['junction']	    = cdr3_seq
				rearrangement['junction_aa']	    = cdr3_seq.translate()
				rearrangement['junction_length']    = len(cdr3_seq)
				rearrangement['length_trimmed']	    = len(entry.seq)
				rearrangement['indels']		    = indel
				rearrangement['status']		    = status
				rearrangement['blast_identity']	    = "%.3f" % (myV.identity/100.0)
				
				seq_stats.write(rearrangement)

				counts[status] += 1

		print( "%d done, found %d; %d good..." %(total, found, counts['good']) )
		f_ind += 1

	seq_stats.close()

	#close everything opened above so buffered output reaches the disk and
	#	id_lookup.txt is no longer held open when the clean-up step deletes it
	for fh in open_handles:
		fh.close()

	#print out some statistics
	#the J table is always written; C and D tables only when there were hits
	_write_germ_stats("%s/%s_jgerm_stat.txt" %(prj_tree.tables, prj_name), dict_jcounts, found, sep)

	if len(dict_ccounts) > 0:
		_write_germ_stats("%s/%s_cgerm_stat.txt" %(prj_tree.tables, prj_name), dict_ccounts, found, sep)

	if len(dict_dcounts) > 0:
		_write_germ_stats("%s/%s_dgerm_stat.txt" %(prj_tree.tables, prj_name), dict_dcounts, found, sep)

	message = "\nTotal raw reads: %d\nCorrect Length: %d\nV assigned: %d\nJ assigned: %d\nCDR3 assigned: %d\nIn-frame junction: %d\nNo indels: %d\nContinuous ORF with no stop codons: %d\n\n"  % \
							     (raw_count, total, total-noV, found, found-counts['noCDR3'], found-counts['noCDR3']-counts['nonproductive'], found-counts['noCDR3']-counts['nonproductive']-counts['indel'], counts['good'])
	print( message )
	with open("%s/finalize_blast.log"%prj_tree.logs, "w") as handle:
		handle.write(message)

	#clean up!!
	oldFiles = glob.glob("%s/*txt"%prj_tree.vgene) + glob.glob("%s/*fasta"%prj_tree.vgene) +  glob.glob("%s/*txt"%prj_tree.jgene) + glob.glob("%s/*fasta"%prj_tree.jgene) + glob.glob("%s/id_lookup.txt"%prj_tree.internal)
	if not arguments['--noclean']:
		for old_file in oldFiles:
			os.remove(old_file)
Esempio n. 9
0
def _infer_locus(gene_id, segment):
    """Guess the immunoglobulin locus from a germline gene name.

    gene_id is the gene name to inspect and segment is the segment letter
    ("V" or "J"). The name is searched case-insensitively for the chain
    letter next to the segment letter in either order, or for the chain
    word, e.g. "(HV|VH|heavy)". Heavy is tested first, then lambda, then
    kappa. Returns "IGH", "IGL", "IGK", or "" when nothing matches.
    """
    for chain, word, locus in (("H", "heavy", "IGH"),
                               ("L", "lambda", "IGL"),
                               ("K", "kappa", "IGK")):
        pattern = "(%s%s|%s%s|%s)" % (chain, segment, segment, chain, word)
        if re.search(pattern, gene_id, re.I):
            return locus
    return ""


def main():
    """Parse the blast results of one chunk into an AIRR rearrangements file.

    Relies on names defined elsewhere in the file: prj_tree, prj_name,
    arguments, sep, dict_v, dict_j, PARSED_BLAST_HEADER, get_top_hits and
    find_cdr3_borders. The chunk is selected by arguments['--chunk'].

    For every sequence in "<vgene>/<prj_name>_<chunk>.fasta" one row is
    written to "<internal>/rearrangements_<chunk>.tsv": status "noV" when
    there is no V hit, "noJ" when there is a V hit but no J hit, otherwise
    the read is trimmed to its V(D)J region, put in frame, its CDR3 is
    located and a status is assigned. Top J (and, when present, C and D)
    hits are written to "<jgene>/{j,c,d}tophit_<chunk>.txt".
    """

    print("Processing chunk %s..." % arguments['--chunk'])

    # plain file handles opened here; closed together at the end so that the
    # top-hit tables are flushed to disk
    open_handles = []

    #get raw seq stats from temp table
    raw_handle = open(
        "%s/lookup_%s.txt" % (prj_tree.internal, arguments['--chunk']), 'r')
    open_handles.append(raw_handle)
    raw = csv.reader(raw_handle, delimiter=sep)

    total, found = 0, 0
    counts = Counter()

    j_handle = open(
        "%s/jtophit_%s.txt" % (prj_tree.jgene, arguments['--chunk']), "w")
    open_handles.append(j_handle)
    writer = csv.writer(j_handle, delimiter=sep)
    writer.writerow(PARSED_BLAST_HEADER)
    dict_jcounts = dict()
    dict_ccounts = dict()
    dict_dcounts = dict()

    # constant region hits are optional
    c = False
    if os.path.isfile("%s/%s_C_%s.txt" %
                      (prj_tree.jgene, prj_name, arguments['--chunk'])):
        c = True
        c_handle = open(
            "%s/ctophit_%s.txt" % (prj_tree.jgene, arguments['--chunk']), "w")
        open_handles.append(c_handle)
        cWriter = csv.writer(c_handle, delimiter=sep)
        cWriter.writerow(PARSED_BLAST_HEADER)

    # D gene hits are optional
    d = False
    if os.path.isfile("%s/%s_D_%s.txt" %
                      (prj_tree.jgene, prj_name, arguments['--chunk'])):
        d = True
        d_handle = open(
            "%s/dtophit_%s.txt" % (prj_tree.jgene, arguments['--chunk']), "w")
        open_handles.append(d_handle)
        dWriter = csv.writer(d_handle, delimiter=sep)
        dWriter.writerow(PARSED_BLAST_HEADER)

    seq_stats = airr.create_rearrangement(
        "%s/rearrangements_%s.tsv" % (prj_tree.internal, arguments['--chunk']),
        fields=[
            'vj_in_frame', 'stop_codon', 'locus', 'c_call', 'junction_length',
            'source_file', 'source_id', 'duplicate_count', 'length_raw',
            'length_trimmed', 'indels', 'status', 'blast_identity',
            'consensus_count', 'cell_id'
        ])

    dict_vgerm_aln, dict_other_vgerms, dict_vcounts = get_top_hits(
        "%s/%s_%s.txt" % (prj_tree.vgene, prj_name, arguments['--chunk']))
    dict_jgerm_aln, dict_other_jgerms, dict_jcounts = get_top_hits(
        "%s/%s_%s.txt" % (prj_tree.jgene, prj_name, arguments['--chunk']),
        topHitWriter=writer,
        dict_germ_count=dict_jcounts,
        strand="plus")

    if c:
        # passed as minQStart: the end of each read's J hit on the query
        minCStartPos = {x: dict_jgerm_aln[x].qend for x in dict_jgerm_aln}
        dict_cgerm_aln, dict_other_cgerms, dict_ccounts = get_top_hits(
            "%s/%s_C_%s.txt" %
            (prj_tree.jgene, prj_name, arguments['--chunk']),
            topHitWriter=cWriter,
            dict_germ_count=dict_ccounts,
            minQStart=minCStartPos,
            strand="plus")

    if d:
        # passed as maxQEnd: the start of each read's J hit on the query
        maxDEndPos = {x: dict_jgerm_aln[x].qstart for x in dict_jgerm_aln}
        dict_dgerm_aln, dict_other_dgerms, dict_dcounts = get_top_hits(
            "%s/%s_D_%s.txt" %
            (prj_tree.jgene, prj_name, arguments['--chunk']),
            topHitWriter=dWriter,
            dict_germ_count=dict_dcounts,
            maxQEnd=maxDEndPos,
            strand="plus")

    for entry in SeqIO.parse(
            "%s/%s_%s.fasta" %
            (prj_tree.vgene, prj_name, arguments['--chunk']), "fasta"):
        total += 1

        raw_stats = next(raw)

        while not entry.id == raw_stats[0]:
            #we found a read that did not meet the length cut-off
            raw_stats = next(raw)

        # lookup columns as used here: 0 = sequence id, 1 = source file,
        # 2 = source id, 3 = raw length, 4 = duplicate count,
        # 5 = consensus count, 6 = cell id ("NA" marks a missing value)
        rearrangement = dict()
        rearrangement['sequence_id'] = raw_stats[0]
        rearrangement['source_file'] = raw_stats[1]
        rearrangement['source_id'] = raw_stats[2]
        rearrangement['length_raw'] = raw_stats[3]
        rearrangement['sequence'] = str(entry.seq)

        if not raw_stats[4] == "NA":
            rearrangement['duplicate_count'] = raw_stats[4]
        if not raw_stats[5] == "NA":
            rearrangement['consensus_count'] = raw_stats[5]
        if not raw_stats[6] == "NA":
            rearrangement['cell_id'] = raw_stats[6]

        if not entry.id in dict_vgerm_aln:
            rearrangement['status'] = 'noV'
            seq_stats.write(rearrangement)
        elif not entry.id in dict_jgerm_aln:
            myV = dict_vgerm_aln[entry.id]
            entry.seq = entry.seq[myV.qstart - 1:myV.qend]
            if (myV.strand == 'minus'):
                entry.seq = entry.seq.reverse_complement()
                rearrangement['rev_comp'] = "T"
            else:
                rearrangement['rev_comp'] = "F"
            myVgenes = ",".join([myV.sid] +
                                dict_other_vgerms.get(entry.id, []))

            vlocus = _infer_locus(myV.sid, "V")

            rearrangement['v_call'] = myVgenes
            rearrangement['locus'] = vlocus
            rearrangement['productive'] = "F"
            rearrangement['status'] = 'noJ'
            rearrangement['sequence_alignment'] = str(entry.seq)
            seq_stats.write(rearrangement)

        else:

            found += 1
            myV = dict_vgerm_aln[entry.id]
            myJ = dict_jgerm_aln[entry.id]
            added5 = 0  # bases kept/added 5' of the aligned V start
            productive = "T"
            indel = "F"
            stop = "F"
            cdr3 = True

            vlocus = _infer_locus(myV.sid, "V")

            #get actual V(D)J sequence
            #need to use qstart and qend instead of alignment to account for gaps
            v_len = myV.qend - (myV.qstart - 1)

            #try to recover 3' of J
            if myJ.send < len(dict_j[myJ.sid].seq) and \
              ( (myV.strand == "plus" and myV.qstart + v_len + myJ.qend + (len(dict_j[myJ.sid].seq)-myJ.send) <= len(entry.seq)) or \
              (myV.strand == "minus" and myV.qend - (v_len + myJ.qend + (len(dict_j[myJ.sid].seq)-myJ.send)) >= 0) ):
                vdj_len = v_len + myJ.qend + (len(dict_j[myJ.sid].seq) -
                                              myJ.send)
            else:
                vdj_len = v_len + myJ.qend

            # const_seq: whatever follows the V(D)J region on the read; it is
            # taken before entry.seq is trimmed and used below for the
            # constant-region fallback
            const_seq = ""
            if (myV.strand == 'plus'):
                const_seq = str(entry.seq[myV.qstart + vdj_len - 1:])
                if myV.sstart > 1:
                    # V hit does not start at germline position 1:
                    # --nterm decides how the 5' end is filled
                    if arguments['--nterm'] == "extend":
                        if myV.qstart >= myV.sstart:
                            entry.seq = entry.seq[myV.qstart - myV.sstart:
                                                  myV.qstart + vdj_len - 1]
                            added5 = myV.sstart - 1
                        else:
                            entry.seq = entry.seq[:myV.qstart + vdj_len - 1]
                            added5 = myV.qstart - 1
                    elif arguments['--nterm'] == "germline":
                        entry.seq = (
                            dict_v[myV.sid].seq[0:myV.sstart - 1] +
                            entry.seq[myV.qstart - 1:myV.qstart + vdj_len - 1])
                        added5 = myV.sstart - 1
                    else:
                        entry.seq = entry.seq[myV.qstart - 1:
                                              myV.qstart + vdj_len - 1]

                else:  #blast found full V gene
                    entry.seq = entry.seq[myV.qstart - 1:
                                          myV.qstart + vdj_len - 1]

            else:  #minus strand
                const_seq = str(
                    entry.seq[:myV.qend - vdj_len].reverse_complement())
                if myV.send > 1:
                    if arguments['--nterm'] == "extend":
                        if len(entry.seq) - myV.qend >= myV.send - 1:
                            entry.seq = entry.seq[
                                myV.qend - vdj_len:
                                myV.qend + myV.send - 1].reverse_complement()
                            added5 = myV.send - 1
                        else:
                            # added5 must be computed before entry.seq is
                            # replaced
                            added5 = len(entry.seq) - myV.qend
                            entry.seq = entry.seq[
                                myV.qend - vdj_len:].reverse_complement()
                    elif arguments['--nterm'] == "germline":
                        entry.seq = (
                            dict_v[myV.sid].seq[0:myV.send - 1] +
                            entry.seq[myV.qend - vdj_len:
                                      myV.qend].reverse_complement())
                        added5 = myV.send - 1
                    else:
                        entry.seq = entry.seq[
                            myV.qend - vdj_len:myV.qend].reverse_complement()

                else:  #blast found full V gene
                    entry.seq = entry.seq[
                        myV.qend - vdj_len:myV.qend].reverse_complement()

            #get CDR3 boundaries
            #min and max statements take care of switching possible minus strand hit
            cdr3_start, cdr3_end, WF_motif = find_cdr3_borders(
                myV.sid, str(dict_v[myV.sid].seq), v_len,
                min(myV.sstart, myV.send), max(myV.sstart, myV.send),
                str(dict_j[myJ.sid].seq), myJ.sstart, myJ.qstart, myJ.gaps,
                str(entry.seq[added5:]))
            cdr3_seq = entry.seq[added5 + cdr3_start:added5 + cdr3_end]

            #push the sequence into frame for translation, if need be
            v_frame = (min([myV.sstart, myV.send]) - added5) % 3
            five_prime_add = (v_frame - 1) % 3
            entry.seq = 'N' * five_prime_add + entry.seq

            #check for stop codons
            if '*' in entry.seq.translate():
                stop = "T"

            #check for in-frame junction
            if len(cdr3_seq) % 3 != 0:
                productive = "F"
            else:  #even if recombination looks ok, might be (sequencing) indels in V and/or J
                #j genes start in different frames, so calculate based on position of conserved W/F found by the cdr3 subroutine above
                j_frame = 3 - ((WF_motif - myJ.sstart) % 3)
                frame_shift = (v_len + myJ.qstart + added5 - 1) % 3

                if (v_frame + frame_shift) % 3 != j_frame % 3:
                    indel = "T"
                else:
                    #use blast gaps to detect frame shift in-dels
                    #most of these have stop codons or other sequence problems, but we'll catch a few extra this way
                    if (abs(myV.send - myV.sstart) -
                        (myV.qend - myV.qstart)) % 3 != 0 or (
                            (myJ.send - myJ.sstart) -
                            (myJ.qend - myJ.qstart)) % 3 != 0:
                        indel = "T"

            #make sure cdr3 boundaries make sense
            if (cdr3_end <= cdr3_start or cdr3_end > vdj_len
                    or cdr3_start < 0):
                cdr3 = False

            # first failed check wins, in this order of precedence
            status = "good"
            if not cdr3:
                status = "noCDR3"
            elif productive == "F":
                status = "nonproductive"
            elif indel == "T":
                status = "indel"
            elif stop == "T":
                status = "stop"
            elif arguments['--nterm'] == "discard" and min(
                    myV.sstart, myV.send) > 1:
                status = "missingNterm"

            #collect germline assignments for the AIRR record
            myVgenes = ",".join([myV.sid] +
                                dict_other_vgerms.get(entry.id, []))
            myJgenes = ",".join([myJ.sid] +
                                dict_other_jgerms.get(entry.id, []))

            myDgenes = ""
            if d:
                if entry.id in dict_dgerm_aln:
                    if not vlocus in ["IGK", "IGL"]:
                        #suppress spurious D gene hits if it's a light chain
                        myDgenes = ",".join(
                            [dict_dgerm_aln[entry.id].sid] +
                            dict_other_dgerms.get(entry.id, []))

            myCgenes = ""
            if c and entry.id in dict_cgerm_aln:
                myCgenes = ",".join([dict_cgerm_aln[entry.id].sid] +
                                    dict_other_cgerms.get(entry.id, []))
            elif not arguments['--noFallBack']:
                # no blast hit for the constant region: guess it from the
                # first bases following the V(D)J region
                if re.match("C[CT]", const_seq):
                    myCgenes = "IGHG"  #could also be IgE, but I'm assuming that's rare
                elif re.match("GGA", const_seq):
                    myCgenes = "IGHM"
                elif re.match("CAT", const_seq):
                    myCgenes = "IGHA"
                elif re.match("CAC", const_seq):
                    myCgenes = "IGHD"
                elif re.match("CGA", const_seq):
                    myCgenes = "IGKC"
                elif re.match("GGT", const_seq):
                    myCgenes = "IGLC"

            jlocus = _infer_locus(myJ.sid, "J")

            if not vlocus == jlocus:
                #this really shouldn't happen unless one or both gene assignments are
                #    based on very short partial hits. Unfortunately, the lengths/e-values
                #    are on different scales, so I don't currently have a good heuristic to
                #    pick between the two. Just flag it and give up, at least for now.
                status = "chimera"

            if not myCgenes == "" and not vlocus in myCgenes:  #will fail for custom libraries where C gene names don't start with locus
                myCgenes = ""  #assume constant is incorrect since usually based on only a few bases

            #do AIRR output
            if myV.strand == "plus":
                rearrangement['rev_comp'] = "F"
            else:
                rearrangement['rev_comp'] = "T"
            if status == "good":
                rearrangement['productive'] = "T"
            else:
                rearrangement['productive'] = "F"
            rearrangement['vj_in_frame'] = productive
            rearrangement['stop_codon'] = stop
            rearrangement['locus'] = vlocus
            rearrangement['v_call'] = myVgenes
            rearrangement['j_call'] = myJgenes
            rearrangement['d_call'] = myDgenes
            rearrangement['c_call'] = myCgenes
            rearrangement['sequence_alignment'] = str(entry.seq)
            rearrangement['junction'] = cdr3_seq
            rearrangement['junction_aa'] = cdr3_seq.translate()
            rearrangement['junction_length'] = len(cdr3_seq)
            rearrangement['length_trimmed'] = len(entry.seq)
            rearrangement['indels'] = indel
            rearrangement['status'] = status
            rearrangement['blast_identity'] = "%.3f" % (myV.identity / 100.0)

            seq_stats.write(rearrangement)

            counts[status] += 1

    print("chunk %s: %d done, found %d; %d good..." %
          (arguments['--chunk'], total, found, counts['good']))

    seq_stats.close()

    # flush and release the lookup table and the top-hit tables
    for fh in open_handles:
        fh.close()
Esempio n. 10
0
def airrdownload(args):
    """Download the productive rearrangements for every repertoire in an
    AIRR repertoire metadata file and save them as a single AIRR TSV file.

    Parameters
    ----------
    args
        Object with a ``repertoire`` attribute holding the path of the
        repertoire metadata file. The output path is that path with its last
        four characters replaced by ``rearrangements.tsv``.

    Notes
    -----
    The file is validated and loaded with the ``airr`` library, the server
    URL is obtained from ``testserver(data)``, and the ``/rearrangement``
    endpoint is queried page by page for each repertoire. The process exits
    if ``airr.load_repertoire`` raises ``TypeError``.
    """
    # Number of rearrangements requested per query; a page shorter than
    # this marks the end of the data for the current repertoire.
    page_size = 1000

    airr.validate_repertoire(args.repertoire, True)
    repertoire_file = args.repertoire
    rearrangements_file = repertoire_file[:-4] + "rearrangements.tsv"
    try:
        data = airr.load_repertoire(args.repertoire)
    except TypeError:
        sys.stderr.write("TCRcloud error: It seems you did not indicate a "
                         "properly formatted AIRR rearrangements file\n")
        # Signal the failure to the calling shell with a non-zero status.
        sys.exit(1)
    repertoires = data["Repertoire"]
    host_url = testserver(data)

    # Print out some Info
    print("       Info: " + data["Info"]["title"])
    print("    version: " + str(data["Info"]["version"]))
    print("description: " + data["Info"]["description"])
    print("Found " + str(len(repertoires)) + " repertoires in "
          "repertoire metadata file.")

    # Query the rearrangement endpoint
    # Define a generic query object, and we will replace the repertoire_id
    # within the loop. We also only request productive rearrangements as
    # an additional filter.
    query = {
        "filters": {
            "op": "and",
            "content": [
                {
                    "op": "=",
                    "content": {
                        "field": "repertoire_id",
                        "value": "XXX"
                    }
                },
                {
                    "op": "=",
                    "content": {
                        "field": "productive",
                        "value": True
                    }
                }
            ]
        },
        "size": page_size,
        "from": 0
    }

    # Loop through each repertoire and query rearrangement data for
    # each. We download in chunks of `page_size` using the from and size
    # parameters.
    out_file = None
    try:
        for r in repertoires:
            print("Retrieving rearrangements for repertoire: "
                  + r["repertoire_id"])
            query["filters"]["content"][0]["content"]["value"] = \
                r["repertoire_id"]
            query["size"] = page_size
            query["from"] = 0

            cnt = 0
            while True:
                # send the request
                resp = requests.post(host_url + "/rearrangement", json=query)
                rearrangements = resp.json()["Rearrangement"]

                # Open the output lazily, on the first non-empty page: we
                # need to know the full set of fields being returned from
                # the data repository, otherwise by default only the
                # required fields will be written to the file.
                if out_file is None and rearrangements:
                    out_file = airr.create_rearrangement(
                        rearrangements_file,
                        fields=rearrangements[0].keys())

                # save the rearrangements to a file
                for row in rearrangements:
                    out_file.write(row)

                # A short (or empty) page means nothing is left to fetch.
                cnt += len(rearrangements)
                if len(rearrangements) < page_size:
                    break

                # Need to update the from parameter to get the next chunk
                query["from"] = cnt

            print("Retrieved " + str(cnt) + " rearrangements for repertoire: "
                  + r["repertoire_id"])
    finally:
        # Always flush and release the output, even if a request fails.
        if out_file is not None:
            out_file.close()

    if out_file is None:
        print("No rearrangements were retrieved; no file was written")
    else:
        print("Saved as " + rearrangements_file)
Esempio n. 11
0
def main():
	"""Run the chunk parsers (unless re-entering) and merge their outputs.

	Unless ``--reenter`` is set, parse_blast.py is first run on every chunk.
	With ``--cluster`` the array job is submitted through `qsub` together with
	a monitor job held on it that re-invokes this script with ``--reenter``,
	and this process exits. Otherwise the chunks are parsed in a local process
	pool.

	The merge step then combines the partial top-hit tables and rearrangement
	files, writes the nucleotide/amino-acid fasta outputs, the per-gene usage
	tables and a summary log, optionally calls 1.4-cluster_sequences.py
	(``--runClustering``), and removes intermediate files unless ``--noclean``
	is given.

	Reads the module-level `arguments`, `prj_tree`, `prj_name`,
	`SCRIPT_FOLDER`, `qsub` and `sep`.
	"""

	if not glob.glob("%s/%s_*.fasta" % (prj_tree.jgene, prj_name)):
		sys.exit("No jBlast output found!\n")

	maxFiles = len( glob.glob("%s/%s_*.fasta" % (prj_tree.vgene, prj_name)) )

	#options that are handed on unchanged to the re-entered job and to 1.4
	passthrough = [ '--file', '--min1', '--min2', '--id', '--maxgaps', '--rearrangements', '--save']

	if not arguments['--reenter']:
		print( "curating junction and 3' end..." )

		if arguments['--cluster']:
			#no trailing newline here, so that optional flags stay on the
			#    parse_blast.py command line; the script template adds it
			command = "NUM=`printf \"%s\" $SGE_TASK_ID`\n%s/annotate/parse_blast.py --jmotif '%s' --nterm %s --chunk $NUM" % \
						( "%03d", SCRIPT_FOLDER, arguments['--jmotif'], arguments['--nterm'] )
			if arguments['--noFallBack']: command += " --noFallBack"
			with open("%s/parse.sh"%prj_tree.jgene, 'w') as pbs:
				pbs.write( "#!/bin/bash\n#$ -N parse-%s\n#$ -l mem=2G\n#$ -cwd\n\n%s\n" % (prj_name, command) )
			os.system( "%s -t 1-%d %s/parse.sh"%(qsub,maxFiles,prj_tree.jgene) )

			#second job waits for the array job and then re-runs this script
			restart = "%s/annotate/1.3-finalize_assignments.py --reenter" % SCRIPT_FOLDER
			for opt in passthrough:
				if arguments[opt] is not None:
					#quoted the same way as in the call to 1.4 below
					restart += " %s '%s'" % (opt, arguments[opt])
			for flag in ['--noclean', '--runClustering', '--runCellStatistics']:
				if arguments[flag]:
					restart += " %s" % flag

			with open("%s/parse_monitor.sh"%prj_tree.jgene, 'w') as monitor:
				monitor.write( "#!/bin/bash\n#$ -N monitor-%s\n#$ -l mem=2G\n#$ -cwd\n#$ -hold_jid parse-%s\n\n%s\n"%(prj_name, prj_name,restart) )
			os.system( "%s %s/parse_monitor.sh"%(qsub,prj_tree.jgene) )
			sys.exit()

		else: #do it locally

			parse_pool = Pool(arguments['--threads'])
			parse_pool.map(callParser, range(1,maxFiles+1))
			parse_pool.close()
			parse_pool.join()


	#ok, now collect all of the partial outputs and merge them
	print( "collecting information...")

	#initiate overall counters
	raw_count, total = 0, 0
	counts = {'good':0,'nonproductive':0,'indel':0,'noCDR3':0,'stop':0,'noV':0,'noJ':0,'missingNterm':0}

	dict_jcounts = Counter()
	dict_ccounts = Counter()
	dict_dcounts = Counter()

	#partial C/D hit tables only exist if those searches were run
	c = os.path.isfile("%s/%s_C_001.txt" % (prj_tree.jgene, prj_name))
	d = os.path.isfile("%s/%s_D_001.txt" % (prj_tree.jgene, prj_name))

	#fields copied into the fasta def line, in output order
	#    (v_identity is deliberately not included)
	def_fields = [ 'v_call', 'd_call', 'j_call', 'locus', 'c_call', 'status',
					'junction_length', 'junction', 'junction_aa',
					'duplicate_count', 'consensus_count', 'cell_id' ]

	fasta     = dict()
	seq_stats = None
	try:
		#open fasta outputs
		for label, stem in [ ('allV','allV'), ('allJ','allJ'), ('vj','goodVJ'), ('good_cdr3','goodCDR3'), ('all_cdr3','allCDR3') ]:
			fasta[label+"_aa"] = open( "%s/%s_%s.fa" % (prj_tree.aa, prj_name, stem), "w" )
			fasta[label+"_nt"] = open( "%s/%s_%s.fa" % (prj_tree.nt, prj_name, stem), "w" )

		#also open final rearrangements tsv
		seq_stats = airr.create_rearrangement( "%s/%s_rearrangements.tsv"%(prj_tree.tables, prj_name), fields=['vj_in_frame','stop_codon','locus','c_call','junction_length','source_file','source_id','duplicate_count','length_raw','length_trimmed','indels','status','blast_identity','consensus_count','cell_id'])

		#iterate over subset rearrangement files and combine
		#include generating fasta output as appropriate
		for f_ind in range(1, maxFiles+1):

			#merge partial blast hit tables
			_append_partial( "%s/%s_jgerm_tophit.txt" % (prj_tree.tables, prj_name), "%s/jtophit_%03d.txt" % (prj_tree.jgene, f_ind) )
			if d:
				_append_partial( "%s/%s_dgerm_tophit.txt" % (prj_tree.tables, prj_name), "%s/dtophit_%03d.txt" % (prj_tree.jgene, f_ind) )
			if c:
				_append_partial( "%s/%s_cgerm_tophit.txt" % (prj_tree.tables, prj_name), "%s/ctophit_%03d.txt" % (prj_tree.jgene, f_ind) )

			#go through partial rearrangements files
			for r in airr.read_rearrangement( "%s/rearrangements_%03d.tsv"%(prj_tree.internal, f_ind) ):

				seq_stats.write( r )

				#count j/d/c gene usages (first listed gene only), each in its own counter
				if not r['j_call'] == "":
					dict_jcounts[ r['j_call'].split(",")[0] ] += 1
				if not r['d_call'] == "":
					dict_dcounts[ r['d_call'].split(",")[0] ] += 1
				if not r['c_call'] == "":
					dict_ccounts[ r['c_call'].split(",")[0] ] += 1

				#count statuses; a status not listed above is tallied instead of raising KeyError
				status = r['status']
				counts[ status ] = counts.get(status, 0) + 1
				total += 1
				raw_count = int( r['sequence_id'] ) #technically, this undercounts if the last one
													# isn't in the `correct_length` interval, but I
													# don't have a better solution that isn't super
													# kludgy right now

				#ok, now do sequence output
				# start by collecting metadata for fasta def line
				def_line = ">%s" % r['sequence_id']
				for field in def_fields:
					if not r[field] == '':
						def_line += " %s=%s" % (field, r[field])

				#work our way up the hierarchy, putting sequences in the appropriate files
				ungapped = r['sequence_alignment'].replace("-", "")  #reintroduces any frameshift errors in translation
																	 #  this has always been the behavior, but I wonder
																	 #  if I should change/update now that I am using
																	 #  proper alignments.

				if status in ['noV', 'missingNterm']:
					continue

				#translate once and reuse the formatted entries for every file
				nt_entry = "%s\n%s\n" % (def_line, ungapped)
				aa_entry = "%s\n%s\n" % (def_line, Seq.Seq(ungapped).translate())
				fasta['allV_nt'].write( nt_entry )
				fasta['allV_aa'].write( aa_entry )

				if status == 'noJ':
					continue
				fasta['allJ_nt'].write( nt_entry )
				fasta['allJ_aa'].write( aa_entry )

				if status == 'noCDR3':
					continue
				cdr3_nt_entry = "%s\n%s\n" % (def_line, r['junction'])
				cdr3_aa_entry = "%s\n%s\n" % (def_line, r['junction_aa'])
				fasta['all_cdr3_nt'].write( cdr3_nt_entry )
				fasta['all_cdr3_aa'].write( cdr3_aa_entry )

				if status == "good":
					fasta['vj_nt'].write( nt_entry )
					fasta['vj_aa'].write( aa_entry )
					fasta['good_cdr3_nt'].write( cdr3_nt_entry )
					fasta['good_cdr3_aa'].write( cdr3_aa_entry )

	finally:
		#close outputs, also on error, so everything is flushed before
		#    1.4 is started below
		for handle in fasta.values():
			handle.close()
		if seq_stats is not None:
			seq_stats.close()

	#useful number
	found = total - counts['noV'] - counts['noJ']

	#print out some statistics
	_write_gene_stats( "%s/%s_jgerm_stat.txt" %(prj_tree.tables, prj_name), dict_jcounts, found )
	if len(dict_ccounts) > 0:
		_write_gene_stats( "%s/%s_cgerm_stat.txt" %(prj_tree.tables, prj_name), dict_ccounts, found )
	if len(dict_dcounts) > 0:
		_write_gene_stats( "%s/%s_dgerm_stat.txt" %(prj_tree.tables, prj_name), dict_dcounts, found )

	message = "\nTotal raw reads: %d\nCorrect Length: %d\nV assigned: %d\nJ assigned: %d\nCDR3 assigned: %d\nIn-frame junction: %d\nNo indels: %d\nContinuous ORF with no stop codons: %d\n\n"  % \
								(raw_count, total, total-counts['noV'], found, found-counts['noCDR3'], found-counts['noCDR3']-counts['nonproductive'], found-counts['noCDR3']-counts['nonproductive']-counts['indel'], counts['good'])
	print( message )
	with open("%s/finalize_blast.log"%prj_tree.logs, "w") as handle:
		handle.write(message)

	# call 1.4 if requested
	if arguments['--runClustering']:
		cmd = "%s/annotate/1.4-cluster_sequences.py" % SCRIPT_FOLDER
		for opt in passthrough:
			if arguments[opt] is not None:
				cmd += " %s '%s'" % (opt, arguments[opt])
		if arguments['--runCellStatistics']:
			cmd += " --runCellStatistics"

		print( "Calling 1.4 with command line: %s" % cmd )
		os.system( cmd )

	#clean up!!
	oldFiles = glob.glob("%s/*txt"%prj_tree.vgene) + glob.glob("%s/*fasta"%prj_tree.vgene) +  glob.glob("%s/*txt"%prj_tree.jgene) + glob.glob("%s/*fasta"%prj_tree.jgene) + glob.glob("%s/*tsv"%prj_tree.jgene) + glob.glob("%s/lookup*"%prj_tree.internal)
	if not arguments['--noclean']:
		for f in oldFiles:
			os.remove(f)


def _append_partial(table_path, partial_path):
	"""Append the full contents of `partial_path` to the end of `table_path`."""
	with open( table_path, "a" ) as table:
		with open( partial_path, "r" ) as partial:
			table.write(partial.read())


def _write_gene_stats(stat_path, gene_counts, found):
	"""Write a gene/count/percent table for `gene_counts`, with percent relative to `found`."""
	with open( stat_path, 'w' ) as handle:
		writer = csv.writer(handle, delimiter = sep)
		writer.writerow(["gene", "count", "percent"])
		for key in sorted(gene_counts.keys()):
			#report 0 instead of dividing by zero when nothing had both V and J assigned
			percent = gene_counts[key] / float(found) * 100 if found else 0.0
			writer.writerow([ key, gene_counts[key], "%4.2f" % percent ])