def write_airr(adata: AnnData, filename: Union[str, Path]) -> None:
    """Export :term:`IR` data to :term:`AIRR` Rearrangement `tsv` format.

    Parameters
    ----------
    adata
        annotated data matrix
    filename
        destination filename
    """
    airr_cells = to_airr_cells(adata)
    try:
        fields = airr_cells[0].fields
        for tmp_cell in airr_cells[1:]:
            assert tmp_cell.fields == fields, "All rows of adata must have the same fields."
    except IndexError:
        # case of an empty output file
        fields = None

    writer = airr.create_rearrangement(filename, fields=fields)
    for tmp_cell in airr_cells:
        for chain in tmp_cell.to_airr_records():
            # workaround for the AIRR library writing out an int field as a float
            # (if the value happens to be a float)
            for f in chain:
                if RearrangementSchema.type(f) == "integer":
                    chain[f] = int(chain[f])
            writer.write(chain)
    writer.close()
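
# For context, a minimal usage sketch of the exporter above. Assumptions: a
# scirpy-style AnnData carrying IR annotations; `ir.io.read_10x_vdj` and the
# file names are illustrative, not part of this module.
import scirpy as ir

adata = ir.io.read_10x_vdj("filtered_contig_annotations.csv")  # example input
write_airr(adata, "rearrangements.airr.tsv")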
def main():

    if not glob.glob("%s/%s_*.fasta" % (prj_tree.jgene, prj_name)):
        sys.exit("No jBlast output found!\n")

    maxFiles = len(glob.glob("%s/%s_*.fasta" % (prj_tree.vgene, prj_name)))

    print("curating junction and 3' end...")

    if arguments['--cluster']:
        command = "NUM=`printf \"%s\" $SGE_TASK_ID`\n%s/annotate/parse_blast.py --jmotif '%s' --nterm %s --chunk $NUM" % \
                  ("%03d", SCRIPT_FOLDER, arguments['--jmotif'], arguments['--nterm'])
        if arguments['--noFallBack']:
            command += " --noFallBack"
        pbs = open("%s/parse.sh" % prj_tree.jgene, 'w')
        pbs.write("#!/bin/bash\n#$ -N parse-%s\n#$ -l h_vmem=2G\n#$ -cwd\n#$ -o %s/parse.o$JOB_ID.$SGE_TASK_ID\n#$ -e %s/parse.e$JOB_ID.$SGE_TASK_ID\n\n%s\n" %
                  (prj_name, prj_tree.annotate, prj_tree.annotate, command))
        pbs.close()
        subprocess.call([qsub, '-sync', 'y', '-t', "1-%d" % maxFiles, "%s/parse.sh" % prj_tree.jgene])
    else:
        # do it locally
        parse_pool = Pool(arguments['--threads'])
        parse_pool.map(callParser, range(1, maxFiles + 1))
        parse_pool.close()
        parse_pool.join()

    # ok, now collect all of the partial outputs and merge them
    print("collecting information...")

    # open fasta outputs
    allV_aa = open("%s/%s_allV.fa" % (prj_tree.aa, prj_name), "w")
    allV_nt = open("%s/%s_allV.fa" % (prj_tree.nt, prj_name), "w")
    allJ_aa = open("%s/%s_allJ.fa" % (prj_tree.aa, prj_name), "w")
    allJ_nt = open("%s/%s_allJ.fa" % (prj_tree.nt, prj_name), "w")
    vj_aa = open("%s/%s_goodVJ.fa" % (prj_tree.aa, prj_name), "w")
    vj_nt = open("%s/%s_goodVJ.fa" % (prj_tree.nt, prj_name), "w")
    good_cdr3_aa = open("%s/%s_goodCDR3.fa" % (prj_tree.aa, prj_name), "w")
    good_cdr3_nt = open("%s/%s_goodCDR3.fa" % (prj_tree.nt, prj_name), "w")
    all_cdr3_aa = open("%s/%s_allCDR3.fa" % (prj_tree.aa, prj_name), "w")
    all_cdr3_nt = open("%s/%s_allCDR3.fa" % (prj_tree.nt, prj_name), "w")

    # also open final rearrangements tsv
    seq_stats = airr.create_rearrangement("%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name),
                                          fields=['vj_in_frame', 'stop_codon', 'locus', 'c_call', 'junction_length',
                                                  'source_file', 'source_id', 'duplicate_count', 'length_raw',
                                                  'length_trimmed', 'indels', 'status', 'blast_identity',
                                                  'consensus_count', 'cell_id'])

    # initiate overall counters
    raw_count, total = 0, 0
    counts = {'good': 0, 'nonproductive': 0, 'indel': 0, 'noCDR3': 0, 'stop': 0,
              'noV': 0, 'noJ': 0, 'missingNterm': 0, 'chimera': 0}

    dict_jcounts = Counter()
    dict_ccounts = Counter()
    dict_dcounts = Counter()

    c = False
    if os.path.isfile("%s/%s_C_001.txt" % (prj_tree.jgene, prj_name)):
        c = True
    d = False
    if os.path.isfile("%s/%s_D_001.txt" % (prj_tree.jgene, prj_name)):
        d = True

    # iterate over subset rearrangement files and combine
    # include generating fasta output as appropriate
    for f_ind in range(1, maxFiles + 1):

        # merge partial blast hit tables
        with open("%s/%s_jgerm_tophit.txt" % (prj_tree.tables, prj_name), "a") as table:
            with open("%s/jtophit_%03d.txt" % (prj_tree.jgene, f_ind), "r") as partial:
                table.write(partial.read())
        if d:
            with open("%s/%s_dgerm_tophit.txt" % (prj_tree.tables, prj_name), "a") as table:
                with open("%s/dtophit_%03d.txt" % (prj_tree.jgene, f_ind), "r") as partial:
                    table.write(partial.read())
        if c:
            with open("%s/%s_cgerm_tophit.txt" % (prj_tree.tables, prj_name), "a") as table:
                with open("%s/ctophit_%03d.txt" % (prj_tree.jgene, f_ind), "r") as partial:
                    table.write(partial.read())

        # go through partial rearrangements files
        for r in airr.read_rearrangement("%s/rearrangements_%03d.tsv" % (prj_tree.internal, f_ind)):
            seq_stats.write(r)

            # count j/d/c gene usages
            if not r['j_call'] == "": dict_jcounts[r['j_call'].split(",")[0]] += 1
            if not r['d_call'] == "": dict_dcounts[r['d_call'].split(",")[0]] += 1
            if not r['c_call'] == "": dict_ccounts[r['c_call'].split(",")[0]] += 1

            # count statuses
            counts[r['status']] += 1
            total += 1
            raw_count = int(r['sequence_id'])  # technically, this undercounts if the last one
                                               # isn't in the `correct_length` interval, but I
                                               # don't have a better solution that isn't super
                                               # kludgy right now

            # ok, now do sequence output
            # start by collecting metadata for fasta def line
            def_line = ">%s" % r['sequence_id']
            if not r['v_call'] == '': def_line += " v_call=%s" % r['v_call']
            if not r['d_call'] == '': def_line += " d_call=%s" % r['d_call']
            if not r['j_call'] == '': def_line += " j_call=%s" % r['j_call']
            if not r['locus'] == '': def_line += " locus=%s" % r['locus']
            if not r['c_call'] == '': def_line += " c_call=%s" % r['c_call']
            if not r['status'] == '': def_line += " status=%s" % r['status']
            # if not r['v_identity'] == '': def_line += " v_identity=%s" % r['v_identity']
            if not r['junction_length'] == '': def_line += " junction_length=%s" % r['junction_length']
            if not r['junction'] == '': def_line += " junction=%s" % r['junction']
            if not r['junction_aa'] == '': def_line += " junction_aa=%s" % r['junction_aa']
            if not r['duplicate_count'] == '': def_line += " duplicate_count=%s" % r['duplicate_count']
            if not r['consensus_count'] == '': def_line += " consensus_count=%s" % r['consensus_count']
            if not r['cell_id'] == '': def_line += " cell_id=%s" % r['cell_id']

            # work our way up the hierarchy, putting sequences in the appropriate files
            ungapped = re.sub("-", "", r['sequence_alignment'])  # reintroduces any frameshift errors in translation
                                                                 # this has always been the behavior, but I wonder
                                                                 # if I should change/update now that I am using
                                                                 # proper alignments.

            if not r['status'] in ['noV', 'missingNterm', 'chimera']:
                allV_nt.write("%s\n%s\n" % (def_line, ungapped))
                allV_aa.write("%s\n%s\n" % (def_line, Seq.Seq(ungapped).translate()))

                if not r['status'] == 'noJ':
                    allJ_nt.write("%s\n%s\n" % (def_line, ungapped))
                    allJ_aa.write("%s\n%s\n" % (def_line, Seq.Seq(ungapped).translate()))

                    if not r['status'] == 'noCDR3':
                        all_cdr3_nt.write("%s\n%s\n" % (def_line, r['junction']))
                        all_cdr3_aa.write("%s\n%s\n" % (def_line, r['junction_aa']))

                        if r['status'] == "good":
                            vj_nt.write("%s\n%s\n" % (def_line, ungapped))
                            vj_aa.write("%s\n%s\n" % (def_line, Seq.Seq(ungapped).translate()))
                            good_cdr3_nt.write("%s\n%s\n" % (def_line, r['junction']))
                            good_cdr3_aa.write("%s\n%s\n" % (def_line, r['junction_aa']))

    # close outputs
    allV_aa.close()
    allV_nt.close()
    allJ_aa.close()
    allJ_nt.close()
    vj_aa.close()
    vj_nt.close()
    good_cdr3_aa.close()
    good_cdr3_nt.close()
    all_cdr3_aa.close()
    all_cdr3_nt.close()

    # useful number
    found = total - counts['noV'] - counts['noJ'] - counts['chimera']

    # print out some statistics
    handle = open("%s/%s_jgerm_stat.txt" % (prj_tree.tables, prj_name), 'w')
    writer = csv.writer(handle, delimiter=sep)
    keys = sorted(dict_jcounts.keys())
    writer.writerow(["gene", "count", "percent"])
    for key in keys:
        aline = [key, dict_jcounts[key], "%4.2f" % (dict_jcounts[key] / float(found) * 100)]
        writer.writerow(aline)
    handle.close()

    if len(dict_ccounts) > 0:
        handle = open("%s/%s_cgerm_stat.txt" % (prj_tree.tables, prj_name), 'w')
        writer = csv.writer(handle, delimiter=sep)
        keys = sorted(dict_ccounts.keys())
        writer.writerow(["gene", "count", "percent"])
        for key in keys:
            aline = [key, dict_ccounts[key], "%4.2f" % (dict_ccounts[key] / float(found) * 100)]
            writer.writerow(aline)
        handle.close()

    if len(dict_dcounts) > 0:
        handle = open("%s/%s_dgerm_stat.txt" % (prj_tree.tables, prj_name), 'w')
        writer = csv.writer(handle, delimiter=sep)
        keys = sorted(dict_dcounts.keys())
        writer.writerow(["gene", "count", "percent"])
        for key in keys:
            aline = [key, dict_dcounts[key], "%4.2f" % (dict_dcounts[key] / float(found) * 100)]
            writer.writerow(aline)
        handle.close()

    message = "\nTotal raw reads: %d\nCorrect Length: %d\nV assigned: %d\nJ assigned: %d\nCDR3 assigned: %d\nIn-frame junction: %d\nNo indels: %d\nContinuous ORF with no stop codons: %d\n\n" % \
              (raw_count, total, total - counts['noV'] - counts['chimera'], found,
               found - counts['noCDR3'], found - counts['noCDR3'] - counts['nonproductive'],
               found - counts['noCDR3'] - counts['nonproductive'] - counts['indel'], counts['good'])
    print(message)

    handle = open("%s/finalize_blast.log" % prj_tree.logs, "w")
    handle.write(message)
    handle.close()

    # call 1.4 or 1.5 if requested
    if arguments['--runClustering']:
        cmd = "%s/annotate/1.4-cluster_sequences.py" % SCRIPT_FOLDER
        for opt in ['--file', '--min1', '--min2', '--id', '--maxgaps', '--rearrangements', '--save']:
            if arguments[opt] is not None:
                cmd += " %s '%s'" % (opt, arguments[opt])
        if arguments['--runCellStatistics']:
            cmd += " --runCellStatistics"
        print("Calling 1.4 with command line: %s" % cmd)
        os.system(cmd)
    elif arguments['--runCellStatistics']:
        cmd = "%s/annotate/1.5-single_cell_statistics.py" % SCRIPT_FOLDER
        for opt in ['--rearrangements', '--save']:
            if arguments[opt] is not None:
                cmd += " %s '%s'" % (opt, arguments[opt])
        print("Calling 1.5 with command line: %s" % cmd)
        os.system(cmd)

    # clean up!!
    oldFiles = glob.glob("%s/*txt" % prj_tree.vgene) + glob.glob("%s/*fasta" % prj_tree.vgene) + \
               glob.glob("%s/*txt" % prj_tree.jgene) + glob.glob("%s/*fasta" % prj_tree.jgene) + \
               glob.glob("%s/*tsv" % prj_tree.jgene) + glob.glob("%s/lookup*" % prj_tree.internal)
    if len(oldFiles) > 0 and not arguments['--noclean']:
        [os.remove(f) for f in oldFiles]
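
# The merge step above leans on the airr reference library: create_rearrangement()
# opens a TSV writer whose columns are the required Rearrangement fields plus any
# extras passed via fields=, and rows yielded by read_rearrangement() can be passed
# straight to write(). A stripped-down sketch of that same pattern (file names are
# hypothetical):
import airr

out = airr.create_rearrangement("combined.tsv", fields=["status", "blast_identity"])
for part in ["rearrangements_001.tsv", "rearrangements_002.tsv"]:
    for row in airr.read_rearrangement(part):
        out.write(row)
out.close()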
def main():

    airrFile = airr.create_rearrangement("%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name),
                                         fields=['vj_in_frame', 'stop_codon', 'locus', 'c_call', 'junction_length',
                                                 'source_file', 'source_id', 'length_raw', 'length_trimmed',
                                                 'indels', 'status', 'blast_identity', 'cluster_count', 'v_identity'])

    # try to vacuum up all possible raw sequences and hope it doesn't kill memory
    raw_seqs = defaultdict(dict)
    file_list = glob.glob("*.fa") + glob.glob("*.fas") + glob.glob("*.fst") + glob.glob("*.fasta") + \
                glob.glob("*.fna") + glob.glob("*.fq") + glob.glob("*.fastq")
    for myseq, myqual, file_name in generate_read_fasta_folder(file_list):
        raw_seqs[file_name][myseq.seq_id] = myseq.seq

    # get trimmed sequences
    trim_seqs = load_fastas("%s/%s_allJ.fa" % (prj_tree.nt, prj_name))

    # get nt junctions
    junc_seqs = load_fastas("%s/%s_allCDR3.fa" % (prj_tree.nt, prj_name))

    # do the conversion
    with open("%s/%s_all_seq_stats.txt" % (prj_tree.tables, prj_name), "r") as handle:
        oldFile = csv.reader(handle, delimiter="\t")
        header = next(oldFile)
        for row in oldFile:
            if row[11] == "wrong_length":
                continue
            if row[1] not in raw_seqs:
                sys.stderr.write("Couldn't find raw sequence file %s, %s will be dropped from converted file.\n" % (row[1], row[0]))
                continue
            elif row[2] not in raw_seqs[row[1]]:
                sys.stderr.write("Couldn't find raw sequence %s in file %s; %s will be dropped from converted file.\n" % (row[2], row[1], row[0]))
                continue

            r = dict()
            r['sequence'] = raw_seqs[row[1]][row[2]]
            r['sequence_alignment'] = str(trim_seqs.get(row[0], SeqRecord(seq="")).seq)
            r['junction'] = str(junc_seqs.get(row[0], SeqRecord(seq="")).seq)
            r['sequence_id'] = row[0]
            r['source_file'] = row[1]
            r['source_id'] = row[2]
            r['length_raw'] = row[3]
            if not row[4] == "NA": r['length_trimmed'] = row[4]
            if not row[5] == "NA": r['v_call'] = row[5]
            if row[6] not in ["NA", "not_found"]: r['d_call'] = row[6]
            if not row[7] == "NA": r['j_call'] = row[7]
            if not row[9] == "NA": r['indels'] = row[9]
            if not row[10] == "NA": r['stop_codon'] = row[10]
            r['status'] = row[11]
            if not row[12] == "NA": r['blast_identity'] = "%.3f" % (1 - float(re.sub("%", "", row[12])) / 100)
            if not row[13] == "NA": r['junction_length'] = int(row[13]) + 6
            if not row[15] == "NA": r['junction_aa'] = row[15]

            if len(row) > 15:
                if header[16] == "Unique":
                    if row[16] == "T":
                        r['status'] = "unique"
                        r['cluster_count'] = row[17]
                    if len(row) > 17 and not row[18] == "NA":
                        r['v_identity'] = "%.3f" % (1 - float(re.sub("%", "", row[18])) / 100)
                elif header[16] == "V_div" and not row[16] == "NA":
                    r['v_identity'] = "%.3f" % (1 - float(re.sub("%", "", row[16])) / 100)

            # figure out in-frame/productive from the status column
            if row[11] == "good":
                r['vj_in_frame'] = "T"
                r['productive'] = "T"
            elif row[11] == "stop":
                r['vj_in_frame'] = "T"
                r['productive'] = "F"
            elif row[11] == "nonproductive":
                r['vj_in_frame'] = "F"
                r['productive'] = "F"
            elif row[11] == "indel":
                r['productive'] = "F"

            # figure out locus
            if any(x in row[5] for x in ["HV", "VH", "Vh", "vh", "heavy", "Heavy", "HEAVY"]):
                r['locus'] = "IGH"
            elif any(x in row[5] for x in ["LV", "VL", "Vl", "vl", "lambda", "Lambda", "LAMBDA"]):
                r['locus'] = "IGL"
            elif any(x in row[5] for x in ["KV", "VK", "Vk", "vk", "kappa", "Kappa", "KAPPA"]):
                r['locus'] = "IGK"

            airrFile.write(r)

    airrFile.close()

    valid = airr.validate_rearrangement("%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name))
    if not valid:
        sys.exit("ERROR: something went wrong, %s/%s_rearrangements.tsv failed validation!" % (prj_tree.tables, prj_name))
    query['size'] = 1000
    query['from'] = 0

    cnt = 0
    while True:
        # send the request
        resp = requests.post(host_url + '/rearrangement', json=query)
        data = resp.json()
        rearrangements = data['Rearrangement']

        # Open a file for writing the rearrangements. We do this here
        # because we need to know the full set of fields being
        # returned from the data repository, otherwise by default only
        # the required fields will be written to the file.
        if first:
            out_file = airr.create_rearrangement('rearrangements.tsv', fields=rearrangements[0].keys())
            first = False

        # save the rearrangements to a file
        for row in rearrangements:
            out_file.write(row)

        # stop when we have downloaded at most 10,000 rearrangements, or if the
        # response doesn't return the full amount, which indicates no more
        # data. If you wanted to download all rearrangements, keep
        # looping until zero rearrangements are returned from the query.
        cnt += len(rearrangements)
        if cnt >= 10000 or len(rearrangements) < 1000:
            break

        # Need to update the from parameter to get the next chunk
        query['from'] = cnt
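
# For reference, the query object this loop mutates follows the AIRR Data Commons
# filters/size/from scheme; the shape used by the repertoire download script later
# in this document looks like this (the repertoire_id placeholder gets swapped in
# per query):
query = {
    "filters": {
        "op": "and",
        "content": [
            {"op": "=", "content": {"field": "repertoire_id", "value": "XXX"}},
            {"op": "=", "content": {"field": "productive", "value": True}},
        ],
    },
    "size": 1000,  # page size
    "from": 0,     # offset, advanced by the count downloaded so far
}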
def main():

    print("Processing chunk %s..." % arguments['--chunk'])

    # get raw seq stats from temp table
    raw = csv.reader(open("%s/lookup_%s.txt" % (prj_tree.internal, arguments['--chunk']), 'r'), delimiter=sep)

    raw_count, total, found, noV, noJ, f_ind = 0, 0, 0, 0, 0, 1
    counts = {'good': 0, 'nonproductive': 0, 'indel': 0, 'noCDR3': 0, 'stop': 0}
    if arguments['--nterm'] == "discard":
        counts["missingNterm"] = 0

    writer = csv.writer(open("%s/jtophit_%s.txt" % (prj_tree.jgene, arguments['--chunk']), "w"), delimiter=sep)
    writer.writerow(PARSED_BLAST_HEADER)
    dict_jcounts = dict()
    dict_ccounts = dict()
    dict_dcounts = dict()

    c = False
    if os.path.isfile("%s/%s_C_%s.txt" % (prj_tree.jgene, prj_name, arguments['--chunk'])):
        c = True
        cWriter = csv.writer(open("%s/ctophit_%s.txt" % (prj_tree.jgene, arguments['--chunk']), "w"), delimiter=sep)
        cWriter.writerow(PARSED_BLAST_HEADER)

    d = False
    if os.path.isfile("%s/%s_D_%s.txt" % (prj_tree.jgene, prj_name, arguments['--chunk'])):
        d = True
        dWriter = csv.writer(open("%s/dtophit_%s.txt" % (prj_tree.jgene, arguments['--chunk']), "w"), delimiter=sep)
        dWriter.writerow(PARSED_BLAST_HEADER)

    seq_stats = airr.create_rearrangement("%s/rearrangements_%s.tsv" % (prj_tree.internal, arguments['--chunk']),
                                          fields=['vj_in_frame', 'stop_codon', 'locus', 'c_call', 'junction_length',
                                                  'source_file', 'source_id', 'duplicate_count', 'length_raw',
                                                  'length_trimmed', 'indels', 'status', 'blast_identity',
                                                  'consensus_count', 'cell_id'])

    dict_vgerm_aln, dict_other_vgerms, dict_vcounts = get_top_hits("%s/%s_%s.txt" % (prj_tree.vgene, prj_name, arguments['--chunk']))
    dict_jgerm_aln, dict_other_jgerms, dict_jcounts = get_top_hits("%s/%s_%s.txt" % (prj_tree.jgene, prj_name, arguments['--chunk']),
                                                                   topHitWriter=writer, dict_germ_count=dict_jcounts)

    if c:
        minCStartPos = dict([(x, dict_jgerm_aln[x].qend) for x in dict_jgerm_aln.keys()])
        dict_cgerm_aln, dict_other_cgerms, dict_ccounts = get_top_hits("%s/%s_C_%s.txt" % (prj_tree.jgene, prj_name, arguments['--chunk']),
                                                                       topHitWriter=cWriter, dict_germ_count=dict_ccounts,
                                                                       minQStart=minCStartPos)

    if d:
        maxDEndPos = dict([(x, dict_jgerm_aln[x].qstart) for x in dict_jgerm_aln.keys()])
        dict_dgerm_aln, dict_other_dgerms, dict_dcounts = get_top_hits("%s/%s_D_%s.txt" % (prj_tree.jgene, prj_name, arguments['--chunk']),
                                                                       topHitWriter=dWriter, dict_germ_count=dict_dcounts,
                                                                       maxQEnd=maxDEndPos)

    for entry in SeqIO.parse("%s/%s_%s.fasta" % (prj_tree.vgene, prj_name, arguments['--chunk']), "fasta"):

        total += 1
        raw_stats = next(raw)
        raw_count += 1

        while not entry.id == raw_stats[0]:
            # we found a read that did not meet the length cut-off
            raw_stats = next(raw)
            raw_count += 1

        rearrangement = dict()
        rearrangement['sequence_id'] = raw_stats[0]
        rearrangement['source_file'] = raw_stats[1]
        rearrangement['source_id'] = raw_stats[2]
        rearrangement['length_raw'] = raw_stats[3]
        rearrangement['sequence'] = str(entry.seq)

        if not raw_stats[4] == "NA": rearrangement['duplicate_count'] = raw_stats[4]
        if not raw_stats[5] == "NA": rearrangement['consensus_count'] = raw_stats[5]
        if not raw_stats[6] == "NA": rearrangement['cell_id'] = raw_stats[6]

        if not entry.id in dict_vgerm_aln:
            noV += 1
            rearrangement['status'] = 'noV'
            seq_stats.write(rearrangement)

        elif not entry.id in dict_jgerm_aln:
            noJ += 1
            myV = dict_vgerm_aln[entry.id]
            entry.seq = entry.seq[myV.qstart - 1 : myV.qend]
            if (myV.strand == 'minus'):
                entry.seq = entry.seq.reverse_complement()
                rearrangement['rev_comp'] = "T"
            else:
                rearrangement['rev_comp'] = "F"
            myVgenes = ",".join([myV.sid] + dict_other_vgerms.get(entry.id, []))
            vlocus = ""
            if any(x in myV.sid for x in ["HV", "VH", "Vh", "vh", "heavy", "Heavy", "HEAVY"]):
                vlocus = "IGH"
            elif any(x in myV.sid for x in ["LV", "VL", "Vl", "vl", "lambda", "Lambda", "LAMBDA"]):
                vlocus = "IGL"
            elif any(x in myV.sid for x in ["KV", "VK", "Vk", "vk", "kappa", "Kappa", "KAPPA"]):
                vlocus = "IGK"
            rearrangement['v_call'] = myVgenes
            rearrangement['locus'] = vlocus
            rearrangement['productive'] = "F"
            rearrangement['status'] = 'noJ'
            rearrangement['sequence_alignment'] = str(entry.seq)
            seq_stats.write(rearrangement)

        else:
            found += 1
            myV = dict_vgerm_aln[entry.id]
            myJ = dict_jgerm_aln[entry.id]
            added5 = 0
            productive = "T"
            indel = "F"
            stop = "F"
            cdr3 = True

            # get actual V(D)J sequence
            v_len = myV.qend - (myV.qstart - 1)  # need to use qstart and qend instead of alignment to account for gaps

            # try to recover 3' of J
            if myJ.send < len(dict_j[myJ.sid].seq) and \
               ((myV.strand == "plus" and myV.qstart + v_len + myJ.qend + (len(dict_j[myJ.sid].seq) - myJ.send) <= len(entry.seq)) or
                (myV.strand == "minus" and myV.qend - (v_len + myJ.qend + (len(dict_j[myJ.sid].seq) - myJ.send)) >= 0)):
                vdj_len = v_len + myJ.qend + (len(dict_j[myJ.sid].seq) - myJ.send)
            else:
                vdj_len = v_len + myJ.qend

            const_seq = ""
            if (myV.strand == 'plus'):
                const_seq = str(entry.seq[myV.qstart + vdj_len - 1:])
                if myV.sstart > 1:
                    if arguments['--nterm'] == "extend":
                        if myV.qstart >= myV.sstart:
                            entry.seq = entry.seq[myV.qstart - myV.sstart : myV.qstart + vdj_len - 1]
                            added5 = myV.sstart - 1
                        else:
                            entry.seq = entry.seq[: myV.qstart + vdj_len - 1]
                            added5 = myV.qstart - 1
                    elif arguments['--nterm'] == "germline":
                        entry.seq = dict_v[myV.sid].seq[0 : myV.sstart - 1] + entry.seq[myV.qstart - 1 : myV.qstart + vdj_len - 1]
                        added5 = myV.sstart - 1
                    else:
                        entry.seq = entry.seq[myV.qstart - 1 : myV.qstart + vdj_len - 1]
                else:
                    # blast found full V gene
                    entry.seq = entry.seq[myV.qstart - 1 : myV.qstart + vdj_len - 1]
            else:
                # minus strand
                const_seq = str(entry.seq[: myV.qend - vdj_len].reverse_complement())
                if myV.send > 1:
                    if arguments['--nterm'] == "extend":
                        if len(entry.seq) - myV.qend >= myV.send - 1:
                            entry.seq = entry.seq[myV.qend - vdj_len : myV.qend + myV.send - 1].reverse_complement()
                            added5 = myV.send - 1
                        else:
                            added5 = len(entry.seq) - myV.qend
                            entry.seq = entry.seq[myV.qend - vdj_len :].reverse_complement()
                    elif arguments['--nterm'] == "germline":
                        entry.seq = dict_v[myV.sid].seq[0 : myV.send - 1] + entry.seq[myV.qend - vdj_len : myV.qend].reverse_complement()
                        added5 = myV.send - 1
                    else:
                        entry.seq = entry.seq[myV.qend - vdj_len : myV.qend].reverse_complement()
                else:
                    # blast found full V gene
                    entry.seq = entry.seq[myV.qend - vdj_len : myV.qend].reverse_complement()

            # get CDR3 boundaries
            # (min and max statements take care of switching a possible minus strand hit)
            cdr3_start, cdr3_end, WF_motif = find_cdr3_borders(myV.sid, str(dict_v[myV.sid].seq), v_len,
                                                               min(myV.sstart, myV.send), max(myV.sstart, myV.send),
                                                               str(dict_j[myJ.sid].seq), myJ.sstart, myJ.qstart,
                                                               myJ.gaps, str(entry.seq[added5:]))
            cdr3_seq = entry.seq[added5 + cdr3_start : added5 + cdr3_end]

            # push the sequence into frame for translation, if need be
            v_frame = (min([myV.sstart, myV.send]) - added5) % 3
            five_prime_add = (v_frame - 1) % 3
            entry.seq = 'N' * five_prime_add + entry.seq

            # prevent BioPython errors by trimming to last full codon
            # if (len(entry.seq) % 3) > 0:
            #     entry.seq = entry.seq[: -1 * (len(entry.seq) % 3)]

            # check for stop codons
            if '*' in entry.seq.translate():
                stop = "T"

            # check for in-frame junction
            if len(cdr3_seq) % 3 != 0:
                productive = "F"
            else:
                # even if recombination looks ok, might be (sequencing) indels in V and/or J
                # j genes start in different frames, so calculate based on the position of
                # the conserved W/F found by the cdr3 subroutine above
                j_frame = 3 - ((WF_motif - myJ.sstart) % 3)
                frame_shift = (v_len + myJ.qstart + added5 - 1) % 3
                if (v_frame + frame_shift) % 3 != j_frame % 3:
                    indel = "T"
                else:
                    # use blast gaps to detect frame shift in-dels
                    # most of these have stop codons or other sequence problems,
                    # but we'll catch a few extra this way
                    if (abs(myV.send - myV.sstart) - (myV.qend - myV.qstart)) % 3 != 0 or \
                       ((myJ.send - myJ.sstart) - (myJ.qend - myJ.qstart)) % 3 != 0:
                        indel = "T"

            # make sure cdr3 boundaries make sense
            if (cdr3_end <= cdr3_start or cdr3_end > vdj_len or cdr3_start < 0):
                cdr3 = False

            status = "good"
            if not cdr3:
                status = "noCDR3"
            elif productive == "F":
                status = "nonproductive"
            elif indel == "T":
                status = "indel"
            elif stop == "T":
                status = "stop"
            elif arguments['--nterm'] == "discard" and min(myV.sstart, myV.send) > 1:
                status = "missingNterm"

            # add germline assignments to fasta description and write to disk
            myVgenes = ",".join([myV.sid] + dict_other_vgerms.get(entry.id, []))
            myJgenes = ",".join([myJ.sid] + dict_other_jgerms.get(entry.id, []))

            myDgenes = ""
            if d:
                if entry.id in dict_dgerm_aln:
                    myDgenes = ",".join([dict_dgerm_aln[entry.id].sid] + dict_other_dgerms.get(entry.id, []))

            myCgenes = ""
            if c:
                if entry.id in dict_cgerm_aln:
                    myCgenes = ",".join([dict_cgerm_aln[entry.id].sid] + dict_other_cgerms.get(entry.id, []))
                elif not arguments['--noFallBack']:
                    if re.match("C[CT]", const_seq):
                        myCgenes = "IGHG"  # could also be IgE, but I'm assuming that's rare
                    elif re.match("GGA", const_seq):
                        myCgenes = "IGHM"
                    elif re.match("CAT", const_seq):
                        myCgenes = "IGHA"
                    elif re.match("CAC", const_seq):
                        myCgenes = "IGHD"
            elif not arguments['--noFallBack']:
                if re.match("C[CT]", const_seq):
                    myCgenes = "IGHG"  # could also be IgE, but I'm assuming that's rare
                elif re.match("GGA", const_seq):
                    myCgenes = "IGHM"
                elif re.match("CAT", const_seq):
                    myCgenes = "IGHA"
                elif re.match("CAC", const_seq):
                    myCgenes = "IGHD"

            vlocus = ""
            if any(x in myV.sid for x in ["HV", "VH", "Vh", "vh", "heavy", "Heavy", "HEAVY"]):
                vlocus = "IGH"
            elif any(x in myV.sid for x in ["LV", "VL", "Vl", "vl", "lambda", "Lambda", "LAMBDA"]):
                vlocus = "IGL"
            elif any(x in myV.sid for x in ["KV", "VK", "Vk", "vk", "kappa", "Kappa", "KAPPA"]):
                vlocus = "IGK"

            # do AIRR output
            if myV.strand == "plus":
                rearrangement['rev_comp'] = "F"
            else:
                rearrangement['rev_comp'] = "T"
            if status == "good":
                rearrangement['productive'] = "T"
            else:
                rearrangement['productive'] = "F"
            rearrangement['vj_in_frame'] = productive
            rearrangement['stop_codon'] = stop
            rearrangement['locus'] = vlocus
            rearrangement['v_call'] = myVgenes
            rearrangement['j_call'] = myJgenes
            rearrangement['d_call'] = myDgenes
            rearrangement['c_call'] = myCgenes
            rearrangement['sequence_alignment'] = str(entry.seq)
            rearrangement['junction'] = cdr3_seq
            rearrangement['junction_aa'] = cdr3_seq.translate()
            rearrangement['junction_length'] = len(cdr3_seq)
            rearrangement['length_trimmed'] = len(entry.seq)
            rearrangement['indels'] = indel
            rearrangement['status'] = status
            rearrangement['blast_identity'] = "%.3f" % (myV.identity / 100.0)

            seq_stats.write(rearrangement)
            counts[status] += 1

    print("chunk %s: %d done, found %d; %d good..." % (arguments['--chunk'], total, found, counts['good']))
    seq_stats.close()
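
# The frame bookkeeping above is the subtlest part of parse_blast, so here is a
# worked example of the consistency check with invented numbers: a V hit starting
# at subject position 1 (sstart=1, added5=0, v_len=296), a J hit starting at query
# position qstart=320, and the conserved W/F codon at J subject position
# WF_motif=34 with myJ.sstart=2.
v_frame = (1 - 0) % 3                  # = 1
j_frame = 3 - ((34 - 2) % 3)           # = 1
frame_shift = (296 + 320 + 0 - 1) % 3  # = 0
assert (v_frame + frame_shift) % 3 == j_frame % 3  # frames agree: no indel flagged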
data = airr.read_rearrangement('toy_data.tsv')
print(data.fields)
print(data.external_fields)
for r in data:
    print(r)

# Create a new rearrangements file with an intermediate parser
# Technically, the parser tool should be reading the VDJ rearrangements
# output file, parsing it, then writing the row data.
print('*****')
print('*****')
print('Create new rearrangements file.')
print('*****')
print('*****')

data = airr.read_rearrangement('toy_data.tsv')
newd = airr.create_rearrangement('my_data.tsv', fields=data.fields)
print(newd.fields)
print(newd.external_fields)
for r in data:
    newd.write(r)
newd.close()

data = airr.read_rearrangement('my_data.tsv')
print(data.fields)
print(data.external_fields)
for r in data:
    print(r)

# create a derived rearrangements file with additional annotation
print('*****')
print('*****')
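
# The example breaks off just as the derived-file demo begins. One way to finish
# it, using the airr library's derive_rearrangement() helper (which clones an
# existing file's columns and appends new ones); 'new_field' and the output file
# name are illustrative:
print('Create derived rearrangements file.')
data = airr.read_rearrangement('my_data.tsv')
newd = airr.derive_rearrangement('my_data2.tsv', 'my_data.tsv', fields=['new_field'])
for r in data:
    r['new_field'] = 'A'  # example annotation value
    newd.write(r)
newd.close()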
def main():

    if not glob.glob("%s/%s_*.fasta" % (prj_tree.jgene, prj_name)):
        sys.exit("No jBlast output found!\n")

    print("curating junction and 3' end...")

    allV_aa = open("%s/%s_allV.fa" % (prj_tree.aa, prj_name), "w")
    allV_nt = open("%s/%s_allV.fa" % (prj_tree.nt, prj_name), "w")
    allJ_aa = open("%s/%s_allJ.fa" % (prj_tree.aa, prj_name), "w")
    allJ_nt = open("%s/%s_allJ.fa" % (prj_tree.nt, prj_name), "w")
    vj_aa = open("%s/%s_goodVJ.fa" % (prj_tree.aa, prj_name), "w")
    vj_nt = open("%s/%s_goodVJ.fa" % (prj_tree.nt, prj_name), "w")
    good_cdr3_aa = open("%s/%s_goodCDR3.fa" % (prj_tree.aa, prj_name), "w")
    good_cdr3_nt = open("%s/%s_goodCDR3.fa" % (prj_tree.nt, prj_name), "w")
    all_cdr3_nt = open("%s/%s_allCDR3.fa" % (prj_tree.nt, prj_name), "w")

    # get raw seq stats from temp table
    raw = csv.reader(open("%s/id_lookup.txt" % prj_tree.internal, 'r'), delimiter=sep)

    raw_count, total, found, noV, noJ, f_ind = 0, 0, 0, 0, 0, 1
    counts = {'good': 0, 'nonproductive': 0, 'indel': 0, 'noCDR3': 0, 'stop': 0}
    if arguments['--nterm'] == "discard":
        counts["missingNterm"] = 0

    writer = csv.writer(open("%s/%s_jgerm_tophit.txt" % (prj_tree.tables, prj_name), "w"), delimiter=sep)
    writer.writerow(PARSED_BLAST_HEADER)
    dict_jcounts = dict()
    dict_ccounts = dict()
    dict_dcounts = dict()

    c = False
    if os.path.isfile("%s/%s_C_001.txt" % (prj_tree.jgene, prj_name)):
        c = True
        cWriter = csv.writer(open("%s/%s_cgerm_tophit.txt" % (prj_tree.tables, prj_name), "w"), delimiter=sep)
        cWriter.writerow(PARSED_BLAST_HEADER)

    d = False
    if os.path.isfile("%s/%s_D_001.txt" % (prj_tree.jgene, prj_name)):
        d = True
        dWriter = csv.writer(open("%s/%s_dgerm_tophit.txt" % (prj_tree.tables, prj_name), "w"), delimiter=sep)
        dWriter.writerow(PARSED_BLAST_HEADER)

    seq_stats = airr.create_rearrangement("%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name),
                                          fields=['vj_in_frame', 'stop_codon', 'locus', 'c_call', 'junction_length',
                                                  'source_file', 'source_id', 'duplicate_count', 'length_raw',
                                                  'length_trimmed', 'indels', 'status', 'blast_identity'])

    while os.path.isfile("%s/%s_%03d.fasta" % (prj_tree.vgene, prj_name, f_ind)):

        dict_vgerm_aln, dict_other_vgerms, dict_vcounts = get_top_hits("%s/%s_%03d.txt" % (prj_tree.vgene, prj_name, f_ind))
        dict_jgerm_aln, dict_other_jgerms, dict_jcounts = get_top_hits("%s/%s_%03d.txt" % (prj_tree.jgene, prj_name, f_ind),
                                                                       topHitWriter=writer, dict_germ_count=dict_jcounts)

        if c:
            minCStartPos = dict([(x, dict_jgerm_aln[x].qend) for x in dict_jgerm_aln.keys()])
            dict_cgerm_aln, dict_other_cgerms, dict_ccounts = get_top_hits("%s/%s_C_%03d.txt" % (prj_tree.jgene, prj_name, f_ind),
                                                                           topHitWriter=cWriter, dict_germ_count=dict_ccounts,
                                                                           minQStart=minCStartPos)

        if d:
            maxDEndPos = dict([(x, dict_jgerm_aln[x].qstart) for x in dict_jgerm_aln.keys()])
            dict_dgerm_aln, dict_other_dgerms, dict_dcounts = get_top_hits("%s/%s_D_%03d.txt" % (prj_tree.jgene, prj_name, f_ind),
                                                                           topHitWriter=dWriter, dict_germ_count=dict_dcounts,
                                                                           maxQEnd=maxDEndPos)

        for entry in SeqIO.parse("%s/%s_%03d.fasta" % (prj_tree.vgene, prj_name, f_ind), "fasta"):

            total += 1
            raw_stats = next(raw)
            raw_count += 1

            while not entry.id == raw_stats[0]:
                # we found a read that did not meet the length cut-off
                raw_stats = next(raw)
                raw_count += 1

            rearrangement = dict()
            rearrangement['sequence_id'] = raw_stats[0]
            rearrangement['source_file'] = raw_stats[1]
            rearrangement['source_id'] = raw_stats[2]
            rearrangement['length_raw'] = raw_stats[4]
            rearrangement['sequence'] = str(entry.seq)

            if not raw_stats[3] == "NA":
                rearrangement['duplicate_count'] = raw_stats[3]
                entry.description = "duplicate_count=%s" % raw_stats[3]
            else:
                entry.description = ""  # just in case

            if not entry.id in dict_vgerm_aln:
                noV += 1
                rearrangement['status'] = 'noV'
                seq_stats.write(rearrangement)

            elif not entry.id in dict_jgerm_aln:
                noJ += 1
                myV = dict_vgerm_aln[entry.id]
                if (myV.strand == 'plus'):
                    entry.seq = entry.seq[myV.qstart - 1 :]
                else:
                    entry.seq = entry.seq[: myV.qend].reverse_complement()
                myVgenes = ",".join([myV.sid] + dict_other_vgerms.get(entry.id, []))
                entry.description += " v_call=%s status=noJ" % (myVgenes)
                allV_nt.write(">%s %s\n%s\n" % (entry.id, entry.description, entry.seq))

                # prevent BioPython errors
                if (len(entry.seq) % 3) > 0:
                    entry.seq = entry.seq[: -1 * (len(entry.seq) % 3)]
                allV_aa.write(">%s %s\n%s\n" % (entry.id, entry.description, entry.seq.translate()))

                rearrangement['v_call'] = myVgenes
                rearrangement['status'] = 'noJ'
                seq_stats.write(rearrangement)

            else:
                found += 1
                myV = dict_vgerm_aln[entry.id]
                myJ = dict_jgerm_aln[entry.id]
                added5 = 0
                productive = "T"
                indel = "F"
                stop = "F"
                cdr3 = True

                # get actual V(D)J sequence
                v_len = myV.qend - (myV.qstart - 1)  # need to use qstart and qend instead of alignment to account for gaps

                # try to recover 3' of J
                if myJ.send < len(dict_j[myJ.sid].seq) and \
                   ((myV.strand == "plus" and myV.qstart + v_len + myJ.qend + (len(dict_j[myJ.sid].seq) - myJ.send) <= len(entry.seq)) or
                    (myV.strand == "minus" and myV.qend - (v_len + myJ.qend + (len(dict_j[myJ.sid].seq) - myJ.send)) >= 0)):
                    vdj_len = v_len + myJ.qend + (len(dict_j[myJ.sid].seq) - myJ.send)
                else:
                    vdj_len = v_len + myJ.qend

                if (myV.strand == 'plus'):
                    if myV.sstart > 1:
                        if arguments['--nterm'] == "extend":
                            if myV.qstart >= myV.sstart:
                                entry.seq = entry.seq[myV.qstart - myV.sstart : myV.qstart + vdj_len - 1]
                                added5 = myV.sstart - 1
                            else:
                                entry.seq = entry.seq[: myV.qstart + vdj_len - 1]
                                added5 = myV.qstart - 1
                        elif arguments['--nterm'] == "germline":
                            entry.seq = dict_v[myV.sid].seq[0 : myV.sstart - 1] + entry.seq[myV.qstart - 1 : myV.qstart + vdj_len - 1]
                            added5 = myV.sstart - 1
                        else:
                            entry.seq = entry.seq[myV.qstart - 1 : myV.qstart + vdj_len - 1]
                    else:
                        # blast found full V gene
                        entry.seq = entry.seq[myV.qstart - 1 : myV.qstart + vdj_len - 1]
                else:
                    # minus strand
                    if myV.send > 1:
                        if arguments['--nterm'] == "extend":
                            if len(entry.seq) - myV.qend >= myV.send - 1:
                                entry.seq = entry.seq[myV.qend - vdj_len : myV.qend + myV.send - 1].reverse_complement()
                                added5 = myV.send - 1
                            else:
                                added5 = len(entry.seq) - myV.qend
                                entry.seq = entry.seq[myV.qend - vdj_len :].reverse_complement()
                        elif arguments['--nterm'] == "germline":
                            entry.seq = dict_v[myV.sid].seq[0 : myV.send - 1] + entry.seq[myV.qend - vdj_len : myV.qend].reverse_complement()
                            added5 = myV.send - 1
                        else:
                            entry.seq = entry.seq[myV.qend - vdj_len : myV.qend].reverse_complement()
                    else:
                        # blast found full V gene
                        entry.seq = entry.seq[myV.qend - vdj_len : myV.qend].reverse_complement()

                # get CDR3 boundaries
                # (min and max statements take care of switching a possible minus strand hit)
                cdr3_start, cdr3_end, WF_motif = find_cdr3_borders(myV.sid, str(dict_v[myV.sid].seq), v_len,
                                                                   min(myV.sstart, myV.send), max(myV.sstart, myV.send),
                                                                   str(dict_j[myJ.sid].seq), myJ.sstart, myJ.qstart,
                                                                   myJ.gaps, str(entry.seq[added5:]))
                cdr3_seq = entry.seq[added5 + cdr3_start : added5 + cdr3_end]

                # push the sequence into frame for translation, if need be
                v_frame = (min([myV.sstart, myV.send]) - added5) % 3
                five_prime_add = (v_frame - 1) % 3
                entry.seq = 'N' * five_prime_add + entry.seq

                # prevent BioPython errors by trimming to last full codon
                # if (len(entry.seq) % 3) > 0:
                #     entry.seq = entry.seq[: -1 * (len(entry.seq) % 3)]

                # check for stop codons
                if '*' in entry.seq.translate():
                    stop = "T"

                # check for in-frame junction
                if len(cdr3_seq) % 3 != 0:
                    productive = "F"
                else:
                    # even if recombination looks ok, might be (sequencing) indels in V and/or J
                    # j genes start in different frames, so calculate based on the position of
                    # the conserved W/F found by the cdr3 subroutine above
                    j_frame = 3 - ((WF_motif - myJ.sstart) % 3)
                    frame_shift = (v_len + myJ.qstart + added5 - 1) % 3
                    if (v_frame + frame_shift) % 3 != j_frame % 3:
                        indel = "T"
                        # for gDNA we would probably want to distinguish between an out-of-frame
                        # recombination and sequencing in-dels in V or J, but that can be ambiguous
                        # and for cDNA we can assume that it's all sequencing in-del anyway, even in CDR3.
                    else:
                        # use blast gaps to detect frame shift in-dels
                        # most of these have stop codons or other sequence problems,
                        # but we'll catch a few extra this way
                        if (abs(myV.send - myV.sstart) - (myV.qend - myV.qstart)) % 3 != 0 or \
                           ((myJ.send - myJ.sstart) - (myJ.qend - myJ.qstart)) % 3 != 0:
                            indel = "T"

                # make sure cdr3 boundaries make sense
                if (cdr3_end <= cdr3_start or cdr3_end > vdj_len or cdr3_start < 0):
                    cdr3 = False

                status = "good"
                if not cdr3:
                    status = "noCDR3"
                elif productive == "F":
                    status = "nonproductive"
                elif indel == "T":
                    status = "indel"
                elif stop == "T":
                    status = "stop"
                elif arguments['--nterm'] == "discard" and min(myV.sstart, myV.send) > 1:
                    status = "missingNterm"

                # add germline assignments to fasta description and write to disk
                myVgenes = ",".join([myV.sid] + dict_other_vgerms.get(entry.id, []))
                myJgenes = ",".join([myJ.sid] + dict_other_jgerms.get(entry.id, []))

                myDgenes = ""
                if d:
                    if entry.id in dict_dgerm_aln:
                        myDgenes = ",".join([dict_dgerm_aln[entry.id].sid] + dict_other_dgerms.get(entry.id, []))

                myCgenes = ""
                if c:
                    if entry.id in dict_cgerm_aln:
                        myCgenes = ",".join([dict_cgerm_aln[entry.id].sid] + dict_other_cgerms.get(entry.id, []))

                vlocus = ""
                if any(x in myV.sid for x in ["HV", "VH", "Vh", "vh", "heavy", "Heavy", "HEAVY"]):
                    vlocus = "IGH"
                elif any(x in myV.sid for x in ["LV", "VL", "Vl", "vl", "lambda", "Lambda", "LAMBDA"]):
                    vlocus = "IGL"
                elif any(x in myV.sid for x in ["KV", "VK", "Vk", "vk", "kappa", "Kappa", "KAPPA"]):
                    vlocus = "IGK"

                entry.description += " v_call=%s" % myVgenes
                if myDgenes != "":
                    entry.description += " d_call=%s" % myDgenes
                entry.description += " j_call=%s" % myJgenes
                if myCgenes != "":
                    entry.description += " c_call=%s" % myCgenes
                if vlocus != "":
                    entry.description += " locus=%s" % vlocus
                entry.description += " status=%s blast_identity=%.3f junction_length=%d junction=%s junction_aa=%s" % \
                                     (status, myV.identity / 100.0, len(cdr3_seq), cdr3_seq, cdr3_seq.translate())

                allV_nt.write(">%s %s\n%s\n" % (entry.id, entry.description, entry.seq))
                allV_aa.write(">%s %s\n%s\n" % (entry.id, entry.description, entry.seq.translate()))
                allJ_nt.write(">%s %s\n%s\n" % (entry.id, entry.description, entry.seq))
                allJ_aa.write(">%s %s\n%s\n" % (entry.id, entry.description, entry.seq.translate()))

                if status == "good":
                    vj_nt.write(">%s %s\n%s\n" % (entry.id, entry.description, entry.seq))
                    vj_aa.write(">%s %s\n%s\n" % (entry.id, entry.description, entry.seq.translate()))
                    good_cdr3_nt.write(">%s %s\n%s\n" % (entry.id, entry.description, cdr3_seq))
                    good_cdr3_aa.write(">%s %s\n%s\n" % (entry.id, entry.description, cdr3_seq.translate()))
                    all_cdr3_nt.write(">%s %s\n%s\n" % (entry.id, entry.description, cdr3_seq))
                elif cdr3:
                    # CDR3 but not "good"
                    all_cdr3_nt.write(">%s %s\n%s\n" % (entry.id, entry.description, cdr3_seq))

                # do AIRR output
                if myV.strand == "plus":
                    rearrangement['rev_comp'] = "F"
                else:
                    rearrangement['rev_comp'] = "T"
                if status == "good":
                    rearrangement['productive'] = "T"
                else:
                    rearrangement['productive'] = "F"
                rearrangement['vj_in_frame'] = productive
                rearrangement['stop_codon'] = stop
                rearrangement['locus'] = vlocus
                rearrangement['v_call'] = myVgenes
                rearrangement['j_call'] = myJgenes
                rearrangement['d_call'] = myDgenes
                rearrangement['c_call'] = myCgenes
                rearrangement['sequence_alignment'] = str(entry.seq)
                rearrangement['junction'] = cdr3_seq
                rearrangement['junction_aa'] = cdr3_seq.translate()
                rearrangement['junction_length'] = len(cdr3_seq)
                rearrangement['length_trimmed'] = len(entry.seq)
                rearrangement['indels'] = indel
                rearrangement['status'] = status
                rearrangement['blast_identity'] = "%.3f" % (myV.identity / 100.0)

                seq_stats.write(rearrangement)
                counts[status] += 1

        print("%d done, found %d; %d good..." % (total, found, counts['good']))
        f_ind += 1

    seq_stats.close()

    # print out some statistics
    handle = open("%s/%s_jgerm_stat.txt" % (prj_tree.tables, prj_name), 'w')
    writer = csv.writer(handle, delimiter=sep)
    keys = sorted(dict_jcounts.keys())
    writer.writerow(["gene", "count", "percent"])
    for key in keys:
        aline = [key, dict_jcounts[key], "%4.2f" % (dict_jcounts[key] / float(found) * 100)]
        writer.writerow(aline)
    handle.close()

    if len(dict_ccounts) > 0:
        handle = open("%s/%s_cgerm_stat.txt" % (prj_tree.tables, prj_name), 'w')
        writer = csv.writer(handle, delimiter=sep)
        keys = sorted(dict_ccounts.keys())
        writer.writerow(["gene", "count", "percent"])
        for key in keys:
            aline = [key, dict_ccounts[key], "%4.2f" % (dict_ccounts[key] / float(found) * 100)]
            writer.writerow(aline)
        handle.close()

    if len(dict_dcounts) > 0:
        handle = open("%s/%s_dgerm_stat.txt" % (prj_tree.tables, prj_name), 'w')
        writer = csv.writer(handle, delimiter=sep)
        keys = sorted(dict_dcounts.keys())
        writer.writerow(["gene", "count", "percent"])
        for key in keys:
            aline = [key, dict_dcounts[key], "%4.2f" % (dict_dcounts[key] / float(found) * 100)]
            writer.writerow(aline)
        handle.close()

    message = "\nTotal raw reads: %d\nCorrect Length: %d\nV assigned: %d\nJ assigned: %d\nCDR3 assigned: %d\nIn-frame junction: %d\nNo indels: %d\nContinuous ORF with no stop codons: %d\n\n" % \
              (raw_count, total, total - noV, found,
               found - counts['noCDR3'], found - counts['noCDR3'] - counts['nonproductive'],
               found - counts['noCDR3'] - counts['nonproductive'] - counts['indel'], counts['good'])
    print(message)

    handle = open("%s/finalize_blast.log" % prj_tree.logs, "w")
    handle.write(message)
    handle.close()

    # clean up!!
    oldFiles = glob.glob("%s/*txt" % prj_tree.vgene) + glob.glob("%s/*fasta" % prj_tree.vgene) + \
               glob.glob("%s/*txt" % prj_tree.jgene) + glob.glob("%s/*fasta" % prj_tree.jgene) + \
               glob.glob("%s/id_lookup.txt" % prj_tree.internal)
    if len(oldFiles) > 0 and not arguments['--noclean']:
        [os.remove(f) for f in oldFiles]
def main():

    print("Processing chunk %s..." % arguments['--chunk'])

    # get raw seq stats from temp table
    raw = csv.reader(open("%s/lookup_%s.txt" % (prj_tree.internal, arguments['--chunk']), 'r'), delimiter=sep)

    raw_count, total, found, noV, noJ, f_ind = 0, 0, 0, 0, 0, 1
    counts = Counter()

    writer = csv.writer(open("%s/jtophit_%s.txt" % (prj_tree.jgene, arguments['--chunk']), "w"), delimiter=sep)
    writer.writerow(PARSED_BLAST_HEADER)
    dict_jcounts = dict()
    dict_ccounts = dict()
    dict_dcounts = dict()

    c = False
    if os.path.isfile("%s/%s_C_%s.txt" % (prj_tree.jgene, prj_name, arguments['--chunk'])):
        c = True
        cWriter = csv.writer(open("%s/ctophit_%s.txt" % (prj_tree.jgene, arguments['--chunk']), "w"), delimiter=sep)
        cWriter.writerow(PARSED_BLAST_HEADER)

    d = False
    if os.path.isfile("%s/%s_D_%s.txt" % (prj_tree.jgene, prj_name, arguments['--chunk'])):
        d = True
        dWriter = csv.writer(open("%s/dtophit_%s.txt" % (prj_tree.jgene, arguments['--chunk']), "w"), delimiter=sep)
        dWriter.writerow(PARSED_BLAST_HEADER)

    seq_stats = airr.create_rearrangement("%s/rearrangements_%s.tsv" % (prj_tree.internal, arguments['--chunk']),
                                          fields=['vj_in_frame', 'stop_codon', 'locus', 'c_call', 'junction_length',
                                                  'source_file', 'source_id', 'duplicate_count', 'length_raw',
                                                  'length_trimmed', 'indels', 'status', 'blast_identity',
                                                  'consensus_count', 'cell_id'])

    dict_vgerm_aln, dict_other_vgerms, dict_vcounts = get_top_hits("%s/%s_%s.txt" % (prj_tree.vgene, prj_name, arguments['--chunk']))
    dict_jgerm_aln, dict_other_jgerms, dict_jcounts = get_top_hits("%s/%s_%s.txt" % (prj_tree.jgene, prj_name, arguments['--chunk']),
                                                                   topHitWriter=writer, dict_germ_count=dict_jcounts,
                                                                   strand="plus")

    if c:
        minCStartPos = dict([(x, dict_jgerm_aln[x].qend) for x in dict_jgerm_aln.keys()])
        dict_cgerm_aln, dict_other_cgerms, dict_ccounts = get_top_hits("%s/%s_C_%s.txt" % (prj_tree.jgene, prj_name, arguments['--chunk']),
                                                                       topHitWriter=cWriter, dict_germ_count=dict_ccounts,
                                                                       minQStart=minCStartPos, strand="plus")

    if d:
        maxDEndPos = dict([(x, dict_jgerm_aln[x].qstart) for x in dict_jgerm_aln.keys()])
        dict_dgerm_aln, dict_other_dgerms, dict_dcounts = get_top_hits("%s/%s_D_%s.txt" % (prj_tree.jgene, prj_name, arguments['--chunk']),
                                                                       topHitWriter=dWriter, dict_germ_count=dict_dcounts,
                                                                       maxQEnd=maxDEndPos, strand="plus")

    for entry in SeqIO.parse("%s/%s_%s.fasta" % (prj_tree.vgene, prj_name, arguments['--chunk']), "fasta"):

        total += 1
        raw_stats = next(raw)
        raw_count += 1

        while not entry.id == raw_stats[0]:
            # we found a read that did not meet the length cut-off
            raw_stats = next(raw)
            raw_count += 1

        rearrangement = dict()
        rearrangement['sequence_id'] = raw_stats[0]
        rearrangement['source_file'] = raw_stats[1]
        rearrangement['source_id'] = raw_stats[2]
        rearrangement['length_raw'] = raw_stats[3]
        rearrangement['sequence'] = str(entry.seq)

        if not raw_stats[4] == "NA": rearrangement['duplicate_count'] = raw_stats[4]
        if not raw_stats[5] == "NA": rearrangement['consensus_count'] = raw_stats[5]
        if not raw_stats[6] == "NA": rearrangement['cell_id'] = raw_stats[6]

        if not entry.id in dict_vgerm_aln:
            noV += 1
            rearrangement['status'] = 'noV'
            seq_stats.write(rearrangement)

        elif not entry.id in dict_jgerm_aln:
            noJ += 1
            myV = dict_vgerm_aln[entry.id]
            entry.seq = entry.seq[myV.qstart - 1 : myV.qend]
            if (myV.strand == 'minus'):
                entry.seq = entry.seq.reverse_complement()
                rearrangement['rev_comp'] = "T"
            else:
                rearrangement['rev_comp'] = "F"
            myVgenes = ",".join([myV.sid] + dict_other_vgerms.get(entry.id, []))
            vlocus = ""
            if re.search("(HV|VH|heavy)", myV.sid, re.I):
                vlocus = "IGH"
            elif re.search("(LV|VL|lambda)", myV.sid, re.I):
                vlocus = "IGL"
            elif re.search("(KV|VK|kappa)", myV.sid, re.I):
                vlocus = "IGK"
            rearrangement['v_call'] = myVgenes
            rearrangement['locus'] = vlocus
            rearrangement['productive'] = "F"
            rearrangement['status'] = 'noJ'
            rearrangement['sequence_alignment'] = str(entry.seq)
            seq_stats.write(rearrangement)

        else:
            found += 1
            myV = dict_vgerm_aln[entry.id]
            myJ = dict_jgerm_aln[entry.id]
            added5 = 0
            productive = "T"
            indel = "F"
            stop = "F"
            cdr3 = True

            vlocus = ""
            if re.search("(HV|VH|heavy)", myV.sid, re.I):
                vlocus = "IGH"
            elif re.search("(LV|VL|lambda)", myV.sid, re.I):
                vlocus = "IGL"
            elif re.search("(KV|VK|kappa)", myV.sid, re.I):
                vlocus = "IGK"

            # get actual V(D)J sequence
            v_len = myV.qend - (myV.qstart - 1)  # need to use qstart and qend instead of alignment to account for gaps

            # try to recover 3' of J
            if myJ.send < len(dict_j[myJ.sid].seq) and \
               ((myV.strand == "plus" and myV.qstart + v_len + myJ.qend + (len(dict_j[myJ.sid].seq) - myJ.send) <= len(entry.seq)) or
                (myV.strand == "minus" and myV.qend - (v_len + myJ.qend + (len(dict_j[myJ.sid].seq) - myJ.send)) >= 0)):
                vdj_len = v_len + myJ.qend + (len(dict_j[myJ.sid].seq) - myJ.send)
            else:
                vdj_len = v_len + myJ.qend

            const_seq = ""
            if (myV.strand == 'plus'):
                const_seq = str(entry.seq[myV.qstart + vdj_len - 1:])
                if myV.sstart > 1:
                    if arguments['--nterm'] == "extend":
                        if myV.qstart >= myV.sstart:
                            entry.seq = entry.seq[myV.qstart - myV.sstart : myV.qstart + vdj_len - 1]
                            added5 = myV.sstart - 1
                        else:
                            entry.seq = entry.seq[: myV.qstart + vdj_len - 1]
                            added5 = myV.qstart - 1
                    elif arguments['--nterm'] == "germline":
                        entry.seq = dict_v[myV.sid].seq[0 : myV.sstart - 1] + entry.seq[myV.qstart - 1 : myV.qstart + vdj_len - 1]
                        added5 = myV.sstart - 1
                    else:
                        entry.seq = entry.seq[myV.qstart - 1 : myV.qstart + vdj_len - 1]
                else:
                    # blast found full V gene
                    entry.seq = entry.seq[myV.qstart - 1 : myV.qstart + vdj_len - 1]
            else:
                # minus strand
                const_seq = str(entry.seq[: myV.qend - vdj_len].reverse_complement())
                if myV.send > 1:
                    if arguments['--nterm'] == "extend":
                        if len(entry.seq) - myV.qend >= myV.send - 1:
                            entry.seq = entry.seq[myV.qend - vdj_len : myV.qend + myV.send - 1].reverse_complement()
                            added5 = myV.send - 1
                        else:
                            added5 = len(entry.seq) - myV.qend
                            entry.seq = entry.seq[myV.qend - vdj_len :].reverse_complement()
                    elif arguments['--nterm'] == "germline":
                        entry.seq = dict_v[myV.sid].seq[0 : myV.send - 1] + entry.seq[myV.qend - vdj_len : myV.qend].reverse_complement()
                        added5 = myV.send - 1
                    else:
                        entry.seq = entry.seq[myV.qend - vdj_len : myV.qend].reverse_complement()
                else:
                    # blast found full V gene
                    entry.seq = entry.seq[myV.qend - vdj_len : myV.qend].reverse_complement()

            # get CDR3 boundaries
            # (min and max statements take care of switching a possible minus strand hit)
            cdr3_start, cdr3_end, WF_motif = find_cdr3_borders(myV.sid, str(dict_v[myV.sid].seq), v_len,
                                                               min(myV.sstart, myV.send), max(myV.sstart, myV.send),
                                                               str(dict_j[myJ.sid].seq), myJ.sstart, myJ.qstart,
                                                               myJ.gaps, str(entry.seq[added5:]))
            cdr3_seq = entry.seq[added5 + cdr3_start : added5 + cdr3_end]

            # push the sequence into frame for translation, if need be
            v_frame = (min([myV.sstart, myV.send]) - added5) % 3
            five_prime_add = (v_frame - 1) % 3
            entry.seq = 'N' * five_prime_add + entry.seq

            # prevent BioPython errors by trimming to last full codon
            # if (len(entry.seq) % 3) > 0:
            #     entry.seq = entry.seq[: -1 * (len(entry.seq) % 3)]

            # check for stop codons
            if '*' in entry.seq.translate():
                stop = "T"

            # check for in-frame junction
            if len(cdr3_seq) % 3 != 0:
                productive = "F"
            else:
                # even if recombination looks ok, might be (sequencing) indels in V and/or J
                # j genes start in different frames, so calculate based on the position of
                # the conserved W/F found by the cdr3 subroutine above
                j_frame = 3 - ((WF_motif - myJ.sstart) % 3)
                frame_shift = (v_len + myJ.qstart + added5 - 1) % 3
                if (v_frame + frame_shift) % 3 != j_frame % 3:
                    indel = "T"
                else:
                    # use blast gaps to detect frame shift in-dels
                    # most of these have stop codons or other sequence problems,
                    # but we'll catch a few extra this way
                    if (abs(myV.send - myV.sstart) - (myV.qend - myV.qstart)) % 3 != 0 or \
                       ((myJ.send - myJ.sstart) - (myJ.qend - myJ.qstart)) % 3 != 0:
                        indel = "T"

            # make sure cdr3 boundaries make sense
            if (cdr3_end <= cdr3_start or cdr3_end > vdj_len or cdr3_start < 0):
                cdr3 = False

            status = "good"
            if not cdr3:
                status = "noCDR3"
            elif productive == "F":
                status = "nonproductive"
            elif indel == "T":
                status = "indel"
            elif stop == "T":
                status = "stop"
            elif arguments['--nterm'] == "discard" and min(myV.sstart, myV.send) > 1:
                status = "missingNterm"

            # add germline assignments to fasta description and write to disk
            myVgenes = ",".join([myV.sid] + dict_other_vgerms.get(entry.id, []))
            myJgenes = ",".join([myJ.sid] + dict_other_jgerms.get(entry.id, []))

            myDgenes = ""
            if d:
                if entry.id in dict_dgerm_aln:
                    if not vlocus in ["IGK", "IGL"]:
                        # suppress spurious D gene hits if it's a light chain
                        myDgenes = ",".join([dict_dgerm_aln[entry.id].sid] + dict_other_dgerms.get(entry.id, []))

            myCgenes = ""
            if c and entry.id in dict_cgerm_aln:
                myCgenes = ",".join([dict_cgerm_aln[entry.id].sid] + dict_other_cgerms.get(entry.id, []))
            elif not arguments['--noFallBack']:
                if re.match("C[CT]", const_seq):
                    myCgenes = "IGHG"  # could also be IgE, but I'm assuming that's rare
                elif re.match("GGA", const_seq):
                    myCgenes = "IGHM"
                elif re.match("CAT", const_seq):
                    myCgenes = "IGHA"
                elif re.match("CAC", const_seq):
                    myCgenes = "IGHD"
                elif re.match("CGA", const_seq):
                    myCgenes = "IGKC"
                elif re.match("GGT", const_seq):
                    myCgenes = "IGLC"

            jlocus = ""
            if re.search("(HJ|JH|heavy)", myJ.sid, re.I):
                jlocus = "IGH"
            elif re.search("(LJ|Jl|lambda)", myJ.sid, re.I):
                jlocus = "IGL"
            elif re.search("(KJ|JK|kappa)", myJ.sid, re.I):
                jlocus = "IGK"

            if not vlocus == jlocus:
                # this really shouldn't happen unless one or both gene assignments are
                # based on very short partial hits. Unfortunately, the lengths/e-values
                # are on different scales, so I don't currently have a good heuristic to
                # pick between the two. Just flag it and give up, at least for now.
                status = "chimera"

            if not myCgenes == "" and not vlocus in myCgenes:
                # will fail for custom libraries where C gene names don't start with locus
                myCgenes = ""  # assume constant is incorrect since usually based on only a few bases

            # do AIRR output
            if myV.strand == "plus":
                rearrangement['rev_comp'] = "F"
            else:
                rearrangement['rev_comp'] = "T"
            if status == "good":
                rearrangement['productive'] = "T"
            else:
                rearrangement['productive'] = "F"
            rearrangement['vj_in_frame'] = productive
            rearrangement['stop_codon'] = stop
            rearrangement['locus'] = vlocus
            rearrangement['v_call'] = myVgenes
            rearrangement['j_call'] = myJgenes
            rearrangement['d_call'] = myDgenes
            rearrangement['c_call'] = myCgenes
            rearrangement['sequence_alignment'] = str(entry.seq)
            rearrangement['junction'] = cdr3_seq
            rearrangement['junction_aa'] = cdr3_seq.translate()
            rearrangement['junction_length'] = len(cdr3_seq)
            rearrangement['length_trimmed'] = len(entry.seq)
            rearrangement['indels'] = indel
            rearrangement['status'] = status
            rearrangement['blast_identity'] = "%.3f" % (myV.identity / 100.0)

            seq_stats.write(rearrangement)
            counts[status] += 1

    print("chunk %s: %d done, found %d; %d good..." % (arguments['--chunk'], total, found, counts['good']))
    seq_stats.close()
def airrdownload(args):
    airr.validate_repertoire(args.repertoire, True)
    repertoire_file = args.repertoire
    rearrangements_file = repertoire_file[:-4] + "rearrangements.tsv"

    try:
        data = airr.load_repertoire(args.repertoire)
    except TypeError:
        sys.stderr.write("TCRcloud error: It seems you did not indicate a "
                         "properly formatted AIRR repertoire metadata file\n")
        exit()

    repertoires = data["Repertoire"]
    host_url = testserver(data)

    # Print out some Info
    print("       Info: " + data["Info"]["title"])
    print("    version: " + str(data["Info"]["version"]))
    print("description: " + data["Info"]["description"])
    print("Found " + str(len(data["Repertoire"])) + " repertoires in repertoire metadata file.")

    # Query the rearrangement endpoint
    # Define a generic query object, and we will replace the repertoire_id
    # within the loop. We also only request productive rearrangements as
    # an additional filter.
    query = {
        "filters": {
            "op": "and",
            "content": [
                {
                    "op": "=",
                    "content": {
                        "field": "repertoire_id",
                        "value": "XXX"
                    }
                },
                {
                    "op": "=",
                    "content": {
                        "field": "productive",
                        "value": True
                    }
                }
            ]
        },
        "size": 1000,
        "from": 0
    }

    # Loop through each repertoire and query rearrangement data for
    # each. We download in chunks of 1000 because of server limitations,
    # using the from and size parameters.
    first = True
    for r in repertoires:
        print("Retrieving rearrangements for repertoire: " + r["repertoire_id"])
        query["filters"]["content"][0]["content"]["value"] = r["repertoire_id"]
        query["size"] = 1000
        query["from"] = 0

        cnt = 0
        while True:
            # send the request
            resp = requests.post(host_url + "/rearrangement", json=query)
            data = resp.json()
            rearrangements = data["Rearrangement"]

            # Open a file for writing the rearrangements. We do this here
            # because we need to know the full set of fields being
            # returned from the data repository, otherwise by default only
            # the required fields will be written to the file.
            if first:
                out_file = airr.create_rearrangement(rearrangements_file,
                                                     fields=rearrangements[0].keys())
                first = False

            # save the rearrangements to a file
            for row in rearrangements:
                out_file.write(row)

            # keep looping until zero rearrangements are returned from the query.
            cnt += len(rearrangements)
            if len(rearrangements) < 1000:
                break

            # Need to update the from parameter to get the next chunk
            query["from"] = cnt

        print("Retrieved " + str(cnt) + " rearrangements for repertoire: " + r["repertoire_id"])

    print("Saved as " + rearrangements_file)
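
# A minimal invocation sketch for the downloader above, assuming only that args
# carries a `repertoire` attribute; the argparse wiring and file name are
# hypothetical, not TCRcloud's actual CLI.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("repertoire", help="AIRR repertoire metadata file (JSON)")
args = parser.parse_args(["my_repertoires.airr.json"])  # illustrative value
airrdownload(args)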
def main():

    if not glob.glob("%s/%s_*.fasta" % (prj_tree.jgene, prj_name)):
        sys.exit("No jBlast output found!\n")

    maxFiles = len(glob.glob("%s/%s_*.fasta" % (prj_tree.vgene, prj_name)))

    if not arguments['--reenter']:
        print("curating junction and 3' end...")

        if arguments['--cluster']:
            command = "NUM=`printf \"%s\" $SGE_TASK_ID`\n%s/annotate/parse_blast.py --jmotif '%s' --nterm %s --chunk $NUM\n" % \
                      ("%03d", SCRIPT_FOLDER, arguments['--jmotif'], arguments['--nterm'])
            if arguments['--noFallBack']:
                command += " --noFallBack"
            pbs = open("%s/parse.sh" % prj_tree.jgene, 'w')
            pbs.write("#!/bin/bash\n#$ -N parse-%s\n#$ -l mem=2G\n#$ -cwd\n\n%s\n" % (prj_name, command))
            pbs.close()
            os.system("%s -t 1-%d %s/parse.sh" % (qsub, maxFiles, prj_tree.jgene))

            restart = "%s/annotate/1.3-finalize_assignments.py --reenter" % SCRIPT_FOLDER
            for opt in ['--file', '--min1', '--min2', '--id', '--maxgaps', '--rearrangements', '--save']:
                if arguments[opt] is not None:
                    restart += " %s %s" % (opt, arguments[opt])
            for flag in ['--noclean', '--runClustering', '--runCellStatistics']:
                if arguments[flag]:
                    restart += " %s" % flag

            monitor = open("%s/parse_monitor.sh" % prj_tree.jgene, 'w')
            monitor.write("#!/bin/bash\n#$ -N monitor-%s\n#$ -l mem=2G\n#$ -cwd\n#$ -hold_jid parse-%s\n\n%s\n" % (prj_name, prj_name, restart))
            monitor.close()
            os.system("%s %s/parse_monitor.sh" % (qsub, prj_tree.jgene))
            sys.exit()

        else:
            # do it locally
            parse_pool = Pool(arguments['--threads'])
            parse_pool.map(callParser, range(1, maxFiles + 1))
            parse_pool.close()
            parse_pool.join()

    # ok, now collect all of the partial outputs and merge them
    print("collecting information...")

    # open fasta outputs
    allV_aa = open("%s/%s_allV.fa" % (prj_tree.aa, prj_name), "w")
    allV_nt = open("%s/%s_allV.fa" % (prj_tree.nt, prj_name), "w")

    allJ_aa = open("%s/%s_allJ.fa" % (prj_tree.aa, prj_name), "w")
    allJ_nt = open("%s/%s_allJ.fa" % (prj_tree.nt, prj_name), "w")

    vj_aa = open("%s/%s_goodVJ.fa" % (prj_tree.aa, prj_name), "w")
    vj_nt = open("%s/%s_goodVJ.fa" % (prj_tree.nt, prj_name), "w")

    good_cdr3_aa = open("%s/%s_goodCDR3.fa" % (prj_tree.aa, prj_name), "w")
    good_cdr3_nt = open("%s/%s_goodCDR3.fa" % (prj_tree.nt, prj_name), "w")

    all_cdr3_aa = open("%s/%s_allCDR3.fa" % (prj_tree.aa, prj_name), "w")
    all_cdr3_nt = open("%s/%s_allCDR3.fa" % (prj_tree.nt, prj_name), "w")

    # also open final rearrangements tsv
    seq_stats = airr.create_rearrangement(
        "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name),
        fields=['vj_in_frame', 'stop_codon', 'locus', 'c_call',
                'junction_length', 'source_file', 'source_id',
                'duplicate_count', 'length_raw', 'length_trimmed',
                'indels', 'status', 'blast_identity', 'consensus_count',
                'cell_id'])

    # initiate overall counters
    raw_count, total = 0, 0
    counts = {'good': 0, 'nonproductive': 0, 'indel': 0, 'noCDR3': 0,
              'stop': 0, 'noV': 0, 'noJ': 0, 'missingNterm': 0}

    dict_jcounts = Counter()
    dict_ccounts = Counter()
    dict_dcounts = Counter()

    c = False
    if os.path.isfile("%s/%s_C_001.txt" % (prj_tree.jgene, prj_name)):
        c = True

    d = False
    if os.path.isfile("%s/%s_D_001.txt" % (prj_tree.jgene, prj_name)):
        d = True

    # iterate over subset rearrangement files and combine,
    # including generating fasta output as appropriate
    for f_ind in range(1, maxFiles + 1):

        # merge partial blast hit tables
        with open("%s/%s_jgerm_tophit.txt" % (prj_tree.tables, prj_name), "a") as table:
            with open("%s/jtophit_%03d.txt" % (prj_tree.jgene, f_ind), "r") as partial:
                table.write(partial.read())
        if d:
            with open("%s/%s_dgerm_tophit.txt" % (prj_tree.tables, prj_name), "a") as table:
                with open("%s/dtophit_%03d.txt" % (prj_tree.jgene, f_ind), "r") as partial:
                    table.write(partial.read())
        if c:
            with open("%s/%s_cgerm_tophit.txt" % (prj_tree.tables, prj_name), "a") as table:
                with open("%s/ctophit_%03d.txt" % (prj_tree.jgene, f_ind), "r") as partial:
                    table.write(partial.read())

        # go through partial rearrangements files
        for r in airr.read_rearrangement("%s/rearrangements_%03d.tsv" % (prj_tree.internal, f_ind)):

            seq_stats.write(r)

            # count j/d/c gene usages; each call is checked against its own
            # field and tallied in its own counter
            if not r['j_call'] == "":
                dict_jcounts[r['j_call'].split(",")[0]] += 1
            if not r['d_call'] == "":
                dict_dcounts[r['d_call'].split(",")[0]] += 1
            if not r['c_call'] == "":
                dict_ccounts[r['c_call'].split(",")[0]] += 1

            # count statuses
            counts[r['status']] += 1
            total += 1
            raw_count = int(r['sequence_id'])  # technically, this undercounts if the last one
                                               # isn't in the `correct_length` interval, but I
                                               # don't have a better solution that isn't super
                                               # kludgy right now

            # ok, now do sequence output
            # start by collecting metadata for fasta def line
            def_line = ">%s" % r['sequence_id']
            if not r['v_call'] == '':          def_line += " v_call=%s" % r['v_call']
            if not r['d_call'] == '':          def_line += " d_call=%s" % r['d_call']
            if not r['j_call'] == '':          def_line += " j_call=%s" % r['j_call']
            if not r['locus'] == '':           def_line += " locus=%s" % r['locus']
            if not r['c_call'] == '':          def_line += " c_call=%s" % r['c_call']
            if not r['status'] == '':          def_line += " status=%s" % r['status']
            # if not r['v_identity'] == '':    def_line += " v_identity=%s" % r['v_identity']
            if not r['junction_length'] == '': def_line += " junction_length=%s" % r['junction_length']
            if not r['junction'] == '':        def_line += " junction=%s" % r['junction']
            if not r['junction_aa'] == '':     def_line += " junction_aa=%s" % r['junction_aa']
            if not r['duplicate_count'] == '': def_line += " duplicate_count=%s" % r['duplicate_count']
            if not r['consensus_count'] == '': def_line += " consensus_count=%s" % r['consensus_count']
            if not r['cell_id'] == '':         def_line += " cell_id=%s" % r['cell_id']

            # work our way up the hierarchy, putting sequences in the appropriate files
            ungapped = re.sub("-", "", r['sequence_alignment'])  # reintroduces any frameshift errors in translation
                                                                 # this has always been the behavior, but I wonder
                                                                 # if I should change/update now that I am using
                                                                 # proper alignments.
            if not r['status'] in ['noV', 'missingNterm']:
                allV_nt.write("%s\n%s\n" % (def_line, ungapped))
                allV_aa.write("%s\n%s\n" % (def_line, Seq.Seq(ungapped).translate()))

                if not r['status'] == 'noJ':
                    allJ_nt.write("%s\n%s\n" % (def_line, ungapped))
                    allJ_aa.write("%s\n%s\n" % (def_line, Seq.Seq(ungapped).translate()))

                    if not r['status'] == 'noCDR3':
                        all_cdr3_nt.write("%s\n%s\n" % (def_line, r['junction']))
                        all_cdr3_aa.write("%s\n%s\n" % (def_line, r['junction_aa']))

                    if r['status'] == "good":
                        vj_nt.write("%s\n%s\n" % (def_line, ungapped))
                        vj_aa.write("%s\n%s\n" % (def_line, Seq.Seq(ungapped).translate()))
                        good_cdr3_nt.write("%s\n%s\n" % (def_line, r['junction']))
                        good_cdr3_aa.write("%s\n%s\n" % (def_line, r['junction_aa']))

    # close outputs
    allV_aa.close()
    allV_nt.close()
    allJ_aa.close()
    allJ_nt.close()
    vj_aa.close()
    vj_nt.close()
    good_cdr3_aa.close()
    good_cdr3_nt.close()
    all_cdr3_aa.close()
    all_cdr3_nt.close()
    seq_stats.close()  # flush the final rearrangements tsv

    # useful number
    found = total - counts['noV'] - counts['noJ']

    # print out some statistics
    handle = open("%s/%s_jgerm_stat.txt" % (prj_tree.tables, prj_name), 'w')
    writer = csv.writer(handle, delimiter=sep)
    keys = sorted(dict_jcounts.keys())
    writer.writerow(["gene", "count", "percent"])
    for key in keys:
        aline = [key, dict_jcounts[key], "%4.2f" % (dict_jcounts[key] / float(found) * 100)]
        writer.writerow(aline)
    handle.close()

    if len(dict_ccounts) > 0:
        handle = open("%s/%s_cgerm_stat.txt" % (prj_tree.tables, prj_name), 'w')
        writer = csv.writer(handle, delimiter=sep)
        keys = sorted(dict_ccounts.keys())
        writer.writerow(["gene", "count", "percent"])
        for key in keys:
            aline = [key, dict_ccounts[key], "%4.2f" % (dict_ccounts[key] / float(found) * 100)]
            writer.writerow(aline)
        handle.close()

    if len(dict_dcounts) > 0:
        handle = open("%s/%s_dgerm_stat.txt" % (prj_tree.tables, prj_name), 'w')
        writer = csv.writer(handle, delimiter=sep)
        keys = sorted(dict_dcounts.keys())
        writer.writerow(["gene", "count", "percent"])
        for key in keys:
            aline = [key, dict_dcounts[key], "%4.2f" % (dict_dcounts[key] / float(found) * 100)]
            writer.writerow(aline)
        handle.close()

    message = "\nTotal raw reads: %d\nCorrect length: %d\nV assigned: %d\nJ assigned: %d\nCDR3 assigned: %d\nIn-frame junction: %d\nNo indels: %d\nContinuous ORF with no stop codons: %d\n\n" % \
              (raw_count, total, total - counts['noV'], found,
               found - counts['noCDR3'],
               found - counts['noCDR3'] - counts['nonproductive'],
               found - counts['noCDR3'] - counts['nonproductive'] - counts['indel'],
               counts['good'])
    print(message)

    handle = open("%s/finalize_blast.log" % prj_tree.logs, "w")
    handle.write(message)
    handle.close()

    # call 1.4 if requested
    if arguments['--runClustering']:
        cmd = "%s/annotate/1.4-cluster_sequences.py" % SCRIPT_FOLDER
        for opt in ['--file', '--min1', '--min2', '--id', '--maxgaps', '--rearrangements', '--save']:
            if arguments[opt] is not None:
                cmd += " %s '%s'" % (opt, arguments[opt])
        if arguments['--runCellStatistics']:
            cmd += " --runCellStatistics"
        print("Calling 1.4 with command line: %s" % cmd)
        os.system(cmd)

    # clean up!!
    oldFiles = glob.glob("%s/*txt" % prj_tree.vgene) + glob.glob("%s/*fasta" % prj_tree.vgene) + \
               glob.glob("%s/*txt" % prj_tree.jgene) + glob.glob("%s/*fasta" % prj_tree.jgene) + \
               glob.glob("%s/*tsv" % prj_tree.jgene) + glob.glob("%s/lookup*" % prj_tree.internal)
    if len(oldFiles) > 0 and not arguments['--noclean']:
        for f in oldFiles:
            os.remove(f)
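# ----------------------------------------------------------------------
# The fasta def lines written above pack annotations as space-separated
# key=value pairs after the sequence_id. A minimal sketch of recovering
# them, assuming no annotation value itself contains a space (true for
# the fields written here); parse_def_line is a hypothetical helper, not
# part of the pipeline:
def parse_def_line(def_line):
    """Split '>id k1=v1 k2=v2 ...' into (id, {k: v})."""
    tokens = def_line.lstrip(">").split()
    seq_id, annotations = tokens[0], {}
    for token in tokens[1:]:
        key, _, value = token.partition("=")
        annotations[key] = value
    return seq_id, annotations

# e.g. parse_def_line(">17 v_call=IGHV1-2*02 status=good")
#      -> ('17', {'v_call': 'IGHV1-2*02', 'status': 'good'})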
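# ----------------------------------------------------------------------
# After the merge completes, the combined rearrangements table can be
# checked against the AIRR schema and the J gene-usage tally recomputed
# independently of the pipeline. A sketch, assuming the airr library
# used above and a path matching the create_rearrangement() call;
# check_output is a hypothetical helper:
import airr
from collections import Counter

def check_output(rearrangements_tsv):
    # validate_rearrangement returns False if the file violates the schema
    valid = airr.validate_rearrangement(rearrangements_tsv)
    j_usage = Counter()
    for row in airr.read_rearrangement(rearrangements_tsv):
        if row['j_call'] != "":
            j_usage[row['j_call'].split(",")[0]] += 1  # first allele of ambiguous calls
    return valid, j_usage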