def main():

    cells_raw = defaultdict(dict)
    most_used = defaultdict(list)

    output = open("%s/%s_cell_stats.tsv" % (prj_tree.tables, prj_name), 'w')
    output.write("cell\tstatus\tisotype\tproductive_IGH\ttotal_IGH\tIGH_junctions\tproductive_IGK\ttotal_IGK\tIGK_junctions\tproductive_IGL\ttotal_IGL\tIGL_junctions\n")

    data = airr.read_rearrangement(arguments['--rearrangements'])
    cells_only = airr.derive_rearrangement(re.sub(".tsv", "_single-cell.tsv", arguments['--rearrangements']),
                                           arguments['--rearrangements'])

    #assume cells might not be grouped together, so make a first pass
    #   to collect everything
    for r in data:
        if r['status'] in ['good', 'indel', 'stop', 'nonproductive', 'unique']:
            #skip irrelevant sequences
            if r['locus'] not in cells_raw[r['cell_id']]:
                cells_raw[r['cell_id']][r['locus']] = [r]
            else:
                cells_raw[r['cell_id']][r['locus']].append(r)

            #need better heuristic for this, omit for now
            #if r['cell_id'] not in most_used[r['centroid']]:
            #    most_used[r['centroid']].append(r['cell_id'])

    #now go back and process each cell
    status_list = ['canonical_pair', 'possible_inclusion', 'heavy_only', 'light_only',
                   'multi_light', 'multi_heavy', 'probable_multiplet', 'none_productive']
    status_count = dict(zip(status_list, [0, 0, 0, 0, 0, 0, 0, 0]))
    status_dict = dict()

    for c in cells_raw:
        cell_processed = defaultdict(list)
        cell_productive = defaultdict(list)

        for locus in cells_raw[c]:
            #Start with the one with the most UMIs
            for rep in sorted(cells_raw[c][locus], key=lambda k: k['duplicate_count'], reverse=True):
                #check if this is a duplicate of a previously kept read
                keep = True
                for previous in cell_processed[locus]:
                    #shortcut: assume identical junctions means duplicates
                    if previous['junction_aa'] == rep['junction_aa']:
                        keep = False
                        break
                    else:
                        #heuristic (for 10x data as of March 2019): omit gaps and cut off possible noise at 5' end
                        cov, score = scoreAlign(quickAlign(previous['sequence_alignment'], rep['sequence_alignment']),
                                                countInternalGaps=False, skip=50)
                        if score >= 0.95:
                            keep = False
                            break

                if keep:
                    cell_processed[locus].append(rep)
                    if rep['status'] == "good":
                        cell_productive[locus].append(rep)

        status = ""
        h_type = ""
        if len(cell_productive['IGH']) == 0:
            if len(cell_productive['IGK']) + len(cell_productive['IGL']) == 0:
                status = "none_productive"
            elif len(cell_productive['IGK']) + len(cell_productive['IGL']) == 1:
                status = "light_only"
            elif len(cell_productive['IGK']) + len(cell_productive['IGL']) > 1:
                status = "multi_light"
        elif len(cell_productive['IGH']) == 1:
            h_type = re.sub(r"\*.+", "", cell_productive['IGH'][0]['c_call'])
            if len(cell_productive['IGK']) + len(cell_productive['IGL']) == 0:
                status = "heavy_only"
            elif len(cell_productive['IGK']) + len(cell_productive['IGL']) == 1:
                status = "canonical_pair"
            elif len(cell_productive['IGK']) + len(cell_productive['IGL']) == 2:
                status = "possible_inclusion"
            elif len(cell_productive['IGK']) + len(cell_productive['IGL']) > 2:
                status = "probable_multiplet"
        elif len(cell_productive['IGH']) > 1:
            status = "multi_heavy"

        status_count[status] += 1
        status_dict[c] = status

        #print to filtered rearrangements file
        if status in arguments['--save']:
            for loc in cell_processed:
                for chain in cell_processed[loc]:
                    cells_only.write(chain)

        #now log the cell
        print("\t".join([c, status, h_type,
                         str(len(cell_productive['IGH'])), str(len(cell_processed['IGH'])),
                         ";".join([chain['junction_aa'] for chain in cell_processed['IGH']]),
                         str(len(cell_productive['IGK'])), str(len(cell_processed['IGK'])),
                         ";".join([chain['junction_aa'] for chain in cell_processed['IGK']]),
                         str(len(cell_productive['IGL'])), str(len(cell_processed['IGL'])),
                         ";".join([chain['junction_aa'] for chain in cell_processed['IGL']])]),
              file=output)

    output.close()

    with open("%s/cell_processing.log" % prj_tree.logs, "w") as log:
        print("\t".join(status_list), file=log)
        print("\t".join([str(status_count[s]) for s in status_list]), file=log)

    print("\t".join(status_list))
    print("\t".join([str(status_count[s]) for s in status_list]))
def main():

    global germs
    germs = dict()
    for entry in SeqIO.parse(open(arguments['-g'], "r"), "fasta"):
        germs[entry.id] = entry

    global mature
    mature = dict()
    if arguments['-a'] is not None:
        for entry in SeqIO.parse(open(arguments['-a'], "r"), "fasta"):
            mature[entry.id] = entry

    inputFile = arguments['-f']
    dedup = dict()
    if arguments['-d']:
        subprocess.call([vsearch, "-derep_fulllength", arguments['-f'],
                         "-output", "temp_dedup.fa",
                         "-uc", "temp.uc",
                         "-notrunclabels"])
        inputFile = "temp_dedup.fa"

        #process the uc file
        with open("temp.uc", "r") as handle:
            uc = csv.reader(handle, delimiter="\t")
            for row in uc:
                if row[0] == "S":
                    dedup[row[8].split(" ")[0]] = row[8].split(" ")[0]
                elif row[0] == "H":
                    dedup[row[8].split(" ")[0]] = row[9].split(" ")[0]

    results = dict()

    #If we are multithreading, split input into chunks
    if arguments['-t'] > 1:
        index = 0
        counter = 0
        chunk = []
        reader = SeqIO.parse(open(inputFile, "r"), "fasta")
        for entry in reader:
            chunk.append(entry)
            counter += 1
            if counter == 1000:
                with open("%s/align/align%06d.fa" % (prj_tree.lineage, index), "w") as handle:
                    SeqIO.write(chunk, handle, "fasta")
                index += 1
                counter = 0
                chunk = []
        if counter > 0:
            with open("%s/align/align%06d.fa" % (prj_tree.lineage, index), "w") as handle:
                SeqIO.write(chunk, handle, "fasta")
            index += 1 #so we can use range properly

        #now create a pool and start the actual work
        filterPool = Pool(arguments['-t'])
        dataBlob = filterPool.map(runAlign, ["%s/align/align%06d.fa" % (prj_tree.lineage, i) for i in range(index)])
        filterPool.close()
        filterPool.join()

        #Recover results
        for blob in dataBlob:
            results.update(blob)

    else:
        #unthreaded, just do the whole thing
        results = runAlign(inputFile)

    #get some outputs set up
    outFile = os.path.basename(os.path.splitext(arguments['-f'])[0])
    if os.path.isdir(prj_tree.tables):
        outFile = "output/tables/" + outFile
    if arguments['-o'] is not None:
        outFile = arguments['-o']

    nats = sorted(mature.keys())

    covFile = open("%s_coverage.tab" % outFile, "w")
    coverage = csv.writer(covFile, delimiter="\t")
    coverage.writerow(['sequence_id', 'germ_cov'] + nats)

    idFile = open("%s_id-div.tab" % outFile, "w")
    iddiv = csv.writer(idFile, delimiter="\t")
    iddiv.writerow(['sequence_id', 'v_gene', 'germ_div'] + nats)

    #sort the freaking list and output
    if arguments['-d']:
        for s in sorted(dedup.keys()):
            (germc, germi) = results[dedup[s]]['germline']
            if not germc == "NA":
                germc = "%.1f" % germc
                germi = "%.1f" % (100 - germi)
            coverage.writerow([s, germc] + ["NA" if results[dedup[s]][n][0] == "NA" else "%.1f" % results[dedup[s]][n][0] for n in nats])
            iddiv.writerow([s, results[dedup[s]]['vlookup'], germi] + ["NA" if results[dedup[s]][n][1] == "NA" else "%.1f" % results[dedup[s]][n][1] for n in nats])

        #take this opportunity to do some cleanup
        os.remove("temp_dedup.fa")
        os.remove("temp.uc")
    else:
        for s in sorted(results.keys()):
            (germc, germi) = results[s]['germline']
            if not germc == "NA":
                germc = "%.1f" % germc
                germi = "%.1f" % (100 - germi)
            coverage.writerow([s, germc] + ["NA" if results[s][n][0] == "NA" else "%.1f" % results[s][n][0] for n in nats])
            iddiv.writerow([s, results[s]['vlookup'], germi] + ["NA" if results[s][n][1] == "NA" else "%.1f" % results[s][n][1] for n in nats])

    covFile.close()
    idFile.close()

    #do AIRR output
    if os.path.dirname(arguments['-f']) == "output/sequences/nucleotide" and not 'CDR3' in arguments['-f']:
        if os.path.isfile("%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name)):
            withDiv = airr.derive_rearrangement("updateRearrangements.tsv",
                                                "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name),
                                                fields=['v_identity'])
            for r in airr.read_rearrangement("%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name)):
                if dedup.get(r['sequence_id'], r['sequence_id']) in results:
                    #omit NAs here to comply with AIRR format
                    if not results[dedup.get(r['sequence_id'], r['sequence_id'])]['germline'][1] == "NA":
                        r['v_identity'] = "%0.3f" % (results[dedup.get(r['sequence_id'], r['sequence_id'])]['germline'][1] / 100)
                withDiv.write(r)
            withDiv.close()
            os.rename("updateRearrangements.tsv", "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name))
def main():

    #look for cell hashing
    hashDict = dict()
    sampleList = []
    if os.path.exists(f"{prj_tree.tables}/{prj_name}_hashes.tsv"):
        with open(f"{prj_tree.tables}/{prj_name}_hashes.tsv", 'r') as handle:
            reader = csv.reader(handle, delimiter="\t")
            for row in reader:
                hashDict[row[0]] = [row[1]]
                if row[1] == "unknown" or row[1] == "ambiguous":
                    continue
                elif not row[1] in sampleList:
                    sampleList.append(row[1])
        sampleList.sort()
    sampleList += ["unknown", "ambiguous"]

    #look for feature barcoding
    featureDict = dict()
    if os.path.exists(f"{prj_tree.tables}/{prj_name}_features.tsv"):
        with open(f"{prj_tree.tables}/{prj_name}_features.tsv", 'r') as handle:
            reader = csv.reader(handle, delimiter="\t")
            header = next(reader)
            featureDict["keys"] = header[1:]
            for row in reader:
                featureDict[row[0]] = row[1:]

    cells_raw = defaultdict(dict)
    most_used = defaultdict(list)

    output = open("%s/%s_cell_stats.tsv" % (prj_tree.tables, prj_name), 'w')
    outwriter = csv.writer(output, delimiter="\t")
    outheader = ["cell", "status", "isotype"]
    if len(hashDict) > 0:
        outheader += ["hash_sample"]
    if len(featureDict) > 0:
        outheader += featureDict["keys"]
    outheader += ["productive_IGH", "total_IGH", "IGH_junctions",
                  "productive_IGK", "total_IGK", "IGK_junctions",
                  "productive_IGL", "total_IGL", "IGL_junctions"]
    outwriter.writerow(outheader)

    data = airr.read_rearrangement(arguments['--rearrangements'])
    fields = ["cell_status"]
    if len(hashDict) > 0:
        fields += ["hash_sample"]
    cells_only = airr.derive_rearrangement(re.sub(".tsv", "_single-cell.tsv", arguments['--rearrangements']),
                                           arguments['--rearrangements'], fields=fields)

    #assume cells might not be grouped together, so make a first pass
    #   to collect everything
    for r in data:
        if r['status'] in ['good', 'indel', 'stop', 'nonproductive', 'unique']:
            #skip irrelevant sequences
            if r['locus'] not in cells_raw[r['cell_id']]:
                cells_raw[r['cell_id']][r['locus']] = [r]
            else:
                cells_raw[r['cell_id']][r['locus']].append(r)

            #need better heuristic for this, omit for now
            #if r['cell_id'] not in most_used[r['centroid']]:
            #    most_used[r['centroid']].append(r['cell_id'])

    #now go back and process each cell
    status_list = ['canonical_pair', 'possible_inclusion', 'heavy_only', 'light_only',
                   'multi_light', 'multi_heavy', 'probable_multiplet', 'none_productive']
    status_count = dict()
    for sample in sampleList:
        status_count[sample] = dict(zip(status_list, [0, 0, 0, 0, 0, 0, 0, 0]))
    status_dict = dict()

    for c in cells_raw:
        cell_processed = defaultdict(list)
        cell_productive = defaultdict(list)

        for locus in cells_raw[c]:
            #Start with the one with the most UMIs
            for rep in sorted(cells_raw[c][locus], key=lambda k: k['duplicate_count'] or 0, reverse=True):
                #check if this is a duplicate of a previously kept read
                keep = True
                for previous in cell_processed[locus]:
                    #shortcut: assume identical junctions means duplicates
                    if previous['junction_aa'] == rep['junction_aa']:
                        keep = False
                        if previous['duplicate_count'] is not None:
                            previous['duplicate_count'] += rep['duplicate_count']
                        if previous['consensus_count'] is not None:
                            previous['consensus_count'] += rep['consensus_count']
                        break
                    else:
                        #heuristic (for 10x data as of March 2019): omit gaps and cut off possible noise at 5' end
                        score, cov = scoreAlign(quickAlign(previous['sequence_alignment'], rep['sequence_alignment']),
                                                countInternalGaps=False, skip=50)
                        if score >= 0.95:
                            keep = False
                            if previous['duplicate_count'] is not None:
                                previous['duplicate_count'] += rep['duplicate_count']
                            if previous['consensus_count'] is not None:
                                previous['consensus_count'] += rep['consensus_count']
                            break

                if keep:
                    cell_processed[locus].append(rep)
                    if rep['status'] == "good":
                        cell_productive[locus].append(rep)

        status = ""
        h_type = ""
        if len(cell_processed['IGH']) > 2 or len(cell_processed['IGK']) > 2 or len(cell_processed['IGL']) > 2:
            status = "probable_multiplet"
        elif len(cell_productive['IGH']) == 0:
            if len(cell_productive['IGK']) + len(cell_productive['IGL']) == 0:
                status = "none_productive"
            elif len(cell_productive['IGK']) + len(cell_productive['IGL']) == 1:
                status = "light_only"
            elif len(cell_productive['IGK']) + len(cell_productive['IGL']) > 1:
                status = "multi_light"
        elif len(cell_productive['IGH']) == 1:
            h_type = re.sub(r"\*.+", "", cell_productive['IGH'][0]['c_call'])
            if len(cell_productive['IGK']) + len(cell_productive['IGL']) == 0:
                status = "heavy_only"
            elif len(cell_productive['IGK']) + len(cell_productive['IGL']) == 1:
                status = "canonical_pair"
            elif len(cell_productive['IGK']) + len(cell_productive['IGL']) == 2:
                status = "possible_inclusion"
            elif len(cell_productive['IGK']) + len(cell_productive['IGL']) > 2:
                status = "probable_multiplet"
        elif len(cell_productive['IGH']) > 1:
            status = "multi_heavy"

        status_count[hashDict.get(c, ["unknown"])[0]][status] += 1
        status_dict[c] = status

        #print to filtered rearrangements file
        #leave out cells with ambiguous hashing assignments if we are doing any filtering
        if status in arguments['--save']:
            if hashDict.get(c, ["unknown"])[0] != "ambiguous" or "probable_multiplet" in arguments['--save']:
                for loc in cell_processed:
                    for chain in cell_processed[loc]:
                        chain['cell_status'] = status
                        if len(hashDict) > 0:
                            chain['hash_sample'] = hashDict.get(chain['cell_id'], ['unknown'])[0]
                        cells_only.write(chain)

        #now log the cell
        outwriter.writerow([c, status, h_type] +
                           hashDict.get(c, ['unknown'] * (len(hashDict) > 0)) +
                           featureDict.get(c, ['0'] * len(featureDict.get("keys", []))) +
                           [len(cell_productive['IGH']), len(cell_processed['IGH']),
                            ";".join([chain['junction_aa'] for chain in cell_processed['IGH']]),
                            len(cell_productive['IGK']), len(cell_processed['IGK']),
                            ";".join([chain['junction_aa'] for chain in cell_processed['IGK']]),
                            len(cell_productive['IGL']), len(cell_processed['IGL']),
                            ";".join([chain['junction_aa'] for chain in cell_processed['IGL']])])

    output.close()

    with open("%s/cell_processing.log" % prj_tree.logs, "w") as log:
        print("sample\t" + "\t".join(status_list), file=log)
        print("sample\t" + "\t".join(status_list))
        for sample in sampleList:
            if sum([status_count[sample][s] for s in status_list]) == 0:
                continue #leave out `ambiguous` if it's not a hashed sample
            print("\t".join([sample] + [str(status_count[sample][s]) for s in status_list]), file=log)
            print("\t".join([sample] + [str(status_count[sample][s]) for s in status_list]))
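# NOTE (editor's sketch, not part of the pipeline): the per-cell row assembled in the
# function above pads the optional hash and feature columns with list-by-boolean and
# list-by-length multiplication, so cells missing from hashDict or featureDict still
# line up with the header. A minimal, self-contained illustration with made-up values:
if __name__ == "__main__" and False:  # illustration only, never executed by the pipeline
    hashDict = {"cellA": ["sample1"]}        # hypothetical hashing results
    featureDict = {"keys": ["CD4", "CD8"]}   # hypothetical feature-barcode panel

    for cell in ["cellA", "cellB"]:
        row = ([cell] +
               hashDict.get(cell, ["unknown"] * (len(hashDict) > 0)) +          # [] when there is no hashing at all
               featureDict.get(cell, ["0"] * len(featureDict.get("keys", []))))  # zero-fill missing feature counts
        print(row)
    # ['cellA', 'sample1', '0', '0']
    # ['cellB', 'unknown', '0', '0']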
def main():

    #start by making possible "duplicate_count" info available to vsearch
    with open("temp.fa", "w") as handle:
        SeqIO.write(reformatInput(arguments['--file']), handle, "fasta")

    #first step on higher identity
    subprocess.call([vsearch, "-derep_fulllength", "temp.fa",
                     "-output", "temp_dedup.fa",
                     "-uc", "temp.uc",
                     "-sizein", "-sizeout",
                     "-minuniquesize", arguments['--min1']])

    #process the uc file
    centroid = dict()
    with open("temp.uc", "r") as handle:
        uc = csv.reader(handle, delimiter="\t")
        for row in uc:
            if row[0] == "H":
                centroid[re.sub(r";size=\d+", "", row[8])] = re.sub(r";size=\d+", "", row[9])

    #second clustering step
    subprocess.call([vsearch, "-cluster_size", "temp_dedup.fa",
                     "-sizein", "-sizeout",
                     "-maxgaps", arguments['--maxgaps'],
                     "-id", arguments['--id'],
                     "-uc", "%s.cluster" % os.path.splitext(arguments['--file'])[0]])

    #process the uc file
    size = dict()
    with open("%s.cluster" % os.path.splitext(arguments['--file'])[0], "r") as handle:
        uc = csv.reader(handle, delimiter="\t")
        for row in uc:
            if row[0] == "H":
                centroid[re.sub(r";size=\d+", "", row[8])] = re.sub(r";size=\d+", "", row[9])
            elif row[0] == "C":
                #have the centroids point to themselves for more uniform downstream processing
                centroid[re.sub(r";size=\d+", "", row[8])] = re.sub(r";size=\d+", "", row[8])
                #but only save them if they meet the threshold
                if int(row[2]) >= arguments['--min2']:
                    size[re.sub(r";size=\d+", "", row[8])] = int(row[2])

    #clean up
    os.remove("temp.fa")
    os.remove("temp_dedup.fa")
    os.remove("temp.uc")

    #do sequence outputs
    with open("%s_unique.fa" % os.path.splitext(arguments['--file'])[0], "w") as handle:
        SeqIO.write(getUniques(arguments['--file'], size), handle, 'fasta')

    #retrieve unique CDR3s (and do AA seqs as appropriate)
    if "goodVJ" in arguments['--file']:
        cdr3_file = re.sub("goodVJ", "goodCDR3", arguments['--file'])
        if os.path.isfile(cdr3_file):
            with open("%s_unique.fa" % os.path.splitext(cdr3_file)[0], "w") as handle:
                SeqIO.write(getUniques(cdr3_file, size), handle, 'fasta')
        else:
            print("Can't find %s to extract unique sequences..." % cdr3_file, file=sys.stderr)

        if "nucleotide" in cdr3_file:
            cdr3_file = re.sub("nucleotide", "amino_acid", cdr3_file)
            if os.path.isfile(cdr3_file):
                with open("%s_unique.fa" % os.path.splitext(cdr3_file)[0], "w") as handle:
                    SeqIO.write(getUniques(cdr3_file, size), handle, 'fasta')
            else:
                print("Can't find %s to extract unique sequences..." % cdr3_file, file=sys.stderr)

    if "nucleotide" in arguments['--file']:
        aa_file = re.sub("nucleotide", "amino_acid", arguments['--file'])
        if os.path.isfile(aa_file):
            with open("%s_unique.fa" % os.path.splitext(aa_file)[0], "w") as handle:
                SeqIO.write(getUniques(aa_file, size), handle, 'fasta')
        else:
            print("Can't find %s to extract unique sequences..." % aa_file, file=sys.stderr)

    #now do AIRR output
    if "output/sequences/nucleotide" in arguments['--file']:
        if os.path.isfile("%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name)):
            clustered = airr.derive_rearrangement("updateRearrangements.tsv",
                                                  "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name),
                                                  fields=['centroid', 'cluster_count'])
            for r in airr.read_rearrangement("%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name)):
                #clear old annotations in case we ran 1.4 previously
                r['centroid'] = ""
                r['cluster_count'] = ""

                #now add back current annotations
                #two rounds of clustering means we start by looking for the centroid of the centroid,
                #   falling back to the first level centroid (second clustering step only) if appropriate
                r['centroid'] = centroid.get(centroid.get(r['sequence_id'], ""), centroid.get(r['sequence_id'], ""))

                #add cluster size information for final centroids. I am doing away with changing the 'status' of
                #   the centroids to 'unique' because I've started using this script in a lot of cases where it
                #   doesn't make sense to treat 'unique' as a subset of 'good', and I therefore need to preserve
                #   the original status designation. To find centroids, look for a non-null 'cluster_count' field.
                if r['sequence_id'] in size:
                    r['cluster_count'] = size[r['sequence_id']]

                clustered.write(r)

            clustered.close()
            os.rename("updateRearrangements.tsv", "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name))
        else:
            print("Can't find the rearrangements file, not saving data in AIRR format", file=sys.stderr)

    #call 1.5 if requested
    if arguments['--runCellStatistics']:
        cmd = "%s/annotate/1.5-single_cell_statistics.py" % SCRIPT_FOLDER
        if arguments['--rearrangements'] is not None:
            cmd += " --rearrangements %s" % arguments['--rearrangements']
        if arguments['--save'] is not None:
            cmd += " --save %s" % arguments['--save']

        print("Calling 1.5 with command line: %s" % cmd)
        os.system(cmd)
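# NOTE (editor's sketch, not part of the pipeline): a minimal illustration of the
# two-level centroid lookup used in the AIRR output step above, with made-up read IDs.
# After dereplication (round 1) and clustering (round 2), a read's final centroid is
# the centroid of its centroid when one exists, otherwise its first-level centroid.
if __name__ == "__main__" and False:  # illustration only, never executed by the pipeline
    centroid = {"read1": "read7",   # round 1: read1 dereplicated into read7
                "read7": "read9",   # round 2: read7 clustered into read9
                "read9": "read9"}   # round 2: centroids point to themselves

    for seq_id in ["read1", "read7", "read9", "read5"]:
        final = centroid.get(centroid.get(seq_id, ""), centroid.get(seq_id, ""))
        print(seq_id, "->", final or "(not clustered)")
    # read1 -> read9
    # read7 -> read9
    # read9 -> read9
    # read5 -> (not clustered)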
newd.close()

data = airr.read_rearrangement('my_data.tsv')
print(data.fields)
print(data.external_fields)
for r in data:
    print(r)

# create a derived rearrangements file with additional annotation
print('*****')
print('*****')
print('Derive rearrangements file from another.')
print('*****')
print('*****')

mored = airr.derive_rearrangement('more_data.tsv', 'my_data.tsv',
                                  fields=['new_field', 'more_annotation'])
print(mored.fields)
print(mored.external_fields)
for r in airr.read_rearrangement('my_data.tsv'):
    r['new_field'] = 'A'
    r['more_annotation'] = 'B'
    print(r)
    mored.write(r)
mored.close()

# validate rearrangements file
print('*****')
print('*****')
print('Validate rearrangements file.')
print('*****')
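# NOTE (editor's sketch): the example above breaks off just after announcing the
# validation step. Assuming the airr package's validate_rearrangement() helper
# (which returns True when a file passes AIRR schema validation), it would
# typically continue along these lines:
print('*****')
valid = airr.validate_rearrangement('my_data.tsv')
print('my_data.tsv passes AIRR validation:', valid)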
def main():

    #first, open the input file and parse into groups with same V/J
    vj_partition = dict()
    cdr3_info = dict()
    seqSize = Counter()

    #start off by getting size annotations
    for read in generate_read_fasta(arguments['--full']):
        seqSize[read.id] = 1
        check = re.search(r"cluster_count=(\d+)", read.description)
        if check:
            seqSize[read.id] = int(check.group(1))

    gene_pat = re.compile(r"(?:v_call|V_gene)=IG([HKL]V[^*]+).*(?:j_call|J_gene)=IG([HKL]J\d)")
    for sequence in SeqIO.parse(open(arguments['--cdr3'], "r"), "fasta"):
        genes = re.search(gene_pat, sequence.description)
        if genes:
            key = genes.group(1) + "_" + genes.group(2)
            key = re.sub("[()/]", "", key) #so /OR or (II) genes don't screw up the file system
            if key not in vj_partition:
                temp = "%s/%s.fa" % (prj_tree.lineage, key)
                vj_partition[key] = {'group': key, 'handle': open(temp, "w"), 'file': temp, 'count': 0, 'ids': []}

            vj_partition[key]['count'] += 1
            vj_partition[key]['ids'].append(sequence.id)
            cdr3_info[sequence.id] = {'cdr3_len': int(len(sequence.seq) / 3), 'cdr3_seq': sequence.seq.translate()}

            #make sizes available to vsearch
            sequence.id += ";size=%d" % seqSize[sequence.id] #do this even if there's no label
                                                             #   so I don't need to divide the cases for vsearch
            #and write
            SeqIO.write([sequence], vj_partition[key]['handle'], 'fasta')
        else:
            print("Couldn't find V and J genes for %s %s, skipping..." % (sequence.id, sequence.description))

    global natives
    natives = dict()
    if arguments['--natives'] is not None:
        natives = load_fastas(arguments['--natives'])
        for n, s in natives.items():
            if arguments['-v'] is not None:
                key = arguments['-v'] + "_" + arguments['-j']
            else:
                genes = re.search(gene_pat, s.description)
                if genes:
                    key = genes.group(1) + "_" + genes.group(2)
                else:
                    sys.exit("Can't find V and J gene annotations for native sequence %s. Please specify using the -v and -j parameters." % n)

            key = re.sub("[()/]", "", key) #wouldn't expect this to be relevant for natives, but just in case...

            if key not in vj_partition:
                print("No NGS sequences with the same V/J genes as native sequence %s (%s); skipping..." % (n, key))
                continue

            seqSize[n] = 0
            s.id += ";size=1"
            vj_partition[key]['count'] += 1
            vj_partition[key]['ids'].append(n)
            cdr3_info[n] = {'cdr3_len': int(len(s.seq) / 3), 'cdr3_seq': s.seq.translate()}
            SeqIO.write([s], vj_partition[key]['handle'], 'fasta')

    #close the file handles and delete the reference, so dict can be pickled for multithreading
    for cluster in vj_partition:
        vj_partition[cluster]['handle'].close()
        del vj_partition[cluster]['handle']

    #now go through and cluster each V/J grouping
    clusterLookup = dict()
    centroidData = dict()
    clusterSizes = Counter()

    if arguments['-t'] > 1:
        pool = Pool(arguments['-t'])
        blob = pool.map(processClusters, iterator_slice(vj_partition.values(), 25)) #number per slice needs optimization
        pool.close()
        pool.join()
        for d in blob:
            clusterLookup.update(d['cl'])
            centroidData.update(d['cd'])
            clusterSizes.update(d['cs'])
    else:
        #don't thread
        d = processClusters((0, vj_partition.values()))
        clusterLookup.update(d['cl'])
        centroidData.update(d['cd'])
        clusterSizes.update(d['cs'])

    #now process all clusters and do tabular output
    with open("%s/%s_lineages.txt" % (prj_tree.tables, prj_name), "w") as handle:
        writer = csv.writer(handle, delimiter=sep)
        writer.writerow(["clone_id", "sequence_id", "v_call", "j_call",
                         "junction_length_aa", "junction_aa", "clone_count", "included_mAbs"])
        for rank, (centroid, size) in enumerate(clusterSizes.most_common()):
            centroidData[centroid]['rank'] = rank + 1
            writer.writerow(["%05d" % (rank + 1), centroid,
                             centroidData[centroid]['vgene'], centroidData[centroid]['jgene'],
                             cdr3_info[centroid]['cdr3_len'], cdr3_info[centroid]['cdr3_seq'],
                             size, ",".join(centroidData[centroid]['nats'])])

    #do sequence output
    notationFile = re.sub(r"\.f.+", "_lineageNotations.fa", arguments['--full'])
    repFile = re.sub(r"\.f.+", "_lineageRepresentatives.fa", arguments['--full'])

    rep_seqs = []
    with open(notationFile, "w") as handle:
        for read in generate_read_fasta(arguments['--full']):
            if ";" in read.id:
                read.id = read.id[0:8] #this is for raw VSearch output with size annotations
                                       #   shouldn't be relevant in pipeline context
            if read.id not in clusterLookup:
                continue
            read.description += " clone_id=%05d clone_rep=%s clone_count=%d" % (centroidData[clusterLookup[read.id]]['rank'],
                                                                                clusterLookup[read.id],
                                                                                clusterSizes[clusterLookup[read.id]])
            SeqIO.write([read], handle, "fasta")
            if read.id in centroidData:
                rep_seqs.append(read)

    with open(repFile, "w") as handle:
        #use a sort to put them out in order of lineage rank (ie size)
        SeqIO.write(sorted(rep_seqs, key=lambda cent: centroidData[cent.id]['rank']), handle, "fasta")

    #do AIRR output
    if os.path.dirname(arguments['--full']) == prj_tree.nt:
        if os.path.isfile("%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name)):
            withLin = airr.derive_rearrangement("updateRearrangements.tsv",
                                                "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name),
                                                fields=["clone_id", "clone_count"])
            for r in airr.read_rearrangement("%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name)):
                if r['sequence_id'] in clusterLookup:
                    r['clone_id'] = "%05d" % centroidData[clusterLookup[r['sequence_id']]]['rank']
                    r['clone_count'] = clusterSizes[clusterLookup[r['sequence_id']]]
                else:
                    #prevent mix-and-match data if this gets run multiple times with multiple settings
                    r['clone_id'] = ""
                    r['clone_count'] = ""
                withLin.write(r)
            withLin.close()
            os.rename("updateRearrangements.tsv", "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name))
def main():

    #start by making possible "duplicate_count" info available to vsearch
    with open("temp.fa", "w") as handle:
        SeqIO.write(reformatInput(arguments['-f']), handle, "fasta")

    #first step on higher identity
    subprocess.call([vsearch, "-derep_fulllength", "temp.fa",
                     "-output", "temp_dedup.fa",
                     "-uc", "temp.uc",
                     "-sizein", "-sizeout",
                     "-minuniquesize", arguments['--min1']])

    #process the uc file
    centroid = dict()
    with open("temp.uc", "r") as handle:
        uc = csv.reader(handle, delimiter="\t")
        for row in uc:
            if row[0] == "H":
                centroid[row[8]] = row[9]

    #second clustering step
    subprocess.call([vsearch, "-cluster_size", "temp_dedup.fa",
                     "-sizein", "-sizeout",
                     "-maxgaps", "0",
                     "-id", arguments['--id'],
                     "-uc", "%s.cluster" % os.path.splitext(arguments['-f'])[0]])

    #process the uc file
    size = dict()
    with open("%s.cluster" % os.path.splitext(arguments['-f'])[0], "r") as handle:
        uc = csv.reader(handle, delimiter="\t")
        for row in uc:
            if row[0] == "H":
                centroid[re.sub(r";size=\d+", "", row[8])] = re.sub(r";size=\d+", "", row[9])
            elif row[0] == "C" and int(row[2]) >= arguments['--min2']:
                size[re.sub(r";size=\d+", "", row[8])] = int(row[2])

    #clean up
    os.remove("temp.fa")
    os.remove("temp_dedup.fa")
    os.remove("temp.uc")

    #do sequence outputs
    with open("%s_unique.fa" % os.path.splitext(arguments['-f'])[0], "w") as handle:
        SeqIO.write(getUniques(arguments['-f'], size), handle, 'fasta')

    #retrieve unique CDR3s (and do AA seqs as appropriate)
    if "goodVJ" in arguments['-f']:
        cdr3_file = re.sub("goodVJ", "goodCDR3", arguments['-f'])
        if os.path.isfile(cdr3_file):
            with open("%s_unique.fa" % os.path.splitext(cdr3_file)[0], "w") as handle:
                SeqIO.write(getUniques(cdr3_file, size), handle, 'fasta')
        else:
            print("Can't find %s to extract unique sequences..." % cdr3_file, file=sys.stderr)

        if "nucleotide" in cdr3_file:
            cdr3_file = re.sub("nucleotide", "amino_acid", cdr3_file)
            if os.path.isfile(cdr3_file):
                with open("%s_unique.fa" % os.path.splitext(cdr3_file)[0], "w") as handle:
                    SeqIO.write(getUniques(cdr3_file, size), handle, 'fasta')
            else:
                print("Can't find %s to extract unique sequences..." % cdr3_file, file=sys.stderr)

    if "nucleotide" in arguments['-f']:
        aa_file = re.sub("nucleotide", "amino_acid", arguments['-f'])
        if os.path.isfile(aa_file):
            with open("%s_unique.fa" % os.path.splitext(aa_file)[0], "w") as handle:
                SeqIO.write(getUniques(aa_file, size), handle, 'fasta')
        else:
            print("Can't find %s to extract unique sequences..." % aa_file, file=sys.stderr)

    #now do AIRR output
    if arguments['-f'] == "output/sequences/nucleotide/%s_goodVJ.fa" % prj_name:
        if os.path.isfile("%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name)):
            clustered = airr.derive_rearrangement("updateRearrangements.tsv",
                                                  "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name),
                                                  fields=['centroid', 'cluster_count'])
            for r in airr.read_rearrangement("%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name)):
                r['centroid'] = centroid.get(centroid.get(r['sequence_id'], ""), centroid.get(r['sequence_id'], ""))
                if r['sequence_id'] in size:
                    r['cluster_count'] = size[r['sequence_id']]
                    r['status'] = "unique"
                elif r['sequence_id'] in centroid:
                    #prevent mix-and-match data if this gets run multiple times with multiple settings
                    #I can get away with this because rearrangements.tsv only gets edited when clustering
                    #   the goodVJ file
                    r['cluster_count'] = ""
                    r['status'] = "good"
                clustered.write(r)
            clustered.close()
            os.rename("updateRearrangements.tsv", "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name))
        else:
            print("Can't find the rearrangements file, not saving data in AIRR format", file=sys.stderr)