def filterAirrTsv(rearrangementsFile, annotationList, exact=False):
    good = 0
    for r in airr.read_rearrangement(rearrangementsFile):
        keep = True
        for filter in annotationList:
            if len(filter['list']) > 1 or exact:
                #want exact matches (will break if trying to match exactly on a single value - use regex '^foo$')
                if str(r[filter['column']]) not in filter['list']:
                    keep = False
                    break
            elif not re.search(filter['list'][0], r[filter['column']]):
                keep = False
                break
        if keep:
            good += 1
            if good % 10000 == 0:
                sys.stderr.write("Found %d matching rearrangements so far...\n" % good)
            yield r
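# Usage sketch for filterAirrTsv (not from the original source). The filter spec
# is a list of dicts with 'column' and 'list' keys, as read by the function
# above; the file names here are hypothetical. Matching records are streamed
# into a new AIRR TSV via airr.derive_rearrangement, which copies the input
# file's schema.
import airr

filters = [
    {'column': 'locus', 'list': ['^IGH$']},                        # single value is treated as a regex
    {'column': 'v_call', 'list': ['IGHV1-2*02', 'IGHV1-2*04']},    # multiple values use exact membership
]
writer = airr.derive_rearrangement("filtered.tsv", "input.tsv")
for record in filterAirrTsv("input.tsv", filters):
    writer.write(record)
writer.close()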
def format_data(args):
    airr.validate_rearrangement(args.rearrangements, True)
    reader = airr.read_rearrangement(args.rearrangements)
    empty_list = []
    # keep only the junction, V gene, J gene, and repertoire ID columns
    keys = ["junction_aa", "v_call", "j_call", "junction", "repertoire_id"]
    for row in reader:
        empty_list.append({x: row[x] for x in keys})
    df = pd.DataFrame(empty_list)
    # replace cells without a junction with NaN
    df["junction_aa"].replace("", np.nan, inplace=True)
    # delete lines with NaN
    df.dropna(subset=["junction_aa"], inplace=True)
    # delete lines with an X in the junction_aa
    df = df[~df.junction_aa.str.contains("X")]
    # delete lines where junction_aa doesn't start with C
    df = df[df.junction_aa.str.startswith("C")]
    # delete lines where junction_aa doesn't end with F or W
    df = df[df.junction_aa.str.endswith(("F", "W"))]
    # delete lines where the chain in v_call and j_call doesn't match
    df = df[(df["v_call"].str[2] == df["j_call"].str[2])]
    # keep only the first V gene when there are multiple in the column
    df["v_call"] = df.v_call.str.split(",", n=1, expand=True)[0]
    # remove allele information from v_call and keep only the gene information
    df["v_call"] = df.apply(lambda x: x["v_call"][:-3], axis=1)
    df["chain"] = df.apply(lambda x: x["v_call"][2], axis=1)
    return df
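# Hedged usage sketch for format_data (not from the original source): the
# function only needs an object with a `rearrangements` attribute, so a bare
# Namespace works for illustration; the file name and the summary line are
# hypothetical.
from argparse import Namespace

args = Namespace(rearrangements="rearrangements.tsv")
df = format_data(args)
print(df.groupby("chain")["junction_aa"].nunique())   # distinct junctions per chain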
def main(): global germs germs = dict() for entry in SeqIO.parse(open(arguments['-g'], "r"), "fasta"): germs[entry.id] = entry global mature mature = dict() if arguments['-a'] is not None: for entry in SeqIO.parse(open(arguments['-a'], "r"), "fasta"): mature[entry.id] = entry inputFile = arguments['-f'] dedup = dict() if arguments['-d']: subprocess.call([ vsearch, "-derep_fulllength", arguments['-f'], "-output", "temp_dedup.fa", "-uc", "temp.uc", "-notrunclabels" ]) inputFile = "temp_dedup.fa" #process the uc file with open("temp.uc", "r") as handle: uc = csv.reader(handle, delimiter="\t") for row in uc: if row[0] == "S": dedup[row[8].split(" ")[0]] = row[8].split(" ")[0] elif row[0] == "H": dedup[row[8].split(" ")[0]] = row[9].split(" ")[0] results = dict() #If we are multithreading, split input into chunks if arguments['-t'] > 1: index = 0 counter = 0 chunk = [] reader = SeqIO.parse(open(inputFile, "r"), "fasta") for entry in reader: chunk.append(entry) counter += 1 if counter == 1000: with open("%s/align/align%06d.fa" % (prj_tree.lineage, index), "w") as handle: SeqIO.write(chunk, handle, "fasta") index += 1 counter = 0 chunk = [] if counter > 0: with open("%s/align/align%06d.fa" % (prj_tree.lineage, index), "w") as handle: SeqIO.write(chunk, handle, "fasta") index += 1 #so we can use range properly #now create a pool and start the actual work filterPool = Pool(arguments['-t']) dataBlob = filterPool.map(runAlign, [ "%s/align/align%06d.fa" % (prj_tree.lineage, i) for i in range(index) ]) filterPool.close() filterPool.join() #Recover results for blob in dataBlob: results.update(blob) else: #unthreaded, just do the whole thing results = runAlign(inputFile) #get some outputs set up outFile = os.path.basename(os.path.splitext(arguments['-f'])[0]) if os.path.isdir(prj_tree.tables): outFile = "output/tables/" + outFile if arguments['-o'] is not None: outFile = arguments['-o'] nats = sorted(mature.keys()) covFile = open("%s_coverage.tab" % outFile, "w") coverage = csv.writer(covFile, delimiter="\t") coverage.writerow(['sequence_id', 'germ_cov'] + nats) idFile = open("%s_id-div.tab" % outFile, "w") iddiv = csv.writer(idFile, delimiter="\t") iddiv.writerow(['sequence_id', 'v_gene', 'germ_div'] + nats) #sort the freaking list and output if arguments['-d']: for s in sorted(dedup.keys()): (germc, germi) = results[dedup[s]]['germline'] if not germc == "NA": germc = "%.1f" % germc germi = "%.1f" % (100 - germi) coverage.writerow([s, germc] + [ "NA" if results[dedup[s]][n][0] == "NA" else "%.1f" % results[dedup[s]][n][0] for n in nats ]) iddiv.writerow([s, results[dedup[s]]['vlookup'], germi] + [ "NA" if results[dedup[s]][n][1] == "NA" else "%.1f" % results[dedup[s]][n][1] for n in nats ]) #take this opportunity to do some cleanup os.remove("temp_dedup.fa") os.remove("temp.uc") else: for s in sorted(results.keys()): (germc, germi) = results[s]['germline'] if not germc == "NA": germc = "%.1f" % germc germi = "%.1f" % (100 - germi) coverage.writerow([s, germc] + [ "NA" if results[s][n][0] == "NA" else "%.1f" % results[s][n][0] for n in nats ]) iddiv.writerow([s, results[s]['vlookup'], germi] + [ "NA" if results[s][n][1] == "NA" else "%.1f" % results[s][n][1] for n in nats ]) covFile.close() idFile.close() #do AIRR output if os.path.dirname( arguments['-f'] ) == "output/sequences/nucleotide" and not 'CDR3' in arguments['-f']: if os.path.isfile("%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name)): withDiv = airr.derive_rearrangement("updateRearrangements.tsv", "%s/%s_rearrangements.tsv" % 
(prj_tree.tables, prj_name), fields=['v_identity']) for r in airr.read_rearrangement("%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name)): if dedup.get(r['sequence_id'], r['sequence_id']) in results: # omit NAs here to comply with AIRR format if not results[dedup.get( r['sequence_id'], r['sequence_id'])]['germline'][1] == "NA": r['v_identity'] = "%0.3f" % (results[dedup.get( r['sequence_id'], r['sequence_id'])]['germline'][1] / 100) withDiv.write(r) withDiv.close() os.rename("updateRearrangements.tsv", "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name))
def main(): cells_raw = defaultdict( dict ) most_used = defaultdict( list ) output = open("%s/%s_cell_stats.tsv"%(prj_tree.tables,prj_name), 'w') output.write("cell\tstatus\tisotype\tproductive_IGH\ttotal_IGH\tIGH_junctions\tproductive_IGK\ttotal_IGK\tIGK_junctions\tproductive_IGL\ttotal_IGL\tIGL_junctions\n") data = airr.read_rearrangement(arguments['--rearrangements']) cells_only = airr.derive_rearrangement(re.sub(".tsv", "_single-cell.tsv", arguments['--rearrangements']), arguments['--rearrangements']) #assume cells might not be grouped together, so make a first pass # to collect everything for r in data: if r['status'] in ['good', 'indel', 'stop', 'nonproductive', 'unique']: #skip irrelevant sequences if r['locus'] not in cells_raw[r['cell_id']]: cells_raw[r['cell_id']][r['locus']] = [ r ] else: cells_raw[r['cell_id']][r['locus']].append( r ) #need better heuristic for this, omit for now #if r['cell_id'] not in most_used[r['centroid']]: # most_used[r['centroid']].append(r['cell_id']) #now go back and process each cell status_list = [ 'canonical_pair', 'possible_inclusion', 'heavy_only', 'light_only', 'multi_light', 'multi_heavy', 'probable_multiplet', 'none_productive' ] status_count = dict( zip( status_list, [0,0,0,0,0,0,0,0] ) ) status_dict = dict( ) for c in cells_raw: cell_processed = defaultdict( list ) cell_productive = defaultdict( list ) for locus in cells_raw[c]: #Start with the one with the most UMIs for rep in sorted( [ r for r in cells_raw[c][locus] ], key=lambda k: k['duplicate_count'], reverse=True ): #check if this is a duplicate of a previously kept read keep = True for previous in cell_processed[locus]: #shortcut: assume identical junctions means duplicates if previous['junction_aa'] == rep['junction_aa']: keep = False break #heuristic (for 10x data as of March 2019): omit gaps and cut off possible noise at 5' end else: cov, score = scoreAlign( quickAlign(previous['sequence_alignment'],rep['sequence_alignment']), countInternalGaps=False, skip=50 ) if score >= 0.95: keep = False break if keep: cell_processed[locus].append( rep ) if rep['status'] == "good": cell_productive[locus].append( rep ) status = "" h_type = "" if len(cell_productive['IGH']) == 0: if len(cell_productive['IGK']) + len(cell_productive['IGL']) == 0: status = "none_productive" elif len(cell_productive['IGK']) + len(cell_productive['IGL']) == 1: status = "light_only" elif len(cell_productive['IGK']) + len(cell_productive['IGL']) > 1: status = "multi_light" elif len(cell_productive['IGH']) == 1: h_type = re.sub("\*.+", "", cell_productive['IGH'][0]['c_call']) if len(cell_productive['IGK']) + len(cell_productive['IGL']) == 0: status = "heavy_only" elif len(cell_productive['IGK']) + len(cell_productive['IGL']) == 1: status = "canonical_pair" elif len(cell_productive['IGK']) + len(cell_productive['IGL']) == 2: status = "possible_inclusion" elif len(cell_productive['IGK']) + len(cell_productive['IGL']) > 2: status = "probable_multiplet" elif len(cell_productive['IGH']) > 1: status = "multi_heavy" status_count[status] += 1 status_dict[c] = status #print to filtered rearrangements file if status in arguments['--save']: for loc in cell_processed: for chain in cell_processed[loc]: cells_only.write( chain ) #now log the cell print( "\t".join( [c, status, h_type, str(len(cell_productive['IGH'])), str(len(cell_processed['IGH'])), ";".join([chain['junction_aa'] for chain in cell_processed['IGH']]), str(len(cell_productive['IGK'])), str(len(cell_processed['IGK'])), ";".join([chain['junction_aa'] for chain in 
cell_processed['IGK']]), str(len(cell_productive['IGL'])), str(len(cell_processed['IGL'])), ";".join([chain['junction_aa'] for chain in cell_processed['IGL']]) ] ), file=output) output.close() with open("%s/cell_processing.log"%prj_tree.logs, "w") as log: print("\t".join(status_list), file=log) print("\t".join([str(status_count[s]) for s in status_list]), file=log) print("\t".join(status_list)) print("\t".join([str(status_count[s]) for s in status_list]))
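# Hedged downstream sketch (not part of the source script): tally cell statuses
# from the per-cell table written above. The literal path stands in for
# "{prj_tree.tables}/{prj_name}_cell_stats.tsv".
import pandas as pd

cell_stats = pd.read_csv("cell_stats.tsv", sep="\t")
print(cell_stats["status"].value_counts())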
parser.add_argument('rearrangement_file', type=str, help='Rearrangement AIRR TSV file name')
args = parser.parse_args()
if args:
    # connection header
    config = getConfig()
    header = 'var conn = new Mongo();\n'
    header += 'var db = conn.getDB("admin");\n'
    header += 'db.auth("' + config['service_user'] + '", "' + config['service_secret'] + '");\n'
    header += 'db = db.getSiblingDB("' + config['db'] + '");\n'

    print("Reading file: " + args.rearrangement_file)
    reader = airr.read_rearrangement(args.rearrangement_file)

    os.system("mkdir /work_data/tmp")
    fnum = 0
    fname = '/work_data/tmp/rearrangement' + str(fnum) + '.js'
    print('Creating file: ' + fname)
    fout = open(fname, 'w')
    fout.write(header)

    # delete any existing records
    fout.write('db.rearrangement.deleteMany({"repertoire_id":"' + args.repertoire_id + '"});\n')

    seqCount = 0
    for row in reader:
        if row.get('repertoire_id') is None:
import airr
import numpy as np

# We have 4 T cell subsets
subsets = {
    'CL_0000895': [0 for number in range(0, 50)],
    'CL_0000900': [0 for number in range(0, 50)],
    'CL_0000897': [0 for number in range(0, 50)],
    'CL_0000909': [0 for number in range(0, 50)]
}

# Load the repertoire metadata
data = airr.load_repertoire('repertoires.airr.json')
repertoires = {obj['repertoire_id']: obj for obj in data['Repertoire']}

# Iterate through the rearrangement data and tabulate the counts
reader = airr.read_rearrangement('rearrangements.tsv')
for row in reader:
    # get the appropriate repertoire
    rep = repertoires[row['repertoire_id']]
    # use the cell_subset field in the repertoire
    c = subsets[rep['sample'][0]['cell_subset']['id']]
    # increment the length count
    if row['junction_aa_length']:
        if int(row['junction_aa_length']) >= 50:
            continue
        #print(int(row['junction_aa_length']))
        c[int(row['junction_aa_length'])] += 1

# normalize the counts so the histograms are comparable
for cnts in subsets:
    total = 0
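    # The snippet is cut off above; what follows is a plausible completion (an
    # assumption, not the original source): sum each subset's counts and convert
    # the bins to fractions so the four histograms are comparable.
    for cnt in subsets[cnts]:
        total += cnt
    if total > 0:
        subsets[cnts] = [cnt / total for cnt in subsets[cnts]]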
def read_airr(path: Union[str, Sequence[str], Path, Sequence[Path]]) -> AnnData:
    """\
    Read AIRR-compliant data.

    Reads data organized in the `AIRR rearrangement schema
    <https://docs.airr-community.org/en/latest/datarep/rearrangements.html>`_.

    The following columns are required:
     * `cell_id`
     * `productive`
     * `locus`
     * `consensus_count`
     * at least one of `junction_aa` or `junction`.

    {doc_working_model}

    Parameters
    ----------
    path
        Path to the AIRR rearrangement tsv file. If different chains are split up into
        multiple files, these can be specified as a List, e.g.
        `["path/to/tcr_alpha.tsv", "path/to/tcr_beta.tsv"]`.

    Returns
    -------
    AnnData object with IR data in `obs` for each cell. For more details see
    :ref:`data-structure`.
    """
    ir_objs = {}

    if isinstance(path, str) or isinstance(path, Path):
        path = [path]

    for tmp_path in path:
        tmp_path = str(tmp_path)
        reader = airr.read_rearrangement(tmp_path)
        for row in reader:
            cell_id = row["cell_id"]
            try:
                tmp_cell = ir_objs[cell_id]
            except KeyError:
                tmp_cell = IrCell(cell_id=cell_id)
                ir_objs[cell_id] = tmp_cell
            try:
                # this is not an official field
                expr = row["umi_count"]
                expr_raw = row["consensus_count"]
            except KeyError:
                expr = row["consensus_count"]
                expr_raw = None

            tmp_cell.add_chain(
                IrChain(
                    is_productive=row["productive"],
                    locus=row["locus"],
                    v_gene=row["v_call"] if "v_call" in row else None,
                    d_gene=row["d_call"] if "d_call" in row else None,
                    j_gene=row["j_call"] if "j_call" in row else None,
                    c_gene=row["c_call"] if "c_call" in row else None,
                    cdr3=row["junction_aa"] if "junction_aa" in row else None,
                    cdr3_nt=row["junction"] if "junction" in row else None,
                    expr=expr,
                    expr_raw=expr_raw,
                )
            )

    return from_ir_objs(ir_objs.values())
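# Hypothetical call of the reader above, using the multi-file form described in
# its docstring (file names are placeholders):
adata = read_airr(["path/to/tcr_alpha.tsv", "path/to/tcr_beta.tsv"])
print(adata.obs.head())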
def main(): #look for cell hashing hashDict = dict() sampleList = [] if os.path.exists(f"{prj_tree.tables}/{prj_name}_hashes.tsv"): with open(f"{prj_tree.tables}/{prj_name}_hashes.tsv", 'r') as handle: reader = csv.reader(handle, delimiter="\t") for row in reader: hashDict[row[0]] = [row[1]] if row[1] == "unknown" or row[1] == "ambiguous": continue elif not row[1] in sampleList: sampleList.append(row[1]) sampleList.sort() sampleList += ["unknown", "ambiguous"] #look for feature barcoding featureDict = dict() if os.path.exists(f"{prj_tree.tables}/{prj_name}_features.tsv"): with open(f"{prj_tree.tables}/{prj_name}_features.tsv", 'r') as handle: reader = csv.reader(handle, delimiter="\t") header = next(reader) featureDict["keys"] = header[1:] for row in reader: featureDict[row[0]] = row[1:] cells_raw = defaultdict(dict) most_used = defaultdict(list) output = open("%s/%s_cell_stats.tsv" % (prj_tree.tables, prj_name), 'w') outwriter = csv.writer(output, delimiter="\t") outheader = ["cell", "status", "isotype"] if len(hashDict) > 0: outheader += ["hash_sample"] if len(featureDict) > 0: outheader += featureDict["keys"] outheader += [ "productive_IGH", "total_IGH", "IGH_junctions", "productive_IGK", "total_IGK", "IGK_junctions", "productive_IGL", "total_IGL", "IGL_junctions" ] outwriter.writerow(outheader) data = airr.read_rearrangement(arguments['--rearrangements']) fields = ["cell_status"] if len(hashDict) > 0: fields += ["hash_sample"] cells_only = airr.derive_rearrangement(re.sub( ".tsv", "_single-cell.tsv", arguments['--rearrangements']), arguments['--rearrangements'], fields=fields) #assume cells might not be grouped together, so make a first pass # to collect everything for r in data: if r['status'] in ['good', 'indel', 'stop', 'nonproductive', 'unique']: #skip irrelevant sequences if r['locus'] not in cells_raw[r['cell_id']]: cells_raw[r['cell_id']][r['locus']] = [r] else: cells_raw[r['cell_id']][r['locus']].append(r) #need better heuristic for this, omit for now #if r['cell_id'] not in most_used[r['centroid']]: # most_used[r['centroid']].append(r['cell_id']) #now go back and process each cell status_list = [ 'canonical_pair', 'possible_inclusion', 'heavy_only', 'light_only', 'multi_light', 'multi_heavy', 'probable_multiplet', 'none_productive' ] status_count = dict() for sample in sampleList: status_count[sample] = dict(zip(status_list, [0, 0, 0, 0, 0, 0, 0, 0])) status_dict = dict() for c in cells_raw: cell_processed = defaultdict(list) cell_productive = defaultdict(list) for locus in cells_raw[c]: #Start with the one with the most UMIs for rep in sorted([r for r in cells_raw[c][locus]], key=lambda k: k['duplicate_count'] or 0, reverse=True): #check if this is a duplicate of a previously kept read keep = True for previous in cell_processed[locus]: #shortcut: assume identical junctions means duplicates if previous['junction_aa'] == rep['junction_aa']: keep = False if previous['duplicate_count'] is not None: previous['duplicate_count'] += rep[ 'duplicate_count'] if previous['consensus_count'] is not None: previous['consensus_count'] += rep[ 'consensus_count'] break #heuristic (for 10x data as of March 2019): omit gaps and cut off possible noise at 5' end else: score, cov = scoreAlign(quickAlign( previous['sequence_alignment'], rep['sequence_alignment']), countInternalGaps=False, skip=50) if score >= 0.95: keep = False if previous['duplicate_count'] is not None: previous['duplicate_count'] += rep[ 'duplicate_count'] if previous['consensus_count'] is not None: previous['consensus_count'] += 
rep[ 'consensus_count'] break if keep: cell_processed[locus].append(rep) if rep['status'] == "good": cell_productive[locus].append(rep) status = "" h_type = "" if len(cell_processed['IGH']) > 2 or len( cell_processed['IGK']) > 2 or len(cell_processed['IGL']) > 2: status = "probable_multiplet" elif len(cell_productive['IGH']) == 0: if len(cell_productive['IGK']) + len(cell_productive['IGL']) == 0: status = "none_productive" elif len(cell_productive['IGK']) + len( cell_productive['IGL']) == 1: status = "light_only" elif len(cell_productive['IGK']) + len(cell_productive['IGL']) > 1: status = "multi_light" elif len(cell_productive['IGH']) == 1: h_type = re.sub("\*.+", "", cell_productive['IGH'][0]['c_call']) if len(cell_productive['IGK']) + len(cell_productive['IGL']) == 0: status = "heavy_only" elif len(cell_productive['IGK']) + len( cell_productive['IGL']) == 1: status = "canonical_pair" elif len(cell_productive['IGK']) + len( cell_productive['IGL']) == 2: status = "possible_inclusion" elif len(cell_productive['IGK']) + len(cell_productive['IGL']) > 2: status = "probable_multiplet" elif len(cell_productive['IGH']) > 1: status = "multi_heavy" status_count[hashDict.get(c, ["unknown"])[0]][status] += 1 status_dict[c] = status #print to filtered rearrangements file #leave out cells with ambiguous hashing assignments if we are doing any filtering if status in arguments['--save']: if hashDict.get( c, ["unknown"] )[0] != "ambiguous" or "probable_multiplet" in arguments['--save']: for loc in cell_processed: for chain in cell_processed[loc]: chain['cell_status'] = status if len(hashDict) > 0: chain['hash_sample'] = hashDict.get( chain['cell_id'], ['unknown'])[0] cells_only.write(chain) #now log the cell outwriter.writerow( [c, status, h_type] + hashDict.get(c, ['unknown'] * (len(hashDict) > 0)) + featureDict.get(c, ['0'] * len(featureDict.get("keys", []))) + [ len(cell_productive['IGH']), len(cell_processed['IGH']), ";".join( [chain['junction_aa'] for chain in cell_processed['IGH']]), len(cell_productive['IGK']), len(cell_processed['IGK']), ";".join( [chain['junction_aa'] for chain in cell_processed['IGK']]), len(cell_productive['IGL']), len(cell_processed['IGL']), ";".join( [chain['junction_aa'] for chain in cell_processed['IGL']]) ]) output.close() with open("%s/cell_processing.log" % prj_tree.logs, "w") as log: print("sample\t" + "\t".join(status_list), file=log) print("sample\t" + "\t".join(status_list)) for sample in sampleList: if sum([status_count[sample][s] for s in status_list]) == 0: continue #leave out `ambiguous` if it's not a hashed sample print( "\t".join([sample] + [str(status_count[sample][s]) for s in status_list]), file=log) print( "\t".join([sample] + [str(status_count[sample][s]) for s in status_list]))
def main(): #start by reading in the GSSP gssp = GSSP(arguments['--gssp']) gssp.computeRarity() #now get germline genes germDB = load_fastas(arguments['--germ']) rareSubs = dict() if arguments['-r'] is not None: for seq in airr.read_rearrangement(arguments['-r']): gl = re.sub("\*.*", "", seq['v_call']) if checkGermSeq(gl, germDB) and checkGSSP(gl, gssp.rarity): rareSubs[seq['sequence_id']] = score(seq['sequence_alignment'], germDB[gl + "*01"], gssp.rarity[gl]) elif arguments['-f'] is not None: #if there's a global V gene, check it if arguments['-v'] is not None: if not checkGermSeq(arguments['-v'], germDB) or not checkGSSP( arguments['-v'], gssp.rarity): sys.exit(1) #set up incase it's a prealigned file alignedV = None for seq in generate_read_fasta(arguments['-f']): #if aligned, then first seq is germline if arguments['-a'] and alignedV is None: alignedV = seq.seq if arguments['-n']: alignedV = alignedV.translate(table=GAPPED_CODON_TABLE) alignedV = str(alignedV) continue #score all other sequences if arguments['-v'] is not None: if arguments['-a']: rareSubs[seq.id] = score(str(seq.seq), alignedV, gssp.rarity[arguments['-v']]) else: rareSubs[seq.id] = score(str(seq.seq), germDB[arguments['-v'] + "*01"], gssp.rarity[arguments['-v']]) else: gl = re.search("(v_call|V_gene)=([^\*\s]+)", seq.description) if gl: if checkGermSeq(gl.group(2), germDB) and checkGSSP( gl.group(2), gssp.rarity): rareSubs[seq.id] = score(str(seq.seq), germDB[gl.group(2) + "*01"], gssp.rarity[gl.group(2)]) else: print( "Could not find V gene annotation for %s, skipping..." % seq.id, file=sys.stderr) continue else: if checkGermSeq(arguments['-v'], germDB) and checkGSSP( arguments['-v'], gssp.rarity): for sequence in arguments['QVQLVQ']: rareSubs[sequence] = score(sequence, germDB[arguments['-v'] + "*01"], gssp.rarity[arguments['-v']]) else: sys.exit(1) #now do output count = 0 if arguments['--lineage']: reverse_dict = defaultdict(list) for seq in rareSubs: for sub in rareSubs[seq]: reverse_dict[sub].append(seq) for sub in sorted(reverse_dict.keys(), key=lambda x: int(re.search("(\d+)", x).group(1))): if 100 * len(reverse_dict[sub]) / len( rareSubs) >= arguments['--threshold']: print(sub) count += 1 else: for seq in rareSubs: if len(rareSubs[seq]) > 0: print(seq + ": " + ",".join(rareSubs[seq])) count += 1 if count == 0: print("No rare substitutions were found")
#!/usr/bin/env python3

# imports
import airr

# read a rearrangements file
print('*****')
print('*****')
print('Read a rearrangements file.')
print('*****')
print('*****')
data = airr.read_rearrangement('toy_data.tsv')
print(data.fields)
print(data.external_fields)
for r in data:
    print(r)

# Create a new rearrangements file with an intermediate parser
# Technically, the parser tool should be reading the VDJ rearrangements
# output file, parsing it, then writing the row data.
print('*****')
print('*****')
print('Create new rearrangements file.')
print('*****')
print('*****')
data = airr.read_rearrangement('toy_data.tsv')
newd = airr.create_rearrangement('my_data.tsv', fields=data.fields)
print(newd.fields)
print(newd.external_fields)
for r in data:
    newd.write(r)
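# A natural follow-up (not part of the original toy example): close the writer
# and confirm that the new file still validates against the Rearrangement schema.
newd.close()
print('*****')
print('Validate the new rearrangements file.')
print('*****')
valid = airr.validate_rearrangement('my_data.tsv', True)
print('my_data.tsv valid:', valid)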
def main(): #first, open the input file and parse into groups with same V/J vj_partition = dict() cdr3_info = dict() seqSize = Counter() #start off by getting size annotations for read in generate_read_fasta(arguments['--full']): seqSize[read.id] = 1 check = re.search("cluster_count=(\d+)", read.description) if check: seqSize[read.id] = int(check.group(1)) gene_pat = re.compile( "(?:v_call|V_gene)=IG([HKL]V[^*]+).*(?:j_call|J_gene)=IG([HKL]J\d)") for sequence in SeqIO.parse(open(arguments['--cdr3'], "r"), "fasta"): genes = re.search(gene_pat, sequence.description) if genes: key = genes.group(1) + "_" + genes.group(2) key = re.sub( "[()/]", "", key) #so /OR or (II) genes don't screw up the file system if key not in vj_partition: temp = "%s/%s.fa" % (prj_tree.lineage, key) vj_partition[key] = { 'group': key, 'handle': open(temp, "w"), 'file': temp, 'count': 0, 'ids': [] } vj_partition[key]['count'] += 1 vj_partition[key]['ids'].append(sequence.id) cdr3_info[sequence.id] = { 'cdr3_len': int(len(sequence.seq) / 3), 'cdr3_seq': sequence.seq.translate() } #make sizes available to vsearch sequence.id += ";size=%d" % seqSize[ sequence.id] #do this even if there's no label #so I don't need to divide the cases for vsearch #and write SeqIO.write([sequence], vj_partition[key]['handle'], 'fasta') else: print("Couldn't find V and J genes for %s %s, skipping..." % (sequence.id, sequence.description)) global natives natives = dict() if arguments['--natives'] is not None: natives = load_fastas(arguments['--natives']) for n, s in natives.items(): if arguments['-v'] is not None: key = arguments['-v'] + "_" + arguments['-j'] else: genes = re.search(gene_pat, s.description) if genes: key = genes.group(1) + "_" + genes.group(2) else: sys.exit( "Can't find V and J gene annotations for native sequence %s. Please specify using the -v and -j parameters." % n) key = re.sub( "[()/]", "", key ) #wouldn't expect this to be relevant for natives, but just in case... if key not in vj_partition: print( "No NGS sequences with the same V/J genes as native sequence %s (%s); skipping..." 
% (n, key)) continue seqSize[n] = 0 s.id += ";size=1" vj_partition[key]['count'] += 1 vj_partition[key]['ids'].append(n) cdr3_info[n] = { 'cdr3_len': int(len(s.seq) / 3), 'cdr3_seq': s.seq.translate() } SeqIO.write([s], vj_partition[key]['handle'], 'fasta') #close the file handles and delete the reference, so dict can be pickled for multithreading for cluster in vj_partition: vj_partition[cluster]['handle'].close() del vj_partition[cluster]['handle'] #now go through and cluster each V/J grouping clusterLookup = dict() centroidData = dict() clusterSizes = Counter() if arguments['-t'] > 1: pool = Pool(arguments['-t']) blob = pool.map(processClusters, iterator_slice( vj_partition.values(), 25)) #number per slice needs optimization pool.close() pool.join() for d in blob: clusterLookup.update(d['cl']) centroidData.update(d['cd']) clusterSizes.update(d['cs']) else: #don't thread d = processClusters((0, vj_partition.values())) clusterLookup.update(d['cl']) centroidData.update(d['cd']) clusterSizes.update(d['cs']) #now process all clusters and do tabular output with open("%s/%s_lineages.txt" % (prj_tree.tables, prj_name), "w") as handle: writer = csv.writer(handle, delimiter=sep) writer.writerow([ "clone_id", "sequence_id", "v_call", "j_call", "junction_length_aa", "junction_aa", "clone_count", "included_mAbs" ]) for rank, (centroid, size) in enumerate(clusterSizes.most_common()): centroidData[centroid]['rank'] = rank + 1 writer.writerow([ "%05d" % (rank + 1), centroid, centroidData[centroid]['vgene'], centroidData[centroid]['jgene'], cdr3_info[centroid]['cdr3_len'], cdr3_info[centroid]['cdr3_seq'], size, ",".join(centroidData[centroid]['nats']) ]) #do sequence output notationFile = re.sub("\.f.+", "_lineageNotations.fa", arguments['--full']) repFile = re.sub("\.f.+", "_lineageRepresentatives.fa", arguments['--full']) rep_seqs = [] with open(notationFile, "w") as handle: for read in generate_read_fasta(arguments['--full']): if ";" in read.id: read.id = read.id[ 0:8] #this is for raw VSearch output with size annotations #shouldn't be relevant in pipeline context if read.id not in clusterLookup: continue read.description += " clone_id=%05d clone_rep=%s clone_count=%d" % ( centroidData[clusterLookup[read.id]]['rank'], clusterLookup[read.id], clusterSizes[clusterLookup[read.id]]) SeqIO.write([read], handle, "fasta") if read.id in centroidData: rep_seqs.append(read) with open(repFile, "w") as handle: #use a sort to put them out in order of lineage rank (ie size) SeqIO.write( sorted(rep_seqs, key=lambda cent: centroidData[cent.id]['rank']), handle, "fasta") #do AIRR output if os.path.dirname(arguments['--full']) == prj_tree.nt: if os.path.isfile("%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name)): withLin = airr.derive_rearrangement( "updateRearrangements.tsv", "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name), fields=["clone_id", "clone_count"]) for r in airr.read_rearrangement("%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name)): if r['sequence_id'] in clusterLookup: r['clone_id'] = "%05d" % centroidData[clusterLookup[ r['sequence_id']]]['rank'] r['clone_count'] = clusterSizes[clusterLookup[ r['sequence_id']]] else: #prevent mix-and-match data if this gets run multiple times with multiple settings r['clone_id'] = "" r['clone_count'] = "" withLin.write(r) withLin.close() os.rename("updateRearrangements.tsv", "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name))
def main(): #start by making possible "duplicate_count" info available to vsearch with open("temp.fa", "w") as handle: SeqIO.write(reformatInput(arguments['-f']), handle, "fasta") #first step on higher identity subprocess.call([ vsearch, "-derep_fulllength", "temp.fa", "-output", "temp_dedup.fa", "-uc", "temp.uc", "-sizein", "-sizeout", "-minuniquesize", arguments['--min1'] ]) #process the uc file centroid = dict() with open("temp.uc", "r") as handle: uc = csv.reader(handle, delimiter="\t") for row in uc: if row[0] == "H": centroid[row[8]] = row[9] #second clustering step subprocess.call([ vsearch, "-cluster_size", "temp_dedup.fa", "-sizein", "-sizeout", "-maxgaps", "0", "-id", arguments['--id'], "-uc", "%s.cluster" % os.path.splitext(arguments['-f'])[0] ]) #process the uc file size = dict() with open("%s.cluster" % os.path.splitext(arguments['-f'])[0], "r") as handle: uc = csv.reader(handle, delimiter="\t") for row in uc: if row[0] == "H": centroid[re.sub(";size=\d+", "", row[8])] = re.sub(";size=\d+", "", row[9]) elif row[0] == "C" and int(row[2]) >= arguments['--min2']: size[re.sub(";size=\d+", "", row[8])] = int(row[2]) #clean up os.remove("temp.fa") os.remove("temp_dedup.fa") os.remove("temp.uc") #do sequence outputs with open("%s_unique.fa" % os.path.splitext(arguments['-f'])[0], "w") as handle: SeqIO.write(getUniques(arguments['-f'], size), handle, 'fasta') #retrieve unique CDR3s (and do AA seqs as appropriate) if "goodVJ" in arguments['-f']: cdr3_file = re.sub("goodVJ", "goodCDR3", arguments['-f']) if os.path.isfile(cdr3_file): with open("%s_unique.fa" % os.path.splitext(cdr3_file)[0], "w") as handle: SeqIO.write(getUniques(cdr3_file, size), handle, 'fasta') else: print("Can't find %s to extract unique sequences..." % cdr3_file, file=sys.stderr) if "nucleotide" in cdr3_file: cdr3_file = re.sub("nucleotide", "amino_acid", cdr3_file) if os.path.isfile(cdr3_file): with open("%s_unique.fa" % os.path.splitext(cdr3_file)[0], "w") as handle: SeqIO.write(getUniques(cdr3_file, size), handle, 'fasta') else: print("Can't find %s to extract unique sequences..." % cdr3_file, file=sys.stderr) if "nucleotide" in arguments['-f']: aa_file = re.sub("nucleotide", "amino_acid", arguments['-f']) if os.path.isfile(aa_file): with open("%s_unique.fa" % os.path.splitext(aa_file)[0], "w") as handle: SeqIO.write(getUniques(aa_file, size), handle, 'fasta') else: print("Can't find %s to extract unique sequences..." 
% aa_file, file=sys.stderr) #now do AIRR output if arguments[ '-f'] == "output/sequences/nucleotide/%s_goodVJ.fa" % prj_name: if os.path.isfile("%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name)): clustered = airr.derive_rearrangement( "updateRearrangements.tsv", "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name), fields=['centroid', 'cluster_count']) for r in airr.read_rearrangement("%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name)): r['centroid'] = centroid.get( centroid.get(r['sequence_id'], ""), centroid.get(r['sequence_id'], "")) if r['sequence_id'] in size: r['cluster_count'] = size[r['sequence_id']] r['status'] = "unique" elif r['sequence_id'] in centroid: #prevent mix-and-match data if this gets run multiple times with multiple settings #I can get away with this because rearrangements.tsv only gets edited when clustering # the goodVJ file r['cluster_count'] = "" r['status'] = "good" clustered.write(r) clustered.close() os.rename("updateRearrangements.tsv", "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name)) else: print( "Can't find the rearrangements file, not saving data in AIRR format", file=sys.stderr)
import airr
import sys

for line in airr.read_rearrangement(sys.argv[1], validate=True):
    continue
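# Equivalent sketch with a visible result (same command-line usage as above):
# count how many records pass row-level validation instead of silently iterating.
import sys
import airr

count = 0
for record in airr.read_rearrangement(sys.argv[1], validate=True):
    count += 1
print("validated %d rearrangement records" % count, file=sys.stderr)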
def main(): if not glob.glob("%s/%s_*.fasta" % (prj_tree.jgene, prj_name)): sys.exit("No jBlast output found!\n") maxFiles = len( glob.glob("%s/%s_*.fasta" % (prj_tree.vgene, prj_name)) ) print( "curating junction and 3' end..." ) if arguments['--cluster']: command = "NUM=`printf \"%s\" $SGE_TASK_ID`\n%s/annotate/parse_blast.py --jmotif '%s' --nterm %s --chunk $NUM\n" % \ ( "%03d", SCRIPT_FOLDER, arguments['--jmotif'], arguments['--nterm'] ) if arguments['--noFallBack']: command += " --noFallBack" pbs = open("%s/parse.sh"%prj_tree.jgene, 'w') pbs.write( "#!/bin/bash\n#$ -N parse-%s\n#$ -l h_vmem=2G\n#$ -cwd\n#$ -o %s/parse.o$JOB_ID.$SGE_TASK_ID\n#$ -o %s/parse.e$JOB_ID.$SGE_TASK_ID\n\n%s\n" % (prj_name, prj_tree.annotate, prj_tree.annotate, command) ) pbs.close() subprocess.call([qsub, '-sync', 'y', '-t', "1-%d"%maxFiles, "%s/parse.sh"%prj_tree.jgene]) else: #do it locally parse_pool = Pool(arguments['--threads']) parse_pool.map(callParser, range(1,maxFiles+1)) parse_pool.close() parse_pool.join() #ok, now collect all of the partial outputs and merge them print( "collecting information...") #open fasta outputs allV_aa = open ("%s/%s_allV.fa" % (prj_tree.aa, prj_name), "w" ) allV_nt = open( "%s/%s_allV.fa" % (prj_tree.nt, prj_name), "w" ) allJ_aa = open( "%s/%s_allJ.fa" % (prj_tree.aa, prj_name), "w" ) allJ_nt = open( "%s/%s_allJ.fa" % (prj_tree.nt, prj_name), "w" ) vj_aa = open( "%s/%s_goodVJ.fa" % (prj_tree.aa, prj_name), "w" ) vj_nt = open( "%s/%s_goodVJ.fa" % (prj_tree.nt, prj_name), "w" ) good_cdr3_aa = open( "%s/%s_goodCDR3.fa" % (prj_tree.aa, prj_name), "w" ) good_cdr3_nt = open( "%s/%s_goodCDR3.fa" % (prj_tree.nt, prj_name), "w" ) all_cdr3_aa = open( "%s/%s_allCDR3.fa" % (prj_tree.aa, prj_name), "w" ) all_cdr3_nt = open( "%s/%s_allCDR3.fa" % (prj_tree.nt, prj_name), "w" ) #also open final rearrangements tsv seq_stats = airr.create_rearrangement( "%s/%s_rearrangements.tsv"%(prj_tree.tables, prj_name), fields=['vj_in_frame','stop_codon','locus','c_call','junction_length','source_file','source_id','duplicate_count','length_raw','length_trimmed','indels','status','blast_identity','consensus_count','cell_id']) #initiate overall counters raw_count, total = 0, 0 counts = {'good':0,'nonproductive':0,'indel':0,'noCDR3':0,'stop':0,'noV':0,'noJ':0,'missingNterm':0,'chimera':0} dict_jcounts = Counter() dict_ccounts = Counter() dict_dcounts = Counter() c = False if os.path.isfile("%s/%s_C_001.txt" % (prj_tree.jgene, prj_name)): c = True d = False if os.path.isfile("%s/%s_D_001.txt" % (prj_tree.jgene, prj_name)): d = True #iterate over subset rearrangement files and combine #include generating fasta output as appropriate for f_ind in range(1, maxFiles+1): #merge partial blast hit tables with open( "%s/%s_jgerm_tophit.txt" % (prj_tree.tables, prj_name), "a") as table: with open( "%s/jtophit_%03d.txt" % (prj_tree.jgene, f_ind), "r" ) as partial: table.write(partial.read()) if d: with open( "%s/%s_dgerm_tophit.txt" % (prj_tree.tables, prj_name), "a") as table: with open( "%s/dtophit_%03d.txt" % (prj_tree.jgene, f_ind), "r" ) as partial: table.write(partial.read()) if c: with open( "%s/%s_cgerm_tophit.txt" % (prj_tree.tables, prj_name), "a") as table: with open( "%s/ctophit_%03d.txt" % (prj_tree.jgene, f_ind), "r" ) as partial: table.write(partial.read()) #go through partial rearrangements files for r in airr.read_rearrangement( "%s/rearrangements_%03d.tsv"%(prj_tree.internal, f_ind) ): seq_stats.write( r ) #count j/d/c gene usages if not r['j_call'] == "": dict_jcounts[ r['j_call'].split(",")[0] ] 
+= 1 if not r['j_call'] == "": dict_jcounts[ r['d_call'].split(",")[0] ] += 1 if not r['j_call'] == "": dict_jcounts[ r['c_call'].split(",")[0] ] += 1 #count statuses counts[ r['status'] ] += 1 total += 1 raw_count = int( r['sequence_id'] ) #technically, this undercounts if the last one # isn't in the `correct_length` interval, but I # don't have a better solution that isn't super # kludgy right now #ok, now do sequence output # start by collecting metadata for fasta def line def_line = ">%s" % r['sequence_id'] if not r['v_call'] == '': def_line += " v_call=%s" % r['v_call'] if not r['d_call'] == '': def_line += " d_call=%s" % r['d_call'] if not r['j_call'] == '': def_line += " j_call=%s" % r['j_call'] if not r['locus'] == '': def_line += " locus=%s" % r['locus'] if not r['c_call'] == '': def_line += " c_call=%s" % r['c_call'] if not r['status'] == '': def_line += " status=%s" % r['status'] # if not r['v_identity'] == '': def_line += " v_identity=%s" % r['v_identity'] if not r['junction_length'] == '': def_line += " junction_length=%s" % r['junction_length'] if not r['junction'] == '': def_line += " junction=%s" % r['junction'] if not r['junction_aa'] == '': def_line += " junction_aa=%s" % r['junction_aa'] if not r['duplicate_count'] == '': def_line += " duplicate_count=%s" % r['duplicate_count'] if not r['consensus_count'] == '': def_line += " consensus_count=%s" % r['consensus_count'] if not r['cell_id'] == '': def_line += " cell_id=%s" % r['cell_id'] #work our way up the hierarchy, putting sequences in the appropriate files ungapped = re.sub( "-", "", r['sequence_alignment']) #reintroduces any frameshift errors in translation # this has always been the behavior, but I wonder # if I should change/update now that I am using # proper alignments. if not r['status'] in ['noV', 'missingNterm', "chimera"]: allV_nt.write( "%s\n%s\n" % (def_line, ungapped) ) allV_aa.write( "%s\n%s\n" % (def_line, Seq.Seq(ungapped).translate()) ) if not r['status'] == 'noJ': allJ_nt.write( "%s\n%s\n" % (def_line, ungapped) ) allJ_aa.write( "%s\n%s\n" % (def_line, Seq.Seq(ungapped).translate()) ) if not r['status'] == 'noCDR3': all_cdr3_nt.write( "%s\n%s\n" % (def_line, r['junction']) ) all_cdr3_aa.write( "%s\n%s\n" % (def_line, r['junction_aa']) ) if r['status'] == "good": vj_nt.write( "%s\n%s\n" % (def_line, ungapped) ) vj_aa.write( "%s\n%s\n" % (def_line, Seq.Seq(ungapped).translate()) ) good_cdr3_nt.write( "%s\n%s\n" % (def_line, r['junction']) ) good_cdr3_aa.write( "%s\n%s\n" % (def_line, r['junction_aa']) ) #close outputs allV_aa.close() allV_nt.close() allJ_aa.close() allJ_nt.close() vj_aa.close() vj_nt.close() good_cdr3_aa.close() good_cdr3_nt.close() all_cdr3_aa.close() all_cdr3_nt.close() #useful number found = total - counts['noV'] - counts['noJ'] - counts['chimera'] #print out some statistics handle = open("%s/%s_jgerm_stat.txt" %(prj_tree.tables, prj_name),'w') writer = csv.writer(handle, delimiter = sep) keys = sorted(dict_jcounts.keys()) writer.writerow(["gene", "count", "percent"]) for key in keys: aline = [ key, dict_jcounts[key], "%4.2f" % (dict_jcounts[key] / float(found) * 100) ] writer.writerow(aline) handle.close() if len(dict_ccounts) > 0: handle = open("%s/%s_cgerm_stat.txt" %(prj_tree.tables, prj_name),'w') writer = csv.writer(handle, delimiter = sep) keys = sorted(dict_ccounts.keys()) writer.writerow(["gene", "count", "percent"]) for key in keys: aline = [ key, dict_ccounts[key], "%4.2f" % (dict_ccounts[key] / float(found) * 100) ] writer.writerow(aline) handle.close() if 
len(dict_dcounts) > 0: handle = open("%s/%s_dgerm_stat.txt" %(prj_tree.tables, prj_name),'w') writer = csv.writer(handle, delimiter = sep) keys = sorted(dict_dcounts.keys()) writer.writerow(["gene", "count", "percent"]) for key in keys: aline = [ key, dict_dcounts[key], "%4.2f" % (dict_dcounts[key] / float(found) * 100) ] writer.writerow(aline) handle.close() message = "\nTotal raw reads: %d\nCorrect Length: %d\nV assigned: %d\nJ assigned: %d\nCDR3 assigned: %d\nIn-frame junction: %d\nNo indels: %d\nContinuous ORF with no stop codons: %d\n\n" % \ (raw_count, total, total-counts['noV']-counts['chimera'], found, found-counts['noCDR3'], found-counts['noCDR3']-counts['nonproductive'], found-counts['noCDR3']-counts['nonproductive']-counts['indel'], counts['good']) print( message ) handle = open("%s/finalize_blast.log"%prj_tree.logs, "w") handle.write(message) handle.close() # call 1.4 or 1.5 if requested if arguments['--runClustering']: cmd = "%s/annotate/1.4-cluster_sequences.py" % SCRIPT_FOLDER for opt in [ '--file', '--min1', '--min2', '--id', '--maxgaps', '--rearrangements', '--save' ]: if arguments[opt] is not None: cmd += " %s '%s'" % (opt, arguments[opt]) if arguments['--runCellStatistics']: cmd += " --runCellStatistics" print( "Calling 1.4 with command line: %s" % cmd ) os.system( cmd ) elif arguments['--runCellStatistics']: cmd = "%s/annotate/1.5-single_cell_statistics.py" % SCRIPT_FOLDER for opt in [ '--rearrangements', '--save' ]: if arguments[opt] is not None: cmd += " %s '%s'" % (opt, arguments[opt]) print( "Calling 1.5 with command line: %s" % cmd ) os.system( cmd ) #clean up!! oldFiles = glob.glob("%s/*txt"%prj_tree.vgene) + glob.glob("%s/*fasta"%prj_tree.vgene) + glob.glob("%s/*txt"%prj_tree.jgene) + glob.glob("%s/*fasta"%prj_tree.jgene) + glob.glob("%s/*tsv"%prj_tree.jgene) + glob.glob("%s/lookup*"%prj_tree.internal) if len(oldFiles) > 0 and not arguments['--noclean']: [os.remove(f) for f in oldFiles]
def main(): #start by making possible "duplicate_count" info available to vsearch with open("temp.fa", "w") as handle: SeqIO.write(reformatInput(arguments['--file']), handle, "fasta") #first step on higher identity subprocess.call([ vsearch, "-derep_fulllength", "temp.fa", "-output", "temp_dedup.fa", "-uc", "temp.uc", "-sizein", "-sizeout", "-minuniquesize", arguments['--min1'] ]) #process the uc file centroid = dict() with open("temp.uc", "r") as handle: uc = csv.reader(handle, delimiter="\t") for row in uc: if row[0] == "H": centroid[re.sub(";size=\d+", "", row[8])] = re.sub(";size=\d+", "", row[9]) #second clustering step subprocess.call([ vsearch, "-cluster_size", "temp_dedup.fa", "-sizein", "-sizeout", "-maxgaps", arguments['--maxgaps'], "-id", arguments['--id'], "-uc", "%s.cluster" % os.path.splitext(arguments['--file'])[0] ]) #process the uc file size = dict() with open("%s.cluster" % os.path.splitext(arguments['--file'])[0], "r") as handle: uc = csv.reader(handle, delimiter="\t") for row in uc: if row[0] == "H": centroid[re.sub(";size=\d+", "", row[8])] = re.sub(";size=\d+", "", row[9]) elif row[0] == "C": #have the centroids point to themselves for more uniform dowsntream processing centroid[re.sub(";size=\d+", "", row[8])] = re.sub(";size=\d+", "", row[8]) #but only save them if they meet the threshold if int(row[2]) >= arguments['--min2']: size[re.sub(";size=\d+", "", row[8])] = int(row[2]) #clean up os.remove("temp.fa") os.remove("temp_dedup.fa") os.remove("temp.uc") #do sequence outputs with open("%s_unique.fa" % os.path.splitext(arguments['--file'])[0], "w") as handle: SeqIO.write(getUniques(arguments['--file'], size), handle, 'fasta') #retrieve unique CDR3s (and do AA seqs as appropriate) if "goodVJ" in arguments['--file']: cdr3_file = re.sub("goodVJ", "goodCDR3", arguments['--file']) if os.path.isfile(cdr3_file): with open("%s_unique.fa" % os.path.splitext(cdr3_file)[0], "w") as handle: SeqIO.write(getUniques(cdr3_file, size), handle, 'fasta') else: print("Can't find %s to extract unique sequences..." % cdr3_file, file=sys.stderr) if "nucleotide" in cdr3_file: cdr3_file = re.sub("nucleotide", "amino_acid", cdr3_file) if os.path.isfile(cdr3_file): with open("%s_unique.fa" % os.path.splitext(cdr3_file)[0], "w") as handle: SeqIO.write(getUniques(cdr3_file, size), handle, 'fasta') else: print("Can't find %s to extract unique sequences..." % cdr3_file, file=sys.stderr) if "nucleotide" in arguments['--file']: aa_file = re.sub("nucleotide", "amino_acid", arguments['--file']) if os.path.isfile(aa_file): with open("%s_unique.fa" % os.path.splitext(aa_file)[0], "w") as handle: SeqIO.write(getUniques(aa_file, size), handle, 'fasta') else: print("Can't find %s to extract unique sequences..." 
% aa_file, file=sys.stderr) #now do AIRR output if "output/sequences/nucleotide" in arguments['--file']: if os.path.isfile("%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name)): clustered = airr.derive_rearrangement( "updateRearrangements.tsv", "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name), fields=['centroid', 'cluster_count']) for r in airr.read_rearrangement("%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name)): #clear old annotations in case we ran 1.4 previously r['centroid'] = "" r['cluster_count'] = "" #now add back current annotations #two rounds of clustering means we start by looking for the centroid of the centroid, # falling back to the first level centroid (second clustering step only) if appropriate r['centroid'] = centroid.get( centroid.get(r['sequence_id'], ""), centroid.get(r['sequence_id'], "")) #add cluster size information for final centroids. I am doing away with changing the 'status' of # the centroids to 'unique' because I've started using this script in a lot of cases where it # doesn't make sense to treat 'unique' as a subset of 'good', and I therefore need to preserve # the original status designation. To find centroids, look for a non-null 'cluster_count' field if r['sequence_id'] in size: r['cluster_count'] = size[r['sequence_id']] clustered.write(r) clustered.close() os.rename("updateRearrangements.tsv", "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name)) else: print( "Can't find the rearrangements file, not saving data in AIRR format", file=sys.stderr) # call 1.5 if requested if arguments['--runCellStatistics']: cmd = "%s/annotate/1.5-single_cell_statistics.py" % SCRIPT_FOLDER if arguments['--rearrangements'] is not None: cmd += " --rearrangements %s" % arguments['--rearrangements'] if arguments['--save'] is not None: cmd += " --save %s" % arguments['--save'] print("Calling 1.5 with command line: %s" % cmd) os.system(cmd)
def read_airr( path: Union[str, Sequence[str], Path, Sequence[Path]], use_umi_count_col: Union[bool, Literal["auto"]] = "auto", infer_locus: bool = True, cell_attributes: Collection[str] = DEFAULT_AIRR_CELL_ATTRIBUTES, include_fields: Optional[Collection[str]] = DEFAULT_AIRR_FIELDS, ) -> AnnData: """\ Read data from `AIRR rearrangement <https://docs.airr-community.org/en/latest/datarep/rearrangements.html>`_ format. The following columns are required by scirpy: * `cell_id` * `productive` * `locus` * at least one of `consensus_count`, `duplicate_count`, or `umi_count` * at least one of `junction_aa` or `junction`. Data should still import if one of these fields is missing, but they are required by most of scirpy's processing functions. All chains for which the field `junction_aa` is missing or empty, will be considered as non-productive and will be moved to the `extra_chains` column. {doc_working_model} Parameters ---------- path Path to the AIRR rearrangement tsv file. If different chains are split up into multiple files, these can be specified as a List, e.g. `["path/to/tcr_alpha.tsv", "path/to/tcr_beta.tsv"]`. use_umi_count_col Whether to add UMI counts from the non-strandard (but common) `umi_count` column. When this column is used, the UMI counts are moved over to the standard `duplicate_count` column. Default: Use `umi_count` if there is no `duplicate_count` column present. infer_locus Try to infer the `locus` column from gene names, in case it is not specified. cell_attributes Fields in the rearrangement schema that are specific for a cell rather than a chain. The values must be identical over all records belonging to a cell. This defaults to {cell_attributes}. include_fields The fields to include in `adata`. The AIRR rearrangment schema contains can contain a lot of columns, most of which irrelevant for most analyses. Per default, this includes a subset of columns relevant for a typical scirpy analysis, to keep `adata.obs` a bit cleaner. Defaults to {include_fields}. Set this to `None` to include all columns. Returns ------- AnnData object with IR data in `obs` for each cell. For more details see :ref:`data-structure`. """ airr_cells = {} logger = _IOLogger() if isinstance(path, (str, Path, pd.DataFrame)): path: list = [path] def _decide_use_umi_count_col(chain_dict): """Logic to decide whether or not to use counts form the `umi_counts` column.""" if ( "umi_count" in chain_dict and use_umi_count_col == "auto" and "duplicate_count" not in chain_dict ): logger.warning( "Renaming the non-standard `umi_count` column to `duplicate_count`. " ) # type: ignore return True elif use_umi_count_col is True: return True else: return False for tmp_path in path: if isinstance(tmp_path, pd.DataFrame): iterator = tmp_path.to_dict(orient="records") else: iterator = airr.read_rearrangement(str(tmp_path)) for chain_dict in iterator: cell_id = chain_dict.pop("cell_id") try: tmp_cell = airr_cells[cell_id] except KeyError: tmp_cell = AirrCell( cell_id=cell_id, logger=logger, cell_attribute_fields=cell_attributes, ) airr_cells[cell_id] = tmp_cell if _decide_use_umi_count_col(chain_dict): chain_dict["duplicate_count"] = RearrangementSchema.to_int( chain_dict.pop("umi_count") ) if infer_locus and "locus" not in chain_dict: logger.warning( "`locus` column not found in input data. The locus is being inferred from the {v,d,j,c}_call columns." ) chain_dict["locus"] = _infer_locus_from_gene_names(chain_dict) tmp_cell.add_chain(chain_dict) return from_airr_cells(airr_cells.values(), include_fields=include_fields)
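# Hypothetical usage of the reader above (placeholder file name): since the
# input loop also accepts pandas DataFrames, a pre-loaded table can be passed
# directly instead of a path.
import pandas as pd

rearrangements = pd.read_csv("rearrangements.tsv", sep="\t")
adata = read_airr(rearrangements)
print(adata.obs.shape)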
def main(): if not glob.glob("%s/%s_*.fasta" % (prj_tree.jgene, prj_name)): sys.exit("No jBlast output found!\n") maxFiles = len( glob.glob("%s/%s_*.fasta" % (prj_tree.vgene, prj_name)) ) if not arguments['--reenter']: print( "curating junction and 3' end..." ) if arguments['--cluster']: command = "NUM=`printf \"%s\" $SGE_TASK_ID`\n%s/annotate/parse_blast.py --jmotif '%s' --nterm %s --chunk $NUM\n" % \ ( "%03d", SCRIPT_FOLDER, arguments['--jmotif'], arguments['--nterm'] ) if arguments['--noFallBack']: command += " --noFallBack" pbs = open("%s/parse.sh"%prj_tree.jgene, 'w') pbs.write( "#!/bin/bash\n#$ -N parse-%s\n#$ -l mem=2G\n#$ -cwd\n\n%s\n" % (prj_name, command) ) pbs.close() os.system( "%s -t 1-%d %s/parse.sh"%(qsub,maxFiles,prj_tree.jgene) ) restart = "%s/annotate/1.3-finalize_assignments.py --reenter" % SCRIPT_FOLDER for opt in [ '--file', '--min1', '--min2', '--id', '--maxgaps', '--rearrangements', '--save']: if arguments[opt] is not None: restart += " %s %s" % (opt, arguments[opt]) for flag in ['--noclean', '--runClustering', '--runCellStatistics']: if arguments[flag]: restart += " %s" % flag monitor = open("%s/parse_monitor.sh"%prj_tree.jgene, 'w') monitor.write( "#!/bin/bash\n#$ -N monitor-%s\n#$ -l mem=2G\n#$ -cwd\n#$ -hold_jid parse-%s\n\n%s\n"%(prj_name, prj_name,restart) ) monitor.close() os.system( "%s %s/parse_monitor.sh"%(qsub,prj_tree.jgene) ) sys.exit() else: #do it locally parse_pool = Pool(arguments['--threads']) parse_pool.map(callParser, range(1,maxFiles+1)) parse_pool.close() parse_pool.join() #ok, now collect all of the partial outputs and merge them print( "collecting information...") #open fasta outputs allV_aa = open ("%s/%s_allV.fa" % (prj_tree.aa, prj_name), "w" ) allV_nt = open( "%s/%s_allV.fa" % (prj_tree.nt, prj_name), "w" ) allJ_aa = open( "%s/%s_allJ.fa" % (prj_tree.aa, prj_name), "w" ) allJ_nt = open( "%s/%s_allJ.fa" % (prj_tree.nt, prj_name), "w" ) vj_aa = open( "%s/%s_goodVJ.fa" % (prj_tree.aa, prj_name), "w" ) vj_nt = open( "%s/%s_goodVJ.fa" % (prj_tree.nt, prj_name), "w" ) good_cdr3_aa = open( "%s/%s_goodCDR3.fa" % (prj_tree.aa, prj_name), "w" ) good_cdr3_nt = open( "%s/%s_goodCDR3.fa" % (prj_tree.nt, prj_name), "w" ) all_cdr3_aa = open( "%s/%s_allCDR3.fa" % (prj_tree.aa, prj_name), "w" ) all_cdr3_nt = open( "%s/%s_allCDR3.fa" % (prj_tree.nt, prj_name), "w" ) #also open final rearrangements tsv seq_stats = airr.create_rearrangement( "%s/%s_rearrangements.tsv"%(prj_tree.tables, prj_name), fields=['vj_in_frame','stop_codon','locus','c_call','junction_length','source_file','source_id','duplicate_count','length_raw','length_trimmed','indels','status','blast_identity','consensus_count','cell_id']) #initiate overall counters raw_count, total = 0, 0 counts = {'good':0,'nonproductive':0,'indel':0,'noCDR3':0,'stop':0,'noV':0,'noJ':0,'missingNterm':0} dict_jcounts = Counter() dict_ccounts = Counter() dict_dcounts = Counter() c = False if os.path.isfile("%s/%s_C_001.txt" % (prj_tree.jgene, prj_name)): c = True d = False if os.path.isfile("%s/%s_D_001.txt" % (prj_tree.jgene, prj_name)): d = True #iterate over subset rearrangement files and combine #include generating fasta output as appropriate for f_ind in range(1, maxFiles+1): #merge partial blast hit tables with open( "%s/%s_jgerm_tophit.txt" % (prj_tree.tables, prj_name), "a") as table: with open( "%s/jtophit_%03d.txt" % (prj_tree.jgene, f_ind), "r" ) as partial: table.write(partial.read()) if d: with open( "%s/%s_dgerm_tophit.txt" % (prj_tree.tables, prj_name), "a") as table: with open( 
"%s/dtophit_%03d.txt" % (prj_tree.jgene, f_ind), "r" ) as partial: table.write(partial.read()) if c: with open( "%s/%s_cgerm_tophit.txt" % (prj_tree.tables, prj_name), "a") as table: with open( "%s/ctophit_%03d.txt" % (prj_tree.jgene, f_ind), "r" ) as partial: table.write(partial.read()) #go through partial rearrangements files for r in airr.read_rearrangement( "%s/rearrangements_%03d.tsv"%(prj_tree.internal, f_ind) ): seq_stats.write( r ) #count j/d/c gene usages if not r['j_call'] == "": dict_jcounts[ r['j_call'].split(",")[0] ] += 1 if not r['j_call'] == "": dict_jcounts[ r['d_call'].split(",")[0] ] += 1 if not r['j_call'] == "": dict_jcounts[ r['c_call'].split(",")[0] ] += 1 #count statuses counts[ r['status'] ] += 1 total += 1 raw_count = int( r['sequence_id'] ) #technically, this undercounts if the last one # isn't in the `correct_length` interval, but I # don't have a better solution that isn't super # kludgy right now #ok, now do sequence output # start by collecting metadata for fasta def line def_line = ">%s" % r['sequence_id'] if not r['v_call'] == '': def_line += " v_call=%s" % r['v_call'] if not r['d_call'] == '': def_line += " d_call=%s" % r['d_call'] if not r['j_call'] == '': def_line += " j_call=%s" % r['j_call'] if not r['locus'] == '': def_line += " locus=%s" % r['locus'] if not r['c_call'] == '': def_line += " c_call=%s" % r['c_call'] if not r['status'] == '': def_line += " status=%s" % r['status'] # if not r['v_identity'] == '': def_line += " v_identity=%s" % r['v_identity'] if not r['junction_length'] == '': def_line += " junction_length=%s" % r['junction_length'] if not r['junction'] == '': def_line += " junction=%s" % r['junction'] if not r['junction_aa'] == '': def_line += " junction_aa=%s" % r['junction'] if not r['duplicate_count'] == '': def_line += " duplicate_count=%s" % r['duplicate_count'] if not r['consensus_count'] == '': def_line += " consensus_count=%s" % r['consensus_count'] if not r['cell_id'] == '': def_line += " cell_id=%s" % r['cell_id'] #work our way up the hierarchy, putting sequences in the appropriate files ungapped = re.sub( "-", "", r['sequence_alignment']) #reintroduces any frameshift errors in translation # this has always been the behavior, but I wonder # if I should change/update now that I am using # proper alignments. 
if not r['status'] in ['noV', 'missingNterm']: allV_nt.write( "%s\n%s\n" % (def_line, ungapped) ) allV_aa.write( "%s\n%s\n" % (def_line, Seq.Seq(ungapped).translate()) ) if not r['status'] == 'noJ': allJ_nt.write( "%s\n%s\n" % (def_line, ungapped) ) allJ_aa.write( "%s\n%s\n" % (def_line, Seq.Seq(ungapped).translate()) ) if not r['status'] == 'noCDR3': all_cdr3_nt.write( "%s\n%s\n" % (def_line, r['junction']) ) all_cdr3_aa.write( "%s\n%s\n" % (def_line, r['junction_aa']) ) if r['status'] == "good": vj_nt.write( "%s\n%s\n" % (def_line, ungapped) ) vj_aa.write( "%s\n%s\n" % (def_line, Seq.Seq(ungapped).translate()) ) good_cdr3_nt.write( "%s\n%s\n" % (def_line, r['junction']) ) good_cdr3_aa.write( "%s\n%s\n" % (def_line, r['junction_aa']) ) #close outputs allV_aa.close() allV_nt.close() allJ_aa.close() allJ_nt.close() vj_aa.close() vj_nt.close() good_cdr3_aa.close() good_cdr3_nt.close() all_cdr3_aa.close() all_cdr3_nt.close() #useful number found = total - counts['noV'] - counts['noJ'] #print out some statistics handle = open("%s/%s_jgerm_stat.txt" %(prj_tree.tables, prj_name),'w') writer = csv.writer(handle, delimiter = sep) keys = sorted(dict_jcounts.keys()) writer.writerow(["gene", "count", "percent"]) for key in keys: aline = [ key, dict_jcounts[key], "%4.2f" % (dict_jcounts[key] / float(found) * 100) ] writer.writerow(aline) handle.close() if len(dict_ccounts) > 0: handle = open("%s/%s_cgerm_stat.txt" %(prj_tree.tables, prj_name),'w') writer = csv.writer(handle, delimiter = sep) keys = sorted(dict_ccounts.keys()) writer.writerow(["gene", "count", "percent"]) for key in keys: aline = [ key, dict_ccounts[key], "%4.2f" % (dict_ccounts[key] / float(found) * 100) ] writer.writerow(aline) handle.close() if len(dict_dcounts) > 0: handle = open("%s/%s_dgerm_stat.txt" %(prj_tree.tables, prj_name),'w') writer = csv.writer(handle, delimiter = sep) keys = sorted(dict_dcounts.keys()) writer.writerow(["gene", "count", "percent"]) for key in keys: aline = [ key, dict_dcounts[key], "%4.2f" % (dict_dcounts[key] / float(found) * 100) ] writer.writerow(aline) handle.close() message = "\nTotal raw reads: %d\nCorrect Length: %d\nV assigned: %d\nJ assigned: %d\nCDR3 assigned: %d\nIn-frame junction: %d\nNo indels: %d\nContinuous ORF with no stop codons: %d\n\n" % \ (raw_count, total, total-counts['noV'], found, found-counts['noCDR3'], found-counts['noCDR3']-counts['nonproductive'], found-counts['noCDR3']-counts['nonproductive']-counts['indel'], counts['good']) print( message ) handle = open("%s/finalize_blast.log"%prj_tree.logs, "w") handle.write(message) handle.close() # call 1.4 if requested if arguments['--runClustering']: cmd = "%s/annotate/1.4-cluster_sequences.py" % SCRIPT_FOLDER for opt in [ '--file', '--min1', '--min2', '--id', '--maxgaps', '--rearrangements', '--save']: if arguments[opt] is not None: cmd += " %s '%s'" % (opt, arguments[opt]) if arguments['--runCellStatistics']: cmd += " --runCellStatistics" print( "Calling 1.4 with command line: %s" % cmd ) os.system( cmd ) #clean up!! oldFiles = glob.glob("%s/*txt"%prj_tree.vgene) + glob.glob("%s/*fasta"%prj_tree.vgene) + glob.glob("%s/*txt"%prj_tree.jgene) + glob.glob("%s/*fasta"%prj_tree.jgene) + glob.glob("%s/*tsv"%prj_tree.jgene) + glob.glob("%s/lookup*"%prj_tree.internal) if len(oldFiles) > 0 and not arguments['--noclean']: [os.remove(f) for f in oldFiles]