Example #1
def filterAirrTsv(rearrangementsFile, annotationList, exact=False):
    good = 0

    for r in airr.read_rearrangement(rearrangementsFile):
        keep = True
        for annot in annotationList:
            if len(annot['list']) > 1 or exact:
                #want exact matches (a single value with exact=False falls through to regex mode; use an anchored pattern like '^foo$' to force an exact match)
                if str(r[annot['column']]) not in annot['list']:
                    keep = False
                    break
            elif not re.search(annot['list'][0], r[annot['column']]):
                keep = False
                break

        if keep:
            good += 1
            if good % 10000 == 0:
                sys.stderr.write(
                    "Found %d matching rearrangements so far...\n" % good)
            yield r
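# A minimal usage sketch of filterAirrTsv (hypothetical file name; the
# 'column'/'list' keys mirror the structure the function above expects):
my_filters = [
    {'column': 'locus', 'list': ['IGH']},             # single value -> regex match
    {'column': 'productive', 'list': ['T', 'TRUE']},  # multiple values -> exact match
]
for rec in filterAirrTsv("my_rearrangements.tsv", my_filters):
    print(rec['sequence_id'])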
Example #2
def format_data(args):
    airr.validate_rearrangement(args.rearrangements, True)
    reader = airr.read_rearrangement(args.rearrangements)
    empty_list = []

    # keep only the junction, junction_aa, V gene, J gene, and repertoire ID columns
    keys = ["junction_aa", "v_call", "j_call", "junction", "repertoire_id"]
    for row in reader:
        empty_list.append({x: row[x] for x in keys})

    df = pd.DataFrame(empty_list)

    # replace empty junction_aa cells with NaN
    df["junction_aa"].replace("", np.nan, inplace=True)

    # delete rows with NaN junction_aa
    df.dropna(subset=["junction_aa"], inplace=True)

    # delete rows with an X in the junction_aa
    df = df[~df.junction_aa.str.contains("X")]

    # delete rows where junction_aa doesn't start with C
    df = df[df.junction_aa.str.startswith("C")]

    # delete rows where junction_aa doesn't end with F or W
    df = df[df.junction_aa.str.endswith(("F", "W"))]

    # delete rows where the chain in v_call and j_call doesn't match
    df = df[(df["v_call"].str[2] == df["j_call"].str[2])]

    # keep only the first V gene when there are multiple in the column
    df["v_call"] = df.v_call.str.split(",", n=1, expand=True)[0]

    # remove allele information from v_call and keep only the gene information
    df["v_call"] = df.apply(lambda x: x["v_call"][:-3], axis=1)
    df["chain"] = df.apply(lambda x: x["v_call"][2], axis=1)

    return df
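# Hypothetical invocation of format_data (assumes the airr / pandas / numpy
# imports the excerpt above relies on): it expects an object with a
# 'rearrangements' attribute, e.g. an argparse Namespace, naming an AIRR TSV.
import argparse
demo_args = argparse.Namespace(rearrangements="rearrangements.tsv")
df = format_data(demo_args)
print(df.groupby("chain").size())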
Example #3
def main():

    global germs
    germs = dict()
    for entry in SeqIO.parse(open(arguments['-g'], "r"), "fasta"):
        germs[entry.id] = entry

    global mature
    mature = dict()
    if arguments['-a'] is not None:
        for entry in SeqIO.parse(open(arguments['-a'], "r"), "fasta"):
            mature[entry.id] = entry

    inputFile = arguments['-f']
    dedup = dict()
    if arguments['-d']:
        subprocess.call([
            vsearch, "-derep_fulllength", arguments['-f'], "-output",
            "temp_dedup.fa", "-uc", "temp.uc", "-notrunclabels"
        ])

        inputFile = "temp_dedup.fa"

        #process the uc file
        with open("temp.uc", "r") as handle:
            uc = csv.reader(handle, delimiter="\t")
            for row in uc:
                if row[0] == "S":
                    dedup[row[8].split(" ")[0]] = row[8].split(" ")[0]
                elif row[0] == "H":
                    dedup[row[8].split(" ")[0]] = row[9].split(" ")[0]

    results = dict()
    #If we are multithreading, split input into chunks
    if arguments['-t'] > 1:
        index = 0
        counter = 0
        chunk = []
        reader = SeqIO.parse(open(inputFile, "r"), "fasta")
        for entry in reader:
            chunk.append(entry)
            counter += 1
            if counter == 1000:
                with open("%s/align/align%06d.fa" % (prj_tree.lineage, index),
                          "w") as handle:
                    SeqIO.write(chunk, handle, "fasta")
                index += 1
                counter = 0
                chunk = []
        if counter > 0:
            with open("%s/align/align%06d.fa" % (prj_tree.lineage, index),
                      "w") as handle:
                SeqIO.write(chunk, handle, "fasta")
            index += 1  #so we can use range properly

        #now create a pool and start the actual work
        filterPool = Pool(arguments['-t'])
        dataBlob = filterPool.map(runAlign, [
            "%s/align/align%06d.fa" % (prj_tree.lineage, i)
            for i in range(index)
        ])
        filterPool.close()
        filterPool.join()

        #Recover results
        for blob in dataBlob:
            results.update(blob)

    else:
        #unthreaded, just do the whole thing
        results = runAlign(inputFile)

    #get some outputs set up
    outFile = os.path.basename(os.path.splitext(arguments['-f'])[0])
    if os.path.isdir(prj_tree.tables):
        outFile = "output/tables/" + outFile
    if arguments['-o'] is not None:
        outFile = arguments['-o']

    nats = sorted(mature.keys())

    covFile = open("%s_coverage.tab" % outFile, "w")
    coverage = csv.writer(covFile, delimiter="\t")
    coverage.writerow(['sequence_id', 'germ_cov'] + nats)

    idFile = open("%s_id-div.tab" % outFile, "w")
    iddiv = csv.writer(idFile, delimiter="\t")
    iddiv.writerow(['sequence_id', 'v_gene', 'germ_div'] + nats)

    #sort the freaking list and output
    if arguments['-d']:
        for s in sorted(dedup.keys()):
            (germc, germi) = results[dedup[s]]['germline']
            if not germc == "NA":
                germc = "%.1f" % germc
                germi = "%.1f" % (100 - germi)
            coverage.writerow([s, germc] + [
                "NA" if results[dedup[s]][n][0] == "NA" else "%.1f" %
                results[dedup[s]][n][0] for n in nats
            ])
            iddiv.writerow([s, results[dedup[s]]['vlookup'], germi] + [
                "NA" if results[dedup[s]][n][1] == "NA" else "%.1f" %
                results[dedup[s]][n][1] for n in nats
            ])

        #take this opportunity to do some cleanup
        os.remove("temp_dedup.fa")
        os.remove("temp.uc")
    else:
        for s in sorted(results.keys()):
            (germc, germi) = results[s]['germline']
            if not germc == "NA":
                germc = "%.1f" % germc
                germi = "%.1f" % (100 - germi)
            coverage.writerow([s, germc] + [
                "NA" if results[s][n][0] == "NA" else "%.1f" % results[s][n][0]
                for n in nats
            ])
            iddiv.writerow([s, results[s]['vlookup'], germi] + [
                "NA" if results[s][n][1] == "NA" else "%.1f" % results[s][n][1]
                for n in nats
            ])

    covFile.close()
    idFile.close()

    #do AIRR output
    if os.path.dirname(
            arguments['-f']
    ) == "output/sequences/nucleotide" and not 'CDR3' in arguments['-f']:
        if os.path.isfile("%s/%s_rearrangements.tsv" %
                          (prj_tree.tables, prj_name)):
            withDiv = airr.derive_rearrangement("updateRearrangements.tsv",
                                                "%s/%s_rearrangements.tsv" %
                                                (prj_tree.tables, prj_name),
                                                fields=['v_identity'])
            for r in airr.read_rearrangement("%s/%s_rearrangements.tsv" %
                                             (prj_tree.tables, prj_name)):
                if dedup.get(r['sequence_id'], r['sequence_id']) in results:
                    # omit NAs here to comply with AIRR format
                    if not results[dedup.get(
                            r['sequence_id'],
                            r['sequence_id'])]['germline'][1] == "NA":
                        r['v_identity'] = "%0.3f" % (results[dedup.get(
                            r['sequence_id'], r['sequence_id'])]['germline'][1]
                                                     / 100)
                withDiv.write(r)
            withDiv.close()
            os.rename("updateRearrangements.tsv",
                      "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name))
Example #4
def main():
	cells_raw = defaultdict( dict )
	most_used = defaultdict( list )
	output = open("%s/%s_cell_stats.tsv"%(prj_tree.tables,prj_name), 'w')
	output.write("cell\tstatus\tisotype\tproductive_IGH\ttotal_IGH\tIGH_junctions\tproductive_IGK\ttotal_IGK\tIGK_junctions\tproductive_IGL\ttotal_IGL\tIGL_junctions\n")
	data = airr.read_rearrangement(arguments['--rearrangements'])
	cells_only = airr.derive_rearrangement(re.sub(".tsv", "_single-cell.tsv", arguments['--rearrangements']), arguments['--rearrangements'])

	#assume cells might not be grouped together, so make a first pass
	#    to collect everything
	for r in data:
		if r['status'] in ['good', 'indel', 'stop', 'nonproductive', 'unique']: #skip irrelevant sequences
			if r['locus'] not in cells_raw[r['cell_id']]:
				cells_raw[r['cell_id']][r['locus']] = [ r ]
			else:
				cells_raw[r['cell_id']][r['locus']].append( r )
			#need better heuristic for this, omit for now
			#if r['cell_id'] not in most_used[r['centroid']]:
			#	most_used[r['centroid']].append(r['cell_id'])

	#now go back and process each cell
	status_list  = [ 'canonical_pair', 'possible_inclusion', 'heavy_only', 'light_only', 'multi_light', 'multi_heavy', 'probable_multiplet', 'none_productive' ]
	status_count = dict( zip( status_list, [0,0,0,0,0,0,0,0] )  )
	status_dict  = dict( )

	for c in cells_raw:
		cell_processed	= defaultdict( list )
		cell_productive = defaultdict( list )
		for locus in cells_raw[c]:
			#Start with the one with the most UMIs
			for rep in sorted( [ r for r in cells_raw[c][locus] ], key=lambda k: k['duplicate_count'], reverse=True ):
				#check if this is a duplicate of a previously kept read
				keep = True
				for previous in cell_processed[locus]:
					#shortcut: assume identical junctions means duplicates
					if previous['junction_aa'] == rep['junction_aa']:
						keep = False
						break
					#heuristic (for 10x data as of March 2019):  omit gaps and cut off possible noise at 5' end
					else:
						cov, score = scoreAlign( quickAlign(previous['sequence_alignment'],rep['sequence_alignment']), countInternalGaps=False, skip=50 )
						if score >= 0.95:
							keep = False
							break
					
				if keep:
					cell_processed[locus].append( rep )
					if rep['status'] == "good": cell_productive[locus].append( rep )

		status = ""
		h_type = ""
		if len(cell_productive['IGH']) == 0:
			if len(cell_productive['IGK']) + len(cell_productive['IGL']) == 0:
				status = "none_productive"
			elif len(cell_productive['IGK']) + len(cell_productive['IGL']) == 1:
				status = "light_only"
			elif len(cell_productive['IGK']) + len(cell_productive['IGL']) > 1:
				status = "multi_light"
		elif len(cell_productive['IGH']) == 1:
			h_type = re.sub("\*.+", "", cell_productive['IGH'][0]['c_call'])
			if len(cell_productive['IGK']) + len(cell_productive['IGL']) == 0:
				status = "heavy_only"
			elif len(cell_productive['IGK']) + len(cell_productive['IGL']) == 1:
				status = "canonical_pair"
			elif len(cell_productive['IGK']) + len(cell_productive['IGL']) == 2:
				status = "possible_inclusion"
			elif len(cell_productive['IGK']) + len(cell_productive['IGL']) > 2:
				status = "probable_multiplet"
		elif len(cell_productive['IGH']) > 1:
				status = "multi_heavy"

		status_count[status] += 1
		status_dict[c] = status

		#print to filtered rearrangements file
		if status in arguments['--save']:
			for loc in cell_processed:
				for chain in cell_processed[loc]:
					cells_only.write( chain )

		#now log the cell
		print( "\t".join( [c, status, h_type,
				   str(len(cell_productive['IGH'])), str(len(cell_processed['IGH'])), ";".join([chain['junction_aa'] for chain in cell_processed['IGH']]),
				   str(len(cell_productive['IGK'])), str(len(cell_processed['IGK'])), ";".join([chain['junction_aa'] for chain in cell_processed['IGK']]),
				   str(len(cell_productive['IGL'])), str(len(cell_processed['IGL'])), ";".join([chain['junction_aa'] for chain in cell_processed['IGL']]) ] ),
		       file=output)


	output.close()

	with open("%s/cell_processing.log"%prj_tree.logs, "w") as log:
		print("\t".join(status_list), file=log)
		print("\t".join([str(status_count[s]) for s in status_list]), file=log)
	
	print("\t".join(status_list))
	print("\t".join([str(status_count[s]) for s in status_list]))
Example #5
    parser.add_argument('rearrangement_file',
                        type=str,
                        help='Rearrangement AIRR TSV file name')
    args = parser.parse_args()

    if args:
        # connection header
        config = getConfig()
        header = 'var conn = new Mongo();\n'
        header += 'var db = conn.getDB("admin");\n'
        header += 'db.auth("' + config['service_user'] + '", "' + config[
            'service_secret'] + '");\n'
        header += 'db = db.getSiblingDB("' + config['db'] + '");\n'

        print("Reading file: " + args.rearrangement_file)
        reader = airr.read_rearrangement(args.rearrangement_file)

        os.system("mkdir /work_data/tmp")
        fnum = 0
        fname = '/work_data/tmp/rearrangement' + str(fnum) + '.js'
        print('Creating file: ' + fname)
        fout = open(fname, 'w')
        fout.write(header)

        # delete any existing records
        fout.write('db.rearrangement.deleteMany({"repertoire_id":"' +
                   args.repertoire_id + '"});\n')

        seqCount = 0
        for row in reader:
            if row.get('repertoire_id') is None:
Example #6
import airr
import numpy as np

# We have 4 T cell subsets
subsets = {
    'CL_0000895': [0 for number in range(0, 50)],
    'CL_0000900': [0 for number in range(0, 50)],
    'CL_0000897': [0 for number in range(0, 50)],
    'CL_0000909': [0 for number in range(0, 50)]
}

# Load the repertoire metadata
data = airr.load_repertoire('repertoires.airr.json')
repertoires = {obj['repertoire_id']: obj for obj in data['Repertoire']}

# Iterate through the rearrangement data and tabulate the counts
reader = airr.read_rearrangement('rearrangements.tsv')
for row in reader:
    # get the appropriate repertoire
    rep = repertoires[row['repertoire_id']]
    # use the cell_subset field in the repertoire
    c = subsets[rep['sample'][0]['cell_subset']['id']]
    # increment the length count
    if row['junction_aa_length']:
        if int(row['junction_aa_length']) >= 50:
            continue
        #print(int(row['junction_aa_length']))
        c[int(row['junction_aa_length'])] += 1

# normalize the counts so the histograms are comparable
for subset in subsets:
    total = sum(subsets[subset])
    if total > 0:
        subsets[subset] = [cnt / total for cnt in subsets[subset]]
Example #7
def read_airr(path: Union[str, Sequence[str], Path, Sequence[Path]]) -> AnnData:
    """\
    Read AIRR-compliant data.

    Reads data organized in the `AIRR rearrangement schema <https://docs.airr-community.org/en/latest/datarep/rearrangements.html>`_.

    The following columns are required:
     * `cell_id`
     * `productive`
     * `locus`
     * `consensus_count`
     * at least one of `junction_aa` or `junction`.


    {doc_working_model}

    Parameters
    ----------
    path
        Path to the AIRR rearrangement tsv file. If different
        chains are split up into multiple files, these can be specified
        as a List, e.g. `["path/to/tcr_alpha.tsv", "path/to/tcr_beta.tsv"]`.

    Returns
    -------
    AnnData object with IR data in `obs` for each cell. For more details see
    :ref:`data-structure`.
    """
    ir_objs = {}

    if isinstance(path, str) or isinstance(path, Path):
        path = [path]

    for tmp_path in path:
        tmp_path = str(tmp_path)
        reader = airr.read_rearrangement(tmp_path)
        for row in reader:
            cell_id = row["cell_id"]
            try:
                tmp_cell = ir_objs[cell_id]
            except KeyError:
                tmp_cell = IrCell(cell_id=cell_id)
                ir_objs[cell_id] = tmp_cell

            try:
                # this is not an official field
                expr = row["umi_count"]
                expr_raw = row["consensus_count"]
            except KeyError:
                expr = row["consensus_count"]
                expr_raw = None

            tmp_cell.add_chain(
                IrChain(
                    is_productive=row["productive"],
                    locus=row["locus"],
                    v_gene=row["v_call"] if "v_call" in row else None,
                    d_gene=row["d_call"] if "d_call" in row else None,
                    j_gene=row["j_call"] if "j_call" in row else None,
                    c_gene=row["c_call"] if "c_call" in row else None,
                    cdr3=row["junction_aa"] if "junction_aa" in row else None,
                    cdr3_nt=row["junction"] if "junction" in row else None,
                    expr=expr,
                    expr_raw=expr_raw,
                )
            )

    return from_ir_objs(ir_objs.values())
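# Usage sketch (assumes the scirpy-style helpers IrCell, IrChain and
# from_ir_objs referenced above are importable; file names are hypothetical):
adata = read_airr(["path/to/tcr_alpha.tsv", "path/to/tcr_beta.tsv"])
print(adata.obs.head())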
Example #8
def main():

    #look for cell hashing
    hashDict = dict()
    sampleList = []
    if os.path.exists(f"{prj_tree.tables}/{prj_name}_hashes.tsv"):
        with open(f"{prj_tree.tables}/{prj_name}_hashes.tsv", 'r') as handle:
            reader = csv.reader(handle, delimiter="\t")
            for row in reader:
                hashDict[row[0]] = [row[1]]
                if row[1] == "unknown" or row[1] == "ambiguous":
                    continue
                elif not row[1] in sampleList:
                    sampleList.append(row[1])
    sampleList.sort()
    sampleList += ["unknown", "ambiguous"]

    #look for feature barcoding
    featureDict = dict()
    if os.path.exists(f"{prj_tree.tables}/{prj_name}_features.tsv"):
        with open(f"{prj_tree.tables}/{prj_name}_features.tsv", 'r') as handle:
            reader = csv.reader(handle, delimiter="\t")
            header = next(reader)
            featureDict["keys"] = header[1:]
            for row in reader:
                featureDict[row[0]] = row[1:]

    cells_raw = defaultdict(dict)
    most_used = defaultdict(list)

    output = open("%s/%s_cell_stats.tsv" % (prj_tree.tables, prj_name), 'w')
    outwriter = csv.writer(output, delimiter="\t")
    outheader = ["cell", "status", "isotype"]
    if len(hashDict) > 0:
        outheader += ["hash_sample"]
    if len(featureDict) > 0:
        outheader += featureDict["keys"]
    outheader += [
        "productive_IGH", "total_IGH", "IGH_junctions", "productive_IGK",
        "total_IGK", "IGK_junctions", "productive_IGL", "total_IGL",
        "IGL_junctions"
    ]
    outwriter.writerow(outheader)

    data = airr.read_rearrangement(arguments['--rearrangements'])
    fields = ["cell_status"]
    if len(hashDict) > 0:
        fields += ["hash_sample"]
    cells_only = airr.derive_rearrangement(re.sub(
        ".tsv", "_single-cell.tsv", arguments['--rearrangements']),
                                           arguments['--rearrangements'],
                                           fields=fields)

    #assume cells might not be grouped together, so make a first pass
    #    to collect everything
    for r in data:
        if r['status'] in ['good', 'indel', 'stop', 'nonproductive',
                           'unique']:  #skip irrelevant sequences
            if r['locus'] not in cells_raw[r['cell_id']]:
                cells_raw[r['cell_id']][r['locus']] = [r]
            else:
                cells_raw[r['cell_id']][r['locus']].append(r)
            #need better heuristic for this, omit for now
            #if r['cell_id'] not in most_used[r['centroid']]:
            #	most_used[r['centroid']].append(r['cell_id'])

    #now go back and process each cell
    status_list = [
        'canonical_pair', 'possible_inclusion', 'heavy_only', 'light_only',
        'multi_light', 'multi_heavy', 'probable_multiplet', 'none_productive'
    ]
    status_count = dict()
    for sample in sampleList:
        status_count[sample] = dict(zip(status_list, [0, 0, 0, 0, 0, 0, 0, 0]))
    status_dict = dict()

    for c in cells_raw:
        cell_processed = defaultdict(list)
        cell_productive = defaultdict(list)
        for locus in cells_raw[c]:
            #Start with the one with the most UMIs
            for rep in sorted([r for r in cells_raw[c][locus]],
                              key=lambda k: k['duplicate_count'] or 0,
                              reverse=True):
                #check if this is a duplicate of a previously kept read
                keep = True
                for previous in cell_processed[locus]:
                    #shortcut: assume identical junctions means duplicates
                    if previous['junction_aa'] == rep['junction_aa']:
                        keep = False
                        if previous['duplicate_count'] is not None:
                            previous['duplicate_count'] += rep[
                                'duplicate_count']
                        if previous['consensus_count'] is not None:
                            previous['consensus_count'] += rep[
                                'consensus_count']
                        break
                    #heuristic (for 10x data as of March 2019):  omit gaps and cut off possible noise at 5' end
                    else:
                        score, cov = scoreAlign(quickAlign(
                            previous['sequence_alignment'],
                            rep['sequence_alignment']),
                                                countInternalGaps=False,
                                                skip=50)
                        if score >= 0.95:
                            keep = False
                            if previous['duplicate_count'] is not None:
                                previous['duplicate_count'] += rep[
                                    'duplicate_count']
                            if previous['consensus_count'] is not None:
                                previous['consensus_count'] += rep[
                                    'consensus_count']
                            break

                if keep:
                    cell_processed[locus].append(rep)
                    if rep['status'] == "good":
                        cell_productive[locus].append(rep)

        status = ""
        h_type = ""
        if len(cell_processed['IGH']) > 2 or len(
                cell_processed['IGK']) > 2 or len(cell_processed['IGL']) > 2:
            status = "probable_multiplet"
        elif len(cell_productive['IGH']) == 0:
            if len(cell_productive['IGK']) + len(cell_productive['IGL']) == 0:
                status = "none_productive"
            elif len(cell_productive['IGK']) + len(
                    cell_productive['IGL']) == 1:
                status = "light_only"
            elif len(cell_productive['IGK']) + len(cell_productive['IGL']) > 1:
                status = "multi_light"
        elif len(cell_productive['IGH']) == 1:
            h_type = re.sub("\*.+", "", cell_productive['IGH'][0]['c_call'])
            if len(cell_productive['IGK']) + len(cell_productive['IGL']) == 0:
                status = "heavy_only"
            elif len(cell_productive['IGK']) + len(
                    cell_productive['IGL']) == 1:
                status = "canonical_pair"
            elif len(cell_productive['IGK']) + len(
                    cell_productive['IGL']) == 2:
                status = "possible_inclusion"
            elif len(cell_productive['IGK']) + len(cell_productive['IGL']) > 2:
                status = "probable_multiplet"
        elif len(cell_productive['IGH']) > 1:
            status = "multi_heavy"

        status_count[hashDict.get(c, ["unknown"])[0]][status] += 1
        status_dict[c] = status

        #print to filtered rearrangements file
        #leave out cells with ambiguous hashing assignments if we are doing any filtering
        if status in arguments['--save']:
            if hashDict.get(
                    c, ["unknown"]
            )[0] != "ambiguous" or "probable_multiplet" in arguments['--save']:
                for loc in cell_processed:
                    for chain in cell_processed[loc]:
                        chain['cell_status'] = status
                        if len(hashDict) > 0:
                            chain['hash_sample'] = hashDict.get(
                                chain['cell_id'], ['unknown'])[0]
                        cells_only.write(chain)

        #now log the cell
        outwriter.writerow(
            [c, status, h_type] + hashDict.get(c, ['unknown'] *
                                               (len(hashDict) > 0)) +
            featureDict.get(c, ['0'] * len(featureDict.get("keys", []))) + [
                len(cell_productive['IGH']),
                len(cell_processed['IGH']), ";".join(
                    [chain['junction_aa'] for chain in cell_processed['IGH']]),
                len(cell_productive['IGK']),
                len(cell_processed['IGK']), ";".join(
                    [chain['junction_aa'] for chain in cell_processed['IGK']]),
                len(cell_productive['IGL']),
                len(cell_processed['IGL']), ";".join(
                    [chain['junction_aa'] for chain in cell_processed['IGL']])
            ])

    output.close()

    with open("%s/cell_processing.log" % prj_tree.logs, "w") as log:
        print("sample\t" + "\t".join(status_list), file=log)
        print("sample\t" + "\t".join(status_list))
        for sample in sampleList:
            if sum([status_count[sample][s] for s in status_list]) == 0:
                continue  #leave out `ambiguous` if it's not a hashed sample
            print(
                "\t".join([sample] +
                          [str(status_count[sample][s]) for s in status_list]),
                file=log)
            print(
                "\t".join([sample] +
                          [str(status_count[sample][s]) for s in status_list]))
Example #9
def main():

    #start by reading in the GSSP
    gssp = GSSP(arguments['--gssp'])
    gssp.computeRarity()

    #now get germline genes
    germDB = load_fastas(arguments['--germ'])

    rareSubs = dict()

    if arguments['-r'] is not None:
        for seq in airr.read_rearrangement(arguments['-r']):
            gl = re.sub("\*.*", "", seq['v_call'])
            if checkGermSeq(gl, germDB) and checkGSSP(gl, gssp.rarity):
                rareSubs[seq['sequence_id']] = score(seq['sequence_alignment'],
                                                     germDB[gl + "*01"],
                                                     gssp.rarity[gl])
    elif arguments['-f'] is not None:

        #if there's a global V gene, check it
        if arguments['-v'] is not None:
            if not checkGermSeq(arguments['-v'], germDB) or not checkGSSP(
                    arguments['-v'], gssp.rarity):
                sys.exit(1)

        #set up in case it's a prealigned file
        alignedV = None

        for seq in generate_read_fasta(arguments['-f']):

            #if aligned, then first seq is germline
            if arguments['-a'] and alignedV is None:
                alignedV = seq.seq
                if arguments['-n']:
                    alignedV = alignedV.translate(table=GAPPED_CODON_TABLE)
                alignedV = str(alignedV)
                continue

            #score all other sequences
            if arguments['-v'] is not None:
                if arguments['-a']:
                    rareSubs[seq.id] = score(str(seq.seq), alignedV,
                                             gssp.rarity[arguments['-v']])
                else:
                    rareSubs[seq.id] = score(str(seq.seq),
                                             germDB[arguments['-v'] + "*01"],
                                             gssp.rarity[arguments['-v']])
            else:
                gl = re.search("(v_call|V_gene)=([^\*\s]+)", seq.description)
                if gl:
                    if checkGermSeq(gl.group(2), germDB) and checkGSSP(
                            gl.group(2), gssp.rarity):
                        rareSubs[seq.id] = score(str(seq.seq),
                                                 germDB[gl.group(2) + "*01"],
                                                 gssp.rarity[gl.group(2)])
                else:
                    print(
                        "Could not find V gene annotation for %s, skipping..."
                        % seq.id,
                        file=sys.stderr)
                    continue
    else:
        if checkGermSeq(arguments['-v'], germDB) and checkGSSP(
                arguments['-v'], gssp.rarity):
            for sequence in arguments['QVQLVQ']:
                rareSubs[sequence] = score(sequence,
                                           germDB[arguments['-v'] + "*01"],
                                           gssp.rarity[arguments['-v']])
        else:
            sys.exit(1)

    #now do output
    count = 0
    if arguments['--lineage']:
        reverse_dict = defaultdict(list)
        for seq in rareSubs:
            for sub in rareSubs[seq]:
                reverse_dict[sub].append(seq)
        for sub in sorted(reverse_dict.keys(),
                          key=lambda x: int(re.search("(\d+)", x).group(1))):
            if 100 * len(reverse_dict[sub]) / len(
                    rareSubs) >= arguments['--threshold']:
                print(sub)
                count += 1
    else:
        for seq in rareSubs:
            if len(rareSubs[seq]) > 0:
                print(seq + ": " + ",".join(rareSubs[seq]))
                count += 1

    if count == 0:
        print("No rare substitutions were found")
Example #10
#!/usr/bin/env python3

# imports
import airr

# read a rearrangements file
print('*****')
print('*****')
print('Read a rearrangements file.')
print('*****')
print('*****')
data = airr.read_rearrangement('toy_data.tsv')
print(data.fields)
print(data.external_fields)
for r in data:
    print(r)

# Create a new rearrangements file with an intermediate parser
# Technically, the parser tool should be reading the VDJ rearrangements
# output file, parsing it, then writing the row data.
print('*****')
print('*****')
print('Create new rearrangements file.')
print('*****')
print('*****')
data = airr.read_rearrangement('toy_data.tsv')
newd = airr.create_rearrangement('my_data.tsv', fields=data.fields)
print(newd.fields)
print(newd.external_fields)
for r in data:
    newd.write(r)
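# The derive_rearrangement pattern that several examples on this page use to
# add annotation fields to an existing rearrangements file (the output file
# name and the 'clone_id' values here are hypothetical):
data = airr.read_rearrangement('toy_data.tsv')
derived = airr.derive_rearrangement('derived_data.tsv', 'toy_data.tsv',
                                    fields=['clone_id'])
for r in data:
    r['clone_id'] = ''   # fill in real clone assignments here
    derived.write(r)
derived.close()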
Example #11
def main():

    #first, open the input file and parse into groups with same V/J
    vj_partition = dict()
    cdr3_info = dict()
    seqSize = Counter()

    #start off by getting size annotations
    for read in generate_read_fasta(arguments['--full']):
        seqSize[read.id] = 1
        check = re.search("cluster_count=(\d+)", read.description)
        if check:
            seqSize[read.id] = int(check.group(1))

    gene_pat = re.compile(
        "(?:v_call|V_gene)=IG([HKL]V[^*]+).*(?:j_call|J_gene)=IG([HKL]J\d)")
    for sequence in SeqIO.parse(open(arguments['--cdr3'], "r"), "fasta"):
        genes = re.search(gene_pat, sequence.description)
        if genes:
            key = genes.group(1) + "_" + genes.group(2)
            key = re.sub(
                "[()/]", "",
                key)  #so /OR or (II) genes don't screw up the file system
            if key not in vj_partition:
                temp = "%s/%s.fa" % (prj_tree.lineage, key)
                vj_partition[key] = {
                    'group': key,
                    'handle': open(temp, "w"),
                    'file': temp,
                    'count': 0,
                    'ids': []
                }

            vj_partition[key]['count'] += 1
            vj_partition[key]['ids'].append(sequence.id)
            cdr3_info[sequence.id] = {
                'cdr3_len': int(len(sequence.seq) / 3),
                'cdr3_seq': sequence.seq.translate()
            }

            #make sizes available to vsearch
            sequence.id += ";size=%d" % seqSize[
                sequence.id]  #do this even if there's no label
            #so I don't need to divide the cases for vsearch
            #and write
            SeqIO.write([sequence], vj_partition[key]['handle'], 'fasta')
        else:
            print("Couldn't find V and J genes for %s %s, skipping..." %
                  (sequence.id, sequence.description))

    global natives
    natives = dict()
    if arguments['--natives'] is not None:
        natives = load_fastas(arguments['--natives'])
        for n, s in natives.items():
            if arguments['-v'] is not None:
                key = arguments['-v'] + "_" + arguments['-j']
            else:
                genes = re.search(gene_pat, s.description)
                if genes:
                    key = genes.group(1) + "_" + genes.group(2)
                else:
                    sys.exit(
                        "Can't find V and J gene annotations for native sequence %s. Please specify using the -v and -j parameters."
                        % n)

            key = re.sub(
                "[()/]", "", key
            )  #wouldn't expect this to be relevant for natives, but just in case...

            if key not in vj_partition:
                print(
                    "No NGS sequences with the same V/J genes as native sequence %s (%s); skipping..."
                    % (n, key))
                continue

            seqSize[n] = 0
            s.id += ";size=1"
            vj_partition[key]['count'] += 1
            vj_partition[key]['ids'].append(n)
            cdr3_info[n] = {
                'cdr3_len': int(len(s.seq) / 3),
                'cdr3_seq': s.seq.translate()
            }
            SeqIO.write([s], vj_partition[key]['handle'], 'fasta')

    #close the file handles and delete the reference, so dict can be pickled for multithreading
    for cluster in vj_partition:
        vj_partition[cluster]['handle'].close()
        del vj_partition[cluster]['handle']

    #now go through and cluster each V/J grouping
    clusterLookup = dict()
    centroidData = dict()
    clusterSizes = Counter()
    if arguments['-t'] > 1:
        pool = Pool(arguments['-t'])
        blob = pool.map(processClusters,
                        iterator_slice(
                            vj_partition.values(),
                            25))  #number per slice needs optimization
        pool.close()
        pool.join()

        for d in blob:
            clusterLookup.update(d['cl'])
            centroidData.update(d['cd'])
            clusterSizes.update(d['cs'])
    else:
        #don't thread
        d = processClusters((0, vj_partition.values()))
        clusterLookup.update(d['cl'])
        centroidData.update(d['cd'])
        clusterSizes.update(d['cs'])

    #now process all clusters and do tabular output
    with open("%s/%s_lineages.txt" % (prj_tree.tables, prj_name),
              "w") as handle:
        writer = csv.writer(handle, delimiter=sep)
        writer.writerow([
            "clone_id", "sequence_id", "v_call", "j_call",
            "junction_length_aa", "junction_aa", "clone_count", "included_mAbs"
        ])
        for rank, (centroid, size) in enumerate(clusterSizes.most_common()):
            centroidData[centroid]['rank'] = rank + 1
            writer.writerow([
                "%05d" % (rank + 1), centroid, centroidData[centroid]['vgene'],
                centroidData[centroid]['jgene'],
                cdr3_info[centroid]['cdr3_len'],
                cdr3_info[centroid]['cdr3_seq'], size,
                ",".join(centroidData[centroid]['nats'])
            ])

    #do sequence output
    notationFile = re.sub("\.f.+", "_lineageNotations.fa", arguments['--full'])
    repFile = re.sub("\.f.+", "_lineageRepresentatives.fa",
                     arguments['--full'])

    rep_seqs = []
    with open(notationFile, "w") as handle:
        for read in generate_read_fasta(arguments['--full']):
            if ";" in read.id:
                read.id = read.id[
                    0:8]  #this is for raw VSearch output with size annotations
                #shouldn't be relevant in pipeline context
            if read.id not in clusterLookup: continue
            read.description += " clone_id=%05d clone_rep=%s clone_count=%d" % (
                centroidData[clusterLookup[read.id]]['rank'],
                clusterLookup[read.id], clusterSizes[clusterLookup[read.id]])
            SeqIO.write([read], handle, "fasta")
            if read.id in centroidData:
                rep_seqs.append(read)

    with open(repFile, "w") as handle:
        #use a sort to put them out in order of lineage rank (ie size)
        SeqIO.write(
            sorted(rep_seqs, key=lambda cent: centroidData[cent.id]['rank']),
            handle, "fasta")

    #do AIRR output
    if os.path.dirname(arguments['--full']) == prj_tree.nt:
        if os.path.isfile("%s/%s_rearrangements.tsv" %
                          (prj_tree.tables, prj_name)):
            withLin = airr.derive_rearrangement(
                "updateRearrangements.tsv",
                "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name),
                fields=["clone_id", "clone_count"])
            for r in airr.read_rearrangement("%s/%s_rearrangements.tsv" %
                                             (prj_tree.tables, prj_name)):
                if r['sequence_id'] in clusterLookup:
                    r['clone_id'] = "%05d" % centroidData[clusterLookup[
                        r['sequence_id']]]['rank']
                    r['clone_count'] = clusterSizes[clusterLookup[
                        r['sequence_id']]]
                else:
                    #prevent mix-and-match data if this gets run multiple times with multiple settings
                    r['clone_id'] = ""
                    r['clone_count'] = ""

                withLin.write(r)
            withLin.close()
            os.rename("updateRearrangements.tsv",
                      "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name))
Example #12
def main():

    #start by making possible "duplicate_count" info available to vsearch
    with open("temp.fa", "w") as handle:
        SeqIO.write(reformatInput(arguments['-f']), handle, "fasta")

    #first step on higher identity
    subprocess.call([
        vsearch, "-derep_fulllength", "temp.fa", "-output", "temp_dedup.fa",
        "-uc", "temp.uc", "-sizein", "-sizeout", "-minuniquesize",
        arguments['--min1']
    ])

    #process the uc file
    centroid = dict()
    with open("temp.uc", "r") as handle:
        uc = csv.reader(handle, delimiter="\t")
        for row in uc:
            if row[0] == "H":
                centroid[row[8]] = row[9]

    #second clustering step
    subprocess.call([
        vsearch, "-cluster_size", "temp_dedup.fa", "-sizein", "-sizeout",
        "-maxgaps", "0", "-id", arguments['--id'], "-uc",
        "%s.cluster" % os.path.splitext(arguments['-f'])[0]
    ])

    #process the uc file
    size = dict()
    with open("%s.cluster" % os.path.splitext(arguments['-f'])[0],
              "r") as handle:
        uc = csv.reader(handle, delimiter="\t")
        for row in uc:
            if row[0] == "H":
                centroid[re.sub(";size=\d+", "",
                                row[8])] = re.sub(";size=\d+", "", row[9])
            elif row[0] == "C" and int(row[2]) >= arguments['--min2']:
                size[re.sub(";size=\d+", "", row[8])] = int(row[2])

    #clean up
    os.remove("temp.fa")
    os.remove("temp_dedup.fa")
    os.remove("temp.uc")

    #do sequence outputs
    with open("%s_unique.fa" % os.path.splitext(arguments['-f'])[0],
              "w") as handle:
        SeqIO.write(getUniques(arguments['-f'], size), handle, 'fasta')

    #retrieve unique CDR3s (and do AA seqs as appropriate)
    if "goodVJ" in arguments['-f']:
        cdr3_file = re.sub("goodVJ", "goodCDR3", arguments['-f'])
        if os.path.isfile(cdr3_file):
            with open("%s_unique.fa" % os.path.splitext(cdr3_file)[0],
                      "w") as handle:
                SeqIO.write(getUniques(cdr3_file, size), handle, 'fasta')
        else:
            print("Can't find %s to extract unique sequences..." % cdr3_file,
                  file=sys.stderr)

        if "nucleotide" in cdr3_file:
            cdr3_file = re.sub("nucleotide", "amino_acid", cdr3_file)
            if os.path.isfile(cdr3_file):
                with open("%s_unique.fa" % os.path.splitext(cdr3_file)[0],
                          "w") as handle:
                    SeqIO.write(getUniques(cdr3_file, size), handle, 'fasta')
            else:
                print("Can't find %s to extract unique sequences..." %
                      cdr3_file,
                      file=sys.stderr)
    if "nucleotide" in arguments['-f']:
        aa_file = re.sub("nucleotide", "amino_acid", arguments['-f'])
        if os.path.isfile(aa_file):
            with open("%s_unique.fa" % os.path.splitext(aa_file)[0],
                      "w") as handle:
                SeqIO.write(getUniques(aa_file, size), handle, 'fasta')
        else:
            print("Can't find %s to extract unique sequences..." % aa_file,
                  file=sys.stderr)

    #now do AIRR output
    if arguments[
            '-f'] == "output/sequences/nucleotide/%s_goodVJ.fa" % prj_name:
        if os.path.isfile("%s/%s_rearrangements.tsv" %
                          (prj_tree.tables, prj_name)):
            clustered = airr.derive_rearrangement(
                "updateRearrangements.tsv",
                "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name),
                fields=['centroid', 'cluster_count'])
            for r in airr.read_rearrangement("%s/%s_rearrangements.tsv" %
                                             (prj_tree.tables, prj_name)):
                r['centroid'] = centroid.get(
                    centroid.get(r['sequence_id'], ""),
                    centroid.get(r['sequence_id'], ""))
                if r['sequence_id'] in size:
                    r['cluster_count'] = size[r['sequence_id']]
                    r['status'] = "unique"
                elif r['sequence_id'] in centroid:
                    #prevent mix-and-match data if this gets run multiple times with multiple settings
                    #I can get away with this because rearrangements.tsv only gets edited when clustering
                    #   the goodVJ file
                    r['cluster_count'] = ""
                    r['status'] = "good"
                clustered.write(r)
            clustered.close()
            os.rename("updateRearrangements.tsv",
                      "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name))
        else:
            print(
                "Can't find the rearrangements file, not saving data in AIRR format",
                file=sys.stderr)
Example #13
import airr
import sys

for line in airr.read_rearrangement(sys.argv[1], validate=True):
    continue
Example #14
def main():

	if not glob.glob("%s/%s_*.fasta" % (prj_tree.jgene, prj_name)):
		sys.exit("No jBlast output found!\n")

	maxFiles = len( glob.glob("%s/%s_*.fasta" % (prj_tree.vgene, prj_name)) )

	print( "curating junction and 3' end..." )

	if arguments['--cluster']:
		command = "NUM=`printf \"%s\" $SGE_TASK_ID`\n%s/annotate/parse_blast.py --jmotif '%s' --nterm %s --chunk $NUM\n" % \
					( "%03d", SCRIPT_FOLDER, arguments['--jmotif'], arguments['--nterm'] )
		if arguments['--noFallBack']: command = command.rstrip("\n") + " --noFallBack\n"
		pbs = open("%s/parse.sh"%prj_tree.jgene, 'w')
		pbs.write( "#!/bin/bash\n#$ -N parse-%s\n#$ -l h_vmem=2G\n#$ -cwd\n#$ -o %s/parse.o$JOB_ID.$SGE_TASK_ID\n#$ -e %s/parse.e$JOB_ID.$SGE_TASK_ID\n\n%s\n" % (prj_name, prj_tree.annotate, prj_tree.annotate, command) )
		pbs.close()
		subprocess.call([qsub, '-sync', 'y', '-t', "1-%d"%maxFiles, "%s/parse.sh"%prj_tree.jgene])

	else: #do it locally
		parse_pool = Pool(arguments['--threads'])
		parse_pool.map(callParser, range(1,maxFiles+1))
		parse_pool.close()
		parse_pool.join()

	#ok, now collect all of the partial outputs and merge them
	print( "collecting information...")

	#open fasta outputs
	allV_aa	     = open ("%s/%s_allV.fa"	 % (prj_tree.aa, prj_name), "w" )
	allV_nt	     = open( "%s/%s_allV.fa"	 % (prj_tree.nt, prj_name), "w" )

	allJ_aa	     = open( "%s/%s_allJ.fa"	 % (prj_tree.aa, prj_name), "w" )
	allJ_nt	     = open( "%s/%s_allJ.fa"	 % (prj_tree.nt, prj_name), "w" )

	vj_aa	     = open( "%s/%s_goodVJ.fa"	 % (prj_tree.aa, prj_name), "w" )
	vj_nt	     = open( "%s/%s_goodVJ.fa"	 % (prj_tree.nt, prj_name), "w" )

	good_cdr3_aa = open( "%s/%s_goodCDR3.fa" % (prj_tree.aa, prj_name), "w" )
	good_cdr3_nt = open( "%s/%s_goodCDR3.fa" % (prj_tree.nt, prj_name), "w" )

	all_cdr3_aa  = open( "%s/%s_allCDR3.fa"	 % (prj_tree.aa, prj_name), "w" )
	all_cdr3_nt  = open( "%s/%s_allCDR3.fa"	 % (prj_tree.nt, prj_name), "w" )


	#also open final rearrangements tsv
	seq_stats = airr.create_rearrangement( "%s/%s_rearrangements.tsv"%(prj_tree.tables, prj_name), fields=['vj_in_frame','stop_codon','locus','c_call','junction_length','source_file','source_id','duplicate_count','length_raw','length_trimmed','indels','status','blast_identity','consensus_count','cell_id'])


	#initiate overall counters
	raw_count, total = 0, 0
	counts = {'good':0,'nonproductive':0,'indel':0,'noCDR3':0,'stop':0,'noV':0,'noJ':0,'missingNterm':0,'chimera':0}

	dict_jcounts = Counter()
	dict_ccounts = Counter()
	dict_dcounts = Counter()

	c = False
	if os.path.isfile("%s/%s_C_001.txt" % (prj_tree.jgene, prj_name)):
		c = True

	d = False
	if os.path.isfile("%s/%s_D_001.txt" % (prj_tree.jgene, prj_name)):
		d = True


	#iterate over subset rearrangement files and combine
	#include generating fasta output as appropriate
	for f_ind in range(1, maxFiles+1):

		#merge partial blast hit tables
		with open( "%s/%s_jgerm_tophit.txt" % (prj_tree.tables, prj_name), "a") as table:
			with open( "%s/jtophit_%03d.txt" % (prj_tree.jgene, f_ind), "r" ) as partial:
				table.write(partial.read())

		if d:
			with open( "%s/%s_dgerm_tophit.txt" % (prj_tree.tables, prj_name), "a") as table:
				with open( "%s/dtophit_%03d.txt" % (prj_tree.jgene, f_ind), "r" ) as partial:
					table.write(partial.read())

		if c:
			with open( "%s/%s_cgerm_tophit.txt" % (prj_tree.tables, prj_name), "a") as table:
				with open( "%s/ctophit_%03d.txt" % (prj_tree.jgene, f_ind), "r" ) as partial:
					table.write(partial.read())

		#go through partial rearrangements files
		for r in airr.read_rearrangement( "%s/rearrangements_%03d.tsv"%(prj_tree.internal, f_ind) ):

			seq_stats.write( r )

			#count j/d/c gene usages
			if not r['j_call'] == "":
				dict_jcounts[ r['j_call'].split(",")[0] ] += 1
			if not r['d_call'] == "":
				dict_dcounts[ r['d_call'].split(",")[0] ] += 1
			if not r['c_call'] == "":
				dict_ccounts[ r['c_call'].split(",")[0] ] += 1

			#count statuses
			counts[ r['status'] ] += 1
			total += 1
			raw_count = int( r['sequence_id'] ) #technically, this undercounts if the last one
												# isn't in the `correct_length` interval, but I
												# don't have a better solution that isn't super
												# kludgy right now

			#ok, now do sequence output
			# start by collecting metadata for fasta def line
			def_line = ">%s" % r['sequence_id']
			if not r['v_call'] == '':          def_line += " v_call=%s"          % r['v_call']
			if not r['d_call'] == '':          def_line += " d_call=%s"          % r['d_call']
			if not r['j_call'] == '':          def_line += " j_call=%s"          % r['j_call']
			if not r['locus']  == '':          def_line += " locus=%s"           % r['locus']
			if not r['c_call'] == '':          def_line += " c_call=%s"          % r['c_call']
			if not r['status'] == '':          def_line += " status=%s"          % r['status']
#			if not r['v_identity'] == '':      def_line += " v_identity=%s"      % r['v_identity']
			if not r['junction_length'] == '': def_line += " junction_length=%s" % r['junction_length']
			if not r['junction'] == '':        def_line += " junction=%s"        % r['junction']
			if not r['junction_aa'] == '':     def_line += " junction_aa=%s"     % r['junction_aa']
			if not r['duplicate_count'] == '': def_line += " duplicate_count=%s" % r['duplicate_count']
			if not r['consensus_count'] == '': def_line += " consensus_count=%s" % r['consensus_count']
			if not r['cell_id'] == '':         def_line += " cell_id=%s"         % r['cell_id']

			#work our way up the hierarchy, putting sequences in the appropriate files
			ungapped = re.sub( "-", "", r['sequence_alignment']) #reintroduces any frameshift errors in translation
																 #  this has always been the behavior, but I wonder
																 #  if I should change/update now that I am using
																 #  proper alignments.

			if not r['status'] in ['noV', 'missingNterm', "chimera"]:
				allV_nt.write( "%s\n%s\n" % (def_line, ungapped) )
				allV_aa.write( "%s\n%s\n" % (def_line, Seq.Seq(ungapped).translate()) )

				if not r['status'] == 'noJ':
					allJ_nt.write( "%s\n%s\n" % (def_line, ungapped) )
					allJ_aa.write( "%s\n%s\n" % (def_line, Seq.Seq(ungapped).translate()) )

					if not r['status'] == 'noCDR3':
						all_cdr3_nt.write( "%s\n%s\n" % (def_line, r['junction']) )
						all_cdr3_aa.write( "%s\n%s\n" % (def_line, r['junction_aa']) )

						if r['status'] == "good":
							vj_nt.write( "%s\n%s\n" % (def_line, ungapped) )
							vj_aa.write( "%s\n%s\n" % (def_line, Seq.Seq(ungapped).translate()) )
							good_cdr3_nt.write( "%s\n%s\n" % (def_line, r['junction']) )
							good_cdr3_aa.write( "%s\n%s\n" % (def_line, r['junction_aa']) )


	#close outputs
	allV_aa.close()
	allV_nt.close()
	allJ_aa.close()
	allJ_nt.close()
	vj_aa.close()
	vj_nt.close()
	good_cdr3_aa.close()
	good_cdr3_nt.close()
	all_cdr3_aa.close()
	all_cdr3_nt.close()

	#useful number
	found = total - counts['noV'] - counts['noJ'] - counts['chimera']

	#print out some statistics
	handle = open("%s/%s_jgerm_stat.txt" %(prj_tree.tables, prj_name),'w')
	writer	= csv.writer(handle, delimiter = sep)
	keys	= sorted(dict_jcounts.keys())
	writer.writerow(["gene", "count", "percent"])
	for key in keys:
		aline = [ key, dict_jcounts[key], "%4.2f" % (dict_jcounts[key] / float(found) * 100) ]
		writer.writerow(aline)
	handle.close()

	if len(dict_ccounts) > 0:
		handle = open("%s/%s_cgerm_stat.txt" %(prj_tree.tables, prj_name),'w')
		writer	= csv.writer(handle, delimiter = sep)
		keys	= sorted(dict_ccounts.keys())
		writer.writerow(["gene", "count", "percent"])
		for key in keys:
			aline = [ key, dict_ccounts[key], "%4.2f" % (dict_ccounts[key] / float(found) * 100) ]
			writer.writerow(aline)
		handle.close()

	if len(dict_dcounts) > 0:
		handle = open("%s/%s_dgerm_stat.txt" %(prj_tree.tables, prj_name),'w')
		writer	= csv.writer(handle, delimiter = sep)
		keys	= sorted(dict_dcounts.keys())
		writer.writerow(["gene", "count", "percent"])
		for key in keys:
			aline = [ key, dict_dcounts[key], "%4.2f" % (dict_dcounts[key] / float(found) * 100) ]
			writer.writerow(aline)
		handle.close()

	message = "\nTotal raw reads: %d\nCorrect Length: %d\nV assigned: %d\nJ assigned: %d\nCDR3 assigned: %d\nIn-frame junction: %d\nNo indels: %d\nContinuous ORF with no stop codons: %d\n\n"  % \
								(raw_count, total, total-counts['noV']-counts['chimera'], found, found-counts['noCDR3'], found-counts['noCDR3']-counts['nonproductive'], found-counts['noCDR3']-counts['nonproductive']-counts['indel'], counts['good'])
	print( message )
	handle = open("%s/finalize_blast.log"%prj_tree.logs, "w")
	handle.write(message)
	handle.close()

	# call 1.4 or 1.5 if requested
	if arguments['--runClustering']:
		cmd = "%s/annotate/1.4-cluster_sequences.py" % SCRIPT_FOLDER
		for opt in [ '--file', '--min1', '--min2', '--id', '--maxgaps', '--rearrangements', '--save' ]:
			if arguments[opt] is not None:
				cmd += " %s '%s'" % (opt, arguments[opt])
		if arguments['--runCellStatistics']:
			cmd += " --runCellStatistics"
		print( "Calling 1.4 with command line: %s" % cmd )
		os.system( cmd )
	elif arguments['--runCellStatistics']:
		cmd = "%s/annotate/1.5-single_cell_statistics.py" % SCRIPT_FOLDER
		for opt in [ '--rearrangements', '--save' ]:
			if arguments[opt] is not None:
				cmd += " %s '%s'" % (opt, arguments[opt])
		print( "Calling 1.5 with command line: %s" % cmd )
		os.system( cmd )

	#clean up!!
	oldFiles = glob.glob("%s/*txt"%prj_tree.vgene) + glob.glob("%s/*fasta"%prj_tree.vgene) +  glob.glob("%s/*txt"%prj_tree.jgene) + glob.glob("%s/*fasta"%prj_tree.jgene) + glob.glob("%s/*tsv"%prj_tree.jgene) + glob.glob("%s/lookup*"%prj_tree.internal)
	if len(oldFiles) > 0 and not arguments['--noclean']:
		[os.remove(f) for f in oldFiles]
Example #15
def main():

    #start by making possible "duplicate_count" info available to vsearch
    with open("temp.fa", "w") as handle:
        SeqIO.write(reformatInput(arguments['--file']), handle, "fasta")

    #first step on higher identity
    subprocess.call([
        vsearch, "-derep_fulllength", "temp.fa", "-output", "temp_dedup.fa",
        "-uc", "temp.uc", "-sizein", "-sizeout", "-minuniquesize",
        arguments['--min1']
    ])

    #process the uc file
    centroid = dict()
    with open("temp.uc", "r") as handle:
        uc = csv.reader(handle, delimiter="\t")
        for row in uc:
            if row[0] == "H":
                centroid[re.sub(";size=\d+", "",
                                row[8])] = re.sub(";size=\d+", "", row[9])

    #second clustering step
    subprocess.call([
        vsearch, "-cluster_size", "temp_dedup.fa", "-sizein", "-sizeout",
        "-maxgaps", arguments['--maxgaps'], "-id", arguments['--id'], "-uc",
        "%s.cluster" % os.path.splitext(arguments['--file'])[0]
    ])

    #process the uc file
    size = dict()
    with open("%s.cluster" % os.path.splitext(arguments['--file'])[0],
              "r") as handle:
        uc = csv.reader(handle, delimiter="\t")
        for row in uc:
            if row[0] == "H":
                centroid[re.sub(";size=\d+", "",
                                row[8])] = re.sub(";size=\d+", "", row[9])
            elif row[0] == "C":
                #have the centroids point to themselves for more uniform downstream processing
                centroid[re.sub(";size=\d+", "",
                                row[8])] = re.sub(";size=\d+", "", row[8])
                #but only save them if they meet the threshold
                if int(row[2]) >= arguments['--min2']:
                    size[re.sub(";size=\d+", "", row[8])] = int(row[2])

    #clean up
    os.remove("temp.fa")
    os.remove("temp_dedup.fa")
    os.remove("temp.uc")

    #do sequence outputs
    with open("%s_unique.fa" % os.path.splitext(arguments['--file'])[0],
              "w") as handle:
        SeqIO.write(getUniques(arguments['--file'], size), handle, 'fasta')

    #retrieve unique CDR3s (and do AA seqs as appropriate)
    if "goodVJ" in arguments['--file']:
        cdr3_file = re.sub("goodVJ", "goodCDR3", arguments['--file'])
        if os.path.isfile(cdr3_file):
            with open("%s_unique.fa" % os.path.splitext(cdr3_file)[0],
                      "w") as handle:
                SeqIO.write(getUniques(cdr3_file, size), handle, 'fasta')
        else:
            print("Can't find %s to extract unique sequences..." % cdr3_file,
                  file=sys.stderr)

        if "nucleotide" in cdr3_file:
            cdr3_file = re.sub("nucleotide", "amino_acid", cdr3_file)
            if os.path.isfile(cdr3_file):
                with open("%s_unique.fa" % os.path.splitext(cdr3_file)[0],
                          "w") as handle:
                    SeqIO.write(getUniques(cdr3_file, size), handle, 'fasta')
            else:
                print("Can't find %s to extract unique sequences..." %
                      cdr3_file,
                      file=sys.stderr)
    if "nucleotide" in arguments['--file']:
        aa_file = re.sub("nucleotide", "amino_acid", arguments['--file'])
        if os.path.isfile(aa_file):
            with open("%s_unique.fa" % os.path.splitext(aa_file)[0],
                      "w") as handle:
                SeqIO.write(getUniques(aa_file, size), handle, 'fasta')
        else:
            print("Can't find %s to extract unique sequences..." % aa_file,
                  file=sys.stderr)

    #now do AIRR output
    if "output/sequences/nucleotide" in arguments['--file']:
        if os.path.isfile("%s/%s_rearrangements.tsv" %
                          (prj_tree.tables, prj_name)):
            clustered = airr.derive_rearrangement(
                "updateRearrangements.tsv",
                "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name),
                fields=['centroid', 'cluster_count'])
            for r in airr.read_rearrangement("%s/%s_rearrangements.tsv" %
                                             (prj_tree.tables, prj_name)):
                #clear old annotations in case we ran 1.4 previously
                r['centroid'] = ""
                r['cluster_count'] = ""

                #now add back current annotations
                #two rounds of clustering means we start by looking for the centroid of the centroid,
                #    falling back to the first level centroid (second clustering step only) if appropriate
                r['centroid'] = centroid.get(
                    centroid.get(r['sequence_id'], ""),
                    centroid.get(r['sequence_id'], ""))

                #add cluster size information for final centroids. I am doing away with changing the 'status' of
                #    the centroids to 'unique' because I've started using this script in a lot of cases where it
                #    doesn't make sense to treat 'unique' as a subset of 'good', and I therefore need to preserve
                #    the original status designation. To find centroids, look for a non-null 'cluster_count' field
                if r['sequence_id'] in size:
                    r['cluster_count'] = size[r['sequence_id']]

                clustered.write(r)
            clustered.close()
            os.rename("updateRearrangements.tsv",
                      "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name))
        else:
            print(
                "Can't find the rearrangements file, not saving data in AIRR format",
                file=sys.stderr)

    # call 1.5 if requested
    if arguments['--runCellStatistics']:
        cmd = "%s/annotate/1.5-single_cell_statistics.py" % SCRIPT_FOLDER
        if arguments['--rearrangements'] is not None:
            cmd += " --rearrangements %s" % arguments['--rearrangements']
        if arguments['--save'] is not None:
            cmd += " --save %s" % arguments['--save']

        print("Calling 1.5 with command line: %s" % cmd)
        os.system(cmd)
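The .uc bookkeeping above follows vsearch's record types: "S" starts a new seed, "H" assigns a query to a centroid (centroid label in column 10), and "C" summarizes a cluster with its size in column 3. Below is a minimal, self-contained sketch of that mapping; the helper name and file path are hypothetical and not part of the pipeline above.

import csv
import re

def parse_uc(uc_path, min_size=1):
    """Map sequence ids to centroid ids from a vsearch .uc file (illustrative sketch)."""
    strip = lambda label: re.sub(r";size=\d+", "", label)
    centroid, size = {}, {}
    with open(uc_path, "r") as handle:
        for row in csv.reader(handle, delimiter="\t"):
            if row[0] == "H":
                #hit record: query label in column 9, centroid label in column 10
                centroid[strip(row[8])] = strip(row[9])
            elif row[0] == "C":
                #cluster record: centroid points to itself; column 3 is the cluster size
                centroid[strip(row[8])] = strip(row[8])
                if int(row[2]) >= min_size:
                    size[strip(row[8])] = int(row[2])
    return centroid, size

#e.g. centroids, sizes = parse_uc("example.cluster", min_size=3)
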
Example #16
0
def read_airr(
    path: Union[str, Sequence[str], Path, Sequence[Path]],
    use_umi_count_col: Union[bool, Literal["auto"]] = "auto",
    infer_locus: bool = True,
    cell_attributes: Collection[str] = DEFAULT_AIRR_CELL_ATTRIBUTES,
    include_fields: Optional[Collection[str]] = DEFAULT_AIRR_FIELDS,
) -> AnnData:
    """\
    Read data from `AIRR rearrangement <https://docs.airr-community.org/en/latest/datarep/rearrangements.html>`_ format.

    The following columns are required by scirpy:
     * `cell_id`
     * `productive`
     * `locus`
     * at least one of `consensus_count`, `duplicate_count`, or `umi_count`
     * at least one of `junction_aa` or `junction`.

    Data should still import if one of these fields is missing, but they are required
    by most of scirpy's processing functions. All chains for which the field
    `junction_aa` is missing or empty will be considered non-productive and
    will be moved to the `extra_chains` column.

    {doc_working_model}

    Parameters
    ----------
    path
        Path to the AIRR rearrangement tsv file. If different
        chains are split up into multiple files, these can be specified
        as a List, e.g. `["path/to/tcr_alpha.tsv", "path/to/tcr_beta.tsv"]`.
    use_umi_count_col
        Whether to add UMI counts from the non-standard (but common) `umi_count`
        column. When this column is used, the UMI counts are moved over to the
        standard `duplicate_count` column. Default: Use `umi_count` if there is
        no `duplicate_count` column present.
    infer_locus
        Try to infer the `locus` column from gene names, in case it is not specified.
    cell_attributes
        Fields in the rearrangement schema that are specific for a cell rather
        than a chain. The values must be identical over all records belonging to a
        cell. This defaults to {cell_attributes}.
    include_fields
        The fields to include in `adata`. The AIRR rearrangement schema can
        contain a lot of columns, most of which are irrelevant for most analyses.
        By default, this includes a subset of columns relevant for a typical
        scirpy analysis, to keep `adata.obs` a bit cleaner. Defaults to {include_fields}.
        Set this to `None` to include all columns.

    Returns
    -------
    AnnData object with IR data in `obs` for each cell. For more details see
    :ref:`data-structure`.
    """
    airr_cells = {}
    logger = _IOLogger()

    if isinstance(path, (str, Path, pd.DataFrame)):
        path: list = [path]

    def _decide_use_umi_count_col(chain_dict):
        """Logic to decide whether or not to use counts form the `umi_counts` column."""
        if (
            "umi_count" in chain_dict
            and use_umi_count_col == "auto"
            and "duplicate_count" not in chain_dict
        ):
            logger.warning(
                "Renaming the non-standard `umi_count` column to `duplicate_count`. "
            )  # type: ignore
            return True
        elif use_umi_count_col is True:
            return True
        else:
            return False

    for tmp_path in path:
        if isinstance(tmp_path, pd.DataFrame):
            iterator = tmp_path.to_dict(orient="records")
        else:
            iterator = airr.read_rearrangement(str(tmp_path))

        for chain_dict in iterator:
            cell_id = chain_dict.pop("cell_id")

            try:
                tmp_cell = airr_cells[cell_id]
            except KeyError:
                tmp_cell = AirrCell(
                    cell_id=cell_id,
                    logger=logger,
                    cell_attribute_fields=cell_attributes,
                )
                airr_cells[cell_id] = tmp_cell

            if _decide_use_umi_count_col(chain_dict):
                chain_dict["duplicate_count"] = RearrangementSchema.to_int(
                    chain_dict.pop("umi_count")
                )

            if infer_locus and "locus" not in chain_dict:
                logger.warning(
                    "`locus` column not found in input data. The locus is being inferred from the {v,d,j,c}_call columns."
                )
                chain_dict["locus"] = _infer_locus_from_gene_names(chain_dict)

            tmp_cell.add_chain(chain_dict)

    return from_airr_cells(airr_cells.values(), include_fields=include_fields)
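A typical call of this reader, assuming scirpy is installed and importable as `ir`; the file paths below are placeholders.

import scirpy as ir

#single AIRR rearrangement file with default column handling
adata = ir.io.read_airr("path/to/rearrangements.tsv")

#chains split across files, keeping every AIRR column instead of the default subset
adata = ir.io.read_airr(
    ["tcr_alpha.tsv", "tcr_beta.tsv"],
    include_fields=None,
)
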
Example #17
0
def main():

	if not glob.glob("%s/%s_*.fasta" % (prj_tree.jgene, prj_name)):
		sys.exit("No jBlast output found!\n")
		
	maxFiles = len( glob.glob("%s/%s_*.fasta" % (prj_tree.vgene, prj_name)) )
	
	if not arguments['--reenter']:
		print( "curating junction and 3' end..." )

		if arguments['--cluster']:
			command = "NUM=`printf \"%s\" $SGE_TASK_ID`\n%s/annotate/parse_blast.py --jmotif '%s' --nterm %s --chunk $NUM\n" % \
						( "%03d", SCRIPT_FOLDER, arguments['--jmotif'], arguments['--nterm'] )
			if arguments['--noFallBack']: command += " --noFallBack"
			pbs = open("%s/parse.sh"%prj_tree.jgene, 'w')
			pbs.write( "#!/bin/bash\n#$ -N parse-%s\n#$ -l mem=2G\n#$ -cwd\n\n%s\n" % (prj_name, command) )
			pbs.close()
			os.system( "%s -t 1-%d %s/parse.sh"%(qsub,maxFiles,prj_tree.jgene) )
		
			restart = "%s/annotate/1.3-finalize_assignments.py --reenter" % SCRIPT_FOLDER
			for opt in [ '--file', '--min1', '--min2', '--id', '--maxgaps', '--rearrangements', '--save']: 
				if arguments[opt] is not None:
					restart += " %s %s" % (opt, arguments[opt])
			for flag in ['--noclean', '--runClustering', '--runCellStatistics']:
				if arguments[flag]:
					restart += " %s" % flag

			monitor = open("%s/parse_monitor.sh"%prj_tree.jgene, 'w')
			monitor.write( "#!/bin/bash\n#$ -N monitor-%s\n#$ -l mem=2G\n#$ -cwd\n#$ -hold_jid parse-%s\n\n%s\n"%(prj_name, prj_name,restart) )
			monitor.close()
			os.system( "%s %s/parse_monitor.sh"%(qsub,prj_tree.jgene) )
			sys.exit()

		else: #do it locally

			parse_pool = Pool(arguments['--threads'])
			parse_pool.map(callParser, range(1,maxFiles+1))
			parse_pool.close()
			parse_pool.join()


	#ok, now collect all of the partial outputs and merge them
	print( "collecting information...")

	#open fasta outputs
	allV_aa	     = open ("%s/%s_allV.fa"	 % (prj_tree.aa, prj_name), "w" )
	allV_nt	     = open( "%s/%s_allV.fa"	 % (prj_tree.nt, prj_name), "w" )

	allJ_aa	     = open( "%s/%s_allJ.fa"	 % (prj_tree.aa, prj_name), "w" )
	allJ_nt	     = open( "%s/%s_allJ.fa"	 % (prj_tree.nt, prj_name), "w" )

	vj_aa	     = open( "%s/%s_goodVJ.fa"	 % (prj_tree.aa, prj_name), "w" )
	vj_nt	     = open( "%s/%s_goodVJ.fa"	 % (prj_tree.nt, prj_name), "w" )

	good_cdr3_aa = open( "%s/%s_goodCDR3.fa" % (prj_tree.aa, prj_name), "w" )
	good_cdr3_nt = open( "%s/%s_goodCDR3.fa" % (prj_tree.nt, prj_name), "w" )

	all_cdr3_aa  = open( "%s/%s_allCDR3.fa"	 % (prj_tree.aa, prj_name), "w" )
	all_cdr3_nt  = open( "%s/%s_allCDR3.fa"	 % (prj_tree.nt, prj_name), "w" )


	#also open final rearrangements tsv
	seq_stats = airr.create_rearrangement( "%s/%s_rearrangements.tsv"%(prj_tree.tables, prj_name), fields=['vj_in_frame','stop_codon','locus','c_call','junction_length','source_file','source_id','duplicate_count','length_raw','length_trimmed','indels','status','blast_identity','consensus_count','cell_id'])


	#initiate overall counters
	raw_count, total = 0, 0
	counts = {'good':0,'nonproductive':0,'indel':0,'noCDR3':0,'stop':0,'noV':0,'noJ':0,'missingNterm':0}

	dict_jcounts = Counter()
	dict_ccounts = Counter()
	dict_dcounts = Counter()
		
	c = False
	if os.path.isfile("%s/%s_C_001.txt" % (prj_tree.jgene, prj_name)):
		c = True

	d = False
	if os.path.isfile("%s/%s_D_001.txt" % (prj_tree.jgene, prj_name)):
		d = True


	#iterate over subset rearrangement files and combine
	#include generating fasta output as appropriate
	for f_ind in range(1, maxFiles+1):

		#merge partial blast hit tables
		with open( "%s/%s_jgerm_tophit.txt" % (prj_tree.tables, prj_name), "a") as table:
			with open( "%s/jtophit_%03d.txt" % (prj_tree.jgene, f_ind), "r" ) as partial:
				table.write(partial.read())

		if d:
			with open( "%s/%s_dgerm_tophit.txt" % (prj_tree.tables, prj_name), "a") as table:
				with open( "%s/dtophit_%03d.txt" % (prj_tree.jgene, f_ind), "r" ) as partial:
					table.write(partial.read())

		if c:
			with open( "%s/%s_cgerm_tophit.txt" % (prj_tree.tables, prj_name), "a") as table:
				with open( "%s/ctophit_%03d.txt" % (prj_tree.jgene, f_ind), "r" ) as partial:
					table.write(partial.read())

		#go through partial rearrangements files
		for r in airr.read_rearrangement( "%s/rearrangements_%03d.tsv"%(prj_tree.internal, f_ind) ):

			seq_stats.write( r )

			#count j/d/c gene usages
			if not r['j_call'] == "":
				dict_jcounts[ r['j_call'].split(",")[0] ] += 1
			if not r['j_call'] == "":
				dict_jcounts[ r['d_call'].split(",")[0] ] += 1
			if not r['j_call'] == "":
				dict_jcounts[ r['c_call'].split(",")[0] ] += 1

			#count statuses
			counts[ r['status'] ] += 1
			total += 1
			raw_count = int( r['sequence_id'] ) #technically, this undercounts if the last one
												# isn't in the `correct_length` interval, but I
												# don't have a better solution that isn't super
												# kludgy right now

			#ok, now do sequence output
			# start by collecting metadata for fasta def line
			def_line = ">%s" % r['sequence_id']
			if not r['v_call'] == '':          def_line += " v_call=%s"          % r['v_call']
			if not r['d_call'] == '':          def_line += " d_call=%s"          % r['d_call']
			if not r['j_call'] == '':          def_line += " j_call=%s"          % r['j_call']
			if not r['locus']  == '':          def_line += " locus=%s"           % r['locus']
			if not r['c_call'] == '':          def_line += " c_call=%s"          % r['c_call']
			if not r['status'] == '':          def_line += " status=%s"          % r['status']
#			if not r['v_identity'] == '':      def_line += " v_identity=%s"      % r['v_identity']
			if not r['junction_length'] == '': def_line += " junction_length=%s" % r['junction_length']
			if not r['junction'] == '':        def_line += " junction=%s"        % r['junction']
			if not r['junction_aa'] == '':     def_line += " junction_aa=%s"     % r['junction_aa']
			if not r['duplicate_count'] == '': def_line += " duplicate_count=%s" % r['duplicate_count']
			if not r['consensus_count'] == '': def_line += " consensus_count=%s" % r['consensus_count']
			if not r['cell_id'] == '':         def_line += " cell_id=%s"         % r['cell_id']

			#work our way up the hierarchy, putting sequences in the appropriate files
			ungapped = re.sub( "-", "", r['sequence_alignment']) #reintroduces any frameshift errors in translation
																 #  this has always been the behavior, but I wonder
																 #  if I should change/update now that I am using
																 #  proper alignments.

			if not r['status'] in ['noV', 'missingNterm']:
				allV_nt.write( "%s\n%s\n" % (def_line, ungapped) )
				allV_aa.write( "%s\n%s\n" % (def_line, Seq.Seq(ungapped).translate()) )
	
				if not r['status'] == 'noJ':
					allJ_nt.write( "%s\n%s\n" % (def_line, ungapped) )
					allJ_aa.write( "%s\n%s\n" % (def_line, Seq.Seq(ungapped).translate()) )

					if not r['status'] == 'noCDR3':
						all_cdr3_nt.write( "%s\n%s\n" % (def_line, r['junction']) )
						all_cdr3_aa.write( "%s\n%s\n" % (def_line, r['junction_aa']) )

						if r['status'] == "good":
							vj_nt.write( "%s\n%s\n" % (def_line, ungapped) )
							vj_aa.write( "%s\n%s\n" % (def_line, Seq.Seq(ungapped).translate()) )
							good_cdr3_nt.write( "%s\n%s\n" % (def_line, r['junction']) )
							good_cdr3_aa.write( "%s\n%s\n" % (def_line, r['junction_aa']) )


	#close outputs
	allV_aa.close()
	allV_nt.close()
	allJ_aa.close()
	allJ_nt.close()
	vj_aa.close()
	vj_nt.close()
	good_cdr3_aa.close()
	good_cdr3_nt.close()
	all_cdr3_aa.close()
	all_cdr3_nt.close()

	#useful number
	found = total - counts['noV'] - counts['noJ']

	#print out some statistics
	handle = open("%s/%s_jgerm_stat.txt" %(prj_tree.tables, prj_name),'w')
	writer	= csv.writer(handle, delimiter = sep)
	keys	= sorted(dict_jcounts.keys())
	writer.writerow(["gene", "count", "percent"])
	for key in keys:
		aline = [ key, dict_jcounts[key], "%4.2f" % (dict_jcounts[key] / float(found) * 100) ]
		writer.writerow(aline)
	handle.close()

	if len(dict_ccounts) > 0:
		handle = open("%s/%s_cgerm_stat.txt" %(prj_tree.tables, prj_name),'w')
		writer	= csv.writer(handle, delimiter = sep)
		keys	= sorted(dict_ccounts.keys())
		writer.writerow(["gene", "count", "percent"])
		for key in keys:
			aline = [ key, dict_ccounts[key], "%4.2f" % (dict_ccounts[key] / float(found) * 100) ]
			writer.writerow(aline)
		handle.close()

	if len(dict_dcounts) > 0:
		handle = open("%s/%s_dgerm_stat.txt" %(prj_tree.tables, prj_name),'w')
		writer	= csv.writer(handle, delimiter = sep)
		keys	= sorted(dict_dcounts.keys())
		writer.writerow(["gene", "count", "percent"])
		for key in keys:
			aline = [ key, dict_dcounts[key], "%4.2f" % (dict_dcounts[key] / float(found) * 100) ]
			writer.writerow(aline)
		handle.close()

	message = "\nTotal raw reads: %d\nCorrect Length: %d\nV assigned: %d\nJ assigned: %d\nCDR3 assigned: %d\nIn-frame junction: %d\nNo indels: %d\nContinuous ORF with no stop codons: %d\n\n"  % \
								(raw_count, total, total-counts['noV'], found, found-counts['noCDR3'], found-counts['noCDR3']-counts['nonproductive'], found-counts['noCDR3']-counts['nonproductive']-counts['indel'], counts['good'])
	print( message )
	handle = open("%s/finalize_blast.log"%prj_tree.logs, "w")
	handle.write(message)
	handle.close()

	# call 1.4 if requested
	if arguments['--runClustering']:
		cmd = "%s/annotate/1.4-cluster_sequences.py" % SCRIPT_FOLDER
		for opt in [ '--file', '--min1', '--min2', '--id', '--maxgaps', '--rearrangements', '--save']: 
			if arguments[opt] is not None:
				cmd += " %s '%s'" % (opt, arguments[opt])
		if arguments['--runCellStatistics']:
			cmd += " --runCellStatistics"

		print( "Calling 1.4 with command line: %s" % cmd )
		os.system( cmd )

	#clean up!!
	oldFiles = glob.glob("%s/*txt"%prj_tree.vgene) + glob.glob("%s/*fasta"%prj_tree.vgene) +  glob.glob("%s/*txt"%prj_tree.jgene) + glob.glob("%s/*fasta"%prj_tree.jgene) + glob.glob("%s/*tsv"%prj_tree.jgene) + glob.glob("%s/lookup*"%prj_tree.internal)
	if len(oldFiles) > 0 and not arguments['--noclean']:
		[os.remove(f) for f in oldFiles]
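
The AIRR writer pattern used above (airr.create_rearrangement with extra custom fields, then write() per record) can be exercised in isolation. A minimal sketch, assuming the airr reference library is installed; the output path, the custom columns, and the hand-built record are placeholders, and the comment about empty defaults is an assumption worth checking against the library's documentation.

import airr

#hypothetical output path plus two custom columns appended to the standard schema
writer = airr.create_rearrangement("demo_rearrangements.tsv",
                                   fields=["centroid", "cluster_count"])

record = {
    "sequence_id": "seq_000001",
    "sequence": "GACGTC",
    "v_call": "IGHV1-2*02",
    "j_call": "IGHJ4*02",
    "junction": "TGTGCGAGATGG",
    "junction_aa": "CARW",
    "productive": "T",
    "centroid": "seq_000001",
    "cluster_count": 3,
}
#columns declared in the header but absent from `record` are expected to be
#written as empty values (assumption about the reference library's CSV defaults)
writer.write(record)
writer.close()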