def main():
	cells_raw = defaultdict( dict )
	most_used = defaultdict( list )
	output = open("%s/%s_cell_stats.tsv"%(prj_tree.tables,prj_name), 'w')
	output.write("cell\tstatus\tisotype\tproductive_IGH\ttotal_IGH\tIGH_junctions\tproductive_IGK\ttotal_IGK\tIGK_junctions\tproductive_IGL\ttotal_IGL\tIGL_junctions\n")
	data = airr.read_rearrangement(arguments['--rearrangements'])
	cells_only = airr.derive_rearrangement(re.sub(".tsv", "_single-cell.tsv", arguments['--rearrangements']), arguments['--rearrangements'])

	#assume cells might not be grouped together, so make a first pass
	#    to collect everything
	for r in data:
		if r['status'] in ['good', 'indel', 'stop', 'nonproductive', 'unique']: #skip irrelevant sequences
			if r['locus'] not in cells_raw[r['cell_id']]:
				cells_raw[r['cell_id']][r['locus']] = [ r ]
			else:
				cells_raw[r['cell_id']][r['locus']].append( r )
			#need better heuristic for this, omit for now
			#if r['cell_id'] not in most_used[r['centroid']]:
			#	most_used[r['centroid']].append(r['cell_id'])

	#now go back and process each cell
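	#status categories assigned below:
	#    canonical_pair     = 1 productive heavy + 1 productive light
	#    possible_inclusion = 1 heavy + 2 lights
	#    probable_multiplet = 1 heavy + more than 2 lights
	#    heavy_only, light_only, multi_light, multi_heavy, none_productive as named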
	status_list  = [ 'canonical_pair', 'possible_inclusion', 'heavy_only', 'light_only', 'multi_light', 'multi_heavy', 'probable_multiplet', 'none_productive' ]
	status_count = dict( zip( status_list, [0,0,0,0,0,0,0,0] )  )
	status_dict  = dict( )

	for c in cells_raw:
		cell_processed	= defaultdict( list )
		cell_productive = defaultdict( list )
		for locus in cells_raw[c]:
			#Start with the one with the most UMIs
			for rep in sorted( [ r for r in cells_raw[c][locus] ], key=lambda k: k['duplicate_count'], reverse=True ):
				#check if this is a duplicate of a previously kept read
				keep = True
				for previous in cell_processed[locus]:
					#shortcut: assume identical junctions means duplicates
					if previous['junction_aa'] == rep['junction_aa']:
						keep = False
						break
					#heuristic (for 10x data as of March 2019):  omit gaps and cut off possible noise at 5' end
					else:
						cov, score = scoreAlign( quickAlign(previous['sequence_alignment'],rep['sequence_alignment']), countInternalGaps=False, skip=50 )
						if score >= 0.95:
							keep = False
							break
					
				if keep:
					cell_processed[locus].append( rep )
					if rep['status'] == "good": cell_productive[locus].append( rep )

		status = ""
		h_type = ""
		if len(cell_productive['IGH']) == 0:
			if len(cell_productive['IGK']) + len(cell_productive['IGL']) == 0:
				status = "none_productive"
			elif len(cell_productive['IGK']) + len(cell_productive['IGL']) == 1:
				status = "light_only"
			elif len(cell_productive['IGK']) + len(cell_productive['IGL']) > 1:
				status = "multi_light"
		elif len(cell_productive['IGH']) == 1:
			h_type = re.sub(r"\*.+", "", cell_productive['IGH'][0]['c_call'])
			if len(cell_productive['IGK']) + len(cell_productive['IGL']) == 0:
				status = "heavy_only"
			elif len(cell_productive['IGK']) + len(cell_productive['IGL']) == 1:
				status = "canonical_pair"
			elif len(cell_productive['IGK']) + len(cell_productive['IGL']) == 2:
				status = "possible_inclusion"
			elif len(cell_productive['IGK']) + len(cell_productive['IGL']) > 2:
				status = "probable_multiplet"
		elif len(cell_productive['IGH']) > 1:
				status = "multi_heavy"

		status_count[status] += 1
		status_dict[c] = status

		#print to filtered rearrangements file
		if status in arguments['--save']:
			for loc in cell_processed:
				for chain in cell_processed[loc]:
					cells_only.write( chain )

		#now log the cell
		print( "\t".join( [c, status, h_type,
				   str(len(cell_productive['IGH'])), str(len(cell_processed['IGH'])), ";".join([chain['junction_aa'] for chain in cell_processed['IGH']]),
				   str(len(cell_productive['IGK'])), str(len(cell_processed['IGK'])), ";".join([chain['junction_aa'] for chain in cell_processed['IGK']]),
				   str(len(cell_productive['IGL'])), str(len(cell_processed['IGL'])), ";".join([chain['junction_aa'] for chain in cell_processed['IGL']]) ] ),
		       file=output)


	output.close()

	with open("%s/cell_processing.log"%prj_tree.logs, "w") as log:
		print("\t".join(status_list), file=log)
		print("\t".join([str(status_count[s]) for s in status_list]), file=log)
	
	print("\t".join(status_list))
	print("\t".join([str(status_count[s]) for s in status_list]))
def main():

    global germs
    germs = dict()
    for entry in SeqIO.parse(open(arguments['-g'], "r"), "fasta"):
        germs[entry.id] = entry

    global mature
    mature = dict()
    if arguments['-a'] is not None:
        for entry in SeqIO.parse(open(arguments['-a'], "r"), "fasta"):
            mature[entry.id] = entry

    inputFile = arguments['-f']
    dedup = dict()
    if arguments['-d']:
        subprocess.call([
            vsearch, "-derep_fulllength", arguments['-f'], "-output",
            "temp_dedup.fa", "-uc", "temp.uc", "-notrunclabels"
        ])

        inputFile = "temp_dedup.fa"

        #process the uc file
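        #    (vsearch .uc format: column 1 is the record type - S=seed/centroid, H=hit,
        #    C=cluster summary; column 9 is the query label, column 10 the target label)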
        with open("temp.uc", "r") as handle:
            uc = csv.reader(handle, delimiter="\t")
            for row in uc:
                if row[0] == "S":
                    dedup[row[8].split(" ")[0]] = row[8].split(" ")[0]
                elif row[0] == "H":
                    dedup[row[8].split(" ")[0]] = row[9].split(" ")[0]

    results = dict()
    #If we are multithreading, split input into chunks
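    #    (1,000 sequences per chunk; chunks are written under the lineage/align directory
    #    and scored independently by runAlign in the worker pool)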
    if arguments['-t'] > 1:
        index = 0
        counter = 0
        chunk = []
        reader = SeqIO.parse(open(inputFile, "r"), "fasta")
        for entry in reader:
            chunk.append(entry)
            counter += 1
            if counter == 1000:
                with open("%s/align/align%06d.fa" % (prj_tree.lineage, index),
                          "w") as handle:
                    SeqIO.write(chunk, handle, "fasta")
                index += 1
                counter = 0
                chunk = []
        if counter > 0:
            with open("%s/align/align%06d.fa" % (prj_tree.lineage, index),
                      "w") as handle:
                SeqIO.write(chunk, handle, "fasta")
            index += 1  #so we can use range properly

        #now create a pool and start the actual work
        filterPool = Pool(arguments['-t'])
        dataBlob = filterPool.map(runAlign, [
            "%s/align/align%06d.fa" % (prj_tree.lineage, i)
            for i in range(index)
        ])
        filterPool.close()
        filterPool.join()

        #Recover results
        for blob in dataBlob:
            results.update(blob)

    else:
        #unthreaded, just do the whole thing
        results = runAlign(inputFile)

    #get some outputs set up
    outFile = os.path.basename(os.path.splitext(arguments['-f'])[0])
    if os.path.isdir(prj_tree.tables):
        outFile = "output/tables/" + outFile
    if arguments['-o'] is not None:
        outFile = arguments['-o']

    nats = sorted(mature.keys())

    covFile = open("%s_coverage.tab" % outFile, "w")
    coverage = csv.writer(covFile, delimiter="\t")
    coverage.writerow(['sequence_id', 'germ_cov'] + nats)

    idFile = open("%s_id-div.tab" % outFile, "w")
    iddiv = csv.writer(idFile, delimiter="\t")
    iddiv.writerow(['sequence_id', 'v_gene', 'germ_div'] + nats)

    #sort the freaking list and output
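    #    (with dereplication, each original read id is reported using the values computed
    #    for its dereplication centroid, results[dedup[s]])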
    if arguments['-d']:
        for s in sorted(dedup.keys()):
            (germc, germi) = results[dedup[s]]['germline']
            if not germc == "NA":
                germc = "%.1f" % germc
                germi = "%.1f" % (100 - germi)
            coverage.writerow([s, germc] + [
                "NA" if results[dedup[s]][n][0] == "NA" else "%.1f" %
                results[dedup[s]][n][0] for n in nats
            ])
            iddiv.writerow([s, results[dedup[s]]['vlookup'], germi] + [
                "NA" if results[dedup[s]][n][1] == "NA" else "%.1f" %
                results[dedup[s]][n][1] for n in nats
            ])

        #take this opportunity to do some cleanup
        os.remove("temp_dedup.fa")
        os.remove("temp.uc")
    else:
        for s in sorted(results.keys()):
            (germc, germi) = results[s]['germline']
            if not germc == "NA":
                germc = "%.1f" % germc
                germi = "%.1f" % (100 - germi)
            coverage.writerow([s, germc] + [
                "NA" if results[s][n][0] == "NA" else "%.1f" % results[s][n][0]
                for n in nats
            ])
            iddiv.writerow([s, results[s]['vlookup'], germi] + [
                "NA" if results[s][n][1] == "NA" else "%.1f" % results[s][n][1]
                for n in nats
            ])

    covFile.close()
    idFile.close()

    #do AIRR output
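    #    (results[...]['germline'][1] holds germline identity as a percentage; the AIRR
    #    schema defines v_identity as a fraction, hence the division by 100 below)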
    if (os.path.dirname(arguments['-f']) == "output/sequences/nucleotide"
            and 'CDR3' not in arguments['-f']):
        if os.path.isfile("%s/%s_rearrangements.tsv" %
                          (prj_tree.tables, prj_name)):
            withDiv = airr.derive_rearrangement("updateRearrangements.tsv",
                                                "%s/%s_rearrangements.tsv" %
                                                (prj_tree.tables, prj_name),
                                                fields=['v_identity'])
            for r in airr.read_rearrangement("%s/%s_rearrangements.tsv" %
                                             (prj_tree.tables, prj_name)):
                if dedup.get(r['sequence_id'], r['sequence_id']) in results:
                    # omit NAs here to comply with AIRR format
                    if not results[dedup.get(
                            r['sequence_id'],
                            r['sequence_id'])]['germline'][1] == "NA":
                        r['v_identity'] = "%0.3f" % (results[dedup.get(
                            r['sequence_id'], r['sequence_id'])]['germline'][1]
                                                     / 100)
                withDiv.write(r)
            withDiv.close()
            os.rename("updateRearrangements.tsv",
                      "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name))
def main():

    #look for cell hashing
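    #    (expects one row per cell barcode with the assigned sample in the second column;
    #    "unknown" and "ambiguous" are reserved labels for unresolved cells)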
    hashDict = dict()
    sampleList = []
    if os.path.exists(f"{prj_tree.tables}/{prj_name}_hashes.tsv"):
        with open(f"{prj_tree.tables}/{prj_name}_hashes.tsv", 'r') as handle:
            reader = csv.reader(handle, delimiter="\t")
            for row in reader:
                hashDict[row[0]] = [row[1]]
                if row[1] == "unknown" or row[1] == "ambiguous":
                    continue
                elif not row[1] in sampleList:
                    sampleList.append(row[1])
    sampleList.sort()
    sampleList += ["unknown", "ambiguous"]

    #look for feature barcoding
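    #    (first row is a header naming each feature; subsequent rows are a cell barcode
    #    followed by the per-feature values)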
    featureDict = dict()
    if os.path.exists(f"{prj_tree.tables}/{prj_name}_features.tsv"):
        with open(f"{prj_tree.tables}/{prj_name}_features.tsv", 'r') as handle:
            reader = csv.reader(handle, delimiter="\t")
            header = next(reader)
            featureDict["keys"] = header[1:]
            for row in reader:
                featureDict[row[0]] = row[1:]

    cells_raw = defaultdict(dict)
    most_used = defaultdict(list)

    output = open("%s/%s_cell_stats.tsv" % (prj_tree.tables, prj_name), 'w')
    outwriter = csv.writer(output, delimiter="\t")
    outheader = ["cell", "status", "isotype"]
    if len(hashDict) > 0:
        outheader += ["hash_sample"]
    if len(featureDict) > 0:
        outheader += featureDict["keys"]
    outheader += [
        "productive_IGH", "total_IGH", "IGH_junctions", "productive_IGK",
        "total_IGK", "IGK_junctions", "productive_IGL", "total_IGL",
        "IGL_junctions"
    ]
    outwriter.writerow(outheader)

    data = airr.read_rearrangement(arguments['--rearrangements'])
    fields = ["cell_status"]
    if len(hashDict) > 0:
        fields += ["hash_sample"]
    cells_only = airr.derive_rearrangement(re.sub(
        ".tsv", "_single-cell.tsv", arguments['--rearrangements']),
                                           arguments['--rearrangements'],
                                           fields=fields)

    #assume cells might not be grouped together, so make a first pass
    #    to collect everything
    for r in data:
        if r['status'] in ['good', 'indel', 'stop', 'nonproductive',
                           'unique']:  #skip irrelevant sequences
            if r['locus'] not in cells_raw[r['cell_id']]:
                cells_raw[r['cell_id']][r['locus']] = [r]
            else:
                cells_raw[r['cell_id']][r['locus']].append(r)
            #need better heuristic for this, omit for now
            #if r['cell_id'] not in most_used[r['centroid']]:
            #	most_used[r['centroid']].append(r['cell_id'])

    #now go back and process each cell
    status_list = [
        'canonical_pair', 'possible_inclusion', 'heavy_only', 'light_only',
        'multi_light', 'multi_heavy', 'probable_multiplet', 'none_productive'
    ]
    status_count = dict()
    for sample in sampleList:
        status_count[sample] = dict(zip(status_list, [0, 0, 0, 0, 0, 0, 0, 0]))
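    #status_count is keyed first by hash sample (unhashed cells are tallied under "unknown"),
    #    then by status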
    status_dict = dict()

    for c in cells_raw:
        cell_processed = defaultdict(list)
        cell_productive = defaultdict(list)
        for locus in cells_raw[c]:
            #Start with the one with the most UMIs
            for rep in sorted([r for r in cells_raw[c][locus]],
                              key=lambda k: k['duplicate_count'] or 0,
                              reverse=True):
                #check if this is a duplicate of a previously kept read
                keep = True
                for previous in cell_processed[locus]:
                    #shortcut: assume identical junctions means duplicates
                    if previous['junction_aa'] == rep['junction_aa']:
                        keep = False
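                        #roll the duplicate's UMI/read counts into the record we're keeping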
                        if previous['duplicate_count'] is not None:
                            previous['duplicate_count'] += rep[
                                'duplicate_count']
                        if previous['consensus_count'] is not None:
                            previous['consensus_count'] += rep[
                                'consensus_count']
                        break
                    #heuristic (for 10x data as of March 2019):  omit gaps and cut off possible noise at 5' end
                    else:
                        score, cov = scoreAlign(quickAlign(
                            previous['sequence_alignment'],
                            rep['sequence_alignment']),
                                                countInternalGaps=False,
                                                skip=50)
                        if score >= 0.95:
                            keep = False
                            if previous['duplicate_count'] is not None:
                                previous['duplicate_count'] += rep[
                                    'duplicate_count']
                            if previous['consensus_count'] is not None:
                                previous['consensus_count'] += rep[
                                    'consensus_count']
                            break

                if keep:
                    cell_processed[locus].append(rep)
                    if rep['status'] == "good":
                        cell_productive[locus].append(rep)

        status = ""
        h_type = ""
        if len(cell_processed['IGH']) > 2 or len(
                cell_processed['IGK']) > 2 or len(cell_processed['IGL']) > 2:
            status = "probable_multiplet"
        elif len(cell_productive['IGH']) == 0:
            if len(cell_productive['IGK']) + len(cell_productive['IGL']) == 0:
                status = "none_productive"
            elif len(cell_productive['IGK']) + len(
                    cell_productive['IGL']) == 1:
                status = "light_only"
            elif len(cell_productive['IGK']) + len(cell_productive['IGL']) > 1:
                status = "multi_light"
        elif len(cell_productive['IGH']) == 1:
            h_type = re.sub(r"\*.+", "", cell_productive['IGH'][0]['c_call'])
            if len(cell_productive['IGK']) + len(cell_productive['IGL']) == 0:
                status = "heavy_only"
            elif len(cell_productive['IGK']) + len(
                    cell_productive['IGL']) == 1:
                status = "canonical_pair"
            elif len(cell_productive['IGK']) + len(
                    cell_productive['IGL']) == 2:
                status = "possible_inclusion"
            elif len(cell_productive['IGK']) + len(cell_productive['IGL']) > 2:
                status = "probable_multiplet"
        elif len(cell_productive['IGH']) > 1:
            status = "multi_heavy"

        status_count[hashDict.get(c, ["unknown"])[0]][status] += 1
        status_dict[c] = status

        #print to filtered rearrangements file
        #leave out cells with ambiguous hashing assignments if we are doing any filtering
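        #    (ambiguous cells are only written if probable multiplets are also being saved)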
        if status in arguments['--save']:
            if hashDict.get(
                    c, ["unknown"]
            )[0] != "ambiguous" or "probable_multiplet" in arguments['--save']:
                for loc in cell_processed:
                    for chain in cell_processed[loc]:
                        chain['cell_status'] = status
                        if len(hashDict) > 0:
                            chain['hash_sample'] = hashDict.get(
                                chain['cell_id'], ['unknown'])[0]
                        cells_only.write(chain)

        #now log the cell
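        #    (hash and feature columns are padded with 'unknown'/zeros, and only included
        #    at all when the corresponding input file was found)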
        outwriter.writerow(
            [c, status, h_type] + hashDict.get(c, ['unknown'] *
                                               (len(hashDict) > 0)) +
            featureDict.get(c, ['0'] * len(featureDict.get("keys", []))) + [
                len(cell_productive['IGH']),
                len(cell_processed['IGH']), ";".join(
                    [chain['junction_aa'] for chain in cell_processed['IGH']]),
                len(cell_productive['IGK']),
                len(cell_processed['IGK']), ";".join(
                    [chain['junction_aa'] for chain in cell_processed['IGK']]),
                len(cell_productive['IGL']),
                len(cell_processed['IGL']), ";".join(
                    [chain['junction_aa'] for chain in cell_processed['IGL']])
            ])

    output.close()

    with open("%s/cell_processing.log" % prj_tree.logs, "w") as log:
        print("sample\t" + "\t".join(status_list), file=log)
        print("sample\t" + "\t".join(status_list))
        for sample in sampleList:
            if sum([status_count[sample][s] for s in status_list]) == 0:
                continue  #leave out `ambiguous` if it's not a hashed sample
            print(
                "\t".join([sample] +
                          [str(status_count[sample][s]) for s in status_list]),
                file=log)
            print(
                "\t".join([sample] +
                          [str(status_count[sample][s]) for s in status_list]))
def main():

    #start by making possible "duplicate_count" info available to vsearch
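    #    (reformatInput is assumed to encode any duplicate_count as a ";size=N" label on
    #    each sequence id, which the -sizein flag below picks up)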
    with open("temp.fa", "w") as handle:
        SeqIO.write(reformatInput(arguments['--file']), handle, "fasta")

    #first step on higher identity
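    #    (full-length dereplication collapses exact duplicates; -minuniquesize then drops
    #    anything observed fewer than --min1 times before the identity clustering below)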
    subprocess.call([
        vsearch, "-derep_fulllength", "temp.fa", "-output", "temp_dedup.fa",
        "-uc", "temp.uc", "-sizein", "-sizeout", "-minuniquesize",
        arguments['--min1']
    ])

    #process the uc file
    centroid = dict()
    with open("temp.uc", "r") as handle:
        uc = csv.reader(handle, delimiter="\t")
        for row in uc:
            if row[0] == "H":
                centroid[re.sub(";size=\d+", "",
                                row[8])] = re.sub(";size=\d+", "", row[9])

    #second clustering step
    subprocess.call([
        vsearch, "-cluster_size", "temp_dedup.fa", "-sizein", "-sizeout",
        "-maxgaps", arguments['--maxgaps'], "-id", arguments['--id'], "-uc",
        "%s.cluster" % os.path.splitext(arguments['--file'])[0]
    ])

    #process the uc file
    size = dict()
    with open("%s.cluster" % os.path.splitext(arguments['--file'])[0],
              "r") as handle:
        uc = csv.reader(handle, delimiter="\t")
        for row in uc:
            if row[0] == "H":
                centroid[re.sub(";size=\d+", "",
                                row[8])] = re.sub(";size=\d+", "", row[9])
            elif row[0] == "C":
                #have the centroids point to themselves for more uniform dowsntream processing
                centroid[re.sub(";size=\d+", "",
                                row[8])] = re.sub(";size=\d+", "", row[8])
                #but only save them if they meet the threshold
                if int(row[2]) >= arguments['--min2']:
                    size[re.sub(";size=\d+", "", row[8])] = int(row[2])

    #clean up
    os.remove("temp.fa")
    os.remove("temp_dedup.fa")
    os.remove("temp.uc")

    #do sequence outputs
    with open("%s_unique.fa" % os.path.splitext(arguments['--file'])[0],
              "w") as handle:
        SeqIO.write(getUniques(arguments['--file'], size), handle, 'fasta')

    #retrieve unique CDR3s (and do AA seqs as appropriate)
    if "goodVJ" in arguments['--file']:
        cdr3_file = re.sub("goodVJ", "goodCDR3", arguments['--file'])
        if os.path.isfile(cdr3_file):
            with open("%s_unique.fa" % os.path.splitext(cdr3_file)[0],
                      "w") as handle:
                SeqIO.write(getUniques(cdr3_file, size), handle, 'fasta')
        else:
            print("Can't find %s to extract unique sequences..." % cdr3_file,
                  file=sys.stderr)

        if "nucleotide" in cdr3_file:
            cdr3_file = re.sub("nucleotide", "amino_acid", cdr3_file)
            if os.path.isfile(cdr3_file):
                with open("%s_unique.fa" % os.path.splitext(cdr3_file)[0],
                          "w") as handle:
                    SeqIO.write(getUniques(cdr3_file, size), handle, 'fasta')
            else:
                print("Can't find %s to extract unique sequences..." %
                      cdr3_file,
                      file=sys.stderr)
    if "nucleotide" in arguments['--file']:
        aa_file = re.sub("nucleotide", "amino_acid", arguments['--file'])
        if os.path.isfile(aa_file):
            with open("%s_unique.fa" % os.path.splitext(aa_file)[0],
                      "w") as handle:
                SeqIO.write(getUniques(aa_file, size), handle, 'fasta')
        else:
            print("Can't find %s to extract unique sequences..." % aa_file,
                  file=sys.stderr)

    #now do AIRR output
    if "output/sequences/nucleotide" in arguments['--file']:
        if os.path.isfile("%s/%s_rearrangements.tsv" %
                          (prj_tree.tables, prj_name)):
            clustered = airr.derive_rearrangement(
                "updateRearrangements.tsv",
                "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name),
                fields=['centroid', 'cluster_count'])
            for r in airr.read_rearrangement("%s/%s_rearrangements.tsv" %
                                             (prj_tree.tables, prj_name)):
                #clear old annotations in case we ran 1.4 previously
                r['centroid'] = ""
                r['cluster_count'] = ""

                #now add back current annotations
                #two rounds of clustering means we start by looking for the centroid of the centroid,
                #    falling back to the first level centroid (second clustering step only) if appropriate
                r['centroid'] = centroid.get(
                    centroid.get(r['sequence_id'], ""),
                    centroid.get(r['sequence_id'], ""))

                #add cluster size information for final centroids. I am doing away with changing the 'status' of
                #    the centroids to 'unique' because I've started using this script in a lot of cases where it
                #    doesn't make sense to treat 'unique' as a subset of 'good', and I therefore need to preserve
                #    the original status designation. To find centroids, look for a non-null 'cluster_count' field
                if r['sequence_id'] in size:
                    r['cluster_count'] = size[r['sequence_id']]

                clustered.write(r)
            clustered.close()
            os.rename("updateRearrangements.tsv",
                      "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name))
        else:
            print(
                "Can't find the rearrangements file, not saving data in AIRR format",
                file=sys.stderr)

    # call 1.5 if requested
    if arguments['--runCellStatistics']:
        cmd = "%s/annotate/1.5-single_cell_statistics.py" % SCRIPT_FOLDER
        if arguments['--rearrangements'] is not None:
            cmd += " --rearrangements %s" % arguments['--rearrangements']
        if arguments['--save'] is not None:
            cmd += " --save %s" % arguments['--save']

        print("Calling 1.5 with command line: %s" % cmd)
        os.system(cmd)
newd.close()

data = airr.read_rearrangement('my_data.tsv')
print(data.fields)
print(data.external_fields)
for r in data:
    print(r)

# create a derived rearrangements file with additional annotation
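# (derive_rearrangement copies the field set of the template file and appends the extra
#  fields; the returned writer takes one dict per row)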
print('*****')
print('*****')
print('Derive rearrangements file from another.')
print('*****')
print('*****')
mored = airr.derive_rearrangement('more_data.tsv',
                                  'my_data.tsv',
                                  fields=['new_field', 'more_annotation'])
print(mored.fields)
print(mored.external_fields)
for r in airr.read_rearrangement('my_data.tsv'):
    r['new_field'] = 'A'
    r['more_annotation'] = 'B'
    print(r)
    mored.write(r)
mored.close()

# validate rearrangements file
print('*****')
print('*****')
print('Validate rearrangements file.')
print('*****')
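# a minimal sketch of the validation step, assuming the airr library's
# validate_rearrangement helper; it returns True when the file passes schema checks
valid = airr.validate_rearrangement('more_data.tsv')
print('more_data.tsv is %s' % ('valid' if valid else 'INVALID'))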
def main():

    #first, open the input file and parse into groups with same V/J
    vj_partition = dict()
    cdr3_info = dict()
    seqSize = Counter()

    #start off by getting size annotations
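    #    (a "cluster_count=N" label from upstream clustering sets each read's weight;
    #    reads without a label default to a weight of 1)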
    for read in generate_read_fasta(arguments['--full']):
        seqSize[read.id] = 1
        check = re.search("cluster_count=(\d+)", read.description)
        if check:
            seqSize[read.id] = int(check.group(1))

    gene_pat = re.compile(
        r"(?:v_call|V_gene)=IG([HKL]V[^*]+).*(?:j_call|J_gene)=IG([HKL]J\d)")
    for sequence in SeqIO.parse(open(arguments['--cdr3'], "r"), "fasta"):
        genes = re.search(gene_pat, sequence.description)
        if genes:
            key = genes.group(1) + "_" + genes.group(2)
            key = re.sub(
                "[()/]", "",
                key)  #so /OR or (II) genes don't screw up the file system
            if key not in vj_partition:
                temp = "%s/%s.fa" % (prj_tree.lineage, key)
                vj_partition[key] = {
                    'group': key,
                    'handle': open(temp, "w"),
                    'file': temp,
                    'count': 0,
                    'ids': []
                }

            vj_partition[key]['count'] += 1
            vj_partition[key]['ids'].append(sequence.id)
            cdr3_info[sequence.id] = {
                'cdr3_len': int(len(sequence.seq) / 3),
                'cdr3_seq': sequence.seq.translate()
            }

            #make sizes available to vsearch
            sequence.id += ";size=%d" % seqSize[
                sequence.id]  #do this even if there's no label
            #so I don't need to divide the cases for vsearch
            #and write
            SeqIO.write([sequence], vj_partition[key]['handle'], 'fasta')
        else:
            print("Couldn't find V and J genes for %s %s, skipping..." %
                  (sequence.id, sequence.description))

    global natives
    natives = dict()
    if arguments['--natives'] is not None:
        natives = load_fastas(arguments['--natives'])
        for n, s in natives.items():
            if arguments['-v'] is not None:
                key = arguments['-v'] + "_" + arguments['-j']
            else:
                genes = re.search(gene_pat, s.description)
                if genes:
                    key = genes.group(1) + "_" + genes.group(2)
                else:
                    sys.exit(
                        "Can't find V and J gene annotations for native sequence %s. Please specify using the -v and -j parameters."
                        % n)

            key = re.sub(
                "[()/]", "", key
            )  #wouldn't expect this to be relevant for natives, but just in case...

            if key not in vj_partition:
                print(
                    "No NGS sequences with the same V/J genes as native sequence %s (%s); skipping..."
                    % (n, key))
                continue

            seqSize[n] = 0
            s.id += ";size=1"
            vj_partition[key]['count'] += 1
            vj_partition[key]['ids'].append(n)
            cdr3_info[n] = {
                'cdr3_len': int(len(s.seq) / 3),
                'cdr3_seq': s.seq.translate()
            }
            SeqIO.write([s], vj_partition[key]['handle'], 'fasta')

    #close the file handles and delete the reference, so dict can be pickled for multithreading
    for cluster in vj_partition:
        vj_partition[cluster]['handle'].close()
        del vj_partition[cluster]['handle']

    #now go through and cluster each V/J grouping
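    #    (each processClusters call returns a dict with 'cl' = read-to-centroid lookup,
    #    'cd' = per-centroid metadata, and 'cs' = a Counter of cluster sizes)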
    clusterLookup = dict()
    centroidData = dict()
    clusterSizes = Counter()
    if arguments['-t'] > 1:
        pool = Pool(arguments['-t'])
        blob = pool.map(processClusters,
                        iterator_slice(
                            vj_partition.values(),
                            25))  #number per slice needs optimization
        pool.close()
        pool.join()

        for d in blob:
            clusterLookup.update(d['cl'])
            centroidData.update(d['cd'])
            clusterSizes.update(d['cs'])
    else:
        #don't thread
        d = processClusters((0, vj_partition.values()))
        clusterLookup.update(d['cl'])
        centroidData.update(d['cd'])
        clusterSizes.update(d['cs'])

    #now process all clusters and do tabular output
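    #    (clusters are ranked by size; the zero-padded rank becomes the clone_id)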
    with open("%s/%s_lineages.txt" % (prj_tree.tables, prj_name),
              "w") as handle:
        writer = csv.writer(handle, delimiter=sep)
        writer.writerow([
            "clone_id", "sequence_id", "v_call", "j_call",
            "junction_length_aa", "junction_aa", "clone_count", "included_mAbs"
        ])
        for rank, (centroid, size) in enumerate(clusterSizes.most_common()):
            centroidData[centroid]['rank'] = rank + 1
            writer.writerow([
                "%05d" % (rank + 1), centroid, centroidData[centroid]['vgene'],
                centroidData[centroid]['jgene'],
                cdr3_info[centroid]['cdr3_len'],
                cdr3_info[centroid]['cdr3_seq'], size,
                ",".join(centroidData[centroid]['nats'])
            ])

    #do sequence output
    notationFile = re.sub(r"\.f.+", "_lineageNotations.fa", arguments['--full'])
    repFile = re.sub(r"\.f.+", "_lineageRepresentatives.fa",
                     arguments['--full'])

    rep_seqs = []
    with open(notationFile, "w") as handle:
        for read in generate_read_fasta(arguments['--full']):
            if ";" in read.id:
                read.id = read.id[
                    0:8]  #this is for raw VSearch output with size annotations
                #shouldn't be relevant in pipeline context
            if read.id not in clusterLookup: continue
            read.description += " clone_id=%05d clone_rep=%s clone_count=%d" % (
                centroidData[clusterLookup[read.id]]['rank'],
                clusterLookup[read.id], clusterSizes[clusterLookup[read.id]])
            SeqIO.write([read], handle, "fasta")
            if read.id in centroidData:
                rep_seqs.append(read)

    with open(repFile, "w") as handle:
        #use a sort to put them out in order of lineage rank (ie size)
        SeqIO.write(
            sorted(rep_seqs, key=lambda cent: centroidData[cent.id]['rank']),
            handle, "fasta")

    #do AIRR output
    if os.path.dirname(arguments['--full']) == prj_tree.nt:
        if os.path.isfile("%s/%s_rearrangements.tsv" %
                          (prj_tree.tables, prj_name)):
            withLin = airr.derive_rearrangement(
                "updateRearrangements.tsv",
                "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name),
                fields=["clone_id", "clone_count"])
            for r in airr.read_rearrangement("%s/%s_rearrangements.tsv" %
                                             (prj_tree.tables, prj_name)):
                if r['sequence_id'] in clusterLookup:
                    r['clone_id'] = "%05d" % centroidData[clusterLookup[
                        r['sequence_id']]]['rank']
                    r['clone_count'] = clusterSizes[clusterLookup[
                        r['sequence_id']]]
                else:
                    #prevent mix-and-match data if this gets run multiple times with multiple settings
                    r['clone_id'] = ""
                    r['clone_count'] = ""

                withLin.write(r)
            withLin.close()
            os.rename("updateRearrangements.tsv",
                      "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name))
def main():

    #start by making possible "duplicate_count" info available to vsearch
    with open("temp.fa", "w") as handle:
        SeqIO.write(reformatInput(arguments['-f']), handle, "fasta")

    #first step on higher identity
    subprocess.call([
        vsearch, "-derep_fulllength", "temp.fa", "-output", "temp_dedup.fa",
        "-uc", "temp.uc", "-sizein", "-sizeout", "-minuniquesize",
        arguments['--min1']
    ])

    #process the uc file
    centroid = dict()
    with open("temp.uc", "r") as handle:
        uc = csv.reader(handle, delimiter="\t")
        for row in uc:
            if row[0] == "H":
                centroid[row[8]] = row[9]

    #second clustering step
    subprocess.call([
        vsearch, "-cluster_size", "temp_dedup.fa", "-sizein", "-sizeout",
        "-maxgaps", "0", "-id", arguments['--id'], "-uc",
        "%s.cluster" % os.path.splitext(arguments['-f'])[0]
    ])

    #process the uc file
    size = dict()
    with open("%s.cluster" % os.path.splitext(arguments['-f'])[0],
              "r") as handle:
        uc = csv.reader(handle, delimiter="\t")
        for row in uc:
            if row[0] == "H":
                centroid[re.sub(";size=\d+", "",
                                row[8])] = re.sub(";size=\d+", "", row[9])
            elif row[0] == "C" and int(row[2]) >= arguments['--min2']:
                size[re.sub(";size=\d+", "", row[8])] = int(row[2])

    #clean up
    os.remove("temp.fa")
    os.remove("temp_dedup.fa")
    os.remove("temp.uc")

    #do sequence outputs
    with open("%s_unique.fa" % os.path.splitext(arguments['-f'])[0],
              "w") as handle:
        SeqIO.write(getUniques(arguments['-f'], size), handle, 'fasta')

    #retrieve unique CDR3s (and do AA seqs as appropriate)
    if "goodVJ" in arguments['-f']:
        cdr3_file = re.sub("goodVJ", "goodCDR3", arguments['-f'])
        if os.path.isfile(cdr3_file):
            with open("%s_unique.fa" % os.path.splitext(cdr3_file)[0],
                      "w") as handle:
                SeqIO.write(getUniques(cdr3_file, size), handle, 'fasta')
        else:
            print("Can't find %s to extract unique sequences..." % cdr3_file,
                  file=sys.stderr)

        if "nucleotide" in cdr3_file:
            cdr3_file = re.sub("nucleotide", "amino_acid", cdr3_file)
            if os.path.isfile(cdr3_file):
                with open("%s_unique.fa" % os.path.splitext(cdr3_file)[0],
                          "w") as handle:
                    SeqIO.write(getUniques(cdr3_file, size), handle, 'fasta')
            else:
                print("Can't find %s to extract unique sequences..." %
                      cdr3_file,
                      file=sys.stderr)
    if "nucleotide" in arguments['-f']:
        aa_file = re.sub("nucleotide", "amino_acid", arguments['-f'])
        if os.path.isfile(aa_file):
            with open("%s_unique.fa" % os.path.splitext(aa_file)[0],
                      "w") as handle:
                SeqIO.write(getUniques(aa_file, size), handle, 'fasta')
        else:
            print("Can't find %s to extract unique sequences..." % aa_file,
                  file=sys.stderr)

    #now do AIRR output
    if arguments[
            '-f'] == "output/sequences/nucleotide/%s_goodVJ.fa" % prj_name:
        if os.path.isfile("%s/%s_rearrangements.tsv" %
                          (prj_tree.tables, prj_name)):
            clustered = airr.derive_rearrangement(
                "updateRearrangements.tsv",
                "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name),
                fields=['centroid', 'cluster_count'])
            for r in airr.read_rearrangement("%s/%s_rearrangements.tsv" %
                                             (prj_tree.tables, prj_name)):
                r['centroid'] = centroid.get(
                    centroid.get(r['sequence_id'], ""),
                    centroid.get(r['sequence_id'], ""))
                if r['sequence_id'] in size:
                    r['cluster_count'] = size[r['sequence_id']]
                    r['status'] = "unique"
                elif r['sequence_id'] in centroid:
                    #prevent mix-and-match data if this gets run multiple times with multiple settings
                    #I can get away with this because rearrangements.tsv only gets edited when clustering
                    #   the goodVJ file
                    r['cluster_count'] = ""
                    r['status'] = "good"
                clustered.write(r)
            clustered.close()
            os.rename("updateRearrangements.tsv",
                      "%s/%s_rearrangements.tsv" % (prj_tree.tables, prj_name))
        else:
            print(
                "Can't find the rearrangements file, not saving data in AIRR format",
                file=sys.stderr)