Exemple #1
0
def checkCDR3_fastq(mock_dict,
                    fastqfile,
                    hits_out=False,
                    misses_out=False,
                    failures_out=False,
                    title_split=' ',
                    verbose=True):
    """
    Compares the sliced CDR3 in a fastq file with those simulated by MiXCR (and given in a .descr file)

    The read ID is the FASTQ title up to the first ';', then up to the first occurrence of title_split.
    Every ID in mock_dict is classed as one of:
        hit  = the FASTQ sequence equals mock_dict[id]['CDR3'] (case-insensitive)
        fail = the FASTQ sequence is 'n', or the ID (or its 'CDR3' entry) cannot be looked up;
               IDs that cannot be looked up are printed and written with an empty sequence
        miss = anything else
    Input:
        mock_dict = dictionary keyed by read ID; each value holds the expected sequence under 'CDR3'
        fastqfile = path of the FASTQ file, read with reptools.FASTQparser
        hits_out, misses_out, failures_out = optional output paths.  Records are written as
            '>id' / sequence line pairs.  Where a path is falsy, reptools.dummy_context_mgr() is used instead
        title_split = separator used to trim the FASTQ title down to the read ID
        verbose = if True, print the counts and the hit/miss percentages
    Output:
        dictionary with the 'hit', 'miss' and 'fail' counts
    """
    import contextlib

    import reptools

    observed = {}
    with open(fastqfile) as infile:
        for title, seq, _qual in reptools.FASTQparser(infile):
            read_id = title.split(';')[0].split(title_split)[0]
            observed[read_id] = seq

    results = {'hit': 0, 'miss': 0, 'fail': 0}

    with contextlib.ExitStack() as stack:

        def sink(path):
            # Text mode: the records written below are str, which a handle opened with 'wb' rejects
            return stack.enter_context(
                open(path, 'w') if path else reptools.dummy_context_mgr())

        hits_handle = sink(hits_out)
        misses_handle = sink(misses_out)
        failures_handle = sink(failures_out)

        for read_id in mock_dict:
            try:
                found = observed[read_id]
                expected = mock_dict[read_id]['CDR3']
            except KeyError:
                results['fail'] += 1
                print(read_id)
                failures_handle.write('>%s\n%s\n' % (read_id, ''))
                continue

            if found.lower() == expected.lower():
                outcome, handle = 'hit', hits_handle
            elif found.lower() == 'n':
                outcome, handle = 'fail', failures_handle
            else:
                outcome, handle = 'miss', misses_handle
            results[outcome] += 1
            handle.write('>%s\n%s\n' % (read_id, found))

    if verbose:
        total = float(sum(results.values()))
        # an empty mock_dict gives total == 0; report 0.0 rather than dividing by zero
        pct_hit = results['hit'] / total * 100 if total else 0.0
        pct_miss = results['miss'] / total * 100 if total else 0.0
        print(('hit = %s' % results['hit']))
        print(('miss = %s' % results['miss']))
        print(('failed = %s' % results['fail']))
        print(('pct hit = %s' % pct_hit))
        print(('pct miss = %s' % pct_miss))
    return (results)
Exemple #2
0
def saveFASTX(seqs_np, gene_ids_sets, FASTAout, FASTQout, gene_labels):
    """
    Write every record with a positive read count to FASTA and/or FASTQ, most numerous first.

    Records are written in descending read count; within one count, in the iteration order of seqs_np and
    then in row order.  The title of each record is 'name;gene string;size=count'.
    Input:
        seqs_np = a dictionary with one entry per sequence length.  Each value is indexed as:
            [0] sequences (rows of character codes, converted with chr()), [1] gene id codes,
            [2] probabilities, [3] read counts, [4] sequence names
        gene_ids_sets, gene_labels = passed to make_gene_strings() to build the gene string for each gene id code
        FASTAout, FASTQout = output paths.  Where a path is falsy, reptools.dummy_context_mgr() is used instead
    Output:
        None
    The qual character for a probability p is chr(q + 33) where q = round(-10 * log10(p)); any q that is
    not <= 93 is written as chr(126).
    """
    #make gene_strings
    gene_ids = make_gene_strings(gene_ids_sets, gene_labels)
    #collect the read counts which actually occur, so that only those are visited (rather than every integer
    #from the maximum down to 1); this also copes with an empty seqs_np or an empty counts array
    positive_counts = set()
    for seqlen in seqs_np:
        observed = np.unique(seqs_np[seqlen][3])
        positive_counts.update(
            observed[observed > 0].tolist())  #don't want to write where count=0
    #descend through counts
    with open(
            FASTAout,
            'w') if FASTAout else reptools.dummy_context_mgr() as fasta_handle:
        with open(FASTQout, 'w') if FASTQout else reptools.dummy_context_mgr(
        ) as fastq_handle:
            for count in sorted(positive_counts, reverse=True):
                for seqlen in seqs_np:
                    record = seqs_np[seqlen]
                    towrite = np.where(record[3] == count)[0]
                    for x in towrite:
                        title = '{};{};size={}'.format(
                            record[4][x], gene_ids[seqlen][record[1][x]],
                            record[3][x])
                        seq = ''.join([chr(c) for c in record[0][x]])
                        try:
                            phred = [
                                np.around(np.multiply(np.log10(prob), -10),
                                          decimals=0)  #TODO - change for logs
                                for prob in record[2][x]  #TODO - change for logs
                            ]
                            qual = ''.join([
                                chr(int(c + 33)) if c <= 93 else chr(126)
                                for c in phred
                            ])
                        except Exception:
                            #report which record could not be converted, then let the error propagate
                            print(seqlen)
                            print(x)
                            print((record[2][x]))
                            raise
                        fasta_handle.write('>{}\n{}\n'.format(title, seq))
                        fastq_handle.write('@{}\n{}\n+\n{}\n'.format(
                            title, seq, qual))
Exemple #3
0
def simplify_genes(seqs_np, gene_ids_sets, clust_file, seqlens=False):
    """
    remove unnecessary ambiguities in gene segment IDs:
        where a sequence is identical to another, the rarer sequence is combined with the more common, IF the two
        sequences have at least one gene segment ID in common for each gene in the id line.
        The daughter record will keep the gene segment IDs of the more numerous parent.
        The rarer parent will have its read count set to zero.
        If the two records have an equal read count, the simpler (i.e. shorter) set of possible gene segments is
        taken for each gene (so that the output selection may take some genes from one sequence, and some from the
        other).
        TODO: daughter qual score is the posterior probability of the input qual scores (weighted by read count)
        Input:
        seqs_np = a dictionary of lists of numpy arrays, one entry per sequence length, with the value being a list
        of numpy arrays for: sequences, gene id codes, probabilities, read counts, sequence names
        gene_ids_sets = a dictionary of dictionaries, one entry per sequence length, with the value being a dictionary
        where keys match the integers in the gene id codes numpy array, and the values are a list of sets, giving the
        gene segments identified for each gene (one set per gene).
        clust_file = path to which the merges are reported through reptools.save_changes().  Where falsy,
        reptools.dummy_context_mgr() is used instead
        seqlens = the sequence lengths to process.  Where falsy, every key of seqs_np is processed
        Output:
            gene_ids_sets.  Modified gene set dictionary.  Numpy arrays are modified in place
        Raises:
            ValueError if a new gene id code would exceed 65535
    """
    if not seqlens:
        seqlens = list(seqs_np.keys())
    seqlens = [
        ln for ln in seqlens if ln > 1
    ]  #single nt sequences break the array creation code, and are not useful
    with open(clust_file, 'w') if clust_file else reptools.dummy_context_mgr(
    ) as clust_handle:
        for seqlen in seqlens:
            changes = {}
            seq_array, gene_id_array, prob_array, counts_array, seq_names = seqs_np[
                seqlen]
            gene_sets = gene_ids_sets[
                seqlen]  #same dictionary object, so new entries reach gene_ids_sets
            reptools.sort_by_freq(seq_array, gene_id_array, prob_array,
                                  counts_array, seq_names)
            #iterate from least to most common (missing the very most common, as there will be nothing to compare it with)
            for n in range(len(counts_array) - 1, 0, -1):
                #compare this row with all at least as common (i.e. earlier) rows
                identical = np.where(
                    identical_rows(seq_array[:n], seq_array[n, :]))[0]
                #work through the identical rows, from the first (most common) on, checking for gene identity
                for p in identical:
                    qry_genes = gene_sets[gene_id_array[n]]
                    targ_genes = gene_sets[gene_id_array[p]]
                    #no match if, for any gene, the two rows have no gene segment in common
                    if any(
                            _q.isdisjoint(_t)
                            for _q, _t in zip(qry_genes, targ_genes)):
                        continue
                    #check if the read count is equal (should never be less)
                    if counts_array[p] == counts_array[n]:
                        #if so, take the simplest gene set for each
                        newgeneset = [
                            _q if len(_q) < len(_t) else _t
                            for _q, _t in zip(qry_genes, targ_genes)
                        ]
                        #is this combination of gene ids already in the dictionary?
                        known_code = next(
                            (k for k, v in gene_sets.items()
                             if v == newgeneset), None)
                        if known_code is not None:
                            gene_id_array[
                                p] = known_code  #if so, set gene_id code
                        else:  #if not, create an entry
                            newentry = max(gene_sets) + 1
                            if newentry > 65535:
                                raise ValueError(
                                    'Too many unique gene id combinations (>65535)'
                                )
                            gene_sets[newentry] = newgeneset
                            gene_id_array[p] = newentry

                    #combine read counts
                    counts_array[p] += counts_array[n]
                    counts_array[n] = 0
                    #record changes
                    changes.setdefault(seq_names[p], []).append(seq_names[n])
                    break  #if a match was found, continue to the next rarest row
            reptools.save_changes(clust_handle, changes, seq_names,
                                  counts_array)
    return (gene_ids_sets)
Exemple #4
0
def denoise_indelonly(seqs_np,
                      gene_ids_sets,
                      threshold,
                      clust_file,
                      seqlens=False):
    """
    denoise reads of differing length, removing indels only (and no more than one indel):
        Sequence lengths are processed in ascending order, and a length is only compared with the previously
        processed length when the two differ by exactly one base.
        Each column of the longer sequence is masked in turn and the remainder is passed to find_indels() along
        with the shorter sequences.  Candidate pairs must have at least one gene segment ID in common for each
        gene in the id line.
        Reads are moved when count(receiver) / (count(donor) * threshold) >= 1.  Both directions (longer into
        shorter, shorter into longer) are scored, and the direction with the larger best ratio is tried.
        The donor record has its read count set to zero.  Gene id codes are not reassigned by this function, so
        the receiving record keeps its own gene segment IDs.
        The comparison is repeated for a sequence length until a pass makes no further change.
        TODO: Is there a way to modify qual scores with indels?  Nothing obvious (perhaps with insertions?)
        Input:
        seqs_np = a dictionary of lists of numpy arrays, one entry per sequence length, with the value being a list
        of numpy arrays for: sequences, gene id codes, probabilities, read counts, sequence names
        gene_ids_sets = a dictionary of dictionaries, one entry per sequence length, with the value being a dictionary
        where keys match the integers in the gene id codes numpy array, and the values are a list of sets, giving the
        gene segments identified for each gene (one set per gene).
        threshold = multiplier applied to the donor read count when deciding whether reads are moved
        clust_file = path to which the merges are reported through reptools.save_changes().  Where falsy,
        reptools.dummy_context_mgr() is used instead
        seqlens = the sequence lengths to process.  Where falsy, every key of seqs_np is processed
        Output:
            None.  Numpy arrays are modified in place
            If fewer than two sequence lengths (greater than 1) are available, a message is printed and nothing
            else is done.
    """
    if not seqlens:
        seqlens = list(seqs_np.keys())
    seqlens = [
        ln for ln in seqlens if ln > 1
    ]  #single nt sequences break the array creation code, and are not useful
    seqlens = sorted(seqlens)  #ascending, so that "previous" is always the shorter length
    if len(seqlens) < 2:
        print(
            'indel denoising cannot be performed when all sequences are the same length'
        )
        return
    previous_seq_array = None  #None marks the first length, which has nothing to be compared with
    with open(clust_file, 'w') if clust_file else reptools.dummy_context_mgr(
    ) as clust_handle:
        for seqlen in seqlens:
            seq_array, gene_id_array, prob_array, counts_array, seq_names = seqs_np[
                seqlen]
            changes = {}  #receiving sequence name -> list of donor sequence names
            if previous_seq_array is not None:
                if seqlen - previous_seqlen == 1:  #only look for indels when seqlength delta==1
                    #the previous seq_array is one base shorter than the present one
                    #so want to mask each position in the current one in turn, and get the hamming distance
                    changes_made = True
                    #loopcounter=0
                    while changes_made:  #iterate until no more improvements
                        changes_made = False
                        #loopcounter+=1
                        #print('Seqlen={}, iteration={}'.format(str(seqlen),str(loopcounter)))
                        reptools.sort_by_freq(seq_array, gene_id_array,
                                              prob_array, counts_array,
                                              seq_names)
                        #firstzero = index of the first zero-count row (or the row count, if there is none)
                        if np.any(counts_array == 0):
                            firstzero = np.where(counts_array == 0)[0][0]
                        else:
                            firstzero = seq_array.shape[0]
                        for n in range(
                                0, firstzero
                        ):  #iterate from most to least common (but non-zero)
                            #indels: one flag per previous-length row, accumulated over every masked column
                            indels = np.full(previous_seq_array.shape[0],
                                             False)
                            b = np.full(seq_array.shape[1], True)
                            for x in range(
                                    seq_array.shape[1]
                            ):  #loop through the columns, sliding the missing column across
                                c = np.copy(b)
                                c[x] = False  #set missing column
                                #look for perfect matches (with the missing column excluded)
                                indels = find_indels(indels,
                                                     previous_seq_array,
                                                     seq_array[n, c])
                            indels = np.logical_and(indels,
                                                    previous_counts_array >
                                                    0)  #exclude read zero seqs
                            if np.any(indels):
                                indels_where = np.where(indels)[0]
                                #ratio for moving the current row's reads into each candidate previous row
                                cur_into_prev_ratios = previous_counts_array[
                                    indels_where] / (
                                        counts_array[n] * threshold
                                    )  #rewrite to avoid division?
                                #ratio for moving each candidate previous row's reads into the current row
                                prev_into_cur_ratios = counts_array[n] / (
                                    previous_counts_array[indels_where] *
                                    threshold)
                                #these two arrays will index indels_where
                                #a value of >1 meets the threshold
                                #check genes for all where >1
                                gene_matches = [False] * len(indels_where)
                                for p1, p2 in enumerate(indels_where):
                                    qry_genes = gene_ids_sets[seqlen][
                                        gene_id_array[n]]
                                    targ_genes = gene_ids_sets[
                                        previous_seqlen][
                                            previous_gene_id_array[p2]]
                                    #the sum counts the genes for which the two sets are disjoint; zero means
                                    #every gene has at least one segment in common
                                    if sum([
                                            len(_q.union(_t)) == len(_q) +
                                            len(_t) for _q, _t in zip(
                                                qry_genes, targ_genes)
                                    ]) == 0:
                                        gene_matches[
                                            p1] = True  #gene_matches will reference indels_where, and also the ratios
                                if np.any(gene_matches):
                                    cur_into_prev_ratios = cur_into_prev_ratios * gene_matches  #set ratios to 0 where no match
                                    prev_into_cur_ratios = prev_into_cur_ratios * gene_matches
                                    cur_into_prev_max_idx = np.argmax(
                                        cur_into_prev_ratios)  #get best ratio
                                    prev_into_cur_max_idx = np.argmax(
                                        prev_into_cur_ratios)
                                    #which direction do we prefer to move the reads?
                                    if (cur_into_prev_ratios[
                                            cur_into_prev_max_idx] >
                                            prev_into_cur_ratios[
                                                prev_into_cur_max_idx]):
                                        if cur_into_prev_ratios[
                                                cur_into_prev_max_idx] >= 1:
                                            #print((cur_into_prev_ratios[cur_into_prev_max_idx]))
                                            #move the current row's reads into the best previous-length row
                                            targetrow = indels_where[
                                                cur_into_prev_max_idx]
                                            previous_counts_array[
                                                targetrow] += counts_array[n]
                                            counts_array[n] = 0
                                            #NOTE(review): this merge is keyed by a previous-length name but is
                                            #stored in the current length's changes dictionary, which is later
                                            #saved alongside the current length's names and counts - confirm
                                            #that reptools.save_changes() copes with that
                                            try:
                                                changes[previous_seq_names[
                                                    targetrow]].append(
                                                        seq_names[n])
                                            except KeyError:
                                                changes[previous_seq_names[
                                                    targetrow]] = [
                                                        seq_names[n]
                                                    ]
                                            changes_made = True
                                    else:
                                        if prev_into_cur_ratios[
                                                prev_into_cur_max_idx] >= 1:
                                            #print((prev_into_cur_ratios[prev_into_cur_max_idx]))
                                            #move the best previous-length row's reads into the current row
                                            targetrow = indels_where[
                                                prev_into_cur_max_idx]
                                            counts_array[
                                                n] += previous_counts_array[
                                                    targetrow]
                                            previous_counts_array[
                                                targetrow] = 0
                                            try:
                                                changes[seq_names[n]].append(
                                                    previous_seq_names[
                                                        targetrow])
                                            except KeyError:
                                                changes[seq_names[n]] = [
                                                    previous_seq_names[
                                                        targetrow]
                                                ]
                                            changes_made = True
                #the previous length's changes are saved once the current length has been processed (whether or
                #not the two lengths were compared)
                reptools.save_changes(clust_handle, previous_changes,
                                      previous_seq_names,
                                      previous_counts_array)

                reptools.sort_by_freq(  #resort, to allow omission of zeros
                    seq_array, gene_id_array, prob_array, counts_array,
                    seq_names)

            if np.any(counts_array == 0):
                firstzero = np.where(counts_array == 0)[0][0]
            else:
                firstzero = seq_array.shape[0]

            previous_seqlen = seqlen
            #the next four lines previously used np.copy.  I don't think this is necessary, and is not desirable for modify
            #in place
            #only the rows before firstzero are carried forward for comparison with the next length
            previous_seq_array = seq_array[0:firstzero, :]
            previous_gene_id_array = gene_id_array[0:firstzero]
            previous_counts_array = counts_array[0:firstzero]
            previous_seq_names = seq_names[0:firstzero]
            previous_changes = changes
        #output: save current changes (because they will never be saved as the previous changes)
        reptools.save_changes(clust_handle, changes, seq_names, counts_array)

    return  #modification should have occurred in place
Exemple #5
0
def denoise_substitutions(seqs_np,
                          gene_ids_sets,
                          threshold,
                          clust_file,
                          weight_by_qual=True,
                          seqlens=False):
    """
    denoise reads of same length:
        Each non-zero row n (in the order left by reptools.sort_by_freq()) is compared with every later row.
        Its reads are moved into the first later row for which
            count(later row) >= threshold ** distance * count(row n)
        IF the two sequences have at least one gene segment ID in common for each gene in the id line.
        distance is the hamming distance, or, if weight_by_qual=True (the default), the row sum of the weights
        returned by generic_chance_of_miss().
        The receiving record keeps its own gene id code, and its probabilities are multiplied by
        ((1 - donor probabilities) / 3) ** donor read count.
        The donor record will have its read count set to zero.
        The comparison is repeated for a sequence length until a pass makes no further change.
        TODO: daughter qual score is the posterior probability of the input qual scores (weighted by read count)
        Input:
        seqs_np = a dictionary of lists of numpy arrays, one entry per sequence length, with the value being a list
        of numpy arrays for: sequences, gene id codes, probabilities, read counts, sequence names
        gene_ids_sets = a dictionary of dictionaries, one entry per sequence length, with the value being a dictionary
        where keys match the integers in the gene id codes numpy array, and the values are a list of sets, giving the
        gene segments identified for each gene (one set per gene).
        threshold = base of the power used to scale the donor read count (see above)
        clust_file = path to which the merges are reported through reptools.save_changes().  Where falsy,
        reptools.dummy_context_mgr() is used instead
        seqlens = the sequence lengths to process.  Where falsy, every key of seqs_np is processed
        Output:
            None.  Numpy arrays are modified in place
    """
    if not seqlens:  #if False, process all
        seqlens = list(seqs_np.keys())
    seqlens = [
        ln for ln in seqlens if ln > 1
    ]  #single nt sequences break the array creation code, and are not useful
    with open(clust_file, 'w') if clust_file else reptools.dummy_context_mgr(
    ) as clust_handle:
        for seqlen in seqlens:
            changes = {}
            seq_array, gene_id_array, prob_array, counts_array, seq_names = seqs_np[
                seqlen]
            gene_sets = gene_ids_sets[seqlen]
            changes_made = True
            while changes_made:  #iterate until no more improvements
                changes_made = False
                reptools.sort_by_freq(seq_array, gene_id_array, prob_array,
                                      counts_array, seq_names)
                if np.any(counts_array == 0):
                    firstzero = np.where(counts_array == 0)[0][0]
                else:
                    firstzero = seq_array.shape[0]
                #overflow is not reported inside this block - see below.  errstate puts the caller's setting
                #back afterwards, even if an exception escapes, rather than forcing it to 'warn'
                with np.errstate(over='ignore'):
                    for n in range(
                            0, firstzero
                    ):  #iterate from most to least common (but non-zero)
                        matching = np.equal(
                            seq_array[n + 1:, :], seq_array[n, :]
                        )  #compare this row with all subsequent rows
                        if weight_by_qual:
                            #weight by probabilities
                            weights = generic_chance_of_miss(
                                prob_array[n, :], prob_array[n + 1:, :],
                                matching[:, :]
                            )  #TODO - change generic_chance_of_miss() for logs
                            dists = np.sum(
                                weights, axis=1
                            )  #gives hamming distances weighted by qual scores
                        else:
                            dists = np.sum(np.invert(matching), axis=1)
                        #find reads which are "close enough"
                        #the next line may cause overflows, but if the value is too high, it will be insanely
                        #large, and set to Inf, so the comparison should still work
                        hits = np.where(counts_array[n + 1:] >= np.power(
                            threshold, dists, dtype=np.float32) *
                                        counts_array[n])
                        #loop through the hits, checking that the genes match, and continue until a hit is found
                        #where they do.
                        for hit in hits[0]:
                            target = n + 1 + hit  #hits index the rows after n
                            #check gene ids
                            query_genes = gene_sets[gene_id_array[n]]
                            target_genes = gene_sets[gene_id_array[target]]
                            if any(
                                    _q.isdisjoint(_t) for _q, _t in zip(
                                        query_genes, target_genes)):
                                continue  #some gene has no allele in common
                            #all genes have at least one allele in common
                            #v0.14.1: modify qual score of target
                            # calculate prob of base in source ACTUALLY being base in target
                            #   (1-prob_array[n])/3 because a 1/3 chance of each uncalled base
                            source_prob_misread = (
                                (1 - prob_array[n]) /
                                3)**counts_array[n]  #TODO - change for logs
                            #multiply target prob by the calculated source misread prob
                            prob_array[target] = prob_array[
                                target] * source_prob_misread  #TODO - change for logs
                            #add counts from query to target
                            counts_array[target] += counts_array[n]
                            #set query counts to zero
                            counts_array[n] = 0
                            #record changes
                            changes.setdefault(seq_names[target],
                                               []).append(seq_names[n])
                            changes_made = True
                            break
            reptools.save_changes(clust_handle, changes, seq_names,
                                  counts_array)
    return  #numpy arrays will have been modified in-place, and gene_ids,gene_ids_sets have not been modified
Exemple #6
0
def checkCDR3_prod(fastqfile,
                   minlen=3 * 5,
                   maxlen=3 * 30,
                   startchars='C',
                   endchars='FWH',
                   hits_out=False,
                   failures_out=False,
                   frameshift_out=False,
                   long_out=False,
                   short_out=False,
                   stop_out=False,
                   bad_out=False,
                   title_split=' ',
                   verbose=True):
    """
    This for use where no reference file is available.
    Reports % of CDR3 which are productive or start with C and end with F/W/H, and are within a sensible length range
    Each read is put in the first class which applies:
        fail       = the sequence is 'n' (no CDR3)
        long       = longer than maxlen
        short      = shorter than minlen
        bad        = the translated first codon is not in startchars, or the translated last codon is not in
                     endchars (case-insensitive)
        stop       = the translation contains '*'
        frameshift = the length is not a multiple of 3
        good       = none of the above
    Input:
        fastqfile = path of the FASTQ file, read with reptools.FASTQparser
        minlen, maxlen = inclusive length limits, in nucleotides
        startchars, endchars = permitted first and last residues
        hits_out, failures_out, frameshift_out, long_out, short_out, stop_out, bad_out = optional output paths,
            one per class.  Records are written as '>id' / sequence line pairs.  Where a path is falsy,
            reptools.dummy_context_mgr() is used instead
        title_split = separator used to trim the FASTQ title down to the read ID (after cutting at the first ';')
        verbose = if True, print the count and percentage for each class
    Output:
        dictionary of counts, keyed 'good', 'frameshift', 'stop', 'bad', 'long', 'short' and 'fail'
    """
    import contextlib

    import reptools
    results = {
        'good': 0,
        'frameshift': 0,
        'stop': 0,
        'bad': 0,
        'long': 0,
        'short': 0,
        'fail': 0
    }

    with contextlib.ExitStack() as stack:
        infile = stack.enter_context(open(fastqfile))

        def sink(path):
            # Text mode: the records written below are str, which a handle opened with 'wb' rejects
            return stack.enter_context(
                open(path, 'w') if path else reptools.dummy_context_mgr())

        hits_handle = sink(hits_out)
        failures_handle = sink(failures_out)
        shift_handle = sink(frameshift_out)
        long_handle = sink(long_out)
        short_handle = sink(short_out)
        stop_handle = sink(stop_out)
        bad_handle = sink(bad_out)

        allowed_start = startchars.lower()
        allowed_end = endchars.lower()

        for title, seq, _qual in reptools.FASTQparser(infile):
            read_id = title.split(';')[0].split(title_split)[0]
            seq = seq.strip()
            # the order of these tests decides the class of a read which fails more than one of them
            if seq.lower() == 'n':
                outcome, handle = 'fail', failures_handle
            elif len(seq) > maxlen:
                outcome, handle = 'long', long_handle
            elif len(seq) < minlen:
                outcome, handle = 'short', short_handle
            elif (reptools.trans(seq[0:3]).lower() not in allowed_start
                  or reptools.trans(seq[-3:]).lower() not in allowed_end):
                outcome, handle = 'bad', bad_handle
            elif '*' in reptools.trans(seq):
                outcome, handle = 'stop', stop_handle
            elif len(seq) % 3 != 0:
                outcome, handle = 'frameshift', shift_handle
            else:
                outcome, handle = 'good', hits_handle
            results[outcome] += 1
            handle.write('>%s\n%s\n' % (read_id, seq))

    if verbose:
        totalreads = float(sum(results.values()))

        def pct(count):
            # an empty FASTQ gives totalreads == 0; report 0.0 rather than dividing by zero
            return count / totalreads * 100 if totalreads else 0.0

        print(('over length = %s (%s pct)' %
               (results['long'], pct(results['long']))))
        print(('under length = %s (%s pct)' %
               (results['short'], pct(results['short']))))
        print(('bad start/end = %s (%s pct)' %
               (results['bad'], pct(results['bad']))))
        print(('stop codon = %s (%s pct)' %
               (results['stop'], pct(results['stop']))))
        print(('frameshift = %s (%s pct)' %
               (results['frameshift'], pct(results['frameshift']))))
        print(('no CDR3 = %s (%s pct)' %
               (results['fail'], pct(results['fail']))))
        print(('good CDR3 = %s (%s pct)' %
               (results['good'], pct(results['good']))))
    return (results)
Exemple #7
0
def checkgeneID_fastq(mock_dict,
                      gene,
                      fastqfile,
                      title_split=' ',
                      hits_out=False,
                      misses_out=False,
                      failures_out=False,
                      ambiguous_out=False,
                      verbose=True):
    """
    Compare the gene calls annotated in FASTQ titles with the expected genes.

    Each FASTQ title is split on ';'. The read id is the first ';'-field,
    truncated at the first occurrence of `title_split`. The gene call is read
    from the first field of the form '<gene>=<call>[+<call>...]'; every call
    is truncated at the first 'gamma', 'alpha' or '_'.

    The expected gene for a read is `mock_dict[read_id][gene]` with anything
    from the first '*' onwards removed.

    Every id in `mock_dict` is classified as exactly one of:
        hit        a single call that equals the expected gene
        ambiguous  several calls, one of which is the expected gene
        miss       one or more calls, none of which is the expected gene
        fail       the read is absent from the FASTQ, has no `gene` tag, has
                   the single call 'none', or `mock_dict[read_id]` has no
                   `gene` entry

    Args:
        mock_dict: mapping of read id -> mapping containing a `gene` key.
        gene: tag name to look up both in the titles and in `mock_dict`.
        fastqfile: path of the FASTQ file to read.
        title_split: separator that ends the read id inside the title.
        hits_out, misses_out, failures_out, ambiguous_out: optional output
            paths. When given, each read of that category is written as
            '>' + id, newline, the string form of its list of calls, newline.
        verbose: print the counts and hit percentages.

    Returns:
        dict with the integer counts under 'hit', 'miss', 'ambiguous', 'fail'.
    """
    #TODO: add transtable option to replace the [s.split('gamma')[0].split('alpha')[0].split('_')[0] for s in gene_strings] line
    #or, process the mock_dict first, to match
    from contextlib import ExitStack

    import reptools

    called_genes = {}
    with open(fastqfile) as infile:
        for title, _seq, _qual in reptools.FASTQparser(infile):
            fields = title.split(';')
            read_id = fields[0].split(title_split)[0]
            tagged = []
            for field in fields:
                parts = field.split('=')
                # Require an '=' so a bare field equal to `gene` cannot raise.
                if len(parts) > 1 and parts[0] == gene:
                    tagged.append(parts[1])
            if not tagged:
                # No tag for this gene: keep an empty call list so the read is
                # counted as a failure instead of raising IndexError.
                called_genes[read_id] = []
                continue
            called_genes[read_id] = [
                call.split('gamma')[0].split('alpha')[0].split('_')[0]
                for call in tagged[0].split('+')
            ]

    results = {'hit': 0, 'miss': 0, 'ambiguous': 0, 'fail': 0}
    out_paths = {
        'hit': hits_out,
        'miss': misses_out,
        'fail': failures_out,
        'ambiguous': ambiguous_out,
    }

    with ExitStack() as stack:
        # Text mode: the records written below are str, not bytes.
        handles = {
            category: stack.enter_context(open(path, 'w'))
            for category, path in out_paths.items() if path
        }

        def record(category, read_id, genes):
            """Count one read and, if requested, write it to its file."""
            results[category] += 1
            handle = handles.get(category)
            if handle is not None:
                handle.write('>%s\n%s\n' % (read_id, genes))

        for read_id in mock_dict:
            genes = called_genes.get(read_id)
            if not genes:
                # Read missing from the FASTQ, or present without any call.
                record('fail', read_id, genes if genes is not None else [])
                continue
            try:
                expected = mock_dict[read_id][gene].split('*')[0]
            except KeyError:
                record('fail', read_id, genes)
                continue

            if len(genes) == 1:
                if genes[0] == expected:
                    record('hit', read_id, genes)
                elif genes[0] == 'none':
                    # Compare the call itself; comparing the whole list to
                    # 'none' could never be true.
                    record('fail', read_id, genes)
                else:
                    record('miss', read_id, genes)
            elif expected in genes:
                record('ambiguous', read_id, genes)
            else:
                record('miss', read_id, genes)

    if verbose:
        total = float(sum(results.values()))

        def pct(count):
            # Avoid ZeroDivisionError when mock_dict is empty.
            return count / total * 100 if total else 0.0

        print('hit = %s' % results['hit'])
        print('miss = %s' % results['miss'])
        print('ambiguous (including hit) = %s' % results['ambiguous'])
        print('failed = %s' % results['fail'])
        print('pct hit = %s' % pct(results['hit']))
        print('pct hit (including ambiguous hit) = %s' %
              pct(results['hit'] + results['ambiguous']))

    return results