Ejemplo n.º 1
0
def getReadIDs(in1,in2,title_split=' ',clusterID_position=0):
    """
    Collect the read IDs from a pair of FASTQ files that are read in lockstep.

    Every title is split on `title_split` and the field at index
    `clusterID_position` is taken as the read ID.  The two files are walked
    record-by-record in parallel with zip(), so iteration ends as soon as
    either parser is exhausted.

    Returns a list holding one ID per record pair, in file order.
    Raises IOError when the IDs of a record pair differ.
    """
    collected = []
    with open(in1) as handle_a, open(in2) as handle_b:
        record_pairs = zip(
            reptools.FASTQparser(handle_a),
            reptools.FASTQparser(handle_b)
        )
        for record_a, record_b in record_pairs:
            title_a, _seq_a, _qual_a = record_a
            title_b, _seq_b, _qual_b = record_b
            id_a = title_a.split(title_split)[clusterID_position]
            id_b = title_b.split(title_split)[clusterID_position]
            if id_a != id_b:
                raise IOError('Sequence titles do not match between files:\n{}\n{}\n'.format(title_a,title_b))
            collected.append(id_a)
    return collected
Ejemplo n.º 2
0
def countgenehits_hitsfile(fastqFile,
                           hitsFile,
                           type,
                           evalue=False,
                           mincols=False,
                           title_split=' ',
                           verbose=True):
    """
    Tally how many reads of a FASTQ file have one, several or no top hits.

    The top hits are obtained from reptools.retrieve_tophits(), whose result is
    used here as a mapping from read ID to a sized collection of hits.  The
    read ID is the first `title_split`-delimited field of each FASTQ title.

    A read counts as:
      'hit'        exactly one entry in the mapping
      'ambiguous'  more than one entry
      'fail'       an empty entry, or no entry at all

    The three counts are printed when `verbose` is true.
    Returns the dict of counts.
    """
    import reptools
    import io

    results = {'hit': 0, 'ambiguous': 0, 'fail': 0}
    tophits = reptools.retrieve_tophits(hitsFile,
                                        type,
                                        evalue=evalue,
                                        mincols=mincols,
                                        title_split_char=title_split)

    with io.open(fastqFile) as fastq_handle:
        for title, _seq, _qual in reptools.FASTQparser(fastq_handle):
            read_id = title.split(title_split)[0]
            if read_id not in tophits:
                results['fail'] += 1
                continue

            n_hits = len(tophits[read_id])
            if n_hits == 1:
                outcome = 'hit'
            elif n_hits > 1:
                outcome = 'ambiguous'
            else:
                outcome = 'fail'
            results[outcome] += 1

    if verbose:
        print('hit = %s' % results['hit'])
        print('fail = %s' % results['fail'])
        print('ambiguous = %s' % results['ambiguous'])

    return results
Ejemplo n.º 3
0
def fastqcounter(infile):
    """
    Count the sequences represented by a fastq file.

    reptools.DerepCheck() decides how records are weighted: when it returns a
    true value, each record contributes reptools.DerepCount(title); otherwise
    each record contributes 1.

    Returns the accumulated total.
    """
    is_derep = reptools.DerepCheck(infile)

    total = 0
    with open(infile) as handle:
        for title, _seq, _qual in reptools.FASTQparser(handle):
            if is_derep:
                total += reptools.DerepCount(title)
            else:
                total += 1
    return total
Ejemplo n.º 4
0
def checkCDR3_fastq(mock_dict,
                    fastqfile,
                    hits_out=False,
                    misses_out=False,
                    failures_out=False,
                    title_split=' ',
                    verbose=True):
    """
    Compares the sliced CDR3 in a fastq file with those simulated by MiXCR (and given in a .descr file)

    mock_dict    -- mapping of read ID -> mapping that holds the expected
                    sequence under the key 'CDR3'
    fastqfile    -- FASTQ file of sliced CDR3; the read ID is the text of each
                    title before the first ';', cut at `title_split`
    hits_out, misses_out, failures_out
                 -- optional file names; when given, the matching reads are
                    written there as '>id' / sequence pairs
    verbose      -- print the counts and percentages

    Each ID in mock_dict is classified (case-insensitively) as:
      'hit'   observed sequence equals the expected CDR3
      'fail'  observed sequence is 'n', or the ID/CDR3 could not be looked up
      'miss'  anything else

    Returns the dict of counts.
    """
    import reptools

    def _sink(path):
        # Text mode: the records written below are str, which a handle opened
        # with 'wb' rejects on Python 3.
        # NOTE(review): assumes reptools.dummy_context_mgr() yields an object
        # that accepts write() -- confirm.
        return open(path, 'w') if path else reptools.dummy_context_mgr()

    observed_cdr3 = {}
    with open(fastqfile) as infile:
        for title, seq, qual in reptools.FASTQparser(infile):
            read_id = title.split(';')[0].split(title_split)[0]
            observed_cdr3[read_id] = seq

    results = {'hit': 0, 'miss': 0, 'fail': 0}

    with _sink(hits_out) as hits_handle, \
            _sink(misses_out) as misses_handle, \
            _sink(failures_out) as failures_handle:
        for read_id in mock_dict:
            try:
                observed = observed_cdr3[read_id]
                expected = mock_dict[read_id]['CDR3']
            except KeyError:
                # Read absent from the fastq (or no expected CDR3 recorded).
                results['fail'] += 1
                print(read_id)  # printed regardless of `verbose`, as before
                failures_handle.write('>%s\n%s\n' % (read_id, ''))
                continue

            record = '>%s\n%s\n' % (read_id, observed)
            if observed.lower() == expected.lower():
                results['hit'] += 1
                hits_handle.write(record)
            elif observed.lower() == 'n':
                results['fail'] += 1
                failures_handle.write(record)
            else:
                results['miss'] += 1
                misses_handle.write(record)

    if verbose:
        total = float(sum(results.values()))

        def _pct(count):
            # An empty mock_dict gives total == 0; report 0.0 instead of
            # raising ZeroDivisionError.
            return count / total * 100 if total else 0.0

        print('hit = %s' % results['hit'])
        print('miss = %s' % results['miss'])
        print('failed = %s' % results['fail'])
        print('pct hit = %s' % _pct(results['hit']))
        print('pct miss = %s' % _pct(results['miss']))
    return results
Ejemplo n.º 5
0
def DerepCheck(fn):
    """
    Pass the title of the first record in `fn` to is_derepFas() and return the result.

    The file is first read with reptools.FASTAparser.  If that attempt raises
    ValueError, the file is reopened and read with reptools.FASTQparser instead.
    Returns None when the parser that was used yields no records.
    """
    nothing = object()  # sentinel: the parser produced no record at all
    try:
        with open(fn) as handle:
            first = next(iter(reptools.FASTAparser(handle)), nothing)
            if first is not nothing:
                title, _seq = first
                return is_derepFas(title)
    except ValueError:
        with open(fn) as handle:
            first = next(iter(reptools.FASTQparser(handle)), nothing)
            if first is not nothing:
                title, _seq, _qual = first
                return is_derepFas(title)
    return None
Ejemplo n.º 6
0
def fastq2fasta(infile, outfile="", trimstart=0, overwrite=False):
    """
    Convert a FASTQ file to FASTA.

    takes infile and outfile (file names)

    infile     -- name of the FASTQ file to read
    outfile    -- name of the FASTA file to write; when empty it is derived
                  from infile ('x.fastq' -> 'x.fas', anything else gets '.fas'
                  appended)
    trimstart  -- number of leading characters dropped from every sequence
    overwrite  -- allow an existing outfile to be replaced

    Returns the name of the file written.
    Raises IOError if outfile already exists and overwrite is false.
    """
    if outfile == "":
        if infile[-5:] == 'fastq':
            outfile = infile[:-5] + 'fas'
        else:
            outfile = infile + '.fas'
    if os.path.isfile(outfile) and not overwrite:
        raise IOError('Output file (%s) already exists' % (outfile))
    # Both files are opened once. Mode 'w' truncates an existing outfile, so
    # overwrite=True really replaces it rather than appending to stale content,
    # and an input with no records still produces the (empty) file whose name
    # is returned.
    with open(infile, 'r') as in_handle, open(outfile, 'w') as out_handle:
        for title, seq, qual in reptools.FASTQparser(in_handle):
            out_handle.write('>%s\n%s\n' % (title, seq[trimstart:]))
    return outfile
Ejemplo n.º 7
0
def EEfilter_file(infile, FASTAout=False, FASTQout=False, maxee=1):
    """
    Copy the reads of a FASTQ file whose reptools.calculate_EE(qual) value is
    at most `maxee` to a FASTA file and/or a FASTQ file.

    infile    -- FASTQ file to read
    FASTAout  -- optional name of the FASTA output
    FASTQout  -- optional name of the FASTQ output
    maxee     -- inclusive upper limit on calculate_EE(qual) for a read to be kept

    An output that was not requested is replaced by reptools.dummy_context_mgr().
    Returns the result of reptools.removeemptyfile(FASTQout).
    Raises ValueError when neither output is requested.
    """
    from reptools import dummy_context_mgr as dummy
    if not (FASTAout or FASTQout):
        raise ValueError(
            'Please supply one or both of FASTAout and FASTQout to EEfilter()')
    with open(infile) as inhandle, \
            (open(FASTAout, 'w') if FASTAout else dummy()) as outfasta_handle, \
            (open(FASTQout, 'w') if FASTQout else dummy()) as outfastq_handle:
        for title, seq, qual in reptools.FASTQparser(inhandle):
            keep = reptools.calculate_EE(qual) <= maxee
            if not keep:
                continue
            outfasta_handle.write('>{}\n{}\n'.format(title, seq))
            outfastq_handle.write('@{}\n{}\n+\n{}\n'.format(title, seq, qual))
    # per the original note: None if the file was empty (and has been removed), else the fn
    return reptools.removeemptyfile(FASTQout)
Ejemplo n.º 8
0
def derep_FASTQ(fn, clust_file):
    """
    Dereplicate the reads of a FASTQ file, grouping them by sequence length.

    A title is split on ';': the first field is the read title and the
    remaining fields identify the gene assignment.  Two reads are duplicates
    when they have the same length, the same sequence string and the same
    gene fields.  Reads containing any character other than A/T/G/C (either
    case) are skipped.

    fn          -- FASTQ file to read
    clust_file  -- if true, name of a tab-separated file that receives, per
                   line, the title of a kept read followed by the titles of
                   the reads merged into it

    Returns a tuple (seqs, gene_ids):
      seqs[seqlen][(seq, gene_index)] = [count, first title, probabilities]
      gene_ids[seqlen]                = list of gene-field lists; gene_index
                                        is a position in this list
    The probabilities are 10**(-Q/10) with Q = ord(char) - 33 for each quality
    character, multiplied element-wise across duplicates and floored at 1e-12.
    """
    min_prob = 0.000000000001
    unambiguous = ('A', 'T', 'G', 'C', 'a', 't', 'g', 'c')

    seqs = collections.defaultdict(dict)
    # stored in a list to avoid repetition - slower, but saves memory
    gene_ids = collections.defaultdict(list)
    changes = collections.defaultdict(list)

    with open(fn) as in_handle:
        for title, seq, qual in reptools.FASTQparser(in_handle):
            if not all(nt in unambiguous for nt in seq.strip()):
                continue  #omit ambiguous sequences

            title_fields = title.strip().strip(';').split(';')
            first_title = title_fields[0]
            gene_key = title_fields[1:]
            seqlen = len(seq)
            seqprobs = [10**(-float(ord(c) - 33) / 10) for c in qual]

            known_genes = gene_ids[seqlen]
            if gene_key not in known_genes:
                #if the geneid is new, the sequence is new (by definition)
                known_genes.append(gene_key)
                #count is 1 (first occurrence); the first title found is stored for output
                seqs[seqlen][(seq, len(known_genes) - 1)] = [
                    1, first_title, seqprobs
                ]
                continue

            bucket = seqs[seqlen]
            record_key = (seq, known_genes.index(gene_key))
            if record_key not in bucket:
                #if the sequence string is new, but not geneid, the geneid code can be reused
                bucket[record_key] = [1, first_title, seqprobs]
                continue

            #not a unique sequence: log the merge, then fold it into the kept record
            record = bucket[record_key]
            changes[record[1]].append(first_title)
            merged_probs = [
                old * new for old, new in zip(record[2], seqprobs)
            ]
            record[0] += 1
            record[2] = merged_probs

    #very high counts can result in probs of zero (float underrun), which is invalid; so change them to Phred=120
    #TODO = change the probability handling to working with the log probabilities, which will also save
    #memory, as I can then use float16 - N.B. I can't, because numba doesn't support float16 (yet)
    for by_key in seqs.values():
        for record in by_key.values():
            if min(record[2]) < min_prob:
                record[2] = [
                    prob if prob >= min_prob else min_prob
                    for prob in record[2]
                ]

    if clust_file:
        with open(clust_file, 'w') as clust_handle:
            for recipient, donors in changes.items():
                clust_handle.write('{}\t{}\n'.format(
                    recipient, '\t'.join(donors)))

    return (dict(seqs), dict(gene_ids))
Ejemplo n.º 9
0
def checkCDR3_prod(fastqfile,
                   minlen=3 * 5,
                   maxlen=3 * 30,
                   startchars='C',
                   endchars='FWH',
                   hits_out=False,
                   failures_out=False,
                   frameshift_out=False,
                   long_out=False,
                   short_out=False,
                   stop_out=False,
                   bad_out=False,
                   title_split=' ',
                   verbose=True):
    """
    This for use where no reference file is available.
    Reports % of CDR3 which are productive or start with C and end with F/W/H, and are within a sensible length range
    Reads whose sequence is just 'n' are counted as having no CDR3
    Over- and under-length CDR3 are eliminated first
    Then those with a bad start or end residue (not C and F/W/H)
    Then those with a stop
    Then those with a frameshift

    fastqfile  -- FASTQ file of CDR3 sequences
    minlen, maxlen
               -- inclusive length limits, in nucleotides
    startchars, endchars
               -- allowed translations (via reptools.trans) of the first and
                  last three nucleotides, compared case-insensitively
    *_out      -- optional file names; when given, the reads of that category
                  are written there as '>id' / sequence pairs
    title_split
               -- the read ID is the title text before the first ';', cut at
                  this separator
    verbose    -- print the count and percentage of every category

    Returns a dict of counts keyed by 'good', 'frameshift', 'stop', 'bad',
    'long', 'short' and 'fail'.
    """
    import reptools

    def _sink(path):
        # Text mode: the records written below are str, which a handle opened
        # with 'wb' rejects on Python 3.
        # NOTE(review): assumes reptools.dummy_context_mgr() yields an object
        # that accepts write() -- confirm.
        return open(path, 'w') if path else reptools.dummy_context_mgr()

    results = {
        'good': 0,
        'frameshift': 0,
        'stop': 0,
        'bad': 0,
        'long': 0,
        'short': 0,
        'fail': 0
    }

    with open(fastqfile) as infile, \
            _sink(hits_out) as hits_handle, \
            _sink(failures_out) as failures_handle, \
            _sink(frameshift_out) as shift_handle, \
            _sink(long_out) as long_handle, \
            _sink(short_out) as short_handle, \
            _sink(stop_out) as stop_handle, \
            _sink(bad_out) as bad_handle:
        handles = {
            'good': hits_handle,
            'fail': failures_handle,
            'frameshift': shift_handle,
            'long': long_handle,
            'short': short_handle,
            'stop': stop_handle,
            'bad': bad_handle,
        }
        for title, seq, qual in reptools.FASTQparser(infile):
            read_id = title.split(';')[0].split(title_split)[0]
            seq = seq.strip()
            # The order of these checks defines the category precedence.
            if seq.lower() == 'n':
                category = 'fail'
            elif len(seq) > maxlen:
                category = 'long'
            elif len(seq) < minlen:
                category = 'short'
            elif (reptools.trans(seq[0:3]).lower() not in startchars.lower()
                  or reptools.trans(seq[-3:]).lower()
                  not in endchars.lower()):
                category = 'bad'
            elif '*' in reptools.trans(seq):
                category = 'stop'
            elif len(seq) % 3 != 0:
                category = 'frameshift'
            else:
                category = 'good'
            results[category] += 1
            handles[category].write('>%s\n%s\n' % (read_id, seq))

    if verbose:
        totalreads = float(sum(results.values()))

        def _pct(count):
            # A file without records gives totalreads == 0; report 0.0
            # instead of raising ZeroDivisionError.
            return count / totalreads * 100 if totalreads else 0.0

        print('over length = %s (%s pct)' %
              (results['long'], _pct(results['long'])))
        print('under length = %s (%s pct)' %
              (results['short'], _pct(results['short'])))
        print('bad start/end = %s (%s pct)' %
              (results['bad'], _pct(results['bad'])))
        print('stop codon = %s (%s pct)' %
              (results['stop'], _pct(results['stop'])))
        print('frameshift = %s (%s pct)' %
              (results['frameshift'], _pct(results['frameshift'])))
        print('no CDR3 = %s (%s pct)' %
              (results['fail'], _pct(results['fail'])))
        print('good CDR3 = %s (%s pct)' %
              (results['good'], _pct(results['good'])))
    return results
Ejemplo n.º 10
0
def checkgeneID_fastq(mock_dict,
                      gene,
                      fastqfile,
                      title_split=' ',
                      hits_out=False,
                      misses_out=False,
                      failures_out=False,
                      ambiguous_out=False,
                      verbose=True):
    """
    Compare the gene calls stored in FASTQ titles with the expected genes in mock_dict.

    mock_dict  -- mapping of read ID -> mapping that holds the expected gene
                  name under the key `gene`; any '*...' suffix is ignored
    gene       -- name of the title field to read, i.e. the 'gene' part of a
                  ';'-separated 'gene=CALL1+CALL2' entry
    fastqfile  -- FASTQ file whose titles carry the calls; the read ID is the
                  title text before the first ';', cut at `title_split`
    *_out      -- optional file names; when given, the reads of that category
                  are written there as '>id' / list-of-calls pairs
    verbose    -- print the counts and percentages

    Each ID in mock_dict is classified as:
      'hit'        exactly one call, equal to the expected gene
      'ambiguous'  several calls, one of which is the expected gene
      'miss'       call(s) present but none equals the expected gene
      'fail'       read absent from the fastq, no call in its title, a single
                   call of 'none', or no expected gene recorded

    Returns the dict of counts.
    """
    #TODO: add transtable option to replace the [s.split('gamma')[0].split('alpha')[0].split('_')[0] for s in gene_strings] line
    #or, process the mock_dict first, to match
    import reptools

    def _sink(path):
        # Text mode: the records written below are str, which a handle opened
        # with 'wb' rejects on Python 3.
        # NOTE(review): assumes reptools.dummy_context_mgr() yields an object
        # that accepts write() -- confirm.
        return open(path, 'w') if path else reptools.dummy_context_mgr()

    called_genes = {}
    with open(fastqfile) as infile:
        for title, seq, qual in reptools.FASTQparser(infile):
            read_id = title.split(';')[0].split(title_split)[0]
            fields = [s.split('=') for s in title.split(';')]
            gene_strings = [
                parts[1] for parts in fields
                if parts[0] == gene and len(parts) > 1
            ]
            if gene_strings:
                gene_strings = [
                    s.split('gamma')[0].split('alpha')[0].split('_')[0]
                    for s in gene_strings[0].split('+')
                ]
            # A title without the requested field leaves an empty list, which
            # is counted as a failure below instead of raising IndexError here.
            called_genes[read_id] = gene_strings

    results = {'hit': 0, 'miss': 0, 'ambiguous': 0, 'fail': 0}

    with _sink(hits_out) as hits_handle, \
            _sink(misses_out) as misses_handle, \
            _sink(failures_out) as failures_handle, \
            _sink(ambiguous_out) as ambiguous_handle:
        for read_id in mock_dict:
            try:
                calls = called_genes[read_id]
                expected = mock_dict[read_id][gene].split('*')[0]
            except KeyError:
                # The read may be missing from called_genes, so it must not be
                # indexed again here (that raised a second KeyError).
                results['fail'] += 1
                failures_handle.write(
                    '>%s\n%s\n' % (read_id, called_genes.get(read_id, '')))
                continue

            record = '>%s\n%s\n' % (read_id, calls)
            if len(calls) == 1 and calls[0] == expected:
                results['hit'] += 1
                if hits_out:
                    hits_handle.write(record)
            elif not calls or calls == ['none']:
                # `calls` is a list, so the 'none' marker has to be compared
                # as a one-element list; comparing against the bare string
                # never matched.
                results['fail'] += 1
                failures_handle.write(record)
            elif len(calls) > 1 and expected in calls:
                results['ambiguous'] += 1
                ambiguous_handle.write(record)
            else:
                results['miss'] += 1
                misses_handle.write(record)

    if verbose:
        total = float(sum(results.values()))

        def _pct(count):
            # An empty mock_dict gives total == 0; report 0.0 instead of
            # raising ZeroDivisionError.
            return count / total * 100 if total else 0.0

        print('hit = %s' % results['hit'])
        print('miss = %s' % results['miss'])
        print('ambiguous (including hit) = %s' % results['ambiguous'])
        print('failed = %s' % results['fail'])
        print('pct hit = %s' % _pct(results['hit']))
        print('pct hit (including ambiguous hit) = %s' %
              _pct(results['hit'] + results['ambiguous']))

    return results