Beispiel #1
0
def test_pcdhit():
    """Check that cd-hit filtering at 0.7 identity keeps 61 records.

    The alignment named by ``alignment_file`` (inside ``tests_dir()``) is
    parsed as Stockholm with the ``uppercase_only`` transform, passed through
    ``pcdhit.filter`` and the surviving records are counted.
    """
    import os
    import lilbio
    import pcdhit
    from lilbio.funcs import uppercase_only

    alignment_path = os.path.join(tests_dir(), alignment_file)
    parsed = lilbio.parse(alignment_path, 'stockholm', func=uppercase_only)
    # pcdhit.filter is lazy; counting its items drives the whole pipeline.
    n_kept = sum(1 for _ in pcdhit.filter(parsed, 0.7))
    assert n_kept == 61
Beispiel #2
0
def filter(records, threshold=0.9):
    """Filter non-redundant records via cd-hit.

    cdhit: http://weizhongli-lab.org/cd-hit/
    cdhit will cluster sequences that meet a similarity threshold and return a
    representative record for each cluster:
    cdhit -i <fin> -c <threshold> -o <fout>

    This is a generator function: none of the checks below run, and cd-hit is
    not invoked, until the returned generator is first iterated.

    Parameters
    ----------
    records : iterable
        Iterable of (header, sequence) tuples.

    threshold : float, optional (0.9)
        Sequence identity threshold (cd-hit '-c <thr>' option).
        Must satisfy 0.7 <= threshold <= 1.0.

    Yields
    ------
    (header, sequence) : tuple (str, str)
        For each non-redundant record, a tuple containing header and sequence.

    Raises
    ------
    CdhitNotFoundError
        If ``is_command(['cd-hit', 'cdhit'])`` returns None.
    IdentityThresholdError
        If `threshold` is outside the closed interval [0.7, 1.0].

    """

    # check for cd-hit on path (both spellings of the executable are tried)
    cdhit_exe = is_command(['cd-hit', 'cdhit'])
    logger.debug('cd-hit executable: %r', cdhit_exe)
    if cdhit_exe is None:
        raise CdhitNotFoundError

    # reject thresholds outside the supported range before touching any file
    if not 0.7 <= threshold <= 1.0:
        raise IdentityThresholdError

    # open tmp files: fin holds the cd-hit input, fout receives its output
    with opentf() as fin, opentf() as fout:

        print_input_fasta(records, fin)

        call_cdhit(cdhit_exe, fin, fout, threshold)

        for rec in lilbio.parse(fout, fmt='fasta'):
            # Each output header must contain exactly one '@' separating the
            # original header from the original sequence; otherwise the
            # unpacking raises ValueError.
            # NOTE(review): presumably print_input_fasta writes headers as
            # '<head>@<seq>' -- confirm against that helper.
            head, seq = rec[0].split('@')
            yield head, seq
def test_stockholm():
    """Parsing '1.sto' as Stockholm must reproduce the RECORDS repr."""
    sto_path = os.path.join(tests_dir(), '1.sto')
    parsed = [*lilbio.parse(sto_path, 'stockholm')]
    assert repr(parsed) == RECORDS
def test_fasta():
    """Parsing '1.fa' as FASTA must reproduce the RECORDS repr."""
    fasta_path = os.path.join(tests_dir(), '1.fa')
    parsed = [*lilbio.parse(fasta_path, 'fasta')]
    assert repr(parsed) == RECORDS