Ejemplo n.º 1
0
 def records():
     """Yield the reference sequence (if any) and then every input record.

     ``refseq``, ``seqrecords`` and ``is_dna`` are free variables resolved
     outside this function.  Every yielded item has been passed through
     ``HMMER.valid``; records carrying a DNA alphabet are translated first
     when the model is not a DNA model.
     """
     if refseq:
         yield HMMER.valid(refseq, is_dna=is_dna)
     for entry in seqrecords:
         # only DNA records destined for a non-DNA model are translated
         must_translate = not is_dna and entry.seq.alphabet == DNAAlphabet
         prepared = translate(entry) if must_translate else entry
         yield HMMER.valid(prepared, is_dna=is_dna)
Ejemplo n.º 2
0
 def records():
     """Generate HMMER-validated records, starting with the reference.

     Relies on ``refseq``, ``seqrecords`` and ``is_dna`` being defined
     outside this function.
     """
     def in_model_alphabet(item):
         # DNA models take records untouched; so do records that are not DNA
         if is_dna or not item.seq.alphabet == DNAAlphabet:
             return item
         return translate(item)

     if refseq:
         yield HMMER.valid(refseq, is_dna=is_dna)
     for item in seqrecords:
         yield HMMER.valid(in_model_alphabet(item), is_dna=is_dna)
Ejemplo n.º 3
0
def generate_hmm_(opts):
    """Build an HMM from the reference MSA and return the path to it.

    The alignment in ``opts.REFMSA`` is rewritten in Stockholm format to a
    temporary file (translated when the encoder is not ``DNAEncoder``) and
    handed to ``HMMER.build`` together with a second temporary file that
    receives the model.

    Args:
        opts: options object providing ``ENCODER``, ``REFMSA``,
            ``HMMER_ALIGN_BIN`` and ``HMMER_BUILD_BIN``.

    Returns:
        Path of the temporary HMM file.  It is not removed here, so the
        caller owns it.

    Raises:
        RuntimeError: if a DNA encoding is requested but the reference MSA
            fails verification against the DNA alphabet.
    """
    fd, tmphmm = mkstemp()
    close(fd)
    fd, tmpaln = mkstemp()
    close(fd)

    is_dna = opts.ENCODER == DNAEncoder

    try:
        with open(opts.REFMSA) as msa_fh:
            with open(tmpaln, 'w') as aln_fh:
                msa_fmt = seqfile_format(opts.REFMSA)
                # first attempt: treat the reference alignment as DNA
                source = Verifier(SeqIO.parse(msa_fh, msa_fmt), DNAAlphabet)
                try:
                    SeqIO.write((record if is_dna else translate(record)
                                 for record in source), aln_fh, 'stockholm')
                except VerifyError:
                    if is_dna:
                        raise RuntimeError(
                            "DNA encoding incompatible with protein reference MSA"
                        )
                    # fall back to protein input and start the output over;
                    # truncate so nothing from the failed attempt survives
                    source.set_alphabet(AminoAlphabet)
                    aln_fh.seek(0)
                    aln_fh.truncate()
                    SeqIO.write(source, aln_fh, 'stockholm')

        hmmer = HMMER(opts.HMMER_ALIGN_BIN, opts.HMMER_BUILD_BIN)
        hmmer.build(tmphmm,
                    tmpaln,
                    alphabet=HMMER.DNA if is_dna else HMMER.AMINO)
    except BaseException:
        # nothing is returned on failure, so the model file would leak
        if exists(tmphmm):
            remove(tmphmm)
        raise
    finally:
        if exists(tmpaln):
            remove(tmpaln)

    return tmphmm
Ejemplo n.º 4
0
 def seqrecords():
     """Yield the input sequences in the alphabet the model expects.

     Records are read from ``seq_fh`` (defined outside this function) and
     verified against the DNA alphabet first.  A DNA model receives them
     unchanged, any other model receives translated records.  If
     verification fails, a DNA model is rejected with ``RuntimeError``;
     otherwise the verifier is switched to the amino alphabet and the
     records it yields from then on are passed through as-is.
     """
     dna_model = ARGS.ENCODER == DNAEncoder
     fmt = seqfile_format(ARGS.SEQUENCES)
     verified = Verifier(SeqIO.parse(seq_fh, fmt), DNAAlphabet)
     try:
         if dna_model:
             for rec in verified:
                 yield rec
         else:
             for rec in verified:
                 yield translate(rec)
     except VerifyError:
         if dna_model:
             raise RuntimeError(
                 "your model specifies a DNA encoding "
                 "which is incompatible with protein sequences")
         verified.set_alphabet(AminoAlphabet)
         for rec in verified:
             yield rec
Ejemplo n.º 5
0
 def seqrecords():
     """Generate sequence records matching the encoder's alphabet.

     Input comes from ``seq_fh`` (defined outside this function).  The
     records are verified as DNA; when that verification raises
     ``VerifyError`` the input is re-read as protein, unless the encoder is
     ``DNAEncoder``, in which case ``RuntimeError`` is raised.
     """
     wants_dna = ARGS.ENCODER == DNAEncoder
     file_format = seqfile_format(ARGS.SEQUENCES)
     checker = Verifier(SeqIO.parse(seq_fh, file_format), DNAAlphabet)
     # pick the per-record conversion once instead of on every iteration
     convert = (lambda item: item) if wants_dna else translate
     try:
         for item in checker:
             yield convert(item)
     except VerifyError:
         if not wants_dna:
             checker.set_alphabet(AminoAlphabet)
             for item in checker:
                 yield item
         else:
             raise RuntimeError(
                 "your model specifies a DNA encoding "
                 "which is incompatible with protein sequences"
             )
Ejemplo n.º 6
0
 def __call__(self, string):
     """Read the first sequence from the file at path ``string``.

     The file is verified against the DNA alphabet first; the sequence is
     translated when ``self.is_dna`` is false.  If verification fails and
     ``self.is_dna`` is false, the file is re-read with the amino alphabet.

     Failures are reported as ``ArgumentTypeError``, which suggests use as
     an argparse ``type=`` callable (NOTE(review): confirm against callers).

     Raises:
         ArgumentTypeError: if the file cannot be read or parsed, or if a
             DNA encoding is requested for a non-DNA reference.
     """
     try:
         with open(string) as h:
             source = Verifier(SeqIO.parse(h, seqfile_format(string)), DNAAlphabet)
             try:
                 seq = next(iter(source))
                 if not self.is_dna:
                     seq = translate(seq)
             except VerifyError:
                 if self.is_dna:
                     raise ArgumentTypeError("DNA encoding incompatible with protein reference")
                 source.set_alphabet(AminoAlphabet)
                 seq = next(iter(source))
         return seq
     except ArgumentTypeError:
         # already the right type and message: let it through untouched
         raise
     except Exception:
         # Exception rather than a bare except, so that KeyboardInterrupt and
         # SystemExit are not misreported as an invalid file
         raise ArgumentTypeError("invalid FASTA file '{0:s}'".format(string))
Ejemplo n.º 7
0
def main(args=None):
    """Translate a FASTA nucleotide file, reading and writing FASTA.

    Args:
        args: command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.  Accepts ``-f/--frame`` (0, 1 or 2) plus
            optional input and output paths, which default to stdin and
            stdout.

    Returns:
        0 on success.
    """
    if args is None:
        args = sys.argv[1:]

    try:
        # restore default SIGPIPE handling so a closed pipe ends the program
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)
    except (AttributeError, ValueError):
        # AttributeError: the platform has no SIGPIPE (e.g. Windows);
        # ValueError: signal.signal() was called outside the main thread
        pass

    parser = ArgumentParser(description='translate a FASTA nucleotide file')
    parser.add_argument('-f', '--frame', type=int, choices=range(3), default=0)
    parser.add_argument('input',  nargs='?', type=FileType('r'), default=sys.stdin)
    parser.add_argument('output', nargs='?', type=FileType('w'), default=sys.stdout)

    ns = parser.parse_args(args)

    # drop the first `frame` positions of each record before translating
    translated = (translate(s[ns.frame:]) for s in SeqIO.parse(ns.input, 'fasta'))
    SeqIO.write(translated, ns.output, 'fasta')

    return 0
Ejemplo n.º 8
0
 def __call__(self, string):
     """Return the first sequence found in the file at path ``string``.

     The file is first verified against the DNA alphabet, and the sequence
     is translated unless ``self.is_dna`` is set.  When verification fails
     and ``self.is_dna`` is not set, the file is read again using the amino
     alphabet.

     Raises:
         ArgumentTypeError: if the file cannot be read or parsed, or if a
             DNA encoding is requested for a non-DNA reference.
     """
     try:
         with open(string) as h:
             source = Verifier(SeqIO.parse(h, seqfile_format(string)),
                               DNAAlphabet)
             try:
                 seq = next(iter(source))
                 if not self.is_dna:
                     seq = translate(seq)
             except VerifyError:
                 if self.is_dna:
                     raise ArgumentTypeError(
                         "DNA encoding incompatible with protein reference")
                 source.set_alphabet(AminoAlphabet)
                 seq = next(iter(source))
         return seq
     except ArgumentTypeError:
         # propagate our own error unchanged
         raise
     except Exception:
         # not a bare except: KeyboardInterrupt and SystemExit must not be
         # reported as an invalid file
         raise ArgumentTypeError(
             "invalid FASTA file '{0:s}'".format(string))
Ejemplo n.º 9
0
def generate_hmm_(opts):
    """Build an HMM from ``opts.REFMSA`` and return the model's file path.

    The reference alignment is written in Stockholm format to a temporary
    file, translated when the encoder is not ``DNAEncoder``, and passed to
    ``HMMER.build`` along with a temporary output path.

    Args:
        opts: options object providing ``ENCODER``, ``REFMSA``,
            ``HMMER_ALIGN_BIN`` and ``HMMER_BUILD_BIN``.

    Returns:
        Path of the temporary HMM file; it is left in place for the caller.

    Raises:
        RuntimeError: if a DNA encoding is requested but the reference MSA
            fails verification against the DNA alphabet.
    """
    fd, tmphmm = mkstemp()
    close(fd)
    fd, tmpaln = mkstemp()
    close(fd)

    is_dna = opts.ENCODER == DNAEncoder

    try:
        with open(opts.REFMSA) as msa_fh:
            with open(tmpaln, 'w') as aln_fh:
                msa_fmt = seqfile_format(opts.REFMSA)
                # first attempt: treat the reference alignment as DNA
                source = Verifier(SeqIO.parse(msa_fh, msa_fmt), DNAAlphabet)
                try:
                    SeqIO.write(
                        (record if is_dna else translate(record) for record in source),
                        aln_fh,
                        'stockholm')
                except VerifyError:
                    if is_dna:
                        raise RuntimeError("DNA encoding incompatible with protein reference MSA")
                    # retry as protein; truncate so that no output from the
                    # failed attempt is left behind the new content
                    source.set_alphabet(AminoAlphabet)
                    aln_fh.seek(0)
                    aln_fh.truncate()
                    SeqIO.write(
                        source,
                        aln_fh,
                        'stockholm')

        hmmer = HMMER(opts.HMMER_ALIGN_BIN, opts.HMMER_BUILD_BIN)
        hmmer.build(
            tmphmm,
            tmpaln,
            alphabet=HMMER.DNA if is_dna else HMMER.AMINO
            )
    except BaseException:
        # the path is never returned on failure, so remove the model file
        if exists(tmphmm):
            remove(tmphmm)
        raise
    finally:
        if exists(tmpaln):
            remove(tmpaln)

    return tmphmm
Ejemplo n.º 10
0
from functools import reduce
from operator import add

from docopt import docopt

from Bio.Align import MultipleSeqAlignment
from Bio import AlignIO

from BioExt.misc import translate

if __name__ == "__main__":
    # Drop codon columns whose translated column has too many gaps.
    args = docopt(__doc__)
    infile = args["<infile>"]
    outfile = args["<outfile>"]
    thresh = float(args["--threshold"])

    if thresh < 0 or thresh > 1:
        template = ("threshold must be between 0 and 1,"
                    " but got {}")
        raise Exception(template.format(thresh))

    aln = AlignIO.read(infile, "fasta")
    taln = MultipleSeqAlignment([translate(rec) for rec in aln])
    n_seqs = len(taln)

    # keep the translated columns whose non-gap fraction exceeds the threshold
    keep = []
    for col in range(len(taln[0])):
        occupancy = 1 - taln[:, col].count('-') / n_seqs
        if occupancy > thresh:
            keep.append(col)

    # start from an empty slice and append the codon behind each kept column
    trunc_aln = aln[:, 0:0]
    for col in keep:
        first = col * 3
        trunc_aln = trunc_aln + aln[:, first : first + 3]

    AlignIO.write(trunc_aln, outfile, 'fasta')
Ejemplo n.º 11
0
def init_args(description, args):
    """Create the shared argument parser and pre-parse data/encoder options.

    Parsing happens in two stages.  The data source and encoder options are
    parsed here because the remaining option types depend on them; all other
    options are only registered, for the caller to parse afterwards.

    Args:
        description: text passed to ``ArgumentParser``.
        args: sequence of command-line arguments.  It is copied, never
            modified; ``None`` means ``sys.argv[1:]``.

    Returns:
        A ``(parser, ns, args)`` tuple: the parser with every option
        registered, the namespace from the first parsing stage (with
        ``ns.DATA`` added), and the arguments that stage did not consume,
        followed by any deferred help flags.
    """
    from idepi import __path__ as idepi_path

    # work on a private copy so the caller's list is left untouched
    args = sys.argv[1:] if args is None else list(args)

    parser = ArgumentParser(description=description)

    # handle the datasource, we need to know to setup labeltype and subtype info
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--csv',    type=PathType, dest='_DATA', nargs=2, metavar=('FASTA', 'CSV'))
    group.add_argument('--sqlite', type=PathType, dest='_DATA', nargs=1, metavar='SQLITE3')
    group.set_defaults(
        _DATA=[join(idepi_path[0], 'data', 'allneuts.sqlite3')]
        )

    # handle the encoder early as well
    encoders = dict((str(enc), enc) for enc in (AminoEncoder, DNAEncoder, StanfelEncoder))
    parser.add_argument(
        '--encoding',
        # unknown names are passed through unchanged so `choices` rejects them
        type=lambda s: encoders.get(s, s),
        choices=sorted(encoders.values(), key=str),
        dest='ENCODER',
        default=AminoEncoder
        )

    # rather than removing the help and making a new parser,
    # if help options are passed defer them to the next parsing
    deferred = []
    for arg in ('-h', '--help'):
        try:
            args.remove(arg)
            deferred.append(arg)
        except ValueError:
            pass

    ns, args = parser.parse_known_args(args)

    # hand the deferred help flags back for the next parsing stage
    args += deferred

    # set up the argument types that depend on the first parsing stage
    is_dna = ns.ENCODER == DNAEncoder
    ns.DATA = DataSource(*ns._DATA)
    fastatype = FastaTypeFactory(is_dna)
    # labeltype = labeltypefactory(ns.DATA)
    subtype = SubtypeTypeFactory(ns.DATA)

    #                   option             action               type                dest
    parser.add_argument('--log',                                type=logtype,       dest='LOGGING')
    parser.add_argument('--label',                              type=str,           dest='LABEL')
    parser.add_argument('--filter',                             type=csvtype,       dest='FILTER')
    parser.add_argument('--clonal',        action='store_true',                     dest='CLONAL')
    parser.add_argument('--subtypes',                           type=subtype,       dest='SUBTYPES')
    parser.add_argument('--weighting',     action='store_true',                     dest='WEIGHTING')
    parser.add_argument('--refmsa',                             type=PathType,      dest='REFMSA')
    parser.add_argument('--refseq',                             type=fastatype,     dest='REFSEQ')
    parser.add_argument('--test',          action='store_true',                     dest='TEST')
    parser.add_argument('--seed',                               type=SeedType,      dest='RAND_SEED')
    parser.add_argument('-o', '--output',                       type=FileType('w'), dest='OUTPUT')

    refseq = hxb2.env.load()

    parser.set_defaults(
        LOGGING    =None,
        LABEL      ='max(IC50) > 20',
        FILTER     =[],
        CLONAL     =False,
        SUBTYPES   =set(),
        WEIGHTING  =False,
        REFMSA     =PathType(join(idepi_path[0], 'data', 'HIV1_FLT_2012_env_DNA.sto')),
        REFSEQ     =refseq if is_dna else translate(refseq),
        RAND_SEED  =42, # magic number for determinism
        PHYLOFILTER=False,
        OUTPUT     =sys.stdout
        )

    return parser, ns, args
Ejemplo n.º 12
0
def init_args(description, args):
    """Build the common argument parser and parse the options it depends on.

    The data source and encoder options are parsed immediately, because the
    types of the remaining options are derived from them.  Those remaining
    options are registered but left for the caller to parse.

    Args:
        description: text passed to ``ArgumentParser``.
        args: sequence of command-line arguments.  It is copied, never
            modified; ``None`` means ``sys.argv[1:]``.

    Returns:
        A ``(parser, ns, args)`` tuple: the fully populated parser, the
        namespace from the first parsing stage (with ``ns.DATA`` added), and
        the arguments not consumed by that stage, followed by any deferred
        help flags.
    """
    from idepi import __path__ as idepi_path

    # copy first: the help flags are removed below and the caller's list
    # must not change as a side effect
    args = sys.argv[1:] if args is None else list(args)

    parser = ArgumentParser(description=description)

    # handle the datasource, we need to know to setup labeltype and subtype info
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--csv',
                       type=PathType,
                       dest='_DATA',
                       nargs=2,
                       metavar=('FASTA', 'CSV'))
    group.add_argument('--sqlite',
                       type=PathType,
                       dest='_DATA',
                       nargs=1,
                       metavar='SQLITE3')
    group.set_defaults(_DATA=[join(idepi_path[0], 'data', 'allneuts.sqlite3')])

    # handle the encoder early as well
    encoders = dict(
        (str(enc), enc) for enc in (AminoEncoder, DNAEncoder, StanfelEncoder))
    parser.add_argument('--encoding',
                        # unknown names pass through so `choices` rejects them
                        type=lambda s: encoders.get(s, s),
                        choices=sorted(encoders.values(), key=str),
                        dest='ENCODER',
                        default=AminoEncoder)

    # rather than removing the help and making a new parser,
    # if help options are passed defer them to the next parsing
    deferred = []
    for arg in ('-h', '--help'):
        try:
            args.remove(arg)
            deferred.append(arg)
        except ValueError:
            pass

    ns, args = parser.parse_known_args(args)

    # hand the deferred help flags back for the next parsing stage
    args += deferred

    # set up the argument types that depend on the first parsing stage
    is_dna = ns.ENCODER == DNAEncoder
    ns.DATA = DataSource(*ns._DATA)
    fastatype = FastaTypeFactory(is_dna)
    # labeltype = labeltypefactory(ns.DATA)
    subtype = SubtypeTypeFactory(ns.DATA)

    # options left for the caller's parsing stage
    parser.add_argument('--log', type=logtype, dest='LOGGING')
    parser.add_argument('--label', type=str, dest='LABEL')
    parser.add_argument('--filter', type=csvtype, dest='FILTER')
    parser.add_argument('--clonal', action='store_true', dest='CLONAL')
    parser.add_argument('--subtypes', type=subtype, dest='SUBTYPES')
    parser.add_argument('--weighting', action='store_true', dest='WEIGHTING')
    parser.add_argument('--refmsa', type=PathType, dest='REFMSA')
    parser.add_argument('--refseq', type=fastatype, dest='REFSEQ')
    parser.add_argument('--test', action='store_true', dest='TEST')
    parser.add_argument('--seed', type=SeedType, dest='RAND_SEED')
    parser.add_argument('-o', '--output', type=FileType('w'), dest='OUTPUT')

    refseq = hxb2.env.load()

    parser.set_defaults(
        LOGGING=None,
        LABEL='max(IC50) > 20',
        FILTER=[],
        CLONAL=False,
        SUBTYPES=set(),
        WEIGHTING=False,
        REFMSA=PathType(
            join(idepi_path[0], 'data', 'HIV1_FLT_2012_env_DNA.sto')),
        REFSEQ=refseq if is_dna else translate(refseq),
        RAND_SEED=42,  # magic number for determinism
        PHYLOFILTER=False,
        OUTPUT=sys.stdout)

    return parser, ns, args
Ejemplo n.º 13
0
def _validate_as_str(seq, msg):
    """Return ``seq`` (SeqRecord, Seq or str) as a str, else raise ValueError(msg)."""
    if isinstance(seq, SeqRecord):
        return str(seq.seq)
    if isinstance(seq, Seq):
        return str(seq)
    if isinstance(seq, str):
        return seq
    raise ValueError(msg)


def validate(
    refseq,
    seqs,
    dna_score_matrix=None,
    protein_score_matrix=None,
    dna_mismatch=0,
    protein_mismatch=0,
    codon=True,
    revcomp=True,
    expected_identity=0.,
    keep_insertions=True,
    quiet=False):
    """Align ``seqs`` to ``refseq`` and score each resulting alignment.

    Args:
        refseq: reference sequence as a SeqRecord, Seq or str.
        seqs: iterable of query sequences (SeqRecord, Seq or str).
        dna_score_matrix: callable scoring an aligned DNA pair; defaults to
            ``DNA80``.
        protein_score_matrix: callable scoring an aligned protein pair;
            defaults to ``BLOSUM62.load()``.
        dna_mismatch: third argument passed to ``dna_score_matrix``.
        protein_mismatch: third argument passed to ``protein_score_matrix``.
        codon: align with ``Aligner(codon=True)`` using the protein matrix
            when true, otherwise with the DNA matrix.
        revcomp, expected_identity, keep_insertions, quiet: forwarded to the
            aligner.  ``expected_identity`` also gates scoring, see below.

    Returns:
        ``(lengths, dna_scores, protein_scores)``, three lists with one
        entry per alignment returned by the aligner.  Both scores are
        ``None`` when ``expected_identity`` is positive and the alignment's
        identity is below it; the protein score is also ``None`` when
        ``codon`` is false.

    Raises:
        ValueError: if ``refseq`` or any query is not a SeqRecord, Seq or str.
    """

    msg = "cannot validate sequences that are not SeqRecord, Seq, or str objects"

    ref_str = _validate_as_str(refseq, msg)
    query_strs = [_validate_as_str(q, msg) for q in seqs]

    if dna_score_matrix is None:
        # NOTE(review): BLOSUM62 is obtained via .load() but DNA80 is used
        # directly -- confirm that DNA80 is already a loaded matrix
        dna_score_matrix = DNA80

    if protein_score_matrix is None:
        # the default must be bound to protein_score_matrix itself: it is
        # both given to the aligner (codon mode) and called when scoring
        protein_score_matrix = BLOSUM62.load()

    score_matrix = protein_score_matrix if codon else dna_score_matrix

    aligner = Aligner(codon=codon)
    refs, queries, _, _, identities = aligner(
        ref_str,
        query_strs,
        score_matrix,
        revcomp,
        expected_identity,
        keep_insertions,
        quiet
    )

    lengths = []
    dna_scores = []
    protein_scores = []
    for ref_aln, query_aln, identity in zip(refs, queries, identities):
        assert len(ref_aln) == len(query_aln), 'sequences unaligned for some reason'
        lengths.append(len(ref_aln))
        if expected_identity > 0. and identity < expected_identity:
            dna_scores.append(None)
            protein_scores.append(None)
        else:
            dna_scores.append(
                dna_score_matrix(ref_aln, query_aln, dna_mismatch))
            # we can translate codon-aligned sequences,
            # but not DNA-aligned sequences
            if codon:
                protein_scores.append(
                    protein_score_matrix(
                        translate(ref_aln),
                        translate(query_aln),
                        protein_mismatch
                    )
                )
            else:
                protein_scores.append(None)

    return lengths, dna_scores, protein_scores