def records():
    """Yield HMMER-validated records: the reference first, then the rest.

    Closure over ``refseq``, ``seqrecords`` and ``is_dna`` from the
    enclosing scope. The reference is only emitted when ``refseq`` is
    truthy. For a non-DNA run, any record whose sequence alphabet compares
    equal to ``DNAAlphabet`` is translated before validation.
    """
    if refseq:
        yield HMMER.valid(refseq, is_dna=is_dna)

    for rec in seqrecords:
        # short-circuit on is_dna so the alphabet is only inspected when a
        # translation could actually be required
        needs_translation = not is_dna and rec.seq.alphabet == DNAAlphabet
        if needs_translation:
            rec = translate(rec)
        yield HMMER.valid(rec, is_dna=is_dna)
def generate_hmm_(opts):
    """Build an HMM from the reference MSA and return the HMM file's path.

    The alignment in ``opts.REFMSA`` is rewritten in Stockholm format to a
    temporary file (records are translated unless the encoder is the DNA
    encoder) and handed to ``HMMER.build``. The temporary alignment is
    always deleted; the temporary HMM file is returned to the caller, who
    is responsible for removing it.

    Reads ``opts.ENCODER``, ``opts.REFMSA``, ``opts.HMMER_ALIGN_BIN`` and
    ``opts.HMMER_BUILD_BIN``.

    Raises:
        RuntimeError: a DNA encoding was requested but verification of the
            reference MSA against the DNA alphabet failed.
    """
    fd, tmphmm = mkstemp()
    close(fd)
    fd, tmpaln = mkstemp()
    close(fd)

    is_dna = opts.ENCODER == DNAEncoder

    try:
        with open(opts.REFMSA) as msa_fh, open(tmpaln, 'w') as aln_fh:
            msa_fmt = seqfile_format(opts.REFMSA)
            source = Verifier(SeqIO.parse(msa_fh, msa_fmt), DNAAlphabet)
            try:
                SeqIO.write(
                    (record if is_dna else translate(record) for record in source),
                    aln_fh,
                    'stockholm')
            except VerifyError:
                # presumably raised by Verifier when a record does not fit
                # the DNA alphabet -- TODO confirm against Verifier
                if is_dna:
                    raise RuntimeError(
                        "DNA encoding incompatible with protein reference MSA"
                    )
                # retry with the input treated as amino acids; rewind AND
                # truncate so nothing from the failed attempt can survive
                # past the end of the new output
                source.set_alphabet(AminoAlphabet)
                aln_fh.seek(0)
                aln_fh.truncate()
                SeqIO.write(source, aln_fh, 'stockholm')

        # the alignment file is closed (and therefore flushed) at this point
        hmmer = HMMER(opts.HMMER_ALIGN_BIN, opts.HMMER_BUILD_BIN)
        hmmer.build(
            tmphmm,
            tmpaln,
            alphabet=HMMER.DNA if is_dna else HMMER.AMINO)
    except BaseException:
        # on failure the caller never learns the HMM path, so this is the
        # only place the temporary HMM file can be cleaned up
        if exists(tmphmm):
            remove(tmphmm)
        raise
    finally:
        if exists(tmpaln):
            remove(tmpaln)

    return tmphmm
def seqrecords():
    """Yield the input sequence records in the alphabet the encoder expects.

    Records are read from ``seq_fh`` (enclosing scope) through a Verifier
    that starts out expecting DNA. DNA records are passed through for a DNA
    encoder and translated otherwise. If verification fails, a DNA encoder
    is an error; otherwise the verifier is switched to the amino-acid
    alphabet and iteration continues without translation.
    """
    is_dna = ARGS.ENCODER == DNAEncoder
    seq_fmt = seqfile_format(ARGS.SEQUENCES)
    verified = Verifier(SeqIO.parse(seq_fh, seq_fmt), DNAAlphabet)

    try:
        for rec in verified:
            if is_dna:
                yield rec
            else:
                yield translate(rec)
    except VerifyError:
        if is_dna:
            raise RuntimeError(
                "your model specifies a DNA encoding "
                "which is incompatible with protein sequences"
            )
        # the input is not DNA: re-verify as amino acids and emit as-is
        verified.set_alphabet(AminoAlphabet)
        for rec in verified:
            yield rec
def __call__(self, string):
    """Argparse ``type`` callable: load the first record of a sequence file.

    The file at path ``string`` is parsed through a Verifier that first
    expects DNA. A DNA record is translated unless ``self.is_dna`` is set.
    If verification fails and DNA was not requested, the verifier is
    switched to the amino-acid alphabet and the first record is read again.

    Returns:
        The first (possibly translated) record of the file.

    Raises:
        ArgumentTypeError: DNA was requested but verification against the
            DNA alphabet failed, or the file could not be opened or parsed
            (including an empty file).
    """
    try:
        with open(string) as h:
            source = Verifier(SeqIO.parse(h, seqfile_format(string)), DNAAlphabet)
            try:
                seq = next(iter(source))
                if not self.is_dna:
                    seq = translate(seq)
            except VerifyError:
                if self.is_dna:
                    raise ArgumentTypeError(
                        "DNA encoding incompatible with protein reference")
                source.set_alphabet(AminoAlphabet)
                seq = next(iter(source))
        return seq
    except ArgumentTypeError:
        # our own, already user-facing error: pass it through untouched
        raise
    except Exception as err:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not misreported as a bad input file
        raise ArgumentTypeError(
            "invalid FASTA file '{0:s}'".format(string)) from err
def main(args=None):
    """Command-line entry point: translate a FASTA nucleotide file.

    Args:
        args: argument list to parse; defaults to ``sys.argv[1:]``.

    Each input record is sliced from the offset given by ``-f/--frame``
    (0, 1 or 2; default 0), translated, and written as FASTA. Input and
    output default to stdin and stdout.

    Returns:
        0 on success.
    """
    if args is None:
        args = sys.argv[1:]

    # Reset SIGPIPE to its default action. This is best-effort:
    #   * AttributeError -- the platform has no SIGPIPE (e.g. Windows)
    #   * ValueError     -- signal.signal() called outside the main thread
    try:
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)
    except (AttributeError, ValueError):
        pass

    parser = ArgumentParser(description='translate a FASTA nucleotide file')
    parser.add_argument('-f', '--frame', type=int, choices=range(3), default=0)
    parser.add_argument('input', nargs='?', type=FileType('r'), default=sys.stdin)
    parser.add_argument('output', nargs='?', type=FileType('w'), default=sys.stdout)

    ns = parser.parse_args(args)

    translated = (translate(s[ns.frame:]) for s in SeqIO.parse(ns.input, 'fasta'))
    SeqIO.write(translated, ns.output, 'fasta')

    return 0
def __call__(self, string):
    """Argparse ``type`` callable returning the first record of a file.

    ``string`` is the path of a sequence file. Its first record is read
    through a Verifier that initially expects DNA; a DNA record is
    translated unless ``self.is_dna`` is true. When verification fails and
    DNA was not requested, the verifier is switched to the amino-acid
    alphabet and the first record is fetched again.

    Returns:
        The first (possibly translated) record.

    Raises:
        ArgumentTypeError: DNA was requested but verification against the
            DNA alphabet failed, or the file could not be opened or parsed
            (an empty file included).
    """
    try:
        with open(string) as handle:
            parsed = SeqIO.parse(handle, seqfile_format(string))
            source = Verifier(parsed, DNAAlphabet)
            try:
                seq = next(iter(source))
                if not self.is_dna:
                    seq = translate(seq)
            except VerifyError:
                if self.is_dna:
                    raise ArgumentTypeError(
                        "DNA encoding incompatible with protein reference")
                source.set_alphabet(AminoAlphabet)
                seq = next(iter(source))
        return seq
    except ArgumentTypeError:
        # already a user-facing argument error; re-raise unchanged
        raise
    except Exception as err:
        # not a bare except: KeyboardInterrupt / SystemExit must propagate
        # instead of being reported as an invalid input file
        raise ArgumentTypeError(
            "invalid FASTA file '{0:s}'".format(string)) from err
def generate_hmm_(opts):
    """Create an HMM file from the reference MSA and return its path.

    ``opts.REFMSA`` is re-written as a Stockholm alignment into a temporary
    file, translating each record unless ``opts.ENCODER`` is the DNA
    encoder, and ``HMMER.build`` is then run on it. The temporary alignment
    is removed in every case. The HMM file is a temporary file too, but it
    is returned and left for the caller to delete.

    Uses ``opts.ENCODER``, ``opts.REFMSA``, ``opts.HMMER_ALIGN_BIN`` and
    ``opts.HMMER_BUILD_BIN``.

    Raises:
        RuntimeError: the encoder is DNA but the reference MSA failed
            verification against the DNA alphabet.
    """
    fd, tmphmm = mkstemp()
    close(fd)
    fd, tmpaln = mkstemp()
    close(fd)

    is_dna = opts.ENCODER == DNAEncoder
    succeeded = False

    try:
        with open(opts.REFMSA) as msa_fh:
            with open(tmpaln, 'w') as aln_fh:
                msa_fmt = seqfile_format(opts.REFMSA)
                source = Verifier(SeqIO.parse(msa_fh, msa_fmt), DNAAlphabet)
                try:
                    SeqIO.write(
                        (record if is_dna else translate(record)
                         for record in source),
                        aln_fh,
                        'stockholm')
                except VerifyError:
                    # presumably signals non-DNA input -- TODO confirm
                    # against Verifier
                    if is_dna:
                        raise RuntimeError(
                            "DNA encoding incompatible with protein reference MSA")
                    source.set_alphabet(AminoAlphabet)
                    # start the output over; truncate as well as rewind so
                    # no bytes of the failed attempt remain in the file
                    aln_fh.seek(0)
                    aln_fh.truncate()
                    SeqIO.write(source, aln_fh, 'stockholm')

        # both files are closed here, so the alignment is fully on disk
        hmmer = HMMER(opts.HMMER_ALIGN_BIN, opts.HMMER_BUILD_BIN)
        hmmer.build(
            tmphmm,
            tmpaln,
            alphabet=HMMER.DNA if is_dna else HMMER.AMINO
        )
        succeeded = True
    finally:
        if exists(tmpaln):
            remove(tmpaln)
        # when anything above failed the path is never returned, so the HMM
        # temp file would otherwise be orphaned
        if not succeeded and exists(tmphmm):
            remove(tmphmm)

    return tmphmm
from functools import reduce
from operator import add

from docopt import docopt

from Bio.Align import MultipleSeqAlignment
from Bio import AlignIO

from BioExt.misc import translate


if __name__ == "__main__":
    # Options are parsed by docopt from the module's usage string.
    args = docopt(__doc__)
    infile = args["<infile>"]
    outfile = args["<outfile>"]

    thresh = float(args["--threshold"])
    if thresh < 0 or thresh > 1:
        raise Exception("threshold must be between 0 and 1,"
                        " but got {}".format(thresh))

    # Read the alignment and build a translated copy of it; gaps are
    # counted on the translated columns.
    aln = AlignIO.read(infile, "fasta")
    taln = MultipleSeqAlignment([translate(r) for r in aln])
    n_seqs = len(taln)

    # Fraction of non-gap characters in every translated column.
    percents = [
        1 - taln[:, i].count('-') / n_seqs
        for i in range(len(taln[0]))
    ]

    # Translated columns whose non-gap fraction exceeds the threshold.
    keep = [i for i, p in enumerate(percents) if p > thresh]

    # Each kept translated column maps back to three columns of the
    # original alignment; concatenate them, starting from an empty slice.
    trunc_aln = reduce(
        add,
        (aln[:, 3 * i : 3 * i + 3] for i in keep),
        aln[:, 0:0],
    )

    AlignIO.write(trunc_aln, outfile, 'fasta')
def init_args(description, args):
    """Create the common argument parser using a two-pass parse.

    The data-source and encoder options are registered and parsed first
    (``parse_known_args``) because the argument types registered afterwards
    are built from their values.

    Args:
        description: text passed to ``ArgumentParser``.
        args: list of command-line arguments. Any ``-h`` / ``--help`` entry
            is removed from this list in place and re-appended to the
            returned remainder, so help is handled by a later parse.

    Returns:
        ``(parser, ns, args)``: the fully configured parser, the namespace
        produced by the first pass (with ``DATA`` attached), and the
        arguments the first pass did not consume plus the deferred help
        flags.
    """
    from idepi import __path__ as idepi_path

    parser = ArgumentParser(description=description)

    # Data source first: it determines the subtype argument type below.
    source_group = parser.add_mutually_exclusive_group()
    source_group.add_argument(
        '--csv', type=PathType, dest='_DATA', nargs=2,
        metavar=('FASTA', 'CSV'))
    source_group.add_argument(
        '--sqlite', type=PathType, dest='_DATA', nargs=1,
        metavar='SQLITE3')
    source_group.set_defaults(
        _DATA=[join(idepi_path[0], 'data', 'allneuts.sqlite3')])

    # The encoder is needed early too. Unknown names fall through the
    # lookup unchanged, which leaves them for the `choices` check.
    encoders = {
        str(enc): enc for enc in (AminoEncoder, DNAEncoder, StanfelEncoder)
    }
    parser.add_argument(
        '--encoding',
        type=lambda s: encoders.get(s, s),
        choices=sorted(encoders.values(), key=str),
        dest='ENCODER',
        default=AminoEncoder)

    # Instead of building a help-less parser, pull any help flags out now
    # and hand them back for the next parsing stage.
    deferred = []
    for flag in ('-h', '--help'):
        try:
            args.remove(flag)
        except ValueError:
            continue
        deferred.append(flag)

    ns, args = parser.parse_known_args(args)
    args.extend(deferred)

    # Argument types that depend on the first-pass results.
    # (A label-type factory built from ns.DATA is currently disabled.)
    is_dna = ns.ENCODER == DNAEncoder
    ns.DATA = DataSource(*ns._DATA)
    fastatype = FastaTypeFactory(is_dna)
    subtype = SubtypeTypeFactory(ns.DATA)

    # (option strings, add_argument keywords), registered in this order
    option_table = (
        (('--log',), dict(type=logtype, dest='LOGGING')),
        (('--label',), dict(type=str, dest='LABEL')),
        (('--filter',), dict(type=csvtype, dest='FILTER')),
        (('--clonal',), dict(action='store_true', dest='CLONAL')),
        (('--subtypes',), dict(type=subtype, dest='SUBTYPES')),
        (('--weighting',), dict(action='store_true', dest='WEIGHTING')),
        (('--refmsa',), dict(type=PathType, dest='REFMSA')),
        (('--refseq',), dict(type=fastatype, dest='REFSEQ')),
        (('--test',), dict(action='store_true', dest='TEST')),
        (('--seed',), dict(type=SeedType, dest='RAND_SEED')),
        (('-o', '--output'), dict(type=FileType('w'), dest='OUTPUT')),
    )
    for flags, spec in option_table:
        parser.add_argument(*flags, **spec)

    refseq = hxb2.env.load()

    defaults = {
        'LOGGING': None,
        'LABEL': 'max(IC50) > 20',
        'FILTER': [],
        'CLONAL': False,
        'SUBTYPES': set(),
        'WEIGHTING': False,
        'REFMSA': PathType(
            join(idepi_path[0], 'data', 'HIV1_FLT_2012_env_DNA.sto')),
        # the loaded reference is translated unless the encoder is DNA
        'REFSEQ': refseq if is_dna else translate(refseq),
        'RAND_SEED': 42,  # magic number for determinism
        'PHYLOFILTER': False,
        'OUTPUT': sys.stdout,
    }
    parser.set_defaults(**defaults)

    return parser, ns, args
def init_args(description, args):
    """Set up the shared argument parser in two parsing stages.

    Stage one registers only the data source and the encoder and parses
    them with ``parse_known_args``; the remaining options are registered
    afterwards because some of their argument types are constructed from
    the stage-one values.

    Args:
        description: description handed to ``ArgumentParser``.
        args: list of command-line arguments. ``-h`` / ``--help`` entries
            are removed from it in place and appended to the returned
            remainder so that a later parse handles them.

    Returns:
        A ``(parser, ns, args)`` tuple: the configured parser, the
        stage-one namespace (with ``DATA`` set), and the unconsumed
        arguments followed by any deferred help flags.
    """
    from idepi import __path__ as idepi_path

    parser = ArgumentParser(description=description)

    # The data source comes first; the subtype type below is built from it.
    data_group = parser.add_mutually_exclusive_group()
    data_group.add_argument(
        '--csv',
        type=PathType,
        dest='_DATA',
        nargs=2,
        metavar=('FASTA', 'CSV'))
    data_group.add_argument(
        '--sqlite',
        type=PathType,
        dest='_DATA',
        nargs=1,
        metavar='SQLITE3')
    data_group.set_defaults(
        _DATA=[join(idepi_path[0], 'data', 'allneuts.sqlite3')])

    # The encoder is resolved early as well. Names that are not known
    # encoders are returned unchanged by the lookup.
    encoders = {
        str(enc): enc
        for enc in (AminoEncoder, DNAEncoder, StanfelEncoder)
    }
    parser.add_argument(
        '--encoding',
        type=lambda s: encoders.get(s, s),
        choices=sorted(encoders.values(), key=str),
        dest='ENCODER',
        default=AminoEncoder)

    # Help flags are held back here and passed on to the next parse,
    # rather than creating a second parser without help support.
    deferred = []
    for help_flag in ('-h', '--help'):
        try:
            args.remove(help_flag)
        except ValueError:
            continue
        deferred.append(help_flag)

    ns, args = parser.parse_known_args(args)
    args.extend(deferred)

    # Build the argument types that depend on stage-one values.
    # (The label-type factory is currently disabled.)
    is_dna = ns.ENCODER == DNAEncoder
    ns.DATA = DataSource(*ns._DATA)
    fastatype = FastaTypeFactory(is_dna)
    subtype = SubtypeTypeFactory(ns.DATA)

    # Remaining options.
    parser.add_argument('--log', dest='LOGGING', type=logtype)
    parser.add_argument('--label', dest='LABEL', type=str)
    parser.add_argument('--filter', dest='FILTER', type=csvtype)
    parser.add_argument('--clonal', dest='CLONAL', action='store_true')
    parser.add_argument('--subtypes', dest='SUBTYPES', type=subtype)
    parser.add_argument('--weighting', dest='WEIGHTING', action='store_true')
    parser.add_argument('--refmsa', dest='REFMSA', type=PathType)
    parser.add_argument('--refseq', dest='REFSEQ', type=fastatype)
    parser.add_argument('--test', dest='TEST', action='store_true')
    parser.add_argument('--seed', dest='RAND_SEED', type=SeedType)
    parser.add_argument('-o', '--output', dest='OUTPUT', type=FileType('w'))

    refseq = hxb2.env.load()
    default_msa = PathType(
        join(idepi_path[0], 'data', 'HIV1_FLT_2012_env_DNA.sto'))
    # the loaded reference is translated unless the encoder is DNA
    default_refseq = refseq if is_dna else translate(refseq)

    parser.set_defaults(
        LOGGING=None,
        LABEL='max(IC50) > 20',
        FILTER=[],
        CLONAL=False,
        SUBTYPES=set(),
        WEIGHTING=False,
        REFMSA=default_msa,
        REFSEQ=default_refseq,
        RAND_SEED=42,  # magic number for determinism
        PHYLOFILTER=False,
        OUTPUT=sys.stdout)

    return parser, ns, args
def validate(
    refseq,
    seqs,
    dna_score_matrix=None,
    protein_score_matrix=None,
    dna_mismatch=0,
    protein_mismatch=0,
    codon=True,
    revcomp=True,
    expected_identity=0.,
    keep_insertions=True,
    quiet=False
):
    """Align query sequences to a reference and score each alignment.

    Args:
        refseq: reference sequence as a SeqRecord, Seq or str.
        seqs: iterable of query sequences (SeqRecord, Seq or str).
        dna_score_matrix: callable ``(ref, query, mismatch)`` scoring the
            aligned pair as DNA; defaults to ``DNA80``.
        protein_score_matrix: callable ``(ref, query, mismatch)`` scoring
            the translated pair; defaults to ``BLOSUM62.load()``.
        dna_mismatch: mismatch argument passed to ``dna_score_matrix``.
        protein_mismatch: mismatch argument passed to
            ``protein_score_matrix``.
        codon: if true, align with the protein matrix in codon mode and
            compute protein scores; otherwise align with the DNA matrix.
        revcomp, expected_identity, keep_insertions, quiet: forwarded to
            the aligner unchanged.

    Returns:
        ``(lengths, dna_scores, protein_scores)``: three parallel lists
        with one entry per aligned pair. Both scores are ``None`` for a
        pair whose identity falls below a positive ``expected_identity``;
        the protein score is also ``None`` whenever ``codon`` is false.

    Raises:
        ValueError: the reference or a query is not a SeqRecord, Seq or
            str.
    """
    msg = "cannot validate sequences that are not SeqRecord, Seq, or str objects"

    def _as_str(obj):
        # Normalise one supported sequence object to a plain string.
        if isinstance(obj, SeqRecord):
            return str(obj.seq)
        if isinstance(obj, Seq):
            return str(obj)
        if isinstance(obj, str):
            return obj
        raise ValueError(msg)

    ref_str = _as_str(refseq)
    query_strs = [_as_str(q) for q in seqs]

    if dna_score_matrix is None:
        dna_score_matrix = DNA80

    if protein_score_matrix is None:
        # The default must be bound to protein_score_matrix itself: that is
        # the name used both for the codon aligner's matrix and for protein
        # scoring below.
        protein_score_matrix = BLOSUM62.load()

    score_matrix = protein_score_matrix if codon else dna_score_matrix

    aligner = Aligner(codon=codon)

    refs, queries, _, _, identities = aligner(
        ref_str,
        query_strs,
        score_matrix,
        revcomp,
        expected_identity,
        keep_insertions,
        quiet
    )

    lengths = []
    dna_scores = []
    protein_scores = []

    for ref_aln, query_aln, identity in zip(refs, queries, identities):
        assert len(ref_aln) == len(query_aln), 'sequences unaligned for some reason'
        lengths.append(len(ref_aln))

        if expected_identity > 0. and identity < expected_identity:
            # below the identity cutoff: keep the length, skip the scoring
            dna_scores.append(None)
            protein_scores.append(None)
            continue

        dna_scores.append(dna_score_matrix(ref_aln, query_aln, dna_mismatch))

        # we can translate codon-aligned sequences,
        # but not DNA-aligned sequences
        if codon:
            protein_scores.append(
                protein_score_matrix(
                    translate(ref_aln),
                    translate(query_aln),
                    protein_mismatch
                )
            )
        else:
            protein_scores.append(None)

    return lengths, dna_scores, protein_scores