Example #1
0
def main(args=None):
    """Generate a phylogeny from an alignment and write it to OUTPUT.

    Two positional arguments are parsed: ``ALIGNMENT`` (read with
    ``load_stockholm``) and ``OUTPUT`` (handed to ``PhyloGzFile.write``).
    Feature labels are taken from an ``MSAVectorizer`` fitted on the whole
    alignment, while ``Phylo`` is run on every record except the reference.

    Args:
        args: argument list to parse; ``sys_argv[1:]`` is used when None.

    Returns:
        0 once the output has been written.

    Raises:
        RuntimeError: when ``reference_index`` raises ``IndexError``.
    """
    argv = sys_argv[1:] if args is None else args

    cli = ArgumentParser(
        description='Generate a phylogeny from an alignment.')
    for positional in ('ALIGNMENT', 'OUTPUT'):
        cli.add_argument(positional, type=PathType)
    opts = cli.parse_args(argv)

    msa = load_stockholm(opts.ALIGNMENT)

    try:
        ref_pos = reference_index(msa, is_refseq)
    except IndexError:
        raise RuntimeError('No reference sequence found!')

    # labels are derived from the full alignment, reference included
    vectorizer = MSAVectorizer(AminoEncoder)
    labels = vectorizer.fit(msa).get_feature_names()

    # everything but the reference record goes into the tree
    non_reference = []
    for pos, record in enumerate(msa):
        if pos == ref_pos:
            continue
        non_reference.append(record)

    build_phylo = Phylo()
    tree, alignment = build_phylo(non_reference)

    PhyloGzFile.write(opts.OUTPUT, tree, alignment, labels)

    return 0
Example #2
0
def main(args=None):
    """Generate a phylogeny from an alignment and write it to OUTPUT.

    Parses the ``ALIGNMENT`` and ``OUTPUT`` positionals, loads the alignment
    with ``load_stockholm``, fits a ``SiteVectorizer`` on it to obtain the
    feature labels, and runs ``Phylo`` on all records other than the
    reference before writing the result through ``PhyloGzFile.write``.

    Args:
        args: argument list to parse; ``sys_argv[1:]`` is used when None.

    Returns:
        0 once the output has been written.

    Raises:
        RuntimeError: when ``reference_index`` raises ``IndexError``.
    """
    argv = sys_argv[1:] if args is None else args

    cli = ArgumentParser(description='Generate a phylogeny from an alignment.')
    for positional in ('ALIGNMENT', 'OUTPUT'):
        cli.add_argument(positional, type=PathType)
    opts = cli.parse_args(argv)

    msa = load_stockholm(opts.ALIGNMENT)

    try:
        ref_pos = reference_index(msa, is_refseq)
    except IndexError:
        raise RuntimeError('No reference sequence found!')

    # labels are derived from the full alignment, reference included
    fitted = SiteVectorizer(AminoEncoder).fit(msa)
    labels = fitted.get_feature_names()

    # collect every record whose position is not the reference's
    kept = []
    for pos, record in enumerate(msa):
        if pos == ref_pos:
            continue
        kept.append(record)

    make_tree = Phylo()
    tree, alignment = make_tree(kept)

    PhyloGzFile.write(opts.OUTPUT, tree, alignment, labels)

    return 0
Example #3
0
def main(args=None):
    """Predict labels for unlabeled sequences with a previously saved model.

    The gzip-compressed, pickled model at ``MODEL`` is loaded and unpacked
    into an encoder, a label, HMM bytes, a feature extractor and a
    classifier. The records in ``SEQUENCES`` are aligned against the HMM via
    ``generate_alignment_``, transformed by the extractor, and classified.
    One entry per aligned record is printed to ``ARGS.OUTPUT`` (presumably
    set up by ``init_args``/``parse_args`` -- confirm against those helpers).

    Args:
        args: argument list to parse; ``sys.argv[1:]`` is used when None.

    Returns:
        0 on success.

    Raises:
        RuntimeError: if the model's version tag differs from
            ``MODEL_VERSION`` or unpickling raises ``ImportError``; also
            (from the record generator) if the model uses ``DNAEncoder`` but
            the sequences fail verification against ``DNAAlphabet``.
    """
    if args is None:
        args = sys.argv[1:]

    np.seterr(all='raise')

    parser, ns, args = init_args(description='Predict label for unlabeled sequences', args=args)

    parser = hmmer_args(parser)

    parser.add_argument('MODEL', type=PathType)
    parser.add_argument('SEQUENCES', type=PathType)

    ARGS = parse_args(parser, args, namespace=ns)

    # NOTE(security): unpickling can execute arbitrary code; MODEL must come
    # from a trusted source.
    with gzip_open(ARGS.MODEL, 'rb') as fh:
        try:
            model = pickle_load(fh)
            if model[0] != MODEL_VERSION:
                raise ImportError('incompatible model version')
            ARGS.ENCODER, ARGS.LABEL, hmm, extractor, clf = model[1:]
        except ImportError:
            msg = 'your model is not of the appropriate version, please re-learn your model'
            raise RuntimeError(msg)

    # keep the sequence file open while the lazy record generator is consumed
    with open(ARGS.SEQUENCES) as seq_fh:

        def seqrecords():
            """Yield records from SEQUENCES, translated unless the model is DNA-encoded."""
            is_dna = ARGS.ENCODER == DNAEncoder
            seq_fmt = seqfile_format(ARGS.SEQUENCES)
            source = Verifier(SeqIO.parse(seq_fh, seq_fmt), DNAAlphabet)
            try:
                for record in source:
                    yield record if is_dna else translate(record)
            except VerifyError:
                # verification against the DNA alphabet failed
                if is_dna:
                    msg = (
                        "your model specifies a DNA encoding "
                        "which is incompatible with protein sequences"
                        )
                    raise RuntimeError(msg)
                # switch the verifier to the amino alphabet and pass the
                # remaining records through untranslated
                source.set_alphabet(AminoAlphabet)
                for record in source:
                    yield record

        # bind both names up front so the cleanup below cannot hit an unbound
        # local (and mask the real error) when an early step fails
        tmphmm = None
        tmpaln = None
        try:
            fd, tmphmm = mkstemp()
            close(fd)
            with open(tmphmm, 'wb') as hmm_fh:
                hmm_fh.write(hmm)
                # explicitly gc hmm
                hmm = None
            tmpaln = generate_alignment_(seqrecords(), tmphmm, ARGS)
            alignment = load_stockholm(tmpaln, trim=True)
        finally:
            if tmphmm is not None and exists(tmphmm):
                remove(tmphmm)
            if tmpaln is not None and exists(tmpaln):
                remove(tmpaln)

    X = extractor.transform(alignment)
    y = clf.predict(X)

    feature_names = extractor.get_feature_names()
    support = clf.named_steps['mrmr'].support_
    # quoted names of the selected features, and same-width blanks (label
    # plus the ', ' separator) used when a feature is absent
    labels = ['"{0:s}"'.format(feature_names[i]) for i, s in enumerate(support) if s]
    emptys = [' ' * (len(label) + 2) for label in labels]
    # width of the quoted id plus trailing comma: two quotes and one comma
    idlen = max(len(r.id) for r in alignment) + 3

    print('{{\n  "label": "{0:s}",\n  "predictions": ['.format(ARGS.LABEL), file=ARGS.OUTPUT)
    for i, r in enumerate(alignment):
        if i > 0:
            # the separator belongs to the same stream as the entries
            print(',', file=ARGS.OUTPUT)
        features = ['[ ']
        for j, x in enumerate(X[i, support]):
            if x:
                features.append(labels[j])
                features.append(', ')
            else:
                features.append(emptys[j])
        features.append(' ]')
        # drop the last ', ' separator so the list has no trailing comma
        idx = None
        for k, f in enumerate(features):
            if f == ', ':
                idx = k
        if idx is None:
            # no features present: collapse the bracket padding
            features[0] = features[0].rstrip()
            features[-1] = features[-1].lstrip()
        else:
            features[idx] = ''
        features_ = ''.join(features)
        # two-stage format: the first pass bakes in the id column width, the
        # second fills in id, predicted value and feature list
        print(
            '    {{{{ "id": {{0:<{0:d}s}} "value": {{1: d}}, "features": {{2:s}} }}}}'.format(
                idlen).format('"{0:s}",'.format(r.id), y[i], features_),
            file=ARGS.OUTPUT, end='')
    print('\n  ]\n}', file=ARGS.OUTPUT)

    finalize_args(ARGS)

    return 0
Example #4
0
def main(args=None):
    """Predict labels for unlabeled sequences with a previously saved model.

    The gzip-compressed, pickled model at ``MODEL`` is loaded and unpacked
    into an encoder, a label, HMM bytes, a feature extractor and a
    classifier. The records in ``SEQUENCES`` are aligned against the HMM via
    ``generate_alignment_``, transformed by the extractor, and classified.
    One entry per aligned record is printed to ``ARGS.OUTPUT`` (presumably
    set up by ``init_args``/``parse_args`` -- confirm against those helpers).

    Args:
        args: argument list to parse; ``sys.argv[1:]`` is used when None.

    Returns:
        0 on success.

    Raises:
        RuntimeError: if the model's version tag is not 4 or unpickling
            raises ``ImportError``; also (from the record generator) if the
            model uses ``DNAEncoder`` but the sequences fail verification
            against ``DNAAlphabet``.
    """
    if args is None:
        args = sys.argv[1:]

    np.seterr(all='raise')

    parser, ns, args = init_args(description='Predict label for unlabeled sequences', args=args)

    parser = hmmer_args(parser)

    parser.add_argument('MODEL', type=PathType)
    parser.add_argument('SEQUENCES', type=PathType)

    ARGS = parse_args(parser, args, namespace=ns)

    # NOTE(security): unpickling can execute arbitrary code; MODEL must come
    # from a trusted source.
    with gzip_open(ARGS.MODEL, 'rb') as fh:
        try:
            model = pickle_load(fh)
            # the first element is the model's version tag; only 4 is accepted
            if model[0] != 4:
                raise ImportError('incompatible model version')
            ARGS.ENCODER, ARGS.LABEL, hmm, extractor, clf = model[1:]
        except ImportError:
            msg = 'your model is not of the appropriate version, please re-learn your model'
            raise RuntimeError(msg)

    # keep the sequence file open while the lazy record generator is consumed
    with open(ARGS.SEQUENCES) as seq_fh:

        def seqrecords():
            """Yield records from SEQUENCES, translated unless the model is DNA-encoded."""
            is_dna = ARGS.ENCODER == DNAEncoder
            seq_fmt = seqfile_format(ARGS.SEQUENCES)
            source = Verifier(SeqIO.parse(seq_fh, seq_fmt), DNAAlphabet)
            try:
                for record in source:
                    yield record if is_dna else translate(record)
            except VerifyError:
                # verification against the DNA alphabet failed
                if is_dna:
                    msg = (
                        "your model specifies a DNA encoding "
                        "which is incompatible with protein sequences"
                        )
                    raise RuntimeError(msg)
                # switch the verifier to the amino alphabet and pass the
                # remaining records through untranslated
                source.set_alphabet(AminoAlphabet)
                for record in source:
                    yield record

        # bind both names up front so the cleanup below cannot hit an unbound
        # local (and mask the real error) when an early step fails
        tmphmm = None
        tmpaln = None
        try:
            fd, tmphmm = mkstemp()
            close(fd)
            with open(tmphmm, 'wb') as hmm_fh:
                hmm_fh.write(hmm)
                # explicitly gc hmm
                hmm = None
            tmpaln = generate_alignment_(seqrecords(), tmphmm, ARGS)
            alignment = load_stockholm(tmpaln, trim=True)
        finally:
            if tmphmm is not None and exists(tmphmm):
                remove(tmphmm)
            if tmpaln is not None and exists(tmpaln):
                remove(tmpaln)

    X = extractor.transform(alignment)
    y = clf.predict(X)

    feature_names = extractor.get_feature_names()
    support = clf.named_steps['mrmr'].support_
    # quoted names of the selected features, and same-width blanks (label
    # plus the ', ' separator) used when a feature is absent
    labels = ['"{0:s}"'.format(feature_names[i]) for i, s in enumerate(support) if s]
    emptys = [' ' * (len(label) + 2) for label in labels]
    # width of the quoted id plus trailing comma: two quotes and one comma
    idlen = max(len(r.id) for r in alignment) + 3

    print('{{\n  "label": "{0:s}",\n  "predictions": ['.format(ARGS.LABEL), file=ARGS.OUTPUT)
    for i, r in enumerate(alignment):
        if i > 0:
            # the separator belongs to the same stream as the entries
            print(',', file=ARGS.OUTPUT)
        features = ['[ ']
        for j, x in enumerate(X[i, support]):
            if x:
                features.append(labels[j])
                features.append(', ')
            else:
                features.append(emptys[j])
        features.append(' ]')
        # drop the last ', ' separator so the list has no trailing comma
        idx = None
        for k, f in enumerate(features):
            if f == ', ':
                idx = k
        if idx is None:
            # no features present: collapse the bracket padding
            features[0] = features[0].rstrip()
            features[-1] = features[-1].lstrip()
        else:
            features[idx] = ''
        features_ = ''.join(features)
        # two-stage format: the first pass bakes in the id column width, the
        # second fills in id, predicted value and feature list
        print(
            '    {{{{ "id": {{0:<{0:d}s}} "value": {{1: d}}, "features": {{2:s}} }}}}'.format(
                idlen).format('"{0:s}",'.format(r.id), y[i], features_),
            file=ARGS.OUTPUT, end='')
    print('\n  ]\n}', file=ARGS.OUTPUT)

    finalize_args(ARGS)

    return 0