Exemple #1
0
def predict(sequence, model_or_filelike, compute_posterior=True):
    """Decode the most likely state path for ``sequence``.

    The path comes from ``viterbi``.  When ``compute_posterior`` is true the
    ``forward`` and ``backward`` tables are multiplied together and the
    per-state values are summed per label group.

    :param sequence: the observations; it is handed to ``viterbi``,
        ``forward`` and ``backward`` unchanged, and its ``len()`` gives the
        number of rows of the posterior table.
    :param model_or_filelike: one of

        * an already parsed model ``tuple`` (unpacked as six items when the
          posterior is computed; the last two are the label map and the name
          map),
        * an open file-like object (anything with a ``read`` attribute),
          which is passed to ``parse`` and left open for the caller, or
        * anything ``open()`` accepts as a file name; the file is closed
          again as soon as ``parse`` returns.
    :param compute_posterior: also return the posterior table.
    :return: ``path`` alone, or ``(path, table)`` where ``table`` has one row
        per observation and three columns filled in ``GROUP_NAMES`` order,
        each row divided by its own sum.
    """
    if isinstance(model_or_filelike, tuple):
        model = model_or_filelike
    elif hasattr(model_or_filelike, 'read'):
        # Caller-owned stream: read from it but do not close it.
        _, model = parse(model_or_filelike)
    else:
        # Own the handle so it is released even if parsing fails.
        with open(model_or_filelike) as model_file:
            _, model = parse(model_file)

    _, path = viterbi(sequence, *model)
    if not compute_posterior:
        return path

    forward_table, constants = forward(sequence, *model)
    backward_table = backward(sequence, constants, *model)
    posterior = forward_table * backward_table

    _, _, _, _, label_map, name_map = model
    observations = len(sequence)
    states = len(name_map)

    # The group of a state does not depend on the position, so resolve the
    # lower-cased label of every state once instead of once per observation.
    state_groups = [label_map[j].lower() for j in range(states)]

    # NOTE(review): the column count is fixed at 3 while the columns are
    # filled from GROUP_NAMES -- presumably GROUP_NAMES has three entries;
    # confirm against its definition.
    table = np.zeros(shape=(observations, 3))
    for i in range(observations):
        group_probs = defaultdict(float)
        for j, group in enumerate(state_groups):
            group_probs[group] += posterior[i, j]

        for k, group in enumerate(GROUP_NAMES):
            table[i, k] = group_probs[group]

    # Normalize every row so that the group values of a position sum to 1.
    return path, table / table.sum(axis=1, keepdims=True)
Exemple #2
0
def predict(sequence, header, model_or_filelike, compute_posterior=True):
    """Decode the most likely state path for ``sequence``.

    The path comes from ``viterbi``.  When ``compute_posterior`` is true the
    ``forward`` and ``backward`` tables are multiplied together and the
    per-state values are summed per label group.

    :param sequence: the observations; it is handed to ``viterbi``,
        ``forward`` and ``backward`` unchanged, and its ``len()`` gives the
        number of rows of the posterior table.
    :param header: accepted for interface compatibility; this function never
        reads it.
    :param model_or_filelike: one of

        * an already parsed model ``tuple`` (unpacked as six items when the
          posterior is computed; the last two are the label map and the name
          map),
        * an open file-like object (anything with a ``read`` attribute),
          which is passed to ``parse`` and left open for the caller, or
        * anything ``open()`` accepts as a file name; the file is closed
          again as soon as ``parse`` returns.
    :param compute_posterior: also return the posterior table.
    :return: ``path`` alone, or ``(path, table)`` where ``table`` has one row
        per observation and three columns filled in ``GROUP_NAMES`` order,
        each row divided by its own sum.
    """
    if isinstance(model_or_filelike, tuple):
        model = model_or_filelike
    elif hasattr(model_or_filelike, 'read'):
        # Caller-owned stream: read from it but do not close it.
        _, model = parse(model_or_filelike)
    else:
        # Own the handle so it is released even if parsing fails.
        with open(model_or_filelike) as model_file:
            _, model = parse(model_file)

    _, path = viterbi(sequence, *model)
    if not compute_posterior:
        return path

    forward_table, constants = forward(sequence, *model)
    backward_table = backward(sequence, constants, *model)
    posterior = forward_table * backward_table

    _, _, _, _, label_map, name_map = model
    observations = len(sequence)
    states = len(name_map)

    # NOTE(review): the column count is fixed at 3 while the columns are
    # filled from GROUP_NAMES -- presumably GROUP_NAMES has three entries;
    # confirm against its definition.
    table = np.zeros(shape=(observations, 3))
    for i in range(observations):
        # Sum the posterior of all states that share a (lower-cased) label.
        group_probs = defaultdict(float)
        for j in range(states):
            group_probs[label_map[j].lower()] += posterior[i, j]

        for k, group in enumerate(GROUP_NAMES):
            table[i, k] = group_probs[group]

    # Normalize every row so that the group values of a position sum to 1.
    return path, table / table.sum(axis=1, keepdims=True)
Exemple #3
0
def cli():
    """Command-line entry point: annotate every entry of a FASTA file.

    The model is parsed once from ``--model``.  For each entry yielded by
    ``load_fasta_file`` the following files are written, all named after the
    entry id:

    * ``<id>.summary``    -- one ``start end name`` line per item produced by
      ``summarize`` for the predicted path,
    * ``<id>.annotation`` -- a ``>id description`` line followed by the path
      wrapped at 79 characters,
    * ``<id>.plot``       -- the posterior table, written by
      ``dump_posterior_file``,
    * ``<id>.pdf``        -- only when ``--plot`` was given.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '-f', '--file',
        dest='sequence_file',
        type=argparse.FileType('r'),
        required=True,
        help='path to file in fasta format with sequences',
    )
    arg_parser.add_argument(
        '-m', '--model',
        dest='model_file',
        type=argparse.FileType('r'),
        default=DEFAULT_MODEL,
        help='path to the model to use',
    )
    if has_matplotlib:
        arg_parser.add_argument(
            '-p', '--plot',
            dest='plot_posterior',
            action='store_true',
            help='plot posterior probabilies',
        )

    options = arg_parser.parse_args()
    # The option is only registered when has_matplotlib is set, so the
    # attribute may be missing from the namespace altogether.
    want_plot = getattr(options, 'plot_posterior', False)

    header, model = parse(options.model_file)
    for entry in load_fasta_file(options.sequence_file):
        path, posterior = predict(entry.sequence, header, model)

        with open(entry.id + '.summary', 'w') as out:
            for start, end, state in summarize(path):
                row = "{} {} {}".format(start, end, PRETTY_NAMES[state])
                print(row, file=out)

        with open(entry.id + '.annotation', 'w') as out:
            print('>', entry.id, ' ', entry.description, sep='', file=out)
            out.writelines(
                chunk + '\n' for chunk in textwrap.wrap(path, 79))

        plot_name = entry.id + '.plot'
        with open(plot_name, 'w') as out:
            dump_posterior_file(out, posterior)

        if want_plot:
            # Re-open the file that was just written and hand it to plot().
            with open(plot_name) as source:
                plot(source, entry.id + '.pdf')
Exemple #4
0
def cli():
    """Command-line entry point: annotate every record of a FASTA file.

    The model is parsed once from ``--model``.  For each record read with
    ``sk.io.read(..., format='fasta')`` the following files are written, all
    named after ``record.metadata['id']``:

    * ``<id>.summary``    -- one ``start end name`` line per item produced by
      ``summarize`` for the predicted path,
    * ``<id>.annotation`` -- a ``>id description`` line followed by the path
      wrapped at 79 characters,
    * ``<id>.plot``       -- an ``inside membrane outside`` heading and one
      line with the three posterior columns per sequence position,
    * ``<id>.pdf``        -- only when ``--plot`` was given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-f',
                        '--file',
                        dest='sequence_file',
                        type=argparse.FileType('r'),
                        required=True,
                        help='path to file in fasta format with sequences')
    parser.add_argument('-m',
                        '--model',
                        dest='model_file',
                        type=argparse.FileType('r'),
                        default=DEFAULT_MODEL,
                        help='path to the model to use')
    parser.add_argument('-p',
                        '--plot',
                        dest='plot_posterior',
                        action='store_true',
                        help='plot posterior probabilies')

    args = parser.parse_args()

    header, model = parse(args.model_file)
    for record in sk.io.read(args.sequence_file, format='fasta'):
        record_id = record.metadata['id']

        # The sequence has to be derived from the record inside the loop;
        # str() of a scikit-bio sequence object yields its characters.
        # NOTE(review): if the model expects further normalization (for
        # example upper-casing), it is not applied here -- confirm.
        sequence = str(record)
        path, posterior = predict(sequence, header, model)

        with open(record_id + '.summary', 'w') as summary_file:
            for start, end, state in summarize(path):
                print("{} {} {}".format(start, end, PRETTY_NAMES[state]),
                      file=summary_file)

        with open(record_id + '.annotation', 'w') as ann_file:
            print('>',
                  record_id,
                  ' ',
                  record.metadata['description'],
                  sep='',
                  file=ann_file)
            for line in textwrap.wrap(path, 79):
                print(line, file=ann_file)

        plot_filename = record_id + '.plot'
        with open(plot_filename, 'w') as plot_file:
            print('inside', 'membrane', 'outside', file=plot_file)
            # One output line per sequence position, columns in the same
            # order as the heading above.
            for i in range(len(sequence)):
                print('{} {} {}'.format(posterior[i, 0], posterior[i, 1],
                                        posterior[i, 2]),
                      file=plot_file)

        if args.plot_posterior:
            plot(plot_filename, record_id + '.pdf')
Exemple #5
0
def cli():
    """Run the predictor over a FASTA file given on the command line.

    Options: ``-f/--file`` (required, sequences), ``-m/--model`` (defaults
    to ``DEFAULT_MODEL``) and, only when ``has_matplotlib`` is set,
    ``-p/--plot``.

    Every entry yielded by ``load_fasta_file`` produces ``<id>.summary``,
    ``<id>.annotation`` and ``<id>.plot``; with ``--plot`` the plot file is
    additionally passed to ``plot`` to produce ``<id>.pdf``.
    """
    cmdline = argparse.ArgumentParser()

    sequence_opts = dict(dest='sequence_file',
                         type=argparse.FileType('r'),
                         required=True,
                         help='path to file in fasta format with sequences')
    cmdline.add_argument('-f', '--file', **sequence_opts)

    model_opts = dict(dest='model_file',
                      type=argparse.FileType('r'),
                      default=DEFAULT_MODEL,
                      help='path to the model to use')
    cmdline.add_argument('-m', '--model', **model_opts)

    if has_matplotlib:
        plot_opts = dict(dest='plot_posterior',
                         action='store_true',
                         help='plot posterior probabilies')
        cmdline.add_argument('-p', '--plot', **plot_opts)

    parsed = cmdline.parse_args()

    header, model = parse(parsed.model_file)
    for fasta_entry in load_fasta_file(parsed.sequence_file):
        path, posterior = predict(fasta_entry.sequence, header, model)

        # Segment summary: "start end pretty-name", one segment per line.
        with open(fasta_entry.id + '.summary', 'w') as handle:
            for start, end, state in summarize(path):
                pretty = PRETTY_NAMES[state]
                print("{} {} {}".format(start, end, pretty), file=handle)

        # FASTA-like annotation: header line, then the path in 79-char rows.
        with open(fasta_entry.id + '.annotation', 'w') as handle:
            print('>', fasta_entry.id, ' ', fasta_entry.description,
                  sep='', file=handle)
            wrapped_rows = textwrap.wrap(path, 79)
            for wrapped_row in wrapped_rows:
                print(wrapped_row, file=handle)

        posterior_path = fasta_entry.id + '.plot'
        with open(posterior_path, 'w') as handle:
            dump_posterior_file(handle, posterior)

        # Without matplotlib the namespace has no plot_posterior attribute.
        if not getattr(parsed, 'plot_posterior', False):
            continue
        with open(posterior_path, 'r') as handle:
            plot(handle, fasta_entry.id + '.pdf')