Example #1
import logging
from pprint import pformat

# project helpers such as load_config_from_json_file, load_fasta_file, solve,
# PathResolver, get_allignments and save_output are assumed to be imported
# from elsewhere in the project


def main(args):
    logging.basicConfig(level=logging.getLevelName(args.logging))
    logging.info('User args: %s' % pformat(args))
    config = load_config_from_json_file(
        args.config,
        ['gap', 'same', 'diff', 'max_number_of_paths', 'max_sequence_length'])
    logging.info('Config is: \n%s' % pformat(config))

    seq1 = load_fasta_file(args.input1)
    seq2 = load_fasta_file(args.input2)
    if config['max_sequence_length'] != 0 and max(
            len(seq1), len(seq2)) > config['max_sequence_length']:
        raise ValueError('Sequence exceeded max_sequence_length')

    score_matrix, nodes_mapping = solve(seq1, seq2, config['gap'],
                                        config['diff'], config['same'])

    logging.debug('Score matrix: \n%s' % pformat(score_matrix))
    logging.debug('Nodes mapping: (target_node): [(parent_node),...]\n%s' %
                  pformat(nodes_mapping))
    logging.info('Alignments score: %s' % score_matrix[len(seq1), len(seq2)])

    paths = PathResolver(nodes_mapping).resolve_paths(
        len(seq1), len(seq2), config['max_number_of_paths'])

    allignments = [get_allignments(path, seq1, seq2) for path in paths]

    for i, (allignment_1, allignment_2) in enumerate(allignments):
        logging.info('[A%04d] %s' % (i, allignment_1))
        logging.info('[A%04d] %s' % (i, allignment_2))

    if args.output:
        save_output(
            args.output, {
                'seq1': seq1,
                'seq2': seq2,
                'config': config,
                'allignments': allignments,
                'score_matrix': score_matrix.tolist()
            })
        logging.info('Saved output to %s' % args.output)
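The examples on this page all rely on a project-local load_fasta_file helper whose implementation is not shown. A minimal sketch of what such a loader might look like, assuming it reads a single-record FASTA file and returns the sequence as one string (Example #4 instead uses a utils.load_fasta_file variant that returns a dict mapping headers to sequences):

def load_fasta_file(fasta_file_path):
    """Hypothetical sketch: return the sequence of a single-record FASTA
    file as one string; header lines starting with '>' are skipped."""
    sequence_parts = []
    with open(fasta_file_path, 'r') as handle:
        for line in handle:
            line = line.strip()
            if line and not line.startswith('>'):
                sequence_parts.append(line)
    return ''.join(sequence_parts)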
Example #2
import csv

# load_fasta_file is assumed to be imported from elsewhere in the project


def load_train_sequences(fasta_file_path, clusters_file_path):
    """
    :param fasta_file_path: {string} path to the FASTA input file.
    :param clusters_file_path: {string} path to the clusters TSV file.
    :return: {list} list of train sequences.
    """
    observation = load_fasta_file(fasta_file_path)
    train_sequences = []
    with open(clusters_file_path, 'r') as tsv:
        reader = csv.reader(tsv, dialect='excel-tab')
        next(reader, None)  # skip the header row
        for line in reader:
            start, end = map(int, line)
            # keep 100 bases of context on each side of the cluster
            train_sequences.append(observation[start - 100:end + 100])
    return train_sequences
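A hypothetical usage sketch for the function above; the file paths are placeholders, not files from the original project:

# placeholder paths, for illustration only
train_sequences = load_train_sequences('data/chrom17.fasta',
                                       'data/clusters.tsv')
print('loaded %d training sequences' % len(train_sequences))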
Example #3
import numpy as np

# load_fasta_file, viterbi_algorithm, restore_viterbi_path and q_3_1_c are
# assumed to be defined or imported elsewhere in this module


def main():

    from hmm import load_hmm_model_from_files

    fasta_file_path = r'data/chrom17.fasta'
    emission_input_file = r'data/initial_emission.tsv'
    transition_input_file = r'data/initial_transition.tsv'

    hmm_model = load_hmm_model_from_files(emission_input_file, transition_input_file)
    observation = load_fasta_file(fasta_file_path)
    v, ptr = viterbi_algorithm(hmm_model, observation)
    np.savez_compressed('output/viterbi_algorithm_output.npz', v=v, ptr=ptr)
    path = restore_viterbi_path(v, ptr)
    np.savez_compressed('output/path.npz', path=path)
    q_3_1_c(path, observation)
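The compressed arrays written by main() above can be read back with numpy's load; a short sketch using the file names from the snippet:

import numpy as np

# reload the Viterbi matrices and the decoded path saved above
viterbi_out = np.load('output/viterbi_algorithm_output.npz')
v, ptr = viterbi_out['v'], viterbi_out['ptr']
path = np.load('output/path.npz')['path']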
Example #4
from decimal import Decimal

# the project-local `utils` module is assumed to be imported elsewhere


def build_draft_genome_seq(task):
    draft_genome_summary = {}
    vc_summary_path = task.path.joinpath(task.id, task.id + '_vc_summary.json')
    vc_dict = utils.load_json_file(vc_summary_path)
    dominant_vc = {}
    for ref_order in range(1, task.ref_num + 1):
        dominant_vc[ref_order] = {}
        draft_genome_summary[ref_order] = {
            'conflicts': [],
            'snv_list': [],
            'error': [],
            'file_path': ''
        }
        for vc_table in vc_dict.values():
            for pos, snvs in vc_table[str(ref_order)].items():
                ref = snvs['REF']
                for snv, alns in snvs['SNV'].items():
                    for aligner in alns:
                        if Decimal(alns[aligner]['FREQ'][:-1]) / 100 > Decimal(
                                task.vc_threshold):
                            if dominant_vc[ref_order].get(pos) is None:
                                dominant_vc[ref_order][pos] = {
                                    'REF': ref,
                                    'ALT': {}
                                }
                            if dominant_vc[ref_order][pos]['ALT'].get(snv) is None:
                                dominant_vc[ref_order][pos]['ALT'].update(
                                    {snv: {
                                        'SCORE': 0
                                    }})
                            dominant_vc[ref_order][pos]['ALT'][snv][
                                'SCORE'] += 1

        # start from the imported reference sequence, one base per list entry
        imported_ref = task.path.joinpath(
            task.id, 'reference', '%s_ref_%d.fasta' % (task.id, ref_order))
        ref_fasta_dict = utils.load_fasta_file(imported_ref)
        fasta_base_list = list(list(ref_fasta_dict.values())[0])

        for pos, vc in dominant_vc[ref_order].items():
            if len(vc['ALT']) > 1:
                # skip conflict results
                draft_genome_summary[ref_order]['conflicts'].append(pos)
            else:
                ref_mer = vc['REF']
                alt_mer = list(vc['ALT'].keys())[0]
                score = int(vc['ALT'][alt_mer]['SCORE'])
                if score >= int(task.min_vc_score):
                    # apply snv onto reference sequence
                    i = 0
                    for base in ref_mer[1:]:
                        if fasta_base_list[int(pos) + i] != base:
                            draft_genome_summary[ref_order]['error'].append(
                                pos)
                            break
                        fasta_base_list[int(pos) + i] = ''
                        i += 1
                    fasta_base_list[int(pos) - 1] = alt_mer
                    # record applied snv
                    draft_genome_summary[ref_order]['snv_list'].append(
                        '%s%s%s' % (ref_mer, pos, alt_mer))

        draft_fasta_dict = {
            '%s_draft_%d' % (task.id, ref_order): ''.join(fasta_base_list)
        }

        draft_cwd = task.path.joinpath(task.id, 'draft_genome')
        draft_fasta_path = draft_cwd.joinpath('%s_draft_%d.fasta' %
                                              (task.id, ref_order))
        draft_genome_summary[ref_order]['file_path'] = str(draft_fasta_path)
        draft_cwd.mkdir(parents=True, exist_ok=True)
        utils.build_fasta_file(draft_fasta_path, draft_fasta_dict)
    utils.build_json_file(
        draft_cwd.joinpath('%s_draft_summary.json' % task.id),
        draft_genome_summary)
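The variant-frequency check above parses aligner-reported percentage strings such as '85.5%' before comparing them against task.vc_threshold. A small standalone illustration of that conversion, with made-up values:

from decimal import Decimal

freq_string = '85.5%'   # hypothetical FREQ value reported by an aligner
vc_threshold = '0.5'    # hypothetical threshold: variant seen in >50% of reads

frequency = Decimal(freq_string[:-1]) / 100   # strip '%' and scale to 0..1
print(frequency > Decimal(vc_threshold))      # True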
Example #5
def q_2_b_ii(hmm_model, observation):
    # the opening lines of this snippet were truncated; reconstructed here,
    # assuming numpy (np) is imported and the random draw is over 'ACGT'
    random_observation = ''.join(np.random.choice(list('ACGT'),
                                                  size=len(observation),
                                                  replace=True))
    f = forward_algorithm(hmm_model, random_observation)
    print(f[1, -1])


def q_2_b_iii(hmm_model, observation):
    """
    calculating the log probability of a repetitive all-'A' sequence of the
    same length as chrom17.fasta, given the model.
    :return:
    """
    repetitive_observation = 'A' * len(observation)
    f = forward_algorithm(hmm_model, repetitive_observation)
    print(f[1, -1])


if __name__ == '__main__':

    from hmm import load_hmm_model_from_files

    fasta_file_path = r'data/chrom17.fasta'
    observation = load_fasta_file(fasta_file_path)

    emission_input_file = r'data/initial_emission.tsv'
    transition_input_file = r'data/initial_transition.tsv'

    hmm_model = load_hmm_model_from_files(emission_input_file, transition_input_file)

    q_2_b_i(hmm_model, observation)
    #q_2_b_ii(hmm_model, observation)
    #q_2_b_iii(hmm_model, observation)