import logging
from pprint import pformat


def main(args):
    """Align two FASTA sequences and log/save the resulting alignments."""
    logging.basicConfig(level=logging.getLevelName(args.logging))
    logging.info('User args: %s', pformat(args))
    config = load_config_from_json_file(
        args.config,
        ['gap', 'same', 'diff', 'max_number_of_paths', 'max_sequence_length'])
    logging.info('Config is: \n%s', pformat(config))
    seq1 = load_fasta_file(args.input1)
    seq2 = load_fasta_file(args.input2)
    if config['max_sequence_length'] != 0 and max(
            len(seq1), len(seq2)) > config['max_sequence_length']:
        raise ValueError('Sequence exceeded max_sequence_length')
    score_matrix, nodes_mapping = solve(seq1, seq2, config['gap'],
                                        config['diff'], config['same'])
    logging.debug('Score matrix: \n%s', pformat(score_matrix))
    logging.debug('Nodes mapping: (target_node): [(parent_node),...]\n%s',
                  pformat(nodes_mapping))
    logging.info('Alignment score: %s', score_matrix[len(seq1), len(seq2)])
    paths = PathResolver(nodes_mapping).resolve_paths(
        len(seq1), len(seq2), config['max_number_of_paths'])
    alignments = [get_alignments(path, seq1, seq2) for path in paths]
    for i, (alignment_1, alignment_2) in enumerate(alignments):
        logging.info('[A%04d] %s', i, alignment_1)
        logging.info('[A%04d] %s', i, alignment_2)
    if args.output:
        save_output(
            args.output, {
                'seq1': seq1,
                'seq2': seq2,
                'config': config,
                'alignments': alignments,
                'score_matrix': score_matrix.tolist()
            })
        logging.info('Saved output to %s', args.output)
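# A minimal CLI wrapper sketch for main(); hypothetical, not part of the repo.
# The flag names are assumptions inferred from the attributes main() reads
# (args.logging, args.config, args.input1, args.input2, args.output).
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Pairwise FASTA alignment')
    parser.add_argument('--logging', default='INFO')
    parser.add_argument('--config', required=True)
    parser.add_argument('--input1', required=True)
    parser.add_argument('--input2', required=True)
    parser.add_argument('--output', default=None)
    main(parser.parse_args())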
import csv


def load_train_sequences(fasta_file_path, clusters_file_path):
    """
    :param fasta_file_path: {string} path to the FASTA input file.
    :param clusters_file_path: {string} path to the clusters TSV file.
    :return: {list} list of training sequences, each padded with a 100-base
        flank on both sides of its cluster interval.
    """
    observation = load_fasta_file(fasta_file_path)
    train_sequences = []
    with open(clusters_file_path, 'r') as tsv:
        reader = csv.reader(tsv, dialect='excel-tab')
        next(reader)  # skip the header row
        for line in reader:
            start, end = map(int, line)
            # clamp the left flank so a cluster near position 0 does not
            # produce a negative slice index
            train_sequences.append(observation[max(start - 100, 0):end + 100])
    return train_sequences
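# Hypothetical usage of load_train_sequences(); the file names and interval
# values below are made up. The clusters file is expected to be tab-separated
# with a single header row followed by integer start/end pairs, e.g.:
#   start	end
#   1200	1850
#   9031	9702
train = load_train_sequences('data/chrom17.fasta', 'data/clusters.tsv')
print('%d training sequences loaded' % len(train))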
import numpy as np


def main():
    from hmm import load_hmm_model_from_files

    fasta_file_path = r'data/chrom17.fasta'
    emission_input_file = r'data/initial_emission.tsv'
    transition_input_file = r'data/initial_transition.tsv'
    hmm_model = load_hmm_model_from_files(emission_input_file,
                                          transition_input_file)
    observation = load_fasta_file(fasta_file_path)
    # run Viterbi, persist the DP matrix and back-pointers, then decode and
    # persist the most probable state path
    v, ptr = viterbi_algorithm(hmm_model, observation)
    np.savez_compressed('output/viterbi_algorithm_output.npz', v=v, ptr=ptr)
    path = restore_viterbi_path(v, ptr)
    np.savez_compressed('output/path.npz', path=path)
    q_3_1_c(path, observation)
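# viterbi_algorithm() and restore_viterbi_path() are defined elsewhere in the
# repo; below is a minimal log-space sketch of the standard recurrence they
# are presumed to implement. The (states x time) matrix layout and the
# argument names are assumptions, not the repo's actual API.
import numpy as np


def viterbi_sketch(log_init, log_trans, log_emit, obs_idx):
    """Standard Viterbi in log space.

    log_init:  (K,)   log initial state probabilities
    log_trans: (K, K) log transition probabilities, row = from-state
    log_emit:  (K, M) log emission probabilities per symbol
    obs_idx:   (T,)   observation encoded as symbol indices
    """
    K, T = log_init.shape[0], len(obs_idx)
    v = np.full((K, T), -np.inf)
    ptr = np.zeros((K, T), dtype=int)
    v[:, 0] = log_init + log_emit[:, obs_idx[0]]
    for t in range(1, T):
        scores = v[:, t - 1, None] + log_trans    # scores[i, j]: i -> j
        ptr[:, t] = np.argmax(scores, axis=0)     # best predecessor per j
        v[:, t] = scores[ptr[:, t], np.arange(K)] + log_emit[:, obs_idx[t]]
    # backtrack from the best final state
    path = [int(np.argmax(v[:, -1]))]
    for t in range(T - 1, 0, -1):
        path.append(int(ptr[path[-1], t]))
    return v, ptr, path[::-1]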
from decimal import Decimal
from pathlib import Path

import utils


def build_draft_genome_seq(task):
    """Apply dominant SNVs to each imported reference and write draft genomes."""
    draft_genome_summary = {}
    vc_summary_path = task.path.joinpath(task.id, task.id + '_vc_summary.json')
    vc_dict = utils.load_json_file(vc_summary_path)
    dominant_vc = {}
    for ref_order in range(1, task.ref_num + 1):
        dominant_vc[ref_order] = {}
        draft_genome_summary[ref_order] = {
            'conflicts': [],
            'snv_list': [],
            'error': [],
            'file_path': ''
        }
        # collect SNVs whose frequency clears the threshold; every supporting
        # aligner call adds one to the variant's SCORE
        for vc_table in vc_dict.values():
            for pos, snvs in vc_table[str(ref_order)].items():
                ref = snvs['REF']
                for snv, alns in snvs['SNV'].items():
                    for aligner in alns:
                        freq = Decimal(alns[aligner]['FREQ'][:-1]) / 100
                        if freq > Decimal(task.vc_threshold):
                            if dominant_vc[ref_order].get(pos) is None:
                                dominant_vc[ref_order][pos] = {
                                    'REF': ref,
                                    'ALT': {}
                                }
                            if dominant_vc[ref_order][pos]['ALT'].get(snv) is None:
                                dominant_vc[ref_order][pos]['ALT'][snv] = {
                                    'SCORE': 0
                                }
                            dominant_vc[ref_order][pos]['ALT'][snv]['SCORE'] += 1
        # load the imported reference as a mutable list of bases
        imported_ref = task.path.joinpath(
            task.id, 'reference', '%s_ref_%d.fasta' % (task.id, ref_order))
        ref_fasta_dict = utils.load_fasta_file(imported_ref)
        fasta_base_list = list(list(ref_fasta_dict.values())[0])
        for pos, vc in dominant_vc[ref_order].items():
            if len(vc['ALT']) > 1:
                # skip conflicting calls (more than one dominant ALT at pos)
                draft_genome_summary[ref_order]['conflicts'].append(pos)
            else:
                ref_mer = vc['REF']
                alt_mer = list(vc['ALT'].keys())[0]
                score = int(vc['ALT'][alt_mer]['SCORE'])
                if score >= int(task.min_vc_score):
                    # apply the SNV onto the reference sequence: verify the
                    # trailing REF bases, blank them, then write the ALT k-mer
                    # at the 1-based position
                    i = 0
                    for base in ref_mer[1:]:
                        if fasta_base_list[int(pos) + i] != base:
                            draft_genome_summary[ref_order]['error'].append(pos)
                            break
                        fasta_base_list[int(pos) + i] = ''
                        i += 1
                    fasta_base_list[int(pos) - 1] = alt_mer
                    # record the applied SNV
                    draft_genome_summary[ref_order]['snv_list'].append(
                        '%s%s%s' % (ref_mer, pos, alt_mer))
        draft_fasta_dict = {
            '%s_draft_%d' % (task.id, ref_order): ''.join(fasta_base_list)
        }
        draft_cwd = task.path.joinpath(task.id, 'draft_genome')
        draft_fasta_path = draft_cwd.joinpath('%s_draft_%d.fasta' %
                                              (task.id, ref_order))
        draft_genome_summary[ref_order]['file_path'] = str(draft_fasta_path)
        draft_cwd.mkdir(parents=True, exist_ok=True)
        utils.build_fasta_file(draft_fasta_path, draft_fasta_dict)
    utils.build_json_file(
        draft_cwd.joinpath('%s_draft_summary.json' % task.id),
        draft_genome_summary)
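# A worked illustration of the frequency check above; the values are made up.
# FREQ arrives as a percentage string such as '93.4%', so the trailing '%' is
# stripped and the number divided by 100 before comparing against
# task.vc_threshold (a fraction, e.g. '0.5').
from decimal import Decimal

freq = Decimal('93.4%'[:-1]) / 100      # Decimal('0.934')
print(freq > Decimal('0.5'))            # True -> counts toward SCORE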
import numpy as np


def q_2_b_ii(hmm_model, observation):
    """
    Calculating the log probability of a random A/C/G/T sequence of the same
    length as chrom17.fasta, given the model.
    """
    random_observation = ''.join(
        np.random.choice(list('ACGT'), size=len(observation), replace=True))
    f = forward_algorithm(hmm_model, random_observation)
    print(f[1, -1])


def q_2_b_iii(hmm_model, observation):
    """
    Calculating the log probability of an all-'A' sequence of the same length
    as chrom17.fasta, given the model.
    """
    repetitive_observation = 'A' * len(observation)
    f = forward_algorithm(hmm_model, repetitive_observation)
    print(f[1, -1])


if __name__ == '__main__':
    from hmm import load_hmm_model_from_files

    fasta_file_path = r'data/chrom17.fasta'
    observation = load_fasta_file(fasta_file_path)
    emission_input_file = r'data/initial_emission.tsv'
    transition_input_file = r'data/initial_transition.tsv'
    hmm_model = load_hmm_model_from_files(emission_input_file,
                                          transition_input_file)
    q_2_b_i(hmm_model, observation)
    # q_2_b_ii(hmm_model, observation)
    # q_2_b_iii(hmm_model, observation)
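# forward_algorithm() is defined elsewhere in the repo; below is a minimal
# log-space sketch of the standard forward recurrence it is presumed to
# implement. The (states x time) matrix layout and argument names are
# assumptions. The total log-likelihood of an observation is
# logsumexp(f[:, -1]) over all end states.
import numpy as np
from scipy.special import logsumexp


def forward_sketch(log_init, log_trans, log_emit, obs_idx):
    """Forward algorithm in log space; returns the (K, T) log-forward matrix."""
    K, T = log_init.shape[0], len(obs_idx)
    f = np.full((K, T), -np.inf)
    f[:, 0] = log_init + log_emit[:, obs_idx[0]]
    for t in range(1, T):
        # sum over predecessors in log space for numerical stability
        f[:, t] = (logsumexp(f[:, t - 1, None] + log_trans, axis=0)
                   + log_emit[:, obs_idx[t]])
    return f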