# Stdlib imports used across this excerpt; project-internal helpers
# (import_bio_seq_io, write_fasta, write_yaml, run_module_script, ...) come
# from the surrounding package and are not shown here.
import os
import random
import itertools
import multiprocessing
from concurrent.futures import ThreadPoolExecutor


def create_irs_data(step_data, annotation_step, params):
    SeqIO = import_bio_seq_io()
    seq_idents = annotation_step.all_sequences()  # set
    ref_ident = find_referent_genome(seq_idents, params.referent_genome)

    step = annotation_step.project.new_step(ChloroplastSSCBlast, step_data)
    ref_seq_rec = annotation_step.get_sequence_record(ref_ident)
    ssc_location = step.get_type_description_elem('ssc_location', default=dict())
    ensure_directory(step.step_file('run_dir'))

    # Store query data
    query_file = step.step_file('run_dir', 'query.fa')
    if not os.path.isfile(query_file):
        irs = find_chloroplast_irs(ref_seq_rec)
        if not irs:
            raise ZCItoolsValueError(f"Referent genome ({ref_ident}) doesn't have IRS!")
        write_fasta(query_file, [('ira', str(irs[0].extract(ref_seq_rec).seq))])

    files_to_zip = [query_file]
    calc_seq_idents = []

    # All sequences, to create database from
    for seq_ident in sorted(seq_idents):
        if not os.path.isfile(step.step_file('run_dir', f'{seq_ident}.xml')):
            fa_file = step.step_file('run_dir', f'{seq_ident}.fa')
            files_to_zip.append(fa_file)
            calc_seq_idents.append(seq_ident)
            if not os.path.isfile(fa_file):
                seq_rec = annotation_step.get_sequence_record(seq_ident)
                SeqIO.write([seq_rec], fa_file, 'fasta')

                # Store SSC position
                irs = find_chloroplast_irs(seq_rec)
                ssc_location[seq_ident] = [len(seq_rec), int(irs[0].location.end), irb_start(irs[1])] \
                    if irs else [len(seq_rec), -1, -1]

    if calc_seq_idents:
        # Store finish.yml
        finish_f = step.step_file('finish.yml')
        write_yaml(dict(calc_seq_idents=calc_seq_idents), finish_f)

        run = True  # ToDo: ...
        step.save(dict(ssc_location=ssc_location), completed=False)
        if run:
            run_module_script(run_irs_blast, step)
            finish_irs_data(step)
        else:
            files_to_zip.append(finish_f)
            set_run_instructions(run_irs_blast, step, files_to_zip, _instructions)
    #
    elif params.force_blast_parse:
        finish_irs_data(step)

    return step
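
# Hedged sketch of irb_start(), which create_irs_data() above calls but this
# excerpt does not define. Assumption: it returns the IRB start coordinate as
# an int; min() over location.parts guards against Biopython CompoundLocations
# whose first part does not hold the smallest coordinate.
def irb_start(irb_feature):
    return min(int(p.start) for p in irb_feature.location.parts)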
def create_new_hybrids_data(project, step_data, params):
    # Check input files
    if not os.path.isfile(params.data_file):
        raise ZCItoolsValueError(f"Input data file {params.data_file} doesn't exist!")
    if not os.path.isfile(params.gtyp_cat_file):
        raise ZCItoolsValueError(f"Input genotype category probabilities file {params.gtyp_cat_file} doesn't exist!")
    data_file = os.path.basename(params.data_file)
    gtyp_cat_file = os.path.basename(params.gtyp_cat_file)

    step = NewHybridsStep(project, step_data, remove_data=True)
    step.set_data(data_file, gtyp_cat_file, params.theta_prior, params.pi_prior, params.burn_in, params.num_sweeps)

    # Copy input files
    files_to_zip = [step.step_file(data_file), step.step_file(gtyp_cat_file)]
    copy_file(params.data_file, files_to_zip[0])
    copy_file(params.gtyp_cat_file, files_to_zip[1])

    # Create run directories
    seeds = random.sample(list(itertools.product(range(1, _MAX_SMALL_NUMBER + 1), repeat=2)), params.num_runs)
    for seed in seeds:
        files_to_zip.append(step.step_file(step.seed_dir(seed)))
        ensure_directory(files_to_zip[-1])

    files_to_zip.append(step.step_file('finish.yml'))
    write_yaml(
        dict(data_file=data_file, gtyp_cat_file=gtyp_cat_file,
             theta_prior=params.theta_prior, pi_prior=params.pi_prior,
             burn_in=params.burn_in, num_sweeps=params.num_sweeps),
        files_to_zip[-1])

    # Stores description.yml
    step.save(completed=params.run)

    # Run or set instructions
    if params.run:
        run_module_script(run_new_hybrids, step)
    else:
        set_run_instructions(run_new_hybrids, step, files_to_zip, _instructions)
    #
    return step
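
# Illustration only (not part of the pipeline): the seed sampling used in
# create_new_hybrids_data(). random.sample() over the Cartesian product draws
# unique (seed_1, seed_2) pairs, so no two NewHybrids runs repeat a seed
# combination. _MAX_SMALL_NUMBER is a module constant not shown in this
# excerpt; the default below is an assumed value for the example.
def _example_seed_pairs(num_runs=3, max_small_number=100):
    return random.sample(
        list(itertools.product(range(1, max_small_number + 1), repeat=2)), num_runs)
    # e.g. [(17, 4), (88, 61), (5, 99)] -> one run directory per pair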
def create_irs_data(step_data, input_step, params, common_db):  # , run):
    # Creates Annotations step from input sequences/annotations
    # Step's subdirectory 'run_dir' contains input and output calculation files
    SeqIO = import_bio_seq_io()
    files_to_zip = []
    calc_seq_idents = []

    step = input_step.project.new_step(AnnotationsStep, step_data)
    # Set sequences
    step.set_sequences(input_step.all_sequences())
    ensure_directory(step.step_file('run_dir'))

    for seq_ident in input_step.all_sequences():
        out_file = step.step_file('run_dir', f'{seq_ident}.out')
        if not os.path.isfile(out_file):
            seq_rec = input_step.get_sequence_record(seq_ident)
            # Set fasta file for calculation
            files_to_zip.append(step.step_file('run_dir', f'{seq_ident}.fa'))
            SeqIO.write([seq_rec], files_to_zip[-1], 'fasta')
            calc_seq_idents.append(seq_ident)
        elif not os.path.isfile(step.step_file(f'{seq_ident}.gb')):
            calc_seq_idents.append(seq_ident)

    if files_to_zip:
        # Store finish.yml
        finish_f = step.step_file('finish.yml')
        write_yaml(dict(fa_files=files_to_zip), finish_f)

        run = True  # ToDo: ...
        step.save(completed=False)
        if run:
            run_module_script(run_irs_mummer, step)
            finish_irs_data(step, common_db, calc_seq_idents=calc_seq_idents)
        else:
            files_to_zip.append(finish_f)
            set_run_instructions(run_irs_mummer, step, files_to_zip, _instructions)
    #
    elif calc_seq_idents:
        finish_irs_data(step, common_db, calc_seq_idents=calc_seq_idents)
    elif params.force_mummer_parse:
        finish_irs_data(step, common_db)
    #
    return step
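
# For reference, the finish.yml written above is a plain mapping consumed by
# the cluster-side run script. With two queued sequences it would look like
# (paths illustrative):
#
#   fa_files:
#     - run_dir/NC_000001.fa
#     - run_dir/NC_000002.fa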
def create_raxml_data(step_data, alignment_step, partitions_obj, run_threads):
    # List of dicts with attrs: filename, short, partitions (filename or None)
    # This data is used to optimize calculation
    files_to_proc = []

    if alignment_step._IS_COLLECTION:
        step = RAxMLSteps(alignment_step.project, step_data, remove_data=True)
        for align_step in alignment_step.step_objects():
            substep = step.create_substep(align_step.get_local_name())
            substep.set_sequences(align_step.all_sequences())
            substep.seq_sequence_type(align_step.get_sequence_type())
            _copy_alignment_file(align_step, substep, files_to_proc, partitions_obj)
            # substep.save(completed=False)
    else:
        step = RAxMLStep(alignment_step.project, step_data, remove_data=True)
        step.set_sequences(alignment_step.all_sequences())
        step.seq_sequence_type(alignment_step.get_sequence_type())
        _copy_alignment_file(alignment_step, step, files_to_proc, partitions_obj)

    # Store files desc
    files_to_zip = [d['filename'] for d in files_to_proc]  # files to zip
    files_to_zip.extend(filter(None, (d['partitions'] for d in files_to_proc)))
    # Remove step directory from files since run script is called from step directory
    for d in files_to_proc:
        d['filename'] = step.strip_step_dir(d['filename'])
    finish_f = step.step_file('finish.yml')
    write_yaml(files_to_proc, finish_f)

    # Stores description.yml
    step.save(completed=bool(run_threads))
    if run_threads:
        run_module_script(run_raxml, step, threads=run_threads)
    else:
        files_to_zip.append(finish_f)
        set_run_instructions(run_raxml, step, files_to_zip, _instructions)
    #
    return step
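
# Shape of the files_to_proc records built by _copy_alignment_file() (helper
# not shown in this excerpt; field names inferred from their use above, values
# illustrative):
#
#   dict(filename='alignment.phy',      # made relative by strip_step_dir()
#        short=True,                    # assumption: flags short alignments so runs can be ordered
#        partitions='partitions.ind')   # or None when no partition file is used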
def create_permutations(project, step_data, raw_file, permutations, num_traits=None, run=False):
    # Check input files
    map_file = raw_file.replace('.raw', '.map')
    data_dir, base_raw_file = os.path.split(raw_file)
    tmp_files = ('tmp.00m', 'tmp.00c', 'tmp.00r')

    for mf in (raw_file, map_file):
        if not os.path.isfile(mf):
            raise ZCItoolsValueError(f"Input MapMaker file {mf} doesn't exist!")
    for qf in tmp_files:
        f = os.path.join(data_dir, qf)
        if not os.path.isfile(f):
            raise ZCItoolsValueError(f"Input Windows QTL Cartographer file {qf} doesn't exist!")

    #
    step = QTLCartStep(project, step_data, remove_data=True)
    step.set_data(num_traits, permutations)

    # Copy input files
    files_to_zip = []
    for qf in tmp_files:
        files_to_zip.append(step.step_file(qf))
        copy_file(os.path.join(data_dir, qf), files_to_zip[-1])

    # Create trait directories
    # ToDo: find max traits and fix it/set default
    assert num_traits and num_traits > 0, num_traits
    trait_dirs = []
    for t_idx in range(1, num_traits + 1):
        trait_dirs.append(step.trait_dir(t_idx))
        t_dir = step.step_file(trait_dirs[-1])
        ensure_directory(t_dir)
        files_to_zip.append(os.path.join(t_dir, 'qtlcart.rc'))
        write_str_in_file(files_to_zip[-1], _qtlcart_rc.format(trait=t_idx, num_traits=num_traits))
        # # Create links to input files
        # for qf in tmp_files:
        #     link_file(os.path.join('..', qf), os.path.join(t_dir, qf))
    #
    files_to_zip.append(step.step_file('finish.yml'))
    write_yaml(dict(permutations=permutations, trait_dirs=trait_dirs), files_to_zip[-1])

    # Stores description.yml
    step.save(completed=run)

    # Run or set instructions
    if run:
        run_module_script(run_qtl_cart_perm, step)
    else:
        set_run_instructions(run_qtl_cart_perm, step, files_to_zip, _instructions)
    #
    return step
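
# Resulting step layout for num_traits=2 (illustrative; directory names come
# from step.trait_dir(), assumed here to be simple per-trait names):
#
#   <step>/tmp.00m  tmp.00c  tmp.00r     # copied QTL Cartographer inputs
#   <step>/<trait_1>/qtlcart.rc          # _qtlcart_rc template rendered with trait=1
#   <step>/<trait_2>/qtlcart.rc          # _qtlcart_rc template rendered with trait=2
#   <step>/finish.yml                    # permutations + trait_dirs for the run script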
def create_irs_data(step_data, input_step, params):
    # Creates Annotations step from input sequences/annotations
    # Step's subdirectory 'run_dir' contains input and output calculation files
    SeqIO = import_bio_seq_io()
    seq_idents = input_step.all_sequences()

    step = input_step.project.new_step(AnnotationsStep, step_data)
    step.set_sequences(seq_idents)
    # seq_ident -> mummer data ([length, start_1, start_2])
    mummer_results = step.get_type_description_elem('mummer_results', default=dict())
    #
    ensure_directory(step.step_file('run_dir'))
    calc_mummer = []  # tuples (seq_ident, fasta file, mummer output file)

    # Mummer
    for seq_ident in sorted(seq_idents - set(mummer_results)):
        fa_file = step.step_file('run_dir', f'{seq_ident}.fa')
        mummer_res_file = step.step_file('run_dir', f'{seq_ident}.out')
        if not os.path.isfile(fa_file):
            seq_rec = input_step.get_sequence_record(seq_ident)
            SeqIO.write([seq_rec], fa_file, 'fasta')
            calc_mummer.append((seq_ident, fa_file, mummer_res_file))
        elif not os.path.isfile(mummer_res_file):
            calc_mummer.append((seq_ident, fa_file, mummer_res_file))

    # Run mummer
    if calc_mummer:
        mummer_exe = 'repeat-match'  # ToDo:
        n = 3000
        threads = multiprocessing.cpu_count()
        with ThreadPoolExecutor(max_workers=threads) as executor:
            for seq_ident, fa_file, mummer_res_file in calc_mummer:
                executor.submit(_run_single, mummer_exe, n, fa_file, mummer_res_file)

        for seq_ident, _, mummer_res_file in calc_mummer:
            rep = _read_mummer_repeat(mummer_res_file)
            if not rep:
                raise ZCItoolsValueError(f'No repeat for sequence {seq_ident}!')
            mummer_results[seq_ident] = rep

    # Find sequences to extend with alignment
    files_to_zip = []
    calc_mafft = []
    for seq_ident in sorted(seq_idents):
        length, s1, s2 = mummer_results[seq_ident]
        if length >= 23000:
            continue
        if step.is_file('run_dir', f'{seq_ident}_right_align.fa') and \
                step.is_file('run_dir', f'{seq_ident}_left_align.fa'):
            continue
        #
        calc_mafft.append(seq_ident)
        _seq = input_step.get_sequence_record(seq_ident).seq
        seq = str(_seq)
        comp_seq = str(_seq.complement())
        missing = 26000 - length

        # Right side
        p1 = _extract_subseq_plus(seq, s1 + length, missing)
        p2 = _extract_subseq_minus(comp_seq, s2 - length, missing)
        assert len(p1) == len(p2), (length, s1, s2, missing, (len(p1), len(p2)))
        files_to_zip.append(step.step_file('run_dir', f'{seq_ident}_right.fa'))
        write_fasta(files_to_zip[-1], [('p1', p1), ('p2', p2)])

        # Left side
        p1 = _extract_subseq_minus(comp_seq, s1 - 1, missing)
        p2 = _extract_subseq_plus(seq, s2 + 1, missing)
        assert len(p1) == len(p2), (length, s1, s2, missing, (len(p1), len(p2)))
        files_to_zip.append(step.step_file('run_dir', f'{seq_ident}_left.fa'))
        write_fasta(files_to_zip[-1], [('p1', p1), ('p2', p2)])

    # Mafft
    if calc_mafft:
        finish_f = step.step_file('finish.yml')
        write_yaml(dict(calc_seq_idents=calc_mafft), finish_f)

        run = True  # ToDo: ...
        step.save(additional_data=dict(mummer_results=mummer_results), completed=False)
        if run:
            run_module_script(run_mafft_irs, step)
            finish_irs_data(step)
        else:
            files_to_zip.append(finish_f)
            set_run_instructions(run_mafft_irs, step, files_to_zip, _instructions)
    #
    elif params.force_parse:
        finish_irs_data(step)

    return step
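
# Hedged sketches of the two subsequence helpers called above; they are not
# defined in this excerpt. Assumptions: chloroplast genomes are circular, so
# both wrap around the origin, and coordinates are 0-based indices into `seq`.
# The only property the caller checks is that both return exactly `length`
# characters (see the asserts above).
def _extract_subseq_plus(seq, start, length):
    # `length` characters forward from `start`, wrapping past the sequence end.
    start %= len(seq)
    sub = seq[start:start + length]
    return sub if len(sub) == length else sub + seq[:length - len(sub)]


def _extract_subseq_minus(seq, end, length):
    # `length` characters backwards, ending at `end` (inclusive), wrapping
    # before the sequence start.
    end %= len(seq)
    sub = seq[max(0, end + 1 - length):end + 1]
    return sub if len(sub) == length else seq[-(length - len(sub)):] + sub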
def orientate_chloroplast_start(step_data, annotation_step, params):
    # Find referent genome
    # For each sequence different than the referent, a directory named <seq_ident> is created.
    # It contains files:
    #  - {lsc|ira|ssc}_{plus|minus}.fa       : input alignment files, contain 2 sequences.
    #  - align_{lsc|ira|ssc}_{plus|minus}.fa : result alignment files.
    seq_idents = annotation_step.all_sequences()  # set
    ref_ident = find_referent_genome(seq_idents, params.referent_genome)
    #
    length = params.length_to_check

    step = annotation_step.project.new_step(ChloroplastOrientateStep, step_data, remove_data=False)
    sequence_data = step.get_type_description_elem('sequence_data', default=dict())
    #
    seq_rec = annotation_step.get_sequence_record(ref_ident)
    partition = find_chloroplast_partition(seq_rec)
    ref_parts = [str(partition.get_part_by_name(n).extract(seq_rec).seq)[:length] for n in _part_names]

    files_to_zip = []
    align_files = []
    #
    all_versions = ('plus', 'minus', 'plus_c', 'minus_c') if params.complement else ('plus', 'minus')
    for seq_ident in sorted(seq_idents):
        seq_rec = None
        if seq_ident not in sequence_data:
            seq_rec = annotation_step.get_sequence_record(seq_ident)
            partition = find_chloroplast_partition(seq_rec)

            # Count gene orientation
            l_seq = len(seq_rec)
            in_parts = partition.put_features_in_parts(
                Feature(l_seq, feature=f) for f in seq_rec.features if f.type == 'gene')

            lsc_count = sum(f.feature.strand if any(x in f.name for x in ('rpl', 'rps')) else 0
                            for f in in_parts.get('lsc', []))
            ssc_count = sum(f.feature.strand for f in in_parts.get('ssc', []))
            ira_count = sum(f.feature.strand if 'rrn' in f.name else 0 for f in in_parts.get('ira', []))

            sequence_data[seq_ident] = dict(
                length=len(seq_rec),
                lsc=(lsc_count <= 0), lsc_count=lsc_count, lsc_length=len(partition.get_part_by_name('lsc')),
                ssc=(ssc_count <= 0), ssc_count=ssc_count, ssc_length=len(partition.get_part_by_name('ssc')),
                ira=(ira_count >= 0), ira_count=ira_count, ira_length=len(partition.get_part_by_name('ira')))

        if all(all(step.is_file(seq_ident, f'align_{n}_{v}.fa') for v in all_versions) for n in _part_names):
            continue
        #
        if seq_rec is None:
            seq_rec = annotation_step.get_sequence_record(seq_ident)
            partition = find_chloroplast_partition(seq_rec)

        for n, ref_p in zip(_part_names, ref_parts):
            # Find missing output files
            _num = len(align_files)
            for x in all_versions:
                if not step.is_file(seq_ident, f'align_{n}_{x}.fa'):
                    files_to_zip.append(step.step_file(seq_ident, f'{n}_{x}.fa'))
                    align_files.append((seq_ident, n, x))
            if _num == len(align_files):
                continue

            # Store input files
            if all(step.is_file(seq_ident, f'align_{n}_{v}.fa') for v in all_versions):
                continue
            ensure_directory(step.step_file(seq_ident))
            part_s = partition.get_part_by_name(n).extract(seq_rec)

            f_p = step.step_file(seq_ident, f'{n}_plus.fa')
            f_p_c = step.step_file(seq_ident, f'{n}_plus_c.fa')
            if not os.path.isfile(f_p):
                write_fasta(f_p, [(ref_ident, ref_p), (seq_ident, str(part_s.seq)[:length])])
            if not os.path.isfile(f_p_c):
                write_fasta(f_p_c, [(ref_ident, ref_p),
                                    (seq_ident, str(part_s.reverse_complement().seq)[:(-length-1):-1])])

            f_m = step.step_file(seq_ident, f'{n}_minus.fa')
            f_m_c = step.step_file(seq_ident, f'{n}_minus_c.fa')
            if not os.path.isfile(f_m):
                write_fasta(f_m, [(ref_ident, ref_p),
                                  (seq_ident, str(part_s.reverse_complement().seq)[:length])])
            if not os.path.isfile(f_m_c):
                write_fasta(f_m_c, [(ref_ident, ref_p), (seq_ident, str(part_s.seq)[:(-length-1):-1])])

    #
    output_file = f"{params.output_file_prefix}_{length}{'_c' if params.complement else ''}.xlsx"
    data = dict(sequence_data=sequence_data, check_length=length,
                output_file=output_file, complement=params.complement)

    if align_files:
        # Store finish.yml
        finish_f = step.step_file('finish.yml')
        write_yaml(dict(align_files=align_files), finish_f)

        run = True  # ToDo: ...
        step.save(data, completed=False)
        if run:
            run_module_script(run_orientate, step)
            orientate_chloroplast_finish(step)  # , common_db, calc_seq_idents=calc_seq_idents)
        else:
            files_to_zip.append(finish_f)
            set_run_instructions(run_orientate, step, files_to_zip, _instructions)
    #
    elif params.force_parse:
        step.save(data)
        orientate_chloroplast_finish(step)  # , common_db, calc_seq_idents=calc_seq_idents)
    #
    else:
        step.save(data, completed=False)

    return step
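
# Note on the strand-vote heuristic above: every gene contributes its Biopython
# strand (+1 or -1) to its part's sum. A non-positive sum for LSC (counted over
# rpl*/rps* genes) or SSC, and a non-negative sum for IRA (counted over rrn
# genes), marks the part as oriented as expected. E.g. rpl/rps genes on strands
# (+1, -1, -1) give lsc_count == -1, hence lsc=True.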
def create_mr_bayes_data(step_data, alignment_step, args, partitions_obj, run_threads):
    # List of dicts with attrs: filename, short, result_prefix
    # This data is used to optimize calculation
    # ToDo: almost the same as raxml.py. Differs in class types, _copy_alignment_file() and file formats
    files_to_proc = []

    if alignment_step._IS_COLLECTION:
        step = MrBayesSteps(alignment_step.project, step_data, remove_data=True)
        for align_step in alignment_step.step_objects():
            substep = step.create_substep(align_step.get_local_name())
            substep.set_sequences(align_step.all_sequences())
            substep.seq_sequence_type(align_step.get_sequence_type())
            _copy_alignment_file(align_step, substep, files_to_proc, args, partitions_obj)
            # substep.save(completed=False)
        if args.num_runs and args.num_runs > 1:
            print('Warning: number of runs for a collection of alignments is not supported.')
    else:
        if args.num_runs and args.num_runs > 1:
            step = MrBayesSteps(alignment_step.project, step_data, remove_data=True)
            for run_idx in range(args.num_runs):
                substep = step.create_substep(f'RUN_{run_idx + 1}')
                substep.set_sequences(alignment_step.all_sequences())
                substep.seq_sequence_type(alignment_step.get_sequence_type())
                # ToDo: make symbolic links?
                _copy_alignment_file(alignment_step, substep, files_to_proc, args, partitions_obj)
                # substep.save(completed=False)
        else:
            step = MrBayesStep(alignment_step.project, step_data, remove_data=True)
            step.set_sequences(alignment_step.all_sequences())
            step.seq_sequence_type(alignment_step.get_sequence_type())
            _copy_alignment_file(alignment_step, step, files_to_proc, args, partitions_obj)

    # Store files desc
    files_to_zip = [d['filename'] for d in files_to_proc]  # files to zip
    # Remove step directory from files since run script is called from step directory
    for d in files_to_proc:
        d['filename'] = step.strip_step_dir(d['filename'])
        d['result_prefix'] = step.strip_step_dir(d['result_prefix'])
    finish_f = step.step_file('finish.yml')
    write_yaml(files_to_proc, finish_f)

    # Stores description.yml
    step.save(completed=bool(run_threads))
    if run_threads:
        run_module_script(run_mr_bayes, step, threads=run_threads, use_mpi=(not args.no_mpi))
    else:
        files_to_zip.append(finish_f)
        set_run_instructions(run_mr_bayes, step, files_to_zip, _instructions)
    #
    return step
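
# As in create_raxml_data(), but each files_to_proc record also carries a
# result_prefix (field names inferred from their use above; values
# illustrative):
#
#   dict(filename='alignment.nex',      # made relative by strip_step_dir()
#        short=True,                    # assumption: flags short alignments so runs can be ordered
#        result_prefix='alignment')     # MrBayes output prefix, also made relative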