def finish_raxml_data(step_obj):
    """Collect RAxML calculation results from output.zip into the step.

    Validates that the archive contains only RAxML output files (names
    matching ``RAxML_.*\\.raxml_output`` or ``run_info.txt``) located in the
    step directories listed in finish.yml, then unzips the archive into the
    step directory, re-checks the step data and saves it.

    :param step_obj: step object providing step_file()/directory/_check_data()/save()
    :raises ZCItoolsValueError: on a missing archive or unexpected content.
    """
    output_f = step_obj.step_file('output.zip')
    if not os.path.isfile(output_f):
        raise ZCItoolsValueError('No calculation output file output.zip!')

    # Check that all files are RAxML outputs, in same directories as files to
    # process, and filenames match RAxML_.*\.raxml_output
    dirs = set(os.path.dirname(d['filename'])
               for d in read_yaml(step_obj.step_file('finish.yml')))
    for z_file in list_zip_files(output_f):
        parts = z_file.split('/')  # ZipFile uses '/' as separator
        _dir = '' if len(parts) == 1 else os.sep.join(parts[:-1])
        if _dir not in dirs:
            raise ZCItoolsValueError(f'Output contains file(s) in not step directory ({_dir})!')
        if not _re_raxml_output.search(parts[-1]) and parts[-1] != 'run_info.txt':
            # Message fix: missing space before "found" in the original text
            raise ZCItoolsValueError(f'Not RAxML output file(s) found in the output ({parts[-1]})!')

    # Unzip data
    unzip_file(output_f, step_obj.directory)
    step_obj._check_data()
    step_obj.save(create=False)
def finish_mr_bayes_data(step_obj):
    """Collect MrBayes calculation results from output.zip into the step.

    Validates that the archive contains only known MrBayes output files
    (``_RESULT_PREFIX`` plus a known extension, or ``run_info.txt``) located
    in the step directories listed in finish.yml, then unzips the archive,
    re-checks the step data and saves it.

    :param step_obj: step object providing step_file()/directory/_check_data()/save()
    :raises ZCItoolsValueError: on a missing archive or unexpected content.
    """
    output_f = step_obj.step_file('output.zip')
    if not os.path.isfile(output_f):
        raise ZCItoolsValueError('No calculation output file output.zip!')

    allowed_files = set(_RESULT_PREFIX + ext for ext in (
        '.ckp', '.con.tre', '.parts', '.run1.p', '.run1.t', '.run2.p', '.run2.t', '.tstat', '.vstat'))

    # Check that all files are MrBayes outputs
    dirs = set(os.path.dirname(d['filename'])
               for d in read_yaml(step_obj.step_file('finish.yml')))
    for z_file in list_zip_files(output_f):
        parts = z_file.split('/')  # ZipFile uses '/' as separator
        _dir = '' if len(parts) == 1 else os.sep.join(parts[:-1])
        if _dir not in dirs:
            raise ZCItoolsValueError(f'Output contains file(s) in not step directory ({_dir})!')
        if parts[-1] not in allowed_files and parts[-1] != 'run_info.txt':
            # Message fix: missing space before "found" in the original text
            raise ZCItoolsValueError(f'Not MrBayes output file(s) found in the output ({parts[-1]})!')

    # Unzip data
    unzip_file(output_f, step_obj.directory)
    step_obj._check_data()
    step_obj.save(create=False)
def run(self, step_data):
    """Validate that args.step_directory exists and contains a step description.

    Reads ``description.yml`` from the directory into ``desc_data``.
    NOTE(review): the visible code ends after validation (``desc_data`` is
    assigned but unused here) — presumably the implementation continues;
    confirm against the complete source.

    :raises ZCItoolsValueError: when the path is not a directory or lacks a
        readable description.yml.
    """
    a = self.args
    if not os.path.isdir(a.step_directory):
        # Message grammar fixed ("a directory" instead of "an directory")
        raise ZCItoolsValueError(f"Step directory {a.step_directory} is not a directory!")
    if not (desc_data := read_yaml(os.path.join(a.step_directory, 'description.yml'))):
        # Message grammar fixed ("a step" instead of "an step")
        raise ZCItoolsValueError(f"Directory {a.step_directory} is not a step!")
def find_referent_genome(seq_idents, referent_seq_ident):
    """Return the sequence identifier matching the referent genome name.

    An exact match wins; otherwise exactly one identifier has to start with
    `referent_seq_ident`.

    :raises ZCItoolsValueError: when zero or more than one identifier starts
        with the given name.
    """
    if referent_seq_ident in seq_idents:
        return referent_seq_ident
    candidates = [ident for ident in seq_idents if ident.startswith(referent_seq_ident)]
    if len(candidates) == 1:
        return candidates[0]
    if not candidates:
        raise ZCItoolsValueError(f'No referent genome which name starts with {referent_seq_ident}!')
    raise ZCItoolsValueError(f'More genomes which name starts with {referent_seq_ident}!')
def _check_columns(columns):
    """Validate a table column specification.

    `columns` has to be a list/tuple of (name, type) pairs, each type being
    one of KNOWN_DATA_TYPES.

    :raises ZCItoolsValueError: on any malformed specification.
    """
    if not isinstance(columns, (tuple, list)):
        raise ZCItoolsValueError(f"Column specification is not a list or tuple ({type(columns)})!")
    # Every entry must be a 2-element pair
    wrong_columns = [col for col in columns
                     if not isinstance(col, (tuple, list)) or len(col) != 2]
    if wrong_columns:
        raise ZCItoolsValueError(f"Error in columns specification: {wrong_columns}")
    #
    # Column types must be supported
    wrong_types = [(name, col_type) for name, col_type in columns
                   if col_type not in KNOWN_DATA_TYPES]
    if wrong_types:
        raise ZCItoolsValueError(f"Unsupported column type(s) for: {wrong_types}")
def orient_chloroplast_parts_by_data(seq_rec, orientation, starts=None, partition=None):
    """Return a new sequence with selected chloroplast parts reverse-complemented.

    :param seq_rec:     sequence record with .seq and .name (Biopython
                        SeqRecord-like — assumed, TODO confirm against callers)
    :param orientation: collection of part names to reorient; may contain
                        'lsc', 'ssc' and/or 'ira'
    :param starts:      part start positions used to build a partition when
                        `partition` is not given
    :param partition:   precomputed partition object with an extract() method
    :raises ZCItoolsValueError: when neither `partition` nor `starts` is given.
    """
    if not partition:
        if starts:
            partition = create_chloroplast_partition_all(len(seq_rec.seq), starts)
        else:
            raise ZCItoolsValueError(f'No partition data to orient chloroplast parts for sequence {seq_rec.name}!')
    parts = partition.extract(seq_rec)  # dict name -> Seq object
    if 'lsc' in orientation:  # LSC
        parts['lsc'] = parts['lsc'].reverse_complement()
    if 'ssc' in orientation:  # SSC
        parts['ssc'] = parts['ssc'].reverse_complement()
    if 'ira' in orientation:  # IRs: swap IRa/IRb, reverse-complementing both
        parts['ira'], parts['irb'] = parts['irb'].reverse_complement(), parts['ira'].reverse_complement()

    # Reassemble in canonical order: LSC, IRa, SSC, IRb
    new_seq = parts['lsc'] + parts['ira'] + parts['ssc'] + parts['irb']
    # Reorientation must preserve the overall sequence length
    assert len(seq_rec.seq) == len(new_seq.seq), \
        (seq_rec.name, len(seq_rec.seq), len(new_seq.seq), starts,
         [(n, len(p)) for n, p in parts.items()],
         [(n, len(p)) for n, p in partition.extract(seq_rec).items()])
    return new_seq
def _fetch_complete_chloroplasts(organisms, args, max_taxid):
    """Fetch NCBI summaries of complete chloroplast (and optionally plastid) genomes.

    Queries the nucleotide database for RefSeq complete genomes of the given
    organisms, optionally filters by args.max_update_date, and drops (or, with
    args.check_taxids, rejects) records whose TaxId is unknown to the local
    NCBI taxonomy.
    NOTE(review): visible code ends after taxid filtering and `max_taxid` is
    unused here — presumably the implementation continues; confirm against
    the complete source.

    :raises ZCItoolsValueError: when args.check_taxids is set and some TaxIds
        are not resolvable.
    """
    from ...utils.entrez import Entrez
    ts = ' OR '.join(f'"{t}"[Organism]' for t in organisms)
    titles = ['chloroplast']
    if args.fetch_plastids:
        titles.append('plastid')
    #
    rows = []
    for title in titles:
        data = Entrez().search_summary(
            'nucleotide',
            term=f'({ts}) AND ("complete genome"[Title] AND {title}[Title]) AND refseq')
        assert isinstance(data, list), type(data)
        if args.max_update_date:
            max_d = date(*map(int, args.max_update_date.split('-')))
            data = [d for d in data if _to_date(d['UpdateDate']) <= max_d]
        rows.extend(data)

    ncbi_taxonomy = get_ncbi_taxonomy()
    parents = ncbi_taxonomy._nt().get_lineage_translator([int(r['TaxId']) for r in rows])
    if not_in := [r for r in rows if int(r['TaxId']) not in parents]:
        if args.check_taxids:
            raise ZCItoolsValueError(f"Fetch cpDNA genomes. Taxid not known!!! {not_in}")
        prev_len = len(rows)
        # Perf fix: filter by TaxId membership in `parents` (O(n) dict lookups)
        # instead of `r not in not_in` list containment of dicts (O(n^2)).
        # Equivalent, since not_in is exactly the rows whose TaxId is not in parents.
        rows = [r for r in rows if int(r['TaxId']) in parents]
        print(f'Removed sequences with problematic taxids: {prev_len - len(rows)}!')
def actions(self):  # Returns list of WfAction objects
    """Normalize self._actions() items into (step_name, command, prev_actions) tuples.

    Each item of self._actions() is a 2- or 3-tuple (step_name, cmd[, prev]):
    cmd may be a string (split on whitespace) or a list; prev may be a single
    action name or a list of names. Every command's first token must be a
    registered project command.
    NOTE(review): the visible code ends after validation without returning
    `actions` — confirm against the complete source.

    :raises ZCItoolsValueError: for commands not present in project.commands_map.
    """
    actions = []
    for x in self._actions():
        assert 2 <= len(x) <= 3, x
        sn, cmd = x[:2]
        # Third element may be missing, a single name, or a list of names
        a_prev = [] if len(x) == 2 else ([x[2]] if isinstance(x[2], str) else x[2])
        actions.append((sn, (cmd.split() if isinstance(cmd, str) else cmd), a_prev))
        # actions.append((sn, (shlex.split(cmd) if isinstance(cmd, str) else cmd), a_prev))
    assert all(isinstance(cmd, (list, tuple)) for _, cmd, _ in actions), actions
    assert all(isinstance(prevs, (list, tuple)) for _, _, prevs in actions), actions

    # Check commands
    commands_map = self.project.commands_map
    if (not_in := [cmd[0] for _, cmd, _ in actions if cmd[0] not in commands_map]):
        # Message typo fixed ("Workflow" instead of "Worflow")
        raise ZCItoolsValueError(f"Workflow actions, not existing command(s)! {', '.join(sorted(not_in))}")
def register_workflows(self, workflow_classes):
    """Register workflow classes into self.workflows_map, keyed by class _WORKFLOW id.

    :param workflow_classes: iterable of classes, each with a _WORKFLOW attribute.
    :raises ZCItoolsValueError: when a workflow id is already registered.
    """
    for w_cls in workflow_classes:
        wf = w_cls._WORKFLOW
        if wf in self.workflows_map:
            # The map stores classes, so report the class name directly
            # (the original's .__class__.__name__ would name the metaclass);
            # also fixed the "registerd" typo.
            raise ZCItoolsValueError(
                f"Workflow {wf} already registered ({self.workflows_map[wf].__name__})!")
        self.workflows_map[wf] = w_cls
def register_commands(self, command_classes):
    """Register command classes into self.commands_map, keyed by class _COMMAND id.

    :param command_classes: iterable of classes, each with a _COMMAND attribute.
    :raises ZCItoolsValueError: when a command id is already registered.
    """
    for command_cls in command_classes:
        cmd = command_cls._COMMAND
        if cmd in self.commands_map:
            # The map stores classes, so report the class name directly
            # (the original's .__class__.__name__ would name the metaclass);
            # also fixed the "registerd" typo.
            raise ZCItoolsValueError(
                f"Command {cmd} already registered ({self.commands_map[cmd].__name__})!")
        self.commands_map[cmd] = command_cls
def _check_rows(columns, rows): n_cols = len(columns) diff_lengts = [(i, len(row)) for i, row in enumerate(rows) if len(row) != n_cols] if diff_lengts: raise ZCItoolsValueError( f"Rows have different length than specified columns {n_cols}: {diff_lengts}" )
def run(self, step_data):
    """Validate the requested alignment identifiers against self._ALIGNMENTS.

    NOTE(review): `create_alignment_data` is imported but unused in the
    visible code, which ends after validation — presumably the full
    implementation continues; confirm against the complete source.

    :raises ZCItoolsValueError: when any requested alignment is not supported.
    """
    from .common_methods import create_alignment_data
    align_params = [a.lower() for a in self.args.alignments]
    sup = set(a for a, _ in self._ALIGNMENTS)  # supported alignment identifiers
    if not_sup_params := [a for a in align_params if a not in sup]:
        raise ZCItoolsValueError(f'No valid alignments set ({", ".join(sorted(not_sup_params))}).')
def register_steps(self, step_classes):
    """Register step classes into self.steps_map, keyed by class _STEP_TYPE id.

    :param step_classes: iterable of classes, each with a _STEP_TYPE attribute.
    :raises ZCItoolsValueError: when a step type is already registered.
    """
    for step_cls in step_classes:
        s_type = step_cls._STEP_TYPE
        if s_type in self.steps_map:
            # The map stores classes, so report the class name directly
            # (the original's .__class__.__name__ would name the metaclass);
            # also fixed the "registerd" typo.
            raise ZCItoolsValueError(
                f"Step {s_type} already registered ({self.steps_map[s_type].__name__})!")
        self.steps_map[s_type] = step_cls
def choose_first_column(self, *columns, error=False):
    """Return the first of the given column names present in the table.

    With error=True raise ZCItoolsValueError when none is found; otherwise
    return None.
    """
    _missing = object()  # sentinel, so a falsy column name can still be returned
    found = next((c for c in columns if self.has_column(c)), _missing)
    if found is not _missing:
        return found
    if error:
        raise ZCItoolsValueError(
            f'No column found from: {columns}.\nExisting columns: {[c for c, _ in self._columns]}')
def create_irs_data(step_data, annotation_step, params):
    """Create a BLAST-based IRS (inverted repeats) calculation step.

    Uses the referent genome's IRa region as BLAST query against all
    sequences of the annotation step. Sequences lacking cached results
    (<seq_ident>.xml) are prepared; the run is then executed locally or
    (dead branch, `run` hard-coded True) packaged as run instructions.

    :param step_data:       data for the new step
    :param annotation_step: step providing sequences and annotations
    :param params:          object with at least referent_genome and
                            force_blast_parse attributes
    :returns: the created step object
    :raises ZCItoolsValueError: when the referent genome has no IRS annotation.
    """
    SeqIO = import_bio_seq_io()
    seq_idents = annotation_step.all_sequences()  # set
    ref_ident = find_referent_genome(seq_idents, params.referent_genome)
    #
    step = annotation_step.project.new_step(ChloroplastSSCBlast, step_data)
    ref_seq_rec = annotation_step.get_sequence_record(ref_ident)
    ssc_location = step.get_type_description_elem('ssc_location', default=dict())
    ensure_directory(step.step_file('run_dir'))

    # Store query data
    query_file = step.step_file('run_dir', 'query.fa')
    if not os.path.isfile(query_file):
        irs = find_chloroplast_irs(ref_seq_rec)
        if not irs:
            raise ZCItoolsValueError(f"Referent genome ({ref_ident}) doesn't have IRS!")
        write_fasta(query_file, [('ira', str(irs[0].extract(ref_seq_rec).seq))])

    files_to_zip = [query_file]
    calc_seq_idents = []
    # All sequences, to create database from
    for seq_ident in sorted(seq_idents):
        if not os.path.isfile(step.step_file('run_dir', f'{seq_ident}.xml')):
            fa_file = step.step_file('run_dir', f'{seq_ident}.fa')
            files_to_zip.append(fa_file)
            calc_seq_idents.append(seq_ident)
            if not os.path.isfile(fa_file):
                seq_rec = annotation_step.get_sequence_record(seq_ident)
                SeqIO.write([seq_rec], fa_file, 'fasta')
                # Store SSC position; [-1, -1] marks sequences without IRS
                irs = find_chloroplast_irs(seq_rec)
                ssc_location[seq_ident] = [len(seq_rec), int(irs[0].location.end), irb_start(irs[1])] \
                    if irs else [len(seq_rec), -1, -1]

    if calc_seq_idents:
        # Store finish.yml
        finish_f = step.step_file('finish.yml')
        write_yaml(dict(calc_seq_idents=calc_seq_idents), finish_f)

        run = True  # ToDo: ...
        step.save(dict(ssc_location=ssc_location), completed=False)
        if run:
            run_module_script(run_irs_blast, step)
            finish_irs_data(step)
        else:
            files_to_zip.append(finish_f)
            set_run_instructions(run_irs_blast, step, files_to_zip, _instructions)
    #
    elif params.force_blast_parse:
        # Nothing new to calculate, but re-parse existing BLAST results on demand
        finish_irs_data(step)
    return step
def run(self, step_data):
    """Create mVISTA submission data.

    An email address (from args or settings) is required when actually
    posting the data (args.run set).

    :raises ZCItoolsValueError: when posting is requested without an email.
    """
    from .mvista import create_mvista_data
    from common_utils.file_utils import get_settings
    args = self.args
    email = args.email or get_settings()['email']
    if args.run and not email:
        raise ZCItoolsValueError('Email address is needed to post mVISTA data!')
    return create_mvista_data(step_data, self._input_step(), args.run, email)
def run(self, step_data):
    """Run OGDraw on the input annotation step with the requested image format.

    :raises ZCItoolsValueError: for an unsupported image format.
    """
    from .ogdraw import create_ogdraw
    image_format = self.args.image_format.lower()
    if image_format not in self._IMAGE_FORMATS:
        raise ZCItoolsValueError(f'Given format {image_format} is not supported!')
    return create_ogdraw(
        step_data, image_format, self._input_step(),
        self.get_common_db_object(), sequences=self.args.sequences)
def create_table_step(project, step_data, params):
    """Create (or overwrite) a TableStep from an input file or raw data.

    Supported formats: 'text' (one value per line), 'csv', and 'raw_data'
    (';'-separated values carried in params.filename). Columns come from
    params.columns ('name,type:name,type:...'), or, for CSV with a header,
    default to the header names with deduced types.

    :returns: the created TableStep
    :raises ZCItoolsValueError: on a missing file, unknown/unspecified data
        format, or missing column specification.
    """
    if not os.path.isfile(params.filename):
        raise ZCItoolsValueError(f"Table file {params.filename} doesn't exist.")

    # Find how to read data
    data_format = params.data_format
    if data_format is None:
        data_format = filetype_from_ext(params.filename)
    if not data_format:
        raise ZCItoolsValueError(f"Data format for input table is not specified or found! Filename {params.filename}.")

    columns = [x.split(',') for x in params.columns.split(':')] if params.columns else None
    if columns:
        # Single-element spec means only a name was given; deduce the type from it
        columns = [(c[0], column_name_2_type(c[0])) if len(c) == 1 else c for c in columns]

    # Read data.
    data = None
    if data_format == 'text':
        # ToDo: separator for more columns. For now only list supported
        with open(params.filename, 'r') as r:
            data = [[line] for line in filter(None, (_l.strip() for _l in r.readlines()))]
        data = sorted(data)
    elif data_format == 'csv':
        with open(params.filename, 'r') as incsv:
            reader = csv.reader(incsv, delimiter=params.delimiter, quotechar='"')
            if params.has_header:
                header = next(reader)  # Skip header
                if not columns:
                    columns = [(c, column_name_2_type(c)) for c in header]  # Default
            data = sorted(reader)
    elif data_format == 'raw_data':
        # Bug fix: original referenced the undefined name `filename`;
        # raw data is carried in params.filename itself.
        data = [[line] for line in params.filename.split(';') if line]
    else:
        raise ZCItoolsValueError(f'Data format {data_format} is not supported!')

    if not columns:
        raise ZCItoolsValueError(f"Columns are not specified for input table! Filename {params.filename}.")

    # Store (or overwrite) step data
    step = TableStep(project, step_data, remove_data=True)
    step.set_table_data(data, columns)
    step.save()
    return step
def run_command_with_args(self, *args):
    """Parse and execute a command given as CLI-style arguments.

    The first argument is the command name (matched case-insensitively).

    :raises ZCItoolsValueError: for an unregistered command.
    """
    command = args[0].lower()
    if command in self.commands_map:
        parser = self._get_parser(command, False)
        print(f"Run: {' '.join(args)}")
        return self._run_command(command, parser.parse_args(args), cmd_args=args)
    raise ZCItoolsValueError(f'Command "{command}" is not supported!')
def read_step(self, step_name, check_data_type=None, update_mode=False, no_check=False, outside_of_project=False):
    """Load a step object from its directory.

    :param step_name:       step directory as a string, or path parts as list/tuple
    :param check_data_type: required data type (string) or collection of allowed types
    :param outside_of_project: with a list/tuple step_name, pass the step
        directory explicitly to the step class
    :raises ZCItoolsValueError: when the directory is not a step, has a wrong
        data type, or no step class is registered for its data type.
    """
    if isinstance(step_name, str):
        path_parts = [step_name]
    else:
        assert isinstance(step_name, (list, tuple)), type(step_name)
        path_parts = list(step_name)
    desc_data = read_yaml(os.path.join(*path_parts, 'description.yml'))
    if not desc_data:
        raise ZCItoolsValueError(f"'{step_name}' is not a step!")

    data_type = desc_data['data_type']
    if check_data_type:
        if isinstance(check_data_type, str):
            if data_type != check_data_type:
                raise ZCItoolsValueError(
                    f"Step {step_name} is not of data type '{check_data_type}'!")
        elif data_type not in check_data_type:
            raise ZCItoolsValueError(
                f"Step {step_name} is not of data types: {', '.join(check_data_type)}!")

    cls = self.steps_map.get(data_type)
    if not cls:
        raise ZCItoolsValueError(f"No step class for data type {data_type}!")
    kwargs = dict(update_mode=update_mode, no_check=no_check)
    if outside_of_project and isinstance(step_name, (list, tuple)):
        kwargs['step_directory'] = step_name
    return cls(self, desc_data['project'], **kwargs)
def _input_step(self, no_data_check=False): assert self._INPUT_STEP_DATA_TYPE a = self.args step = self.project.read_step( a.step, check_data_type=self._INPUT_STEP_DATA_TYPE, no_check=(no_data_check or a.no_data_check)) if not step.is_completed(): raise ZCItoolsValueError(f"Input step {a.step} is not completed!") return step
def format_parameters(params):
    """Resolve params['methods'] (comma-separated string) into IRS method names.

    'all' selects METHOD_NAMES, 'research' selects METHOD_NAMES_RESEARCH;
    otherwise every listed method must belong to METHOD_NAMES.
    NOTE(review): when specific known methods are listed, no visible branch
    assigns `methods` — presumably `methods = ms` (or similar) follows;
    confirm against the complete source.

    :raises ZCItoolsValueError: for unknown method names.
    """
    # Methods
    from ..chloroplast.irs.analyse_irs import METHOD_NAMES, METHOD_NAMES_RESEARCH
    ms = params['methods'].lower().split(',')
    if 'all' in ms:
        methods = METHOD_NAMES
    elif 'research' in ms:
        methods = METHOD_NAMES_RESEARCH
    elif not_known_methods := set(m for m in ms if m not in METHOD_NAMES):
        raise ZCItoolsValueError(f'Not know method(s): {", ".join(not_known_methods)}!')
def run(self):
    """Apply a sequence-file transformation ('revert' or 'translate').

    :raises ZCItoolsValueError: for any other method name.
    """
    from ..utils.helpers import change_sequence_data
    args = self.args
    method = args.method.lower()
    if method not in ('revert', 'translate'):
        raise ZCItoolsValueError(f"Not known method {args.method}!")
    change_sequence_data(
        method, args.input_filename, args.output_filename,
        input_format=args.input_format, output_format=args.output_format,
        position=args.position)
def step_base_name(self):
    """Compose the step's base name from the fix method and keep_offset argument.

    :raises ZCItoolsValueError: for an unknown method.
    """
    method_char = self.args.method[0].lower()
    if method_char == 'p':
        name = self._format_step_name('FixByParts')
    # elif method_char == 'h':
    #     name = self._format_step_name('FixByTrnH-GUG')
    elif method_char == 'f':
        name = self._format_step_name('FixByTrnF-GAA')
    else:
        raise ZCItoolsValueError(f'Not known method {self.args.method}!')
    # Append the offset suffix only when it differs from the default
    offset = self.args.keep_offset
    return name if offset == DEFAULT_KEEP_OFFSET else f'{name}_{offset}'
def _have_same_nodes(self, t2, tree_idx): assert self.node_names[tree_idx], tree_idx assert t2.node_names[tree_idx], tree_idx not_in_2 = self.node_names[tree_idx] - t2.node_names[tree_idx] not_in_1 = t2.node_names[tree_idx] - self.node_names[tree_idx] if not_in_2 or not_in_1: if not_in_2: print("Nodes missing in the second tree:", ', '.join(sorted(not_in_2))) if not_in_1: print("Nodes missing in the first tree:", ', '.join(sorted(not_in_1))) raise ZCItoolsValueError("Trees don't have same set of nodes.")
def run(self, step_data):
    """Dispatch to the fix-by-analysis implementation chosen by args.method.

    :raises ZCItoolsValueError: for an unknown method.
    """
    method_char = self.args.method[0].lower()
    if method_char == 'p':
        from .fix_by_analysis import fix_by_parts as fix_method
    # elif method_char == 'h':
    #     from .fix_by_analysis import fix_by_trnH_GUG as fix_method
    elif method_char == 'f':
        from .fix_by_analysis import fix_by_trnF_GAA as fix_method
    else:
        raise ZCItoolsValueError(f'Not known method {self.args.method}!')
    #
    return fix_method(
        step_data, self._input_step(no_data_check=True),
        self.args.subset, self.args.keep_offset, self.get_common_db_object())
def fetch_common_db_data(step_data, table_step, step_type, common_db):
    """Create a new step populated with CommonDB records for all seq_idents
    listed in the table step.

    :returns: the created (and saved) step
    :raises ZCItoolsValueError: when a CommonDB record is missing.
    """
    step = table_step.project.new_step_by_type(step_type, step_data, remove_data=True)
    for seq_ident in table_step.get_column_values_by_type('seq_ident'):
        f = common_db.get_record(seq_ident, step.directory, info=True)
        if not f:
            # Message grammar fixed ("is no" instead of "is not")
            raise ZCItoolsValueError(f"There is no CommonDB record for seq ident {seq_ident}!")
        step.add_sequence_file(os.path.basename(f))
    step.save()
    return step
def _check_data(self):
    """Validate step contents.

    An alignment file must exist, and sequences.fa must contain exactly the
    step's declared sequence identifiers.
    """
    # At least one of the known alignment files has to be present
    if not any(os.path.isfile(self.step_file(name))
               for name in ('alignment.phy', 'alignment.fa')):
        raise ZCItoolsValueError(f'No alignment file for step {self.directory}!')
    # Identifiers in the fasta file must match the declared set
    existing_idents = set(read_fasta_identifiers(self.step_file('sequences.fa')))
    sets_equal(self._sequences, existing_idents, 'sequence', step=self.directory)
def new_step_by_type(self, data_type, step_data, remove_data=False, update_mode=False, no_check=False):
    """Instantiate a new step of the class registered for `data_type`.

    Data checking is skipped when either `no_check` or the project-wide
    no_data_check argument is set.

    :raises ZCItoolsValueError: when no step class is registered for the type.
    """
    cls = self.steps_map.get(data_type)
    if not cls:
        raise ZCItoolsValueError(f"No step class for data type {data_type}!")
    skip_check = no_check or self._args.no_data_check
    return cls(self, step_data, remove_data=remove_data,
               update_mode=update_mode, no_check=skip_check)
def read_complete_step(self, step_name, check_data_type=None, update_mode=False, no_check=False, outside_of_project=False):
    """Read a step (see read_step) and additionally require it to be completed.

    :raises ZCItoolsValueError: when the step exists but is not completed.
    """
    step = self.read_step(
        step_name, check_data_type=check_data_type, update_mode=update_mode,
        no_check=no_check, outside_of_project=outside_of_project)
    if step.is_completed():
        return step
    raise ZCItoolsValueError(f"Input step {step_name} is not completed!")