def finish_raxml_data(step_obj):
    """Validate the RAxML result archive of a step and unpack it.

    The archive is expected at ``output.zip`` inside the step. Every member
    must live in one of the directories listed (via 'filename' entries) in
    the step's ``finish.yml`` and must either be ``run_info.txt`` or have a
    name accepted by ``_re_raxml_output``. Only when all members pass is
    the archive extracted into the step directory; the step is then
    re-checked and saved.

    Raises:
        ZCItoolsValueError: if ``output.zip`` is missing, or if a member is
            in an unexpected directory or has an unexpected name.
    """
    archive = step_obj.step_file('output.zip')
    if not os.path.isfile(archive):
        raise ZCItoolsValueError('No calculation output file output.zip!')

    # Directories that were submitted for processing; results may only be
    # placed next to the files they were computed from.
    allowed_dirs = {
        os.path.dirname(rec['filename'])
        for rec in read_yaml(step_obj.step_file('finish.yml'))
    }

    for member in list_zip_files(archive):
        # ZipFile uses '/' as separator; rebuild the directory with os.sep
        # so it is comparable with the os.path.dirname() results above.
        *dir_parts, base_name = member.split('/')
        member_dir = os.sep.join(dir_parts)  # '' for a top-level member
        if member_dir not in allowed_dirs:
            raise ZCItoolsValueError(
                f'Output contains file(s) in not step directory ({member_dir})!')
        if not (base_name == 'run_info.txt'
                or _re_raxml_output.search(base_name)):
            raise ZCItoolsValueError(
                f'Not RAxML output file(s)found in the output ({base_name})!')

    # Everything checked out: unzip data and refresh the step state.
    unzip_file(archive, step_obj.directory)
    step_obj._check_data()
    step_obj.save(create=False)
def finish(self, step_obj):
    """Finish an alignment step by delegating to ``finish_alignment_data``.

    The set of files allowed in the result archive is derived from the
    'filename' entries of the step's ``finish.yml``, with 'sequences.fa'
    replaced by 'alignment.phy', plus ``run_info.txt``.
    """
    from .common_methods import finish_alignment_data

    # Check are needed files in zip, not something strange
    expected = {'run_info.txt'}
    for rec in read_yaml(step_obj.step_file('finish.yml')):
        expected.add(rec['filename'].replace('sequences.fa', 'alignment.phy'))
    finish_alignment_data(step_obj, expected)
def finish_mr_bayes_data(step_obj):
    """Validate the MrBayes result archive of a step and unpack it.

    The archive is expected at ``output.zip`` inside the step. Every member
    must live in one of the directories listed (via 'filename' entries) in
    the step's ``finish.yml`` and must either be ``run_info.txt`` or be
    named ``_RESULT_PREFIX`` followed by one of the known MrBayes result
    extensions. Only when all members pass is the archive extracted into
    the step directory; the step is then re-checked and saved.

    Raises:
        ZCItoolsValueError: if ``output.zip`` is missing, or if a member is
            in an unexpected directory or has an unexpected name.
    """
    archive = step_obj.step_file('output.zip')
    if not os.path.isfile(archive):
        raise ZCItoolsValueError('No calculation output file output.zip!')

    result_extensions = ('.ckp', '.con.tre', '.parts', '.run1.p', '.run1.t',
                         '.run2.p', '.run2.t', '.tstat', '.vstat')
    allowed_files = {_RESULT_PREFIX + ext for ext in result_extensions}

    # Directories that were submitted for processing; results may only be
    # placed next to the files they were computed from.
    allowed_dirs = {
        os.path.dirname(rec['filename'])
        for rec in read_yaml(step_obj.step_file('finish.yml'))
    }

    for member in list_zip_files(archive):
        # ZipFile uses '/' as separator; rebuild the directory with os.sep
        # so it is comparable with the os.path.dirname() results above.
        *dir_parts, base_name = member.split('/')
        member_dir = os.sep.join(dir_parts)  # '' for a top-level member
        if member_dir not in allowed_dirs:
            raise ZCItoolsValueError(
                f'Output contains file(s) in not step directory ({member_dir})!')
        if not (base_name == 'run_info.txt' or base_name in allowed_files):
            raise ZCItoolsValueError(
                f'Not MrBayes output file(s)found in the output ({base_name})!')

    # Everything checked out: unzip data and refresh the step state.
    unzip_file(archive, step_obj.directory)
    step_obj._check_data()
    step_obj.save(create=False)
def from_file(filename, relative_dir=None):
    """Create a SequenceReads object from the YAML file `filename`.

    If a directory can be derived for the file, it is registered on the
    result through ``add_relative_path``. That directory is the directory
    part of `filename`, prefixed with `relative_dir` when one is given; if
    `filename` has no directory part, `relative_dir` alone is used.

    Args:
        filename: path of the YAML file to read.
        relative_dir: optional directory to prepend to the file's directory.

    Returns:
        The constructed SequenceReads object.
    """
    reads = SequenceReads(data=read_yaml(filename))

    base_dir = os.path.dirname(filename)
    if relative_dir:
        if base_dir:
            base_dir = os.path.join(relative_dir, base_dir)
        else:
            base_dir = relative_dir

    if base_dir:
        reads.add_relative_path(base_dir)
    return reads
def read_step(self, step_name, check_data_type=None, update_mode=False,
              no_check=False, outside_of_project=False):
    """Load the step object stored under `step_name`.

    The step's ``description.yml`` is read to find its data type, and the
    class registered for that type in ``self.steps_map`` is instantiated.

    Args:
        step_name: step directory as a string, or a list/tuple of path
            components.
        check_data_type: optional expected data type; either a single
            string or a collection of accepted type names.
        update_mode: forwarded to the step class constructor.
        no_check: forwarded to the step class constructor.
        outside_of_project: when true and `step_name` is a list/tuple, the
            components are also passed to the constructor as
            ``step_directory``.

    Returns:
        An instance of the step class registered for the data type.

    Raises:
        ZCItoolsValueError: if no description was read, if the data type
            does not satisfy `check_data_type`, or if no class is
            registered for the data type.
    """
    if isinstance(step_name, str):
        path_parts = (step_name,)
    else:
        assert isinstance(step_name, (list, tuple)), type(step_name)
        path_parts = step_name
    desc_data = read_yaml(os.path.join(*path_parts, 'description.yml'))
    if not desc_data:
        raise ZCItoolsValueError(f"'{step_name}' is not a step!")

    data_type = desc_data['data_type']
    if check_data_type:
        single_type = isinstance(check_data_type, str)
        if single_type and check_data_type != data_type:
            raise ZCItoolsValueError(
                f"Step {step_name} is not of data type '{check_data_type}'!")
        if not single_type and data_type not in check_data_type:
            raise ZCItoolsValueError(
                f"Step {step_name} is not of data types: {', '.join(check_data_type)}!")

    cls = self.steps_map.get(data_type)
    if not cls:
        raise ZCItoolsValueError(f"No step class for data type {data_type}!")

    ctor_kwargs = dict(update_mode=update_mode, no_check=no_check)
    if outside_of_project and isinstance(step_name, (list, tuple)):
        # The step lives outside the project tree, so its location has to
        # be handed over explicitly.
        ctor_kwargs['step_directory'] = step_name
    return cls(self, desc_data['project'], **ctor_kwargs)
def finish_ogdraw(step_obj, common_db):
    """Extract OGDraw images from downloaded result archives into the step.

    Looks for ``ogdraw-result-<num>-<hash>.zip`` files in the step, maps
    each image found in them back to the sequence identifier recorded in
    the step's ``finish.yml`` and extracts it as
    ``<seq_ident>.<image_format>``. Afterwards the step is re-checked and
    saved, and, if `common_db` is given, every extracted image is
    registered with ``common_db.set_record``.

    Args:
        step_obj: step object that holds the archives and receives images.
        common_db: optional store for the extracted images; skipped when
            falsy.

    Returns:
        None. If no result archive is found, a warning is printed and
        nothing else is done.
    """
    # Note: original files are left in directory
    # Check files ogdraw-result-<num>-<hash>.zip
    # The dot before 'zip' is escaped so only a literal '.zip' is accepted.
    zip_files = step_obj.step_files(matches=r'^ogdraw-result-[0-9]+-.*\.zip')
    if not zip_files:
        print(
            "Warning: can't find any OGDraw output file (ogdraw-result-*.zip)!"
        )
        return

    # Collect sequence idents submitted.
    # Fix: this used to read `step.step_file(...)`, but no name `step`
    # exists in this function; the step is passed in as `step_obj`.
    d = read_yaml(step_obj.step_file('finish.yml'))
    image_format = d['image_format']
    seq_ident_map = dict()  # (sequence file idx, line idx) -> seq_ident
    for seq_idx, sequences in d['sequences'].items():
        # Note: line idx starts from 1, since files in zip has that numbering
        seq_ident_map.update(((seq_idx, i + 1), seq_ident)
                             for i, seq_ident in enumerate(sequences))

    # extract ogdraw-result-<num>-<hash>/sequences_<num>ff_<num>/ogdraw_job_<hash>-outfile.<image_format>
    # Zip subdirectory naming depends on naming of OGDraw input files (sequences_<num>.gbff)
    f_end = f'-outfile.{image_format}'
    added_images = []
    for filename in zip_files:
        with ZipFile(step_obj.step_file(filename), 'r') as zip_f:
            for z_i in zip_f.infolist():
                if not z_i.filename.endswith(f_end):
                    continue
                # Find sequence id of that file: the first two numbers after
                # 'sequences_' are the input file index and the line index.
                rest = z_i.filename.split('sequences_')[1]
                nums = re.findall(r'\d+', rest)
                file_idx = int(nums[0])
                line_idx = int(nums[1])
                seq_ident = seq_ident_map[(file_idx, line_idx)]
                #
                added_images.append(seq_ident)
                extract_from_zip(
                    zip_f, z_i.filename,
                    step_obj.step_file(f'{seq_ident}.{image_format}'))

    step_obj._check_data()
    step_obj.save(create=False)

    # Set into the common db
    if common_db:
        for image_ident in added_images:
            common_db.set_record(
                image_ident,
                step_obj.step_file(f'{image_ident}.{image_format}'))
def _run_command(self, command, args, cmd_args=None): self._args = args # Store commands args command_obj = self.commands_map[command](self, args) command_type = command_obj.get_command_type() # General work if not command_type: if command_obj._PROJECT_COMMAND and not self._check_is_project_valid( ): return command_obj.run() # Create new step elif command_type in ('new_step', 'new_steps'): if not self._check_is_project_valid(): return # Run command command_args = dict( (k, v) for k, v in vars(args).items() if k not in ('command', 'step_num', 'step_description')) db_id = command_obj.common_db_identifier() step_data = dict( prev_steps=command_obj.prev_steps(), common_db_identifier=list(db_id) if db_id else None, command=command, command_args=command_args, cmd=' '.join(cmd_args or sys.argv[1:])) ret = None if command_type == 'new_step': step_data['step_name'] = self.new_step_name(command_obj, args) ret = command_obj.run(step_data) if ret: if not ret.is_completed(): print( f'Step is not finished, check instruction ({ret.directory}/INSTRUCTIONS.txt)!' ) else: print( "Warning: create step command didn't return step object!" ) else: ret = command_obj.run(step_data) if ret is not None: for s in ret: if not s.is_completed(): print( f'Step is not finished, check instruction ({s.directory}/INSTRUCTIONS.txt)!' ) else: print( "Warning: create steps command didn't return any step object!" ) if ret: # Store log data into project_log.yml step_data = dict((k, v) for k, v in step_data.items() if k in ('cmd', 'step_name')) # Do not store if step_data is equal as from last command? log = read_yaml('project_log.yml') if not log or log[-1] != step_data: write_yaml([step_data], 'project_log.yml', mode='a') # Appends yml list else: print(f"Warning: not supported command_type {command_type}?!")
def get_description(self):
    """Return the parsed content of this step's ``description.yml``."""
    description_path = self.step_file('description.yml')
    return read_yaml(description_path)
# Look up `attr` in the 'data' mapping of the description returned by
# self.get_description(); fall back to `default` when there is no description.
    d = self.get_description()
    if d:
        return d['data'].get(attr, default)
    return default

# Summary data
def save_summary_data(self, data):
    """Write `data` (must be a dict) to the step's ``summary.yml``."""
    assert isinstance(data, dict), data
    write_yaml(data, self.step_file('summary.yml'))

def make_summary_data(self):
    """Return summary data for the step; this implementation returns None.

    NOTE(review): presumably meant to be overridden by step classes that
    can produce a summary - confirm against subclasses.
    """
    return

def get_summary_data(self):
    """Return the step's summary data, creating ``summary.yml`` if needed.

    If ``summary.yml`` exists its content is returned. Otherwise, when the
    step is completed and ``make_summary_data()`` yields a truthy value,
    that value is saved with ``save_summary_data`` and returned. In every
    other case the method returns None.
    """
    if os.path.isfile(f := self.step_file('summary.yml')):
        return read_yaml(f)  # 'Cached' version
    if self.is_completed() and (d := self.make_summary_data()):
        self.save_summary_data(d)
        return d

#
def get_finish_data(self):
    """Return the content of the step's ``finish.yml``, or None if the file does not exist."""
    if os.path.isfile(f := self.step_file('finish.yml')):
        return read_yaml(f)

# Substep methods
def get_substep_step_data(self, step_name):
    """Return the step data dict for a substep; it holds only 'step_name'."""
    return dict(
        step_name=step_name
    )  # , prev_steps=None, command=None, command_args=None, cmd=None)