Esempio n. 1
0
def finish_raxml_data(step_obj):
    """Validate the step's ``output.zip`` as RAxML output and unpack it.

    Every archive member must live in a directory named in ``finish.yml``
    and be either ``run_info.txt`` or match ``_re_raxml_output``.  After a
    successful check the archive is extracted into the step directory, the
    step data is checked and the step is saved.

    Raises:
        ZCItoolsValueError: if the archive is missing or contains an
            unexpected member.
    """
    zip_path = step_obj.step_file('output.zip')
    if not os.path.isfile(zip_path):
        raise ZCItoolsValueError('No calculation output file output.zip!')

    # Directories of the files that were submitted for processing
    dirs = {
        os.path.dirname(entry['filename'])
        for entry in read_yaml(step_obj.step_file('finish.yml'))
    }

    for member in list_zip_files(zip_path):
        # ZipFile always uses '/' as separator; translate to the local one
        parts = member.split('/')
        _dir = os.sep.join(parts[:-1])  # '' for a top-level member
        if _dir not in dirs:
            raise ZCItoolsValueError(
                f'Output contains file(s) in not step directory ({_dir})!')

        is_raxml_output = bool(_re_raxml_output.search(parts[-1]))
        if not (is_raxml_output or parts[-1] == 'run_info.txt'):
            raise ZCItoolsValueError(
                f'Not RAxML output file(s)found in the output ({parts[-1]})!')

    # Archive content is acceptable: unpack and finalize the step
    unzip_file(zip_path, step_obj.directory)
    step_obj._check_data()
    step_obj.save(create=False)
Esempio n. 2
0
def finish_mr_bayes_data(step_obj):
    """Validate the step's ``output.zip`` as MrBayes output and unpack it.

    Every archive member must live in a directory named in ``finish.yml``
    and be either ``run_info.txt`` or ``_RESULT_PREFIX`` followed by one of
    the known result extensions.  After a successful check the archive is
    extracted into the step directory, the step data is checked and the
    step is saved.

    Raises:
        ZCItoolsValueError: if the archive is missing or contains an
            unexpected member.
    """
    zip_path = step_obj.step_file('output.zip')
    if not os.path.isfile(zip_path):
        raise ZCItoolsValueError('No calculation output file output.zip!')

    result_extensions = ('.ckp', '.con.tre', '.parts', '.run1.p', '.run1.t',
                         '.run2.p', '.run2.t', '.tstat', '.vstat')
    allowed_files = {_RESULT_PREFIX + ext for ext in result_extensions}

    # Directories of the files that were submitted for processing
    dirs = {
        os.path.dirname(entry['filename'])
        for entry in read_yaml(step_obj.step_file('finish.yml'))
    }

    for member in list_zip_files(zip_path):
        # ZipFile always uses '/' as separator; translate to the local one
        parts = member.split('/')
        _dir = os.sep.join(parts[:-1])  # '' for a top-level member
        if _dir not in dirs:
            raise ZCItoolsValueError(
                f'Output contains file(s) in not step directory ({_dir})!')

        if not (parts[-1] in allowed_files or parts[-1] == 'run_info.txt'):
            raise ZCItoolsValueError(
                f'Not MrBayes output file(s)found in the output ({parts[-1]})!'
            )

    # Archive content is acceptable: unpack and finalize the step
    unzip_file(zip_path, step_obj.directory)
    step_obj._check_data()
    step_obj.save(create=False)
Esempio n. 3
0
 def run(self, step_data):
     """Validate that ``self.args.step_directory`` points at a step.

     Only the validation part of this method is visible here: the path
     must be an existing directory and its ``description.yml`` must read
     as non-empty data.

     Raises:
         ZCItoolsValueError: if either validation fails.
     """
     a = self.args
     if not os.path.isdir(a.step_directory):
         raise ZCItoolsValueError(
             f"Step directory {a.step_directory} is not an directory!")
     # desc_data stays bound after the check (walrus), presumably for use
     # by the code that follows this excerpt.
     if not (desc_data := read_yaml(
             os.path.join(a.step_directory, 'description.yml'))):
         raise ZCItoolsValueError(
             f"Directory {a.step_directory} is not an step!")
Esempio n. 4
0
def find_referent_genome(seq_idents, referent_seq_ident):
    """Resolve a (possibly abbreviated) referent genome identifier.

    An exact member of ``seq_idents`` is returned as is.  Otherwise the
    identifier is treated as a prefix and must select exactly one member.

    Raises:
        ZCItoolsValueError: if the prefix matches no identifier or more
            than one.
    """
    if referent_seq_ident in seq_idents:
        return referent_seq_ident

    candidates = [
        ident for ident in seq_idents
        if ident.startswith(referent_seq_ident)
    ]
    if len(candidates) == 1:
        return candidates[0]
    if candidates:
        # Ambiguous prefix
        raise ZCItoolsValueError(
            f'More genomes which name starts with {referent_seq_ident}!')
    raise ZCItoolsValueError(
        f'No referent genome which name starts with {referent_seq_ident}!')
Esempio n. 5
0
def _check_columns(columns):
    if not isinstance(columns, (tuple, list)):
        raise ZCItoolsValueError(
            f"Column specification is not a list or tuple ({type(columns)})!")
    wrong_columns = [
        d for d in columns if not isinstance(d, (tuple, list)) or len(d) != 2
    ]
    if wrong_columns:
        raise ZCItoolsValueError(
            f"Error in columns specification: {wrong_columns}")
    #
    wrong_types = [(n, ct) for n, ct in columns if ct not in KNOWN_DATA_TYPES]
    if wrong_types:
        raise ZCItoolsValueError(
            f"Unsupported column type(s) for: {wrong_types}")
Esempio n. 6
0
def orient_chloroplast_parts_by_data(seq_rec,
                                     orientation,
                                     starts=None,
                                     partition=None):
    """Rebuild a chloroplast sequence with selected parts reoriented.

    The sequence is cut into the parts ``lsc``, ``ira``, ``ssc`` and
    ``irb`` using ``partition`` (built from ``starts`` when not given).
    Parts named in ``orientation`` are reverse complemented; for ``'ira'``
    both IR parts are reverse complemented and swapped.  The parts are
    then concatenated in the order lsc, ira, ssc, irb.

    Raises:
        ZCItoolsValueError: if neither ``partition`` nor ``starts`` is
            given.
    """
    if not partition:
        if not starts:
            raise ZCItoolsValueError(
                f'No partition data to orient chloroplast parts for sequence {seq_rec.name}!'
            )
        partition = create_chloroplast_partition_all(
            len(seq_rec.seq), starts)

    parts = partition.extract(seq_rec)  # mapping: part name -> extracted part

    # Single-copy regions are flipped in place
    for region in ('lsc', 'ssc'):
        if region in orientation:
            parts[region] = parts[region].reverse_complement()

    # Inverted repeats trade places when flipped
    if 'ira' in orientation:
        flipped_irb = parts['irb'].reverse_complement()
        flipped_ira = parts['ira'].reverse_complement()
        parts['ira'] = flipped_irb
        parts['irb'] = flipped_ira

    new_seq = parts['lsc'] + parts['ira'] + parts['ssc'] + parts['irb']
    assert len(seq_rec.seq) == len(new_seq.seq), \
        (seq_rec.name, len(seq_rec.seq), len(new_seq.seq), starts,
            [(n, len(p)) for n, p in parts.items()],
            [(n, len(p)) for n, p in partition.extract(seq_rec).items()])
    return new_seq
Esempio n. 7
0
def _fetch_complete_chloroplasts(organisms, args, max_taxid):
    """Collect Entrez summaries of complete chloroplast RefSeq genomes.

    Only the first part of this function is visible here: it searches the
    'nucleotide' database once per title keyword, optionally drops records
    updated after ``args.max_update_date`` and then handles records whose
    taxid is missing from the lineage lookup.

    Args:
        organisms: iterable of organism names, OR-ed together in the
            search term.
        args: object providing ``fetch_plastids``, ``max_update_date``
            (``'Y-M-D'`` string or falsy) and ``check_taxids``.
        max_taxid: not used in the visible part of the function.

    Raises:
        ZCItoolsValueError: if ``args.check_taxids`` is set and some
            record's taxid is not present in the lineage lookup.
    """
    from ...utils.entrez import Entrez

    # Organism filter shared by all searches
    ts = ' OR '.join(f'"{t}"[Organism]' for t in organisms)
    titles = ['chloroplast']
    if args.fetch_plastids:
        titles.append('plastid')
    #
    rows = []
    for title in titles:
        data = Entrez().search_summary(
            'nucleotide',
            term=
            f'({ts}) AND ("complete genome"[Title] AND {title}[Title]) AND refseq'
        )
        assert isinstance(data, list), type(data)
        if args.max_update_date:
            # NOTE(review): the cut-off date is loop-invariant but parsed
            # again for every title.
            max_d = date(*map(int, args.max_update_date.split('-')))
            data = [d for d in data if _to_date(d['UpdateDate']) <= max_d]
        rows.extend(data)

    ncbi_taxonomy = get_ncbi_taxonomy()
    # presumably maps each known taxid to its lineage -- verify against the
    # taxonomy backend
    parents = ncbi_taxonomy._nt().get_lineage_translator(
        [int(r['TaxId']) for r in rows])

    # Records whose taxid is missing from the lookup: fail or drop them
    if not_in := [r for r in rows if int(r['TaxId']) not in parents]:
        if args.check_taxids:
            raise ZCItoolsValueError(
                f"Fetch cpDNA genomes. Taxid not known!!! {not_in}")
        prev_len = len(rows)
        rows = [r for r in rows if r not in not_in]
        print(
            f'Removed sequences with problematic taxids: {prev_len - len(rows)}!'
        )
Esempio n. 8
0
    def actions(self):
        """Normalize and validate the workflow actions from ``_actions()``.

        Each raw action is a 2- or 3-item sequence: a name (``sn``), a
        command given as a string or a list/tuple, and an optional third
        item naming previous action(s) as a string or a list/tuple.  Every
        action is normalized to ``(sn, command_parts, previous)``.

        Only the normalization and command check are visible here.

        Raises:
            ZCItoolsValueError: if a command's first word is not a key of
                ``self.project.commands_map``.
        """
        # Returns list of WfAction objects
        actions = []
        for x in self._actions():
            assert 2 <= len(x) <= 3, x
            sn, cmd = x[:2]
            # Previous actions: none, a single name, or a given sequence
            a_prev = [] if len(x) == 2 else (
                [x[2]] if isinstance(x[2], str) else x[2])
            # String commands are split on whitespace (no shell-style
            # quoting; see the shlex alternative kept below)
            actions.append(
                (sn, (cmd.split() if isinstance(cmd, str) else cmd), a_prev))
            # actions.append((sn, (shlex.split(cmd) if isinstance(cmd, str) else cmd), a_prev))

        # Sanity checks on the normalized shape
        assert all(isinstance(cmd, (list, tuple))
                   for _, cmd, _ in actions), actions
        assert all(
            isinstance(prevs, (list, tuple))
            for _, _, prevs in actions), actions

        # Check commands
        commands_map = self.project.commands_map
        if (not_in :=
            [cmd[0] for _, cmd, _ in actions if cmd[0] not in commands_map]):
            raise ZCItoolsValueError(
                f"Worflow actions, not existing command(s)! {', '.join(sorted(not_in))}"
            )
Esempio n. 9
0
 def register_workflows(self, workflow_classes):
     """Register workflow classes under their ``_WORKFLOW`` identifiers.

     Each class in ``workflow_classes`` is stored in ``self.workflows_map``
     keyed by its ``_WORKFLOW`` attribute.

     Raises:
         ZCItoolsValueError: if an identifier is already registered; the
             message names the class registered earlier.
     """
     for w_cls in workflow_classes:
         wf = w_cls._WORKFLOW
         if wf in self.workflows_map:
             existing = self.workflows_map[wf]
             # The map stores classes, so ``existing.__class__.__name__``
             # would report the metaclass (normally 'type').  Use the
             # class's own name; fall back only for non-class entries.
             existing_name = getattr(existing, '__name__',
                                     type(existing).__name__)
             raise ZCItoolsValueError(
                 f"Workflow {wf} already registerd ({existing_name})!")
         self.workflows_map[wf] = w_cls
Esempio n. 10
0
 def register_commands(self, command_classes):
     """Register command classes under their ``_COMMAND`` identifiers.

     Each class in ``command_classes`` is stored in ``self.commands_map``
     keyed by its ``_COMMAND`` attribute.

     Raises:
         ZCItoolsValueError: if an identifier is already registered; the
             message names the class registered earlier.
     """
     for command_cls in command_classes:
         cmd = command_cls._COMMAND
         if cmd in self.commands_map:
             existing = self.commands_map[cmd]
             # The map stores classes, so ``existing.__class__.__name__``
             # would report the metaclass (normally 'type').  Use the
             # class's own name; fall back only for non-class entries.
             existing_name = getattr(existing, '__name__',
                                     type(existing).__name__)
             raise ZCItoolsValueError(
                 f"Command {cmd} already registerd ({existing_name})!")
         self.commands_map[cmd] = command_cls
Esempio n. 11
0
def _check_rows(columns, rows):
    n_cols = len(columns)
    diff_lengts = [(i, len(row)) for i, row in enumerate(rows)
                   if len(row) != n_cols]
    if diff_lengts:
        raise ZCItoolsValueError(
            f"Rows have different length than specified columns {n_cols}: {diff_lengts}"
        )
Esempio n. 12
0
 def run(self, step_data):
     """Validate the requested alignment names (visible part only).

     The names in ``self.args.alignments`` are lower-cased and compared
     with the first items of the pairs in ``self._ALIGNMENTS``.

     Raises:
         ZCItoolsValueError: if any requested name is not supported.
     """
     from .common_methods import create_alignment_data
     align_params = [a.lower() for a in self.args.alignments]
     # Supported names are the first items of the _ALIGNMENTS pairs
     sup = set(a for a, _ in self._ALIGNMENTS)
     if not_sup_params := [a for a in align_params if a not in sup]:
         raise ZCItoolsValueError(
             f'No valid alignments set ({", ".join(sorted(not_sup_params))}).'
         )
Esempio n. 13
0
 def register_steps(self, step_classes):
     """Register step classes under their ``_STEP_TYPE`` identifiers.

     Each class in ``step_classes`` is stored in ``self.steps_map`` keyed
     by its ``_STEP_TYPE`` attribute.

     Raises:
         ZCItoolsValueError: if an identifier is already registered; the
             message names the class registered earlier.
     """
     for step_cls in step_classes:
         s_type = step_cls._STEP_TYPE
         if s_type in self.steps_map:
             existing = self.steps_map[s_type]
             # The map stores classes, so ``existing.__class__.__name__``
             # would report the metaclass (normally 'type').  Use the
             # class's own name; fall back only for non-class entries.
             existing_name = getattr(existing, '__name__',
                                     type(existing).__name__)
             raise ZCItoolsValueError(
                 f"Step {s_type} already registerd ({existing_name})!")
         self.steps_map[s_type] = step_cls
Esempio n. 14
0
 def choose_first_column(self, *columns, error=False):
     """Return the first of ``columns`` for which ``has_column`` is true.

     When none of them qualifies, ``None`` is returned unless ``error`` is
     set, in which case ``ZCItoolsValueError`` is raised.
     """
     not_found = object()  # sentinel, so any column value can be returned
     chosen = next(filter(self.has_column, columns), not_found)
     if chosen is not not_found:
         return chosen
     if error:
         raise ZCItoolsValueError(
             f'No column found from: {columns}.\nExisting columns: {[c for c, _ in self._columns]}'
         )
     return None
Esempio n. 15
0
def create_irs_data(step_data, annotation_step, params):
    """Prepare and, when needed, run the IR blast calculation for a step.

    A new ``ChloroplastSSCBlast`` step is created.  The first IR found in
    the referent genome is stored as the query in ``run_dir/query.fa``,
    and a fasta file is written for every sequence that has no
    ``run_dir/<seq_ident>.xml`` yet.  If any sequence needs calculation,
    ``finish.yml`` is written, the step is saved as not completed and the
    blast script is run and finished.

    Args:
        step_data: data passed to the project's ``new_step``.
        annotation_step: step providing the sequences and the project.
        params: object providing ``referent_genome`` and
            ``force_blast_parse``.

    Returns:
        The created step.

    Raises:
        ZCItoolsValueError: if the referent genome can't be resolved or
            has no IRs.
    """
    SeqIO = import_bio_seq_io()

    seq_idents = annotation_step.all_sequences()  # set
    ref_ident = find_referent_genome(seq_idents, params.referent_genome)

    step = annotation_step.project.new_step(ChloroplastSSCBlast, step_data)
    ref_seq_rec = annotation_step.get_sequence_record(ref_ident)
    # Per-sequence entries stored with the step; starts from what the step
    # already has (empty dict by default)
    ssc_location = step.get_type_description_elem('ssc_location',
                                                  default=dict())
    ensure_directory(step.step_file('run_dir'))

    # Store query data
    query_file = step.step_file('run_dir', 'query.fa')
    if not os.path.isfile(query_file):
        irs = find_chloroplast_irs(ref_seq_rec)
        if not irs:
            raise ZCItoolsValueError(
                f"Referent genome ({ref_ident}) doesn't have IRS!")
        # Only the first IR is used as the query, named 'ira'
        write_fasta(query_file,
                    [('ira', str(irs[0].extract(ref_seq_rec).seq))])

    files_to_zip = [query_file]
    calc_seq_idents = []

    # All sequences, to create database from
    for seq_ident in sorted(seq_idents):
        # A sequence with an existing .xml result is skipped
        if not os.path.isfile(step.step_file('run_dir', f'{seq_ident}.xml')):
            fa_file = step.step_file('run_dir', f'{seq_ident}.fa')
            files_to_zip.append(fa_file)
            calc_seq_idents.append(seq_ident)
            if not os.path.isfile(fa_file):
                seq_rec = annotation_step.get_sequence_record(seq_ident)
                SeqIO.write([seq_rec], fa_file, 'fasta')
                # Store SSC position
                # Entry is [sequence length, end of first IR, irb_start of
                # second IR], or [length, -1, -1] when no IRs are found.
                # NOTE(review): only set when the fasta file is written here.
                irs = find_chloroplast_irs(seq_rec)
                ssc_location[seq_ident] = [len(seq_rec), int(irs[0].location.end), irb_start(irs[1])] \
                    if irs else [len(seq_rec), -1, -1]

    if calc_seq_idents:
        # Store finish.yml
        finish_f = step.step_file('finish.yml')
        write_yaml(dict(calc_seq_idents=calc_seq_idents), finish_f)

        # NOTE(review): run is hard-coded, so the else branch below
        # (preparing run instructions) is currently unreachable.
        run = True  # ToDo: ...
        step.save(dict(ssc_location=ssc_location), completed=False)
        if run:
            run_module_script(run_irs_blast, step)
            finish_irs_data(step)
        else:
            files_to_zip.append(finish_f)
            set_run_instructions(run_irs_blast, step, files_to_zip,
                                 _instructions)
    #
    # Nothing to calculate: finish again only on explicit request
    elif params.force_blast_parse:
        finish_irs_data(step)

    return step
Esempio n. 16
0
 def run(self, step_data):
     """Create mVISTA data for the input step.

     The email address comes from ``self.args.email`` or, when that is
     empty, from the ``'email'`` entry of the settings.

     Raises:
         ZCItoolsValueError: if ``self.args.run`` is set and no email
             address is available.
     """
     from .mvista import create_mvista_data
     from common_utils.file_utils import get_settings

     email = self.args.email
     if not email:
         # Fall back to the configured address
         email = get_settings()['email']

     if self.args.run and not email:
         raise ZCItoolsValueError(
             'Email address is needed to post mVISTA data!')

     input_step = self._input_step()
     return create_mvista_data(step_data, input_step, self.args.run, email)
Esempio n. 17
0
 def run(self, step_data):
     """Create OGDRAW data for the input step in the requested format.

     The format is ``self.args.image_format`` lower-cased and must be a
     member of ``self._IMAGE_FORMATS``.

     Raises:
         ZCItoolsValueError: if the format is not supported.
     """
     from .ogdraw import create_ogdraw

     img_f = self.args.image_format.lower()
     if img_f in self._IMAGE_FORMATS:
         input_step = self._input_step()
         common_db = self.get_common_db_object()
         return create_ogdraw(step_data,
                              img_f,
                              input_step,
                              common_db,
                              sequences=self.args.sequences)
     raise ZCItoolsValueError(f'Given format {img_f} is not supported!')
Esempio n. 18
0
def create_table_step(project, step_data, params):
    """Create (or overwrite) a table step from a file or inline raw data.

    The input is read according to ``params.data_format``, which is derived
    from the filename with ``filetype_from_ext`` when it is ``None``:

    * ``'text'``: one value per non-empty stripped line, single column,
      rows sorted.
    * ``'csv'``: rows read with ``params.delimiter`` and ``"`` as quote
      character, rows sorted.  With ``params.has_header`` the first row is
      skipped and, if no columns were given, supplies the column names.
    * ``'raw_data'``: ``params.filename`` itself holds the values,
      separated by ``;`` (single column, given order kept).

    ``params.columns`` has the form ``name[,type]:name[,type]...``; a
    missing type is derived with ``column_name_2_type``.

    Returns:
        The saved ``TableStep``.

    Raises:
        ZCItoolsValueError: if the input file doesn't exist, the data
            format is unknown or unsupported, or no columns are specified.
    """
    data_format = params.data_format

    # For raw data params.filename carries the values, not a path, so the
    # existence check applies to the file-based formats only.
    if data_format != 'raw_data' and not os.path.isfile(params.filename):
        raise ZCItoolsValueError(f"Table file {params.filename} doesn't exist.")

    # Find how to read data
    if data_format is None:
        data_format = filetype_from_ext(params.filename)
    if not data_format:
        raise ZCItoolsValueError(f"Data format for input table is not specified or found! Filename {params.filename}.")

    columns = [x.split(',') for x in params.columns.split(':')] if params.columns else None
    if columns:
        columns = [(c[0], column_name_2_type(c[0])) if len(c) == 1 else c for c in columns]

    # Read data.
    data = None
    if data_format == 'text':
        # ToDo: separator for more columns. For now only list supported
        with open(params.filename, 'r') as r:
            stripped = (_l.strip() for _l in r)
            data = sorted([line] for line in stripped if line)
    elif data_format == 'csv':
        # newline='' lets the csv module handle line endings itself
        with open(params.filename, 'r', newline='') as incsv:
            reader = csv.reader(incsv, delimiter=params.delimiter, quotechar='"')
            if params.has_header:
                # None instead of StopIteration for an empty file
                header = next(reader, None)
                if header is not None and not columns:
                    columns = [(c, column_name_2_type(c)) for c in header]  # Default
            data = sorted(reader)
    elif data_format == 'raw_data':
        # Was ``filename.split(';')``: an undefined name (NameError)
        data = [[line] for line in params.filename.split(';') if line]
    else:
        raise ZCItoolsValueError(f'Data format {data_format} is not supported!')

    if not columns:
        raise ZCItoolsValueError(f"Columns are not specified for input table! Filename {params.filename}.")

    # Store (or overwrite) step data
    step = TableStep(project, step_data, remove_data=True)
    step.set_table_data(data, columns)
    step.save()
    return step
Esempio n. 19
0
    def run_command_with_args(self, *args):
        """Run a registered command with the given argument words.

        The first argument, lower-cased, selects the command.  All
        arguments (including the first) are parsed with the command's
        parser and passed on to ``_run_command``.

        Raises:
            ZCItoolsValueError: if the command is not in ``commands_map``.
        """
        command = args[0].lower()
        is_known = command in self.commands_map
        if not is_known:
            raise ZCItoolsValueError(f'Command "{command}" is not supported!')

        parser = self._get_parser(command, False)
        print(f"Run: {' '.join(args)}")
        parsed_args = parser.parse_args(args)
        return self._run_command(command, parsed_args, cmd_args=args)
Esempio n. 20
0
    def read_step(self,
                  step_name,
                  check_data_type=None,
                  update_mode=False,
                  no_check=False,
                  outside_of_project=False):
        """Load an existing step from its ``description.yml``.

        ``step_name`` is a directory path given as a string or as a
        list/tuple of path parts.  ``check_data_type`` optionally restricts
        the accepted data type: a string must equal the step's data type,
        any other container must contain it.

        Returns:
            An instance of the step class registered in ``steps_map`` for
            the step's data type.

        Raises:
            ZCItoolsValueError: if the description can't be read, the data
                type is not accepted, or no step class is registered.
        """
        if isinstance(step_name, str):
            path_parts = (step_name,)
        else:
            assert isinstance(step_name, (list, tuple)), type(step_name)
            path_parts = step_name
        desc_data = read_yaml(os.path.join(*path_parts, 'description.yml'))
        if not desc_data:
            raise ZCItoolsValueError(f"'{step_name}' is not a step!")

        data_type = desc_data['data_type']

        if check_data_type:
            if isinstance(check_data_type, str):
                # Single accepted type
                if check_data_type != data_type:
                    raise ZCItoolsValueError(
                        f"Step {step_name} is not of data type '{check_data_type}'!"
                    )
            elif data_type not in check_data_type:
                # Collection of accepted types
                raise ZCItoolsValueError(
                    f"Step {step_name} is not of data types: {', '.join(check_data_type)}!"
                )

        cls = self.steps_map.get(data_type)
        if not cls:
            raise ZCItoolsValueError(
                f"No step class for data type {data_type}!")

        # The step directory is passed explicitly only for steps outside of
        # the project that are addressed by path parts
        extra = {}
        if outside_of_project and isinstance(step_name, (list, tuple)):
            extra['step_directory'] = step_name
        return cls(self,
                   desc_data['project'],
                   update_mode=update_mode,
                   no_check=no_check,
                   **extra)
Esempio n. 21
0
 def _input_step(self, no_data_check=False):
     assert self._INPUT_STEP_DATA_TYPE
     a = self.args
     step = self.project.read_step(
         a.step,
         check_data_type=self._INPUT_STEP_DATA_TYPE,
         no_check=(no_data_check or a.no_data_check))
     if not step.is_completed():
         raise ZCItoolsValueError(f"Input step {a.step} is not completed!")
     return step
Esempio n. 22
0
 def format_parameters(params):
     """Resolve the ``'methods'`` parameter (visible part only).

     ``params['methods']`` is a comma-separated string, compared
     lower-cased: ``'all'`` selects ``METHOD_NAMES`` and ``'research'``
     selects ``METHOD_NAMES_RESEARCH``.  Otherwise every listed method
     must be a member of ``METHOD_NAMES``.

     Raises:
         ZCItoolsValueError: if an unknown method is listed.
     """
     # Methods
     from ..chloroplast.irs.analyse_irs import METHOD_NAMES, METHOD_NAMES_RESEARCH
     ms = params['methods'].lower().split(',')
     if 'all' in ms:
         methods = METHOD_NAMES
     elif 'research' in ms:
         methods = METHOD_NAMES_RESEARCH
     # NOTE(review): ``methods`` is not assigned on this path within the
     # visible code; presumably handled by the part not shown here.
     elif not_known_methods := set(m for m in ms if m not in METHOD_NAMES):
         raise ZCItoolsValueError(
             f'Not know method(s): {", ".join(not_known_methods)}!')
Esempio n. 23
0
 def run(self):
     """Change sequence data with the ``revert`` or ``translate`` method.

     The method name is taken from ``self.args.method`` case-insensitively;
     file names, formats and position come from ``self.args`` as well.

     Raises:
         ZCItoolsValueError: if the method is not one of the two known.
     """
     from ..utils.helpers import change_sequence_data

     a = self.args
     method = a.method.lower()
     if method != 'revert' and method != 'translate':
         raise ZCItoolsValueError(f"Not known method {a.method}!")

     options = dict(input_format=a.input_format,
                    output_format=a.output_format,
                    position=a.position)
     change_sequence_data(method, a.input_filename, a.output_filename,
                          **options)
Esempio n. 24
0
 def step_base_name(self):
     """Build the step's base name from the fix method and keep offset.

     The method is selected by the first letter of ``self.args.method``
     (case-insensitive).  The keep offset is appended only when it differs
     from ``DEFAULT_KEEP_OFFSET``.

     Raises:
         ZCItoolsValueError: if the method letter is not known.
     """
     # First letter of the method -> name stem.
     # The trnH-GUG variant ('h' -> 'FixByTrnH-GUG') is currently disabled.
     stems = {'p': 'FixByParts', 'f': 'FixByTrnF-GAA'}
     stem = stems.get(self.args.method[0].lower())
     if stem is None:
         raise ZCItoolsValueError(f'Not known method {self.args.method}!')
     n = self._format_step_name(stem)

     # Add offset or not?
     o = self.args.keep_offset
     if o != DEFAULT_KEEP_OFFSET:
         return f'{n}_{o}'
     return n
Esempio n. 25
0
 def _have_same_nodes(self, t2, tree_idx):
     assert self.node_names[tree_idx], tree_idx
     assert t2.node_names[tree_idx], tree_idx
     not_in_2 = self.node_names[tree_idx] - t2.node_names[tree_idx]
     not_in_1 = t2.node_names[tree_idx] - self.node_names[tree_idx]
     if not_in_2 or not_in_1:
         if not_in_2:
             print("Nodes missing in the second tree:",
                   ', '.join(sorted(not_in_2)))
         if not_in_1:
             print("Nodes missing in the first tree:",
                   ', '.join(sorted(not_in_1)))
         raise ZCItoolsValueError("Trees don't have same set of nodes.")
Esempio n. 26
0
 def run(self, step_data):
     """Run the fix method selected by the first letter of the method.

     ``'p'`` selects ``fix_by_parts`` and ``'f'`` selects
     ``fix_by_trnF_GAA`` (case-insensitive).  The input step is read
     without data check.

     Raises:
         ZCItoolsValueError: if the method letter is not known.
     """
     m = self.args.method[0].lower()
     # The trnH-GUG variant ('h' -> fix_by_trnH_GUG) is currently disabled.
     if m not in ('p', 'f'):
         raise ZCItoolsValueError(f'Not known method {self.args.method}!')
     if m == 'p':
         from .fix_by_analysis import fix_by_parts as fix_method
     else:
         from .fix_by_analysis import fix_by_trnF_GAA as fix_method
     #
     input_step = self._input_step(no_data_check=True)
     return fix_method(step_data, input_step, self.args.subset,
                       self.args.keep_offset, self.get_common_db_object())
Esempio n. 27
0
def fetch_common_db_data(step_data, table_step, step_type, common_db):
    """Create a step holding CommonDB records for the table's sequences.

    A new step of ``step_type`` is created (existing data removed).  For
    every value of the table's ``seq_ident`` column the record is fetched
    from ``common_db`` into the step directory and its base file name is
    added to the step.  The step is saved and returned.

    Raises:
        ZCItoolsValueError: if ``common_db`` has no record for a sequence
            identifier.
    """
    project = table_step.project
    step = project.new_step_by_type(step_type, step_data, remove_data=True)

    seq_idents = table_step.get_column_values_by_type('seq_ident')
    for seq_ident in seq_idents:
        record_file = common_db.get_record(seq_ident,
                                           step.directory,
                                           info=True)
        if not record_file:
            raise ZCItoolsValueError(
                f"There is not CommonDB record for seq ident {seq_ident}!")
        file_name = os.path.basename(record_file)
        step.add_sequence_file(file_name)

    step.save()
    return step
Esempio n. 28
0
    def _check_data(self):
        """Check the step's alignment file and sequence identifiers.

        At least one of ``alignment.phy`` and ``alignment.fa`` must exist,
        and the identifiers in ``sequences.fa`` are compared with
        ``self._sequences`` through ``sets_equal``.

        Raises:
            ZCItoolsValueError: if no alignment file exists.
        """
        # Check does alignment file exist
        alignment_names = ('alignment.phy', 'alignment.fa')
        has_alignment = any(
            os.path.isfile(self.step_file(name)) for name in alignment_names)
        if not has_alignment:
            raise ZCItoolsValueError(
                f'No alignment file for step {self.directory}!')

        # Check are all sequences in sequence file
        sequences_file = self.step_file('sequences.fa')
        exist_seq_idents = set(read_fasta_identifiers(sequences_file))
        sets_equal(self._sequences,
                   exist_seq_idents,
                   'sequence',
                   step=self.directory)
Esempio n. 29
0
 def new_step_by_type(self,
                      data_type,
                      step_data,
                      remove_data=False,
                      update_mode=False,
                      no_check=False):
     """Create a step object of the class registered for ``data_type``.

     Data checking is skipped when either ``no_check`` or
     ``self._args.no_data_check`` is set.

     Raises:
         ZCItoolsValueError: if no step class is registered for the type.
     """
     cls = self.steps_map.get(data_type)
     if cls:
         skip_check = no_check or self._args.no_data_check
         return cls(self,
                    step_data,
                    remove_data=remove_data,
                    update_mode=update_mode,
                    no_check=skip_check)
     raise ZCItoolsValueError(
         f"No step class for data type {data_type}!")
Esempio n. 30
0
 def read_complete_step(self,
                        step_name,
                        check_data_type=None,
                        update_mode=False,
                        no_check=False,
                        outside_of_project=False):
     """Read a step with ``read_step`` and require it to be completed.

     All arguments are passed on to ``read_step`` unchanged.

     Raises:
         ZCItoolsValueError: if the step that was read is not completed.
     """
     read_options = dict(check_data_type=check_data_type,
                         update_mode=update_mode,
                         no_check=no_check,
                         outside_of_project=outside_of_project)
     step = self.read_step(step_name, **read_options)
     if step.is_completed():
         return step
     raise ZCItoolsValueError(
         f"Input step {step_name} is not completed!")