Ejemplo n.º 1
0
def create_irs_data(step_data, annotation_step, params):
    """Create a ChloroplastSSCBlast step and prepare its BLAST input files.

    The first IR found on the referent genome is stored as record 'ira' in
    run_dir/query.fa. Every sequence that has no run_dir/<seq_ident>.xml yet
    is scheduled for calculation; when its FASTA file is written, its
    ssc_location entry is set to [sequence length, end of first IR,
    irb_start() of second IR], or [sequence length, -1, -1] if no IRs are found.

    Raises ZCItoolsValueError when the query file has to be created and the
    referent genome has no IRs.

    Returns the step object.
    """
    SeqIO = import_bio_seq_io()

    all_idents = annotation_step.all_sequences()  # set
    referent = find_referent_genome(all_idents, params.referent_genome)

    step = annotation_step.project.new_step(ChloroplastSSCBlast, step_data)
    referent_rec = annotation_step.get_sequence_record(referent)
    ssc_location = step.get_type_description_elem('ssc_location', default={})
    ensure_directory(step.step_file('run_dir'))

    # Query: first IR of the referent genome, written only once
    query_fa = step.step_file('run_dir', 'query.fa')
    if not os.path.isfile(query_fa):
        referent_irs = find_chloroplast_irs(referent_rec)
        if not referent_irs:
            raise ZCItoolsValueError(
                f"Referent genome ({referent}) doesn't have IRS!")
        ira_seq = str(referent_irs[0].extract(referent_rec).seq)
        write_fasta(query_fa, [('ira', ira_seq)])

    to_zip = [query_fa]
    to_calc = []

    # Subject sequences: everything without a result (.xml) file
    for ident in sorted(all_idents):
        if os.path.isfile(step.step_file('run_dir', f'{ident}.xml')):
            continue

        subject_fa = step.step_file('run_dir', f'{ident}.fa')
        to_zip.append(subject_fa)
        to_calc.append(ident)
        if os.path.isfile(subject_fa):
            continue

        record = annotation_step.get_sequence_record(ident)
        SeqIO.write([record], subject_fa, 'fasta')
        # SSC position is stored together with the fasta file
        found_irs = find_chloroplast_irs(record)
        if found_irs:
            ssc_location[ident] = [
                len(record), int(found_irs[0].location.end), irb_start(found_irs[1])]
        else:
            ssc_location[ident] = [len(record), -1, -1]

    if to_calc:
        # finish.yml lists the sequences that still have to be calculated
        finish_f = step.step_file('finish.yml')
        write_yaml(dict(calc_seq_idents=to_calc), finish_f)

        run = True  # ToDo: ...
        step.save(dict(ssc_location=ssc_location), completed=False)
        if run:
            run_module_script(run_irs_blast, step)
            finish_irs_data(step)
        else:
            to_zip.append(finish_f)
            set_run_instructions(run_irs_blast, step, to_zip, _instructions)
    elif params.force_blast_parse:
        # Nothing to calculate, but the caller asked to re-parse the results
        finish_irs_data(step)

    return step
Ejemplo n.º 2
0
def create_new_hybrids_data(project, step_data, params):
    """Create a NewHybridsStep with copied input files and one directory per run.

    The data file and the genotype category file named in ``params`` are
    copied into the step. ``params.num_runs`` distinct seed pairs are drawn,
    each pair with both values in 1.._MAX_SMALL_NUMBER, and a run directory is
    created for each pair. The calculation settings are stored in finish.yml.

    Depending on ``params.run`` the calculation is started directly or run
    instructions are set for the collected files.

    Raises ZCItoolsValueError if one of the two input files does not exist.
    random.sample() raises ValueError if more runs are requested than there
    are distinct seed pairs.

    Returns the step object.
    """
    # Check input files
    if not os.path.isfile(params.data_file):
        raise ZCItoolsValueError(
            f"Input data file {params.data_file} doesn't exist!")
    if not os.path.isfile(params.gtyp_cat_file):
        raise ZCItoolsValueError(
            f"Input genotype category probabilities {params.gtyp_cat_file} doesn't exist!"
        )
    data_file = os.path.basename(params.data_file)
    gtyp_cat_file = os.path.basename(params.gtyp_cat_file)

    step = NewHybridsStep(project, step_data, remove_data=True)
    step.set_data(data_file, gtyp_cat_file, params.theta_prior,
                  params.pi_prior, params.burn_in, params.num_sweeps)

    # Copy input files
    files_to_zip = [step.step_file(data_file), step.step_file(gtyp_cat_file)]
    copy_file(params.data_file, files_to_zip[0])
    copy_file(params.gtyp_cat_file, files_to_zip[1])

    # Create run directories
    # Distinct seed pairs are drawn by sampling flat indices from a range and
    # decoding them, instead of materialising all _MAX_SMALL_NUMBER ** 2 pairs.
    # Index i decodes to the i-th element of
    # itertools.product(range(1, M + 1), repeat=2), so the pairs are the same.
    num_pairs = _MAX_SMALL_NUMBER * _MAX_SMALL_NUMBER
    seeds = [(idx // _MAX_SMALL_NUMBER + 1, idx % _MAX_SMALL_NUMBER + 1)
             for idx in random.sample(range(num_pairs), params.num_runs)]

    for seed in seeds:
        files_to_zip.append(step.step_file(step.seed_dir(seed)))
        ensure_directory(files_to_zip[-1])

    files_to_zip.append(step.step_file('finish.yml'))
    write_yaml(
        dict(data_file=data_file,
             gtyp_cat_file=gtyp_cat_file,
             theta_prior=params.theta_prior,
             pi_prior=params.pi_prior,
             burn_in=params.burn_in,
             num_sweeps=params.num_sweeps), files_to_zip[-1])

    # Stores description.yml
    step.save(completed=params.run)

    # Run or set instructions
    if params.run:
        run_module_script(run_new_hybrids, step)
    else:
        set_run_instructions(run_new_hybrids, step, files_to_zip,
                             _instructions)
    #
    return step
Ejemplo n.º 3
0
def create_irs_data(step_data, input_step, params, common_db):  # , run):
    """Create an AnnotationsStep and prepare per-sequence FASTA input in 'run_dir'.

    A sequence without run_dir/<seq_ident>.out gets its FASTA file written and
    is scheduled for calculation. A sequence that has the .out file but no
    <seq_ident>.gb in the step directory is only scheduled for finishing.

    If FASTA files were written, the calculation is run and finished. Otherwise
    the pending sequences are finished, or everything is re-parsed when
    params.force_mummer_parse is set.

    Returns the step object.
    """
    SeqIO = import_bio_seq_io()
    fasta_files = []
    pending_idents = []

    step = input_step.project.new_step(AnnotationsStep, step_data)
    step.set_sequences(input_step.all_sequences())
    ensure_directory(step.step_file('run_dir'))

    for ident in input_step.all_sequences():
        if os.path.isfile(step.step_file('run_dir', f'{ident}.out')):
            # Calculation output exists; only the final .gb may be missing
            if not os.path.isfile(step.step_file(f'{ident}.gb')):
                pending_idents.append(ident)
            continue

        # No calculation output: store fasta file for the calculation
        record = input_step.get_sequence_record(ident)
        fasta_files.append(step.step_file('run_dir', f'{ident}.fa'))
        SeqIO.write([record], fasta_files[-1], 'fasta')
        pending_idents.append(ident)

    if fasta_files:
        # finish.yml lists the fasta files to process
        finish_f = step.step_file('finish.yml')
        write_yaml(dict(fa_files=fasta_files), finish_f)

        run = True  # ToDo: ...
        step.save(completed=False)
        if run:
            run_module_script(run_irs_mummer, step)
            finish_irs_data(step, common_db, calc_seq_idents=pending_idents)
        else:
            fasta_files.append(finish_f)
            set_run_instructions(run_irs_mummer, step, fasta_files,
                                 _instructions)
    elif pending_idents:
        finish_irs_data(step, common_db, calc_seq_idents=pending_idents)
    elif params.force_mummer_parse:
        finish_irs_data(step, common_db)

    return step
Ejemplo n.º 4
0
def create_raxml_data(step_data, alignment_step, partitions_obj, run_threads):
    """Create RAxML step(s) from an alignment step or a collection of them.

    For a collection, a RAxMLSteps object with one substep per alignment is
    created; otherwise a single RAxMLStep. The file descriptions collected by
    _copy_alignment_file() are written to finish.yml with 'filename' made
    relative to the step directory, since the run script is called from there.

    With a truthy run_threads the calculation is run directly, otherwise run
    instructions are set for the collected files.

    Returns the step object.
    """
    # List of dicts with attrs: filename, short, partitions (filename or None)
    files_to_proc = []

    def _fill_from(target, source):
        # Copy sequence info and alignment file from source step into target step
        target.set_sequences(source.all_sequences())
        target.seq_sequence_type(source.get_sequence_type())
        _copy_alignment_file(source, target, files_to_proc, partitions_obj)

    if alignment_step._IS_COLLECTION:
        step = RAxMLSteps(alignment_step.project, step_data, remove_data=True)
        for align_step in alignment_step.step_objects():
            substep = step.create_substep(align_step.get_local_name())
            _fill_from(substep, align_step)
            substep.save(completed=False)
    else:
        step = RAxMLStep(alignment_step.project, step_data, remove_data=True)
        _fill_from(step, alignment_step)

    # Files to zip: alignment files first, then existing partition files
    files_to_zip = [desc['filename'] for desc in files_to_proc]
    files_to_zip += [desc['partitions'] for desc in files_to_proc if desc['partitions']]

    # Run script works from the step directory, so store relative filenames
    for desc in files_to_proc:
        desc['filename'] = step.strip_step_dir(desc['filename'])
    finish_f = step.step_file('finish.yml')
    write_yaml(files_to_proc, finish_f)

    # Stores description.yml
    step.save(completed=bool(run_threads))

    if not run_threads:
        files_to_zip.append(finish_f)
        set_run_instructions(run_raxml, step, files_to_zip, _instructions)
    else:
        run_module_script(run_raxml, step, threads=run_threads)

    return step
Ejemplo n.º 5
0
def create_permutations(project,
                        step_data,
                        raw_file,
                        permutations,
                        num_traits=None,
                        run=False):
    """Create a QTLCartStep with one permutation-run directory per trait.

    The three QTL Cartographer files (tmp.00m, tmp.00c, tmp.00r) located next
    to ``raw_file`` are copied into the step. For each trait index in
    1..num_traits a trait directory with its own qtlcart.rc is created, and the
    trait directories are listed in finish.yml together with ``permutations``.

    Depending on ``run`` the calculation is started directly or run
    instructions are set for the collected files.

    Raises ZCItoolsValueError if the raw file, its matching .map file, or one
    of the QTL Cartographer files is missing, or if ``num_traits`` is not a
    positive number.

    Returns the step object.
    """
    # Check input files
    # Only the extension is swapped; str.replace() would also rewrite a '.raw'
    # occurring elsewhere in the path (e.g. in a directory name).
    map_file = os.path.splitext(raw_file)[0] + '.map'
    data_dir = os.path.dirname(raw_file)
    tmp_files = ('tmp.00m', 'tmp.00c', 'tmp.00r')
    for mf in (raw_file, map_file):
        if not os.path.isfile(mf):
            raise ZCItoolsValueError(
                f"Input MapMaker file {mf} doesn't exist!")
    for qf in tmp_files:
        f = os.path.join(data_dir, qf)
        if not os.path.isfile(f):
            raise ZCItoolsValueError(
                f"Input Windows QTL Cartographer file {qf} doesn't exist!")

    # Validate before the step is created: creating it removes existing step
    # data, and an assert would be stripped when Python runs with -O.
    # ToDo: find max traits and fix it/set default
    if not num_traits or num_traits <= 0:
        raise ZCItoolsValueError(
            f"Number of traits has to be a positive number, got {num_traits!r}!")

    #
    step = QTLCartStep(project, step_data, remove_data=True)
    step.set_data(num_traits, permutations)

    # Copy input files
    files_to_zip = []
    for qf in tmp_files:
        files_to_zip.append(step.step_file(qf))
        copy_file(os.path.join(data_dir, qf), files_to_zip[-1])

    # Create trait directories, each with its own qtlcart.rc
    trait_dirs = []
    for t_idx in range(1, num_traits + 1):
        trait_dirs.append(step.trait_dir(t_idx))
        t_dir = step.step_file(trait_dirs[-1])
        ensure_directory(t_dir)
        files_to_zip.append(os.path.join(t_dir, 'qtlcart.rc'))
        write_str_in_file(
            files_to_zip[-1],
            _qtlcart_rc.format(trait=t_idx, num_traits=num_traits))

    files_to_zip.append(step.step_file('finish.yml'))
    write_yaml(dict(permutations=permutations, trait_dirs=trait_dirs),
               files_to_zip[-1])

    # Stores description.yml
    step.save(completed=run)

    # Run or set instructions
    if run:
        run_module_script(run_qtl_cart_perm, step)
    else:
        set_run_instructions(run_qtl_cart_perm, step, files_to_zip,
                             _instructions)
    #
    return step
Ejemplo n.º 6
0
def create_irs_data(step_data, input_step, params):
    """Create an AnnotationsStep, locate repeats with mummer and prepare mafft input.

    Steps subdirectory 'run_dir' contains input and output calculation files.

    1. For every sequence without a stored entry in 'mummer_results', the
       FASTA file is written if missing, repeat-match is run if its input or
       output file was missing, and the output file is parsed into
       mummer_results[seq_ident] = [length, start_1, start_2].
    2. For every sequence whose repeat is shorter than 23000 and whose left and
       right alignment results are not both present, <seq_ident>_right.fa and
       <seq_ident>_left.fa are written with the flanking sub-sequences.
    3. If any alignment input was written, the alignment script is run and the
       step is finished; otherwise the step is finished only when
       params.force_parse is set.

    Raises ZCItoolsValueError if no repeat can be read for a sequence.

    Returns the step object.
    """
    SeqIO = import_bio_seq_io()
    seq_idents = input_step.all_sequences()

    step = input_step.project.new_step(AnnotationsStep, step_data)
    step.set_sequences(seq_idents)
    # seq_ident -> mummer data ([length, start_1, start_2])
    mummer_results = step.get_type_description_elem('mummer_results', default=dict())
    #
    ensure_directory(step.step_file('run_dir'))
    calc_mummer = []  # tuples (fasta file, mummer output file) to calculate
    read_mummer = []  # tuples (seq_ident, mummer output file) to parse

    # Mummer
    for seq_ident in sorted(seq_idents - set(mummer_results)):
        fa_file = step.step_file('run_dir', f'{seq_ident}.fa')
        mummer_res_file = step.step_file('run_dir', f'{seq_ident}.out')
        if not os.path.isfile(fa_file):
            seq_rec = input_step.get_sequence_record(seq_ident)
            SeqIO.write([seq_rec], fa_file, 'fasta')
            calc_mummer.append((fa_file, mummer_res_file))
        elif not os.path.isfile(mummer_res_file):
            calc_mummer.append((fa_file, mummer_res_file))
        # An already existing output file is parsed too. Otherwise a sequence
        # with output on disk but no stored result would be missing from
        # mummer_results and fail with KeyError in the loop below.
        read_mummer.append((seq_ident, mummer_res_file))

    # Run mummer
    if calc_mummer:
        mummer_exe = 'repeat-match'  # ToDo:
        n = 3000
        threads = multiprocessing.cpu_count()
        # NOTE(review): the futures are discarded, so an exception raised by
        # _run_single is not re-raised here.
        with ThreadPoolExecutor(max_workers=threads) as executor:
            for fa_file, mummer_res_file in calc_mummer:
                executor.submit(_run_single, mummer_exe, n, fa_file, mummer_res_file)

    for seq_ident, mummer_res_file in read_mummer:
        rep = _read_mummer_repeat(mummer_res_file)
        if not rep:
            raise ZCItoolsValueError(f'No repeat for sequence {seq_ident}!')
        mummer_results[seq_ident] = rep

    # Find sequences extend with alignment
    files_to_zip = []
    calc_mafft = []
    for seq_ident in sorted(seq_idents):
        length, s1, s2 = mummer_results[seq_ident]
        if length >= 23000:
            continue

        # Skip only when BOTH alignment results exist
        if step.is_file('run_dir', f'{seq_ident}_left_align.fa') and \
           step.is_file('run_dir', f'{seq_ident}_right_align.fa'):
            continue

        #
        calc_mafft.append(seq_ident)
        _seq = input_step.get_sequence_record(seq_ident).seq
        seq = str(_seq)
        comp_seq = str(_seq.complement())
        missing = 26000 - length

        # Right side
        p1 = _extract_subseq_plus(seq, s1 + length, missing)
        p2 = _extract_subseq_minus(comp_seq, s2 - length, missing)
        assert len(p1) == len(p2), (length, s1, s2, missing, (len(p1), len(p2)))
        files_to_zip.append(step.step_file('run_dir', f'{seq_ident}_right.fa'))
        write_fasta(files_to_zip[-1], [('p1', p1), ('p2', p2)])

        # Left side
        p1 = _extract_subseq_minus(comp_seq, s1 - 1, missing)
        p2 = _extract_subseq_plus(seq, s2 + 1, missing)
        assert len(p1) == len(p2), (length, s1, s2, missing, (len(p1), len(p2)))
        files_to_zip.append(step.step_file('run_dir', f'{seq_ident}_left.fa'))
        write_fasta(files_to_zip[-1], [('p1', p1), ('p2', p2)])

    # Mafft
    if calc_mafft:
        finish_f = step.step_file('finish.yml')
        write_yaml(dict(calc_seq_idents=calc_mafft), finish_f)

        run = True  # ToDo: ...
        step.save(additional_data=dict(mummer_results=mummer_results), completed=False)
        if run:
            run_module_script(run_mafft_irs, step)
            finish_irs_data(step)
        else:
            files_to_zip.append(finish_f)
            set_run_instructions(run_mafft_irs, step, files_to_zip, _instructions)
    #
    elif params.force_parse:
        finish_irs_data(step)

    return step
Ejemplo n.º 7
0
def orientate_chloroplast_start(step_data, annotation_step, params):
    """Create a ChloroplastOrientateStep and prepare pairwise alignment inputs.

    For each sequence, gene-strand counts per chloroplast part are stored in
    'sequence_data' (once per sequence), and for each part in _part_names the
    missing alignment input files are written into directory <seq_ident>.
    Each input file holds two records: the first params.length_to_check
    characters of the referent part, and a slice of the sequence's part.

    If any alignment output is missing, the alignment script is run and the
    step is finished. Otherwise the step is finished only when
    params.force_parse is set; in all other cases the data is just saved.

    Returns the step object.
    """
    # Find referent genome
    # For each sequence a directory is created named <seq_ident>.
    # NOTE(review): the loop below does not skip the referent sequence itself.
    # It contains files:
    #  - {lsc|ira|ss}_{plus|minus}.fa       : input alignment files, contain 2 sequences.
    #  - align_{lsc|ira|ss}_{plus|minus}.fa : result alignment files.
    # With params.complement set, the '_c' versions (plus_c, minus_c) are expected too.
    seq_idents = annotation_step.all_sequences()  # set
    ref_ident = find_referent_genome(seq_idents, params.referent_genome)
    #
    length = params.length_to_check
    step = annotation_step.project.new_step(ChloroplastOrientateStep, step_data, remove_data=False)
    # seq_ident -> dict with lengths, strand counts and orientation flags (see below)
    sequence_data = step.get_type_description_elem('sequence_data', default=dict())
    #
    seq_rec = annotation_step.get_sequence_record(ref_ident)
    partition = find_chloroplast_partition(seq_rec)
    # First `length` characters of each referent part, in _part_names order
    ref_parts = [str(partition.get_part_by_name(n).extract(seq_rec).seq)[:length] for n in _part_names]
    files_to_zip = []
    align_files = []  # tuples (seq_ident, part name, version) of missing alignment outputs

    #
    all_versions = ('plus', 'minus', 'plus_c', 'minus_c') if params.complement else ('plus', 'minus')
    for seq_ident in sorted(seq_idents):
        # seq_rec / partition are loaded lazily: here if sequence_data is missing,
        # or further below if only alignment files are missing.
        seq_rec = None
        if seq_ident not in sequence_data:
            seq_rec = annotation_step.get_sequence_record(seq_ident)
            partition = find_chloroplast_partition(seq_rec)

            # Count gene orientation
            l_seq = len(seq_rec)
            in_parts = partition.put_features_in_parts(
                Feature(l_seq, feature=f) for f in seq_rec.features if f.type == 'gene')

            # Strand sums: LSC over genes whose name contains 'rpl' or 'rps',
            # SSC over all genes, IRa over genes whose name contains 'rrn'.
            lsc_count = sum(f.feature.strand if any(x in f.name for x in ('rpl', 'rps')) else 0
                            for f in in_parts.get('lsc', []))
            ssc_count = sum(f.feature.strand for f in in_parts.get('ssc', []))
            ira_count = sum(f.feature.strand if 'rrn' in f.name else 0 for f in in_parts.get('ira', []))

            # Boolean flags are derived from the sign of the counts.
            # NOTE(review): presumably True means the part has the expected orientation - confirm.
            sequence_data[seq_ident] = dict(
                length=len(seq_rec),
                lsc=(lsc_count <= 0), lsc_count=lsc_count, lsc_length=len(partition.get_part_by_name('lsc')),
                ssc=(ssc_count <= 0), ssc_count=ssc_count, ssc_length=len(partition.get_part_by_name('ssc')),
                ira=(ira_count >= 0), ira_count=ira_count, ira_length=len(partition.get_part_by_name('ira')))

        # Nothing to do when every alignment output of this sequence exists
        if all(all(step.is_file(seq_ident, f'align_{n}_{v}.fa') for v in all_versions) for n in _part_names):
            continue
        #
        if seq_rec is None:
            seq_rec = annotation_step.get_sequence_record(seq_ident)
            partition = find_chloroplast_partition(seq_rec)
        for n, ref_p in zip(_part_names, ref_parts):
            # Find missing output files
            _num = len(align_files)
            for x in all_versions:
                if not step.is_file(seq_ident, f'align_{n}_{x}.fa'):
                    files_to_zip.append(step.step_file(seq_ident, f'{n}_{x}.fa'))
                    align_files.append((seq_ident, n, x))
            # No output missing for this part
            if _num == len(align_files):
                continue

            # Store input files
            # NOTE(review): this check looks unreachable, since at this point at least
            # one output file of this part was found missing just above.
            if all(step.is_file(seq_ident, f'align_{n}_{v}.fa') for v in all_versions):
                continue
            ensure_directory(step.step_file(seq_ident))
            part_s = partition.get_part_by_name(n).extract(seq_rec)

            # All four input versions are written if missing, regardless of params.complement.
            # The slice [:(-length-1):-1] takes the last `length` characters in reversed order.
            f_p = step.step_file(seq_ident, f'{n}_plus.fa')
            f_p_c = step.step_file(seq_ident, f'{n}_plus_c.fa')
            if not os.path.isfile(f_p):
                write_fasta(f_p, [(ref_ident, ref_p), (seq_ident, str(part_s.seq)[:length])])
            if not os.path.isfile(f_p_c):
                write_fasta(f_p_c, [(ref_ident, ref_p),
                                    (seq_ident, str(part_s.reverse_complement().seq)[:(-length-1):-1])])

            f_m = step.step_file(seq_ident, f'{n}_minus.fa')
            f_m_c = step.step_file(seq_ident, f'{n}_minus_c.fa')
            if not os.path.isfile(f_m):
                write_fasta(f_m, [(ref_ident, ref_p), (seq_ident, str(part_s.reverse_complement().seq)[:length])])
            if not os.path.isfile(f_m_c):
                write_fasta(f_m_c, [(ref_ident, ref_p), (seq_ident, str(part_s.seq)[:(-length-1):-1])])

    # Data stored in the step description
    output_file = f"{params.output_file_prefix}_{length}{'_c' if params.complement else ''}.xlsx"
    data = dict(sequence_data=sequence_data, check_length=length, output_file=output_file, complement=params.complement)
    if align_files:
        # Store finish.yml
        finish_f = step.step_file('finish.yml')
        write_yaml(dict(align_files=align_files), finish_f)

        run = True  # ToDo: ...
        step.save(data, completed=False)
        if run:
            run_module_script(run_orientate, step)
            orientate_chloroplast_finish(step)  # , common_db, calc_seq_idents=calc_seq_idents)
        else:
            files_to_zip.append(finish_f)
            set_run_instructions(run_orientate, step, files_to_zip, _instructions)
    # No alignment missing, but re-parsing of the results was requested
    elif params.force_parse:
        step.save(data)
        orientate_chloroplast_finish(step)  # , common_db, calc_seq_idents=calc_seq_idents)
    # Nothing to calculate and nothing to parse: only store the data
    else:
        step.save(data, completed=False)

    return step
Ejemplo n.º 8
0
def create_mr_bayes_data(step_data, alignment_step, args, partitions_obj,
                         run_threads):
    """Create MrBayes step(s) from an alignment step or a collection of them.

    Three layouts are possible:
     - collection of alignments: MrBayesSteps with one substep per alignment
       (args.num_runs > 1 is not supported here and only prints a warning);
     - single alignment with args.num_runs > 1: MrBayesSteps with substeps
       RUN_1 .. RUN_<num_runs>, each with its own copy of the alignment;
     - single alignment otherwise: one MrBayesStep.

    The file descriptions collected by _copy_alignment_file() are written to
    finish.yml with 'filename' and 'result_prefix' made relative to the step
    directory, since the run script is called from there.

    With a truthy run_threads the calculation is run directly, otherwise run
    instructions are set for the collected files.

    Returns the step object.
    """
    # ToDo: almost the same as raxml.py. Differs in class types, _copy_alignment_file() and file formats
    # List of dicts with attrs: filename, short
    files_to_proc = []

    def _fill_from(target, source):
        # Copy sequence info and alignment file from source step into target step
        target.set_sequences(source.all_sequences())
        target.seq_sequence_type(source.get_sequence_type())
        _copy_alignment_file(source, target, files_to_proc, args,
                             partitions_obj)

    if alignment_step._IS_COLLECTION:
        step = MrBayesSteps(alignment_step.project, step_data, remove_data=True)
        for align_step in alignment_step.step_objects():
            substep = step.create_substep(align_step.get_local_name())
            _fill_from(substep, align_step)
            substep.save(completed=False)
        if args.num_runs and args.num_runs > 1:
            print(
                'Warning: number of runs for collection of alignments is not supported.'
            )
    elif args.num_runs and args.num_runs > 1:
        step = MrBayesSteps(alignment_step.project, step_data, remove_data=True)
        for run_no in range(1, args.num_runs + 1):
            substep = step.create_substep(f'RUN_{run_no}')
            # ToDo: make symbolic links?
            _fill_from(substep, alignment_step)
            substep.save(completed=False)
    else:
        step = MrBayesStep(alignment_step.project, step_data, remove_data=True)
        _fill_from(step, alignment_step)

    # Files to zip are collected before the paths are made relative
    files_to_zip = [desc['filename'] for desc in files_to_proc]
    # Run script works from the step directory, so store relative paths
    for desc in files_to_proc:
        desc['filename'] = step.strip_step_dir(desc['filename'])
        desc['result_prefix'] = step.strip_step_dir(desc['result_prefix'])
    finish_f = step.step_file('finish.yml')
    write_yaml(files_to_proc, finish_f)

    # Stores description.yml
    step.save(completed=bool(run_threads))

    if not run_threads:
        files_to_zip.append(finish_f)
        set_run_instructions(run_mr_bayes, step, files_to_zip, _instructions)
    else:
        run_module_script(run_mr_bayes,
                          step,
                          threads=run_threads,
                          use_mpi=(not args.no_mpi))

    return step