Example #1
import argparse
import os

from helper_functions import is_fasta, is_user_info_yaml, is_valid_repo, is_valid_slurm_config, parse_output_path, is_valid_fastq_path

__location__ = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__)))
included_pipelines = os.listdir(__location__ + '/assembler_commands/')
included_pipelines = [os.path.splitext(pl)[0] for pl in included_pipelines]
included_pipelines = ', '.join(included_pipelines)

working_dir = ('-w', '--working-dir', {
    'type': lambda x: parse_output_path(x),
    'required': True,
    'help': 'Intermediate folder where results are stored.'
})

# TODO: implement argcheck
fast5_dir = ('-f', '--fast5-dir', {
    'type': lambda x: os.path.realpath(x),
    'required': False,
    'help': 'Directory containing fast5 files for the provided reads. '
            'Required for some tools (e.g. Nanopolish).'
})

# TODO: implement argcheck
gff_file = ('-g', '--gff-file', {
    'type': lambda x: os.path.realpath(x),
    'required': False,
    'help': 'GFF file with gene annotations for the reference genome (optional, passed on to Quast).'
})
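
# Illustrative sketch (assumption, not part of the original module): tuples like
# working_dir, fast5_dir and gff_file above are presumably unpacked into an
# argparse.ArgumentParser elsewhere. A minimal registration helper could look like:
def _register_arguments_sketch(parser, *arg_tuples):
    # each tuple holds (short_flag, long_flag, kwargs) as defined above
    for short_flag, long_flag, kwargs in arg_tuples:
        parser.add_argument(short_flag, long_flag, **kwargs)
    return parser

# hypothetical usage:
# args = _register_arguments_sketch(argparse.ArgumentParser(),
#                                   working_dir, fast5_dir, gff_file).parse_args()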
Example #2
parser.add_argument('--use-tombo',
                    action='store_true',
                    default=False,
                    help='Given hdf path refers to a tombo analysis.')
parser.add_argument(
    '--clipped-bases',
    type=int,
    required=False,
    default=10,
    help='Define how many bases should be clipped off ends of training reads. '
    'Overruled by --use-tombo (in which case base clipping will depend on '
    'an alignment).')

args = parser.parse_args()

file_list = parse_input_path(args.input)
out_path = parse_output_path(args.db_dir)
db_name = out_path + 'db.fs'
error_fn = out_path + 'failed_reads.txt'
npz_path = out_path + 'test_squiggles/'
npz_path = parse_output_path(npz_path)
if isfile(db_name):
    raise ValueError('DB by this name already exists!')

# Very light check on tombo usage
if args.use_tombo and 'RawGenomeCorrected' not in args.hdf_path:
    raise UserWarning(
        'Tombo files should be used, but hdf path does not seem tombo-generated...'
    )

db = ExampleDb(db_name=db_name, width=args.width)
nb_files = len(file_list)
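
# Sketch (assumption): in these examples parse_output_path(...) is always used as
# `out = parse_output_path(path); out + 'some_file'`, which implies it creates the
# directory if needed and returns the path with a trailing slash. The real helper
# lives in helper_functions; a stand-in with that contract might look like:
def parse_output_path_sketch(location):
    import os
    if not location.endswith('/'):
        location += '/'
    os.makedirs(location, exist_ok=True)  # create the folder if it does not exist yet
    return location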
Example #3
def main(args):
    __location__ = os.path.realpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))

    # additional argument checks
    if not os.path.isdir(args.working_dir):
        raise ValueError('Working directory not found')
    args.working_dir = os.path.realpath(args.working_dir) + '/'
    if os.path.isdir(args.working_dir + 'analysis/'):
        shutil.rmtree(args.working_dir + 'analysis/')

    options_dict = dict()
    options_dict['wd_envs'] = hp.parse_output_path(args.working_dir + 'envs/')
    options_dict['threads'] = args.threads_per_job
    options_dict['ref_fasta'] = os.path.realpath(args.ref_fasta)
    options_dict['reads_fastq'] = args.working_dir + 'all_reads.fastq'
    options_dict['wd_analysis'] = hp.parse_output_path(args.working_dir +
                                                       'analysis/')
    options_dict[
        'wd_analysis_condas'] = __location__ + '/analysis_conda_files/'
    options_dict['__location__'] = __location__

    # --- create output directories
    if os.path.isdir(options_dict['wd_analysis']):
        shutil.rmtree(options_dict['wd_analysis'])
    _ = hp.parse_output_path(options_dict['wd_analysis'] + 'quast')
    _ = hp.parse_output_path(options_dict['wd_analysis'] + 'jellyfish')
    _ = hp.parse_output_path(options_dict['wd_analysis'] + 'readset_analysis')

    options_dict['wd_analysis_summary'] = hp.parse_output_path(
        options_dict['wd_analysis'] + 'summary/')
    options_dict[
        'wd_assembler_results'] = args.working_dir + 'assembler_results/'
    options_dict[
        'wd_assemblies'] = args.working_dir + 'assembler_results/assemblies/'
    assemblies_list = hp.parse_input_path(options_dict['wd_assemblies'],
                                          pattern='*.fasta')
    if len(assemblies_list) == 0:
        raise ValueError('No successful assemblies found to analyze!')
    assemblies_names_list = [
        os.path.splitext(os.path.basename(af))[0] for af in assemblies_list
    ]
    options_dict['assemblies_string'] = ' '.join(assemblies_names_list)
    with open(args.user_info, 'r') as f:
        md_yaml = yaml.safe_load(f)
    md = Metadata(md_yaml)
    md.write_publication_info(options_dict['wd_analysis_summary'] +
                              'publication_info.yaml')
    # --- Quast ---
    options_dict['quast_options'] = ''
    if md.is_eukaryote:
        options_dict['quast_options'] += '-e '
    if args.gff_file:
        options_dict['quast_options'] += '-G ' + os.path.abspath(
            args.gff_file) + ' '
    quast_output = ''
    quast_output_cmd = ''
    for anl in assemblies_names_list:
        quast_output += (
            ',\n\t\t{anl}_fplot=\'{wd_analysis_summary}quast/{anl}.fplot\''
            ',\n\t\t{anl}_rplot=\'{wd_analysis_summary}quast/{anl}.rplot\''
        ).format(anl=anl,
                 wd_analysis_summary=options_dict['wd_analysis_summary'])
        quast_output_cmd += (
            'if [ -e contigs_reports/nucmer_output/{anl}.fplot ]; then '  # for quast <5.0.0
            'cp contigs_reports/nucmer_output/{anl}.fplot {wd_analysis_summary}quast/.\n'
            'cp contigs_reports/nucmer_output/{anl}.rplot {wd_analysis_summary}quast/.\n'
            'fi\n').format(
                anl=anl,
                wd_analysis_summary=options_dict['wd_analysis_summary'])
        quast_output_cmd += (
            'if [ -e contigs_reports/all_alignments_{anl}.tsv ]; then '  # for quast =>5.0.0
            'cp contigs_reports/all_alignments_{anl}.tsv {wd_analysis_summary}quast/.\n'
            'fi\n').format(
                anl=anl,
                wd_analysis_summary=options_dict['wd_analysis_summary'])
    options_dict['quast_output'] = quast_output
    options_dict['quast_output_cmd'] = quast_output_cmd

    # --- Construct snakemake file ---
    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    sf_fn = args.working_dir + 'Snakefile_analysis_' + timestamp
    with open(__location__ + '/Snakemake_analysis', 'r') as f:
        sf = f.read()

    sf = sf.format(**options_dict)
    with open(sf_fn, 'w') as sf_handle:
        sf_handle.write(sf)

    sm_dict = {'use_conda': True}

    if args.slurm_config is not None:
        sm_dict['cluster'] = 'sbatch'
        sm_dict['cluster_config'] = args.slurm_config
        sm_dict['nodes'] = 5

    snakemake.snakemake(sf_fn, **sm_dict)
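
# Sketch (assumption): the Snakemake_analysis file read above is treated as a plain
# text template whose named placeholders correspond to keys of options_dict and are
# filled in with str.format(**options_dict). A toy illustration of the pattern (the
# rule body below is made up, not the real template):
toy_template = (
    "rule quast:\n"
    "    threads: {threads}\n"
    "    shell: 'quast {quast_options} -o {wd_analysis_summary}quast'\n"
)
# e.g.: toy_template.format(threads=4, quast_options='-e ',
#                           wd_analysis_summary='/tmp/analysis/summary/')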
Example #4
def main(args):
    summary_dir = os.path.realpath(args.working_dir) + '/analysis/summary/'
    if not os.path.isdir(summary_dir):
        raise ValueError('No summary directory found at {}, '
                         'did you run analysis already?'.format(
                             args.working_dir))
    if args.git:
        git_dir = os.path.realpath(args.working_dir) + '/analysis/to_github/'
        if os.path.isdir(git_dir):
            rmtree(git_dir)
        repo_obj = Repo.clone_from(url=args.git, to_path=git_dir)
        copy_tree(summary_dir, git_dir)
        file_list = [f for f in parse_input_path(git_dir) if '.git' not in f]
        repo_obj.index.add(file_list)
        repo_obj.index.commit(message='added benchmark results')
        repo_obj.remote('origin').push()
        print('Results summary pushed to {}'.format(args.git))
    share_results = input(
        f'\n\nporeTally has finished! Check your report here: {summary_dir + "REPORT.html"} \n'
        '\n'
        'You can help the MinION user community gain insight into the performance of de novo '
        'assemblers and pick the best one for their dataset, by submitting your results to a shared '
        'Github repository (github.com/cvdelannoy/assembler_benchmark_new_submissions). In short, '
        'this will be done completely automatically, by issuing a fork request from a given Github '
        'account. The whole process is transparent, as all the pull requests are publicly visible. '
        'The collected benchmark info will periodically be curated and summarized in insightful '
        'tables and figures. Of course, submissions will be duly credited and no sequence-specific '
        'information will be shared, only quality metrics!'
        '\n'
        'Would you like to submit your results? (y/n) ')
    while share_results not in ['y', 'n', 'yes', 'no']:
        share_results = input('please answer with y(es) or n(o): ')
    if share_results in ['y', 'yes']:
        print('\nGreat, thanks for sharing!\n')

        # ---- Forking ----
        while True:
            ses = requests.Session()

            github_username = input(
                'Github username for the account from which you want to submit results: '
            )

            ses.auth = (github_username, getpass('Enter Github password: '))
            submission_url = 'https://github.com/{git_id}/poreTally_collective_submissions.git'.format(
                git_id=github_username)
                git_id=github_username)
            fork_req = ses.post(
                'https://api.github.com/repos/cvdelannoy/poreTally_collective_submissions/forks'
            )
            if int(fork_req.status_code) == 202:
                break
            elif int(fork_req.status_code) == 401:
                print(
                    'Authentication for Github account {} failed. Please retry.'
                    .format(github_username))
            else:
                print(
                    'Authentication failed due to some unforeseen '
                    'circumstance (HTTP status code {status}). Please retry. '.
                    format(status=fork_req.status_code))

        # ---- add, commit, push ----
        print('\nPushing results to fork...')
        all_submissions_dir = os.path.realpath(
            args.working_dir) + '/analysis/collective_submissions/'
        if os.path.isdir(all_submissions_dir):
            rmtree(all_submissions_dir)
        cur_foldername = strftime('%Y%m%d%H%M%S', gmtime())
        repo_obj = Repo.clone_from(url=submission_url,
                                   to_path=all_submissions_dir)
        branch_obj = repo_obj.create_head(cur_foldername)
        repo_obj.head.reference = branch_obj
        repo_obj.head.reset(index=True, working_tree=True)
        submission_dir = parse_output_path(all_submissions_dir +
                                           cur_foldername)
        copy_tree(summary_dir, submission_dir)
        file_list = [
            f for f in parse_input_path(submission_dir) if '.git' not in f
        ]
        repo_obj.index.add(file_list)
        repo_obj.index.commit(message='collective benchmark submission')

        # child.logfile = sys.stdout
        child = pexpect.spawn(
            'git push origin HEAD:{branch}'.format(branch=cur_foldername),
            cwd=submission_dir)
        child.expect('Username for \'https://github.com\':.*', timeout=2)
        child.sendline(ses.auth[0] + '\n')
        child.expect(
            '{un}\r\n\r\nPassword for \'https://{un}@github.com\':.*'.format(
                un=ses.auth[0]))
        child.sendline(ses.auth[1])
        push_check = child.read()
        if b'[new branch] HEAD' not in push_check:
            raise ValueError(
                '''Pushing to collective benchmark fork has failed... Sorry, that should not happen!

            Please report the error below on the poreTally github:

            {push_txt}'''.format(push_txt=push_check))

        # ---- submit pull request ----
        print('\nResults pushed! Issuing pull request...')
        pull_url = 'https://api.github.com/repos/cvdelannoy/poreTally_collective_submissions/pulls'
        pull_bool = False
        for _ in range(3):
            pull_params = {
                'title': 'poreTally_collective_submissions',
                'base': 'master',
                'head': github_username + ':' + cur_foldername
            }
            pull_req = ses.post(pull_url, json=pull_params)
            if int(pull_req.status_code) == 201:
                print(
                    '\nResults were successfully submitted to the poreTally collective benchmark!\n\n'
                    'Keep an eye on https://github.com/cvdelannoy/poreTallyCommunity for future analyses.'
                )
                pull_bool = True
                break
            elif int(pull_req.status_code) == 401:
                print(
                    'Authentication for Github account {} failed when attempting a pull request. '
                    'Retrying...'.format(github_username))
            else:
                print(
                    'Authentication for issuing a pull request failed due to some unforeseen '
                    'circumstance (HTTP status code {status}). Retrying... '.
                    format(status=pull_req.status_code))
        if not pull_bool:
            raise ValueError(
                '''Making a pull request for the collective benchmark fork has failed... Sorry, that
            should not happen! Please report the error below on the poreTally github:

            HTTP status code {sc}: {pull_txt}'''.format(
                    sc=pull_req.status_code, pull_txt=pull_req.reason))
Example #5
def main(args):
    __location__ = os.path.realpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))
    fastq_list = hp.parse_input_path(args.reads_dir, pattern='*.f*q')
    wd = args.working_dir

    # Make necessary subdirs
    wd_envs = hp.parse_output_path(wd + 'envs/')
    wd_results = hp.parse_output_path(wd + 'assembler_results/')
    wd_assemblies = hp.parse_output_path(wd_results + 'assemblies/')
    wd_logs = hp.parse_output_path(wd_results + 'log_files/')
    wd_cpu = hp.parse_output_path(wd_results + 'cpu_files/')
    wd_condas = hp.parse_output_path(wd_results + 'conda_files/')
    wd_commands = hp.parse_output_path(wd_results + 'command_files/')

    if os.path.exists(wd + 'Snakefile'):
        os.remove(wd + 'Snakefile')

    # merge fastq's
    all_reads_fastq = wd + 'all_reads.fastq'
    with open(all_reads_fastq, 'wb') as afq:
        for f in fastq_list:
            with open(f, 'rb') as fd:
                shutil.copyfileobj(fd, afq)

    param_dict = dict()
    param_dict['NB_THREADS'] = args.threads_per_job
    param_dict['REFGENOME_SIZE'] = hp.get_nb_bases(args.ref_fasta, 'fasta')
    param_dict['SEQUENCED_SIZE'] = hp.get_nb_bases(all_reads_fastq, 'fastq')
    param_dict['COVERAGE'] = param_dict['SEQUENCED_SIZE'] / param_dict[
        'REFGENOME_SIZE']
    param_dict['WD'] = wd
    if args.fast5_dir:
        fast5_dir_abs = os.path.abspath(args.fast5_dir) + '/'
        param_dict['FAST5_DIR'] = fast5_dir_abs

    # Construct Snakefile
    # construct unique name for snakefile first
    sf_fn = wd + 'Snakefile_assemblies_' + datetime.datetime.now().strftime(
        '%Y%m%d%H%M%S')
    cmds_dict = dict()
    sf_dict = dict()
    if 'default' in args.pipelines:
        args.pipelines += [
            'canu', 'flye', 'smartdenovo', 'minimap2_miniasm',
            'minimap2_miniasm_raconX2'
        ]
        args.pipelines.remove('default')
    nb_pipelines = 0
    pipelines_list = []
    for pipeline in args.pipelines:
        if os.path.isfile(pipeline):
            yaml_fn = pipeline
            pipeline = os.path.splitext(os.path.basename(pipeline))[0]
        else:
            yaml_fn = __location__ + '/assembler_commands/' + pipeline + '.yaml'
        if os.path.isfile(yaml_fn):
            with open(yaml_fn, 'r') as plf:
                pl_dict = yaml.safe_load(plf)
        else:
            warnings.warn('Could not find yaml file for {pl}, skipping'.format(
                pl=yaml_fn))
            continue

        wd_cur_path = wd_results + pipeline
        # Ensure clean output folder, as some assemblers error out otherwise
        if os.path.isdir(wd_cur_path):
            shutil.rmtree(wd_cur_path)
        wd_cur = hp.parse_output_path(wd_cur_path)

        sf_dict[pipeline] = {
            'input': {
                'fastq': wd + 'all_reads.fastq'
            },
            'threads': [args.threads_per_job],
            'output': [wd_assemblies + pipeline + '.fasta'],
            'log': [wd_logs + pipeline + '.log'],
            'benchmark': [wd_cpu + pipeline + '.bm']
        }

        conda = pl_dict.get('conda')
        if conda:
            with open(wd_condas + pipeline + '.yaml', 'w') as cf:
                yaml.dump(conda, cf, default_flow_style=False)
            sf_dict[pipeline]['conda'] = [wd_condas + pipeline + '.yaml']
        sf_dict[pipeline]['group'] = ['pipelines']
        assembly_cmds = pl_dict['commands'].format(**param_dict)
        cmds = list()
        cmds.extend(
            hp.parse_version_commands(pl_dict['versions'],
                                      pl_dict['description']))
        cmds.append('cd {}'.format(wd_cur))
        cmds.extend(assembly_cmds.split(sep='\n'))
        cmds_dict[pipeline] = cmds
        with open(wd_commands + pipeline + '.cmd', 'w') as f:
            f.write(assembly_cmds)
        nb_pipelines += 1
        pipelines_list.append(pipeline)
    # save envs in same location as results (otherwise defaults to current location)
    sf_string = 'workdir: \'{}\'\n\n'.format(wd_envs)
    sf_string += hp.dict_to_snakefile(cmds_dict, sf_dict)
    with open(sf_fn, 'a') as sf:
        sf.write(sf_string)

    sm_dict = {
        'targets': pipelines_list,
        'use_conda': True,
        'cores': args.threads_per_job
    }

    # ---- Cluster-related ----
    if args.slurm_config is not None:
        with open(args.slurm_config, 'r') as slurmf:
            slurm_config_dict = json.load(slurmf)
        partition_name = slurm_config_dict['__default__']['partition']
        sinfo_list = check_output(['sinfo']).decode('utf-8').split('\n')
        sinfo_header = {n: i for i, n in enumerate(sinfo_list[0].split())}
        nb_nodes = None
        for sil in sinfo_list[1:]:
            if partition_name in sil:
                nb_nodes = int(sil.split()[sinfo_header['NODES']])
                break
        if nb_nodes is None:
            raise ValueError(
                'supplied SLURM partition {} not found'.format(partition_name))
        nb_nodes = min(nb_nodes, nb_pipelines)
        sm_dict['nodes'] = nb_nodes
        tasks_per_node = max(nb_pipelines // nb_nodes, 1)
        sbatch_line = 'sbatch --nodes={nbn} --ntasks-per-node={tpn} --cpus-per-task={cpt}'.format(
            nbn=nb_nodes, tpn=tasks_per_node, cpt=args.threads_per_job)
        for n in list(slurm_config_dict['__default__']):
            sbatch_line += ' --{opt}={{cluster.{opt}}}'.format(opt=n)
        sm_dict['cluster'] = sbatch_line
        sm_dict['cluster_config'] = args.slurm_config
        sm_dict['local_cores'] = args.threads_per_job

    snakemake.snakemake(sf_fn, **sm_dict)
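
# Sketch (assumption): the --slurm-config file read above is a JSON cluster config.
# Based on how it is consumed ('__default__' must contain at least a 'partition'
# key, and every key under '__default__' is turned into an sbatch
# --<opt>={cluster.<opt>} flag), a minimal config could look like the dict below;
# the partition and time values are placeholders:
example_slurm_config = {
    '__default__': {
        'partition': 'compute',  # must match a partition name listed by sinfo
        'time': '24:00:00'       # any extra key becomes an sbatch flag (--time=...)
    }
}
# e.g.: json.dump(example_slurm_config, open('slurm_config.json', 'w'), indent=2)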