import argparse
import os
from helper_functions import is_fasta, is_user_info_yaml, is_valid_repo, is_valid_slurm_config, \
    parse_output_path, is_valid_fastq_path

__location__ = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__)))

included_pipelines = os.listdir(__location__ + '/assembler_commands/')
included_pipelines = [os.path.splitext(pl)[0] for pl in included_pipelines]
included_pipelines = ', '.join(included_pipelines)

working_dir = ('-w', '--working-dir', {
    'type': lambda x: parse_output_path(x),
    'required': True,
    'help': 'Intermediate folder where results are stored.'
})

# TODO: implement argcheck
fast5_dir = ('-f', '--fast5-dir', {
    'type': lambda x: os.path.realpath(x),
    'required': False,
    'help': 'Directory containing fast5 reads for the provided reads. '
            'Required for some tools (e.g. Nanopolish).'
})

# TODO: implement argcheck
gff_file = ('-g', '--gff-file', {
    'type': lambda x: os.path.realpath(x),
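
# The tuples above follow a (short_flag, long_flag, kwargs) convention so that several
# subcommands can share one set of argument definitions. A minimal sketch of how they
# might be registered on a parser; the helper name _add_shared_args and the parser
# description are illustrative, not taken from this file:
def _add_shared_args(parser, *arg_tuples):
    """Unpack (short_flag, long_flag, kwargs) tuples onto an argparse parser."""
    for short_flag, long_flag, kwargs in arg_tuples:
        parser.add_argument(short_flag, long_flag, **kwargs)

# example usage:
# parser = argparse.ArgumentParser(description='benchmark de novo assembly pipelines')
# _add_shared_args(parser, working_dir, fast5_dir)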
                    action='store_true', default=False,
                    help='Given hdf path refers to a tombo analysis.')
parser.add_argument('--clipped-bases', type=int, required=False, default=10,
                    help='Define how many bases should be clipped off ends of training reads. '
                         'Overruled by --use-tombo (in which case base clipping will depend on '
                         'an alignment).')
args = parser.parse_args()

file_list = parse_input_path(args.input)
out_path = parse_output_path(args.db_dir)
db_name = out_path + 'db.fs'
error_fn = out_path + 'failed_reads.txt'
npz_path = out_path + 'test_squiggles/'
npz_path = parse_output_path(npz_path)
if isfile(db_name):
    raise ValueError('DB by this name already exists!')

# Very light check on tombo usage
if args.use_tombo and 'RawGenomeCorrected' not in args.hdf_path:
    raise UserWarning('Tombo files should be used, but hdf path does not seem tombo-generated...')

db = ExampleDb(db_name=db_name, width=args.width)
nb_files = len(file_list)
def main(args):
    __location__ = os.path.realpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))

    # additional argument checks
    if not os.path.isdir(args.working_dir):
        raise ValueError('Working directory not found')
    args.working_dir = os.path.realpath(args.working_dir) + '/'
    if os.path.isdir(args.working_dir + 'analysis/'):
        shutil.rmtree(args.working_dir + 'analysis/')

    options_dict = dict()
    options_dict['wd_envs'] = hp.parse_output_path(args.working_dir + 'envs/')
    options_dict['threads'] = args.threads_per_job
    options_dict['ref_fasta'] = os.path.realpath(args.ref_fasta)
    options_dict['reads_fastq'] = args.working_dir + 'all_reads.fastq'
    options_dict['wd_analysis'] = hp.parse_output_path(args.working_dir + 'analysis/')
    options_dict['wd_analysis_condas'] = __location__ + '/analysis_conda_files/'
    options_dict['__location__'] = __location__

    # --- create output directories ---
    if os.path.isdir(options_dict['wd_analysis']):
        shutil.rmtree(options_dict['wd_analysis'])
    _ = hp.parse_output_path(options_dict['wd_analysis'] + 'quast')
    _ = hp.parse_output_path(options_dict['wd_analysis'] + 'jellyfish')
    _ = hp.parse_output_path(options_dict['wd_analysis'] + 'readset_analysis')
    options_dict['wd_analysis_summary'] = hp.parse_output_path(
        options_dict['wd_analysis'] + 'summary/')
    options_dict['wd_assembler_results'] = args.working_dir + 'assembler_results/'
    options_dict['wd_assemblies'] = args.working_dir + 'assembler_results/assemblies/'

    assemblies_list = hp.parse_input_path(options_dict['wd_assemblies'], pattern='*.fasta')
    if len(assemblies_list) == 0:
        raise ValueError('No successful assemblies found to analyze!')
    assemblies_names_list = [os.path.splitext(os.path.basename(af))[0] for af in assemblies_list]
    options_dict['assemblies_string'] = ' '.join(assemblies_names_list)

    with open(args.user_info, 'r') as f:
        md_yaml = yaml.load(f)
    md = Metadata(md_yaml)
    md.write_publication_info(options_dict['wd_analysis_summary'] + 'publication_info.yaml')

    # --- Quast ---
    options_dict['quast_options'] = ''
    if md.is_eukaryote:
        options_dict['quast_options'] += '-e '
    if args.gff_file:
        options_dict['quast_options'] += '-G ' + os.path.abspath(args.gff_file) + ' '
    quast_output = ''
    quast_output_cmd = ''
    for anl in assemblies_names_list:
        quast_output += (
            ',\n\t\t{anl}_fplot=\'{wd_analysis_summary}quast/{anl}.fplot\''
            ',\n\t\t{anl}_rplot=\'{wd_analysis_summary}quast/{anl}.rplot\''
        ).format(anl=anl, wd_analysis_summary=options_dict['wd_analysis_summary'])
        quast_output_cmd += (  # for quast <5.0.0
            'if [ -e contigs_reports/nucmer_output/{anl}.fplot ]; then '
            'cp contigs_reports/nucmer_output/{anl}.fplot {wd_analysis_summary}quast/.\n'
            'cp contigs_reports/nucmer_output/{anl}.rplot {wd_analysis_summary}quast/.\n'
            'fi\n'
        ).format(anl=anl, wd_analysis_summary=options_dict['wd_analysis_summary'])
        quast_output_cmd += (  # for quast >=5.0.0
            'if [ -e contigs_reports/all_alignments_{anl}.tsv ]; then '
            'cp contigs_reports/all_alignments_{anl}.tsv {wd_analysis_summary}quast/.\n'
            'fi\n'
        ).format(anl=anl, wd_analysis_summary=options_dict['wd_analysis_summary'])
    options_dict['quast_output'] = quast_output
    options_dict['quast_output_cmd'] = quast_output_cmd

    # --- Construct snakemake file ---
    sf_fn = args.working_dir + 'Snakefile_analysis_' + datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    with open(__location__ + '/Snakemake_analysis', 'r') as f:
        sf = f.read()
    sf = sf.format(**options_dict)
    with open(sf_fn, 'w') as sf_handle:
        sf_handle.write(sf)

    sm_dict = {'use_conda': True}
    if args.slurm_config is not None:
        sm_dict['cluster'] = 'sbatch'
        sm_dict['cluster_config'] = args.slurm_config
        sm_dict['nodes'] = 5
    snakemake.snakemake(sf_fn, **sm_dict)
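
# A minimal sketch of how the final call above could instead surface workflow failures,
# assuming the pre-v8 Snakemake Python API (used here), in which snakemake.snakemake()
# returns True on success and False otherwise; the error message wording is illustrative:
# success = snakemake.snakemake(sf_fn, **sm_dict)
# if not success:
#     raise RuntimeError('Analysis workflow {} did not complete successfully'.format(sf_fn))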
def main(args):
    summary_dir = os.path.realpath(args.working_dir) + '/analysis/summary/'
    if not os.path.isdir(summary_dir):
        raise ValueError('No summary directory found at {}, '
                         'did you run the analysis already?'.format(args.working_dir))
    if args.git:
        git_dir = os.path.realpath(args.working_dir) + '/analysis/to_github/'
        if os.path.isdir(git_dir):
            rmtree(git_dir)
        repo_obj = Repo.clone_from(url=args.git, to_path=git_dir)
        copy_tree(summary_dir, git_dir)
        file_list = [f for f in parse_input_path(git_dir) if '.git' not in f]
        repo_obj.index.add(file_list)
        repo_obj.index.commit(message='added benchmark results')
        repo_obj.remote('origin').push()
        print('Results summary pushed to {}'.format(args.git))

    share_results = input(
        f'\n\nporeTally has finished! Check your report here: {summary_dir + "REPORT.html"}\n'
        '\n'
        'You can help the MinION user community gain insight into the performance of de novo '
        'assemblers and pick the best one for their dataset, by submitting your results to a shared '
        'Github repository (github.com/cvdelannoy/assembler_benchmark_new_submissions). In short, '
        'this will be done completely automatically, by issuing a fork request from a given Github '
        'account. The whole process is transparent, as all the pull requests are publicly visible. '
        'The collected benchmark info will periodically be curated and summarized in insightful '
        'tables and figures. Of course, submissions will be duly credited and no sequence-specific '
        'information will be shared, only quality metrics!'
        '\n'
        'Would you like to submit your results? (y/n) ')
    while share_results not in ['y', 'n', 'yes', 'no']:
        share_results = input('please answer with y(es) or n(o): ')
    if share_results in ['y', 'yes']:
        print('\nGreat, thanks for sharing!\n')

        # ---- Forking ----
        while True:
            ses = requests.Session()
            github_username = input('Github username for the account from which you want to submit results: ')
            ses.auth = (github_username, getpass('Enter Github password: '))
            submission_url = 'https://github.com/{git_id}/poreTally_collective_submissions.git'.format(
                git_id=github_username)
            fork_req = ses.post('https://api.github.com/repos/cvdelannoy/poreTally_collective_submissions/forks')
            if int(fork_req.status_code) == 202:
                break
            elif int(fork_req.status_code) == 401:
                print('Authentication for Github account {} failed. Please retry.'.format(github_username))
            else:
                print('Authentication failed due to some unforeseen '
                      'circumstance (HTTP status code {status}). Please retry.'.format(status=fork_req.status_code))

        # ---- add, commit, push ----
        print('\nPushing results to fork...')
        all_submissions_dir = os.path.realpath(args.working_dir) + '/analysis/collective_submissions/'
        if os.path.isdir(all_submissions_dir):
            rmtree(all_submissions_dir)
        cur_foldername = strftime('%Y%m%d%H%M%S', gmtime())
        repo_obj = Repo.clone_from(url=submission_url, to_path=all_submissions_dir)
        branch_obj = repo_obj.create_head(cur_foldername)
        repo_obj.head.reference = branch_obj
        repo_obj.head.reset(index=True, working_tree=True)
        submission_dir = parse_output_path(all_submissions_dir + cur_foldername)
        copy_tree(summary_dir, submission_dir)
        file_list = [f for f in parse_input_path(submission_dir) if '.git' not in f]
        repo_obj.index.add(file_list)
        repo_obj.index.commit(message='collective benchmark submission')
        # child.logfile = sys.stdout
        child = pexpect.spawn('git push origin HEAD:{branch}'.format(branch=cur_foldername),
                              cwd=submission_dir)
        child.expect('Username for \'https://github.com\':.*', timeout=2)
        child.sendline(ses.auth[0] + '\n')
        child.expect('{un}\r\n\r\nPassword for \'https://{un}@github.com\':.*'.format(un=ses.auth[0]))
        child.sendline(ses.auth[1])
        push_check = child.read()
        if b'[new branch] HEAD' not in push_check:
            raise ValueError('Pushing to collective benchmark fork has failed... Sorry, that should not happen! '
                             'Please report the error below on the poreTally github:\n'
                             '{push_txt}'.format(push_txt=push_check))

        # ---- submit pull request ----
        print('\nResults pushed! Issuing pull request...')
        pull_url = 'https://api.github.com/repos/cvdelannoy/poreTally_collective_submissions/pulls'
        pull_bool = False
        for _ in range(3):
            pull_params = {'title': 'poreTally_collective_submissions',
                           'base': 'master',
                           'head': github_username + ':' + cur_foldername}
            pull_req = ses.post(pull_url, json=pull_params)
            if int(pull_req.status_code) == 201:
                print('\nResults were successfully submitted to the poreTally collective benchmark!\n\n'
                      'Keep an eye on https://github.com/cvdelannoy/poreTallyCommunity for future analyses.')
                pull_bool = True
                break
            elif int(pull_req.status_code) == 401:
                print('Authentication for Github account {} failed when attempting a pull request. '
                      'Retrying...'.format(github_username))
            else:
                print('Authentication for issuing a pull request failed due to some unforeseen '
                      'circumstance (HTTP status code {status}). Retrying...'.format(status=pull_req.status_code))
        if not pull_bool:
            raise ValueError('Making a pull request for the collective benchmark fork has failed... Sorry, that '
                             'should not happen! Please report the error below on the poreTally github:\n'
                             'HTTP status code {sc}: {pull_txt}'.format(sc=pull_req.status_code,
                                                                        pull_txt=pull_req.reason))
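
# The fork endpoint replies 202 Accepted because GitHub creates forks asynchronously, so the
# clone above can race the fork creation. A minimal sketch of polling the new fork before
# cloning; the helper name, attempt count and delay are illustrative, only the standard
# GET /repos/{owner}/{repo} endpoint and the authenticated session from above are assumed:
import time

def _wait_for_fork(session, owner, repo, attempts=10, delay=3):
    """Return True once the freshly created fork is reachable via the GitHub API."""
    url = 'https://api.github.com/repos/{owner}/{repo}'.format(owner=owner, repo=repo)
    for _ in range(attempts):
        if session.get(url).status_code == 200:
            return True
        time.sleep(delay)
    return False

# example usage, directly after the fork request returns 202:
# if not _wait_for_fork(ses, github_username, 'poreTally_collective_submissions'):
#     raise ValueError('Fork of poreTally_collective_submissions did not appear in time')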
def main(args):
    __location__ = os.path.realpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))

    fastq_list = hp.parse_input_path(args.reads_dir, pattern='*.f*q')
    wd = args.working_dir

    # Make necessary subdirs
    wd_envs = hp.parse_output_path(wd + 'envs/')
    wd_results = hp.parse_output_path(wd + 'assembler_results/')
    wd_assemblies = hp.parse_output_path(wd_results + 'assemblies/')
    wd_logs = hp.parse_output_path(wd_results + 'log_files/')
    wd_cpu = hp.parse_output_path(wd_results + 'cpu_files/')
    wd_condas = hp.parse_output_path(wd_results + 'conda_files/')
    wd_commands = hp.parse_output_path(wd_results + 'command_files/')

    if os.path.exists(wd + 'Snakefile'):
        os.remove(wd + 'Snakefile')

    # merge fastq's
    all_reads_fastq = wd + 'all_reads.fastq'
    with open(all_reads_fastq, 'wb') as afq:
        for f in fastq_list:
            with open(f, 'rb') as fd:
                shutil.copyfileobj(fd, afq)

    param_dict = dict()
    param_dict['NB_THREADS'] = args.threads_per_job
    param_dict['REFGENOME_SIZE'] = hp.get_nb_bases(args.ref_fasta, 'fasta')
    param_dict['SEQUENCED_SIZE'] = hp.get_nb_bases(all_reads_fastq, 'fastq')
    param_dict['COVERAGE'] = param_dict['SEQUENCED_SIZE'] / param_dict['REFGENOME_SIZE']
    param_dict['WD'] = wd
    if args.fast5_dir:
        fast5_dir_abs = os.path.abspath(args.fast5_dir) + '/'
        param_dict['FAST5_DIR'] = fast5_dir_abs

    # Construct Snakefile
    # construct unique name for snakefile first
    sf_fn = wd + 'Snakefile_assemblies_' + datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    cmds_dict = dict()
    sf_dict = dict()
    if 'default' in args.pipelines:
        args.pipelines += ['canu', 'flye', 'smartdenovo',
                           'minimap2_miniasm', 'minimap2_miniasm_raconX2']
        args.pipelines.remove('default')
    nb_pipelines = 0
    pipelines_list = []
    for pipeline in args.pipelines:
        if os.path.isfile(pipeline):
            yaml_fn = pipeline
            pipeline = os.path.splitext(os.path.basename(pipeline))[0]
        else:
            yaml_fn = __location__ + '/assembler_commands/' + pipeline + '.yaml'
        if os.path.isfile(yaml_fn):
            with open(yaml_fn, 'r') as plf:
                pl_dict = yaml.load(plf)
        else:
            warnings.warn('Could not find yaml file for {pl}, skipping'.format(pl=yaml_fn))
            continue
        wd_cur_path = wd_results + pipeline
        if os.path.isdir(wd_cur_path):  # Ensure clean output folder, as some assemblers error out otherwise
            shutil.rmtree(wd_cur_path)
        wd_cur = hp.parse_output_path(wd_cur_path)
        sf_dict[pipeline] = {
            'input': {'fastq': wd + 'all_reads.fastq'},
            'threads': [args.threads_per_job],
            'output': [wd_assemblies + pipeline + '.fasta'],
            'log': [wd_logs + pipeline + '.log'],
            'benchmark': [wd_cpu + pipeline + '.bm']
        }
        conda = pl_dict.get('conda')
        if conda:
            with open(wd_condas + pipeline + '.yaml', 'w') as cf:
                yaml.dump(conda, cf, default_flow_style=False)
            sf_dict[pipeline]['conda'] = [wd_condas + pipeline + '.yaml']
        sf_dict[pipeline]['group'] = ['pipelines']
        assembly_cmds = pl_dict['commands'].format(**param_dict)
        cmds = list()
        cmds.extend(hp.parse_version_commands(pl_dict['versions'], pl_dict['description']))
        cmds.append('cd {}'.format(wd_cur))
        cmds.extend(assembly_cmds.split(sep='\n'))
        cmds_dict[pipeline] = cmds
        with open(wd_commands + pipeline + '.cmd', 'w') as f:
            f.write(assembly_cmds)
        nb_pipelines += 1
        pipelines_list.append(pipeline)

    # save envs in same location as results (otherwise defaults to current loc)
    sf_string = 'workdir: \'{}\'\n\n'.format(wd_envs)
    sf_string += hp.dict_to_snakefile(cmds_dict, sf_dict)
    with open(sf_fn, 'a') as sf:
        sf.write(sf_string)
    sm_dict = {'targets': pipelines_list,
               'use_conda': True,
               'cores': args.threads_per_job}

    # ---- Cluster-related ----
    if args.slurm_config is not None:
        with open(args.slurm_config, 'r') as slurmf:
            slurm_config_dict = json.load(slurmf)
        partition_name = slurm_config_dict['__default__']['partition']
        sinfo_list = check_output(['sinfo']).decode('utf-8').split('\n')
        sinfo_header = {n: i for i, n in enumerate(sinfo_list[0].split())}
        nb_nodes = None
        for sil in sinfo_list[1:]:
            if partition_name in sil:
                nb_nodes = int(sil.split()[sinfo_header['NODES']])
                break
        if nb_nodes is None:
            raise ValueError('supplied SLURM partition {} not found'.format(partition_name))
        nb_nodes = min(nb_nodes, nb_pipelines)
        sm_dict['nodes'] = nb_nodes
        tasks_per_node = max(nb_pipelines // nb_nodes, 1)
        sbatch_line = 'sbatch --nodes={nbn} --ntasks-per-node={tpn} --cpus-per-task={cpt}'.format(
            nbn=nb_nodes, tpn=tasks_per_node, cpt=args.threads_per_job)
        for n in list(slurm_config_dict['__default__']):
            sbatch_line += ' --{opt}={{cluster.{opt}}}'.format(opt=n)
        sm_dict['cluster'] = sbatch_line
        sm_dict['cluster_config'] = args.slurm_config
        sm_dict['local_cores'] = args.threads_per_job
    snakemake.snakemake(sf_fn, **sm_dict)
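
# Each pipeline is defined by a YAML file in assembler_commands/ (or a user-supplied path).
# The loop above relies on the keys 'description', 'versions', 'commands' and, optionally,
# 'conda', and fills {NB_THREADS}/{WD}/{REFGENOME_SIZE}/{SEQUENCED_SIZE}/{COVERAGE}/{FAST5_DIR}
# placeholders in 'commands' from param_dict. A minimal sketch of such a definition: the
# commands, version calls, conda spec and the pipeline name 'my_custom_pipeline' are
# illustrative, and the exact structure hp.parse_version_commands expects for 'versions'
# is assumed here.
import yaml

example_pipeline_yaml = '''
description: minimap2/miniasm assembly (illustrative example)
versions:
  minimap2: minimap2 --version
  miniasm: miniasm -V
commands: |
  minimap2 -x ava-ont -t {NB_THREADS} {WD}all_reads.fastq {WD}all_reads.fastq | gzip -1 > overlaps.paf.gz
  miniasm -f {WD}all_reads.fastq overlaps.paf.gz > assembly.gfa
  awk '/^S/{{print ">"$2"\\n"$3}}' assembly.gfa > assembly.fasta
  cp assembly.fasta {WD}assembler_results/assemblies/my_custom_pipeline.fasta
conda:
  channels: [bioconda, conda-forge]
  dependencies: [minimap2, miniasm]
'''
example_pipeline = yaml.safe_load(example_pipeline_yaml)
# example_pipeline['commands'].format(**param_dict) then yields the shell block written to the
# generated Snakefile; note the doubled braces around the awk action, which str.format() turns
# back into literal braces, and that the final copy target must match the Snakemake output
# declared above (wd_assemblies + pipeline + '.fasta').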