def main(sysargs=sys.argv[1:]):
    """pangolin command-line entry point.

    Parses CLI arguments, QCs the query fasta (length and N-content
    filters), locates the pangoLEARN model/data files and runs the
    pangolearn snakemake pipeline.

    Returns 0 when the pipeline succeeds, 1 when it fails; exits early
    via sys.exit() on argument or input errors.
    """
    parser = argparse.ArgumentParser(
        prog=_program,
        description='pangolin: Phylogenetic Assignment of Named Global Outbreak LINeages',
        usage='''pangolin <query> [options]''')

    parser.add_argument('query', nargs="*",
                        help='Query fasta file of sequences to analyse.')
    parser.add_argument('-o', '--outdir', action="store",
                        help="Output directory. Default: current working directory")
    parser.add_argument('--outfile', action="store",
                        help="Optional output file name. Default: lineage_report.csv")
    parser.add_argument('--alignment', action="store_true",
                        help="Optional alignment output.")
    parser.add_argument('-d', '--datadir', action='store', dest="datadir",
                        help="Data directory minimally containing a fasta alignment and guide tree")
    parser.add_argument('--tempdir', action="store",
                        help="Specify where you want the temp stuff to go. Default: $TMPDIR")
    parser.add_argument("--no-temp", action="store_true",
                        help="Output all intermediate files, for dev purposes.")
    parser.add_argument('--decompress-model', action="store_true", dest="decompress",
                        help="Permanently decompress the model file to save time running pangolin.")
    parser.add_argument('--max-ambig', action="store", default=0.5, type=float,
                        dest="maxambig",
                        help="Maximum proportion of Ns allowed for pangolin to attempt assignment. Default: 0.5")
    parser.add_argument('--min-length', action="store", default=10000, type=int,
                        dest="minlen",
                        help="Minimum query length allowed for pangolin to attempt assignment. Default: 10000")
    parser.add_argument('--panGUIlin', action='store_true', dest="panGUIlin",
                        help="Run web-app version of pangolin")
    parser.add_argument("--verbose", action="store_true",
                        help="Print lots of stuff to screen")
    parser.add_argument("-t", "--threads", action="store",
                        help="Number of threads")
    parser.add_argument("-v", "--version", action='version',
                        version=f"pangolin {__version__}")
    parser.add_argument("-pv", "--pangoLEARN-version", action='version',
                        version=f"pangoLEARN {pangoLEARN.__version__}",
                        help="show pangoLEARN's version number and exit")
    parser.add_argument("--update", action='store_true', default=False,
                        help="Automatically updates to latest release of pangolin and pangoLEARN, then exits")

    if len(sysargs) < 1:
        parser.print_help()
        sys.exit(-1)
    else:
        args = parser.parse_args(sysargs)
    # BUG FIX: a second argument-less parser.parse_args() call used to follow
    # here; it re-parsed sys.argv and clobbered the parse of `sysargs`,
    # silently breaking programmatic invocation of main(custom_args).

    if args.update:
        update(__version__, pangoLEARN.__version__)

    # locate the pipeline definition shipped alongside this module
    snakefile = os.path.join(thisdir, 'scripts', 'pangolearn.smk')
    if not os.path.exists(snakefile):
        sys.stderr.write('Error: cannot find Snakefile at {}\n'.format(snakefile))
        sys.exit(-1)
    else:
        print(pfunk.green("Found the snakefile"))

    # to enable not having to pass a query if running update
    # by allowing query to accept 0 to many arguments
    if len(args.query) > 1:
        print(pfunk.cyan(
            f"Error: Too many query (input) fasta files supplied: {args.query}\nPlease supply one only"))
        parser.print_help()
        sys.exit(-1)
    else:
        # find the query fasta
        query = os.path.join(cwd, args.query[0])
        if not os.path.exists(query):
            sys.stderr.write(
                'Error: cannot find query (input) fasta file at {}\nPlease enter your fasta sequence file and refer to pangolin usage at:\nhttps://github.com/hCoV-2019/pangolin#usage\n for detailed instructions\n'
                .format(query))
            sys.exit(-1)
        else:
            print(pfunk.green(f"The query file is:") + f"{query}")

    # default output dir
    outdir = ''
    if args.outdir:
        outdir = os.path.join(cwd, args.outdir)
        if not os.path.exists(outdir):
            try:
                os.mkdir(outdir)
            except OSError:
                # BUG FIX: was a bare `except:`, which also swallowed
                # SystemExit/KeyboardInterrupt.
                sys.stderr.write(pfunk.cyan(f'Error: cannot create directory:') + f"{outdir}")
                sys.exit(-1)
    else:
        outdir = cwd

    # report file path
    if args.outfile:
        outfile = os.path.join(outdir, args.outfile)
    else:
        outfile = os.path.join(outdir, "lineage_report.csv")

    # working directory for intermediates; kept alive by holding the
    # TemporaryDirectory object in a local
    tempdir = ''
    if args.tempdir:
        to_be_dir = os.path.join(cwd, args.tempdir)
        if not os.path.exists(to_be_dir):
            os.mkdir(to_be_dir)
        temporary_directory = tempfile.TemporaryDirectory(suffix=None, prefix=None, dir=to_be_dir)
        tempdir = temporary_directory.name
    else:
        temporary_directory = tempfile.TemporaryDirectory(suffix=None, prefix=None, dir=None)
        tempdir = temporary_directory.name

    if args.no_temp:
        # BUG FIX: the message was missing its f-prefix, so the literal text
        # "{outdir}" was printed instead of the directory path.
        print(pfunk.green("--no-temp:") + f"all intermediate files will be written to {outdir}")
        tempdir = outdir

    if args.alignment:
        align_dir = outdir
        alignment_out = True
    else:
        align_dir = tempdir
        alignment_out = False

    if args.threads:
        print(pfunk.cyan(
            "\n--threads flag used, but threading not currently supported. Continuing with one thread."))

    # QC steps:
    # 1) check no empty seqs
    # 2) check N content
    # 3) write a file that contains just the seqs to run
    do_not_run = []
    run = []
    for record in SeqIO.parse(query, "fasta"):
        # replace spaces (and commas, which would break the csv report)
        # in sequence headers with underscores
        record.id = record.description.replace(' ', '_')
        if "," in record.id:
            record.id = record.id.replace(",", "_")
        if len(record) < args.minlen:
            record.description = record.description + f" fail=seq_len:{len(record)}"
            do_not_run.append(record)
            print(record.id, "\tsequence too short")
        else:
            num_N = str(record.seq).upper().count("N")
            prop_N = round((num_N) / len(record.seq), 2)
            if prop_N > args.maxambig:
                record.description = record.description + f" fail=N_content:{prop_N}"
                do_not_run.append(record)
                print(f"{record.id}\thas an N content of {prop_N}")
            else:
                run.append(record)

    if run == []:
        # nothing passed QC: write a failure-only report and exit cleanly
        with open(outfile, "w") as fw:
            fw.write("taxon,lineage,probability,pangoLEARN_version,status,note\n")
            for record in do_not_run:
                desc = record.description.split(" ")
                reason = ""
                for item in desc:
                    if item.startswith("fail="):
                        reason = item.split("=")[1]
                fw.write(f"{record.id},None,0,{pangoLEARN.__version__},fail,{reason}\n")
        print(pfunk.cyan('Note: no query sequences have passed the qc\n'))
        sys.exit(0)

    # write pass/fail fastas into the working directory for the pipeline
    post_qc_query = os.path.join(tempdir, 'query.post_qc.fasta')
    with open(post_qc_query, "w") as fw:
        SeqIO.write(run, fw, "fasta")
    qc_fail = os.path.join(tempdir, 'query.failed_qc.fasta')
    with open(qc_fail, "w") as fw:
        SeqIO.write(do_not_run, fw, "fasta")

    config = {
        "query_fasta": post_qc_query,
        "outdir": outdir,
        "outfile": outfile,
        "tempdir": tempdir,
        "aligndir": align_dir,
        "alignment_out": alignment_out,
        "trim_start": 265,    # where to pad to using datafunk
        "trim_end": 29674,    # where to pad after using datafunk
        "qc_fail": qc_fail,
        "pangoLEARN_version": pangoLEARN.__version__
    }

    # find the data: either a user-supplied --datadir or the installed
    # pangoLEARN package data directory
    data_dir = ""
    if args.datadir:
        data_dir = os.path.join(cwd, args.datadir)
        version = "Unknown"
        # recover the pangoLEARN version string from the data dir's __init__.py
        for r, d, f in os.walk(data_dir):
            for fn in f:
                if fn == "__init__.py":
                    print("Found __init__.py")
                    with open(os.path.join(r, fn), "r") as fr:
                        for l in fr:
                            if l.startswith("__version__"):
                                l = l.rstrip("\n")
                                version = l.split('=')[1]
                                version = version.replace('"', "").replace(" ", "")
                                print("pangoLEARN version", version)
        config["pangoLEARN_version"] = version

    if not args.datadir:
        pangoLEARN_dir = pangoLEARN.__path__[0]
        data_dir = os.path.join(pangoLEARN_dir, "data")
    print(f"Looking in {data_dir} for data files...")

    trained_model = ""
    header_file = ""
    lineages_csv = ""
    for r, d, f in os.walk(data_dir):
        for fn in f:
            if fn == "decisionTreeHeaders_v1.joblib":
                header_file = os.path.join(r, fn)
            elif fn == "decisionTree_v1.joblib":
                trained_model = os.path.join(r, fn)
            elif fn == "lineages.metadata.csv":
                lineages_csv = os.path.join(r, fn)

    if trained_model == "" or header_file == "" or lineages_csv == "":
        print(pfunk.cyan(
            """Check your environment, didn't find appropriate files from the pangoLEARN repo.\n Trained model must be installed, please see
https://cov-lineages.org/pangolin.html for installation instructions."""))
        exit(1)
    else:
        if args.decompress:
            prev_size = os.path.getsize(trained_model)
            print("Decompressing model and header files")
            model = joblib.load(trained_model)
            joblib.dump(model, trained_model, compress=0)
            headers = joblib.load(header_file)
            joblib.dump(headers, header_file, compress=0)
            # a decompressed dump should be at least as large as the original
            if os.path.getsize(trained_model) >= prev_size:
                print(pfunk.green('Success! Decompressed the model file. Exiting\n'))
                sys.exit(0)
            else:
                # NOTE(review): failure exits with status 0 like success —
                # arguably should be non-zero; preserved to avoid changing
                # the exit-code contract.
                print(pfunk.cyan('Error: failed to decompress model. Exiting\n'))
                sys.exit(0)

        print(pfunk.green("\nData files found"))
        print(f"Trained model:\t{trained_model}")
        print(f"Header file:\t{header_file}")
        print(f"Lineages csv:\t{lineages_csv}")

        config["trained_model"] = trained_model
        config["header_file"] = header_file

    reference_fasta = pkg_resources.resource_filename('pangolin', 'data/reference.fasta')
    config["reference_fasta"] = reference_fasta

    # per-variant-of-concern SNP configuration files
    variants_file = pkg_resources.resource_filename('pangolin', 'data/config_b.1.1.7.csv')
    config["b117_variants"] = variants_file
    variants_file = pkg_resources.resource_filename('pangolin', 'data/config_b.1.351.csv')
    config["b1351_variants"] = variants_file
    variants_file = pkg_resources.resource_filename('pangolin', 'data/config_p.1.csv')
    config["p1_variants"] = variants_file
    variants_file = pkg_resources.resource_filename('pangolin', 'data/config_p.2.csv')
    config["p2_variants"] = variants_file

    if args.panGUIlin:
        config["lineages_csv"] = lineages_csv

    # quiet mode routes snakemake logging through the log-handler script
    if args.verbose:
        quiet_mode = False
        config["log_string"] = ""
    else:
        quiet_mode = True
        lh_path = os.path.realpath(lh.__file__)
        config["log_string"] = f"--quiet --log-handler-script {lh_path} "

    if args.verbose:
        print(pfunk.green("\n**** CONFIG ****"))
        for k in sorted(config):
            print(pfunk.green(k), config[k])
        status = snakemake.snakemake(snakefile, printshellcmds=True,
                                     forceall=True, force_incomplete=True,
                                     workdir=tempdir, config=config,
                                     cores=1, lock=False)
    else:
        logger = custom_logger.Logger()
        status = snakemake.snakemake(snakefile, printshellcmds=False,
                                     forceall=True, force_incomplete=True,
                                     workdir=tempdir, config=config,
                                     cores=1, lock=False, quiet=True,
                                     log_handler=logger.log_handler)

    if status:  # translate "success" into shell exit code of 0
        return 0
    return 1
def main(sysargs=sys.argv[1:]):
    """apollo command-line entry point.

    Builds a config dict from package defaults, an optional YAML config
    file and CLI arguments, then runs the apollo snakemake pipeline.

    Returns 0 when the pipeline succeeds, 1 when it fails; exits early
    via sys.exit() on argument errors.
    """
    parser = argparse.ArgumentParser(
        prog=_program,
        description=qcfunk.preamble(__version__),
        usage='''apollo -i <path/to/reads> [options]
apollo -c <config.yaml>''')

    io_group = parser.add_argument_group('input output options')
    io_group.add_argument('-c', "--configfile", dest="configfile",
                          help="Config file with apollo run settings")
    io_group.add_argument('-i', '--read-path', dest="read_path",
                          help="Path to the directory containing fastq files")
    io_group.add_argument('-o', '--output-prefix', action="store",
                          help="Output prefix. Default: apollo_<species>_<date>")
    io_group.add_argument('--outdir', action="store",
                          help="Output directory. Default: current working directory")
    io_group.add_argument('--tempdir', action="store",
                          help="Specify where you want the temp stuff to go. Default: $TMPDIR")

    barcode_group = parser.add_argument_group('barcode options')
    barcode_group.add_argument('-b', '--barcodes-csv', dest="barcodes_csv",
                               help="CSV file describing which barcodes were used on which sample")
    barcode_group.add_argument('-k', '--barcode-kit', dest="barcode_kit",
                               help="Indicates which barcode kit was used. Default: native. Options: native, rapid, pcr, all")

    demux_group = parser.add_argument_group('demultiplexing options')
    demux_group.add_argument('--demultiplex', action="store_true", dest="demultiplex",
                             help="Indicates that your reads have not been demultiplexed and will run guppy demultiplex on your provided read directory")
    demux_group.add_argument('--path-to-guppy', action="store", dest="path_to_guppy",
                             help="Path to guppy_barcoder executable")

    run_group = parser.add_argument_group('run options')
    run_group.add_argument('-s', "--species", action="store", dest="species",
                           help="Indicate which species is being sequenced. Options: mus, apodemus")
    run_group.add_argument("-r", "--report", action="store_true",
                           help="Generate markdown report of estimated age")

    misc_group = parser.add_argument_group('misc options')
    misc_group.add_argument('-t', '--threads', action='store', type=int,
                            help="Number of threads")
    misc_group.add_argument("--no-temp", action="store_true",
                            help="Output all intermediate files, for dev purposes.")
    misc_group.add_argument("--verbose", action="store_true",
                            help="Print lots of stuff to screen")
    misc_group.add_argument("-v", "--version", action='version',
                            version=f"apollo {__version__}")

    args = parser.parse_args(sysargs)

    # Initialise config from package defaults, then overlay any YAML config.
    config = qcfunk.get_defaults()
    configfile = qcfunk.look_for_config(args.configfile, cwd, config)
    # if a yaml file is detected, add everything in it to the config dict
    if configfile:
        qcfunk.parse_yaml_file(configfile, config)
    else:
        # no config file and no CLI args at all: show help and exit
        if len(sysargs) < 1:
            parser.print_help()
            sys.exit(0)

    # default output dir
    qcfunk.get_outdir(args.outdir, args.output_prefix, cwd, config)
    # specifying temp directory, outdir if no_temp (tempdir becomes working dir)
    tempdir = qcfunk.get_temp_dir(args.tempdir, args.no_temp, cwd, config)

    # get data for a particular species, and get species
    qcfunk.get_package_data(thisdir, args.species, config)
    config["cpg_header"] = qcfunk.make_cpg_header(config["cpg_sites"])
    # add min and max read lengths to the config
    qcfunk.get_read_length_filter(config)

    # looks for basecalled directory
    qcfunk.look_for_basecalled_reads(args.read_path, cwd, config)
    # looks for the csv file saying which barcodes in sample
    qcfunk.look_for_barcodes_csv(args.barcodes_csv, cwd, config)
    # configure whether guppy barcoder needs to be run
    qcfunk.look_for_guppy_barcoder(args.demultiplex, args.path_to_guppy, cwd, config)

    # don't run in quiet mode if verbose specified
    if args.verbose:
        quiet_mode = False
        config["log_string"] = ""
    else:
        quiet_mode = True
        lh_path = os.path.realpath(lh.__file__)
        config["log_string"] = f"--quiet --log-handler-script {lh_path} "

    qcfunk.add_arg_to_config("threads", args.threads, config)
    try:
        config["threads"] = int(config["threads"])
    except (TypeError, ValueError):
        # BUG FIX: was a bare `except:`; also fixed "specifiy" typo in the
        # user-facing error message.
        sys.stderr.write(qcfunk.cyan('Error: Please specify an integer for variable `threads`.\n'))
        sys.exit(-1)
    threads = config["threads"]
    print(f"Number of threads: {threads}\n")

    # find the master Snakefile
    snakefile = qcfunk.get_snakefile(thisdir)

    if args.verbose:
        print("\n**** CONFIG ****")
        for k in sorted(config):
            print(qcfunk.green(k), config[k])
        status = snakemake.snakemake(snakefile, printshellcmds=True,
                                     forceall=True, force_incomplete=True,
                                     workdir=tempdir, config=config,
                                     cores=threads, lock=False)
    else:
        logger = custom_logger.Logger()
        status = snakemake.snakemake(snakefile, printshellcmds=False,
                                     forceall=True, force_incomplete=True,
                                     workdir=tempdir, config=config,
                                     cores=threads, lock=False, quiet=True,
                                     log_handler=logger.log_handler)

    if status:  # translate "success" into shell exit code of 0
        return 0
    return 1
def log_handler(msg): logger = custom_logger.Logger() return logger.log_handler