Beispiel #1
0
def main(sysargs=sys.argv[1:]):
    """Command-line entry point for pangolin.

    Parses ``sysargs``, locates the Snakefile, the query fasta and the
    pangoLEARN data files, applies a length / N-content QC to the query
    sequences and runs the snakemake workflow on the sequences that pass.

    Args:
        sysargs: Command-line arguments without the program name.

    Returns:
        0 if snakemake reports success, 1 otherwise.

    Raises:
        SystemExit: on usage errors and missing files (non-zero), when no
            sequence passes QC (0, after writing the report), and once
            ``--decompress-model`` has been handled.
    """
    parser = argparse.ArgumentParser(
        prog=_program,
        description='pangolin: Phylogenetic Assignment of Named Global Outbreak LINeages',
        usage='''pangolin <query> [options]''')

    parser.add_argument('query', nargs="*",
                        help='Query fasta file of sequences to analyse.')
    parser.add_argument('-o', '--outdir', action="store",
                        help="Output directory. Default: current working directory")
    parser.add_argument('--outfile', action="store",
                        help="Optional output file name. Default: lineage_report.csv")
    parser.add_argument('--alignment', action="store_true",
                        help="Optional alignment output.")
    parser.add_argument('-d', '--datadir', action='store', dest="datadir",
                        help="Data directory minimally containing a fasta alignment and guide tree")
    parser.add_argument('--tempdir', action="store",
                        help="Specify where you want the temp stuff to go. Default: $TMPDIR")
    parser.add_argument("--no-temp", action="store_true",
                        help="Output all intermediate files, for dev purposes.")
    parser.add_argument('--decompress-model', action="store_true", dest="decompress",
                        help="Permanently decompress the model file to save time running pangolin.")
    parser.add_argument('--max-ambig', action="store", default=0.5, type=float,
                        help="Maximum proportion of Ns allowed for pangolin to attempt assignment. Default: 0.5",
                        dest="maxambig")
    parser.add_argument('--min-length', action="store", default=10000, type=int,
                        help="Minimum query length allowed for pangolin to attempt assignment. Default: 10000",
                        dest="minlen")
    parser.add_argument('--panGUIlin', action='store_true',
                        help="Run web-app version of pangolin", dest="panGUIlin")
    parser.add_argument("--verbose", action="store_true",
                        help="Print lots of stuff to screen")
    parser.add_argument("-t", "--threads", action="store",
                        help="Number of threads")
    parser.add_argument("-v", "--version", action='version',
                        version=f"pangolin {__version__}")
    parser.add_argument("-pv", "--pangoLEARN-version", action='version',
                        version=f"pangoLEARN {pangoLEARN.__version__}",
                        help="show pangoLEARN's version number and exit")
    parser.add_argument("--update", action='store_true', default=False,
                        help="Automatically updates to latest release of pangolin and pangoLEARN, then exits")

    if not sysargs:
        parser.print_help()
        sys.exit(-1)
    # Parse the arguments that were actually handed in; calling parse_args()
    # without them would silently fall back to sys.argv.
    args = parser.parse_args(sysargs)

    if args.update:
        # NOTE(review): the help text says this exits afterwards - confirm in update().
        update(__version__, pangoLEARN.__version__)

    snakefile = os.path.join(thisdir, 'scripts', 'pangolearn.smk')
    if not os.path.exists(snakefile):
        sys.stderr.write(
            'Error: cannot find Snakefile at {}\n'.format(snakefile))
        sys.exit(-1)
    print(pfunk.green("Found the snakefile"))

    # `query` accepts 0 to many values so that --update can be run without a
    # query; everything past this point needs exactly one.
    if not args.query:
        sys.stderr.write(
            pfunk.cyan("Error: no query (input) fasta file supplied\n"))
        parser.print_help()
        sys.exit(-1)
    if len(args.query) > 1:
        print(
            pfunk.cyan(
                f"Error: Too many query (input) fasta files supplied: {args.query}\nPlease supply one only"
            ))
        parser.print_help()
        sys.exit(-1)

    # find the query fasta
    query = os.path.join(cwd, args.query[0])
    if not os.path.exists(query):
        sys.stderr.write(
            'Error: cannot find query (input) fasta file at {}\nPlease enter your fasta sequence file and refer to pangolin usage at:\nhttps://github.com/hCoV-2019/pangolin#usage\n for detailed instructions\n'
            .format(query))
        sys.exit(-1)
    print(pfunk.green("The query file is:") + f"{query}")

    # output directory (default: current working directory)
    if args.outdir:
        outdir = os.path.join(cwd, args.outdir)
        if not os.path.exists(outdir):
            try:
                os.mkdir(outdir)
            except OSError:
                sys.stderr.write(
                    pfunk.cyan('Error: cannot create directory:') +
                    f"{outdir}")
                sys.exit(-1)
    else:
        outdir = cwd

    outfile = os.path.join(outdir, args.outfile or "lineage_report.csv")

    # temporary working directory, optionally created under --tempdir
    temp_parent = None
    if args.tempdir:
        temp_parent = os.path.join(cwd, args.tempdir)
        if not os.path.exists(temp_parent):
            os.mkdir(temp_parent)
    # Keep a reference to the TemporaryDirectory object for the rest of the
    # function: the directory is removed once the object is finalised.
    temporary_directory = tempfile.TemporaryDirectory(suffix=None,
                                                      prefix=None,
                                                      dir=temp_parent)
    tempdir = temporary_directory.name

    if args.no_temp:
        print(
            pfunk.green("--no-temp:") +
            f"all intermediate files will be written to {outdir}")
        tempdir = outdir

    if args.alignment:
        align_dir = outdir
        alignment_out = True
    else:
        align_dir = tempdir
        alignment_out = False

    if args.threads:
        print(
            pfunk.cyan(
                "\n--threads flag used, but threading not currently supported. Continuing with one thread."
            ))

    # QC steps:
    # 1) check no empty seqs
    # 2) check N content
    # 3) write a file that contains just the seqs to run
    do_not_run = []
    run = []
    for record in SeqIO.parse(query, "fasta"):
        # replace spaces and commas in sequence headers with underscores
        record.id = record.description.replace(' ', '_').replace(",", "_")

        seq_len = len(record)
        # An empty sequence always fails, whatever --min-length is; this also
        # protects the N-proportion division below.
        if seq_len < args.minlen or seq_len == 0:
            record.description = record.description + f" fail=seq_len:{seq_len}"
            do_not_run.append(record)
            print(record.id, "\tsequence too short")
            continue

        num_N = str(record.seq).upper().count("N")
        prop_N = round(num_N / seq_len, 2)
        if prop_N > args.maxambig:
            record.description = record.description + f" fail=N_content:{prop_N}"
            do_not_run.append(record)
            print(f"{record.id}\thas an N content of {prop_N}")
        else:
            run.append(record)

    if not run:
        # Nothing to assign: write a report that only lists the failures.
        with open(outfile, "w") as fw:
            fw.write(
                "taxon,lineage,probability,pangoLEARN_version,status,note\n")
            for record in do_not_run:
                reason = ""
                for item in record.description.split(" "):
                    if item.startswith("fail="):
                        reason = item.split("=")[1]
                fw.write(
                    f"{record.id},None,0,{pangoLEARN.__version__},fail,{reason}\n"
                )
        print(pfunk.cyan('Note: no query sequences have passed the qc\n'))
        sys.exit(0)

    post_qc_query = os.path.join(tempdir, 'query.post_qc.fasta')
    with open(post_qc_query, "w") as fw:
        SeqIO.write(run, fw, "fasta")
    qc_fail = os.path.join(tempdir, 'query.failed_qc.fasta')
    with open(qc_fail, "w") as fw:
        SeqIO.write(do_not_run, fw, "fasta")

    config = {
        "query_fasta": post_qc_query,
        "outdir": outdir,
        "outfile": outfile,
        "tempdir": tempdir,
        "aligndir": align_dir,
        "alignment_out": alignment_out,
        "trim_start": 265,  # where to pad to using datafunk
        "trim_end": 29674,  # where to pad after using datafunk
        "qc_fail": qc_fail,
        "pangoLEARN_version": pangoLEARN.__version__
    }

    # find the data
    if args.datadir:
        data_dir = os.path.join(cwd, args.datadir)
        # Read the version from any __init__.py found under the custom data
        # directory; it stays "Unknown" if none declares __version__.
        version = "Unknown"
        for root, _dirs, files in os.walk(data_dir):
            for fn in files:
                if fn != "__init__.py":
                    continue
                print("Found __init__.py")
                with open(os.path.join(root, fn), "r") as fr:
                    for line in fr:
                        if line.startswith("__version__"):
                            line = line.rstrip("\n")
                            version = line.split('=')[1]
                            version = version.replace('"', "").replace(" ", "")
                            print("pangoLEARN version", version)
        config["pangoLEARN_version"] = version
    else:
        pangoLEARN_dir = pangoLEARN.__path__[0]
        data_dir = os.path.join(pangoLEARN_dir, "data")

    print(f"Looking in {data_dir} for data files...")
    trained_model = ""
    header_file = ""
    lineages_csv = ""

    for root, _dirs, files in os.walk(data_dir):
        for fn in files:
            if fn == "decisionTreeHeaders_v1.joblib":
                header_file = os.path.join(root, fn)
            elif fn == "decisionTree_v1.joblib":
                trained_model = os.path.join(root, fn)
            elif fn == "lineages.metadata.csv":
                lineages_csv = os.path.join(root, fn)

    if trained_model == "" or header_file == "" or lineages_csv == "":
        print(
            pfunk.cyan(
                """Check your environment, didn't find appropriate files from the pangoLEARN repo.\n Trained model must be installed, please see https://cov-lineages.org/pangolin.html for installation instructions."""
            ))
        sys.exit(1)

    if args.decompress:
        prev_size = os.path.getsize(trained_model)

        print("Decompressing model and header files")
        model = joblib.load(trained_model)
        joblib.dump(model, trained_model, compress=0)
        headers = joblib.load(header_file)
        joblib.dump(headers, header_file, compress=0)

        # The rewritten model file is treated as decompressed when it is at
        # least as large as it was before.
        if os.path.getsize(trained_model) >= prev_size:
            print(
                pfunk.green(
                    'Success! Decompressed the model file. Exiting\n'))
            sys.exit(0)
        print(
            pfunk.cyan(
                'Error: failed to decompress model. Exiting\n'))
        # A reported failure must not look like success to the calling shell.
        sys.exit(1)

    print(pfunk.green("\nData files found"))
    print(f"Trained model:\t{trained_model}")
    print(f"Header file:\t{header_file}")
    print(f"Lineages csv:\t{lineages_csv}")
    config["trained_model"] = trained_model
    config["header_file"] = header_file

    config["reference_fasta"] = pkg_resources.resource_filename(
        'pangolin', 'data/reference.fasta')
    config["b117_variants"] = pkg_resources.resource_filename(
        'pangolin', 'data/config_b.1.1.7.csv')
    config["b1351_variants"] = pkg_resources.resource_filename(
        'pangolin', 'data/config_b.1.351.csv')
    config["p1_variants"] = pkg_resources.resource_filename(
        'pangolin', 'data/config_p.1.csv')
    config["p2_variants"] = pkg_resources.resource_filename(
        'pangolin', 'data/config_p.2.csv')

    if args.panGUIlin:
        config["lineages_csv"] = lineages_csv

    if args.verbose:
        config["log_string"] = ""
        print(pfunk.green("\n**** CONFIG ****"))
        for k in sorted(config):
            print(pfunk.green(k), config[k])

        status = snakemake.snakemake(snakefile,
                                     printshellcmds=True,
                                     forceall=True,
                                     force_incomplete=True,
                                     workdir=tempdir,
                                     config=config,
                                     cores=1,
                                     lock=False)
    else:
        lh_path = os.path.realpath(lh.__file__)
        config["log_string"] = f"--quiet --log-handler-script {lh_path} "
        logger = custom_logger.Logger()
        status = snakemake.snakemake(snakefile,
                                     printshellcmds=False,
                                     forceall=True,
                                     force_incomplete=True,
                                     workdir=tempdir,
                                     config=config,
                                     cores=1,
                                     lock=False,
                                     quiet=True,
                                     log_handler=logger.log_handler)

    # translate "success" into shell exit code of 0
    return 0 if status else 1
Beispiel #2
0
def main(sysargs = sys.argv[1:]):
    """Command-line entry point for apollo.

    Parses ``sysargs``, builds the run configuration through the ``qcfunk``
    helpers (defaults, optional yaml config file, directories, package data,
    reads, barcodes, guppy) and runs the snakemake workflow.

    Args:
        sysargs: Command-line arguments without the program name.

    Returns:
        0 if snakemake reports success, 1 otherwise.

    Raises:
        SystemExit: when no arguments and no config file are available (0,
            after printing help) or when ``threads`` is not an integer (-1).
    """
    parser = argparse.ArgumentParser(prog = _program,
    description=qcfunk.preamble(__version__),
    usage='''apollo -i <path/to/reads> [options]
        apollo -c <config.yaml>''')

    io_group = parser.add_argument_group('input output options')
    io_group.add_argument('-c',"--configfile",help="Config file with apollo run settings",dest="configfile")
    io_group.add_argument('-i','--read-path',help="Path to the directory containing fastq files",dest="read_path")
    io_group.add_argument('-o','--output-prefix', action="store",help="Output prefix. Default: apollo_<species>_<date>")
    io_group.add_argument('--outdir', action="store",help="Output directory. Default: current working directory")
    io_group.add_argument('--tempdir',action="store",help="Specify where you want the temp stuff to go. Default: $TMPDIR")

    barcode_group = parser.add_argument_group('barcode options')
    barcode_group.add_argument('-b','--barcodes-csv',help="CSV file describing which barcodes were used on which sample",dest="barcodes_csv")
    barcode_group.add_argument('-k','--barcode-kit',help="Indicates which barcode kit was used. Default: native. Options: native, rapid, pcr, all",dest="barcode_kit")

    demux_group = parser.add_argument_group('demultiplexing options')
    demux_group.add_argument('--demultiplex',action="store_true",help="Indicates that your reads have not been demultiplexed and will run guppy demultiplex on your provided read directory",dest="demultiplex")
    demux_group.add_argument('--path-to-guppy',action="store",help="Path to guppy_barcoder executable",dest="path_to_guppy")

    run_group = parser.add_argument_group('run options')
    run_group.add_argument('-s',"--species", action="store",help="Indicate which species is being sequenced. Options: mus, apodemus", dest="species")
    run_group.add_argument("-r","--report",action="store_true",help="Generate markdown report of estimated age")

    misc_group = parser.add_argument_group('misc options')
    misc_group.add_argument('-t', '--threads', action='store',type=int,help="Number of threads")
    misc_group.add_argument("--no-temp",action="store_true",help="Output all intermediate files, for dev purposes.")
    misc_group.add_argument("--verbose",action="store_true",help="Print lots of stuff to screen")
    misc_group.add_argument("-v","--version", action='version', version=f"apollo {__version__}")

    args = parser.parse_args(sysargs)

    # Initialising dicts
    config = qcfunk.get_defaults()

    configfile = qcfunk.look_for_config(args.configfile,cwd,config)

    # if a yaml file is detected, add everything in it to the config dict
    if configfile:
        qcfunk.parse_yaml_file(configfile, config)
    elif not sysargs:
        # Exit with help menu if neither args nor a config file were supplied
        parser.print_help()
        sys.exit(0)

    # Get outdir, tempdir and the data

    # default output dir
    qcfunk.get_outdir(args.outdir,args.output_prefix,cwd,config)

    # specifying temp directory, outdir if no_temp (tempdir becomes working dir)
    tempdir = qcfunk.get_temp_dir(args.tempdir, args.no_temp,cwd,config)

    # get data for a particular species, and get species
    qcfunk.get_package_data(thisdir, args.species, config)

    config["cpg_header"] = qcfunk.make_cpg_header(config["cpg_sites"])

    # add min and max read lengths to the config
    qcfunk.get_read_length_filter(config)

    # looks for basecalled directory
    qcfunk.look_for_basecalled_reads(args.read_path,cwd,config)

    # looks for the csv file saying which barcodes in sample
    qcfunk.look_for_barcodes_csv(args.barcodes_csv,cwd,config)

    # Configure whether guppy barcoder needs to be run
    qcfunk.look_for_guppy_barcoder(args.demultiplex,args.path_to_guppy,cwd,config)

    # don't run in quiet mode if verbose specified
    if args.verbose:
        config["log_string"] = ""
    else:
        lh_path = os.path.realpath(lh.__file__)
        config["log_string"] = f"--quiet --log-handler-script {lh_path} "

    qcfunk.add_arg_to_config("threads",args.threads,config)

    # The value may come from the yaml config rather than argparse, so it is
    # validated here. Only conversion/lookup failures are treated as a user
    # error; KeyboardInterrupt and SystemExit are left to propagate.
    try:
        config["threads"]= int(config["threads"])
    except (KeyError, TypeError, ValueError):
        sys.stderr.write(qcfunk.cyan('Error: Please specifiy an integer for variable `threads`.\n'))
        sys.exit(-1)
    threads = config["threads"]

    print(f"Number of threads: {threads}\n")

    # find the master Snakefile
    snakefile = qcfunk.get_snakefile(thisdir)

    if args.verbose:
        print("\n**** CONFIG ****")
        for k in sorted(config):
            print(qcfunk.green(k), config[k])

        status = snakemake.snakemake(snakefile, printshellcmds=True, forceall=True, force_incomplete=True,
                                        workdir=tempdir,config=config, cores=threads,lock=False
                                        )
    else:
        logger = custom_logger.Logger()
        status = snakemake.snakemake(snakefile, printshellcmds=False, forceall=True,force_incomplete=True,workdir=tempdir,
                                    config=config, cores=threads,lock=False,quiet=True,log_handler=logger.log_handler
                                    )

    # translate "success" into shell exit code of 0
    return 0 if status else 1
Beispiel #3
0
def log_handler(msg):
    """Return the ``log_handler`` attribute of a new ``custom_logger.Logger``.

    ``msg`` is accepted but never used here, and the handler is returned
    rather than called.
    NOTE(review): presumably a deliberately silent handler - confirm intent.
    """
    return custom_logger.Logger().log_handler