Ejemplo n.º 1
0
def get_1000GProject_vcf():
    """
        Downloads a VCF file from the 1000 Genome Project database,
        containing SNPs and indels for each subject involved in the
        study and returns the path to it.

        The download is performed with ``wget -c`` (resume if a partial
        file is already present) into the current working directory.
        ----
        Parameters:
            None
        ----
        Returns:
            vcf (str) : absolute path to the downloaded VCF file (in .vcf.gz)
        ----
        Raises:
            SubprocessError : if wget terminates with a non-zero exit code
    """

    # single source of truth for the file name: it is used both to build
    # the download URL and to compute the returned path
    vcf_name = 'ALL.wgs.shapeit2_integrated_snvindels_v2a.GRCh38.27022019.sites.vcf.gz'

    # download the VCF
    address = 'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/'
    address += '1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/'
    address += vcf_name

    # the command is built only from constants, so running it through
    # the shell does not expose any untrusted input
    cmd = 'wget -c {0}'.format(address)

    code = subprocess.call(cmd, shell=True)

    if code != 0:
        errmsg = ''.join(["\n\nERROR: An error occurred while executing ", cmd, ". Exiting"])
        raise SubprocessError(errmsg)

    # wget stores the file in the current directory under its remote name
    vcf = os.path.abspath(vcf_name)

    return vcf
Ejemplo n.º 2
0
def build_motif_JASPAR(motif_file, bg_file, pseudocount, no_reverse, verbose):
    """
        Build a Motif object starting from raw counts
        data stored in a JASPAR motif file.

        The raw counts are processed and the resulting values
        are used to define the scoring matrix for the motif
        ----
        Parameters:
            motif_file (str) : path to the motif file
            bg_file (str) : path to the background file
            pseudocount (float) : value to add to the motif counts (to avoid
                                    division by 0); must be > 0
            no_reverse (bool) : flag parameter to consider or not the reverse
                                complement building the Motif object
            verbose (bool) : if True, print the time spent processing
                             the motif
        ----
        Returns:
            motif (Motif) : returns the corresponding Motif object
        ----
        Raises:
            FileNotFoundError : if motif_file is empty or None
            NotValidFFException : if isJaspar_ff() rejects motif_file
            AssertionError : if pseudocount is not > 0
    """

    if not motif_file:
        raise FileNotFoundError("\n\nERROR: the motif file is missing")

    # check if the input file is in JASPAR format
    if not isJaspar_ff(motif_file):
        raise NotValidFFException(
            "ERROR: the given motif file is not in JASPAR or MEME format")

    # explicit raise instead of an `assert` statement: asserts are removed
    # when Python runs with -O, which would let invalid pseudocounts
    # through. AssertionError is kept so existing callers see the same
    # exception type as before.
    if not pseudocount > 0:
        raise AssertionError(
            "the pseudocount must be > 0, got {}".format(pseudocount))

    # read the motif file
    motif = read_JASPAR_motif(motif_file, bg_file, pseudocount, no_reverse,
                              verbose)

    if verbose:
        start_mp = time.time()

    #  get log-odds values for motif
    motif = process_motif_for_logodds(motif)

    if verbose:
        end_mp = time.time()
        msg = ''.join([
            "Processed motif ",
            motif.getMotifID(), " in ",
            str(end_mp - start_mp), "s"
        ])
        print(msg)
    # end if

    return motif
Ejemplo n.º 3
0
def _run_shell_or_raise(cmd):
    """
        Run a command line through the shell and raise SubprocessError
        if it terminates with a non-zero exit code
        ----
        Parameters:
            cmd (str) : command line to execute
        ----
        Returns:
            None
    """

    code = subprocess.call(cmd, shell=True)

    if code != 0:
        errmsg = ''.join(["\n\nERROR: an error occurred while executing ", cmd, ". Exiting"])
        raise SubprocessError(errmsg)


def get_reference_genome_from_ucsc():
    """
        Download the reference genome (hg38 assembly), from the UCSC
        database, in the current directory and returns the path to it.

        The compressed FASTA is fetched with ``wget -c`` and then
        decompressed with ``gunzip``.
        ----
        Parameters:
            None
        ----
        Returns:
            genome (str) : path to the genome downloaded (in .fa format)
        ----
        Raises:
            SubprocessError : if wget, gunzip or rm terminates with a
                              non-zero exit code
    """

    # download genome (stored in the current directory)
    address = 'ftp://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz'
    _run_shell_or_raise('wget -c {0}'.format(address))

    # decompress genome
    print("Uncompressing the genome...")

    genome_comp = './hg38.fa.gz'
    _run_shell_or_raise('gunzip {0}'.format(genome_comp))

    # remove FASTA.GZ file if still present
    if os.path.exists(genome_comp):
        _run_shell_or_raise('rm {0}'.format(genome_comp))

    # get the path to the genome file
    genome_uncomp = "./hg38.fa"
    genome = os.path.abspath(genome_uncomp)

    return genome
Ejemplo n.º 4
0
def main(cmdLineargs: Optional[List[str]] = None) -> None:
    """Entry point of GRAFIMO.

    Parses the command-line arguments, checks them for consistency
    against the selected workflow ("buildvg" or "findmotif"), verifies
    the external dependencies through check_deps() and finally runs the
    selected workflow.

    Parameters
    ----------
    cmdLineargs : list of str, optional
        Command-line arguments (without the program name). When None,
        sys.argv[1:] is used.

    Notes
    -----
    Invalid arguments are reported through parser.error(). A
    KeyboardInterrupt raised at any point is routed to sigint_handler().
    """

    try:
        # starting point of the execution time
        start: float = time.time()

        # read the command-line arguments
        parser: GRAFIMOArgumentParser = get_parser()

        if cmdLineargs is None:
            cmdLineargs = sys.argv[1:]  # get input args

        # no arguments given --> print help
        if not cmdLineargs:
            parser.error_noargs()
            die(2)

        # the second argument must be buildvg or findmotif
        if cmdLineargs[0] not in ("-h", "--help", "--version", "buildvg",
                                  "findmotif"):
            parser.error(
                "The second argument must be one between 'buildvg' and 'findmotif'"
            )
            die(1)

        args: argparse.Namespace = parser.parse_args(cmdLineargs)

        if args.verbose:
            print("Parsing arguments...")
            start_args_parse: float = time.time()

        #--------------------------------------------------------------#
        # check commandline arguments consistency
        #

        #---------------------- general options -----------------------#

        # workflow type
        if args.workflow != "buildvg" and args.workflow != "findmotif":
            parser.error("Unexpected workflow given. Available options:\n"
                         "\tbuildvg: construct VG from user data.\n"
                         "\tfindmotif: scan VG for DNA motif(s) occurrences")
            die(1)

        # cpu cores
        if args.cores < 0:
            parser.error("Negative number of CPU cores given")
        elif args.cores == 0 and args.graph_genome:
            # when whole genome variation graph is given, it is safer to
            # use 1 CPU core by default. This because of the space needed
            # to load the whole VG on RAM.
            #
            # CAVEAT: before requiring more CPU cores to be used, be sure
            # your system has enough memory
            args.cores = 1
        elif args.cores == 0:
            # default option -> use all available CPU cores
            args.cores = mp.cpu_count()
        else:  # args.cores > 0
            if args.cores > mp.cpu_count():
                parser.error("Too many CPU cores to use ({})".format(
                    args.cores))

        # verbosity
        if not isinstance(args.verbose, bool):
            parser.error(
                '\"--verbose\" does not accept any positional argument')

        # debugging
        if not isinstance(args.debug, bool):
            parser.error("\"--debug\" does not accept any positional argument")

        #---------------------- buildvg options -----------------------#

        buildvg_err_msg: str = "Unexpected arguments for \"grafimo buildvg\": \"{}\""

        if args.workflow == "buildvg":

            if args.graph_genome_dir:
                parser.error(buildvg_err_msg.format("-d, --genome-graph-dir"))
                die(1)
            elif args.graph_genome:
                parser.error(buildvg_err_msg.format("-g, --genome-graph"))
                die(1)
            elif args.bedfile:
                parser.error(buildvg_err_msg.format("-b, --bedfile"))
                die(1)
            elif args.motif:
                parser.error(buildvg_err_msg.format("-m, --motif"))
                die(1)
            elif args.bgfile != UNIF:  # if default ignored
                parser.error(buildvg_err_msg.format("-k, --bgfile"))
                die(1)
            elif args.pseudo != 0.1:  # if default ignored
                parser.error(buildvg_err_msg.format("-p, --pseudo"))
                die(1)
            elif args.threshold != 1e-4:  # if default ignored
                parser.error(buildvg_err_msg.format("-t, --thresh"))
                die(1)
            elif args.no_qvalue:
                parser.error(buildvg_err_msg.format("-q, --no-qvalue"))
                die(1)
            elif args.no_reverse:
                parser.error(buildvg_err_msg.format("-r, --no-reverse"))
                die(1)
            elif args.text_only:
                parser.error(buildvg_err_msg.format("-f, --text-only"))
                die(1)
            elif args.chroms_find:
                parser.error(buildvg_err_msg.format("--chroms-find"))
                die(1)
            elif args.chroms_prefix_find:
                parser.error(buildvg_err_msg.format("--chroms-prefix-find"))
                die(1)
            elif args.chroms_namemap_find != NOMAP:  # if default ignored
                parser.error(buildvg_err_msg.format("--chroms-namemap-find"))
                die(1)
            elif args.qval_t:
                parser.error(buildvg_err_msg.format("--qvalueT"))
                die(1)
            elif args.recomb:
                parser.error(buildvg_err_msg.format("--recomb"))
                die(1)
            elif args.top_graphs != 0:  # if default ignored
                parser.error(buildvg_err_msg.format("--top-graphs"))
                die(1)
            elif not args.linear_genome:
                parser.error("No reference genome given")
                die(1)
            elif not args.vcf:
                parser.error("No VCF file given")
                die(1)
            else:  # arguments for buildvg are correct
                # reference genome
                if args.linear_genome.split('.')[-1] not in ('fa', 'fasta'):
                    parser.error(
                        "The reference genome file must be in FASTA format")
                    die(1)
                else:
                    if not os.path.isfile(args.linear_genome):
                        parser.error("Unable to find {}".format(
                            args.linear_genome))
                        die(1)
                    if os.stat(args.linear_genome).st_size == 0:  # empty file
                        parser.error("{} seems to be empty.".format(
                            args.linear_genome))
                        die(1)
                    args.linear_genome = os.path.abspath(args.linear_genome)
                # VCF --> the VCF file must have been compressed with
                # bgzip (https://github.com/samtools/tabix)
                #
                # Both parts of the ".vcf.gz" suffix are required. The
                # former test joined the two extension checks with `and`,
                # so names such as "x.txt.gz" were accepted, and a name
                # without any dot raised IndexError.
                if not args.vcf.endswith(".vcf.gz"):
                    parser.error(
                        "Wrong VCF file given. The VCF file must have been "
                        "compressed with bgzip (e.g. myvcf.vcf.gz)")
                    die(1)
                else:
                    if not os.path.isfile(args.vcf):
                        parser.error('Unable to find {}'.format(args.vcf))
                        die(1)
                    if os.stat(args.vcf).st_size == 0:  # empty file
                        parser.error("{} seems to be empty.".format(args.vcf))
                        die(1)
                    args.vcf = os.path.abspath(args.vcf)

                # chromosome to construct VG
                if len(args.chroms_build) == 0:
                    args.chroms_build = [ALL_CHROMS]  # use all chromosome
                else:
                    if anydup(args.chroms_build):
                        parser.error(
                            "Duplicated chromosome names given to \"--chroms-build\""
                        )

                # chromosome name-map
                if args.chroms_namemap_build != NOMAP:
                    if not os.path.isfile(args.chroms_namemap_build):
                        parser.error("Unable to locate {}".format(
                            args.chroms_namemap_build))
                if (args.chroms_prefix_build
                        and args.chroms_namemap_build != NOMAP):
                    parser.error(
                        "\"--chroms-prefix-build\" and \"chroms-namemap-build\" "
                        "cannot used together. Choose one of those options")

                # if no out directory is specified the VGs are stored in
                # the current directory
                if args.out == "":
                    args.out = os.path.abspath("./")

                workflow: BuildVG = BuildVG(args)

                if args.verbose:
                    end_args_parse: float = time.time()
                    print("Arguments parsed in %.2fs." %
                          (end_args_parse - start_args_parse))
            # end if
        # end if

        #---------------------- findmotif options -----------------------#

        findmotif_err_msg: str = "Unexpected arguments for \"grafimo findmotif\": \"{}\""

        if args.workflow == "findmotif":
            if args.linear_genome:
                parser.error(findmotif_err_msg.format("-l, --linear-genome"))
                die(1)
            elif args.vcf:
                parser.error(findmotif_err_msg.format("-v, --vcf"))
                die(1)
            elif args.chroms_build:
                parser.error(findmotif_err_msg.format("--chroms-build"))
            elif args.chroms_prefix_build:
                parser.error(findmotif_err_msg.format("--chroms-prefix-build"))
            elif args.chroms_namemap_build != NOMAP:
                parser.error(
                    findmotif_err_msg.format("--chroms-namemap-build"))
            elif args.reindex:  # if default ignored
                parser.error(findmotif_err_msg.format("--reindex"))
                die(1)
            elif not args.graph_genome_dir and not args.graph_genome:
                parser.error(
                    "No arguments given for both \"--genome-graph\" and \"--genome-graph-dir\""
                )
                die(1)
            elif not args.bedfile:
                parser.error("No BED file given")
                die(1)
            elif not args.motif:
                parser.error("No motif PWM given")
                die(1)
            else:
                # only one between graph_genome and graph_genome_dir is allowed
                if args.graph_genome and args.graph_genome_dir:
                    parser.error(
                        "Only one argument between \"--genome-graph\" and \"--genome-graph-dir\""
                        " can be used")
                    die(1)

                # genome graph
                if args.graph_genome:
                    if args.graph_genome.split('.')[-1] not in ("xg", "vg"):
                        # trailing space added: the two literals used to
                        # concatenate into "OnlyVG"
                        parser.error(
                            "Unrecognized genome variation graph format. Only "
                            "VG and XG format are allowed")
                        die(1)
                    elif not os.path.isfile(args.graph_genome):
                        parser.error("Unable to locate {}".format(
                            args.graph_genome))
                        die(1)
                    else:
                        # using absolute path avoid potential problems
                        args.graph_genome = os.path.abspath(args.graph_genome)

                # genome graphs directory
                if args.graph_genome_dir:
                    if not os.path.isdir(args.graph_genome_dir):
                        parser.error("Unable to locate {}".format(
                            args.graph_genome_dir))
                        die(1)
                    if len(glob(os.path.join(args.graph_genome_dir,
                                             "*.xg"))) <= 0:
                        parser.error(
                            "No genome variation graph found in {}".format(
                                args.graph_genome_dir))
                        die(1)
                    else:
                        # using absolute path avoid potential problems
                        args.graph_genome_dir = os.path.abspath(
                            args.graph_genome_dir)

                # BED file
                if args.bedfile:
                    if not isbed(args.bedfile, args.debug):
                        parser.error(
                            "The genomic coordinates must be given in UCSC BED files"
                        )
                        die(1)
                    else:
                        if not os.path.isfile(args.bedfile):
                            parser.error("Unable to locate {}".format(
                                args.bedfile))
                else:
                    parser.error("No BED file given")

                # motif pwm
                if not args.motif:
                    parser.error("No motif PWM given")

                else:
                    motifs: List[str] = args.motif
                    for m in motifs:
                        if not isMEME_ff(m, args.debug) and not isJaspar_ff(
                                m, args.debug):
                            parser.error(
                                "Unrecognized motif PWM file format. "
                                "{} does not follow the MEME or JASPAR format rules"
                                .format(m))
                            die(1)
                        if not os.path.isfile(m):
                            parser.error("Unable to locate {}".format(m))

                # background file
                if args.bgfile != UNIF:
                    if not os.path.isfile(args.bgfile):
                        parser.error("Unable to locate {}".format(args.bgfile))

                # pseudocount
                if args.pseudo <= 0:
                    parser.error(
                        "Pseudocount values must be > 0, got {}".format(
                            args.pseudo))
                    die(1)

                # statistical significance threshold
                if args.threshold <= 0 or args.threshold > 1:
                    parser.error(
                        "Motif statistical significance threshold must be between 0 and 1"
                    )
                    die(1)

                # q-value flag
                if not isinstance(args.no_qvalue, bool):
                    parser.error(
                        "\"--qvalue\" accepts only True or False values")
                    die(1)

                # no reverse flag
                if not isinstance(args.no_reverse, bool):
                    parser.error(
                        "\"--no-reverse\" accepts only True or False values")
                    die(1)

                # text only flag
                if not isinstance(args.text_only, bool):
                    parser.error(
                        "\"--text-only\" accepts only True or False values")
                    die(1)

                # chromosome to consider during VG scan
                if len(args.chroms_find) == 0:
                    args.chroms_find = [ALL_CHROMS]  # use all chromosome
                else:
                    if anydup(args.chroms_find):
                        parser.error(
                            "Duplicated chromosome names given to \"--chroms-find\""
                        )

                # chromosome name-map
                if args.chroms_namemap_find != NOMAP:
                    if not os.path.isfile(args.chroms_namemap_find):
                        parser.error("Unable to locate {}".format(
                            args.chroms_namemap_find))
                if (args.chroms_prefix_find
                        and args.chroms_namemap_find != NOMAP):
                    parser.error(
                        "\"--chroms-prefix-find\" and \"chroms-namemap-find\" "
                        "cannot used together. Choose one of those options")

                # recomb flag
                if not isinstance(args.recomb, bool):
                    parser.error(
                        "\"--recomb\" accepts only True or False values")
                    die(1)

                # out directory
                if args.out == "":  # default option
                    args.out = DEFAULT_OUTDIR
                    print(args.out)

                # threshold on q-value flag
                if not isinstance(args.qval_t, bool):
                    # closing quote after the option name was missing
                    parser.error(
                        "\"--qvalueT\" accepts only True or False values")
                    die(1)
                elif args.no_qvalue == True and args.qval_t == True:
                    parser.error(
                        "Unable to apply statistical significance threshold on"
                        " q-values if you don't want them")
                    die(1)

                # number of graph regions to store as PNG images
                if args.top_graphs < 0:
                    parser.error("Negative number of regions to display")

                workflow: Findmotif = Findmotif(args)

                if args.verbose:
                    end_args_parse: float = time.time()
                    print("Arguments parsed in %.2fs." %
                          (end_args_parse - start_args_parse))
            # end if
        # end if

        # check that external dependencies are satisfied
        if args.verbose:
            sys.stderr.write(
                "Checking GRAFIMO external dependencies {}\n".format(EXT_DEPS))
            start_deps: float = time.time()
        satisfied: bool
        deps_lack: List[str]
        satisfied, deps_lack = check_deps()
        if not satisfied and len(deps_lack) > 0:
            errmsg = "Some dependencies are not satisfied: {}.\nPlease solve them before running GRAFIMO.\n"
            exception_handler(DependencyError, errmsg.format(deps_lack),
                              args.debug)
        elif not satisfied and len(deps_lack) <= 0:
            errmsg = "Dependencies satisfied, but unable to recover them.\n Be sure they are in system PATH.\n"
            exception_handler(DependencyError, errmsg, args.debug)

        if args.verbose and satisfied:
            end_deps: float = time.time()
            print("Dependencies satisfied.")
            print("Dependencies checked in %.2fs." % (end_deps - start_deps))

        #---------------------------------------------------------------
        # dependency check was ok, so we go to workflow selection:
        #   * construction of the genome variation graph for
        #     each chromosome or a user defined subset of them
        #   * scan of a precomputed VG or a set of precomputed VG
        if isinstance(workflow, BuildVG):
            buildvg(workflow, args.debug)
        elif isinstance(workflow, Findmotif):
            findmotif(workflow, args.debug)
        else:
            errmsg = "Expected BuildVG or Findmotif, got {}.\n"
            exception_handler(TypeError,
                              errmsg.format(type(workflow).__name__),
                              args.debug)

        end: float = time.time()  # GRAFIMO execution finishes here
        print("Elapsed time %.2fs." % (end - start))

    except KeyboardInterrupt:
        sigint_handler()
Ejemplo n.º 5
0
def main(cmdLineargs=None):
    """

        Main function of GRAFIMO.

        The arguments given in input are checked for consistency,
        then a pipeline is followed.

        ----
        Parameters:
            cmdLineargs (str)
        ----
        Returns:
            None

    """

    try:
        # starting point of the execution time
        start = time.time()

        # read the command-line arguments
        parser = get_AP()

        if cmdLineargs is None:
            cmdLineargs = sys.argv[1:] # take input args

        # the second argument must be buildvg or findmotif
        if ((cmdLineargs[0] != "-h" and cmdLineargs[0] != "--help") and
                cmdLineargs[0] != "--version" and
                (cmdLineargs[0] != "buildvg" and cmdLineargs[0] != "findmotif")):
            parser.error("The second argument must be one between 'buildvg' and 'findmotif'")
            die(1)

        args = parser.parse_args(cmdLineargs)  # parse args

        if args.verbose:
            print("Parsing arguments...")
            start_args_parse = time.time()

        #####################################################################
        # check arguments consistency
        #####################################################################

        if args.workflow != "buildvg" and args.workflow != "findmotif":
            parser.error("Do not know what to do. Available options: create VGs with 'grafimo buildvg' or scan a "
                         "precomputed genome variation graph with 'grafimo findmotif'")
            die(1)

        # cores (shared by the two workflows)
        if args.cores < 0:
            parser.error("The number of cores cannot be negative")

        elif args.cores == 0 and args.graph_genome:
            args.cores = 1     # to query a whole genome graph is loaded into RAM, since usually are
                                 # very heavy in terms of bytes is safer to use 1 thread by default, otherwise
                                 # it would be loaded #cores times. If you want use more cores, be sure
                                 # your system can handle the resulting amount of data

        elif args.cores == 0:
            args.cores = mp.cpu_count()  # by default take all the available CPUs
        # end if

        # check verbose flag
        if (not isinstance(args.verbose, bool) or
                (args.verbose != False and args.verbose != True)):
            parser.error('The --verbose parameter accepts only True or False values')

        # chromosomes check (shared by the two workflows)
        for c in args.chroms:
            if c not in CHROMS_LIST:
                parser.error("Invalid chromosome")
                
        args.chroms = initialize_chroms_list(args.chroms)

        # checks for buildvg workflow
        if args.workflow == "buildvg":

            if args.graph_genome_dir:
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)

            elif args.graph_genome:
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)

            elif args.bedfile:
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)

            elif args.motif:
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)

            elif args.bgfile != 'UNIF':  # if default ignored
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)

            elif args.pseudo != 0.1:  # if default ignored
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)

            elif args.threshold != 1e-4:  # if default ignored"
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)

            elif args.no_qvalue:
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)

            elif args.no_reverse:
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)

            elif args.text_only:
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)

            elif args.qval_t:
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)

            elif args.top_graphs != 0:  # if default ignored
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)

            elif not args.linear_genome:
                parser.error("No reference genome given")
                die(1)

            elif not args.vcf:
                parser.error("No VCF file given")
                die(1)

            else:
                # check linear genome
                if (args.linear_genome.split('.')[-1] != 'fa' and
                        args.linear_genome.split('.')[-1] != 'fasta'):
                    parser.error('The linear genome must be in FASTA format (FASTA and FA extensions allowed)')
                    die(1)

                else:
                    if len(glob.glob(args.linear_genome)) != 1:
                        parser.error('Cannot find the given reference genome file')
                        die(1)

                    args.linear_genome = os.path.abspath(args.linear_genome)
                # end if

                # check VCF
                if ((args.vcf.split('.')[-1] != 'gz' and args.vcf.split('.')[-1] != 'zip')
                        or args.vcf.split('.')[-2] != 'vcf'):  # allow only compressed VCF files
                    parser.error('Incorrect VCF file given: the VCF must be compressed (e.g. myvcf.vcf.gz)')
                    die(1)

                else:
                    if len(glob.glob(args.vcf)) <= 0:
                        parser.error('Cannot find the given VCF file')
                        die(1)
                    args.vcf = os.path.abspath(args.vcf)

                # by deafult the built VGs will be stored in the current directory
                if args.out == "grafimo_out":  # general default value
                    args.out = os.path.abspath("./")

                workflow = BuildVG(args)

                if args.verbose:
                    end_args_parse = time.time()
                    print(''.join(["Arguments parsed in ", str(end_args_parse - start_args_parse), "s"]))
            # end if
        # end if

        # checks for findmotif workflow
        if args.workflow == "findmotif":
            if args.linear_genome:
                parser.error("Invalid arguments for grafimo findmotif")
                die(1)

            elif args.vcf:
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)

            elif not args.graph_genome_dir and not args.graph_genome:
                parser.error("No genome variation graph or directory containing them given")
                die(1)

            elif not args.bedfile:
                parser.error("No BED file given")
                die(1)

            elif not args.motif:
                parser.error("No motif file (MEME of JASPAR format) given")
                die(1)

            else:

                # only one between graph_genome and graph_genome_dir allowed
                if args.graph_genome and args.graph_genome_dir:
                    parser.error("Invalid arguments for grafimo buildvg")
                    die(1)

                # check graph_genome
                if args.graph_genome:
                    if (args.graph_genome.split('.')[-1] != 'xg' and
                            args.graph_genome.split('.')[-1] != 'vg'):
                        parser.error("Cannot use the given genome variation graph (only VG or XG format allowed)")
                        die(1)

                    elif not os.path.isfile(args.graph_genome):
                        parser.error("Unable to find the given variation genome graph")
                        die(1)

                    else:
                        graph_genome = os.path.abspath(args.graph_genome)  # safer to use absolute path
                        args.graph_genome = graph_genome
                    # end if
                # end if

                # check graph_genome_dir
                if args.graph_genome_dir:
                    if not os.path.isdir(args.graph_genome_dir):
                        parser.error("Cannot find the given directory containing the genome variation graphs")
                        die(1)

                    if args.graph_genome_dir[-1] == '/':
                        graph_genome_dir = args.graph_genome_dir

                    else:
                        graph_genome_dir = ''.join([args.graph_genome_dir, '/'])
                    # end if

                    if len(glob.glob(graph_genome_dir + '*.xg')) <= 0:
                        parser.error(' '.join(['No XG genome variation graph found in', graph_genome_dir]))
                        die(1)

                    else:
                        graph_genome_dir = os.path.abspath(graph_genome_dir)
                        args.graph_genome_dir = graph_genome_dir
                    # end if
                # end if

                # check BED file
                if args.bedfile:
                    if args.bedfile.split('.')[-1] != 'bed':
                        parser.error('Incorrect BED file given')
                        die(1)

                    else:
                        bedfile = args.bedfile

                        if len(glob.glob(bedfile)) <= 0:
                            parser.error('Cannot find the given BED file')
                    # end if

                else:
                    parser.error('No BED file given')
                # end if

                # check motif file
                if not args.motif:
                    parser.error('No motif given')

                else:
                    motifs = args.motif

                    # check if the given motifs exist
                    for m in motifs:
                        if not isMEME_ff(m) and not isJaspar_ff(m):
                            parser.error("Unrecognized motif file format (only MEME or JASPAR allowed)")
                            die(1)

                        if len(glob.glob(m)) <= 0:
                            parser.error('Cannot find motif file: ' + m)
                            die(1)
                    # end for
                # end if

                # check background file
                if args.bgfile != 'UNIF':
                    bgfile = args.bgfile  # we have a path to a bg file

                    if len(glob.glob(bgfile)) <= 0:
                        parser.error('Cannot find the given background file')
                        die(1)
                # end if

                # check pseudocount
                if args.pseudo <= 0:
                    parser.error('The pseudocount cannot be less than or equal 0')
                    die(1)

                # check threshold
                if args.threshold <= 0 or args.threshold > 1:
                    parser.error('The pvalue threshold must be between 0 and 1')
                    die(1)

                # check q-value flag
                if (not isinstance(args.no_qvalue, bool) or
                        (args.no_qvalue != False and args.no_qvalue != True)):
                    parser.error('The --qvalue parameter accepts only True or False as values')
                    die(1)

                # check no reverse flag
                if (not isinstance(args.no_reverse, bool) or
                        (args.no_reverse != False and args.no_reverse != True)):
                    parser.error('The --no-reverse parameter accepts only True or False as values')
                    die(1)

                # check text only flag
                if (not isinstance(args.text_only, bool) or
                        (args.text_only != False and args.text_only != True)):
                    parser.error('The --text-only parameter accepts only True or False values')
                    die(1)

                # out directory
                if args.out == 'grafimo_out':  # default option
                    # to make unique the output directory we add the PID
                    # to the name.
                    #
                    # This is useful when calling grafimo in different runs on the
                    # same machine.

                    args.out = ''.join([args.out, '_', str(os.getpid())])

                # check threshold on q-value flag
                if (not isinstance(args.qval_t, bool) or
                        (args.qval_t != False and args.qval_t != True)):
                    parser.error("The --qvalueT parameter accepts only True or False as values")
                    die(1)

                elif args.no_qvalue == True and args.qval_t == True:
                    parser.error("Cannot apply the threshold on q-values if you don't want them")
                    die(1)

                # check the number of graph regions to store as PNG images
                if args.top_graphs < 0:
                    parser.error("The number of region graphs to show must be positive")

                workflow = Findmotif(args)

                if args.verbose:
                    end_args_parse = time.time()
                    print(''.join(["Arguments parsed in ", str(end_args_parse - start_args_parse), "s"]))

            # end if
        # end if

        # check that external dependencies are satisfied
        if args.verbose:
            print("Checking GRAFIMO external dependencies " + str(EXT_DEPS))
            start_deps = time.time()

        satisfied, deps_lack = check_deps()

        if not satisfied and len(deps_lack) > 0:
            raise DependencyError("\n\nERROR: The following dependencies are not sastisfied: " +
                                      str(deps_lack) +
                                      "\nPlease, solve them before running GRAFIMO")
            die(1)

        elif not satisfied and len(deps_lack) <= 0:
            raise DependencyError("Some dependencies were found, but was not possible to track them." 
                                        "\nBe sure they are available in system PATH")
            die(1)
        # end if

        if args.verbose and satisfied:
            end_deps = time.time()
            print("Dependencies correctly satisfied")
            print(''.join(["Dependencies checked in ", str(end_deps - start_deps), "s"]))

        #####################################################################

        """
            dependency check was ok, so we go to workflow selection:
               - creation of the genome variation graph for 
                   each chromosome or a user defined subset of them
               - scan of a precomputed VG or a set of precomputed VG
        """

        if isinstance(workflow, BuildVG):
            # build the VG for each chromosome or a user defined subset of them
            buildvg(workflow)

        elif isinstance(workflow, Findmotif):
            # scan a precomputed VG or a set of VGs
            findmotif(workflow)

        else:
            raise ValueError("Unknown arguments object type")
        # end if

        end = time.time()  # GRAFIMO execution finishes here

        print(''.join(["\nElapsed time: ", str(end - start), "s"]))

    except KeyboardInterrupt:
        sigint_handler()

    finally:
        pass
Ejemplo n.º 6
0
def main(cmdLineargs: Optional[List[str]] = None) -> None:
    """Parse the command line, validate it and run the requested workflow.

    The function reads the arguments (from ``cmdLineargs`` or from
    ``sys.argv``), checks that they are consistent with the chosen
    workflow (``buildvg`` or ``findmotif``), verifies that the external
    dependencies are available and finally dispatches to ``buildvg()`` or
    ``findmotif()``. The total elapsed time is printed at the end.

    Every validation failure is reported through ``parser.error()`` and
    followed by ``die(1)``, so the run stops at the first inconsistent
    argument.

    Parameters
    ----------
    cmdLineargs : list of str, optional
        arguments to parse; when ``None`` the arguments are taken from
        ``sys.argv[1:]``

    Raises
    ------
    DependencyError
        if ``check_deps()`` reports unsatisfied external dependencies
    ValueError
        if the arguments container built during validation is neither a
        ``BuildVG`` nor a ``Findmotif`` object
    """

    try:
        # starting point of the execution time
        start: float = time.time()

        # read the command-line arguments
        parser: GRAFIMOArgumentParser = get_parser()

        if cmdLineargs is None:
            cmdLineargs = sys.argv[1:]  # take input args

        # no argument given
        if len(cmdLineargs) == 0:
            parser.error_noargs()
            die(1)

        # the second argument must be buildvg or findmotif
        if cmdLineargs[0] not in (
                "-h", "--help", "--version", "buildvg", "findmotif"):
            parser.error(
                "The second argument must be one between 'buildvg' and 'findmotif'")
            die(1)

        args: argparse.Namespace = parser.parse_args(cmdLineargs)

        # taken unconditionally: it is only printed when verbose is set
        start_args_parse: float = time.time()
        if args.verbose:
            print("Parsing arguments...")

        ################################################################
        # check arguments consistency
        ################################################################

        if args.workflow not in ("buildvg", "findmotif"):
            parser.error("Do not know what to do. Available options: create VGs "
                         "with 'grafimo buildvg' or scan a precomputed genome "
                         "variation graph with 'grafimo findmotif'")
            die(1)

        # cores (shared by the two workflows)
        if args.cores < 0:
            parser.error("The number of cores cannot be negative")
            die(1)

        elif args.cores == 0 and args.graph_genome:
            # to query a whole genome graph is loaded into RAM, since
            # usually they are very heavy in terms of bytes is safer to
            # use 1 thread by default, otherwise it would be loaded
            # #cores times. If you want use more cores, be sure your
            # system can handle the resulting amount of data
            args.cores = 1

        elif args.cores == 0:
            # by default take all the available CPUs
            args.cores = mp.cpu_count()
        # end if

        # check verbose flag (a bool is necessarily True or False, so the
        # isinstance test alone is sufficient)
        if not isinstance(args.verbose, bool):
            parser.error(
                'The --verbose parameter accepts only True or False values')
            die(1)

        # chromosomes check (shared by the two workflows)
        if len(args.chroms) == 0:
            args.chroms = ['ALL_CHROMS']

        buildvg_err_msg = "Invalid arguments for grafimo buildvg"

        # checks for buildvg workflow
        if args.workflow == "buildvg":

            # any findmotif-only option (given, or moved away from its
            # default value) is not allowed when building the VGs
            findmotif_only_given = (
                args.graph_genome_dir
                or args.graph_genome
                or args.bedfile
                or args.motif
                or args.bgfile != 'UNIF'
                or args.pseudo != 0.1
                or args.threshold != 1e-4
                or args.no_qvalue
                or args.no_reverse
                or args.text_only
                or args.qval_t
                or args.recomb
                or args.top_graphs != 0
            )

            if findmotif_only_given:
                parser.error(buildvg_err_msg)
                die(1)

            elif not args.linear_genome:
                parser.error("No reference genome given")
                die(1)

            elif not args.vcf:
                parser.error("No VCF file given")
                die(1)

            else:
                # check linear genome
                if args.linear_genome.split('.')[-1] not in ('fa', 'fasta'):
                    parser.error(
                        "The linear genome must be in FASTA format (FASTA and "
                        "FA extensions allowed)")
                    die(1)

                else:
                    if len(glob.glob(args.linear_genome)) != 1:
                        parser.error(
                            'Cannot find the given reference genome file')
                        die(1)

                    args.linear_genome = os.path.abspath(args.linear_genome)
                # end if

                # check VCF --> the VCF must have been compressed with
                # bgzip (https://github.com/samtools/tabix)
                #
                # the length test avoids an IndexError on names without
                # any '.' (e.g. "gz")
                vcf_parts: List[str] = args.vcf.split('.')
                if (len(vcf_parts) < 2
                        or vcf_parts[-1] not in ('gz', 'zip')
                        or vcf_parts[-2] != 'vcf'):
                    parser.error(
                        "Incorrect VCF file given: the VCF must be compressed "
                        "(e.g. myvcf.vcf.gz)")
                    die(1)

                else:
                    if len(glob.glob(args.vcf)) <= 0:
                        parser.error('Cannot find the given VCF file')
                        die(1)
                    args.vcf = os.path.abspath(args.vcf)

                # by default the built VGs will be stored in the current
                # directory
                if args.out == "":  # general default value
                    args.out = os.path.abspath("./")

                workflow = BuildVG(args)

                if args.verbose:
                    end_args_parse: float = time.time()
                    print("Arguments parsed in %.2fs" %
                          (end_args_parse - start_args_parse))
            # end if
        # end if

        findmotif_err_msg = "Invalid arguments for grafimo findmotif"

        # checks for findmotif workflow
        if args.workflow == "findmotif":

            # buildvg-only options are not allowed when scanning the VGs
            if args.linear_genome or args.vcf or args.reindex:
                parser.error(findmotif_err_msg)
                die(1)

            elif not args.graph_genome_dir and not args.graph_genome:
                parser.error(
                    "No genome variation graph or directory containing them given")
                die(1)

            elif not args.bedfile:
                parser.error("No BED file given")
                die(1)

            elif not args.motif:
                parser.error("No motif file (MEME of JASPAR format) given")
                die(1)

            else:

                # only one between graph_genome and graph_genome_dir
                # are allowed
                if args.graph_genome and args.graph_genome_dir:
                    parser.error(findmotif_err_msg)
                    die(1)

                # check graph_genome
                if args.graph_genome:
                    if args.graph_genome.split('.')[-1] not in ('xg', 'vg'):
                        parser.error(
                            "Cannot use the given genome variation graph (only "
                            "VG or XG format allowed)")
                        die(1)

                    elif not os.path.isfile(args.graph_genome):
                        parser.error(
                            "Unable to find the given variation genome graph")
                        die(1)

                    else:
                        # it is safer to use absolute path to avoid bugs
                        args.graph_genome = os.path.abspath(args.graph_genome)
                    # end if
                # end if

                # check graph_genome_dir
                if args.graph_genome_dir:
                    if not os.path.isdir(args.graph_genome_dir):
                        parser.error(
                            "Cannot find the given directory containing the "
                            "genome variation graphs")
                        die(1)

                    # make sure the directory ends with '/' before
                    # building the glob pattern
                    if args.graph_genome_dir.endswith('/'):
                        graph_genome_dir: str = args.graph_genome_dir

                    else:
                        graph_genome_dir = ''.join([args.graph_genome_dir, '/'])
                    # end if

                    if len(glob.glob(graph_genome_dir + '*.xg')) <= 0:
                        parser.error(
                            ' '.join(['No XG genome variation graph found in',
                                      graph_genome_dir]))
                        die(1)

                    else:
                        args.graph_genome_dir = os.path.abspath(graph_genome_dir)
                    # end if
                # end if

                # check BED file (its presence has already been verified)
                if args.bedfile.split('.')[-1] != 'bed':
                    parser.error('Incorrect BED file given')
                    die(1)

                elif len(glob.glob(args.bedfile)) <= 0:
                    parser.error('Cannot find the given BED file')
                    die(1)
                # end if

                # check motif files (their presence has already been
                # verified): each one must have a known format and exist
                for m in args.motif:
                    if not isMEME_ff(m) and not isJaspar_ff(m):
                        parser.error("Unrecognized motif file format (only MEME or JASPAR allowed)")
                        die(1)

                    if len(glob.glob(m)) <= 0:
                        parser.error('Cannot find motif file: ' + m)
                        die(1)
                # end for

                # check background file
                if args.bgfile != 'UNIF':
                    if len(glob.glob(args.bgfile)) <= 0:
                        parser.error('Cannot find the given background file')
                        die(1)
                # end if

                # check pseudocount
                if args.pseudo <= 0:
                    parser.error(
                        'The pseudocount cannot be less than or equal 0')
                    die(1)

                # check threshold
                if args.threshold <= 0 or args.threshold > 1:
                    parser.error('The pvalue threshold must be between 0 and 1')
                    die(1)

                # check q-value flag
                if not isinstance(args.no_qvalue, bool):
                    parser.error(
                        "The --qvalue parameter accepts only True or False as "
                        "values")
                    die(1)

                # check no reverse flag
                if not isinstance(args.no_reverse, bool):
                    parser.error(
                        "The --no-reverse parameter accepts only True or False "
                        "as values")
                    die(1)

                # check text only flag
                if not isinstance(args.text_only, bool):
                    parser.error(
                        "The --text-only parameter accepts only True or False "
                        "values")
                    die(1)

                # check recombinant flag
                if not isinstance(args.recomb, bool):
                    parser.error(
                        "The --recomb parameter accepts only True or False values")
                    die(1)

                # out directory
                if args.out == '':  # default option
                    args.out = DEFAULT_OUTDIR

                # check threshold on q-value flag
                if not isinstance(args.qval_t, bool):
                    parser.error(
                        "The --qvalueT parameter accepts only True or False as values")
                    die(1)

                elif args.no_qvalue and args.qval_t:
                    parser.error(
                        "Cannot apply the threshold on q-values if you don't "
                        "want them")
                    die(1)

                # check the number of graph regions to store as PNG images
                if args.top_graphs < 0:
                    parser.error(
                        "The number of region graphs to show must be positive")
                    die(1)

                workflow = Findmotif(args)

                if args.verbose:
                    end_args_parse = time.time()
                    print("Arguments parsed in %.2fs" %
                          (end_args_parse - start_args_parse))

            # end if
        # end if

        # check that external dependencies are satisfied
        if args.verbose:
            print("Checking GRAFIMO external dependencies " + str(EXT_DEPS))
            start_deps: float = time.time()

        satisfied: bool
        deps_lack: List[str]

        satisfied, deps_lack = check_deps()

        if not satisfied:
            if len(deps_lack) > 0:
                raise DependencyError("\n\nERROR: The following dependencies are not"
                                      " sastisfied: " + str(deps_lack) +
                                      "\nPlease, solve them before running GRAFIMO")

            raise DependencyError("Some dependencies were found, but was not "
                                  "possible to track them.\n"
                                  "Be sure they are available in system PATH")
        # end if

        if args.verbose:
            end_deps: float = time.time()
            print("Dependencies correctly satisfied")
            print("Dependencies checked in %.2fs" % (end_deps - start_deps))

        ################################################################

        # dependency check was ok, so we go to workflow selection:
        #   * creation of the genome variation graph for
        #     each chromosome or a user defined subset of them
        #   * scan of a precomputed VG or a set of precomputed VG

        if isinstance(workflow, BuildVG):
            # build the VG for each chromosome or a user defined subset
            # of them
            buildvg(workflow)

        elif isinstance(workflow, Findmotif):
            # scan a precomputed VG or a set of VGs
            findmotif(workflow)

        else:
            raise ValueError("Unknown arguments object type")
        # end if

        end: float = time.time()  # GRAFIMO execution finishes here

        print("Elapsed time %.2fs" % (end - start))

    except KeyboardInterrupt:
        sigint_handler()
Ejemplo n.º 7
0
def _run_shell_command(cmd):
    """
        Run a command line through the shell and stop with an
        exception when its exit status is not 0
        ----
        Parameters:
            cmd (str) : command line to execute
        ----
        Returns:
            None
        ----
        Raises:
            SubprocessError : if the command exit status is not 0
    """

    code = subprocess.call(cmd, shell=True)

    if code != 0:
        errmsg = ''.join(["\n\nERROR: an error occurred while executing ", cmd, ". Exiting"])
        raise SubprocessError(errmsg)
    # end if


def construct_vg(buildvg_args):
    """
        Create the genome graph, for the reference and VCF file given
        in input by the user.

        The genome is not built as a single whole genome graph but a
        single graph is constructed for each chromosome.
        This choice was made to avoid memory issues and make able
        also the less powerful machines to run GRAFIMO.

        There is NO drawback using this approach wrt
        construct the whole genome graph and query it.

        For each chromosome the VG is built, indexed (VG -> XG) and
        then deleted, so that only the XG is kept in the output
        directory. The VCF is (re)indexed with tabix before starting.
        ----
        Parameters:
            buildvg_args (BuildVG) : container of the arguments needed
                                     to build the genome graphs
                                     (chromosomes, cores, output
                                     directory, verbosity, test flag,
                                     reference genome and VCF)
        ----
        Return:
            None
        ----
        Raises:
            ValueError : if buildvg_args is not a BuildVG object
            SubprocessError : if tabix or rm exit with a non-zero status
            VGException : if the construction or the indexing of a VG
                          fails
    """

    # local import: used to quote the paths placed on the shell command
    # lines, so that paths containing spaces do not break the commands
    import shlex

    if not isinstance(buildvg_args, BuildVG):
        raise ValueError("Unknown arguments object type. Cannot Build the genome variation graph. Exiting")

    # read the arguments to build the VGs
    chroms = buildvg_args.get_chroms()
    threads = buildvg_args.get_cores()
    outdir = buildvg_args.get_outdir()
    verbose = buildvg_args.get_verbose()
    test = buildvg_args.get_test()

    if test:
        reference = get_reference_genome_from_ucsc()
        vcf = get_1000GProject_vcf()

    else:
        reference = buildvg_args.get_reference_genome()
        vcf = buildvg_args.get_vcf()
    # end if

    if verbose:
        print("using reference genome: ", reference)
        print("Using VCF file: ", vcf, "\n\n")
    # end if

    cwd = os.getcwd()

    vcf_name = os.path.basename(vcf)
    tabix_cmd = 'tabix -p vcf {0}'.format(shlex.quote(vcf))

    # check if the VCF file has already been indexed with tabix
    if not tbiexist(vcf):
        msg = ''.join(["TBI file not found for ", vcf_name, ". Indexing the VCF file with tabix..."])
        print(msg)
        _run_shell_command(tabix_cmd)

    else:  # update the indexed VCF
        msg = ''.join(["Reindexing ", vcf_name, "..."])
        print(msg)

        # remove the existing TBI file
        _run_shell_command("rm {0}".format(shlex.quote(''.join([vcf, ".tbi"]))))

        # reindex the VCF
        _run_shell_command(tabix_cmd)
    # end if

    # enter the output directory
    os.chdir(outdir)

    try:
        # build the VG for each chromosome or a user defined
        # subset of them
        for chrom_n in chroms:

            chrom = ''.join(['chr', chrom_n])  # to call vg construct we need both the
                                               # chromosome number and it preceded by 'chr

            vg = chrom + '.vg'

            # build the VG for the current chromosome
            if verbose:
                start_build = time.time()

            code = build_vg(vg, reference, vcf, chrom, chrom_n, threads)
            if code != 0:
                msg = '\n\nERROR: an error occurred during {0} construction. '.format(vg)
                msg += 'Unable to build the VG of the genome using {0} and {1}'.format(reference,
                                                                                       vcf)
                raise VGException(msg)
            # end if

            if verbose:
                end_build = time.time()
                msg = "Elapsed time to build {0} ".format(vg)
                msg = ''.join([msg, str(end_build - start_build), "s"])
                print(msg)
            # end if

            # to query efficiently the VGs we index them (VG -> XG)
            if verbose:
                start_index = time.time()

            msg = ''.join(["Indexing ", vg, '...'])
            print(msg)

            code = indexVG(vg, threads)

            if code != 0:
                errmsg = "\n\nERROR: an error occurred during indexing {0}.\nUnable to index {0}. Exiting".format(vg)
                raise VGException(errmsg)
            # end if

            if verbose:
                end_index = time.time()
                msg = "Elapsed time to index {0} ".format(vg)
                msg = ''.join([msg, str(end_index - start_index), "s"])
                print(msg)
            # end if

            # The majority of applications work only with indexed graph,
            # so to save disk space is worth to delete the VGs and keep
            # only the XGs (is simple to get back using VG built-in functions)
            if verbose:
                print("Deleting {0}".format(vg))

            # the exit status of rm is checked here, so a failed
            # deletion is reported instead of being silently ignored
            _run_shell_command('rm {0}'.format(shlex.quote(vg)))
        # end for

    finally:
        # return to the original working directory, also when the
        # construction stops because of an error
        os.chdir(cwd)
Ejemplo n.º 8
0
def compute_results(
    motif: Motif,
    sequence_loc: str,
    debug: bool,
    args_obj: Optional[Findmotif] = None,
    testmode: Optional[bool] = False,
) -> pd.DataFrame:
    """Score the sequences extracted from the genome variation graph.

    The ``*.tsv`` files found in `sequence_loc` are split in as many chunks
    as the requested cores, and each chunk is handed to a separate process
    running ``score_seqs``. The partial results are then merged, q-values
    are optionally computed, and everything is summarized in a single
    DataFrame.

    Side effects: the working directory is temporarily changed to
    `sequence_loc` while the workers run, and, unless `testmode` is set,
    `sequence_loc` is deleted once scoring is finished.

    Parameters
    ----------
    motif : Motif
        motif object
    sequence_loc : str
        path to the directory storing the extracted sequences
    debug : bool
        trace the full error stack
    args_obj : Findmotif, optional
        commandline arguments container (required unless `testmode`)
    testmode : bool, optional
        test (manually set); when True `args_obj` is ignored, fixed default
        settings are used and `sequence_loc` is not deleted

    Returns
    -------
    pandas.DataFrame
        results

    Notes
    -----
    Invalid arguments, a sequence directory without ``*.tsv`` files, a
    failing directory removal and an empty result set are all reported
    through ``exception_handler``.
    """

    if not isinstance(motif, Motif):
        errmsg = "Expected Motif, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif).__name__),
                          debug)
    if not isinstance(sequence_loc, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError,
                          errmsg.format(type(sequence_loc).__name__), debug)
    if not os.path.isdir(sequence_loc):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(sequence_loc),
                          debug)

    if not testmode:
        if not isinstance(args_obj, Findmotif):
            errmsg = "Expected Findmotif, got {}.\n"
            exception_handler(TypeError,
                              errmsg.format(type(args_obj).__name__), debug)
        cores: int = args_obj.cores
        threshold: float = args_obj.threshold
        no_qvalue: bool = args_obj.noqvalue
        qval_t: bool = args_obj.qvalueT
        no_reverse: bool = args_obj.noreverse
        recomb: bool = args_obj.recomb
        verbose: bool = args_obj.verbose
    else:  # pytest - during normal execution we should never go here
        cores = 1
        threshold = float(1)
        recomb = True
        no_qvalue = False
        qval_t = False
        no_reverse = False
        verbose = False
    assert threshold > 0 and threshold <= 1
    assert cores >= 1

    print_scoring_msg(motif, no_reverse, debug)
    cwd: str = os.getcwd()
    # sequence files are globbed (and handed to the workers) as names
    # relative to the sequence directory
    os.chdir(sequence_loc)
    sequences: List[str] = glob.glob('*.tsv')  # sequences
    if not sequences:
        # with no input cores would drop to 0 and np.array_split would fail
        # with an unrelated error: report the actual problem instead
        os.chdir(cwd)
        errmsg = "No sequence file (*.tsv) found in {}.\n"
        exception_handler(ValueError, errmsg.format(sequence_loc), debug)
    manager: SyncManager = mp.Manager()
    return_dict: DictProxy = manager.dict()  # results
    scanned_nucs_dict: DictProxy = manager.dict()  # scanned nucleotides
    scanned_seqs_dict: DictProxy = manager.dict()  # scanned sequences
    # never start more workers than available sequence files
    cores = min(cores, len(sequences))
    # split the sequence set in no. cores chunks
    sequences_split: List[np.ndarray] = np.array_split(sequences, cores)
    jobs = list()  # jobs list
    proc_finished: int = 0
    # overwrite the default SIGINT handler to exit gracefully
    # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
    # NOTE(review): the original handler is restored right away, before any
    # worker is started, so this pair is effectively a no-op - confirm
    # whether the restore was meant to follow the workers start
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    signal.signal(signal.SIGINT, original_sigint_handler)
    if verbose:
        start_s: float = time.time()
    try:
        for i in range(cores):
            p = mp.Process(target=score_seqs,
                           args=(sequences_split[i], motif, no_reverse,
                                 return_dict, scanned_seqs_dict,
                                 scanned_nucs_dict, i, debug))
            jobs.append(p)
            p.start()
        # print 0% first, otherwise the bar would start from the percentage
        # of the first completed chunk
        printProgressBar(proc_finished,
                         cores,
                         prefix='Progress:',
                         suffix='Complete',
                         length=50)
        for job in jobs:
            job.join()  # sync point
            proc_finished += 1
            printProgressBar(proc_finished,
                             cores,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)
    except KeyboardInterrupt:
        sigint_handler()
        die(2)
    else:
        if verbose:
            end_s: float = time.time()
            print("Scored all sequences in %.2fs" % (end_s - start_s))
    finally:
        # return to the original working directory whatever happened
        os.chdir(cwd)

    if not testmode:
        cmd: str = "rm -rf {}".format(sequence_loc)
        # run rm through an argument list (no shell): the path is passed
        # verbatim even if it contains spaces or shell metacharacters
        code: int = subprocess.call(["rm", "-rf", "--", sequence_loc])
        if code != 0:
            errmsg = "An error occurred while executing {}.\n"
            exception_handler(SubprocessError, errmsg.format(cmd), debug)
    if verbose:
        start_df: float = time.time()
    # recover all analysis results and summarize them in a single
    # data structure
    seqs_scanned: int = 0
    nucs_scanned: int = 0
    summary = ResultTmp()
    # items() fetches the whole shared dictionary in one go
    for key, partialres in return_dict.items():
        # the first ten fields of each partial result feed the summary
        summary.append_list(*(partialres[i] for i in range(10)))
        seqs_scanned += scanned_seqs_dict[key]
        nucs_scanned += scanned_nucs_dict[key]
    if summary.isempty():
        errmsg = "No result retrieved. Unable to proceed. Are you using the correct VGs and searching on the right chromosomes?\n"
        exception_handler(ValueError, errmsg, debug)
    # compute the q-values
    if not no_qvalue:
        if verbose:
            start_q = time.time()
        qvalues = compute_qvalues(summary.pvalues, debug)
        summary.add_qvalues(qvalues)
        if verbose:
            end_q = time.time()
            print("Q-values computed in %.2fs." % (end_q - start_q))
    print("Scanned sequences:\t{}".format(seqs_scanned))
    print("Scanned nucleotides:\t{}".format(nucs_scanned))
    # summarize results in a pandas DataFrame
    finaldf = summary.to_df(motif,
                            threshold,
                            qval_t,
                            recomb,
                            ignore_qvals=no_qvalue)
    if verbose:
        end_df: float = time.time()
        print("\nResults summary built in %.2fs" % (end_df - start_df))

    return finaldf
Ejemplo n.º 9
0
def scale_pwm(motif_matrix, alphabet, motif_width):
    """
        Scale the motif matrix values to non-negative integers.

        Each value is shifted by the floor of the lowest matrix value
        (offset), multiplied by floor(RANGE / (highest - floored lowest))
        (scale factor) and rounded, so the scaled values are never
        greater than RANGE
        ----
        Parameters:
            motif_matrix (pd.DataFrame) : motif matrix to scale, indexed
                                          by nucleotide, with the motif
                                          positions 0..motif_width-1 as
                                          columns
            alphabet (list) : motif alphabet (must be the DNA alphabet)
            motif_width (int) : motif width
        ----
        Returns:
            motif_matrix_sc (pd.DataFrame) : scaled motif matrix
            min_val (int) : lowest value in the scaled motif matrix
            max_val (int) : highest value in the scaled motif matrix
            scale_factor (int) : factor used to scale the values
            offset (float) : value subtracted before scaling (floor of
                             the lowest original value)
        ----
        Raises:
            NoDataFrameException : motif_matrix is not a DataFrame
            NotValidMotifMatrixException : motif_matrix is empty
            NotValidAlphabetException : alphabet is not a list or is not
                                        the DNA alphabet
    """

    if not isinstance(motif_matrix, pd.DataFrame):
        raise NoDataFrameException(
            "The given motif matrix must be an instance of pandas.DataFrame")

    if motif_matrix.empty:
        raise NotValidMotifMatrixException("The given motif matrix is empty")

    if not isinstance(alphabet, list):
        raise NotValidAlphabetException("The alphabet given is not in a list")

    if not isListEqual(alphabet, DNA_ALPHABET):
        raise NotValidAlphabetException(
            "The alphabet given is not a valid DNA alphabet")

    assert motif_width > 0

    min_val = min(motif_matrix.min())
    max_val = max(motif_matrix.max())
    motif_matrix_sc = pd.DataFrame(index=list(motif_matrix.index),
                                   columns=list(motif_matrix.columns),
                                   data=0)

    lower = min_val
    upper = max_val

    if lower == upper:  # all values are equal
        # force a non-empty interval to avoid dividing by zero below
        lower = np.double(upper - 1)

    lower = np.floor(lower)
    offset = lower  # already floored, no further rounding required
    scale_factor = np.floor(RANGE / (upper - lower))

    # values will be in [0, RANGE]
    for nuc in alphabet:
        for j in range(motif_width):
            scaled_score = np.round(
                (motif_matrix.loc[nuc, j] - offset) * scale_factor)
            motif_matrix_sc.loc[nuc, j] = scaled_score
        # end for
    # end for

    # make sure the values are integers
    motif_matrix_sc[:] = motif_matrix_sc[:].astype(int)

    # now they are scaled
    min_val = min(motif_matrix_sc.min())
    max_val = max(motif_matrix_sc.max())

    return motif_matrix_sc, min_val, max_val, int(scale_factor), offset