def get_1000GProject_vcf():
    """
    Download a VCF file from the 1000 Genomes Project FTP site into the
    current directory and return the path to it.

    The file is fetched with `wget -c`, so a partially downloaded file
    left by an interrupted run is continued instead of restarted.

    ----
    Parameters:
        None
    ----
    Returns:
        vcf (str) : absolute path to the downloaded VCF file (.vcf.gz)
    ----
    Raises:
        SubprocessError : if wget exits with a non-zero status
    """

    address = 'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/'
    address += '1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/'
    address += 'ALL.wgs.shapeit2_integrated_snvindels_v2a.GRCh38.27022019.sites.vcf.gz'

    # download the VCF (wget stores it in the current directory)
    cmd = 'wget -c {0}'.format(address)
    code = subprocess.call(cmd, shell=True)
    if code != 0:
        errmsg = ''.join(["\n\nERROR: An error occurred while executing ",
                          cmd, ". Exiting"])
        raise SubprocessError(errmsg)

    # the local file is named after the last component of the URL, so the
    # name is derived from the address rather than repeated as a literal
    vcf = os.path.abspath(os.path.basename(address))

    return vcf
def build_motif_JASPAR(motif_file, bg_file, pseudocount, no_reverse, verbose):
    """
    Build a Motif object starting from the raw counts stored in a JASPAR
    motif file.

    The motif is read with read_JASPAR_motif() and then passed through
    process_motif_for_logodds() to obtain the scoring matrix values.

    ----
    Parameters:
        motif_file (str) : path to the motif file
        bg_file (str) : path to the background file
        pseudocount (float) : value to add to the motif counts (to avoid
                                division by 0); must be > 0
        no_reverse (bool) : flag parameter to consider or not the reverse
                            complement building the Motif object
        verbose (bool) : if True, print the time spent processing the
                            motif
    ----
    Returns:
        motif (Motif) : returns the corresponding Motif object
    ----
    Raises:
        FileNotFoundError : if no motif file is given
        NotValidFFException : if isJaspar_ff() rejects the motif file
        AssertionError : if pseudocount is not greater than 0
    """

    if not motif_file:
        raise FileNotFoundError("\n\nERROR: the motif file is missing")

    # check if the input file is in JASPAR format
    if not isJaspar_ff(motif_file):
        raise NotValidFFException(
            "ERROR: the given motif file is not in JASPAR or MEME format")

    # Raised explicitly (instead of a bare `assert`) so the check is not
    # stripped when Python runs with -O; the exception type is kept for
    # backward compatibility with the previous assert.
    if not pseudocount > 0:
        raise AssertionError("the pseudocount must be greater than 0")

    # read the motif file
    motif = read_JASPAR_motif(motif_file, bg_file, pseudocount, no_reverse,
                              verbose)

    start_mp = time.time()

    # get log-odds values for motif
    motif = process_motif_for_logodds(motif)

    if verbose:
        end_mp = time.time()
        msg = ''.join([
            "Processed motif ", motif.getMotifID(), " in ",
            str(end_mp - start_mp), "s"
        ])
        print(msg)

    return motif
def get_reference_genome_from_ucsc():
    """
    Download the reference genome (hg38 assembly) from the UCSC database
    in the current directory, uncompress it and return the path to it.

    ----
    Parameters:
        None
    ----
    Returns:
        genome (str) : absolute path to the downloaded genome (.fa format)
    ----
    Raises:
        SubprocessError : if wget, gunzip or rm exits with a non-zero
                            status
    """

    def run(cmd):
        # run a shell command and fail loudly on a non-zero exit status
        code = subprocess.call(cmd, shell=True)
        if code != 0:
            errmsg = ''.join(["\n\nERROR: an error occurred while executing ",
                              cmd, ". Exiting"])
            raise SubprocessError(errmsg)

    # download genome (stored in the current directory)
    address = 'ftp://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz'
    run('wget -c {0}'.format(address))

    # decompress genome
    print("Uncompressing the genome...")
    genome_comp = './hg38.fa.gz'
    run('gunzip {0}'.format(genome_comp))

    # remove FASTA.GZ file if still present
    if os.path.exists(genome_comp):
        run('rm {0}'.format(genome_comp))

    # get the path to the genome file
    genome_uncomp = "./hg38.fa"
    genome = os.path.abspath(genome_uncomp)

    return genome
def main(cmdLineargs: Optional[List[str]] = None) -> None:
    """
    Entry point of GRAFIMO.

    The command-line arguments are parsed and checked for consistency
    against the requested workflow ("buildvg" or "findmotif"), the
    external dependencies are checked with check_deps(), and finally the
    selected workflow is executed. A KeyboardInterrupt raised at any point
    is delegated to sigint_handler().

    ----
    Parameters:
        cmdLineargs (list of str) : command-line arguments (workflow name
                                    first); when None, sys.argv[1:] is
                                    used
    ----
    Returns:
        None
    """

    try:
        # starting point of the execution time
        start: float = time.time()

        # read the command-line arguments
        parser: GRAFIMOArgumentParser = get_parser()
        if cmdLineargs is None:
            cmdLineargs = sys.argv[1:]  # get input args

        # no arguments given --> print help
        if len(cmdLineargs) == 0:
            parser.error_noargs()
            die(2)

        # the second argument must be buildvg or findmotif
        if cmdLineargs[0] not in ("-h", "--help", "--version", "buildvg",
                                  "findmotif"):
            parser.error(
                "The second argument must be one between 'buildvg' and 'findmotif'"
            )
            die(1)

        args: argparse.Namespace = parser.parse_args(cmdLineargs)

        if args.verbose:
            print("Parsing arguments...")
            start_args_parse: float = time.time()

        #--------------------------------------------------------------#
        # check commandline arguments consistency
        #

        #---------------------- general options -----------------------#

        # workflow type
        if args.workflow != "buildvg" and args.workflow != "findmotif":
            parser.error("Unexpected workflow given. Available options:\n"
                         "\tbuildvg: construct VG from user data.\n"
                         "\tfindmotif: scan VG for DNA motif(s) occurrences")
            die(1)

        # cpu cores
        if args.cores < 0:
            parser.error("Negative number of CPU cores given")
        elif args.cores == 0 and args.graph_genome:
            # when whole genome variation graph is given, it is safer to
            # use 1 CPU core by default. This because of the space needed
            # to load the whole VG on RAM.
            #
            # CAVEAT: before requiring more CPU cores to be used, be sure
            # your system has enough memory
            args.cores = 1
        elif args.cores == 0:
            # default option -> use all available CPU cores
            args.cores = mp.cpu_count()
        else:  # args.cores > 0
            if args.cores > mp.cpu_count():
                parser.error("Too many CPU cores to use ({})".format(
                    args.cores))

        # verbosity
        if not isinstance(args.verbose, bool):
            parser.error(
                '"--verbose" does not accept any positional argument')

        # debugging
        if not isinstance(args.debug, bool):
            parser.error('"--debug" does not accept any positional argument')

        #---------------------- buildvg options -----------------------#

        buildvg_err_msg: str = 'Unexpected arguments for "grafimo buildvg": "{}"'

        if args.workflow == "buildvg":
            # options belonging to findmotif must be left at their default
            if args.graph_genome_dir:
                parser.error(buildvg_err_msg.format("-d, --genome-graph-dir"))
                die(1)
            elif args.graph_genome:
                parser.error(buildvg_err_msg.format("-g, --genome-graph"))
                die(1)
            elif args.bedfile:
                parser.error(buildvg_err_msg.format("-b, --bedfile"))
                die(1)
            elif args.motif:
                parser.error(buildvg_err_msg.format("-m, --motif"))
                die(1)
            elif args.bgfile != UNIF:  # if default ignored
                parser.error(buildvg_err_msg.format("-k, --bgfile"))
                die(1)
            elif args.pseudo != 0.1:  # if default ignored
                parser.error(buildvg_err_msg.format("-p, --pseudo"))
                die(1)
            elif args.threshold != 1e-4:  # if default ignored
                parser.error(buildvg_err_msg.format("-t, --thresh"))
                die(1)
            elif args.no_qvalue:
                parser.error(buildvg_err_msg.format("-q, --no-qvalue"))
                die(1)
            elif args.no_reverse:
                parser.error(buildvg_err_msg.format("-r, --no-reverse"))
                die(1)
            elif args.text_only:
                parser.error(buildvg_err_msg.format("-f, --text-only"))
                die(1)
            elif args.chroms_find:
                parser.error(buildvg_err_msg.format("--chroms-find"))
                die(1)
            elif args.chroms_prefix_find:
                parser.error(buildvg_err_msg.format("--chroms-prefix-find"))
                die(1)
            elif args.chroms_namemap_find != NOMAP:  # if default ignored
                parser.error(buildvg_err_msg.format("--chroms-namemap-find"))
                die(1)
            elif args.qval_t:
                parser.error(buildvg_err_msg.format("--qvalueT"))
                die(1)
            elif args.recomb:
                parser.error(buildvg_err_msg.format("--recomb"))
                die(1)
            elif args.top_graphs != 0:  # if default ignored
                parser.error(buildvg_err_msg.format("--top-graphs"))
                die(1)
            elif not args.linear_genome:
                parser.error("No reference genome given")
                die(1)
            elif not args.vcf:
                parser.error("No VCF file given")
                die(1)
            else:  # arguments for buildvg are correct
                # reference genome
                genome_ext = args.linear_genome.split('.')[-1]
                if genome_ext != 'fa' and genome_ext != 'fasta':
                    parser.error(
                        "The reference genome file must be in FASTA format")
                    die(1)
                else:
                    if not os.path.isfile(args.linear_genome):
                        parser.error("Unable to find {}".format(
                            args.linear_genome))
                        die(1)
                    if os.stat(args.linear_genome).st_size == 0:  # empty file
                        parser.error("{} seems to be empty.".format(
                            args.linear_genome))
                        die(1)
                    args.linear_genome = os.path.abspath(args.linear_genome)

                # VCF --> the VCF file must have been compressed with
                # bgzip (https://github.com/samtools/tabix)
                #
                # Both suffixes are required (".vcf" followed by ".gz"); the
                # length guard avoids an IndexError on names without a dot
                vcf_parts = args.vcf.split(".")
                if (len(vcf_parts) < 2 or vcf_parts[-1] != "gz"
                        or vcf_parts[-2] != "vcf"):
                    parser.error(
                        "Wrong VCF file given. The VCF file must have been "
                        "compressed with bgzip (e.g. myvcf.vcf.gz)")
                    die(1)
                else:
                    if not os.path.isfile(args.vcf):
                        parser.error('Unable to find {}'.format(args.vcf))
                        die(1)
                    if os.stat(args.vcf).st_size == 0:  # empty file
                        parser.error("{} seems to be empty.".format(args.vcf))
                        die(1)
                    args.vcf = os.path.abspath(args.vcf)

                # chromosome to construct VG
                if len(args.chroms_build) == 0:
                    args.chroms_build = [ALL_CHROMS]  # use all chromosome
                else:
                    if anydup(args.chroms_build):
                        parser.error(
                            "Duplicated chromosome names given to \"--chroms-build\""
                        )

                # chromosome name-map
                if args.chroms_namemap_build != NOMAP:
                    if not os.path.isfile(args.chroms_namemap_build):
                        parser.error("Unable to locate {}".format(
                            args.chroms_namemap_build))
                if (args.chroms_prefix_build
                        and args.chroms_namemap_build != NOMAP):
                    parser.error(
                        "\"--chroms-prefix-build\" and \"chroms-namemap-build\" "
                        "cannot used together. Choose one of those options")

                # if no out directory is specified the VGs are stored in
                # the current directory
                if args.out == "":
                    args.out = os.path.abspath("./")

                workflow = BuildVG(args)

                if args.verbose:
                    end_args_parse: float = time.time()
                    print("Arguments parsed in %.2fs." %
                          (end_args_parse - start_args_parse))

        #---------------------- findmotif options -----------------------#

        findmotif_err_msg: str = 'Unexpected arguments for "grafimo findmotif": "{}"'

        if args.workflow == "findmotif":
            # options belonging to buildvg must be left at their default
            if args.linear_genome:
                parser.error(findmotif_err_msg.format("-l, --linear-genome"))
                die(1)
            elif args.vcf:
                parser.error(findmotif_err_msg.format("-v, --vcf"))
                die(1)
            elif args.chroms_build:
                parser.error(findmotif_err_msg.format("--chroms-build"))
            elif args.chroms_prefix_build:
                parser.error(findmotif_err_msg.format("--chroms-prefix-build"))
            elif args.chroms_namemap_build != NOMAP:
                parser.error(
                    findmotif_err_msg.format("--chroms-namemap-build"))
            elif args.reindex:  # if default ignored
                parser.error(findmotif_err_msg.format("--reindex"))
                die(1)
            elif not args.graph_genome_dir and not args.graph_genome:
                parser.error(
                    "No arguments given for both \"--genome-graph\" and \"--genome-graph-dir\""
                )
                die(1)
            elif not args.bedfile:
                parser.error("No BED file given")
                die(1)
            elif not args.motif:
                parser.error("No motif PWM given")
                die(1)
            else:
                # only one between graph_genome and graph_genome_dir is allowed
                if args.graph_genome and args.graph_genome_dir:
                    parser.error(
                        "Only one argument between \"--genome-graph\" and \"--genome-graph-dir\""
                        " can be used")
                    die(1)

                # genome graph
                if args.graph_genome:
                    graph_ext = args.graph_genome.split('.')[-1]
                    if graph_ext != "xg" and graph_ext != "vg":
                        parser.error(
                            "Unrecognized genome variation graph format. Only "
                            "VG and XG format are allowed")
                        die(1)
                    elif not os.path.isfile(args.graph_genome):
                        parser.error("Unable to locate {}".format(
                            args.graph_genome))
                        die(1)
                    else:
                        # using absolute path avoid potential problems
                        args.graph_genome = os.path.abspath(args.graph_genome)

                # genome graphs directory
                if args.graph_genome_dir:
                    if not os.path.isdir(args.graph_genome_dir):
                        parser.error("Unable to locate {}".format(
                            args.graph_genome_dir))
                        die(1)
                    if len(glob(os.path.join(args.graph_genome_dir,
                                             "*.xg"))) <= 0:
                        parser.error(
                            "No genome variation graph found in {}".format(
                                args.graph_genome_dir))
                        die(1)
                    else:
                        # using absolute path avoid potential problems
                        args.graph_genome_dir = os.path.abspath(
                            args.graph_genome_dir)

                # BED file
                if args.bedfile:
                    if not isbed(args.bedfile, args.debug):
                        parser.error(
                            "The genomic coordinates must be given in UCSC BED files"
                        )
                        die(1)
                    else:
                        if not os.path.isfile(args.bedfile):
                            parser.error("Unable to locate {}".format(
                                args.bedfile))
                else:
                    parser.error("No BED file given")

                # motif pwm
                if not args.motif:
                    parser.error("No motif PWM given")
                else:
                    motifs: List[str] = args.motif
                    for m in motifs:
                        if (not isMEME_ff(m, args.debug)
                                and not isJaspar_ff(m, args.debug)):
                            parser.error(
                                "Unrecognized motif PWM file format. "
                                "{} does not follow the MEME or JASPAR format rules"
                                .format(m))
                            die(1)
                        if not os.path.isfile(m):
                            parser.error("Unable to locate {}".format(m))

                # background file
                if args.bgfile != UNIF:
                    if not os.path.isfile(args.bgfile):
                        parser.error("Unable to locate {}".format(args.bgfile))

                # pseudocount
                if args.pseudo <= 0:
                    parser.error(
                        "Pseudocount values must be > 0, got {}".format(
                            args.pseudo))
                    die(1)

                # statistical significance threshold
                if args.threshold <= 0 or args.threshold > 1:
                    parser.error(
                        "Motif statistical significance threshold must be between 0 and 1"
                    )
                    die(1)

                # q-value flag
                if not isinstance(args.no_qvalue, bool):
                    parser.error(
                        "\"--qvalue\" accepts only True or False values")
                    die(1)

                # no reverse flag
                if not isinstance(args.no_reverse, bool):
                    parser.error(
                        "\"--no-reverse\" accepts only True or False values")
                    die(1)

                # text only flag
                if not isinstance(args.text_only, bool):
                    parser.error(
                        "\"--text-only\" accepts only True or False values")
                    die(1)

                # chromosome to consider during VG scan
                if len(args.chroms_find) == 0:
                    args.chroms_find = [ALL_CHROMS]  # use all chromosome
                else:
                    if anydup(args.chroms_find):
                        parser.error(
                            "Duplicated chromosome names given to \"--chroms-find\""
                        )

                # chromosome name-map
                if args.chroms_namemap_find != NOMAP:
                    if not os.path.isfile(args.chroms_namemap_find):
                        parser.error("Unable to locate {}".format(
                            args.chroms_namemap_find))
                if (args.chroms_prefix_find
                        and args.chroms_namemap_find != NOMAP):
                    parser.error(
                        "\"--chroms-prefix-find\" and \"chroms-namemap-find\" "
                        "cannot used together. Choose one of those options")

                # recomb flag
                if not isinstance(args.recomb, bool):
                    parser.error(
                        "\"--recomb\" accepts only True or False values")
                    die(1)

                # out directory
                if args.out == "":  # default option
                    args.out = DEFAULT_OUTDIR
                    # NOTE(review): looks like a leftover debug print; kept
                    # because it is part of the current stdout output
                    print(args.out)

                # threshold on q-value flag
                if not isinstance(args.qval_t, bool):
                    parser.error(
                        "\"--qvalueT\" accepts only True or False values")
                    die(1)
                elif args.no_qvalue == True and args.qval_t == True:
                    parser.error(
                        "Unable to apply statistical significance threshold on"
                        " q-values if you don't want them")
                    die(1)

                # number of graph regions to store as PNG images
                if args.top_graphs < 0:
                    parser.error("Negative number of regions to display")

                workflow = Findmotif(args)

                if args.verbose:
                    end_args_parse: float = time.time()
                    print("Arguments parsed in %.2fs." %
                          (end_args_parse - start_args_parse))

        # check that external dependencies are satisfied
        if args.verbose:
            sys.stderr.write(
                "Checking GRAFIMO external dependencies {}\n".format(EXT_DEPS))
            start_deps: float = time.time()

        satisfied: bool
        deps_lack: List[str]
        satisfied, deps_lack = check_deps()

        if not satisfied and len(deps_lack) > 0:
            errmsg = "Some dependencies are not satisfied: {}.\nPlease solve them before running GRAFIMO.\n"
            exception_handler(DependencyError, errmsg.format(deps_lack),
                              args.debug)
        elif not satisfied and len(deps_lack) <= 0:
            errmsg = "Dependencies satisfied, but unable to recover them.\n Be sure they are in system PATH.\n"
            exception_handler(DependencyError, errmsg, args.debug)

        if args.verbose and satisfied:
            end_deps: float = time.time()
            print("Dependencies satisfied.")
            print("Dependencies checked in %.2fs." % (end_deps - start_deps))

        #---------------------------------------------------------------
        # dependency check was ok, so we go to workflow selection:
        #   * construction of the genome variation graph for
        #     each chromosome or a user defined subset of them
        #   * scan of a precomputed VG or a set of precomputed VG

        if isinstance(workflow, BuildVG):
            buildvg(workflow, args.debug)
        elif isinstance(workflow, Findmotif):
            findmotif(workflow, args.debug)
        else:
            errmsg = "Expected BuildVG or Findmotif, got {}.\n"
            exception_handler(TypeError,
                              errmsg.format(type(workflow).__name__),
                              args.debug)

        end: float = time.time()  # GRAFIMO execution finishes here
        print("Elapsed time %.2fs." % (end - start))

    except KeyboardInterrupt:
        sigint_handler()
def main(cmdLineargs=None):
    """
    Main function of GRAFIMO.

    The arguments given in input are checked for consistency against the
    requested workflow ("buildvg" or "findmotif"), the external
    dependencies are checked with check_deps(), and then the selected
    workflow is executed. A KeyboardInterrupt raised at any point is
    delegated to sigint_handler().

    ----
    Parameters:
        cmdLineargs (list of str) : command-line arguments (workflow name
                                    first); when None, sys.argv[1:] is
                                    used
    ----
    Returns:
        None
    ----
    Raises:
        DependencyError : if check_deps() reports unsatisfied dependencies
        ValueError : if the workflow object is neither a BuildVG nor a
                        Findmotif instance
    """

    try:
        # starting point of the execution time
        start = time.time()

        # read the command-line arguments
        parser = get_AP()

        if cmdLineargs is None:
            cmdLineargs = sys.argv[1:]  # take input args

        # the second argument must be buildvg or findmotif; an empty
        # argument list is reported the same way instead of failing with
        # an IndexError on cmdLineargs[0]
        if (not cmdLineargs
                or cmdLineargs[0] not in ("-h", "--help", "--version",
                                          "buildvg", "findmotif")):
            parser.error("The second argument must be one between 'buildvg' and 'findmotif'")
            die(1)

        args = parser.parse_args(cmdLineargs)  # parse args

        if args.verbose:
            print("Parsing arguments...")
            start_args_parse = time.time()

        #####################################################################
        # check arguments consistency
        #####################################################################

        if args.workflow != "buildvg" and args.workflow != "findmotif":
            parser.error("Do not know what to do. Available options: create VGs with 'grafimo buildvg' or scan a "
                         "precomputed genome variation graph with 'grafimo findmotif'")
            die(1)

        # cores (shared by the two workflows)
        if args.cores < 0:
            parser.error("The number of cores cannot be negative")
        elif args.cores == 0 and args.graph_genome:
            # to query a whole genome graph it is loaded into RAM; since
            # these graphs are usually very heavy in terms of bytes, it is
            # safer to use 1 thread by default, otherwise it would be
            # loaded #cores times. If you want to use more cores, be sure
            # your system can handle the resulting amount of data
            args.cores = 1
        elif args.cores == 0:
            args.cores = mp.cpu_count()  # by default take all the available CPUs

        # check verbose flag
        if not isinstance(args.verbose, bool):
            parser.error('The --verbose parameter accepts only True or False values')

        # chromosomes check (shared by the two workflows)
        for c in args.chroms:
            if c not in CHROMS_LIST:
                parser.error("Invalid chromosome")
        args.chroms = initialize_chroms_list(args.chroms)

        # checks for buildvg workflow
        if args.workflow == "buildvg":
            # findmotif-only options must be left at their default value;
            # the conditions are evaluated lazily, in the listed order
            if (args.graph_genome_dir
                    or args.graph_genome
                    or args.bedfile
                    or args.motif
                    or args.bgfile != 'UNIF'
                    or args.pseudo != 0.1
                    or args.threshold != 1e-4
                    or args.no_qvalue
                    or args.no_reverse
                    or args.text_only
                    or args.qval_t
                    or args.top_graphs != 0):
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)
            elif not args.linear_genome:
                parser.error("No reference genome given")
                die(1)
            elif not args.vcf:
                parser.error("No VCF file given")
                die(1)
            else:
                # check linear genome
                genome_ext = args.linear_genome.split('.')[-1]
                if genome_ext != 'fa' and genome_ext != 'fasta':
                    parser.error('The linear genome must be in FASTA format (FASTA and FA extensions allowed)')
                    die(1)
                else:
                    if len(glob.glob(args.linear_genome)) != 1:
                        parser.error('Cannot find the given reference genome file')
                        die(1)
                    args.linear_genome = os.path.abspath(args.linear_genome)

                # check VCF: allow only compressed VCF files. The length
                # guard avoids an IndexError on names without a dot
                vcf_parts = args.vcf.split('.')
                if (len(vcf_parts) < 2
                        or (vcf_parts[-1] != 'gz' and vcf_parts[-1] != 'zip')
                        or vcf_parts[-2] != 'vcf'):
                    parser.error('Incorrect VCF file given: the VCF must be compressed (e.g. myvcf.vcf.gz)')
                    die(1)
                else:
                    if len(glob.glob(args.vcf)) <= 0:
                        parser.error('Cannot find the given VCF file')
                        die(1)
                    args.vcf = os.path.abspath(args.vcf)

                # by default the built VGs will be stored in the current directory
                if args.out == "grafimo_out":  # general default value
                    args.out = os.path.abspath("./")

                workflow = BuildVG(args)

                if args.verbose:
                    end_args_parse = time.time()
                    print(''.join(["Arguments parsed in ",
                                   str(end_args_parse - start_args_parse), "s"]))

        # checks for findmotif workflow
        if args.workflow == "findmotif":
            if args.linear_genome:
                parser.error("Invalid arguments for grafimo findmotif")
                die(1)
            elif args.vcf:
                parser.error("Invalid arguments for grafimo findmotif")
                die(1)
            elif not args.graph_genome_dir and not args.graph_genome:
                parser.error("No genome variation graph or directory containing them given")
                die(1)
            elif not args.bedfile:
                parser.error("No BED file given")
                die(1)
            elif not args.motif:
                parser.error("No motif file (MEME of JASPAR format) given")
                die(1)
            else:
                # only one between graph_genome and graph_genome_dir allowed
                if args.graph_genome and args.graph_genome_dir:
                    parser.error("Invalid arguments for grafimo findmotif")
                    die(1)

                # check graph_genome
                if args.graph_genome:
                    graph_ext = args.graph_genome.split('.')[-1]
                    if graph_ext != 'xg' and graph_ext != 'vg':
                        parser.error("Cannot use the given genome variation graph (only VG or XG format allowed)")
                        die(1)
                    elif not os.path.isfile(args.graph_genome):
                        parser.error("Unable to find the given variation genome graph")
                        die(1)
                    else:
                        # safer to use absolute path
                        args.graph_genome = os.path.abspath(args.graph_genome)

                # check graph_genome_dir
                if args.graph_genome_dir:
                    if not os.path.isdir(args.graph_genome_dir):
                        parser.error("Cannot find the given directory containing the genome variation graphs")
                        die(1)

                    # make sure the directory ends with '/' before globbing
                    if args.graph_genome_dir[-1] == '/':
                        graph_genome_dir = args.graph_genome_dir
                    else:
                        graph_genome_dir = ''.join([args.graph_genome_dir, '/'])

                    if len(glob.glob(graph_genome_dir + '*.xg')) <= 0:
                        parser.error(' '.join(['No XG genome variation graph found in',
                                               graph_genome_dir]))
                        die(1)
                    else:
                        graph_genome_dir = os.path.abspath(graph_genome_dir)
                        args.graph_genome_dir = graph_genome_dir

                # check BED file
                if args.bedfile:
                    if args.bedfile.split('.')[-1] != 'bed':
                        parser.error('Incorrect BED file given')
                        die(1)
                    else:
                        bedfile = args.bedfile
                        if len(glob.glob(bedfile)) <= 0:
                            parser.error('Cannot find the given BED file')
                else:
                    parser.error('No BED file given')

                # check motif file
                if not args.motif:
                    parser.error('No motif given')
                else:
                    motifs = args.motif

                    # check if the given motifs exist
                    for m in motifs:
                        if not isMEME_ff(m) and not isJaspar_ff(m):
                            parser.error("Unrecognized motif file format (only MEME or JASPAR allowed)")
                            die(1)
                        if len(glob.glob(m)) <= 0:
                            parser.error('Cannot find motif file: ' + m)
                            die(1)

                # check background file
                if args.bgfile != 'UNIF':
                    bgfile = args.bgfile  # we have a path to a bg file
                    if len(glob.glob(bgfile)) <= 0:
                        parser.error('Cannot find the given background file')
                        die(1)

                # check pseudocount
                if args.pseudo <= 0:
                    parser.error('The pseudocount cannot be less than or equal 0')
                    die(1)

                # check threshold
                if args.threshold <= 0 or args.threshold > 1:
                    parser.error('The pvalue threshold must be between 0 and 1')
                    die(1)

                # check q-value flag
                if not isinstance(args.no_qvalue, bool):
                    parser.error('The --qvalue parameter accepts only True or False as values')
                    die(1)

                # check no reverse flag
                if not isinstance(args.no_reverse, bool):
                    parser.error('The --no-reverse parameter accepts only True or False as values')
                    die(1)

                # check text only flag
                if not isinstance(args.text_only, bool):
                    parser.error('The --text-only parameter accepts only True or False values')
                    die(1)

                # out directory
                if args.out == 'grafimo_out':  # default option
                    # to make unique the output directory we add the PID
                    # to the name.
                    #
                    # This is useful when calling grafimo in different runs
                    # on the same machine.
                    args.out = ''.join([args.out, '_', str(os.getpid())])

                # check threshold on q-value flag
                if not isinstance(args.qval_t, bool):
                    parser.error("The --qvalueT parameter accepts only True or False as values")
                    die(1)
                elif args.no_qvalue == True and args.qval_t == True:
                    parser.error("Cannot apply the threshold on q-values if you don't want them")
                    die(1)

                # check the number of graph regions to store as PNG images
                if args.top_graphs < 0:
                    parser.error("The number of region graphs to show must be positive")

                workflow = Findmotif(args)

                if args.verbose:
                    end_args_parse = time.time()
                    print(''.join(["Arguments parsed in ",
                                   str(end_args_parse - start_args_parse), "s"]))

        # check that external dependencies are satisfied
        if args.verbose:
            print("Checking GRAFIMO external dependencies " + str(EXT_DEPS))
            start_deps = time.time()

        satisfied, deps_lack = check_deps()

        if not satisfied and len(deps_lack) > 0:
            raise DependencyError("\n\nERROR: The following dependencies are not sastisfied: " +
                                  str(deps_lack) +
                                  "\nPlease, solve them before running GRAFIMO")
        elif not satisfied and len(deps_lack) <= 0:
            raise DependencyError("Some dependencies were found, but was not possible to track them."
                                  "\nBe sure they are available in system PATH")

        if args.verbose and satisfied:
            end_deps = time.time()
            print("Dependencies correctly satisfied")
            print(''.join(["Dependencies checked in ",
                           str(end_deps - start_deps), "s"]))

        #####################################################################
        # dependency check was ok, so we go to workflow selection:
        #   - creation of the genome variation graph for each chromosome
        #     or a user defined subset of them
        #   - scan of a precomputed VG or a set of precomputed VG

        if isinstance(workflow, BuildVG):
            # build the VG for each chromosome or a user defined subset of them
            buildvg(workflow)
        elif isinstance(workflow, Findmotif):
            # scan a precomputed VG or a set of VGs
            findmotif(workflow)
        else:
            raise ValueError("Unknown arguments object type")

        end = time.time()  # GRAFIMO execution finishes here

        print(''.join(["\nElapsed time: ", str(end - start), "s"]))

    except KeyboardInterrupt:
        sigint_handler()
def main(cmdLineargs: Optional[List[str]] = None) -> None:
    """Run GRAFIMO from the command line.

    The function parses the command-line arguments, checks their
    consistency for the requested workflow, verifies the external
    dependencies through ``check_deps()`` and finally dispatches to
    ``buildvg()`` or ``findmotif()``. The total elapsed time is printed
    at the end.

    ----
    Parameters:
        cmdLineargs (list, optional) : command-line arguments; when None
            ``sys.argv[1:]`` is used
    ----
    Returns:
        None
    ----
    Raises:
        DependencyError : if ``check_deps()`` reports unsatisfied
            dependencies
        ValueError : if the workflow arguments object is neither a
            BuildVG nor a Findmotif instance

    A KeyboardInterrupt raised during the execution is handled by
    ``sigint_handler()``.
    """

    try:
        # starting point of the execution time
        start: float = time.time()

        # read the command-line arguments
        parser: GRAFIMOArgumentParser = get_parser()

        def fail(msg: str) -> None:
            # Report an invalid argument through the parser and stop.
            # die(1) always follows parser.error(), so every validation
            # failure ends the run in the same way.
            parser.error(msg)
            die(1)

        if cmdLineargs is None:
            cmdLineargs = sys.argv[1:]  # take input args

        # no argument given
        if len(cmdLineargs) == 0:
            parser.error_noargs()
            die(1)

        # the second argument must be buildvg or findmotif
        if cmdLineargs[0] not in ("-h", "--help", "--version", "buildvg",
                                  "findmotif"):
            fail(
                "The second argument must be one between 'buildvg' and 'findmotif'")

        args: argparse.Namespace = parser.parse_args(cmdLineargs)

        if args.verbose:
            print("Parsing arguments...")
            start_args_parse: float = time.time()

        ################################################################
        # check arguments consistency
        ################################################################

        if args.workflow != "buildvg" and args.workflow != "findmotif":
            fail("Do not know what to do. Available options: create VGs "
                 "with 'grafimo buildvg' or scan a precomputed genome "
                 "variation graph with 'grafimo findmotif'")

        # cores (shared by the two workflows)
        if args.cores < 0:
            fail("The number of cores cannot be negative")
        elif args.cores == 0 and args.graph_genome:
            # to query a whole genome graph is loaded into RAM, since
            # usually they are very heavy in terms of bytes is safer to
            # use 1 thread by default, otherwise it would be loaded
            # #cores times. If you want use more cores, be sure your
            # system can handle the resulting amount of data
            args.cores = 1
        elif args.cores == 0:
            # by default take all the available CPUs
            args.cores = mp.cpu_count()
        # end if

        # check verbose flag
        if not isinstance(args.verbose, bool):
            fail('The --verbose parameter accepts only True or False values')

        # chromosomes check (shared by the two workflows)
        if len(args.chroms) == 0:
            args.chroms = ['ALL_CHROMS']

        buildvg_err_msg = "Invalid arguments for grafimo buildvg"

        # checks for buildvg workflow
        if args.workflow == "buildvg":
            # any findmotif-only option set (or moved away from its
            # default value) is an error for buildvg
            findmotif_option_given = (
                args.graph_genome_dir
                or args.graph_genome
                or args.bedfile
                or args.motif
                or args.bgfile != 'UNIF'
                or args.pseudo != 0.1
                or args.threshold != 1e-4
                or args.no_qvalue
                or args.no_reverse
                or args.text_only
                or args.qval_t
                or args.recomb
                or args.top_graphs != 0
            )

            if findmotif_option_given:
                fail(buildvg_err_msg)
            elif not args.linear_genome:
                fail("No reference genome given")
            elif not args.vcf:
                fail("No VCF file given")
            else:
                # check linear genome
                genome_ext = args.linear_genome.split('.')[-1]
                if genome_ext != 'fa' and genome_ext != 'fasta':
                    fail("The linear genome must be in FASTA format (FASTA and "
                         "FA extensions allowed)")
                else:
                    if len(glob.glob(args.linear_genome)) != 1:
                        fail('Cannot find the given reference genome file')
                    args.linear_genome = os.path.abspath(args.linear_genome)
                # end if

                # check VCF --> the VCF must have been compressed with
                # bgzip (https://github.com/samtools/tabix)
                # the length check avoids an IndexError on names without
                # two dot-separated suffixes
                vcf_suffixes = args.vcf.split('.')[-2:]
                if (len(vcf_suffixes) < 2
                        or vcf_suffixes[1] not in ('gz', 'zip')
                        or vcf_suffixes[0] != 'vcf'):
                    fail("Incorrect VCF file given: the VCF must be compressed "
                         "(e.g. myvcf.vcf.gz)")
                else:
                    if len(glob.glob(args.vcf)) <= 0:
                        fail('Cannot find the given VCF file')
                    args.vcf = os.path.abspath(args.vcf)

                # by default the built VGs will be stored in the current
                # directory
                if args.out == "":  # general default value
                    args.out = os.path.abspath("./")

                workflow = BuildVG(args)

                if args.verbose:
                    end_args_parse: float = time.time()
                    print("Arguments parsed in %.2fs" %
                          (end_args_parse - start_args_parse))
                # end if
            # end if

        findmotif_err_msg = "Invalid arguments for grafimo findmotif"

        # checks for findmotif workflow
        if args.workflow == "findmotif":
            if args.linear_genome or args.vcf or args.reindex:
                # buildvg-only options
                fail(findmotif_err_msg)
            elif not args.graph_genome_dir and not args.graph_genome:
                fail(
                    "No genome variation graph or directory containing them given")
            elif not args.bedfile:
                fail("No BED file given")
            elif not args.motif:
                fail("No motif file (MEME of JASPAR format) given")
            else:
                # only one between graph_genome and graph_genome_dir
                # is allowed
                if args.graph_genome and args.graph_genome_dir:
                    fail(findmotif_err_msg)

                # check graph_genome
                if args.graph_genome:
                    graph_ext = args.graph_genome.split('.')[-1]
                    if graph_ext != 'xg' and graph_ext != 'vg':
                        fail("Cannot use the given genome variation graph (only "
                             "VG or XG format allowed)")
                    elif not os.path.isfile(args.graph_genome):
                        fail("Unable to find the given variation genome graph")
                    else:
                        # it is safer to use absolute path to avoid bugs
                        args.graph_genome = os.path.abspath(args.graph_genome)
                    # end if
                # end if

                # check graph_genome_dir
                if args.graph_genome_dir:
                    if not os.path.isdir(args.graph_genome_dir):
                        fail("Cannot find the given directory containing the "
                             "genome variation graphs")

                    if args.graph_genome_dir.endswith('/'):
                        graph_genome_dir = args.graph_genome_dir
                    else:
                        graph_genome_dir = ''.join([args.graph_genome_dir, '/'])
                    # end if

                    if len(glob.glob(graph_genome_dir + '*.xg')) <= 0:
                        fail(' '.join(['No XG genome variation graph found in',
                                       graph_genome_dir]))
                    else:
                        args.graph_genome_dir = os.path.abspath(graph_genome_dir)
                    # end if
                # end if

                # check BED file (its presence was verified above)
                if args.bedfile.split('.')[-1] != 'bed':
                    fail('Incorrect BED file given')
                elif len(glob.glob(args.bedfile)) <= 0:
                    fail('Cannot find the given BED file')
                # end if

                # check motif files (their presence was verified above)
                for m in args.motif:
                    if not isMEME_ff(m) and not isJaspar_ff(m):
                        fail("Unrecognized motif file format (only MEME or JASPAR allowed)")
                    if len(glob.glob(m)) <= 0:
                        fail('Cannot find motif file: ' + m)
                # end for

                # check background file
                if args.bgfile != 'UNIF':
                    if len(glob.glob(args.bgfile)) <= 0:
                        fail('Cannot find the given background file')
                # end if

                # check pseudocount
                if args.pseudo <= 0:
                    fail('The pseudocount cannot be less than or equal 0')

                # check threshold
                if args.threshold <= 0 or args.threshold > 1:
                    fail('The pvalue threshold must be between 0 and 1')

                # check q-value flag
                if not isinstance(args.no_qvalue, bool):
                    fail("The --qvalue parameter accepts only True or False as "
                         "values")

                # check no reverse flag
                if not isinstance(args.no_reverse, bool):
                    fail("The --no-reverse parameter accepts only True or False "
                         "as values")

                # check text only flag
                if not isinstance(args.text_only, bool):
                    fail("The --text-only parameter accepts only True or False "
                         "values")

                # check recombinant flag
                if not isinstance(args.recomb, bool):
                    fail("The --recomb parameter accepts only True or False values")

                # out directory
                if args.out == '':  # default option
                    args.out = DEFAULT_OUTDIR

                # check threshold on q-value flag
                if not isinstance(args.qval_t, bool):
                    fail("The --qvalueT parameter accepts only True or False as values")
                elif args.no_qvalue and args.qval_t:
                    fail("Cannot apply the threshold on q-values if you don't "
                         "want them")

                # check the number of graph regions to store as PNG images
                if args.top_graphs < 0:
                    fail("The number of region graphs to show must be positive")

                workflow = Findmotif(args)

                if args.verbose:
                    end_args_parse: float = time.time()
                    print("Arguments parsed in %.2fs" %
                          (end_args_parse - start_args_parse))
                # end if
            # end if

        # check that external dependencies are satisfied
        if args.verbose:
            print("Checking GRAFIMO external dependencies " + str(EXT_DEPS))
            start_deps: float = time.time()

        satisfied: bool
        deps_lack: List[str]
        satisfied, deps_lack = check_deps()

        if not satisfied:
            if len(deps_lack) > 0:
                raise DependencyError("\n\nERROR: The following dependencies are not"
                                      " sastisfied: " + str(deps_lack) +
                                      "\nPlease, solve them before running GRAFIMO")
            raise DependencyError("Some dependencies were found, but was not "
                                  "possible to track them.\n"
                                  "Be sure they are available in system PATH")
        # end if

        if args.verbose and satisfied:
            end_deps: float = time.time()
            print("Dependencies correctly satisfied")
            print("Dependencies checked in %.2fs" % (end_deps - start_deps))

        ################################################################
        # dependency check was ok, so we go to workflow selection:
        #   * creation of the genome variation graph for
        #     each chromosome or a user defined subset of them
        #   * scan of a precomputed VG or a set of precomputed VG

        if isinstance(workflow, BuildVG):
            # build the VG for each chromosome or a user defined subset
            # of them
            buildvg(workflow)
        elif isinstance(workflow, Findmotif):
            # scan a precomputed VG or a set of VGs
            findmotif(workflow)
        else:
            raise ValueError("Unknown arguments object type")
        # end if

        end: float = time.time()  # GRAFIMO execution finishes here
        print("Elapsed time %.2fs" % (end - start))

    except KeyboardInterrupt:
        sigint_handler()
def construct_vg(buildvg_args):
    """ Create the genome graphs from the reference genome and the VCF
        file given in input by the user.

        The genome is not built as a single whole genome graph: a
        separate graph is constructed (and then indexed) for each
        chromosome. This choice was made to avoid memory issues and to
        allow also the less powerful machines to run GRAFIMO.

        For every chromosome the VG is built with build_vg(), indexed
        with indexVG() and then the VG file is removed, keeping only the
        index. The VCF is (re)indexed with tabix before building.
        The graphs are written in the output directory stored in
        buildvg_args; the original working directory is restored when
        the function ends, also if an error occurred.
        ----
        Parameters:
            buildvg_args (BuildVG) : container of the arguments used to
                                     build the genome variation graphs
        ----
        Return:
            None
        ----
        Raises:
            ValueError : if buildvg_args is not a BuildVG instance
            SubprocessError : if tabix or rm fail
            VGException : if the construction or the indexing of a
                          graph fails
    """

    if not isinstance(buildvg_args, BuildVG):
        raise ValueError("Unknown arguments object type. Cannot Build the genome variation graph. Exiting")

    def run_checked(argv):
        # Run an external command given as argument list (no shell, so
        # paths containing spaces or shell metacharacters are passed
        # verbatim) and raise SubprocessError if it fails.
        errmsg = ''.join(["\n\nERROR: an error occurred while executing ",
                          ' '.join(argv), ". Exiting"])
        try:
            code = subprocess.call(argv)
        except OSError as err:
            # e.g. executable not found: keep the same exception type
            # raised for a non-zero exit status
            raise SubprocessError(errmsg) from err
        if code != 0:
            raise SubprocessError(errmsg)

    # read the arguments to build the VGs
    chroms = buildvg_args.get_chroms()
    threads = buildvg_args.get_cores()
    outdir = buildvg_args.get_outdir()
    verbose = buildvg_args.get_verbose()
    test = buildvg_args.get_test()

    if test:
        reference = get_reference_genome_from_ucsc()
        vcf = get_1000GProject_vcf()
    else:
        reference = buildvg_args.get_reference_genome()
        vcf = buildvg_args.get_vcf()
    # end if

    if verbose:
        print("using reference genome: ", reference)
        print("Using VCF file: ", vcf, "\n\n")
    # end if

    cwd = os.getcwd()
    vcf_name = vcf.split('/')[-1]

    # check if the VCF file has already been indexed with tabix
    if tbiexist(vcf):
        # update the indexed VCF: remove the existing TBI file first
        print(''.join(["Reindexing ", vcf_name, "..."]))
        run_checked(["rm", ''.join([vcf, ".tbi"])])
    else:
        print(''.join(["TBI file not found for ", vcf_name,
                       ". Indexing the VCF file with tabix..."]))
    # end if

    # (re)index the VCF
    run_checked(["tabix", "-p", "vcf", vcf])

    # enter the output directory
    os.chdir(outdir)

    try:
        # build the VG for each chromosome or a user defined
        # subset of them
        for chrom_n in chroms:
            # to call vg construct we need both the chromosome number
            # and the chromosome number preceded by 'chr'
            chrom = ''.join(['chr', chrom_n])
            vg = chrom + '.vg'

            # build the VG for the current chromosome
            if verbose:
                start_build = time.time()

            code = build_vg(vg, reference, vcf, chrom, chrom_n, threads)
            if code != 0:
                msg = '\n\nERROR: an error occurred during {0} construction. '.format(vg)
                msg += 'Unable to build the VG of the genome using {0} and {1}'.format(reference, vcf)
                raise VGException(msg)
            # end if

            if verbose:
                end_build = time.time()
                msg = "Elapsed time to build {0} ".format(vg)
                msg = ''.join([msg, str(end_build - start_build), "s"])
                print(msg)
            # end if

            # to query efficiently the VGs we index them (VG -> XG)
            if verbose:
                start_index = time.time()
                print(''.join(["Indexing ", vg, '...']))

            code = indexVG(vg, threads)
            if code != 0:
                errmsg = "\n\nERROR: an error occurred during indexing {0}.\nUnable to index {0}. Exiting".format(vg)
                raise VGException(errmsg)
            # end if

            if verbose:
                end_index = time.time()
                msg = "Elapsed time to index {0} ".format(vg)
                msg = ''.join([msg, str(end_index - start_index), "s"])
                print(msg)
            # end if

            # The majority of applications work only with indexed graph,
            # so to save disk space is worth to delete the VGs and keep
            # only the XGs (is simple to get back using VG built-in
            # functions)
            if verbose:
                print("Deleting {0}".format(vg))

            # the exit status of rm is now checked (previously the stale
            # return code of the indexing step was tested instead)
            run_checked(["rm", vg])
        # end for
    finally:
        # return to the original working directory
        os.chdir(cwd)
def compute_results(
    motif: Motif,
    sequence_loc: str,
    debug: bool,
    args_obj: Optional[Findmotif] = None,
    testmode: Optional[bool] = False,
) -> pd.DataFrame:
    """Score the sequences extracted from the genome variation graph.

    The sequence files (``*.tsv``) found in `sequence_loc` are split in
    chunks, each chunk is scored by ``score_seqs`` in a separate
    process, and the partial results are merged in a single report. If
    requested, q-values are computed from the collected P-values.

    ...

    Parameters
    ----------
    motif : Motif
        motif object
    sequence_loc : str
        path to the directory storing the extracted sequences
    debug : bool
        trace the full error stack
    args_obj : Findmotif, optional
        commandline arguments container
    testmode : bool, optional
        test (manually set)

    Returns
    -------
    pandas.DataFrame
        results
    """

    # ---- input checks
    if not isinstance(motif, Motif):
        exception_handler(
            TypeError,
            "Expected Motif, got {}.\n".format(type(motif).__name__),
            debug,
        )
    if not isinstance(sequence_loc, str):
        exception_handler(
            TypeError,
            "Expected str, got {}.\n".format(type(sequence_loc).__name__),
            debug,
        )
    if not os.path.isdir(sequence_loc):
        exception_handler(
            FileNotFoundError,
            "Unable to locate {}.\n".format(sequence_loc),
            debug,
        )

    # ---- scoring options
    if testmode:
        # pytest - during normal execution we should never go here
        cores = 1
        threshold = float(1)
        recomb = True
        no_qvalue = False
        qval_t = False
        no_reverse = False
        verbose = False
    else:
        if not isinstance(args_obj, Findmotif):
            exception_handler(
                TypeError,
                "Expected Findmotif, got {}.\n".format(type(args_obj).__name__),
                debug,
            )
        cores = args_obj.cores
        threshold = args_obj.threshold
        no_qvalue = args_obj.noqvalue
        qval_t = args_obj.qvalueT
        no_reverse = args_obj.noreverse
        recomb = args_obj.recomb
        verbose = args_obj.verbose

    assert threshold > 0 and threshold <= 1
    assert cores >= 1

    print_scoring_msg(motif, no_reverse, debug)

    # work inside the directory storing the sequences
    cwd = os.getcwd()
    os.chdir(sequence_loc)

    # containers shared with the worker processes
    manager = mp.Manager()
    return_dict = manager.dict()  # results
    scanned_nucs_dict = manager.dict()  # scanned nucleotides
    scanned_seqs_dict = manager.dict()  # scanned sequences

    sequences = glob.glob('*.tsv')  # sequences
    # never use more processes than sequence files
    cores = min(cores, len(sequences))
    # split the sequence set in no. cores chunks
    seq_chunks = np.array_split(sequences, cores)

    workers = []  # jobs list

    # overwrite the default SIGINT handler to exit gracefully
    # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    signal.signal(signal.SIGINT, original_sigint_handler)

    if verbose:
        start_s = time.time()

    try:
        for chunk_id, chunk in enumerate(seq_chunks):
            worker = mp.Process(
                target=score_seqs,
                args=(chunk, motif, no_reverse, return_dict,
                      scanned_seqs_dict, scanned_nucs_dict, chunk_id, debug),
            )
            workers.append(worker)
            worker.start()

        # print 0% first, otherwise the bar would start from the
        # percentage of the first completed chunk
        printProgressBar(
            0, cores, prefix='Progress:', suffix='Complete', length=50)
        for finished, worker in enumerate(workers, start=1):
            worker.join()  # sync point
            printProgressBar(
                finished, cores, prefix='Progress:', suffix='Complete',
                length=50)
    except KeyboardInterrupt:
        sigint_handler()
        die(2)
    else:
        if verbose:
            end_s = time.time()
            print("Scored all sequences in %.2fs" % (end_s - start_s))

        os.chdir(cwd)

        if not testmode:
            # remove the directory storing the scored sequences
            cmd = "rm -rf {}".format(sequence_loc)
            if subprocess.call(cmd, shell=True) != 0:
                exception_handler(
                    SubprocessError,
                    "An error occurred while executing {}.\n".format(cmd),
                    debug,
                )

        if verbose:
            start_df = time.time()

        # recover all analysis results and summarize them in a single
        # data structure
        seqs_scanned = 0
        nucs_scanned = 0
        summary = ResultTmp()
        for key in return_dict.keys():
            partial = return_dict[key]
            # the first ten fields of each partial result feed the summary
            summary.append_list(*(partial[field] for field in range(10)))
            seqs_scanned += scanned_seqs_dict[key]
            nucs_scanned += scanned_nucs_dict[key]

        if summary.isempty():
            exception_handler(
                ValueError,
                "No result retrieved. Unable to proceed. Are you using the correct VGs and searching on the right chromosomes?\n",
                debug,
            )

        # compute the q-values
        if not no_qvalue:
            if verbose:
                start_q = time.time()
            summary.add_qvalues(compute_qvalues(summary.pvalues, debug))
            if verbose:
                end_q = time.time()
                print("Q-values computed in %.2fs." % (end_q - start_q))

        print("Scanned sequences:\t{}".format(seqs_scanned))
        print("Scanned nucleotides:\t{}".format(nucs_scanned))

        # summarize results in a pandas DataFrame
        finaldf = summary.to_df(
            motif, threshold, qval_t, recomb, ignore_qvals=no_qvalue)

        if verbose:
            end_df = time.time()
            print("\nResults summary built in %.2fs" % (end_df - start_df))

        return finaldf
def scale_pwm(motif_matrix, alphabet, motif_width):
    """ Scale the motif matrix values to integers.

        Every score is shifted by an offset (the floor of the lowest
        matrix value), multiplied by a scale factor computed from RANGE
        and the spread of the matrix values, and rounded to the nearest
        integer.
        ----
        Parameters:
            motif_matrix (pd.DataFrame) : motif scoring matrix, with
                                          the alphabet symbols as row
                                          labels and the motif positions
                                          (0, ..., motif_width - 1) as
                                          column labels
            alphabet (list) : motif alphabet (DNA alphabet)
            motif_width (int) : motif width
        ----
        Returns:
            motif_matrix_sc (pd.DataFrame) : scaled motif matrix
                                             (integer values)
            min_val (int) : lowest value in the scaled motif matrix
            max_val (int) : highest value in the scaled motif matrix
            scale_factor (int) : factor used to scale the values
            offset (float) : value subtracted from the scores before
                             scaling
        ----
        Raises:
            NoDataFrameException : if motif_matrix is not a DataFrame
            NotValidMotifMatrixException : if motif_matrix is empty
            NotValidAlphabetException : if the alphabet is not a list or
                                        is not a valid DNA alphabet
    """

    if not isinstance(motif_matrix, pd.DataFrame):
        raise NoDataFrameException(
            "The given motif matrix must be an instance of pandas.DataFrame")

    if motif_matrix.empty:
        raise NotValidMotifMatrixException("The given motif matrix is empty")

    if not isinstance(alphabet, list):
        raise NotValidAlphabetException("The alphabet given is not in a list")

    if not isListEqual(alphabet, DNA_ALPHABET):
        raise NotValidAlphabetException(
            "The alphabet given is not a valid DNA alphabet")

    assert motif_width > 0

    lower = motif_matrix.min().min()
    upper = motif_matrix.max().max()

    if lower == upper:  # all values are equal
        # avoid a zero-width interval (division by zero below)
        lower = np.double(upper - 1)

    lower = np.floor(lower)
    offset = np.round(lower)
    scale_factor = np.floor(RANGE / (upper - lower))

    # scale all the alphabet x position cells at once; cells outside
    # this selection (if any) keep the initial value 0
    positions = list(range(motif_width))
    raw_scores = motif_matrix.loc[alphabet, positions].to_numpy(dtype=float)
    scaled_scores = np.round((raw_scores - offset) * scale_factor)

    motif_matrix_sc = pd.DataFrame(index=list(motif_matrix.index),
                                   columns=list(motif_matrix.columns),
                                   data=0)
    motif_matrix_sc.loc[alphabet, positions] = scaled_scores

    # make sure the values are integers: astype() returns a new frame
    # with the requested dtype, while assigning through [:] may keep
    # the previous column dtype
    motif_matrix_sc = motif_matrix_sc.astype(int)

    # now they are scaled
    min_val = motif_matrix_sc.min().min()
    max_val = motif_matrix_sc.max().max()

    return motif_matrix_sc, min_val, max_val, int(scale_factor), offset