def set_path_RNAseq(self, input_RNAseq):
    """Set the two RNAseq file paths from a comma-separated user input.

    Args:
        input_RNAseq: String holding exactly two file paths separated by a
            comma.

    Raises:
        exceptions.IncorrectPathError: If the input does not split into
            exactly two entries, or if one of the entries is not an
            existing file.
    """
    list_RNAseq = input_RNAseq.split(",")
    if len(list_RNAseq) != 2:
        # The previous message was formatted with ``path=...`` but had no
        # placeholder, so the offending input was never shown to the user.
        message = ("Error: wrong number of RNAseq files in <{path}>, "
                   "expected 2 comma-separated paths").format(
                       path=input_RNAseq)
        raise exceptions.IncorrectPathError(message)
    for path_RNA in list_RNAseq:
        if not os.path.isfile(path_RNA):
            message = ("Error: the following file path <{path}> "
                       "is incorrect").format(path=path_RNA)
            raise exceptions.IncorrectPathError(message)
    self.path_RNAseq1, self.path_RNAseq2 = list_RNAseq
def set_motif_path(self, motif_path):
    """Record the motif file location after checking that it is a file.

    Raises:
        exceptions.IncorrectPathError: If ``motif_path`` is not an existing
            file.
    """
    if os.path.isfile(motif_path):
        self.motif_path = motif_path
        return
    template = "Error: the following file path <{path}> is not correct"
    raise exceptions.IncorrectPathError(template.format(path=motif_path))
def bedtools(motif_pipeline, genome):
    '''Extract fasta sequences from genome using bedtools.

    Runs ``bedtools getfasta`` once for the "rand" and once for the "sv"
    BED file, writing ``<prefix>_rand.fasta`` and ``<prefix>_sv.fasta``.

    Args:
        motif_pipeline: Object providing ``subdir_name``, ``prefix`` and
            ``write_log(message)``.
        genome: Object providing ``path_fasta``.

    Raises:
        exceptions.IncorrectPathError: If one of the input BED files is
            missing.
        SystemExit: With status 1 when bedtools returns a non-zero code.
    '''
    file_prefix = motif_pipeline.subdir_name + motif_pipeline.prefix
    for file_path in [file_prefix + "_rand.bed", file_prefix + "_sv.bed"]:
        if not os.path.isfile(file_path):
            message = ("Error: the following file path <{path}> "
                       "is missing").format(path=file_path)
            raise exceptions.IncorrectPathError(message)
    for rand_sv in ["rand", "sv"]:
        # Pass an argument list instead of splitting a formatted string, so
        # paths containing whitespace reach bedtools as single arguments.
        bedtools_command = [
            "bedtools", "getfasta", "-name",
            "-fo", file_prefix + "_" + rand_sv + ".fasta",
            "-fi", genome.path_fasta,
            "-bed", file_prefix + "_" + rand_sv + ".bed",
        ]
        bedtools_sub = subprocess.Popen(bedtools_command,
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE,
                                        universal_newlines=True)
        motif_pipeline.write_log(
            "running bedtools for {rand_sv}".format(rand_sv=rand_sv))
        outs, errs = bedtools_sub.communicate()
        print(outs)
        print(errs)
        if bedtools_sub.returncode != 0:
            print("An error occurred, the program did not run to completion.")
            # A bare sys.exit() reports success (status 0) to the caller.
            sys.exit(1)
def FIMO(motif_pipeline):
    '''Runs FIMO (Find Individual Motif Occurrences) program on samples.

    Runs ``fimo`` once on ``<prefix>_rand.fasta`` and once on
    ``<prefix>_sv.fasta``, with output directories ``FIMO_rand`` and
    ``FIMO_sv`` under ``subdir_name``.

    Args:
        motif_pipeline: Object providing ``subdir_name``, ``prefix``,
            ``motif_path``, ``FIMO_thresh`` and ``write_log(message)``.

    Raises:
        exceptions.IncorrectPathError: If one of the input fasta files is
            missing.
        SystemExit: With status 1 when fimo returns a non-zero code.
    '''
    file_prefix = motif_pipeline.subdir_name + motif_pipeline.prefix
    for file_path in [file_prefix + "_rand.fasta", file_prefix + "_sv.fasta"]:
        if not os.path.isfile(file_path):
            message = ("Error: the following file <{path}> "
                       "is missing").format(path=file_path)
            raise exceptions.IncorrectPathError(message)
    for rand_sv in ["rand", "sv"]:
        # Pass an argument list instead of splitting a formatted string, so
        # paths containing whitespace reach fimo as single arguments.
        FIMO_command = [
            "fimo",
            "--oc", motif_pipeline.subdir_name + "FIMO_" + rand_sv,
            "--thresh", str(motif_pipeline.FIMO_thresh),
            "--max-stored-scores", "100000000",
            motif_pipeline.motif_path,
            file_prefix + "_" + rand_sv + ".fasta",
        ]
        FIMO_sub = subprocess.Popen(FIMO_command,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    universal_newlines=True)
        motif_pipeline.write_log(
            "running FIMO for {rand_sv}".format(rand_sv=rand_sv))
        outs, errs = FIMO_sub.communicate()
        print(outs)
        print(errs)
        if FIMO_sub.returncode != 0:
            print("An error occurred, the program did not run to completion.")
            # A bare sys.exit() reports success (status 0) to the caller.
            sys.exit(1)
def AME(motif_pipeline):
    '''Run AME (Analysis of Motif Enrichment) program on samples.

    Runs ``ame`` with ``<prefix>_sv.fasta`` as the primary sequences and
    ``<prefix>_rand.fasta`` as the control. The process's stderr stream is
    written to ``<prefix>_AME_results.csv``; stdout is printed.

    Args:
        motif_pipeline: Object providing ``subdir_name``, ``prefix``,
            ``motif_path``, ``AME_scoring`` and ``write_log(message)``.

    Raises:
        exceptions.IncorrectPathError: If one of the input fasta files is
            missing.
        SystemExit: With status 1 when ame returns a non-zero code.
    '''
    file_prefix = motif_pipeline.subdir_name + motif_pipeline.prefix
    for file_path in [file_prefix + "_rand.fasta", file_prefix + "_sv.fasta"]:
        if not os.path.isfile(file_path):
            message = ("Error: the following file path <{path}> "
                       "is missing").format(path=file_path)
            raise exceptions.IncorrectPathError(message)
    # Pass an argument list instead of splitting a formatted string, so
    # paths containing whitespace reach ame as single arguments.
    AME_command = [
        "ame", "--verbose", "5",
        "--oc", motif_pipeline.subdir_name + "AME",
        "--scoring", str(motif_pipeline.AME_scoring),
        "--method", "ranksum",
        "--control", file_prefix + "_rand.fasta",
        file_prefix + "_sv.fasta",
        motif_pipeline.motif_path,
    ]
    with open(file_prefix + "_AME_results.csv", "w") as output_file:
        AME_sub = subprocess.Popen(AME_command,
                                   stdout=subprocess.PIPE,
                                   stderr=output_file,
                                   universal_newlines=True)
        motif_pipeline.write_log("running AME")
        outs, _ = AME_sub.communicate()
        print(outs)
        if AME_sub.returncode != 0:
            print("An error occurred, the program did not run to completion.")
            # A bare sys.exit() reports success (status 0) to the caller.
            sys.exit(1)
def set_output_dir(self, output_dir):
    """Create the output directory if needed and store its path.

    The stored path always ends with "/".

    Raises:
        exceptions.IncorrectPathError: If the directory cannot be created.
    """
    try:
        os.makedirs(output_dir, exist_ok=True)
    except (OSError, TypeError, ValueError) as err:
        # Only path-related failures are translated; a bare ``except`` would
        # also swallow KeyboardInterrupt and SystemExit.
        raise exceptions.IncorrectPathError(output_dir) from err
    if not output_dir.endswith("/"):
        output_dir += "/"
    self.output_dir = output_dir
def set_path_WES(self, input_WES):
    """Store the WES file location given by the user.

    Raises:
        exceptions.IncorrectPathError: If ``input_WES`` is not an existing
            file.
    """
    if os.path.isfile(input_WES):
        self.path_WES = input_WES
        return
    template = "Error: the following file path <{path}> is incorrect"
    raise exceptions.IncorrectPathError(template.format(path=input_WES))
def set_path_correct_HLA(self, correct_HLA):
    """Store the location of the correct HLA typing CSV file.

    The path is accepted only when it is an existing file whose name ends
    with ".csv".

    Raises:
        exceptions.IncorrectPathError: If either condition is not met.
    """
    is_valid_csv = os.path.isfile(correct_HLA) and correct_HLA.endswith(".csv")
    if is_valid_csv:
        self.path_correct_HLA = correct_HLA
        return
    template = "Error: the following file path <{path}> is incorrect"
    raise exceptions.IncorrectPathError(template.format(path=correct_HLA))
def set_input_dir(self, input_dir):
    """Store the input directory path, normalised to end with "/".

    Raises:
        exceptions.IncorrectPathError: If ``input_dir`` is not an existing
            directory.
    """
    if os.path.isdir(input_dir):
        has_slash = input_dir[-1] == "/"
        self.input_dir = input_dir if has_slash else input_dir + "/"
        return
    template = "Error: the following directory path <{path}> is not correct"
    raise exceptions.IncorrectPathError(template.format(path=input_dir))
def set_output_dir(self, output_dir):
    """Create the output directory if needed and store its path.

    The stored path always ends with "/".

    Raises:
        exceptions.IncorrectPathError: If the directory cannot be created or
            the given value cannot be used as a path string.
    """
    try:
        os.makedirs(output_dir, exist_ok=True)
        normalized = output_dir if output_dir[-1] == "/" else output_dir + "/"
    except (OSError, TypeError, ValueError) as err:
        # Only path-related failures are translated; a bare ``except`` would
        # also swallow KeyboardInterrupt and SystemExit.
        message = ("Error: the following directory path <{path}> "
                   "is not correct").format(path=output_dir)
        raise exceptions.IncorrectPathError(message) from err
    self.output_dir = normalized
def set_dir_genome(self, dir_genome):
    """Store the genome directory path, normalised to end with "/".

    Raises:
        exceptions.IncorrectPathError: If ``dir_genome`` is not an existing
            directory.
    """
    if os.path.isdir(dir_genome):
        has_slash = dir_genome[-1] == "/"
        self.dir_genome = dir_genome if has_slash else dir_genome + "/"
        return
    template = "Error: the following directory path <{path}> is not correct"
    raise exceptions.IncorrectPathError(template.format(path=dir_genome))
def set_num_SV_breakpoints(self):
    """Set ``num_SV_breakpoints`` from the ``<prefix>_sv.bed`` file.

    The stored value is the zero-based index of the last line of the file
    (line count minus one), or 0 when the file is empty.

    Raises:
        exceptions.IncorrectPathError: If the BED file does not exist.
    """
    file_path = self.subdir_name + self.prefix + "_sv.bed"
    if not os.path.isfile(file_path):
        message = ("Error: the following file path <{path}> "
                   "is missing").format(path=file_path)
        raise exceptions.IncorrectPathError(message)
    with open(file_path) as f:
        line_count = sum(1 for _ in f)
    # An empty file previously crashed because the loop variable was never
    # bound; clamp to 0 instead.
    # NOTE(review): the stored value is "line count - 1", kept as-is for
    # compatibility - confirm whether the full line count was intended.
    self.num_SV_breakpoints = max(line_count - 1, 0)
def __init__(self, genome_fasta, genome_length_file, genome_include_file):
    """Validate the three genome files and store their information.

    Sets ``path_fasta`` and ``path_include`` to the given paths and
    ``length_dict`` to the result of ``generate_genome_dict`` applied to the
    length file.

    Raises:
        exceptions.IncorrectPathError: For the first of the three paths
            that is not an existing file.
    """
    template = "Error: the following file path <{path}> is missing"
    required_files = (genome_fasta, genome_length_file, genome_include_file)
    for candidate in required_files:
        if os.path.isfile(candidate):
            continue
        raise exceptions.IncorrectPathError(template.format(path=candidate))
    self.path_fasta = genome_fasta
    self.length_dict = generate_genome_dict(genome_length_file)
    self.path_include = genome_include_file
def set_output_dir(self, output_dir):
    """Sets path to output directory according to user input.

    The directory is created when it does not exist yet, and the stored
    path always ends with "/".

    Raises:
        exceptions.IncorrectPathError: If the directory cannot be created or
            the given value cannot be used as a path string.
    """
    try:
        os.makedirs(output_dir, exist_ok=True)
        normalized = output_dir if output_dir[-1] == "/" else output_dir + "/"
    except (OSError, TypeError, ValueError) as err:
        # Only path-related failures are translated; a bare ``except`` would
        # also swallow KeyboardInterrupt and SystemExit.
        message = ("Error: the following directory path <{path}> "
                   "is not correct").format(path=output_dir)
        raise exceptions.IncorrectPathError(message) from err
    self.output_dir = normalized
def set_fastq_dict(self, fastq_path):
    """Build ``self.fastq_dict`` mapping each sample to its fastq files.

    ``fastq_path`` is a glob pattern (brace expansion enabled). The sample
    name of a matched file is the name of its parent directory. For each
    sample the pattern's file-name part is globbed again inside that
    sample's directory; the sample is kept only when exactly one file
    (``self.single_end`` true) or exactly two files (otherwise) match.
    Rejected samples are reported on stdout and left out of the dict.

    Args:
        fastq_path: Glob pattern string of the form
            ``<base>/<sample dir>/<file name pattern>``.

    Raises:
        exceptions.IncorrectPathError: If ``fastq_path`` is not a string or
            matches no file.
    """
    try:
        base_dir = fastq_path.rsplit("/", 2)[0] + "/"
        name_format = fastq_path.rsplit("/", 1)[-1]
    except (AttributeError, TypeError) as err:
        raise exceptions.IncorrectPathError(fastq_path) from err
    list_files = wcmatchglob.glob(fastq_path, flags=wcmatchglob.BRACE)
    if not list_files:
        raise exceptions.IncorrectPathError(fastq_path)
    # Sorted so the dict's insertion order does not depend on set ordering.
    list_samples = sorted({file.split("/")[-2] for file in list_files})
    expected = 1 if self.single_end else 2
    mode = "single_end" if self.single_end else "paired_end"
    noun = "file" if expected == 1 else "files"
    self.fastq_dict = {}
    for sample in list_samples:
        fastq_specific = wcmatchglob.glob(base_dir + sample + "/" +
                                          name_format,
                                          flags=wcmatchglob.BRACE)
        if len(fastq_specific) > expected:
            print(f"More than {expected} matching fastq {noun} for sample "
                  f"{sample} in {mode} mode. Sample ignored.")
        elif len(fastq_specific) < expected:
            # Also covers a single_end sample with no match, which used to
            # be stored with an empty file list.
            print(f"Fewer than {expected} matching fastq {noun} for sample "
                  f"{sample} in {mode} mode. Sample ignored.")
        else:
            self.fastq_dict[sample] = fastq_specific
def load_config(config_name):
    """Load the parsl configuration from ``configs/<config_name>.py``.

    The ``configs`` directory is looked up next to this source file. The
    file is executed as a module and its ``config`` attribute is passed to
    ``parsl.load``.

    Args:
        config_name: Name of the config file without the ".py" extension.

    Raises:
        exceptions.IncorrectPathError: If the config file does not exist.
        exceptions.IncorrectInputFiles: If importing the file or loading
            its ``config`` fails; the original error is chained as cause.
    """
    # os.path.dirname handles the platform separator and the root directory,
    # unlike splitting on "/" by hand.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    config = os.path.join(base_dir, 'configs', '{}.py'.format(config_name))
    if not os.path.isfile(config):
        raise exceptions.IncorrectPathError(
            "Cannot find the config file <{config}.py>.".format(
                config=config_name))
    try:
        spec = importlib.util.spec_from_file_location('', config)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        parsl.load(module.config)
    except Exception as e:
        raise exceptions.IncorrectInputFiles(
            ("Could not load specified config from <{config}.py> :"
             "\n {exception}.").format(config=config_name,
                                       exception=e)) from e
def extract_json(self, json_name):
    """Load ``configs/<json_name>.json`` into ``self.config_dict``.

    The ``configs`` directory is looked up next to this source file. When
    ``json_name`` is None, ``config_dict`` is set to an empty dict.

    Args:
        json_name: Name of the JSON file without the ".json" extension, or
            None.

    Raises:
        exceptions.IncorrectPathError: If the JSON file does not exist.
        exceptions.IncorrectInputFiles: If the file cannot be read or
            parsed; the original error is chained as cause.
    """
    if json_name is None:
        self.config_dict = {}
        return
    # os.path.dirname handles the platform separator and the root directory,
    # unlike splitting on "/" by hand.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    json_file = os.path.join(base_dir, 'configs',
                             '{}.json'.format(json_name))
    if not os.path.isfile(json_file):
        raise exceptions.IncorrectPathError(
            "Cannot find the json file <{json_name}.json>.".format(
                json_name=json_name))
    try:
        # The context manager closes the handle, which was previously leaked.
        with open(json_file, "r", encoding="utf-8") as json_fileobject:
            self.config_dict = json.load(json_fileobject)
    except Exception as e:
        raise exceptions.IncorrectInputFiles(
            ("Could not load json from <{json_name}.json> :"
             "\n {exception}.").format(json_name=json_name,
                                       exception=e)) from e
def set_list_bedpe(self, sample_attr_path):
    """Fill ``self.list_bedpe`` with the names of the bedpe files to use.

    A name is the part of a file's base name before its first ".". With a
    null ``sample_attr_path`` every ``*.bedpe`` file in ``self.input_dir``
    is listed. Otherwise the CSV at ``sample_attr_path`` is read and only
    the files whose name appears in the CSV's first column, on a row where
    a column named in ``self.sample_attr`` equals one of that column's
    listed values, are kept.

    Raises:
        exceptions.IncorrectPathError: If ``sample_attr_path`` is given but
            is not an existing file.
    """

    def stem_of(bedpe_path):
        # Base name up to (not including) the first dot.
        return bedpe_path.split("/")[-1].split(".")[0]

    if pd.isnull(sample_attr_path):
        found = glob.glob(self.input_dir + "*.bedpe")
        self.list_bedpe = [stem_of(bedpe_path) for bedpe_path in found]
        return

    if not os.path.isfile(sample_attr_path):
        template = "Error: the following file path <{path}> is not correct"
        raise exceptions.IncorrectPathError(
            template.format(path=sample_attr_path))

    df_attr = pd.read_csv(sample_attr_path, sep=",")
    name_column = df_attr.columns[0]
    wanted = []
    for group, attrs in self.sample_attr.items():
        for attr in attrs:
            selected_rows = df_attr[df_attr[group] == attr]
            wanted.extend(selected_rows[name_column].tolist())

    kept = []
    for bedpe_path in glob.glob(self.input_dir + "*.bedpe"):
        stem = stem_of(bedpe_path)
        if stem in wanted:
            kept.append(stem)
    self.list_bedpe = kept
def main():
    '''Reads input from terminal and coordinates pipeline.

    Parses the command-line options, configures a ``MotifPipeline`` and a
    ``ReferenceGenome``, loads the parsl config ``configs/<config>.py``
    (default "local") located next to this file, then runs the steps in
    order: bedpe-to-bed conversion, merge, bedtools, FIMO, AME, output
    extraction and histogram generation.

    Raises:
        exceptions.WrongArgumentError: On positional arguments or an
            unknown option.
        exceptions.MissingArgumentError: If a required option is missing,
            or only one of --sampleinfo_table / --sample_attr is given.
        exceptions.IncorrectPathError: If the parsl config cannot be
            loaded; the original error is chained as cause.
        SystemExit: With status 2 on a getopt parsing error, or after
            printing the description for -h/--help.
    '''
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], "i:o:f:l:e:m:a:t:s:r:F:A:c:p:h", [
                "input_dir=", "output_dir=", "genome_fasta=", "genome_len=",
                "genome_include=", "motif_path=", "sample_attr=",
                "sampleinfo_table=", "SV_types=", "rand_sv_ratio=",
                "FIMO_thresh=", "AME_scoring=", "config=", "prefix=", "help"
            ])
    except getopt.GetoptError as e:
        print(e)
        sys.exit(2)
    if args:
        message = "Error: non-paired arguments are not allowed."
        raise exceptions.WrongArgumentError(message)

    motif_pipeline = pipeline.MotifPipeline()
    sample_attr_path = None
    genome_fasta = None
    genome_len = None
    genome_include = None
    prefix = None
    config_name = "local"
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            description()
            sys.exit()
        elif opt in ("-i", "--input_dir"):
            motif_pipeline.set_input_dir(arg)
        elif opt in ("-o", "--output_dir"):
            motif_pipeline.set_output_dir(arg)
        elif opt in ("-f", "--genome_fasta"):
            genome_fasta = arg
        elif opt in ("-l", "--genome_len"):
            genome_len = arg
        elif opt in ("-e", "--genome_include"):
            genome_include = arg
        elif opt in ("-m", "--motif_path"):
            motif_pipeline.set_motif_path(arg)
        elif opt in ("-a", "--sample_attr"):
            motif_pipeline.set_sample_attr(arg)
        elif opt in ("-t", "--sampleinfo_table"):
            sample_attr_path = arg
        elif opt in ("-s", "--SV_types"):
            motif_pipeline.set_SV_types(arg)
        elif opt in ("-r", "--rand_sv_ratio"):
            motif_pipeline.set_rand_sv_ratio(arg)
        elif opt in ("-F", "--FIMO_thresh"):
            motif_pipeline.set_FIMO_thresh(arg)
        elif opt in ("-A", "--AME_scoring"):
            motif_pipeline.set_AME_scoring(arg)
        elif opt in ("-c", "--config"):
            config_name = arg
        elif opt in ("-p", "--prefix"):
            prefix = arg
        else:
            message = "Error: {opt} is not a valid option".format(opt=opt)
            raise exceptions.WrongArgumentError(message)

    # --sampleinfo_table and --sample_attr only make sense together.
    has_table = sample_attr_path is not None
    has_attr = motif_pipeline.sample_attr != "all"
    if has_table != has_attr:
        message = ("Error: you must indicate both --sampleinfo_table "
                   "and --sample_attr, or neither.")
        raise exceptions.MissingArgumentError(message)
    if genome_fasta is None:
        message = "Error: you must indicate --genome_fasta."
        raise exceptions.MissingArgumentError(message)
    if genome_len is None:
        message = "Error: you must indicate --genome_len."
        raise exceptions.MissingArgumentError(message)
    if genome_include is None:
        message = "Error: you must indicate --genome_include."
        raise exceptions.MissingArgumentError(message)
    for pipeline_attr in ["input_dir", "output_dir", "motif_path"]:
        if not hasattr(motif_pipeline, pipeline_attr):
            message = ("Error: you must indicate --{attr}.").format(
                attr=pipeline_attr)
            raise exceptions.MissingArgumentError(message)

    motif_pipeline.set_subdir_name(prefix)
    motif_pipeline.write_description()
    motif_pipeline.set_list_bedpe(sample_attr_path)
    reference_genome = refgenome.ReferenceGenome(genome_fasta, genome_len,
                                                 genome_include)

    # os.path.dirname handles the platform separator and the root directory,
    # unlike splitting on "/" by hand.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    config = os.path.join(base_dir, 'configs', '{}.py'.format(config_name))
    try:
        spec = importlib.util.spec_from_file_location('', config)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        parsl.load(module.config)
    except Exception as err:
        # Chain the cause: failures other than a missing file (for example
        # an error inside the config module) are reported with this same
        # message. A bare ``except`` would also trap KeyboardInterrupt.
        raise exceptions.IncorrectPathError(
            "Cannot find the config file <{config_name}>.".format(
                config_name=config_name)) from err

    # exist_ok avoids the race between checking for and creating the folder.
    os.makedirs(motif_pipeline.output_dir + "bed_files", exist_ok=True)
    for file_name in motif_pipeline.list_bedpe:
        sv_types_to_run = get_SV_types(motif_pipeline, file_name)
        if sv_types_to_run:
            extractdata.bedpe_to_bed(reference_genome, motif_pipeline,
                                     file_name, sv_types_to_run)
    parsl.wait_for_current_tasks()

    runprogram.merge(motif_pipeline)
    motif_pipeline.set_num_SV_breakpoints()
    runprogram.bedtools(motif_pipeline, reference_genome)
    runprogram.FIMO(motif_pipeline)
    runprogram.AME(motif_pipeline)
    extractdata.extract_list_sequences_AME(motif_pipeline)
    extractdata.extract_output_FIMO(motif_pipeline)
    extractdata.extract_output_AME(motif_pipeline)
    graphs.generate_histogram(motif_pipeline)