Esempio n. 1
0
 def set_path_RNAseq(self, input_RNAseq):
     """Store the two RNAseq file paths given as one comma-separated string.

     The input must split on "," into exactly two entries, and each entry
     must be an existing file. On success the entries are stored, in order,
     as ``self.path_RNAseq1`` and ``self.path_RNAseq2``.

     Raises:
         exceptions.IncorrectPathError: if the number of entries is not two
             or if one of them is not an existing file.
     """
     rnaseq_paths = input_RNAseq.split(",")
     if len(rnaseq_paths) != 2:
         raise exceptions.IncorrectPathError(
             "Error: wrong number of RNAseq files")
     # Validate both entries before storing anything on the instance.
     for candidate in rnaseq_paths:
         if os.path.isfile(candidate):
             continue
         raise exceptions.IncorrectPathError(
             ("Error: the following file path <{path}> "
              "is incorrect").format(path=candidate))
     first_path, second_path = rnaseq_paths
     self.path_RNAseq1 = first_path
     self.path_RNAseq2 = second_path
Esempio n. 2
0
 def set_motif_path(self, motif_path):
     """Record the motif file path after checking that the file exists.

     Raises:
         exceptions.IncorrectPathError: if ``motif_path`` is not an
             existing file.
     """
     if os.path.isfile(motif_path):
         self.motif_path = motif_path
         return
     raise exceptions.IncorrectPathError(
         ("Error: the following file path <{path}> "
          "is not correct").format(path=motif_path))
Esempio n. 3
0
def bedtools(motif_pipeline, genome):
    '''Extract fasta sequences from genome using bedtools.

    The file prefix is ``motif_pipeline.subdir_name + motif_pipeline.prefix``.
    For each of "rand" and "sv", ``bedtools getfasta`` is run with
    ``<prefix>_<kind>.bed`` as input, ``genome.path_fasta`` as the genome and
    ``<prefix>_<kind>.fasta`` as output. The captured stdout and stderr of
    each run are printed.

    Raises:
        exceptions.IncorrectPathError: if one of the two input BED files is
            missing.
        SystemExit: with status 1 if bedtools returns a non-zero exit code.
    '''
    file_prefix = motif_pipeline.subdir_name + motif_pipeline.prefix
    for file_path in [file_prefix + "_rand.bed", file_prefix + "_sv.bed"]:
        if not os.path.isfile(file_path):
            message = ("Error: the following file path <{path}> "
                       "is missing").format(path=file_path)
            raise exceptions.IncorrectPathError(message)

    for rand_sv in ["rand", "sv"]:
        sample_prefix = file_prefix + "_" + rand_sv
        # Build the argument vector directly instead of splitting a formatted
        # string, so paths that contain whitespace are passed through intact.
        bedtools_cmd = [
            "bedtools", "getfasta", "-name",
            "-fo", sample_prefix + ".fasta",
            "-fi", str(genome.path_fasta),
            "-bed", sample_prefix + ".bed",
        ]
        motif_pipeline.write_log(
            "running bedtools for {rand_sv}".format(rand_sv=rand_sv))
        bedtools_sub = subprocess.run(bedtools_cmd,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE,
                                      universal_newlines=True)
        print(bedtools_sub.stdout)
        print(bedtools_sub.stderr)
        if bedtools_sub.returncode != 0:
            print("An error occurred, the program did not run to completion.")
            # A bare sys.exit() would exit with status 0 and hide the failure
            # from whatever launched the pipeline.
            sys.exit(1)
Esempio n. 4
0
def FIMO(motif_pipeline):
    '''Runs FIMO (Find Individual Motif Occurrences) program on samples.

    The file prefix is ``motif_pipeline.subdir_name + motif_pipeline.prefix``.
    For each of "rand" and "sv", ``fimo`` is run on ``<prefix>_<kind>.fasta``
    with ``motif_pipeline.motif_path`` as the motif file,
    ``motif_pipeline.FIMO_thresh`` as ``--thresh`` and
    ``<subdir_name>FIMO_<kind>`` as the ``--oc`` output directory. The
    captured stdout and stderr of each run are printed.

    Raises:
        exceptions.IncorrectPathError: if one of the two input fasta files is
            missing.
        SystemExit: with status 1 if fimo returns a non-zero exit code.
    '''
    file_prefix = motif_pipeline.subdir_name + motif_pipeline.prefix
    for file_path in [file_prefix + "_rand.fasta", file_prefix + "_sv.fasta"]:
        if not os.path.isfile(file_path):
            message = ("Error: the following file <{path}> "
                       "is missing").format(path=file_path)
            raise exceptions.IncorrectPathError(message)
    for rand_sv in ["rand", "sv"]:
        # Build the argument vector directly instead of splitting a formatted
        # string, so paths that contain whitespace are passed through intact.
        FIMO_cmd = [
            "fimo",
            "--oc", motif_pipeline.subdir_name + "FIMO_" + rand_sv,
            "--thresh", str(motif_pipeline.FIMO_thresh),
            "--max-stored-scores", "100000000",
            str(motif_pipeline.motif_path),
            file_prefix + "_" + rand_sv + ".fasta",
        ]
        motif_pipeline.write_log(
            "running FIMO for {rand_sv}".format(rand_sv=rand_sv))
        FIMO_sub = subprocess.run(FIMO_cmd,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  universal_newlines=True)
        print(FIMO_sub.stdout)
        print(FIMO_sub.stderr)
        if FIMO_sub.returncode != 0:
            print("An error occurred, the program did not run to completion.")
            # A bare sys.exit() would exit with status 0 and hide the failure
            # from whatever launched the pipeline.
            sys.exit(1)
Esempio n. 5
0
def AME(motif_pipeline):
    '''Run AME (Analysis of Motif Enrichment) program on samples.

    The file prefix is ``motif_pipeline.subdir_name + motif_pipeline.prefix``.
    ``ame`` is run once with ``<prefix>_rand.fasta`` as ``--control``,
    ``<prefix>_sv.fasta`` as the primary sequences,
    ``motif_pipeline.motif_path`` as the motif file,
    ``motif_pipeline.AME_scoring`` as ``--scoring`` and ``<subdir_name>AME``
    as the ``--oc`` output directory. The process's stderr is written to
    ``<prefix>_AME_results.csv``; its stdout is captured and printed.

    Raises:
        exceptions.IncorrectPathError: if one of the two input fasta files is
            missing.
        SystemExit: with status 1 if ame returns a non-zero exit code.
    '''
    file_prefix = motif_pipeline.subdir_name + motif_pipeline.prefix
    for file_path in [file_prefix + "_rand.fasta", file_prefix + "_sv.fasta"]:
        if not os.path.isfile(file_path):
            message = ("Error: the following file path <{path}> "
                       "is missing").format(path=file_path)
            raise exceptions.IncorrectPathError(message)
    # Build the argument vector directly instead of splitting a formatted
    # string, so paths that contain whitespace are passed through intact.
    AME_cmd = [
        "ame", "--verbose", "5",
        "--oc", motif_pipeline.subdir_name + "AME",
        "--scoring", str(motif_pipeline.AME_scoring),
        "--method", "ranksum",
        "--control", file_prefix + "_rand.fasta",
        file_prefix + "_sv.fasta",
        str(motif_pipeline.motif_path),
    ]
    motif_pipeline.write_log("running AME")
    # Keep the stderr target open until the process has finished, rather than
    # closing it right after the process has been started.
    with open(file_prefix + "_AME_results.csv", "w") as output_file:
        AME_sub = subprocess.run(AME_cmd,
                                 stdout=subprocess.PIPE,
                                 stderr=output_file,
                                 universal_newlines=True)
    print(AME_sub.stdout)
    if AME_sub.returncode != 0:
        print("An error occurred, the program did not run to completion.")
        # A bare sys.exit() would exit with status 0 and hide the failure
        # from whatever launched the pipeline.
        sys.exit(1)
Esempio n. 6
0
 def set_output_dir(self, output_dir):
     """Create the output directory if needed and store its path.

     The stored ``self.output_dir`` always ends with "/".

     Raises:
         exceptions.IncorrectPathError: if the directory cannot be created;
             the exception carries ``output_dir`` and chains the cause.
     """
     try:
         os.makedirs(output_dir, exist_ok=True)
     except (OSError, TypeError, ValueError) as err:
         # Only path-related failures are translated; a bare ``except`` would
         # also swallow KeyboardInterrupt and SystemExit.
         raise exceptions.IncorrectPathError(output_dir) from err
     if output_dir[-1] != "/":
         output_dir += "/"
     self.output_dir = output_dir
Esempio n. 7
0
 def set_path_WES(self, input_WES):
     """Record the WES file path after checking that the file exists.

     Raises:
         exceptions.IncorrectPathError: if ``input_WES`` is not an existing
             file.
     """
     if os.path.isfile(input_WES):
         self.path_WES = input_WES
         return
     raise exceptions.IncorrectPathError(
         ("Error: the following file path <{path}> "
          "is incorrect").format(path=input_WES))
Esempio n. 8
0
 def set_path_correct_HLA(self, correct_HLA):
     """Record the path of the correct-HLA-typing CSV file.

     The path is accepted only when it is an existing file whose name ends
     with ".csv".

     Raises:
         exceptions.IncorrectPathError: if the file does not exist or does
             not have the ".csv" suffix.
     """
     is_csv_file = os.path.isfile(correct_HLA) and correct_HLA.endswith(".csv")
     if is_csv_file:
         self.path_correct_HLA = correct_HLA
         return
     raise exceptions.IncorrectPathError(
         ("Error: the following file path <{path}> "
          "is incorrect").format(path=correct_HLA))
Esempio n. 9
0
 def set_input_dir(self, input_dir):
     """Record the input directory, normalised to end with "/".

     Raises:
         exceptions.IncorrectPathError: if ``input_dir`` is not an existing
             directory.
     """
     if not os.path.isdir(input_dir):
         raise exceptions.IncorrectPathError(
             ("Error: the following directory path <{path}> "
              "is not correct").format(path=input_dir))
     # Later code concatenates file names directly onto this value.
     self.input_dir = input_dir if input_dir[-1] == "/" else input_dir + "/"
Esempio n. 10
0
 def set_output_dir(self, output_dir):
     """Create the output directory if needed and store its path.

     The stored ``self.output_dir`` always ends with "/".

     Raises:
         exceptions.IncorrectPathError: if the directory cannot be created
             or the path cannot be normalised; the cause is chained.
     """
     try:
         os.makedirs(output_dir, exist_ok=True)
         if output_dir[-1] != "/":
             output_dir += "/"
     except (OSError, TypeError, ValueError, IndexError) as err:
         # Only path-related failures are translated; a bare ``except`` would
         # also swallow KeyboardInterrupt and SystemExit.
         message = ("Error: the following directory path <{path}> "
                    "is not correct").format(path=output_dir)
         raise exceptions.IncorrectPathError(message) from err
     self.output_dir = output_dir
Esempio n. 11
0
 def set_dir_genome(self, dir_genome):
     """Record the genome directory, normalised to end with "/".

     Raises:
         exceptions.IncorrectPathError: if ``dir_genome`` is not an existing
             directory.
     """
     if not os.path.isdir(dir_genome):
         raise exceptions.IncorrectPathError(
             ("Error: the following directory path <{path}> "
              "is not correct").format(path=dir_genome))
     self.dir_genome = (dir_genome
                        if dir_genome[-1] == "/" else dir_genome + "/")
Esempio n. 12
0
 def set_num_SV_breakpoints(self):
     """Set ``self.num_SV_breakpoints`` from the merged "_sv.bed" file.

     The file read is ``self.subdir_name + self.prefix + "_sv.bed"``. The
     stored value is the number of lines minus one, never below zero, so an
     empty file yields 0 instead of failing on an unbound loop variable.

     NOTE(review): "lines minus one" is the historical behaviour of this
     method (index of the last line); confirm whether the file carries a
     header line or whether the full line count was intended.

     Raises:
         exceptions.IncorrectPathError: if the file does not exist.
     """
     file_path = self.subdir_name + self.prefix + "_sv.bed"
     if not os.path.isfile(file_path):
         message = ("Error: the following file path <{path}> "
                    "is missing").format(path=file_path)
         raise exceptions.IncorrectPathError(message)
     with open(file_path) as bed_file:
         line_count = sum(1 for _ in bed_file)
     self.num_SV_breakpoints = max(line_count - 1, 0)
Esempio n. 13
0
 def __init__(self, genome_fasta, genome_length_file, genome_include_file):
     """Validate the three genome input files and record them.

     Stores ``genome_fasta`` as ``self.path_fasta``, the result of
     ``generate_genome_dict(genome_length_file)`` as ``self.length_dict``
     and ``genome_include_file`` as ``self.path_include``.

     Raises:
         exceptions.IncorrectPathError: naming the first of the three paths
             (in argument order) that is not an existing file.
     """
     required_files = (genome_fasta, genome_length_file, genome_include_file)
     first_missing = next(
         (path for path in required_files if not os.path.isfile(path)), None)
     if first_missing is not None:
         raise exceptions.IncorrectPathError(
             ("Error: the following file path <{path}> "
              "is missing").format(path=first_missing))
     self.path_fasta = genome_fasta
     self.length_dict = generate_genome_dict(genome_length_file)
     self.path_include = genome_include_file
Esempio n. 14
0
 def set_output_dir(self, output_dir):
     """Sets path to output directory according to user input.

     The directory is created if it does not exist, and the stored
     ``self.output_dir`` always ends with "/".

     Raises:
         exceptions.IncorrectPathError: if the directory cannot be created
             or the path cannot be normalised; the cause is chained.
     """
     try:
         os.makedirs(output_dir, exist_ok=True)
         if output_dir[-1] != "/":
             output_dir += "/"
     except (OSError, TypeError, ValueError, IndexError) as err:
         # Only path-related failures are translated; a bare ``except`` would
         # also swallow KeyboardInterrupt and SystemExit.
         message = ("Error: the following directory path <{path}> "
                    "is not correct").format(path=output_dir)
         raise exceptions.IncorrectPathError(message) from err
     self.output_dir = output_dir
Esempio n. 15
0
 def set_fastq_dict(self, fastq_path):
     """Build ``self.fastq_dict`` mapping sample name to its fastq files.

     ``fastq_path`` is a glob pattern (brace expansion enabled). The sample
     name of each match is the name of its parent directory. For every
     sample the pattern's file-name part is globbed again inside
     ``<base_dir>/<sample>/``, where ``base_dir`` is ``fastq_path`` with its
     last two "/"-separated components removed.

     In single-end mode (``self.single_end`` truthy) a sample with more than
     one matching file is skipped with a printed notice. In paired-end mode
     a sample is kept only when exactly two files match. Samples and the
     files of each sample are processed in sorted order so the result does
     not depend on directory listing order.

     Raises:
         exceptions.IncorrectPathError: if ``fastq_path`` cannot be split as
             a string or matches no file.
     """
     try:
         base_dir = fastq_path.rsplit("/", 2)[0] + "/"
         name_format = fastq_path.rsplit("/", 1)[-1]
     except (AttributeError, TypeError) as err:
         # Non-string input; a bare ``except`` would also hide interrupts.
         raise exceptions.IncorrectPathError(fastq_path) from err
     list_files = wcmatchglob.glob(fastq_path, flags=wcmatchglob.BRACE)
     if not list_files:
         raise exceptions.IncorrectPathError(fastq_path)
     list_samples = sorted({file.split("/")[-2] for file in list_files})
     self.fastq_dict = {}
     for sample in list_samples:
         fastq_specific = sorted(
             wcmatchglob.glob(base_dir + sample + "/" + name_format,
                              flags=wcmatchglob.BRACE))
         if self.single_end:
             if len(fastq_specific) > 1:
                 print(
                     f"More than 1 matching fastq file for sample {sample} in single_end mode. Sample ignored."
                 )
                 continue
         elif len(fastq_specific) > 2:
             print(
                 f"More than 2 matching fastq files for sample {sample} in paired_end mode. Sample ignored."
             )
             continue
         elif len(fastq_specific) < 2:
             print(
                 f"Fewer than 2 matching fastq files for sample {sample} in paired_end mode. Sample ignored."
             )
             continue
         self.fastq_dict[sample] = fastq_specific
Esempio n. 16
0
def load_config(config_name):
    """Load the parsl configuration ``configs/<config_name>.py``.

    The ``configs`` directory is looked up next to this source file. The
    config file is executed as a module and its ``config`` attribute is
    passed to ``parsl.load``.

    Raises:
        exceptions.IncorrectPathError: if the config file does not exist.
        exceptions.IncorrectInputFiles: if importing the file or loading its
            ``config`` fails; the original exception is chained.
    """
    # os.path.dirname replaces a manual split on "/", which produced an empty
    # base directory on Windows paths and for files at the filesystem root.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    config = os.path.join(base_dir, 'configs', '{}.py'.format(config_name))
    if not os.path.isfile(config):
        raise exceptions.IncorrectPathError(
            "Cannot find the config file <{config}.py>.".format(
                config=config_name))
    try:
        spec = importlib.util.spec_from_file_location('', config)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        parsl.load(module.config)
    except Exception as e:
        raise exceptions.IncorrectInputFiles(
            ("Could not load specified config from <{config}.py> :"
             "\n {exception}.").format(config=config_name,
                                       exception=e)) from e
Esempio n. 17
0
 def extract_json(self, json_name):
     """Load ``configs/<json_name>.json`` into ``self.config_dict``.

     When ``json_name`` is None, ``self.config_dict`` is set to an empty
     dict. Otherwise the ``configs`` directory is looked up next to this
     source file.

     Raises:
         exceptions.IncorrectPathError: if the json file does not exist.
         exceptions.IncorrectInputFiles: if the file cannot be read or
             parsed; the original exception is chained.
     """
     if json_name is None:
         self.config_dict = {}
         return
     # os.path.dirname replaces a manual split on "/", which produced an
     # empty base directory on Windows paths and at the filesystem root.
     base_dir = os.path.dirname(os.path.abspath(__file__))
     json_file = os.path.join(base_dir, 'configs',
                              '{}.json'.format(json_name))
     if not os.path.isfile(json_file):
         raise exceptions.IncorrectPathError(
             "Cannot find the json file <{json_name}.json>.".format(
                 json_name=json_name))
     try:
         # The context manager closes the handle even when parsing fails.
         with open(json_file, "r") as json_fileobject:
             self.config_dict = json.load(json_fileobject)
     except Exception as e:
         raise exceptions.IncorrectInputFiles(
             ("Could not load json from <{json_name}.json> :"
              "\n {exception}.").format(json_name=json_name,
                                        exception=e)) from e
Esempio n. 18
0
 def set_list_bedpe(self, sample_attr_path):
     """Set ``self.list_bedpe`` to the sample names to process.

     A sample name is the base name of a ``*.bedpe`` file in
     ``self.input_dir``, cut at its first ".".

     When ``sample_attr_path`` is null (per ``pd.isnull``), every sample
     found is kept. Otherwise the CSV at ``sample_attr_path`` is read and a
     sample is kept only if its name appears in the first CSV column of a
     row whose ``group`` column equals one of the values listed in
     ``self.sample_attr[group]``, for any group.

     Raises:
         exceptions.IncorrectPathError: if ``sample_attr_path`` is given but
             is not an existing file.
     """
     selected = None  # None means "keep every sample found".
     if not pd.isnull(sample_attr_path):
         if not os.path.isfile(sample_attr_path):
             message = ("Error: the following file path <{path}> "
                        "is not correct").format(path=sample_attr_path)
             raise exceptions.IncorrectPathError(message)
         df_attr = pd.read_csv(sample_attr_path, sep=",")
         id_column = df_attr.columns[0]
         # A set gives constant-time membership tests in the filter below.
         selected = set()
         for group, attrs in self.sample_attr.items():
             for attr in attrs:
                 selected.update(
                     df_attr.loc[df_attr[group] == attr, id_column].tolist())
     sample_names = (os.path.basename(file).split(".")[0]
                     for file in glob.glob(self.input_dir + "*.bedpe"))
     self.list_bedpe = [
         name for name in sample_names
         if selected is None or name in selected
     ]
def main():
    '''Reads input from terminal and coordinates pipeline.

    Parses the command-line options, configures a ``pipeline.MotifPipeline``
    and a ``refgenome.ReferenceGenome``, loads the parsl configuration
    ``configs/<config>.py`` found next to this file (default "local"), then
    runs the steps in order: bedpe-to-bed conversion per sample, merge,
    bedtools, FIMO, AME, extraction of the FIMO/AME outputs and histogram
    generation.

    Raises:
        exceptions.WrongArgumentError: on positional arguments or an
            unrecognised option.
        exceptions.MissingArgumentError: if a required option is missing, or
            if only one of --sampleinfo_table / --sample_attr is given.
        exceptions.IncorrectPathError: if the parsl config cannot be loaded;
            the underlying exception is chained.
        SystemExit: with status 2 on a getopt parsing error, or after
            printing the help text.
    '''
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], "i:o:f:l:e:m:a:t:s:r:F:A:c:p:h", [
                "input_dir=", "output_dir=", "genome_fasta=", "genome_len=",
                "genome_include=", "motif_path=", "sample_attr=",
                "sampleinfo_table=", "SV_types=", "rand_sv_ratio=",
                "FIMO_thresh=", "AME_scoring=", "config=", "prefix=", "help"
            ])
    except getopt.GetoptError as e:
        print(e)
        sys.exit(2)

    if args:
        message = "Error: non-paired arguments are not allowed."
        raise exceptions.WrongArgumentError(message)

    motif_pipeline = pipeline.MotifPipeline()
    sample_attr_path = None
    genome_fasta = None
    genome_len = None
    genome_include = None
    prefix = None
    config_name = "local"

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            description()
            sys.exit()
        elif opt in ("-i", "--input_dir"):
            motif_pipeline.set_input_dir(arg)
        elif opt in ("-o", "--output_dir"):
            motif_pipeline.set_output_dir(arg)
        elif opt in ("-f", "--genome_fasta"):
            genome_fasta = arg
        elif opt in ("-l", "--genome_len"):
            genome_len = arg
        elif opt in ("-e", "--genome_include"):
            genome_include = arg
        elif opt in ("-m", "--motif_path"):
            motif_pipeline.set_motif_path(arg)
        elif opt in ("-a", "--sample_attr"):
            motif_pipeline.set_sample_attr(arg)
        elif opt in ("-t", "--sampleinfo_table"):
            sample_attr_path = arg
        elif opt in ("-s", "--SV_types"):
            motif_pipeline.set_SV_types(arg)
        elif opt in ("-r", "--rand_sv_ratio"):
            motif_pipeline.set_rand_sv_ratio(arg)
        elif opt in ("-F", "--FIMO_thresh"):
            motif_pipeline.set_FIMO_thresh(arg)
        elif opt in ("-A", "--AME_scoring"):
            motif_pipeline.set_AME_scoring(arg)
        elif opt in ("-c", "--config"):
            config_name = arg
        elif opt in ("-p", "--prefix"):
            prefix = arg
        else:
            message = "Error: {opt} is not a valid option".format(opt=opt)
            raise exceptions.WrongArgumentError(message)

    # The sample table and a sample_attr other than "all" must be given
    # together: a table with "all", or no table with a filter, is rejected.
    has_table = sample_attr_path is not None
    uses_all_samples = motif_pipeline.sample_attr == "all"
    if has_table == uses_all_samples:
        message = "Error: you must indicate both --sampleinfo_table and --sample_attr, or neither."
        raise exceptions.MissingArgumentError(message)
    if genome_fasta is None:
        message = "Error: you must indicate --genome_fasta."
        raise exceptions.MissingArgumentError(message)
    if genome_len is None:
        message = "Error: you must indicate --genome_len."
        raise exceptions.MissingArgumentError(message)
    if genome_include is None:
        message = "Error: you must indicate --genome_include."
        raise exceptions.MissingArgumentError(message)
    # These attributes only exist once the matching setter has been called.
    for pipeline_attr in ["input_dir", "output_dir", "motif_path"]:
        if not hasattr(motif_pipeline, pipeline_attr):
            message = ("Error: you must indicate --{attr}.").format(
                attr=pipeline_attr)
            raise exceptions.MissingArgumentError(message)
    motif_pipeline.set_subdir_name(prefix)
    motif_pipeline.write_description()
    motif_pipeline.set_list_bedpe(sample_attr_path)
    reference_genome = refgenome.ReferenceGenome(genome_fasta, genome_len,
                                                 genome_include)
    # os.path.dirname replaces a manual split on "/", which produced an empty
    # base directory on Windows paths and for files at the filesystem root.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    try:
        config = os.path.join(base_dir, 'configs', '{}.py'.format(config_name))
        spec = importlib.util.spec_from_file_location('', config)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        parsl.load(module.config)
    except Exception as err:
        # Chain the cause so errors raised inside the config module stay
        # visible; a bare ``except`` would also trap KeyboardInterrupt.
        raise exceptions.IncorrectPathError(
            "Cannot find the config file <{config_name}>.".format(
                config_name=config_name)) from err

    os.makedirs(motif_pipeline.output_dir + "bed_files", exist_ok=True)
    for file_name in motif_pipeline.list_bedpe:
        sv_types_to_run = get_SV_types(motif_pipeline, file_name)
        if sv_types_to_run:
            extractdata.bedpe_to_bed(reference_genome, motif_pipeline,
                                     file_name, sv_types_to_run)
    # Block until the submitted parsl tasks have finished before merging.
    parsl.wait_for_current_tasks()
    runprogram.merge(motif_pipeline)
    motif_pipeline.set_num_SV_breakpoints()
    runprogram.bedtools(motif_pipeline, reference_genome)
    runprogram.FIMO(motif_pipeline)
    runprogram.AME(motif_pipeline)
    extractdata.extract_list_sequences_AME(motif_pipeline)
    extractdata.extract_output_FIMO(motif_pipeline)
    extractdata.extract_output_AME(motif_pipeline)
    graphs.generate_histogram(motif_pipeline)