def __init__(self, input_file, output_dir, index_name, threads):
    """Probe for the salmon executable and remember the run settings.

    ``which salmon`` is executed through ``Cmd`` before anything is stored.
    NOTE(review): presumably ``Cmd.run`` fails when salmon is not on the
    PATH -- confirm against the ``Cmd`` implementation.

    Args:
        input_file: Stored as ``self.input_file``.
        output_dir: Stored as ``self.output_dir``.
        index_name: Stored as ``self.index_name``.
        threads: Stored as ``self.threads``.
    """
    probe = Cmd("which salmon")
    probe.run()

    self.input_file, self.output_dir = input_file, output_dir
    self.index_name, self.threads = index_name, threads
def build_index(self) -> None:
    """Create the salmon index by running ``salmon index``.

    The command line is assembled from ``self.threads``, ``self.input_file``
    and ``self.index_name`` and always passes ``--keepDuplicates``.
    """
    logger.debug("Build salmon index.")
    # TODO: Implement check to avoid duplicate runs
    command = (
        f"salmon index -p {self.threads} -t {self.input_file} "
        f"-i {self.index_name} --keepDuplicates"
    )
    Cmd(command).run()
def __init__(self, input_file, index_name, threads):
    """Probe for the hisat2 executable and detect an existing index.

    ``which hisat2`` is executed through ``Cmd`` first.
    NOTE(review): presumably ``Cmd.run`` fails when hisat2 is not on the
    PATH -- confirm against the ``Cmd`` implementation.

    Args:
        input_file: Stored as ``self.input_file``.
        index_name: Path prefix of the index files; stored as
            ``self.index_name``.
        threads: Stored as ``self.threads``.
    """
    which = Cmd("which hisat2")
    which.run()

    # The index counts as already built when exactly eight
    # "<index_name>.*.ht2" files exist. The prefix is escaped so that glob
    # metacharacters in the path ("[", "*", "?") are matched literally
    # instead of being interpreted as a pattern.
    index_files = glob.glob(f"{glob.escape(str(index_name))}.*.ht2")
    self.index_build_has_run = len(index_files) == 8

    self.input_file = input_file
    self.index_name = index_name
    self.threads = threads
def build_index(self):
    """Run ``hisat2-build`` unless the index is already flagged as built.

    When ``self.index_build_has_run`` is truthy only a debug message is
    logged. Otherwise the index is built from ``self.input_file`` into
    ``self.index_name`` using ``self.threads`` threads, and the flag is set
    to ``True`` afterwards.
    """
    if self.index_build_has_run:
        logger.debug("Skipping index building.")
        return

    logger.debug("Build Hisat2 index.")
    command = f"hisat2-build -q -p {self.threads} {self.input_file} {self.index_name}"
    Cmd(command).run()
    self.index_build_has_run = True
def update_database(self, database_dir, busco_group):
    """Install the dammit databases via ``dammit databases --install``.

    Both arguments are stored on the instance before the command is built,
    so they remain available as ``self.database_dir`` and
    ``self.busco_group`` after the call.

    Args:
        database_dir: Value passed to ``--database-dir``.
        busco_group: Value passed to ``--busco-group``.
    """
    logger.info("Update dammit database.")
    self.database_dir, self.busco_group = database_dir, busco_group
    command = (
        f"dammit databases --install --n_threads {self.threads} "
        f"--database-dir {self.database_dir} --busco-group {self.busco_group}"
    )
    Cmd(command).run()
def run(self, reads):
    """Map reads with Hisat2 and return the resulting SAM records.

    The hisat2 output is piped through ``samtools view``; the command's
    captured stdout is then split into lines.

    Args:
        reads: Sequence with one read file (single end) or two read files
            (paired end).

    Returns:
        A generator over the stdout lines that do not start with ``"@"``.
        The last element of the newline split is always dropped
        (presumably the empty string after the trailing newline).

    Raises:
        ValueError: If ``reads`` holds neither one nor two entries.
    """
    # Validate first: previously an unsupported number of read files left
    # the command variable unbound and failed with an UnboundLocalError.
    if len(reads) == 1:
        # single end reads
        read_args = f"-U {reads[0]}"
    elif len(reads) == 2:
        # paired end reads
        read_args = f"-1 {reads[0]} -2 {reads[1]}"
    else:
        raise ValueError(
            f"Expected 1 (single end) or 2 (paired end) read files, got {len(reads)}."
        )

    logger.debug("Perform Hisat2 mapping.")
    hisat = Cmd(
        f"hisat2 -q --threads {self.threads} -k 1 -x {self.index_name} "
        f"{read_args} --no-unal | "
        f"samtools view --threads {self.threads} -hS -F 4 -q 1 -O SAM"
    )
    hisat.run()
    self.mapping_has_run = True
    return (
        entry
        for entry in hisat.stdout.split("\n")[:-1]
        if not entry.startswith("@")
    )
def run(self, graph_file, output_file):
    """Cluster a graph file with MCL and write the clusters to a file.

    MCL is invoked with ``--abc``: each input line encodes an edge as two
    labels (the 'A' and the 'B') and a numerical value (the 'C'), all
    separated by white space, e.g.::

        A B 20
        A C 10

    Each line of the output file is one cluster of tab-separated labels.

    Args:
        graph_file: Path handed to ``mcl`` as its input.
        output_file: Path handed to ``mcl -o``; a file already present at
            this path is removed before MCL runs.
    """
    logger.debug("MCL clustering...")

    if os.path.exists(output_file):
        os.remove(output_file)

    command = (
        f"mcl {graph_file} -I {self.inflation} --abc -o {output_file} "
        f"-te {self.threads} -resource 4 -V all"
    )
    Cmd(command).run()
def run_pipe(self, graph_file):
    """Cluster with MCL through stdin/stdout instead of files on disk.

    MCL is invoked with ``-`` as input and ``-o -`` as output, so no
    intermediate files are written. With ``--abc`` each input line encodes
    an edge as two labels (the 'A' and the 'B') and a numerical value
    (the 'C'), all separated by white space, e.g.::

        A B 20
        A C 10

    Each output line is one cluster of tab-separated labels.

    Args:
        graph_file: Passed to ``Cmd.run`` as ``in_stream``.

    Returns:
        The ``stdout`` attribute of the finished ``Cmd``.
    """
    logger.debug("MCL clustering...")
    command = f"mcl - -I {self.inflation} --abc -o - -te {self.threads} -resource 4 -V all"
    clustering = Cmd(command)
    clustering.run(in_stream=graph_file)
    return clustering.stdout
def run(self, reads: list) -> None:
    """Run the salmon mapping with the given reads.

    The mapping is skipped when
    ``<self.output_dir>/aux_info/eq_classes.txt`` already exists.

    Args:
        reads: List of reads. Either paired end (two entries) or single
            end (one entry).

    Raises:
        ValueError: If a mapping has to be performed and ``reads`` holds
            neither one nor two entries.
    """
    logger.debug("Perform salmon mapping.")

    if os.path.exists(f"{self.output_dir}/aux_info/eq_classes.txt"):
        logger.info("Skipping mapping.")
        return

    if len(reads) == 1:
        # single end reads
        read_args = f"--unmatedReads {reads[0]}"
    elif len(reads) == 2:
        # paired end reads
        read_args = f"-1 {reads[0]} -2 {reads[1]}"
    else:
        # Previously this case left the command variable unbound and
        # failed with an UnboundLocalError.
        raise ValueError(
            f"Expected 1 (single end) or 2 (paired end) read files, got {len(reads)}."
        )

    salmon = Cmd(
        f"salmon quant --libType A --validateMappings --dumpEq -p {self.threads} "
        f"-i {self.index_name} {read_args} -o {self.output_dir}"
    )
    salmon.run()
def run(self):
    """Annotate every transcriptome in ``self.transcriptomes`` with dammit.

    For each ``name -> transcriptome`` entry the expected GFF3 and namemap
    paths under ``<self.output_dir>/<name>`` are recorded in
    ``self.gff_files`` and ``self.namemaps``. ``dammit annotate`` is only
    executed when at least one of those two files does not exist yet.
    """
    logger.info("Run dammit annotation.")

    for name, transcriptome in self.transcriptomes.items():
        output_dir = f"{self.output_dir}/{name}"
        result_prefix = f"{output_dir}/{os.path.basename(transcriptome)}.dammit"

        annotation_file = f"{result_prefix}.gff3"
        self.gff_files[name] = annotation_file

        namemap_file = f"{result_prefix}.namemap.csv"
        self.namemaps[name] = namemap_file

        # Both result files present: nothing to do for this transcriptome.
        if os.path.exists(annotation_file) and os.path.exists(namemap_file):
            continue

        command = (
            f"dammit annotate {transcriptome} -o {output_dir} "
            f"--database-dir {self.database_dir} --busco-group {self.busco_group} "
            f"--n_threads {self.threads}"
        )
        Cmd(command).run()
def __init__(self, threads, inflation):
    """Remember the MCL settings, then probe for the ``mcl`` executable.

    NOTE(review): presumably ``Cmd.run`` fails when mcl is not on the
    PATH -- confirm against the ``Cmd`` implementation.

    Args:
        threads: Stored as ``self.threads``.
        inflation: Stored as ``self.inflation``.
    """
    self.threads, self.inflation = threads, inflation
    Cmd("which mcl").run()
""" Benjamin Weeks CS472 Project 2 November, 1st, 2015 Command Line Interface """ from cmd import Cmd cmd = Cmd() cmd.run()