def retrieve(self):
    queue = []
    for key in self._links_dict:
        for url in self._links_dict[key]:
            queue.append({"url": url,
                          "out_dir": os.path.join(self.reference_dir, key)})
    Utilities.single_core_queue(self._dl_handler, queue)
    print("Download completed")
def __init__(self, charts_dir, deploy_prefix, nodes_number, threads_number,
             sampledata_file, refdata_file, output_mask, output_dir):
    self.charts_directory = Utilities.ends_with_slash(charts_dir)
    self.deploy_prefix = re.sub(r"[^A-Za-z0-9\-]+", "-", deploy_prefix)
    self.config_chart = Chart(
        file="{}config.yaml".format(self.charts_directory),
        # URL is not supported
        url="https://raw.githubusercontent.com/ivasilyev/biopipelines-docker/master/bwt_filtering_pipeline/templates/bwt-fp-only-coverage/config.yaml")
    self.cfgDict = {
        "QUEUE_NAME": "{}-queue".format(self.deploy_prefix),
        "MASTER_CONTAINER_NAME": "{}-master".format(self.deploy_prefix),
        "JOB_NAME": "{}-job".format(self.deploy_prefix),
        "WORKER_CONTAINER_NAME": "{}-worker".format(self.deploy_prefix),
        "ACTIVE_NODES_NUMBER": nodes_number,
        "THREADS_NUMBER": threads_number,
        "SAMPLEDATA": sampledata_file,
        "REFDATA": refdata_file,
        "OUTPUT_MASK": output_mask,
        "OUTPUT_DIR": Utilities.ends_with_slash(output_dir)
    }
    self.master_chart = Chart(
        file="{}master.yaml".format(self.charts_directory),
        url="https://raw.githubusercontent.com/ivasilyev/biopipelines-docker/master/bwt_filtering_pipeline/templates/bwt-fp-only-coverage/master.yaml")
    self.worker_chart = Chart(
        file="{}worker.yaml".format(self.charts_directory),
        url="https://raw.githubusercontent.com/ivasilyev/biopipelines-docker/master/bwt_filtering_pipeline/templates/bwt-fp-only-coverage/worker.yaml")
def run_cutadapt(input_list: list):
    # Note: the unpacking order strictly depends on the order of the upstream dataframe columns
    sample_name, sample_file_1, sample_file_2 = input_list
    _ADAPTER = "AGATCGGAAGAG"
    out_file_1, out_file_2, log_file = [
        os.path.join(cutadaptDir, "{}_cutadapt.{}".format(sample_name, i))
        for i in ("1.fq.gz", "2.fq.gz", "log")
    ]
    cmd = "cutadapt -a {ad} -A {ad} -m 50 -o {o1} -p {o2} {i1} {i2}".format(
        ad=_ADAPTER, i1=sample_file_1, i2=sample_file_2,
        o1=out_file_1, o2=out_file_2)
    try:
        # Remove stale output before re-running
        for _f in [out_file_1, out_file_2, log_file]:
            if os.path.exists(_f):
                os.remove(_f)
        log = subprocess.getoutput(cmd)
    except PermissionError:
        raise ValueError("Permission denied, please run `sudo chmod -R 777 {}`".format(
            os.path.dirname(sample_file_1)))
    Utilities.dump_string(log, file=log_file)
    return {"sample_name": sample_name,
            "trimmed_file_1": out_file_1,
            "trimmed_file_2": out_file_2}
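# Usage sketch, not part of the original pipeline: feed rows of a hypothetical
# `sampledata_df` dataframe (assumed column order: sample name, raw read file 1,
# raw read file 2, matching the unpacking order inside `run_cutadapt`) through
# the multiprocessing queue helper.
def _demo_run_cutadapt_batch(sampledata_df: pd.DataFrame) -> pd.DataFrame:
    """Illustrative helper; `sampledata_df` is an assumed upstream dataframe"""
    results = Utilities.multi_core_queue(run_cutadapt, sampledata_df.values.tolist())
    return pd.DataFrame(results)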
def parse_fastq(self, output_file: str):
    import os
    import math
    if os.path.isfile(output_file):
        os.remove(output_file)
        print("Deleted file in order to replace it with new data: '{}'".format(output_file))
    counter = 0
    first_position = 0
    last_position = self.CHUNK_SIZE
    chunks_number = math.ceil(len(self.raw_fastqs_list) / self.CHUNK_SIZE)
    while last_position < len(self.raw_fastqs_list):
        with open(output_file, mode="a", encoding="utf-8") as f:
            f.write("{}\n".format("\n".join(
                Utilities.multi_core_queue(
                    self.mp_parse_fastq_line,
                    self.raw_fastqs_list[first_position:last_position]))))
        counter += 1
        print("Passed FASTQ parse iteration: {} (of {})".format(counter, chunks_number))
        first_position += self.CHUNK_SIZE
        last_position += self.CHUNK_SIZE
    # Process the remaining (possibly partial) chunk; the strict `<` also guards
    # against appending an empty line when the input list is empty
    if first_position < len(self.raw_fastqs_list):
        with open(output_file, mode="a", encoding="utf-8") as f:
            f.write("{}\n".format("\n".join(
                Utilities.multi_core_queue(
                    self.mp_parse_fastq_line,
                    self.raw_fastqs_list[first_position:]))))
        print("Passed FASTQ parse last iteration")
    print("Finished parse FASTQ items: {}".format(len(self.raw_fastqs_list)))
def generate_from_directory(directory: str,
                            regex: str = DEFAULT_REGEX,
                            reads_extension: str = DEFAULT_READS_EXTENSION):
    pair_2d_array = Utilities.get_most_similar_word_pairs(
        Utilities.find_file_by_tail(directory, reads_extension))
    return SampleDataArray.generate(pair_2d_array, regex=regex,
                                    extension=reads_extension)
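# Usage sketch (the directory and extension are assumptions, and
# `generate_from_directory` is assumed to be exposed as a static method of
# SampleDataArray):
def _demo_sampledata_from_directory():
    return SampleDataArray.generate_from_directory("/data/reads",
                                                   reads_extension=".fastq.gz")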
def split(self, output_dir: str):
    output_dir = Utilities.ends_with_slash(output_dir)
    os.makedirs(output_dir, exist_ok=True)
    # Note: the dataframe must have only index and value columns
    for sample_col_name in list(self.pivot_df):
        sample_name = Utilities.filename_only(sample_col_name).split("_")[0]
        sample_file_name = "{}{}.tsv".format(output_dir, sample_name)
        self.pivot_df[sample_col_name].reset_index().rename(
            columns={sample_col_name: self.value_col_name}).to_csv(
                sample_file_name, sep="\t", header=True, index=False)
        self._sample_names_list.append(sample_file_name)
def parse_spades_version(sample_name_):
    log_file = [
        i for i in Utilities.scan_whole_dir(
            "/data1/bio/projects/vradchenko/lactobacillus_salivarius/pga-pe/log")
        if i.endswith(".log") and all(j in i for j in ["spades", sample_name_])
    ][0]
    log_lines = Utilities.load_list(log_file)
    image_version_line = [
        i for i in log_lines
        if i.strip().startswith("Status: Image is up to date for ")
    ][0].strip()
    spades_version = re.split("[\t ]+", image_version_line)[-1]
    return spades_version
def run_spades(input_list: list):
    # As in `run_cutadapt`, the unpacking order depends on the upstream dataframe columns
    sample_name, sample_file_1, sample_file_2 = input_list
    out_dir = os.path.join(spadesDir, sample_name)
    subprocess.getoutput("rm -rf {}".format(out_dir))
    os.makedirs(out_dir)
    cmd = "spades.py --careful -o {out} -1 {i1} -2 {i2}".format(
        out=out_dir, i1=sample_file_1, i2=sample_file_2)
    log = subprocess.getoutput(cmd)
    log_file = os.path.join(out_dir, "{}_spades.log".format(sample_name))
    Utilities.dump_string(log, file=log_file)
    return {"sample_name": sample_name,
            "assembly": os.path.join(out_dir, "contigs.fasta")}
def __init__(self, sample_name: str, sample_read_files: list):
    self.state = dict()
    self.name = sample_name.strip()
    self.reads = sorted(Utilities.remove_empty_values(sample_read_files))
    self.taxa = ""
    self.is_valid = False
    self._validate_reads()
def generate(pair_2d_array: list, regex: str = DEFAULT_REGEX,
             extension: str = DEFAULT_READS_EXTENSION):
    arr = SampleDataArray()
    for sample_read_files in pair_2d_array:
        sample_read_files = sorted(sample_read_files)
        sample_file = os.path.basename(sample_read_files[0])
        sample_name = Utilities.safe_findall(
            regex, re.sub(f"{extension}$", "", sample_file))
        if len(sample_name) == 0:
            raise ValueError(
                f"Cannot process the file '{sample_file}' with the regex '{regex}'")
        if any(sample_name not in i for i in sample_read_files):
            raise ValueError(
                f"Some files from the list '{sample_read_files}' do not contain "
                f"'{sample_name}' parsed by the regex '{regex}'")
        if sample_name in arr.lines.keys():
            print(f"Duplicate sample data line key (the regex '{regex}' may need "
                  f"checking): '{sample_name}'")
            # Deduplicate by appending an incrementing suffix
            c = 0
            sample_name_ = str(sample_name)
            while sample_name in arr.lines.keys():
                c += 1
                sample_name = "{}.{}".format(sample_name_, c)
        arr.lines[sample_name] = SampleDataLine(sample_name, sample_read_files)
    return arr
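# Illustrative sketch (file names and the regex are assumptions): a pair like
# "S1_R1.fastq.gz"/"S1_R2.fastq.gz" with the regex r"(.+)_R[12]" yields the sample
# name "S1"; a second pair resolving to the same name would become "S1.1".
def _demo_sampledata_generate():
    pairs = [["/data/reads/S1_R1.fastq.gz", "/data/reads/S1_R2.fastq.gz"]]
    return SampleDataArray.generate(pairs, regex=r"(.+)_R[12]",
                                    extension=".fastq.gz")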
def _validate_reads(self):
    c = 0
    for read_file in self.reads:
        if not Utilities.is_file_valid(read_file, False):
            print("Raw read file not found: '{}'".format(read_file))
            c += 1
    self.is_valid = c == 0
def set_groupdata_dict(self, groupdata_file: str):
    self.groupdata_file = groupdata_file
    self.groupdata_digest_name = Utilities.filename_only(
        self.groupdata_file).replace(".groupdata", "")
    groupdata_df = pd.read_table(self.groupdata_file, sep="\t", header=None,
                                 names=["sample_name", "group_name"])
    # Select 'sample_name' as a Series; selecting it as a one-column dataframe
    # (`[["sample_name"]]`) would make `set()` yield the column name instead
    # of the sample names
    self.groupdata_dict = {
        i: sorted(set(groupdata_df.loc[groupdata_df["group_name"] == i, "sample_name"]))
        for i in sorted(set(groupdata_df["group_name"]))
    }
    self.raw_all_sample_names_list = sorted(set(groupdata_df["sample_name"]))
def __init__(self, fastas_string):
    self._fastas_string = fastas_string
    # Split the multi-FASTA string on record boundaries; splitting on "\n>"
    # strips the '>' from every record except the first, hence the re-prefixing
    self._raw_fastas_list = [
        ">{}".format(j) if not j.startswith(">") else j
        for j in [i.strip() for i in re.split("\n>", self._fastas_string)]
    ]
    self._parsed_fastas_list = Utilities.remove_empty_values(
        [FASTA(i) for i in self._raw_fastas_list])
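# Minimal sketch of the splitting logic above, standalone (the sequence content
# is made up):
def _demo_multifasta_split():
    text = ">seq_1 demo\nACGT\n>seq_2 demo\nGGCC"
    chunks = [i.strip() for i in re.split("\n>", text)]
    return [">{}".format(j) if not j.startswith(">") else j for j in chunks]
    # -> ['>seq_1 demo\nACGT', '>seq_2 demo\nGGCC']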
def set_raw_pvals_dir(self, output_dir: str):
    self.output_dir = Utilities.ends_with_slash(output_dir)
    self.raw_pvals_dir = "{OUTPUT_DIR}pvals/{VALUE_COLUMN}/{GROUPS}/".format(
        OUTPUT_DIR=self.output_dir,
        VALUE_COLUMN=self.pivot_value_col_name_abbreviation,
        GROUPS=self.groupdata_digest_name)
    self.digest_dir = "{OUTPUT_DIR}digest/{VALUE_COLUMN}/{GROUPS}/".format(
        OUTPUT_DIR=self.output_dir,
        VALUE_COLUMN=self.pivot_value_col_name_abbreviation,
        GROUPS=self.groupdata_digest_name)
def process_blast_report(high_scoring_pairs: list):
    first_report = high_scoring_pairs[0]
    reference_header = first_report.get("title")
    accession_id = Utilities.safe_findall(r"\|* *gi\| *([^|]+) *\|",
                                          reference_header)
    return dict(assembly_file=first_report.get("assembly_file"),
                reference_header=reference_header,
                accession_id=accession_id)
def get_genera_dict(input_list: list):
    # Keep the first capitalized word of 5+ letters as the putative genus name
    return {
        j: () for j in sorted([
            Utilities.safe_findall("([A-Z][a-z]{4,})", i).strip()
            for i in set(input_list) if isinstance(i, str)
        ]) if len(j) > 0
    }
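# Worked example (the inputs are hypothetical): strings such as
# "Escherichia coli O157:H7" and "Lactobacillus salivarius" map to the keys
# "Escherichia" and "Lactobacillus", with empty-tuple placeholders as values.
def _demo_get_genera_dict():
    return get_genera_dict(["Escherichia coli O157:H7", "Lactobacillus salivarius"])
    # -> {"Escherichia": (), "Lactobacillus": ()}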
def mp_get_and_blast_largest_contig(assembly_file: str):
    if os.path.getsize(assembly_file) == 0:
        print("Cannot process the empty file: '{}'".format(assembly_file))
        return
    with open(assembly_file) as f:
        contig_records = sorted(SeqIO.parse(f, "fasta"), key=len, reverse=True)
    largest_contig = randomize_gene_slice(contig_records[0]).format("fasta")
    # The delay to avoid an NCBI ban
    randomize_sleep()
    # NCBI query
    result_handle = attempt_func(NCBIWWW.qblast, ("blastn", "nt", largest_contig))
    blast_record = NCBIXML.read(result_handle)
    # Based on: https://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc95
    _E_VALUE_THRESH = 0.04
    _QUERY_REPORT_SYMBOLS = 75
    high_scoring_pairs = []
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < _E_VALUE_THRESH:
                high_scoring_pairs.append(dict(
                    title=alignment.title, length=alignment.length,
                    expect=hsp.expect, score=hsp.score, bits=hsp.bits,
                    identities=hsp.identities, positives=hsp.positives,
                    assembly_file=assembly_file,
                    query="...\n".join([hsp.query[:_QUERY_REPORT_SYMBOLS],
                                        hsp.match[:_QUERY_REPORT_SYMBOLS],
                                        hsp.sbjct[:_QUERY_REPORT_SYMBOLS], ""])))
    high_scoring_pairs = sorted(high_scoring_pairs,
                                key=lambda x: x.get("score"), reverse=True)
    # Export BLAST results
    Utilities.dump_string(
        json.dumps(high_scoring_pairs, sort_keys=True, indent=4),
        "{}.BLAST.json".format(os.path.splitext(assembly_file)[0]))
    return high_scoring_pairs
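# Usage sketch tying this to `process_blast_report` above (the assembly paths
# are hypothetical; NCBI throttling makes single-core iteration the safer
# choice despite the `mp_` prefix):
def _demo_blast_assemblies(assembly_files: list):
    reports = Utilities.single_core_queue(mp_get_and_blast_largest_contig,
                                          assembly_files)
    return [process_blast_report(i) for i in reports if i]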
def join_by_value_columns(tables: list, index_col_name: str, value_col_name_: str):
    dfs_list = [
        Utilities.load_tsv(i).set_index(index_col_name)[value_col_name_].rename(i)
        for i in tables
    ]
    out = pd.concat(dfs_list, axis=1, sort=False).sort_index()
    out.index.names = [index_col_name]
    return out
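# Minimal usage sketch (the file paths and column names are assumptions): every
# TSV contributes one value column, renamed to its source path, yielding a
# reference-by-sample pivot table.
def _demo_join_coverage_tables():
    return join_by_value_columns(
        ["/data/coverage/sample_1_coverage.tsv",
         "/data/coverage/sample_2_coverage.tsv"],
        index_col_name="reference_id", value_col_name_="coverage")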
def join_and_annotate(self):
    annotation_df = Utilities.load_tsv(self.annotation_file).set_index(REFERENCE_COL_NAME)
    values_df = self.join_by_value_columns(self.coverage_files, REFERENCE_COL_NAME,
                                           self.value_col_name)
    out = pd.concat([annotation_df, values_df], axis=1, sort=False)
    out.index.names = [REFERENCE_COL_NAME]
    return out
def dump_index_guide(input_nucleotide_fasta: str, output_dir: str):
    if not Utilities.is_file_valid(input_nucleotide_fasta):
        raise ValueError(f"Invalid file: '{input_nucleotide_fasta}'")
    cmd_0 = f"""
    export IMG=ivasilyev/bwt_filtering_pipeline_worker:latest && \
    docker pull "$IMG" && \
    docker run --rm -v /data:/data -v /data1:/data1 -v /data2:/data2 -it "$IMG" \
        bash -c '
            cd "{output_dir}";
            python3 "$HOME/scripts/cook_the_reference.py" \
                --input "{input_nucleotide_fasta}" \
                --output "{output_dir}";
        '
    """
    cmd = Utilities.join_lines(cmd_0)
    out_file = os.path.join(output_dir, "index.sh")
    Utilities.dump_string(cmd, out_file)
    print(f"For indexing, run outside of Docker: 'bash \"{out_file}\"'")
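# Usage sketch (both paths are hypothetical): writes an `index.sh` helper next
# to the reference so the indexing can be launched later outside of Docker.
def _demo_dump_index_guide():
    dump_index_guide("/data/reference/custom/custom.fasta",
                     "/data/reference/custom/index")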
def query2fasta(self):
    output_dict = {"FASTAs_list": [], "annotations_series_list": []}
    for _soup in self._row_soups_list:
        _row_dict = self.parse_table_row(_soup)
        _locations_list = _row_dict["Sequence Location"].split("..")
        # Filtering expression: keep rows mentioning the queried gene and
        # carrying a two-part sequence location
        if (self._gene.lower() in _row_dict["Name"].lower()
                or self._gene.lower() in _row_dict["Description"].lower()
                or self._gene.lower() in _row_dict["Aliases"].lower()) \
                and len(_locations_list) == 2:
            fasta = self._get_fasta(_row_dict["Sequence ID"], _locations_list)
            output_dict["FASTAs_list"].append(fasta)
            output_dict["annotations_series_list"].append(
                pd.Series(Utilities.dict2pd_series(_row_dict), name=fasta.header))
    return output_dict
def __init__(self, reference_describer_instance, value_col_name):
    self.describer = reference_describer_instance
    self.value_col_name = value_col_name
    self.coverage_files = [
        i for i in Utilities.scan_whole_dir(projectDescriber.MAPPED_DATA_DIR)
        if all(j in i for j in [self.describer.ALIAS, "coverage.tsv"])
    ]
    self.annotation_file = self.describer.get_refdata_dict().get(
        "sequence_1").annotation_file
    self.raw_annotated_pivot = self.join_and_annotate()
def retrieve(self):
    if os.path.exists(self.reference_dir):
        print("Warning! The reference path exists: '{}'".format(self.reference_dir))
    os.makedirs(self.reference_dir, exist_ok=True)
    chromosomes_dir = os.path.join(self.reference_dir, "chromosomes")
    os.makedirs(chromosomes_dir, exist_ok=True)
    # UCSC returns HTTP 530 when downloads are attempted in multiple threads
    compressed_chromosomes = Utilities.single_core_queue(
        self._dl_wrapper,
        [{"chromosome": i, "chromosomes_dir": chromosomes_dir}
         for i in self.CHROMOSOMES])
    # Process sequence
    self.parsed_records = Utilities.flatten_2d_array(
        Utilities.single_core_queue(self._parse_gzip_fna, compressed_chromosomes))
    self.nfasta_file = os.path.join(self.reference_dir, "hg19.fasta")
    SeqIO.write(self.parsed_records, self.nfasta_file, "fasta")
    # Process annotation
    self.index_dir = self.describer.get_index_guide(self.nfasta_file)
def process_genbank_report(d: dict):
    genbank_records = d.get("genbank_records")
    genbank_record = genbank_records[0]
    cds_number = len([i for i in genbank_record.features if i.type == "CDS"])
    qualifiers_dict = [
        i.qualifiers for i in genbank_record.features if i.type == "source"
    ][0]
    organism = Utilities.remove_empty_values(
        qualifiers_dict.get("organism")[0].split(" "))[:2]
    strain = " ".join(organism + [qualifiers_dict.get("strain")[0]])
    taxonomy_id = Utilities.safe_findall(r"\d+", [
        i for i in qualifiers_dict.get("db_xref")
        if i.split(":")[0].strip() == "taxon"
    ][0])
    return dict(assembly_file=d.get("assembly_file"), strain=strain,
                taxonomy_id=taxonomy_id,
                reference_accession_id=genbank_record.id,
                cds_number=cds_number, reference_bp=len(genbank_record),
                reference_description=genbank_record.description)
def __init__(self, version: str):
    _DL_PAGE_URL = "http://202.120.12.135/TADB2/download.html"
    self.describer = ReferenceDescriber()
    self.describer.VERSION = version
    self.describer.update_alias()
    self.reference_dir = os.path.join("/data/reference", self.describer.NAME,
                                      self.describer.ALIAS)
    links = [i for i in Utilities.scrap_links_from_web_page(_DL_PAGE_URL)
             if i.endswith(".fas")]
    self._fasta_types = ["nucleotide", "protein"]
    self._links_dict = {k: [i for i in links if i.split("/")[-2] == k]
                        for k in self._fasta_types}
    assert len(self._links_dict["nucleotide"]) + len(self._links_dict["protein"]) == len(links)
    self.nfasta = os.path.join(self.reference_dir,
                               "{}.fasta".format(self.describer.ALIAS))
    self.pfasta = os.path.join(self.reference_dir,
                               "{}_protein.fasta".format(self.describer.ALIAS))
    self.index_dir = ""
def annotate(self):
    # Process nucleotide FASTA
    self._raw_nfasta_df = pd.read_table(self.annotation_file, sep="\t", header=0)
    raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
    processed_nfasta_headers = [
        Utilities.dict2pd_series(i) for i in Utilities.multi_core_queue(
            self._mp_parse_nfasta_header, raw_nfasta_headers)
    ]
    self._processed_nfasta_df = Utilities.merge_pd_series_list(
        processed_nfasta_headers).sort_values("former_id")
    self.nfasta_df = Utilities.left_merge(self._raw_nfasta_df,
                                          self._processed_nfasta_df, "former_id")
    # Process protein FASTA; read the headers via a context manager so the
    # file handle is closed deterministically
    with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as f:
        raw_pfasta_headers = sorted(set(
            j for j in (re.sub("^>", "", i).strip() for i in f if i.startswith(">"))
            if len(j) > 0))
    processed_pfasta_headers = [
        Utilities.dict2pd_series(i) for i in Utilities.multi_core_queue(
            self._mp_parse_nfasta_header, raw_pfasta_headers)
    ]
    self.pfasta_df = Utilities.merge_pd_series_list(
        processed_pfasta_headers).sort_values("former_id")
    self.pfasta_df.rename(columns={"geninfo_id": "protein_geninfo_id",
                                   "refseq_id": "genpept_id",
                                   "description": "protein_description",
                                   "host": "protein_host"}, inplace=True)
    self.merged_df = Utilities.left_merge(self.nfasta_df, self.pfasta_df,
                                          "tadb_id", "category", "gene_symbol")
    self.merged_df = Utilities.combine_duplicate_rows(self.merged_df, "reference_id")
def _mp_parse_nfasta_header(header):
    output_dict = dict(former_id=header)
    output_dict["genbank_id"] = Utilities.safe_findall(r"^gb\|([^|]+)", header)
    output_dict["is_antisense_strand"] = header.split("|")[2].startswith("-")
    output_dict["locus"] = Utilities.safe_findall(r"\|(\d+\-\d+)", header)
    output_dict["aro_id"] = Utilities.safe_findall(r"\|ARO:(\d+)", header)
    gene_chunk = header.split("|")[-1]
    output_dict["host"] = Utilities.safe_findall(r"\[(.+)\]", gene_chunk)
    output_dict["gene_description"] = gene_chunk.replace(
        "[{}]".format(output_dict["host"]), "").strip()
    # Pick the shortest sufficiently long token as the putative gene symbol
    _MIN_GENE_SYMBOL_LENGTH = 3
    _NON_GENE_SYMBOL_WORDS = ("DNA", "RNA")
    output_dict["gene_symbol"] = min(
        [j for j in [i.strip() for i in output_dict.get("gene_description").split(" ")]
         if len(j) >= _MIN_GENE_SYMBOL_LENGTH and j not in _NON_GENE_SYMBOL_WORDS],
        key=len)
    return Utilities.dict2pd_series(output_dict)
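# Illustrative input (the header below is made up in a CARD-like layout, not a
# real database record): it parses to genbank_id="ACC123", locus="101-1300",
# aro_id="3000000", host="Host organism", gene_symbol="gene".
def _demo_parse_card_like_header():
    return _mp_parse_nfasta_header(
        "gb|ACC123|+|101-1300|ARO:3000000|hypothetical resistance gene [Host organism]")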
def _mp_parse_nfasta_header(header: str):
    # Spaces inside the replacement templates are important here
    _VFDB_REGEXES = (("vfdb_id", r"^VFG(\d+)", "VFG{}"),
                     ("gene_accession_id", r"\(([^\(]+)\) ", "({}) "),
                     ("gene_symbol", r"^\(([^\(]+)\) ", "({}) "),
                     ("gene_host", r"\[([^\]]+)\]$", "[{}]"),
                     ("gene_name", r" \[([^\]]+)\] $", " [{}] "),
                     ("gene_description", ".*", "{}"))
    out = {"former_id": header}
    # Peel each matched fragment off the header after a successful extraction
    for key, regex, replacement in _VFDB_REGEXES:
        out[key] = Utilities.safe_findall(regex, header)
        if len(out.get(key)) > 0:
            header = header.replace(replacement.format(out.get(key)), "")
    return {k: out.get(k).strip() for k in out}
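# Illustrative input (a made-up VFDB-style header, not a real record): the
# peeling loop extracts vfdb_id="000001", gene_accession_id="gb|AAA00001",
# gene_symbol="genX", gene_host="Some bacterium",
# gene_name="Example virulence factor (VF0001)", and leaves
# gene_description="example factor".
def _demo_parse_vfdb_like_header():
    return _mp_parse_nfasta_header(
        "VFG000001(gb|AAA00001) (genX) example factor "
        "[Example virulence factor (VF0001)] [Some bacterium]")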
def annotate(self):
    self._raw_nfasta_df = Utilities.load_tsv(self.annotation_file)
    raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
    processed_nfasta_headers = [
        Utilities.dict2pd_series(i) for i in Utilities.multi_core_queue(
            self._mp_parse_nfasta_header, raw_nfasta_headers)
    ]
    self._processed_nfasta_df = Utilities.merge_pd_series_list(
        processed_nfasta_headers).sort_values("former_id")
    # Zero-fill width: take the length of the longest ID, not the length of the
    # lexicographic maximum
    zf_len = len(max(self._processed_nfasta_df["vfdb_id"].values.tolist(), key=len))
    # Join the table assembled from pFASTA headers
    raw_pfasta_headers = []
    with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as _f:
        for _line in _f:
            if _line.startswith(">"):
                raw_pfasta_headers.append(re.sub("^>", "", _line).strip())
    raw_pfasta_headers = sorted(set(i for i in raw_pfasta_headers if len(i) > 0))
    processed_pfasta_headers = [
        Utilities.dict2pd_series(i) for i in Utilities.multi_core_queue(
            self._mp_parse_pfasta_header, raw_pfasta_headers)
    ]
    self._processed_pfasta_df = Utilities.merge_pd_series_list(
        processed_pfasta_headers).sort_values("protein_header")
    self._processed_pfasta_df["vfdb_id"] = self._processed_pfasta_df["vfdb_id"].str.zfill(zf_len)
    # Join the provided table. Note: the table file is placed in the same
    # directory as the merged protein FASTA file
    vfs_table_file = os.path.join(os.path.dirname(self._raw_pfasta_file), "VFs.xls")
    vfs_df = pd.read_excel(vfs_table_file, sheet_name="VFs", header=1).fillna("")
    vfs_df["vfdb_id"] = vfs_df["VFID"].str.extract(r"VF(\d+)")[0].str.zfill(zf_len)
    self.merged_df = pd.concat([
        i.set_index("vfdb_id").sort_index()
        for i in [self._processed_nfasta_df, self._processed_pfasta_df, vfs_df]
    ], axis=1, sort=False).sort_index()
    self.merged_df.index.names = ["vfdb_id"]
    self.merged_df = self.merged_df.loc[
        self.merged_df["former_id"].str.len() > 0].reset_index()
    self.merged_df = Utilities.left_merge(self._raw_nfasta_df, self.merged_df,
                                          "former_id")
def define_species(_sample_name: str):
    _SPECIES = {
        "Bacillus subtilis BZR 336g": 336,
        "Bacillus subtilis BZR 517": 517,
        "Lactobacillus salivarius": 1,
        "Lactobacillus curvatus": 2,
        "Lactobacillus heilongjiangensis": 8
    }
    first_digits = Utilities.safe_findall(r"^\d+", _sample_name)
    if len(first_digits) > 0:
        first_digits = int(first_digits)
        for k in _SPECIES:
            if first_digits == _SPECIES.get(k):
                return k
    print("Cannot define species: '{}'".format(_sample_name))
    return "_"
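# Worked example (the sample names are hypothetical): "336g_S1" starts with the
# digits 336, mapping to "Bacillus subtilis BZR 336g"; a name without a known
# numeric prefix falls through to the "_" placeholder.
def _demo_define_species():
    assert define_species("336g_S1") == "Bacillus subtilis BZR 336g"
    assert define_species("unknown_sample") == "_"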