Example #1
 def retrieve(self):
     queue = []
     for key in self._links_dict:
         for url in self._links_dict[key]:
             queue.append({"url": url, "out_dir": os.path.join(self.reference_dir, key)})
     Utilities.single_core_queue(self._dl_handler, queue)
     print("Download completed")
 def __init__(self, charts_dir, deploy_prefix, nodes_number, threads_number,
              sampledata_file, refdata_file, output_mask, output_dir):
     self.charts_directory = Utilities.ends_with_slash(charts_dir)
     self.deploy_prefix = re.sub(r"[^A-Za-z0-9\-]+", "-", deploy_prefix)
     self.config_chart = Chart(
         file="{}config.yaml".format(self.charts_directory),
         # URL is not supported
         url="https://raw.githubusercontent.com/ivasilyev/biopipelines-docker/master/bwt_filtering_pipeline/templates/bwt-fp-only-coverage/config.yaml"
     )
     self.cfgDict = {
         "QUEUE_NAME": "{}-queue".format(self.deploy_prefix),
         "MASTER_CONTAINER_NAME": "{}-master".format(self.deploy_prefix),
         "JOB_NAME": "{}-job".format(self.deploy_prefix),
         "WORKER_CONTAINER_NAME": "{}-worker".format(self.deploy_prefix),
         "ACTIVE_NODES_NUMBER": nodes_number,
         "THREADS_NUMBER": threads_number,
         "SAMPLEDATA": sampledata_file,
         "REFDATA": refdata_file,
         "OUTPUT_MASK": output_mask,
         "OUTPUT_DIR": Utilities.ends_with_slash(output_dir)
     }
     self.master_chart = Chart(
         file="{}master.yaml".format(self.charts_directory),
         url="https://raw.githubusercontent.com/ivasilyev/biopipelines-docker/master/bwt_filtering_pipeline/templates/bwt-fp-only-coverage/master.yaml"
     )
     self.worker_chart = Chart(
         file="{}worker.yaml".format(self.charts_directory),
         url="https://raw.githubusercontent.com/ivasilyev/biopipelines-docker/master/bwt_filtering_pipeline/templates/bwt-fp-only-coverage/worker.yaml"
     )
def run_cutadapt(input_list: list):
    # Note the order: it depends entirely on the order of the upstream dataframe columns
    sample_name, sample_file_1, sample_file_2 = input_list
    _ADAPTER = "AGATCGGAAGAG"
    out_file_1, out_file_2, log_file = [
        os.path.join(cutadaptDir, "{}_cutadapt.{}".format(sample_name, i))
        for i in ("1.fq.gz", "2.fq.gz", "log")
    ]
    cmd = "cutadapt -a {ad} -A {ad} -m 50 -o {o1} -p {o2} {i1} {i2}".format(
        ad=_ADAPTER,
        i1=sample_file_1,
        i2=sample_file_2,
        o1=out_file_1,
        o2=out_file_2)
    try:
        for _f in [out_file_1, out_file_2, log_file]:
            if os.path.exists(_f):
                os.remove(_f)
        log = subprocess.getoutput(cmd)
    except PermissionError:
        raise ValueError(
            "Permission denied, please run `sudo chmod -R 777 {}`".format(
                os.path.dirname(sample_file_1)))
    Utilities.dump_string(log, file=log_file)
    return {
        "sample_name": sample_name,
        "trimmed_file_1": out_file_1,
        "trimmed_file_2": out_file_2
    }
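A minimal driver sketch for run_cutadapt, assuming a pandas sampledata table whose columns follow the same order the function unpacks (sample name, forward read, reverse read); the paths, column names, and pool usage here are illustrative, not part of the original pipeline:

import multiprocessing

import pandas as pd

# Hypothetical sampledata table; the column order must match the unpacking
# inside run_cutadapt: sample name, forward read file, reverse read file
sampledata_df = pd.DataFrame(
    [["sample_01",
      "/data/reads/sample_01_R1.fq.gz",
      "/data/reads/sample_01_R2.fq.gz"]],
    columns=["sample_name", "read_1", "read_2"])

with multiprocessing.Pool() as pool:
    # Each worker returns a {"sample_name", "trimmed_file_1", "trimmed_file_2"} dict
    trimmed = pool.map(run_cutadapt, sampledata_df.values.tolist())
trimmed_df = pd.DataFrame(trimmed)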
 def parse_fastq(self, output_file: str):
     import os
     import math
     if os.path.isfile(output_file):
         os.remove(output_file)
         print("Deleted file in order to replace it with new data: '{}'".
               format(output_file))
     counter = 0
     first_position = 0
     last_position = self.CHUNK_SIZE
     chunks_number = math.ceil(len(self.raw_fastqs_list) / self.CHUNK_SIZE)
     while last_position < len(self.raw_fastqs_list):
         with open(output_file, mode="a", encoding="utf-8") as f:
             f.write("{}\n".format("\n".join(
                 Utilities.multi_core_queue(
                     self.mp_parse_fastq_line,
                     self.raw_fastqs_list[first_position:last_position]))))
         counter += 1
         print("Passed FASTQ parse iteration: {} (of {})".format(
             counter, chunks_number))
         first_position += self.CHUNK_SIZE
         last_position += self.CHUNK_SIZE
     if first_position < len(self.raw_fastqs_list):
         with open(output_file, mode="a", encoding="utf-8") as f:
             f.write("{}\n".format("\n".join(
                 Utilities.multi_core_queue(
                     self.mp_parse_fastq_line,
                     self.raw_fastqs_list[first_position:]))))
         print("Passed the last FASTQ parse iteration")
     print("Finished parsing FASTQ items: {}".format(len(
         self.raw_fastqs_list)))
 def generate_from_directory(
         directory: str,
         regex: str = DEFAULT_REGEX,
         reads_extension: str = DEFAULT_READS_EXTENSION):
     pair_2d_array = Utilities.get_most_similar_word_pairs(
         Utilities.find_file_by_tail(directory, reads_extension))
     return SampleDataArray.generate(pair_2d_array,
                                     regex=regex,
                                     extension=reads_extension)
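A usage sketch, assuming generate_from_directory is exposed as a static method of SampleDataArray and the module defaults (DEFAULT_REGEX, DEFAULT_READS_EXTENSION) apply; the reads directory is hypothetical:

array = SampleDataArray.generate_from_directory("/data/reads")
for name, line in array.lines.items():
    print(name, line.reads, line.is_valid)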
Example #6
 def split(self, output_dir: str):
     output_dir = Utilities.ends_with_slash(output_dir)
     os.makedirs(output_dir, exist_ok=True)
     # Note: the dataframe must have only index and value columns
     for sample_col_name in list(self.pivot_df):
         sample_name = Utilities.filename_only(sample_col_name).split(
             "_")[0]
         sample_file_name = "{}{}.tsv".format(output_dir, sample_name)
         self.pivot_df[sample_col_name].reset_index().rename(
             columns={
                 sample_col_name: self.value_col_name
             }).to_csv(sample_file_name, sep="\t", header=True, index=False)
         self._sample_names_list.append(sample_file_name)
Example #7
def parse_spades_version(sample_name_):
    log_file = [
        i for i in Utilities.scan_whole_dir(
            "/data1/bio/projects/vradchenko/lactobacillus_salivarius/pga-pe/log"
        )
        if i.endswith(".log") and all(j in i for j in ["spades", sample_name_])
    ][0]
    log_lines = Utilities.load_list(log_file)
    image_version_line = [
        i for i in log_lines
        if i.strip().startswith("Status: Image is up to date for ")
    ][0].strip()
    spades_version = re.split(r"[\t ]+", image_version_line)[-1]
    return spades_version
def run_spades(input_list: list):
    # As above, the order must match the upstream dataframe columns
    sample_name, sample_file_1, sample_file_2 = input_list
    out_dir = os.path.join(spadesDir, sample_name)
    subprocess.getoutput("rm -rf {}".format(out_dir))
    os.makedirs(out_dir)
    cmd = "spades.py --careful -o {out} -1 {i1} -2 {i2}".format(
        out=out_dir, i1=sample_file_1, i2=sample_file_2)
    log = subprocess.getoutput(cmd)
    log_file = os.path.join(out_dir, "{}_spades.log".format(sample_name))
    Utilities.dump_string(log, file=log_file)
    return {
        "sample_name": sample_name,
        "assembly": os.path.join(out_dir, "contigs.fasta")
    }
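A chaining sketch: the dicts returned by run_cutadapt carry exactly the fields run_spades unpacks, so the two steps can be piped together. "trimmed" is a hypothetical list of run_cutadapt result dicts:

assemblies = [
    run_spades([d["sample_name"], d["trimmed_file_1"], d["trimmed_file_2"]])
    for d in trimmed
]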
 def __init__(self, sample_name: str, sample_read_files: list):
     self.state = dict()
     self.name = sample_name.strip()
     self.reads = sorted(Utilities.remove_empty_values(sample_read_files))
     self.taxa = ""
     self.is_valid = False
     self._validate_reads()
 def generate(pair_2d_array: list,
              regex: str = DEFAULT_REGEX,
              extension: str = DEFAULT_READS_EXTENSION):
     arr = SampleDataArray()
     for sample_read_files in pair_2d_array:
         sample_read_files = sorted(sample_read_files)
         sample_file = os.path.basename(sample_read_files[0])
         sample_name = Utilities.safe_findall(
             regex, re.sub(f"{extension}$", "", sample_file))
         if len(sample_name) == 0:
             raise ValueError(
                 f"Cannot process the file '{sample_file}' with the regex '{regex}'"
             )
         if any(sample_name not in i for i in sample_read_files):
             raise ValueError(
                 f"Some files from the list '{sample_read_files}' do not contain {sample_name} parsed by the regex '{regex}'"
             )
         if sample_name in arr.lines.keys():
             print(f"Duplicate sample data key produced by the regex: '{sample_name}'")
             c = 0
             sample_name_ = str(sample_name)
             while sample_name in arr.lines.keys():
                 c += 1
                 sample_name = "{}.{}".format(sample_name_, c)
         arr.lines[sample_name] = SampleDataLine(sample_name,
                                                 sample_read_files)
     return arr
 def _validate_reads(self):
     c = 0
     for read_file in self.reads:
         if not Utilities.is_file_valid(read_file, False):
             print("Not found the raw read file: '{}'".format(read_file))
             c += 1
     self.is_valid = c == 0
Example #12
 def set_groupdata_dict(self, groupdata_file: str):
     self.groupdata_file = groupdata_file
     self.groupdata_digest_name = Utilities.filename_only(self.groupdata_file).replace(".groupdata", "")
     groupdata_df = pd.read_table(self.groupdata_file, sep="\t", header=None, names=["sample_name", "group_name"])
     self.groupdata_dict = {i: sorted(set(
         groupdata_df.loc[groupdata_df["group_name"] == i, "sample_name"])) for i in sorted(
         set(groupdata_df["group_name"]))}
     self.raw_all_sample_names_list = sorted(set(groupdata_df["sample_name"]))
 def __init__(self, fastas_string):
     self._fastas_string = fastas_string
     self._raw_fastas_list = [
         ">{}".format(j) if not j.startswith(">") else j
         for j in [i.strip() for i in re.split("\n>", self._fastas_string)]
     ]
     self._parsed_fastas_list = Utilities.remove_empty_values(
         [FASTA(i) for i in self._raw_fastas_list])
Example #14
 def set_raw_pvals_dir(self, output_dir: str):
     self.output_dir = Utilities.ends_with_slash(output_dir)
     self.raw_pvals_dir = "{OUTPUT_DIR}pvals/{VALUE_COLUMN}/{GROUPS}/".format(OUTPUT_DIR=self.output_dir,
                                                                              VALUE_COLUMN=self.pivot_value_col_name_abbreviation,
                                                                              GROUPS=self.groupdata_digest_name)
     self.digest_dir = "{OUTPUT_DIR}digest/{VALUE_COLUMN}/{GROUPS}/".format(OUTPUT_DIR=self.output_dir,
                                                                            VALUE_COLUMN=self.pivot_value_col_name_abbreviation,
                                                                            GROUPS=self.groupdata_digest_name)
def process_blast_report(high_scoring_pairs: list):
    first_report = high_scoring_pairs[0]
    reference_header = first_report.get("title")
    accession_id = Utilities.safe_findall(r"\|* *gi\| *([^|]+) *\|",
                                          reference_header)
    return dict(assembly_file=first_report.get("assembly_file"),
                reference_header=reference_header,
                accession_id=accession_id)
Example #16
 def get_genera_dict(input_list: list):
     return {
         j: ()
         for j in sorted([
             Utilities.safe_findall("([A-Z][a-z]{4,})", i).strip()
             for i in set(input_list) if isinstance(i, str)
         ]) if len(j) > 0
     }
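For illustration, a self-contained equivalent with a stand-in for Utilities.safe_findall (assumed here to return the first match or an empty string):

import re

def safe_findall(pattern, s):  # hypothetical stand-in for Utilities.safe_findall
    hits = re.findall(pattern, s)
    return hits[0] if hits else ""

taxa = ["Escherichia coli O157:H7", "unclassified", float("nan")]
print({j: () for j in sorted(
    safe_findall(r"([A-Z][a-z]{4,})", i).strip()
    for i in set(taxa) if isinstance(i, str)) if len(j) > 0})
# {'Escherichia': ()}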
def mp_get_and_blast_largest_contig(assembly_file: str):
    if os.path.getsize(assembly_file) == 0:
        print("Cannot process the empty file: '{}'".format(assembly_file))
        return
    with open(assembly_file) as f:
        contig_records = sorted(SeqIO.parse(f, "fasta"),
                                key=len,
                                reverse=True)
    largest_contig = randomize_gene_slice(contig_records[0]).format("fasta")
    # Delay to avoid an NCBI request ban
    randomize_sleep()
    # NCBI query
    result_handle = attempt_func(NCBIWWW.qblast,
                                 ("blastn", "nt", largest_contig))
    blast_record = NCBIXML.read(result_handle)
    # Based on: https://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc95
    _E_VALUE_THRESH = 0.04
    _QUERY_REPORT_SYMBOLS = 75
    high_scoring_pairs = []
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < _E_VALUE_THRESH:
                high_scoring_pairs.append(
                    dict(title=alignment.title,
                         length=alignment.length,
                         expect=hsp.expect,
                         score=hsp.score,
                         bits=hsp.bits,
                         identities=hsp.identities,
                         positives=hsp.positives,
                         assembly_file=assembly_file,
                         query="...\n".join([
                             hsp.query[:_QUERY_REPORT_SYMBOLS],
                             hsp.match[:_QUERY_REPORT_SYMBOLS],
                             hsp.sbjct[:_QUERY_REPORT_SYMBOLS], ""
                         ])))
    high_scoring_pairs = sorted(high_scoring_pairs,
                                key=lambda x: x.get("score"),
                                reverse=True)
    # Export BLAST results
    Utilities.dump_string(
        json.dumps(high_scoring_pairs, sort_keys=True, indent=4),
        "{}.BLAST.json".format(os.path.splitext(assembly_file)[0]))
    return high_scoring_pairs
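A possible driver combining the two functions above, assuming an "assemblies" list shaped like run_spades results; the plain single-process loop is deliberate, since parallel qblast calls invite the NCBI ban the code already guards against:

blast_reports = [
    mp_get_and_blast_largest_contig(d["assembly"]) for d in assemblies
]
best_hits = [process_blast_report(i) for i in blast_reports if i]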
Example #18
 def join_by_value_columns(tables: list, index_col_name: str,
                           value_col_name_: str):
     dfs_list = [
         Utilities.load_tsv(i).set_index(index_col_name)[value_col_name_].rename(i)
         for i in tables
     ]
     out = pd.concat(dfs_list, axis=1, sort=False).sort_index()
     out.index.names = [index_col_name]
     return out
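Usage sketch, assuming join_by_value_columns is exposed as a static method; the file paths and the value column name below are hypothetical:

pivot_df = join_by_value_columns(
    tables=["/data/coverage/sample_01_coverage.tsv",
            "/data/coverage/sample_02_coverage.tsv"],
    index_col_name="reference_id",
    value_col_name_="mapped_reads")
# One column per input table, named after the source file, indexed by "reference_id"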
Example #19
 def join_and_annotate(self):
     annotation_df = Utilities.load_tsv(
         self.annotation_file).set_index(REFERENCE_COL_NAME)
     values_df = self.join_by_value_columns(self.coverage_files,
                                            REFERENCE_COL_NAME,
                                            self.value_col_name)
     out = pd.concat([annotation_df, values_df], axis=1, sort=False)
     out.index.names = [REFERENCE_COL_NAME]
     return out
def dump_index_guide(input_nucleotide_fasta: str, output_dir: str):
    if not Utilities.is_file_valid(input_nucleotide_fasta):
        raise ValueError(f"Invalid file: '{input_nucleotide_fasta}'")
    cmd_0 = f"""
    export IMG=ivasilyev/bwt_filtering_pipeline_worker:latest && \
    docker pull "$IMG" && \
    docker run --rm -v /data:/data -v /data1:/data1 -v /data2:/data2 -it "$IMG" \
        bash -c '
            cd "{output_dir}";
            python3 "$HOME/scripts/cook_the_reference.py" \
                --input "{input_nucleotide_fasta}" \
                --output "{output_dir}";
        '
    """
    cmd = Utilities.join_lines(cmd_0)
    out_file = os.path.join(output_dir, "index.sh")
    Utilities.dump_string(cmd, out_file)
    print(f"For indexing, run outside of Docker: 'bash \"{out_file}\"'")
Example #21
 def query2fasta(self):
     output_dict = {"FASTAs_list": [], "annotations_series_list": []}
     for _soup in self._row_soups_list:
         _row_dict = self.parse_table_row(_soup)
         _locations_list = _row_dict["Sequence Location"].split("..")
         # Filtering expression
         if len(_locations_list) == 2 and any(
                 self._gene.lower() in _row_dict[k].lower()
                 for k in ("Name", "Description", "Aliases")):
             fasta = self._get_fasta(_row_dict["Sequence ID"], _locations_list)
             output_dict["FASTAs_list"].append(fasta)
             output_dict["annotations_series_list"].append(pd.Series(
                 Utilities.dict2pd_series(_row_dict), name=fasta.header))
     return output_dict
Example #22
 def __init__(self, reference_describer_instance, value_col_name):
     self.describer = reference_describer_instance
     self.value_col_name = value_col_name
     self.coverage_files = [
         i
         for i in Utilities.scan_whole_dir(projectDescriber.MAPPED_DATA_DIR)
         if all(j in i for j in [self.describer.ALIAS, "coverage.tsv"])
     ]
     self.annotation_file = self.describer.get_refdata_dict().get(
         "sequence_1").annotation_file
     self.raw_annotated_pivot = self.join_and_annotate()
 def retrieve(self):
     if os.path.exists(self.reference_dir):
         print("Warning! The reference path exists: '{}'".format(
             self.reference_dir))
     os.makedirs(self.reference_dir, exist_ok=True)
     chromosomes_dir = os.path.join(self.reference_dir, "chromosomes")
     os.makedirs(chromosomes_dir, exist_ok=True)
     # UCSC returns HTTP 530 when downloads are attempted in multiple threads
     compressed_chromosomes = Utilities.single_core_queue(
         self._dl_wrapper, [{
             "chromosome": i,
             "chromosomes_dir": chromosomes_dir
         } for i in self.CHROMOSOMES])
     # Process sequence
     self.parsed_records = Utilities.flatten_2d_array(
         Utilities.single_core_queue(self._parse_gzip_fna,
                                     compressed_chromosomes))
     self.nfasta_file = os.path.join(self.reference_dir, "hg19.fasta")
     SeqIO.write(self.parsed_records, self.nfasta_file, "fasta")
     # Process annotation
     self.index_dir = self.describer.get_index_guide(self.nfasta_file)
def process_genbank_report(d: dict):
    genbank_records = d.get("genbank_records")
    genbank_record = genbank_records[0]
    cds_number = len([i for i in genbank_record.features if i.type == "CDS"])
    qualifiers_dict = [
        i.qualifiers for i in genbank_record.features if i.type == "source"
    ][0]
    organism = Utilities.remove_empty_values(
        qualifiers_dict.get("organism")[0].split(" "))[:2]
    strain = " ".join(organism + [qualifiers_dict.get("strain")[0]])
    taxonomy_id = Utilities.safe_findall(r"\d+", [
        i for i in qualifiers_dict.get("db_xref")
        if i.split(":")[0].strip() == "taxon"
    ][0])
    return dict(assembly_file=d.get("assembly_file"),
                strain=strain,
                taxonomy_id=taxonomy_id,
                reference_accession_id=genbank_record.id,
                cds_number=cds_number,
                reference_bp=len(genbank_record),
                reference_description=genbank_record.description)
Example #25
 def __init__(self, version: str):
     _DL_PAGE_URL = "http://202.120.12.135/TADB2/download.html"
     self.describer = ReferenceDescriber()
     self.describer.VERSION = version
     self.describer.update_alias()
     self.reference_dir = os.path.join("/data/reference", self.describer.NAME, self.describer.ALIAS)
     links = [i for i in Utilities.scrap_links_from_web_page(_DL_PAGE_URL) if i.endswith(".fas")]
     self._fasta_types = ["nucleotide", "protein"]
     self._links_dict = {k: [i for i in links if i.split("/")[-2] == k] for k in self._fasta_types}
     assert len(self._links_dict["nucleotide"]) + len(self._links_dict["protein"]) == len(links)
     self.nfasta = os.path.join(self.reference_dir, "{}.fasta".format(self.describer.ALIAS))
     self.pfasta = os.path.join(self.reference_dir, "{}_protein.fasta".format(self.describer.ALIAS))
     self.index_dir = ""
Example #26
 def annotate(self):
     # Process nucleotide FASTA
     self._raw_nfasta_df = pd.read_table(self.annotation_file, sep="\t", header=0)
     raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
     processed_nfasta_headers = [Utilities.dict2pd_series(i) for i in
                                 Utilities.multi_core_queue(self._mp_parse_nfasta_header, raw_nfasta_headers)]
     self._processed_nfasta_df = Utilities.merge_pd_series_list(processed_nfasta_headers).sort_values("former_id")
     self.nfasta_df = Utilities.left_merge(self._raw_nfasta_df, self._processed_nfasta_df, "former_id")
     # Process protein FASTA
     with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as f:
         raw_pfasta_headers = sorted(set(
             j for j in (re.sub("^>", "", i).strip()
                         for i in f if i.startswith(">")) if len(j) > 0))
     processed_pfasta_headers = [Utilities.dict2pd_series(i) for i in
                                 Utilities.multi_core_queue(self._mp_parse_nfasta_header, raw_pfasta_headers)]
     self.pfasta_df = Utilities.merge_pd_series_list(processed_pfasta_headers).sort_values("former_id")
     self.pfasta_df.rename(columns={"geninfo_id": "protein_geninfo_id", "refseq_id": "genpept_id",
                                    "description": "protein_description", "host": "protein_host"}, inplace=True)
     self.merged_df = Utilities.left_merge(self.nfasta_df, self.pfasta_df, "tadb_id", "category", "gene_symbol")
     self.merged_df = Utilities.combine_duplicate_rows(self.merged_df, "reference_id")
Example #27
 def _mp_parse_nfasta_header(header):
     output_dict = dict(former_id=header)
     output_dict["genbank_id"] = Utilities.safe_findall(
         "^gb\|([^|]+)", header)
     output_dict["is_antisense_strand"] = header.split("|")[2].startswith(
         "-")
     output_dict["locus"] = Utilities.safe_findall("\|(\d+\-\d+)", header)
     output_dict["aro_id"] = Utilities.safe_findall("\|ARO:(\d+)", header)
     gene_chunk = header.split("|")[-1]
     output_dict["host"] = Utilities.safe_findall("\[(.+)\]", gene_chunk)
     output_dict["gene_description"] = gene_chunk.replace(
         "[{}]".format(output_dict["host"]), "").strip()
     _MIN_GENE_SYMBOL_LENGTH = 3
     _NON_GENE_SYMBOL_WORDS = ("DNA", "RNA")
     output_dict["gene_symbol"] = min([
         j for j in [
             i.strip()
             for i in output_dict.get("gene_description").split(" ")
         ] if len(j) >= _MIN_GENE_SYMBOL_LENGTH
         and j not in _NON_GENE_SYMBOL_WORDS
     ], key=len)
     return Utilities.dict2pd_series(output_dict)
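A worked example on a hypothetical header in the gb|accession|strand|locus|ARO:id|description [host] layout this parser expects (the accession and ARO number are invented):

header = "gb|JQ394987|+|0-657|ARO:3002999|blaSHV-52 [Klebsiella pneumoniae]"
# Expected fields, per the regexes above:
# genbank_id="JQ394987", is_antisense_strand=False, locus="0-657",
# aro_id="3002999", host="Klebsiella pneumoniae",
# gene_description="blaSHV-52", gene_symbol="blaSHV-52"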
 def _mp_parse_nfasta_header(header: str):
     _VFDB_REGEXES = (
         ("vfdb_id", r"^VFG(\d+)", "VFG{}"),
         ("gene_accession_id", r"\(([^\(]+)\) ", "({}) "),
         ("gene_symbol", r"^\(([^\(]+)\) ", "({}) "),
         ("gene_host", r"\[([^\]]+)\]$", "[{}]"),
         ("gene_name", r" \[([^\]]+)\] $", " [{}] "),
         ("gene_description", ".*", "{}"),
     )
     out = {"former_id": header}
     # Spaces are important here
     for _tuple in _VFDB_REGEXES:
         key, regex, replacement = _tuple
         out[key] = Utilities.safe_findall(regex, header)
         if len(out.get(key)) > 0:
             header = header.replace(replacement.format(out.get(key)), "")
     return {k: out.get(k).strip() for k in out}
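And a hypothetical VFDB-style header traced through the ordered, destructive matching above (each matched fragment is cut out of the header before the next pattern runs); the IDs are invented, and Utilities.safe_findall is assumed to return the first match:

header = ("VFG000077(gb|WP_000733882) (plc) phospholipase C "
          "[Phospholipase C (VF0131)] [Listeria monocytogenes]")
# vfdb_id="000077", gene_accession_id="gb|WP_000733882", gene_symbol="plc",
# gene_host="Listeria monocytogenes", gene_name="Phospholipase C (VF0131)",
# gene_description="phospholipase C"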
 def annotate(self):
     self._raw_nfasta_df = Utilities.load_tsv(self.annotation_file)
     raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
     processed_nfasta_headers = [
         Utilities.dict2pd_series(i) for i in Utilities.multi_core_queue(
             self._mp_parse_nfasta_header, raw_nfasta_headers)
     ]
     self._processed_nfasta_df = Utilities.merge_pd_series_list(
         processed_nfasta_headers).sort_values("former_id")
     # Zero-fill width: the length of the longest vfdb_id
     zf_len = max(len(i) for i in self._processed_nfasta_df["vfdb_id"].values.tolist())
     # Join table assembled from pFASTA headers
     raw_pfasta_headers = []
     with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as _f:
         for _line in _f:
             if _line.startswith(">"):
                 raw_pfasta_headers.append(re.sub("^>", "", _line).strip())
     raw_pfasta_headers = sorted(
         set([i for i in raw_pfasta_headers if len(i) > 0]))
     processed_pfasta_headers = [
         Utilities.dict2pd_series(i) for i in Utilities.multi_core_queue(
             self._mp_parse_pfasta_header, raw_pfasta_headers)
     ]
     self._processed_pfasta_df = Utilities.merge_pd_series_list(
         processed_pfasta_headers).sort_values("protein_header")
     self._processed_pfasta_df["vfdb_id"] = self._processed_pfasta_df[
         "vfdb_id"].str.zfill(zf_len)
     # Join the provided table. Note: the table file must sit in the same directory as the merged protein FASTA file
     vfs_table_file = os.path.join(os.path.dirname(self._raw_pfasta_file),
                                   "VFs.xls")
     vfs_df = pd.read_excel(vfs_table_file, sheet_name="VFs",
                            header=1).fillna("")
     vfs_df["vfdb_id"] = vfs_df["VFID"].str.extract("VF(\d+)")[0].str.zfill(
         zf_len)
     self.merged_df = pd.concat([
         i.set_index("vfdb_id").sort_index() for i in
         [self._processed_nfasta_df, self._processed_pfasta_df, vfs_df]
     ], axis=1, sort=False).sort_index()
     self.merged_df.index.names = ["vfdb_id"]
     self.merged_df = self.merged_df.loc[
         self.merged_df["former_id"].str.len() > 0].reset_index()
     self.merged_df = Utilities.left_merge(self._raw_nfasta_df,
                                           self.merged_df, "former_id")
Example #30
def define_species(_sample_name: str):
    _SPECIES = {
        "Bacillus subtilis BZR 336g": 336,
        "Bacillus subtilis BZR 517": 517,
        "Lactobacillus salivarius": 1,
        "Lactobacillus curvatus": 2,
        "Lactobacillus heilongjiangensis": 8
    }
    first_digits = Utilities.safe_findall(r"^\d+", _sample_name)
    if len(first_digits) > 0:
        first_digits = int(first_digits)
        for k in _SPECIES:
            if first_digits == _SPECIES.get(k):
                return k
    print("Cannot define species: '{}'".format(_sample_name))
    return "_"