def annotate(self):
    """Build ``self.nfasta_df`` from the annotation table and the ARO tables.

    Steps, in order:

    1. Read the annotation file of the ``"sequence_1"`` reference entry into
       ``self._raw_nfasta_df``.
    2. Parse every ``former_id`` value with ``self._mp_parse_nfasta_header``
       via ``Utilities.multi_core_queue`` and store the merged result,
       sorted by ``former_id``, in ``self._processed_nfasta_df``.
    3. Left-merge the raw and the processed tables on ``former_id``.
    4. Left-merge, one after another, ``data/aro_index.tsv`` (on ``aro_id``),
       ``data/aro_categories_index.tsv`` (on ``Protein Accession``) and
       ``ontology/aro.tsv`` (on ``ARO Accession``), all located under
       ``self.reference_dir``.
    5. Pass the result through ``Utilities.combine_duplicate_rows`` keyed on
       ``reference_id``.

    The method returns nothing; the results are stored on the instance.
    """
    self.annotation_file = self.describer.get_refdata_dict().get("sequence_1").annotation_file
    self._raw_nfasta_df = pd.read_table(self.annotation_file, sep="\t", header=0)

    former_ids = self._raw_nfasta_df["former_id"].values.tolist()
    mp_result = Utilities.multi_core_queue(self._mp_parse_nfasta_header, former_ids)
    self._processed_nfasta_df = Utilities.merge_pd_series_list(mp_result).sort_values("former_id")
    self.nfasta_df = Utilities.left_merge(self._raw_nfasta_df, self._processed_nfasta_df, "former_id")

    # Join 'aro_index.tsv'
    aro_index_df = pd.read_table(
        os.path.join(self.reference_dir, "data", "aro_index.tsv"), sep="\t", header=0)
    # Raw string: '\d' in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    aro_index_df["aro_id"] = aro_index_df["ARO Accession"].str.extract(r"ARO:(\d+)")
    # 'aro_index.tsv' has more entries than
    # 'nucleotide_fasta_protein_homolog_model.fasta' provides
    self.nfasta_df = Utilities.left_merge(self.nfasta_df, aro_index_df, "aro_id")

    # Join 'aro_categories_index.tsv'
    aro_categories_index_df = pd.read_table(
        os.path.join(self.reference_dir, "data", "aro_categories_index.tsv"), sep="\t", header=0)
    self.nfasta_df = Utilities.left_merge(self.nfasta_df, aro_categories_index_df, "Protein Accession")
    # Joining 'aro_categories.tsv' is useless: the resulting 'ARO Category'
    # is filled by NaN

    # Join 'aro.tsv'; its columns are renamed so they line up with the
    # 'ARO Accession' key already present in the merged table.
    aro_df = pd.read_table(
        os.path.join(self.reference_dir, "ontology", "aro.tsv"), sep="\t", header=0)
    aro_df.rename(columns={"Accession": "ARO Accession", "Name": "ARO Name"}, inplace=True)
    self.nfasta_df = Utilities.left_merge(self.nfasta_df, aro_df, "ARO Accession")

    self.nfasta_df = Utilities.combine_duplicate_rows(self.nfasta_df, "reference_id")
def annotate(self):
    """Build ``self.merged_df`` from the nucleotide and protein FASTA headers.

    Nucleotide side: reads ``self.annotation_file`` into
    ``self._raw_nfasta_df``, parses each ``former_id`` with
    ``self._mp_parse_nfasta_header`` (through ``Utilities.multi_core_queue``)
    and left-merges the parsed columns back on ``former_id`` into
    ``self.nfasta_df``.

    Protein side: collects the unique, non-empty header lines (those starting
    with ``>``) of ``self._raw_pfasta_file``, parses them with the same header
    parser, and renames the resulting columns so they do not clash with the
    nucleotide ones; the table is stored in ``self.pfasta_df``.

    Finally both tables are left-merged on ``tadb_id``, ``category`` and
    ``gene_symbol`` and passed through ``Utilities.combine_duplicate_rows``
    keyed on ``reference_id``.

    The method returns nothing; the results are stored on the instance.
    """
    # Process nucleotide FASTA
    self._raw_nfasta_df = pd.read_table(self.annotation_file, sep="\t", header=0)
    raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
    processed_nfasta_headers = [
        Utilities.dict2pd_series(parsed)
        for parsed in Utilities.multi_core_queue(self._mp_parse_nfasta_header, raw_nfasta_headers)
    ]
    self._processed_nfasta_df = Utilities.merge_pd_series_list(
        processed_nfasta_headers).sort_values("former_id")
    self.nfasta_df = Utilities.left_merge(self._raw_nfasta_df, self._processed_nfasta_df, "former_id")

    # Process protein FASTA
    # Read under a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as pfasta_handle:
        # Every kept line starts with '>', so dropping the first character
        # is the same as stripping the leading '>' marker.
        unique_headers = {
            line[1:].strip() for line in pfasta_handle if line.startswith(">")
        }
    raw_pfasta_headers = sorted(header for header in unique_headers if header)
    # The protein headers are parsed with the same parser as the nucleotide ones.
    processed_pfasta_headers = [
        Utilities.dict2pd_series(parsed)
        for parsed in Utilities.multi_core_queue(self._mp_parse_nfasta_header, raw_pfasta_headers)
    ]
    self.pfasta_df = Utilities.merge_pd_series_list(
        processed_pfasta_headers).sort_values("former_id")
    self.pfasta_df.rename(
        columns={
            "geninfo_id": "protein_geninfo_id",
            "refseq_id": "genpept_id",
            "description": "protein_description",
            "host": "protein_host",
        },
        inplace=True,
    )

    self.merged_df = Utilities.left_merge(
        self.nfasta_df, self.pfasta_df, "tadb_id", "category", "gene_symbol")
    self.merged_df = Utilities.combine_duplicate_rows(self.merged_df, "reference_id")