def annotate(self):
    """
    Build the annotation table for reference nucleotide FASTA entries.

    Parses the nucleotide FASTA headers in parallel, then successively
    left-joins the reference metadata tables ('aro_index.tsv',
    'aro_categories_index.tsv', 'aro.tsv') found under ``self.reference_dir``
    (the file names match the CARD database layout -- TODO confirm).
    The merged result is stored in ``self.nfasta_df``; intermediate frames
    are kept in ``self._raw_nfasta_df`` / ``self._processed_nfasta_df``.
    """
    self.annotation_file = self.describer.get_refdata_dict().get("sequence_1").annotation_file
    # pd.read_table is deprecated; read_csv(sep="\t") is the documented equivalent
    self._raw_nfasta_df = pd.read_csv(self.annotation_file, sep="\t", header=0)
    # Header parsing is CPU-bound, hence the multiprocessing queue
    mp_result = Utilities.multi_core_queue(
        self._mp_parse_nfasta_header,
        self._raw_nfasta_df["former_id"].values.tolist())
    self._processed_nfasta_df = Utilities.merge_pd_series_list(mp_result).sort_values("former_id")
    self.nfasta_df = Utilities.left_merge(self._raw_nfasta_df, self._processed_nfasta_df, "former_id")
    # Join 'aro_index.tsv'
    aro_index_df = pd.read_csv(os.path.join(self.reference_dir, "data", "aro_index.tsv"),
                               sep="\t", header=0)
    # Raw string fixes the invalid "\d" escape warning; expand=False makes
    # extract() return a Series (not a 1-column DataFrame), keeping the
    # column assignment valid on modern pandas
    aro_index_df["aro_id"] = aro_index_df["ARO Accession"].str.extract(r"ARO:(\d+)", expand=False)
    # 'aro_index.tsv' has more entries than 'nucleotide_fasta_protein_homolog_model.fasta' provides
    self.nfasta_df = Utilities.left_merge(self.nfasta_df, aro_index_df, "aro_id")
    # Join 'aro_categories_index.tsv'
    aro_categories_index_df = pd.read_csv(
        os.path.join(self.reference_dir, "data", "aro_categories_index.tsv"),
        sep="\t", header=0)
    self.nfasta_df = Utilities.left_merge(self.nfasta_df, aro_categories_index_df,
                                          "Protein Accession")
    # Joining 'aro_categories.tsv' is useless: the resulting 'ARO Category' is filled by NaN
    # Join 'aro.tsv'
    aro_df = pd.read_csv(os.path.join(self.reference_dir, "ontology", "aro.tsv"),
                         sep="\t", header=0)
    # Align the ontology table's column names with the previously joined tables
    aro_df.rename(columns={"Accession": "ARO Accession", "Name": "ARO Name"}, inplace=True)
    self.nfasta_df = Utilities.left_merge(self.nfasta_df, aro_df, "ARO Accession")
    # Collapse rows duplicated by the successive left joins
    self.nfasta_df = Utilities.combine_duplicate_rows(self.nfasta_df, "reference_id")
def annotate(self):
    """
    Build ``self.merged_df`` by combining nucleotide and protein FASTA headers.

    Both header sets are parsed in parallel; the protein-derived columns are
    renamed to avoid collisions, then left-merged onto the nucleotide table
    and de-duplicated by 'reference_id'. The 'tadb_id' merge key suggests
    TADB reference data -- TODO confirm.
    """
    # Process nucleotide FASTA
    self._raw_nfasta_df = pd.read_csv(self.annotation_file, sep="\t", header=0)
    raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
    processed_nfasta_headers = [
        Utilities.dict2pd_series(i)
        for i in Utilities.multi_core_queue(self._mp_parse_nfasta_header, raw_nfasta_headers)]
    self._processed_nfasta_df = Utilities.merge_pd_series_list(
        processed_nfasta_headers).sort_values("former_id")
    self.nfasta_df = Utilities.left_merge(self._raw_nfasta_df, self._processed_nfasta_df,
                                          "former_id")
    # Process protein FASTA.
    # The original leaked the file handle by calling open() inside a
    # comprehension; a context manager guarantees it is closed.
    with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as handle:
        raw_pfasta_headers = [re.sub(r"^>", "", line).strip()
                              for line in handle if line.startswith(">")]
    raw_pfasta_headers = sorted({i for i in raw_pfasta_headers if len(i) > 0})
    # NOTE(review): protein headers are parsed with the *nucleotide* header
    # parser (self._mp_parse_nfasta_header); this matches the original code,
    # but confirm a dedicated pFASTA parser is not required here
    processed_pfasta_headers = [
        Utilities.dict2pd_series(i)
        for i in Utilities.multi_core_queue(self._mp_parse_nfasta_header, raw_pfasta_headers)]
    self.pfasta_df = Utilities.merge_pd_series_list(
        processed_pfasta_headers).sort_values("former_id")
    # Prefix/rename protein-derived columns so they don't clash with the
    # same-named nucleotide columns after the merge
    self.pfasta_df.rename(columns={"geninfo_id": "protein_geninfo_id",
                                   "refseq_id": "genpept_id",
                                   "description": "protein_description",
                                   "host": "protein_host"}, inplace=True)
    self.merged_df = Utilities.left_merge(self.nfasta_df, self.pfasta_df,
                                          "tadb_id", "category", "gene_symbol")
    self.merged_df = Utilities.combine_duplicate_rows(self.merged_df, "reference_id")
def annotate(self):
    """
    Build ``self.merged_df`` from nucleotide/protein FASTA headers and 'VFs.xls'.

    Parses both header sets in parallel, zero-pads 'vfdb_id' to a common
    width, concatenates the three sources on that id, drops rows without a
    'former_id', and finally left-merges back onto the raw nucleotide table.
    The 'VFID'/'vfdb_id' naming suggests VFDB reference data -- TODO confirm.
    """
    self._raw_nfasta_df = Utilities.load_tsv(self.annotation_file)
    raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
    processed_nfasta_headers = [
        Utilities.dict2pd_series(i)
        for i in Utilities.multi_core_queue(self._mp_parse_nfasta_header, raw_nfasta_headers)]
    self._processed_nfasta_df = Utilities.merge_pd_series_list(
        processed_nfasta_headers).sort_values("former_id")
    # Zero-fill width must be the length of the LONGEST id.
    # The original `len(max(ids))` took the *lexicographically* greatest id
    # (e.g. "9" > "10"), which is not necessarily the longest string.
    zf_len = max(len(i) for i in self._processed_nfasta_df["vfdb_id"].values.tolist())
    # Join table assembled from pFASTA headers.
    # The context manager closes the file; the original's explicit
    # `_f.close()` inside the `with` block was redundant.
    with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as _f:
        raw_pfasta_headers = [re.sub(r"^>", "", _line).strip()
                              for _line in _f if _line.startswith(">")]
    raw_pfasta_headers = sorted({i for i in raw_pfasta_headers if len(i) > 0})
    processed_pfasta_headers = [
        Utilities.dict2pd_series(i)
        for i in Utilities.multi_core_queue(self._mp_parse_pfasta_header, raw_pfasta_headers)]
    self._processed_pfasta_df = Utilities.merge_pd_series_list(
        processed_pfasta_headers).sort_values("protein_header")
    self._processed_pfasta_df["vfdb_id"] = self._processed_pfasta_df["vfdb_id"].str.zfill(zf_len)
    # Join provided table. Note the table file placed into the same dir with the merged protein FASTA file
    vfs_table_file = os.path.join(os.path.dirname(self._raw_pfasta_file), "VFs.xls")
    vfs_df = pd.read_excel(vfs_table_file, sheet_name="VFs", header=1).fillna("")
    # Raw string avoids the invalid "\d" escape warning
    vfs_df["vfdb_id"] = vfs_df["VFID"].str.extract(r"VF(\d+)")[0].str.zfill(zf_len)
    # Outer concat on the shared 'vfdb_id' index combines all three sources
    self.merged_df = pd.concat(
        [i.set_index("vfdb_id").sort_index()
         for i in [self._processed_nfasta_df, self._processed_pfasta_df, vfs_df]],
        axis=1, sort=False).sort_index()
    self.merged_df.index.names = ["vfdb_id"]
    # Keep only rows that actually came from the nucleotide FASTA side
    self.merged_df = self.merged_df.loc[self.merged_df["former_id"].str.len() > 0].reset_index()
    self.merged_df = Utilities.left_merge(self._raw_nfasta_df, self.merged_df, "former_id")
# Script-level digest/visualization fragment.
# NOTE(review): this span is truncated in the current view -- the inner
# per-sample loop ends right after `fig, ax = plt.subplots()`, so the
# plotting body continues outside what is visible here.
handler = InterpretationHandler(referenceDescriber, value_col_name)
# Derive sample names from paths; the pattern captures an uppercase-initial
# token between a slash and an underscore -- presumably "/<SampleName>_...",
# TODO confirm against the actual file layout
handler.update_sample_names("\/([A-Z][^\/_]+)_")
for col_name_with_keywords in KEYWORDS_ASSOCIATIVE_PAIRS:
    # Narrow the pivot to the keyword column plus the per-sample value columns
    df_to_digest = handler.raw_annotated_pivot.loc[:, [col_name_with_keywords] + handler.sample_names]
    associations = KEYWORDS_ASSOCIATIVE_PAIRS.get(col_name_with_keywords)
    # The host column gets its keyword dictionary generated from the data
    # itself rather than from the static associative pairs
    if col_name_with_keywords == "gene_host":
        associations = digestAssociationsKeeper.generate_genera_dict(
            df_to_digest[col_name_with_keywords].values.tolist())
    digest_df, raw_ds = digestAssociationsKeeper.digest_df(
        df_to_digest, associations=associations,
        columns_with_keywords=[col_name_with_keywords])
    # Attach raw labels back onto the digested dataset via the reference key
    raw_ds = Utilities.left_merge(
        raw_ds, handler.raw_annotated_pivot[RAW_LABEL_COL_NAME].reset_index(),
        REFERENCE_COL_NAME)
    # Keep only the shortest whitespace-separated token of each raw label
    raw_ds[RAW_LABEL_COL_NAME] = raw_ds[RAW_LABEL_COL_NAME].apply(
        lambda x: min((j for j in str(x).strip().split(" ") if j), key=len))
    for sample_name in digest_df.columns:
        # Top (INNER_DONUT_GROUPS - 1) rows plus an aggregated "Other" row
        major_digest_df = Utilities.get_n_majors_from_df(
            digest_df, sample_name, n=INNER_DONUT_GROUPS - 1)
        sample_export_mask = InterpretationHandler.create_mirrored_path(
            [projectDescriber.DATA_DIGEST_DIR, value_col_name,
             col_name_with_keywords, sample_name],
            makedirs=True)
        # Create visualization
        fig, ax = plt.subplots()
def process(self):
    """
    Digest the annotated pivot per keyword column and render per-sample
    double-donut charts.

    For every keyword column in ``KEYWORDS_ASSOCIATIVE_PAIRS``: digest the
    pivot into keyword groups, dump the digest and raw tables as TSV, then
    for each sample draw a two-ring pie chart (inner ring = digest groups,
    outer ring = major raw entries shaded per group) and export its tables
    and a PNG under ``projectDescriber.DATA_DIGEST_DIR``.
    """
    value_col_name_raw_pivot_annotated_mask = self.create_mirrored_path(
        [projectDescriber.DATA_DIGEST_DIR, self.value_col_name], makedirs=True)
    Utilities.dump_tsv(
        self.raw_annotated_pivot.reset_index(),
        "{}_raw_annotated_pivot.tsv".format(value_col_name_raw_pivot_annotated_mask))
    for col_name_with_keywords in KEYWORDS_ASSOCIATIVE_PAIRS:
        # Keyword column plus the per-sample value columns only
        df_to_digest = self.raw_annotated_pivot.loc[:, [col_name_with_keywords] + self.sample_names]
        associations = KEYWORDS_ASSOCIATIVE_PAIRS.get(col_name_with_keywords)
        # The host column's keyword dictionary is generated from the data itself
        if col_name_with_keywords == HOST_COL_NAME:
            associations = digestAssociationsKeeper.generate_genera_dict(
                df_to_digest[col_name_with_keywords].values.tolist())
        digest_df, raw_ds = digestAssociationsKeeper.digest_df(
            df_to_digest, associations=associations,
            columns_with_keywords=[col_name_with_keywords])
        # Re-attach raw labels by the reference key; blank out missing ones
        raw_ds = Utilities.left_merge(
            raw_ds, self.raw_annotated_pivot[RAW_LABEL_COL_NAME].reset_index(),
            REFERENCE_COL_NAME).fillna("")
        # Keep only the shortest whitespace-separated token of each raw label
        raw_ds[RAW_LABEL_COL_NAME] = raw_ds[RAW_LABEL_COL_NAME].apply(
            lambda x: min(Utilities.remove_empty_values(
                [i for i in x.strip().split(" ")]), key=len))
        keyword_export_mask = self.create_mirrored_path(
            [projectDescriber.DATA_DIGEST_DIR, self.value_col_name, col_name_with_keywords],
            makedirs=True)
        Utilities.dump_tsv(digest_df.reset_index(), "{}_digest.tsv".format(keyword_export_mask))
        Utilities.dump_tsv(raw_ds, "{}_raw.tsv".format(keyword_export_mask))
        for sample_name in digest_df.columns:
            # Chart styling constants (inner/outer ring share wedge & label props)
            _BASE_FONT_SIZE = 15
            _WEDGE_WIDTH = 0.3
            _WEDGE_PROPERTIES = dict(width=_WEDGE_WIDTH, edgecolor="w")
            _LABEL_PROPERTIES = dict(fontsize=_BASE_FONT_SIZE, rotation_mode="anchor",
                                     verticalalignment="center",
                                     horizontalalignment="center")
            # Top (INNER_DONUT_GROUPS - 1) digest rows plus an "Other" aggregate
            major_digest_df = Utilities.get_n_majors_from_df(
                digest_df, sample_name, n=INNER_DONUT_GROUPS - 1)
            # Create visualization
            fig, ax = plt.subplots()
            plt.rcParams.update({"font.size": _BASE_FONT_SIZE, "figure.figsize": (20, 20)})
            ax.axis("equal")
            y_col_name = major_digest_df.columns[0]
            # Inner ring. Returning value: [[wedges...], [labels...], [values...]]
            pie_int = ax.pie(major_digest_df[sample_name], radius=1 - _WEDGE_WIDTH,
                             labels=major_digest_df.index,
                             labeldistance=1 - _WEDGE_WIDTH, rotatelabels=False,
                             autopct=self.make_autopct(major_digest_df[y_col_name]),
                             pctdistance=1 - _WEDGE_WIDTH / 2.0,
                             wedgeprops=_WEDGE_PROPERTIES, textprops=_LABEL_PROPERTIES)
            # Combine color values in 'RGBA' format into the one dictionary,
            # keyed by the inner-ring label text
            pie_int_colors = {pie_int[1][idx].get_text(): wedge.get_facecolor()
                              for idx, wedge in enumerate(pie_int[0])}
            # Manual sort the dataset with raw values prior to the order of digest keywords
            major_raw_ds = pd.DataFrame()
            for digest_keyword in major_digest_df.index:
                if digest_keyword == "Other":
                    # The aggregated "Other" inner wedge has no raw rows;
                    # reuse its digest row as a single outer wedge
                    major_raw_ds_append = pd.DataFrame(major_digest_df.loc["Other"]).transpose()
                    major_raw_ds_append.index.name = DIGEST_LABEL_COL_NAME
                    major_raw_ds_append = major_raw_ds_append.reset_index()
                else:
                    # Raw rows belonging to this digest keyword
                    major_raw_ds_append_right = raw_ds.loc[
                        raw_ds[DIGEST_LABEL_COL_NAME] == digest_keyword,
                        [REFERENCE_COL_NAME, sample_name, DIGEST_LABEL_COL_NAME,
                         RAW_LABEL_COL_NAME]]
                    # Top outer subgroups; their "Other" is renamed after the keyword
                    major_raw_ds_append_left = Utilities.get_n_majors_from_df(
                        major_raw_ds_append_right.set_index(REFERENCE_COL_NAME),
                        sample_name,
                        n=OUTER_DONUT_SUBGROUPS - 1).rename(index={
                            "Other": digest_keyword}).reset_index()
                    major_raw_ds_append = Utilities.left_merge(
                        major_raw_ds_append_left, major_raw_ds_append_right,
                        REFERENCE_COL_NAME)
                    major_raw_ds_append[RAW_LABEL_COL_NAME] = major_raw_ds_append[
                        RAW_LABEL_COL_NAME].fillna("{}_Other".format(digest_keyword))
                    major_raw_ds_append[DIGEST_LABEL_COL_NAME] = major_raw_ds_append[
                        DIGEST_LABEL_COL_NAME].fillna("Other")
                # Fade the inner wedge's color across its outer wedges by
                # decreasing alpha; colors are serialized as ";"-joined floats
                pie_ext_append_colors = []
                for row_number in major_raw_ds_append.index.values:
                    row_color = pie_int_colors.get(digest_keyword)
                    if not row_color:
                        continue
                    row_old_alpha = row_color[3]
                    _MINIMAL_ALPHA = 0.2
                    if major_raw_ds_append.shape[0] < 4:
                        # Few rows: fixed-step fade
                        row_new_alpha = row_old_alpha - (
                            row_old_alpha * row_number * _MINIMAL_ALPHA)
                    else:
                        # Many rows: linear fade from original alpha down to the minimum
                        row_new_alpha = row_old_alpha - (
                            (row_old_alpha - _MINIMAL_ALPHA) * row_number /
                            float(major_raw_ds_append.shape[0] - 1))
                    pie_ext_append_colors.append(";".join(
                        str(i) for i in list(row_color[:3]) + [row_new_alpha]))
                major_raw_ds_append["color"] = pie_ext_append_colors
                if major_raw_ds_append.shape[0] > 0:
                    if major_raw_ds.shape[0] == 0:
                        major_raw_ds = major_raw_ds_append
                    else:
                        major_raw_ds = pd.concat(
                            [major_raw_ds, major_raw_ds_append],
                            axis=0, ignore_index=True, sort=False)
            major_raw_ds = major_raw_ds.fillna("Other")
            # Outer ring: raw entries, colored via the per-group faded RGBA values
            pie_ext = ax.pie(
                major_raw_ds[sample_name], radius=1,
                labels=major_raw_ds[RAW_LABEL_COL_NAME],
                labeldistance=1 - _WEDGE_WIDTH / 2, rotatelabels=True,
                wedgeprops=_WEDGE_PROPERTIES, textprops=_LABEL_PROPERTIES,
                colors=major_raw_ds["color"].apply(lambda x: tuple(
                    float(i) for i in x.split(";"))).values.tolist())
            # Export visualization tables
            sample_export_mask = self.create_mirrored_path([
                projectDescriber.DATA_DIGEST_DIR, self.value_col_name,
                col_name_with_keywords, sample_name], makedirs=True)
            Utilities.dump_tsv(
                major_digest_df.reset_index(),
                "{}_inner_values.tsv".format(sample_export_mask))
            Utilities.dump_tsv(
                major_raw_ds, "{}_outer_values.tsv".format(sample_export_mask))
            # Set labels
            ax.set_xlabel(y_col_name)
            ax.set_ylabel(self.value_col_name)
            plt.tight_layout()
            # Export PNG
            pie_file = "{}_double_donut.png".format(sample_export_mask)
            fig.suptitle(pie_file, fontsize=_BASE_FONT_SIZE)
            plt.savefig(pie_file, dpi=300, bbox_inches="tight")
            plt.close("all")
            plt.clf()