def _repertoire_to_dataframe(repertoire: Repertoire, region_type):
    """Load a repertoire into a DataFrame whose columns use AIRR field names.

    Every field returned by ``repertoire.load_data()`` (custom fields
    included) becomes a column. The allele/gene columns are created with
    ``None`` values when absent, handed to
    ``AIRRExporter.update_gene_columns`` and finally the mandatory columns
    are renamed for AIRR compliance.

    Args:
        repertoire: object whose ``load_data()`` result is fed to
            ``pd.DataFrame``.
        region_type: forwarded to ``AIRRExporter.get_sequence_field`` and
            ``AIRRExporter.get_sequence_aa_field`` to choose the target
            names of the ``sequences`` / ``sequence_aas`` columns.

    Returns:
        The renamed ``pd.DataFrame``.
    """
    # get all fields (including custom fields)
    df = pd.DataFrame(repertoire.load_data())

    # Make sure every gene/allele column exists before it is processed.
    required_columns = ('v_alleles', 'j_alleles', 'v_genes', 'j_genes')
    for name in required_columns:
        if name in df.columns:
            continue
        df.loc[:, name] = None

    AIRRExporter.update_gene_columns(df, 'alleles', 'genes')

    # The sequence column names depend on the exported region type.
    sequence_field = AIRRExporter.get_sequence_field(region_type)
    sequence_aa_field = AIRRExporter.get_sequence_aa_field(region_type)

    # rename mandatory fields for airr-compliance
    airr_names = {
        "sequence_identifiers": "sequence_id",
        "v_alleles": "v_call",
        "j_alleles": "j_call",
        "chains": "locus",
        "counts": "duplicate_count",
        "sequences": sequence_field,
        "sequence_aas": sequence_aa_field,
    }
    return df.rename(columns=airr_names)
def process_repertoire(repertoire: Repertoire, params: dict) -> Repertoire:
    """Build a new repertoire in which duplicate sequences are aggregated.

    The repertoire data is loaded into a DataFrame, grouped by the fields
    chosen by ``DuplicateSequenceFilter._prepare_group_by_field`` and
    aggregated with the mapping from
    ``DuplicateSequenceFilter._prepare_agg_dict``. The aggregated columns
    are then passed to ``Repertoire.build``.

    Args:
        repertoire: source repertoire; its ``load_data()`` output and
            ``metadata`` are read.
        params: forwarded to the two ``_prepare_*`` helpers; must also
            contain ``"result_path"``, which is used as the ``path`` of the
            new repertoire.

    Returns:
        The repertoire returned by ``Repertoire.build``.

    Raises:
        KeyError: if the aggregated data has no ``sequence_identifiers``
            column or ``params`` has no ``"result_path"`` entry.
    """
    data = pd.DataFrame(repertoire.load_data())

    # Grouping/aggregation settings are derived from the columns as loaded,
    # i.e. before the "chains" column is rewritten or added below.
    groupby_fields = DuplicateSequenceFilter._prepare_group_by_field(params, data.columns)
    custom_lists = list(set(data.columns) - set(Repertoire.FIELDS))
    agg_dict = DuplicateSequenceFilter._prepare_agg_dict(params, data.columns, custom_lists)

    # Chain objects can not be aggregated, convert to strings
    if "chains" not in data.columns:
        data["chains"] = None
    else:
        chain_values = []
        for entry in data["chains"]:
            chain_values.append(entry.value if isinstance(entry, Chain) else entry)
        data["chains"] = chain_values

    deduplicated = data.groupby(groupby_fields).agg(agg_dict).reset_index()

    def column_or_none(field):
        """Return the aggregated column as a list, or None when it is absent."""
        if field not in deduplicated.columns:
            return None
        return list(deduplicated[field])

    chain_keys = column_or_none("chains")

    build_kwargs = {
        "sequence_aas": column_or_none("sequence_aas"),
        "sequences": column_or_none("sequences"),
        "v_genes": column_or_none("v_genes"),
        "j_genes": column_or_none("j_genes"),
        # Turn the aggregated chain values back into Chain objects.
        "chains": None if chain_keys is None else [Chain(key) for key in chain_keys],
        "counts": column_or_none("counts"),
        "region_types": column_or_none("region_types"),
        "custom_lists": {name: list(deduplicated[name]) for name in custom_lists},
        # Accessed without a guard: this column is required.
        "sequence_identifiers": list(deduplicated["sequence_identifiers"]),
        "metadata": copy.deepcopy(repertoire.metadata),
        "path": params["result_path"],
    }
    return Repertoire.build(**build_kwargs)