def preprocess_dataframe(dataframe: pd.DataFrame, params: DatasetImportParams) -> pd.DataFrame:
    """
    Preprocess a dataframe of imported receptor sequences.

    The steps are applied in this order:

    - "frame_types" is upper-cased and only rows whose frame type is in the list
      produced by ImportHelper.prepare_frame_type_list(params) are kept
    - "region_types" is set to the name of params.region_type
    - if a "sequences" (nucleotide) column is present, each nucleotide string is cut down
      to the stretch that encodes the matching "sequence_aas" entry; for
      RegionType.IMGT_CDR3 the first and last residue/codon are dropped as well
    - the frame is passed through AdaptiveImportHelper.parse_adaptive_germline_to_imgt and
      ImportHelper.standardize_none_values, then through the ImportHelper routines for
      dropping empty and illegal-character sequences
    - "chains" is (re)computed from the existing "chains" column, else from "v_subgroups",
      else from the gene columns

    Note: the "frame_types" column of the dataframe passed in is upper-cased in place.
    All later modifications are made on a filtered copy, so callers must use the return value.

    Args:
        dataframe: frame with at least "frame_types" and "sequence_aas" columns
        params: import parameters; region_type, organism and the import_* flags are read here

    Returns:
        the preprocessed dataframe
    """
    # Index in the nucleotide string at which the region encoded by "sequence_aas" ends.
    # NOTE(review): presumably tied to a fixed-length nucleotide read - confirm against the data format.
    region_end = 81

    def cut_nucleotides(trim: int) -> list:
        """Slice every nucleotide string to its amino-acid region, trimming `trim` bases from each side."""
        # NOTE(review): for amino-acid sequences longer than 27 residues the start index becomes
        # negative and Python counts it from the end of the string; left as in the original.
        return [
            None if pd.isna(aa) or pd.isna(nt)  # missing values (None or NaN) stay missing
            else nt[region_end + trim - 3 * len(aa): region_end - trim]
            for aa, nt in zip(dataframe["sequence_aas"], dataframe["sequences"])
        ]

    dataframe.loc[:, "frame_types"] = dataframe.frame_types.str.upper()

    allowed_frame_types = ImportHelper.prepare_frame_type_list(params)
    # explicit copy: the filtered frame is assigned to below, which would otherwise be a
    # chained assignment on a slice (SettingWithCopyWarning)
    dataframe = dataframe[dataframe["frame_types"].isin(allowed_frame_types)].copy()
    dataframe.loc[:, "region_types"] = params.region_type.name

    has_nucleotides = "sequences" in dataframe.columns

    if params.region_type == RegionType.IMGT_CDR3:
        if has_nucleotides:
            # must run before "sequence_aas" is trimmed: the slice is computed from the untrimmed length
            dataframe.loc[:, "sequences"] = cut_nucleotides(trim=3)
        # drop the first and the last amino acid
        dataframe.loc[:, "sequence_aas"] = dataframe["sequence_aas"].str[1:-1]
    elif has_nucleotides:
        dataframe.loc[:, "sequences"] = cut_nucleotides(trim=0)

    dataframe = AdaptiveImportHelper.parse_adaptive_germline_to_imgt(dataframe, params.organism)
    dataframe = ImportHelper.standardize_none_values(dataframe)
    ImportHelper.drop_empty_sequences(dataframe, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
    ImportHelper.drop_illegal_character_sequences(dataframe, params.import_illegal_characters)

    if "chains" in dataframe.columns:
        chains = ImportHelper.load_chains(dataframe)
    elif "v_subgroups" in dataframe.columns:
        # loading from v_subgroups is preferred as sometimes v_genes is None when v_subgroups is defined
        chains = ImportHelper.load_chains_from_column(dataframe, "v_subgroups")
    else:
        chains = ImportHelper.load_chains_from_genes(dataframe)
    dataframe.loc[:, "chains"] = chains

    return dataframe
def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams) -> pd.DataFrame:
    """
    Preprocess a dataframe containing AIRR data.

    The steps are applied in this order:

    - a "frame_types" column is derived: with a "productive" column every row starts as OUT and
      productive rows become IN, otherwise every row starts as None; rows with "vj_in_frame" set
      become IN and rows with "stop_codon" set become STOP (stop codon takes precedence)
    - if a "productive" column is present, rows are filtered to the frame types returned by
      ImportHelper.prepare_frame_type_list(params)
    - if the region type is IMGT_CDR3 and neither "sequences" nor "sequence_aas" is present, the
      "cdr3" / "cdr3_aa" columns are renamed to "sequences" / "sequence_aas" and "region_types" is
      set; otherwise, if junction columns were mapped, ImportHelper.junction_to_cdr3 converts them
      (per the original documentation: the leading C and trailing W are removed)
    - if no "chains" column is present, it is filled by ImportHelper.load_chains_from_genes
    - ImportHelper.update_gene_info is applied (per the original documentation: allele information
      is removed from the V and J genes)
    - empty and illegal-character sequences are dropped according to params

    Note: the "frame_types" column is written into the dataframe passed in. When no "productive"
    column is present, no copy is made and the later steps also modify that object in place.
    Callers must use the return value.

    Args:
        df: frame with AIRR columns, already renamed according to the column mapping
        params: import parameters; region_type, column_mapping and the import_* flags are read here

    Returns:
        the preprocessed dataframe
    """
    has_productive = "productive" in df.columns

    df["frame_types"] = SequenceFrameType.OUT.name if has_productive else None
    # `== True` is an element-wise comparison on a Series and must not be rewritten as `is True`
    if has_productive:
        df.loc[df["productive"] == True, "frame_types"] = SequenceFrameType.IN.name  # noqa: E712
    if "vj_in_frame" in df.columns:
        df.loc[df["vj_in_frame"] == True, "frame_types"] = SequenceFrameType.IN.name  # noqa: E712
    if "stop_codon" in df.columns:
        df.loc[df["stop_codon"] == True, "frame_types"] = SequenceFrameType.STOP.name  # noqa: E712

    if has_productive:
        allowed_frame_types = ImportHelper.prepare_frame_type_list(params)
        # explicit copy: the filtered frame is renamed and assigned to below, which would otherwise
        # be a chained assignment on a slice (SettingWithCopyWarning)
        df = df[df["frame_types"].isin(allowed_frame_types)].copy()

    if params.region_type == RegionType.IMGT_CDR3:
        if "sequence_aas" not in df.columns and "sequences" not in df.columns:
            # rename() skips labels that are not present, so one call covers both optional columns
            df.rename(columns={"cdr3": "sequences", "cdr3_aa": "sequence_aas"}, inplace=True)
            df.loc[:, "region_types"] = params.region_type.name
        elif "junction" in params.column_mapping or "junction_aa" in params.column_mapping:
            ImportHelper.junction_to_cdr3(df, params.region_type)
    # todo else: support "full_sequence" import through regiontype?

    if "chains" not in df.columns:
        df.loc[:, "chains"] = ImportHelper.load_chains_from_genes(df)

    df = ImportHelper.update_gene_info(df)
    ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
    ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)

    return df