Example #1
0
    def preprocess_dataframe(dataframe: pd.DataFrame, params: DatasetImportParams):
        """Filter and normalise an imported sequence table.

        Processing order:

        1. Upper-case the ``frame_types`` column (written back into the frame
           that was passed in) and keep only the rows whose frame type is in
           the list returned by ``ImportHelper.prepare_frame_type_list(params)``.
        2. Set ``region_types`` to ``params.region_type.name`` for every row.
        3. If a ``sequences`` column exists, trim each value based on the
           length of the matching ``sequence_aas`` value; for
           ``RegionType.IMGT_CDR3`` additionally strip the first and last
           character from ``sequence_aas``.
        4. Run the germline conversion, None-standardisation and
           empty/illegal sequence helpers.
        5. Fill the ``chains`` column from ``chains``, ``v_subgroups`` or the
           gene columns, in that order of preference.

        Args:
            dataframe: table with at least ``frame_types`` and
                ``sequence_aas`` columns; ``sequences``, ``chains`` and
                ``v_subgroups`` are optional.
            params: import parameters; this function reads ``region_type``,
                ``organism``, ``import_empty_aa_sequences``,
                ``import_empty_nt_sequences`` and ``import_illegal_characters``.

        Returns:
            The filtered and processed dataframe (a different object than the
            one passed in).
        """

        def trim_nucleotides(aa_seq, nt_seq, start_base, end):
            # A missing value (None / NaN) on either side cannot be measured or
            # sliced, so the trimmed nucleotide sequence is missing as well.
            if not isinstance(aa_seq, str) or not isinstance(nt_seq, str):
                return None
            # NOTE(review): when 3 * len(aa_seq) exceeds start_base the start
            # index is negative and Python counts it from the end of nt_seq -
            # confirm this is the intended result for very long sequences.
            return nt_seq[start_base - 3 * len(aa_seq): end]

        dataframe.loc[:, "frame_types"] = dataframe.frame_types.str.upper()

        frame_type_list = ImportHelper.prepare_frame_type_list(params)
        # Explicit copy: the filtered frame is assigned to below, and writing
        # into the result of a boolean selection is ambiguous for pandas
        # (SettingWithCopyWarning).
        dataframe = dataframe[dataframe["frame_types"].isin(frame_type_list)].copy()
        dataframe.loc[:, "region_types"] = params.region_type.name

        is_imgt_cdr3 = params.region_type == RegionType.IMGT_CDR3

        if "sequences" in dataframe.columns:
            # The slice bounds differ between the IMGT CDR3 region and the rest.
            start_base, end = (84, 78) if is_imgt_cdr3 else (81, 81)
            # The trimming must see the untrimmed amino acid sequences, so it
            # runs before sequence_aas is shortened below.
            dataframe.loc[:, "sequences"] = [
                trim_nucleotides(aa_seq, nt_seq, start_base, end)
                for aa_seq, nt_seq in zip(dataframe["sequence_aas"], dataframe["sequences"])
            ]

        if is_imgt_cdr3:
            dataframe.loc[:, "sequence_aas"] = dataframe["sequence_aas"].str[1:-1]

        dataframe = AdaptiveImportHelper.parse_adaptive_germline_to_imgt(dataframe, params.organism)
        dataframe = ImportHelper.standardize_none_values(dataframe)
        # NOTE(review): the return values of the next two calls are discarded,
        # so they only take effect if the helpers modify `dataframe` in place -
        # confirm against ImportHelper.
        ImportHelper.drop_empty_sequences(dataframe, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
        ImportHelper.drop_illegal_character_sequences(dataframe, params.import_illegal_characters)

        if "chains" in dataframe.columns:
            chains = ImportHelper.load_chains(dataframe)
        elif "v_subgroups" in dataframe.columns:
            # loading from v_subgroups is preferred as sometimes v_genes is None when v_subgroups is defined
            chains = ImportHelper.load_chains_from_column(dataframe, "v_subgroups")
        else:
            chains = ImportHelper.load_chains_from_genes(dataframe)
        dataframe.loc[:, "chains"] = chains

        return dataframe
Example #2
0
 def alternative_load_func(filename, params):
     """Load a rearrangement file and drop the columns that hold no values.

     The file is read with ``airr.load_rearrangement``, passed through
     ``ImportHelper.standardize_none_values``, and every column in which all
     entries are missing is then removed in place.

     Args:
         filename: path handed to ``airr.load_rearrangement``.
         params: not used by this function.

     Returns:
         The loaded table without its all-missing columns.
     """
     rearrangements = ImportHelper.standardize_none_values(airr.load_rearrangement(filename))
     # Remove, in place, the columns in which every entry is missing.
     rearrangements.dropna(axis="columns", how="all", inplace=True)
     return rearrangements