def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
    """
    Prepare a loaded VDJdb table before it is turned into sequence/receptor objects.

    Steps, in order:
    - every row gets "frame_types" set to SequenceFrameType.IN.name;
    - ImportHelper.junction_to_cdr3 is applied with params.region_type;
    - rows whose "sequence_identifiers" equals "0" are dropped (with a warning) for a paired, non-repertoire
      import; otherwise their identifier is replaced by None;
    - the ImportHelper clean-up steps run (empty sequences, illegal characters, gene info, chains);
    - the identifier column is copied to "receptor_identifiers" and then rebuilt with
      VDJdbImport.get_sequence_identifiers;
    - VDJdbImport.extract_meta_columns produces the returned dataframe.

    Arguments:
        df: dataframe to process; it is modified in place up to the meta column extraction
        params: import parameters (is_repertoire, paired, region_type and the import_* flags are read)

    Returns:
        the dataframe returned by VDJdbImport.extract_meta_columns
    """
    df["frame_types"] = SequenceFrameType.IN.name
    ImportHelper.junction_to_cdr3(df, params.region_type)

    importing_receptors = not params.is_repertoire and params.paired
    zero_id_rows = df["sequence_identifiers"] == "0"

    if importing_receptors:
        n_single_chains = int(zero_id_rows.sum())
        if n_single_chains > 0:
            df.drop(index=df.index[zero_id_rows], inplace=True)
            warnings.warn(
                f"VDJdbImport: {n_single_chains} single chains were removed when trying to create a ReceptorDataset.\n"
                f"To import all chains as a SequenceDataset, use paired = False"
            )
    else:
        df.loc[zero_id_rows, "sequence_identifiers"] = None

    ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
    ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)
    ImportHelper.update_gene_info(df)
    ImportHelper.load_chains(df)

    # keep the original identifier before it is replaced by the per-chain identifier
    original_identifiers = df["sequence_identifiers"]
    df["receptor_identifiers"] = original_identifiers
    df["sequence_identifiers"] = VDJdbImport.get_sequence_identifiers(df["sequence_identifiers"], df["chains"])

    return VDJdbImport.extract_meta_columns(df, params)
def preprocess_dataframe(df: pd.DataFrame, params):
    """
    Reshape a wide IRIS table into one row per chain.

    For every chain in params.receptor_chains.value and every chain copy ("1", plus "2" when
    params.import_dual_chains is True), the columns "Chain: {chain} ({copy})", "{chain} - V gene ({copy})"
    and "{chain} - J gene ({copy})" are collected together with "Clonotype ID" (and any column listed in
    params.extra_columns_to_load) into a sub-frame. The sub-frames are concatenated, rows lacking a sequence,
    V gene or J gene are dropped and the index is reset.

    Gene columns may list several candidates separated by " | ": either all combinations are expanded
    through IRISImport.import_all_gene_combinations, or one candidate per row is picked at random and passed
    through IRISImport._load_gene.

    Arguments:
        df: the raw dataframe as read from file
        params: import parameters (an IRISImportParams-like object)

    Returns:
        a new dataframe with the columns cell_ids, sequence_aas, v_genes, j_genes, chains (+ extra columns)
    """
    # a one-element tuple needs the trailing comma: ("1") is just the string "1", which only worked because
    # iterating a one-character string happens to yield that character
    chain_dups_to_process = ("1", "2") if params.import_dual_chains is True else ("1",)

    subframes = []
    for chain in params.receptor_chains.value:
        for chain_dup in chain_dups_to_process:
            subframe_dict = {"cell_ids": df["Clonotype ID"],
                             "sequence_aas": df[f"Chain: {chain} ({chain_dup})"],
                             "v_genes": df[f"{chain} - V gene ({chain_dup})"],
                             "j_genes": df[f"{chain} - J gene ({chain_dup})"],
                             "chains": Chain.get_chain(chain).value}

            if params.extra_columns_to_load is not None:
                for extra_col in params.extra_columns_to_load:
                    subframe_dict[extra_col] = df[extra_col]

            subframes.append(pd.DataFrame(subframe_dict))

    df = pd.concat(subframes, axis=0)
    df.dropna(subset=["sequence_aas", "v_genes", "j_genes"], inplace=True)
    df.reset_index(drop=True, inplace=True)

    if params.import_all_gene_combinations:
        df = IRISImport.import_all_gene_combinations(df)
    else:
        # v_genes is processed completely before j_genes so the order of random draws stays the same
        for gene_column in ("v_genes", "j_genes"):
            df[gene_column] = [IRISImport._load_gene(rn.choice(raw_gene_string.split(" | ")))
                               for raw_gene_string in df[gene_column]]

    ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)

    return df
def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
    """
    Run the ImportHelper preprocessing steps on the dataframe and return it.

    The steps are applied in this order: drop empty sequences, drop sequences with illegal characters,
    junction_to_cdr3 (with params.region_type), update gene info, load chains.

    Arguments:
        df: dataframe to preprocess; the same object is passed to every helper and returned
        params: import parameters supplying the import_* flags and the region type

    Returns:
        the dataframe that was passed in
    """
    empty_aa_flag = params.import_empty_aa_sequences
    empty_nt_flag = params.import_empty_nt_sequences
    ImportHelper.drop_empty_sequences(df, empty_aa_flag, empty_nt_flag)

    illegal_characters_flag = params.import_illegal_characters
    ImportHelper.drop_illegal_character_sequences(df, illegal_characters_flag)

    region_type = params.region_type
    ImportHelper.junction_to_cdr3(df, region_type)

    ImportHelper.update_gene_info(df)
    ImportHelper.load_chains(df)

    return df
def import_dataset(params: dict, dataset_name: str) -> Dataset:
    """
    Return the dataset found by ImportHelper.load_dataset_if_exists, or import it when there is none.

    When nothing is found, a repertoire dataset is imported through ImportHelper.import_repertoire_dataset
    if the built IRISImportParams has is_repertoire set; otherwise IRISImport.load_sequence_dataset is used.

    Arguments:
        params: raw import parameters, used to build an IRISImportParams object
        dataset_name: name of the dataset

    Returns:
        the existing or newly imported dataset
    """
    iris_params = IRISImportParams.build_object(**params)

    existing_dataset = ImportHelper.load_dataset_if_exists(params, iris_params, dataset_name)
    if existing_dataset is not None:
        return existing_dataset

    if iris_params.is_repertoire:
        return ImportHelper.import_repertoire_dataset(IRISImport, iris_params, dataset_name)

    return IRISImport.load_sequence_dataset(params, dataset_name)
def import_dataset(params, dataset_name: str) -> ReceptorDataset:
    """
    Import a receptor dataset from the sequence files found for the given parameters and export it.

    Arguments:
        params: raw import parameters (a mapping), used to build a DatasetImportParams object
        dataset_name: name assigned to the resulting dataset

    Returns:
        the imported dataset, after it has been handed to PickleExporter.export with the result path
    """
    import_params = DatasetImportParams.build_object(**params)
    sequence_files = ImportHelper.get_sequence_filenames(import_params.path, dataset_name)

    # the result directory is built before anything is imported
    PathBuilder.build(import_params.result_path, warn_if_exists=True)

    receptor_dataset = SingleLineReceptorImport._import_from_files(sequence_files, import_params)
    receptor_dataset.name = dataset_name
    receptor_dataset.labels = ImportHelper.extract_sequence_dataset_params(params=import_params)

    PickleExporter.export(receptor_dataset, import_params.result_path)
    return receptor_dataset
def prepare_reference(reference_params: dict, location: str, paired: bool):
    """
    Validate the specification of a reference and import the reference items from the given file.

    Arguments:
        reference_params: dict with the keys "format" and "params"; "params" must contain "path" and may
            contain "is_repertoire" (must be False) and "paired" (must equal the paired argument). Missing
            "is_repertoire"/"paired" entries are written into reference_params["params"].
        location: name used as a prefix in the error messages
        paired: the value that "paired" is required to have

    Returns:
        the items returned by ImportHelper.import_items for the "{format}Import" class

    Raises:
        AssertionError: if the file does not exist or is_repertoire/paired have an unexpected value
    """
    ParameterValidator.assert_keys(list(reference_params.keys()), ["format", "params"], location, "reference")

    seq_import_params = reference_params.get("params", {})

    assert os.path.isfile(seq_import_params["path"]), f"{location}: the file {seq_import_params['path']} does not exist. " \
                                                      f"Specify the correct path under reference."

    if "is_repertoire" not in seq_import_params:
        seq_import_params["is_repertoire"] = False
    else:
        assert seq_import_params["is_repertoire"] == False, f"{location}: is_repertoire must be False for SequenceImport"

    if "paired" not in seq_import_params:
        seq_import_params["paired"] = paired
    else:
        assert seq_import_params["paired"] == paired, f"{location}: paired must be {paired} for SequenceImport"

    format_str = reference_params["format"]
    import_class = ReflectionHandler.get_class_by_name("{}Import".format(format_str))

    defaults_dir = EnvironmentSettings.default_params_path / "datasets"
    default_params = DefaultParamsLoader.load(defaults_dir, DefaultParamsLoader.convert_to_snake_case(format_str))

    # user supplied values take precedence over the format defaults
    merged_params = {**default_params, **seq_import_params}
    processed_params = DatasetImportParams.build_object(**merged_params)

    return ImportHelper.import_items(import_class, reference_params["params"]["path"], processed_params)
def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
    """
    Function for preprocessing data from a dataframe containing AIRR data, such that:
        - "frame_types" is derived from the "productive", "vj_in_frame" and "stop_codon" columns (when present);
          a stop codon takes precedence because it is assigned last
        - if a "productive" column is present, only rows whose frame type is in the list returned by
          ImportHelper.prepare_frame_type_list(params) are kept
        - if the region type is IMGT_CDR3 and neither "sequence_aas" nor "sequences" is present, the columns
          "cdr3"/"cdr3_aa" are renamed to "sequences"/"sequence_aas" and "region_types" is set;
          otherwise, if "junction" or "junction_aa" is in params.column_mapping, ImportHelper.junction_to_cdr3
          is applied (presumably removing the leading C and trailing W to match the CDR3 definition)
        - if no "chains" column exists, it is filled from ImportHelper.load_chains_from_genes
        - ImportHelper.update_gene_info is applied (presumably removing the allele information from the
          V and J genes), then empty sequences and sequences with illegal characters are dropped

    Arguments:
        df: dataframe to preprocess
        params: import parameters (region_type, column_mapping and the import_* flags are read)

    Returns:
        the preprocessed dataframe
    """
    if "productive" in df.columns:
        # default to OUT, then mark productive rows as IN
        df["frame_types"] = SequenceFrameType.OUT.name
        df.loc[df["productive"] == True, "frame_types"] = SequenceFrameType.IN.name
    else:
        df["frame_types"] = None

    if "vj_in_frame" in df.columns:
        df.loc[df["vj_in_frame"] == True, "frame_types"] = SequenceFrameType.IN.name
    if "stop_codon" in df.columns:
        # assigned last, so STOP overrides IN/OUT set above
        df.loc[df["stop_codon"] == True, "frame_types"] = SequenceFrameType.STOP.name

    if "productive" in df.columns:
        # filtering only happens when a "productive" column was available
        frame_type_list = ImportHelper.prepare_frame_type_list(params)
        df = df[df["frame_types"].isin(frame_type_list)]

    if params.region_type == RegionType.IMGT_CDR3:
        if "sequence_aas" not in df.columns and "sequences" not in df.columns:
            # no sequence columns were mapped: fall back to the cdr3 columns
            # NOTE(review): df may be a filtered selection here, so these in-place edits can trigger
            # pandas' SettingWithCopyWarning - confirm
            if "cdr3" in df.columns:
                df.rename(columns={"cdr3": "sequences"}, inplace=True)
            if "cdr3_aa" in df.columns:
                df.rename(columns={"cdr3_aa": "sequence_aas"}, inplace=True)
            df.loc[:, "region_types"] = params.region_type.name
        elif "junction" in params.column_mapping or "junction_aa" in params.column_mapping:
            ImportHelper.junction_to_cdr3(df, params.region_type)
    # todo else: support "full_sequence" import through regiontype?

    if "chains" not in df.columns:
        df.loc[:, "chains"] = ImportHelper.load_chains_from_genes(df)

    # NOTE(review): relies on update_gene_info returning the dataframe - confirm
    df = ImportHelper.update_gene_info(df)
    ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
    ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)

    return df
def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
    """
    Check and preprocess a dataframe for OLGAImport.

    A "counts" column with value 1 is added when missing, "sequence_identifiers" is set to None for all rows,
    and the ImportHelper steps are applied in order: drop empty sequences, drop sequences with illegal
    characters, junction_to_cdr3, update gene info, load chains.

    Arguments:
        df: dataframe to preprocess; it is modified in place and returned
        params: import parameters supplying the import_* flags and the region type

    Returns:
        the dataframe that was passed in

    Raises:
        IOError: if the dataframe contains neither a "sequences" nor a "sequence_aas" column
    """
    has_sequence_column = "sequences" in df.columns or "sequence_aas" in df.columns
    if not has_sequence_column:
        raise IOError("OLGAImport: Columns should contain at least 'sequences' or 'sequence_aas'.")

    if "counts" not in df.columns:
        df["counts"] = 1

    df["sequence_identifiers"] = None

    empty_aa_flag = params.import_empty_aa_sequences
    empty_nt_flag = params.import_empty_nt_sequences
    ImportHelper.drop_empty_sequences(df, empty_aa_flag, empty_nt_flag)
    ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)
    ImportHelper.junction_to_cdr3(df, params.region_type)
    ImportHelper.update_gene_info(df)
    ImportHelper.load_chains(df)

    return df
def load_sequence_dataset(params: dict, dataset_name: str) -> Dataset:
    """
    Import all sequence files of an IRIS dataset and store the items in batch files.

    Items are read file by file with IRISSequenceImport.import_items and handed to
    ImportHelper.store_sequence_items in batches governed by sequence_file_size, each batch getting its own
    "batch_{index}.pickle" file under the result path. Items that do not fill a complete batch are carried
    over to the next input file (previously they were overwritten and silently lost); whatever remains after
    the last input file is stored as well.

    Arguments:
        params: raw import parameters, used to build an IRISImportParams object
        dataset_name: name of the resulting dataset

    Returns:
        a ReceptorDataset if the import is paired, otherwise a SequenceDataset, pointing to the batch files
    """
    iris_params = IRISImportParams.build_object(**params)

    filenames = ImportHelper.get_sequence_filenames(iris_params.path, dataset_name)
    batch_size = iris_params.sequence_file_size

    file_index = 0
    dataset_filenames = []
    items = []  # not yet stored items, kept across input files

    for index, filename in enumerate(filenames):
        is_last_file = index == len(filenames) - 1

        items.extend(IRISSequenceImport.import_items(filename, paired=iris_params.paired,
                                                     all_dual_chains=iris_params.import_dual_chains,
                                                     all_genes=iris_params.import_all_gene_combinations))

        # write full batches; after the last input file also flush the remainder
        while len(items) > batch_size or (is_last_file and len(items) > 0):
            dataset_filenames.append(iris_params.result_path / "batch_{}.pickle".format(file_index))
            ImportHelper.store_sequence_items(dataset_filenames, items, batch_size)
            items = items[batch_size:]
            file_index += 1

    dataset_class = ReceptorDataset if iris_params.paired else SequenceDataset
    return dataset_class(filenames=dataset_filenames, file_size=batch_size, name=dataset_name)
def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
    """
    Set frame types, filter on the "productive" column and run the ImportHelper preprocessing steps.

    "frame_types" is None except for rows whose "productive" value is the string "True", which get
    SequenceFrameType.IN.name. Rows are kept when "productive" is "True" (if params.import_productive) or
    "False" (if params.import_unproductive). Afterwards junction_to_cdr3, dropping of empty sequences,
    dropping of sequences with illegal characters, gene info update and chain loading are applied in order.

    Arguments:
        df: dataframe to preprocess; the frame type column is written to it before filtering
        params: import parameters supplying the import_* flags and the region type

    Returns:
        the filtered and preprocessed dataframe
    """
    df["frame_types"] = None
    productive_rows = df["productive"].eq("True")
    df.loc[productive_rows, "frame_types"] = SequenceFrameType.IN.name

    # values are strings here, not booleans
    requested = ((params.import_productive, "True"), (params.import_unproductive, "False"))
    allowed_productive_values = [value for wanted, value in requested if wanted]

    df = df[df["productive"].isin(allowed_productive_values)]

    ImportHelper.junction_to_cdr3(df, params.region_type)
    ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
    ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)
    ImportHelper.update_gene_info(df)
    ImportHelper.load_chains(df)

    return df
def import_sequence_dataset(params: dict, dataset_name: str) -> RepertoireDataset:
    """
    Unzip the input files to a temporary folder and import them through AIRRImport.

    The files are unzipped (without metadata) to <result_path>/tmp_airr/tmp_unzipped, a deep copy of params
    with "path" pointing to that folder is passed to ImportHelper.import_dataset, and the temporary folder
    is removed afterwards. If unzipping or importing fails, the temporary folder is removed on a best-effort
    basis before the error is re-raised, so that failed imports do not leave unzipped data behind.

    Arguments:
        params: import parameters; 'result_path' (path-like supporting "/") and 'path' are read
        dataset_name: name of the dataset

    Returns:
        the dataset returned by ImportHelper.import_dataset
    """
    base_result_path = params['result_path'] / "tmp_airr"
    unzipped_path = base_result_path / "tmp_unzipped"

    try:
        IReceptorImport._unzip_files(params['path'], unzipped_path, unzip_metadata=False)

        # work on a copy so that the caller's params keep pointing to the original path
        airr_params = copy.deepcopy(params)
        airr_params["path"] = unzipped_path

        dataset = ImportHelper.import_dataset(AIRRImport, airr_params, dataset_name)
    except Exception:
        # best-effort clean-up which must never mask the original error
        shutil.rmtree(unzipped_path, ignore_errors=True)
        raise

    shutil.rmtree(unzipped_path)

    return dataset
def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
    """
    Preprocess the dataframe loaded from one MiXCR file.

    - "sequence_aas" and "sequences" are copied from the columns that MiXCRImport.SEQUENCE_NAME_MAP lists
      under "AA" and "NT" for params.region_type
    - ImportHelper.junction_to_cdr3 is applied with params.region_type
    - "counts" is converted to int (via float)
    - "v_alleles" and "j_alleles" are replaced by the output of MiXCRImport._load_alleles
    - empty sequences and sequences with illegal characters are dropped, gene info is updated and chains
      are loaded through ImportHelper

    Arguments:
        df: dataframe to preprocess; it is modified in place and returned
        params: DatasetImportParams object defining what to import and how to do it

    Returns:
        the dataframe that was passed in
    """
    region_columns = MiXCRImport.SEQUENCE_NAME_MAP[params.region_type]
    df["sequence_aas"] = df[region_columns["AA"]]
    df["sequences"] = df[region_columns["NT"]]

    ImportHelper.junction_to_cdr3(df, params.region_type)

    # counts go through float first before being cast to int
    counts_as_float = df["counts"].astype(float)
    df["counts"] = counts_as_float.astype(int)

    for allele_column in ("v_alleles", "j_alleles"):
        df[allele_column] = MiXCRImport._load_alleles(df, allele_column)

    ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
    ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)
    ImportHelper.update_gene_info(df)
    ImportHelper.load_chains(df)

    return df
def import_repertoire_dataset(params: dict, dataset_name: str) -> RepertoireDataset:
    """
    Convert the input to a temporary AIRR repertoire dataset and import it through AIRRImport.

    IReceptorImport._create_airr_repertoiredataset is called with <result_path>/tmp_airr as output folder and
    <result_path>/tmp_airr/metadata.csv as metadata file; a deep copy of params pointing to these is passed
    to ImportHelper.import_dataset and the temporary folder is removed afterwards. If the conversion or the
    import fails, the temporary folder is removed on a best-effort basis before the error is re-raised, so
    that failed imports do not leave temporary data behind.

    Arguments:
        params: import parameters; 'result_path' (path-like supporting "/") and 'path' are read
        dataset_name: name of the dataset

    Returns:
        the dataset returned by ImportHelper.import_dataset
    """
    base_result_path = params['result_path'] / "tmp_airr"
    metadata_file_path = base_result_path / "metadata.csv"

    try:
        IReceptorImport._create_airr_repertoiredataset(params['path'], base_result_path, metadata_file_path)

        # work on a copy so that the caller's params keep pointing to the original input
        airr_params = copy.deepcopy(params)
        airr_params["path"] = base_result_path
        airr_params["metadata_file"] = metadata_file_path

        dataset = ImportHelper.import_dataset(AIRRImport, airr_params, dataset_name)
    except Exception:
        # best-effort clean-up which must never mask the original error
        shutil.rmtree(base_result_path, ignore_errors=True)
        raise

    shutil.rmtree(base_result_path)

    return dataset
def preprocess_dataframe(dataframe: pd.DataFrame, params: DatasetImportParams):
    """
    Filter by frame type, cut the sequences to the requested region and run the common preprocessing steps.

    - if a "frame_types" column exists, it is upper-cased and only rows whose frame type is in the list
      returned by ImportHelper.prepare_frame_type_list(params) are kept
    - "region_types" is set to the name of params.region_type
    - nucleotide sequences (if present) are cut to a window whose length depends on the length of the
      amino acid sequence; for IMGT_CDR3 the first and last amino acid are removed as well
    - AdaptiveImportHelper.parse_adaptive_germline_to_imgt is applied with params.organism, followed by the
      ImportHelper steps: update gene info, load chains, drop empty sequences, drop illegal characters

    Arguments:
        dataframe: dataframe to preprocess
        params: import parameters (region_type, organism and the import_* flags are read)

    Returns:
        the preprocessed dataframe
    """
    if "frame_types" in dataframe.columns:
        dataframe.loc[:, "frame_types"] = dataframe.frame_types.str.upper()

        frame_type_list = ImportHelper.prepare_frame_type_list(params)
        dataframe = dataframe[dataframe["frame_types"].isin(frame_type_list)]

    # NOTE(review): dataframe may be a filtered selection from here on, so the .loc assignments below can
    # trigger pandas' SettingWithCopyWarning - confirm
    dataframe.loc[:, "region_types"] = params.region_type.name

    if params.region_type == RegionType.IMGT_CDR3:
        if "sequences" in dataframe.columns:
            # window of 3 * (len(x) - 2) nucleotides ending at position 78: compared to the branch below it
            # is one codon shorter on each side, matching the [1:-1] trim of the amino acid sequence
            # NOTE(review): the fixed end position 81 presumably comes from the input format - confirm
            dataframe.loc[:, 'sequences'] = [y[(84 - 3 * len(x)): 78] if x is not None else None
                                             for x, y in zip(dataframe['sequence_aas'], dataframe['sequences'])]
        dataframe.loc[:, 'sequence_aas'] = dataframe["sequence_aas"].str[1:-1]
    elif "sequences" in dataframe.columns:
        # window of 3 * len(x) nucleotides ending at position 81
        dataframe.loc[:, 'sequences'] = [y[(81 - 3 * len(x)): 81] if x is not None else None
                                         for x, y in zip(dataframe['sequence_aas'], dataframe['sequences'])]

    dataframe = AdaptiveImportHelper.parse_adaptive_germline_to_imgt(dataframe, params.organism)

    ImportHelper.update_gene_info(dataframe)
    ImportHelper.load_chains(dataframe)
    ImportHelper.drop_empty_sequences(dataframe, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
    ImportHelper.drop_illegal_character_sequences(dataframe, params.import_illegal_characters)

    return dataframe
def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
    """
    Filter IGoR output and add translated amino acid sequences.

    - a "counts" column with value 1 is added to the given dataframe when missing
    - only rows with anchors_found == "1" are kept; unless params.import_out_of_frame is set, only rows
      with is_inframe == "1" are kept
    - "sequence_aas" is computed from "sequences" with IGoRImport.translate_sequence
    - unless params.import_with_stop_codon is set, rows whose amino acid sequence contains "*" are removed
    - junction_to_cdr3, dropping of empty sequences and dropping of sequences with illegal characters
      are applied through ImportHelper

    Every filtered frame is copied explicitly, so the column assignment and the in-place helper calls that
    follow operate on an independent dataframe instead of a selection of the input (which makes pandas
    emit SettingWithCopyWarning). The stop codon mask is a boolean array, so an empty dataframe stays a
    row selection and keeps its columns (an empty list mask is interpreted as "select no columns").

    Arguments:
        df: dataframe to preprocess
        params: import parameters supplying the import_* flags and the region type

    Returns:
        a new, filtered dataframe
    """
    if "counts" not in df.columns:
        df["counts"] = 1

    df = df[df.anchors_found == "1"].copy()

    if not params.import_out_of_frame:
        df = df[df.is_inframe == "1"].copy()

    df["sequence_aas"] = df["sequences"].apply(IGoRImport.translate_sequence)

    if not params.import_with_stop_codon:
        no_stop_codon = pd.Series(["*" not in seq for seq in df["sequence_aas"]], dtype=bool).to_numpy()
        df = df[no_stop_codon].copy()

    ImportHelper.junction_to_cdr3(df, params.region_type)

    # note: import_empty_aa_sequences is set to True here; since IGoR does not output amino acid sequences,
    # this parameter is not meaningful
    ImportHelper.drop_empty_sequences(df, True, params.import_empty_nt_sequences)
    ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)

    # chain or at least receptorsequence?

    return df
def import_receptors(df, params):
    """
    Import receptors from the dataframe by delegating to ImportHelper.import_receptors.

    Arguments:
        df: dataframe holding the data to import
        params: import parameters, passed on unchanged

    Returns:
        whatever ImportHelper.import_receptors returns
    """
    receptors = ImportHelper.import_receptors(df, params)
    return receptors
def import_dataset(params: dict, dataset_name: str) -> Dataset:
    """
    Import a dataset through ImportHelper.import_dataset, using VDJdbImport as the import class.

    Arguments:
        params: import parameters, passed on unchanged
        dataset_name: name of the dataset

    Returns:
        the dataset returned by ImportHelper.import_dataset
    """
    dataset = ImportHelper.import_dataset(VDJdbImport, params, dataset_name)
    return dataset
def import_receptors(df, params):
    """
    Copy the "cell_id" column to "receptor_identifiers" and import receptors through ImportHelper.

    Arguments:
        df: dataframe with a "cell_id" column; the "receptor_identifiers" column is added in place
        params: import parameters, passed on unchanged

    Returns:
        whatever ImportHelper.import_receptors returns
    """
    cell_ids = df["cell_id"]
    df["receptor_identifiers"] = cell_ids

    receptors = ImportHelper.import_receptors(df, params)
    return receptors
def alternative_load_func(filename, params):
    """
    Load a rearrangement file with the airr library and remove columns without any value.

    Arguments:
        filename: file passed to airr.load_rearrangement
        params: not used by this function

    Returns:
        the loaded dataframe after ImportHelper.standardize_none_values was applied and all columns
        consisting only of missing values were dropped
    """
    rearrangements = airr.load_rearrangement(filename)

    ImportHelper.standardize_none_values(rearrangements)
    rearrangements.dropna(how="all", axis="columns", inplace=True)

    return rearrangements
def import_dataset(params: dict, dataset_name: str) -> Dataset:
    """
    Import a dataset through ImportHelper.import_dataset, using ImmunoSEQRearrangementImport as the import class.

    Arguments:
        params: import parameters, passed on unchanged
        dataset_name: name of the dataset

    Returns:
        the dataset returned by ImportHelper.import_dataset
    """
    dataset = ImportHelper.import_dataset(ImmunoSEQRearrangementImport, params, dataset_name)
    return dataset
def import_dataset(params: dict, dataset_name: str) -> Dataset:
    """
    Import a dataset through ImportHelper.import_dataset, using TenxGenomicsImport as the import class.

    Arguments:
        params: import parameters, passed on unchanged
        dataset_name: name of the dataset

    Returns:
        the dataset returned by ImportHelper.import_dataset
    """
    dataset = ImportHelper.import_dataset(TenxGenomicsImport, params, dataset_name)
    return dataset
def import_dataset(params: dict, dataset_name: str) -> Dataset:
    """
    Import a dataset through ImportHelper.import_dataset, using ImmunoSEQSampleImport as the import class.

    Arguments:
        params: import parameters, passed on unchanged
        dataset_name: name of the dataset

    Returns:
        the dataset returned by ImportHelper.import_dataset
    """
    dataset = ImportHelper.import_dataset(ImmunoSEQSampleImport, params, dataset_name)
    return dataset
def import_receptors(df, params):
    """
    Copy the "sequence_identifiers" column to "receptor_identifiers" and import receptors through ImportHelper.

    Arguments:
        df: dataframe with a "sequence_identifiers" column; the "receptor_identifiers" column is added in place
        params: import parameters, passed on unchanged

    Returns:
        whatever ImportHelper.import_receptors returns
    """
    sequence_identifiers = df["sequence_identifiers"]
    df["receptor_identifiers"] = sequence_identifiers

    receptors = ImportHelper.import_receptors(df, params)
    return receptors