Example #1
0
    def preprocess_dataframe(df: pd.DataFrame, params) -> pd.DataFrame:
        """
        Reshape a wide IRIS table into a long table with one row per chain entry.

        For every chain in ``params.receptor_chains.value`` and every chain duplicate
        ("1", and also "2" when ``params.import_dual_chains`` is True) the columns
        ``Chain: <chain> (<dup>)``, ``<chain> - V gene (<dup>)`` and
        ``<chain> - J gene (<dup>)`` are copied into a sub-frame, together with
        ``Clonotype ID`` and any ``params.extra_columns_to_load``. The sub-frames are
        concatenated, rows missing a sequence, V gene or J gene are dropped and the
        index is reset.

        Gene columns may hold several alternatives separated by " | ": either all
        combinations are expanded (``params.import_all_gene_combinations``) or one
        alternative per row is picked with ``rn.choice`` (presumably the stdlib
        ``random`` module — TODO confirm) and passed through ``IRISImport._load_gene``.

        Arguments:
            df: raw IRIS table as loaded from file
            params: IRIS import parameters object

        Returns:
            the reshaped data frame
        """
        # The trailing comma matters: ("1") is just the string "1", which only
        # worked because iterating a one-character string yields that character.
        chain_dups_to_process = ("1", "2") if params.import_dual_chains is True else ("1",)

        subframes = []
        for chain in params.receptor_chains.value:
            for chain_dup in chain_dups_to_process:
                subframe_dict = {"cell_ids": df["Clonotype ID"],
                                 "sequence_aas": df[f"Chain: {chain} ({chain_dup})"],
                                 "v_genes": df[f"{chain} - V gene ({chain_dup})"],
                                 "j_genes": df[f"{chain} - J gene ({chain_dup})"],
                                 "chains": Chain.get_chain(chain).value}
                if params.extra_columns_to_load is not None:
                    for extra_col in params.extra_columns_to_load:
                        subframe_dict[extra_col] = df[extra_col]
                subframes.append(pd.DataFrame(subframe_dict))

        df = pd.concat(subframes, axis=0)
        df.dropna(subset=["sequence_aas", "v_genes", "j_genes"], inplace=True)

        # concat keeps the original row labels once per sub-frame, so they repeat
        df.reset_index(drop=True, inplace=True)

        if params.import_all_gene_combinations:
            df = IRISImport.import_all_gene_combinations(df)
        else:
            for gene_column in ("v_genes", "j_genes"):
                processed_names = [IRISImport._load_gene(rn.choice(raw_gene_string.split(" | ")))
                                   for raw_gene_string in df[gene_column]]
                df[gene_column] = processed_names

        ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)

        return df
Example #2
0
    def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
        """
        Prepare a VDJdb data frame for import.

        All rows are marked as in-frame and junctions are converted through
        ImportHelper.junction_to_cdr3. Rows whose "sequence_identifiers" equals "0"
        are treated as single chains: when importing paired (non-repertoire) data
        they are dropped with a warning, otherwise their identifier is set to None.
        The original identifiers are kept as "receptor_identifiers" while
        "sequence_identifiers" is rebuilt from identifier and chain.

        Arguments:
            df: data frame loaded from a VDJdb file
            params: DatasetImportParams object describing what to import

        Returns:
            the data frame returned by VDJdbImport.extract_meta_columns
        """
        df["frame_types"] = SequenceFrameType.IN.name
        ImportHelper.junction_to_cdr3(df, params.region_type)

        if params.is_repertoire or not params.paired:
            # unpaired import: keep single chains but blank out the placeholder id
            df.loc[df["sequence_identifiers"] == "0", "sequence_identifiers"] = None
        else:
            unpaired_rows = df["sequence_identifiers"] == "0"
            n_single_chains = unpaired_rows.sum()
            if n_single_chains > 0:
                df.drop(df.loc[unpaired_rows].index, inplace=True)
                warnings.warn(f"VDJdbImport: {n_single_chains} single chains were removed when trying to create a ReceptorDataset.\n"
                              f"To import all chains as a SequenceDataset, use paired = False")

        if "chains" not in df.columns:
            df.loc[:, "chains"] = ImportHelper.load_chains_from_genes(df)

        original_identifiers = df["sequence_identifiers"]
        df["receptor_identifiers"] = original_identifiers
        df["sequence_identifiers"] = VDJdbImport.get_sequence_identifiers(df["sequence_identifiers"], df["chains"])

        ImportHelper.update_gene_info(df)
        ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
        ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)

        return VDJdbImport.extract_meta_columns(df, params)
Example #3
0
    def preprocess_dataframe(dataframe: pd.DataFrame, params: DatasetImportParams):
        """
        Filter and normalise a data frame before import.

        Steps, in order:
            - "frame_types" is upper-cased and rows are kept only if their frame type is in the
              list returned by ImportHelper.prepare_frame_type_list(params)
            - "region_types" is set to the name of params.region_type
            - if a "sequences" column exists it is cut to a fixed window whose start depends on
              the length of the amino acid sequence; for RegionType.IMGT_CDR3 the first and last
              character of "sequence_aas" are removed as well
            - germline names are converted by AdaptiveImportHelper.parse_adaptive_germline_to_imgt,
              none values are standardised, and empty / illegal-character sequences are dropped
            - the "chains" column is (re)loaded

        Arguments:
            dataframe: data frame loaded from file; must contain "frame_types" and "sequence_aas"
            params: DatasetImportParams object defining what to import

        Returns:
            the preprocessed data frame
        """
        dataframe.loc[:, "frame_types"] = dataframe.frame_types.str.upper()

        frame_type_list = ImportHelper.prepare_frame_type_list(params)
        # NOTE(review): boolean indexing may hand back a copy, so the .loc assignments
        # below can trigger pandas' SettingWithCopyWarning — confirm this is acceptable
        dataframe = dataframe[dataframe["frame_types"].isin(frame_type_list)]
        dataframe.loc[:, "region_types"] = params.region_type.name

        if params.region_type == RegionType.IMGT_CDR3:
            if "sequences" in dataframe.columns:
                # nucleotide window for the trimmed region: starts at 84 - 3 * len(aa) and ends
                # at 78, i.e. 3 nt (one codon) shorter on each side than the window used in the
                # other branch
                # presumably the amino acid sequence ends at nucleotide 81 of the read — TODO confirm
                dataframe.loc[:, 'sequences'] = [y[(84 - 3 * len(x)): 78] if x is not None else None for x, y in zip(dataframe['sequence_aas'], dataframe['sequences'])]
            # drop the first and the last amino acid
            dataframe.loc[:, 'sequence_aas'] = dataframe["sequence_aas"].str[1:-1]
        elif "sequences" in dataframe.columns:
            # keep the 3 * len(aa) nucleotides that end at position 81
            dataframe.loc[:, 'sequences'] = [y[(81 - 3 * len(x)): 81] if x is not None else None for x, y in zip(dataframe['sequence_aas'], dataframe['sequences'])]

        dataframe = AdaptiveImportHelper.parse_adaptive_germline_to_imgt(dataframe, params.organism)
        dataframe = ImportHelper.standardize_none_values(dataframe)
        ImportHelper.drop_empty_sequences(dataframe, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
        ImportHelper.drop_illegal_character_sequences(dataframe, params.import_illegal_characters)

        if "chains" in dataframe.columns:
            dataframe.loc[:, "chains"] = ImportHelper.load_chains(dataframe)
        else:
            # loading from v_subgroups is preferred as sometimes v_genes is None when v_subgroups is defined
            if "v_subgroups" in dataframe.columns:
                dataframe.loc[:, "chains"] = ImportHelper.load_chains_from_column(dataframe, "v_subgroups")
            else:
                dataframe.loc[:, "chains"] = ImportHelper.load_chains_from_genes(dataframe)

        return dataframe
Example #4
0
    def import_dataset(params: dict, dataset_name: str) -> Dataset:
        """
        Import an IRIS dataset, reusing a previously stored one when available.

        Arguments:
            params: raw import parameters, used to build an IRISImportParams object
            dataset_name: name of the dataset to import

        Returns:
            the dataset found by ImportHelper.load_dataset_if_exists, otherwise a newly
            imported repertoire dataset or sequence dataset depending on
            iris_params.is_repertoire
        """
        iris_params = IRISImportParams.build_object(**params)

        existing_dataset = ImportHelper.load_dataset_if_exists(params, iris_params, dataset_name)
        if existing_dataset is not None:
            return existing_dataset

        if iris_params.is_repertoire:
            return ImportHelper.import_repertoire_dataset(IRISImport, iris_params, dataset_name)

        # the sequence loader builds its own IRISImportParams from the raw dict
        return IRISImport.load_sequence_dataset(params, dataset_name)
Example #5
0
    def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
        """
        Run the standard ImportHelper clean-up steps on a loaded data frame.

        In order: junction to CDR3 conversion, removal of empty sequences, removal of
        sequences with illegal characters, and update of the gene information.

        Arguments:
            df: data frame loaded from file
            params: DatasetImportParams object defining what to import

        Returns:
            the same data frame object that was passed in
        """
        region_type = params.region_type
        ImportHelper.junction_to_cdr3(df, region_type)

        ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
        ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)

        ImportHelper.update_gene_info(df)
        return df
    def import_dataset(params, dataset_name: str) -> ReceptorDataset:
        """
        Import a receptor dataset from single-line receptor files and export it as pickle.

        Arguments:
            params: mapping of raw import parameters, used to build a DatasetImportParams object
            dataset_name: name assigned to the resulting dataset

        Returns:
            the imported dataset, after it has been exported to generic_params.result_path
        """
        generic_params = DatasetImportParams.build_object(**params)
        result_path = generic_params.result_path

        filenames = ImportHelper.get_sequence_filenames(generic_params.path, dataset_name)
        PathBuilder.build(result_path, warn_if_exists=True)

        dataset = SingleLineReceptorImport._import_from_files(filenames, generic_params)
        dataset.name = dataset_name
        dataset.params = ImportHelper.extract_sequence_dataset_params(params=generic_params)

        PickleExporter.export(dataset, result_path)
        return dataset
Example #7
0
    def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
        """
        Preprocess a data frame holding AIRR rearrangement data.

        - "frame_types" is derived from the "productive", "vj_in_frame" and "stop_codon"
          columns when they are present; a stop codon overrides the other two
        - when "productive" is present, rows are filtered to the frame types returned by
          ImportHelper.prepare_frame_type_list(params)
        - for RegionType.IMGT_CDR3, "cdr3"/"cdr3_aa" are used as the sequence columns when
          no sequence column exists yet; otherwise mapped junction columns are converted
          with ImportHelper.junction_to_cdr3
        - if there is no "chains" column, chains are loaded from the genes
        - gene information is updated and empty / illegal-character sequences are dropped

        Arguments:
            df: data frame loaded from an AIRR file
            params: DatasetImportParams object defining what to import

        Returns:
            the preprocessed data frame
        """
        has_productive_column = "productive" in df.columns

        if has_productive_column:
            # everything is out of frame unless explicitly marked productive
            df["frame_types"] = SequenceFrameType.OUT.name
            df.loc[df["productive"] == True, "frame_types"] = SequenceFrameType.IN.name
        else:
            df["frame_types"] = None

        # later entries win: a stop codon overrides an in-frame flag
        flag_overrides = (("vj_in_frame", SequenceFrameType.IN),
                          ("stop_codon", SequenceFrameType.STOP))
        for flag_column, frame_type in flag_overrides:
            if flag_column in df.columns:
                df.loc[df[flag_column] == True, "frame_types"] = frame_type.name

        if has_productive_column:
            frame_type_list = ImportHelper.prepare_frame_type_list(params)
            df = df[df["frame_types"].isin(frame_type_list)]

        if params.region_type == RegionType.IMGT_CDR3:
            no_sequence_columns = "sequence_aas" not in df.columns and "sequences" not in df.columns
            if no_sequence_columns:
                for airr_name, internal_name in (("cdr3", "sequences"), ("cdr3_aa", "sequence_aas")):
                    if airr_name in df.columns:
                        df.rename(columns={airr_name: internal_name}, inplace=True)
                df.loc[:, "region_types"] = params.region_type.name
            elif any(name in params.column_mapping for name in ("junction", "junction_aa")):
                ImportHelper.junction_to_cdr3(df, params.region_type)
        # todo else: support "full_sequence" import through regiontype?

        if "chains" not in df.columns:
            df.loc[:, "chains"] = ImportHelper.load_chains_from_genes(df)

        df = ImportHelper.update_gene_info(df)
        ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
        ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)

        return df
Example #8
0
    def load_sequence_dataset(params: dict, dataset_name: str) -> Dataset:
        """
        Import IRIS sequence files and store their items in pickled batch files.

        Arguments:
            params: raw import parameters, used to build an IRISImportParams object
            dataset_name: name of the dataset to create

        Returns:
            a ReceptorDataset when iris_params.paired is set, otherwise a SequenceDataset,
            pointing at the batch files written under iris_params.result_path
        """
        iris_params = IRISImportParams.build_object(**params)

        source_files = ImportHelper.get_sequence_filenames(iris_params.path, dataset_name)
        last_position = len(source_files) - 1
        batch_files = []
        batch_number = 0

        for position, source_file in enumerate(source_files):
            items = IRISSequenceImport.import_items(source_file, paired=iris_params.paired,
                                                    all_dual_chains=iris_params.import_dual_chains,
                                                    all_genes=iris_params.import_all_gene_combinations)
            is_last_file = position == last_position

            # NOTE(review): for a non-final file, the items still left when this loop
            # ends (at most sequence_file_size of them) are replaced by the next file's
            # import — confirm ImportHelper.store_sequence_items has already persisted
            # them, otherwise they appear to be dropped.
            while len(items) > iris_params.sequence_file_size or (is_last_file and len(items) > 0):
                batch_files.append(iris_params.result_path + "batch_{}.pickle".format(batch_number))
                ImportHelper.store_sequence_items(batch_files, items, iris_params.sequence_file_size)
                items = items[iris_params.sequence_file_size:]
                batch_number += 1

        dataset_class = ReceptorDataset if iris_params.paired else SequenceDataset
        return dataset_class(filenames=batch_files, file_size=iris_params.sequence_file_size, name=dataset_name)
Example #9
0
    def import_sequence_dataset(params: dict, dataset_name: str) -> RepertoireDataset:
        """
        Import a sequence dataset from zipped iReceptor files via the AIRR importer.

        The archives found under params['path'] are unzipped (without metadata) into
        "<result_path>tmp_airr/tmp_unzipped/", imported with AIRRImport using a deep
        copy of params that points at the unzipped files, and the unzipped folder is
        removed afterwards.

        Arguments:
            params: raw import parameters; must contain 'result_path' and 'path'
            dataset_name: name of the dataset to import

        Returns:
            the dataset produced by ImportHelper.import_dataset
            (NOTE(review): the RepertoireDataset annotation looks copied from the
            repertoire variant — confirm the actual return type)
        """
        base_result_path = params['result_path'] + "tmp_airr/"

        unzipped_path = base_result_path + "tmp_unzipped/"
        IReceptorImport._unzip_files(params['path'], unzipped_path, unzip_metadata=False)

        try:
            # work on a copy so the caller's params keep pointing at the zipped input
            airr_params = copy.deepcopy(params)
            airr_params["path"] = unzipped_path

            dataset = ImportHelper.import_dataset(AIRRImport, airr_params, dataset_name)
        finally:
            # remove the temporary files even when the import fails
            shutil.rmtree(unzipped_path)

        return dataset
Example #10
0
    def import_repertoire_dataset(params: dict, dataset_name: str) -> RepertoireDataset:
        """
        Import a repertoire dataset from iReceptor files via the AIRR importer.

        IReceptorImport._create_airr_repertoiredataset is called with params['path'],
        the temporary folder "<result_path>tmp_airr/" and the path of a "metadata.csv"
        inside that folder. The AIRR importer is then run on a deep copy of params that
        points at the temporary folder and metadata file, and the temporary folder is
        removed afterwards.

        Arguments:
            params: raw import parameters; must contain 'result_path' and 'path'
            dataset_name: name of the dataset to import

        Returns:
            the dataset produced by ImportHelper.import_dataset
        """
        base_result_path = params['result_path'] + "tmp_airr/"
        metadata_file_path = base_result_path + "metadata.csv"

        IReceptorImport._create_airr_repertoiredataset(params['path'], base_result_path, metadata_file_path)

        try:
            # work on a copy so the caller's params keep pointing at the original input
            airr_params = copy.deepcopy(params)
            airr_params["path"] = base_result_path
            airr_params["metadata_file"] = metadata_file_path

            dataset = ImportHelper.import_dataset(AIRRImport, airr_params, dataset_name)
        finally:
            # remove the temporary AIRR files even when the import fails
            shutil.rmtree(base_result_path)

        return dataset
Example #11
0
    def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
        """
        Filter a data frame on its "productive" column and run the standard clean-up.

        Rows whose "productive" value is the string "True" get the in-frame frame type;
        all other rows keep frame type None. Rows are then kept only if their
        "productive" value is allowed by params.import_productive ("True") and
        params.import_unproductive ("False").

        Arguments:
            df: data frame loaded from file; must contain a "productive" column
            params: DatasetImportParams object defining what to import

        Returns:
            the filtered and cleaned data frame
        """
        df["frame_types"] = None
        is_productive = df["productive"] == "True"
        df.loc[is_productive, "frame_types"] = SequenceFrameType.IN.name

        # "productive" holds strings, so the allowed values are strings too
        import_flags = ((params.import_productive, "True"), (params.import_unproductive, "False"))
        allowed_productive_values = [label for enabled, label in import_flags if enabled]

        df = df[df["productive"].isin(allowed_productive_values)]

        ImportHelper.junction_to_cdr3(df, params.region_type)
        ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
        ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)
        ImportHelper.update_gene_info(df)

        chains_missing = "chains" not in df.columns
        if chains_missing:
            df.loc[:, "chains"] = ImportHelper.load_chains_from_genes(df)

        return df
Example #12
0
    def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
        """
        Preprocess the data frame loaded from one MiXCR file.

        - the amino acid and nucleotide columns for params.region_type are looked up in
          MiXCRImport.SEQUENCE_NAME_MAP and copied to "sequence_aas" and "sequences"
        - junctions are converted with ImportHelper.junction_to_cdr3
        - "counts" is converted to int by way of float
        - "v_genes" and "j_genes" are reloaded through MiXCRImport._load_genes and the
          chains are derived from the genes
        - gene information is updated and empty / illegal-character sequences are dropped

        Arguments:

            df: data frame loaded from a MiXCR file
            params: DatasetImportParams object defining what to import and how to do it

        Returns:
            the same data frame object, modified in place

        """
        region_columns = MiXCRImport.SEQUENCE_NAME_MAP[params.region_type]
        df["sequence_aas"] = df[region_columns["AA"]]
        df["sequences"] = df[region_columns["NT"]]
        ImportHelper.junction_to_cdr3(df, params.region_type)

        # go through float first so that values such as "12.0" still parse
        counts_as_float = df["counts"].astype(float)
        df["counts"] = counts_as_float.astype(int)

        for gene_column in ("v_genes", "j_genes"):
            df[gene_column] = MiXCRImport._load_genes(df, gene_column)
        df["chains"] = ImportHelper.load_chains_from_genes(df)

        ImportHelper.update_gene_info(df)
        ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
        ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)

        return df
Example #13
0
    def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
        """
        Validate and clean a data frame before import.

        A "counts" column of 1 is added when missing, "sequence_identifiers" is reset to
        None, and the standard ImportHelper clean-up steps are applied. Chains are
        derived from the genes when no "chains" column exists.

        Arguments:
            df: data frame loaded from file
            params: DatasetImportParams object defining what to import

        Returns:
            the same data frame object, modified in place

        Raises:
            IOError: if neither a "sequences" nor a "sequence_aas" column is present
        """
        sequence_columns = ("sequences", "sequence_aas")
        if all(column not in df.columns for column in sequence_columns):
            raise IOError("OLGAImport: Columns should contain at least 'sequences' or 'sequence_aas'.")

        if "counts" not in df.columns:
            df["counts"] = 1

        df["sequence_identifiers"] = None

        ImportHelper.junction_to_cdr3(df, params.region_type)
        ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
        ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)

        chains_missing = "chains" not in df.columns
        if chains_missing:
            df.loc[:, "chains"] = ImportHelper.load_chains_from_genes(df)

        return df
Example #14
0
    def prepare_reference(reference_params: dict, location: str, paired: bool):
        """
        Load reference receptors or sequences described by a reference specification.

        Arguments:
            reference_params: dict with the keys "format" and "params"; "params" must
                contain the "path" of the reference file. Note that "paired" is written
                into the "params" dict when it is missing.
            location: name of the calling component, used as prefix in error messages
            paired: whether paired data is expected; a "paired" value given in "params"
                must agree with it

        Returns:
            the items imported by IRISSequenceImport (format "IRIS") or by
            ImportHelper.import_items with the "<format>Import" class
        """
        ParameterValidator.assert_keys(list(reference_params), ["format", "params"], location, "reference")

        seq_import_params = reference_params.get("params", {})

        assert os.path.isfile(seq_import_params["path"]), f"{location}: the file {seq_import_params['path']} does not exist. " \
                                                  f"Specify the correct path under reference."

        if "paired" not in seq_import_params:
            seq_import_params["paired"] = paired
        else:
            assert seq_import_params["paired"] == paired, f"{location}: paired must be {paired} for SequenceImport"

        format_str = reference_params["format"]

        if format_str == "IRIS":  # todo refactor this when refactoring IRISSequenceImport
            return IRISSequenceImport.import_items(**seq_import_params)

        import_class = ReflectionHandler.get_class_by_name("{}Import".format(format_str))

        # start from the format's defaults and let the user-supplied values override them
        defaults_dir = EnvironmentSettings.default_params_path + "datasets/"
        params = DefaultParamsLoader.load(defaults_dir, DefaultParamsLoader.convert_to_snake_case(format_str))
        for key, value in seq_import_params.items():
            params[key] = value
        params["paired"] = paired

        processed_params = DatasetImportParams.build_object(**params)
        reference_path = reference_params["params"]["path"]

        return ImportHelper.import_items(import_class, reference_path, processed_params)
Example #15
0
    def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
        """
        Filter IGoR output and translate its nucleotide sequences.

        Rows are kept only when "anchors_found" is "1" and, unless
        params.import_out_of_frame is set, when "is_inframe" is "1". Amino acid
        sequences are produced with IGoRImport.translate_sequence; unless
        params.import_with_stop_codon is set, translations containing "*" are removed.

        Arguments:
            df: data frame loaded from an IGoR file
            params: DatasetImportParams object defining what to import

        Returns:
            the filtered data frame with an added "sequence_aas" column
        """
        if "counts" not in df.columns:
            df["counts"] = 1

        anchors_present = df["anchors_found"] == "1"
        df = df[anchors_present]

        if not params.import_out_of_frame:
            df = df[df["is_inframe"] == "1"]

        df["sequence_aas"] = df["sequences"].apply(IGoRImport.translate_sequence)

        if not params.import_with_stop_codon:
            lacks_stop_codon = ["*" not in translated for translated in df["sequence_aas"]]
            df = df[lacks_stop_codon]

        ImportHelper.junction_to_cdr3(df, params.region_type)
        # note: import_empty_aa_sequences is fixed to True here; IGoR does not output amino
        # acid sequences, so the parameter is not meaningful for this format
        ImportHelper.drop_empty_sequences(df, True, params.import_empty_nt_sequences)
        ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)

        # chain or at least receptorsequence?

        return df
Example #16
0
 def import_receptors(df, params):
     """Copy "cell_id" into "receptor_identifiers" and delegate to ImportHelper.import_receptors."""
     cell_ids = df["cell_id"]
     df["receptor_identifiers"] = cell_ids

     receptors = ImportHelper.import_receptors(df, params)
     return receptors
Example #17
0
 def alternative_load_func(filename, params):
     """
     Load a rearrangement file with the airr library.

     None values are standardised and columns that are entirely empty are removed.
     The params argument is not used here.
     """
     rearrangements = ImportHelper.standardize_none_values(airr.load_rearrangement(filename))
     # drop columns without a single value
     rearrangements.dropna(axis="columns", how="all", inplace=True)
     return rearrangements
Example #18
0
 def import_dataset(params: dict, dataset_name: str) -> Dataset:
     """Import the named dataset through ImportHelper using the AIRRImport class."""
     dataset = ImportHelper.import_dataset(AIRRImport, params, dataset_name)
     return dataset
Example #19
0
 def import_dataset(params: dict, dataset_name: str) -> Dataset:
     """Import the named dataset through ImportHelper using the ImmunoSEQSampleImport class."""
     dataset = ImportHelper.import_dataset(ImmunoSEQSampleImport, params, dataset_name)
     return dataset
Example #20
0
 def import_receptors(df, params):
     """Delegate receptor import to ImportHelper.import_receptors without changing df."""
     receptors = ImportHelper.import_receptors(df, params)
     return receptors
Example #21
0
 def import_receptors(df, params):
     """Copy "sequence_identifiers" into "receptor_identifiers" and delegate to ImportHelper.import_receptors."""
     sequence_ids = df["sequence_identifiers"]
     df["receptor_identifiers"] = sequence_ids

     receptors = ImportHelper.import_receptors(df, params)
     return receptors
Example #22
0
 def import_dataset(params: dict, dataset_name: str) -> Dataset:
     """Import the named dataset through ImportHelper using the ImmunoSEQRearrangementImport class."""
     dataset = ImportHelper.import_dataset(ImmunoSEQRearrangementImport, params, dataset_name)
     return dataset
Example #23
0
 def import_dataset(params: dict, dataset_name: str) -> Dataset:
     """Import the named dataset through ImportHelper using the TenxGenomicsImport class."""
     dataset = ImportHelper.import_dataset(TenxGenomicsImport, params, dataset_name)
     return dataset