Esempio n. 1
0
    def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
        """
        Prepare a dataframe of VDJdb records before it is turned into a dataset.

        The steps are applied in this order:
            - every row gets the in-frame frame type
            - ImportHelper.junction_to_cdr3 is called with the requested region type
            - rows whose "sequence_identifiers" value is "0" are either dropped with a warning
              (when params.paired is set and params.is_repertoire is not), or get their
              identifier replaced by None (all other cases)
            - a "chains" column is filled from ImportHelper.load_chains_from_genes if it is missing
            - the current identifiers are copied to "receptor_identifiers" and "sequence_identifiers"
              is replaced by the output of VDJdbImport.get_sequence_identifiers
            - gene info is updated and empty / illegal-character sequences are handled by the
              ImportHelper functions according to the corresponding params flags
            - VDJdbImport.extract_meta_columns produces the final dataframe

        Arguments:

            df: dataframe to preprocess; it is modified in place by several of the steps above
            params: DatasetImportParams object defining what to import and how to do it

        Returns:
            the dataframe returned by VDJdbImport.extract_meta_columns

        """
        df["frame_types"] = SequenceFrameType.IN.name
        ImportHelper.junction_to_cdr3(df, params.region_type)

        # rows with identifier "0" are the ones reported as "single chains" in the warning below
        is_single_chain = df["sequence_identifiers"] == "0"
        receptor_import = not params.is_repertoire and params.paired

        if not receptor_import:
            df.loc[is_single_chain, "sequence_identifiers"] = None
        else:
            n_single_chains = is_single_chain.sum()
            if n_single_chains > 0:
                df.drop(df[is_single_chain].index, inplace=True)
                warnings.warn(f"VDJdbImport: {n_single_chains} single chains were removed when trying to create a ReceptorDataset.\n"
                              f"To import all chains as a SequenceDataset, use paired = False")

        if "chains" not in df.columns:
            df.loc[:, "chains"] = ImportHelper.load_chains_from_genes(df)

        # keep the original identifier per receptor before deriving per-sequence identifiers
        original_identifiers = df["sequence_identifiers"]
        df["receptor_identifiers"] = original_identifiers
        df["sequence_identifiers"] = VDJdbImport.get_sequence_identifiers(df["sequence_identifiers"], df["chains"])

        ImportHelper.update_gene_info(df)
        ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
        ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)

        return VDJdbImport.extract_meta_columns(df, params)
Esempio n. 2
0
    def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
        """
        Run the common ImportHelper preprocessing steps on the dataframe, in this order:
            - junction_to_cdr3 with the requested region type
            - drop_empty_sequences with the import_empty_aa_sequences / import_empty_nt_sequences flags
            - drop_illegal_character_sequences with the import_illegal_characters flag
            - update_gene_info

        Arguments:

            df: dataframe to preprocess; passed to each helper and returned afterwards
            params: DatasetImportParams object defining what to import and how to do it

        Returns:
            the same dataframe object that was passed in

        """
        region_type = params.region_type
        ImportHelper.junction_to_cdr3(df, region_type)

        import_empty_aa = params.import_empty_aa_sequences
        import_empty_nt = params.import_empty_nt_sequences
        ImportHelper.drop_empty_sequences(df, import_empty_aa, import_empty_nt)

        import_illegal = params.import_illegal_characters
        ImportHelper.drop_illegal_character_sequences(df, import_illegal)

        ImportHelper.update_gene_info(df)

        return df
Esempio n. 3
0
    def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
        """
        Function for preprocessing data from a dataframe containing AIRR data, such that:
            - a "frame_types" column is derived from the "productive", "vj_in_frame" and "stop_codon"
              columns (when present) and, if "productive" is present, rows are filtered to the frame
              types returned by ImportHelper.prepare_frame_type_list
            - if the region type is IMGT_CDR3, either the "cdr3" / "cdr3_aa" columns are renamed to
              "sequences" / "sequence_aas", or ImportHelper.junction_to_cdr3 is applied when a junction
              column was mapped (presumably trimming the junction to the CDR3 definition - confirm
              against ImportHelper)
            - if no chain column was specified, the chains are taken from ImportHelper.load_chains_from_genes
            - gene info is updated through ImportHelper.update_gene_info
            - empty and illegal-character sequences are handled by ImportHelper according to params

        Arguments:

            df: dataframe with AIRR data; the "frame_types" column is added to it in place
            params: DatasetImportParams object defining what to import and how to do it

        Returns:
            the preprocessed dataframe (a new object whenever rows were filtered)

        """
        has_productive = "productive" in df.columns

        if has_productive:
            # everything is out-of-frame unless flagged productive
            df["frame_types"] = SequenceFrameType.OUT.name
            # element-wise comparison on purpose: the column is compared value by value
            df.loc[df["productive"] == True,
                   "frame_types"] = SequenceFrameType.IN.name
        else:
            df["frame_types"] = None

        if "vj_in_frame" in df.columns:
            df.loc[df["vj_in_frame"] == True,
                   "frame_types"] = SequenceFrameType.IN.name
        # applied last, so a stop codon overrides an in-frame label
        if "stop_codon" in df.columns:
            df.loc[df["stop_codon"] == True,
                   "frame_types"] = SequenceFrameType.STOP.name

        if has_productive:
            frame_type_list = ImportHelper.prepare_frame_type_list(params)
            # explicit copy: the frame is renamed and assigned to below, which would otherwise
            # raise SettingWithCopyWarning on the filtered result
            df = df[df["frame_types"].isin(frame_type_list)].copy()

        if params.region_type == RegionType.IMGT_CDR3:
            if "sequence_aas" not in df.columns and "sequences" not in df.columns:
                if "cdr3" in df.columns:
                    df.rename(columns={"cdr3": "sequences"}, inplace=True)
                if "cdr3_aa" in df.columns:
                    df.rename(columns={"cdr3_aa": "sequence_aas"},
                              inplace=True)
                df.loc[:, "region_types"] = params.region_type.name
            elif "junction" in params.column_mapping or "junction_aa" in params.column_mapping:
                ImportHelper.junction_to_cdr3(df, params.region_type)
        # todo else: support "full_sequence" import through regiontype?

        if "chains" not in df.columns:
            df.loc[:, "chains"] = ImportHelper.load_chains_from_genes(df)

        df = ImportHelper.update_gene_info(df)
        ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences,
                                          params.import_empty_nt_sequences)
        ImportHelper.drop_illegal_character_sequences(
            df, params.import_illegal_characters)

        return df
Esempio n. 4
0
    def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
        """
        Prepare a dataframe of OLGA sequences:
            - at least one of the "sequences" / "sequence_aas" columns has to be present
            - a "counts" column filled with 1 is added when missing
            - "sequence_identifiers" is set to None for all rows
            - ImportHelper.junction_to_cdr3 is applied and empty / illegal-character sequences are
              handled by ImportHelper according to params
            - a "chains" column is filled from ImportHelper.load_chains_from_genes if it is missing

        Arguments:

            df: dataframe to preprocess; columns are added to it in place
            params: DatasetImportParams object defining what to import and how to do it

        Returns:
            the same dataframe object that was passed in

        Raises:
            IOError: if neither "sequences" nor "sequence_aas" is among the columns

        """
        has_nt_column = "sequences" in df.columns
        has_aa_column = "sequence_aas" in df.columns
        if not (has_nt_column or has_aa_column):
            raise IOError("OLGAImport: Columns should contain at least 'sequences' or 'sequence_aas'.")

        if "counts" not in df.columns:
            df["counts"] = 1

        df["sequence_identifiers"] = None

        ImportHelper.junction_to_cdr3(df, params.region_type)

        import_empty_aa = params.import_empty_aa_sequences
        import_empty_nt = params.import_empty_nt_sequences
        ImportHelper.drop_empty_sequences(df, import_empty_aa, import_empty_nt)
        ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)

        if "chains" not in df.columns:
            df.loc[:, "chains"] = ImportHelper.load_chains_from_genes(df)

        return df
Esempio n. 5
0
    def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
        """
        Prepare a dataframe of IGoR sequences:
            - a "counts" column filled with 1 is added to the input when missing
            - only rows with anchors_found == "1" are kept
            - rows with is_inframe != "1" are removed unless params.import_out_of_frame is set
            - "sequence_aas" is computed from "sequences" with IGoRImport.translate_sequence
            - sequences containing "*" are removed unless params.import_with_stop_codon is set
            - ImportHelper.junction_to_cdr3 is applied and empty / illegal-character sequences are
              handled by ImportHelper

        Arguments:

            df: dataframe to preprocess; only the "counts" column is added to it in place
            params: DatasetImportParams object defining what to import and how to do it

        Returns:
            a new dataframe holding the filtered and preprocessed rows

        """
        if "counts" not in df.columns:
            df["counts"] = 1

        df = df[df["anchors_found"] == "1"]

        if not params.import_out_of_frame:
            df = df[df["is_inframe"] == "1"]

        # explicit copy: a column is assigned right below, which would otherwise raise
        # SettingWithCopyWarning on the filtered result
        df = df.copy()
        df["sequence_aas"] = df["sequences"].apply(IGoRImport.translate_sequence)

        if not params.import_with_stop_codon:
            no_stop_codon = ["*" not in seq for seq in df["sequence_aas"]]
            # copy again so the helpers below work on an independent frame
            df = df[no_stop_codon].copy()

        ImportHelper.junction_to_cdr3(df, params.region_type)
        # note: import_empty_aa_sequences is set to true here; since IGoR doesn't output aa, this parameter does not apply
        ImportHelper.drop_empty_sequences(df, True, params.import_empty_nt_sequences)
        ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)

        # chain or at least receptorsequence?

        return df
Esempio n. 6
0
    def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
        """
        Function for preprocessing the data loaded from one MiXCR file, such that:
            - for the given region type, the amino acid and nucleotide sequence columns named in
              MiXCRImport.SEQUENCE_NAME_MAP are copied to "sequence_aas" and "sequences"
            - ImportHelper.junction_to_cdr3 is applied for the given region type
            - "counts" is converted to float and then to int
            - "v_genes" and "j_genes" are replaced by the output of MiXCRImport._load_genes
            - "chains" is filled from ImportHelper.load_chains_from_genes
            - gene info is updated and empty / illegal-character sequences are handled by
              ImportHelper according to params

        Arguments:

            df: dataframe to preprocess; columns are added and overwritten in place
            params: DatasetImportParams object defining what to import and how to do it

        Returns:
            the same dataframe object that was passed in

        """
        region_columns = MiXCRImport.SEQUENCE_NAME_MAP[params.region_type]
        aa_column = region_columns["AA"]
        df["sequence_aas"] = df[aa_column]
        nt_column = region_columns["NT"]
        df["sequences"] = df[nt_column]
        ImportHelper.junction_to_cdr3(df, params.region_type)

        # going through float first, then truncating to int
        counts_as_float = df["counts"].astype(float)
        df["counts"] = counts_as_float.astype(int)

        for gene_column in ("v_genes", "j_genes"):
            df[gene_column] = MiXCRImport._load_genes(df, gene_column)
        df["chains"] = ImportHelper.load_chains_from_genes(df)

        ImportHelper.update_gene_info(df)
        ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
        ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)

        return df
Esempio n. 7
0
    def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
        """
        Prepare a dataframe whose "productive" column holds the strings "True" / "False":
            - "frame_types" is set to the in-frame type for rows with productive == "True" and to
              None for all other rows
            - rows are kept only if their "productive" value is allowed by params.import_productive
              ("True") and params.import_unproductive ("False")
            - ImportHelper.junction_to_cdr3 is applied, empty / illegal-character sequences are
              handled by ImportHelper according to params, and gene info is updated
            - a "chains" column is filled from ImportHelper.load_chains_from_genes if it is missing

        Arguments:

            df: dataframe to preprocess; only the "frame_types" column is added to it in place
            params: DatasetImportParams object defining what to import and how to do it

        Returns:
            a new dataframe holding the filtered and preprocessed rows

        """
        df["frame_types"] = None
        df.loc[df["productive"].eq("True"),
               "frame_types"] = SequenceFrameType.IN.name

        allowed_productive_values = []
        if params.import_productive:
            allowed_productive_values.append("True")
        if params.import_unproductive:
            allowed_productive_values.append("False")

        # explicit copy: the helpers and the "chains" assignment below write to the frame, which
        # would otherwise raise SettingWithCopyWarning on the filtered result
        df = df[df["productive"].isin(allowed_productive_values)].copy()

        ImportHelper.junction_to_cdr3(df, params.region_type)
        ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences,
                                          params.import_empty_nt_sequences)
        ImportHelper.drop_illegal_character_sequences(
            df, params.import_illegal_characters)
        ImportHelper.update_gene_info(df)

        if "chains" not in df.columns:
            df.loc[:, "chains"] = ImportHelper.load_chains_from_genes(df)

        return df