Exemple #1
0
    def _repertoire_to_dataframe(repertoire: Repertoire, region_type):
        """Load a repertoire into a DataFrame whose columns use AIRR names.

        Every field returned by ``repertoire.load_data()`` is kept, custom
        fields included. The four gene/allele columns are created (filled
        with ``None``) when absent, then handed to
        ``AIRRExporter.update_gene_columns``; presumably that call reconciles
        the allele and gene columns in place -- confirm against its
        definition. Finally the mandatory columns are renamed; the names of
        the two sequence columns are chosen from ``region_type``.
        """
        frame = pd.DataFrame(repertoire.load_data())

        # Make sure all gene/allele columns exist before they are updated.
        gene_columns = ('v_alleles', 'j_alleles', 'v_genes', 'j_genes')
        absent = [name for name in gene_columns if name not in frame.columns]
        for name in absent:
            frame.loc[:, name] = None

        AIRRExporter.update_gene_columns(frame, 'alleles', 'genes')

        # Internal column name -> AIRR-compliant column name.
        airr_names = {
            "sequence_identifiers": "sequence_id",
            "v_alleles": "v_call",
            "j_alleles": "j_call",
            "chains": "locus",
            "counts": "duplicate_count",
            "sequences": AIRRExporter.get_sequence_field(region_type),
            "sequence_aas": AIRRExporter.get_sequence_aa_field(region_type),
        }

        return frame.rename(columns=airr_names)
Exemple #2
0
    def process_repertoire(repertoire: Repertoire, params: dict) -> Repertoire:
        """Build a new repertoire from the grouped and aggregated data.

        The repertoire data is loaded into a DataFrame, grouped by the fields
        returned by ``_prepare_group_by_field`` and aggregated with the dict
        returned by ``_prepare_agg_dict``. The result is passed column by
        column to ``Repertoire.build``, together with a deep copy of the
        original metadata and ``params["result_path"]`` as the path.

        Columns that are missing after aggregation are passed as ``None``,
        except ``sequence_identifiers``, which is always read, and the custom
        columns (those not listed in ``Repertoire.FIELDS``).
        """
        frame = pd.DataFrame(repertoire.load_data())

        # Grouping and aggregation settings are derived from the columns as
        # loaded, i.e. before the "chains" column is rewritten or added below.
        group_keys = DuplicateSequenceFilter._prepare_group_by_field(
            params, frame.columns)
        custom_lists = list(set(frame.columns) - set(Repertoire.FIELDS))
        aggregations = DuplicateSequenceFilter._prepare_agg_dict(
            params, frame.columns, custom_lists)

        if "chains" not in frame.columns:
            frame["chains"] = None
        else:
            # Chain objects cannot be aggregated, so use their values instead.
            frame["chains"] = [
                entry.value if isinstance(entry, Chain) else entry
                for entry in frame["chains"]
            ]

        deduplicated = frame.groupby(group_keys).agg(aggregations).reset_index()
        available = deduplicated.columns

        def column_or_none(name):
            # Values of an aggregated column as a list, or None when absent.
            return list(deduplicated[name]) if name in available else None

        # Arguments are assembled in the order Repertoire.build receives them.
        build_kwargs = {
            field: column_or_none(field)
            for field in ("sequence_aas", "sequences", "v_genes", "j_genes")
        }

        # Turn the aggregated chain values back into Chain objects.
        chain_keys = column_or_none("chains")
        if chain_keys is None:
            build_kwargs["chains"] = None
        else:
            build_kwargs["chains"] = [Chain(key) for key in chain_keys]

        build_kwargs["counts"] = column_or_none("counts")
        build_kwargs["region_types"] = column_or_none("region_types")
        build_kwargs["custom_lists"] = {
            name: list(deduplicated[name]) for name in custom_lists
        }
        build_kwargs["sequence_identifiers"] = list(
            deduplicated["sequence_identifiers"])
        build_kwargs["metadata"] = copy.deepcopy(repertoire.metadata)
        build_kwargs["path"] = params["result_path"]

        return Repertoire.build(**build_kwargs)