Ejemplo n.º 1
0
    def process_repertoire(repertoire: Repertoire, params: dict) -> Repertoire:
        """Build a new repertoire in which rows sharing the same group-by key are merged.

        The repertoire data is loaded into a DataFrame, grouped by the fields
        returned by ``_prepare_group_by_field`` and aggregated column-wise with
        the mapping returned by ``_prepare_agg_dict``. The aggregated rows are
        then handed to ``Repertoire.build``.

        Args:
            repertoire: source repertoire; its data, metadata and data filename
                are read here.
            params: filter parameters. ``params["result_path"]`` is used as the
                output path; the dict is also forwarded to the two
                ``_prepare_*`` helpers.

        Returns:
            The repertoire created by ``Repertoire.build``, carrying a deep copy
            of the input metadata and the filename base
            ``<original data filename stem>_filtered``.
        """
        table = pd.DataFrame(repertoire.load_data())

        group_keys = DuplicateSequenceFilter._prepare_group_by_field(params, table.columns)
        # Every loaded column that is not one of the standard Repertoire fields.
        custom_fields = list(set(table.columns) - set(Repertoire.FIELDS))
        aggregations = DuplicateSequenceFilter._prepare_agg_dict(params, table.columns, custom_fields)

        # Chain objects can not be aggregated, so they are replaced by their
        # values; a missing chains column is created and filled with None.
        if "chains" not in table.columns:
            table["chains"] = None
        else:
            table["chains"] = [entry.value if isinstance(entry, Chain) else entry
                               for entry in table["chains"]]

        # NOTE(review): pandas groupby drops rows whose group key is missing
        # (NaN/None) by default - confirm the group-by fields never hold
        # missing values.
        merged = table.groupby(group_keys).agg(aggregations).reset_index()

        def column_values(field):
            # Fields absent from the aggregated table are passed on as None.
            if field not in merged.columns:
                return None
            return list(merged[field])

        sequence_aas = column_values("sequence_aas")
        sequences = column_values("sequences")
        v_genes = column_values("v_genes")
        j_genes = column_values("j_genes")

        # Turn the aggregated chain values back into Chain objects.
        chain_keys = column_values("chains")
        chains = None if chain_keys is None else [Chain(key) for key in chain_keys]

        counts = column_values("counts")
        region_types = column_values("region_types")
        custom_columns = {name: list(merged[name]) for name in custom_fields}

        # Unlike the optional fields above, this column is read unconditionally.
        identifiers = list(merged["sequence_identifiers"])

        return Repertoire.build(
            sequence_aas=sequence_aas,
            sequences=sequences,
            v_genes=v_genes,
            j_genes=j_genes,
            chains=chains,
            counts=counts,
            region_types=region_types,
            custom_lists=custom_columns,
            sequence_identifiers=identifiers,
            metadata=copy.deepcopy(repertoire.metadata),
            path=params["result_path"],
            filename_base=f"{repertoire.data_filename.stem}_filtered",
        )
Ejemplo n.º 2
0
    def _repertoire_to_dataframe(repertoire: Repertoire, region_type):
        """Load a repertoire into a DataFrame whose mandatory columns use AIRR names.

        Args:
            repertoire: repertoire whose ``load_data()`` result is converted.
            region_type: forwarded to ``AIRRExporter.get_sequence_field`` and
                ``AIRRExporter.get_sequence_aa_field`` to pick the target names
                of the ``sequences`` and ``sequence_aas`` columns.

        Returns:
            A DataFrame with the renamed columns; columns not listed in the
            rename mapping keep their original names.
        """
        # Load every field of the repertoire (custom fields included).
        frame = pd.DataFrame(repertoire.load_data())

        # Make sure all allele/gene columns exist; missing ones are created
        # filled with empty strings.
        for name in ('v_alleles', 'j_alleles', 'v_genes', 'j_genes'):
            if name in frame.columns:
                continue
            frame.loc[:, name] = ''

        # NOTE(review): the return value is ignored, so this presumably
        # updates the frame in place - confirm against
        # AIRRExporter.update_gene_columns.
        AIRRExporter.update_gene_columns(frame, 'alleles', 'genes')

        # Target names of the sequence columns depend on the region type.
        sequence_column = AIRRExporter.get_sequence_field(region_type)
        sequence_aa_column = AIRRExporter.get_sequence_aa_field(region_type)

        # Rename the mandatory fields for AIRR compliance.
        airr_names = {
            "sequence_identifiers": "sequence_id",
            "v_alleles": "v_call",
            "j_alleles": "j_call",
            "chains": "locus",
            "counts": "duplicate_count",
            "sequences": sequence_column,
            "sequence_aas": sequence_aa_column,
        }

        return frame.rename(columns=airr_names)