Example #1
0
def validate_airr(data):
    """Validate every row of an airr table against the rearrangement schema.

    The checks run on a cleaned-up copy of ``data``; ``data`` itself is not
    modified. In the copy, missing values in integer-like, string (object
    dtype) and the known boolean columns are replaced by empty strings.

    Parameters
    ----------
    data
        Table (``pandas.DataFrame``) holding one contig per row.

    Raises
    ------
    Whatever ``RearrangementSchema.validate_header`` or
    ``RearrangementSchema.validate_row`` raise for a non-conforming row
    (presumably ``airr.ValidationError`` -- confirm against the airr library).
    """
    tmp = data.copy()

    # Probe which columns can be represented as nullable integers. The cast
    # result is discarded, so a failure simply means "not integer-like".
    # `Exception` (rather than a bare `except`) keeps the probe best-effort
    # without swallowing KeyboardInterrupt / SystemExit.
    int_columns = []
    for col in tmp:
        try:
            tmp[col].replace(np.nan, pd.NA).astype("Int64")
        except Exception:
            continue
        int_columns.append(col)

    bool_columns = [
        'rev_comp', 'productive', 'vj_in_frame', 'stop_codon', 'complete_vdj'
    ]
    str_columns = list(tmp.dtypes[tmp.dtypes == 'object'].index)

    for col in set(int_columns + str_columns + bool_columns):
        if col in tmp:
            # Assign the result back instead of a chained `inplace=True`,
            # which is a silent no-op under pandas Copy-on-Write.
            tmp[col] = tmp[col].fillna('')

    required_fields = (
        'sequence', 'rev_comp', 'sequence_alignment',
        'germline_alignment', 'v_cigar', 'd_cigar', 'j_cigar'
    )
    for _, row in tmp.iterrows():
        contig = Contig(row).contig
        for required in required_fields:
            if required not in contig:
                contig.update({required: ''})
        # Validate inside the loop so that *every* row is checked, not just
        # the last one, and an empty table validates trivially.
        RearrangementSchema.validate_header(contig.keys())
        RearrangementSchema.validate_row(contig)
Example #2
0
def write_airr(adata: AnnData, filename: Union[str, Path]) -> None:
    """Export :term:`IR` data to :term:`AIRR` Rearrangement `tsv` format.

    Parameters
    ----------
    adata
        annotated data matrix
    filename
        destination filename

    Raises
    ------
    AssertionError
        If the cells obtained from `adata` do not all share the same fields.
    """
    airr_cells = to_airr_cells(adata)
    try:
        fields = airr_cells[0].fields
    except IndexError:
        # case of an empty output file
        fields = None
    else:
        for tmp_cell in airr_cells[1:]:
            # Explicit raise instead of `assert` so the check survives `-O`;
            # the exception type and message are kept for compatibility.
            if tmp_cell.fields != fields:
                raise AssertionError("All rows of adata have the same fields.")

    writer = airr.create_rearrangement(filename, fields=fields)
    try:
        for tmp_cell in airr_cells:
            for chain in tmp_cell.to_airr_records():
                # workaround for AIRR library writing out int field as floats
                # (if it happens to be a float)
                for f in chain:
                    if RearrangementSchema.type(f) == "integer":
                        chain[f] = int(chain[f])
                writer.write(chain)
    finally:
        # Always release the output file, even if a record fails to convert
        # or to be written.
        writer.close()
Example #3
0
def validate_airr(data):
    """Validate every row of an airr table against the rearrangement schema.

    Each row is converted to a plain dict before it is checked; ``data``
    itself is not modified. In that dict:

    * null values from ``Int64`` columns become empty strings,
    * null values from ``Float64`` columns become ``np.nan``,
    * the fields ``sequence``, ``rev_comp``, ``sequence_alignment``,
      ``germline_alignment``, ``v_cigar``, ``d_cigar`` and ``j_cigar`` are
      added as empty strings when absent.

    Parameters
    ----------
    data
        Table (``pandas.DataFrame``) holding one contig per row.

    Raises
    ------
    Whatever ``RearrangementSchema.validate_header`` or
    ``RearrangementSchema.validate_row`` raise for a non-conforming row
    (presumably ``airr.ValidationError`` -- confirm against the airr library).
    """
    # The dtype is a property of the column, so look it up once per column
    # instead of once per cell.
    int_columns = {c for c in data.columns if data[c].dtype == 'Int64'}
    float_columns = {c for c in data.columns if data[c].dtype == 'Float64'}
    required_fields = (
        'sequence', 'rev_comp', 'sequence_alignment', 'germline_alignment',
        'v_cigar', 'd_cigar', 'j_cigar'
    )

    for _, row in data.iterrows():
        contig = dict(row)
        for k, v in contig.items():
            # Only values of existing keys are replaced, so updating the dict
            # while iterating over it is safe.
            if not pd.isnull(v):
                continue
            if k in int_columns:
                contig.update({k: ''})
            elif k in float_columns:
                contig.update({k: np.nan})
        for required in required_fields:
            if required not in contig:
                contig.update({required: ''})
        # check if airr-standards is happy -- done inside the loop so that
        # every row is validated, not only the last one, and an empty table
        # validates trivially.
        RearrangementSchema.validate_header(contig.keys())
        RearrangementSchema.validate_row(contig)
Example #4
0
    def add_chain(self, chain: Mapping) -> None:
        """Register a single chain with this cell.

        `chain` is a dictionary following the
        `AIRR Rearrangement Schema <https://docs.airr-community.org/en/latest/datarep/rearrangements.html#productive>`__.
        A sanitized copy is stored; the mapping passed in is left untouched.

        Fields listed in `self._cell_attribute_fields` are removed from the
        chain and stored on the cell itself (via `self[field] = value`).

        Raises
        ------
        ValueError
            If the remaining fields differ from those of previously added chains.
        """
        # One pass: order fields by name (consistent ordering) and map
        # NA-like values to None.
        chain = {
            field: (None if _is_na2(value) else value)
            for field, value in sorted(chain.items())
        }

        # TODO this should be `.validate_obj` but currently does not work
        # because of https://github.com/airr-community/airr-standards/issues/508
        RearrangementSchema.validate_header(chain.keys())
        RearrangementSchema.validate_row(chain)

        for attribute in self._cell_attribute_fields:
            try:
                self[attribute] = chain.pop(attribute)
            except KeyError:
                # A field declared as cell attribute may be absent from the chain.
                continue

        current_fields = list(chain)
        if self._chain_fields is None:
            # The first chain defines the expected set of fields.
            self._chain_fields = current_fields
        elif self._chain_fields != current_fields:
            raise ValueError("All chains must have the same fields!")

        if "locus" in chain:
            if chain["locus"] not in self.VALID_LOCI:
                # TODO seems this isn't actually ignored. Chain will just be moved to `extra chains`.
                self._logger.warning(
                    f"Non-standard locus name ignored: {chain['locus']} "
                )  # type: ignore
        else:
            self._logger.warning(
                "`locus` field not specified, but required for most scirpy functionality. "
            )  # type: ignore

        self.chains.append(chain)
Example #5
0
def read_airr(
    path: Union[str, Sequence[str], Path, Sequence[Path]],
    use_umi_count_col: Union[bool, Literal["auto"]] = "auto",
    infer_locus: bool = True,
    cell_attributes: Collection[str] = DEFAULT_AIRR_CELL_ATTRIBUTES,
    include_fields: Optional[Collection[str]] = DEFAULT_AIRR_FIELDS,
) -> AnnData:
    """\
    Read data from `AIRR rearrangement <https://docs.airr-community.org/en/latest/datarep/rearrangements.html>`_ format.

    The following columns are required by scirpy:
     * `cell_id`
     * `productive`
     * `locus`
     * at least one of `consensus_count`, `duplicate_count`, or `umi_count`
     * at least one of `junction_aa` or `junction`.

    Data should still import if one of these fields is missing, but they are required
    by most of scirpy's processing functions. All chains for which the field
    `junction_aa` is missing or empty, will be considered as non-productive and
    will be moved to the `extra_chains` column.

    {doc_working_model}

    Parameters
    ----------
    path
        Path to the AIRR rearrangement tsv file. If different
        chains are split up into multiple files, these can be specified
        as a List, e.g. `["path/to/tcr_alpha.tsv", "path/to/tcr_beta.tsv"]`.
        A `pandas.DataFrame` with one record per row is accepted in place of a path.
    use_umi_count_col
        Whether to add UMI counts from the non-standard (but common) `umi_count`
        column. When this column is used, the UMI counts are moved over to the
        standard `duplicate_count` column. Default: Use `umi_count` if there is
        no `duplicate_count` column present.
    infer_locus
        Try to infer the `locus` column from gene names, in case it is not specified.
    cell_attributes
        Fields in the rearrangement schema that are specific for a cell rather
        than a chain. The values must be identical over all records belonging to a
        cell. This defaults to {cell_attributes}.
    include_fields
        The fields to include in `adata`. The AIRR rearrangement schema
        can contain a lot of columns, most of which are irrelevant for most analyses.
        Per default, this includes a subset of columns relevant for a typical
        scirpy analysis, to keep `adata.obs` a bit cleaner. Defaults to {include_fields}.
        Set this to `None` to include all columns.

    Returns
    -------
    AnnData object with IR data in `obs` for each cell. For more details see
    :ref:`data-structure`.
    """
    logger = _IOLogger()
    # cell_id -> AirrCell, in order of first appearance
    airr_cells = {}

    # Normalize a single input to a one-element list.
    if isinstance(path, (str, Path, pd.DataFrame)):
        path = [path]

    def _decide_use_umi_count_col(chain_dict):
        """Tell whether the counts of this record are taken from `umi_count`."""
        rename_automatically = (
            "umi_count" in chain_dict
            and use_umi_count_col == "auto"
            and "duplicate_count" not in chain_dict
        )
        if rename_automatically:
            logger.warning(
                "Renaming the non-standard `umi_count` column to `duplicate_count`. "
            )  # type: ignore
            return True
        # Otherwise only an explicit `True` enables the column.
        return use_umi_count_col is True

    for source in path:
        records = (
            source.to_dict(orient="records")
            if isinstance(source, pd.DataFrame)
            else airr.read_rearrangement(str(source))
        )

        for chain_dict in records:
            cell_id = chain_dict.pop("cell_id")

            # Create the cell the first time its id is encountered.
            if cell_id not in airr_cells:
                airr_cells[cell_id] = AirrCell(
                    cell_id=cell_id,
                    logger=logger,
                    cell_attribute_fields=cell_attributes,
                )
            current_cell = airr_cells[cell_id]

            if _decide_use_umi_count_col(chain_dict):
                umi_count = chain_dict.pop("umi_count")
                chain_dict["duplicate_count"] = RearrangementSchema.to_int(umi_count)

            if infer_locus and "locus" not in chain_dict:
                logger.warning(
                    "`locus` column not found in input data. The locus is being inferred from the {v,d,j,c}_call columns."
                )
                chain_dict["locus"] = _infer_locus_from_gene_names(chain_dict)

            current_cell.add_chain(chain_dict)

    return from_airr_cells(airr_cells.values(), include_fields=include_fields)