def validate_airr(data):
    """Validate dtypes in an AIRR rearrangement table.

    Coerces integer-convertible, boolean and string columns so that each
    row passes AIRR ``RearrangementSchema`` validation. Propagates
    whatever ``validate_header`` / ``validate_row`` raise on an invalid
    record.

    Parameters
    ----------
    data
        pandas DataFrame with one AIRR rearrangement record per row.
        The input is not modified; validation runs on a copy.
    """
    tmp = data.copy()
    # Detect columns whose values can be represented as nullable integers.
    int_columns = []
    for col in tmp:
        try:
            tmp[col].replace(np.nan, pd.NA).astype("Int64")
            int_columns.append(col)
        except (TypeError, ValueError):
            # astype raises TypeError/ValueError for non-integer columns;
            # those simply stay out of int_columns. (A bare `except` here
            # would also swallow KeyboardInterrupt and real bugs.)
            pass
    bool_columns = [
        'rev_comp', 'productive', 'vj_in_frame', 'stop_codon', 'complete_vdj'
    ]
    str_columns = list(tmp.dtypes[tmp.dtypes == 'object'].index)
    columns = [
        c for c in set(int_columns + str_columns + bool_columns) if c in tmp
    ]
    # Replace missing values with empty strings so the schema validator
    # does not choke on NaN/NA. Plain assignment instead of
    # `inplace=True` on a column selection, which is chained assignment
    # and unreliable under pandas copy-on-write.
    for c in columns:
        tmp[c] = tmp[c].fillna('')
    for _, row in tmp.iterrows():
        contig = Contig(row).contig
        # Fields required by the AIRR Rearrangement schema; fill any
        # missing ones with empty strings before validating.
        for required in [
            'sequence', 'rev_comp', 'sequence_alignment',
            'germline_alignment', 'v_cigar', 'd_cigar', 'j_cigar'
        ]:
            if required not in contig:
                contig.update({required: ''})
        RearrangementSchema.validate_header(contig.keys())
        RearrangementSchema.validate_row(contig)
def write_airr(adata: AnnData, filename: Union[str, Path]) -> None:
    """Export :term:`IR` data to :term:`AIRR` Rearrangement `tsv` format.

    Parameters
    ----------
    adata
        annotated data matrix
    filename
        destination filename
    """
    airr_cells = to_airr_cells(adata)
    try:
        fields = airr_cells[0].fields
        for tmp_cell in airr_cells[1:]:
            assert tmp_cell.fields == fields, "All rows of adata have the same fields."
    except IndexError:
        # case of an empty output file
        fields = None
    writer = airr.create_rearrangement(filename, fields=fields)
    for tmp_cell in airr_cells:
        for chain in tmp_cell.to_airr_records():
            # workaround for the AIRR library writing out int fields as
            # floats (if the value happens to be a float). Skip None
            # values: `int(None)` would raise TypeError for records that
            # legitimately lack an integer-typed field.
            for f in chain:
                if RearrangementSchema.type(f) == "integer" and chain[f] is not None:
                    chain[f] = int(chain[f])
            writer.write(chain)
    writer.close()
def validate_airr(data):
    """Validate dtypes in airr table."""
    required_fields = (
        'sequence', 'rev_comp', 'sequence_alignment', 'germline_alignment',
        'v_cigar', 'd_cigar', 'j_cigar',
    )
    # Column dtypes are invariant across rows; look them up once.
    dtypes = data.dtypes
    for _, record in data.iterrows():
        contig = dict(record)
        # Nullable-integer NAs become empty strings, nullable-float NAs
        # become NaN, so the schema validator accepts the row.
        for field, value in contig.items():
            if pd.isnull(value):
                if dtypes[field] == 'Int64':
                    contig[field] = ''
                elif dtypes[field] == 'Float64':
                    contig[field] = np.nan
        # Fill in any schema-required fields that are absent.
        for field in required_fields:
            contig.setdefault(field, '')
        # check if airr-standards is happy
        RearrangementSchema.validate_header(contig.keys())
        RearrangementSchema.validate_row(contig)
def add_chain(self, chain: Mapping) -> None:
    """Add a chain to the cell.

    A chain is a dictionary following the `AIRR Rearrangement Schema
    <https://docs.airr-community.org/en/latest/datarep/rearrangements.html#productive>`__.
    """
    # Ensure consistent key ordering and sanitize NA values in one pass.
    sanitized = {
        key: (None if _is_na2(value) else value)
        for key, value in sorted(chain.items())
    }
    # TODO this should be `.validate_obj` but currently does not work
    # because of https://github.com/airr-community/airr-standards/issues/508
    RearrangementSchema.validate_header(sanitized.keys())
    RearrangementSchema.validate_row(sanitized)
    # Move cell-level attributes off the chain and onto the cell itself.
    # It is ok if a field specified as cell attribute is not present.
    for attr in self._cell_attribute_fields:
        if attr in sanitized:
            self[attr] = sanitized.pop(attr)
    remaining_fields = list(sanitized.keys())
    if self._chain_fields is None:
        self._chain_fields = remaining_fields
    elif self._chain_fields != remaining_fields:
        raise ValueError("All chains must have the same fields!")
    if "locus" not in sanitized:
        self._logger.warning(
            "`locus` field not specified, but required for most scirpy functionality. "
        )  # type: ignore
    elif sanitized["locus"] not in self.VALID_LOCI:
        # TODO seems this isn't actually ignored. Chain will just be moved to `extra chains`.
        self._logger.warning(
            f"Non-standard locus name ignored: {sanitized['locus']} "
        )  # type: ignore
    self.chains.append(sanitized)
def read_airr(
    path: Union[str, Sequence[str], Path, Sequence[Path]],
    use_umi_count_col: Union[bool, Literal["auto"]] = "auto",
    infer_locus: bool = True,
    cell_attributes: Collection[str] = DEFAULT_AIRR_CELL_ATTRIBUTES,
    include_fields: Optional[Collection[str]] = DEFAULT_AIRR_FIELDS,
) -> AnnData:
    """\
    Read data from `AIRR rearrangement <https://docs.airr-community.org/en/latest/datarep/rearrangements.html>`_ format.

    The following columns are required by scirpy:
     * `cell_id`
     * `productive`
     * `locus`
     * at least one of `consensus_count`, `duplicate_count`, or `umi_count`
     * at least one of `junction_aa` or `junction`.

    Data should still import if one of these fields is missing, but they are required
    by most of scirpy's processing functions. All chains for which the field
    `junction_aa` is missing or empty, will be considered as non-productive and
    will be moved to the `extra_chains` column.

    {doc_working_model}

    Parameters
    ----------
    path
        Path to the AIRR rearrangement tsv file. If different chains are split up into
        multiple files, these can be specified as a List, e.g.
        `["path/to/tcr_alpha.tsv", "path/to/tcr_beta.tsv"]`.
    use_umi_count_col
        Whether to add UMI counts from the non-standard (but common) `umi_count`
        column. When this column is used, the UMI counts are moved over to the
        standard `duplicate_count` column. Default: Use `umi_count` if there is
        no `duplicate_count` column present.
    infer_locus
        Try to infer the `locus` column from gene names, in case it is not specified.
    cell_attributes
        Fields in the rearrangement schema that are specific for a cell rather
        than a chain. The values must be identical over all records belonging to a
        cell. This defaults to {cell_attributes}.
    include_fields
        The fields to include in `adata`. The AIRR rearrangement schema can
        contain a lot of columns, most of which are irrelevant for most analyses.
        Per default, this includes a subset of columns relevant for a typical
        scirpy analysis, to keep `adata.obs` a bit cleaner.
        Defaults to {include_fields}. Set this to `None` to include all columns.

    Returns
    -------
    AnnData object with IR data in `obs` for each cell. For more details see
    :ref:`data-structure`.
    """
    airr_cells = {}
    logger = _IOLogger()

    # Accept a single path/DataFrame as well as a sequence thereof.
    if isinstance(path, (str, Path, pd.DataFrame)):
        path: list = [path]

    def _decide_use_umi_count_col(chain_dict):
        """Logic to decide whether or not to use counts from the `umi_count` column."""
        if (
            "umi_count" in chain_dict
            and use_umi_count_col == "auto"
            and "duplicate_count" not in chain_dict
        ):
            logger.warning(
                "Renaming the non-standard `umi_count` column to `duplicate_count`. "
            )  # type: ignore
            return True
        elif use_umi_count_col is True:
            # Only use the column if it actually exists; otherwise the
            # `chain_dict.pop("umi_count")` below would raise a KeyError.
            return "umi_count" in chain_dict
        else:
            return False

    for tmp_path in path:
        if isinstance(tmp_path, pd.DataFrame):
            iterator = tmp_path.to_dict(orient="records")
        else:
            iterator = airr.read_rearrangement(str(tmp_path))

        for chain_dict in iterator:
            cell_id = chain_dict.pop("cell_id")
            # Get-or-create the AirrCell for this cell_id.
            try:
                tmp_cell = airr_cells[cell_id]
            except KeyError:
                tmp_cell = AirrCell(
                    cell_id=cell_id,
                    logger=logger,
                    cell_attribute_fields=cell_attributes,
                )
                airr_cells[cell_id] = tmp_cell

            if _decide_use_umi_count_col(chain_dict):
                chain_dict["duplicate_count"] = RearrangementSchema.to_int(
                    chain_dict.pop("umi_count")
                )

            if infer_locus and "locus" not in chain_dict:
                logger.warning(
                    "`locus` column not found in input data. The locus is being inferred from the {v,d,j,c}_call columns."
                )
                chain_dict["locus"] = _infer_locus_from_gene_names(chain_dict)

            tmp_cell.add_chain(chain_dict)

    return from_airr_cells(airr_cells.values(), include_fields=include_fields)