def update_gene_columns(df, allele_name, gene_name):
    """Fill missing allele columns of ``df`` in place from the matching gene columns.

    For each of the ``v`` and ``j`` prefixes, every row whose
    ``"{prefix}_{allele_name}"`` cell is reported missing by
    ``NumpyHelper.is_nan_or_empty`` while its ``"{prefix}_{gene_name}"`` cell
    is not, gets the gene value copied into the allele cell.

    Args:
        df: DataFrame holding the ``v_``/``j_`` prefixed columns; modified in place.
        allele_name: column-name suffix of the allele columns.
        gene_name: column-name suffix of the gene columns.

    Returns:
        None. The update is applied directly to ``df``.
    """
    for index, row in df.iterrows():
        for gene in ("v", "j"):
            allele_column = f"{gene}_{allele_name}"
            gene_column = f"{gene}_{gene_name}"

            allele_missing = NumpyHelper.is_nan_or_empty(row[allele_column])
            if allele_missing and not NumpyHelper.is_nan_or_empty(row[gene_column]):
                # A single .loc assignment writes into df itself. The previous
                # chained form df[col][index] = ... may write to a temporary
                # copy and is a no-op under pandas Copy-on-Write.
                df.loc[index, allele_column] = row[gene_column]
def get_record(self):
    """Export the sequence object as a numpy record.

    Returns:
        A list with one entry per key of ``ReceptorSequence.FIELDS``, in that
        key order. When this object has the attribute, the entry is its value
        passed through ``NumpyHelper.get_numpy_representation``; otherwise it
        is the attribute of the same name read from ``ReceptorSequence``.
    """
    record = []
    for name in ReceptorSequence.FIELDS.keys():
        if hasattr(self, name):
            entry = NumpyHelper.get_numpy_representation(getattr(self, name))
        else:
            # Fall back to the class-level attribute, taken as-is.
            entry = getattr(ReceptorSequence, name)
        record.append(entry)
    return record
def get_counts(self):
    """Return the ``"counts"`` attribute as a numpy array of ints.

    Returns:
        ``None`` when ``get_attribute("counts")`` returns ``None``. Otherwise
        a numpy array in which every entry reported missing by
        ``NumpyHelper.is_nan_or_empty`` is ``None`` and every other entry is
        converted with ``int``.
    """
    counts = self.get_attribute("counts")
    if counts is None:
        return counts

    converted = []
    for count in counts:
        if NumpyHelper.is_nan_or_empty(count):
            converted.append(None)
        else:
            converted.append(int(count))
    return np.array(converted)
def _prepare_cell_lists(self):
    """Group the loaded repertoire data by the ``cell_ids`` field.

    Returns:
        The result of ``NumpyHelper.group_structured_array_by(data, "cell_ids")``
        applied to the array returned by ``self.load_data()``.

    Raises:
        AssertionError: if the loaded data has no ``cell_ids`` field.
    """
    data = self.load_data()

    has_cell_ids = "cell_ids" in data.dtype.names and data["cell_ids"] is not None
    assert has_cell_ids, (
        f"Repertoire: cannot return receptor objects in repertoire {self.identifier} since cell_ids are not specified. "
        f"Existing fields are: {str(data.dtype.names)[1:-1]}"
    )

    return NumpyHelper.group_structured_array_by(data, "cell_ids")
def process_custom_lists(custom_lists):
    """Convert a mapping of custom columns into parallel field/value/dtype lists.

    Args:
        custom_lists: mapping from field name to a sequence of values, or a
            falsy value (``None`` / empty mapping) when there are none.

    Returns:
        A tuple ``(field_list, values, dtype)``:
        ``field_list`` holds the field names, ``values`` holds one list per
        field with each element passed through
        ``NumpyHelper.get_numpy_representation``, and ``dtype`` holds
        ``(field, numpy dtype)`` pairs inferred from those converted lists.
        All three are empty lists when ``custom_lists`` is falsy.
    """
    if not custom_lists:
        return [], [], []

    field_list = list(custom_lists.keys())
    values = []
    dtype = []
    for field in field_list:
        column = [NumpyHelper.get_numpy_representation(el) for el in custom_lists[field]]
        values.append(column)
        # Let numpy infer the storage type from the converted column.
        dtype.append((field, np.array(column).dtype))
    return field_list, values, dtype
def _make_sequence_object(self, row, load_implants: bool = False):
    """Build a ``ReceptorSequence`` from one row of the repertoire array.

    Args:
        row: a record whose ``dtype.names`` lists the available fields.
        load_implants: when true, every field of ``row`` that is not in
            ``Repertoire.FIELDS`` is parsed with ``ast.literal_eval`` and, if
            that succeeds, turned into an ``ImplantAnnotation``.

    Returns:
        A ``ReceptorSequence``. Fields absent from ``row`` are passed as
        ``None``, except ``frame_type`` which defaults to ``"IN"``.
    """
    fields = row.dtype.names

    def field_or(name, default=None):
        # Value stored under ``name`` in the row, or ``default`` if absent.
        return row[name] if name in fields else default

    implants = []
    if load_implants:
        for key in row.dtype.names:
            if key in Repertoire.FIELDS:
                continue
            value_dict = row[key]
            if not value_dict:
                continue
            try:
                implants.append(ImplantAnnotation(**ast.literal_eval(value_dict)))
            except (SyntaxError, ValueError, TypeError):
                # Best effort: a field that cannot be parsed into an implant
                # annotation is silently skipped.
                pass

    amino_acid_sequence = field_or("sequence_aas")
    nucleotide_sequence = field_or("sequences")
    sequence_identifier = field_or("sequence_identifiers")

    # A count that is present but reported missing is treated as absent.
    if "counts" in fields and not NumpyHelper.is_nan_or_empty(row['counts']):
        count = row["counts"]
    else:
        count = None

    metadata = SequenceMetadata(
        v_gene=field_or("v_genes"),
        j_gene=field_or("j_genes"),
        v_subgroup=field_or("v_subgroups"),
        j_subgroup=field_or("j_subgroups"),
        v_allele=field_or("v_alleles"),
        j_allele=field_or("j_alleles"),
        chain=field_or("chains"),
        count=count,
        region_type=field_or("region_types"),
        frame_type=field_or("frame_types", "IN"),
        cell_id=field_or("cell_ids"),
        custom_params={
            key: field_or(key)
            for key in set(self.fields) - set(Repertoire.FIELDS)
        },
    )

    return ReceptorSequence(
        amino_acid_sequence=amino_acid_sequence,
        nucleotide_sequence=nucleotide_sequence,
        identifier=sequence_identifier,
        metadata=metadata,
        annotation=SequenceAnnotation(implants=implants),
    )
def build(cls, sequence_aas: list = None, sequences: list = None, v_genes: list = None, j_genes: list = None,
          v_subgroups: list = None, j_subgroups: list = None, v_alleles: list = None, j_alleles: list = None,
          chains: list = None, counts: list = None, region_types: list = None, frame_types: list = None,
          custom_lists: dict = None, sequence_identifiers: list = None, path: Path = None, metadata: dict = None,
          signals: dict = None, cell_ids: List[str] = None, filename_base: str = None):
    """Create a repertoire on disk from per-sequence lists and return it.

    The per-sequence lists are combined into one numpy structured array saved
    as ``<filename_base>.npy`` under ``path``; the metadata (extended with the
    resulting ``field_list``) is written to ``<filename_base>_metadata.yaml``.

    Args:
        sequence_aas, sequences, v_genes, j_genes, v_subgroups, j_subgroups,
        v_alleles, j_alleles, chains, counts, region_types, frame_types,
        cell_ids: optional per-sequence lists; a list that is ``None`` or
            contains only ``None`` is left out of the stored array.
        custom_lists: optional mapping of extra field name -> per-sequence list.
        sequence_identifiers: optional identifiers; replaced by ``"0"``,
            ``"1"``, ... when missing, empty, or containing ``None``.
        path: directory (``Path``) in which both files are written.
        metadata: optional metadata dict. NOTE: a dict passed in is modified
            in place (its ``"field_list"`` entry is set).
        signals: optional mapping of signal name -> per-sequence list; signals
            already named in ``metadata["field_list"]`` are skipped.
        filename_base: base name of the files; a fresh uuid hex when ``None``.

    Returns:
        A ``Repertoire`` pointing at the data file and the metadata file.
    """
    sequence_count = Repertoire.check_count(sequence_aas, sequences, custom_lists)

    # len() rather than truthiness: the identifiers may be a numpy array.
    if sequence_identifiers is None or len(sequence_identifiers) == 0 \
            or any(identifier is None for identifier in sequence_identifiers):
        sequence_identifiers = np.arange(sequence_count).astype(str)

    # Default the metadata before it is first read. Previously the signals
    # branch below indexed it while it could still be None.
    metadata = {} if metadata is None else metadata

    identifier = uuid4().hex
    filename_base = filename_base if filename_base is not None else identifier
    data_filename = path / f"{filename_base}.npy"

    field_list, values, dtype = Repertoire.process_custom_lists(custom_lists)

    if signals:
        # A metadata dict without a "field_list" entry means nothing is excluded.
        known_fields = metadata.get("field_list", [])
        signals_filtered = {signal: signals[signal] for signal in signals if signal not in known_fields}
        field_list_signals, values_signals, dtype_signals = Repertoire.process_custom_lists(signals_filtered)

        field_list.extend(field_list_signals)
        values.extend(values_signals)
        dtype.extend(dtype_signals)

    # Explicit name -> argument table instead of eval(field) on the parameter
    # names. Assumes Repertoire.FIELDS only names the per-sequence list
    # parameters of this method.
    standard_columns = {
        "sequence_aas": sequence_aas,
        "sequences": sequences,
        "v_genes": v_genes,
        "j_genes": j_genes,
        "v_subgroups": v_subgroups,
        "j_subgroups": j_subgroups,
        "v_alleles": v_alleles,
        "j_alleles": j_alleles,
        "chains": chains,
        "counts": counts,
        "region_types": region_types,
        "frame_types": frame_types,
        "sequence_identifiers": sequence_identifiers,
        "cell_ids": cell_ids,
    }

    for field in Repertoire.FIELDS:
        column = standard_columns[field]
        if column is not None and not all(el is None for el in column):
            field_list.append(field)
            values.append([NumpyHelper.get_numpy_representation(val) if val is not None else np.nan
                           for val in column])
            dtype.append((field, np.array(values[-1]).dtype))

    # Transpose the per-field columns into one tuple per sequence.
    repertoire_matrix = np.array(list(map(tuple, zip(*values))), order='F', dtype=dtype)

    np.save(str(data_filename), repertoire_matrix, allow_pickle=False)

    metadata_filename = path / f"{filename_base}_metadata.yaml"
    metadata["field_list"] = field_list
    with metadata_filename.open("w") as file:
        yaml.dump(metadata, file)

    repertoire = Repertoire(data_filename, metadata_filename, identifier)
    return repertoire
def get_record(self):
    """Export the receptor as a flat record.

    Returns:
        The record of the first chain, followed by the record of the second
        chain (both in the order given by ``self.get_chains()``), followed by
        the ``NumpyHelper.get_numpy_representation`` of every attribute named
        in ``self.FIELDS`` that is not one of the chain names.
    """
    chains = self.get_chains()
    first_chain_record = self.get_chain(chains[0]).get_record()
    second_chain_record = self.get_chain(chains[1]).get_record()

    remaining = []
    for name in self.FIELDS:
        if name in chains:
            # Chain fields are already covered by the chain records above.
            continue
        remaining.append(NumpyHelper.get_numpy_representation(getattr(self, name)))

    return first_chain_record + second_chain_record + remaining