def __getitem__(self, key):
    """Return the value stored under ``key`` for this row.

    We cache what we can:

    * the five ``*_samples`` keys are served from ``self.genotype_dict``,
      which is filled from ``self['gt_types']`` the first time it is found
      empty;
    * a key already present in ``self.cache`` is returned from there;
    * ``'info'`` and the columns in ``self.query.gt_cols`` are unpacked
      from ``self.row`` once and kept in ``self.cache``;
    * any other key is read straight from ``self.row``.
    """
    sample_keys = ('het_samples', 'hom_alt_samples', 'unknown_samples',
                   'variant_samples', 'hom_ref_samples')
    if key in sample_keys:
        if self.genotype_dict == {}:
            gt_types = self['gt_types']
            self.genotype_dict = \
                self.query._group_samples_by_genotype(gt_types)
        if key == 'variant_samples':
            # variant samples are the het and hom-alt groups concatenated.
            return self.genotype_dict[HET] + self.genotype_dict[HOM_ALT]
        genotype_for_key = {
            'het_samples': HET,
            'hom_alt_samples': HOM_ALT,
            'hom_ref_samples': HOM_REF,
            'unknown_samples': UNKNOWN,
        }
        return self.genotype_dict[genotype_for_key[key]]

    if key in self.cache:
        return self.cache[key]

    # pick the unpacker for blob columns; plain columns need none.
    if key == 'info':
        unpacker = compression.unpack_ordereddict_blob
    elif key in self.query.gt_cols:
        unpacker = compression.unpack_genotype_blob
    else:
        return self.row[key]

    self.cache[key] = unpacker(self.row[key])
    return self.cache[key]
def row_2_GeminiRow(self, row):
    """Convert one database ``row`` into a GeminiRow (or its fields).

    The reported columns are copied into an ordered mapping, with the
    ``info`` blob unpacked and rendered as a string.  Sample names per
    genotype are fetched through ``self._get_variant_samples`` when
    ``self.show_variant_samples`` or ``self.needs_sample_names`` is set.

    Returns ``None`` when any callable in ``self.predicates`` rejects the
    row; otherwise the GeminiRow, or only the ``fields`` mapping when
    ``self.for_browser`` is set.
    """
    info = None
    if 'info' in self.report_cols:
        info = compression.unpack_ordereddict_blob(row['info'])

    fields = OrderedDict()
    for col in self.report_cols:
        if col == "*":
            continue
        if col == "info":
            fields[col] = _info_dict_to_string(info)
        else:
            fields[col] = row[col]

    variant_names, het_names, hom_alt_names = [], [], []
    hom_ref_names, unknown_names = [], []
    if self.show_variant_samples or self.needs_sample_names:
        variant_id = row['variant_id']
        het_names = self._get_variant_samples(variant_id, HET)
        hom_alt_names = self._get_variant_samples(variant_id, HOM_ALT)
        hom_ref_names = self._get_variant_samples(variant_id, HOM_REF)
        unknown_names = self._get_variant_samples(variant_id, UNKNOWN)
        # samples carrying the variant: the het and hom-alt groups combined.
        variant_names = het_names | hom_alt_names

    if self.show_variant_samples:
        join_names = self.variant_samples_delim.join
        fields["variant_samples"] = join_names(variant_names)
        fields["HET_samples"] = join_names(het_names)
        fields["HOM_ALT_samples"] = join_names(hom_alt_names)

    if self.show_families:
        family_ids = set([self.sample_to_sample_object[name].family_id
                          for name in variant_names])
        fields["families"] = self.variant_samples_delim.join(
            [str(family_id) for family_id in family_ids])

    gemini_row = GeminiRow(fields, variant_names, het_names, hom_alt_names,
                           hom_ref_names, unknown_names, info,
                           formatter=self.formatter)

    # every predicate is evaluated before the verdict is taken.
    verdicts = [predicate(gemini_row) for predicate in self.predicates]
    if not all(verdicts):
        return None

    return fields if self.for_browser else gemini_row
def barebones_row2geminiRow(row, report_cols):
    """Build a GeminiRow from ``row`` using only the reported columns.

    Each name in ``report_cols`` (``"*"`` is skipped) is copied from ``row``
    into an ordered mapping; the ``info`` column, when requested, is
    unpacked and stored as a string.  All five sample-name lists passed to
    the GeminiRow are empty.
    """
    if 'info' in report_cols:
        info = compression.unpack_ordereddict_blob(row['info'])
    else:
        info = None

    fields = OrderedDict()
    for col in report_cols:
        if col == "*":
            continue
        if col == "info":
            fields[col] = _info_dict_to_string(info)
        else:
            fields[col] = row[col]

    return GeminiRow(fields, [], [], [], [], [], info)
def next(self):
    """
    Return the GeminiRow object for the next query result.

    Rows are pulled from ``self.c`` until one passes the genotype filter
    (``self.gt_filter``) and every callable in ``self.predicates``.  The
    result is the GeminiRow, or only its ordered ``fields`` mapping when
    ``self.for_browser`` is set.

    Raises:
        StopIteration: when ``self.c.next()`` raises; ``self.conn`` is
            closed first.
    """
    # we use a while loop since we may skip records based upon
    # genotype filters.  if we need to skip a record, we just
    # throw a continue and keep trying. the alternative is to just
    # recursively call self.next() if we need to skip, but this
    # can quickly exceed the stack.
    while (1):
        try:
            row = self.c.next()
        except Exception:
            # NOTE(review): any failure of the cursor, not only exhaustion,
            # ends the iteration here.
            self.conn.close()
            raise StopIteration

        gts = None
        gt_types = None
        gt_phases = None
        gt_depths = None
        gt_ref_depths = None
        gt_alt_depths = None
        gt_quals = None
        variant_names = []
        het_names = []
        hom_alt_names = []
        hom_ref_names = []
        unknown_names = []
        # bound up front so the show_families branch below never reads an
        # unbound name when no genotype info was unpacked.
        families = []
        info = None

        if 'info' in self.report_cols:
            info = compression.unpack_ordereddict_blob(row['info'])

        if self._query_needs_genotype_info():
            gts = compression.unpack_genotype_blob(row['gts'])
            gt_types = compression.unpack_genotype_blob(row['gt_types'])
            gt_phases = compression.unpack_genotype_blob(row['gt_phases'])
            gt_depths = compression.unpack_genotype_blob(row['gt_depths'])
            gt_ref_depths = \
                compression.unpack_genotype_blob(row['gt_ref_depths'])
            gt_alt_depths = \
                compression.unpack_genotype_blob(row['gt_alt_depths'])
            gt_quals = compression.unpack_genotype_blob(row['gt_quals'])

            # sample indexes per genotype, and the matching sample names.
            variant_samples = [x for x, y in enumerate(gt_types)
                               if y == HET or y == HOM_ALT]
            variant_names = [self.idx_to_sample[x] for x in variant_samples]
            het_samples = [x for x, y in enumerate(gt_types) if y == HET]
            het_names = [self.idx_to_sample[x] for x in het_samples]
            hom_alt_samples = [x for x, y in enumerate(gt_types)
                               if y == HOM_ALT]
            hom_alt_names = [self.idx_to_sample[x] for x in hom_alt_samples]
            hom_ref_samples = [x for x, y in enumerate(gt_types)
                               if y == HOM_REF]
            hom_ref_names = [self.idx_to_sample[x] for x in hom_ref_samples]
            unknown_samples = [x for x, y in enumerate(gt_types)
                               if y == UNKNOWN]
            unknown_names = [self.idx_to_sample[x] for x in unknown_samples]
            families = map(str, list(set(
                [self.idx_to_sample_object[x].family_id
                 for x in variant_samples])))

            # skip the record if it does not meet the user's genotype filter
            # NOTE(review): the filter is run through eval() with this
            # frame's locals as its namespace, so the local names above are
            # what a filter can reference and the expression must come from
            # a trusted source.
            if self.gt_filter and not eval(self.gt_filter, locals()):
                continue

        fields = OrderedDict()

        for col in self.report_cols:
            if col == "*":
                continue
            if not col.startswith(("gt", "GT")) and col != "info":
                fields[col] = row[col]
            elif col == "info":
                fields[col] = self._info_dict_to_string(info)
            else:
                # reuse the original column name user requested
                # e.g. replace gts[1085] with gts.NA20814
                if '[' in col:
                    orig_col = self.gt_idx_to_name_map[col]
                    # the column text itself (e.g. gts[1085]) is evaluated
                    # against the unpacked locals bound above.
                    val = eval(col.strip())
                    if type(val) in [np.int8, np.int32, np.bool_]:
                        fields[orig_col] = int(val)
                    elif type(val) in [np.float32]:
                        fields[orig_col] = float(val)
                    else:
                        fields[orig_col] = val
                else:
                    # asked for "gts" or "gt_types", e.g.
                    if col == "gts":
                        fields[col] = ','.join(gts)
                    elif col == "gt_types":
                        fields[col] = ','.join(str(t) for t in gt_types)
                    elif col == "gt_phases":
                        fields[col] = ','.join(str(p) for p in gt_phases)
                    elif col == "gt_depths":
                        fields[col] = ','.join(str(d) for d in gt_depths)
                    elif col == "gt_quals":
                        fields[col] = ','.join(str(d) for d in gt_quals)
                    elif col == "gt_ref_depths":
                        fields[col] = \
                            ','.join(str(d) for d in gt_ref_depths)
                    elif col == "gt_alt_depths":
                        fields[col] = \
                            ','.join(str(d) for d in gt_alt_depths)

        if self.show_variant_samples:
            fields["variant_samples"] = \
                self.variant_samples_delim.join(variant_names)
            fields["HET_samples"] = \
                self.variant_samples_delim.join(het_names)
            fields["HOM_ALT_samples"] = \
                self.variant_samples_delim.join(hom_alt_names)
        if self.show_families:
            fields["families"] = self.variant_samples_delim.join(families)

        gemini_row = GeminiRow(fields, gts, gt_types, gt_phases,
                               gt_depths, gt_ref_depths, gt_alt_depths,
                               gt_quals, variant_names, het_names,
                               hom_alt_names, hom_ref_names, unknown_names,
                               info, formatter=self.formatter)

        if not all([predicate(gemini_row) for predicate in self.predicates]):
            continue

        if not self.for_browser:
            return gemini_row
        else:
            return fields
def next(self):
    """
    Return the GeminiRow object for the next query result.

    Rows are pulled from ``self.c`` until one passes the genotype filter
    (``self.gt_filter``) and every callable in ``self.predicates``.  The
    result is the GeminiRow, or only its ordered ``fields`` mapping when
    ``self.for_browser`` is set.

    Raises:
        StopIteration: when ``self.c.next()`` raises; ``self.conn`` is
            closed first.
    """
    # we use a while loop since we may skip records based upon
    # genotype filters.  if we need to skip a record, we just
    # throw a continue and keep trying. the alternative is to just
    # recursively call self.next() if we need to skip, but this
    # can quickly exceed the stack.
    while (1):
        try:
            row = self.c.next()
        except Exception:
            # NOTE(review): any failure of the cursor, not only exhaustion,
            # ends the iteration here.
            self.conn.close()
            raise StopIteration

        gts = None
        gt_types = None
        gt_phases = None
        gt_depths = None
        gt_ref_depths = None
        gt_alt_depths = None
        gt_quals = None
        variant_names = []
        het_names = []
        hom_alt_names = []
        hom_ref_names = []
        unknown_names = []
        # bound up front so the show_families branch below never reads an
        # unbound name when no genotype info was unpacked.
        families = []
        info = None

        if 'info' in self.report_cols:
            info = compression.unpack_ordereddict_blob(row['info'])

        if self._query_needs_genotype_info():
            gts = compression.unpack_genotype_blob(row['gts'])
            gt_types = compression.unpack_genotype_blob(row['gt_types'])
            gt_phases = compression.unpack_genotype_blob(row['gt_phases'])
            gt_depths = compression.unpack_genotype_blob(row['gt_depths'])
            gt_ref_depths = \
                compression.unpack_genotype_blob(row['gt_ref_depths'])
            gt_alt_depths = \
                compression.unpack_genotype_blob(row['gt_alt_depths'])
            gt_quals = compression.unpack_genotype_blob(row['gt_quals'])

            # sample indexes per genotype, and the matching sample names.
            variant_samples = [x for x, y in enumerate(gt_types)
                               if y == HET or y == HOM_ALT]
            variant_names = [self.idx_to_sample[x] for x in variant_samples]
            het_samples = [x for x, y in enumerate(gt_types) if y == HET]
            het_names = [self.idx_to_sample[x] for x in het_samples]
            hom_alt_samples = [x for x, y in enumerate(gt_types)
                               if y == HOM_ALT]
            hom_alt_names = [self.idx_to_sample[x] for x in hom_alt_samples]
            hom_ref_samples = [x for x, y in enumerate(gt_types)
                               if y == HOM_REF]
            hom_ref_names = [self.idx_to_sample[x] for x in hom_ref_samples]
            unknown_samples = [x for x, y in enumerate(gt_types)
                               if y == UNKNOWN]
            unknown_names = [self.idx_to_sample[x] for x in unknown_samples]
            families = map(str, list(set(
                [self.idx_to_sample_object[x].family_id
                 for x in variant_samples])))

            # skip the record if it does not meet the user's genotype filter
            # NOTE(review): the filter is run through eval() with this
            # frame's locals as its namespace, so the local names above are
            # what a filter can reference and the expression must come from
            # a trusted source.
            if self.gt_filter and not eval(self.gt_filter, locals()):
                continue

        fields = OrderedDict()

        for col in self.report_cols:
            if col == "*":
                continue
            if not col.startswith(("gt", "GT")) and col != "info":
                fields[col] = row[col]
            elif col == "info":
                fields[col] = self._info_dict_to_string(info)
            else:
                # reuse the original column name user requested
                # e.g. replace gts[1085] with gts.NA20814
                if '[' in col:
                    orig_col = self.gt_idx_to_name_map[col]
                    # the column text itself (e.g. gts[1085]) is evaluated
                    # against the unpacked locals bound above.
                    val = eval(col.strip())
                    if type(val) in [np.int8, np.int32, np.bool_]:
                        fields[orig_col] = int(val)
                    elif type(val) in [np.float32]:
                        fields[orig_col] = float(val)
                    else:
                        fields[orig_col] = val
                else:
                    # asked for "gts" or "gt_types", e.g.
                    if col == "gts":
                        fields[col] = ','.join(gts)
                    elif col == "gt_types":
                        fields[col] = ','.join(str(t) for t in gt_types)
                    elif col == "gt_phases":
                        fields[col] = ','.join(str(p) for p in gt_phases)
                    elif col == "gt_depths":
                        fields[col] = ','.join(str(d) for d in gt_depths)
                    elif col == "gt_quals":
                        fields[col] = ','.join(str(d) for d in gt_quals)
                    elif col == "gt_ref_depths":
                        fields[col] = \
                            ','.join(str(d) for d in gt_ref_depths)
                    elif col == "gt_alt_depths":
                        fields[col] = \
                            ','.join(str(d) for d in gt_alt_depths)

        if self.show_variant_samples:
            fields["variant_samples"] = \
                self.variant_samples_delim.join(variant_names)
            fields["HET_samples"] = \
                self.variant_samples_delim.join(het_names)
            fields["HOM_ALT_samples"] = \
                self.variant_samples_delim.join(hom_alt_names)
        if self.show_families:
            fields["families"] = self.variant_samples_delim.join(families)

        gemini_row = GeminiRow(fields, gts, gt_types, gt_phases,
                               gt_depths, gt_ref_depths, gt_alt_depths,
                               gt_quals, variant_names, het_names,
                               hom_alt_names, hom_ref_names, unknown_names,
                               info, formatter=self.formatter)

        if not all([predicate(gemini_row) for predicate in self.predicates]):
            continue

        if not self.for_browser:
            return gemini_row
        else:
            return fields
def row_2_GeminiRow(self, row):
    """Convert one database ``row`` into a GeminiRow (or its fields).

    The reported columns are copied into an ordered mapping, with the
    ``info`` blob unpacked and rendered as a string.  Sample names per
    genotype are fetched through ``self._get_variant_samples`` when
    ``self.show_variant_samples`` or ``self.needs_sample_names`` is set.

    Returns ``None`` when any callable in ``self.predicates`` rejects the
    row; otherwise the GeminiRow, or only the ``fields`` mapping when
    ``self.for_browser`` is set.
    """
    info = None
    if 'info' in self.report_cols:
        info = compression.unpack_ordereddict_blob(row['info'])

    fields = OrderedDict()
    for col in self.report_cols:
        if col == "*":
            continue
        if col == "info":
            fields[col] = _info_dict_to_string(info)
        else:
            fields[col] = row[col]

    variant_names, het_names, hom_alt_names = [], [], []
    hom_ref_names, unknown_names = [], []
    if self.show_variant_samples or self.needs_sample_names:
        variant_id = row['variant_id']
        het_names = self._get_variant_samples(variant_id, HET)
        hom_alt_names = self._get_variant_samples(variant_id, HOM_ALT)
        hom_ref_names = self._get_variant_samples(variant_id, HOM_REF)
        unknown_names = self._get_variant_samples(variant_id, UNKNOWN)
        # samples carrying the variant: the het and hom-alt groups combined.
        variant_names = het_names | hom_alt_names

    if self.show_variant_samples:
        join_names = self.variant_samples_delim.join
        fields["variant_samples"] = join_names(variant_names)
        fields["HET_samples"] = join_names(het_names)
        fields["HOM_ALT_samples"] = join_names(hom_alt_names)

    if self.show_families:
        family_ids = set([self.sample_to_sample_object[name].family_id
                          for name in variant_names])
        fields["families"] = self.variant_samples_delim.join(
            [str(family_id) for family_id in family_ids])

    gemini_row = GeminiRow(fields, variant_names, het_names, hom_alt_names,
                           hom_ref_names, unknown_names, info,
                           formatter=self.formatter)

    # every predicate is evaluated before the verdict is taken.
    verdicts = [predicate(gemini_row) for predicate in self.predicates]
    if not all(verdicts):
        return None

    return fields if self.for_browser else gemini_row
def next(self):
    """
    Return the GeminiRow object for the next query result.

    Rows are pulled from ``self.c`` until one passes the compiled genotype
    filter (``self.gt_filter_compiled``) and every callable in
    ``self.predicates``.  Genotype blobs are unpacked lazily into the
    ``unpacked`` dict, which also serves as the namespace the filter is
    evaluated in.  The result is the GeminiRow, or only its ordered
    ``fields`` mapping when ``self.for_browser`` is set.

    Raises:
        StopIteration: when ``self.c.next()`` raises; ``self.conn`` is
            closed first.
    """
    # we use a while loop since we may skip records based upon
    # genotype filters.  if we need to skip a record, we just
    # throw a continue and keep trying. the alternative is to just
    # recursively call self.next() if we need to skip, but this
    # can quickly exceed the stack.
    while (1):
        try:
            row = self.c.next()
        except Exception:
            # NOTE(review): any failure of the cursor, not only exhaustion,
            # ends the iteration here.
            self.conn.close()
            raise StopIteration

        variant_names = []
        het_names = []
        hom_alt_names = []
        hom_ref_names = []
        unknown_names = []
        info = None

        if 'info' in self.report_cols:
            info = compression.unpack_ordereddict_blob(row['info'])

        unpacked = {'self': self}
        unpack = compression.unpack_genotype_blob
        if self._query_needs_genotype_info():
            # TODO: see if HET, etc. are needed. if not, we can skip
            # _group_samples_by_genotype
            unpacked['gt_types'] = unpack(row['gt_types'])
            genotype_dict = \
                self._group_samples_by_genotype(unpacked['gt_types'])
            if self.gt_filter or self.include_gt_cols:
                for k in ('gts', 'gt_phases', 'gt_depths',
                          'gt_ref_depths', 'gt_alt_depths', 'gt_quals',
                          'gt_copy_numbers', 'gt_phred_ll_homref',
                          'gt_phred_ll_het', 'gt_phred_ll_homalt'):
                    # only unpack what is needed.
                    if (self.gt_filter is not None and k in self.gt_filter) \
                            or self.include_gt_cols:
                        unpacked[k] = unpack(row[k])

            # skip the record if it does not meet the user's genotype filter
            # short circuit some expensive ops
            # NOTE(review): the filter is evaluated with eval(); it must
            # come from a trusted source.
            try:
                if self.gt_filter and \
                        not eval(self.gt_filter_compiled, unpacked):
                    continue
            except TypeError:
                # tried to eval on a phred_ll column that was None
                continue

            het_names = genotype_dict[HET]
            hom_alt_names = genotype_dict[HOM_ALT]
            hom_ref_names = genotype_dict[HOM_REF]
            unknown_names = genotype_dict[UNKNOWN]
            variant_names = het_names + hom_alt_names

        fields = OrderedDict()

        for col in self.report_cols:
            if col == "*":
                continue
            if not col[:2] in ("gt", "GT") and not col == "info":
                fields[col] = row[col]
            elif col == "info":
                fields[col] = self._info_dict_to_string(info)
            else:
                # reuse the original column name user requested
                # e.g. replace gts[1085] with gts.NA20814
                if '[' in col:
                    orig_col = self.gt_idx_to_name_map[col]
                    source, extra = col.split('[', 1)
                    if source not in unpacked:
                        unpacked[source] = unpack(row[source])
                    assert extra[-1] == ']'
                    if source.startswith('gt_phred_ll') and \
                            unpacked[source] is None:
                        fields[orig_col] = None
                        continue
                    sample_idx = int(extra[:-1])
                    val = unpacked[source][sample_idx]
                    # hand back plain python numbers for these numpy types.
                    if type(val) in (np.int8, np.int32, np.bool_):
                        fields[orig_col] = int(val)
                    elif type(val) in (np.float32,):
                        fields[orig_col] = float(val)
                    else:
                        fields[orig_col] = val
                else:
                    # asked for "gts" or "gt_types", e.g.
                    if col not in unpacked:
                        unpacked[col] = unpack(row[col])
                    if unpacked[col] is not None:
                        fields[col] = \
                            ",".join(str(v) for v in unpacked[col])
                    else:
                        fields[col] = str(None)

        if self.show_variant_samples:
            # use the name lists bound above rather than genotype_dict,
            # which only exists when genotype info was unpacked.
            fields["variant_samples"] = \
                self.variant_samples_delim.join(
                    self._filter_samples(variant_names))
            fields["HET_samples"] = \
                self.variant_samples_delim.join(
                    self._filter_samples(het_names))
            fields["HOM_ALT_samples"] = \
                self.variant_samples_delim.join(
                    self._filter_samples(hom_alt_names))
        if self.show_families:
            families = map(str, list(set(
                [self.sample_to_sample_object[x].family_id
                 for x in variant_names])))
            fields["families"] = self.variant_samples_delim.join(families)

        gemini_row = GeminiRow(fields,
                               unpacked.get('gts'),
                               unpacked.get('gt_types'),
                               unpacked.get('gt_phases'),
                               unpacked.get('gt_depths'),
                               unpacked.get('gt_ref_depths'),
                               unpacked.get('gt_alt_depths'),
                               unpacked.get('gt_quals'),
                               unpacked.get('gt_copy_numbers'),
                               unpacked.get('gt_phred_ll_homref'),
                               unpacked.get('gt_phred_ll_het'),
                               unpacked.get('gt_phred_ll_homalt'),
                               variant_names, het_names, hom_alt_names,
                               hom_ref_names, unknown_names,
                               info, formatter=self.formatter)

        if not all(predicate(gemini_row) for predicate in self.predicates):
            continue

        if not self.for_browser:
            return gemini_row
        else:
            return fields
def next(self):
    """
    Return the GeminiRow object for the next query result.

    Rows are pulled from ``self.c`` until one passes the compiled genotype
    filter (``self.gt_filter_compiled``) and every callable in
    ``self.predicates``.  Genotype blobs are unpacked lazily into the
    ``unpacked`` dict, which also serves as the namespace the filter is
    evaluated in.  The result is the GeminiRow, or only its ordered
    ``fields`` mapping when ``self.for_browser`` is set.

    Raises:
        StopIteration: when ``self.c.next()`` raises; ``self.conn`` is
            closed first.
    """
    # we use a while loop since we may skip records based upon
    # genotype filters.  if we need to skip a record, we just
    # throw a continue and keep trying. the alternative is to just
    # recursively call self.next() if we need to skip, but this
    # can quickly exceed the stack.
    while (1):
        try:
            row = self.c.next()
        except Exception:
            # NOTE(review): any failure of the cursor, not only exhaustion,
            # ends the iteration here.
            self.conn.close()
            raise StopIteration

        variant_names = []
        het_names = []
        hom_alt_names = []
        hom_ref_names = []
        unknown_names = []
        info = None

        if 'info' in self.report_cols:
            info = compression.unpack_ordereddict_blob(row['info'])

        unpacked = {'self': self}
        unpack = compression.unpack_genotype_blob
        if self._query_needs_genotype_info():
            # TODO: see if HET, etc. are needed. if not, we can skip
            # _group_samples_by_genotype
            unpacked['gt_types'] = unpack(row['gt_types'])
            genotype_dict = \
                self._group_samples_by_genotype(unpacked['gt_types'])
            if self.gt_filter or self.include_gt_cols:
                for k in ('gts', 'gt_phases', 'gt_depths',
                          'gt_ref_depths', 'gt_alt_depths', 'gt_quals',
                          'gt_copy_numbers'):
                    # only unpack what is needed.
                    if (self.gt_filter is not None and k in self.gt_filter) \
                            or self.include_gt_cols:
                        unpacked[k] = unpack(row[k])

            # skip the record if it does not meet the user's genotype filter
            # short circuit some expensive ops
            # NOTE(review): the filter is evaluated with eval(); it must
            # come from a trusted source.
            if self.gt_filter and \
                    not eval(self.gt_filter_compiled, unpacked):
                continue

            het_names = genotype_dict[HET]
            hom_alt_names = genotype_dict[HOM_ALT]
            hom_ref_names = genotype_dict[HOM_REF]
            unknown_names = genotype_dict[UNKNOWN]
            variant_names = het_names + hom_alt_names

        fields = OrderedDict()

        for col in self.report_cols:
            if col == "*":
                continue
            if not col[:2] in ("gt", "GT") and not col == "info":
                fields[col] = row[col]
            elif col == "info":
                fields[col] = self._info_dict_to_string(info)
            else:
                # reuse the original column name user requested
                # e.g. replace gts[1085] with gts.NA20814
                if '[' in col:
                    orig_col = self.gt_idx_to_name_map[col]
                    source, extra = col.split('[', 1)
                    if source not in unpacked:
                        unpacked[source] = unpack(row[source])
                    assert extra[-1] == ']'
                    sample_idx = int(extra[:-1])
                    val = unpacked[source][sample_idx]
                    # hand back plain python numbers for these numpy types.
                    if type(val) in (np.int8, np.int32, np.bool_):
                        fields[orig_col] = int(val)
                    elif type(val) in (np.float32,):
                        fields[orig_col] = float(val)
                    else:
                        fields[orig_col] = val
                else:
                    # asked for "gts" or "gt_types", e.g.
                    if col not in unpacked:
                        unpacked[col] = unpack(row[col])
                    fields[col] = ",".join(str(v) for v in unpacked[col])

        if self.show_variant_samples:
            # use the name lists bound above rather than genotype_dict,
            # which only exists when genotype info was unpacked.
            fields["variant_samples"] = \
                self.variant_samples_delim.join(
                    self._filter_samples(variant_names))
            fields["HET_samples"] = \
                self.variant_samples_delim.join(
                    self._filter_samples(het_names))
            fields["HOM_ALT_samples"] = \
                self.variant_samples_delim.join(
                    self._filter_samples(hom_alt_names))
        if self.show_families:
            families = map(str, list(set(
                [self.sample_to_sample_object[x].family_id
                 for x in variant_names])))
            fields["families"] = self.variant_samples_delim.join(families)

        gemini_row = GeminiRow(fields,
                               unpacked.get('gts'),
                               unpacked.get('gt_types'),
                               unpacked.get('gt_phases'),
                               unpacked.get('gt_depths'),
                               unpacked.get('gt_ref_depths'),
                               unpacked.get('gt_alt_depths'),
                               unpacked.get('gt_quals'),
                               unpacked.get('gt_copy_numbers'),
                               variant_names, het_names, hom_alt_names,
                               hom_ref_names, unknown_names,
                               info, formatter=self.formatter)

        if not all(predicate(gemini_row) for predicate in self.predicates):
            continue

        if not self.for_browser:
            return gemini_row
        else:
            return fields