def read_in_roles(stream):
    """Deserialize a list of optional ``Role`` values from a binary stream.

    Wire layout: a 4-byte big-endian unsigned count, then for each entry
    an int8 presence flag followed (when non-zero) by an int32 role value.
    Absent entries are represented as ``None`` in the returned list.
    """
    count = int.from_bytes(stream.read(4), "big", signed=False)
    roles = []
    for _ in range(count):
        present = read_int8(stream)
        roles.append(Role(read_int32(stream)) if present else None)
    return roles
def _get_roles_value(self, allele, roles):
    """Collect "<role><sex-short>" labels for allele carriers matching *roles*.

    For each requested role name, every member carrying the allele whose
    role matches contributes one ``str(role) + sex.short()`` entry.

    FIX: the original converted ``Role.from_name(role)`` inside the inner
    member loop, rebinding the outer loop variable — on the second member
    iteration an already-converted ``Role`` was fed back into
    ``Role.from_name``. The conversion is now hoisted to once per role.
    """
    result = []
    members = allele.variant_in_members_objects
    for role_name in roles:
        role = Role.from_name(role_name)
        for member in members:
            if member.role == role:
                result.append(str(role) + member.sex.short())
    return result
def persons_with_roles(self, roles=None, family_ids=None):
    """Return persons, optionally restricted by role and family membership.

    :param roles: iterable of ``Role`` values or role names (may be mixed);
        ``None`` means no role filtering.
    :param family_ids: iterable of family ids to restrict to; ``None``
        means all families.
    :returns: an iterable of persons when *roles* is ``None``, otherwise a
        list of the matching persons.

    FIX: the original probed only ``roles[0]`` to decide whether to convert
    names to ``Role`` — that raised IndexError on an empty list and broke
    on mixed Role/str input. Each entry is now normalized individually.
    """
    if family_ids is None:
        persons = self.persons.values()
    else:
        family_ids = set(family_ids)
        persons = filter(
            lambda p: p.family_id in family_ids, self.persons.values())
    if roles is None:
        return persons
    wanted = {
        r if isinstance(r, Role) else Role.from_name(r) for r in roles
    }
    return [p for p in persons if p.role in wanted]
def redefine(self):
    """Recompute derived person fields from the raw ``_attributes`` mapping.

    Normalizes sex/role/status strings into their enum values (writing them
    back into ``_attributes``), converts the pedigree "0" parent-id marker
    to ``None``, and nulls out string markers ("None"/"0"/"False") for the
    ``not_sequenced`` and ``generated`` flags.

    FIX: ``type(x) == str`` checks replaced with ``isinstance``; the two
    duplicated flag-normalization branches folded into one loop.
    """
    assert "person_id" in self._attributes

    self.family_id = self._attributes["family_id"]
    self.family = None
    self.person_id = self._attributes["person_id"]
    self.sample_id = self._attributes.get("sample_id", None)
    self.index = self._attributes.get("index", None)

    self._sex = Sex.from_name(self._attributes["sex"])
    if "role" not in self._attributes:
        self._role = None
    else:
        self._role = Role.from_name(self._attributes.get("role"))
    self._status = Status.from_name(self._attributes["status"])

    # Write the normalized enum values back over the raw string values.
    self._attributes["sex"] = self._sex
    self._attributes["role"] = self._role
    self._attributes["status"] = self._status

    # Pedigree-file convention: a parent id of "0" means "no parent".
    self.mom_id = self.get_attr("mom_id")
    if self.mom_id == "0":
        self.mom_id = None
        self._attributes["mom_id"] = None
    self.dad_id = self.get_attr("dad_id")
    if self.dad_id == "0":
        self.dad_id = None
        self._attributes["dad_id"] = None
    self.mom = None
    self.dad = None

    assert self.mom_id is None or isinstance(self.mom_id, str), \
        (self, self._attributes)
    assert self.dad_id is None or isinstance(self.dad_id, str), \
        (self, self._attributes)

    # String markers for "flag not set" are normalized to None.
    for flag in ("not_sequenced", "generated"):
        if self._attributes.get(flag) in ("None", "0", "False"):
            self._attributes[flag] = None
def load_simple_families_file(infile, ped_sep="\t"):
    """Load a "simple" families pedigree file into a ``FamiliesData`` object.

    The simple format allows only mom/dad/prb/sib roles; parent ids are not
    stored in the file and are reconstructed per family here. Probands get
    affected status (2), everyone else unaffected (1).

    :param infile: path or file-like object accepted by ``pd.read_csv``.
    :param ped_sep: column separator (defaults to tab).

    FIX: redundant ``lambda`` wrappers around the converter callables and a
    ``set([...])`` literal replaced with idiomatic forms.
    """
    fam_df = pd.read_csv(
        infile,
        sep=ped_sep,
        index_col=False,
        skipinitialspace=True,
        converters={
            "role": Role.from_name,
            "gender": Sex.from_name,
            "sex": Sex.from_name,
        },
        dtype={"familyId": str, "personId": str},
        comment="#",
    )
    fam_df = fam_df.rename(
        columns={
            "gender": "sex",
            "personId": "person_id",
            "familyId": "family_id",
            "momId": "mom_id",
            "dadId": "dad_id",
            "sampleId": "sample_id",
        },
    )

    # Status defaults to unaffected (1); probands are marked affected (2).
    fam_df["status"] = pd.Series(index=fam_df.index, data=1)
    fam_df.loc[fam_df.role == Role.prb, "status"] = 2
    fam_df["status"] = fam_df.status.apply(Status.from_value)

    # Parent ids start as the "no parent" marker and are filled in below.
    fam_df["mom_id"] = pd.Series(index=fam_df.index, data="0")
    fam_df["dad_id"] = pd.Series(index=fam_df.index, data="0")

    if "sample_id" not in fam_df.columns:
        fam_df["sample_id"] = pd.Series(data=fam_df["person_id"].values)

    families = defaultdict(list)
    for rec in fam_df.to_dict(orient="records"):
        families[rec["family_id"]].append(rec)

    result = defaultdict(list)
    for fam_id, members in families.items():
        mom_id = None
        dad_id = None
        children = []
        for member in members:
            role = member["role"]
            if role == Role.mom:
                mom_id = member["person_id"]
            elif role == Role.dad:
                dad_id = member["person_id"]
            else:
                # Simple files only allow mom/dad/prb/sib roles.
                assert role in {Role.prb, Role.sib}
                children.append(member)
        # Children inherit this family's (possibly absent) parent ids.
        for child in children:
            child["mom_id"] = mom_id
            child["dad_id"] = dad_id
        result[fam_id] = [Person(**member) for member in members]

    return FamiliesData.from_family_persons(result)
def roles_converter(a):
    """Coerce *a* to a ``Role``, parsing by name unless it already is one."""
    return a if isinstance(a, Role) else Role.from_name(a)
def main(gpf_instance=None, argv=None):
    """Generate autism gene profile (AGP) statistics and store them in a DB.

    Collects gene symbols (from ``--genes``, the configured gene sets when
    ``--gene-sets-genes`` is given, or the full gene-model list), builds
    per-gene AGP statistics, counts denovo/rare variants per configured
    dataset, computes rates, and writes everything into an
    ``AutismGeneProfileDB`` at ``--dbfile``.

    :param gpf_instance: an existing ``GPFInstance``; created when ``None``.
    :param argv: argument list for argparse; ``None`` uses ``sys.argv``.

    FIX: in the rare-variants loop, the role-filter block was guarded by
    ``if statistic.variant_types:`` (copy-paste from the variant-types
    block above it) instead of ``if statistic.roles:`` — statistics with
    roles but no variant types never applied their role filter, and a
    statistic with variant types but no roles would crash on
    ``Role.from_name(None)``. Commented-out dead code removed.
    """
    description = "Generate autism gene profile statistics tool"
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--verbose', '-V', '-v', action='count', default=0)

    default_dbfile = os.path.join(os.getenv("DAE_DB_DIR", "./"), "agpdb")
    parser.add_argument("--dbfile", default=default_dbfile)
    parser.add_argument(
        "--gene-sets-genes",
        action="store_true",
        help="Generate AGPs only for genes contained in the config's gene sets"
    )
    parser.add_argument(
        "--genes",
        help="Comma separated list of genes to generate statistics for")
    parser.add_argument("--drop", action="store_true")

    args = parser.parse_args(argv)

    # Verbosity maps -v counts onto decreasing log thresholds.
    if args.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif args.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif args.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)
    logging.getLogger("impala").setLevel(logging.WARNING)

    start = time.time()
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    config = gpf_instance._autism_gene_profile_config

    # (collection_id, gene_set) pairs for every configured gene set.
    collections_gene_sets = []
    for gs_category in config.gene_sets:
        for gs in gs_category.sets:
            gs_id = gs["set_id"]
            collection_id = gs["collection_id"]
            collections_gene_sets.append(
                (collection_id,
                 gpf_instance.gene_sets_db.get_gene_set(
                     collection_id, gs_id)))
    logger.info(f"collected gene sets: {len(collections_gene_sets)}")

    # Decide which gene symbols to build statistics for.
    gene_symbols = set()
    if args.genes:
        gene_symbols = {gs.strip() for gs in args.genes.split(",")}
    elif args.gene_sets_genes:
        for _, gs in collections_gene_sets:
            gene_symbols = gene_symbols.union(gs["syms"])
    else:
        gene_models = \
            gpf_instance.get_genome().get_gene_models().gene_models
        gene_symbols = set(gene_models.keys())
    gs_count = len(gene_symbols)
    logger.info(f"Collected {gs_count} gene symbols")

    # Resolve person-set queries per dataset and record which statistic
    # categories (denovo/rare) are configured at all.
    has_denovo = False
    has_rare = False
    person_ids = dict()
    for dataset_id, filters in config.datasets.items():
        genotype_data = gpf_instance.get_genotype_data(dataset_id)
        assert genotype_data is not None, dataset_id
        person_ids[dataset_id] = dict()
        for ps in filters.person_sets:
            person_set_query = (ps.collection_name, [ps.set_name])
            person_ids[dataset_id][ps.set_name] = \
                genotype_data._transform_person_set_collection_query(
                    person_set_query, None
                )
        for stat in filters.statistics:
            if stat.category == "denovo":
                has_denovo = True
            elif stat.category == "rare":
                has_rare = True

    agps = dict()
    gene_symbols = list(gene_symbols)
    gs_count = len(gene_symbols)
    elapsed = time.time() - start
    logger.info(f"data collected: {elapsed:.2f} secs")

    # Build the per-gene AGP objects.
    start = time.time()
    for idx, sym in enumerate(gene_symbols, 1):
        gs, agp = generate_agp(gpf_instance, sym, collections_gene_sets)
        agps[gs] = agp
        if idx % 25 == 0:
            elapsed = time.time() - start
            logger.info(
                f"Generated {idx}/{gs_count} AGP statistics "
                f"{elapsed:.2f} secs")
    logger.info("Done generating AGP statistics!")
    generate_end = time.time()
    elapsed = generate_end - start
    logger.info(f"Took {elapsed:.2f} secs")

    if has_denovo:
        logger.info("Collecting denovo variants")
        denovo_variants = dict()
        for dataset_id, filters in config.datasets.items():
            genotype_data = gpf_instance.get_genotype_data(dataset_id)
            assert genotype_data is not None, dataset_id
            if args.gene_sets_genes or args.genes:
                genes = gene_symbols
            else:
                genes = None
            denovo_variants[dataset_id] = list(
                genotype_data.query_variants(
                    genes=genes, inheritance="denovo"))
        logger.info("Done collecting denovo variants")
        logger.info("Counting denovo variants...")
        fill_variant_counts(denovo_variants, agps, config, person_ids, True)
        logger.info("Done counting denovo variants")

    if has_rare:
        logger.info("Collecting rare variants")
        rare_variants = dict()
        for dataset_id, filters in config.datasets.items():
            genotype_data = gpf_instance.get_genotype_data(dataset_id)
            assert genotype_data is not None, dataset_id
            if args.gene_sets_genes or args.genes:
                genes = gene_symbols
            else:
                genes = None
            rare_variants[dataset_id] = []
            for statistic in filters.statistics:
                if statistic.category == "denovo":
                    continue
                kwargs = dict()
                kwargs["roles"] = "prb or sib"

                if statistic.effects is not None:
                    kwargs["effect_types"] = \
                        expand_effect_types(statistic.effects)

                if statistic.variant_types:
                    variant_types = [
                        VariantType.from_name(
                            statistic.variant_types).repr()
                    ]
                    kwargs["variant_type"] = " or ".join(variant_types)

                if statistic.scores:
                    scores = []
                    for score in statistic.scores:
                        min_max = (score.min, score.max)
                        score_filter = (score.name, min_max)
                        scores.append(score_filter)
                    kwargs["real_attr_filter"] = scores

                # BUG FIX: was guarded by `if statistic.variant_types:`.
                if statistic.roles:
                    roles = [Role.from_name(statistic.roles).repr()]
                    kwargs["roles"] = " or ".join(roles)

                rare_variants[dataset_id].extend(
                    list(
                        genotype_data.query_variants(
                            genes=genes,
                            inheritance=[
                                "not denovo and "
                                "not possible_denovo and "
                                "not possible_omission",
                                "mendelian or missing"
                            ],
                            frequency_filter=[
                                ("af_allele_freq", (None, 1.0))],
                            **kwargs)))
        logger.info("Done collecting rare variants")
        logger.info("Counting rare variants...")
        fill_variant_counts(rare_variants, agps, config, person_ids, False)
        logger.info("Done counting rare variants")

    logger.info("Calculating rates...")
    calculate_rates(gpf_instance, agps, config)
    logger.info("Done calculating rates")
    elapsed = time.time() - generate_end
    logger.info(f"Took {elapsed:.2f} secs")

    # Persist everything into the AGP database.
    agpdb = AutismGeneProfileDB(
        gpf_instance._autism_gene_profile_config.to_dict(),
        args.dbfile,
        clear=True)
    agpdb.clear_all_tables()
    agpdb.populate_data_tables(gpf_instance.get_genotype_data_ids())
    logger.info("Inserting statistics into DB")
    agpdb.insert_agps(agps.values())
    logger.info("Building AGP output view")
    agpdb.build_agp_view()
    logger.info("Generating cache table")
    agpdb.generate_cache_table()
    logger.info("Done")
def count_variant(v, dataset_id, agps, config, person_ids, denovo_flag):
    """Count variant *v* into every AGP statistic whose criteria it meets.

    For each (person set, statistic) pair configured for *dataset_id*,
    checks the statistic's criteria in turn — carrier overlap with the
    person set, effect types, score ranges, rare-frequency cutoff, variant
    types, and roles — and calls ``add_variant_count`` when all hold.

    :param denovo_flag: True when counting denovo statistics, False for
        rare; statistics of the other category are skipped.

    FIXES: dead debug scaffolding (the ``dump`` dict and commented-out
    print block) removed; ``do_count`` kept as a bool instead of being
    coerced to an int by ``len(...)``; score min/max bounds of 0 are no
    longer silently ignored (``if score_min:`` → ``is not None``).
    """
    filters = config.datasets[dataset_id]

    # All non-None carrier members across the variant's alt alleles.
    members = set()
    for aa in v.alt_alleles:
        for member in aa.variant_in_members:
            if member is not None:
                members.add(member)

    for ps in filters.person_sets:
        pids = set(person_ids[dataset_id][ps.set_name])
        for statistic in filters.statistics:
            if statistic.category == "denovo" and not denovo_flag:
                continue
            if statistic.category == "rare" and denovo_flag:
                continue
            stat_id = statistic.id

            # Each criterion below ANDs into do_count; all must hold.
            do_count = bool(pids.intersection(members))

            if statistic.get("effects"):
                ets = set(expand_effect_types(statistic.effects))
                do_count = do_count and bool(
                    ets.intersection(v.effect_types))

            if statistic.get("scores"):
                for score in statistic.scores:
                    score_min = score.get("min")
                    score_max = score.get("max")
                    score_value = v.get_attribute(score["name"])[0]
                    if score_value is None:
                        # Missing score disqualifies; the bound checks
                        # below short-circuit on the False.
                        do_count = False
                    if score_min is not None:
                        do_count = do_count and score_value >= score_min
                    if score_max is not None:
                        do_count = do_count and score_value <= score_max

            if statistic.get("category") == "rare":
                # Rare statistics only count alleles at <= 1% frequency.
                freq = v.alt_alleles[0].get_attribute("af_allele_freq")
                if freq:
                    do_count = do_count and freq <= 1.0

            if statistic.get("variant_types"):
                variant_types = {
                    VariantType.from_name(t)
                    for t in statistic.variant_types
                }
                do_count = do_count and bool(
                    variant_types.intersection(v.variant_types))

            if statistic.get("roles"):
                roles = {Role.from_name(r) for r in statistic.roles}
                v_roles = set(v.alt_alleles[0].variant_in_roles)
                do_count = do_count and bool(v_roles.intersection(roles))

            if do_count:
                add_variant_count(
                    v, agps, dataset_id, ps.set_name, stat_id)
def get_members_with_roles(self, roles):
    """Return family members (in order) whose role is in *roles*.

    :param roles: iterable of ``Role`` values or role names; mixed lists
        are accepted.
    :returns: list of matching members, preserving ``members_in_order``.

    FIX: the original probed only ``roles[0]`` to decide whether to
    convert — raising IndexError on an empty list and mishandling mixed
    Role/str input. Each entry is now normalized individually.
    """
    wanted = {
        r if isinstance(r, Role) else Role.from_name(r) for r in roles
    }
    return [m for m in self.members_in_order if m.role in wanted]
def test_roles_simple(name, role):
    """Parsing *name* must yield the expected *role*."""
    parsed = Role.from_name(name)
    assert parsed == role