Code Example #1
def read_in_roles(stream):
    # The serialized form is a 4-byte big-endian count followed by, for each
    # entry, a 1-byte presence flag and (if present) a 4-byte role value.
    length = int.from_bytes(stream.read(4), "big", signed=False)
    out = []
    for _ in range(length):
        is_not_none = read_int8(stream)
        if is_not_none:
            out.append(Role(read_int32(stream)))
        else:
            out.append(None)
    return out
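
For illustration, here is a minimal, hedged sketch of calling read_in_roles on an in-memory stream. The read_int8/read_int32 helpers below only mirror what the reader above appears to expect (fixed-width, big-endian, unsigned reads) and are assumptions, not the project's actual implementations; the same goes for constructing Role from a raw integer value.

import io

def read_int8(stream):
    # Assumed helper: one unsigned byte.
    return int.from_bytes(stream.read(1), "big", signed=False)

def read_int32(stream):
    # Assumed helper: 4-byte big-endian unsigned integer.
    return int.from_bytes(stream.read(4), "big", signed=False)

# Two entries: one present role (raw value 1), one missing.
# Whether Role(1) is a valid member depends on the gpf Role enum.
payload = (2).to_bytes(4, "big") + b"\x01" + (1).to_bytes(4, "big") + b"\x00"
print(read_in_roles(io.BytesIO(payload)))  # expected: [<some Role>, None]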
Code Example #2
    def _get_roles_value(self, allele, roles):
        result = []
        variant_in_members = allele.variant_in_members_objects
        for role in roles:
            # Convert once per requested role; rebinding the loop variable
            # inside the inner loop would feed an already-converted Role
            # back into Role.from_name.
            role = Role.from_name(role)
            for member in variant_in_members:
                if member.role == role:
                    result.append(str(role) + member.sex.short())

        return result
Code Example #3
    def persons_with_roles(self, roles=None, family_ids=None):
        if family_ids is None:
            persons = self.persons.values()
        else:
            family_ids = set(family_ids)
            persons = filter(lambda p: p.family_id in family_ids,
                             self.persons.values())

        if roles is None:
            # Return a list in every branch so callers get a consistent type.
            return list(persons)

        # Accept either Role members or role names.
        if not isinstance(roles[0], Role):
            roles = [Role.from_name(role) for role in roles]

        return list(filter(lambda m: m.role in roles, persons))
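
A hedged usage sketch; the receiver name, family ids, and concrete values below are placeholders, not real data:

# Probands across all families, selected by role name.
probands = families.persons_with_roles(roles=["prb"])

# Parents of two specific families, selected by Role members.
parents = families.persons_with_roles(roles=[Role.mom, Role.dad],
                                      family_ids=["f1", "f2"])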
Code Example #4
    def redefine(self):
        assert "person_id" in self._attributes
        self.family_id = self._attributes["family_id"]
        self.family = None
        self.person_id = self._attributes["person_id"]
        self.sample_id = self._attributes.get("sample_id", None)
        self.index = self._attributes.get("index", None)

        # Normalize the enum-valued attributes; "role" is optional.
        self._sex = Sex.from_name(self._attributes["sex"])
        if "role" not in self._attributes:
            self._role = None
        else:
            self._role = Role.from_name(self._attributes.get("role"))

        self._status = Status.from_name(self._attributes["status"])

        self._attributes["sex"] = self._sex
        self._attributes["role"] = self._role
        self._attributes["status"] = self._status

        # A parent id of "0" means "no parent" in pedigree files.
        self.mom_id = self.get_attr("mom_id")
        if self.mom_id == "0":
            self.mom_id = None
            self._attributes["mom_id"] = None
        self.dad_id = self.get_attr("dad_id")
        if self.dad_id == "0":
            self.dad_id = None
            self._attributes["dad_id"] = None
        self.mom = None
        self.dad = None
        assert self.mom_id is None or isinstance(self.mom_id, str), \
            (self, self._attributes)
        assert self.dad_id is None or isinstance(self.dad_id, str), \
            (self, self._attributes)

        # Treat the string spellings of "false" as missing flags.
        for flag in ("not_sequenced", "generated"):
            value = self._attributes.get(flag)
            if value in ("None", "0", "False"):
                self._attributes[flag] = None
Code Example #5
    def load_simple_families_file(infile, ped_sep="\t"):
        fam_df = pd.read_csv(
            infile,
            sep=ped_sep,
            index_col=False,
            skipinitialspace=True,
            converters={
                "role": lambda r: Role.from_name(r),
                "gender": lambda s: Sex.from_name(s),
                "sex": lambda s: Sex.from_name(s),
            },
            dtype={
                "familyId": str,
                "personId": str,
            },
            comment="#",
        )

        fam_df = fam_df.rename(columns={
            "gender": "sex",
            "personId": "person_id",
            "familyId": "family_id",
            "momId": "mom_id",
            "dadId": "dad_id",
            "sampleId": "sample_id",
        })

        # Probands get status 2; everyone else defaults to status 1.
        fam_df["status"] = pd.Series(index=fam_df.index, data=1)
        fam_df.loc[fam_df.role == Role.prb, "status"] = 2
        fam_df["status"] = fam_df.status.apply(lambda s: Status.from_value(s))

        # Parent links are derived below from roles, so start with "0"
        # (the pedigree convention for "no parent").
        fam_df["mom_id"] = pd.Series(index=fam_df.index, data="0")
        fam_df["dad_id"] = pd.Series(index=fam_df.index, data="0")

        if "sample_id" not in fam_df.columns:
            fam_df["sample_id"] = pd.Series(data=fam_df["person_id"].values)

        families = defaultdict(list)
        for rec in fam_df.to_dict(orient="records"):
            families[rec["family_id"]].append(rec)

        result = {}
        for fam_id, members in families.items():
            mom_id = None
            dad_id = None
            children = []
            for member in members:
                role = member["role"]
                if role == Role.mom:
                    mom_id = member["person_id"]
                elif role == Role.dad:
                    dad_id = member["person_id"]
                else:
                    assert role in (Role.prb, Role.sib)
                    children.append(member)
            # Wire every child to the family's mom and dad.
            for child in children:
                child["mom_id"] = mom_id
                child["dad_id"] = dad_id

            result[fam_id] = [Person(**member) for member in members]

        return FamiliesData.from_family_persons(result)
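
To make the expected input concrete, here is a hedged sketch of a minimal "simple families" file in the layout this loader reads (tab-separated in a real file; the family below is made up, and the "M"/"F" spellings are assumed to be accepted by Sex.from_name):

familyId    personId    sex    role
f1          f1.mom      F      mom
f1          f1.dad      M      dad
f1          f1.p1       M      prb

The loader derives the rest itself: the proband's status becomes 2 and the other members get 1, each child's mom_id/dad_id are filled in from the family's mom and dad roles, and a sampleId column is optional, defaulting to personId.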
Code Example #6
def roles_converter(a):
    # Pass Role members through unchanged; convert role names to Role.
    if not isinstance(a, Role):
        return Role.from_name(a)
    return a
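
A short hedged usage note: because Role values pass through unchanged, the converter can be applied uniformly to mixed input (the role names below are ones that appear in the other examples):

mixed = ["prb", Role.sib, "mom"]
roles = [roles_converter(r) for r in mixed]  # every entry ends up a Role member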
Code Example #7
def main(gpf_instance=None, argv=None):
    description = "Generate autism gene profile statistics tool"
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--verbose', '-V', '-v', action='count', default=0)
    default_dbfile = os.path.join(os.getenv("DAE_DB_DIR", "./"), "agpdb")
    parser.add_argument("--dbfile", default=default_dbfile)
    parser.add_argument(
        "--gene-sets-genes",
        action="store_true",
        help="Generate AGPs only for genes contained in the config's gene sets"
    )
    parser.add_argument(
        "--genes",
        help="Comma separated list of genes to generate statistics for")
    parser.add_argument("--drop", action="store_true")

    args = parser.parse_args(argv)
    if args.verbose == 1:
        logging.basicConfig(level=logging.WARNING)
    elif args.verbose == 2:
        logging.basicConfig(level=logging.INFO)
    elif args.verbose >= 3:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.ERROR)
    logging.getLogger("impala").setLevel(logging.WARNING)

    start = time.time()
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    config = gpf_instance._autism_gene_profile_config

    # Collect the configured gene sets as (collection_id, gene_set) pairs.
    collections_gene_sets = []
    for gs_category in config.gene_sets:
        for gs in gs_category.sets:
            gs_id = gs["set_id"]
            collection_id = gs["collection_id"]

            collections_gene_sets.append(
                (collection_id,
                 gpf_instance.gene_sets_db.get_gene_set(collection_id, gs_id)))

    logger.info(f"collected gene sets: {len(collections_gene_sets)}")

    gene_symbols = set()
    if args.genes:
        gene_symbols = {gs.strip() for gs in args.genes.split(",")}
    elif args.gene_sets_genes:
        for _, gs in collections_gene_sets:
            gene_symbols = gene_symbols.union(gs["syms"])
    else:
        gene_models = gpf_instance.get_genome().get_gene_models().gene_models
        gene_symbols = set(gene_models.keys())
    gs_count = len(gene_symbols)
    logger.info(f"Collected {gs_count} gene symbols")
    has_denovo = False
    has_rare = False
    person_ids = dict()
    for dataset_id, filters in config.datasets.items():
        genotype_data = gpf_instance.get_genotype_data(dataset_id)
        assert genotype_data is not None, dataset_id
        person_ids[dataset_id] = dict()
        for ps in filters.person_sets:
            person_set_query = (ps.collection_name, [ps.set_name])
            person_ids[dataset_id][ps.set_name] = \
                genotype_data._transform_person_set_collection_query(
                    person_set_query, None
                )
        for stat in filters.statistics:
            if stat.category == "denovo":
                has_denovo = True
            elif stat.category == "rare":
                has_rare = True

    agps = dict()
    gene_symbols = list(gene_symbols)
    gs_count = len(gene_symbols)
    elapsed = time.time() - start
    logger.info(f"data collected: {elapsed:.2f} secs")

    start = time.time()
    for idx, sym in enumerate(gene_symbols, 1):
        gs, agp = generate_agp(gpf_instance, sym, collections_gene_sets)
        agps[gs] = agp
        if idx % 25 == 0:
            elapsed = time.time() - start
            logger.info(f"Generated {idx}/{gs_count} AGP statistics "
                        f"{elapsed:.2f} secs")

    logger.info("Done generating AGP statistics!")
    generate_end = time.time()
    elapsed = generate_end - start
    logger.info(f"Took {elapsed:.2f} secs")

    if has_denovo:
        logger.info("Collecting denovo variants")
        denovo_variants = dict()
        for dataset_id, filters in config.datasets.items():
            genotype_data = gpf_instance.get_genotype_data(dataset_id)
            assert genotype_data is not None, dataset_id
            if args.gene_sets_genes or args.genes:
                genes = gene_symbols
            else:
                genes = None

            denovo_variants[dataset_id] = list(
                genotype_data.query_variants(genes=genes,
                                             inheritance="denovo"))
        logger.info("Done collecting denovo variants")
        logger.info("Counting denovo variants...")
        fill_variant_counts(denovo_variants, agps, config, person_ids, True)
        logger.info("Done counting denovo variants")

    if has_rare:
        logger.info("Collecting rare variants")
        rare_variants = dict()
        for dataset_id, filters in config.datasets.items():
            genotype_data = gpf_instance.get_genotype_data(dataset_id)
            assert genotype_data is not None, dataset_id
            if args.gene_sets_genes or args.genes:
                genes = gene_symbols
            else:
                genes = None

            rare_variants[dataset_id] = []
            for statistic in filters.statistics:
                if statistic.category == "denovo":
                    continue
                kwargs = dict()
                kwargs["roles"] = "prb or sib"

                if statistic.effects is not None:
                    kwargs["effect_types"] = \
                        expand_effect_types(statistic.effects)

                if statistic.variant_types:
                    variant_types = [
                        VariantType.from_name(statistic.variant_types).repr()
                    ]
                    kwargs["variant_type"] = " or ".join(variant_types)

                if statistic.scores:
                    scores = []
                    for score in statistic.scores:
                        min_max = (score.min, score.max)
                        score_filter = (score.name, min_max)
                        scores.append(score_filter)
                    kwargs["real_attr_filter"] = scores

                if statistic.roles:
                    # Override the default "prb or sib" roles filter.
                    roles = [Role.from_name(statistic.roles).repr()]
                    kwargs["roles"] = " or ".join(roles)

                rare_variants[dataset_id].extend(
                    list(
                        genotype_data.query_variants(
                            genes=genes,
                            inheritance=[
                                "not denovo and "
                                "not possible_denovo and not possible_omission",
                                "mendelian or missing"
                            ],
                            frequency_filter=[("af_allele_freq", (None, 1.0))],
                            **kwargs)))
        logger.info("Done collecting rare variants")
        logger.info("Counting rare variants...")
        fill_variant_counts(rare_variants, agps, config, person_ids, False)
        logger.info("Done counting rare variants")

    logger.info("Calculating rates...")
    calculate_rates(gpf_instance, agps, config)
    logger.info("Done calculating rates")
    elapsed = time.time() - generate_end
    logger.info(f"Took {elapsed:.2f} secs")

    agpdb = AutismGeneProfileDB(
        gpf_instance._autism_gene_profile_config.to_dict(),
        args.dbfile,
        clear=True)

    agpdb.clear_all_tables()
    agpdb.populate_data_tables(gpf_instance.get_genotype_data_ids())
    logger.info("Inserting statistics into DB")
    agpdb.insert_agps(agps.values())
    logger.info("Building AGP output view")
    agpdb.build_agp_view()
    logger.info("Generating cache table")
    agpdb.generate_cache_table()
    logger.info("Done")
Code Example #8
def count_variant(v, dataset_id, agps, config, person_ids, denovo_flag):
    filters = config.datasets[dataset_id]
    members = set()

    for aa in v.alt_alleles:
        for member in aa.variant_in_members:
            if member is not None:
                members.add(member)

    for ps in filters.person_sets:
        pids = set(person_ids[dataset_id][ps.set_name])
        for statistic in filters.statistics:
            # "dump" records the per-filter decisions; useful when debugging
            # why a variant was or was not counted.
            dump = {}
            if statistic.category == "denovo" and not denovo_flag:
                continue
            if statistic.category == "rare" and denovo_flag:
                continue

            stat_id = statistic.id
            do_count = True

            in_members = len(pids.intersection(members)) > 0

            do_count = do_count and in_members
            dump[1] = do_count

            if statistic.get("effects"):
                ets = set(expand_effect_types(statistic.effects))
                in_effect_types = len(ets.intersection(v.effect_types)) > 0
                do_count = do_count and in_effect_types
                dump[2] = do_count

            if statistic.get("scores"):
                for score in statistic.scores:
                    score_name = score["name"]
                    score_min = score.get("min")
                    score_max = score.get("max")
                    score_value = v.get_attribute(score_name)[0]

                    if score_value is None:
                        # A missing score would make the comparisons below
                        # raise, so reject and skip this score.
                        do_count = False
                        continue

                    if score_min:
                        do_count = do_count and score_value >= score_min
                    if score_max:
                        do_count = do_count and score_value <= score_max

                dump[3] = do_count

            if statistic.get("category") == "rare":
                aa = v.alt_alleles[0]
                freq = aa.get_attribute("af_allele_freq")

                if freq:
                    do_count = do_count and freq <= 1.0
                dump[4] = do_count

            if statistic.get("variant_types"):
                variant_types = {
                    VariantType.from_name(t)
                    for t in statistic.variant_types
                }
                do_count = do_count and \
                    len(variant_types.intersection(v.variant_types)) > 0
                dump[5] = do_count

            if statistic.get("roles"):
                roles = {Role.from_name(r) for r in statistic.roles}
                v_roles = set(v.alt_alleles[0].variant_in_roles)
                do_count = do_count and \
                    len(v_roles.intersection(roles)) > 0
                dump[6] = do_count

            if do_count:
                add_variant_count(v, agps, dataset_id, ps.set_name, stat_id)
Code Example #9
    def get_members_with_roles(self, roles):
        # Accept either Role members or role names.
        if not isinstance(roles[0], Role):
            roles = [Role.from_name(role) for role in roles]
        return list(filter(lambda m: m.role in roles, self.members_in_order))
Code Example #10
File: test_roles.py  Project: iossifovlab/gpf
def test_roles_simple(name, role):
    assert Role.from_name(name) == role
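
The (name, role) parameters imply a pytest parametrization that was not captured in this snippet; a hedged, purely illustrative reconstruction, restricted to role names that appear in the other examples, could look like this:

import pytest

@pytest.mark.parametrize("name,role", [
    ("prb", Role.prb),
    ("sib", Role.sib),
    ("mom", Role.mom),
    ("dad", Role.dad),
])
def test_roles_simple(name, role):
    assert Role.from_name(name) == role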