コード例 #1
0
def parse_siblings(dir_data, logger=None, f_mids='mid.csv'):
    """
    Parse mixed-gender sibling (brother-sister) pairs from the member ID LUTs.

    Entries of a family's relationship matrix equal to 2 (the sibling RID) are collected as candidate pairs.
    Pairs whose two members share a gender label (both containing 'm' or both containing 'f') are dropped,
    so only brother-sister pairs remain.

    :param dir_data:        Directory containing folders of FIDs (e.g., F0001/, ..., F????/).
    :param logger:          Logger used for error reporting; a module-level logger is used when None.
    :param f_mids:          Name of the member ID CSV file passed to db.load_mids.
    :return:                List of db.Pair objects of kind 'siblings' holding 1-based member IDs.
    """
    import logging

    if logger is None:
        # The default used to be dereferenced directly, raising AttributeError on the first log call.
        logger = logging.getLogger(__name__)

    kind = 'siblings'
    # family directories
    dirs_fid, fid_list = load_fids(dir_data)

    print("{} families are being processed".format(len(fid_list)))
    # Load MID LUT for all FIDs.
    df_mids = db.load_mids(dirs_fid, f_csv=f_mids)

    siblings = []
    for i, fid in enumerate(fid_list):
        rel_mat = db.parse_relationship_matrices(df_mids[i])
        genders = list(df_mids[i].Gender)

        success, genders = helpers.check_gender_label(genders)
        if not success:
            # Reported but not fatal: processing continues with the labels the helper returned.
            logger.error("Gender notation incorrect for {}".format(fid))

        sibling_ids = np.where(rel_mat == 2)

        if not helpers.check_npairs(len(sibling_ids[1]), kind, fid):
            continue

        sib_ids = db.get_unique_pairs(sibling_ids)

        # Collect brother-brother / sister-sister pairs so they can be filtered out below.
        same_gender = set()
        for pair in sib_ids:
            first, second = genders[pair[0]], genders[pair[1]]
            if ('m' in first and 'm' in second) or ('f' in first and 'f' in second):
                print("Removing", pair)
                same_gender.add(pair)

        mixed_pairs = [pair for pair in set(sib_ids) if pair not in same_gender]

        for numbered_pair in enumerate(mixed_pairs):
            print(numbered_pair)
            # Matrix indices are 0-based; member IDs are 1-based.
            mids = list(np.array(numbered_pair[1]) + 1)
            siblings.append(db.Pair(mids=mids, fid=fid, kind=kind))

    return siblings
コード例 #2
0
def load_families(dir_fids, f_mids='mid.csv'):
    """
    Build one Family object per family folder found under the DB root.

    :param dir_fids: root folder containing FID/MID/ folders of DB.
    :param f_mids:   name of the CSV file handed to db.load_mids and db.load_relationship_matrices.
    :return:         list of Family objects, one per loaded MID table.
    """
    # Every folder matching F???? directly under the root is treated as a family.
    family_dirs = glob.glob(dir_fids + '/F????/')
    # The five characters before the trailing separator are the family ID.
    family_ids = [path[-6:-1] for path in family_dirs]

    lut = db.load_fids()
    mid_tables = db.load_mids(family_dirs, f_csv=f_mids)

    print("{} families are being processed".format(len(mid_tables)))

    rel_matrices = db.load_relationship_matrices(family_dirs, f_csv=f_mids)

    families = []
    for idx, mid_table in enumerate(mid_tables):
        family_id = family_ids[idx]
        matrix = rel_matrices[idx]
        member_count = mid_table['MID'].max()

        # Look the surname up in the FID table by position of the family ID.
        lut_row = list(lut['FIDs']).index(family_id)
        family_name = str(np.array(lut['surnames'])[lut_row])

        families.append(
            Family(
                surname=family_name,
                fid=family_id,
                nmember=member_count,
                mid_lut=mid_table,
                relationship_matrix=matrix,
            )
        )

    return families
コード例 #3
0
def parse_sisters(dir_data, logger=None, f_mids='mid.csv'):
    """
    Parse sister pairs by referencing member ID LUT and relationship matrix.

    Siblings RID is 2 and sisters are female, so the relationship matrix is first restricted by gender via
    db.specify_gender(..., 'f') and the remaining entries equal to 2 are taken as sister pairs.

    :param dir_data:        Directory containing folders of FIDs (e.g., F0001/, ..., F????/).
    :param logger:          Logger used for progress and error reporting; a module-level logger is used when None.
    :param f_mids:          Name of the member ID CSV file passed to db.load_mids.
    :return:                Accumulated result of db.set_pairs over all families (the sister pairs).
    """
    import logging

    if logger is None:
        # The default used to be dereferenced directly, raising AttributeError on the first log call.
        logger = logging.getLogger(__name__)

    kind = 'sisters'
    # family directories
    dirs_fid, fid_list = load_fids(dir_data)

    logger.info("{} families are being processed".format(len(fid_list)))
    # Load MID LUT for all FIDs.
    df_mids = db.load_mids(dirs_fid, f_csv=f_mids)

    sisters = []
    for i, fid in enumerate(fid_list):
        rel_mat = db.parse_relationship_matrices(df_mids[i])
        genders = list(df_mids[i].Gender)
        success, genders = helpers.check_gender_label(genders)
        if not success:
            # Reported but not fatal: processing continues with the labels the helper returned.
            logger.error("Gender notation incorrect for {}".format(fid))

        # Presumably zeroes out rows/columns of non-female members -- TODO confirm against db.specify_gender.
        rel_mat = db.specify_gender(rel_mat, genders, 'f')

        sister_ids = np.where(rel_mat == 2)

        if not helpers.check_npairs(len(sister_ids[1]), kind, fid):
            continue

        # add to list of sisters
        sisters = db.set_pairs(sisters, sister_ids, kind, fid)

    return sisters
コード例 #4
0
def parse_grandparents(dir_data, logger=None, f_mids='mid.csv'):
    """
    Parse grandparent-grandchild pairs by referencing member ID LUT and relationship matrix.

    Entries of a family's relationship matrix equal to 6 are read as (grandparent, grandchild) index pairs;
    entries equal to 3 are only counted, to sanity-check that both directions are present. Each pair is
    sorted into one of four lists by the gender labels of its two members ('f' in the label means female).

    :param dir_data:        Directory containing folders of FIDs (e.g., F0001/, ..., F????/).
    :param logger:          Logger used for progress and error reporting; a module-level logger is used when None.
    :param f_mids:          Name of the member ID CSV file passed to db.load_mids.
    :return:                Tuple (gfgd, gfgs, gmgd, gmgs) of lists of db.Pair: grandfather-granddaughter,
                            grandfather-grandson, grandmother-granddaughter and grandmother-grandson.
    """
    import logging

    if logger is None:
        # The default used to be dereferenced directly, raising AttributeError on the first log call.
        logger = logging.getLogger(__name__)

    # family directories
    dirs_fid, fid_list = load_fids(dir_data)

    logger.info("{} families are being processed".format(len(fid_list)))
    # Load MID LUT for all FIDs.
    df_mids = db.load_mids(dirs_fid, f_csv=f_mids)

    gfgd = []
    gfgs = []
    gmgd = []
    gmgs = []
    # (grandparent is female, grandchild is female) -> (pair kind, list that is returned under that name).
    # Keeping label and destination side by side prevents them from drifting apart again.
    buckets = {
        (False, True): ('gfgd', gfgd),
        (False, False): ('gfgs', gfgs),
        (True, True): ('gmgd', gmgd),
        (True, False): ('gmgs', gmgs),
    }
    # NOTE(review): label forwarded to helpers.check_npairs only; kept as in the original.
    kind = 'parent-child'
    for i, fid in enumerate(fid_list):
        rel_mat = db.parse_relationship_matrices(df_mids[i])
        genders = list(df_mids[i].Gender)
        success, genders = helpers.check_gender_label(genders)
        if not success:
            # Reported but not fatal: processing continues with the labels the helper returned.
            logger.error("Gender notation incorrect for {}".format(fid))

        c_ids = np.where(rel_mat == 3)
        p_ids = np.where(rel_mat == 6)
        if len(c_ids[0]) != len(p_ids[0]):
            logger.warning("Number of children and parents are different.")

        if not helpers.check_npairs(len(c_ids[0]), kind, fid):
            continue

        for gp_idx, gc_idx in zip(p_ids[0], p_ids[1]):
            print((gp_idx, gc_idx))
            # Matrix indices are 0-based; member IDs are 1-based.
            p_mid = gp_idx + 1
            c_mid = gc_idx + 1

            gp_is_female = 'f' in genders[gp_idx]
            gc_is_female = 'f' in genders[gc_idx]

            pair_kind, destination = buckets[(gp_is_female, gc_is_female)]
            destination.append(db.Pair(mids=(p_mid, c_mid), fid=fid, kind=pair_kind))

    return gfgd, gfgs, gmgd, gmgs
コード例 #5
0
def tri_subjects(dir_data, logger=None, f_mids='mid.csv'):
    """
    Build (father, mother, child) triplets by referencing member ID LUT and relationship matrix.

    Entries of a family's relationship matrix equal to 1 are read as (child, parent) index pairs and entries
    equal to 4 as the mirrored (parent, child) pairs. Children with exactly two parents of different gender
    yield one triplet of "FID/MIDn" strings, ordered father, mother, child.

    :param dir_data:        Directory containing folders of FIDs (e.g., F0001/, ..., F????/).
    :param logger:          Logger used for warnings and errors; a module-level logger is used when None.
    :param f_mids:          Name of the member ID CSV file passed to db.load_mids.
    :return:                Tuple (fmd, fms): triplets whose child is a daughter, and triplets whose child is a son.
    """
    import logging

    if logger is None:
        # The default used to be dereferenced directly, raising AttributeError on the first log call.
        logger = logging.getLogger(__name__)

    # family directories
    dirs_fid, fid_list = load_fids(dir_data)

    print("{} families are being processed".format(len(fid_list)))
    # Load MID LUT for all FIDs.
    df_mids = db.load_mids(dirs_fid, f_csv=f_mids)

    fms = []
    fmd = []
    for i, fid in enumerate(fid_list):
        print(fid)
        rel_mat = db.parse_relationship_matrices(df_mids[i])
        # NOTE(review): unlike the sibling/grandparent parsers in this file, the labels are not passed through
        # helpers.check_gender_label here -- confirm they already use the lowercase 'm'/'f' notation.
        genders = list(df_mids[i].Gender)

        c_ids = np.where(rel_mat == 1)
        p_ids = np.where(rel_mat == 4)

        # np.where on a matrix returns one index array per axis, so the tuple itself is never empty;
        # the number of hits is the length of either array.
        if len(c_ids[0]) == 0:
            logger.warning("No pair of parents for child in {}.".format(fid))
            continue

        # Every child->parent entry must have its parent->child mirror, and vice versa.
        if set(p_ids[1]) ^ set(c_ids[0]) or set(c_ids[1]) ^ set(p_ids[0]):
            logger.error("Unmatched pair in {}.".format(fid))
            continue
        ch_ids = list(zip(c_ids[0], c_ids[1]))

        cp_pairs = group_child_parents(ch_ids)

        for cid, pids in cp_pairs.items():
            if len(pids) != 2:
                if len(pids) > 2:
                    logger.error("{} parents in {}. {}".format(len(pids), fid, pids))
                continue

            try:
                p_genders = [genders[pids[0][1]], genders[pids[1][1]]]
            except IndexError:
                # Previously swallowed, after which an undefined or stale p_genders was used.
                logger.error("Parent index out of range in {}. {}".format(fid, pids))
                continue

            # Order the parents as (father, mother); member IDs are 1-based.
            if "m" in p_genders[0] and "f" in p_genders[1]:
                pars_ids = (pids[0][1] + 1, pids[1][1] + 1)
            elif "m" in p_genders[1] and "f" in p_genders[0]:
                pars_ids = (pids[1][1] + 1, pids[0][1] + 1)
            else:
                logger.error("Parents of same gender in {}. {}".format(fid, pids))
                continue

            cmid = "{}/MID{}".format(fid, cid + 1)
            fmid = "{}/MID{}".format(fid, pars_ids[0])
            mmid = "{}/MID{}".format(fid, pars_ids[1])

            if "m" in genders[cid]:
                fms.append((fmid, mmid, cmid))
            else:
                fmd.append((fmid, mmid, cmid))

    return fmd, fms