def parse_siblings(dir_data, logger=None, f_mids='mid.csv'):
    """
    Parse mixed-gender sibling pairs from the member ID LUT of every family.

    Sibling relationships carry RID 2 in the relationship matrix. Pairs whose
    two members carry the same gender label (brother-brother or sister-sister)
    are dropped, so only pairs of differing gender are returned.

    :param dir_data: Directory containing folders of FIDs (e.g., F0001/, ..., F????/).
    :param logger: Logger used to report bad gender labels. When None, a logger
                   from the standard logging package is used instead.
    :param f_mids: Name of the CSV file holding the member ID LUT of each family.
    :return: List of db.Pair objects of kind 'siblings'; MIDs are 1-based.
    """
    import logging

    if logger is None:
        # The default argument used to crash on the first reported problem.
        logger = logging.getLogger(__name__)

    kind = 'siblings'
    dirs_fid, fid_list = load_fids(dir_data)
    print("{} families are being processed".format(len(fid_list)))

    # One MID LUT per family, in the same order as fid_list.
    df_mids = db.load_mids(dirs_fid, f_csv=f_mids)

    siblings = []
    for i, fid in enumerate(fid_list):
        rel_mat = db.parse_relationship_matrices(df_mids[i])
        genders = list(df_mids[i].Gender)

        success, genders = helpers.check_gender_label(genders)
        if not success:
            logger.error("Gender notation incorrect for {}".format(fid))

        sibling_ids = np.where(rel_mat == 2)
        if not helpers.check_npairs(len(sibling_ids[1]), kind, fid):
            continue

        unique_pairs = db.get_unique_pairs(sibling_ids)
        candidates = list(set(unique_pairs))

        # Collect same-gender pairs first; filtering afterwards (rather than
        # calling list.remove) cannot fail when a pair shows up twice.
        same_gender = set()
        for pair in unique_pairs:
            first, second = genders[pair[0]], genders[pair[1]]
            if ('m' in first and 'm' in second) or ('f' in first and 'f' in second):
                print("Removing", pair)
                same_gender.add(pair)

        mixed_pairs = [pair for pair in candidates if pair not in same_gender]
        for entry in enumerate(mixed_pairs):
            print(entry)
            # Matrix indices are 0-based while MIDs start at 1.
            mids = list(np.array(entry[1]) + 1)
            siblings.append(db.Pair(mids=mids, fid=fid, kind=kind))
    return siblings
def load_families(dir_fids, f_mids='mid.csv'):
    """
    Build one Family record for every FID folder found under dir_fids.

    :param dir_fids: root folder containing FID/MID/ folders of DB.
    :param f_mids: name of the CSV file in each FID folder; it is handed to both
                   the MID LUT loader and the relationship matrix loader.
    :return: list of Family objects, one per loaded MID LUT.
    :raises ValueError: if a FID folder has no entry in the FID LUT.
    """
    # The glob pattern ends with a separator, so the five characters in front
    # of the last one are the FID itself (e.g. 'F0001').
    dirs_fid = glob.glob(dir_fids + '/F????/')
    fids = [d[-6:-1] for d in dirs_fid]

    fid_lut = db.load_fids()

    # Load MID LUT for all FIDs.
    df_mids = db.load_mids(dirs_fid, f_csv=f_mids)
    print("{} families are being processed".format(len(df_mids)))

    # Load relationship matrices for all FIDs.
    relationship_matrices = db.load_relationship_matrices(dirs_fid, f_csv=f_mids)

    # Index the LUT once instead of rebuilding and scanning it per family.
    # setdefault keeps the first row of a repeated FID, like list.index did.
    surnames = np.array(fid_lut['surnames'])
    row_of_fid = {}
    for row, lut_fid in enumerate(fid_lut['FIDs']):
        row_of_fid.setdefault(lut_fid, row)

    fams = []
    for i, mids in enumerate(df_mids):
        fid = fids[i]
        if fid not in row_of_fid:
            # Same exception type the former list.index lookup raised.
            raise ValueError("{} is not in the FID LUT".format(fid))
        fams.append(Family(surname=str(surnames[row_of_fid[fid]]),
                           fid=fid,
                           nmember=mids['MID'].max(),
                           mid_lut=mids,
                           relationship_matrix=relationship_matrices[i]))
    return fams
def parse_sisters(dir_data, logger=None, f_mids='mid.csv'):
    """
    Parse sister pairs by referencing the member ID LUT of every family.

    Sibling relationships carry RID 2 in the relationship matrix; the matrix is
    first passed through db.specify_gender with 'f' so that, presumably, only
    relationships between female members remain (confirm against that helper).

    :param dir_data: Directory containing folders of FIDs (e.g., F0001/, ..., F????/).
    :param logger: Logger for progress and bad gender labels. When None, a
                   logger from the standard logging package is used instead.
    :param f_mids: Name of the CSV file holding the member ID LUT of each family.
    :return: Whatever db.set_pairs accumulates for kind 'sisters' (starts as an
             empty list).
    """
    import logging

    if logger is None:
        # With the default argument the very first log call used to raise.
        logger = logging.getLogger(__name__)

    kind = 'sisters'
    dirs_fid, fid_list = load_fids(dir_data)
    logger.info("{} families are being processed".format(len(fid_list)))

    # One MID LUT per family, in the same order as fid_list.
    df_mids = db.load_mids(dirs_fid, f_csv=f_mids)

    sisters = []
    for i, fid in enumerate(fid_list):
        rel_mat = db.parse_relationship_matrices(df_mids[i])
        genders = list(df_mids[i].Gender)

        success, genders = helpers.check_gender_label(genders)
        if not success:
            logger.error("Gender notation incorrect for {}".format(fid))

        # Restrict the matrix to the requested gender before looking for RID 2.
        rel_mat = db.specify_gender(rel_mat, genders, 'f')
        sister_ids = np.where(rel_mat == 2)

        if not helpers.check_npairs(len(sister_ids[1]), kind, fid):
            continue

        # Add this family's pairs to the running list of sisters.
        sisters = db.set_pairs(sisters, sister_ids, kind, fid)
    return sisters
def parse_grandparents(dir_data, logger=None, f_mids='mid.csv'):
    """
    Parse grandparent-grandchild pairs, split by the gender of both members.

    Entries equal to 6 in the relationship matrix are read as (grandparent,
    grandchild) index pairs; entries equal to 3 are only counted, to warn when
    the two directions disagree. A member counts as female when its gender
    label contains 'f', otherwise it is treated as male.

    :param dir_data: Directory containing folders of FIDs (e.g., F0001/, ..., F????/).
    :param logger: Logger for progress and data problems. When None, a logger
                   from the standard logging package is used instead.
    :param f_mids: Name of the CSV file holding the member ID LUT of each family.
    :return: Tuple (gfgd, gfgs, gmgd, gmgs) of lists of db.Pair objects:
             grandfather-granddaughter, grandfather-grandson,
             grandmother-granddaughter and grandmother-grandson. MIDs are 1-based.
    """
    import logging

    if logger is None:
        # With the default argument the very first log call used to raise.
        logger = logging.getLogger(__name__)

    dirs_fid, fid_list = load_fids(dir_data)
    logger.info("{} families are being processed".format(len(fid_list)))

    # One MID LUT per family, in the same order as fid_list.
    df_mids = db.load_mids(dirs_fid, f_csv=f_mids)

    gfgd = []
    gfgs = []
    gmgd = []
    gmgs = []
    kind = 'parent-child'
    for i, fid in enumerate(fid_list):
        rel_mat = db.parse_relationship_matrices(df_mids[i])
        genders = list(df_mids[i].Gender)

        success, genders = helpers.check_gender_label(genders)
        if not success:
            logger.error("Gender notation incorrect for {}".format(fid))

        c_ids = np.where(rel_mat == 3)
        p_ids = np.where(rel_mat == 6)
        if len(c_ids[0]) != len(p_ids[0]):
            logger.warning("Number of children and parents are different.")

        if not helpers.check_npairs(len(c_ids[0]), kind, fid):
            continue

        for p in zip(list(p_ids[0]), list(p_ids[1])):
            print(p)
            # Matrix indices are 0-based while MIDs start at 1.
            p_mid = p[0] + 1
            c_mid = p[1] + 1
            grandparent_is_female = 'f' in genders[p_mid - 1]
            grandchild_is_female = 'f' in genders[c_mid - 1]

            # The list and the kind label are chosen together so they can
            # never disagree with each other or with the gender test.
            if grandparent_is_female:
                if grandchild_is_female:
                    pair_kind, bucket = 'gmgd', gmgd
                else:
                    pair_kind, bucket = 'gmgs', gmgs
            else:
                if grandchild_is_female:
                    pair_kind, bucket = 'gfgd', gfgd
                else:
                    pair_kind, bucket = 'gfgs', gfgs
            bucket.append(db.Pair(mids=(p_mid, c_mid), fid=fid, kind=pair_kind))
    return gfgd, gfgs, gmgd, gmgs
def tri_subjects(dir_data, logger=None, f_mids='mid.csv'):
    """
    Parse (father, mother, child) triplets from the member ID LUT of every family.

    Entries equal to 1 in the relationship matrix are read as (child, parent)
    index pairs and entries equal to 4 as the mirrored (parent, child) pairs; a
    family is skipped when the two do not line up. Only children with exactly
    two parents of differing gender produce a triplet.

    :param dir_data: Directory containing folders of FIDs (e.g., F0001/, ..., F????/).
    :param logger: Logger for data problems. When None, a logger from the
                   standard logging package is used instead.
    :param f_mids: Name of the CSV file holding the member ID LUT of each family.
    :return: Tuple (fmd, fms) of lists of (father, mother, child) tuples, each
             member written as "<FID>/MID<n>" with 1-based n. fms holds the
             children whose gender label contains 'm', fmd all the others.
    """
    import logging

    if logger is None:
        # The default argument used to crash on the first reported problem.
        logger = logging.getLogger(__name__)

    dirs_fid, fid_list = load_fids(dir_data)
    print("{} families are being processed".format(len(fid_list)))

    # One MID LUT per family, in the same order as fid_list.
    df_mids = db.load_mids(dirs_fid, f_csv=f_mids)

    fms = []
    fmd = []
    for i, fid in enumerate(fid_list):
        print(fid)
        rel_mat = db.parse_relationship_matrices(df_mids[i])
        # NOTE(review): unlike the other parsers in this file, the labels are
        # not passed through helpers.check_gender_label here, so the 'm'/'f'
        # tests below see the raw CSV values - confirm that is intended.
        genders = list(df_mids[i].Gender)

        c_ids = np.where(rel_mat == 1)
        p_ids = np.where(rel_mat == 4)

        # np.where returns one index array per axis, so the number of matches
        # is the size of the first array, not the length of the tuple.
        if c_ids[0].size == 0:
            logger.warning("No pair of parents for child in {}.".format(fid))
            continue

        # Every child->parent entry needs its parent->child mirror.
        if set(p_ids[1]) != set(c_ids[0]) or set(c_ids[1]) != set(p_ids[0]):
            logger.error("Unmatched pair in {}.".format(fid))
            continue

        ch_ids = list(zip(list(c_ids[0]), list(c_ids[1])))
        cp_pairs = group_child_parents(ch_ids)
        for cid, pids in cp_pairs.items():
            if len(pids) != 2:
                if len(pids) > 2:
                    logger.error("{} parents in {}. {}".format(len(pids), fid, pids))
                continue

            # Each entry of pids is indexed with [1] to get the parent's
            # 0-based position (assumed to be a (child, parent) pair).
            first, second = pids[0][1], pids[1][1]
            try:
                p_genders = [genders[first], genders[second]]
            except IndexError:
                # Skip the child; falling through would reuse the genders of
                # the previous child or fail on an undefined name.
                logger.error("Parent index out of range in {}. {}".format(fid, pids))
                continue

            if "m" in p_genders[0] and "f" in p_genders[1]:
                pars_ids = (first + 1, second + 1)
            elif "m" in p_genders[1] and "f" in p_genders[0]:
                pars_ids = (second + 1, first + 1)
            else:
                logger.error("Parents of same gender in {}. {}".format(fid, pids))
                continue

            cmid = "{}/MID{}".format(fid, cid + 1)
            fmid = "{}/MID{}".format(fid, pars_ids[0])
            mmid = "{}/MID{}".format(fid, pars_ids[1])

            if "m" in genders[cid]:
                fms.append((fmid, mmid, cmid))
            else:
                fmd.append((fmid, mmid, cmid))
    return fmd, fms