Esempio n. 1
0
class SGE(object):
	'''This class can create the scripts required for submitting and running an array job on a computer cluster with a Sungrid Engine job scheduler.'''
	
	def __init__(self, scripts_dir):
		'''Create a logger and remember where the job scripts are written.

		Args:
			scripts_dir (str): directory in which the job scripts are created.'''
		self.logger, self.scripts_dir = Logger(), scripts_dir
		
		
	def make_map_reduce_jobs(self, prep, map_task_l, reduce_task, map_task_exec_l=[], mem="14G"):

		'''Delete existing .sh files in the scripts dir.'''
		for f in glob.glob(os.path.join(self.scripts_dir,"*.sh")):
			os.remove(f)
	
		'''Make map.sh'''
		map_file = os.path.join(self.scripts_dir,".".join([prep,"map","sh"]))
		map_stream = open(map_file,'w')
		map_stream.write(self.get_sge_job_txt(prep, map_cmd_n=len(map_task_exec_l), mem=mem))
		map_stream.close()
	
		'''Make reduce.sh'''
		reduce_file = os.path.join(self.scripts_dir,".".join([prep,"reduce","sh"]))
		reduce_stream = open(reduce_file,'w')
		reduce_stream.write(self.get_sge_job_txt(prep, reduce_task=reduce_task, mem=mem))
		reduce_stream.close()
		
		'''Make bash task scripts for each map task and a text file containing references to them.'''
		self.logger.log("# of map tasks to execute: {0}/{1}".format(len(map_task_exec_l),len(map_task_l)))
		map_task_exec_stream = open(os.path.join(self.scripts_dir,".".join([prep,"map_task_exec","txt"])), 'w') 	
		for i in range(len(map_task_l)):
			task_script = os.path.join(self.scripts_dir,".".join([prep,str(i+1),"sh"]))
			task_stream = open(task_script, 'w')
			task_stream.write("#!/bin/bash\n\n")
			task_stream.write(map_task_l[i])
			task_stream.close()
			if not map_task_exec_l == False:
				if map_task_l[i] in map_task_exec_l:
					map_task_exec_stream.write("bash " + task_script + "\n")
			else:
				map_task_exec_stream.write("bash " + task_script + "\n") 
		map_task_exec_stream.close()	
	
		'''Make the submit script.'''
		submit_stream = open(os.path.join(self.scripts_dir,"submit_map_reduce.sh"),'w')
		submit_stream_l = ["#!/bin/bash"]
		submit_stream_l.append("\n")
		if len(map_task_exec_l) == 0:
			submit_stream_l.append(" ".join(["qsub",reduce_file]))
		else:
			submit_stream_l.append(" ".join(["qsub","-N",prep+".map","-cwd",map_file]))
			submit_stream_l.append(" ".join(["qsub","-hold_jid",prep+".map","-cwd",reduce_file]))
		submit_stream.write("\n".join(submit_stream_l))
		submit_stream.close()
		if os.name != "nt":
			subprocess.call(['chmod', 'u+x', os.path.join(self.scripts_dir,"submit_map_reduce.sh")])
		
	
	def get_sge_job_txt(self, prep, map_cmd_n=None, reduce_task=None, mem="14G"):

		oe_txt_l = ["mkdir -p ${scriptname}.qsub.out ${scriptname}.qsub.err","exec >${scriptname}.qsub.out/${scriptname}_${JOB_ID}.out 2>${scriptname}.qsub.err/${scriptname}_${JOB_ID}.err"]
		sge_job_txt_l = ["#!/bin/bash","#$ -S /bin/bash","#$ -o /dev/null","#$ -e /dev/null","#$ -cwd","#$ -V","#$ -l tmem="+mem+",h_vmem="+mem,"#$ -l h_rt=24:0:0","set -u","set -x","\n"]
		if map_cmd_n != None:
			sge_job_txt_l.insert(-3,"#$ -t 1:"+str(map_cmd_n))
			sge_job_txt_l.extend(["scriptname=" + prep + ".$SGE_TASK_ID"] + oe_txt_l + ['CMDS=`sed -n -e "$SGE_TASK_ID p" ' + prep + '.map_task_exec.txt`',"$CMDS"])
		elif reduce_task != None:
			sge_job_txt_l.extend(["scriptname=" + prep + ".reduce"] + oe_txt_l + [reduce_task])
	
		sge_job_txt_str = "\n".join(sge_job_txt_l)
	
		return sge_job_txt_str	
Esempio n. 2
0
class Relatedness(object):
    '''Provides methods to find duplicates either between or within families, and to identify pairs of individuals within a family whose observed
    relationship (KING kinship coefficient) is different than expected given the pedigree.'''
    
    def __init__(self,bf_file=None,wf_file=None,cohort_fam=None,kinship_coef_thresh_dict=None):
        '''Store the input file paths and the kinship coefficient thresholds.

        Args:
            | bf_file (str): path to the between-family KING kinship coefficient output.
            | wf_file (str): path to the within-family KING kinship coefficient output.
            | cohort_fam (str): path to the tab-separated cohort pedigree file.
            | kinship_coef_thresh_dict (dict of (str,float)): inclusive lower kinship coefficient bound for relationship degrees "0" (duplicate) to "3"; defaults to {"0":0.354,"1":0.177,"2":0.0844,"3":0.0442}.'''

        #A fresh dict per instance, so that changing one object's thresholds cannot affect another's.
        if kinship_coef_thresh_dict is None:
            kinship_coef_thresh_dict = {"0":0.354,"1":0.177,"2":0.0844,"3":0.0442}

        self.logger = Logger()
        self.bf_file = bf_file
        self.wf_file = wf_file
        self.kinship_coef_thresh_dict = kinship_coef_thresh_dict
        self.cohort_fam = cohort_fam
        #DataFrames loaded on demand.
        self.bf_df = None
        self.wf_df = None
        self.cohort_df = None

        
    def find_duplicates(self, bf_b=True):
        '''Find either between-family or within-family duplicates using the corresponding KING kinship coefficient output.
        
        Args:
            bf_b (boolean): whether to look for duplicates from different families (True) or duplicates within the same family, defaults to True.
        
        Returns:
            duplicate_l (list of strs): pairs of identified duplicates.'''
        
        self.logger.log("Checking for {between_or_within} duplicates...".format(between_or_within="between-family" if bf_b == True else "within-family"))
        kinship_coef_file = self.bf_file if bf_b == True else self.wf_file
        index_col = ["FID1","ID1","FID2","ID2"] if bf_b == True else ["FID","ID1","ID2"]
        kinship_coef_df = pd.read_csv(kinship_coef_file, sep="\t", index_col=index_col, dtype=str)
        kinship_coef_df["Kinship"] = pd.to_numeric(kinship_coef_df["Kinship"])
        duplicate_df = kinship_coef_df.ix[kinship_coef_df["Kinship"] >= self.kinship_coef_thresh_dict["0"],:]
        duplicate_l = []
        if ~duplicate_df.empty:
            duplicate_l = [self.convert_tuple_to_ids(duplicate_pair_tuple) for duplicate_pair_tuple in duplicate_df.index.tolist()]
        return duplicate_l
        
        
    def convert_tuple_to_ids(self, duplicate_pair_tuple):
        '''Convert a tuple containing the IDs of 2 duplicate individuals into a string. 
        
        Args:
            duplicate_pair_tuple (tuple): tuple containing IDs of 2 duplicate individuals.
        
        Returns:
            duplicate_pair_str (str): string containing IDs of the 2 duplicate individuals.'''
        
        #print duplicate_pair_tuple
        id1,id2 = "",""
        if len(duplicate_pair_tuple) == 3:
            id1,id2 = "_".join([duplicate_pair_tuple[0],duplicate_pair_tuple[1]]),"_".join([duplicate_pair_tuple[0],duplicate_pair_tuple[2]])
        elif len(duplicate_pair_tuple) == 4:
            id1,id2 = "_".join([duplicate_pair_tuple[0],duplicate_pair_tuple[1]]),"_".join([duplicate_pair_tuple[2],duplicate_pair_tuple[3]])
        duplicate_pair_str = ",".join([id1,id2])
        return duplicate_pair_str


    def get_exp_obs_df(self):
        '''Make a DataFrame containing the expected and observed relationships for individuals in the same family.
        
        The within-family kinship coefficients (wf_file) and the cohort pedigree (cohort_fam) are read on first use and kept in
        self.wf_df and self.cohort_df; self.cohort_df is then restricted to families which have kinship coefficients.
        
        Returns:
            exp_obs_df (DataFrame): indexed by FAMILY, ID1 and ID2, with columns EXP_REL (expected degree), Kinship (observed coefficient or NaN) and OBS_REL (observed degree "0"-"4" or NaN).'''
        
        def log_cohort_size():
            '''Log the number of distinct individuals and families currently in the cohort.'''
            self.logger.log("Cohort contains {0} individuals and {1} families.".format(len(self.cohort_df.drop_duplicates(subset=["PERSON","FAMILY"]).index),
                                                                                       pd.unique(self.cohort_df["FAMILY"]).size))

        def get_obs_rel(kinship_coef):
            '''Convert a kinship coefficient into the closest degree whose threshold it reaches ("4" if none).'''
            if pd.isnull(kinship_coef):
                return np.nan
            for degree in ["0","1","2","3"]:
                if kinship_coef >= self.kinship_coef_thresh_dict[degree]:
                    return degree
            return "4"

        self.logger.log("Checking expected versus observed within-family relationships...")
        #"is None" is required here: "== None" on an already loaded DataFrame is element-wise and cannot be used as a condition.
        if self.wf_df is None:
            self.wf_df = pd.read_csv(self.wf_file, sep="\t", dtype=str)
        if self.cohort_df is None:
            self.cohort_df = pd.read_csv(self.cohort_fam, sep="\t", dtype=str)

        log_cohort_size()
        #Family ID -> IDs of the family members which appear in >=1 kinship coefficient row.
        fam_ind_seq_dict = self.wf_df.groupby("FID")[["ID1","ID2"]].apply(
            lambda fam_df: pd.unique(pd.concat([fam_df["ID1"], fam_df["ID2"]])).tolist()).to_dict()
        self.logger.log("# individuals with >=1 kinship coefficient: {0}".format(sum([len(fam_ind_seq_dict[fam]) for fam in fam_ind_seq_dict])))
        self.logger.log("Removing families with no kinship coefficients.") 
        self.cohort_df = self.cohort_df.loc[self.cohort_df["FAMILY"].isin(list(fam_ind_seq_dict)),:]
        log_cohort_size()
        self.logger.log("Get the expected relationships between members of each family.")
        exp_obs_df = self.cohort_df.groupby(by="FAMILY", sort=False).apply(self.get_exp_rel_df, fam_ind_seq_dict)
        self.logger.log("Obtained expected relationship for {0} pairs across {1} families.".format(len(exp_obs_df.index),
                                                                                                   len(exp_obs_df.index.get_level_values(0).unique())))
        
        self.logger.log("Get and merge the observed relationships between members of each family.")
        exp_obs_df["Kinship"] = exp_obs_df.apply(func=self.get_kinship_coef, axis=1)
        has_kinship_s = exp_obs_df["Kinship"].notnull()
        self.logger.log("Retrieved kinship coefficients for {0} pairs across {1} families.".format(has_kinship_s.sum(),
                                                                                                   len(exp_obs_df.loc[has_kinship_s,:].index.get_level_values(0).unique())))
        exp_obs_df["OBS_REL"] = exp_obs_df["Kinship"].apply(get_obs_rel)
        #Replace the (FAMILY, row number) index created by groupby.apply with (FAMILY, ID1, ID2).
        exp_obs_df.reset_index(inplace=True)
        exp_obs_df.drop("level_1", inplace=True, axis=1)
        exp_obs_df.set_index(["FAMILY","ID1","ID2"], inplace=True)
        return exp_obs_df


    def get_exp_rel_df(self, fam_df, fam_ind_seq_dict):
        '''Make a dataframe containing the expected relationships between individuals in a family.
        
        Args:
            fam_df (DataFrame): contains the pedigree information for a family.
        
        Returns:
            exp_rel_df (DataFrame): contains the expected relationships between the family members.'''
    
        self.logger.log(fam_df.name)        
        fam_df.replace({"0":np.NaN}, inplace=True) #Tidy fam_df
        
        #Get each individual's parents and grandparents.
        fam_uniq_ind_l = pd.unique(fam_df["PERSON"]).tolist()
        fam_uniq_ind_l = list(filter(lambda x: x in fam_ind_seq_dict[fam_df.name], fam_uniq_ind_l)) #Only need relations for samples with kinship coefs.
        relations_df = pd.DataFrame(data={"IND":fam_uniq_ind_l})
        relations_df[["siblings","parents","grandparents","ggrandparents"]] = relations_df["IND"].apply(self.get_relations_s, fam_df=fam_df)
        relations_df.set_index("IND", inplace=True)
    
        #Get every pair of individuals and store their expected and observed relationship.
        ind1_l, ind2_l = [],[]
        for subset in itertools.combinations(fam_uniq_ind_l,2):
            ind1_l.append(subset[0])
            ind2_l.append(subset[1])
        exp_rel_df = pd.DataFrame(OrderedDict([("ID1",ind1_l),("ID2",ind2_l)]))
        exp_rel_df["EXP_REL"] = exp_rel_df.apply(axis=1, func=self.get_exp_rel, relations_df=relations_df)
        return exp_rel_df


    def get_relations_s(self, ind, fam_df):
        '''For 1 individual, get a series containing lists of their siblings, parents and grandparents.

        Args:
            | ind (str): individual ID
            | fam_df (DataFrame): contains the pedigree information for a family.

        Returns:
            relations_s (Series): contains lists of the individual's siblings, parents, grandparents and great-grandparents.'''

        father,mother = fam_df.ix[fam_df["PERSON"]==ind,:].iloc[0]["FATHER"],fam_df.ix[fam_df["PERSON"]==ind,:].iloc[0]["MOTHER"]
        parent_l = list(filter(lambda x: pd.notnull(x), [father,mother]))
        #print "PARENTS: {0},{1}".format(father,mother)
        sibling_l = fam_df.ix[(fam_df["FATHER"]==father) & (fam_df["MOTHER"]==mother) & (fam_df["PERSON"]!=ind),"PERSON"].tolist() if pd.notnull(father) and pd.notnull(mother) else []
        #print "SIBLINGS: {0}".format(",".join(sibling_l))
        grandparent_l = [fam_df.ix[fam_df["PERSON"]==parent,:].iloc[0][grandparent] for parent,grandparent in list(itertools.product(parent_l,["FATHER","MOTHER"]))]
        grandparent_l = list(filter(lambda x: pd.notnull(x), grandparent_l))
        ggrandparent_l = [fam_df.ix[fam_df["PERSON"]==grandparent,:].iloc[0][ggrandparent] for grandparent,ggrandparent in list(itertools.product(grandparent_l,["FATHER","MOTHER"]))]
        ggrandparent_l = list(filter(lambda x: pd.notnull(x), ggrandparent_l))	

        relations_s = pd.Series([sibling_l,parent_l,grandparent_l,ggrandparent_l],
                         index=["siblings","parents","grandparents","ggrandparents"])
        return relations_s


    def get_exp_rel(self, ind_pair_s, relations_df):
        '''Get the expected degree of relationship (1-4) between a pair of individuals.
        
        Args:
            ind_pair_s (Series): the IDs of a pair of individuals.
            
        Returns:
            exp_rel (str): the expected degree of relationship.'''
    
        ind1, ind2 = ind_pair_s["ID1"], ind_pair_s["ID2"]
        ind1_sibling_l, ind2_sibling_l = relations_df.loc[ind1,"siblings"], relations_df.loc[ind2,"siblings"]
        ind1_parent_l, ind2_parent_l = relations_df.loc[ind1,"parents"], relations_df.loc[ind2,"parents"]
        ind1_grandparent_l, ind2_grandparent_l = relations_df.loc[ind1,"grandparents"], relations_df.loc[ind2,"grandparents"]
        ind1_ggrandparent_l, ind2_ggrandparent_l = relations_df.loc[ind1,"ggrandparents"], relations_df.loc[ind2,"ggrandparents"]   
 
        exp_rel = "4" #i.e. at least a 4th-degree relationship.
        if ind2 in ind1_parent_l or ind1 in ind2_parent_l: #Parent-child
            exp_rel = "1"
        elif len(ind1_parent_l) > 0 and set(ind1_parent_l) == set(ind2_parent_l): #Sibling
            exp_rel = "1"
        elif ind2 in ind1_grandparent_l or ind1 in ind2_grandparent_l: #Grandparent-grandchild 
            exp_rel = "2"
        elif (len(ind1_parent_l) > 1 and set(ind1_parent_l).issubset(set(ind2_grandparent_l))) or (len(ind2_parent_l) > 1 and set(ind2_parent_l).issubset(set(ind1_grandparent_l))): #Uncle/Aunt - nephew/niece
            exp_rel = "2"
        elif len(set(ind1_grandparent_l).intersection(set(ind2_grandparent_l))) > 0: #Cousins
            exp_rel = "3"
        elif len(set(ind1_grandparent_l).intersection(set(ind2_sibling_l))) > 0 or len(set(ind2_grandparent_l).intersection(set(ind1_sibling_l))) > 0: #Great Uncle/Aunt i.e. sibling of grandparent.
            exp_rel = "3"
        elif ind1 in ind2_ggrandparent_l or ind2 in ind1_ggrandparent_l: #Great-grandparent
            exp_rel = "3"
        return exp_rel
    
    
    def get_kinship_coef(self, s):
        '''Retrieve the kinship coefficient from a Series representing a row in a KING output file.
        
        Args:
            s (Series): represents a row in a KING output file.
            
        Returns:
            kinship_coef (str): kinship coefficient in the row.'''

        #print s
        kinship_coef = np.NaN
        df = self.wf_df.ix[(self.wf_df["FID"]==s.name[0]) & (self.wf_df["ID1"]==s.ix["ID1"]) & (self.wf_df["ID2"]==s.ix["ID2"]),:]  
        if df.empty == False:
            #print df
            kinship_coef = float(df.iloc[0]["Kinship"])
        else:
            df = self.wf_df.ix[(self.wf_df["FID"]==s.name[0]) & (self.wf_df["ID2"]==s.ix["ID1"]) & (self.wf_df["ID1"]==s.ix["ID2"]),:]
            if df.empty == False:
                kinship_coef = float(df.iloc[0]["Kinship"])
        return kinship_coef
Esempio n. 3
0
class CMC(object):
    '''Implements the combined multivariate and collapsing (CMC) burden test, where the multivariate test is a log-likelihood ratio test.'''
    def __init__(self):
        '''Create a logger and initialise the attributes which do_multivariate_tests fills in.'''

        self.logger = Logger()
        self.sample_s = None
        self.geno_df = None
        self.covar_df = None  #Read by get_coef_pval_l, so it must exist before do_multivariate_tests has run.
        self.group_col = None
        self.agg_col = None
        self.agg_cat_l = None

    def do_multivariate_tests(self,
                              sample_s,
                              geno_df,
                              group_col,
                              agg_col,
                              agg_cat_l,
                              covar_df=None,
                              results_path=None):
        '''Main method for doing multivariate tests.
        
        Samples whose phenotype is 0 are excluded. The sample genotype columns of geno_df are converted to numbers in place,
        with missing and non-numeric genotypes set to 0.
        
        Args:
            | sample_s (Series): index is the sample names, and values are 1/2 (unaffected/affected).
            | geno_df (DataFrame): index is the variant ID and columns must include (1) group_col (see below); (2) agg_col (see below); (3) sample genotypes (# copies of alternate allele 0-2).
            | group_col (str): column in geno_df identifying the groups of variants to be tested for association with the phenotype e.g. gene, pathway or any other entity.
            | agg_col (str): column containing the aggregation categories (e.g. population allele frequency range) - variants in the same group and aggregation category will be aggregated (collapsed into a dichotomous variable 0/1).
            | agg_cat_l (list of strs): specifies which aggregation categories to present results for (i.e. coefficient and p-value in the alternative hypothesis logit model).
            | covar_df (DataFrame): index is the covariate names and columns are the samples.
            | results_path (str): path to results file; nothing is written if None.
        
        Returns:
            result_df (DataFrame): multivariate test results, where the index is the variant group (e.g. gene) and columns are (1) # variants in each aggregated (& unaggregated) category; (2) proportion of (un)affecteds carrying a variant in each aggregated (& unaggregated) category; (3) llr_p (and llr_cov_p), the log-likelihood ratio p-value (after controlling for covariates); (4) coefficient & p-value for each independent variable. None is returned if there is nothing to test.
        '''

        #Set object attributes.
        self.group_col = group_col
        self.agg_col = agg_col
        self.agg_cat_l = agg_cat_l
        self.covar_df = covar_df
        #Read in samples, genotypes and covariates.
        self.logger.log("Reading in samples and annotated genotypes...")
        self.sample_s = sample_s.astype(int)
        self.sample_s = self.sample_s[self.sample_s != 0]
        #Only the retained samples have genotype columns downstream, so this index is used throughout.
        sample_idx = self.sample_s.index
        self.geno_df = geno_df
        geno_df[sample_idx] = geno_df[sample_idx].apply(
            pd.to_numeric, errors='coerce', downcast='integer', axis=1)
        #Assign the filled values back: fillna(inplace=True) on a .loc selection only fills a temporary copy.
        geno_df[sample_idx] = geno_df[sample_idx].fillna(0)
        self.logger.log("# variants: {0}".format(len(geno_df.index)))
        self.logger.log("# genes: {0}".format(
            len(pd.unique(geno_df[self.group_col]))))
        if geno_df.empty:
            self.logger.log("Exiting because nothing to test.")
            return
        #Aggregate the genotypes.
        self.logger.log("Aggregating genotypes...")
        geno_agg_df = self.aggregate_by_agg_col(geno_df)
        self.logger.log("Retain groups which have >= 1 non-zero genotype.")
        all_zero_s = geno_agg_df[sample_idx].groupby(
            level=group_col).apply(lambda df: df.values.sum() == 0)
        if all_zero_s.size > 0:
            all_zero_arr = all_zero_s.values.astype(bool)
            self.logger.log("Dropped groups: {0}".format(",".join(
                str(group) for group in all_zero_s.index[all_zero_arr])))
            retained_group_idx = all_zero_s.index[~all_zero_arr]
            geno_agg_df = geno_agg_df.loc[geno_agg_df.index.get_level_values(
                group_col).isin(retained_group_idx), :]
        self.logger.log("Drop variants which have all zero genotypes.")
        geno_agg_df = geno_agg_df.loc[~(geno_agg_df[sample_idx] == 0).all(
            axis=1), :]
        #Report what is left after filtering; "n" is the # of variants behind each aggregated row.
        self.logger.log("# variants: {0}".format(int(geno_agg_df["n"].sum())))
        self.logger.log("# genes to test: {0}".format(
            len(geno_agg_df.index.get_level_values(group_col).unique())))
        if geno_agg_df.empty:
            self.logger.log("Exiting because nothing to test.")
            return
        #Do the multivariate tests.
        self.logger.log("Doing multivariate tests...")
        result_df = geno_agg_df[sample_idx].groupby(
            level=[self.group_col]).apply(self.do_multivariate_test,
                                          y=self.sample_s.values - 1,
                                          covar_df=covar_df)
        #Merge the results with the collapsed variant counts.
        self.logger.log(
            "Merge the results with the population frequency variant category counts..."
        )
        agg_cat_count_df = self.get_agg_cat_count_df(geno_agg_df)
        agg_cat_count_df = agg_cat_count_df.join(
            self.get_agg_cat_prop_by_affection(geno_agg_df))
        result_df = agg_cat_count_df.join(result_df)
        result_df.sort_values(by="llr_p" if covar_df is None else "llr_cov_p",
                              inplace=True)
        if results_path is not None:
            self.logger.log("Write results.")
            result_df.to_csv(results_path, index=True)
        self.logger.log("\n" + result_df.to_string())
        return result_df

    def aggregate_by_agg_col(self, geno_df):
        '''Aggregate genotypes within variant population frequency categories.
        
        Args:
            geno_df (DataFrame): index is the variant ID and columns must include (1) group_col (see below); (2) agg_col (see below); (3) sample genotypes (# copies of alternate allele 0-2).
            
        Returns:
            geno_agg_df (DataFrame): index is the gene & variant aggregation category, and columns are the # of variants for each sample.
        '''

        self.logger.log(
            "For each gene, count the # of variants in each aggregation category."
        )
        agg_cat_n_s = geno_df.groupby(
            self.group_col)[self.agg_col].value_counts()
        agg_cat_n_s.name = "n"
        self.logger.log(
            "In each group, aggregate sample genotypes by self.agg_col.")
        geno_agg_df = geno_df.groupby(
            [self.group_col, self.agg_col])[self.sample_s.index].apply(
                lambda geno_col: geno_col.any() > 0).astype(int)
        geno_agg_df = geno_agg_df.join(agg_cat_n_s)
        return geno_agg_df

    def assign_variants_to_pop_frq_cats(self, geno_df, pop_frq_col_l,
                                        pop_frq_cat_dict):
        '''Assign variants to allele population frequency range categories.
        
        Args:
            | geno_df (DataFrame): index is the variant ID and columns must include the population allele frequency columns listed in the pop_frq_col_l parameter (see below)
            | pop_frq_col_l (list of str): contains the names of the population allele frequency columns in descending order of preference.
            | pop_frq_cat_dict (dict of (str,float)): mapping of frequency category name to exclusive upper bound. 
        
        Returns:
            geno_df (DataFrame): the inputted geno_df DataFrame with an extra column for the variant aggregation category ("pop_frq_cat").
        '''

        self.logger.log("Assign variants to a population frequency category.")
        pop_frq_cat_dict = OrderedDict(
            sorted(pop_frq_cat_dict.items(), key=operator.itemgetter(1)))
        pop_frq_cat_l = list(pop_frq_cat_dict.keys())
        pop_frq_bin_arr = np.array(list(pop_frq_cat_dict.values()))
        pop_frq_s = geno_df.apply(lambda row_s: list(
            filter(lambda x: pd.notnull(x), row_s.loc[pop_frq_col_l].tolist() +
                   [0.0]))[0],
                                  axis=1)
        pop_frq_idx_arr = np.digitize(pop_frq_s.values, pop_frq_bin_arr)
        geno_df = geno_df.join(
            pd.Series(data=pop_frq_idx_arr,
                      index=pop_frq_s.index,
                      name="pop_frq_cat_idx"))
        geno_df["pop_frq_cat"] = geno_df.apply(
            lambda row_s: row_s.name if row_s["pop_frq_cat_idx"] ==
            pop_frq_bin_arr.size else pop_frq_cat_l[row_s["pop_frq_cat_idx"]],
            axis=1)
        geno_df.drop("pop_frq_cat_idx", inplace=True, axis=1)
        return geno_df

    def do_multivariate_test(self, geno_agg_gene_df, y, covar_df=None):
        '''Do a multivariate test for 1 gene.
        
        Without covariates, the p-value is the log-likelihood ratio p-value of the logit model fitted to the aggregated genotypes.
        With covariates, a covariates-only model (H0) is also compared with a model containing the covariates plus the aggregated
        genotypes (H1) in a log-likelihood ratio test.
        
        Args:
            | geno_agg_gene_df (DataFrame): index is the gene & variant aggregation category, and columns are the aggregated genotypes of each sample; it must carry the gene in its name attribute, as set by groupby.apply.
            | y (numpy.ndarray): values are 0/1 (unaffected/affected). 
            | covar_df (DataFrame): index is the covariate names and columns are samples.
            
        Returns:
            test_result_s (Series): multivariate test results for 1 gene/functional unit containing (1) llr_p (and llr_cov_p), the log-likelihood ratio p-value (after controlling for covariates); (2) coefficient & p-value for each independent variable.
        '''

        #stats.chisqprob was removed in SciPy 1.0; chi2.sf is its documented replacement. Imported here so that
        #this method does not depend on what the module-level name "stats" is bound to.
        from scipy.stats import chi2

        self.logger.log(geno_agg_gene_df.name)
        #Work on a copy so that the group handed over by groupby.apply is not modified.
        geno_agg_gene_df = geno_agg_gene_df.copy()
        geno_agg_gene_df.index = geno_agg_gene_df.index.droplevel(
        )  #Drop the group name so it won't be in the agg cat variable names.
        logit_result = self.fit_logit_model(geno_agg_gene_df, y)
        test_result_l = [("llr_p", logit_result.llr_pvalue)]
        if covar_df is not None:
            #NOTE(review): covar_df's columns are assumed to be the same samples, in the same order, as y - confirm with the caller.
            #Model with only covariates
            self.logger.log("H0 (covariates only)")
            logit_result_h0 = self.fit_logit_model(covar_df, y)
            #Model with covariates plus aggregated variant independent variables
            self.logger.log("H1 (all independent variables)")
            logit_result_h1 = self.fit_logit_model(
                pd.concat([covar_df, geno_agg_gene_df]), y)
            llr_cov_p = chi2.sf(
                2 * (logit_result_h1.llf - logit_result_h0.llf),
                logit_result_h1.df_model - logit_result_h0.df_model)
            test_result_l.append(("llr_cov_p", llr_cov_p))
            coef_pval_l = self.get_coef_pval_l(logit_result_h1, covar_b=True)
        else:
            coef_pval_l = self.get_coef_pval_l(logit_result, covar_b=False)
        test_result_l.extend(coef_pval_l)
        test_result_s = pd.Series(OrderedDict(test_result_l))
        return test_result_s

    def get_coef_pval_l(self, logit_result, covar_b=False):
        '''Get the coefficients and corresponding p-values of the independent variables in the logit model.
        
        Args:
            | logit_result (statsmodels.discrete.discrete_model.BinaryResultsWrapper): contains results from fitting logit regression model.
        
        Returns:
            coef_pval_l (list): list of coefficients and corresponding p-values.
        '''

        ind_var_l = self.agg_cat_l if covar_b == False else self.agg_cat_l + self.covar_df.index.tolist(
        )
        coef_l = [(agg_cat + "_c", logit_result.params.loc[agg_cat]) if agg_cat
                  in logit_result.params.index else (agg_cat + "_c", np.NaN)
                  for agg_cat in ind_var_l]
        pval_l = [(agg_cat + "_p", np.NaN) for agg_cat in ind_var_l]
        try:
            pval_l = [(agg_cat + "_p", logit_result.pvalues.loc[agg_cat])
                      if agg_cat in logit_result.pvalues.index else
                      (agg_cat + "_p", np.NaN) for agg_cat in ind_var_l]
        except ValueError:
            self.logger.log("P-values unavailable.")
        coef_pval_l = [
            item for sublist in zip(coef_l, pval_l) for item in sublist
        ]
        return coef_pval_l

    def fit_logit_model(self, X_df, y):
        '''Fit a logit model of y on the independent variables, using the BFGS optimiser.
        
        Args:
            | X_df: the independent variables (covariates and or aggregated genotypes) for 1 gene, one row per variable; it is transposed before fitting.
            | y (numpy.ndarray): values are 0/1 (unaffected/affected). 
        
        Returns:
            logit_result (statsmodels.discrete.discrete_model.BinaryResultsWrapper): contains results from fitting logit regression model.
        '''

        #NOTE(review): "sm" is presumably statsmodels.api - confirm against the file's imports.
        return sm.Logit(y, X_df.transpose()).fit(method='bfgs')

    def get_agg_cat_count_df(self, geno_agg_df):
        '''For each group in group_col (e.g. gene), get the number of variants in each variant aggregation category (in agg_col) e.g. population allele frequency range.
        
        Args:
            geno_agg_df (DataFrame): index is the gene & variant aggregation category, and columns are the # of variants for each sample.
            
        Returns:
            agg_cat_count_df (DataFrame): index is the gene and columns are # variants in each aggregated category and # unaggregated.
        '''

        agg_cat_count_df = geno_agg_df['n'].reset_index()
        agg_cat_count_df[self.agg_col] = agg_cat_count_df[self.agg_col].apply(
            lambda x: x if x in self.agg_cat_l else "unagg")

        def sum_unagg_n(agg_cat_count_df):
            '''If the user has not listed all of the aggregation categories in the agg_cat_l parameter of the do_multivariate_tests method,
            then agg_cat_count_df may contain multiple rows per gene for "unagg", and so these must be summed.
            
            Args:
                agg_cat_count_df (DataFrame): contains the group_col, agg_col and a count (n) column.
                
            Returns:
                agg_cat_count_df (DataFrame): contains the group_col, agg_col and a count (n) column.
            '''

            unagg_count_df = agg_cat_count_df.loc[
                agg_cat_count_df["pop_frq_cat"] == "unagg", :].groupby(
                    "Gene")["n"].sum().to_frame()
            unagg_count_df.reset_index(inplace=True)
            unagg_count_df["pop_frq_cat"] = "unagg"
            agg_cat_count_df = pd.concat([
                agg_cat_count_df.loc[
                    agg_cat_count_df["pop_frq_cat"] != "unagg", :],
                unagg_count_df
            ])
            return agg_cat_count_df

        agg_cat_count_df = sum_unagg_n(agg_cat_count_df)
        agg_cat_count_df = pd.pivot_table(data=agg_cat_count_df,
                                          values='n',
                                          index=self.group_col,
                                          columns=self.agg_col)
        agg_cat_count_df = agg_cat_count_df.reindex(columns=self.agg_cat_l +
                                                    ["unagg"])
        agg_cat_count_df.fillna(0, inplace=True)
        agg_cat_count_df = agg_cat_count_df.apply(pd.to_numeric,
                                                  downcast='integer',
                                                  axis=1)
        return agg_cat_count_df

    def get_agg_cat_prop_by_affection(self, geno_agg_df):
        '''For each group in group_col (e.g. gene), get the proportion of (un)affecteds who are carriers in each variant aggregation category (in agg_col) e.g. population allele frequency range.
              
        Args:
            geno_agg_df (DataFrame): index is the group_col & aggregation category, and columns are the # of variants for each sample.
        
        Returns:
            agg_cat_prop_df(DataFrame): index is the group_col, and columns indicate the proportion of (un)affected carriers in each variant aggregation category, plus unaggregated variants.
        '''

        geno_agg_df = geno_agg_df.reset_index()
        geno_agg_df[self.agg_col] = geno_agg_df[self.agg_col].apply(
            lambda x: x if x in self.agg_cat_l else "unagg")

        def get_prop_s(geno_agg_df, sample_l, name):
            '''
            Args:
                | geno_agg_df (DataFrame): index is the group_col & aggregation category, and columns are the # of variants for each sample.
                | sample_l (list of strs): list of sample names (columns in geno_agg_df).
                | name (str): name to give to returned Series. 
            
            Returns:
                prop_s (Series): index is the group_col & aggregation category, and values are the proportion of (un)affecteds who are carriers.
            '''

            prop_s = geno_agg_df.groupby([
                self.group_col, self.agg_col
            ]).apply(lambda group: group[sample_l].values.mean())
            prop_s.name = name
            return prop_s

        ca_l, co_l = self.sample_s[self.sample_s == 2].index.tolist(
        ), self.sample_s[self.sample_s == 1].index.tolist()
        prop_df = pd.concat([
            get_prop_s(geno_agg_df, ca_l, "aff_p"),
            get_prop_s(geno_agg_df, co_l, "unaff_p")
        ],
                            axis=1)
        prop_df.reset_index(inplace=True)

        def pivot_prop_df(prop_df, affection):
            '''
            Args:
                | prop_df (DataFrame): contains the group_col and agg_col columns, plus columns for the proportion of (un)affected carriers. 
                | affection (str): affection 
            
            Returns:
                agg_cat_prop_df (DataFrame): index is the group_col, and columns are the proportion of (un)affecteds who are carriers of each variant aggregation category in agg_cat_l, plus unaggregated variants.
            '''

            agg_cat_prop_df = pd.pivot_table(data=prop_df,
                                             values="{0}_p".format(affection),
                                             index=self.group_col,
                                             columns=self.agg_col)
            column_rename_dict = dict(
                zip(agg_cat_prop_df.columns.tolist(), [
                    "{0}_{1}".format(col, "{0}_p".format(affection))
                    for col in agg_cat_prop_df.columns.tolist()
                ]))
            agg_cat_prop_df.rename(columns=column_rename_dict, inplace=True)
            agg_cat_prop_df = agg_cat_prop_df.reindex(columns=[
                "{0}_{1}_p".format(agg_cat, affection)
                for agg_cat in self.agg_cat_l + ["unagg"]
            ])
            return agg_cat_prop_df

        agg_cat_prop_df = (pivot_prop_df(prop_df,
                                         "aff")).merge(pivot_prop_df(
                                             prop_df, "unaff"),
                                                       left_index=True,
                                                       right_index=True)
        agg_cat_prop_df = agg_cat_prop_df.reindex(columns=[
            "{0}_{1}_p".format(agg_cat, affection)
            for agg_cat in self.agg_cat_l + ["unagg"]
            for affection in ["aff", "unaff"]
        ])

        return agg_cat_prop_df
Esempio n. 4
0
class Family(object):
    '''Represents a family and the conditions a variant must satisfy to "pass" in it.

    Attributes:
        | name: family name.
        | category: family category (with respect to number of affected and unaffected members).
        | A_l (list): naturally-sorted IDs of the affected members.
        | N_l (list): naturally-sorted IDs of the unaffected members.
        | A_n_min (int): minimum number of affected members that must have a genotype call (i.e. not a no-call).
        | N_n_min (int): minimum number of unaffected members that must have a genotype call.
        | A_p_min (float or None): minimum proportion of affecteds who are carriers; None disables the check.
        | AN_p_diff_min (float or None): minimum difference between the proportion of affecteds and unaffecteds who are carriers; None disables the check.
    '''
    def __init__(self,
                 name,
                 category,
                 A_l,
                 N_l,
                 A_n_min=0,
                 N_n_min=0,
                 A_p_min=None,
                 AN_p_diff_min=None):
        self.logger = Logger()
        self.name = name
        self.category = category
        self.A_l = natsorted(A_l)
        self.N_l = natsorted(N_l)
        self.A_n_min = A_n_min
        self.N_n_min = N_n_min
        self.A_p_min = A_p_min
        self.AN_p_diff_min = AN_p_diff_min

    def log_info(self):
        '''Log the family name, category, member IDs and proportion thresholds.'''

        info_str_l = [
            "\nFamily name:\t{0}".format(self.name),
            "Category:\t{0}".format(self.category),
            "Affecteds:\t{0}".format(",".join(self.A_l)),
            "Unaffecteds:\t{0}".format(",".join(self.N_l)),
            "A_p_min:\t{0}".format(self.A_p_min),
            "AN_p_diff_min:\t{0}\n".format(self.AN_p_diff_min)
        ]
        info_str = "\n".join(info_str_l)
        self.logger.log(info_str)

    def _carrier_proportion(self, variant_genotypes_s, member_l, carrier_call):
        '''Return the proportion of the given members whose genotype is a carrier call.

        Members with a no-call or missing genotype stay in the denominator.
        '''

        geno_prop_s = variant_genotypes_s.loc[member_l].value_counts(
            normalize=True, dropna=False)
        # reindex (unlike .loc) tolerates carrier genotypes that nobody has:
        # they become NaN and are skipped by sum().
        carrier_p = geno_prop_s.reindex(carrier_call).sum()
        return 0.0 if pd.isnull(carrier_p) else carrier_p

    def pass_po(self,
                variant_genotypes_s,
                no_call="NA",
                carrier_call=("1", "2")):
        '''Check whether a variant passes in the family.

        Args:
            | variant_genotypes_s (Series): the genotypes of family members for the variant of interest, indexed by member ID.
            | no_call (str): how a no-call is represented in the genotype data.
            | carrier_call (sequence of strs, or a single str): genotypes which correspond to carrying the variant.

        Returns:
            boolean: whether the variant passes.
        '''

        # A lone genotype string is treated as a one-element list.
        if isinstance(carrier_call, str):
            carrier_call = [carrier_call]
        else:
            carrier_call = list(carrier_call)

        #Check the A_n_min and N_n_min checks are satisfied.
        A_called_n = (variant_genotypes_s.loc[self.A_l] != no_call).sum()
        N_called_n = (variant_genotypes_s.loc[self.N_l] != no_call).sum()
        if A_called_n < self.A_n_min or N_called_n < self.N_n_min:
            return False

        # The affected carrier proportion is needed by both optional checks.
        A_p = None
        if self.A_p_min is not None or self.AN_p_diff_min is not None:
            A_p = self._carrier_proportion(variant_genotypes_s, self.A_l,
                                           carrier_call)
        if self.A_p_min is not None and A_p < self.A_p_min:
            return False
        if self.AN_p_diff_min is not None:
            N_p = self._carrier_proportion(variant_genotypes_s, self.N_l,
                                           carrier_call)
            if A_p - N_p < self.AN_p_diff_min:
                return False

        return True
Esempio n. 5
0
class Cohort(object):
    '''Represents a cohort of familial individuals as a list of FamilyTree objects.'''
    def __init__(self, cohort_fam):
        '''Read the cohort pedigree file and build one FamilyTree per family.

        Args:
            cohort_fam: tab-separated pedigree file (anything pandas.read_csv accepts) with a "FAMILY" column; every column is read as a string.
        '''
        self.logger = Logger()
        self.node_generator = NodeGenerator()
        cohort_df = pd.read_csv(cohort_fam, sep="\t", dtype=str)
        self.logger.log("Making family tree objects...")
        # Iterate over the groups instead of groupby().apply(): every family
        # sub-frame is visited exactly once with its FAMILY column intact, and
        # an empty file simply yields an empty list.
        fam_tree_l = [
            self.make_fam_tree(ped_df)
            for _, ped_df in cohort_df.groupby("FAMILY")
        ]
        self.fam_tree_l = natsorted(fam_tree_l,
                                    key=lambda fam_tree: fam_tree.id)

    def make_fam_tree(self, ped_df):
        '''For a family, make a list of Nodes and from these, a Family Tree.
        
        Args:
            ped_df (pandas.core.frame.DataFrame): contains the pedigree information for the family.
        
        Returns:
            family_tree (FamilyTree obj): represents the family.'''

        family_id = ped_df.iloc[0]["FAMILY"]
        node_l = self.node_generator.convert_ped_df_to_node_l(ped_df)
        return FamilyTree(self.logger, family_id, node_l)

    def get_all_sample_l(self):
        '''Get a list of all of the samples in the cohort.
        
        Returns:
            all_sample_l (list): list of sample IDs tuples (<FAMILY_ID>,<INDIVIDUAL_ID>).'''

        all_sample_l = [(fam_tree.id, node.id) for fam_tree in self.fam_tree_l
                        for node in fam_tree.node_l]
        return all_sample_l

    def get_all_family_l(self):
        '''Get a list of all the families in the cohort.
        
        Returns:
           all_family_l (list): list of IDs of families present in cohort.'''

        all_family_l = [fam_tree.id for fam_tree in self.fam_tree_l]
        return all_family_l

    def remove(self, family_l):
        '''Remove families from the cohort (self.fam_tree_l).

        Args:
            family_l (list): IDs of the families to drop.'''

        self.fam_tree_l = [
            fam_tree for fam_tree in self.fam_tree_l
            if fam_tree.id not in family_l
        ]

    def get_fam_ind_gtyped_dict(self, sample_genotyped_l):
        '''Group the genotyped samples by family.

        Args:
            sample_genotyped_l (list of tuples): (<FAMILY_ID>,<INDIVIDUAL_ID>) pairs; the list is left unmodified.

        Returns:
            fam_ind_gtyped_dict (OrderedDict): family ID -> list of individual IDs, with families in sorted order.'''

        # itertools.groupby only merges adjacent items, so sort by family
        # first; sorted() keeps the caller's list untouched.
        sorted_sample_l = sorted(sample_genotyped_l,
                                 key=lambda sample: sample[0])
        fam_ind_gtyped_dict = OrderedDict(
            (fam, [sample[1] for sample in fam_sample_grp])
            for fam, fam_sample_grp in groupby(sorted_sample_l,
                                               lambda sample: sample[0]))
        self.logger.log("# families with >=1 genotyped sample: {0}".format(
            len(fam_ind_gtyped_dict)))

        return fam_ind_gtyped_dict

    def _iter_gene_drop_af(self, pop_af, fam_ind_gtyped_dict,
                           total_allele_count, gene_drop_n):
        '''Yield the simulated cohort allele frequency of each gene-drop iteration.'''

        # Only families with at least one genotyped sample take part.
        gtyped_fam_tree_l = [
            fam_tree for fam_tree in self.fam_tree_l
            if fam_tree.id in fam_ind_gtyped_dict
        ]
        for _ in range(gene_drop_n):
            carrier_allele_count = sum(
                fam_tree.gene_drop(pop_af, fam_ind_gtyped_dict[fam_tree.id])
                for fam_tree in gtyped_fam_tree_l)
            yield carrier_allele_count / float(total_allele_count)

    def gene_drop(self, pop_af, cohort_af, sample_genotyped_l, gene_drop_n):
        '''Perform gene dropping across the cohort and return the proportion of iterations in which the simulated allele frequency is greater than or equal to the cohort frequency.
        
        Args:
            | pop_af (float): population allele frequency.
            | cohort_af (float): cohort allele frequency.
            | sample_genotyped_l (list of tuples): the (<FAMILY_ID>,<INDIVIDUAL_ID>) samples genotyped for this variant from which cohort af was calculated.
            | gene_drop_n (int): number of iterations to perform.
        
        Returns:
            cohort_enriched_p (float): proportion of iterations in which the simulated allele frequency is greater than or equal to the cohort frequency (logged as the p-value). NaN if an input is null, there are no genotyped samples, or gene_drop_n is not positive.'''

        if any(pd.isnull([pop_af, cohort_af, gene_drop_n])):
            self.logger.log("Cohort.gene_drop input parameter is None.")
            return np.nan
        if gene_drop_n <= 0 or len(sample_genotyped_l) == 0:
            # Either division below would be by zero.
            self.logger.log(
                "Cohort.gene_drop needs gene_drop_n > 0 and >=1 genotyped sample."
            )
            return np.nan

        fam_ind_gtyped_dict = self.get_fam_ind_gtyped_dict(sample_genotyped_l)
        self.logger.log(
            "Start gene drop with pop_af={0}, cohort_af={1}, # genotype calls={2} & gene_drop_n={3}."
            .format(pop_af, cohort_af, len(sample_genotyped_l), gene_drop_n))

        t0 = time.time()
        # Two alleles per genotyped sample.
        total_allele_count = len(sample_genotyped_l) * 2
        ge_cohort_n = sum(
            1 for gene_drop_af in self._iter_gene_drop_af(
                pop_af, fam_ind_gtyped_dict, total_allele_count, gene_drop_n)
            if cohort_af <= gene_drop_af)
        cohort_enriched_p = ge_cohort_n / float(gene_drop_n)
        self.logger.log("p-value={0}".format(cohort_enriched_p))
        t1 = time.time()
        self.logger.log("Processing time: {0:.2f} secs\n".format(t1 - t0))
        return cohort_enriched_p

    def get_gene_drop_af_l(self, pop_af, sample_genotyped_l, gene_drop_n):
        '''Perform gene dropping across the cohort and return every simulated allele frequency.

        Args:
            | pop_af (float): population allele frequency.
            | sample_genotyped_l (list of tuples): the (<FAMILY_ID>,<INDIVIDUAL_ID>) samples genotyped for this variant.
            | gene_drop_n (int): number of iterations to perform.

        Returns:
            gene_drop_af_l (list of floats): one simulated allele frequency per iteration. NaN (not a list) if an input is null or there are no genotyped samples.'''

        if any(pd.isnull([pop_af, gene_drop_n])):
            self.logger.log(
                "Cohort.get_gene_drop_af_l input parameter is None.")
            return np.nan
        if len(sample_genotyped_l) == 0:
            # The allele-frequency denominator would be zero.
            self.logger.log(
                "Cohort.get_gene_drop_af_l needs >=1 genotyped sample.")
            return np.nan

        fam_ind_gtyped_dict = self.get_fam_ind_gtyped_dict(sample_genotyped_l)
        self.logger.log(
            "Start gene drop with pop_af={0}, # genotype calls={1} & gene_drop_n={2}."
            .format(pop_af, len(sample_genotyped_l), gene_drop_n))

        t0 = time.time()
        # Two alleles per genotyped sample.
        total_allele_count = len(sample_genotyped_l) * 2
        gene_drop_af_l = list(
            self._iter_gene_drop_af(pop_af, fam_ind_gtyped_dict,
                                    total_allele_count, gene_drop_n))
        t1 = time.time()
        self.logger.log("Processing time: {0:.2f} secs\n".format(t1 - t0))

        return gene_drop_af_l

    def get_gene_drop_power(self,
                            gene_drop_af_l,
                            af_diff_l,
                            pop_af=0.01,
                            alpha=0.05):
        '''Estimate a power value for each allele frequency difference from simulated gene-drop frequencies.

        Args:
            | gene_drop_af_l (list of floats): simulated allele frequencies, e.g. from get_gene_drop_af_l.
            | af_diff_l (list of floats): differences added to pop_af to give the cohort allele frequencies to evaluate.
            | pop_af (float): population allele frequency.
            | alpha (float): significance level.

        Returns:
            power_l (list of floats): one power value per entry of af_diff_l (NaN when gene_drop_af_l is empty).'''

        cohort_af_l = [pop_af + af for af in af_diff_l]

        def get_power(cohort_af, gene_drop_af_l, alpha):
            n = len(gene_drop_af_l)
            if n == 0:
                # No simulations: the proportion below is undefined.
                return np.nan
            #Get binomial distribution.
            p = sum(1 for gene_drop_af in gene_drop_af_l
                    if cohort_af > gene_drop_af) / float(n)
            q = 1.0 - p
            #Approximate to normal distribution.
            mu = n * p
            sigma = (n * p * q)**0.5
            threshold = (1.0 - alpha) * n
            #Get power.
            if sigma == 0:
                # Degenerate distribution (p is 0 or 1): take the limit of the
                # z-score formula, i.e. 1 when the mean reaches the threshold
                # and 0 otherwise.
                return 1.0 if mu >= threshold else 0.0
            z = (threshold - mu) / sigma
            return 1 - st.norm.cdf(z)

        power_l = [
            get_power(cohort_af, gene_drop_af_l, alpha)
            for cohort_af in cohort_af_l
        ]
        return power_l