Python multiple_testing_correctionの例

プログラミング言語: Python

名前空間/パッケージ名: cpgmodule.padjust

メソッド/関数: multiple_testing_correction

hotexamples.comのコード掲載数: 5

Python multiple_testing_correction - 5件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのcpgmodule.padjust.multiple_testing_correctionの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示

def main():
	"""Command-line entry point: per-probe t-test / ANOVA on beta values.

	Parses the command line, reads the group file with read_grp_file1 and
	the data table with ireader.reader, and computes one p-value per data
	row: paired_ttest or standard_ttest when the row has values for two
	groups, anova when it has values for three or more. The p-values are
	passed to padjust.multiple_testing_correction and the input rows are
	written to "<prefix>.pval.txt" with the raw and adjusted p-values
	appended.

	Exit statuses: 101/102/103 when the input file, group file or output
	prefix option is missing; 1 when fewer than two groups are defined;
	2 when a paired test is requested for groups of unequal size; 3 when
	a sample from the group file is absent from the data file header.
	"""
	usage="%prog [options]" + "\n"
	parser = OptionParser(usage,version="%prog " + __version__)
	parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Data file containing beta values with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). Except for the 1st row and 1st column, any non-numerical values will be considered as \"missing values\" and ignored. This file can be a regular text file or compressed file (*.gz, *.bz2) or accessible url.")
	parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Group file defining the biological group of each sample. It is a comma-separated 2 columns file with the 1st column containing sample IDs, and the 2nd column containing group IDs.  It must have a header row. Sample IDs should match to the \"Data file\". Note: automatically switch to use ANOVA if more than 2 groups were defined in this file.")
	parser.add_option("-p","--paired",action="store_true",default=False,dest="paired",help="If '-p/--paired' flag was specified, use paired t-test which requires the equal number of samples in both groups. Paired sampels are matched by the order. This option will be ignored for multiple group analysis.")
	parser.add_option("-w","--welch",action="store_true",default=False,dest="welch_ttest",help="If '-w/--welch' flag was specified, using Welch's t-test which does not assume the two samples have equal variance.  If omitted, use standard two-sample t-test (i.e. assuming the two samples have equal variance). This option will be ignored for paired t-test and multiple group analysis.")
	parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="Prefix of the output file.")
	(options,args)=parser.parse_args()

	print ()
	# All three options are mandatory; each missing one has its own exit status.
	for value, status in ((options.input_file, 101), (options.group_file, 102), (options.out_file, 103)):
		if not value:
			print (__doc__)
			parser.print_help()
			sys.exit(status)

	printlog("Read group file \"%s\" ..." % (options.group_file))
	(ss,gs) = read_grp_file1(options.group_file)

	# s2g: sample ID -> group ID; g2s: group ID -> sample IDs, in the order
	# returned by read_grp_file1.
	s2g = {}
	g2s = collections.defaultdict(list)
	for s,g in zip(ss, gs):
		s2g[s] = g
		g2s[g].append(s)

	group_IDs = sorted(g2s.keys())
	for g in group_IDs:
		print ("\tGroup %s has %d samples:" % (g, len(g2s[g])))
		print ('\t\t' + ','.join(g2s[g]))

	if len(group_IDs) < 2:
		printlog("You must have at least two groups!", file=sys.stderr)
		sys.exit(1)
	elif (len(group_IDs) == 2) and (options.paired is True):
		printlog("Perfrom paired t-test of two related samples ...")
		if len(g2s[group_IDs[0]]) != len(g2s[group_IDs[1]]):
			printlog("Unequal sample size. Cannot perform paired t-test.")
			sys.exit(2)
	elif (len(group_IDs) == 2) and (options.paired is False):
		printlog("Perfrom standard t-test of two independent samples ...")
	elif len(group_IDs) >= 3:
		printlog("Perfrom ANOVA ...")

	probe_list = []
	p_list = []
	for line_num, l in enumerate(ireader.reader(options.input_file), start=1):
		f = l.split()
		if line_num == 1:
			sample_IDs = f[1:]

			# Every sample named in the group file must appear in the header.
			header_samples = set(sample_IDs)
			for s in s2g:
				if s not in header_samples:
					printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
					sys.exit(3)
			continue

		g2values = collections.defaultdict(list)
		probe_ID = f[0]
		for s,raw in zip(sample_IDs, f[1:]):
			# Non-numerical fields are treated as missing values.
			try:
				beta = float(raw)
			except ValueError:
				beta = np.nan

			# Samples not listed in the group file are ignored.
			if s not in s2g:
				continue
			g2values[s2g[s]].append(beta)

		# NOTE(review): when a row yields values for fewer than two groups
		# (e.g. a truncated row), neither branch runs and pval keeps the
		# previous row's value (or is undefined on the first data row).
		if len(g2values) == 2:
			first = np.array(g2values[group_IDs[0]])
			second = np.array(g2values[group_IDs[1]])
			if options.paired:
				(pval,tscore) = paired_ttest(first,second)
			else:
				# NOTE(review): equalVar receives the --welch flag unchanged,
				# i.e. it is True when Welch's test is requested. This looks
				# inverted; confirm against standard_ttest before changing.
				(pval,tscore) = standard_ttest(first,second, equalVar = options.welch_ttest)
		elif len(g2values) >= 3:
			(pval,tscore) = anova(*[np.array(g2values[g]) for g in group_IDs])
		probe_list.append(probe_ID)
		p_list.append(pval)

	printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")
	adjusted_p = {}
	q_list =  padjust.multiple_testing_correction(p_list)
	for probe,p,q in zip(probe_list, p_list, q_list):
		adjusted_p[probe] = '\t'.join([str(i) for i in (p,q)])

	# The output file is opened only once results exist, so a failed run
	# does not truncate a previous result, and it is always closed.
	out_name = options.out_file + '.pval.txt'
	printlog("Writing to %s" % (out_name))
	with open(out_name,'w') as FOUT:
		for line_num, l in enumerate(ireader.reader(options.input_file), start=1):
			if line_num == 1:
				print (l + '\tpval\tadj.pval', file=FOUT)
			else:
				probe_ID = l.split()[0]
				print (l + '\t' + adjusted_p[probe_ID], file=FOUT)

コード例 #2

ファイルを表示

ファイル: dmc_nonparametric.py プロジェクト: shulp2211/cpgtools

def main():
	"""Command-line entry point: per-probe non-parametric test on beta values.

	Parses the command line, reads the group file with read_grp_file1 and
	the data table with ireader.reader, and computes one p-value per data
	row: mwu_test when the row has values for two groups, kruskal_test
	when it has values for three or more. The p-values are passed to
	padjust.multiple_testing_correction and the non-blank input rows are
	written to "<prefix>.pval.txt" with the raw and adjusted p-values
	appended.

	Exit statuses: 101/102/103 when the input file, group file or output
	prefix option is missing; 1 when fewer than two groups are defined;
	3 when a sample from the group file is absent from the data file header.
	"""
	usage="%prog [options]" + "\n"
	parser = OptionParser(usage,version="%prog " + __version__)
	parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Data file containing beta values with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). Except for the 1st row and 1st column, any non-numerical values will be considered as \"missing values\" and ignored. This file can be a regular text file or compressed file (.gz, .bz2).")
	parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Group file defining the biological group of each sample. It is a comma-separated two columns file with the 1st column containing sample IDs, and the 2nd column containing group IDs. It must have a header row. Sample IDs should match to the \"Data file\". Note: automatically switch to use  Kruskal-Wallis H-test if more than two groups were defined in this file.")
	parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
	(options,args)=parser.parse_args()

	print ()

	# All three options are mandatory; each missing one has its own exit status.
	for value, status in ((options.input_file, 101), (options.group_file, 102), (options.out_file, 103)):
		if not value:
			print (__doc__)
			parser.print_help()
			sys.exit(status)

	printlog("Read group file \"%s\" ..." % (options.group_file))
	(grp_samples,grp_ids) = read_grp_file1(options.group_file)
	s2g = dict(zip(grp_samples,grp_ids))	# sample ID -> group ID
	g2s = collections.defaultdict(list)	# group ID -> sample IDs

	for k,v in s2g.items():
		g2s[v].append(k)

	group_IDs = sorted(g2s.keys())
	for g in group_IDs:
		print ("\tGroup %s has %d samples:" % (g, len(g2s[g])))
		print ('\t\t' + ','.join(g2s[g]))

	if len(group_IDs) < 2:
		printlog("You must have at least two groups!", file=sys.stderr)
		sys.exit(1)
	elif len(group_IDs) == 2:
		printlog("Perfrom Mann-Whitney rank test of two samples ...")
	elif len(group_IDs) >= 3:
		printlog("Perfrom Kruskal-Wallis H-test ...")

	sample_IDs = None	# set from the first non-blank line (the header)
	probe_list = []
	p_list = []
	for l in ireader.reader(options.input_file):
		f = l.split()
		if not f:
			continue
		if sample_IDs is None:
			sample_IDs = f[1:]

			# Every sample named in the group file must appear in the header.
			header_samples = set(sample_IDs)
			for s in s2g:
				if s not in header_samples:
					printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
					sys.exit(3)
			continue

		g2values = collections.defaultdict(list)
		probe_ID = f[0]
		for s,raw in zip(sample_IDs, f[1:]):
			# Non-numerical fields are treated as missing values.
			try:
				beta = float(raw)
			except ValueError:
				beta = np.nan

			# Samples not listed in the group file are ignored.
			if s not in s2g:
				continue
			g2values[s2g[s]].append(beta)

		# NOTE(review): when a row yields values for fewer than two groups
		# (e.g. a truncated row), neither branch runs and pval keeps the
		# previous row's value (or is undefined on the first data row).
		if len(g2values) == 2:
			first = np.array(g2values[group_IDs[0]])
			second = np.array(g2values[group_IDs[1]])
			(pval,tscore) = mwu_test(first,second)
		elif len(g2values) >= 3:
			(pval,tscore) = kruskal_test(*[np.array(g2values[g]) for g in group_IDs])
		probe_list.append(probe_ID)
		p_list.append(pval)

	printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")
	adjusted_p = {}
	q_list =  padjust.multiple_testing_correction(p_list)
	for probe,p,q in zip(probe_list, p_list, q_list):
		adjusted_p[probe] = '\t'.join([str(i) for i in (p,q)])

	# The output file is opened only once results exist, so a failed run
	# does not truncate a previous result, and it is always closed.
	out_name = options.out_file + '.pval.txt'
	printlog("Writing to %s" % (out_name))
	with open(out_name,'w') as FOUT:
		header_written = False
		for l in ireader.reader(options.input_file):
			f = l.split()
			# Blank lines were skipped in the first pass and have no p-value;
			# skip them here too instead of failing on f[0].
			if not f:
				continue
			if not header_written:
				print (l + '\tpval\tadj.pval', file=FOUT)
				header_written = True
			else:
				print (l + '\t' + adjusted_p[f[0]], file=FOUT)

コード例 #3

ファイルを表示

def main():
    """Command-line entry point: per-probe Fisher's exact test on read counts.

    Parses the command line, reads the group file with read_grp_file1 and
    the data table with ireader.reader. Each data field is expected to
    look like "methyl_count,total_count"; for every row the methylated
    and unmethylated counts are summed per group and the resulting 2x2
    table is passed to stats.fisher_exact. The p-values are passed to
    padjust.multiple_testing_correction and the input rows are written
    to "<prefix>.pval.txt" with the odds ratio, raw p-value and adjusted
    p-value appended.

    Exit statuses: 101/102/103 when the input file, group file or output
    prefix option is missing; 1 when the group file does not define
    exactly two groups or a count pair is invalid (total < methylated,
    or total == 0); 3 when a sample from the group file is absent from
    the data file header.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i",
        "--input-file",
        action="store",
        type="string",
        dest="input_file",
        help=
        "Data file containing methylation proportions (represented by \"methyl_count,total_count\", eg. \"20,30\") with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). This file can be a regular text file or compressed file (*.gz, *.bz2) or accessible url."
    )
    # NOTE(review): the help text below mentions switching to ANOVA, but
    # this function exits unless exactly two groups are defined.
    parser.add_option(
        "-g",
        "--group",
        action="store",
        type="string",
        dest="group_file",
        help=
        "Group file define the biological groups of each samples. It is a comma-separated 2 columns file with the 1st column containing sample IDs, and the 2nd column containing group IDs.  It must have a header row. Sample IDs shoud match to the \"Data file\". Note: automatically switch to use ANOVA if more than 2 groups were defined in this file."
    )
    parser.add_option("-o",
                      "--output",
                      action="store",
                      type='string',
                      dest="out_file",
                      help="Prefix of output file.")
    (options, args) = parser.parse_args()

    print()
    # All three options are mandatory; each missing one has its own exit status.
    for value, status in ((options.input_file, 101),
                          (options.group_file, 102),
                          (options.out_file, 103)):
        if not value:
            print(__doc__)
            parser.print_help()
            sys.exit(status)

    printlog("Read group file \"%s\" ..." % (options.group_file))
    (grp_samples, grp_ids) = read_grp_file1(options.group_file)
    s2g = dict(zip(grp_samples, grp_ids))  # sample ID -> group ID
    g2s = collections.defaultdict(list)  # group ID -> sample IDs

    for k, v in s2g.items():
        g2s[v].append(k)

    group_IDs = sorted(g2s.keys())
    for g in group_IDs:
        print("\tGroup %s has %d samples:" % (g, len(g2s[g])))
        print('\t\t' + ','.join(g2s[g]))

    if len(group_IDs) != 2:
        printlog("You must have two groups!", file=sys.stderr)
        sys.exit(1)

    # Compiled once; matches "methyl_count,total_count" at the field start.
    count_pattern = re.compile(r'(\d+)\s*\,\s*(\d+)')

    probe_list = []
    p_list = []
    or_list = []
    for line_num, l in enumerate(ireader.reader(options.input_file), start=1):
        f = l.split()
        if line_num == 1:
            sample_IDs = f[1:]
            # Every sample named in the group file must appear in the header.
            header_samples = set(sample_IDs)
            for s in s2g:
                if s not in header_samples:
                    printlog("Cannot find sample ID \"%s\" from file \"%s\"" %
                             (s, options.input_file))
                    sys.exit(3)
            continue

        probe_list.append(f[0])
        g2values = {g: {'methyl': 0, 'unmethyl': 0} for g in group_IDs}
        for s, p in zip(sample_IDs, f[1:]):
            # Samples present in the data file but not in the group file
            # are ignored rather than raising KeyError.
            if s not in s2g:
                continue
            gid = s2g[s]
            m = count_pattern.match(p)
            if m is None:
                # Fields that are not a count pair are treated as missing.
                continue
            c = int(m.group(1))
            n = int(m.group(2))
            if n >= c and n > 0:
                g2values[gid]['methyl'] += c
                g2values[gid]['unmethyl'] += (n - c)
            else:
                printlog("Incorrect data format!")
                print(f)
                sys.exit(1)

        # 2x2 table: one row per group, columns are methylated / unmethylated.
        table = [[g2values[g]['methyl'], g2values[g]['unmethyl']]
                 for g in group_IDs]
        (odds, pval) = stats.fisher_exact(table)
        p_list.append(pval)
        or_list.append(odds)

    printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")
    adjusted_p = {}
    q_list = padjust.multiple_testing_correction(p_list)
    for probe, o, p, q in zip(probe_list, or_list, p_list, q_list):
        adjusted_p[probe] = '\t'.join([str(i) for i in (o, p, q)])

    # The output file is opened only once results exist, so a failed run
    # does not truncate a previous result, and it is always closed.
    out_name = options.out_file + '.pval.txt'
    printlog("Writing to %s" % (out_name))
    with open(out_name, 'w') as FOUT:
        for line_num, l in enumerate(ireader.reader(options.input_file),
                                     start=1):
            if line_num == 1:
                print(l + '\tOddsRatio\tpval\tadj.pval', file=FOUT)
            else:
                probe_ID = l.split()[0]
                print(l + '\t' + adjusted_p[probe_ID], file=FOUT)

コード例 #4

ファイルを表示

ファイル: dmc_logit.py プロジェクト: shulp2211/cpgtools

def main():
	"""Command-line entry point: per-CpG logistic regression run through R.

	Parses the command line, reads samples and covariates with
	read_grp_file2, and writes an R script "<prefix>.r" that defines two
	functions (lrf1 writes the results header plus the first row, lrf2
	appends further rows) fitting glm(cbind(m, t - m) ~ covariates) with
	the selected family, followed by one call per data row. The script is
	run with Rscript (stderr goes to "<prefix>.warnings.txt"), the p-value
	column of the first covariate is read back from
	"<prefix>.results.txt", passed to padjust.multiple_testing_correction,
	and the input rows are written to "<prefix>.pval.txt" with raw and
	adjusted p-values appended (NaN for rows without a usable p-value).

	Exit statuses: 101/102/103 when the input file, group file or output
	prefix option is missing; 104/105 when the input or group file does
	not exist; 106 for an invalid '-f' value; 3 when a sample from the
	group file is absent from the data file header; 1 for an unknown
	variable type, an invalid count pair, or when Rscript cannot be
	started.
	"""
	usage="%prog [options]" + "\n"
	parser = OptionParser(usage,version="%prog " + __version__)
	parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Data file containing methylation proportions (represented by \"methyl_count,total_count\", eg. \"20,30\") with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). This file can be a regular text file or compressed file (*.gz, *.bz2) or accessible url.")
	parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Group file defining the biological groups of each sample as well as other covariables such as gender, age. The first variable is grouping variable (must be categorical), all the other variables are considered as covariates (can be categorical or continuous). Sample IDs should match to the \"Data file\".")
	parser.add_option("-f","--family",action="store",type="int",dest="family_func",default=1, help="Error distribution and link function to be used in the GLM model. Can be integer 1 or 2 with 1 = \"quasibinomial\" and 2 = \"binomial\". Default=%default.")
	parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
	(options,args)=parser.parse_args()

	print ()
	# All three options are mandatory; each missing one has its own exit status.
	for value, status in ((options.input_file, 101), (options.group_file, 102), (options.out_file, 103)):
		if not value:
			print (__doc__)
			parser.print_help()
			sys.exit(status)
	if not os.path.isfile(options.input_file):
		print ("Input data file \"%s\" does not exist\n" % options.input_file)
		sys.exit(104)
	if not os.path.isfile(options.group_file):
		# Report the group file here (the input file was named previously).
		print ("Input group file \"%s\" does not exist\n" % options.group_file)
		sys.exit(105)

	# Validate '-f' before creating any output file.
	family = {1:'quasibinomial', 2:'binomial',}
	if options.family_func not in family:
		print ("Incorrect value of '-f'!")
		sys.exit(106)

	printlog("Read group file \"%s\" ..." % (options.group_file))
	(samples,cv_names, cvs, v_types) = read_grp_file2(options.group_file)
	for cv_name in cv_names:
		print ("%s: %s" % (cv_name, v_types[cv_name]))
		for sample in samples:
			print ('\t' + sample + '\t' + cvs[cv_name][sample])

	primary_variable = cv_names[0]
	r_script = options.out_file + '.r'
	results_file = options.out_file + '.results.txt'

	# Compiled once; matches "methyl_count,total_count" at the field start.
	count_pattern = re.compile(r'(\d+)\s*\,\s*(\d+)')

	with open(r_script,'w') as ROUT:
		# lrf1: fits the model and writes the results file with a header row.
		print ('lrf1 <- function (cgid, m,t,%s){' % ','.join(cv_names), file=ROUT)
		print ('try(fit <- glm(cbind(m,t - m) ~ %s, family=%s))' % ('+'.join(cv_names),family[options.family_func]), file=ROUT)
		print ('pvals <- coef(summary(fit))[,4]', file=ROUT)
		print ('coefs <- coef(summary(fit))[,1]', file=ROUT)
		print ('\tif(max(pvals, na.rm=T)>1){pvals = pvals + NA}', file=ROUT)
		print ('\tif(sum(m, na.rm=T) == 0){pvals = pvals + NA}', file=ROUT)
		print ( 'write.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1),quote=FALSE, row.names=FALSE, sep="\\t", col.names=c("ID",paste(gsub("2","",names(coefs)), "coef",sep="."), paste(gsub("2","",names(pvals)), "pval",sep=".")))' % (results_file),  file = ROUT)
		print ('}', file=ROUT)
		print ('\n', file=ROUT)

		# lrf2: same fit, but appends a row without a header.
		print ('lrf2 <- function (cgid, m,t,%s){' % ','.join(cv_names), file=ROUT)
		print ('try(fit <- glm(cbind(m,t - m) ~ %s, family=%s))' % ('+'.join(cv_names),family[options.family_func]), file=ROUT)
		print ('pvals <- coef(summary(fit))[,4]', file=ROUT)
		print ('coefs <- coef(summary(fit))[,1]', file=ROUT)
		print ('\tif(max(pvals, na.rm=T)>1){pvals = pvals + NA}', file=ROUT)
		print ('\tif(sum(m, na.rm=T) == 0){pvals = pvals + NA}', file=ROUT)
		print ( 'write.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1),quote=FALSE, row.names=FALSE, sep="\\t", col.names=FALSE, append = TRUE)' % (results_file),  file = ROUT)
		print ('}', file=ROUT)
		print ('\n', file=ROUT)

		printlog("Processing file \"%s\" ..." % (options.input_file))
		for line_num, l in enumerate(ireader.reader(options.input_file), start=1):
			f = l.split()
			if len(f) == 0: continue
			if line_num == 1:
				sample_IDs = f[1:]
				# Every sample named in the group file must appear in the header.
				header_samples = set(sample_IDs)
				for s in samples:
					if s not in header_samples:
						printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
						sys.exit(3)
				# Emit each covariate as an R vector ordered like the data columns.
				for cv_name in cv_names:
					values = ','.join([str(cvs[cv_name][s]) for s in sample_IDs])
					if v_types[cv_name] == 'continuous':
						print (cv_name + ' <- c(%s)' % (values), file = ROUT)
					elif  v_types[cv_name] == 'categorical':
						print (cv_name + ' <- as.factor(c(%s))' % (values), file = ROUT)
					else:
						printlog("unknown vaiable type!")
						sys.exit(1)

				print ('\n', file=ROUT)
				continue

			methyl_reads = []	# c
			total_reads = []	# n
			cg_id = f[0]
			for field in f[1:]:
				m = count_pattern.match(field)
				if m is None:
					# Not a count pair: passed to R as NaN.
					methyl_reads.append("NaN")
					total_reads.append("NaN")
					continue
				c = int(m.group(1))
				n = int(m.group(2))
				if n >= c and n > 0:
					methyl_reads.append(c)
					total_reads.append(n)
				else:
					printlog("Incorrect data format!")
					print (f)
					sys.exit(1)
			# Line 2 creates the results file (with header); later lines append.
			r_func = 'lrf1' if line_num == 2 else 'lrf2'
			print ('%s(\"%s\", c(%s), c(%s), %s)' % (r_func, cg_id, ','.join([str(read) for read in methyl_reads]), ','.join([str(read) for read in total_reads]), ','.join(cv_names)), file=ROUT)

	# Run Rscript without a shell so paths containing spaces or shell
	# metacharacters are passed through verbatim; a missing Rscript
	# executable raises OSError and is reported below.
	printlog("Runing Rscript file \"%s\" ..." % (r_script))
	try:
		with open(options.out_file + '.warnings.txt', 'w') as WOUT:
			subprocess.call(['Rscript', r_script], stderr=WOUT)
	except OSError:
		print ("Error: cannot run Rscript: \"%s\"" % (r_script), file=sys.stderr)
		sys.exit(1)

	# read
	printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")

	p_list = []
	probe_list = []
	with open(results_file, 'r') as RESULTS:
		for line_num, l in enumerate(RESULTS, start=1):
			l = l.strip()
			if line_num == 1:
				headers = l.split()
				primary_v_index = headers.index(primary_variable + '.pval')
				continue
			v = l.split()
			try:
				pv = float(v[primary_v_index])
			except (ValueError, IndexError):
				# Non-numeric (e.g. NA) or short rows carry no p-value.
				continue
			if 0 <= pv <= 1:
				p_list.append(pv)
				probe_list.append(v[0])

	# adjust
	q_list =  padjust.multiple_testing_correction(p_list)

	# write
	adjusted_p = {}
	for probe,p,q in zip(probe_list, p_list, q_list):
		adjusted_p[probe] = '\t'.join([str(i) for i in (p,q)])
	out_name = options.out_file + '.pval.txt'
	printlog("Writing to %s" % (out_name))
	with open(out_name,'w') as FOUT:
		for line_num, l in enumerate(ireader.reader(options.input_file), start=1):
			if line_num == 1:
				print (l + '\tpval\tadj.pval', file=FOUT)
				continue
			f = l.split()
			# Blank lines were skipped when building the R script; skip
			# them here too instead of failing on f[0].
			if not f:
				continue
			# Probes without a usable p-value are reported as NaN.
			print (l + '\t' + adjusted_p.get(f[0], 'NaN\tNaN'), file=FOUT)

コード例 #5

ファイルを表示

ファイル: dmc_glm.py プロジェクト: liguowang/cpgtools

def main():
    """Command-line entry point: per-CpG Gaussian GLM run through R.

    Parses the command line, reads samples and covariates with
    read_grp_file2, and writes an R script "<prefix>.r" that defines two
    functions (lrf1 writes the results header plus the first row, lrf2
    appends further rows) fitting glm(y ~ covariates, family=gaussian),
    followed by one call per data row. The script is run with Rscript
    (stderr goes to "<prefix>.warnings.txt"), the p-value column of the
    first covariate is read back from "<prefix>.results.txt", passed to
    padjust.multiple_testing_correction, and the input rows are written
    to "<prefix>.pval.txt" with raw and adjusted p-values appended (NaN
    for rows without a usable p-value).

    Exit statuses: 101/102/103 when the input file, group file or output
    prefix option is missing; 104/105 when the input or group file does
    not exist; 3 when a sample from the group file is absent from the
    data file header; 1 for an unknown variable type or when Rscript
    cannot be started.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i",
        "--input_file",
        action="store",
        type="string",
        dest="input_file",
        help=
        "Data file containing beta values with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). This file can be a regular text file or compressed file (.gz, .bz2)."
    )
    parser.add_option(
        "-g",
        "--group",
        action="store",
        type="string",
        dest="group_file",
        help=
        "Group file defining the biological groups of each sample as well as other covariables such as gender, age. The first variable is grouping variable (must be categorical), all the other variables are considered as covariates (can be categorical or continuous). Sample IDs should match to the \"Data file\"."
    )
    parser.add_option("-o",
                      "--output",
                      action="store",
                      type='string',
                      dest="out_file",
                      help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print()
    # All three options are mandatory; each missing one has its own exit status.
    for value, status in ((options.input_file, 101),
                          (options.group_file, 102),
                          (options.out_file, 103)):
        if not value:
            print(__doc__)
            parser.print_help()
            sys.exit(status)

    if not os.path.isfile(options.input_file):
        print("Input data file \"%s\" does not exist\n" % options.input_file)
        sys.exit(104)
    if not os.path.isfile(options.group_file):
        # Report the group file here (the input file was named previously).
        print("Input group file \"%s\" does not exist\n" % options.group_file)
        sys.exit(105)

    printlog("Read group file \"%s\" ..." % (options.group_file))
    (samples, cv_names, cvs, v_types) = read_grp_file2(options.group_file)
    for cv_name in cv_names:
        print("%s: %s" % (cv_name, v_types[cv_name]))
        for sample in samples:
            print('\t' + sample + '\t' + cvs[cv_name][sample])

    primary_variable = cv_names[0]
    r_script = options.out_file + '.r'
    results_file = options.out_file + '.results.txt'

    with open(r_script, 'w') as ROUT:
        # lrf1: fits the model and writes the results file with a header row.
        print('lrf1 <- function (cgid, y, %s){' % ','.join(cv_names),
              file=ROUT)
        print('try(fit <- glm(y ~ %s, family=gaussian))' %
              ('+'.join(cv_names)),
              file=ROUT)
        print('pvals <- coef(summary(fit))[,4]', file=ROUT)
        print('coefs <- coef(summary(fit))[,1]', file=ROUT)
        print(
            'write.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1),quote=FALSE, row.names=FALSE, sep="\\t", col.names=c("ID",paste(gsub("2","",names(coefs)), "coef",sep="."), paste(gsub("2","",names(pvals)), "pval",sep=".")))'
            % (results_file),
            file=ROUT)
        print('}', file=ROUT)
        print('\n', file=ROUT)

        # lrf2: same fit, but appends a row without a header.
        print('lrf2 <- function (cgid, y,%s){' % ','.join(cv_names),
              file=ROUT)
        print('try(fit <- glm(y ~ %s, family=gaussian))' %
              ('+'.join(cv_names)),
              file=ROUT)
        print('pvals <- coef(summary(fit))[,4]', file=ROUT)
        print('coefs <- coef(summary(fit))[,1]', file=ROUT)
        print(
            'write.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1),quote=FALSE, row.names=FALSE, sep="\\t", col.names=FALSE, append = TRUE)'
            % (results_file),
            file=ROUT)
        print('}', file=ROUT)
        print('\n', file=ROUT)

        printlog("Processing file \"%s\" ..." % (options.input_file))
        for line_num, l in enumerate(ireader.reader(options.input_file),
                                     start=1):
            f = l.split()
            if len(f) == 0: continue
            if line_num == 1:
                sample_IDs = f[1:]
                # Every sample named in the group file must appear in the
                # header.
                header_samples = set(sample_IDs)
                for s in samples:
                    if s not in header_samples:
                        printlog(
                            "Cannot find sample ID \"%s\" from file \"%s\"" %
                            (s, options.input_file))
                        sys.exit(3)
                # Emit each covariate as an R vector ordered like the data
                # columns.
                for cv_name in cv_names:
                    values = ','.join(
                        [str(cvs[cv_name][s]) for s in sample_IDs])
                    if v_types[cv_name] == 'continuous':
                        print(cv_name + ' <- c(%s)' % (values), file=ROUT)
                    elif v_types[cv_name] == 'categorical':
                        print(cv_name + ' <- as.factor(c(%s))' % (values),
                              file=ROUT)
                    else:
                        printlog("unknown vaiable type!")
                        sys.exit(1)
                print('\n', file=ROUT)
                continue

            beta_values = []
            cg_id = f[0]
            for field in f[1:]:
                try:
                    beta_values.append(float(field))
                except ValueError:
                    # Non-numerical fields are passed to R as NaN.
                    beta_values.append("NaN")
            # Line 2 creates the results file (with header); later lines
            # append.
            r_func = 'lrf1' if line_num == 2 else 'lrf2'
            print('%s(\"%s\", c(%s), %s)' %
                  (r_func, cg_id, ','.join([str(i) for i in beta_values]),
                   ','.join(cv_names)),
                  file=ROUT)

    # Run Rscript without a shell so paths containing spaces or shell
    # metacharacters are passed through verbatim; a missing Rscript
    # executable raises OSError and is reported below.
    printlog("Runing Rscript file \"%s\" ..." % (r_script))
    try:
        with open(options.out_file + '.warnings.txt', 'w') as WOUT:
            subprocess.call(['Rscript', r_script], stderr=WOUT)
    except OSError:
        print("Error: cannot run Rscript: \"%s\"" % (r_script),
              file=sys.stderr)
        sys.exit(1)

    # read
    printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")

    p_list = []
    probe_list = []
    with open(results_file, 'r') as RESULTS:
        for line_num, l in enumerate(RESULTS, start=1):
            l = l.strip()
            if line_num == 1:
                headers = l.split()
                primary_v_index = headers.index(primary_variable + '.pval')
                continue
            v = l.split()
            try:
                pv = float(v[primary_v_index])
            except (ValueError, IndexError):
                # Non-numeric (e.g. NA) or short rows carry no p-value.
                continue
            if 0 <= pv <= 1:
                p_list.append(pv)
                probe_list.append(v[0])

    # adjust
    q_list = padjust.multiple_testing_correction(p_list)

    # write
    adjusted_p = {}
    for probe, p, q in zip(probe_list, p_list, q_list):
        adjusted_p[probe] = '\t'.join([str(i) for i in (p, q)])
    out_name = options.out_file + '.pval.txt'
    printlog("Writing to %s" % (out_name))
    with open(out_name, 'w') as FOUT:
        for line_num, l in enumerate(ireader.reader(options.input_file),
                                     start=1):
            if line_num == 1:
                print(l + '\tpval\tadj.pval', file=FOUT)
                continue
            f = l.split()
            # Blank lines were skipped when building the R script; skip
            # them here too instead of failing on f[0].
            if not f:
                continue
            # Probes without a usable p-value are reported as NaN.
            print(l + '\t' + adjusted_p.get(f[0], 'NaN\tNaN'), file=FOUT)