Ejemplo n.º 1
0
def print_help():
    print
    script = os.path.basename(sys.argv[0])
    print "usage:"
    print "       %s [options] input_file" % script
    print "   or "
    print "       %s --help" % script
    print "   or "
    print "       %s --version" % script
    CONFIG, info = standard_config()
    dump_config_info(CONFIG, info)
Ejemplo n.º 2
0
def print_help():
    print
    script = os.path.basename(sys.argv[0])
    print "usage:"
    print "       %s [options] input_file" % script
    print "   or "
    print "       %s --help" % script
    print "   or "
    print "       %s --version" % script
    CONFIG, info = standard_config()
    dump_config_info(CONFIG, info)
Ejemplo n.º 3
0
def _main(args):
    """Parse the command line, score the input file and write the results.

    ``args`` is the list of command line arguments.  Arguments starting with
    ``--`` are options: ``--key=value`` stores the string ``value`` under
    ``key`` and a bare ``--key`` stores ``True``.  Exactly one other argument
    is expected: the path of the input file.  The options are merged into the
    dictionary returned by ``standard_config()``.

    ``--help`` and ``--version`` print their text and return immediately.
    Unless the ``target.overwrite`` setting is true, an error is printed for
    every output file that already exists and the function returns without
    processing anything.

    Output files are written to the ``target.dir`` setting, or to the
    directory of the input file when that setting is ``None``.  Their names
    start with the input file's basename without extension.

    Raises:
        Exception: if no input file or more than one input file is given, or
            if the file named by ``apply_scorer`` or ``apply_weights`` does
            not exist.
    """
    options = dict()
    path = None

    if "--help" in args:
        print_help()
        return

    if "--version" in args:
        print_version()
        return

    for arg in args:
        if arg.startswith("--"):
            if "=" in arg:
                pre, __, post = arg.partition("=")
                options[pre[2:]] = post
            else:
                options[arg[2:]] = True
        else:
            if path is not None:
                print_help()
                raise Exception("duplicate input file argument")
            path = arg

    if path is None:
        print_help()
        raise Exception("no input file given")

    CONFIG, info = standard_config()
    CONFIG.update(options)
    fix_config_types(CONFIG)
    dump_config(CONFIG)

    delim_in = CONFIG.get("delim.in", ",")
    delim_out = CONFIG.get("delim.out", ",")

    dirname = CONFIG.get("target.dir", None)
    if dirname is None:
        dirname = os.path.dirname(path)

    basename = os.path.basename(path)
    prefix, __ = os.path.splitext(basename)

    # Optionally load a previously persisted scorer instead of training one.
    persisted_scorer = None
    apply_scorer = CONFIG.get("apply_scorer")
    if apply_scorer:
        if not os.path.exists(apply_scorer):
            raise Exception("scorer file %s does not exist" % apply_scorer)
        try:
            # NOTE: unpickling can execute arbitrary code; only apply scorer
            # files that come from a trusted source.
            with open(apply_scorer, "rb") as fp:
                persisted_scorer = cPickle.loads(zlib.decompress(fp.read()))
        except Exception:
            import traceback
            traceback.print_exc()
            raise

    apply_existing_scorer = persisted_scorer is not None

    # Optionally load previously trained weights.
    persisted_weights = None
    apply_weights = CONFIG.get("apply_weights")
    if apply_weights:
        if not os.path.exists(apply_weights):
            raise Exception("weights file %s does not exist" % apply_weights)
        try:
            persisted_weights = np.loadtxt(apply_weights)
        except Exception:
            import traceback
            traceback.print_exc()
            raise

    apply_existing_weights = persisted_weights is not None

    class Pathes(dict):
        """Map symbolic output names to ``dirname/prefix + postfix`` paths."""

        def __init__(self, prefix=prefix, dirname=dirname, **kw):
            for k, postfix in kw.items():
                self[k] = os.path.join(dirname, prefix + postfix)
        # Allow pathes.report as a shorthand for pathes["report"].
        __getattr__ = dict.__getitem__

    pathes = Pathes(scored_table="_with_dscore.csv",
                    filtered_table="_with_dscore_filtered.csv",
                    final_stat="_full_stat.csv",
                    summ_stat="_summary_stat.csv",
                    report="_report.pdf",
                    cutoffs="_cutoffs.txt",
                    svalues="_svalues.txt",
                    qvalues="_qvalues.txt",
                    d_scores_top_target_peaks="_dscores_top_target_peaks.txt",
                    d_scores_top_decoy_peaks="_dscores_top_decoy_peaks.txt",
                    mayu_cutoff="_mayu.cutoff",
                    mayu_fasta="_mayu.fasta",
                    mayu_csv="_mayu.csv",
                    )

    if not apply_existing_scorer:
        pickled_scorer_path = os.path.join(dirname, prefix + "_scorer.bin")

    if not apply_existing_weights:
        trained_weights_path = os.path.join(dirname, prefix + "_weights.txt")

    if not CONFIG.get("target.overwrite", False):
        found_existing_file = False
        # Check the generated file paths (the dict values); the keys are only
        # symbolic names such as "report" and never name an output file.
        to_check = sorted(pathes.values())
        if not apply_existing_scorer:
            to_check.append(pickled_scorer_path)
        if not apply_existing_weights:
            to_check.append(trained_weights_path)
        for p in to_check:
            if os.path.exists(p):
                found_existing_file = True
                print "ERROR: %s already exists" % p
        if found_existing_file:
            print
            print "please use --target.overwrite option"
            print
            return

    format_ = "%(levelname)s -- [pid=%(process)s] : %(asctime)s: %(message)s"
    logging.basicConfig(level=logging.INFO, format=format_)
    logging.info("config settings:")
    for k, v in sorted(CONFIG.items()):
        logging.info("    %s: %s" % (k, v))
    start_at = time.time()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        result, needed_to_persist, trained_weights = PyProphet().process_csv(
            path, delim_in, persisted_scorer, persisted_weights)
        (summ_stat, final_stat, scored_table) = result
    needed = time.time() - start_at

    print
    print "=" * 78
    print
    print summ_stat
    print
    print "=" * 78

    print
    if summ_stat is not None:
        summ_stat.to_csv(pathes.summ_stat, sep=delim_out, index=False)
        print "WRITTEN: ", pathes.summ_stat
    if final_stat is not None:
        final_stat.to_csv(pathes.final_stat, sep=delim_out, index=False)
        print "WRITTEN: ", pathes.final_stat
        plot_data = save_report(pathes.report, basename, scored_table, final_stat)
        print "WRITTEN: ", pathes.report
        cutoffs, svalues, qvalues, top_target, top_decoys = plot_data
        for (name, values) in [("cutoffs", cutoffs), ("svalues", svalues), ("qvalues", qvalues),
                               ("d_scores_top_target_peaks", top_target),
                               ("d_scores_top_decoy_peaks", top_decoys)]:
            # Separate name so the input file path is not overwritten.
            out_path = pathes[name]
            with open(out_path, "w") as fp:
                fp.write(" ".join("%e" % v for v in values))
            print "WRITTEN: ", out_path
    scored_table.to_csv(pathes.scored_table, sep=delim_out, index=False)
    print "WRITTEN: ", pathes.scored_table

    filtered_table = scored_table[scored_table.d_score > CONFIG.get("d_score.cutoff")]

    filtered_table.to_csv(pathes.filtered_table, sep=delim_out, index=False)
    print "WRITTEN: ", pathes.filtered_table

    if not apply_existing_scorer:
        bin_data = zlib.compress(cPickle.dumps(needed_to_persist, protocol=2))
        with open(pickled_scorer_path, "wb") as fp:
            fp.write(bin_data)
        print "WRITTEN: ", pickled_scorer_path

    if not apply_existing_weights:
        np.savetxt(trained_weights_path, trained_weights, delimiter="\t")
        print "WRITTEN: ", trained_weights_path

    if CONFIG.get("export.mayu", True):
        export_mayu(pathes.mayu_cutoff, pathes.mayu_fasta, pathes.mayu_csv, scored_table, final_stat)
        print "WRITTEN: ", pathes.mayu_cutoff
        print "WRITTEN: ", pathes.mayu_fasta
        print "WRITTEN: ", pathes.mayu_csv
    print

    # Split the wall time into minutes, remaining seconds and milliseconds so
    # that the seconds printed next to the minutes never exceed 59.
    whole_seconds = int(needed)
    msecs = int(1000 * (needed - whole_seconds))
    minutes, seconds = divmod(whole_seconds, 60)

    print "NEEDED",
    if minutes:
        print minutes, "minutes and",

    print "%d seconds and %d msecs wall time" % (seconds, msecs)
    print
Ejemplo n.º 4
0
def _main(args):

	options = dict()
	path = None
	
	print "PyProphet, unified edition"
	
	if "--help" in args:
		print_help()
		return

	if "--version" in args:
		print_version()
		return

	def USER_ERROR(str):
		print "USER ERROR:", str

	for arg in args:
		if arg.startswith("--"):
			if "=" in arg:
				pre, __, post = arg.partition("=")
				options[pre[2:]] = post
			else:
				options[arg[2:]] = True
		else:
			if path is not None:
				print_help()
				USER_ERROR("duplicate input file argument")
				sys.exit(EX_USAGE)
			path = arg

	if path is None:
		print_help()
		USER_ERROR("no input file given")
		sys.exit(EX_USAGE)

	CONFIG, info = standard_config()
	invalid_params = get_invalid_params(CONFIG, options)
	if len(invalid_params) > 0:
		print_help()
		for p in invalid_params:
			USER_ERROR("invalid parameter '%s'" % p)
		sys.exit(EX_CONFIG)

	CONFIG.update(options)
	fix_config_types(CONFIG)
	dump_config(CONFIG)

	delim_in = CONFIG.get("delim.in", ",")
	delim_out = CONFIG.get("delim.out", ",")

	dirname = CONFIG.get("target.dir", None)
	if dirname is None:
		dirname = os.path.dirname(path)

	basename = os.path.basename(path)
	prefix, __ = os.path.splitext(basename)



	persisted_scorer = None
	apply_scorer = CONFIG.get("apply_scorer")
	if apply_scorer:
		if not os.path.exists(apply_scorer):
			USER_ERROR("scorer file %s does not exist" % apply_scorer)
			sys.exit(EX_CONFIG)
		try:
			persisted_scorer = cPickle.loads(zlib.decompress(open(apply_scorer, "rb").read()))
		except:
			import traceback
			traceback.print_exc()
			raise

#	print "## SCORER PATH: ", apply_scorer	
#	print "## PERSISTED SCORER: ", persisted_scorer	
	apply_existing_scorer = persisted_scorer is not None
	if not apply_existing_scorer:
		pickled_scorer_path = os.path.join(dirname, prefix + "_scorer.bin")



	persisted_weights = None
	apply_weights = CONFIG.get("apply_weights")
	if apply_weights:
		if not os.path.exists(apply_weights):
			USER_ERROR("weights file %s does not exist" % apply_weights)
			sys.exit(EX_CONFIG)
		try:
			persisted_weights = np.loadtxt(apply_weights)
		except:
			import traceback
			traceback.print_exc()
			raise

	apply_existing_weights = persisted_weights is not None
	if not apply_existing_weights:
		trained_weights_path = os.path.join(dirname, prefix + "_weights.txt")


	class Paths(dict):
		def __init__(self, prefix=prefix, dirname=dirname, **kw):
			for k, postfix in kw.items():
				self[k] = os.path.join(dirname, prefix + postfix)
		__getattr__ = dict.__getitem__

	paths = Paths(scored_table="_with_dscore.csv",
					filtered_table="_with_dscore_filtered.csv",
					output="_output.csv",
					final_stat="_full_stat.csv",
					summ_stat="_summary_stat.csv",
					report="_report.pdf",
					cutoffs="_cutoffs.txt",
					svalues="_svalues.txt",
					d_scores_top_target_peaks="_dscores_top_target_peaks.txt",
					d_scores_top_decoy_peaks="_dscores_top_decoy_peaks.txt",
					mayu_cutoff="_mayu.cutoff",
					mayu_fasta="_mayu.fasta",
					mayu_csv="_mayu.csv",
					)


	
	if not CONFIG.get("target.overwrite", False):
		found_existing_file = False
		to_check = list(paths.keys())
		if not apply_existing_scorer:
			to_check.append(pickled_scorer_path)
		if not apply_existing_weights:
			to_check.append(trained_weights_path)
		for p in to_check:
			if os.path.exists(p):
				found_existing_file = True
				print "OUTPUT ERROR: %s already exists" % p
		if found_existing_file:
			print
			print "please use --target.overwrite option"
			print
			sys.exit(EX_CANTCREAT)

	format_ = "%(levelname)s -- [pid=%(process)s] : %(asctime)s: %(message)s"
	logging.basicConfig(level=logging.INFO, format=format_)
	logging.info("config settings:")
	for k, v in sorted(CONFIG.items()):
		logging.info("	%s: %s" % (k, v))
	start_at = time.time()
	with warnings.catch_warnings():
		warnings.simplefilter("ignore")
		
		classifierType = CONFIG.get("classifier.type")
		if classifierType == "LDA":
			classifier = LDALearner
		elif classifierType == "SGD":
			classifier = SGDLearner
		elif classifierType == "linSVM":
			classifier = LinearSVMLearner
		elif classifierType == "rbfSVM":
			classifier = RbfSVMLearner
		elif classifierType == "polySVM":
			classifier = PolySVMLearner
		elif classifierType == "logit":
			classifier = LogitLearner
		else:
			USER_ERROR("classifier '%s' is not supported" % classifierType)
			sys.exit(EX_CONFIG)
		
		method = HolyGostQuery(StandardSemiSupervisedTeacher(classifier))
		result_tables, clfs_df, needed_to_persist, trained_weights = method.process_csv(path, delim_in, persisted_scorer, persisted_weights)
	
	needed = time.time() - start_at

	train_frac 	= CONFIG.get("train.fraction")
	def printSumTable(str, df):
		with warnings.catch_warnings():
			warnings.filterwarnings("ignore",category=DeprecationWarning)
			if df is not None:
				print str
				print df[df.qvalue < 0.21][['qvalue', 'TP', 'cutoff']]
	print
	print "=" * 78
	print "%d%% of data used for training" % (train_frac*100)
	print "'" * 78
	print
	#for k in result_dict.iterkeys():
	printSumTable(k, result_tables[0])
	print
	print "=" * 78
	print
	
	if not CONFIG.get("no.file.output"):
		summ_stat, final_stat, scored_table = result_tables
		#if 'true_normal' in result_tables:
		#	summ_statT, final_statT, scored_tableT = result_tables['true_normal']
		#	summ_stat.to_csv(paths.summ_stat, sep=delim_out, index=False)
		#	print "WRITTEN: ", paths.summ_stat
		#	plot_data = save_report(paths.reportT, basename, scored_tableT, final_statT)
		#	print "WRITTEN: ", paths.report
		if summ_stat is not None:
			summ_stat.to_csv(paths.summ_stat, sep=delim_out, index=False)
			print "WRITTEN: ", paths.summ_stat

		if final_stat is not None:
			plot_data = save_report(paths.report, basename, scored_table, final_stat)
			print "WRITTEN: ", paths.report
			
			if True: #CONFIG.get("all.output"):
				final_stat.to_csv(paths.final_stat, sep=delim_out, index=False)
				print "WRITTEN: ", paths.final_stat
				
				cutoffs, svalues, qvalues, top_target, top_decoys = plot_data
				for (name, values) in [("cutoffs", cutoffs), ("svalues", svalues), ("qvalues", qvalues),
								   ("d_scores_top_target_peaks", top_target),
								   ("d_scores_top_decoy_peaks", top_decoys)]:
					path = paths[name]
					with open(path, "w") as fp:
						fp.write(" ".join("%e" % v for v in values))
					print "WRITTEN: ", path
		
		if clfs_df is not None and CONFIG.get("all.output"):
			clfs_df.to_csv("clfs.csv", sep=delim_out, index=False)
			print "WRITTEN: ", "clfs.csv"
		
		scored_table.to_csv(paths.scored_table, sep=delim_out, index=False)
		print "WRITTEN: ", paths.scored_table

		output = scored_table.rename(columns = {"d_score" : "pyProph_score", "m_score" : "qvalue"})
		output.to_csv(paths.output, sep=delim_out, index=False)
		print "WRITTEN: ", paths.output

		filtered_table = scored_table[scored_table.d_score > CONFIG.get("d_score.cutoff")]
		filtered_table.to_csv(paths.filtered_table, sep=delim_out, index=False)
		print "WRITTEN: ", paths.filtered_table
		
		if not apply_existing_scorer: # and CONFIG.get("all.output"):
			bin_data = zlib.compress(cPickle.dumps(needed_to_persist, protocol=2))
			with open(pickled_scorer_path, "wb") as fp:
				fp.write(bin_data)
			print "WRITTEN: ", pickled_scorer_path

		if not apply_existing_weights:
			np.savetxt(trained_weights_path,trained_weights,delimiter="\t")
			print "WRITTEN: ", trained_weights_path

		if CONFIG.get("export.mayu", True):
			export_mayu(paths.mayu_cutoff, paths.mayu_fasta, paths.mayu_csv, scored_table, final_stat)
			print "WRITTEN: ", paths.mayu_cutoff
			print "WRITTEN: ", paths.mayu_fasta
			print "WRITTEN: ", paths.mayu_csv
		print


	print "NEEDED %s wall time" % (nice_time(needed))
	print
Ejemplo n.º 5
0
def _main(args):
    """Parse the command line, score the input file and write the results.

    ``args`` is the list of command line arguments.  Arguments starting with
    ``--`` are options: ``--key=value`` stores the string ``value`` under
    ``key`` and a bare ``--key`` stores ``True``.  Exactly one other argument
    is expected: the path of the input file.  The options are merged into the
    dictionary returned by ``standard_config()``.

    ``--help`` and ``--version`` print their text and return immediately.
    Unless the ``target.overwrite`` setting is true, an error is printed for
    every output file that already exists and the function returns without
    processing anything.

    Output files are written to the ``target.dir`` setting, or to the
    directory of the input file when that setting is ``None``.  Their names
    start with the input file's basename without extension.

    Raises:
        Exception: if no input file or more than one input file is given, or
            if the scorer file named by the ``apply`` setting does not exist.
    """
    options = dict()
    path = None

    if "--help" in args:
        print_help()
        return

    if "--version" in args:
        print_version()
        return

    for arg in args:
        if arg.startswith("--"):
            if "=" in arg:
                pre, __, post = arg.partition("=")
                options[pre[2:]] = post
            else:
                options[arg[2:]] = True
        else:
            if path is not None:
                print_help()
                raise Exception("duplicate input file argument")
            path = arg

    if path is None:
        print_help()
        raise Exception("no input file given")

    CONFIG, info = standard_config()
    CONFIG.update(options)
    fix_config_types(CONFIG)
    dump_config(CONFIG)

    delim_in = CONFIG.get("delim.in", ",")
    delim_out = CONFIG.get("delim.out", ",")

    dirname = CONFIG.get("target.dir", None)
    if dirname is None:
        dirname = os.path.dirname(path)

    basename = os.path.basename(path)
    prefix, __ = os.path.splitext(basename)

    # Optionally load a previously persisted scorer instead of training one.
    persisted = None
    apply_ = CONFIG.get("apply")
    if apply_:
        if not os.path.exists(apply_):
            raise Exception("scorer file %s does not exist" % apply_)
        try:
            # NOTE: unpickling can execute arbitrary code; only apply scorer
            # files that come from a trusted source.
            with open(apply_, "rb") as fp:
                persisted = cPickle.loads(zlib.decompress(fp.read()))
        except Exception:
            import traceback
            traceback.print_exc()
            raise

    apply_existing_scorer = persisted is not None

    class Pathes(dict):
        """Map symbolic output names to ``dirname/prefix + postfix`` paths."""

        def __init__(self, prefix=prefix, dirname=dirname, **kw):
            for k, postfix in kw.items():
                self[k] = os.path.join(dirname, prefix + postfix)
        # Allow pathes.report as a shorthand for pathes["report"].
        __getattr__ = dict.__getitem__

    pathes = Pathes(scored_table="_with_dscore.csv",
                    final_stat="_full_stat.csv",
                    summ_stat="_summary_stat.csv",
                    report="_report.pdf",
                    cutoffs="_cutoffs.txt",
                    svalues="_svalues.txt",
                    qvalues="_qvalues.txt",
                    d_scores_top_target_peaks="_dscores_top_target_peaks.txt",
                    d_scores_top_decoy_peaks="_dscores_top_decoy_peaks.txt",
                    mayu_cutoff="_mayu.cutoff",
                    mayu_fasta="_mayu.fasta",
                    mayu_csv="_mayu.csv",
                    )

    if not apply_existing_scorer:
        pickled_scorer_path = os.path.join(dirname, prefix + "_scorer.bin")

    if not CONFIG.get("target.overwrite", False):
        found_existing_file = False
        # Check the generated file paths (the dict values); the keys are only
        # symbolic names such as "report" and never name an output file.
        to_check = sorted(pathes.values())
        if not apply_existing_scorer:
            to_check.append(pickled_scorer_path)
        for p in to_check:
            if os.path.exists(p):
                found_existing_file = True
                print "ERROR: %s already exists" % p
        if found_existing_file:
            print
            print "please use --target.overwrite option"
            print
            return

    format_ = "%(levelname)s -- [pid=%(process)s] : %(asctime)s: %(message)s"
    logging.basicConfig(level=logging.INFO, format=format_)
    logging.info("config settings:")
    for k, v in sorted(CONFIG.items()):
        logging.info("    %s: %s" % (k, v))
    start_at = time.time()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        result, needed_to_persist = PyProphet().process_csv(path, delim_in, persisted)
        (summ_stat, final_stat, scored_table) = result
    needed = time.time() - start_at

    print
    print "=" * 78
    print
    print summ_stat
    print
    print "=" * 78

    print
    if summ_stat is not None:
        summ_stat.to_csv(pathes.summ_stat, sep=delim_out, index=False)
        print "WRITTEN: ", pathes.summ_stat
    if final_stat is not None:
        final_stat.to_csv(pathes.final_stat, sep=delim_out, index=False)
        print "WRITTEN: ", pathes.final_stat
        plot_data = save_report(pathes.report, basename, scored_table, final_stat)
        print "WRITTEN: ", pathes.report
        cutoffs, svalues, qvalues, top_target, top_decoys = plot_data
        for (name, values) in [("cutoffs", cutoffs), ("svalues", svalues), ("qvalues", qvalues),
                               ("d_scores_top_target_peaks", top_target),
                               ("d_scores_top_decoy_peaks", top_decoys)]:
            # Separate name so the input file path is not overwritten.
            out_path = pathes[name]
            with open(out_path, "w") as fp:
                fp.write(" ".join("%e" % v for v in values))
            print "WRITTEN: ", out_path
    scored_table.to_csv(pathes.scored_table, sep=delim_out, index=False)
    print "WRITTEN: ", pathes.scored_table

    if not apply_existing_scorer:
        bin_data = zlib.compress(cPickle.dumps(needed_to_persist, protocol=2))
        with open(pickled_scorer_path, "wb") as fp:
            fp.write(bin_data)
        print "WRITTEN: ", pickled_scorer_path

    if CONFIG.get("export.mayu", True):
        export_mayu(pathes.mayu_cutoff, pathes.mayu_fasta, pathes.mayu_csv, scored_table, final_stat)
        print "WRITTEN: ", pathes.mayu_cutoff
        print "WRITTEN: ", pathes.mayu_fasta
        print "WRITTEN: ", pathes.mayu_csv
    print

    # Split the wall time into minutes, remaining seconds and milliseconds so
    # that the seconds printed next to the minutes never exceed 59.
    whole_seconds = int(needed)
    msecs = int(1000 * (needed - whole_seconds))
    minutes, seconds = divmod(whole_seconds, 60)

    print "NEEDED",
    if minutes:
        print minutes, "minutes and",

    print "%d seconds and %d msecs wall time" % (seconds, msecs)
    print