def main():
    """Command-line entry point.

    Registers the ``-e/--query`` option on dpark's ``optParser``, parses the
    command line and calls ``load_history()``.  When a query was supplied it
    is passed to ``execute()`` and the process exits with status 0; otherwise
    ``shell()`` is started.
    """
    from dpark import DparkContext, optParser

    # Disabled default, kept for reference:
    # optParser.set_default('master', 'mesos')
    optParser.add_option(
        '-e', '--query',
        type='string',
        default='',
        help='execute the SQL qeury then exit'
    )
    parsed, _positional = optParser.parse_args()

    load_history()

    sql = parsed.query
    if sql:
        # One-shot mode: run the statement and terminate.
        execute(sql)
        sys.exit(0)

    # No query given: fall through to shell().
    shell()
# -*- coding: utf-8 -*- import uuid import inspect from dpark import DparkContext, optParser from sql.parser import parse from collections import OrderedDict from models import Model optParser.add_option("-s") # "option used for py.test" optParser.add_option("-x") class Table(object): dialect = "excel" columns = () def __init__(self, name, paths=None, columns=None, query=None): self.name = name self.columns = OrderedDict(columns or self.__class__.columns) self.paths = paths or [] self.query = query def index(self, field): return self.columns.keys().index(field) def rdd(self, dpark=None): if self.query: return self.query.rdd
# NOTE(review): this excerpt starts inside a function whose header (and any
# enclosing loop / with-statement) is not visible.  The source was
# whitespace-collapsed, so the indentation depth of the statements up to the
# main guard is nominal and must be confirmed against the real file.
        f.write('\n'.join(comb_ad_context))
    dtest_sample_temp = xgb.DMatrix(temp_dir + '.libsvm')
    # Stringify each prediction via dp.parallelize(...).map(...).collect() and
    # join them with spaces into a single entry of ypred.
    ypred.append(' '.join(dp.parallelize(bst.predict(dtest_sample_temp)).map(lambda x:str(x)).collect()))
    with open(mix_dir + '.txt', 'w') as f:
        f.write('\n'.join(ypred))
    # '%' binds tighter than '+': temp_dir is formatted first and '.libsvm' is
    # appended afterwards, giving 'rm -rf <temp_dir>.libsvm'.
    check_call('rm -rf %s' % temp_dir + '.libsvm', shell=True)
    ## plot histogram
    #plt.hist(ypred_sample_temp,10)
    #plot_path = '/home2/songsiyu/data/models_%s/%s' % (options.feature_domain, model_date_str) + '/plots'
    #if not os.path.exists(plot_path):
    #    check_call('mkdir %s' % plot_path, shell=True)
    #plt.savefig(plot_path + '/ad%d' % curr_ad)


if __name__ == '__main__':
    # Script entry: parse options, build a DparkContext and run _mix().
    optParser.add_option('--model_version', dest='model_version')  # falls back to yesterday's date when omitted
    optParser.add_option('--feature_domain', dest='feature_domain')
    options, _ = optParser.parse_args()
    dp = DparkContext()
    if not options.model_version:
        # No explicit version: use yesterday, formatted as YYYYMMDD.
        model_date_str = (datetime.today() - timedelta(1)).strftime('%Y%m%d')
    else:
        model_date_str = options.model_version
    # Run the mixing step for the requested feature domain.
    logger.info('mixing %s' % options.feature_domain)
    _mix(dp, options.feature_domain, model_date_str)
    logger.info('mix.py done!')
# NOTE(review): this excerpt starts inside a method whose header, loop header
# and `try:` line are not visible.  The source was whitespace-collapsed, so the
# depth of the first `if` relative to the statements before it is
# reconstructed, not known — confirm against the real file.
            if s.lower().startswith(c):
                # The buffered text begins with the command name c: call the
                # method on self named c with the rest of the text as argument.
                arg = s[len(c):].strip()
                getattr(self,c)(arg)
                self.sql = ''
                continue
            # Do not run anything until the buffered text ends with ';'.
            if not self.sql.rstrip().endswith(';'):
                continue
            self.run_sql()
        except Exception, e:
            # Python 2 except syntax; `e` is unused.  Print the traceback and
            # clear the buffered statement.
            import traceback; traceback.print_exc()
            self.sql = ''


if __name__ == '__main__':
    # Script entry: -e runs the given text, -s runs the contents of a file,
    # and with neither option console.run() is called.
    from dpark import optParser
    optParser.set_default('master', 'flet6')
    optParser.add_option('-e', '--query', type='string', default='',
            help='execute the SQL qeury then exit')
    optParser.add_option('-s', '--script', type='string', default='',
            help='execute the SQL script file then exit')
    options, args = optParser.parse_args()
    console = Console()
    if options.query:
        # -e takes precedence over -s when both are given.
        console.run_script(options.query)
    elif options.script:
        with open(options.script) as f:
            console.run_script(f.read())
    else:
        console.run()
# NOTE(review): this excerpt starts inside a function whose header is not
# visible (presumably norm(), which the main guard below calls — confirm).
# `f`, `sp`, `outfile`, `minlst` and `maxlst` are defined above the excerpt;
# the base indentation depth is nominal.
    # Rewind the input so it can be iterated from the start.
    f.seek(0)
    # NOTE(review): f2 is not closed anywhere in the visible code.
    f2 = open(outfile, 'w')
    for line in f:
        l = line.strip('\n').split(sp)
        # The first field is copied through unchanged.
        f2.write(l[0])
        f2.write(sp)
        l = l[1:]
        for i in xrange(len(minlst)):
            if maxlst[i] == minlst[i]:
                # Zero range: write the max value itself instead of dividing by zero.
                stf = maxlst[i]
                f2.write(str(stf))
                if i != len(minlst) - 1:
                    f2.write(sp)
            else:
                # Min-max scaling: (x - min) / (max - min).
                stf = (float(l[i]) - minlst[i]) / (maxlst[i] - minlst[i])
                f2.write(str(stf))
                # The separator goes between fields only, not after the last one.
                if i != len(minlst) - 1:
                    f2.write(sp)
        f2.write('\n')


if __name__ == '__main__':
    # Script entry: read the --in / --out paths and pass them to norm().
    from dpark import optParser
    optParser.add_option('--in', dest='inputf')
    optParser.add_option('--out', dest='outputf')
    options, args = optParser.parse_args()
    inf = options.inputf
    outf = options.outputf
    norm(inf, outf)
import os, sys
from subprocess import check_call, call
from datetime import timedelta, datetime
from dpark import DparkContext, optParser
import random
import math
from config import PLAN_SETTINGS, logger
import ctr_rdds
from common.util import GeneralMap, is_spider
from db_tools import DBTools

# Add the sibling 'user_profile' directory (next to this file) to the import path.
sys.path.append('%s/user_profile' % os.path.dirname(os.path.realpath(__file__)))

# Options are registered and parsed at module level, so this runs on import
# as well as when the file is executed as a script.
optParser.add_option("--date", dest="date")
optParser.add_option("--collection", dest="collection")
options, _ = optParser.parse_args()
dp = DparkContext()


def feature_gen(current_date, collection):
    """Broadcast ad lookup tables obtained from DBTools.

    ``collection`` must be ``'CPC'`` or ``'market'``; any other value raises a
    bare ``Exception``.

    NOTE(review): the function continues past the end of this excerpt;
    ``current_date`` is not used in the visible part.
    """
    db_tools = DBTools()
    # The ad-to-tags/features table depends on the collection.
    if collection == 'CPC':
        AD_TO_TAGS = dp.broadcast(db_tools.get_ad_to_tags())
    elif collection == 'market':
        AD_TO_TAGS = dp.broadcast(db_tools.get_market_ad_to_features())
    else:
        raise Exception
    AD_TO_ITEM = dp.broadcast(db_tools.get_ad_to_item())  ## extract from db
    AD_TO_ORDER = dp.broadcast(db_tools.get_ad_to_order())
    AD_TO_ACCOUNT = dp.broadcast(db_tools.get_ad_to_account())
    AD_TO_UNIT = dp.broadcast(db_tools.get_ad_to_units())