def main():
    """Run a single query given on the command line, or start the shell.

    Registers the ``-e``/``--query`` option on dpark's ``optParser``, parses
    the command line and calls ``load_history()``.  If a non-empty query was
    supplied it is passed to ``execute()`` and the process exits with
    status 0; otherwise ``shell()`` is called.
    """
    # Imported inside the function, as the original code did.
    from dpark import optParser

    # Disabled default, kept for reference:
    # optParser.set_default('master', 'mesos')
    optParser.add_option(
        '-e', '--query', type='string', default='',
        help='execute the SQL query then exit')
    # Positional arguments are not used by this entry point.
    options, _ = optParser.parse_args()

    load_history()
    if options.query:
        execute(options.query)
        sys.exit(0)
    shell()
# Times a dpark job that counts the lines of a text file containing a fixed
# substring, under three configurations, and prints the elapsed seconds for
# each one.
import time

from dpark import DparkContext, optParser
from dpark.file_manager import file_manager

dc = DparkContext()

optParser.set_usage("%prog [options] path")
options, args = optParser.parse_args()
if not args:
    # A missing path used to surface as a bare IndexError on args[0];
    # report it through the parser so the usage string is shown instead.
    optParser.error("missing required argument: path")

path = args[0]


def run(split_size=1):
    """Return the wall-clock seconds taken by one counting job over ``path``.

    The job reads ``path`` with ``dc.textFile``, applies
    ``mergeSplit(splitSize=split_size)``, keeps the lines containing
    "yangxiufeng" and counts them.  The count itself is discarded; only the
    elapsed time is returned.
    """
    started = time.time()
    dc.textFile(path).mergeSplit(
        splitSize=split_size).filter(lambda x: "yangxiufeng" in x).count()
    return time.time() - started


run()  # file cache: untimed first pass (presumably warms the cache)
print("{}s with locality".format(run()))

# Drop the first entry of file_manager.fs_list.  The labels printed below
# call the following runs "without locality", so presumably the removed entry
# is what supplies locality information -- confirm against dpark.file_manager.
file_manager.fs_list = file_manager.fs_list[1:]
print("{}s merge & without locality".format(run(10)))
print("{}s without locality, ".format(run()))
f.write('\n'.join(comb_ad_context)) dtest_sample_temp = xgb.DMatrix(temp_dir + '.libsvm') ypred.append(' '.join(dp.parallelize(bst.predict(dtest_sample_temp)).map(lambda x:str(x)).collect())) with open(mix_dir + '.txt', 'w') as f: f.write('\n'.join(ypred)) check_call('rm -rf %s' % temp_dir + '.libsvm', shell=True) ## plot histogram #plt.hist(ypred_sample_temp,10) #plot_path = '/home2/songsiyu/data/models_%s/%s' % (options.feature_domain, model_date_str) + '/plots' #if not os.path.exists(plot_path): # check_call('mkdir %s' % plot_path, shell=True) #plt.savefig(plot_path + '/ad%d' % curr_ad) if __name__ == '__main__': optParser.add_option('--model_version', dest='model_version') ##input=yesterday optParser.add_option('--feature_domain', dest='feature_domain') options, _ = optParser.parse_args() dp = DparkContext() if not options.model_version: model_date_str = (datetime.today() - timedelta(1)).strftime('%Y%m%d') ##yestoday else: model_date_str = options.model_version # train logger.info('mixing %s' % options.feature_domain) _mix(dp, options.feature_domain, model_date_str) logger.info('mix.py done!')
# Word count with dpark: reads a text input, counts whitespace-separated
# words, and writes "word count" lines to the output path.
from dpark import DparkContext, optParser

dc = DparkContext()

optParser.set_usage("%prog [options] infile outfile")
options, args = optParser.parse_args()
if len(args) < 2:
    # Missing arguments used to surface as a bare IndexError on args[0] or
    # args[1]; report them through the parser so the usage string is shown.
    optParser.error("expected two arguments: infile outfile")

infile, outfile = args[0], args[1]
print("from {} to {}".format(infile, outfile))


def fm(x):
    """Yield a ``(word, 1)`` pair for each whitespace-separated word in *x*."""
    # split() with no separator already ignores leading/trailing whitespace,
    # so no separate strip() is needed.
    for w in x.split():
        yield (w, 1)


(dc.textFile(infile)
   .flatMap(fm)
   # Sum the per-word ones; the result is computed with numSplits=6.
   .reduceByKey(lambda x, y: x + y, numSplits=6)
   # Render each (word, count) pair as "word count".
   .map(lambda x: " ".join(map(str, x)))
   .saveAsTextFile(outfile, overwrite=False))