Esempio n. 1
0
def main():
    """Run a single SQL query given on the command line, or start the shell.

    Registers the ``-e/--query`` option on the ``optParser`` imported from
    dpark and parses the command line. ``load_history()`` is always called
    first. If a non-empty query was supplied it is passed to ``execute()``
    and the process exits with status 0; otherwise ``shell()`` is started.
    """
    from dpark import DparkContext, optParser
    #optParser.set_default('master', 'mesos')
    optParser.add_option('-e', '--query', type='string', default='',
            help='execute the SQL query then exit')

    # Positional arguments are not used here, so they are discarded.
    options, _ = optParser.parse_args()

    load_history()

    if options.query:
        # One-shot mode: run the query and stop without entering the shell.
        execute(options.query)
        sys.exit(0)

    shell()
Esempio n. 2
0
import time
from dpark import DparkContext, optParser
from dpark.file_manager import file_manager
dc = DparkContext()

optParser.set_usage("%prog [options] path")
options, args = optParser.parse_args()

# Exit with the usage message instead of an IndexError traceback when the
# required positional argument is missing.
if not args:
    optParser.error("missing required argument: path")

# First positional argument: the input passed to dc.textFile() in run().
path = args[0]


def run(split_size=1):
    """Time one filtered count over ``path`` and return the elapsed seconds.

    The data set is opened with ``dc.textFile(path)``, passed through
    ``mergeSplit(splitSize=split_size)``, filtered down to the records that
    contain the substring "yangxiufeng", and counted. The count itself is
    discarded; only the wall-clock duration (``time.time()`` delta) is
    returned.
    """
    started = time.time()
    records = dc.textFile(path)
    merged = records.mergeSplit(splitSize=split_size)
    matching = merged.filter(lambda record: "yangxiufeng" in record)
    matching.count()
    return time.time() - started


# The first run's timing is discarded; per the original note it is there for
# the file cache (presumably to warm it so the later timings are comparable).
run()  # file cache
print("{}s with locality".format(run()))
# Rebind fs_list without its first entry. The messages below label the
# following runs "without locality" -- presumably the dropped entry is the
# one that provides locality information; TODO confirm against
# dpark.file_manager.
file_manager.fs_list = file_manager.fs_list[1:]
# Same job with split_size=10, then again with the default split_size=1.
print("{}s merge & without locality".format(run(10)))
print("{}s without locality, ".format(run()))
Esempio n. 3
0
			f.write('\n'.join(comb_ad_context))
		dtest_sample_temp = xgb.DMatrix(temp_dir + '.libsvm')
		ypred.append(' '.join(dp.parallelize(bst.predict(dtest_sample_temp)).map(lambda x:str(x)).collect()))
	with open(mix_dir + '.txt', 'w') as f:
		f.write('\n'.join(ypred))
	check_call('rm -rf %s' % temp_dir + '.libsvm', shell=True)
	## plot histogram
	#plt.hist(ypred_sample_temp,10)
	#plot_path = '/home2/songsiyu/data/models_%s/%s' % (options.feature_domain, model_date_str) + '/plots'
	#if not os.path.exists(plot_path):
	#	check_call('mkdir %s' % plot_path, shell=True)
	#plt.savefig(plot_path + '/ad%d' % curr_ad)


if __name__ == '__main__':
    # Command-line options; --model_version falls back to yesterday's date
    # (see below) when it is omitted or empty.
    optParser.add_option('--model_version', dest='model_version')
    optParser.add_option('--feature_domain', dest='feature_domain')
    options, _ = optParser.parse_args()
    dp = DparkContext()

    # Use the requested model version, or yesterday formatted as YYYYMMDD.
    model_date_str = (
        options.model_version
        or (datetime.today() - timedelta(days=1)).strftime('%Y%m%d')
    )

    # Run the mixing step for the selected feature domain and model date.
    logger.info('mixing %s' % options.feature_domain)
    _mix(dp, options.feature_domain, model_date_str)

    logger.info('mix.py done!')
Esempio n. 4
0
File: wc.py Progetto: douban/dpark
from dpark import DparkContext, optParser

dc = DparkContext()
optParser.set_usage("%prog [options] infile outfile")
options, args = optParser.parse_args()

# Fail with a usage error rather than an IndexError traceback when either
# positional argument is missing.
if len(args) < 2:
    optParser.error("expected two arguments: infile outfile")

# Positional arguments: the input to read and the destination to write.
infile = args[0]
outfile = args[1]
print("from {} to {}".format(infile, outfile))


def fm(x):
    """Yield a ``(word, 1)`` pair for each whitespace-separated word in *x*.

    Leading and trailing whitespace is stripped first, so a blank or
    whitespace-only input yields nothing.
    """
    words = x.strip().split()
    for word in words:
        yield word, 1


# Word count: expand each record into (word, 1) pairs with fm, add up the
# values per word (reduceByKey is asked for numSplits=6), render every
# (word, total) pair as "word total", and save the result to outfile.
# overwrite=False is passed through to saveAsTextFile -- presumably it keeps
# existing output from being replaced; confirm against the dpark API.
(
    dc.textFile(infile)
    .flatMap(fm)
    .reduceByKey(lambda left, right: left + right, numSplits=6)
    .map(lambda pair: " ".join(map(str, pair)))
    .saveAsTextFile(outfile, overwrite=False)
)