def _run_wc(self, orig_fn, launcher=hadoopy.launch_frozen):
    fn = 'out-%f-%s' % (time.time(), orig_fn)
    in_path = self.data_path + fn
    out_path = self.data_path + fn + '.out'
    print(os.path.abspath('.'))
    hadoopy.put(orig_fn, in_path)
    # We also do a few hdfs checks here
    self.assertEquals(len(hadoopy.ls(in_path)), 1)
    self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])
    self.assertTrue(hadoopy.exists(in_path))
    self.assertFalse(hadoopy.exists(out_path))
    self.assertFalse(hadoopy.isdir(in_path))
    self.assertFalse(hadoopy.isempty(in_path))
    # Don't let the file split, CDH3 has a bug and will try to split gz's
    launcher(in_path, out_path, 'wc.py',
             jobconfs=['mapred.min.split.size=100000000',
                       'mapreduce.task.userlog.limit.kb=1000'])
    if launcher == hadoopy.launch_frozen:
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    elif launcher == hadoopy.launch_local:
        self.assertFalse(hadoopy.isdir(out_path))
        self.assertFalse(hadoopy.isempty(out_path))
    else:
        raise ValueError('Launcher not recognized')
    wc = dict(hadoopy.readtb(out_path))
    self.assertEqual(wc['the'], 1664)
    self.assertEqual(wc['Alice'], 221)

def freeze_script(script_path, temp_path='_hadoopy_temp'):
    """Freezes a script, puts it on hdfs, and gives you the path

    'frozen_tar_path' can be given to launch_frozen and it will use that
    instead of making its own, this is useful for repeated calls.  If a
    file with the same md5 already exists in the temp_path, it is used
    instead of putting a new copy there to avoid the file transfer.  The
    files are put into a temporary file based on the timestamp first, then
    moved to a location that is only a function of their md5 to prevent
    partial files.

    Args:
        script_path: Path to a hadoopy script
        temp_path: HDFS temporary path (default is '_hadoopy_temp')

    Returns:
        {'cmds': commands_ran, 'frozen_tar_path': frozen_tar_path}
    """
    tmp_frozen_tar_path = temp_path + '/%f.tar' % time.time()
    freeze_fp = tempfile.NamedTemporaryFile(suffix='.tar')
    cmds = hadoopy._freeze.freeze_to_tar(os.path.abspath(script_path), freeze_fp.name)
    md5 = _md5_file(freeze_fp.name)
    frozen_tar_path = temp_path + '/%s.tar' % md5
    if hadoopy.exists(frozen_tar_path):
        return {'cmds': cmds, 'frozen_tar_path': frozen_tar_path}
    hadoopy.put(freeze_fp.name, tmp_frozen_tar_path)
    try:
        hadoopy.mv(tmp_frozen_tar_path, frozen_tar_path)
    except IOError, e:
        if hadoopy.exists(frozen_tar_path):  # Check again
            return {'cmds': cmds, 'frozen_tar_path': frozen_tar_path}
        raise e
    return {'cmds': cmds, 'frozen_tar_path': frozen_tar_path}

def __close_tmp_mv(fp_tweets, fp_wordcloud, hdfs_path=None, local_path=None):
    fp_tweets.close()
    fp_wordcloud.close()
    filename_tweets = fp_tweets.name
    filename_wordcloud = fp_wordcloud.name
    print "Will create new file " + filename_tweets[-24:-4] + ".csv"
    # Put to HDFS if specified to write to HDFS
    if hdfs_path:
        hadoopy.put(filename_tweets,
                    hdfs_path + Util.TWEETS + '/' + filename_tweets[-24:-4] + '.csv')
        hadoopy.put(filename_wordcloud,
                    hdfs_path + Util.WORDCLOUD + '/' + filename_wordcloud[-24:-4] + '.csv')
    # Put to local path if specified to write to local file system
    if local_path:
        shutil.copy(filename_tweets,
                    local_path + Util.TWEETS + '/' + filename_tweets[-24:-4] + '.csv')
        shutil.copy(filename_wordcloud,
                    local_path + Util.WORDCLOUD + '/' + filename_wordcloud[-24:-4] + '.csv')
    os.remove(filename_tweets)
    os.remove(filename_wordcloud)

def _read_files(fns, prev_hashes, hdfs_output, output_format, max_record_size):
    """
    Args:
        fns: Iterator of file names
        prev_hashes: Set of hashes (they will be skipped), this is used to
            make the data unique

    Yields:
        Tuple of (data_hash, data) where data_hash is a sha1 hash
    """
    for fn in fns:
        sha1_hash = _sha1(fn)
        if sha1_hash not in prev_hashes:
            prev_hashes.add(sha1_hash)
            if output_format == 'record' and max_record_size is not None and max_record_size < os.stat(fn)[6]:
                # Put the file into the remote location
                hdfs_path = hadoopy.abspath('%s/_blobs/%s_%s' % (hdfs_output, sha1_hash, os.path.basename(fn)))
                data = ''
                hadoopy.put(fn, hdfs_path)
            else:
                hdfs_path = ''
                data = open(fn).read()
            if output_format == 'kv':
                yield sha1_hash, data
            elif output_format == 'record':
                out = {'sha1': sha1_hash, 'full_path': fn,
                       'extension': os.path.splitext(fn)[1][1:]}
                if data:
                    out['data'] = data
                if hdfs_path:
                    out['hdfs_path'] = hdfs_path
                yield sha1_hash, out

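# Hypothetical driver for _read_files (not part of the original snippet): it streams
# the (sha1, record) pairs into a SequenceFile on HDFS with hadoopy.writetb.  The
# file glob, the HDFS prefix, and the 10 MB record-size cutoff are illustrative
# assumptions, not values taken from the source.
import glob

import hadoopy


def _put_image_records():
    fns = glob.glob('images/*.jpg')          # assumed local input files
    hdfs_output = 'example/image_blobs'      # assumed HDFS output prefix
    kvs = _read_files(iter(fns), set(), hdfs_output, 'record',
                      max_record_size=10 * 1024 ** 2)
    hadoopy.writetb(hdfs_output + '/input.tb', kvs)
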
def __cleanup_tmp_dir(tmp_tweet_dir, tmp_wordcloud_dir, hdfs_path=None, local_path=None):
    tweet_files = os.listdir(tmp_tweet_dir)
    for tf in tweet_files:
        if hdfs_path:
            hadoopy.put(tmp_tweet_dir + '/' + tf,
                        hdfs_path + Util.TWEETS + '/' + tf[-24:-4] + '.csv')
        if local_path:
            shutil.copy(tmp_tweet_dir + '/' + tf,
                        local_path + Util.TWEETS + '/' + tf[-24:-4] + '.csv')
        os.remove(tmp_tweet_dir + '/' + tf)
    wordcloud_files = os.listdir(tmp_wordcloud_dir)
    for wf in wordcloud_files:
        if hdfs_path:
            hadoopy.put(tmp_wordcloud_dir + '/' + wf,
                        hdfs_path + Util.WORDCLOUD + '/' + wf[-24:-4] + '.csv')
        if local_path:
            shutil.copy(tmp_wordcloud_dir + '/' + wf,
                        local_path + Util.WORDCLOUD + '/' + wf[-24:-4] + '.csv')
        os.remove(tmp_wordcloud_dir + '/' + wf)

def test_local(self):
    out_path = '%s/local_test/%f' % (self.data_path, time.time())
    hadoopy.mkdir(out_path)
    hadoopy.put('wc-input-alice.tb', out_path + '/wc-input-alice.tb')
    hadoopy.launch_local(out_path + '/wc-input-alice.tb', out_path + '/out_list_cmdenvs', 'local.py',
                         max_input=1000,
                         cmdenvs=['TEST_ENV=10'],
                         files=['wc-input-alice.tb'])  # Just bring this along to test the files
    hadoopy.launch_local(out_path + '/wc-input-alice.tb', out_path + '/out', 'local.py',
                         max_input=1000,
                         cmdenvs={'TEST_ENV': '10'},
                         files=['wc-input-alice.tb'])  # Just bring this along to test the files
    hadoopy.launch_local(((1000 * 'a', 10000000 * 'b') for x in range(100)), None, 'local.py',
                         max_input=10000,
                         cmdenvs=['TEST_ENV=10'],
                         files=['wc-input-alice.tb'])

def test_local(self):
    out_path = '%s/local_test/%f' % (self.data_path, time.time())
    hadoopy.put('wc-input-alice.tb', out_path + '/wc-input-alice.tb')
    hadoopy.launch_local(out_path + '/wc-input-alice.tb', out_path + '/out', 'local.py',
                         max_input=1000,
                         cmdenvs=['TEST_ENV=10'],
                         files=['wc-input-alice.tb'])  # Just bring this along to test the files
    hadoopy.launch_local(((1000 * 'a', 10000000 * 'b') for x in range(100)), None, 'local.py',
                         max_input=10000,
                         cmdenvs=['TEST_ENV=10'],
                         files=['wc-input-alice.tb'])

def _run_face(self, fn, out_path, **kw):
    in_path = self.data_path + fn
    hdfs_out_path = '%sout-%s-%f' % (self.data_path, fn, time.time())
    if not hadoopy.exists(in_path):
        hadoopy.put(fn, in_path)
    hadoopy.launch_frozen(in_path, hdfs_out_path, 'face_finder.py',
                          files=['haarcascade_frontalface_default.xml'], **kw)
    for num, ((image_name, box), image_data) in enumerate(hadoopy.readtb(hdfs_out_path)):
        with open(out_path + 'img%.8d.png' % num, 'w') as fp:
            fp.write(image_data)

def _run_face(self, fn, **kw):
    in_path = self.data_path + fn
    out_path = "%sout-%s-%f" % (self.data_path, fn, time.time())
    if not hadoopy.exists(in_path):
        hadoopy.put(fn, in_path)
    hadoopy.launch_frozen(in_path, out_path, "face_finder.py",
                          files=["haarcascade_frontalface_default.xml"], **kw)
    for num, (image_name, (image_data, faces)) in enumerate(hadoopy.readtb(out_path)):
        with open(self.out_path + "img%.8d.jpg" % num, "w") as fp:
            fp.write(image_data)

def _run_wc(self, orig_fn, script_name='wc.py', launcher=hadoopy.launch_frozen, **kw):
    fn = 'out-%f-%s' % (time.time(), orig_fn)
    in_path = self.data_path + fn
    out_path = self.data_path + fn + '.out'
    print(os.path.abspath('.'))
    if not hadoopy.exists(in_path):
        hadoopy.put(orig_fn, in_path)
    # We also do a few hdfs checks here
    self.assertEquals(len(hadoopy.ls(in_path)), 1)
    #self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])  # This is no longer true in CDH4
    self.assertTrue(hadoopy.exists(in_path))
    self.assertFalse(hadoopy.exists(out_path))
    self.assertFalse(hadoopy.isdir(in_path))
    self.assertFalse(hadoopy.isempty(in_path))
    # Don't let the file split, CDH3 has a bug and will try to split gz's
    if not isinstance(launcher, str):
        launcher(in_path, out_path + '_list_jobconfs', script_name,
                 jobconfs=['mapred.min.split.size=100000000',
                           'mapreduce.task.userlog.limit.kb=1000'], **kw)
        launcher(in_path, out_path, script_name,
                 jobconfs={'mapred.min.split.size': '100000000',
                           'mapreduce.task.userlog.limit.kb': '1000'}, **kw)
    if launcher == hadoopy.launch_frozen:
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    elif launcher == hadoopy.launch_local:
        self.assertFalse(hadoopy.isdir(out_path))
        self.assertFalse(hadoopy.isempty(out_path))
    elif launcher == 'launch_frozen_cmd':
        cmd = 'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"' % (script_name, in_path, out_path)
        print(cmd)
        subprocess.call(cmd.split())
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    else:
        raise ValueError('Launcher not recognized')
    wc = dict(hadoopy.readtb(out_path))
    self.assertEqual(wc['the'], 1664)
    self.assertEqual(wc['Alice'], 221)

def _run_hdfs(self, orig_fn):
    fn = "%f-%s" % (time.time(), orig_fn)
    file_path = "%s/%s" % (self.data_path, fn)
    hadoopy.put(orig_fn, file_path)
    cat_output = [_ for _ in hadoopy.readtb(file_path)]
    line = (331, "Title: Alice's Adventures in Wonderland")
    self.assertTrue(line in cat_output)
    ls_output = hadoopy.ls(self.data_path)
    self.assertTrue([x for x in ls_output if x.rsplit("/", 1)[-1] == fn])
    ls_output = hadoopy.ls(file_path)
    self.assertTrue(ls_output[0].rsplit("/", 1)[-1] == fn)

def _run_haystack(fn, script_name):
    cur_time = time.time()
    hdfs_base_path = "hadoopy-test-data/%f/" % cur_time
    print("Storing HDFS temp files and output in [%s]" % hdfs_base_path)
    in_path = hdfs_base_path + os.path.basename(fn)
    out_path = hdfs_base_path + "out-" + os.path.basename(fn)
    hadoopy.put(fn, in_path)
    print("Launching job [%s]" % script_name)
    hadoopy.launch_frozen(in_path, out_path, script_name, files=[data_path + "target.jpg"])
    print("Storing local output in [%s]" % local_out)
    for num, (image_name, image_data) in enumerate(hadoopy.readtb(out_path)):
        open("%s%s-img%.8d-%s.jpg" % (local_out, script_name, num, image_name), "w").write(image_data)

def _run_face(self, fn, out_path, **kw):
    bfn = os.path.basename(fn)
    in_path = self.data_path + bfn
    hdfs_out_path = '%sout-%s-%f' % (self.data_path, bfn, time.time())
    if not hadoopy.exists(in_path):
        hadoopy.put(fn, in_path)
    hadoopy.launch_frozen(in_path, hdfs_out_path, 'face_finder.py',
                          files=['haarcascade_frontalface_default.xml'], **kw)
    for num, ((image_name, box), image_data) in enumerate(hadoopy.readtb(hdfs_out_path)):
        with open(out_path + 'img%.8d.png' % num, 'w') as fp:
            fp.write(image_data)

def freeze_script(script_path, cache=True, temp_path='_hadoopy_temp'):
    """Freezes a script, puts it on hdfs, and gives you the path

    'frozen_tar_path' can be given to launch_frozen and it will use that
    instead of making its own, this is useful for repeated calls.  If a
    file with the same md5 already exists in the temp_path, it is used
    instead of putting a new copy there to avoid the file transfer.  The
    files are put into a temporary file based on the timestamp first, then
    moved to a location that is only a function of their md5 to prevent
    partial files.

    Args:
        script_path: Path to a hadoopy script
        cache: If True (default) then use previously frozen scripts.  Cache is
            stored in memory (not persistent).
        temp_path: HDFS temporary path (default is '_hadoopy_temp')

    Returns:
        {'cmds': commands_ran, 'frozen_tar_path': frozen_tar_path}

    Raises:
        ValueError: Script cannot be found
    """
    script_abspath = os.path.abspath(script_path)
    if not os.path.exists(script_abspath):
        raise ValueError('Script [%s] does not exist.' % script_abspath)
    try:
        if not cache:
            raise KeyError  # NOTE(brandyn): Don't use cache item
        cmds, frozen_tar_path = FREEZE_CACHE[script_abspath]
    except KeyError:
        tmp_frozen_tar_path = temp_path + '/%f.tar' % time.time()
        freeze_fp = tempfile.NamedTemporaryFile(suffix='.tar')
        cmds = hadoopy._freeze.freeze_to_tar(os.path.abspath(script_path), freeze_fp.name)
        md5 = _md5_file(freeze_fp.name)
        frozen_tar_path = temp_path + '/%s.tar' % md5
        if not hadoopy.exists(frozen_tar_path):
            if not hadoopy.exists(temp_path):  # CDH4 Fix
                hadoopy.mkdir(temp_path)
            hadoopy.put(freeze_fp.name, tmp_frozen_tar_path)
            try:
                hadoopy.mv(tmp_frozen_tar_path, frozen_tar_path)
            except IOError:
                if not hadoopy.exists(frozen_tar_path):  # Check again
                    raise
    FREEZE_CACHE[script_abspath] = cmds, frozen_tar_path
    return {'cmds': cmds, 'frozen_tar_path': frozen_tar_path}

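# Minimal usage sketch (an assumption, not from the original source): freeze a script
# once and reuse the frozen tar across several launches, as the docstring above
# suggests.  This relies on launch_frozen accepting a frozen_tar_path keyword; the
# HDFS paths and the wc.py script name are illustrative.
import hadoopy

frozen = freeze_script('wc.py')
for in_path, out_path in [('example/in-a', 'example/out-a'),
                          ('example/in-b', 'example/out-b')]:
    hadoopy.launch_frozen(in_path, out_path, 'wc.py',
                          frozen_tar_path=frozen['frozen_tar_path'])
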
def _run_wc(self, orig_fn, script_name="wc.py", launcher=hadoopy.launch_frozen, **kw):
    fn = "out-%f-%s" % (time.time(), orig_fn)
    in_path = self.data_path + fn
    out_path = self.data_path + fn + ".out"
    print(os.path.abspath("."))
    if not hadoopy.exists(in_path):
        hadoopy.put(orig_fn, in_path)
    # We also do a few hdfs checks here
    self.assertEquals(len(hadoopy.ls(in_path)), 1)
    self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])
    self.assertTrue(hadoopy.exists(in_path))
    self.assertFalse(hadoopy.exists(out_path))
    self.assertFalse(hadoopy.isdir(in_path))
    self.assertFalse(hadoopy.isempty(in_path))
    # Don't let the file split, CDH3 has a bug and will try to split gz's
    if not isinstance(launcher, str):
        launcher(in_path, out_path, script_name,
                 jobconfs=["mapred.min.split.size=100000000",
                           "mapreduce.task.userlog.limit.kb=1000"], **kw)
    if launcher == hadoopy.launch_frozen:
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    elif launcher == hadoopy.launch_local:
        self.assertFalse(hadoopy.isdir(out_path))
        self.assertFalse(hadoopy.isempty(out_path))
    elif launcher == "launch_frozen_cmd":
        cmd = ('python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"'
               % (script_name, in_path, out_path))
        print(cmd)
        subprocess.call(cmd.split())
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    else:
        raise ValueError("Launcher not recognized")
    wc = dict(hadoopy.readtb(out_path))
    self.assertEqual(wc["the"], 1664)
    self.assertEqual(wc["Alice"], 221)

def report_clusters_faces_videos(predict_start_time, video_start_time):
    """
    """
    if SKIP_OVERRIDE and OVERRIDE_REPORT_START_TIME:
        return OVERRIDE_REPORT_START_TIME
    root = make_drive_root(predict_start_time, 'predict')
    start_time = OVERRIDE_REPORT_START_TIME if OVERRIDE_REPORT_START_TIME else '%f' % time.time()
    video_root = make_drive_root(video_start_time, 'video')
    out_root = make_drive_root(start_time, 'report')
    local = make_local_root(start_time)
    clusters = ['indoors', 'nonphotos', 'outdoors', 'objects', 'pr0n']
    clusters += ['faces']
    # Process all the thumbnails in parallel
    thumb_input = [root + '/cluster/' + c + '/partition' for c in clusters]
    picarus.report.make_thumbnails(thumb_input, out_root + '/report/thumb', 100, 'cluster')
    if video_root is not None:
        picarus.report.make_thumbnails(video_root + '/video_keyframe/allframes',
                                       out_root + '/report/vidthumb', 100, 'frame')
    # Prepare json report
    report = {}
    for c in clusters:
        make_faces = 'faces' in c
        r = picarus.report.report_clusters(root + '/cluster/' + c, c, make_faces)
        report.update(r)
    # Copy all the thumbnails locally
    picarus.report.report_thumbnails(out_root + '/report/thumb', local + '/report/t/')
    if video_root is not None:
        r = picarus.report.report_video_keyframe(video_root + '/video_keyframe/keyframe')
        report.update(r)
        picarus.report.report_thumbnails(out_root + '/report/vidthumb', local + '/report/t/')
    with open(local + '/report/sample_report.js', 'w') as f:
        f.write('var report = ')
        f.write(json.dumps(report))
    shutil.copy(picarus.report.__path__[0] + '/data/static_sample_report.html', local + '/report')
    hadoopy.put(local + '/report', out_root + '/report/')
    print('Report output ------------------> [%s/%s]' % (out_root, '/report'))
    return start_time

import hadoopy
import time
import os

# Setup paths
here = os.path.abspath(os.path.dirname(__file__))
data_path = "hadoopy-test-data/%f/" % time.time()
input_path = data_path + "wc-input-alice.tb"
output_path = data_path + "wc-output-alice"

# Put the data from a local path onto HDFS
hadoopy.put(os.path.join(here, "..", "..", "data", "wc-input-alice.tb"), input_path)

# Launch the job.  The wc.py script will be "frozen" (all dependencies are discovered
# using Pyinstaller).  The cluster doesn't need Hadoopy, Python, or any other libraries
# for this to work (as long as Pyinstaller can find everything, if not there are simple
# things that you can do to fix it).
hadoopy.launch_frozen(input_path, output_path, "wc.py")

# Analyze the output.  The output is an iterator of (word, count) where word is a
# string and count is an integer.
word_counts = dict(hadoopy.readtb(output_path))
for probe_word, expected_count in [("the", 1664), ("Alice", 221), ("tree", 3)]:
    print("word_counts[%s] = %d" % (probe_word, word_counts[probe_word]))
    assert expected_count == word_counts[probe_word]

from google_ngram_downloader import readline_google_store
import wget
import hadoopy
import os

for i in range(3, 6):
    gene = readline_google_store(ngram_len=i, lang='eng')
    while True:
        try:
            fname, url, records = next(gene)
            print fname
            if hadoopy.exists('/google-ngram/' + str(i) + '/' + fname):
                continue
            else:
                wget.download(url)
                hadoopy.put(fname, '/google-ngram/' + str(i) + '/' + fname)
                os.remove(fname)
        except StopIteration:
            print "END"
            break

def main():
    local_path = './BigData/dummy_data'
    for file in read_local_dir(local_path):
        hadoopy.put(file, 'data')
        print "The file %s has been put into hdfs" % (file,)

def main(opt):
    # set twitter auth credentials
    auth = tweepy.OAuthHandler(opt.consumer_key, opt.consumer_secret)
    auth.set_access_token(opt.access_token_key, opt.access_token_secret)
    # get api instance
    if opt.https_proxy:
        api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True,
                         proxy=opt.https_proxy)
    else:
        api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
    # write_to_file, roll_size or roll_count
    write_to_file = True if opt.hdfs_path or opt.local_path else False
    # instantiate the analyzer
    analyzer = an.Analyzer()
    # override opt.roll_size for testing
    # opt.roll_size = 20480
    if write_to_file:
        tmp_tweet_dir = Util.TMP_DIR + '/' + Util.TWEETS
        tmp_wordcloud_dir = Util.TMP_DIR + '/' + Util.WORDCLOUD
        if not os.path.exists(tmp_tweet_dir):
            os.makedirs(tmp_tweet_dir)
        if not os.path.exists(tmp_wordcloud_dir):
            os.makedirs(tmp_wordcloud_dir)
        # create new hdfs paths
        if opt.hdfs_path:
            hadoopy.put(tmp_tweet_dir, opt.hdfs_path)
            hadoopy.put(tmp_wordcloud_dir, opt.hdfs_path)
        # create new local paths
        if opt.local_path:
            try:
                os.makedirs(opt.local_path + Util.TWEETS)
                os.makedirs(opt.local_path + Util.WORDCLOUD)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise e
                pass
    # join our keywords as a single query
    # query = ' OR '.join(opt.keywords)
    queries = [' OR '.join(opt.keywords[i:i + 10]) for i in xrange(0, len(opt.keywords), 10)]
    for query in queries:
        file_closed = True
        # Cursor params
        # since_id=tweet_id, max_id=tweet_id, lang="en"
        # include_entities=True, rpp=100, count=1000
        if opt.since_tweet_id:
            cursor = tweepy.Cursor(api.search, q=query, result_type="recent",
                                   since_id=opt.since_tweet_id, rpp=100)
        else:
            cursor = tweepy.Cursor(api.search, q=query, result_type="recent", rpp=100)
        try:
            for tweet in cursor.items():
                tweet_obj, wordarray = Tweet.tweet_wordcloud_from_json(tweet._json, analyzer)
                wordcloud_list = Wordcloud.list_from_array(tweet_obj.tweet_id, wordarray)
                Util.vprint(opt.verbose, "Tweet_id: " + str(tweet_obj.tweet_id))
                # print "Tweet_id: " + str(tweet_obj.tweet_id)
                # determine if we are flagged to write to file
                if write_to_file:
                    # start of loop
                    # fp will either return an existing .tmp file or open a new one
                    # bytes_written will be automatically set to zero
                    # if new file created
                    if file_closed:
                        now = datetime.utcnow()
                        fp_tweets = open(tmp_tweet_dir + '/' +
                                         now.strftime('%Y-%m-%dT%H.%M.%SZ') + '.tmp', 'a')
                        fp_wordcloud = open(tmp_wordcloud_dir + '/' +
                                            now.strftime('%Y-%m-%dT%H.%M.%SZ') + '.tmp', 'a')
                        bytes_written = 0
                        print "Create new temporary file to write to: " + fp_tweets.name[-24:]
                    bytes_written += Tweet.write_to_file(tweet_obj, fp_tweets)
                    Wordcloud.write_to_file(wordcloud_list, fp_wordcloud)
                    file_closed = False
                    Util.vprint(opt.verbose, "bytes_written: " + str(bytes_written))
                    # close the file if reached limit
                    # rename (remove .tmp) and move to specified local / HDFS path
                    if (bytes_written >= opt.roll_size):
                        __close_tmp_mv(fp_tweets, fp_wordcloud, opt.hdfs_path, opt.local_path)
                        file_closed = True
            print "Finished searching tweets for queries: "
            print query
        except tweepy.error.TweepError as te:
            print "Tweepy throws error"
            print te.reason
            print te.response
        except (KeyboardInterrupt, SystemExit):
            if write_to_file and not file_closed:
                print "Closing temporary files"
                fp_tweets.close()
                fp_wordcloud.close()
                __cleanup_tmp_dir(tmp_tweet_dir, tmp_wordcloud_dir, opt.hdfs_path, opt.local_path)
        # post loop
        # close the file, just in case it is not closed within the loop
        finally:
            if write_to_file and not file_closed:
                print "Closing temporary files"
                fp_tweets.close()
                fp_wordcloud.close()
                __cleanup_tmp_dir(tmp_tweet_dir, tmp_wordcloud_dir, opt.hdfs_path, opt.local_path)
                file_closed = True
    if write_to_file and not file_closed:
        print "Closing temporary files"
        fp_tweets.close()
        fp_wordcloud.close()
        __cleanup_tmp_dir(tmp_tweet_dir, tmp_wordcloud_dir, opt.hdfs_path, opt.local_path)
    print "Ending tweet searching"

def write(localpath, hdfspath):
    try:
        hadoopy.put(localpath, hdfspath)
        return True  # put succeeded
    except Exception, e:
        logging.exception(e)
        return False

def on_data(self, data):
    tweet_obj, wordarray = Tweet.tweet_wordcloud_from_json(json.loads(data), self.analyzer)
    wordcloud_list = Wordcloud.list_from_array(tweet_obj.tweet_id, wordarray)
    print tweet_obj.to_tsv_str()
    if self.write_to_file:
        print "Writing to file"
        if self.file_closed:
            print "Open new temporary files to write to"
            now = datetime.utcnow()
            self.fp_tweets = open(Util.TMP_DIR + '/' + Util.TWEETS + '/' +
                                  now.strftime('%Y-%m-%dT%H.%M.%SZ') + '.tmp', 'a')
            self.fp_wordcloud = open(Util.TMP_DIR + '/' + Util.WORDCLOUD + '/' +
                                     now.strftime('%Y-%m-%dT%H.%M.%SZ') + '.tmp', 'a')
            self.bytes_written = 0
        self.bytes_written += Tweet.write_to_file(tweet_obj, self.fp_tweets)
        Wordcloud.write_to_file(wordcloud_list, self.fp_wordcloud)
        self.file_closed = False
        print "bytes_written: " + str(self.bytes_written)
        print "roll_size: " + str(self.roll_size)
        if (self.bytes_written >= self.roll_size):
            self.fp_tweets.close()
            self.fp_wordcloud.close()
            filename_tweets = self.fp_tweets.name
            filename_wordcloud = self.fp_wordcloud.name
            print "Moving temporary files " + filename_tweets[-24:-4] + ".csv"
            print "filename_tweets: " + filename_tweets
            print "filename_wordcloud: " + filename_wordcloud
            if self.hdfs_path:
                hadoopy.put(filename_tweets,
                            self.hdfs_path + Util.TWEETS + '/' + filename_tweets[-24:-4] + '.csv')
                hadoopy.put(filename_wordcloud,
                            self.hdfs_path + Util.WORDCLOUD + '/' + filename_wordcloud[-24:-4] + '.csv')
            if self.local_path:
                shutil.copy(filename_tweets,
                            self.local_path + '/' + Util.TWEETS + '/' + filename_tweets[-24:-4] + '.csv')
                shutil.copy(filename_wordcloud,
                            self.local_path + '/' + Util.WORDCLOUD + '/' + filename_wordcloud[-24:-4] + '.csv')
            os.remove(filename_tweets)
            os.remove(filename_wordcloud)
            self.file_closed = True
    return True

def main():
    local_path = '../data/'
    #hadoopy.writetb('/tmp/data/', read_local_dir(local_path))
    for file in read_local_dir(local_path):
        hadoopy.put(file, '/tmp/data')
        print "The file %s has been put into hdfs" % (file)

ts = time.time()
ts = datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d%H%M%S')
fw = open("./logs" + "/" + "data" + ts, "w")
dataList = []
for i in xrange(arg2):
    string = (randomDate("1/1/2010-1:30:00", "1/1/2014-4:50:60", random.random()) + " " +
              pub_list[int(random.random() * 10) % len(pub_list)] + " " +
              advertiser_list[int(random.random() * 10) % len(advertiser_list)] + " " +
              website_list[int(random.random() * 10) % len(website_list)] + " " +
              geo_list[int(random.random() * 10) % len(geo_list)] + " " +
              str(round(random.random(), 4)) + " " + str(int(random.random() * 10000)))
    if (i + 1) % 1000 == 0:
        fw.close()
        ts = time.time()
        ts = datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d%H%M%S')
        fw = open("./logs/" + "data" + ts + str(int(random.random() * 10000)), "w")
    print >> fw, string
    #producer.send_messages(arg1, string)  # kafka TODO
fw.close()
hadoopy.put("./logs/*", hdfs_path)  # hadoop TODO
#kafka.close()  # kafka TODO
for the_file in os.listdir("./logs"):
    file_path = os.path.join("./logs", the_file)
    try:
        if os.path.isfile(file_path):
            os.unlink(file_path)
    except Exception, e:
        print e

import hadoopy
import time

# Setup paths
data_path = "hadoopy-test-data/%f/" % time.time()
input_path = data_path + "wc-input-alice.tb"
output_path = data_path + "wc-output-alice"

# Put the data from a local path onto HDFS
hadoopy.put("../../data/wc-input-alice.tb", input_path)

# Launch the job.  The wc.py script will be "frozen" (all dependencies are discovered
# using Pyinstaller).  The cluster doesn't need Hadoopy, Python, or any other libraries
# for this to work (as long as Pyinstaller can find everything, if not there are simple
# things that you can do to fix it).
hadoopy.launch_frozen(input_path, output_path, "wc.py")

# Analyze the output.  The output is an iterator of (word, count) where word is a
# string and count is an integer.
word_counts = dict(hadoopy.readtb(output_path))
for probe_word, expected_count in [("the", 1664), ("Alice", 221), ("tree", 3)]:
    print("word_counts[%s] = %d" % (probe_word, word_counts[probe_word]))
    assert expected_count == word_counts[probe_word]
