Example #1
 def _run_wc(self, orig_fn, launcher=hadoopy.launch_frozen):
     fn = 'out-%f-%s' % (time.time(), orig_fn)
     in_path = self.data_path + fn
     out_path = self.data_path + fn + '.out'
     print(os.path.abspath('.'))
     hadoopy.put(orig_fn, in_path)
     # We also do a few hdfs checks here
     self.assertEquals(len(hadoopy.ls(in_path)), 1)
     self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])
     self.assertTrue(hadoopy.exists(in_path))
     self.assertFalse(hadoopy.exists(out_path))
     self.assertFalse(hadoopy.isdir(in_path))
     self.assertFalse(hadoopy.isempty(in_path))
     # Don't let the file split, CDH3 has a bug and will try to split gz's
     launcher(in_path, out_path, 'wc.py', jobconfs=['mapred.min.split.size=100000000',
                                                    'mapreduce.task.userlog.limit.kb=1000'])
     if launcher == hadoopy.launch_frozen:
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     elif launcher == hadoopy.launch_local:
         self.assertFalse(hadoopy.isdir(out_path))
         self.assertFalse(hadoopy.isempty(out_path))
     else:
         raise ValueError('Launcher not recognized')
     wc = dict(hadoopy.readtb(out_path))
     self.assertEqual(wc['the'], 1664)
     self.assertEqual(wc['Alice'], 221)
Example #2
def freeze_script(script_path, temp_path='_hadoopy_temp'):
    """Freezes a script, puts it on hdfs, and gives you the path

    'frozen_tar_path' can be given to launch_frozen and it will use that
    instead of making its own; this is useful for repeated calls.  If a
    file with the same md5 already exists in the temp_path, it is used
    instead of putting a new copy there to avoid the file transfer.  The
    files are put into a temporary file based on the timestamp first, then
    moved to a location that is only a function of their md5 to prevent partial
    files.

    Args:
        script_path: Path to a hadoopy script
        temp_path: HDFS temporary path (default is '_hadoopy_temp')

    Returns:
        {'cmds': commands_ran, 'frozen_tar_path': frozen_tar_path}
    """
    tmp_frozen_tar_path = temp_path + '/%f.tar' % time.time()
    freeze_fp = tempfile.NamedTemporaryFile(suffix='.tar')
    cmds = hadoopy._freeze.freeze_to_tar(os.path.abspath(script_path), freeze_fp.name)
    md5 = _md5_file(freeze_fp.name)
    frozen_tar_path = temp_path + '/%s.tar' % md5
    if hadoopy.exists(frozen_tar_path):
        return {'cmds': cmds, 'frozen_tar_path': frozen_tar_path}
    hadoopy.put(freeze_fp.name, tmp_frozen_tar_path)
    try:
        hadoopy.mv(tmp_frozen_tar_path, frozen_tar_path)
    except IOError:
        if hadoopy.exists(frozen_tar_path):  # Check again
            return {'cmds': cmds, 'frozen_tar_path': frozen_tar_path}
        raise
    return {'cmds': cmds, 'frozen_tar_path': frozen_tar_path}
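The function above only uploads a new tar when no file with the same md5 already exists, so its return value can be reused across launches. A minimal usage sketch, assuming a local wc.py and illustrative HDFS paths (the frozen_tar_path keyword is the one named in the docstring):

# Sketch only (hypothetical paths): freeze once, then pass frozen_tar_path to
# launch_frozen so PyInstaller and the HDFS upload run a single time.
import hadoopy

frozen = freeze_script('wc.py')
for day in ('2013-01-01', '2013-01-02'):
    hadoopy.launch_frozen('logs/%s' % day,         # hypothetical input path
                          'wordcounts/%s' % day,   # hypothetical output path
                          'wc.py',
                          frozen_tar_path=frozen['frozen_tar_path'])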
Example #3
def __close_tmp_mv(fp_tweets, fp_wordcloud, hdfs_path=None, local_path=None):
    fp_tweets.close()
    fp_wordcloud.close()
    filename_tweets = fp_tweets.name
    filename_wordcloud = fp_wordcloud.name
    print "Will create new file " + filename_tweets[-24:-4] + ".csv"

    # Put to HDFS if specified to write to HDFS
    if hdfs_path:
        hadoopy.put(
            filename_tweets,
            hdfs_path + Util.TWEETS + '/' + filename_tweets[-24:-4] + '.csv')
        hadoopy.put(
            filename_wordcloud, hdfs_path + Util.WORDCLOUD + '/' +
            filename_wordcloud[-24:-4] + '.csv')

    # Put to local path if specified to write to local file system
    if local_path:
        shutil.copy(
            filename_tweets,
            local_path + Util.TWEETS + '/' + filename_tweets[-24:-4] + '.csv')
        shutil.copy(
            filename_wordcloud, local_path + Util.WORDCLOUD + '/' +
            filename_wordcloud[-24:-4] + '.csv')

    os.remove(filename_tweets)
    os.remove(filename_wordcloud)
Example #4
def _read_files(fns, prev_hashes, hdfs_output, output_format, max_record_size):
    """
    Args:
        fns: Iterator of file names
        prev_hashes: Set of hashes (they will be skipped), this is used to make
            the data unique

    Yields:
        Tuple of (data_hash, data) where data_hash is a sha1 hash
    """
    for fn in fns:
        sha1_hash = _sha1(fn)
        if sha1_hash not in prev_hashes:
            prev_hashes.add(sha1_hash)
            if output_format == 'record' and max_record_size is not None and max_record_size < os.stat(fn)[6]:
                # Put the file into the remote location
                hdfs_path = hadoopy.abspath('%s/_blobs/%s_%s' % (hdfs_output, sha1_hash, os.path.basename(fn)))
                data = ''
                hadoopy.put(fn, hdfs_path)
            else:
                hdfs_path = ''
                data = open(fn).read()
            if output_format == 'kv':
                yield sha1_hash, data
            elif output_format == 'record':
                out = {'sha1': sha1_hash, 'full_path': fn,
                       'extension': os.path.splitext(fn)[1][1:]}
                if data:
                    out['data'] = data
                if hdfs_path:
                    out['hdfs_path'] = hdfs_path
                yield sha1_hash, out
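Since _read_files is a generator of (key, value) pairs, it is typically drained into a TypedBytes file on HDFS. A minimal sketch under that assumption (file names and output path are illustrative):

# Illustrative sink for the generator above: stream unique (sha1, record)
# pairs into a TypedBytes file; paths and file names are hypothetical.
import hadoopy

prev_hashes = set()
records = _read_files(['photos/a.jpg', 'photos/b.jpg'], prev_hashes,
                      hdfs_output='exampleoutput', output_format='record',
                      max_record_size=10 * 1024 * 1024)
hadoopy.writetb('exampleoutput/input.tb', records)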
Example #5
def __cleanup_tmp_dir(tmp_tweet_dir,
                      tmp_wordcloud_dir,
                      hdfs_path=None,
                      local_path=None):

    tweet_files = os.listdir(tmp_tweet_dir)
    for tf in tweet_files:
        if hdfs_path:
            hadoopy.put(tmp_tweet_dir + '/' + tf,
                        hdfs_path + Util.TWEETS + '/' + tf[-24:-4] + '.csv')
        if local_path:
            shutil.copy(tmp_tweet_dir + '/' + tf,
                        local_path + Util.TWEETS + '/' + tf[-24:-4] + '.csv')
        os.remove(tmp_tweet_dir + '/' + tf)

    wordcloud_files = os.listdir(tmp_wordcloud_dir)
    for wf in wordcloud_files:
        if hdfs_path:
            hadoopy.put(tmp_wordcloud_dir + '/' + wf,
                        hdfs_path + Util.WORDCLOUD + '/' + wf[-24:-4] + '.csv')
        if local_path:
            shutil.copy(
                tmp_wordcloud_dir + '/' + wf,
                local_path + Util.WORDCLOUD + '/' + wf[-24:-4] + '.csv')
        os.remove(tmp_wordcloud_dir + '/' + wf)
Example #6
 def test_local(self):
     out_path = '%s/local_test/%f' % (self.data_path, time.time())
     hadoopy.mkdir(out_path)
     hadoopy.put('wc-input-alice.tb', out_path + '/wc-input-alice.tb')
     hadoopy.launch_local(out_path + '/wc-input-alice.tb',
                          out_path + '/out_list_cmdenvs',
                          'local.py',
                          max_input=1000,
                          cmdenvs=['TEST_ENV=10'],
                          files=[
                              'wc-input-alice.tb'
                          ])  # Just bring this along to test the files
     hadoopy.launch_local(out_path + '/wc-input-alice.tb',
                          out_path + '/out',
                          'local.py',
                          max_input=1000,
                          cmdenvs={'TEST_ENV': '10'},
                          files=[
                              'wc-input-alice.tb'
                          ])  # Just bring this along to test the files
     hadoopy.launch_local(
         ((1000 * 'a', 10000000 * 'b') for x in range(100)),
         None,
         'local.py',
         max_input=10000,
         cmdenvs=['TEST_ENV=10'],
         files=['wc-input-alice.tb'])
Example #7
 def test_local(self):
     out_path = '%s/local_test/%f' % (self.data_path, time.time())
     hadoopy.put('wc-input-alice.tb', out_path + '/wc-input-alice.tb')
     hadoopy.launch_local(out_path + '/wc-input-alice.tb', out_path + '/out', 'local.py', max_input=1000,
                          cmdenvs=['TEST_ENV=10'],
                          files=['wc-input-alice.tb'])  # Just bring this along to test the files
     hadoopy.launch_local(((1000 * 'a', 10000000 * 'b') for x in range(100)), None, 'local.py', max_input=10000,
                          cmdenvs=['TEST_ENV=10'],
                          files=['wc-input-alice.tb'])
Example #8
 def _run_face(self, fn, out_path, **kw):
     in_path = self.data_path + fn
     hdfs_out_path = '%sout-%s-%f' % (self.data_path, fn, time.time())
     if not hadoopy.exists(in_path):
         hadoopy.put(fn, in_path)
     hadoopy.launch_frozen(in_path, hdfs_out_path, 'face_finder.py', files=['haarcascade_frontalface_default.xml'], **kw)
     for num, ((image_name, box), image_data) in enumerate(hadoopy.readtb(hdfs_out_path)):
         with open(out_path + 'img%.8d.png' % num, 'w') as fp:
             fp.write(image_data)
Example #9
 def _run_face(self, fn, **kw):
     in_path = self.data_path + fn
     out_path = "%sout-%s-%f" % (self.data_path, fn, time.time())
     if not hadoopy.exists(in_path):
         hadoopy.put(fn, in_path)
     hadoopy.launch_frozen(in_path, out_path, "face_finder.py", files=["haarcascade_frontalface_default.xml"], **kw)
     for num, (image_name, (image_data, faces)) in enumerate(hadoopy.readtb(out_path)):
         with open(self.out_path + "img%.8d.jpg" % num, "w") as fp:
             fp.write(image_data)
Example #10
 def _run_wc(self,
             orig_fn,
             script_name='wc.py',
             launcher=hadoopy.launch_frozen,
             **kw):
     fn = 'out-%f-%s' % (time.time(), orig_fn)
     in_path = self.data_path + fn
     out_path = self.data_path + fn + '.out'
     print(os.path.abspath('.'))
     if not hadoopy.exists(in_path):
         hadoopy.put(orig_fn, in_path)
     # We also do a few hdfs checks here
     self.assertEquals(len(hadoopy.ls(in_path)), 1)
     #self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])  # This is no longer true in CDH4
     self.assertTrue(hadoopy.exists(in_path))
     self.assertFalse(hadoopy.exists(out_path))
     self.assertFalse(hadoopy.isdir(in_path))
     self.assertFalse(hadoopy.isempty(in_path))
     # Don't let the file split, CDH3 has a bug and will try to split gz's
     if not isinstance(launcher, str):
         launcher(in_path,
                  out_path + '_list_jobconfs',
                  script_name,
                  jobconfs=[
                      'mapred.min.split.size=100000000',
                      'mapreduce.task.userlog.limit.kb=1000'
                  ],
                  **kw)
         launcher(in_path,
                  out_path,
                  script_name,
                  jobconfs={
                      'mapred.min.split.size': '100000000',
                      'mapreduce.task.userlog.limit.kb': '1000'
                  },
                  **kw)
     if launcher == hadoopy.launch_frozen:
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     elif launcher == hadoopy.launch_local:
         self.assertFalse(hadoopy.isdir(out_path))
         self.assertFalse(hadoopy.isempty(out_path))
     elif launcher == 'launch_frozen_cmd':
         cmd = 'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"' % (
             script_name, in_path, out_path)
         print(cmd)
         subprocess.call(cmd.split())
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     else:
         raise ValueError('Launcher not recognized')
     wc = dict(hadoopy.readtb(out_path))
     self.assertEqual(wc['the'], 1664)
     self.assertEqual(wc['Alice'], 221)
Example #11
 def _run_hdfs(self, orig_fn):
     fn = "%f-%s" % (time.time(), orig_fn)
     file_path = "%s/%s" % (self.data_path, fn)
     hadoopy.put(orig_fn, file_path)
     cat_output = [_ for _ in hadoopy.readtb(file_path)]
     line = (331, "Title: Alice's Adventures in Wonderland")
     self.assertTrue(line in cat_output)
     ls_output = hadoopy.ls(self.data_path)
     self.assertTrue([x for x in ls_output if x.rsplit("/", 1)[-1] == fn])
     ls_output = hadoopy.ls(file_path)
     self.assertTrue(ls_output[0].rsplit("/", 1)[-1] == fn)
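hadoopy.put uploads an existing local file byte-for-byte; the wc-input-alice.tb file used here is a TypedBytes file of (line_number, line_text) pairs, which is why readtb yields tuples like (331, "Title: ..."). A file of that form can be produced with writetb, sketched below (the local file name and HDFS path are illustrative):

# Sketch: build a TypedBytes input like wc-input-alice.tb from a plain text
# file; 'alice.txt' and the output path are hypothetical.
import hadoopy

lines = enumerate(line.rstrip('\n') for line in open('alice.txt'))
hadoopy.writetb('exampleinput/wc-input-alice.tb', lines)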
Example #12
 def _run_hdfs(self, orig_fn):
     fn = '%f-%s' % (time.time(), orig_fn)
     file_path = '%s/%s' % (self.data_path, fn)
     hadoopy.put(orig_fn, file_path)
     cat_output = [_ for _ in hadoopy.readtb(file_path)]
     line = (331, 'Title: Alice\'s Adventures in Wonderland')
     self.assertTrue(line in cat_output)
     ls_output = hadoopy.ls(self.data_path)
     self.assertTrue([x for x in ls_output if x.rsplit('/', 1)[-1] == fn])
     ls_output = hadoopy.ls(file_path)
     self.assertTrue(ls_output[0].rsplit('/', 1)[-1] == fn)
Example #13
def _run_haystack(fn, script_name):
    cur_time = time.time()
    hdfs_base_path = "hadoopy-test-data/%f/" % cur_time
    print("Storing HDFS temp files and output in [%s]" % hdfs_base_path)
    in_path = hdfs_base_path + os.path.basename(fn)
    out_path = hdfs_base_path + "out-" + os.path.basename(fn)
    hadoopy.put(fn, in_path)
    print("Launching job [%s]" % script_name)
    hadoopy.launch_frozen(in_path, out_path, script_name, files=[data_path + "target.jpg"])
    print("Storing local output in [%s]" % local_out)
    for num, (image_name, image_data) in enumerate(hadoopy.readtb(out_path)):
        open("%s%s-img%.8d-%s.jpg" % (local_out, script_name, num, image_name), "w").write(image_data)
Example #14
 def _run_face(self, fn, out_path, **kw):
     bfn = os.path.basename(fn)
     in_path = self.data_path + bfn
     hdfs_out_path = '%sout-%s-%f' % (self.data_path, bfn, time.time())
     if not hadoopy.exists(in_path):
         hadoopy.put(fn, in_path)
     hadoopy.launch_frozen(in_path,
                           hdfs_out_path,
                           'face_finder.py',
                           files=['haarcascade_frontalface_default.xml'],
                           **kw)
     for num, ((image_name, box),
               image_data) in enumerate(hadoopy.readtb(hdfs_out_path)):
         with open(out_path + 'img%.8d.png' % num, 'w') as fp:
             fp.write(image_data)
Example #15
def _run_haystack(fn, script_name):
    cur_time = time.time()
    hdfs_base_path = 'hadoopy-test-data/%f/' % cur_time
    print('Storing HDFS temp files and output in [%s]' % hdfs_base_path)
    in_path = hdfs_base_path + os.path.basename(fn)
    out_path = hdfs_base_path + 'out-' + os.path.basename(fn)
    hadoopy.put(fn, in_path)
    print('Launching job [%s]' % script_name)
    hadoopy.launch_frozen(in_path,
                          out_path,
                          script_name,
                          files=[data_path + 'target.jpg'])
    print('Storing local output in [%s]' % local_out)
    for num, (image_name, image_data) in enumerate(hadoopy.readtb(out_path)):
        open('%s%s-img%.8d-%s.jpg' % (local_out, script_name, num, image_name),
             'w').write(image_data)
Example #16
def freeze_script(script_path, cache=True, temp_path='_hadoopy_temp'):
    """Freezes a script, puts it on hdfs, and gives you the path

    'frozen_tar_path' can be given to launch_frozen and it will use that
    instead of making its own; this is useful for repeated calls.  If a
    file with the same md5 already exists in the temp_path, it is used
    instead of putting a new copy there to avoid the file transfer.  The
    files are put into a temporary file based on the timestamp first, then
    moved to a location that is only a function of their md5 to prevent partial
    files.

    Args:
        script_path: Path to a hadoopy script
        cache: If True (default) then use previously frozen scripts.  Cache is stored in memory (not persistent).
        temp_path: HDFS temporary path (default is '_hadoopy_temp')

    Returns:
        {'cmds': commands_ran, 'frozen_tar_path': frozen_tar_path}

    Raises:
        ValueError: Script cannot be found
    """
    script_abspath = os.path.abspath(script_path)
    if not os.path.exists(script_abspath):
        raise ValueError('Script [%s] does not exist.' % script_abspath)
    try:
        if not cache:
            raise KeyError  # NOTE(brandyn): Don't use cache item
        cmds, frozen_tar_path = FREEZE_CACHE[script_abspath]
    except KeyError:
        tmp_frozen_tar_path = temp_path + '/%f.tar' % time.time()
        freeze_fp = tempfile.NamedTemporaryFile(suffix='.tar')
        cmds = hadoopy._freeze.freeze_to_tar(os.path.abspath(script_path), freeze_fp.name)
        md5 = _md5_file(freeze_fp.name)
        frozen_tar_path = temp_path + '/%s.tar' % md5
        if not hadoopy.exists(frozen_tar_path):
            if not hadoopy.exists(temp_path):  # CDH4 Fix
                hadoopy.mkdir(temp_path)
            hadoopy.put(freeze_fp.name, tmp_frozen_tar_path)
            try:
                hadoopy.mv(tmp_frozen_tar_path, frozen_tar_path)
            except IOError:
                if not hadoopy.exists(frozen_tar_path):  # Check again
                    raise
    FREEZE_CACHE[script_abspath] = cmds, frozen_tar_path
    return {'cmds': cmds, 'frozen_tar_path': frozen_tar_path}
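Both freeze_script variants hash the frozen tar with an _md5_file helper that is not shown in these excerpts. A plausible chunked implementation, given only as an assumption about its shape:

# Assumed shape of the _md5_file helper referenced above (not the library's
# verbatim code): hash the tar in fixed-size chunks to keep memory flat.
import hashlib

def _md5_file(path, block_size=1048576):
    md5 = hashlib.md5()
    with open(path, 'rb') as fp:
        for chunk in iter(lambda: fp.read(block_size), b''):
            md5.update(chunk)
    return md5.hexdigest()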
Example #17
 def test_local(self):
     out_path = "%s/local_test/%f" % (self.data_path, time.time())
     hadoopy.put("wc-input-alice.tb", out_path + "/wc-input-alice.tb")
     hadoopy.launch_local(
         out_path + "/wc-input-alice.tb",
         out_path + "/out",
         "local.py",
         max_input=1000,
         cmdenvs=["TEST_ENV=10"],
         files=["wc-input-alice.tb"],
     )  # Just bring this along to test the files
     hadoopy.launch_local(
         ((1000 * "a", 10000000 * "b") for x in range(100)),
         None,
         "local.py",
         max_input=10000,
         cmdenvs=["TEST_ENV=10"],
         files=["wc-input-alice.tb"],
     )
Example #18
 def _run_wc(self, orig_fn, script_name="wc.py", launcher=hadoopy.launch_frozen, **kw):
     fn = "out-%f-%s" % (time.time(), orig_fn)
     in_path = self.data_path + fn
     out_path = self.data_path + fn + ".out"
     print(os.path.abspath("."))
     if not hadoopy.exists(in_path):
         hadoopy.put(orig_fn, in_path)
     # We also do a few hdfs checks here
     self.assertEquals(len(hadoopy.ls(in_path)), 1)
     self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])
     self.assertTrue(hadoopy.exists(in_path))
     self.assertFalse(hadoopy.exists(out_path))
     self.assertFalse(hadoopy.isdir(in_path))
     self.assertFalse(hadoopy.isempty(in_path))
     # Don't let the file split, CDH3 has a bug and will try to split gz's
     if not isinstance(launcher, str):
         launcher(
             in_path,
             out_path,
             script_name,
             jobconfs=["mapred.min.split.size=100000000", "mapreduce.task.userlog.limit.kb=1000"],
             **kw
         )
     if launcher == hadoopy.launch_frozen:
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     elif launcher == hadoopy.launch_local:
         self.assertFalse(hadoopy.isdir(out_path))
         self.assertFalse(hadoopy.isempty(out_path))
     elif launcher == "launch_frozen_cmd":
         cmd = (
             'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"'
             % (script_name, in_path, out_path)
         )
         print(cmd)
         subprocess.call(cmd.split())
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     else:
         raise ValueError("Launcher not recognized")
     wc = dict(hadoopy.readtb(out_path))
     self.assertEqual(wc["the"], 1664)
     self.assertEqual(wc["Alice"], 221)
Example #19
def report_clusters_faces_videos(predict_start_time, video_start_time):
    """
    """
    if SKIP_OVERRIDE and OVERRIDE_REPORT_START_TIME:
        return OVERRIDE_REPORT_START_TIME
    root = make_drive_root(predict_start_time, 'predict')
    start_time = OVERRIDE_REPORT_START_TIME if OVERRIDE_REPORT_START_TIME else '%f' % time.time()
    video_root = make_drive_root(video_start_time, 'video')
    out_root = make_drive_root(start_time, 'report')
    local = make_local_root(start_time)
    clusters = ['indoors', 'nonphotos', 'outdoors', 'objects', 'pr0n']
    clusters += ['faces']

    # Process all the thumbnails in parallel
    thumb_input = [root + '/cluster/' + c + '/partition' for c in clusters]
    picarus.report.make_thumbnails(thumb_input, out_root + '/report/thumb', 100, 'cluster')
    if video_root is not None:
        picarus.report.make_thumbnails(video_root + '/video_keyframe/allframes', out_root + '/report/vidthumb', 100, 'frame')

    # Prepare json report
    report = {}
    for c in clusters:
        make_faces = 'faces' in c
        r = picarus.report.report_clusters(root + '/cluster/' + c, c, make_faces)
        report.update(r)

    # Copy all the thumbnails locally
    picarus.report.report_thumbnails(out_root + '/report/thumb', local + '/report/t/')
    if video_root is not None:
        r = picarus.report.report_video_keyframe(video_root + '/video_keyframe/keyframe')
        report.update(r)
        picarus.report.report_thumbnails(out_root + '/report/vidthumb', local + '/report/t/')

    with open(local + '/report/sample_report.js', 'w') as f:
        f.write('var report = ')
        f.write(json.dumps(report))

    shutil.copy(picarus.report.__path__[0] + '/data/static_sample_report.html', local + '/report')
    hadoopy.put(local + '/report', out_root + '/report/')
    print('Report output ------------------> [%s/%s]' % (out_root, '/report'))
    return start_time
Example #20
 def _run_wc(self, orig_fn, script_name='wc.py', launcher=hadoopy.launch_frozen, **kw):
     fn = 'out-%f-%s' % (time.time(), orig_fn)
     in_path = self.data_path + fn
     out_path = self.data_path + fn + '.out'
     print(os.path.abspath('.'))
     if not hadoopy.exists(in_path):
         hadoopy.put(orig_fn, in_path)
     # We also do a few hdfs checks here
     self.assertEquals(len(hadoopy.ls(in_path)), 1)
     #self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])  # This is no longer true in CDH4
     self.assertTrue(hadoopy.exists(in_path))
     self.assertFalse(hadoopy.exists(out_path))
     self.assertFalse(hadoopy.isdir(in_path))
     self.assertFalse(hadoopy.isempty(in_path))
     # Don't let the file split, CDH3 has a bug and will try to split gz's
     if not isinstance(launcher, str):
         launcher(in_path, out_path + '_list_jobconfs', script_name, jobconfs=['mapred.min.split.size=100000000',
                                                                               'mapreduce.task.userlog.limit.kb=1000'], **kw)
         launcher(in_path, out_path, script_name, jobconfs={'mapred.min.split.size': '100000000',
                                                                 'mapreduce.task.userlog.limit.kb': '1000'}, **kw)
     if launcher == hadoopy.launch_frozen:
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     elif launcher == hadoopy.launch_local:
         self.assertFalse(hadoopy.isdir(out_path))
         self.assertFalse(hadoopy.isempty(out_path))
     elif launcher == 'launch_frozen_cmd':
         cmd = 'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"' % (script_name,
                                                                                                                                             in_path,
                                                                                                                                             out_path)
         print(cmd)
         subprocess.call(cmd.split())
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     else:
         raise ValueError('Launcher not recognized')
     wc = dict(hadoopy.readtb(out_path))
     self.assertEqual(wc['the'], 1664)
     self.assertEqual(wc['Alice'], 221)
Example #21
import hadoopy
import time
import os

# Setup paths
here = os.path.abspath(os.path.dirname(__file__))
data_path = "hadoopy-test-data/%f/" % time.time()
input_path = data_path + "wc-input-alice.tb"
output_path = data_path + "wc-output-alice"

# Put the data from a local path onto HDFS

hadoopy.put(os.path.join(here, "..", "..", "data", "wc-input-alice.tb"), input_path)

# Launch the job.  The wc.py script will be "frozen" (all dependencies are discovered using Pyinstaller).
# The cluster doesn't need Hadoopy, Python, or any other libraries for this to work (as long as Pyinstaller can find everything, if not there are simple things that you can do to fix it).
hadoopy.launch_frozen(input_path, output_path, "wc.py")

# Analyze the output.  The output is an iterator of (word, count) where word is a string and count
# is an integer.
word_counts = dict(hadoopy.readtb(output_path))
for probe_word, expected_count in [("the", 1664), ("Alice", 221), ("tree", 3)]:
    print("word_counts[%s] = %d" % (probe_word, word_counts[probe_word]))
    assert expected_count == word_counts[probe_word]
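The wc.py script launched above is not shown in this excerpt; in hadoopy a word-count job is a mapper/reducer pair wired up with hadoopy.run, roughly as in the sketch below (the actual demo script may differ in details):

# Rough sketch of a hadoopy word-count script like the wc.py used above;
# input keys are line numbers and values are lines of text from the Alice book.
import hadoopy

def mapper(line_num, line):
    for word in line.split():
        yield word, 1

def reducer(word, counts):
    yield word, sum(counts)

if __name__ == '__main__':
    hadoopy.run(mapper, reducer)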
Example #22
from google_ngram_downloader import readline_google_store
import wget
import hadoopy
import os

for i in range(3, 6):

	gene = readline_google_store(ngram_len=i, lang='eng')

	while True:
		try:
			fname, url, records = next(gene)
			print fname
			if hadoopy.exists('/google-ngram/'+str(i)+'/'+fname):
				continue
			else:
				wget.download(url)
				hadoopy.put(fname, '/google-ngram/'+str(i)+'/'+fname)
				os.remove(fname)
		
		except StopIteration:
			print "END"
			break
Example #23
def main():
    local_path = './BigData/dummy_data'
    for file in read_local_dir(local_path):
        hadoopy.put(file, 'data')
        print "The file %s has been put into hdfs" % (file,)
Example #24
def main(opt):

    # set twitter auth credentials
    auth = tweepy.OAuthHandler(opt.consumer_key, opt.consumer_secret)
    auth.set_access_token(opt.access_token_key, opt.access_token_secret)

    # get api instance
    if opt.https_proxy:
        api = tweepy.API(auth,
                         wait_on_rate_limit=True,
                         wait_on_rate_limit_notify=True,
                         proxy=opt.https_proxy)
    else:
        api = tweepy.API(auth,
                         wait_on_rate_limit=True,
                         wait_on_rate_limit_notify=True)

    # write_to_file, roll_size or roll_count
    write_to_file = True if opt.hdfs_path or opt.local_path else False

    # instantiate the analyzer
    analyzer = an.Analyzer()

    # override opt.roll_size for testing
    # opt.roll_size = 20480

    if write_to_file:
        tmp_tweet_dir = Util.TMP_DIR + '/' + Util.TWEETS
        tmp_wordcloud_dir = Util.TMP_DIR + '/' + Util.WORDCLOUD

        if not os.path.exists(tmp_tweet_dir):
            os.makedirs(tmp_tweet_dir)

        if not os.path.exists(tmp_wordcloud_dir):
            os.makedirs(tmp_wordcloud_dir)

        # create new hdfs paths
        if opt.hdfs_path:
            hadoopy.put(tmp_tweet_dir, opt.hdfs_path)
            hadoopy.put(tmp_wordcloud_dir, opt.hdfs_path)

        # create new local paths
        if opt.local_path:
            try:
                os.makedirs(opt.local_path + Util.TWEETS)
                os.makedirs(opt.local_path + Util.WORDCLOUD)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise e
                pass

    # join our keywords as a single query
    # query = ' OR '.join(opt.keywords)
    queries = [
        ' OR '.join(opt.keywords[i:i + 10])
        for i in xrange(0, len(opt.keywords), 10)
    ]

    for query in queries:

        file_closed = True

        # Cursor params
        # since_id=tweet_id, max_id=tweet_id, lang="en"
        # include_entities=True, rpp=100, count=1000
        if opt.since_tweet_id:
            cursor = tweepy.Cursor(api.search,
                                   q=query,
                                   result_type="recent",
                                   since_id=opt.since_tweet_id,
                                   rpp=100)
        else:
            cursor = tweepy.Cursor(api.search,
                                   q=query,
                                   result_type="recent",
                                   rpp=100)

        try:

            for tweet in cursor.items():

                tweet_obj, wordarray = Tweet.tweet_wordcloud_from_json(
                    tweet._json, analyzer)

                wordcloud_list = Wordcloud.list_from_array(
                    tweet_obj.tweet_id, wordarray)

                Util.vprint(opt.verbose,
                            "Tweet_id: " + str(tweet_obj.tweet_id))
                # print "Tweet_id: " + str(tweet_obj.tweet_id)

                # determine if we are flagged to write to file
                if write_to_file:

                    # start of loop
                    # fp will either return an existing .tmp file or open a new one
                    # bytes_written will be automatically set to zero
                    # if new file created
                    if file_closed:
                        now = datetime.utcnow()
                        fp_tweets = open(
                            tmp_tweet_dir + '/' +
                            now.strftime('%Y-%m-%dT%H.%M.%SZ') + '.tmp', 'a')
                        fp_wordcloud = open(
                            tmp_wordcloud_dir + '/' +
                            now.strftime('%Y-%m-%dT%H.%M.%SZ') + '.tmp', 'a')
                        bytes_written = 0
                        print "Create new temporary file to write to: " + fp_tweets.name[
                            -24:]

                    bytes_written += Tweet.write_to_file(tweet_obj, fp_tweets)
                    Wordcloud.write_to_file(wordcloud_list, fp_wordcloud)
                    file_closed = False
                    Util.vprint(opt.verbose,
                                "bytes_written: " + str(bytes_written))

                    # close the file if reached limit
                    # rename (remove .tmp) and move to specified local / HDFS path
                    if (bytes_written >= opt.roll_size):
                        __close_tmp_mv(fp_tweets, fp_wordcloud, opt.hdfs_path,
                                       opt.local_path)
                        file_closed = True

            print "Finished searching tweets for queries: "
            print query

        except tweepy.error.TweepError as te:
            print "Tweepy throws error"
            print te.reason
            print te.response

        except (KeyboardInterrupt, SystemExit):
            if write_to_file and not file_closed:
                print "Closing temporary files"
                fp_tweets.close()
                fp_wordcloud.close()
                __cleanup_tmp_dir(tmp_tweet_dir, tmp_wordcloud_dir,
                                  opt.hdfs_path, opt.local_path)

        # post loop
        # close the file, just in case it is not closed within the loop
        finally:
            if write_to_file and not file_closed:
                print "Closing temporary files"
                fp_tweets.close()
                fp_wordcloud.close()
                __cleanup_tmp_dir(tmp_tweet_dir, tmp_wordcloud_dir,
                                  opt.hdfs_path, opt.local_path)
                file_closed = True

    if write_to_file and not file_closed:
        print "Closing temporary files"
        fp_tweets.close()
        fp_wordcloud.close()
        __cleanup_tmp_dir(tmp_tweet_dir, tmp_wordcloud_dir, opt.hdfs_path,
                          opt.local_path)

    print "Ending tweet searching"
Example #25
def write(localpath, hdfspath):
    try:
        hadoopy.put(localpath, hdfspath)
        return True
    except Exception, e:
        logging.exception(e)
        return False
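A call site for this wrapper might look like the following (paths are illustrative):

# Hypothetical call site for the wrapper above; paths are made up.
if not write('/tmp/metrics.csv', '/user/exampleuser/metrics/metrics.csv'):
    logging.error('HDFS upload failed; see the logged exception for details')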
Example #26
    def on_data(self, data):

        tweet_obj, wordarray = Tweet.tweet_wordcloud_from_json(
            json.loads(data), self.analyzer)

        wordcloud_list = Wordcloud.list_from_array(tweet_obj.tweet_id,
                                                   wordarray)

        print tweet_obj.to_tsv_str()

        if self.write_to_file:
            print "Writing to file"

            if self.file_closed:
                print "Open new temporary files to write to"
                now = datetime.utcnow()
                self.fp_tweets = open(
                    Util.TMP_DIR + '/' + Util.TWEETS + '/' +
                    now.strftime('%Y-%m-%dT%H.%M.%SZ') + '.tmp', 'a')
                self.fp_wordcloud = open(
                    Util.TMP_DIR + '/' + Util.WORDCLOUD + '/' +
                    now.strftime('%Y-%m-%dT%H.%M.%SZ') + '.tmp', 'a')
                self.bytes_written = 0

            self.bytes_written += Tweet.write_to_file(tweet_obj,
                                                      self.fp_tweets)
            Wordcloud.write_to_file(wordcloud_list, self.fp_wordcloud)
            self.file_closed = False

            print "bytes_written: " + str(self.bytes_written)
            print "roll_size: " + str(self.roll_size)

            if (self.bytes_written >= self.roll_size):
                self.fp_tweets.close()
                self.fp_wordcloud.close()

                filename_tweets = self.fp_tweets.name
                filename_wordcloud = self.fp_wordcloud.name

                print "Moving temporary files " + filename_tweets[
                    -24:-4] + ".csv"
                print "filename_tweets: " + filename_tweets
                print "filename_wordcloud: " + filename_wordcloud

                if self.hdfs_path:
                    hadoopy.put(
                        filename_tweets, self.hdfs_path + Util.TWEETS + '/' +
                        filename_tweets[-24:-4] + '.csv')
                    hadoopy.put(
                        filename_wordcloud, self.hdfs_path + Util.WORDCLOUD +
                        '/' + filename_wordcloud[-24:-4] + '.csv')

                if self.local_path:
                    shutil.copy(
                        filename_tweets, self.local_path + '/' + Util.TWEETS +
                        '/' + filename_tweets[-24:-4] + '.csv')
                    shutil.copy(
                        filename_wordcloud,
                        self.local_path + '/' + Util.WORDCLOUD + '/' +
                        filename_wordcloud[-24:-4] + '.csv')

                os.remove(filename_tweets)
                os.remove(filename_wordcloud)
                self.file_closed = True

        return True
Example #27
def main():
    local_path='../data/'
    #hadoopy.writetb('/tmp/data/',read_local_dir(local_path))
    for file in read_local_dir(local_path):
        hadoopy.put(file,'/tmp/data')
        print "The file %s has been put into hdfs" % (file)
Example #28
def main():
    local_path = '../data/'
    #hadoopy.writetb('/tmp/data/',read_local_dir(local_path))
    for file in read_local_dir(local_path):
        hadoopy.put(file, '/tmp/data')
        print "The file %s has been put into hdfs" % (file)
Example #29
ts = time.time()
ts = datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d%H%M%S')
fw = open("./logs"+"/"+"data"+ts,"w")

dataList = []

for i in xrange(arg2):
	string = randomDate("1/1/2010-1:30:00", "1/1/2014-4:50:60",random.random())+" "+pub_list[int(random.random()*10)%len(pub_list)]+" "+advertiser_list[int(random.random()*10)%len(advertiser_list)]+" "+ website_list[int(random.random()*10)%len(website_list)] + " " + geo_list[int(random.random()*10)%len(geo_list)] + " " +str(round(random.random(),4)) + " " + str(int(random.random()*10000))
	if (i+1)%1000 == 0 :
		fw.close()
		ts = time.time()
		ts = datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d%H%M%S')
		fw = open("./logs/"+"data"+ts+str(int(random.random()*10000)),"w")
	print >> fw, string
	#producer.send_messages(arg1, string) #kafka TODO

fw.close()

hadoopy.put("./logs/*", hdfs_path) #hadoop TODO
#kafka.close() #kafka TODO



for the_file in os.listdir("./logs"):
	file_path = os.path.join("./logs", the_file)
	try:
		if os.path.isfile(file_path):
			os.unlink(file_path)
	except Exception, e:
		print e
Example #30
import hadoopy
import time

# Setup paths
data_path = "hadoopy-test-data/%f/" % time.time()
input_path = data_path + "wc-input-alice.tb"
output_path = data_path + "wc-output-alice"

# Put the data from a local path onto HDFS
hadoopy.put("../../data/wc-input-alice.tb", input_path)

# Launch the job.  The wc.py script will be "frozen" (all dependencies are discovered using Pyinstaller).
# The cluster doesn't need Hadoopy, Python, or any other libraries for this to work (as long as Pyinstaller can find everything, if not there are simple things that you can do to fix it).
hadoopy.launch_frozen(input_path, output_path, "wc.py")

# Analyze the output.  The output is an iterator of (word, count) where word is a string and count
# is an integer.
word_counts = dict(hadoopy.readtb(output_path))
for probe_word, expected_count in [("the", 1664), ("Alice", 221), ("tree", 3)]:
    print("word_counts[%s] = %d" % (probe_word, word_counts[probe_word]))
    assert expected_count == word_counts[probe_word]
Example #31
import hadoopy
import time
import os

# Setup paths
here = os.path.abspath(os.path.dirname(__file__))
data_path = 'hadoopy-test-data/%f/' % time.time()
input_path = data_path + 'wc-input-alice.tb'
output_path = data_path + 'wc-output-alice'

# Put the data from a local path onto HDFS

hadoopy.put(os.path.join(here, '..', '..', 'data', 'wc-input-alice.tb'),
            input_path)

# Launch the job.  The wc.py script will be "frozen" (all dependencies are discovered using Pyinstaller).
# The cluster doesn't need Hadoopy, Python, or any other libraries for this to work (as long as Pyinstaller can find everything, if not there are simple things that you can do to fix it).
hadoopy.launch_frozen(input_path, output_path, 'wc.py')

# Analyze the output.  The output is an iterator of (word, count) where word is a string and count
# is an integer.
word_counts = dict(hadoopy.readtb(output_path))
for probe_word, expected_count in [('the', 1664), ('Alice', 221), ('tree', 3)]:
    print('word_counts[%s] = %d' % (probe_word, word_counts[probe_word]))
    assert expected_count == word_counts[probe_word]