Example #1
 def _run_wc(self, orig_fn, launcher=hadoopy.launch_frozen):
     fn = 'out-%f-%s' % (time.time(), orig_fn)
     in_path = self.data_path + fn
     out_path = self.data_path + fn + '.out'
     print(os.path.abspath('.'))
     hadoopy.put(orig_fn, in_path)
     # We also do a few hdfs checks here
     self.assertEquals(len(hadoopy.ls(in_path)), 1)
     self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])
     self.assertTrue(hadoopy.exists(in_path))
     self.assertFalse(hadoopy.exists(out_path))
     self.assertFalse(hadoopy.isdir(in_path))
     self.assertFalse(hadoopy.isempty(in_path))
     # Don't let the file split, CDH3 has a bug and will try to split gz's
     launcher(in_path, out_path, 'wc.py', jobconfs=['mapred.min.split.size=100000000',
                                                    'mapreduce.task.userlog.limit.kb=1000'])
     if launcher == hadoopy.launch_frozen:
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     elif launcher == hadoopy.launch_local:
         self.assertFalse(hadoopy.isdir(out_path))
         self.assertFalse(hadoopy.isempty(out_path))
     else:
         raise ValueError('Launcher not recognized')
     wc = dict(hadoopy.readtb(out_path))
     self.assertEqual(wc['the'], 1664)
     self.assertEqual(wc['Alice'], 221)
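The wc.py script launched by this test is not shown on this page; a minimal sketch of a hadoopy word-count job along the lines of the stock demo (assuming the usual hadoopy.run entry point, with each input line arriving as the mapper's value) would be:

"""Hypothetical wc.py word-count sketch."""
import hadoopy


def mapper(key, value):
    # key is the byte offset (ignored); value is one line of text
    for word in value.split():
        yield word, 1


def reducer(word, counts):
    # counts is an iterator of partial counts emitted for this word
    yield word, sum(int(c) for c in counts)


if __name__ == '__main__':
    hadoopy.run(mapper, reducer, doc=__doc__)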
Example #2
def freeze_script(script_path, temp_path='_hadoopy_temp'):
    """Freezes a script, puts it on hdfs, and gives you the path

    'frozen_tar_path' can be given to launch_frozen and it will use that
    instead of making its own; this is useful for repeated calls.  If a
    file with the same md5 already exists in the temp_path, it is used
    instead of putting a new copy there to avoid the file transfer.  The
    files are put into a temporary file based on the timestamp first, then
    moved to a location that is only a function of their md5 to prevent partial
    files.

    Args:
        script_path: Path to a hadoopy script
        temp_path: HDFS temporary path (default is '_hadoopy_temp')

    Returns:
        {'cmds': commands_ran, 'frozen_tar_path': frozen_tar_path}
    """
    tmp_frozen_tar_path = temp_path + '/%f.tar' % time.time()
    freeze_fp = tempfile.NamedTemporaryFile(suffix='.tar')
    cmds = hadoopy._freeze.freeze_to_tar(os.path.abspath(script_path), freeze_fp.name)
    md5 = _md5_file(freeze_fp.name)
    frozen_tar_path = temp_path + '/%s.tar' % md5
    if hadoopy.exists(frozen_tar_path):
        return {'cmds': cmds, 'frozen_tar_path': frozen_tar_path}
    hadoopy.put(freeze_fp.name, tmp_frozen_tar_path)
    try:
        hadoopy.mv(tmp_frozen_tar_path, frozen_tar_path)
    except IOError, e:
        if hadoopy.exists(frozen_tar_path):  # Check again
            return {'cmds': cmds, 'frozen_tar_path': frozen_tar_path}
        raise e
    return {'cmds': cmds, 'frozen_tar_path': frozen_tar_path}
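Per the docstring, the returned 'frozen_tar_path' can be passed back to launch_frozen on later runs so the script is only frozen once; a hedged usage sketch (the input/output paths are hypothetical):

import hadoopy

frozen = freeze_script('wc.py')  # freeze and upload once
for in_path, out_path in [('data/part0', 'out/part0'), ('data/part1', 'out/part1')]:
    hadoopy.launch_frozen(in_path, out_path, 'wc.py',
                          frozen_tar_path=frozen['frozen_tar_path'])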
Example #3
 def tearDown(self):
     if hadoopy.exists(self.data_path):
         self.assertTrue(hadoopy.isempty(self.data_path))  # directories are empty
         self.assertTrue(hadoopy.isdir(self.data_path))
         hadoopy.rmr(self.data_path)
     self.assertFalse(hadoopy.exists(self.data_path))
     self.assertFalse(hadoopy.isdir(self.data_path))
     self.assertFalse(hadoopy.isempty(self.data_path))
Example #4
 def tearDown(self):
     if hadoopy.exists(self.data_path):
         self.assertTrue(hadoopy.isempty(
             self.data_path))  # directories are empty
         self.assertTrue(hadoopy.isdir(self.data_path))
         hadoopy.rmr(self.data_path)
     self.assertFalse(hadoopy.exists(self.data_path))
     self.assertFalse(hadoopy.isdir(self.data_path))
     self.assertFalse(hadoopy.isempty(self.data_path))
Example #5
def starter(args, launch=True):
    """ The function that calls hadoopy.launch_frozen """
    gopts.args = args

    mat = args.get('mat', None)
    if mat is None:
        raise NameError("'mat' option not specified on the command line")

    input = mat
    matname, matext = os.path.splitext(mat)

    gopts.getintkey('blocksize', 3)
    schedule = gopts.getstrkey('reduce_schedule', '1')

    # clear the output
    output = args.get('output', '%s-qrr%s' % (matname, matext))
    if hadoopy.exists(output):
        print "Removing %s" % (output)
        hadoopy.rm(output)

    outputnamefunc = lambda x: output + "_iter%i" % (x)
    steps = schedule.split(',')

    jobconfs = []

    # determine the split size
    if 'split_size' in args:
        splitsize = args['split_size']
        jobconfs.append('mapreduce.input.fileinputformat.split.minsize=' +
                        str(splitsize))

    for i, step in enumerate(steps):
        if i > 0:
            input = curoutput
            mapper = 'org.apache.hadoop.mapred.lib.IdentityMapper'
        else:
            mapper = True  # use the command line mapper

        if i + 1 == len(steps):
            curoutput = output
        else:
            curoutput = output + "_iter%i" % (i + 1)
            if hadoopy.exists(curoutput):
                hadoopy.rm(curoutput)

        gopts.setkey('iter', i)

        if launch:
            hadoopy.launch_frozen(input,
                                  curoutput,
                                  __file__,
                                  mapper=mapper,
                                  cmdenvs=gopts.cmdenv(),
                                  num_reducers=int(step),
                                  jobconfs=jobconfs)
Example #6
 def _run_wc(self,
             orig_fn,
             script_name='wc.py',
             launcher=hadoopy.launch_frozen,
             **kw):
     fn = 'out-%f-%s' % (time.time(), orig_fn)
     in_path = self.data_path + fn
     out_path = self.data_path + fn + '.out'
     print(os.path.abspath('.'))
     if not hadoopy.exists(in_path):
         hadoopy.put(orig_fn, in_path)
     # We also do a few hdfs checks here
     self.assertEquals(len(hadoopy.ls(in_path)), 1)
     #self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])  # This is no longer true in CDH4
     self.assertTrue(hadoopy.exists(in_path))
     self.assertFalse(hadoopy.exists(out_path))
     self.assertFalse(hadoopy.isdir(in_path))
     self.assertFalse(hadoopy.isempty(in_path))
     # Don't let the file split, CDH3 has a bug and will try to split gz's
     if not isinstance(launcher, str):
         launcher(in_path,
                  out_path + '_list_jobconfs',
                  script_name,
                  jobconfs=[
                      'mapred.min.split.size=100000000',
                      'mapreduce.task.userlog.limit.kb=1000'
                  ],
                  **kw)
         launcher(in_path,
                  out_path,
                  script_name,
                  jobconfs={
                      'mapred.min.split.size': '100000000',
                      'mapreduce.task.userlog.limit.kb': '1000'
                  },
                  **kw)
     if launcher == hadoopy.launch_frozen:
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     elif launcher == hadoopy.launch_local:
         self.assertFalse(hadoopy.isdir(out_path))
         self.assertFalse(hadoopy.isempty(out_path))
     elif launcher == 'launch_frozen_cmd':
         cmd = 'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"' % (
             script_name, in_path, out_path)
         print(cmd)
         subprocess.call(cmd.split())
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     else:
         raise ValueError('Launcher not recognized')
     wc = dict(hadoopy.readtb(out_path))
     self.assertEqual(wc['the'], 1664)
     self.assertEqual(wc['Alice'], 221)
Example #7
def starter(args, launch=True):
    """ The function that calls hadoopy.launch_frozen """
    gopts.args = args

    mat = args.get('mat', None)
    if mat is None:
        raise NameError("'mat' option not specified on the command line")

    input = mat
    matname, matext = os.path.splitext(mat)

    gopts.getintkey('blocksize', 3)
    schedule = gopts.getstrkey('reduce_schedule', '1')

    # clear the output
    output = args.get('output', '%s-normal%s' % (matname, matext))
    if hadoopy.exists(output):
        print "Removing %s" % (output)
        hadoopy.rm(output)

    outputnamefunc = lambda x: output + "_iter%i" % (x)
    steps = schedule.split(',')

    for i, step in enumerate(steps):
        if i > 0:
            input = curoutput

        if i + 1 == len(steps):
            curoutput = output
        else:
            curoutput = output + "_iter%i" % (i + 1)
            if hadoopy.exists(curoutput):
                hadoopy.rm(curoutput)

        gopts.setkey('iter', i)

        if launch:
            if i > 0:
                mapper = "org.apache.hadoop.mapred.lib.IdentityMapper"
                hadoopy.launch_frozen(input,
                                      curoutput,
                                      __file__,
                                      mapper=mapper,
                                      cmdenvs=gopts.cmdenv(),
                                      num_reducers=int(step))
            else:
                hadoopy.launch_frozen(input,
                                      curoutput,
                                      __file__,
                                      cmdenvs=gopts.cmdenv(),
                                      num_reducers=int(step))
Example #8
 def test_err(self):
     nonsense_path = 'sdfskjdfksjdkfjskdfksjdfksdkfjskdjfksjdk'
     self.assertFalse(hadoopy.exists(nonsense_path))
     self.assertEquals(
         hadoopy.abspath(nonsense_path).rsplit('/')[-1], nonsense_path)
     self.assertRaises(IOError, hadoopy.ls, nonsense_path)
     self.assertRaises(IOError, hadoopy.readtb(nonsense_path).next)
Example #9
def main():
    if hadoopy.exists(hdfs_output):
        hadoopy.rmr("-skipTrash %s" % hdfs_output)
    hadoopy.launch(hdfs_path,
                   hdfs_output,
                   'WordCount.py',
                   files=['../stop_words.txt'])
Example #10
def hdfs_temp(hdfs_temp_dir=None):
    if hdfs_temp_dir is None:
        hdfs_temp_dir = HDFS_TEMP_DIR
    temp_path = hadoopy.abspath('%s/%f-%f' % (hdfs_temp_dir, time.time(), random.random()))
    yield temp_path
    if hadoopy.exists(temp_path):
        hadoopy.rmr(temp_path)
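The yield-then-cleanup shape is the body of a context manager (the decorator is not visible in this snippet); a hedged usage sketch, assuming it is wrapped with contextlib.contextmanager:

import contextlib

import hadoopy

hdfs_temp_cm = contextlib.contextmanager(hdfs_temp)  # assumed decoration, not shown above

with hdfs_temp_cm() as temp_path:
    hadoopy.writetb(temp_path + '/data.tb', [('key', 'value')])
# the temporary HDFS path is removed when the with-block exits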
Example #11
def insert_vector_into_hdfs(hdfs_path, iterator):
    # Delete the file if it already exists
    if hadoopy.exists(hdfs_path):
        hadoopy.rmr("-skipTrash %s"%hdfs_path)
    # Writing to HDFS   
    # user$ hadoop dfsadmin -safemode leave  (run this first to avoid the error: "Cannot create file /user/edge_list.tb. Name node is in safe mode.")
    hadoopy.writetb(hdfs_path, iterator)
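writetb only needs an iterator of (key, value) pairs, so the caller typically passes a small generator; a hedged usage sketch with a made-up vector (the HDFS path is hypothetical):

import numpy as np


def vector_iterator(vect):
    # yield (index, value) pairs so the vector can be read back with readtb
    for i, v in enumerate(vect):
        yield str(i), float(v)

r0 = np.ones(10) / 10
insert_vector_into_hdfs('hdfs://localhost:9000/user/user/vector.tb', vector_iterator(r0))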
Example #12
 def _inner(in_path, out_path, script_path, *args, **kw):
     out_path = canonicalize_path(out_path)
     _new_output(out_path)
     if isinstance(in_path, str):
         in_path = canonicalize_path(in_path)
     else:
         in_path = [canonicalize_path(x) for x in in_path]
     gevent.sleep()
     if isinstance(in_path, str):
         _wait_on_input(in_path)
     else:
         for x in in_path:
             _wait_on_input(x)
     print('Flow: All inputs available [%s]' % str(in_path))
     update_graph(in_path, out_path, script_path)
     if USE_EXISTING and hadoopy.exists(out_path):
         print(("Flow: Resusing output [%s].  1.) You can't use the return value"
                " of this command (it is set to None) and 2.) The existing output is assumed to be correct.") % out_path)
         p = None
     else:
         p = launch(in_path, out_path, script_path, wait=False, *args, **kw)
         while p['process'].poll() is None:
             gevent.sleep(.1)
         print('Flow: Process completed')
         if p['process'].returncode:
             for x in range(10):
                 print('Flow: Task failed....[%d/10]' % x)
             raise subprocess.CalledProcessError(p['process'].returncode, p['hadoop_cmds'][0])
     _set_output(out_path)
     return p
Example #13
def write_tb(path, fold=None):
    fddb_path = '/home/morariu/downloads/fddb'
    if fold == None:
        folds_glob = fddb_path + '/FDDB-folds/FDDB-fold-??.txt'
    else:
        folds_glob = fddb_path + '/FDDB-folds/FDDB-fold-%02i.txt' % fold

    if hadoopy.exists(path):
        # do nothing if the file already exists
        pass
    else:
        # otherwise, find all images in the fddb folds and put them  on hdfs
        names = []
        for fn in glob.glob(folds_glob):
            with open(fn, 'r') as fp:
                names.extend(['%s/%s.jpg' % (fddb_path, l) 
                              for l in fp.read().strip().split('\n')])
        # print message about filenames that do not exist
        for n in names:
            if not os.path.exists(n):
                print('"%s" does not exist!' % n)
        # remove those filenames from the list
        names = filter(os.path.exists, names)
        # write the images to tb files
        hadoopy.writetb(path, [(n, open(n, 'rb').read()) for n in names])
Example #14
def insert_data_into_hdfs():
    # Delete the file if it already exists
    if hadoopy.exists(tb_path):
        hadoopy.rmr("-skipTrash %s"%tb_path)
    # Writing to HDFS   
    # user$ hadoop dfsadmin -safemode leave  (run this first to avoid the error: "Cannot create file /user/edge_list.tb. Name node is in safe mode.")
    hadoopy.writetb(tb_path, get_kv_from_file(data_file_path))
Example #15
def freeze_script(script_path, cache=True, temp_path='_hadoopy_temp'):
    """Freezes a script, puts it on hdfs, and gives you the path

    'frozen_tar_path' can be given to launch_frozen and it will use that
    instead of making its own; this is useful for repeated calls.  If a
    file with the same md5 already exists in the temp_path, it is used
    instead of putting a new copy there to avoid the file transfer.  The
    files are put into a temporary file based on the timestamp first, then
    moved to a location that is only a function of their md5 to prevent partial
    files.

    Args:
        script_path: Path to a hadoopy script
        cache: If True (default) then use previously frozen scripts.  Cache is stored in memory (not persistent).
        temp_path: HDFS temporary path (default is '_hadoopy_temp')

    Returns:
        {'cmds': commands_ran, 'frozen_tar_path': frozen_tar_path}

    Raises:
        ValueError: Script cannot be found
    """
    script_abspath = os.path.abspath(script_path)
    if not os.path.exists(script_abspath):
        raise ValueError('Script [%s] does not exist.' % script_abspath)
    try:
        if not cache:
            raise KeyError  # NOTE(brandyn): Don't use cache item
        cmds, frozen_tar_path = FREEZE_CACHE[script_abspath]
    except KeyError:
        tmp_frozen_tar_path = temp_path + '/%f.tar' % time.time()
        freeze_fp = tempfile.NamedTemporaryFile(suffix='.tar')
        cmds = hadoopy._freeze.freeze_to_tar(os.path.abspath(script_path), freeze_fp.name)
        md5 = _md5_file(freeze_fp.name)
        frozen_tar_path = temp_path + '/%s.tar' % md5
        if not hadoopy.exists(frozen_tar_path):
            if not hadoopy.exists(temp_path):  # CDH4 Fix
                hadoopy.mkdir(temp_path)
            hadoopy.put(freeze_fp.name, tmp_frozen_tar_path)
            try:
                hadoopy.mv(tmp_frozen_tar_path, frozen_tar_path)
            except IOError:
                if not hadoopy.exists(frozen_tar_path):  # Check again
                    raise
    FREEZE_CACHE[script_abspath] = cmds, frozen_tar_path
    return {'cmds': cmds, 'frozen_tar_path': frozen_tar_path}
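Since FREEZE_CACHE only lives in the current process, cache=False is the way to force a fresh freeze; a short hedged sketch:

first = freeze_script('wc.py')               # freezes and uploads the tar
cached = freeze_script('wc.py')              # served from the in-memory cache
fresh = freeze_script('wc.py', cache=False)  # ignores the cache and re-freezes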
Example #16
def starter(args, launch=True):
    """ The function that calls hadoopy.launch_frozen """
    gopts.args = args
    
    mat = args.get('mat',None)
    if mat is None:
        raise NameError("'mat' option not specified on the command line")
        
    input = mat
    matname,matext = os.path.splitext(mat)
    
    gopts.getintkey('blocksize',3)
    schedule = gopts.getstrkey('reduce_schedule','1')

    # clear the output
    output = args.get('output','%s-normal%s'%(matname,matext))
    if hadoopy.exists(output):
        print "Removing %s"%(output)
        hadoopy.rm(output)
    
    outputnamefunc = lambda x: output+"_iter%i"%(x)
    steps = schedule.split(',')
        
    for i,step in enumerate(steps):
        if i>0:
            input = curoutput
            
        if i+1==len(steps):
            curoutput = output
        else:
            curoutput = output+"_iter%i"%(i+1)
            if hadoopy.exists(curoutput):
                hadoopy.rm(curoutput)
            
        gopts.setkey('iter',i)
            
        if launch:
            if i>0:
                mapper="org.apache.hadoop.mapred.lib.IdentityMapper"
                hadoopy.launch_frozen(input, curoutput, __file__, 
                    mapper=mapper,
                    cmdenvs=gopts.cmdenv(), num_reducers=int(step))
            else:
                hadoopy.launch_frozen(input, curoutput, __file__, 
                    cmdenvs=gopts.cmdenv(), num_reducers=int(step))
Example #17
 def _run_face(self, fn, **kw):
     in_path = self.data_path + fn
     out_path = "%sout-%s-%f" % (self.data_path, fn, time.time())
     if not hadoopy.exists(in_path):
         hadoopy.put(fn, in_path)
     hadoopy.launch_frozen(in_path, out_path, "face_finder.py", files=["haarcascade_frontalface_default.xml"], **kw)
     for num, (image_name, (image_data, faces)) in enumerate(hadoopy.readtb(out_path)):
         with open(self.out_path + "img%.8d.jpg" % num, "w") as fp:
             fp.write(image_data)
Example #18
 def _run_face(self, fn, out_path, **kw):
     in_path = self.data_path + fn
     hdfs_out_path = '%sout-%s-%f' % (self.data_path, fn, time.time())
     if not hadoopy.exists(in_path):
         hadoopy.put(fn, in_path)
     hadoopy.launch_frozen(in_path, hdfs_out_path, 'face_finder.py', files=['haarcascade_frontalface_default.xml'], **kw)
     for num, ((image_name, box), image_data) in enumerate(hadoopy.readtb(hdfs_out_path)):
         with open(out_path + 'img%.8d.png' % num, 'w') as fp:
             fp.write(image_data)
Example #19
def _wait_on_input(in_path):
    import hadoopy
    if not hadoopy.exists(in_path) and in_path not in HADOOPY_OUTPUTS:
        #print('Flow: Path [%s] does not exist yet, we will wait for it but you must create it eventually.' % in_path)
        print('Flow: Path [%s] does not exist yet, you will probably get an error from hadoop.' % in_path)
    if in_path in HADOOPY_OUTPUTS:  # not hadoopy.exists(in_path)
        print('Flow: Waiting for [%s]' % in_path)
        HADOOPY_OUTPUTS.setdefault(in_path, gevent.event.Event()).wait()
        print('Flow: Obtained [%s]' % in_path)
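The waiter blocks on a gevent Event keyed by path, which implies the producing job signals the same Event once its output is written; a hypothetical counterpart (the real _set_output is not shown in this snippet):

import gevent.event


def _set_output(out_path):
    # wake up any _wait_on_input(out_path) callers blocked on this path
    HADOOPY_OUTPUTS.setdefault(out_path, gevent.event.Event()).set()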
Example #20
 def test_name(self):
     if not hadoopy.exists('picarus/logos'):
         put_logos_on_hadoop()
     lp = LogoProcessor()
     hdfs_path = 'picarus/logos'
     #lp.compute_db_hadoop(hdfs_path)
     with open('index.pb') as fp:
         lp.load(fp.read())
     print lp.index._hashes.shape
     compare_to_local(lp)
Example #21
 def _run_wc(self, orig_fn, script_name="wc.py", launcher=hadoopy.launch_frozen, **kw):
     fn = "out-%f-%s" % (time.time(), orig_fn)
     in_path = self.data_path + fn
     out_path = self.data_path + fn + ".out"
     print(os.path.abspath("."))
     if not hadoopy.exists(in_path):
         hadoopy.put(orig_fn, in_path)
     # We also do a few hdfs checks here
     self.assertEquals(len(hadoopy.ls(in_path)), 1)
     self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])
     self.assertTrue(hadoopy.exists(in_path))
     self.assertFalse(hadoopy.exists(out_path))
     self.assertFalse(hadoopy.isdir(in_path))
     self.assertFalse(hadoopy.isempty(in_path))
     # Don't let the file split, CDH3 has a bug and will try to split gz's
     if not isinstance(launcher, str):
         launcher(
             in_path,
             out_path,
             script_name,
             jobconfs=["mapred.min.split.size=100000000", "mapreduce.task.userlog.limit.kb=1000"],
             **kw
         )
     if launcher == hadoopy.launch_frozen:
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     elif launcher == hadoopy.launch_local:
         self.assertFalse(hadoopy.isdir(out_path))
         self.assertFalse(hadoopy.isempty(out_path))
     elif launcher == "launch_frozen_cmd":
         cmd = (
             'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"'
             % (script_name, in_path, out_path)
         )
         print(cmd)
         subprocess.call(cmd.split())
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     else:
         raise ValueError("Launcher not recognized")
     wc = dict(hadoopy.readtb(out_path))
     self.assertEqual(wc["the"], 1664)
     self.assertEqual(wc["Alice"], 221)
Example #22
 def _run_wc(self, orig_fn, script_name='wc.py', launcher=hadoopy.launch_frozen, **kw):
     fn = 'out-%f-%s' % (time.time(), orig_fn)
     in_path = self.data_path + fn
     out_path = self.data_path + fn + '.out'
     print(os.path.abspath('.'))
     if not hadoopy.exists(in_path):
         hadoopy.put(orig_fn, in_path)
     # We also do a few hdfs checks here
     self.assertEquals(len(hadoopy.ls(in_path)), 1)
     #self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])  # This is no longer true in CDH4
     self.assertTrue(hadoopy.exists(in_path))
     self.assertFalse(hadoopy.exists(out_path))
     self.assertFalse(hadoopy.isdir(in_path))
     self.assertFalse(hadoopy.isempty(in_path))
     # Don't let the file split, CDH3 has a bug and will try to split gz's
     if not isinstance(launcher, str):
         launcher(in_path, out_path + '_list_jobconfs', script_name, jobconfs=['mapred.min.split.size=100000000',
                                                                               'mapreduce.task.userlog.limit.kb=1000'], **kw)
         launcher(in_path, out_path, script_name, jobconfs={'mapred.min.split.size': '100000000',
                                                                 'mapreduce.task.userlog.limit.kb': '1000'}, **kw)
     if launcher == hadoopy.launch_frozen:
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     elif launcher == hadoopy.launch_local:
         self.assertFalse(hadoopy.isdir(out_path))
         self.assertFalse(hadoopy.isempty(out_path))
     elif launcher == 'launch_frozen_cmd':
         cmd = 'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"' % (script_name,
                                                                                                                                             in_path,
                                                                                                                                             out_path)
         print(cmd)
         subprocess.call(cmd.split())
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     else:
         raise ValueError('Launcher not recognized')
     wc = dict(hadoopy.readtb(out_path))
     self.assertEqual(wc['the'], 1664)
     self.assertEqual(wc['Alice'], 221)
Example #23
 def _inner(out_path, *args, **kw):
     out_path = canonicalize_path(out_path)
     _new_output(out_path)
     print('Flow: Writer called on [%s]' % out_path)
     gevent.sleep()
     if USE_EXISTING and hadoopy.exists(out_path):
         print(("Flow: Resusing output [%s].  1.) You can't use the return value"
                " of this command (it is set to None) and 2.) The existing output is assumed to be correct.") % out_path)
         out = None
     else:
         out = hdfs(out_path, *args, **kw)
     _set_output(out_path)
     EDGES.append('%s->%s' % (get_local_node(), get_path_node(out_path)))
     return out
Example #24
def aggregateData(num_line, terminus1, terminus2, threshold, *dirs):
    print 'aller:{}-{} \nretour:{}-{}'.format(terminus1, terminus2, terminus2,
                                              terminus1)
    ter_coor1, ter_coor2 = data_extraction.getTerminusCoor(
        num_line, terminus1, terminus2)
    source_gps_dir = 'hdfs://BigDataPOC:8020/datalab/exp_b02/data/gps_data'
    data_aller = []
    data_retour = []
    for dir_name in dirs:
        print dir_name
        path = os.path.join(source_gps_dir, str(num_line), str(dir_name))
        if hadoopy.isdir(path):
            cmd = 'hdfs dfs -du %s' % (path)
            p = subprocess.Popen(cmd.split(),
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)
            frames_aller = []
            frames_retour = []
            for file_name in p.stdout.readlines():
                line_daily = extractInfo(file_name.split()[1])
                line_daily_aller, line_daily_retour = data_extraction.generateDailyData(
                    line_daily, (ter_coor1, ter_coor2), threshold, terminus1,
                    terminus2)
                frames_aller.append(line_daily_aller)
                frames_retour.append(line_daily_retour)
            data_aller.append(
                pd.concat(frames_aller).reset_index().drop('index', axis=1))
            data_retour.append(
                pd.concat(frames_retour).reset_index().drop('index', axis=1))
        else:
            if hadoopy.exists(dir_name):
                line_daily = extractInfo(dir_name)
                line_daily_aller, line_daily_retour = data_extraction.generateDailyData(
                    line_daily, (ter_coor1, ter_coor2), threshold, terminus1,
                    terminus2)
                data_aller.append(line_daily_aller)
                data_retour.append(line_daily_retour)
            else:
                print "there are paths in args which are not directories"
                sys.exit(1)
    data_aller = pd.concat(data_aller).reset_index().drop('index', axis=1)
    data_retour = pd.concat(data_retour).reset_index().drop('index', axis=1)
    cols = [
        'DATE', 'TIME', 'LINE', 'BUS_NUM', 'X_COORDINATE', 'Y_COORDINATE',
        'LONGITUDE', 'LATITUDE', 'SPEED'
    ]
    data_aller = data_aller[cols]
    data_retour = data_retour[cols]
    return data_aller, data_retour
Example #25
 def _run_face(self, fn, out_path, **kw):
     bfn = os.path.basename(fn)
     in_path = self.data_path + bfn
     hdfs_out_path = '%sout-%s-%f' % (self.data_path, bfn, time.time())
     if not hadoopy.exists(in_path):
         hadoopy.put(fn, in_path)
     hadoopy.launch_frozen(in_path,
                           hdfs_out_path,
                           'face_finder.py',
                           files=['haarcascade_frontalface_default.xml'],
                           **kw)
     for num, ((image_name, box),
               image_data) in enumerate(hadoopy.readtb(hdfs_out_path)):
         with open(out_path + 'img%.8d.png' % num, 'w') as fp:
             fp.write(image_data)
Example #26
 def test_readtb_writetb(self):
     working_path = "%s/readtb_writetb/" % (self.data_path)
     self.assertFalse(hadoopy.exists(working_path))
     self.assertFalse(hadoopy.isdir(working_path))
     self.assertFalse(hadoopy.isempty(working_path))
     for x in range(10):
         fn = "%s/%.5d" % (working_path, x)
         print(fn)
         data = [("1", 1), (1.3, np.array([1, 2, 3])), (True, {"1": 3})]
         hadoopy.writetb(fn, data)
     self.assertFalse(hadoopy.isdir(fn))
     self.assertFalse(hadoopy.isempty(fn))
     self.assertTrue(hadoopy.isdir(working_path))
     self.assertTrue(hadoopy.isempty(working_path))  # isempty returns true on directories
     self.assertEqual(self._readtb(readtb, working_path), self._readtb(hadoopy.readtb, working_path))
Example #27
 def test_readtb_writetb(self):
     working_path = '%s/readtb_writetb/' % (self.data_path)
     self.assertFalse(hadoopy.exists(working_path))
     self.assertFalse(hadoopy.isdir(working_path))
     self.assertFalse(hadoopy.isempty(working_path))
     for x in range(10):
         fn = '%s/%.5d' % (working_path, x)
         print(fn)
         data = [('1', 1), (1.3, np.array([1, 2, 3])), (True, {'1': 3})]
         hadoopy.writetb(fn, data)
     self.assertFalse(hadoopy.isdir(fn))
     self.assertFalse(hadoopy.isempty(fn))
     self.assertTrue(hadoopy.isdir(working_path))
     self.assertTrue(hadoopy.isempty(
         working_path))  # isempty returns true on directories
     self.assertEqual(self._readtb(readtb, working_path),
                      self._readtb(hadoopy.readtb, working_path))
Example #28
def extractUsefulData(num_line,start_date,end_date):
    year = str(start_date)[:4]
    month = str(start_date)[4:6]
    start_day = str(start_date)[-2:]
    end_day = str(end_date)[-2:]
    home_dir_source = 'hdfs://BigDataPOC:8020/datalab/exp_vsb/inputData'
    home_dir_des = 'hdfs://BigDataPOC:8020/datalab/exp_b02/data/gps_data'
    for i in np.arange(int(start_day),int(end_day)+1):
        if i<10:
            date = '0'+ str(i)
        else:
            date = str(i)
        file_source = 'loc_bus_'+ str(start_date)[:6] +date+'_'+str(num_line)+'.csv' 
        source = os.path.join(home_dir_source,file_source)
        home_dir_des_line = os.path.join(home_dir_des,str(num_line))
        home_dir_des_month = os.path.join(home_dir_des_line,str(start_date)[:6])
        if not os.path.exists(home_dir_des_line):
            try:
                os.mkdir(os.path.dirname(home_dir_des_line))
            except OSError:
                pass
            if not os.path.exists(home_dir_des_month):
                try:
                    os.mkdir(os.path.dirname(home_dir_des_month))
                except OSError:
                    pass
        if not os.path.exists(home_dir_des_month):
                try:
                    os.mkdir(os.path.dirname(home_dir_des_month))
                except OSError:
                    pass
        file_des = 'bus_gps_'+ str(start_date)[:6] +date+'_'+str(num_line)+'.csv' 
        destination = os.path.join(home_dir_des_month,file_des)
        if hadoopy.exists(destination):
            hadoopy.rmr(destination)
        getGpsData(source,destination)
        print 'it is finished:'+file_des
Example #29
def flickr_images(tags,
                  images_per_tag,
                  hdfs_output,
                  num_files=20,
                  max_iters=1,
                  max_pages=1,
                  output_meta=False,
                  api_key=None,
                  api_secret=None,
                  remove_output=False):
    tags = list(tags)
    if api_key is None or api_secret is None:
        api_key = os.environ['FLICKR_API_KEY']
        api_secret = os.environ['FLICKR_API_SECRET']
    tags_per_chunk = max(len(tags) / num_files, 1)
    if remove_output and hadoopy.exists(hdfs_output):
        print('Removing output dir[%s]' % hdfs_output)
        hadoopy.rmr(hdfs_output)
    cmdenvs = {
        'FLICKR_API_KEY': api_key,
        'FLICKR_API_SECRET': api_secret,
        'MAX_ITERS': str(max_iters),
        'MAX_PAGES': str(max_pages)
    }
    for chunk_num, chunk_tags in enumerate(_chunks(tags, tags_per_chunk)):
        hadoopy.writetb(hdfs_output + '/tags/%d' % chunk_num,
                        [(images_per_tag, tag) for tag in chunk_tags])
    hadoopy.launch_frozen(hdfs_output + '/tags',
                          hdfs_output + '/metadata',
                          _lf('flickr_bulk.py'),
                          cmdenvs=cmdenvs,
                          num_reducers=num_files)
    output_type = 'meta' if output_meta else 'image'
    hadoopy.launch_frozen(hdfs_output + '/metadata',
                          hdfs_output + '/image_metadata',
                          _lf('file_downloader.py'),
                          cmdenvs={'OUTPUT_TYPE': output_type})
Example #30
def doSample(jarfile, inputs, output, k):
    for item in inputs:
        if item[-1] == "/":
            name = (item[:-1]).split('/')[-1]
        else:
            name = item.split('/')[-1]
        print "item", item 
        #tmp_dir = tmp_path + name + "/"
        if hadoopy.exists(item):
            continue
        hadoopy.mkdir(item)
        #tmp_inputs.append(tmp_dir)
        real_input = data_dir + name + "/"
        for f in hadoopy.ls(real_input):
            if not hadoopy.isdir(f):
                #ff = tmp_dir + f.split('/')[-1]
                if k > 0:
                    poolSample(f, item, k)
                else:
                    commonSample(f, item, ratio)
    '''if not hadoopy.exists(output):
        hadoopy.mkdir(output)
    if hadoopy.isdir(output):
        output = output[:-1]
    if output[-1] == '/':
        output = output[:-1]
    name = output.split('/')[-1]
    tmp_output = tmp_path + name + "/"'''
    #if not hpath.exists(tmp_output):
    #    hdfs.mkdir(tmp_output)
    codegen.executeJar(jarfile, inputs, output)
    #jobid = job.getJobIDFromLog(tmp_log_dir)
    job_para = job.getJobPara()
    '''for item in tmp_inputs:
        os.system("hadoop fs -rmr " + item)
    os.system("hadoop fs -rmr " + tmp_output)'''
    return job_para
Example #31
            cur_args.extend(['-mapper', "'./tsqr map %i'"%(blocksize)])    
        
        if i+1==len(steps):
            curoutput = output
        else:
            curoutput = output+"_iter%i"%(i+1)
            
        
        cur_args.extend(['-jobconf',"'mapreduce.job.name="+jobname+
            " (%i/%i)'"%(i+1,len(steps))])
        cur_args.extend(['-input',"'"+input+"'"])
        cur_args.extend(['-output',"'"+curoutput+"'"])
        cur_args.extend(['-numReduceTasks', "'%i'"%(int(step))])
    
        cmd = ['hadoop','jar',streaming_jar]
        cmd.extend(cur_args)
    
        print "Running Hadoop Command:"
        print
        print ' '.join(cmd) 
        print
        print "End Hadoop Command"
        
        if hadoopy.exists(curoutput):
            print "Removing %s"%(curoutput)
            hadoopy.rm(curoutput)

        subprocess.check_call(' '.join(cmd),shell=True)


Example #32
def main():
    if hadoopy.exists(hdfs_output):
        hadoopy.rmr("-skipTrash %s" % hdfs_output)
    hadoopy.launch(hdfs_path, hdfs_output, "WordCount.py", files=["../stop_words.txt"])
Example #33
# -*- coding: utf-8 -*-
"""
Created on Mon Nov  9 16:35:12 2015

@author: user
"""

import hadoopy

input_path = 'wiki_index.tb'
output_path = "/result"

if hadoopy.exists(output_path):
    hadoopy.rmr("-skipTrash %s"%output_path)

hadoopy.launch(input_path, output_path, 'map_red_01.py')
word_urls = dict(hadoopy.readtb(output_path))

for word in word_urls:
    print "%s: %s, %s" % (word, word_urls[word][0], word_urls[word][1])
Example #34
#!/usr/bin/env python

import hadoopy

input_path = "/alice.txt"
output_path = "/result"

if hadoopy.exists(output_path):
    hadoopy.rmr("-skipTrash %s" % output_path)

hadoopy.launch(input_path, output_path, 'WordCount.py')

word_counts = dict(hadoopy.readtb(output_path))

for word in word_counts:
    print "%s: %d" % (word, word_counts[word])
Example #35
hiveStatementForPythonCreate += " or ".join(tempStatement)
hiveStatementForPythonCreate += ");"

print "hiveStatementForPythonCreate:"+hiveStatementForPythonCreate;
hivestrcommandForPython = ["hive","-e",hiveStatementForPythonCreate]
current2 = datetime.datetime.now()
call(hivestrcommandForPython)
current3 = datetime.datetime.now()
print "hive2 second="+str((current3 - current2).seconds)

#impalaStatementForCreate = "use tax;refresh tax.tax_access_log_python;insert overwrite TABLE tax_access_log_partition PARTITION (date_hour) SELECT client_ip,client,userid,request,method,uri,protocal,path,params,query,fileType,fileName,status,bytes_sent, date_time,referer,useragent,host,concat(strleft(from_unixtime(unix_timestamp(date_time)),14),'00:00')as date_hour from  tax.tax_access_log_python;";
#####3.delete old data
for deltime in deleteTime :
    hdfsFilePath = '"/user/hive/warehouse/tax.db/tax_access_log_partition/date_hour='+deltime+'"'
    if hadoopy.exists(hdfsFilePath) == 1:
        print "remove file path:"+hdfsFilePath
        hadoopy.rmr('"/user/hive/warehouse/tax.db/tax_access_log_partition/date_hour='+deltime+'"')

#####4.insert Impala
impalaStatementForCreate = "use tax;refresh tax.tax_access_log_python;"
impalaStatementForCreate += " insert into TABLE tax_access_log_partition PARTITION (date_hour) "
impalaStatementForCreate += " SELECT client_ip,client,userid,request,method,uri,protocal,path,params,query,fileType,fileName,status,bytes_sent, date_time,referer,useragent,host,concat(strleft(from_unixtime(unix_timestamp(date_time)),14),'00:00')as date_hour "
impalaStatementForCreate += " from  tax.tax_access_log_python"
impalaStatementForCreate += " where "

tempStatement =[]
for insert_time in insertTime :
    tempStatement += ["date_time like '"+insert_time+"'"]

impalaStatementForCreate += " or ".join(tempStatement)
Example #36
 def _load_input_data(self):
     # copy fddb data to hdfs if it is not already there
     if not hadoopy.exists(self.data_fn):
         print('Creating input data \'%s\'...' % self.data_fn)
         import fddb_data
         fddb_data.write_tb(self.data_fn, 1)
Example #37
def _load_input_data(data_fn):
    """copy fddb data to hdfs if it is not already there"""
    if not hadoopy.exists(data_fn):
        print('Creating input data \'%s\'...' % data_fn)
        import fddb_data
        fddb_data.write_tb(data_fn)
Example #38
try:
	arg2 = int(sys.argv[2])
except Exception:
	arg2 = 1000


try:
	arg3 = sys.argv[3]
except Exception:
	arg3 = "/logs"

hdfs_path = arg3


if not hadoopy.exists(hdfs_path):
	print "does not exist, hence creating directory in hdfs"
	hadoopy.mkdir(hdfs_path)
else:
	print "writing to hdfs"

if not os.path.exists("./logs"):
    os.makedirs("./logs")

ts = time.time()
ts = datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d%H%M%S')
fw = open("./logs"+"/"+"data"+ts,"w")

dataList = []

for i in xrange(arg2):
Example #39
    lambda (x, y): (x[5:].decode('utf-8'), y[5:].decode('utf-8')))

splitText = lines.map(lambda (url, text): (url, [
    stem(word.group().lower()) for word in re.finditer(
        r"\w+", text, re.UNICODE) if word.group().lower() not in words_stop
]))

tf = splitText.map(lambda (url, splittedText): (url, {
    word: 1.0 * splittedText.count(word) / len(splittedText)
    for word in splittedText
}))

tfWordAsKey = tf.flatMap(lambda (url, tf): [(word, [(url, tf[
    word])]) for word in tf]).reduceByKey(lambda a, b: a + b)

tfidf = tfWordAsKey.map(lambda (word, tfList): (word, [(url, tf * np.log10(
    27474.0 / len(tfList))) for (url, tf) in tfList]))

NwordsMax = 200000


def read_rdd(rdd):
    for key, data in rdd.takeSample(True, NwordsMax):
        yield key, data


if hadoopy.exists(output_hdfs_path):
    hadoopy.rmr("-skipTrash %s" % output_hdfs_path)

hadoopy.writetb(output_hdfs_path, read_rdd(tfidf))
Example #40
output_path = "hdfs://localhost:9000/user/user/vector"
temp_path = "hdfs://localhost:9000/user/user/temp"


def read_vector(vect):
    for i, v in enumerate(vect):
        yield str(i).encode('utf-8'), v


N = 64375

diff = 1.

r0 = np.ones(N).astype(np.float) / N

if hadoopy.exists(input_path):
    hadoopy.rmr("-skipTrash %s" % input_path)
os.system('hdfs dfs -cp ' + edge_path + ' ' + input_path)

if hadoopy.exists(output_path):
    hadoopy.rmr("-skipTrash %s" % output_path)
hadoopy.writetb(output_path, read_vector(r0))

if hadoopy.exists(temp_path):
    hadoopy.rmr("-skipTrash %s" % temp_path)

iteration = 0
while diff > 0.01:
    if hadoopy.exists(temp_path):
        hadoopy.rmr("-skipTrash %s" % temp_path)
    hadoopy.launch(input_path, temp_path, 'PageRank.py', files=[])
Example #41
def _load_input_data(data_fn):
    """copy fddb data to hdfs if it is not already there"""
    if not hadoopy.exists(data_fn):
        print('Creating input data \'%s\'...' % data_fn)
        import fddb_data
        fddb_data.write_tb(data_fn)
Example #42
import hadoopy

tb_path="hdfs://localhost:9000/user/user/edge_list.tb"

N = 64375

if hadoopy.exists(tb_path):
    hadoopy.rmr("-skipTrash %s"%tb_path)

def read_edge_wiki(file_object):
    while True:
        line = file_object.readline().split()
        if not line:
            break
        yield (line[0].decode('utf-8'),1.0/N),[l.decode('utf-8') for l in line[1:]]
        #yield line[0].decode('utf-8'),line[1].decode('utf-8')

def main():
    with open('edge_list.txt') as f:
        hadoopy.writetb(tb_path,read_edge_wiki(f))

if __name__ == '__main__':
    main()

Example #43
import hadoopy
import os
import sys
import happybase
import numpy as np

hdfs_path = 'simplewikiFromHbase'
local_path = 'simlewikiFromHbaseLocal'
if hadoopy.exists(hdfs_path):
    hadoopy.rmr("-skipTrash %s" % hdfs_path)

connection = happybase.Connection('localhost', '9090')

if 'simplewiki' not in connection.tables():
    sys.exit("Error : no simplewiki table found")
else:
    print "OK : simplewiki table found"
    table_wiki = connection.table('simplewiki')

NdocsMax = 30000


def read_hbase(table_hbase):
    for key, data in table_hbase.scan(limit=NdocsMax):
        yield key.decode('utf-8'), data['wiki:text'].decode('utf-8')


#def read_local_dir(local_path):
#    for fn in os.listdir(local_path):
#        path = os.path.join(local_path, fn)
#        if os.path.isfile(path):
Example #44
from google_ngram_downloader import readline_google_store
import wget
import hadoopy
import os

for i in range(3, 6):

	gene = readline_google_store(ngram_len=i, lang='eng')

	while True:
		try:
			fname, url, records = next(gene)
			print fname
			if hadoopy.exists('/google-ngram/'+str(i)+'/'+fname):
				continue
			else:
				wget.download(url)
				hadoopy.put(fname, '/google-ngram/'+str(i)+'/'+fname)
				os.remove(fname)
		
		except StopIteration:
			print "END"
			break
Example #45
sys.setdefaultencoding('utf8')

hbase_table = 'wiki'
hdfs_path = 'wiki.tb'

host= 'localhost'
connection = happybase.Connection(host)
wiki_table = connection.table(hbase_table)


def get_url_content_for_hdfs():
    for url, content in wiki_table.scan():
        v = content['cf:content'].encode('utf-8')
        yield url, v

if hadoopy.exists(hdfs_path):
    hadoopy.rmr("-skipTrash %s" %(hdfs_path)) # Suppression of the file (cleaning)
    
hadoopy.writetb(hdfs_path,get_url_content_for_hdfs()) # Writing the wiki table inot HDFS

# Test OK (ATIH 2/12/2015)
url_content_dict = dict(hadoopy.readtb(hdfs_path))
for k, v in url_content_dict.iteritems():
    print 'k = ', k
    print 'v = ', v
    break

for k, v in hadoopy.readtb(hdfs_path):
    print 'k = ', k.encode('utf-8')
    print 'v = ', v.encode('utf-8')
    break
Example #46
 def test_err(self):
     nonsense_path = "sdfskjdfksjdkfjskdfksjdfksdkfjskdjfksjdk"
     self.assertFalse(hadoopy.exists(nonsense_path))
     self.assertEquals(hadoopy.abspath(nonsense_path).rsplit("/")[-1], nonsense_path)
     self.assertRaises(IOError, hadoopy.ls, nonsense_path)
     self.assertRaises(IOError, hadoopy.readtb(nonsense_path).next)
Example #47
    def validate(argv):
    
        keywords_selected = False
        keywords = []

        verbose = False
        hdfs_path = None
        local_path = None

        roll_size_selected = False
        roll_size = None
        
        consumer_key = None
        consumer_secret = None
        access_token_key = None
        access_token_secret = None

        credential_file_selected = False
        keywords_credential_file_selected = False
        credential_file_path = ''
        
        since_tweet_id = None
        https_proxy = None

        try:
            # second arg is (short) options, should be separated by :
            # third arg is long options, as an array
            opts, args = getopt.getopt(argv, "", 
                ["keywords=", "verbose", 
                 "write-to-hdfs=", "write-to-local=", "roll-size=", 
                 "consumer-key=", "consumer-secret=", 
                 "access-token-key=", "access-token-secret=", 
                 "credential-file=", "keywords-credential-file=",
                 "since-tweet-id=", "https-proxy="])

        except getopt.GetoptError:
            print Option.print_help()
            sys.exit(2)

        for opt, arg in opts:
            if opt == '--help':
                Option.print_help()

            elif opt == '--keywords':
                if keywords_credential_file_selected:
                    print "Error: You cannot choose --keywords and --keywords-credential-file at the same time"
                    sys.exit(2)
                else:
                    keywords_selected = True
                    keywords = arg.split(",")

            elif opt == '--verbose':
                verbose = True

            elif opt == '--write-to-hdfs':
                # validate and parse hdfs path
                hdfs_path = Option.parse_hdfs_path(arg)
                # if not hdfs_path or not hadoopy.exists(hdfs_path):
                if not hdfs_path:
                    print "Error: URL should be valid. Ex. hdfs://<host>:<port>/hdfs/dir"
                    sys.exit(2)
                elif not hadoopy.exists(hdfs_path):
                    print "Error: HDFS path does not exist"
                    sys.exit(2)
                elif not hdfs_path.endswith("/"):
                    hdfs_path = hdfs_path + "/"

            elif opt == '--write-to-local':
                # validate local path
                if not path.isdir(arg):
                    print "Error: Local path is not a directory or does not exist."
                    sys.exit(2)
                else:
                    local_path = arg if arg.endswith('/') else arg + '/'

            elif opt == '--roll-size':
                right_format, total_size, message = Option.parse_roll_size(arg, Util.MIN_ROLL_SIZE, Util.MAX_ROLL_SIZE) 
                if right_format:
                    roll_size_selected = True
                    roll_size = total_size
                else:
                    print message
                    sys.exit(2)

            elif opt == '--credential-file':
                if keywords_credential_file_selected:
                    print "Error: You cannot choose --credential-file and --keywords-credential-file at the same time"
                    sys.exit(2)
                else:
                    credential_file_selected = True
                    credential_file_path = arg
                
            elif opt == '--keywords-credential-file':
                if keywords_selected or credential_file_selected:
                    print "Error: You cannot choose --keywords-credential-file with --keywords and/or --keywords-credential-file"
                    sys.exit(2)
                else:
                    keywords_credential_file_selected = True
                    credential_file_path = arg
                    
            elif opt == '--consumer-key':
                consumer_key = arg
                
            elif opt == '--consumer-secret':
                consumer_secret = arg
                
            elif opt == '--access-token-key':
                access_token_key = arg
                
            elif opt == '--access-token-secret':
                access_token_secret = arg
                    
            elif opt == '--since-tweet-id':
                if len(str(arg)) < 18:
                    print "Warning: Invalid tweet id; ignoring set value."
                else:
                    since_tweet_id = arg 
                    
            elif opt == '--https-proxy':
                if not Option.parse_https_proxy(arg):
                    print "Warning: Possibly invalid HTTPS PROXY URL string; ignoring set value."
                else:    
                    https_proxy = arg


        if not keywords_selected and not keywords_credential_file_selected:
            print "Error: Keywords are required"
            sys.exit(2)

        if credential_file_selected:
            valid, error_message, consumer_key, consumer_secret, access_token_key, access_token_secret, temp_keywords = Option.validate_keywords_credential_file(credential_file_path, False)
            if not valid:
                print error_message
                sys.exit(2)
    
        if keywords_credential_file_selected:
            valid, error_message, consumer_key, consumer_secret, access_token_key, access_token_secret, keywords = Option.validate_keywords_credential_file(credential_file_path, True)
            if not valid:
                print error_message
                sys.exit(2)
                
        if not (consumer_key and consumer_secret and access_token_key and access_token_secret):
            print str(consumer_key) + ', ' + str(consumer_secret) + ', ' + str(access_token_key) + ', ' + str(access_token_secret)
            print "Error: Incomplete Twitter credentials."
            sys.exit(2)

        if not roll_size_selected:
            if hdfs_path or local_path:    
                print "Info: --roll-size not specified. Will default to roll size = 1048576 bytes (1 MB)." 
                roll_size_selected = True
                roll_size = Util.DEFAULT_ROLL_SIZE
        else:
            if not hdfs_path and not local_path:
                print "Warning: --roll-size flag ignored. No file to save to."
                roll_size = None
        
        print 'keywords: ' + ",".join(keywords)
        print 'verbose: ' + str(verbose)
        print 'hdfs_path: ' + str(hdfs_path)
        print 'local_path: ' +  str(local_path)
        print 'roll_size_selected: ' + str(roll_size_selected)
        print 'roll_size: ' + str(roll_size)
        print 'consumer_key: ' + str(consumer_key)
        print 'consumer_secret: ' + str(consumer_secret)
        print 'access_token_key: ' + str(access_token_key)
        print 'access_token_secret: ' + str(access_token_secret)
        print 'since_tweet_id: ' + str(since_tweet_id)
        print 'https_proxy: ' + str(https_proxy)

        return Option(keywords, verbose, hdfs_path, local_path, 
                      roll_size, consumer_key, consumer_secret, 
                      access_token_key, access_token_secret,
                      since_tweet_id, https_proxy)
Example #48
def calcul_delta(vectore_before, vector_after):
    before = {}
    after = {}
    s = 0
    for k, v in vectore_before:
        before[k] = v
    for k, v in vector_after:
        after[k] = v
    for k in before:
        s = np.abs(vectore_before[k] - vector_after[k])
    return s

##############################################################################

if hadoopy.exists(temp_vector_path):
    hadoopy.rmr("-skipTrash %s"%temp_vector_path)
copy(eigen_vector_tb_path, temp_vector_path)    

while diff>0.01:
    
   
    eigen_vector_before = load_eigen_vector(temp_vector_path)

    if hadoopy.exists(temp_vector_path):
        hadoopy.rmr("-skipTrash %s"%temp_vector_path)
    
    hadoopy.launch_local(data_tb_path, temp_vector_path, 'PageRank.py')
    
    eigen_vector_after = load_eigen_vector(temp_vector_path)
    
Example #49
#input_path="hdfs://localhost:9000/alice.txt"
input_hdfs_path="hdfs://localhost:9000/user/user/simplewikiFromHbase"
output_hdfs_path='hdfs://localhost:9000/user/user/indexwikiFromSpark'

words_stop = [line.rstrip('\n') for line in open('../stop_words.txt')]
words_stop.append('')

sc=SparkContext()

lines = sc.sequenceFile(input_hdfs_path).map(lambda (x,y):(x[5:].decode('utf-8'),y[5:].decode('utf-8')))

splitText = lines.map(lambda (url,text):(url,[stem(word.group().lower()) for word in re.finditer(r"\w+",text,re.UNICODE) if word.group().lower() not in words_stop]))

tf = splitText.map(lambda (url,splittedText):(url,{word:1.0*splittedText.count(word)/len(splittedText) for word in splittedText}))

tfWordAsKey = tf.flatMap(lambda (url,tf):[(word,[(url,tf[word])]) for word in tf]).reduceByKey(lambda a,b:a+b)

tfidf = tfWordAsKey.map(lambda (word,tfList):(word,[(url,tf*np.log10(27474.0/len(tfList))) for (url,tf) in tfList]))

NwordsMax = 200000
def read_rdd(rdd):
    for key,data in rdd.takeSample(True,NwordsMax):
        yield key,data

if hadoopy.exists(output_hdfs_path):
    hadoopy.rmr("-skipTrash %s"%output_hdfs_path)

hadoopy.writetb(output_hdfs_path,read_rdd(tfidf))


Example #50
import hadoopy

tb_path = "hdfs://localhost:9000/user/user/edge_list.tb"

N = 64375

if hadoopy.exists(tb_path):
    hadoopy.rmr("-skipTrash %s" % tb_path)


def read_edge_wiki(file_object):
    while True:
        line = file_object.readline().split()
        if not line:
            break
        yield (line[0].decode('utf-8'),
               1.0 / N), [l.decode('utf-8') for l in line[1:]]
        #yield line[0].decode('utf-8'),line[1].decode('utf-8')


def main():
    with open('edge_list.txt') as f:
        hadoopy.writetb(tb_path, read_edge_wiki(f))


if __name__ == '__main__':
    main()