Example #1
def insert_data_into_hdfs():
    # Delete the file if it already exists
    if hadoopy.exists(tb_path):
        hadoopy.rmr("-skipTrash %s" % tb_path)
    # Write to HDFS.
    # If this fails with "Cannot create file/user/edge_list.tb. Name node is in safe mode.",
    # first run: hadoop dfsadmin -safemode leave
    hadoopy.writetb(tb_path, get_kv_from_file(data_file_path))
def main():
    if hadoopy.exists(hdfs_output):
        hadoopy.rmr("-skipTrash %s" % hdfs_output)
    hadoopy.launch(hdfs_path,
                   hdfs_output,
                   'WordCount.py',
                   files=['../stop_words.txt'])
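Example #1 relies on a get_kv_from_file helper and on tb_path / data_file_path variables defined elsewhere in the original script. A minimal sketch of what that helper could look like (the tab-separated record format is an assumption):

def get_kv_from_file(path):
    # Yield (key, value) pairs for hadoopy.writetb; assumes one
    # "key<TAB>value" record per line of the input file.
    with open(path) as f:
        for line in f:
            key, _, value = line.rstrip('\n').partition('\t')
            yield key, value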
Example #3
def custom_initialization():
    host = 'localhost'
    connection = happybase.Connection(host)
    wiki_table = connection.table('wiki')
    hdfs_path = 'wiki_index.tb'
    if hadoopy.exists(hdfs_path):
        hadoopy.rmr("-skipTrash %s" % hdfs_path)  # Remove the old file (cleanup)
    hadoopy.writetb(hdfs_path, wiki_table.scan(limit=1000))  # Write the wiki table into HDFS
Example #4
def rmr(path):
    """ remove path from HDFS """
    try:
        hadoopy.rmr(path)
    except IOError:
        return False
    return True
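A usage sketch for this wrapper (the path is purely illustrative):

if not rmr('/tmp/old_results'):
    print 'nothing removed (path missing or removal failed)'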
Example #5
@contextlib.contextmanager  # needs: import contextlib, random, time
def hdfs_temp(hdfs_temp_dir=None):
    if hdfs_temp_dir is None:
        hdfs_temp_dir = HDFS_TEMP_DIR
    temp_path = hadoopy.abspath('%s/%f-%f' % (hdfs_temp_dir, time.time(), random.random()))
    yield temp_path
    if hadoopy.exists(temp_path):
        hadoopy.rmr(temp_path)
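With the contextlib decorator above, hdfs_temp acts as a context manager that cleans up its temporary path on exit; a usage sketch (the input path and job script name are illustrative):

with hdfs_temp() as temp_path:
    hadoopy.launch('/data/input', temp_path, 'job.py')
    results = dict(hadoopy.readtb(temp_path))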
Example #6
def insert_vector_into_hdfs(hdfs_path, iterator):
    # Delete the file if it already exists
    if hadoopy.exists(hdfs_path):
        hadoopy.rmr("-skipTrash %s" % hdfs_path)
    # Write to HDFS.
    # If this fails with "Cannot create file/user/edge_list.tb. Name node is in safe mode.",
    # first run: hadoop dfsadmin -safemode leave
    hadoopy.writetb(hdfs_path, iterator)
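A usage sketch (the destination path and the generated vector are illustrative):

insert_vector_into_hdfs('hdfs://localhost:9000/user/user/vector',
                        ((str(i), 1.0 / 100) for i in xrange(100)))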
Example #7
 def tearDown(self):
     if hadoopy.exists(self.data_path):
         self.assertTrue(hadoopy.isempty(self.data_path))  # directories are empty
         self.assertTrue(hadoopy.isdir(self.data_path))
         hadoopy.rmr(self.data_path)
     self.assertFalse(hadoopy.exists(self.data_path))
     self.assertFalse(hadoopy.isdir(self.data_path))
     self.assertFalse(hadoopy.isempty(self.data_path))
Example #9
def throughput_test(launcher):
    output_path = '_hadoopy_bench/%f' % time.time()
    v = 'blah'
    kv = (v, {'client_time': time.time(),
              'value_len': len(v),
              'count': 0})
    num_files = 3
    num_kvs = 10000000
    hadoopy.writetb(output_path + '/input/0', (kv for x in xrange(num_kvs)))
    for x in range(1, num_files):
        hadoopy.cp(output_path + '/input/0', output_path + '/input/%d' % x)
    hadoopy.freeze_script('time_job.py')  # Factor out Pyinstaller time
    st = time.time()
    launcher(output_path + '/input', output_path + '/output', 'time_job.py')
    print((num_kvs * num_files) / (time.time() - st))
    hadoopy.rmr(output_path)
Example #10
def latency_test(launcher):
    output_path = '_hadoopy_bench/%f' % time.time()
    v = 'blah'

    kv = (v, {'client_time': time.time(),
              'value_len': len(v),
              'count': 0})
    hadoopy.writetb(output_path + '/input', [kv])
    launcher(output_path + '/input', output_path + '/output', 'time_job.py')
    v = hadoopy.readtb(output_path + '/output').next()[1]
    v['server_time'] = time.time()
    t0 = v['worker_time'] - v['client_time']
    t1 = v['server_time'] - v['worker_time']
    t2 = v['server_time'] - v['client_time']
    print((t0, t1, t2))
    hadoopy.rmr(output_path)
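Examples #9 and #10 launch a time_job.py script that is not reproduced here; a minimal sketch of what it presumably does (stamping each value with the worker's clock, which is the worker_time field the latency test reads):

#!/usr/bin/env python
import time
import hadoopy


def mapper(key, value):
    # Record when the worker saw this key/value pair
    value['worker_time'] = time.time()
    yield key, value

if __name__ == '__main__':
    hadoopy.run(mapper)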
Example #11
def flickr_images(tags,
                  images_per_tag,
                  hdfs_output,
                  num_files=20,
                  max_iters=1,
                  max_pages=1,
                  output_meta=False,
                  api_key=None,
                  api_secret=None,
                  remove_output=False):
    tags = list(tags)
    if api_key is None or api_secret is None:
        api_key = os.environ['FLICKR_API_KEY']
        api_secret = os.environ['FLICKR_API_SECRET']
    tags_per_chunk = max(len(tags) / num_files, 1)
    if remove_output and hadoopy.exists(hdfs_output):
        print('Removing output dir[%s]' % hdfs_output)
        hadoopy.rmr(hdfs_output)
    cmdenvs = {
        'FLICKR_API_KEY': api_key,
        'FLICKR_API_SECRET': api_secret,
        'MAX_ITERS': str(max_iters),
        'MAX_PAGES': str(max_pages)
    }
    for chunk_num, chunk_tags in enumerate(_chunks(tags, tags_per_chunk)):
        hadoopy.writetb(hdfs_output + '/tags/%d' % chunk_num,
                        [(images_per_tag, tag) for tag in chunk_tags])
    hadoopy.launch_frozen(hdfs_output + '/tags',
                          hdfs_output + '/metadata',
                          _lf('flickr_bulk.py'),
                          cmdenvs=cmdenvs,
                          num_reducers=num_files)
    output_type = 'meta' if output_meta else 'image'
    hadoopy.launch_frozen(hdfs_output + '/metadata',
                          hdfs_output + '/image_metadata',
                          _lf('file_downloader.py'),
                          cmdenvs={'OUTPUT_TYPE': output_type})
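This example calls two private helpers, _chunks and _lf, that live elsewhere in its project; plausible sketches (the _lf behaviour, resolving a script path relative to the module, is an assumption):

import os


def _chunks(seq, n):
    # Split seq into consecutive chunks of at most n items.
    for i in range(0, len(seq), n):
        yield seq[i:i + n]


def _lf(fn):
    # Resolve fn relative to this module's directory.
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), fn)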
def extractUsefulData(num_line,start_date,end_date):
    year = str(start_date)[:4]
    month = str(start_date)[4:6]
    start_day = str(start_date)[-2:]
    end_day = str(end_date)[-2:]
    home_dir_source = 'hdfs://BigDataPOC:8020/datalab/exp_vsb/inputData'
    home_dir_des = 'hdfs://BigDataPOC:8020/datalab/exp_b02/data/gps_data'
    for i in np.arange(int(start_day),int(end_day)+1):
        if i<10:
            date = '0'+ str(i)
        else:
            date = str(i)
        file_source = 'loc_bus_'+ str(start_date)[:6] +date+'_'+str(num_line)+'.csv' 
        source = os.path.join(home_dir_source,file_source)
        home_dir_des_line = os.path.join(home_dir_des,str(num_line))
        home_dir_des_month = os.path.join(home_dir_des_line,str(start_date)[:6])
        # Create the destination directories if they do not exist yet.
        # (os.makedirs operates on the local filesystem; the HDFS side is
        # handled by hadoopy/getGpsData below.)
        if not os.path.exists(home_dir_des_month):
            try:
                os.makedirs(home_dir_des_month)
            except OSError:
                pass
        file_des = 'bus_gps_'+ str(start_date)[:6] +date+'_'+str(num_line)+'.csv' 
        destination = os.path.join(home_dir_des_month,file_des)
        if hadoopy.exists(destination):
            hadoopy.rmr(destination)
        getGpsData(source,destination)
        print 'finished: ' + file_des
hiveStatementForPythonCreate += ");"

print "hiveStatementForPythonCreate:"+hiveStatementForPythonCreate;
hivestrcommandForPython = ["hive","-e",hiveStatementForPythonCreate]
current2 = datetime.datetime.now()
call(hivestrcommandForPython)
current3 = datetime.datetime.now()
print "hive2 second="+str((current3 - current2).seconds)

#impalaStatementForCreate = "use tax;refresh tax.tax_access_log_python;insert overwrite TABLE tax_access_log_partition PARTITION (date_hour) SELECT client_ip,client,userid,request,method,uri,protocal,path,params,query,fileType,fileName,status,bytes_sent, date_time,referer,useragent,host,concat(strleft(from_unixtime(unix_timestamp(date_time)),14),'00:00')as date_hour from  tax.tax_access_log_python;";
##### 3. Delete old data
for deltime in deleteTime:
    hdfsFilePath = '"/user/hive/warehouse/tax.db/tax_access_log_partition/date_hour=' + deltime + '"'
    if hadoopy.exists(hdfsFilePath):
        print "remove file path: " + hdfsFilePath
        hadoopy.rmr(hdfsFilePath)

##### 4. Insert into Impala
impalaStatementForCreate = "use tax;refresh tax.tax_access_log_python;"
impalaStatementForCreate += " insert into TABLE tax_access_log_partition PARTITION (date_hour) "
impalaStatementForCreate += " SELECT client_ip,client,userid,request,method,uri,protocal,path,params,query,fileType,fileName,status,bytes_sent, date_time,referer,useragent,host,concat(strleft(from_unixtime(unix_timestamp(date_time)),14),'00:00')as date_hour "
impalaStatementForCreate += " from  tax.tax_access_log_python"
impalaStatementForCreate += " where "

tempStatement =[]
for insert_time in insertTime:
    tempStatement += ["date_time like '"+insert_time+"'"]

impalaStatementForCreate += " or ".join(tempStatement)
impalaStatementForCreate += ";"
Example #14
#!/usr/bin/env python

import hadoopy

input_path = "/alice.txt"
output_path = "/result"

if hadoopy.exists(output_path):
    hadoopy.rmr("-skipTrash %s" % output_path)

hadoopy.launch(input_path, output_path, 'WordCount.py')

word_counts = dict(hadoopy.readtb(output_path))

for word in word_counts:
    print "%s: %d" % (word, word_counts[word])
Example #15
import hadoopy
import os
import logging


input_path = '/data/corpus_data'
output_path = '/data/output'
local_path = '/app/opencorpora'

# Utilities
def read_local_dir(local_path):
  for fn in os.listdir(local_path):
    path = os.path.join(local_path, fn)
    if os.path.isfile(path):
      yield path, open(path).read()

# Cleanup and write input data
if hadoopy.exists(input_path):
  hadoopy.rmr(input_path)
if hadoopy.exists(output_path):
  hadoopy.rmr(output_path)
hadoopy.writetb(input_path, read_local_dir(local_path))

# Launch the job
hadoopy.launch_frozen(input_path, output_path, 'wc.py')

# Read the first KV pair
word_counts = dict(hadoopy.readtb(output_path))
for w3, tpl in word_counts.items():
  if tpl[1] > 4:
    print tpl[0][0], tpl[0][1], tpl[0][2], tpl[1], tpl[2], tpl[3]
import hadoopy

tb_path="hdfs://localhost:9000/user/user/edge_list.tb"

N = 64375

if hadoopy.exists(tb_path):
    hadoopy.rmr("-skipTrash %s"%tb_path)

def read_edge_wiki(file_object):
    while True:
        line = file_object.readline().split()
        if not line:
            break
        yield (line[0].decode('utf-8'),1.0/N),[l.decode('utf-8') for l in line[1:]]
        #yield line[0].decode('utf-8'),line[1].decode('utf-8')

def main():
    with open('edge_list.txt') as f:
        hadoopy.writetb(tb_path,read_edge_wiki(f))

if __name__ == '__main__':
    main()
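To sanity-check what was written, the first record can be read back (a sketch; prints one record and stops):

for (node, rank), neighbors in hadoopy.readtb(tb_path):
    print node, rank, len(neighbors)
    break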

Example #17
def calcul_delta(vector_before, vector_after):
    # Sum of absolute differences between two (key, value) vectors.
    before = {}
    after = {}
    s = 0
    for k, v in vector_before:
        before[k] = v
    for k, v in vector_after:
        after[k] = v
    for k in before:
        s += np.abs(before[k] - after[k])
    return s

##############################################################################

if hadoopy.exists(temp_vector_path):
    hadoopy.rmr("-skipTrash %s"%temp_vector_path)
copy(eigen_vector_tb_path, temp_vector_path)    

while diff > 0.01:

    eigen_vector_before = load_eigen_vector(temp_vector_path)

    if hadoopy.exists(temp_vector_path):
        hadoopy.rmr("-skipTrash %s"%temp_vector_path)
    
    hadoopy.launch_local(data_tb_path, temp_vector_path, 'PageRank.py')
    
    eigen_vector_after = load_eigen_vector(temp_vector_path)
    
    if hadoopy.exists(eigen_vector_tb_path):
input_path = "hdfs://localhost:9000/user/user/input.tb"
output_path = "hdfs://localhost:9000/user/user/vector"
temp_path = "hdfs://localhost:9000/user/user/temp"

def read_vector(vect):
    for i,v in enumerate(vect):
        yield str(i).encode('utf-8'),v

N = 64375

diff=1.

r0 = np.ones(N).astype(np.float)/N

if hadoopy.exists(input_path):
    hadoopy.rmr("-skipTrash %s"%input_path)
os.system('hdfs dfs -cp '+edge_path+' '+input_path)
    
if hadoopy.exists(output_path):
    hadoopy.rmr("-skipTrash %s"%output_path)
hadoopy.writetb(output_path,read_vector(r0))

if hadoopy.exists(temp_path):
    hadoopy.rmr("-skipTrash %s"%temp_path)

iteration = 0
while diff>0.01:
    if hadoopy.exists(temp_path):
        hadoopy.rmr("-skipTrash %s"%temp_path)
    hadoopy.launch(input_path,temp_path,'PageRank.py',files=[])
    
def main():
    if hadoopy.exists(hdfs_output):
        hadoopy.rmr("-skipTrash %s" % hdfs_output)
    hadoopy.launch(hdfs_path, hdfs_output, "WordCount.py", files=["../stop_words.txt"])
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import hadoopy
import os
import sys
import happybase
import numpy as np

hdfs_path = 'simplewikiFromHbase'  # equivalent to "hdfs://localhost/user/user/simplewikiFromHbase"
local_path = 'simplewikiFromHbaseLocal'
if hadoopy.exists(hdfs_path):
    hadoopy.rmr("-skipTrash %s"%hdfs_path)

connection = happybase.Connection('localhost','9090')

if 'simplewiki' not in connection.tables():
    sys.exit("Error : no simplewiki table found")
else:
    print "OK : simplewiki table found"
    table_wiki = connection.table('simplewiki')

NdocsMax = 30000
def read_hbase(table_hbase):
    for key,data in table_hbase.scan(limit=NdocsMax):
        yield key.decode('utf-8'),data['wiki:text'].decode('utf-8')

#def read_local_dir(local_path):
#    for fn in os.listdir(local_path):
#       path = os.path.join(local_path, fn)
#        if os.path.isfile(path):
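The example stops before the actual write; presumably it finishes by streaming the HBase scan into HDFS, along the lines of the surrounding examples (a sketch):

hadoopy.writetb(hdfs_path, read_hbase(table_wiki))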
Example #21
temp_path = "hdfs://localhost:9000/user/user/temp"


def read_vector(vect):
    for i, v in enumerate(vect):
        yield str(i).encode('utf-8'), v


N = 64375

diff = 1.

r0 = np.ones(N).astype(np.float) / N

if hadoopy.exists(input_path):
    hadoopy.rmr("-skipTrash %s" % input_path)
os.system('hdfs dfs -cp ' + edge_path + ' ' + input_path)

if hadoopy.exists(output_path):
    hadoopy.rmr("-skipTrash %s" % output_path)
hadoopy.writetb(output_path, read_vector(r0))

if hadoopy.exists(temp_path):
    hadoopy.rmr("-skipTrash %s" % temp_path)

iteration = 0
while diff > 0.01:
    if hadoopy.exists(temp_path):
        hadoopy.rmr("-skipTrash %s" % temp_path)
    hadoopy.launch(input_path, temp_path, 'PageRank.py', files=[])
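PageRank.py itself does not appear on this page and the driver above is truncated, so its exact input/output contract is unclear. Below is one plausible sketch that assumes the job consumes ((node, rank), neighbors) records like those produced by read_edge_wiki and emits records of the same shape; the damping factor and the lack of handling for dangling nodes are assumptions.

import hadoopy

N = 64375          # number of nodes, as in the driver
DAMPING = 0.85     # assumed damping factor


def mapper(key, value):
    # key: (node_id, current_rank), value: list of outgoing neighbours
    (node, rank), neighbors = key, value
    # Re-emit the graph structure so the reducer can rebuild the record
    yield node, ('graph', neighbors)
    # Distribute this node's rank over its out-links
    if neighbors:
        share = rank / len(neighbors)
        for neighbor in neighbors:
            yield neighbor, ('rank', share)


def reducer(node, values):
    neighbors = []
    rank = (1.0 - DAMPING) / N
    for kind, payload in values:
        if kind == 'graph':
            neighbors = payload
        else:
            rank += DAMPING * payload
    yield (node, rank), neighbors

if __name__ == '__main__':
    hadoopy.run(mapper, reducer)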
Example #22
hbase_table = 'wiki'
hdfs_path = 'wiki.tb'

host = 'localhost'
connection = happybase.Connection(host)
wiki_table = connection.table(hbase_table)


def get_url_content_for_hdfs():
    for url, content in wiki_table.scan():
        v = content['cf:content'].encode('utf-8')
        yield url, v

if hadoopy.exists(hdfs_path):
    hadoopy.rmr("-skipTrash %s" % hdfs_path)  # Remove the old file (cleanup)

hadoopy.writetb(hdfs_path, get_url_content_for_hdfs())  # Write the wiki table into HDFS

# Test OK (ATIH 2/12/2015)
url_content_dict = dict(hadoopy.readtb(hdfs_path))
for k, v in url_content_dict.iteritems():
    print 'k = ', k
    print 'v = ', v
    break

for k, v in hadoopy.readtb(hdfs_path):
    print 'k = ', k.encode('utf-8')
    print 'v = ', v.encode('utf-8')
    break
import hadoopy
import os
import sys
import happybase
import numpy as np

hdfs_path = 'simplewikiFromHbase'
local_path = 'simplewikiFromHbaseLocal'
if hadoopy.exists(hdfs_path):
    hadoopy.rmr("-skipTrash %s" % hdfs_path)

connection = happybase.Connection('localhost', '9090')

if 'simplewiki' not in connection.tables():
    sys.exit("Error : no simplewiki table found")
else:
    print "OK : simplewiki table found"
    table_wiki = connection.table('simplewiki')

NdocsMax = 30000


def read_hbase(table_hbase):
    for key, data in table_hbase.scan(limit=NdocsMax):
        yield key.decode('utf-8'), data['wiki:text'].decode('utf-8')


#def read_local_dir(local_path):
#    for fn in os.listdir(local_path):
#        path = os.path.join(local_path, fn)
#        if os.path.isfile(path):
#input_path="hdfs://localhost:9000/alice.txt"
input_hdfs_path="hdfs://localhost:9000/user/user/simplewikiFromHbase"
output_hdfs_path='hdfs://localhost:9000/user/user/indexwikiFromSpark'

words_stop = [line.rstrip('\n') for line in open('../stop_words.txt')]
words_stop.append('')

sc=SparkContext()

lines = sc.sequenceFile(input_hdfs_path).map(lambda (x,y):(x[5:].decode('utf-8'),y[5:].decode('utf-8')))

splitText = lines.map(lambda (url,text):(url,[stem(word.group().lower()) for word in re.finditer(r"\w+",text,re.UNICODE) if word.group().lower() not in words_stop]))

tf = splitText.map(lambda (url,splittedText):(url,{word:1.0*splittedText.count(word)/len(splittedText) for word in splittedText}))

tfWordAsKey = tf.flatMap(lambda (url,tf):[(word,[(url,tf[word])]) for word in tf]).reduceByKey(lambda a,b:a+b)

tfidf = tfWordAsKey.map(lambda (word,tfList):(word,[(url,tf*np.log10(27474.0/len(tfList))) for (url,tf) in tfList]))

NwordsMax = 200000
def read_rdd(rdd):
    for key,data in rdd.takeSample(True,NwordsMax):
        yield key,data

if hadoopy.exists(output_hdfs_path):
    hadoopy.rmr("-skipTrash %s"%output_hdfs_path)

hadoopy.writetb(output_hdfs_path,read_rdd(tfidf))
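A quick sanity check of the written index (a sketch; prints one word with the first few postings and stops):

for word, postings in hadoopy.readtb(output_hdfs_path):
    print word, postings[:3]
    break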


Example #25
import hadoopy

tb_path = "hdfs://localhost:9000/user/user/edge_list.tb"

N = 64375

if hadoopy.exists(tb_path):
    hadoopy.rmr("-skipTrash %s" % tb_path)


def read_edge_wiki(file_object):
    while True:
        line = file_object.readline().split()
        if not line:
            break
        yield (line[0].decode('utf-8'),
               1.0 / N), [l.decode('utf-8') for l in line[1:]]
        #yield line[0].decode('utf-8'),line[1].decode('utf-8')


def main():
    with open('edge_list.txt') as f:
        hadoopy.writetb(tb_path, read_edge_wiki(f))


if __name__ == '__main__':
    main()