Ejemplo n.º 1
0
def insert_vector_into_hdfs(hdfs_path, iterator):
    """Write ``iterator`` to ``hdfs_path`` with ``hadoopy.writetb``.

    Whatever already exists at ``hdfs_path`` is removed first (using the
    ``-skipTrash`` flag) so the write always targets a free path.

    :param hdfs_path: destination path on HDFS.
    :param iterator: iterable handed unchanged to ``hadoopy.writetb``.
    """
    already_present = hadoopy.exists(hdfs_path)
    if already_present:
        hadoopy.rmr("-skipTrash %s" % hdfs_path)
    # If the write fails with "Cannot create file ... Name node is in safe
    # mode", run `hadoop dfsadmin -safemode leave` and try again.
    hadoopy.writetb(hdfs_path, iterator)
Ejemplo n.º 2
0
def hdfs_temp(hdfs_temp_dir=None):
    """Yield a unique temporary HDFS path and remove it once the caller is done.

    The path is ``<hdfs_temp_dir>/<time>-<random>`` made absolute with
    ``hadoopy.abspath``. Nothing is created here; the path is only removed
    afterwards if something exists at it.

    NOTE(review): this generator is presumably wrapped by
    ``contextlib.contextmanager`` on a line not shown here -- confirm.

    :param hdfs_temp_dir: parent directory on HDFS; defaults to
        ``HDFS_TEMP_DIR`` when ``None``.
    """
    if hdfs_temp_dir is None:
        hdfs_temp_dir = HDFS_TEMP_DIR
    temp_path = hadoopy.abspath('%s/%f-%f' % (hdfs_temp_dir, time.time(), random.random()))
    try:
        yield temp_path
    finally:
        # Clean up even when the consumer raises, so a failed job does not
        # leave its temporary data behind on HDFS.
        if hadoopy.exists(temp_path):
            hadoopy.rmr(temp_path)
Ejemplo n.º 3
0
def insert_data_into_hdfs():
    """Rewrite ``tb_path`` on HDFS from the contents of ``data_file_path``.

    Any existing entry at ``tb_path`` is removed (``-skipTrash``) before the
    pairs produced by ``get_kv_from_file`` are written with
    ``hadoopy.writetb``.
    """
    stale = hadoopy.exists(tb_path)
    if stale:
        hadoopy.rmr("-skipTrash %s" % tb_path)
    # If the write fails with "Cannot create file ... Name node is in safe
    # mode", run `hadoop dfsadmin -safemode leave` and try again.
    kv_pairs = get_kv_from_file(data_file_path)
    hadoopy.writetb(tb_path, kv_pairs)
Ejemplo n.º 4
0
def custom_initialization():
    """Dump up to 1000 rows of the HBase ``wiki`` table into ``wiki_index.tb``.

    Connects to HBase on ``localhost``, removes any previous
    ``wiki_index.tb`` from HDFS, then writes the scan results with
    ``hadoopy.writetb``.
    """
    host = 'localhost'
    connection = happybase.Connection(host)
    wiki_table = connection.table('wiki')
    hdfs_path = 'wiki_index.tb'
    # Only remove the file when it is there: an unconditional rmr fails on a
    # first run, when nothing has been written yet.
    if hadoopy.exists(hdfs_path):
        hadoopy.rmr("-skipTrash %s" % hdfs_path)
    # Write the (limited) wiki table scan into HDFS.
    hadoopy.writetb(hdfs_path, wiki_table.scan(limit=1000))
Ejemplo n.º 5
0
 def tearDown(self):
     """Check that ``self.data_path`` is an empty directory, then remove it.

     After the removal, ``exists``, ``isdir`` and ``isempty`` must all
     report false for the path.
     """
     path = self.data_path
     if hadoopy.exists(path):
         # What is left behind must be an empty directory.
         self.assertTrue(hadoopy.isempty(path))
         self.assertTrue(hadoopy.isdir(path))
         hadoopy.rmr(path)
     for probe in (hadoopy.exists, hadoopy.isdir, hadoopy.isempty):
         self.assertFalse(probe(path))
Ejemplo n.º 6
0
def latency_test(launcher):
    """Run a one-record job and print its latency breakdown.

    A single key/value pair stamped with the client time is written to a
    fresh ``_hadoopy_bench/<time>`` directory, ``time_job.py`` is run through
    ``launcher``, and the first output value is read back. Prints the tuple
    ``(worker - client, server - worker, server - client)`` in seconds, then
    removes the benchmark directory.

    :param launcher: callable invoked as
        ``launcher(input_path, output_path, script_path)``.
    """
    output_path = '_hadoopy_bench/%f' % time.time()
    v = 'blah'

    kv = (v, {'client_time': time.time(),
              'value_len': len(v),
              'count': 0})
    hadoopy.writetb(output_path + '/input', [kv])
    launcher(output_path + '/input', output_path + '/output', 'time_job.py')
    # Builtin next() works on Python 2.6+ and Python 3; the iterator's
    # .next() method exists on Python 2 only.
    v = next(hadoopy.readtb(output_path + '/output'))[1]
    v['server_time'] = time.time()
    # 'worker_time' is presumably added to the value by time_job.py -- confirm.
    t0 = v['worker_time'] - v['client_time']
    t1 = v['server_time'] - v['worker_time']
    t2 = v['server_time'] - v['client_time']
    print((t0, t1, t2))
    hadoopy.rmr(output_path)
Ejemplo n.º 7
0
def throughput_test(launcher):
    """Run ``time_job.py`` over a large synthetic input and print records/second.

    One file of ``num_kvs`` identical records is written, then copied until
    ``num_files`` input files exist. The script is frozen before timing
    starts, so only the launch itself is measured. The benchmark directory
    is removed at the end.

    :param launcher: callable invoked as
        ``launcher(input_path, output_path, script_path)``.
    """
    bench_root = '_hadoopy_bench/%f' % time.time()
    input_dir = bench_root + '/input'
    payload = 'blah'
    record = (payload, {'client_time': time.time(),
                        'value_len': len(payload),
                        'count': 0})
    num_files = 3
    num_kvs = 10000000

    first_file = input_dir + '/0'
    hadoopy.writetb(first_file, (record for _ in xrange(num_kvs)))
    for copy_index in range(1, num_files):
        hadoopy.cp(first_file, input_dir + '/%d' % copy_index)

    # Freeze up front so the PyInstaller time is not part of the measurement.
    hadoopy.freeze_script('time_job.py')
    started = time.time()
    launcher(input_dir, bench_root + '/output', 'time_job.py')
    elapsed = time.time() - started
    print((num_kvs * num_files) / elapsed)
    hadoopy.rmr(bench_root)
Ejemplo n.º 8
0
#input_path="hdfs://localhost:9000/alice.txt"
# HDFS locations: the sequence file read by Spark and the path the index is
# written to further down.
input_hdfs_path="hdfs://localhost:9000/user/user/simplewikiFromHbase"
output_hdfs_path='hdfs://localhost:9000/user/user/indexwikiFromSpark'

# Stop-word list, one entry per line of the file; the empty string is
# appended as an extra entry.
words_stop = [line.rstrip('\n') for line in open('../stop_words.txt')]
words_stop.append('')

sc=SparkContext()

# (key, value) records of the sequence file. The first 5 characters of each
# field are dropped before UTF-8 decoding -- presumably a serialization
# header added by the writer; TODO confirm.
lines = sc.sequenceFile(input_hdfs_path).map(lambda (x,y):(x[5:].decode('utf-8'),y[5:].decode('utf-8')))

# (url, [tokens]): every \w+ match whose lower-cased form is not in
# words_stop, lower-cased and passed through stem().
splitText = lines.map(lambda (url,text):(url,[stem(word.group().lower()) for word in re.finditer(r"\w+",text,re.UNICODE) if word.group().lower() not in words_stop]))

# (url, {word: occurrences of word / number of tokens in the document}).
tf = splitText.map(lambda (url,splittedText):(url,{word:1.0*splittedText.count(word)/len(splittedText) for word in splittedText}))

# Re-key by word: (word, [(url, tf), ...]), built by concatenating the
# single-element lists emitted per document.
tfWordAsKey = tf.flatMap(lambda (url,tf):[(word,[(url,tf[word])]) for word in tf]).reduceByKey(lambda a,b:a+b)

# Scale each tf by log10(27474 / number of documents containing the word).
# NOTE(review): 27474.0 is presumably the total document count -- confirm.
tfidf = tfWordAsKey.map(lambda (word,tfList):(word,[(url,tf*np.log10(27474.0/len(tfList))) for (url,tf) in tfList]))

NwordsMax = 200000


def read_rdd(rdd):
    """Yield ``(key, data)`` pairs from a sample of ``rdd``.

    The sample is taken with ``rdd.takeSample(True, NwordsMax)``, i.e. with
    replacement and at most ``NwordsMax`` elements requested.
    """
    sample = rdd.takeSample(True, NwordsMax)
    for pair in sample:
        word, postings = pair
        yield (word, postings)

# Replace any previous index on HDFS with a fresh sample of the tf-idf RDD.
if hadoopy.exists(output_hdfs_path):
    hadoopy.rmr("-skipTrash %s" % output_hdfs_path)

sampled_pairs = read_rdd(tfidf)
hadoopy.writetb(output_hdfs_path, sampled_pairs)


Ejemplo n.º 9
0
# Name of the HBase table that is read and of the HDFS file it is dumped to.
hbase_table = 'wiki'
hdfs_path = 'wiki.tb'

# Open the HBase connection and get a handle on the table.
host= 'localhost'
connection = happybase.Connection(host)
wiki_table = connection.table(hbase_table)


def get_url_content_for_hdfs():
    """Yield ``(row key, content)`` for every row scanned from ``wiki_table``.

    ``content`` is the row's ``'cf:content'`` cell after ``.encode('utf-8')``.
    """
    for row_key, row in wiki_table.scan():
        encoded_content = row['cf:content'].encode('utf-8')
        yield row_key, encoded_content

# Remove any previous dump so the write below targets a free path.
if hadoopy.exists(hdfs_path):
    hadoopy.rmr("-skipTrash %s" %(hdfs_path)) # Removal of the file (cleaning)
    
hadoopy.writetb(hdfs_path,get_url_content_for_hdfs()) # Writing the wiki table into HDFS

# Test OK (ATIH 2/12/2015)
# Check 1: load the whole table into a dict and print a single entry.
url_content_dict = dict(hadoopy.readtb(hdfs_path))
for k, v in url_content_dict.iteritems():
    print 'k = ', k
    print 'v = ', v
    break

# Check 2: stream the table and print only the first pair, UTF-8 encoded.
for k, v in hadoopy.readtb(hdfs_path):
    print 'k = ', k.encode('utf-8')
    print 'v = ', v.encode('utf-8')
    break
def main():
    """Run ``WordCount.py`` from ``hdfs_path`` into a cleared ``hdfs_output``.

    Any existing output is removed first (``-skipTrash``); the stop-word
    file is passed to the job through ``files``.
    """
    output_exists = hadoopy.exists(hdfs_output)
    if output_exists:
        hadoopy.rmr("-skipTrash %s" % hdfs_output)
    shipped_files = ["../stop_words.txt"]
    hadoopy.launch(hdfs_path, hdfs_output, "WordCount.py", files=shipped_files)
Ejemplo n.º 11
0
# Append the closing ");" to the Hive statement built earlier (not shown here).
hiveStatementForPythonCreate += ");"

print "hiveStatementForPythonCreate:"+hiveStatementForPythonCreate;
# Run the statement with `hive -e` and time the call.
hivestrcommandForPython = ["hive","-e",hiveStatementForPythonCreate]
current2 = datetime.datetime.now()
call(hivestrcommandForPython)
current3 = datetime.datetime.now()
# NOTE: timedelta.seconds is the seconds component only; whole days are not included.
print "hive2 second="+str((current3 - current2).seconds)

#impalaStatementForCreate = "use tax;refresh tax.tax_access_log_python;insert overwrite TABLE tax_access_log_partition PARTITION (date_hour) SELECT client_ip,client,userid,request,method,uri,protocal,path,params,query,fileType,fileName,status,bytes_sent, date_time,referer,useragent,host,concat(strleft(from_unixtime(unix_timestamp(date_time)),14),'00:00')as date_hour from  tax.tax_access_log_python;";
#####3.delete old data
for deltime in deleteTime :
    hdfsFilePath = '"/user/hive/warehouse/tax.db/tax_access_log_partition/date_hour='+deltime+'"'
    if hadoopy.exists(hdfsFilePath) == 1:
        print "remove file path:"+hdfsFilePath
        hadoopy.rmr('"/user/hive/warehouse/tax.db/tax_access_log_partition/date_hour='+deltime+'"')

#####4.insert Impala
# Fixed part of the Impala script: select the database, refresh the staging
# table, then insert into the partitioned table with an hourly date_hour.
impalaStatementForCreate = "".join([
    "use tax;refresh tax.tax_access_log_python;",
    " insert into TABLE tax_access_log_partition PARTITION (date_hour) ",
    " SELECT client_ip,client,userid,request,method,uri,protocal,path,params,query,fileType,fileName,status,bytes_sent, date_time,referer,useragent,host,concat(strleft(from_unixtime(unix_timestamp(date_time)),14),'00:00')as date_hour ",
    " from  tax.tax_access_log_python",
    " where ",
])

# One LIKE clause per entry of insertTime, OR-ed together.
tempStatement = []
for insert_time in insertTime:
    tempStatement.append("date_time like '" + insert_time + "'")

impalaStatementForCreate = impalaStatementForCreate + " or ".join(tempStatement) + ";"
Ejemplo n.º 12
0
import hadoopy

# HDFS destination of the edge-list table written by main().
tb_path="hdfs://localhost:9000/user/user/edge_list.tb"

# Divisor of the initial per-node value 1.0/N emitted by read_edge_wiki;
# presumably the number of nodes -- TODO confirm.
N = 64375

# Remove any previous table so the write targets a free path.
if hadoopy.exists(tb_path):
    hadoopy.rmr("-skipTrash %s"%tb_path)

def read_edge_wiki(file_object, n_nodes=None):
    """Yield ``((source, 1.0 / n_nodes), [targets])`` for each edge-list line.

    Each non-blank line is split on whitespace: the first field is the
    source, the remaining fields are its targets. All fields are decoded
    from UTF-8. Blank or whitespace-only lines are skipped; previously the
    first such line ended the whole read.

    :param file_object: iterable of byte-string lines (e.g. an open file).
    :param n_nodes: divisor of the value paired with every source; defaults
        to the module-level ``N`` when ``None``.
    """
    if n_nodes is None:
        n_nodes = N
    initial_value = 1.0 / n_nodes
    for raw_line in file_object:
        fields = raw_line.split()
        if not fields:
            # Blank line: skip it rather than treating it as end-of-file.
            continue
        source = fields[0].decode('utf-8')
        targets = [field.decode('utf-8') for field in fields[1:]]
        yield (source, initial_value), targets

def main():
    """Write the edge list read from ``edge_list.txt`` to ``tb_path``."""
    edge_file = open('edge_list.txt')
    try:
        hadoopy.writetb(tb_path, read_edge_wiki(edge_file))
    finally:
        # Same guarantee as a `with` block: the file is always closed.
        edge_file.close()


if __name__ == '__main__':
    main()

Ejemplo n.º 13
0
def calcul_delta(vectore_before, vector_after):
    """Return the summed absolute difference between two keyed vectors.

    Fixes two defects of the previous version: it indexed the raw inputs
    instead of the dictionaries it had just built, and it overwrote the
    result on every key instead of accumulating it.

    :param vectore_before: mapping or iterable of ``(key, value)`` pairs.
    :param vector_after: mapping or iterable of ``(key, value)`` pairs.
    :returns: sum over the keys of ``vectore_before`` of
        ``|before[k] - after[k]|``; ``0`` when ``vectore_before`` is empty.
    :raises KeyError: if a key of ``vectore_before`` is missing from
        ``vector_after``.
    """
    # dict() accepts both mappings and iterables of pairs (generators too).
    before = dict(vectore_before)
    after = dict(vector_after)
    delta = 0
    for key, value in before.items():
        delta += np.abs(value - after[key])
    return delta

##############################################################################

# Clear temp_vector_path, then call copy() from eigen_vector_tb_path to it.
# NOTE(review): copy() is defined outside this excerpt; presumably it
# duplicates the vector on HDFS -- confirm.
if hadoopy.exists(temp_vector_path):
    hadoopy.rmr("-skipTrash %s"%temp_vector_path)
copy(eigen_vector_tb_path, temp_vector_path)    

while diff>0.01:
    
   
    eigen_vector_before = load_eigen_vector(temp_vector_path)

    if hadoopy.exists(temp_vector_path):
        hadoopy.rmr("-skipTrash %s"%temp_vector_path)
    
    hadoopy.launch_local(data_tb_path, temp_vector_path, 'PageRank.py')
    
    eigen_vector_after = load_eigen_vector(temp_vector_path)
    
    if hadoopy.exists(eigen_vector_tb_path):
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import hadoopy
import os
import sys
import happybase
import numpy as np

hdfs_path = 'simplewikiFromHbase' # equivaut à "http://localhost/user/user/simpleikiFromHbase"
local_path = 'simplewikiFromHbaseLocal'
if hadoopy.exists(hdfs_path):
    hadoopy.rmr("-skipTrash %s"%hdfs_path)

connection = happybase.Connection('localhost','9090')

if 'simplewiki' not in connection.tables():
    sys.exit("Error : no simplewiki table found")
else:
    print "OK : simplewiki table found"
    table_wiki = connection.table('simplewiki')

NdocsMax = 30000


def read_hbase(table_hbase):
    """Yield ``(key, text)`` for rows scanned from ``table_hbase``.

    The scan is limited to ``NdocsMax`` rows. The row key and the row's
    ``'wiki:text'`` cell are both decoded from UTF-8.
    """
    rows = table_hbase.scan(limit=NdocsMax)
    for row_key, columns in rows:
        decoded_key = row_key.decode('utf-8')
        decoded_text = columns['wiki:text'].decode('utf-8')
        yield decoded_key, decoded_text

#def read_local_dir(local_path):
#    for fn in os.listdir(local_path):
#       path = os.path.join(local_path, fn)
#        if os.path.isfile(path):
Ejemplo n.º 15
0
Archivo: task.py Proyecto: c58/par-prog
import hadoopy
import os
import logging


# HDFS path the local files are written to; also the job's input.
input_path = '/data/corpus_data'
# HDFS path the job writes its output to.
output_path = '/data/output'
# Local directory whose files are uploaded.
local_path = '/app/opencorpora'

# Utilities
def read_local_dir(local_path):
  """Yield ``(path, contents)`` for every regular file directly in ``local_path``.

  Sub-directories and other non-file entries are skipped; the listing is not
  recursive. ``path`` is ``local_path`` joined with the entry name.
  """
  for fn in os.listdir(local_path):
    path = os.path.join(local_path, fn)
    if os.path.isfile(path):
      # Close each file as soon as it is read, so a large directory does not
      # exhaust the process's file descriptors.
      with open(path) as f:
        contents = f.read()
      yield path, contents

# Cleanup and write input data
if hadoopy.exists(input_path):
  hadoopy.rmr(input_path)
if hadoopy.exists(output_path):
  hadoopy.rmr(output_path)
hadoopy.writetb(input_path, read_local_dir(local_path))

# Launch the job
hadoopy.launch_frozen(input_path, output_path, 'wc.py')

# Read the first KV pair
word_counts = dict(hadoopy.readtb(output_path))
for w3, tpl in word_counts.items():
  if tpl[1] > 4:
    print tpl[0][0], tpl[0][1], tpl[0][2], tpl[1], tpl[2], tpl[3]
# HDFS copy of the edge list; input of every PageRank.py launch.
input_path = "hdfs://localhost:9000/user/user/input.tb"
# HDFS path the initial vector is written to.
output_path = "hdfs://localhost:9000/user/user/vector"
# HDFS output path of each PageRank.py launch; cleared before every run.
temp_path = "hdfs://localhost:9000/user/user/temp"

def read_vector(vect):
    """Yield ``(index, value)`` for each element of ``vect``.

    The index is the element's 0-based position, rendered as a decimal
    string and encoded to UTF-8.
    """
    position = 0
    for value in vect:
        yield str(position).encode('utf-8'), value
        position += 1

# Length of the start vector; presumably the number of graph nodes -- TODO confirm.
N = 64375

# Starting value of the convergence measure; it only has to exceed the 0.01
# threshold tested by the iteration loop.
diff = 1.

# Uniform start vector: N entries of 1/N. np.ones already yields float64, and
# the np.float alias used before was removed in NumPy 1.24.
r0 = np.ones(N, dtype=np.float64) / N

# Refresh the job input: drop the old copy, then copy the edge list into
# place with the hdfs command line tool.
if hadoopy.exists(input_path):
    hadoopy.rmr("-skipTrash %s" % input_path)
os.system(' '.join(['hdfs dfs -cp', edge_path, input_path]))

# Write the start vector r0 to a cleared output path.
if hadoopy.exists(output_path):
    hadoopy.rmr("-skipTrash %s" % output_path)
hadoopy.writetb(output_path, read_vector(r0))

# Make sure no leftover temp output exists before iterating.
if hadoopy.exists(temp_path):
    hadoopy.rmr("-skipTrash %s" % temp_path)

iteration = 0
while diff>0.01:
    if hadoopy.exists(temp_path):
        hadoopy.rmr("-skipTrash %s"%temp_path)
    hadoopy.launch(input_path,temp_path,'PageRank.py',files=[])