Example #1
def ts_rank(job_id, iter_count, input_path, top_n):
    if not (job_id and iter_count and input_path
            and os.path.exists(input_path)):
        print 'error'
        return []
    print 'job_id:', monitor(job_id)
    if monitor(job_id) == 'finished':
        print 'hadoop_results start'
        return hadoop_results(job_id, top_n)
    fs = HadoopFS()
    fs.rmr('%s' % job_id)
    fs.mkdir('%s' % job_id)
    fs.put(input_path, '%s/hat_init' % job_id)  # path of the input file
    #init
    ts_rank_iter = TsRankIter(input_path='%s/hat_init' % job_id,
                              output_path='%s/hat_tmp1' % job_id)
    ts_rank_iter.run()
    #iter
    for i in range(iter_count - 1):
        ts_rank_iter = TsRankIter(input_path='%s/hat_tmp%s' % (job_id,
                                                               (i + 1)),
                                  output_path='%s/hat_tmp%s' % (job_id,
                                                                (i + 2)))
        ts_rank_iter.run()
    #sort
    ts_rank_sorter = TsRankSorter(
        input_path='%s/hat_tmp%s' % (job_id, iter_count),
        output_path='%s/hat_results' % job_id)  # is the input_path here wrong?
    ts_rank_sorter.run()
    # clean init and temp files
    fs.rmr('%s/hat_tmp*' % job_id)
    fs.rmr('%s/hat_init' % job_id)
    sorted_uids, all_uid_tr = hadoop_results(job_id, top_n)

    return sorted_uids, all_uid_tr
Example #2
def test_pagerank(self):
    size = 1000
    g = nx.DiGraph(nx.powerlaw_cluster_graph(size, 3, 0.001))
    N = len(g.nodes())
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    for node in g.nodes():
        outlinks = g.out_edges(nbunch=[node])
        outlinks = map(str, [n2 for n1, n2 in outlinks])
        if not outlinks:
            value = 'pr_results,%s,%s' % (1.0/N, N)
            tmp_file.write('%s\t%s\n' % (node, value))
        else:
            outlinks_str = ','.join(outlinks)
            value = 'pr_results,%s,%s,' % (1.0/N, N)
            value += outlinks_str
            tmp_file.write('%s\t%s\n' % (node, value))
    tmp_file.flush()
    input_path = tmp_file.name
    job_id = 'unittest'
    sorted_ids = pagerank(job_id, self.iter_count, input_path, self.top_n)
    fs = HadoopFS()
    fs.rmr('%s/hat_results' % job_id)
    if self.top_n <= size:
        self.assertEqual(len(sorted_ids), self.top_n, 'some ids are missing')
    id_ranges = range(0, size)
    for _id in sorted_ids:
        self.assertIn(int(_id), id_ranges, 'node should be in graph')
Example #3
def ts_rank(job_id, iter_count, input_path, top_n):
    if not(job_id and iter_count and input_path and os.path.exists(input_path)):
        print 'error'
        return []
    print 'job_id:', monitor(job_id)
    if monitor(job_id) == 'finished':
        print 'hadoop_results start'
        return hadoop_results(job_id, top_n)
    fs = HadoopFS()
    fs.rmr('%s' % job_id)
    fs.mkdir('%s' % job_id)
    fs.put(input_path, '%s/hat_init' % job_id) # path of the input file
    #init
    ts_rank_iter = TsRankIter(input_path='%s/hat_init' % job_id, output_path='%s/hat_tmp1' % job_id)
    ts_rank_iter.run()
    #iter
    for i in range(iter_count-1):
        ts_rank_iter = TsRankIter(input_path='%s/hat_tmp%s' % (job_id, (i+1)), output_path='%s/hat_tmp%s' % (job_id, (i+2)))
        ts_rank_iter.run()
    #sort
    ts_rank_sorter = TsRankSorter(input_path='%s/hat_tmp%s' % (job_id, iter_count), output_path='%s/hat_results' % job_id) # is the input_path here wrong?
    ts_rank_sorter.run()
    # clean init and temp files
    fs.rmr('%s/hat_tmp*' % job_id)
    fs.rmr('%s/hat_init' % job_id)
    sorted_uids, all_uid_tr = hadoop_results(job_id, top_n)

    return sorted_uids, all_uid_tr
Example #4
def pagerank(job_id, iter_count, input_path, top_n):
    if not (job_id and iter_count and input_path and os.path.exists(input_path)):
        return []

    if monitor(job_id) == 'finished':
        return hadoop_results(job_id, top_n)

    #set work dir and put input temp file into file system
    fs = HadoopFS()
    fs.rmr('%s' % job_id)
    fs.mkdir('%s' % job_id)
    fs.put(input_path, '%s/hat_init' % job_id)

    #init
    pr_iter = PageRankIter(input_path='%s/hat_init' % job_id, output_path='%s/hat_tmp1' % job_id)
    pr_iter.run()

    #iter
    for i in range(iter_count-1):
        pr_iter = PageRankIter(input_path='%s/hat_tmp%s' % (job_id, (i+1)), output_path='%s/hat_tmp%s' % (job_id, (i+2)))
        pr_iter.run()

    #sort
    pr_sorter = PageRankSorter(input_path='%s/hat_tmp%s' % (job_id, iter_count), output_path='%s/hat_results' % job_id)
    pr_sorter.run()

    #clean init and temp files
    fs.rmr('%s/hat_tmp*' % job_id)
    fs.rmr('%s/hat_init' % job_id)

    sorted_uids = hadoop_results(job_id, top_n)

    return sorted_uids
Example #5
def read_from_hdfs(job_id, top_n):
    fs = HadoopFS()
    outputs = fs.cat('%s/hat_results/*' % job_id)
    if not outputs:
        return []
    if len(outputs) > top_n:
        outputs = outputs[-top_n:]
    outputs.reverse()
    sorted_uids = []
    for line in outputs:
        uid, value = line.strip().split('\t')
        sorted_uids.append(uid)
    return sorted_uids
Example #6
def pagerank_simulation_test(input_path):
    iter_count = 5
    top_n = 500
    job_id = 1
    sorted_ids = pagerank(job_id, iter_count, input_path, top_n)
    if sorted_ids:
        if len(sorted_ids) < 10:
            for i in range(len(sorted_ids)):
                print sorted_ids[i]
        else:
            for i in range(10):
                print sorted_ids[i]
    fs = HadoopFS()
    fs.rmr('%s/hat_results' % job_id)
Example #7
def hadoop_results(job_id, top_n):
    data = []
    fs = HadoopFS()
    outputs = fs.cat('%s/hat_results/*' % job_id)
    if not outputs:
        return [], {}
    all_outputs = outputs
    if len(outputs) > top_n:
        outputs = outputs[-top_n:]
    outputs.reverse()
    sorted_uids = []
    all_uid_r = {}
    for line in all_outputs:
        uid, r = line.strip().split('\t')
        all_uid_r[uid] = r
    for line in outputs:
        uid, r = line.strip().split('\t')
        sorted_uids.append(uid)
    return sorted_uids, all_uid_r
Example #8
def monitor(job_id):
    fs = HadoopFS()
    finished = False
    has_tmps = False
    outputs = fs.ls('%s' % job_id)
    if not outputs:
        return 'data_not_prepared'
    count = 0
    for line in outputs:
        if 'tmp' in line:
            count += 1
            has_tmps = True
        if 'results' in line:
            if not has_tmps:
                finished = True
    if not finished:
        return 'stage%s' % count
    else:
        return 'finished'
Example #9
def monitor(job_id):
    fs = HadoopFS()
    finished = False
    has_tmps = False
    outputs = fs.ls('%s' % job_id)
    if not outputs:
        return 'data_not_prepared'
    count = 0
    for line in outputs:
        if 'tmp' in line:
            count += 1
            has_tmps = True
        if 'results' in line:
            if not has_tmps:
                finished = True
    if not finished:
        return 'stage%s' % count
    else:
        return 'finished'
Example #10
def hadoop_results(job_id, top_n):
    data = []
    fs = HadoopFS()
    outputs = fs.cat('%s/hat_results/*' % job_id)
    if not outputs:
        return [], {}
    all_outputs = outputs
    if len(outputs) > top_n:
        outputs = outputs[-top_n:]
    outputs.reverse()
    sorted_uids = []
    all_uid_r = {}
    for line in all_outputs:
        uid, r = line.strip().split('\t')
        all_uid_r[uid] = r
    for line in outputs:
        uid, r = line.strip().split('\t')
        sorted_uids.append(uid)
    return sorted_uids, all_uid_r
Example #11
def save_to_tmp(job_id, data):
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    for key, value in data.iteritems():
        tmp_file.write('%s\t%s\n' % (key, value))
    tmp_file.flush()
    fs = HadoopFS()
    fs.rmr('%s' % job_id)
    fs.mkdir('%s' % job_id)
    fs.put(tmp_file.name, '%s/hat_init' % job_id)
    return tmp_file.name
Example #12
def main():
    job_id = 'hat_1'

    if (len(sys.argv) < 3):
        print 'Usage: python pagerank.py input_file iter_count'
        sys.exit()
    else:
        iter_count = int(sys.argv[2])
        input_file_name = sys.argv[1]

    fs = HadoopFS()
    #set work dir and put input file into file system
    fs.mkdir('%s' % job_id)
    fs.put(input_file_name, '%s/hat_init' % job_id)

    #init
    pr_iter = PageRankIter(input_path='%s/hat_init' % job_id, output_path='%s/hat_tmp1' % job_id)
    pr_iter.run()

    #iter
    for i in range(iter_count-1):
        pr_iter = PageRankIter(input_path='%s/hat_tmp%s' % (job_id, (i+1)), output_path='%s/hat_tmp%s' % (job_id, (i+2)))
        pr_iter.run()

    #sort
    pr_sorter = PageRankSorter(input_path='%s/hat_tmp%s' % (job_id, iter_count), output_path='%s/hat_results' % job_id)
    pr_sorter.run()

    #output and clean work dir
    try:
        outputs = fs.cat('%s/hat_results/*' % job_id)
        if len(outputs) > 100:
            outputs = outputs[-100:]
        for line in outputs:
            print line
    except Exception:
        raise
    finally:
        fs.rmr('%s' % job_id)
Example #13
#-*- coding: utf-8 -*-

from hat.fs import HadoopFS

fs = HadoopFS(debug=True)

print fs.mkdir('test')

print fs.put('test.txt', 'test/test_for_fs')

for line in fs.cat('test/test_for_fs/*'):
    print line

print fs.rmr('test')

for line in fs.ls():
    print line