Example 1
from pyhdfs import HdfsClient


class HDFSService:

    namenode_host = "localhost"
    namenode_port = "9870"
    root_folder = "/"
    chunk_size = 100000

    def __init__(self):
        self._client = HdfsClient(hosts=self.namenode_host + ":" + self.namenode_port, user_name="root")

    def get(self, hdfs_path: str):
        file_size = self.get_file_size(hdfs_path)
        for i in range(0, file_size, self.chunk_size):
            # length is the number of bytes to read per chunk, not the end offset
            file_response = self._client.open(hdfs_path, offset=i, length=self.chunk_size)
            yield file_response.read()
        
    def append(self, hdfs_path: str, data: bytes):
        self.create_if_not_exist(hdfs_path)
        self._client.append(hdfs_path, data)
    
    def create_if_not_exist(self, hdfs_path: str):
        if not self._client.exists(hdfs_path):
            self._client.create(hdfs_path, b"")

    def get_messages_number(self, hdfs_path: str):
        # approximate number of chunks get() will yield for this file
        return int(self.get_file_size(hdfs_path) / self.chunk_size + 1)

    def get_file_size(self, hdfs_path):
        file_infos = self._client.get_content_summary(hdfs_path)
        return file_infos.length

    def test(self):
        pass
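
A minimal usage sketch for the class above, assuming a NameNode reachable at localhost:9870, append support enabled on the cluster, and a hypothetical path /data/messages.log; the get() generator streams the file chunk by chunk.

# usage sketch: the path below is hypothetical
service = HDFSService()

# append() creates the file first if it does not exist yet
service.append("/data/messages.log", b"hello from pyhdfs\n")

# stream the file back chunk by chunk instead of loading it all at once
for chunk in service.get("/data/messages.log"):
    print(len(chunk), "bytes read")

print("chunks expected:", service.get_messages_number("/data/messages.log"))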
Example 2
from pyhdfs import HdfsClient


# read_csv here is assumed to be a project-local CSV helper (its keyword
# arguments do not match pandas.read_csv)
def read_hdfs(filename,
              host,
              split_ratio,
              delimiter=',',
              normalize=False,
              dtype=None,
              header=None,
              skiprows=None,
              index_col=False,
              output_label=True,
              randomize=False,
              return_as_dataframe=False,
              describe=False,
              label_vector=False):
    client = HdfsClient(hosts=host)
    return read_csv(filename=client.open(filename),
                    split_ratio=split_ratio,
                    delimiter=delimiter,
                    normalize=normalize,
                    dtype=dtype,
                    header=header,
                    skiprows=skiprows,
                    index_col=index_col,
                    output_label=output_label,
                    randomize=randomize,
                    return_as_dataframe=return_as_dataframe,
                    describe=describe,
                    label_vector=label_vector)
Example 3
    def __load_corpus_from_hdfs(self, hdfs_host: str) -> List:
        fs = HdfsClient(hdfs_host)
        with fs.open(self.corpus_path) as fp:
            corpus = list()
            for line in tqdm(fp.read().decode().split('\n')):
                if line:
                    d = json.loads(line)
                    corpus.append(d)
        return corpus
Example 4
    def load_fields_with_vocab(self, hdfs_host: str) -> Dict[str, Field]:
        fs = HdfsClient(hdfs_host)
        if fs.exists(self.fields_path):
            print(f'get fields from {hdfs_host}{self.fields_path}')
        else:
            raise Exception(f'there are no fields in {hdfs_host}{self.fields_path}')

        loaded_dict = json.loads(fs.open(self.fields_path).read())
        return {k: self.dict_to_field(v) for k, v in loaded_dict.items()}
Example 5
    def load_model(self, train_dir, modelnum, appendix):
        print('~' * 100)
        c3_path = f'/user/{self.username}/fortuna/model/{train_dir}_{modelnum}/model_e{appendix}'
        print(c3_path)
        fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.username)
        model_pickle = fs.open(c3_path)
        model_dict = pickle.load(model_pickle)
        self.model.load_state_dict(model_dict)
        acc_lst, total, prec, recall, f1score, f1s, rocauc = self.eval(self.test_iter, len(self.task.te_dataset))
        print('~' * 100)
Example 6
    def load_fields_from_c3(self) -> Tuple[Dict[str, Field], Dict[str, int]]:
        fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.user_name)
        if fs.exists(self.c3_fields_path):
            print(f'get fields from {self.C3_HDFS_HOST}{self.c3_fields_path}')
        else:
            raise Exception(f'there are no fields in {self.C3_HDFS_HOST}{self.c3_fields_path}')
        loaded_dict = json.loads(fs.open(self.c3_fields_path).read())
        print(loaded_dict)
        max_vocab_indexes = {k: v['max_vocab_index'] for k, v in loaded_dict.items()}
        return {k: self.dict_to_field(v) for k, v in loaded_dict.items()}, max_vocab_indexes
Example 7
    def load_matrix(self, filepath, shape=None):
        if os.environ['local'] == '1' and os.path.exists(filepath):
            return np.loadtxt(filepath, dtype=float)
        else:
            hosts = os.environ['hosts']
            if len(hosts) == 0:
                hosts = 'master'
            client = HdfsClient(hosts=hosts)
            if client.exists(filepath):
                # the HDFS file holds the raw float64 buffer of the matrix
                return np.frombuffer(
                    client.open(filepath).read()).reshape(shape)
        return False
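
For context, load_matrix above expects the HDFS file to contain the raw float64 buffer of the matrix; a minimal hedged sketch of the writing side could look like the following (the host 'master' matches the loader's fallback, while the path /models/weights.bin is a placeholder).

import numpy as np
from pyhdfs import HdfsClient

# sketch only: the path below is hypothetical
client = HdfsClient(hosts='master')

matrix = np.arange(12, dtype=float).reshape(3, 4)

# store the raw float64 buffer; load_matrix reads it back and reshapes it with the same shape
client.create('/models/weights.bin', matrix.tobytes(), overwrite=True)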
Example 8
    def _load_preprocessed(self) -> List[Example]:
        fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.user_name)
        if fs.exists(self.c3_path):
            print(f'get preprocessed corpus from {self.C3_HDFS_HOST}{self.c3_path}')
        else:
            raise Exception(f'there is no preprocessed corpus in {self.C3_HDFS_HOST}{self.c3_path}')

        preprocessed = []
        for line in fs.open(self.c3_path).read().decode().split('\n'):
            if line:
                ex = Example()
                for k, v in json.loads(line).items():
                    setattr(ex, k, v)
                preprocessed.append(ex)
        return preprocessed
Example 9
# assumed imports for this excerpt; Vectors is taken here from pyspark.ml.linalg
import numpy as np
import pandas as pd
from pyhdfs import HdfsClient
from pyspark.ml.linalg import Vectors


def read_csv():
    client = HdfsClient(hosts='master33:50070', user_name='hadoop')
    inputfile = client.open('/pdfs/output.csv')
    df = pd.read_csv(inputfile)

    print("read done")

    # convert each vector string into a dense vector
    def transfer_vectorStr_to_vector(df):
        for i in range(0, len(df['vector'])):
            array = np.fromstring(
                df['vector'][i].replace('[', '').replace(']', ''),
                dtype=np.double, sep='  ')

            df.at[i, 'vector'] = Vectors.dense(array)
        return df

    df = transfer_vectorStr_to_vector(df)

    return df
Example 10
def start():
    # connect to MongoDB, query tokens, then fetch the latest data from etherscan by contractAddress
    client = MongoCluster().connect()
    db = client.get_database('gse-transaction')
    collection = db.get_collection('mrout_6000001-6001000')
    # collection.insert_one()

    # connect to HDFS and read files
    from pyhdfs import HdfsClient
    client2 = HdfsClient(hosts='%s:50070' % hdfs_ip, max_tries=10)
    # returns this user's home directory
    print(client2.get_home_directory())
    # returns the active namenode
    print(client2.get_active_namenode())
    # returns all files under the given directory
    print(client2.listdir("/user/leon/mrout_3_6000001-6001000/"))
    client2.mkdirs("/user/leon")
    # read one file
    inputfile = client2.open('/user/leon/mrout_3_6000001-6001000/part-00000')
    # view the file contents
    for r in inputfile:
        line = str(r, encoding='utf-8')  # open() yields bytes; str() converts and decodes each line
        print(line)
Example 11
            list_subdir_date.sort()
            for fname in list_subdir_date:
                if fname in list_subdir_date_cleaned:
                    #                     #TODO: to debug
                    #                     if client.exists(os.path.join(dir_subdata_cleaned, fname)):
                    #                         print (os.path.join(dir_subdata_cleaned, fname))
                    #                         client.delete(os.path.join(dir_subdata_cleaned, fname))
                    logger.debug('has been cleaned, ignore this file : %s' %
                                 fname)
                    continue
                s_guapairiqi = public.parse_datetime(
                    fname, format_from='%Y-%m-%d-%H-%M.txt')
                f_fullname = os.path.join(dir_subdata, fname)
                logger.debug('doing file : %s' % f_fullname)
                f = client.open(f_fullname)
                try:
                    f_context = f.read().decode('gbk')
                except UnicodeDecodeError as e:
                    logger.error('decode error : %s' % f_fullname)
                    logger.error(e)
                    dir_error = os.path.join(dir_subdata, 'error_cleaning')
                    if not client.exists(dir_error):
                        client.mkdirs(dir_error)
                        logger.debug('mkdir dir for error files : %s' %
                                     dir_error)
                    #TODO: if success delete error files
                    fname_error = os.path.join(dir_error, fname)
                    if not client.exists(fname_error):
                        client.create(fname_error, None)
                        logger.warn('create error flag file : %s' % fname_error)
Example 12
import pandas as pd
import numpy as np
import h5py
from hdfs.client import Client
from pyhdfs import HdfsClient
'''
import pyarrow as pa
pa.hdfs.connect(host='192.168.0.186',port=9870,user='******')
#pa.hdfs.connect()
'''

client = HdfsClient(hosts='192.168.0.186:9870', user_name='yanrujing')

a1 = client.open('/r2/test/transformed.h5')
#a1=client.open('/r2/userData/6c0f5b62c3624f6bad70b0a3066e9085/1/csv_header.csv')
b1 = a1.read()

c1 = h5py.File('/home/chen/桌面/transformed.h5')
#http://192.168.0.186:9870/explorer.html#/r2/test/transformed.h5
'''
client2 = Client(url="http://192.168.0.186:9870",root='yanrujing')
# client2.read('/r2/userData/6c0f5b62c3624f6bad70b0a3066e9085/1/csv_header.csv')
# a2=client2.read('/r2/userData/6c0f5b62c3624f6bad70b0a3066e9085/1/csv_header.csv')

with client2.read('/r2/userData/6c0f5b62c3624f6bad70b0a3066e9085/1/csv_header.csv') as reader:
    #a=pd.read_csv(reader)
    content = reader.read()
'''

a1 = open('/home/chen/桌面/transformed.h5', 'rb')
b1 = h5py.File(a1, 'r')
Example 13
# open a file stored on HDFS
from pyhdfs import HdfsClient
client = HdfsClient(hosts='ghym:50070', user_name='hadoop')
inputfile = client.open('/score.txt')
# convert it to CSV format
import pandas as pd
df = pd.read_table(inputfile, encoding='gbk', sep=',')
df.to_csv('demo.csv', encoding='gbk', index=None, columns=['用户名', '电影名', '评分'])
Example 14
from pyhdfs import HdfsClient
import pandas as pd

client = HdfsClient(hosts="172.16.18.114:50070,172.16.18.112:50070",
                    user_name='hadoop')

# pyhdfs expects a plain absolute HDFS path rather than an hdfs:// URI
path = '/a3bd481c98f44dde842367b7ebaef4a6/dataSet/vbap8482a43e3883413f8345344efa6b53ec/data/Data.csv'
head = client.open(path)
data = pd.read_csv(head)
print(data)
Example 15
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from pyhdfs import HdfsClient
'''
Connect from Python to Hadoop's HDFS filesystem to upload and download files.
'''

# read a file from the HDFS filesystem

# HDFS address
# client = HdfsClient(hosts='192.168.1.163:50070')
client = HdfsClient(hosts='192.168.1.156:50070')

print(client.listdir("/repo/"))

res = client.open('/repo/README.txt')
for r in res:
    line = str(r, encoding='utf-8')  # open() yields bytes; str() converts and decodes each line
    print(line)

client = HdfsClient(hosts='192.168.1.156:50070',
                    user_name='hadoop')  # only the hadoop user has write permission
str1 = 'hello world'
client.create('/py.txt', str1)  # create a new file and write the string to it

# upload a local file to HDFS

# client = HdfsClient(hosts='hacker:50070', user_name='root')
# absolute local file path; the HDFS destination directory must not already exist
# client.copy_from_local('D:/PythonProjects/crawl_work/thread_crawl_work02', '/usr/hadoop/')
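
The upload hinted at above can be rounded out with PyHDFS's copy_from_local and copy_to_local; this is only a sketch: the host and user reuse the values from the snippet, while the file paths are placeholders.

from pyhdfs import HdfsClient

client = HdfsClient(hosts='192.168.1.156:50070', user_name='hadoop')

# upload: the HDFS destination path must not exist yet
client.copy_from_local('/tmp/local_report.txt', '/repo/report.txt')

# download back to the local filesystem
client.copy_to_local('/repo/report.txt', '/tmp/report_copy.txt')

# remove the file on HDFS when done
client.delete('/repo/report.txt')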
Example 16
#!/usr/bin/env python3
# _*_ coding: utf-8 _*_
"""
    Read the file contents from HDFS
    @see:https://pypi.org/project/PyHDFS/
"""

__author__ = 'JKong'

import pandas as pd
from pyhdfs import HdfsClient

client = HdfsClient(hosts='10.10.27.47:9870', user_name="hdfs")
# TypeError: cannot use a string pattern on a bytes-like object
# read the file from HDFS
file = client.open(r"/a_jkong_test_data/1000.txt")
# fetch the contents
content = file.read()
# open() returns bytes; str() converts and decodes them
s = str(content, "utf-8")
# write the contents to a local .csv file and close it before pandas reads it back
with open("./data/data.csv", "w") as out_file:
    out_file.write(s)
# read the local csv file with pandas
train_data = pd.read_csv("./data/data.csv",
                         sep=",",
                         header=None,
                         usecols=[0, 1, 2, 3, 4],
                         names=['id', 'name', 'age', 'gender', 'time'])
print(train_data)
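
As a variation on the example above, the intermediate local file can be skipped, since pandas accepts the file-like object returned by client.open() directly (as Examples 13 and 14 also do); this sketch reuses the same host, user and path.

import pandas as pd
from pyhdfs import HdfsClient

client = HdfsClient(hosts='10.10.27.47:9870', user_name="hdfs")

# pandas reads straight from the file-like object returned by open(),
# so no temporary local copy is needed
remote_file = client.open(r"/a_jkong_test_data/1000.txt")
train_data = pd.read_csv(remote_file,
                         sep=",",
                         header=None,
                         usecols=[0, 1, 2, 3, 4],
                         names=['id', 'name', 'age', 'gender', 'time'])
print(train_data)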
Example 17
class hdfs(object):
    # default port 50070
    def __init__(self, cur_database_param):
        # super(HdfsClients, self).__init__()
        # self.quert_db_info = super(HdfsClients, self).getDBConfig()
        # self.hdfsHost=self.quert_db_info["host"]
        hdfsHost = cur_database_param['url']
        path = cur_database_param['dbname']
        self.hdfs = HdfsClient(hosts='{hdfs_host}'.format(hdfs_host=hdfsHost))
        self.host = hdfsHost
        self.path = path

    def append(self, path, data):
        self.hdfs.append(path, data)
        pass

    def concat(self, target, sources):
        self.hdfs.concat(target, sources)

    # self, taskJobId,tableName=None,jobTemplateFieldList=None
    def createTableByTaskJobId(self,
                               taskJobId,
                               tableName=None,
                               jobTemplateFieldList=None,
                               data=None):
        if tableName == None:
            taskJob = TaskJobDao.loadTaskById(taskJobId)
            tableName = taskJob.tableName
        path = self.path + '/' + tableName
        self.hdfs.create(path, data, replication=2)

    def hmkdirs(self, path):
        self.hdfs.mkdirs(path)

    def open(self, path):
        return self.hdfs.open(path=path)

    def delete(self, path):
        self.hdfs.delete(path=path)

    def listdir(self, rule):
        f = self.hdfs.listdir(rule)
        return f

    def insert(self, jobid, tablename, column_dict, paramMap=None):
        if tablename == None:
            taskJob = TaskJobDao.loadTaskById(jobid)
            tablename = taskJob.tableName
        path = self.path + '/' + tablename
        createTime = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time()))
        task_job_id_sequenceValue = paramMap.get(
            "task_job_id_sequence") if paramMap != None else None
        if task_job_id_sequenceValue != None:
            column_dict.update(
                {"task_job_id_sequence": str(task_job_id_sequenceValue)})
        column_dict.update({
            "task_job_del_flag": "False",
            "task_job_create_time": createTime
        })
        # self.append(path, column_dict)
        if self.isTableExist(tablename):
            self.append(path, column_dict)
        else:
            self.createTableByTaskJobId(jobid, tablename, column_dict)
        # return column_dict

    def isTableExist(self, tablename):
        path = self.path + '/' + tablename
        exist = self.hdfs.exists(path)
        return exist

    def save_to_hdfs(self, jobid, path, data):
        if self.isTableExist(path):
            self.append(path, data)
        else:
            self.createTableByTaskJobId(jobid, path, data)

    def save_to_hdfs2(self, path, data):
        if self.hdfs.exists(path):
            self.hdfs.append(path, data)
        else:
            self.hdfs.create(path, data, replication=2)

    def execute(self, sqls="append", path=None, data=None):
        try:
            if isinstance(sqls, list) and len(sqls) > 0:
                for sql in sqls:
                    # method = eval(sql)
                    method = getattr(self, sql)
                    method(path, data)
            else:
                # method = eval(sqls)
                method = getattr(self, sqls)
                method(path, data)
        except Exception as e:
            logging.error("hdfs,execute," + str(e))
            raise Exception()
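
A brief, hedged usage sketch for the wrapper class above, exercising only the path-based helpers visible here (the TaskJobDao-backed methods are skipped); the host and paths are hypothetical, and the 'url'/'dbname' keys follow the constructor's expectations.

# hypothetical configuration: 'url' is the namenode host:port, 'dbname' the base HDFS directory
store = hdfs({'url': '192.168.1.156:50070', 'dbname': '/data/tasks'})

store.hmkdirs('/data/tasks')

# save_to_hdfs2 creates the file on first write and appends afterwards
store.save_to_hdfs2('/data/tasks/records.txt', 'first line\n')
store.save_to_hdfs2('/data/tasks/records.txt', 'second line\n')

print(store.listdir('/data/tasks'))
print(store.open('/data/tasks/records.txt').read())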
Example 18
for day in days_path:
    hours = client.listdir(day)
    for hour in hours:
        path = day + '/' + str(hour)
        #print path
        hours_path.append(path)

minutes_path = []
for path in hours_path:
    minutes = client.listdir(path)
    if len(minutes) != 60:
        print('[ INFO ] Incomplete minutes (less than 60) in path: ' + str(path) + ' (Count = ' + str(len(minutes)) + ')')
    
    for minute in minutes:
        path_out = path + '/' + str(minute)
        #print path_out
        minutes_path.append(path_out)

counter = 0
len_contents_list = []
for path in minutes_path:
    # count the log lines in each minute file (open() returns bytes, so decode first)
    contents = client.open(path).read().decode()
    len_contents = len(re.sub('\n$', '', contents).split('\n'))
    len_contents_list.append(len_contents)
    counter += len_contents


print('\n\nTotal number of logs: ' + str(counter) + '\n\n')


#ZEND