Example #1
    def __push_fields(self, hdfs_host: str, fields: Dict[str, Field]):
        # Serialize the Field objects to JSON and write them to the given HDFS cluster.
        fs = HdfsClient(hdfs_host)
        fs.mkdirs('/'.join(self.fields_path.split('/')[:-1]))
        fs.delete(self.fields_path)
        dicted_fields = {k: self.field_to_dict(v) for k, v in fields.items()}
        fs.create(self.fields_path, json.dumps(dicted_fields))

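        # Mirror the fields to the C3 HDFS cluster: drop each vocabulary, keep its size as
        # max_vocab_index, and map the special-token attributes to their integer indices.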
        fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.user_name)
        fs.mkdirs('/'.join(self.c3_fields_path.split('/')[:-1]))
        fs.delete(self.c3_fields_path)
        c3_dicted_fields = {}
        for k, value in dicted_fields.items():
            if value['use_vocab']:
                max_vocab_index = len(value['vocab']['itos'])
                value['max_vocab_index'] = max_vocab_index
                value['dtype'] = str(torch.int64)
                vocab = value['vocab']
                for tok in self.FIELDS_TOKEN_ATTRS:
                    if value[tok]:
                        value[tok] = vocab['stoi'][value[tok]]
                value.pop('vocab')
                value['use_vocab'] = False
            else:
                value['max_vocab_index'] = 1
            c3_dicted_fields[k] = value
        fs.create(self.c3_fields_path, json.dumps(c3_dicted_fields))
Example #2
def cluster():
    try:
        cluster = ClickHouseCluster(__file__)
        cluster.add_instance(
            "node1",
            main_configs=["configs/config.d/storage_conf.xml"],
            macros={"replica": "node1"},
            with_zookeeper=True,
            with_hdfs=True,
        )
        cluster.add_instance(
            "node2",
            main_configs=["configs/config.d/storage_conf.xml"],
            macros={"replica": "node2"},
            with_zookeeper=True,
            with_hdfs=True,
        )
        logging.info("Starting cluster...")
        cluster.start()
        if cluster.instances["node1"].is_debug_build():
            # https://github.com/ClickHouse/ClickHouse/issues/27814
            pytest.skip(
                "libhdfs3 calls rand function which does not pass harmful check in debug build"
            )
        logging.info("Cluster started")

        fs = HdfsClient(hosts=cluster.hdfs_ip)
        fs.mkdirs("/clickhouse1")
        fs.mkdirs("/clickhouse2")
        logging.info("Created HDFS directory")

        yield cluster
    finally:
        cluster.shutdown()
Example #3
def ProcAll(LocalDir, HdfsDir):
    NameNode = GolobalConfig['hdfs']['NameNode']
    UserName = GolobalConfig['hdfs']['UserName']
    client = HdfsClient(hosts=NameNode, user_name=UserName)
    if not client.exists(HdfsDir):
        client.mkdirs(HdfsDir)
    total = len(os.listdir(LocalDir))
    processed = 0
    failedList = list()
    FileSize = 0
    StartTime = time.time()
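    # Upload every file in LocalDir to HdfsDir, collecting failures and reporting throughput.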
    for filename in os.listdir(LocalDir):
        srcFile = os.path.join(LocalDir, filename)
        dstFile = HdfsDir + '/' + filename
        if not ProcOne(client, srcFile, dstFile):
            failedList.append(srcFile)
        else:
            FileSize += os.path.getsize(srcFile)
        processed += 1
        print('%d/%d/%d, time cost: %.2f s' %
              (total, processed, len(failedList), time.time() - StartTime))
        print('%d B, %.2f MB/s \n' % (FileSize, FileSize / 1024 / 1024 /
                                      (time.time() - StartTime)))

    if failedList:
        print('failedList: %s' % repr(failedList))
        return False
    else:
        print('Good! No Error!')
        print('%d B, %.2f MB, %.2f GB, %.2f MB/s' %
              (FileSize, FileSize / 1024 / 1024, FileSize / 1024 / 1024 / 1024,
               FileSize / 1024 / 1024 / (time.time() - StartTime)))
        return True
Example #4
def cluster():
    try:
        cluster = ClickHouseCluster(__file__)
        cluster.add_instance("node", main_configs=["configs/config.d/storage_conf.xml"], with_hdfs=True)
        logging.info("Starting cluster...")
        cluster.start()
        logging.info("Cluster started")

        fs = HdfsClient(hosts=cluster.hdfs_ip)
        fs.mkdirs('/clickhouse')

        logging.info("Created HDFS directory")

        yield cluster
    finally:
        cluster.shutdown()
Example #5
def test_read_files_with_spaces(started_cluster):
    hdfs_api = started_cluster.hdfs_api

    fs = HdfsClient(hosts=started_cluster.hdfs_ip)
    dir = '/test_spaces'
    exists = fs.exists(dir)
    if exists:
        fs.delete(dir, recursive=True)
    fs.mkdirs(dir)

    hdfs_api.write_data(f"{dir}/test test test 1.txt", "1\n")
    hdfs_api.write_data(f"{dir}/test test test 2.txt", "2\n")
    hdfs_api.write_data(f"{dir}/test test test 3.txt", "3\n")

    node1.query(f"create table test (id UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{dir}/test*', 'TSV')")
    assert node1.query("select * from test order by id") == "1\n2\n3\n"
    fs.delete(dir, recursive=True)
Example #6
def test_hdfsCluster(started_cluster):
    hdfs_api = started_cluster.hdfs_api
    fs = HdfsClient(hosts=started_cluster.hdfs_ip)
    dir = '/test_hdfsCluster'
    exists = fs.exists(dir)
    if exists:
        fs.delete(dir, recursive=True)
    fs.mkdirs(dir)
    hdfs_api.write_data("/test_hdfsCluster/file1", "1\n")
    hdfs_api.write_data("/test_hdfsCluster/file2", "2\n")
    hdfs_api.write_data("/test_hdfsCluster/file3", "3\n")

    actual = node1.query("select id, _file as file_name, _path as file_path from hdfs('hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') order by id")
    expected = "1\tfile1\thdfs://hdfs1:9000/test_hdfsCluster/file1\n2\tfile2\thdfs://hdfs1:9000/test_hdfsCluster/file2\n3\tfile3\thdfs://hdfs1:9000/test_hdfsCluster/file3\n"
    assert actual == expected

    actual = node1.query("select id, _file as file_name, _path as file_path from hdfsCluster('test_cluster_two_shards', 'hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') order by id")
    expected = "1\tfile1\thdfs://hdfs1:9000/test_hdfsCluster/file1\n2\tfile2\thdfs://hdfs1:9000/test_hdfsCluster/file2\n3\tfile3\thdfs://hdfs1:9000/test_hdfsCluster/file3\n"
    assert actual == expected
    fs.delete(dir, recursive=True)
Example #7
    def __push_preprocessed(self, c3_path: str, user_name: str, dataset: Dataset):
        def push_to_hdfs(jstrs):
            if not fs.exists(c3_path):
                fs.create(c3_path, '\n'.join(jstrs) + '\n')
            else:
                fs.append(c3_path, '\n'.join(jstrs) + '\n')

        fs = HdfsClient(self.C3_HDFS_HOST, user_name=user_name)
        fs.mkdirs('/'.join(c3_path.split('/')[:-1]))
        fs.delete(c3_path)
        jstrs = []
        BUFSIZE = 2048
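        # Serialize each preprocessed example to JSON and flush to HDFS in chunks of BUFSIZE lines.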
        for fxed_instance in tqdm(Iterator(dataset, batch_size=1), maxinterval=len(dataset.examples)):
            fxed_instance_dict = {name: getattr(fxed_instance, name).tolist()[0] for name in self.fields.keys()}
            jstrs.append(json.dumps(fxed_instance_dict))
            if len(jstrs) >= BUFSIZE:
                push_to_hdfs(jstrs)
                jstrs = []

        if jstrs:
            push_to_hdfs(jstrs)
Example #8
def start():
    # Connect to MongoDB, look up the tokens, then query etherscan by contractAddress for the latest data
    client = MongoCluster().connect()
    db = client.get_database('gse-transaction')
    collection = db.get_collection('mrout_6000001-6001000')
    # collection.insert_one()

    # Connect to HDFS and read files
    from pyhdfs import HdfsClient
    client2 = HdfsClient(hosts='%s:50070' % hdfs_ip, max_tries=10)  # namenode webhdfs address (host:port)
    # Print this user's home directory
    print(client2.get_home_directory())
    # Print the currently active namenode
    print(client2.get_active_namenode())
    # List all files under the given directory
    print(client2.listdir("/user/leon/mrout_3_6000001-6001000/"))
    # Read one file
    client2.mkdirs("/user/leon")
    inputfile = client2.open('/user/leon/mrout_3_6000001-6001000/part-00000')
    # Print the file contents
    for r in inputfile:
        line = r.decode('utf-8')  # open() yields raw bytes; decode each line to a UTF-8 string
        print(line)
Example #9
    def _setup_walk(self, client: HdfsClient) -> Callable[..., str]:
        def path(*args: str) -> str:
            return posixpath.join(TEST_DIR, *args)

        self._make_empty_dir(client)
        client.create(path("f1"), b"")
        client.mkdirs(path("a1", "b1"))
        client.create(path("a1", "b1", "f2"), b"")
        client.mkdirs(path("a1", "b2"))
        client.mkdirs(path("a2"))
        return path
Example #10
    def _make_empty_dir(self, client: HdfsClient) -> None:
        # Get an empty dir
        client.delete(TEST_DIR, recursive=True)
        assert not client.delete(TEST_DIR, recursive=True)
        assert client.mkdirs(TEST_DIR)
Example #11
            if x['type'] == 'DIRECTORY'
        ]
        list_dirs.sort()
        for subdir in list_dirs:
            dir_subdata = os.path.join(dir_dataroot, subdir)
            logger.debug('data path : %s' % dir_subdata)
            dir_subdata_cleaned = os.path.join(dir_subdata, 'cleaned4netsec')
            logger.debug('data path for cleaned files : %s' %
                         dir_subdata_cleaned)
            list_subdir_date = [
                x['pathSuffix'] for x in client.list_status(dir_subdata)
                if x['type'] == 'FILE'
            ]
            if len(list_subdir_date) > 0:
                if not client.exists(dir_subdata_cleaned):
                    client.mkdirs(dir_subdata_cleaned)
                    logger.debug('mkdir dir for cleaned files : %s' %
                                 dir_subdata_cleaned)

            list_subdir_date_cleaned = [
                x['pathSuffix']
                for x in client.list_status(dir_subdata_cleaned)
                if x['type'] == 'FILE'
            ]

            list_subdir_date.sort()
            for fname in list_subdir_date:
                if fname in list_subdir_date_cleaned:
                    #                     #TODO: to debug
                    #                     if client.exists(os.path.join(dir_subdata_cleaned, fname)):
                    #                         print (os.path.join(dir_subdata_cleaned, fname))
Example #12
class hdfs(object):
    # Defaults to the webhdfs port 50070
    def __init__(self, cur_database_param):
        # super(HdfsClients, self).__init__()
        # self.quert_db_info = super(HdfsClients, self).getDBConfig()
        # self.hdfsHost=self.quert_db_info["host"]
        hdfsHost = cur_database_param['url']
        path = cur_database_param['dbname']
        self.hdfs = HdfsClient(hosts='{hdfs_host}'.format(hdfs_host=hdfsHost))
        self.host = hdfsHost
        self.path = path

    def append(self, path, data):
        self.hdfs.append(path, data)

    def concat(self, target, sources):
        self.hdfs.concat(target, sources)

    # self, taskJobId,tableName=None,jobTemplateFieldList=None
    def createTableByTaskJobId(self,
                               taskJobId,
                               tableName=None,
                               jobTemplateFieldList=None,
                               data=None):
        if tableName is None:
            taskJob = TaskJobDao.loadTaskById(taskJobId)
            tableName = taskJob.tableName
        path = self.path + '/' + tableName
        self.hdfs.create(path, data, replication=2)

    def hmkdirs(self, path):
        self.hdfs.mkdirs(path)

    def open(self, path):
        return self.hdfs.open(path=path)

    def delete(self, path):
        self.hdfs.delete(path=path)

    def listdir(self, rule):
        f = self.hdfs.listdir(rule)
        return f

    def insert(self, jobid, tablename, column_dict, paramMap=None):
        if tablename is None:
            taskJob = TaskJobDao.loadTaskById(jobid)
            tablename = taskJob.tableName
        path = self.path + '/' + tablename
        createTime = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time()))
        task_job_id_sequenceValue = paramMap.get(
            "task_job_id_sequence") if paramMap is not None else None
        if task_job_id_sequenceValue is not None:
            column_dict.update(
                {"task_job_id_sequence": str(task_job_id_sequenceValue)})
        column_dict.update({
            "task_job_del_flag": "False",
            "task_job_create_time": createTime
        })
        # self.append(path, column_dict)
        if self.isTableExist(tablename):
            self.append(path, column_dict)
        else:
            self.createTableByTaskJobId(jobid, tablename, column_dict)
        # return column_dict

    def isTableExist(self, tablename):
        path = self.path + '/' + tablename
        exist = self.hdfs.exists(path)
        return exist

    def save_to_hdfs(self, jobid, path, data):
        if self.isTableExist(path):
            self.append(path, data)
        else:
            self.createTableByTaskJobId(jobid, path, data)

    def save_to_hdfs2(self, path, data):
        if self.hdfs.exists(path):
            self.hdfs.append(path, data)
        else:
            self.hdfs.create(path, data, replication=2)

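    # Dispatch one or more named operations (e.g. "append") to the matching method on this wrapper.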
    def execute(self, sqls="append", path=None, data=None):
        try:
            if isinstance(sqls, list) and len(sqls) > 0:
                for sql in sqls:
                    # method = eval(sql)
                    method = getattr(self, sql)
                    method(path, data)
            else:
                # method = eval(sqls)
                method = getattr(self, sqls)
                method(path, data)
        except Exception as e:
            logging.error("hdfs,execute," + str(e))
            raise Exception()