Ejemplo n.º 1
0
    def puts(self, rowKeys, values, qualifier='1'):
        """Write several rows in one batch, bumping ``qualifier`` after each row.

        Each inner list of ``values`` becomes one row. The j-th item of a row
        is stored in column ``self.columnFamilies[j] + ':' + qualifier``.
        After every row the qualifier is replaced by
        ``str(int(qualifier) + 1)``, so it must be a decimal string.

        :param rowKeys: either a list holding one row key per entry of
            ``values``, or a single row key that is reused for every row
        :param values: 2-dimensional list, one inner list per row, e.g.
            ``[name, sex, age]``; items must be ``str`` (stored as given) or
            ``int`` (passed through the ``encode`` helper first)
        :param qualifier: column qualifier used for the first row
        :raises TypeError: if an item is neither ``str`` nor ``int``

        Usage::

        >>> HBaseTest().puts('test', [['lee', 'f', '27'], ['clark', 'm', 27], ['dan', 'f', '27']])

        """
        if not isinstance(rowKeys, list):
            rowKeys = [rowKeys] * len(values)

        mutationsBatch = []
        for i, value in enumerate(values):
            mutations = []
            for j, column in enumerate(value):
                if isinstance(column, str):
                    cell = column
                elif isinstance(column, int):
                    cell = encode(column)
                else:
                    # Without this branch an unsupported item would either
                    # crash on an unbound name or silently re-append the
                    # previous column's mutation.
                    raise TypeError(
                        'unsupported value type %s at row %d, column %d'
                        % (type(column).__name__, i, j))
                mutations.append(
                    Hbase.Mutation(column=self.columnFamilies[j] + ':' +
                                   qualifier,
                                   value=cell))

            # The next row is written under the next numeric qualifier.
            qualifier = str(int(qualifier) + 1)
            mutationsBatch.append(
                Hbase.BatchMutation(row=rowKeys[i], mutations=mutations))
        self.client.mutateRows(self.table, mutationsBatch, {})
Ejemplo n.º 2
0
    def puts(self, rowKeys, qualifier, values):
        """Write one cell per row, for several rows, in a single batch.

        Every entry of ``values`` is stored in its own row under the column
        ``self.columnFamilies[0] + ':' + qualifier``. Column name, row key and
        string values are UTF-8 encoded before being sent.

        :param rowKeys: either a list holding one row key per entry of
            ``values``, or a single row key that is reused for every row;
            keys must be ``str`` because they are encoded with ``.encode``
        :param qualifier: column qualifier appended to the first column family
        :param values: flat list of cell values; each must be ``str`` or
            ``int`` (ints are passed through the ``encode`` helper)
        :raises TypeError: if a value is neither ``str`` nor ``int``

        Usage::

        >>> HBaseTest('table').puts(rowKeys=['1', '2', '3'], qualifier="name", values=[1, 2, 3])

        """
        if not isinstance(rowKeys, list):
            rowKeys = [rowKeys] * len(values)

        mutationsBatch = []
        for i, value in enumerate(values):
            if isinstance(value, str):
                cell = value.encode('utf-8')
            elif isinstance(value, int):
                cell = encode(value)
            else:
                # Without this branch an unsupported value would either crash
                # on an unbound name or silently reuse the previous row's
                # mutation.
                raise TypeError(
                    'unsupported value type %s at index %d'
                    % (type(value).__name__, i))

            column = (self.columnFamilies[0] + ':' +
                      qualifier).encode('utf-8')
            mutationsBatch.append(
                Hbase.BatchMutation(
                    row=rowKeys[i].encode('utf-8'),
                    mutations=[Hbase.Mutation(column=column, value=cell)]))
        self.client.mutateRows(self.table, mutationsBatch, {})
Ejemplo n.º 3
0
    def write_hbase(data, table_name, ip, server_port):
        """
        Write records to HBase through a Thrift connection.

        One row is written per item, holding a single cell ``info:img_type``.
        The row key is the hexadecimal MD5 digest of ``item['url']``. All rows
        are sent in one ``mutateRows`` call after ``data`` has been consumed.

        :param data: iterable of records supporting ``item[key]`` lookup; each
            record must provide ``'url'`` and ``'img_type'``, both ``str``
            (``img_type`` is converted with ``bytes(..., encoding='utf-8')``)
        :param table_name: name of the target table, ``str`` or ``bytes``
        :param ip: host of the target Thrift server
        :param server_port: port of the target Thrift server
        """

        if not isinstance(table_name, bytes):
            table_name = bytes(table_name, encoding='utf-8')

        # Open the Thrift connection.
        transport = TTransport.TBufferedTransport(
            TSocket.TSocket(ip, server_port))
        client = Hbase.Client(TBinaryProtocol.TBinaryProtocol(transport))
        transport.open()

        # Close the transport even when building or sending the batch fails.
        try:
            result = []
            for item in data:
                img_type = bytes(item['img_type'], encoding='utf-8')
                row_key = bytes(hashlib.md5(item['url'].encode()).hexdigest(),
                                encoding='utf-8')
                mutations = [Mutation(column=b'info:img_type', value=img_type)]
                result.append(
                    Hbase.BatchMutation(row=row_key, mutations=mutations))

            client.mutateRows(table_name, result, None)
        finally:
            transport.close()
Ejemplo n.º 4
0
 def puts(self, rowkey, columnFamilies, values):
     """Write several rows to ``self.dbname`` in one batch.

     Each inner list of ``values`` becomes one row. The j-th item of a row is
     stored in column ``columnFamilies[j] + ':0'``.

     :param rowkey: either a list holding one row key per entry of
         ``values``, or a single row key that is reused for every row
     :param columnFamilies: column family names, indexed by item position
     :param values: 2-dimensional list, one inner list per row; items must
         be ``str`` (stored as given) or ``int`` (passed through the
         ``encode`` helper first)
     :return: ``True`` when the batch was sent, ``False`` when one of the
         caught HBase/Thrift errors occurred (it is logged and printed)
     :raises TypeError: if an item is neither ``str`` nor ``int``
     """
     try:
         # A list of row keys is used as given; previously this case left
         # ``rowKeys`` unassigned and failed with a NameError further down.
         if isinstance(rowkey, list):
             rowKeys = rowkey
         else:
             rowKeys = [rowkey] * len(values)

         mutationsBatch = []
         for i, value in enumerate(values):
             mutations = []
             for j, column in enumerate(value):
                 if isinstance(column, str):
                     cell = column
                 elif isinstance(column, int):
                     cell = encode(column)
                 else:
                     # Without this branch an unsupported item would crash
                     # on an unbound name or silently re-append the
                     # previous column's mutation.
                     raise TypeError(
                         'unsupported value type %s at row %d, column %d'
                         % (type(column).__name__, i, j))
                 mutations.append(
                     Hbase.Mutation(column=columnFamilies[j] + ':' + '0',
                                    value=cell))
             mutationsBatch.append(
                 Hbase.BatchMutation(row=rowKeys[i], mutations=mutations))
         self.client.mutateRows(self.dbname, mutationsBatch)
         return True
     except (Hbase.IOError, Hbase.TException, Hbase.TApplicationException,
             Hbase.IllegalArgument) as e:
         logInfo('puts')
         logInfo(e)
         print(e)
     return False
Ejemplo n.º 5
0
    def write_data_to_hbase(data, col_names, table_name, ip, server_port,
                            batch_size=1000):
        """
        Push the rows of an iterator to HBase in batches.

        According to the original author this is the worker function called
        from mapPartitions, receiving the RDD data as an iterator. Rows are
        buffered; whenever the buffer holds ``batch_size`` rows it is sent
        with ``mutateRows`` and cleared, and whatever remains at the end is
        sent as a final batch.

        :param data: iterator of rows; each row supports ``in`` and ``[key]``
            lookup, must contain ``"rowKey"`` and may contain the requested
            columns, all as ``str`` values
        :param col_names: names of the columns to push; a column is written
            for a row only when the row contains that name as a key
        :param table_name: name of the target table, ``str`` or ``bytes``
        :param ip: host of the target Thrift server
        :param server_port: port of the target Thrift server
        :param batch_size: number of buffered rows that triggers a push
        :return: always an empty list; the code never adds anything to it
        :raises ValueError: if ``batch_size`` is smaller than 1
        """
        print("start putDataAsPartition")
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        if not isinstance(table_name, bytes):
            table_name = bytes(table_name, encoding='utf-8')
        col_names = HBaseUtils().str_list_to_bytes_list(col_names)
        # Decode every column name once instead of once per row: the bytes
        # form is the HBase column, the str form is the key looked up in a row.
        columns = [(name, str(name, encoding='utf-8')) for name in col_names]

        # Open the HBase Thrift connection.
        transport = TSocket.TSocket(ip, server_port)
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = Hbase.Client(protocol)
        transport.open()

        return_data = []
        # Close the transport even when a row is malformed or a push fails.
        try:
            result = []
            for line in data:
                # Build the BatchMutation for this row.
                mutations_ = [
                    Mutation(column=name,
                             value=bytes(line[key], encoding='utf-8'))
                    for name, key in columns
                    if key in line
                ]
                result.append(
                    Hbase.BatchMutation(row=bytes(line["rowKey"],
                                                  encoding='utf-8'),
                                        mutations=mutations_))
                # Push once the buffer is full.
                if len(result) >= batch_size:
                    client.mutateRows(table_name, result, None)
                    result = []

            # Push whatever is left in the buffer.
            if result:
                client.mutateRows(table_name, result, None)
        finally:
            transport.close()
        return return_data
Ejemplo n.º 6
0
    def puts(self, rowKeys, values, qualifier='1'):
        """Write several rows in one batch, incrementing ``qualifier`` per row.

        The j-th item of each inner list in ``values`` is stored in column
        ``self.columnFamilies[j] + ':' + qualifier``. Once a row is built the
        qualifier becomes ``str(int(qualifier) + 1)``, so it has to be a
        decimal string.

        :param rowKeys: a list with one row key per entry of ``values``, or a
            single row key that is repeated for every row
        :param values: 2-dimensional list, one inner list per row; items must
            be ``str`` (stored as given) or ``int`` (passed through the
            ``encode`` helper first)
        :param qualifier: column qualifier used for the first row
        :raises TypeError: if an item is neither ``str`` nor ``int``
        """
        if not isinstance(rowKeys, list):
            rowKeys = [rowKeys] * len(values)

        mutationsBatch = []
        for i, value in enumerate(values):
            mutations = []
            for j, column in enumerate(value):
                if isinstance(column, str):
                    payload = column
                elif isinstance(column, int):
                    payload = encode(column)
                else:
                    # An unsupported item used to crash on an unbound name
                    # or silently duplicate the previous column's mutation.
                    raise TypeError(
                        'unsupported value type %s at row %d, column %d'
                        % (type(column).__name__, i, j))
                mutations.append(
                    Hbase.Mutation(column=self.columnFamilies[j] + ':' +
                                   qualifier,
                                   value=payload))

            # Each following row uses the next numeric qualifier.
            qualifier = str(int(qualifier) + 1)
            mutationsBatch.append(
                Hbase.BatchMutation(row=rowKeys[i], mutations=mutations))
        self.client.mutateRows(self.table, mutationsBatch, {})
Ejemplo n.º 7
0
def execute():
    """Write one sample row with two columns to ``tablename`` and print 'OK'.

    Connects to ``thriftServer``:``thriftPort`` through a
    ``TSaslClientTransport`` using ``saslServiceName``, sends a single
    ``BatchMutation`` for row ``'00001'`` and closes the transport again.
    The server, port, service name and table name are read from module-level
    names defined outside this function.
    """
    sock = TSocket.TSocket(thriftServer, thriftPort)
    transport = TTransport.TSaslClientTransport(sock, thriftServer,
                                                saslServiceName)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = Hbase.Client(protocol)
    transport.open()

    # Close the transport even when the write fails.
    try:
        mutations = [
            Hbase.Mutation(column="c:coluna1", value='Texto da coluna 1'),
            Hbase.Mutation(column="c:coluna2", value='Texto da coluna 2')
        ]
        mutationsbatch = [
            Hbase.BatchMutation(row='00001', mutations=mutations)
        ]

        # The empty dict is the (unused) mutation attributes argument.
        client.mutateRows(tablename, mutationsbatch, {})

        print('OK')
    finally:
        transport.close()
Ejemplo n.º 8
0
 # Create the table `tableName` with three column families:
 # 'info', 'contact' and 'others'.
 columnFamilies = []
 columnFamilies.append(Hbase.ColumnDescriptor(name='info'))
 columnFamilies.append(Hbase.ColumnDescriptor(name='contact'))
 columnFamilies.append(Hbase.ColumnDescriptor(name='others'))
 client.createTable(tableName, columnFamilies)
 # Collect one BatchMutation per row; the code that sends the batch is not
 # part of this fragment.
 mutationsbatch = []
 # Row '1': five cells spread over the three column families.
 mutations = [
     Hbase.Mutation(column='info:FULLNAME', value='Gustavo Achong'),
     Hbase.Mutation(column='info:AGE', value='38'),
     Hbase.Mutation(column='contact:EMAILID',
                    value='*****@*****.**'),
     Hbase.Mutation(column='contact:PHONE', value='398-555-0132'),
     Hbase.Mutation(column='others:MODIFIEDDATE',
                    value='5/16/2005 4:33:33 PM')
 ]
 mutationsbatch.append(Hbase.BatchMutation(row='1',
                                           mutations=mutations))
 # Row '2': same column layout as row '1'.
 mutations = [
     Hbase.Mutation(column='info:FULLNAME', value='Catherine Abel'),
     Hbase.Mutation(column='info:AGE', value='36'),
     Hbase.Mutation(column='contact:EMAILID',
                    value='*****@*****.**'),
     Hbase.Mutation(column='contact:PHONE', value='747-555-0171'),
     Hbase.Mutation(column='others:MODIFIEDDATE',
                    value='5/16/2005 4:33:33 PM')
 ]
 mutationsbatch.append(Hbase.BatchMutation(row='2',
                                           mutations=mutations))
 mutations = [
     Hbase.Mutation(column='info:FULLNAME', value='Kim Abercrombie'),
     Hbase.Mutation(column='info:AGE', value='38'),
     Hbase.Mutation(column='contact:EMAILID',
Ejemplo n.º 9
0
    def puts(self, records, job_id):
        """
        Bulk-insert records into HBase and write a progress checkpoint file.

        Every record becomes one row whose key is ``md5(id)[0:10] + ':' + id``
        (``id`` being the record's id field as a string). Every other field is
        stored as a string in column ``data:<field>``. After the batch is sent,
        ``self.put_num`` is increased by the number of rows and the file
        ``<job_id>.txt`` is overwritten, under ``self.file_lock``, with the
        ``str()`` of a dict describing the last record of the batch.

        :param records: list of records, each shaped like
            ``{'_id': '', 'field1': '', 'field2': ''}``; an empty list is a
            no-op (nothing is sent and the checkpoint file is left untouched)
        :param job_id: job type such as ``'mongodb:hb_charts'``; the part
            before the first ``':'`` selects the id field and the update
            column (``_id``/``last_updated`` for ``mongodb``,
            ``id``/``update_at`` for ``mysql``)
        :return: None
        """
        assert isinstance(records, list)
        if not records:
            # There is no last record to checkpoint; previously this case
            # truncated the checkpoint file and then raised IndexError.
            return

        source = job_id.split(':')[0]
        row_name = ''  # name of the field holding the row id
        log_column = ''  # field recorded in the checkpoint, e.g. update_at
        if source == 'mongodb':
            row_name = '_id'
            log_column = 'last_updated'
        elif source == 'mysql':
            row_name = 'id'
            log_column = 'update_at'

        # Per the original author, hbase.client.keyvalue.maxsize defaults to
        # 10M; larger values are replaced by the string 'None'.
        max_cell_size = 10 * 1024 * 1024

        mutations_batch = []
        for record in records:
            # The row key is md5(_id)[0:10]:_id
            _id = str(record[row_name])
            digest = hashlib.md5(bytes(_id, encoding="utf-8")).hexdigest()
            row_key = bytes(digest[0:10] + ':' + _id, encoding="utf-8")

            mutations = []
            for item in record:
                if item == row_name:
                    continue

                key = bytes('data:' + item, encoding="utf8")
                var = bytes(str(record[item]), encoding="utf8")
                if len(var) >= max_cell_size:
                    var = bytes(str(None), encoding="utf8")
                mutations.append(Hbase.Mutation(column=key, value=var))

            mutations_batch.append(
                Hbase.BatchMutation(row=row_key, mutations=mutations))

        self.client.mutateRows(self.table, mutations_batch, {})

        self.put_num += len(mutations_batch)

        with self.file_lock:
            last_record = records[-1]
            update = ''
            if source == 'mongodb':
                update = last_record[log_column]
            elif source == 'mysql':
                update = last_record[log_column].strftime(
                    '%Y-%m-%d %H:%M:%S')

            # Build the full checkpoint before opening the file, so a failure
            # here cannot leave a truncated checkpoint behind.
            checkpoint = {
                'date': time.strftime('%Y-%m-%d %H:%M:%S'),
                'job_id': job_id,
                'id': last_record[row_name],
                'update': update,
                'number': str(self.put_num),
            }
            with open(job_id + '.txt', 'w') as f:
                f.write(str(checkpoint))
Ejemplo n.º 10
0
# Load every file in sourceDir into HBase: one row per line of text, one
# mutateRows call per file.
for filename in os.listdir(sourceDir):
    # Create a list of mutations per work of Shakespeare
    mutationsbatch = []

    # The context manager closes each file once it has been read; previously
    # the handles were never closed.
    with open(os.path.join(sourceDir, filename), "rb") as shakespeare:
        for linenumber, line in enumerate(shakespeare):
            # Row key: <username>-<filename>-<zero-padded line number>
            rowkey = (username + "-" + filename + "-" +
                      str(linenumber).zfill(6))

            # Create an array containing all values for the Column Descriptors
            mutations = [
                Hbase.Mutation(column=messagecolumncf, value=line.strip()),
                Hbase.Mutation(column=linenumbercolumncf,
                               value=encode(linenumber)),
                Hbase.Mutation(column=usernamecolumncf, value=username)
            ]

            # Add the new mutations to the MutationsBatch list
            mutationsbatch.append(
                Hbase.BatchMutation(row=rowkey, mutations=mutations))

    # Run the mutations for the work of Shakespeare
    client.mutateRows(tablename, mutationsbatch)

transport.close()