Ejemplo n.º 1
0
 def getBatchMutations(cf, qualifiers, tups):
     """Build one BatchMutation per tuple in ``tups``.

     Element 0 of every tuple is used as the row key.  For each index ``i``
     from 1 up to ``len(qualifiers) - 1`` a Mutation is created that writes
     ``tup[i]`` into column ``"<cf>:<qualifiers[i]>"``; ``qualifiers[0]`` is
     never used as a column name.

     Returns the BatchMutation objects in the same order as ``tups``.
     """
     column_indexes = range(1, len(qualifiers))
     result = []
     for record in tups:
         cells = [
             Mutation(column="%s:%s" % (cf, qualifiers[idx]),
                      value=record[idx])
             for idx in column_indexes
         ]
         result.append(BatchMutation(record[0], cells))
     return result
Ejemplo n.º 2
0
    def writeAll(self, infos):
        """Write several lines to HBase in a single timestamped batch.

        Each element of ``infos`` is a string of the form ``key|value``.
        Both halves are backtick-separated lists of ``name=value`` pairs.
        The key half must contain ``jkid`` and ``time``; the row key is
        ``"<jkid>_<first 12 characters of time>"``.  Every pair of the value
        half becomes a Mutation on column ``"info:<name>"``.

        All mutations share one timestamp: the largest ``time`` seen across
        the lines, parsed with the format ``%Y%m%d%H%M%S``.  Nothing is sent
        when ``infos`` is empty.

        Args:
            infos: iterable of line strings as described above.

        Returns:
            None.

        Raises:
            ValueError: if ``infos`` is None, or a line / pair is malformed.
            KeyError: if a line's key half lacks ``jkid`` or ``time``.
        """
        if infos is None:
            raise ValueError("info is None")

        rowBatches = []
        theBigestTime = None
        for info in infos:
            # Split on the first '|' only, so a '|' inside the value half no
            # longer breaks the two-name unpacking.
            key, value = info.split('|', 1)
            # maxsplit=1 keeps any '=' that is part of a value.
            kvMap = dict(kv.split('=', 1) for kv in key.split('`'))

            jkid = kvMap['jkid']
            jktime = kvMap['time']
            rowKey = "%s_%s" % (jkid, jktime[0:12])

            cols = []
            for kv in value.split('`'):
                k, v = kv.split('=', 1)
                cols.append(Mutation(column="info:%s" % k, value=v))

            batchMutation = BatchMutation()
            batchMutation.row = rowKey
            batchMutation.mutations = cols
            rowBatches.append(batchMutation)

            # Explicit None check replaces cmp(), which only exists on
            # Python 2 (where None compared lower than any string).
            if theBigestTime is None or theBigestTime < jktime:
                theBigestTime = jktime

        if theBigestTime is not None:
            # mktime() returns a float; the value is a whole number of
            # seconds, so int() loses nothing and yields an integer timestamp.
            timestamp = int(
                time.mktime(time.strptime(theBigestTime, "%Y%m%d%H%M%S")))
            self.client.mutateRowsTs(self.tableName, rowBatches, timestamp)
Ejemplo n.º 3
0
    def insert_rows_data(self, table_name, data):
        """Write several rows to ``table_name`` with one ``mutateRows`` call.

        Args:
            table_name: name of the target table, passed through to the
                client.
            data: mapping ``row key -> {column: value}``.  Rows whose column
                mapping is empty/falsy are skipped.  Every value is
                JSON-encoded with ``json.dumps`` before being written.

        Returns:
            True when the batch was sent successfully.  False when there was
            nothing to write, or when the write failed and ``self.exception``
            is falsy (the error is logged in that case).

        Raises:
            Exception: whatever ``mutateRows`` raised, when
                ``self.exception`` is truthy.
        """
        allmutation = []
        for row in data:
            columns = data[row]
            if not columns:
                continue
            one_row = [
                Mutation(column=column, value=json.dumps(columns[column]))
                for column in columns
            ]
            allmutation.append(BatchMutation(row, one_row))

        if not allmutation:
            logging.info('insert data null')
            return False

        try:
            self.client.mutateRows(table_name, allmutation)
        except Exception as e:
            if self.exception:
                # Bare raise keeps the original traceback so the caller can
                # handle the error with full context.
                raise
            logging.error('insert_rows_data data error : %s', e)
            return False
        return True
Ejemplo n.º 4
0
def input_to_es(table, ops):
    """Fill ``table`` with randomly generated statistics rows.

    For every date returned by ``get_dates(ops.start, int(ops.months))`` and
    every ``(provider, no)`` pair in ``PROVIDERIDs`` one row is generated.
    Its key is the date with the '/' characters removed followed by ``no``.
    It carries five columns: ``info:logisticproviderid``, ``exp:total``,
    ``exp:error``, ``expstate:total`` (ten times ``exp:total``) and
    ``expstate:error``.  All rows are sent in a single ``mutateRows`` call.

    Args:
        table: name of the target table.
        ops: options object; reads ``location``, ``port``, ``start`` and
            ``months``.
    """
    client, transport = get_client(table, ops.location, ops.port)
    transport.open()
    try:
        months = int(ops.months)
        print('Begin to write date to table, from %s to %d months later'
              % (ops.start, months))

        rows = []
        for date in get_dates(ops.start, months):
            day_key = date.replace('/', '')
            for provider, no in PROVIDERIDs:
                et = random.randint(100000, 99999999)
                mutations = [
                    Mutation(column='info:logisticproviderid',
                             value=provider),
                    Mutation(column='exp:total', value=str(et)),
                    Mutation(column='exp:error',
                             value=str(random.randint(100, 999))),
                    Mutation(column='expstate:total', value=str(et * 10)),
                    Mutation(column='expstate:error',
                             value=str(random.randint(100, 999))),
                ]
                rows.append(BatchMutation(day_key + no, mutations))

        client.mutateRows(table, rows, {})
    finally:
        # Close the connection even when generating or writing rows fails.
        transport.close()
    print('Write to ExpressStatistics finished')
Ejemplo n.º 5
0
# Walk-through of basic HBase Thrift calls against the table 'tablepy':
# describe column families, inspect the table, write a row, read a row and
# delete data.  `client` and `socket` are created outside this snippet.

# Two column-family descriptors; only the family name is supplied.
column01 = ColumnDescriptor(
    name='user_info'
)  # ColumnDescriptor(bloomFilterType='NONE', bloomFilterNbHashes=0, name='user_info', maxVersions=3, blockCacheEnabled=False, inMemory=False, timeToLive=-1, bloomFilterVectorSize=0, compression='NONE')
column02 = ColumnDescriptor('addr_info')
# HBase does not seem to support creating a pre-split table from Python.
# client.createTable('tablepy',[column01,column02])
# print(client)
region_info = client.getTableRegions('tablepy')  # inspect the table's regions
table_info = client.getColumnDescriptors('tablepy')  # inspect the table's structure
print(
    region_info
)  # [TRegionInfo(startKey='', endKey='', version=1, id=1543752131747L, name='tablepy,,1543752131747.ccfa71e67b9732adb575129bf9e560eb.')]
print(
    table_info
)  # {'addr_info:': ColumnDescriptor(bloomFilterType='NONE', bloomFilterNbHashes=0, name='addr_info:', maxVersions=3, blockCacheEnabled=False, inMemory=False, timeToLive=2147483647, bloomFilterVectorSize=0, compression='NONE'), 'user_info:': ColumnDescriptor(bloomFilterType='NONE', bloomFilterNbHashes=0, name='user_info:', maxVersions=3, blockCacheEnabled=False, inMemory=False, timeToLive=2147483647, bloomFilterVectorSize=0, compression='NONE')}
# Insert data
mutation = Mutation(column='user_info:province', value='350000')
batchs = BatchMutation('row02', [mutation])
# NOTE(review): this reads 'row01' before the mutateRows call below, and the
# batch targets 'row02', so the value printed later does not come from this
# write -- confirm whether that is intended.
insert_resut = client.getRow('tablepy', 'row01')
# Insert multiple rows
# client.mutateRow('tablepy','row01',[mutation])
client.mutateRows(
    'tablepy',
    [batchs])  # unlike mutateRow above, mutateRows can write several rows in one call, whereas mutateRow writes a single row
print(
    insert_resut
)  # [TRowResult(columns={'user_info:province': TCell(timestamp=1543752270954L, value='350000')}, row='row01')]
client.deleteAll('tablepy', 'row01', 'addr_info')  # delete the data of the given column in the given row
client.deleteAllRow('tablepy', 'row01')  # delete all data of the given row

socket.close()  # remember to close it when done
Ejemplo n.º 6
0
    def data_shuffle(self, mongo_data_list, province_list, city_list,
                     area_list):
        """Clean school records and convert them into HBase BatchMutations.

        For every record the province, city and district are resolved by
        matching ``data["AREA_"]`` against the lookup lists.  ``data["ADDR_"]``
        is then normalised so that it starts with the resolved province and
        city names, and one BatchMutation holding 23 columns is built.

        Args:
            mongo_data_list: iterable of record dicts.  Reads the keys AREA_,
                ADDR_, DEALTIME_, DATETIME_, ENTITY_CODE_, ENTITY_NAME_,
                NAME_, IMAGES_, GRADE_, SCHOOL_TYPE_, PERIOD_, TEL_, WEBSITE_,
                BRIEF_, URL_ and _id.  AREA_ and ADDR_ are rewritten in place.
            province_list: list of dicts with NAME_ and CODE_.
            city_list: list of dicts with NAME_, CODE_ and PARENT_.  Entries
                whose NAME_ is "县" are removed from this list in place.
            area_list: list of dicts with NAME_, CODE_ and PARENT_.

        Returns:
            A list of BatchMutation objects, one per record whose region
            could be resolved.  Unresolvable records are printed and skipped.
        """
        # Drop the "县" placeholder entries.  The list is rebuilt in place
        # because removing elements while iterating skips the element that
        # follows each removal.
        city_list[:] = [city for city in city_list if city["NAME_"] != "县"]

        batch_list = []
        for data in mongo_data_list:
            prov_n = prov_c = None
            city_n = city_c = None
            area_n = area_c = None

            # Province: AREA_ must start with the province name.  There is no
            # break, so the last matching entry wins.
            for prov in province_list:
                if data["AREA_"].startswith(prov["NAME_"]):
                    prov_n = prov["NAME_"]
                    prov_c = prov["CODE_"]

            # Garbled record: without a province every later step would
            # either fail on None or end up skipping the record anyway.
            if prov_n is None:
                print(data)
                continue

            # City: must belong to the province and appear right after it.
            for city in city_list:
                if city["PARENT_"] != prov_c:
                    continue
                window = data["AREA_"][:len(prov_n) + len(city["NAME_"])]
                if city["NAME_"] in window:
                    city_n = city["NAME_"]
                    city_c = city["CODE_"]

            # District: must belong to the city.  When only its first two
            # characters occur in AREA_, the tail of AREA_ is replaced with
            # the full district name.
            for area in area_list:
                if area["PARENT_"] != city_c:
                    continue
                if area["NAME_"] in data["AREA_"]:
                    area_n = area["NAME_"]
                    area_c = area["CODE_"]
                elif area["NAME_"][:2] in data["AREA_"]:
                    area_n = area["NAME_"]
                    area_c = area["CODE_"]
                    index = data["AREA_"].find(area["NAME_"][:2])
                    data["AREA_"] = data["AREA_"].replace(
                        data["AREA_"][index:], area["NAME_"])

            if area_n is None:
                # Qiaodong district of Shijiazhuang was merged and has no
                # match in the lookup data; it is mapped to Qiaoxi district.
                if data["AREA_"] == "河北省石家庄市桥东区":
                    area_n = "桥西区"
                    area_c = "130104"
                else:
                    # County-level cities administered directly by the
                    # province (parent code ends in "9000").
                    # TODO: add the condition
                    # area["PARENT_"][:2] == city_c[:2]
                    for area in area_list:
                        if (area["PARENT_"][-4:] == "9000"
                                and area["NAME_"] in data["AREA_"]):
                            city_n = "省直辖县级行政区划"
                            city_c = prov_c[:2] + "9000"
                            area_n = area["NAME_"]
                            area_c = area["CODE_"]

            # No district found: fall back to the city for both the name and
            # the code (the code used to be assigned to itself).
            if area_n is None:
                area_n = city_n
                area_c = city_c

            # Garbled record: skip this iteration.
            if area_n is None:
                print(data)
                continue

            # ---- address cleaning ----
            addr = data["ADDR_"]

            if "中国" in addr[:2]:
                addr = addr.replace("中国", "")

            if addr == "":
                addr = data["AREA_"]

            # Cut everything from the first "电话" onwards.
            if "电话" in addr:
                addr = addr[:addr.find("电话")]

            # Collapse a repeated province name (full name followed by the
            # name without its last character) into the full name.
            doubled_prov = prov_n + prov_n[:-1]
            if doubled_prov in addr:
                addr = addr.replace(doubled_prov, prov_n)

            # Make sure the address starts with the full province name.
            p_len = len(prov_n)
            if prov_n not in addr[:p_len]:
                if prov_n[:2] in addr[:p_len]:
                    addr = (addr[:p_len].replace(prov_n[:2], prov_n)
                            + addr[p_len:])
                else:
                    addr = prov_n + addr

            # Make sure the city name follows the province name.
            pc_len = p_len + len(city_n)
            if city_n not in addr[:pc_len]:
                if city_n[:-1] in addr[:pc_len]:
                    # City present without its last character: complete it.
                    addr = (addr[:pc_len].replace(city_n[:-1], city_n)
                            + addr[pc_len:])
                    addr = addr.replace("市县", "市")
                elif city_c[-4:] != "9000":
                    # Ordinary city: insert its name after the province.
                    addr = addr[:p_len] + city_n + addr[p_len:]
                else:
                    # Province-administered area: the address should read
                    # province + district, without the synthetic city name.
                    if area_n not in addr[:p_len + len(area_n)]:
                        if area_n in addr:
                            addr = prov_n + addr[addr.find(area_n):]
                        else:
                            addr = (addr[:p_len].replace(prov_n, "")
                                    + addr[p_len:])
                            addr = (addr[:len(city_n)].replace(city_n, "")
                                    + addr[len(city_n):])
                            # find() is expected to return -1 here, which
                            # makes the slice take the trailing
                            # len(area_n) + 1 characters of AREA_.
                            index = addr.find(area_n) - len(area_n)
                            addr = prov_n + data["AREA_"][index:] + addr

                    if city_n in addr[:pc_len]:
                        addr = (addr[:pc_len].replace(city_n, "")
                                + addr[pc_len:])

            data["ADDR_"] = addr

            # HBase row key: entity code plus (9999999999 - deal time);
            # presumably reversed so that newer records sort first.
            deal_time = int(float(data["DEALTIME_"]))
            row = str(data["ENTITY_CODE_"]) + "_" + str(9999999999 - deal_time)

            # (column family, qualifier, value) in the order they are written.
            columns = [
                ("S", "STATUS_", "1"),                           # status flag
                ("C", "CREATE_TIME_", str(data["DATETIME_"])),   # creation time
                ("C", "AREA_CODE_", area_c),                     # area code
                ("F", "NAME_", str(data["NAME_"])),              # school name
                ("F", "IMAGES_", str(data["IMAGES_"])),          # school images
                ("F", "GRADE_", str(data["GRADE_"])),            # school grade (public or not)
                ("F", "PROVINCE_CODE_", prov_c),                 # province code
                ("F", "PROVINCE_NAME_", prov_n),                 # province name
                ("F", "CITY_CODE_", city_c),                     # city code
                ("F", "CITY_NAME_", city_n),                     # city name
                ("F", "DISTRICT_CODE_", area_c),                 # district code
                ("F", "DISTRICT_NAME_", area_n),                 # district name
                ("F", "SCHOOL_TYPE_", str(data["SCHOOL_TYPE_"])),  # school type (public / private)
                ("F", "PERIOD_", str(data["PERIOD_"])),          # school level
                ("F", "TEL_", str(data["TEL_"])),                # school phone
                ("F", "WEBSITE_", str(data["WEBSITE_"])),        # school website
                ("F", "ADDR_", addr),                            # cleaned address
                ("F", "BRIEF_", str(data["BRIEF_"])),            # school introduction
                ("F", "URL_", str(data["URL_"])),                # page URL
                ("F", "DEALTIME_", str(data["DEALTIME_"])),      # processing time
                ("F", "ENTITY_NAME_", str(data["ENTITY_NAME_"])),  # entity name
                ("F", "ENTITY_CODE_", str(data["ENTITY_CODE_"])),  # entity code
                ("F", "_id", str(data["_id"])),                  # MongoDB _id
            ]
            mutation_list = [
                Mutation(column="{}:{}".format(family, qualifier), value=value)
                for family, qualifier, value in columns
            ]
            batch_list.append(BatchMutation(row, mutation_list))

        return batch_list