Ejemplo n.º 1
0
def write_file(file_path,thu1):
    """
    Poll the finished-item list forever and write one record line per item.

    Each item fetched from ``common_keys.FINISH_LIST_NAME`` is parsed into a
    ``standard_spider.Bean``, enriched (cut, need, year, fill_date,
    responsible), flattened into a single ``##``-delimited line prefixed with a
    freshly generated row key, and written to ``file_path``.

    :param file_path: path of the output file; it is opened lazily in ``"w+"``
        mode (UTF-8) when the first record after an idle period arrives and
        closed again whenever the list runs empty.
    :param thu1: opaque object forwarded unchanged to ``do_cut`` together with
        the bean title.
    :return: never returns normally; the loop only ends when an exception
        propagates, in which case the output file is closed first.
    """
    max_num = load_num()

    common_keys.KEY_NUM = max_num
    out_file = None
    logger.info("数据加载结束:rowkey num=" + str(
        common_keys.KEY_NUM) + ",rowkey profix=" + common_keys.ROWKEY_PROFIX + ",last rowkey profix=" + str(common_keys.LAST_ROWKEY_PROFIX))
    try:
        while True:
            logger.info("检查rowkey...")
            set_rowkey_profix()
            if common_keys.LAST_ROWKEY_PROFIX != common_keys.ROWKEY_PROFIX:
                # The prefix changed: persist the counter reached under the
                # old prefix and restart numbering from zero.
                logger.info("rowkey前缀改变,更新rowkey文件。")
                write_rowkey(max_num)
                max_num = 0

            if not name_manager.has_next(common_keys.FINISH_LIST_NAME):
                # Nothing to do: close the file while idle, then poll again.
                logger.info("无数据,等待中...")
                if out_file is not None:
                    out_file.close()
                out_file = None
                time.sleep(common_keys.WAIT_TIME)
                continue

            logger.info("检查到数据,开始生成rowkey...")
            rowkey = create_rowkey(max_num)
            # The number just used is stored before the local counter advances.
            name_manager.set_string(common_keys.KEY_NUM_NAME, str(max_num))
            max_num += 1

            logger.info("生成rowkey:" + rowkey + ",开始写数据...")
            raw_item = name_manager.get(common_keys.FINISH_LIST_NAME)
            bean = standard_spider.Bean()
            bean.parser(raw_item)
            bean.cut = do_cut(bean.title, thu1)
            bean.need = needs(bean)
            # NOTE(review): raises AttributeError when bean.date contains no
            # run of four digits, because re.search() then returns None.
            bean.year = re.search(r"\d{4}", bean.date).group()
            bean.fill_date = str(datetime.datetime.now().date())

            bean.responsible = responsible(bean.url)
            line = "##".join([
                rowkey, bean.name, bean.customerid, bean.year, bean.flag,
                bean.title, bean.url, bean.fill_date, bean.date,
                bean.operator, bean.need, bean.text,
            ])

            # Collapse every whitespace run (including embedded newlines) so
            # that each record occupies exactly one line.
            line = re.sub(r"\s+", " ", line)
            if out_file is None:
                # NOTE(review): "w+" truncates the file on every reopen, so
                # records written before an idle period are discarded unless a
                # consumer has already picked them up - confirm this is intended.
                out_file = open(file_path, "w+", encoding="utf-8")
            out_file.write(line + "\n")
    finally:
        # Flush and release the handle if the loop is aborted by an exception.
        if out_file is not None:
            out_file.close()
Ejemplo n.º 2
0
def write_file(file_path):
    """
    Poll the finished-item list forever and write one record line per item.

    The row-key counter ``common_keys.KEY_NUM`` is shared with other code, so
    a row key is allocated while holding ``common_keys.ROWKEY_CONDITION``.
    Each fetched item is parsed into a ``standard_spider.Bean``, enriched, and
    written to ``file_path`` as a single ``##``-delimited line.

    :param file_path: path of the output file; it is opened lazily in ``"w+"``
        mode (UTF-8) and closed whenever the list runs empty.
    :return: never returns normally; the loop only ends when an exception
        propagates, in which case the output file is closed first.
    """
    max_num = load_num()
    common_keys.KEY_NUM = max_num
    out_file = None
    # str() around the last prefix matches the sibling writer and avoids a
    # TypeError if it has not been set to a string yet.
    logger.info("数据加载结束:rowkey num=" + str(common_keys.KEY_NUM) +
                ",rowkey profix=" + common_keys.ROWKEY_PROFIX +
                ",last rowkey profix=" + str(common_keys.LAST_ROWKEY_PROFIX))
    try:
        while True:
            if not name_manager.has_next(common_keys.FINISH_LIST_NAME):
                # Nothing to do: close the file while idle, then poll again.
                logger.info("无数据,等待中...")
                if out_file is not None:
                    out_file.close()
                out_file = None
                time.sleep(common_keys.WAIT_TIME)
                continue
            logger.info("发现数据,开始生成rowkey...")
            # Assumes ROWKEY_CONDITION is a threading.Condition (it is used
            # with acquire/release/wait/notify_all). The blocking acquire
            # always succeeds, so the former "lock not obtained" branch was
            # unreachable; the with-block also guarantees release on error.
            with common_keys.ROWKEY_CONDITION:
                logger.info("获得rowkey锁。")
                if common_keys.KEY_NUM == 0 and max_num != 0:
                    # The shared counter was reset elsewhere: persist the last
                    # number this writer reached before adopting the new one.
                    write_rowkey(max_num)

                max_num = common_keys.KEY_NUM
                rowkey = create_rowkey(max_num)
                common_keys.KEY_NUM += 1
                name_manager.set_string(common_keys.KEY_NUM_NAME, str(max_num))
                logger.info("释放锁并唤醒等待的线程...")
                # notify_all() must run while the lock is still held; calling
                # it after release() raises RuntimeError.
                common_keys.ROWKEY_CONDITION.notify_all()
            logger.info("数据写入中...")
            raw_item = name_manager.get(common_keys.FINISH_LIST_NAME)
            bean = standard_spider.Bean()
            bean.parser(raw_item)
            bean.cut = do_cut(bean.title)
            bean.need = needs(bean)
            # NOTE(review): raises AttributeError when bean.date contains no
            # run of four digits, because re.search() then returns None.
            bean.year = re.search(r"\d{4}", bean.date).group()
            bean.fill_date = str(datetime.datetime.now().date())

            bean.responsible = responsible(bean.url)
            line = "##".join([
                rowkey, bean.name, bean.customerid, bean.year, bean.flag,
                bean.title, bean.url, bean.fill_date, bean.date,
                bean.operator, bean.need, bean.text,
            ])

            # Collapse every whitespace run (including embedded newlines) so
            # that each record occupies exactly one line.
            line = re.sub(r"\s+", " ", line)
            if out_file is None:
                # NOTE(review): "w+" truncates the file on every reopen -
                # confirm earlier records are consumed before that happens.
                out_file = open(file_path, "w+", encoding="utf-8")
            out_file.write(line + "\n")
    finally:
        # Flush and release the handle if the loop is aborted by an exception.
        if out_file is not None:
            out_file.close()
Ejemplo n.º 3
0
def reset_time():
    """
    Reset the shared row-key state while holding the row-key lock.

    Under ``common_keys.ROWKEY_CONDITION`` this sets ``KEY_NUM`` back to 0,
    recomputes ``KEY_TIME`` as ``sys.maxsize`` minus the epoch seconds of
    today's local midnight, remembers the current prefix as
    ``LAST_ROWKEY_PROFIX``, refreshes the prefix via ``set_rowkey_profix()``,
    calls ``print_errs(common_keys.ERR_PATH)`` and stores the new ``KEY_TIME``
    through ``name_manager``.

    :return: None
    """
    # Assumes ROWKEY_CONDITION is a threading.Condition. A blocking acquire
    # always succeeds, so the former retry loop ran exactly once and its
    # wait() branch was unreachable; the with-block additionally guarantees
    # the lock is released if any step below raises.
    with common_keys.ROWKEY_CONDITION:
        common_keys.KEY_NUM = 0

        # date().timetuple() has all time fields at zero, i.e. local midnight.
        midnight = time.mktime(datetime.datetime.now().date().timetuple())
        # int() replaces the Python 2-only long(); the value is the same.
        common_keys.KEY_TIME = sys.maxsize - int(midnight)

        common_keys.LAST_ROWKEY_PROFIX = common_keys.ROWKEY_PROFIX
        set_rowkey_profix()
        print_errs(common_keys.ERR_PATH)
        name_manager.set_string(common_keys.KEY_TIME_NAME,
                                str(common_keys.KEY_TIME))
Ejemplo n.º 4
0
def set_rowkey_profix():
    """
    Pick the row-key prefix that matches the current hour.

    The current hour is compared with ``common_keys.SECOND_TIME``: below it the
    ``FIRST_TIME_S`` suffix is used, otherwise ``SECOND_TIME_S``. The chosen
    suffix is stored under ``common_keys.KEY_TIME_S_NAME`` and combined with
    ``common_keys.KEY_TIME`` into ``common_keys.ROWKEY_PROFIX``.

    :return: None
    """
    logger.info("设置rowkey前缀。")
    current_hour = datetime.datetime.now().hour
    if current_hour < common_keys.SECOND_TIME:
        period_suffix = common_keys.FIRST_TIME_S
    else:
        period_suffix = common_keys.SECOND_TIME_S

    # Record the active suffix first, then rebuild the prefix from it.
    name_manager.set_string(common_keys.KEY_TIME_S_NAME, period_suffix)
    common_keys.ROWKEY_PROFIX = create_rowkey_profix(common_keys.KEY_TIME,
                                                     period_suffix)
Ejemplo n.º 5
0
def set_rowkey_profix():
    """
    Recompute the row-key day component and prefix from the current time.

    The previous prefix (when one exists) is kept in
    ``common_keys.LAST_ROWKEY_PROFIX``. ``common_keys.KEY_TIME`` becomes
    ``sys.maxsize`` minus the epoch seconds of today's local midnight and is
    stored under ``KEY_TIME_NAME``. The current hour is then compared with
    ``common_keys.SECOND_TIME`` to choose ``FIRST_TIME_S`` or
    ``SECOND_TIME_S``; that suffix is stored under ``KEY_TIME_S_NAME`` and
    combined with ``KEY_TIME`` into ``common_keys.ROWKEY_PROFIX``.

    :return: None
    """
    logger.info("设置rowkey前缀。")
    if common_keys.ROWKEY_PROFIX is not None:
        common_keys.LAST_ROWKEY_PROFIX = common_keys.ROWKEY_PROFIX

    # Take one clock reading so the date and the hour cannot straddle midnight.
    now = datetime.datetime.now()

    # Epoch seconds of today's local midnight.
    day_start = time.mktime(now.date().timetuple())
    # int() replaces the Python 2-only long(); the value is the same.
    common_keys.KEY_TIME = sys.maxsize - int(day_start)
    # Stored as text, matching the other set_string callers in this project.
    name_manager.set_string(common_keys.KEY_TIME_NAME, str(common_keys.KEY_TIME))

    if now.hour < common_keys.SECOND_TIME:
        period_suffix = common_keys.FIRST_TIME_S
    else:
        period_suffix = common_keys.SECOND_TIME_S
    name_manager.set_string(common_keys.KEY_TIME_S_NAME, period_suffix)
    common_keys.ROWKEY_PROFIX = create_rowkey_profix(common_keys.KEY_TIME, period_suffix)