Example #1
def word_repost_relationship(batch_num, temp_dir, searchList, breakpos=None):
    # Create a logger named after this batch/process
    name = f'getRepost_batchNum{str(batch_num)}'
    log = Logger(name)
    logger = log.getLogger()

    # Breakpoint handling
    if not breakpos:
        # Each process keeps a temporary per-level directory for its sublist's repost relationships
        level_dir = temp_dir + f'temp_{name}/'
        if not os.path.exists(level_dir):
            os.mkdir(level_dir)
        # Create the output file and its writer
        repost_file = temp_dir + name + '.csv'
        repost_writer = csvWriter(repost_file, repost=True)
    else:
        level_dir = temp_dir + breakpos['level_dir']
        repost_file = temp_dir + breakpos['repost_file']
        repost_writer = csvWriter(repost_file, repost=True, breakpos=True)
        # Finish crawling the interrupted id first, then crawl the remaining ids as usual
        get_repost_relationship(breakpos['center_bw_id'], repost_writer,
                                level_dir, logger, breakpos)
        searchList = searchList[1:]

    # Regular crawling
    logger.info('Start getting repost...')
    for id in searchList:
        get_repost_relationship(id, repost_writer, level_dir, logger)
    logger.info('Finish!')
    # Remove the log once crawling finishes
    # The log is mainly used for breakpoint recovery, so it is no longer needed after a successful run
    # (a sketch of the Logger helper assumed here follows this function)
    log.remove()
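The Logger class used above is project-specific and not shown in these examples. A minimal sketch of what it presumably looks like, assuming a file-per-process logger whose remove() deletes the log file once crawling succeeds; the class layout, log directory, and message format are assumptions:

import logging
import os


class Logger:
    # Hypothetical re-implementation for illustration only
    def __init__(self, name, log_dir='./logs/'):
        os.makedirs(log_dir, exist_ok=True)
        self.path = os.path.join(log_dir, f'{name}.log')
        self._logger = logging.getLogger(name)
        self._logger.setLevel(logging.INFO)
        self._handler = logging.FileHandler(self.path, encoding='utf-8')
        self._handler.setFormatter(
            logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
        self._logger.addHandler(self._handler)

    def getLogger(self):
        # Return the underlying logging.Logger used by the crawl functions
        return self._logger

    def remove(self):
        # Close the file handler and delete the log file; the log only matters
        # for breakpoint recovery, so it is dropped after a successful run
        self._logger.removeHandler(self._handler)
        self._handler.close()
        if os.path.exists(self.path):
            os.remove(self.path)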
def word_repost_relationship(temp_dir, searchList, breakpos=None):
    # Create a logger named after this process
    name = 'getRepost_' + str(os.getpid())
    log = Logger(name)
    logger = log.getLogger()
    # Each process keeps a temporary per-level directory for its sublist's repost relationships
    level_dir = temp_dir + f'temp_{name}/'
    os.mkdir(level_dir)

    # Breakpoint handling
    if not breakpos:
        # Create the output file and its writer
        repost_file = temp_dir + name + '.csv'
        repost_writer = csvWriter(repost_file, repost=True)
    else:
        repost_file = temp_dir + breakpos['repost_file']
        repost_writer = csvWriter(repost_file, repost=True, breakpos=True)
        # Finish crawling the interrupted id first, then crawl the remaining ids as usual
        get_repost_relationship(breakpos['center_bw_id'], repost_writer,
                                level_dir, logger, breakpos)
        searchList = searchList[1:]  # Known issue: breakpoint resumption often fails here and needs rework

    # Regular crawling
    logger.info('Start getting repost...')
    for id in searchList:
        get_repost_relationship(id, repost_writer, level_dir, logger)
    logger.info('Finish!')
    # Remove the log once crawling finishes
    # The log is mainly used for breakpoint recovery, so it is no longer needed after a successful run
    log.remove()
def word_repost_relationship(temp_dir, searchList, breakpos=None):
    # Create a logger named after this process
    name = 'getRepost_' + str(os.getpid())
    logger = getLogger(name)
    # Each process keeps a temporary per-level directory for its sublist's repost relationships
    level_dir = temp_dir + 'temp/'
    os.mkdir(level_dir)

    # Breakpoint handling
    if not breakpos:
        # Create the output file and its writer (a sketch of the csvWriter helper follows this function)
        repost_file = temp_dir + name + '.csv'
        repost_writer = csvWriter(repost_file, repost=True)
    else:
        repost_file = temp_dir + breakpos['repost_file']
        repost_writer = csvWriter(repost_file, repost=True, breakpos=True)
        # Finish crawling the interrupted id first, then crawl the remaining ids as usual
        get_repost_relationship(breakpos['center_bw_id'], repost_writer,
                                level_dir, logger, breakpos)
        searchList = searchList[1:]

    # Regular crawling
    logger.info('Start getting repost...')
    for id in searchList:
        get_repost_relationship(id, repost_writer, level_dir, logger)
    logger.info('Finish!')
    # Remove the temporary level directory
    shutil.rmtree(level_dir)
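These functions all rely on a project-specific csvWriter helper. Below is a minimal sketch under the assumptions visible in the calls above: the search/repost/temp flags pick a header, breakpos=True means the file already exists and is only appended to or read, and get_idList returns the id column, optionally skipping ahead to a break_id. The column names are guesses, not the project's actual schema.

import csv


class csvWriter:
    # Hypothetical re-implementation for illustration only; column names are guesses
    HEADERS = {
        'search': ['bw_id', 'user_id', 'text', 'created_at'],
        'repost': ['center_bw_id', 'bw_id', 'repost_bw_id', 'level'],
        'temp': ['bw_id'],
    }

    def __init__(self, filename, search=False, repost=False, temp=False, breakpos=False):
        self.filename = filename
        kind = 'search' if search else ('repost' if repost else 'temp')
        self.fields = self.HEADERS[kind]
        if not breakpos:
            # Fresh crawl: create the file and write the header row
            with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
                csv.writer(f).writerow(self.fields)

    def write(self, row):
        # Append one record
        with open(self.filename, 'a', newline='', encoding='utf-8-sig') as f:
            csv.writer(f).writerow(row)

    def get_idList(self, break_id=None):
        # Return the bw_id column; with break_id, only the ids from that point on
        with open(self.filename, 'r', encoding='utf-8-sig') as f:
            ids = [row['bw_id'] for row in csv.DictReader(f)]
        if break_id is not None and break_id in ids:
            ids = ids[ids.index(break_id):]
        return ids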
Example #4
def word_spider(searchlist):
    # Load the config file to get output paths and search keywords (a load_config sketch follows this function)
    config = load_config()
    hot_dir = config['hot_dir']
    topic_dir = config['topic_dir']
    repost_dir = config['repost_dir']
    # Create a logger instance named after this process
    name = multiprocessing.current_process().name
    logger = getLogger(name)
    topic_dir += name + '_'
    # Count how many times the search-word list has been loaded
    epoch = 1

    while True:
        # For each keyword, crawl related Weibo posts and each post's repost relationships
        for wd in searchlist:
            logger.info(f'EPOCH: {epoch}. Keyword: {wd}. Start crawling ...')
            search_file = hot_dir + 'search_result_' + str(wd) + '.csv'
            repost_file = repost_dir + 'repost_Relationship_' + str(wd) + '.csv'
            # Create two writer objects, which also creates the files
            search_writer = csvWriter(search_file, search=True)
            repost_writer = csvWriter(repost_file, repost=True)

            # Fetch all Weibo posts related to this keyword (at most 1000 can be retrieved)
            get_query_info(wd, search_writer, logger)

            # Get the list of related Weibo post ids
            idList = search_writer.get_idList()
            # Get each related post's repost relationships
            for bw_id in idList:
                get_repost_relationship(bw_id, repost_writer, logger)

            repost_writer.drop_duplicates()

            # Collect all topics related to this keyword as future search words
            get_more_topic(wd, epoch, topic_dir, logger)

        # One round of search crawling is finished
        # Load the new search-word list
        filename = topic_dir + 'Topics_' + str(epoch) + '.csv'
        with open(filename, 'r', encoding='utf-8-sig') as f:
            rows = csv.reader(f)
            searchlist = [row[0].strip() for row in rows]

        # Remove the intermediate file
        os.remove(filename)

        epoch += 1
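load_config() is also project-specific; the examples only require that it return a dict with keys such as hot_dir, topic_dir, repost_dir, searchlist, and process_num. A minimal sketch, assuming a JSON config file (the real project may use a different file name or format):

import json


def load_config(path='config.json'):
    # Hypothetical: read the crawler settings from a JSON file and return them as a dict
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)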
Example #5
def get_repost_relationship(bw_id,
                            repost_writer,
                            level_dir,
                            logger,
                            breakpos=None):
    # center_bw_id records the original (root) bw_id
    center_bw_id = bw_id
    # Traverse the repost relationships level by level, like a BFS
    # To save memory, each level's relationships are written to files under level_dir
    # (a toy, self-contained demonstration of this file rotation follows this function)

    # Breakpoint handling
    if not breakpos:
        # Start from level 1 with the root bw_id
        level = 1
        idList = [bw_id]
    else:
        level = breakpos['level']
        break_file = level_dir + f'Level_{level}_{center_bw_id}.csv'
        temp_writer = csvWriter(break_file, temp=True, breakpos=True)
        idList = temp_writer.get_idList(breakpos.get('break_id'))

    # Crawl the reposts level by level
    temp_file = None  # guard: idList can be empty when resuming from a breakpoint
    while len(idList) > 0:
        # Create the next level's source-post file, i.e. the ids of the reposts found at this level
        temp_file = level_dir + f'Level_{level+1}_{center_bw_id}.csv'
        if breakpos and level == breakpos['level'] and breakpos.get('break_id'):
            # The breakpoint falls in the middle of this level, so the next level's
            # file was already created; keep appending to it
            temp_writer = csvWriter(temp_file, temp=True, breakpos=True)
        else:
            temp_writer = csvWriter(temp_file, temp=True)  # Not resuming: create a new file as usual

        # Get the direct repost relationships of every bw_id at this level
        for bw_id in idList:
            get_repost_info(center_bw_id, bw_id, level, repost_writer, logger,
                            temp_writer)
        # Get the source-post ids for the next level
        idList = temp_writer.get_idList()
        # Remove the file that stored this level's idList
        if level != 1:
            os.remove(level_dir + f'Level_{level}_{center_bw_id}.csv')
        level += 1
    # After crawling finishes, remove the last temp_file (if one was created)
    if temp_file is not None:
        os.remove(temp_file)
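To make the level-by-level file rotation above concrete, here is a toy, self-contained demonstration that uses a made-up in-memory repost graph instead of network calls: each level's repost ids are written to a CSV, read back as the next idList, and the previous level file is deleted, mirroring the loop in get_repost_relationship.

import csv
import os
import tempfile

# Made-up data: bw_id -> ids of posts that directly repost it
toy_reposts = {'root': ['a', 'b'], 'a': ['a1'], 'b': [], 'a1': []}

level_dir = tempfile.mkdtemp()
level, idList = 1, ['root']
while idList:
    temp_file = os.path.join(level_dir, f'Level_{level + 1}_root.csv')
    with open(temp_file, 'w', newline='') as f:
        writer = csv.writer(f)
        for bw_id in idList:
            for child in toy_reposts.get(bw_id, []):
                writer.writerow([child])  # stand-in for get_repost_info()
    with open(temp_file, newline='') as f:
        idList = [row[0] for row in csv.reader(f)]
    if level != 1:
        os.remove(os.path.join(level_dir, f'Level_{level}_root.csv'))
    level += 1
os.remove(temp_file)  # the last (empty) level file
os.rmdir(level_dir)
print('levels processed:', level - 1)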
def one_word_spider():
    # Load the config file to get output paths and the search keyword
    config = load_config()
    hot_dir = config['hot_dir']
    repost_dir = config['repost_dir']
    one_repost_dir = config['one_repost_dir']
    searchlist = config['searchlist']
    if isinstance(searchlist, str):
        wd = searchlist
    elif len(searchlist) == 1:
        wd = searchlist[0]
    else:
        raise ValueError('one_word_spider() can only accept one search word!')

    # Name of the output file
    search_file = hot_dir + 'search_result_' + str(wd) + '.csv'
    # Create the writer object, which also creates the file
    search_writer = csvWriter(search_file, search=True)
    # Fetch all Weibo posts related to this keyword (at most 1000 can be retrieved)
    # A single-worker process pool is used so that logging behaves correctly (Python's logging is not process-safe)
    print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Keyword: {wd}. Start crawling ...')
    p = Pool(1)
    p.apply_async(one_word_get_query_info, args=(wd, search_writer))
    p.close()
    p.join()

    # Get the list of related Weibo post ids
    # Split it into 10 sublists and crawl repost relationships with 10 processes
    # (a sketch of split_searchList follows this function)
    raw_searchlist = search_writer.get_idList()
    searchList = split_searchList(raw_searchlist, 10)

    # Create a pool of 10 worker processes
    print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Generating Process Pool Containing 10 Process...')
    p = Pool(10)
    for group in searchList:
        p.apply_async(one_word_repost_relationship, args=(group,))
    p.close()
    p.join()

    # Merge the csv files produced by the 10 processes
    print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Start Merging csv Files...')
    merge_csv(wd, one_repost_dir, repost_dir)
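split_searchList is not shown in these examples; judging from its usage above, it splits the id list into (at most) 10 sublists, one per worker process. A minimal sketch under that assumption:

def split_searchList(id_list, n=10):
    # Hypothetical: deal the ids out round-robin into n sublists, dropping empty ones
    groups = [id_list[i::n] for i in range(n)]
    return [g for g in groups if g]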
def one_word_continue():
    # Load the config file to get output paths and the search keyword
    config = load_config()
    repost_dir = config['repost_dir']
    one_repost_dir = config['one_repost_dir']
    searchlist = config['searchlist']
    if isinstance(searchlist, str):
        wd = searchlist
    elif len(searchlist) == 1:
        wd = searchlist[0]
    else:
        raise ValueError('one_word_continue() can only accept one search word!')
    # Read the already-crawled posts for this keyword
    filename = config['hot_dir'] + 'search_result_' + str(wd) + '.csv'
    reader = csvWriter(filename, search=True, breakpos=True)
    raw_searchlist = reader.get_idList()

    # Resume crawling from each process's previously interrupted center_bw_id (a getBreakList sketch follows this function)
    breakpos = config['breakpos']
    allList = getBreakList(raw_searchlist, breakpos)

    # Start the process pool
    pool_size = len(allList)
    p = Pool(pool_size)
    print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Generating Process Pool Containing {str(pool_size)} Process...')
    print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Start crawling repost relationship...')
    
    for sublist in allList:
        if sublist.get('breakpos'):
            p.apply_async(one_word_repost_relationship, args=(sublist['thisList'], sublist['breakpos']))
        else:
            p.apply_async(one_word_repost_relationship, args=(sublist['thisList'],))

    p.close()
    p.join()
    print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Finish crawling repost relationship!')

    # Merge all csv files
    merge_csv(wd, one_repost_dir, repost_dir)
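getBreakList is likewise project-specific. From the loop above, each element it returns is a dict with a 'thisList' of bw_ids and, for interrupted processes, a 'breakpos' dict whose center_bw_id must be the first entry of 'thisList' (it is crawled first, then the rest). A heavily assumption-laden sketch along those lines:

def getBreakList(raw_searchlist, breakpos):
    # Hypothetical: breakpos is assumed to be a list of per-process breakpoint dicts,
    # each containing at least the interrupted 'center_bw_id'
    break_ids = [bp['center_bw_id'] for bp in breakpos]
    remaining = [i for i in raw_searchlist if i not in break_ids]
    # Hand the remaining ids out round-robin; each sublist starts with its
    # interrupted center_bw_id so that it is finished first
    groups = [remaining[i::len(breakpos)] for i in range(len(breakpos))]
    return [{'thisList': [bp['center_bw_id']] + grp, 'breakpos': bp}
            for bp, grp in zip(breakpos, groups)]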
Example #8
def word_spider():
    # Load the config file to get output paths and search keywords
    config = load_config()
    hot_dir = config['hot_dir']
    repost_dir = config['repost_dir']
    process_num = config['process_num']
    searchlist = config['searchlist']
    expand_topic = config['expand_topic']
    if expand_topic:
        topic_dir = config['topic_dir']

    # Count how many times the search-word list has been loaded
    epoch = 1

    while True:
        # For each keyword, crawl related Weibo posts and each post's repost relationships
        for wd in searchlist:
            if wd == config.get('breakList'):
                config['breakPoint'] = True
            else:
                config['breakPoint'] = False

            print(
                f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  EPOCH: {epoch}. Keyword: {wd}.'
            )
            search_file = hot_dir + 'search_result_' + str(wd) + '.csv'
            repost_file = repost_dir + 'repost_Relationship_' + str(
                wd) + '.csv'
            # Create two writer objects, which also creates the files
            # When resuming from a breakpoint, no new files are created; only the field names are set
            search_writer = csvWriter(search_file,
                                      search=True,
                                      breakpos=config['breakPoint'])
            repost_writer = csvWriter(repost_file,
                                      repost=True,
                                      breakpos=config['breakPoint'])
            # A temporary directory is needed to store each process's intermediate files
            temp = repost_dir + wd + '/'

            if not config['breakPoint']:
                # Create the temporary directory
                os.mkdir(temp)
                # Fetch all Weibo posts related to this keyword (at most 1000 can be retrieved)
                # Python's logging does not work well across multiple processes, and the repost
                # stage below needs them, so any work that logs runs in its own process
                print(
                    f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Start searching keyword: {wd}.'
                )
                p = Pool(1)
                p.apply_async(word_get_query_info, args=(wd, search_writer))
                p.close()
                p.join()
                print(
                    f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Finished searching keyword: {wd}.'
                )

            # Get the list of related Weibo post ids
            raw_idList = search_writer.get_idList()
            # Split the ids whose repost relationships will be crawled, one sublist per process
            idList = splitList(raw_idList,
                               process_num,
                               breakpos=config['breakPoint'])

            # Crawl repost relationships with multiple processes
            print(
                f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Start crawling repost relationship...'
            )
            # A normal crawl uses the process count configured by the user
            # When resuming from a breakpoint, each process handles at most 10 center_bw_ids
            p = Pool(len(idList))
            for num, item in enumerate(idList):
                if item.get('breakpos'):
                    p.apply_async(word_repost_relationship,
                                  args=(num, temp, item['sublist'],
                                        item['breakpos']))
                else:
                    p.apply_async(word_repost_relationship,
                                  args=(num, temp, item['sublist']))
            p.close()
            p.join()

            # Merge the intermediate files into one complete file and drop duplicates (a merge_csv sketch follows this function)
            repost_writer.merge_csv(temp)

            # Collect all topics related to this keyword as future search words
            if expand_topic:
                get_more_topic(wd, epoch, topic_dir)

        if expand_topic:
            # One round of search crawling is finished
            # Load the new search-word list
            filename = topic_dir + 'Topics_' + str(epoch) + '.csv'
            with open(filename, 'r', encoding='utf-8-sig') as f:
                rows = csv.reader(f)
                searchlist = list(set([row[0].strip() for row in rows]))
            os.remove(filename)
            epoch += 1
        else:
            break
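The merge step (repost_writer.merge_csv(temp) above, or the standalone merge_csv call in Example #5) is not shown either; it combines the per-process intermediate files from the temporary directory into the final repost file and removes duplicate rows. A minimal standalone sketch of that behaviour; the function signature and file handling details are assumptions:

import csv
import glob
import os


def merge_csv(temp_dir, out_file):
    # Hypothetical: concatenate every csv in temp_dir into out_file, keeping only
    # the first occurrence of each row (this also collapses repeated header rows)
    seen = set()
    with open(out_file, 'w', newline='', encoding='utf-8-sig') as out:
        writer = csv.writer(out)
        for path in sorted(glob.glob(os.path.join(temp_dir, '*.csv'))):
            with open(path, 'r', encoding='utf-8-sig') as f:
                for row in csv.reader(f):
                    key = tuple(row)
                    if key not in seen:
                        seen.add(key)
                        writer.writerow(row)
            os.remove(path)  # the intermediate file is no longer needed
    os.rmdir(temp_dir)  # remove the now-empty temporary directory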
Example #9
def word_spider():
    # Load the config file to get output paths and search keywords
    config = load_config()
    hot_dir = config['hot_dir']
    topic_dir = config['topic_dir']
    repost_dir = config['repost_dir']
    process_num = config['process_num']
    one_word = config['one_word']
    searchlist = config['searchlist']

    # Count how many times the search-word list has been loaded
    epoch = 1

    while True:
        # For each keyword, crawl related Weibo posts and each post's repost relationships
        for wd in searchlist:
            print(
                f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  EPOCH: {epoch}. Keyword: {wd}.'
            )
            search_file = hot_dir + 'search_result_' + str(wd) + '.csv'
            repost_file = repost_dir + 'repost_Relationship_' + str(
                wd) + '.csv'
            # Create two writer objects, which also creates the files
            search_writer = csvWriter(search_file, search=True)
            repost_writer = csvWriter(repost_file, repost=True)

            # Fetch all Weibo posts related to this keyword (at most 1000 can be retrieved)
            # Python's logging does not work well across multiple processes, and the repost
            # stage below needs them, so any work that logs runs in its own process
            print(
                f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Start searching keyword: {wd}.'
            )
            p = Pool(1)
            p.apply_async(word_get_query_info, args=(wd, search_writer))
            p.close()
            p.join()
            print(
                f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Finished searching keyword: {wd}.'
            )

            # Get the list of related Weibo post ids
            raw_idList = search_writer.get_idList()
            # Split the ids whose repost relationships will be crawled, one sublist per process (a splitList sketch follows this function)
            idList = splitList(raw_idList, process_num)
            # A temporary directory is needed to store each process's intermediate files
            temp = repost_dir + wd + '/'
            os.mkdir(temp)

            # Crawl repost relationships with multiple processes
            print(
                f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Start crawling repost relationship...'
            )
            p = Pool(process_num)
            for sublist in idList:
                p.apply_async(word_repost_relationship, args=(temp, sublist))
            p.close()
            p.join()

            # Merge the intermediate files into one complete file and drop duplicates
            repost_writer.merge_csv(temp)

            # Collect all topics related to this keyword as future search words
            if one_word:
                break
            else:
                get_more_topic(wd, epoch, topic_dir)

        # One round of search crawling is finished
        # Load the new search-word list
        filename = topic_dir + 'Topics_' + str(epoch) + '.csv'
        with open(filename, 'r', encoding='utf-8-sig') as f:
            rows = csv.reader(f)
            searchlist = list(set([row[0].strip() for row in rows]))
        os.remove(filename)

        epoch += 1
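splitList, as used in this version, simply partitions the id list into process_num sublists, one per worker (the breakpos-aware variant in Example #8 returns dicts instead). A minimal sketch under that assumption:

def splitList(id_list, process_num):
    # Hypothetical: deal the ids out round-robin into process_num sublists
    groups = [id_list[i::process_num] for i in range(process_num)]
    return [g for g in groups if g]  # drop empty sublists when there are few ids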