Example #1
def getLogger(name):
    # Load the log directory from the config file and write this process's log there
    log_dir = load_config()['log_dir']
    logging.basicConfig(filename=log_dir + name + '_spider.log',
                        level=logging.INFO,
                        format='[%(asctime)s]  %(levelname)-12s | %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    logger = logging.getLogger()
    return logger
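
Every example in this listing calls load_config(). Below is a minimal sketch of what such a zero-argument helper could look like, assuming the settings live in a JSON file called config.json (the file name and format are assumptions, not taken from the original project):

import json

def load_config(path='config.json'):
    # Read the crawler settings (log_dir, searchlist, hot_dir, ...) from a JSON
    # file and return them as a plain dict. Illustrative only: the real project
    # may keep its settings in YAML, INI or a Python module instead.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)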
Example #2
def pool_spider(group_num):
    # Load the config file and get the pre-processed list of search keywords
    raw_searchlist = load_config()['searchlist']
    searchlist = split_searchList(raw_searchlist, 5)
    print('Link parent process %s.' % os.getpid())
    p = Pool(group_num)
    for sublist in searchlist:
        p.apply_async(word_spider, args=(sublist, ))
    p.close()  # close the process pool to new submissions
    p.join()  # block the parent process until all child processes have finished
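
split_searchList is called above but not shown in this listing. Here is a minimal sketch under the assumption that it simply chops the keyword list into the requested number of roughly equal groups:

def split_searchList(raw_searchlist, group_num):
    # Split the keyword list into `group_num` roughly equal sublists, one per
    # worker process. Illustrative sketch only; the project's real helper may
    # distribute the words differently (compare splitList in Example #6).
    size, remainder = divmod(len(raw_searchlist), group_num)
    groups, start = [], 0
    for i in range(group_num):
        end = start + size + (1 if i < remainder else 0)
        if start < end:
            groups.append(raw_searchlist[start:end])
        start = end
    return groups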
Example #3
def word_spider(searchlist):
    # Load the config file and get the output directories and search keywords
    config = load_config()
    hot_dir = config['hot_dir']
    topic_dir = config['topic_dir']
    repost_dir = config['repost_dir']
    # Create a logger for this process, writing to the configured log directory
    name = multiprocessing.current_process().name
    logger = getLogger(name)
    topic_dir += name + '_'
    # Count how many times the search-keyword list has been (re)loaded
    epoch = 1

    while True:
        # For each keyword, crawl the related weibos and each weibo's repost relationships
        for wd in searchlist:
            logger.info(f'EPOCH: {epoch}. Keyword: {wd}. Start crawling ...')
            search_file = hot_dir + 'search_result_' + str(wd) + '.csv'
            repost_file = repost_dir + 'repost_Relationship_' + str(wd) + '.csv'
            # Create the two writer objects, which also creates the output files
            search_writer = csvWriter(search_file, search=True)
            repost_writer = csvWriter(repost_file, repost=True)

            # Fetch all weibos related to this keyword (at most 1000 can be retrieved)
            get_query_info(wd, search_writer, logger)

            # Get the list of ids of the related weibos
            idList = search_writer.get_idList()
            # Crawl the repost relationship of each related weibo
            for bw_id in idList:
                get_repost_relationship(bw_id, repost_writer, logger)

            repost_writer.drop_duplicates()

            # Collect all topics related to this keyword to use as future search keywords
            get_more_topic(wd, epoch, topic_dir, logger)

        # One full round of crawling is finished
        # Load the new list of search keywords
        filename = topic_dir + 'Topics_' + str(epoch) + '.csv'
        with open(filename, 'r', encoding='utf-8-sig') as f:
            rows = csv.reader(f)
            searchlist = [row[0].strip() for row in rows]

        # Remove the intermediate file
        os.remove(filename)

        epoch += 1
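
get_more_topic is not shown here, but word_spider only depends on its output contract: after each keyword it must leave a Topics_<epoch>.csv in topic_dir whose first column holds the newly discovered keywords, because that file is read back (and then deleted) at the end of every epoch. The placeholder below honours just that contract, with the actual topic crawling left out:

import csv

def get_more_topic(wd, epoch, topic_dir, logger=None):
    # Placeholder sketch: the real function crawls topics related to `wd`.
    # Only the output contract used by word_spider is illustrated here:
    # append one topic per row (first column) to topic_dir + 'Topics_<epoch>.csv'.
    topics = []  # would be filled by the real crawling logic
    filename = topic_dir + 'Topics_' + str(epoch) + '.csv'
    with open(filename, 'a', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        for topic in topics:
            writer.writerow([topic])
    if logger:
        logger.info(f'Keyword: {wd}. Found {len(topics)} new topics.')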
Example #4
def one_word_spider():
    # Load the config file and get the output directories and the search keyword
    config = load_config()
    hot_dir = config['hot_dir']
    repost_dir = config['repost_dir']
    one_repost_dir = config['one_repost_dir']
    searchlist = config['searchlist']
    if isinstance(searchlist, str):
        wd = searchlist
    elif len(searchlist) == 1:
        wd = searchlist[0]
    else:
        raise ValueError('one_word_spider() can only accept one search word!')

    # Output file name
    search_file = hot_dir + 'search_result_' + str(wd) + '.csv'
    # Create the writer object, which also creates the file
    search_writer = csvWriter(search_file, search=True)
    # Fetch all weibos related to this keyword (at most 1000 can be retrieved)
    # A one-process pool is used so that log output works (Python's logging is not process-safe)
    print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Keyword: {wd}. Start crawling ...')
    p = Pool(1)
    p.apply_async(one_word_get_query_info, args=(wd, search_writer))
    p.close()
    p.join()

    # Get the list of ids of the related weibos
    # Split it into 10 sublists and crawl the repost relationships with 10 processes
    raw_searchlist = search_writer.get_idList()
    searchList = split_searchList(raw_searchlist, 10)

    # Create a pool of 10 processes
    print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Generating Process Pool Containing 10 Process...')
    p = Pool(10)
    for group in searchList:
        p.apply_async(one_word_repost_relationship, args=(group,))
    p.close()
    p.join()

    # Merge the csv files written by the 10 processes
    print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Start Merging csv Files...')
    merge_csv(wd, one_repost_dir, repost_dir)
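
The module-level merge_csv used here is not part of the listing either. The sketch below assumes it concatenates the per-process CSV files from one_repost_dir into a single deduplicated repost_Relationship_<wd>.csv in repost_dir (pandas is used purely for brevity; the real implementation may rely on the csv module instead):

import glob
import os

import pandas as pd

def merge_csv(wd, one_repost_dir, repost_dir):
    # Illustrative sketch: gather every per-process csv, concatenate the rows,
    # drop duplicates and write one combined file for the keyword.
    parts = sorted(glob.glob(os.path.join(one_repost_dir, '*.csv')))
    if not parts:
        return
    merged = pd.concat((pd.read_csv(p) for p in parts), ignore_index=True)
    merged = merged.drop_duplicates()
    out_file = os.path.join(repost_dir, 'repost_Relationship_' + str(wd) + '.csv')
    merged.to_csv(out_file, index=False, encoding='utf-8-sig')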
Example #5
def one_word_continue():
    # Load the config file and get the output directories and the search keyword
    config = load_config()
    repost_dir = config['repost_dir']
    one_repost_dir = config['one_repost_dir']
    searchlist = config['searchlist']
    if isinstance(searchlist, str):
        wd = searchlist
    elif len(searchlist) == 1:
        wd = searchlist[0]
    else:
        raise ValueError('one_word_continue() can only accept one search word!')
    # Read the weibos that were already crawled for this keyword
    filename = config['hot_dir'] + 'search_result_' + str(wd) + '.csv'
    reader = csvWriter(filename, search=True, breakpos=True)
    raw_searchlist = reader.get_idList()

    # Resume crawling from the center_bw_id at which each process was interrupted
    breakpos = config['breakpos']
    allList = getBreakList(raw_searchlist, breakpos)

    # Start the process pool
    pool_size = len(allList)
    p = Pool(pool_size)
    print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Generating Process Pool Containing {str(pool_size)} Process...')
    print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Start crawling repost relationship...')
    
    for sublist in allList:
        if sublist.get('breakpos'):
            p.apply_async(one_word_repost_relationship, args=(sublist['thisList'], sublist['breakpos']))
        else:
            p.apply_async(one_word_repost_relationship, args=(sublist['thisList'],))

    p.close()
    p.join()
    print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Finish crawling repost relationship!')

    # Merge all the csv files
    merge_csv(wd, one_repost_dir, repost_dir)
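
getBreakList is consumed exactly like the breakpoint branch of splitList in Example #6: it must return a list of dicts, each with a 'thisList' of weibo ids and, for interrupted chunks, a 'breakpos' record. The heavily hedged sketch below assumes config['breakpos'] is a list of {'center_bw_id': ...} records, one per interrupted process:

def getBreakList(raw_searchlist, breakpos, chunk_size=10):
    # Illustrative sketch only: each break record resumes a chunk of ids
    # starting at its interrupted center_bw_id; ids after the last resumed
    # chunk are re-chunked without a break record. The real implementation
    # may differ; only the key names 'thisList' and 'breakpos' are taken
    # from the call sites above.
    allList, last_end = [], 0
    for item in breakpos:
        pos = raw_searchlist.index(item['center_bw_id'])
        allList.append({'thisList': raw_searchlist[pos:pos + chunk_size],
                        'breakpos': item})
        last_end = max(last_end, pos + chunk_size)
    while last_end < len(raw_searchlist):
        allList.append({'thisList': raw_searchlist[last_end:last_end + chunk_size]})
        last_end += chunk_size
    return allList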
Example #6
def splitList(raw_searchlist, group_num, breakpos=False):
    # Pre-processing for the process pool
    # searchlist: split into group_num groups, one per process; each element is a dict whose 'sublist' holds the search words and whose 'breakpos' holds the break position
    searchlist = []
    temp = []
    count = 1
    num = int(len(raw_searchlist) / group_num)
    if num <= 1:  # also guards against a division by zero below when there are fewer words than groups
        raise Exception('Please reduce the group number or add more words')
    for ele in raw_searchlist:
        temp.append(ele)
        if count % num == 0 and not len(searchlist) == (group_num - 1):
            searchlist.append({'sublist': temp})
            temp = []
        count += 1
    if temp:
        searchlist.append({'sublist': temp})

    # Breakpoint handling
    if breakpos:
        newList = []
        breakList = load_config()['breakList']
        for item in breakList:
            this_dict = {}
            this_dict['breakpos'] = item
            temp_list = searchlist[item['batch_num']]['sublist']
            pos = temp_list.index(item['center_bw_id'])
            this_dict['sublist'] = temp_list[pos:pos + 10]
            newList.append(this_dict)
            # If a batch still has many remaining ids, pack the non-breakpoint ids into additional sublists
            pos += 10
            while pos < len(temp_list):
                newList.append({'sublist': temp_list[pos:pos + 10]})
                pos += 10
        return newList
    else:
        return searchlist
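
For reference, a small usage example for splitList. The values are made up; the breakList keys ('batch_num', 'center_bw_id') are the ones the function itself reads:

ids = [str(i) for i in range(40)]

# Normal pre-processing for 4 processes: four dicts, each with a 'sublist' of 10 ids.
groups = splitList(ids, 4)
print([len(g['sublist']) for g in groups])   # -> [10, 10, 10, 10]

# Breakpoint handling: with breakpos=True the function reads
# load_config()['breakList'], whose entries look like
#   {'batch_num': 2, 'center_bw_id': '25'}
# meaning batch 2 was interrupted at weibo id '25'. That batch is re-issued
# starting from '25' with the break record attached, and any ids left over in
# the batch are packed into plain {'sublist': ...} dicts.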
Example #7
def load_db_conn_from_config_file(config_file, db_name):
    db_config = load_config(config_file, config_name=db_name)
    db_conn = get_db_connection_from_config(db_config)

    return db_conn
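
Example #7 uses a second flavour of load_config that takes a file path and a section name, plus a get_db_connection_from_config helper. Here is a minimal sketch, assuming an INI-style file read with the standard-library configparser and SQLite as the backend (both are assumptions chosen to keep the sketch dependency-free):

import configparser
import sqlite3

def load_config(config_file, config_name):
    # Return one named section of an INI-style config file as a plain dict,
    # e.g. the [weibo_db] section of:
    #   [weibo_db]
    #   database = /data/weibo.sqlite3
    parser = configparser.ConfigParser()
    parser.read(config_file)
    return dict(parser[config_name])

def get_db_connection_from_config(db_config):
    # Open a connection from the parsed section. SQLite keeps the sketch
    # self-contained; a real project would more likely dispatch on a driver
    # key to a MySQL/PostgreSQL client.
    return sqlite3.connect(db_config['database'])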
Example #8
def word_spider():
    # Load the config file and get the output directories and search keywords
    config = load_config()
    hot_dir = config['hot_dir']
    repost_dir = config['repost_dir']
    process_num = config['process_num']
    searchlist = config['searchlist']
    expand_topic = config['expand_topic']
    if expand_topic:
        topic_dir = config['topic_dir']

    # Count how many times the search-keyword list has been (re)loaded
    epoch = 1

    while True:
        # For each keyword, crawl the related weibos and each weibo's repost relationships
        for wd in searchlist:
            if wd == config.get('breakList'):
                config['breakPoint'] = True
            else:
                config['breakPoint'] = False

            print(
                f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  EPOCH: {epoch}. Keyword: {wd}.'
            )
            search_file = hot_dir + 'search_result_' + str(wd) + '.csv'
            repost_file = repost_dir + 'repost_Relationship_' + str(
                wd) + '.csv'
            # Create the two writer objects, which also creates the output files
            # When resuming from a breakpoint, do not create new files; only declare each file's columns
            search_writer = csvWriter(search_file,
                                      search=True,
                                      breakpos=config['breakPoint'])
            repost_writer = csvWriter(repost_file,
                                      repost=True,
                                      breakpos=config['breakPoint'])
            # A temporary directory is needed to hold the files written by the worker processes
            temp = repost_dir + wd + '/'

            if not config['breakPoint']:
                # Create the temporary directory
                os.mkdir(temp)
                # Fetch all weibos related to this keyword (at most 1000 can be retrieved)
                # Python's logging does not work across multiple processes, and the repost step needs multiprocessing,
                # so every step that writes to the log is run in a fresh process
                print(
                    f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Start searching keyword: {wd}.'
                )
                p = Pool(1)
                p.apply_async(word_get_query_info, args=(wd, search_writer))
                p.close()
                p.join()
                print(
                    f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Finished searching keyword: {wd}.'
                )

            # Get the list of ids of the related weibos
            raw_idList = search_writer.get_idList()
            # Split the ids whose repost relationships will be crawled, one group per process
            idList = splitList(raw_idList,
                               process_num,
                               breakpos=config['breakPoint'])

            # Crawl the repost relationships with multiple processes
            print(
                f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Start crawling repost relationship...'
            )
            # In a normal run, respect the user-configured number of processes
            # When resuming from a breakpoint, give each process at most 10 center_bw_id values
            p = Pool(len(idList))
            for num, item in enumerate(idList):
                if item.get('breakpos'):
                    p.apply_async(word_repost_relationship,
                                  args=(num, temp, item['sublist'],
                                        item['breakpos']))
                else:
                    p.apply_async(word_repost_relationship,
                                  args=(num, temp, item['sublist']))
            p.close()
            p.join()

            # Merge the intermediate files into one complete file and deduplicate
            repost_writer.merge_csv(temp)

            # Collect all topics related to this keyword to use as future search keywords
            if expand_topic:
                get_more_topic(wd, epoch, topic_dir)

        if expand_topic:
            # One full round of crawling is finished
            # Load the new list of search keywords
            filename = topic_dir + 'Topics_' + str(epoch) + '.csv'
            with open(filename, 'r', encoding='utf-8-sig') as f:
                rows = csv.reader(f)
                searchlist = list(set([row[0].strip() for row in rows]))
            os.remove(filename)
            epoch += 1
        else:
            break
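
csvWriter appears throughout these examples but its definition is not included. The skeleton below reflects the interface as it can be inferred from the call sites (constructor flags, get_idList, drop_duplicates and merge_csv); the column names and internals are assumptions:

import csv
import glob
import os

class csvWriter:
    # Interface skeleton inferred from the call sites above; the headers and
    # internal details are illustrative assumptions, not the project's code.
    def __init__(self, filename, search=False, repost=False, breakpos=False):
        self.filename = filename
        if not breakpos:
            # Fresh run: create the file and write a header row.
            header = ['bw_id', 'text'] if search else ['center_bw_id', 'bw_id', 'user']
            with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
                csv.writer(f).writerow(header)
        # When resuming from a breakpoint the existing file is left untouched.

    def get_idList(self):
        # Return the ids of all crawled weibos (first column, header skipped).
        with open(self.filename, 'r', encoding='utf-8-sig') as f:
            rows = list(csv.reader(f))
        return [row[0] for row in rows[1:] if row]

    def drop_duplicates(self):
        # Remove duplicate rows while keeping the original order.
        with open(self.filename, 'r', encoding='utf-8-sig') as f:
            rows = list(csv.reader(f))
        seen, unique = set(), []
        for row in rows:
            key = tuple(row)
            if key not in seen:
                seen.add(key)
                unique.append(row)
        with open(self.filename, 'w', newline='', encoding='utf-8-sig') as f:
            csv.writer(f).writerows(unique)

    def merge_csv(self, temp_dir):
        # Append every per-process file from the temporary directory to the
        # main file, then deduplicate.
        for part in sorted(glob.glob(os.path.join(temp_dir, '*.csv'))):
            with open(part, 'r', encoding='utf-8-sig') as f:
                rows = list(csv.reader(f))
            with open(self.filename, 'a', newline='', encoding='utf-8-sig') as f:
                csv.writer(f).writerows(rows)
        self.drop_duplicates()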
Example #9
    def __init__(self, name):
        log_dir = load_config()['log_dir']
        self.name = log_dir + name + '_spider.log'
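
Example #9 shows only the constructor of a small file-backed logger class; the class name and its other methods are not part of the snippet. A hypothetical completion that appends timestamped lines to the file stored in self.name could look like this:

import time

class SpiderLogger:
    # Hypothetical completion; only __init__ comes from Example #9, the class
    # name and the info() method are assumptions for illustration.
    def __init__(self, name):
        log_dir = load_config()['log_dir']
        self.name = log_dir + name + '_spider.log'

    def info(self, message):
        stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        with open(self.name, 'a', encoding='utf-8') as f:
            f.write(f'[{stamp}]  INFO         | {message}\n')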
Example #10
def word_spider():
    # Load the config file and get the output directories and search keywords
    config = load_config()
    hot_dir = config['hot_dir']
    topic_dir = config['topic_dir']
    repost_dir = config['repost_dir']
    process_num = config['process_num']
    one_word = config['one_word']
    searchlist = config['searchlist']

    # Count how many times the search-keyword list has been (re)loaded
    epoch = 1

    while True:
        # For each keyword, crawl the related weibos and each weibo's repost relationships
        for wd in searchlist:
            print(
                f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  EPOCH: {epoch}. Keyword: {wd}.'
            )
            search_file = hot_dir + 'search_result_' + str(wd) + '.csv'
            repost_file = repost_dir + 'repost_Relationship_' + str(
                wd) + '.csv'
            # Create the two writer objects, which also creates the output files
            search_writer = csvWriter(search_file, search=True)
            repost_writer = csvWriter(repost_file, repost=True)

            # Fetch all weibos related to this keyword (at most 1000 can be retrieved)
            # Python's logging does not work across multiple processes, and the repost step needs multiprocessing,
            # so every step that writes to the log is run in a fresh process
            print(
                f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Start searching keyword: {wd}.'
            )
            p = Pool(1)
            p.apply_async(word_get_query_info, args=(wd, search_writer))
            p.close()
            p.join()
            print(
                f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Finished searching keyword: {wd}.'
            )

            # Get the list of ids of the related weibos
            raw_idList = search_writer.get_idList()
            # Split the ids whose repost relationships will be crawled, one group per process
            idList = splitList(raw_idList, process_num)
            # A temporary directory is needed to hold the files written by the worker processes
            temp = repost_dir + wd + '/'
            os.mkdir(temp)

            # Crawl the repost relationships with multiple processes
            print(
                f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Start crawling repost relationship...'
            )
            p = Pool(process_num)
            for sublist in idList:
                p.apply_async(word_repost_relationship, args=(temp, sublist))
            p.close()
            p.join()

            # Merge the intermediate files into one complete file and deduplicate
            repost_writer.merge_csv(temp)

            # Collect all topics related to this keyword to use as future search keywords
            if one_word:
                break
            else:
                get_more_topic(wd, epoch, topic_dir)

        # One full round of crawling is finished
        # Load the new list of search keywords
        filename = topic_dir + 'Topics_' + str(epoch) + '.csv'
        with open(filename, 'r', encoding='utf-8-sig') as f:
            rows = csv.reader(f)
            searchlist = list(set([row[0].strip() for row in rows]))
        os.remove(filename)

        epoch += 1
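
Finally, a hedged example of how the spiders in this listing might be wired into an entry point. The one_word flag mirrors the one read in Example #10; the __main__ block itself is illustrative and not part of the original project:

if __name__ == '__main__':
    # Dispatch to the single-keyword crawler or the topic-expanding crawler
    # depending on the loaded configuration.
    config = load_config()
    if config.get('one_word'):
        one_word_spider()
    else:
        word_spider()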