def getLogger(name):
    # Read the log directory from the config file and set up a per-process log file.
    log_dir = load_config()['log_dir']
    logging.basicConfig(filename=log_dir + name + '_spider.log',
                        level=logging.INFO,
                        format='[%(asctime)s] %(levelname)-12s | %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    logger = logging.getLogger()
    return logger
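# Hedged usage sketch (illustrative, not part of the original module): getLogger is
# intended to be called once inside each worker process, with the process name as the
# log-file prefix, because logging.basicConfig only configures the root logger of the
# calling process. This mirrors how word_spider(searchlist) below obtains its logger.
def _getLogger_usage_example():
    name = multiprocessing.current_process().name  # e.g. 'ForkPoolWorker-1'
    logger = getLogger(name)
    logger.info('Logger for process %s is ready.', name)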
def pool_spider(group_num):
    # Load the config file and get the preprocessed search-word list.
    raw_searchlist = load_config()['searchlist']
    searchlist = split_searchList(raw_searchlist, 5)
    print('Link parent process %s.' % os.getpid())
    p = Pool(group_num)
    for group in searchlist:
        p.apply_async(word_spider, args=(group, ))
    p.close()  # close the pool; no more tasks will be submitted
    p.join()   # block the parent process until all child processes finish
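# split_searchList is used above but defined elsewhere in the project. The sketch
# below only illustrates its assumed behaviour (partition a word list into group_num
# roughly even sublists); it is not the project's actual implementation.
def _split_searchList_sketch(raw_searchlist, group_num):
    # Ceiling division keeps the groups balanced when the list does not divide evenly.
    size = max(1, (len(raw_searchlist) + group_num - 1) // group_num)
    return [raw_searchlist[i:i + size] for i in range(0, len(raw_searchlist), size)]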
def word_spider(searchlist):
    # Load the config file and get the output directories.
    config = load_config()
    hot_dir = config['hot_dir']
    topic_dir = config['topic_dir']
    repost_dir = config['repost_dir']
    # Create a logger named after the current process, writing to the configured log directory.
    name = multiprocessing.current_process().name
    logger = getLogger(name)
    topic_dir += name + '_'
    # Count how many times a search-word list has been loaded.
    epoch = 1
    while True:
        # For each word, crawl the related weibo posts and each post's repost relationships.
        for wd in searchlist:
            logger.info(f'EPOCH: {epoch}. Keyword: {wd}. Start crawling ...')
            search_file = hot_dir + 'search_result_' + str(wd) + '.csv'
            repost_file = repost_dir + 'repost_Relationship_' + str(wd) + '.csv'
            # Create the two writer objects; this also creates the output files.
            search_writer = csvWriter(search_file, search=True)
            repost_writer = csvWriter(repost_file, repost=True)
            # Fetch all weibo posts related to the keyword (at most 1000 can be retrieved).
            get_query_info(wd, search_writer, logger)
            # Get the list of related weibo ids.
            idList = search_writer.get_idList()
            # Crawl the repost relationships of each related post.
            for bw_id in idList:
                get_repost_relationship(bw_id, repost_writer, logger)
            repost_writer.drop_duplicates()
            # Collect all topics related to this word as the next round's search words.
            get_more_topic(wd, epoch, topic_dir, logger)
        # One round of crawling finished; load the new search-word list.
        filename = topic_dir + 'Topics_' + str(epoch) + '.csv'
        with open(filename, 'r', encoding='utf-8-sig') as f:
            rows = csv.reader(f)
            searchlist = [row[0].strip() for row in rows]
        # Remove the intermediate file.
        os.remove(filename)
        epoch += 1
def one_word_spider():
    # Load the config file and get the output directories and the search word.
    config = load_config()
    hot_dir = config['hot_dir']
    repost_dir = config['repost_dir']
    one_repost_dir = config['one_repost_dir']
    searchlist = config['searchlist']
    if isinstance(searchlist, str):
        wd = searchlist
    elif len(searchlist) == 1:
        wd = searchlist[0]
    else:
        raise ValueError('one_word_spider() can only accept one search word!')
    # Output file for the search results.
    search_file = hot_dir + 'search_result_' + str(wd) + '.csv'
    # Create the writer object; this also creates the file.
    search_writer = csvWriter(search_file, search=True)
    # Fetch all weibo posts related to the keyword (at most 1000 can be retrieved).
    # Python's logging is not process-safe, so the logging work runs inside a process pool.
    print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}] Keyword: {wd}. Start crawling ...')
    p = Pool(1)
    p.apply_async(one_word_get_query_info, args=(wd, search_writer))
    p.close()
    p.join()
    # Get the list of related weibo ids and split it into 10 groups,
    # so that 10 processes can crawl the repost relationships in parallel.
    raw_searchlist = search_writer.get_idList()
    searchList = split_searchList(raw_searchlist, 10)
    # Create a pool of 10 processes.
    print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}] Generating Process Pool Containing 10 Processes...')
    p = Pool(10)
    for group in searchList:
        p.apply_async(one_word_repost_relationship, args=(group,))
    p.close()
    p.join()
    # Merge the csv files written by the 10 processes.
    print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}] Start Merging csv Files...')
    merge_csv(wd, one_repost_dir, repost_dir)
def one_word_continue():
    # Load the config file and get the output directories and the search word.
    config = load_config()
    repost_dir = config['repost_dir']
    one_repost_dir = config['one_repost_dir']
    searchlist = config['searchlist']
    if isinstance(searchlist, str):
        wd = searchlist
    elif len(searchlist) == 1:
        wd = searchlist[0]
    else:
        raise ValueError('one_word_continue() can only accept one search word!')
    # Read the weibo posts already crawled for this keyword.
    filename = config['hot_dir'] + 'search_result_' + str(wd) + '.csv'
    reader = csvWriter(filename, search=True, breakpos=True)
    raw_searchlist = reader.get_idList()
    # Resume crawling from the center_bw_id at which each process was interrupted.
    breakpos = config['breakpos']
    allList = getBreakList(raw_searchlist, breakpos)
    # Start the process pool.
    pool_size = len(allList)
    p = Pool(pool_size)
    print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}] Generating Process Pool Containing {pool_size} Processes...')
    print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}] Start crawling repost relationship...')
    for sublist in allList:
        if sublist.get('breakpos'):
            p.apply_async(one_word_repost_relationship,
                          args=(sublist['thisList'], sublist['breakpos']))
        else:
            p.apply_async(one_word_repost_relationship, args=(sublist['thisList'],))
    p.close()
    p.join()
    print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}] Finished crawling repost relationship!')
    # Merge all csv files.
    merge_csv(wd, one_repost_dir, repost_dir)
def splitList(raw_searchlist, group_num, breakpos=False):
    # Preprocessing for the process pool.
    # searchlist: split into group_num groups, one per process. Each element is a dict
    # whose 'sublist' key holds a list of search words / ids and whose 'breakpos' key,
    # if present, holds the resume point.
    searchlist = []
    temp = []
    count = 1
    num = int(len(raw_searchlist) / group_num)
    if num == 1:
        raise Exception('Please reduce the group number or add more words')
    for ele in raw_searchlist:
        temp.append(ele)
        if count % num == 0 and len(searchlist) != group_num - 1:
            searchlist.append({'sublist': temp})
            temp = []
        count += 1
    if temp:
        searchlist.append({'sublist': temp})
    # Breakpoint handling.
    if breakpos:
        newList = []
        breakList = load_config()['breakList']
        for item in breakList:
            this_dict = {}
            this_dict['breakpos'] = item
            temp_list = searchlist[item['batch_num']]['sublist']
            pos = temp_list.index(item['center_bw_id'])
            this_dict['sublist'] = temp_list[pos:pos + 10]
            newList.append(this_dict)
            # If many ids remain in this batch, group the ids after the breakpoint
            # into additional sublists (of at most 10) without a breakpoint entry.
            pos += 10
            while pos < len(temp_list):
                newList.append({'sublist': temp_list[pos:pos + 10]})
                pos += 10
        return newList
    else:
        return searchlist
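# Hedged example (the ids and batch numbers below are made up): when breakpos=True,
# splitList reads config['breakList'], where each entry is expected to carry the batch
# index and the center_bw_id at which that batch was interrupted, matching the keys
# accessed above ('batch_num' and 'center_bw_id').
_example_breakList = [
    {'batch_num': 0, 'center_bw_id': '4567890123456789'},
    {'batch_num': 3, 'center_bw_id': '4567890987654321'},
]
# A resumed run would then call
#     idList = splitList(raw_idList, process_num, breakpos=True)
# and each returned dict holds a 'sublist' of at most 10 ids plus, for resumed
# batches, the matching 'breakpos' entry.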
def load_db_conn_from_config_file(config_file, db_name):
    db_config = load_config(config_file, config_name=db_name)
    db_conn = get_db_connection_from_config(db_config)
    return db_conn
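# Hedged usage sketch (the file name and section name are placeholders, not taken from
# the project): this helper pairs load_config with get_db_connection_from_config so
# callers only pass a config path and the name of the database section, e.g.
#     db_conn = load_db_conn_from_config_file('config.json', 'weibo_db')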
def word_spider():
    # Load the config file and get the output directories and search words.
    config = load_config()
    hot_dir = config['hot_dir']
    repost_dir = config['repost_dir']
    process_num = config['process_num']
    searchlist = config['searchlist']
    expand_topic = config['expand_topic']
    if expand_topic:
        topic_dir = config['topic_dir']
    # Count how many times a search-word list has been loaded.
    epoch = 1
    while True:
        # For each word, crawl the related weibo posts and each post's repost relationships.
        for wd in searchlist:
            if wd == config.get('breakList'):
                config['breakPoint'] = True
            else:
                config['breakPoint'] = False
            print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}] EPOCH: {epoch}. Keyword: {wd}.')
            search_file = hot_dir + 'search_result_' + str(wd) + '.csv'
            repost_file = repost_dir + 'repost_Relationship_' + str(wd) + '.csv'
            # Create the two writer objects; this also creates the output files.
            # When resuming from a breakpoint, no new files are created; only the
            # column fields of each file are specified.
            search_writer = csvWriter(search_file, search=True,
                                      breakpos=config['breakPoint'])
            repost_writer = csvWriter(repost_file, repost=True,
                                      breakpos=config['breakPoint'])
            # A temporary directory is needed to store the per-process files.
            temp = repost_dir + wd + '/'
            if not config['breakPoint']:
                # Create the temporary directory.
                os.mkdir(temp)
            # Fetch all weibo posts related to the keyword (at most 1000 can be retrieved).
            # Python's logging does not work across multiple processes, and the repost
            # step below is multi-process, so every step that writes logs runs in its
            # own process.
            print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}] Start searching keyword: {wd}.')
            p = Pool(1)
            p.apply_async(word_get_query_info, args=(wd, search_writer))
            p.close()
            p.join()
            print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}] Finished searching keyword: {wd}.')
            # Get the list of related weibo ids.
            raw_idList = search_writer.get_idList()
            # Split the ids so that the repost crawling can run in several processes.
            idList = splitList(raw_idList, process_num, breakpos=config['breakPoint'])
            # Crawl the repost relationships with multiple processes.
            print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}] Start crawling repost relationship...')
            # For a normal crawl, use the user-configured number of processes.
            # When resuming from a breakpoint, each process handles at most 10 center_bw_id.
            p = Pool(len(idList))
            for num, item in enumerate(idList):
                if item.get('breakpos'):
                    p.apply_async(word_repost_relationship,
                                  args=(num, temp, item['sublist'], item['breakpos']))
                else:
                    p.apply_async(word_repost_relationship,
                                  args=(num, temp, item['sublist']))
            p.close()
            p.join()
            # Merge the intermediate files into one file and drop duplicates.
            repost_writer.merge_csv(temp)
            # Collect all topics related to this word as the next round's search words.
            if expand_topic:
                get_more_topic(wd, epoch, topic_dir)
        if expand_topic:
            # One round of crawling finished; load the new search-word list.
            filename = topic_dir + 'Topics_' + str(epoch) + '.csv'
            with open(filename, 'r', encoding='utf-8-sig') as f:
                rows = csv.reader(f)
                searchlist = list(set(row[0].strip() for row in rows))
            os.remove(filename)
            epoch += 1
        else:
            break
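# Hedged example of the config entries this version of word_spider reads. All paths,
# keywords, and values are placeholders; the project's real settings file may store
# them in a different format or under different section names.
_example_word_spider_config = {
    'hot_dir': './data/hot/',
    'repost_dir': './data/repost/',
    'topic_dir': './data/topic/',        # only required when expand_topic is True
    'process_num': 10,
    'searchlist': ['keyword_1', 'keyword_2'],
    'expand_topic': True,
    'breakList': None,                   # assumed: the keyword at which a previous run stopped
}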
def __init__(self, name):
    # Build the log-file path from the configured log directory.
    log_dir = load_config()['log_dir']
    self.name = log_dir + name + '_spider.log'
def word_spider():
    # Load the config file and get the output directories and search words.
    config = load_config()
    hot_dir = config['hot_dir']
    topic_dir = config['topic_dir']
    repost_dir = config['repost_dir']
    process_num = config['process_num']
    one_word = config['one_word']
    searchlist = config['searchlist']
    # Count how many times a search-word list has been loaded.
    epoch = 1
    while True:
        # For each word, crawl the related weibo posts and each post's repost relationships.
        for wd in searchlist:
            print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}] EPOCH: {epoch}. Keyword: {wd}.')
            search_file = hot_dir + 'search_result_' + str(wd) + '.csv'
            repost_file = repost_dir + 'repost_Relationship_' + str(wd) + '.csv'
            # Create the two writer objects; this also creates the output files.
            search_writer = csvWriter(search_file, search=True)
            repost_writer = csvWriter(repost_file, repost=True)
            # Fetch all weibo posts related to the keyword (at most 1000 can be retrieved).
            # Python's logging does not work across multiple processes, and the repost
            # step below is multi-process, so every step that writes logs runs in its
            # own process.
            print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}] Start searching keyword: {wd}.')
            p = Pool(1)
            p.apply_async(word_get_query_info, args=(wd, search_writer))
            p.close()
            p.join()
            print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}] Finished searching keyword: {wd}.')
            # Get the list of related weibo ids.
            raw_idList = search_writer.get_idList()
            # Split the ids so that the repost crawling can run in several processes.
            idList = splitList(raw_idList, process_num)
            # A temporary directory is needed to store the per-process files.
            temp = repost_dir + wd + '/'
            os.mkdir(temp)
            # Crawl the repost relationships with multiple processes.
            print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}] Start crawling repost relationship...')
            p = Pool(process_num)
            for sublist in idList:
                p.apply_async(word_repost_relationship, args=(temp, sublist))
            p.close()
            p.join()
            # Merge the intermediate files into one file and drop duplicates.
            repost_writer.merge_csv(temp)
            # When only one keyword is configured, stop here instead of expanding topics.
            if one_word:
                break
            # Collect all topics related to this word as the next round's search words.
            get_more_topic(wd, epoch, topic_dir)
        if one_word:
            # No Topics_<epoch>.csv is written in one-word mode, so end the crawl here.
            break
        # One round of crawling finished; load the new search-word list.
        filename = topic_dir + 'Topics_' + str(epoch) + '.csv'
        with open(filename, 'r', encoding='utf-8-sig') as f:
            rows = csv.reader(f)
            searchlist = list(set(row[0].strip() for row in rows))
        os.remove(filename)
        epoch += 1
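# Hedged entry-point sketch (not part of the original module): multiprocessing.Pool
# must only be created from the main process, so the crawler would typically be
# launched from a guard like the one below. Which driver to call (word_spider,
# one_word_spider, or one_word_continue) is assumed to depend on the config.
if __name__ == '__main__':
    word_spider()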