def down_dedao():
    global problem_list
    mysql = MySQL()
    try:
        mysql.get_connection()
        select_results = mysql.select('article', ['article_id', 'article_content'],
                                      'avatar_uploaded = 0')
        print(len(select_results))
        problem_list = []
        # type:"audio , audio:"
        # type:"image , src:"
        for result in select_results:
            resource_list = []
            # article id
            id = result[0]
            # append the author's avatar
            article_author = mysql.select('article_author', ['article_id', 'author_id'],
                                          'article_id = ' + str(id))
            author_info = mysql.select('author', ['author_name', 'author_avatar'],
                                       'author_id = ' + str(article_author[0][1]))
            resource_list.append(author_info[0][1])
            # look up the cover image
            ext_attributes = mysql.select(
                'ext_attribute', ['article_id', 'attribute_name', 'attribute_value'],
                'article_id = ' + str(id))
            for ext_attribute in ext_attributes:
                # append commenters' avatars
                if ext_attribute[1] == 'comment':
                    comment = json.loads(ext_attribute[2])
                    for note in comment['list']:
                        resource_list.append(note['notes_owner']['avatar'])
            # article body
            try:
                # create the target directory first
                os.mkdir(os.path.join(os.path.abspath('resource'), str(id)))
                # resource_path_list = []
                # download the article's links one by one, building the right request headers
                for resource in resource_list:
                    audio_parse = urlparse(resource)
                    # download audio
                    if '.m4a' in resource or '.mp4' in resource:
                        audio_headers['Host'] = audio_parse[1]
                        audio = requests.get(resource, headers=audio_headers)
                    else:
                        image_headers['Host'] = audio_parse[1]
                        audio = requests.get(resource, headers=image_headers)
                    # extract the resource name from the URL path
                    audio_name = audio_parse[2][audio_parse[2].rfind('/') + 1:]
                    # store the article's resources under resource/id/...
                    file_path = os.path.join(os.path.abspath('resource'), str(id), audio_name)
                    with open(file_path, 'wb') as f:
                        f.write(audio.content)
                    print('Resource downloaded: %s' % file_path)
                    time.sleep(2)
            except Exception:
                problem_list.append(id)
    finally:
        print(problem_list)
        mysql.close_connection()
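# down_dedao() above relies on module-level audio_headers and image_headers dicts
# (defined elsewhere in the project, presumably captured from the app's real
# traffic). A minimal sketch of their assumed shape, with placeholder values;
# the Host header is overwritten per request:
audio_headers = {
    'User-Agent': 'okhttp/3.9.0',  # placeholder; copy the value from a captured request
    'Accept-Encoding': 'gzip',
    'Connection': 'keep-alive',
}
image_headers = dict(audio_headers)  # assumed to share the same shape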
            print('File %s uploaded successfully: %s' % (path, DOMAIN + key))
            return DOMAIN + key
        except Exception as e:
            print(e)
            print('Re-uploading file %s, attempt number %s' % (path, current_count))
            return upload_file(path, current_count + 1, max_retry_count)
    else:
        print('File %s exceeded %s upload attempts, upload failed!' % (path, max_retry_count))
        # raise if the file still has not been uploaded after 3 attempts
        raise Exception('File %s exceeded %s upload attempts, upload failed!' % (path, max_retry_count))


if __name__ == '__main__':
    mysql = MySQL()
    mysql.get_connection()
    try:
        print(os.path.abspath('resource'))
        dir_list = os.listdir(os.path.abspath('resource'))
        # each dir holds all the resources of one article
        for dir in dir_list:
            try:
                dir_path = os.path.join(os.path.abspath('resource'), dir)
                file_list = os.listdir(dir_path)
                for file in file_list:
                    file_path = os.path.join(dir_path, file)
                    # upload; retry up to 3 times on failure
                    upload_file(file_path)
                    # time.sleep(1)
                # mark the article's uploaded flag as done
                mysql.update('article', 'uploaded = 1', 'article_id = "%s"' % dir)
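# The recursion above retries by calling upload_file again with an incremented
# counter. An equivalent iterative sketch; do_upload is a hypothetical stand-in
# for the real upload call that lives inside the try block:
def upload_with_retry(path, max_retry_count=3):
    last_error = None
    for attempt in range(1, max_retry_count + 1):
        try:
            return do_upload(path)  # hypothetical upload call
        except Exception as e:
            last_error = e
            print('File %s failed on attempt %s: %s' % (path, attempt, e))
    raise Exception('File %s exceeded %s upload attempts!' % (path, max_retry_count)) from last_error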
from handle_mysql import MySQL

mysql = MySQL()
try:
    mysql.get_connection()
    # incomplete scratch call: the row dict for tb_column was never supplied
    # mysql.insert('tb_column', )
    # column = mysql.select('tb_column', ['column_id', 'current_article_num'], 'column_name="%s"' % '万维钢·精英日课³')
    # # print(result)
    # # crawled_article = []
    # #
    # # if it is already in the database
    # if column:
    #     column_id = column[0][0]
    #     current_article_num = column[0][1]
    #     print(column_id, current_article_num)
    #
    #
    # def f(x):
    #     return x[0]
    #
    #
    # crawled_article_id = mysql.select('article_column', ['article_id'], 'column_id="%s"' % column_id)
    # crawled_article_ids = list(map(f, crawled_article_id))
    # print(crawled_article_ids)
    # print(len(crawled_article_ids), current_article_num)
    #
    # if len(crawled_article_ids) < current_article_num:
    #     for article_id in crawled_article_ids:
    #         article_name = mysql.select('article', ['article_name'], 'article_id="%s"' % article_id)
def check_num(column_id):
    '''
    Check whether any article under the given column_id is missing.
    :param column_id:
    :return:
    '''
    mysql = MySQL()
    try:
        mysql.get_connection()
        ids = mysql.select('article_column', ['article_id'], 'column_id=%s' % column_id)
        print(ids)
        print(len(ids))
        time.sleep(3)

        def f(x):
            return x[0]

        ids = list(map(f, ids))
        print(ids)
        # find the id of the first article
        for id in ids:
            r = mysql.select(
                'ext_attribute', ['attribute_value'],
                "attribute_name = 'prev_article_id' and attribute_value = '0' and article_id = %s" % id)
            if r:
                first_id = id
                break
        print(first_id)
        time.sleep(3)
        token = True
        for i in range(400):
            # look up the id of the article following the current one
            r = mysql.select(
                'ext_attribute', ['attribute_value'],
                "attribute_name = 'next_article_id' and article_id = %s" % first_id)
            if not r:
                break
            if int(r[0][0]) == 0:
                token = False
                break
            # check whether the article with that next id exists
            r2 = mysql.select('article', ['article_id'], "article_id = %s" % r[0][0])
            # if it does not exist
            if not r2:
                break
            first_id = r[0][0]
        # the article after this first_id is missing
        if token:
            print('The article following id %s does not exist!' % first_id)
        else:
            print('No missing articles!')
        problem_list = []
        # for result in select_results:
        #     try:
        #         contents = json.loads(result[1])
        #     except:
        #         problem_list.append(result[0])
        # if problem_list:
        #     print('Article ids with problems:', problem_list)
        # else:
        #     print('No article has problems')
    finally:
        mysql.close_connection()
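# A typical invocation; 522897 is the column id used in main_download_upload.py
# below, so substitute one that actually exists in your article_column table:
if __name__ == '__main__':
    check_num(522897)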
import os
from down_upload_resource.download_file import down_dedao
from down_upload_resource.upload_file import upload_file
from down_upload_resource.delete_dir import clear_dir
from handle_mysql import MySQL
import time

# download a batch (here 50 articles of column 522897) into the resource folder
down_dedao(522897, 50)

# now upload
mysql = MySQL()
mysql.get_connection()
try:
    print(os.path.abspath('resource'))
    dir_list = os.listdir(os.path.abspath('resource'))
    # each dir holds all the resources of one article
    for dir in dir_list:
        try:
            dir_path = os.path.join(os.path.abspath('resource'), dir)
            file_list = os.listdir(dir_path)
            for file in file_list:
                file_path = os.path.join(dir_path, file)
                # upload; retry up to 3 times on failure
                upload_file(file_path)
                # time.sleep(1)
            # mark the article's uploaded flag as done
            mysql.update('article', 'uploaded = 1', 'article_id = "%s"' % dir)
        except Exception as e:
            print(e)
            reset_columns(driver)
            break


# reset the column list
def reset_columns(driver):
    # go back to reset the list
    driver.back()
    # tap "Courses"
    if wait.until(lambda x: x.find_element_by_xpath(
            "//android.widget.TextView[@resource-id='com.luojilab.player:id/tv_course']"
    )):
        driver.find_element_by_xpath(
            "//android.widget.TextView[@resource-id='com.luojilab.player:id/tv_course']"
        ).click()
    # tap "Recently purchased" inside Courses
    if wait.until(lambda x: x.find_element_by_id(
            "com.luojilab.player:id/nearBuyBtn")):
        driver.find_element_by_id("com.luojilab.player:id/nearBuyBtn").click()


if __name__ == '__main__':
    mysql = MySQL()
    try:
        mysql.get_connection()
        handle_dedao(driver)
    finally:
        mysql.close_connection()
def crawl_article(driver):
    l = get_size(driver)
    x1 = int(l[0] * 0.5)
    y1 = int(l[1] * 0.75)
    y2 = int(l[1] * 0.25)
    n = 0
    # swipe until the page source stops changing a few times in a row
    while True:
        temp = driver.page_source
        driver.swipe(x1, y1, x1, y2)
        time.sleep(0.2)
        # if temp == driver.page_source and '点击加载留言' in driver.page_source:
        if temp == driver.page_source:
            if n > 3:
                break
            n += 1


if __name__ == '__main__':
    mysql = MySQL()
    try:
        # mysql.get_connection()
        handle_dedao(driver)
    finally:
        pass
        # mysql.close_connection()
def response(flow):
    # comments for the article
    # https://entree.igetget.com/ledgers/notes/article_comment_list
    if 'igetget.com/ledgers/notes/article_comment_list' in flow.request.url:
        content = json.loads(flow.response.text)
        # if there are more comments, load what is stored, append, and store again
        if content['c']['isMore']:
            if not os.path.exists('../temp_file/5.txt') or os.path.getsize('../temp_file/5.txt') == 0:
                with open('../temp_file/5.txt', 'wb') as f:
                    pickle.dump(content, f)
            else:
                print('size before:', os.path.getsize('../temp_file/5.txt'))
                with open('../temp_file/5.txt', 'rb') as f:
                    temp = pickle.load(f)
                with open('../temp_file/5.txt', 'wb') as f:
                    f.truncate()
                print('size in between:', os.path.getsize('../temp_file/5.txt'))
                temp['c']['list'] = temp['c']['list'] + content['c']['list']
                with open('../temp_file/5.txt', 'wb') as f:
                    pickle.dump(temp, f)
                print('size after:', os.path.getsize('../temp_file/5.txt'))
        # either the only page or the last page
        else:
            # read the accumulated pages from the file and store everything
            # in the database together with the article id
            request = flow.request
            print(request.content)
            bys = request.content
            s = '&' + bys.decode('utf-8') + '&'
            page_search = re.compile(r"&page=(.*?)&")
            page = re.search(page_search, s).group(1)
            page_count_search = re.compile(r"&page_count=(.*?)&")
            page_count = re.search(page_count_search, s).group(1)
            article_id_search = re.compile(r"&article_id=(.*?)&")
            article_id = re.search(article_id_search, s).group(1)
            print(page, page_count, article_id)
            if os.path.getsize('../temp_file/5.txt'):
                print('last page')
                with open('../temp_file/5.txt', 'rb') as f:
                    temp = pickle.load(f)
                temp['c']['list'] = temp['c']['list'] + content['c']['list']
                print('total', temp['c']['total'], 'current', len(temp['c']['list']))
                # temp will be parsed, assembled and stored in the database below
                with open('../temp_file/5.txt', 'wb') as f:
                    f.truncate()
            # only this single page
            else:
                # store directly; the file does not need to be cleared
                print('only one page')
                print('total', content['c']['total'], 'current', len(content['c']['list']))
                temp = content
            ext_info = {}
            ext_info['article_id'] = int(article_id)
            ext_info['attribute_name'] = 'comment'
            comment = {}
            # id of the article the comments belong to
            comment['article_id'] = article_id
            comment['per_page_count'] = page_count
            # list of comments
            comment['list'] = []
            # total number of comments
            comment['comment_total'] = temp['c']['total']
            if 0 != int(comment['comment_total']):
                # extra comment info, including the article's column info, just in case
                comment['extra'] = temp['c']['list'][0].get('extra')
                for a_comment in temp['c']['list']:
                    comment_dict = {}
                    comment_dict['note_id'] = a_comment.get('note_id')
                    # comment body
                    comment_dict['note'] = a_comment.get('note')
                    comment_dict['content'] = a_comment.get('content')
                    comment_dict['note_title'] = a_comment.get('note_title')
                    # author's reply
                    comment_dict['note_line'] = a_comment.get('note_line')
                    # comment timestamps
                    comment_dict['comment_reply_time'] = a_comment.get('comment_reply_time')
                    comment_dict['create_time'] = a_comment.get('create_time')
                    comment_dict['update_time'] = a_comment.get('update_time')
                    # repost count
                    comment_dict['repost_count'] = a_comment['notes_count'].get('repost_count', 0)
                    # reply count
                    comment_dict['comment_count'] = a_comment['notes_count'].get('comment_count', 0)
                    # like count
                    comment_dict['like_count'] = a_comment['notes_count'].get('like_count', 0)
                    comment_dict['notes_owner'] = a_comment.get('notes_owner')
                    comment['list'].append(comment_dict)
                # print(comment)
            else:
                comment['list'] = []
            # keep at most the first 18 comments
            if len(comment['list']) > 18:
                comment['list'] = comment['list'][:18]
            ext_info['attribute_value'] = json.dumps(comment, ensure_ascii=False)
            # print(len(json.dumps(comment)))
            # print(json.dumps(comment).encode('gb2312').decode('unicode_escape'))
            # print(len(json.dumps(comment).encode('gb2312').decode('unicode_escape')))
            # insert into the database
            mysql = MySQL()
            try:
                mysql.get_connection()
                # The current dedup check only looks at whether this article_id
                # already has a 'comment' ext attribute; if so, the record is
                # treated as a duplicate. The stored content is never compared,
                # only the presence of the attribute.
                mysql.insert('ext_attribute', ext_info)
            finally:
                mysql.close_connection()
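# A content-aware variant would compare the stored value before inserting.
# A sketch, assuming the same MySQL helper used above (this is not what the
# project currently does; a real version would update the existing row rather
# than skip it):
def insert_comment_if_changed(mysql, ext_info):
    existing = mysql.select(
        'ext_attribute', ['attribute_value'],
        "attribute_name = 'comment' and article_id = %s" % ext_info['article_id'])
    if existing and existing[0][0] == ext_info['attribute_value']:
        print('comment for article %s unchanged, skipping' % ext_info['article_id'])
        return
    mysql.insert('ext_attribute', ext_info)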
def down_dedao(column_id, num):
    print(image_headers)
    print(proxies)
    global problem_list
    mysql = MySQL()
    try:
        mysql.get_connection()
        # select_results = mysql.select('article', ['article_id', 'article_content'], 'uploaded = 0 limit %s' % num)
        select_results = mysql.select(
            'article', ['article_id', 'article_content'],
            'uploaded = 0 and article_id in (SELECT article_id FROM article_column WHERE column_id = %s) limit %s'
            % (column_id, num))
        print(len(select_results))
        problem_list = []
        # type:"audio , audio:"
        # type:"image , src:"
        for result in select_results:
            resource_list = []
            file_path_list = []
            # article id
            id = result[0]
            # append the author's avatar
            article_author = mysql.select('article_author', ['article_id', 'author_id'],
                                          'article_id = ' + str(id))
            author_info = mysql.select('author', ['author_name', 'author_avatar'],
                                       'author_id = ' + str(article_author[0][1]))
            resource_list.append(author_info[0][1])
            # look up the cover image
            ext_attributes = mysql.select(
                'ext_attribute', ['article_id', 'attribute_name', 'attribute_value'],
                'article_id = ' + str(id))
            for ext_attribute in ext_attributes:
                # append the cover image
                if ext_attribute[1] == 'cover_image':
                    resource_list.append(ext_attribute[2])
                # append commenters' avatars
                if ext_attribute[1] == 'comment':
                    comment = json.loads(ext_attribute[2])
                    for note in comment['list']:
                        resource_list.append(note['notes_owner']['avatar'])
            # article body
            try:
                contents = json.loads(result[1])
                for content in contents:
                    if content['type'] == 'audio':
                        resource_list.append(content['audio']['mp3_play_url'])
                    elif content['type'] == 'image' and content.get('src'):
                        resource_list.append(content.get('src'))
                # create the target directory first
                os.mkdir(os.path.join(os.path.abspath('resource'), str(id)))
                # download the article's links one by one, building the right request headers
                for resource in resource_list:
                    audio_parse = urlparse(resource)
                    # download audio
                    if '.m4a' in resource or '.mp4' in resource:
                        audio_headers['Host'] = audio_parse[1]
                        # audio = requests.get(resource, headers=audio_headers, proxies=proxies)
                        audio = requests.get(resource, headers=audio_headers)
                    else:
                        image_headers['Host'] = audio_parse[1]
                        # audio = requests.get(resource, headers=image_headers, proxies=proxies)
                        audio = requests.get(resource, headers=image_headers)
                    # extract the resource name from the URL path
                    audio_name = audio_parse[2][audio_parse[2].rfind('/') + 1:]
                    # store the article's resources under resource/id/...
                    file_path = os.path.join(os.path.abspath('resource'), str(id), audio_name)
                    with open(file_path, 'wb') as f:
                        f.write(audio.content)
                    file_path_list.append(file_path)
                    print('Resource downloaded: %s' % file_path)
                    time.sleep(1)
            except Exception:
                problem_list.append(id)
    finally:
        print(problem_list)
        mysql.close_connection()
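# The loop above slices the file name out of the URL path by hand with rfind.
# An equivalent stdlib spelling (a sketch; urlparse drops the query string, so
# behavior should match for these URLs):
import posixpath
from urllib.parse import urlparse

def resource_name(url):
    # 'https://example.com/a/b/c.m4a?sign=x' -> 'c.m4a'
    return posixpath.basename(urlparse(url).path)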
    def response(self, flow: mitmproxy.http.HTTPFlow):
        # request sent when the full column list is displayed
        # be careful here not to wipe what was accumulated earlier
        # if 'entree.igetget.com/purchased/v2/product/allList' in flow.request.url:
        if 'igetget.com/purchased/v2/product/allList' in flow.request.url:
            self.num = self.num + 1
            ctx.log.info("We've seen %d flows" % self.num)
            # inspect the request to see whether it is the first page
            request = flow.request
            bys = request.content
            s = '&' + bys.decode('utf-8') + '&'
            page_search = re.compile(r"&page=(.*?)&")
            page = re.search(page_search, s).group(1)
            print('page number:', page)
            # if this is the first page, start temp_file/1.txt over:
            # remove it if it exists and create a fresh empty file
            if page == '1':
                print('------------------')
                if os.path.exists('temp_file/1.txt'):
                    # remove the old file
                    os.remove('temp_file/1.txt')
                # create a new one
                with open('temp_file/1.txt', 'wb') as f:
                    pass
            content = json.loads(flow.response.text)
            # append this page's columns to what was accumulated so far,
            # otherwise the earlier pages are lost after a refresh
            # (a redis cache would be a better fit than this file juggling)
            _1_info_before = content
            # temp_file/1.txt is guaranteed to exist at this point
            if os.path.getsize('temp_file/1.txt'):
                print('no null')
                # read the accumulated data and merge in the new page
                with open('temp_file/1.txt', 'rb') as f:
                    _1_info_before = pickle.load(f)
                _1_info_before['c']['list'] = _1_info_before['c']['list'] + content['c']['list']
            with open('temp_file/1.txt', 'wb') as f:
                pickle.dump(_1_info_before, f)

        # request 1 sent after tapping a column
        # if 'entree.igetget.com/bauhinia/v1/class/purchase/info' in flow.request.url:
        if 'igetget.com/bauhinia/v1/class/purchase/info' in flow.request.url:
            content = json.loads(flow.response.text)
            with open('temp_file/1.txt', 'rb') as f:
                _1_info = pickle.load(f)
            column_info = {}
            author_info = {}
            source_info = {}
            # assemble the column info for the column that was tapped
            for alist in _1_info['c']['list']:
                # only articles with structure category 40 are crawled for now
                # prefer the fields from `content` where possible
                # column ids turn out not to be unique, so narrow the match down
                # by type as well, otherwise another column's info gets attached
                # to the wrong column; pin it down with id, category, type and title
                # print('-----------' * 10)
                # print(alist['id'])
                # print(content['c']['class_info']['product_id'])
                # print(alist['category'])
                # print(alist['type'])
                # print(content['c']['class_info']['product_type'])
                # print(content['c']['class_info']['name'])
                # print(alist['title'])
                # print('-----------' * 10)
                if alist['id'] == content['c']['class_info']['product_id'] \
                        and alist['category'] == 40 \
                        and alist['type'] == content['c']['class_info']['product_type'] \
                        and content['c']['class_info']['name'] == alist['title']:
                    # column
                    # the column id is generated from the column name
                    column_info['column_id'] = generate_id(content['c']['class_info']['name'])
                    column_info['column_name'] = content['c']['class_info']['name']
                    column_info['column_info'] = content['c']['items'][1]['content']
                    column_info['column_learn_num'] = content['c']['class_info']['learn_user_count']
                    # total number of lessons
                    column_info['article_num'] = content['c']['class_info']['phase_num']
                    column_info['current_article_num'] = content['c']['class_info']['current_article_count']
                    column_info['finished'] = 0 if content['c']['class_info']['phase_num'] - content['c']['class_info']['current_article_count'] > 0 else 1
                    column_info['crawl_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    # author
                    author_info['author_id'] = generate_id(content['c']['class_info']['lecturer_name'])
                    author_info['author_name'] = content['c']['class_info']['lecturer_name']
                    author_info['author_avatar'] = content['c']['class_info']['lecturer_avatar']
                    author_info['author_info'] = content['c']['items'][0]['content']
                    # source
                    source_info['source_name'] = 'https://www.igetget.com/'
                    source_info['source_info'] = ''
                    source_info['source_id'] = generate_id(source_info['source_name'])
                    break
            mysql = MySQL()
            mysql.get_connection()
            try:
                if column_info:
                    print(json.dumps(column_info))
                    # store the category==40 column info in the database
                    mysql.insert('tb_column', column_info)
                if author_info:
                    print(json.dumps(author_info))
                    # store the author info in the database
                    mysql.insert('author', author_info)
                if source_info:
                    print(json.dumps(source_info))
                    # store the source info in the database
                    mysql.insert('source', source_info)
            except Exception as e:
                print(e)
            finally:
                mysql.close_connection()
            with open('temp_file/2.txt', 'wb') as f:
                pickle.dump(content, f)

        # request 1 sent after tapping an article inside a column
        # https://entree.igetget.com/bauhinia/v1/article/info
        if 'igetget.com/bauhinia/v1/article/info' in flow.request.url:
            content = json.loads(flow.response.text)
            with open('temp_file/4.txt', 'wb') as f:
                pickle.dump(content, f)

        # when the request url contains the string below, parse the article body;
        # it carries the article title, cover and body text;
        # the article's column and chapter still have to be attached
        if 'igetget.com/ddarticle/v1/article/get' in flow.request.url:
            content = json.loads(flow.response.text)
            article_info = {}
            ext_info = {}
            article_author_info = {}
            article_column_info = {}
            article_source_info = {}
            article_category_info = {}
            with open('temp_file/4.txt', 'rb') as f:
                _4_info = pickle.load(f)
            mysql = MySQL()
            mysql.get_connection()
            try:
                if _4_info and _4_info['c']['dd_article_id'] == content['data']['article']['Id']:
                    # embed the column info
                    # article_info['column_id'] = generate_id(_4_info['c']['class_info']['name'])
                    # article_info['column_name'] = _4_info['c']['class_info']['name']
                    # article_info['class_id'] = _4_info['c']['article_info']['class_id']
                    # article id
                    article_info['article_id'] = _4_info['c']['article_info']['id']
                    article_info['article_name'] = _4_info['c']['article_info']['title']
                    # the body is a JSON string, so parse it first
                    sub_content = json.loads(content['data']['content'])
                    # parse the body, convert it back to a string and fix the encoding
                    temp = handle_dedao_dict(get_content_list(sub_content, _4_info))
                    article_info['article_content'] = temp
                    # show the article info
                    print('article_info: ' + json.dumps(article_info))
                    # return None
                    # insert into the article table
                    mysql.insert('article', article_info)
                    # link article and author
                    article_author_info['article_id'] = article_info['article_id']
                    article_author_info['author_id'] = generate_id(_4_info['c']['class_info']['lecturer_name'])
                    print('article_author_info: ' + json.dumps(article_author_info))
                    mysql.insert('article_author', article_author_info)
                    # link article and column
                    article_column_info['article_id'] = article_info['article_id']
                    article_column_info['column_id'] = generate_id(_4_info['c']['class_info']['name'])
                    print('article_column_info: ' + json.dumps(article_column_info))
                    mysql.insert('article_column', article_column_info)
                    # link article and source
                    article_source_info['article_id'] = article_info['article_id']
                    article_source_info['source_id'] = generate_id('https://www.igetget.com/')
                    print('article_source_info: ' + json.dumps(article_source_info))
                    mysql.insert('article_source', article_source_info)
                    # extra attributes
                    ext_info['article_id'] = article_info['article_id']
                    ext_info['attribute_name'] = 'prev_article_id'
                    ext_info['attribute_value'] = _4_info['c']['prev_article_id']
                    # insert into the ext attribute table
                    print('ext_info: ' + json.dumps(ext_info))
                    mysql.insert('ext_attribute', ext_info)
                    ext_info['attribute_name'] = 'next_article_id'
                    ext_info['attribute_value'] = _4_info['c']['next_article_id']
                    print('ext_info: ' + json.dumps(ext_info))
                    mysql.insert('ext_attribute', ext_info)
                    ext_info['attribute_name'] = 'cover_image'
                    ext_info['attribute_value'] = _4_info['c']['article_info']['logo']
                    print('ext_info: ' + json.dumps(ext_info))
                    mysql.insert('ext_attribute', ext_info)
                    ext_info['attribute_name'] = 'article_learn_count'
                    ext_info['attribute_value'] = _4_info['c']['article_info']['cur_learn_count']
                    print('ext_info: ' + json.dumps(ext_info))
                    mysql.insert('ext_attribute', ext_info)
                    ext_info['attribute_name'] = 'audio_url'
                    ext_info['attribute_value'] = _4_info['c']['article_info']['audio']['mp3_play_url']
                    print('ext_info: ' + json.dumps(ext_info))
                    mysql.insert('ext_attribute', ext_info)
                    # article_info['article_learn_count'] = _4_info['c']['article_info']['cur_learn_count']
                    # article_info['audio_url'] = _4_info['c']['article_info']['audio']['mp3_play_url']
                    # resolve the chapter name of the article
                    with open('temp_file/2.txt', 'rb') as f:
                        _2_info = pickle.load(f)
                    if _4_info['c']['class_info']['name'] == _2_info['c']['class_info']['name'] \
                            and _4_info['c']['article_info']['class_id'] == _2_info['c']['class_info']['id'] \
                            and _2_info['c']['class_info']['has_chapter'] == 1:
                        for adict in _2_info['c']['chapter_list']:
                            if _4_info['c']['article_info']['chapter_id'] == adict['id']:
                                ext_info['attribute_name'] = 'chapter_name'
                                ext_info['attribute_value'] = adict['name']
                                # insert into the ext attribute table
                                print('ext_info: ' + json.dumps(ext_info))
                                mysql.insert('ext_attribute', ext_info)
                    # article_info['create_time'] = content['data']['article']['CreateTime']
                    # article_info['update_time'] = content['data']['article']['UpdateTime']
                    # article_info['publish_time'] = content['data']['article']['PublishTime']
            except Exception as e:
                print('Article url entree.igetget.com/ddarticle/v1/article/get raised an exception')
                print(e)
            finally:
                mysql.close_connection()

        # comments for the article
        # https://entree.igetget.com/ledgers/notes/article_comment_list
        if 'igetget.com/ledgers/notes/article_comment_list' in flow.request.url:
            content = json.loads(flow.response.text)
            # if there are more comments, load what is stored, append, and store again
            if content['c']['isMore']:
                if not os.path.exists('temp_file/5.txt') or os.path.getsize('temp_file/5.txt') == 0:
                    with open('temp_file/5.txt', 'wb') as f:
                        pickle.dump(content, f)
                else:
                    print('size before:', os.path.getsize('temp_file/5.txt'))
                    with open('temp_file/5.txt', 'rb') as f:
                        temp = pickle.load(f)
                    with open('temp_file/5.txt', 'wb') as f:
                        f.truncate()
                    print('size in between:', os.path.getsize('temp_file/5.txt'))
                    temp['c']['list'] = temp['c']['list'] + content['c']['list']
                    with open('temp_file/5.txt', 'wb') as f:
                        pickle.dump(temp, f)
                    print('size after:', os.path.getsize('temp_file/5.txt'))
            # either the only page or the last page
            else:
                # read the accumulated pages from the file and store everything
                # in the database together with the article id
                request = flow.request
                print(request.content)
                bys = request.content
                s = '&' + bys.decode('utf-8') + '&'
                page_search = re.compile(r"&page=(.*?)&")
                page = re.search(page_search, s).group(1)
                page_count_search = re.compile(r"&page_count=(.*?)&")
                page_count = re.search(page_count_search, s).group(1)
                article_id_search = re.compile(r"&detail_id=(.*?)&")
                # article_id_search = re.compile(r"&article_id=(.*?)&")
                article_id = re.search(article_id_search, s).group(1)
                print(page, page_count, article_id)
                if os.path.getsize('temp_file/5.txt'):
                    print('last page')
                    with open('temp_file/5.txt', 'rb') as f:
                        temp = pickle.load(f)
                    temp['c']['list'] = temp['c']['list'] + content['c']['list']
                    print('total', temp['c']['total'], 'current', len(temp['c']['list']))
                    # temp will be parsed, assembled and stored in the database below
                    with open('temp_file/5.txt', 'wb') as f:
                        f.truncate()
                # only this single page
                else:
                    # store directly; the file does not need to be cleared
                    print('only one page')
                    print('total', content['c']['total'], 'current', len(content['c']['list']))
                    temp = content
                ext_info = {}
                ext_info['article_id'] = int(article_id)
                ext_info['attribute_name'] = 'comment'
                comment = {}
                # id of the article the comments belong to
                comment['article_id'] = article_id
                comment['per_page_count'] = page_count
                # list of comments
                comment['list'] = []
                # total number of comments
                comment['comment_total'] = temp['c']['total']
                if 0 != int(comment['comment_total']):
                    # extra comment info, including the article's column info, just in case
                    comment['extra'] = temp['c']['list'][0].get('extra')
                    for a_comment in temp['c']['list']:
                        comment_dict = {}
                        comment_dict['note_id'] = a_comment.get('note_id')
                        # comment body
                        comment_dict['note'] = a_comment.get('note')
                        comment_dict['content'] = a_comment.get('content')
                        comment_dict['note_title'] = a_comment.get('note_title')
                        # author's reply
                        comment_dict['note_line'] = a_comment.get('note_line')
                        # comment timestamps
                        comment_dict['comment_reply_time'] = a_comment.get('comment_reply_time')
                        comment_dict['create_time'] = a_comment.get('create_time')
                        comment_dict['update_time'] = a_comment.get('update_time')
                        # repost count
                        comment_dict['repost_count'] = a_comment['notes_count'].get('repost_count', 0)
                        # reply count
                        comment_dict['comment_count'] = a_comment['notes_count'].get('comment_count', 0)
                        # like count
                        comment_dict['like_count'] = a_comment['notes_count'].get('like_count', 0)
                        comment_dict['notes_owner'] = a_comment.get('notes_owner')
                        comment['list'].append(comment_dict)
                    # print(comment)
                else:
                    comment['list'] = []
                # keep at most the first 50 comments
                if len(comment['list']) > 50:
                    comment['list'] = comment['list'][:50]
                ext_info['attribute_value'] = json.dumps(comment, ensure_ascii=False)
                # print(len(json.dumps(comment)))
                # print(json.dumps(comment).encode('gb2312').decode('unicode_escape'))
                # print(len(json.dumps(comment).encode('gb2312').decode('unicode_escape')))
                # insert into the database
                mysql = MySQL()
                try:
                    mysql.get_connection()
                    # The current dedup check only looks at whether this article_id
                    # already has a 'comment' ext attribute; the stored content is
                    # never compared, only the presence of the attribute.
                    mysql.insert('ext_attribute', ext_info)
                finally:
                    mysql.close_connection()
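# The page/page_count/detail_id fields above are pulled out of the form body
# with hand-rolled regexes. A sketch of the same extraction with the stdlib,
# assuming the body is ordinary application/x-www-form-urlencoded data:
from urllib.parse import parse_qs

def comment_request_fields(body: bytes):
    fields = parse_qs(body.decode('utf-8'))
    # parse_qs maps each key to a list of values
    return fields['page'][0], fields['page_count'][0], fields['detail_id'][0]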
# -*- coding: utf-8 -*-
from handle_mysql import MySQL
import json
'''
Check the articles for invalid bodies and collect the offending article ids.
'''
mysql = MySQL()
try:
    mysql.get_connection()
    select_results = mysql.select('article', ['article_id', 'article_content'],
                                  'uploaded = 0')
    problem_list = []
    # type:"audio , audio:"
    # type:"image , src:"
    for result in select_results:
        try:
            contents = json.loads(result[1])
        except Exception:
            problem_list.append(result[0])
    if problem_list:
        print('Article ids with problems:', problem_list)
    else:
        print('No article has problems')
finally:
    mysql.close_connection()