import datetime
import os
import re

import requests
from bs4 import BeautifulSoup

# XiGuaCrawler, DBUtil and Crawler are assumed to be imported elsewhere in
# this project; their definitions are not part of this module.


def craw_tag(tag_name, tag_id, write_sheet):
    """Crawl every result page of a tag and record articles not seen before."""
    time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    page = 1
    while True:
        url = XiGuaCrawler.page_url_format % (tag_id, page)
        r = requests.get(url, cookies=XiGuaCrawler.cookies_dict)
        # The server rotates SERVERID on each response; carry it forward so
        # the session stays valid.
        serverid = r.cookies.get("SERVERID")
        XiGuaCrawler.cookies_dict["SERVERID"] = serverid
        if len(r.content) < 100:
            # Once the page number runs past the last page, the response body
            # shrinks to 24 bytes, so any tiny body marks the end.
            break
        bs_obj = BeautifulSoup(r.content, "html.parser")
        article_tags = bs_obj.find_all(
            "tr", attrs={"data-articleid": re.compile(".*")})
        for article_tag in article_tags:
            title_tag = article_tag.find("div", class_="mp-article-title")
            href_tag = title_tag.find("a")
            title = href_tag.get_text().replace("\n", "")
            link = href_tag.get("href")
            # Deduplicate through the database: if the link is not stored yet,
            # insert it and write the row out; otherwise skip both steps.
            if DBUtil.select_data(XiGuaCrawler.select_sql_format % link) is None:
                DBUtil.insert_data(XiGuaCrawler.insert_sql_format % (link, time_str))
                source_tag = article_tag.find("div", class_="item-source")
                origin = source_tag.find("div", class_="item-title").get_text()
                rel_time = source_tag.find("div", class_="item-sub-title").get_text()
                standard_time = XiGuaCrawler.convert_time_to_standard_time(rel_time)
                XiGuaCrawler.line_count += 1
                XiGuaCrawler.write_sheet_write(
                    write_sheet, XiGuaCrawler.line_count, title, link, "",
                    tag_name, origin, rel_time, standard_time)
        page += 1
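
# A minimal driver sketch for craw_tag, assuming XiGuaCrawler exposes the
# class attributes used above and that write_sheet is an xlwt worksheet
# (a guess based on the write_sheet_write call). The workbook path and the
# (tag_name, tag_id) pair below are hypothetical placeholders.
def demo_craw_tag():
    import xlwt  # assumption: the sheet writer is xlwt-based

    workbook = xlwt.Workbook(encoding="utf-8")
    sheet = workbook.add_sheet("articles")
    craw_tag("demo_tag", 42, sheet)  # hypothetical tag name and id
    workbook.save("/tmp/xigua_articles.xls")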
def move_article_and_image():
    """Copy the cached article and image files for every tracked article."""
    article_id_sql = "SELECT DISTINCT article_id FROM t_article_status"
    # Alias id as article_id so both result sets are read with the same key
    # in the loop below (the original SELECT id would raise a KeyError).
    target_school_article_sql = ('SELECT id AS article_id FROM t_article '
                                 'WHERE origin = "中国政法大学"')
    result = DBUtil.select_datas(article_id_sql)
    target_result = DBUtil.select_datas(target_school_article_sql)
    for item in target_result:
        result.append(item)
    image_path = "/data/who_focus/image/"
    article_path = "/home/jfqiao/wechat_articles/"
    for item in result:
        sql = "SELECT url, image_url FROM t_article WHERE id=%s" % item["article_id"]
        article = DBUtil.select_data(sql)
        # Local article files are named after the URL with "/" and ":" removed.
        article_file_name = article["url"].replace("/", "").replace(":", "")
        os.system('cp "%s" /home/jfqiao/tmp_article/'
                  % (article_path + article_file_name))
        os.system('cp "%s" /home/jfqiao/tmp_article/'
                  % (article_path + article_file_name + "_abstract"))
        # Images still pointing at mmbiz.qpic were never downloaded locally.
        if (article["image_url"].startswith("https://mmbiz.qpic")
                or article["image_url"].startswith("http://mmbiz.qpic")):
            continue
        # Strip the query string before deriving the local image file name.
        pos = article["image_url"].find("?")
        if pos == -1:
            pos = len(article["image_url"])
        image_file_name = article["image_url"][:pos].replace(":", "").replace("/", "")
        os.system('cp "%s" /home/jfqiao/tmp_image/'
                  % (image_path + image_file_name))
        os.system('cp "%s" /home/jfqiao/tmp_image/'
                  % (image_path + image_file_name + ".txt"))
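
# The copies above shell out to cp, which fails silently and breaks on file
# names containing a double quote. A sketch of the same copy step using
# shutil instead of a shell, under the assumption that a missing source file
# should be reported and skipped rather than abort the whole batch:
import shutil


def safe_copy(src, dst_dir):
    """Copy src into dst_dir without invoking a shell; skip missing files."""
    try:
        shutil.copy(src, dst_dir)
    except FileNotFoundError:
        print("missing file, skipped: %s" % src)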
def reset_image_url():
    """Re-apply image URLs from an exported CSV of (url, image_url) rows."""
    select_sql_format = 'SELECT id FROM t_article WHERE url = "%s"'
    update_sql_format = 'UPDATE t_article SET image_url = "%s" WHERE id = %s'
    with open("/home/jfqiao/result_2018-03-21_22-37.csv", "r") as f:
        for line in f:
            if not line.strip():
                continue
            strs = line.replace("\n", "").split(",")
            select_result = DBUtil.select_data(select_sql_format % strs[0])
            if select_result:
                # DBUtil.insert_data executes the write; the statement here
                # is an UPDATE rather than an INSERT.
                DBUtil.insert_data(update_sql_format % (strs[1], select_result["id"]))
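
# reset_image_url interpolates values straight into SQL, which breaks as soon
# as a URL contains a double quote. A sketch of the same loop with DB-API
# placeholders; it assumes conn is a PEP 249 connection (e.g. pymysql), a
# stand-in for however DBUtil actually talks to the database:
def reset_image_url_parameterized(conn):
    cursor = conn.cursor()
    with open("/home/jfqiao/result_2018-03-21_22-37.csv", "r") as f:
        for line in f:
            if not line.strip():
                continue
            url, image_url = line.rstrip("\n").split(",")[:2]
            cursor.execute("SELECT id FROM t_article WHERE url = %s", (url,))
            row = cursor.fetchone()
            if row:
                cursor.execute(
                    "UPDATE t_article SET image_url = %s WHERE id = %s",
                    (image_url, row[0]))
    conn.commit()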
def select_url(url):
    """Look up a URL in the database; returns None when it is not stored."""
    sql = Crawler.select_sql % url
    return DBUtil.select_data(sql)
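
# Because select_url returns None for links that were never stored, it can
# serve as the dedup check before crawling. A small usage sketch; the helper
# name is hypothetical:
def is_new_link(url):
    """True when the URL has not been recorded in t_article yet."""
    return select_url(url) is None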