def craw_tag(tag_name, tag_id, write_sheet):
    """Crawl every result page for one tag and record new articles.

    Pages are fetched until the server returns a near-empty body (the
    end-of-pagination response is ~24 bytes). Each article not yet in the
    database is inserted and appended to *write_sheet*.

    :param tag_name: human-readable tag label written into the sheet
    :param tag_id: numeric tag id interpolated into the page URL
    :param write_sheet: sheet object passed to XiGuaCrawler.write_sheet_write
    """
    time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    page = 1
    while True:
        url = XiGuaCrawler.page_url_format % (tag_id, page)
        r = requests.get(url, cookies=XiGuaCrawler.cookies_dict)
        # Keep the session sticky: echo back the SERVERID cookie on the
        # next request so subsequent pages hit the same backend.
        serverid = r.cookies.get("SERVERID")
        XiGuaCrawler.cookies_dict["SERVERID"] = serverid
        # Past the last page the server answers with a ~24-byte body.
        if len(r.content) < 100:
            break
        bs_obj = BeautifulSoup(r.content, "html.parser")
        article_tags = bs_obj.find_all("tr", attrs={"data-articleid": re.compile(".*")})
        for article_tag in article_tags:
            title_tag = article_tag.find("div", class_="mp-article-title")
            if title_tag is None:
                continue  # malformed row — skip instead of crashing the crawl
            href_tag = title_tag.find("a")
            if href_tag is None:
                continue
            title = href_tag.get_text().replace("\n", "")
            link = href_tag.get("href")
            # Dedup via the database: only a link not seen before is
            # inserted and written to the sheet.
            # SECURITY NOTE: the SQL is built by %-interpolation from
            # scraped data — switch to parameterized queries in DBUtil.
            if DBUtil.select_data(XiGuaCrawler.select_sql_format % link) is None:
                DBUtil.insert_data(XiGuaCrawler.insert_sql_format % (link, time_str))
                source_tag = article_tag.find("div", class_="item-source")
                origin = source_tag.find("div", class_="item-title").get_text()
                rel_time = source_tag.find("div", class_="item-sub-title").get_text()
                standard_time = XiGuaCrawler.convert_time_to_standard_time(rel_time)
                XiGuaCrawler.line_count += 1
                XiGuaCrawler.write_sheet_write(write_sheet, XiGuaCrawler.line_count,
                                               title, link, "", tag_name, origin,
                                               rel_time, standard_time)
        page += 1
def reset_image_url():
    """Backfill t_article.image_url from a CSV export of (url, image_url) rows.

    Reads the hard-coded CSV line by line; for each row whose URL already
    exists in t_article, updates that record's image_url. Rows with no
    matching article are skipped silently.
    """
    select_sql_format = "SELECT id FROM t_article WHERE url = \"%s\""
    # NOTE: this is an UPDATE statement; the original mislabeled it "insert".
    update_sql_format = "UPDATE t_article SET image_url = \"%s\" WHERE id = %s"
    # `with` guarantees the file is closed even if a DB call raises
    # (the original leaked the handle on error).
    with open("/home/jfqiao/result_2018-03-21_22-37.csv", "r") as f:
        for line in f:
            strs = line.replace("\n", "").split(",")
            # strs[0] = article url, strs[1] = image url
            # SECURITY NOTE: %-interpolated SQL — prefer parameterized
            # queries in DBUtil.
            select_result = DBUtil.select_data(select_sql_format % strs[0])
            if select_result:
                DBUtil.insert_data(update_sql_format % (strs[1], select_result["id"]))
def insert_url(url):
    """Store *url* in the database stamped with the current local time."""
    now_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    DBUtil.insert_data(Crawler.insert_sql % (url, now_str))