Example 1
def craw_tag(tag_name, tag_id, write_sheet):
    time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    page = 1
    while True:
        url = XiGuaCrawler.page_url_format % (tag_id, page)
        r = requests.get(url, cookies=XiGuaCrawler.cookies_dict)
        serverid = r.cookies.get("SERVERID")
        XiGuaCrawler.cookies_dict["SERVERID"] = serverid
        if len(r.content) < 100:      # once there are no more pages, the response body is only 24 bytes
            break
        bs_obj = BeautifulSoup(r.content, "html.parser")
        article_tags = bs_obj.findAll("tr", attrs={"data-articleid": re.compile(".*")})
        for article_tag in article_tags:
            title_tag = article_tag.find("div", class_="mp-article-title")
            href_tag = title_tag.find("a")
            title = href_tag.get_text().replace("\n", "")
            link = href_tag.get("href")
            # The database keeps track of links already crawled: if a link is not in the
            # database, insert it and write it to the sheet; otherwise do neither.
            if DBUtil.select_data(XiGuaCrawler.select_sql_format % link) is None:
                DBUtil.insert_data(XiGuaCrawler.insert_sql_format % (link, time_str))
                source_tag = article_tag.find("div", class_="item-source")
                origin = source_tag.find("div", class_="item-title").get_text()
                rel_time = source_tag.find("div", class_="item-sub-title").get_text()
                standard_time = XiGuaCrawler.convert_time_to_standard_time(rel_time)
                XiGuaCrawler.line_count += 1
                XiGuaCrawler.write_sheet_write(write_sheet, XiGuaCrawler.line_count, title, link, "",
                                               tag_name, origin, rel_time, standard_time)
        page += 1
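craw_tag appears to be a method of the XiGuaCrawler class and relies on several class attributes and helpers that are not shown in the listing. Below is a minimal sketch of what that class might provide, assuming placeholder values throughout: the URL template, the SQL strings, and both helper bodies are guesses made only so the method above can be read in isolation.

import datetime

class XiGuaCrawler:
    # All values below are hypothetical placeholders; the real class is not shown in this example.
    page_url_format = "https://example.com/tag/%s/page/%s"             # assumed URL template
    select_sql_format = "SELECT id FROM t_article WHERE url = \"%s\""  # assumed lookup query
    insert_sql_format = "INSERT INTO t_article (url, create_time) VALUES (\"%s\", \"%s\")"  # assumed insert
    cookies_dict = {"SERVERID": ""}   # refreshed from the response cookies on every request
    line_count = 0                    # running row index for the output sheet

    @staticmethod
    def convert_time_to_standard_time(rel_time):
        # Placeholder: the real helper converts a relative time string into "%Y-%m-%d %H:%M:%S".
        return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    @staticmethod
    def write_sheet_write(write_sheet, row, *values):
        # Placeholder: write one value per column into the given row of the sheet.
        for col, value in enumerate(values):
            write_sheet.write(row, col, value)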
Example 2
def reset_image_url():
    select_sql_format = "SELECT id FROM t_article WHERE url = \"%s\""
    update_sql_format = "UPDATE t_article SET image_url = \"%s\" WHERE id = %s"
    with open("/home/jfqiao/result_2018-03-21_22-37.csv", "r") as f:
        for line in f:
            # Each CSV line holds an article URL followed by its image URL.
            strs = line.replace("\n", "").split(",")
            select_sql = select_sql_format % strs[0]
            select_result = DBUtil.select_data(select_sql)
            if select_result:
                update_sql = update_sql_format % (strs[1], select_result["id"])
                DBUtil.insert_data(update_sql)
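All three examples call a shared DBUtil helper whose implementation is not shown. The sketch below assumes a MySQL backend accessed through pymysql, with select_data returning a single row as a dict (or None when nothing matches) and insert_data executing and committing a write statement; the connection settings are placeholders.

import pymysql.cursors

class DBUtil:
    # Hypothetical connection settings; the real DBUtil is not shown in these examples.
    _conn = pymysql.connect(host="localhost", user="root", password="secret",
                            database="crawler", charset="utf8mb4",
                            cursorclass=pymysql.cursors.DictCursor)

    @classmethod
    def select_data(cls, sql):
        # Return the first matching row as a dict, or None when there is no match.
        with cls._conn.cursor() as cursor:
            cursor.execute(sql)
            return cursor.fetchone()

    @classmethod
    def insert_data(cls, sql):
        # Execute an INSERT/UPDATE statement and commit it.
        with cls._conn.cursor() as cursor:
            cursor.execute(sql)
        cls._conn.commit()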
Example 3
def insert_url(url):
    # Store the URL in the database together with the current timestamp.
    date = datetime.datetime.now()
    date_str = date.strftime("%Y-%m-%d %H:%M:%S")
    sql = Crawler.insert_sql % (url, date_str)
    DBUtil.insert_data(sql)
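Crawler.insert_sql is defined outside this excerpt. A plausible shape, assuming the same t_article table as Example 2 (the statement and column names are guesses), plus a hypothetical call:

class Crawler:
    # Assumed template; the real Crawler class is not shown in this excerpt.
    insert_sql = "INSERT INTO t_article (url, create_time) VALUES (\"%s\", \"%s\")"

insert_url("https://example.com/article/123")   # hypothetical usage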