# -*- coding: utf-8 -*-
import time

import requests
from lxml import etree

from public.mysqlpooldao import MysqlDao
from public.headers import Headers
# Proxies is assumed to live under public/ like the other shared helpers.
from public.proxies import Proxies


def doIt(author, url):
    mysql_dao = MysqlDao()
    headers = Headers.get_headers()
    proxies = Proxies.get_proxies()
    try:
        html = requests.get(url, headers=headers, timeout=30,
                            proxies=proxies).content
        selector = etree.HTML(html)
        titles = selector.xpath('//h3/a[1]/text()')
        urls = selector.xpath('//h3/a[1]/@href')
        imgs = selector.xpath(
            '//div[@class="list_image"]/ul[1]/li[1]/a[1]/img[1]/@src')
        category_id = 0
        print(urls)
        # Insert one row per article URL found on this listing page.
        for url2, img_main in zip(urls, imgs):
            created_at = time.strftime('%Y-%m-%d %H:%M:%S')
            insert_value = '"' + str(category_id) + '","' + url2 + '","' + \
                img_main + '","' + author + '",0,"' + created_at + '"'
            sql = 'insert ignore into zmt_toutiao_url (`category_id`,`url`,`img_main`,`author`,`status`,`created_at`) values (' + insert_value + ')'
            print(sql)
            mysql_dao.execute(sql)
    except Exception as e:
        print(e)
    try:
        # Pagination: if the last link in the page bar reads "下一页"
        # ("next page"), recurse into it.
        next_name = selector.xpath('//*[@id="pagebar"]/a[last()]/text()')
        if len(next_name) > 0:
            if u'下一页' in next_name[0]:
                next_url = selector.xpath(
                    '//*[@id="pagebar"]/a[last()]/@href')[0]
                doIt(author, next_url)
    except Exception as e:
        print(e)
import threading
import sys

from public.city import City
from public.mysqlpooldao import MysqlDao
from public.redispooldao import RedisDao
import pymysql
import traceback
pymysql.install_as_MySQLdb()
from lxml import etree
import requests
import simplejson

city_list = City.city_list
mysql_dao = MysqlDao()
redis_dao = RedisDao()
redis_key = 'gaode:20170209_gaode_dianping_sectionl'


def get_singlepage_info(base_city_name, target_url, lastpage):
    # if lastpage > 1:
    # Walk the listing backwards from the last page down to page 1,
    # rebuilding the URL around its "page=" query parameter.
    target1 = target_url.split('page=')[0]
    target2 = target_url.split('page=')[1]
    target3 = target2.split('&')[1]
    page = lastpage
    while page > 0:
        true_url = target1 + "page=" + str(page) + "&" + target3
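The split('page=') surgery above assumes exactly one extra query parameter after page=, and silently drops anything beyond the first '&'. A minimal sketch of a more robust rebuild using only the Python 2 standard library, assuming the page number lives in the query string:

from urlparse import urlparse, parse_qs, urlunparse
from urllib import urlencode


def with_page(target_url, page):
    # Replace (or add) the page= parameter, keeping all other
    # query parameters intact.
    parts = urlparse(target_url)
    query = parse_qs(parts.query)
    query['page'] = [str(page)]
    new_query = urlencode(query, doseq=True)
    return urlunparse((parts.scheme, parts.netloc, parts.path,
                       parts.params, new_query, parts.fragment))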
# -*- coding: utf-8 -*-
import sys
import simplejson
from public.mysqlpooldao import MysqlDao
from public.redispooldao import RedisDao

redis_key = 'dianpingtest:20170104_dianping_shop_list_url'
mysql_dao = MysqlDao()
redis_dao = RedisDao()
reload(sys)
sys.setdefaultencoding('utf-8')

if __name__ == '__main__':
    # Push every unprocessed shop-list URL row onto the Redis queue.
    sql = 'SELECT * FROM `20170104_dianping_shop_list_url` WHERE `status`=0'
    district_lists = mysql_dao.execute(sql)
    for district_list in district_lists:
        district_list_json = simplejson.dumps(district_list)
        redis_dao.rpush(redis_key, district_list_json)
        print district_list_json
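A minimal sketch of the consuming side of this queue, assuming RedisDao.lpop returns None once the list is empty (the behavior the worker code elsewhere in this section relies on):

import simplejson
from public.redispooldao import RedisDao

redis_dao = RedisDao()
redis_key = 'dianpingtest:20170104_dianping_shop_list_url'

while True:
    row_json = redis_dao.lpop(redis_key)
    if row_json is None:
        break  # queue drained
    # row is the original SELECT * tuple; process it here.
    row = simplejson.loads(row_json)
    print row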
if __name__ == '__main__':
    mysql_dao = MysqlDao()
    while True:
        # Fetch one unprocessed author URL at a time.
        sql = 'select * from zmt_toutiaohao_url WHERE `time`=0 limit 0,1'
        ret = mysql_dao.execute(sql)
        if len(ret) == 0:
            break
        res = ret[0]
        id = res[0]
        author = res[1]
        url = res[2]
        # Mark the row as taken before crawling; without this the loop
        # would refetch the same row forever.
        sql = 'update zmt_toutiaohao_url set `time`=1 where `id`=' + str(id)
        res = mysql_dao.execute(sql)
        doIt(author, url)
    mysql_dao.close()
    print('game over')
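The scripts above splice scraped strings directly into SQL, which is why they strip '"' by hand and remain injection-prone. A sketch of the same insert with parameterized queries, going through pymysql directly since MysqlDao's execute() appears to take only a raw SQL string; the connection parameters here are placeholders, not the repo's real config:

import pymysql


def insert_url_row(category_id, url2, img_main, author, created_at):
    # Placeholder connection settings; substitute the real ones.
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='',
                           db='zmt', charset='utf8')
    try:
        cur = conn.cursor()
        sql = ('insert ignore into zmt_toutiao_url '
               '(`category_id`,`url`,`img_main`,`author`,`status`,`created_at`) '
               'values (%s,%s,%s,%s,%s,%s)')
        # The driver escapes each value, so quotes in scraped text are safe.
        cur.execute(sql, (category_id, url2, img_main, author, 0, created_at))
        conn.commit()
    finally:
        conn.close()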
# -*- coding: utf-8 -*-
import sys
import re

import requests
import simplejson
from lxml import etree

from public.mysqlpooldao import MysqlDao
from public.headers import Headers
from public.redispooldao import RedisDao

reload(sys)
sys.setdefaultencoding('utf-8')

mysql_dao = MysqlDao()


def get_download_info(subject, url, category_name):
    # url = 'http://cn163.net/archives/2876/'
    # url = 'http://cn163.net/archives/23749/'
    headers = Headers.get_headers()
    res = requests.get(url, headers=headers)
    wb_data = res.content
    selector = etree.HTML(wb_data)
    lines = selector.xpath('//div[@class="entry"]/div[@id="entry"]/p')
    # print len(lines)
    if len(lines) > 1:
        line1 = lines[1]
        # Serialize the paragraph and split on <strong> tags.
        xx1 = etree.tostring(line1)
        xx = xx1.split('<strong>')
        if len(xx) % 2 != 0:
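Splitting serialized HTML on '<strong>' is brittle. A sketch of the same label/link pairing done on the element tree instead, under the assumption that each <strong> heading in the paragraph is followed by its download links as sibling <a> tags:

def pair_labels_with_links(entry_paragraph):
    # entry_paragraph is one of the <p> elements from the entry div.
    pairs = []
    strongs = entry_paragraph.xpath('./strong')
    for idx, strong in enumerate(strongs, 1):
        label = u''.join(strong.itertext())
        # <a> siblings whose nearest preceding <strong> is this one.
        hrefs = entry_paragraph.xpath(
            './a[count(preceding-sibling::strong)=%d]/@href' % idx)
        pairs.append((label, hrefs))
    return pairs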
import sys
import os
import datetime

import requests
from lxml import etree
import simplejson
import traceback

from public.mysqlpooldao import MysqlDao
from public.headers import Headers
from public.redispooldao import RedisDao
from public.hero import Hero

reload(sys)
sys.setdefaultencoding('utf-8')

file_path = os.path.dirname(os.path.abspath(__file__))
mysql_dao = MysqlDao()
redis_dao = RedisDao()
hero = Hero(file_path)
redis_key = 'dianpingtest:20170104_dianping_shop_list_url'


def get_last_page(url):
    # Default to a single page if the pagination bar cannot be read.
    last_page = 1
    try:
        headers = Headers.get_headers()
        req = requests.get(url, headers=headers, timeout=5)
        if req.status_code == 200:
            html = req.content
            selector = etree.HTML(html)
            last_pages = selector.xpath(
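Each fetch here gets a single attempt with a five-second timeout, so transient failures fall straight into the except. A small retry wrapper sketch; the name and parameters are hypothetical, not part of the repo's public helpers:

import time
import requests


def fetch_with_retry(url, headers, retries=3, timeout=5, proxies=None):
    # Try a few times before giving up, sleeping briefly between attempts.
    for attempt in range(retries):
        try:
            req = requests.get(url, headers=headers,
                               timeout=timeout, proxies=proxies)
            if req.status_code == 200:
                return req.content
        except requests.RequestException:
            pass
        time.sleep(1)
    return None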
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import requests
import re
import time
import json
from lxml import etree
import pymysql
from public.mysqlpooldao import MysqlDao

mysql_dao = MysqlDao()


def get_normalnews(objid, news_url, title, picture_if):
    res = requests.get(news_url)
    req = res.content
    selector = etree.HTML(req)
    # Image links inside centered paragraphs of the article body.
    picture_lists = selector.xpath(
        '//div[@class="area article"]/div[@class="article-content"]/p[@align="center"]/descendant::a[@target="_blank"]/img/@src'
    )
    if picture_lists:
        for pictures in picture_lists:
            picture_link = pictures
            sql1 = ('INSERT IGNORE INTO `xcar_news_post_picture_20170505`'
                    '(`objid`,`news_url`,`title`,`picture_link`)'
                    'VALUES ("%s","%s","%s","%s")') % (objid, news_url, title,
                                                       picture_link)
            print sql1
            mysql_dao.execute(sql1)
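Inserting one picture per statement round-trips to MySQL once per image. A batched variant sketch using pymysql's executemany directly, an assumption since MysqlDao's interface only shows execute():

import pymysql


def insert_pictures(conn, objid, news_url, title, picture_lists):
    sql = ('INSERT IGNORE INTO `xcar_news_post_picture_20170505` '
           '(`objid`,`news_url`,`title`,`picture_link`) '
           'VALUES (%s,%s,%s,%s)')
    rows = [(objid, news_url, title, link) for link in picture_lists]
    cur = conn.cursor()
    cur.executemany(sql, rows)  # one round trip, driver-escaped values
    conn.commit()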
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import simplejson
from public.mysqlpooldao import MysqlDao
from public.redispooldao import RedisDao

redis_key = 'gaode:20170209_gaode_dianping_sectionl'
mysql_dao = MysqlDao()
redis_dao = RedisDao()

if __name__ == '__main__':
    # Queue every unprocessed map-section row for the crawler workers.
    sql = 'SELECT * FROM `a_gaode_section_longitude_latitude` WHERE `status`=0'
    section_lists = mysql_dao.execute(sql)
    # print section_lists
    for section_list in section_lists:
        section_list_json = simplejson.dumps(section_list)
        redis_dao.rpush(redis_key, section_list_json)
        print section_list_json
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import pymysql
import requests
from lxml import etree
import re
import time
import json
from public.mysqlpooldao import MysqlDao

mysql_dao = MysqlDao()

# headers = {
#     'Accept': 'text/css,*/*;q=0.1',
#     'Accept-Encoding': 'gzip, deflate, sdch',
#     'Accept-Language': 'zh-CN,zh;q=0.8',
#     'Cache-Control': 'max-age=0',
#     'Connection': 'keep-alive',
#     'Host': 'x.autoimg.cn',
#     'If-Modified-Since': 'Mon, 01 May 2017 02:56:38 GMT',
#     'If-None-Match': '636292329983254104',
#     'Referer': 'http://www.autohome.com.cn/use/1/',
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
# }


def get_newsPostInfo(objid, url, lastpage, title, publish_date):
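The repo-wide Headers.get_headers() helper is used throughout this section but its source is not shown. A plausible minimal sketch, entirely an assumption about its behavior (a rotating User-Agent picked from a fixed pool, with the UA string borrowed from the commented headers above):

import random


class Headers(object):
    # Hypothetical reconstruction; the real public/headers.py is not shown.
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    ]

    @staticmethod
    def get_headers():
        return {
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'User-Agent': random.choice(Headers.USER_AGENTS),
        }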
def run(self):
    mysql_dao = MysqlDao()
    redis_dao = RedisDao()
    while True:
        print(self.getName())
        # Pop one article job from today's queue; stop when drained.
        date = time.strftime('%Y%m%d')
        data_json = redis_dao.lpop('queue:toutiao_%s' % date)
        if data_json is None:
            break
        data = simplejson.loads(data_json)
        category_id = data['category_id']
        url = data['url']
        img_main = data['img_main']
        author = data['author']
        try:
            headers = Headers.get_headers()
            proxies = Proxies.get_proxies()
            html = requests.get(url, headers=headers, timeout=30,
                                proxies=proxies).content
            selector = etree.HTML(html)
            status = selector.xpath('//*[@id="aboutus"]/div[1]/span[1]/text()')
            if len(status) > 0 and u'今日头条' in status[0]:
                category_names = selector.xpath('//div[@class="curpos"]/a[2]/text()')
                if len(category_names) != 0:
                    category_name = category_names[0]
                    # Skip the category only when it mentions both
                    # 图片 (images) and 视频 (video).
                    if not (u'图片' in category_name and u'视频' in category_name):
                        if category_id != 0:
                            # Harvest toutiaohao author links for later crawling.
                            toutiaohao_authors = selector.xpath('//*[contains(@class,"gc_name")]/text()')
                            toutiaohao_urls = selector.xpath('//*[contains(@class,"gc_name")]/@href')
                            try:
                                toutiaohao_num = 0
                                for toutiaohao_url in toutiaohao_urls:
                                    toutiaohao_sql = 'insert ignore into zmt_toutiaohao_url (`author`,`url`) values ("' + \
                                        toutiaohao_authors[toutiaohao_num] + '","' + \
                                        toutiaohao_urls[toutiaohao_num] + '")'
                                    toutiaohao_num = toutiaohao_num + 1
                                    mysql_dao.execute(toutiaohao_sql)
                            except Exception as e:
                                print(e)
                        title = selector.xpath('//*[@class="title"]/text()')
                        if len(title) > 0:
                            title_t = title[0].replace('"', '')
                        else:
                            title_t = ''
                        content = selector.xpath('//*[@class="article-content"]/descendant::text()')
                        img = selector.xpath('//img[@onerror="javascript:errorimg.call(this);"]/@src')
                        # Join text nodes and image URLs with sentinel markers
                        # so they can be split apart again downstream.
                        content_str = ''
                        img_str = ''
                        for c in content:
                            content_str = content_str + '{ycontent}' + c.replace('"', '')
                        for img_i in img:
                            img_str = img_str + '{yimg}' + img_i.replace('"', '')
                        time_now = time.strftime('%Y-%m-%d %H:%M:%S')
                        time_ts = selector.xpath('//*[@class="time"]/text()')
                        if len(time_ts) > 0:
                            time_t = time_ts[0].replace('"', '')
                        else:
                            time_t = ''
                        insert_value = '"' + str(category_id) + '","' + title_t + '","' + \
                            content_str + '","' + url + '","' + img_main + '","","' + \
                            img_str + '","","' + author + '","' + time_t + '","' + \
                            time_now + '","' + time_now + '"'
                        sql = 'insert ignore into zmt_content (`category_id`,`title`,`content`,`url`,`img_main`,`img_main_oss`,`img`,`img_oss`,`author`,`time`,`created_at`,`updated_at`) values (' + insert_value + ')'
                        print(sql)
                        if content_str != '':
                            mysql_dao.execute(sql)
        except Exception as e:
            print(e)
    mysql_dao.close()
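Downstream consumers can recover the article text and image lists by splitting on the same sentinel markers; a minimal sketch:

def split_content(content_str, img_str):
    # Every piece was prefixed with its sentinel, so the first split
    # element is always empty; drop it.
    paragraphs = content_str.split('{ycontent}')[1:]
    images = img_str.split('{yimg}')[1:]
    return paragraphs, images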
def get_keywords():
    # Pull the name and credit columns from the bttiantang content table.
    mysql_dao = MysqlDao()
    sql = ('select `names_chn`,`names_eng`,`names_nick`,`directors`,'
           '`writers`,`casts` FROM bttiantang_content')
    res = mysql_dao.execute(sql)
    return res
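A usage sketch, assuming execute() returns rows as tuples in SELECT column order:

keywords = set()
for row in get_keywords():
    for field in row:
        if field:
            keywords.add(field.strip())
print(len(keywords))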