Example #1
def doIt(author, url):
    mysql_dao = MysqlDao()
    headers = Headers.get_headers()
    proxies = Proxies.get_proxies()
    try:
        html = requests.get(url, headers=headers, timeout=30,
                            proxies=proxies).content
        selector = etree.HTML(html)
        titles = selector.xpath('//h3/a[1]/text()')
        urls = selector.xpath('//h3/a[1]/@href')
        imgs = selector.xpath(
            '//div[@class="list_image"]/ul[1]/li[1]/a[1]/img[1]/@src')
        next_name = selector.xpath('//*[@id="pagebar"]/a[last()]/text()')
        next_url = selector.xpath('//*[@id="pagebar"]/a[last()]/@href')
        category_id = 0
        i = 0
        print(urls)
        while True:
            if i >= len(urls):
                break
            url2 = urls[i]
            img_main = imgs[i]
            created_at = time.strftime('%Y-%m-%d %H:%M:%S')
            insert_value = '"' + str(
                category_id
            ) + '","' + url2 + '","' + img_main + '","' + author + '",0,"' + created_at + '"'
            sql = 'insert ignore into zmt_toutiao_url (`category_id`,`url`,`img_main`,`author`,`status`,`created_at`) values (' + insert_value + ')'
            print(sql)
            mysql_dao.execute(sql)
            i = i + 1
    except Exception as e:
        print(e)
    try:
        # pagination: follow the "next page" link
        next_name = selector.xpath('//*[@id="pagebar"]/a[last()]/text()')
        if len(next_name) > 0:
            if u'下一页' in next_name[0]:
                next_url = selector.xpath(
                    '//*[@id="pagebar"]/a[last()]/@href')[0]
                doIt(author, next_url)
    except Exception as e:
        print(e)
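
The insert in Example #1 builds its SQL by concatenating quoted strings, which breaks as soon as a URL or author name contains a double quote. Below is a minimal sketch of the same insert with pymysql parameter binding; the connection settings and the helper name insert_toutiao_url are placeholders, not part of the original project, and the project's MysqlDao pool is bypassed here only for illustration.

# Hypothetical sketch: the zmt_toutiao_url insert from Example #1, but with
# pymysql parameter binding instead of manual string concatenation.
# Connection settings are placeholders.
import time
import pymysql

def insert_toutiao_url(conn, category_id, url, img_main, author):
    created_at = time.strftime('%Y-%m-%d %H:%M:%S')
    sql = ('insert ignore into zmt_toutiao_url '
           '(`category_id`,`url`,`img_main`,`author`,`status`,`created_at`) '
           'values (%s, %s, %s, %s, %s, %s)')
    with conn.cursor() as cursor:
        # the driver escapes every parameter, so quotes in URLs or author
        # names cannot break the statement
        cursor.execute(sql, (category_id, url, img_main, author, 0, created_at))
    conn.commit()

conn = pymysql.connect(host='127.0.0.1', user='root', password='',
                       db='spider', charset='utf8mb4')
insert_toutiao_url(conn, 0, 'http://example.com/article/1',
                   'http://example.com/img/1.jpg', 'some_author')
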
Example #2
import threading

import sys
from public.city import City
from public.mysqlpooldao import MysqlDao
from public.redispooldao import RedisDao
import pymysql
import traceback

pymysql.install_as_MySQLdb()
from lxml import etree
import requests
import simplejson

city_list = City.city_list
mysql_dao = MysqlDao()
redis_dao = RedisDao()

redis_key = 'gaode:20170209_gaode_dianping_sectionl'


def get_singlepage_info(base_city_name, target_url, lastpage):
    # if lastpage > 1:
    target1 = target_url.split('page=')[0]
    target2 = target_url.split('page=')[1]
    target3 = target2.split('&')[1]
    page = lastpage
    while True:
        if page <= 0:
            break
        true_url = target1 + "page=" + str(page) + "&" + target3
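
For reference, the URL rebuilding in get_singlepage_info above can be traced with a made-up URL of the same shape; the host and query parameters below are placeholders, not taken from the original crawl.

# Illustration only: a listing URL with a page= parameter, rebuilt page by
# page the same way get_singlepage_info does.
target_url = 'http://example.com/list?page=5&city=beijing'
target1 = target_url.split('page=')[0]                # 'http://example.com/list?'
target3 = target_url.split('page=')[1].split('&')[1]  # 'city=beijing'
for page in range(3, 0, -1):
    true_url = target1 + 'page=' + str(page) + '&' + target3
    print(true_url)  # ...?page=3&city=beijing, then page=2, then page=1
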
Example #3
#-*- coding:utf-8 -*-

import sys
import simplejson
from public.mysqlpooldao import MysqlDao
from public.redispooldao import RedisDao

redis_key = 'dianpingtest:20170104_dianping_shop_list_url'
mysql_dao = MysqlDao()
redis_dao = RedisDao()

reload(sys)
sys.setdefaultencoding('utf-8')

if __name__ == '__main__':
    sql = 'SELECT * FROM `20170104_dianping_shop_list_url` WHERE `status`=0'
    district_lists = mysql_dao.execute(sql)
    for district_list in district_lists:
        district_list_json = simplejson.dumps(district_list)
        redis_dao.rpush(redis_key, district_list_json)
        print district_list_json
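
Example #3 only fills the Redis list; a matching consumer would drain it with lpop, the same call Example #10 uses. A minimal sketch follows, assuming RedisDao.lpop returns None once the list is empty (as Example #10's loop implies); process_row is a hypothetical placeholder for the real crawling step.

# Minimal consumer sketch for the queue filled in Example #3.
import simplejson
from public.redispooldao import RedisDao

redis_key = 'dianpingtest:20170104_dianping_shop_list_url'
redis_dao = RedisDao()

def process_row(row):
    # placeholder: the real worker would request and parse the shop list page
    print(row)

while True:
    district_list_json = redis_dao.lpop(redis_key)
    if district_list_json is None:
        break  # queue drained
    process_row(simplejson.loads(district_list_json))
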
Example #4
        print(e)
    try:
        # pagination: follow the "next page" link
        next_name = selector.xpath('//*[@id="pagebar"]/a[last()]/text()')
        if len(next_name) > 0:
            if u'下一页' in next_name[0]:
                next_url = selector.xpath(
                    '//*[@id="pagebar"]/a[last()]/@href')[0]
                doIt(author, next_url)
    except Exception as e:
        print(e)


if __name__ == '__main__':
    mysql_dao = MysqlDao()
    while True:
        sql = 'select * from zmt_toutiaohao_url WHERE `time`=0 limit 0,1'
        ret = mysql_dao.execute(sql)
        if len(ret) == 0:
            break
        res = ret[0]
        row_id = res[0]
        author = res[1]
        url = res[2]
        # mark the row as processed first, otherwise the SELECT above keeps
        # returning the same record and the loop never terminates
        sql = 'update zmt_toutiaohao_url set `time`=1 where `id`=' + str(row_id)
        mysql_dao.execute(sql)
        doIt(author, url)
    mysql_dao.close()
    print('game over')
Example #5
#-*- coding:utf-8 -*-
import sys
import requests
from lxml import etree
from public.mysqlpooldao import MysqlDao
from public.headers import Headers
from public.redispooldao import RedisDao
import re
import simplejson

reload(sys)
sys.setdefaultencoding('utf-8')

mysql_dao = MysqlDao()


def get_download_info(subject, url, category_name):
    # url = 'http://cn163.net/archives/2876/'
    # url = 'http://cn163.net/archives/23749/'
    headers = Headers.get_headers()
    res = requests.get(url, headers=headers)
    wb_data = res.content
    selector = etree.HTML(wb_data)

    lines = selector.xpath('//div[@class="entry"]/div[@id="entry"]/p')
    # print len(lines)
    if len(lines) > 1:
        line1 = lines[1]
        xx1 = etree.tostring(line1)
        xx = xx1.split('<strong>')
        if len(xx) % 2 != 0:
Example #6
import os
import sys
import datetime
import requests
from lxml import etree
import simplejson
import traceback
from public.mysqlpooldao import MysqlDao
from public.headers import Headers
from public.redispooldao import RedisDao
from public.hero import Hero

reload(sys)
sys.setdefaultencoding('utf-8')

file_path = os.path.dirname(os.path.abspath(__file__))
mysql_dao = MysqlDao()
redis_dao = RedisDao()
hero = Hero(file_path)

redis_key = 'dianpingtest:20170104_dianping_shop_list_url'


def get_last_page(url):
    last_page = 1
    try:
        headers = Headers.get_headers()
        req = requests.get(url, headers=headers, timeout=5)
        if req.status_code == 200:
            html = req.content
            selector = etree.HTML(html)
            last_pages = selector.xpath(
Example #7
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import requests
import re
import time
import json
from lxml import etree
import pymysql
from public.mysqlpooldao import MysqlDao
mysql_dao = MysqlDao()


def get_normalnews(objid, news_url, title, picture_if):
    res = requests.get(news_url)
    req = res.content
    selector = etree.HTML(req)
    picture_lists = selector.xpath(
        '//div[@class="area article"]/div[@class="article-content"]/p[@align="center"]/descendant::a[@target="_blank"]/img/@src'
    )
    if picture_lists:
        for pictures in picture_lists:
            picture_link = pictures

            sql1 = ('INSERT IGNORE INTO `xcar_news_post_picture_20170505`'
                    '(`objid`,`news_url`,`title`,`picture_link`)'
                    'VALUES ("%s","%s","%s","%s")') % (objid, news_url, title,
                                                       picture_link)
            print sql1
            mysql_dao.execute(sql1)
Example #8
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import simplejson
from public.mysqlpooldao import MysqlDao
from public.redispooldao import RedisDao

redis_key = 'gaode:20170209_gaode_dianping_sectionl'
mysql_dao = MysqlDao()
redis_dao = RedisDao()

if __name__ == '__main__':
    sql = 'SELECT * FROM `a_gaode_section_longitude_latitude` WHERE `status`=0'
    section_lists = mysql_dao.execute(sql)
    # print section_lists
    for section_list in section_lists:
        section_list_json = simplejson.dumps(section_list)
        redis_dao.rpush(redis_key, section_list_json)
        print section_list_json
Example #9
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import pymysql
import requests
from lxml import etree
import re
import time
import json
from public.mysqlpooldao import MysqlDao
mysql_dao = MysqlDao()

# headers = {
#     'Accept': 'text/css,*/*;q=0.1',
#     'Accept-Encoding': 'gzip, deflate, sdch',
#     'Accept-Language': 'zh-CN,zh;q=0.8',
#     'Cache-Control': 'max-age=0',
#     'Connection': 'keep-alive',
#     'Host': 'x.autoimg.cn',
#     'If-Modified-Since': 'Mon, 01 May 2017 02:56:38 GMT',
#     'If-None-Match': '636292329983254104',
#     'Referer': 'http://www.autohome.com.cn/use/1/',
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
# }


def get_newsPostInfo(objid, url, lastpage, title, publish_date):
    title = title
    objid = objid
    publish_date = publish_date
Example #10
    def run(self):
        mysql_dao = MysqlDao()
        redis_dao = RedisDao()
        while True:
            print(self.getName())
            date = time.strftime('%Y%m%d')
            data_json = redis_dao.lpop('queue:toutiao_%s' % date)
            if data_json is None:
                break
            data = simplejson.loads(data_json)
            category_id = data['category_id']
            url = data['url']
            img_main = data['img_main']
            author = data['author']
            try:
                headers = Headers.get_headers()
                proxies = Proxies.get_proxies()
                html = requests.get(url, headers=headers, timeout=30, proxies=proxies).content
                selector = etree.HTML(html)
                status = selector.xpath('//*[@id="aboutus"]/div[1]/span[1]/text()')
                if len(status) > 0:
                    if u'今日头条' in status[0]:
                        category_names = selector.xpath('//div[@class="curpos"]/a[2]/text()')
                        if len(category_names) != 0:
                            category_name = category_names[0]
                            if u'图片' in category_name and u'视频' in category_name:
                                pass
                            else:
                                if category_id != 0:
                                    toutiaohao_authors = selector.xpath('//*[contains(@class,"gc_name")]/text()')
                                    toutiaohao_urls = selector.xpath('//*[contains(@class,"gc_name")]/@href')
                                    try:
                                        toutiaohao_num = 0
                                        for toutiaohao_url in toutiaohao_urls:
                                            toutiaohao_sql = 'insert ignore into zmt_toutiaohao_url (`author`,`url`) values ("' + \
                                                             toutiaohao_authors[toutiaohao_num] + '","' + \
                                                             toutiaohao_urls[
                                                                 toutiaohao_num] + '")'
                                            toutiaohao_num = toutiaohao_num + 1
                                            mysql_dao.execute(toutiaohao_sql)
                                    except Exception as e:
                                        print(e)

                                    title = selector.xpath('//*[@class="title"]/text()')
                                    if len(title) > 0:
                                        title_t = title[0].replace('"', '')
                                    else:
                                        title_t = ''
                                    content = selector.xpath('//*[@class="article-content"]/descendant::text()')
                                    img = selector.xpath('//img[@onerror="javascript:errorimg.call(this);"]/@src')
                                    content_str = ''
                                    img_str = ''
                                    for c in content:
                                        content_str = content_str + '{ycontent}' + c.replace('"', '')
                                    for img_i in img:
                                        img_str = img_str + '{yimg}' + img_i.replace('"', '')
                                    time_now = time.strftime('%Y-%m-%d %H:%M:%S')
                                    time_ts = selector.xpath('//*[@class="time"]/text()')
                                    if len(time_ts) > 0:
                                        time_t = time_ts[0].replace('"', '')
                                    else:
                                        time_t = ''
                                    insert_value = '"' + str(category_id) + '","' + title_t + '","' + content_str + '","' + url + '","' + img_main + '","","' + img_str + '","","' + author + '","' + time_t + '","' + time_now + '","' + time_now + '"'
                                    sql = 'insert ignore into zmt_content (`category_id`,`title`,`content`,`url`,`img_main`,`img_main_oss`,`img`,`img_oss`,`author`,`time`,`created_at`,`updated_at`) values (' + insert_value + ')'
                                    print(sql)
                                    if content_str != '':
                                        mysql_dao.execute(sql)
            except Exception as e:
                print(e)
        mysql_dao.close()
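
The run method in Example #10 calls self.getName(), so it belongs to a threading.Thread subclass. Below is a minimal sketch of how such workers could be defined and started; the class name ToutiaoWorker and the thread count are assumptions, and only the run() loop above comes from the original example.

# Hypothetical wiring for the worker in Example #10: subclass threading.Thread,
# put the loop above into run(), and start several workers in parallel.
import threading

class ToutiaoWorker(threading.Thread):
    def run(self):
        # the Example #10 loop (lpop from queue:toutiao_<date>, parse, insert)
        # would go here
        print('%s done' % self.getName())

if __name__ == '__main__':
    workers = [ToutiaoWorker() for _ in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
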
Example #11
def get_keywords():
    mysql_dao = MysqlDao()
    sql = 'select `names_chn`,`names_eng`,`names_nick`,`directors`,`writers`,`casts` FROM bttiantang_content'
    res = mysql_dao.execute(sql)
    return res
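
A possible way to consume get_keywords(): elsewhere in these examples MysqlDao.execute() returns rows that are indexed positionally (Example #4 reads res[0], res[1], res[2]), so each row is assumed here to be a sequence in the column order of the SELECT.

# Hypothetical usage of get_keywords(); the unpacking order follows the
# column list in the SELECT above.
for row in get_keywords():
    names_chn, names_eng, names_nick, directors, writers, casts = row
    print(names_chn, names_eng)
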