Example 1
def getBrowser(self):
    browser = webdriver.Chrome()
    try:
        browser.get(self.startUrl)
    except Exception:
        mylog.error('open %s failed' % self.startUrl)
    browser.implicitly_wait(30)
    return browser
Example 2
def getBrowser(self):
    browser = webdriver.PhantomJS()
    try:
        browser.get(self.startUrl)
    except Exception:
        mylog.error('open %s failed' % self.startUrl)
    browser.implicitly_wait(20)
    return browser
Example 3
def getBrowser(self):
    browser = webdriver.Firefox()
    print("15")
    try:
        browser.get(self.startUrl)
        print("16")
    except Exception:
        mylog.info('open %s failed' % self.startUrl)
    browser.implicitly_wait(20)
    return browser
Example 4
class TestTime(object):
    def __init__(self):
        self.log = MyLog()
        self.testTime()
        self.testLocaltime()
        self.testSleep()
        self.testStrftime()

    def testTime(self):
        self.log.info(u'Start testing the time.time() function')
        print(u'Current timestamp: time.time()=%f' % time.time())
        print(u'This returns a float: the number of seconds elapsed since the epoch (1970)')
        print('\n')

    def testLocaltime(self):
        self.log.info(u'Start testing the time.localtime() function')
        print(u'Current local time: time.localtime()=%s' % time.localtime())
        print(u'This returns a tuple with a struct_time structure')
        print('\n')

    def testSleep(self):
        self.log.info(u'Start testing the time.sleep() function')
        print(u'This is a timer: time.sleep(5)')
        print(u'Just close your eyes and count to five')
        time.sleep(5)
        print('\n')

    def testStrftime(self):
        self.log.info(u'Start testing the time.strftime() function')
        print(u'This function returns a formatted time string')
        print(u'time.strftime("%%Y-%%m-%%d %%X",time.localtime())=%s' %
              time.strftime("%Y-%m-%d %X", time.localtime()))
        print('\n')
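Note the doubled %% in testStrftime above: the outer %-formatting consumes one level of escaping, so the literal strftime directives survive into the printed label while the inner time.strftime() call expands them. A minimal standalone check of the same idea (illustrative snippet, not part of the original class):

import time

# The outer %-formatting turns '%%Y-%%m-%%d %%X' into the literal label '%Y-%m-%d %X',
# while the inner strftime call expands those directives to the current date and time.
print('time.strftime("%%Y-%%m-%%d %%X") = %s' % time.strftime("%Y-%m-%d %X"))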
Example 5
    def __init__(self):

        self.r = redis.Redis(host=redis_host,
                             port=redis_port,
                             db=redis_name,
                             password=redis_pwd)

        # Kafka configuration
        self.producer = KafkaProducer(bootstrap_servers=bootstrap_servers)
        self.topic = kfk_topic

        # OSS configuration
        self.auth = oss2.Auth(AccessKeyID, AccessKeySecret)
        self.endpoint = EndPoint
        self.bucket = oss2.Bucket(self.auth, self.endpoint, Bucket)  # project name

        self.spec_info = spec_info
        self.brand_name = '广州女装批发'
        self.brand_id = '6'
        self.store_name = '拼拼侠'
        self.store_id = '1'
        self.type_name = "服装"
        self.type_id = "6"
        self.transport_title = "拼拼侠通用运费模板"
        self.transport_id = "11"
        self.mylog = MyLog()
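A minimal standalone sketch of how a producer and bucket configured this way are typically exercised (assumed usage, not from the original project; the broker address, topic, key names, and file paths are placeholders):

import json
import oss2
from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers='localhost:9092')               # placeholder broker
producer.send('demo_topic', json.dumps({'goods_id': 1}).encode('utf-8'))   # publish one record
producer.flush()                                                           # make sure it is sent

auth = oss2.Auth('<AccessKeyID>', '<AccessKeySecret>')                     # placeholder credentials
bucket = oss2.Bucket(auth, '<EndPoint>', '<Bucket>')
bucket.put_object_from_file('images/1.jpg', '/tmp/1.jpg')                  # upload a local file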
Example 6
class GoldNews(SpiderConfig):
    log = MyLog()

    def __init__(self, data_name):
        self.data_name = data_name
        self.DICT_START_URL = {
            '名家点金': 'http://gold.cnfol.com/mingjiadianjin/',
            '机构论金': 'http://gold.cnfol.com/jigoulunjin/'
        }
        self.dict_url_re_pattern = {
            '名家点金': r'http://gold\.cnfol\.com/mingjiadianjin/[0-9]+/[0-9]+\.shtml',
            '机构论金': r'http://gold\.cnfol\.com/jigoulunjin/[0-9]+/[0-9]+\.shtml'
        }
        SpiderConfig.__init__(self, self.DICT_START_URL[self.data_name], self.dict_url_re_pattern[data_name])


    @log.deco_log(sys.argv[0][0:-3] + '.log', "get_data", False)
    def get_data(self):
        list_url = SpiderConfig.get_urls(self)
        assert isinstance(list_url, list)
        list_html = SpiderConfig.get_htmls(self, list_url)
        list_headline = []
        list_date = []
        list_content = []
        check_data = lambda html: html.get_text() if html else "null"
        for soup in list_html:
            headline = soup.find('div', class_='EDArt').find('h1')
            list_headline.append(check_data(headline))
            #print check_data(headline)
            date = soup.find('div', class_="GSTitsL Cf").find('span')
            list_date.append(date.get_text())
            #print date.get_text()
            content = soup.find('div', class_="pageBd")
            if content:
                content = SpiderConfig.clean_str(self, str(content))
            else:
                raise ValueError
            #print content
            list_content.append(content)

        return list_headline, list_date, list_content

    @log.deco_log(sys.argv[0][0:-3] + '.log', "update_data", False)
    def update_data(self, *tuple_data):
        list_headline, list_date, list_content = tuple_data[0][0], tuple_data[0][1], tuple_data[0][2]
        db = SpiderConfig.db
        select_sql = "select headling from [zy_tbNews] WHERE id=(select MAX(id) from [zy_tbNews] WHERE type='%s')" % self.data_name
        update_num = SpiderConfig.check_newest_data(self, select_sql, list_headline)
        if update_num != 0:
            for i in range(update_num)[::-1]:
                insert_sql = "INSERT INTO [zy_tbNews] VALUES ('{}','{}','{}','{}')".format(
                        self.data_name, list_headline[i], list_date[i], list_content[i]
                )
                db.ExecNonQuery(insert_sql.encode('utf-8'))
                print list_headline[i]
                print list_date[i]
                print list_content[i]
            print "%s页面数据上传更新完毕" % self.data_name
        else:
            print "%s页面数据源无最新数据更新" % self.data_name
Example 7
class GetData(object):
    def __init__(self):
        self.url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html'
        self.log = MyLog()
        self.items = self.spider()
        self.pipelines()

    def get_response(self):
        # fetch the page
        flag = True
        ua = UserAgent()
        while flag:
            with open('new3proxy.txt', 'r') as fp:
                lines = fp.readlines()
                index = random.randint(1, len(lines))
                proxys = 'https://' + lines[index - 1].strip()  # strip the trailing newline

            fakeHeaders = {'User-Agent': ua.random}
            request = urllib.request.Request(self.url, headers=fakeHeaders)

            proxy = urllib.request.ProxyHandler({'https': proxys})
            opener = urllib.request.build_opener(proxy)
            urllib.request.install_opener(opener)

            try:
                response = urllib.request.urlopen(request)
                flag = False
                self.log.info(u'load URL: success')
                return response
            except Exception:
                flag = True
                self.log.error(u'load URL: failed')

    def spider(self):
        # extract the data
        items = []
        response = self.get_response()
        soup = BeautifulSoup(response.read(), 'html.parser')
        datas = soup.find('div', {'class': 'news-text'}).find_all('tr')
        for data in datas[1:5]:
            item = Item()
            item.paihang = data.find_all('td')[0].text
            item.name = data.find_all('td')[1].text
            item.address = data.find_all('td')[2].text
            item.score = data.find_all('td')[3].text
            items.append(item)
            self.log.info(u'fetched %s info: success' % item.name)
        return items

    def pipelines(self):
        # clean and save the data
        filename = 'daxuedata.txt'
        with codecs.open(filename, 'w', 'utf8') as fp:
            for item in self.items:
                fp.write('%d \t %s \t %s \t %.f \n' % (int(
                    item.paihang), item.name, item.address, float(item.score)))
                self.log.info(u'saved %s to %s: success' % (item.name, filename))
Example 8
    def getBrowser(self):

        # Originally used PhantomJS for dynamic scraping, but since PhantomJS was deprecated:
        # browser = webdriver.PhantomJS(executable_path='E:/Learning/03-Programme/Python/script-libs/
        # phantomjs-2.1.1-windows/bin/phantomjs.exe')
        chrome_options = Options()

        # Since the set_headless attribute was deprecated, headless mode is set on the Options instance instead
        chrome_options.headless = True
        chrome_options.add_argument('--disable-gpu')
        browser = webdriver.Chrome(chrome_options=chrome_options)

        try:
            browser.get(self.startUrl)
        except Exception as e:
            mylog.error('open the %s failed, exit the script ...' %
                        self.startUrl)
            sys.exit(-1)

        browser.implicitly_wait(20)
        return browser
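In newer Selenium releases (4.x) the chrome_options keyword itself was removed in favor of options, and passing '--headless' as an argument is the more portable way to enable headless mode; an equivalent construction under that assumption would be:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# assumes Selenium 4.x: use options= and add_argument('--headless') instead of the headless property
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
browser = webdriver.Chrome(options=chrome_options)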
Example 9
class GetData(object):
    def __init__(self):
        self.url = 'https://www.toutiao.com/search/?keyword=\xe8\xa1\x97\xe5\xa4\xb4\xe7\xaf\xae\xe7\x90\x83'
        self.log = MyLog()
#        self.urls = self.get_urls()
        self.items = self.spider()
        self.pipelines()
        
#    def get_urls(self):
#        pass
       
    def get_html(self):
        driver = webdriver.PhantomJS()
        driver.get(self.url)
        driver.implicitly_wait(10)
        submitelement = driver.find_element_by_xpath('//div[@class="tabBar"]//li[@class="y-left tab-item "]')
        submitelement.click()
        time.sleep(5)
        pageSource = driver.page_source
        self.log.info(u'successful')
        return pageSource
    
    def spider(self):
        i = 1
        items = []
#        for url in self.urls:
#        response = self.get_response()
        pageSource = self.get_html()
        try:
            soup = BeautifulSoup(pageSource, 'html.parser')
            datas = soup.find_all('div', {'class': 'articleCard'})
            for data in datas:
                item = Item()
                try:
                    item.image_url = data.find('a', {'class': 'img-wrap'}).find('img', {'alt': ''})['src']
                    items.append(item)
                except KeyError:
                    pass
                self.log.info(u'fetched info: success')
        except AttributeError:
            self.log.info(u'url None')
        return items
    
    def pipelines(self):
        filename = '街头篮球1'
        if os.path.exists(filename):
            os.chdir(filename)
        else:
            os.mkdir(filename)
            os.chdir(filename)
        i = 1
        for url in self.items:
            with open(str(i) + '.jpg', 'wb') as fp:
                i += 1
                pic = requests.get(url.image_url)
                fp.write(pic.content)
Example 10
""" MyMongo """
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
####
import bson
from time import sleep
#####
# $ sudo pip install pymongo
#####
import pymongo
#####
from mylog import MyLog

l = MyLog('MyMongo')

__all__ = ['MyMongo']


class MyMongo(object):
    """ mongodb functions """

    # pylint: disable=bare-except
    # pylint: disable=no-self-use
    #    pool = None
    #    dbase = None
    #    connected = False

    def __init__(self, mongo_hosts, son=True):
        """ constructor """
        self.connected = False
Example 11
""" api """
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
####
from password_strength import PasswordPolicy
import pyotp
####
from mymongo import MyMongo
from mylog import MyLog
from myconfig import MyConfig

# pylint: disable=fixme

c = MyConfig()
l = MyLog(c.cfg['virtualenv']['dir'] + '_api')

appname = c.cfg['virtualenv']['dir']

mongohost = c.cfg['dbs']['mongo']['host']
mongodb = c.cfg['dbs']['mongo']['db']
sessionHashSecret = c.cfg['session']['hash_secret']

passpolicylength = c.cfg['password_strength']['length']
passpolicyuppercase = c.cfg['password_strength']['uppercase']
passpolicynumbers = c.cfg['password_strength']['numbers']
passpolicyspecial = c.cfg['password_strength']['special']
passpolicy = PasswordPolicy.from_names(
    length=passpolicylength,  # min length
    uppercase=passpolicyuppercase,  # need min. uppercase letters
    numbers=passpolicynumbers,  # need min. digits
Example 12
#--------------------------------------------------------------------------------------------------------------------------------
from flask import Flask
from flask import request, jsonify
import requests
import time
from general_ocr_recog import general_ocr_client
from paddle_serving_client import Client
import random
import json
import datetime
import numpy as np
import cv2
from mylog import MyLog

logger = MyLog('service').getlog()
from service_config import *
#--------------------------------------------------------------------------------------------------------------------------------
app = Flask(__name__)
app.config['JSON_AS_ASCII'] = False

medical_image_folder = '/data/images/'
medical_json_folder = '/data/json/'

# initialize the detection and recognition clients
det_client = Client()
det_client.load_client_config(
    "./general_ocr_config/det_infer_client/serving_client_conf.prototxt")
det_client.connect(det_ip_port)

#start rec Client
Example 13
    def __init__(self):
        self.url = 'https://www.toutiao.com/search/?keyword=\xe8\xa1\x97\xe5\xa4\xb4\xe7\xaf\xae\xe7\x90\x83'
        self.log = MyLog()
#        self.urls = self.get_urls()
        self.items = self.spider()
        self.pipelines()
Example 14
def __init__(self):
    self.url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html'
    self.log = MyLog()
    self.items = self.spider()
    self.pipelines()
Example 15
class GetData(object):
    def __init__(self):
        self.url = 'https://movie.douban.com/subject/26266893/reviews?start='
        self.log = MyLog()
        self.urls = self.get_urls()
        self.items = self.spider()
        self.pipelines()

    def get_urls(self):
        pages = 60
        urls = []
        for i in range(0, pages, 20):
            url = self.url + str(i)
            urls.append(url)
            self.log.info(u'added URL: success')
        return urls

    def get_response(self, url):
        flag = True
        ua = UserAgent()
        while flag:
            with open('new4proxy.txt', 'r') as fp:
                lines = fp.readlines()
                index = random.randint(1, len(lines))
                proxys = 'https://' + lines[index - 1].strip()  # strip the trailing newline

            fakeHeaders = {'User-Agent': ua.random}
            request = urllib.request.Request(url, headers=fakeHeaders)

            proxy = urllib.request.ProxyHandler({'https': proxys})
            opener = urllib.request.build_opener(proxy)
            urllib.request.install_opener(opener)

            try:
                response = urllib.request.urlopen(request)
                flag = False
                self.log.info(u'load URL: success')
                return response
            except (HTTPError, URLError):
                flag = True
                self.log.error(u'load URL: failed')

    def spider(self):
        items = []
        for url in self.urls:
            response = self.get_response(url)
            try:
                item = Item()
                soup = BeautifulSoup(response.read(), 'html.parser')
                item.name = soup.find('a', {'class': 'name'}).text
                item.content = soup.find('div', {
                    'class': 'short-content'
                }).text
                items.append(item)
                self.log.info(u'fetched %s info: success' % item.name)
            except AttributeError:
                self.log.info(u'url None')
        return items

    def pipelines(self):
        filename = 'newdata.txt'
        with codecs.open(filename, 'w', 'utf8') as fp:
            for item in self.items:
                fp.write('%s \t %s \n' % (item.name, item.content))
                self.log.info(u'saved %s to %s: success' % (item.name, filename))
Example 16
import pymysql
import re
import redis
import time
import phpserialize
import itertools
import os
import hashlib
import sys
sys.path.append("../")
from Fix.settings import mysql_host, mysql_port, mysql_db_user, mysql_db_pwd, mysql_db_name, mysql_db_charset
from Fix.settings import redis_host, redis_port, redis_pwd, redis_name
from Fix.settings import image_path, store_id
from mylog import MyLog

mylog = MyLog()


class FixPipeline(object):
    brand_info = dict()
    cat_info = dict()
    img_url = ""

    def __init__(self):
        self.client = pymysql.connect(
            host=mysql_host,
            port=mysql_port,
            user=mysql_db_user,  # use your own username
            passwd=mysql_db_pwd,  # use your own password
            db=mysql_db_name,  # database name
            charset=mysql_db_charset)
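A minimal standalone sketch of how a pymysql connection configured like this is typically queried (assumed usage, not from the original pipeline; credentials and the table name are placeholders):

import pymysql

client = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                         passwd='secret', db='test', charset='utf8')  # placeholder credentials
with client.cursor() as cursor:  # pymysql cursors work as context managers
    cursor.execute("SELECT COUNT(*) FROM goods")  # placeholder table
    print(cursor.fetchone())
client.commit()
client.close()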
Example 17
def __init__(self):
    self.url = 'https://movie.douban.com/subject/26266893/reviews?start='
    self.log = MyLog()
    self.urls = self.get_urls()
    self.items = self.spider()
    self.pipelines()
Example 18
""" mysignal """
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
####
import signal
####
from mylog import MyLog

l = MyLog('MySignal')

__all__ = ['MySignal']


class MySignal(object):
    def __init__(self):
        """ constructor """
        self.exitFlag = 0

    def signalhandler(self, signum, stack):
        """ handle ctrl-c signal """
        l.log('Exiting gracefully! ', 'info')
        self.exitFlag = 1

    def set_signalhandler(self):
        """ set signalhandler """
        signal.signal(signal.SIGTERM, self.signalhandler)
        signal.signal(signal.SIGINT, self.signalhandler)
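A minimal usage sketch for MySignal (assumed pattern, not shown in the original source): register the handlers, then let the main loop poll exitFlag so SIGTERM/SIGINT trigger a graceful shutdown.

import time
from mysignal import MySignal  # assumed module path, matching the import style used above

sig = MySignal()
sig.set_signalhandler()   # register the SIGTERM and SIGINT handlers
while not sig.exitFlag:   # loop until a signal flips the flag
    time.sleep(1)         # placeholder for real work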
Example 19
                    "step": "86400",
                    "tunes": i,
                    "count": count,
                }

                response = json.loads(
                    requests.post(url_, headers=headers, data=data).text)
                if response:
                    for data in response:
                        item = {}
                        item['onlyKey'] = (mid[0].upper() + mid[1:] + "_" +
                                           bName.upper() + "_" + unit.upper())
                        item['type'] = "Alcoin"
                        item['Measurement'] = "kline"
                        item['Timestamp'] = int(data[0]) * 1000
                        item['Open'] = data[1]
                        item['High'] = data[2]
                        item['Low'] = data[3]
                        item['Close'] = data[4]
                        item['Volume'] = data[5]
                        content.append(item)
            self.kfk.process_item(content)
        except Exception as e:
            log.info("this data an error {}".format(e))


if __name__ == '__main__':
    log = MyLog()
    log.debug("程序正在运行·····")
    a = get_requests()
    a.get_head()
Example 20
def __init__(self):
    self.log = MyLog()
    self.testTime()
    self.testLocaltime()
    self.testSleep()
    self.testStrftime()