Example #1
class ProxyCrawl(object):
    def __init__(self):
        self.sqlhelper = SqlHelper()
        # self.sqlhelper.init_db()

        self.proxies_set = set()
        self.url_count = 0
        self.url_total = sum([len(p['urls']) for p in parserList])
Example #2
    def _db_save_loop(self):
        while True:
            parm = self.parm_queue.get(block=True)
            gevent.sleep(0.1)
            self.count += 1
            sqlhelper = SqlHelper(logger=self.logger)
            self.crawl_pool.spawn(sqlhelper.insert_scholar, **parm)
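The consumer above blocks on self.parm_queue and hands each item to a gevent pool. A minimal producer sketch, assuming parm_queue is a gevent queue and that the keyword names in the dict match what SqlHelper.insert_scholar expects (both are assumptions, not shown in the snippet):

# Hypothetical producer side for _db_save_loop(); the gevent queue and
# the keyword names below are assumptions for illustration only.
from gevent.queue import Queue

parm_queue = Queue()
parm_queue.put({'name': 'Jane Doe', 'email': 'jane@example.edu'})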
Example #3
# coding:utf-8
from db.SqlHelper import SqlHelper
from util.exception import Con_DB_Fail

try:
    sqlhelper = SqlHelper()
    sqlhelper.init_db()
except Exception:
    raise Con_DB_Fail

proxy = {
    'ip': '192.168.1.1',
    'port': int('80'),
    'type': 0,
    'protocol': 0,
    'country': u'中国',
    'area': u'四川',
    'speed': 0
}
sqlhelper.insert(proxy)
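As a quick check after the insert, a minimal read-back sketch, assuming select_all() returns rows whose first two fields are ip and port (the layout Example #7 relies on):

# Minimal sketch: list the stored proxies; the (ip, port) column order
# is an assumption taken from Example #7's usage of select_all().
for row in sqlhelper.select_all():
    print('%s:%s' % (row[0], row[1]))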
Example #4
# coding:utf-8
from db.SqlHelper import SqlHelper
from util.exception import Con_DB_Fail

try:
    sqlhelper = SqlHelper()
    sqlhelper.init_db()
except Exception:
    raise Con_DB_Fail

proxy = {
    'ip': '192.168.1.1',
    'port': int('80'),
    'type': 0,
    'protocol': 0,
    'country': u'中国',
    'area': u'四川',
    'speed': 0
}
sqlhelper.insert(proxy)
Example #5
# standard library
import json

# tornado
import tornado.httpserver
import tornado.ioloop
import tornado.options
import tornado.web
from tornado.options import define, options

from config import DEFAULT_SELECT_LIMIT, API_PORT
from db.SqlHelper import SqlHelper as SqlHelper
from api.tornadoLog import logger, init_tornado_log

sqlhelper = SqlHelper()


class GetProxy(tornado.web.RequestHandler):
    def data_received(self, chunk):
        pass

    def get(self):
        count = self.get_argument('count', DEFAULT_SELECT_LIMIT)
        use_flag = self.get_argument('use_flag', 'default')

        self.write(
            json.dumps(sqlhelper.select(count=int(count), use_flag=use_flag)))


class Used(tornado.web.RequestHandler):
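The handlers above are only a fragment of the API module; a hypothetical bootstrap is sketched below. The '/get' route and the Application wiring are assumptions, only GetProxy, API_PORT and the tornado imports appear in the example.

# Hypothetical server bootstrap; the '/get' route is an assumption.
def run_api_server():
    tornado.options.parse_command_line()
    app = tornado.web.Application(handlers=[(r'/get', GetProxy)])
    server = tornado.httpserver.HTTPServer(app)
    server.listen(API_PORT)
    tornado.ioloop.IOLoop.instance().start()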
Example #6
# coding:utf-8
import sys
from util.exception import Con_DB_Fail


try:
    from db.SqlHelper import SqlHelper as SqlHelper
    sqlHelper = SqlHelper()
    sqlHelper.init_db()
except Exception as e:
    print(str(e))
    raise Con_DB_Fail



def store_data(queue2, db_proxy_num):

    successNum = 0
    failNum = 0
    while True:
        try:
            proxy = queue2.get(timeout=300)
            if proxy:

                sqlHelper.insert(proxy)
                successNum += 1
            else:
                failNum += 1
            msg = 'IPProxyPool----->>>>>>>>Success ip num :%d,Fail ip num:%d' % (successNum, failNum)
            sys.stdout.write(msg + "\r")
            sys.stdout.flush()
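A hypothetical driver for store_data(); only the (queue2, db_proxy_num) signature and the proxy dict shape come from the examples, the multiprocessing wiring is an assumption.

# Hypothetical driver; the Process/Queue wiring is illustrative only.
from multiprocessing import Process, Queue

if __name__ == '__main__':
    queue2 = Queue()
    worker = Process(target=store_data, args=(queue2, 0))
    worker.start()
    queue2.put({'ip': '192.168.1.1', 'port': 80, 'type': 0, 'protocol': 0,
                'country': u'中国', 'area': u'四川', 'speed': 0})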
Example #7
class ProxyCrawl(object):
    def __init__(self):
        self.sqlhelper = SqlHelper()
        # self.sqlhelper.init_db()

        self.proxies_set = set()
        self.url_count = 0
        self.url_total = sum([len(p['urls']) for p in parserList])

    def run(self):
        while True:
            self.proxies_set.clear()
            logger.info('------> loop begin')

            # TODO: delete invalid proxies
            logger.info('Deleted %d invalid proxies.' % 0)

            # existing proxies and their count
            count = 0
            for proxy in self.sqlhelper.select_all():
                count += 1
                self.proxies_set.add('%s:%s' % (proxy[0], proxy[1]))
            begin_num = len(self.proxies_set)
            logger.info('Current proxy count: %d.' % begin_num)
            if begin_num != count:
                logger.error('Duplicate proxies found in the database -- %d --> %d' % (count, begin_num))

            logger.info('Crawling new proxies...')
            spawns = []
            self.url_count = 0
            for p in parserList:
                spawns.append(gevent.spawn(self.crawl, p))
                if len(spawns) >= MAX_DOWNLOAD_CONCURRENT:
                    gevent.joinall(spawns)
                    spawns = []
            gevent.joinall(spawns)
            spawns.clear()

            end_num = len(self.proxies_set)
            logger.info('Crawl finished, current proxy count %d, %d newly added!' %
                        (end_num, end_num - begin_num))

            logger.info('------> loop end, sleep %ds!\n' % UPDATE_TIME)

            time.sleep(UPDATE_TIME)

    def crawl(self, parser):
        """
        爬取
        :param parser:
        :return:
        """
        html_parser = Html_Parser()
        for url in parser['urls']:
            response = Html_Downloader.download(url)
            if response is not None:
                proxy_list = html_parser.parse(response, parser)
                if proxy_list is not None:
                    # check the crawled proxies
                    count, new = 0, 0
                    for proxy in proxy_list:
                        count += 1
                        proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
                        if proxy_str not in self.proxies_set:
                            self.proxies_set.add(proxy_str)
                            new += 1
                            self.sqlhelper.insert(proxy)
                    self.url_count += 1
                    logger.info(
                        '%d/%d -- <%s> fetched %d, %d not previously recorded' %
                        (self.url_count, self.url_total, url, count, new))
                else:
                    self.url_count += 1
                    logger.warning('%d/%d -- <%s> failed to parse data' %
                                   (self.url_count, self.url_total, url))
            else:
                self.url_count += 1
                logger.warning('%d/%d -- <%s> failed to download page' %
                               (self.url_count, self.url_total, url))
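A hypothetical entry point for ProxyCrawl; only ProxyCrawl().run() comes from the example, the gevent monkey-patching is an assumption based on the gevent usage above.

# Hypothetical entry point; monkey.patch_all() is assumed, not shown above.
from gevent import monkey
monkey.patch_all()

crawler = ProxyCrawl()
crawler.run()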
Example #8
import os
import sys

sys.path.append(os.path.join(os.getcwd().split('scholar')[0], 'scholar'))

from utils.logger import get_logger
from utils.set_value import set_value

from db.SqlHelper import SqlHelper
import pymysql
conn = pymysql.connect(host='localhost',
                       user='******',
                       passwd='weiaizq1314',
                       db='eb',
                       port=3306)
cur = conn.cursor()
sqlhepler = SqlHelper(logger=get_logger("syl"))
cur.execute("""
                  select * from sc;
                  """)
res = iter(cur.fetchall())

# Each row of `sc` stores a raw "INSERT INTO `sc` VALUES (...)" string,
# so the SQL wrapping is stripped off to recover the individual fields.
while True:
    tmp = next(res)
    name = tmp[0].replace("INSERT INTO `sc` VALUES ('",
                          "").replace("'", "").strip()
    email = tmp[1].replace("'", "")
    major = tmp[2].replace("'", "")
    website = tmp[3].replace("'", "")
    avatar = tmp[4].replace("'", "").replace(");", "")
    if "Texas" in major:
        organization = "Texas A&M University"
Example #9
import os
import sys

from nameparser import HumanName
sys.path.append(os.path.join(os.getcwd().split('scholar')[0], 'scholar'))

PACKAGE_PARENT = '..'
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))

from utils.logger import get_logger
from utils.connection import *
from db.SqlHelper import SqlHelper
from utils.set_value import set_value
sqlhelper = SqlHelper(logger=get_logger("wz"))


def bme():
    html = fetch("https://bme.ucdavis.edu/people/departmental-faculty/")
    item_list = extract("//tbody/tr", html, multi=True)
    for i in item_list:
        avatar = extract("//td[1]/a/img/@src", str(etree.tostring(i)))
        name = extract("//td[2]/a/strong/text()", str(etree.tostring(i)))
        major = "Biomedical Engineering"
        organization = "University of California, Davis"
        website = extract("//a[@rel='noopener noreferrer']/@href",
                          str(etree.tostring(i)))

        if extract("//td[2]/a/@href", str(etree.tostring(i))) is not None:
            sc_url = extract("//td[2]/a/@href", str(etree.tostring(i)))
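The HumanName import above is otherwise unused in this fragment; a sketch of how the scraped name could be split, assuming extract() returns a plain string here:

# Sketch only: split the scraped name with nameparser's HumanName.
parsed = HumanName(name)
first_name, last_name = parsed.first, parsed.last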
Example #10
"""
@editor:    PyCharm
@create:    2017/8/18 1:14
@description:
            --
"""
import os
import sys
from pprint import pprint
sys.path.append(os.path.join(os.getcwd().split('scholar')[0], 'scholar'))

from db.SqlHelper import SqlHelper
from utils.logger import get_logger
from utils.set_value import set_value
import simplejson
from utils.get_file_path import current_file_name
sqlhepler = SqlHelper(logger=get_logger("dr.wang"))

tmp = "C:/Users/tonylu/Desktop/UConn"


def Wang_db_save(rootDir):
    for lists in os.listdir(rootDir):
        path = os.path.abspath(os.path.join(rootDir, lists))
        if os.path.isdir(path):
            Wang_db_save(path)
        else:
            with open(path, 'r') as f:
                res = simplejson.load(f)
            for key, i in res.items():
                if len(i) == 5:
Example #11
    def _feed_db(self, **parm):
        sqlhepler = SqlHelper(logger=self.logging)
        sqlhepler.insert_scholar(**parm)