def _db_save_loop(self):
    # Consumer loop: pull parsed scholar records off the queue and hand each one
    # to the gevent pool for insertion through a per-record SqlHelper.
    while True:
        parm = self.parm_queue.get(block=True)
        gevent.sleep(0.1)
        self.count += 1
        sql_helper = SqlHelper(logger=self.logger)
        self.crawl_pool.spawn(sql_helper.insert_scholar, **parm)
# coding:utf-8
from db.SqlHelper import SqlHelper
from util.exception import Con_DB_Fail

try:
    sqlhelper = SqlHelper()
    sqlhelper.init_db()
except Exception:
    raise Con_DB_Fail

proxy = {
    'ip': '192.168.1.1',
    'port': int('80'),
    'type': 0,
    'protocol': 0,
    'country': u'中国',
    'area': u'四川',
    'speed': 0,
}
sqlhelper.insert(proxy)
# standard library
import json
# tornado
import tornado.httpserver
import tornado.ioloop
import tornado.options
import tornado.web
from tornado.options import define, options

from config import DEFAULT_SELECT_LIMIT, API_PORT
from db.SqlHelper import SqlHelper
from api.tornadoLog import logger, init_tornado_log

sqlhelper = SqlHelper()


class GetProxy(tornado.web.RequestHandler):
    def data_received(self, chunk):
        pass

    def get(self):
        # Return up to `count` proxies as JSON; both query arguments fall back to defaults.
        count = self.get_argument('count', DEFAULT_SELECT_LIMIT)
        use_flag = self.get_argument('use_flag', 'default')
        self.write(
            json.dumps(sqlhelper.select(count=int(count), use_flag=use_flag)))


class Used(tornado.web.RequestHandler):
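A minimal sketch of how the GetProxy handler above might be served (the Used handler is truncated here). The '/' route, the serve_api_sketch name, and the bare parse_command_line call are assumptions made for illustration; only API_PORT and the handler itself come from the snippet.

# Hypothetical wiring -- a sketch under the assumptions stated above.
def serve_api_sketch():
    tornado.options.parse_command_line()
    app = tornado.web.Application([
        (r'/', GetProxy),          # assumed route; the real route table is not shown
    ])
    tornado.httpserver.HTTPServer(app).listen(API_PORT)
    tornado.ioloop.IOLoop.current().start()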
# coding:utf-8
import sys

from util.exception import Con_DB_Fail

try:
    from db.SqlHelper import SqlHelper
    sqlHelper = SqlHelper()
    sqlHelper.init_db()
except Exception as e:
    print(str(e))
    raise Con_DB_Fail


def store_data(queue2, db_proxy_num):
    successNum = 0
    failNum = 0
    while True:
        try:
            proxy = queue2.get(timeout=300)
            if proxy:
                sqlHelper.insert(proxy)
                successNum += 1
            else:
                failNum += 1
            # Use `msg` instead of shadowing the built-in `str` for the progress line.
            msg = 'IPProxyPool----->>>>>>>>Success ip num :%d,Fail ip num:%d' % (successNum, failNum)
            sys.stdout.write(msg + "\r")
            sys.stdout.flush()
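A rough sketch of how store_data might be driven from a parent process. The crawl_worker producer, the Queue hand-off, and the shape of db_proxy_num are all placeholders for illustration; only the store_data signature comes from the snippet above.

# Hypothetical process wiring -- illustrative only.
from multiprocessing import Process, Queue

def wiring_sketch(crawl_worker, db_proxy_num=None):
    queue2 = Queue()
    saver = Process(target=store_data, args=(queue2, db_proxy_num))
    saver.daemon = True            # store_data loops forever; let it exit with the parent
    crawler = Process(target=crawl_worker, args=(queue2,))   # placeholder producer
    saver.start()
    crawler.start()
    crawler.join()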
class ProxyCrawl(object):
    def __init__(self):
        self.sqlhelper = SqlHelper()
        # self.sqlhelper.init_db()
        self.proxies_set = set()
        self.url_count = 0
        self.url_total = sum([len(p['urls']) for p in parserList])

    def run(self):
        while True:
            self.proxies_set.clear()
            logger.info('------> loop begin')
            # TODO: remove invalid proxies
            logger.info('Removed %d invalid proxies.' % 0)

            # Load the existing proxies and count them.
            count = 0
            for proxy in self.sqlhelper.select_all():
                count += 1
                self.proxies_set.add('%s:%s' % (proxy[0], proxy[1]))
            begin_num = len(self.proxies_set)
            logger.info('Existing proxies: %d.' % begin_num)
            if begin_num != count:
                logger.error('Duplicate proxies in the database -- %d --> %d' % (count, begin_num))

            logger.info('Crawling new proxies...')
            spawns = []
            self.url_count = 0
            for p in parserList:
                spawns.append(gevent.spawn(self.crawl, p))
                if len(spawns) >= MAX_DOWNLOAD_CONCURRENT:
                    gevent.joinall(spawns)
                    spawns = []
            gevent.joinall(spawns)
            spawns.clear()

            end_num = len(self.proxies_set)
            logger.info('Crawl finished: %d proxies now, %d newly added!' % (end_num, end_num - begin_num))
            logger.info('------> loop end, sleep %ds!\n' % UPDATE_TIME)
            time.sleep(UPDATE_TIME)

    def crawl(self, parser):
        """Download and parse every URL configured for one parser."""
        html_parser = Html_Parser()
        for url in parser['urls']:
            response = Html_Downloader.download(url)
            if response is not None:
                proxy_list = html_parser.parse(response, parser)
                if proxy_list is not None:
                    # Check the crawled proxies and keep only the ones not seen yet.
                    count, new = 0, 0
                    for proxy in proxy_list:
                        count += 1
                        proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
                        if proxy_str not in self.proxies_set:
                            self.proxies_set.add(proxy_str)
                            new += 1
                            self.sqlhelper.insert(proxy)
                    self.url_count += 1
                    logger.info('%d/%d -- <%s> fetched %d, %d not seen before'
                                % (self.url_count, self.url_total, url, count, new))
                else:
                    self.url_count += 1
                    logger.warning('%d/%d -- <%s> failed to parse the page'
                                   % (self.url_count, self.url_total, url))
            else:
                self.url_count += 1
                logger.warning('%d/%d -- <%s> failed to download the page'
                               % (self.url_count, self.url_total, url))
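A minimal entry-point sketch for running the crawler class above. The gevent monkey-patching step and the bare construction of ProxyCrawl are assumptions about how the greenlets are meant to cooperate, not the project's actual startup code.

# Hypothetical entry point -- a sketch under the assumptions stated above.
if __name__ == '__main__':
    from gevent import monkey
    monkey.patch_all()          # assumed: patch blocking I/O so gevent.spawn can overlap downloads
    ProxyCrawl().run()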
import os
import sys

sys.path.append(os.path.join(os.getcwd().split('scholar')[0], 'scholar'))

from utils.logger import get_logger
from utils.set_value import set_value
from db.SqlHelper import SqlHelper
import pymysql

conn = pymysql.connect(host='localhost', user='******', passwd='weiaizq1314',
                       db='eb', port=3306)
cur = conn.cursor()
sqlhepler = SqlHelper(logger=get_logger("syl"))

cur.execute("""
    select * from sc;
""")
res = iter(cur.fetchall())
while True:
    tmp = next(res)
    # The rows still carry SQL dump text, so strip the wrapping to recover plain values.
    name = tmp[0].replace("INSERT INTO `sc` VALUES ('", "").replace("'", "").strip()
    email = tmp[1].replace("'", "")
    major = tmp[2].replace("'", "")
    website = tmp[3].replace("'", "")
    avatar = tmp[4].replace("'", "").replace(");", "")
    if "Texas" in major:
        organization = "Texas A&M University"
import os
import sys

from nameparser import HumanName

sys.path.append(os.path.join(os.getcwd().split('scholar')[0], 'scholar'))

PACKAGE_PARENT = '..'
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))

from utils.logger import get_logger
from utils.connection import *
from db.SqlHelper import SqlHelper
from utils.set_value import set_value

sqlhelper = SqlHelper(logger=get_logger("wz"))


def bme():
    html = fetch("https://bme.ucdavis.edu/people/departmental-faculty/")
    item_list = extract("//tbody/tr", html, multi=True)
    for i in item_list:
        # Serialize the row once and run all the field XPaths against it.
        row_html = str(etree.tostring(i))
        avatar = extract("//td[1]/a/img/@src", row_html)
        name = extract("//td[2]/a/strong/text()", row_html)
        major = "Biomedical Engineering"
        organzation = "University of California, Davis"
        website = extract("//a[@rel='noopener noreferrer']/@href", row_html)
        if extract("//td[2]/a/@href", row_html) is not None:
            sc_url = extract("//td[2]/a/@href", row_html)
@editor: PyCharm
@create: 2017/8/18 1:14
@description: --
"""
import os
import sys
from pprint import pprint

sys.path.append(os.path.join(os.getcwd().split('scholar')[0], 'scholar'))

from db.SqlHelper import SqlHelper
from utils.logger import get_logger
from utils.set_value import set_value
import simplejson
from utils.get_file_path import current_file_name

sqlhepler = SqlHelper(logger=get_logger("dr.wang"))

tmp = "C:/Users/tonylu/Desktop/UConn"


def Wang_db_save(rootDir):
    # Walk the directory tree and load every JSON file of scholar records.
    for lists in os.listdir(rootDir):
        path = os.path.abspath(os.path.join(rootDir, lists))
        if os.path.isdir(path):
            Wang_db_save(path)
        else:
            with open(path, 'r') as f:
                res = simplejson.load(f)
                for key, i in res.items():
                    if len(i) == 5:
def _feed_db(self, **parm):
    # Insert one parsed scholar record through a per-call SqlHelper instance.
    sqlhelper = SqlHelper(logger=self.logging)
    sqlhelper.insert_scholar(**parm)