# coding:utf-8
import base64
import json
import time

from com.unif.simuwang.ObtainSimuwangInfo import ObtainSimuwangInfo
from com.unif.util.DateUtil import DateUtil
from com.unif.util.HttpUtil import HttpUtil
from com.unif.util.LogUtil import LogUtil
from com.unif.vo.paramater import paramater

logger = LogUtil.get_logger('SaveSimuwangArticle')


# Saves crawled Simuwang articles
class SaveSimuwangArticle:
    # Pool of browser User-Agent strings; requests rotate through these so the
    # crawler looks like ordinary browser traffic
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        'Opera/8.0 (Windows NT 5.1; U; en)',
        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
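
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original source: how a User-Agent pool
# like SaveSimuwangArticle.USER_AGENTS is typically consumed. The fetch helper
# below is hypothetical (the project routes its real requests through
# HttpUtil), but random.choice per request is the standard rotation pattern.
# ---------------------------------------------------------------------------
import random
import urllib.request


def fetch_with_random_ua(url, user_agents):
    # A different browser identity per request makes the crawler harder to block
    req = urllib.request.Request(url, headers={'User-Agent': random.choice(user_agents)})
    with urllib.request.urlopen(req, timeout=10) as resp:
        return resp.read().decode('utf-8', errors='replace')
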
# coding:utf-8
import time

from com.unif.util.LogUtil import LogUtil
from com.unif.util.SendEmailUtil import SendEmailUtil

logger = LogUtil.get_logger('DateUtil')


class DateUtil:
    def __init__(self):
        logger.info('Initializing date utility class')

    # Input example: '2019年01月09日 14:02:00'
    @staticmethod
    def time_transfer(publish_time):
        try:
            # strptime belongs inside the try: a malformed date string raises
            # here, and date_time would otherwise be unbound below
            array = time.strptime(publish_time, u"%Y年%m月%d日 %H:%M:%S")
            date_time = time.strftime("%Y-%m-%d %H:%M:%S", array)
        except Exception as e:
            logger.error(e)
            SendEmailUtil.send_email('Year-month-day time conversion failed', e)
            return None
        return date_time

    # To sidestep format differences, only compare exact timestamps
    @staticmethod
    def verify_time(time_str):
        if time_str is None:
            return True
        try:
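
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original source: the conversion
# DateUtil.time_transfer performs, reduced to the two stdlib calls it wraps.
# Literal characters in a strptime format string (年, 月, 日 here) must match
# the input exactly.
# ---------------------------------------------------------------------------
import time

parsed = time.strptime('2019年01月09日 14:02:00', u"%Y年%m月%d日 %H:%M:%S")
print(time.strftime("%Y-%m-%d %H:%M:%S", parsed))  # -> 2019-01-09 14:02:00
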
# coding:utf-8
from bs4 import BeautifulSoup

from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('ObtainVentureInfo')


class ObtainVentureInfo:
    def __init__(self):
        logger.info("Initializing: ObtainVentureInfo")

    # Extract the article title
    def find_title(self, data):
        soup = BeautifulSoup(data, 'html.parser', from_encoding='utf-8')
        title_info = soup.find_all('h1', class_='h1_01')
        if not title_info:  # find_all returns a list, never None
            return '无题'  # "Untitled"
        title = title_info[0].attrs['title']
        if title is None:
            return '无题'
        # result = eval(repr(title).replace('\\', ''))
        # result = eval(repr(result).replace('/', ''))
        # result = eval(repr(result).replace('*', ''))
# coding:utf-8
import re  # regular expressions

from bs4 import BeautifulSoup

from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('ObtainPeDailyInfo')


class ObtainPeDailyInfo:
    def __init__(self):
        logger.info("Initializing: ObtainPeDailyInfo")

    # Extract the article title
    def find_title(self, data):
        soup = BeautifulSoup(data, 'html.parser', from_encoding='utf-8')
        title_info = soup.find_all('div', class_='main final-content')
        if not title_info:  # find_all returns a list, never None
            return '无题'  # "Untitled"
        title = title_info[0].attrs['data-title']
        if title is None:
            return '无题'
        # Strip backslashes and slashes with plain str.replace; the original
        # eval(repr(...)) round-trip raises a SyntaxError whenever the title
        # contains a quote character
        result = title.replace('\\', '').replace('/', '')
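
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original source: why find_title above
# now uses plain str.replace. The original eval(repr(...)) round-trip breaks
# as soon as a title contains both quote characters, because stripping
# backslashes corrupts repr's escaping.
# ---------------------------------------------------------------------------
title = 'he said "hi", it\'s fine'
print(title.replace('\\', '').replace('/', ''))  # plain replace is safe for any title
# eval(repr(title).replace('\\', ''))  # SyntaxError: the escaped apostrophe loses its backslash
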
# coding:utf-8
import datetime
import json
import re

from bs4 import BeautifulSoup

from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('Obtain36KrInfo')


class Obtain36KrInfo:
    def __init__(self):
        logger.info("Initializing: Obtain36KrInfo")

    # Extract the article title
    def find_title(self, data):
        soup = BeautifulSoup(data, 'html.parser', from_encoding='utf-8')
        if soup.h1 is None:
            return '无题'  # "Untitled"
        return soup.h1.string

    # Extract the list of paginated results
    def find_pages1(self, data):
        result = {}
        soup = BeautifulSoup(data, 'html.parser', from_encoding='utf-8')
        content = soup.find_all('script')
        if content is None:
# coding:utf-8
import base64
import json

from com.unif.kr.Obtain36KrInfo import Obtain36KrInfo
from com.unif.util.DateUtil import DateUtil
from com.unif.util.HttpUtil import HttpUtil
from com.unif.util.LogUtil import LogUtil
from com.unif.vo.paramater import paramater

logger = LogUtil.get_logger('SaveKrArticle')


# Saves crawled 36Kr articles
class SaveKrArticle:
    def __init__(self):
        self.obtainInfo = Obtain36KrInfo()
        logger.info("Initializing: SaveKrArticle")

    # Save one article
    def save_article(self, categoryName, tag, url, imgurl):
        data = HttpUtil.get_html(url)
        if data is None:
            return True
        title = self.obtainInfo.find_title(data)
        authors = self.obtainInfo.find_author_info(data)
        context = self.obtainInfo.find_context(data)
        subject = self.obtainInfo.find_subject(data)
        tags = tag
        author = ''
        public_time = ''
# coding:utf-8
import base64
import json

from com.unif.jfz.ObtainJfzInfo import ObtainJfzInfo
from com.unif.util.DateUtil import DateUtil
from com.unif.util.HttpUtil import HttpUtil
from com.unif.util.LogUtil import LogUtil
from com.unif.vo.paramater import paramater

logger = LogUtil.get_logger('SaveJfzArticle')


# Saves crawled JFZ articles
class SaveJfzArticle:
    def __init__(self):
        self.obtainInfo = ObtainJfzInfo()
        logger.info("Initializing: SaveJfzArticle")

    # Save one article
    def save_article(self, categoryName, tag, url, desc):
        data = HttpUtil.get_html(url)
        if data is None:
            return True
        title = self.obtainInfo.find_title(data)
        authors = self.obtainInfo.find_author_info(data)
        context = self.obtainInfo.find_context(data)
        subject = desc
        tags = tag
        author = ''
# coding:utf-8
import datetime

from bs4 import BeautifulSoup

from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('ObtainSimuwangInfo')


class ObtainSimuwangInfo:
    def __init__(self):
        logger.info("Initializing: ObtainSimuwangInfo")

    def get_soup_obj(self, html_str):
        return BeautifulSoup(html_str, 'html.parser', from_encoding='utf-8')

    # Article title
    def get_title(self, data):
        article_title_obj = data.find('div', class_='article-header')
        return article_title_obj.h1.string

    # Publish time
    def get_time(self, data):
        article_time_obj = data.find('span', class_='time')
        if len(article_time_obj.string) == 4:
            # A 4-character value carries no full date; fall back to "now"
            return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        else:
            return article_time_obj.string

    # Description / summary
    def get_desc(self, data):
        article_desc_obj = data.find('meta', attrs={'name': 'Description'})
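
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original source: the BeautifulSoup
# selectors ObtainSimuwangInfo relies on, run against a minimal inline page.
# The HTML snippet is invented to mirror the structure the class expects.
# ---------------------------------------------------------------------------
from bs4 import BeautifulSoup

_html = (
    '<div class="article-header"><h1>示例标题</h1></div>'
    '<span class="time">2019-01-09 14:02:00</span>'
    '<meta name="Description" content="article summary">'
)
_soup = BeautifulSoup(_html, 'html.parser')
print(_soup.find('div', class_='article-header').h1.string)          # 示例标题
print(_soup.find('span', class_='time').string)                      # 2019-01-09 14:02:00
print(_soup.find('meta', attrs={'name': 'Description'})['content'])  # article summary
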
# coding:utf-8
import threading

from com.unif.util.HttpUtil import HttpUtil
from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('PeDailyThreads')


class PeDailyThreads(threading.Thread):
    def __init__(self, thread_id, url, categoryName, obtain, save):
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.url = url
        self.obtain = obtain
        self.save = save
        self.categoryName = categoryName
        logger.info("Initializing: PeDailyThreads")

    def run(self):
        # %s-style lazy formatting; passing the id as a bare extra argument
        # (as the original did) makes the logging module raise internally
        logger.info("Starting thread: %s", self.thread_id)
        i = 0
        flag = True
        while True:
            i = i + 1
            act_url = self.url + str(i)
            logger.info(act_url)
            html = HttpUtil.get_html(act_url)
            if html is None:
# coding:utf-8
from bs4 import BeautifulSoup

from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('ObtainJfzInfo')


class ObtainJfzInfo:
    def __init__(self):
        logger.info("Initializing: ObtainJfzInfo")

    # Extract the article title
    def find_title(self, data):
        soup = BeautifulSoup(data, 'html.parser', from_encoding='utf-8')
        v = soup.find('div', class_='title').span
        if v is None:
            return '无题'  # "Untitled"
        return v.string

    # Extract the list of article pages
    def find_pages(self, data):
        result = {}
        soup = BeautifulSoup(data, 'html.parser', from_encoding='utf-8')
        content = soup.find('div', class_='article-list').find_all('div', class_='con con-description')
        if not content:  # find_all returns a list, never None
            return result
        for v in content:
            url = 'https://v.jfz.com' + v.a.attrs['href']
# coding:utf-8
import threading

from com.unif.util.HttpUtil import HttpUtil
from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('KrThreads')


class KrThreads(threading.Thread):
    def __init__(self, thread_id, url, sub_url, categoryName, tag, obtain, save):
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.url = url
        self.obtain = obtain
        self.save = save
        self.categoryName = categoryName
        self.sub_url = sub_url
        self.tag = tag
        logger.info("Initializing: KrThreads")

    def run(self):
        logger.info("Starting thread: %s", self.thread_id)
        act_url = self.url
        logger.info(act_url)
        html = HttpUtil.get_html(act_url)
        if html is None:
            return
# coding:utf-8
import base64
import json

from com.unif.chinaventure.ObtainVentureInfo import ObtainVentureInfo
from com.unif.util.DateUtil import DateUtil
from com.unif.util.HttpUtil import HttpUtil
from com.unif.util.LogUtil import LogUtil
from com.unif.vo.paramater import paramater

logger = LogUtil.get_logger('SaveVentureArticle')


# Saves crawled ChinaVenture articles
class SaveVentureArticle:
    def __init__(self):
        self.obtainInfo = ObtainVentureInfo()
        logger.info("Initializing: SaveVentureArticle")

    # Save one article
    def save_article(self, categoryName, url, imgurl):
        data = HttpUtil.get_html(url)
        if data is None:
            return True
        title = self.obtainInfo.find_title(data)
        authors = self.obtainInfo.find_author_info(data)
        context = self.obtainInfo.find_context(data)
        subject = self.obtainInfo.find_subject(data)
        tags = self.obtainInfo.find_tags(data)
        editor = self.obtainInfo.find_editor(context)
        author = ''
# coding:utf-8
import threading
import time

from com.unif.util.DateUtil import DateUtil
from com.unif.util.HttpUtil import HttpUtil
from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('JfzVideoThreads')


class JfzVideoThreads(threading.Thread):
    def __init__(self, thread_id, url, categoryName, tag, obtain, save):
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.url = url
        self.obtain = obtain
        self.save = save
        self.categoryName = categoryName
        self.tag = tag
        logger.info("Initializing: JfzVideoThreads")

    def run(self):
        logger.info("Starting thread: %s", self.thread_id)
        i = 0
        flag = True
        while True:
            i = i + 1
            act_url = self.url + str(i)
            logger.info(act_url)
            html = HttpUtil.get_html(act_url)
# coding:utf-8
import json
import urllib
import urllib.request as urllib2
from urllib import request

from lxml import etree

from com.unif.util.LogUtil import LogUtil
from com.unif.util.SendEmailUtil import SendEmailUtil

logger = LogUtil.get_logger('HttpUtil')


class HttpUtil:
    def __init__(self):
        logger.info("Initializing HttpUtil")

    @staticmethod
    def post(parameter):
        # interface_url = 'http://172.16.42.253:8080/publiccms/admin/cmsImport/reptile'  # Li Zhao's local machine
        interface_url = 'http://192.168.30.152:8095/publiccms/admin/cmsImport/reptile'  # dev environment
        logger.info('Request parameters: ' + str(parameter))
        url = interface_url
        # For a JSON payload
        parameter = json.dumps(parameter).encode(encoding='utf-8')
        # For ordinary form data
        # parameter = parse.urlencode(parameter).encode(encoding='utf-8')
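
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original source: the excerpt above is
# cut off before the request is sent, so this shows the standard urllib way
# to finish a JSON POST like HttpUtil.post(). The helper name, timeout, and
# return shape are assumptions, not the project's actual code.
# ---------------------------------------------------------------------------
import json
import urllib.request


def post_json(url, payload):
    body = json.dumps(payload).encode('utf-8')
    req = urllib.request.Request(url, data=body,
                                 headers={'Content-Type': 'application/json'})
    # Supplying data= makes urllib issue a POST instead of a GET
    with urllib.request.urlopen(req, timeout=10) as resp:
        return resp.read().decode('utf-8')
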
# coding:utf-8
from com.unif.jfz.JfzArticleThreads import JfzArticleThreads
from com.unif.jfz.JfzVideoThreads import JfzVideoThreads
from com.unif.jfz.ObtainJfzInfo import ObtainJfzInfo
from com.unif.jfz.SaveJfzArticle import SaveJfzArticle
from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('SpiderJfz')


class SpiderJfz:
    def __init__(self):
        logger.info("Initializing: SpiderJfz")

    # 1. Run the crawler for [articles]
    def executeSpiderArticle(self):
        obtain = ObtainJfzInfo()
        save = SaveJfzArticle()
        urls = {
            "https://v.jfz.com/item-4/": "资讯"  # "News"
        }
        i = 0
        threads = []
        for url, name in urls.items():
            # Spawn a new thread per listing URL
            i = i + 1
            categoryName = '资讯'
            thread1 = JfzArticleThreads("Thread-" + str(i), url, categoryName, name, obtain, save)
# coding:utf-8
import threading

from com.unif.util.HttpUtil import HttpUtil
from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('JfzArticleThreads')


class JfzArticleThreads(threading.Thread):
    def __init__(self, thread_id, url, categoryName, tag, obtain, save):
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.url = url
        self.obtain = obtain
        self.save = save
        self.categoryName = categoryName
        self.tag = tag
        logger.info("Initializing: JfzArticleThreads")

    def run(self):
        logger.info("Starting thread: %s", self.thread_id)
        i = 0
        flag = True
        while True:
            i = i + 1
            act_url = self.url + 'p' + str(i) + '.html'
            logger.info(act_url)
            html = HttpUtil.get_html(act_url)
            if html is None:
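
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original source: the pagination loop
# the *Threads classes share - bump a page counter, build the page URL, stop
# once the fetch helper returns None. crawl_pages and its parameters are
# hypothetical names; JfzArticleThreads builds its URLs exactly this way.
# ---------------------------------------------------------------------------
def crawl_pages(base_url, fetch, handle_page):
    i = 0
    while True:
        i = i + 1
        page_url = base_url + 'p' + str(i) + '.html'  # JFZ-style page numbering
        html = fetch(page_url)
        if html is None:  # fetch failed or pages ran out: end this crawl
            break
        handle_page(html)
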
# coding:utf-8
import threading
import time

from com.unif.util.HttpUtil import HttpUtil
from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('SimuwangThreads')


class SimuwangThreads(threading.Thread):
    def __init__(self, thread_id, url, categoryName, obtain, save):
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.url = url
        self.obtain = obtain
        self.save = save
        self.categoryName = categoryName
        logger.info("Initializing: SimuwangThreads")

    def run(self):
        logger.info("Starting thread: %s", self.thread_id)
        i = 0
        flag = True
        while True:
            i = i + 1
            act_url = self.url + "?page=" + str(i)
            logger.info(act_url)
            # First collect every article link on this listing page
# coding:utf-8
import time

from com.unif.chinaventure.SpiderVentureArticle import SpiderVentureArticle
from com.unif.jfz.SpiderJfz import SpiderJfz
from com.unif.kr.Spider36KrArticle import Spider36KrArticle
from com.unif.pedily.SpiderPeDailyArticle import SpiderPeDailyArticle
from com.unif.simuwang.SpiderSimuwangArticle import SpiderSimuwangArticle
from com.unif.util.LogUtil import LogUtil
from com.unif.util.SendEmailUtil import SendEmailUtil

logger = LogUtil.get_logger('Job')


class Job:
    def __init__(self):
        logger.info("Initializing Job")

    def execute(self):
        logger.info('Job starting....')
        logger.info('Job started successfully!')
        while True:
            # Refresh against the server clock
            current_time = time.strftime("%H:%M:%S", time.localtime())
            # --------------------------------------------------------------------------------------
            # 1. [投资界 / PEdaily] fixed daily run times
            if current_time == "12:10:00" or current_time == "18:00:00":
                logger.info('[投资界] crawler task starting....')
                SendEmailUtil.send_email('[投资界] crawler task starting', '[投资界] crawler task starting....')
                try:
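
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original source: the wall-clock
# scheduling pattern Job.execute() uses. The sleeps are assumptions (the
# excerpt is truncated); without them the tight loop would peg a CPU core
# and match the same trigger second many times over.
# ---------------------------------------------------------------------------
import time


def run_daily_at(trigger_times, task):
    while True:
        now = time.strftime("%H:%M:%S", time.localtime())
        if now in trigger_times:
            task()
            time.sleep(1)  # step past the trigger second so task fires once
        time.sleep(0.5)    # poll twice a second; every second is still observed
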
# coding:utf-8
import threading

from com.unif.util.HttpUtil import HttpUtil
from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('VentureThreads')


class VentureThreads(threading.Thread):
    def __init__(self, thread_id, url, categoryName, obtain, save, type):
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.url = url
        self.obtain = obtain
        self.save = save
        self.categoryName = categoryName
        self.type = type
        logger.info("Initializing: VentureThreads")

    def run(self):
        logger.info("Starting thread: %s", self.thread_id)
        i = 0
        flag = True
        while True:
            i = i + 1
            act_url = self.url + str(i) + '-10.shtml'
            logger.info(act_url)
            html = HttpUtil.get_html(act_url)
            if html is None:
# coding:utf-8
from com.unif.kr.KrThreads import KrThreads
from com.unif.kr.Obtain36KrInfo import Obtain36KrInfo
from com.unif.kr.SaveKrArticle import SaveKrArticle
from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('Spider36KrArticle')


class Spider36KrArticle:
    def __init__(self):
        logger.info("Initializing: Spider36KrArticle")

    # Run the crawler
    def executeSpider(self):
        obtain = Obtain36KrInfo()
        save = SaveKrArticle()
        urls = {
            "https://36kr.com/information/contact": "创投",       # venture capital
            "https://36kr.com/information/technology": "科技",    # technology
            "https://36kr.com/information/happy_life": "生活",    # lifestyle
            "https://36kr.com/information/web_zhichang": "职场",  # workplace
            "https://36kr.com/information/travel": "出行",        # mobility
            "https://36kr.com/information/innovate": "创新",      # innovation
            "https://36kr.com/information/real_estate": "房产",   # real estate
            "https://36kr.com/information/other": "其他"          # other
        }
        sub_url = [
            'https://36kr.com/pp/api/feed-stream?type=web&feed_id=305',
            'https://36kr.com/pp/api/feed-stream?type=web&feed_id=306',
# coding:utf-8
from com.unif.simuwang.ObtainSimuwangInfo import ObtainSimuwangInfo
from com.unif.simuwang.SaveSimuwangArticle import SaveSimuwangArticle
from com.unif.simuwang.SimuwangThreads import SimuwangThreads
from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('SpiderSimuwangArticle')


class SpiderSimuwangArticle:
    def __init__(self):
        logger.info("Initializing: SpiderSimuwangArticle")

    # Run the crawler
    def executeSpider(self):
        obtain = ObtainSimuwangInfo()
        urls = {
            'https://www.simuwang.com/news/lists.html': '资讯',  # "News"
        }
        save = SaveSimuwangArticle()
        i = 0
        threads = []
        for url, name in urls.items():
            # Spawn a new thread per listing URL
            i = i + 1
            thread1 = SimuwangThreads("Thread-" + str(i), url, name, obtain, save)
            # Start the new thread
# coding:utf-8
from com.unif.pedily.ObtainPeDailyInfo import ObtainPeDailyInfo
from com.unif.pedily.PeDailyThreads import PeDailyThreads
from com.unif.pedily.SavePeDailyArticle import SaveArticle
from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('SpiderPeDailyArticle')


class SpiderPeDailyArticle:
    def __init__(self):
        logger.info("Initializing: SpiderPeDailyArticle")

    # Run the crawler
    def executeSpider(self):
        obtain = ObtainPeDailyInfo()
        urls = {
            'https://pe.pedaily.cn/': '投资',       # investment
            'https://news.pedaily.cn/': '投资',     # investment
            'https://people.pedaily.cn/': '资讯',   # news
            'https://research.pedaily.cn/': '资讯'  # news
        }
        save = SaveArticle()
        i = 0
        threads = []
        for url, name in urls.items():
            # Spawn a new thread per listing URL
            i = i + 1
            thread1 = PeDailyThreads("Thread-" + str(i), url, name, obtain, save)
            # Start the new thread
# coding:utf-8
from com.unif.chinaventure.ObtainVentureInfo import ObtainVentureInfo
from com.unif.chinaventure.SaveVentureArticle import SaveVentureArticle
from com.unif.chinaventure.VentureThreads import VentureThreads
from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('SpiderVentureArticle')


class SpiderVentureArticle:
    def __init__(self):
        logger.info("Initializing: SpiderVentureArticle")

    # Run the crawler
    def executeSpider(self):
        obtain = ObtainVentureInfo()
        urls = {
            'https://www.chinaventure.com.cn/cmsmodel/news/jsonListByChannel/11/': 'VC/PE',
            'https://www.chinaventure.com.cn/cmsmodel/news/jsonListByChannel/3/': '瞰三板',     # NEEQ watch
            'https://www.chinaventure.com.cn/cmsmodel/news/jsonListByChannel/20/': '产业资本',  # industrial capital
            'https://www.chinaventure.com.cn/cmsmodel/news/jsonListByChannel/14/': '锐公司',    # companies
            'https://www.chinaventure.com.cn/cmsmodel/news/jsonListByChannel/5/': '金融',       # finance
            'https://www.chinaventure.com.cn/cmsmodel/news/jsonListByChannel/4/': '潮汛Hot',    # trending
            'https://www.chinaventure.com.cn/cmsmodel/news/jsonListByChannel/23/': '人物',      # people
            'https://www.chinaventure.com.cn/cmsmodel/report/jsonListBySearch/-1_-1_-1/': '研究院'  # research
        }
        save = SaveVentureArticle()
        i = 0
        threads = []
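
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original source: the fan-out pattern
# every Spider* class above follows - one worker thread per (url, category)
# pair, started together and joined before returning. The start/join tail is
# an assumption, since each excerpt is truncated right after thread creation;
# fan_out and make_worker are hypothetical names.
# ---------------------------------------------------------------------------
def fan_out(urls, make_worker):
    threads = []
    i = 0
    for url, name in urls.items():
        i = i + 1
        t = make_worker("Thread-" + str(i), url, name)  # returns a threading.Thread
        t.start()
        threads.append(t)
    for t in threads:
        t.join()  # wait until every category crawl has finished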