Example #1
0
# coding:utf-8
import base64
import json
import time

from com.unif.simuwang.ObtainSimuwangInfo import ObtainSimuwangInfo
from com.unif.util.DateUtil import DateUtil
from com.unif.util.HttpUtil import HttpUtil
from com.unif.util.LogUtil import LogUtil
from com.unif.vo.paramater import paramater

logger = LogUtil.get_logger('SaveSimuwangArticle')


# 保存文章
class SaveSimuwangArticle:
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        'Opera/8.0 (Windows NT 5.1; U; en)',
        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
# coding:utf-8

import time

from com.unif.util.LogUtil import LogUtil
from com.unif.util.SendEmailUtil import SendEmailUtil

logger = LogUtil.get_logger('DateUtil')


class DateUtil:
    def __init__(self):
        """Log that the date utility was instantiated; no state is set up."""
        logger.info('初始化日期工具类')

    # 入参: '2019年01月09日  14:02:00'
    @staticmethod
    def time_transfer(publish_time):
        array = time.strptime(publish_time, u"%Y年%m月%d日 %H:%M:%S")
        try:
            date_time = time.strftime("%Y-%m-%d %H:%M:%S", array)
        except Exception as e:
            logger.error(e)
            SendEmailUtil.send_email('带年月日时间转换异常', e)
        return date_time

    # 为避免时间格式等问题,只针对确切时间比
    @staticmethod
    def verify_time(time_str):
        if time_str is None:
            return True
        try:
Example #3
0
# coding:utf-8

from bs4 import BeautifulSoup

from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('ObtainVentureInfo')


class ObtainVentureInfo:
    def __init__(self):
        """Log that the instance was created; no state is initialised."""
        logger.info("初始化:ObtainVentureInfo")

    # 获取标题
    def find_title(self, data):
        soup = BeautifulSoup(data, 'html.parser', from_encoding='utf-8')
        title_info = soup.find_all('h1', class_='h1_01')

        if title_info is None:
            return '无题'
        if len(title_info) == 0:
            return '无题'

        title = title_info[0].attrs['title']

        if title is None:
            return '无题'

        # result = eval(repr(title).replace('\\', ''))
        # result = eval(repr(result).replace('/', ''))
        # result = eval(repr(result).replace('*', ''))
Example #4
0
# coding:utf-8
import re  # 正则表达式

from bs4 import BeautifulSoup

from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('ObtainPeDailyInfo')


class ObtainPeDailyInfo:
    def __init__(self):
        """Log that the instance was created; no state is initialised."""
        logger.info("初始化:ObtainPeDailyInfo")

    # 获取标题
    def find_title(self, data):
        soup = BeautifulSoup(data, 'html.parser', from_encoding='utf-8')
        title_info = soup.find_all('div', class_='main final-content')

        if title_info is None:
            return '无题'
        if len(title_info) == 0:
            return '无题'

        title = title_info[0].attrs['data-title']

        if title is None:
            return '无题'

        result = eval(repr(title).replace('\\', ''))
        result = eval(repr(result).replace('/', ''))
# coding:utf-8

import datetime
import json
import re

from bs4 import BeautifulSoup

from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('Obtain36KrInfo')


class Obtain36KrInfo:
    def __init__(self):
        """Log that the instance was created; no state is initialised."""
        logger.info("初始化:Obtain36KrInfo")

    # 获取标题
    def find_title(self, data):
        """Return the text of the page's first ``<h1>``.

        Args:
            data: HTML markup handed to BeautifulSoup.

        Returns:
            ``soup.h1.string`` for the first ``<h1>`` found, or ``'无题'``
            when the document has no ``<h1>`` at all.
        """
        heading = BeautifulSoup(data, 'html.parser', from_encoding='utf-8').h1
        return heading.string if heading is not None else '无题'

    # 获取分页列表
    def find_pages1(self, data):
        result = {}
        soup = BeautifulSoup(data, 'html.parser', from_encoding='utf-8')
        content = soup.find_all('script')
        if content is None:
Example #6
0
# coding:utf-8
import base64
import json

from com.unif.kr.Obtain36KrInfo import Obtain36KrInfo
from com.unif.util.DateUtil import DateUtil
from com.unif.util.HttpUtil import HttpUtil
from com.unif.util.LogUtil import LogUtil
from com.unif.vo.paramater import paramater

logger = LogUtil.get_logger('SaveKrArticle')


# 保存文章
class SaveKrArticle:
    def __init__(self):
        """Create the ``Obtain36KrInfo`` helper used by ``save_article`` and log start-up."""
        # Parser object that save_article queries for title, author, body, etc.
        self.obtainInfo = Obtain36KrInfo()
        logger.info("初始化:SaveKrArticle")

    # 保存文章
    def save_article(self, categoryName, tag, url, imgurl):
        data = HttpUtil.get_html(url)
        if data is None:
            return True
        title = self.obtainInfo.find_title(data)
        authors = self.obtainInfo.find_author_info(data)
        context = self.obtainInfo.find_context(data)
        subject = self.obtainInfo.find_subject(data)
        tags = tag
        author = ''
        public_time = ''
Example #7
0
# coding:utf-8
import base64
import json

from com.unif.jfz.ObtainJfzInfo import ObtainJfzInfo
from com.unif.util.DateUtil import DateUtil
from com.unif.util.HttpUtil import HttpUtil
from com.unif.util.LogUtil import LogUtil
from com.unif.vo.paramater import paramater

logger = LogUtil.get_logger('SaveJfzArticle')


# 保存文章
class SaveJfzArticle:

    def __init__(self):
        """Create the ``ObtainJfzInfo`` helper used by ``save_article`` and log start-up."""
        # Parser object that save_article queries for title, author and body.
        self.obtainInfo = ObtainJfzInfo()
        logger.info("初始化:SaveJfzArticle")

    # 保存文章
    def save_article(self, categoryName, tag, url, desc):
        data = HttpUtil.get_html(url)
        if data is None:
            return True
        title = self.obtainInfo.find_title(data)
        authors = self.obtainInfo.find_author_info(data)
        context = self.obtainInfo.find_context(data)
        subject = desc
        tags = tag
        author = ''
# coding:utf-8
import datetime

from bs4 import BeautifulSoup

from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('ObtainSimuwangInfo')


class ObtainSimuwangInfo:
    def __init__(self):
        """Log that the instance was created; no state is initialised."""
        logger.info("初始化:ObtainSimuwangInfo")

    def get_soup_obj(self, html_str):
        """Parse ``html_str`` with the built-in ``html.parser`` and return the soup.

        Args:
            html_str: HTML markup to parse.

        Returns:
            The ``BeautifulSoup`` object built from ``html_str``.
        """
        soup = BeautifulSoup(html_str, 'html.parser', from_encoding='utf-8')
        return soup

    def get_title(self, data):
        """Return the article title from a parsed page.

        Args:
            data: Object exposing ``find(name, class_=...)``; presumably the
                soup returned by ``get_soup_obj`` - verify against caller.

        Returns:
            The ``.string`` of the ``<h1>`` inside the first
            ``<div class="article-header">``.
        """
        header = data.find('div', class_='article-header')  # title container
        heading = header.h1
        return heading.string

    def get_time(self, data):
        """Return the article's publication time as text.

        Args:
            data: Object exposing ``find(name, class_=...)``; presumably the
                soup returned by ``get_soup_obj`` - verify against caller.

        Returns:
            The text of the first ``<span class="time">``. When that text is
            exactly four characters long, the current local time formatted as
            ``YYYY-MM-DD HH:MM:SS`` is returned instead.
        """
        stamp = data.find('span', class_='time').string  # publication time
        # NOTE(review): a 4-character value is presumably a relative label
        # rather than a real timestamp - TODO confirm against the site.
        if len(stamp) != 4:
            return stamp
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # now

    def get_desc(self, data):
        article_desc_obj = data.find('meta', attrs={'name':
                                                    'Description'})  # 描述/简介
# coding:utf-8
import threading

from com.unif.util.HttpUtil import HttpUtil
from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('PeDailyThreads')


class PeDailyThreads(threading.Thread):
    def __init__(self, thread_id, url, categoryName, obtain, save):
        """Initialise the thread and remember its crawl settings.

        Args:
            thread_id: Label for this worker; ``run`` logs it.
            url: Base URL; ``run`` appends a page counter to it.
            categoryName: Category name kept on ``self.categoryName``.
            obtain: Helper object kept on ``self.obtain``.
            save: Helper object kept on ``self.save``.
        """
        super().__init__()
        self.thread_id, self.url = thread_id, url
        self.obtain, self.save = obtain, save
        self.categoryName = categoryName
        logger.info("初始化:PeDailyThreads")

    def run(self):
        logger.info("开始线程:", self.thread_id)

        i = 0
        flag = True
        while True:
            i = i + 1
            act_url = self.url + str(i)
            logger.info(act_url)
            html = HttpUtil.get_html(act_url)

            if html is None:
Example #10
0
# coding:utf-8

from bs4 import BeautifulSoup

from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('ObtainJfzInfo')


class ObtainJfzInfo:
    def __init__(self):
        """Log that the instance was created; no state is initialised."""
        logger.info("初始化:ObtainJfzInfo")

    # 获取标题
    def find_title(self, data):
        """Extract the article title from a page.

        Looks up the first ``<div class="title">`` and reads the text of its
        first ``<span>``.

        Args:
            data: HTML markup handed to BeautifulSoup.

        Returns:
            The span's ``.string`` value, or ``'无题'`` when either the div
            or its span is missing.
        """
        soup = BeautifulSoup(data, 'html.parser', from_encoding='utf-8')
        title_div = soup.find('div', class_='title')
        # find() returns None when nothing matches; reading .span from that
        # would raise AttributeError before the fallback could apply.
        if title_div is None:
            return '无题'
        title_span = title_div.span
        if title_span is None:
            return '无题'
        return title_span.string

    # 获取文章分页列表
    def find_pages(self, data):
        result = {}
        soup = BeautifulSoup(data, 'html.parser', from_encoding='utf-8')
        content = soup.find('div', class_='article-list').find_all('div', class_='con con-description')
        if content is None:
            return result
        for v in content:
            url = 'https://v.jfz.com' + v.a.attrs['href']
Example #11
0
# coding:utf-8
import threading

from com.unif.util.HttpUtil import HttpUtil
from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('KrThreads')


class KrThreads(threading.Thread):
    def __init__(self, thread_id, url, sub_url, categoryName, tag, obtain,
                 save):
        """Initialise the thread and remember its crawl settings.

        Args:
            thread_id: Label for this worker; ``run`` logs it.
            url: Page URL that ``run`` fetches.
            sub_url: Value kept on ``self.sub_url``.
            categoryName: Category name kept on ``self.categoryName``.
            tag: Value kept on ``self.tag``.
            obtain: Helper object kept on ``self.obtain``.
            save: Helper object kept on ``self.save``.
        """
        super().__init__()
        self.thread_id, self.url, self.sub_url = thread_id, url, sub_url
        self.obtain, self.save = obtain, save
        self.categoryName, self.tag = categoryName, tag
        logger.info("初始化:KrThreads")

    def run(self):
        logger.info("开始线程:", self.thread_id)
        act_url = self.url
        logger.info(act_url)
        html = HttpUtil.get_html(act_url)

        if html is None:
            return
Example #12
0
# coding:utf-8
import base64
import json

from com.unif.chinaventure.ObtainVentureInfo import ObtainVentureInfo
from com.unif.util.DateUtil import DateUtil
from com.unif.util.HttpUtil import HttpUtil
from com.unif.util.LogUtil import LogUtil
from com.unif.vo.paramater import paramater

logger = LogUtil.get_logger('SaveVentureArticle')


# 保存文章
class SaveVentureArticle:
    def __init__(self):
        """Create the ``ObtainVentureInfo`` helper used by ``save_article`` and log start-up."""
        # Parser object that save_article queries for title, author, body, tags, etc.
        self.obtainInfo = ObtainVentureInfo()
        logger.info("初始化:SaveVentureArticle")

    # 保存文章
    def save_article(self, categoryName, url, imgurl):
        data = HttpUtil.get_html(url)
        if data is None:
            return True
        title = self.obtainInfo.find_title(data)
        authors = self.obtainInfo.find_author_info(data)
        context = self.obtainInfo.find_context(data)
        subject = self.obtainInfo.find_subject(data)
        tags = self.obtainInfo.find_tags(data)
        editor = self.obtainInfo.find_editor(context)
        author = ''
Example #13
0
# coding:utf-8
import threading
import time

from com.unif.util.DateUtil import DateUtil
from com.unif.util.HttpUtil import HttpUtil
from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('JfzVideoThreads')


class JfzVideoThreads(threading.Thread):
    def __init__(self, thread_id, url, categoryName, tag, obtain, save):
        """Initialise the thread and remember its crawl settings.

        Args:
            thread_id: Label for this worker; ``run`` logs it.
            url: Base URL; ``run`` appends a page counter to it.
            categoryName: Category name kept on ``self.categoryName``.
            tag: Value kept on ``self.tag``.
            obtain: Helper object kept on ``self.obtain``.
            save: Helper object kept on ``self.save``.
        """
        super().__init__()
        self.thread_id, self.url = thread_id, url
        self.obtain, self.save = obtain, save
        self.categoryName, self.tag = categoryName, tag
        logger.info("初始化:JfzVideoThreads")

    def run(self):
        logger.info("开始线程:", self.thread_id)
        i = 0
        flag = True
        while True:
            i = i + 1
            act_url = self.url + str(i)
            logger.info(act_url)
            html = HttpUtil.get_html(act_url)
# coding:utf-8

import json
import urllib
import urllib.request as urllib2
from urllib import request

from lxml import etree

from com.unif.util.LogUtil import LogUtil
from com.unif.util.SendEmailUtil import SendEmailUtil

logger = LogUtil.get_logger('HttpUtil')


class HttpUtil:
    def __init__(self):
        """Log that the HTTP utility was instantiated; no state is set up."""
        logger.info("初始化HttpUtil")

    @staticmethod
    def post(parameter):
        # interface_url = 'http://172.16.42.253:8080/publiccms/admin/cmsImport/reptile' # 李钊本地
        interface_url = 'http://192.168.30.152:8095/publiccms/admin/cmsImport/reptile'  # 开发环境

        logger.info('入参:' + str(parameter))
        url = interface_url
        # json串数据使用
        parameter = json.dumps(parameter).encode(encoding='utf-8')
        # 普通数据使用
        # parameter = parse.urlencode(parameter).encode(encoding='utf-8')
# coding:utf-8
from com.unif.jfz.JfzArticleThreads import JfzArticleThreads
from com.unif.jfz.JfzVideoThreads import JfzVideoThreads
from com.unif.jfz.ObtainJfzInfo import ObtainJfzInfo
from com.unif.jfz.SaveJfzArticle import SaveJfzArticle
from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('SpiderJfz')


class SpiderJfz:
    def __init__(self):
        """Log that the instance was created; no state is initialised."""
        logger.info("初始化:SpiderJfz")

    # 1.执行爬虫,爬取【文章】信息
    def executeSpiderArticle(self):
        obtain = ObtainJfzInfo()
        save = SaveJfzArticle()

        urls = {
            "https://v.jfz.com/item-4/": "资讯"
        }

        i = 0
        threads = []
        for url, name in urls.items():
            # 创建新线程
            i = i + 1
            categoryName = '资讯'

            thread1 = JfzArticleThreads("Thread-" + str(i), url, categoryName, name, obtain, save)
Example #16
0
# coding:utf-8
import threading

from com.unif.util.HttpUtil import HttpUtil
from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('JfzArticleThreads')


class JfzArticleThreads(threading.Thread):
    def __init__(self, thread_id, url, categoryName, tag, obtain, save):
        """Initialise the thread and remember its crawl settings.

        Args:
            thread_id: Label for this worker; ``run`` logs it.
            url: Base URL; ``run`` appends ``'p<N>.html'`` to it per page.
            categoryName: Category name kept on ``self.categoryName``.
            tag: Value kept on ``self.tag``.
            obtain: Helper object kept on ``self.obtain``.
            save: Helper object kept on ``self.save``.
        """
        super().__init__()
        self.thread_id, self.url = thread_id, url
        self.obtain, self.save = obtain, save
        self.categoryName, self.tag = categoryName, tag
        logger.info("初始化:JfzArticleThreads")

    def run(self):
        logger.info("开始线程:", self.thread_id)
        i = 0
        flag = True
        while True:
            i = i + 1
            act_url = self.url + 'p' + str(i) + '.html'
            logger.info(act_url)
            html = HttpUtil.get_html(act_url)

            if html is None:
Example #17
0
# coding:utf-8

import threading
import time

from com.unif.util.HttpUtil import HttpUtil
from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('SimuwangThreads')


class SimuwangThreads(threading.Thread):

    def __init__(self, thread_id, url, categoryName, obtain, save):
        """Initialise the thread and remember its crawl settings.

        Args:
            thread_id: Label for this worker; ``run`` logs it.
            url: Base URL; ``run`` appends ``'?page=<N>'`` to it per page.
            categoryName: Category name kept on ``self.categoryName``.
            obtain: Helper object kept on ``self.obtain``.
            save: Helper object kept on ``self.save``.
        """
        super().__init__()
        self.thread_id, self.url = thread_id, url
        self.obtain, self.save = obtain, save
        self.categoryName = categoryName
        logger.info("初始化:SimuwangThreads")

    def run(self):
        logger.info("开始线程:", self.thread_id)

        i = 0
        flag = True
        while True:
            i = i + 1
            act_url = self.url + "?page=" + str(i)
            logger.info(act_url)  # 这里是先拿到这个界面所有的链接
Example #18
0
# coding:utf-8
import time

from com.unif.chinaventure.SpiderVentureArticle import SpiderVentureArticle
from com.unif.jfz.SpiderJfz import SpiderJfz
from com.unif.kr.Spider36KrArticle import Spider36KrArticle
from com.unif.pedily.SpiderPeDailyArticle import SpiderPeDailyArticle
from com.unif.simuwang.SpiderSimuwangArticle import SpiderSimuwangArticle
from com.unif.util.LogUtil import LogUtil
from com.unif.util.SendEmailUtil import SendEmailUtil

logger = LogUtil.get_logger('Job')


class Job:
    def __init__(self):
        """Log that the job object was instantiated; no state is set up."""
        logger.info("初始化Job")

    def execute(self):
        logger.info('Job启动中....')
        logger.info('Job启动成功!')
        while True:
            # 刷新服务器时间
            current_time = time.strftime("%H:%M:%S", time.localtime())

            # --------------------------------------------------------------------------------------
            # 1、【投资界】设置每天定时的时间
            if current_time == "12:10:00" or current_time == "18:00:00":
                logger.info('【投资界】爬虫任务开始执行....')
                SendEmailUtil.send_email('【投资界】爬虫任务开始执行', '【投资界】爬虫任务开始执行....')
                try:
# coding:utf-8
import threading

from com.unif.util.HttpUtil import HttpUtil
from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('VentureThreads')


class VentureThreads(threading.Thread):
    def __init__(self, thread_id, url, categoryName, obtain, save, type):
        """Initialise the thread and remember its crawl settings.

        Args:
            thread_id: Label for this worker; ``run`` logs it.
            url: Base URL; ``run`` appends ``'<N>-10.shtml'`` to it per page.
            categoryName: Category name kept on ``self.categoryName``.
            obtain: Helper object kept on ``self.obtain``.
            save: Helper object kept on ``self.save``.
            type: Value kept on ``self.type``. The name shadows the builtin
                but is part of the call signature, so it stays as is.
        """
        super().__init__()
        self.thread_id, self.url = thread_id, url
        self.obtain, self.save = obtain, save
        self.categoryName, self.type = categoryName, type
        logger.info("初始化:VentureThreads")

    def run(self):
        logger.info("开始线程:", self.thread_id)

        i = 0
        flag = True
        while True:
            i = i + 1
            act_url = self.url + str(i) + '-10.shtml'
            logger.info(act_url)
            html = HttpUtil.get_html(act_url)
            if html is None:
Example #20
0
# coding:utf-8
from com.unif.kr.KrThreads import KrThreads
from com.unif.kr.Obtain36KrInfo import Obtain36KrInfo
from com.unif.kr.SaveKrArticle import SaveKrArticle
from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('Spider36KrArticle')


class Spider36KrArticle:
    def __init__(self):
        """Log that the instance was created; no state is initialised."""
        logger.info("初始化:Spider36KrArticle")

    # 执行爬虫
    def executeSpider(self):
        obtain = Obtain36KrInfo()
        save = SaveKrArticle()

        urls = {
            "https://36kr.com/information/contact": "创投",
            "https://36kr.com/information/technology": "科技",
            "https://36kr.com/information/happy_life": "生活",
            "https://36kr.com/information/web_zhichang": "职场",
            "https://36kr.com/information/travel": "出行",
            "https://36kr.com/information/innovate": "创新",
            "https://36kr.com/information/real_estate": "房产",
            "https://36kr.com/information/other": "其他"
        }
        sub_url = [
            'https://36kr.com/pp/api/feed-stream?type=web&feed_id=305',
            'https://36kr.com/pp/api/feed-stream?type=web&feed_id=306',
Example #21
0
# coding:utf-8

from com.unif.simuwang.ObtainSimuwangInfo import ObtainSimuwangInfo
from com.unif.simuwang.SaveSimuwangArticle import SaveSimuwangArticle
from com.unif.simuwang.SimuwangThreads import SimuwangThreads
from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('SpiderSimuwangArticle')


class SpiderSimuwangArticle:
    def __init__(self):
        """Log that the instance was created; no state is initialised."""
        logger.info("初始化:SpiderSimuwangArticle")

    # 执行爬虫
    def executeSpider(self):
        obtain = ObtainSimuwangInfo()
        urls = {
            'https://www.simuwang.com/news/lists.html': '资讯',
        }
        save = SaveSimuwangArticle()

        i = 0
        threads = []
        for url, name in urls.items():
            # 创建新线程
            i = i + 1
            thread1 = SimuwangThreads("Thread-" + str(i), url, name, obtain,
                                      save)

            # 开启新线程
Example #22
0
# coding:utf-8
from com.unif.pedily.ObtainPeDailyInfo import ObtainPeDailyInfo
from com.unif.pedily.PeDailyThreads import PeDailyThreads
from com.unif.pedily.SavePeDailyArticle import SaveArticle
from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('SpiderPeDailyArticle')


class SpiderPeDailyArticle:
    def __init__(self):
        """Log that the instance was created; no state is initialised."""
        logger.info("初始化:SpiderPeDailyArticle")  # stray label "run the spider"; it describes executeSpider below

    def executeSpider(self):
        obtain = ObtainPeDailyInfo()
        urls = {
            'https://pe.pedaily.cn/': '投资',
            'https://news.pedaily.cn/': '投资',
            'https://people.pedaily.cn/': '资讯',
            'https://research.pedaily.cn/': '资讯'
        }
        save = SaveArticle()

        i = 0
        threads = []
        for url, name in urls.items():
            # 创建新线程
            i = i + 1
            thread1 = PeDailyThreads("Thread-" + str(i), url, name, obtain, save)

            # 开启新线程
# coding:utf-8
from com.unif.chinaventure.ObtainVentureInfo import ObtainVentureInfo
from com.unif.chinaventure.SaveVentureArticle import SaveVentureArticle
from com.unif.chinaventure.VentureThreads import VentureThreads
from com.unif.util.LogUtil import LogUtil

logger = LogUtil.get_logger('SpiderVentureArticle')


class SpiderVentureArticle:
    def __init__(self):
        """Log that the instance was created; no state is initialised."""
        logger.info("初始化:SpiderVentureArticle")

    # 执行爬虫
    def executeSpider(self):
        obtain = ObtainVentureInfo()
        urls = {
            'https://www.chinaventure.com.cn/cmsmodel/news/jsonListByChannel/11/': 'VC/PE',
            'https://www.chinaventure.com.cn/cmsmodel/news/jsonListByChannel/3/': '瞰三板',
            'https://www.chinaventure.com.cn/cmsmodel/news/jsonListByChannel/20/': '产业资本',
            'https://www.chinaventure.com.cn/cmsmodel/news/jsonListByChannel/14/': '锐公司',
            'https://www.chinaventure.com.cn/cmsmodel/news/jsonListByChannel/5/': '金融',
            'https://www.chinaventure.com.cn/cmsmodel/news/jsonListByChannel/4/': '潮汛Hot',
            'https://www.chinaventure.com.cn/cmsmodel/news/jsonListByChannel/23/': '人物',
            'https://www.chinaventure.com.cn/cmsmodel/report/jsonListBySearch/-1_-1_-1/': '研究院'
        }

        save = SaveVentureArticle()

        i = 0
        threads = []