def __init__(self, tab_urls):
    super(Collector, self).__init__()
    self._lock = threading.RLock()
    self._db = MongoDB()
    self._thread_stop = False
    self._urls = []
    self._null_times = 0
    self._read_pos = -1
    self._write_pos = -1
    self._tab_urls = tab_urls
    self._depth = int(
        tools.get_conf_value('config.conf', "collector", "depth"))
    self._max_size = int(
        tools.get_conf_value('config.conf', "collector", "max_size"))
    self._interval = int(
        tools.get_conf_value('config.conf', "collector", "sleep_time"))
    self._allowed_null_times = int(
        tools.get_conf_value('config.conf', "collector", 'allowed_null_times'))
    self._url_count = int(
        tools.get_conf_value('config.conf', "collector", "url_count"))

    # On startup, reset tasks that were left in DOING state back to TODO
    self._db.update(self._tab_urls, {'status': Constance.DOING},
                    {'status': Constance.TODO})

    self._finished_callback = None
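# Note: every component in this section reads settings through
# tools.get_conf_value, whose body is not shown here. A minimal stand-in
# sketch, assuming config.conf is an INI-style file readable by the
# standard-library configparser (the function name matches; the body is assumed):
import configparser

def get_conf_value(config_file, section, key):
    # Parse the INI file and return the raw string for [section] key;
    # callers cast to int where needed, as in the constructors above.
    parser = configparser.ConfigParser()
    parser.read(config_file, encoding='utf-8')
    return parser.get(section, key)

# e.g. depth = int(get_conf_value('config.conf', 'collector', 'depth'))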
def __init__(self, tab_urls, depth, process_num=None):
    '''
    @summary:
    ---------
    @param tab_urls:
    @param depth:
    @param process_num: process number
    ---------
    @result:
    '''
    super(Collector, self).__init__()
    self._db = RedisDB()
    self._thread_stop = False
    self._urls = collections.deque()
    self._null_times = 0
    self._tab_urls = tab_urls
    self._depth = depth  # or int(tools.get_conf_value('config.conf', "collector", "depth"))
    self._interval = int(tools.get_conf_value('config.conf', "collector", "sleep_time"))
    self._allowed_null_times = int(tools.get_conf_value('config.conf', "collector", 'allowed_null_times'))
    self._url_count = int(tools.get_conf_value('config.conf', "collector", "url_count"))
    self._url_manager = UrlManager(tab_urls)
    self._finished_callback = None
    self._is_show_wait = False
    self._tab_worker_status = 'news:worker_status'
    self._worker_mark = LOCAL_HOST_IP + ('_%s' % process_num if process_num else '')
def __init__(self, collector, tab_urls):
    super(PaserControl, self).__init__()
    self._parsers = []
    self._collector = collector
    self._urlCount = int(tools.get_conf_value('config.conf', "parser", "url_count"))
    self._interval = int(tools.get_conf_value('config.conf', "parser", "sleep_time"))
    self._tab_urls = tab_urls
def __init__(self, tab_urls, tab_site='', tab_content='', parser_count=None,
             parser_params={}, begin_callback=None, end_callback=None,
             content_unique_key='url', delete_tab_urls=False):
    '''
    @summary:
    ---------
    @param tab_urls: url table name
    @param tab_site: site table name
    @param tab_content: content table name
    @param parser_count: number of parser threads; falls back to the config file when empty
    @param parser_params: parameters passed to the parsers
    @param begin_callback: callback invoked when the spider starts
    @param end_callback: callback invoked when the spider finishes
    @param content_unique_key: unique key of the content table
    @param delete_tab_urls: drop the url table before starting
    ---------
    @result:
    '''
    super(Spider, self).__init__()
    self._tab_urls = tab_urls
    self._db = MongoDB()
    if delete_tab_urls:
        self._db.delete(tab_urls)

    self._db.set_unique_key(tab_urls, 'url')
    if tab_site:
        self._db.set_unique_key(tab_site, 'site_id')
    if tab_content:
        self._db.set_unique_key(tab_content, content_unique_key)

    # Create indexes to speed up queries
    self._db.set_ensure_index(tab_urls, 'depth')
    self._db.set_ensure_index(tab_urls, 'status')
    if tab_site:
        self._db.set_ensure_index(tab_site, 'read_status')
    if tab_content:
        self._db.set_ensure_index(tab_content, 'read_status')

    self._collector = Collector(tab_urls)
    self._parsers = []
    self._parser_params = parser_params
    self._begin_callback = begin_callback
    self._end_callabck = end_callback
    self._parser_count = parser_count or int(
        tools.get_conf_value('config.conf', 'parser', 'parser_count'))
    self._spider_site_name = tools.get_conf_value(
        'config.conf', "spider_site", "spider_site_name").split(',')
    self._except_site_name = tools.get_conf_value(
        'config.conf', "spider_site", "except_site_name").split(',')
def monitor_proxies():
    redis_0 = RedisDB()
    config = os.path.join(os.path.dirname(__file__) + '/../config.conf')
    redis_key = tools.get_conf_value(config, 'redis', 'redis_key')
    redis_key2 = tools.get_conf_value(config, 'redis', 'redis_key2')
    # avoid shadowing the builtin sum()
    douban_count = redis_0.count(redis_key)
    weibo_count = redis_0.count(redis_key2)
    log.debug("douban: %d proxy IPs left in redis" % douban_count)
    log.debug("weibo: %d proxy IPs left in redis" % weibo_count)
def __init__(self, tab_urls, tab_site, tab_content, parser_count=None,
             search_keyword1=[], search_keyword2=[], search_keyword3=[],
             begin_callback=None, end_callback=None, content_unique_key=None):
    '''
    @summary:
    ---------
    @param tab_urls: url table name
    @param tab_site: site table name
    @param parser_count: number of parser threads; falls back to the config file when empty
    @param search_keyword1: search keywords (list); all must be contained
    @param search_keyword2: search keywords (list); at least one must be contained
    @param search_keyword3: search keywords (list); none may be contained
    @param begin_callback: callback invoked when the spider starts
    @param end_callback: callback invoked when the spider finishes
    ---------
    @result:
    '''
    super(Spider, self).__init__()
    self._tab_urls = tab_urls
    self._db = MongoDB()
    self._db.set_unique_key(tab_urls, 'url')
    self._db.set_unique_key(tab_site, 'site_id')
    self._db.set_unique_key(
        tab_content, 'url' if not content_unique_key else content_unique_key)
    self._collector = Collector(tab_urls)
    self._parsers = []
    self._search_keyword1 = search_keyword1
    self._search_keyword2 = search_keyword2
    self._search_keyword3 = search_keyword3
    self._begin_callback = begin_callback
    self._end_callabck = end_callback
    self._parser_count = parser_count or int(
        tools.get_conf_value('config.conf', 'parser', 'parser_count'))
    self._spider_site_name = tools.get_conf_value(
        'config.conf', "spider_site", "spider_site_name").split(',')
    self._except_site_name = tools.get_conf_value(
        'config.conf', "spider_site", "except_site_name").split(',')
def __init__(self, tab_list, tab_unique_key_list, tab_ensure_index_list,
             parser_count=None, site_parsers=None, parser_params={},
             begin_callback=None, end_callback=None, delete_tab_urls=False):
    '''
    @summary:
    ---------
    @param tab_list: table names; the first entry is the url table
    @param tab_unique_key_list: unique key for each table in tab_list
    @param tab_ensure_index_list: index fields for each table in tab_list
    @param parser_count: number of parser threads; falls back to the config file when empty
    @param site_parsers: parsers bound to specific sites
    @param parser_params: parameters passed to the parsers
    @param begin_callback: callback invoked when the spider starts
    @param end_callback: callback invoked when the spider finishes
    @param delete_tab_urls: drop the url table before starting
    ---------
    @result:
    '''
    super(Spider, self).__init__()
    self._db = MongoDB()
    self._tab_urls = tab_list[0]
    if delete_tab_urls:
        self._db.delete(self._tab_urls)

    self._site_parsers = site_parsers

    for tab_index in range(len(tab_list)):
        self._db.set_unique_key(tab_list[tab_index],
                                tab_unique_key_list[tab_index])
        # Create indexes to speed up queries
        for ensure_index in tab_ensure_index_list[tab_index]:
            self._db.set_ensure_index(tab_list[tab_index], ensure_index)

    self._collector = Collector(self._tab_urls, self._site_parsers)
    self._parsers = []
    self._parser_params = parser_params
    self._begin_callback = begin_callback
    self._end_callabck = end_callback
    self._parser_count = parser_count or int(
        tools.get_conf_value('config.conf', 'parser', 'parser_count'))
    self._spider_site_name = tools.get_conf_value(
        'config.conf', "spider_site", "spider_site_name").split(',')
    self._except_site_name = tools.get_conf_value(
        'config.conf', "spider_site", "except_site_name").split(',')
def __init__(self, collector, tab_images):
    super(ImagePornControl, self).__init__()
    self._collector = collector
    self._tab_images = tab_images
    self._deal_image_count = int(
        tools.get_conf_value('../config.conf', "image_porn", "deal_image_count"))
    self._interval = int(
        tools.get_conf_value('../config.conf', "image_porn", "sleep_time"))
    self._db = MongoDB()
    self._image_porn_recg = ImagePornRecg()
def __init__(self, tab_urls, depth):
    super(Collector, self).__init__()
    self._db = RedisDB()
    self._thread_stop = False
    self._urls = collections.deque()
    self._null_times = 0
    self._tab_urls = tab_urls
    self._depth = depth  # or int(tools.get_conf_value('config.conf', "collector", "depth"))
    self._interval = int(
        tools.get_conf_value('config.conf', "collector", "sleep_time"))
    self._allowed_null_times = int(
        tools.get_conf_value('config.conf', "collector", 'allowed_null_times'))
    self._url_count = int(
        tools.get_conf_value('config.conf', "collector", "url_count"))
    self._url_manager = UrlManager(tab_urls)
    self._finished_callback = None
    self._is_show_wait = False
def main():
    search_task_sleep_time = int(
        tools.get_conf_value('config.conf', 'task', 'search_task_sleep_time'))

    # Reset task status: set in-progress tasks back to waiting
    while True:
        # Query the task status; if a task is in progress, sleep and continue
        # TODO
        search_keyword1 = ['hi']
        search_keyword2 = ['hello']
        search_keyword3 = ['hello, hi']
        task_id = 1

        # If the task is empty, sleep and continue
        # TODO

        def begin_callback():
            log.info('\n********** template begin **********')
            # Update task status to doing

        def end_callback():
            log.info('\n********** template end **********')
            # Update task status to done
            # Export data
            # export_data = ExportData(source_table = '', aim_table = '', key_map = '', unique_key = '')
            # export_data.export_to_oracle()

        # Configure the spider
        # spider = Spider(tab_urls = 'template_urls', tab_site = 'template_site_info', tab_content = '', parser_count = 1, begin_callback = begin_callback, end_callback = end_callback)
        spider = Spider(tab_urls='template_urls',
                        tab_site='template_site_info',
                        tab_content='template_content_info',
                        parser_count=1,
                        begin_callback=begin_callback,
                        end_callback=end_callback,
                        search_keyword1=search_keyword1,
                        search_keyword2=search_keyword2,
                        search_keyword3=search_keyword3)

        # Register parsers
        spider.add_parser(xxx_parser)
        spider.add_parser(yyy_parser)
        spider.start()

        # time.sleep(search_task_sleep_time)
        break
Created on 2017-08-22 14:06
---------
@summary: Sync the Oracle database to Elasticsearch
---------
@author: Boris
'''
import sys
sys.path.append('../')
import init
import utils.tools as tools
from elasticsearch import Elasticsearch
import elasticsearch.helpers
from utils.log import log

ADDRESS = tools.get_conf_value('config.conf', 'elasticsearch', 'yqtj')


class Singleton(object):
    def __new__(cls, *args, **kwargs):
        if not hasattr(cls, '_inst'):
            cls._inst = super(Singleton, cls).__new__(cls)
        return cls._inst


# Inherit Singleton so repeated ES() calls share one connection,
# matching the MongoDB/OracleDB pattern elsewhere in the repo
class ES(Singleton):
    def __init__(self, address=ADDRESS):
        try:
            log.debug(address.split(','))
            self._es = Elasticsearch(address.split(','))
# -*- coding: utf-8 -*-
'''
Created on 2016-11-16 16:25
---------
@summary: MongoDB database operations
---------
@author: Boris
'''
import sys
sys.path.append('../')
import init
import pymongo
import utils.tools as tools
from utils.log import log

IP = tools.get_conf_value('config.conf', 'mongodb', 'ip')
PORT = int(tools.get_conf_value('config.conf', 'mongodb', 'port'))
DB = tools.get_conf_value('config.conf', 'mongodb', 'db')


class Singleton(object):
    def __new__(cls, *args, **kwargs):
        # object.__new__ takes no extra arguments in Python 3,
        # so do not forward *args/**kwargs here
        if not hasattr(cls, '_inst'):
            cls._inst = super(Singleton, cls).__new__(cls)
        return cls._inst


class MongoDB(Singleton):
    def __init__(self, ip=IP, port=PORT, db=DB):
        super(MongoDB, self).__init__()
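# A quick self-contained check of the Singleton above: instantiating a
# subclass twice yields the same object, because __new__ caches the instance
# on the class. One caveat worth knowing: __init__ still runs on every call,
# so initialization must be safe to repeat (as MongoDB's is).
class _Demo(Singleton):
    pass

assert _Demo() is _Demo()  # both calls return the single cached instance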
---------
@author: Boris
'''
import sys
sys.path.append('../')
import init
import cx_Oracle
import utils.tools as tools
from utils.log import log
import datetime
import os

os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'  # prevent garbled Chinese in query results

STOP_ORCL = False  # disable oracle

IP = tools.get_conf_value('config.conf', 'oracledb', 'ip')
PORT = int(tools.get_conf_value('config.conf', 'oracledb', 'port'))
DB = tools.get_conf_value('config.conf', 'oracledb', 'db')
USER_NAME = tools.get_conf_value('config.conf', 'oracledb', 'user_name')
USER_PASS = tools.get_conf_value('config.conf', 'oracledb', 'user_pass')


class Singleton(object):
    def __new__(cls, *args, **kwargs):
        # object.__new__ takes no extra arguments in Python 3,
        # so do not forward *args/**kwargs here
        if not hasattr(cls, '_inst'):
            cls._inst = super(Singleton, cls).__new__(cls)
        return cls._inst


class OracleDB(Singleton):
'''
import sys
sys.path.append('..')
import init
import utils.tools as tools
from db.elastic_search import ES
from base.compare_keywords import CompareKeywords
from word_cloud.word_cloud import WordCloud
from base.hot_sync import HotSync
from base.vip_checked import VipChecked
from summary.summary import Summary
from emotion.emotion import Emotion
from utils.log import log

ADDRESS = tools.get_conf_value('config.conf', 'elasticsearch', 'data-pool')
SYNC_TIME_FILE = 'iopm_sync/sync_time.txt'
IOPM_SERVICE_ADDRESS = 'http://localhost:8080/'
SLEEP_TIME = int(tools.get_conf_value('config.conf', 'sync', 'sleep_time'))


class ArticleSync():
    def __init__(self, table):
        self._record_time = tools.get_json(
            tools.read_file(SYNC_TIME_FILE)) or {}
        self._compare_keywords = CompareKeywords()
        self._summary = Summary()
        self._emotion = Emotion()
        self._word_cloud = WordCloud()
        self._es = ES()
        self._hot_sync = HotSync()
import sys
sys.path.append('../')
import init
import pid
pid.record_pid(__file__)
import utils.tools as tools
from utils.log import log
from base.spider import Spider
from utils.export_data import ExportData  # requires configuration
import news.task_status as task_status
from news.parsers import *

MASTER_ADDRESS = tools.get_conf_value('config.conf', 'master', 'address')
SEARCH_TASK_SLEEP_TIME = int(
    tools.get_conf_value('config.conf', 'task', 'search_task_sleep_time'))


def main():
    while True:
        if task_status.is_doing:
            log.debug('A task is in progress; skip fetching a new one')
            tools.delay_time(SEARCH_TASK_SLEEP_TIME)
            continue

        task_status.is_doing = True

        # Fetch a task
        get_task_url = MASTER_ADDRESS + '/task/get_task'
# Note: the original dict also carried "Query"/"view" entries copied from the
# browser devtools panel; they are not real HTTP headers and were dropped.
HEADER = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    "Cache-Control": "max-age=0",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Host": "qyapi.weixin.qq.com"
}

ORGANIZATION = tools.get_conf_value('config.conf', 'wechat', 'organization')


class WechatService():
    _depertment_id = None

    def __init__(self, corpid, send_msg_secret, sync_user_secret, agentid):
        self._agentid = agentid
        self._send_msg_access_token = self.get_access_token(
            corpid, send_msg_secret)
        self._sync_user_access_token = self.get_access_token(
            corpid, sync_user_secret)

        if not WechatService._depertment_id:
            WechatService._depertment_id = self.get_depertment_id(ORGANIZATION)
            if not WechatService._depertment_id:
                # Department not found in the contact list; create it
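# get_access_token's body is not shown in this section. A hedged sketch of
# what it plausibly does, using WeChat Work's documented gettoken endpoint
# (the Host header above points at qyapi.weixin.qq.com); error handling and
# token caching are simplified:
import requests

def get_access_token(corpid, secret):
    resp = requests.get('https://qyapi.weixin.qq.com/cgi-bin/gettoken',
                        params={'corpid': corpid, 'corpsecret': secret})
    data = resp.json()
    if data.get('errcode', 0) != 0:  # errcode 0 means success
        raise RuntimeError('gettoken failed: %s' % data)
    # The token expires after data['expires_in'] seconds and should be cached
    return data['access_token']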
        } while (i < a && n == -1);
        if (n == -1) break;
        u += String.fromCharCode((15 & r) << 4 | (60 & n) >> 2);
        do {
            if (o = 255 & e.charCodeAt(i++), 61 == o) return u;
            o = l[o]
        } while (i < a && o == -1);
        if (o == -1) break;
        u += String.fromCharCode((3 & n) << 6 | o)
    }
    return u
}
'''

ONE_PAGE_TIME_INTERVAL = 3600

FILE_LOCAL_PATH = tools.get_conf_value('config.conf', 'files', 'headlines_save_path')
NEWS_LOCAL = 1
VIDEO = 2
STORAGE_ID = 2


# Must be defined: add site info
@tools.run_safe_model(__name__)
def add_site_info():
    log.debug('adding site info')
    site_id = SITE_ID
    name = NAME
    table = 'VAApp_site_info'
    url = 'http://sj.qq.com/myapp/detail.htm?apkName=com.ss.android.article.news'
    base_parser.add_website_info(table, site_id, url, name)
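# The JavaScript above (kept as a string for in-page execution) hand-rolls a
# base64 decoder with a lookup table l and '=' padding checks. Assuming the
# payload is ordinary base64, the same transformation in Python is just:
import base64

def decode_b64(encoded):
    # Restore padding to a multiple of 4, then decode; errors='replace'
    # keeps the function total on malformed input.
    padded = encoded + '=' * (-len(encoded) % 4)
    return base64.b64decode(padded).decode('utf-8', errors='replace')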
---------
@summary: Sync the Oracle database to Elasticsearch
---------
@author: Boris
'''
import sys
sys.path.append('../')
import init
import utils.tools as tools
from elasticsearch import Elasticsearch
import elasticsearch.helpers
from utils.log import log

ADDRESS = tools.get_conf_value('config.conf', 'elasticsearch', 'address')


class Singleton(object):
    def __new__(cls, *args, **kwargs):
        if not hasattr(cls, '_inst'):
            cls._inst = super(Singleton, cls).__new__(cls)
        return cls._inst


# Inherit Singleton so repeated ES() calls share one connection,
# matching the MongoDB/OracleDB pattern elsewhere in the repo
class ES(Singleton):
    def __init__(self, address=ADDRESS):
        try:
            log.debug(address.split(','))
            self._es = Elasticsearch(address.split(','))
import collections
from utils.log import log
import utils.tools as tools
import web
import json
import random
from service.wechat_service import WechatService

MIN_SLEEP_TIME = 30000  # interval between each history-list / article-detail request, in milliseconds
MAX_SLEEP_TIME = 65000

MIN_WAIT_TIME = 1000 * 60 * 60 * 6  # rest time after finishing all accounts, before the next round
MAX_WAIT_TIME = 1000 * 60 * 60 * 8

ONLY_TODAY_MSG = int(
    tools.get_conf_value('config.conf', 'spider', 'only_today_msg'))
SPIDER_START_TIME = tools.get_conf_value('config.conf', 'spider', 'spider_start_time')


class WechatAction():
    _wechat_service = WechatService()
    _todo_urls = collections.deque()  # pending urls

    # Article info cache: the list info is cached first, then the view/like
    # counts; once the comment info is also fetched, the record is saved to
    # the database
    _article_info = {
        "article_id": {
            "title": "",
            "content": "",
            # ....
        }
    }
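# The millisecond constants above feed a randomized delay between requests.
# A minimal helper showing the intended use (the helper name is illustrative;
# the repo may do this inline via tools):
import random
import time

def polite_sleep(min_ms, max_ms):
    # Sleep a random interval, converting milliseconds to seconds
    time.sleep(random.randint(min_ms, max_ms) / 1000.0)

# e.g. polite_sleep(MIN_SLEEP_TIME, MAX_SLEEP_TIME) between article fetches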
# -*- coding: utf-8 -*-
'''
Created on 2017-12-11 15:13
---------
@summary: Sync news
---------
@author: Administrator
'''
import sys
sys.path.append('..')
import init
import utils.tools as tools
from db.elastic_search import ES

ADDRESS = tools.get_conf_value('config.conf', 'elasticsearch', 'data-pool')
SYNC_TIME_FILE = 'iopm_sync/sync_time.txt'


class NewsSync():
    def __init__(self):
        # Parse the checkpoint file once: read_file returns raw text, so it
        # must go through get_json before being used as a dict (the original
        # stored the raw string and then indexed into it)
        self._record_time = tools.get_json(
            tools.read_file(SYNC_TIME_FILE)) or {}

    def _get_per_record_time(self):
        return self._record_time.get('news_record_time')

    def _record_now_record_time(self, record_time):
        self._record_time['news_record_time'] = record_time
        tools.write_file(SYNC_TIME_FILE, tools.dumps_json(self._record_time))
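# tools.read_file / get_json / dumps_json / write_file are not defined in this
# section; the checkpoint pattern they implement is plain JSON-file I/O.
# A self-contained sketch under that assumption:
import json
import os

def read_checkpoint(path):
    if not os.path.exists(path):
        return {}
    with open(path, encoding='utf-8') as f:
        return json.load(f)

def write_checkpoint(path, data):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False)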
import collections
from utils.log import log
import utils.tools as tools
from db.oracledb import OracleDB
from db.elastic_search import ES
from base.wechat_sogou import WechatSogou
from base.wechat_public_platform import WechatPublicPlatform
from base import constance
import random

SIZE = 100
TIME_INTERVAL = 24 * 60 * 60
CHECK_NEW_ARTICLE = int(
    tools.get_conf_value('config.conf', 'spider', 'only_today_msg'))  # only crawl accounts with newly published articles


class WechatService():
    _db = OracleDB()
    _es = ES()
    _wechat_sogou = WechatSogou()
    _wechat_public_platform = WechatPublicPlatform()

    _todo_accounts = collections.deque()
    _rownum = 1

    _is_done = False  # one full round finished
    _is_all_done = False  # all accounts' messages published today have been crawled

    # The last time wechat_sogou was not blocked
import init
import base.constance as Constance
import base.base_parser as base_parser
import utils.tools as tools
from utils.log import log
from db.mongodb import MongoDB
from db.oracledb import OracleDB

SITE_ID = 10004
search_type = 102
NAME = '新浪微博'

db = MongoDB()
oracledb = OracleDB()

FILE_LOCAL_PATH = tools.get_conf_value('config.conf', 'files', 'wwa_save_path') + 'weibo/'


def get_release_time(mblog):
    try:
        release_time = mblog['created_at']
        now = tools.time.time()
        today = tools.time.strftime("%Y-%m-%d", tools.time.localtime(now))

        # Weibo returns relative timestamps ('今天 ..', 'N小时前');
        # normalize them to absolute dates
        if tools.re.compile('今天').findall(release_time):
            release_time = release_time.replace('今天', '%s' % today)
        elif tools.re.compile('小时前').findall(release_time):
            nhours = tools.re.compile(r'(\d+)小时前').findall(release_time)
            hours_ago = (tools.datetime.datetime.now() -
                         tools.datetime.timedelta(hours=int(nhours[0])))
            release_time = hours_ago.strftime("%Y-%m-%d %H:%M")
sys.path.append('../../')
import base.base_parser as base_parser
import init
import utils.tools as tools
from utils.log import log
import base.constance as Constance
import re
import time

# Must be defined: site id
SITE_ID = 6
# Must be defined: site name
NAME = '酷6视频'

FILE_LOCAL_PATH = tools.get_conf_value('config.conf', 'files', 'program_save_path')


# Must be defined: add site info
@tools.run_safe_model(__name__)
def add_site_info():
    log.debug('adding site info')
    site_id = SITE_ID
    name = NAME
    table = 'PROGRAM_site_info'
    url = "http://news.v1.cn/V1make.shtml"
    base_parser.add_website_info(table, site_id, url, name)


# Must be defined: add root url
@tools.run_safe_model(__name__)
'''
Created on 2017-12-29 10:44
---------
@summary: Filter information that belongs to the configured province
---------
@author: Boris
'''
import sys
sys.path.append('../')
import init
import utils.tools as tools
from utils.log import log
from db.oracledb import OracleDB

PROVINCE = tools.get_conf_value('config.conf', 'province', 'province')


class ProvinceFilter():
    def __init__(self, province_name=PROVINCE):
        self._province_airs = []
        self._db = OracleDB()

        if province_name:
            self._province_airs.append(province_name)
            province_id = self.load_province_id(province_name)
            if province_id:
                self._province_airs.extend(
                    air[0] for air in self.load_province_air(province_id))
                # self._province_airs.extend(town[0] for town in self.load_province_town(province_id))
        else:  # nationwide
            self._province_airs.extend(province[0]
---------
@author: Yongxin_Yang
'''
import sys
sys.path.append("../")
from aiohttp_requests import requests
import time
from db.redisdb import RedisDB
from utils.log import log
from utils import tools
import os
import asyncio

config = os.path.join(os.path.dirname(__file__) + '/../config.conf')
redis_key = tools.get_conf_value(config, 'redis', 'redis_key')


class Detection(object):
    def __init__(self):
        self.redis = RedisDB()
        self.test_url = "https://movie.douban.com/"

    # @tools.debug
    async def get_html(self, root_url, proxy, semaphore):
        try:
            test_proxy = "http://" + proxy
            log.debug("testing proxy: " + test_proxy)
            async with semaphore:
                response = await requests.get(root_url, proxy=test_proxy,
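# aiohttp_requests is a thin wrapper around aiohttp. For reference, the same
# semaphore-bounded proxy check written against aiohttp directly (a sketch;
# the timeout value and helper names are illustrative):
import asyncio
import aiohttp

async def check_proxy(session, semaphore, proxy):
    async with semaphore:  # cap the number of in-flight checks
        try:
            async with session.get('https://movie.douban.com/',
                                   proxy='http://' + proxy,
                                   timeout=aiohttp.ClientTimeout(total=10)) as resp:
                return proxy, resp.status == 200
        except Exception:
            return proxy, False

async def check_all(proxies, limit=20):
    semaphore = asyncio.Semaphore(limit)
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(
            *(check_proxy(session, semaphore, p) for p in proxies))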
sys.path.append('../../')
import base.base_parser as base_parser
import news.parsers.base_parser as self_base_parser
import init
import utils.tools as tools
from utils.log import log
import base.constance as Constance
from extractor.article_extractor import ArticleExtractor
# print(article_extractor.article_extractor)

# Must be defined: site id
SITE_ID = 1
# Must be defined: site name
NAME = '新闻正文提取'

DEPTH = int(tools.get_conf_value('config.conf', "collector", "depth"))


# Must be defined: add site info
@tools.run_safe_model(__name__)
def add_site_info():
    log.debug('adding site info')
    pass


# Must be defined: add root url
@tools.run_safe_model(__name__)
def add_root_url(parser_params={}):
    log.debug('''
        adding root url
        parser_params : %s
---------
@summary: Sync video news
---------
@author: Administrator
'''
import sys
sys.path.append('..')
import init
import pid
pid.record_pid(__file__)
import utils.tools as tools
from utils.log import log
from base.article_sync import ArticleSync

SLEEP_TIME = int(tools.get_conf_value('config.conf', 'sync', 'sleep_time'))


class VideoSync(ArticleSync):
    def __init__(self):
        super(VideoSync, self).__init__('video_news')

    @tools.log_function_time
    def deal_video_article(self, video_news_list):
        '''
        @summary: process video articles
        ---------
        @param video_news_list:
            # video_news:
            {
                "time_length": null,
@author: Administrator
'''
import sys
sys.path.append('..')
import init
import utils.tools as tools
from db.elastic_search import ES
from cluster.compare_text import compare_text
from copy import deepcopy
from base.hot_week_sync import HotWeekSync
import random
from utils.cut_text import CutText

MIN_SIMILARITY = 0.5  # similarity threshold

IOPM_SERVICE_ADDRESS = tools.get_conf_value('config.conf', 'iopm_service', 'address')

INFO_WEIGHT = {
    1: 6,  # news
    2: 2,  # wechat
    3: 1,  # weibo
    8: 1,  # video
}


class HotSync():
    def __init__(self):
        self._es = ES()
        self._hot_week_sync = HotWeekSync()
        self._cut_text = CutText()
        self._cut_text.set_stop_words('utils/stop_words.txt')
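# compare_text's scoring is not shown here. A common choice for this kind of
# near-duplicate clustering is bag-of-words cosine similarity over the cut
# text, compared against MIN_SIMILARITY — a sketch of that idea (assumed,
# not necessarily the repo's algorithm):
import math
from collections import Counter

def cosine_similarity(words_a, words_b):
    a, b = Counter(words_a), Counter(words_b)
    dot = sum(a[w] * b[w] for w in set(a) & set(b))
    norm = (math.sqrt(sum(v * v for v in a.values())) *
            math.sqrt(sum(v * v for v in b.values())))
    return dot / norm if norm else 0.0

# Two articles would be merged into one hot topic when
# cosine_similarity(cut(a), cut(b)) >= MIN_SIMILARITY.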
def main():
    search_task_sleep_time = int(
        tools.get_conf_value('config.conf', 'task', 'search_task_sleep_time'))

    db = OracleDB()

    # Set tasks inside the monitoring window back to "not done" (501)
    sql = 'update tab_ivms_task_info t set t.task_status = 501 where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time'
    db.update(sql)

    # Set the keywords of those tasks back to "not done" (601)
    sql = 'update tab_ivms_task_keyword k set k.finish_status = 601 where k.task_id in (select t.task_id from tab_ivms_task_info t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time)'
    db.update(sql)

    while True:
        # Look up a pending task
        log.debug('querying for tasks...')
        sql = 'select t.task_id from TAB_IVMS_TASK_INFO t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time and t.task_status = 501'
        result = db.find(sql, fetch_one=True)
        if not result:
            break
        task_id = result[0]

        while True:
            # Check whether a keyword of this task is already in progress (602)
            sql = 'select t.* from TAB_IVMS_TASK_KEYWORD t where t.task_id = %d and finish_status = 602' % task_id
            do_task = db.find(sql, fetch_one=True)
            if do_task:
                time.sleep(search_task_sleep_time)
                continue

            sql = 'select t.* from TAB_IVMS_TASK_KEYWORD t where t.task_id = %d and finish_status = 601' % task_id
            result = db.find(sql, fetch_one=True)
            if not result:
                break

            keyword_id = result[0]
            task_id = result[1]
            search_keyword1 = []
            search_keyword2 = result[2].split(',') if result[2] else []
            search_keyword3 = result[3].split(',') if result[3] else []

            def begin_callback():
                log.info('\n********** VA begin **********')
                # Mark the task as in progress (502)
                sql = 'update TAB_IVMS_TASK_INFO set task_status = 502 where task_id = %d' % task_id
                db.update(sql)
                # Mark the keyword as in progress (602)
                sql = 'update tab_ivms_task_keyword set finish_status = 602 where id = %d' % keyword_id
                db.update(sql)

            def end_callback():
                # Mark the keyword as done (603)
                sql = 'update tab_ivms_task_keyword set finish_status = 603 where id = %d' % keyword_id
                db.update(sql)

                # If every keyword of this task is done, mark the task as done
                sql = 'select t.* from tab_ivms_task_keyword t where task_id = %d and finish_status = 601' % task_id
                results = db.find(sql)
                if not results:
                    # Export data
                    key_map = {
                        'program_id': 'vint_sequence.nextval',
                        'search_type': 'int_search_type',
                        'program_name': 'str_title',
                        'program_url': 'str_url',
                        'release_date': 'date_release_time',
                        'image_url': 'str_image_url',
                        'program_content': 'str_content',
                        'task_id': 'vint_%d' % task_id,
                        'keyword': 'str_keyword',
                        'keyword_count': 'int_keyword_count',
                        'check_status': 'vint_202'
                    }
                    export = ExportData('VA_content_info',
                                        'tab_ivms_program_info', key_map,
                                        'program_url')
                    export.export_to_oracle()

                    # Mark the task as done (503)
                    sql = 'update TAB_IVMS_TASK_INFO set task_status = 503 where task_id = %d' % task_id
                    db.update(sql)

                log.info('\n********** VA end **********')

            # Configure the spider
            spider = Spider(tab_urls='VA_urls',
                            tab_site='VA_site_info',
                            tab_content='VA_content_info',
                            parser_count=1,
                            begin_callback=begin_callback,
                            end_callback=end_callback,
                            search_keyword1=search_keyword1,
                            search_keyword2=search_keyword2,
                            search_keyword3=search_keyword3)

            # Register parsers
            spider.add_parser(baidu_parser)
            spider.add_parser(magnet_parser)
            spider.add_parser(netdisk_parser)
            spider.add_parser(weibo_parser)
            spider.add_parser(wechat_parser)
            spider.add_parser(soubaidupan_parser)
            spider.add_parser(douban_parser)
            spider.start()

            time.sleep(search_task_sleep_time)
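# ExportData's internals are not shown. The key_map above appears to encode
# mapping rules in its value names: 'str_title' / 'int_search_type' name
# source columns, while 'vint_202' or 'vint_sequence.nextval' carry literal
# values or SQL expressions. A hedged sketch of interpreting such a map
# (the function and its semantics are assumptions, not the repo's code):
def build_row(key_map, source_row):
    row = {}
    for aim_column, rule in key_map.items():
        if rule.startswith('vint_'):
            literal = rule[len('vint_'):]
            # plain digits become ints; anything else (e.g. 'sequence.nextval')
            # would have to be emitted as a raw SQL expression
            row[aim_column] = int(literal) if literal.isdigit() else literal
        else:
            row[aim_column] = source_row.get(rule)
    return row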
sys.path.append('..')
import init
import utils.tools as tools
from db.elastic_search import ES
from base.compare_keywords import CompareKeywords
from word_cloud.word_cloud import WordCloud
from base.hot_sync import HotSync
from base.vip_checked import VipChecked
from summary.summary import Summary
from emotion.emotion import Emotion
from utils.log import log
from base.province_filter import ProvinceFilter
from base.event_filter import EventFilter  # tags the event category; moved to the weekly-hot module

DATA_POOL = tools.get_conf_value('config.conf', 'elasticsearch', 'data-pool')
YQTJ = tools.get_conf_value('config.conf', 'elasticsearch', 'yqtj')
PROVINCE = tools.get_conf_value('config.conf', 'province', 'province')
IOPM_SERVICE_ADDRESS = tools.get_conf_value('config.conf', 'iopm_service', 'address')
SLEEP_TIME = int(tools.get_conf_value('config.conf', 'sync', 'sleep_time'))
SYNC_TIME_FILE = 'iopm_sync/sync_time/'


class ArticleSync():
    def __init__(self, table):
        self._sync_time_file = SYNC_TIME_FILE + table + '.txt'
        self._record_time = tools.get_json(
            tools.read_file(self._sync_time_file)) or {}
        self._compare_keywords = CompareKeywords()