Example #1
# -*- coding: utf-8 -*-

import logging, json, time, datetime, requests, re, random, socket
from lxml import etree
from bs4 import BeautifulSoup

from context.context import Context

join_path = Context().get("pathutil.join_path")
correct_link = Context().get("pathutil.correct_link")
fmt_time = Context().get("datetimeutil.fmt_time")
local2utc = Context().get("datetimeutil.local2utc")


def get_web_data(url,
                 data=None,
                 headers=None,
                 proxies=None,
                 allow_redirects=True,
                 timeout=None):
    # Retry up to twice; POST when a payload is given, otherwise GET.
    # (The tail of this snippet was cut off; the GET branch and the retry
    # bookkeeping below mirror the identical pattern in Example #5.)
    count = 0
    html_stream = None
    while count < 2:
        try:
            if data is not None:
                html_stream = requests.post(url,
                                            timeout=timeout,
                                            data=data,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=allow_redirects)
            else:
                html_stream = requests.get(url,
                                           timeout=timeout,
                                           headers=headers,
                                           proxies=proxies,
                                           allow_redirects=allow_redirects)
        except Exception:
            count += 1
        else:
            break
    return html_stream
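

# A minimal usage sketch of the wrapper above; the URL is a placeholder.
if __name__ == "__main__":
    resp = get_web_data("http://example.com/", timeout=10)
    if resp is not None:
        print resp.status_code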
Example #2
# -*- coding: utf-8 -*-
from uuid import uuid1
from django.conf import settings

from context.context import Context

ContentModel = Context().get("search.ContentModel")
CassandraQueryApi = Context().get("CassandraQueryApi")


class SearchArticleModel(ContentModel):
    """docstring for SearchArticleModel"""

    #TYPE = "zjld.article"
    TYPE = "zjld.search"

    FIELDS = {
        "type": u"元搜索",
        "author": u"",
        "publisher": u"",
        "title": u"",
        "content": u"",
        "url": u"",
        "key": u"",
    }

    def __init__(self, dct=None):
        # Avoid a shared mutable default argument.
        super(SearchArticleModel, self).__init__(dct or {})

    def find_dup(self):
        dup = []
Example #3
# -*- coding: utf-8 -*-
from uuid import uuid1
from django.conf import settings

from context.context import Context

ContentModel = Context().get("zjld.ContentModel")
CassandraQueryApi = Context().get("CassandraQueryApi")


class ZjldArticleModel(ContentModel):
    """docstring for ZjldArticleModel"""

    TYPE = "zjld.article"

    FIELDS = {
        "type": u"文章",
        "author": u"",
        "publisher": u"",
        "title": u"",
        "content": u"",
        "url": u"",
    }

    def __init__(self, dct=None):
        # Avoid a shared mutable default argument.
        super(ZjldArticleModel, self).__init__(dct or {})

    def find_dup(self):
        dup = []
        if self.get('url'):
            cql = """SELECT * FROM %s WHERE url='%s' LIMIT 1""" \
Example #4
# -*- coding: utf-8 -*-
import re

from context.context import Context

extract_key = Context().get("utils.extract_key")

_CHAR2NUM = {
    u"0": 0,
    u"1": 1,
    u"2": 2,
    u"3": 3,
    u"4": 4,
    u"5": 5,
    u"6": 6,
    u"7": 7,
    u"8": 8,
    u"9": 9,
    u"零": 0,
    u"一": 1,
    u"壹": 1,
    u"二": 2,
    u"贰": 2,
    u"两": 2,
    u"三": 3,
    u"叁": 3,
    u"四": 4,
    u"肆": 4,
    u"五": 5,
    u"伍": 5,
    u"六": 6,
Example #5
#!/usr/bin/python
# -*- coding: utf-8 -*-

import re
import requests
import datetime
import json
#from datetime import datetime
import time
from bs4 import BeautifulSoup
from lxml import etree

from context.context import Context

fmt_time = Context().get("datetimeutil.fmt_time")


def get_urls_re(homepage, time=10, cookie=''):
    # NOTE: the 'time' parameter shadows the time module imported above.
    html_stream = None
    count = 0
    while count < 2:
        try:
            html_stream = requests.get(homepage,
                                       cookies=cookie or None,
                                       timeout=time)
        except Exception:
            count += 1
        else:
            break
    return html_stream
Example #6
# -*- coding: utf-8 -*-
import time
import os
import signal
import logging
from django.conf import settings

from context.context import Context

Daemon = Context().get("utils.Daemon")
RedisQueryApi = Context().get("RedisQueryApi")
Handler = Context().get("Handler")

_CRAWLER_TYPES = {}
_TERMINATING = False
inject_logger = logging.getLogger("crawler.inject")


class CrawlerDaemon(Daemon):
    """
    The task-injection service class; inherits from Daemon.

    """
    def __init__(self, CRAWLER_PID):
        super(CrawlerDaemon, self).__init__(pidfile=CRAWLER_PID)

    def run(self):
        signal.signal(signal.SIGTERM, self.term_handler)  # Bind SIGTERM to the custom handler.

        print "jobtracker pid=%s start done." % os.getpid()
        inject_logger.info("jobtracker pid=%s START !" % os.getpid())
Example #7
# -*- coding: utf-8 -*-
import re

from context.context import Context

join_path = Context().get("pathutil.join_path")
Field = Context().get("Field")
Url = Context().get("Url")
ArticleContentCrawler = Context().get("ArticleContentCrawler")
FatherCrawler = Context().get("FatherCrawler")
is_url = Context().get("htmlutil.is_url")


class AqsiqCrawler(FatherCrawler):
    type = "aqsiq.news"

    item = Field(name="item",
                 path=r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')")
    url = Field(name="key", path=r".*", type=Url)
    province = Field(name="province", value=u"全国")
    publisher = Field(name="publisher", value=u"国家质量监督检验检疫总局")

    xpath = {
        'title': "//tr/td[@align='center']/h1",
        'pubtime':
        "//tr/td[@align='center']/h1/../../following-sibling::tr[1]/td/text()",
        'content': "//div[@class='TRS_Editor']",
    }
    child = ArticleContentCrawler
    export_fields = [province, publisher]
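
# Quick illustration of what the 'item' regex extracts (sample HTML made up):
# >>> re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')",
# ...            '<a href="http://www.aqsiq.gov.cn/index.htm">news</a>')
# ['http://www.aqsiq.gov.cn/index.htm']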
Example #8
#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import time, random, re
from bs4 import BeautifulSoup
from urllib import quote, unquote

from context.context import Context

WeiboArticleModel = Context().get("WeiboArticleModel")
WeiboHotModel = Context().get("WeiboHotModel")
SearchArticleModel = Context().get("SearchArticleModel")
Crawler = Context().get("Crawler")
export = Context().get("export")
from crawlerimpl.weixin.processdata import HandleUrl, new_time, clear_label, \
        HandleContent, get_urls_re, get_charset, change_to_json, clear_space


def _get_url(url):
    html_stream = get_urls_re(url, time=6)
    # Force utf-8; the charset-detection branch (originally behind a dead
    # 'if True' toggle) is kept below for reference.
    html_stream.encoding = "utf-8"
    # html_stream.encoding = get_charset(html_stream.text)
    return html_stream


class FirstCrawler(Crawler):
Example #9
# -*- coding: utf-8 -*-
import os
import signal
import time
import logging
from threading import Timer
from django.conf import settings

from context.context import Context

Crawler = Context().get("Crawler")
Handler = Context().get("Handler")
get_exception_info = Context().get("get_exception_info")

fetch_logger = logging.getLogger("crawler.fetch")
_RUNNING_CRAWLER = None
_TERMINATING = False


def procedure():
    """
    Everything a task-execution service process needs to do.
    """

    signal.signal(signal.SIGTERM, service_term_handler)  # Bind SIGTERM to the custom handler.
    signal.signal(signal.SIGALRM, task_term_handler)  # Bind SIGALRM to the custom handler.

    start_time = time.time()
    print "tasktracker pid=%s start done." % os.getpid()
    fetch_logger.info("tasktracker pid=%s START !" % os.getpid())
    while (True if settings.PROCESS_TIMEOUT > 0 else
Example #10
# -*- coding: utf-8 -*-
from datetime import datetime
from uuid import uuid1

from context.context import Context

ContentModel = Context().get("ecommerce.ContentModel")
CassandraQueryApi = Context().get("CassandraQueryApi")


class EcBasicModel(ContentModel):

    TYPE = "ecommerce.basic"

    FIELDS = {
        "source_id": u"",
        "title": u"",
        "adword": u"",
        "version": u"",
        "original_price": 0.0,
        "history_price": {},
        "price": 0.0,
        "score": 0,
        "summary": {},
        "address": u"",
        "status": 0,
    }
    INDEXES = [
        {
            "key": [("source", 1), ("source_id", 1)],
            "unique": True
Example #11
# -*- coding: utf-8 -*-
import sys
import os
import signal

from django.conf import settings

from context.context import Context

_create_child = Context().get("processutil._create_child")
procedure = Context().get("procedure")


def start():
    pid_file = file(settings.CRAWLER_TASK_PID, "w+")
    for i in range(settings.TASKTRACKER_COUNT):
        pid = _create_child(procedure, [], {}).keys()[0]
        pid_file.write(str(pid) + "\n")
    pid_file.close()


def stop():
    pid_file = open(settings.CRAWLER_TASK_PID, "r")
    pids = pid_file.readlines()
    pid_file.close()
    for line in pids:
        pid = int(line.strip())
        try:
            os.kill(pid, signal.SIGTERM)
        except OSError:
            # Process already gone; the original body referenced an
            # undefined 'self.pids' here.
            pass
Example #12
import urllib3
import urlparse
import urllib
import time
import random
from scrapy.selector import HtmlXPathSelector

from context.context import Context

unix_time = Context().get("datetimeutil.unix_time")

_SITES_RATE_LIMIT = {
    "mp3.easou.com": 1.0,
    'music.douban.com': 2.0,
    'douban.fm': 2.0,
    'music.baidu.com': 2.0,
}

_SITES_LAST_ACCESS = {}

_NUM_POOLS = 10
_TIMEOUT = 30
_DEFAULT_HEADER = {}
#_DEFAULT_HEADER =  {
#        'Accept' :"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#        'Accept-Charset' : "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
#        'Accept-Encoding' : "gzip,deflate,sdch",
#        'Accept-Language' : "en-US,en;q=0.8",
#        'Cache-Control' : 'max-age=0',
#        'Connection' :'keep-alive',
#        'User-Agent' :"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.162 Safari/535.19"
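
# The tables above imply per-site throttling; the actual implementation is not
# visible in this snippet, so the sketch below reconstructs the likely pattern.
def _respect_rate_limit(host):
    min_interval = _SITES_RATE_LIMIT.get(host)
    if min_interval is None:
        return
    last = _SITES_LAST_ACCESS.get(host)
    now = time.time()
    if last is not None and now - last < min_interval:
        time.sleep(min_interval - (now - last))
    _SITES_LAST_ACCESS[host] = time.time()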
Example #13
# -*- coding: utf-8 -*-
import re
from lxml import etree, html
from datetime import datetime

from context.context import Context

Crawler = Context().get("Crawler")
export = Context().get("export")
Handler = Context().get("Handler")
SearchArticleModel = Context().get("SearchArticleModel")
ZjldArticleModel = Context().get("ZjldArticleModel")
Readability = Context().get("Readability")
htmlutil = Context().get("htmlutil")
clear_space = Context().get("textutil.clear_space")
new_time = Context().get("datetimeutil.new_time")
fmt_time = Context().get("datetimeutil.fmt_time")
local2utc = Context().get("datetimeutil.local2utc")
Field = Context().get("Field")
Url = Context().get("Url")
join_path = Context().get("pathutil.join_path")
getTag = Context().get("bosonutil.getTag")

PROXIES = {
    "http": "http://192.168.1.165:8888",
    "https": "http://192.168.1.191:8888"
}


def find_field(name, fields):
    for i in fields:
Example #14
# -*- coding: utf-8 -*-
from uuid import uuid1
import time
from datetime import datetime

from context.context import Context

ContentModel = Context().get("weibo.ContentModel")
CassandraQueryApi = Context().get("CassandraQueryApi")
RedisQueryApi = Context().get("RedisQueryApi")


class WeiboArticleModel(ContentModel):
    """docstring for WeiboArticleModel"""

    TYPE = "zjld.weibo"

    FIELDS = {
        "type": u"微博",
        "id": uuid1(),
        "author": u"",
        "title": u"",
        "subtitle": [],
        "content": u"",
        "url": u"",
        "imgurl":[],
        "source": u"",
        "origin_source": u"",
        "pubtime": datetime.utcfromtimestamp(0),
        "crtime": datetime.now(),
        "publisher": u"",
Example #15
# -*- coding: utf-8 -*-
import json
import time
import logging
from datetime import datetime, timedelta
from django.conf import settings
from django.db import transaction

from context.context import Context

CrawlerConf = Context().get("CrawlerConf")
Task = Context().get("Task")
RedisQueryApi = Context().get("RedisQueryApi")
time2str = Context().get("datetimeutil.time2str")

inject_logger = logging.getLogger("crawler.inject")
fetch_logger = logging.getLogger("crawler.fetch")
_CRAWLER_CONF = CrawlerConf()


class Status:
    """
    Task status.
    """

    NotStart = 0
    Running = 1
    Succeed = 2
    Failed = -1
    Canceling = -2
Example #16
# -*- coding: utf-8 -*-

import sys
root_mod = '/Users/liujiasheng/workspace/crawler/crawler'
sys.path.append(root_mod)
import django, os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings.development")
django.setup()
import re
from datetime import datetime

from apps.base.models import ScarletOnsell
from context.context import Context

htmlutil = Context().get("htmlutil")
Url = Context().get("Url")
SearchContentCrawler = Context().get("SearchContentCrawler")
FatherCrawler = Context().get("FatherCrawler")
Field = Context().get("Field")
Crawler = Context().get("Crawler")


class BuffOnsellCrawler(Crawler):

    type = "buff.onsell"

    def __init__(self, task):
        pass
        # super(BuffOnsellCrawler, self).__init__(task)

    def crawl(self):
Example #17
# -*- coding: utf-8 -*-
import os
import socket
import signal
import time
import logging
import json
import re
from datetime import datetime, timedelta

from context.context import Context

ModelBase = Context().get("ModelBase")
str2time = Context().get("datetimeutil.str2time")


class Crawler(object):
    """
    Superclass for business crawlers; every business crawler subclasses it.

    Each business crawler must carry a unique identifier in the class
    attribute named 'type', and must override the crawl() method.

    """

    type = "base.crawler"

    def __init__(self, task):
        self.task = task
        self.key = None
        self.data = None
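

# A minimal illustrative subclass of the contract described above; the
# identifier and body are assumptions, not part of the original code.
class ExampleCrawler(Crawler):
    type = "example.news"  # unique identifier for this crawler

    def crawl(self):
        # Fetch and parse whatever self.task describes, then return results.
        pass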
Example #18
'''
@author: Yu
'''
import time
import sys
import traceback
import mailutil

from context.context import Context

Daemon = Context().get("utils.Daemon")


class ServiceDefinition(object):
    def __init__(self,
                 check_func,
                 name="Service",
                 check_interval=180,
                 retries=3):
        if not callable(check_func):
            raise TypeError
        self.check_func = check_func
        self.name = name
        self.check_interval = check_interval
        self.retries = retries
        self.failures = 0
        self.last_check = None

    def check(self):
        self.check_func()
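
# Illustrative usage; the probe function is an assumption, not part of the
# original module.
def _probe():
    pass  # raise an exception here to signal a failed health check

service = ServiceDefinition(_probe, name="demo", check_interval=60, retries=3)
service.check()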
Example #19
# -*- coding: utf-8 -*-

import sys
root_mod = '/home/jshliu/Project/zjld/fix/common/crawler'
sys.path.append(root_mod)
import django, os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings.development")
django.setup()
import re
from datetime import datetime

from context.context import Context

Url = Context().get("Url")
SearchContentCrawler = Context().get("SearchContentCrawler")
FatherCrawler = Context().get("FatherCrawler")
Field = Context().get("Field") 


class BaiduCrawler(FatherCrawler):
    """
    Baidu News search crawler; inherits the generic first-level crawler class.

    """

    type = "baidu.news"  # Unique identifier for this crawler.

    child = SearchContentCrawler  # The crawler that executes the generated tasks.

    item = Field(name="item", path="//div[@id='content_left']/div/div[@class='result']")  # Fields to parse; the name 'item' has special meaning and must not be reused.
    pubtime = Field(name="pubtime", path="div//p[@class='c-author']/text()", type=datetime)
Example #20
    def __init__(self):
        # Class fragment; assumes "from pyspark.sql import SparkSession"
        # elsewhere in the original module.
        sc = Context()
        self.__session = SparkSession.builder.getOrCreate()
Example #21
    def load(self):
        # Class fragment; get_context() presumably returns the underlying
        # SparkContext, whose textFile() builds an RDD from the source path.
        sc = Context()
        return sc.get_context().textFile(self.__config.get_source_path())
Example #22
# -*- coding: utf-8 -*-
import copy

from context.context import Context

convert = Context().get("typeutil.convert")


class ModelMeta(type):
    """Metaclass that folds FIELDS and INDEXES from every ancestor class
    into the class being defined."""

    def __init__(self, name, bases, dct):
        # Merge FIELDS walking up the first base's inheritance chain.
        # Note: an ancestor's value overwrites a same-named subclass field.
        fields = dct.get('FIELDS', {})
        base = bases[0]
        while base != object:
            for k, v in base.__dict__.get('FIELDS', {}).iteritems():
                fields[k] = v
            base = base.__base__
        dct['FIELDS'] = fields

        # Concatenate INDEXES from the same chain.
        indexes = dct.get('INDEXES', [])
        base = bases[0]
        while base != object:
            indexes.extend(base.__dict__.get('INDEXES', []))
            base = base.__base__
        dct['INDEXES'] = indexes

        type.__init__(self, name, bases, dct)


class ModelBase(dict):

    __metaclass__ = ModelMeta
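

# Illustrative check of the metaclass merge (hypothetical models):
class _Parent(ModelBase):
    FIELDS = {"id": u""}

class _Child(_Parent):
    FIELDS = {"title": u""}

# _Child.FIELDS now contains both "id" and "title".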
Example #23
# -*- coding: utf-8 -*-
import sys

from django.conf import settings

from context.context import Context

CrawlerDaemon = Context().get("CrawlerDaemon")


def run(*args):
    """
    Entry point for the task-injection service process.
    """
    jobtracker = CrawlerDaemon(settings.CRAWLER_JOB_PID)
    if args[0] == 'start':
        jobtracker.start()
    elif args[0] == 'stop':
        jobtracker.stop()
Example #24
# -*- coding: utf-8 -*-
import logging
import copy
import time
from uuid import uuid1
from datetime import datetime

from context.context import Context

unix_time = Context().get("datetimeutil.unix_time")
ModelBase = Context().get("ModelBase")
CassandraQueryApi = Context().get("CassandraQueryApi")

import_logger = logging.getLogger("crawler.import")


class ContentModel(ModelBase):

    TYPE = "base.content"
    FIELDS = {
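        # NOTE: these defaults are evaluated once, at import time; every
        # instance shares the same id/crtime values unless overridden.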
        "id": uuid1(),
        "source": u"",
        "origin_source": u"",
        "pubtime": datetime.utcfromtimestamp(0),
        "crtime": datetime.now(),
        "crtime_int": int(time.time() * 1000000),
        "province": u"",
        "city": u"",
        "district": u"",
        "tag": "",
        "comment": {},
Example #25
# -*- coding: utf-8 -*-
import logging
import copy
from uuid import uuid1
from datetime import datetime

from context.context import Context

unix_time = Context().get("datetimeutil.unix_time")
ModelBase = Context().get("ModelBase")

_LOGGER = logging.getLogger("ecommerceimport")


class ContentModel(ModelBase):

    TYPE = "base.content"
    FIELDS = {
        "id": uuid1(),
        "source": u"",
        "source_level": {},
        "first_level": u"",
        "second_level": u"",
        "third_level": u"",
        "fourth_level": u"",
        "fifth_level": u"",
        "province": u"",
        "city": u"",
        "district": u"",
        "comment": {}
    }
Example #26
#coding=utf-8

from django.contrib import admin

from context.context import Context
Task = Context().get("Task")


class TaskAdmin(admin.ModelAdmin):
    list_display = ('crawler', 'key', 'update_time', 'status', 'interval')
    list_editable = ('crawler', 'key', 'status', 'interval')
    list_filter = ('crawler', 'status', 'category', 'application', 'interval',
                   'timeout')
    fields = ('key', 'data', 'producer_id', 'category', 'application',
              'crawler', 'status', 'interval', 'timeout', 'last_run',
              'next_run', 'update_time', 'create_time')
    readonly_fields = ('last_run', 'update_time', 'create_time')
    ordering = ('update_time', '-key')
    search_fields = ('key', )


admin.site.register(Task, TaskAdmin)
Example #27
# -*- coding: utf-8 -*-
from uuid import uuid1

from context.context import Context

ContentModel = Context().get("weixin.ContentModel")
CassandraQueryApi = Context().get("CassandraQueryApi")


class WeixinArticleModel(ContentModel):
    """docstring for WeixinArticleModel"""

    #TYPE = "zjld.article"
    TYPE = "zjld.weixin"

    FIELDS = {
        "type": u"微信",
        "author": u"",
        "publisher": u"",
        "title": u"",
        "content": u"",
        "url": u"",
        # "province": u"",
        # "city": u"",
        # "district": u""
    }

    def __init__(self, dct=None):
        # Avoid a shared mutable default argument.
        super(WeixinArticleModel, self).__init__(dct or {})

    def find_dup(self):