Code Example #1
def import_data(context: Context,
                spark: SparkSession,
                aggregator: AggregationContext,
                data_source: str,
                limit: int = 50000):
    # Store in HBase for further batch processing
    print("Start: " + str(datetime.now()))
    csv = load_newest(context, spark)
    context.save_hbase(csv)
    print("End: " + str(datetime.now()))

    # Update ingestion times for Flume
    latest = datetime.fromtimestamp(csv.first()["opened"])
    update_ingestion_times(data_source, latest)

    # Batch process 15 minute intervals
    aggregated = get_batch_processed(csv)
    aggregator.save_hbase(aggregated)
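get_batch_processed is not shown in this excerpt. A minimal sketch, assuming "opened" holds epoch seconds (as the datetime.fromtimestamp call above implies) and that a simple per-window count is enough, could use Spark's built-in window() grouping:

from pyspark.sql import functions as F

def get_batch_processed(df):
    # Sketch only: bucket rows into 15 minute windows keyed on the
    # epoch-seconds "opened" column and count the rows in each bucket.
    return (df
            .withColumn("opened_ts", F.col("opened").cast("timestamp"))
            .groupBy(F.window("opened_ts", "15 minutes"))
            .count())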
Code Example #2
    def receive(self, from_node, contexts, timestamp=0):
        """receive_data
        1. Stores the information who sent what
        2. Increase the hopcount when the context is a single context

        >>> d = ContextAggregator()
        >>> # two contexts are received
        >>> r = d.receive(1, {Context(value=1.0, cohorts=[0,1,2]), Context(value=1.0, cohorts=[0,1,3])})
        >>> same(d.get_received_data(1), [[0,1,2],[0,1,3]])
        True
        >>>
        """
        contexts = Context.increase_hop_count(contexts)
        self.input[from_node] = contexts

        received_info = contexts_to_standard(contexts)
        self.context_history.add_to_history(node_number=from_node, value=received_info, timestamp=timestamp)
Code Example #3
# -*- coding: utf-8 -*-
import re

from context.context import Context

extract_key = Context().get("utils.extract_key")

_CHAR2NUM = {
    u"0": 0,
    u"1": 1,
    u"2": 2,
    u"3": 3,
    u"4": 4,
    u"5": 5,
    u"6": 6,
    u"7": 7,
    u"8": 8,
    u"9": 9,
    u"零": 0,
    u"一": 1,
    u"壹": 1,
    u"二": 2,
    u"贰": 2,
    u"两": 2,
    u"三": 3,
    u"叁": 3,
    u"四": 4,
    u"肆": 4,
    u"五": 5,
    u"伍": 5,
    u"六": 6,
Code Example #4
File: model.py Project: jshliu/crawler
# -*- coding: utf-8 -*-
from uuid import uuid1
from django.conf import settings

from context.context import Context

ContentModel = Context().get("zjld.ContentModel")
CassandraQueryApi = Context().get("CassandraQueryApi")


class ZjldArticleModel(ContentModel):
    """docstring for ZjldArticleModel"""

    TYPE = "zjld.article"

    FIELDS = {
        "type": u"文章",
        "author": u"",
        "publisher": u"",
        "title": u"",
        "content": u"",
        "url": u"",
    }

    def __init__(self, dct={}):
        super(ZjldArticleModel, self).__init__(dct)

    def find_dup(self):
        dup = []
        if self.get('url'):
            cql = """SELECT * FROM %s WHERE url='%s' LIMIT 1""" \
Code Example #5
File: daemon.py Project: jshliu/crawler
# -*- coding: utf-8 -*-
import time
import os
import signal
import logging
from django.conf import settings

from context.context import Context

Daemon = Context().get("utils.Daemon")
RedisQueryApi = Context().get("RedisQueryApi")
Handler = Context().get("Handler")

_CRAWLER_TYPES = {}
_TERMINATING = False
inject_logger = logging.getLogger("crawler.inject")


class CrawlerDaemon(Daemon):
    """
    Class for the job-injection service; inherits from the Daemon class.

    """
    def __init__(self, CRAWLER_PID):
        super(CrawlerDaemon, self).__init__(pidfile=CRAWLER_PID)

    def run(self):
        signal.signal(signal.SIGTERM, self.term_handler)  # Bind the normal-termination signal (SIGTERM) to the custom handler.

        print "jobtracker pid=%s start done." % os.getpid()
        inject_logger.info("jobtracker pid=%s START !" % os.getpid())
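term_handler is bound above but not included in this excerpt. A plausible sketch (an assumption, not the project's source) simply flips the module-level _TERMINATING flag so the run() loop can exit cleanly:

    def term_handler(self, signum, frame):
        # Assumed handler: mark the daemon as terminating on SIGTERM.
        global _TERMINATING
        _TERMINATING = True
        inject_logger.info("jobtracker pid=%s received SIGTERM." % os.getpid())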
Code Example #6
#!/usr/bin/python
# -*- coding: utf-8 -*-

import re
import requests
import datetime
import json
#from datetime import datetime
import time
from bs4 import BeautifulSoup
from lxml import etree

from context.context import Context

fmt_time = Context().get("datetimeutil.fmt_time")


def get_urls_re(homepage, time = 10, cookie=''):

    html_stream = None
    count = 0
    while count < 2:
        try:
            html_stream = requests.get(homepage, cookies=cookie,
                                       timeout=time)
        except requests.RequestException:
            count += 1
        else:
            break
    return html_stream
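A hypothetical call, mirroring how example #8 below invokes this helper (the URL is a placeholder):

html_stream = get_urls_re("http://example.com/", time=6)
if html_stream is not None:
    soup = BeautifulSoup(html_stream.text, "lxml")  # BeautifulSoup is imported above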
Code Example #7
    def test_deserialize(self):
        c = Context(value=1.0, cohorts={0, 1, 2})
        s = c.serialize(zipped=True)
        c2 = Context.deserialize(s, zipped=True)
        assert c == c2
Code Example #8
File: weibo.py Project: jshliu/crawler
#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import time, random, re
from bs4 import BeautifulSoup
from urllib import quote, unquote

from context.context import Context

WeiboArticleModel = Context().get("WeiboArticleModel")
WeiboHotModel = Context().get("WeiboHotModel")
SearchArticleModel = Context().get("SearchArticleModel")
Crawler = Context().get("Crawler")
export = Context().get("export")
from crawlerimpl.weixin.processdata import HandleUrl, new_time, clear_label, \
        HandleContent, get_urls_re, get_charset, change_to_json, clear_space


def _get_url(url):
    html_stream = get_urls_re(url, time=6)
    if True:
        html_stream.encoding = "utf-8"
    else:
        html_stream.encoding = get_charset(html_stream.text)
    return html_stream


class FirstCrawler(Crawler):
Code Example #9
File: model.py Project: jshliu/crawler
# -*- coding: utf-8 -*-
from uuid import uuid1
import time
from datetime import datetime

from context.context import Context

ContentModel = Context().get("weibo.ContentModel")
CassandraQueryApi = Context().get("CassandraQueryApi")
RedisQueryApi = Context().get("RedisQueryApi")


class WeiboArticleModel(ContentModel):
    """docstring for WeiboArticleModel"""

    TYPE = "zjld.weibo"

    FIELDS = {
        "type": u"微博",
        "id": uuid1(),
        "author": u"",
        "title": u"",
        "subtitle": [],
        "content": u"",
        "url": u"",
        "imgurl":[],
        "source": u"",
        "origin_source": u"",
        "pubtime": datetime.utcfromtimestamp(0),
        "crtime": datetime.now(),
        "publisher": u"",
Code Example #10
File: news.py Project: jshliu/crawler
# -*- coding: utf-8 -*-
import re

from context.context import Context

join_path = Context().get("pathutil.join_path")
Field = Context().get("Field")
Url = Context().get("Url")
ArticleContentCrawler = Context().get("ArticleContentCrawler")
FatherCrawler = Context().get("FatherCrawler")
is_url = Context().get("htmlutil.is_url")


class AqsiqCrawler(FatherCrawler):
    type = "aqsiq.news"

    item = Field(name="item",
                 path=r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')")
    url = Field(name="key", path=r".*", type=Url)
    province = Field(name="province", value=u"全国")
    publisher = Field(name="publisher", value=u"国家质量监督检验检疫总局")

    xpath = {
        'title': "//tr/td[@align='center']/h1",
        'pubtime':
        "//tr/td[@align='center']/h1/../../following-sibling::tr[1]/td/text()",
        'content': "//div[@class='TRS_Editor']",
    }
    child = ArticleContentCrawler
    export_fields = [province, publisher]
Code Example #11
                        level=env.LOGGING_LEVEL,
                        format='%(asctime)s %(message)s')

else:
    # otherwise, log to terminal
    logging.basicConfig(level=env.LOGGING_LEVEL,
                        format='%(asctime)s %(message)s')

_LOGGER = logging.getLogger(__name__)

if __name__ == '__main__':

    # init context
    _LOGGER.info('initializing context')
    from context.context import Context
    Context.initialize(CONFIG_FILE_PATH)

    # init endpoints
    _LOGGER.info('initializing endpoints')
    from endpoints import Endpoints
    Endpoints.initialize()

    # Endpoints.post_transaction(
    #     accountID='88efgiTlszS1z2TqSlPj',
    #     counterParty='suntrust',
    #     transactionType='debit',
    #     description='ATM Withdrawal',
    #     amount='20'
    # )

    # Endpoints.post_transaction(
Code Example #12
# -*- coding: utf-8 -*-
from datetime import datetime
from uuid import uuid1

from context.context import Context

ContentModel = Context().get("ecommerce.ContentModel")
CassandraQueryApi = Context().get("CassandraQueryApi")


class EcBasicModel(ContentModel):

    TYPE = "ecommerce.basic"

    FIELDS = {
        "source_id": u"",
        "title": u"",
        "adword": u"",
        "version": u"",
        "original_price": 0.0,
        "history_price": {},
        "price": 0.0,
        "score": 0,
        "summary": {},
        "address": u"",
        "status": 0,
    }
    INDEXES = [
        {
            "key": [("source", 1), ("source_id", 1)],
            "unique": True
Code Example #13
# -*- coding: utf-8 -*-
import os
import signal
import time
import logging
from threading import Timer
from django.conf import settings

from context.context import Context

Crawler = Context().get("Crawler")
Handler = Context().get("Handler")
get_exception_info = Context().get("get_exception_info")

fetch_logger = logging.getLogger("crawler.fetch")
_RUNNING_CRAWLER = None
_TERMINATING = False


def procedure():
    """
    Everything a task-execution (tasktracker) worker process needs to do.
    """

    signal.signal(signal.SIGTERM, service_term_handler)  # Bind the normal-termination signal (SIGTERM) to the custom handler.
    signal.signal(signal.SIGALRM, task_term_handler)  # Bind the alarm signal (SIGALRM) to the custom handler.

    start_time = time.time()
    print "tasktracker pid=%s start done." % os.getpid()
    fetch_logger.info("tasktracker pid=%s START !" % os.getpid())
    while (True if settings.PROCESS_TIMEOUT > 0 else
Code Example #14
File: tasktracker.py Project: jshliu/crawler
# -*- coding: utf-8 -*-
import sys
import os
import signal

from django.conf import settings

from context.context import Context

_create_child = Context().get("processutil._create_child")
procedure = Context().get("procedure")


def start():
    pid_file = file(settings.CRAWLER_TASK_PID, "w+")
    for i in range(settings.TASKTRACKER_COUNT):
        pid = _create_child(procedure, [], {}).keys()[0]
        pid_file.write(str(pid) + "\n")
    pid_file.close()


def stop():
    pid_file = file(settings.CRAWLER_TASK_PID, "r")
    pids = pid_file.readlines()
    pid_file.close()
    for i in range(len(pids)):
        pids[i] = int(pids[i].strip())
        try:
            os.kill(pids[i], signal.SIGTERM)
        except OSError:
            pass  # the process has already exited
Code Example #15
#coding=utf-8

from django.contrib import admin

from context.context import Context
Task = Context().get("Task")


class TaskAdmin(admin.ModelAdmin):
    list_display = ('crawler', 'key', 'update_time', 'status', 'interval')
    list_editable = ('crawler', 'key', 'status', 'interval')
    list_filter = ('crawler', 'status', 'category', 'application', 'interval',
                   'timeout')
    fields = ('key', 'data', 'producer_id', 'category', 'application', 'crawler', \
     'status', 'interval', 'timeout', 'last_run', 'next_run', 'update_time', 'create_time')
    readonly_fields = ('last_run', 'update_time', 'create_time')
    ordering = ('update_time', '-key')
    search_fields = ('key', )


admin.site.register(Task, TaskAdmin)
Code Example #16
# standard
import json
import logging

# packages
import requests
from firebase_admin import messaging

# internal
from context.context import Context
from utility import const

_LOGGER = logging.getLogger(__name__)

token = Context.data()[const.FIREBASE][const.TOKEN]

def generate_message(data):
    # send message to firebase

    message = messaging.Message(
        data=json.dumps(data),
        token=token
    )

    resp = messaging.send(message)

    return resp

    
Code Example #17
File: dash_app.py Project: ngngardner/hackgsu_glance
        raise PreventUpdate
    storage._update_transactions()
    transaction = storage.tables()[const.TRANSACTIONS_DATA][log_id]

    amount = transaction[const.T_LOG][const.TOTALS][const.GRAND_AMOUNT]
    day = transaction[const.BUSINESS_DAY][const.DATE_TIME]

    return f'Amount: {amount}, Time: {day}'


if __name__ == '__main__':

    # init context
    _LOGGER.info('initializing context')
    from context.context import Context
    Context.initialize(CONFIG_FILE_PATH)

    # init endpoints
    _LOGGER.info('initializing endpoints')
    from endpoints import Endpoints
    Endpoints.initialize()

    # init firebase
    _LOGGER.info('initializing firebase')
    import firebase_admin
    from firebase_admin import credentials
    from firebase_admin import db
    with FIREBASE_CRED_PATH.open() as file:
        FIREBASE_DATA = json.loads(file.read())
        cred = credentials.Certificate(FIREBASE_DATA)
Code Example #18
# -*- coding: utf-8 -*-

import sys
root_mod = '/Users/liujiasheng/workspace/crawler/crawler'
sys.path.append(root_mod)
import django, os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings.development")
django.setup()
import re
from datetime import datetime

from apps.base.models import ScarletOnsell
from context.context import Context

htmlutil = Context().get("htmlutil")
Url = Context().get("Url")
SearchContentCrawler = Context().get("SearchContentCrawler")
FatherCrawler = Context().get("FatherCrawler")
Field = Context().get("Field")
Crawler = Context().get("Crawler")


class BuffOnsellCrawler(Crawler):

    type = "buff.onsell"

    def __init__(self, task):
        pass
        # super(BuffOnsellCrawler, self).__init__(task)

    def crawl(self):
Code Example #19
# -*- coding: utf-8 -*-
import json
import time
import logging
from datetime import datetime, timedelta
from django.conf import settings
from django.db import transaction

from context.context import Context

CrawlerConf = Context().get("CrawlerConf")
Task = Context().get("Task")
RedisQueryApi = Context().get("RedisQueryApi")
time2str = Context().get("datetimeutil.time2str")

inject_logger = logging.getLogger("crawler.inject")
fetch_logger = logging.getLogger("crawler.fetch")
_CRAWLER_CONF = CrawlerConf()


class Status:
    """
    Task status codes.
    """

    NotStart = 0
    Running = 1
    Succeed = 2
    Failed = -1
    Canceling = -2
Code Example #20
File: monitor.py Project: jshliu/crawler
'''
@author: Yu
'''
import time
import sys
import traceback
import mailutil

from context.context import Context

Daemon = Context().get("utils.Daemon")


class ServiceDefinition(object):
    def __init__(self,
                 check_func,
                 name="Service",
                 check_interval=180,
                 retries=3):
        if not callable(check_func):
            raise TypeError
        self.check_func = check_func
        self.name = name
        self.check_interval = check_interval
        self.retries = retries
        self.failures = 0
        self.last_check = None

    def check(self):
        self.check_func()
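A hypothetical way to wire this up (not taken from the project): wrap a reachability probe in a ServiceDefinition and let a monitoring loop call check() every check_interval seconds.

import socket

def check_redis():
    # Assumed probe: raises socket.error if the Redis port is unreachable.
    socket.create_connection(("127.0.0.1", 6379), timeout=3).close()

redis_service = ServiceDefinition(check_redis, name="redis", check_interval=60, retries=3)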
Code Example #21
# -*- coding: utf-8 -*-
import os
import socket
import signal
import time
import logging
import json
import re
from datetime import datetime, timedelta

from context.context import Context

ModelBase = Context().get("ModelBase")
str2time = Context().get("datetimeutil.str2time")


class Crawler(object):
    """
    Base class for all business crawlers; every business crawler is a subclass of it.

    Each business crawler must have a unique identifier, stored in the member
    attribute named type, and must override the crawl() method.

    """

    type = "base.crawler"

    def __init__(self, task):
        self.task = task
        self.key = None
        self.data = None
Code Example #22
    def test_big_number(self):
        c = Context(value=1.0, cohorts={40000})
        s = c.serialize(zipped=True)
        c2 = Context.deserialize(s, zipped=True)
        assert c == c2
Code Example #23
File: news.py Project: jshliu/crawler
# -*- coding: utf-8 -*-

import sys
root_mod = '/home/jshliu/Project/zjld/fix/common/crawler'
sys.path.append(root_mod)
import django, os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings.development");
django.setup()
import re
from datetime import datetime

from context.context import Context

Url = Context().get("Url")
SearchContentCrawler = Context().get("SearchContentCrawler")
FatherCrawler = Context().get("FatherCrawler")
Field = Context().get("Field") 


class BaiduCrawler(FatherCrawler):
    """
    Baidu news search crawler; inherits from the generic first-level (parent) crawler class.

    """

    type = "baidu.news" #该爬虫的唯一标识符。

    child = SearchContentCrawler #指定生成的任务由哪一爬虫执行。

    item = Field(name="item", path="//div[@id='content_left']/div/div[@class='result']") #需要解析的字段,name为‘item’为特殊含义,不能被占用。
    pubtime = Field(name="pubtime", path="div//p[@class='c-author']/text()", type=datetime)
Code Example #24
File: session.py Project: andang1390/black_sabbath
    def __init__(self):
        sc = Context()
        self.__session = SparkSession.builder.getOrCreate()
Code Example #25
    def load(self):
        sc = Context()
        return sc.get_context().textFile(self.__config.get_source_path())
Code Example #26
File: modelbase.py Project: jshliu/crawler
# -*- coding: utf-8 -*-
import copy

from context.context import Context

convert = Context().get("typeutil.convert")


class ModelMeta(type):
    def __init__(self, name, bases, dct):
        fields = dct.get('FIELDS', {})
        base = bases[0]
        while base != object:
            for k, v in base.__dict__.get('FIELDS', {}).iteritems():
                fields[k] = v
            base = base.__base__
        dct['FIELDS'] = fields

        indexes = dct.get('INDEXES', [])
        base = bases[0]
        while base != object:
            indexes.extend(base.__dict__.get('INDEXES', []))
            base = base.__base__
        dct['INDEXES'] = indexes

        type.__init__(self, name, bases, dct)


class ModelBase(dict):

    __metaclass__ = ModelMeta
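For illustration only (the class and field names below are hypothetical): with ModelMeta in place, a subclass declares just its own FIELDS, and the metaclass folds every base class's FIELDS into the subclass's FIELDS dict at class-creation time.

# Illustrative sketch; assumes ModelBase itself declared FIELDS = {"id": u""}.
class ArticleModel(ModelBase):
    FIELDS = {"title": u""}

# After ModelMeta runs, ArticleModel.FIELDS holds both "id" and "title".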
Code Example #27
# standard
import json
import logging

# packages
import requests

# internal
from context.context import Context
from utility import const

_LOGGER = logging.getLogger(__name__)

base_url = Context.data()[const.SILVER][const.BASE_URL]
headers = Context.data()[const.SILVER][const.HEADERS]


def get_store():
    # get store data from the silver api

    # build request url
    req_url = base_url + '/v2/stores'

    res = requests.get(url=req_url, headers=headers)

    if res.status_code == 200:
        # there is only one store in our data, so we only return that
        return res.json()[const.RESULT][0]
    else:
        _LOGGER.error('Request error')
        return None
Code Example #28
# -*- coding: utf-8 -*-
import logging
import copy
import time
from uuid import uuid1
from datetime import datetime

from context.context import Context

unix_time = Context().get("datetimeutil.unix_time")
ModelBase = Context().get("ModelBase")
CassandraQueryApi = Context().get("CassandraQueryApi")

import_logger = logging.getLogger("crawler.import")


class ContentModel(ModelBase):

    TYPE = "base.content"
    FIELDS = {
        "id": uuid1(),
        "source": u"",
        "origin_source": u"",
        "pubtime": datetime.utcfromtimestamp(0),
        "crtime": datetime.now(),
        "crtime_int": int(time.time() * 1000000),
        "province": u"",
        "city": u"",
        "district": u"",
        "tag": "",
        "comment": {},
Code Example #29
# -*- coding: utf-8 -*-
import sys

from django.conf import settings

from context.context import Context

CrawlerDaemon = Context().get("CrawlerDaemon")


def run(*args):
	"""
	Entry point for the job-injection service process.
	"""
	jobtracker = CrawlerDaemon(settings.CRAWLER_JOB_PID)
	if args[0] == 'start':
		jobtracker.start()
	elif args[0] == 'stop':
		jobtracker.stop()
Code Example #30
# standard
import json
import logging

# packages
import requests

# internal
from context.context import Context
from utility import const

_LOGGER = logging.getLogger(__name__)

base_url = Context.data()[const.TRANSACTIONS][const.BASE_URL]
headers = Context.data()[const.TRANSACTIONS][const.HEADERS]

def transaction_details(transaction_id:str):
    # get transaction data from the silver api

    # build request url
    req_url = base_url + '/transaction-document/transaction-documents/' + transaction_id

    res = requests.get(
        url=req_url,
        headers=headers
    )

    if res.status_code == 200:
        data = res.json()
        del data[const.ID]
Code Example #31
# -*- coding: utf-8 -*-
import logging
import copy
from uuid import uuid1
from datetime import datetime

from context.context import Context

unix_time = Context().get("datetimeutil.unix_time")
ModelBase = Context().get("ModelBase")

_LOGGER = logging.getLogger("ecommerceimport")


class ContentModel(ModelBase):

    TYPE = "base.content"
    FIELDS = {
        "id": uuid1(),
        "source": u"",
        "source_level": {},
        "first_level": u"",
        "second_level": u"",
        "third_level": u"",
        "fourth_level": u"",
        "fifth_level": u"",
        "province": u"",
        "city": u"",
        "district": u"",
        "comment": {}
    }
Code Example #32
# standard
import logging

# packages
import requests

# internal
from utility import const
from context.context import Context

_LOGGER = logging.getLogger(__name__)

BASE_URL = Context.data()[const.BASE_URL]
HEADERS = Context.data()[const.HEADERS]


def inspect_account(accountID: str,
                    base_url: str = BASE_URL,
                    headers: dict = HEADERS):
    '''
    '''
    # build request url
    req_url = base_url + f'/accounts/{accountID}'

    # make request
    inspect_account_request = requests.get(url=req_url, headers=headers)

    # if request was successful
    if inspect_account_request.status_code == 200:
        inspect_account_response = {
            const.STATUS: inspect_account_request.status_code,
Code Example #33
# -*- coding: utf-8 -*-
from uuid import uuid1
from django.conf import settings

from context.context import Context

ContentModel = Context().get("search.ContentModel")
CassandraQueryApi = Context().get("CassandraQueryApi")


class SearchArticleModel(ContentModel):
    """docstring for SearchArticleModel"""

    #TYPE = "zjld.article"
    TYPE = "zjld.search"

    FIELDS = {
        "type": u"元搜索",
        "author": u"",
        "publisher": u"",
        "title": u"",
        "content": u"",
        "url": u"",
        "key": u"",
    }

    def __init__(self, dct={}):
        super(SearchArticleModel, self).__init__(dct)

    def find_dup(self):
        dup = []