def import_data(context: Context, spark: SparkSession, aggregator: AggregationContext, data_source: str, limit: int = 50000):
    # Store in HBase for further batch processing
    print("Start: " + str(datetime.now()))
    csv = load_newest(context, spark)
    context.save_hbase(csv)
    print("End: " + str(datetime.now()))

    # Update ingestion times for Flume
    latest = datetime.fromtimestamp(csv.first()["opened"])
    update_ingestion_times(data_source, latest)

    # Batch process 15 minute intervals
    aggregated = get_batch_processed(csv)
    aggregator.save_hbase(aggregated)
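# A minimal invocation sketch for import_data, assuming a SparkSession is
# available and that Context and AggregationContext can be constructed with no
# arguments; the "transactions" data_source value is a made-up placeholder.
if __name__ == "__main__":
    spark = SparkSession.builder.getOrCreate()
    import_data(Context(), spark, AggregationContext(), data_source="transactions")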
def receive(self, from_node, contexts, timestamp=0):
    """receive_data

    1. Stores which node sent which contexts.
    2. Increases the hop count when the context is a single context.

    >>> d = ContextAggregator()
    >>> # two contexts are received
    >>> r = d.receive(1, {Context(value=1.0, cohorts=[0,1,2]), Context(value=1.0, cohorts=[0,1,3])})
    >>> same(d.get_received_data(1), [[0,1,2],[0,1,3]])
    True
    >>>
    """
    contexts = Context.increase_hop_count(contexts)
    self.input[from_node] = contexts

    received_info = contexts_to_standard(contexts)
    self.context_history.add_to_history(node_number=from_node, value=received_info, timestamp=timestamp)
# -*- coding: utf-8 -*-
import re

from context.context import Context

extract_key = Context().get("utils.extract_key")

# Map ASCII digits and Chinese numerals (both regular and financial forms)
# to their integer values.
_CHAR2NUM = {
    u"0": 0,
    u"1": 1,
    u"2": 2,
    u"3": 3,
    u"4": 4,
    u"5": 5,
    u"6": 6,
    u"7": 7,
    u"8": 8,
    u"9": 9,
    u"零": 0,
    u"一": 1,
    u"壹": 1,
    u"二": 2,
    u"贰": 2,
    u"两": 2,
    u"三": 3,
    u"叁": 3,
    u"四": 4,
    u"肆": 4,
    u"五": 5,
    u"伍": 5,
    u"六": 6,
# -*- coding: utf-8 -*-
from uuid import uuid1

from django.conf import settings

from context.context import Context

ContentModel = Context().get("zjld.ContentModel")
CassandraQueryApi = Context().get("CassandraQueryApi")


class ZjldArticleModel(ContentModel):
    """docstring for ZjldArticleModel"""
    TYPE = "zjld.article"
    FIELDS = {
        "type": u"文章",
        "author": u"",
        "publisher": u"",
        "title": u"",
        "content": u"",
        "url": u"",
    }

    def __init__(self, dct={}):
        super(ZjldArticleModel, self).__init__(dct)

    def find_dup(self):
        dup = []
        if self.get('url'):
            cql = """SELECT * FROM %s WHERE url='%s' LIMIT 1""" \
# -*- coding: utf-8 -*-
import time
import os
import signal
import logging

from django.conf import settings

from context.context import Context

Daemon = Context().get("utils.Daemon")
RedisQueryApi = Context().get("RedisQueryApi")
Handler = Context().get("Handler")

_CRAWLER_TYPES = {}
_TERMINATING = False

inject_logger = logging.getLogger("crawler.inject")


class CrawlerDaemon(Daemon):
    """
    Daemon for the task-injection service; inherits from the Daemon class.
    """

    def __init__(self, CRAWLER_PID):
        super(CrawlerDaemon, self).__init__(pidfile=CRAWLER_PID)

    def run(self):
        signal.signal(signal.SIGTERM, self.term_handler)  # Bind the normal termination signal to a custom handler.
        print "jobtracker pid=%s start done." % os.getpid()
        inject_logger.info("jobtracker pid=%s START !" % os.getpid())
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import requests
import datetime
import json
#from datetime import datetime
import time

from bs4 import BeautifulSoup
from lxml import etree

from context.context import Context

fmt_time = Context().get("datetimeutil.fmt_time")


def get_urls_re(homepage, time=10, cookie=''):
    # Fetch the page, retrying once on any request error; returns None when
    # both attempts fail.
    html_stream = None
    count = 0
    while count < 2:
        try:
            html_stream = requests.get(homepage, cookies=cookie, timeout=time)
        except:
            count += 1
        else:
            break
    return html_stream
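# A minimal usage sketch for get_urls_re; the URL below is a placeholder and
# the parsing step simply reuses the BeautifulSoup import above.
if __name__ == "__main__":
    page = get_urls_re("http://example.com", time=5)
    if page is not None:
        soup = BeautifulSoup(page.text, "lxml")
        print soup.title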
def test_deserialize(self):
    c = Context(value=1.0, cohorts={0, 1, 2})
    s = c.serialize(zipped=True)
    c2 = Context.deserialize(s, zipped=True)
    # The deserialized copy should compare equal to the original.
    assert c == c2
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import time, random, re

from bs4 import BeautifulSoup
from urllib import quote, unquote

from context.context import Context

WeiboArticleModel = Context().get("WeiboArticleModel")
WeiboHotModel = Context().get("WeiboHotModel")
SearchArticleModel = Context().get("SearchArticleModel")
Crawler = Context().get("Crawler")
export = Context().get("export")

from crawlerimpl.weixin.processdata import HandleUrl, new_time, clear_label, \
    HandleContent, get_urls_re, get_charset, change_to_json, clear_space


def _get_url(url):
    html_stream = get_urls_re(url, time=6)
    # The encoding is forced to UTF-8; get_charset(html_stream.text) could be
    # used instead to detect it from the response.
    html_stream.encoding = "utf-8"
    return html_stream


class FirstCrawler(Crawler):
# -*- coding: utf-8 -*-
from uuid import uuid1
import time
from datetime import datetime

from context.context import Context

ContentModel = Context().get("weibo.ContentModel")
CassandraQueryApi = Context().get("CassandraQueryApi")
RedisQueryApi = Context().get("RedisQueryApi")


class WeiboArticleModel(ContentModel):
    """docstring for WeiboArticleModel"""
    TYPE = "zjld.weibo"
    FIELDS = {
        "type": u"微博",
        "id": uuid1(),
        "author": u"",
        "title": u"",
        "subtitle": [],
        "content": u"",
        "url": u"",
        "imgurl": [],
        "source": u"",
        "origin_source": u"",
        "pubtime": datetime.utcfromtimestamp(0),
        "crtime": datetime.now(),
        "publisher": u"",
# -*- coding: utf-8 -*-
import re

from context.context import Context

join_path = Context().get("pathutil.join_path")
Field = Context().get("Field")
Url = Context().get("Url")
ArticleContentCrawler = Context().get("ArticleContentCrawler")
FatherCrawler = Context().get("FatherCrawler")
is_url = Context().get("htmlutil.is_url")


class AqsiqCrawler(FatherCrawler):
    type = "aqsiq.news"
    item = Field(name="item", path=r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')")
    url = Field(name="key", path=r".*", type=Url)
    province = Field(name="province", value=u"全国")
    publisher = Field(name="publisher", value=u"国家质量监督检验检疫总局")
    xpath = {
        'title': "//tr/td[@align='center']/h1",
        'pubtime': "//tr/td[@align='center']/h1/../../following-sibling::tr[1]/td/text()",
        'content': "//div[@class='TRS_Editor']",
    }
    child = ArticleContentCrawler
    export_fields = [province, publisher]
                        level=env.LOGGING_LEVEL,
                        format='%(asctime)s %(message)s')
else:
    # otherwise, log to terminal
    logging.basicConfig(level=env.LOGGING_LEVEL,
                        format='%(asctime)s %(message)s')

_LOGGER = logging.getLogger(__name__)


if __name__ == '__main__':
    # init context
    _LOGGER.info('initializing context')
    from context.context import Context
    Context.initialize(CONFIG_FILE_PATH)

    # init endpoints
    _LOGGER.info('initializing endpoints')
    from endpoints import Endpoints
    Endpoints.initialize()

    # Endpoints.post_transaction(
    #     accountID='88efgiTlszS1z2TqSlPj',
    #     counterParty='suntrust',
    #     transactionType='debit',
    #     description='ATM Withdrawal',
    #     amount='20'
    # )
    # Endpoints.post_transaction(
# -*- coding: utf-8 -*-
from datetime import datetime
from uuid import uuid1

from context.context import Context

ContentModel = Context().get("ecommerce.ContentModel")
CassandraQueryApi = Context().get("CassandraQueryApi")


class EcBasicModel(ContentModel):
    TYPE = "ecommerce.basic"
    FIELDS = {
        "source_id": u"",
        "title": u"",
        "adword": u"",
        "version": u"",
        "original_price": 0.0,
        "history_price": {},
        "price": 0.0,
        "score": 0,
        "summary": {},
        "address": u"",
        "status": 0,
    }
    INDEXES = [
        {
            "key": [("source", 1), ("source_id", 1)],
            "unique": True
# -*- coding: utf-8 -*-
import os
import signal
import time
import logging
from threading import Timer

from django.conf import settings

from context.context import Context

Crawler = Context().get("Crawler")
Handler = Context().get("Handler")
get_exception_info = Context().get("get_exception_info")

fetch_logger = logging.getLogger("crawler.fetch")

_RUNNING_CRAWLER = None
_TERMINATING = False


def procedure():
    """
    The work a task-execution service process needs to do.
    """
    signal.signal(signal.SIGTERM, service_term_handler)  # Bind the normal termination signal to a custom handler.
    signal.signal(signal.SIGALRM, task_term_handler)  # Bind the alarm signal to a custom handler.
    start_time = time.time()
    print "tasktracker pid=%s start done." % os.getpid()
    fetch_logger.info("tasktracker pid=%s START !" % os.getpid())
    while (True if settings.PROCESS_TIMEOUT > 0 else
# -*- coding: utf-8 -*-
import sys
import os
import signal

from django.conf import settings

from context.context import Context

_create_child = Context().get("processutil._create_child")
procedure = Context().get("procedure")


def start():
    # Spawn the configured number of tasktracker children and record their pids.
    pid_file = file(settings.CRAWLER_TASK_PID, "w+")
    for i in range(settings.TASKTRACKER_COUNT):
        pid = _create_child(procedure, [], {}).keys()[0]
        pid_file.write(str(pid) + "\n")
    pid_file.close()


def stop():
    # Read the recorded pids and send each child a SIGTERM; a child that has
    # already exited is skipped.
    pid_file = file(settings.CRAWLER_TASK_PID, "r")
    pids = pid_file.readlines()
    pid_file.close()
    for i in range(len(pids)):
        pids[i] = int(pids[i].strip())
        try:
            os.kill(pids[i], signal.SIGTERM)
        except OSError:
            pass
#coding=utf-8
from django.contrib import admin

from context.context import Context

Task = Context().get("Task")


class TaskAdmin(admin.ModelAdmin):
    list_display = ('crawler', 'key', 'update_time', 'status', 'interval')
    list_editable = ('crawler', 'key', 'status', 'interval')
    list_filter = ('crawler', 'status', 'category', 'application', 'interval', 'timeout')
    fields = ('key', 'data', 'producer_id', 'category', 'application', 'crawler',
              'status', 'interval', 'timeout', 'last_run', 'next_run', 'update_time', 'create_time')
    readonly_fields = ('last_run', 'update_time', 'create_time')
    ordering = ('update_time', '-key')
    search_fields = ('key',)


admin.site.register(Task, TaskAdmin)
# standard
import json
import logging

# packages
import requests
from firebase_admin import messaging

# internal
from context.context import Context
from utility import const

_LOGGER = logging.getLogger(__name__)

token = Context.data()[const.FIREBASE][const.TOKEN]


def generate_message(data):
    # send message to firebase
    message = messaging.Message(
        data=json.dumps(data),
        token=token
    )
    resp = messaging.send(message)
    return resp
        raise PreventUpdate
    storage._update_transactions()
    transaction = storage.tables()[const.TRANSACTIONS_DATA][log_id]
    amount = transaction[const.T_LOG][const.TOTALS][const.GRAND_AMOUNT]
    day = transaction[const.BUSINESS_DAY][const.DATE_TIME]
    return f'Amount: {amount}, Time: {day}'


if __name__ == '__main__':
    # init context
    _LOGGER.info('initializing context')
    from context.context import Context
    Context.initialize(CONFIG_FILE_PATH)

    # init endpoints
    _LOGGER.info('initializing endpoints')
    from endpoints import Endpoints
    Endpoints.initialize()

    # init firebase
    _LOGGER.info('initializing firebase')
    import firebase_admin
    from firebase_admin import credentials
    from firebase_admin import db

    with FIREBASE_CRED_PATH.open() as file:
        FIREBASE_DATA = json.loads(file.read())
    cred = credentials.Certificate(FIREBASE_DATA)
# -*- coding: utf-8 -*-
import sys

root_mod = '/Users/liujiasheng/workspace/crawler/crawler'
sys.path.append(root_mod)

import django, os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings.development")
django.setup()

import re
from datetime import datetime

from apps.base.models import ScarletOnsell
from context.context import Context

htmlutil = Context().get("htmlutil")
Url = Context().get("Url")
SearchContentCrawler = Context().get("SearchContentCrawler")
FatherCrawler = Context().get("FatherCrawler")
Field = Context().get("Field")
Crawler = Context().get("Crawler")


class BuffOnsellCrawler(Crawler):
    type = "buff.onsell"

    def __init__(self, task):
        pass
        # super(BuffOnsellCrawler, self).__init__(task)

    def crawl(self):
# -*- coding: utf-8 -*-
import json
import time
import logging
from datetime import datetime, timedelta

from django.conf import settings
from django.db import transaction

from context.context import Context

CrawlerConf = Context().get("CrawlerConf")
Task = Context().get("Task")
RedisQueryApi = Context().get("RedisQueryApi")
time2str = Context().get("datetimeutil.time2str")

inject_logger = logging.getLogger("crawler.inject")
fetch_logger = logging.getLogger("crawler.fetch")

_CRAWLER_CONF = CrawlerConf()


class Status:
    """
    Task status codes.
    """
    NotStart = 0
    Running = 1
    Succeed = 2
    Failed = -1
    Canceling = -2
'''
@author: Yu
'''
import time
import sys
import traceback

import mailutil

from context.context import Context

Daemon = Context().get("utils.Daemon")


class ServiceDefinition(object):

    def __init__(self, check_func, name="Service", check_interval=180, retries=3):
        if not callable(check_func):
            raise TypeError
        self.check_func = check_func
        self.name = name
        self.check_interval = check_interval
        self.retries = retries
        self.failures = 0
        self.last_check = None

    def check(self):
        self.check_func()
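# A minimal usage sketch for ServiceDefinition; _check_redis below is a
# hypothetical health check, the real checks live elsewhere in this project.
def _check_redis():
    # Hypothetical check: raise an exception on failure, return on success.
    pass


redis_service = ServiceDefinition(_check_redis, name="Redis",
                                  check_interval=60, retries=5)
redis_service.check()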
# -*- coding: utf-8 -*-
import os
import socket
import signal
import time
import logging
import json
import re
from datetime import datetime, timedelta

from context.context import Context

ModelBase = Context().get("ModelBase")
str2time = Context().get("datetimeutil.str2time")


class Crawler(object):
    """
    Base class for business crawlers; every business crawler subclasses it.
    Each crawler must have a unique identifier stored in the class attribute named `type`,
    and must override the crawl() method.
    """
    type = "base.crawler"

    def __init__(self, task):
        self.task = task
        self.key = None
        self.data = None
def test_big_number(self):
    c = Context(value=1.0, cohorts={40000})
    s = c.serialize(zipped=True)
    c2 = Context.deserialize(s, zipped=True)
    # A large cohort id should survive the serialization round trip.
    assert c == c2
# -*- coding: utf-8 -*-
import sys

root_mod = '/home/jshliu/Project/zjld/fix/common/crawler'
sys.path.append(root_mod)

import django, os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings.development")
django.setup()

import re
from datetime import datetime

from context.context import Context

Url = Context().get("Url")
SearchContentCrawler = Context().get("SearchContentCrawler")
FatherCrawler = Context().get("FatherCrawler")
Field = Context().get("Field")


class BaiduCrawler(FatherCrawler):
    """
    Baidu news search crawler; subclasses the generic first-level crawler.
    """
    type = "baidu.news"  # Unique identifier of this crawler.
    child = SearchContentCrawler  # The crawler that will execute the generated tasks.
    item = Field(name="item", path="//div[@id='content_left']/div/div[@class='result']")  # Fields to parse; the name 'item' has a special meaning and must not be reused.
    pubtime = Field(name="pubtime", path="div//p[@class='c-author']/text()", type=datetime)
def __init__(self):
    sc = Context()
    self.__session = SparkSession.builder.getOrCreate()
def load(self):
    sc = Context()
    return sc.get_context().textFile(self.__config.get_source_path())
# -*- coding: utf-8 -*-
import copy

from context.context import Context

convert = Context().get("typeutil.convert")


class ModelMeta(type):

    def __init__(self, name, bases, dct):
        # Merge the FIELDS defined along the first base class's inheritance
        # chain into the new class's FIELDS.
        fields = dct.get('FIELDS', {})
        base = bases[0]
        while base != object:
            for k, v in base.__dict__.get('FIELDS', {}).iteritems():
                fields[k] = v
            base = base.__base__
        dct['FIELDS'] = fields

        # Collect the INDEXES lists along the same chain.
        indexes = dct.get('INDEXES', [])
        base = bases[0]
        while base != object:
            indexes.extend(base.__dict__.get('INDEXES', []))
            base = base.__base__
        dct['INDEXES'] = indexes

        type.__init__(self, name, bases, dct)


class ModelBase(dict):
    __metaclass__ = ModelMeta
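# A minimal sketch of how the metaclass merges FIELDS down the hierarchy;
# BaseModel/ChildModel and their fields are made-up examples for illustration.
class BaseModel(ModelBase):
    FIELDS = {"id": u""}


class ChildModel(BaseModel):
    FIELDS = {"title": u""}


# ChildModel.FIELDS now holds both "id" and "title", because ModelMeta walks
# the base classes at class-creation time and copies their FIELDS in.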
# standard
import json
import logging

# packages
import requests

# internal
from context.context import Context
from utility import const

_LOGGER = logging.getLogger(__name__)

base_url = Context.data()[const.SILVER][const.BASE_URL]
headers = Context.data()[const.SILVER][const.HEADERS]


def get_store():
    # get store data from the silver api
    # build request url
    req_url = base_url + '/v2/stores'
    res = requests.get(url=req_url, headers=headers)
    if res.status_code == 200:
        # there is only one store in our data, so we only return that
        return res.json()[const.RESULT][0]
    else:
        _LOGGER.error('Request error')
        return None
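# A minimal usage sketch, assuming Context has already been initialized
# elsewhere so the module-level base_url/headers lookups above succeed;
# get_store() returns None on any non-200 response.
store = get_store()
if store is None:
    _LOGGER.warning('could not fetch store data')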
# -*- coding: utf-8 -*-
import logging
import copy
import time
from uuid import uuid1
from datetime import datetime

from context.context import Context

unix_time = Context().get("datetimeutil.unix_time")
ModelBase = Context().get("ModelBase")
CassandraQueryApi = Context().get("CassandraQueryApi")

import_logger = logging.getLogger("crawler.import")


class ContentModel(ModelBase):
    TYPE = "base.content"
    FIELDS = {
        "id": uuid1(),
        "source": u"",
        "origin_source": u"",
        "pubtime": datetime.utcfromtimestamp(0),
        "crtime": datetime.now(),
        "crtime_int": int(time.time() * 1000000),
        "province": u"",
        "city": u"",
        "district": u"",
        "tag": "",
        "comment": {},
# -*- coding: utf-8 -*-
import sys

from django.conf import settings

from context.context import Context

CrawlerDaemon = Context().get("CrawlerDaemon")


def run(*args):
    """
    Entry point of the task-injection service process.
    """
    jobtracker = CrawlerDaemon(settings.CRAWLER_JOB_PID)
    if args[0] == 'start':
        jobtracker.start()
    elif args[0] == 'stop':
        jobtracker.stop()
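# A minimal invocation sketch; run() is presumably called by a management
# command or script runner that forwards 'start'/'stop' as the first
# positional argument.
if __name__ == '__main__':
    run('start')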
# standard
import json
import logging

# packages
import requests

# internal
from context.context import Context
from utility import const

_LOGGER = logging.getLogger(__name__)

base_url = Context.data()[const.TRANSACTIONS][const.BASE_URL]
headers = Context.data()[const.TRANSACTIONS][const.HEADERS]


def transaction_details(transaction_id: str):
    # get transaction data from the silver api
    # build request url
    req_url = base_url + '/transaction-document/transaction-documents/' + transaction_id
    res = requests.get(
        url=req_url,
        headers=headers
    )
    if res.status_code == 200:
        data = res.json()
        del data[const.ID]
# -*- coding: utf-8 -*-
import logging
import copy
from uuid import uuid1
from datetime import datetime

from context.context import Context

unix_time = Context().get("datetimeutil.unix_time")
ModelBase = Context().get("ModelBase")

_LOGGER = logging.getLogger("ecommerceimport")


class ContentModel(ModelBase):
    TYPE = "base.content"
    FIELDS = {
        "id": uuid1(),
        "source": u"",
        "source_level": {},
        "first_level": u"",
        "second_level": u"",
        "third_level": u"",
        "fourth_level": u"",
        "fifth_level": u"",
        "province": u"",
        "city": u"",
        "district": u"",
        "comment": {}
    }
# standard
import logging

# packages
import requests

# internal
from utility import const
from context.context import Context

_LOGGER = logging.getLogger(__name__)

BASE_URL = Context.data()[const.BASE_URL]
HEADERS = Context.data()[const.HEADERS]


def inspect_account(accountID: str, base_url: str = BASE_URL, headers: dict = HEADERS):
    '''Fetch a single account by its ID from the configured API.'''
    # build request url
    req_url = base_url + f'/accounts/{accountID}'
    # make request
    inspect_account_request = requests.get(url=req_url, headers=headers)
    # if request was successful
    if inspect_account_request.status_code == 200:
        inspect_account_response = {
            const.STATUS: inspect_account_request.status_code,
# -*- coding: utf-8 -*-
from uuid import uuid1

from django.conf import settings

from context.context import Context

ContentModel = Context().get("search.ContentModel")
CassandraQueryApi = Context().get("CassandraQueryApi")


class SearchArticleModel(ContentModel):
    """docstring for SearchArticleModel"""
    #TYPE = "zjld.article"
    TYPE = "zjld.search"
    FIELDS = {
        "type": u"元搜索",
        "author": u"",
        "publisher": u"",
        "title": u"",
        "content": u"",
        "url": u"",
        "key": u"",
    }

    def __init__(self, dct={}):
        super(SearchArticleModel, self).__init__(dct)

    def find_dup(self):
        dup = []