Ejemplo n.º 1
0
# -*- Encoding: utf-8 -*-
import re
import os
import json

from log4f import debug_logger

log = debug_logger('log/parser', 'crawler.parser')


class Indexing:
    def __init__(self, ptn, filename):
        self.ptn = ptn
        self.filename = filename

        self.data = dict()
        if os.path.exists(filename):
            with open(self.filename, 'r') as fr:
                self.data = json.load(fr)

    def save(self):
        with open(self.filename, 'wb') as fw:
            json.dump(self.data, fw, indent=4)

    def scan(self, path, save_period=2000):
        total = set(os.listdir(path))
        todo = total - set(self.data.keys())
        print '{}/{} to parse.'.format(len(todo), len(total))

        for i, filename in enumerate(todo):
            with open(os.path.join(path, filename)) as f:
Ejemplo n.º 2
0
# -*- Encoding: utf-8 -*-
"""builk requests for some kinds of web pages

"""
import re
import os
import shutil
import redis

from req import request, request_pages
from log4f import debug_logger

log = debug_logger('log/download', 'download')


# Compiled once at import time. HTML tag names are case-insensitive and a
# title may span several lines, hence IGNORECASE and DOTALL.
_TITLE_PTN = re.compile(r'<title>(.*?)</title>', re.IGNORECASE | re.DOTALL)


def get_title(content):
    """Demo validator for builk_single: return the text of the <title> tag.

    :param content: page source to search.
    :return: the text between the first ``<title>`` and the following
        ``</title>`` (matched case-insensitively, newlines allowed), or
        ``'No Title'`` when no such pair exists.
    """
    m = _TITLE_PTN.search(content)
    if m is None:
        return 'No Title'
    return m.group(1)


def builk_single(job, url_ptn, cache_dir, find_new=None):
    """builk download. one single corresponding page for one ID.

    usually, it is used for profile page of a user/book/shop etc.
    """
    key = job.next()
    print 'downloading...'
Ejemplo n.º 3
0
# -*- Encoding: utf-8 -*-
import json
import redis
from os.path import join, dirname
from wechat_sdk import WechatExt
# from wechat_sdk.exceptions import NeedLoginError

from log4f import debug_logger
import settings

LOGIN_TIMEOUT = 4 * 3600  # 4 hours
r = redis.StrictRedis(**settings.REDIS_CONN)
log = debug_logger(join(dirname(__file__), 'log/notify'), 'root.notify')


def login(username, password):
    """Return a WechatExt client for *username*, reusing cached cookies.

    When redis holds a value under *username*, its JSON payload is expanded
    into keyword arguments for ``WechatExt`` and ``login()`` is not called.
    Otherwise a fresh login is performed and the client's token/cookies are
    stored in redis under *username* with a ``LOGIN_TIMEOUT`` expiry.

    :param username: account name; also used as the redis key.
    :param password: account password; only handed to ``WechatExt``.
    :return: a ``WechatExt`` instance.
    """
    d = r.get(username)
    if d:
        log.info('lazy login. use cookie, username={}'.format(username))
        return WechatExt(username, password, login=False, **json.loads(d))

    # Never echo the credentials: printing them would leak the plaintext
    # password into stdout and anything that captures it.
    wechat = WechatExt(username, password, login=False)
    wechat.login()
    log.info('login to wechat server. username={}'.format(username))
    r.setex(username, LOGIN_TIMEOUT,
            json.dumps(wechat.get_token_cookies(), indent=4))
    return wechat


def init_info():
Ejemplo n.º 4
0
# -*- Encoding: utf-8 -*-
import redis
from wechat import send

from os.path import dirname, join
from log4f import debug_logger

BASE_DIR = dirname(__file__)
log = debug_logger(join(BASE_DIR, 'log/download'), 'root.download')


class JobPool:
    def __init__(self, job_name,
                 host='localhost', port=6379, db=0,
                 timeout=10):
        """Bind the pool to a redis database and derive its key names.

        :param job_name: job label; kept as ``self.name`` and used as the
            prefix of the ``<job_name>:total`` and ``<job_name>:todo`` keys.
        :param host: redis host.
        :param port: redis port.
        :param db: redis database number.
        :param timeout: stored on the instance as ``self.timeout``.
        """
        self.name = job_name
        self.timeout = timeout

        # Key names for the two redis structures owned by this job.
        self.total_tbl = '{}:total'.format(job_name)
        self.todo_tbl = '{}:todo'.format(job_name)

        self.db = redis.StrictRedis(host=host, port=port, db=db)

    def init_data(self, total, done):
        self.db.delete(self.total_tbl)
        self.db.delete(self.todo_tbl)

        todo = set(total) - set(done)

        self.db.sadd(self.total_tbl, *total)
Ejemplo n.º 5
0
# -*- Encoding: utf-8 -*-
import json
import redis
from os.path import join, dirname
from wechat_sdk import WechatExt
# from wechat_sdk.exceptions import NeedLoginError

from log4f import debug_logger
import settings


LOGIN_TIMEOUT = 4 * 3600  # 4 hours
r = redis.StrictRedis(**settings.REDIS_CONN)
log = debug_logger(join(dirname(__file__), 'log/notify'), 'root.notify')


def login(username, password):
    """Return a WechatExt client for *username*, reusing cached cookies.

    When redis holds a value under *username*, its JSON payload is expanded
    into keyword arguments for ``WechatExt`` and ``login()`` is not called.
    Otherwise a fresh login is performed and the client's token/cookies are
    stored in redis under *username* with a ``LOGIN_TIMEOUT`` expiry.

    :param username: account name; also used as the redis key.
    :param password: account password; only handed to ``WechatExt``.
    :return: a ``WechatExt`` instance.
    """
    d = r.get(username)
    if d:
        log.info('lazy login. use cookie, username={}'.format(username))
        return WechatExt(username, password, login=False, **json.loads(d))

    # Never echo the credentials: printing them would leak the plaintext
    # password into stdout and anything that captures it.
    wechat = WechatExt(username, password, login=False)
    wechat.login()
    log.info('login to wechat server. username={}'.format(username))
    r.setex(username, LOGIN_TIMEOUT,
            json.dumps(wechat.get_token_cookies(), indent=4))
    return wechat

Ejemplo n.º 6
0
# -*- Encoding: utf-8 -*-
import re
import redis
import socket
from httplib2 import Http

import time
import random

from os.path import dirname, join
from log4f import debug_logger
import settings

BASE_DIR = dirname(__file__)
log = debug_logger(join(BASE_DIR, 'log/request'), 'root.request')

r = redis.StrictRedis(**settings.REDIS_CONN)


def wait(f):
    """Decorate *f* so that calls are spaced out through a redis key.

    Before each call the wrapper sleeps for the remaining TTL of the
    ``http-lock`` key, then re-arms the key with a random expiry drawn between
    ``settings.DELAY_BOTTOM`` and ``settings.DELAY_TOP`` (truncated to an
    integer number of seconds) and finally invokes *f*.

    :param f: callable to throttle.
    :return: wrapper that forwards all arguments to *f* and returns its result.
    """
    from functools import wraps  # local import keeps this block self-contained

    lock_name = 'http-lock'

    @wraps(f)  # preserve the name and docstring of the decorated callable
    def _wrap_func(*args, **kwargs):
        # Depending on the redis-py version, ttl() may report None or a
        # negative number for a missing key; only a positive value is a wait.
        # NOTE(review): ttl and setex are two separate commands, so concurrent
        # workers can pass this check together - confirm that is acceptable.
        t = r.ttl(lock_name)
        if t and t > 0:
            time.sleep(t)

        n_t = int(random.uniform(settings.DELAY_BOTTOM, settings.DELAY_TOP))
        # SETEX rejects a non-positive expiry, which the int() truncation can
        # produce when the configured bounds are below one second.
        if n_t > 0:
            r.setex(lock_name, n_t, 'locking')
        return f(*args, **kwargs)
    return _wrap_func
Ejemplo n.º 7
0
# -*- Encoding: utf-8 -*-
import redis
from wechat import send

from os.path import dirname, join
from log4f import debug_logger

BASE_DIR = dirname(__file__)
log = debug_logger(join(BASE_DIR, 'log/download'), 'root.download')


class JobPool:
    def __init__(self,
                 job_name,
                 host='localhost',
                 port=6379,
                 db=0,
                 timeout=10):
        """Bind the pool to a redis database and derive its key names.

        :param job_name: job label; kept as ``self.name`` and used as the
            prefix of the ``<job_name>:total`` and ``<job_name>:todo`` keys.
        :param host: redis host.
        :param port: redis port.
        :param db: redis database number.
        :param timeout: stored on the instance as ``self.timeout``.
        """
        self.name = job_name
        self.timeout = timeout

        # Key names for the two redis structures owned by this job.
        self.total_tbl = '{}:total'.format(job_name)
        self.todo_tbl = '{}:todo'.format(job_name)

        self.db = redis.StrictRedis(host=host, port=port, db=db)

    def init_data(self, total, done):
        self.db.delete(self.total_tbl)
        self.db.delete(self.todo_tbl)

        todo = set(total) - set(done)

        self.db.sadd(self.total_tbl, *total)
        self.db.rpush(self.todo_tbl, *todo)

    def count_todo(self):
Ejemplo n.º 8
0
# -*- Encoding: utf-8 -*-
import re
import socket
from httplib2 import Http

import time
import random

from log4f import debug_logger

log = debug_logger('log/request', 'root.request')
_last_req = None


def delay(bottom=2, top=7):
    """Sleep so that calls are separated by a random gap.

    The first call only records the current time. Every later call sleeps
    until a moment drawn uniformly between *bottom* and *top* seconds after
    the previously recorded time (not at all if that moment has passed), then
    records the time again.

    :param bottom: lower bound of the random gap, in seconds.
    :param top: upper bound of the random gap, in seconds.
    :return: the number of seconds slept; ``0`` on the first call.
    """
    global _last_req
    if _last_req is not None:
        earliest = _last_req + random.uniform(bottom, top)
        period = max(0, earliest - time.time())
        log.debug('...wait {:.2f} sec'.format(period))
        time.sleep(period)
        _last_req = time.time()
        return period

    # Nothing to wait for yet: just start the clock.
    _last_req = time.time()
    return 0


def wait(f):
    def _wrap_func(*args, **kwargs):
        delay()
Ejemplo n.º 9
0
# -*- Encoding: utf-8 -*-
import re
import os
import json

from log4f import debug_logger

log = debug_logger('log/parser', 'crawler.parser')


class Indexing:
    def __init__(self, ptn, filename):
        self.ptn = ptn
        self.filename = filename

        self.data = dict()
        if os.path.exists(filename):
            with open(self.filename, 'r') as fr:
                self.data = json.load(fr)

    def save(self):
        with open(self.filename, 'wb') as fw:
            json.dump(self.data, fw, indent=4)

    def scan(self, path, save_period=2000):
        total = set(os.listdir(path))
        todo = total - set(self.data.keys())
        print '{}/{} to parse.'.format(len(todo), len(total))

        for i, filename in enumerate(todo):
            with open(os.path.join(path, filename)) as f:
Ejemplo n.º 10
0
# -*- Encoding: utf-8 -*-
import json
import time
from os.path import join, dirname, exists
from os import makedirs

import tornado.ioloop
import tornado.web
from tornado.options import define, options

from wechat_sdk import WechatBasic, WechatExt
from wechat_sdk.exceptions import NeedLoginError

from log4f import debug_logger

log = debug_logger(join(dirname(__file__), 'log'), 'root')

define("username", default='username', help="username of wechat", type=str)
define("password", default='password', help="password of wechat", type=str)
define("token", default='', help="token of wechat", type=str)
define("port", default=8000, help="run on the given port", type=int)
define("debug", default=False, help="run in Debug mode", type=bool)

def today():
    """Return the current local date formatted as ``YYYYMMDD``."""
    return time.strftime('%Y%m%d', time.localtime())


# Directory named ``cookie`` located next to this module.
cookie_dir = join(dirname(__file__), 'cookie')


def login_http(username, password):
    wechat = WechatExt(username, password)
    wechat.login()
Ejemplo n.º 11
0
# -*- Encoding: utf-8 -*-
"""builk requests for some kinds of web pages

"""
import re
import os
import shutil
import redis

from req import request, request_pages
from log4f import debug_logger

log = debug_logger('log/download', 'download')


# Compiled once at import time. HTML tag names are case-insensitive and a
# title may span several lines, hence IGNORECASE and DOTALL.
_TITLE_PTN = re.compile(r'<title>(.*?)</title>', re.IGNORECASE | re.DOTALL)


def get_title(content):
    """Demo validator for builk_single: return the text of the <title> tag.

    :param content: page source to search.
    :return: the text between the first ``<title>`` and the following
        ``</title>`` (matched case-insensitively, newlines allowed), or
        ``'No Title'`` when no such pair exists.
    """
    m = _TITLE_PTN.search(content)
    if m is None:
        return 'No Title'
    return m.group(1)


def builk_single(job, url_ptn, cache_dir, find_new=None):
    """builk download. one single corresponding page for one ID.

    usually, it is used for profile page of a user/book/shop etc.
    """
    key = job.next()
    print 'downloading...'