コード例 #1
0
    def __init__(self, root_cat, depth, log_file, output_dir, root_dir):
        """Wire up loggers, helper objects, crawl settings and path names.

        Args:
            root_cat: Root category; stored and used as the first entry of
                the category list.
            depth: Stored as the depth limit.
            log_file: Passed to every ``mylogger.get_logger`` call.
            output_dir: Passed to ``FileIO`` and used as the prefix of the
                train / decode / model / all-labeled file paths.
            root_dir: Passed to ``MorphemeTagger`` and used as the prefix of
                the template file path.
        """
        def _debug_logger(owner):
            # Each collaborator gets a DEBUG-level logger named after its class.
            return mylogger.get_logger(owner.__name__, log_file, mylogger.DEBUG)

        # Loggers, created in a fixed order: self, file I/O, wiki, morpheme.
        self._logger = _debug_logger(DistantExtractor)
        file_io_log = _debug_logger(FileIO)
        wikipedia_log = _debug_logger(WikipediaExtractor)
        tagger_log = _debug_logger(MorphemeTagger)

        # Helper objects; the wiki extractor shares the FileIO instance.
        self._file_io = FileIO(output_dir, file_io_log)
        self._wiki_extractor = WikipediaExtractor(wikipedia_log, self._file_io)
        self._morpheme_tagger = MorphemeTagger(tagger_log, root_dir)

        # Constructor arguments and seed state.
        self._root_cat = root_cat
        self._limit_depth = depth
        # TODO: later, allow seeds for several classes by keeping a dict that
        # maps name -> seeds (list); the labeling code already works that way.
        self._seed_name = 'Car'
        self._seeds = []
        self._categories = [root_cat]

        # Directory names.
        self._seed_dir = 'seeds'
        self._unlabeled_dir = 'unlabeled_corpora'
        self._cleaned_dir = 'cleaned_corpora'
        self._mecab_dir = 'mecab_corpora'
        self._labeled_dir = 'labeled_corpora'
        self._train_dir = 'train_corpora'
        self._output = 'output'
        self._temp_dir = 'temp'

        # File paths: the template lives under root_dir, the rest under output_dir.
        self._templatefile = '%s/templates/template' % root_dir
        self._trainfile = '%s/train.txt' % output_dir
        self._decodefile = '%s/decode.txt' % output_dir
        self._modelfile = '%s/model' % output_dir
        self._all_labeledfile = '%s/all_labeled.txt' % output_dir
コード例 #2
0
    def __init__(self,
                 blog,
                 limit_start=0,
                 num=30,
                 threads_num=10,
                 need_save=True,
                 save_path=None,
                 img_re=None,
                 total_post_re=None,
                 max_posts=None,
                 proxies=None):
        """Store the crawl settings for one blog and create the work queues.

        Args:
            blog: Blog name, substituted into the tumblr JSON read-API URL.
            limit_start: Stored as ``self.limit_start``.
            num: Stored as ``self.num``.
            threads_num: Stored as ``self.threads_num``.
            need_save: When true, ``save_path`` is stored and
                ``_check_save_path()`` is called; otherwise an "imgurl"
                logger is created instead.
            save_path: Only stored when ``need_save`` is true.
            img_re: Pattern used instead of the default ``photo-url-1280``
                pattern when truthy.
            total_post_re: Pattern used instead of the default
                ``"posts-total"`` pattern when truthy.
            max_posts: Stored as ``self.max_posts``.
            proxies: Stored as ``self.proxies``.
        """
        self.blog = blog
        self.base_url = "http://%s.tumblr.com/api/read/json?start=" % self.blog

        # Fall back to the built-in patterns when none (or a falsy one) is given.
        self.total_post_re = total_post_re or re.compile(
            r'"posts-total":(\d+),')
        self.img_re = img_re or re.compile(r'photo-url-1280":"(http.*?)",')

        self.total_posts = 0
        self.max_posts, self.limit_start, self.num = max_posts, limit_start, num

        self.need_save = need_save
        if not self.need_save:
            # The logger module is imported only on this path.
            from mylogger import get_logger
            self.imglog = get_logger("imgurl")
        else:
            self.save_path = save_path
            self._check_save_path()

        self.proxies = proxies
        self.threads_num = threads_num

        # Two independent queues: one for images, one for posts.
        self.img_queue = Queue()
        self.post_queue = Queue()
コード例 #3
0
def main():
    """Start the Qt application, show the main window and exit with its status."""
    log = get_logger(__name__)
    log.debug("start self_checkout_machine")

    # Settings are initialised before any Qt object is created.
    settings.init()

    qt_app = QtGui.QApplication(sys.argv)
    main_win = main_window.MainWindow()
    main_win.goodShow()
    main_win.check_devices()

    # Run the event loop and hand its return value to the interpreter.
    exit_status = qt_app.exec_()
    sys.exit(exit_status)
コード例 #4
0
    def __init__(self, blog, limit_start=0, num=30, threads_num=10, need_save=True, save_path=None, img_re=None, total_post_re=None, max_posts=None, proxies=None):
        """Record the crawl configuration for one blog and create empty work queues.

        When ``need_save`` is true, ``save_path`` is stored and
        ``_check_save_path()`` is called; otherwise an "imgurl" logger is
        created. ``img_re`` / ``total_post_re`` replace the built-in patterns
        only when they are truthy. All other arguments are stored unchanged
        on attributes of the same name.
        """
        # Substitute the built-in patterns for any that were not supplied.
        if not total_post_re:
            total_post_re = re.compile(r'"posts-total":(\d+),')
        if not img_re:
            img_re = re.compile(r'photo-url-1280":"(http.*?)",')

        self.blog = blog
        self.base_url = "http://%s.tumblr.com/api/read/json?start=" % self.blog
        self.total_post_re = total_post_re
        self.img_re = img_re

        # Counters and paging limits.
        self.total_posts = 0
        self.max_posts = max_posts
        self.limit_start = limit_start
        self.num = num

        self.need_save = need_save
        if not self.need_save:
            # The logger module is imported only on this path.
            from mylogger import get_logger
            self.imglog = get_logger("imgurl")
        else:
            self.save_path = save_path
            self._check_save_path()

        self.proxies = proxies

        # Work queues and the configured thread count.
        self.img_queue = Queue()
        self.post_queue = Queue()
        self.threads_num = threads_num
コード例 #5
0
#!/usr/bin/env python
#-*-coding:utf-8-*-

"""
    通用程序
"""
import sys
import os
import requests

from mylogger import get_logger
dllog = get_logger("app")

# Force the default string encoding to UTF-8 on Python 2. `reload` is not a
# builtin on Python 3, so the call raises NameError there and nothing is changed.
try:
    reload(sys)
    sys.setdefaultencoding('utf-8')
except NameError:
    # The only supported default encodings in Python are:

    #  Python 2.x: ASCII
    #  Python 3.x: UTF-8
    # So no need to sys.setdefaultencoding('utf-8')
    pass # py3

# Download data using requests
def download_page(url, ret_json=False, proxies=None):
    if not url:
        dllog.info("url should not be None")
        return ''

    try:
コード例 #6
0
ファイル: data_sync.py プロジェクト: ywb770377253/sync_cqssc
import re
from mylogger import get_logger

# Force the default string encoding to UTF-8 (Python 2 idiom; `reload` is a
# builtin only there).
reload(sys)
sys.setdefaultencoding('utf-8')

# Database connection settings.
# NOTE(review): host, schema, user and password are hard-coded in source.
DBHOST = "localhost:3306"
SCHEMA = "CAIPIAO"
DBUSER = "******"
DBPASSWD = "passwd"

# Module-level connection object, created at import time.
db = torndb.Connection(host=DBHOST,
                       database=SCHEMA,
                       user=DBUSER,
                       password=DBPASSWD)
# Module-level logger named "caipiao".
cplog = get_logger("caipiao")


class Data_Sync(object):
    ssc_re = re.compile(
        r'<td class=\'gray\'>(.*?)</td>(<td class=\'red big\'>|<td style=\'width:65px\'>)(.*?)</td>.*?<tr>'
    )

    def __init__(self,
                 start_date="20150101",
                 sleep_secs=10,
                 run_ever=True,
                 callback=None):
        self.start_date = start_date if start_date > "20130101" else "20150101"
        self.run_ever = run_ever
        self.base_url = "http://chart.cp.360.cn/kaijiang/kaijiang?lotId=255401&spanType=2&span="
コード例 #7
0
ファイル: module.py プロジェクト: ujjwaldalal/AppLogger
import mylogger

print("here as well")

log = mylogger.get_logger(__name__)
print("here here")


def say_hello(name):
    """Print a trace marker and log a debug message.

    NOTE(review): ``name`` is not used in the statements visible here --
    confirm whether the function continues beyond this point.
    """
    print("here too")
    log.debug("Greeting people.")
コード例 #8
0
from flask import Flask, jsonify, make_response, request
from flask_cors import CORS, cross_origin
import json
from Database import Database as db
from mylogger import get_logger

# Module-level logger obtained with default arguments.
logger = get_logger()


# Flask application with CORS enabled for every route.
app = Flask(__name__)
# NOTE(review): Flask-CORS documents this option as `supports_credentials`;
# `support_credentials` looks like a typo and may be ignored -- confirm.
CORS(app, support_credentials=True)
app.config["CORS_HEADERS"] = "Content-Type"


@app.route("/api/insert", methods=["POST"])
@cross_origin(origin="*", headers=["Content-Type", "Authorization"])
def insert_data():

    data = json.loads(request.data)
    logger.info(f"The request data to insert: {data}")

    count = db.get_count_by_date(data["date"])
    if count == 1:
        logger.info(f"Updating only! Temperature key already exists!")
        db.update_date(data)
    else:
        logger.info(f"Inserting new entry!")

        # Insert
        db.insert_data(data)
コード例 #9
0
ファイル: worker.py プロジェクト: AssassinPig/viper-py
    email: [email protected]
'''
import settings
import redis
from crawler import Crawler
from crawleritem import CrawlerItem
from StringIO import StringIO
import lxml.html as LH
from strategy import Strategy
from mylogger import get_logger
from datetime import *

import hashlib
import os

logger = get_logger(name="worker", file_name="worker.log")

class Worker(Crawler):

    def __init__(self, start_urls=None, template_cls=None, strategy=None, settings=None):
        super(Worker, self).__init__(start_urls=start_urls, template_cls=template_cls, strategy=strategy)
        logger.debug('Init Worker...')
        if settings is None:
            try:
                self.r = redis.StrictRedis(host='localhost', port=6379, db=0)
                logger.debug('Connection local redis')
                if self.r is None:
                    raise redis.exceptions.ConnectionError 
            except (redis.exceptions.ConnectionError), e:
                raise e 
        else:
コード例 #10
0
ファイル: master.py プロジェクト: AssassinPig/viper-py
# -*- coding: utf-8 -*-
'''
    author: assassinpig
    email: [email protected]
'''
import signal, os
import time
import settings
import redis
import threading
from mylogger import get_logger 

logger = get_logger(name="master", file_name="master.log")

class Master(object): 

    @staticmethod
    def exit_handler(signum, frame):
        """Log that Ctrl+C was pressed; ``signum`` and ``frame`` are ignored."""
        logger.debug('you press Ctrl+C!')

    def __init__(self, settings=None, start_urls=None, strategy=None):
        logger.debug('Init Master ...')
        if settings is None:
            try:
                logger.debug('Connect to local redis')
                self.r = redis.StrictRedis(host='localhost', port=6379, db=0)
                if self.r is None:
                    raise redis.exceptions.ConnectionError 
            except (redis.exceptions.ConnectionError), e:
コード例 #11
0
#!/usr/bin/env python
#-*-coding:utf-8-*-
"""
    1、检测商品是否有货(支持省、市,默认是四川成都);
    2、获取当前价格;
    输入值,商品 url 或者 skuid
    返回值: (time.time(), 是否有货,当前价格)
"""
import re
import time
import requests

from mylogger import get_logger

jdlog = get_logger('jd')


def run(url, provinceid=1, cityid=72):
    """Look up title, stock state and current price for a product URL.

    Returns:
        A dict with the keys "last_update" (local time formatted as
        ``%Y-%m-%d %H:%M:%S``), "stock_state", "price" and "title", or
        ``None`` when ``parse_url_for_skuid`` yields a falsy sku id.
    """
    sku = parse_url_for_skuid(url)
    if not sku:
        return None

    name, in_stock = check_if_in_stock(sku, provinceid, cityid)
    current_price = get_current_price(sku)
    # The timestamp is taken after both lookups have completed.
    checked_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    return dict(
        last_update=checked_at,
        stock_state=in_stock,
        price=current_price,
        title=name,
    )

コード例 #12
0
    email: [email protected]
'''
import settings
import redis
from crawler import Crawler
from crawleritem import CrawlerItem
from StringIO import StringIO
import lxml.html as LH
from strategy import Strategy
from mylogger import get_logger
from datetime import *

import hashlib
import os

logger = get_logger(name="worker", file_name="worker.log")


class Worker(Crawler):
    def __init__(self,
                 start_urls=None,
                 template_cls=None,
                 strategy=None,
                 settings=None):
        super(Worker, self).__init__(start_urls=start_urls,
                                     template_cls=template_cls,
                                     strategy=strategy)
        logger.debug('Init Worker...')
        if settings is None:
            try:
                self.r = redis.StrictRedis(host='localhost', port=6379, db=0)
コード例 #13
0
ファイル: utils.py プロジェクト: CN-P5/163spider
# LastChange:   2014-12-08 10:12:51
# History:      
#=============================================================================
'''

"""
    通用程序
"""
import torndb
import _mysql_exceptions
import sys
import requests

from mylogger import get_logger

ulog = get_logger("utils")
applog = get_logger("app")

reload(sys)
sys.setdefaultencoding('utf-8')

db = torndb.Connection(host="localhost", database="WANGYI", user="******", password="******")

# Execute a database statement
def insert_mysql(sql):
    try:
        db.insert(sql)
        return True

    except _mysql_exceptions.IntegrityError, e:
        """ 主键冲突,此处不算错误 """
コード例 #14
0
ファイル: jd.py プロジェクト: treejames/jd_watch
#!/usr/bin/env python
# -*-coding:utf-8-*-
"""
    1、检测商品是否有货(支持省、市,默认是四川成都);
    2、获取当前价格;
    输入值,商品 url 或者 skuid
    返回值: (time.time(), 是否有货,当前价格)
"""
import re
import time
import requests

from mylogger import get_logger

jdlog = get_logger("jd")


def run(url, provinceid=1, cityid=72):
    """Collect stock state, price and title for the product behind ``url``.

    Returns:
        ``None`` when ``parse_url_for_skuid`` gives a falsy sku id; otherwise
        a dict with the keys "last_update", "stock_state", "price", "title".
    """
    sku = parse_url_for_skuid(url)
    if not sku:
        return None

    product_title, availability = check_if_in_stock(sku, provinceid, cityid)
    current_price = get_current_price(sku)

    # Built key by key, in the same order as the original literal.
    report = {}
    report["last_update"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    report["stock_state"] = availability
    report["price"] = current_price
    report["title"] = product_title
    return report


def parse_url_for_skuid(url):
コード例 #15
0
ファイル: test.py プロジェクト: shellleyma/python-whois
import time
from mylogger import get_logger
import mysql_python
from whois import whois
from whois import parser 
try:
    import configparser # py3
except ImportError:
    import ConfigParser as configparser # py2
import pytz
# First timezone that pytz lists for country code 'cn'.
tz = pytz.timezone(pytz.country_timezones('cn')[0])

# Settings are read from config.ini, resolved relative to the working directory.
cf = configparser.ConfigParser()
cf.read("config.ini")

# One logger per log name: success, response, error and domain orders.
dllog = get_logger("success")
responselog = get_logger("response")
errorlog = get_logger("error")
orederlog = get_logger("domains_orders")

# Values from the [resellerclub] and [db] sections of config.ini.
userid=cf.get("resellerclub", "userid")
apikey=cf.get("resellerclub", "apikey")
mysqlhost=cf.get("db", "mysqlhost")
mysqluser=cf.get("db", "mysqluser")
mysqlpass=cf.get("db", "mysqlpass")
mysqldb=cf.get("db", "mysqldb")


# Comma-separated recipient list.
mail_to="[email protected],[email protected],[email protected]"

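# Used by the initial version; the data is now read from the database, and the variables below serve as last-resort defaults when that read fails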
コード例 #16
0
#!/usr/bin/env python
#-*-coding:utf-8-*-
"""
    通用程序
"""
import sys
import os
import requests

from mylogger import get_logger
dllog = get_logger("app")

# Force the default string encoding to UTF-8 on Python 2. `reload` is not a
# builtin on Python 3, so the call raises NameError there and nothing is changed.
try:
    reload(sys)
    sys.setdefaultencoding('utf-8')
except NameError:
    # The only supported default encodings in Python are:

    #  Python 2.x: ASCII
    #  Python 3.x: UTF-8
    # So no need to sys.setdefaultencoding('utf-8')
    pass  # py3


# Download data using requests
def download_page(url, ret_json=False, proxies=None):
    if not url:
        dllog.info("url should not be None")
        return ''

    try:
コード例 #17
0
# -*- coding: utf-8 -*-
'''
    author: assassinpig
    email: [email protected]
'''
import signal, os
import time
import settings
import redis
import threading
from mylogger import get_logger

logger = get_logger(name="master", file_name="master.log")


class Master(object):
    @staticmethod
    def exit_handler(signum, frame):
        """Write a debug log entry saying Ctrl+C was pressed; arguments are unused."""
        logger.debug('you press Ctrl+C!')

    def __init__(self, settings=None, start_urls=None, strategy=None):
        logger.debug('Init Master ...')
        if settings is None:
            try:
                logger.debug('Connect to local redis')
                self.r = redis.StrictRedis(host='localhost', port=6379, db=0)
                if self.r is None:
                    raise redis.exceptions.ConnectionError
            except (redis.exceptions.ConnectionError), e:
コード例 #18
0
import torndb
import sys
import requests
import re
from mylogger import get_logger

# Force the default string encoding to UTF-8 (Python 2 idiom; `reload` is a
# builtin only there).
reload(sys)
sys.setdefaultencoding('utf-8')

# Database connection settings.
# NOTE(review): host, schema, user and password are hard-coded in source.
DBHOST = "localhost:3306"
SCHEMA = "CAIPIAO"
DBUSER = "******"
DBPASSWD = "passwd"

# Module-level connection object, created at import time.
db = torndb.Connection(host=DBHOST, database=SCHEMA, user=DBUSER, password=DBPASSWD)
# Module-level logger named "caipiao".
cplog = get_logger("caipiao")

class Data_Sync(object):
    ssc_re = re.compile(r'<td class=\'gray\'>(.*?)</td>(<td class=\'red big\'>|<td style=\'width:65px\'>)(.*?)</td>.*?<tr>')

    def __init__(self, start_date="20150101", sleep_secs = 10, run_ever=True, callback=None):
        self.start_date = start_date if start_date > "20130101" else "20150101"
        self.run_ever = run_ever
        self.base_url = "http://chart.cp.360.cn/kaijiang/kaijiang?lotId=255401&spanType=2&span="
        self.latest_date = ''
        self.latest_period = ''
        self.need_sleep = False
        self.sleep_secs = sleep_secs
        self.callback=callback

    def run(self):
コード例 #19
0
ファイル: utils.py プロジェクト: xiaogang00/web-crawler
# Version:      0.0.1
# LastChange:   2014-12-08 10:12:51
# History:      
#=============================================================================
'''
"""
    通用程序
"""
import torndb
import _mysql_exceptions
import sys
import requests

from mylogger import get_logger

ulog = get_logger("utils")
applog = get_logger("app")

reload(sys)
sys.setdefaultencoding('utf-8')

db = torndb.Connection(host="localhost",
                       database="WANGYI",
                       user="******",
                       password="******")


# Execute a database statement
def insert_mysql(sql):
    try:
        db.insert(sql)