Code example #1
    def create_model_then_learn(shared_stuff,
                                model_type,
                                model_num,
                                policy_type,
                                env,
                                learning_starts=1000,
                                prioritized_replay=False,
                                batch_size=32,
                                verbose=0):
        global logdirs
        # "model_type == 'i' or 'm'" is always truthy; test membership instead
        assert model_type in ('i', 'm'), "invalid model type"
        if model_type == 'm':
            batch_size = n_indiv * batch_size
        print(type(env))
        model = DQN(policy_type,
                    env,
                    learning_rate=1e-4,
                    learning_starts=learning_starts,
                    prioritized_replay=prioritized_replay,
                    batch_size=batch_size,
                    verbose=verbose,
                    target_network_update_freq=5000,
                    buffer_size=50000,
                    shared_stuff=shared_stuff)
        model.model_type = model_type
        model.model_num = model_num

        if model_type == 'i':
            model.indiv_logger = Logger(logdirs['indiv'][model_num])
        elif model_type == 'm':
            for indiv_num in range(n_indiv):
                model.multi_loggers[indiv_num] = Logger(
                    logdirs['multi'][model_num][indiv_num])

        model_type_str = 'indiv' if model_type == 'i' else 'multi'
        print("{} task DQN {} created".format(model_type_str, model_num))
        print("{} task DQN {} begins learning...".format(
            model_type_str, model_num))

        model.learn(total_timesteps=5000000,
                    callback=callback,
                    tb_log_name="DQN_{}_{}".format(model_type, model_num))

        print("{} task DQN {} done learning!".format(model_type_str,
                                                     model_num))

        # TODO the following block isn't used
        if model_type == 'i':
            indiv_models.append(model)
        else:
            multi_models.append(model)
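
A note on the assertion fixed above: the expression model_type == 'i' or 'm' parses as (model_type == 'i') or 'm', and the non-empty string 'm' is truthy on its own, so the original check could never fail. A minimal demonstration:

model_type = 'bogus'
assert model_type == 'i' or 'm'    # passes silently: the 'm' operand is always truthy
assert model_type in ('i', 'm')    # raises AssertionError for 'bogus', as intended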
Code example #2
File: orgcoderServer.py  Project: floydScript/spider
class OrgcoderServer():

    _logger = Logger().getLogger()

    def __init__(self):
        config = Conf.config
        # Initialize the database connection
        self.db = pymysql.connect(host=config['mysql']['host'],
                                  port=config['mysql']['port'],
                                  user=config['mysql']['username'],
                                  passwd=config['mysql']['password'],
                                  db=config['mysql']['dbname_org'])
        self.cursor = self.db.cursor()

    def query_orgcode(self, organization_name):
        """
        Look up the publisher code in MySQL.
        :param organization_name: publisher name
        :return:
        """
        # Parameterized query instead of string concatenation (avoids SQL injection)
        sql = 'select organization_code from 09_org_collect_sys where organization_name = %s'
        self.cursor.execute(sql, (organization_name,))
        result = self.cursor.fetchone()
        if not result:
            self._logger.info('No such publisher========>publisher:' + organization_name)
            return ''
        self._logger.info('Publisher lookup========>publisher:' + organization_name +
                          '  orgcode: ' + result[0])
        return result[0]
Code example #3
File: kafkaConsumer.py  Project: floydScript/spider
class SaveImgPipelines():
    _logger = Logger().getLogger()

    def __init__(self):
        self.config = Conf.config

    def process_item(self, item):

        if item['_entitycode'] == 'web_page_p_book_info_09':
            if item['is_set'] == '是':  # '是' means "yes": the record already exists
                return item
            # Build the image path: /opt/fhcb/fileserver/img + /book/20180909/2993702.jpg
            img_path = self.config['image']['path'] + item['coverpath']
            # Create the directory: /opt/fhcb/fileserver/img + /book/ + 20180909/
            dir_path = self.config['image'][
                'path'] + '/book/' + datetime.datetime.now().strftime(
                    '%Y%m%d') + '/'
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            # Download the image from coverurl
            with open(img_path, 'wb') as f:
                resp = requests.get(item['coverurl'])
                f.write(resp.content)
                self._logger.info('Downloaded image to: ' + item['coverpath'])
        return item
Code example #4
File: pipelines.py  Project: floydScript/spider
class SaveImgPipelines():
    _logger = Logger().getLogger()

    def __init__(self):
        self.config = Conf.config
        self.es = Elasticsearch(self.config['elasticsearch']['hosts'])

    def process_item(self, item):
        """
        Save the cover image.
        :param item:
        :return:
        """
        if item['_entitycode'] == 'web_page_p_book_info_09':
            if item['is_set'] == '是':  # '是' means "yes": the record already exists
                return item
            # Query ES; if this isbn is already indexed, skip saving the image
            body = {
                "query": {
                    "term": {
                        "isbn": item['isbn']
                    }
                }
            }
            result = self.es.search(index="web_page_p_book_info_09", doc_type="web_page_p_book_info_09", body=body)
            if result['hits']['hits']:
                self._logger.info('Duplicate image, skipping download  ISBN:' + item['isbn'])
                return item
            # Build the image path: /opt/fhcb/fileserver/img + /book/20180909/2993702.jpg
            img_path = self.config['image']['path'] + item['coverpath']
            # Derive the directory from the image path
            filename = img_path.split('/')[-1]
            dir_path = img_path.replace(filename, '')
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            # Download the image from coverurl
            with open(img_path, 'wb') as f:
                resp = requests.get(item['coverurl'])
                f.write(resp.content)
                self._logger.info('Downloaded image to: ' + item['coverpath'])
        return item
Code example #5
File: deletePhoto.py  Project: floydScript/spider
    if len(terms) > 0:
        for t in terms:
            p = '/book/' + dirname + '/' + filename
            if p in t['_source']['coverpath']:
                # the image exists in ES and the date matches
                return True, 'image exists'
        # found in ES, but under a different date
        # (checked after the loop so every hit is considered)
        return False, 'reason: found in ES, but the date does not match'
    else:
        # the image is not in ES at all
        return False, 'reason: not found in ES'


config = Conf.config
_logger = Logger().getLogger()

es = Elasticsearch(config['elasticsearch']['hosts'])

directory = config['image']['path'] + '/book'
os.chdir(directory)
cwd = os.getcwd()
# List all folders under book
dirs = os.listdir(cwd)
count = 0
# Iterate over the folders
for dir_name in dirs:  # renamed from 'dir' to avoid shadowing the builtin
    path = directory + '/' + dir_name
    if os.path.isfile(path):
        continue
    # Enter the current folder
Code example #6
File: kafkaConsumer.py  Project: floydScript/spider
import json
import os

import pymysql
import requests
from kafka import KafkaConsumer
from categoryServer import CategoryServer
from conf import Conf
from mylogger import Logger
from orgcoderServer import OrgcoderServer

config = Conf.config
consumer = KafkaConsumer('09_p_spider',
                         bootstrap_servers=[config['kafka']['host']])
# consumer = KafkaConsumer('test',bootstrap_servers=[hosts])
_logger = Logger().getLogger()


class MySqlPipelines(object):
    _logger = Logger().getLogger()

    def __init__(self):
        config = Conf.config
        self.db = pymysql.connect(host=config['mysql']['host'],
                                  port=config['mysql']['port'],
                                  user=config['mysql']['username'],
                                  passwd=config['mysql']['password'],
                                  db=config['mysql']['dbname'],
                                  charset='utf8')
        self.cursor = self.db.cursor()
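
The excerpt ends once the consumer and the MySQL pipeline class are set up. A minimal consumption loop (a sketch; it assumes each Kafka message body is a JSON-encoded item, as in the rest of this project) could look like:

pipeline = MySqlPipelines()
for msg in consumer:                 # KafkaConsumer is iterable and blocks waiting for records
    item = json.loads(msg.value)     # msg.value holds the raw message bytes
    pipeline.process_item(item)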
Code example #7
                        default=0.5,
                        help='Downscaling factor of the images')
    parser.add_argument(
        '-v',
        '--validation',
        dest='val',
        type=float,
        default=10.0,
        help='Percent of the data that is used as validation (0-100)')

    return parser.parse_args()


if __name__ == '__main__':

    log = Logger("log_test.txt")

    args = get_args()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    seed = 1
    torch.manual_seed(seed)
    if device.type == "cuda":  # torch.device never compares equal to a plain string
        torch.cuda.manual_seed(seed)

    log.logger.info(f'Using device {device}')

    net_pretrained = models.densenet201(pretrained=True)

    net = densenet.densenet201(num_classes=1)

    net_dict = net.state_dict()
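
The snippet stops right after net_dict is captured. The usual follow-up (a sketch, assuming the custom densenet.densenet201 reuses torchvision's layer names) is to copy over every pretrained weight whose name and shape match, then load the merged dict:

    pretrained_dict = net_pretrained.state_dict()
    # keep only the weights that exist in the custom model with the same shape
    pretrained_dict = {k: v for k, v in pretrained_dict.items()
                       if k in net_dict and v.shape == net_dict[k].shape}
    net_dict.update(pretrained_dict)
    net.load_state_dict(net_dict)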
Code example #8
File: pipelines.py  Project: floydScript/spider
class ElasticSearchPipelines():
    _logger = Logger().getLogger()

    info_mapping = {
        "mappings": {
            "web_page_p_book_info_09": {
                "properties": {
                    "_entitycode": {"type": "string"},
                    "_row": {"type": "string"},
                    "author": {"type": "string", "index": "not_analyzed"},
                    "authorintro": {"type": "string"},
                    "bookname": {"type": "string"},
                    "catalog": {"type": "string"},
                    "category": {"type": "string"},
                    "collectiontime": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||yyyy-MM"},
                    "commentcount": {"type": "long"},
                    "commentcount_jd": {"type": "long"},
                    "commentcount_ymx": {"type": "long"},
                    "commentpercent": {"type": "float"},
                    "commenttag": {"type": "string"},
                    "contentsummary": {"type": "string"},
                    "contenttype": {"type": "string"},
                    "coverpath": {"type": "string"},
                    "coverurl": {"type": "string"},
                    "edition": {"type": "string"},
                    "editorsugest": {"type": "string"},
                    "epilogue": {"type": "string"},
                    "format": {"type": "string"},
                    "ifimport": {"type": "string"},
                    "impression": {"type": "string"},
                    "isbn": {"type": "string"},
                    "issuearea": {"type": "string"},
                    "language": {"type": "string"},
                    "orgcategory": {"type": "string"},
                    "orgcode": {"type": "string"},
                    "orgisbn": {"type": "string"},
                    "orgpublisher": {"type": "string", "index": "not_analyzed"},
                    "packing": {"type": "string"},
                    "pages": {"type": "integer"},
                    "papermeter": {"type": "string"},
                    "preface": {"type": "string"},
                    "price": {"type": "float"},
                    "printedtime": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||yyyy-MM"},
                    "publishdate": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||yyyy-MM"},
                    "publisher": {"type": "string", "index": "not_analyzed"},
                    "row": {"type": "string"},
                    "salecategory": {"type": "string"},
                    "seriename": {"type": "string"},
                    "skuid": {"type": "string"},
                    "sourceprice": {"type": "float"},
                    "sourceprice_jd": {"type": "float"},
                    "sourceprice_ymx": {"type": "float"},
                    "sourcetype": {"type": "string"},
                    "subhead": {"type": "string"},
                    "summary": {"type": "string"},
                    "translator": {"type": "string"},
                    "type": {"type": "string"},
                    "url": {"type": "string"},
                    "usersugest": {"type": "string"},
                    "words": {"type": "integer"}
                }
            }
        }
    }
    comment_mapping = {
        "mappings": {
            "web_page_p_book_comment_09": {
                "properties": {
                    "_entitycode": {"type": "string"},
                    "_row": {"type": "string"},
                    "bookname": {"type": "string"},
                    "collectiontime": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd"},
                    "comment": {"type": "string"},
                    "commentid": {"type": "string"},
                    "commenttitle": {"type": "string"},
                    "commenttype": {"type": "string"},
                    "commpoint": {"type": "string"},
                    "followcommentid": {"type": "string"},
                    "follownum": {"type": "integer"},
                    "hitcount": {"type": "integer"},
                    "isbn": {"type": "string"},
                    "level": {"type": "integer"},
                    "opposnum": {"type": "integer"},
                    "publishtime": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd"},
                    "score": {"type": "float"},
                    "sitename": {"type": "string"},
                    "skuid": {"type": "string"},
                    "sourcetype": {"type": "string"},
                    "suportnum": {"type": "integer"},
                    "type": {"type": "string"},
                    "uri": {"type": "string"},
                    "username": {"type": "string"}
                }
            }
        }
    }

    def __init__(self):

        self.config = Conf.config
        self.orgcode_server = OrgcoderServer()
        # Establish the ES connection
        self.es = Elasticsearch(self.config['elasticsearch']['hosts'])
        # Book-info index: create it if it does not exist
        if not self.es.indices.exists(index='web_page_p_book_info_09'):
            self.es.indices.create(index='web_page_p_book_info_09', body=self.info_mapping)
        # Book-comment index: create it if it does not exist
        if not self.es.indices.exists(index='web_page_p_book_comment_09'):
            self.es.indices.create(index='web_page_p_book_comment_09', body=self.comment_mapping)

    def process_item(self, item):
        """
        Clean the data and push it to Elasticsearch.
        :param item:
        :return:
        """
        if item['_entitycode'] == 'web_page_p_book_info_09':
            # Clean the data, branching on its source
            flag = self.dd_washing_datas(item, self.orgcode_server)
            if not flag:
                return None

            id = item['isbn']
            # =================== distinguish the data source ===================#
            tag = ''
            if item['sourcetype'] == '01':  # JD.com
                tag = '_jd'
                sourceprice = item.pop('sourceprice')
                item['sourceprice_jd'] = sourceprice
                commentcount = item.pop('commentcount')
                item['commentcount_jd'] = commentcount

            elif item['sourcetype'] == '02':  # Dangdang
                tag = ''
                # Dangdang already uses the original field names; nothing to rename

            elif item['sourcetype'] == '03':  # Douban
                tag = '_db'
                sourceprice = item.pop('sourceprice')
                item['sourceprice_db'] = sourceprice
                commentcount = item.pop('commentcount')
                item['commentcount_db'] = commentcount

            elif item['sourcetype'] == '04':  # Xinhua Bookstore
                tag = '_xhsd'
                sourceprice = item.pop('sourceprice')
                item['sourceprice_xhsd'] = sourceprice
                commentcount = item.pop('commentcount')
                item['commentcount_xhsd'] = commentcount

            elif item['sourcetype'] == '05':  # Amazon
                tag = '_ymx'
                sourceprice = item.pop('sourceprice')
                item['sourceprice_ymx'] = sourceprice
                commentcount = item.pop('commentcount')
                item['commentcount_ymx'] = commentcount

            elif item['sourcetype'] == '06':  # Tmall
                tag = '_tm'
                sourceprice = item.pop('sourceprice')
                item['sourceprice_tm'] = sourceprice
                commentcount = item.pop('commentcount')
                item['commentcount_tm'] = commentcount
            # =================== distinguish the data source ===================#

            # Insert if the id is absent from the index; otherwise update and merge the data
            try:
                # Fetch the document with the same id; es.get raises if it is missing,
                # which drops us into the except branch below
                resu = self.es.get(index=item['_entitycode'], doc_type=item['_entitycode'], id=id)
                resu_item = resu['_source']
                if tag:
                    resu_item['sourcetype' + tag] = ''
                    resu_item['commentcount' + tag] = ''
                # Fill in fields that were previously empty
                for key in resu_item:
                    if not resu_item[key]:
                        try:
                            resu_item[key] = item[key]
                        except KeyError:
                            pass
                # Keep the larger of the two comment counts
                if int(resu_item['commentcount']) < int(item['commentcount']):
                    resu_item['commentcount'] = item['commentcount']
                self.es.index(index=item['_entitycode'], doc_type=item['_entitycode'], id=id, body=resu_item)
                self._logger.info('ES update succeeded: ' + resu_item['url'])
            except Exception:
                # Insert a new document
                # item['price'] = str(item['price'], encoding="utf-8")

                # Make sure the cover image exists on disk first
                path = self.config['image']['path'] + item['coverpath']
                if not os.path.exists(path):
                    self._logger.info('Image missing: ' + item['coverpath'])
                    return None
                # Write the document into ES
                self.es.index(index=item['_entitycode'], doc_type=item['_entitycode'], id=id, body=item)
                self._logger.info('ES insert succeeded: ' + item['url'])
        elif item['_entitycode'] == 'web_page_p_book_comment_09':
            if item['type'] == '02':
                body = {
                    "query": {
                        "term": {
                            "skuid": item['skuid']
                        }
                    }
                }
                result = self.es.search(index="web_page_p_book_info_09", doc_type="web_page_p_book_info_09", body=body)
                terms = result['hits']['hits']
                if not terms:
                    return None
            id = item['_row']
            self.es.index(index=item['_entitycode'], doc_type=item['_entitycode'], id=id, body=item)
            self._logger.info('ES insert succeeded: ' + item['uri'])

    def dd_washing_datas(self, item, orgcode_server):
        """
        Data cleaning:
        1. author
        2. book title
        3. drop records that have no cover image
        4. convert empty comment counts to 0
        5. look up orgcode from the publisher name
        6. reject publish dates later than the current time
        7. turn empty date strings into None
        8. strip HTML tags
        :param item: book-info entity
        :param orgcode_server: publisher-code lookup instance
        :return:
        """
        # Drop records without a cover image path
        if not item['coverpath']:
            return None

        # Clean author, e.g. '菲尔·比德尔 | 译者': keep the part before '|'
        item['author'] = item['author'].split('|')[0]

        # Clean title (1): drop everything after '(' or full-width '('
        item['bookname'] = item['bookname'].split('(')[0]
        item['bookname'] = item['bookname'].split('(')[0]

        # Clean title (2): strip the publisher, isbn, and author if they leaked into the title
        item['bookname'] = item['bookname'].replace(item['publisher'], '').replace(item['isbn'], '').replace(item['author'], '')

        # Convert an empty comment count to 0
        if not item['commentcount']:
            item['commentcount'] = 0

        # Look up orgcode from the publisher name
        pub_name = item['publisher']
        orgcode = orgcode_server.query_orgcode(pub_name)
        item['orgcode'] = orgcode

        # The publish date must not be in the future
        # (guard against an empty string before parsing, otherwise strptime raises)
        if item['publishdate']:
            publish_dt = datetime.datetime.strptime(item['publishdate'], "%Y-%m")
            if publish_dt > datetime.datetime.now():
                return None

        # Turn empty date strings into None
        if not item['printedtime']:
            item['printedtime'] = None
        if not item['publishdate']:
            item['publishdate'] = None

        # Strip HTML tags
        item['preface'] = item['preface'].replace("<br>", '')
        item['catalog'] = item['catalog'].replace("<br>", '')
        item['editorsugest'] = item['editorsugest'].replace("<br>", '')
        item['summary'] = item['summary'].replace("<br>", '')
        item['usersugest'] = item['usersugest'].replace("<br>", '')
        item['contentsummary'] = item['contentsummary'].replace("<br>", '')
        item['authorintro'] = item['authorintro'].replace("<br>", '')
        return True
Code example #9
File: main.py  Project: flyer103/douban_fang
"""
"""

import time
import json

import requests
from lxml import etree
from pymongo import MongoClient

from mylogger import Logger

log_main = Logger.get_logger(service=__name__)


class FangCrawler:

    URL_TPL = "https://www.douban.com/group/shanghaizufang/discussion?start="

    def __init__(self):
        self.configs = self._load_conf()

        self.headers = self.configs["http"]["headers"]

        mgo_config = self.configs["mongo"]
        if mgo_config.get("rs"):
            self.mgo = MongoClient(
                mgo_config["rs"]["url"],
                replicaSet=mgo_config["rs"]["name"],
                readPreference=mgo_config["rs"]["read_preference"],
            )
Code example #10
import os
import requests
from elasticsearch import Elasticsearch
from conf import Conf
from mylogger import Logger

_logger = Logger().getLogger()
config = Conf.config
es = Elasticsearch(config['elasticsearch']['hosts'])
date = '20181025'
body1 = {
    "query": {
        "term": {
            "coverpath": date
        }
    }
}
body = {
    "from": 0,
    "query": {
        "bool": {
            "must": [
                {
                    "term": {
                        "coverpath": date
                    }
                }
            ]
        }
    },
    "size": 25000
Code example #11
    return parser.parse_args()


def from_config(config):
    args = settings()
    args.batchsize = int(config.get('setting', 'batchsize'))
    args.epochs = int(config.get('setting', 'epoch'))
    args.lr = float(config.get('setting', 'lr'))
    args.loss_alpha = float(config.get('hyperparam', 'loss_alpha'))
    args.loss_beta = float(config.get('hyperparam', 'loss_beta'))
    return args


if __name__ == '__main__':

    log = Logger(logger_file)
    #args = get_args()
    args = from_config(config)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    seed = 1
    torch.manual_seed(seed)
    if device.type == "cuda":  # torch.device never compares equal to a plain string
        torch.cuda.manual_seed(seed)

    log.logger.info(f'Using device {device}')

    net_pretrained = models.densenet201(pretrained=True)
    for i_fold in range(0, 10):
        net = densenet.densenet201(num_classes=1)

        net_dict = net.state_dict()
Code example #12
        options['__groupname__'] = self.groupname
        options['__nodename__'] = self.nodename

        stats_d = {
            '__group__': self.groupname,
            '__name__': self.nodename,
            section: options,
        }
        jsonstr = json.dumps(stats_d, ensure_ascii=True)
        return jsonstr

StatsClient = AppStats

if __name__ == "__main__":

    from mylogger import Logger
    logger = Logger.getLogger('test1', None, 'DEBUG', True)
    Logger.addLoggingServer(logger, '127.0.0.1', 9900)

    logger2 = Logger.getLogger('test2', None, 'DEBUG', True)
    Logger.addLoggingServer(logger2, '127.0.0.1', 9900)

    stats = AppStats('cluster1', 'master', logger)
    stats2 = AppStats('cluster2', 'spider', logger2)

    section = 'process'
    options = {'mem': 1000, 'cpu':0.01}
    stats.log('p1', options, 'set')
    stats.log('p2', options, 'set')
    stats.log('p3', options, 'set')
    stats.log('p4', options, 'set')
    stats.log('p5', options, 'set')
Code example #13
class ParseBook(object):
    _logger = Logger().getLogger()
    def __init__(self):
        # Initialize the category server so the DB connection is created only once
        self.cat_server = CategoryServer()
        self.orgcode_server = OrgcoderServer()

        # Initialize the ES pipeline
        self.es_pipe = ElasticSearchPipelines()
        # Load the configuration
        try:
            self.config = Conf.config

            self.mysql_hosts = self.config['mysql']['host']
            self.mysql_port = self.config['mysql']['port']
            self.mysql_user = self.config['mysql']['username']
            self.mysql_password = self.config['mysql']['password']
            self.mysql_db = self.config['mysql']['dbname']
            self.mysql_charset = self.config['mysql']['charset']
            self.mysql_table = self.config['mysql']['book_table']
        except Exception:
            # Failed to read the config file
            self._logger.info("Failed to read the config file")
        # Establish the database connection
        self.conn = pymysql.connect(host=self.mysql_hosts,
                                    port=int(self.mysql_port),
                                    user=self.mysql_user,
                                    password=self.mysql_password,
                                    db=self.mysql_db,
                                    charset=self.mysql_charset,
                                    cursorclass=pymysql.cursors.DictCursor)
        self.cursor = self.conn.cursor()



    # Push the data to ES
    def parse_item(self):
        self._logger.info("Querying data to push: " + self.mysql_table)
        self.cursor.execute("""SELECT * FROM %s limit 140000,120000""" % self.mysql_table)

        result = self.cursor.fetchall()

        for row in result:
            # Parse the database row
            item = self.initItem(row)

            item['collectiontime'] = item['collectiontime'].strftime("%Y-%m-%d %H:%M:%S")
            self.es_pipe.process_item(item)

    # Release resources
    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

    # Build an item from a MySQL row
    def initItem(self, row):
        item = {}

        item['bookname'] = row['bookname']
        item['subhead'] = row['subhead']
        item['publisher'] = row['publisher']
        item['orgpublisher'] = row['orgpublisher']
        item['contentsummary'] = row['contentsummary']
        item['sourcetype'] = row['sourcetype']
        item['author'] = row['author']
        item['translator'] = row['translator']
        item['isbn'] = row['isbn']
        item['orgisbn'] = row['orgisbn']
        item['salecategory'] = row['salecategory']
        item['category'] = row['category']
        item['orgcategory'] = row['orgcategory']
        item['contenttype'] = row['contenttype']
        item['issuearea'] = row['issuearea']
        item['type'] = row['type']
        item['edition'] = row['edition']
        item['impression'] = row['impression']
        item['words'] = row['words']
        item['pages'] = row['pages']
        item['language'] = row['language']
        item['price'] = row['price']
        item['printedtime'] = row['printedtime']
        item['format'] = row['format']
        item['papermeter'] = row['papermeter']
        item['packing'] = row['packing']
        item['coverurl'] = row['coverurl']
        item['coverpath'] = row['coverpath']
        item['seriename'] = row['seriename']
        item['catalog'] = row['catalog']
        item['editorsugest'] = row['editorsugest']
        item['usersugest'] = row['usersugest']
        item['preface'] = row['preface']
        item['summary'] = row['summary']
        item['epilogue'] = row['epilogue']
        item['publishdate'] = row['publishdate']
        item['collectiontime'] = row['collectiontime']
        item['orgcode'] = row['orgcode']
        item['skuid'] = row['skuid']
        item['commentcount'] = row['commentcount']
        item['_row'] = row['_row']
        item['ifimport'] = '0'
        item['_entitycode'] = row['_entitycode']
        item['url'] = row['url']
        item['commentpercent'] = row['commentpercent']
        item['commenttag'] = row['commenttag']
        item['authorintro'] = row['authorintro']
        item['sourceprice'] = row['sourceprice']
        if not item['printedtime']:
            item['printedtime'] = None
        if not item['publishdate']:
            item['publishdate'] = None
        return item
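
Since the cursor is a DictCursor, the field-by-field copy in initItem can be collapsed. A compact alternative (a sketch, assuming the SELECT returns exactly the columns the item needs and nothing extra):

    def initItem(self, row):
        item = dict(row)                      # the DictCursor row already maps column -> value
        item['ifimport'] = '0'                # constant import flag, as in the original
        for key in ('printedtime', 'publishdate'):
            if not item[key]:                 # empty date strings become None
                item[key] = None
        return item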
Code example #14
# note: argparse's type=bool treats any non-empty string as True
parser.add_argument('--reset_param', type=bool, default=False)
parser.add_argument('--method', type=str, default='ada')
parser.add_argument('--data_dir', type=str, default='./data/')
args = parser.parse_args()

log_name = f'log/mag_{args.method}_{args.prune_set}_ratio{args.ratio}_batch_size{args.batch_size}_epochs{args.epochs}_pruneepoch{args.prune_epoch}_times{args.times}_reset{args.reset_param}.log'
logger.add(log_name)
logger.info('logname: {}'.format(log_name))
logger.info(args)

dataset = PygNodePropPredDataset(name='ogbn-mag', root=args.data_dir)
data = dataset[0]
split_idx = dataset.get_idx_split()
evaluator = Evaluator(name='ogbn-mag')
logger1 = Logger(args.runs, args)
# We do not consider those attributes for now.
data.node_year_dict = None
data.edge_reltype_dict = None

# print(data)

edge_index_dict = data.edge_index_dict

# We need to add reverse edges to the heterogeneous graph.
r, c = edge_index_dict[('author', 'affiliated_with', 'institution')]
edge_index_dict[('institution', 'to', 'author')] = torch.stack([c, r])

r, c = edge_index_dict[('author', 'writes', 'paper')]
edge_index_dict[('paper', 'to', 'author')] = torch.stack([c, r])
Code example #15
File: kafkaConsumer.py  Project: floydScript/spider
class MySqlPipelines(object):
    _logger = Logger().getLogger()

    def __init__(self):
        config = Conf.config
        self.db = pymysql.connect(host=config['mysql']['host'],
                                  port=config['mysql']['port'],
                                  user=config['mysql']['username'],
                                  passwd=config['mysql']['password'],
                                  db=config['mysql']['dbname'],
                                  charset='utf8')
        self.cursor = self.db.cursor()

    def process_item(self, item):

        if item['_entitycode'] == 'web_page_p_book_info_09':
            if item['is_set'] == '是':  # '是' means "yes": the record already exists
                return item
            # Instantiate the lookup servers
            cate_server = CategoryServer()
            orgcode_server = OrgcoderServer()
            # Sales-category lookup: use the last entry of the comma-separated contenttype
            # (split() always returns at least [''], so no empty-list guard is needed)
            contenttype = item['contenttype'].split(',')[-1]
            salecategory = cate_server.query_sale_category(contenttype)
            item['salecategory'] = salecategory
            # CLC (Chinese Library Classification) lookup by isbn
            isbn = item['isbn']
            cate_code = cate_server.query_cate_server(isbn)
            item['category'] = cate_code
            item['orgcategory'] = cate_code

            # Look up orgcode from the publisher name
            pub_name = item['publisher']
            orgcode = orgcode_server.query_orgcode(pub_name)
            item['orgcode'] = orgcode

            sql = '''insert into web_page_p_book_info_09_dangdang(bookname, subhead, publisher, orgpublisher, contentsummary, sourcetype,
                    author, translator, isbn, orgisbn, salecategory, category, orgcategory, contenttype, issuearea, type, edition, impression,
                    words, pages, language, price, printedtime, format, papermeter, packing, coverurl, coverpath, seriename, catalog, 
                    editorsugest, usersugest, preface, summary, epilogue, publishdate, collectiontime, orgcode, skuid, commentcount, _row,
                    ifimport, _entitycode, url, commentpercent, commenttag,authorintro,sourceprice)
                    values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                    %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'''
            parames = (item['bookname'], item['subhead'], item['publisher'],
                       item['orgpublisher'], item['contentsummary'],
                       item['sourcetype'], item['author'], item['translator'],
                       item['isbn'], item['orgisbn'], item['salecategory'],
                       item['category'], item['orgcategory'],
                       item['contenttype'], item['issuearea'], item['type'],
                       item['edition'], item['impression'], item['words'],
                       item['pages'], item['language'], item['price'],
                       item['printedtime'], item['format'], item['papermeter'],
                       item['packing'], item['coverurl'], item['coverpath'],
                       item['seriename'], item['catalog'],
                       item['editorsugest'], item['usersugest'],
                       item['preface'], item['summary'], item['epilogue'],
                       item['publishdate'], item['collectiontime'],
                       item['orgcode'], item['skuid'], item['commentcount'],
                       item['_row'], item['ifimport'], item['_entitycode'],
                       item['url'], item['commentpercent'], item['commenttag'],
                       item['authorintro'], item['sourceprice'])
            try:
                self.cursor.execute(sql, parames)
                self.db.commit()
                self._logger.info('Inserted ===book_info=== record: ' + item['url'])
            except Exception:
                # Mark is_set as '是' (yes): the record already exists, so skip the image download
                item['is_set'] = '是'
                self._logger.info('Duplicate record, ===book_info=== insert failed: ' + item['url'])
        elif item['_entitycode'] == 'web_page_p_book_comment_09':
            sql = '''insert into web_page_p_book_comment_09_dangdang(_row, isbn, uri, bookname, sourcetype, collectiontime, publishtime, username, hitcount,
                    follownum, suportnum, opposnum, commentid, followcommentid, commenttitle, commenttype, comment, score, level, commpoint, type, sitename,
                    ifimport, _entitycode)
                    values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'''
            parames = (item['_row'], item['isbn'], item['uri'],
                       item['bookname'], item['sourcetype'],
                       item['collectiontime'], item['publishtime'],
                       item['username'], item['hitcount'], item['follownum'],
                       item['suportnum'], item['opposnum'], item['commentid'],
                       item['followcommentid'], item['commenttitle'],
                       item['commenttype'], item['comment'], item['score'],
                       item['level'], item['commpoint'], item['type'],
                       item['sitename'], '0', item['_entitycode'])
            try:
                self.cursor.execute(sql, parames)
                self.db.commit()
                # comment items carry 'uri', not 'url'
                self._logger.info('Inserted ~~~book_comment~~~ record: ' + item['uri'])
            except Exception:
                self._logger.info('Duplicate record, ~~~book_comment~~~ insert failed: ' +
                                  item['uri'])
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()
Code example #16
File: mqConsumer.py  Project: floydScript/spider
# -*-coding:utf-8-*-
import json
import stomp
import time
from conf import Conf
from mylogger import Logger
from pipelines import ElasticSearchPipelines, MySqlPipelines, SaveImgPipelines

config = Conf.config

_logger = Logger().getLogger()


# Message listener
class BookListener(object):
    mysql_pipe = MySqlPipelines()
    image_pipe = SaveImgPipelines()
    es_pipe = ElasticSearchPipelines()

    def on_message(self, headers, message):
        """
        Handle a received message.
        :param headers:
        :param message:
        :return:
        """
        item = json.loads(message)
        # _logger.info('Received message: ' + item['isbn'])
        # Push the data to MySQL
        self.mysql_pipe.process_item(item)
        # Download the image
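
The excerpt cuts off inside on_message, and the STOMP connection itself is not shown. A sketch of the usual stomp.py wiring (the broker address keys and the queue name are assumptions, not taken from this project's config):

conn = stomp.Connection([(config['mq']['host'], int(config['mq']['port']))])  # hypothetical config keys
conn.set_listener('book', BookListener())
conn.connect(wait=True)
conn.subscribe(destination='/queue/09_p_spider', id='1', ack='auto')  # hypothetical queue name
while True:
    time.sleep(60)  # keep the main thread alive; messages arrive on stomp's receiver thread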
Code example #17
def main():
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (GNN)')
    # parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=201)
    parser.add_argument('--runs', type=int, default=1)
    parser.add_argument('--prune_set', type=str, default='train')
    parser.add_argument('--ratio', type=float, default=0.95)
    parser.add_argument('--times', type=int, default=20)
    parser.add_argument('--prune_epoch', type=int, default=301)
    # note: argparse's type=bool treats any non-empty string as True
    parser.add_argument('--reset_param', type=bool, default=False)
    parser.add_argument('--naive', type=bool, default=False)
    parser.add_argument('--data_dir', type=str, default='./data/')
    args = parser.parse_args()

    log_name = f'log/arxivtest_{args.prune_set}_{args.ratio}_{args.epochs}_{args.prune_epoch}_{args.times}.log'
    logger.add(log_name)
    logger.info('logname: {}'.format(log_name))
    logger.info(args)

    # device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    # device = torch.device(device)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    dataset = PygNodePropPredDataset(name='ogbn-arxiv',
                                     root=args.data_dir,
                                     transform=T.ToSparseTensor())

    data = dataset[0]
    data.adj_t = data.adj_t.to_symmetric()
    data = data.to(device)

    split_idx = dataset.get_idx_split()
    train_idx = split_idx['train'].to(device)

    if args.use_sage:
        model = SAGE(data.num_features, args.hidden_channels,
                     dataset.num_classes, args.num_layers,
                     args.dropout).to(device)
    else:
        model = GCN(data.num_features, args.hidden_channels,
                    dataset.num_classes, args.num_layers,
                    args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-arxiv')
    logger1 = Logger(args.runs, args)
    row, col, val = data.adj_t.coo()
    N = int(row.max() + 1)
    # add self-loops; use .to(device) so this also works on CPU
    row = torch.cat([torch.arange(0, N).to(device), row], dim=0)
    col = torch.cat([torch.arange(0, N).to(device), col], dim=0)
    edge_index = torch.cat([row, col]).view(2, -1)
    data.edge_index = edge_index
    # print(data.edge_index)
    pruner = Pruner(edge_index.cpu(),
                    split_idx,
                    prune_set=args.prune_set,
                    ratio=args.ratio)
    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, data, train_idx, optimizer)
            result = test(model, data, split_idx, evaluator)
            logger1.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                logger.info(
                    f'Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}, Train: {100 * train_acc:.2f}%, Valid: {100 * valid_acc:.2f}% Test: {100 * test_acc:.2f}%'
                )

        logger1.print_statistics(ratio=1)
        logger1.flush()
        for i in range(1, args.times + 1):
            pruner.prune(naive=args.naive)
            if args.reset_param:
                model.reset_parameters()
            for epoch in range(1, 1 + args.prune_epoch):
                loss = train(model, data, train_idx, optimizer, pruner=pruner)
                result = test(model, data, split_idx, evaluator, pruner=pruner)
                logger1.add_result(run, result)
                if epoch % args.log_steps == 0:
                    train_acc, valid_acc, test_acc = result
                    logger.info(
                        f'Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}, Train: {100 * train_acc:.2f}%, Valid: {100 * valid_acc:.2f}% Test: {100 * test_acc:.2f}%'
                    )

            logger1.print_statistics(ratio=args.ratio**i)
            logger1.flush()
Code example #18
class CategoryServer():
    # Authorization URL for the NLC (National Library of China) OPAC site
    auth_url = ''
    _logger = Logger().getLogger()
    proxies = {
        "http": ''  # proxy IP
    }

    def __init__(self):
        # self.reload_authURL()
        config = Conf.config
        # Initialize the database connections
        self.db = pymysql.connect(host=config['mysql']['host'],
                                  port=config['mysql']['port'],
                                  user=config['mysql']['username'],
                                  passwd=config['mysql']['password'],
                                  db=config['mysql']['dbname_cate'])
        self.cursor = self.db.cursor()
        self.db_sale = pymysql.connect(host=config['mysql']['host'],
                                       port=config['mysql']['port'],
                                       user=config['mysql']['username'],
                                       passwd=config['mysql']['password'],
                                       db=config['mysql']['dbname_sale_cate'])
        self.cursor_sale = self.db_sale.cursor()

    def reload_authURL(self):
        """
        Refresh the authorization URL for the NLC OPAC site.
        :return:
        """
        try:
            proxy_ip = requests.get(
                'http://api.ip.data5u.com/dynamic/get.html?order=f6d9a18f02f520f2aaac6b249fd8689e'
            ).content.decode().strip()
            self.proxies['http'] = proxy_ip
            url = 'http://opac.nlc.cn/F?RN=989462048'
            response = requests.get(url, timeout=20, proxies=self.proxies)
            html = response.text
            self.auth_url = re.findall('tmp="([^"]+)"', html)[0]
        except Exception:
            self._logger.error('Error while refreshing the NLC authorization URL')
            self.auth_url = 'http://opac.nlc.cn:80/F/IYKXX91A5NCBPEQP1DQHLF471L8ANIEHXUMSUTI2HLRRXI77MF-10964'

    def query_cate_server(self, isbn):
        """
        CLC lookup entry point: check the book_isbn_cate table first and return on a hit;
        otherwise query the NLC site and cache the resulting CLC category back into MySQL.
        :param isbn:
        :return:
        """
        # First check whether MySQL already has this isbn
        cate_code = self.query_cate_mysql(isbn)
        if cate_code:
            return cate_code
        # Refresh the authorization URL
        try:
            self.reload_authURL()
        except Exception as e:
            self._logger.error(e)
        url = self.auth_url + '?func=find-b&find_code=ISB&request=%s&local_base=NLC01&filter_code_1=WLN&filter_request_1=&filter_code_2=WYR&filter_request_2=&filter_code_3=WYR&filter_request_3=&filter_code_4=WFM&filter_request_4=&filter_code_5=WSL&filter_request_5=' % isbn
        try:
            # Fetch the NLC page for this isbn and parse it
            response = requests.get(url, timeout=10, proxies=self.proxies)
            html = response.text
        except Exception as e:
            self._logger.error(e)
            html = ''
        cate_code = re.findall('CALL-NO:\s*?([^\r\n]*)', html)
        if not cate_code:
            self._logger.info('NLC server lookup found no such isbn: ' + isbn)
            return ''
        cate_code = cate_code[0].strip()
        if not cate_code:
            self._logger.info('NLC server lookup found no such isbn: ' + isbn)
            return ''
        self._logger.info('NLC server lookup========>isbn: ' + isbn + '  category: ' +
                          cate_code)
        # Cache the new CLC category in the database
        self.insert_cate_mysql(isbn, cate_code)
        return cate_code

    def query_cate_mysql(self, isbn):
        """
        Look up the CLC category in MySQL.
        :param isbn:
        :return:
        """
        # Parameterized query (avoids SQL injection)
        sql = 'select category from book_isbn_cate where isbn = %s'
        self.cursor.execute(sql, (isbn,))
        result = self.cursor.fetchone()
        if not result:
            self._logger.info('No such isbn in the CLC table: ' + isbn + ', falling back to the NLC server')
            return None
        self._logger.info('CLC database lookup========>isbn: ' + isbn + '  category: ' +
                          result[0])
        return result[0]

    def insert_cate_mysql(self, isbn, cate_code):
        """
        Insert a CLC category into the database.
        :param isbn:
        :param cate_code: CLC category code
        :return:
        """
        sql = 'insert into book_isbn_cate(isbn,category,savetime) values(%s,%s,%s)'

        now = datetime.datetime.now()
        params = (isbn, cate_code, now)
        self.cursor.execute(sql, params)
        self.db.commit()

    def query_sale_category(self, salecategory_name):
        """
        Look up the sales category in MySQL.
        :param salecategory_name:
        :return:
        """
        # Parameterized LIKE query (avoids SQL injection)
        sql = 'select id from book_category_cate where name like %s'
        self.cursor_sale.execute(sql, ('%' + salecategory_name + '%',))
        result = self.cursor_sale.fetchone()
        if not result:
            self._logger.info('No such sales category========>salecategory_name:' +
                              salecategory_name)
            return ''
        self._logger.info('Sales-category lookup========>salecategory_name:' +
                          salecategory_name + '  ID: ' + result[0])
        return result[0]
Code example #19
File: demo.py  Project: ForRebuttal/miccai2020_400
    parser = argparse.ArgumentParser(
        description='Train the UNet on images and target masks',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-f',
                        '--filename',
                        dest='fname',
                        type=str,
                        default="demo1.jpg",
                        help='The file in ./demo/demo_img/')

    return parser.parse_args()


if __name__ == '__main__':

    log = Logger("log_demo.txt")

    args = get_args()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    seed = 1
    torch.manual_seed(seed)
    if device.type == "cuda":  # torch.device never compares equal to a plain string
        torch.cuda.manual_seed(seed)

    log.logger.info(f'Using device {device}')

    log.logger.info('Start initializing model...')

    net_pretrained = models.densenet201(pretrained=True)
    net = densenet.densenet201(num_classes=1)
Code example #20
            nodename = l[3]
            section = l[4]
            if groupname not in ret:
                ret[groupname] = {}
            if nodename not in ret[groupname]:
                ret[groupname][nodename] = []
            if section not in ret[groupname][nodename]:
                ret[groupname][nodename].append(section)
        return ret

StatsServer = RedisStats

if __name__ == "__main__":

    from mylogger import Logger
    logger = Logger.getLogger('debug', None, 'DEBUG', True)
    Logger.addLoggingServer(logger, '127.0.0.1', 9900)

    from stats_client import AppStats
    stats = AppStats('cluster1', 'selector', logger)
    stats2 = AppStats('cluster2', 'master', logger)
    stats3 = AppStats('cluster3', 'parser', logger)

    import redis
    redis_ = redis.Redis()
    redisStats = RedisStats('test', redis_, logger)

    section = 'process'
    options = {'mem': 1000, 'cpu':0.01}

    stats_str = stats.stats_encode('p1', options, 'set')
Code example #21
File: pipelines.py  Project: floydScript/spider
class MySqlPipelines(object):

    _logger = Logger().getLogger()

    def __init__(self):
        config = Conf.config
        # Create the database connection
        self.db = pymysql.connect(host=config['mysql']['host'], port=config['mysql']['port'],
                                  user=config['mysql']['username'],
                                  passwd=config['mysql']['password'], db=config['mysql']['dbname'], charset='utf8')
        self.cursor = self.db.cursor()
        # Instantiate the category server
        self.cate_server = CategoryServer()


    def process_item(self, item):
        """
        Look up the sales category, CLC category, and publisher code, then push the data to MySQL.
        :param item: book-info entity
        :return: item
        """
        if item['_entitycode'] == 'web_page_p_book_info_09':
            if item['is_set'] == '是':  # '是' means "yes": the record already exists
                return item

            # Sales-category lookup: use the last entry of the comma-separated contenttype
            # (split() always returns at least [''], so no empty-list guard is needed)
            contenttype = item['contenttype'].split(',')[-1]
            salecategory = self.cate_server.query_sale_category(contenttype)
            item['salecategory'] = salecategory
            # CLC (Chinese Library Classification) lookup
            isbn = item['isbn']
            cate_code = self.cate_server.query_cate_server(isbn)
            item['category'] = cate_code
            item['orgcategory'] = cate_code
            if item['sourcetype'] == '01':
                table = 'web_page_p_book_info_09_jingdong'
            elif item['sourcetype'] == '02':
                table = 'web_page_p_book_info_09_dangdang'
            elif item['sourcetype'] == '03':
                table = 'web_page_p_book_info_09_douban'
            elif item['sourcetype'] == '04':
                table = 'web_page_p_book_info_09_xinhuashudian'
            elif item['sourcetype'] == '05':
                table = 'web_page_p_book_info_09_yamaxun'
            elif item['sourcetype'] == '06':
                table = 'web_page_p_book_info_09_tianmao'
            else:
                # Unknown source type: 'table' would otherwise be unbound below
                return item
            sql = ('insert into ' + table + '('
                   'bookname, subhead, publisher, orgpublisher, contentsummary, sourcetype, '
                   'author, translator, isbn, orgisbn, salecategory, category, orgcategory, '
                   'contenttype, issuearea, type, edition, impression, words, pages, language, '
                   'price, printedtime, format, papermeter, packing, coverurl, coverpath, '
                   'seriename, catalog, editorsugest, usersugest, preface, summary, epilogue, '
                   'publishdate, collectiontime, orgcode, skuid, commentcount, _row, ifimport, '
                   '_entitycode, url, commentpercent, commenttag, authorintro, sourceprice) '
                   'values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, '
                   '%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, '
                   '%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)')
            parames = (
                item['bookname'], item['subhead'], item['publisher'], item['orgpublisher'], item['contentsummary'],
                item['sourcetype'],
                item['author'], item['translator'], item['isbn'], item['orgisbn'], item['salecategory'],
                item['category'], item['orgcategory'],
                item['contenttype'], item['issuearea'], item['type'], item['edition'], item['impression'],
                item['words'], item['pages'], item['language'],
                item['price'], item['printedtime'], item['format'], item['papermeter'], item['packing'],
                item['coverurl'], item['coverpath'],
                item['seriename'], item['catalog'], item['editorsugest'], item['usersugest'], item['preface'],
                item['summary'], item['epilogue'],
                item['publishdate'], item['collectiontime'], item['orgcode'], item['skuid'], item['commentcount'],
                item['_row'], item['ifimport'],
                item['_entitycode'], item['url'], item['commentpercent'], item['commenttag'], item['authorintro'],
                item['sourceprice']
            )
            try:
                self.cursor.execute(sql, parames)
                self.db.commit()
                self._logger.info('MySQL ===book_info=== insert succeeded: ' + item['url'])
            except Exception:
                item['is_set'] = '是'
                self._logger.info('MySQL ===book_info=== insert failed: ' + item['url'])
        elif item['_entitycode'] == 'web_page_p_book_comment_09':
            if item['sourcetype'] == '01':
                table = 'web_page_p_book_comment_09_jingdong'
            elif item['sourcetype'] == '02':
                table = 'web_page_p_book_comment_09_dangdang'
            elif item['sourcetype'] == '03':
                table = 'web_page_p_book_comment_09_douban'
            elif item['sourcetype'] == '04':
                table = 'web_page_p_book_comment_09_xinhuashudian'
            elif item['sourcetype'] == '05':
                table = 'web_page_p_book_comment_09_yamaxun'
            elif item['sourcetype'] == '06':
                table = 'web_page_p_book_comment_09_tianmao'
            else:
                # Unknown source type: 'table' would otherwise be unbound below
                return item

            sql = ('insert into ' + table + '('
                   '_row, isbn, uri, bookname, sourcetype, collectiontime, publishtime, username, '
                   'hitcount, follownum, suportnum, opposnum, commentid, followcommentid, '
                   'commenttitle, commenttype, comment, score, level, commpoint, type, sitename, '
                   'ifimport, _entitycode) '
                   'values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, '
                   '%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)')
            parames = (
                item['_row'], item['isbn'], item['uri'], item['bookname'], item['sourcetype'], item['collectiontime'],
                item['publishtime'], item['username'],
                item['hitcount'], item['follownum'], item['suportnum'], item['opposnum'], item['commentid'],
                item['followcommentid'], item['commenttitle'],
                item['commenttype'], item['comment'], item['score'], item['level'], item['commpoint'], item['type'],
                item['sitename'], '0', item['_entitycode']
            )
            try:
                self.cursor.execute(sql, parames)
                self.db.commit()
                self._logger.info('MySQL ~~~book_comment~~~ insert succeeded: ' + item['_row'])
            except Exception as e:
                self._logger.error(e)
                self._logger.info('MySQL ~~~book_comment~~~ insert failed: ' + item['_row'])
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()
Code example #22
File: auto_deploy.py  Project: flyer103/autodeploy
"""github 自动部署.
"""

import os
import sys
import subprocess

from flask import Flask, request

sys.path.append(os.path.join(os.path.abspath(__file__).rsplit('/', 1)[0], 'logger'))

from configs import get_configs
from mylogger import Logger

log_main = Logger.get_logger(__file__)


app = Flask(__name__)

configs_sys = get_configs()         # system configuration


# Route
@app.route('/deploy/<project>', methods=['POST'])
def deploy(project=None):
    if project.upper() not in configs_sys['GIT']:
        log_main.critical('No such project: {0}'.format(project))
        # return an error response instead of sys.exit(), which would take down the whole server
        return 'No such project: {0}'.format(project), 404

    html_url = request.json['repository']['html_url']
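
The route is cut off right after html_url is read. A plausible continuation (a sketch: the 'path' config key and the git-pull deployment step are assumptions, not taken from the project):

    repo_path = configs_sys['GIT'][project.upper()]['path']  # hypothetical config key
    log_main.info('Deploying {0} from {1}'.format(project, html_url))
    subprocess.check_call(['git', 'pull'], cwd=repo_path)    # refresh the local checkout
    return 'OK'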
Code example #23
import os
import re

import requests
from elasticsearch import Elasticsearch

from mylogger import Logger

_logger = Logger().getLogger()
es = Elasticsearch('10.13.11.21:9200')
index = 0
try:
    with open('zlog.text', 'r', encoding='utf-8') as file:
        lists = file.readlines()
        for l in lists:
            # the log lines contain paths like '…个文件:<date>/<row>.jpg';
            # pull out the date folder and the row id
            date = re.findall("个文件:(\d+)/\d+.jpg", l)
            _row = re.findall("个文件:\d+/(\d+).jpg", l)
            if _row:
                _row = _row[0]
                date = date[0]
                body = {"query": {"term": {"_row": _row}}}
                result = es.search(index="web_page_p_book_info_09",
                                   doc_type="web_page_p_book_info_09",
                                   body=body)
                terms = result['hits']['hits']
                if terms:
                    term = terms[0]
                    coverurl = term['_source']['coverurl']
                    coverpath = term['_source']['coverpath']
                    if date in coverpath:
                        img_path = '/mount/fhcb/fileserver/img' + coverpath
Code example #24
class ParseComment(object):
    _logger = Logger().getLogger()  # fixed: was Logger.getLogger(Logger())

    def __init__(self):
        # Load the configuration
        config = Conf.config

        # Initialize the ES pipeline
        self.es_pipe = ElasticSearchPipelines()

        self.mysql_hosts = config['mysql']['host']
        self.mysql_port = config['mysql']['port']
        self.mysql_user = config['mysql']['username']
        self.mysql_password = config['mysql']['password']
        self.mysql_db = config['mysql']['dbname']
        self.mysql_charset = config['mysql']['charset']
        self.mysql_table = config['mysql']['comment_table']

        # Establish the database connection
        self.conn = pymysql.connect(host=self.mysql_hosts,
                                    port=int(self.mysql_port),
                                    user=self.mysql_user,
                                    password=self.mysql_password,
                                    db=self.mysql_db,
                                    charset=self.mysql_charset,
                                    cursorclass=pymysql.cursors.DictCursor)
        self.cursor = self.conn.cursor()




    # Push the data to ES
    def parse_item(self):
        self._logger.info("Querying data to push: " + self.mysql_table)
        self.cursor.execute("""SELECT * FROM %s limit 0,100000""" % self.mysql_table)

        result = self.cursor.fetchall()

        for row in result:
            # Parse the database row
            item = self.initItem(row)
            self.es_pipe.process_item(item)

    # Release resources
    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

    # Build an item from a MySQL row
    def initItem(self, row):

        item = {}
        item['isbn'] = row['isbn']
        item['uri'] = row['uri']
        skuid = row['uri'].split('/')[7]
        item['bookname'] = row['bookname']
        item['sourcetype'] = row['sourcetype']
        item['collectiontime'] = row['collectiontime']
        item['publishtime'] = row['publishtime']
        item['username'] = row['username']
        # Default int-typed fields to '0' when empty
        hitcount = row['hitcount']
        if not hitcount:
            hitcount = '0'
        item['hitcount'] = hitcount
        follownum = row['follownum']
        if not follownum:
            follownum = '0'
        item['follownum'] = follownum
        suportnum = row['suportnum']
        if not suportnum:
            suportnum = '0'
        item['suportnum'] = suportnum
        opposnum = row['opposnum']
        if not opposnum:
            opposnum = '0'
        item['opposnum'] = opposnum

        item['commentid'] = row['commentid']
        item['followcommentid'] = row['followcommentid']
        item['commenttitle'] = row['commenttitle']
        item['commenttype'] = row['commenttype']
        item['comment'] = row['comment']
        score = row['score']
        if not score:
            score = '5'
        item['score'] = score
        level = row['level']
        if not level:
            level = '0'
        item['level'] = level

        item['commpoint'] = row['commpoint']
        item['type'] = row['type']
        item['sitename'] = row['sitename']
        item['_entitycode'] = row['_entitycode']
        item['_row'] = row['_row']
        item['skuid'] = skuid
        return item
Code example #25
import configparser
import datetime
import os
import platform

from TimerTask import Task
from email_dict import to_send_email
from email_send import EmailSend
from loading import Loading
from mylogger import Logger
from warning_main import WarningPlay
from warnstone import stoneobject

if __name__ == '__main__':
    # Logger instance
    logname = "生日预警日志"  # "birthday warning log"
    log = Logger(logname)
    logger = log.getlogger()
    # Config parser instance
    conf = configparser.ConfigParser()
    path = 'warning.conf'
    assert os.path.exists(path), "{file} does not exist".format(file=path)
    if platform.system() == 'Windows':
        conf.read(path, encoding="utf-8-sig")
    else:
        conf.read(path)
    # Database instance
    stone = stoneobject()
    # Initialize the timer task
    task = Task("08:00", logger)
    times = conf.get(section="time", option="now")
    if task.times != datetime.time(int(times.split(':')[0]), int(times.split(':')[1])):
Code example #26
File: master.py  Project: wangfengliang/xmaster
    f.protocol = MasterServer
    port = g_config.getint('master', 'port')
    g_logger.info('listenTCP %s' % port)
    reactor.listenTCP(port, f)
    reactor.run()

if __name__ == '__main__':

    if len(sys.argv) != 2:
        print('Usage: %s <config>' % sys.argv[0])
        sys.exit(1)

    # Read the config file
    config_file = sys.argv[1]
    g_config = ConfigParser.ConfigParser()
    g_config.read(config_file)

    g_master_name = g_config.get('master', 'name')
    level = g_config.get('master', 'level') if g_config.has_option('master', 'level') else "DEBUG"
    debug = g_config.getboolean('master', 'debug') if g_config.has_option('master', 'debug') else True
    logfile = g_config.get('master', 'logfile') if g_config.has_option('master', 'logfile') else None
    logname = g_config.get('master', 'logname') if g_config.has_option('master', 'logname') else None
    if not debug:
        assert logfile, 'logfile must be set when not debug mode'
    g_logger = Logger.getLogger(logname, logfile, level=level, debug=debug)
    g_redis_addr = g_config.get('master', 'redis_addr') if g_config.has_option('master', 'redis_addr') else 'localhost'
    g_redis_port = g_config.getint('master', 'redis_port') if g_config.has_option('master', 'redis_port') else 6379

    main()