def create_model_then_learn(shared_stuff, model_type, model_num, policy_type,
                            env, learning_starts=1000, prioritized_replay=False,
                            batch_size=32, verbose=0):
    global logdirs
    # "model_type == 'i' or 'm'" is always truthy; check membership instead.
    assert model_type in ('i', 'm'), "invalid model type"
    if model_type == 'm':
        batch_size = n_indiv * batch_size
    print(type(env))
    model = DQN(policy_type, env, learning_rate=1e-4,
                learning_starts=learning_starts,
                prioritized_replay=prioritized_replay, batch_size=batch_size,
                verbose=verbose, target_network_update_freq=5000,
                buffer_size=50000, shared_stuff=shared_stuff)
    model.model_type = model_type
    model.model_num = model_num
    if model_type == 'i':
        model.indiv_logger = Logger(logdirs['indiv'][model_num])
    elif model_type == 'm':
        for indiv_num in range(n_indiv):
            model.multi_loggers[indiv_num] = Logger(
                logdirs['multi'][model_num][indiv_num])

    model_type_str = 'indiv' if model_type == 'i' else 'multi'
    print("{} task DQN {} created".format(model_type_str, model_num))
    print("{} task DQN {} begins learning...".format(model_type_str, model_num))
    model.learn(total_timesteps=5000000, callback=callback,
                tb_log_name="DQN_{}_{}".format(model_type, model_num))
    print("{} task DQN {} done learning!".format(model_type_str, model_num))

    # TODO the following block isn't used
    if model_type == 'i':
        indiv_models.append(model)
    else:
        multi_models.append(model)
class OrgcoderServer():
    _logger = Logger().getLogger()

    def __init__(self):
        config = Conf.config
        # Initialize the database connection
        self.db = pymysql.connect(host=config['mysql']['host'],
                                  port=config['mysql']['port'],
                                  user=config['mysql']['username'],
                                  passwd=config['mysql']['password'],
                                  db=config['mysql']['dbname_org'])
        self.cursor = self.db.cursor()

    def query_orgcode(self, organization_name):
        """
        Look up the publisher code in MySQL.
        :param organization_name: publisher name
        :return: the organization code, or '' when not found
        """
        # Use a parameterized query instead of string concatenation (avoids SQL injection).
        sql = 'select organization_code from 09_org_collect_sys where organization_name = %s'
        self.cursor.execute(sql, (organization_name,))
        result = self.cursor.fetchone()
        if not result:
            self._logger.info('Publisher not found ========> publisher: ' + organization_name)
            return ''
        self._logger.info('Publisher lookup ========> publisher: ' + organization_name
                          + ' orgcode: ' + result[0])
        return result[0]
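
# A minimal usage sketch (an assumption about how the class is driven; the publisher
# name below is only an example value, and Conf.config must point at a database that
# contains the 09_org_collect_sys table).
if __name__ == '__main__':
    server = OrgcoderServer()
    print(server.query_orgcode('人民邮电出版社') or 'no orgcode found')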
class SaveImgPipelines():
    _logger = Logger().getLogger()

    def __init__(self):
        self.config = Conf.config

    def process_item(self, item):
        if item['_entitycode'] == 'web_page_p_book_info_09':
            if item['is_set'] == '是':
                return item
            # Build the image path: /opt/fhcb/fileserver/img + /book/20180909/2993702.jpg
            img_path = self.config['image']['path'] + item['coverpath']
            # Create the directory: /opt/fhcb/fileserver/img + /book/ + 20180909/
            dir_path = self.config['image']['path'] + '/book/' + \
                datetime.datetime.now().strftime('%Y%m%d') + '/'
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            # Download the image from coverurl
            with open(img_path, 'wb') as f:
                resp = requests.get(item['coverurl'])
                f.write(resp.content)
            self._logger.info('Downloaded image to: ' + item['coverpath'])
        return item
class SaveImgPipelines():
    _logger = Logger().getLogger()

    def __init__(self):
        self.config = Conf.config
        self.es = Elasticsearch(self.config['elasticsearch']['hosts'])

    def process_item(self, item):
        """
        Save the cover image.
        :param item:
        :return:
        """
        if item['_entitycode'] == 'web_page_p_book_info_09':
            if item['is_set'] == '是':
                return item
            # Query ES first; if this ISBN already exists, skip the download.
            body = {
                "query": {
                    "term": {
                        "isbn": item['isbn']
                    }
                }
            }
            result = self.es.search(index="web_page_p_book_info_09",
                                    doc_type="web_page_p_book_info_09",
                                    body=body)
            if result['hits']['hits']:
                self._logger.info('Duplicate image, skipping download. ISBN: ' + item['isbn'])
                return item
            # Build the image path: /opt/fhcb/fileserver/img + /book/20180909/2993702.jpg
            img_path = self.config['image']['path'] + item['coverpath']
            # Create the directory: /opt/fhcb/fileserver/img + /book/ + 20180909/
            filename = img_path.split('/')[-1]
            dir_path = img_path.replace(filename, '')
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            # Download the image from coverurl
            with open(img_path, 'wb') as f:
                resp = requests.get(item['coverurl'])
                f.write(resp.content)
            self._logger.info('Downloaded image to: ' + item['coverpath'])
        return item
    if len(terms) > 0:
        for t in terms:
            p = '/book/' + dirname + '/' + filename
            if p in t['_source']['coverpath']:
                # The image exists in ES and the date matches: return True
                return True, 'image exists'
            else:
                # The image exists in ES but the date does not match: return False
                return False, 'reason: found in ES, but the date does not match'
    else:
        # The image does not exist in ES
        return False, 'reason: not found in ES'


config = Conf.config
_logger = Logger().getLogger()
es = Elasticsearch(config['elasticsearch']['hosts'])
directory = config['image']['path'] + '/book'
os.chdir(directory)
cwd = os.getcwd()
# List all entries under book
dirs = os.listdir(cwd)
count = 0
# Walk through the directories
for dir in dirs:
    path = directory + '/' + dir
    if os.path.isfile(path):
        continue
    # Select the current folder
import json
import os

import pymysql
import requests
from kafka import KafkaConsumer

from categoryServer import CategoryServer
from conf import Conf
from mylogger import Logger
from orgcoderServer import OrgcoderServer

config = Conf.config
consumer = KafkaConsumer('09_p_spider',
                         bootstrap_servers=[config['kafka']['host']])
# consumer = KafkaConsumer('test', bootstrap_servers=[hosts])
_logger = Logger().getLogger()


class MySqlPipelines(object):
    _logger = Logger().getLogger()

    def __init__(self):
        config = Conf.config
        self.db = pymysql.connect(host=config['mysql']['host'],
                                  port=config['mysql']['port'],
                                  user=config['mysql']['username'],
                                  passwd=config['mysql']['password'],
                                  db=config['mysql']['dbname'],
                                  charset='utf8')
        self.cursor = self.db.cursor()
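
# A minimal consumption sketch (an assumption, not part of the snippet above): iterate
# over the KafkaConsumer, decode each JSON message, and hand it to the pipeline. It
# assumes MySqlPipelines exposes a process_item(item) method like the other pipelines.
if __name__ == '__main__':
    pipeline = MySqlPipelines()
    for message in consumer:  # KafkaConsumer yields ConsumerRecord objects
        try:
            item = json.loads(message.value.decode('utf-8'))
            pipeline.process_item(item)
        except Exception as e:
            _logger.error(e)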
        default=0.5,
        help='Downscaling factor of the images')
    parser.add_argument(
        '-v', '--validation', dest='val', type=float, default=10.0,
        help='Percent of the data that is used as validation (0-100)')

    return parser.parse_args()


if __name__ == '__main__':
    log = Logger("log_test.txt")
    args = get_args()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    seed = 1
    torch.manual_seed(seed)
    # Compare the device type, not the torch.device object, to the string "cuda".
    if device.type == "cuda":
        torch.cuda.manual_seed(seed)
    log.logger.info(f'Using device {device}')

    net_pretrained = models.densenet201(pretrained=True)
    net = densenet.densenet201(num_classes=1)
    net_dict = net.state_dict()
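    # A common follow-up step, shown here as a sketch (not taken from the original
    # script): copy the ImageNet-pretrained densenet201 weights into the custom
    # single-output network, skipping parameters whose shapes differ (the classifier).
    pretrained_dict = {
        k: v for k, v in net_pretrained.state_dict().items()
        if k in net_dict and v.shape == net_dict[k].shape
    }
    net_dict.update(pretrained_dict)
    net.load_state_dict(net_dict)
    net = net.to(device)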
class ElasticSearchPipelines():
    _logger = Logger().getLogger()

    # Mapping for the book-info index (legacy string/not_analyzed types).
    info_mapping = {
        "mappings": {
            "web_page_p_book_info_09": {
                "properties": {
                    "_entitycode": {"type": "string"},
                    "_row": {"type": "string"},
                    "author": {"type": "string", "index": "not_analyzed"},
                    "authorintro": {"type": "string"},
                    "bookname": {"type": "string"},
                    "catalog": {"type": "string"},
                    "category": {"type": "string"},
                    "collectiontime": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||yyyy-MM"},
                    "commentcount": {"type": "long"},
                    "commentcount_jd": {"type": "long"},
                    "commentcount_ymx": {"type": "long"},
                    "commentpercent": {"type": "float"},
                    "commenttag": {"type": "string"},
                    "contentsummary": {"type": "string"},
                    "contenttype": {"type": "string"},
                    "coverpath": {"type": "string"},
                    "coverurl": {"type": "string"},
                    "edition": {"type": "string"},
                    "editorsugest": {"type": "string"},
                    "epilogue": {"type": "string"},
                    "format": {"type": "string"},
                    "ifimport": {"type": "string"},
                    "impression": {"type": "string"},
                    "isbn": {"type": "string"},
                    "issuearea": {"type": "string"},
                    "language": {"type": "string"},
                    "orgcategory": {"type": "string"},
                    "orgcode": {"type": "string"},
                    "orgisbn": {"type": "string"},
                    "orgpublisher": {"type": "string", "index": "not_analyzed"},
                    "packing": {"type": "string"},
                    "pages": {"type": "integer"},
                    "papermeter": {"type": "string"},
                    "preface": {"type": "string"},
                    "price": {"type": "float"},
                    "printedtime": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||yyyy-MM"},
                    "publishdate": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||yyyy-MM"},
                    "publisher": {"type": "string", "index": "not_analyzed"},
                    "row": {"type": "string"},
                    "salecategory": {"type": "string"},
                    "seriename": {"type": "string"},
                    "skuid": {"type": "string"},
                    "sourceprice": {"type": "float"},
                    "sourceprice_jd": {"type": "float"},
                    "sourceprice_ymx": {"type": "float"},
                    "sourcetype": {"type": "string"},
                    "subhead": {"type": "string"},
                    "summary": {"type": "string"},
                    "translator": {"type": "string"},
                    "type": {"type": "string"},
                    "url": {"type": "string"},
                    "usersugest": {"type": "string"},
                    "words": {"type": "integer"}
                }
            }
        }
    }

    # Mapping for the book-comment index.
    comment_mapping = {
        "mappings": {
            "web_page_p_book_comment_09": {
                "properties": {
                    "_entitycode": {"type": "string"},
                    "_row": {"type": "string"},
                    "bookname": {"type": "string"},
                    "collectiontime": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd"},
                    "comment": {"type": "string"},
                    "commentid": {"type": "string"},
                    "commenttitle": {"type": "string"},
                    "commenttype": {"type": "string"},
                    "commpoint": {"type": "string"},
                    "followcommentid": {"type": "string"},
                    "follownum": {"type": "integer"},
                    "hitcount": {"type": "integer"},
                    "isbn": {"type": "string"},
                    "level": {"type": "integer"},
                    "opposnum": {"type": "integer"},
                    "publishtime": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd"},
                    "score": {"type": "float"},
                    "sitename": {"type": "string"},
                    "skuid": {"type": "string"},
                    "sourcetype": {"type": "string"},
                    "suportnum": {"type": "integer"},
                    "type": {"type": "string"},
                    "uri": {"type": "string"},
                    "username": {"type": "string"}
                }
            }
        }
    }

    def __init__(self):
        self.config = Conf.config
        self.orgcode_server = OrgcoderServer()
        # Open the ES connection
        self.es = Elasticsearch(self.config['elasticsearch']['hosts'])
        # Book-info index: create it if it does not exist yet
        if self.es.indices.exists(index='web_page_p_book_info_09') is not True:
            self.es.indices.create(index='web_page_p_book_info_09',
                                   body=self.info_mapping)
        # Book-comment index: create it if it does not exist yet
        if self.es.indices.exists(index='web_page_p_book_comment_09') is not True:
            self.es.indices.create(index='web_page_p_book_comment_09',
                                   body=self.comment_mapping)

    def process_item(self, item):
        """
        Clean the data and push it to Elasticsearch.
        :param item:
        :return:
        """
        if item['_entitycode'] == 'web_page_p_book_info_09':
            # Clean the data (per-source cleaning)
            flag = self.dd_washing_datas(item, self.orgcode_server)
            if not flag:
                return None
            id = item['isbn']

            # =================== distinguish the data source ===================#
            tag = ''
            if item['sourcetype'] == '01':    # JD
                tag = '_jd'
                item['sourceprice_jd'] = item.pop('sourceprice')
                item['commentcount_jd'] = item.pop('commentcount')
            elif item['sourcetype'] == '02':  # Dangdang
                tag = ''  # Dangdang uses the original field names; nothing to rename
            elif item['sourcetype'] == '03':  # Douban
                tag = '_db'
                item['sourceprice_db'] = item.pop('sourceprice')
                item['commentcount_db'] = item.pop('commentcount')
            elif item['sourcetype'] == '04':  # Xinhua Bookstore
                tag = '_xhsd'
                item['sourceprice_xhsd'] = item.pop('sourceprice')
                item['commentcount_xhsd'] = item.pop('commentcount')
            elif item['sourcetype'] == '05':  # Amazon
                tag = '_ymx'
                item['sourceprice_ymx'] = item.pop('sourceprice')
                item['commentcount_ymx'] = item.pop('commentcount')
            elif item['sourcetype'] == '06':  # Tmall
                tag = '_tm'
                item['sourceprice_tm'] = item.pop('sourceprice')
                item['commentcount_tm'] = item.pop('commentcount')
            # =================== distinguish the data source ===================#

            # If the id already exists, merge and update; otherwise insert
            try:
                # Fetch the document with the same id; es.get raises if it is missing
                resu = self.es.get(index=item['_entitycode'],
                                   doc_type=item['_entitycode'], id=id)
                resu_item = resu['_source']
                if tag:
                    resu_item['sourcetype' + tag] = ''
                    resu_item['commentcount' + tag] = ''
                # Fill in fields that are still empty in the stored document
                for key in resu_item:
                    if not resu_item[key]:
                        try:
                            resu_item[key] = item[key]
                        except KeyError:
                            pass
                # Keep the largest comment count
                if int(resu_item['commentcount']) < int(item['commentcount']):
                    resu_item['commentcount'] = item['commentcount']
                self.es.index(index=item['_entitycode'],
                              doc_type=item['_entitycode'], id=id,
                              body=resu_item)
                self._logger.info('ES update succeeded: ' + resu_item['url'])
            except Exception:
                # Insert new data
                # item['price'] = str(item['price'], encoding="utf-8")
                # Skip items whose cover image is missing on disk
                path = self.config['image']['path'] + item['coverpath']
                if not os.path.exists(path):
                    self._logger.info('Image does not exist: ' + item['coverpath'])
                    return None
                # Write the document to ES
                self.es.index(index=item['_entitycode'],
                              doc_type=item['_entitycode'], id=id, body=item)
                self._logger.info('ES insert succeeded: ' + item['url'])

        elif item['_entitycode'] == 'web_page_p_book_comment_09':
            if item['type'] == '02':
                body = {
                    "query": {
                        "term": {
                            "skuid": item['skuid']
                        }
                    }
                }
                result = self.es.search(index="web_page_p_book_info_09",
                                        doc_type="web_page_p_book_info_09",
                                        body=body)
                terms = result['hits']['hits']
                if not terms:
                    return None
            id = item['_row']
            self.es.index(index=item['_entitycode'],
                          doc_type=item['_entitycode'], id=id, body=item)
            self._logger.info('ES insert succeeded: ' + item['uri'])

    def dd_washing_datas(self, item, orgcode_server):
        """
        Data cleaning:
        1. author
        2. book name
        3. drop items that have no cover image
        4. convert an empty comment count to 0
        5. look up the orgcode by publisher name
        6. the publish date must not be later than now
        7. empty date strings become None
        8. strip html tags
        :param item: book-info entity
        :param orgcode_server: publisher-code lookup instance
        :return:
        """
        # Drop items without a cover image path
        if not item['coverpath']:
            return None
        # Clean author (1), e.g. '菲尔·比德尔 | 译者'
        item['author'] = item['author'].split('|')[0]
        # Clean book name (1): drop anything after '（' or '('
        item['bookname'] = item['bookname'].split('（')[0]
        item['bookname'] = item['bookname'].split('(')[0]
        # Clean book name (2): remove publisher, isbn and author fragments
        item['bookname'] = item['bookname'].replace(item['publisher'], '') \
            .replace(item['isbn'], '').replace(item['author'], '')
        # Convert an empty comment count to 0
        if not item['commentcount']:
            item['commentcount'] = 0
        # Look up the orgcode by publisher name
        item['orgcode'] = orgcode_server.query_orgcode(item['publisher'])
        # Empty date strings become None (do this before parsing publishdate)
        if not item['printedtime']:
            item['printedtime'] = None
        if not item['publishdate']:
            item['publishdate'] = None
        # The publish date must not be later than now
        if item['publishdate']:
            publishdate = datetime.datetime.strptime(item['publishdate'], "%Y-%m")
            if publishdate > datetime.datetime.now():
                return None
        # Strip html tags
        for field in ('preface', 'catalog', 'editorsugest', 'summary',
                      'usersugest', 'contentsummary', 'authorintro'):
            item[field] = item[field].replace("<br>", '')
        return True
""" """ import time import json import requests from lxml import etree from pymongo import MongoClient from mylogger import Logger log_main = Logger.get_logger(service=__name__) class FangCrawler: URL_TPL = "https://www.douban.com/group/shanghaizufang/discussion?start=" def __init__(self): self.configs = self._load_conf() self.headers = self.configs["http"]["headers"] mgo_config = self.configs["mongo"] if mgo_config.get("rs"): self.mgo = MongoClient( mgo_config["rs"]["url"], replicaSet=mgo_config["rs"]["name"], readPreference=mgo_config["rs"]["read_preference"], )
import os

import requests
from elasticsearch import Elasticsearch

from conf import Conf
from mylogger import Logger

_logger = Logger().getLogger()
config = Conf.config
es = Elasticsearch(config['elasticsearch']['hosts'])

date = '20181025'
body1 = {
    "query": {
        "term": {
            "coverpath": date
        }
    }
}
body = {
    "from": 0,
    "query": {
        "bool": {
            "must": [
                {
                    "term": {
                        "coverpath": date
                    }
                }
            ]
        }
    },
    "size": 25000
}
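
# A sketch of how a body like the one above is typically executed against the index
# used elsewhere in these scripts (logging coverpath is only an example of handling hits).
result = es.search(index="web_page_p_book_info_09",
                   doc_type="web_page_p_book_info_09", body=body)
for hit in result['hits']['hits']:
    _logger.info(hit['_source'].get('coverpath', ''))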
    return parser.parse_args()


def from_config(config):
    args = settings()
    args.batchsize = int(config.get('setting', 'batchsize'))
    args.epochs = int(config.get('setting', 'epoch'))
    args.lr = float(config.get('setting', 'lr'))
    args.loss_alpha = float(config.get('hyperparam', 'loss_alpha'))
    args.loss_beta = float(config.get('hyperparam', 'loss_beta'))
    return args


if __name__ == '__main__':
    log = Logger(logger_file)
    # args = get_args()
    args = from_config(config)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    seed = 1
    torch.manual_seed(seed)
    # Compare the device type, not the torch.device object, to the string "cuda".
    if device.type == "cuda":
        torch.cuda.manual_seed(seed)
    log.logger.info(f'Using device {device}')

    net_pretrained = models.densenet201(pretrained=True)
    for i_fold in range(0, 10):
        net = densenet.densenet201(num_classes=1)
        net_dict = net.state_dict()
        options['__groupname__'] = self.groupname
        options['__nodename__'] = self.nodename
        stats_d = {
            '__group__': self.groupname,
            '__name__': self.nodename,
            section: options,
        }
        jsonstr = json.dumps(stats_d, ensure_ascii=True)
        return jsonstr


StatsClient = AppStats

if __name__ == "__main__":
    from mylogger import Logger
    logger = Logger.getLogger('test1', None, 'DEBUG', True)
    Logger.addLoggingServer(logger, '127.0.0.1', 9900)
    logger2 = Logger.getLogger('test2', None, 'DEBUG', True)
    Logger.addLoggingServer(logger2, '127.0.0.1', 9900)

    stats = AppStats('cluster1', 'master', logger)
    stats2 = AppStats('cluster2', 'spider', logger2)

    section = 'process'
    options = {'mem': 1000, 'cpu': 0.01}
    stats.log('p1', options, 'set')
    stats.log('p2', options, 'set')
    stats.log('p3', options, 'set')
    stats.log('p4', options, 'set')
    stats.log('p5', options, 'set')
class ParseBook(object):
    _logger = Logger().getLogger()

    def __init__(self):
        # Instantiate the category server once (a single database connection)
        self.cat_server = CategoryServer()
        self.orgcode_server = OrgcoderServer()
        # Initialize the ES pipeline
        self.es_pipe = ElasticSearchPipelines()
        # Read the configuration
        try:
            self.config = Conf.config
            self.mysql_hosts = self.config['mysql']['host']
            self.mysql_port = self.config['mysql']['port']
            self.mysql_user = self.config['mysql']['username']
            self.mysql_password = self.config['mysql']['password']
            self.mysql_db = self.config['mysql']['dbname']
            self.mysql_charset = self.config['mysql']['charset']
            self.mysql_table = self.config['mysql']['book_table']
        except Exception:
            # Failed to read the configuration file
            self._logger.info("Failed to read the configuration file")
        # Open the database connection
        self.conn = pymysql.connect(host=self.mysql_hosts,
                                    port=int(self.mysql_port),
                                    user=self.mysql_user,
                                    password=self.mysql_password,
                                    db=self.mysql_db,
                                    charset=self.mysql_charset,
                                    cursorclass=pymysql.cursors.DictCursor)
        self.cursor = self.conn.cursor()

    # Push the data to ES
    def parse_item(self):
        self._logger.info("Querying rows to push: " + self.mysql_table)
        self.cursor.execute("""SELECT * FROM %s limit 140000,120000""" % self.mysql_table)
        result = self.cursor.fetchall()
        for row in result:
            # Turn the database row into an item
            item = self.initItem(row)
            item['collectiontime'] = item['collectiontime'].strftime("%Y-%m-%d %H:%M:%S")
            self.es_pipe.process_item(item)

    # Release resources
    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

    # Map a MySQL row to an item dict
    def initItem(self, row):
        item = {}
        item['bookname'] = row['bookname']
        item['subhead'] = row['subhead']
        item['publisher'] = row['publisher']
        item['orgpublisher'] = row['orgpublisher']
        item['contentsummary'] = row['contentsummary']
        item['sourcetype'] = row['sourcetype']
        item['author'] = row['author']
        item['translator'] = row['translator']
        item['isbn'] = row['isbn']
        item['orgisbn'] = row['orgisbn']
        item['salecategory'] = row['salecategory']
        item['category'] = row['category']
        item['orgcategory'] = row['orgcategory']
        item['contenttype'] = row['contenttype']
        item['issuearea'] = row['issuearea']
        item['type'] = row['type']
        item['edition'] = row['edition']
        item['impression'] = row['impression']
        item['words'] = row['words']
        item['pages'] = row['pages']
        item['language'] = row['language']
        item['price'] = row['price']
        item['printedtime'] = row['printedtime']
        item['format'] = row['format']
        item['papermeter'] = row['papermeter']
        item['packing'] = row['packing']
        item['coverurl'] = row['coverurl']
        item['coverpath'] = row['coverpath']
        item['seriename'] = row['seriename']
        item['catalog'] = row['catalog']
        item['editorsugest'] = row['editorsugest']
        item['usersugest'] = row['usersugest']
        item['preface'] = row['preface']
        item['summary'] = row['summary']
        item['epilogue'] = row['epilogue']
        item['publishdate'] = row['publishdate']
        item['collectiontime'] = row['collectiontime']
        item['orgcode'] = row['orgcode']
        item['skuid'] = row['skuid']
        item['commentcount'] = row['commentcount']
        item['_row'] = row['_row']
        item['ifimport'] = '0'
        item['_entitycode'] = row['_entitycode']
        item['url'] = row['url']
        item['commentpercent'] = row['commentpercent']
        item['commenttag'] = row['commenttag']
        item['authorintro'] = row['authorintro']
        item['sourceprice'] = row['sourceprice']
        if not item['printedtime']:
            item['printedtime'] = None
        if not item['publishdate']:
            item['publishdate'] = None
        return item
    parser.add_argument('--reset_param', type=bool, default=False)
    parser.add_argument('--method', type=str, default='ada')
    parser.add_argument('--data_dir', type=str, default='./data/')
    args = parser.parse_args()

    log_name = (f'log/mag_{args.method}_{args.prune_set}_ratio{args.ratio}'
                f'_batch_size{args.batch_size}_epochs{args.epochs}'
                f'_pruneepoch{args.prune_epoch}_times{args.times}'
                f'_reset{args.reset_param}.log')
    logger.add(log_name)
    logger.info('logname: {}'.format(log_name))
    logger.info(args)

    dataset = PygNodePropPredDataset(name='ogbn-mag', root=args.data_dir)
    data = dataset[0]
    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(name='ogbn-mag')
    logger1 = Logger(args.runs, args)

    # We do not consider those attributes for now.
    data.node_year_dict = None
    data.edge_reltype_dict = None
    # print(data)

    edge_index_dict = data.edge_index_dict

    # We need to add reverse edges to the heterogeneous graph.
    r, c = edge_index_dict[('author', 'affiliated_with', 'institution')]
    edge_index_dict[('institution', 'to', 'author')] = torch.stack([c, r])

    r, c = edge_index_dict[('author', 'writes', 'paper')]
    edge_index_dict[('paper', 'to', 'author')] = torch.stack([c, r])
class MySqlPipelines(object):
    _logger = Logger().getLogger()

    def __init__(self):
        config = Conf.config
        self.db = pymysql.connect(host=config['mysql']['host'],
                                  port=config['mysql']['port'],
                                  user=config['mysql']['username'],
                                  passwd=config['mysql']['password'],
                                  db=config['mysql']['dbname'],
                                  charset='utf8')
        self.cursor = self.db.cursor()

    def process_item(self, item):
        if item['_entitycode'] == 'web_page_p_book_info_09':
            if item['is_set'] == '是':
                return item
            # Instantiate the category and orgcode servers
            cate_server = CategoryServer()
            orgcode_server = OrgcoderServer()
            # Sale-category lookup
            contenttype = item['contenttype'].split(',')
            if not contenttype:
                contenttype = ['']
            contenttype = contenttype[-1]
            item['salecategory'] = cate_server.query_sale_category(contenttype)
            # CLC (Chinese Library Classification) lookup
            isbn = item['isbn']
            cate_code = cate_server.query_cate_server(isbn)
            item['category'] = cate_code
            item['orgcategory'] = cate_code
            # Look up the orgcode by publisher name
            item['orgcode'] = orgcode_server.query_orgcode(item['publisher'])

            sql = '''insert into web_page_p_book_info_09_dangdang(bookname, subhead, publisher, orgpublisher,
                contentsummary, sourcetype, author, translator, isbn, orgisbn, salecategory, category, orgcategory,
                contenttype, issuearea, type, edition, impression, words, pages, language, price, printedtime,
                format, papermeter, packing, coverurl, coverpath, seriename, catalog, editorsugest, usersugest,
                preface, summary, epilogue, publishdate, collectiontime, orgcode, skuid, commentcount, _row,
                ifimport, _entitycode, url, commentpercent, commenttag, authorintro, sourceprice)
                values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'''
            parames = (item['bookname'], item['subhead'], item['publisher'], item['orgpublisher'],
                       item['contentsummary'], item['sourcetype'], item['author'], item['translator'],
                       item['isbn'], item['orgisbn'], item['salecategory'], item['category'],
                       item['orgcategory'], item['contenttype'], item['issuearea'], item['type'],
                       item['edition'], item['impression'], item['words'], item['pages'],
                       item['language'], item['price'], item['printedtime'], item['format'],
                       item['papermeter'], item['packing'], item['coverurl'], item['coverpath'],
                       item['seriename'], item['catalog'], item['editorsugest'], item['usersugest'],
                       item['preface'], item['summary'], item['epilogue'], item['publishdate'],
                       item['collectiontime'], item['orgcode'], item['skuid'], item['commentcount'],
                       item['_row'], item['ifimport'], item['_entitycode'], item['url'],
                       item['commentpercent'], item['commenttag'], item['authorintro'], item['sourceprice'])
            try:
                self.cursor.execute(sql, parames)
                self.db.commit()
                self._logger.info('Inserted ===book_info=== row: ' + item['url'])
            except Exception:
                # Mark is_set as '是' so this duplicate item does not trigger an image download
                item['is_set'] = '是'
                self._logger.info('Duplicate row, ===book_info=== insert failed: ' + item['url'])

        elif item['_entitycode'] == 'web_page_p_book_comment_09':
            sql = '''insert into web_page_p_book_comment_09_dangdang(_row, isbn, uri, bookname, sourcetype,
                collectiontime, publishtime, username, hitcount, follownum, suportnum, opposnum, commentid,
                followcommentid, commenttitle, commenttype, comment, score, level, commpoint, type, sitename,
                ifimport, _entitycode)
                values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'''
            parames = (item['_row'], item['isbn'], item['uri'], item['bookname'], item['sourcetype'],
                       item['collectiontime'], item['publishtime'], item['username'], item['hitcount'],
                       item['follownum'], item['suportnum'], item['opposnum'], item['commentid'],
                       item['followcommentid'], item['commenttitle'], item['commenttype'], item['comment'],
                       item['score'], item['level'], item['commpoint'], item['type'], item['sitename'],
                       '0', item['_entitycode'])
            try:
                self.cursor.execute(sql, parames)
                self.db.commit()
                self._logger.info('Inserted ~~~book_comment~~~ row: ' + item['url'])
            except Exception:
                self._logger.info('Duplicate row, ~~~book_comment~~~ insert failed: ' + item['url'])

        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()
# -*-coding:utf-8-*-
import json
import time

import stomp

from conf import Conf
from mylogger import Logger
from pipelines import ElasticSearchPipelines, MySqlPipelines, SaveImgPipelines

config = Conf.config
_logger = Logger().getLogger()


# Message listener
class BookListener(object):
    mysql_pipe = MySqlPipelines()
    image_pipe = SaveImgPipelines()
    es_pipe = ElasticSearchPipelines()

    def on_message(self, headers, message):
        """
        Handle a received message.
        :param headers:
        :param message:
        :return:
        """
        item = json.loads(message)
        # _logger.info('Received message: ' + item['isbn'])
        # Push the data to MySQL
        self.mysql_pipe.process_item(item)
        # Download the image
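
# The listener above is truncated; below is a minimal wiring sketch (an assumption:
# the broker host/port config keys and the queue name are examples, not taken from conf,
# and the exact stomp.py call sequence depends on the library version).
if __name__ == '__main__':
    conn = stomp.Connection([(config['activemq']['host'], int(config['activemq']['port']))])
    conn.set_listener('book_listener', BookListener())
    conn.connect(wait=True)
    conn.subscribe(destination='/queue/09_p_spider', id=1, ack='auto')
    while True:
        time.sleep(60)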
def main():
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (GNN)')
    # parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=201)
    parser.add_argument('--runs', type=int, default=1)
    parser.add_argument('--prune_set', type=str, default='train')
    parser.add_argument('--ratio', type=float, default=0.95)
    parser.add_argument('--times', type=int, default=20)
    parser.add_argument('--prune_epoch', type=int, default=301)
    parser.add_argument('--reset_param', type=bool, default=False)
    parser.add_argument('--naive', type=bool, default=False)
    parser.add_argument('--data_dir', type=str, default='./data/')
    args = parser.parse_args()

    log_name = (f'log/arxivtest_{args.prune_set}_{args.ratio}_{args.epochs}'
                f'_{args.prune_epoch}_{args.times}.log')
    logger.add(log_name)
    logger.info('logname: {}'.format(log_name))
    logger.info(args)

    # device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    # device = torch.device(device)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    dataset = PygNodePropPredDataset(name='ogbn-arxiv', root=args.data_dir,
                                     transform=T.ToSparseTensor())
    data = dataset[0]
    data.adj_t = data.adj_t.to_symmetric()
    data = data.to(device)

    split_idx = dataset.get_idx_split()
    train_idx = split_idx['train'].to(device)

    if args.use_sage:
        model = SAGE(data.num_features, args.hidden_channels,
                     dataset.num_classes, args.num_layers,
                     args.dropout).to(device)
    else:
        model = GCN(data.num_features, args.hidden_channels,
                    dataset.num_classes, args.num_layers,
                    args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-arxiv')
    logger1 = Logger(args.runs, args)

    row, col, val = data.adj_t.coo()
    N = int(row.max() + 1)
    # Add self-loop edges before building the edge_index used by the pruner.
    row = torch.cat([torch.arange(0, N).cuda(), row], dim=0)
    col = torch.cat([torch.arange(0, N).cuda(), col], dim=0)
    edge_index = torch.cat([row, col]).view(2, -1)
    data.edge_index = edge_index
    # print(data.edge_index)
    pruner = Pruner(edge_index.cpu(), split_idx, prune_set=args.prune_set,
                    ratio=args.ratio)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, data, train_idx, optimizer)
            result = test(model, data, split_idx, evaluator)
            logger1.add_result(run, result)
            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                logger.info(
                    f'Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}, '
                    f'Train: {100 * train_acc:.2f}%, Valid: {100 * valid_acc:.2f}% '
                    f'Test: {100 * test_acc:.2f}%')
        logger1.print_statistics(ratio=1)
        logger1.flush()

        for i in range(1, args.times + 1):
            pruner.prune(naive=args.naive)
            if args.reset_param:
                model.reset_parameters()
            for epoch in range(1, 1 + args.prune_epoch):
                loss = train(model, data, train_idx, optimizer, pruner=pruner)
                result = test(model, data, split_idx, evaluator, pruner=pruner)
                logger1.add_result(run, result)
                if epoch % args.log_steps == 0:
                    train_acc, valid_acc, test_acc = result
                    logger.info(
                        f'Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}, '
                        f'Train: {100 * train_acc:.2f}%, Valid: {100 * valid_acc:.2f}% '
                        f'Test: {100 * test_acc:.2f}%')
            logger1.print_statistics(ratio=args.ratio**i)
            logger1.flush()
class CategoryServer():
    # Authorization URL for the NLC (National Library of China) OPAC
    auth_url = ''
    _logger = Logger().getLogger()
    proxies = {
        "http": ''  # proxy ip
    }

    def __init__(self):
        # self.reload_authURL()
        config = Conf.config
        # Initialize the database connections
        self.db = pymysql.connect(host=config['mysql']['host'],
                                  port=config['mysql']['port'],
                                  user=config['mysql']['username'],
                                  passwd=config['mysql']['password'],
                                  db=config['mysql']['dbname_cate'])
        self.cursor = self.db.cursor()
        self.db_sale = pymysql.connect(host=config['mysql']['host'],
                                       port=config['mysql']['port'],
                                       user=config['mysql']['username'],
                                       passwd=config['mysql']['password'],
                                       db=config['mysql']['dbname_sale_cate'])
        self.cursor_sale = self.db_sale.cursor()

    def reload_authURL(self):
        """
        Refresh the authorization URL for the NLC OPAC site.
        :return:
        """
        try:
            proxy_ip = requests.get(
                'http://api.ip.data5u.com/dynamic/get.html?order=f6d9a18f02f520f2aaac6b249fd8689e'
            ).content.decode().strip()
            self.proxies['http'] = proxy_ip
            url = 'http://opac.nlc.cn/F?RN=989462048'
            response = requests.get(url, timeout=20, proxies=self.proxies)
            html = response.text
            self.auth_url = re.findall('tmp="([^"]+)"', html)[0]
        except Exception:
            self._logger.error('Failed to refresh the NLC authorization URL')
            self.auth_url = 'http://opac.nlc.cn:80/F/IYKXX91A5NCBPEQP1DQHLF471L8ANIEHXUMSUTI2HLRRXI77MF-10964'

    def query_cate_server(self, isbn):
        """
        CLC lookup entry point: check the book_isbn_cate table first and return a hit
        directly; otherwise query the NLC site and cache the result back into MySQL.
        :param isbn:
        :return:
        """
        # Check MySQL for this ISBN first
        cate_code = self.query_cate_mysql(isbn)
        if cate_code:
            return cate_code
        # Refresh the authorization URL
        try:
            self.reload_authURL()
        except Exception as e:
            self._logger.error(e)
        url = self.auth_url + '?func=find-b&find_code=ISB&request=%s&local_base=NLC01&filter_code_1=WLN&filter_request_1=&filter_code_2=WYR&filter_request_2=&filter_code_3=WYR&filter_request_3=&filter_code_4=WFM&filter_request_4=&filter_code_5=WSL&filter_request_5=' % isbn
        try:
            # Fetch the NLC result page for this ISBN and parse it
            response = requests.get(url, timeout=10, proxies=self.proxies)
            html = response.text
        except Exception as e:
            self._logger.error(e)
            html = ''
        cate_code = re.findall(r'CALL-NO:\s*?([^\r\n]*)', html)
        if not cate_code:
            self._logger.info('NLC server has no record for isbn: ' + isbn)
            return ''
        cate_code = cate_code[0].strip()
        if not cate_code:
            self._logger.info('NLC server has no record for isbn: ' + isbn)
            return ''
        self._logger.info('NLC server lookup ========> isbn: ' + isbn + ' category: ' + cate_code)
        # Cache the new CLC category in the database
        self.insert_cate_mysql(isbn, cate_code)
        return cate_code

    def query_cate_mysql(self, isbn):
        """
        Look up the CLC category in MySQL.
        :param isbn:
        :return:
        """
        sql = 'select category from book_isbn_cate where isbn = %s'
        self.cursor.execute(sql, (isbn,))
        result = self.cursor.fetchone()
        if not result:
            self._logger.info('No CLC record in MySQL for isbn: ' + isbn + ', falling back to the NLC server')
            return None
        self._logger.info('CLC MySQL lookup ========> isbn: ' + isbn + ' category: ' + result[0])
        return result[0]

    def insert_cate_mysql(self, isbn, cate_code):
        """
        Insert a CLC category into the database.
        :param isbn:
        :param cate_code: CLC call number
        :return:
        """
        sql = 'insert into book_isbn_cate(isbn,category,savetime) values(%s,%s,%s)'
        now = datetime.datetime.now()
        params = (isbn, cate_code, now)
        self.cursor.execute(sql, params)
        self.db.commit()

    def query_sale_category(self, salecategory_name):
        """
        Look up the sale category in MySQL.
        :param salecategory_name:
        :return:
        """
        sql = 'select id from book_category_cate where name like %s'
        self.cursor_sale.execute(sql, ('%' + salecategory_name + '%',))
        result = self.cursor_sale.fetchone()
        if not result:
            self._logger.info('No such sale category ========> salecategory_name: ' + salecategory_name)
            return ''
        self._logger.info('Sale category lookup ========> salecategory_name: '
                          + salecategory_name + ' ID: ' + str(result[0]))
        return result[0]
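
# A minimal usage sketch (an assumption about how the class is driven; the ISBN and
# category name are example values only). query_cate_server falls back to the NLC
# website when MySQL has no cached entry.
if __name__ == '__main__':
    cate_server = CategoryServer()
    print(cate_server.query_cate_server('9787115428028'))
    print(cate_server.query_sale_category('小说'))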
    parser = argparse.ArgumentParser(
        description='Train the UNet on images and target masks',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-f', '--filename', dest='fname', type=str,
                        default="demo1.jpg",
                        help='The file in ./demo/demo_img/')
    return parser.parse_args()


if __name__ == '__main__':
    log = Logger("log_demo.txt")
    args = get_args()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    seed = 1
    torch.manual_seed(seed)
    # Compare the device type, not the torch.device object, to the string "cuda".
    if device.type == "cuda":
        torch.cuda.manual_seed(seed)
    log.logger.info(f'Using device {device}')
    log.logger.info(f'Start initializing model...')

    net_pretrained = models.densenet201(pretrained=True)
    net = densenet.densenet201(num_classes=1)
            nodename = l[3]
            section = l[4]
            if groupname not in ret:
                ret[groupname] = {}
            if nodename not in ret[groupname]:
                ret[groupname][nodename] = []
            if section not in ret[groupname][nodename]:
                ret[groupname][nodename].append(section)
        return ret


StatsServer = RedisStats

if __name__ == "__main__":
    from mylogger import Logger
    logger = Logger.getLogger('debug', None, 'DEBUG', True)
    Logger.addLoggingServer(logger, '127.0.0.1', 9900)

    from stats_client import AppStats
    stats = AppStats('cluster1', 'selector', logger)
    stats2 = AppStats('cluster2', 'master', logger)
    stats3 = AppStats('cluster3', 'parser', logger)

    import redis
    redis_ = redis.Redis()
    redisStats = RedisStats('test', redis_, logger)

    section = 'process'
    options = {'mem': 1000, 'cpu': 0.01}
    stats_str = stats.stats_encode('p1', options, 'set')
class MySqlPipelines(object):
    _logger = Logger().getLogger()

    def __init__(self):
        config = Conf.config
        # Create the database connection
        self.db = pymysql.connect(host=config['mysql']['host'],
                                  port=config['mysql']['port'],
                                  user=config['mysql']['username'],
                                  passwd=config['mysql']['password'],
                                  db=config['mysql']['dbname'],
                                  charset='utf8')
        self.cursor = self.db.cursor()
        # Instantiate the category server
        self.cate_server = CategoryServer()

    def process_item(self, item):
        """
        Look up the sale category, CLC category and publisher code, then push the data to MySQL.
        :param item: book-info entity
        :return: item
        """
        if item['_entitycode'] == 'web_page_p_book_info_09':
            if item['is_set'] == '是':
                return item
            # Sale-category lookup
            contenttype = item['contenttype'].split(',')
            if not contenttype:
                contenttype = ['']
            contenttype = contenttype[-1]
            item['salecategory'] = self.cate_server.query_sale_category(contenttype)
            # CLC lookup
            isbn = item['isbn']
            cate_code = self.cate_server.query_cate_server(isbn)
            item['category'] = cate_code
            item['orgcategory'] = cate_code

            # Pick the table by data source
            if item['sourcetype'] == '01':
                table = 'web_page_p_book_info_09_jingdong'
            elif item['sourcetype'] == '02':
                table = 'web_page_p_book_info_09_dangdang'
            elif item['sourcetype'] == '03':
                table = 'web_page_p_book_info_09_douban'
            elif item['sourcetype'] == '04':
                table = 'web_page_p_book_info_09_xinhuashudian'
            elif item['sourcetype'] == '05':
                table = 'web_page_p_book_info_09_yamaxun'
            elif item['sourcetype'] == '06':
                table = 'web_page_p_book_info_09_tianmao'

            sql = ('insert into ' + table + '(bookname, subhead, publisher, orgpublisher, contentsummary, '
                   'sourcetype, author, translator, isbn, orgisbn, salecategory, category, orgcategory, '
                   'contenttype, issuearea, type, edition, impression, words, pages, language, price, '
                   'printedtime, format, papermeter, packing, coverurl, coverpath, seriename, catalog, '
                   'editorsugest, usersugest, preface, summary, epilogue, publishdate, collectiontime, '
                   'orgcode, skuid, commentcount, _row, ifimport, _entitycode, url, commentpercent, '
                   'commenttag, authorintro, sourceprice) '
                   'values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, '
                   '%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)')
            parames = (
                item['bookname'], item['subhead'], item['publisher'], item['orgpublisher'],
                item['contentsummary'], item['sourcetype'], item['author'], item['translator'],
                item['isbn'], item['orgisbn'], item['salecategory'], item['category'],
                item['orgcategory'], item['contenttype'], item['issuearea'], item['type'],
                item['edition'], item['impression'], item['words'], item['pages'],
                item['language'], item['price'], item['printedtime'], item['format'],
                item['papermeter'], item['packing'], item['coverurl'], item['coverpath'],
                item['seriename'], item['catalog'], item['editorsugest'], item['usersugest'],
                item['preface'], item['summary'], item['epilogue'], item['publishdate'],
                item['collectiontime'], item['orgcode'], item['skuid'], item['commentcount'],
                item['_row'], item['ifimport'], item['_entitycode'], item['url'],
                item['commentpercent'], item['commenttag'], item['authorintro'], item['sourceprice']
            )
            try:
                self.cursor.execute(sql, parames)
                self.db.commit()
                self._logger.info('MySQL insert into ===book_info=== succeeded: ' + item['url'])
            except Exception:
                item['is_set'] = '是'
                self._logger.info('MySQL insert into ===book_info=== failed: ' + item['url'])

        elif item['_entitycode'] == 'web_page_p_book_comment_09':
            # Pick the table by data source
            if item['sourcetype'] == '01':
                table = 'web_page_p_book_comment_09_jingdong'
            elif item['sourcetype'] == '02':
                table = 'web_page_p_book_comment_09_dangdang'
            elif item['sourcetype'] == '03':
                table = 'web_page_p_book_comment_09_douban'
            elif item['sourcetype'] == '04':
                table = 'web_page_p_book_comment_09_xinhuashudian'
            elif item['sourcetype'] == '05':
                table = 'web_page_p_book_comment_09_yamaxun'
            elif item['sourcetype'] == '06':
                table = 'web_page_p_book_comment_09_tianmao'

            sql = ('insert into ' + table + '(_row, isbn, uri, bookname, sourcetype, collectiontime, '
                   'publishtime, username, hitcount, follownum, suportnum, opposnum, commentid, '
                   'followcommentid, commenttitle, commenttype, comment, score, level, commpoint, type, '
                   'sitename, ifimport, _entitycode) '
                   'values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)')
            parames = (
                item['_row'], item['isbn'], item['uri'], item['bookname'], item['sourcetype'],
                item['collectiontime'], item['publishtime'], item['username'], item['hitcount'],
                item['follownum'], item['suportnum'], item['opposnum'], item['commentid'],
                item['followcommentid'], item['commenttitle'], item['commenttype'], item['comment'],
                item['score'], item['level'], item['commpoint'], item['type'], item['sitename'],
                '0', item['_entitycode']
            )
            try:
                self.cursor.execute(sql, parames)
                self.db.commit()
                self._logger.info('MySQL insert into ~~~book_comment~~~ succeeded: ' + item['_row'])
            except Exception as e:
                self._logger.error(e)
                self._logger.info('MySQL insert into ~~~book_comment~~~ failed: ' + item['_row'])

        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()
"""github 自动部署. """ import os import sys import subprocess from flask import Flask, request sys.path.append(os.path.join(os.path.abspath(__file__).rsplit('/', 1)[0], 'logger')) from configs import get_configs from mylogger import Logger log_main = Logger.get_logger(__file__) app = Flask(__name__) configs_sys = get_configs() # 系统配置 # Route @app.route('/deploy/<project>', methods=['POST']) def deploy(project=None): if project.upper() not in configs_sys['GIT']: log_main.critical('No such project: {0}'.format(project)) sys.exit(-1) html_url = request.json['repository']['html_url']
import os
import re

import requests
from elasticsearch import Elasticsearch

from mylogger import Logger

_logger = Logger().getLogger()
es = Elasticsearch('10.13.11.21:9200')

index = 0
try:
    with open('zlog.text', 'r', encoding='utf-8') as file:
        lists = file.readlines()
        for l in lists:
            date = re.findall(r"个文件:(\d+)/\d+.jpg", l)
            _row = re.findall(r"个文件:\d+/(\d+).jpg", l)
            if _row:
                _row = _row[0]
                date = date[0]
                body = {"query": {"term": {"_row": _row}}}
                result = es.search(index="web_page_p_book_info_09",
                                   doc_type="web_page_p_book_info_09",
                                   body=body)
                terms = result['hits']['hits']
                if terms:
                    term = terms[0]
                    coverurl = term['_source']['coverurl']
                    coverpath = term['_source']['coverpath']
                    if date in coverpath:
                        img_path = '/mount/fhcb/fileserver/img' + coverpath
class ParseComment(object):
    _logger = Logger().getLogger()

    def __init__(self):
        # Read the configuration
        config = Conf.config
        # Initialize the ES pipeline
        self.es_pipe = ElasticSearchPipelines()
        self.mysql_hosts = config['mysql']['host']
        self.mysql_port = config['mysql']['port']
        self.mysql_user = config['mysql']['username']
        self.mysql_password = config['mysql']['password']
        self.mysql_db = config['mysql']['dbname']
        self.mysql_charset = config['mysql']['charset']
        self.mysql_table = config['mysql']['comment_table']
        # Open the database connection
        self.conn = pymysql.connect(host=self.mysql_hosts,
                                    port=int(self.mysql_port),
                                    user=self.mysql_user,
                                    password=self.mysql_password,
                                    db=self.mysql_db,
                                    charset=self.mysql_charset,
                                    cursorclass=pymysql.cursors.DictCursor)
        self.cursor = self.conn.cursor()

    # Push the data to ES
    def parse_item(self):
        self._logger.info("Querying rows to push: " + self.mysql_table)
        self.cursor.execute("""SELECT * FROM %s limit 0,100000""" % self.mysql_table)
        result = self.cursor.fetchall()
        for row in result:
            # Turn the database row into an item
            item = self.initItem(row)
            self.es_pipe.process_item(item)

    # Release resources
    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

    # Map a MySQL row to an item dict
    def initItem(self, row):
        item = {}
        item['isbn'] = row['isbn']
        item['uri'] = row['uri']
        skuid = row['uri'].split('/')[7]
        item['bookname'] = row['bookname']
        item['sourcetype'] = row['sourcetype']
        item['collectiontime'] = row['collectiontime']
        item['publishtime'] = row['publishtime']
        item['username'] = row['username']
        # Default the int-like fields to 0 when empty
        hitcount = row['hitcount']
        if not hitcount:
            hitcount = '0'
        item['hitcount'] = hitcount
        follownum = row['follownum']
        if not follownum:
            follownum = '0'
        item['follownum'] = follownum
        suportnum = row['suportnum']
        if not suportnum:
            suportnum = '0'
        item['suportnum'] = suportnum
        opposnum = row['opposnum']
        if not opposnum:
            opposnum = '0'
        item['opposnum'] = opposnum
        item['commentid'] = row['commentid']
        item['followcommentid'] = row['followcommentid']
        item['commenttitle'] = row['commenttitle']
        item['commenttype'] = row['commenttype']
        item['comment'] = row['comment']
        # Default the score to 5 and the level to 0 when empty
        score = row['score']
        if not score:
            score = '5'
        item['score'] = score
        level = row['level']
        if not level:
            level = '0'
        item['level'] = level
        item['commpoint'] = row['commpoint']
        item['type'] = row['type']
        item['sitename'] = row['sitename']
        item['_entitycode'] = row['_entitycode']
        item['_row'] = row['_row']
        item['skuid'] = skuid
        return item
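
# A minimal driver sketch (an assumption; the original entry point is not shown here).
if __name__ == '__main__':
    parse_comment = ParseComment()
    parse_comment.parse_item()
    parse_comment.close_spider(None)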
import configparser
import datetime
import platform
import os

from TimerTask import Task
from email_dict import to_send_email
from email_send import EmailSend
from loading import Loading
from mylogger import Logger
from warning_main import WarningPlay
from warnstone import stoneobject

if __name__ == '__main__':
    # Logger instance
    logname = "生日预警日志"
    log = Logger(logname)
    logger = log.getlogger()
    # Config parser instance
    conf = configparser.ConfigParser()
    path = 'warning.conf'
    assert os.path.exists(path), "{file} does not exist".format(file=path)
    if platform.system() == 'Windows':
        conf.read(path, encoding="utf-8-sig")
    else:
        conf.read(path)
    # Database instance
    stone = stoneobject()
    # Initialize the timer task
    task = Task("08:00", logger)
    times = conf.get(section="time", option="now")
    if task.times != datetime.time(int(times.split(':')[0]), int(times.split(':')[1])):
    f.protocol = MasterServer
    port = g_config.getint('master', 'port')
    g_logger.info('listenTCP %s' % port)
    reactor.listenTCP(port, f)
    reactor.run()


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print 'Usage: %s <config>' % sys.argv[0]
        sys.exit(1)

    # Read the configuration file
    config_file = sys.argv[1]
    g_config = ConfigParser.ConfigParser()
    g_config.read(config_file)

    g_master_name = g_config.get('master', 'name')
    level = g_config.get('master', 'level') if g_config.has_option('master', 'level') else "DEBUG"
    debug = g_config.getboolean('master', 'debug') if g_config.has_option('master', 'debug') else True
    logfile = g_config.get('master', 'logfile') if g_config.has_option('master', 'logfile') else None
    logname = g_config.get('master', 'logname') if g_config.has_option('master', 'logname') else None
    if not debug:
        assert logfile, 'logfile must be set when not debug mode'
    g_logger = Logger.getLogger(logname, logfile, level=level, debug=debug)

    g_redis_addr = g_config.get('master', 'redis_addr') if g_config.has_option('master', 'redis_addr') else 'localhost'
    g_redis_port = g_config.getint('master', 'redis_port') if g_config.has_option('master', 'redis_port') else 6379

    main()