def parse_detail(self, message):
    """Parse the downloaded detail files and insert the records into the db3 file.

    Steps:
      1. Load journal metadata from the ``hepjournal`` MySQL database into
         ``self.dic`` keyed by ``journal_id``.
      2. Call ``self.predb3()`` (presumably prepares the db3 output and sets
         ``self.conn`` -- confirm against its definition).
      3. Run ``self.parse_detail_one`` on every file under
         ``self.detail_path`` and insert the results in batches.
      4. Close ``self.conn`` and send a completion notice.

    Args:
        message: Not used by this method.
    """
    # Close the cursor and connection even if the query or the row unpacking
    # raises, so a failed run does not leak the MySQL connection.
    conn = utils.init_db('mysql', 'hepjournal', 4)
    try:
        cur = conn.cursor()
        try:
            cur.execute(
                'select journal_id,journal_name,issn,eissn,cnno from journal')
            for journal_id, journal_name, issn, eissn, cnno in cur.fetchall():
                self.dic[journal_id] = (journal_name, issn, eissn, cnno)
        finally:
            cur.close()
    finally:
        conn.close()

    self.predb3()
    self.sqlList.clear()
    # 29 columns / 29 placeholders.
    stmt = (
        'insert or ignore into modify_title_info_zt('
        'lngid, rawid, creator, title, volume, issue, page, beginpage, '
        'endpage, publisher, subject, date,creator_institution, '
        'date_created, source, identifier_pissn, identifier_eissn, '
        'identifier_cnno, description, identifier_doi, language, country, '
        'provider, provider_url, provider_id, type, medium, batch, gch)'
        'values(?,?,?,?,?,?,?,?,?,?,'
        '?,?,?,?,?,?,?,?,?,?,'
        '?,?,?,?,?,?,?,?,?)'
    )
    progress = '%s: 插入 %d 条数据到db3'
    inserted = 0
    for filename, fullname in utils.file_list(self.detail_path):
        record = self.parse_detail_one(filename, fullname)
        if record:
            self.sqlList.append(record)
        # A truthy return means the pending rows were handled for the batch
        # size of 50 (presumably written to the db3 -- confirm in utils), so
        # they are counted and the buffer is reset.
        if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
            inserted += len(self.sqlList)
            utils.printf(progress % (self.provider, inserted))
            self.sqlList.clear()

    # Final call without a batch size for whatever is still buffered.
    utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
    inserted += len(self.sqlList)
    utils.printf(progress % (self.provider, inserted))
    self.conn.close()
    self.conn = None
    utils.msg2weixin('%s: 解析完成,成品文件为%s' %
                     (self.provider, self.template_file))
def parse_detail_meta(self, message):
    """Parse the downloaded detail files into the ``base_obj_meta_a`` table.

    Journal metadata is first read from the ``aiaajournal`` MySQL database
    into ``self.dic`` keyed by ``gch``. ``self.predb3`` is then called with
    the meta template and output names, every file under
    ``self.detail_path`` is run through ``self.parse_detail_one`` in
    ``'meta'`` mode, and the results are inserted in batches. ``self.conn``
    is closed at the end and a completion notice is sent.

    Args:
        message: Not used by this method.
    """
    # Close the cursor and connection even if the query or the row unpacking
    # raises, so a failed run does not leak the MySQL connection.
    conn = utils.init_db('mysql', 'aiaajournal', 2)
    try:
        cur = conn.cursor()
        try:
            cur.execute(
                'select gch,journal_name,journal_name_en,pissn,eissn from journal')
            for gch, journal_name, journal_name_en, pissn, eissn in cur.fetchall():
                self.dic[gch] = (journal_name, journal_name_en, pissn, eissn)
        finally:
            cur.close()
    finally:
        conn.close()

    self.predb3('base_obj_meta_a_template_qk.db3',
                'base_obj_meta_a_qk.aiaajournal')
    self.sqlList.clear()
    # 41 columns / 41 placeholders (30 + 11).
    stmt = (
        'insert into base_obj_meta_a ('
        'author,author_1st,organ,organ_1st,title,title_alt,keyword,'
        'pub_year,pub_date, '
        'vol,num,journal_raw_id,journal_name,journal_name_alt,page_info,'
        'begin_page,end_page,subject,is_oa,down_cnt,lngid, '
        'rawid,product,sub_db, '
        'provider,sub_db_id,source_type,provider_url,country,language,'
        'batch,down_date,publisher,issn,eissn,abstract, '
        'abstract_alt,doi,fund,ref_cnt,fulltext_type) '
        'values(?,?,?,?,?,?,?,?,?,?,'
        '?,?,?,?,?,?,?,?,?,?,'
        '?,?,?,?,?,?,?,?,?,?, '
        '?,?,?,?,?,?,?,?,?,?,?)'
    )
    progress = '%s: 插入 %d 条数据到db3'
    inserted = 0
    for filename, fullname in utils.file_list(self.detail_path):
        record = self.parse_detail_one(filename, fullname, 'meta')
        if record:
            self.sqlList.append(record)
        # A truthy return means the pending rows were handled for the batch
        # size of 50 (presumably written to the db3 -- confirm in utils), so
        # they are counted and the buffer is reset.
        if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
            inserted += len(self.sqlList)
            utils.printf(progress % (self.provider, inserted))
            self.sqlList.clear()

    # Final call without a batch size for whatever is still buffered.
    utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
    inserted += len(self.sqlList)
    utils.printf(progress % (self.provider, inserted))
    self.conn.close()
    self.conn = None
    utils.msg2weixin('%s: 解析完成,成品文件为%s' %
                     (self.provider, self.template_file))
def parse_detail(self, message):
    """Parse the downloaded book detail files and insert them into the db3 file.

    Publication years are first read from the ``aiaabook`` MySQL database
    into ``self.dic``, keyed by a DOI built as ``'10.2514/'`` plus the last
    path segment of each book URL. Every file under ``self.detail_path`` is
    then run through ``self.parse_detail_one`` in ``'zt'`` mode and the
    results are inserted in batches. ``self.conn`` is closed at the end and
    a completion notice is sent.

    Args:
        message: Not used by this method.
    """
    # Close the cursor and connection even if the query or the row handling
    # raises, so a failed run does not leak the MySQL connection.
    conn = utils.init_db('mysql', 'aiaabook', 2)
    try:
        cur = conn.cursor()
        try:
            cur.execute('select url,pub_year from book')
            for url, pub_year in cur.fetchall():
                doi = '10.2514/' + url.split('/')[-1]
                # Stored as the bare value (the original parentheses did not
                # create a tuple).
                self.dic[doi] = pub_year
        finally:
            cur.close()
    finally:
        conn.close()

    self.predb3()
    self.sqlList.clear()
    # 22 columns / 22 placeholders.
    stmt = (
        'insert or ignore into modify_title_info_zt('
        'lngid, rawid, creator, title, identifier_pisbn, identifier_eisbn, '
        'description, publisher,cover,title_series, date,date_created, '
        'price, language, country, provider, provider_url, identifier_doi, '
        'provider_id, type,medium, batch) '
        'values(?,?,?,?,?,?,?,?,?,?,'
        '?,?,?,?,?,?,?,?,?,?,'
        '?,?) '
    )
    progress = '%s: 插入 %d 条数据到db3'
    inserted = 0
    for filename, fullname in utils.file_list(self.detail_path):
        record = self.parse_detail_one(filename, fullname, 'zt')
        if record:
            self.sqlList.append(record)
        # A truthy return means the pending rows were handled for the batch
        # size of 50 (presumably written to the db3 -- confirm in utils), so
        # they are counted and the buffer is reset.
        if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
            inserted += len(self.sqlList)
            utils.printf(progress % (self.provider, inserted))
            self.sqlList.clear()

    # Final call without a batch size for whatever is still buffered.
    utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
    inserted += len(self.sqlList)
    utils.printf(progress % (self.provider, inserted))
    self.conn.close()
    self.conn = None
    utils.msg2weixin('%s: 解析完成,成品文件为%s' %
                     (self.provider, self.template_file))
def startdown_list(self, message):
    """Queue a ``down_list`` job for every issue row whose ``stat`` is 0.

    The pending issue URLs are read from the ``issue`` table of the
    ``science`` MySQL database. When there are none, either a "nothing to
    update" notice is logged and sent (``self.index_path`` is empty) or a
    ``parse_list`` job is queued. Otherwise a target directory is created
    per URL and one ``down_list`` job is sent with ``(url, filename)``.

    Args:
        message: Not used by this method.
    """
    utils.printf('%s:开始下载列表页...' % self.provider)
    if not self.list_path:
        self.initpath()
    self.refreshproxypool()
    self.count = 0

    # The rows are fully fetched before any work is dispatched, so the cursor
    # and connection can be released right away instead of being leaked.
    conn = utils.init_db('mysql', 'science')
    try:
        cur = conn.cursor()
        try:
            cur.execute('select url,stat from issue where stat=0')
            rows = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    self.totalcount = len(rows)
    if self.totalcount == 0:
        if not os.listdir(self.index_path):
            notice = '%s:没有新的issue不需要更新' % self.provider
            utils.logerror(notice)
            utils.msg2weixin(notice)
        else:
            self.sendwork('parse_list')
        return

    for url, _ in rows:
        # Directory is named after the text before the first '.' in the URL.
        fdir = self.list_path + '/' + url.split('.')[0]
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(fdir, exist_ok=True)
        segments = url.split('/')
        fname = fdir + '/' + segments[-2] + '_' + segments[-1] + '.html'
        self.sendwork('down_list', ('http://' + url, fname))
def upload2HDFS(self, message):
    """Bundle the detail files and upload the bundle, then report the result.

    Calls ``utils.all_2_one`` on ``self.detail_path`` / ``self.merge_path``
    followed by ``utils.ProcAll`` on ``self.merge_path`` /
    ``self.hdfs_path``. A truthy result sends a success notice; a falsy one
    is both logged as an error and sent as a notice.

    Args:
        message: Not used by this method.
    """
    utils.all_2_one(self.detail_path, self.merge_path)
    uploaded = utils.ProcAll(self.merge_path, self.hdfs_path)

    if not uploaded:
        failure = '%s:bigjson上传至%s出现问题' % (self.provider, self.hdfs_path)
        utils.logerror(failure)
        utils.msg2weixin(failure)
        return

    utils.msg2weixin('%s:bigjson成功上传至%s' % (self.provider, self.hdfs_path))
def parse_detail(self, message):
    """Parse the downloaded detail files and insert the records into the db3 file.

    Calls ``self.predb3()`` (presumably prepares the db3 output and sets
    ``self.conn`` -- confirm against its definition), runs
    ``self.parse_detail_one`` on every file under ``self.detail_path`` and
    inserts the results in batches. ``self.conn`` is closed at the end and a
    completion notice is sent.

    Args:
        message: Not used by this method.
    """
    self.predb3()
    self.sqlList.clear()
    # 24 columns / 24 placeholders.
    stmt = (
        'insert or ignore into modify_title_info_zt('
        'lngid, rawid, creator, title, title_series, cover, page, '
        'publisher, subject, date, creator_bio, date_created, '
        'identifier_pisbn, identifier_eisbn, description, identifier_doi, '
        'language, country, provider, provider_url, provider_id, type, '
        'medium, batch)'
        'values(?,?,?,?,?,?,?,?,?,?,'
        '?,?,?,?,?,?,?,?,?,?,'
        '?,?,?,?)'
    )
    progress = '%s: 插入 %d 条数据到db3'
    inserted = 0
    for filename, fullname in utils.file_list(self.detail_path):
        record = self.parse_detail_one(filename, fullname)
        # Skip files that produced no record; queuing a falsy value would
        # hand a non-row to the insert and inflate the reported count.
        if record:
            self.sqlList.append(record)
        # A truthy return means the pending rows were handled for the batch
        # size of 50 (presumably written to the db3 -- confirm in utils), so
        # they are counted and the buffer is reset.
        if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
            inserted += len(self.sqlList)
            utils.printf(progress % (self.provider, inserted))
            self.sqlList.clear()

    # Final call without a batch size for whatever is still buffered.
    utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
    inserted += len(self.sqlList)
    utils.printf(progress % (self.provider, inserted))
    self.conn.close()
    self.conn = None
    utils.msg2weixin('%s: 解析完成,成品文件为%s' %
                     (self.provider, self.template_file))
def parse_list(self, message):
    """Parse the downloaded list files into book records and article IDs.

    For each file under ``self.list_path``, ``self.parse_list_one`` returns
    a book record and a list of article IDs. Book records are inserted into
    the db3 file (``self.conn``) in batches; article IDs are inserted into
    the ``article`` table of the ``ydylcnbook`` MySQL database with
    ``stat`` 0. Afterwards ``self.conn`` is closed, a completion notice is
    sent and ``startdown_detail`` is signalled through
    ``self.senddistributefinish``.

    Args:
        message: Not used by this method.
    """
    stmt = (
        # 21 columns / 21 placeholders.
        'insert or ignore into modify_title_info_zt('
        'lngid, rawid, creator, title, title_alternative, title_series, '
        'cover, subject,identifier_pisbn, description, publisher, date, '
        'date_created, language, country, provider,provider_url, '
        'provider_id, type, medium, batch)'
        'values(?,?,?,?,?,?,?,?,?,?,'
        '?,?,?,?,?,?,?,?,?,?,'
        '?)'
    )
    sql = 'insert ignore into article(article_id,stat) Values(%s,%s)'
    progress = '%s: 插入 %d 条数据到db3'
    count = 0
    articlecnt = 0

    conn = utils.init_db('mysql', 'ydylcnbook', 4)
    try:
        self.predb3()
        self.sqlList.clear()
        for filename, fullname in utils.file_list(self.list_path):
            onemessage, bookdetaillist = self.parse_list_one(
                filename, fullname)
            if onemessage:
                self.sqlList.append(onemessage)
            # A truthy return means the pending rows were handled for the
            # batch size of 50 (presumably written to the db3 -- confirm in
            # utils), so they are counted and the buffer is reset.
            if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
                count += len(self.sqlList)
                utils.printf(progress % (self.provider, count))
                self.sqlList.clear()
            if bookdetaillist:
                # Every article ID of this book is stored with stat 0.
                pending = [(article_id, 0) for article_id in bookdetaillist]
                utils.parse_results_to_sql(conn, sql, pending)
                articlecnt += len(pending)
                utils.printf('%s: 插入 %d 个文章ID到数据库' %
                             (self.provider, articlecnt))
    finally:
        # The MySQL connection was previously never closed; release it even
        # when parsing fails part-way through.
        conn.close()

    # Final call without a batch size for whatever is still buffered.
    utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
    count += len(self.sqlList)
    utils.printf(progress % (self.provider, count))
    self.conn.close()
    self.conn = None
    utils.msg2weixin('%s: 解析完成,图书成品文件为%s' %
                     (self.provider, self.template_file))
    self.senddistributefinish('startdown_detail')
def parse_index(self, message):
    """Parse the downloaded index files and insert the records into the db3 file.

    Calls ``self.predb3(fname='pishuvideo')``, runs ``self.parse_index_one``
    on every file under ``self.index_path`` and inserts the results in
    batches. ``self.conn`` is closed at the end and a completion notice is
    sent.

    Args:
        message: Not used by this method.
    """
    self.predb3(fname='pishuvideo')
    self.sqlList.clear()
    # 16 columns / 16 placeholders (8 + 8).
    stmt = (
        'insert or ignore into modify_title_info_zt('
        'lngid, rawid, title, subject,description, publisher, date, '
        'date_created, language, country,provider,provider_url, '
        'provider_id,type, medium, batch)'
        'values(?,?,?,?,?,?,?,?, '
        '?,?,?,?,?,?,?,?)'
    )
    progress = '%s: 插入 %d 条数据到db3'
    inserted = 0
    for filename, fullname in utils.file_list(self.index_path):
        record = self.parse_index_one(filename, fullname)
        if record:
            self.sqlList.append(record)
        # A truthy return means the pending rows were handled for the batch
        # size of 50 (presumably written to the db3 -- confirm in utils), so
        # they are counted and the buffer is reset.
        if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
            inserted += len(self.sqlList)
            utils.printf(progress % (self.provider, inserted))
            self.sqlList.clear()

    # Final call without a batch size for whatever is still buffered.
    utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
    inserted += len(self.sqlList)
    utils.printf(progress % (self.provider, inserted))
    self.conn.close()
    self.conn = None
    utils.msg2weixin('%s: 解析完成,视频成品文件为%s' %
                     (self.provider, self.template_file))