Example #1
0
 def parse_detail(self, message):
     """Parse every downloaded detail file and load the rows into the db3 file.

     Journal metadata is first read from the ``journal`` table of the
     ``hepjournal`` MySQL database into ``self.dic`` (keyed by
     ``journal_id``). Each file under ``self.detail_path`` is then passed to
     ``self.parse_detail_one`` and every non-empty result is written to
     ``modify_title_info_zt`` through ``utils.parse_results_to_sql``.
     A completion notice is sent with ``utils.msg2weixin`` on success.

     Args:
         message: Not used by this method.
     """
     conn = utils.init_db('mysql', 'hepjournal', 4)
     try:
         cur = conn.cursor()
         try:
             cur.execute(
                 'select journal_id,journal_name,issn,eissn,cnno from journal')
             for journal_id, journal_name, issn, eissn, cnno in cur.fetchall():
                 self.dic[journal_id] = (journal_name, issn, eissn, cnno)
         finally:
             cur.close()
     finally:
         # Release the MySQL connection even when the lookup query fails.
         conn.close()
     # NOTE(review): self.conn is presumably (re)opened by predb3() - confirm.
     self.predb3()
     self.sqlList.clear()
     stmt = """insert or ignore into modify_title_info_zt(lngid, rawid, creator, title, volume, issue, page, beginpage,
     endpage, publisher, subject, date,creator_institution, date_created, source, identifier_pissn, identifier_eissn,
     identifier_cnno, description, identifier_doi, language, country, provider, provider_url, provider_id, type, medium,
     batch, gch)values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"""
     count = 0
     try:
         for filename, fullname in utils.file_list(self.detail_path):
             onemessage = self.parse_detail_one(filename, fullname)
             if onemessage:
                 self.sqlList.append(onemessage)
             # A truthy return is treated as "the pending batch was written";
             # the 50 is presumably the batch-size threshold - confirm in utils.
             if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
                 count += len(self.sqlList)
                 utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
                 self.sqlList.clear()
         # Flush whatever is left after the last full batch.
         utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
         count += len(self.sqlList)
         utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
     finally:
         # Close the db3 connection on failure as well as on success.
         self.conn.close()
         self.conn = None
     utils.msg2weixin('%s: 解析完成,成品文件为%s' %
                      (self.provider, self.template_file))
Example #2
0
 def parse_detail_meta(self, message):
     """Parse the detail files into the ``base_obj_meta_a`` db3 table.

     Journal metadata is read from the ``journal`` table of the
     ``aiaajournal`` MySQL database into ``self.dic`` (keyed by ``gch``).
     Each file under ``self.detail_path`` is passed to
     ``self.parse_detail_one`` with the ``'meta'`` flag and every non-empty
     result is written through ``utils.parse_results_to_sql``. A completion
     notice is sent with ``utils.msg2weixin`` on success.

     Args:
         message: Not used by this method.
     """
     conn = utils.init_db('mysql', 'aiaajournal', 2)
     try:
         cur = conn.cursor()
         try:
             cur.execute(
                 'select gch,journal_name,journal_name_en,pissn,eissn from journal')
             for gch, journal_name, journal_name_en, pissn, eissn in cur.fetchall():
                 self.dic[gch] = (journal_name, journal_name_en, pissn, eissn)
         finally:
             cur.close()
     finally:
         # Release the MySQL connection even when the lookup query fails.
         conn.close()
     # NOTE(review): self.conn is presumably (re)opened by predb3() - confirm.
     self.predb3('base_obj_meta_a_template_qk.db3',
                 'base_obj_meta_a_qk.aiaajournal')
     self.sqlList.clear()
     stmt = """insert into base_obj_meta_a (author,author_1st,organ,organ_1st,title,title_alt,keyword,pub_year,pub_date,
     vol,num,journal_raw_id,journal_name,journal_name_alt,page_info,begin_page,end_page,subject,is_oa,down_cnt,lngid,
     rawid,product,sub_db,
     provider,sub_db_id,source_type,provider_url,country,language,batch,down_date,publisher,issn,eissn,abstract,
     abstract_alt,doi,fund,ref_cnt,fulltext_type) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,
     ?,?,?,?,?,?,?,?,?,?,?)"""
     count = 0
     try:
         for filename, fullname in utils.file_list(self.detail_path):
             onemessage = self.parse_detail_one(filename, fullname, 'meta')
             if onemessage:
                 self.sqlList.append(onemessage)
             # A truthy return is treated as "the pending batch was written";
             # the 50 is presumably the batch-size threshold - confirm in utils.
             if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
                 count += len(self.sqlList)
                 utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
                 self.sqlList.clear()
         # Flush whatever is left after the last full batch.
         utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
         count += len(self.sqlList)
         utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
     finally:
         # Close the db3 connection on failure as well as on success.
         self.conn.close()
         self.conn = None
     utils.msg2weixin('%s: 解析完成,成品文件为%s' %
                      (self.provider, self.template_file))
 def parse_detail(self, message):
     """Parse the book detail files into ``modify_title_info_zt``.

     Publication years are read from the ``book`` table of the ``aiaabook``
     MySQL database into ``self.dic``, keyed by a DOI built as
     ``'10.2514/'`` plus the last path segment of each stored URL. Each file
     under ``self.detail_path`` is passed to ``self.parse_detail_one`` with
     the ``'zt'`` flag and every non-empty result is written through
     ``utils.parse_results_to_sql``. A completion notice is sent with
     ``utils.msg2weixin`` on success.

     Args:
         message: Not used by this method.
     """
     conn = utils.init_db('mysql', 'aiaabook', 2)
     try:
         cur = conn.cursor()
         try:
             cur.execute('select url,pub_year from book')
             for url, pub_year in cur.fetchall():
                 doi = '10.2514/' + url.split('/')[-1]
                 # Stored as the bare value: the original ``(pub_year)`` was
                 # never a tuple, so the parentheses are dropped for clarity.
                 self.dic[doi] = pub_year
         finally:
             cur.close()
     finally:
         # Release the MySQL connection even when the lookup query fails.
         conn.close()
     # NOTE(review): self.conn is presumably (re)opened by predb3() - confirm.
     self.predb3()
     self.sqlList.clear()
     stmt = """insert or ignore into modify_title_info_zt(lngid, rawid, creator, title, identifier_pisbn,
      identifier_eisbn, description, publisher,cover,title_series,
      date,date_created, price, language, country, provider, provider_url, identifier_doi, provider_id,
     type,medium, batch) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
     """
     count = 0
     try:
         for filename, fullname in utils.file_list(self.detail_path):
             onemessage = self.parse_detail_one(filename, fullname, 'zt')
             if onemessage:
                 self.sqlList.append(onemessage)
             # A truthy return is treated as "the pending batch was written";
             # the 50 is presumably the batch-size threshold - confirm in utils.
             if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
                 count += len(self.sqlList)
                 utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
                 self.sqlList.clear()
         # Flush whatever is left after the last full batch.
         utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
         count += len(self.sqlList)
         utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
     finally:
         # Close the db3 connection on failure as well as on success.
         self.conn.close()
         self.conn = None
     utils.msg2weixin('%s: 解析完成,成品文件为%s' %
                      (self.provider, self.template_file))
Example #4
0
 def startdown_list(self, message):
     """Queue a ``down_list`` job for every issue whose ``stat`` is 0.

     Pending issue URLs are read from the ``issue`` table of the ``science``
     MySQL database. When there are none, either a "nothing to update"
     notice is logged and sent (``self.index_path`` is empty) or a
     ``parse_list`` job is queued. Otherwise one ``down_list`` job is queued
     per URL together with the target ``.html`` file name under
     ``self.list_path``.

     Args:
         message: Not used by this method.
     """
     utils.printf('%s:开始下载列表页...' % self.provider)
     if not self.list_path:
         self.initpath()
     self.refreshproxypool()
     self.count = 0
     conn = utils.init_db('mysql', 'science')
     try:
         cur = conn.cursor()
         try:
             cur.execute('select url,stat from issue where stat=0')
             rows = cur.fetchall()
         finally:
             cur.close()
     finally:
         # The original never closed this connection; release it as soon as
         # the rows are in memory.
         conn.close()
     self.totalcount = len(rows)
     if self.totalcount == 0:
         if not os.listdir(self.index_path):
             utils.logerror('%s:没有新的issue不需要更新' % self.provider)
             utils.msg2weixin('%s:没有新的issue不需要更新' % self.provider)
         else:
             self.sendwork('parse_list')
         return
     for url, _ in rows:
         # The directory is named after the part of the URL before the
         # first dot.
         fdir = self.list_path + '/' + url.split('.')[0]
         # exist_ok avoids the check-then-create race of the original.
         os.makedirs(fdir, exist_ok=True)
         parts = url.split('/')
         fname = fdir + '/' + parts[-2] + '_' + parts[-1] + '.html'
         self.sendwork('down_list', ('http://' + url, fname))
Example #5
0
 def upload2HDFS(self, message):
     """Merge the detail files and push the merged output to HDFS.

     ``utils.all_2_one`` is run from ``self.detail_path`` into
     ``self.merge_path``, then ``utils.ProcAll`` is called with
     ``self.merge_path`` and ``self.hdfs_path``. A truthy result sends a
     success notice; a falsy one is logged as an error and also sent.

     Args:
         message: Not used by this method.
     """
     utils.all_2_one(self.detail_path, self.merge_path)
     if not utils.ProcAll(self.merge_path, self.hdfs_path):
         # Same text goes to the error log and to the notification.
         failure = '%s:bigjson上传至%s出现问题' % (self.provider, self.hdfs_path)
         utils.logerror(failure)
         utils.msg2weixin(failure)
         return
     success = '%s:bigjson成功上传至%s' % (self.provider, self.hdfs_path)
     utils.msg2weixin(success)
 def parse_detail(self, message):
     """Parse the detail files into ``modify_title_info_zt``.

     Each file under ``self.detail_path`` is passed to
     ``self.parse_detail_one`` and every non-empty result is written through
     ``utils.parse_results_to_sql``. A completion notice is sent with
     ``utils.msg2weixin`` on success.

     Args:
         message: Not used by this method.
     """
     # NOTE(review): self.conn is presumably (re)opened by predb3() - confirm.
     self.predb3()
     self.sqlList.clear()
     stmt = """insert or ignore into modify_title_info_zt(lngid, rawid, creator, title, title_series, cover, page, publisher, subject, date, creator_bio, date_created,
     identifier_pisbn, identifier_eisbn, description, identifier_doi, language, country, provider, provider_url,
     provider_id, type, medium, batch)values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"""
     count = 0
     try:
         for filename, fullname in utils.file_list(self.detail_path):
             onemessage = self.parse_detail_one(filename, fullname)
             # Skip empty parse results, as the sibling parse_* methods do,
             # so a failed parse cannot end up in the insert batch.
             if onemessage:
                 self.sqlList.append(onemessage)
             # A truthy return is treated as "the pending batch was written";
             # the 50 is presumably the batch-size threshold - confirm in utils.
             if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
                 count += len(self.sqlList)
                 utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
                 self.sqlList.clear()
         # Flush whatever is left after the last full batch.
         utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
         count += len(self.sqlList)
         utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
     finally:
         # Close the db3 connection on failure as well as on success.
         self.conn.close()
         self.conn = None
     utils.msg2weixin('%s: 解析完成,成品文件为%s' %
                      (self.provider, self.template_file))
 def parse_list(self, message):
     """Parse the list files into the db3 file and register article IDs.

     Each file under ``self.list_path`` is passed to
     ``self.parse_list_one``, which returns a record and a list of article
     IDs. Non-empty records are written to ``modify_title_info_zt`` via
     ``self.conn``; each article ID is inserted with ``stat`` 0 into the
     ``article`` table of the ``ydylcnbook`` MySQL database. On success a
     completion notice is sent and ``startdown_detail`` is announced through
     ``self.senddistributefinish``.

     Args:
         message: Not used by this method.
     """
     stmt = """insert or ignore into modify_title_info_zt(lngid, rawid, creator, title, title_alternative,
      title_series, cover, subject,identifier_pisbn, description, publisher, date, date_created, language, country,
     provider,provider_url, provider_id, type, medium, batch)values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"""
     sql = 'insert ignore into article(article_id,stat) Values(%s,%s)'
     conn = utils.init_db('mysql', 'ydylcnbook', 4)
     try:
         # NOTE(review): self.conn is presumably (re)opened by predb3() -
         # confirm.
         self.predb3()
         self.sqlList.clear()
         count = 0
         articlecnt = 0
         try:
             for filename, fullname in utils.file_list(self.list_path):
                 onemessage, bookdetaillist = self.parse_list_one(
                     filename, fullname)
                 if onemessage:
                     self.sqlList.append(onemessage)
                 # A truthy return is treated as "the pending batch was
                 # written"; the 50 is presumably the batch-size threshold -
                 # confirm in utils.
                 if utils.parse_results_to_sql(
                         self.conn, stmt, self.sqlList, 50):
                     count += len(self.sqlList)
                     utils.printf('%s: 插入 %d 条数据到db3' %
                                  (self.provider, count))
                     self.sqlList.clear()
                 if bookdetaillist:
                     # Article IDs are written per file, each with stat 0.
                     result = [(article_id, 0)
                               for article_id in bookdetaillist]
                     utils.parse_results_to_sql(conn, sql, result)
                     articlecnt += len(result)
                     utils.printf('%s: 插入 %d 个文章ID到数据库' %
                                  (self.provider, articlecnt))
             # Flush whatever is left after the last full batch.
             utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
             count += len(self.sqlList)
             utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
         finally:
             # Close the db3 connection on failure as well as on success.
             self.conn.close()
             self.conn = None
     finally:
         # The original never closed the MySQL connection; release it here.
         conn.close()
     utils.msg2weixin('%s: 解析完成,图书成品文件为%s' %
                      (self.provider, self.template_file))
     self.senddistributefinish('startdown_detail')
Example #8
0
 def parse_index(self, message):
     """Parse the index files into ``modify_title_info_zt``.

     Each file under ``self.index_path`` is passed to
     ``self.parse_index_one`` and every non-empty result is written through
     ``utils.parse_results_to_sql``. A completion notice is sent with
     ``utils.msg2weixin`` on success.

     Args:
         message: Not used by this method.
     """
     # NOTE(review): self.conn is presumably (re)opened by predb3() - confirm.
     self.predb3(fname='pishuvideo')
     self.sqlList.clear()
     stmt = """insert or ignore into modify_title_info_zt(lngid, rawid, title, subject,description, publisher, date,
      date_created, language, country,provider,provider_url, provider_id,type, medium, batch)values(?,?,?,?,?,?,?,?,
      ?,?,?,?,?,?,?,?)"""
     # The unused ``sql``, ``articlecnt`` and ``result`` locals of the
     # original were never read in this method and have been removed.
     count = 0
     try:
         for filename, fullname in utils.file_list(self.index_path):
             onemessage = self.parse_index_one(filename, fullname)
             if onemessage:
                 self.sqlList.append(onemessage)
             # A truthy return is treated as "the pending batch was written";
             # the 50 is presumably the batch-size threshold - confirm in utils.
             if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
                 count += len(self.sqlList)
                 utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
                 self.sqlList.clear()
         # Flush whatever is left after the last full batch.
         utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
         count += len(self.sqlList)
         utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
     finally:
         # Close the db3 connection on failure as well as on success.
         self.conn.close()
         self.conn = None
     utils.msg2weixin('%s: 解析完成,视频成品文件为%s' % (self.provider, self.template_file))