def init_config():
    global args
    init_db()
    arg = ArgumentParser(description='CoreMail Upload Vul')
    arg.add_argument("-u", "--url", help="Target URL; Example: http://ip:port")
    arg.add_argument("-f", "--file", help="Target list file; Example: url.txt")
    args = arg.parse_args()
def main():
    while True:
        start()
        decision = input('Write your decision: ')
        if not decision.isdigit():
            print('Wrong input')
            continue
        decision = int(decision)
        if not decision:
            print('See you next time')
            break
        elif decision == 1:
            show_all()
        elif decision == 2:
            write_new_record()
        elif decision == 3:
            edit_record()
        elif decision == 4:
            remove_record()
        elif decision == 5:
            search_by_id()
        elif decision == 6:
            filter_by_keyword()
        elif decision == 9:
            init_db()
def process_list(self, message):
    self.count = self.count + 1
    sql = "update issue set stat=1 where url='{}'".format(message)
    self.sqlList.append(sql)
    if self.count % 40 == 1:
        utils.printf('%s:下载成功 %s 页' % (self.provider, self.count))
        conn = utils.init_db('mysql', 'aiaajournal', 2)
        cur = conn.cursor()
        for sql in self.sqlList:
            cur.execute(sql)
        conn.commit()
        conn.close()
        self.sqlList.clear()
    if self.count % 100 == 0:
        self.refreshproxypool()
    if self.count == self.totalcount:
        conn = utils.init_db('mysql', 'aiaajournal', 2)
        cur = conn.cursor()
        for sql in self.sqlList:
            cur.execute(sql)
        conn.commit()
        conn.close()
        self.sqlList.clear()
        utils.printf('downloadlist finish')
        self.sendwork('parse_list')
def startdown_detail(self, message):
    if not self.detail_path:
        self.initpath()
    self.sqlList.clear()
    self.refreshproxypool()
    self.count = 0
    conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
    cur = conn.cursor()
    cur.execute('select article_id,journal_id from article where stat=0 and failcount<3')
    rows = cur.fetchall()
    self.totalcount = len(rows)
    if self.totalcount == 0:
        utils.printf('%s:下载详情页完成' % self.provider)
        # self.sendwork('parse_detail_meta')
        self.sendwork('parse_detail')
        # self.sendwork('down_cover')
        return
    messagelist = []
    for article_id, journal_id in rows:
        fdir = '%s/%s' % (self.detail_path, journal_id)
        if not os.path.exists(fdir):
            os.makedirs(fdir)
        messagelist.append((article_id, journal_id))
        if len(messagelist) == 30:
            blist = messagelist.copy()
            self.sendwork('down_detail', blist)
            # utils.printf('a' + len(messagelist))
            # utils.printf(messagelist)
            messagelist.clear()
    if len(messagelist) > 0:
        self.sendwork('down_detail', messagelist)
def parse_list(self, message):
    utils.printf('%s:解析列表页开始...' % self.provider)
    conn = utils.init_db('mysql', 'apsjournal')
    result = []
    stmt = 'insert ignore into article(url,vol,issue) Values(%s,%s,%s)'
    cnt = 0
    for filename, fullname in utils.file_list(self.list_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        vol = filename.split('_')[0]
        issue = filename.split('_')[-1].replace('.html', '')
        soup = BeautifulSoup(text, 'lxml')
        aTags = soup.select('div.large-9.columns > h5 > a')
        for aTag in aTags:
            url = aTag.get('href')
            if not url.startswith('/'):
                continue
            url = 'https://journals.aps.org' + url
            result.append((url, vol, issue))
        if utils.parse_results_to_sql(conn, stmt, result, 1000):
            cnt += len(result)
            result.clear()
            utils.printf(cnt)
    utils.parse_results_to_sql(conn, stmt, result)
    cnt += len(result)
    utils.printf(cnt)
    conn.close()
    utils.printf('%s:解析列表页完成...' % self.provider)
    self.senddistributefinish('startdown_detail')
def parse_html(self, message):
    utils.printf('%s:解析起始页开始...' % self.provider)
    conn = utils.init_db('mysql', 'bioonejournal')
    result = []
    stmt = 'insert ignore into journal(url,cover_url) Values(%s,%s) on DUPLICATE key UPDATE cover_url=%s'
    cnt = 0
    for filename, fullname in utils.file_list(self.html_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        soup = BeautifulSoup(text, 'lxml')
        aTaglist = soup.select('div.journal.BrowseTitleAll > a')
        for aTag in aTaglist:
            url = aTag.get('href')
            if url == "/journals/":
                continue
            if url.startswith('/ebooks'):
                continue
            cover_url = aTag.img.get('src')
            result.append((url, cover_url, cover_url))
    utils.parse_results_to_sql(conn, stmt, result)
    cnt += len(result)
    result.clear()
    utils.printf(cnt)
    conn.close()
    utils.printf('%s:解析起始页完成...' % self.provider)
    self.senddistributefinish('startdown_indexlist')
def parse_detail(self, message):
    conn = utils.init_db('mysql', 'aiaabook', 2)
    cur = conn.cursor()
    cur.execute('select url,pub_year from book')
    rows = cur.fetchall()
    for url, pub_year in rows:
        doi = '10.2514/' + url.split('/')[-1]
        self.dic[doi] = (pub_year)
    cur.close()
    conn.close()
    self.predb3()
    self.sqlList.clear()
    stmt = """insert or ignore into modify_title_info_zt(lngid, rawid, creator, title, identifier_pisbn,
    identifier_eisbn, description, publisher, cover, title_series, date, date_created, price, language, country,
    provider, provider_url, identifier_doi, provider_id, type, medium, batch)
    values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"""
    count = 0
    for filename, fullname in utils.file_list(self.detail_path):
        onemessage = self.parse_detail_one(filename, fullname, 'zt')
        # print(onemessage)
        if onemessage:
            self.sqlList.append(onemessage)
        if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
            count += len(self.sqlList)
            utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
            self.sqlList.clear()
    utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
    count += len(self.sqlList)
    utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
    self.conn.close()
    self.conn = None
    utils.msg2weixin('%s: 解析完成,成品文件为%s' % (self.provider, self.template_file))
def parse_indexlist(self, message):
    try:
        utils.printf('%s:解析期索引页开始...' % self.provider)
        conn = utils.init_db('mysql', 'bioonejournal')
        self.sqlList.clear()
        cnt = 0
        cur = conn.cursor()
        path = '%s/%s' % (self.datepath, 'indexlist')
        for filename, fullname in utils.file_list(path):
            with open(fullname, encoding='utf8') as f:
                text = f.read()
            soup = BeautifulSoup(text, 'lxml')
            aTags = soup.find_all('a', class_='IssueByYearInnerText')
            for aTag in aTags:
                url = aTag.get('href').replace('https://bioone.org', '')
                self.sqlList.append(
                    "insert ignore into issuelist(url,year) Values('%s','%s')" % (url, url.split('/')[-1]))
        cnt += len(self.sqlList)
        for sql in self.sqlList:
            cur.execute(sql)
        conn.commit()
        self.sqlList.clear()
        utils.printf(cnt)
        cur.close()
        conn.close()
        utils.printf('%s:解析索引页完成...' % self.provider)
        # self.sendwork('down_cover')
        self.senddistributefinish('startdown_index')
    except:
        exMsg = '* ' + traceback.format_exc()
        print(exMsg)
        utils.logerror(exMsg)
def parse_list(self, message):
    utils.printf('%s:解析列表页开始...' % self.provider)
    conn = utils.init_db('mysql', 'ascebook')
    result = []
    stmt = 'insert ignore into book(url,cover_url) Values(%s,%s)'
    cnt = 0
    for filename, fullname in utils.file_list(self.list_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        soup = BeautifulSoup(text, 'lxml')
        divlist = soup.select('#frmSearchResults > div > div.listBody > div > div.leftSide')
        for divTag in divlist:
            url = divTag.a.get('href')
            isbn = url.split('/')[-1]
            cover_url = ''
            if not isbn.startswith('978'):
                continue
            coverTag = divTag.a.select_one('img')
            if coverTag:
                cover_url = coverTag.get('src')
            result.append((url, cover_url))
    utils.parse_results_to_sql(conn, stmt, result)
    cnt += len(result)
    utils.printf(cnt)
    conn.close()
    utils.printf('%s:解析列表页完成...' % self.provider)
    self.senddistributefinish('startdown_detail')
def startdown_list(self, message):
    utils.printf('%s:开始下载列表页...' % self.provider)
    if not self.list_path:
        self.initpath()
    self.refreshproxypool()
    self.count = 0
    conn = utils.init_db('mysql', 'science')
    cur = conn.cursor()
    cur.execute('select url,stat from issue where stat=0')
    rows = cur.fetchall()
    self.totalcount = len(rows)
    if self.totalcount == 0:
        if len(os.listdir(self.index_path)) == 0:
            utils.logerror('%s:没有新的issue不需要更新' % self.provider)
            utils.msg2weixin('%s:没有新的issue不需要更新' % self.provider)
        else:
            self.sendwork('parse_list')
    for url, _ in rows:
        fdir = self.list_path + '/' + url.split('.')[0]
        if not os.path.exists(fdir):
            os.makedirs(fdir)
        fname = fdir + '/' + url.split('/')[-2] + '_' + url.split('/')[-1] + '.html'
        url = 'http://' + url
        self.sendwork('down_list', (url, fname))
def parse_html(self, message):
    utils.printf('%s:解析起始页开始...' % self.provider)
    conn = utils.init_db('mysql', 'aiaabook', 2)
    result = []
    stmt = 'insert ignore into book(book_name,url,pub_year,cover_url) Values(%s,%s,%s,%s)'
    cnt = 0
    for filename, fullname in utils.file_list(self.html_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        try:
            sel = Selector(text=text)
            for liTag in sel.xpath('//li[@class="search-item clearfix"]'):
                book_name = liTag.xpath('./div/h4/a/text()').extract_first().strip()
                url = liTag.xpath('./div/h4/a/@href').extract_first()
                pub_year = liTag.xpath(
                    './/div[@class="search-item__data-group__field meta__date"]/text()').extract_first()
                cover_url = liTag.xpath('./div/a/img/@src').extract_first().strip()
                result.append((book_name, url, pub_year, cover_url))
            utils.printf(len(result))
        except:
            exMsg = '* ' + traceback.format_exc()
            print(exMsg)
            utils.logerror(exMsg)
            utils.logerror(fullname)
            return
    utils.parse_results_to_sql(conn, stmt, result)
    cnt += len(result)
    utils.printf(cnt)
    conn.close()
    utils.printf('%s:解析起始页完成...' % self.provider)
    self.senddistributefinish('startdown_detail')
def down_detail(self):
    utils.printf("下载详情页开始...")
    super().down_detail()
    conn = utils.init_db('mysql', 'cqjtu_kingbook')
    cur = conn.cursor()
    while True:
        cur.execute('select bookid,stat from book where stat=0 limit 10000')
        rows = cur.fetchall()
        conn.commit()
        if len(rows) == 0:
            break
        for bookid, _ in rows:
            print(bookid)
            url = 'http://123.56.143.23/kingbookwaiwen/book/info.aspx?id={}'.format(bookid)
            dirname = '%s/%s' % (self.detail_path, bookid[:3])
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            filename = '%s/%s.html' % (dirname, bookid)
            if os.path.exists(filename):
                sql = 'update book set stat=1 where bookid="{}"'.format(bookid)
                cur.execute(sql)
                conn.commit()
                continue
            resp = utils.get_html(url, proxies=self.proxy)
            if not resp:
                continue
            with open(filename, mode='w', encoding='utf8') as f:
                f.write(resp.content.decode())
            sql = 'update book set stat=1 where bookid="{}"'.format(bookid)
            cur.execute(sql)
            conn.commit()
            utils.printf("下载", bookid, "成功...")
def parse_index(self, message):
    workdir = message
    try:
        utils.printf('%s:解析索引页开始...' % self.provider)
        conn = utils.init_db('mysql', 'apsjournal')
        result = []
        stmt = 'insert ignore into issue(url,year) Values(%s,%s) on DUPLICATE key UPDATE year=%s'
        cnt = 0
        for filename, fullname in utils.file_list(workdir):
            with open(fullname, encoding='utf8') as f:
                text = f.read()
            soup = BeautifulSoup(text, 'lxml')
            liTags = soup.select('div.volume-issue-list > ul > li')
            for liTag in liTags:
                yeartext = liTag.get_text().strip()
                year = re.sub(r'.*?(\d{4}) \(.*?\)', r'\1', yeartext)
                url = 'https://journals.aps.org' + liTag.b.a.get('href')
                result.append((url, year, year))
            if utils.parse_results_to_sql(conn, stmt, result, 1000):
                cnt += len(result)
                result.clear()
                utils.printf(cnt)
        utils.parse_results_to_sql(conn, stmt, result)
        cnt += len(result)
        utils.printf(cnt)
        conn.close()
        utils.printf('%s:解析索引页完成...' % self.provider)
        self.senddistributefinish('startdown_list')
    except:
        exMsg = '* ' + traceback.format_exc()
        print(exMsg)
        utils.logerror(exMsg)
def get_issuelist(self, message):
    utils.printf('%s:开始获取期列表页...' % self.provider)
    if not self.list_path:
        self.initpath()
    self.refreshproxypool()
    self.sqlList.clear()
    self.count = 0
    conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
    cur = conn.cursor()
    cur.execute('select journal_id,journal_name from journal')
    rows = cur.fetchall()
    utils.printf(rows)
    for journal_id, name in rows:
        text = None
        while True:
            url = 'http://www.engineering.org.cn/default/journal/CurrentIssue/AllVolumeId?journalId=%s' % journal_id
            utils.printf(url)
            resp = self.gethtml(url, '"success":true', None)
            if resp:
                text = resp.content.decode('utf8')
                break
        dic = json.loads(text)
        index = 1
        for issue_id in dic['resultValue']:
            sql = 'insert into issue(journal_id,issue_id,issue_index) Values(%s,%s,%s) on DUPLICATE key UPDATE issue_index=%s' % (
                journal_id, issue_id, index, index)
            cur.execute(sql)
            index += 1
        conn.commit()
        utils.printf('%s:插入%s期' % (self.provider, index))
    conn.close()
    self.senddistributefinish('startdown_list')
def parse_list(self, message):
    utils.printf('%s:解析列表页开始...' % self.provider)
    conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
    result = []
    stmt = 'insert ignore into article(article_id,journal_id) Values(%s,%s)'
    cnt = 0
    for filename, fullname in utils.file_list(self.list_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        journal_id = fullname.split('\\')[-2]
        dicitem = json.loads(text)['resultValue']
        for lanmu in dicitem.keys():
            for fenlei in dicitem[lanmu].keys():
                for dicdetail in dicitem[lanmu][fenlei]:
                    article_id = dicdetail['id']
                    result.append((article_id, journal_id))
        if utils.parse_results_to_sql(conn, stmt, result, 1000):
            cnt += len(result)
            result.clear()
            utils.printf(cnt)
    utils.parse_results_to_sql(conn, stmt, result)
    cnt += len(result)
    utils.printf(cnt)
    conn.close()
    utils.printf('%s:解析列表页完成...' % self.provider)
    self.senddistributefinish('startdown_detail')
    self.sendwork('down_cover')
def parse_index(self, message):
    try:
        utils.printf('%s:解析索引页开始...' % self.provider)
        conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
        self.sqlList.clear()
        cur = conn.cursor()
        for filename, fullname in utils.file_list(self.index_path):
            with open(fullname, encoding='utf8') as f:
                text = f.read()
            dic = json.loads(text)
            gch = filename.replace('.json', '')
            dicitem = dic['resultValue']
            issn = dicitem['issnNm']
            cnno = dicitem['cnNm']
            sql = 'update journal set issn="%s",cnno="%s" where journal_id="%s"' % (issn, cnno, gch)
            cur.execute(sql)
        conn.commit()
        cur.close()
        conn.close()
        utils.printf('%s:解析索引页完成...' % self.provider)
        # self.sendwork('down_cover')
        self.senddistributefinish('get_issuelist')
    except:
        exMsg = '* ' + traceback.format_exc()
        print(exMsg)
        utils.logerror(exMsg)
def startdown_list(self, message):
    utils.printf('%s:开始下载列表页...' % self.provider)
    if not self.list_path:
        self.initpath()
    self.sqlList.clear()
    self.refreshproxypool()
    self.count = 0
    conn = utils.init_db('mysql', 'cambridgejournal')
    cur = conn.cursor()
    cur.execute('select url,stat from issue where stat=0')
    rows = cur.fetchall()
    self.totalcount = len(rows)
    if self.totalcount == 0:
        if len(os.listdir(self.list_path)) == 0:
            utils.logerror('%s:没有新的issue不需要更新' % self.provider)
        else:
            # self.sendwork('down_cover')
            self.sendwork('parse_list')
        return
    self.refreshsession()
    for url, _ in rows:
        fdir = self.list_path + '/' + url.split('/')[-3]
        if not os.path.exists(fdir):
            os.makedirs(fdir)
        flast = url.split('/')[-1]
        if flast.find('?pageNum=') > 0:
            flast = flast.split('?')[0] + '_' + flast.split('=')[-1]
        fname = fdir + '/' + flast + '.html'
        self.sendwork('down_list', (url, fname))
def parse_list(self, message):
    utils.printf('%s:解析列表页开始...' % self.provider)
    conn = utils.init_db('mysql', 'hepjournal', 4)
    result = []
    stmt = 'insert ignore into article(url,journal_id) Values(%s,%s)'
    cnt = 0
    for filename, fullname in utils.file_list(self.list_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        journal_id = filename.split('_')[0]
        sel = Selector(text=text)
        for aTag in sel.xpath('//a[@class="txt_biaoti"]'):
            url = aTag.xpath('./@href').extract_first()
            result.append((url, journal_id))
        if utils.parse_results_to_sql(conn, stmt, result, 1000):
            cnt += len(result)
            result.clear()
            utils.printf(cnt)
    utils.parse_results_to_sql(conn, stmt, result)
    cnt += len(result)
    utils.printf(cnt)
    conn.close()
    utils.printf('%s:解析列表页完成...' % self.provider)
    self.senddistributefinish('startdown_detail')
    self.sendwork('down_cover')
def startdown_list(self, message):
    utils.printf('%s:开始下载列表页...' % self.provider)
    if not self.list_path:
        self.initpath()
    self.refreshproxypool()
    self.sqlList.clear()
    self.count = 0
    conn = utils.init_db('mysql', 'hepjournal', 4)
    cur = conn.cursor()
    cur.execute('select url,journal_id from issue where stat=0')
    rows = cur.fetchall()
    self.totalcount = len(rows)
    if self.totalcount == 0:
        if len(os.listdir(self.list_path)) == 0:
            utils.logerror('%s:没有新的issue不需要更新' % self.provider)
        else:
            # self.sendwork('down_cover')
            self.sendwork('parse_list')
    for url, journal_id in rows:
        fdir = self.list_path + '/' + journal_id
        if not os.path.exists(fdir):
            os.makedirs(fdir)
        fname = fdir + '/' + journal_id + '_' + url.split('/')[-2] + '_' + url.split('/')[-1].replace('.shtml', '.html')
        self.sendwork('down_list', (url, fname))
def parse_index(self, message):
    try:
        utils.printf('%s:解析索引页开始...' % self.provider)
        conn = utils.init_db('mysql', 'aiaajournal', 2)
        result = []
        stmt = 'insert ignore into issue(url,stat) Values(%s,%s)'
        cnt = 0
        for filename, fullname in utils.file_list(self.index_path):
            with open(fullname, encoding='utf8') as f:
                text = f.read()
            sel = Selector(text=text)
            for aTag in sel.xpath('//a[@class="loi__issue__vol"]'):
                url = aTag.xpath('./@href').extract_first()
                if url.endswith('/0/0'):
                    continue
                result.append(('https://arc.aiaa.org' + url, 0))
            if utils.parse_results_to_sql(conn, stmt, result, 200):
                cnt += len(result)
                result.clear()
                utils.printf(cnt)
        utils.parse_results_to_sql(conn, stmt, result)
        cnt += len(result)
        utils.printf(cnt)
        conn.close()
        utils.printf('%s:解析索引页完成...' % self.provider)
        self.senddistributefinish('startdown_list')
    except:
        exMsg = '* ' + traceback.format_exc()
        print(exMsg)
        utils.logerror(exMsg)
def parse_list(self, message):
    utils.printf('%s:解析起始页开始...' % self.provider)
    conn = utils.init_db('mysql', 'aiaajournal', 2)
    result = []
    stmt = 'insert ignore into article(id,url,vol,stat,failcount) Values(%s,%s,%s,%s,%s)'
    cnt = 0
    for filename, fullname in utils.file_list(self.list_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        sel = Selector(text=text)
        for href in sel.xpath('//h5[@class="issue-item__title"]/a/@href'):
            url = href.extract().replace('/doi/', '/doi/abs/').strip()
            id = fullname.split('\\')[-2] + '_' + url.split('/')[-1]
            vol = filename.split('_')[0]
            print(id, url)
            result.append((id, url, vol, 0, 0))
        if utils.parse_results_to_sql(conn, stmt, result, 200):
            cnt += len(result)
            utils.printf('%s解析%s条数据到数据库' % (self.provider, cnt))
            result.clear()
    cnt += len(result)
    utils.parse_results_to_sql(conn, stmt, result)
    utils.printf('%s解析%s条数据到数据库' % (self.provider, cnt))
    utils.printf('%s解析列表页完成' % self.provider)
    self.senddistributefinish('startdown_detail')
def run(self):
    conn = utils.init_db('mysql', 'aipjournal')
    cur = conn.cursor()
    sql = "select url,stat from issue where stat=0 limit 1000;"
    time_last = time.time()
    cnt = 0
    while True:
        if url_queue.empty():
            cur.execute(sql)
            rows = cur.fetchall()
            conn.commit()
            if rows:
                for row in rows:
                    url_queue.put(row)
            elif sql_queue.empty():
                break
        time_now = time.time()
        if (sql_queue.qsize() > 100) or (time_now - time_last > 60):
            num = sql_queue.qsize()
            while num > 0:
                url, flag = sql_queue.get()
                cur.execute("update issue set stat={} where url='{}'".format(flag, url))
                cnt += 1
                num -= 1
            conn.commit()
            utils.printf('succeeded:%d' % (cnt))
            time_last = time.time()
        time.sleep(1)
def parse_detail_meta(self, message):
    conn = utils.init_db('mysql', 'aiaajournal', 2)
    cur = conn.cursor()
    cur.execute('select gch,journal_name,journal_name_en,pissn,eissn from journal')
    rows = cur.fetchall()
    for gch, journal_name, journal_name_en, pissn, eissn in rows:
        self.dic[gch] = (journal_name, journal_name_en, pissn, eissn)
    cur.close()
    conn.close()
    self.predb3('base_obj_meta_a_template_qk.db3', 'base_obj_meta_a_qk.aiaajournal')
    self.sqlList.clear()
    stmt = """insert into base_obj_meta_a (author,author_1st,organ,organ_1st,title,title_alt,keyword,pub_year,pub_date,
    vol,num,journal_raw_id,journal_name,journal_name_alt,page_info,begin_page,end_page,subject,is_oa,down_cnt,lngid,
    rawid,product,sub_db,provider,sub_db_id,source_type,provider_url,country,language,batch,down_date,publisher,issn,eissn,abstract,
    abstract_alt,doi,fund,ref_cnt,fulltext_type) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,
    ?,?,?,?,?,?,?,?,?,?,?)"""
    count = 0
    for filename, fullname in utils.file_list(self.detail_path):
        onemessage = self.parse_detail_one(filename, fullname, 'meta')
        if onemessage:
            self.sqlList.append(onemessage)
        if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
            count += len(self.sqlList)
            utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
            self.sqlList.clear()
    utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
    count += len(self.sqlList)
    utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
    self.conn.close()
    self.conn = None
    utils.msg2weixin('%s: 解析完成,成品文件为%s' % (self.provider, self.template_file))
def startdown_list(self, message):
    utils.printf('%s:开始下载列表页...' % self.provider)
    if not self.list_path:
        self.initpath()
    self.refreshproxypool()
    self.sqlList.clear()
    self.count = 0
    conn = utils.init_db('mysql', 'aiaajournal', 2)
    cur = conn.cursor()
    cur.execute('select url,stat from issue where stat=0')
    rows = cur.fetchall()
    self.totalcount = len(rows)
    if self.totalcount == 0:
        if len(os.listdir(self.list_path)) == 0:
            utils.logerror('%s:没有新的issue不需要更新' % self.provider)
        else:
            self.sendwork('parse_list')
    for url, _ in rows:
        urlsp = url.split('/')
        base_name = '%s_%s.html' % (urlsp[-2], urlsp[-1])
        fdir = '%s/%s' % (self.list_path, urlsp[-3])
        fname = '%s/%s' % (fdir, base_name)
        if not os.path.exists(fdir):
            os.makedirs(fdir)
        self.sendwork('down_list', (url, fname))
def parse_detail(self, message):
    conn = utils.init_db('mysql', 'hepjournal', 4)
    cur = conn.cursor()
    cur.execute('select journal_id,journal_name,issn,eissn,cnno from journal')
    rows = cur.fetchall()
    for journal_id, journal_name, issn, eissn, cnno in rows:
        self.dic[journal_id] = (journal_name, issn, eissn, cnno)
    cur.close()
    conn.close()
    self.predb3()
    self.sqlList.clear()
    stmt = """insert or ignore into modify_title_info_zt(lngid, rawid, creator, title, volume, issue, page,
    beginpage, endpage, publisher, subject, date, creator_institution, date_created, source, identifier_pissn,
    identifier_eissn, identifier_cnno, description, identifier_doi, language, country, provider, provider_url,
    provider_id, type, medium, batch, gch) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"""
    count = 0
    for filename, fullname in utils.file_list(self.detail_path):
        onemessage = self.parse_detail_one(filename, fullname)
        if onemessage:
            self.sqlList.append(onemessage)
        if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
            count += len(self.sqlList)
            utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
            self.sqlList.clear()
    utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
    count += len(self.sqlList)
    utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
    self.conn.close()
    self.conn = None
    utils.msg2weixin('%s: 解析完成,成品文件为%s' % (self.provider, self.template_file))
def parse_html(self):
    utils.printf('%s:解析起始页开始...' % self.provider)
    conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
    result = []
    stmt = 'insert ignore into journal(journal_id,journal_name,cover_url) Values(%s,%s,%s)'
    cnt = 0
    for filename, fullname in utils.file_list(self.html_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        try:
            dic = json.loads(text)
            for dicitem in dic['resultValue']:
                dicitem = json.loads(dicitem)
                gch = dicitem['id']
                name = dicitem['name']
                cover_url = dicitem['volumeImg']
                if cover_url == '':
                    cover_url = dicitem['journalImg']
                print(gch, name, cover_url)
                result.append((gch, name, cover_url))
        except:
            exMsg = '* ' + traceback.format_exc()
            print(exMsg)
            utils.logerror(exMsg)
    utils.parse_results_to_sql(conn, stmt, result)
    cnt += len(result)
    utils.printf(cnt)
    conn.close()
    utils.printf('%s:解析起始页完成...' % self.provider)
    self.senddistributefinish('startdown_index')
def parse_index(self, message):
    try:
        utils.printf('%s:解析索引页开始...' % self.provider)
        conn = utils.init_db('mysql', 'science')
        result = []
        stmt = 'insert ignore into issue(url,stat) Values(%s,%s)'
        cnt = 0
        for filename, fullname in utils.file_list(self.index_path):
            urlf = '{}.sciencemag.org'.format(filename.split('_')[0])
            with open(fullname, encoding='utf8') as f:
                text = f.read()
            soup = BeautifulSoup(text, 'lxml')
            divTags = soup.find_all(
                'div',
                class_='highwire-cite highwire-cite-highwire-issue highwire-citation-jnl-sci-issue-archive clearfix')
            for divTag in divTags:
                url = urlf + divTag.a.get('href')
                result.append((url, 0))
            if utils.parse_results_to_sql(conn, stmt, result, 1000):
                cnt += len(result)
                result.clear()
                utils.printf(cnt)
        utils.parse_results_to_sql(conn, stmt, result)
        cnt += len(result)
        utils.printf(cnt)
        conn.close()
        utils.printf('%s:解析索引页完成...' % self.provider)
        self.senddistributefinish('startdown_list')
    except:
        exMsg = '* ' + traceback.format_exc()
        print(exMsg)
        utils.logerror(exMsg)
def startdown_list(self, message):
    utils.printf('%s:开始下载列表页...' % self.provider)
    if not self.list_path:
        self.initpath()
    self.refreshproxypool()
    self.count = 0
    conn = utils.init_db('mysql', 'apsjournal')
    cur = conn.cursor()
    current_year = time.strftime('%Y')
    cur.execute("select url,stat from issue where stat=0 or year=%s or year=%s" %
                (current_year, int(current_year) - 1))
    rows = cur.fetchall()
    self.totalcount = len(rows)
    if self.totalcount == 0:
        if len(os.listdir(self.list_path)) == 0:
            utils.logerror('%s:没有新的issue不需要更新' % self.provider)
        else:
            self.sendwork('parse_list')
    for url, _ in rows:
        fdir = self.list_path + '/' + url.split('/')[-4]
        if not os.path.exists(fdir):
            os.makedirs(fdir)
        fname = fdir + '/' + url.split('/')[-2] + '_' + url.split('/')[-1] + '.html'
        self.sendwork('down_list', (url, fname))
def down_cover(self, message):
    utils.printf('开始下载图片')
    if not self.cover_path:
        self.initpath()
    self.refreshproxypool()
    conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
    cur = conn.cursor()
    cur.execute("select journal_id,cover_url from journal where cover_url!=''")
    rows = cur.fetchall()
    HEADER = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    for journal_id, cover_url in rows:
        filename = self.cover_path + '/' + journal_id + '.jpg'
        if os.path.exists(filename):
            continue
        while True:
            try:
                proxy = self.getproxy()
                proxies = {'http': proxy, 'https': proxy}
                resp = requests.get(cover_url, headers=HEADER, timeout=20, proxies=proxies)
                # resp = requests.get(cover_url, headers=HEADER, timeout=20)
            except:
                utils.printf(filename)
                continue
            if utils.Img2Jpg(resp.content, filename):
                utils.printf('下载图片%s成功' % filename)
                break
    self.sendwork('mapcover')
def init_config():
    parser = argparse.ArgumentParser()
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s [%(module)10s] [%(levelname)5s] %(message)s')
    load = {}
    config_file = "config.json"
    if os.path.isfile(config_file):
        with open(config_file) as data:
            load.update(json.load(data))
    parser.add_argument("-a", "--auth_service", help="Auth Service ('ptc' or 'google')", default="ptc")
    parser.add_argument("-u", "--username", help="Username")
    parser.add_argument("-p", "--password", help="Password")
    parser.add_argument("-l", "--location", help="Location")
    parser.add_argument("-r", "--radius", help="area circle radius", type=int)
    parser.add_argument("-w", "--width", help="area square width", type=int)
    parser.add_argument("-f", "--dbfile", help="DB filename", default='db.sqlite')
    parser.add_argument("--level", help="cell level used for tiling", default=13, type=int)
    parser.add_argument("-t", "--delay", help="rpc request interval", default=10, type=int)
    parser.add_argument("-d", "--debug", help="Debug Mode", action='store_true', default=0)
    parser.add_argument("-n", "--test", help="Beta algorithm", action='store_true', default=0)
    config = parser.parse_args()

    # Fall back to values from config.json for any option not given on the command line.
    for key in config.__dict__:
        if key in load and config.__dict__[key] is None:
            config.__dict__[key] = load[key]

    if config.auth_service not in ['ptc', 'google']:
        log.error("Invalid Auth service specified! ('ptc' or 'google')")
        return None

    if config.debug:
        logging.getLogger("requests").setLevel(logging.DEBUG)
        logging.getLogger("pgoapi").setLevel(logging.DEBUG)
        logging.getLogger("rpc_api").setLevel(logging.DEBUG)
    else:
        logging.getLogger("requests").setLevel(logging.WARNING)
        logging.getLogger("pgoapi").setLevel(logging.WARNING)
        logging.getLogger("rpc_api").setLevel(logging.WARNING)

    dbversion = check_db(config.dbfile)
    if dbversion != VERSION:
        log.error('Database version mismatch! Expected {}, got {}...'.format(VERSION, dbversion))
        return

    if config.location:
        from utils import get_pos_by_name
        lat, lng, alt = get_pos_by_name(config.location)
        del alt
        if config.radius:
            cells = cover_circle(lat, lng, config.radius, config.level)
        elif config.width:
            cells = cover_square(lat, lng, config.width, config.level)
        else:
            log.error('Area size not given!')
            return
        log.info('Added %d cells to scan queue.' % init_db(cells, config.dbfile))
        del cells, lat, lng

    return config
def gettagarts(tag):
    db = bottle.local.db
    ret = []
    if tag in db:
        tl = db[tag]
    else:
        return ret
    for a in tl:
        art = db[a]
        ret.insert(0, Article(art['title'], art['content'], art['time'], art['tags']))
    return ret


if __name__ == "__main__":
    import utils
    db = utils.init_db().connect()
    db.clear()
    db['tags'] = ['engadege', 'weiphone']
    title = '''外媒评出2010年十大最令人难忘产品'''
    content = '''<div id="read_content"> <P align=center><IMG border=0 src="http://resource.weiphone.com/resource/h003/h57/img201012041143530.jpg"><!--威锋网www.weiphone.com版权所有61.174.61.178 --></p> <P> 公关和市场传播公司Schneider Associates评出了2009至2010年最令人难忘的十大产品,其中苹果iPad、微软Windows 7、摩托罗拉Droid智能手机和三星3D电视上榜,高科技产品在该榜单中占据重要地位。<BR> <BR><STRONG>第一名:苹果iPad</STRONG><!--威锋网www.weiphone.com版权所有61.174.61.178 --></p> <P> 今年,iPad的销量已经超过了MacBook笔记本电脑。分析师预计,第四季度这款设备的销量可达630万部。<BR> <BR><STRONG>第二名:微软Windows 7操作系统<BR></STRONG> <BR> 微软最新操作系统——Windows 7支持触摸屏,不同设备间的分享功能更强,文件和程序的访问速度更快。在开发过程中,微软大量听取了消费者的意见,而消费者也非常喜欢这款产品。<BR> <BR><STRONG>第三名:玛氏糖果公司的Pretzel M&Ms<BR></STRONG> <BR> 这家糖果巨头满足了节俭消费者对廉价甜食的需求。实际上,它只不过是一颗夹心巧克力糖,这表明有时候好的想法其实很简单。<BR> <BR><STRONG>第四名:摩托罗拉Droid手机<BR></STRONG> <BR> 摩托罗拉Droid手机的宣传口号是“多任务运行”、“500万像素摄像头”和“黑暗环境中拍照”等,皆为<a href=http://iphone.weiphone.com>iPhone</a>的软肋。而这种广告营销策略在对抗<a href=http://iphone.weiphone.com>iPhone</a>这一强大对手时十分有效。<BR> <BR><STRONG>第五名:麦当劳水果冰沙McCafé Real Fruit Smoothies<BR></STRONG> <BR> 麦当劳效仿Jamba Juice推出水果冰沙,价格却要便宜得多,受到了消费者的欢迎。<BR> <BR><STRONG>第六名:苹果iPod Nano<BR></STRONG> <BR> Nano 6的体积更小,并具备了多点触摸功能。<BR> <BR><STRONG>第七名:星巴克Via速溶咖啡<BR></STRONG> <BR> 当星巴克发布Via时,外界批评声不断。但仅仅10个月,这种饮品的销售额已超过1亿美元。<BR> <BR><STRONG>第八名:三星3D电视<BR></STRONG> <BR> 《阿凡达》激发了3D热潮,三星迅速作出反应,于今年早些时候发布了全球第一款3D液晶电视。<BR> <BR><STRONG>第九名:哈吉斯牛仔裤尿布<BR></STRONG> <BR> 这款限量版牛仔裤尿布的广告语是:“你所见过的最酷拉裤子方式”。<BR> <BR><STRONG>第十名:Kleenex纸巾<BR></STRONG> <BR> 这款纸巾由100%的原生纤维制成,可以回收利用。<!--威锋网www.weiphone.com版权所有61.174.61.178 --></p> <br /> <span> </span>
def initdb_command():
    """Creates the database tables."""
    utils.init_db()
    print('Initialized the database.')
def init_db():
    """Creates the database tables."""
    utils.init_db()
    return 'Initialized the database.'
def __init__(self):
    self.sqlcon, self.sqlcursor = utils.init_db()
    self.history = None
from bottle import route, run, view, debug, static_file

import models, utils


@route('/')
@view('index')
def index():
    return dict(articles=models.getarts(), tags=models.gettags())


@route('/tags/:tag')
@view('index')
def tag(tag):
    return dict(articles=models.gettagarts(tag), tags=models.gettags(), curtag=tag)


@route('/static/:path#.+#')
def server_static(path):
    return static_file(path, root='static')


@route('/download/:filename')
def download(filename):
    return static_file(filename, root='download', download=filename)


debug(True)
utils.init_db('data.db')
run(server='flup', reloader=True)
def _fetchall():
    global last_line_result
    global err_level
    print(' == fetching line')
    try:
        evt_info, predict = _fetch_line()
    except Exception as e:
        if last_line_result:
            err_level += 3
            log('error', '档线获取失败,使用上次结果')
            evt_info, predict = last_line_result
        else:
            raise
    else:
        last_line_result = evt_info, predict

    eventid = evt_info['id']
    score_parser = parse_score_meta(eventid)

    if not os.path.exists('db/%d.db' % eventid):
        # init db
        print(' -> new event: event #%d %s' % (eventid, evt_info['title']))
        print(' -> creating database and writing event info...')
        init_db(eventid)
        with sqlite3.connect('events.db') as db:
            db.execute(
                'insert or replace into events (id, title, begin, end, last_update, score_parser) '
                'values (?,?,?,?,null,(select score_parser from events where id=?))',
                [eventid, evt_info['title'], int(evt_info['begin'].timestamp()), int(evt_info['end'].timestamp()), eventid]
            )

    if datetime.datetime.now() - datetime.timedelta(minutes=3) > evt_info['end']:
        log('debug', '活动 #%d 结束,爬虫停止抓取' % eventid)
        push('[SYSTEM]\n活动 #%d 结束\n#2300 : %d pt\n#11500 : %d pt\n#23000 : %d pt' % (
            eventid, predict['2300']['current'], predict['11500']['current'], predict['23000']['current']))
        raise SystemExit('活动结束')

    with sqlite3.connect('db/%d.db' % eventid) as db:
        db.execute('insert into line (time, t1pre, t1cur, t2pre, t2cur, t3pre, t3cur) values (?,?,?,?,?,?,?)', [
            int(datetime.datetime.now().timestamp()),
            predict['2300']['predict'], predict['2300']['current'],
            predict['11500']['predict'], predict['11500']['current'],
            predict['23000']['predict'], predict['23000']['current'],
        ])
        for ind, uid, name in follows:
            print(' == fetching score of #%d %s at place %d' % (uid, name, ind))
            details = _fetch_user_rank(ind, uid, eventid)
            if last_user_score[ind] is not None:
                last_lv, last_score, last_rank = last_user_score[ind]
                if details['score'] != last_score:
                    score_delta = details['score'] - last_score
                    log('info', '关注者 %s 分数变更:%d pt + %d pt → %d pt%s' %
                        (name, last_score, score_delta, details['score'], score_parser(score_delta, ' (%s)')))
                    if last_score and details['score'] > 0:
                        push('%s\n%s获得了 %d pt\n→ %d pt (#%d)' %
                             (name, score_parser(score_delta, '进行了 %s\n'), score_delta, details['score'], details['rank']))
                if details['level'] != last_lv:
                    log('info', '关注者 %s 等级变更:lv %d → lv %d' % (name, last_lv, details['level']))
                    if last_user_score[ind][0] > 0 and details['level'] > 0:
                        push('%s\n升级到了 lv. %d' % (name, details['level']))
                if line_num(details['rank']) != line_num(last_rank):
                    better_line = min(line_num(last_rank), line_num(details['rank']))
                    log('info', '关注者 %s 档位变更:L%d → L%d (#%d)' %
                        (name, line_num(last_rank), line_num(details['rank']), details['rank']))
                    if line_num(last_rank) > 0 and line_num(details['rank']) > 0:
                        push('%s\n%s了 %d 档\n当前排名:#%d' %
                             (name, '离开' if better_line == line_num(last_rank) else '进入', better_line, details['rank']))
            last_user_score[ind] = (details['level'], details['score'], details['rank'])
            db.execute(
                'insert into follow%d (time,level,score,rank) values (?,?,?,?)' % ind,
                [int(datetime.datetime.now().timestamp()), details['level'], details['score'], details['rank']]
            )
    return eventid
def load_db(self):
    self.sqlcon, self.sqlcursor = utils.init_db()
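# Note: most snippets above depend on a shared utils.init_db() helper whose definition
# is not included in this section. The sketch below is a hypothetical reconstruction for
# reference only: it assumes pymysql for the 'mysql' backend and sqlite3 otherwise, and
# treats the third positional argument (as seen in calls like
# utils.init_db('mysql', 'aiaajournal', 2)) as a connect retry count. Hostnames,
# credentials, and the real signature may differ in the actual utils module.
import sqlite3
import time

import pymysql


def init_db(dbtype='sqlite3', dbname='data.db', retries=1):
    """Open and return a database connection (illustrative sketch only)."""
    if dbtype == 'mysql':
        last_exc = None
        for _ in range(max(retries, 1)):
            try:
                # Placeholder connection parameters; the real helper presumably reads
                # them from a config file or environment variables.
                return pymysql.connect(host='127.0.0.1', user='root', password='',
                                       database=dbname, charset='utf8mb4')
            except pymysql.err.OperationalError as exc:
                last_exc = exc
                time.sleep(1)
        raise last_exc
    return sqlite3.connect(dbname)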