def update_category_relation_cot(self, journal_id):
    """Store the category-relation count of a journal and flag it as crawled.

    The count comes from ``self.category_relation_cot(journal_id)`` and is
    written to ``journal.category_relation_cot``; afterwards
    ``journal.is_crawled`` is set to true for the same ``sjr_id``.

    Any exception is caught and printed (best effort); nothing is raised
    and nothing is returned.

    :param journal_id: value matched against ``journal.sjr_id``.
    """
    # NOTE(review): the SQL is assembled with str.format, so journal_id is
    # assumed to be a trusted numeric id -- confirm against the callers.
    try:
        amount = self.category_relation_cot(journal_id)
        print(amount, journal_id)

        count_cur = new_db_cursor()
        try:
            count_cur.execute(
                'update journal set category_relation_cot={} where sjr_id={}'
                .format(amount, journal_id))
        finally:
            # Release the cursor even when the statement fails.
            count_cur.close()

        # The second statement keeps its own cursor, as before, but that
        # cursor is now closed instead of being abandoned.
        flag_cur = new_db_cursor()
        try:
            flag_cur.execute(
                'update journal set is_crawled = true where sjr_id = {}'
                .format(journal_id))
        finally:
            flag_cur.close()

        print('update {}ok'.format(journal_id))
    except Exception as e:
        print(str(e))
def update_area_relation_cot(self, journal_id):
    """Write the area-relation count of a journal into the ``journal`` table.

    The count is obtained from ``self.area_relation_cot(journal_id)`` and
    stored in ``journal.area_relation_cot`` for the row whose ``sjr_id``
    equals ``journal_id``. Exceptions propagate to the caller.

    :param journal_id: value matched against ``journal.sjr_id``.
    """
    amount = self.area_relation_cot(journal_id)
    print(amount, journal_id)
    cur = new_db_cursor()
    try:
        cur.execute(
            'update journal set area_relation_cot={} where sjr_id={}'.format(
                amount, journal_id))
    finally:
        # Close the cursor on the failure path as well as on success.
        cur.close()
def area_relation_cot(self, journal_id):
    """Return how many ``journal_area`` rows reference the given journal.

    :param journal_id: value matched against ``journal_area.journal_id``.
    :return: first column of the first row of the ``count(*)`` query.
    """
    cur = new_db_cursor()
    try:
        cur.execute(
            'select count(*) from journal_area WHERE journal_id={}'.format(
                journal_id))
        return cur.fetchall()[0][0]
    finally:
        # Runs after the return value is computed, including when the
        # query raises, so the cursor is never left open.
        cur.close()
def get_db_journal_ids(self):
    """Fetch the ``sjr_id`` of every journal not yet marked as crawled.

    :return: the rows from ``cursor.fetchall()``; each row carries the
        single selected column ``sjr_id``.
    """
    # An earlier variant of this query filtered on area_relation_cot=0
    # instead of the is_crawled flag.
    sql = 'select sjr_id from journal WHERE is_crawled=FALSE'
    cur = new_db_cursor()
    try:
        cur.execute(sql)
        return cur.fetchall()
    finally:
        # Close the cursor even if the query fails.
        cur.close()
def get_detail_journal_info(self, db_item):
    """Crawl one journal's detail page and flag the journal as crawled.

    ``db_item[0]`` is used as the journal's sjr id. It is handed to
    ``JournalDetailPageParser(...).save_journal_category()``; when that
    call returns a truthy value, ``journal.is_crawled`` is set to true for
    the matching ``sjr_id``.

    Any exception is caught and printed (best effort); nothing is raised
    and nothing is returned.

    :param db_item: indexable row whose first element is the sjr id.
    """
    try:
        journal_sjr_id = db_item[0]
        result = JournalDetailPageParser(journal_sjr_id)\
            .save_journal_category()
        if result:
            cur = new_db_cursor()
            try:
                cur.execute(
                    'update journal set is_crawled = true where sjr_id = {}'
                    .format(journal_sjr_id))
            finally:
                # Previously this cursor was created inline and never closed.
                cur.close()
            print('update {}ok'.format(journal_sjr_id))
    except Exception as e:
        print(str(e))
    # Disabled follow-up step, kept for reference:
    #     self.update_area_relation_cot(journal_sjr_id)
def is_crawled_in_rank_page(self, category_id):
    '''
    Decide whether the first crawl stage has already covered a category.

    Counts the rows of the category/journal relation table
    (``journal_category``) for this category; more than 10 results means
    the category has been crawled (more than that is not needed).

    :param category_id: value matched against ``journal_category.category_id``.
    :return: ``True`` when the row count is greater than 10.
    '''
    sql = 'select count(*) from journal_category where category_id = {}'.format(
        category_id)
    cur = new_db_cursor()
    try:
        cur.execute(sql)
        amount = cur.fetchall()[0][0]
        print('JournalInfoGenerator:\n\tCategory_id:{}, Journal Amount:{}'.
              format(category_id, amount))
    finally:
        # Close the cursor on the error path too.
        cur.close()
    return amount > 10
@author: lyn @contact: [email protected] @python: 3.3 @editor: PyCharm @create: 2016-08-26 14:08 @description: 获取articles的bibtex(bs4版本) """ import os, sys PACKAGE_PARENT = '..' SCRIPT_DIR = os.path.dirname( os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) from db_config import new_db_cursor cur = new_db_cursor() from crawl_tools.request_with_proxy import request_with_proxy from bs4 import BeautifulSoup from multiprocessing.dummy import Pool as ThreadPool import time, random def except_or_none(func): def wrapper(*args, **kwargs): try: return func(*args, **kwargs) except Exception as e: print('Bibtex:\n\tError in {}(): {}'.format(func.__name__, str(e))) return None
def get_db_category_ids(self):
    """Fetch ``(sjr_id, area_id)`` for every row of ``sjr_category``.

    Rows are ordered by ``id`` descending.

    :return: the rows from ``cursor.fetchall()``.
    """
    cur = new_db_cursor()
    try:
        cur.execute('select sjr_id,area_id from sjr_category ORDER by id desc')
        return cur.fetchall()
    finally:
        # The cursor used to be left open; close it like the sibling
        # query helpers do.
        cur.close()