def mark_volume_ok(self, volume_db_id):
    """Flag one ``journal_volume`` row as crawled.

    Sets ``is_crawled`` to true on the row whose ``id`` column equals
    ``volume_db_id`` and prints a confirmation line on success.

    Args:
        volume_db_id: Value matched against ``journal_volume.id``.
    """
    cur = REMOTE_CONNS_POOL.new_db_cursor()
    try:
        # Bind the id as a query parameter (the style create_volume already
        # uses) rather than formatting it into the SQL text.
        cur.execute(
            "update journal_volume set is_crawled=true where id=%s",
            (volume_db_id,),
        )
    finally:
        # Release the cursor even when the statement fails.
        cur.close()
    print('Mark volume {} ok!'.format(volume_db_id))
def get_db_volume_item(self, volume_link):
    """Look up the crawl flag of the volume stored under ``volume_link``.

    Args:
        volume_link: Value matched against ``journal_volume.link``.

    Returns:
        The result of ``fetchall()``: one row per matching volume, each
        holding the ``is_crawled`` column (empty when nothing matches).
    """
    cur = REMOTE_CONNS_POOL.new_db_cursor()
    try:
        # The link is passed as a bound parameter: a URL containing a quote
        # character would otherwise break (or alter) the statement.
        cur.execute(
            "select is_crawled from journal_volume where link = %s",
            (volume_link,),
        )
        return cur.fetchall()
    finally:
        cur.close()
def get_unfinished_journals(self, single_area_relation=True,
                            open_access=True, limit=100,
                            volume_links_got=True):
    """Select journals of this publisher that are not fully crawled yet.

    Only rows with ``is_crawled_all_article=FALSE`` whose ``site_source``
    contains ``self.publisher_keyword`` are returned, ordered by ``id``.

    Args:
        single_area_relation: When truthy, keep only journals with
            ``area_relation_cot=1``.
        open_access: When truthy, keep only journals with
            ``open_access=true``.
        limit: Maximum number of rows. A negative value means "take
            ``-limit`` rows in descending ``id`` order".
        volume_links_got: Truthy/falsy filters on ``volume_links_got``
            being true/false; the string ``'no limit'`` disables this
            filter.

    Returns:
        A single-entry dict mapping ``self.publisher_keyword`` to the
        fetched rows.
    """
    descending = limit < 0
    row_limit = -limit if descending else limit
    direction = 'desc' if descending else ''

    # Collect the WHERE terms and join them once, instead of concatenating
    # fragments that each carry a trailing "and".
    conditions = []
    if single_area_relation:
        conditions.append('area_relation_cot=1')
    if open_access:
        conditions.append('open_access=true')
    if volume_links_got != 'no limit':
        conditions.append(
            'volume_links_got=true' if volume_links_got
            else 'volume_links_got=false'
        )
    conditions.append('is_crawled_all_article=FALSE')
    conditions.append('site_source like %s')

    sql = (
        "select name,sjr_id,site_source,area_relation_cot,"
        "category_relation_cot,publisher,volume_links_got from journal "
        "WHERE {} order by id {} limit %s"
    ).format(' and '.join(conditions), direction)

    cur = REMOTE_CONNS_POOL.new_db_cursor()
    try:
        # The LIKE pattern and the limit are bound as parameters; only the
        # fixed filter fragments and sort direction are formatted in.
        cur.execute(
            sql, ('%{}%'.format(self.publisher_keyword), row_limit)
        )
        data = cur.fetchall()
    finally:
        cur.close()
    return {self.publisher_keyword: data}
def journals_of_specific_index(index_sjr_id, single_area_relation,
                               index_name, open_access, limit=100):
    """Select not-fully-crawled journals that belong to one index entry.

    Journals are restricted to those whose ``site_source`` matches one of
    the publisher patterns below and whose ``sjr_id`` appears in
    ``journal_<index_name>`` with ``<index_name>_id`` equal to
    ``index_sjr_id``. Rows are ordered by ``h_index``.

    Args:
        index_sjr_id: Value matched against ``<index_name>_id``.
        single_area_relation: When truthy, keep only journals with
            ``area_relation_cot=1``.
        index_name: Used to build the relation table name
            (``journal_<index_name>``) and its id column.
        open_access: When truthy, keep only journals with
            ``open_access=true``.
        limit: Maximum number of rows. A negative value means "take
            ``-limit`` rows in descending ``h_index`` order".

    Returns:
        The rows returned by ``fetchall()``.
    """
    descending = limit < 0
    row_limit = -limit if descending else limit
    direction = 'desc' if descending else ''

    # LIKE patterns for the supported publishers, bound as parameters.
    publisher_patterns = ('%lsevier%', '%ieee%', '%springer%')

    conditions = []
    if single_area_relation:
        conditions.append('area_relation_cot=1')
    if open_access:
        conditions.append('open_access=true')
    conditions.append('({})'.format(
        ' or '.join('site_source like %s' for _ in publisher_patterns)
    ))
    conditions.append('is_crawled_all_article=FALSE')

    # NOTE: index_name is an SQL identifier (table/column name) and cannot
    # be bound as a parameter; it is interpolated as-is and must only ever
    # come from trusted code.
    sql = (
        "select name,sjr_id,site_source,area_relation_cot,"
        "category_relation_cot,publisher,volume_links_got from journal "
        "WHERE {filters} and sjr_id IN ("
        "select journal_id from journal_{index} WHERE {index}_id=%s"
        ") ORDER by h_index {direction} limit %s"
    ).format(
        filters=' and '.join(conditions),
        index=index_name,
        direction=direction,
    )

    cur = REMOTE_CONNS_POOL.new_db_cursor()
    try:
        # Parameter order follows placeholder order in the statement.
        cur.execute(sql, publisher_patterns + (index_sjr_id, row_limit))
        return cur.fetchall()
    finally:
        # Close the cursor once the rows have been fetched.
        cur.close()
def get_unfinished_volume_links(self):
    """Return the volumes of this journal that still need crawling.

    If the journal has no volume links recorded yet
    (``self.JournalObj.volume_links_got`` is falsy), they are created first
    via ``create_new_volumes()``.

    Returns:
        An empty list when ``self.just_init`` is truthy; otherwise the
        ``(link, id)`` rows of ``journal_volume`` for this journal with
        ``is_crawled=FALSE``.
    """
    if not self.JournalObj.volume_links_got:
        # First-time initialisation of this journal's volume rows.
        self.create_new_volumes()
    # NOTE(review): the nesting of this check was ambiguous in the version
    # reviewed; it is treated as function-level, i.e. just_init always
    # short-circuits. Confirm it was not meant to apply only right after
    # create_new_volumes().
    if self.just_init:
        return []

    cur = REMOTE_CONNS_POOL.new_db_cursor()
    try:
        cur.execute(
            'select link,id from journal_volume '
            'where journal_sjr_id=%s and is_crawled=FALSE',
            (self.JournalObj.sjr_id,),
        )
        return cur.fetchall()
    finally:
        # Close the cursor once the rows have been fetched.
        cur.close()
def create_new_volumes(self):
    """Store every link in ``self.volume_links`` and mark the journal.

    Each link is saved through ``create_volume()``; afterwards the journal
    row (matched by ``self.JournalObj.sjr_id``) gets
    ``volume_links_got=TRUE``. Nothing is written when there are no links.
    """
    print('Init volume_links of {}...'.format(self.JournalObj.name))
    if not self.volume_links:
        # Fetching the volume links failed (or yielded nothing): do not set
        # volume_links_got on the journal. The truthiness test also covers
        # None and empty non-list sequences, which "== []" let through.
        return

    for volume_link in self.volume_links:
        self.create_volume(volume_link)

    cur = REMOTE_CONNS_POOL.new_db_cursor()
    try:
        cur.execute(
            'update journal set volume_links_got=TRUE where sjr_id=%s',
            (self.JournalObj.sjr_id,),
        )
    finally:
        # Release the cursor even when the update fails.
        cur.close()
    print(' volume links created ok! <{}>'.format(self.JournalObj.name))
def create_volume(self, volume_link):
    """Insert one ``journal_volume`` row for ``volume_link``.

    The row is created with ``is_crawled`` false, the current journal's
    ``sjr_id`` and ``get_beijing_time()`` as ``create_time``.
    ``psycopg2.IntegrityError`` and ``psycopg2.OperationalError`` are
    reported on stdout and swallowed; any other exception propagates.

    Args:
        volume_link: Link stored in the ``link`` column.
    """
    # Start unbound-safe: acquiring the cursor may itself raise the
    # OperationalError handled below, in which case there is nothing to close.
    cur = None
    try:
        cur = REMOTE_CONNS_POOL.new_db_cursor()
        cur.execute(
            "insert into journal_volume(link,journal_sjr_id,is_crawled,create_time)"
            "values(%s,%s,%s,%s)",
            (volume_link, self.JournalObj.sjr_id, False, get_beijing_time()),
        )
        print('[Success]Save ok volume_link: {} !'.format(volume_link))
    except psycopg2.IntegrityError as e:
        print('[Error] in volume_link create:\n{} '.format(str(e)))
    except psycopg2.OperationalError as e:
        print('[Error] in volume_link create:\nserver conn error{}'.format(
            str(e)))
    finally:
        # Always release the cursor, including on unhandled exceptions.
        if cur is not None:
            cur.close()
def mark_journal_ok(self):
    """Mark the current journal as having all of its articles crawled.

    Sets ``is_crawled_all_article`` to true on the ``journal`` row whose
    ``sjr_id`` equals ``self.JournalObj.sjr_id``.
    """
    cur = REMOTE_CONNS_POOL.new_db_cursor()
    try:
        # Bound parameter instead of formatting the id into the SQL text.
        cur.execute(
            'update journal set is_crawled_all_article = true '
            'where sjr_id = %s',
            (self.JournalObj.sjr_id,),
        )
    finally:
        # Release the cursor even when the update fails.
        cur.close()
def categories_of_specific_area(area_sjr_id):
    """List the categories that belong to one area.

    Args:
        area_sjr_id: Value matched against ``sjr_category.area_id``.

    Returns:
        The ``(name, sjr_id)`` rows returned by ``fetchall()``.
    """
    cur = REMOTE_CONNS_POOL.new_db_cursor()
    try:
        cur.execute(
            'select name,sjr_id from sjr_category WHERE area_id=%s',
            (area_sjr_id,),
        )
        return cur.fetchall()
    finally:
        # Close the cursor once the rows have been fetched.
        cur.close()
def __init__(self, major_keyword):
    """Take a cursor from the shared pool and remember the keyword.

    Args:
        major_keyword: Stored unchanged as ``self.major_keyword``.
    """
    # The cursor is kept on the instance as ``self.cur``.
    self.cur = REMOTE_CONNS_POOL.new_db_cursor()
    self.major_keyword = major_keyword