def update_category_relation_cot(self, journal_id):
     """Store a journal's category-relation count and flag it as crawled.

     The count is obtained from ``self.category_relation_cot(journal_id)``
     and written to ``journal.category_relation_cot``; afterwards the same
     row gets ``is_crawled = true``.  Any ``Exception`` raised along the way
     is printed instead of propagating (best-effort update).

     :param journal_id: value matched against ``journal.sjr_id``.
     :return: None
     """
     try:
         amount = self.category_relation_cot(journal_id)
         print(amount, journal_id)
         # NOTE(review): the SQL text is assembled with str.format; switch to
         # driver-side parameters if journal_id can ever be untrusted input.
         count_cur = new_db_cursor()
         try:
             count_cur.execute(
                 'update journal set category_relation_cot={} where sjr_id={}'
                 .format(amount, journal_id))
         finally:
             # Release the cursor even when the statement fails.
             count_cur.close()
         flag_cur = new_db_cursor()
         try:
             flag_cur.execute(
                 'update journal set is_crawled = true where sjr_id = {}'
                 .format(journal_id))
         finally:
             # This cursor used to be created anonymously and never closed.
             flag_cur.close()
         print('update {}ok'.format(journal_id))
     except Exception as e:
         print(str(e))
 def update_area_relation_cot(self, journal_id):
     """Write the area-relation count of a journal into the ``journal`` table.

     The count is taken from ``self.area_relation_cot(journal_id)`` and
     stored in ``journal.area_relation_cot`` for the row whose ``sjr_id``
     equals *journal_id*.  Exceptions from the database layer propagate.

     :param journal_id: value matched against ``journal.sjr_id``.
     :return: None
     """
     amount = self.area_relation_cot(journal_id)
     print(amount, journal_id)
     # NOTE(review): the SQL text is assembled with str.format; switch to
     # driver-side parameters if journal_id can ever be untrusted input.
     sql = 'update journal set area_relation_cot={} where sjr_id={}'.format(
         amount, journal_id)
     cur = new_db_cursor()
     try:
         cur.execute(sql)
     finally:
         # Close the cursor even if the update raises.
         cur.close()
 def area_relation_cot(self, journal_id):
     """Count the ``journal_area`` rows that reference a journal.

     :param journal_id: value matched against ``journal_area.journal_id``.
     :return: the ``count(*)`` value reported by the database.
     """
     # NOTE(review): the SQL text is assembled with str.format; switch to
     # driver-side parameters if journal_id can ever be untrusted input.
     sql = 'select count(*) from journal_area WHERE journal_id={}'.format(
         journal_id)
     cur = new_db_cursor()
     try:
         cur.execute(sql)
         # count(*) yields exactly one row with one column.
         amount = cur.fetchall()[0][0]
     finally:
         # Close the cursor even if the query or the fetch raises.
         cur.close()
     return amount
 def get_db_journal_ids(self):
     """Fetch the ``sjr_id`` of every journal not yet flagged as crawled.

     :return: the rows returned by ``fetchall()``; each row carries the
         single selected column ``sjr_id``.
     """
     # An earlier variant of this query filtered on area_relation_cot=0.
     sql = 'select sjr_id from journal WHERE is_crawled=FALSE'
     cur = new_db_cursor()
     try:
         cur.execute(sql)
         journal_ids = cur.fetchall()
     finally:
         # Close the cursor even if the query or the fetch raises.
         cur.close()
     return journal_ids
 def get_detail_journal_info(self, db_item):
     """Save the categories of one journal and flag it as crawled on success.

     Runs ``JournalDetailPageParser(sjr_id).save_journal_category()``; when
     that call returns a truthy value, the journal row is updated with
     ``is_crawled = true``.  Any ``Exception`` is printed instead of
     propagating, so one failing journal does not stop the caller.

     :param db_item: indexable row whose first element is the journal's
         ``sjr_id``.
     :return: None
     """
     try:
         journal_sjr_id = db_item[0]
         parser = JournalDetailPageParser(journal_sjr_id)
         result = parser.save_journal_category()
         if result:
             cur = new_db_cursor()
             try:
                 cur.execute(
                     'update journal set is_crawled = true where sjr_id = {}'
                     .format(journal_sjr_id))
             finally:
                 # This cursor used to be created anonymously and never
                 # closed.
                 cur.close()
             print('update {}ok'.format(journal_sjr_id))
     except Exception as e:
         print(str(e))
     # NOTE: a follow-up call to self.update_area_relation_cot(journal_sjr_id)
     # is currently disabled here.
 def is_crawled_in_rank_page(self, category_id):
     """Tell whether the first crawl stage was already done for a category.

     Counts the rows of the category/journal association table
     (``journal_category``) for *category_id*.  More than 10 rows means the
     category is treated as already crawled (a larger number is not needed).

     :param category_id: value matched against
         ``journal_category.category_id``.
     :return: True when more than 10 association rows exist.
     """
     # NOTE(review): the SQL text is assembled with str.format; switch to
     # driver-side parameters if category_id can ever be untrusted input.
     sql = 'select count(*) from journal_category where category_id = {}'.format(
         category_id)
     cur = new_db_cursor()
     try:
         cur.execute(sql)
         # count(*) yields exactly one row with one column.
         amount = cur.fetchall()[0][0]
     finally:
         # Close the cursor even if the query or the fetch raises.
         cur.close()
     print('JournalInfoGenerator:\n\tCategory_id:{}, Journal Amount:{}'.
           format(category_id, amount))
     return amount > 10
Example #7
0
"""
@author:    lyn
@contact:   [email protected]
@python:    3.3
@editor:    PyCharm
@create:    2016-08-26 14:08
@description:
            Fetch the BibTeX of articles (bs4 version)
"""
import os, sys
# Path bootstrap: append the parent directory of this script to sys.path.
# Presumably this is what lets the project-local imports below (db_config,
# crawl_tools) resolve when the file is run directly — verify against the
# repository layout before reordering anything here.
PACKAGE_PARENT = '..'
# Directory containing this file, with symlinks resolved by realpath().
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))

from db_config import new_db_cursor
# Module-level cursor, created as a side effect of importing this module.
cur = new_db_cursor()
from crawl_tools.request_with_proxy import request_with_proxy

from bs4 import BeautifulSoup
# multiprocessing.dummy provides the Pool API backed by threads.
from multiprocessing.dummy import Pool as ThreadPool
import time, random


def except_or_none(func):
    """Decorator: run *func*, returning ``None`` if it raises.

    Any ``Exception`` raised by the wrapped callable is reported on stdout
    (prefixed with ``Bibtex:``) and swallowed; the call then evaluates to
    ``None``.  Successful calls return the wrapped function's result
    unchanged.

    :param func: callable to protect.
    :return: the wrapping function.
    """
    # Local import keeps this block self-contained with respect to the
    # file's import section.
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            print('Bibtex:\n\tError in {}(): {}'.format(func.__name__, str(e)))
            return None

    # Without this return the decorator would replace every decorated
    # function with None.
    return wrapper
 def get_db_category_ids(self):
     """Fetch ``(sjr_id, area_id)`` for every row of ``sjr_category``.

     Rows are ordered by ``id`` descending.

     :return: the rows returned by ``fetchall()``.
     """
     cur = new_db_cursor()
     try:
         cur.execute('select sjr_id,area_id from sjr_category ORDER by id desc')
         return cur.fetchall()
     finally:
         # The cursor used to be left open after the fetch.
         cur.close()