Esempio n. 1
0
 def __init__(self, proxy=None):
     """Start a browser session and load the CNKI result page.

     :param proxy: forwarded unchanged to ``AutoBrowser(proxy=...)``
     """
     start_url = 'http://epub.cnki.net/kns/brief/result.aspx?dbprefix=CJFQ'
     ready_locator = (By.CSS_SELECTOR, '#bottom')

     # Result accumulator and the "keep paging" flag.
     self.soups = []
     self.more = True

     self.browser = AutoBrowser(proxy=proxy)
     self.browser.surf(start_url, ready_check=ready_locator)
     # Fixed pause after the page has been requested.
     time.sleep(2)
Esempio n. 2
0
    def __init__(self, proxy=None):
        """Set up the spider state and open the province score query page.

        :param proxy: forwarded unchanged to ``AutoBrowser(proxy=...)``
        """
        start_url = 'http://gkcx.eol.cn/soudaxue/queryProvinceScore.html'
        ready_locator = (By.LINK_TEXT, '末页')

        # Three groups of province names.
        self.first_region_set = [
            '安徽', '北京', '重庆', '福建', '广东', '广西', '甘肃',
            '贵州', '河北', '河南', '湖南', '湖北', '海南', '黑龙江',
        ]
        self.second_region_set = [
            '吉林', '江苏', '江西', '辽宁', '内蒙古', '宁夏', '青海',
        ]
        self.third_region_set = [
            '上海', '四川', '山西', '山东', '陕西',
            '天津', '新疆', '西藏', '云南', '浙江',
        ]

        # Query parameters and paging bookkeeping all start out empty.
        self.college = self.region = self.last_url = self.current_url = ''
        self.result = []
        self.no_result = False

        self.browser = AutoBrowser(proxy=proxy, timeout=20)
        self.browser.surf(start_url, ready_check=ready_locator)
Esempio n. 3
0
    def __init__(self, proxy=None):
        """Set up the spider state and open the Sina average-score ranking page.

        :param proxy: forwarded unchanged to ``AutoBrowser(proxy=...)``
        """
        start_url = ('http://kaoshi.edu.sina.com.cn/college/'
                     'collegeAvgScoreRank?syear=2013&provid=1')
        ready_locator = (By.CLASS_NAME, 'pageNumWrap')

        self.result = []
        self.region = ''

        self.browser = AutoBrowser(proxy=proxy, timeout=20)
        self.browser.surf(start_url, ready_check=ready_locator)
Esempio n. 4
0
class CEESpiderSina:
    """Scrape College Entrance Examination (CEE) score data from Sina.

    Usage as suggested by the methods below: choose the filters with the
    ``select_*`` methods, call :meth:`do_search`, gather every result page
    with :meth:`get_result_and_more`, then read :attr:`colleges`.
    """

    # Keys of one parsed result row, in column order. The last key is filled
    # from ``self.region`` rather than from the scraped table.
    _FIELDS = ('university', 'type', 'university_region', 'average_score',
               'subject', 'year', 'batch', 'student_region')

    def __init__(self, proxy=None):
        """Open the browser on the Sina average-score ranking page.

        :param proxy: forwarded unchanged to ``AutoBrowser(proxy=...)``
        """
        self.region = ''
        self.result = []
        # Set by do_search(); initialised here so it can always be read.
        self.current_url = ''

        self.browser = AutoBrowser(proxy=proxy, timeout=20)
        self.browser.surf(
            'http://kaoshi.edu.sina.com.cn/college/collegeAvgScoreRank?syear=2013&provid=1',
            ready_check=(By.CLASS_NAME, 'pageNumWrap'))

    def select_region(self, region):
        """Select the province and remember it for :attr:`colleges`.

        :param str region: province name as shown in the ``#provSel`` list
        :return: None
        """
        self.region = region
        self.browser.interact_one_time('#provSel', select_text=region)

    def select_subject(self, subject='文科'):
        """Select the subject track in ``#typeSel``.

        :param str subject: option text; the default is '文科' (liberal arts)
        :return: None
        """
        self.browser.interact_one_time('#typeSel', select_text=subject)

    def select_year(self, year='2014'):
        """Select the year in ``#sYear``.

        :param str year: option text of the year
        :return: None
        """
        self.browser.interact_one_time('#sYear', select_text=year)

    def select_batch(self, batch='本科一批'):
        """Select the admission batch in ``#sBatch``.

        :param str batch: option text of the batch
        :return: None
        """
        self.browser.interact_one_time('#sBatch', select_text=batch)

    def do_search(self):
        """Click the search button and wait for the pager to be ready.

        :return: None
        :raises TimeoutError: if ``is_ready`` reports the pager is not ready
        """
        self.browser.interact_one_time('#searchBtn', click=True)

        if not self.browser.is_ready(locator=(By.CLASS_NAME, 'pageNumWrap')):
            raise TimeoutError
        self.current_url = self.browser.browser.current_url
        time.sleep(5)

    def clear(self):
        """Discard all collected page texts.

        :return: None
        """
        self.result = []

    def get_result_and_more(self):
        """Append the text of the current and all following pages to ``self.result``.

        Paging stops as soon as a ``NoSuchElementException`` is raised inside
        the loop, normally because the "next" link is no longer present.

        :return: None
        :raises TimeoutError: if the pager is not ready after a page change
        """
        next_link = '.pageNumWrap > [node-type="next"]'
        self.result.append(
            self.browser.get_text(location='#scoreTable2', beautiful=False))

        while True:
            try:
                # Existence probe only. find_element(By, value) is used because
                # the find_element_by_* helpers were removed in Selenium 4.3.
                self.browser.browser.find_element(By.CSS_SELECTOR, next_link)
                self.browser.interact_one_time(next_link, click=True)
                time.sleep(2)
                if not self.browser.is_ready(locator=(By.CLASS_NAME, 'pageNumWrap')):
                    raise TimeoutError
                self.result.append(
                    self.browser.get_text(location='#scoreTable2', beautiful=False))
            except NoSuchElementException:
                break

    @property
    def colleges(self):
        """Parse the collected page texts into one dict per table row.

        Each line is split on whitespace; tokens 1 to 7 (the first token is
        dropped) plus the current ``self.region`` are mapped onto
        ``_FIELDS``. Lines that do not yield all eight values are discarded.

        NOTE(review): the region is taken from ``self.region`` at read time,
        so rows collected for an earlier region get the latest one unless
        ``clear()`` is called between regions.

        :return: list of row dicts, all values are strings
        :rtype: list
        """
        rows = []
        for page_text in self.result:
            for line in page_text.split('\n'):
                values = re.split(r'\s+', line)[1:8]
                values.append(self.region)
                row = dict(zip(self._FIELDS, values))
                # Only complete rows carry all eight keys.
                if len(row) > 7:
                    rows.append(row)
        return rows

    def close(self):
        """Quit the browser.

        :return: None
        """
        self.browser.quit()
Esempio n. 5
0
# coding=UTF-8

import random
import time
from libs.network.class_autobrowser import AutoBrowser

# Demo script: drives AutoBrowser through one query on the gkcx.eol.cn
# province score page and prints the text of the result element.

# Proxies the browser may be started with; one entry is picked at random below.
proxy_list = ['58.22.86.44:8000']

# Second proxy list; it is not referenced anywhere in the visible script.
# NOTE(review): '58.20.232.239:8000' appears twice in this list.
proxy_checked_list = ['58.20.234.243:8000','58.20.242.85:8000',
                      '110.52.232.56:8000','110.52.232.56:80',
                      '58.20.232.239:8000','58.246.242.154:8080',
                      '58.20.232.239:8000','110.52.232.75:8000',
                      '60.13.74.184:81','110.52.232.60:8000',
                      '58.247.30.222:8080','58.22.86.44:8000']

# randint is inclusive on both ends, hence the len(...)-1 upper bound.
browser = AutoBrowser(proxy=proxy_list[random.randint(0,len(proxy_list)-1)])
browser.surf('http://gkcx.eol.cn/soudaxue/queryProvinceScore.html')
# Region picker: click the picker image, then the third tab, then the
# link whose text is '西藏' (Tibet).
browser.interact_one_time('.gaoxiaoshengyuandi_s > span:nth-child(2) > a:nth-child(1) > img:nth-child(1)',click=True)
browser.interact_one_time('div.tabs_10:nth-child(3)',click=True)
browser.interact_one_time(location=browser.locate(link_text='西藏'),click=True)

# Subject picker: click the picker image, then the link '文科' (liberal arts).
browser.interact_one_time('.getFstypegaoxiaogesheng_s > span:nth-child(2) > a:nth-child(1) > img:nth-child(1)',click=True)
browser.interact_one_time(location=browser.locate(link_text='文科'),click=True)

# Type the college name '复旦大学' (Fudan University) and click the form's input.
browser.interact_one_time('#provinceScoreKEY',send_text='复旦大学')
browser.interact_one_time('#dxlqx > form:nth-child(1) > div:nth-child(2) > input:nth-child(1)',click=True)
# Fixed pause before reading the result element.
time.sleep(5)

print(browser.browser.find_element_by_css_selector('#queryschoolad').text)
# URL before paging; u1 is not used again in the visible script.
u1 = browser.browser.current_url
# Click the link '下一页' (next page).
browser.interact_one_time(location=browser.locate(link_text='下一页'),click=True)
Esempio n. 6
0
class Cnki:
    """Drive a browser session against the CNKI literature database.

    Query settings are applied with the ``set_*`` methods, ``submit`` runs
    the search, ``get_more`` pages through the results and stores one parsed
    export page per result page in ``self.soups``; the ``export_to_*``
    methods turn those pages into pickle / dict / JSON output.
    """

    def __init__(self, proxy=None):
        """Open the browser on the CNKI result page.

        :param proxy: forwarded unchanged to ``AutoBrowser(proxy=...)``
        """
        self.soups = list()
        self.more = True
        self.browser = AutoBrowser(proxy=proxy)
        self.browser.surf('http://epub.cnki.net/kns/brief/result.aspx?dbprefix=CJFQ',
                          ready_check=(By.CSS_SELECTOR, '#bottom'))
        time.sleep(2)

    def submit(self):
        """Click the search button (``id="btnSearch"``) and pause 5 seconds.

        :return: None
        """
        self.browser.interact_one_time(self.browser.locate(id="btnSearch"), click=True)
        time.sleep(5)

    def sort(self, by='被引'):
        """Sort the result list by clicking the link whose text is *by*.

        :param str by: link text of the sort column; default '被引' (citations)
        :return: None
        """
        self.browser.switch(iframe='iframeResult')
        self.browser.interact_one_time(location=self.browser.locate(link_text=by), click=True)
        time.sleep(6)

    def select_all_literature(self):
        """Clear the selection, tick the select-all checkbox and click the third ``.SavePoint`` link.

        :return: None
        """
        # '清除' is the "clear" link.
        self.browser.interact_one_time(location=self.browser.locate(link_text='清除'), click=True)
        time.sleep(2)
        self.browser.interact_one_time(location=self.browser.locate(id='selectCheckbox'), click=True)
        time.sleep(2)
        self.browser.interact_one_time(location='.SavePoint > a:nth-child(3)', click=True)

    def get_more(self, limit=4):
        """Page through the results and export each page.

        The limit is checked before the click, so the pass on which ``i``
        reaches *limit* still runs: up to ``max(limit, 1)`` next-page clicks
        are made. ``self.more`` is left ``False`` afterwards.

        :param int limit: number of next-page clicks to perform at most
        :return: None
        """
        i = 1
        while self.more:
            if i >= limit:
                self.more = False
            try:
                self.browser.switch(iframe='iframeResult')
                self.browser.interact_one_time(location=self.browser.locate(id='Page_next'), click=True)
                time.sleep(3)
            except NoSuchElementException:
                # No "next" link: stop paging.
                self.more = False
            else:
                time.sleep(5)
                self.select_all_literature()
                self.child_operation()
                i += 1
                time.sleep(2)

    def child_operation(self):
        """Work through the export child pages and append the result to ``self.soups``.

        :return: None
        """
        self.browser.interact_one_time(location='.GTContentTitle > td:nth-child(1) > input:nth-child(1)', click=True)
        self.browser.interact_one_time(location='#file_export > input:nth-child(1)', click=True)
        time.sleep(5)
        self.browser.interact_one_time(location=self.browser.locate(link_text='NoteExpress'), click=True)
        time.sleep(5)
        # find_element(By, value) replaces the find_element_by_* helper that
        # was removed in Selenium 4.3; it is available in older versions too.
        export_text = self.browser.browser.find_element(By.CSS_SELECTOR, '.mainTable').text
        self.soups.append(BeautifulSoup(export_text, "lxml"))
        time.sleep(5)
        # Two child pages were opened above; close both.
        self.browser.switch_to_parent(close=True)
        self.browser.switch_to_parent(close=True)
        time.sleep(5)

    def set_query(self, query_str=None):
        """Click the element with ``id="1_4"`` and type the query into ``#expertvalue``.

        :param str query_str: query string to send
        :return: None
        """
        self.browser.interact_one_time(self.browser.locate(id='1_4'), click=True)
        time.sleep(2)
        self.browser.interact_one_time('#expertvalue', send_text=query_str)
        time.sleep(1)

    def set_period(self, start_period=None, end_period=None):
        """Select the start and/or end period; ``None`` leaves a bound untouched.

        :param str start_period: option text for ``year_from``
        :param str end_period: option text for ``year_to``
        :return: None
        """
        if start_period is not None:
            self.browser.interact_one_time(location=self.browser.locate(id='year_from'), select_text=start_period)
        if end_period is not None:
            self.browser.interact_one_time(location=self.browser.locate(id='year_to'), select_text=end_period)

        time.sleep(1)

    def set_subject(self, subjects=None):
        """Click ``input.btn:nth-child(1)``, then the input of every given subject.

        :param list subjects: values of the ``name`` attribute of the inputs
            to click; ``None`` is treated as an empty list
        :return: None
        """
        self.browser.interact_one_time(location='input.btn:nth-child(1)', click=True)
        for subject in subjects or ():
            xpath = "//input[@name='" + subject + "']"
            self.browser.interact_one_time(location=self.browser.locate(xpath=xpath), click=True)
        time.sleep(1)

    def export_to_pickle(self, file=r'E:\gitrobot\files\literature\literature_list.pkl'):
        """Pickle ``self.soups`` to *file*.

        :param str file: path of the output file
        :return: None
        """
        # The context manager closes the file even if pickling fails.
        with open(file, 'wb') as handle:
            pickle.dump(self.soups, handle)

    @staticmethod
    def _squash(text):
        """Return *text* with every run of whitespace removed."""
        return re.sub(r'\s+', '', text)

    @staticmethod
    def _field_value(line):
        """Return the second piece of *line* when split on the ``}: `` separator."""
        return re.split(r'\}\: ', line)[1]

    def export_to_dict(self):
        """Parse the collected export pages into ``{title: record}``.

        Every export line is checked against the ``{Tag}`` markers below; a
        ``{Database Provider}`` line ends the current record and stores it
        under its whitespace-stripped title. Pages without a ``<p>`` payload
        and records without a title are skipped.

        NOTE(review): the key ``'issure'`` looks like a typo for "issue"; it
        is kept because consumers of the exported data may depend on it.

        :return: mapping of title to record dict, in parse order
        :rtype: collections.OrderedDict
        """
        squash = self._squash
        value_of = self._field_value
        literature = OrderedDict()

        for soup in self.soups:
            raw = str(soup.find_all('p'))
            opened = re.split(r'\[<p>', raw)
            if len(opened) < 2:
                # Nothing to parse on this page.
                continue
            content = re.split(r'</p>\]', opened[1])[0]

            title = None
            record = dict()
            for line in content.split('\n'):
                if '{Title}' in line:
                    title = squash(value_of(line))
                if '{Author}' in line:
                    record['author'] = [squash(part)
                                        for part in re.split(r'\{Author\}\: ', line)
                                        if len(part) > 0]
                if '{Author Address}' in line:
                    record['address'] = [squash(part)
                                         for part in value_of(line).split(';')
                                         if len(part) > 0]
                if '{Journal}' in line:
                    record['journal'] = squash(value_of(line))
                if '{Year}' in line:
                    record['year'] = squash(value_of(line))
                if '{Issue}' in line:
                    record['issure'] = squash(value_of(line))
                if '{Pages}' in line:
                    record['pages'] = squash(value_of(line))
                if '{Keywords}' in line:
                    record['keyword'] = [squash(part)
                                         for part in value_of(line).split(';')
                                         if len(part) > 0]
                if '{Abstract}' in line:
                    record['abstract'] = squash(value_of(line))
                if '{ISBN/ISSN}' in line:
                    record['ISBN/ISSN'] = squash(value_of(line))
                if '{Database Provider}' in line:
                    # End of one record. Without a title there is no key to
                    # store it under, so it is dropped instead of reusing the
                    # previous record's title.
                    if title is not None:
                        literature[title] = record
                    title = None
                    record = dict()

        return literature

    def export_to_json(self, file=r'E:\gitrobot\files\literature\literature_list.txt'):
        """Write the result of :meth:`export_to_dict` to *file* as JSON.

        :param str file: path of the output file
        :return: None
        """
        # Build the data first so a parse error does not truncate the file.
        data = self.export_to_dict()
        with open(file, 'w') as handle:
            json.dump(data, fp=handle)

    def close(self):
        """Quit the browser.

        :return: None
        """
        self.browser.quit()
Esempio n. 7
0
# Script: look up the ISSN of every journal record that has none, by
# searching the journal's Chinese name on the CNKI journal navigator.

# NOTE(review): the original had three ''' markers, which left the file
# unparsable. `mongo` is needed by the loop below, so the connection is live
# and the one-off listing is the part kept disabled - confirm this intent.
# presumably ('publication', 'ChineseJournal') are database and collection names.
mongo = MongoDB()
mongo.connect('publication','ChineseJournal')

# Disabled one-off listing of the exported journal file:
# literatures = json.load(open(r'E:\gitrobot\files\literature\journals_cssci.txt'))
# for l in literatures:
#     print(l)
#     #mongo.collection.insert_one(l)
# print(len(literatures))

# Candidate proxies; only used by the commented-out AutoBrowser calls below.
proxy_list = ['111.56.13.152:80', '101.26.38.162:80', '101.26.38.162:82', '111.56.13.150:80', '60.191.157.155:3128', '60.191.175.54:3128', '60.191.167.93:3128', '61.163.32.6:3128', '49.1.244.139:3128', '112.16.76.188:8080', '60.191.163.147:3128', '60.194.100.51:80', '101.226.12.223:80', '82.200.81.233:80', '85.143.24.70:80', '59.58.162.141:888', '110.18.241.9:3128', '60.15.41.214:3128', '61.7.149.69:8080', '61.184.199.203:3128', '86.100.118.44:81', '61.150.89.67:3128', '61.162.223.41:9797', '95.168.217.24:3128', '86.100.118.44:80', '31.173.74.73:8080', '58.248.137.228:80', '79.120.72.222:3128', '46.218.85.101:3129', '106.56.225.200:3128', '60.15.55.228:3128', '60.13.74.184:81', '101.200.234.114:8080', '104.238.83.28:443', '91.183.124.41:80', '60.191.164.22:3128', '62.204.241.146:8000', '60.191.174.227:3128', '60.191.153.12:3128', '61.53.65.52:3128', '36.250.69.4:80', '61.153.198.178:3128', '60.191.153.75:3128', '60.191.178.43:3128', '60.13.74.184:82', '60.13.74.184:80', '60.191.161.244:3128', '60.191.170.122:3128', '60.191.167.11:3128', '61.175.220.4:3128', '61.164.92.254:9999', '61.75.2.124:3128', '27.122.12.45:3128', '64.62.233.67:80', '113.140.43.51:3128', '60.191.166.130:3128', '113.107.57.76:8101', '113.107.57.76:80', '60.191.160.20:3128', '61.134.34.148:3128', '93.51.247.104:80', '60.191.164.59:3128', '91.142.84.182:3128', '72.252.11.91:8080', '59.44.244.14:9797', '58.18.50.10:3128', '58.96.187.208:3128', '85.194.75.18:8080', '113.105.80.61:3128', '58.59.141.187:3128', '61.163.45.240:3128', '91.108.131.250:8080', '110.17.172.150:3128']
#browser = AutoBrowser(proxy=proxy_list[random.randint(0,len(proxy_list)-1)])
#browser = AutoBrowser(proxy='101.26.38.162:82')
browser = AutoBrowser()
browser.surf('http://navi.cnki.net/knavi/journal/Detailq/CJFD/JJYJ?Year=&Issue=&Entry=',
             ready_check=(By.CSS_SELECTOR,'#bottom'))

result = []
# Every record whose 'ISSN' field is None.
for item in mongo.collection.find({'ISSN':None}):
    # '中文名称' is the record's "Chinese name" field.
    journal_name = item['中文名称']
    print(journal_name)
    browser.interact_one_time(location=browser.locate(id='navi-search-value'),send_text=journal_name)
    browser.interact_one_time(location=browser.locate(id='navi-search-button'),click=True)
    time.sleep(2)
    # Open the search hit whose title attribute equals the journal name.
    browser.interact_one_time(location=browser.locate(
            css_selector=''.join(['a[title="',journal_name,'"]'])),click=True)
    time.sleep(2)
    # find_element(By, value) replaces the find_element_by_* helper that was
    # removed in Selenium 4.3; it is available in older versions too.
    data = BeautifulSoup(browser.browser.find_element(By.CSS_SELECTOR,'.list01').text,"lxml")
    # Four digits, a dash, three digits and one final digit or letter.
    found = re.search(r'\d{4}-\d{3}[0-9a-zA-Z]',str(data))
    if found is None:
        # Skip this journal instead of aborting the whole run.
        print('ISSN not found')
        continue
    ISSN = found.group()
    print(ISSN)
Esempio n. 8
0
class CEESpider:
    """Scrape College Entrance Examination (CEE) score data from gkcx.eol.cn.

    Usage as suggested by the methods below: set the filters with
    ``select_region`` / ``select_subject`` / ``set_college``, call
    :meth:`do_search`, gather every result page with
    :meth:`get_result_and_more`, then read :attr:`colleges`.
    """

    # Keys of one parsed result row, in column order.
    _FIELDS = ('university', 'student_region', 'subject', 'year', 'batch',
               'average_score', 'province_control_score')

    def __init__(self, proxy=None):
        """Set up the spider state and open the province score query page.

        :param proxy: forwarded unchanged to ``AutoBrowser(proxy=...)``
        """
        # The region picker spreads the provinces over three tabs; each list
        # names the provinces whose tab select_region() has to click.
        self.first_region_set = ['安徽','北京','重庆','福建','广东','广西','甘肃','贵州','河北','河南','湖南','湖北','海南','黑龙江']
        self.second_region_set = ['吉林','江苏','江西','辽宁','内蒙古','宁夏','青海']
        self.third_region_set = ['上海','四川','山西','山东','陕西','天津','新疆','西藏','云南','浙江']
        self.college = ''
        self.region = ''
        self.last_url = ''
        self.current_url = ''
        self.result = []
        self.no_result = False

        self.browser = AutoBrowser(proxy=proxy, timeout=20)
        self.browser.surf('http://gkcx.eol.cn/soudaxue/queryProvinceScore.html',
                          ready_check=(By.LINK_TEXT, '末页'))

    def select_region(self, region):
        """Select the province in the region picker.

        :param str region: province name as listed in the region sets
        :return: None
        """
        self.region = region
        self.browser.interact_one_time('.gaoxiaoshengyuandi_s > span:nth-child(2) > a:nth-child(1) > img:nth-child(1)', click=True)

        # Click the tab whose region list contains the province.
        tabs = (
            (self.first_region_set, 'div.tabs_10:nth-child(1)'),
            (self.second_region_set, 'div.tabs_10:nth-child(2)'),
            (self.third_region_set, 'div.tabs_10:nth-child(3)'),
        )
        for region_set, tab_selector in tabs:
            if region in region_set:
                self.browser.interact_one_time(tab_selector, click=True)

        self.browser.interact_one_time(location=self.browser.locate(link_text=region), click=True)

    def select_subject(self, subject='文科'):
        """Select the subject track in the subject picker.

        :param str subject: link text; the default is '文科' (liberal arts)
        :return: None
        """
        self.browser.interact_one_time('.getFstypegaoxiaogesheng_s > span:nth-child(2) > a:nth-child(1) > img:nth-child(1)', click=True)
        self.browser.interact_one_time(location=self.browser.locate(link_text=subject), click=True)

    def set_college(self, college='复旦大学'):
        """Type the college name into ``#provinceScoreKEY``.

        :param str college: college name; default '复旦大学' (Fudan University)
        :return: None
        """
        self.college = college
        self.browser.interact_one_time('#provinceScoreKEY', send_text=college)

    def do_search(self):
        """Run the search and record whether it produced results.

        If ``#noResultMessage`` is empty, the method waits for the next-page
        link and stores the current URL; otherwise it sets
        ``self.no_result``.

        :return: None
        :raises TimeoutError: if results are expected but the next-page link
            does not become ready
        """
        self.browser.interact_one_time('#dxlqx > form:nth-child(1) > div:nth-child(2) > input:nth-child(1)', click=True)
        # find_element(By, value) replaces the find_element_by_* helper that
        # was removed in Selenium 4.3; it is available in older versions too.
        message = self.browser.browser.find_element(By.ID, 'noResultMessage').text
        if message == '':
            if not self.browser.is_ready(locator=(By.LINK_TEXT, '下一页')):
                raise TimeoutError
            self.current_url = self.browser.browser.current_url
        else:
            self.no_result = True

        time.sleep(5)

    def clear(self):
        """Discard all collected page texts.

        :return: None
        """
        self.result = []

    def get_result_and_more(self):
        """Append the text of the current and all following pages to ``self.result``.

        Paging stops when clicking the next-page link ('下一页') no longer
        changes the browser URL. If the preceding search had no result, the
        flag is reset and nothing is collected.

        :return: None
        """
        if self.no_result:
            self.no_result = False
            return None

        self.result.append(self.browser.get_text(location='#queryschoolad', beautiful=False))

        self.last_url = self.current_url
        self.browser.interact_one_time(location=self.browser.locate(link_text='下一页'), click=True)
        if self.browser.is_ready(locator=(By.LINK_TEXT, '下一页')):
            time.sleep(5)
            self.current_url = self.browser.browser.current_url

            # An unchanged URL after a click ends the loop; so does a failed
            # readiness check, because current_url is then left as it was.
            while self.last_url != self.current_url:
                self.result.append(self.browser.get_text(location='#queryschoolad', beautiful=False))
                self.last_url = self.current_url
                self.browser.interact_one_time(location=self.browser.locate(link_text='下一页'), click=True)
                if self.browser.is_ready(locator=(By.LINK_TEXT, '下一页')):
                    self.current_url = self.browser.browser.current_url

    @staticmethod
    def _parse_score(text):
        """Return *text* as an int, or ``None`` for the placeholder ``--``."""
        if text == '--':
            return None
        return int(float(text))

    @property
    def colleges(self):
        """Parse the collected page texts into one dict per table row.

        The first line of every page is skipped. Each remaining line is split
        on whitespace and its first seven tokens are mapped onto ``_FIELDS``;
        lines with fewer than seven tokens (blank lines, for example) are
        ignored. Both score fields are converted to ``int``, or to ``None``
        when the cell holds ``--``.

        :return: list of row dicts
        :rtype: list
        :raises ValueError: if a score cell is neither numeric nor ``--``
        """
        rows = []
        for page_text in self.result:
            for line in page_text.split('\n')[1:]:
                values = re.split(r'\s+', line)
                if len(values) < len(self._FIELDS):
                    # Incomplete line: it cannot supply both score columns.
                    continue
                row = dict(zip(self._FIELDS, values))
                for key in ('average_score', 'province_control_score'):
                    row[key] = self._parse_score(row[key])
                rows.append(row)
        return rows

    def close(self):
        """Quit the browser.

        :return: None
        """
        self.browser.quit()