Code example #1
    def _feed_info_queue(self, url):
        self.logger.info("processing page %s", url)

        html = fetch(url, proxies=None, logger=self.logger)
        item = extract(RULES["item_url"], html, multi=True)
        for i in item[:88]:  # queue at most the first 88 item URLs from this page
            self.info_queue.put_nowait(BASE_URL + i)
Code example #2
 def _crawl_info(self, item_url):
     self.logger.info("processing info %s", item_url)
     from CustomParser.ece_utexas_parser import ECEUtexasClass
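     # fetch the profile page, parse it with the UT Austin ECE parser, and queue the parsed record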
     sec = fetch(item_url, proxies=None, logger=self.logger)
     tmp = ECEUtexasClass(sec)
     parm = tmp.set_value()
     tmp.terminal_monitoring()
     self.parm_queue.put_nowait(parm)
Code example #3
 def crawl_info(self):
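     # fetch the CS faculty listing, parse each matched section, and queue the parsed records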
     from CustomParser.cs_utexas_parser import CSUtexasClass
     from lxml import etree
     html = fetch(self.base_url, logger=self.logger)
     sec = extract(RULES["item"], html, multi=True)
     for i in sec:
         if i is not None:
             tmp = CSUtexasClass(str(etree.tostring(i)))
             parm = tmp.set_value()
             tmp.terminal_monitoring()
             self.parm_queue.put_nowait(parm)
Code example #4
def download(url, user_id, logger):
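    # fetch the image bytes and write them to ../PhotoTemp/<user_id>.jpg, logging any failure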
    try:
        tmp = fetch(url, decode=False)
        try:
            with open('../PhotoTemp/{}.jpg'.format(user_id), 'wb') as f:
                f.write(tmp)
        except Exception as e:
            logger.warning("{} {} save failed! {}".format(url, user_id, e))
    except Exception as e:
        logger.warning("{} {} download failed! {}".format(url, user_id, e))
Code example #5
 def _crawl_info(self, item_url):
     self.logger.info("processing info %s", item_url)
     from ScholarConfig.me_utexas_rule import RULES
     from CustomParser.me_utexas_parser import MeUtexasClass
     from lxml import etree
     html = fetch(item_url, proxies=None, logger=self.logger)
     sec = extract(RULES["item"], html, multi=True)
     for i in sec:
         tmp = MeUtexasClass(str(etree.tostring(i)))
         parm = tmp.set_value()
         tmp.terminal_monitoring()
         self.parm_queue.put_nowait(parm)
Code example #6
    def _feed_info_queue(self, url):
        self.logger.info("processing page %s", url)

        html = fetch(url,
                     requests_session=self.requsts_session,
                     proxies=None,
                     logger=self.logger)
        item = extract(self.item_url_rule, html, multi=True)
        if not self.is_url_joint:
            for i in item:
                self.info_queue.put_nowait(i)
        else:
            for i in item:
                self.info_queue.put_nowait(self.default_url + i)
Code example #7
    def _crawl_info(self, item_url):
        self.logger.info("processing info %s", item_url)
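        # build parse rules from the sample URL, then fetch this page and run the custom parser on it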

        self.parse_data = auto_generate(sampleurl=self.sample_url,
                                        data=self.data,
                                        common_url=item_url)

        sec = fetch(item_url,
                    requests_session=self.requsts_session,
                    proxies=None,
                    logger=self.logger)
        tmp = self.custom_parser(sec=sec, parse_data=self.parse_data)
        parm = tmp.set_value()
        tmp.terminal_monitoring()
        self.parm_queue.put_nowait(parm)
Code example #8
        with open(self.proxies_or_path) as f:
            self.proxies = f.readlines()
        self.logging.info("reload %s proxies ...", len(self.proxies))

    def get_proxy(self):
        '''
        Return one usable proxy.
        :return:
        '''
        if self.is_single:
            return self.proxies
        proxy = random.choice(self.proxies).strip()
        host, _ = proxy.split(':')
        latest_time = self.host_time_map.get(host, 0)
        interval = time.time() - latest_time
        if interval < self.interval:
            # throttle: this host was used less than self.interval seconds ago
            self.logging.info("%s waiting", proxy)
            time.sleep(self.interval)
        self.host_time_map[host] = time.time()
        return "http://%s" % proxy


if __name__ == '__main__':
    from utils.connection import fetch
    from utils.logger import get_logger
    logger = get_logger("ScienceDirectTask")
    p = ProxyManager("./1.txt", logger)
    a = p.get_proxy()
    b = fetch(url="http://icanhazip.com", proxies=a, logger=logger)
    print(b)
Code example #9
            self.email = tmp.group()

    def _generate_website(self):
        pass
    
    def _generate_cooperation(self):
        pass
        
    def _generate_bio(self):
        pass

    def _generate_keywords(self):
        pass
        
    def _generate_city(self):
        pass

    def _generate_time(self):
        pass

    def _generate_keywordKeys(self):
        self.keywordKeys = [i for i in range(1, len(self.keywords) + 1)]

    def _generate_cityKeys(self):
        self.cityKeys = [i for i in range(1, len(self.city) + 1)]

    def _generate_timeKeys(self):
        self.timeKeys = [i for i in range(1, len(self.timeKeys) + 1)]

if __name__ == '__main__':
    from utils.connection import fetch
    html = fetch("http://www.me.berkeley.edu/people/faculty")
    a = extract(RULES["item_url"], html, multi=True)
    print(a)
Code example #10
    def _generate_keywords(self):
        self.keywords = extract(RULES["keyword"], self.sec, multi=True)

    def _generate_city(self):
        pass

    def _generate_time(self):
        pass

    def _generate_keywordKeys(self):
        self.keywordKeys = [i for i in range(1, len(self.keywords) + 1)]

    def _generate_cityKeys(self):
        self.cityKeys = [i for i in range(1, len(self.city) + 1)]

    def _generate_timeKeys(self):
        self.timeKeys = [i for i in range(1, len(self.timeKeys) + 1)]

if __name__ == '__main__':
    from utils.connection import fetch
    from lxml import etree
    html = fetch("https://www.cs.utexas.edu/faculty")
    sec = extract(RULES["item"], html, multi=True)
    for i in sec:
        if i is not None:
            tmp = extract(RULES["keyword"], str(etree.tostring(i)), multi=True)
            print(tmp)
        
        
Code example #11
    def _generate_email(self):
        self.email = extract(RULES["email"], self.sec)

    def _generate_website(self):
        self.website = extract(RULES["website"], self.sec)

    def _generate_cooperation(self):
        if "Research Interest" in self.sec:
            self.cooperation = extract(RULES["cooperation"], self.sec, multi=True)
        
    def _generate_bio(self):
        pass

    def _generate_keywords(self):
        a = extract(RULES["keywords"], self.sec)
        if a is not None:
            self.keywords.append(a)

    def _generate_city(self):
        pass

    def _generate_time(self):
        pass

    def _generate_keywordKeys(self):
        self.keywordKeys = [i for i in range(1, len(self.keywords) + 1)]

    def _generate_cityKeys(self):
        self.cityKeys = [i for i in range(1, len(self.city) + 1)]

    def _generate_timeKeys(self):
        self.timeKeys = [i for i in range(1, len(self.timeKeys) + 1)]

if __name__ == '__main__':
    from utils.connection import fetch
    html = fetch("http://tmi.utexas.edu/people/type/faculty/")
    a = extract(RULES["item_url"], html, multi=True)
    print(a)
Code example #12
        self.cooperation = extract(RULES["cooperation"], self.sec, multi=True)

    def _generate_bio(self):
        self.bio = extract(RULES["bio"], self.sec)

    def _generate_keywords(self):
        self.keywords = extract(RULES["keywords"], self.sec, multi=True)

    def _generate_city(self):
        pass

    def _generate_time(self):
        pass

    def _generate_keywordKeys(self):
        self.keywordKeys = [i for i in range(1, len(self.keywords) + 1)]

    def _generate_cityKeys(self):
        self.cityKeys = [i for i in range(1, len(self.city) + 1)]

    def _generate_timeKeys(self):
        self.timeKeys = [i for i in range(1, len(self.timeKeys) + 1)]


if __name__ == '__main__':
    from utils.connection import fetch
    html = fetch("http://www.ece.utexas.edu/people/faculty/david-soloveichik")
    a = extract(RULES["keywords"], html)
    print(a)
Code example #13
File: me_utexas_parser.py  Project: leisun123/scholar
    def _generate_time(self):
        pass

    def _generate_keywordKeys(self):
        self.keywordKeys = [i for i in range(1, len(self.keywords) + 1)]

    def _generate_cityKeys(self):
        self.cityKeys = [i for i in range(1, len(self.city) + 1)]

    def _generate_timeKeys(self):
        self.timeKeys = [i for i in range(1, len(self.timeKeys) + 1)]


if __name__ == '__main__':
    from utils.connection import fetch
    from lxml import etree
    html = fetch("http://me.utexas.edu/faculty/faculty-directory")
    sec = extract(RULES["item"], html, multi=True)
    for i in sec:
        if i is not None:
            tmp = extract(RULES["info"], str(etree.tostring(i)),
                          multi=True)[-1]
            if tmp is not None:
                a = tmp.xpath('string(.)')
Code example #14
    def _generate_cooperation(self):
        self.cooperation.append(extract(RULES["cooperation"], self.sec))

    def _generate_bio(self):
        self.bio = extract(RULES["bio"], self.sec)

    def _generate_keywords(self):
        self.keywords.append(extract(RULES["keywords"], self.sec))

    def _generate_city(self):
        pass

    def _generate_time(self):
        pass

    def _generate_keywordKeys(self):
        self.keywordKeys = [i for i in range(1, len(self.keywords) + 1)]

    def _generate_cityKeys(self):
        self.cityKeys = [i for i in range(1, len(self.city) + 1)]

    def _generate_timeKeys(self):
        self.timeKeys = [i for i in range(1, len(self.timeKeys) + 1)]


if __name__ == '__main__':
    from utils.connection import fetch
    html = fetch("http://www.me.berkeley.edu/people/faculty/m-reza-alam")
    a = extract(RULES["phone"], html)
    print(a)
Code example #15
                self.keywords = tmp.split(",")
            else:
                self.keywords.append(tmp)
        else:
            self.keywords = []

    def _generate_city(self):
        pass

    def _generate_time(self):
        pass

    def _generate_keywordKeys(self):
        self.keywordKeys = [i for i in range(1, len(self.keywords) + 1)]

    def _generate_cityKeys(self):
        self.cityKeys = [i for i in range(1, len(self.city) + 1)]

    def _generate_timeKeys(self):
        self.timeKeys = [i for i in range(1, len(self.timeKeys) + 1)]


if __name__ == '__main__':
    from utils.connection import fetch
    html = fetch("http://www.caee.utexas.edu/faculty/directory/wright")
    sec = extract(RULES["item_url"], html, multi=True)
    html_1 = fetch("http://www.caee.utexas.edu/faculty/directory/gloyna")
    a = extract(RULES["info"], html_1, multi=True)[-1]
    b = extract(RULES["cooperation"], html)
    c = []
    print(html)
Code example #16
    def _generate_bio(self):
        self.bio = extract(RULES["bio"], self.sec)

    def _generate_keywords(self):
        self.keywords = extract(RULES["keywords"], self.sec, multi=True)

    def _generate_city(self):
        pass

    def _generate_time(self):
        pass

    def _generate_keywordKeys(self):
        self.keywordKeys = [i for i in range(1, len(self.keywords) + 1)]

    def _generate_cityKeys(self):
        self.cityKeys = [i for i in range(1, len(self.city) + 1)]

    def _generate_timeKeys(self):
        self.timeKeys = [i for i in range(1, len(self.timeKeys) + 1)]


if __name__ == '__main__':
    from utils.connection import fetch
    html = fetch(
        "https://www2.eecs.berkeley.edu/Faculty/Lists/list.html?_ga=2.157267064.273235644.1500137616-1698500221.1500137605"
    )
    a = extract(RULES["item_url"], html, multi=True)
    print(a)