Example 1
 def _generate_website(self):
     if 'Personal Website' in self.sec:
         tmp = extract(RULES["website-1"], self.sec)
         if tmp is not None:
             self.website = tmp
         else:
             self.website = extract(RULES["website-2"], self.sec)
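All of these snippets call an extract(rule, html, ...) helper whose definition is not shown here. Judging from the call sites, it evaluates an XPath rule against an HTML string and returns either the first hit (or None), or a list when multi=True; callers then often take .xpath('string(.)') on a returned element to get its full text. A minimal sketch under that assumption (not the project's actual utils implementation):

 from lxml import etree

 def extract(rule, html, multi=False):
     # assumed behavior only: parse the HTML fragment and run the XPath rule
     tree = etree.HTML(html)
     if tree is None:
         return [] if multi else None
     hits = tree.xpath(rule)
     if multi:
         return hits
     return hits[0] if hits else None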
Example 2
 def _generate_website(self):
     if self.parse_data.get("website"):
         # pull the first double-quoted URL out of the parsed value
         res = re.search(r'"(.*?)"', str(self.parse_data["website"]))
         if res is not None:
             self.website = res.group(1)
     else:
         tmp = extract(website_rule, self.sec)
         if tmp:
             self.website = tmp
 def _generate_website(self):
     info = extract(RULES["info"], self.sec, multi=True)
     if len(info) > 1 and info[1] is not None:
         # the second link (//a/@href) inside the info block is the personal website
         links = extract("//a/@href", str(etree.tostring(info[1])), multi=True)
         self.website = links[1] if len(links) == 2 else None
Example 4
 def _generate_title(self):
     parts = (extract(RULES["title_1"], self.sec),
              extract(RULES["title_2"], self.sec),
              extract(RULES["title_3"], self.sec))
     # concatenate the three title fragments, skipping rules that did not match
     self.title = "".join(p for p in parts if p is not None)
Example 5
 def _generate_phone(self):
     if self.parse_data.get("phone"):
         self.phone = self.parse_data["phone"]
     tmp = extract(phone_rule, self.sec)
     if tmp is not None:
         # strip the "Phone:" label from the extracted node text
         self.phone = tmp.xpath('string(.)').strip().replace('Phone:', '').strip()
 def _generate_avatar(self):
     if self.parse_data.get("avatar"):
         # pull the first URL-looking token out of the parsed value
         res = re.search(r'[a-zA-Z]+://[^\s]*', str(self.parse_data["avatar"]))
         if res is not None:
             self.avatar = res.group()
     tmp = extract(avatar_rule, self.sec)
     if tmp is not None:
         self.avatar = tmp
Example 7
 def _generate_phone(self):
     if self.parse_data.get("phone"):
         self.phone = self.parse_data["phone"]
     tmp = extract(phone_rule, self.sec)
     if tmp is not None:
         # strip the "Office Phone:" label from the extracted node text
         self.phone = tmp.xpath('string(.)').strip().replace(
             'Office Phone:', '').strip()
Example 8
 def _generate_avatar(self):
     if self.parse_data.get("avatar"):
         res = re.search(r'[a-zA-Z]+://[^\s]*', str(self.parse_data["avatar"]))
         if res is not None:
             self.avatar = res.group()
     tmp = extract(avatar_rule, self.sec)
     if tmp is not None:
         # the rule yields a filename relative to the faculty_bios directory
         self.avatar = "http://jacobsschool.ucsd.edu/faculty/faculty_bios/" + tmp
    def _feed_info_queue(self, url):
        self.logger.info("processing page %s", url)

        html = fetch(url, proxies=None, logger=self.logger)
        item = extract(RULES["item_url"], html, multi=True)
        for i in item[:88]:
            self.info_queue.put_nowait(BASE_URL + i)
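The fetch() helper (imported from utils.connection in the __main__ blocks further down) is likewise not shown. A rough sketch of the assumed behavior, built on requests; the real helper may differ:

 import requests

 def fetch(url, requests_session=None, proxies=None, logger=None):
     # assumption only: return the page body as text, optionally via a shared session
     session = requests_session or requests
     if logger is not None:
         logger.debug("fetching %s", url)
     resp = session.get(url, proxies=proxies, timeout=30)
     resp.raise_for_status()
     return resp.text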
 def _generate_lastName(self):
     if self.parse_data.get("name"):
         self.lastName = HumanName(self.parse_data["name"]).last
     tmp = extract(name_rule, self.sec)
     if tmp is not None:
         # drop the "Faculty Directory" breadcrumb text before parsing the name
         self.lastName = HumanName(
             tmp.xpath('string(.)').replace('Faculty Directory', '')).last
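Several snippets split names with HumanName(...).last. That class is presumably nameparser.HumanName (an assumption; the imports are not included in these fragments). A quick usage sketch:

 from nameparser import HumanName  # assumed source of HumanName

 name = HumanName("Prof. Jane Q. Public")
 print(name.first)  # Jane
 print(name.last)   # Public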
 def _generate_email(self):
     info = extract(RULES["info"], self.sec, multi=True)
     if not info:
         self.email = None
         return
     text = info[-1].xpath('string(.)')
     if "Email" in text:
         # the last info block reads like "Email: someone@example.edu"
         parts = text.replace('Email', '').replace('Phone', '') \
             .replace('Office', '').strip().split(":")
         self.email = parts[1].strip() if len(parts) > 1 else None
     else:
         self.email = None
 def _generate_cooperation(self):
     tmp = extract(RULES["cooperation"], self.sec)
     # a single field may list several collaborators separated by ";"
     self.cooperation = tmp.split(";") if tmp is not None else []
 def _generate_keywords(self):
     tmp = extract(RULES["keywords"], self.sec)
     # likewise, keywords arrive as one comma-separated string
     self.keywords = tmp.split(",") if tmp is not None else []
Example 14
 def crawl_info(self):
     from CustomParser.cs_utexas_parser import CSUtexasClass
     html = fetch(self.base_url, logger=self.logger)
     sec = extract(RULES["item"], html, multi=True)
     for i in sec:
         if i is not None:
             tmp = CSUtexasClass(str(etree.tostring(i)))
             parm = tmp.set_value()
             tmp.terminal_monitoring()
             self.parm_queue.put_nowait(parm)
Example 15
 def _crawl_info(self, item_url):
     self.logger.info("processing info %s", item_url)
     from ScholarConfig.me_utexas_rule import RULES
     from CustomParser.me_utexas_parser import MeUtexasClass
     from lxml import etree
     html = fetch(item_url, proxies=None, logger=self.logger)
     sec = extract(RULES["item"], html, multi=True)
     for i in sec:
         tmp = MeUtexasClass(str(etree.tostring(i)))
         parm = tmp.set_value()
         tmp.terminal_monitoring()
         self.parm_queue.put_nowait(parm)
Example 16
    def _feed_info_queue(self, url):
        self.logger.info("processing page %s", url)

        html = fetch(url,
                     requests_session=self.requsts_session,
                     proxies=None,
                     logger=self.logger)
        item = extract(self.item_url_rule, html, multi=True)
        if not self.is_url_joint:
            for i in item:
                self.info_queue.put_nowait(i)
        else:
            for i in item:
                self.info_queue.put_nowait(self.default_url + i)
 def _generate_email(self):
     import re
     info = extract(RULES["info"], self.sec, multi=True)
     if len(info) > 1 and info[1] is not None:
         # self.sec is built with str(etree.tostring(...)), so line breaks
         # survive as the literal two characters "\n"
         lines = info[1].xpath('string(.)').split('\\n')
         if len(lines) > 1:
             match = re.search(r"([\w\.\-]+@[\w\.\-]+)", lines[1].strip())
             self.email = match.group(0) if match is not None else None
 def _generate_phone(self):
     info = extract(RULES["info"], self.sec, multi=True)
     if len(info) > 1 and info[1] is not None:
         # the second "\n"-separated line of the info block holds
         # comma-separated office / phone / email fields
         fields = info[1].xpath('string(.)').split('\\n')[1].strip().split(',')
         if len(fields) == 4:
             self.phone = fields[2]
         elif len(fields) == 3:
             if '@' not in fields[1]:
                 self.phone = fields[1]
         else:
             self.phone = None
 def _generate_title(self):
     info = extract(RULES["info"], self.sec, multi=True)
     if info and info[0] is not None:
         # the third "\n"-separated line of the first info block is the title
         self.title = info[0].xpath('string(.)').split('\\n')[2].strip()
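The split('\\n') calls above look like typos for splitting on a real newline, but they are probably deliberate: the section passed to these parsers is built with str(etree.tostring(element)) (see the crawl_info examples), and calling str() on bytes keeps each newline as the literal two characters backslash-n. A small illustration of that quirk:

 from lxml import etree

 el = etree.HTML("<div>line one\nline two</div>")
 raw = str(etree.tostring(el))  # repr of bytes: real newlines become "\n"
 print('\n' in raw)             # False
 print(raw.split('\\n')[1])     # starts with "line two"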
 def _generate_lastName(self):
     info = extract(RULES["info"], self.sec, multi=True)
     if info and info[0] is not None:
         name = extract("//strong/a/text()", str(etree.tostring(info[0])))
         if name is not None:
             self.lastName = HumanName(name).last
 def _generate_keywords(self):
     self.keywords = extract(RULES["keyword"], self.sec, multi=True)
    def _generate_city(self):
        pass
    def _generate_time(self):
        pass
    def _generate_keywordKeys(self):
        self.keywordKeys = [i for i in range(1, len(self.keywords) + 1)]
    def _generate_cityKeys(self):
        self.cityKeys = [i for i in range(1, len(self.city) + 1)]
    def _generate_timeKeys(self):
        # one 1-based key per entry in self.time
        self.timeKeys = [i for i in range(1, len(self.time) + 1)]

if __name__ == '__main__':
    from utils.connection import fetch
    html = fetch("https://www.cs.utexas.edu/faculty")
    sec = extract(RULES["item"], html, multi=True)
    for i in sec:
        if i is not None:
            tmp = extract(RULES["keyword"], str(etree.tostring(i)), multi=True)
            print(tmp)
        
        
        
 def _generate_email(self):
     if self.parse_data.get("email"):
         self.email = self.parse_data["email"]
     else:
         tmp = extract(email_rule, self.sec)
         if tmp is not None:
             # the rule matches a mailto: href, so strip the scheme prefix
             self.email = tmp.replace('mailto:', '')
Example 24
 def _generate_lastName(self):
     tmp = extract(RULES["name"], self.sec)
     if tmp is not None:
         self.lastName = HumanName(tmp).last
Example 25
            self.email = tmp.group()

    def _generate_website(self):
        pass
    
    def _generate_cooperation(self):
        pass
        
    def _generate_bio(self):
        pass

    def _generate_keywords(self):
        pass
        
    def _generate_city(self):
        pass
    def _generate_time(self):
        pass
    def _generate_keywordKeys(self):
        self.keywordKeys = [i for i in range(1, len(self.keywords) + 1)]
    def _generate_cityKeys(self):
        self.cityKeys = [i for i in range(1, len(self.city) + 1)]
    def _generate_timeKeys(self):
        # one 1-based key per entry in self.time
        self.timeKeys = [i for i in range(1, len(self.time) + 1)]

if __name__ == '__main__':
    from utils.connection import fetch
    html = fetch("http://www.me.berkeley.edu/people/faculty")
    a = extract(RULES["item_url"], html, multi=True)
    print(a)
Example 26
 def _generate_avatar(self):
     tmp = extract(avatar_rule, self.sec)
     if tmp is not None:
         # the rule yields a path relative to the people/ directory
         self.avatar = "http://www.me.umn.edu/people/{}".format(tmp)
Example 27
 def _generate_avatar(self):
     self.avatar = extract(RULES["avatar"], self.sec)
 def _generate_bio(self):
     if self.parse_data.get("bio"):
         self.bio = self.parse_data["bio"]
     if bio_rule:
         tmp = extract(bio_rule, self.sec)
         if tmp is not None:
             self.bio = tmp.xpath('string(.)')
Example 29
 def _generate_title(self):
     self.title = extract(RULES["title"], self.sec)
 def _generate_avatar(self):
     tmp = extract(avatar_rule, self.sec)
     if tmp is not None:
         # drop the "../../.." prefix and anchor the path to the site root
         self.avatar = "http://www.mccormick.northwestern.edu{}".format(
             tmp.replace('../../..', ''))