def parse(self, response):
        """Parse the faculty listing table and request every expert's page.

        Each row after the first one of the ``faculty-list`` table is read
        as three cells: name/profile link, department, expertise links.  The
        partially filled item is handed to ``parse_each_expert`` through
        ``request.meta['item']``.

        Rows with fewer than three cells, or without a profile link, are
        skipped so that a single malformed row cannot abort the generator
        and drop all remaining rows.
        """
        rows = response.xpath(
            '//div[contains(@class,"faculty-list")]/table//tr')
        # The first row is skipped (presumably the table header).
        for row in rows[1:]:
            cells = row.xpath('td')
            if len(cells) < 3:
                continue

            link = cells[0].xpath('a/@href').extract_first()
            if not link:
                # extract_first() returns None when the cell has no anchor;
                # without a link there is no page to request.
                continue
            if 'https:' not in link:
                link = "https://www.stonybrook.edu/" + link

            item = ExpertsExtractItem()

            name = cells[0].xpath('a/text()').extract_first()
            if name:
                item['name'] = name.strip()

            department = cells[1].xpath('text()').extract_first()
            if department:
                item['department'] = department

            expertise_links = cells[2].xpath('a/text()').extract()
            for i, expertise in enumerate(expertise_links, 1):
                item['areas_of_expertise_%s' % i] = expertise.strip()

            # Follow the profile link to extract the remaining fields.
            request = scrapy.Request(link, callback=self.parse_each_expert)
            request.meta['item'] = item
            yield request
Ejemplo n.º 2
0
    def parse(self, response):
        """Parse the ``directory-layout`` table and request each profile.

        Rows from the third one onwards are expected to hold exactly two
        cells: a contact block (name link, e-mail link, office phone) and a
        list of areas of expertise.  The partially filled item is passed to
        ``parse_each_expert`` through ``request.meta['item']``.

        Rows that do not have exactly two cells, or that have no directory
        link, are skipped instead of raising and aborting the generator.
        """
        rows = response.xpath('//table[@class="directory-layout"]//tr')
        # The first two rows are skipped (presumably header rows).
        for row in rows[2:]:
            cells = row.xpath('td')
            if len(cells) != 2:
                continue
            info_block, expertise_cell = cells

            link = info_block.xpath(
                'p/a[contains(@href, "directory")]/@href').extract_first()
            if not link:
                # extract_first() returns None when no directory link exists.
                continue
            if 'http://www.law.cuny.edu/faculty/' not in link:
                link = "http://www.law.cuny.edu/faculty/" + link

            item = ExpertsExtractItem()

            name = info_block.xpath(
                'p/a[contains(@href, "directory")]/text()').extract_first()
            if name:
                item['name'] = name.strip()

            email = info_block.xpath(
                'p/a[contains(@href, "mailto:")]/text()').extract_first()
            if email:
                item['email'] = email.strip()

            phone = info_block.xpath(
                'p[contains(text(), "Office")]/text()').extract_first()
            if phone:
                # Keep only what follows the "Office:" label.
                item['phone'] = phone.split('Office:')[-1].strip()

            expertise_texts = expertise_cell.xpath('ul/li/text()').extract()
            for i, expertise in enumerate(expertise_texts, 1):
                item['areas_of_expertise_%s' % i] = expertise.strip()

            # Follow the profile link to extract the remaining fields.
            request = scrapy.Request(link, callback=self.parse_each_expert)
            request.meta['item'] = item
            yield request
Ejemplo n.º 3
0
    def parse_each_expert(self, response):
        """Build an item from a single faculty profile page and yield it.

        Fields are only set when the matching node exists: ``name``,
        ``title_<n>``, ``phone``, ``email``, ``headshot``, ``biography`` and
        ``areas_of_expertise_<n>`` (both numbered from 1).
        ``faculty_page`` is always set to the response URL.
        """
        item = ExpertsExtractItem()
        item['faculty_page'] = response.url

        name = response.xpath('//div[@id="faculty-info"]/h1/text()').extract_first()
        if name:
            item['name'] = name

        title_count = 0
        for entry in response.xpath('//div[@id="faculty-info"]/ul/li'):
            text = entry.xpath('text()').extract_first()
            if text is None:
                # An <li> without a direct text node yields None; calling
                # .strip() on it used to raise AttributeError.
                continue
            title_count += 1
            item['title_%s' % title_count] = text.strip()

        # NOTE(review): "facluty-contact" is kept exactly as written; it may
        # mirror a misspelled id in the page markup - confirm before fixing.
        phone = response.xpath('//div[@id="facluty-contact"]//a[contains(@href, "tel:")]/text()').extract_first()
        if phone:
            item['phone'] = phone.strip()

        email = response.xpath('//div[@id="facluty-contact"]//a[contains(@href, "mailto:")]/text()').extract_first()
        if email:
            item['email'] = email.strip()

        headshot = response.xpath('//div[@class="region-photo"]//img/@src').extract_first()
        if headshot:
            item['headshot'] = 'https://its.law.nyu.edu' + headshot

        biography = response.xpath('//div[@id="full-bio-text"]/p/text()').extract_first()
        if biography:
            item['biography'] = biography

        expertise_text = response.xpath('//p[@id="expertise-text"]/text()').extract_first()
        if expertise_text:
            for i, value in enumerate(expertise_text.split(','), 1):
                # Strip the whitespace left over from the comma split.
                item['areas_of_expertise_%s' % i] = value.strip()

        yield item
    def parse_each_expert(self, response):
        """Build an item from a profile page, merging expertise from the listing.

        Reads ``response.meta['areas_of_expertise']`` (a comma/semicolon
        separated string) and, when the page has no masthead block,
        ``response.meta['name']`` for the diagnostic message.

        If the masthead (department) block is missing, a message is printed
        and nothing is yielded.  Otherwise one item is yielded with name,
        titles, department, contact details, headshot, merged areas of
        expertise and any "Additional Information" text.
        """
        item = ExpertsExtractItem()
        initial_expertise = [
            a.lower().strip()
            for a in re.split(r',|;', response.meta['areas_of_expertise'])
        ]

        # extract_first() returns None for a missing masthead; checking it
        # explicitly replaces a bare ``except:`` that also swallowed
        # unrelated errors (including KeyboardInterrupt).
        department = response.xpath(
            '//div[@class="unit-100 masthead-inner"]').xpath(
                'string()').extract_first()
        if department is None:
            # Single-argument print works on Python 2 and 3; the two spaces
            # reproduce the output of ``print 'EXCEPTION: ', name``.
            print('EXCEPTION:  %s' % (response.meta['name'],))
            return
        department = department.strip()

        name = response.xpath(
            '//h3[@id="bioName"]/strong/text()').extract_first()
        title_block = response.xpath(
            '//h3[@id="bioName"]/following-sibling::p[1]')
        for i, title in enumerate(title_block.xpath('span/text()').extract(), 1):
            item['title_%s' % i] = title.strip()

        item['name'] = name
        item['department'] = department
        # These three are stored even when missing (value None).
        item['phone'] = response.xpath(
            '//li[@class="tel"]/a/text()').extract_first()
        item['email'] = response.xpath(
            '//li[@class="email"]/a/text()').extract_first()
        item['website'] = response.xpath(
            '//li[@class="website"]/a/text()').extract_first()
        item['faculty_page'] = response.url

        headshot = response.xpath(
            '//img[@class="profile_photo"]/@src').extract_first()
        if headshot:
            item['headshot'] = headshot

        page_expertise = response.xpath(
            '//h4[text()="Areas of Expertise"]/following-sibling::p[1]/text()'
        ).extract_first()
        detailed_expertise = []
        if page_expertise:
            detailed_expertise = [
                a.lower().strip() for a in re.split(r',|;', page_expertise)
            ]
        # get_merged_expertise is defined outside this block; it combines
        # the listing-page and profile-page lists.
        merged_expertise = get_merged_expertise(initial_expertise,
                                                detailed_expertise)
        for i, expertise in enumerate(merged_expertise):
            # Indices 0..20 are kept, i.e. at most 21 entries.
            if i > 20:
                break
            item['areas_of_expertise_%s' % (i + 1)] = expertise

        # Every node that follows the "Additional Information" heading
        # contributes one non-empty, stripped line.
        info_nodes = response.xpath(
            '//*[preceding-sibling::h4[contains(text(), "Additional Information")]]'
        )
        info_lines = []
        for node in info_nodes:
            text = node.xpath('string()').extract_first()
            if text and text.strip():
                info_lines.append(text.strip())
        additional_info = ''.join(line + '\n' for line in info_lines)
        if additional_info:
            item['additional_info'] = additional_info
        yield item
    def parse_each_expert(self, response):
        """Build an item from a single expert page and yield it.

        Most fields come from the Scrapy response.  The e-mail address is
        read through ``self.browser`` (a Selenium-style driver) after
        loading the same URL.

        ``title``, ``department``, ``school_college``, ``campus`` and
        ``other`` are taken positionally from the text lines next to the
        name and are set to None when the corresponding line is absent.
        """
        item = ExpertsExtractItem()
        item['faculty_page'] = response.url

        name = response.xpath('//span[contains(@class, "sbex_person_name")]/text()').extract_first()
        if name:
            item['name'] = name.strip()

        raw_lines = response.xpath('//span[contains(@class, "sbex_person_name")]/../text()').extract()
        brief_info = [line.strip() for line in raw_lines if line.strip()]

        # Positional fields; ``title`` used to raise IndexError when the
        # page carried no text lines at all.
        brief_keys = ('title', 'department', 'school_college', 'campus', 'other')
        for position, key in enumerate(brief_keys):
            item[key] = brief_info[position] if len(brief_info) > position else None

        headshot = response.xpath('//span[contains(@class, "sbex_person_name")]/../preceding-sibling::td[1]/img/@src').extract_first()
        if headshot:
            item['headshot'] = headshot

        key_topics = ''.join(response.xpath('//strong[contains(text(), "Key Topics")]/following-sibling::text()').extract())
        if any(separator in key_topics for separator in (',', ';', '\n')):
            for j, value in enumerate(re.findall(r'([^,;\n]+)[,;\n]?', key_topics), 1):
                # Stop at the first blank chunk.
                if not value.strip():
                    break
                item['key_topics_%s' % j] = value.strip()
        else:
            item['key_topics_1'] = key_topics.strip()

        bio_section = response.xpath('//strong[contains(text(), "Expert\'s Biography")]/following-sibling::text()').extract()
        biography = ''.join(bio.replace('\n', '').strip() for bio in bio_section)
        if biography:
            item['biography'] = biography

        phone_section = response.xpath('//strong[contains(text(), "Office phone") or contains(text(), "Phone")]/following-sibling::text()').extract()
        phone_candidates = [text.replace('\n', '').strip() for text in phone_section if text.strip()]
        if phone_candidates:
            phone = phone_candidates[0].replace(':', '')
            item['phone'] = phone
            # Split case-insensitively: the previous code tested for
            # 'ext.' in lower case but split on the literal 'Ext.', which
            # raised ValueError for any other capitalisation.
            parts = re.split(r'(?i)ext\.', phone, 1)
            if len(parts) == 2:
                item['phone'] = parts[0].strip()
                item['extension'] = parts[1].strip()

        # NOTE(review): the e-mail is read through the browser session
        # rather than the Scrapy response, presumably because the address is
        # not present in the static HTML - confirm.
        self.browser.get(item['faculty_page'])
        # find_elements_* returns an empty list when nothing matches, whereas
        # find_element_* raises, which made the old ``if email:`` test moot.
        email_links = self.browser.find_elements_by_xpath('//strong[contains(text(), "Email")]/following-sibling::a[1]')
        if email_links:
            item['email'] = email_links[0].text

        personal_site = response.xpath('//strong[contains(text(), "Web Page")]/following-sibling::a[1]/text()').extract_first()
        if personal_site:
            item['personal_site'] = personal_site

        yield item
Ejemplo n.º 6
0
 def parse(self, response):
     """Request the profile page of every expert linked on the listing.

     The links are the anchors inside the first ``ul`` that follows the
     "Faculty Experts" heading.  Links without ``https:`` are prefixed with
     the albany.edu host.  Anchors without an ``href`` are skipped.
     """
     anchors = response.xpath('//h1[contains(text(),"Faculty Experts")]/following-sibling::ul[1]//a')
     for anchor in anchors:
         link = anchor.xpath('@href').extract_first()
         if not link:
             # extract_first() returns None for an anchor without href.
             continue
         if 'https:' not in link:
             link = 'https://www.albany.edu' + link

         yield scrapy.Request(link, callback=self.parse_each_expert)
Ejemplo n.º 7
0
    def parse_each_expert(self, response):
        """Build an item from a single expert profile page and yield it.

        The introduction paragraph (the one containing "Department: ") is
        split on line breaks into title, faculty and department.  Two
        experts whose paragraph has only two lines are special-cased by
        name.

        Raises:
            AttributeError: if the page has no ``h1`` name before the
                introduction paragraph, or no such paragraph at all
                (``extract_first()`` returns None).
            ValueError: if the introduction does not split into the number
                of lines expected for that expert.
        """
        item = ExpertsExtractItem()
        item['faculty_page'] = response.url

        intro_block = response.xpath('//p[contains(string(), "Department: ")]')
        name = response.xpath('//p[contains(string(), "Department: ")]/preceding-sibling::h1/text()').extract_first().strip()

        intro = intro_block.xpath('string()').extract_first().strip()
        intro_lines = re.split(r'[\n]+', intro)
        if 'Michele Caggana' in name:
            # This profile lists no faculty line.
            title, department = intro_lines
            faculty = None
        elif 'Lei Zhu' in name:
            # This profile lists no title line.
            faculty, department = intro_lines
            title = None
        else:
            title, faculty, department = intro_lines
        item['name'] = name
        item['title'] = title
        item['faculty'] = faculty
        item['department'] = department.replace('Department: ', '')

        expertise = response.xpath('//p[contains(string(), "Expertise:")]').xpath('string()').extract_first()
        if expertise is not None:
            expertise = expertise.replace('\n', '').replace('Expertise:', '').strip()
            # Semicolons win as separator when present, commas otherwise.
            separator = ';' if '; ' in expertise else ','
            for i, ex in enumerate(expertise.split(separator), 1):
                item['areas_of_expertise_%s' % i] = ex.strip()

        contact = response.xpath('//p[contains(string(), "Campus phone") or contains(string(), "Campus email")]').xpath('string()').extract_first()
        if contact is not None:
            contact = contact.strip()
            # The final digit belongs inside the group; the previous
            # pattern ``(.*)[0-9]`` dropped the last digit of the number.
            phone = re.findall(r'Campus phone:\s*(.*[0-9])', contact)
            email = re.findall(r'Campus email:\s*(.*)', contact)
            if phone:
                item['phone'] = phone[0]
            if email:
                item['email'] = email[0]

        bio_block = response.xpath('//p[contains(string(), "Biography:")]/following-sibling::p')
        item['biography_full'] = ' '.join(
            text.strip() for text in bio_block.xpath('string()').extract())

        info_area = response.xpath('//div[@id="faculty-info-area"]')
        headshot = info_area.xpath('img/@src').extract_first()
        if headshot:
            if 'https://www.albany.edu' not in headshot:
                headshot = 'https://www.albany.edu' + headshot
            item['headshot'] = headshot

        biography_intro = info_area.xpath('div/p[@id="faculty-banner-text"]/text()').extract_first()
        if biography_intro:
            item['biography_intro'] = biography_intro.strip()
        yield item
 def parse(self, response):
     """Request the profile page of every expert linked on the listing.

     Each anchor directly inside a ``collapseSectionContent`` block gives
     the expert's name (anchor text) and a link that is prefixed with the
     gse.harvard.edu host.  The item, holding ``name`` and
     ``faculty_page``, is passed on in ``request.meta['item']``.  Anchors
     without an ``href`` are skipped.
     """
     anchors = response.xpath('//div[@class="collapseSectionContent"]/a')
     for anchor in anchors:
         href = anchor.xpath('@href').extract_first()
         if not href:
             # extract_first() returns None for an anchor without href;
             # concatenating it to the host used to raise TypeError.
             continue
         link = 'https://www.gse.harvard.edu' + href

         item = ExpertsExtractItem()
         item['name'] = anchor.xpath('text()').extract_first()
         item['faculty_page'] = link

         # Follow the profile link to extract the remaining fields.
         request = scrapy.Request(link, callback=self.parse_each_expert)
         request.meta['item'] = item
         yield request
    def parse_each_name(self, response):
        """Collect a post's expertise bullets and follow its profile link.

        The bullet texts under the ``post-content`` container are stored as
        ``areas_of_expertise_<n>`` (numbered from 1).  The first ``p/a``
        link, or the first ``div/a`` link when there is none, is requested
        with ``parse_each_expert`` as callback unless it is missing or
        contains ``/news-releases/``.  The item travels in
        ``request.meta['item']``.
        """
        item = ExpertsExtractItem()
        post = response.xpath('//div[contains(@class, "post-content")]')

        position = 0
        for bullet in post.xpath('ul/li/text()').extract():
            position += 1
            item['areas_of_expertise_%s' % position] = bullet.strip()

        # Fall back to the div-wrapped link when no paragraph link exists.
        profile_url = (post.xpath('p/a/@href').extract_first()
                       or post.xpath('div/a/@href').extract_first())

        if not profile_url or '/news-releases/' in profile_url:
            return

        follow_up = scrapy.Request(profile_url,
                                   callback=self.parse_each_expert)
        follow_up.meta['item'] = item
        yield follow_up
 def parse(self, response):
     """Request the profile page of every expert linked in the main area.

     ``self.get_expertise_map(response)`` supplies a mapping whose keys are
     expertise labels and whose values are tested with ``name in value``.
     The labels that match an expert are passed on in
     ``request.meta['expertise_in_main_page']`` (only when there is at
     least one); the item with ``name`` and ``faculty_page`` goes in
     ``request.meta['item']``.  Anchors without an ``href`` are skipped.
     """
     expertise_map = self.get_expertise_map(response)
     anchors = response.xpath('//main[@id="main"]//li/a')
     for anchor in anchors:
         name = anchor.xpath('text()').extract_first()
         link = anchor.xpath('@href').extract_first()
         if not link:
             # extract_first() returns None for an anchor without href.
             continue
         if 'https://steinhardt.nyu.edu/' not in link:
             link = 'https://steinhardt.nyu.edu' + link

         item = ExpertsExtractItem()
         item['name'] = name
         item['faculty_page'] = link

         # Follow the profile link to extract the remaining fields.
         request = scrapy.Request(link, callback=self.parse_each_expert)
         # ``names`` deliberately differs from the outer selector list name:
         # Python 2 list comprehensions leak their loop variables.
         expertise_in_main_page = [
             label for label, names in expertise_map.items()
             if name in names
         ]
         if expertise_in_main_page:
             request.meta['expertise_in_main_page'] = expertise_in_main_page
         request.meta['item'] = item
         yield request
    def parse(self, response):
        """Parse the expandable expert list; yield items or profile requests.

        Every matching paragraph names one expert in one of three markups:
        ``strong/a``, plain ``strong`` (no link) or ``a/strong``.  The first
        ``ul`` that follows the paragraph is read positionally: first
        bullet is the expertise list ("EXPERTISE:" prefix, ``;`` separated),
        second bullet is the biography ("BACKGROUND:" prefix).

        Experts with a link are requested with ``parse_each_expert`` (item
        in ``request.meta['item']``); the others are yielded directly.
        """
        experts = response.xpath('//div[contains(@id, "aquadBoxnyuexpandables") and @class="answer"]/p[strong or a]')
        for expert in experts:
            item = ExpertsExtractItem()
            link = None
            if expert.xpath('strong/a'):
                name = expert.xpath('strong/a/text()').extract_first()
                link = expert.xpath('strong/a/@href').extract_first()
            elif expert.xpath('strong'):
                name = expert.xpath('strong/text()').extract_first()
            else:
                name = expert.xpath('a/strong/text()').extract_first()
                link = expert.xpath('a/@href').extract_first()

            if link and 'https://dental.nyu.edu' not in link:
                link = 'https://dental.nyu.edu' + link

            if name:
                # Keep only what precedes the first comma; name is None
                # when the expected text node is absent.
                item['name'] = name.split(',')[0]

            # Read the bullets positionally without assuming there are
            # exactly two of them (unpacking used to raise ValueError).
            details = expert.xpath('following-sibling::ul[1]/li').xpath('string()').extract()
            if details:
                areas_of_expertise = details[0].split('EXPERTISE:')[-1]
                for i, expertise in enumerate(areas_of_expertise.split(';'), 1):
                    item['areas_of_expertise_%s' % i] = expertise.strip()
            if len(details) > 1:
                item['biography'] = details[1].split('BACKGROUND:')[-1].strip()

            if link:
                # Follow the profile link to extract the remaining fields.
                request = scrapy.Request(link, callback=self.parse_each_expert)
                request.meta['item'] = item
                yield request
            else:
                yield item
Ejemplo n.º 12
0
    def parse(self, response):
        """Parse every ``m_expert`` block and request the expert's page.

        Name, titles, e-mail and the "SPECIALIZATIONS:" paragraph are read
        from the listing.  The request to ``parse_each_expert`` carries the
        item in ``meta['item']`` and, in ``meta['expertise_index']``, the
        next free ``areas_of_expertise_<n>`` number (1 when the listing
        supplied none).  Blocks without a profile link are skipped.
        """
        experts = response.xpath('//div[@class="m_expert"]')
        for expert in experts:
            link = expert.xpath('h2/a/@href').extract_first()
            if not link:
                # extract_first() returns None when the heading has no link.
                continue
            if 'https:' not in link:
                link = "https://www.gc.cuny.edu" + link

            item = ExpertsExtractItem()

            name = expert.xpath('h2/a/text()').extract_first()
            if name:
                item['name'] = name.strip()

            title_block = expert.xpath(
                'h2/following-sibling::p[1]/i/text()').extract()
            for i, title in enumerate(title_block, 1):
                item['title_%s' % i] = title.strip()

            email = expert.xpath(
                'h2/following-sibling::p[1]/a/text()').extract_first()
            if email:
                item['email'] = email.strip()

            # Counted separately from the title loop: the index passed on
            # below used to reuse whatever ``i`` was left over, which was
            # the title count (or a previous expert's value) whenever this
            # expert had no specializations paragraph.
            expertise_count = 0
            expertise_block = expert.xpath('h2/following-sibling::p[2]')
            if expertise_block:
                expertise = expertise_block.xpath('string()').extract_first()
                expertise = expertise.replace('SPECIALIZATIONS:', '').replace(
                    '\r', '').replace('\n', '').strip()
                for expertise_count, ex in enumerate(
                        re.split(';|,', expertise), 1):
                    item['areas_of_expertise_%s' % expertise_count] = ex.strip()

            # Follow the profile link to extract the remaining fields.
            request = scrapy.Request(link, callback=self.parse_each_expert)
            request.meta['item'] = item
            request.meta['expertise_index'] = expertise_count + 1
            yield request
    def parse_each_expert(self, response):
        """Build an item from a single expert page and yield it.

        Single-valued fields (name, title, phone, e-mail, twitter, personal
        site) are stored stripped, and only when found.  The biography joins
        the text of every ``bio`` block.  The areas of expertise come from
        the list that precedes the ``bio`` block or, without one, the
        ``link-to-bio`` block.  ``page_link`` is always the response URL.
        """
        item = ExpertsExtractItem()

        # (item key, XPath) pairs: the first match is stored, stripped.
        single_value_fields = (
            ('name',
             '//div/h1[@class="article-title"]/text()'),
            ('title',
             '//div/div[@class="title"]/text()'),
            ('phone',
             '//ul[@class="contact-info"]/li/a[@class="icon phone"]/text()'),
            ('email',
             '//ul[@class="contact-info"]/li/a[@class="icon email"]/text()'),
            ('twitter',
             '//ul[@class="contact-info"]/li/a[@class="icon twitter"]/text()'),
            ('personal_site',
             '//ul[@class="contact-info"]/li/a[@class="icon website"]/@href'),
        )
        for key, query in single_value_fields:
            found = response.xpath(query).extract_first()
            if found:
                item[key] = found.strip()

        bio_nodes = response.xpath('//div[@class="bio"]')
        if bio_nodes:
            bio_texts = bio_nodes.xpath('string()').extract()
            item['biography'] = " ".join(
                [text.strip() for text in bio_texts])
            topics_query = (
                '//div[@class="bio"]/preceding-sibling::div[1]/ul/li/a/text()')
        else:
            topics_query = (
                '//div[@class="link-to-bio"]/preceding-sibling::div[1]/ul/li/a/text()')

        position = 0
        for topic in response.xpath(topics_query).extract():
            position += 1
            item['areas_of_expertise_%s' % position] = topic

        bio_link = response.xpath(
            '//div[@class="link-to-bio"]/a/@href').extract_first()
        if bio_link:
            item['faculty_page'] = bio_link

        item['page_link'] = response.url

        # The photo URL is taken from an inline ``background-image:url(...)``.
        style_urls = response.xpath(
            '//div[contains(@style, "background-image:url")]/@style').re(
                r'.*?url\((.*?)\)')
        if style_urls:
            item['headshot'] = style_urls[0]

        yield item
Ejemplo n.º 14
0
    def parse_each_expert(self, response):
        item = ExpertsExtractItem()
        item['faculty_page'] = response.url

        name = response.xpath(
            '//h1[@class="page-title"]/text()').extract_first()
        if name:
            item['name'] = name

        title_block = response.xpath(
            '//div[contains(@class, "field-name-field-expert-professional-title")]'
        )
        if title_block:
            item['title'] = title_block.xpath('string()').extract_first()

        bio_block = response.xpath(
            '//div[contains(@class, "panel-pane pane-entity-field pane-node-body")]'
        )
        if bio_block:
            item['biography'] = bio_block.xpath(
                'string()').extract()[0].strip()

        expertise_block = response.xpath(
            '//div[@class="field-label" and contains(text(), "Expertise")]/following-sibling::div[1][@class="field-items"]/div'
        )
        expertise_list = list()
        for expertise in expertise_block:
            expertise_list.append(expertise.xpath('text()').extract_first())
        for i, expertise in enumerate(expertise_list):
            if i > 20:
                break
            item['areas_of_expertise_%s' % (i + 1)] = expertise

        department_block = response.xpath(
            '//div[@class="field-label" and contains(text(), "Department/College")]/following-sibling::div[1][@class="field-items"]/div'
        )
        if department_block:
            item['college'] = department_block[0].xpath(
                'text()').extract_first()
        if department_block and len(department_block) > 1:
            item['department'] = department_block[1].xpath(
                'text()').extract_first()
        if len(department_block) > 2:
            print 'MORE DEPARTMENT: ', response.url

        headshot = response.xpath(
            '//div[contains(@class, "field-name-field-contributor-photo field-type-image")]//img/@src'
        ).extract_first()
        if headshot:
            item['headshot'] = headshot

        email = response.xpath(
            '//div[contains(@class, "field-name-field-contributor-email")]//a/text()'
        ).extract_first()
        if email:
            item['email'] = email

        phone_block = response.xpath(
            '//div[contains(@class, "field-name-field-contributor-phone")]/div[@class="field-items"]/div'
        )
        if phone_block:
            item['office_contact'] = phone_block.xpath(
                'text()').extract_first()

        phone_block = response.xpath(
            '//div[contains(@class, "field-name-field-expert-mobile-phone")]/div[@class="field-items"]/div'
        )
        if phone_block:
            item['personal_contact'] = phone_block.xpath(
                'text()').extract_first()

        personal_site_block = response.xpath(
            '//div[contains(@class, "field-name-field-expert-homepage")]/div[@class="field-items"]/div/a'
        )
        if personal_site_block:
            item['personal_site'] = personal_site_block.xpath(
                '@href').extract_first()
        yield item
Ejemplo n.º 15
0
    def parse_each_expert(self, response):
        """Build an item from a single expert page and yield it.

        Name, title, college, areas of expertise, headshot and biography are
        set only when found.  ``faculty_page`` is always the response URL.

        The contact list is read in one of these layouts:

        * three or more entries: phone, e-mail link, website link;
        * fewer entries whose first one holds "phone ... e-mail" in a
          single string, optionally followed by a website link;
        * otherwise phone text in the first entry and, if present, an
          e-mail link in the second.

        A missing or unexpectedly short contact list leaves the contact
        fields unset instead of raising IndexError/ValueError.
        """
        item = ExpertsExtractItem()

        name = response.xpath(
            '//div[@class="title section"]/h1[@id="title_1" or @id="title"]/text()'
        ).extract_first()
        if name:
            item['name'] = name.strip()

        # The intro paragraph is the first or, failing that, the second
        # sibling that follows the heading's container.
        title_block = response.xpath(
            '//div[@class="title section"]/h1[@id="title_1" or @id="title"]/../following-sibling::div[1][@class="introtext text parbase section"]/p'
        )
        if not title_block:
            title_block = response.xpath(
                '//div[@class="title section"]/h1[@id="title_1" or @id="title"]/../following-sibling::div[2][@class="introtext text parbase section"]/p'
            )
        title = title_block.xpath('text()').extract_first()
        if title:
            item['title'] = title.strip()

        college = title_block.xpath('i/text()').extract_first()
        if college:
            item['college'] = college.strip()

        expertise_text = response.xpath(
            '//h3[contains(text(),"AREAS OF EXPERTISE")]/../following-sibling::div[1]/p'
        ).xpath('string()').extract_first()
        if expertise_text is not None:
            for i, expertise in enumerate(expertise_text.split(','), 1):
                # Strip the whitespace left over from the comma split.
                item['areas_of_expertise_%s' % i] = expertise.strip()

        headshot = response.xpath(
            '//div[contains(@class, "image-container")]//img/@src'
        ).extract_first()
        if headshot:
            item['headshot'] = 'http://www.buffalo.edu' + headshot

        bio_block = response.xpath(
            '//div[contains(@class, "image-container")]/following-sibling::div[1][@class="text parbase section"]'
        )
        if bio_block:
            item['biography'] = bio_block.xpath('string()').extract()[0]

        contact_block = response.xpath(
            '//div[@class="text parbase section"]/ul/li')
        if len(contact_block) >= 3:
            item['phone'] = contact_block[0].xpath('text()').extract_first()
            item['email'] = contact_block[1].xpath('a/text()').extract_first()
            item['personal_site'] = contact_block[2].xpath(
                'a/@href').extract_first()
        elif contact_block:
            # Phone and e-mail may share the first entry; .re() then
            # returns exactly the two captured groups.
            combined = contact_block[0].xpath('string()').re(
                r'.*?(\d+-\d+-\d+).*?(\S+@\S+)')
            website = contact_block[-1].xpath(
                'a[contains(text(), "website")]')
            if len(combined) == 2:
                item['phone'], item['email'] = combined
            elif not website:
                item['phone'] = contact_block[0].xpath(
                    'text()').extract_first()
                if len(contact_block) > 1:
                    item['email'] = contact_block[1].xpath(
                        'a/text()').extract_first()
            if website:
                # The last entry is the one that holds the website link.
                item['personal_site'] = contact_block[-1].xpath(
                    'a/@href').extract_first()

        item['faculty_page'] = response.url

        yield item