def parse(self, response):
    """Walk the faculty-list table and dispatch one request per expert row."""
    rows = response.xpath(
        '//div[contains(@class,"faculty-list")]/table//tr')
    # First row is the table header; skip it.
    for row in rows[1:]:
        item = ExpertsExtractItem()
        cells = row.xpath('td')
        name = cells[0].xpath('a/text()').extract_first()
        if name:
            item['name'] = name.strip()
        link = cells[0].xpath('a/@href').extract_first()
        if 'https:' not in link:
            link = "https://www.stonybrook.edu/" + link
        department = cells[1].xpath('text()').extract_first()
        if department:
            item['department'] = department
        areas = cells[2].xpath('a/text()').extract()
        for index, area in enumerate(areas, 1):
            item['areas_of_expertise_%s' % index] = area.strip()
        # Follow the profile link to collect the remaining fields.
        request = scrapy.Request(link, callback=self.parse_each_expert)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Iterate the faculty directory table and request each profile page."""
    rows = response.xpath('//table[@class="directory-layout"]//tr')
    # The first two rows are header rows.
    for row in rows[2:]:
        item = ExpertsExtractItem()
        info_block, expertise_cell = row.xpath('td')
        name = info_block.xpath(
            'p/a[contains(@href, "directory")]/text()').extract_first()
        if name:
            item['name'] = name.strip()
        link = info_block.xpath(
            'p/a[contains(@href, "directory")]/@href').extract_first()
        if 'http://www.law.cuny.edu/faculty/' not in link:
            link = "http://www.law.cuny.edu/faculty/" + link
        email = info_block.xpath(
            'p/a[contains(@href, "mailto:")]/text()').extract_first()
        if email:
            item['email'] = email.strip()
        phone = info_block.xpath(
            'p[contains(text(), "Office")]/text()').extract_first()
        if phone:
            # Keep only the digits after the "Office:" label.
            item['phone'] = phone.split('Office:')[-1].strip()
        areas = expertise_cell.xpath('ul/li/text()').extract()
        for index, area in enumerate(areas, 1):
            item['areas_of_expertise_%s' % index] = area.strip()
        # Visit the profile page to pick up the remaining fields.
        request = scrapy.Request(link, callback=self.parse_each_expert)
        request.meta['item'] = item
        yield request
def parse_each_expert(self, response):
    """Extract one expert's details from their profile page."""
    item = ExpertsExtractItem()
    item['faculty_page'] = response.url

    def first(query):
        # Convenience wrapper: first match of an XPath query, or None.
        return response.xpath(query).extract_first()

    name = first('//div[@id="faculty-info"]/h1/text()')
    if name:
        item['name'] = name
    titles = response.xpath('//div[@id="faculty-info"]/ul/li')
    for index, entry in enumerate(titles, 1):
        item['title_%s' % index] = entry.xpath('text()').extract_first().strip()
    # NOTE(review): "facluty" looks misspelled but is kept verbatim -- it may
    # mirror a typo in the site's own markup; confirm before "fixing" it.
    phone = first('//div[@id="facluty-contact"]//a[contains(@href, "tel:")]/text()')
    if phone:
        item['phone'] = phone.strip()
    email = first('//div[@id="facluty-contact"]//a[contains(@href, "mailto:")]/text()')
    if email:
        item['email'] = email.strip()
    headshot = first('//div[@class="region-photo"]//img/@src')
    if headshot:
        item['headshot'] = 'https://its.law.nyu.edu' + headshot
    biography = first('//div[@id="full-bio-text"]/p/text()')
    if biography:
        item['biography'] = biography
    expertise_text = first('//p[@id="expertise-text"]/text()')
    if expertise_text:
        # Comma-separated list; values are stored as-is (no strip).
        for index, value in enumerate(expertise_text.split(','), 1):
            item['areas_of_expertise_%s' % index] = value
    yield item
def parse_each_expert(self, response):
    """Parse one expert profile, merging the expertise carried over from the
    index page (via ``response.meta``) with the profile's own list.

    Fixes: the bare ``except:`` (which silently swallowed *every* error,
    including KeyboardInterrupt) is narrowed to the IndexError actually
    raised by ``extract()[0]`` on a missing masthead, and the Python-2-only
    ``print`` statement is replaced with a form valid on both 2 and 3.
    """
    item = ExpertsExtractItem()
    areas_of_expertise = response.meta['areas_of_expertise']
    initial_expertise = [
        a.lower().strip() for a in re.split(r',|;', areas_of_expertise)
    ]
    department = response.xpath('//div[@class="unit-100 masthead-inner"]')
    try:
        # extract() is empty when the page lacks the masthead block, which
        # raises IndexError on [0].
        department = department.xpath('string()').extract()[0].strip()
    except IndexError:
        # No department masthead on this profile -- skip the expert.
        print('EXCEPTION: %s' % response.meta['name'])
        return
    name = response.xpath(
        '//h3[@id="bioName"]/strong/text()').extract_first()
    title_block = response.xpath(
        '//h3[@id="bioName"]/following-sibling::p[1]')
    for index, title in enumerate(title_block.xpath('span/text()').extract()):
        item['title_%s' % (index + 1)] = title.strip()
    item['name'] = name
    item['department'] = department
    item['phone'] = response.xpath(
        '//li[@class="tel"]/a/text()').extract_first()
    item['email'] = response.xpath(
        '//li[@class="email"]/a/text()').extract_first()
    item['website'] = response.xpath(
        '//li[@class="website"]/a/text()').extract_first()
    item['faculty_page'] = response.url
    headshot = response.xpath(
        '//img[@class="profile_photo"]/@src').extract_first()
    if headshot:
        item['headshot'] = headshot
    areas_of_expertise = response.xpath(
        '//h4[text()="Areas of Expertise"]/following-sibling::p[1]/text()'
    ).extract_first()
    detailed_expertise = []
    if areas_of_expertise:
        detailed_expertise = [
            a.lower().strip() for a in re.split(r',|;', areas_of_expertise)
        ]
    merged_expertise = get_merged_expertise(initial_expertise,
                                            detailed_expertise)
    # Cap at 21 expertise fields (item schema limit, presumably).
    for index, expertise in enumerate(merged_expertise):
        if index > 20:
            break
        item['areas_of_expertise_%s' % (index + 1)] = expertise
    additional_info = ''
    info = response.xpath(
        '//*[preceding-sibling::h4[contains(text(), "Additional Information")]]'
    )
    for node in info:
        # Hoisted: the original evaluated string() twice per node.
        sub_info = node.xpath('string()').extract_first()
        if sub_info and sub_info.strip():
            additional_info += sub_info.strip() + '\n'
    if additional_info:
        item['additional_info'] = additional_info
    yield item
def parse_each_expert(self, response):
    """Scrape a Stony Brook expert profile page.

    Fixes over the previous version:
    - ``brief_info[0]`` no longer raises IndexError when the header block
      is empty;
    - the phone extension is split case-insensitively: the guard tested
      ``'ext.' in phone.lower()`` but then called the exact
      ``phone.split('Ext.')``, which raised ValueError for "ext."/"EXT.";
    - dead commented-out email code removed (the Selenium path is the live
      one, since the address is not present in the raw response).
    """
    item = ExpertsExtractItem()
    item['faculty_page'] = response.url
    name = response.xpath(
        '//span[contains(@class, "sbex_person_name")]/text()').extract_first()
    if name:
        item['name'] = name.strip()
    # Sibling text nodes of the name span, in page order:
    # title, department, school/college, campus, other.
    brief_info = [
        value.strip()
        for value in response.xpath(
            '//span[contains(@class, "sbex_person_name")]/../text()').extract()
        if value.strip()
    ]
    item['title'] = brief_info[0] if brief_info else None
    item['department'] = brief_info[1] if len(brief_info) > 1 else None
    item['school_college'] = brief_info[2] if len(brief_info) > 2 else None
    item['campus'] = brief_info[3] if len(brief_info) > 3 else None
    item['other'] = brief_info[4] if len(brief_info) > 4 else None
    headshot = response.xpath(
        '//span[contains(@class, "sbex_person_name")]/../preceding-sibling::td[1]/img/@src'
    ).extract_first()
    if headshot:
        item['headshot'] = headshot
    key_topics = ''.join(response.xpath(
        '//strong[contains(text(), "Key Topics")]/following-sibling::text()'
    ).extract())
    if any(separator in key_topics for separator in [',', ';', '\n']):
        # Split on any of , ; newline; stop at the first empty chunk.
        for j, value in enumerate(
                re.findall(r'([^,;\n]+)[,;\n]?', key_topics)):
            if not value.strip():
                break
            item['key_topics_%s' % (j + 1)] = value.strip()
    else:
        item['key_topics_1'] = key_topics.strip()
    bio_section = response.xpath(
        '//strong[contains(text(), "Expert\'s Biography")]/following-sibling::text()'
    ).extract()
    biography = ''.join(bio.replace('\n', '').strip() for bio in bio_section)
    if biography:
        item['biography'] = biography
    phone_section = response.xpath(
        '//strong[contains(text(), "Office phone") or contains(text(), "Phone")]/following-sibling::text()'
    ).extract()
    phones = [p.replace('\n', '').strip() for p in phone_section if p.strip()]
    if phones:
        phone = phones[0].replace(':', '')
        item['phone'] = phone
        if 'ext.' in phone.lower():
            # (?i) makes the split case-insensitive; maxsplit=1 guards
            # against a second "ext." appearing later in the text.
            phone, extension = [part.strip() for part in
                                re.split(r'(?i)ext\.', phone, 1)]
            item['phone'] = phone
            item['extension'] = extension
    # The e-mail address is rendered by JavaScript, so it is read through the
    # Selenium-driven browser rather than from the raw response body.
    self.browser.get(item['faculty_page'])
    email = self.browser.find_element_by_xpath(
        '//strong[contains(text(), "Email")]/following-sibling::a[1]')
    if email:
        item['email'] = email.text
    personal_site = response.xpath(
        '//strong[contains(text(), "Web Page")]/following-sibling::a[1]/text()'
    ).extract_first()
    if personal_site:
        item['personal_site'] = personal_site
    yield item
def parse(self, response):
    """Queue one profile request per link under the "Faculty Experts"
    heading.

    Fixes: removed the ``ExpertsExtractItem()`` that was created per link
    but never populated or attached to the request (dead code).
    """
    experts = response.xpath(
        '//h1[contains(text(),"Faculty Experts")]/following-sibling::ul[1]//a')
    for expert in experts:
        link = expert.xpath('@href').extract_first()
        # Relative hrefs need the site prefix.
        if 'https:' not in link:
            link = 'https://www.albany.edu' + link
        yield scrapy.Request(link, callback=self.parse_each_expert)
def parse_each_expert(self, response):
    """Scrape an Albany expert profile page.

    Fixes: the phone pattern was ``r'Campus phone:\s*(.*)[0-9]'`` -- the
    greedy group stopped before the trailing ``[0-9]``, silently dropping
    the LAST DIGIT of every phone number. The digit is now inside the group.
    """
    item = ExpertsExtractItem()
    item['faculty_page'] = response.url
    intro_block = response.xpath('//p[contains(string(), "Department: ")]')
    name = response.xpath(
        '//p[contains(string(), "Department: ")]/preceding-sibling::h1/text()'
    ).extract_first().strip()
    intro = intro_block.xpath('string()').extract_first().strip()
    # The intro block normally holds title / faculty / department on
    # separate lines; two specific profiles omit one of the three.
    if 'Michele Caggana' in name:
        title, department = re.split(r'[\n]+', intro)
        faculty = None
    elif 'Lei Zhu' in name:
        faculty, department = re.split(r'[\n]+', intro)
        title = None
    else:
        title, faculty, department = re.split(r'[\n]+', intro)
    department = department.replace('Department: ', '')
    item['name'] = name
    item['title'] = title
    item['faculty'] = faculty
    item['department'] = department
    expertise_block = response.xpath('//p[contains(string(), "Expertise:")]')
    expertise = expertise_block.xpath('string()').extract_first().replace(
        '\n', '').replace('Expertise:', '').strip()
    # Semicolon-delimited lists take precedence; otherwise split on commas.
    if '; ' in expertise:
        for i, ex in enumerate(re.split(r';', expertise), 1):
            item['areas_of_expertise_%s' % i] = ex.strip()
    else:
        for i, ex in enumerate(re.split(r',', expertise), 1):
            item['areas_of_expertise_%s' % i] = ex.strip()
    contact_block = response.xpath(
        '//p[contains(string(), "Campus phone") or contains(string(), "Campus email")]'
    )
    contact = contact_block.xpath('string()').extract_first().strip()
    # ``(.*[0-9])`` keeps the final digit inside the capture group.
    phone = re.findall(r'Campus phone:\s*(.*[0-9])', contact)
    email = re.findall(r'Campus email:\s*(.*)', contact)
    if phone:
        item['phone'] = phone[0]
    if email:
        item['email'] = email[0]
    bio_block = response.xpath(
        '//p[contains(string(), "Biography:")]/following-sibling::p')
    biography_full = ' '.join(
        i.strip() for i in bio_block.xpath('string()').extract())
    item['biography_full'] = biography_full
    info_area = response.xpath('//div[@id="faculty-info-area"]')
    headshot = info_area.xpath('img/@src').extract_first()
    if headshot:
        if 'https://www.albany.edu' not in headshot:
            headshot = 'https://www.albany.edu' + headshot
        item['headshot'] = headshot
    biography_intro = info_area.xpath(
        'div/p[@id="faculty-banner-text"]/text()').extract_first()
    if biography_intro:
        item['biography_intro'] = biography_intro.strip()
    yield item
def parse(self, response):
    """Queue a profile request for every expert in the collapsible list."""
    for anchor in response.xpath('//div[@class="collapseSectionContent"]/a'):
        # Hrefs are site-relative; prepend the host unconditionally.
        link = 'https://www.gse.harvard.edu' + anchor.xpath('@href').extract_first()
        item = ExpertsExtractItem()
        item['name'] = anchor.xpath('text()').extract_first()
        item['faculty_page'] = link
        # The profile page supplies the rest of the fields.
        request = scrapy.Request(link, callback=self.parse_each_expert)
        request.meta['item'] = item
        yield request
def parse_each_name(self, response):
    """Read the expertise bullets from a post and follow its profile link."""
    item = ExpertsExtractItem()
    content = response.xpath('//div[contains(@class, "post-content")]')
    for index, area in enumerate(content.xpath('ul/li/text()').extract(), 1):
        item['areas_of_expertise_%s' % index] = area.strip()
    # The profile link normally sits in a <p>; some posts wrap it in a <div>.
    link = (content.xpath('p/a/@href').extract_first()
            or content.xpath('div/a/@href').extract_first())
    # Only real profile pages get a follow-up; press-release links (and
    # posts with no link at all) are dropped.
    if link and '/news-releases/' not in link:
        request = scrapy.Request(link, callback=self.parse_each_expert)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Map each expert link on the index page to a profile request,
    attaching any expertise already listed on the index page via meta.

    Fixes: uses ``dict.items()`` (``iteritems()`` is Python-2-only), and
    renames the comprehension variable, which previously shadowed -- and,
    under Python 2's leaking list-comprehension scope, rebound -- the outer
    ``experts`` list.
    """
    expertise_map = self.get_expertise_map(response)
    experts = response.xpath('//main[@id="main"]//li/a')
    for expert in experts:
        item = ExpertsExtractItem()
        name = expert.xpath('text()').extract_first()
        link = expert.xpath('@href').extract_first()
        if 'https://steinhardt.nyu.edu/' not in link:
            link = 'https://steinhardt.nyu.edu' + link
        item['name'] = name
        item['faculty_page'] = link
        # Make a request to the actual link to extract the other info.
        request = scrapy.Request(link, callback=self.parse_each_expert)
        expertise_in_main_page = [
            expertise for expertise, names in expertise_map.items()
            if name in names
        ]
        if expertise_in_main_page:
            request.meta['expertise_in_main_page'] = expertise_in_main_page
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Parse the experts accordion: yield an item inline when no profile
    link exists, otherwise follow the link for the remaining fields."""
    experts = response.xpath(
        '//div[contains(@id, "aquadBoxnyuexpandables") and @class="answer"]/p[strong or a]')
    for expert in experts:
        item = ExpertsExtractItem()
        link = None
        # Markup varies: <strong><a>, bare <strong>, or <a><strong>.
        if expert.xpath('strong/a'):
            name = expert.xpath('strong/a/text()').extract_first()
            link = expert.xpath('strong/a/@href').extract_first()
        elif expert.xpath('strong'):
            name = expert.xpath('strong/text()').extract_first()
        else:
            name = expert.xpath('a/strong/text()').extract_first()
            link = expert.xpath('a/@href').extract_first()
        if link and 'https://dental.nyu.edu' not in link:
            link = 'https://dental.nyu.edu' + link
        # Drop credentials after the comma ("Jane Doe, DDS" -> "Jane Doe").
        item['name'] = name.split(',')[0]
        # The following <ul> holds two bullets: expertise then background.
        bio_bullets = expert.xpath('following-sibling::ul[1]/li')
        expertise_text, background_text = bio_bullets.xpath('string()').extract()
        expertise_text = expertise_text.split('EXPERTISE:')[-1]
        background_text = background_text.split('BACKGROUND:')[-1]
        for index, area in enumerate(expertise_text.split(';'), 1):
            item['areas_of_expertise_%s' % index] = area.strip()
        item['biography'] = background_text.strip()
        if link:
            # Follow the profile link to extract the other info.
            request = scrapy.Request(link, callback=self.parse_each_expert)
            request.meta['item'] = item
            yield request
        else:
            yield item
def parse(self, response):
    """Parse the experts listing; pass each item plus the next free
    expertise index to the profile callback.

    Fixes: ``expertise_index`` was computed from the loop variable ``i``,
    which -- when the expertise block was missing -- was either a stale
    index left over from the *title* loop or entirely undefined (NameError).
    A dedicated counter now always reflects the number of expertise fields
    actually written. Dead commented-out ``yield item`` removed.
    """
    experts = response.xpath('//div[@class="m_expert"]')
    for expert in experts:
        item = ExpertsExtractItem()
        name = expert.xpath('h2/a/text()').extract_first()
        if name:
            item['name'] = name.strip()
        link = expert.xpath('h2/a/@href').extract_first()
        if 'https:' not in link:
            link = "https://www.gc.cuny.edu" + link
        title_block = expert.xpath(
            'h2/following-sibling::p[1]/i/text()').extract()
        for i, title in enumerate(title_block, 1):
            item['title_%s' % i] = title.strip()
        email = expert.xpath(
            'h2/following-sibling::p[1]/a/text()').extract_first()
        if email:
            item['email'] = email.strip()
        expertise_count = 0
        expertise_block = expert.xpath('h2/following-sibling::p[2]')
        if expertise_block:
            expertise = expertise_block.xpath('string()').extract_first()
            expertise = expertise.replace('SPECIALIZATIONS:', '').replace(
                '\r', '').replace('\n', '').strip()
            for expertise_count, ex in enumerate(
                    re.split(';|,', expertise), 1):
                item['areas_of_expertise_%s' % expertise_count] = ex.strip()
        # Follow the profile link to extract the other info; the callback
        # continues numbering expertise fields from expertise_index.
        request = scrapy.Request(link, callback=self.parse_each_expert)
        request.meta['item'] = item
        request.meta['expertise_index'] = expertise_count + 1
        yield request
def parse_each_expert(self, response):
    """Scrape a single expert profile: identity, contacts, bio, expertise."""
    item = ExpertsExtractItem()

    def first(query):
        # First match of an XPath query against the page, or None.
        return response.xpath(query).extract_first()

    name = first('//div/h1[@class="article-title"]/text()')
    if name:
        item['name'] = name.strip()
    title = first('//div/div[@class="title"]/text()')
    if title:
        item['title'] = title.strip()
    phone = first('//ul[@class="contact-info"]/li/a[@class="icon phone"]/text()')
    if phone:
        item['phone'] = phone.strip()
    email = first('//ul[@class="contact-info"]/li/a[@class="icon email"]/text()')
    if email:
        item['email'] = email.strip()
    twitter = first('//ul[@class="contact-info"]/li/a[@class="icon twitter"]/text()')
    if twitter:
        item['twitter'] = twitter.strip()
    personal_site = first('//ul[@class="contact-info"]/li/a[@class="icon website"]/@href')
    if personal_site:
        item['personal_site'] = personal_site.strip()
    bio_block = response.xpath('//div[@class="bio"]')
    if bio_block:
        item['biography'] = " ".join(
            fragment.strip()
            for fragment in bio_block.xpath('string()').extract())
    # The expertise list sits just before the bio (full profile) or just
    # before the link-to-bio teaser (short profile).
    if bio_block:
        areas = response.xpath(
            '//div[@class="bio"]/preceding-sibling::div[1]/ul/li/a/text()'
        ).extract()
    else:
        areas = response.xpath(
            '//div[@class="link-to-bio"]/preceding-sibling::div[1]/ul/li/a/text()'
        ).extract()
    for index, area in enumerate(areas, 1):
        item['areas_of_expertise_%s' % index] = area
    faculty_page = first('//div[@class="link-to-bio"]/a/@href')
    if faculty_page:
        item['faculty_page'] = faculty_page
    item['page_link'] = response.url
    # The headshot is an inline CSS background-image; pull its URL out.
    headshot = response.xpath(
        '//div[contains(@style, "background-image:url")]/@style').re(
            r'.*?url\((.*?)\)')
    if headshot:
        item['headshot'] = headshot[0]
    yield item
def parse_each_expert(self, response):
    """Scrape an expert profile rendered from Drupal-style field divs.

    Fields are pulled from ``field-name-field-*`` containers; each value is
    stored on the item only when its selector matched something.
    """
    item = ExpertsExtractItem()
    item['faculty_page'] = response.url
    name = response.xpath(
        '//h1[@class="page-title"]/text()').extract_first()
    if name:
        item['name'] = name
    title_block = response.xpath(
        '//div[contains(@class, "field-name-field-expert-professional-title")]'
    )
    if title_block:
        # string() flattens any nested markup inside the title field.
        item['title'] = title_block.xpath('string()').extract_first()
    bio_block = response.xpath(
        '//div[contains(@class, "panel-pane pane-entity-field pane-node-body")]'
    )
    if bio_block:
        item['biography'] = bio_block.xpath(
            'string()').extract()[0].strip()
    expertise_block = response.xpath(
        '//div[@class="field-label" and contains(text(), "Expertise")]/following-sibling::div[1][@class="field-items"]/div'
    )
    expertise_list = list()
    for expertise in expertise_block:
        expertise_list.append(expertise.xpath('text()').extract_first())
    # Cap at 21 expertise fields -- presumably the item schema's limit.
    for i, expertise in enumerate(expertise_list):
        if i > 20:
            break
        item['areas_of_expertise_%s' % (i + 1)] = expertise
    department_block = response.xpath(
        '//div[@class="field-label" and contains(text(), "Department/College")]/following-sibling::div[1][@class="field-items"]/div'
    )
    # First entry is the college, second (when present) the department.
    if department_block:
        item['college'] = department_block[0].xpath(
            'text()').extract_first()
    if department_block and len(department_block) > 1:
        item['department'] = department_block[1].xpath(
            'text()').extract_first()
        if len(department_block) > 2:
            # Diagnostic only: flags pages with more entries than we store.
            # (Python 2 print statement -- kept verbatim.)
            print 'MORE DEPARTMENT: ', response.url
    headshot = response.xpath(
        '//div[contains(@class, "field-name-field-contributor-photo field-type-image")]//img/@src'
    ).extract_first()
    if headshot:
        item['headshot'] = headshot
    email = response.xpath(
        '//div[contains(@class, "field-name-field-contributor-email")]//a/text()'
    ).extract_first()
    if email:
        item['email'] = email
    phone_block = response.xpath(
        '//div[contains(@class, "field-name-field-contributor-phone")]/div[@class="field-items"]/div'
    )
    if phone_block:
        # Office landline.
        item['office_contact'] = phone_block.xpath(
            'text()').extract_first()
    phone_block = response.xpath(
        '//div[contains(@class, "field-name-field-expert-mobile-phone")]/div[@class="field-items"]/div'
    )
    if phone_block:
        # Mobile number.
        item['personal_contact'] = phone_block.xpath(
            'text()').extract_first()
    personal_site_block = response.xpath(
        '//div[contains(@class, "field-name-field-expert-homepage")]/div[@class="field-items"]/div/a'
    )
    if personal_site_block:
        item['personal_site'] = personal_site_block.xpath(
            '@href').extract_first()
    yield item
def parse_each_expert(self, response):
    """Scrape a Buffalo expert profile page.

    The contact block varies between profiles; a four-way branch below
    handles the observed layouts. The branch ORDER matters -- do not
    reorder without re-checking each case.
    """
    item = ExpertsExtractItem()
    # The page title carries id "title_1" on some templates, "title" on others.
    name = response.xpath(
        '//div[@class="title section"]/h1[@id="title_1" or @id="title"]/text()'
    ).extract_first()
    if name:
        item['name'] = name.strip()
    # The intro text is usually the first sibling div after the title...
    title_block = response.xpath(
        '//div[@class="title section"]/h1[@id="title_1" or @id="title"]/../following-sibling::div[1][@class="introtext text parbase section"]/p'
    )
    if not title_block:
        # ...but on some templates it is the second sibling.
        title_block = response.xpath(
            '//div[@class="title section"]/h1[@id="title_1" or @id="title"]/../following-sibling::div[2][@class="introtext text parbase section"]/p'
        )
    title = title_block.xpath('text()').extract_first()
    if title:
        item['title'] = title.strip()
    # The college appears italicised inside the same intro paragraph.
    college = title_block.xpath('i/text()').extract_first()
    if college:
        item['college'] = college.strip()
    areas_of_expertise_block = response.xpath(
        '//h3[contains(text(),"AREAS OF EXPERTISE")]/../following-sibling::div[1]/p'
    )
    if areas_of_expertise_block:
        # Comma-separated list; values are stored without stripping.
        areas_of_expertise = areas_of_expertise_block.xpath(
            'string()').extract_first().split(',')
        for i, expertise in enumerate(areas_of_expertise, 1):
            item['areas_of_expertise_%s' % i] = expertise
    headshot = response.xpath(
        '//div[contains(@class, "image-container")]//img/@src'
    ).extract_first()
    if headshot:
        # Image srcs are site-relative.
        item['headshot'] = 'http://www.buffalo.edu' + headshot
    bio_block = response.xpath(
        '//div[contains(@class, "image-container")]/following-sibling::div[1][@class="text parbase section"]'
    )
    if bio_block:
        item['biography'] = bio_block.xpath('string()').extract()[0]
    contact_block = response.xpath(
        '//div[@class="text parbase section"]/ul/li')
    # Layout 1: three bullets -- phone, email link, website link.
    if len(contact_block) >= 3:
        item['phone'] = contact_block[0].xpath('text()').extract_first()
        item['email'] = contact_block[1].xpath('a/text()').extract_first()
        item['personal_site'] = contact_block[2].xpath(
            'a/@href').extract_first()
    # Layout 2: phone+email share the first bullet; last bullet links a
    # "website". NOTE(review): raises IndexError if contact_block is empty
    # -- presumably every profile has at least one bullet; confirm.
    elif contact_block[-1].xpath('a[contains(text(), "website")]'):
        item['phone'], item['email'] = contact_block[0].xpath(
            'string()').re(r'.*?(\d+-\d+-\d+).*?(\S+@\S+)')
        item['personal_site'] = contact_block[1].xpath(
            'a/@href').extract_first()
    # Layout 3: phone and email both embedded in the first bullet's text.
    elif len(contact_block[0].xpath('string()').re(
            r'.*?(\d+-\d+-\d+).*?(\S+@\S+)')) == 2:
        item['phone'], item['email'] = contact_block[0].xpath(
            'string()').re(r'.*?(\d+-\d+-\d+).*?(\S+@\S+)')
    # Layout 4 (fallback): phone bullet then email-link bullet.
    else:
        item['phone'] = contact_block[0].xpath('text()').extract_first()
        item['email'] = contact_block[1].xpath('a/text()').extract_first()
    item['faculty_page'] = response.url
    yield item