def parse_list(self, response: Response):
    # wait for page to load
    # wait for the redirect to finish.
    patent_links = []
    for link in response.xpath(
            "//div[@class='view-content']/div[contains(@class,'views-row')]/div/h3/a"):
        text = link.xpath("text()").get()
        url = link.xpath("@href").get()
        self.log("find technology {}/{}".format(text, url), level=logging.INFO)
        patent_links.append(url)
    # for next page
    next_page = response.xpath("//li[@class='pager-next']/a/@href").get()
    if next_page is not None:
        self.log('process page {}'.format(next_page), level=logging.INFO)
        yield response.follow(
            url=next_page,
            callback=self.parse_list,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)
    for p in patent_links:
        name = self.parse_name_from_url(p)
        if os.path.exists(os.path.join(self.work_directory, name + '.json')):
            self.log('{} already parsed and will skip'.format(p), level=logging.INFO)
            continue
        yield response.follow(
            url=p,
            callback=self.parse,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)
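# parse_name_from_url is called above but not defined in these snippets. A plausible
# (hypothetical) helper that derives the cache file name from a technology URL; the
# real method may normalize names differently.
from urllib.parse import urlparse


def parse_name_from_url(self, url: str) -> str:
    # keep the last non-empty path segment and drop any query string,
    # e.g. https://example.edu/technologies/1234-foo/ -> 1234-foo
    path = urlparse(url).path
    segments = [s for s in path.split('/') if s]
    return segments[-1] if segments else path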
def parse_list(self, response: Response):
    # wait for page to load
    # wait for the redirect to finish.
    patent_links = []
    for link in response.xpath('//*[@id="formTechPub1"]/div/table[2]/tr/td/a'):
        text = link.xpath("text()").get()
        url = link.xpath("@href").get()
        self.log("find technology {}/{}".format(text, url), level=logging.INFO)
        patent_links.append(url)
    # for next page
    total_result = self.statictics(response)
    self.page += 1
    if self.page * self.item_per_page < total_result:
        self.log('process page {}'.format(self.page), level=logging.INFO)
        yield response.follow(
            url=self.next_page_template.format(self.page),
            callback=self.parse_list,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)
    for p in patent_links:
        name = self.parse_name_from_url(p)
        if os.path.exists(os.path.join(self.work_directory, name + '.json')):
            self.log('{} already parsed and will skip'.format(p), level=logging.INFO)
            continue
        yield response.follow(
            url=p,
            callback=self.parse,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)
def parse_list(self, response: Response):
    # wait for page to load
    # wait for the redirect to finish.
    patent_links = []
    for link in response.xpath("//a[@class='lead-in']/@href").getall():
        patent_links.append(link)
    # for next page
    next_page = response.xpath("//div[@class='nav-previous']/a/@href").get()
    if next_page is not None:
        self.log('process page {}'.format(next_page), level=logging.INFO)
        yield response.follow(
            url=next_page,
            callback=self.parse_list,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)
    for p in patent_links:
        name = p.split('/')[-2]
        if os.path.exists(os.path.join(self.work_directory, name + '.json')):
            self.log('{} already parsed and will skip'.format(p), level=logging.INFO)
            continue
        yield response.follow(
            url=p,
            callback=self.parse,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)
def parse_list(self, response: Response):
    # wait for page to load
    # wait for the redirect to finish.
    patent_links = []
    for link in response.xpath("//h4[@class='result-title']/a"):
        text = link.xpath("text()").get()
        url = link.xpath("@href").get()
        self.log("find technology {}/{}".format(text, url), level=logging.INFO)
        patent_links.append(url)
    # for next page
    current_page, total_page = self.statictics(response)
    if current_page < total_page:
        # self.page is not defined in this spider; log the page we are about to request
        self.log('process page {}'.format(current_page + 1), level=logging.INFO)
        yield response.follow(
            url='https://otd.harvard.edu/explore-innovation/technologies/results/P{}/'.format(current_page * 10),
            callback=self.parse_list,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)
    for p in patent_links:
        name = self.parse_name_from_url(p)
        if os.path.exists(os.path.join(self.work_directory, name + '.json')):
            self.log('{} already parsed and will skip'.format(p), level=logging.INFO)
            continue
        yield response.follow(
            url=p,
            callback=self.parse,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)
def run(self):
    """The main function"""
    while self.running.is_set():
        proxy = requests.get(self.api_url).content.decode().split(self.token)
        logging.debug('add new proxy {} to pool'.format(proxy))
        for p in proxy:
            POOL.add('http://' + p[:-1], 300)
        time.sleep(self.delay_second)
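# POOL itself is not defined in these snippets. A minimal, thread-safe sketch that is
# consistent with the calls used here (add(proxy, ttl), get(), pop(), remove(proxy))
# could look like the class below; the real pool may validate, score, or persist
# proxies differently, and here get() simply returns None when the pool is empty.
import random
import threading
import time


class ProxyPool:
    def __init__(self):
        self._lock = threading.Lock()
        self._expiry = {}  # proxy url -> absolute expiry timestamp

    def add(self, proxy: str, ttl_second: int):
        # register a proxy that stays valid for ttl_second seconds
        with self._lock:
            self._expiry[proxy] = time.time() + ttl_second

    def _alive(self):
        now = time.time()
        return [p for p, t in self._expiry.items() if t > now]

    def get(self):
        # return a random live proxy, shared with other requests
        with self._lock:
            alive = self._alive()
            return random.choice(alive) if alive else None

    def pop(self):
        # return a live proxy and remove it from the pool, for exclusive use
        with self._lock:
            alive = self._alive()
            if not alive:
                return None
            proxy = random.choice(alive)
            del self._expiry[proxy]
            return proxy

    def remove(self, proxy: str):
        # drop a proxy that failed or returned bad data
        with self._lock:
            self._expiry.pop(proxy, None)


POOL = ProxyPool()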
def extract_page(self, response):
    arguments = self.parse_url(response.request.url)
    self.log('Process page {}'.format(response.request.url), level=logging.INFO)
    with open(
            os.path.join(
                self.work_directory,
                '{}_{}_{}_{}.html'.format(
                    arguments['province'][0], arguments['round'][0],
                    arguments['industry'][0], arguments.get('p', ['1'])[0])),
            'wb') as fo:
        fo.write(response.body)
    row = response.xpath("//table[@class='ntable']/tr")
    if len(row) < 1:
        self.log('page {} has no data'.format(response.request.url), level=logging.WARNING)
        POOL.remove(response.request.meta['proxy'])
        return
    first_row = True
    result = []
    for r in row:
        if first_row:
            # skip the header row
            first_row = False
            continue
        image = r.xpath('td[1]/img/@src').extract_first()
        project_name = r.xpath('td[2]/a/text()').extract_first()
        project_url = r.xpath('td[2]/a/@href').extract_first()  # the anchor carries href, not src
        investor = r.xpath('td[3]/text()').extract_first()
        stage = r.xpath('td[4]/text()').extract_first()
        invest_time = r.xpath('td[5]/text()').extract_first()
        result.append({
            'image': image,
            'project': {'url': project_url, 'name': project_name},
            'investor': investor,
            'stage': stage,
            'time': invest_time})
        self.log('find investment from {} to {}'.format(investor, project_name), level=logging.INFO)
    # save the file
    with open(
            os.path.join(
                self.work_directory,
                '{}_{}_{}_{}.json'.format(
                    arguments['province'][0], arguments['round'][0],
                    arguments['industry'][0], arguments.get('p', ['1'])[0])),
            'w') as fo:
        json.dump(result, fo, ensure_ascii=False)
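# parse_url above returns a dict of lists keyed by query parameter (note the
# arguments['province'][0] and arguments.get('p', ['1'])[0] accesses), which is
# exactly what urllib.parse.parse_qs produces. A plausible (hypothetical) implementation:
from urllib.parse import urlparse, parse_qs


def parse_url(self, url: str) -> dict:
    # e.g. ...?province=bj&round=a&industry=it&p=2 ->
    # {'province': ['bj'], 'round': ['a'], 'industry': ['it'], 'p': ['2']}
    return parse_qs(urlparse(url).query)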
def start_requests(self):
    for url in self.start_urls:
        for industry in self.industry:
            for stage in self.stage:
                for year in self.year:
                    yield scrapy.Request(
                        url=url,
                        dont_filter=True,
                        callback=self.apply_filter,
                        meta={
                            'proxy': POOL.get() if not self.exclusive else POOL.pop(),
                            'extra': {'industry': industry, 'stage': stage, 'year': year}},
                        errback=self.handle_failure)
def parse_list(self, response: Response):
    driver = self.get_driver(response)
    # wait for page to load
    # wait for the redirect to finish.
    patent_links = []
    if os.path.exists(os.path.join(self.work_directory, 'links.json')):
        patent_links = json.load(open(os.path.join(self.work_directory, 'links.json'), 'r'))
    else:
        while True:
            time.sleep(1)
            self.wait_for_element(
                driver,
                "//div[@id='ctl00_ContentPlaceHolder1_UpdateProgress1' and @style='display: block;']")
            table = driver.find_element_by_xpath("//div[@class='table-body']")
            for r in table.find_elements_by_xpath("div"):
                cols = r.find_elements_by_xpath("div")
                patent = cols[2].find_element_by_xpath('p/a')
                abstract = cols[2].find_element_by_xpath('div/p')
                patent_links.append({
                    'name': patent.text,
                    'link': patent.get_attribute('href'),
                    'abstract': abstract.text})
                self.log('Found technology {}'.format(patent.text), level=logging.INFO)
            if not self.next_page(driver):
                break
            time.sleep(3)
        with open(os.path.join(self.work_directory, 'links.json'), 'w') as fo:
            json.dump(patent_links, fo)
    for p in patent_links:
        name = p['link'].split('/')[-1]
        # replace the trailing four-character extension with 'json' (e.g. foo.html -> foo.json)
        if os.path.exists(os.path.join(self.work_directory, name[:-4] + 'json')):
            self.log('{} already parsed and will skip'.format(p['link']), level=logging.INFO)
            continue
        yield Request(
            url=p['link'],
            callback=self.parse,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)
def parse_list(self, response: Response):
    # wait for page to load
    # wait for the redirect to finish.
    patent_links = []
    for link in response.xpath("//ul[@id='tech-licensing']/li/a"):
        text = link.xpath("text()").get()
        url = link.xpath("@href").get()
        if url is None:
            continue
        self.log("find technology {}/{}".format(text, url), level=logging.INFO)
        patent_links.append(url)
    for p in patent_links:
        name = self.parse_name_from_url(p)
        if os.path.exists(os.path.join(self.work_directory, name + '.json')):
            self.log('{} already parsed and will skip'.format(p), level=logging.INFO)
            continue
        yield response.follow(
            url=p,
            callback=self.parse,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)
def start_requests(self):
    for url in self.start_urls:
        yield SeleniumRequest(
            url=url,
            dont_filter=True,
            callback=self.apply_filter,
            meta={'proxy': POOL.get()},
            errback=self.handle_failure_selenium)
def parse_list(self, response: Response):
    # wait for page to load
    # wait for the redirect to finish.
    patent_links = []
    if os.path.exists(os.path.join(self.work_directory, 'links.json')):
        patent_links = json.load(open(os.path.join(self.work_directory, 'links.json'), 'r'))
    else:
        # the id of each product is provided in the <script></script>
        for code in response.xpath("//script").getall():
            if 'id_list' in code:
                ids = re.findall(r'[0-9]+', re.findall(r'\[[0-9,]+\]', code)[0])
                patent_links = [response.url + '/public/project/{}'.format(patentId) for patentId in ids]
        with open(os.path.join(self.work_directory, 'links.json'), 'w') as fo:
            json.dump(patent_links, fo)
    for p in patent_links:
        name = p.split('/')[-1]
        if os.path.exists(os.path.join(self.work_directory, name + '.json')):
            self.log('{} already parsed and will skip'.format(p), level=logging.INFO)
            continue
        yield response.follow(
            url=p,
            callback=self.parse,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)
def start_requests(self):
    for url in self.start_urls:
        yield Request(
            url=url,
            callback=self.parse_list,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)
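# handle_failure is wired up as the errback throughout these spiders but is not shown
# here. A minimal sketch, assuming the intent is to discard the failing proxy and retry
# the request with a fresh one; the real errback may behave differently.
def handle_failure(self, failure):
    # Scrapy attaches the original request to most download failures
    request = getattr(failure, 'request', None)
    if request is None:
        self.log('request failed: {}'.format(repr(failure)), level=logging.WARNING)
        return
    self.log('request {} failed: {}'.format(request.url, repr(failure)), level=logging.WARNING)
    bad_proxy = request.meta.get('proxy')
    if bad_proxy is not None:
        POOL.remove(bad_proxy)  # drop the proxy that caused the failure
    retry = request.copy()
    if self.with_proxy:
        retry.meta['proxy'] = POOL.get()  # retry with a fresh proxy
    yield retry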
def parse(self, response):
    # find the document url
    link = response.xpath("//div[@class='ProjetInfo_title']/a/@href").extract_first()
    if link is None:
        self.log('{} fail to download'.format(response.url), level=logging.WARNING)
        # remove the invalid proxy
        POOL.remove(response.request.meta['proxy'])
        return
    # uuid5 expects a str name in Python 3
    page = uuid.uuid5(uuid.NAMESPACE_URL, response.url).hex
    filename = '%s.html' % page
    with open(os.path.join(self.work_directory, filename), 'wb') as f:
        f.write(response.body)
    self.log('{} => {}'.format(response.url, link), level=logging.INFO)
    yield {
        'link': response.url,
        'xml': link,
    }
def apply_filter(self, response):
    for year in range(2018, 2003, -1):
        url = response.request.url + 'y{}/'.format(year)
        self.log('Process page {}'.format(url), level=logging.INFO)
        yield scrapy.Request(
            url=url,
            dont_filter=True,
            callback=self.parse,
            meta={'proxy': POOL.get()},
            errback=self.handle_failure)
def parse_list(self, response: Response):
    # wait for page to load
    # wait for the redirect to finish.
    patent_links = []
    for row in response.xpath(
            "//div[@id='nouvant-portfolio-content']/div[@class='technology']"):
        title = row.xpath("h2/a/text()").get()
        link = row.xpath("h2/a/@href").get()
        abstract = row.xpath("p/span/text()").get()
        self.log('found patent {}'.format(title), level=logging.INFO)
        patent_links.append({'title': title, 'link': link, 'abstract': abstract})
    statistics = self.statistics(response)
    self.log('found {}/{} patents'.format(statistics['end'], statistics['total']), level=logging.INFO)
    if statistics['end'] < statistics['total']:
        yield response.follow(
            url='/technologies?limit=50&offset={}&query='.format(statistics['end']),
            callback=self.parse_list,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)
    for p in patent_links:
        name = p['link'].split('/')[-1]
        if os.path.exists(os.path.join(self.work_directory, name + '.json')):
            self.log('{} already parsed and will skip'.format(p['link']), level=logging.INFO)
            continue
        yield response.follow(
            url=p['link'],
            callback=self.parse,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)
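# statistics() above returns how many results have been listed so far ('end') and how
# many exist in total ('total'). The page markup is not shown in these snippets, so the
# following is only a hedged sketch that scrapes a hypothetical "Showing 1 - 50 of 432"
# style banner; the real method presumably targets the site's actual counter element.
import re


def statistics(self, response: Response) -> dict:
    text = ' '.join(response.xpath('//text()').getall())
    match = re.search(r'(\d+)\s*-\s*(\d+)\s+of\s+(\d+)', text)
    if match is None:
        # fall back to zero so the caller stops paginating
        return {'end': 0, 'total': 0}
    return {'end': int(match.group(2)), 'total': int(match.group(3))}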
def start_requests(self):
    for url in self.data.keys():
        if url is None or not url.startswith('http'):
            continue
        name = url.split('/')[-1]
        if os.path.exists(os.path.join(self.work_directory, name + '.json')):
            continue
        yield Request(
            url=url,
            meta={'proxy': POOL.get()},
            errback=self.handle_failure,
            callback=self.parse)
def parse_category(self, response: Response):
    # with javascript it would be //div[@class='split-taxonomy-4']/ul/li/a/@href
    for row in response.xpath(
            "//section[@id='block-taxonomy-menu-block-1']/ul/li/a/@href").getall():
        self.log('find category {}'.format(row), level=logging.INFO)
        yield response.follow(
            url=row,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            callback=self.parse_list,
            errback=self.handle_failure)
def parse_list(self, response: Response):
    # wait for page to load
    # wait for the redirect to finish.
    patent_links = []
    for row in response.xpath(
            "//section[@id='block-system-main']/div[@class='node node-technology node-teaser clearfix']/h2/a"):
        name = row.xpath("text()").get()
        link = row.xpath("@href").get()
        patent_links.append({'name': name, 'link': link})
        self.log('found patents {}'.format(name), level=logging.INFO)
    last_page = response.xpath("//li[@class='pager-last']/a/@href").get()
    if last_page is not None and response.url != last_page:
        # have next page
        if '?page=' in response.url:
            elements = response.url.split("=")
            page = int(elements[-1]) + 1
            self.log('go to page {}'.format(page), level=logging.INFO)
            yield response.follow(
                url='='.join(elements[:-1]) + '={}'.format(page),
                dont_filter=True,
                meta={'proxy': POOL.get()} if self.with_proxy else {},
                callback=self.parse_list,
                errback=self.handle_failure)
        else:
            self.log('go to page 2', level=logging.INFO)
            yield response.follow(
                url=response.url + '?page=1',
                dont_filter=True,
                meta={'proxy': POOL.get()} if self.with_proxy else {},
                callback=self.parse_list,
                errback=self.handle_failure)
    for p in patent_links:
        yield response.follow(
            url=p['link'],
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            callback=self.parse,
            errback=self.handle_failure)
def start_requests(self):
    urls = json.load(open(os.path.join(self.work_directory, 'links.json'), 'rb'))
    for url in urls:
        # uuid5 expects a str name in Python 3
        page = uuid.uuid5(uuid.NAMESPACE_URL, url).hex
        filename = '%s.html' % page
        if os.path.exists(os.path.join(self.work_directory, filename)):
            self.log('{} already exists'.format(url))
            continue
        yield scrapy.Request(
            url=url,
            callback=self.parse,
            meta={'proxy': POOL.get()},
            errback=self.handle_failure)
        time.sleep(random.random())
def parse_page(log, work_directory: str, with_proxy: bool, slug: str):
    print('process {}'.format(slug))
    while True:
        try:
            if with_proxy:
                proxies = POOL.get()
                response = get(slug, proxies={'http': proxies, 'https': proxies})
            else:
                response = get(slug)
            if 200 <= response.status_code < 300:
                data = response.json()
                with open(os.path.join(work_directory, '{}.json'.format(slug.split('/')[-1])), 'w') as fo:
                    json.dump(data, fo)
                break
        except Exception as e:
            print(e)
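# parse_page is a module-level function (so it can be pickled), which suggests it is
# meant to be fanned out across worker processes. A hedged usage sketch, assuming a
# list of slugs has already been collected; fetch_all and the pool size are hypothetical,
# and each worker relies on the module-level POOL and requests.get imported as get.
from functools import partial
from multiprocessing import Pool


def fetch_all(work_directory: str, with_proxy: bool, slugs: list):
    # bind everything except the slug, then map the slugs over a process pool
    worker = partial(parse_page, None, work_directory, with_proxy)
    with Pool(processes=8) as pool:
        pool.map(worker, slugs)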
def parse_school_list(self, response: Response):
    if os.path.exists(os.path.join(self.work_directory, 'links.json')):
        school_links = json.load(open(os.path.join(self.work_directory, 'links.json'), 'r'))
    else:
        driver = self.get_driver(response)
        school_links = []
        page = 1
        while True:
            for row in driver.find_elements_by_xpath("//table[@id='search-results']/tbody/tr"):
                title = row.find_element_by_xpath("td[@class='search-results-major']/a").text
                link = row.find_element_by_xpath("td[@class='search-results-major']/a").get_attribute('href')
                school_links.append({'name': title, 'link': link})
                self.log('find school {}'.format(title), level=logging.INFO)
            total_page = self.statistics(response)
            self.log('Finish page {}/{}'.format(page, total_page), level=logging.INFO)
            if page < total_page:
                try:
                    next_page = driver.find_element_by_xpath("//img[@class='paginator-next-page paginator-button']")
                    if next_page is not None:
                        next_page.click()
                        page += 1
                except Exception as e:
                    self.log('Fail to go to page {}'.format(page + 1), level=logging.ERROR)
                    break
            else:
                break
            time.sleep(3)
            wait = WebDriverWait(driver, 30)
            try:
                wait.until(lambda x: len(x.find_elements_by_xpath(
                    "//table[@id='search-results' and @style='opacity: 1;']")) > 0)
            except Exception as e:
                self.log('Unable to retrieve school information: {}'.format(e), level=logging.ERROR)
                break
        with open(os.path.join(self.work_directory, 'links.json'), 'w') as fo:
            json.dump(school_links, fo)
    for s in school_links:
        yield Request(
            url=s['link'],
            callback=self.parse_list,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)
def parse_list(self, response):
    self.log('Parse list {}'.format(response.url), level=logging.INFO)
    name = response.url.split('/')[-1]
    with open(os.path.join(self.work_directory, name + '.html'), 'wb') as fo:
        fo.write(response.body)
    # for the information of school
    school = create_company()
    meta = self.get_school_information(response)
    if 'Name' in meta:
        school['name'] = meta['Name']
    if 'URL' in meta:
        school['ref'] = meta['URL']
        school['contact']['website'] = meta['URL']
    if 'Group Type' in meta:
        school['abs'] = meta['Group Type']
    school['addr'] = deepcopy(self.address)
    school['addr']['line1'] = school['name']
    if school['name'] in self.blacklist:
        return
    patent_links = []
    if os.path.exists(os.path.join(self.work_directory, school['name'] + '.json')):
        patent_links = json.load(open(os.path.join(self.work_directory, school['name'] + '.json'), 'r'))
    else:
        # the id of each product is provided in the <script></script>
        for code in response.xpath("//script").getall():
            if 'id_list' in code:
                ids = re.findall(r'[0-9]+', re.findall(r'\[[0-9,]+\]', code)[0])
                patent_links = ['https://www.flintbox.com/public/project/{}'.format(patentId) for patentId in ids]
        with open(os.path.join(self.work_directory, school['name'] + '.json'), 'w') as fo:
            json.dump(patent_links, fo)
    for p in patent_links:
        name = p.split('/')[-1]
        if os.path.exists(os.path.join(self.work_directory, name + '.json')):
            self.log('{} already parsed and will skip'.format(p), level=logging.INFO)
            continue
        yield response.follow(
            url=p,
            callback=self.parse,
            dont_filter=True,
            meta={'proxy': POOL.get(), 'school': school} if self.with_proxy else {'school': school},
            errback=self.handle_failure)
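# create_company() and self.address are defined elsewhere. A hedged sketch of the
# record skeleton implied by the fields used above; the real schema very likely
# carries more keys than shown here.
def create_company() -> dict:
    return {
        'name': '',
        'ref': '',
        'abs': '',
        'contact': {'website': ''},
        'addr': {'line1': ''},
    }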
def start_requests(self):
    for url in self.start_urls:
        for p in self.province:
            for i in self.industry:
                for r in self.round:
                    self.log(
                        'start page province={} industry={} and round={}'.format(p, i, r),
                        level=logging.INFO)
                    yield scrapy.Request(
                        url=url,
                        dont_filter=True,
                        callback=self.apply_filter,
                        meta={
                            'proxy': POOL.get(),
                            'extra': {'province': p, 'industry': i, 'round': r}},
                        errback=self.handle_failure)
def apply_filter(self, response):
    if os.path.exists(os.path.join(self.work_directory, 'category.json')):
        with open(os.path.join(self.work_directory, 'category.json'), 'r') as fi:
            result = json.load(fi)
        links = []
        for r1 in result:
            for r2 in r1['children']:
                if len(r2['children']) < 1:
                    links.append(r2['url'])
                else:
                    links.extend([r3['url'] for r3 in r2['children']])
    else:
        links = self.find_filter(response)
    for link in links:
        # prepend 'ma/' to the second-to-last path segment of the category url
        url = link.split('/')
        url[-2] = 'ma/' + url[-2]
        link = '/'.join(url)
        self.log('Process page {}'.format(link), level=logging.INFO)
        yield scrapy.Request(
            url=link,
            dont_filter=True,
            callback=self.parse,
            meta={'proxy': POOL.get()},
            errback=self.handle_failure)
def start_requests(self):
    for url in self.start_urls:
        yield SeleniumRequest(
            url=url,
            callback=self.parse,
            meta={'proxy': POOL.get()},
            errback=self.handle_failure_selenium)