Example #1
 def parse_list(self, response: Response):
     # wait for page to load
     # wait for the redirect to finish.
     patent_links = []
     for link in response.xpath(
             "//div[@class='view-content']/div[contains(@class,'views-row')]/div/h3/a"
     ):
         text = link.xpath("text()").get()
         url = link.xpath("@href").get()
         self.log("find technology {}/{}".format(text, url),
                  level=logging.INFO)
         patent_links.append(url)
     # for next page
     next_page = response.xpath("//li[@class='pager-next']/a/@href").get()
     if next_page is not None:
         self.log('process page {}'.format(next_page), level=logging.INFO)
         yield response.follow(
             url=next_page,
             callback=self.parse_list,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
     for p in patent_links:
         name = self.parse_name_from_url(p)
         if os.path.exists(os.path.join(self.work_directory,
                                        name + '.json')):
             self.log('{} already parsed and will skip'.format(p),
                      level=logging.INFO)
             continue
         yield response.follow(
             url=p,
             callback=self.parse,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
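These examples call two spider helpers that are not part of the listing: `parse_name_from_url`, which turns a detail-page URL into the file name used for the duplicate check, and `handle_failure`, the errback passed to every request. A minimal sketch of what they might look like, assuming the name is simply the last non-empty path segment of the URL and failures are only logged, is:

    import logging
    from urllib.parse import urlparse

    def parse_name_from_url(self, url: str) -> str:
        # Hypothetical helper: use the last non-empty path segment as the name
        # that the duplicate check above compares against '<name>.json'.
        parts = [s for s in urlparse(url).path.split('/') if s]
        return parts[-1] if parts else url

    def handle_failure(self, failure):
        # Hypothetical errback: log the failed request; a real spider might
        # also drop the proxy that failed (e.g. POOL.remove(...)).
        self.log('request failed: {}'.format(failure.request.url),
                 level=logging.ERROR)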
Example #2
 def parse_list(self, response: Response):
     # wait for page to load
     # wait for the redirect to finish.
     patent_links = []
     for link in response.xpath(
             '//*[@id="formTechPub1"]/div/table[2]/tr/td/a'):
         text = link.xpath("text()").get()
         url = link.xpath("@href").get()
         self.log("find technology {}/{}".format(text, url),
                  level=logging.INFO)
         patent_links.append(url)
     # for next page
     total_result = self.statictics(response)
     self.page += 1
     if self.page * self.item_per_page < total_result:
         self.log('process page {}'.format(self.page), level=logging.INFO)
         yield response.follow(
             url=self.next_page_template.format(self.page),
             callback=self.parse_list,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
     for p in patent_links:
         name = self.parse_name_from_url(p)
         if os.path.exists(os.path.join(self.work_directory,
                                        name + '.json')):
             self.log('{} already parsed and will skip'.format(p),
                      level=logging.INFO)
             continue
         yield response.follow(
             url=p,
             callback=self.parse,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
Example #3
 def parse_list(self, response: Response):
     # wait for page to load
     # wait for the redirect to finish.
     patent_links = []
     for link in response.xpath("//a[@class='lead-in']/@href").getall():
         patent_links.append(link)
     # for next page
     next_page = response.xpath(
         "//div[@class='nav-previous']/a/@href").get()
     if next_page is not None:
         self.log('process page {}'.format(next_page), level=logging.INFO)
         yield response.follow(
             url=next_page,
             callback=self.parse_list,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
     for p in patent_links:
         name = p.split('/')[-2]
         if os.path.exists(os.path.join(self.work_directory,
                                        name + '.json')):
             self.log('{} already parsed and will skip'.format(p),
                      level=logging.INFO)
             continue
         yield response.follow(
             url=p,
             callback=self.parse,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
Example #4
 def parse_list(self, response: Response):
     # wait for page to load
     # wait for the redirect to finish.
     patent_links = []
     for link in response.xpath("//h4[@class='result-title']/a"):
         text = link.xpath("text()").get()
         url = link.xpath("@href").get()
         self.log("find technology {}/{}".format(text, url), level=logging.INFO)
         patent_links.append(url)
     # for next page
     current_page, total_page = self.statictics(response)
     if current_page < total_page:
         self.log('process page {}'.format(current_page + 1), level=logging.INFO)
         yield response.follow(
             url='https://otd.harvard.edu/explore-innovation/technologies/results/P{}/'.format(current_page * 10),
             callback=self.parse_list,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
     for p in patent_links:
         name = self.parse_name_from_url(p)
         if os.path.exists(os.path.join(self.work_directory, name + '.json')):
             self.log('{} already parsed and will skip'.format(p), level=logging.INFO)
             continue
         yield response.follow(
             url=p,
             callback=self.parse,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
Example #5
 def run(self):
     """
     The main function
     """
     while self.running.is_set():
         proxy = requests.get(self.api_url).content.decode().split(
             self.token)
         logging.debug('add new proxy {} to pool'.format(proxy))
         for p in proxy:
             POOL.add('http://' + p[:-1], 300)
         time.sleep(self.delay_second)
 def extract_page(self, response):
     arguments = self.parse_url(response.request.url)
     self.log('Process page {}'.format(response.request.url),
              level=logging.INFO)
     with open(
             os.path.join(
                 self.work_directory,
                 '{}_{}_{}_{}.html'.format(arguments['province'][0],
                                           arguments['round'][0],
                                           arguments['industry'][0],
                                           arguments.get('p', ['1'])[0])),
             'wb') as fo:
         fo.write(response.body)
     row = response.xpath("//table[@class='ntable']/tr")
     if len(row) < 1:
         self.log('page {} has no data'.format(response.request.url),
                  level=logging.WARNING)
         POOL.remove(response.request.meta['proxy'])
         return
     first_row = True
     result = []
     for r in row:
         if first_row:
             first_row = False
             continue
         image = r.xpath('td[1]/img/@src').extract_first()
         project_name = r.xpath('td[2]/a/text()').extract_first()
         project_url = r.xpath('td[2]/a/@href').extract_first()
         investor = r.xpath('td[3]/text()').extract_first()
         stage = r.xpath('td[4]/text()').extract_first()
         invest_time = r.xpath('td[5]/text()').extract_first()
         result.append({
             'image': image,
             'project': {
                 'url': project_url,
                 'name': project_name
             },
             'investor': investor,
             'stage': stage,
             'time': invest_time
         })
         self.log('find investment from {} to {}'.format(
             investor, project_name),
                  level=logging.INFO)
     # save the file
     with open(
             os.path.join(
                 self.work_directory,
                 '{}_{}_{}_{}.json'.format(arguments['province'][0],
                                           arguments['round'][0],
                                           arguments['industry'][0],
                                           arguments.get('p', ['1'])[0])),
             'w') as fo:
         json.dump(result, fo, ensure_ascii=False)
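All of these spiders share a module-level proxy pool called `POOL`, whose implementation is not included in the listing. Judging from the calls used throughout the examples (`get`, `pop`, `add` with a lifetime in seconds, and `remove` for dead proxies), a minimal in-memory sketch could look like the following; the expiry handling is an assumption:

    import random
    import time

    class ProxyPool:
        """Hypothetical stand-in for the POOL object used by the spiders."""

        def __init__(self):
            self._proxies = {}  # proxy url -> expiry timestamp

        def add(self, proxy, ttl_second):
            # run() above calls POOL.add('http://<ip:port>', 300).
            self._proxies[proxy] = time.time() + ttl_second

        def get(self):
            # Drop expired proxies, then hand out a random live one.
            now = time.time()
            self._proxies = {p: t for p, t in self._proxies.items() if t > now}
            return random.choice(list(self._proxies)) if self._proxies else None

        def pop(self):
            # Like get(), but reserve the proxy exclusively for the caller.
            proxy = self.get()
            self._proxies.pop(proxy, None)
            return proxy

        def remove(self, proxy):
            # extract_page() above discards proxies that return empty pages.
            self._proxies.pop(proxy, None)

    POOL = ProxyPool()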
Example #7
 def start_requests(self):
     for url in self.start_urls:
         for industry in self.industry:
             for stage in self.stage:
                 for year in self.year:
                     yield scrapy.Request(
                         url=url,
                         dont_filter=True,
                         callback=self.apply_filter,
                         meta={
                             'proxy': POOL.get() if not self.exclusive else POOL.pop(),
                             'extra': {'industry': industry, 'stage': stage, 'year': year}},
                         errback=self.handle_failure)
Example #8
 def parse_list(self, response: Response):
     driver = self.get_driver(response)
     # wait for page to load
     # wait for the redirect to finish.
     patent_links = []
     if os.path.exists(os.path.join(self.work_directory, 'links.json')):
         patent_links = json.load(open(os.path.join(self.work_directory, 'links.json'), 'r'))
     else:
         while True:
             time.sleep(1)
             self.wait_for_element(driver, "//div[@id='ctl00_ContentPlaceHolder1_UpdateProgress1' and @style='display: block;']")
             table = driver.find_element_by_xpath("//div[@class='table-body']")
             for r in table.find_elements_by_xpath("div"):
                 cols = r.find_elements_by_xpath("div")
                 patent = cols[2].find_element_by_xpath('p/a')
                 abstract = cols[2].find_element_by_xpath('div/p')
                 patent_links.append({'name': patent.text, 'link': patent.get_attribute('href'), 'abstract': abstract.text})
                 self.log('Found technology {}'.format(patent.text), level=logging.INFO)
             if not self.next_page(driver):
                 break
             time.sleep(3)
         with open(os.path.join(self.work_directory, 'links.json'), 'w') as fo:
             json.dump(patent_links, fo)
     for p in patent_links:
         name = p['link'].split('/')[-1]
         if os.path.exists(os.path.join(self.work_directory, name[:-4] + 'json')):
             self.log('{} already parsed and will skip'.format(p['link']), level=logging.INFO)
             continue
         yield Request(
             url=p['link'],
             callback=self.parse,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
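`get_driver`, `wait_for_element`, and `next_page` are spider helpers that the listing does not include. A rough sketch of the latter two, assuming `wait_for_element` simply polls until the given XPath stops matching (the AJAX progress overlay is hidden again) and `next_page` clicks a pager button whose locator is made up here:

    import time

    def wait_for_element(self, driver, xpath, timeout=30):
        # Hypothetical helper: poll until nothing matches the XPath any more,
        # i.e. the loading overlay has disappeared, or the timeout expires.
        deadline = time.time() + timeout
        while time.time() < deadline:
            if not driver.find_elements_by_xpath(xpath):
                return
            time.sleep(0.5)

    def next_page(self, driver):
        # Hypothetical helper: click the pager's "next" button if one exists
        # and report whether another page was requested.
        buttons = driver.find_elements_by_xpath("//a[contains(@class, 'next')]")
        if not buttons:
            return False
        buttons[0].click()
        return True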
Example #9
 def parse_list(self, response: Response):
     # wait for page to load
     # wait for the redirect to finish.
     patent_links = []
     for link in response.xpath("//ul[@id='tech-licensing']/li/a"):
         text = link.xpath("text()").get()
         url = link.xpath("@href").get()
         if url is None:
             continue
         self.log("find technology {}/{}".format(text, url),
                  level=logging.INFO)
         patent_links.append(url)
     for p in patent_links:
         name = self.parse_name_from_url(p)
         if os.path.exists(os.path.join(self.work_directory,
                                        name + '.json')):
             self.log('{} already parsed and will skip'.format(p),
                      level=logging.INFO)
             continue
         yield response.follow(
             url=p,
             callback=self.parse,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
Example #10
 def start_requests(self):
     for url in self.start_urls:
         yield SeleniumRequest(url=url,
                               dont_filter=True,
                               callback=self.apply_filter,
                               meta={'proxy': POOL.get()},
                               errback=self.handle_failure_selenium)
Example #11
 def parse_list(self, response: Response):
     # wait for page to load
     # wait for the redirect to finish.
     patent_links = []
     if os.path.exists(os.path.join(self.work_directory, 'links.json')):
         patent_links = json.load(open(os.path.join(self.work_directory, 'links.json'), 'r'))
     else:
         # the product ids are provided in the inline <script></script>
         for code in response.xpath("//script").getall():
             if 'id_list' in code:
                 ids = re.findall(r'[0-9]+', re.findall(r'\[[0-9,]+\]', code)[0])
                 patent_links = [response.url + '/public/project/{}'.format(patentId) for patentId in ids]
         with open(os.path.join(self.work_directory, 'links.json'), 'w') as fo:
             json.dump(patent_links, fo)
     for p in patent_links:
         name = p.split('/')[-1]
         if os.path.exists(os.path.join(self.work_directory, name + '.json')):
             self.log('{} already parsed and will skip'.format(p), level=logging.INFO)
             continue
         yield response.follow(
             url=p,
             callback=self.parse,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
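The project ids in this example are pulled out of an inline `<script>` block: the first bracketed run of digits and commas is taken to be `id_list`, and the individual numbers inside it become the link ids. A small standalone demonstration with a made-up script body:

    import re

    code = "<script>var id_list = [101,202,303];</script>"  # made-up example
    if 'id_list' in code:
        ids = re.findall(r'[0-9]+', re.findall(r'\[[0-9,]+\]', code)[0])
        print(ids)  # ['101', '202', '303']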
Example #12
 def start_requests(self):
     for url in self.start_urls:
         yield Request(
             url=url,
             callback=self.parse_list,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
 def parse(self, response):
     # find the document url
     link = response.xpath(
         "//div[@class='ProjetInfo_title']/a/@href").extract_first()
     if link is None:
         self.log('{} failed to download {}'.format(response.url, link),
                  level=logging.WARNING)
         # remove the invalid proxy
         POOL.remove(response.request.meta['proxy'])
         return
     # derive a deterministic file name from the URL (uuid5 takes a str name)
     page = uuid.uuid5(uuid.NAMESPACE_URL, response.url).hex
     filename = '%s.html' % page
     with open(os.path.join(self.work_directory, filename), 'wb') as f:
         f.write(response.body)
     self.log('{} => {}'.format(response.url, link), level=logging.INFO)
     yield {
         'link': response.url,
         'xml': link,
     }
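The HTML snapshot is stored under a deterministic name derived from the page URL via `uuid.uuid5`, so re-crawling the same URL overwrites the same file instead of creating duplicates. A short illustration with a made-up URL:

    import uuid

    url = 'https://example.org/project/1234'  # made-up URL
    page = uuid.uuid5(uuid.NAMESPACE_URL, url).hex
    print('%s.html' % page)  # identical URLs always map to the same file name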
Example #14
 def apply_filter(self, response):
     for year in range(2018, 2003, -1):
         url = response.request.url + 'y{}/'.format(year)
         self.log('Process page {}'.format(url), level=logging.INFO)
         yield scrapy.Request(
             url=url,
             dont_filter=True,
             callback=self.parse,
             meta={'proxy': POOL.get()},
             errback=self.handle_failure)
Example #15
 def parse_list(self, response: Response):
     # wait for page to load
     # wait for the redirect to finish.
     patent_links = []
     for row in response.xpath(
             "//div[@id='nouvant-portfolio-content']/div[@class='technology']"
     ):
         title = row.xpath("h2/a/text()").get()
         link = row.xpath("h2/a/@href").get()
         abstract = row.xpath("p/span/text()").get()
         self.log('found patent {}'.format(title), level=logging.INFO)
         patent_links.append({
             'title': title,
             'link': link,
             'abstract': abstract
         })
     statistics = self.statistics(response)
     self.log('found {}/{} patents'.format(statistics['end'],
                                           statistics['total']),
              level=logging.INFO)
     if statistics['end'] < statistics['total']:
         yield response.follow(
             url='/technologies?limit=50&offset={}&query='.format(
                 statistics['end']),
             callback=self.parse_list,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
     for p in patent_links:
         name = p['link'].split('/')[-1]
         if os.path.exists(os.path.join(self.work_directory,
                                        name + '.json')):
             self.log('{} already parsed and will skip'.format(p['link']),
                      level=logging.INFO)
             continue
         yield response.follow(
             url=p['link'],
             callback=self.parse,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
 def start_requests(self):
     for url in self.data.keys():
         if url is None or not url.startswith('http'):
             continue
         name = url.split('/')[-1]
         if os.path.exists(os.path.join(self.work_directory,
                                        name + '.json')):
             continue
         yield Request(url=url,
                       meta={'proxy': POOL.get()},
                       errback=self.handle_failure,
                       callback=self.parse)
Example #17
 def parse_category(self, response: Response):
     # with javascript it would be //div[@class='split-taxonomy-4']/ul/li/a/@href
     for row in response.xpath(
             "//section[@id='block-taxonomy-menu-block-1']/ul/li/a/@href"
     ).getall():
         self.log('find category {}'.format(row), level=logging.INFO)
         yield response.follow(
             url=row,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             callback=self.parse_list,
             errback=self.handle_failure)
Example #18
 def parse_list(self, response: Response):
     # wait for page to load
     # wait for the redirect to finish.
     patent_links = []
     for row in response.xpath(
             "//section[@id='block-system-main']/div[@class='node node-technology node-teaser clearfix']/h2/a"
     ):
         name = row.xpath("text()").get()
         link = row.xpath("@href").get()
         patent_links.append({'name': name, 'link': link})
         self.log('found patents {}'.format(name), level=logging.INFO)
     if response.xpath("//li[@class='pager-last']/a/@href").get() is not None and\
             response.url != response.xpath("//li[@class='pager-last']/a/@href").get():
         # have next page
         if '?page=' in response.url:
             elements = response.url.split("=")
             page = (int(elements[-1]) + 1)
             self.log('go to page {}'.format(page), level=logging.INFO)
             yield response.follow(
                 url='='.join(elements[:-1]) + '={}'.format(page),
                 dont_filter=True,
                 meta={'proxy': POOL.get()} if self.with_proxy else {},
                 callback=self.parse_list,
                 errback=self.handle_failure)
         else:
             self.log('go to page 2', level=logging.INFO)
             yield response.follow(
                 url=response.url + '?page=1',
                 dont_filter=True,
                 meta={'proxy': POOL.get()} if self.with_proxy else {},
                 callback=self.parse_list,
                 errback=self.handle_failure)
     for p in patent_links:
         yield response.follow(
             url=p['link'],
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             callback=self.parse,
             errback=self.handle_failure)
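The pager URL above is advanced by splitting the current URL on '=' and incrementing the trailing page number; on the very first page '?page=1' is appended instead. A worked example of the increment with a made-up URL:

    url = 'https://example.org/available-technologies?page=3'  # made-up URL
    elements = url.split('=')
    page = int(elements[-1]) + 1
    next_url = '='.join(elements[:-1]) + '={}'.format(page)
    print(next_url)  # https://example.org/available-technologies?page=4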
 def start_requests(self):
     urls = json.load(
         open(os.path.join(self.work_directory, 'links.json'), 'rb'))
     for url in urls:
         page = uuid.uuid5(uuid.NAMESPACE_URL, url).hex
         filename = '%s.html' % page
         if os.path.exists(os.path.join(self.work_directory, filename)):
             self.log('{} already exists'.format(url))
             continue
         yield scrapy.Request(url=url,
                              callback=self.parse,
                              meta={'proxy': POOL.get()},
                              errback=self.handle_failure)
         time.sleep(random.random())
Example #20
def parse_page(log, work_directory: str, with_proxy: bool, slug: str):
    print('process {}'.format(slug))
    while True:
        try:
            if with_proxy:
                proxies = POOL.get()
                response = get(slug, proxies={'http': proxies, 'https': proxies})
            else:
                response = get(slug)
            if 200 <= response.status_code < 300:
                data = response.json()
                with open(os.path.join(work_directory, '{}.json'.format(slug.split('/')[-1])), 'w') as fo:
                    json.dump(data, fo)
                break
        except Exception as e:
            print(e)
 def parse_school_list(self, response: Response):
     if os.path.exists(os.path.join(self.work_directory, 'links.json')):
         school_links = json.load(open(os.path.join(self.work_directory, 'links.json'), 'r'))
     else:
         driver = self.get_driver(response)
         school_links = []
         page = 1
         while True:
             for row in driver.find_elements_by_xpath("//table[@id='search-results']/tbody/tr"):
                 title = row.find_element_by_xpath("td[@class='search-results-major']/a").text
                 link = row.find_element_by_xpath("td[@class='search-results-major']/a").get_attribute('href')
                 school_links.append({'name': title, 'link': link})
                 self.log('find school {}'.format(title), level=logging.INFO)
             total_page = self.statistics(response)
             self.log('Finish page {}/{}'.format(page, total_page), level=logging.INFO)
             if page < total_page:
                 try:
                     next_page = driver.find_element_by_xpath("//img[@class='paginator-next-page paginator-button']")
                     if next_page is not None:
                         next_page.click()
                     page += 1
                 except Exception as e:
                     self.log('Fail to go to page {}'.format(page + 1), level=logging.ERROR)
                     break
             else:
                 break
             time.sleep(3)
             wait = WebDriverWait(driver, 30)
             try:
                 wait.until(lambda x: len(x.find_elements_by_xpath("//table[@id='search-results' and @style='opacity: 1;']")) > 0)
             except Exception as e:
                 self.log('Unable to retrieve school information: {}'.format(e), level=logging.ERROR)
                 break
         with open(os.path.join(self.work_directory, 'links.json'), 'w') as fo:
             json.dump(school_links, fo)

     for s in school_links:
         yield Request(
             url=s['link'],
             callback=self.parse_list,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
 def parse_list(self, response):
     self.log('Parse list {}'.format(response.url), level=logging.INFO)
     name = response.url.split('/')[-1]
     with open(os.path.join(self.work_directory, name + '.html'), 'wb') as fo:
         fo.write(response.body)
     # for the information of school
     school = create_company()
     meta = self.get_school_information(response)
     if 'Name' in meta:
         school['name'] = meta['Name']
     if 'URL' in meta:
         school['ref'] = meta['URL']
         school['contact']['website'] = meta['URL']
     if 'Group Type' in meta:
         school['abs'] = meta['Group Type']
     school['addr'] = deepcopy(self.address)
     school['addr']['line1'] = school['name']
     if school['name'] in self.blacklist:
         return
     patent_links = []
     if os.path.exists(os.path.join(self.work_directory, school['name'] + '.json')):
         patent_links = json.load(open(os.path.join(self.work_directory, school['name'] + '.json'), 'r'))
     else:
         # the product ids are provided in the inline <script></script>
         for code in response.xpath("//script").getall():
             if 'id_list' in code:
                 ids = re.findall(r'[0-9]+', re.findall(r'\[[0-9,]+\]', code)[0])
                 patent_links = ['https://www.flintbox.com/public/project/{}'.format(patentId) for patentId in ids]
         with open(os.path.join(self.work_directory, school['name'] + '.json'), 'w') as fo:
             json.dump(patent_links, fo)
     for p in patent_links:
         name = p.split('/')[-1]
         if os.path.exists(os.path.join(self.work_directory, name + '.json')):
             self.log('{} already parsed and will skip'.format(p), level=logging.INFO)
             continue
         yield response.follow(
             url=p,
             callback=self.parse,
             dont_filter=True,
             meta={'proxy': POOL.get(), 'school': school} if self.with_proxy else {'school': school},
             errback=self.handle_failure)
 def start_requests(self):
     for url in self.start_urls:
         for p in self.province:
             for i in self.industry:
                 for r in self.round:
                     self.log(
                         'start page province={} industry={} and round={}'.
                         format(p, i, r),
                         level=logging.INFO)
                     yield scrapy.Request(url=url,
                                          dont_filter=True,
                                          callback=self.apply_filter,
                                          meta={
                                              'proxy': POOL.get(),
                                              'extra': {
                                                  'province': p,
                                                  'industry': i,
                                                  'round': r
                                              }
                                          },
                                          errback=self.handle_failure)
Example #24
 def apply_filter(self, response):
     if os.path.exists(os.path.join(self.work_directory, 'category.json')):
         with open(os.path.join(self.work_directory, 'category.json'), 'r') as fi:
             result = json.load(fi)
         links = []
         for r1 in result:
             for r2 in r1['children']:
                 if len(r2['children']) < 1:
                     links.append(r2['url'])
                 else:
                     links.extend([r3['url'] for r3 in r2['children']])
     else:
         links = self.find_filter(response)
     for l in links:
         url = l.split('/')
         url[-2] = 'ma/' + url[-2]
         l = '/'.join(url)
         self.log('Process page {}'.format(l), level=logging.INFO)
         yield scrapy.Request(
             url=l,
             dont_filter=True,
             callback=self.parse,
             meta={'proxy': POOL.get()},
             errback=self.handle_failure)
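Each filter link above is rewritten before it is requested: 'ma/' is spliced in front of the second-to-last path segment. A worked example with a made-up link:

    l = 'https://example.com/category/internet/'  # made-up link
    url = l.split('/')
    url[-2] = 'ma/' + url[-2]
    print('/'.join(url))  # https://example.com/category/ma/internet/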
Example #25
 def start_requests(self):
     for url in self.start_urls:
         yield SeleniumRequest(url,
                               callback=self.parse,
                               meta={'proxy': POOL.get()},
                               errback=self.handle_failure_selenium)