def parse(self, response):
    """Scrape one technology page and persist it as an HTML and a JSON file.

    The raw response body is written to ``<work_directory>/<name>.html`` and
    the extracted data to ``<work_directory>/<name>.json``, where ``name`` is
    the last ``/``-separated segment of ``response.url``. The JSON document
    holds the keys ``product``, ``inventors``, ``patents`` and
    ``publications``.

    Args:
        response: The fetched page; its ``url``, ``body`` and ``xpath`` are
            used here and it is forwarded to the ``self.get_*`` /
            ``self.add_*`` helpers.
    """
    self.log('Parse technology {}'.format(response.url), level=logging.INFO)
    name = response.url.split('/')[-1]
    with open(os.path.join(self.work_directory, name + '.html'), 'wb') as fo:
        fo.write(response.body)

    product = create_product()
    product['name'] = response.xpath("//h1[@id='page-title']/text()").get()
    product['ref'] = response.url
    # NOTE(review): product['contact'] is replaced wholesale further down, so
    # this website value survives only if get_contact() re-adds it - confirm.
    product['contact']['website'] = response.url
    product['addr'] = deepcopy(self.address)
    product['asset']['type'] = 3

    description = self.get_description(response)
    abstract = extract_dictionary(description, 'Applications')
    product['abs'] = '\n'.join(abstract.values())
    if not product['abs']:
        # The '' default keeps an empty description from raising
        # StopIteration; the name fallback below then applies.
        product['abs'] = next(iter(description.values()), '')
    if not product['abs']:
        product['abs'] = product['name']

    market = extract_dictionary(description, 'Advantages')
    product['asset']['market'] = '\n'.join(market.values())
    tech = extract_dictionary(description, 'Technology')
    product['asset']['tech'] = '\n'.join(tech.values())

    # Whatever was not claimed by a filter above becomes the introduction.
    # A key can match more than one filter, so only delete it if it is still
    # present.
    for extracted in (abstract, market, tech):
        for key in extracted:
            if key in description:
                del description[key]
    product['intro'] = dictionary_to_markdown(description)

    product['tag'] = self.add_keywords(response)
    product['contact'] = self.get_contact(response)

    inventors = self.add_inventors(response)
    for user in inventors:
        if not user['abs']:
            user['abs'] = 'Inventor of ' + product['name']
        user['addr'] = product['addr']

    patents = self.get_patents(response)
    publications = self.get_publications(response)

    # '.json' (with the dot) so the output sits next to '<name>.html'.
    with open(os.path.join(self.work_directory, name + '.json'), 'w') as fo:
        json.dump(
            {
                'product': product,
                'inventors': inventors,
                'patents': patents,
                'publications': publications,
            },
            fo)
def parse(self, response):
    """Extract a technology record from ``response`` and save it to disk.

    Writes the raw body to ``<work_directory>/<name>.html`` and a JSON
    document with the keys ``product``, ``inventors`` and ``contact`` to
    ``<work_directory>/<name>.json``. ``name`` comes from
    ``self.parse_name_from_url``. The title XPath and the abstract / market /
    tech filters are read from attributes on ``self``.

    Args:
        response: The fetched page; its ``url``, ``body`` and ``xpath`` are
            used here and it is forwarded to the ``self.get_*`` /
            ``self.add_*`` helpers.
    """
    page_url = response.url
    self.log('Parse technology {}'.format(page_url), level=logging.INFO)
    name = self.parse_name_from_url(page_url)

    html_path = os.path.join(self.work_directory, name + '.html')
    with open(html_path, 'wb') as html_file:
        html_file.write(response.body)

    product = create_product()
    product['ref'] = page_url
    product['contact']['website'] = page_url
    product['name'] = response.xpath(self.title_xpath).get()

    meta = self.get_meta(response)
    abstract = extract_dictionary(meta, self.abstract_filter)
    product['abs'] = '\n'.join(abstract.values())
    market = extract_dictionary(meta, self.market_filter)
    product['asset']['market'] = '\n'.join(market.values())
    tech = extract_dictionary(meta, self.tech_filter)
    product['asset']['tech'] = '\n'.join(tech.values())

    # Drop every key already consumed above; the remainder is the intro.
    for extracted in (market, tech, abstract):
        for key in extracted:
            if key in meta:
                del meta[key]
    product['intro'] = dictionary_to_markdown(meta)

    product['asset']['type'] = 3
    product['addr'] = deepcopy(self.address)
    product['tag'] = self.add_tags(response)

    inventors = self.add_inventors(response)
    for inventor in inventors:
        inventor['abs'] = 'Inventor of ' + product['name']
        inventor['addr'] = product['addr']
        inventor['tag'] = product['tag']

    # The contact record gets the same abstract/address/tags as an inventor.
    contact = self.get_contact(response)
    contact['abs'] = 'Inventor of ' + product['name']
    contact['addr'] = product['addr']
    contact['tag'] = product['tag']
    product['contact'] = contact['contact']

    payload = {
        'product': product,
        'inventors': inventors,
        'contact': contact,
    }
    json_path = os.path.join(self.work_directory, name + '.json')
    with open(json_path, 'w') as json_file:
        json.dump(payload, json_file)
def parse(self, response):
    """Extract a technology record from ``response`` and save it to disk.

    Writes the raw body to ``<work_directory>/<name>.html`` and a JSON
    document with the keys ``product`` and ``inventors`` to
    ``<work_directory>/<name>.json``. ``name`` comes from
    ``self.parse_name_from_url``.

    Args:
        response: The fetched page; its ``url``, ``body`` and ``xpath`` are
            used here and it is forwarded to the ``self.get_*`` /
            ``self.add_*`` helpers.
    """
    page_url = response.url
    self.log('Parse technology {}'.format(page_url), level=logging.INFO)
    name = self.parse_name_from_url(page_url)

    html_path = os.path.join(self.work_directory, name + '.html')
    with open(html_path, 'wb') as html_file:
        html_file.write(response.body)

    product = create_product()
    product['ref'] = page_url
    product['contact']['website'] = page_url
    product['name'] = response.xpath("string(//h1[@class='title'])").get()

    meta = self.get_meta(response)
    market = extract_dictionary(meta, 'Applications')
    product['asset']['market'] = '\n'.join(market.values())
    for key in market:
        del meta[key]

    intro = dictionary_to_markdown(meta)
    product['intro'] = intro
    # Abstract = text up to and including the first '. ' full stop; when the
    # intro has no such sentence break the whole intro is used instead.
    first_sentence, separator, _ = intro.partition('. ')
    summary = first_sentence + '.' if separator else ''
    product['abs'] = summary or intro

    product['asset']['type'] = 3
    product['addr'] = deepcopy(self.address)

    inventors = self.add_inventors(response)
    for inventor in inventors:
        inventor['abs'] = 'Inventor of ' + product['name']
        inventor['addr'] = product['addr']
        # NOTE(review): 'tag' is never assigned in this method, so it must
        # already exist on the object returned by create_product() - confirm.
        inventor['tag'] = product['tag']

    product['contact'] = self.get_contact(response)

    json_path = os.path.join(self.work_directory, name + '.json')
    with open(json_path, 'w') as json_file:
        json.dump({'product': product, 'inventors': inventors}, json_file)
def parse(self, response):
    """Scrape one technology page and persist it as an HTML and a JSON file.

    The raw response body is written to ``<work_directory>/<name>``, where
    ``name`` is the last ``/``-separated segment of ``response.url`` (no
    extension is appended). The extracted data goes to the same path with
    its extension replaced by ``.json``. The JSON document holds the keys
    ``product``, ``contact``, ``inventors`` and ``patents``.

    Args:
        response: The fetched page; its ``url``, ``body`` and ``xpath`` are
            used here and it is forwarded to the ``self.get_*`` /
            ``self.add_*`` helpers.
    """
    self.log('Parse technology {}'.format(response.url), level=logging.INFO)
    name = response.url.split('/')[-1]
    with open(os.path.join(self.work_directory, name), 'wb') as fo:
        fo.write(response.body)

    product = create_product()
    product['name'] = response.xpath(
        "//h1[@class='tech-heading tech-heading-main']/text()").get()
    product['ref'] = response.url
    # NOTE(review): product['contact'] is replaced wholesale further down, so
    # this website value survives only if get_contact() re-adds it - confirm.
    product['contact']['website'] = response.url
    product['addr'] = deepcopy(self.address)
    product['asset']['type'] = 3

    description = self.get_description(response)
    abstract = extract_dictionary(description, 'brief|Brief|BRIEF')
    product['abs'] = '\n'.join(abstract.values())
    if not product['abs']:
        # The '' default keeps an empty description from raising
        # StopIteration; the name fallback below then applies.
        product['abs'] = next(iter(description.values()), '')
    if not product['abs']:
        product['abs'] = product['name']

    introduction = extract_dictionary(description, 'full|Full|FULL')
    product['intro'] = '\n'.join(introduction.values())

    # Whatever is left after removing the abstract and introduction sections
    # is stored as the market text. A key can match both filters, so only
    # delete it if it is still present.
    for extracted in (abstract, introduction):
        for key in extracted:
            if key in description:
                del description[key]
    product['asset']['market'] = dictionary_to_markdown(description)

    product['contact'] = self.get_contact(response)
    product['tag'] = self.add_keywords(response)

    contact_person = self.get_contact_person(response)
    contact_person['abs'] = 'Person of Contact for ' + product['name']
    contact_person['addr'] = product['addr']
    contact_person['contact'] = product['contact']
    contact_person['tag'] = product['tag']

    inventors = self.add_inventors(response)
    for user in inventors:
        user['abs'] = 'Inventor of ' + product['name']
        user['addr'] = product['addr']
        user['tag'] = product['tag']

    patents = self.get_patents(response)

    # Swap the extension rather than slicing a fixed four characters: the
    # result is identical for '*.html' and stays sensible for other names.
    json_name = os.path.splitext(name)[0] + '.json'
    with open(os.path.join(self.work_directory, json_name), 'w') as fo:
        json.dump(
            {
                'product': product,
                'contact': contact_person,
                'inventors': inventors,
                'patents': patents,
            },
            fo)
def parse(self, response):
    """Scrape one project page and persist it as an HTML and a JSON file.

    The raw response body is written to ``<work_directory>/<name>.html`` and
    the extracted data to ``<work_directory>/<name>.json``, where ``name`` is
    the last ``/``-separated segment of ``response.url``. The JSON document
    holds the keys ``product`` and ``inventors``.

    The product name, tags, optional logo and optional creation date are
    read from the dictionary returned by ``self.get_meta``.

    Args:
        response: The fetched page; its ``url`` and ``body`` are used here
            and it is forwarded to the ``self.get_*`` / ``self.add_*``
            helpers.

    Raises:
        KeyError: If the meta dictionary lacks ``'Project Title'``,
            ``'Tags'`` or ``'banner'``.
    """
    self.log('Parse technology {}'.format(response.url), level=logging.INFO)
    name = response.url.split('/')[-1]
    with open(os.path.join(self.work_directory, name + '.html'), 'wb') as fo:
        fo.write(response.body)

    product = create_product()
    product['ref'] = response.url
    # NOTE(review): product['contact'] is replaced wholesale further down, so
    # this website value survives only if get_contact() re-adds it - confirm.
    product['contact']['website'] = response.url

    meta = self.get_meta(response)
    product['name'] = meta['Project Title']
    # The creation date is best-effort: a missing or unparseable value leaves
    # whatever create_product() put there. `parse` below resolves to the
    # module-level name, not this method - presumably a date parser; confirm.
    try:
        product['created'] = parse(meta['Posted Date']).strftime(
            "%a, %d %b %Y %H:%M:%S GMT")
    except Exception as error:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate.
        self.log(
            'Could not read posted date for {}: {!r}'.format(
                response.url, error),
            level=logging.DEBUG)
    product['tag'] = meta['Tags']
    if len(meta['banner']) > 0:
        product['logo'] = meta['banner'][0]
    product['asset']['type'] = 3

    abstract = extract_dictionary(meta, 'brief|Brief|BRIEF|Short')
    product['abs'] = '\n'.join(abstract.values())
    if not product['abs']:
        # The '' default keeps an empty meta dictionary from raising
        # StopIteration; the name fallback below then applies.
        product['abs'] = next(iter(meta.values()), '')
    if not product['abs']:
        product['abs'] = product['name']

    introduction = extract_dictionary(meta, 'abstract|Abstract')
    product['intro'] = '\n'.join(introduction.values())

    # Whatever is left after removing the abstract and introduction entries
    # is stored as the market text. A key can match both filters, so only
    # delete it if it is still present.
    for extracted in (abstract, introduction):
        for key in extracted:
            if key in meta:
                del meta[key]
    product['asset']['market'] = dictionary_to_markdown(meta)

    product['contact'] = self.get_contact(response)
    product['addr'] = deepcopy(self.get_address(response))

    inventors = self.add_inventors(response)
    for user in inventors:
        user['abs'] = 'Inventor of ' + product['name']
        user['addr'] = product['addr']
        user['tag'] = product['tag']

    with open(os.path.join(self.work_directory, name + '.json'), 'w') as fo:
        json.dump({'product': product, 'inventors': inventors}, fo)
def parse(self, response):
    """Scrape one technology page and persist it as an HTML and a JSON file.

    The raw response body is written to ``<work_directory>/<name>.html`` and
    the extracted data to ``<work_directory>/<name>.json``, where ``name`` is
    the second-to-last ``/``-separated segment of ``response.url``. The JSON
    document holds the keys ``product`` and ``inventors``.

    Args:
        response: The fetched page; its ``url``, ``body`` and ``xpath`` are
            used here and it is forwarded to the ``self.get_*`` /
            ``self.add_*`` helpers.
    """
    self.log('Parse technology {}'.format(response.url), level=logging.INFO)
    name = response.url.split('/')[-2]
    with open(os.path.join(self.work_directory, name + '.html'), 'wb') as fo:
        fo.write(response.body)

    product = create_product()
    product['ref'] = response.url
    # NOTE(review): product['contact'] is replaced wholesale further down, so
    # this website value survives only if get_contact() re-adds it - confirm.
    product['contact']['website'] = response.url
    product['name'] = response.xpath("string(//h1)").get()

    meta = self.get_meta(response)
    abstract = extract_dictionary(meta, 'Advantage|advantage|Abstract')
    product['abs'] = '\n'.join(abstract.values())
    if not product['abs']:
        # The '' default keeps an empty meta dictionary from raising
        # StopIteration; the name fallback below then applies.
        product['abs'] = next(iter(meta.values()), '')
    if not product['abs']:
        product['abs'] = product['name']

    product['asset']['tech'] = dictionary_to_markdown(
        meta, ('Technology', ))
    product['asset']['market'] = dictionary_to_markdown(
        meta, ('Value Proposition', 'Value proposition'))

    # Remove everything already used above; the remainder is the intro.
    # Deletions are guarded so a key that is no longer present is skipped.
    consumed = list(abstract)
    consumed.extend(('Value Proposition', 'Value proposition', 'Technology'))
    for key in consumed:
        if key in meta:
            del meta[key]
    product['intro'] = dictionary_to_markdown(meta)

    product['asset']['type'] = 3
    product['addr'] = deepcopy(self.address)

    inventors = self.add_inventors(response)
    product['tag'] = self.add_tags(response)
    for user in inventors:
        user['abs'] = 'Inventor of ' + product['name']
        user['addr'] = product['addr']
        user['tag'] = product['tag']

    product['contact'] = self.get_contact(response)

    with open(os.path.join(self.work_directory, name + '.json'), 'w') as fo:
        json.dump({'product': product, 'inventors': inventors}, fo)