Exemple #1
    def parse(self, response):
        """Extract one technology page into a product record and save it.

        The raw page is written to ``<name>.html`` and the extracted data
        (product, inventors, patents, publications) to ``<name>.json`` inside
        ``self.work_directory``; ``<name>`` is the last path segment of
        ``response.url``.

        NOTE(review): the log level, the ``fo.write(response.body)`` line and
        the ``json.dump(`` opener were missing from the reviewed listing and
        were restored following the sibling ``parse`` methods -- confirm
        against the upstream file.

        :param response: the downloaded technology page (presumably a Scrapy
            response exposing ``url``, ``body`` and ``xpath`` -- TODO confirm).
        """
        # Imported locally because the visible lines of this method do not
        # otherwise show these modules in scope.
        import json
        import logging

        self.log('Parse technology {}'.format(response.url),
                 level=logging.INFO)
        name = response.url.split('/')[-1]
        with open(os.path.join(self.work_directory, name + '.html'),
                  'wb') as fo:
            fo.write(response.body)

        product = create_product()
        product['name'] = response.xpath("//h1[@id='page-title']/text()").get()
        product['ref'] = response.url
        # NOTE(review): 'contact' is replaced wholesale further down, so this
        # website value only survives if get_contact() sets it again.
        product['contact']['website'] = response.url
        product['addr'] = deepcopy(self.address)
        product['asset']['type'] = 3

        description = self.get_description(response)
        abstract = extract_dictionary(description, 'Applications')
        product['abs'] = '\n'.join(abstract.values())
        if len(product['abs']) < 1:
            # Fall back to the first section; the default avoids a
            # StopIteration when the page yielded no description at all.
            product['abs'] = next(iter(description.values()), '')
        if len(product['abs']) < 1:
            product['abs'] = product['name']
        market = extract_dictionary(description, 'Advantages')
        product['asset']['market'] = '\n'.join(market.values())
        tech = extract_dictionary(description, 'Technology')
        product['asset']['tech'] = '\n'.join(tech.values())

        # Whatever was not routed to abs/market/tech becomes the introduction.
        # pop() tolerates a key that matched more than one filter.
        for extracted in (abstract, market, tech):
            for key in extracted:
                description.pop(key, None)
        product['intro'] = dictionary_to_markdown(description)
        product['tag'] = self.add_keywords(response)
        product['contact'] = self.get_contact(response)

        inventors = self.add_inventors(response)
        for user in inventors:
            if len(user['abs']) < 1:
                user['abs'] = 'Inventor of ' + product['name']
            user['addr'] = product['addr']

        patents = self.get_patents(response)
        publications = self.get_publications(response)
        # The extension needs its dot: the original wrote "<name>json".
        with open(os.path.join(self.work_directory, name + '.json'),
                  'w') as fo:
            json.dump({
                'product': product,
                'inventors': inventors,
                'patents': patents,
                'publications': publications
            }, fo)
Exemple #2
    def parse(self, response):
        """Extract one technology page into a product record and save it.

        The raw page is written to ``<name>.html`` and the extracted data
        (product, inventors, contact) to ``<name>.json`` inside
        ``self.work_directory``; ``<name>`` comes from
        ``self.parse_name_from_url``. Section routing is driven by the
        ``self.abstract_filter``, ``self.market_filter`` and
        ``self.tech_filter`` attributes.

        NOTE(review): the log level, the ``fo.write(response.body)`` line and
        the ``json.dump(`` opener were missing from the reviewed listing and
        were restored following the sibling ``parse`` methods -- confirm
        against the upstream file.

        :param response: the downloaded technology page (presumably a Scrapy
            response exposing ``url``, ``body`` and ``xpath`` -- TODO confirm).
        """
        # Imported locally because the visible lines of this method do not
        # otherwise show these modules in scope.
        import json
        import logging

        self.log('Parse technology {}'.format(response.url),
                 level=logging.INFO)
        name = self.parse_name_from_url(response.url)
        with open(os.path.join(self.work_directory, name + '.html'),
                  'wb') as fo:
            fo.write(response.body)

        product = create_product()
        product['ref'] = response.url
        # NOTE(review): 'contact' is replaced wholesale further down, so this
        # website value only survives if get_contact() sets it again.
        product['contact']['website'] = response.url
        product['name'] = response.xpath(self.title_xpath).get()

        meta = self.get_meta(response)
        abstract = extract_dictionary(meta, self.abstract_filter)
        product['abs'] = '\n'.join(abstract.values())
        market = extract_dictionary(meta, self.market_filter)
        product['asset']['market'] = '\n'.join(market.values())
        tech = extract_dictionary(meta, self.tech_filter)
        product['asset']['tech'] = '\n'.join(tech.values())

        # Whatever was not routed to abs/market/tech becomes the introduction.
        # pop() tolerates a key that matched more than one filter.
        for extracted in (market, tech, abstract):
            for key in extracted:
                meta.pop(key, None)
        product['intro'] = dictionary_to_markdown(meta)
        product['asset']['type'] = 3
        product['addr'] = deepcopy(self.address)
        product['tag'] = self.add_tags(response)

        inventors = self.add_inventors(response)
        for user in inventors:
            user['abs'] = 'Inventor of ' + product['name']
            user['addr'] = product['addr']
            user['tag'] = product['tag']

        # The contact is stored as a user-like record of its own; only its
        # nested 'contact' entry is attached to the product.
        contact = self.get_contact(response)
        contact['abs'] = 'Inventor of ' + product['name']
        contact['addr'] = product['addr']
        contact['tag'] = product['tag']
        product['contact'] = contact['contact']

        with open(os.path.join(self.work_directory, name + '.json'),
                  'w') as fo:
            json.dump({
                'product': product,
                'inventors': inventors,
                'contact': contact
            }, fo)
Exemple #3
    def parse(self, response):
        """Extract one technology page into a product record and save it.

        The raw page is written to ``<name>.html`` and the extracted data
        (product, inventors) to ``<name>.json`` inside
        ``self.work_directory``; ``<name>`` comes from
        ``self.parse_name_from_url``.

        NOTE(review): the log level and the ``fo.write(response.body)`` line
        were missing from the reviewed listing and were restored following
        the sibling ``parse`` methods -- confirm against the upstream file.

        :param response: the downloaded technology page (presumably a Scrapy
            response exposing ``url``, ``body`` and ``xpath`` -- TODO confirm).
        """
        # Imported locally because the visible lines of this method do not
        # otherwise show this module in scope.
        import logging

        self.log('Parse technology {}'.format(response.url),
                 level=logging.INFO)
        name = self.parse_name_from_url(response.url)
        with open(os.path.join(self.work_directory, name + '.html'),
                  'wb') as fo:
            fo.write(response.body)

        product = create_product()
        product['ref'] = response.url
        # NOTE(review): 'contact' is replaced wholesale further down, so this
        # website value only survives if get_contact() sets it again.
        product['contact']['website'] = response.url
        product['name'] = response.xpath("string(//h1[@class='title'])").get()

        meta = self.get_meta(response)
        market = extract_dictionary(meta, 'Applications')
        product['asset']['market'] = '\n'.join(market.values())
        for key in market:
            del meta[key]
        product['intro'] = dictionary_to_markdown(meta)
        # Abstract = first sentence of the introduction. When no '. ' is
        # found, find() returns -1, the slice is empty, and the whole
        # introduction is used instead.
        product['abs'] = product['intro'][:product['intro'].find('. ') + 1]
        if len(product['abs']) < 1:
            product['abs'] = product['intro']
        product['asset']['type'] = 3
        product['addr'] = deepcopy(self.address)

        inventors = self.add_inventors(response)
        for user in inventors:
            user['abs'] = 'Inventor of ' + product['name']
            user['addr'] = product['addr']
            # NOTE(review): product['tag'] is never assigned in this method;
            # this relies on create_product() providing a 'tag' entry.
            user['tag'] = product['tag']
        product['contact'] = self.get_contact(response)

        with open(os.path.join(self.work_directory, name + '.json'),
                  'w') as fo:
            json.dump({'product': product, 'inventors': inventors}, fo)
Exemple #4
    def parse(self, response):
        """Extract one technology page into a product record and save it.

        The raw page is written under its own name (the last path segment of
        ``response.url``) and the extracted data (product, contact person,
        inventors, patents) next to it with a ``.json`` extension, both
        inside ``self.work_directory``.

        NOTE(review): the ``fo.write(response.body)`` line was missing from
        the reviewed listing and was restored -- confirm against the
        upstream file.

        :param response: the downloaded technology page (presumably a Scrapy
            response exposing ``url``, ``body`` and ``xpath`` -- TODO confirm).
        """
        self.log('Parse technology {}'.format(response.url), level=logging.INFO)
        name = response.url.split('/')[-1]
        with open(os.path.join(self.work_directory, name), 'wb') as fo:
            fo.write(response.body)

        product = create_product()
        product['name'] = response.xpath(
            "//h1[@class='tech-heading tech-heading-main']/text()").get()
        product['ref'] = response.url
        # NOTE(review): 'contact' is replaced wholesale further down, so this
        # website value only survives if get_contact() sets it again.
        product['contact']['website'] = response.url
        product['addr'] = deepcopy(self.address)
        product['asset']['type'] = 3

        description = self.get_description(response)
        abstract = extract_dictionary(description, 'brief|Brief|BRIEF')
        product['abs'] = '\n'.join(abstract.values())
        if len(product['abs']) < 1:
            # Fall back to the first section; the default avoids a
            # StopIteration when the page yielded no description at all.
            product['abs'] = next(iter(description.values()), '')
        if len(product['abs']) < 1:
            product['abs'] = product['name']
        introduction = extract_dictionary(description, 'full|Full|FULL')
        product['intro'] = '\n'.join(introduction.values())

        # The remaining sections are published as the market description.
        # pop() tolerates a key that matched both filters.
        for extracted in (abstract, introduction):
            for key in extracted:
                description.pop(key, None)
        product['asset']['market'] = dictionary_to_markdown(description)
        product['contact'] = self.get_contact(response)
        product['tag'] = self.add_keywords(response)

        contact_person = self.get_contact_person(response)
        contact_person['abs'] = 'Person of Contact for ' + product['name']
        contact_person['addr'] = product['addr']
        contact_person['contact'] = product['contact']
        contact_person['tag'] = product['tag']

        inventors = self.add_inventors(response)
        for user in inventors:
            user['abs'] = 'Inventor of ' + product['name']
            user['addr'] = product['addr']
            user['tag'] = product['tag']

        patents = self.get_patents(response)
        # splitext gives the same result as the original name[:-4] + 'json'
        # for "*.html" names, without mangling names that have another (or
        # no) extension.
        json_name = os.path.splitext(name)[0] + '.json'
        with open(os.path.join(self.work_directory, json_name), 'w') as fo:
            json.dump({'product': product,
                       'contact': contact_person,
                       'inventors': inventors,
                       'patents': patents}, fo)
Exemple #5
    def parse(self, response):
        """Extract one project page into a product record and save it.

        The raw page is written to ``<name>.html`` and the extracted data
        (product, inventors) to ``<name>.json`` inside
        ``self.work_directory``; ``<name>`` is the last path segment of
        ``response.url``. Title, posted date, tags and banner are read from
        the dictionary returned by ``self.get_meta``.

        NOTE(review): the ``fo.write(response.body)`` line and the guard
        around the ``product['created']`` assignment were missing from the
        reviewed listing and were reconstructed -- confirm against the
        upstream file.

        :param response: the downloaded project page (presumably a Scrapy
            response exposing ``url`` and ``body`` -- TODO confirm).
        """
        self.log('Parse technology {}'.format(response.url), level=logging.INFO)
        name = response.url.split('/')[-1]
        with open(os.path.join(self.work_directory, name + '.html'), 'wb') as fo:
            fo.write(response.body)

        product = create_product()
        product['ref'] = response.url
        # NOTE(review): 'contact' is replaced wholesale further down, so this
        # website value only survives if get_contact() sets it again.
        product['contact']['website'] = response.url

        meta = self.get_meta(response)
        product['name'] = meta['Project Title']
        # Here `parse` resolves to a module-level date parser, not this
        # method (presumably dateutil.parser.parse -- TODO confirm).
        # NOTE(review): the original condition was lost; a missing or empty
        # posted date is skipped.
        if meta.get('Posted Date'):
            product['created'] = parse(meta['Posted Date']).strftime(
                "%a, %d %b %Y %H:%M:%S GMT")
        product['tag'] = meta['Tags']
        if len(meta['banner']) > 0:
            product['logo'] = meta['banner'][0]
        product['asset']['type'] = 3

        abstract = extract_dictionary(meta, 'brief|Brief|BRIEF|Short')
        product['abs'] = '\n'.join(abstract.values())
        if len(product['abs']) < 1:
            # Fall back to the first meta value; the default avoids a
            # StopIteration when meta is empty.
            product['abs'] = next(iter(meta.values()), '')
        if len(product['abs']) < 1:
            product['abs'] = product['name']
        introduction = extract_dictionary(meta, 'abstract|Abstract')
        product['intro'] = '\n'.join(introduction.values())

        # The remaining entries are published as the market description.
        # pop() tolerates a key that matched both filters.
        for extracted in (abstract, introduction):
            for key in extracted:
                meta.pop(key, None)
        product['asset']['market'] = dictionary_to_markdown(meta)
        product['contact'] = self.get_contact(response)
        product['addr'] = deepcopy(self.get_address(response))

        inventors = self.add_inventors(response)
        for user in inventors:
            user['abs'] = 'Inventor of ' + product['name']
            user['addr'] = product['addr']
            user['tag'] = product['tag']

        with open(os.path.join(self.work_directory, name + '.json'), 'w') as fo:
            json.dump({'product': product, 'inventors': inventors}, fo)
Exemple #6
    def parse(self, response):
        """Extract one technology page into a product record and save it.

        The raw page is written to ``<name>.html`` and the extracted data
        (product, inventors) to ``<name>.json`` inside
        ``self.work_directory``; ``<name>`` is the second-to-last
        '/'-separated segment of ``response.url``.

        NOTE(review): the log level and the ``fo.write(response.body)`` line
        were missing from the reviewed listing and were restored following
        the sibling ``parse`` methods -- confirm against the upstream file.

        :param response: the downloaded technology page (presumably a Scrapy
            response exposing ``url``, ``body`` and ``xpath`` -- TODO confirm).
        """
        # Imported locally because the visible lines of this method do not
        # otherwise show this module in scope.
        import logging

        self.log('Parse technology {}'.format(response.url),
                 level=logging.INFO)
        name = response.url.split('/')[-2]
        with open(os.path.join(self.work_directory, name + '.html'),
                  'wb') as fo:
            fo.write(response.body)

        product = create_product()
        product['ref'] = response.url
        # NOTE(review): 'contact' is replaced wholesale further down, so this
        # website value only survives if get_contact() sets it again.
        product['contact']['website'] = response.url
        product['name'] = response.xpath("string(//h1)").get()

        meta = self.get_meta(response)
        abstract = extract_dictionary(meta, 'Advantage|advantage|Abstract')
        product['abs'] = '\n'.join(abstract.values())
        if len(product['abs']) < 1:
            # Fall back to the first meta value; the default avoids a
            # StopIteration when meta is empty.
            product['abs'] = next(iter(meta.values()), '')
        if len(product['abs']) < 1:
            product['abs'] = product['name']
        product['asset']['tech'] = dictionary_to_markdown(
            meta, ('Technology', ))
        product['asset']['market'] = dictionary_to_markdown(
            meta, ('Value Proposition', 'Value proposition'))

        # Drop everything already published elsewhere; the rest becomes the
        # introduction. pop() tolerates keys that are absent.
        for key in abstract:
            meta.pop(key, None)
        for key in ('Value Proposition', 'Value proposition', 'Technology'):
            meta.pop(key, None)
        product['intro'] = dictionary_to_markdown(meta)
        product['asset']['type'] = 3
        product['addr'] = deepcopy(self.address)

        inventors = self.add_inventors(response)
        product['tag'] = self.add_tags(response)
        for user in inventors:
            user['abs'] = 'Inventor of ' + product['name']
            user['addr'] = product['addr']
            user['tag'] = product['tag']
        product['contact'] = self.get_contact(response)

        with open(os.path.join(self.work_directory, name + '.json'),
                  'w') as fo:
            json.dump({'product': product, 'inventors': inventors}, fo)