class _TweetLoader(ItemLoader):
  """Item loader for a tweet, with per-field input/output processors.

  Unless a field declares its own ``<field>_in`` / ``<field>_out`` below,
  input values are mapped through ``gettext`` and output values are joined
  with an empty separator.

  ``user`` and ``body`` default to ``None`` and have to be replaced with
  objects exposing ``load_item()`` before :meth:`load_item` is called.
  """

  # CSS selector string; not referenced anywhere else in this class body.
  CSS_BASE = 'div[role="main"] div.permalink-inner'

  # Fallback processors for fields without a dedicated entry.
  default_input_processor = proc.MapCompose(gettext)
  default_output_processor = proc.Join('')

  # Timestamp: input goes through TakeFirst; output is ``only`` then
  # ``to_date``, composed in that order.
  created_at_in = proc.TakeFirst()
  created_at_out = proc.Compose(only, to_date)

  # Nested values are stored untouched and reduced with ``only`` on output.
  user_in = proc.Identity()
  user_out = proc.Compose(only)

  body_in = proc.Identity()
  body_out = proc.Compose(only)

  media_in = proc.Identity()
  media_out = proc.Compose(only)

  # Both relation fields share one input processor instance and one
  # pass-through output processor instance.
  parents_in = children_in = proc.Compose(extract_tweet_meta)
  parents_out = children_out = proc.Identity()

  # All three counters share a single ``Compose(only)`` input processor.
  retweet_count_in = favorite_count_in = reply_count_in = proc.Compose(only)

  label_in = proc.Identity()
  label_out = proc.TakeFirst()

  # Placeholders; ``user`` and ``body`` are read by load_item().
  # ``stats`` is not used by any method visible in this class.
  user = None
  body = None
  stats = None

  def load_item(self):
    """Load ``user`` and ``body``, add both as values, then load this item.

    Raises ``AttributeError`` if ``user`` or ``body`` is still ``None``.
    """
    user_item = self.user.load_item()
    self.add_value('user', user_item)

    body_item = self.body.load_item()
    self.add_value('body', body_item)

    return super().load_item()

  def parse_iframe(self, response):
    """Build a ``Media`` item from ``response`` and yield the loaded item.

    The media loader is scoped to the ``.SummaryCard-content`` selection;
    its result is added under ``media`` before this loader is loaded.
    """
    # NOTE(review): ``add_selector`` is not defined on the MediaItemLoader
    # visible in this file; presumably it comes from the ItemLoader base in
    # use -- confirm.
    card = response.css('.SummaryCard-content')
    media = MediaItemLoader(Media(), card)

    media.add_value('url', response.url)
    selector_fields = (
      ('content', 'p::text'),
      ('title', 'h2.TwitterCard-title::text'),
      ('content_source', '.SummaryCard-content span::text'),
      ('data_source', '::attr(data-src)'),
    )
    for field_name, css in selector_fields:
      media.add_selector(field_name, css)

    self.add_value('media', media.load_item())
    yield self.load_item()
Beispiel #2
0
class PisoFiCrawlerItemLoader(ItemLoader):
    """Loader that populates ``PisoFiCrawlerItem`` instances.

    Inputs default to ``strip_whitespace`` and outputs default to
    ``TakeFirst``; ``name``, ``ip`` and ``description`` have their own
    output processors.
    """

    default_item_class = PisoFiCrawlerItem

    # Fallbacks for fields without a dedicated processor.
    default_input_processor = strip_whitespace
    default_output_processor = processors.TakeFirst()

    # Field-specific output processors.
    description_out = description_output_processor
    ip_out = ip_output_processor
    name_out = name_output_processor
Beispiel #3
0
class SalesTransactionCrawlerItemLoader(ItemLoader):
    """Loader that populates ``SalesTransactionCrawlerItem`` instances.

    Inputs default to ``strip_whitespace`` and outputs default to
    ``TakeFirst``; ``mac_addr``, ``transaction_type``, ``vendo`` and
    ``amount`` have their own output processors.
    """

    default_item_class = SalesTransactionCrawlerItem

    # Fallbacks for fields without a dedicated processor.
    default_input_processor = strip_whitespace
    default_output_processor = processors.TakeFirst()

    # Field-specific output processors.
    amount_out = amount_output_processor
    mac_addr_out = mac_addr_output_processor
    transaction_type_out = transaction_type_output_processor
    vendo_out = vendo_output_processor
Beispiel #4
0
    def parse_advisors(self, response):
        """Yield one ``BoardMember`` item per ``<li>`` under ``div.advisors``.

        Each item carries the company URL read from the profile header link,
        plus the ``person_url`` and ``title`` extracted from the list entry.
        """
        header_href = response.xpath(
            '//*[@id="profile_header_heading"]/a/@href')
        company_url = header_href.extract_first()

        advisor_rows = response.css('div.advisors').xpath('.//ul/li')
        for row in advisor_rows:
            member = ItemLoader(item=BoardMember(), selector=row)
            # Strip markup from every value and keep only the first result.
            member.default_input_processor = processors.MapCompose(
                w3lib.html.remove_tags)
            member.default_output_processor = processors.TakeFirst()

            member.add_value('company_url', company_url)
            member.add_xpath('person_url', './/h4/a/@href')
            member.add_xpath('title', './/h5/text()')
            yield member.load_item()
Beispiel #5
0
    def parse_partners(self, response):
        """Yield one ``Partner`` item per partner link under ``div.partners``.

        ``focal_company_url`` is the profile header link of the page being
        parsed; ``partner_url`` is the ``href`` of each partner anchor.
        """
        header_href = response.xpath(
            '//*[@id="profile_header_heading"]/a/@href')
        focal_url = header_href.extract_first()

        partner_links = response.css('div.partners').xpath('.//ul/li//h4/a')
        for link in partner_links:
            partner = ItemLoader(item=Partner(), selector=link)
            # Strip markup from every value and keep only the first result.
            partner.default_input_processor = processors.MapCompose(
                w3lib.html.remove_tags)
            partner.default_output_processor = processors.TakeFirst()

            partner.add_value('focal_company_url', focal_url)
            partner.add_xpath('partner_url', '@href')
            yield partner.load_item()
Beispiel #6
0
    def parse_acquisitions(self, response):
        """Yield one ``Acquisition`` item per data row under ``div.acquisitions``.

        Rows containing a ``<th>`` are excluded by the XPath. The first cell
        supplies ``date`` and the link in the second cell ``acquired_url``.
        """
        header_href = response.xpath(
            '//*[@id="profile_header_heading"]/a/@href')
        focal_url = header_href.extract_first()

        data_rows = response.css('div.acquisitions').xpath('.//tr[not(th)]')
        for row in data_rows:
            acquisition = ItemLoader(item=Acquisition(), selector=row)
            # Strip markup from every value and keep only the first result.
            acquisition.default_input_processor = processors.MapCompose(
                w3lib.html.remove_tags)
            acquisition.default_output_processor = processors.TakeFirst()

            acquisition.add_value('focal_company_url', focal_url)
            acquisition.add_xpath('date', 'td[1]/text()')
            acquisition.add_xpath('acquired_url', 'td[2]/a/@href')
            yield acquisition.load_item()
Beispiel #7
0
    def parse_organization(self, response):
        """Yield the ``Organization`` item for ``response``, then related items.

        The organization is loaded from the profile header, the
        ``div.definition-list`` term/definition pairs and the first
        facebook/twitter/linkedin links. Afterwards every item produced by
        the acquisitions, employees, competitors, partners and advisors
        parsers for the same response is yielded, in that order.
        """
        loader = ItemLoader(item=Organization(), response=response)
        # Strip markup from every value and keep only the first result.
        loader.default_input_processor = processors.MapCompose(
            w3lib.html.remove_tags)
        loader.default_output_processor = processors.TakeFirst()

        loader.add_xpath('name', '//*[@id="profile_header_heading"]/a/text()')
        loader.add_value('url', response.url)
        # loader.add_value('ipo_stock', None) # TODO!

        # TODO: supposed to get person url for founders!
        # Fields expected: headquarters, description, founders, categories,
        # website, founded (date), and aliases
        definitions = response.css('div.definition-list')
        labels = definitions.xpath('dt/text()')
        values = definitions.xpath('dd')
        # zip() pairs each <dt> with the <dd> at the same position and stops
        # at the shorter list, so a missing <dd> no longer raises IndexError.
        for label, value in zip(labels, values):
            # partition() keeps the whole label when it has no colon; slicing
            # with find() dropped the last character in that case because
            # find() returns -1.
            key = label.extract().partition(':')[0].lower()
            try:
                loader.add_value(key, value.extract())
            except KeyError:
                # Ignore if key is not in the Item's field
                pass

        loader.add_xpath('facebook',
                         '(//a[contains(@class,"facebook")])[1]/@href')
        loader.add_xpath('twitter',
                         '(//a[contains(@class,"twitter")])[1]/@href')
        loader.add_xpath('linkedin',
                         '(//a[contains(@class,"linkedin")])[1]/@href')

        yield loader.load_item()

        for item in self.parse_acquisitions(response):
            yield item
        for item in self.parse_employees(response):
            yield item
        for item in self.parse_competitors(response):
            yield item
        for item in self.parse_partners(response):
            yield item
        for item in self.parse_advisors(response):
            yield item
Beispiel #8
0
    def parse_person(self, response):
        """Return the ``Person`` item loaded from ``response``.

        Collects the name, URL and primary role, the labelled entries of the
        overview card, the first facebook/twitter/linkedin links, the
        description, and the job, advisory, investment and education
        sections.
        """
        loader = ItemLoader(item=Person(), response=response)
        # Strip markup from every value and keep only the first result.
        loader.default_input_processor = processors.MapCompose(
            w3lib.html.remove_tags)
        loader.default_output_processor = processors.TakeFirst()

        loader.add_xpath('name', '//*[@id="profile_header_heading"]/a/text()')
        loader.add_value('url', response.url)
        loader.add_xpath(
            'primary_role',
            '//*[@id="info-card-overview-content"]/div/dl/div/dd')

        # Fields expected: born, gender, location, website
        overview = response.xpath(
            '//*[@id="info-card-overview-content"]/div/dl/dt/text()')
        overview_loader = loader.nested_xpath(
            '//*[@id="info-card-overview-content"]/div/dl')
        # XPath positions are 1-based, so the n-th label maps to dd[n].
        for position, label in enumerate(overview, start=1):
            # partition() keeps the whole label when it has no colon; slicing
            # with find() dropped the last character in that case because
            # find() returns -1.
            key = label.extract().partition(':')[0].lower()
            try:
                overview_loader.add_xpath(
                    key, 'dd[{}]/text()'.format(position))
            except KeyError:
                # Ignore if key is not in the Item's field
                pass

        loader.add_xpath('facebook',
                         '(//a[contains(@class,"facebook")])[1]/@href')
        loader.add_xpath('twitter',
                         '(//a[contains(@class,"twitter")])[1]/@href')
        loader.add_xpath('linkedin',
                         '(//a[contains(@class,"linkedin")])[1]/@href')
        loader.add_xpath('description', '//*[@id="description"]/span/div')
        loader.add_css('current_jobs', '.current_job')
        loader.add_css('past_jobs', '.past_job')
        loader.nested_css('.advisory_roles').add_xpath('board_advisors',
                                                       './/ul/li')
        loader.nested_css('table.investors').add_xpath(
            'investments', './/tr[not(@class="thead")]')
        loader.nested_css('.education').add_xpath('education', './/ul/li')

        return loader.load_item()
Beispiel #9
0
class PublicationLoader(ItemLoader):
    """Item loader whose default output processor is ``TakeFirst``."""

    # No field-specific processors are declared; every field uses this one.
    default_output_processor = processors.TakeFirst()
class LinkLoader(ItemLoader):
    """Item loader with a dedicated ``title`` input processor.

    Outputs default to ``TakeFirst``.
    """

    # Each title value goes through ``helpers.rws`` and then
    # ``helpers.beautify_romanian``.
    title_in = processors.MapCompose(
        helpers.rws,
        helpers.beautify_romanian,
    )

    default_output_processor = processors.TakeFirst()
Beispiel #11
0
class ECBItemLoader(ItemLoader):
    """Item loader whose default output processor is ``TakeFirst``."""

    # No field-specific processors are declared; every field uses this one.
    default_output_processor = processors.TakeFirst()
class StatusLoader(ItemLoader):
    """Item loader with a dedicated ``status`` input processor.

    Outputs default to ``TakeFirst``.
    """

    # Each status value goes through ``helpers.rws`` and then
    # ``helpers.beautify_romanian``.
    status_in = processors.MapCompose(
        helpers.rws,
        helpers.beautify_romanian,
    )

    default_output_processor = processors.TakeFirst()
Beispiel #13
0
class ActivityLoader(ItemLoader):
    """Item loader with dedicated ``name`` and ``dictionary`` input processors.

    Outputs default to ``TakeFirst``.
    """

    # ``dictionary`` values are stored exactly as supplied.
    dictionary_in = processors.Identity()

    # Each name value goes through ``helpers.rws`` and then
    # ``helpers.beautify_romanian``.
    name_in = processors.MapCompose(
        helpers.rws,
        helpers.beautify_romanian,
    )

    default_output_processor = processors.TakeFirst()
class PleneryTimeLoader(ItemLoader):
    """Item loader with a dedicated ``name`` input processor.

    Outputs default to ``TakeFirst``.
    """

    # Each name value goes through ``helpers.rws`` and then
    # ``helpers.beautify_romanian``.
    name_in = processors.MapCompose(
        helpers.rws,
        helpers.beautify_romanian,
    )

    default_output_processor = processors.TakeFirst()
class MediaItemLoader(ItemLoader):
  """Item loader for media items.

  Inputs are mapped through ``gettext`` and outputs reduced with
  ``TakeFirst``, except ``url`` whose input is stored unchanged.
  """

  # ``url`` input values bypass ``gettext``.
  url_in = proc.Identity()

  # Fallbacks for every other field.
  default_input_processor = proc.MapCompose(gettext)
  default_output_processor = proc.TakeFirst()
class UserItemLoader(ItemLoader):
  """Item loader for user items.

  Output values are joined with an empty separator, except
  ``is_verified`` which uses ``TakeFirst``.
  """

  # ``is_verified`` opts out of the joined-string default.
  is_verified_out = proc.TakeFirst()

  default_output_processor = proc.Join('')