def parse(self, response):
    """Extract one item per listing row on the overview page and yield it."""
    soup = BeautifulSoup(response.body, features="html.parser")
    listings_container = soup.find('div', {'id': 'tabs-1'})
    for row in listings_container.findAll('div', class_='listing'):
        # Title node is needed twice (text + link), so look it up once.
        title_node = row.find('h2', class_='listing-title')
        item = BizscraperItem()
        item['Source'] = self.name
        item['Listing_Title'] = title_node.text.strip()
        item['Listing_URL'] = title_node.find('a').attrs['href']
        item['Listing_Description'] = row.find(
            'div', class_='listing-description').find('p').text.strip()
        item['Asking_Price'] = cleanItem(
            row.find('dd', class_='listing-overview-item--asking-price').text.strip())
        item['Cash_Flow'] = cleanItem(
            row.find('dd', class_='listing-overview-item--yearly-profit').text.strip())
        item['Scraped_At'] = datetime.now()
        item['Gross_Revenue'] = cleanItem(
            row.find('dd', class_='listing-overview-item--yearly-revenue').text.strip())
        # Category is encoded as the leading segment of the title ("Cat - ...").
        item['Category'] = item['Listing_Title'].split(' -')[0].strip()
        if item['Asking_Price'] and item['Cash_Flow'] and item['Cash_Flow'] > 0:
            item['Multiple'] = round(item['Asking_Price'] / item['Cash_Flow'], 3)
        yield item
def parse_detail(self, response):
    """Complete the item from response.meta with data from its detail page.

    Parses the left/right label table ('sb-table'), computes the
    asking-price / cash-flow multiple, and yields the finished item.
    """
    item = response.meta['item']
    soup = BeautifulSoup(response.body, features="html.parser")
    info = {}
    for row in soup.find('div', class_='sb-table').findAll('div', class_='line'):
        info[row.find('div', class_='left').text.strip()] = row.find(
            'div', class_='right').text.strip()
    # Guard the lookup: the original indexed info['Asking Price'] directly,
    # which raised KeyError on pages missing that row.
    raw_price = info.get('Asking Price')
    item['Asking_Price'] = cleanItem(raw_price) if raw_price is not None else None
    if 'Cash Flow' in info:
        item['Cash_Flow'] = cleanItem(info['Cash Flow'])
    else:
        item['Cash_Flow'] = None
    if item['Asking_Price'] and item['Cash_Flow'] and item['Cash_Flow'] > 0:
        item['Multiple'] = round(item['Asking_Price'] / item['Cash_Flow'], 3)
    else:
        item['Multiple'] = ''
    # Default every expected key so the direct lookups below cannot KeyError.
    for key in self.keys:
        if key not in info:
            info[key] = None
    item['Gross_Revenue'] = cleanItem(info['Gross Income'])
    item['Year_Established'] = info['Year Established']
    item['Employee_Count'] = info['Employees']
    item['Listing_Description'] = soup.find('div', class_='cfx').text.strip()
    yield item
def parse_list(self, response):
    """Walk the listing rows and queue a detail-page request per listing."""
    soup = BeautifulSoup(response.body, features="html.parser")
    for row in soup.findAll('div', class_='listing-row'):
        item_link = row.find('h2')
        item = BizscraperItem()
        item['Source'] = self.name
        item['Listing_Title'] = item_link.text.strip()
        item['Listing_URL'] = self.base_url + item_link.find('a').attrs['href']
        item['Category'] = row.find('div', class_='stats').find(
            'span', class_='bold').text.strip()
        # Plain domain sales are not business listings; skip them.
        if item['Category'] == 'Domain':
            continue
        # The price lives in one of two containers; take the first present.
        for pricing_class in ('domain-pricing', 'regular-pricing'):
            pricing_box = row.find('div', class_=pricing_class)
            if pricing_box:
                item['Asking_Price'] = cleanItem(
                    pricing_box.find('p').text.strip().replace('K', '000'))
                break
        yield scrapy.Request(url=item['Listing_URL'],
                             cookies=self.custom_cookie,
                             callback=self.parse_detail,
                             meta={'item': item},
                             method="GET")
def parse_list(self, response):
    """Scrape the ecommerce marketplace list page into annualized items."""
    soup = BeautifulSoup(response.body, features="html.parser")
    for row in soup.findAll('li', class_='_3B5De'):
        item_link = row.find('a', class_='_2OuU9')
        item = BizscraperItem()
        item['Source'] = self.name
        item['Listing_Title'] = item_link.text.strip()
        item['Listing_URL'] = self.base_url + item_link.attrs['href']
        item['Listing_Description'] = row.find(
            'div', class_='_3u-CT').find('p').text.strip()
        item['Scraped_At'] = datetime.now()
        # Stat cells hold monthly revenue, monthly profit and inventory,
        # in that order; values may be abbreviated with a trailing 'k'.
        stat_cells = row.findAll('div', class_='_3llXt')
        # NOTE(review): annualizing Inventory (*12) mirrors the revenue/
        # profit handling but looks like a copy-paste artifact — confirm.
        for idx, field in enumerate(('Gross_Revenue', 'Cash_Flow', 'Inventory')):
            raw_text = stat_cells[idx].text.strip()
            value = cleanItem(raw_text)
            if value:
                value = value * 12
                if 'k' in raw_text:
                    value = value * 1000
            item[field] = value
        item['Asking_Price'] = cleanItem(
            row.find('div', class_='_1Pa0r').find('div', class_='_1uCwB').text.strip())
        if item['Asking_Price'] and item['Cash_Flow'] and item['Cash_Flow'] > 0:
            item['Multiple'] = round(item['Asking_Price'] / item['Cash_Flow'], 3)
        item['Category'] = 'ecommerce'
        yield item
def parse_detail(self, response):
    """Fill revenue/profit, description and website from a detail page.

    Revenue/profit figures shown on the page are monthly and are
    annualized (*12) before being stored on the item.
    """
    item = response.meta['item']
    soup = BeautifulSoup(response.body, features="html.parser")
    item['Cash_Flow'] = None
    item['Gross_Revenue'] = None
    item['Multiple'] = None
    item['Scraped_At'] = datetime.now()
    premium = soup.find('div', class_='premium-blocks')
    if premium:
        for block in premium.findAll('div'):
            # findAll('div') also returns nested divs without an <h3>;
            # the original crashed with AttributeError on those.
            title_node = block.find('h3')
            if title_node is None:
                continue
            block_title = title_node.text
            if 'Monthly Revenue' in block_title:
                revenue = cleanItem(block.find('span').text.strip())
                item['Gross_Revenue'] = revenue * 12 if revenue is not None else None
            elif 'Monthly Profit' in block_title:
                profit = cleanItem(block.find('span').text.strip())
                item['Cash_Flow'] = profit * 12 if profit is not None else None
    if item['Asking_Price'] and item['Cash_Flow'] and item['Cash_Flow'] > 0:
        item['Multiple'] = round(item['Asking_Price'] / item['Cash_Flow'], 3)
    item['Listing_Description'] = ''
    containers = soup.findAll('div', class_='content-container')
    if containers:
        description_container = containers[0]
        paragraphs = description_container.findAll('p')
        if not paragraphs:
            # No <p> tags: use the raw container text minus the heading.
            description = description_container.text.strip().replace(
                'Listing Details', '').strip()
        else:
            description = ''
            for line in paragraphs:
                description = description + line.text + '\n'
        item['Listing_Description'] = description
    # Guard both the container and the 'title' attribute; either can be
    # absent and previously raised AttributeError / KeyError.
    link_pad = soup.find('div', class_='pad-3')
    if link_pad:
        for link in link_pad.findAll('a'):
            if 'Visit Website' in link.attrs.get('title', ''):
                item['Website'] = link.attrs['href']
    yield item
def parse(self, response):
    """Scrape list rows and dispatch one detail-page request per listing.

    Cash flow on the list page is monthly, so it is annualized (*12).
    """
    soup = BeautifulSoup(response.body, features="html.parser")
    # The original reused the name `item` for both the DOM row and the
    # scrapy item; keep the two distinct.
    for listing in soup.findAll('div', class_='listing-item'):
        wrapper = listing.find('div', class_='row')
        cols = wrapper.findAll('div', class_='col')
        item = BizscraperItem()
        item['Source'] = self.name
        item['Listing_Title'] = cols[2].find('h5').text.strip()
        item['Listing_URL'] = cols[2].find('a').attrs['href']
        item['Asking_Price'] = cleanItem(cols[4].text.strip())
        # cleanItem can return None; the original multiplied and divided
        # unconditionally (TypeError / ZeroDivisionError). Guard like the
        # sibling parse methods do.
        cash_flow = cleanItem(cols[5].text.strip())
        item['Cash_Flow'] = cash_flow * 12 if cash_flow is not None else None
        if item['Asking_Price'] and item['Cash_Flow'] and item['Cash_Flow'] > 0:
            item['Multiple'] = round(item['Asking_Price'] / item['Cash_Flow'], 3)
        else:
            item['Multiple'] = None
        item['Scraped_At'] = datetime.now()
        item['Category'] = cols[3].text.strip()
        yield scrapy.Request(url=item['Listing_URL'],
                             callback=self.parse_detail,
                             meta={'item': item},
                             method="GET")
def parse_detail(self, response):
    """Add annualized gross revenue and a description from the detail page."""
    item = response.meta['item']
    soup = BeautifulSoup(response.body, features="html.parser")
    summary = soup.find('div', class_='sites-summary_left')
    if summary:
        # Third <li> of the stats list carries "Monthly Revenue <amount>".
        raw_revenue = summary.find('ul').findAll('li')[2].text.strip().replace(
            'Monthly Revenue', '')
        revenue = cleanItem(raw_revenue)
        # cleanItem may return None for unparsable text; the original
        # multiplied unconditionally and raised TypeError.
        item['Gross_Revenue'] = revenue * 12 if revenue is not None else None
        # Description is the summary text minus the stats list text.
        item['Listing_Description'] = summary.text.replace(
            summary.find('ul').text, '')
    yield item
def parse_detail(self, response):
    """Build a complete item from a business-for-sale detail page."""
    soup = BeautifulSoup(response.body, features="html.parser")
    if soup.find('h1', class_='bfsTitle'):
        # Collect the headline spec blocks into a label -> value map,
        # pre-seeded with None for every expected key.
        info = {}
        for key in self.initial_keys:
            info[key] = None
        for spec_block in soup.findAll('div', class_='specs'):
            for piece in spec_block.findAll('p'):
                label = piece.find('span', class_='title').text.strip().replace(':', '')
                info[label] = piece.find('b').text.strip()
        # Pair each <dt> label with the <dd> value that follows it.
        detailed_info = soup.find('dl', class_='listingProfile_details')
        detailed_info_list = {}
        for key in self.detail_keys:
            detailed_info_list[key] = None
        pending = True
        label = ''
        if detailed_info:
            for row in detailed_info.findAll():
                if '<dt>' in str(row):
                    pending = True
                    label = row.text
                if '<dd>' in str(row) and pending:
                    pending = False
                    detailed_info_list[label.replace(':', '')] = row.text
        item = BizscraperItem()
        item['Source'] = self.name
        item['Listing_URL'] = response.meta['detail_url']
        item['Listing_Title'] = soup.find('h1', class_='bfsTitle').text.strip()
        item['Listing_Description'] = soup.find(
            'div', class_='businessDescription').text.strip()
        item['Seller_Financing'] = bool(soup.find('div', {'id': 'seller-financing'}))
        item['Asking_Price'] = cleanItem(info['Asking Price'])
        item['Cash_Flow'] = cleanItem(info['Cash Flow'])
        if item['Asking_Price'] and item['Cash_Flow'] and item['Cash_Flow'] > 0:
            item['Multiple'] = round(item['Asking_Price'] / item['Cash_Flow'], 3)
        item['Gross_Revenue'] = cleanItem(info['Gross Revenue'])
        item['EBITDA'] = cleanItem(info['EBITDA'])
        item['FF_E'] = cleanItem(info['FF&E'])
        item['Inventory'] = cleanItem(info['Inventory'])
        item['Year_Established'] = info['Established']
        item['Employee_Count'] = detailed_info_list['Employees']
        item['Website'] = detailed_info_list['Business Website']
        item['Scraped_At'] = datetime.now()
        if soup.find('div', {'id': 'others'}):
            others = soup.find('div', {'id': 'others'}).findAll('a')
            if len(others) > 0:
                item['Category'] = others[0].text.replace('for Sale', '').strip()
        location = response.meta['location']
        if location and location != '':
            parts = location.split(',')
            if len(parts) == 2:
                item['Location_County'] = parts[0].strip()
                item['Location_State'] = parts[1].strip()
            else:
                item['Location_State'] = parts[0].strip()
        yield item
def parse_detail(self, response):
    """Finish the item carried in response.meta using its detail page.

    Parses the <b>-tag summary column (label tags followed by value
    tags) and the first horizontal <dl> of label/value details, then
    fills the remaining item fields.
    """
    soup = BeautifulSoup(response.body, features="html.parser")
    # --- summary column: <b> tags alternate between labels and values ---
    initial_info = {}
    initial_wrapper = soup.find('div', class_='col-md-3')
    initial_key = ''
    for key in self.initial_keys:
        initial_info[key] = None
    for node in initial_wrapper.findAll('b'):
        is_value = True
        for key in self.initial_keys:
            if key in node.text:
                # This <b> is a label; remember it for the next value tag.
                initial_key = key
                is_value = False
                break
        if is_value:
            initial_info[initial_key] = node.text.strip().replace(
                'included in asking price', ' included in asking price')
    initial_info['Asking Price'] = cleanItem(initial_info['Asking Price'])
    initial_info['Cash Flow'] = cleanItem(initial_info['Cash Flow'])
    if initial_info['Asking Price'] and initial_info[
            'Cash Flow'] and initial_info['Cash Flow'] > 0:
        initial_info['Multiple'] = round(
            initial_info['Asking Price'] / initial_info['Cash Flow'], 3)
    # --- detail list: pair each <dt> label with the following <dd> ---
    detail_info = {}
    detail_wrapper = soup.findAll('dl', class_='dl-horizontal')[0]
    for key in self.detail_keys:
        detail_info[key] = None
    # Reset the pairing state explicitly: the original leaked `flag` and
    # `key` from the loops above, which could store a leading stray <dd>
    # under the wrong label.
    flag = False
    key = ''
    for row in detail_wrapper.findAll():
        if '<dt>' in str(row):
            flag = True
            key = row.text.strip()
        elif '<dd>' in str(row) and flag:
            flag = False
            detail_info[key.replace(':', '')] = row.text.strip()
    item = response.meta['item']
    item['Seller_Financing'] = False
    financing = soup.find('div', class_='financing')
    if financing and 'Seller Financing' in financing.text:
        item['Seller_Financing'] = True
    breadcrumbs = soup.find('ol', {'id': 'crumbs'}).findAll('li')
    item['Category'] = None
    if len(breadcrumbs) > 2:
        item['Category'] = breadcrumbs[2].text.replace(
            'Businesses for Sale', '').strip()
    item['Asking_Price'] = initial_info['Asking Price']
    item['Cash_Flow'] = initial_info['Cash Flow']
    item['Gross_Revenue'] = cleanItem(initial_info['Gross Revenue'])
    item['EBITDA'] = cleanItem(initial_info['EBITDA'])
    item['FF_E'] = cleanItem(initial_info['FF&E'])
    item['Inventory'] = cleanItem(initial_info['Inventory'])
    # .get() avoids a KeyError when the multiple could not be computed
    # and 'Multiple' is not one of self.initial_keys.
    item['Multiple'] = initial_info.get('Multiple')
    item['Year_Established'] = detail_info['Year Established']
    item['Employee_Count'] = detail_info['Number of Employees']
    item['Website'] = detail_info['Website']
    if detail_info['Location']:
        parts = detail_info['Location'].split(',')
        if len(parts) == 2:
            item['Location_County'] = parts[0].strip()
            item['Location_State'] = parts[1].strip()
        else:
            item['Location_State'] = parts[0].strip()
    yield item