def parse(self, response):
    """Extract one item per listing row on the overview page and yield it."""
    soup = BeautifulSoup(response.body, features="html.parser")
    listings_container = soup.find('div', {'id': 'tabs-1'})
    for row in listings_container.findAll('div', class_='listing'):
        # Title node is needed twice (text + link), so look it up once.
        title_node = row.find('h2', class_='listing-title')
        item = BizscraperItem()
        item['Source'] = self.name
        item['Listing_Title'] = title_node.text.strip()
        item['Listing_URL'] = title_node.find('a').attrs['href']
        item['Listing_Description'] = row.find(
            'div', class_='listing-description').find('p').text.strip()
        item['Asking_Price'] = cleanItem(
            row.find('dd', class_='listing-overview-item--asking-price').text.strip())
        item['Cash_Flow'] = cleanItem(
            row.find('dd', class_='listing-overview-item--yearly-profit').text.strip())
        item['Scraped_At'] = datetime.now()
        item['Gross_Revenue'] = cleanItem(
            row.find('dd', class_='listing-overview-item--yearly-revenue').text.strip())
        # Category is encoded as the leading segment of the title ("Cat - ...").
        item['Category'] = item['Listing_Title'].split(' -')[0].strip()
        if item['Asking_Price'] and item['Cash_Flow'] and item['Cash_Flow'] > 0:
            item['Multiple'] = round(item['Asking_Price'] / item['Cash_Flow'], 3)
        yield item
def parse_detail(self, response):
    """Complete the item from response.meta with data from its detail page.

    Parses the left/right label table ('sb-table'), computes the
    asking-price / cash-flow multiple, and yields the finished item.
    """
    item = response.meta['item']
    soup = BeautifulSoup(response.body, features="html.parser")
    info = {}
    for row in soup.find('div', class_='sb-table').findAll('div', class_='line'):
        info[row.find('div', class_='left').text.strip()] = row.find(
            'div', class_='right').text.strip()
    # Guard the lookup: the original indexed info['Asking Price'] directly,
    # which raised KeyError on pages missing that row.
    raw_price = info.get('Asking Price')
    item['Asking_Price'] = cleanItem(raw_price) if raw_price is not None else None
    if 'Cash Flow' in info:
        item['Cash_Flow'] = cleanItem(info['Cash Flow'])
    else:
        item['Cash_Flow'] = None
    if item['Asking_Price'] and item['Cash_Flow'] and item['Cash_Flow'] > 0:
        item['Multiple'] = round(item['Asking_Price'] / item['Cash_Flow'], 3)
    else:
        item['Multiple'] = ''
    # Default every expected key so the direct lookups below cannot KeyError.
    for key in self.keys:
        if key not in info:
            info[key] = None
    item['Gross_Revenue'] = cleanItem(info['Gross Income'])
    item['Year_Established'] = info['Year Established']
    item['Employee_Count'] = info['Employees']
    item['Listing_Description'] = soup.find('div', class_='cfx').text.strip()
    yield item
def parse_list(self, response):
    """Walk the listing rows and queue a detail-page request per listing."""
    soup = BeautifulSoup(response.body, features="html.parser")
    for row in soup.findAll('div', class_='listing-row'):
        item_link = row.find('h2')
        item = BizscraperItem()
        item['Source'] = self.name
        item['Listing_Title'] = item_link.text.strip()
        item['Listing_URL'] = self.base_url + item_link.find('a').attrs['href']
        item['Category'] = row.find('div', class_='stats').find(
            'span', class_='bold').text.strip()
        # Plain domain sales are not business listings; skip them.
        if item['Category'] == 'Domain':
            continue
        # The price lives in one of two containers; take the first present.
        for pricing_class in ('domain-pricing', 'regular-pricing'):
            pricing_box = row.find('div', class_=pricing_class)
            if pricing_box:
                item['Asking_Price'] = cleanItem(
                    pricing_box.find('p').text.strip().replace('K', '000'))
                break
        yield scrapy.Request(url=item['Listing_URL'],
                             cookies=self.custom_cookie,
                             callback=self.parse_detail,
                             meta={'item': item},
                             method="GET")
def parse_list(self, response):
    """Scrape the ecommerce marketplace list page into annualized items."""
    soup = BeautifulSoup(response.body, features="html.parser")
    for row in soup.findAll('li', class_='_3B5De'):
        item_link = row.find('a', class_='_2OuU9')
        item = BizscraperItem()
        item['Source'] = self.name
        item['Listing_Title'] = item_link.text.strip()
        item['Listing_URL'] = self.base_url + item_link.attrs['href']
        item['Listing_Description'] = row.find(
            'div', class_='_3u-CT').find('p').text.strip()
        item['Scraped_At'] = datetime.now()
        # Stat cells hold monthly revenue, monthly profit and inventory,
        # in that order; values may be abbreviated with a trailing 'k'.
        stat_cells = row.findAll('div', class_='_3llXt')
        # NOTE(review): annualizing Inventory (*12) mirrors the revenue/
        # profit handling but looks like a copy-paste artifact — confirm.
        for idx, field in enumerate(('Gross_Revenue', 'Cash_Flow', 'Inventory')):
            raw_text = stat_cells[idx].text.strip()
            value = cleanItem(raw_text)
            if value:
                value = value * 12
                if 'k' in raw_text:
                    value = value * 1000
            item[field] = value
        item['Asking_Price'] = cleanItem(
            row.find('div', class_='_1Pa0r').find('div', class_='_1uCwB').text.strip())
        if item['Asking_Price'] and item['Cash_Flow'] and item['Cash_Flow'] > 0:
            item['Multiple'] = round(item['Asking_Price'] / item['Cash_Flow'], 3)
        item['Category'] = 'ecommerce'
        yield item
def parse_detail(self, response):
    """Fill revenue/profit, description and website from a detail page.

    Revenue/profit figures shown on the page are monthly and are
    annualized (*12) before being stored on the item.
    """
    item = response.meta['item']
    soup = BeautifulSoup(response.body, features="html.parser")
    item['Cash_Flow'] = None
    item['Gross_Revenue'] = None
    item['Multiple'] = None
    item['Scraped_At'] = datetime.now()
    premium = soup.find('div', class_='premium-blocks')
    if premium:
        for block in premium.findAll('div'):
            # findAll('div') also returns nested divs without an <h3>;
            # the original crashed with AttributeError on those.
            title_node = block.find('h3')
            if title_node is None:
                continue
            block_title = title_node.text
            if 'Monthly Revenue' in block_title:
                revenue = cleanItem(block.find('span').text.strip())
                item['Gross_Revenue'] = revenue * 12 if revenue is not None else None
            elif 'Monthly Profit' in block_title:
                profit = cleanItem(block.find('span').text.strip())
                item['Cash_Flow'] = profit * 12 if profit is not None else None
    if item['Asking_Price'] and item['Cash_Flow'] and item['Cash_Flow'] > 0:
        item['Multiple'] = round(item['Asking_Price'] / item['Cash_Flow'], 3)
    item['Listing_Description'] = ''
    containers = soup.findAll('div', class_='content-container')
    if containers:
        description_container = containers[0]
        paragraphs = description_container.findAll('p')
        if not paragraphs:
            # No <p> tags: use the raw container text minus the heading.
            description = description_container.text.strip().replace(
                'Listing Details', '').strip()
        else:
            description = ''
            for line in paragraphs:
                description = description + line.text + '\n'
        item['Listing_Description'] = description
    # Guard both the container and the 'title' attribute; either can be
    # absent and previously raised AttributeError / KeyError.
    link_pad = soup.find('div', class_='pad-3')
    if link_pad:
        for link in link_pad.findAll('a'):
            if 'Visit Website' in link.attrs.get('title', ''):
                item['Website'] = link.attrs['href']
    yield item
def parse(self, response):
    """Scrape list rows and dispatch one detail-page request per listing.

    Cash flow on the list page is monthly, so it is annualized (*12).
    """
    soup = BeautifulSoup(response.body, features="html.parser")
    # The original reused the name `item` for both the DOM row and the
    # scrapy item; keep the two distinct.
    for listing in soup.findAll('div', class_='listing-item'):
        wrapper = listing.find('div', class_='row')
        cols = wrapper.findAll('div', class_='col')
        item = BizscraperItem()
        item['Source'] = self.name
        item['Listing_Title'] = cols[2].find('h5').text.strip()
        item['Listing_URL'] = cols[2].find('a').attrs['href']
        item['Asking_Price'] = cleanItem(cols[4].text.strip())
        # cleanItem can return None; the original multiplied and divided
        # unconditionally (TypeError / ZeroDivisionError). Guard like the
        # sibling parse methods do.
        cash_flow = cleanItem(cols[5].text.strip())
        item['Cash_Flow'] = cash_flow * 12 if cash_flow is not None else None
        if item['Asking_Price'] and item['Cash_Flow'] and item['Cash_Flow'] > 0:
            item['Multiple'] = round(item['Asking_Price'] / item['Cash_Flow'], 3)
        else:
            item['Multiple'] = None
        item['Scraped_At'] = datetime.now()
        item['Category'] = cols[3].text.strip()
        yield scrapy.Request(url=item['Listing_URL'],
                             callback=self.parse_detail,
                             meta={'item': item},
                             method="GET")
def parse_detail(self, response):
    """Add annualized gross revenue and a description from the detail page."""
    item = response.meta['item']
    soup = BeautifulSoup(response.body, features="html.parser")
    summary = soup.find('div', class_='sites-summary_left')
    if summary:
        # Third <li> of the stats list carries "Monthly Revenue <amount>".
        raw_revenue = summary.find('ul').findAll('li')[2].text.strip().replace(
            'Monthly Revenue', '')
        revenue = cleanItem(raw_revenue)
        # cleanItem may return None for unparsable text; the original
        # multiplied unconditionally and raised TypeError.
        item['Gross_Revenue'] = revenue * 12 if revenue is not None else None
        # Description is the summary text minus the stats list text.
        item['Listing_Description'] = summary.text.replace(
            summary.find('ul').text, '')
    yield item
def parse_detail(self, response):
    """Build a complete item from a business-for-sale detail page."""
    soup = BeautifulSoup(response.body, features="html.parser")
    if soup.find('h1', class_='bfsTitle'):
        # Collect the headline spec blocks into a label -> value map,
        # pre-seeded with None for every expected key.
        info = {}
        for key in self.initial_keys:
            info[key] = None
        for spec_block in soup.findAll('div', class_='specs'):
            for piece in spec_block.findAll('p'):
                label = piece.find('span', class_='title').text.strip().replace(':', '')
                info[label] = piece.find('b').text.strip()
        # Pair each <dt> label with the <dd> value that follows it.
        detailed_info = soup.find('dl', class_='listingProfile_details')
        detailed_info_list = {}
        for key in self.detail_keys:
            detailed_info_list[key] = None
        pending = True
        label = ''
        if detailed_info:
            for row in detailed_info.findAll():
                if '<dt>' in str(row):
                    pending = True
                    label = row.text
                if '<dd>' in str(row) and pending:
                    pending = False
                    detailed_info_list[label.replace(':', '')] = row.text
        item = BizscraperItem()
        item['Source'] = self.name
        item['Listing_URL'] = response.meta['detail_url']
        item['Listing_Title'] = soup.find('h1', class_='bfsTitle').text.strip()
        item['Listing_Description'] = soup.find(
            'div', class_='businessDescription').text.strip()
        item['Seller_Financing'] = bool(soup.find('div', {'id': 'seller-financing'}))
        item['Asking_Price'] = cleanItem(info['Asking Price'])
        item['Cash_Flow'] = cleanItem(info['Cash Flow'])
        if item['Asking_Price'] and item['Cash_Flow'] and item['Cash_Flow'] > 0:
            item['Multiple'] = round(item['Asking_Price'] / item['Cash_Flow'], 3)
        item['Gross_Revenue'] = cleanItem(info['Gross Revenue'])
        item['EBITDA'] = cleanItem(info['EBITDA'])
        item['FF_E'] = cleanItem(info['FF&E'])
        item['Inventory'] = cleanItem(info['Inventory'])
        item['Year_Established'] = info['Established']
        item['Employee_Count'] = detailed_info_list['Employees']
        item['Website'] = detailed_info_list['Business Website']
        item['Scraped_At'] = datetime.now()
        if soup.find('div', {'id': 'others'}):
            others = soup.find('div', {'id': 'others'}).findAll('a')
            if len(others) > 0:
                item['Category'] = others[0].text.replace('for Sale', '').strip()
        location = response.meta['location']
        if location and location != '':
            parts = location.split(',')
            if len(parts) == 2:
                item['Location_County'] = parts[0].strip()
                item['Location_State'] = parts[1].strip()
            else:
                item['Location_State'] = parts[0].strip()
        yield item
def parse_detail(self, response):
    """Finish the item carried in response.meta using its detail page.

    Parses the <b>-tag summary column (label tags followed by value
    tags) and the first horizontal <dl> of label/value details, then
    fills the remaining item fields.
    """
    soup = BeautifulSoup(response.body, features="html.parser")
    # --- summary column: <b> tags alternate between labels and values ---
    initial_info = {}
    initial_wrapper = soup.find('div', class_='col-md-3')
    initial_key = ''
    for key in self.initial_keys:
        initial_info[key] = None
    for node in initial_wrapper.findAll('b'):
        is_value = True
        for key in self.initial_keys:
            if key in node.text:
                # This <b> is a label; remember it for the next value tag.
                initial_key = key
                is_value = False
                break
        if is_value:
            initial_info[initial_key] = node.text.strip().replace(
                'included in asking price', ' included in asking price')
    initial_info['Asking Price'] = cleanItem(initial_info['Asking Price'])
    initial_info['Cash Flow'] = cleanItem(initial_info['Cash Flow'])
    if initial_info['Asking Price'] and initial_info[
            'Cash Flow'] and initial_info['Cash Flow'] > 0:
        initial_info['Multiple'] = round(
            initial_info['Asking Price'] / initial_info['Cash Flow'], 3)
    # --- detail list: pair each <dt> label with the following <dd> ---
    detail_info = {}
    detail_wrapper = soup.findAll('dl', class_='dl-horizontal')[0]
    for key in self.detail_keys:
        detail_info[key] = None
    # Reset the pairing state explicitly: the original leaked `flag` and
    # `key` from the loops above, which could store a leading stray <dd>
    # under the wrong label.
    flag = False
    key = ''
    for row in detail_wrapper.findAll():
        if '<dt>' in str(row):
            flag = True
            key = row.text.strip()
        elif '<dd>' in str(row) and flag:
            flag = False
            detail_info[key.replace(':', '')] = row.text.strip()
    item = response.meta['item']
    item['Seller_Financing'] = False
    financing = soup.find('div', class_='financing')
    if financing and 'Seller Financing' in financing.text:
        item['Seller_Financing'] = True
    breadcrumbs = soup.find('ol', {'id': 'crumbs'}).findAll('li')
    item['Category'] = None
    if len(breadcrumbs) > 2:
        item['Category'] = breadcrumbs[2].text.replace(
            'Businesses for Sale', '').strip()
    item['Asking_Price'] = initial_info['Asking Price']
    item['Cash_Flow'] = initial_info['Cash Flow']
    item['Gross_Revenue'] = cleanItem(initial_info['Gross Revenue'])
    item['EBITDA'] = cleanItem(initial_info['EBITDA'])
    item['FF_E'] = cleanItem(initial_info['FF&E'])
    item['Inventory'] = cleanItem(initial_info['Inventory'])
    # .get() avoids a KeyError when the multiple could not be computed
    # and 'Multiple' is not one of self.initial_keys.
    item['Multiple'] = initial_info.get('Multiple')
    item['Year_Established'] = detail_info['Year Established']
    item['Employee_Count'] = detail_info['Number of Employees']
    item['Website'] = detail_info['Website']
    if detail_info['Location']:
        parts = detail_info['Location'].split(',')
        if len(parts) == 2:
            item['Location_County'] = parts[0].strip()
            item['Location_State'] = parts[1].strip()
        else:
            item['Location_State'] = parts[0].strip()
    yield item