def scrape_listings(self, url):
    # next page url from 'Next' pagination tag
    ip = self.__proxies[self.__proxy_id % self.__pr_len]
    with self.__mutex:
        self.__proxy_id += 1
    if self.__proxy_id > 1000000:
        with self.__mutex:
            self.__proxy_id = 0

    try:
        bs = load_page_via_proxies(url, self.html_parser, ip)
        # bs = load_page(url, self.html_parser)
    except URLError:
        self.logger.critical('Timeout error while scraping listings from %s', url)
        return

    pages_urls = [url]
    listings_count = int(bs.find('span', {'class': 'flex-grow text-right text-lighter pr-2'}).text.split('of')[1].strip())
    pages_count = int(math.ceil(listings_count / 24))  # there are 24 listings on every page
    for i in range(2, pages_count + 1):
        pages_urls.append(url + '/{}/'.format(i))

    return self.scrape_listings_via_queries(pages_urls)
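
# The round-robin proxy selection above is repeated inline in every method. Below is a
# minimal sketch of how it could be factored into one thread-safe helper; the class name
# ProxyRotator and its standalone form are illustrative assumptions, not part of the
# original scrapers (which keep these fields on self).
import threading


class ProxyRotator:
    """Hands out proxies in round-robin order; safe to call from multiple threads."""

    def __init__(self, proxies):
        self.__proxies = proxies
        self.__pr_len = len(proxies)
        self.__proxy_id = 0
        self.__mutex = threading.Lock()

    def next_proxy(self):
        with self.__mutex:
            proxy = self.__proxies[self.__proxy_id % self.__pr_len]
            self.__proxy_id += 1
            if self.__proxy_id > 1000000:
                # keep the counter bounded, mirroring the reset in the methods above
                self.__proxy_id = 0
        return proxy
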
def scrape_listings_from_page(self, url):
    # next page url from 'Next' pagination tag
    ip = self.__proxies[self.__proxy_id % self.__pr_len]
    with self.__mutex:
        self.__proxy_id += 1
    if self.__proxy_id > 1000000:
        with self.__mutex:
            self.__proxy_id = 0

    try:
        bs = load_page_via_proxies(url, self.html_parser, ip)
    except:
        self.logger.error('Error while scraping listings from %s', url)
        return

    try:
        listings = bs.find('div', {'class': 'row equal-height'}).find_all('a')
    except AttributeError:
        self.logger.critical('Error while scraping listings from %s', url)
        return

    listings_urls = []
    for i in listings:
        listings_urls.append(self.urls[0] + i['href'])

    return listings_urls
def scrape_listings(self, url):
    # next page url from 'Next' pagination tag
    try:
        ip = self.__proxies[self.__proxy_id % self.__pr_len]
        with self.mutex:
            self.__proxy_id += 1
        if self.__proxy_id > 1000000:
            with self.mutex:
                self.__proxy_id = 0
        bs = load_page_via_proxies(url.split('&')[0], self.html_parser, ip)
        # bs = load_page(url.split('&')[0], self.html_parser)
    except URLError:
        self.logger.critical('Timeout error while scraping listings from %s', url)
        return
    except:
        self.logger.error(traceback.format_exc())
        return

    paging = bs.find('a', {'class': 'next'}, href=True)
    max_url_id = None
    try:
        max_url_id = int(paging.find_previous_sibling().text)
    except:
        pass

    if max_url_id:
        # uncomment for single page debug
        # return self.scrape_listings_via_pagin_next(url)
        url_query = self.urls[0] + '?page='
        # extend the range from [1, max_url_id) to [1, max_url_id]
        max_url_id += 1
        pages_urls = []
        for num in range(1, max_url_id):
            pages_urls.append(url_query + str(num))
        return self.scrape_listings_via_queries(pages_urls)
    else:
        return self.scrape_listings_via_pagin_next(url)
def scrape_listings_from_page(self, url):
    # next page url from 'Next' pagination tag
    try:
        ip = self.__proxies[self.__proxy_id % self.__pr_len]
        with self.mutex:
            self.__proxy_id += 1
        if self.__proxy_id > 1000000:
            with self.mutex:
                self.__proxy_id = 0
        # bs = load_page(url.split('&')[0], self.html_parser)
        bs = load_page_via_proxies(url.split('&')[0], self.html_parser, ip)
    except:
        self.logger.error(traceback.format_exc())
        return

    listings_tags = bs.find_all('a', {'class': 'image'}, href=True)
    listings_urls = []
    if listings_tags:
        for listing_tag in listings_tags:
            listings_urls.append(urljoin(self.domain, listing_tag['href']))

    return listings_urls
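
# load_page_via_proxies is called throughout but not defined in this section. Below is a
# minimal sketch of what such a helper might look like, assuming urllib (the URLError
# handling above suggests it) and BeautifulSoup; the real project helper may differ, and
# the name load_page_via_proxies_sketch is used here to avoid clashing with it.
from urllib.request import ProxyHandler, build_opener

from bs4 import BeautifulSoup


def load_page_via_proxies_sketch(url, html_parser, proxy, timeout=15):
    """Fetch url through the given proxy (e.g. 'http://host:port') and parse it."""
    opener = build_opener(ProxyHandler({'http': proxy, 'https': proxy}))
    with opener.open(url, timeout=timeout) as response:
        return BeautifulSoup(response.read(), html_parser)
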
def scrape_profile(self, url):
    data = DataKeys.initialize()
    data[DataKeys.PROFILE_URL] = url
    data[DataKeys.SOURCE] = SOURCES.TRACKICO

    ip = self.__proxies[self.__proxy_id % self.__pr_len]
    with self.__mutex:
        self.__proxy_id += 1
    if self.__proxy_id > 1000000:
        with self.__mutex:
            self.__proxy_id = 0

    try:
        # bs = load_page(url, self.html_parser)
        bs = load_page_via_proxies(url, self.html_parser, proxy=ip)
    except:
        self.logger.warning('Could not scrape profile {}'.format(url))
        return

    # ICO NAME
    try:
        data[DataKeys.NAME] = bs.select_one('h1.h2').text.strip()
    except AttributeError:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'ICO name'))

    # ICO Logo
    try:
        logo = bs.select_one('div.img-thumbnail.align-self-center.m-2').find('img')['src']
        if 'data:image' not in logo:
            data[DataKeys.LOGO_PATH] = load_image(urljoin(self.domain, logo), ScraperBase.logo_tmp_path)
        else:
            data[DataKeys.LOGO_PATH] = load_image(logo, ScraperBase.logo_tmp_path)
    except AttributeError:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'ICO logo'))
    except Exception as e:
        self.logger.error('could not download {} logo with: {}'.format(url, str(e)))

    # ICO description
    try:
        data[DataKeys.DESCRIPTION] = bs.select_one('div.fs-14').text.strip()
    except AttributeError:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'ICO description'))

    # pre-ICO dates
    try:
        pre_ico_dates = bs.find('th', text='Pre-Sale').findNextSibling('td').text.strip()
        data[DataKeys.PRE_ICO_START] = pre_ico_dates.split('-')[0].strip().split()[-1]
        data[DataKeys.PRE_ICO_END] = pre_ico_dates.split('-')[1].strip().split()[-1]
    except (AttributeError, IndexError):
        self.logger.debug(self.NOT_FOUND_MSG.format(url, 'Pre ICO dates'))

    # ICO dates
    try:
        ico_dates = bs.find('th', text='Token Sale').findNextSibling('td').text.strip()
        data[DataKeys.ICO_START] = ico_dates.split('-')[0].strip().split()[-1]
        data[DataKeys.ICO_END] = ico_dates.split('-')[1].strip().split()[-1]
    except (AttributeError, IndexError):
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'ICO dates'))

    try:
        data[DataKeys.COUNTRY] = bs.find('th', text='Country').findNextSibling('td').find('a').text.strip()
    except AttributeError:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'ICO country'))

    try:
        data[DataKeys.PLATFORM] = bs.find('th', text='Platform').findNextSibling('td').find('a').text.strip()
    except AttributeError:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'ICO platform'))

    try:
        data[DataKeys.TOKEN_NAME] = bs.find('th', text='Token').findNextSibling('td').text.strip()
    except AttributeError:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'ICO token name'))

    try:
        data[DataKeys.OVERALL_SCORE] = bs.select_one('div.fs-60.fw-400.text-primary').text.strip()
    except AttributeError:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'ICO overall rating'))

    # getting social pages
    # TODO: maybe it will be necessary to add other community types
    map_ = {
        'bitcointalk': DataKeys.BITCOINTALK_URL,
        'twitter': DataKeys.TWITTER_URL,
        'facebook': DataKeys.FACEBOOK_URL,
        'telegram': DataKeys.TELEGRAM_URL,
        'github': DataKeys.GITHUB_URL,
        'reddit': DataKeys.REDDIT_URL,
        'linkedin': DataKeys.LINKEDIN_URL,
        'homepage': DataKeys.WEBSITE,
        'whitepaper': DataKeys.WHITEPAPER,
        'slack': DataKeys.SLACK_URL,
        'blog': DataKeys.MEDIUM_URL,
        'youtube': DataKeys.YOUTUBE_URL,
        'instagram': DataKeys.INSTAGRAM_URL
    }

    social_pages_div = bs.select_one('div.flexbox.flex-wrap')
    if social_pages_div:
        social_pages_ = social_pages_div.find_all('a')
        for page_ in social_pages_:
            if page_.has_attr('onclick'):
                candidate_spl = page_['onclick'].split('link-')
                if len(candidate_spl) <= 1:
                    candidate_spl = page_['onclick'].split('button-')
                if len(candidate_spl) > 1:
                    cand = candidate_spl[1]
                    soc_ = re.sub(r'[^\w]', '', cand).lower()
                    if soc_ in map_:
                        value_ = page_['href'].strip()
                        key_ = map_[soc_]
                        data[key_] = value_
    else:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'Social pages div'))

    # try:
    #     social_pages = bs.find('div', {'class': 'card card-body text-center'}).find_all('a')
    #
    #     for page in social_pages:
    #         try:
    #             soc = re.sub(r'[^\w]', '', page['onclick'].split('button-')[1]).lower()
    #         except (AttributeError, IndexError, KeyError):
    #             continue
    #
    #         if soc in map_:
    #             try:
    #                 key = map_[soc]
    #             except KeyError:
    #                 continue
    #
    #             try:
    #                 value = page['href'].strip()
    #                 data[key] = value
    #             except AttributeError:
    #                 self.logger.warning('No url for {} social page'.format(key))
    #         else:
    #             continue
    # except:
    #     self.logger.warning(self.NOT_FOUND_MSG.format(url, 'Social pages'))

    TrackIco.process(data)
    return data
def scrape_profile(self, url):
    data = DataKeys.initialize()
    data[DataKeys.PROFILE_URL] = url
    data[DataKeys.SOURCE] = SOURCES.ICOBENCH

    try:
        ip = self.__proxies[self.__proxy_id % self.__pr_len]
        with self.mutex:
            self.__proxy_id += 1
        if self.__proxy_id > 1000000:
            with self.mutex:
                self.__proxy_id = 0
        bs = load_page_via_proxies(url, self.html_parser, ip)
        # bs = load_page(url, self.html_parser)
    except:
        print(traceback.format_exc())
        self.logger.error('Error while scraping profile {}'.format(url))
        return

    try:
        description_tag = bs.find('div', {'class': 'name'})
        name_and_description = description_tag.findChildren(re.compile(r'h\d'))
        data[DataKeys.NAME] = name_and_description[0].text.strip()
        # data[DataKeys.DESCRIPTION] = name_and_description[1].text.strip()
    except:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'Name and/or Description'))

    try:
        data[DataKeys.DESCRIPTION] = bs.find('div', {'class': 'name'}).parent.find_next_sibling('p').text.strip()
    except:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'Name and/or Description'))

    ######################### Score Fields #########################
    score_divs = bs.find('div', {'class': 'rating'}).find('div', {'class': 'distribution'}).findAll('div')
    # todo decide what to do in case of DATA_KEYS
    data_mapping = {
        'ICO PROFILE': DataKeys.ICO_PROFILE_SCORE,
        'VISION': DataKeys.VISION_SCORE,
        'TEAM': DataKeys.TEAM_SCORE,
        'PRODUCT': DataKeys.PRODUCT_SCORE
    }
    for div in score_divs:
        label = str(div.find('label').text).strip()
        key = data_mapping.get(label.upper())
        try:
            data[key] = str(div.contents[0]).strip()
        except AttributeError:
            data[key] = BOOL_VALUES.NOT_AVAILABLE

    rate_div = bs.find('div', {'itemprop': 'ratingValue'})
    if rate_div:
        data[DataKeys.OVERALL_SCORE] = str(rate_div['content'])
    else:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'Experts score'))
    ###############################################################

    financial_divs = bs.find('div', {'class': 'financial_data'})
    if financial_divs:
        ############ date info ##############
        date_err = None
        try:
            # get label of date (TIME if available, otherwise STATUS, which can be UNKNOWN or ENDED)
            date_label = financial_divs.find('label', text='Time')
            pre = False
            if not date_label:
                date_label = financial_divs.find('label', text=re.compile('PreICO time', re.IGNORECASE))
                if date_label:
                    pre = True
                else:
                    date_label = financial_divs.find('label', text=re.compile('STATUS', re.IGNORECASE))
            date_number = date_label.find_next_sibling('div', {'class': 'number'}, text=True)
            div_text = date_number.text.strip()
            if div_text.upper() == 'ENDED':
                data[DataKeys.STATUS] = ICO_STATUS.ENDED
            elif div_text.upper() == 'UNKNOWN':
                pass
            else:
                date_info = re.findall(r'\d{4}[\-.]\d{2}[\-.]\d{2}', date_number.find_next_sibling().text)
                if date_info:
                    if pre:
                        data[DataKeys.PRE_ICO_START] = date_info[0]
                        data[DataKeys.PRE_ICO_END] = date_info[1]
                    else:
                        data[DataKeys.ICO_START] = date_info[0]
                        data[DataKeys.ICO_END] = date_info[1]
        except Exception as e:
            date_err = self.NOT_FOUND_MSG.format(url, 'Date Info') + ' with message: {}'.format(str(e))
        ############## end of date info #################

        #################### Overall information #####################
        financial_divs_ = financial_divs.findAll('div', {'class': 'data_row'})
        if financial_divs_:
            financial_info_keys = {
                'TOKEN': DataKeys.TOKEN_NAME,
                'PREICO PRICE': DataKeys.PRE_ICO_PRICE,
                'PRICE': DataKeys.ICO_PRICE,
                'PRICE IN ICO': DataKeys.ICO_PRICE,
                'PLATFORM': DataKeys.PLATFORM,
                'ACCEPTING': DataKeys.ACCEPTED_CURRENCIES,
                'SOFT CAP': DataKeys.SOFT_CAP,
                'HARD CAP': DataKeys.HARD_CAP,
                'COUNTRY': DataKeys.COUNTRY,
                'RESTRICTED AREAS': DataKeys.COUNTRIES_RESTRICTED
            }
            for financial_div in financial_divs_:
                try:
                    info_ = financial_div.findAll('div')
                    key = info_[0].text.strip().upper()
                    # drop 'st'/'nd'/'rd'/'th' from the date text (intended to remove ordinal suffixes)
                    if key == 'ICO START':
                        data[DataKeys.ICO_START] = info_[1].text.strip().replace('st', '').replace('nd', '').replace('th', '').replace('rd', '')
                        date_err = None
                    elif key == 'ICO END':
                        data[DataKeys.ICO_END] = info_[1].text.strip().replace('st', '').replace('nd', '').replace('th', '').replace('rd', '')
                        date_err = None
                    elif key == 'PREICO END':
                        data[DataKeys.PRE_ICO_END] = info_[1].text.strip().replace('st', '').replace('nd', '').replace('th', '').replace('rd', '')
                        date_err = None
                    elif key == 'PREICO START':
                        data[DataKeys.PRE_ICO_START] = info_[1].text.strip().replace('st', '').replace('nd', '').replace('th', '').replace('rd', '')
                        date_err = None

                    # KYC and whitelist come in one field,
                    # so this case is not handled like the other ones
                    if key == 'WHITELIST/KYC':
                        text = info_[1].text.upper()
                        data[DataKeys.KYC] = BOOL_VALUES.YES if 'KYC' in text else BOOL_VALUES.NO
                        data[DataKeys.WHITELIST] = BOOL_VALUES.YES if 'WHITELIST' in text else BOOL_VALUES.NO
                        continue

                    if key in financial_info_keys:
                        text = info_[1].text.strip()
                        if text:
                            data[financial_info_keys[key]] = text
                except:
                    pass

            if date_err:
                self.logger.warning(date_err)
        else:
            self.logger.warning(self.NOT_FOUND_MSG.format(url, 'financial data 2'))
    else:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'financial data'))

    # get links
    try:
        soc_mapping = {
            'FACEBOOK': DataKeys.FACEBOOK_URL,
            'GITHUB': DataKeys.GITHUB_URL,
            'MEDIUM': DataKeys.MEDIUM_URL,
            'TELEGRAM': DataKeys.TELEGRAM_URL,
            'REDDIT': DataKeys.REDDIT_URL,
            'BITCOINTALK': DataKeys.BITCOINTALK_URL,
            'WWW': DataKeys.WEBSITE,
            'LINKEDIN': DataKeys.LINKEDIN_URL,
            'TWITTER': DataKeys.TWITTER_URL
        }
        link_tags = bs.find('div', {'class': 'socials'}).findAll('a')
        for link_tag in link_tags:
            if link_tag.has_attr('title') and link_tag.has_attr('href'):
                soc = link_tag.text.strip()
                if soc.upper() in soc_mapping:
                    data[soc_mapping[soc.upper()]] = link_tag['href']
    except AttributeError:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'Social links'))

    try:
        logo_link = bs.find('div', {'class': 'image'}).find('img')
        data[DataKeys.LOGO_PATH] = load_image(urljoin(self.domain, logo_link['src']), ScraperBase.logo_tmp_path)
    except (AttributeError, KeyError):
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'Logo url'))
    except Exception as e:
        self.logger.error('could not download {} logo with: {}'.format(url, str(e)))

    IcoBench.process(data)
    return data
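
# Note on the date cleanup above: the chained .replace('st', '')... calls also strip those
# letter pairs from month names (e.g. 'August' becomes 'Augu'). Below is a sketch of a
# stricter alternative that only removes suffixes directly following a digit; this is a
# suggested variant, not the original project's code.
import re


def strip_ordinal_suffixes(date_text):
    """Turn strings like '15th August 2018' into '15 August 2018'."""
    return re.sub(r'(?<=\d)(st|nd|rd|th)\b', '', date_text)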