def scrape_listings_via_pagin_next(self, url, page_num=None):
    """Collect listing URLs from *url*, following the 'Next' pagination link recursively.

    :param url: listings page to scrape (query string after '&' is dropped)
    :param page_num: internal recursion counter; callers pass None
    :return: list of absolute listing URLs (empty list when the page cannot be loaded)
    """
    try:
        bs = load_page(url.split('&')[0], self.html_parser)
    except URLError:
        self.logger.error('Timeout error while scraping listings from %s', url)
        # Return an empty list instead of None: the recursive caller
        # concatenates the result with '+=' and would crash on None.
        return []

    # next page url from 'Next' pagination tag
    paging = bs.find('a', {'class': 'next'}, href=True)
    next_page_url = None if not paging else urljoin(self.domain, paging['href'])

    listing_urls = []
    listings = bs.find_all('a', {'class': 'image'}, href=True)
    if listings:
        for profile in listings:
            listing_urls.append(urljoin(self.domain, profile['href']))

    # Recurse only while a next page exists AND differs from the current one;
    # when the 'next' link points back at the current page, pagination ended.
    # (The original condition 'next_page_url or next_page_url == url'
    # contradicted that intent.)
    if next_page_url and next_page_url != url:
        page_num = 1 if page_num is None else page_num + 1
        sys.stdout.write('\r[Scraping listing urls: {}]'.format(page_num))
        sys.stdout.flush()
        # NOTE(review): this caps the crawl at 2 pages — looks like a leftover
        # debug limit; confirm before raising it.
        if page_num < 2:
            listing_urls += self.scrape_listings_via_pagin_next(
                next_page_url, page_num)

    sys.stdout.write('\r')
    return listing_urls
def scrape_listings(url, rec=True):
    """Walk a subreddit listing, following 'next' buttons until exhausted.

    :param url: subreddit listing URL; must contain '/r/'
    :param rec: when True, retry the page load once before giving up
    :return: tuple (post_count, comment_count, user_list); zeros/empty when
             *url* is not a subreddit link or the page cannot be loaded
    """
    post_count = 0
    user_list = []
    comment_count = 0
    if '/r/' not in url:
        return post_count, comment_count, user_list
    while True:
        try:
            bs = load_page(url, Reddit.html_parser)
        except Exception as e:
            if rec:
                # one retry before logging the failure
                return Reddit.scrape_listings(url, rec=False)
            logging.error(
                'Unable to scrap profile for {}, after retrying 2 time, the reason: {}'
                .format(url, str(e)))
            break
        try:
            posts = bs.find_all('div', {'class': 'top-matter'})
            post_count += len(posts)
            for post in posts:
                # comment count is the first number in the 'N comments' link
                # (raw string: '\d' is an invalid escape in a plain literal)
                comment = re.findall(
                    r'\d+',
                    post.find('li', {'class': 'first'}).find('a').text)
                if len(comment) > 0:
                    comment_count += int(comment[0])
                user_name = post.find('p', {'class': 'tagline'}).find('a').text
                if user_name not in user_list:
                    user_list.append(user_name)
            # advance to the next page; AttributeError means no next button
            url = bs.find('span', {'class': 'next-button'}).find('a')['href']
        except AttributeError:
            break
    return post_count, comment_count, user_list
def __scrape_listings(url):
    """Build a shuffled list of paginated bitcointalk topic URLs for *url*.

    Every topic page holds 20 posts; page URLs are '<topic>.0', '<topic>.20', ...
    up to the highest offset seen in the page's navigation links.

    :param url: bitcointalk topic URL
    :return: shuffled list of page URLs, or None on load/parse failure
    """
    try:
        bs = load_page(url, __html_parser)
    except Exception:  # was a bare except; don't swallow SystemExit/KeyboardInterrupt
        __logger.warning('Could not load bitcointalk page')
        return
    try:
        # base of the topic url, without the '.<offset>' pagination suffix
        url_sample = re.match(r'.*topic=\d*', url).group(0)
    except AttributeError:  # re.match returned None: not a topic link
        __logger.warning('Found unknown bitcoinalk reference')
        return

    urls = [url]
    pagins = bs.findAll('a', {'class': 'navPages'})
    for p in pagins:
        if p.has_attr('href'):
            url = re.match(r'.*topic=\d*(.\d+)?', p['href']).group(0)
            urls.append(url)

    # highest pagination offset among the collected links
    last_pagin_num = 0
    for url in urls:
        try:
            n = int(url.split('.')[-1])
        except ValueError:
            continue
        if n > last_pagin_num:
            last_pagin_num = n

    i = 0
    urls_ = []
    # was 'while i != last_pagin_num + 20': that loops forever when the
    # offset is not a multiple of 20; '<' is identical in the normal case
    while i < last_pagin_num + 20:
        urls_.append('{}.{}'.format(url_sample, str(i)))
        i += 20
    return random.sample(urls_, len(urls_))
def scrape_listings(self, url):
    """Return outbound listing URLs from the upcoming-ICOs section of *url*.

    :param url: listings page URL
    :return: list of hrefs, or None when the page cannot be loaded
    """
    try:
        bs = load_page(url, self.html_parser)
    except Exception:  # was a bare except; narrow so KeyboardInterrupt escapes
        self.logger.critical('Error while scraping listings from %s', url)
        return
    # Every listing card links out with target="_blank".
    # (Removed a large block of commented-out selenium code that duplicated
    # this extraction with a live browser.)
    tags = bs.find('div', {
        'class': 'upcoming-sec__main'
    }).findAll('a', {'target': '_blank'})
    return [tag['href'] for tag in tags]
def scrape_profile(self, url):
    """Scrape one ICObazaar profile page (plus its '/community' tab) into a data dict.

    :param url: profile page URL
    :return: populated data dict, or None when a page cannot be loaded
    """
    data = DataKeys.initialize()
    data[DataKeys.PROFILE_URL] = url
    data[DataKeys.SOURCE] = SOURCES.ICOBAZAAR

    try:
        bs_ = load_page(url, self.__html_parser)
    except Exception:  # was a bare except
        self.logger.error('Could not scrape profile {}'.format(url))
        return

    # --- basic data ---
    try:
        data[DataKeys.NAME] = bs_.find('div', {
            'class': 'com-header__info'
        }).find('h1').text
    except AttributeError:
        self.logger.error(self.NOT_FOUND_MSG.format(url, 'ICO name'))

    try:
        data[DataKeys.DESCRIPTION] = bs_.find('div', {
            'class': 'com-header__info'
        }).find('p').text
    except AttributeError:
        self.logger.error(self.NOT_FOUND_MSG.format(url, 'ICO description'))

    try:
        logo_url = bs_.find('div', {
            'class': 'com-header__logo'
        }).img['src'].strip()
        data[DataKeys.LOGO_PATH] = load_image(logo_url,
                                              ScraperBase.logo_tmp_path)
    except (AttributeError, KeyError):
        self.logger.error(self.NOT_FOUND_MSG.format(url, 'ICO logo'))
    except Exception as e:
        self.logger.error('could not download {} logo with: {}'.format(
            url, str(e)))

    # Rating is sometimes missing on the first fetch; retry the page once.
    try:
        data[DataKeys.OVERALL_SCORE] = bs_.find(
            'div', {'class': 'ico-rating'})['rating']
    except Exception:  # was a bare except
        try:
            bs_ = load_page(url, self.__html_parser)
            data[DataKeys.OVERALL_SCORE] = bs_.find(
                'div', {'class': 'ico-rating'})['rating']
        except (AttributeError, KeyError):
            self.logger.error(self.NOT_FOUND_MSG.format(url, 'Rating'))

    # sidebar label -> data key
    map_ = {
        'start': DataKeys.ICO_START,
        'end': DataKeys.ICO_END,
        'cap': DataKeys.HARD_CAP,
        'goal': DataKeys.SOFT_CAP,
        'price': DataKeys.ICO_PRICE
    }
    try:
        for a in bs_.find_all('div', {'class': 'com-sidebar__info-line'}):
            try:
                # KeyError added to the handler: an unknown sidebar label used
                # to raise KeyError that nothing caught, crashing the scrape.
                key = map_[re.sub(':', '', a.find('span').text).strip().lower()]
                try:
                    value = a.find('span', {
                        'class': 'com-sidebar__info-value'
                    }).text.strip()
                    data[key] = value
                except AttributeError:
                    self.logger.error('No data for {} in sidebar'.format(key))
            except (AttributeError, KeyError):
                self.logger.error('Key {} does not exist in sidebar'.format(
                    re.sub(':', '', a.find('span').text.strip())))
    except AttributeError:
        self.logger.error(self.NOT_FOUND_MSG.format(url, 'Sidebar'))

    try:
        data[DataKeys.WEBSITE] = bs_.find('div', {
            'class': 'com-sidebar'
        }).find('a')['href']
    except AttributeError:
        self.logger.error(self.NOT_FOUND_MSG.format(url, 'ICO website'))

    # --- "community" tab of the listing ---
    try:
        bs__ = load_page(url + '/community', self.__html_parser)
    except Exception:
        # was 'except AttributeError': load_page failures (network errors)
        # were never caught here and crashed the scraper
        self.logger.error('Could not scrape community of profile {}'.format(url))
        return

    # team rating: percentage width of the 'team' progress bar
    try:
        rating_list = bs__.find('div', {
            'class': 'com-rating__list'
        }).find_all('div', {'class': 'com-rating__list-element'})
        for rate in rating_list:
            if rate.find('span').text.lower() == 'team':
                data[DataKeys.TEAM_SCORE] = re.findall(
                    r'\d{1,3}\%',
                    rate.find('div', {
                        'class': 'progress-bar'
                    }).find('span')['style'])[0].strip('%')
    except AttributeError:
        self.logger.error(self.NOT_FOUND_MSG.format(url, 'Team'))

    # social pages: the icon class encodes the network, e.g. 'fa-twitter'
    # TODO: maybe will be necessary to add other community types
    map_ = {
        'website': DataKeys.WEBSITE,
        'bitcointalk': DataKeys.BITCOINTALK_URL,
        'twitter': DataKeys.TWITTER_URL,
        'facebook': DataKeys.FACEBOOK_URL,
        'telegram': DataKeys.TELEGRAM_URL,
        'github': DataKeys.GITHUB_URL,
        'reddit': DataKeys.REDDIT_URL,
        'linkedin': DataKeys.LINKEDIN_URL,
        'slack': DataKeys.SLACK_URL
    }
    try:
        social_pages = bs__.find('div', {'class': 'com-social'}).find_all('a')
        for page in social_pages:
            try:
                key = page.find('i')['class'][1].split('-')[1].lower()
                if key in map_ and page.has_attr('href'):
                    data[map_[key]] = page['href'].strip()
            except AttributeError:
                self.logger.error(
                    'Unsupported Community type for scrapping --> {} '.format(
                        page.find('i')['class'][1].split('-')[1]))
    except AttributeError:
        self.logger.error(self.NOT_FOUND_MSG.format(url, 'Social pages'))

    IcoBazaar.process(data)
    return data
def scrape_profile(self, url):
    """Scrape one ICOmarks profile page into a data dict.

    :param url: profile page URL
    :return: populated data dict, or None when the page cannot be loaded
    """
    data = DataKeys.initialize()
    data[DataKeys.PROFILE_URL] = url
    data[DataKeys.SOURCE] = SOURCES.ICOMARKS

    try:
        bs = load_page(url, self.html_parser)
    except Exception:  # was a bare except
        self.logger.error('Could not extract {} page'.format(url))
        return

    # name
    try:
        data[DataKeys.NAME] = bs.find('h1', {'itemprop': 'name'}).text.strip()
    except Exception:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'ICO name'))

    # logo
    try:
        logo_path = bs.find('img', {'itemprop': 'url'})['src']
        data[DataKeys.LOGO_PATH] = load_image(
            urljoin(self.domain, logo_path), ScraperBase.logo_tmp_path)
    except (AttributeError, KeyError):
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'ICO logo'))
    except Exception as e:
        self.logger.error('could not download {} logo with: {}'.format(
            url, str(e)))

    # overall score
    try:
        data[DataKeys.OVERALL_SCORE] = bs.find(
            'div', {'class': 'ico-rating-overall'}).text.strip()
    except Exception:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'ICO score'))

    # per-category scores
    score_mapping = {
        'ICO PROFILE': DataKeys.ICO_PROFILE_SCORE,
        'TEAM & ADVISORS': DataKeys.TEAM_SCORE
    }
    try:
        ratings = bs.findAll('div', {'class': 'ico-rating__item'})
        for rating in ratings:
            title = rating.find('div', class_='ico-rating__title', text=True)
            key = None if not title else title.text.strip().upper()
            if key in score_mapping:
                value = rating.parent.find('div', class_='ico-rating__circle')
                data[score_mapping[key]] = value.text.strip()
    except Exception:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'ICO score'))

    # detail rows: 'LABEL: value' pairs inside the icoinfo block
    details_mapping = {
        'COUNTRY:': DataKeys.COUNTRY,
        'PRICE:': DataKeys.ICO_PRICE,
        'ACCEPTING:': DataKeys.ACCEPTED_CURRENCIES,
        'SOFT CAP:': DataKeys.SOFT_CAP,
        'HARD CAP:': DataKeys.HARD_CAP,
        'TICKER:': DataKeys.TOKEN_NAME,
        'PLATFORM:': DataKeys.PLATFORM,
        'TOKEN TYPE:': DataKeys.TOKEN_STANDARD
    }
    details_info = bs.select_one('div.icoinfo')
    try:
        desks = details_info.select('div.icoinfo-block__item')
        for detail in desks:
            title = detail.find('span', text=True)
            key = None if not title else title.text.strip().upper()
            if key in details_mapping:
                value = title.parent.text.split(':')[1].strip()
                data[details_mapping[key]] = value
    except Exception:
        self.logger.error(
            'Someting went wrong in {}, when scraping detail rows'.format(url))

    # pre ico time
    try:
        date = details_info.find('span', text='Pre-sale Time:')
        if date:
            value = date.parent.text.split(':')[1].upper()
            dates = value.split('-')
            data[DataKeys.PRE_ICO_START] = dates[0].strip()
            data[DataKeys.PRE_ICO_END] = dates[1].strip()
    except Exception:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'Pre Date info'))

    # ico time
    try:
        date = details_info.find('span', text='ICO Time:')
        if date:
            value = date.parent.text.split(':')[1].upper()
            dates = value.split('-')
            data[DataKeys.ICO_START] = dates[0].strip()
            data[DataKeys.ICO_END] = dates[1].strip()
    except Exception:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'Date info'))

    # website url (error label fixed: used to say 'Date info')
    try:
        title = details_info.find('span', text='Website:')
        if title:
            value = title.find_next_sibling('a')
            data[DataKeys.WEBSITE] = value['href']
    except Exception:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'Website'))

    # KYC / whitelist
    try:
        kyc_w = details_info.find('span', text='Whitelist/KYC:')
        # was "if kyc_w == 'WHITELIST/KYC':" — comparing a bs4 Tag to a
        # string is always False, so these keys were never recorded
        if kyc_w:
            text = kyc_w.parent.text.split(':')[1].upper()
            data[DataKeys.KYC] = (BOOL_VALUES.YES
                                  if 'KYC' in text else BOOL_VALUES.NO)
            data[DataKeys.WHITELIST] = (BOOL_VALUES.YES
                                        if 'WHITELIST' in text else BOOL_VALUES.NO)
    except Exception:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'KYC and whitelist'))

    # social links: match each href against known domains
    # (table replaces eight near-identical regex blocks, one of which was
    # a duplicated linkedin check)
    soc_patterns = [
        ('bitcointalk.org', DataKeys.BITCOINTALK_URL),
        ('facebook.com', DataKeys.FACEBOOK_URL),
        ('twitter.com', DataKeys.TWITTER_URL),
        ('t.me', DataKeys.TELEGRAM_URL),
        ('reddit.com', DataKeys.REDDIT_URL),
        ('github.com', DataKeys.GITHUB_URL),
        ('medium.com', DataKeys.MEDIUM_URL),
        ('linkedin.com', DataKeys.LINKEDIN_URL),
    ]
    try:
        soc_links = details_info.findAll('a', {'class': 'icoinfo-block__view'})
        for soc_link in soc_links:
            if not soc_link.has_attr('href'):
                continue
            for domain, data_key in soc_patterns:
                if re.match(r'^(https?(:\/\/)?(www)?.?)?{}\/.*'.format(domain),
                            soc_link['href']):
                    data[data_key] = soc_link['href']
                    break
    except Exception:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'Soc links'))

    # description
    try:
        # was {'class', 'company-description'} — a set, not a dict, so the
        # lookup always failed and the description was never captured
        data[DataKeys.DESCRIPTION] = bs.find(
            'div', {'class': 'company-description'}).text.strip()
    except Exception:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'Description'))

    IcoMarks.process(data)
    return data
def scrape_profile(self, url):
    """Scrape one TokenTops profile page into a data dict.

    :param url: profile page URL
    :return: populated data dict, or None when the page cannot be loaded
    """
    data = DataKeys.initialize()
    data[DataKeys.PROFILE_URL] = url
    data[DataKeys.SOURCE] = SOURCES.TOKENTOPS

    try:
        bs = load_page(url, self.html_parser)
    except Exception:  # was a bare except
        self.logger.error('Could not extract {} page'.format(url))
        return

    # name
    try:
        data[DataKeys.NAME] = bs.find('h1', {
            'class': 'page-details__title'
        }).text.strip()
    except AttributeError:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'ICO name'))

    # logo
    try:
        logo_path = bs.find('img', {'class': 'page-details__logo'})['src']
        data[DataKeys.LOGO_PATH] = load_image(
            urljoin(self.domain, logo_path), ScraperBase.logo_tmp_path)
    except AttributeError:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'ICO logo'))
    except Exception as e:
        self.logger.error('could not download {} logo with: {}'.format(
            url, str(e)))

    # overall score ('0' means unrated, so skip it)
    try:
        score = bs.find('div', {
            'class': 'rating_block'
        }).find('span', {'class': 'rating-text'}).text.strip()
        if score != '0':
            data[DataKeys.OVERALL_SCORE] = score
    except (AttributeError, ValueError):
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'Overall score'))

    # social links: anchor 'title' attribute names the network
    soc_mapping = {
        'Facebook': DataKeys.FACEBOOK_URL,
        'Github': DataKeys.GITHUB_URL,
        'Blog': DataKeys.MEDIUM_URL,
        'Telegram': DataKeys.TELEGRAM_URL,
        'Reddit': DataKeys.REDDIT_URL,
        'Bitcoin Talk': DataKeys.BITCOINTALK_URL,
        'Website': DataKeys.WEBSITE,
        'Linkedin': DataKeys.LINKEDIN_URL,
        'Twitter': DataKeys.TWITTER_URL
    }
    try:
        soc_tags = bs.find('div', {'class': 'page-details__main'})
        if soc_tags:
            # iterate the pairs directly instead of discarding the value and
            # re-looking it up with soc_mapping[key]
            for title, data_key in soc_mapping.items():
                target = soc_tags.find('a', {'title': title})
                if target and target.has_attr('href'):
                    data[data_key] = target['href']
    except Exception:
        self.logger.error(
            'Something went wrong in {}, when scraping social links'.format(url))

    # details
    details_mapping = {
        'START DATE': DataKeys.ICO_START,
        'CLOSE DATE': DataKeys.ICO_END,
        'TOKEN SYMBOL': DataKeys.TOKEN_NAME,
        'SMART CONTRACT BLOCKCHAIN': DataKeys.PLATFORM,
        'AMOUNT RAISED': DataKeys.RAISED
    }
    try:
        details = bs.findAll('div', {'class': 'page-details__info-row'})
        for detail in details:
            title = detail.find('h3', {'class': 'page-details__info-title'},
                                text=True)
            if title and title.text.strip().upper() in details_mapping:
                value = title.find_next_sibling(
                    'div', {'class': 'page-details__info-descr'}, text=True)
                if value:
                    data[details_mapping[
                        title.text.strip().upper()]] = value.text.strip()
    except Exception:
        self.logger.error(
            'Something went wrong in {}, when scraping detail rows'.format(url))

    # description
    try:
        div_tag = bs.find('div', {'class': 'show-more-wrap show-more--big2'})
        description_tag = div_tag.find('h2', text=True)
        if description_tag:
            data[DataKeys.DESCRIPTION] = description_tag.text.strip()
    except Exception:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'Description'))

    # user score: average of non-zero star percentages across review blocks
    try:
        review_sum = 0
        total_reviews = 0
        review_blocks = bs.findAll('div', {'id': 'section-review-block'})
        reviews = []
        for block in review_blocks:
            reviews += block.findAll('div', {'class': 'rat-stars'})
        for review in reviews:
            score = review.find('span')
            if score and score.has_attr('style'):
                try:
                    # star width in percent, e.g. 'width: 80%'
                    # (raw string: '\d' is an invalid escape in a plain literal)
                    value = int(
                        re.search(r'\d(\d{1,2})?', score['style']).group())
                    if value == 0:
                        continue
                    review_sum += value
                    total_reviews += 1
                except Exception:
                    self.logger.warning(
                        'Could not find score percentage from {}'.format(url))
        if total_reviews != 0 and review_sum != 0:
            data[DataKeys.USER_SCORE] = review_sum // total_reviews
    except Exception:
        # best-effort: user score is optional
        pass

    TokenTops.process(data)
    return data
def scrape_profile(self, url):
    """Scrape one ICORating profile page (plus its '/details' tab) into a data dict.

    :param url: profile page URL
    :return: populated data dict, or None when the profile page cannot be loaded
    """
    data = DataKeys.initialize()
    data[DataKeys.PROFILE_URL] = url
    data[DataKeys.SOURCE] = SOURCES.ICORATING

    try:
        bs = load_page(url, self.html_parser)
    except Exception:  # was a bare except
        self.logger.error('Could not scrape profile {}'.format(url))
        return

    try:
        text = bs.find('div', {'class': 'h1'}).find('h1').text
        # from "ICO NAME (ICN)" to "ICO NAME"
        data[DataKeys.NAME] = text.split('(')[0].strip()
    except Exception:
        self.logger.error(self.NOT_FOUND_MSG.format(url, 'ICO name'))

    try:
        ratings_tag = bs.findAll('span', {'class': 'title'}, text=True)
        for rating in ratings_tag:
            # RISK
            if rating.text.upper() == 'RISK SCORE':
                risk = rating.parent.find('span', {'class': 'score'},
                                          text=True)
                if risk:
                    risk_text = risk.text.split('/')
                    if risk_text and len(risk_text) == 2:
                        data[DataKeys.RISK_SCORE] = float(
                            risk_text[0].strip())
            # Hype
            if rating.text.upper() == 'HYPE SCORE':
                hype = rating.parent.find('span', {'class': 'score'},
                                          text=True)
                if hype:
                    hype_text = hype.text.split('/')
                    if hype_text and len(hype_text) == 2:
                        data[DataKeys.HYPE_SCORE] = float(
                            hype_text[0].strip())
            # Investment
            if rating.text.upper() == 'INVESTMENT RATING':
                inv = rating.parent.find('span', {'class': 'name'}, text=True)
                if inv:
                    value = inv.text.upper()
                    investment_ratings = {
                        'POSITIVE+': 8,
                        'POSITIVE': 7,
                        'STABLE+': 6,
                        'STABLE': 5,
                        'RISKY+': 4,
                        'RISKY': 3,
                        'RISKY-': 2,
                        'NEGATIVE': 1,
                        'NEGATIVE-': 0,
                        'NA': BOOL_VALUES.NOT_AVAILABLE
                    }
                    # .get(): an unknown label used to raise KeyError and
                    # abort the whole ratings loop; 'is not None' keeps the
                    # legitimate 0 score ('NEGATIVE-'), which the old
                    # truthiness check silently dropped. A distinct local
                    # also stops clobbering the loop variable 'rating'.
                    roi_score = investment_ratings.get(value)
                    if roi_score is not None:
                        data[DataKeys.ROI_SCORE] = roi_score
    except Exception:
        self.logger.warning('Exception while scraping {} from {}'.format(
            'rating info', url))

    # social links: anchor text names the network
    link_tags = bs.findAll('a', {'target': '_blank'}, text=False)
    soc_mapping = {
        'FACEBOOK': DataKeys.FACEBOOK_URL,
        'GITHUB': DataKeys.GITHUB_URL,
        'MEDIUM': DataKeys.MEDIUM_URL,
        'INSTAGRAM': DataKeys.INSTAGRAM_URL,
        'TELEGRAM': DataKeys.TELEGRAM_URL,
        'REDDIT': DataKeys.REDDIT_URL,
        'BTCTALK': DataKeys.BITCOINTALK_URL,
        'WEBSITE': DataKeys.WEBSITE,
        'LINKEDIN': DataKeys.LINKEDIN_URL,
        'TWITTER': DataKeys.TWITTER_URL
    }
    for link_tag in link_tags:
        try:
            key = soc_mapping[link_tag.text.strip().upper()]
            data[key] = link_tag['href']
        except (AttributeError, KeyError):
            continue

    # logo link
    try:
        data[DataKeys.LOGO_PATH] = load_image(
            urljoin(
                self.domain,
                bs.find('div', {
                    'class': 'share'
                }).find_previous_sibling('img')['src']),
            ScraperBase.logo_tmp_path)
    except (AttributeError, KeyError):
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'logo url'))
    except Exception as e:
        self.logger.error('could not download {} logo with: {}'.format(
            url, str(e)))

    # description
    try:
        data[DataKeys.DESCRIPTION] = bs.find(
            'td', text='Description:').find_next_sibling().text.strip()
    except Exception:
        self.logger.warning(self.NOT_FOUND_MSG.format(url, 'description'))

    # detail rows live on the '/details' sub-page
    try:
        bs = load_page(url + '/details', self.html_parser)
    except Exception:
        # NOTE(review): on failure we fall through with the profile-page soup
        # (matching original behavior) — the ':'-suffixed keys simply won't match
        self.logger.error(
            self.NOT_FOUND_MSG.format(url + '/details', 'info table'))

    info_map = {
        'Pre-ICO start date:': DataKeys.PRE_ICO_START,
        'Pre-ICO end date:': DataKeys.PRE_ICO_END,
        'Hard cap:': DataKeys.HARD_CAP,
        'ICO start date:': DataKeys.ICO_START,
        'ICO end date:': DataKeys.ICO_END,
        'Soft cap:': DataKeys.SOFT_CAP,
        'Ticker:': DataKeys.TOKEN_NAME,
        'ICO Platform:': DataKeys.PLATFORM,
        'Token price in USD:': DataKeys.ICO_PRICE,
        'Accepted Currencies:': DataKeys.ACCEPTED_CURRENCIES,
        'Country Limitations:': DataKeys.COUNTRIES_RESTRICTED,
        'Token Standard:': DataKeys.TOKEN_STANDARD,
        'Registration Country:': DataKeys.COUNTRY
    }
    rows = bs.find_all('td', text=re.compile(r'.*:$'))
    for row in rows:
        try:
            key = row.text.strip()
            if key in info_map:
                data[info_map[key]] = row.find_next_sibling().text.strip()
        except AttributeError:
            continue

    IcoRating.process(data)
    return data