def scrape_site(url): print "Now scraping: " + url soup = souphelper.get_soup(url) if soup is None: return 0 try: h1s = soup.find_all('h1') title = h1s[1].string.strip() tables = soup.find_all('table') counter = 1 for td in tables[4].find_all('td'): if counter < 4: value = souphelper.string_to_int(td.string) if counter == 1: price = value elif counter == 2: income = value elif counter == 3: cash_flow = value else: break counter += 1 db_access.insert_data_point('websiteproperties.com', title, 'Price', price) db_access.insert_data_point('websiteproperties.com', title, 'Gross Income', income) db_access.insert_data_point('websiteproperties.com', title, 'Cash Flow', cash_flow) except: print "Error scraping page: " + url
def scrape_site(url): print "Now scraping: " + url soup = souphelper.get_soup("http://www.siteworthchecker.com/" + url) if soup: try: worth = souphelper.string_to_int(soup.span.string) spans = soup.find_all('span') for i in range(len(spans)): try: spans[i] = souphelper.string_to_int(str(spans[i])) except: spans[i] = 0 # for i in range(len(spans)): # print str(i) + " : " + str(spans[i]) daily_visits = spans[3] daily_revenue = spans[4] alexa_rank = spans[5] google_page_rank = spans[6] google_indexed_pages = spans[7] google_backlinks = spans[9] domain_age = spans[17] except: print "Error scraping: " + url return 0 try: db_query = 'REPLACE INTO siteworthchecker SET url="%s", worth="%s", daily_visits="%s", daily_revenue="%s", alexa_rank="%s", google_page_rank="%s", google_indexed_pages="%s", google_backlinks="%s", domain_age="%s"' % (url, worth, daily_visits, daily_revenue, alexa_rank, google_page_rank, google_indexed_pages, google_backlinks, domain_age) con.query(db_query) except _mysql.Error, e: print "Error %d: %s" % (e.args[0], e.args[1])
def scrape_site(url): print "Now scraping: " + url soup = souphelper.get_soup("http://webstatsart.com/" + url) if soup: try: worth_string = soup.h2.string worth = souphelper.string_to_int(worth_string) if "Million" in worth_string: worth = worth * 1000000 tr = soup.find_all('tr') for item in tr: scan = str(item) # print "--------------------------" # print item if "Pagerank" in scan: google_page_rank = souphelper.string_to_int(str(item.td.next_sibling)) if "Alexa Rank" in scan: alexa_rank = souphelper.string_to_int(str(item.td.next_sibling)) if "Daily Unique Visitors" in scan: daily_visits = souphelper.string_to_int(str(item.td.next_sibling)) if "Daily Ad Revenue" in scan: daily_revenue = souphelper.string_to_int(str(item.td.next_sibling)) except: return 0 try: db_query = 'REPLACE INTO webstatsart SET url="%s", worth="%s", daily_visits="%s", daily_revenue="%s", alexa_rank="%s", google_page_rank="%s"' % (url, worth, daily_visits, daily_revenue, alexa_rank, google_page_rank) con.query(db_query) return 1 except _mysql.Error, e: print "Error %d: %s" % (e.args[0], e.args[1]) return 0
def grab_sites_to_crawl(url): print "Grabing sites to crawl from " + url # doc = open('websitebroker_cat.htm').read() # soup = BeautifulSoup(doc) soup = souphelper.get_soup(url) if not soup: return 0 for link in soup.find_all('a'): if 'site-details' in link['href']: sites_to_crawl.append(root + link['href'])
def grab_categories_to_crawl(url):
    soup = souphelper.get_soup(url)
    if not soup:
        return 0
    # For testing, DELETE ME later
    # doc = open('websitebroker.htm').read()
    # soup = BeautifulSoup(doc)
    for link in soup.find_all('a'):
        href = link['href']
        if 'listing' in href and 'new' not in href and "1" not in href[-1]:
            categories_to_crawl.append(root + href)
def scrape_site(url):
    print 'Now scraping: ' + url
    soup = souphelper.get_soup('http://www.yandalo.com/www.' + url)
    if not soup:
        return 0
    h1 = soup.find_all('h1')
    for i in range(len(h1)):
        item = str(h1[i])
        if i < len(h1) - 1:
            value = souphelper.string_to_int(str(h1[i + 1])[4:])
            # print item
            if 'Worth' in item:
                worth = value
            if 'Pageviews' in item:
                daily_pageviews = value
            if 'Earnings' in item:
                daily_revenue = value
    h3 = soup.find_all('h3')
    for i in range(len(h3)):
        h3[i] = str(h3[i])[4:]
        # print h3[i]
        # print str(i) + ' : ' + item
    try:
        alexa_rank = souphelper.string_to_int(str(h3[1]))
        google_page_rank = str(h3[11])[80:81]  # FIX ME!!
        bounce_rate = souphelper.string_to_float(str(h3[15]))  # fix: this is a percentage!
        pageviews_per_user = souphelper.string_to_int(str(h3[5]))  # again, this is a decimal
        daily_time_on_site = souphelper.string_to_int(str(h3[6]))  # decimal
        load_time = souphelper.string_to_float(str(h3[9]))  # decimal
        bing_backlinks = souphelper.string_to_int(str(h3[3]))
        inbound_links = souphelper.string_to_int(str(h3[4]))
        visits_from_se = souphelper.string_to_float(str(h3[16]))  # decimal
        if 'Yes' in h3[0]:
            dmoz_directory = 1
        else:
            dmoz_directory = 0
    except:
        print 'Error grabbing data for: ' + url
        return 0
    try:
        db_query = 'REPLACE INTO yandalo SET url="%s", worth="%s", daily_pageviews="%s", daily_revenue="%s", alexa_rank="%s", google_page_rank="%s", bounce_rate="%s", pageviews_per_user="%s", daily_time_on_site="%s", load_time="%s", bing_backlinks="%s", inbound_links="%s", visits_from_se="%s", dmoz_directory="%s"' % (url, worth, daily_pageviews, daily_revenue, alexa_rank, google_page_rank, bounce_rate, pageviews_per_user, daily_time_on_site, load_time, bing_backlinks, inbound_links, visits_from_se, dmoz_directory)
        con.query(db_query)
    except _mysql.Error, e:
        print "Error %d: %s" % (e.args[0], e.args[1])
        return 0
def scrape_site(url): print "Now Scraping: " + url soup = souphelper.get_soup("http://www.statsmogul.com/" + url) if soup: td = soup.find_all('td') for i in range(len(td)): new_value = souphelper.string_to_int(str(td[i])) if new_value is not None: td[i] = new_value try: # The actual data em = soup.find_all('em') worth = souphelper.string_to_int(str(em[4])) alexa_rank = td[0] compete_rank = td[1] quantcast_rank = td[2] google_page_rank = td[7] domain_characters = td[3] if "No" in td[4]: domain_dictionary = 0 else: domain_dictionary = 1 domain_age = td[5] if "No" in td[6]: domain_dashes_or_numbers = 0 else: domain_dashes_or_numbers = 1 yahoo_backlinks = td[8] if "No" in td[10]: meta_tags = 0 else: meta_tags = 1 if "No" in td[11]: dmoz_directory = 0 else: dmoz_directory = 1 except: print " Error scraping: " + url return 0 try: db_query = 'REPLACE INTO statsmogul SET url="%s", worth="%s", alexa_rank="%s", compete_rank="%s", quantcast_rank="%s", google_page_rank="%s", domain_characters="%s", domain_dictionary="%s", domain_age="%s", domain_dashes_or_numbers="%s", yahoo_backlinks="%s", meta_tags="%s", dmoz_directory="%s"' % (url, worth, alexa_rank, compete_rank, quantcast_rank, google_page_rank, domain_characters, domain_dictionary, domain_age, domain_dashes_or_numbers, yahoo_backlinks, meta_tags, dmoz_directory) con.query(db_query) return 1 except _mysql.Error, e: print "Error %d: %s" % (e.args[0], e.args[1]) return 0
def grab_sites_to_crawl(url):
    soup = souphelper.get_soup(url)
    if soup is None:
        return 0
    # doc = open('websiteproperties.htm').read()
    # soup = BeautifulSoup(doc)
    for link in soup.find_all('a'):
        if link.string is not None:
            href = link['href']
            linkstring = link.string.encode('ascii', 'ignore')  # may need to get rid of this on prod
            if "Next" in linkstring:
                print "Found another page!"
                grab_sites_to_crawl(domain + href)
                print ''
            if "view" in href:
                sites_to_crawl.append(domain + href)
def scrape_site(url): print "Now scraping: " + url soup = souphelper.get_soup(url) if not soup: return 0 # doc = open('websitebroker_page.htm').read() # soup = BeautifulSoup(doc) # Want [4] tables = soup.find_all('table') counter = 1 try: for td in tables[4].find_all('td'): # This is totally ridiculous to call the re.search on each of these itmes # but for some reason calling it once in a variable was having issues. # Fix me if you have time!!! if td.string is None: break if counter == 9: visitors = souphelper.string_to_int(td.string) # print visitors elif counter == 11: page_views = souphelper.string_to_int(td.string) # print page_views elif counter == 13: income = souphelper.string_to_int(td.string) # print income elif counter == 15: expenses = souphelper.string_to_int(td.string) # print expenses elif counter == 17: price = souphelper.string_to_int(td.string) # print price elif counter == 19: url = td.string # print url counter += 1 db_access.insert_data_point('websitebroker.com', url, 'Price', price) db_access.insert_data_point('websitebroker.com', url, 'Visitors', visitors) db_access.insert_data_point('websitebroker.com', url, 'Page Views', page_views) db_access.insert_data_point('websitebroker.com', url, 'Income', income) db_access.insert_data_point('websitebroker.com', url, 'Expenses', expenses) except: print "Error scraping site: " + url
def scrape_site(url): print "Now scraping: " + url soup = souphelper.get_soup("http://www.cubestat.com/www." + url) # Yeah, it's stupid to have to assign these here, but even with try I was getting assignment errors # worth = 0 # daily_pageviews = 0 # daily_revenue = 0 # alexa_rank = 0 # quantcast_rank = 0 # compete_rank = 0 # google_page_rank = 0 # live_indexed_pages = 0 # domain_age = 0 if soup: # for span in soup.find_all('span'): # print span spans = soup.find_all('span') for i in range(len(spans)): try: # print str(i) + ' : ' + str(souphelper.string_to_int(str(spans[i].string))) spans[i] = souphelper.string_to_int(str(spans[i].string)) except: spans[i] = 0 try: worth = spans[0] daily_pageviews = spans[1] daily_revenue = spans[2] alexa_rank = spans[9] quantcast_rank = spans[10] compete_rank = spans[11] google_page_rank = spans[12] live_indexed_pages = spans[17] domain_age = spans[26] except: print "list index out of range error, cannot scape: " + url return 0 try: db_query = 'REPLACE INTO cubestat SET url="%s", worth="%s", daily_pageviews="%s", daily_revenue="%s", alexa_rank="%s", quantcast_rank="%s", compete_rank="%s", google_page_rank="%s", live_indexed_pages="%s", domain_age="%s"' % (url, worth, daily_pageviews, daily_revenue, alexa_rank, quantcast_rank, compete_rank, google_page_rank, live_indexed_pages, domain_age) con.query(db_query) except _mysql.Error, e: print "Error %d: %s" % (e.args[0], e.args[1]) return 0
def scrape_site(url): print "Now crawling: " + url soup = souphelper.get_soup("http://www.websiteoutlook.com/www." + url) if not soup: return 0 # To get the worth of the site, look in <div class="wid">, then the <p> tag description = soup.find_all("div", "wid") worth_re = re.findall("USD [0-9]+[.]*[0-9]*", str(description[0])) millions = re.findall("USD [0-9]+[.]*[0-9]* Million", str(description[0])) billions = re.findall("USD [0-9]+[.]*[0-9]* Billion", str(description[0])) worth = worth_re[1][4:] # there seems to be always 2 `USD descriptions, we want the second one # websiteoutlook adds millions to the number if it is over a million if millions: worth = int(float(worth) * 1000000) elif billions: worth = int(float(worth) * 1000000000) else: worth = int(re.search("[0-9]+", worth).group(0)) tables = soup.find_all("table") # ---------------------------------------------------------- # Grab data in First Table # Conatins: domain, daily pageviews, daily ads table_data = [] for tr in tables[0].find_all("tr"): # print "New TR--------------------------" counter = 1 counter2 = 1 # only want the first 3 data points for child in tr.children: if counter == 4 and counter2 < 4: if child.string is not None: # print "Keeping this one: " + child.string data_point = re.sub("[$,]", "", child.string) table_data.append(data_point) counter2 += counter2 counter += 1 # ----------------------------------------------------------- # Grab data in second table # Contains: traffic rank, page rank, backlinks for tr in tables[1].find_all("tr"): counter = 1 counter2 = 1 for child in tr.children: if counter == 4 and counter2 < 4: if child.string is not None: data_point = child.string.encode("utf-8").strip() table_data.append(data_point) counter2 += 1 counter += 1 # for item in table_data: # item = str(item) # item2 = item.strip() try: counter = 1 for item in table_data: if counter == 1: url = item else: item = int(re.search("[0-9]+", item).group(0)) # clear out spaces and floats if counter == 2: daily_pageview = item if counter == 3: daily_ads_revenue = item if counter == 4: traffic_rank = item if counter == 5: page_rank = item if counter == 6: backlinks = item counter += 1 # db_access.insert_data_point('websiteoutlook.com', url, 'Daily Pageview', daily_pageview) # db_access.insert_data_point('websiteoutlook.com', url, 'Daily Ads Revenue', daily_ads_revenue) # db_access.insert_data_point('websiteoutlook.com', url, 'Traffic Rank', traffic_rank) # db_access.insert_data_point('websiteoutlook.com', url, 'Page Rank', page_rank) # db_access.insert_data_point('websiteoutlook.com', url, 'Backlinks', backlinks) # db_access.insert_data_point('websiteoutlook.com', url, 'Price', worth) db_query = ( 'REPLACE INTO websiteoutlook SET url="%s", worth="%s", daily_pageviews="%s", daily_revenue="%s", alexa_rank="%s"' % (url, worth, daily_pageview, daily_ads_revenue, traffic_rank) ) con.query(db_query) except: print "Error inserting data for: " + url
def grab_sites_to_crawl(url):
    soup = souphelper.get_soup(url)
    if soup:
        for link in soup.find_all('a'):
            print link
def scrape_url(url):
    print 'Now scraping: ' + url
    soup = souphelper.get_soup('http://www.worthofweb.com/website-value/' + url)
    if not soup:
        return 0
def scrape_site(url): print "Now scraping site: " + url soup = souphelper.get_soup("http://www.estimurl.com/US/")