Example #1
def scrape_site(url):
  print "Now scraping: " + url
  soup = souphelper.get_soup(url)
  if soup is None:
    return 0

  try:
    h1s = soup.find_all('h1')
    title = h1s[1].string.strip()
    tables = soup.find_all('table')
    counter = 1
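    # the first three <td> cells of tables[4] hold the listing's price, gross income, and cash flow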
    for td in tables[4].find_all('td'):
      if counter < 4: 
        value = souphelper.string_to_int(td.string)
      if counter == 1:
        price = value
      elif counter == 2:
        income = value
      elif counter == 3:
        cash_flow = value
      else:
        break
      counter += 1
    db_access.insert_data_point('websiteproperties.com', title, 'Price', price)
    db_access.insert_data_point('websiteproperties.com', title, 'Gross Income', income)
    db_access.insert_data_point('websiteproperties.com', title, 'Cash Flow', cash_flow)
  except:
    print "Error scraping page: " + url
Example #2
def scrape_site(url):
    print "Now scraping: " + url
    soup = souphelper.get_soup("http://www.siteworthchecker.com/" + url)
    if soup:

        try:
            worth = souphelper.string_to_int(soup.span.string)
            spans = soup.find_all('span')
            for i in range(len(spans)):
                try:
                    spans[i] = souphelper.string_to_int(str(spans[i]))
                except:
                    spans[i] = 0
            # for i in range(len(spans)):
            #     print str(i) + " : " + str(spans[i])

            daily_visits = spans[3]
            daily_revenue = spans[4]
            alexa_rank = spans[5]
            google_page_rank = spans[6]
            google_indexed_pages = spans[7]
            google_backlinks = spans[9]
            domain_age = spans[17]
        except:
            print "Error scraping: " + url
            return 0

        try:
            db_query = 'REPLACE INTO siteworthchecker SET url="%s", worth="%s", daily_visits="%s", daily_revenue="%s", alexa_rank="%s", google_page_rank="%s", google_indexed_pages="%s", google_backlinks="%s", domain_age="%s"' % (url, worth, daily_visits, daily_revenue, alexa_rank, google_page_rank, google_indexed_pages, google_backlinks, domain_age)
            con.query(db_query)
        except _mysql.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
Example #3
def scrape_site(url):
    print "Now scraping: " + url
    soup = souphelper.get_soup("http://webstatsart.com/" + url)
    if soup:
        try:
            worth_string = soup.h2.string
            worth = souphelper.string_to_int(worth_string)
            if "Million" in worth_string:
                worth = worth * 1000000

            tr = soup.find_all('tr')
            for item in tr:
                scan = str(item)
                # print "--------------------------"
                # print item
                if "Pagerank" in scan:
                    google_page_rank = souphelper.string_to_int(str(item.td.next_sibling))
                if "Alexa Rank" in scan:
                    alexa_rank = souphelper.string_to_int(str(item.td.next_sibling))
                if "Daily Unique Visitors" in scan:
                    daily_visits = souphelper.string_to_int(str(item.td.next_sibling))
                if "Daily Ad Revenue" in scan:
                    daily_revenue = souphelper.string_to_int(str(item.td.next_sibling))
        except:
            return 0

        try:
            db_query = 'REPLACE INTO webstatsart SET url="%s", worth="%s", daily_visits="%s", daily_revenue="%s", alexa_rank="%s", google_page_rank="%s"' % (url, worth, daily_visits, daily_revenue, alexa_rank, google_page_rank)
            con.query(db_query)
            return 1
        except _mysql.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
            return 0
Example #4
def grab_sites_to_crawl(url):
	print "Grabing sites to crawl from " + url
	# doc = open('websitebroker_cat.htm').read()
	# soup = BeautifulSoup(doc)
	soup = souphelper.get_soup(url) 
	if not soup:
		return 0

	for link in soup.find_all('a'):
		if 'site-details' in link['href']:
			sites_to_crawl.append(root + link['href'])
Example #5
def grab_categories_to_crawl(url):
  soup = souphelper.get_soup(url)
  if not soup:
    return 0

  # For testing DELETE ME later
  # doc = open('websitebroker.htm').read()
  # soup = BeautifulSoup(doc)

  for link in soup.find_all('a'):
    href = link['href']
    if 'listing' in href and 'new' not in href and "1" not in href[-1]:
      categories_to_crawl.append(root + href)
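
Examples #4 and #5 append into module-level lists (sites_to_crawl, categories_to_crawl) and prepend a module-level root URL, none of which appear in the snippets. The following is a small driver consistent with that usage, with every value assumed rather than taken from the original module:

root = 'http://www.websitebroker.com'  # assumed start URL; only the name `root` is implied by the snippets
categories_to_crawl = []
sites_to_crawl = []

def crawl():
    # Hypothetical entry point: gather category pages, then listing pages, then scrape each listing.
    grab_categories_to_crawl(root)
    for category in categories_to_crawl:
        grab_sites_to_crawl(category)
    for site in sites_to_crawl:
        scrape_site(site)
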
Example #6
def scrape_site(url):
    print 'Now scraping: ' + url
    soup = souphelper.get_soup('http://www.yandalo.com/www.' + url)
    if not soup:
        return 0

    h1 = soup.find_all('h1')
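    # each labelled <h1> ("Worth", "Pageviews", ...) is followed by an <h1> holding its value;
    # str(...)[4:] drops the leading "<h1>" before the numeric conversion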
    for i in range(len(h1)):
        item = str(h1[i])
        if i < len(h1) - 1:
            value = souphelper.string_to_int(str(h1[i+1])[4:])
        # print item
        if 'Worth' in item:
            worth = value
        if 'Pageviews' in item:
            daily_pageviews = value
        if 'Earnings' in item:
            daily_revenue = value

    h3 = soup.find_all('h3')
    for i in range(len(h3)):
        h3[i] = str(h3[i])[4:]
        # print h3[i]
        # print str(i) + ' : ' + item
    try:
        alexa_rank = souphelper.string_to_int(str(h3[1]))
        google_page_rank = str(h3[11])[80:81] #FIX ME!!
        bounce_rate = souphelper.string_to_float(str(h3[15])) #fix, this is a percentage!
        pageviews_per_user = souphelper.string_to_int(str(h3[5])) #again, this is a decimal
        daily_time_on_site = souphelper.string_to_int(str(h3[6])) #decimal
        load_time = souphelper.string_to_float(str(h3[9])) #decimal
        bing_backlinks = souphelper.string_to_int(str(h3[3]))
        inbound_links = souphelper.string_to_int(str(h3[4]))
        visits_from_se = souphelper.string_to_float(str(h3[16])) #decimal
        if 'Yes' in h3[0]:
            dmoz_directory = 1
        else:
            dmoz_directory = 0
    except:
        print 'Error grabbing data for: ' + url
        return 0



    try:
        db_query = 'REPLACE INTO yandalo SET url="%s", worth="%s", daily_pageviews="%s", daily_revenue="%s", alexa_rank="%s", google_page_rank="%s", bounce_rate="%s", pageviews_per_user="%s", daily_time_on_site="%s", load_time="%s", bing_backlinks="%s", inbound_links="%s", visits_from_se="%s", dmoz_directory="%s"' % (url, worth, daily_pageviews, daily_revenue, alexa_rank, google_page_rank, bounce_rate, pageviews_per_user, daily_time_on_site, load_time, bing_backlinks, inbound_links, visits_from_se, dmoz_directory)
        con.query(db_query)
    except _mysql.Error, e:
        print "Error %d: %s" % (e.args[0], e.args[1])
        return 0
Example #7
def scrape_site(url):
    print "Now Scraping: " + url
    soup = souphelper.get_soup("http://www.statsmogul.com/" + url)
    if soup:
        td = soup.find_all('td')
        for i in range(len(td)):
            new_value = souphelper.string_to_int(str(td[i]))
            if new_value is not None:
                td[i] = new_value

        try:
            # The actual data
            em = soup.find_all('em')
            worth = souphelper.string_to_int(str(em[4]))
            alexa_rank = td[0]
            compete_rank = td[1]
            quantcast_rank = td[2]
            google_page_rank = td[7]
            domain_characters = td[3]
            if "No" in td[4]:
                domain_dictionary = 0
            else:
                domain_dictionary = 1
            domain_age = td[5]
            if "No" in td[6]:
                domain_dashes_or_numbers = 0
            else:
                domain_dashes_or_numbers = 1
            yahoo_backlinks = td[8]
            if "No" in td[10]:
                meta_tags = 0
            else:
                meta_tags = 1
            if "No" in td[11]:
                dmoz_directory = 0
            else:
                dmoz_directory = 1
        except:
            print "     Error scraping: " + url
            return 0


        try:
            db_query = 'REPLACE INTO statsmogul SET url="%s", worth="%s", alexa_rank="%s", compete_rank="%s", quantcast_rank="%s", google_page_rank="%s", domain_characters="%s", domain_dictionary="%s", domain_age="%s", domain_dashes_or_numbers="%s", yahoo_backlinks="%s", meta_tags="%s", dmoz_directory="%s"' % (url, worth, alexa_rank, compete_rank, quantcast_rank, google_page_rank, domain_characters, domain_dictionary, domain_age, domain_dashes_or_numbers, yahoo_backlinks, meta_tags, dmoz_directory)
            con.query(db_query)
            return 1
        except _mysql.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
            return 0
Example #8
def grab_sites_to_crawl(url):
  soup = souphelper.get_soup(url)
  if soup is None:
    return 0
  # doc = open('websiteproperties.htm').read()
  # soup = BeautifulSoup(doc)

  for link in soup.find_all('a'):
    if link.string is not None:
      href = link['href']
      linkstring = link.string.encode('ascii', 'ignore') #may need to get rid of this on prod
      if "Next" in linkstring:
        print "Found another page!"
        grab_sites_to_crawl(domain + href)
        print ''
      if "view" in href:
        sites_to_crawl.append(domain + href)
Example #9
def scrape_site(url):
	print "Now scraping: " + url
	soup = souphelper.get_soup(url)
	if not soup:
		return 0
	# doc = open('websitebroker_page.htm').read()
	# soup = BeautifulSoup(doc)

	# We want tables[4], the fifth table on the page
	tables = soup.find_all('table')
	counter = 1
	try:
		for td in tables[4].find_all('td'):
			# It is ridiculous to call re.search on each of these items separately,
			# but for some reason calling it once and storing the result in a
			# variable was having issues. Fix me if you have time!
			if td.string is None:
				break
			if counter == 9:
				visitors = souphelper.string_to_int(td.string)
				# print visitors
			elif counter == 11:
				page_views = souphelper.string_to_int(td.string)
				# print page_views
			elif counter == 13:
				income = souphelper.string_to_int(td.string)
				# print income
			elif counter == 15:
				expenses = souphelper.string_to_int(td.string)
				# print expenses
			elif counter == 17:
				price = souphelper.string_to_int(td.string)
				# print price
			elif counter == 19:
				url = td.string
				# print url
			counter += 1

		db_access.insert_data_point('websitebroker.com', url, 'Price', price)
		db_access.insert_data_point('websitebroker.com', url, 'Visitors', visitors)
		db_access.insert_data_point('websitebroker.com', url, 'Page Views', page_views)
		db_access.insert_data_point('websitebroker.com', url, 'Income', income)
		db_access.insert_data_point('websitebroker.com', url, 'Expenses', expenses)
	except:
		print "Error scraping site: " + url
Example #10
def scrape_site(url):
    print "Now scraping: " + url
    soup = souphelper.get_soup("http://www.cubestat.com/www." + url)
    # Yeah, it's stupid to have to assign these here, but even with try I was getting assignment errors
    # worth = 0
    # daily_pageviews = 0
    # daily_revenue = 0
    # alexa_rank = 0
    # quantcast_rank = 0
    # compete_rank = 0
    # google_page_rank = 0
    # live_indexed_pages = 0
    # domain_age = 0
    if soup:
        # for span in soup.find_all('span'):
        #     print span
        spans = soup.find_all('span')
        for i in range(len(spans)):
            try:
                # print str(i) + ' : ' + str(souphelper.string_to_int(str(spans[i].string)))
                spans[i] = souphelper.string_to_int(str(spans[i].string))
            except:
                spans[i] = 0

        try:
            worth = spans[0]
            daily_pageviews = spans[1]
            daily_revenue = spans[2]
            alexa_rank = spans[9]
            quantcast_rank = spans[10]
            compete_rank = spans[11]
            google_page_rank = spans[12]
            live_indexed_pages = spans[17]
            domain_age = spans[26]
        except:
            print "list index out of range error, cannot scape: " + url
            return 0

        try:
            db_query = 'REPLACE INTO cubestat SET url="%s", worth="%s", daily_pageviews="%s", daily_revenue="%s", alexa_rank="%s", quantcast_rank="%s", compete_rank="%s", google_page_rank="%s", live_indexed_pages="%s", domain_age="%s"' % (url, worth, daily_pageviews, daily_revenue, alexa_rank, quantcast_rank, compete_rank, google_page_rank, live_indexed_pages, domain_age)
            con.query(db_query)
        except _mysql.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
            return 0
Example #11
def scrape_site(url):
    print "Now crawling: " + url
    soup = souphelper.get_soup("http://www.websiteoutlook.com/www." + url)
    if not soup:
        return 0

    # To get the worth of the site, look in <div class="wid">, then the <p> tag
    description = soup.find_all("div", "wid")
    worth_re = re.findall("USD [0-9]+[.]*[0-9]*", str(description[0]))
    millions = re.findall("USD [0-9]+[.]*[0-9]* Million", str(description[0]))
    billions = re.findall("USD [0-9]+[.]*[0-9]* Billion", str(description[0]))
    worth = worth_re[1][4:]  # there always seem to be two "USD ..." descriptions; we want the second one
    # websiteoutlook reports the figure in millions (or billions) when it is large, so scale it back up
    if millions:
        worth = int(float(worth) * 1000000)
    elif billions:
        worth = int(float(worth) * 1000000000)
    else:
        worth = int(re.search("[0-9]+", worth).group(0))

    tables = soup.find_all("table")

    # ----------------------------------------------------------
    # Grab data in First Table
    # Contains: domain, daily pageviews, daily ads
    table_data = []
    for tr in tables[0].find_all("tr"):
        # print "New TR--------------------------"
        counter = 1
        counter2 = 1  # only want the first 3 data points
        for child in tr.children:
            if counter == 4 and counter2 < 4:
                if child.string is not None:
                    # print "Keeping this one: " + child.string
                    data_point = re.sub("[$,]", "", child.string)
                    table_data.append(data_point)
                    counter2 += 1
            counter += 1

    # -----------------------------------------------------------
    # Grab data in second table
    # Contains: traffic rank, page rank, backlinks
    for tr in tables[1].find_all("tr"):
        counter = 1
        counter2 = 1
        for child in tr.children:
            if counter == 4 and counter2 < 4:
                if child.string is not None:
                    data_point = child.string.encode("utf-8").strip()
                    table_data.append(data_point)
                    counter2 += 1
            counter += 1
    # for item in table_data:
    #   item = str(item)
    #   item2 = item.strip()
    try:
        counter = 1
        for item in table_data:
            if counter == 1:
                url = item
            else:
                item = int(re.search("[0-9]+", item).group(0))  # clear out spaces and floats
                if counter == 2:
                    daily_pageview = item
                if counter == 3:
                    daily_ads_revenue = item
                if counter == 4:
                    traffic_rank = item
                if counter == 5:
                    page_rank = item
                if counter == 6:
                    backlinks = item
            counter += 1
        # db_access.insert_data_point('websiteoutlook.com', url, 'Daily Pageview', daily_pageview)
        # db_access.insert_data_point('websiteoutlook.com', url, 'Daily Ads Revenue', daily_ads_revenue)
        # db_access.insert_data_point('websiteoutlook.com', url, 'Traffic Rank', traffic_rank)
        # db_access.insert_data_point('websiteoutlook.com', url, 'Page Rank', page_rank)
        # db_access.insert_data_point('websiteoutlook.com', url, 'Backlinks', backlinks)
        # db_access.insert_data_point('websiteoutlook.com', url, 'Price', worth)
        db_query = (
            'REPLACE INTO websiteoutlook SET url="%s", worth="%s", daily_pageviews="%s", daily_revenue="%s", alexa_rank="%s"'
            % (url, worth, daily_pageview, daily_ads_revenue, traffic_rank)
        )
        con.query(db_query)

    except:
        print "Error inserting data for: " + url
Example #12
def grab_sites_to_crawl(url):
    soup = souphelper.get_soup(url)
    if soup:
        for link in soup.find_all('a'):
            print link
Example #13
def scrape_url(url):
    print 'Now scraping: ' + url
    soup = souphelper.get_soup('http://www.worthofweb.com/website-value/' + url)
    if not soup:
        return 0
Example #14
def scrape_site(url):
    print "Now scraping site: " + url
    soup = souphelper.get_soup("http://www.estimurl.com/US/")