def web_scrapper(url):
    browser = mechanize.Browser()  # browser/mechanize used to cheat the search engine
    browser.set_handle_robots(False)
    browser.addheaders = [('User-agent', 'Mozilla')]
    r = browser.open(url).read()
    i = 0
    soup = BeautifulSoup(r, "html.parser")
    letters = soup.find_all("img")
    lobbying = {}
    l = lsapi('mozscape-295a2fa4c3', '95ef534d72971f96f3fd5776819a50f7')
    mozMetrics = l.urlMetrics(url)
    # print(mozMetrics)
    for element in letters:
        keyset = element.attrs
        x = {}
        if "alt" in keyset:
            x["alt"] = element["alt"].encode('ascii', 'replace').decode('ascii')
        else:
            x["alt"] = ""
        if "height" in keyset:
            x["height"] = element["height"].encode('ascii', 'replace').decode('ascii')
        else:
            x["height"] = 0
        if "width" in keyset:
            x["width"] = element["width"].encode('ascii', 'replace').decode('ascii')
        else:
            x["width"] = 0
        x["size"] = getsizes(url)
        # Whether to do it this way for src depends on the webpage.
        # Obviously for Bing you can just use src without adding url.
        x['inbound links'] = mozMetrics['uid']
        x['moz page rank'] = mozMetrics['umrp']
        x['moz sub-domain rank'] = mozMetrics['fmrp']
        if "src" in keyset:
            lobbying[url + element["src"].encode('ascii', 'replace').decode('ascii')] = x
            # urllib.urlretrieve(element["src"], os.path.basename(element["src"]))
        else:
            lobbying[url + str(i)] = x
        i = i + 1
    for key in lobbying.keys():
        x = lobbying[key]
        x['inbound links'] = mozMetrics['uid']
        x['moz page rank'] = mozMetrics['umrp']
        x['moz sub-domain rank'] = mozMetrics['fmrp']
        lobbying[key] = x
    return lobbying
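# A minimal usage sketch for web_scrapper above (hedged: it assumes the
# surrounding module already imports mechanize, bs4's BeautifulSoup and lsapi,
# and defines the getsizes() helper used in the function; the URL below is a
# placeholder, not data from the original project).
images = web_scrapper("http://www.example.com/")
for img_key, info in images.items():
    print(img_key, info["alt"], info["moz page rank"])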
def api_info(lobbying):
    l = lsapi('mozscape-295a2fa4c3', '95ef534d72971f96f3fd5776819a50f7')
    for key in lobbying.keys():
        print(lobbying[key]['href'])
        mozMetrics = l.urlMetrics(lobbying[key]['href'])
        lobbying[key]['inbound links'] = mozMetrics['uid']
        lobbying[key]['moz page rank'] = mozMetrics['umrp']
        lobbying[key]['moz subdomain rank'] = mozMetrics['fmrp']
        mozMetrics_domain = l.urlMetrics(lobbying[key]['domain'])
        lobbying[key]['domain inbound links'] = mozMetrics_domain['uid']
        lobbying[key]['domain page rank'] = mozMetrics_domain['umrp']
        print(lobbying[key])
    return lobbying
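# Hedged sketch of the input shape api_info expects: each entry needs an
# 'href' and a 'domain' key, since both are passed to urlMetrics. The sample
# dictionary below is illustrative only, not data from the original project.
sample = {
    "result-1": {
        "href": "http://www.example.com/page",
        "domain": "www.example.com",
    }
}
enriched = api_info(sample)
print(enriched["result-1"]["moz page rank"])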
def moz_url_metrics(mozscape):
    results = []
    guser = werkzeug_cache.get('guser')
    gs = Gspreadsheet(guser.gmail, guser.gpassword, None)
    gs.login()
    ss = gs.gclient.open_by_url(mozscape.gspread_link)
    ws = ss.sheet1
    urls = gs.col_one(ws)
    # FIXME only use the first url at A2, for now
    url = urls[0]
    l = lsapi(flask_app.config['MOZSCAPE_API_ACCESS_ID'],
              flask_app.config['MOZSCAPE_API_SECRET_KEY'])
    try:
        # mozscape restriction is NOT to make parallel requests but batch them instead!!!
        now_timestamp = datetime.utcnow()
        nrow = 2
        metrics = l.urlMetrics(url)
        # gspread update cells in row:
        acells = ws.range("B%s:L%s" % (nrow, nrow))
        acells[0].value = metrics['uid']
        acells[1].value = metrics['uu']
        acells[2].value = metrics['ut']
        acells[3].value = metrics['us']
        acells[4].value = metrics['upa']
        acells[5].value = metrics['ueid']
        acells[6].value = metrics['umrp']
        acells[7].value = metrics['umrr']
        acells[8].value = metrics['fmrp']
        acells[9].value = metrics['fmrr']
        acells[10].value = metrics['pda']
        ws.update_cells(acells)
        mr = MozscapeResult.create(name=mozscape.name, url=url,
                                   uid=metrics['uid'], uu=metrics['uu'],
                                   ut=metrics['ut'], us=metrics['us'],
                                   upa=metrics['upa'], ueid=metrics['ueid'],
                                   umrp=metrics['umrp'], umrr=metrics['umrr'],
                                   fmrp=metrics['fmrp'], fmrr=metrics['fmrr'],
                                   pda=metrics['pda'], timestamp=now_timestamp)
    except Exception as e:
        print("Error: moz_url_metrics:\n%s" % e)
    return len(results)
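# The comment in moz_url_metrics above notes that Mozscape asks for batched
# requests rather than parallel ones. A minimal sketch of batching, assuming
# placeholder credentials and URLs: urlMetrics called with a list returns one
# result dictionary per URL, in order (as in the sample script further down).
from lsapi import lsapi

batch_client = lsapi('my-access-id', 'my-secret-key')
batch = batch_client.urlMetrics(['www.example.com', 'www.example.com/blog'])
for metrics in batch:
    print(metrics['upa'], metrics['pda'])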
def get_backlinks(url, mozscapeAPIaccessID, mozscapeAPIkey):
    """
    Uses the Mozscape API to retrieve some backlinks on a url.
    Returns a list of urls.
    """
    # mozscape needs http:// out
    if len(url) >= 7 and "http://" == url[0:7]:
        url = url[7:]
    elif len(url) >= 8 and "https://" == url[0:8]:
        url = url[8:]
    else:
        return None
    l = lsapi(mozscapeAPIaccessID, mozscapeAPIkey)
    links = l.links(url, filters=['external', 'nofollow'])
    result = []
    for link in links:
        result.append("http://" + link['uu'])
    return result
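# A short usage sketch for get_backlinks above (hedged: the access id, secret
# key and target URL are placeholders, not working credentials).
backlinks = get_backlinks("http://www.example.com/",
                          "my-access-id", "my-secret-key")
if backlinks is not None:
    for b in backlinks:
        print(b)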
def moz_index_metadata():
    mim = None
    l = lsapi(flask_app.config['MOZSCAPE_API_ACCESS_ID'],
              flask_app.config['MOZSCAPE_API_SECRET_KEY'])
    try:
        now_timestamp = datetime.utcnow()
        try:
            mim = MozscapeIndexMetadata.get(MozscapeIndexMetadata.id == 1)
        except Exception as e:
            mim = MozscapeIndexMetadata()
            mim.timestamp = None
            print("Error: moz_index_metadata: "
                  "MozscapeIndexMetadata.get(MozscapeIndexMetadata.id==1)\n%s" % e)
        # do we need to update db or just return mim:
        if mim.timestamp is None or now_timestamp >= mim.next_update:
            metrics = l.index_metadata()
            mim.index_name = metrics['index_name']
            mim.crawl_duration = metrics['crawl_duration']
            mim.external_links_per_page = metrics['external_links_per_page']
            mim.links_per_page = metrics['links_per_page']
            mim.links = metrics['links']
            mim.plds = metrics['plds']
            mim.fqdns = metrics['fqdns']
            mim.nofollow = metrics['nofollow']
            mim.urls = metrics['urls']
            if str(metrics['locked']) == 'false':
                mim.locked = False
            else:
                mim.locked = True
            mim.rel_canonical = metrics['rel_canonical']
            mim.last_update = datetime.fromtimestamp(metrics['last_update'])
            mim.next_update = datetime.fromtimestamp(metrics['next_update'])
            mim.timestamp = now_timestamp
            mim.save()  # create or update
    except Exception as e:
        print("Error: moz_index_metadata:\n%s" % e)
    return mim
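# Hedged usage sketch for moz_index_metadata above: the function caches the
# Mozscape index metadata in the local MozscapeIndexMetadata row and only
# refreshes it once next_update has passed, so callers just read the result.
meta = moz_index_metadata()
if meta is not None:
    print(meta.index_name, meta.last_update, meta.next_update)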
def getBackLinks(Url):
    # print(Url)
    bLinks = set()
    l = lsapi('member-a1c2050723', '9776ad0162ea4c492b2b4d56a0cfcd1a')
    linksList = l.links(Url)
    # print("\n\n", linksList)
    for items in linksList:
        # print(items)
        for key, value in items.items():
            if key == "uu":
                # print(value)
                bLinks.add(value)
    # keep only a handful of backlinks
    bLinks = list(bLinks)
    # print(bLinks[0:2])
    for link in visitedPages:
        if link in bLinks:
            bLinks.remove(link)
    # print(bLinks[0:2])
    # Because of the free API limitation.
    time.sleep(10)
    return bLinks[0:2]
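# Hedged usage sketch for getBackLinks above. It assumes `from lsapi import
# lsapi`, `import time`, and a module-level `visitedPages` collection, which
# the original snippet reads but does not define; the URL is a placeholder.
visitedPages = []
for backlink in getBackLinks("www.example.com"):
    print(backlink)
    visitedPages.append(backlink)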
#! /usr/bin/env python

from lsapi import lsapi

l = lsapi('my-access-id', 'my-secret-key')

# As you may have noticed, there are lots of columns available.
# I did what I could to make them easily accessible, but there
# are a lot, and the names are long. So, the API calls have defaults.

# Let's get some URL metrics. Results are now an array of dictionaries;
# the i'th dictionary is the results for the i'th URL.
metrics = l.urlMetrics(['www.moz.com', 'www.moz.com/blog'])

# Now let's say we only want specific columns in the results
authorities = l.urlMetrics(['www.moz.com'],
                           lsapi.UMCols.domainAuthority | lsapi.UMCols.pageAuthority)

# Or if you just need results for one URL
mozMetrics = l.urlMetrics('www.moz.com')

# Now for some anchor text results
anchorResults = l.anchorText('www.moz.com/blog')

# Or for just specific columns
anchorTermResults = l.anchorText('www.moz.com/blog', cols=lsapi.ATCols.term)

# Now for some links results
links = l.links('www.moz.com')

# The links API has more columns to specify, as well as sort, scope, etc.
links = l.links('www.moz.com', scope='domain_to_domain', sort='domain_authority',
                filters=['external', 'nofollow'], targetCols=lsapi.UMCols.url)
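# A small hedged follow-up to the sample above: urlMetrics on a list returns
# one result dictionary per URL, keyed by the short Mozscape column names used
# elsewhere on this page (e.g. 'uu' for the URL, 'upa' for page authority,
# 'pda' for domain authority).
for m in metrics:
    print(m.get('uu'), m.get('upa'), m.get('pda'))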
from lsapi import lsapi
from lsapi import lsapiException
import time

# input: file with a list of seed urls
# output: file with a list of seed urls + backlinks

MOZ = {
    'access_id': 'mozscape-d7201e2b23',
    'secret_key': 'd605753f7d3a2f970353754a4b123b4c'
}

l = lsapi(MOZ['access_id'], MOZ['secret_key'])

seeds_path = 'input/ebola-1000.txt'
result_path = 'input/ebolaSeeds-3.txt'

if __name__ == "__main__":
    print("Start backlink fetcher")
    with open(seeds_path, 'r') as seeds_fp:
        seeds = [line.replace("\n", "") for line in seeds_fp.readlines()]

    backlinks_dict = {}
    cnt = 0
    # go through seeds, get backlinks from each and put in a dict[seed] = [backlinks]
    for seed in seeds:
        cnt += 1
        seed_backlinks = []
        try:
            links = l.links(seed, scope='page_to_page', sort='page_authority',
                            filters=['external'], targetCols=lsapi.UMCols.url)
        except lsapiException as e:
            links = []
            print("lsapiException:", e)
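        # (The original excerpt stops here. A hedged sketch of how the loop
        # might continue, based only on patterns from the other snippets on
        # this page: collect each link's 'uu' URL and pause between calls
        # because of the free-tier rate limit. Writing result_path is omitted
        # since the original does not show it.)
        for link in links:
            seed_backlinks.append("http://" + link['uu'])
        backlinks_dict[seed] = seed_backlinks
        time.sleep(10)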
"""Seomoz Backlink Analysis

Discover relevant backlinks.
"""
import os
import csv
import time
import argparse

import numpy as np
import pandas as pd

from lsapi import lsapi

api = lsapi('member-76bd0a8077', '09e78de0f24fbbf8b41b46623b75d5e6')

parser = argparse.ArgumentParser(description='Seomoz Analyzer')
parser.add_argument('urls', help='path to list of urls to analyze')
parser.add_argument('links', help='path to list of urls to output')
parser.add_argument('--column', default='resulturl', help='column name')
args = parser.parse_args()

urls = pd.read_csv(args.urls)
urls = set(urls[args.column])

if os.path.exists(args.links):
    df = pd.read_csv(args.links)
    for url in set(df['url']):
from lsapi import lsapi

l = lsapi("member-a1c2050723", "9776ad0162ea4c492b2b4d56a0cfcd1a")

print("Calling API")
mozMetrics = l.urlMetrics("http://www.google.com")
# print(mozMetrics)

# links = l.links('http://www.google.com')
links = l.links("www.soic.indiana.edu/computer-science/")
# print("\n\n", links)
for items in links:
    # print(items)
    for key, value in items.items():
        if key == "uu":
            print(key, value)
print("Call to API Ended")
#! /usr/bin/env python

from lsapi import lsapi

l = lsapi('my-access-id', 'my-secret-key')

# As you may have noticed, there are lots of columns available.
# I did what I could to make them easily accessible, but there
# are a lot, and the names are long. So, the API calls have defaults.

# Let's get some urlmetrics. Results are now an array of dictionaries;
# the i'th dictionary is the results for the i'th URL.
metrics = l.urlMetrics(['www.seomoz.org', 'www.seomoz.org/blog'])

# Now let's say we only want specific columns in the results
authorities = l.urlMetrics(['www.seomoz.org'],
                           lsapi.UMCols.domainAuthority | lsapi.UMCols.pageAuthority)

# Or if you just need results for one URL
mozMetrics = l.urlMetrics('www.seomoz.org')

# Now for some anchor text results
anchorResults = l.anchorText('www.seomoz.org/blog')

# Or for just specific columns
anchorTermResults = l.anchorText('www.seomoz.org/blog', cols=lsapi.ATCols.term)

# Now for some links results
links = l.links('www.seomoz.org')

# The links API has more columns to specify, as well as sort, scope, etc.
links = l.links('www.seomoz.org', scope='domain_to_domain', sort='domain_authority',
                filters=['external', 'nofollow'], targetCols=lsapi.UMCols.url)