from __future__ import print_function

import os
import sys

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.graph import Graph, CENTRALITY

# A graph is a network of nodes (or concepts)
# connected to each other with edges (or links).
g = Graph()
for n in ("tree", "nest", "bird", "fly", "insect", "ant"):
    g.add_node(n)

g.add_edge("tree", "nest")   # Trees have bird nests.
g.add_edge("nest", "bird")   # Birds live in nests.
g.add_edge("bird", "fly")    # Birds eat flies.
g.add_edge("ant", "bird")    # Birds eat ants.
g.add_edge("fly", "insect")  # Flies are insects.
g.add_edge("insect", "ant")  # Ants are insects.
g.add_edge("ant", "tree")    # Ants crawl on trees.

# From tree => fly: tree => ant => bird => fly
print(g.shortest_path(g.node("tree"), g.node("fly")))
print(g.shortest_path(g.node("nest"), g.node("ant")))
print()

# Which nodes get the most traffic?
for n in sorted(g.nodes, key=lambda n: n.centrality, reverse=True):
    print('%.2f' % n.centrality, n)
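# For intuition, here is a minimal standalone sketch of what Node.centrality
# measures: betweenness centrality, i.e. how often a node lies on the
# shortest paths between other nodes. This is an illustrative approximation
# using only the standard library (it follows a single shortest path per
# pair of nodes), not pattern.graph's own implementation.

from collections import deque

def shortest_paths(neighbors, source):
    """Return one shortest path from source to every reachable node (BFS)."""
    paths = {source: [source]}
    queue = deque([source])
    while queue:
        node = queue.popleft()
        for neighbor in neighbors[node]:
            if neighbor not in paths:
                paths[neighbor] = paths[node] + [neighbor]
                queue.append(neighbor)
    return paths

def betweenness(neighbors):
    """Count how often each node occurs as an intermediate hop."""
    counts = dict.fromkeys(neighbors, 0)
    for source in neighbors:
        for path in shortest_paths(neighbors, source).values():
            for node in path[1:-1]:  # The endpoints themselves do not count.
                counts[node] += 1
    return counts

# The same undirected graph as above, written as an adjacency map:
neighbors = {
    "tree":   ["nest", "ant"],
    "nest":   ["tree", "bird"],
    "bird":   ["nest", "fly", "ant"],
    "fly":    ["bird", "insect"],
    "insect": ["fly", "ant"],
    "ant":    ["bird", "insect", "tree"],
}
print(betweenness(neighbors))  # "bird" is crossed most often.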
from __future__ import print_function

import re
import itertools
import operator

import numpy as np

from pattern.graph import Graph, adjacency
from pattern.web import URLError, URLTimeout, HTTP404NotFound

# WebPage, WebsiteDatabase, Result, CloudSearchIndexer and Emitter are
# project-local helpers, assumed to be importable from this package.


class WebCrawler(object):

    def __init__(self, args, depth=1):
        self.links = [WebPage(x) for x in args.url]
        self.depth = depth
        self.historyDb = WebsiteDatabase()
        self.done = False
        self.options = args
        self.results = {link.url.domain: Result() for link in self.links}
        self.cloudIndexer = CloudSearchIndexer.forDomainIndex("websites")
        if args.graph or args.rank:
            self.webGraph = Graph(distance=30.0)
            for link in self.links:
                self.webGraph.add_node(link.url.domain, radius=15,
                                       fill=(1, 0, 0, 0.5))

    def __del__(self):
        self.cloudIndexer._commitToAmazon()

    def crawl(self):
        if len(self.links) < 1:
            self.done = True
            self.finish()
            return
        site = self.links.pop(0)

        if self.historyDb.wasPageVisited(site):
            print('reading data')
            site = self.historyDb.readWebPage(site.url.string,
                                              isExternal=site.isExternal,
                                              depth=site.depth)
        else:
            print('downloading')
            try:
                site.downloadContent()
            except HTTP404NotFound:
                return self.fail(site, "404 not found")
            except URLTimeout:
                return self.fail(site, "Timeout error")
            except URLError as err:
                return self.fail(site, str(err))
            connected = site.depth != self.depth
            self.historyDb.insertWebpage(site, connection=connected)

        self.historyDb.appendSession(site)
        for link in site.getLinks():
            if self.isValidForQueue(link):
                if link.isExternal and (self.options.graph or self.options.rank):
                    self.addDomainNode(link)
                    if site.depth < self.depth:
                        self.links.append(link)
                elif not link.isExternal and site.depth < self.depth:
                    # Internal links go to the front of the queue.
                    self.links.insert(0, link)

        if not self.historyDb.wasPageVisited(site):
            self.visit(site)
        site.cleanCashedData()

    def isValidForQueue(self, link):
        """A link is queued once per session; revisits only record the relation."""
        if link not in self.links and not link.url.anchor:
            if self.historyDb.isInThisSession(link):
                self.historyDb.insertRelation(link.parent, link)
            else:
                return True
        return False

    def addDomainNode(self, page):
        # Skip malformed domains and links within the same domain.
        if not re.search(r"\.", page.url.domain):
            return
        if page.parent.url.domain == page.url.domain:
            return
        if self.webGraph.node(page.url.domain) is None:
            self.webGraph.add_node(page.url.domain, radius=15)
        if self.webGraph.edge(page.parent.url.domain, page.url.domain) is None:
            self.webGraph.add_edge(page.parent.url.domain, page.url.domain,
                                   weight=0.0, type='is-related-to')

    def visit(self, page):
        print('visited:', page.url.string, 'domain:', page.url.domain,
              'graph:', self.options.graph)
        self.cloudIndexer.addDocument(page)
        if page.isExternal and self.options.graph and page.url.domain not in self.results:
            self.webGraph.node(page.url.domain).fill = (0, 1, 0, 0.5)
        try:
            if self.options.text:
                self.results[page.url.domain].wordStats += page.countWords()
            if self.options.a:
                links = [link.url.string for link in page.getLinks()]
                self.results[page.url.domain].links.update(links)
            if self.options.image:
                self.results[page.url.domain].images.update(page.getImages())
            if self.options.script:
                self.results[page.url.domain].scripts.update(page.getScripts())
        except Exception as e:
            print("Error parsing document:", type(e).__name__ + ': ' + str(e))

    def fail(self, link, error):
        print('failed:', link.url.string, 'err:', error)

    def finish(self):
        """Print all results and compute the cosine similarity
        between all provided urls."""
        self.historyDb.clearSession()
        with Emitter(self.options.console, self.options.file) as output:
            for key, value in self.results.items():
                output.emitLine(key)
                value.emit(output)
            if len(self.results) > 1 and self.options.text and self.options.cos:
                combinations = [list(x) for x in
                                itertools.combinations(self.results.keys(), 2)]
                for pair in combinations:
                    cosValue = self.results[pair[0]].cosineSimilarity(self.results[pair[1]])
                    output.emitLine(u"cos similarity between:{0} and {1} = {2}".format(
                        pair[0], pair[1], cosValue))
            output.emitLine('')
            #output.emitLine("max depth: " + str(max(site.depth for site in self.history)))
            #output.emitLine("sites visited: " + str(len(self.history)))
            if self.options.graph:
                self.webGraph.eigenvector_centrality()
                self.webGraph.export('graph', directed=True,
                                     width=2200, height=1600, repulsion=10)
            if self.options.rank:
                ranks = self.calculatePageRank()
                output.emitLine('')
                output.emit(ranks)

    def calculatePageRank(self):
        # Build a column-stochastic matrix M where, after the transpose,
        # M[i, j] is the probability of moving from domain j to domain i.
        adjMap = adjacency(self.webGraph, directed=True, stochastic=True)
        domains = list(adjMap.keys())
        M = np.zeros((len(domains), len(domains)))
        for idx, domain in enumerate(domains):
            for connection in adjMap[domain].keys():
                M[idx, domains.index(connection)] = adjMap[domain][connection]
        M = np.transpose(M)
        #M = np.array([[0,0,0,0,1], [0.5,0,0,0,0], [0.5,0,0,0,0], [0,1,0.5,0,0], [0,0,0.5,1,0]])
        #M = np.array([[0, 0.5, 0],[0.5,0.5, 0], [0.5, 0, 0]])
        pageScores = self.executeComputations(M)
        print(pageScores)
        ranks = dict(zip(domains, pageScores))
        ranks = sorted(ranks.items(), key=operator.itemgetter(1))
        return ranks

    def executeComputations(self, M):
        """Power iteration with damping (the PageRank algorithm)."""
        damping = 0.80
        error = 0.0000001
        N = M.shape[0]
        v = np.ones(N)
        v = v / np.linalg.norm(v, 1)
        last_v = np.full(N, np.finfo(float).max)
        # Dangling nodes (all-zero columns) are assumed to link everywhere.
        for i in range(0, N):
            if sum(M[:, i]) == 0:
                M[:, i] = np.full(N, 1.0 / N)
        M_hat = np.multiply(M, damping) + np.full((N, N), (1 - damping) / N)
        while np.linalg.norm(v - last_v) > error:
            last_v = v
            v = np.matmul(M_hat, v)
        return np.round(v, 6)
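# A minimal standalone demonstration of the damped power iteration that
# executeComputations() performs. The helper name pagerank and the 3x3
# matrix are made up for illustration: column j holds the out-link
# probabilities of page j, so M[i, j] is the chance of jumping from page j
# to page i.

import numpy as np

def pagerank(M, damping=0.80, error=1e-7):
    N = M.shape[0]
    # Dangling pages (all-zero columns) are assumed to link everywhere.
    for i in range(N):
        if M[:, i].sum() == 0:
            M[:, i] = 1.0 / N
    # Follow a link with probability `damping`, teleport otherwise.
    M_hat = damping * M + (1 - damping) / N
    v = np.ones(N) / N
    last_v = np.full(N, np.inf)
    while np.linalg.norm(v - last_v) > error:
        last_v = v
        v = np.matmul(M_hat, v)
    return np.round(v, 6)

# Page 0 links to pages 1 and 2, page 1 links to 2, page 2 links back to 0.
M = np.array([[0.0, 0.0, 1.0],
              [0.5, 0.0, 0.0],
              [0.5, 1.0, 0.0]])
print(pagerank(M))  # Scores sum to ~1.0; page 2 ranks highest.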