# Module-level imports assumed by this method (the enclosing class is not
# shown in this excerpt)
import classifier
import distmat
import patternify


def compute_classifier_domain_info(self, depth):
    """Sample internal links breadth-first from the homepage, classify
    them, and derive URL patterns that identify content pages."""
    # Gets a sample of links from the website starting from the homepage
    already_analyzed = set()
    links = self._get_all_internal_links(self._start_url)
    for i in range(depth - 1):
        if len(links) > 3000:
            break
        sub_links = set()
        cpt = 0
        for link in links:
            print("link %d/%d [%s]" % (cpt, len(links), link))
            cpt += 1
            if len(sub_links) > 3000:
                break
            if link not in already_analyzed:
                sub_links = sub_links.union(self._get_all_internal_links(link))
                already_analyzed.add(link)
        links = links.union(sub_links)
    self.log("Finished. Got %d sublinks on %d levels" % (len(links), depth))

    # Identifies links pointing to content and generates patterns
    ones, zeros = classifier.testSVM(links)
    mean, distMat = distmat.getDistanceMatrix(ones)
    content = []
    for link in distMat:
        # Keep links whose average distance to the other positives is at
        # least the global mean
        if sum(distMat[link]) / len(distMat) >= mean:
            content.append(link)
    patterns = patternify.getPatterns(content)
    return patterns
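# distmat.getDistanceMatrix is project-internal and not shown in this
# excerpt. A minimal sketch of one plausible implementation, assuming the
# distance is a string dissimilarity between URLs and the return value is
# (global mean distance, {link: [distances to every link]}); the actual
# metric may differ:
import difflib

def getDistanceMatrix(links):
    """Pairwise URL distances plus the global mean distance."""
    links = list(links)
    dist_mat = {}
    total, count = 0.0, 0
    for a in links:
        row = []
        for b in links:
            # 1 - similarity ratio serves as a distance in [0, 1]
            d = 1.0 - difflib.SequenceMatcher(None, a, b).ratio()
            row.append(d)
            total += d
            count += 1
        dist_mat[a] = row
    mean = total / count if count else 0.0
    return mean, dist_mat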
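# patternify.getPatterns is likewise not shown. A sketch under the
# assumption that a "pattern" is a URL whose variable path segments
# (e.g. pure digits such as dates or article ids) are replaced with a
# "*" wildcard; the real generalization rules may differ:
import re
from urllib.parse import urlparse

def getPatterns(links):
    """Collapse content URLs into wildcard path patterns."""
    patterns = set()
    for link in links:
        parsed = urlparse(link)
        segments = parsed.path.strip("/").split("/")
        generalized = ["*" if re.fullmatch(r"\d+", s) else s
                       for s in segments]
        patterns.add(parsed.netloc + "/" + "/".join(generalized))
    return patterns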
"""usage: python run.py http://cnn.com"""
import sys

import crawl
import classifier
import patternify

crawler = crawl.crawler()                         # get a crawler object
urls = crawler.crawl(sys.argv[1], 500)            # crawl up to 500 URLs from the seed site
classifier.trainSVM()                             # train preliminary classifier using the "content" and "notcontent" files
ones, zeros = classifier.testSVM(urls)            # classify using the preliminary classifier
patterns = patternify.getPatterns(ones)           # get patterns from the classified "ones" (content links)
classifier.trainSVM(patterns)                     # train secondary classifier with pattern features
ones, zeros = classifier.testSVM(urls, patterns)  # classify using the secondary classifier
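# classifier.trainSVM/testSVM are project modules whose internals are not
# shown. A minimal sketch of the same two-call interface built on
# scikit-learn, assuming character n-grams of the URL string as features
# and labeled URL lists passed in directly (the project version instead
# reads the "content"/"notcontent" files and can add pattern features):
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

_vec = TfidfVectorizer(analyzer="char", ngram_range=(3, 5))
_svm = LinearSVC()

def trainSVM(content_urls, notcontent_urls):
    """Fit the SVM on labeled URLs (1 = content, 0 = not content)."""
    X = _vec.fit_transform(list(content_urls) + list(notcontent_urls))
    y = [1] * len(content_urls) + [0] * len(notcontent_urls)
    _svm.fit(X, y)

def testSVM(urls):
    """Split URLs into predicted content ("ones") and the rest ("zeros")."""
    urls = list(urls)
    preds = _svm.predict(_vec.transform(urls))
    ones = [u for u, p in zip(urls, preds) if p == 1]
    zeros = [u for u, p in zip(urls, preds) if p == 0]
    return ones, zeros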