# NOTE(review): a second definition of findContentsFromURLList with the same
# logic appears in this file; presumably only one is needed -- TODO confirm
# and remove the duplicate.
def findContentsFromURLList(urllist):
    """Make sure a distance is stored for every pair of distinct URLs.

    Every pair of positions ``i < j`` in ``urllist`` is visited.  Pairs whose
    two URLs compare equal are skipped.  For the remaining pairs
    ``fetchDistance`` is consulted first; when it returns ``None`` or an
    empty result, the contents of both URLs are fetched, reduced with
    ``findAverageContents``, compared with ``calcTwoHTMLDistance`` and the
    result is passed to ``storeDistance``.  A pair is skipped (with a debug
    log entry) when either URL yields no average contents.

    Args:
        urllist: indexable sequence of URLs.

    Returns:
        None.  Results are only logged and handed to ``storeDistance``.
    """
    list_len = len(urllist)
    for i in range(list_len):
        url1 = urllist[i]
        # The averaged contents of url1 are loaded lazily and at most once per
        # outer iteration instead of once per partner URL.
        # NOTE(review): assumes fetchURLContents/findAverageContents give the
        # same answer for the same URL during one run and that
        # calcTwoHTMLDistance does not modify its arguments -- TODO confirm.
        c1 = None
        c1_loaded = False

        for j in range(i + 1, list_len):
            url2 = urllist[j]
            if url1 == url2:
                continue

            rs = fetchDistance(url1, url2)
            if rs is not None and len(rs) > 0:
                # A distance is already known for this pair; just report it.
                logger.debug("find distance [%s][%s]: %f " % (
                    url1, url2, rs[0]))
                continue

            if not c1_loaded:
                c1 = findAverageContents(fetchURLContents(url1))
                c1_loaded = True
            if c1 is None:
                # Logged once per pair, exactly as before; url2 is no longer
                # fetched needlessly in this case.
                logger.debug("[alert] [%s] has no contents" % url1)
                continue

            c2 = findAverageContents(fetchURLContents(url2))
            if c2 is None:
                logger.debug("[alert] [%s] has no contents" % url2)
                continue

            distance = calcTwoHTMLDistance(c1, c2)
            r = storeDistance(url1, url2, distance)
            logger.debug("calculate distance [%s][%s]: %f %s" % (
                url1, url2, distance, r))
# NOTE(review): another definition of findContentsFromURLList with the same
# logic appears in this file; presumably only one is needed -- TODO confirm
# and remove the duplicate.
def findContentsFromURLList(urllist):
    """Make sure a distance is stored for every pair of distinct URLs.

    Every pair of positions ``i < j`` in ``urllist`` is visited.  Pairs whose
    two URLs compare equal are skipped.  For the remaining pairs
    ``fetchDistance`` is consulted first; when it returns ``None`` or an
    empty result, the contents of both URLs are fetched, reduced with
    ``findAverageContents``, compared with ``calcTwoHTMLDistance`` and the
    result is passed to ``storeDistance``.  A pair is skipped (with a debug
    log entry) when either URL yields no average contents.

    Args:
        urllist: indexable sequence of URLs.

    Returns:
        None.  Results are only logged and handed to ``storeDistance``.
    """
    list_len = len(urllist)
    for i in range(list_len):
        url1 = urllist[i]
        # The averaged contents of url1 are loaded lazily and at most once per
        # outer iteration instead of once per partner URL.
        # NOTE(review): assumes fetchURLContents/findAverageContents give the
        # same answer for the same URL during one run and that
        # calcTwoHTMLDistance does not modify its arguments -- TODO confirm.
        c1 = None
        c1_loaded = False

        for j in range(i + 1, list_len):
            url2 = urllist[j]
            if url1 == url2:
                continue

            rs = fetchDistance(url1, url2)
            if rs is not None and len(rs) > 0:
                # A distance is already known for this pair; just report it.
                logger.debug("find distance [%s][%s]: %f " % (
                    url1, url2, rs[0]))
                continue

            if not c1_loaded:
                c1 = findAverageContents(fetchURLContents(url1))
                c1_loaded = True
            if c1 is None:
                # Logged once per pair, exactly as before; url2 is no longer
                # fetched needlessly in this case.
                logger.debug("[alert] [%s] has no contents" % url1)
                continue

            c2 = findAverageContents(fetchURLContents(url2))
            if c2 is None:
                logger.debug("[alert] [%s] has no contents" % url2)
                continue

            distance = calcTwoHTMLDistance(c1, c2)
            r = storeDistance(url1, url2, distance)
            logger.debug("calculate distance [%s][%s]: %f %s" % (
                url1, url2, distance, r))
# NOTE(review): a second definition of extractAndStoreScriptsFromFileList with
# the same logic appears in this file; presumably only one is needed -- TODO
# confirm and remove the duplicate.
def extractAndStoreScriptsFromFileList(file_list_path):
    """Extract and store scripts for every URL listed in a text file.

    The file is read line by line; each line, stripped of surrounding
    whitespace, is treated as one URL and duplicates are collapsed.  For each
    URL ``fetchScripts`` is consulted first.  When either of its two results
    is ``None``, the URL's contents are fetched, reduced with
    ``findAverageContents`` and handed to ``extractAndStoreScriptsFromDOM``.
    Otherwise a summary of the existing scripts is printed.

    Args:
        file_list_path: path of the text file holding one URL per line.

    Returns:
        None.  Progress goes to stdout, problems to stderr.
    """
    # The context manager closes the file even if reading fails; previously
    # the handle was never closed.
    with open(file_list_path) as url_file:
        urls = set(line.strip() for line in url_file)
    # Blank lines would otherwise be processed as an empty-string URL.
    urls.discard("")

    # Single-argument parenthesised print produces the same output under the
    # Python 2 print statement used by the rest of this file.
    for url in urls:
        print("prcossing scripts of %s " % url)
        hosts, inlines = fetchScripts(url)
        if hosts is not None and inlines is not None:
            print("%s already has %d hosts and %d inline scripts "
                  % (url, len(hosts), len(inlines)))
            continue

        contents = fetchURLContents(url)
        if contents is None or len(contents) == 0:
            sys.stderr.write("%s doesn't have contents " % url + "\n")
            continue

        content = findAverageContents(contents)
        if content is None:
            sys.stderr.write(
                "failed to extract average content for %s" % url + "\n")
            continue

        extractAndStoreScriptsFromDOM(url, content)
# NOTE(review): another definition of extractAndStoreScriptsFromFileList with
# the same logic appears in this file; presumably only one is needed -- TODO
# confirm and remove the duplicate.
def extractAndStoreScriptsFromFileList(file_list_path):
    """Extract and store scripts for every URL listed in a text file.

    The file is read line by line; each line, stripped of surrounding
    whitespace, is treated as one URL and duplicates are collapsed.  For each
    URL ``fetchScripts`` is consulted first.  When either of its two results
    is ``None``, the URL's contents are fetched, reduced with
    ``findAverageContents`` and handed to ``extractAndStoreScriptsFromDOM``.
    Otherwise a summary of the existing scripts is printed.

    Args:
        file_list_path: path of the text file holding one URL per line.

    Returns:
        None.  Progress goes to stdout, problems to stderr.
    """
    # The context manager closes the file even if reading fails; previously
    # the handle was never closed.
    with open(file_list_path) as url_file:
        urls = set(line.strip() for line in url_file)
    # Blank lines would otherwise be processed as an empty-string URL.
    urls.discard("")

    # Single-argument parenthesised print produces the same output under the
    # Python 2 print statement used by the rest of this file.
    for url in urls:
        print("prcossing scripts of %s " % url)
        hosts, inlines = fetchScripts(url)
        if hosts is not None and inlines is not None:
            print("%s already has %d hosts and %d inline scripts "
                  % (url, len(hosts), len(inlines)))
            continue

        contents = fetchURLContents(url)
        if contents is None or len(contents) == 0:
            sys.stderr.write("%s doesn't have contents " % url + "\n")
            continue

        content = findAverageContents(contents)
        if content is None:
            sys.stderr.write(
                "failed to extract average content for %s" % url + "\n")
            continue

        extractAndStoreScriptsFromDOM(url, content)