class TestCase:
    """Prints each stage of the query -> fetch -> parse pipeline for two titles."""

    def __init__(self):
        settings = Settings.getInstance()
        self.settings = settings
        self.parsegoogle = GoogleResultParser()
        self.htmlRetriever = HtmlRetriever(settings.use_proxy)
        self.checker = checker()

    def test_parse_google_result(self, title1, title2):
        '''Test method extract_from_source.'''
        # NOTE: the docstring above is echoed in the banner below through
        # __doc__, so its text is part of the program output.
        rule = '-' * 100
        banner = self.test_parse_google_result.__doc__.strip()
        print('%s %s' % ('-TEST-:', banner))

        # Stage 1: build the search URL for the two quoted titles.
        url = self.checker.pinQuery(title1, title2)
        print('%s %s' % ('> url', rule))
        print(url)

        # Stage 2: download the page (arguments 3 and False are passed
        # through to getHtmlRetry unchanged) and show its first 100 chars.
        html = self.htmlRetriever.getHtmlRetry(url, 3, False)
        print('%s %s' % ('> html', rule))
        print(html[0:100])
        print('\n')

        # Stage 3: run the parser and dump every extracted item.
        print('%s %s' % ('> blocks', rule))
        for model in self.parsegoogle.extract_from_source(html):
            print(model)
        print('-END TEST-')
class TestCase:
    """Prints each stage of the query -> fetch -> parse pipeline for two titles."""

    def __init__(self):
        settings = Settings.getInstance()
        self.settings = settings
        self.parsegoogle = GoogleResultParser()
        self.htmlRetriever = HtmlRetriever(settings.use_proxy)
        self.checker = checker()

    def test_parse_google_result(self, title1, title2):
        '''Test method extract_from_source.'''
        # NOTE: the docstring above is echoed in the banner below through
        # __doc__, so its text is part of the program output.
        rule = '-' * 100
        banner = self.test_parse_google_result.__doc__.strip()
        print('%s %s' % ('-TEST-:', banner))

        # Stage 1: build the search URL for the two quoted titles.
        url = self.checker.pinQuery(title1, title2)
        print('%s %s' % ('> url', rule))
        print(url)

        # Stage 2: download the page (arguments 3 and False are passed
        # through to getHtmlRetry unchanged) and show its first 100 chars.
        html = self.htmlRetriever.getHtmlRetry(url, 3, False)
        print('%s %s' % ('> html', rule))
        print(html[0:100])
        print('\n')

        # Stage 3: run the parser and dump every extracted item.
        print('%s %s' % ('> blocks', rule))
        for model in self.parsegoogle.extract_from_source(html):
            print(model)
        print('-END TEST-')
def __init__(self):
    """Wire up settings, the shared HTML retriever and the result parser."""
    self.debug_print = True

    settings = Settings.getInstance()
    self.settings = settings

    # Obtain the retriever via getInstance and register this object's
    # validator on it before storing the reference.
    retriever = HtmlRetriever.getInstance(settings.use_proxy)
    retriever.validate_html_callback = self.validate_html_callback
    self.htmlRetriever = retriever

    self.parsegoogle = GoogleResultParser()
class checker():
    """Checks, via search results, whether two titles occur on the same page.

    A query of the form ``"title1" AND "title2"`` is substituted into
    ``settings.urltemplate``, the page is fetched through ``HtmlRetriever``
    and parsed with ``GoogleResultParser``; hits on excluded domains are
    discarded.
    """

    def __init__(self):
        self.debug_print = True
        self.settings = Settings.getInstance()
        self.htmlRetriever = HtmlRetriever.getInstance(self.settings.use_proxy)
        # Register our validator on the retriever instance.
        self.htmlRetriever.validate_html_callback = self.validate_html_callback
        self.parsegoogle = GoogleResultParser()

    def validate_html_callback(self, source):
        """Return True when ``source`` looks like a usable result page.

        A page is rejected when it is ``None``, shorter than 100 characters,
        or lacks any of the markers 'Web', 'Images' and 'Finance'
        (presumably the navigation labels "Web Images Videos Maps Finance").
        Pages failing the marker check are dumped to stdout.
        """
        if source is None or len(source) < 100:
            return False
        if all(marker in source for marker in ('Web', 'Images', 'Finance')):
            return True
        print('---------------------')
        print('SOURCE:\n')
        print(source)
        print('---------------------')
        return False

    def pinQuery(self, title1, title2):
        """Build the search URL for the query ``"title1" AND "title2"``.

        Only spaces and double quotes are percent-encoded; every other
        character is left as it is.
        """
        query = "".join(('"', title1, '" AND "', title2, '"'))
        url = self.settings.urltemplate % query
        return url.replace(" ", "%20").replace("\"", "%22")

    def isInSamePage(self, title1, title2, withProxy=False):
        """Search for both titles and return an ``NAResult``.

        ``result.links`` receives the first element of every accepted hit
        and ``result.result`` is True when at least one hit was accepted.
        """
        url = self.pinQuery(title1, title2)
        print('%s %s' % ('> check url:', url))

        # Keep asking until the retriever hands back a page; this loops
        # forever if getHtmlRetry never returns anything but None.
        html = None
        while html is None:
            html = self.htmlRetriever.getHtmlRetry(url, 10, withProxy)

        found_urls = self.parsegoogle.extract_from_source(html)
        result = NAResult()
        links, final_found = self._collect_links(found_urls)
        result.links.extend(links)
        result.result = final_found
        return result

    def _collect_links(self, found_urls):
        """Return ``(links, found)`` for the hits that pass foundInSamePage.

        Each hit is indexed as ``hit[0]`` (stored link) and ``hit[1]``
        (url handed to ``foundInSamePage``).
        """
        links = []
        for found_url in found_urls:
            # Index 1 is read below, so two elements are required; the
            # previous ">= 1" guard raised IndexError on one-element hits.
            if len(found_url) < 2:
                continue
            found, domain = self.foundInSamePage(found_url[1])
            if found:
                print('+ found domain: %s (%s)' % (domain, found_url))
                links.append(found_url[0])
            else:
                print('- found domain:%s (%s)' % (domain, found_url))
        return links, bool(links)

    def isInSamePageMulti(self, title_pairs):
        """Check many title pairs concurrently, one thread per pair.

        Returns the dict handed to every ``CheckGoogleThread``; presumably
        each thread stores its outcome under its pair index -- confirm
        against CheckGoogleThread.
        """
        pairs = list(title_pairs)
        isSameMatrix = {}
        threads = []
        for i, (title1, title2) in enumerate(pairs):
            threads.append(
                CheckGoogleThread(self, isSameMatrix, i, title1, title2))
            threads[i].start()
            time.sleep(0.2)  # stagger the thread starts

        restarted_threads = 0
        restart_times = 10
        check_count = 0
        while True:
            alldone = True
            print('%s %s' % (">> ", isSameMatrix))
            # Scan every index (the old range stopped one short, so the last
            # pair could never be restarted) and restart with the pair that
            # belongs to that index rather than the leaked loop variables,
            # which always named the final pair.
            for i, (title1, title2) in enumerate(pairs):
                if i in isSameMatrix:
                    continue
                alldone = False
                # NOTE(review): check_count == 0 satisfies the modulo test,
                # so up to three unfinished threads are restarted on the
                # very first poll -- confirm this is intended.
                if check_count % restart_times == 0 and restarted_threads < 3:
                    threads[i] = CheckGoogleThread(
                        self, isSameMatrix, i, title1, title2)
                    threads[i].start()
                    restarted_threads += 1
            if alldone and len(pairs) == len(isSameMatrix):
                print('%s %s' % ("All Done: ", isSameMatrix))
                break
            time.sleep(2)
            check_count += 1

        print('-' * 100)
        print('%s %s' % ("-Return: ", isSameMatrix))
        print('-' * 100)
        return isSameMatrix

    def foundInSamePage(self, url):
        """Return ``(accepted, domain)`` for a result url.

        ``domain`` is the text before the first '/' (the whole url when
        there is no '/' or it is the first character). Urls containing
        'dblp' and domains that match an ``EXCLUDE_NOISE_SITE`` entry by
        suffix in either direction are rejected.
        """
        idx = url.find("/")
        domain = url[0:idx] if idx > 0 else url
        if 'dblp' in url:
            return False, domain
        for excluded in EXCLUDE_NOISE_SITE:
            # Equality is covered by endswith, so two suffix tests suffice.
            if domain.endswith(excluded) or excluded.endswith(domain):
                return False, domain
        return True, domain
class checker():
    """Checks, via search results, whether two titles occur on the same page.

    A query of the form ``"title1" AND "title2"`` is substituted into
    ``settings.urltemplate``, the page is fetched through ``HtmlRetriever``
    and parsed with ``GoogleResultParser``; hits on excluded domains are
    discarded.
    """

    def __init__(self):
        self.debug_print = True
        self.settings = Settings.getInstance()
        self.htmlRetriever = HtmlRetriever.getInstance(self.settings.use_proxy)
        # Register our validator on the retriever instance.
        self.htmlRetriever.validate_html_callback = self.validate_html_callback
        self.parsegoogle = GoogleResultParser()

    def validate_html_callback(self, source):
        """Return True when ``source`` looks like a usable result page.

        A page is rejected when it is ``None``, shorter than 100 characters,
        or lacks any of the markers 'Web', 'Images' and 'Finance'
        (presumably the navigation labels "Web Images Videos Maps Finance").
        Pages failing the marker check are dumped to stdout.
        """
        if source is None or len(source) < 100:
            return False
        if all(marker in source for marker in ('Web', 'Images', 'Finance')):
            return True
        print('---------------------')
        print('SOURCE:\n')
        print(source)
        print('---------------------')
        return False

    def pinQuery(self, title1, title2):
        """Build the search URL for the query ``"title1" AND "title2"``.

        Only spaces and double quotes are percent-encoded; every other
        character is left as it is.
        """
        query = "".join(('"', title1, '" AND "', title2, '"'))
        url = self.settings.urltemplate % query
        return url.replace(" ", "%20").replace("\"", "%22")

    def isInSamePage(self, title1, title2, withProxy=False):
        """Search for both titles and return an ``NAResult``.

        ``result.links`` receives the first element of every accepted hit
        and ``result.result`` is True when at least one hit was accepted.
        """
        url = self.pinQuery(title1, title2)
        print('%s %s' % ('> check url:', url))

        # Keep asking until the retriever hands back a page; this loops
        # forever if getHtmlRetry never returns anything but None.
        html = None
        while html is None:
            html = self.htmlRetriever.getHtmlRetry(url, 10, withProxy)

        found_urls = self.parsegoogle.extract_from_source(html)
        result = NAResult()
        links, final_found = self._collect_links(found_urls)
        result.links.extend(links)
        result.result = final_found
        return result

    def _collect_links(self, found_urls):
        """Return ``(links, found)`` for the hits that pass foundInSamePage.

        Each hit is indexed as ``hit[0]`` (stored link) and ``hit[1]``
        (url handed to ``foundInSamePage``).
        """
        links = []
        for found_url in found_urls:
            # Index 1 is read below, so two elements are required; the
            # previous ">= 1" guard raised IndexError on one-element hits.
            if len(found_url) < 2:
                continue
            found, domain = self.foundInSamePage(found_url[1])
            if found:
                print('+ found domain: %s (%s)' % (domain, found_url))
                links.append(found_url[0])
            else:
                print('- found domain:%s (%s)' % (domain, found_url))
        return links, bool(links)

    def isInSamePageMulti(self, title_pairs):
        """Check many title pairs concurrently, one thread per pair.

        Returns the dict handed to every ``CheckGoogleThread``; presumably
        each thread stores its outcome under its pair index -- confirm
        against CheckGoogleThread.
        """
        pairs = list(title_pairs)
        isSameMatrix = {}
        threads = []
        for i, (title1, title2) in enumerate(pairs):
            threads.append(
                CheckGoogleThread(self, isSameMatrix, i, title1, title2))
            threads[i].start()
            time.sleep(0.2)  # stagger the thread starts

        restarted_threads = 0
        restart_times = 10
        check_count = 0
        while True:
            alldone = True
            print('%s %s' % (">> ", isSameMatrix))
            # Scan every index (the old range stopped one short, so the last
            # pair could never be restarted) and restart with the pair that
            # belongs to that index rather than the leaked loop variables,
            # which always named the final pair.
            for i, (title1, title2) in enumerate(pairs):
                if i in isSameMatrix:
                    continue
                alldone = False
                # NOTE(review): check_count == 0 satisfies the modulo test,
                # so up to three unfinished threads are restarted on the
                # very first poll -- confirm this is intended.
                if check_count % restart_times == 0 and restarted_threads < 3:
                    threads[i] = CheckGoogleThread(
                        self, isSameMatrix, i, title1, title2)
                    threads[i].start()
                    restarted_threads += 1
            if alldone and len(pairs) == len(isSameMatrix):
                print('%s %s' % ("All Done: ", isSameMatrix))
                break
            time.sleep(2)
            check_count += 1

        print('-' * 100)
        print('%s %s' % ("-Return: ", isSameMatrix))
        print('-' * 100)
        return isSameMatrix

    def foundInSamePage(self, url):
        """Return ``(accepted, domain)`` for a result url.

        ``domain`` is the text before the first '/' (the whole url when
        there is no '/' or it is the first character). Urls containing
        'dblp' and domains that match an ``EXCLUDE_NOISE_SITE`` entry by
        suffix in either direction are rejected.
        """
        idx = url.find("/")
        domain = url[0:idx] if idx > 0 else url
        if 'dblp' in url:
            return False, domain
        for excluded in EXCLUDE_NOISE_SITE:
            # Equality is covered by endswith, so two suffix tests suffice.
            if domain.endswith(excluded) or excluded.endswith(domain):
                return False, domain
        return True, domain
def __init__(self):
    """Create the collaborators used by the parsing smoke test."""
    settings = Settings.getInstance()
    self.settings = settings
    self.parsegoogle = GoogleResultParser()
    # This retriever is constructed directly from the proxy setting.
    self.htmlRetriever = HtmlRetriever(settings.use_proxy)
    self.checker = checker()