Python GoogleResultParser Examples, com.lish.namedisambiguation.parsegoogle.GoogleResultParser Python Examples

Example #1

0

Show file

File: test_parsegoogle.py Project: AlexLyj/aminer-spider

class TestCase():
	'''Manual smoke test that prints each stage of parsing one search page.'''

	def __init__(self):
		'''Create the settings, parser, retriever and checker the test uses.'''
		settings = Settings.getInstance()
		self.settings = settings
		self.parsegoogle = GoogleResultParser()
		self.htmlRetriever = HtmlRetriever(settings.use_proxy)
		self.checker = checker()

	def test_parse_google_result(self, title1, title2):
		'''Test method extract_from_source.'''
		# The docstring above is echoed at runtime through __doc__, so its
		# text is deliberately left exactly as it was.
		rule = '-' * 100
		print(' '.join(('-TEST-:', self.test_parse_google_result.__doc__.strip())))

		# Stage 1: build the search URL for the two titles.
		query_url = self.checker.pinQuery(title1, title2)
		print(' '.join(('> url', rule)))
		print(query_url)

		# Stage 2: download the page and show only its first 100 characters.
		page = self.htmlRetriever.getHtmlRetry(query_url, 3, False)
		print(' '.join(('> html', rule)))
		print(page[0:100])
		print('\n')

		# Stage 3: run the parser and dump every entry it extracted.
		print(' '.join(('> blocks', rule)))
		for entry in self.parsegoogle.extract_from_source(page):
			print(entry)

		print('-END TEST-')

Example #2

0

Show file

class TestCase():
    '''Manual smoke test that prints each stage of parsing one search page.'''

    def __init__(self):
        '''Create the settings, parser, retriever and checker the test uses.'''
        settings = Settings.getInstance()
        self.settings = settings
        self.parsegoogle = GoogleResultParser()
        self.htmlRetriever = HtmlRetriever(settings.use_proxy)
        self.checker = checker()

    def test_parse_google_result(self, title1, title2):
        '''Test method extract_from_source.'''
        # The docstring above is echoed at runtime through __doc__, so its
        # text is deliberately left exactly as it was.
        rule = '-' * 100
        print(' '.join(('-TEST-:', self.test_parse_google_result.__doc__.strip())))

        # Stage 1: build the search URL for the two titles.
        query_url = self.checker.pinQuery(title1, title2)
        print(' '.join(('> url', rule)))
        print(query_url)

        # Stage 2: download the page and show only its first 100 characters.
        page = self.htmlRetriever.getHtmlRetry(query_url, 3, False)
        print(' '.join(('> html', rule)))
        print(page[0:100])
        print('\n')

        # Stage 3: run the parser and dump every entry it extracted.
        print(' '.join(('> blocks', rule)))
        for entry in self.parsegoogle.extract_from_source(page):
            print(entry)

        print('-END TEST-')

Example #3

0

Show file

 def __init__(self):
     '''Set up settings, the shared HTML retriever and the result parser.'''
     self.debug_print = True
     self.settings = Settings.getInstance()
     retriever = HtmlRetriever.getInstance(self.settings.use_proxy)
     # Register this object's page validator on the retriever before
     # storing it (presumably consulted by the retriever - confirm there).
     retriever.validate_html_callback = self.validate_html_callback
     self.htmlRetriever = retriever
     self.parsegoogle = GoogleResultParser()

Example #4

0

Show file

class checker():
    '''Checks whether two titles show up together in search results.

    A quoted ``"title1" AND "title2"`` query is built from the configured
    URL template, the result page is downloaded and parsed, and the parsed
    entries are filtered against dblp and EXCLUDE_NOISE_SITE.
    '''

    def __init__(self):
        '''Set up settings, the shared HTML retriever and the result parser.'''
        self.debug_print = True
        self.settings = Settings.getInstance()
        self.htmlRetriever = HtmlRetriever.getInstance(self.settings.use_proxy)
        # Presumably consulted by the retriever to decide whether a fetched
        # page is usable - confirm against HtmlRetriever.
        self.htmlRetriever.validate_html_callback = self.validate_html_callback
        self.parsegoogle = GoogleResultParser()

    def validate_html_callback(self, source):
        '''Return True when ``source`` looks like a real result page.

        ``None`` and pages shorter than 100 characters are rejected quietly.
        Longer pages must contain the navigation words 'Web', 'Images' and
        'Finance'; a page that does not is dumped to stdout and rejected.
        '''
        if source is None or len(source) < 100:
            return False

        # Navigation bar of a result page: Web Images Videos Maps Finance
        if all(marker in source for marker in ('Web', 'Images', 'Finance')):
            return True

        print('---------------------')
        print('SOURCE:\n')
        print(source)
        print('---------------------')
        return False

    def pinQuery(self, title1, title2):
        '''Build the search URL for the query ``"title1" AND "title2"``.

        The query is substituted into ``settings.urltemplate``; afterwards
        spaces become ``%20`` and double quotes become ``%22``. No other
        character is escaped.
        '''
        query = '"%s" AND "%s"' % (title1, title2)
        url = self.settings.urltemplate % query
        return url.replace(" ", "%20").replace("\"", "%22")

    def isInSamePage(self, title1, title2, withProxy=False):
        '''Search for both titles together and collect the accepted hits.

        Args:
            title1: first title of the pair.
            title2: second title of the pair.
            withProxy: forwarded unchanged to ``getHtmlRetry``.

        Returns:
            An NAResult whose ``links`` received element 0 of every parsed
            entry accepted by foundInSamePage, and whose ``result`` is True
            when at least one entry was accepted.
        '''
        url = self.pinQuery(title1, title2)
        print(' '.join(('> check url:', url)))

        # Keep asking until the retriever hands back a page.
        html = None
        while html is None:
            html = self.htmlRetriever.getHtmlRetry(url, 10, withProxy)
        found_urls = self.parsegoogle.extract_from_source(html)

        result = NAResult()
        final_found = False
        for found_url in found_urls:
            # Element 0 is stored and element 1 is inspected, so an entry
            # needs both; the previous ">= 1" guard let one-element entries
            # through and crashed on found_url[1].
            if len(found_url) < 2:
                continue
            found, domain = self.foundInSamePage(found_url[1])
            if found:
                print('+ found domain: %s (%s)' % (domain, found_url))
                result.links.append(found_url[0])
                final_found = True
            else:
                print('- found domain:%s (%s)' % (domain, found_url))
        result.result = final_found
        return result

    def isInSamePageMulti(self, title_pairs):
        '''Multithread check google method.

        Starts one CheckGoogleThread per ``(title1, title2)`` pair, then
        polls every two seconds until ``isSameMatrix`` has an entry for
        every pair index.

        Args:
            title_pairs: sequence of ``(title1, title2)`` pairs.

        Returns:
            The dict handed to the threads (``isSameMatrix``); completion is
            detected by each pair index appearing in it as a key.
        '''
        isSameMatrix = {}
        threads = []

        for index, (title1, title2) in enumerate(title_pairs):
            worker = CheckGoogleThread(self, isSameMatrix, index, title1,
                                       title2)
            threads.append(worker)
            worker.start()
            # Stagger the thread start-ups.
            time.sleep(0.2)

        restarted_threads = 0
        restart_times = 10
        check_count = 0
        while True:
            alldone = True
            print(' '.join((">> ", str(isSameMatrix))))
            # Walk every pair (the old range stopped one short and skipped
            # the last index) and restart with that pair's own titles (the
            # old code reused the titles left over from the start-up loop).
            for index, (title1, title2) in enumerate(title_pairs):
                if index in isSameMatrix:
                    continue
                alldone = False
                # NOTE(review): check_count starts at 0, so the very first
                # poll already qualifies for a restart - confirm intent.
                if check_count % restart_times == 0 and restarted_threads < 3:
                    threads[index] = CheckGoogleThread(
                        self, isSameMatrix, index, title1, title2)
                    threads[index].start()
                    restarted_threads += 1

            if alldone and len(title_pairs) == len(isSameMatrix):
                print(' '.join(("All Done: ", str(isSameMatrix))))
                break
            time.sleep(2)

            check_count += 1

        rule = '-' * 100
        print(rule)
        print(' '.join(("-Return: ", str(isSameMatrix))))
        print(rule)
        return isSameMatrix

    def foundInSamePage(self, url):
        '''Decide whether ``url`` counts as a genuine hit.

        The domain is the text before the first '/', or the whole string
        when there is no '/' or the string starts with one.

        Returns:
            ``(accepted, domain)``. ``accepted`` is False when the URL
            contains 'dblp', or when the domain and an EXCLUDE_NOISE_SITE
            entry are equal or one is a suffix of the other.
        '''
        slash_at = url.find("/")
        domain = url[0:slash_at] if slash_at > 0 else url

        if 'dblp' in url:
            return False, domain

        # Equal strings are suffixes of each other, so the two suffix tests
        # also cover exact equality.
        if any(domain.endswith(excluded) or excluded.endswith(domain)
               for excluded in EXCLUDE_NOISE_SITE):
            return False, domain
        return True, domain

Example #5

0

Show file

File: checker.py Project: AlexLyj/aminer-spider

	def __init__(self):
		'''Set up settings, the shared HTML retriever and the result parser.'''
		self.debug_print = True
		self.settings = Settings.getInstance()
		retriever = HtmlRetriever.getInstance(self.settings.use_proxy)
		# Register this object's page validator on the retriever before
		# storing it (presumably consulted by the retriever - confirm there).
		retriever.validate_html_callback = self.validate_html_callback
		self.htmlRetriever = retriever
		self.parsegoogle = GoogleResultParser()

Example #6

0

Show file

File: checker.py Project: AlexLyj/aminer-spider

class checker():
	'''Checks whether two titles show up together in search results.

	A quoted ``"title1" AND "title2"`` query is built from the configured
	URL template, the result page is downloaded and parsed, and the parsed
	entries are filtered against dblp and EXCLUDE_NOISE_SITE.
	'''

	def __init__(self):
		'''Set up settings, the shared HTML retriever and the result parser.'''
		self.debug_print = True
		self.settings = Settings.getInstance()
		self.htmlRetriever = HtmlRetriever.getInstance(self.settings.use_proxy)
		# Presumably consulted by the retriever to decide whether a fetched
		# page is usable - confirm against HtmlRetriever.
		self.htmlRetriever.validate_html_callback = self.validate_html_callback
		self.parsegoogle = GoogleResultParser()

	def validate_html_callback(self, source):
		'''Return True when ``source`` looks like a real result page.

		``None`` and pages shorter than 100 characters are rejected quietly.
		Longer pages must contain the navigation words 'Web', 'Images' and
		'Finance'; a page that does not is dumped to stdout and rejected.
		'''
		if source is None or len(source) < 100:
			return False

		# Navigation bar of a result page: Web Images Videos Maps Finance
		if all(marker in source for marker in ('Web', 'Images', 'Finance')):
			return True

		print('---------------------')
		print('SOURCE:\n')
		print(source)
		print('---------------------')
		return False

	def pinQuery(self, title1, title2):
		'''Build the search URL for the query ``"title1" AND "title2"``.

		The query is substituted into ``settings.urltemplate``; afterwards
		spaces become ``%20`` and double quotes become ``%22``. No other
		character is escaped.
		'''
		query = '"%s" AND "%s"' % (title1, title2)
		url = self.settings.urltemplate % query
		return url.replace(" ", "%20").replace("\"", "%22")

	def isInSamePage(self, title1, title2, withProxy=False):
		'''Search for both titles together and collect the accepted hits.

		Args:
			title1: first title of the pair.
			title2: second title of the pair.
			withProxy: forwarded unchanged to ``getHtmlRetry``.

		Returns:
			An NAResult whose ``links`` received element 0 of every parsed
			entry accepted by foundInSamePage, and whose ``result`` is True
			when at least one entry was accepted.
		'''
		url = self.pinQuery(title1, title2)
		print(' '.join(('> check url:', url)))

		# Keep asking until the retriever hands back a page.
		html = None
		while html is None:
			html = self.htmlRetriever.getHtmlRetry(url, 10, withProxy)
		found_urls = self.parsegoogle.extract_from_source(html)

		result = NAResult()
		final_found = False
		for found_url in found_urls:
			# Element 0 is stored and element 1 is inspected, so an entry
			# needs both; the previous ">= 1" guard let one-element entries
			# through and crashed on found_url[1].
			if len(found_url) < 2:
				continue
			found, domain = self.foundInSamePage(found_url[1])
			if found:
				print('+ found domain: %s (%s)' % (domain, found_url))
				result.links.append(found_url[0])
				final_found = True
			else:
				print('- found domain:%s (%s)' % (domain, found_url))
		result.result = final_found
		return result

	def isInSamePageMulti(self, title_pairs):
		'''Multithread check google method.

		Starts one CheckGoogleThread per ``(title1, title2)`` pair, then
		polls every two seconds until ``isSameMatrix`` has an entry for
		every pair index.

		Args:
			title_pairs: sequence of ``(title1, title2)`` pairs.

		Returns:
			The dict handed to the threads (``isSameMatrix``); completion is
			detected by each pair index appearing in it as a key.
		'''
		isSameMatrix = {}
		threads = []

		for index, (title1, title2) in enumerate(title_pairs):
			worker = CheckGoogleThread(self, isSameMatrix, index, title1, title2)
			threads.append(worker)
			worker.start()
			# Stagger the thread start-ups.
			time.sleep(0.2)

		restarted_threads = 0
		restart_times = 10
		check_count = 0
		while True:
			alldone = True
			print(' '.join((">> ", str(isSameMatrix))))
			# Walk every pair (the old range stopped one short and skipped
			# the last index) and restart with that pair's own titles (the
			# old code reused the titles left over from the start-up loop).
			for index, (title1, title2) in enumerate(title_pairs):
				if index in isSameMatrix:
					continue
				alldone = False
				# NOTE(review): check_count starts at 0, so the very first
				# poll already qualifies for a restart - confirm intent.
				if check_count % restart_times == 0 and restarted_threads < 3:
					threads[index] = CheckGoogleThread(self, isSameMatrix, index, title1, title2)
					threads[index].start()
					restarted_threads += 1

			if alldone and len(title_pairs) == len(isSameMatrix):
				print(' '.join(("All Done: ", str(isSameMatrix))))
				break
			time.sleep(2)

			check_count += 1

		rule = '-' * 100
		print(rule)
		print(' '.join(("-Return: ", str(isSameMatrix))))
		print(rule)
		return isSameMatrix

	def foundInSamePage(self, url):
		'''Decide whether ``url`` counts as a genuine hit.

		The domain is the text before the first '/', or the whole string
		when there is no '/' or the string starts with one.

		Returns:
			``(accepted, domain)``. ``accepted`` is False when the URL
			contains 'dblp', or when the domain and an EXCLUDE_NOISE_SITE
			entry are equal or one is a suffix of the other.
		'''
		slash_at = url.find("/")
		domain = url[0:slash_at] if slash_at > 0 else url

		if 'dblp' in url:
			return False, domain

		# Equal strings are suffixes of each other, so the two suffix tests
		# also cover exact equality.
		if any(domain.endswith(excluded) or excluded.endswith(domain)
				for excluded in EXCLUDE_NOISE_SITE):
			return False, domain
		return True, domain

Example #7

0

Show file

File: test_parsegoogle.py Project: AlexLyj/aminer-spider

	def __init__(self):
		'''Create the settings, parser, retriever and checker the test uses.'''
		settings = Settings.getInstance()
		self.settings = settings
		self.parsegoogle = GoogleResultParser()
		# The retriever is constructed directly here, with the proxy flag
		# read from the settings object.
		self.htmlRetriever = HtmlRetriever(settings.use_proxy)
		self.checker = checker()

Example #8

0

Show file

 def __init__(self):
     '''Create the settings, parser, retriever and checker the test uses.'''
     settings = Settings.getInstance()
     self.settings = settings
     self.parsegoogle = GoogleResultParser()
     # The retriever is constructed directly here, with the proxy flag
     # read from the settings object.
     self.htmlRetriever = HtmlRetriever(settings.use_proxy)
     self.checker = checker()