def integrate_test_pubs(self, pub_candidates):
    """ For Debug Errors """
    print "- INTEGRATE TEST -:", self.integrate_test_pubs.__doc__.strip()
    extractor = Extractor.getInstance()
    matcher = PubMatcher.getInstance()
    # print queries
    query, used_pubs, nouse_pubs = Extractor.pinMaxQuery(pub_candidates)
    print "Test %s pub, query: \n\t%s" % (len(used_pubs), query)
    url = self.settings.urltemplate_by_pubs % URLCleaner.encodeUrlForDownload(query)
    # url = URLCleaner.encodeUrlForDownload(url)
    print "\t", url
    # do
    all_models = extractor.getNodesByPubs(used_pubs)
    (pubs_found, pubs_notfound) = matcher.matchPub(used_pubs, all_models, debug_output=True)
    # print out
    print "-" * 100
    for pub in pubs_found:
        print "[%s] %s" % (pub.ncitation, pub)
    print "-" * 100
    for pub in pubs_notfound:
        print "[%s] %s" % ("-", pub)
    print "-" * 100
    print "- test done -"
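# The test above relies on Extractor.pinMaxQuery() to pack as many publication
# titles as possible into one Google Scholar query. The helper below is only an
# illustrative sketch of that idea, not the original implementation: it assumes
# each pub exposes a `title` attribute and builds the allintitle:"..." OR "..."
# form seen in test_retrieve_html(), keeping the query under the 256-character
# limit documented in getNodesByPubs() below.
def sketch_pin_max_query(pubs, max_len=256):
    query = 'allintitle:'
    used, unused = [], []
    for pub in pubs:
        quoted = '"%s"' % pub.title
        piece = quoted if not used else ' OR ' + quoted
        if len(query) + len(piece) <= max_len:
            query += piece
            used.append(pub)
        else:
            unused.append(pub)  # left for a later batch
    return query, used, unused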
def getNodesByPubs(self, pubs):
    '''Get by pubs.

    Return: all_models, {key_title:[ExtractedModel,...]}; can be None, or []
    Param:  pubs, [models.Publication]; the query generated from these pubs must be less than 256 characters.
    '''
    query, used_pubs, nouse_pubs = Extractor.pinMaxQuery(pubs)
    url = self.settings.urltemplate_by_pubs % URLCleaner.encodeUrlForDownload(query)
    message = "Search Pub[%s] by url[%s]\n" % (used_pubs[0], url)
    # url = URLCleaner.encodeUrlForDownload(url)
    html = self.htmlRetriever.getHtmlRetry(url)
    if html is None:
        print "Download Page failed."
        return None
    models = self.extract_from_source(html)
    if models is None or len(models) == 0:
        return None
    # save models
    all_models = self.__merge_into_extractedmap(None, models)  # {key_title:[ExtractedModel,...]}
    # for k, v in all_models.items():
    #     message += "models find: [%s]\n" % v
    # print message
    return all_models
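# Hedged usage sketch for getNodesByPubs() (not part of the original code):
# shows how a caller might consume the documented return value, a
# {key_title: [ExtractedModel, ...]} map that can also be None or empty when
# the download or extraction fails. `extractor` and `pubs` are assumed to come
# from the surrounding test setup.
def sketch_getNodesByPubs_usage(extractor, pubs):
    all_models = extractor.getNodesByPubs(pubs)
    if not all_models:  # covers both None and an empty map
        print "no extracted models for this pub batch"
        return
    for key_title, extracted in all_models.items():
        print "[%s] %s candidate model(s)" % (key_title, len(extracted))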
def test_retrieve_html(self):
    print 'Test1: test_retrieve_html()'
    url = '''allintitle:"Augmenting Branching Temporal Logics with Existential Quantification over Atomic Propositions" OR "Branching-Depth Hierarchies" OR "On the Relative Succinctness of Nondeterministic Buchi and co-Buchi Word Automata"'''
    url2 = "http://scholar.google.com/scholar?hl=en&num=100&q=%s" % url
    url2 = URLCleaner.encodeUrlForDownload(url2)
    url2 = '''http://scholar.google.com/scholar?hl=en&num=100&as_subj=eng&q=%22Finding%20the%20Number%20of%20Factors%20of%20a%20Polynomial%22OR%22Probabilistic%20Models%20of%20Database%20Locking:%20Solutions,%20Computational%20Algorithms,%20and%20Asymptotics%22OR%22The%20AWK%20Programming%20Language%22OR%22Factoring%20Polynomials%20Over%20Algebraic%20Number%20Fields%22'''
    getter = HtmlRetriever(use_proxy=False)
    print getter.getHtmlRetry(url2, 1)
def test_retrieve_html2(self):
    url = '''allintitle:"Augmenting Branching Temporal Logics with Existential Quantification over Atomic Propositions" OR "Branching-Depth Hierarchies" OR "On the Relative Succinctness of Nondeterministic Buchi and co-Buchi Word Automata"'''
    url2 = "http://scholar.google.com/scholar?hl=en&num=100&q=%s" % url
    url2 = URLCleaner.encodeUrlForDownload(url2)
    getter = HtmlRetriever(use_proxy=True)
    print getter.getHtmlRetry(url2, 1)