def get_plain_text(self, url):
     """
     Download ``url`` and return the page text with script/style content removed.

     Every line of the extracted text has its whitespace runs collapsed to a
     single space, blank lines are dropped, and the remaining lines are joined
     with ``os.linesep``. After a successful extraction the call sleeps for a
     random 1-3 seconds (presumably to throttle requests -- confirm).

     ``KeyboardInterrupt`` is not an ``Exception`` subclass, so it is never
     caught here and propagates to the caller.

     :param url: address handed to ``WebHelper.get_page_content_from_url``
     :return: the cleaned text, or ``''`` when nothing was fetched or an
              ``Exception`` was raised before the text was produced
     """
     text = ''
     try:
         page_content = WebHelper.get_page_content_from_url(url)
         if page_content is None:
             print('[Error]', url)
             return ''
         page_content = page_content.decode('utf-8')
         soup = BeautifulSoup(page_content, 'lxml')
         # Drop <script> and <style> elements so their contents do not leak
         # into the extracted text.
         for element in soup(["script", "style"]):
             element.extract()
         raw_text = soup.get_text()
         # split() with no argument already discards leading/trailing
         # whitespace, so a single pass both trims and collapses each line.
         collapsed = (' '.join(line.split()) for line in raw_text.splitlines())
         text = os.linesep.join(line for line in collapsed if line)
         time.sleep(random.randint(1, 3))
     except Exception as e:
         # Best effort: report the problem and return whatever text we have.
         print(e)
     return text
Ejemplo n.º 2
0
 def get_search_page_by_name(cls, name):
     """
     Fetch the search-engine results page for the given name.

     The query URL is ``cls.__SEARCH_ROOT_URL__`` followed by ``str(name)``
     with every space replaced by ``+``.

     :param name: terms to look up on the search engine
     :return: whatever ``WebHelper.get_page_content_from_url`` returns for
              the query URL
     """
     query = '+'.join(str(name).split(' '))
     return WebHelper.get_page_content_from_url(cls.__SEARCH_ROOT_URL__ + query)
 def get_search_page_by_name(cls, name):
     """
     Fetch the search-engine results page for the given name.

     The query URL is ``cls.__SEARCH_ROOT_URL__`` followed by ``str(name)``
     with every space replaced by ``+``.

     :param name: terms to look up on the search engine
     :return: whatever ``WebHelper.get_page_content_from_url`` returns for
              the query URL
     """
     query = '+'.join(str(name).split(' '))
     return WebHelper.get_page_content_from_url(cls.__SEARCH_ROOT_URL__ + query)
Ejemplo n.º 4
0
    __RESULT_DIR_PATH__ = '../google_result/'
    __SEARCH_ROOT_URL__ = 'https://www.google.com/search?hl=en&safe=off&q='


class BingHelper(SearchHelper):
    """Search helper configured with the Bing (cn.bing.com) settings."""

    # URL prefix ending in the ``q=`` parameter; the query text is expected
    # to be appended directly to it.
    __SEARCH_ROOT_URL__ = 'https://cn.bing.com/search?q='
    # Relative directory for Bing results (presumably where pages are
    # stored -- confirm against SearchHelper).
    __RESULT_DIR_PATH__ = '../bing_result/'
    # Parser class for Bing result pages (presumably consumed by
    # SearchHelper -- confirm).
    __parser__ = BingPageHTMLParser


if __name__ == '__main__':
    # Manual smoke test: download one Google results page and save it to
    # 'bing_result.html' in the current directory.
    #
    # The commented-out lines below are legacy scratch code written in
    # Python 2 syntax; they are kept for reference only.
    #
    # bing_result = GoogleHelper.get_google_search_page_by_name('jie tang mail')
    # resultFile = open('bing_result.html', 'w')
    # resultFile.write(bing_result)
    #
    # title_url_dict = GoogleHelper.get_google_items_from_search_page(bing_result)
    # for url, title in title_url_dict:
    #     print url, title

    content = WebHelper.get_page_content_from_url(
        'http://www.google.com/search?q=jie+tang+tsinghua+email')
    if content is None:
        # The fetch helper can return None; stop with a clear message
        # instead of failing inside write().
        raise SystemExit('Failed to download the search page')
    # Raw bytes must go to a binary-mode file; text goes to a text-mode one.
    mode = 'wb' if isinstance(content, bytes) else 'w'
    # The context manager guarantees the file is flushed and closed.
    with open('bing_result.html', mode) as result:
        result.write(content)

    # proxy = urllib2.ProxyHandler({'http': 'http://*****:*****@tel.lc.ignw.net:25'})
    # auth = urllib2.HTTPBasicAuthHandler()
    # opener = urllib2.build_opener(proxy, auth, urllib2.HTTPHandler)
    # urllib2.install_opener(opener)
    # print 'ready to open'
    # conn = urllib2.urlopen('http://www.google.com')
    # print conn.read()
    __parser__ = GooglePageHTMLParser
    __RESULT_DIR_PATH__ = '../google_result/'
    __SEARCH_ROOT_URL__ = 'https://www.google.com/search?hl=en&safe=off&q='


class BingHelper(SearchHelper):
    """Search helper configured with the Bing (cn.bing.com) settings."""

    # URL prefix ending in the ``q=`` parameter; the query text is expected
    # to be appended directly to it.
    __SEARCH_ROOT_URL__ = 'https://cn.bing.com/search?q='
    # Relative directory for Bing results (presumably where pages are
    # stored -- confirm against SearchHelper).
    __RESULT_DIR_PATH__ = '../bing_result/'
    # Parser class for Bing result pages (presumably consumed by
    # SearchHelper -- confirm).
    __parser__ = BingPageHTMLParser


if __name__ == '__main__':
    # Manual smoke test: download one Google results page and save it to
    # 'bing_result.html' in the current directory.
    #
    # The commented-out lines below are legacy scratch code written in
    # Python 2 syntax; they are kept for reference only.
    #
    # bing_result = GoogleHelper.get_google_search_page_by_name('jie tang mail')
    # resultFile = open('bing_result.html', 'w')
    # resultFile.write(bing_result)
    #
    # title_url_dict = GoogleHelper.get_google_items_from_search_page(bing_result)
    # for url, title in title_url_dict:
    #     print url, title

    content = WebHelper.get_page_content_from_url('http://www.google.com/search?q=jie+tang+tsinghua+email')
    if content is None:
        # The fetch helper can return None; stop with a clear message
        # instead of failing inside write().
        raise SystemExit('Failed to download the search page')
    # Raw bytes must go to a binary-mode file; text goes to a text-mode one.
    mode = 'wb' if isinstance(content, bytes) else 'w'
    # The context manager guarantees the file is flushed and closed.
    with open('bing_result.html', mode) as result:
        result.write(content)

    # proxy = urllib2.ProxyHandler({'http': 'http://*****:*****@tel.lc.ignw.net:25'})
    # auth = urllib2.HTTPBasicAuthHandler()
    # opener = urllib2.build_opener(proxy, auth, urllib2.HTTPHandler)
    # urllib2.install_opener(opener)
    # print 'ready to open'
    # conn = urllib2.urlopen('http://www.google.com')
    # print conn.read()