class TestMagicGoogle(unittest.TestCase):
    """Integration test for the MagicGoogle client, routed through a local proxy."""

    def setUp(self):
        """Build a fresh MagicGoogle instance configured with the local proxy."""
        proxy_pool = [
            {
                'http': '127.0.0.1:1087',
                'https': '127.0.0.1:1087',
            },
        ]
        self.mg = MagicGoogle(proxy_pool)

    def tearDown(self):
        """Drop the reference to the client created in setUp."""
        self.mg = None

    def test_search_url(self):
        """The first URL returned for the query 'python' is python.org."""
        # A random pause between 2 and 15 is handed to search_url on each run.
        pause_seconds = random.randint(2, 15)
        found_urls = list(
            self.mg.search_url(query='python', num=1, pause=pause_seconds)
        )
        first_url = found_urls[0]
        self.assertEqual(
            first_url,
            'https://www.python.org/',
            'test search_url fail',
        )
import os
import sys
import time
import random
# import pprint
import codecs

# Put the parent of this file's directory on sys.path before importing
# magic_google below.
sys.path.append(os.path.dirname(os.path.dirname(__file__)))

# https://github.com/howie6879/magic_google
# pip install magic_google
from magic_google import MagicGoogle

mg = MagicGoogle()

# Page index: it names the output file and selects the result offset
# (start = x * 100).
x = 0

# Write every result URL for the query to "result-<x>.txt" (UTF-8), echoing
# each one to stdout. The context manager guarantees the file is closed even
# when the search or a write raises part-way through.
with codecs.open("result-" + str(x) + ".txt", "w", "utf-8") as file:
    for url in mg.search_url(query='Github', num=100, start=x * 100):
        file.write(url)
        print(url)
        # Each URL is followed by a separator line instead of a bare newline.
        file.write("\n----------------\n")
#
# time.sleep(random.randint(1, 5))

# Structured search: per the sample output below, each hit carries the
# 'title', 'url' and 'text' of one result.
for hit in mg.search(query='python', num=1, language='en'):
    pprint.pprint(hit)

# Pause for a random 1-5 seconds before issuing the next query.
delay = random.randint(1, 5)
time.sleep(delay)

# Output
# {'text': 'The official home of the Python Programming Language.',
#  'title': 'Welcome to Python .org',
#  'url': 'https://www.python.org/'}

# URL-only search over the first page of results.
for link in mg.search_url(query='python'):
    pprint.pprint(link)

# Same random 1-5 second pause after the second query.
delay = random.randint(1, 5)
time.sleep(delay)

# Output
# 'https://www.python.org/'
# 'https://www.python.org/downloads/'
# 'https://www.python.org/about/gettingstarted/'
# 'https://docs.python.org/2/tutorial/'
# 'https://docs.python.org/'
# 'https://en.wikipedia.org/wiki/Python_(programming_language)'
# 'https://www.codecademy.com/courses/introduction-to-python-6WeG3/0?curriculum_id=4f89dab3d788890003000096'
# 'https://www.codecademy.com/learn/python'
# 'https://developers.google.com/edu/python/'
# 'https://learnpythonthehardway.org/book/'
#
# time.sleep(random.randint(1, 5))

# Structured search for the caller-supplied key: per the sample output below,
# each record carries the 'title', 'url' and 'text' of one result.
query_text = str(search_key)
for record in mg.search(query=query_text, num=1, language='en'):
    pprint.pprint(record)

# Wait a random 1-5 seconds before the next query.
wait_seconds = random.randint(1, 5)
time.sleep(wait_seconds)

# Output
# {'text': 'The official home of the Python Programming Language.',
#  'title': 'Welcome to Python .org',
#  'url': 'https://www.python.org/'}

# URL-only search over the first page of results for the same key.
query_text = str(search_key)
for page_url in mg.search_url(query=query_text):
    pprint.pprint(page_url)

# Same random 1-5 second wait after the second query.
wait_seconds = random.randint(1, 5)
time.sleep(wait_seconds)

# Output
# 'https://www.python.org/'
# 'https://www.python.org/downloads/'
# 'https://www.python.org/about/gettingstarted/'
# 'https://docs.python.org/2/tutorial/'
# 'https://docs.python.org/'
# 'https://en.wikipedia.org/wiki/Python_(programming_language)'
# 'https://www.codecademy.com/courses/introduction-to-python-6WeG3/0?curriculum_id=4f89dab3d788890003000096'
# 'https://www.codecademy.com/learn/python'
# 'https://developers.google.com/edu/python/'
# 'https://learnpythonthehardway.org/book/'
def search(keyword, num=num, ty='google'):
    """Search for *keyword*, fetch each result page and append its text to corpus files.

    Every result URL is passed to ``url_text`` and then ``text_pre``. Pages
    whose ``'sentences_num'`` is greater than 5 have their ``'text'`` appended
    to a file named ``PATH + 'corpu' + <timestamp> + '.txt'``; a new file name
    is started on every fifth kept page. After all URLs are processed the
    function sleeps for a random 30-100 seconds.

    Args:
        keyword: Query string. It is concatenated into a log message, so it
            must be a ``str``.
        num: Number of results requested; only passed on when ``ty`` is
            ``'google'``.
        ty: Search backend. ``'google'`` uses ``MagicGoogle(PROXIES)``,
            ``'gcs'`` uses ``search_google.api.results(BUILDARGS, CSEARGS)``,
            and any other value uses ``MagicBaidu()``.
            NOTE(review): the ``'gcs'`` branch does not pass ``keyword``;
            presumably the query is carried in CSEARGS - confirm.

    Returns:
        None.
    """
    logging.info('搜索关键词:' + keyword)

    if ty == 'google':
        mg = MagicGoogle(PROXIES)
        urls = mg.search_url(query=keyword, num=num, start=0, pause=5)
    elif ty == 'gcs':
        results = search_google.api.results(BUILDARGS, CSEARGS)
        urls = results.get_values('items', 'link')
    else:
        # Baidu
        mg = MagicBaidu()
        urls = mg.search_url(query=keyword, start=0, pause=5)

    # NOTE(review): cx is not used below; the call is kept in case the
    # constructor has side effects - confirm and remove if it does not.
    cx = tkit.CxExtractor()

    n = 0  # number of pages kept so far
    file_name = PATH + 'corpu' + str(time.time()) + ".txt"
    for url in urls:
        logging.info(url)
        try:
            items = url_text(url=url)
        except Exception:
            # Best effort: skip pages that cannot be fetched or parsed, but
            # record why, and let KeyboardInterrupt/SystemExit propagate.
            logging.warning('Skipping %s: text extraction failed', url,
                            exc_info=True)
            continue

        items = text_pre(items)
        logging.info('句子数目为: ' + str(items['sentences_num']))

        # Only pages with more than five sentences go into the corpus.
        if items['sentences_num'] > 5:
            n = n + 1
            if n % 5 == 0:
                # Every fifth kept page starts a new corpus file.
                file_name = PATH + 'corpu' + str(time.time()) + ".txt"
            logging.info('写入文件: ' + file_name)
            # Append mode: several pages share one file until it is rotated.
            with open(file_name, 'a') as corpus_file:
                corpus_file.write(str(items['text']) + '\n\n')

    # Rest for a random interval once the whole search has been processed.
    t = random.randint(30, 100)
    logging.info('搜索结束休息中 ' + str(t) + 's')
    logging.info("Start : %s" % time.ctime())
    time.sleep(t)
    logging.info("End : %s" % time.ctime())
    return