def main():
    """Crawl a Google Scholar author-label search page and pickle the results.

    Uses the project-local ``Spider`` class; output is written to
    ``result.pickle`` in the current working directory.
    """
    # Start page: author search filtered by the complex_systems label.
    start_url = 'https://scholar.google.com.tw/citations?view_op=search_authors&hl=en&mauthors=label:complex_systems'

    # No positive/negative keyword filtering for this crawl.
    positive_keywords = []
    negative_keywords = []

    # Google Scholar crawler; follow up to 5 result pages.
    crawler = Spider(start_url, positive_keywords, negative_keywords, page=5)
    scraped = crawler.crawl()

    # Persist the scraped records for later offline use.
    with open('result.pickle', 'wb') as f:
        pickle.dump(scraped, f, protocol=pickle.HIGHEST_PROTOCOL)
def main():
    """Crawl Google Scholar results for frequency-lowering-algorithm papers.

    Filters hits with positive/negative keyword lists via the project-local
    ``Spider`` class and pickles the results to ``result.pickle``.
    """
    # Start page: keyword search for "frequency lowering algorithm".
    start_url = 'https://scholar.google.com.tw/scholar?q=frequency+lowering+algorithm&hl=zh-TW&as_sdt=0,5'

    # Keywords whose presence marks a result as relevant.
    positive_keywords = [
        'wdrc',
        'dynamic range compression',
        'hearing aid',
        'speech',
        'noise cancellation',
        'noise reduction',
        'feedback cancellation',
        'sound',
        'hearing loss',
    ]
    # Keywords whose presence marks a result as off-topic.
    negative_keywords = [
        'imagery',
        'image',
        'visual',
        'video',
        'optic',
        'opto',
        'quantum',
        'photon',
    ]

    # Google Scholar crawler; follow up to 5 result pages.
    crawler = Spider(start_url, positive_keywords, negative_keywords, page=5)
    scraped = crawler.crawl()

    # Persist the scraped records for later offline use.
    with open('result.pickle', 'wb') as f:
        pickle.dump(scraped, f, protocol=pickle.HIGHEST_PROTOCOL)
from Spider import Spider
from Query import Query
import sys

# Dispatch on the first CLI argument:
#   crawl        -> build the index starting from the Wikipedia front page
#   query <term> -> search the previously built index for <term>
command = sys.argv[1]
if command == "crawl":
    Spider("https://en.wikipedia.org/").crawl()
elif command == "query":
    Query(sys.argv[2]).query()
    # query.multiWordQuery(["action","design"])
if __name__ == "__main__":
    # List previously saved search-result files. NOTE: the original bound this
    # to a variable named `pickle`, which shadows the stdlib pickle module used
    # elsewhere in this file — renamed to avoid the shadowing bug.
    saved_files = os.listdir('pickle/')
    print('当前的已保存搜索文件:', saved_files)
    name = input('输入搜索代号:')
    path = name + '.pickle'
    used_path = name + '_used.pickle'
    spider_main = Spider(name, used_path)
    if path not in saved_files:
        start = time.time()
        # Bilibili 30-day ranking feed used as the crawl seed.
        url = 'https://www.bilibili.com/index/rank/all-30-3.json'
        try:
            spider_main.crawl(url, path)
        except Exception as e:
            # Best-effort error log; '94' tags this call site in the log file.
            with open('error/error.txt', 'a+') as f:
                f.write('94' + str(e) + '\n')
        end = time.time()
        times = int(end - start)
        # Report elapsed wall-clock time in minutes/seconds.
        if times > 60:
            mins, second = divmod(times, 60)
            print('搜索用户所用时间为%d分%d秒' % (mins, second))
        else:
            print('搜索用户所用时间为%d秒' % times)
    else:
        # A file for this search tag already exists: load it instead of re-crawling.
        spider_main.load_users(path, used_path)