# Example #1
# 0
def main():
    """Crawl Google Scholar 'complex systems' author pages and pickle the results.

    Crawls 5 pages starting from ``start_url`` with no positive/negative
    keyword filtering, then writes the crawl output to ``result.pickle``.
    """
    # Local import: `pickle` is used below but never imported at module
    # level anywhere in this file.
    import pickle

    ### The start page's URL
    start_url = 'https://scholar.google.com.tw/citations?view_op=search_authors&hl=en&mauthors=label:complex_systems'

    ### Positive / negative keyword lists — both empty, so nothing is filtered out.
    p_key = []
    n_key = []

    ### Google Scholar Crawler, Class Spider
    my_crawler = Spider(start_url, p_key, n_key, page=5)

    results = my_crawler.crawl()

    # Persist the crawl results for later offline querying.
    with open('result.pickle', 'wb') as f:
        pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)
def main():
    """Crawl Google Scholar results for 'frequency lowering algorithm' and pickle them.

    Crawls 5 pages starting from ``start_url``, keeping hits that match the
    hearing-aid/audio positive keywords and rejecting imaging/optics hits via
    the negative keywords, then writes the results to ``result.pickle``.
    """
    # Local import: `pickle` is used below but never imported at module
    # level anywhere in this file.
    import pickle

    ### The start page's URL
    start_url = 'https://scholar.google.com.tw/scholar?q=frequency+lowering+algorithm&hl=zh-TW&as_sdt=0,5'

    ### Positive keywords: hearing-aid / audio-processing terms to keep.
    p_key = [
        'wdrc', 'dynamic range compression', 'hearing aid', 'speech',
        'noise cancellation', 'noise reduction', 'feedback cancellation',
        'sound', 'hearing loss'
    ]
    ### Negative keywords: imaging/optics terms to reject.
    n_key = [
        'imagery', 'image', 'visual', 'video', 'optic', 'opto', 'quantum',
        'photon'
    ]

    ### Google Scholar Crawler, Class Spider
    my_crawler = Spider(start_url, p_key, n_key, page=5)

    results = my_crawler.crawl()

    # Persist the crawl results for later offline querying.
    with open('result.pickle', 'wb') as f:
        pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)
from Spider import Spider
from Query import Query
import sys

# Simple CLI dispatcher: `crawl` runs the Wikipedia spider, `query <term>`
# searches the previously built index. Length guards prevent the IndexError
# the original raised when a sub-command or query term was missing.
arguments = sys.argv
if len(arguments) < 2:
    print("usage: crawl | query <term>")
elif arguments[1] == "crawl":
    spider = Spider("https://en.wikipedia.org/")
    spider.crawl()
elif arguments[1] == "query":
    if len(arguments) < 3:
        print("usage: query <term>")
    else:
        query = Query(arguments[2])
        query.query()
# Example #4
# 0
if __name__ == "__main__":
    # Local imports: `os` and `time` are used below but never imported at
    # module level anywhere in this file.
    import os
    import time

    # List previously saved search pickles. The original bound this list to
    # the name `pickle`, shadowing the stdlib module — renamed to avoid that.
    saved_searches = os.listdir('pickle/')
    print('当前的已保存搜索文件:', saved_searches)  # "currently saved search files:"
    name = input('输入搜索代号:')  # "enter a search code name:"
    path = name + '.pickle'
    used_path = name + '_used.pickle'
    spider_main = Spider(name, used_path)
    if path not in saved_searches:
        # No saved file for this code name: crawl fresh data and time it.
        start = time.time()
        url = 'https://www.bilibili.com/index/rank/all-30-3.json'

        try:
            spider_main.crawl(url, path)
        except Exception as e:
            # Best-effort error log: record the failure but keep going so the
            # elapsed-time report below still prints. ('94' looks like a
            # source-line marker carried over from the original script.)
            with open('error/error.txt', 'a+') as f:
                f.write('94' + str(e) + '\n')

        elapsed = int(time.time() - start)
        if elapsed > 60:
            mins, second = divmod(elapsed, 60)
            # "time spent searching users: %d min %d s"
            print('搜索用户所用时间为%d分%d秒' % (mins, second))
        else:
            # "time spent searching users: %d s"
            print('搜索用户所用时间为%d秒' % elapsed)
    else:
        # Load the previously downloaded file instead of re-crawling.
        spider_main.load_users(path, used_path)