def past_news():
    # Re-run the crawler for every keyword stored in the search history.
    keywords = search_history.query.all()
    if keywords:
        for i in keywords:
            print("result for ", i.search_keywords)
            news = i.search_keywords
            print(crawl.crawler(news))
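# past_news() assumes a crawl module exposing crawler(keyword) and a search_history
# model with a search_keywords column; the .query.all() call suggests Flask-SQLAlchemy.
# The model below is a hypothetical sketch for illustration, not the project's schema.
from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()

class search_history(db.Model):
    id = db.Column(db.Integer, primary_key=True)   # primary key (assumed)
    search_keywords = db.Column(db.String(128))     # keyword that was previously searched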
def server(THREAD_NUM, START_URLS, FETCH_TIME, KEY_WORD, IGNORE_KEY_WORD,
           DOWNLOAD_MODE, DEPTH, FETCH_COUNT, FETCH_MODE, STORAGE_MODEL,
           SIMILARITY, FOCUSKEYWORD):
    global REFUSE_COUNT
    global QUEUE_URLNODE
    global QUEUE_HTMLNODE
    # Initialize the URL node queue from the seed URLs
    start_urls = START_URLS
    start_nodes = init_urlnode(start_urls)
    for i in start_nodes:
        QUEUE_URLNODE.put(i)
    # One (queue, name) pair per worker thread
    my_tuple_list = []
    for i in xrange(THREAD_NUM):
        my_tuple_list.append((Queue.Queue(), str(i)))
    # Start the fetch threads
    threads_list = []
    for i in xrange(THREAD_NUM):
        threads_list.append(threading.Thread(
            target=single_thread,
            args=(my_tuple_list[i], QUEUE_HTMLNODE, DOWNLOAD_MODE, start_urls)))
    for i in threads_list:
        i.setDaemon(True)
        i.start()
    # Start the database storage thread
    db_engine = threading.Thread(
        target=engine_db,
        args=(KEY_WORD, QUEUE_COMPLETE_NODE, QUEUE_SMART_NODE, STORAGE_MODEL))
    db_engine.setDaemon(True)
    db_engine.start()
    # Schedule the URL node queue until the exit conditions are met
    while server_exit_conditions(FETCH_TIME, THREAD_NUM, FETCH_COUNT):
        # Hand a URL node to every idle worker
        for i in my_tuple_list:
            if QUEUE_URLNODE.qsize() > 0 and i[0].qsize() < 1:
                QUEUE_URLNODE = fetch_mode(QUEUE_URLNODE, FETCH_MODE)
                node = QUEUE_URLNODE.get()
                i[0].put(node)
        # Parse any downloaded page and enqueue the links it contains
        if QUEUE_HTMLNODE.qsize() > 0:
            html_node = QUEUE_HTMLNODE.get()
            nodelist = crawler(html_node)
            for i in nodelist:
                if i.depth <= DEPTH and SIMILARITY == 0:
                    if url_filter_similarity(i.url, KEY_WORD, IGNORE_KEY_WORD, FOCUSKEYWORD):
                        QUEUE_URLNODE.put(i)
                        if STORAGE_MODEL == 1 or STORAGE_MODEL == 2:
                            QUEUE_SMART_NODE.put(i)
                    else:
                        REFUSE_COUNT += 1
                elif i.depth <= DEPTH and SIMILARITY == 1:
                    if url_filter_no_similarity(i.url, KEY_WORD, IGNORE_KEY_WORD, FOCUSKEYWORD):
                        QUEUE_URLNODE.put(i)
                        if STORAGE_MODEL == 0 or STORAGE_MODEL == 2:
                            QUEUE_COMPLETE_NODE.put(i)
                    else:
                        REFUSE_COUNT += 1
                else:
                    REFUSE_COUNT += 1
def server(THREAD_NUM, START_URLS, FETCH_TIME, KEY_WORD, IGNORE_KEY_WORD,
           DOWNLOAD_MODE, DEPTH, FETCH_COUNT, FETCH_MODE, STORAGE_MODEL,
           SIMILARITY, FOCUSKEYWORD):
    global REFUSE_COUNT
    global QUEUE_URLNODE
    global QUEUE_HTMLNODE
    # Initialize the URL node queue from the seed URLs
    start_urls = START_URLS
    start_nodes = init_urlnode(start_urls)
    for i in start_nodes:
        QUEUE_URLNODE.put(i)
    # One (queue, name) pair per worker thread
    my_tuple_list = []
    for i in xrange(THREAD_NUM):
        my_tuple_list.append((Queue.Queue(), str(i)))
    # Start the fetch threads
    threads_list = []
    for i in xrange(THREAD_NUM):
        threads_list.append(threading.Thread(
            target=single_thread,
            args=(my_tuple_list[i], QUEUE_HTMLNODE, DOWNLOAD_MODE)))
    for i in threads_list:
        i.setDaemon(True)
        i.start()
    # Start the database storage thread
    db_engine = threading.Thread(
        target=engine_db,
        args=(KEY_WORD, QUEUE_COMPLETE_NODE, QUEUE_SMART_NODE, STORAGE_MODEL))
    db_engine.setDaemon(True)
    db_engine.start()
    # Schedule the URL node queue until the exit conditions are met
    while server_exit_conditions(FETCH_TIME, THREAD_NUM, FETCH_COUNT):
        # Hand a URL node to every idle worker
        for i in my_tuple_list:
            if QUEUE_URLNODE.qsize() > 0 and i[0].qsize() < 1:
                QUEUE_URLNODE = fetch_mode(QUEUE_URLNODE, FETCH_MODE)
                node = QUEUE_URLNODE.get()
                i[0].put(node)
        # Parse any downloaded page and enqueue the links it contains
        if QUEUE_HTMLNODE.qsize() > 0:
            html_node = QUEUE_HTMLNODE.get()
            nodelist = crawler(html_node)
            for i in nodelist:
                if i.depth <= DEPTH and SIMILARITY == 0:
                    if url_filter_similarity(i.url, KEY_WORD, IGNORE_KEY_WORD, FOCUSKEYWORD):
                        QUEUE_URLNODE.put(i)
                        if STORAGE_MODEL == 1 or STORAGE_MODEL == 2:
                            QUEUE_SMART_NODE.put(i)
                    else:
                        REFUSE_COUNT += 1
                elif i.depth <= DEPTH and SIMILARITY == 1:
                    if url_filter_no_similarity(i.url, KEY_WORD, IGNORE_KEY_WORD, FOCUSKEYWORD):
                        QUEUE_URLNODE.put(i)
                        if STORAGE_MODEL == 0 or STORAGE_MODEL == 2:
                            QUEUE_COMPLETE_NODE.put(i)
                    else:
                        REFUSE_COUNT += 1
                else:
                    REFUSE_COUNT += 1
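# The two server() variants above depend on helpers defined elsewhere in their project
# (init_urlnode, single_thread, engine_db, fetch_mode, server_exit_conditions, crawler,
# the url_filter_* functions, and the global queues). As a rough illustration only, each
# queued item appears to be a URL node carrying at least a url and a depth; the class and
# helper below are hypothetical stand-ins, not the project's actual definitions.
class UrlNode(object):
    def __init__(self, url, depth=0):
        self.url = url      # absolute URL to fetch
        self.depth = depth  # link distance from the seed URLs, compared against DEPTH

def init_urlnode(start_urls):
    # Wrap every seed URL in a depth-0 node (hypothetical implementation).
    return [UrlNode(u, 0) for u in start_urls]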
from crawl import crawler
import pickle

keyword = str(input("Enter a keyword\n"))
startdate = str(input("Start date, format 2018-07-11\n"))
finishdate = str(input("End date, format 2018-07-12\n"))

result = crawler().twitter(keyword, startdate, finishdate)

with open("Result.txt", "wb") as f:
    pickle.dump(result, f)
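# A complementary snippet (not part of the original script) showing how the pickled
# Twitter results could be read back later; the structure of the loaded object depends
# on whatever crawler().twitter() returns, which is not assumed here.
import pickle

with open("Result.txt", "rb") as f:
    result = pickle.load(f)
print(result)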
"""usage: python run.py http://cnn.com"""
import sys

import crawl
import classifier
import patternify

crawler = crawl.crawler()                         # get a crawler object
urls = crawler.crawl(sys.argv[1], 500)            # get URLs; second argument is the number of URLs to crawl
classifier.trainSVM()                             # train preliminary classifier using the "content" and "notcontent" files
ones, zeros = classifier.testSVM(urls)            # classify using preliminary classifier
patterns = patternify.getPatterns(ones)           # get patterns from classified "ones" (content links)
classifier.trainSVM(patterns)                     # train secondary classifier with pattern features
ones, zeros = classifier.testSVM(urls, patterns)  # classify using secondary classifier
def _getlinks(self, starturl):
    # Crawl up to 200 pages starting from starturl and return the discovered links.
    crawler = crawl.crawler()
    links = crawler.crawl(starturl, 200)
    return links
import seed
import save
import crawl

seedURLs = [
    'https://en.wikipedia.org/wiki/Snake',
    'https://en.wikipedia.org/wiki/Reptile'
]
seedQ = seed.getSeedURLsQ(seedURLs)
relatedTerms = seed.getRelatedTerms()
pageLimit = 500

save.createDirectory('Assignment 2')
save.changeDirectory('Assignment 2')

crawl.createSSL()
savedPages = crawl.crawler(seedURLs, seedQ, relatedTerms, pageLimit)
save.saveFile('_CRAWLED_URLS_', '.txt', save.dictToSave(savedPages))