class WebCrawlerTestCase(unittest.TestCase):
    def setUp(self):
        self.webCrawler = WebCrawler("https://google.com")

    def tearDown(self):
        pass

    def test_parse_html(self):
        f = open("test.html", "r")
        html = f.read()
        f.close()
        self.webCrawler.parse_html(html)
        # expecting 6: the base url plus the links found in the html file
        self.assertEqual(self.webCrawler.unvisited_url.qsize(), 6)
        # check that absolute links are built from relative urls
        self.assertTrue(
            "https://google.com/about" in self.webCrawler.unvisited_url.queue)
        self.assertTrue(
            "https://google.com/help" in self.webCrawler.unvisited_url.queue)

    def test_fetch(self):
        # fetch() should not raise for unexpected inputs
        try:
            self.webCrawler.fetch("randomtext")
            self.webCrawler.fetch("")
            self.webCrawler.fetch(None)
        except Exception:
            self.fail("fetch() raised Exception unexpectedly!")
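# A hypothetical sketch of a test.html fixture consistent with the assertions
# above -- the real fixture is not shown here, and the exact counting rules of
# parse_html() are assumed: the base URL plus five anchors would give the
# expected queue size of 6, with /about and /help resolved against
# https://google.com.
FIXTURE = """<html><body>
  <a href="/about">About</a>
  <a href="/help">Help</a>
  <a href="https://google.com/images">Images</a>
  <a href="https://google.com/maps">Maps</a>
  <a href="https://google.com/news">News</a>
</body></html>"""

with open("test.html", "w") as fh:
    fh.write(FIXTURE)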
def proceed(self, url, level=0):
    self.fetchedCount += 1
    # print('{} lemmas collected'.format(self.fetchedCount))
    crawler = WebCrawler(url)
    if crawler.response.url == BAIKE_404:
        print("url: {} returns 404".format(url))
        return
    self.soup = BaikeSoup(crawler.source)
    self.soup.parse_current_page()
    lemmaName = self.soup.lemma.encode('utf-8')
    if not self.download_related:
        crawler.save_source_to_file(LEMMA_PATTERN_WITH_BOLD.format(lemmaName))
        return
    else:
        crawler.save_source_to_file(LEMMA_PATTERN.format(lemmaName))
    self.loadedUrls[url] = True
    self.loadedLemma.append(lemmaName)
    if url != crawler.response.url:
        self.loadedUrls[crawler.response.url] = True
    if self.soup.lemmaid == ID_UNSET:
        return

    tried = 0
    while True:
        relatedApi = RELATED_URL_PATTERN.format(self.soup.lemmaid)
        crawler = WebCrawler(relatedApi)
        source = crawler.response.text.encode(crawler.response.encoding)
        jsonObj = json.loads(source)
        if isinstance(jsonObj, list):
            break
        else:
            tried += 1
            if tried > MAX_RETRY:
                print('tried {} times but still got an error, url: {}'.format(
                    MAX_RETRY, relatedApi))
                return

    level += 1
    for relatedLemma in jsonObj[0]['data']:
        if os.path.isfile(LEMMA_PATTERN.format(relatedLemma['title'].encode('utf-8'))):
            # already saved to disk, skip
            pass
        elif level > MAX_RECURSION_DEPTH:
            # reached max recursion depth, do not go deeper
            pass
        elif self.fetchedCount > MAX_DOWNLOAD_COUNT:
            # reached max download count, stop scheduling new downloads
            pass
        elif relatedLemma['url'] in self.loadedUrls:
            # url already downloaded, skip
            pass
        else:
            self.proceed(relatedLemma['url'], level)
class Builder:
    def __init__(self):
        pass

    def get_crawler(self, url):
        self.crawler = WebCrawler(url)

    def get_data(self, url):
        self.get_crawler(url)
        return self.crawler.get_soup()
def main(args=None):
    """The main routine"""
    reload(sys)
    sys.setdefaultencoding('utf8')
    parser = argparse.ArgumentParser(description='Web mining exercise 1')
    initArgParser(parser)
    args = parser.parse_args()
    if not args.console and args.file is None:
        parser.exit("Error: invalid output target!")
    with Emitter(args.console, args.file) as output:
        output.clear()
        c = WebCrawler(args, depth=args.depth)
        start = time.time()
        while not c.done:
            c.crawl()
        end = time.time()
        print "Exec time: ", end - start
class Builder:
    def __init__(self):
        pass

    def set_crawler(self, url):
        self.crawler = WebCrawler(url)

    def get_data(self, url):
        self.set_crawler(url)
        manga_list = PL(url, self.crawler.get_soup())
        result = manga_list.get_data()
        # _url = 'https://www.wawacity.vip/?p=manga&id=1872-the-millionaire-detective-balance-unlimited-saison1'
        # _url = 'https://www.wawacity.vip/?p=manga&id=1874-food-wars-saison5'
        for k, v in result.items():
            result[k]['page'] = self.get_page_data(result[k]['link'])
        pprint(result)

    def get_page_data(self, _url):
        self.set_crawler(_url)
        manga_page = PD(_url, self.crawler.get_soup())
        return manga_page.get_data()

    def insert_process(self, args):
        # flatten the nested per-manga dict (merging the 'page' sub-dict and
        # its 'details') into a single flat dict
        result = {}
        for num, entry in args.items():
            for keys, values in entry.items():
                if keys != 'page':
                    result[keys] = values
                else:
                    for key, value in entry['page'].items():
                        if key == 'details':
                            for k, v in value.items():
                                result[k] = v
                        else:
                            result[key] = value
        # insert into DB, import from create_table
def crawl(url, config, skip_delay=False):
    '''
    RQ worker function which extracts URLs from the page contents at the
    given URL, then passes new URLs to both the CRAWL and PROCESS queues
    for further action.
    '''
    DELAY = int(config.get('crawler', 'crawl_delay'))
    MAX_DOCS = int(config.get('crawler', 'max_docs'))
    FRNT_LIST_FILE = config.get('crawler', 'url_frontier_file')
    TARGET_DOMAIN = config.get('crawler', 'target_domain')
    ROBOTS_LOC = config.get('crawler', 'robots_loc')

    if not skip_delay:
        sleep(float(DELAY))

    wc = WebCrawler()
    urls = wc.crawl(url)

    rp = robotparser.RobotFileParser(ROBOTS_LOC)
    rp.read()

    dl = DocList(FRNT_LIST_FILE)
    if len(dl) < MAX_DOCS:
        redis_conn = Redis()
        for url in urls:
            did = md5(url).hexdigest()
            domain = urlsplit(url).netloc
            try:
                fetchable = rp.can_fetch('*', url)
            except KeyError:
                fetchable = False
            if (did not in dl) and (domain == TARGET_DOMAIN) and fetchable:
                dl.append(url)
                cq = Queue('crawl', connection=redis_conn)
                cq.enqueue(crawl, args=(url, config))
                pq = Queue('process', connection=redis_conn)
                pq.enqueue(process, args=(url, config))
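# A minimal sketch of how this worker might be bootstrapped with a seed URL --
# this is an assumption, not part of the snippet above. It presumes `crawl`
# lives in an importable module (hypothetically named `worker`, since RQ job
# functions must be importable), that `config` is the same configparser-style
# object read inside crawl(), and that an RQ worker is listening on the
# 'crawl' and 'process' queues.
from configparser import ConfigParser

from redis import Redis
from rq import Queue

from worker import crawl  # hypothetical module name

config = ConfigParser()
config.read('crawler.cfg')  # hypothetical config file

seed_url = 'https://example.com/'  # hypothetical seed
q = Queue('crawl', connection=Redis())
q.enqueue(crawl, args=(seed_url, config, True))  # skip_delay=True for the first job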
def main(file_path, base_url, max_pages):
    sql = """INSERT INTO articles (url, authors, publish_date, scraped_date, top_image, article_text, xml)
             VALUES (?, ?, ?, ?, ?, ?, ?);"""
    articles = []

    db = Database(file_path)
    db.create_connection()
    db.create_table("""CREATE TABLE IF NOT EXISTS articles (
                           url text PRIMARY KEY,
                           authors text,
                           publish_date text,
                           scraped_date text,
                           top_image text,
                           article_text text,
                           xml text
                       );""")

    crawler = WebCrawler(base_url, max_pages)
    crawler.run_crawler()

    # scrape every discovered link in parallel, collecting rows into `articles`
    func = partial(get_info, articles=articles)
    pool = Pool(10)
    pool.map(func, crawler.links)
    pool.close()
    pool.join()

    db.insert_rows(sql, articles)
    db.close_connection()
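# A hypothetical sketch of the `get_info` helper referenced above -- its real
# implementation is not shown. It assumes the newspaper3k Article API, that
# `Pool` is a thread pool (so appends to the shared `articles` list are
# visible), and that each row follows the column order of the INSERT statement.
from datetime import datetime

from newspaper import Article


def get_info(url, articles):
    try:
        article = Article(url)
        article.download()
        article.parse()
    except Exception:
        return  # skip pages that fail to download or parse
    articles.append((
        url,
        ', '.join(article.authors),
        str(article.publish_date),
        datetime.utcnow().isoformat(),  # scraped_date
        article.top_image,
        article.text,
        article.html,  # raw markup stored in the `xml` column here
    ))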
class Builder:
    def __init__(self):
        pass

    def get_crawler(self, url):
        self.crawler = WebCrawler(url)

    def get_data(self, url):
        self.get_crawler(url)
        # manga_list = PL(url, self.crawler.get_soup())
        # result = manga_list.get_data()
        #
        # _url = 'https://www.wawacity.vip/?p=manga&id=1872-the-millionaire-detective-balance-unlimited-saison1'
        # _url = 'https://www.wawacity.vip/?p=manga&id=1874-food-wars-saison5'
        # for k, v in result.items():
        #     result[k]['page'] = self.get_page_data(result[k]['link'])
        titles = self.crawler.get_soup().find_all("a", href=True)
        for title in titles:
            print(title)
class BaiduWorker(object):
    def __init__(self, keyword):
        self.keyword = keyword
        self.totalDic = {}
        self.totalLemmas = []
        self.crawler = None
        self.soup = None
        self.fetchedCount = 0
        if not os.path.exists(FOLDER_PREFIX):
            os.makedirs(FOLDER_PREFIX)
        self.proceed()
        self.save_lemma_info()

    def proceed(self):
        while True:
            self.crawler = WebCrawler(self.get_url())
            # self.crawler.save_source_to_file(BAIDU_RESULT_FOLDER.format(self.fetchedCount))
            self.soup = BaiduSoup(self.crawler.source)
            self.soup.parse_current_page()
            # look for the intersection with lemmas we have already collected
            for newLemma in self.soup.lemmas:
                # duplicated = 0
                if md5_unicode(newLemma[LEMMA_NAME]) in self.totalDic:
                    print('found duplicated lemma: {}, skip saving'.format(
                        newLemma[LEMMA_NAME].encode('utf-8')))
                    # duplicated += 1
                    return
                else:
                    self.save_lemma_page(newLemma)
                    self.totalLemmas.append(newLemma)
                    self.totalDic[md5_unicode(newLemma[LEMMA_NAME])] = True
            # if duplicated == LEMMAS_EVERY_PAGE:
            #     print('found 10 duplicated items, return to 1st page, stop crawling')
            #     return
            if len(self.totalLemmas) > MAX_LEMMA_COUNT:
                print('over max lemma count, stop crawling')
                return
            if len(self.soup.lemmas) < LEMMAS_EVERY_PAGE:
                print('fewer than {} search results, stop searching'.format(
                    LEMMAS_EVERY_PAGE))
                return
            self.fetchedCount += LEMMAS_EVERY_PAGE

    def save_lemma_page(self, lemma):
        # print(lemma[LEMMA_URL])
        self.crawler = WebCrawler(lemma[LEMMA_URL])
        lemmaName = lemma[LEMMA_NAME].encode('utf-8')
        lemmaName = lemmaName.replace('/', '__')
        self.crawler.save_source_to_file(LEMMA_PATTERN.format(lemmaName))

    def save_lemma_info(self):
        json_str = json.dumps(self.totalLemmas, ensure_ascii=False,
                              indent=4, sort_keys=True)
        save_to_file(LEMMA_RECORD_PATH, json_str.encode('utf-8'))

    def get_url(self):
        url = SEARCH_QUERY.format(quote(self.keyword), self.fetchedCount)
        print('fetch baidu search url: {}'.format(url))
        return url
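# A minimal usage sketch, assuming this module also defines the constants and
# helpers referenced above (FOLDER_PREFIX, SEARCH_QUERY, md5_unicode, ...).
# Constructing the worker is enough to run the whole crawl, since __init__
# calls proceed() and save_lemma_info() itself; the keyword is a placeholder.
if __name__ == '__main__':
    BaiduWorker('python')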
from sys import argv

from crawler import WebCrawler
import datetime

# runner file
if __name__ == '__main__':
    search_url = argv[1] if len(argv) > 1 else 'https://github.com'
    print('WebCrawler started, scanning {0}'.format(search_url))
    # initiate a crawl; the crawler encapsulates the event loop
    c = WebCrawler(search_url)
    c.crawl()
def main(argv):
    crawler = WebCrawler()
    crawler.run()
from crawler import WebCrawler

crawler = WebCrawler('http://harvix.com')
from urllib.request import urlopen
from bs4 import BeautifulSoup as bs
from crawler import WebCrawler
import json
import sys

with open('config.json', 'r') as f:
    config = json.load(f)

webCrawler = WebCrawler(config)
args = sys.argv

if len(args) == 2:
    keyword = args[1]
    numResults = webCrawler.getNumResults(keyword)
    if numResults == 0:
        print("Sorry, No results found")
    else:
        print("No. of results found - " + str(numResults))
elif len(args) == 3:
    keyword = args[1]
    pageNum = args[2]
    productList = webCrawler.getProducts(pageNum, keyword)
    if len(productList) == 0:
        print("Sorry, No results found")
    else:
        print("Items found:")
        for product in productList:
            print("Name - " + product.getProductName())
            print("Price - " + product.getProductPrice())
            print("Merchant - " + product.getMerchantName())
from crawler import WebCrawler

if __name__ == "__main__":
    crawler = WebCrawler.WebCrawler()
    crawler.crawl()
from crawler import WebCrawler
import pandas

if __name__ == '__main__':
    wc = WebCrawler('https://www.reddit.com/', limit=10)
    wc.searchBFS('reddit')
# Hint:
# 1. While your solution must handle the case for Web(size=123, degree=5) in
#    the test script, you may want to use different size and degree settings
#    for faster tests and for better test coverage.
import time

from crawler import WebCrawler
from web import Web

size = 1000
degree = 10
web = Web(size=size, degree=degree)
crawler = WebCrawler()

start = time.time()
urls = crawler.crawl(web)
finish = time.time()

print("Time taken to crawl the URLs: ", finish - start)
print("Number of URLs found: ", len(urls))
assert len(urls) == size
from crawler import WebCrawler

wc = WebCrawler()
urls = wc.crawl('https://en.wikipedia.org/wiki/Main_Page')
for url in urls:
    print url
print len(urls)
    return amount_displayed


if __name__ == "__main__":
    source = 'http://code.activestate.com/recipes/578060-a-simple-webcrawler/'

    from crawler import WebCrawler
    from dict_encoder import DictEncoder
    from matrix_builder import MatrixBuilder
    from stop_words import get_stop_words
    import re

    fmoore_url_regex = re.compile(
        r'https?\:\/\/lyle\.smu\.edu\/\~fmoore.*(htm|txt|html|php|\/)$')
    w = WebCrawler(fmoore_url_regex)
    stop_words = get_stop_words('en')
    w.start_crawling(['https://lyle.smu.edu/~fmoore/'], 50, stop_words)

    print("------------------")
    print("CRAWLING FINISHED")
    print("------------------")
    for key, value in w.my_url_dict.items():
        print("URL: {0}, Title: {1}, Type: {2}".format(key, value.title, value.status))
    print("There is(are) {} graphic file(s).".format(w.image_links))
    print("------------------")
    print("Most common words:")
def get_all_pages(self):
    self.page_soup = []
    # map each Japanese delay label to the regex used to split it off the line name
    regex = {
        '遅れ(10分未満)': r'遅れ\(\S+分\S+\)',
        '遅れ(30分以上)': r'遅れ\(\S+分\S+\)',
        '遅れ(10〜30分)': r'遅れ\(\S+分\)',
        '止まっている': '止まっている',
        '順調': '順調',
        'その他': 'その他',
        '運転再開': '運転再開'
    }
    for page_number in range(0, 900, 30):  # 581
        list_page = WebCrawler(self.url + str(page_number))
        list_page_soup = list_page.get_soup()
        tables = list_page_soup.find_all('div', {'class': 'div_table'})
        for table in tables:
            spans = table.find_all('span')
            train = {}
            for span_counter, span in enumerate(spans):
                span = span.getText()
                if span_counter == 0:
                    # line name plus delay category
                    for key, reg in regex.items():
                        if key in span:
                            train['line'] = re.sub(reg, '', span).replace(' ', '')
                            train['delay'] = re.findall(reg, span)[0]
                elif span_counter == 1:
                    # departure time and start/end stations
                    train['start_time'] = re.findall(r'\d\d:\d\d', span)[0]
                    tmp = re.findall(r'(\S+) → (\S+)', span)
                    if not tmp:
                        tmp = [(re.findall(r'(\S+) →', span)[0], '')]
                    train['start_station'], train['end_station'] = tmp[0]
                elif span_counter == 2:
                    train['status'] = span
                elif span_counter == 3:
                    # follow the link to the detail page and copy its table
                    a = table.parent.parent.find("a", href=True)
                    a = re.findall(r'id=\d+', a['href'])[0]
                    detail_page_soup = WebCrawler(
                        'https://mb.jorudan.co.jp/os/live.cgi?' + a).get_soup()
                    detail_table = detail_page_soup.find('table', {'class': 'detail_table'})
                    trs = detail_table.find_all('tr')
                    english_trad = {
                        '時刻': 'timesOfDay',
                        '区間': 'section',
                        '詳細': 'details'
                    }
                    for tr in trs:
                        tds = tr.find_all('td')
                        train[english_trad[tds[0].getText()]] = tds[1].getText().strip()
            self.page_soup.append(train)