Example #1
    def proceed(self):
        while True:
            self.crawler = WebCrawler(self.get_url())
            # self.crawler.save_source_to_file(BAIDU_RESULT_FOLDER.format(self.fetchedCount))
            self.soup = BaiduSoup(self.crawler.source)
            self.soup.parse_current_page()
            # look for overlap with lemmas already collected
            for newLemma in self.soup.lemmas:
                # duplicated = 0
                if md5_unicode(newLemma[LEMMA_NAME]) in self.totalDic:
                    print('find duplicated lemma: {}, skip saving'.format(
                        newLemma[LEMMA_NAME].encode('utf-8')))
                    # ++duplicated
                    return
                else:
                    self.save_lemma_page(newLemma)
                    self.totalLemmas.append(newLemma)
                    self.totalDic[md5_unicode(newLemma[LEMMA_NAME])] = True
                # if duplicated == LEMMAS_EVERY_PAGE:
                #     print ('find 10 duplicated items, return to 1st page, stop crawling')
                #     return
                if len(self.totalLemmas) > MAX_LEMMA_COUNT:
                    print('over max lemma count, stop crawling')
                    return
            if (len(self.soup.lemmas) < LEMMAS_EVERY_PAGE):
                print('search results less than 10, stop searching')
                return

            self.fetchedCount += LEMMAS_EVERY_PAGE
Example #2
class WebCrawlerTestCase(unittest.TestCase):
    def setUp(self):
        self.webCrawler = WebCrawler("https://google.com")

    def tearDown(self):
        pass

    def test_parse_html(self):
        f = open("test.html", "r")
        html = f.read()
        f.close()
        self.webCrawler.parse_html(html)
        # expecting 6: the base url plus the links in the html file
        self.assertEqual(self.webCrawler.unvisited_url.qsize(), 6)

        # testing that absolute links are built from relative urls
        self.assertTrue(
            "https://google.com/about" in self.webCrawler.unvisited_url.queue)
        self.assertTrue(
            "https://google.com/help" in self.webCrawler.unvisited_url.queue)

    def test_fetch(self):

        # test if fetch not fails for unexpected inputs
        try:
            self.webCrawler.fetch("randomtext")
            self.webCrawler.fetch("")
            self.webCrawler.fetch(None)
        except Exception:
            self.fail("fetch() raised Exception unexpectedly!")
Example #3
    def proceed(self, url, level=0):
        self.fetchedCount += 1
        # print ('{} lemmas collected'.format(self.fetchedCount))
        crawler = WebCrawler(url)
        if crawler.response.url == BAIKE_404:
            print("url: {} returns 404".format(url))
            return
        self.soup = BaikeSoup(crawler.source)
        self.soup.parse_current_page()
        lemmaName = self.soup.lemma.encode('utf-8')

        if not self.download_related:
            crawler.save_source_to_file(
                LEMMA_PATTERN_WITH_BOLD.format(lemmaName))
            return
        else:
            crawler.save_source_to_file(LEMMA_PATTERN.format(lemmaName))

        self.loadedUrls[url] = True
        self.loadedLemma.append(lemmaName)
        if (url != crawler.response.url):
            self.loadedUrls[crawler.response.url] = True

        if (self.soup.lemmaid == ID_UNSET):
            return
        tried = 0
        while True:
            relatedApi = RELATED_URL_PATTERN.format(self.soup.lemmaid)
            crawler = WebCrawler(relatedApi)
            source = crawler.response.text.encode(crawler.response.encoding)
            jsonObj = json.loads(source)
            if isinstance(jsonObj, list):
                break
            else:
                tried += 1
                if tried > MAX_RETRY:
                    print(
                        'tried 5 times but still return error, url: {}'.format(
                            relatedApi))
                    return

        level += 1
        for relatedLemma in jsonObj[0]['data']:
            if os.path.isfile(
                    LEMMA_PATTERN.format(
                        relatedLemma['title'].encode('utf-8'))):
                pass
            # if (relatedLemma['title'].encode('utf-8') in self.loadedLemma):
            # print('{} already downloaded, will not start download').format(relatedLemma['title'].encode('utf-8'))
            elif level > MAX_RECURSION_DEPTH:
                # print('reach max recursion depth, will not start download')
                pass
            elif self.fetchedCount > MAX_DOWNLOAD_COUNT:
                # print('reach max search count, will not start download')
                pass
            elif relatedLemma['url'] in self.loadedUrls:
                # print('{} already downloaded, will not start download').format(relatedLemma['url'])
                pass
            else:
                self.proceed(relatedLemma['url'], level)
Example #4
def crawl(url, config, skip_delay=False):
    '''
    RQ worker function which extracts URLs from the page contents at the given
    URL, then passes new URLs to both the CRAWL and PROCESS queues for further
    action.
    '''

    DELAY = int(config.get('crawler', 'crawl_delay'))
    MAX_DOCS = int(config.get('crawler', 'max_docs'))
    FRNT_LIST_FILE = config.get('crawler', 'url_frontier_file')
    TARGET_DOMAIN = config.get('crawler', 'target_domain')
    ROBOTS_LOC = config.get('crawler', 'robots_loc')

    if not skip_delay:
        sleep(float(DELAY))

    wc = WebCrawler()
    urls = wc.crawl(url)
    rp = robotparser.RobotFileParser(ROBOTS_LOC)
    rp.read()
    
    dl = DocList(FRNT_LIST_FILE)
    if len(dl) < MAX_DOCS:
        redis_conn = Redis()
        for url in urls:
            did = md5(url).hexdigest()
            domain = urlsplit(url).netloc
            try:
                fetchable = rp.can_fetch('*', url)
            except KeyError:
                fetchable = False
            if (did not in dl) and (domain == TARGET_DOMAIN) and fetchable:
                dl.append(url)
                cq = Queue('crawl', connection=redis_conn)
                cq.enqueue(crawl, args=(url, config))
                pq = Queue('process', connection=redis_conn)
                pq.enqueue(process, args=(url, config))
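Enqueueing the very first job for a worker function like this is not shown in the example. A minimal seeding sketch, assuming crawl() lives in an importable module (here called tasks, a made-up name) and that config is a standard ConfigParser instance; the seed URL and file name are illustrative:

from configparser import ConfigParser
from redis import Redis
from rq import Queue

from tasks import crawl  # hypothetical module holding the crawl() worker above

config = ConfigParser()
config.read('crawler.ini')

# skip_delay=True so the first job starts without the politeness delay
cq = Queue('crawl', connection=Redis())
cq.enqueue(crawl, args=('https://example.com/', config),
           kwargs={'skip_delay': True})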
Example #5
class Builder:
    def __init__(self):
        pass

    def get_crawler(self, url):
        self.crawler = WebCrawler(url)

    def get_data(self, url):
        self.get_crawler(url)
        return self.crawler.get_soup()
Example #6
def main(args=None):
    """The main routine"""
    reload(sys)
    sys.setdefaultencoding('utf8')

    parser = argparse.ArgumentParser(description='Web mining exercise 1')
    initArgParser(parser)
    args = parser.parse_args()

    if not args.console and args.file is None:
        parser.exit("Error, Invalid output target!")
    with Emitter(args.console, args.file) as output:
        output.clear()

    c = WebCrawler(args, depth=args.depth)
    start = time.time()
    while not c.done:
        c.crawl()
    end = time.time()
    print "Exec time: ", end - start
Example #7
class Builder:
    def __init__(self):
        pass

    def set_crawler(self, url):
        self.crawler = WebCrawler(url)

    def get_data(self, url):
        self.set_crawler(url)
        manga_list = PL(url, self.crawler.get_soup())
        result = manga_list.get_data()

        # _url = 'https://www.wawacity.vip/?p=manga&id=1872-the-millionaire-detective-balance-unlimited-saison1'
        # _url = 'https://www.wawacity.vip/?p=manga&id=1874-food-wars-saison5'
        for k, v in result.items():
            result[k]['page'] = self.get_page_data(result[k]['link'])

        pprint(result)

    def get_page_data(self, _url):
        self.set_crawler(_url)
        manga_page = PD(_url, self.crawler.get_soup())

        return manga_page.get_data()

    def insert_process(self, args):
        result = {}
        for num, entry in args.items():
            for keys, values in entry.items():
                if keys != 'page':
                    result[keys] = values
                else:
                    for key, value in entry['page'].items():
                        if key == 'details':
                            for k, v in value.items():
                                result[k] = v
                        else:
                            result[key] = value
            # insert into DB, import from create_table
Example #8
def crawl(url, config, skip_delay=False):
    '''
    RQ worker function which extracts URLs from the page contents at the given
    URL, then passes new URLs to both the CRAWL and PROCESS queues for further
    action.
    '''

    DELAY = int(config.get('crawler', 'crawl_delay'))
    MAX_DOCS = int(config.get('crawler', 'max_docs'))
    FRNT_LIST_FILE = config.get('crawler', 'url_frontier_file')
    TARGET_DOMAIN = config.get('crawler', 'target_domain')
    ROBOTS_LOC = config.get('crawler', 'robots_loc')

    if not skip_delay:
        sleep(float(DELAY))

    wc = WebCrawler()
    urls = wc.crawl(url)
    rp = robotparser.RobotFileParser(ROBOTS_LOC)
    rp.read()

    dl = DocList(FRNT_LIST_FILE)
    if len(dl) < MAX_DOCS:
        redis_conn = Redis()
        for url in urls:
            did = md5(url).hexdigest()
            domain = urlsplit(url).netloc
            try:
                fetchable = rp.can_fetch('*', url)
            except KeyError:
                fetchable = False
            if (did not in dl) and (domain == TARGET_DOMAIN) and fetchable:
                dl.append(url)
                cq = Queue('crawl', connection=redis_conn)
                cq.enqueue(crawl, args=(url, config))
                pq = Queue('process', connection=redis_conn)
                pq.enqueue(process, args=(url, config))
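The five settings read at the top of crawl() imply a config file with a [crawler] section. A sketch that writes such a file with configparser; only the key names come from the code, every value is an illustrative assumption:

from configparser import ConfigParser

config = ConfigParser()
config['crawler'] = {
    'crawl_delay': '2',                             # seconds to sleep between jobs
    'max_docs': '1000',                             # frontier size limit
    'url_frontier_file': 'frontier.txt',            # backing file for DocList
    'target_domain': 'example.com',                 # only this domain is enqueued
    'robots_loc': 'https://example.com/robots.txt',
}
with open('crawler.ini', 'w') as f:
    config.write(f)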
Example #9
def main(file_path, base_url, max_pages):
    sql = """INSERT INTO articles (url,authors,publish_date,scraped_date,top_image,article_text,xml) 
                            VALUES (?, ?, ?, ?, ?, ?, ?);"""
    articles = []
    db = Database(file_path)
    db.create_connection()
    db.create_table("""CREATE TABLE IF NOT EXISTS articles (
                                        url text PRIMARY KEY,
                                        authors text,
                                        publish_date text,
                                        scraped_date text,
                                        top_image text,
                                        article_text text,
                                        xml text
                                        );""")
    crawler = WebCrawler(base_url, max_pages)
    crawler.run_crawler()
    func = partial(get_info, articles=articles)
    pool = Pool(10)
    pool.map(func, crawler.links)
    pool.close()
    pool.join()
    db.insert_rows(sql, articles)
    db.close_connection()
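get_info() is not shown. A hypothetical sketch using newspaper3k (an assumption; the original may extract the fields differently), building a row tuple in the column order of the INSERT statement. Appending to the shared articles list only works if Pool is a thread pool (for example multiprocessing.dummy.Pool); with a process pool the rows would have to be returned from map() instead:

from datetime import datetime
from newspaper import Article  # newspaper3k; an assumed dependency

def get_info(url, articles):
    # Extracts article metadata and appends one row per URL; the error
    # handling and the use of article.html for the xml column are guesses.
    try:
        article = Article(url)
        article.download()
        article.parse()
        articles.append((
            url,
            ', '.join(article.authors),
            str(article.publish_date),
            datetime.now().isoformat(),
            article.top_image,
            article.text,
            article.html,
        ))
    except Exception as exc:
        print('failed to scrape {}: {}'.format(url, exc))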
Example #10
class Builder:
    def __init__(self):
        pass

    def get_crawler(self, url):
        self.crawler = WebCrawler(url)

    def get_data(self, url):
        self.get_crawler(url)

        # manga_list = PL(url, self.crawler.get_soup())
        # result = manga_list.get_data()
        #
        # # _url = 'https://www.wawacity.vip/?p=manga&id=1872-the-millionaire-detective-balance-unlimited-saison1'
        # # _url = 'https://www.wawacity.vip/?p=manga&id=1874-food-wars-saison5'
        # for k, v in result.items():
        #     result[k]['page'] = self.get_page_data(result[k]['link'])

        titles = self.crawler.get_soup().find_all("a", href=True)
        for title in titles:
            print(title)
Example #11
 def save_lemma_page(self, lemma):
     # print (lemma[LEMMA_URL])
     self.crawler = WebCrawler(lemma[LEMMA_URL])
     lemmaName = lemma[LEMMA_NAME].encode('utf-8')
     lemmaName = lemmaName.replace('/', '__')
     self.crawler.save_source_to_file(LEMMA_PATTERN.format(lemmaName))
Example #12
 def setUp(self):
     self.webCrawler = WebCrawler("https://google.com")
Example #13
class BaiduWorker(object):
    def __init__(self, keyword):
        self.keyword = keyword
        self.totalDic = {}
        self.totalLemmas = []
        self.crawler = None
        self.soup = None
        self.fetchedCount = 0

        if not os.path.exists(FOLDER_PREFIX):
            os.makedirs(FOLDER_PREFIX)

        self.proceed()
        self.save_lemma_info()

    def proceed(self):
        while True:
            self.crawler = WebCrawler(self.get_url())
            # self.crawler.save_source_to_file(BAIDU_RESULT_FOLDER.format(self.fetchedCount))
            self.soup = BaiduSoup(self.crawler.source)
            self.soup.parse_current_page()
            # look for overlap with lemmas already collected
            for newLemma in self.soup.lemmas:
                # duplicated = 0
                if md5_unicode(newLemma[LEMMA_NAME]) in self.totalDic:
                    print('find duplicated lemma: {}, skip saving'.format(
                        newLemma[LEMMA_NAME].encode('utf-8')))
                    # ++duplicated
                    return
                else:
                    self.save_lemma_page(newLemma)
                    self.totalLemmas.append(newLemma)
                    self.totalDic[md5_unicode(newLemma[LEMMA_NAME])] = True
                # if duplicated == LEMMAS_EVERY_PAGE:
                #     print ('find 10 duplicated items, return to 1st page, stop crawling')
                #     return
                if len(self.totalLemmas) > MAX_LEMMA_COUNT:
                    print('over max lemma count, stop crawling')
                    return
            if (len(self.soup.lemmas) < LEMMAS_EVERY_PAGE):
                print('search results less than 10, stop searching')
                return

            self.fetchedCount += LEMMAS_EVERY_PAGE

    def save_lemma_page(self, lemma):
        # print (lemma[LEMMA_URL])
        self.crawler = WebCrawler(lemma[LEMMA_URL])
        lemmaName = lemma[LEMMA_NAME].encode('utf-8')
        lemmaName = lemmaName.replace('/', '__')
        self.crawler.save_source_to_file(LEMMA_PATTERN.format(lemmaName))

    def save_lemma_info(self):
        json_str = json.dumps(self.totalLemmas,
                              ensure_ascii=False,
                              indent=4,
                              sort_keys=True)
        save_to_file(LEMMA_RECORD_PATH, json_str.encode('utf-8'))

    def get_url(self):
        url = SEARCH_QUERY.format(quote(self.keyword), self.fetchedCount)
        print('fetch baidu search url: {}'.format(url))
        return url
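The Baidu examples (#1, #11 and #13) rely on module-level constants that are defined elsewhere. A sketch of plausible definitions; the names are taken from the snippets, while every value (including the search URL pattern) is an illustrative assumption:

FOLDER_PREFIX = 'baidu_results/'
LEMMA_PATTERN = FOLDER_PREFIX + '{}.html'              # one saved page per lemma
LEMMA_RECORD_PATH = FOLDER_PREFIX + 'lemmas.json'      # summary written by save_lemma_info
SEARCH_QUERY = 'https://www.baidu.com/s?wd={}&pn={}'   # keyword, result offset
LEMMAS_EVERY_PAGE = 10                                 # results per search page
MAX_LEMMA_COUNT = 1000                                 # stop after this many lemmas
LEMMA_NAME = 'name'                                    # keys of a lemma dict
LEMMA_URL = 'url'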
Example #14
from crawler import WebCrawler
import datetime
from sys import argv

# runner file

if __name__ == '__main__':
    search_url = argv[1] if len(argv) > 1 else 'https://github.com'
    print('WebCrawler started, scanning {0}'.format(search_url))
    # initiate a crawl, the crawler encapsulates the event loop
    c = WebCrawler(search_url)
    c.crawl()
Example #15
def main(argv):
    crawler = WebCrawler()
    crawler.run()
Example #16
from crawler import WebCrawler

crawler = WebCrawler('http://harvix.com')
Example #17
from urllib.request import urlopen
from bs4 import BeautifulSoup as bs
from crawler import WebCrawler
import json
import sys

with open('config.json', 'r') as f:
    config = json.load(f)

webCrawler = WebCrawler(config)
args = sys.argv
if (len(args) == 2):
    keyword = args[1]
    numResults = webCrawler.getNumResults(keyword)
    if (numResults == 0):
        print("Sorry, No results found")
    else:
        print("No. of results found - " + str(numResults))

elif (len(args) == 3):
    keyword = args[1]
    pageNum = args[2]
    productList = webCrawler.getProducts(pageNum, keyword)
    if (len(productList) == 0):
        print("Sorry, No results found")
    else:
        print("Items found:")
        for product in productList:
            print("Name - " + product.getProductName())
            print("Price - " + product.getProductPrice())
            print("Merchant - " + product.getMerchantName())
Example #18
from crawler import WebCrawler

if __name__ == "__main__":
    crawler = WebCrawler.WebCrawler()
    crawler.crawl()
Example #19
 def get_crawler(self, url):
     self.crawler = WebCrawler(url)
Example #20
from crawler import WebCrawler
import pandas

if __name__ == '__main__':
    wc = WebCrawler('https://www.reddit.com/', limit=10)
    wc.searchBFS('reddit')

Example #21
# Hint:
#   1. While your solution must handle the case for Web(size=123, degree=5) in
#      the test script, you may want to use different size and degree settings
#      for faster tests and for better test coverage.

import time

from crawler import WebCrawler
from web import Web

size = 1000
degree = 10
web = Web(size=size, degree=degree)
crawler = WebCrawler()
start = time.time()
urls = crawler.crawl(web)
finish = time.time()
print("Time took to crawl the URLs: ", finish - start)
print("Number of URLs found: ", len(urls))
assert len(urls) == size
Example #22
from crawler import WebCrawler

wc = WebCrawler()
urls = wc.crawl('https://en.wikipedia.org/wiki/Main_Page')
for url in urls:
    print(url)
print(len(urls))
Example #23
    return amount_displayed


if __name__ == "__main__":

    source = 'http://code.activestate.com/recipes/578060-a-simple-webcrawler/'

    from crawler import WebCrawler
    from dict_encoder import DictEncoder
    from matrix_builder import MatrixBuilder
    from stop_words import get_stop_words
    import re
    fmoore_url_regex = re.compile(
        r'https?://lyle\.smu\.edu/~fmoore.*(htm|txt|html|php|/)$')

    w = WebCrawler(fmoore_url_regex)

    stop_words = get_stop_words('en')

    w.start_crawling(['https://lyle.smu.edu/~fmoore/'], 50, stop_words)

    print("------------------")
    print("CRAWLING FINISHED")
    print("------------------")

    for key, value in w.my_url_dict.items():
        print("URL: {0}, Title: {1}, Type: {2}".format(key, value.title,
                                                       value.status))
    print "There is(are) {} graphic file(s).".format(w.image_links)
    print("------------------")
    print "Most common words:"
Example #24
    def get_all_pages(self):
        self.page_soup = []
        regex = {
            '遅れ(10分未満)': r'遅れ\(\S+分\S+\)',
            '遅れ(30分以上)': r'遅れ\(\S+分\S+\)',
            '遅れ(10〜30分)': r'遅れ\(\S+分\)',
            '止まっている': '止まっている',
            '順調': '順調',
            'その他': 'その他',
            '運転再開': '運転再開'
        }

        for page_number in range(0, 900, 30):  # 581
            list_page = WebCrawler(self.url + str(page_number))
            list_page_soup = list_page.get_soup()
            tables = list_page_soup.find_all('div', {'class': 'div_table'})

            for table in tables:
                spans = table.find_all('span')
                train = {}

                for span_counter, span in enumerate(spans):
                    span = span.getText()
                    if span_counter == 0:
                        for key, reg in regex.items():
                            if key in span:
                                train['line'] = re.sub(reg, '', span).replace(' ', '')
                                train['delay'] = re.findall(reg, span)[0]

                    elif span_counter == 1:
                        train['start_time'] = re.findall(r'\d\d:\d\d', span)[0]

                        tmp = re.findall(r'(\S+) → (\S+)', span)

                        if not tmp:
                            tmp = [(re.findall(r'(\S+) →', span)[0], '')]

                        train['start_station'], train['end_station'] = tmp[0]

                    elif span_counter == 2:
                        train['status'] = span

                    elif span_counter == 3:
                        a = table.parent.parent.find("a", href=True)
                        a = re.findall(r'id=\d+', a['href'])[0]

                        detail_page_soup = WebCrawler('https://mb.jorudan.co.jp/os/live.cgi?' + a).get_soup()
                        detail_table = detail_page_soup.find('table', {'class': 'detail_table'})
                        trs = detail_table.find_all('tr')

                        english_trad = {
                            '時刻': 'timesOfDay',
                            '区間': 'section',
                            '詳細': 'details'
                        }

                        for tr in trs:
                            tds = tr.find_all('td')
                            train[english_trad[tds[0].getText()]] = tds[1].getText().strip()

                self.page_soup.append(train)