def main():
    _setup_logging()
    downloader = SeleniumHTMLDownloader(
        r'C:\Users\hananavr\Documents\לימודים\מעבדה בין תחומית\facebookCrawler copy\crawler\chromedriver.exe'
    )

    # mongodb://<dbuser>:<dbpassword>@ds215633.mlab.com:15633/pytheas
    store = MongoItemStore(host='ds215633.mlab.com',
                           port='15633',
                           db='pytheas',
                           article_collection='hanan',
                           username='******',
                           password='******')

    items_loader = MongoItemsLoader(host='ds215633.mlab.com',
                                    port='15633',
                                    db='pytheas',
                                    items_collection='hanan',
                                    username='******',
                                    password='******')
    crawler = Crawler(downloader, {
        'www.facebook.com/': FacebookParser(),
        'www.twitter.com/': TwitterParser()
    }, store, items_loader)
    crawler.crawl(FACEBOOK_PAGE_TO_DOWNLOAD_FROM)
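
Both this example and Example #13 below call a module-level _setup_logging() helper that is not part of the listing. A minimal sketch of what it might look like (the log level and format are assumptions, not taken from the source project):

import logging


def _setup_logging():
    # Hypothetical helper: configure root logging for the crawl run.
    # The level and format below are illustrative assumptions.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s %(name)s: %(message)s'
    )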
Example #2
 def GET():
     """
     This method fetches the information needed to create the statistic
     of average comment karma by user from Reddit.com and returns it to the main
     application.
     :return: data needed to calculate the average comment karma by user
     """
     logging.debug('GetKarmaStats method called')
     errors = []
     chosen_username = None
     try:
         chosen_username = web.input().chosen_username
     except Exception as e:
         errors.append('GetKarmaStats error:' + str(e))
         logging.error('GetKarmaStats error:' + str(e))
     my_crawler = Crawler(db)
     result = my_crawler.retrieve_user_avg_karma(chosen_username)
     del my_crawler
     if len(errors) == 0:
         return json.dumps({'result': result, 'code': 0})
     else:
         return json.dumps({
             'result': result,
             'code': len(errors),
             'errors': list(errors)
         })
 def test_init_positive02(self):
     my_crawler = Crawler()
     site_to_check = 'http://zabaykin.ru/?p=505'
     res = my_crawler.load_and_tokenize([site_to_check], depth=2)
     # print(res)
     # check number of parsed urls
     self.assertEqual(len(list(set(res.keys()))), 36)
Example #4
 def GET():
     """
     This method fetches the information needed to create the ranking of
     top users by comments score from Reddit.com and returns it to the main
     application.
     :return: data needed to create the ranking of top users by comments score
     """
     logging.debug('GetTopUsersCommentsScore method called')
     errors = []
     chosen_subreddit = None
     try:
         chosen_subreddit = web.input().chosen_subreddit
     except Exception as e:
         errors.append('GetTopUsersCommentsScore error:' + str(e))
         logging.error('GetTopUsersCommentsScore error:' + str(e))
     my_crawler = Crawler(db)
     result = my_crawler.retrieve_total_user_comments_score(
         chosen_subreddit)
     del my_crawler
     if len(errors) == 0:
         return json.dumps({'result': result, 'code': 0})
     else:
         return json.dumps({
             'result': result,
             'code': len(errors),
             'errors': list(errors)
         })
Example #5
 def GET():
     """
     This method fetches the information needed to create the list
     of submissions by user from Reddit.com and returns it to the main
     application.
     :return: data needed to create the list of submissions by user
     """
     logging.debug('GetPostsByUser method called')
     errors = []
     chosen_username = None
     try:
         chosen_username = web.input().chosen_username
     except Exception as e:
         errors.append('GetPostsByUser error:' + str(e))
         logging.error('GetPostsByUser error:' + str(e))
     my_crawler = Crawler(db)
     result = my_crawler.retrieve_user_posts(chosen_username)
     del my_crawler
     if len(errors) == 0:
         return json.dumps({'result': result, 'code': 0})
     else:
         return json.dumps({
             'result': result,
             'code': len(errors),
             'errors': list(errors)
         })
Example #6
 def setUp(self):
     self.mock_handler = Mock()
     self.mock_http = AsyncMock()
     self.mock_robots_agent = Mock()
     self.crawler = Crawler(
         self.mock_robots_agent, self.mock_http, self.mock_handler
     )
Example #7
def _main(option, config):
    login_args = '{student} {password} {semester}'.format(**config['account'])
    cookie = loginceiba.info(*login_args.split())
    crawler = Crawler(cookie)
    uploader = Uploader(crawler, option.file)

    course_cache = os.path.join(os.path.dirname(__file__), 'courses.json')
    with open(course_cache, 'r') as courses_data:
        courses = json.load(courses_data)

    for course in map(parse.parse_course, courses):
        if course['作業區'] is None:
            continue
        get_links = lambda hw_url: [
            course['課程資訊']['課程網址'],  # If ignored: 操作錯誤 ("operation error")
            'https://ceiba.ntu.edu.tw/modules/hw/hw.php',  # If ignored: 您非修課學生 ("you are not a student of this course")
            'https://ceiba.ntu.edu.tw/modules/hw/' + hw_url
        ]
        name = course['課程資訊']['課程名稱']
        hws = [tuple(hw['名稱'].items())[0] for hw in course['作業區']]
        hws = [(hw[0], get_links(hw[1])) for hw in hws]
        uploader.add_page(UploadPage(name, hws))

    uploader.start()
    uploader.cleanup()
Example #8
async def _main(site, concurrent, minimum_wait, maximum_wait, max_items):
    start_time = time.time()

    robot_agent = create_robot_agent(USER_AGENT, site)
    emitter = Emitter(max_items)
    handler = KolonialHandlers(site, emitter)
    http_client = HttpClient(USER_AGENT, minimum_wait, maximum_wait)
    crawler = Crawler(robot_agent, http_client, handler)

    queue = asyncio.Queue()
    queue.put_nowait("/")

    workers = [
        asyncio.create_task(worker_task(n, crawler, queue))
        for n in range(concurrent)
    ]
    await queue.join()

    for worker in workers:
        worker.cancel()

    await asyncio.gather(*workers, return_exceptions=True)
    await http_client.close()

    end_time = time.time()
    print(f"{http_client.gets} requests")
    print(f"{emitter.count} entries")
    print(f"took {end_time - start_time}s")
 def test_init_positive01(self):
     my_crawler = Crawler()
     site_to_check = 'http://zabaykin.ru'
     res = my_crawler.load_and_tokenize([site_to_check], depth=1)
     # print(res)
     # check number of paragraphs
     self.assertEqual(len(res[site_to_check]), 244)
def crawl(url, output_dir, depth=2, method="normal", gecko_path="geckodriver", page_name=None, custom_stats_handler=None, custom_process_handler=None):
    head_handlers = {}
    get_handlers = {}

    # get name of page for sub-directories etc. if not custom name given
    if page_name is None:
        page_name = urlparse(url).netloc

    get_handlers['application/pdf'] = LocalStoragePDFHandler(
        directory=output_dir, subdirectory=page_name)

    if custom_stats_handler is None:
        head_handlers['application/pdf'] = CSVStatsPDFHandler(directory=output_dir, name=page_name)
    else:
        for content_type, Handler in custom_stats_handler.items():
            head_handlers[content_type] = Handler

    if custom_process_handler is None:
        process_handler = ProcessHandler()
    else:
        process_handler = custom_process_handler

    if not get_handlers and not head_handlers:
        raise ValueError('You did not specify any output')

    crawler = Crawler(
        downloader=requests_downloader,
        head_handlers=head_handlers,
        get_handlers=get_handlers,
        follow_foreign_hosts=False,
        crawl_method=method,
        gecko_path=gecko_path,
        process_handler=process_handler
    )
    crawler.crawl(url, depth)
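
A typical invocation of this helper might look as follows; the target URL and output directory are placeholders, and the remaining parameters fall back to their defaults:

# Hypothetical usage: crawl two levels deep, saving PDFs and a CSV of stats to ./output.
crawl(
    url='https://example.com',
    output_dir='./output',
    depth=2,
    method='normal',
)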
Example #11
def crawl_start():
    if db.get_status() == 1:
        return util.response(1, 'crawler is running!')
    else:
        c = Crawler()
        t = threading.Thread(target=c.run)
        t.start()
        return util.response(0, 'started')
Example #12
def main():

    c = Crawler()

    indices = c.get_master_indices()

    fs = FileSaver()
    fs.save_files(indices)
Example #13
def main():
    _setup_logging()
    downloader = SeleniumHTMLDownloader('./lib/chromedriver.exe')
    store = MongoArticleStore("localhost", 27017, "Crawler", "Articles")
    crawler = Crawler(downloader, {
        'ynet.co.il': YnetParser()
    }, store)
    crawler.crawl('https://www.ynet.co.il')
def test_init():
    try:
        crawler = Crawler(get_logger())
        assert not crawler, "crawler is abstract and should not be allowed to be instantiated"
    except TypeError as type_err:
        assert type_err, "TypeError is expected; crawler is abstract"
    except Exception as err:
        assert not err, "Exception should be of type TypeError"
Example #15
def main():
    with open('database.conf', 'r') as confFile:
        confStr = confFile.read()
    conf = json.JSONDecoder().decode(confStr)
    db.init_url(url=conf['mariadb_url'])

    crawler1 = Crawler(1000, url='https://www.szlcsc.com/brand.html')
    crawler2 = Crawler(1000, url='https://www.szlcsc.com/brand.html')
    # crawler.run(url='https://item.szlcsc.com/44085.html')
    # crawler.run(url='https://www.szlcsc.com/catalog.html')
    thread_one = threading.Thread(target=crawler1.run)
    thread_two = threading.Thread(target=crawler2.run)
    thread_one.start()
    thread_two.start()
    thread_one.join()
    thread_two.join()
    print('done')
 def test_structure_positive_01(self):
     my_crawler = Crawler()
     site_to_check = 'http://zabaykin.ru/?p=505'
     res = my_crawler.load_and_tokenize([site_to_check], depth=2)
     self.assertIsInstance(res, OrderedDict)
     for k, el in res.items():
         self.assertIsInstance(k, str)
         self.assertTrue(k.startswith('http'))
         self.assertIsInstance(el, list)
         for token in el:
             self.assertIsInstance(token, str)
 def __init__(self):
     try:
         self.crawler = Crawler()
         self.pgraph = Graph()
         self.parser = Parser()
     except Exception as e:
         print("ERROR " + str(e))
         sys.exit(-1)
Example #18
 def test_find_all_urls(self):
     c = Crawler('http://www.hotline.ua')
     expected_urls = [
         'http://www.hotline.ua/help/hl_checkout/h1/',
         'http://www.hotline.ua/help/hl_checkout/',
         'http://www.hotline.ua/help/hl_checkout/h3/',
         'http://www.hotline.ua/help/hl_checkout/h2/'
         ]
     self.assertEqual(sorted(c.find_all_urls(self.page)), sorted(expected_urls))
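
The test relies on a self.page fixture that is not shown here. A minimal sketch of a setUp providing it (the HTML is invented solely so that it contains the four expected links):

 def setUp(self):
     # Hypothetical fixture: a small page containing exactly the links the test expects.
     self.page = """
     <html><body>
       <a href="http://www.hotline.ua/help/hl_checkout/">checkout</a>
       <a href="http://www.hotline.ua/help/hl_checkout/h1/">step 1</a>
       <a href="http://www.hotline.ua/help/hl_checkout/h2/">step 2</a>
       <a href="http://www.hotline.ua/help/hl_checkout/h3/">step 3</a>
     </body></html>
     """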
Example #19
def main():
    parser = Parser(HTML.REDIS, PARSER.ELASTICSEARCH)
    """Crawler start"""
    crawler = Crawler(DB.MYSQL, HTML.REDIS, parser)

    try:
        crawler.run()
    except KeyboardInterrupt:
        crawler.stop()
        sys.exit(0)
def correct_document():
    while True:
        Session = sessionmaker(bind=DB_ENGINE)
        session = Session()

        keyword_manager = KeywordManager()

        search_keyword = keyword_manager.pull_candidate_keyword()
        print('Search keyword = ' + search_keyword)

        category = session.query(Category).filter_by(category_name=CONFIG.CATEGORY).first()

        knowledge = session.query(Knowledge).filter_by(knowledge_name=search_keyword).first()
        if knowledge is None:
            knowledge = Knowledge(category.id, search_keyword)
            session.add(knowledge)
            session.commit()

        knowledge = session.query(Knowledge).filter_by(knowledge_name=search_keyword).first()
        session.close()

        inside_document_corrector = InsideDocumentCollector(search_keyword)
        print('\ncollecting from crawled_knowledge_documents..')
        correct_document_count = inside_document_corrector.collect_document_from_crawled_knowledge_document(knowledge)
        print('finish. collected ' + str(correct_document_count))

        print('\ncollecting from without_knowledge_documents..')
        correct_document_count += inside_document_corrector.collect_document_from_without_knowledge_document(knowledge)
        print('finish. collected ' + str(correct_document_count))

        print('\ncollecting from web..')
        correct_document_count += Crawler().collect_document_from_web(
            search_keyword, knowledge)
        print('finish. collected ' + str(correct_document_count))

        if correct_document_count < 1:
            Session = sessionmaker(bind=DB_ENGINE)
            session = Session()
            session.delete(knowledge)
            session.commit()
            session.close()
            continue

        print('\ncreating knowledge association..')
        # set up the knowledge associations
        pathfinder = Pathfinder()
        pathfinder.find_path(knowledge)
        print('done.\n')

        keyword_manager.delete_current_search_keyword(search_keyword)

        # re-enable automatic garbage collection
        gc.enable()
def main():
    configs = load_configs()
    conf_db = configs["database"]
    conf_param = configs["params"]

    crawler = Crawler()
    db = MongoDb(conf_db["address"], conf_db["port"])

    for stock_id in conf_param["stock_ids"]:
        data = crawler.fetch_stock_data(stock_id, conf_param["period"])
        db.append_one(data)
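
load_configs() is not shown; the keys it must provide can be read off the lookups above. A hypothetical sketch of the returned structure (all concrete values are invented for illustration):

def load_configs():
    # Hypothetical config loader: keys mirror the lookups in main(); values are placeholders.
    return {
        "database": {"address": "localhost", "port": 27017},
        "params": {"stock_ids": ["2330", "2317"], "period": "1y"},
    }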
Example #22
def launch(dir_for_docs, dir_checkpoints, checkpoints_name, description_file,
           lock, inv_index, frontier, documents, step_count):
    crawler = Crawler(frontier, dir_for_docs, dir_checkpoints, checkpoints_name, lock, inv_index, description_file)
    if documents is None:
        open(crawler.file_description, 'w').close()  # Wipe file
    else:
        crawler.documents = documents
    if step_count is not None:
        crawler.steps_count = step_count

    crawler.run()
Example #23
def stopCrawler(request):
    try:
        id = request.POST.get('id')
        source = Source.objects.get(id=id)
        sourceurl = source.url
        crawlerone = Crawler(sourceurl)
        crawlerone.stop()
        runingcrawlers.update({'id': id, 'inst': crawlerone})

        return redirect('dashboard')
    except ObjectDoesNotExist:
        return redirect('dashboard')
Example #24
 def test_fill_disallow_urls_from_robot(self):
     with patch.object(requests, 'get') as mock_get:
         with open('fake_robots.txt', 'r') as fake_robots_txt:
             mock_get.return_value = FakeResponse()
             mock_get.return_value.text = fake_robots_txt.read()
             test_crawler = Crawler(
                 'https://a/',
                 [''], {})
             test_crawler.fill_disallow_urls(URL('https://a/'))
             test_crawler.close()
             self.assertEqual({re.compile('https://a/b.+', re.IGNORECASE)},
                              test_crawler.disallow_urls)
Example #25
def crawl(url, cfgs):
    click.secho("Crawler will begin on '{url}' with below settings:\n".format(url=url), fg='green')
    config = configs.load_config_section(config_section=cfgs)
    if config is None:
        print(f"Invalid config {cfgs}. Switching to DEFAULT.")
        config = configs.load_config_section(config_section='DEFAULT')
    else:
        print(f"Config set {cfgs} loaded.")
    click.echo()
    crawler = Crawler()
    print(f"Target URL = {url}")
    crawler.crawl(url, config['traversal'], config['user_agent'], int(config['max_depth']), int(config['max_total']))
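
The click decorators are not shown in this listing; presumably the function above is registered as a command. One hypothetical way to wire it up (the command, argument, and option names are assumptions):

import click


# Hypothetical wiring: exposes crawl() as a CLI command.
@click.command()
@click.argument('url')
@click.option('--cfgs', default='DEFAULT', help='Config section to load.')
def crawl_command(url, cfgs):
    crawl(url, cfgs)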
Example #26
def test(url, vector_input, sensitive_input, random, speed):
	"""Uses provided vectors and input to test against target."""
	# TODO(piper): pass files in to test.
	c = Crawler(url[0], auth=True)

	if vector_input:
		vectored = c.crawl([Gatherer()])
		
	if sensitive_input:
		[print(line) for line in sensitive_input]
	
	# result = c.crawl([VectorGatherer()])
	print("Finished testing...")
Example #27
 def test_searcher_with_seen_urls(self):
     with patch.object(Crawler, 'get_html') as mock_get_html:
         mock_get_html.return_value = '<a href=http://scala-lang.org></a>' \
                                      '<a href=https://scala11.html></a>'
         with patch.object(Crawler, 'write_html') as mock_write_html:
             mock_write_html.return_value = None
             test_crawler = Crawler(
                 'https://docs.scala-lang.org/ru/tour/tour-of-scala.html',
                 ['scala'], {}, 2)
             test_crawler.seen_urls.add(Page(URL('http://scala-lang.org')))
             test_result = test_crawler.crawl()
             test_crawler.close()
             assert 'http://scala-lang.org' not in test_result
Example #28
 def test_crawler_zero_result(self):
     with patch.object(Crawler, 'get_html') as mock_get_html:
         mock_get_html.return_value = '<a href=https://scala1.html></a>'
         with patch.object(Crawler, 'write_html') as mock_write_html:
             mock_write_html.return_value = None
             test_crawler = Crawler(
                 'https://docs.scala-lang.org/ru/tour/tour-of-scala.html',
                 ['dog'],
                 {},
                 2)
             test_result = test_crawler.crawl()
             test_crawler.close()
             self.assertEqual(test_result, set())
Example #29
    def start(self):
        for game_type, crawl_range in self.crawl_range().items():
            crawler = Crawler(game_type)
            for date in pd.date_range(start=crawl_range["begin"],
                                      end=crawl_range["end"]):
                logging.debug(f"command: crawling {game_type} at {date}")
                crawler.run(self.format_date(date))

                # random sleep
                time.sleep(
                    abs(
                        np.random.normal(
                            self.config["commander"]["queryPeriod"])))
Example #30
def main():

    c = Crawler()
    fs = FileSaver()

    for i in range(EARLIEST, LATEST + 1):
        for j in range(Q1, Q4 + 1):
            print("\nStarting on year %s, quarter %s\n" % (str(i), str(j)))

            if i == 2016 and j == 4:
                pass

            else:
                f = open(BASE_MASTER_PATH.format(i, j), 'r')

                for line in f:
                    if contains_8k(line):
                        line = line.split('|')

                        doc = c.get_8k_form(line[URL])

                        wait = random.randint(1, 30)
                        print('Waiting %s seconds' % (str(wait)))
                        time.sleep(int(wait))
                        print('Current time is %s' %
                              strftime('%Y-%m-%d %H:%M:%S', gmtime()))

                        if isinstance(line[DATE], str) and isinstance(
                                line[COMPANY], str):

                            form_date = line[DATE]

                            form_company = "".join(line[COMPANY].split('/'))

                        else:

                            form_date = line[DATE].decode('utf-8', 'ignore')
                            form_company = line[COMPANY].decode(
                                'utf-8', 'ignore')

                        print('Found form from company %s on date of %s' %
                              (form_company, form_date))

                        fs.save_8k_file(i, j, doc, form_date, form_company)
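
contains_8k() and the URL, DATE, and COMPANY column indices are defined outside this snippet. A minimal sketch against the pipe-delimited EDGAR master index layout (the column position used below is an assumption):

def contains_8k(line):
    # Hypothetical helper: master index rows look like
    # CIK|Company Name|Form Type|Date Filed|Filename; the form type is assumed to be column 2.
    parts = line.split('|')
    return len(parts) >= 5 and parts[2].strip() == '8-K'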