Example #1
def main():
	t1 = task("http://www.laurentluce.com/posts/python-threads-synchronization-locks-rlocks-semaphores-conditions-events-and-queues/")
	t2 = task("http://stackoverflow.com/questions/15651128/in-this-semaphore-example-is-it-necessary-to-lock-for-refill-and-buy")
	t3 = task("http://bbs.byr.cn/")
	event = Event()
	tasks = TaskQueue(event)
	pages = TaskQueue(None)
	tasks.add(t1)
	tasks.add(t2)
	tasks.add(t3)

	taskLock = BoundedSemaphore(tasks.numOfNewTasks)
	pageLock = BoundedSemaphore(1)
	f = open("test.txt",'w')
	Connector0 = Connector(tasks,taskLock,pages,pageLock,event,'',f, 3000)
	Connector1 = Connector(tasks,taskLock,pages,pageLock,event,'',f, 3001)
	Connector0.start()
	Connector1.start()

	Crawler0 = Crawler('',3000)
	Crawler1 = Crawler('',3001)

	Crawler0.start()
	Crawler1.start()

	Connector1.join()
	Connector0.join()
	Crawler0.join()
	Crawler1.join()
	f.close()
Example #2
    def __init__(self):
        # call the parent constructor
        Crawler.__init__(self)

        self.crawl_cookie = {}
        self.status_code = ''
        self.history = ''
Example #3
 def run(self):
     robot_url = "http://allrecipes.com/"
     root = 'http://allrecipes.com/Recipes/ViewAll.aspx?Page=1'
     depth_limit = 5
     confine_reg = [r'http://allrecipes.com/Recipes/ViewAll.aspx\?Page\=[0-9]*$', r'http://allrecipes.com/Recipe/[a-zA-Z0-9\-]*/Detail.aspx$']
     c = Crawler(root, depth_limit,confine_reg,robot_url)  
     c.crawl()     
Example #4
def main():
    try:
        spider = Crawler()
        spider.go()
    
    except KeyboardInterrupt:
        print("Stopped!")
Example #5
def start():
	'''Start of the crawl process: take one node from the pending set and crawl it.'''
	# Initialization
	mongo_peoples , redis_client = Init()

	# Wait while the set of nodes to crawl is empty
	while redis_client.scard(info_success_set) == 0: # empty
		# Sleep wait_time seconds
		time.sleep(wait_time)

	# Pop one node at random from the set of nodes to crawl
	node = redis_client.spop(info_success_set)
	urlToken = node

	# Crawl the personal homepage of the user this node represents
	# printx('准备代理……')
	printx('正在抓取用户 %s 的个人信息……'%urlToken)
	try_cnt = try_limit
	while try_cnt > 0:
		try:
			c = Crawler(isCookie=False,timeout=socket_timeout)
			# Manually set the proxy IP
			ip = proxyip.get()
			c.set_proxyip(ip)

			people = get_Info(c,urlToken)
			if people is None:
				raise Exception('抓取的用户信息为空')
		except Exception as e:
			try_cnt -= 1
			print(e)
			printx('用户 %s 个人信息抓取出错,还可以尝试抓取 %d 次'%(urlToken,try_cnt))
		else:
			break
Example #6
 def __init__(self):
     crawler.__init__(self)
     self.title = ''
     self.press = ''
     self.date = ''
     self.time = ''
     self.contents = ''
Example #7
    def __init__(self):
        myCrawler = Crawler(self.LINKS)
        crawledURLs = myCrawler.getVisited()
        linkStructure = myCrawler.getLinkStructure()
        print("Link-Struktur:\n")
        myCrawler.printLinkStructure()

        myPageRank = PageRank(linkStructure)
        pageRanks = myPageRank.getPageRank()
        print("\n\nPageRanks:\n")
        myPageRank.printPageRank()

        myIndex = Index(self.STOPWORDS, crawledURLs)
        index = myIndex.getIndex()
        print("\n\nIndex:\n")
        myIndex.printIndex()

        myScorer = Scorer(pageRanks, index,linkStructure)
        #myScorer.usePageRank(True)
        print("\n\nDokumentenlängen:\n")
        myScorer.printDocumentLengths()
        print("\n\nSuchergebnisse:\n")
        myScorer.calculateScores(["tokens"])
        myScorer.calculateScores(["index"])
        myScorer.calculateScores(["classification"])
        myScorer.calculateScores(["tokens", "classification"])
Example #8
    def get_article(self, url):
        crawler = Crawler()
        # get html data from url
        web_data = crawler.get_page(url)
        soup = BeautifulSoup(web_data, 'html.parser')

        # remove link news 
        [e.extract() for e in soup('div', {'class':'link_news'})]

        # article title
        self.title = soup('h3', {'id':'articleTitle'})[0].text

        # create date and time of article
        date_time = soup('span', {'class':'t11'})[0].text.split()
        self.date = date_time[0]
        self.time = date_time[1]

        # press name
        press_logo = soup('div', {'class':'press_logo'})[0]
        self.press = press_logo.find('img')['alt']
        del press_logo

        # article contents
        self.contents = soup('div', {'id':'articleBodyContents'})[0].text
        self.contents = re.sub('[\n\r]', '', self.contents)
Example #9
 def test_crawl_native_fakeCrawler(self):
     parameter_file = "./test/search_parameters.json"
     c = Crawler("SimpleTest", parameters=FileOperations.get_from_JSON_file(parameter_file))
     self.assertEqual(c.name, "SimpleTest")
     c.crawl_native()
     self.assertTrue(os.path.isfile(parameter_file))
     result_from_file = FileOperations.get_from_JSON_file(c.output["path"])
     self.assertEqual(len(result_from_file), 3)
Example #10
 def test_crawl_multithread_mmcoreAsync(self):
     parameter_data = FileOperations.get_from_JSON_file("./test/search_async.json")
     crawlers = parameter_data["crawlers"]
     crawlerName = "dotAsync"
     c = Crawler(crawlerName, parameters=crawlers[crawlerName])
     data = c.crawl_native(threads=None)
     self.assertTrue(len(data) > 0)
     c.save_crawler_data(data, crawlers[crawlerName]["output"])
Example #11
 def test_crawl_clientIntegrations(self):
     parameter_data = FileOperations.get_from_JSON_file("./test/search_integration.json")
     crawlers = parameter_data["crawlers"]
     crawlerName = "Integration"
     c = Crawler(crawlerName, parameters=crawlers[crawlerName])
     data = c.crawl_native()
     self.assertTrue(len(data) > 0)
     c.save_crawler_data(data, crawlers[crawlerName]["output"])
Example #12
    def __init__(self, forced=False):
        Crawler.__init__(self)
        self.results = set()
        self.forced = forced
        self.success_count = None
        self.failure_count = None

        self.blacklist = []
        self.name_exceptions = ["http://www.cplusplus.com/reference/string/swap/"]
Example #13
    def test__process_html(self):
        soup = BeautifulSoup(self.html_test_string)
        c = Crawler("http://test.com")
        c._process_html_asset = mock.Mock()
        c._process_html_link = mock.Mock()

        c._process_html(soup)
        self.assertEqual(c._process_html_asset.call_count, 3)
        self.assertEqual(c._process_html_link.call_count, 4)
Example #14
    def test_render_sitemap(self):
        try:
            os.remove("sitemap.pdf")
        except OSError:
            pass

        self.assertEqual(os.path.exists("sitemap.pdf"), False)
        c = Crawler("http://a.com")
        c.render_sitemap()
        self.assertEqual(os.path.exists("sitemap.pdf"), True)
Example #15
    def test__process_html_link(self):
        c = Crawler("http://test.com")
        soup = BeautifulSoup(self.html_test_string)

        for link in soup.find_all("a"):
            c._process_html_link(link, "/")

        self.assertEqual(len(c.sitemap.nodes()), 3)
        self.assertEqual(len(c.sitemap.edges()), 2)
        self.assertEqual(len(c.process_q), 3)
Example #16
    def test__process_html_good_asset(self):
        c = Crawler("http://test.com")
        soup = BeautifulSoup(self.html_test_string)

        c._does_static_file_exist = mock.Mock(return_value=True)
        for asset in soup.find_all(True, src=True):
            c._process_html_asset(asset, "/")

        self.assertEqual(c._does_static_file_exist.call_count, 2)
        self.assertEqual(len(c.sitemap.nodes()), 3)
        self.assertEqual(len(c.sitemap.edges()), 2)
Example #17
    def test__parse_url(self):
        test_list = {
            "http://www.test2.com": None,
            "/a/b/c": "https://www.test.com/a/b/c",
            "/?q=1": "https://www.test.com/?q=1",
            "https://sub.test.com": None
        }

        for test in test_list:
            c = Crawler("https://www.test.com")
            self.assertEqual(c._parse_url(test), test_list[test])
Example #18
 def test_saveLog(self):
     """Test the method of saving logs.
     """
     c = Crawler()
     c.log = ['first', 'second']
     c.saveLog()
     self.assertTrue(
         os.path.exists(c.logPath)
         and os.path.isfile(c.logPath))  # Log file existence
     with open(c.logPath, 'r') as fin:
         self.assertEqual(fin.read(),
                          'first\nsecond')  # Log file content correctness
Example #19
def crawl_one():
    if request.method == "GET":
        return "Hello"
    elif request.method == "POST":
        req = request.get_json()
        print(req)
        crawler = Crawler()
        currDate = datetime.date(int(req["yr"]), int(req["month"]), int(req["day"]))
        duration = datetime.timedelta(days=int(req["duration"]))
        l_id = req["id"]
        out = crawler.crawl_one(l_id, currDate, duration)
        return json.dumps(out)
Example #20
def make_database(db):
    cr = Crawler('https://hackbulgaria.com/api/students/')
    users = cr.get_users()
    courses = cr.get_courses()

    # Making first table
    for course in courses:
        db.add_course(course)

    # Making second and third table
    for user in users:
        db.add_user(user)
Example #21
def start():
  parser=argparse.ArgumentParser(description="")
  parser.add_argument("-n","--number-of-levels",type=int,default=1,help="")
  parser.add_argument("url",nargs=1,help="target URL")

  args=parser.parse_args()
  target_url=args.url.pop()
  deep=args.number_of_levels

  mycrawler=Crawler()
  currentDeep=1
  mycrawler.startCrawler(target_url,currentDeep,deep)
Example #22
    def __init__(self, forced=False):
        Crawler.__init__(self)
        self.results = set()
        self.forced = forced
        self.success_count = None
        self.failure_count = None

        self.blacklist = [
        ]
        self.name_exceptions = [
            'http://www.cplusplus.com/reference/string/swap/'
        ]
Example #23
    def __init__(self, query):
        self.query = ""
        filterWords = POS.POS(query)

        for item in filterWords:
            if self.query == "":
                self.query += item[0]
            else:
                self.query += " " + item[0]

        print(self.query)
        self.crawler = Crawler(2, 1000, self.query)
        self.crawler.start()
Example #24
    def reindex():
        #defines our indexing limit (N)
        limit = 100
        root = "http://lyle.smu.edu/~fmoore/"
        myCrawl = Crawler(root, limit)
        myCrawl.startCrawl()

        #creating our reverse index
        script_dir = os.path.dirname(os.path.abspath(__file__))
        stopWords = './assets/stopwords.txt'
        stopWords = open(os.path.join(script_dir, stopWords)).read().splitlines()
        myIndex = Indexer(myCrawl.getTreeIndex(), stopWords).createReverseIndex().saveIndex('assets/reverse_index.json').savePageDigests('assets/page_digests.json')
        return True
Example #25
def main():
    # Configuration
    configurator = Config("./ConfigFile.xml")
    ret = configurator.config()

    # Crawl
    crawler = Crawler()
    crawler.crawl()

    # Parse
    parser = Parser()
    parser.parse()
    return
Example #26
 def __init__(self,
              b_type='Chrome',
              username='******',
              pwd='Blank123'):
     if 'simas' in username:
         print(
             'This is the real account. Please be cautious while trading. semiAuto mode is recommended.'
         )
     self.controller = Crawler(type=b_type)
     self.browser = None
     self.username = username
     self.pwd = pwd
     self.strategy = None
Example #27
def main():
    thread_count = ThreadCount()  # Create the thread-counter object
    error_log = ErrorLog()  # Create the error-log object
    error_log.clear_error_log()  # Create the log file if missing and clear the error log
    crawler = Crawler(thread_count)  # Create the crawler object
    url = get_one_url()
    while url is not None:  # Loop while there are URLs left
        if thread_count.total < 5:  # Check whether the current thread count is within the limit
            print("加载:" + url)
            crawler.get_book(url)  # Have the crawler fetch the book page
            url = get_one_url()  # Fetch the next URL
        else:  # Too many threads running
            sleep(10)  # Sleep 10 seconds and wait for threads to finish
Example #28
    def test__normalize_url(self):
        test_list = {
            "http://www.a.com#abc": "http://www.a.com/",
            "http://www.a.com/a/b/c": "http://www.a.com/a/b/c",

            # if no scheme is provided, urlsplit treats the domain name as the path
            # so we don't expect a trailing "/" after www.a.com
            "www.a.com?abc=123#abc": "://www.a.com?abc=123"
        }

        for test in test_list:
            usplit = urlparse.urlsplit(test)
            c = Crawler("http://mydomain.com")
            self.assertEqual(c._normalize_url(usplit), test_list[test])
Example #29
 def _count_process_done(self, files, folders, others, total):
     if not self.counter.canceled:
         self.crawler = Crawler(self, files, folders, others, total)
         self.crawler.processDone.connect(self._process_done)
         self.crawler.updateStatusBar.connect(self._update_status_bar)
         self.crawler.setProgressBarValue.connect(
             self._update_progress_bar_value)
         self.crawler.updateDuplicateInformation.connect(
             self._update_duplicate_information)
         self.crawler.createDuplicateInformation.connect(
             self._create_duplicate_information)
         self.crawler.start()
     else:
         self._end_process_set_button_states()
Example #30
 def valuta_sito_web(self, sito):
     crawler = Crawler()
     img_processor = ImageProcessor()
     # img_worker = ImageWorker()
     report_pagine = crawler.generate_reportpagine(sito)
     print(report_pagine.toJSON())
     # TODO: generate the photo report
     report_foto = img_processor.generate_report_foto('web')
     # report_foto = img_worker.generate_reportfoto()
     # TODO: delete the photo directory
     report = self.evaluate_report(sito, report_pagine, report_foto)
     # TODO: evaluate the report
     # report = Report(sito, report_pagine, report_foto, valutazione)
     return report
Example #31
def scrape_documents(min_count=0):

    doc_count = 0

    s = Crawler()
    docs = s.crawl(min_count)

    while min_count <= 0 or doc_count < min_count:
        for doc in docs:
            log.debug('uploaded image doc from %s', doc.url)
            doc_count += 1
            if doc_count % 100 == 0:
                log.info('%d images and counting...', doc_count)
            yield doc
Example #32
    def test_extract1(self):
        """The testing method of extracting a string from a web page. In this case, all the targets of the extractors exist.
        """
        c = Crawler(extractors=[self.Extractor_1('title')])
        url = self.testUrls[0]
        bs = c.getPageBs(url)
        info = c.extract(url, bs)
        self.assertEqual(info, 'Get title: An Interesting Title'
                         )  # Returned information correctness

        expectDict = {'0': {'title': 'An Interesting Title'}}
        with open('tmp/title.json', 'r') as fin:
            self.assertEqual(json.loads(fin.read()),
                             expectDict)  # Record file correctness
Example #33
def main(start=0, end=len(const.city_code)):
    city_list = const.city_code
    c = Crawler()
    pool = multiprocessing.Pool(multiprocessing.cpu_count() - 2)
    for i in range(int(start), int(end)):
        if c.check_compete(city_list[i]):
            continue
        for j in range(0, len(city_list)):
            if i != j:
                pool.apply_async(c.get_all_info, (
                    city_list[i],
                    city_list[j],
                ))
    pool.close()
    pool.join()
Example #34
class CrawlerTest(unittest.TestCase):
    def setUp(self):
        self.crawler = Crawler()

    def testNavigateAndRetrieveLinks(self):
        crawler = self.crawler
        crawler.navigate("http://www.google.ca")
        ret = crawler.findNext(".*")
        for r in ret:
            print(r)

    def tearDown(self):
        self.crawler.close()
Example #35
def main():
    parser = argparse.ArgumentParser(
        description='Process Parameters for searching')
    parser.add_argument('key', type=str, help='Input the keys of Google API')
    args = parser.parse_args()
    # command loop
    while True:
        print('1 Add new class data to events library\n'
              '2 Events classification\n'
              '3 Query classification\n'
              '4 Exit\n')
        cmd = input('Please input a option:\n')
        if cmd == '1':
            # create the crawler
            spider = Crawler(args.key)
            while True:
                print('\nEnter the search item and keywords like this\n'
                      'num_of_res item keyword_1 keyword_2 ... keyword_n\n'
                      "--- type 'exit' to exit ---\n")
                cmd = input('Please input your command\n')
                # input check for cmd
                if cmd == '':
                    print('Empty string!')
                    continue
                elif cmd == 'exit':
                    break
                else:
                    cmd = cmd.split(' ')
                    if cmd[0].isdigit():
                        spider.crawl(cmd[0], cmd[2:], cmd[1])
                        print('crawling...')
                        continue
                    else:
                        print('The number of search item is invalid!\n')
                        continue
            continue

        elif cmd == '2':
            print('Events classifier in developing...\n')
            continue
        elif cmd == '3':
            print('Query classifier in developing...\n')
            continue
        elif cmd == '4' or cmd == 'exit':
            break
        else:
            print('Command error, please input your option again\n')
            continue
Example #36
def crawl(seed_list, token):
    # Make 5 threads (one per seed)
    crawler_list = []
    for seed in seed_list:
        name = seed.split('/')[2]
        crawler = Crawler(name, seed, token)
        crawler.start()
        crawler_list.append(crawler)

    # simple work checker
    while True:
        time.sleep(10)
        print('#######{}########'.format(time.ctime()))
        for crawler in crawler_list:
            print('{}: {}/{}'.format(crawler.name, crawler.num_post,
                                     crawler.num_delete))
Example #37
def step2_download_zipfiles():
	desktop_path = Helper.get_desktop_dir()
	directory = os.path.join(desktop_path, RAW_DATA_PATH)
	if not os.path.exists(directory):
	    os.makedirs(directory)

	db = Database()
	currency_list = db.get_currency_list()
	crawler_list = [Crawler(db) for x in range(THREAD_NUMBER)]

	lock = threading.RLock()

	def down_data(crawler):

	    while len(currency_list) > 0:

	        with lock:
	            currency = currency_list[0]
	            currency_list.remove(currency)

	        crawler.download_historical_data(currency["symbol"], currency["time"], directory)
	    crawler.quit()

	for crawler in crawler_list:
	    t = threading.Thread(target=down_data, args=(crawler, ))
	    t.start()
Example #38
class Getter():
    def __init__(self):
        # Initialize the Redis client and the crawler
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        '''Return True if the proxy pool has reached the configured size, otherwise False.'''
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        return False

    def run(self):
        print('获取器开始执行...')
        # If the limit has not been reached
        if not self.is_over_threshold():
            # Iterate over __CrawlFuncCount__; e.g. index 0 picks the crawl method at index 0 of __CrawlFunc__
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                # Get the method from the list of all methods whose names start with crawl_
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Call get_proxies to fetch the proxies that method crawls
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                # Add each fetched proxy to the database
                for proxy in proxies:
                    self.redis.add(proxy)
Example #39
class Main(Process):
    urlToCrawl = ''
    crawling = False
    crawler = Crawler()
    db = DbHandler()

    def __init__(self, url=None):
        global urlToCrawl
        if url is not None:
            urlToCrawl = url

    def start(self, url):
        global urlToCrawl, crawler
        urlToCrawl = url
        self.idle()

    def printen(self, url):
        print(url)

    def idle(self):
        while self.db.getCrawlstate('crawler')[0]:
            while not Main.crawling:
                status = self.db.getCrawlstate(urlToCrawl)
                if status[0]:
                    Main.crawling = True
                else:
                    time.sleep(1800)
            Main.crawling = False
            Main.crawler.startCrawler(urlToCrawl, status[1])
Example #40
 def getPriceInfo(self, code):
     r = random.uniform(0, 3)
     time.sleep(r)
     url = self.sinaApi % (code)
     page = Crawler.getPage(url, "gbk")
     tmp = page.split('"')
     return tmp[1].split(',')
Example #41
def main():
    if len(sys.argv) > 1 and len(sys.argv) < 5:
        context = sys.argv[1]
        place = "./"
        clean = False

        if len(sys.argv) == 3:
            if sys.argv[2] == "--clean":
                clean = True
            else:
                place = sys.argv[2]
        elif len(sys.argv) == 4:
            place = sys.argv[2]
            if sys.argv[3] == "--clean":
                clean = True
            else:
                print("Unrecognized option '" + sys.argv[3] + "'")

        with Crawler(place, context) as spider:
            if not clean:
                if (os.path.exists(spider.listFileName)
                        and os.path.exists(spider.visitedFileName)):
                    spider.loadState(spider.listFileName,
                                     spider.visitedFileName)
                else:
                    print("Could not load one or more of the URL lists.")

            spider.recursivePull()
    else:
        printHelp()
Example #42
def CompanyInfoThread(category):
    # Set up the SQL connection
    conn_cfg = {'host': '', 'user': '', 'password': '', 'db': ''}
    conn = pymysql.connect(**conn_cfg)
    cursor = conn.cursor()
    # The original SQL uses LIKE '_____%'; in Python the trailing % must be doubled (%%) or the query will not run
    sql = "select distinct `公司連結` from db_104.job_link where `職類編號` LIKE '%s%%'" % category
    cursor.execute(sql)
    companylink = cursor.fetchall()
    companylink = [i[0] for i in companylink]  # Build the list of company links
    conn.close()

    # Read each company link in turn and save the company info as JSON
    for l in companylink:
        try:
            url = "https://" + l
            content = Crawler.company_info(url)
            dn = "[directory]"
            fn = url.split("company/")[1].split("?")[0]
            if not os.path.exists(dn):
                os.makedirs(dn)
            f = open(dn + fn + ".json", "w", encoding="utf-8")
            json.dump(content, f)
            f.close()
            print(url)
            print("職務類型", category, "的公司:", fn, "complete")

        except:
            pass
Example #43
def start():
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("-n",
                        "--number-of-levels",
                        type=int,
                        default=1,
                        help="")
    parser.add_argument("url", nargs=1, help="target URL")

    args = parser.parse_args()
    target_url = args.url.pop()
    deep = args.number_of_levels

    mycrawler = Crawler()
    currentDeep = 1
    mycrawler.startCrawler(target_url, currentDeep, deep)
Example #44
def test_blocks():
    """
    Check transactions in each of a random sample of blocks.

    Send a request to https://etherchain.org/api/block/:block/tx to get a list
    of all transactions that occurred in that block. Cross-reference with the
    transactions in the local block (in mongo).
    """
    c = Crawler.Crawler(start=False)
    client = c.mongo_client

    sample = random.sample(range(1, 1700000), 100)
    N = len(sample)

    # Track the number of times the number of transactions is different.
    wrong_blocks = list()
    num_error = "Incorrect number of transactions in {}% of {} blocks."

    blocks = client.find({"number": {"$in": sample}})
    for block in blocks:
        n = block["number"]
        uri = "https://etherchain.org/api/block/{}/tx".format(n)
        ethchain = json.loads(requests.get(uri).text)

        # Check the number of transactions in the block
        if len(ethchain["data"]) != len(block["transactions"]):
            wrong_blocks.append(n)

    wrong_nums = len(wrong_blocks)
    pprint.pprint(wrong_blocks)
    assert wrong_nums == 0, num_error.format(100.0 * wrong_nums / N, N)
Example #45
    def __sync_thread(self):
        while True:
            try:
                resources_tags = self.auto_tagger.process(
                    Crawler(
                        self.get('black-list'),
                        self.get('white-list'),
                        self.get('crawled-resources'),
                    ).crawl())

                SyncAgent(
                    self.get('settings')['server'],
                    self.get('settings')['user-token'],
                    self.get('settings')['device-token'],
                ).sync(resources_tags)

            except Exception as new_exception:
                print('[ERROR]: When trying to sync: {0}'.format(
                    new_exception.message))

            else:
                self.get('crawled-resources').update(
                    set(resource for resource, _ in resources_tags))

            time.sleep(self.get('settings')['sync']['interval'])
Example #46
    def __init__(self):

        # Checks to see if a json file with the data for the week exists
        # TODO: Check if the file is up to date
        if os.path.isfile('../CafeAPI/data.json'):
            # Reads the data from the file into a variable
            with open('../CafeAPI/data.json', 'r') as f:
                self.base = json.load(f)
            print("Database: Retrieved data from file")
        else:
            # Run the Crawler up to four times for more stability in case of an unstable internet connection
            for i in range(4):
                try:
                    # Release Crawler
                    with Crawler() as c:
                        # Navigate and collect data
                        c.nav()

                        # Set data to variable
                        self.base = c.get_info()

                        # Write the data to a file for future reference
                        with open('../Throwaway/CafeAPI/data.json', 'w') as f:
                            json.dump(self.base, f)
                    # Break if all of the above works successfully
                    print(f"Database: Retrieved data from Crawler on try #{i}")
                    break
                except:
                    # This means that something failed and the program has to retry
                    print(
                        f"Database: Something went wrong, loading data retry #{i}"
                    )
                    pass

        print("Database: Initiated Data Collection")
Example #47
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        '''
        Judge whether the configured limit on the number of proxies has been reached
        '''
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        crawlerCount = self.crawler.__CrawlFuncCount__
        print('crawlerCount: ', crawlerCount)
        for callback_lable in range(crawlerCount):
            if self.is_over_threshold():
                print('The proxies in the pool is too many!')
                break
            callback = self.crawler.__CrawlFunc__[callback_lable]
            proxies = self.crawler.get_proxies(callback)
            for proxy in proxies:
                self.redis.add(proxy)
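A minimal usage sketch for the Getter above, assuming RedisClient, Crawler and POOL_UPPER_THRESHOLD are importable from the surrounding project; the interval-based driver loop here is an assumption for illustration, not part of the original code:

import time

def run_getter_forever(cycle_seconds=60):
    # Hypothetical driver: top up the proxy pool, then wait before the next cycle.
    getter = Getter()
    while True:
        getter.run()
        time.sleep(cycle_seconds)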
Example #48
    def __init__(self, start_month="2017/01"):

        # Other web pages
        self._crawler = Crawler(start_month=start_month)

        # National Bureau of Statistics web page
        self._url = "http://data.stats.gov.cn/easyquery.htm?"
        self._sj_start = start_month.replace("/", "")
        self._index_dict = {
            "制造业采购经理指数(%)": "A0B0101",
            "非制造业商务活动指数(%)": "A0B0201",
            "工业生产者出厂价格指数(上年同月=100)": "A01080101",
            "工业增加值累计增长(%)": "A020102",
            "房地产投资累计值(亿元)": "A060101",
            "流通中现金(M0)供应量期末值(亿元)": "A0D0105",
            "货币(M1)供应量期末值(亿元)": "A0D0103",
            "货币和准货币(M2)供应量期末值(亿元)": "A0D0101",
            "GDP": "A010201",

            # "生产指数": "A0B0102",
            # "新订单指数": "A0B0103",
            # "新出口订单指数": "A0B0104",
            # "在手订单指数": "A0B0105",
            # "产成品库存指数": "A0B0106",
            # "采购量指数": "A0B0107",
            # "进口指数": "A0B0108",
            # "出厂价格指数": "A0B0109",
            # "主要原材料购进价格指数": "A0B010A",
            # "原材料库存指数": "A0B010B",
            # "从业人员指数": "A0B010C",
            # "供应商配送时间指数": "A0B010D",
            # "生产经营活动预期指数": "A0B010E",
            #
            # "新订单指数": "A0B0202",
            # "新出口订单指数": "A0B0203",
            # "在手订单指数": "A0B0204",
            # "存货指数": "A0B0205",
            # "投入品价格指数": "A0B0206",
            # "销售价格指数": "A0B0207",
            # "从业人员指数": "A0B0208",
            # "供应商配送时间指数": "A0B0209",
            # "业务活动预期指数": "A0B020A",
            #
            # "综合PMI产出指数": "A0B0301"
        }

        self._data_by_quarter = ["GDP"]
Example #49
 def main(self):
   if self.config.has_option("sources", "bootstrap"):
     self.bootstrap(
       filename = self.config.get("sources", "bootstrap")
     )
   b = Base(
     endpoint = self.config.get("xserver", "endpoint"),
     base = self.config.get("xserver", "base")
   )
   c = Crawler(base = b)
   c.crawl(callback = self.callback)
   self.processCache()
   self.addTopConcepts()
   self.addLinks()
   self.write()
   self.writeTables()
   shutil.rmtree("temp")
Example #50
def main():
    argparser = argparse.ArgumentParser(description="Scrapes a Web site and writes the generated HTML to disk for caching")
    argparser.add_argument('root', help='The starting point URL for the crawl (beginning with http:// or https://)')
    args = argparser.parse_args()

    assert args.root.startswith(('https://', 'http://'))
    policy = ScrapingPolicy(args.root)
    Crawler(policy).crawl()
Example #51
async def main():
    webpage_store = WebpageStore()
    webpage_processor = WebpageProcessor(webpage_store)
    crawler = Crawler(webpage_processor, max_depth=3, verbose=True)
    # initial_urls = ["https://en.wikipedia.org/wiki/Web_scraping"]
    # initial_urls = [f"https://swapi.co/api/people/{i}" for i in range(1, 3)]
    initial_urls = get_initial_urls()
    await crawler.run(initial_urls)
Example #52
    def test_extract3(self):
        """The testing method of extracting with serveral extractors. In this case, some targets of the extractors do not extist.
        """
        self.assertEqual.__self__.maxDiff = None

        c = Crawler(extractors=[
            self.Extractor_1('title'),
            self.Extractor_2('para'),
            self.Extractor_3('gift1'),
            self.Extractor_4('giftTitles')
        ])
        url = self.testUrls[1]
        bs = c.getPageBs(url)
        info = c.extract(url, bs)

        para = 'Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.'

        pattern1 = self.clean(
            f'Get title: An Interesting Title\nGet fakeLatin: {para}\nGet classIsBody: {para}\nFailed to get gift1: .*\nFailed to get giftTitles: .*'
        )
        pattern2 = self.clean(
            f'Get title: An Interesting Title\nGet classIsBody: {para}\nGet fakeLatin: {para}\nFailed to get gift1: .*\nFailed to get giftTitles: .*'
        )
        self.assertTrue(
            re.match(f'({pattern1})|({pattern2})', info.replace(
                '\n', ' ')))  # Returned information pattern correctness

        expectDict = {'0': {'title': 'An Interesting Title'}}
        with open('tmp/title.json', 'r') as fin:
            self.assertEqual(json.loads(fin.read()),
                             expectDict)  # Extracted record file correctness

        expectDict = {'0': {'fakeLatin': para, 'classIsBody': para}}
        with open('tmp/para.json', 'r') as fin:
            self.assertEqual(json.loads(fin.read()),
                             expectDict)  # Extracted record file correctness

        expectDict = {'0': {'gift1': ''}}
        with open('tmp/gift1.json', 'r') as fin:
            self.assertEqual(json.loads(fin.read()),
                             expectDict)  # Extracted record file correctness

        expectDict = {'0': {'giftTitles': ''}}
        with open('tmp/giftTitles.json', 'r') as fin:
            self.assertEqual(json.loads(fin.read()),
                             expectDict)  # Extracted record file correctness
Example #53
def runScan(target):

    crawler = Crawler()
    findings = {}

    print("Scanning: ", target)

    findings.clear()
    findings = {"target":target,"sqlinjection":[], "WeakPassword":[]}


    if not crawler.init(target):
        return

    crawler.crawl()
    crawler.findLoginPanel()

    AuthBypass.check_authbypass(crawler.loginFormEndpoints, findings)
    WeakPasswords.check_weak_passwords(crawler.loginFormEndpoints, findings)


    if len(crawler.loginFormEndpoints) > 0:
        findings["loginForm"]="yes"
    else:
        findings["loginForm"] = "no"

    sqli_scan_urls(crawler.uEndPoints, findings)
    sqli_scan_forms(crawler.fEndpoints, findings)
    CommonFunctions.save_findings(findings)
Example #54
    def run(self):
        # Spawn threads and start crawling.
        for i in range(self.no_crawlers):
            crawler = Crawler(i, self.queue, self.visited_urls, self.mutex, self.excluded, self.target_domain, self.robotparser)
            self.crawlers.append(crawler)
            crawler.start()

        # Wait for all crawlers to finish.
        self.queue.join()

        # Notify all crawlers to stop.
        for i in range(self.no_crawlers):
            self.queue.put(None)

        self.queue.join()

        # Wait for all threads to exit
        for t in self.crawlers:
            t.join()
Example #55
	def build(self, keyWord, num, bfs=False):
		if bfs is False:
			queue = WebQueue()
		else:
			queue = BfsQueue()
		top10List = self.__getTop10(keyWord)
		for url in top10List:
			queue.offer(url, 0)
		return Crawler(num, queue)
		
Example #56
    def crawl(max_page):
        text.delete('1.0', END)
        text.insert(END, 'Currently Crawling Please Wait\n')
        search_engine.update()

        count = int(max_page)
        while len(Crawler.queue) > 0 and count > 0:
            queue = str(Crawler.queue.pop())
            Crawler.crawl(queue)
            count -= 1
            text.insert(END, 'Currently Crawling: ' + queue + '\n')
            search_engine.update()

        print('Crawl Finished Can Now Search')
        text.delete('1.0', END)
        text.insert(END, 'Crawl Finished Can Now Search\n')
        text.insert(END, str(len(Crawler.crawled)) + " Url's have been Crawled and Indexed \n")
        text.insert(END, str(len(Crawler.queue)) + " Total Number of Url's In Queue\n")
        search_engine.update()

        Crawler.save_lists()
Example #57
def get_list(section, date):

    # NAVER news url
    naver_news_url = 'http://news.naver.com/main/list.nhn'
    naver_news_parameter = {'mode':'LSD', 'mid':'sec', 'sid1':'', 'date':date, 'page':''}
    naver_news_parameter['sid1'] = section

    page = 1
    url_list = []
    while True:
        naver_news_parameter['page'] = page
        url =  naver_news_url + '?' + urllib.urlencode(naver_news_parameter)
        
        # get html data
        crawler = Crawler()
        web_data = crawler.get_page(url)
        
        # html parsing
        soup= BeautifulSoup(web_data, 'html.parser')
        list_body = soup('div', {'class':'list_body newsflash_body'})[0]

        # get each article's url
        list_body = list_body.findAll('li')
        current_list = []
        for e in list_body:
            current_list.append(e.find('a')['href'])
        del list_body
        
        # break when current page is end of url list
        if current_list[0] in url_list:
            break

        # add to url list
        url_list += current_list

        # next url page
        page += 1

    return url_list
Example #58
 def getWebPage(self, URL, depth):
     '''
     Retrieve all the text data from the webpage(s).

     @param URL: URL which is going to be the source
     @param depth: the depth of the links from the URL which should be searched
                   (default = 0)

     @return: string of all text from all webpages.
     '''
     if int(depth) != 0:
         t = ""
         crawler = Crawler(URL, int(depth)-1)
         crawler.crawl()
         for l in crawler.links_remembered:
             text = self.Alchemy.URLGetText(str(l.dst))     
             element = ET.XML(text)
             t += element.findtext("text")
     else:
         text = self.Alchemy.URLGetText(URL)     
         element = ET.XML(text)
         t = element.findtext("text")
     return t.encode('ascii','ignore')
Example #59
    def test__does_static_file_exist(self):
        exist_codes = [
            "200",
            "300",
            "304"
        ]
        nonexist_codes = [
            "404",
            "500"
        ]

        c = Crawler("http://test.com")

        with mock.patch("Crawler.requests") as mock_requests:
            mock_requests.head.return_value = mock_response = mock.Mock()

            for code in exist_codes:
                mock_response.status_code = code
                self.assertEqual(c._does_static_file_exist(""), True)

            for code in nonexist_codes:
                mock_response.status_code = code
                self.assertEqual(c._does_static_file_exist(""), False)
Example #60
def topology_generator():
    '''Creates the Crawler object and calls all the functions to fill a file with the information.'''

    # NOTE: the credential prompts and the try/with lines here were garbled in the source
    # listing; the prompts and the output file name below are a reconstruction inferred
    # from the surrounding code.
    username = raw_input("Username: ")
    password = raw_input("Password: ")

    try:
        with open("neighbors.txt", "w") as working_file:
            ip_address = raw_input("Please enter your starting IP address: ")

            crawler = Crawler(ip_address, username, password)

            for address in crawler.address_list:
                crawler.update_address(address)
                err_flag = get_lldp.get_switch(crawler, working_file)
                if not err_flag:
                    l3_scrape.bgpcreation(crawler, working_file)

            print "\nScript complete! Check the 'neighbors.txt' file that has been generated.\n"

    except IOError, e:
        print e