def fetch(pointers):
        print "Creating pointer to logs"
        sbook=open("output/success.log","a+")
        fbook=open("output/failure.log","a+")
        print "Starting Crawler"
        while (True):
                terminator = 0
                urls=[]
                for ff in pointers:
                        url = ff.readline().strip()
                        if(len(url)>5):
                                terminator+=1
                                urls.append(url)
      
                print len(urls), sbook, fbook
                try:
                        if(len(urls)>0):
                                crawler.crawl(urls, sbook, fbook)
                        else:
                                send_email("*****@*****.**", "[email protected], [email protected]", "No Urls left", "")
                                break
                except Exception, ex:
                        msg = str(ex)+str(urls)
                        send_email("*****@*****.**", "[email protected], [email protected]", "Error in Crawler", msg)
                sbook.flush()
                fbook.flush()
Example #2
def crawl():
	pagelist=[
	'https://en.wikipedia.org/wiki/Python_(programming_language)',
	'https://en.m.wikipedia.org/wiki/Wikipedia:Former_featured_articles'
	]
	crawler.crawl(pagelist)
	print ('Crawling Completed')
def fetch(pointers):
    print "Creating pointer to logs"
    sbook = open("output/success.log", "a+")
    fbook = open("output/failure.log", "a+")
    print "Starting Crawler"
    while (True):
        terminator = 0
        urls = []
        for ff in pointers:
            url = ff.readline().strip()
            if (len(url) > 5):
                terminator += 1
                urls.append(url)
        send_email("Crawling Started Successfully", str(len(urls)))
        try:
            if (len(urls) > 0):
                crawler.crawl(urls, sbook, fbook)
            else:
                send_email("Crawling Ended Successfully", "")
                break
        except Exception, ex:
            msg = ex[1] + chr(10) + str(urls)
            send_email("Error in Crawler", msg)
        sbook.flush()
        fbook.flush()
Example #4
def main(archivedir, tmp="tmp"):
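    # Create per-publisher working and archive directories, crawl all publishers, then delete the tmp tree.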

    print(f"Running ingest with archivedir={archivedir}")
    if not os.path.exists(tmp):
        print("Creating tmp directory")
        os.mkdir(tmp)
    if not os.path.exists(archivedir):
        print("Creating archive directory")
        os.mkdir(archivedir)

    publishers = [
        "cbc", "star", "post", "global", "globe", "macleans", "herald"
    ]
    # publishers = ["cbc"]

    for p in publishers:
        if not os.path.exists(f"{tmp}/{p}"):
            os.mkdir(f"{tmp}/{p}")
            os.mkdir(f"{tmp}/{p}/articles")
        os.chown(f"{tmp}/{p}", os.getuid(), -1)
        os.chown(f"{tmp}/{p}/articles", os.getuid(), -1)
        if not os.path.exists(f"{archivedir}/{p}"):
            os.mkdir(f"{archivedir}/{p}")
            os.mkdir(f"{archivedir}/{p}/articles")
        os.chown(f"{archivedir}/{p}", os.getuid(), -1)
        os.chown(f"{archivedir}/{p}/articles", os.getuid(), -1)

    print("Crawling")
    crawler.crawl(publishers, tmp, archivedir, True)

    shutil.rmtree(tmp)
Example #5
def readbbs(title, url=None):
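    # Walk the i-bbs.sijex.net image board for the given title, following paging links and downloading each not-yet-seen image (visited URLs are tracked in loop).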
    if not url:
        url = "http://i-bbs.sijex.net/imageBoard.jsp?id=%s" % title
    if url in loop:
        return
    else:
        print url
        loop[url] = True
    try:
        urls = crawl(url)
    except IOError:
        del loop[url]
        sleep(10)
        crawl(url)
        return
    urls.reverse()
    for u in urls:
        host = urlparse(u)[1]
        if host != "i-bbs.sijex.net":
            pass
        elif re.search(r"count=\d+", u):
            readbbs(title, url=u)
        else:
            found = re.search(r"file=(\d+o[^&]*)", u)
            if found:
                fn = found.group(1)
                img = "http://image.i-bbs.sijex.net/bbs/%s/%s" % (title, fn)
                if img in loop:
                    pass
                elif not os.path.exists(fn):
                    print "   ", img
                    loop[img] = True
                    getimg(img, fn)
                    sleep(2)
    return
def fetch(pointers):
    print "Creating pointer to logs"
    sbook=open("output/success.log","a+")
    fbook=open("output/failure.log","a+")
    print "Starting Crawler"
    while (True):
        terminator = 0
        urls=[]
        for ff in pointers:
            url = ff.readline().strip()
            if(len(url)>5):
                terminator+=1
                urls.append(url)
        send_email("Crawling Started Successfully", str(len(urls)))        
        try:
            if(len(urls)>0):
                crawler.crawl(urls, sbook, fbook)
            else:
                send_email("Crawling Ended Successfully", "")
                break
        except Exception, ex:
            msg = ex[1]+chr(10)+str(urls)
            send_email("Error in Crawler", msg)
        sbook.flush()
        fbook.flush()
Example #7
def crawl_pages():
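    # Crawl the relevant and irrelevant URL lists and pickle the collected pages.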
    pages_pos = []
    crawler.crawl("data/relevant.txt", pages_pos)
    pages_neg = []
    crawler.crawl("data/irrelevant.txt", pages_neg)

    with open("data/pages_neg.pickle","wb") as n_out , open("data/pages_pos.pickle","wb") as p_out:
        pickle.dump(pages_neg,n_out)
        pickle.dump(pages_pos,p_out)
def fetch(pointers):
    print "Creating pointer to logs"
    sbook = open("output/success.log", "a+")
    fbook = open("output/failure.log", "a+")
    print "Starting Crawler"
    while (True):
        terminator = 0
        urls = []
        for ff in pointers:
            url = ff.readline().strip()
            if (len(url) > 5):
                terminator += 1
                urls.append(url)

        print len(urls), sbook, fbook
        try:
            if (len(urls) > 0):
                crawler.crawl(urls, sbook, fbook)
            else:
                send_email(
                    "*****@*****.**",
                    "[email protected], [email protected]",
                    "No Urls left", "")
                break
        except Exception, ex:
            msg = str(ex) + str(urls)
            send_email("*****@*****.**",
                       "[email protected], [email protected]",
                       "Error in Crawler", msg)
        sbook.flush()
        fbook.flush()
Example #9
    def find(self, *args, **kwargs):
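        # If no results are shown yet and at most 10 are requested, run the YouTube crawl and fill the listbox; otherwise show a message box.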

        if (self.listbox.size()) == 0:
            if int(self.w.get()) <= 10:
                # loading screen
                self.label.config(text="Searching Youtube",
                                  fg='white',
                                  bg='#FF0000')
                self.label.update_idletasks()
                # crawling the site
                crawl(self.ent.get(), int(self.w.get()))
                self.label.config(text="Completed", fg='white', bg='#FF0000')

                # buttons
                self.op = Button(self.master,
                                 text="Open Video",
                                 command=self.youtube)
                self.op.place(x=720, y=60)

                self.op2 = Button(self.master,
                                  text="Clear Results",
                                  command=self.clear_results)
                self.op2.place(x=720, y=100)

                for item in range(len(final_name)):
                    self.listbox.insert(END, final_name[item])
            else:
                tkinter.messagebox.showinfo(
                    "Limit Exceeded", "You can get results only upto 10")

        else:
            tkinter.messagebox.showinfo("Warning",
                                        "Please clear the previous results.")
Example #10
def mainVerbose(args):
    payload = "<script>alert(1);</script>"  # TODO: Add more scripts to test with
    if args['payload'] != '':
        payload = args['payload']
    if args['inject']:
        injector.inject(args['url'], payload, args['keyword'], args['cookie'])
    else:
        crawler.crawl(args['url'], payload, args['keyword'], args['cookie'])
Example #11
 def test_crawl_given_invalid_schema(self):
     value = "foo://github.com"
     raised = False
     try:
         crawl(value)
     except Exception:
         raised = True
     self.assertTrue(raised)
Example #12
 def test_crawl_given_invalid_url(self):
     value = "https://example.reee"
     raised = False
     try:
         crawl(value)
     except Exception:
         raised = True
     self.assertTrue(raised)
Example #13
 def test_crawl_given_valid_url(self):
     value = "https://github.com"
     raised = False
     try:
         crawl(value)
     except Exception:
         raised = True
     self.assertFalse(raised)
Example #14
def crawl():
    """ Run the crawler, download data to Database  """
    print "downloading......"

    import crawler
    crawler.crawl()

    print "Done!"
Example #15
def main():
    ts = time()
    lat = 40.68828329999999
    lng = -73.98899849999998
    logger.info('started...')
    crawl(lat, lng)
    logger.info('stopped.')
    logger.info(f'took {time() - ts}s')
Example #16
def mainCLI(args):
    payload = "<script>alert(1);</script>"  #TODO: Add more scripts to test with
    if args.payload != '':
        payload = args.payload
    if args.inject:
        injector.inject(args.url, payload, args.keyword, args.cookie)
    else:
        crawler.crawl(args.url, payload, args.keyword, args.cookie)
Example #17
def startCrawl(ws):
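    # Pull the crawl parameters out of the session and start the crawler over the websocket.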

    #fakeCrawler.crawl(ws)
    global userId, url, limit, sType, keyword
    url = session['url']
    limit = session['limit']
    sType = session['sType']
    keyword = session['keyword']
    crawler.crawl(ws, url, int(limit), sType, keyword)
Example #18
def main(filepath,evn):
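    # Tally JS requests from the log, build one URL per domain's top page, and crawl them.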

    c = Counter()
    for _,js_domain,js_uri,page_url in read_js_log(filepath):
        c.log(js_domain+js_uri, page_url)
    domains = c.calc()
    urls = ['http://'+d.topPage().url for d in domains]
    print("get urls: %d" % (len(urls),))
    crawl(urls,evn)
Example #19
def downloadBooks(books, bookfldr):
  print "Books to be downloaded:"
  print books
  if bookfldr[-1] not in ("/", os.path.sep):
      bookfldr += os.path.sep
  bookfldr = bookfldr.replace("/",os.path.sep)
  for b in books:
      constants.mkdir((bookfldr+b+os.path.sep+"contents"))
      crawler.crawl(b,bookfldr)
Example #20
def crawl_pages():
    pages_pos = []
    crawler.crawl("data/relevant.txt", pages_pos)
    pages_neg = []
    crawler.crawl("data/irrelevant.txt", pages_neg)

    with open("data/pages_neg.pickle",
              "wb") as n_out, open("data/pages_pos.pickle", "wb") as p_out:
        pickle.dump(pages_neg, n_out)
        pickle.dump(pages_pos, p_out)
Example #21
def check_page():
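    # Re-crawl the watched page, detect changes by MD5 hash, archive it, send the filled-in documents once, and notify by mail.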
    page = crawl(configuration['targetURL'])  # .decode("utf8")
    page_hash = md5(page)
    c = load()
    if not c['hash'] == page_hash:
        print("HASH CHANGED! (" + page_hash + ")")

        # Run a background thread to archive the page in the web archive
        start_new_thread(crawl, ("https://web.archive.org/save/" +
                                 configuration['targetURL'], False))

        # Check if the file is online and we didn't send the mail already (if so, send it)
        match = parse(page.decode('utf8'))
        if match is not None and not c['mailSent']:
            print(
                "FILE IS ONLINE! Sending mails ... (and we didn't sent them already)"
            )
            docx = crawl(match)
            for person_details in configuration['details']:
                variables = {
                    "name": person_details['name'],
                    "year": person_details['targetYear'],
                    "quarter": person_details['quarter'],
                    "mail": person_details['mail'],
                    "streetAndCity": person_details['streetAndCity'],
                    "phone": person_details['phone'],
                    "matrikelnr": person_details['matrikelnr']
                }
                res = parser.update_document_contents(docx, person_details)
                res_filename = "Antrag Wohnheimzimmer " + variables[
                    'quarter'] + " " + variables['year'] + ".docx"
                mail.send(configuration['mail'], variables, res, res_filename)
            c['mailSent'] = True

        # Send a mail regardless of the above that there is a change
        notification_conf = {
            "body":
            "Something changed! Go and visit " + configuration['targetURL'],
            "subject":
            "IMPORTANT | The watched website has changed! Go check it immediately!",
            "recipient": configuration['mail']['notificationRecipient'],
            "server": configuration['mail']['server']
        }
        if c['mailSent']:
            notification_conf[
                'body'] += "\n\n Oh and btw I already sent your reservation request ;)\n\n Have a good one!\n - AccommodationBot"
        mail.send(notification_conf)

        c['hash'] = page_hash
    else:
        print("Boring old same page...")

    save(c)
Example #22
def main():
    print('Collecting the data: ')
    print(
        'This may take some time.....................................................'
    )
    crawl()
    print('Analyzing the data ............... Almost there...........')
    import sentiment
    try:
        server = HTTPServer(('', 1456), server_handler)
        print('Server Started')
        server.serve_forever()
    except KeyboardInterrupt:
        print("Exiting")
        server.socket.close()
Example #23
def fill_tables():
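    # Open a PostgreSQL connection using config() and let the crawler populate the tables.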
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        print("Filling PostgreSQL database")

        crawler.crawl(conn)

    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()
            print("Database connection closed.")
Example #24
def main():
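    # Crawl the season's game scores, normalize home/away/neutral venues, and print matchup probabilities and team ratings.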
    print len(teamslist)
    (teamsdict, (homescores, awayscores, neutscores)) = \
            crawl(update=1, bb=1, url="cbbga13.txt")

    # normalize scores.
    homescores, awayscores, neutscores = norm_venues(homescores,
            awayscores, neutscores)

    # get final scores matrix.
    scores = homescores + awayscores + neutscores
    # scores = (scores > scores.T)

    games = [("Kansas", "Ohio St."), ("Louisville", "Kentucky")]


    print_probs(teamsdict, scores, games)

    print; print

    valsp = prob_network_ratings(scores)
    valse = energy_min(scores)
    vals = [valsp, valse]

    wins = scores > scores.T
    losses = scores < scores.T

    wl = zip(wins.sum(1), losses.sum(1))

    print_values(teamsdict, teamslist, vals, wl)
Example #25
def startCrawl(ws):
	global userId, url, limit, sType, keyword
	userId = session['userId']
	url = session['url']
	limit = session['limit']
	sType = session['sType']
	keyword = session['keyword']
	path = []
	found = False
	early = False

	database = mongo.db.test #access test collection
	postid = database.insert({'userId' : userId, 'url': url, 'limit': limit, 'sType' : sType, 'keyword' : keyword, 'path': path, 'found': found, 'early': early})

	#call crawler, passing socket and db info
	crawler.crawl(ws, url, int(limit), sType, keyword, postid, database)
Example #26
 def test_external_site(self):
     """
     Test that crawling a page with a link to an external site won't
     lead to following that external site.
     """
     result_sitemap = crawl("https://contains-external-site-link.com")
     self.assertNotIn("https://external-site.com", result_sitemap)
Example #27
def gather():
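	# Crawl each MID number, parse dispute, incident, source, narrative, and article data, and collect the results in mids.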
	for num in midNums:
		datastr=str(crawler.crawl(num))
		
		incidentSave=[]
		data={}
		datastr=cleaner(datastr)
		
		data = disputeInfoGather(datastr)
		incidentData=incidentGather(datastr) 
		data['Sources']=sourceSearchGather(datastr)
		#data['Rest']=narrativeGather(datastr)
		stuff=narrativeGather(datastr)
		for i in stuff:
			data[i]=stuff[i]
		for i in range(len(incidentData)):
			incidentSave.append(incidentData[i])

		data['Incidents']=incidentData
		data['MID 2.1 Sources']=twoSources(datastr)
		data['Articles']=articleGather(datastr)
		#mids.append(copy.deepcopy(data))
		mids.append(data)
		print "FINISHED PARSING MID NUMBER "+num
	return mids
Example #28
def test_invalid_type():
    scenario = [MetadataRecord('wrong_field', [1, 2, 3])]

    with pytest.raises(CrawlingError) as exc:
        list(crawl(scenario))

    info = exc.value
    assert info.args[0] == "The type of field 'wrong_field' is unknown"
Example #29
def main():
    initiate()
    config = Config()

    crawl_interval = timedelta(
        minutes=int(config['settings']['crawl_interval_minutes'])
    )

    last_update = datetime(1, 1, 1, tzinfo=pytz.utc)

    # Get urls to crawl
    domains = db.query.query_domains()

    while datetime.now(tz=pytz.utc) - last_update >= crawl_interval:
        # crawl all Urls
        crawl(domains)
        last_update = datetime.now(tz=pytz.utc)
Example #30
def index():
    lang = request.args.get('lang')
    if lang is None:
        lang = "java"
    response = call(lang)
    soup = BeautifulSoup(response, 'html5lib')
    divs = soup.find_all("div")
    return crawl(divs)
Example #31
def main():
    payload = crawl(month, year)
    filename = f"mpba-{month}-{year}.json"
    filepath = save(filename, payload, output_path)

    employees = parse(payload)
    crawler_result = build_crawler_result(month, year, employees, [filepath])
    print(json.dumps({'cr': crawler_result}, ensure_ascii=False))
Example #32
def send_data(message):
    subreddits = str(message.text.split(" ")[1])
    try:
        answer = crawler.crawl(subreddits)
    except:
        answer = emoji.emojize("Something has happened. Our friends are working on it. We'll be right back! :thumbs_up: ")
   
    bot.reply_to(message, answer)
Example #33
 def test_max_depth(self):
     """
     Check that when we're parsing a webpage with links to pages containing
     more links (and so on), we stop after (and only after) a given depth.
     """
     result = crawl("https://deep-chain.com", max_depth=4)
     self.assertIn("https://4.deep-chain.com", result)
     self.assertNotIn("https://5.deep-chain.com", result)
def fetch(urls, sbook, fbook):
    #send_email("Crawling Started", str(len(urls)))
    now = time.time()
    num_processed, success_count, failure_count = crawler.crawl(urls, sbook, fbook)
    try:
        msg = "Total Processed: "+str(num_processed)+chr(10)+"Success Count: "+str(success_count)+chr(10)+"Failure Count: "+str(failure_count)+" in "+str((time.time()-now)/60)+" minutes."
        #send_email("Crawling Successful", msg)
    except Exception, ex:
        send_email("Crawler Error:", ex[1])
Example #35
def findenter(url):
    """Find the URLs that are in same hosts."""
    host = urlparse(url)[1]
    urls = crawl(url)
    for u in urls:
        h = urlparse(u)[1]
        if (h == host) and (not pat_image.search(u)) \
                       and (not pat_media.search(u)):
            print u
Example #36
def email_generator(usernames, domains=[], links=[]):
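	# Guess email addresses by scraping the given domains and links and by pairing usernames with common providers; returns a dict of email -> confidence.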
	emails = dict()
	max_confidence = max(usernames.values())
	common_domains = ['gmail.com', 'yahoo.com', 'hotmail.com', 'aol.com']

	username_from_email = lambda email: email[:email.index('@')]

	for domain in domains:
		if domain[:4] != 'http':
			domain = 'http://' + domain
		# Might not have permission to scrape
		try:
			internal_links = crawl(domain)
			for l in internal_links:
				resp = urllib2.urlopen(l).read()
				parsed_emails = parse_HTML(resp)
				for email in parsed_emails:
					emails[email] = max_confidence
		except:
			pass

		# Common emails of people with their own domains
		# If any of these exist, the chance of it being their email
		# is highly likely since it's a personal domain
		stripped_domain = strip_domain(domain)
		emails['admin@' + stripped_domain] = max_confidence
		emails['info@' + stripped_domain] = max_confidence
		emails['me@' + stripped_domain] = max_confidence

	for link in links:
		if link:
			if link[:4] != 'http':
				link = 'http://' + link
			# Might not have permission to scrape
			try:
				resp = urllib2.urlopen(link).read()
				parsed_emails = parse_HTML(resp)
				for email in set(parsed_emails):
					if email in emails:
						emails[email] *= 1.5
					else:
						emails[email] = max_confidence
			except:
				pass

	for username in usernames:
		for domain in common_domains:
			email = username + '@' + domain
			if email in emails:
				emails[username + '@' + domain] += usernames[username]
			else:	
				emails[username + '@' + domain] = usernames[username]

	return emails
Example #37
def worker(crawler, platform_game):
        before_crawler = time.time()
        rooms = crawler.crawl()
        after_crawler = time.time()
        _store_rooms(rooms, platform_game)
        print '*****************'
        print 'crawler {count} rooms from {platform_name} {game_name}'.format(
            count=len(rooms),
            platform_name=platform_game.platform_name,
            game_name=platform_game.game_name)
        after_save = time.time()
        print 'Crawl time cost: ', after_crawler - before_crawler
        print 'Save time cost: ', after_save - after_crawler
Example #38
def search(request):
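    # Django view: on POST, crawl from the seed URL to the given depth and return the URLs containing the search string.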
    if request.method == "POST":
        form = SearchForm(request.POST)
        if form.is_valid():
            searchstring = form["searchstring"].data
            urlseed = form["urlseed"].data
            depth = form["depth"].data
            result = crawler.crawl(searchstring,urlseed,int(depth))
            html = "<html><body>Url's containing search text are: %s</body></html>" % result
            return HttpResponse(html)
    else:
        form = SearchForm()
    return render_to_response('searchhome.html', { 'form' : form,})
Example #39
def main():

    url = 'http://vg.no'
    keywords = ['snegl',
                'kropp',
                'dette',
                'paradise',
                'pladask',
                'miss universe',
                'frøken norge',
                'sex',
                'triks',
                'pokemon',
                'undertøy',
                'hollywood',
                'kjendis',
                'øvelse',
                'slik',
                'slank',
                'digg'
            ]

    old_posts = database.get_posts()

    try:

        new_posts = crawler.crawl(url, keywords)

        new = 0

        for k,v in new_posts.items():
            if k not in old_posts.keys():
                title = v['title'].encode('utf-8')
                print 'POSTING' , title
                print twitter.post_status(title)

                database.add_post(k, v)
                new += 1
                time.sleep(1)

        if new == 0:
            print 'no new posts found'
        elif new == 1:
            print 'one new post found'
        else:
            print str(new) + ' new posts found'

    except requests.exceptions.ConnectionError:
        print 'could not connect'
Example #40
    def post(self):
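        # Dispatch on the current mode: render search results, re-crawl and store the document list, or parse all documents into an inverted index.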
        current_mode = self.getCurrentMode()

        if current_mode == MODE.SEARCH:
            user_search = self.request.get('inputSearch')
            
            words = user_search.split()
            words = list(set(words))
        
            results = []
            
            self.render('search.html',
                        str_address = 'search',
                        str_active = CLASS_ACTIVE,
                        query = user_search,
                        results = results[:10],
                        bsearch = True)
        elif current_mode == MODE.CRAWL:
            try:
                logging.info('fetching urls from ' + MYURL)
                crawled = crawl(MYURL)
            except:
                logging.error('An error occured when crawling from url: ' + MYURL)
                self.redirect('/search')
                return
            
            DocMapping.clean()
            DocMapping.add_list(crawled)
            self.redirect('/crawled')

        elif current_mode == MODE.PARSE:
            # parsing mode: extract content from all documents and build data structures.
            # build inverted index
            docs = DocMapping.all()
            invertedIndex ={}
            termFrequency = {}
            termFrequencyByDoc = {}
            docFrequencyByTerm = {}
            logging.info('starting to parse all documents')
            for d in docs:
                parse(d, invertedIndex, termFrequency, termFrequencyByDoc, docFrequencyByTerm)
            logging.info('parsing done!')
            # we need to store this in a blob or cloud storage for later
            #json_str = json.dumps(invertedIndex)
            #json_str = json.dumps(termFrequency)
            #json_str = json.dumps(termFrequencyByDoc)
            #json_str = json.dumps(docFrequencyByTerm)
            self.redirect('/search')
Example #41
def main():
    """
    Entry function that organizes the crawling process; an explicit entry point like this is not strictly required in Python.
    """
    # Set the system default encoding to utf-8
    default_encoding = 'utf-8'
    if sys.getdefaultencoding() != default_encoding:
        reload(sys)
        sys.setdefaultencoding(default_encoding)

    # Collect the required data
    url = 'http://weibo.cn/moegirlwiki'  # any Sina Weibo mobile page URL
    DB.dbuser = '******'  # MySQL database username
    DB.dbpassword = '******'  # MySQL database password
    crawler.username = raw_input('Please enter your Sina passport username: ')  # login name for the crawled site
    crawler.password = getpass('Please enter your Sina passport password: ')  # login password for the crawled site
    crawler.cookie = login(crawler.username, crawler.password)  # obtain the first valid cookie

    # Initialize the database; uncomment if needed
    # DB.init_DB()

    # Start the proxy-pool daemon process
    proxies.daemon_pool(test_url='http://weibo.cn', timeout=3, interval=30)

    # Run the crawl
    time0 = time()
    weibo_cnt = crawler.crawl(
            url=url,
            parser=parser,
            login=login,
            write_DB=DB.write_DB,
            get_proxy=proxies.get_proxy,
            average_delay=3,
            page_limit=1000,
            # record_raw_data = True
    )
    time_cost = time() - time0
    print('Crawled', weibo_cnt, 'posts, took ' + get_time_cost_str(time_cost))
def fetch(pointers):
    sbook=open("output/success.log","a+")
    fbook=open("output/failure.log","a+")
    while (True):
        terminator = 0
        urls=[]
        for ff in pointers:
            url = ff.readline().strip()
            if(len(url)>5):
                terminator+=1
                urls.append(url)
                
        try:
            if(len(urls)>0):
                num_processed, success_count, failure_count = crawler.crawl(urls, sbook, fbook)
                try:
                    msg = "Total Processed: "+str(num_processed)+chr(10)+"Success Count: "+ str(success_count) +chr(10)+"Failure Count: "+str(failure_count) 
                    #send_email("Crawling Successfull", msg)
                except Exception, ex:
                    send_email("Crawler Error:", ex[1])
                    
            else:
                break
Example #43
def crawl(request):
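    # Django view: read the query terms, seed URL, and depth from the request and render the crawl results.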
    try:
        terms = map(str, request.GET['q'].split())
        seed = str(request.GET['seed'])
    except KeyError:
        terms = None
        seed = None
    else:
        if 'http' not in seed:
            seed = 'http://' + seed

    try:
        depth = int(request.GET['depth'][0])
    except:
        depth = 2 # default depth
    if depth < 3:
        depth = 1

    return render_to_response('rycrawler/crawl.html', {
        'seed': seed,
        'terms': terms,
        'sites': crawler.crawl(seed, depth, terms),
    })
Example #44
def main():
	crawler = SaksCrawler()
	crawler.crawl()

#main()
Example #45
def main():
	crawler = ShopBopCrawler()
	crawler.crawl()
Example #46
def main():
    crawler = NordstromCrawler()
    crawler.crawl()
Example #47
import crawler
page_name = 'xxx'
base_url = 'http://aqvatarius.com/themes/intuitive/'
first_page = 'index.html'
retry_times = 3
result = crawler.crawl(page_name, base_url, first_page, retry_times)
if result == True:
    print('oh yeah.')
Example #48
import unittest
from collections import namedtuple
from common import event
from crawler import prepare, get_items, crawl
import crawler

crawler.add_link = lambda *args, **kwargs: crawl(*args, **kwargs)
crawler.website_sleep_time = 0


html = lambda body: "<html><head></head><body>{}</body></html>".format(body)

test_site_info = namedtuple(
    'test_site_info',
    'name baseurl urlnosaveregex urlregex html_hints test_html test_answer'
)

tests = [

    test_site_info(
        name='test1',
        baseurl='http://test1/',
        test_html={
            'http://test1/': html("<a href='/nosave'></a>"),
            'http://test1/nosave': html("<a href='/event'></a>"),
            'http://test1/event': html("<span>answer</span>"),
        },
        urlnosaveregex='/nosave',
        urlregex='/event',
        html_hints={'name': 'span'},
        test_answer=event(html=html('<span>answer</span>'))
Example #49
def main():
    crawler = JcrewCrawler()
    crawler.crawl()
Example #50
def main():
	crawler = KarmaCrawler()
	crawler.crawl()
Example #51
def main():
    crawler = UrbanOutfittersCrawler()
    crawler.crawl()
Example #52
if args.command == "add":
    #remove options from the list of arguments, as options is useless for adding tags
    parser._actions[-1].container._remove_action(parser._actions[-1])
    parser.add_argument('-b', '--bulk', nargs='*', help='Tag all files under a directory recursively with one set of specified tags')
    parser.add_argument('-i', '--individual', nargs='*', help='Tag one or more files with a set of specified tags for each file')
    parser.add_argument('-p', '--path', nargs='*', help='Tag all files under a directory recursively by their parent directories e.g. a file under /home/user would be tagged home and user. If the first argument passed to -p is not a directory, it will be interpreted as a tag or list of tags (separated by a comma and a space) to NOT assign to files')
    parser.add_argument('-f', '--filename', nargs='*', help='Tag files or directories of files by their filename, tags separated by a hyphen e.g. a file named python-important-due_friday would be tagged python, important, and due_friday')
    parser.add_argument('--timetest', nargs='?', help='Test the speed of the hashing/adding function using the path argument')
    args = parser.parse_args()
    
    if args.bulk is not None:
        tags = args.bulk[0].split(', ')
        dirs = args.bulk[1:]
        for directory in dirs:
            files = crawl().get_filepaths(directory)
            print(files)
            tagging.tagAdd().bulk(files, tags)
    
    if args.individual is not None:
        for directory in args.individual:
            if os.path.isdir(directory):
                files = crawl().get_filepaths(directory)
                for file in files:
                    tags = input("Tags to assign to %s: " % file).split(', ')
                    file = os.path.realpath(file)
                    tagging.tagAdd().add_tag(tags, file)
            else:
                file = os.path.realpath(directory)
                print(file)
                tags = input("Tags to assign to %s: " % directory).split(', ')
Example #53
def main():
	crawler = NastyCrawler()
	crawler.crawl()
Example #54
import crawler
import sys

page = 'https://www.boardgames-bg.com'
crawler.crawl(page)
Example #55
 def go(self):
     url = self.url_input.text()
     res = crawl(url)
     self.qas.setText('\n'.join(res))
import crawler
crawler = crawler.crawler(None,'urls.txt')
crawler.crawl(depth=1)
print "====Printing resolved inverted index===="
print crawler.get_resolved_inverted_index()
Example #57
    def post(self):
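        # Normalize each submitted ISBN, crawl book data for titles not yet cached, and add the book to the user's bookshelf.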
        isbn_input = self.request.get("isbn").strip()
        valid_isbn = re.compile(r'\b(\d{13}|\d{10})\b')
        isbns = re.findall(valid_isbn, isbn_input.replace("-",""))
        booksfound = []

        logging.error("ISBNs: %s" % isbns)
        
        for isbn in isbns:

            if len(isbn)==10:
                newisbn = ''.join(['978', isbn])
                check = unicode((10 - (sum(int(digit) * (3 if idx % 2 else 1) for idx, digit in enumerate(newisbn[:12])) % 10)) % 10)
                isbn = newisbn[:12] + check

            usersbook = Bookshelf.by_name(self.username, isbn)
            # usersbook = None     #delete this line
            if usersbook:
                """book already on bookshelf"""
                # self.response.write("On Shelf")
                self.render("addbooks.html", isbn=isbn, error_isbn='Book is on your bookshelf already')
                return
            else:
                intelligence_book = Intelligence.by_name(isbn)
                # intelligence_book = None           # delete this line
                if not intelligence_book:
                    """scrape off websites and add intelligence"""
                    # self.response.write("Not in Database\n\n")
                    newbookkey = items.bookitem_key(isbn, parent=items.bookitem_key())
                    newbook = items.BookItem(key=newbookkey)
                    newbook.isbn = isbn
                    bookstw_url = ''.join(["http://search.books.com.tw/exep/prod_search.php?cat=BKA&key=", isbn])
                    eslitetw_url = ''.join(["http://www.eslite.com/Search_BW.aspx?query=", isbn])
                    crawler.crawl(newbook, bookstw_url, crawler.create_bookstw_searchresult_callback,
                                           eslitetw_url, crawler.create_eslitetw_searchresult_callback,
                                           googlebooksjs = True)
                    # self.response.write(newbook)

                    if newbook.name:

                        for (i, url) in enumerate(newbook.image_urls):
                            try:
                                result = urlfetch.fetch(url, deadline=10)
                            except DeadlineExceededError:
                                logging.error("Deadline Exceeded While Fetching Book Image\n\n\n")
                                return
                            except DownloadError:
                                logging.error("Download Error While Fetching Book Image. Check network connections.")
                                return
                            if result.status_code == 200:
                                newbook.images.append(result.content)
                                # self.response.write('<img src="/_getimage?key=%s&idx=%s" />' % (newbookkey.id(), str(i)))
                            else:
                                newbook.images.append(None)

                        newbookkey = newbook.update()
                        # logging.error("before %s" % booksfound)
                        booksfound.append(newbook)
                        # logging.error("after %s" % booksfound)
                        # self.render("showbook.html", book=newbook)

                        key = ndb.Key('Intelligence', isbn, parent=intelligence_key(DEFAULT_INTELLIGENCE_NAME))
                        intelligence = Intelligence(key=key)
                        intelligence.isbn = isbn
                        intelligence.name = newbook.name
                        intelligence.language = newbook.language
                        intelligence.update()
                    else:
                        self.render("addbooks.html", isbn=isbn,
                            error_isbn='ISBN not found')
                        return

                else:
                    book = items.BookItem.by_name(isbn)
                    if book is not None:
                        booksfound.append(book)

                bookshelf_book = Bookshelf(key=bookshelf_key(isbn, parent=bookshelf_key(self.username)))
                bookshelf_book.username = self.username
                bookshelf_book.isbn = isbn
                bookshelf_book.status = 4
                # rating = StellarRating(linguistic=0, logical=0, right_minded=0,intrapersonal=0, interpersonal=0, naturalistic=0)
                bookshelf_book.raters = StellarRaters()
                bookshelf_book.rating = StellarRating()
                bookshelf_book.update()
                # self.response.write(''.join([isbn," added to bookshelf"]))

        # logging.error("books found %s" % booksfound)
        self.render("showbook.html", head_title="View Books Added", books=booksfound)
Example #58
def main():
    crawler = BestBuyCrawler()
    crawler.crawl()