def fetch(pointers): print "Creating pointer to logs" sbook=open("output/success.log","a+") fbook=open("output/failure.log","a+") print "Starting Crawler" while (True): terminator = 0 urls=[] for ff in pointers: url = ff.readline().strip() if(len(url)>5): terminator+=1 urls.append(url) print len(urls), sbook, fbook try: if(len(urls)>0): crawler.crawl(urls, sbook, fbook) else: send_email("*****@*****.**", "[email protected], [email protected]", "No Urls left", "") break except Exception, ex: msg = ex+str(urls) send_email("*****@*****.**", "[email protected], [email protected]", "Error in Crawler", "%s")%(msg) sbook.flush() fbook.flush()
def crawl():
    pagelist = [
        'https://en.wikipedia.org/wiki/Python_(programming_language)',
        'https://en.m.wikipedia.org/wiki/Wikipedia:Former_featured_articles'
    ]
    crawler.crawl(pagelist)
    print('Crawling Completed')
def fetch(pointers): print "Creating pointer to logs" sbook = open("output/success.log", "a+") fbook = open("output/failure.log", "a+") print "Starting Crawler" while (True): terminator = 0 urls = [] for ff in pointers: url = ff.readline().strip() if (len(url) > 5): terminator += 1 urls.append(url) send_email("Crawling Started Successfully", str(len(urls))) try: if (len(urls) > 0): crawler.crawl(urls, sbook, fbook) else: send_email("Crawling Ended Successfully", "") break except Exception, ex: msg = ex[1] + chr(10) + str(urls) send_email("Error in Crawler", msg) sbook.flush() fbook.flush()
def main(archivedir, tmp="tmp"):
    print(f"Running ingest with archivedir={archivedir}")
    if not os.path.exists(tmp):
        print("Creating tmp directory")
        os.mkdir(tmp)
    if not os.path.exists(archivedir):
        print("Creating archive directory")
        os.mkdir(archivedir)
    publishers = [
        "cbc", "star", "post", "global", "globe", "macleans", "herald"
    ]
    # publishers = ["cbc"]
    for p in publishers:
        if not os.path.exists(f"{tmp}/{p}"):
            os.mkdir(f"{tmp}/{p}")
            os.mkdir(f"{tmp}/{p}/articles")
            os.chown(f"{tmp}/{p}", os.getuid(), -1)
            os.chown(f"{tmp}/{p}/articles", os.getuid(), -1)
        if not os.path.exists(f"{archivedir}/{p}"):
            os.mkdir(f"{archivedir}/{p}")
            os.mkdir(f"{archivedir}/{p}/articles")
            os.chown(f"{archivedir}/{p}", os.getuid(), -1)
            os.chown(f"{archivedir}/{p}/articles", os.getuid(), -1)
    print("Crawling")
    crawler.crawl(publishers, tmp, archivedir, True)
    shutil.rmtree(tmp)
def readbbs(title, url=None):
    if not url:
        url = "http://i-bbs.sijex.net/imageBoard.jsp?id=%s" % title
    if url in loop:
        return
    else:
        print url
        loop[url] = True
    try:
        urls = crawl(url)
    except IOError:
        del loop[url]  # loop is a dict of visited urls, so delete the key (dicts have no .remove)
        sleep(10)
        crawl(url)
        return
    urls.reverse()
    for u in urls:
        host = urlparse(u)[1]
        if host != "i-bbs.sijex.net":
            pass
        elif re.search(r"count=\d+", u):
            readbbs(title, url=u)
        else:
            found = re.search(r"file=(\d+o[^&]*)", u)
            if found:
                fn = found.group(1)
                img = "http://image.i-bbs.sijex.net/bbs/%s/%s" % (title, fn)
                if img in loop:
                    pass
                elif not os.path.exists(fn):
                    print " ", img
                    loop[img] = True
                    getimg(img, fn)
                    sleep(2)
    return
def fetch(pointers): print "Creating pointer to logs" sbook=open("output/success.log","a+") fbook=open("output/failure.log","a+") print "Starting Crawler" while (True): terminator = 0 urls=[] for ff in pointers: url = ff.readline().strip() if(len(url)>5): terminator+=1 urls.append(url) send_email("Crawling Started Successfully", str(len(urls))) try: if(len(urls)>0): crawler.crawl(urls, sbook, fbook) else: send_email("Crawling Ended Successfully", "") break except Exception, ex: msg = ex[1]+chr(10)+str(urls) send_email("Error in Crawler", msg) sbook.flush() fbook.flush()
def fetch(pointers): print "Creating pointer to logs" sbook = open("output/success.log", "a+") fbook = open("output/failure.log", "a+") print "Starting Crawler" while (True): terminator = 0 urls = [] for ff in pointers: url = ff.readline().strip() if (len(url) > 5): terminator += 1 urls.append(url) print len(urls), sbook, fbook try: if (len(urls) > 0): crawler.crawl(urls, sbook, fbook) else: send_email( "*****@*****.**", "[email protected], [email protected]", "No Urls left", "") break except Exception, ex: msg = ex + str(urls) send_email("*****@*****.**", "[email protected], [email protected]", "Error in Crawler", "%s") % (msg) sbook.flush() fbook.flush()
def find(self, *args, **kwargs):
    if (self.listbox.size()) == 0:
        if int(self.w.get()) <= 10:
            # loading screen
            self.label.config(text="Searching Youtube", fg='white', bg='#FF0000')
            self.label.update_idletasks()
            # crawling the site
            crawl(self.ent.get(), int(self.w.get()))
            self.label.config(text="Completed", fg='white', bg='#FF0000')
            # buttons
            self.op = Button(self.master, text="Open Video", command=self.youtube)
            self.op.place(x=720, y=60)
            self.op2 = Button(self.master, text="Clear Results", command=self.clear_results)
            self.op2.place(x=720, y=100)
            for item in range(len(final_name)):
                self.listbox.insert(END, final_name[item])
        else:
            tkinter.messagebox.showinfo(
                "Limit Exceeded", "You can get results only up to 10")
    else:
        tkinter.messagebox.showinfo("Warning", "Please clear the previous results.")
def mainVerbose(args):
    payload = "<script>alert(1);</script>"  # TODO: Add more scripts to test with
    if args['payload'] != '':
        payload = args['payload']
    if args['inject']:
        injector.inject(args['url'], payload, args['keyword'], args['cookie'])
    else:
        crawler.crawl(args['url'], payload, args['keyword'], args['cookie'])
def test_crawl_given_invalid_schema(self):
    value = "foo://github.com"
    raised = False
    try:
        crawl(value)
    except Exception:
        raised = True
    self.assertTrue(raised)
def test_crawl_given_invalid_url(self):
    value = "https://example.reee"
    raised = False
    try:
        crawl(value)
    except Exception:
        raised = True
    self.assertTrue(raised)
def test_crawl_given_valid_url(self):
    value = "https://github.com"
    raised = False
    try:
        crawl(value)
    except Exception:
        raised = True
    self.assertFalse(raised)
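# The three tests above use a manual raised/not-raised flag. unittest's
# assertRaises context manager expresses the same intent more directly;
# a minimal sketch of the invalid-schema case rewritten that way (assuming
# the same crawl import and test class as the surrounding tests):
def test_crawl_given_invalid_schema_with_assertRaises(self):
    value = "foo://github.com"
    with self.assertRaises(Exception):
        crawl(value)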
def crawl(): """ Run the crawler, download data to Database """ print "downloading......" import crawler crawler.crawl() print "Done!"
def main():
    ts = time()
    lat = 40.68828329999999
    lng = -73.98899849999998
    logger.info('started...')
    crawl(lat, lng)
    logger.info('stopped.')
    logger.info(f'took {time() - ts}s')
def mainCLI(args):
    payload = "<script>alert(1);</script>"  # TODO: Add more scripts to test with
    if args.payload != '':
        payload = args.payload
    if args.inject:
        injector.inject(args.url, payload, args.keyword, args.cookie)
    else:
        crawler.crawl(args.url, payload, args.keyword, args.cookie)
def startCrawl(ws):
    # fakeCrawler.crawl(ws)
    global userId, url, limit, sType, keyword
    url = session['url']
    limit = session['limit']
    sType = session['sType']
    keyword = session['keyword']
    crawler.crawl(ws, url, int(limit), sType, keyword)
def main(filepath, evn):
    c = Counter()
    for _, js_domain, js_uri, page_url in read_js_log(filepath):
        c.log(js_domain + js_uri, page_url)
    domains = c.calc()
    urls = ['http://' + d.topPage().url for d in domains]
    print("get urls: %d" % (len(urls),))
    crawl(urls, evn)
def downloadBooks(books, bookfldr):
    print "Books to be downloaded:"
    print books
    # ensure the folder path ends with a separator
    # (the original `is not ("/" or os.path.sep)` only ever compared against "/")
    if bookfldr[-1] not in ("/", os.path.sep):
        bookfldr += os.path.sep
    bookfldr = bookfldr.replace("/", os.path.sep)
    for b in books:
        constants.mkdir(bookfldr + b + os.path.sep + "contents")
        crawler.crawl(b, bookfldr)
def crawl_pages():
    pages_pos = []
    crawler.crawl("data/relevant.txt", pages_pos)
    pages_neg = []
    crawler.crawl("data/irrelevant.txt", pages_neg)
    with open("data/pages_neg.pickle", "wb") as n_out, \
         open("data/pages_pos.pickle", "wb") as p_out:
        pickle.dump(pages_neg, n_out)
        pickle.dump(pages_pos, p_out)
def check_page():
    page = crawl(configuration['targetURL'])  # .decode("utf8")
    page_hash = md5(page)
    c = load()
    if c['hash'] != page_hash:
        print("HASH CHANGED! (" + page_hash + ")")
        # Run a background thread to archive the page in the web archive
        start_new_thread(crawl, ("https://web.archive.org/save/" + configuration['targetURL'], False))
        # Check if the file is online and we didn't send the mail already (if so, send it)
        match = parse(page.decode('utf8'))
        if match is not None and not c['mailSent']:
            print("FILE IS ONLINE! Sending mails ... (and we didn't send them already)")
            docx = crawl(match)
            for person_details in configuration['details']:
                variables = {
                    "name": person_details['name'],
                    "year": person_details['targetYear'],
                    "quarter": person_details['quarter'],
                    "mail": person_details['mail'],
                    "streetAndCity": person_details['streetAndCity'],
                    "phone": person_details['phone'],
                    "matrikelnr": person_details['matrikelnr']
                }
                res = parser.update_document_contents(docx, person_details)
                res_filename = "Antrag Wohnheimzimmer " + variables['quarter'] + " " + variables['year'] + ".docx"
                mail.send(configuration['mail'], variables, res, res_filename)
            c['mailSent'] = True
        # Regardless of the above, send a notification mail that something changed
        notification_conf = {
            "body": "Something changed! Go and visit " + configuration['targetURL'],
            "subject": "IMPORTANT | The watched website has changed! Go check it immediately!",
            "recipient": configuration['mail']['notificationRecipient'],
            "server": configuration['mail']['server']
        }
        if c['mailSent']:
            notification_conf['body'] += ("\n\n Oh and btw I already sent your reservation request ;)"
                                          "\n\n Have a good one!\n - AccommodationBot")
        mail.send(notification_conf)
        c['hash'] = page_hash
    else:
        print("Boring old same page...")
    save(c)
def main():
    print('Collecting the data: ')
    print('This may take some time.....................................................')
    crawl()
    print('Analyzing the data ............... Almost there...........')
    import sentiment
    try:
        server = HTTPServer(('', 1456), server_handler)
        print('Server Started')
        server.serve_forever()
    except KeyboardInterrupt:
        print("Exiting")
        server.socket.close()
def fill_tables():
    conn = None
    try:
        params = config()
        conn = psycopg2.connect(**params)
        print("Filling PostgreSQL database")
        crawler.crawl(conn)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()
            print("Database connection closed.")
def main():
    print len(teamslist)
    (teamsdict, (homescores, awayscores, neutscores)) = \
        crawl(update=1, bb=1, url="cbbga13.txt")
    # normalize scores.
    homescores, awayscores, neutscores = norm_venues(homescores, awayscores, neutscores)
    # get final scores matrix.
    scores = homescores + awayscores + neutscores
    # scores = (scores > scores.T)
    games = [("Kansas", "Ohio St."), ("Louisville", "Kentucky")]
    print_probs(teamsdict, scores, games)
    print; print
    valsp = prob_network_ratings(scores)
    valse = energy_min(scores)
    vals = [valsp, valse]
    wins = scores > scores.T
    losses = scores < scores.T
    wl = zip(wins.sum(1), losses.sum(1))
    print_values(teamsdict, teamslist, vals, wl)
def startCrawl(ws):
    global userId, url, limit, sType, keyword
    userId = session['userId']
    url = session['url']
    limit = session['limit']
    sType = session['sType']
    keyword = session['keyword']
    path = []
    found = False
    early = False
    database = mongo.db.test  # access test collection
    postid = database.insert({'userId': userId, 'url': url, 'limit': limit,
                              'sType': sType, 'keyword': keyword, 'path': path,
                              'found': found, 'early': early})
    # call crawler, passing socket and db info
    crawler.crawl(ws, url, int(limit), sType, keyword, postid, database)
def test_external_site(self):
    """
    Test that crawling a page with a link to an external site
    won't lead to following that external site.
    """
    result_sitemap = crawl("https://contains-external-site-link.com")
    self.assertNotIn("https://external-site.com", result_sitemap)
def gather():
    for num in midNums:
        datastr = str(crawler.crawl(num))
        incidentSave = []
        data = {}
        datastr = cleaner(datastr)
        data = disputeInfoGather(datastr)
        incidentData = incidentGather(datastr)
        data['Sources'] = sourceSearchGather(datastr)
        # data['Rest'] = narrativeGather(datastr)
        stuff = narrativeGather(datastr)
        for i in stuff:
            data[i] = stuff[i]
        for i in range(len(incidentData)):
            incidentSave.append(incidentData[i])
        data['Incidents'] = incidentData
        data['MID 2.1 Sources'] = twoSources(datastr)
        data['Articles'] = articleGather(datastr)
        # mids.append(copy.deepcopy(data))
        mids.append(data)
        print "FINISHED PARSING MID NUMBER " + num
    return mids
def test_invalid_type():
    scenario = [MetadataRecord('wrong_field', [1, 2, 3])]
    with pytest.raises(CrawlingError) as exc:
        list(crawl(scenario))
    info = exc.value
    assert info.args[0] == "The type of field 'wrong_field' is unknown"
def main():
    initiate()
    config = Config()
    crawl_interval = timedelta(
        minutes=int(config['settings']['crawl_interval_minutes'])
    )
    last_update = datetime(1, 1, 1, tzinfo=pytz.utc)
    # Get urls to crawl
    domains = db.query.query_domains()
    while datetime.now(tz=pytz.utc) - last_update >= crawl_interval:
        # crawl all Urls
        crawl(domains)
        last_update = datetime.now(tz=pytz.utc)
def index():
    lang = request.args.get('lang')
    if lang is None:
        lang = "java"
    response = call(lang)
    soup = BeautifulSoup(response, 'html5lib')
    divs = soup.find_all("div")
    return crawl(divs)
def main():
    payload = crawl(month, year)
    filename = f"mpba-{month}-{year}.json"
    filepath = save(filename, payload, output_path)
    employees = parse(payload)
    crawler_result = build_crawler_result(month, year, employees, [filepath])
    print(json.dumps({'cr': crawler_result}, ensure_ascii=False))
def send_data(message):
    subreddits = str(message.text.split(" ")[1])
    try:
        answer = crawler.crawl(subreddits)
    except:
        answer = emoji.emojize(
            "Something has happened. Our friends are working on it. "
            "We'll be right back! :thumbs_up: ")
    bot.reply_to(message, answer)
def test_max_depth(self):
    """
    Check that when we're parsing a webpage with links to pages containing
    more links (and so on), we stop after (and only after) a given depth.
    """
    result = crawl("https://deep-chain.com", max_depth=4)
    self.assertIn("https://4.deep-chain.com", result)
    self.assertNotIn("https://5.deep-chain.com", result)
def fetch(urls, sbook, fbook):
    # send_email("Crawling Started", str(len(urls)))
    now = time.time()
    num_processed, success_count, failure_count = crawler.crawl(urls, sbook, fbook)
    try:
        msg = ("Total Processed: " + str(num_processed) + "\n" +
               "Success Count: " + str(success_count) + "\n" +
               "Failure Count: " + str(failure_count) +
               " in " + str((time.time() - now) / 60) + " minutes.")
        # send_email("Crawling Successful", msg)
    except Exception, ex:
        send_email("Crawler Error:", str(ex))
def findenter(url):
    """Find the URLs that are on the same host."""
    host = urlparse(url)[1]
    urls = crawl(url)
    for u in urls:
        h = urlparse(u)[1]
        if (h == host) and (not pat_image.search(u)) \
                and (not pat_media.search(u)):
            print u
def email_generator(usernames, domains=[], links=[]):
    emails = dict()
    max_confidence = max(usernames.values())
    common_domains = ['gmail.com', 'yahoo.com', 'hotmail.com', 'aol.com']
    username_from_email = lambda email: email[:email.index('@')]
    for domain in domains:
        if domain[:4] != 'http':
            domain = 'http://' + domain
        # Might not have permission to scrape
        try:
            internal_links = crawl(domain)
            for l in internal_links:
                resp = urllib2.urlopen(l).read()
                parsed_emails = parse_HTML(resp)
                for email in parsed_emails:
                    emails[email] = max_confidence
        except:
            pass
        # Common emails of people with their own domains.
        # If any of these exist, the chance of it being their email
        # is highly likely since it's a personal domain
        stripped_domain = strip_domain(domain)
        emails['admin@' + stripped_domain] = max_confidence
        emails['info@' + stripped_domain] = max_confidence
        emails['me@' + stripped_domain] = max_confidence
    for link in links:
        if link:
            if link[:4] != 'http':
                link = 'http://' + link
            # Might not have permission to scrape
            try:
                resp = urllib2.urlopen(link).read()
                parsed_emails = parse_HTML(resp)
                for email in set(parsed_emails):
                    if email in emails:
                        emails[email] *= 1.5
                    else:
                        emails[email] = max_confidence
            except:
                pass
    for username in usernames:
        for domain in common_domains:
            email = username + '@' + domain
            if email in emails:
                emails[username + '@' + domain] += usernames[username]
            else:
                emails[username + '@' + domain] = usernames[username]
    return emails
def worker(crawler, platform_game):
    before_crawler = time.time()
    rooms = crawler.crawl()
    after_crawler = time.time()
    _store_rooms(rooms, platform_game)
    print '*****************'
    print 'crawled {count} rooms from {platform_name} {game_name}'.format(
        count=len(rooms),
        platform_name=platform_game.platform_name,
        game_name=platform_game.game_name)
    after_save = time.time()
    print 'Crawl time cost: ', after_crawler - before_crawler
    print 'Save time cost: ', after_save - after_crawler
def search(request):
    if request.method == "POST":
        form = SearchForm(request.POST)
        if form.is_valid():
            searchstring = form["searchstring"].data
            urlseed = form["urlseed"].data
            depth = form["depth"].data
            result = crawler.crawl(searchstring, urlseed, int(depth))
            html = "<html><body>URLs containing search text are: %s</body></html>" % result
            return HttpResponse(html)
    else:
        form = SearchForm()
    return render_to_response('searchhome.html', {'form': form})
def main():
    url = 'http://vg.no'
    keywords = ['snegl', 'kropp', 'dette', 'paradise', 'pladask', 'miss universe',
                'frøken norge', 'sex', 'triks', 'pokemon', 'undertøy', 'hollywood',
                'kjendis', 'øvelse', 'slik', 'slank', 'digg']
    old_posts = database.get_posts()
    try:
        new_posts = crawler.crawl(url, keywords)
        new = 0
        for k, v in new_posts.items():
            if k not in old_posts.keys():
                title = v['title'].encode('utf-8')
                print 'POSTING', title
                print twitter.post_status(title)
                database.add_post(k, v)
                new += 1
                time.sleep(1)
        if new == 0:
            print 'no new posts found'
        elif new == 1:
            print 'one new post found'
        else:
            print str(new) + ' new posts found'
    except requests.exceptions.ConnectionError:
        print 'could not connect'
def post(self):
    current_mode = self.getCurrentMode()
    if current_mode == MODE.SEARCH:
        user_search = self.request.get('inputSearch')
        words = user_search.split()
        words = list(set(words))
        results = []
        self.render('search.html',
                    str_address='search',
                    str_active=CLASS_ACTIVE,
                    query=user_search,
                    results=results[:10],
                    bsearch=True)
    elif current_mode == MODE.CRAWL:
        try:
            logging.info('fetching urls from ' + MYURL)
            crawled = crawl(MYURL)
        except:
            logging.error('An error occurred when crawling from url: ' + MYURL)
            self.redirect('/search')
            return
        DocMapping.clean()
        DocMapping.add_list(crawled)
        self.redirect('/crawled')
    elif current_mode == MODE.PARSE:
        # parsing mode: extract content from all documents and build data structures.
        # build inverted index
        docs = DocMapping.all()
        invertedIndex = {}
        termFrequency = {}
        termFrequencyByDoc = {}
        docFrequencyByTerm = {}
        logging.info('starting to parse all documents')
        for d in docs:
            parse(d, invertedIndex, termFrequency, termFrequencyByDoc, docFrequencyByTerm)
        logging.info('parsing done!')
        # we need to store this in a blob or cloud storage for later
        # json_str = json.dumps(invertedIndex)
        # json_str = json.dumps(termFrequency)
        # json_str = json.dumps(termFrequencyByDoc)
        # json_str = json.dumps(docFrequencyByTerm)
        self.redirect('/search')
def main(): """ 定义一个入口函数来组织爬虫过程,虽然对于python来说入口函数的形式并不必要. """ # 指定系统默认编码为utf-8 default_encoding = 'utf-8' if sys.getdefaultencoding() != default_encoding: reload(sys) sys.setdefaultencoding(default_encoding) # 必要数据收集 url = 'http://weibo.cn/moegirlwiki' # 任意新浪微博移动版页面地址 DB.dbuser = '******' # MySQL数据库用户名 DB.dbpassword = '******' # MySQL数据库密码 crawler.username = raw_input('请输入新浪通行证用户名:') # 爬取网站用户名 crawler.password = getpass('请输入新浪通行证密码:') # 爬取网站密码 crawler.cookie = login(crawler.username, crawler.password) # 获取第一个有效cookie # 初始化数据库,如果需要可以解开注释 # DB.init_DB() # 启动代理池守护进程 proxies.daemon_pool(test_url='http://weibo.cn', timeout=3, interval=30) # 执行爬虫过程 time0 = time() weibo_cnt = crawler.crawl( url=url, parser=parser, login=login, write_DB=DB.write_DB, get_proxy=proxies.get_proxy, average_delay=3, page_limit=1000, # record_raw_data = True ) time_cost = time() - time0 print('共爬取数据', weibo_cnt, '条,用时' + get_time_cost_str(time_cost))
def fetch(pointers):
    sbook = open("output/success.log", "a+")
    fbook = open("output/failure.log", "a+")
    while True:
        terminator = 0
        urls = []
        for ff in pointers:
            url = ff.readline().strip()
            if len(url) > 5:
                terminator += 1
                urls.append(url)
        try:
            if len(urls) > 0:
                num_processed, success_count, failure_count = crawler.crawl(urls, sbook, fbook)
                try:
                    msg = ("Total Processed: " + str(num_processed) + "\n" +
                           "Success Count: " + str(success_count) + "\n" +
                           "Failure Count: " + str(failure_count))
                    # send_email("Crawling Successful", msg)
                except Exception, ex:
                    send_email("Crawler Error:", str(ex))
            else:
                break
        # The original snippet is truncated here; the error handling below is an
        # assumed completion mirroring the other fetch() variants in this collection.
        except Exception, ex:
            send_email("Error in Crawler", str(ex) + "\n" + str(urls))
        sbook.flush()
        fbook.flush()
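# The fetch() variants above assume a send_email helper; the two-argument form
# takes a subject and a body. Below is a minimal sketch of such a helper using
# smtplib -- the SMTP host, sender, and recipient are placeholders, not values
# from the original projects.
import smtplib
from email.mime.text import MIMEText

def send_email(subject, body):
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = 'crawler@example.com'  # assumed sender address
    msg['To'] = 'ops@example.com'        # assumed recipient address
    server = smtplib.SMTP('localhost')   # assumed local mail relay
    try:
        server.sendmail(msg['From'], [msg['To']], msg.as_string())
    finally:
        server.quit()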
def crawl(request):
    try:
        terms = map(str, request.GET['q'].split())
        seed = str(request.GET['seed'])
    except KeyError:
        terms = None
        seed = None
    else:
        if 'http' not in seed:
            seed = 'http://' + seed
    try:
        depth = int(request.GET['depth'][0])
    except:
        depth = 2  # default depth
    if depth < 3:
        depth = 1
    return render_to_response('rycrawler/crawl.html', {
        'seed': seed,
        'terms': terms,
        'sites': crawler.crawl(seed, depth, terms),
    })
def main():
    crawler = SaksCrawler()
    crawler.crawl()

# main()
def main():
    crawler = ShopBopCrawler()
    crawler.crawl()
def main():
    crawler = NordstromCrawler()
    crawler.crawl()
import crawler

page_name = 'xxx'
base_url = 'http://aqvatarius.com/themes/intuitive/'
first_page = 'index.html'
retry_times = 3

result = crawler.crawl(page_name, base_url, first_page, retry_times)
if result == True:
    print('oh yeah.')
import unittest
from collections import namedtuple

from common import event
from crawler import prepare, get_items, crawl
import crawler

# for tests: follow links immediately instead of queueing them, and don't sleep
crawler.add_link = lambda *args, **kwargs: crawl(*args, **kwargs)
crawler.website_sleep_time = 0

html = lambda body: "<html><head></head><body>{}</body></html>".format(body)

test_site_info = namedtuple(
    'test_site_info',
    'name baseurl urlnosaveregex urlregex html_hints test_html test_answer'
)

tests = [
    test_site_info(
        name='test1',
        baseurl='http://test1/',
        test_html={
            'http://test1/': html("<a href='/nosave'></a>"),
            'http://test1/nosave': html("<a href='/event'></a>"),
            'http://test1/event': html("<span>answer</span>"),
        },
        urlnosaveregex='/nosave',
        urlregex='/event',
        html_hints={'name': 'span'},
        test_answer=event(html=html('<span>answer</span>'))
    ),  # closing brackets added; the original snippet is cut off mid-list
]
def main():
    crawler = JcrewCrawler()
    crawler.crawl()
def main():
    crawler = KarmaCrawler()
    crawler.crawl()
def main():
    crawler = UrbanOutfittersCrawler()
    crawler.crawl()
if args.command == "add": #remove options from the list of arguments, as options is useless for adding tags parser._actions[-1].container._remove_action(parser._actions[-1]) parser.add_argument('-b', '--bulk', nargs='*', help='Tag all files under a directory recursively with one set of specified tags') parser.add_argument('-i', '--individual', nargs='*', help='Tag one or more files with a set of specified tags for each file') parser.add_argument('-p', '--path', nargs='*', help='Tag all files under a directory recursively by their parent directories e.g. a file under /home/user would be tagged home and user. If the first argument passed to -p is not a directory, it will be interpreted as a tag or list of tags (separate by a comma and a space) to NOT assign to files') parser.add_argument('-f', '--filename', nargs='*', help='Tag files or directories of files by their filename, tags seperated by a hyphen e.g. a file named python-important-due_friday would be tagged python, important, and due_friday') parser.add_argument('--timetest', nargs='?', help='Test the speed of the hasing/adding function using the path argument') args = parser.parse_args() if args.bulk is not None: tags = args.bulk[0].split(', ') dirs = args.bulk[1:] for directory in dirs: files = crawl().get_filepaths(directory) print(files) tagging.tagAdd().bulk(files, tags) if args.individual is not None: for directory in args.individual: if os.path.isdir(directory): files = crawl().get_filepaths(directory) for file in files: tags = input("Tags to assign to %s: " % file).split(', ') file = os.path.realpath(file) tagging.tagAdd().add_tag(tags, file) else: file = os.path.realpath(directory) print(file) tags = input("Tags to assign to %s: " % directory).split(', ')
def main():
    crawler = NastyCrawler()
    crawler.crawl()
import crawler
import sys

page = 'https://www.boardgames-bg.com'
crawler.crawl(page)
def go(self):
    url = self.url_input.text()
    res = crawl(url)
    self.qas.setText('\n'.join(res))
import crawler

crawler = crawler.crawler(None, 'urls.txt')
crawler.crawl(depth=1)
print "====Printing resolved inverted index===="
print crawler.get_resolved_inverted_index()
def post(self):
    isbn_input = self.request.get("isbn").strip()
    valid_isbn = re.compile(r'\b(\d{13}|\d{10})\b')
    isbns = re.findall(valid_isbn, isbn_input.replace("-", ""))
    booksfound = []
    logging.error("ISBNs: %s" % isbns)
    for isbn in isbns:
        if len(isbn) == 10:
            # convert ISBN-10 to ISBN-13 and recompute the check digit
            newisbn = ''.join(['978', isbn])
            check = unicode((10 - (sum(int(digit) * (3 if idx % 2 else 1)
                                       for idx, digit in enumerate(newisbn[:12])) % 10)) % 10)
            isbn = newisbn[:12] + check
        usersbook = Bookshelf.by_name(self.username, isbn)
        # usersbook = None  # delete this line
        if usersbook:
            # book already on bookshelf
            # self.response.write("On Shelf")
            self.render("addbooks.html", isbn=isbn,
                        error_isbn='Book is on your bookshelf already')
            return
        else:
            intelligence_book = Intelligence.by_name(isbn)
            # intelligence_book = None  # delete this line
            if not intelligence_book:
                # scrape the websites and add intelligence
                # self.response.write("Not in Database\n\n")
                newbookkey = items.bookitem_key(isbn, parent=items.bookitem_key())
                newbook = items.BookItem(key=newbookkey)
                newbook.isbn = isbn
                bookstw_url = ''.join(["http://search.books.com.tw/exep/prod_search.php?cat=BKA&key=", isbn])
                eslitetw_url = ''.join(["http://www.eslite.com/Search_BW.aspx?query=", isbn])
                crawler.crawl(newbook,
                              bookstw_url, crawler.create_bookstw_searchresult_callback,
                              eslitetw_url, crawler.create_eslitetw_searchresult_callback,
                              googlebooksjs=True)
                # self.response.write(newbook)
                if newbook.name:
                    for (i, url) in enumerate(newbook.image_urls):
                        try:
                            result = urlfetch.fetch(url, deadline=10)
                        except DeadlineExceededError:
                            logging.error("Deadline Exceeded While Fetching Book Image\n\n\n")
                            return
                        except DownloadError:
                            logging.error("Download Error While Fetching Book Image. Check network connections.")
                            return
                        if result.status_code == 200:
                            newbook.images.append(result.content)
                            # self.response.write('<img src="/_getimage?key=%s&idx=%s" />' % (newbookkey.id(), str(i)))
                        else:
                            newbook.images.append(None)
                    newbookkey = newbook.update()
                    # logging.error("before %s" % booksfound)
                    booksfound.append(newbook)
                    # logging.error("after %s" % booksfound)
                    # self.render("showbook.html", book=newbook)
                    key = ndb.Key('Intelligence', isbn,
                                  parent=intelligence_key(DEFAULT_INTELLIGENCE_NAME))
                    intelligence = Intelligence(key=key)
                    intelligence.isbn = isbn
                    intelligence.name = newbook.name
                    intelligence.language = newbook.language
                    intelligence.update()
                else:
                    self.render("addbooks.html", isbn=isbn, error_isbn='ISBN not found')
                    return
            else:
                book = items.BookItem.by_name(isbn)
                if book is not None:
                    booksfound.append(book)
            bookshelf_book = Bookshelf(key=bookshelf_key(isbn, parent=bookshelf_key(self.username)))
            bookshelf_book.username = self.username
            bookshelf_book.isbn = isbn
            bookshelf_book.status = 4
            # rating = StellarRating(linguistic=0, logical=0, right_minded=0, intrapersonal=0, interpersonal=0, naturalistic=0)
            bookshelf_book.raters = StellarRaters()
            bookshelf_book.rating = StellarRating()
            bookshelf_book.update()
            # self.response.write(''.join([isbn, " added to bookshelf"]))
    # logging.error("books found %s" % booksfound)
    self.render("showbook.html", head_title="View Books Added", books=booksfound)
def main():
    crawler = BestBuyCrawler()
    crawler.crawl()