Python Scraper.find_docs Exemples

Langage de programmation: Python

Espace de nommage/Pack: scraper

Class/Type: Scraper

Méthode/Fonction: find_docs

Exemples au hotexamples.com: 3

Python Scraper.find_docs - 3 exemples trouvés. Ce sont les exemples réels les mieux notés de scraper.Scraper.find_docs extraits de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Méthodes fréquemment utilisées

Afficher Cacher

Scraper(30)

matchTag(7)

connect(6)

__init__(5)

_time_now(5)

close(5)

submit(3)

find_docs(3)

get_children(3)

create_destination(2)

extractTag(2)

get_papers(2)

begin(2)

get_all_page_uris(1)

get_all_skills(1)

get_css(1)

get_and_write_records(1)

getZipLinks(1)

get_manga(1)

get_paths(1)

get_post_data_per_page(1)

get_all_manga(1)

getGameList(1)

getSlist(1)

getQlist(1)

getInformation(1)

getIndexhtm(1)

get_prices(1)

getEvents(1)

getDepts(1)

getAppList(1)

gather_reddit_data(1)

fetch_most_recent_transactions(1)

fetch_booster_usage(1)

extractText(1)

create_organization_sets(1)

create_http_link(1)

get_price(1)

DownloadImage(1)

get_script(1)

scrape_ingredients(1)

update_submission_content(1)

store_parse(1)

stopped(1)

sort(1)

seturldata(1)

set_started_callback(1)

set_output_file(1)

set_finished_callback(1)

set_broadcast_document_callback(1)

Méthodes fréquemment utilisées

Scraper (30)

matchTag (7)

connect (6)

__init__ (5)

_time_now (5)

close (5)

submit (3)

find_docs (3)

get_children (3)

create_destination (2)

Méthodes fréquemment utilisées

extractTag (2)

get_papers (2)

begin (2)

get_all_page_uris (1)

get_all_skills (1)

get_css (1)

get_and_write_records (1)

getZipLinks (1)

get_manga (1)

get_paths (1)

get_post_data_per_page (1)

get_all_manga (1)

getGameList (1)

getSlist (1)

getQlist (1)

getInformation (1)

getIndexhtm (1)

get_prices (1)

getEvents (1)

getDepts (1)

Méthodes fréquemment utilisées

get_post_data_per_page (1)

get_all_manga (1)

getGameList (1)

getSlist (1)

getQlist (1)

getInformation (1)

getIndexhtm (1)

get_prices (1)

getEvents (1)

getDepts (1)

getAppList (1)

gather_reddit_data (1)

fetch_most_recent_transactions (1)

fetch_booster_usage (1)

extractText (1)

create_organization_sets (1)

create_http_link (1)

get_price (1)

DownloadImage (1)

get_script (1)

scrape_ingredients (1)

update_submission_content (1)

store_parse (1)

stopped (1)

sort (1)

seturldata (1)

set_started_callback (1)

set_output_file (1)

set_finished_callback (1)

set_broadcast_document_callback (1)

Méthodes fréquemment utilisées

getAppList (1)

gather_reddit_data (1)

fetch_most_recent_transactions (1)

fetch_booster_usage (1)

extractText (1)

create_organization_sets (1)

create_http_link (1)

get_price (1)

DownloadImage (1)

get_script (1)

scrape_ingredients (1)

update_submission_content (1)

store_parse (1)

stopped (1)

sort (1)

seturldata (1)

set_started_callback (1)

set_output_file (1)

set_finished_callback (1)

set_broadcast_document_callback (1)

setStartedCallback (1)

setFinishedCallback (1)

setBroadcastDocCallback (1)

scrape_thread (1)

scrapeTitle (1)

get_url_slug (1)

scrapeSitename (1)

scrapePrefix (1)

scrapeImage (1)

scrapeDescr (1)

scrap_offers_list_max_page (1)

scrap_offer_details_page (1)

registered_manga (1)

procesa_proyectos (1)

matchByType (1)

is_known_url (1)

insert_into_db (1)

getddixml (1)

crawl (1)

get_url_sound (1)

Exemple #1

0

Afficher le fichier

Fichier : tests.py Projet : reustonium/BarkingOwl

def test_find_docs(): declare_test_start( 'follow_link' ) url_data = { 'url_id': 1, 'target_url': 'http://timduffy.me/', 'max_link_level': 6, 'creation_date_time': str(datetime.datetime.now()), 'doc_type': 'application/pdf', 'dispatch_datetime': str(datetime.datetime.now()), 'allowed_domains': [], } uid = str(uuid.uuid4()) scraper = Scraper(uid) scraper.set_url_data(url_data) docs = scraper.find_docs( ) print '[ TEST ] {0}'.format(json.dumps(scraper.status)) print '[ TEST ] {0}'.format(json.dumps(docs)) passed = False if len(docs) > 0: passed = True declare_test_end( passed )

Exemple #2

0

Afficher le fichier

Fichier : tests.py Projet : reustonium/BarkingOwl

def test_find_all_docs(url): declare_test_start( 'follow_link' ) url_data = { 'url_id': 1, 'target_url': url, # 'http://www.scottsvilleny.org/', 'max_link_level': -1, 'creation_date_time': str(datetime.datetime.now()), 'doc_type': 'application/pdf', 'dispatch_datetime': str(datetime.datetime.now()), 'allowed_domains': [], } uid = str(uuid.uuid4()) scraper = Scraper(uid) scraper.set_url_data(url_data) docs = scraper.find_docs( ) status = scraper.status #print '[ TEST ] {0}'.format(json.dumps(scraper.status)) #print '[ TEST ] {0}'.format(json.dumps(docs)) with open('find_docs_external_results.json','w') as f: f.write(json.dumps(status)) with open('find_docs_external_all_docs.json', 'w') as f: f.write(json.dumps(docs)) passed = False if len(docs) > 0: passed = True declare_test_end( passed ) return docs, status

Exemple #3

0

Afficher le fichier

Fichier : scraperwrapper.py Projet : reustonium/BarkingOwl

class ScraperWrapper(threading.Thread): def __init__(self,address='localhost',exchange='barkingowl',broadcast_interval=5,DEBUG=False): """ __init__() constructor setups up the message bus, inits the thread, and sets up local status variables. """ threading.Thread.__init__(self) self.uid = str(uuid.uuid4()) self.address = address self.exchange = exchange self.DEBUG=DEBUG self.interval = broadcast_interval # create scraper instance self.scraper = Scraper(uid=self.uid) self.scraping = False self.scraper_thread = None # stop control self.stopped = False #setup message bus self.respcon = pika.BlockingConnection(pika.ConnectionParameters( host=self.address)) self.respchan = self.respcon.channel() self.respchan.exchange_declare(exchange=self.exchange,type='fanout') self.reqcon = pika.BlockingConnection(pika.ConnectionParameters(host=address)) self.reqchan = self.reqcon.channel() self.reqchan.exchange_declare(exchange=exchange,type='fanout') result = self.reqchan.queue_declare(exclusive=True) queue_name = result.method.queue self.reqchan.queue_bind(exchange=exchange,queue=queue_name) self.reqchan.basic_consume(self._reqcallback,queue=queue_name,no_ack=True) # start our anouncement of availiability threading.Timer(self.interval, self.broadcast_available).start() if self.DEBUG: print "Scraper Wrapper INIT complete." def run(self): """ run() is called by the threading sub system when ScraperWrapper.start() is called. This function sets up all of the call abcks needed, as well as begins consuming on the message bus. """ # setup call backs self.scraper.set_finished_callback(self.scraper_finished_callback) self.scraper.set_started_callback(self.scraper_started_callback) self.scraper.set_broadcast_document_callback(self.scraper_broadcast_document_callback) # broadcast availability self.broadcast_available() self.reqchan.start_consuming() def stop(self): """ stop() is called to stop consuming on the message bus, and to stop the scraper from running. """ #self.scraper.stop() #if self.scraper_thread != None: # self.scraper_thread.stop() self.reqchan.stop_consuming() self.stopped = True def reset_scraper(self): """ resetscraper() calls reset() within the Scraper class. This resets the state of the scraper. This should not be called unless the scraper has been stoped. """ self.scraper.reset() def broadcast_available(self): """ broadcastavailable() broadcasts a message to the message bus saying the scraper is available to be dispatched a new url to begin scraping. """ # make sure we are not currently scraping if self.scraper.status['busy'] == False: packet = { 'available_datetime': str(datetime.datetime.now()) } payload = { 'command': 'scraper_available', 'source_id': self.uid, 'destination_id': 'broadcast', 'message': packet } jbody = json.dumps(payload) self.respchan.basic_publish(exchange=self.exchange,routing_key='',body=jbody) # boadcast our simple status to the bus self.broadcast_simple_status() # # TODO: move this over to it's own timer, no need to do it here. # #if self.scraper.stopped(): # raise Exception("Scraper Wrapper Exiting") #else: # threading.Timer(self.interval, self.broadcastavailable).start() if not self.scraping and not self.stopped: threading.Timer(self.interval, self.broadcast_available).start() def broadcast_status(self): """ broadcaststatus() broadcasts the status of the scraper to the bus. This includes all of the information kept in all of the state variables within the scraper. Note: this can be a LOT of information. """ packet = { 'status': self.scraper.status, 'url_data': self.status['url_data'], 'status_datetime': str(datetime.datetime.now()) } payload = { 'command': 'scraper_status', 'source_id': self.uid, 'destination_id': 'broadcast', 'message': packet } jbody = json.dumps(payload) #time.sleep(.5) self.respchan.basic_publish(exchange=self.exchange,routing_key='',body=jbody) def broadcast_simple_status(self): """ broadcastsimplestatus() broadcasts a smaller subset of information about the scraper to the bus. This information includes: packet = { 'busy': self.scraper.status['busy'], # boolean of busy status 'link_count': self.scraper.status['linkcount'], # number of links seen by the scraper 'link_count': self.scraper.status['link_count'], # number of links processed by the scraper 'bad_link_count': len(self.scraper.status['badlinks']), # number of bad links seen by the scraper 'target_url': targeturl, # the target url the scraper is working on 'status_datetime': str(isodatetime) # the date/time of the status being sent } """ if self.scraper.status['url_data'] == {}: targeturl = 'null' else: targeturl = self.scraper.status['url_data']['target_url'] packet = { 'busy': self.scraper.status['busy'], 'link_count': self.scraper.status['link_count'], 'link_count': self.scraper.status['link_count'], 'bad_link_count': len(self.scraper.status['bad_links']), 'target_url': targeturl, 'status_datetime': str(datetime.datetime.now()) } payload = { 'command': 'scraper_status_simple', 'source_id': self.uid, 'destination_id': 'broadcast', 'message': packet } jbody = json.dumps(payload) self.respchan.basic_publish(exchange=self.exchange,routing_key='',body=jbody) def scraper_finished_callback(self,payload): """ scraperFinishedCallBack() is the built in, and default, async call back for when the 'scraper finished' command is seen. """ jbody = json.dumps(payload) self.respchan.basic_publish(exchange=self.exchange,routing_key='',body=jbody) return def scraper_started_callback(self,payload): """ scraperFinishedCallBack() is the built in, and default, async call back for when the 'scraper started' command is seen. """ jbody = json.dumps(payload) self.respchan.basic_publish(exchange=self.exchange,routing_key='',body=jbody) return def scraper_broadcast_document_callback(self,payload): """ scraperBroadcastDocCallBack() is the built in, and default, async call back for when the 'scraper finds a new document' command is seen. """ jbody = json.dumps(payload) self.respchan.basic_publish(exchange=self.exchange,routing_key='',body=jbody) return def _scraperstart(self): #if self.scraper.start == False: # self.scraper.start() #self.scraper.begin() self.scraper.find_docs() # message handler def _reqcallback(self,ch,method,properties,body): #try: if True: response = json.loads(body) # commented this out because it made the logs almost impossible to read #if self.DEBUG: # print "Processing Message:\n\t{0}".format(response['command']) if response['command'] == 'url_dispatch': if response['destination_id'] == self.uid: #print "URL Dispatch Command Seen." #print response if self.scraping == False: #print "[Wrapper] Launching Scraper on URL: '{0}'".format(response['message']['targeturl']) self.scraper.set_url_data(response['message']) #if self.scraper.started == False: # self.scraper.start() if self.DEBUG: print "Launching scraper thread ..." self.scraping = True self.scraper_thread = threading.Thread(target=self._scraperstart) self.scraper_thread.start() #self._scraperstart() if self.DEBUG: print " ... Scraper launched successfully." elif response['command'] == 'scraper_finished': if response['source_id'] == self.scraper.uid: self.scraping = False elif response['command'] == 'get_status': self.broadcaststatus() elif response['command'] == 'get_status_simple': self.broadcastsimplestatus() elif response['command'] == 'reset_scraper': if response['destination_id'] == self.uid: self.resetscraper() elif response['command'] == 'shutdown': if response['destination_id'] == self.uid: print "[{0}] Shutting Down Recieved".format(self.uid) self.stop() elif response['command'] == 'global_shutdown': print "Global Shutdown Recieved" self.stop()