# (pattern, handler) pairs handed to the crawler through ``extra_kargs`` below.
# Presumably the dispatcher calls the handler for URLs matching the pattern --
# confirm against spyder.dispatcher.
valid_urls = [
    (
        re.compile(r"^http://www\.vuenosairez\.com/V2_1/evento\.php\?idEvento=[0-9]+&fechaEvento=[0-9]+$"),
        get_details,
    ),
]


def save_to_db():
    """Replace the stored MusicShow rows with the crawled events.

    Deletes every existing ``MusicShow`` row, then creates one row per value
    of the ``events`` mapping (defined outside this block), passing that
    value's items as keyword arguments.
    """
    # NOTE(review): the delete runs even when ``events`` is empty, so a crawl
    # that collected nothing leaves the table empty -- confirm this is intended.
    MusicShow.objects.all().delete()
    for event_info in events.values():
        MusicShow.objects.create(**event_info)


if __name__ == "__main__":
    import datetime

    pool = Pool(
        25,
        spyder.dispatcher,
        timeout=20,
        extra_kargs={
            "links_path": links_path,
            "crawled": set(),
            "maxdepth": 1,
            "valid_urls": valid_urls,
        },
    )
    # Seed the crawl with the listing page.
    pool.add_to_queue(
        {
            "depth": 0,
            "referrer": None,
            "url": "http://www.vuenosairez.com/V2_1/resultados-agenda.php?tipoBusq=27&cat=5",
        }
    )

    start = datetime.datetime.now()
    pool.start()
    save_to_db()
    end = datetime.datetime.now()
    # One parenthesised argument: prints the same text whether ``print`` is
    # the Python 2 statement or the Python 3 function.
    print("took %s to crawl and save to db." % (end - start))
# NOTE(review): the statements down to the end of the ``with`` block are the
# tail of a definition whose header lies outside this chunk (presumably
# ``get_movie_details``); re-indent them to match the enclosing body.
# Pairs each even-indexed node with the node that follows it. If ``showtimes``
# is a plain sequence of odd length, ``showtimes[i+1]`` raises IndexError.
showtime_pairs = [(showtimes[i],showtimes[i+1]) for i in range(len(showtimes))[::2]]
with local_mutex:
    for place,times in showtime_pairs:
        event_showtimes[movie_id].add((place.text_content().strip(), u'%s-%s' % (showtime_date, times.text_content().strip())))


# (pattern, handler) pairs handed to the crawler through ``extra_kargs`` below.
# Presumably the dispatcher calls the handler for URLs matching the pattern --
# confirm against spyder.dispatcher.
valid_urls = [
    (
        re.compile(r'http://www\.lanacion\.com\.ar/espectaculos/cartelera-cine/peliculaFicha\.asp\?pelicula=(\d+)$'),
        get_movie_details,
    ),
]


def save_to_db():
    """Replace the stored Movie and MovieShow rows with the crawled data.

    Deletes every existing ``Movie`` and ``MovieShow`` row, creates one
    ``Movie`` per value of ``events``, then one ``MovieShow`` per
    (venue, showtime) pair stored under each key of ``event_showtimes``.
    The key itself is passed as the ``movie`` argument.
    """
    # NOTE(review): both deletes run even when nothing was crawled -- confirm
    # that an empty result should clear the tables.
    Movie.objects.all().delete()
    MovieShow.objects.all().delete()
    for event_info in events.values():
        Movie.objects.create(**event_info)
    for event_title, venue_showtimes in event_showtimes.items():
        for venue, showtime in venue_showtimes:
            MovieShow.objects.create(movie=event_title, sala=venue, horarios=showtime)


if __name__ == '__main__':
    pool = Pool(
        25,
        spyder.dispatcher,
        timeout=5,
        extra_kargs={
            'links_path': links_path,
            'crawled': set(),
            'maxdepth': None,
            'valid_urls': valid_urls,
        },
    )
    # Queue one detail page per non-blank <option> value of the "pelicula"
    # select found on the index page.
    doc = get_doc('http://www.lanacion.com.ar/espectaculos/cartelera-cine/index.asp')
    for movie_id in doc.xpath(u'//div[@id="contenido"]//form//select[@name="pelicula"]/option/@value'):
        # Strip once; a separate name keeps the loop variable untouched.
        clean_id = movie_id.strip()
        if clean_id:
            pool.add_to_queue({
                'depth': 0,
                'referrer': None,
                'url': 'http://www.lanacion.com.ar/espectaculos/cartelera-cine/peliculaFicha.asp?pelicula=%s' % clean_id,
            })

    start = datetime.datetime.now()
    pool.start()
    save_to_db()
    end = datetime.datetime.now()
    # One parenthesised argument: prints the same text whether ``print`` is
    # the Python 2 statement or the Python 3 function.
    print('took %s to crawl and save to db.' % (end - start))
# NOTE(review): this loop is the tail of a definition whose header lies
# outside this chunk (presumably ``get_details``). Its nesting was
# reconstructed from a whitespace-collapsed line -- confirm, and re-indent to
# match the enclosing body.
for key,path in fields.items():
    # A ``fields`` value is either something that unpacks into (path, func)
    # or a single path; when unpacking raises ValueError, no post-processing
    # function is used.
    try:
        path,func = path
    except ValueError:
        func = None
    nodes = doc.xpath(path)
    if nodes:
        # Without a func, join the non-blank stripped results with ", ".
        value = func(nodes) if func else ', '.join([t.strip() for t in nodes if t.strip()])
        with local_mutex:
            d = {}
            events.setdefault(event_id, d)[key] = value


# (pattern, handler) pairs handed to the crawler through ``extra_kargs`` below.
# Presumably the dispatcher calls the handler for URLs matching the pattern --
# confirm against spyder.dispatcher.
valid_urls = [
    (
        re.compile(r'http://www\.lanacion\.com\.ar/espectaculos/cartelera-teatro/obraFicha\.asp\?obra=[0-9]+&teatro_id=[0-9]+$'),
        get_details,
    ),
]


def save_to_db():
    """Replace the stored Theater rows with the crawled events.

    Deletes every existing ``Theater`` row, then creates one row per value of
    the ``events`` mapping, passing that value's items as keyword arguments.
    """
    # NOTE(review): the delete runs even when ``events`` is empty, so a crawl
    # that collected nothing leaves the table empty -- confirm this is intended.
    Theater.objects.all().delete()
    for event_info in events.values():
        Theater.objects.create(**event_info)


if __name__ == '__main__':
    import datetime

    pool = Pool(
        25,
        spyder.dispatcher,
        timeout=4,
        extra_kargs={
            'links_path': links_path,
            'crawled': set(),
            'maxdepth': 1,
            'valid_urls': valid_urls,
        },
    )
    # Seed the crawl with the theatre listing page.
    pool.add_to_queue({
        'depth': 0,
        'referrer': None,
        'url': 'http://www.lanacion.com.ar/espectaculos/cartelera-teatro/',
    })

    start = datetime.datetime.now()
    pool.start()
    save_to_db()
    end = datetime.datetime.now()
    # One parenthesised argument: prints the same text whether ``print`` is
    # the Python 2 statement or the Python 3 function.
    print('took %s to crawl and save to db.' % (end - start))
# NOTE(review): this assignment is the tail of a definition whose header lies
# outside this chunk (presumably ``get_synopsis``); re-indent it to match the
# enclosing body.
events_synopses[event_id] = synopsis


# (pattern, handler) pairs handed to the crawler through ``extra_kargs`` below:
# search-result pages go to ``get_details``, individual "ficha" pages to
# ``get_synopsis``. Presumably the dispatcher calls the handler for URLs
# matching the pattern -- confirm against spyder.dispatcher.
valid_urls = [
    (re.compile(r'^http://www\.terra\.com\.ar/programaciontv/busqueda\.shtml.*$'), get_details),
    (re.compile(r'^http://www\.terra\.com\.ar/programaciontv/ficha\.pl\?id=.*$'), get_synopsis),
]


def save_to_db(these_events):
    """Create one TvShow row for every occurrence in *these_events*.

    *these_events* maps an event id to an iterable of keyword-argument
    mappings; each of those mappings becomes one ``TvShow.objects.create``
    call. The ids themselves are not used here.
    """
    for occurrences in these_events.values():
        for event_info in occurrences:
            TvShow.objects.create(**event_info)


def save_synopses():
    """Write each collected synopsis onto the matching TvShow rows.

    For every ``(event_id, synopsis)`` pair in ``events_synopses``, updates
    the ``sinopsis`` field of all rows whose ``show_name`` equals the id.
    """
    for event_id, synopsis in events_synopses.items():
        TvShow.objects.filter(show_name=event_id).update(sinopsis=synopsis)


if __name__ == '__main__':
    extra_kargs = {
        'links_path': links_path,
        'crawled': set(),
        'maxdepth': None,
        'valid_urls': valid_urls,
        'link_getter': get_links,
    }
    pool = Pool(30, spyder.dispatcher, timeout=4, extra_kargs=extra_kargs)

    # Queue the search page for today and each of the following six days.
    for day in range(7):
        this_date = datetime.date.today() + datetime.timedelta(days=day)
        url = this_date.strftime('http://www.terra.com.ar/programaciontv/busqueda.shtml?fe=%Y/%m/%d&o=0')
        pool.add_to_queue({'depth': 0, 'referrer': None, 'url': url})

    start = datetime.datetime.now()
    # Existing rows are removed before the crawl starts, not after it.
    TvShow.objects.all().delete()
    pool.start()
    save_to_db(events)
    save_synopses()
    end = datetime.datetime.now()
    # One parenthesised argument: prints the same text whether ``print`` is
    # the Python 2 statement or the Python 3 function.
    print('took %s to crawl and save to db.' % (end - start))