def _parseRestaurantPage(self, pool, queue, url, name, base=False):
    """Scrape one Seattle Times restaurant page into an Entity.

    The entity is pushed onto self._output unless an identical
    (title, address) pair was already seen or the title is marked
    '(closed)'.
    """
    utils.log('[%s] parsing restaurant page %s (%s)' % (self, name, url))

    try:
        soup = utils.getSoup(url)
    except:
        # best-effort crawl: a page that fails to download is skipped
        utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
        return

    content = soup.find('div', {'id' : 'content'})
    if content is None:
        return

    entity = Entity()
    entity.title = content.find('h1').getText()
    entity.subcategory = "restaurant"
    entity.seattletimes = {}

    info = content.find('div', {'id' : 'edbtext'})
    summary = info.find('p').getText()
    if summary is not None:
        entity.desc = summary

    paragraphs = info.findAll('p', {'class' : 'list'})

    # first <p class="list"> carries the street address; collapse the
    # markup line breaks and runs of whitespace into single spaces
    raw_address = paragraphs[0].renderContents().strip().replace('<br />', '')
    entity.address = re.sub('[ \n\t]+', ' ', raw_address)

    if len(paragraphs) > 1:
        # NOTE(review): href is read off the <p> element itself — confirm
        # the markup really carries it there and not on a nested <a>
        website = paragraphs[1].get('href')
        if website is not None:
            entity.site = website

    if len(paragraphs) > 2:
        hours = paragraphs[2].getText()
        if hours is not None:
            entity.hoursOfOperation = hours

    dedup_key = (entity.title, entity.address)
    if dedup_key in self.seen:
        return
    if '(closed)' in entity.title.lower():
        return

    self.seen.add(dedup_key)
    self._output.put(entity)
def _parseListPage(self, pool, queue, url, name, base=False):
    """Scrape an NYTimes best-seller list page.

    Emits one book Entity per <td class="summary"> cell whose
    (title, author) pair has not been seen before.
    """
    utils.log('[%s] parsing list page %s (%s)' % (self, name, url))

    try:
        soup = utils.getSoup(url)
    except:
        # best-effort crawl: a page that fails to download is skipped
        utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
        return

    for cell in soup.findAll('td', {'class' : 'summary'}):
        entity = Entity()
        entity.subcategory = "book"
        entity.nytimes = {}

        book_title = cell.find('span', {'class' : 'bookName'}).getText().strip().title()
        if book_title.endswith(','):
            book_title = book_title[:-1]
        entity.title = book_title

        # flatten the cell text with a sentinel separator so the
        # precompiled details regex can pull out the structured fields
        match = self.details_re.match(cell.getText(separator='___'))
        if match:
            groups = match.groups()
            entity.author = groups[0]
            entity.publisher = groups[1]
            entity.desc = groups[2]

        dedup_key = (entity.title, entity.author)
        if dedup_key not in self.seen:
            self.seen.add(dedup_key)
            self._output.put(entity)
def _parse_series_page(self, name, url):
    """Scrape a single TheTVDB series page into a TV Entity.

    Parses the show's title/description, a representative banner image,
    a handful of detail-table fields, the cast list, and (from the "all
    seasons" listing) the season count and earliest/latest air dates.
    The assembled entity is pushed onto self._output.

    Each parsing stage is wrapped in its own broad except so that one
    malformed section of the page does not lose the whole show.
    """
    # skip duplicate/junk listings up front
    if '**' in name or 'DUPLICATE' in name or name.startswith('.hack'):
        return

    utils.log('[%s] parsing page %s (%s)' % (self, name, url))

    try:
        soup = utils.getSoup(url)
    except:
        utils.printException()
        utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
        return

    # NOTE(review): assumes the page has at least 3 div#content blocks
    # (header, info table, seasons links) — confirm against live markup
    contents = soup.findAll('div', {'id' : 'content'})
    header = contents[0]

    h1 = header.find('h1')
    title = h1.getText()
    # remove the <h1> so the remaining header text is just the description
    h1.extract()

    entity = Entity()

    # parse basic show info
    entity.title = title
    entity.subcategory = 'tv'

    desc = header.getText().replace('\r\n', '\n')
    if len(desc) > 5:
        entity.desc = desc

    # the numeric TVDB series id is embedded in the URL
    entity.sources.thetvdb_id = self._id_re.match(url).groups()[0]

    # parse images: prefer poster, then fanart, then generic banner art
    images = map(lambda img: img.get('src'), soup.findAll('img', {'class' : 'banner'}))
    types = [ 'posters', 'fanart', 'graphical', ]

    for image_type in types:
        filtered_images = filter(lambda img: image_type in img, images)
        if len(filtered_images) > 0:
            entity.image = "%s%s" % (self.base, filtered_images[0])
            break

    info = contents[1].find('table').find('table')
    rows = info.findAll('tr')

    # parse detailed show info: maps detail-table row index -> entity field
    info_map = {
        0 : 'original_release_date',
        3 : 'air_time',
        4 : 'network_name',
        5 : 'genre',
    }

    for k, k2 in info_map.iteritems():
        try:
            value = rows[k].findAll('td')[1].getText()
            if len(value) > 0:
                entity[k2] = value
        except:
            # missing/short rows are tolerated; log and move on
            utils.printException()
            pass

    # parse cast from the actors sub-page (last link of the last content div)
    try:
        actors = "%s%s" % (self.base, contents[-1].findAll('a')[-1].get('href'))
        actors_soup = utils.getSoup(actors)

        infotables = actors_soup.findAll('table', {'class' : 'infotable'})
        cast = []

        for infotable in infotables:
            text = infotable.find('td').getText(separator='___')
            match = self._actor_re.match(text)
            if match is not None:
                groups = match.groups()
                cast.append('%s as %s' % (groups[0].strip(), groups[1].strip()))
                # TODO: record actor images

        if len(cast) > 0:
            entity.cast = ', '.join(cast)
    except:
        pass

    # parse seasons from the "all seasons" sub-page
    try:
        seasons = "%s%s" % (self.base, contents[2].findAll('a')[-1].get('href'))
        seasons_soup = utils.getSoup(seasons)

        rows = seasons_soup.find('table', {'id' : 'listtable'}).findAll('tr')[1:]

        highest_season = -1
        earliest = None
        latest = None

        # each row is an episode; loop through each episode, recording the
        # earliest and latest air date for the show overall and the number
        # of seasons the show ran for.
        for row in rows:
            tds = row.findAll('td')

            episode = tds[0].getText()
            match = self._season_re.match(episode)
            if match is not None:
                groups = match.groups()
                season = int(groups[0])
                episode = int(groups[1])

                if season > highest_season:
                    highest_season = season

            date = tds[2].getText()
            match = self._date_re.match(date)
            if match is not None:
                year, month, day = match.groups()
                date = datetime(year=int(year), month=int(month), day=int(day))

                if earliest is None or date < earliest:
                    earliest = date
                if latest is None or date > latest:
                    latest = date

        if highest_season > 0:
            entity.num_seasons = highest_season
        if earliest is not None:
            entity.earliest_air_date = earliest
        if latest is not None:
            entity.latest_air_date = latest
    except:
        utils.printException()

    # enrich with ratings/ids from the TVDB API client, when available
    entity2 = self._thetvdb.lookup(entity.sources.thetvdb_id)
    if entity2 is not None:
        if entity2.mpaa_rating is not None:
            entity.mpaa_rating = entity2.mpaa_rating
        if entity2.imdb_id is not None:
            entity.imdb_id = entity2.imdb_id

    self._output.put(entity)
def _parse_dump(self, filename):
    """Stream-parse a gzipped Netflix catalog XML dump into Entities.

    Iterates <catalog_title> elements with lxml's iterparse so the whole
    dump never lives in memory at once (root/elem are cleared as we go).
    Titles that are bonus materials, lack a rating, or have no English
    audio track are skipped; everything else is emitted on self._output
    as a 'movie' or 'tv' Entity.
    """
    f = gzip.open(filename, 'rb')
    context = iter(etree.iterparse(f, events=("start", "end")))

    # grab the root element from the first event so it can be cleared
    # periodically to bound memory usage
    event, root = context.next()

    # trailing numeric id at the end of a catalog/bonus-material URL
    nid_re = re.compile('.*\/([0-9]*)$')
    language_re = re.compile('.*\/languages$')
    match_genre_re = re.compile('.*\/genres$')
    match_ratings_re = re.compile('.*\/mpaa_ratings$')

    # category elements are discriminated by their 'scheme' URL suffix
    match_genre_func = lambda c: re.match(match_genre_re, c.get('scheme')) is not None
    match_ratings_func = lambda c: re.match(match_ratings_re, c.get('scheme')) is not None
    match_language_func = lambda c: re.match(language_re, c.get('scheme')) is not None

    count = 0
    # ids referenced as bonus materials; used to drop those titles later.
    # NOTE(review): this only filters a bonus title if its parent appeared
    # earlier in the dump — confirm dump ordering guarantees that.
    bonus_materials = set()

    # loop through each XML catalog_title element and parse it as a movie Entity
    for event, elem in context:
        if event == "end" and elem.tag == "catalog_title":
            root.clear()

            try:
                rating_elem = elem.find('average_rating')
                if rating_elem is None:
                    continue

                entity = Entity()

                nid = elem.find('id').text
                nid = int(re.match(nid_re, nid).groups()[0])

                # record any bonus-material ids this title links to
                bonus_materials_elem = elem.find('.//bonus_materials')
                if bonus_materials_elem is not None:
                    links = map(lambda l: l.get('href'), bonus_materials_elem.findall('link'))

                    for link in links:
                        bonus_material_id = int(re.match(nid_re, link).groups()[0])
                        #bonus_material_id = re.match(bonus_materials_id_re, link).groups()[0]
                        bonus_materials.add(bonus_material_id)

                # skip titles that are themselves bonus materials
                if nid in bonus_materials:
                    continue

                title = elem.find('title').get('regular')
                titlel = title.lower()

                if 'bonus material' in titlel:
                    continue

                entity.title = title
                entity.nid = nid
                entity.desc = elem.find('.//synopsis').text
                entity.nrating = float(rating_elem.text)

                categories = elem.findall('category')
                genres = map(lambda c: c.get('label'), filter(match_genre_func, categories))
                entity.ngenres = genres

                # any genre mentioning 'tv' classifies the title as tv
                tv = False
                for genre in genres:
                    if 'tv' in genre.lower():
                        tv = True
                        break

                if tv:
                    entity.subcategory = 'tv'
                else:
                    entity.subcategory = 'movie'

                ratings = map(lambda c: c.get('label'), filter(match_ratings_func, categories))
                if 1 == len(ratings):
                    entity.mpaa_rating = ratings[0]

                # box art links appear smallest-to-largest; a 4th link is HD
                images = elem.find('.//box_art').findall('link')
                if 3 == len(images) or 4 == len(images):
                    entity.tiny = images[0].get('href')
                    entity.small = images[1].get('href')
                    entity.large = images[2].get('href')

                    if 4 == len(images):
                        entity.hd = images[3].get('href')

                links = filter(lambda l: 'web page' == l.get('title'), elem.findall('link'))
                if 1 == len(links):
                    entity.nurl = links[0].get('href')

                language_elem = elem.find('.//languages_and_audio')
                language_elems = filter(match_language_func, language_elem.findall('.//category'))

                release_year_elem = elem.find('release_year')
                if release_year_elem is not None:
                    entity.original_release_date = release_year_elem.text

                duration = elem.find('runtime')
                if duration is not None:
                    entity.track_length = duration.text

                # only keep titles with an English audio track
                languages = set()
                for elem2 in language_elems:
                    languages.add(elem2.get('label').lower())

                if 'english' not in languages:
                    continue

                #utils.log(entity.title)
                #pprint(entity.getDataAsDict())
                """
                self._globals['n'] = elem
                self._globals['s'] = etree.tostring(elem, pretty_print=True)
                self._globals['e'] = entity
                break
                """

                self._output.put(entity)
                count += 1

                # give the downstream consumer threads an occasional chance to work
                if 0 == (count % 512):
                    time.sleep(0.1)

                # free this element's subtree now that it has been consumed
                elem.clear()
            except Exception, e:
                utils.printException()
                utils.log(elem.find('title').get('regular'))