def _parseRestaurantPage(self, pool, queue, url, name, base=False):
    """Scrape one Seattle Times restaurant page into an Entity.

    The entity is pushed onto self._output unless an identical
    (title, address) pair was already seen or the title is marked
    '(closed)'.
    """
    utils.log('[%s] parsing restaurant page %s (%s)' % (self, name, url))

    try:
        soup = utils.getSoup(url)
    except:
        # best-effort crawl: a page that fails to download is skipped
        utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
        return

    content = soup.find('div', {'id' : 'content'})
    if content is None:
        return

    entity = Entity()
    entity.title = content.find('h1').getText()
    entity.subcategory = "restaurant"
    entity.seattletimes = {}

    info = content.find('div', {'id' : 'edbtext'})
    summary = info.find('p').getText()
    if summary is not None:
        entity.desc = summary

    paragraphs = info.findAll('p', {'class' : 'list'})

    # first <p class="list"> carries the street address; collapse the
    # markup line breaks and runs of whitespace into single spaces
    raw_address = paragraphs[0].renderContents().strip().replace('<br />', '')
    entity.address = re.sub('[ \n\t]+', ' ', raw_address)

    if len(paragraphs) > 1:
        # NOTE(review): href is read off the <p> element itself — confirm
        # the markup really carries it there and not on a nested <a>
        website = paragraphs[1].get('href')
        if website is not None:
            entity.site = website

    if len(paragraphs) > 2:
        hours = paragraphs[2].getText()
        if hours is not None:
            entity.hoursOfOperation = hours

    dedup_key = (entity.title, entity.address)
    if dedup_key in self.seen:
        return
    if '(closed)' in entity.title.lower():
        return

    self.seen.add(dedup_key)
    self._output.put(entity)
def _parseListPage(self, pool, queue, url, name, base=False):
    """Scrape an NYTimes best-seller list page.

    Emits one book Entity per <td class="summary"> cell whose
    (title, author) pair has not been seen before.
    """
    utils.log('[%s] parsing list page %s (%s)' % (self, name, url))

    try:
        soup = utils.getSoup(url)
    except:
        # best-effort crawl: a page that fails to download is skipped
        utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
        return

    for cell in soup.findAll('td', {'class' : 'summary'}):
        entity = Entity()
        entity.subcategory = "book"
        entity.nytimes = {}

        book_title = cell.find('span', {'class' : 'bookName'}).getText().strip().title()
        if book_title.endswith(','):
            book_title = book_title[:-1]
        entity.title = book_title

        # flatten the cell text with a sentinel separator so the
        # precompiled details regex can pull out the structured fields
        match = self.details_re.match(cell.getText(separator='___'))
        if match:
            groups = match.groups()
            entity.author = groups[0]
            entity.publisher = groups[1]
            entity.desc = groups[2]

        dedup_key = (entity.title, entity.author)
        if dedup_key not in self.seen:
            self.seen.add(dedup_key)
            self._output.put(entity)
def _parse_series_page(self, name, url):
    """Scrape a single TheTVDB series page into a TV Entity.

    Parses the show's title/description, a representative banner image,
    a handful of detail-table fields, the cast list, and (from the "all
    seasons" listing) the season count and earliest/latest air dates.
    The assembled entity is pushed onto self._output.

    Each parsing stage is wrapped in its own broad except so that one
    malformed section of the page does not lose the whole show.
    """
    # skip duplicate/junk listings up front
    if '**' in name or 'DUPLICATE' in name or name.startswith('.hack'):
        return

    utils.log('[%s] parsing page %s (%s)' % (self, name, url))

    try:
        soup = utils.getSoup(url)
    except:
        utils.printException()
        utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
        return

    # NOTE(review): assumes the page has at least 3 div#content blocks
    # (header, info table, seasons links) — confirm against live markup
    contents = soup.findAll('div', {'id' : 'content'})
    header = contents[0]

    h1 = header.find('h1')
    title = h1.getText()
    # remove the <h1> so the remaining header text is just the description
    h1.extract()

    entity = Entity()

    # parse basic show info
    entity.title = title
    entity.subcategory = 'tv'

    desc = header.getText().replace('\r\n', '\n')
    if len(desc) > 5:
        entity.desc = desc

    # the numeric TVDB series id is embedded in the URL
    entity.sources.thetvdb_id = self._id_re.match(url).groups()[0]

    # parse images: prefer poster, then fanart, then generic banner art
    images = map(lambda img: img.get('src'), soup.findAll('img', {'class' : 'banner'}))
    types = [ 'posters', 'fanart', 'graphical', ]

    for image_type in types:
        filtered_images = filter(lambda img: image_type in img, images)
        if len(filtered_images) > 0:
            entity.image = "%s%s" % (self.base, filtered_images[0])
            break

    info = contents[1].find('table').find('table')
    rows = info.findAll('tr')

    # parse detailed show info: maps detail-table row index -> entity field
    info_map = {
        0 : 'original_release_date',
        3 : 'air_time',
        4 : 'network_name',
        5 : 'genre',
    }

    for k, k2 in info_map.iteritems():
        try:
            value = rows[k].findAll('td')[1].getText()
            if len(value) > 0:
                entity[k2] = value
        except:
            # missing/short rows are tolerated; log and move on
            utils.printException()
            pass

    # parse cast from the actors sub-page (last link of the last content div)
    try:
        actors = "%s%s" % (self.base, contents[-1].findAll('a')[-1].get('href'))
        actors_soup = utils.getSoup(actors)

        infotables = actors_soup.findAll('table', {'class' : 'infotable'})
        cast = []

        for infotable in infotables:
            text = infotable.find('td').getText(separator='___')
            match = self._actor_re.match(text)
            if match is not None:
                groups = match.groups()
                cast.append('%s as %s' % (groups[0].strip(), groups[1].strip()))
                # TODO: record actor images

        if len(cast) > 0:
            entity.cast = ', '.join(cast)
    except:
        pass

    # parse seasons from the "all seasons" sub-page
    try:
        seasons = "%s%s" % (self.base, contents[2].findAll('a')[-1].get('href'))
        seasons_soup = utils.getSoup(seasons)

        rows = seasons_soup.find('table', {'id' : 'listtable'}).findAll('tr')[1:]

        highest_season = -1
        earliest = None
        latest = None

        # each row is an episode; loop through each episode, recording the
        # earliest and latest air date for the show overall and the number
        # of seasons the show ran for.
        for row in rows:
            tds = row.findAll('td')

            episode = tds[0].getText()
            match = self._season_re.match(episode)
            if match is not None:
                groups = match.groups()
                season = int(groups[0])
                episode = int(groups[1])

                if season > highest_season:
                    highest_season = season

            date = tds[2].getText()
            match = self._date_re.match(date)
            if match is not None:
                year, month, day = match.groups()
                date = datetime(year=int(year), month=int(month), day=int(day))

                if earliest is None or date < earliest:
                    earliest = date
                if latest is None or date > latest:
                    latest = date

        if highest_season > 0:
            entity.num_seasons = highest_season
        if earliest is not None:
            entity.earliest_air_date = earliest
        if latest is not None:
            entity.latest_air_date = latest
    except:
        utils.printException()

    # enrich with ratings/ids from the TVDB API client, when available
    entity2 = self._thetvdb.lookup(entity.sources.thetvdb_id)
    if entity2 is not None:
        if entity2.mpaa_rating is not None:
            entity.mpaa_rating = entity2.mpaa_rating
        if entity2.imdb_id is not None:
            entity.imdb_id = entity2.imdb_id

    self._output.put(entity)
def _parse_dump(self, filename):
    """Stream-parse a gzipped Netflix catalog XML dump into Entities.

    Iterates <catalog_title> elements with lxml's iterparse so the whole
    dump never lives in memory at once (root/elem are cleared as we go).
    Titles that are bonus materials, lack a rating, or have no English
    audio track are skipped; everything else is emitted on self._output
    as a 'movie' or 'tv' Entity.
    """
    f = gzip.open(filename, 'rb')
    context = iter(etree.iterparse(f, events=("start", "end")))

    # grab the root element from the first event so it can be cleared
    # periodically to bound memory usage
    event, root = context.next()

    # trailing numeric id at the end of a catalog/bonus-material URL
    nid_re = re.compile('.*\/([0-9]*)$')
    language_re = re.compile('.*\/languages$')
    match_genre_re = re.compile('.*\/genres$')
    match_ratings_re = re.compile('.*\/mpaa_ratings$')

    # category elements are discriminated by their 'scheme' URL suffix
    match_genre_func = lambda c: re.match(match_genre_re, c.get('scheme')) is not None
    match_ratings_func = lambda c: re.match(match_ratings_re, c.get('scheme')) is not None
    match_language_func = lambda c: re.match(language_re, c.get('scheme')) is not None

    count = 0
    # ids referenced as bonus materials; used to drop those titles later.
    # NOTE(review): this only filters a bonus title if its parent appeared
    # earlier in the dump — confirm dump ordering guarantees that.
    bonus_materials = set()

    # loop through each XML catalog_title element and parse it as a movie Entity
    for event, elem in context:
        if event == "end" and elem.tag == "catalog_title":
            root.clear()

            try:
                rating_elem = elem.find('average_rating')
                if rating_elem is None:
                    continue

                entity = Entity()

                nid = elem.find('id').text
                nid = int(re.match(nid_re, nid).groups()[0])

                # record any bonus-material ids this title links to
                bonus_materials_elem = elem.find('.//bonus_materials')
                if bonus_materials_elem is not None:
                    links = map(lambda l: l.get('href'), bonus_materials_elem.findall('link'))

                    for link in links:
                        bonus_material_id = int(re.match(nid_re, link).groups()[0])
                        #bonus_material_id = re.match(bonus_materials_id_re, link).groups()[0]
                        bonus_materials.add(bonus_material_id)

                # skip titles that are themselves bonus materials
                if nid in bonus_materials:
                    continue

                title = elem.find('title').get('regular')
                titlel = title.lower()

                if 'bonus material' in titlel:
                    continue

                entity.title = title
                entity.nid = nid
                entity.desc = elem.find('.//synopsis').text
                entity.nrating = float(rating_elem.text)

                categories = elem.findall('category')
                genres = map(lambda c: c.get('label'), filter(match_genre_func, categories))
                entity.ngenres = genres

                # any genre mentioning 'tv' classifies the title as tv
                tv = False
                for genre in genres:
                    if 'tv' in genre.lower():
                        tv = True
                        break

                if tv:
                    entity.subcategory = 'tv'
                else:
                    entity.subcategory = 'movie'

                ratings = map(lambda c: c.get('label'), filter(match_ratings_func, categories))
                if 1 == len(ratings):
                    entity.mpaa_rating = ratings[0]

                # box art links appear smallest-to-largest; a 4th link is HD
                images = elem.find('.//box_art').findall('link')
                if 3 == len(images) or 4 == len(images):
                    entity.tiny = images[0].get('href')
                    entity.small = images[1].get('href')
                    entity.large = images[2].get('href')

                    if 4 == len(images):
                        entity.hd = images[3].get('href')

                links = filter(lambda l: 'web page' == l.get('title'), elem.findall('link'))
                if 1 == len(links):
                    entity.nurl = links[0].get('href')

                language_elem = elem.find('.//languages_and_audio')
                language_elems = filter(match_language_func, language_elem.findall('.//category'))

                release_year_elem = elem.find('release_year')
                if release_year_elem is not None:
                    entity.original_release_date = release_year_elem.text

                duration = elem.find('runtime')
                if duration is not None:
                    entity.track_length = duration.text

                # only keep titles with an English audio track
                languages = set()
                for elem2 in language_elems:
                    languages.add(elem2.get('label').lower())

                if 'english' not in languages:
                    continue

                #utils.log(entity.title)
                #pprint(entity.getDataAsDict())
                """
                self._globals['n'] = elem
                self._globals['s'] = etree.tostring(elem, pretty_print=True)
                self._globals['e'] = entity
                break
                """

                self._output.put(entity)
                count += 1

                # give the downstream consumer threads an occasional chance to work
                if 0 == (count % 512):
                    time.sleep(0.1)

                # free this element's subtree now that it has been consumed
                elem.clear()
            except Exception, e:
                utils.printException()
                utils.log(elem.find('title').get('regular'))