Example #1
        
        Globals.options.offset = 0
        if Globals.options.limit:
            Globals.options.limit = max(0, Globals.options.limit - count)
        
        pool.join()
        self._output.put(StopIteration)
        csvFile.close()
        
        utils.log("[%s] finished parsing %d entities" % (self.NAME, count))
    
    def _parseEntity(self, row, count):
        #utils.log("[%s] parsing entity %d" % (self.NAME, count))
        
        entity = Entity()
        entity.subcategory = "app"
        
        entity.factual = {
            'table' : 'iPhone_Apps.csv'
        }
        
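        # copy each non-empty mapped column from the source row onto the entity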
        for srcKey, destKey in self._map.iteritems():
            if srcKey in row and row[srcKey] and len(row[srcKey]) > 0:
                entity[destKey] = row[srcKey]
        
        self._output.put(entity)

from crawler import EntitySources
EntitySources.registerSource('factualiPhoneApps', FactualiPhoneAppsDump)

Example #2
            entity.subcategory = 'restaurant'
            entity.sources.yelp = { }
            
            titlel = result.find('a')
            title  = titlel.getText()
            entity.title = self.title_re.match(title).groups()[0]
            entity.yurl  = self.base + titlel.get('href')
            
            addr   = result.find('address').getText(separator)
            match  = self.address_re.match(addr).groups()
            
            entity.address = "%s, %s" % (match[0], match[1])
            entity.phone = match[2]
            
            rating = result.find('img')
            if rating is not None:
                entity.yrating = float(self.rating_reviews_re.match(rating.get('title')).groups()[0])
            
            reviews = result.find('span', {'class' : 'reviews'})
            if reviews is not None:
                entity.yreviews = int(self.rating_reviews_re.match(reviews.getText()).groups()[0])
            
            key = (entity.title, entity.address)
            if key not in self.seen:
                self.seen.add(key)
                self._output.put(entity)

from crawler import EntitySources
EntitySources.registerSource('yelp', YelpCrawler)

Example #3
                    self._count[name] = self._count[name] + 1 
                else: 
                    continue
            
            else:   
                self._count[name] = 1 
        
            entity = Entity()
            entity.subcategory = "restaurant"
            entity.title   = name
            entity.address = addr
            entity.sources.latimes = { }
            
            self._output.put(entity)
        
        #try the next page
        
        try:
            next_page = soup.find('a', {'class': 'next_page'}).get("href")
            if next_page != '':
                next_page_url = "{0}{1}".format('http://findlocal.latimes.com', next_page)
                pool.spawn(self._parseResultsPage, pool, next_page_url)
        except AttributeError:
            # crawling of pages is done
            #utils.log("Done crawling: %s" % href)
            pass

from crawler import EntitySources
EntitySources.registerSource('latimes', LATimesCrawler)

Example #4
        entity.subcategory = "restaurant"
        entity.title = row[1]
        entity.address = row[3] + ', ' + \
                         row[4] + ', ' + \
                         row[5] + ' ' + \
                         row[6]
        
        entity.openTable = {
            'rid' : int(row[8]), 
            'reserveURL' : row[9], 
            'countryID' : row[10], 
            'metroName' : row[0], 
            'neighborhoodName' : row[2], 
        }
        
        # don't make external calls to opentable in test mode
        if not Globals.options.test:
            result = OpenTableParser.parseEntity(entity)
            if result is None:
                return
        
        if entity is not None:
            #print entity.title
            #from pprint import pprint
            #pprint(entity.getDataAsDict())
            self._output.put(entity)

from crawler import EntitySources
EntitySources.registerSource('opentable', OpenTableDump)

Example #5
            
            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            entity = Entity()
            if 'Bars' in result.findNext('span').getText():
                entity.subcategory = "bar"
            else:  
                entity.subcategory = "restaurant"
            
            entity.title   = name
            entity.address = addr
            entity.sources.timeout_sf = { }
            
            self._output.put(entity)
        
        # try the next page
        try: 
            href_get = soup.find('div', { 'class' : 'next' }).find('a').get('href')
            next_page = '{0}{1}'.format('http://www.timeout.com', href_get)
        except Exception: 
            next_page = ''
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)

from crawler import EntitySources
EntitySources.registerSource('timeout_sf', TimeOutSFCrawler)

Example #6
            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            if 'Bars' in result.findNext('span').getText():
                entity.subcategory = "bar"
            else:
                entity.subcategory = "restaurant"
            
            entity.title   = name
            entity.address = addr
            entity.sources.timeout_la = { }
            
            self._output.put(entity)
        
        # try the next page
        try: 
            href_get = soup.find('div', { 'class' : 'next' }).find('a').get('href')
            next_page = '{0}{1}'.format('http://www.timeout.com', href_get)
        except Exception: 
            next_page = ''
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)

from crawler import EntitySources
EntitySources.registerSource('timeout_la', TimeOutLACrawler)

Example #7
                                                  result.find('span').findNext('span').findNext('span').findNext('span').getText())
            except Exception:
                addr = ''
                utils.log("[%s] error parsing %s (%s)" % (self, addr, href))
                continue
                        
            if addr == '':
                continue
                
            if name == '':
                continue 
            
            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            entity.subcategory = "restaurant"
            entity.title   = name
            entity.address = addr
            entity.sources.washmag = { }
            
            self._output.put(entity)
        
        return 
        
from crawler import EntitySources
EntitySources.registerSource('washmag', WashMagCrawler)

Example #8
        details = content.find('div', {'id' : 'edbtext'})
        desc    = details.find('p').getText()
        if desc is not None:
            entity.desc = desc
        
        details = details.findAll('p', {'class' : 'list'})
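        # flatten the address markup into a single whitespace-normalized string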
        address = details[0].renderContents().strip().replace('<br />', '')
        address = re.sub('[ \n\t]+', ' ', address)
        entity.address = address
        
        if len(details) > 1:
            site = details[1].find('a')
            if site is not None:
                entity.site = site.get('href')
        
        if len(details) > 2:
            hoursOfOperation = details[2].getText()
            if hoursOfOperation is not None:
                entity.hoursOfOperation = hoursOfOperation
        
        key = (entity.title, entity.address)
        if key in self.seen or '(closed)' in entity.title.lower():
            return
        
        self.seen.add(key)
        self._output.put(entity)

from crawler import EntitySources
EntitySources.registerSource('seattletimes', SeattleTimesCrawler)

Example #9
        #self._globals['soup'] = soup
        # parse cuisine
        header = soup.find('div', {'id' : "block-zagat_restaurants-14"})
        if header is not None:
            header = header.find('ul').find('li', {'class' : 'first'})
            
            if header is not None:
                entity.cuisine = header.getText()
        
        # parse website
        site = soup.find('span', {'class' : 'website'})
        if site is not None:
            site = site.find('a')
            
            if site is not None:
                entity.site = site.get('href')
        
        # parse preview image
        img = soup.find('div', {'id' : 'content'}).find('div', {'class' : 'photo'})
        if img is not None:
            img = img.find('img')
            
            if img is not None:
                entity.image = img.get('src')
        
        self._output.put(entity)

from crawler import EntitySources
EntitySources.registerSource('zagat', ZagatCrawler)

Example #10
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            entity.subcategory = "bar"
            entity.title   = name
            entity.address = addr
            entity.sources.sfweekly = { }
            
            self._output.put(entity)
        
        # try the next page
        try: 
            pagination = soup.find('span', { 'class' : 'Pagination' }).getText()
            if 'Next' in pagination:
                pagination = soup.find('span', { 'class' : 'Pagination' })
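                # the first anchor after the currently selected page marker links to the following page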
                href_get = pagination.find('span', { 'class' : 'PaginationSelected' }).findNext('a').get('href')
                next_page = '{0}{1}'.format('http://www.sfweekly.com', href_get)
            else: 
                next_page = '' 
        except Exception: 
            next_page = ''
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)

from crawler import EntitySources
EntitySources.registerSource('sfweekly', SFWeeklyCrawler)

Example #11
                        if earliest is None or date < earliest:
                            earliest = date
                        
                        if latest is None or date > latest:
                            latest = date
            
            if highest_season > 0:
                entity.num_seasons = highest_season
            
            if earliest is not None:
                entity.earliest_air_date = earliest
            
            if latest is not None:
                entity.latest_air_date = latest
        except:
            utils.printException()
        
        entity2 = self._thetvdb.lookup(entity.sources.thetvdb_id)
        
        if entity2 is not None:
            if entity2.mpaa_rating is not None:
                entity.mpaa_rating = entity2.mpaa_rating
            if entity2.imdb_id is not None:
                entity.imdb_id     = entity2.imdb_id
        
        self._output.put(entity)

from crawler import EntitySources
EntitySources.registerSource('thetvdb', TheTVDBCrawler)

Example #12
                        continue
                    
                    #utils.log(entity.title)
                    #pprint(entity.getDataAsDict())
                    
                    """
                    self._globals['n'] = elem
                    self._globals['s'] = etree.tostring(elem, pretty_print=True)
                    self._globals['e'] = entity
                    break
                    """
                    
                    self._output.put(entity)
                    count += 1
                    
                    # give the downstream consumer threads an occasional chance to work
                    if 0 == (count % 512):
                        time.sleep(0.1)
                    
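                    # release the parsed element's contents to keep memory bounded while streaming the dump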
                    elem.clear()
                except Exception, e:
                    utils.printException()
                    utils.log(elem.find('title').get('regular'))
        
        f.close()
        return count

from crawler import EntitySources
EntitySources.registerSource('netflix', NetflixDump)

Example #13
            
            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            entity.subcategory = "restaurant"
            entity.title   = name
            entity.address = addr
            entity.sources.chicagomag = { }
            
            self._output.put(entity)
        
        # try the next page
        next_page_all = soup.find('div', { 'id' : 'pager' }).findAll('a')
        next_page = ''
        
        for n in next_page_all:
            if 'Next' in n.getText():
                next_page = n.get('href')
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)

from crawler import EntitySources
EntitySources.registerSource('chicagomag', ChicagoMagCrawler)

Example #14
            except AttributeError:
                postal_code = ""
            
            addr = "%s, %s, %s %s" % (street_addr, locality, region, postal_code)
        except AttributeError:
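            # fall back to the free-form summary paragraph, taking the text before 'nr. ' (or 'at ') as the address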
            try:
                p = summ.find('p').getText()
                r = re.compile('(.*)nr\. ', re.DOTALL)
                m = r.match(p)
                
                if m is None:
                    r = re.compile('(.*)at[. ]', re.DOTALL)
                    m = r.match(p)
                
                addr = m.groups()[0].replace('\n', ' ').strip()
            except AttributeError:
                utils.log("[%s] error parsing %s (%s)" % (self, name, href))
                return
        
        entity = Entity()
        entity.subcategory = subcategory
        entity.title   = name
        entity.address = addr
        entity.nymag = { }
        
        self._output.put(entity)

from crawler import EntitySources
EntitySources.registerSource('nymag', NYMagCrawler)

Example #15
                
                categories = category_ul.findAll('a')
                
                for category in categories:
                    href = category.get('href')
                    name = utils.normalize(category.getText())
                    
                    queue.put_nowait((href, name, depth + 1))
        
        self._globals['books'] = soup
        
        rss_link = soup.find('div', {'id' : 'zg_rssLinks'})
        if rss_link is None:
            return
        
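        # take the second anchor in the RSS-links block as the category's feed URL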
        rss_link = rss_link.findAll('a')[1].get('href')
        if rss_link in self.seen:
            return
        
        self.seen.add(rss_link)
        
        entity = Entity()
        entity.title = rss_link
        entity.subcategory = 'book'
        
        self._output.put(entity)

from crawler import EntitySources
EntitySources.registerSource('amazonbestsellerbookfeeds', AmazonBestSellerBookFeeds)

Example #16
        for result in results:
            entity = Entity()
            entity.subcategory = "book"
            entity.nytimes = {}
            
            title = result.find('span', {'class' : 'bookName'}).getText().strip().title()
            if title.endswith(','):
                title = title[0:-1]
            
            entity.title = title
            
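            # details_re pulls the author, publisher, and description out of the '___'-joined result text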
            details = result.getText(separator='___')
            details_match = self.details_re.match(details)
            
            if details_match:
                details_match    = details_match.groups()
                entity.author    = details_match[0]
                entity.publisher = details_match[1]
                entity.desc      = details_match[2]
            
            key = (entity.title, entity.author)
            if key in self.seen:
                continue
            
            self.seen.add(key)
            self._output.put(entity)

from crawler import EntitySources
EntitySources.registerSource('nytimesbooks', NYTimesBestSellerCrawler)

Example #17
            entity = Entity()
            entity.subcategory = "restaurant"
            entity.title   = name
            entity.address = addr
            entity.sources.sfmag = { }
            
            self._output.put(entity)
        
        #locate total pages and compare against current page num to determine if we should iterate again
        try:
            total_pages = soup.find('span', { 'class' : 'last' }).findPrevious('span').getText().strip()
        except AttributeError:
            # crawling of pages is done
            return
        
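        # pull the current page number out of the '&page=N' query parameter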
        index = href.find('&page=')
        end = href.find('&keyword')
        page = href[index+6:end]
        
        if int(page) <= int(total_pages)-1:
            next_page = href.replace('&page=' + str(page), '&page=' + str(int(page)+1))
            pool.spawn(self._parseResultsPage, pool, next_page)
        else:
            return
        
        time.sleep(0.01)

from crawler import EntitySources
EntitySources.registerSource('sfmag', SFMagCrawler)

Example #18
                    
                    self._output.put(entity)
                    count += 1
                    
                    # give the downstream consumer threads an occasional chance to work
                    if 0 == (count % 512):
                        time.sleep(0.1)
                    
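                    # drop already-processed preceding siblings so the lxml tree does not grow unbounded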
                    parent = elem.getparent()
                    while True:
                        prev = elem.getprevious()
                        if prev is None:
                            break
                        parent.remove(prev)
                    
                    elem.clear()
                except Exception, e:
                    utils.printException()
                    #self._globals['books'] = elem
        
        Globals.options.offset -= offset
        if Globals.options.limit:
            Globals.options.limit = max(0, Globals.options.limit - count)
        
        f.close()
        return count

from crawler import EntitySources
EntitySources.registerSource('barnesandnoble', BarnesAndNobleDump)

Example #19
            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            if 'Bars' in result.findNext('span').getText():
                entity.subcategory = "bar"
            else:  
                entity.subcategory = "restaurant"
            
            entity.title   = name
            entity.address = addr
            entity.sources.timeout_mia = { }
            
            self._output.put(entity)
        
        # try the next page
        try: 
            href_get = soup.find('div', { 'class' : 'next' }).find('a').get('href')
            next_page = '{0}{1}'.format('http://www.timeout.com', href_get)
        except Exception: 
            next_page = ''
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)

from crawler import EntitySources
EntitySources.registerSource('timeout_mia', TimeOutMIACrawler)

Example #20
            
            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            entity.subcategory = "restaurant"
            entity.title   = name
            entity.address = addr
            entity.sources.phillymag = { }
            
            self._output.put(entity)
        
        # try the next page
        next_page_ending = soup.find('div', { 'class' : 'right_align' }).findAll('a')
        next_page = ''
        
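        # swap the current URL's query string for the 'Next' link's href to build the next page URL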
        for n in next_page_ending:
            if 'Next' in str(n):
                next_page = href.replace(href[href.find('?'):], n.get('href'))
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)

from crawler import EntitySources
EntitySources.registerSource('phillymag', PhillyMagCrawler)

Example #21
        # extract and parse the rest of the paginated results
        if base:
            page = soup.find('nav').find('span').getText()
            num_pages = int(self.page_re.match(page).groups()[0])
            
            for i in xrange(2, num_pages + 1):
                href = '%s&pg=%d' % (url, i)
                
                queue.put_nowait((href, name))
        
        results = soup.findAll('section', {'class' : 'CWListing'})
        
        for result in results:
            entity = Entity()
            entity.subcategory = "book"
            entity.awardAnnals = {}
            
            entity.title  = result.find('h4').find('a').getText().strip()
            entity.author = result.find('p', {'class' : 'creators'}).getText()
            
            key = (entity.title, entity.author)
            if key in self.seen:
                continue
            
            self.seen.add(key)
            self._output.put(entity)

from crawler import EntitySources
EntitySources.registerSource('awardannals', AwardAnnalsCrawler)

Example #22
                    else:
                        author = author.getText().strip()

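                        # try author_re0 first, then author_re1, falling back to the raw author text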
                        try:
                            entity.author = self.author_re0.match(author).groups()[0]
                        except AttributeError:
                            try:
                                entity.author = self.author_re1.match(author).groups()[0]
                            except AttributeError:
                                entity.author = author
                                pass

                # pprint(entity)
                # self._globals['books'] = entry

                if asin in self.seen:
                    continue

                self.seen.add(asin)
                self._output.put(entity)
            except:
                utils.printException()
                # print soup.prettify()

        # utils.log("[%s] done parsing feed '%s' (%s)" % (self, data.feed.title, url))


from crawler import EntitySources

EntitySources.registerSource("amazonbookfeed", AmazonBookFeed)

Example #23
        self.video_prices.join()
    
    def _filter(self, row):
        video_id = row.video_id
        
        # only retain videos which are available for purchase in the US storefront
        price_info = self.video_prices.get_row('video_id', video_id)
        
        if price_info is None:
            return False
        
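        # keep the row, returning its US storefront pricing fields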
        return {
            'v_retail_price' : price_info['retail_price'], 
            'v_currency_code' : price_info['currency_code'], 
            'v_availability_date' : price_info['availability_date'], 
            'v_sd_price' : price_info['sd_price'], 
            'v_hq_price' : price_info['hq_price'], 
            'v_lc_rental_price' : price_info['lc_rental_price'], 
            'v_sd_rental_price' : price_info['sd_rental_price'], 
            'v_hd_rental_price' : price_info['hd_rental_price'], 
        }

from crawler import EntitySources

#EntitySources.registerSource('apple', AppleEPFDumps)
EntitySources.registerSource('apple_artists', AppleEPFArtistDump)
EntitySources.registerSource('apple_songs',   AppleEPFSongDump)
EntitySources.registerSource('apple_albums',  AppleEPFAlbumDump)
EntitySources.registerSource('apple_videos',  AppleEPFVideoDump)

Example #24
        if not collapsed:
            address = FactualUtils.parseAddress(row)
            if address is not None:
                entity.address = address

        for srcKey, destKey in self._map.iteritems():
            if srcKey in row and row[srcKey]:
                entity[destKey] = row[srcKey]

        self._output.put(entity)


from crawler import EntitySources

EntitySources.registerSource("factualUSRestaurants", FactualUSRestaurantsDump)

to_collapse = {
    "fuddruckers": False,
    "d'angelo grilled sandwiches": False,
    "pizza factory": False,
    "mexico lindo": False,
    "penn station east coast subs": False,
    "dennys": False,
    "au bon pain": False,
    "whataburger restaurants": False,
    "larry's giant subs": False,
    "firehouse sub": False,
    "huddle house": False,
    "lenny's sub shop": False,
    "crown fried chicken": False,
Example #25
        
        try:
            soup = utils.getSoup(href)
        except:
            utils.log("[%s] error downloading page %s" % (self, href))
            return
        
        # parse the address for the current restaurant
        addr     = soup.find('span', {'class' : 'adr'})
        street   = addr.find('span', {'class' : 'street-address'}).getText().strip()
        locality = addr.find('span', {'class' : 'locality'}).getText().strip()
        region   = addr.find('span', {'class' : 'region'}).getText().strip()
        zipcode  = addr.find('a', {'class' : re.compile('postal-code')}).getText().strip()
        
        address = "%s, %s, %s %s" % (street, locality, region, zipcode)
        
        # add the current restaurant to the output for this crawler
        entity = Entity()
        entity.subcategory = "restaurant"
        entity.title   = restaurant_name
        entity.address = address
        entity.sources.urbanspoon = {
            'uurl' : href, 
        }
        
        self._output.put(entity)

from crawler import EntitySources
EntitySources.registerSource('urbanspoon', UrbanspoonCrawler)

Example #26
            
            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            entity.subcategory = "restaurant"
            entity.title   = name
            entity.address = addr
            entity.sources.bostonmag = { }
            
            self._output.put(entity)
        
        # try the next page
        next_page_ending = soup.find('div', { 'class' : 'right_align' }).findAll('a')
        next_page = ''
        
        for n in next_page_ending:
            if 'Next' in str(n):
                next_page = href.replace(href[href.find('?'):], n.get('href'))
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)

from crawler import EntitySources
EntitySources.registerSource('bostonmag', BostonMagCrawler)

Example #27
                
            if name == '':
                continue 
            
            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            entity.subcategory = "restaurant"
            entity.title   = name
            entity.address = addr
            entity.sources.sfgate = { }
            
            self._output.put(entity)
        
        # try the next page
        try: 
            href_get = soup.find('li', { 'class' : 'next' }).find('a').get('href')
            next_page = '{0}{1}'.format('http://www.sfgate.com', href_get)
        except Exception: 
            next_page = ''
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)

from crawler import EntitySources
EntitySources.registerSource('sfgate', SFGateCrawler)