def google(language, word, n = 8, *args, **kwargs):
    ''' Downloads suitable images for a given word from Google Images. '''
    if not kwargs.has_key('start'):
        kwargs['start'] = 0
    if not kwargs.has_key('itype'):
        kwargs['itype'] = 'photo|clipart|lineart'
    if not kwargs.has_key('isize'):
        kwargs['isize'] = 'small|medium|large|xlarge'
    if not kwargs.has_key('filetype'):
        kwargs['filetype'] = 'jpg'
    info = {
        'q' : word,
        'hl' : language,
        'start' : str(kwargs['start']),
        'as_filetype' : kwargs['filetype'],
        'imgsz' : kwargs['isize'],
        'imgtype' : kwargs['itype'],
        'rsz' : '8',
        'safe' : 'active'
    }
    query = '&'.join([x[0] + '=' + x[1] for x in info.items()])
    url = 'https://ajax.googleapis.com/ajax/services/search/images?v=1.0&' + query
    debug('Loading ' + unicode(url) + '...')
    page = requests.get(url)
    data = json.loads(page.text)
    images = []
    if data and data.has_key('responseData') and data['responseData']:
        items = data['responseData']['results']
        if items:
            images += [item['url'] for item in items]
    if len(images) < int(n):
        kwargs['start'] += 8
        images += google(language, word, n, *args, **kwargs)
    return images[:int(n)]
def _scrape(self, method, *args, **kwargs):
    results = []
    for Scraper in self.iterscrapers(method):
        scraper = Scraper(self.word)
        function = getattr(scraper, method)
        delimiter = '-'
        key = delimiter.join([
            _f for _f in [
                scraper.language,
                method,
                scraper.name.lower(),
                scraper.word.lower(),
                delimiter.join(args)
            ] if _f
        ])
        key = key.strip()
        key = key.replace(' ', delimiter)
        from datetime import datetime
        extradata = {
            'type': 'lltk-scraping-cache',
            'language': scraper.language,
            'word': scraper.word,
            'method': method,
            'source': scraper.name,
            'url': scraper.url,
            'added': datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
        }
        if config['caching']:
            function = cached(key, extradata)(function)
        result = function(*args, **kwargs)
        debug(u'%s: %s.%s(\'%s\') → %s (\'%s\')' % (scraper.name, scraper.language, method, scraper.word, result, scraper.url))
        if not isempty(result):
            self.source = scraper
            results.append(result)
    # Remove empty or incomplete answers
    self.results = self.clean(results)
    self.results = self.merge(self.results)
    if config['debug']:
        for i in range(len(self.results)):
            debug('%d) %s' % (i + 1, self.results[i]))
    if self.results:
        if ('mode' in kwargs and kwargs['mode'] == 'all') or config['scraping-results-mode'] == 'all':
            # Return all results
            self.result = self.results
        else:
            # Return the first result (which is the best guess since the list is sorted by frequency of occurrence)
            self.result = self.results[0]
    else:
        self.result = [None]
    return self.result
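# Illustration only: a minimal sketch of the cache key built in the loop above,
# using made-up scraper attributes (language 'de', method 'gender', scraper name
# 'somescraper', word 'haus') rather than a real lltk scraper. The key is the
# language, method, scraper name, word and any positional arguments, joined by
# '-', with spaces replaced by the same delimiter.
delimiter = '-'
parts = ['de', 'gender', 'somescraper', 'haus', delimiter.join(())]
key = delimiter.join([_f for _f in parts if _f]).strip().replace(' ', delimiter)
# key is now 'de-gender-somescraper-haus'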
def enable(identifier = None, *args, **kwargs):
    ''' Enables a specific cache for the current session. Remember that it has to be registered. '''
    global cache
    if not identifier:
        for item in (config['default-caches'] + ['NoCache']):
            if caches.has_key(item):
                debug('Enabling default cache %s...' % (item,))
                cache = caches[item](*args, **kwargs)
                if not cache.status():
                    warning('%s could not be loaded. Is the backend running (%s:%d)?' % (item, cache.server, cache.port))
                    continue
                # This means that the cache backend was set up successfully
                break
            else:
                debug('Cache backend %s is not registered. Are all requirements satisfied?' % (item,))
    elif caches.has_key(identifier):
        debug('Enabling cache %s...' % (identifier,))
        previouscache = cache
        cache = caches[identifier](*args, **kwargs)
        if not cache.status():
            warning('%s could not be loaded. Is the backend running (%s:%d)?' % (identifier, cache.server, cache.port))
            cache = previouscache
    else:
        debug('Cache backend %s is not registered. Are all requirements satisfied?' % (identifier,))
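# Usage sketch. The backend name and connection settings in the last call are
# hypothetical; the only identifier guaranteed by the code above is the
# 'NoCache' fallback appended to config['default-caches'].
enable()                                            # try each default cache until one responds
enable('NoCache')                                   # explicitly switch to the no-op cache backend
enable('MongoDB', server='localhost', port=27017)   # assumes a registered 'MongoDB' backend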
def discover(language):
    ''' Discovers all registered scrapers to be used for the generic scraping interface. '''
    debug('Discovering scrapers for \'%s\'...' % (language,))
    global scrapers, discovered
    for language in scrapers.iterkeys():
        discovered[language] = {}
        for scraper in scrapers[language]:
            blacklist = ['download', 'isdownloaded', 'getelements']
            methods = [method for method in dir(scraper) if method not in blacklist and not method.startswith('_') and callable(getattr(scraper, method))]
            for method in methods:
                if discovered[language].has_key(method):
                    discovered[language][method].append(scraper)
                else:
                    discovered[language][method] = [scraper]
        debug('%d scrapers with %d methods (overall) registered for \'%s\'.' % (len(scrapers[language]), len(discovered[language].keys()), language))
def discover(language):
    ''' Discovers all registered scrapers to be used for the generic scraping interface. '''
    debug('Discovering scrapers for \'%s\'...' % (language,))
    global scrapers, discovered
    for language in scrapers.keys():
        discovered[language] = {}
        for scraper in scrapers[language]:
            blacklist = ['download', 'isdownloaded', 'getelements']
            methods = [
                method for method in dir(scraper)
                if method not in blacklist
                and not method.startswith('_')
                and callable(getattr(scraper, method))
            ]
            for method in methods:
                if method in discovered[language]:
                    discovered[language][method].append(scraper)
                else:
                    discovered[language][method] = [scraper]
        debug('%d scrapers with %d methods (overall) registered for \'%s\'.' % (len(scrapers[language]), len(list(discovered[language].keys())), language))
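# Sketch only: how the `discovered` registry built above could be consulted by
# the generic scraping interface. The real `iterscrapers()` used by `_scrape()`
# is not shown in this file, so this lookup helper is an assumption.
def iterscrapers_sketch(language, method):
    # Yield every scraper class registered for `language` that implements `method`.
    for Scraper in discovered.get(language, {}).get(method, []):
        yield Scraper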
def _scrape(self, method, *args, **kwargs):
    results = []
    for Scraper in self.iterscrapers(method):
        scraper = Scraper(self.word)
        function = getattr(scraper, method)
        delimiter = '-'
        key = delimiter.join(filter(None, [scraper.language, method, scraper.name.lower(), scraper.word.lower(), delimiter.join(args)]))
        key = key.strip()
        key = key.replace(' ', delimiter)
        from datetime import datetime
        extradata = {'type' : 'lltk-scraping-cache', 'language' : scraper.language, 'word' : scraper.word, 'method' : method, 'source' : scraper.name, 'url' : scraper.url, 'added' : datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}
        if config['caching']:
            function = cached(key, extradata)(function)
        result = function(*args, **kwargs)
        debug(u'%s: %s.%s(\'%s\') → %s (\'%s\')' % (scraper.name, scraper.language, method, scraper.word, result, scraper.url))
        if not isempty(result):
            self.source = scraper
            results.append(result)
    # Remove empty or incomplete answers
    self.results = self.clean(results)
    self.results = self.merge(self.results)
    if config['debug']:
        for i in xrange(len(self.results)):
            debug('%d) %s' % (i + 1, self.results[i]))
    if self.results:
        if (kwargs.has_key('mode') and kwargs['mode'] == 'all') or config['scraping-results-mode'] == 'all':
            # Return all results
            self.result = self.results
        else:
            # Return the first result (which is the best guess since the list is sorted by frequency of occurrence)
            self.result = self.results[0]
    else:
        self.result = [None]
    return self.result
def google(language, word, n=8, *args, **kwargs):
    ''' Downloads suitable images for a given word from Google Images. '''
    if 'start' not in kwargs:
        kwargs['start'] = 0
    if 'itype' not in kwargs:
        kwargs['itype'] = 'photo|clipart|lineart'
    if 'isize' not in kwargs:
        kwargs['isize'] = 'small|medium|large|xlarge'
    if 'filetype' not in kwargs:
        kwargs['filetype'] = 'jpg'
    info = {
        'q': word,
        'hl': language,
        'start': str(kwargs['start']),
        'as_filetype': kwargs['filetype'],
        'imgsz': kwargs['isize'],
        'imgtype': kwargs['itype'],
        'rsz': '8',
        'safe': 'active'
    }
    query = '&'.join([x[0] + '=' + x[1] for x in info.items()])
    url = 'https://ajax.googleapis.com/ajax/services/search/images?v=1.0&' + query
    debug('Loading ' + str(url) + '...')
    page = requests.get(url)
    data = json.loads(page.text)
    images = []
    if data and 'responseData' in data and data['responseData']:
        items = data['responseData']['results']
        if items:
            images += [item['url'] for item in items]
    if len(images) < int(n):
        kwargs['start'] += 8
        images += google(language, word, n, *args, **kwargs)
    return images[:int(n)]
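# Usage sketch. The word, language and keyword arguments below are example
# values only; note that the endpoint used above is Google's long-deprecated
# AJAX image search API, so live calls may no longer return results.
urls = google('en', 'tree', n=4, filetype='png')
for u in urls:
    print(u)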
def wrapper(*args, **kwargs):
    uid = key
    if not uid:
        from hashlib import md5
        arguments = list(args) + [(a, kwargs[a]) for a in sorted(kwargs.keys())]
        uid = md5(str(arguments)).hexdigest()
    if exists(uid):
        debug('Item \'%s\' is cached (%s).' % (uid, cache))
        return get(uid)
    else:
        debug('Item \'%s\' is not cached (%s).' % (uid, cache))
        result = f(*args, **kwargs)
        debug('Caching result \'%s\' as \'%s\' (%s)...' % (result, uid, cache))
        debug('Extra data: ' + (str(extradata) or 'None'))
        put(uid, result, extradata)
        return result
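# Sketch only: the `wrapper` above reads like the innermost closure of the
# `cached(key, extradata)` decorator factory that `_scrape()` applies to each
# scraper method. The outer layers are not shown in this file, so this
# reconstruction is an assumption; `exists()`, `get()` and `put()` are taken to
# be the module-level cache helpers.
def cached(key=None, extradata=None):
    def decorator(f):
        def wrapper(*args, **kwargs):
            pass  # body as shown above, closing over `key`, `extradata` and `f`
        return wrapper
    return decorator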