Example #1
import json

import requests

# debug() is assumed to come from the surrounding lltk helpers.

def google(language, word, n=8, *args, **kwargs):
	''' Downloads suitable images for a given word from Google Images. '''

	kwargs.setdefault('start', 0)
	kwargs.setdefault('itype', 'photo|clipart|lineart')
	kwargs.setdefault('isize', 'small|medium|large|xlarge')
	kwargs.setdefault('filetype', 'jpg')

	info = {
		'q': word,
		'hl': language,
		'start': str(kwargs['start']),
		'as_filetype': kwargs['filetype'],
		'imgsz': kwargs['isize'],
		'imgtype': kwargs['itype'],
		'rsz': '8',
		'safe': 'active',
	}
	query = '&'.join('%s=%s' % (k, v) for k, v in info.items())
	url = 'https://ajax.googleapis.com/ajax/services/search/images?v=1.0&' + query

	debug('Loading ' + str(url) + '...')
	page = requests.get(url)
	data = json.loads(page.text)
	images = []

	if data and data.get('responseData'):
		items = data['responseData']['results']
		if items:
			images += [item['url'] for item in items]
			if len(images) < int(n):
				kwargs['start'] += 8
				images += google(language, word, n, *args, **kwargs)
	return images[:int(n)]
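
A minimal usage sketch for the function above, assuming the module is importable and the lltk debug() helper is available; the word and keyword values are purely illustrative. Note that the Google AJAX Search API targeted here has since been retired, so the request itself will no longer succeed.

urls = google('en', 'apple', n=4, itype='photo', isize='medium')
for url in urls:
	print(url)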
Example #2
    def _scrape(self, method, *args, **kwargs):

        from datetime import datetime

        results = []

        for Scraper in self.iterscrapers(method):
            scraper = Scraper(self.word)
            function = getattr(scraper, method)
            # Build a unique, human-readable cache key for this scraper call
            delimiter = '-'
            key = delimiter.join([
                _f for _f in [
                    scraper.language, method,
                    scraper.name.lower(),
                    scraper.word.lower(),
                    delimiter.join(args)
                ] if _f
            ])
            key = key.strip()
            key = key.replace(' ', delimiter)
            extradata = {
                'type': 'lltk-scraping-cache',
                'language': scraper.language,
                'word': scraper.word,
                'method': method,
                'source': scraper.name,
                'url': scraper.url,
                'added': datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
            }
            if config['caching']:
                function = cached(key, extradata)(function)
            result = function(*args, **kwargs)
            debug(u'%s: %s.%s(\'%s\') → %s (\'%s\')' %
                  (scraper.name, scraper.language, method, scraper.word,
                   result, scraper.url))
            if not isempty(result):
                self.source = scraper
                results.append(result)

        # Remove empty or incomplete answers
        self.results = self.clean(results)
        self.results = self.merge(self.results)

        if config['debug']:
            for i in range(len(self.results)):
                debug('%d) %s' % (i + 1, self.results[i]))

        if self.results:
            if kwargs.get('mode') == 'all' or config['scraping-results-mode'] == 'all':
                # Return all results
                self.result = self.results
            else:
                # Return the first result (which is the best guess since the list is sorted by frequency of occurrence)
                self.result = self.results[0]
        else:
            self.result = [None]
        return self.result
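
To make the cache-key construction above concrete, here is a standalone sketch using hypothetical values for the language, method, scraper name, word and extra arguments:

delimiter = '-'
parts = ['de', 'translate', 'Pons'.lower(), 'Haus'.lower(), delimiter.join(('en',))]
key = delimiter.join([p for p in parts if p]).strip().replace(' ', delimiter)
# key is now 'de-translate-pons-haus-en'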
Example #3
def enable(identifier=None, *args, **kwargs):
	''' Enables a specific cache for the current session. Remember that it has to be registered. '''

	global cache
	if not identifier:
		for item in (config['default-caches'] + ['NoCache']):
			if item in caches:
				debug('Enabling default cache %s...' % (item,))
				cache = caches[item](*args, **kwargs)
				if not cache.status():
					warning('%s could not be loaded. Is the backend running (%s:%d)?' % (item, cache.server, cache.port))
					continue
				# This means that the cache backend was set up successfully
				break
			else:
				debug('Cache backend %s is not registered. Are all requirements satisfied?' % (item,))
	elif identifier in caches:
		debug('Enabling cache %s...' % (identifier,))
		previouscache = cache
		cache = caches[identifier](*args, **kwargs)
		if not cache.status():
			warning('%s could not be loaded. Is the backend running (%s:%d)?' % (identifier, cache.server, cache.port))
			cache = previouscache
	else:
		debug('Cache backend %s is not registered. Are all requirements satisfied?' % (identifier,))
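
A hypothetical session showing how enable() is meant to be called; the 'MongoDB' identifier and the keyword arguments are illustrative and assume such a backend was registered in the caches dictionary beforehand:

# Fall back to the configured default caches:
enable()
# Or enable one specific, previously registered backend:
enable('MongoDB', server='localhost', port=27017)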
Example #4
def discover(language):
	''' Discovers all registered scrapers to be used for the generic scraping interface. '''

	debug('Discovering scrapers for \'%s\'...' % (language,))
	global scrapers, discovered
	# Iterate over a separate variable so that the language parameter is not shadowed.
	for lang in scrapers:
		discovered[lang] = {}
		for scraper in scrapers[lang]:
			blacklist = ['download', 'isdownloaded', 'getelements']
			methods = [
				method for method in dir(scraper)
				if method not in blacklist and not method.startswith('_')
				and callable(getattr(scraper, method))
			]
			for method in methods:
				discovered[lang].setdefault(method, []).append(scraper)
	debug('%d scrapers with %d methods (overall) registered for \'%s\'.' % (len(scrapers[language]), len(discovered[language]), language))
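
For reference, the global registry populated above has (hypothetically) the following shape, mapping each public scraper method to the scraper classes that provide it, per language; the class names are made up:

# discovered = {
# 	'de': {'translate': [PonsDe, DictCc], 'gender': [PonsDe]},
# 	'en': {'translate': [DictCc]},
# }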
Example #5
def discover(language):
    ''' Discovers all registered scrapers to be used for the generic scraping interface. '''

    debug('Discovering scrapers for \'%s\'...' % (language, ))
    global scrapers, discovered
    # Iterate over a separate variable so that the language parameter is not shadowed.
    for lang in scrapers:
        discovered[lang] = {}
        for scraper in scrapers[lang]:
            blacklist = ['download', 'isdownloaded', 'getelements']
            methods = [
                method for method in dir(scraper)
                if method not in blacklist and not method.startswith('_')
                and callable(getattr(scraper, method))
            ]
            for method in methods:
                discovered[lang].setdefault(method, []).append(scraper)
    debug('%d scrapers with %d methods (overall) registered for \'%s\'.' %
          (len(scrapers[language]), len(discovered[language]), language))
Example #6
	def _scrape(self, method, *args, **kwargs):

		from datetime import datetime

		results = []

		for Scraper in self.iterscrapers(method):
			scraper = Scraper(self.word)
			function = getattr(scraper, method)
			delimiter = '-'
			key = delimiter.join(filter(None, [scraper.language, method, scraper.name.lower(), scraper.word.lower(), delimiter.join(args)]))
			key = key.strip()
			key = key.replace(' ', delimiter)
			extradata = {'type' : 'lltk-scraping-cache','language' : scraper.language, 'word' : scraper.word, 'method' : method, 'source' : scraper.name, 'url' : scraper.url, 'added' : datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}
			if config['caching']:
				function = cached(key, extradata)(function)
			result = function(*args, **kwargs)
			debug(u'%s: %s.%s(\'%s\') → %s (\'%s\')' % (scraper.name, scraper.language, method, scraper.word, result, scraper.url))
			if not isempty(result):
				self.source = scraper
				results.append(result)

		# Remove empty or incomplete answers
		self.results = self.clean(results)
		self.results = self.merge(self.results)

		if config['debug']:
			for i in range(len(self.results)):
				debug('%d) %s' % (i + 1, self.results[i]))

		if self.results:
			if kwargs.get('mode') == 'all' or config['scraping-results-mode'] == 'all':
				# Return all results
				self.result = self.results
			else:
				# Return the first result (which is the best guess since the list is sorted by frequency of occurrence)
				self.result = self.results[0]
		else:
			self.result = [None]
		return self.result
Example #7
def google(language, word, n=8, *args, **kwargs):
    ''' Downloads suitable images for a given word from Google Images. '''

    if 'start' not in kwargs:
        kwargs['start'] = 0
    if 'itype' not in kwargs:
        kwargs['itype'] = 'photo|clipart|lineart'
    if 'isize' not in kwargs:
        kwargs['isize'] = 'small|medium|large|xlarge'
    if 'filetype' not in kwargs:
        kwargs['filetype'] = 'jpg'

    info = {
        'q': word,
        'hl': language,
        'start': str(kwargs['start']),
        'as_filetype': kwargs['filetype'],
        'imgsz': kwargs['isize'],
        'imgtype': kwargs['itype'],
        'rsz': '8',
        'safe': 'active'
    }
    query = '&'.join([x[0] + '=' + x[1] for x in info.items()])
    url = 'https://ajax.googleapis.com/ajax/services/search/images?v=1.0&' + query

    debug('Loading ' + str(url) + '...')
    page = requests.get(url)
    data = json.loads(page.text)
    images = []

    if data and 'responseData' in data and data['responseData']:
        items = data['responseData']['results']
        if items:
            images += [item['url'] for item in items]
            if len(images) < int(n):
                kwargs['start'] += 8
                images += google(language, word, n, *args, **kwargs)
    return images[:int(n)]
Example #8
		def wrapper(*args, **kwargs):

			uid = key
			if not uid:
				from hashlib import md5
				arguments = list(args) + [(a, kwargs[a]) for a in sorted(kwargs.keys())]
				uid = md5(str(arguments).encode('utf-8')).hexdigest()
			if exists(uid):
				debug('Item \'%s\' is cached (%s).' % (uid, cache))
				return get(uid)
			else:
				debug('Item \'%s\' is not cached (%s).' % (uid, cache))
				result = f(*args, **kwargs)
				debug('Caching result \'%s\' as \'%s\' (%s)...' % (result, uid, cache))
				debug('Extra data: %s' % (extradata or 'None'))
				put(uid, result, extradata)
				return result
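
This wrapper is the innermost layer of what is presumably the cached() decorator factory used in Examples #2 and #6. A minimal sketch of the enclosing structure, assuming exists(), get() and put() are the active cache's primitives as used above:

def cached(key=None, extradata={}):
	''' Decorator factory: cache f's result under key, or under an md5 digest of its arguments. '''
	def decorator(f):
		def wrapper(*args, **kwargs):
			...  # body as shown in Example #8
		return wrapper
	return decorator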