Example #1
    def __init__(self, config):
        self._conf = config
        self._hits = []
        self._recenthits = []
        self._sf = StorageFilters()
Example #2
import operator
from collections import Counter

# Note: this is Python 2 code -- filter() and map() return lists, reduce()
# is a builtin and dict.iteritems() is used below. StorageFilters and the
# HTMLParser used in list_searches are assumed to be provided elsewhere in
# the project.


class MemoryStorage(object):

    def __init__(self, config):
        self._conf = config
        self._hits = []
        self._recenthits = []
        self._sf = StorageFilters()
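        # StorageFilters is used throughout as a collection of predicate
        # factories: each filter_*() call below returns a function that is
        # handed to the builtin filter().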

    def clear_hits(self, days=7):
        # This in-memory backend ignores the days argument and simply drops
        # everything it has stored.
        self._hits = []
        self._recenthits = []

    def add_hit(self, hit):
        hitobj = {'url': hit.url(), 'timestamp': hit.timestamp(),
                  'keywords': hit.keywords(), 'path': hit.path(),
                  'title': hit.title(), 'source': hit.source()}
        if not self._sf.filter_path(hit.path()):
            # Don't store hits for blacklisted paths
            return
        self._hits.append(hitobj)
        self._recenthits.append(hitobj)
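        # Cap the recent-hits list at the configured recenthits_size,
        # keeping only the newest entries.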
        recenthits_size = int(self._conf['recenthits_size'])
        if len(self._recenthits) > recenthits_size:
            self._recenthits = self._recenthits[-recenthits_size:]

    def get_recenthits(self, sources, last_timestamp=0):
        """Return the recent hits newer than last_timestamp, restricted to
        the given sources.
        """
        recenthits = self._recenthits
        recenthits = filter(self._sf.filter_timestamp(start_time=last_timestamp),
                            recenthits)
        recenthits = filter(self._sf.filter_sources(sources), recenthits)
        return recenthits

    def list_urls(self, unique=False, start_time=None, end_time=None,
                  minimum_hits=1):
        """Returns a list of all the urls. Optional parameters:
        unique        Return only unique urls.
        start_time    Return only urls requested after this timestamp.
        end_time      Return only urls requested before this timestamp.
        minimum_hits  Return only urls with at least this many hits.
        """
        hits = self._hits

        hits = filter(self._sf.filter_timestamp(start_time, end_time),
                      hits)

        urls = map(operator.itemgetter('url'), hits)

        # Apply minimum_hits as documented: keep only urls that occur at
        # least minimum_hits times in the filtered hit list.
        if minimum_hits > 1:
            counts = Counter(urls)
            urls = [url for url in urls if counts[url] >= minimum_hits]

        if unique:
            return list(set(urls))

        return urls

    def get_hitcount(self, url, start_time=None, end_time=None):
        """Returns the number of hits for a specific url. Optional parameters:
        start_time  Count only hits after this timestamp.
        end_time    Count only hits before this timestamp.
        """
        hits = self._hits

        hits = filter(self._sf.filter_url(url), hits)
        hits = filter(self._sf.filter_timestamp(start_time, end_time),
                      hits)

        return len(hits)

    def get_hitcounts(self, start_time=None, end_time=None, minimum_hits=1,
                      qfield='hit_path'):
        """Return a dictionary of hitcounts in the format {name: count},
        keyed by url, title or path depending on qfield. Optional parameters:
        start_time    Count only hits after this timestamp.
        end_time      Count only hits before this timestamp.
        minimum_hits  Return only entries with at least this many hits.
        """
        hits = self._hits

        hits = filter(self._sf.filter_timestamp(start_time, end_time),
                      hits)

        # Get a dictionary like {url: count} or {path: count}
        if qfield == 'hit_url':
            hitcounts = Counter(map(operator.itemgetter('url'), hits))
        elif qfield == 'hit_title':
            hitcounts = Counter(map(operator.itemgetter('title'), hits))
        else:
            hitcounts = Counter(map(operator.itemgetter('path'), hits))
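        # hitcounts is now e.g. Counter({'/index.html': 12, '/about.html': 3})
        # (illustrative values only).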

        # Iterate over the hitcounts, putting them through the filter function.
        # Items are removed if the filter function returns false.
        hitcounts = dict(filter(self._sf.filter_hitcount(minimum_hits),
                                hitcounts.iteritems()))
        return hitcounts

    def get_keywords(self, url=None, start_time=None, end_time=None,
                     minimum_count=None):
        """Get all keywords and their counts.
        Returns a dictionary: {keyword: count}. Optional parameters:
        url            Count keywords for this url only.
        start_time     Count only hits after this timestamp.
        end_time       Count only hits before this timestamp.
        minimum_count  Return only keywords seen at least this many times.
        """
        hits = self._hits

        hits = filter(self._sf.filter_url(url), hits)
        hits = filter(self._sf.filter_timestamp(start_time, end_time),
                      hits)

        # Iterate over keywords in hits, combining them in a single list.
        # Then use Counter to get a dictionary like {keyword: count}
        keywords = Counter(reduce(operator.add,
                                  map(operator.itemgetter('keywords'), hits),
                                  []))

        keywords = dict(filter(self._sf.filter_keywordcount(minimum_count),
                        keywords.iteritems()))
        keywords = dict(filter(self._sf.filter_keywords(self._conf),
                        keywords.iteritems()))
        return keywords

    def list_searches(self, keyword=None, limit=None):
        """List all the search phrases which contain the given keyword, or
        all phrases if no keyword is given.
        """
        phrases = []
        sources = map(operator.itemgetter('source'), self._hits)
        sources = list(set(sources))  # make unique
        htmlparser = HTMLParser()
        for source in sources:
            if source.startswith('searches'):
                qpos = source.find(': ')
                if qpos > 0:
                    # The search phrase follows the ': ' separator in the
                    # source string.
                    phrase = source[qpos + 2:]
                    if keyword is None or keyword in phrase:
                        phrases.append(htmlparser.escape(phrase))
        if limit is not None:
            return phrases[:limit]
        return phrases
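
A minimal usage sketch of the class above, under Python 2 (which the listing targets). The SimpleHit class, the config values and the queried data are illustrative only, and the sketch assumes a StorageFilters implementation whose filter_* factories behave as MemoryStorage expects; none of this is part of the original project.

import time


class SimpleHit(object):
    # Illustrative stand-in for the hit objects that add_hit expects: it
    # only needs the six accessor methods used in MemoryStorage.add_hit.
    def __init__(self, url, path, title, source, keywords):
        self._data = {'url': url, 'path': path, 'title': title,
                      'source': source, 'keywords': keywords,
                      'timestamp': int(time.time())}

    def url(self):
        return self._data['url']

    def path(self):
        return self._data['path']

    def title(self):
        return self._data['title']

    def source(self):
        return self._data['source']

    def keywords(self):
        return self._data['keywords']

    def timestamp(self):
        return self._data['timestamp']


# Hypothetical config; a real deployment also needs whatever keys
# StorageFilters reads.
config = {'recenthits_size': '50'}
storage = MemoryStorage(config)
storage.add_hit(SimpleHit('http://example.com/docs/', '/docs/', 'Docs',
                          'searches (web): memory storage',
                          ['memory', 'storage']))

print(storage.list_urls(unique=True))
print(storage.get_hitcounts(qfield='hit_url'))
print(storage.list_searches(keyword='memory'))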