Ejemplo n.º 1
    def __init__(self, max_gram_size, dictionary_file, stopwords_file, use_bloomfilter=False, timing=False):
        """
        Build the n-gram filter, the dictionary/stopword searchers and,
        optionally, a BloomFilter over the dictionary.

        Constructor arguments:
        max_gram_size -- The longest phrase (in words) you want to search for in the dictionary.
        The longer your grams, the more permutations generated (to be searched) adding time to the extract.

        dictionary_file -- The path to your dictionary file.  A sorted, lower-cased, list of phrases.

        stopwords_file -- The path to your stopwords file (words or phrases to be excluded).
        This file should also be sorted and lower-cased.

        use_bloomfilter -- Whether or not to use a BloomFilter.  If True extracts will run several
        times faster.  This takes several minutes to build the very first time you run it.

        timing -- Stored as self.timing; presumably switches on timing output
        elsewhere in the class (not visible from this method).
        """
        self.ngfilter = NGramFilter(max_gram_size)
        self.searcher = Searcher(dictionary_file)
        self.stopwords = Searcher(stopwords_file)
        self.timing = timing
        if use_bloomfilter:
            # NOTE(review): the handle is deliberately left open -- it is not
            # visible here whether BloomFilter drains the iterator inside its
            # constructor.  If it does, wrap this in `with open(...) as f:`.
            f = open(dictionary_file)
            self.bloom = BloomFilter(self.BF_BYTES, self.BF_HASHES, iter(f))
        else:
            # Without this branch the freshly built filter was overwritten
            # with None and never used.
            self.bloom = None
Ejemplo n.º 2
class Extractor:
    """
    Extracts dictionary phrases (minus stopwords) from text or from a web page.

    Extractor Constants:

    When you instantiate a BloomFilter, you will see the false-positive rate displayed.
    False-positives do not introduce any error, since all 'positive' terms are searched in
    the dictionary.  However, more false-positives means more unnecessary dictionary lookups,
    which will slow down extracts.

    The false-positive rate of the BloomFilter may be tuned with the parameters below:
    (Right now these settings reflect a dictionary size of approximately 10MM entries)

    BF_HASHES: The number of hash functions used by the BloomFilter.   The higher this number
    is, the lower the false-positive rate drops.  However each hash must be computed for every
    lookup, so keeping a smaller number of hashes speeds things up.

    BF_BYTES: This is the size of the bitset in use by the BloomFilter.  The bigger the bitset,
    the lower the false-positive rate.  However, the larger the bitset, the more data needs to
    be read into memory at startup (and checked when searched)
    """

    BF_HASHES  = 5
    BF_BYTES   = 8192 * 1024  # 8MB

    def __init__(self, max_gram_size, dictionary_file, stopwords_file, use_bloomfilter=False, timing=False):
        """
        Constructor arguments:
        max_gram_size -- The longest phrase (in words) you want to search for in the dictionary.
        The longer your grams, the more permutations generated (to be searched) adding time to the extract.

        dictionary_file -- The path to your dictionary file.  A sorted, lower-cased, list of phrases.

        stopwords_file -- The path to your stopwords file (words or phrases to be excluded).
        This file should also be sorted and lower-cased.

        use_bloomfilter -- Whether or not to use a BloomFilter.  If True extracts will run several
        times faster.  This takes several minutes to build the very first time you run it.

        timing -- If True, extract() prints how long each call took, in milliseconds.
        """
        self.ngfilter = NGramFilter(max_gram_size)
        self.searcher = Searcher(dictionary_file)
        self.stopwords = Searcher(stopwords_file)
        self.timing = timing
        if use_bloomfilter:
            # NOTE(review): the handle is deliberately left open -- it is not
            # visible here whether BloomFilter drains the iterator inside its
            # constructor.  If it does, wrap this in `with open(...) as f:`.
            f = open(dictionary_file)
            self.bloom = BloomFilter(self.BF_BYTES, self.BF_HASHES, iter(f))
        else:
            # Without this branch the freshly built filter was overwritten
            # with None and never consulted by extract().
            self.bloom = None

    def extract(self, text, case_sensitive=False):
        """
        Extracts any phrases found in the dictionary (and not in the stopwords) from the text provided.

        text -- The text to be searched
        case_sensitive -- Whether or not matches must be case-sensitive (requires provided dictionary
        to be case-sensitive as well.)

        Returns a set of the matching grams.
        """
        extracts = []

        if self.timing:
            t1 = time.time()

        # TODO: Refactor
        # NOTE(review): the original error handling around this conversion was
        # lost; falling back to the raw text on a Unicode error is a
        # reconstruction -- confirm against the upstream project.
        try:
            text = remove_accents(unicode(text))
        except UnicodeError:
            text = remove_accents(text)

        # TODO: Refactor
        if not case_sensitive:
            text = text.lower()

        # TODO: Refactor
        # (e.g. "Japanese-inspired") TODO: Copy out the token variation
        text = text.replace('-', ' ')

        # TODO: Refactor
        tokens = text.split(' ')
        terms = []
        for t in tokens:  # trim any commas, periods, etc..
            # str.split never yields None and str.strip never returns None, so
            # the original `is not None` checks were always true; every token
            # that converts cleanly is kept (including empty ones).
            try:
                t = unicode(t)               # make sure this is unicode
                t = convert_smart_quotes(t)  # deal with curly quotes
            except UnicodeError:
                # NOTE(review): reconstructed -- an undecodable token is
                # skipped so the joined text stays uniformly unicode.
                continue
            terms.append(t.strip(",.\'<>!?() "))  # then strip them (and others)

        text = " ".join(terms)

        for gram in self.ngfilter.filter(text):
            if self.bloom and gram not in self.bloom:
                # The filter reports the gram as absent, so the dictionary
                # lookup is skipped entirely.
                continue
            elif gram in self.searcher and gram not in self.stopwords:
                extracts.append(gram)

        if self.timing:
            t2 = time.time()
            # Parenthesised single argument: prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print('extract returned in %0.3f ms' % ((t2 - t1) * 1000.0))

        return set(extracts)

    def _extract_meta(self, soup, unescape, attr_name, pattern, content_attrs=('content',)):
        """
        Run extract() on the text carried by one <meta> tag.

        soup -- The parsed page; its find() is used to locate the tag.
        unescape -- Callable that decodes HTML entities in the tag text.
        attr_name -- The meta attribute to match on ('name' or 'property').
        pattern -- Case-insensitive regular expression the attribute must match.
        content_attrs -- Attribute names to read the text from, in priority order.

        Returns the extract() result, or None when the tag or all of the
        listed attributes are missing.
        """
        tag = soup.find('meta', attrs={attr_name: re.compile(pattern, re.I)})
        if tag is None:
            return None
        for key in content_attrs:
            # Tag.get returns None for a missing attribute; it replaces the
            # deprecated Tag.has_key check.
            value = tag.get(key)
            if value is not None:
                return self.extract(unescape(value))
        return None

    # Uses BS4:  ($ sudo easy_install beautifulsoup4 html5lib)
    def extract_url(self, url):
        """
        Extracts any phrases found in the dictionary (and not in the stopwords) from information in the page
        referenced by url.

        Returns a dict with the keys "title", "meta-description",
        "meta-keywords", "og:title" and "og:description".  Each value is the
        extract() result for that field, or None when the page lacks it.
        """
        import HTMLParser
        from contextlib import closing
        from bs4 import BeautifulSoup
        from urllib import urlopen

        h = HTMLParser.HTMLParser()
        # Close the HTTP response as soon as the body has been read.
        with closing(urlopen(url)) as response:
            soup = BeautifulSoup(response.read(), "html5lib")

        results = {}

        # A page may have no <title>, or an empty one; report None instead of
        # raising, as is done for the meta fields.
        title = soup.title.string if soup.title is not None else None
        results["title"] = self.extract(h.unescape(title)) if title is not None else None

        results["meta-description"] = self._extract_meta(soup, h.unescape, 'name', "^description$")
        results["meta-keywords"] = self._extract_meta(soup, h.unescape, 'name', "^keywords$")
        results["og:title"] = self._extract_meta(soup, h.unescape, 'property', "^og:title$")
        # Open Graph tags carry their text in `content`; `value` is still
        # accepted as a fallback because the earlier code read only that.
        results["og:description"] = self._extract_meta(
            soup, h.unescape, 'property', "^og:description$",
            content_attrs=('content', 'value'))

        return results