Example #1
    def test_bigrams(self):
        # Note: Tokens look weird after being analyzed probably due to
        # the stemmer. We could write a bunch of code to "undo" some
        # of the excessive stemming, but it's probably an exercise in
        # futility. Ergo the tests look a little odd. e.g. "youtub"

        # One word a bigram does not make
        assert compute_grams(u'youtube') == []

        # Two words is the minimum number to create a bigram
        assert(
            sorted(compute_grams(u'youtube crash')) ==
            ['crash youtube']
            )

        # Three words create two bigrams
        assert(
            sorted(compute_grams(u'youtube crash flash')) ==
            ['crash flash', 'crash youtube']
            )

        # Four words create three bigrams
        assert(
            sorted(compute_grams(u'youtube crash flash bridge')) ==
            ['bridge flash', 'crash flash', 'crash youtube']
            )

        # Nix duplicate bigrams
        assert(
            sorted(compute_grams(u'youtube crash youtube flash')) ==
            ['crash youtube', 'flash youtube']
            )
Example #2
    def test_parsing(self):
        # stop words are removed
        eq_(compute_grams(u'i me him her'), [])

        # capital letters don't matter
        eq_(compute_grams(u'I ME HIM HER'), [])

        # punctuation nixed
        eq_(compute_grams(u'i, me, him, her'), [])
Example #3
    def test_parsing(self):
        # stop words are removed
        assert compute_grams(u'i me him her') == []

        # capital letters don't matter
        assert compute_grams(u'I ME HIM HER') == []

        # punctuation nixed
        assert compute_grams(u'i, me, him, her') == []
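
Taken together, the tests above pin down the observable behaviour of compute_grams: lowercase the input, strip punctuation, drop English stop words, stem what is left, pair each remaining token with its neighbour, sort the two stems inside each bigram alphabetically, and discard duplicate bigrams. The project's real implementation runs the text through its English analyzer (stop words plus a stemmer, as the comments above note); what follows is only a minimal sketch of that behaviour, with a toy stop-word list and no stemming, so it reproduces the structure the tests check rather than the exact stemmed tokens. The name compute_grams_sketch and the STOPWORDS set are illustrative, not taken from the source.

import re

# Tiny illustrative stop-word list; the real analyzer uses a full English
# stop-word list plus a stemmer, which is why the expected tokens in the
# tests above look stemmed and slightly odd.
STOPWORDS = {'i', 'me', 'him', 'her', 'the', 'is', 'a', 'an', 'and',
             'of', 'to', 'so', 'or', 'not', 'at', 'in'}


def compute_grams_sketch(text):
    """Return deduplicated, internally sorted bigrams for ``text``.

    Hypothetical stand-in for compute_grams: it skips stemming, so the
    output will not match the project's analyzer token for token, but it
    follows the same bigram rules the tests above exercise.
    """
    # Lowercase, drop punctuation, keep word-ish tokens (apostrophes kept).
    tokens = re.findall(r"[a-z0-9']+", text.lower())
    tokens = [tok for tok in tokens if tok not in STOPWORDS]

    grams = set()
    for first, second in zip(tokens, tokens[1:]):
        # Each bigram stores its two tokens in alphabetical order, e.g.
        # ('youtube', 'crash') becomes 'crash youtube'.
        grams.add(' '.join(sorted([first, second])))
    return list(grams)


print(sorted(compute_grams_sketch(u'youtube crash flash')))
# -> ['crash flash', 'crash youtube']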
Example #4
    def test_basic(self):
        test_data = [
            ('The quick brown fox', [u'brown quick', u'brown fox']),
            ('the latest update disables the New tab function', [
                u'disables new', u'function tab', u'new tab', u'latest update',
                u'disables update'
            ]),
            ('why is firefox so damn slow???? many tabs load slow or not at '
             'all!', [u'load tabs', u'load slow', u'slow tabs', u'damn slow']),
            ("I'm one of the guys that likes to try Firefox ahead of the "
             'herd... usually I use Nightly, but then a while back my '
             'favorite add-on, TabMixPlus stopped working because Firefox '
             "redid something in the code. \"No problem,\" says I to myself, "
             "I'll just use Aurora until they get it fixed.", [
                 u'add-on favorite', u'add-on tabmixplus', u'ahead herd',
                 u'ahead try', u'aurora fixed', u'aurora use', u'code problem',
                 u'code redid', u'favorite nightly', u"guys i'm",
                 u'guys likes', u'herd usually', u"i'll just", u"i'll myself",
                 u'just use', u'likes try', u'myself says', u'nightly use',
                 u'problem says', u'redid working', u'stopped tabmixplus',
                 u'stopped working', u'use usually'
             ]),
            ('Being partially sighted, I found the features with Windows XP '
             'and IE8 extremely usefu;. I need everything in Arial black bold '
             'text.', [
                 u'extremely usefu', u'features sighted', u'windows xp',
                 u'ie8 xp', u'black bold', u'partially sighted', u'need usefu',
                 u'features windows', u'arial need', u'arial black',
                 u'bold text', u'extremely ie8'
             ]),
        ]

        for text, expected in test_data:
            assert sorted(compute_grams(text)) == sorted(expected)
Example #5
    def extract_document(cls, obj_id, obj=None):
        if obj is None:
            obj = cls.get_model().objects.get(pk=obj_id)

        def empty_to_unknown(text):
            return u'Unknown' if text == u'' else text

        doc = {
            'id': obj.id,
            'prodchan': obj.prodchan,
            'happy': obj.happy,
            'url': obj.url,
            'description': obj.description,
            'user_agent': obj.user_agent,
            'product': obj.product,
            'channel': obj.channel,
            'version': obj.version,
            'browser': obj.browser,
            'browser_version': obj.browser_version,
            'platform': obj.platform,
            'locale': obj.locale,
            'device': obj.device,
            'manufacturer': obj.manufacturer,
            'created': obj.created,
        }

        # We only compute bigrams for English because the analysis
        # uses English stopwords, stemmers, ...
        if obj.locale.startswith(u'en') and obj.description:
            bigrams = compute_grams(obj.description)
            doc['description_bigrams'] = bigrams

        return doc
Example #6
    def extract_doc(cls, resp, with_id=True):
        """Converts a Response to a dict of values

        This can be used with ``ResponseDocType.from_obj()`` to create a
        ``ResponseDocType`` object or it can be used for indexing.

        :arg resp: a Response object
        :arg with_id: whether or not to include the ``_id`` value--include
            it when you're bulk indexing

        :returns: a dict

        """
        doc = {
            'id': resp.id,
            'happy': resp.happy,
            'api': resp.api,
            'url': resp.url,
            'url_domain': resp.url_domain,
            'has_email': bool(resp.user_email),
            'description': resp.description,
            'user_agent': resp.user_agent,
            'product': resp.product,
            'channel': resp.channel,
            'version': resp.version,
            'browser': resp.browser,
            'browser_version': resp.browser_version,
            'platform': resp.platform,
            'locale': resp.locale,
            'country': resp.country,
            'device': resp.device,
            'manufacturer': resp.manufacturer,
            'source': resp.source,
            'campaign': resp.campaign,
            'source_campaign': '::'.join([
                (resp.source or '--'),
                (resp.campaign or '--')
            ]),
            'organic': (not resp.campaign),
            'created': resp.created
        }

        # We only compute bigrams for English because the analysis
        # uses English stopwords, stemmers, ...
        if resp.locale.startswith(u'en') and resp.description:
            doc['description_bigrams'] = compute_grams(resp.description)
        else:
            doc['description_bigrams'] = []

        if with_id:
            doc['_id'] = doc['id']
        return doc
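
The with_id flag in the docstring above is there for bulk indexing: when it is set, the returned dict carries an _id key next to the document fields. Below is a minimal usage sketch under that reading; the import paths, the Elasticsearch client, and the 'feedback' index name are assumptions for illustration, not taken from the source.

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

# Hypothetical imports -- the real module paths are not shown in the source:
# from feedback.models import Response
# from feedback.index import ResponseDocType

client = Elasticsearch()

# Build one flat dict per Response; with_id=True adds the '_id' key.
actions = (
    ResponseDocType.extract_doc(resp, with_id=True)
    for resp in Response.objects.iterator()
)

# The bulk helper treats the '_id' key as action metadata and indexes the
# remaining keys as the document body.
bulk(client, actions, index='feedback')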
Example #7
    def extract_doc(cls, resp, with_id=True):
        """Converts a Response to a dict of values

        This can be used with ``ResponseDocType.from_obj()`` to create a
        ``ResponseDocType`` object or it can be used for indexing.

        :arg resp: a Response object
        :arg with_id: whether or not to include the ``_id`` value--include
            it when you're bulk indexing

        :returns: a dict

        """
        doc = {
            "id": resp.id,
            "happy": resp.happy,
            "api": resp.api,
            "url": resp.url,
            "url_domain": resp.url_domain,
            "has_email": bool(resp.user_email),
            "description": resp.description,
            "user_agent": resp.user_agent,
            "product": resp.product,
            "channel": resp.channel,
            "version": resp.version,
            "browser": resp.browser,
            "browser_version": resp.browser_version,
            "platform": resp.platform,
            "locale": resp.locale,
            "country": resp.country,
            "device": resp.device,
            "manufacturer": resp.manufacturer,
            "source": resp.source,
            "campaign": resp.campaign,
            "source_campaign": "::".join([(resp.source or "--"), (resp.campaign or "--")]),
            "organic": (not resp.campaign),
            "created": resp.created,
        }

        # We only compute bigrams for English because the analysis
        # uses English stopwords, stemmers, ...
        if resp.locale.startswith(u"en") and resp.description:
            doc["description_bigrams"] = compute_grams(resp.description)
        else:
            doc["description_bigrams"] = []

        if with_id:
            doc["_id"] = doc["id"]
        return doc
Example #8
    def extract_document(cls, obj_id, obj=None):
        if obj is None:
            obj = cls.get_model().objects.get(pk=obj_id)

        doc = {
            'id': obj.id,
            'prodchan': obj.prodchan,
            'happy': obj.happy,
            'api': obj.api,
            'url': obj.url,
            'url_domain': obj.url_domain,
            'has_email': bool(obj.user_email),
            'description': obj.description,
            'category': obj.category,
            'description_terms': obj.description,
            'user_agent': obj.user_agent,
            'product': obj.product,
            'channel': obj.channel,
            'version': obj.version,
            'browser': obj.browser,
            'browser_version': obj.browser_version,
            'platform': obj.platform,
            'locale': obj.locale,
            'country': obj.country,
            'device': obj.device,
            'manufacturer': obj.manufacturer,
            'source': obj.source,
            'campaign': obj.campaign,
            'source_campaign': '::'.join([
                (obj.source or '--'),
                (obj.campaign or '--')
            ]),
            'organic': (not obj.source and not obj.campaign),
            'created': obj.created,
        }

        # We only compute bigrams for English because the analysis
        # uses English stopwords, stemmers, ...
        if obj.locale.startswith(u'en') and obj.description:
            bigrams = compute_grams(obj.description)
            doc['description_bigrams'] = bigrams

        return doc
Example #9
    def extract_document(cls, obj_id, obj=None):
        if obj is None:
            obj = cls.get_model().objects.get(pk=obj_id)

        def empty_to_unknown(text):
            return u"Unknown" if text == u"" else text

        doc = {
            "id": obj.id,
            "prodchan": obj.prodchan,
            "happy": obj.happy,
            "url": obj.url,
            "url_domain": obj.url_domain,
            "has_email": bool(obj.user_email),
            "description": obj.description,
            "description_terms": obj.description,
            "user_agent": obj.user_agent,
            "product": obj.product,
            "channel": obj.channel,
            "version": obj.version,
            "browser": obj.browser,
            "browser_version": obj.browser_version,
            "platform": obj.platform,
            "locale": obj.locale,
            "country": obj.country,
            "device": obj.device,
            "manufacturer": obj.manufacturer,
            "created": obj.created,
        }

        # We only compute bigrams for English because the analysis
        # uses English stopwords, stemmers, ...
        if obj.locale.startswith(u"en") and obj.description:
            bigrams = compute_grams(obj.description)
            doc["description_bigrams"] = bigrams

        return doc
Example #10
    def test_basic(self):
        test_data = [
            ('The quick brown fox', [u'brown quick', u'brown fox']),

            ('the latest update disables the New tab function',
             [u'disables new', u'function tab', u'new tab', u'latest update',
              u'disables update']),

            ('why is firefox so damn slow???? many tabs load slow or not at '
             'all!',
             [u'load tabs', u'load slow', u'slow tabs', u'damn slow']),

            ("I'm one of the guys that likes to try Firefox ahead of the "
             'herd... usually I use Nightly, but then a while back my '
             'favorite add-on, TabMixPlus stopped working because Firefox '
             "redid something in the code. \"No problem,\" says I to myself, "
             "I'll just use Aurora until they get it fixed.",
             [u'add-on favorite', u'add-on tabmixplus', u'ahead herd',
              u'ahead try', u'aurora fixed', u'aurora use', u'code problem',
              u'code redid', u'favorite nightly', u"guys i'm", u'guys likes',
              u'herd usually', u"i'll just", u"i'll myself", u'just use',
              u'likes try', u'myself says', u'nightly use', u'problem says',
              u'redid working', u'stopped tabmixplus', u'stopped working',
              u'use usually']),

            ('Being partially sighted, I found the features with Windows XP '
             'and IE8 extremely usefu;. I need everything in Arial black bold '
             'text.',
             [u'extremely usefu', u'features sighted', u'windows xp',
              u'ie8 xp', u'black bold', u'partially sighted', u'need usefu',
              u'features windows', u'arial need', u'arial black', u'bold text',
              u'extremely ie8']),
        ]

        for text, expected in test_data:
            assert sorted(compute_grams(text)) == sorted(expected)
Example #11
    def extract_document(cls, obj_id, obj=None):
        if obj is None:
            obj = cls.get_model().objects.get(pk=obj_id)

        doc = {
            'id': obj.id,
            'happy': obj.happy,
            'api': obj.api,
            'url': obj.url,
            'url_domain': obj.url_domain,
            'has_email': bool(obj.user_email),
            'description': obj.description,
            'category': obj.category,
            'description_terms': obj.description,
            'user_agent': obj.user_agent,
            'product': obj.product,
            'channel': obj.channel,
            'version': obj.version,
            'browser': obj.browser,
            'browser_version': obj.browser_version,
            'platform': obj.platform,
            'locale': obj.locale,
            'country': obj.country,
            'device': obj.device,
            'manufacturer': obj.manufacturer,
            'source': obj.source,
            'campaign': obj.campaign,
            'source_campaign': '::'.join([
                (obj.source or '--'),
                (obj.campaign or '--')
            ]),
            'organic': (not obj.campaign),
            'created': obj.created,
        }

        # We only compute bigrams for English because the analysis
        # uses English stopwords, stemmers, ...
        if obj.locale.startswith(u'en') and obj.description:
            bigrams = compute_grams(obj.description)
            doc['description_bigrams'] = bigrams

        return doc
Example #12
    def test_empty(self):
        eq_(compute_grams(u''), [])
Example #13
    def extract_doc(cls, resp, with_id=True):
        """Converts a Response to a dict of values

        This can be used with ``ResponseDocType.from_obj()`` to create a
        ``ResponseDocType`` object or it can be used for indexing.

        :arg resp: a Response object
        :arg with_id: whether or not to include the ``_id`` value--include
            it when you're bulk indexing

        :returns: a dict

        """
        doc = {
            'id': resp.id,
            'happy': resp.happy,
            'api': resp.api,
            'url': resp.url,
            'url_domain': resp.url_domain,
            'has_email': bool(resp.user_email),
            'description': resp.description,
            'user_agent': resp.user_agent,
            'product': resp.product,
            'channel': resp.channel,
            'version': resp.version,
            'browser': resp.browser,
            'browser_version': resp.browser_version,
            'platform': resp.platform,
            'locale': resp.locale,
            'country': resp.country,
            'device': resp.device,
            'manufacturer': resp.manufacturer,
            'source': resp.source,
            'campaign': resp.campaign,
            'source_campaign': '::'.join([
                (resp.source or '--'),
                (resp.campaign or '--')
            ]),
            'organic': (not resp.campaign),
            'created': resp.created
        }

        # We only compute bigrams for English because the analysis
        # uses English stopwords, stemmers, ...
        if resp.locale.startswith(u'en') and resp.description:
            doc['description_bigrams'] = compute_grams(resp.description)
        else:
            doc['description_bigrams'] = []

        if with_id:
            doc['_id'] = doc['id']
        return doc
Example #14
    def test_empty(self):
        assert compute_grams(u'') == []