def test_bigrams(self):
    """Bigram extraction at increasing word counts.

    NOTE: Tokens look odd after analysis (e.g. "youtub") because of
    the stemmer. Undoing the excessive stemming would be an exercise
    in futility, so the expected values reflect the stemmed forms.
    """
    # (input text, expected sorted bigrams)
    cases = [
        # One word a bigram does not make
        (u'youtube', []),
        # Two words is the minimum number to create a bigram
        (u'youtube crash', ['crash youtube']),
        # Three words creates two bigrams
        (u'youtube crash flash', ['crash flash', 'crash youtube']),
        # Four words creates three bigrams
        (u'youtube crash flash bridge',
         ['bridge flash', 'crash flash', 'crash youtube']),
        # Nix duplicate bigrams
        (u'youtube crash youtube flash',
         ['crash youtube', 'flash youtube']),
    ]
    for text, expected in cases:
        assert sorted(compute_grams(text)) == expected
def test_parsing(self):
    """Input is normalized before bigram computation.

    Uses plain ``assert`` instead of nose's ``eq_`` for consistency
    with the other tests in this file and because nose is
    unmaintained.
    """
    # stop words are removed
    assert compute_grams(u'i me him her') == []
    # capital letters don't matter
    assert compute_grams(u'I ME HIM HER') == []
    # punctuation nixed
    assert compute_grams(u'i, me, him, her') == []
def test_parsing(self):
    """Input is normalized before bigram computation.

    Converted from nose's ``eq_`` to plain ``assert`` for consistency
    with the assert-based sibling tests; nose is unmaintained.
    """
    # stop words are removed
    assert compute_grams(u'i me him her') == []
    # capital letters don't matter
    assert compute_grams(u'I ME HIM HER') == []
    # punctuation nixed
    assert compute_grams(u'i, me, him, her') == []
def test_parsing(self):
    """compute_grams strips stop words, case, and punctuation."""
    # Every variant below reduces to nothing but stop words.
    for text in (u'i me him her',       # stop words are removed
                 u'I ME HIM HER',       # capital letters don't matter
                 u'i, me, him, her'):   # punctuation nixed
        assert compute_grams(text) == []
def test_basic(self):
    """Spot-check bigram output on realistic feedback strings."""
    # Each sample: (input text, expected bigrams in any order).
    samples = (
        ('The quick brown fox',
         [u'brown quick', u'brown fox']),
        ('the latest update disables the New tab function',
         [u'disables new', u'function tab', u'new tab',
          u'latest update', u'disables update']),
        ('why is firefox so damn slow???? many tabs load slow or not at '
         'all!',
         [u'load tabs', u'load slow', u'slow tabs', u'damn slow']),
        ("I'm one of the guys that likes to try Firefox ahead of the "
         'herd... usually I use Nightly, but then a while back my '
         'favorite add-on, TabMixPlus stopped working because Firefox '
         "redid something in the code. \"No problem,\" says I to myself, "
         "I'll just use Aurora until they get it fixed.",
         [u'add-on favorite', u'add-on tabmixplus', u'ahead herd',
          u'ahead try', u'aurora fixed', u'aurora use', u'code problem',
          u'code redid', u'favorite nightly', u"guys i'm", u'guys likes',
          u'herd usually', u"i'll just", u"i'll myself", u'just use',
          u'likes try', u'myself says', u'nightly use', u'problem says',
          u'redid working', u'stopped tabmixplus', u'stopped working',
          u'use usually']),
        ('Being partially sighted, I found the features with Windows XP '
         'and IE8 extremely usefu;. I need everything in Arial black bold '
         'text.',
         [u'extremely usefu', u'features sighted', u'windows xp',
          u'ie8 xp', u'black bold', u'partially sighted', u'need usefu',
          u'features windows', u'arial need', u'arial black',
          u'bold text', u'extremely ie8']),
    )
    for input_text, expected_bigrams in samples:
        assert sorted(compute_grams(input_text)) == sorted(expected_bigrams)
def extract_document(cls, obj_id, obj=None):
    """Convert a model instance into a dict of values for indexing.

    Fix: the nested helper ``empty_to_unknown`` was defined but never
    called anywhere in this function, so the dead code is removed.

    :arg obj_id: primary key used to fetch the object when ``obj`` is
        not supplied
    :arg obj: optional pre-fetched model instance

    :returns: a dict of field name -> value

    """
    if obj is None:
        obj = cls.get_model().objects.get(pk=obj_id)

    doc = {
        'id': obj.id,
        'prodchan': obj.prodchan,
        'happy': obj.happy,
        'url': obj.url,
        'description': obj.description,
        'user_agent': obj.user_agent,
        'product': obj.product,
        'channel': obj.channel,
        'version': obj.version,
        'browser': obj.browser,
        'browser_version': obj.browser_version,
        'platform': obj.platform,
        'locale': obj.locale,
        'device': obj.device,
        'manufacturer': obj.manufacturer,
        'created': obj.created,
    }

    # We only compute bigrams for english because the analysis
    # uses English stopwords, stemmers, ...
    if obj.locale.startswith(u'en') and obj.description:
        bigrams = compute_grams(obj.description)
        doc['description_bigrams'] = bigrams

    return doc
def extract_doc(cls, resp, with_id=True):
    """Convert a Response into a dict of indexable values.

    The dict can be passed to ``ResponseDocType.from_obj()`` to create
    a ``ResponseDocType``, or used directly for indexing.

    :arg resp: a Response object
    :arg with_id: whether or not to also set ``_id``--include it when
        you're bulk indexing

    :returns: a dict

    """
    doc = {}

    # Straight attribute copies, in index order.
    for name in ('id', 'happy', 'api', 'url', 'url_domain'):
        doc[name] = getattr(resp, name)

    doc['has_email'] = bool(resp.user_email)
    doc['description'] = resp.description

    for name in ('user_agent', 'product', 'channel', 'version',
                 'browser', 'browser_version', 'platform', 'locale',
                 'country', 'device', 'manufacturer', 'source',
                 'campaign'):
        doc[name] = getattr(resp, name)

    # '--' fills in for a missing half so the joined value always has
    # two parts.
    doc['source_campaign'] = '::'.join([
        (resp.source or '--'),
        (resp.campaign or '--')
    ])
    doc['organic'] = not resp.campaign
    doc['created'] = resp.created

    # The bigram analysis uses English stopwords and stemmers, so
    # non-English descriptions get an empty list.
    if resp.locale.startswith(u'en') and resp.description:
        doc['description_bigrams'] = compute_grams(resp.description)
    else:
        doc['description_bigrams'] = []

    if with_id:
        doc['_id'] = doc['id']
    return doc
def extract_doc(cls, resp, with_id=True):
    """Flatten a Response into a dict ready for indexing.

    Use with ``ResponseDocType.from_obj()`` to build a
    ``ResponseDocType``, or feed the dict straight to the indexer.

    :arg resp: a Response object
    :arg with_id: include the ``_id`` key--needed when bulk indexing

    :returns: a dict

    """
    document = {
        "id": resp.id,
        "happy": resp.happy,
        "api": resp.api,
        "url": resp.url,
        "url_domain": resp.url_domain,
        "has_email": bool(resp.user_email),
        "description": resp.description,
        "user_agent": resp.user_agent,
        "product": resp.product,
        "channel": resp.channel,
        "version": resp.version,
        "browser": resp.browser,
        "browser_version": resp.browser_version,
        "platform": resp.platform,
        "locale": resp.locale,
        "country": resp.country,
        "device": resp.device,
        "manufacturer": resp.manufacturer,
        "source": resp.source,
        "campaign": resp.campaign,
        # "--" placeholders keep the joined value two-part even when
        # either half is missing.
        "source_campaign": "%s::%s" % (resp.source or "--", resp.campaign or "--"),
        "organic": not resp.campaign,
        "created": resp.created,
    }

    # The bigram analysis uses English stopwords and stemmers, so only
    # English descriptions get real bigrams.
    english = resp.locale.startswith(u"en")
    document["description_bigrams"] = (
        compute_grams(resp.description) if english and resp.description else []
    )

    if with_id:
        document["_id"] = document["id"]
    return document
def extract_document(cls, obj_id, obj=None):
    """Build the indexable dict for a single model instance.

    :arg obj_id: primary key used to look the object up when ``obj``
        isn't given
    :arg obj: an optional pre-fetched model instance

    :returns: a dict of field name -> value

    """
    if obj is None:
        obj = cls.get_model().objects.get(pk=obj_id)

    doc = {
        'id': obj.id,
        'prodchan': obj.prodchan,
        'happy': obj.happy,
        'api': obj.api,
        'url': obj.url,
        'url_domain': obj.url_domain,
        'has_email': bool(obj.user_email),
        'description': obj.description,
        'category': obj.category,
        # description goes in a second time to be analyzed as terms
        'description_terms': obj.description,
        'user_agent': obj.user_agent,
        'product': obj.product,
        'channel': obj.channel,
        'version': obj.version,
        'browser': obj.browser,
        'browser_version': obj.browser_version,
        'platform': obj.platform,
        'locale': obj.locale,
        'country': obj.country,
        'device': obj.device,
        'manufacturer': obj.manufacturer,
        'source': obj.source,
        'campaign': obj.campaign,
        # '--' stands in for a missing source/campaign half
        'source_campaign': (obj.source or '--') + '::' + (obj.campaign or '--'),
        # organic feedback arrived with neither a source nor a campaign
        'organic': not (obj.source or obj.campaign),
        'created': obj.created,
    }

    # We only compute bigrams for english because the analysis
    # uses English stopwords, stemmers, ...
    if obj.locale.startswith(u'en') and obj.description:
        doc['description_bigrams'] = compute_grams(obj.description)

    return doc
def extract_document(cls, obj_id, obj=None):
    """Convert a model instance into a dict of values for indexing.

    Fix: the nested helper ``empty_to_unknown`` was defined but never
    called anywhere in this function, so the dead code is removed.

    :arg obj_id: primary key used to fetch the object when ``obj`` is
        not supplied
    :arg obj: optional pre-fetched model instance

    :returns: a dict of field name -> value

    """
    if obj is None:
        obj = cls.get_model().objects.get(pk=obj_id)

    doc = {
        "id": obj.id,
        "prodchan": obj.prodchan,
        "happy": obj.happy,
        "url": obj.url,
        "url_domain": obj.url_domain,
        "has_email": bool(obj.user_email),
        "description": obj.description,
        # description goes in a second time to be analyzed as terms
        "description_terms": obj.description,
        "user_agent": obj.user_agent,
        "product": obj.product,
        "channel": obj.channel,
        "version": obj.version,
        "browser": obj.browser,
        "browser_version": obj.browser_version,
        "platform": obj.platform,
        "locale": obj.locale,
        "country": obj.country,
        "device": obj.device,
        "manufacturer": obj.manufacturer,
        "created": obj.created,
    }

    # We only compute bigrams for english because the analysis
    # uses English stopwords, stemmers, ...
    if obj.locale.startswith(u"en") and obj.description:
        bigrams = compute_grams(obj.description)
        doc["description_bigrams"] = bigrams

    return doc
def test_basic(self):
    """compute_grams produces the expected bigrams on real-world text."""
    # Each case pairs input text with its expected bigrams (order
    # doesn't matter; both sides are sorted before comparison).
    cases = [
        ('The quick brown fox',
         [u'brown quick', u'brown fox']),
        ('the latest update disables the New tab function',
         [u'disables new', u'function tab', u'new tab',
          u'latest update', u'disables update']),
        ('why is firefox so damn slow???? many tabs load slow or not at '
         'all!',
         [u'load tabs', u'load slow', u'slow tabs', u'damn slow']),
        ("I'm one of the guys that likes to try Firefox ahead of the "
         'herd... usually I use Nightly, but then a while back my '
         'favorite add-on, TabMixPlus stopped working because Firefox '
         "redid something in the code. \"No problem,\" says I to myself, "
         "I'll just use Aurora until they get it fixed.",
         [u'add-on favorite', u'add-on tabmixplus', u'ahead herd',
          u'ahead try', u'aurora fixed', u'aurora use', u'code problem',
          u'code redid', u'favorite nightly', u"guys i'm", u'guys likes',
          u'herd usually', u"i'll just", u"i'll myself", u'just use',
          u'likes try', u'myself says', u'nightly use', u'problem says',
          u'redid working', u'stopped tabmixplus', u'stopped working',
          u'use usually']),
        ('Being partially sighted, I found the features with Windows XP '
         'and IE8 extremely usefu;. I need everything in Arial black bold '
         'text.',
         [u'extremely usefu', u'features sighted', u'windows xp',
          u'ie8 xp', u'black bold', u'partially sighted', u'need usefu',
          u'features windows', u'arial need', u'arial black',
          u'bold text', u'extremely ie8']),
    ]
    for text, expected in cases:
        assert sorted(compute_grams(text)) == sorted(expected)
def extract_document(cls, obj_id, obj=None):
    """Turn a model instance into a dict of values for indexing.

    :arg obj_id: primary key, used to fetch the object if ``obj`` is
        None
    :arg obj: optionally, the already-loaded model instance

    :returns: a dict

    """
    if obj is None:
        obj = cls.get_model().objects.get(pk=obj_id)

    # '--' stands in for a missing source/campaign half.
    source_campaign = '::'.join([
        (obj.source or '--'),
        (obj.campaign or '--'),
    ])

    doc = {
        'id': obj.id,
        'happy': obj.happy,
        'api': obj.api,
        'url': obj.url,
        'url_domain': obj.url_domain,
        'has_email': bool(obj.user_email),
        'description': obj.description,
        'category': obj.category,
        # description goes in a second time to be analyzed as terms
        'description_terms': obj.description,
        'user_agent': obj.user_agent,
        'product': obj.product,
        'channel': obj.channel,
        'version': obj.version,
        'browser': obj.browser,
        'browser_version': obj.browser_version,
        'platform': obj.platform,
        'locale': obj.locale,
        'country': obj.country,
        'device': obj.device,
        'manufacturer': obj.manufacturer,
        'source': obj.source,
        'campaign': obj.campaign,
        'source_campaign': source_campaign,
        'organic': not obj.campaign,
        'created': obj.created,
    }

    # Bigram analysis assumes English stopwords/stemming, so it's
    # limited to English descriptions.
    if obj.locale.startswith(u'en') and obj.description:
        doc['description_bigrams'] = compute_grams(obj.description)

    return doc
def test_empty(self):
    """Empty input yields no bigrams.

    Converted from nose's ``eq_`` to a plain ``assert`` for
    consistency with the assert-based sibling tests; nose is
    unmaintained.
    """
    assert compute_grams(u'') == []
def extract_doc(cls, resp, with_id=True):
    """Serialize a Response into a plain dict for the index.

    The result works with ``ResponseDocType.from_obj()`` or can be
    indexed as-is.

    :arg resp: a Response object
    :arg with_id: whether or not to mirror ``id`` into ``_id``--do
        this when you're bulk indexing

    :returns: a dict

    """
    fields = [
        ('id', resp.id),
        ('happy', resp.happy),
        ('api', resp.api),
        ('url', resp.url),
        ('url_domain', resp.url_domain),
        ('has_email', bool(resp.user_email)),
        ('description', resp.description),
        ('user_agent', resp.user_agent),
        ('product', resp.product),
        ('channel', resp.channel),
        ('version', resp.version),
        ('browser', resp.browser),
        ('browser_version', resp.browser_version),
        ('platform', resp.platform),
        ('locale', resp.locale),
        ('country', resp.country),
        ('device', resp.device),
        ('manufacturer', resp.manufacturer),
        ('source', resp.source),
        ('campaign', resp.campaign),
        # '--' placeholders keep the joined value two-part even when
        # either half is missing.
        ('source_campaign',
         '::'.join([(resp.source or '--'), (resp.campaign or '--')])),
        ('organic', not resp.campaign),
        ('created', resp.created),
    ]
    doc = dict(fields)

    # Only English text gets bigrams; the analysis depends on English
    # stopwords and stemming.
    if resp.locale.startswith(u'en') and resp.description:
        doc['description_bigrams'] = compute_grams(resp.description)
    else:
        doc['description_bigrams'] = []

    if with_id:
        doc['_id'] = doc['id']
    return doc
def test_empty(self):
    """Empty input yields no bigrams.

    Uses a plain ``assert`` instead of nose's ``eq_`` for consistency
    with the other assert-based tests; nose is unmaintained.
    """
    assert compute_grams(u'') == []
def test_empty(self):
    """No text in, no bigrams out."""
    result = compute_grams(u'')
    assert result == []