def test__get_keywords_returns_only_nouns(self):
    # Feeds a prose sentence (with an e-mail-like token and a URL) through
    # get_keywords and checks the exact, ordered result: single words first,
    # then adjacent pairs, then adjacent triples.
    # NOTE(review): "[email protected]" looks like an e-mail obfuscation
    # placeholder rather than a real address -- confirm the intended input.
    text = (
        "I'm going to use addnow product. It provides great widgets to track social (and 'dark social') "
        "sharing of content. Thanks to all addnow team! You can contact me via email "
        "[email protected] and site http://mysite.com"
    )
    single_words = ['addnow', 'product', 'widgets', 'content', 'addnow', 'team', 'email']
    word_pairs = [
        'addnow product',
        'product widgets',
        'widgets content',
        'content addnow',
        'addnow team',
        'team email',
    ]
    word_triples = [
        'addnow product widgets',
        'product widgets content',
        'widgets content addnow',
        'content addnow team',
        'addnow team email',
    ]

    extracted = get_keywords(text, ALLOWED_KEYWORD_TAGS)

    self.assertEqual(extracted, single_words + word_pairs + word_triples)
def test__get_keywords_with_unicode_punct(self):
    # The input uses non-ASCII punctuation (curly quotes, low-9 quote, em dash,
    # modifier apostrophe, ellipsis). The expected result is the exact ordered
    # list: single words, then adjacent pairs, then adjacent triples.
    text = (
        'Schindler’s list is an American “epic” historical period drama film‚ directed and co—produced '
        'by Steven Spielberg and scripted by Steven Zaillian. It is based on the novel Schindlerʼs '
        'Ark by Thomas Keneally‚ an Australian novelist…'
    )
    single_words = [
        'schindler', 'list', 'period', 'drama', 'film', 'steven', 'spielberg',
        'steven', 'zaillian', 'schindler', 'ark', 'thomas', 'keneally', 'novelist',
    ]
    word_pairs = [
        'schindler list',
        'list period',
        'period drama',
        'drama film',
        'film steven',
        'steven spielberg',
        'spielberg steven',
        'steven zaillian',
        'zaillian schindler',
        'schindler ark',
        'ark thomas',
        'thomas keneally',
        'keneally novelist',
    ]
    word_triples = [
        'schindler list period',
        'list period drama',
        'period drama film',
        'drama film steven',
        'film steven spielberg',
        'steven spielberg steven',
        'spielberg steven zaillian',
        'steven zaillian schindler',
        'zaillian schindler ark',
        'schindler ark thomas',
        'ark thomas keneally',
        'thomas keneally novelist',
    ]

    extracted = get_keywords(text, ALLOWED_KEYWORD_TAGS)

    self.assertEqual(extracted, single_words + word_pairs + word_triples)
def test__get_keywords_with_disallowed_strings(self):
    # Hash-like tokens, underscore-joined identifiers and a run of
    # underscores are expected to produce no keywords at all.
    text = 'ixzz3kxlsye8l google_api_doesnt_work notfoundhttpexception ______'
    expected = []

    self.assertEqual(get_keywords(text, ALLOWED_KEYWORD_TAGS), expected)
def test__get_keywords_with_digits_and_dates(self):
    # Plain numbers, number/unit mixes, dates and times are expected to
    # produce an empty keyword list.
    text = '658 250,000 1.9m 8pm 2015/10/12 13:35:34 2015-10-12 13:35:34'
    expected = []

    self.assertEqual(get_keywords(text, ALLOWED_KEYWORD_TAGS), expected)
def test__get_keywords_with_domain_names(self):
    # Bare domain names are expected to produce an empty keyword list.
    text = 'freepokertourneys.com buyairplanetickets.com thumbtack.net'
    expected = []

    self.assertEqual(get_keywords(text, ALLOWED_KEYWORD_TAGS), expected)
def test__get_keywords_with_urls(self):
    # Host names, URLs with query strings and a mailto: token are expected
    # to produce an empty keyword list.
    # NOTE(review): "[email protected]" looks like an e-mail obfuscation
    # placeholder rather than a real address -- confirm the intended input.
    text = (
        'www.google.com www.google.com/search?client=ubuntu https://www.google.com/search?client=ubuntu '
        'mailto:[email protected]'
    )
    expected = []

    self.assertEqual(get_keywords(text, ALLOWED_KEYWORD_TAGS), expected)
def test__get_keywords_with_stop_words(self):
    # A sentence made of common filler words is expected to produce an
    # empty keyword list.
    text = "so i'm sorry that i tried to go this way"
    expected = []

    self.assertEqual(get_keywords(text, ALLOWED_KEYWORD_TAGS), expected)
def test__get_keywords_with_small_length_of_words(self):
    # Single punctuation symbols and two-letter words are expected to
    # produce an empty keyword list.
    text = '$, %, (, ), ^, >, <, *, #, @, go, to, am, so, no, ok'
    expected = []

    self.assertEqual(get_keywords(text, ALLOWED_KEYWORD_TAGS), expected)
def test__get_keywords_with_non_alphanumeric_characters(self):
    # Tokens glued together with '/', '|', '\' or containing '*' are expected
    # to produce an empty keyword list.
    # The backslash is written as '\\' because '\w' is not a valid escape
    # sequence: newer Python versions warn about it. The runtime string is
    # unchanged (a single backslash followed by 'win').
    sentence = 'radius/ulna news|pictures|filmography|community vista\\win b***ches'
    keywords = get_keywords(sentence, ALLOWED_KEYWORD_TAGS)
    self.assertEqual(keywords, [])
def _clean_event_data(site_id, date, query, headers=None, real_ip=None):
    """Validate raw event parameters and build the cleaned event record.

    Args:
        site_id: Site identifier; used to fetch the site settings through
            ``SiteSettingsCache.get_config`` and stored as ``int(site_id)``.
        date: Event datetime used when ``query`` carries no ``e_v`` timestamp.
        query: Mapping of event parameters. Keys read: ``e_c`` (event),
            ``e_a`` (source), ``e_t`` (tool), ``e_u`` (uuid), ``e_n`` (url),
            ``e_i`` (title), ``e_v`` (timestamp), ``e_k`` (keywords string).
        headers: Optional mapping of request headers; ``HTTP_USER_AGENT`` and
            ``HTTP_REFERER`` are read from it.
        real_ip: Optional client IP, passed to ``get_geoip_data``.

    Returns:
        A tuple ``(event, is_first, cleaned_data)`` where ``event`` is the
        value from ``constants.Events``, ``is_first`` is the result of
        ``_create_unique_user`` (``True`` when no uuid was sent) and
        ``cleaned_data`` is a dict with the keys ``site``, ``date``,
        ``source``, ``tool``, ``url``, ``title``, ``browser``, ``domain``,
        ``search_engine``, ``search_term``, ``country`` and
        ``copied_keywords``.

    Raises:
        IncorrectEventException: If the event, timestamp, source or tool
            value is not acceptable.
    """
    # Defaults used when the corresponding input is absent.
    is_first = True
    domain = None
    search_engine = None
    search_term = None
    country = constants.UNKNOWN_COUNTRY
    copied_keywords = []

    event = query.get('e_c')
    source = query.get('e_a')
    tool = query.get('e_t')
    uuid = query.get('e_u')
    url = query.get('e_n')
    title = query.get('e_i')
    timestamp = query.get('e_v')
    keywords_string = query.get('e_k')

    headers = headers or {}
    site_config = SiteSettingsCache.get_config(site_id)

    try:
        event = constants.Events(event).value
    except ValueError:
        raise IncorrectEventException('Incorrect event value: %s' % event)

    if timestamp:
        try:
            date = datetime.utcfromtimestamp(int(timestamp))
        except (ValueError, TypeError, OverflowError, OSError):
            # utcfromtimestamp raises OverflowError/OSError for values outside
            # the platform's supported range; report those the same way as
            # any other malformed timestamp instead of letting them escape.
            raise IncorrectEventException('Unexpected timestamp format: %s' % timestamp)

    # NOTE(review): the site offset (in minutes) is applied whether or not a
    # timestamp was supplied -- confirm against the original layout.
    date = date + timedelta(minutes=site_config['offset'])

    if source and source not in constants.SOURCE_SET:
        raise IncorrectEventException('Incorrect source value: %s' % source)

    if tool:
        try:
            tool = constants.Tools(tool).value
        except ValueError:
            raise IncorrectEventException('Incorrect tool value: %s' % tool)

    if uuid:
        # Called with the offset-adjusted date computed above.
        is_first = _create_unique_user(uuid, date)

    user_agent = parse(headers.get('HTTP_USER_AGENT', ''))
    browser = user_agent.browser.family

    referrer = headers.get('HTTP_REFERER')
    if referrer:
        r = Referer(referrer)
        domain = r.uri.netloc
        search_engine = r.referer
        search_term = r.search_term

    if real_ip:
        geoip_data = get_geoip_data(real_ip)
        # Keep the UNKNOWN_COUNTRY default unless a non-empty code came back.
        if geoip_data and geoip_data['country_code']:
            country = geoip_data['country_code']

    if keywords_string:
        copied_keywords = get_keywords(keywords_string, ALLOWED_KEYWORD_TAGS)

    cleaned_data = dict(
        site=int(site_id),
        date=date,
        source=source,
        tool=tool,
        url=url,
        title=title,
        browser=browser,
        domain=domain,
        search_engine=search_engine,
        search_term=search_term,
        country=country,
        copied_keywords=copied_keywords,
    )
    return event, is_first, cleaned_data