def filtered_tokens(self):
    # using lemmatized words
    from msgvis.apps.base.utils import get_stoplist
    tokens = map(lambda x: x.tweet_word.text, self.tweetword_connections.all())
    stop_words = set(get_stoplist()+['ive', 'wasnt', 'didnt', 'dont'])
    tokens = filter(lambda x: x not in stop_words, tokens)
    tokens = filter(lambda x: (len(x) > 2) and not (x.startswith('http') and len(x) > 4), tokens)
    return tokens
def filtered_tokens(self):
    # using lemmatized words
    from msgvis.apps.base.utils import get_stoplist
    tokens = map(lambda x: x.tweet_word.text, self.tweetword_connections.all())
    stop_words = set(get_stoplist() + ['ive', 'wasnt', 'didnt', 'dont'])
    tokens = filter(lambda x: x not in stop_words, tokens)
    tokens = filter(
        lambda x: (len(x) > 2) and not (x.startswith('http') and len(x) > 4),
        tokens)
    return tokens
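To make the filtering rules above concrete, here is a small self-contained sketch that applies the same stop-word, minimum-length, and URL checks to a plain list of tokens. The stop list and sample tokens are invented for illustration, and the project's get_stoplist() is replaced with a hard-coded set.

# Self-contained sketch of the same filtering rules; the stop list and the
# sample tokens are invented for illustration (get_stoplist() is not used here).
stop_words = {'the', 'and', 'ive', 'wasnt', 'didnt', 'dont'}
tokens = ['the', 'storm', 'ive', 'http://t.co/abc', 'ok', 'flooding']

# Drop stop words, then drop very short tokens and URL-like tokens.
tokens = [t for t in tokens if t not in stop_words]
tokens = [t for t in tokens
          if len(t) > 2 and not (t.startswith('http') and len(t) > 4)]

print(tokens)  # ['storm', 'flooding']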
def default_feature_context(name, dataset_id):
    dataset = Dataset.objects.get(pk=dataset_id)
    queryset = dataset.message_set.all()#filter(language__code='en')
    filters = [
        set(get_stoplist()),
        ['ive', 'wasnt', 'didnt', 'dont'],
        LambdaWordFilter(lambda word: word == 'rt' or len(word) <= 2),
        LambdaWordFilter(lambda word: word.startswith('http') and len(word) > 4)
    ]
    return FeatureContext(name=name,
                          queryset=queryset,
                          tokenizer=TweetParserTokenizer,
                          lemmatizer=None,#WordNetLemmatizer(),
                          filters=filters,
                          minimum_frequency=4)
def default_feature_context(name, dataset_id):
    dataset = Dataset.objects.get(pk=dataset_id)
    queryset = dataset.message_set.all() #filter(language__code='en')
    filters = [
        set(get_stoplist()),
        ['ive', 'wasnt', 'didnt', 'dont'],
        LambdaWordFilter(lambda word: word == 'rt' or len(word) <= 2),
        LambdaWordFilter(
            lambda word: word.startswith('http') and len(word) > 4)
    ]
    return FeatureContext(
        name=name,
        queryset=queryset,
        tokenizer=TweetParserTokenizer,
        lemmatizer=None, #WordNetLemmatizer(),
        filters=filters,
        minimum_frequency=4)
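A hypothetical call site for the factory above is sketched below; the dataset primary key and the variable name are assumptions made for illustration and do not appear in the original code.

# Hypothetical call site; dataset 1 is assumed to exist for illustration only.
context = default_feature_context(name='default', dataset_id=1)
# The returned FeatureContext bundles the message queryset, tokenizer,
# word filters, and minimum frequency configured above.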