Example #1
    def filtered_tokens(self):
        # Collect the lemmatized text of every word linked to this message.
        from msgvis.apps.base.utils import get_stoplist
        tokens = [conn.tweet_word.text for conn in self.tweetword_connections.all()]

        # Drop stopwords, plus a few apostrophe-stripped contractions.
        stop_words = set(get_stoplist() + ['ive', 'wasnt', 'didnt', 'dont'])
        tokens = [t for t in tokens if t not in stop_words]

        # Keep tokens longer than two characters, excluding URL fragments.
        tokens = [t for t in tokens
                  if len(t) > 2 and not (t.startswith('http') and len(t) > 4)]
        return tokens
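For reference, the same filtering pipeline can be exercised outside the Django model with a plain list of tokens. The sketch below is a minimal standalone version; the filter_tokens name and the hard-coded stoplist are illustrative assumptions, not part of the original project.

def filter_tokens(tokens, stoplist):
    # Standalone sketch of the pipeline above: drop stopwords,
    # short tokens, and URL fragments.
    stop_words = set(stoplist) | {'ive', 'wasnt', 'didnt', 'dont'}
    kept = [t for t in tokens if t not in stop_words]
    return [t for t in kept
            if len(t) > 2 and not (t.startswith('http') and len(t) > 4)]

# Illustrative usage (made-up input):
# filter_tokens(['rt', 'the', 'earthquake', 'https://t.co/x', 'dont'],
#               ['the', 'a', 'an'])
# -> ['earthquake']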
Example #2
def default_feature_context(name, dataset_id):
    dataset = Dataset.objects.get(pk=dataset_id)
    # Optionally restrict to English: .filter(language__code='en')
    queryset = dataset.message_set.all()

    filters = [
        set(get_stoplist()),                # standard stopword list
        ['ive', 'wasnt', 'didnt', 'dont'],  # apostrophe-stripped contractions
        LambdaWordFilter(lambda word: word == 'rt' or len(word) <= 2),
        LambdaWordFilter(lambda word: word.startswith('http') and len(word) > 4),
    ]

    return FeatureContext(name=name,
                          queryset=queryset,
                          tokenizer=TweetParserTokenizer,
                          lemmatizer=None,  # e.g. WordNetLemmatizer()
                          filters=filters,
                          minimum_frequency=4)
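LambdaWordFilter wraps a predicate so it can sit in the filters list next to plain stopword collections; its implementation is not shown in these examples. Below is a minimal sketch of what such a wrapper and its application might look like, under that assumption (the is_filtered method and the apply_filters helper are hypothetical, not the project's actual API).

class LambdaWordFilter(object):
    # Hypothetical sketch: wraps a predicate so a rule-based filter can be
    # applied uniformly alongside plain stopword collections.
    def __init__(self, predicate):
        self.predicate = predicate

    def is_filtered(self, word):
        # True means the word should be removed from the token stream.
        return self.predicate(word)

def apply_filters(tokens, filters):
    # Drop any token matched by a membership filter (set or list)
    # or by a predicate-based LambdaWordFilter.
    def is_dropped(word):
        for f in filters:
            if isinstance(f, LambdaWordFilter):
                if f.is_filtered(word):
                    return True
            elif word in f:
                return True
        return False
    return [t for t in tokens if not is_dropped(t)]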