Esempio n. 1
0
    def from_dataframe(cls, review_df, cutoff=25):
        """Build a vectorizer from the review dataframe.

        Args:
            review_df (pandas.DataFrame): the review dataset
            cutoff (int): frequency threshold; only words seen more than
                this many times enter the review vocabulary
        Returns:
            an instance of the ReviewVectorizer
        """
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)

        # Register every distinct rating; sorted for a deterministic order.
        for label in sorted(set(review_df.rating)):
            rating_vocab.add_token(label)

        # Tally every non-punctuation token across all reviews.
        token_counts = Counter(
            token
            for review in review_df.review
            for token in review.split(" ")
            if token not in string.punctuation
        )

        # Admit only sufficiently frequent tokens into the vocabulary.
        for token, freq in token_counts.items():
            if freq > cutoff:
                review_vocab.add_token(token)

        return cls(review_vocab, rating_vocab)
    def from_dataframe(cls, news_df):
        """Build a vectorizer from the news dataframe.

        Args:
            news_df (pandas.DataFrame): the target dataset
        Returns:
            an instance of the NREVectorizer
        """
        # One vocabulary entry per distinct relation label.
        relation_vocab = Vocabulary()
        for label in set(news_df.relation):
            relation_vocab.add_token(label)

        # Segment each sequence with jieba (precise mode) and add the tokens.
        seq_vocab = SequenceVocabulary()
        for text in news_df.sequence:
            seq_vocab.add_many(list(jieba.cut(text, cut_all=False)))

        return cls(seq_vocab, relation_vocab)
Esempio n. 3
0
    def from_dataframe(cls, review_df, cutoff=25):
        """Build a vectorizer from the review dataframe.

        Args:
            review_df (pandas.DataFrame): the review dataset
            cutoff (int): words occurring more than this many times are kept
        Returns:
            an instance of the ReviewVectorizer
        """
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)

        # Ratings form a small closed set; add them in sorted order.
        for label in sorted(set(review_df.rating)):
            rating_vocab.add_token(label)

        # Tally word frequencies, skipping bare punctuation tokens.
        counts = Counter()
        for text in review_df.review:
            counts.update(
                w for w in text.split(" ") if w not in string.punctuation
            )

        # Only sufficiently frequent words enter the review vocabulary.
        for word, freq in counts.items():
            if freq > cutoff:
                review_vocab.add_token(word)

        return cls(review_vocab, rating_vocab)
Esempio n. 4
0
    def from_dataframe(cls, dataset_df, cutoff=c_frequencyCutoff):
        """
        Instantiate the Vectorizer from the dataset dataframe.

        Args:
            dataset_df (pandas.DataFrame): the tweets dataset
            cutoff (int): the parameter for frequency-based filtering
        Returns:
            an instance of the TwitterVectorizer
        """
        # Vocabulary for the text column (the hook lets subclasses pick
        # the concrete vocabulary flavour).
        text_vocabulary = cls._get_text_vocabulary()

        # Vocabulary for the target column: closed label set, no unknown token.
        target_vocabulary = Vocabulary(add_unknown_token=False)
        for label in sorted(set(dataset_df.target)):
            target_vocabulary.add_token(label)

        # Tokenize every tweet and tally word frequencies.
        tokenizer = TweetTokenizer()
        word_counts = Counter()
        for text in dataset_df.text:
            word_counts.update(tokenizer.tokenize(text))

        # Keep words that are not punctuation and occur more than `cutoff`
        # times.
        for word, count in word_counts.items():
            if (word not in string.punctuation) and (count > cutoff):
                text_vocabulary.add_token(word)

        return cls(text_vocabulary, target_vocabulary)
Esempio n. 5
0
    def from_dataframe(cls,
                       predictor_df,
                       classifier,
                       cutoff=25):
        """Instantiate the vectorizer from the dataset dataframe.

        Args:
            predictor_df (pandas.DataFrame): the predictor dataset
            classifier (str): model family; 'GloVe' selects a
                SequenceVocabulary, anything else a plain Vocabulary
            cutoff (int): the parameter for frequency-based filtering
        Returns:
            an instance of the ReviewVectorizer
        """
        # GloVe-based models get an ordered sequence vocabulary; every
        # other classifier uses a bag-of-words vocabulary with <UNK>.
        if classifier == 'GloVe':
            predictor_vocab = SequenceVocabulary()
        else:
            predictor_vocab = Vocabulary(add_unk=True)

        target_vocab = Vocabulary(add_unk=False)

        # Add the target labels in a deterministic (sorted) order.
        for label in sorted(set(predictor_df.target)):
            target_vocab.add_token(label)

        # Count word frequencies and track the longest predictor
        # (presumably used downstream for CNN padding — original noted
        # "for CNN"; confirm against the constructor).
        word_counts = Counter()
        longest = 0
        for _, row in predictor_df.iterrows():
            tokens = remove_punctuation(row.predictor)
            longest = max(longest, len(tokens))
            word_counts.update(tokens)

        # Keep only words seen more than `cutoff` times.
        for token, freq in word_counts.items():
            if freq > cutoff:
                predictor_vocab.add_token(token)

        return cls(predictor_vocab, target_vocab, longest)