def from_dataframe(cls, review_df, cutoff=25):
    """Build a vectorizer from the review dataset dataframe.

    Args:
        review_df (pandas.DataFrame): the review dataset
        cutoff (int): frequency threshold; only words seen strictly more
            than `cutoff` times enter the review vocabulary
    Returns:
        an instance of the ReviewVectorizer
    """
    rating_vocab = Vocabulary(add_unk=False)
    review_vocab = Vocabulary(add_unk=True)

    # Register every distinct rating, sorted for a deterministic index order.
    for rating in sorted(set(review_df.rating)):
        rating_vocab.add_token(rating)

    # Tally word frequencies across all reviews, skipping bare punctuation.
    word_counts = Counter(
        token
        for review in review_df.review
        for token in review.split(" ")
        if token not in string.punctuation
    )

    # Frequency-based filtering: keep words above the cutoff.
    for token, freq in word_counts.items():
        if freq > cutoff:
            review_vocab.add_token(token)

    return cls(review_vocab, rating_vocab)
def from_dataframe(cls, news_df):
    """Build the vectorizer from the dataset dataframe.

    Args:
        news_df (pandas.DataFrame): the target dataset
    Returns:
        an instance of the NREVectorizer
    """
    # Relation labels form a closed set: add each distinct value once.
    relation_vocab = Vocabulary()
    for rel in set(news_df.relation):
        relation_vocab.add_token(rel)

    # Segment every sequence with jieba (cut_all=False is precise mode)
    # and feed the resulting token list into the sequence vocabulary.
    seq_vocab = SequenceVocabulary()
    for text in news_df.sequence:
        seq_vocab.add_many(list(jieba.cut(text, cut_all=False)))

    return cls(seq_vocab, relation_vocab)
def from_dataframe(cls, review_df, cutoff=25):
    """Create a vectorizer from the review dataframe.

    Args:
        review_df (pandas.DataFrame): the review dataset
        cutoff (int): frequency threshold; only words that occur strictly
            more than `cutoff` times are added to the review vocabulary
    Returns:
        an instance of the vectorizer class
    """
    review_vocab = Vocabulary(add_unk=True)
    rating_vocab = Vocabulary(add_unk=False)

    # Ratings: every distinct value, added in sorted order.
    for label in sorted(set(review_df.rating)):
        rating_vocab.add_token(label)

    # Tally word occurrences across all reviews, ignoring punctuation tokens.
    tally = Counter()
    for text in review_df.review:
        for token in text.split(" "):
            if token not in string.punctuation:
                tally[token] += 1

    # Frequency-based filtering: keep words above the cutoff.
    for token, n in tally.items():
        if n > cutoff:
            review_vocab.add_token(token)

    return cls(review_vocab, rating_vocab)
def from_dataframe(cls, dataset_df, cutoff=c_frequencyCutoff):
    """Build the TwitterVectorizer from the tweets dataframe.

    Args:
        dataset_df (pandas.DataFrame): the tweets dataset
        cutoff (int): frequency threshold; only words that appear strictly
            more than `cutoff` times enter the text vocabulary
    Returns:
        an instance of the TwitterVectorizer
    """
    # Vocabulary for the text column — the concrete type is supplied by
    # the class hook, so subclasses can substitute their own vocabulary.
    text_vocabulary = cls._get_text_vocabulary()

    # Vocabulary for the target column: closed label set, no unknown token.
    target_vocabulary = Vocabulary(add_unknown_token=False)
    for label in sorted(set(dataset_df.target)):
        target_vocabulary.add_token(label)

    # Tally token frequencies over the whole text column.
    tokenizer = TweetTokenizer()
    word_counts = Counter()
    for text in dataset_df.text:
        word_counts.update(tokenizer.tokenize(text))

    # Keep tokens that are not punctuation and occur more than `cutoff` times.
    for word, count in word_counts.items():
        if count > cutoff and word not in string.punctuation:
            text_vocabulary.add_token(word)

    return cls(text_vocabulary, target_vocabulary)
def from_dataframe(cls, predictor_df, classifier, cutoff=25):
    """Build the vectorizer from the predictor dataframe.

    Args:
        predictor_df (pandas.DataFrame): the predictor dataset
        classifier (str): model family; 'GloVe' selects a
            SequenceVocabulary for the predictor column
        cutoff (int): frequency threshold; only words that occur strictly
            more than `cutoff` times enter the predictor vocabulary
    Returns:
        an instance of the vectorizer class (the third constructor
        argument is the longest predictor length — presumably used to
        size fixed-length inputs, e.g. for the CNN; confirm in __init__)
    """
    # GloVe-based models get a sequence vocabulary; everything else a
    # plain vocabulary with an unknown token.
    if classifier == 'GloVe':
        predictor_vocab = SequenceVocabulary()
    else:
        predictor_vocab = Vocabulary(add_unk=True)
    target_vocab = Vocabulary(add_unk=False)

    # Targets: every distinct value, added in sorted order.
    for target in sorted(set(predictor_df.target)):
        target_vocab.add_token(target)

    # Tally token frequencies and track the longest predictor sequence.
    word_counts = Counter()
    max_predictor_length = 0
    for _, row in predictor_df.iterrows():
        tokens = remove_punctuation(row.predictor)
        if len(tokens) > max_predictor_length:
            max_predictor_length = len(tokens)
        word_counts.update(tokens)

    # Frequency-based filtering: keep words above the cutoff.
    for word, count in word_counts.items():
        if count > cutoff:
            predictor_vocab.add_token(word)

    return cls(predictor_vocab, target_vocab, max_predictor_length)