Example #1
    def from_serializable(cls, contents):
        character_vocab = Vocabulary.from_serializable(
            contents["character_vocab"])
        nationality_vocab = Vocabulary.from_serializable(
            contents["nationality_vocab"])
        return cls(character_vocab=character_vocab,
                   nationality_vocab=nationality_vocab,
                   max_surname_length=contents["max_surname_length"])
    def from_serializable(cls, contents):
        """ Instantiate a ReviewVectorizer from a serializable dictionary

        :param contents (dict): the serializable dictionary
        :return an instance of the ReviewVectorizer class
        """
        review_vocab = Vocabulary.from_serializable(contents["review_vocab"])
        rating_vocab = Vocabulary.from_serializable(contents["rating_vocab"])
        return cls(review_vocab, rating_vocab)
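All of these snippets depend on a Vocabulary class (and a SequenceVocabulary subclass) that is not shown on this page. Below is a minimal sketch of the assumed API, reconstructed only from how the examples here use it; the token_to_idx parameter name and the default unk_token value are guesses, not confirmed by the examples.

class Vocabulary:
    """Minimal sketch of the assumed Vocabulary API; not the original implementation."""

    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        self._token_to_idx = dict(token_to_idx) if token_to_idx else {}
        self._idx_to_token = {idx: tok for tok, idx in self._token_to_idx.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token
        self.unk_index = self.add_token(unk_token) if add_unk else -1

    def add_token(self, token):
        # Return the existing index, or assign the next free index to a new token.
        if token in self._token_to_idx:
            return self._token_to_idx[token]
        index = len(self._token_to_idx)
        self._token_to_idx[token] = index
        self._idx_to_token[index] = token
        return index

    def to_serializable(self):
        # Plain dict that from_serializable can feed straight back into the constructor.
        return {"token_to_idx": self._token_to_idx,
                "add_unk": self._add_unk,
                "unk_token": self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)

SequenceVocabulary is assumed to be a subclass that additionally registers mask/begin/end sequence tokens; its details do not matter for the serialization round trip shown in these examples.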
Example #3
    def from_dataframe(cls, surname_df):
        character_vocab = SequenceVocabulary(add_unk=False)
        nationality_vocab = Vocabulary(add_unk=False)

        for index, row in surname_df.iterrows():
            for letter in row.surname:
                character_vocab.add_token(letter)
            nationality_vocab.add_token(row.nationality)

        return cls(character_vocab, nationality_vocab)
    def from_serializable(cls, contents):
        """ Instantiate a SurnameVectorizer from a serializable dictionary

        :param contents (dict): the serializable dictionary
        :return an instance of the SurnameVectorizer class
        """
        surname_vocab = Vocabulary.from_serializable(contents["surname_vocab"])
        nationality_vocab = Vocabulary.from_serializable(
            contents["nationality_vocab"])
        return cls(surname_vocab, nationality_vocab)
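A short usage sketch for the surname examples above, assuming the methods live on a SurnameVectorizer class (the name taken from the docstring) and are decorated with @classmethod; the toy DataFrame stands in for the real surname dataset:

import pandas as pd

# Toy stand-in for the surname dataset: one row per surname.
surname_df = pd.DataFrame({
    "surname": ["Nakamoto", "Dubois", "Schmidt"],
    "nationality": ["Japanese", "French", "German"],
})
vectorizer = SurnameVectorizer.from_dataframe(surname_df)

from_serializable is the inverse of a matching to_serializable method (not shown here) that returns a dict with the "surname_vocab" and "nationality_vocab" keys read above.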
Example #5
    def from_serializable(cls, contents):
        character_vocab = SequenceVocabulary.from_serializable(
            contents["character_vocab"])
        nationality_vocab = Vocabulary.from_serializable(
            contents["nationality_vocab"])
        return cls(character_vocab=character_vocab,
                   nationality_vocab=nationality_vocab)
    def from_dataframe(cls, surname_df):
        surname_vocab = Vocabulary(unk_token="@")
        nationality_vocab = Vocabulary(add_unk=False)

        for index, row in surname_df.iterrows():
            for letter in row.surname:
                surname_vocab.add_token(letter)
            nationality_vocab.add_token(row.nationality)

        return cls(surname_vocab, nationality_vocab)
    def from_dataframe(cls, news_df, cutoff=25):
        title_vocab = SequenceVocabulary()
        category_vocab = Vocabulary(add_unk=False)

        word_counts = Counter()
        for title in news_df.title:
            for token in title.split(" "):
                if token not in string.punctuation:
                    word_counts[token] += 1

        for word, word_count in word_counts.items():
            if word_count >= cutoff:
                title_vocab.add_token(word)

        for category in sorted(set(news_df.category)):
            category_vocab.add_token(category)

        return cls(title_vocab, category_vocab)
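The title example relies on Counter and string from the standard library (from collections import Counter; import string at module level). A usage sketch, assuming the method sits on a NewsVectorizer-style class; the class name and toy data are illustrative, not taken from the examples:

import pandas as pd

news_df = pd.DataFrame({
    "title": ["markets rally as rates hold", "rates hold steady again"],
    "category": ["Business", "Business"],
})
# With cutoff=2, only "rates" and "hold" (count 2) enter title_vocab;
# every other token falls below the frequency threshold.
vectorizer = NewsVectorizer.from_dataframe(news_df, cutoff=2)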
Example #8
    def from_dataframe(cls, surname_df):
        character_vocab = Vocabulary(unk_token="@")
        nationality_vocab = Vocabulary(add_unk=False)
        max_surname_length = 0

        for index, row in surname_df.iterrows():
            max_surname_length = max(max_surname_length, len(row.surname))
            for letter in row.surname:
                character_vocab.add_token(letter)
            nationality_vocab.add_token(row.nationality)

        return cls(character_vocab, nationality_vocab, max_surname_length)
    def from_dataframe(cls, review_df, cutoff=25):
        """ Instantiate the vectorizer from the dataset dataframe

        :param review_df (pandas.DataFrame): the review dataset
        :param cutoff (int): the parameter for frequency-based filtering
        :return an instance of the ReviewVectorizer
        """
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)

        for rating in sorted(set(review_df.rating)):
            rating_vocab.add_token(rating)

        word_counts = Counter()
        for review in review_df.review:
            for word in review.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1

        for word, count in word_counts.items():
            if count > cutoff:
                review_vocab.add_token(word)

        return cls(review_vocab, rating_vocab)
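The review variant works the same way, but it keeps a word only if its count is strictly greater than cutoff, whereas the title example above uses >=. A usage sketch (ReviewVectorizer is the class named in the docstring of Example #1; the data and threshold are illustrative):

import pandas as pd

review_df = pd.DataFrame({
    "review": ["great food great service", "awful service"],
    "rating": ["positive", "negative"],
})
# With cutoff=1, only "great" and "service" (count 2) pass the count > 1 filter.
vectorizer = ReviewVectorizer.from_dataframe(review_df, cutoff=1)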
Example #10
    def from_serializable(cls, contents):
        title_vocab = SequenceVocabulary.from_serializable(
            contents["title_vocab"])
        category_vocab = Vocabulary.from_serializable(
            contents["category_vocab"])
        return cls(title_vocab=title_vocab, category_vocab=category_vocab)
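In practice, from_serializable is paired with a to_serializable method and a JSON file to persist the vectorizer between training and inference. A sketch of that round trip; the to_serializable call, the file name, and the NewsVectorizer class name are assumptions, not shown in the examples above:

import json

# Save: dump the nested dict of vocabulary contents to disk.
with open("vectorizer.json", "w") as fp:
    json.dump(vectorizer.to_serializable(), fp)

# Load: rebuild the vectorizer and its vocabularies from the dict.
with open("vectorizer.json") as fp:
    vectorizer = NewsVectorizer.from_serializable(json.load(fp))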