    @classmethod
    def from_dataframe(cls, review_df, cutoff=25):
        """
        Instantiate the vectorizer from the dataset dataframe.

        Args:
            review_df (pandas.DataFrame): the review dataset dataframe
        Returns:
            an instance of the ReviewVectorizer
        """
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)

        # Add ratings
        for rating in sorted(set(review_df.rating)):
            rating_vocab.add_token(rating)

        # Add words whose count exceeds the provided cutoff
        word_counts = Counter()
        for review in review_df.review:
            for word in review.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1

        for word, count in word_counts.items():
            if count > cutoff:
                review_vocab.add_token(word)

        return cls(review_vocab, rating_vocab)
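
# Usage sketch (assumption, not part of the original snippet): the method above
# is a classmethod of a ReviewVectorizer class that pairs a review Vocabulary
# with a rating Vocabulary. Given that class and a pandas DataFrame with
# 'review' and 'rating' columns, it could be driven like this; the sample rows
# and cutoff value are illustrative only.
import pandas as pd

sample_df = pd.DataFrame({
    "review": ["great food and friendly staff", "terrible slow service"],
    "rating": ["positive", "negative"],
})
vectorizer = ReviewVectorizer.from_dataframe(sample_df, cutoff=0)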
Example #2
    def __init__(self,
                 vocabulary: Vocabulary,
                 tokenizer=split_tokenizer,
                 init_token=None,
                 eos_token=None,
                 pad_token=None,
                 reverse=False):
        self.vocab = vocabulary

        if init_token:
            self.init_idx = vocabulary.add_token(init_token)
            self.init_token = init_token
            self.init_present = 1
        else:
            self.init_present = 0

        if eos_token:
            self.eos_idx = vocabulary.add_token(eos_token)
            self.eos_token = eos_token
            self.eos_present = 1
        else:
            self.eos_present = 0

        if pad_token:
            self.pad_idx = vocabulary.add_token(pad_token)

        self.tokenizer = tokenizer
        self.reverse = reverse
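
# Usage sketch (assumption, not part of the original snippet): the fragment
# above does not name the class this __init__ belongs to, so 'TextField' and
# the no-argument Vocabulary() constructor below are hypothetical stand-ins.
# The sketch shows how the optional special tokens register themselves in the
# shared vocabulary and how their indices become available on the field.
vocab = Vocabulary()
field = TextField(vocab,
                  init_token="<sos>",
                  eos_token="<eos>",
                  pad_token="<pad>",
                  reverse=False)
# After construction, field.init_idx, field.eos_idx and field.pad_idx hold the
# indices returned by vocabulary.add_token for the three special tokens.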
Example #3
    # DATA FILES #
    train_loc = locations['train_loc']
    dev_loc = locations['test_loc']
    fasttext_loc = locations['embeddings_loc']
    w2vec_loc = locations['w2vec_loc']
    model_loc = locations['model_loc']
    stopwordsfile = locations['stopwordsfile']

    # VOCABULARY #
    special_tokens = [INIT_TOKEN, UNK_TOKEN, END_TOKEN, PAD_TOKEN]
    with open(train_loc) as f:
        raw_text = f.read()
    voc = Vocabulary(raw_text, bigram=bigram)
    voc.prune(threshold=1)
    for token in special_tokens:
        voc.add_token(token)
    w2idx = voc.w2idx
    idx2w = voc.idx2w
    voc_size = voc.get_length()
    pad_idx = w2idx[PAD_TOKEN]
    init_idx = w2idx[INIT_TOKEN]

    # STOP WORDS #
    with open(stopwordsfile) as f:
        stop_words = f.read().split()
    stop_words.extend(special_tokens)
    stop_idx = [w2idx[w] for w in stop_words if w in w2idx]

    # PRE-TRAINED EMBEDDINGS #
    if os.path.exists(w2vec_loc):
        with open(w2vec_loc, 'rb') as f: