Example #1
class Corpus(object):
    def __init__(self, path, dict_path):
        self.dictionary = Dictionary()
        add_to_dict = True
        if dict_path and os.path.exists(dict_path):
            print('loading dictionary')
            self.dictionary = self.dictionary.load(dict_path)
            add_to_dict = False
        self.train = self.tokenize(os.path.join(path, 'train.txt'),
                                   add_to_dict)
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'),
                                   add_to_dict)
        self.test = self.tokenize(os.path.join(path, 'test.txt'), add_to_dict)
        if dict_path and not os.path.exists(dict_path):
            self.dictionary.save(dict_path)

    def tokenize(self, path, add_to_dict):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        with open(path) as f:
            all_words = list(
                chain.from_iterable(
                    sent.split() + ['<eos>']
                    for sent in f.read().split('\n')))
        if add_to_dict:
            self.dictionary.add_documents([all_words])
        return torch.LongTensor(self.dictionary.doc2idx(all_words))
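
A minimal usage sketch, assuming a data/ directory containing train.txt, valid.txt and test.txt plus the imports the snippet relies on (os, torch, itertools.chain and gensim's Dictionary); the paths are hypothetical:

# Build the three splits once and persist the dictionary so a later run can
# reuse the same token ids (hypothetical paths).
corpus = Corpus('data', dict_path='data/corpus.dict')
print(corpus.train.size())      # 1-D LongTensor of token ids
print(len(corpus.dictionary))   # vocabulary size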
Example #2
def make_item_descriptions(max_sentence_length=None):
    descriptions = pd.read_csv(os.path.join(
        'data', 'descriptions.csv')).rename(columns={'movie': 'item'})
    texts = descriptions.description
    texts = texts.apply(lambda x: x.strip().split())
    dictionary = Dictionary(texts.values)
    dictionary.filter_extremes()
    eos_id = len(dictionary.keys())

    # to index list
    texts = texts.apply(
        lambda x: dictionary.doc2idx(x, unknown_word_index=eos_id))
    texts = texts.apply(lambda x: np.array([a for a in x if a != eos_id]))
    lengths = texts.apply(len)
    if max_sentence_length is None:
        max_sentence_length = lengths.max()
    else:
        max_sentence_length = min(lengths.max(), max_sentence_length)

    # padding
    texts = texts.apply(lambda x: x[:max_sentence_length])
    texts = texts.apply(lambda x: np.pad(x, (0, max_sentence_length - len(x)),
                                         'constant',
                                         constant_values=(0, eos_id)))

    # change types
    texts = texts.apply(lambda x: x.astype(np.int32))
    descriptions.id = descriptions.id.astype(np.int32)

    return descriptions.id.values, texts.values, len(dictionary.keys()) + 1
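
The doc2idx call above marks out-of-vocabulary words with eos_id so they can be dropped before padding; the same idea in a standalone sketch with toy data:

from gensim.corpora import Dictionary
import numpy as np

dictionary = Dictionary([["space", "opera"], ["road", "movie"]])
eos_id = len(dictionary.keys())                  # one id past the vocabulary
ids = dictionary.doc2idx(["road", "trip", "movie"], unknown_word_index=eos_id)
ids = np.array([a for a in ids if a != eos_id])  # the unknown "trip" is dropped
print(np.pad(ids, (0, 5 - len(ids)), 'constant', constant_values=(0, eos_id)))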
Example #3
    def preprocess_phrase(self, phrases: Iterable[str],
                          dictionary: Dictionary) -> List[List[int]]:
        numerized_phrases: List[List[int]] = []

        for phrase in phrases:
            phrase = preprocess_string(phrase, self.custom_filter)
            phrase_idx = dictionary.doc2idx(phrase, None)
            phrase_idx = [x for x in phrase_idx if x is not None]
            numerized_phrases.append(phrase_idx)

        return numerized_phrases
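
Passing None as the unknown_word_index (as above) makes out-of-vocabulary entries easy to filter out afterwards; a minimal sketch with a toy dictionary:

from gensim.corpora import Dictionary

dictionary = Dictionary([["deep", "learning"], ["machine", "learning"]])
phrase_idx = dictionary.doc2idx(["deep", "learning", "rocks"], None)
print([x for x in phrase_idx if x is not None])  # "rocks" is dropped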
Example #4
class Lda:
    def __init__(self):
        self.model = None
        self.common_dictionary = None

    def train(self, common_texts, num_topics):
        self.common_dictionary = Dictionary(common_texts)
        common_corpus = [
            self.common_dictionary.doc2bow(text) for text in common_texts
        ]
        self.model = LdaModel(common_corpus,
                              num_topics=num_topics,
                              alpha='auto',
                              eval_every=5)

    def get_topics(self, words=None):
        s = self.model.get_topics().T
        if words is not None:
            common_corpus = self.common_dictionary.doc2idx(words)
            s = s[common_corpus]
        return s
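
A sketch of how this wrapper might be used, here with gensim's toy common_texts corpus:

from gensim.test.utils import common_texts

lda = Lda()
lda.train(common_texts, num_topics=3)
# Rows correspond to the queried words, columns to the three topics.
print(lda.get_topics(words=['computer', 'time']).shape)  # (2, 3)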
Example #5
    target = len(infoDF)
    lt = LoopTimer(update_after=10, avg_length=1000, target=target)
    for abstract_id, row in infoDF.iterrows():
        doc = Doc(vocab).from_disk(
            os.path.join(path_to_annotations, f"{abstract_id}.spacy"))

        doc = replace_cluster_in_doc(doc, replace_dic, sorted_mentions, nlp)

        lemma_s_list.append(doc_2_token(doc, split_sentences=True))
        lemma_d_list.append(doc_2_token(doc, split_sentences=False))
        abstract_id_list.append(abstract_id)

        breaker = lt.update(f"Create Pandas - {len(lemma_d_list)}")

    dictionary = Dictionary(lemma_d_list)
    id_d_list = [dictionary.doc2idx(document) for document in lemma_d_list]
    id_s_list = [[dictionary.doc2idx(sentence) for sentence in document]
                 for document in lemma_s_list]

    corpus = {
        "abstract_id": abstract_id_list,
        "lemma_sentence": lemma_s_list,
        "lemma_document": lemma_d_list,
        "lemma_id_sentence": id_s_list,
        "lemma_id_document": id_d_list
    }

    with open(os.path.join(path_to_pandas, corpus_file_name), "wb") as handle:
        pickle.dump(corpus, handle)

    dictionary.save(os.path.join(path_to_pandas, dictionary_file_name))
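
The per-sentence mapping above reuses a single Dictionary built from whole documents; the same pattern in isolation with toy lemmas:

from gensim.corpora import Dictionary

lemma_d_list = [["cats", "chase", "mice"], ["dogs", "chase", "cats"]]
lemma_s_list = [[["cats", "chase"], ["mice"]], [["dogs", "chase", "cats"]]]
dictionary = Dictionary(lemma_d_list)
id_s_list = [[dictionary.doc2idx(sent) for sent in doc] for doc in lemma_s_list]
print(id_s_list)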
Example #6
class MultiVectorizer():

    reserved = ["<PAD>", "<UNK>"]
    embedding_matrix = None
    embedding_word_vector = {}
    glove = False

    def __init__(self, reserved=None, min_occur=1, glove_path=None, tokenizer=None, embedding_size=300):

        self.min_occur = min_occur
        self.embedding_size = embedding_size

        self.nlp = spacy.load("en")
        if tokenizer is None:
            self.tokenizer = English().Defaults.create_tokenizer(self.nlp)
        else:
            self.tokenizer = tokenizer

        if glove_path is not None:
            self.load_glove(glove_path)
            self.glove = True

        if reserved is not None:
            # list.extend returns None, so extend first and then build the
            # Dictionary from the updated reserved token list.
            self.reserved.extend(reserved)
        self.vocabulary = Dictionary([self.reserved])

    def get_vocabulary_size(self):
        return len(self.vocabulary.token2id.items())

    def load_glove(self, glove_file_path):
        with open(glove_file_path, encoding="utf-8") as f:
            for line in tqdm(f):
                value = line.split(" ")
                word = value[0]
                coef = np.array(value[1:], dtype='float32')
                self.embedding_word_vector[word] = coef

    def get_embedding_matrix(self):
        return self.embedding_matrix

    def is_word(self, string_value):
        return string_value in self.embedding_word_vector

    def get_vocabulary(self):
        return self.vocabulary

    def get_word_id(self, word):
        return self.vocabulary.token2id[word]

    def get_word_from_id(self, index):
        return self.vocabulary.id2token[index]

    def fit_document(self, documents):
        document_tokens = []
        for document in documents:
            section_tokens = []
            for section in document:
                sentence_tokens = []
                for sentence in section:
                    tokens = self.tokenizer(sentence.lower())
                    word_str_tokens = list(map(convert_to_string, tokens))
                    sentence_tokens.append(word_str_tokens)
                    self.vocabulary.add_documents(sentence_tokens)
                section_tokens.append(sentence_tokens)
            document_tokens.append(section_tokens)
        return document_tokens

    def fit_samples_with_sentences(self, samples):
        output_tokens = []
        vocab = []
        for sample in samples:
            sentence_tokens = []
            for sentence in sample:
                tokens = self.tokenizer(sentence.lower())
                word_str_tokens = list(map(convert_to_string, tokens))
                sentence_tokens.append(word_str_tokens)
                vocab.append(word_str_tokens)
            output_tokens.append(sentence_tokens)
        # Register all tokenized sentences with the vocabulary once, after the
        # loop, so document frequencies are counted a single time.
        self.vocabulary.add_documents(vocab)
        return output_tokens

    def fit(self, X):
        if type(X[0]) == list:
            x_tokens = self.fit_samples_with_sentences(X) #self.fit_document(X)
        else:
            x_tokens = self.fit_text(X)

        self.vocabulary.filter_extremes(no_below=self.min_occur, no_above=1.0, keep_tokens=self.reserved)

        if self.glove:
            print("Vocabulary Size:",self.get_vocabulary_size())
            self.embedding_matrix = np.zeros((self.get_vocabulary_size(), self.embedding_size))
            for word, i in tqdm(self.vocabulary.token2id.items()):
                if word == "<PAD>":
                    embedding_value = np.zeros((1, self.embedding_size))
                elif word == "<UNK>":
                    sd =  1/np.sqrt(self.embedding_size)
                    np.random.seed(seed=42)
                    embedding_value = np.random.normal(0, scale=sd, size=[1, self.embedding_size])
                else:
                    embedding_value = self.embedding_word_vector.get(word)
                    if embedding_value is None:
                        embedding_value = self.embedding_word_vector.get("<UNK>")
                if embedding_value is not None:
                    self.embedding_matrix[i] = embedding_value
        return  self.transform(x_tokens)

    def fit_text(self, X):
        x_tokens = []
        for x in X:
            if x is not None:
                # x_tokens.append(word_tokenize(x.lower()))
                tokens = self.tokenizer(x.lower())
                word_str_tokens = list(map(convert_to_string, tokens))
                x_tokens.append(word_str_tokens)
        # Add the tokenized texts to the vocabulary once, after the loop,
        # instead of re-adding the growing list on every iteration.
        self.vocabulary.add_documents(x_tokens)
        return x_tokens

    def transform(self, X):
        return self.transform_list_of_list(X)

    def transform_list_of_list(self, samples):
        samples_tokens = []
        for sample in samples:
            encoded_tokens = self.transform_section(sample)
            samples_tokens.append(encoded_tokens)
        return samples_tokens

    def transform_document(self, documents):
        document_tokens = []
        for document in documents:
            section_tokens = []
            encoded_tokens = []
            for section in document:
                if type(section) == str:
                    encoded_tokens.append(section)
                    if len(encoded_tokens) == len(document):
                        section_tokens.append(encoded_tokens)
                        section_tokens = self.transform_section(section_tokens)
                else:
                    encoded_tokens = self.transform_section(section)
                    section_tokens.append(encoded_tokens)
            document_tokens.append(section_tokens)
        return document_tokens

    def transform_section(self, X):
        if hasattr(self, "limit"):
            return [[i if i < self.limit else self.reserved.index("<UNK>")
                     for i in self.vocabulary.doc2idx(x, unknown_word_index=self.reserved.index("<UNK>"))]
                    for x in X]
        else:
            return [self.vocabulary.doc2idx(x, unknown_word_index=self.reserved.index("<UNK>")) for x in X]

    def inverse_transform(self, X):
        return [[ self.vocabulary[i] for i in x ] for x in X]

    def save(self, file_path="./vectorizer.vec"):
        with open(file_path, "wb") as handle:
            pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return file_path

    @classmethod
    def load(cls, file_path):
        with open(file_path, "rb") as handle:
            self = pickle.load(handle)
        return self
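
The encoding in transform_section boils down to doc2idx with the <UNK> slot as the unknown_word_index; a standalone sketch with a toy vocabulary:

from gensim.corpora import Dictionary

reserved = ["<PAD>", "<UNK>"]
vocabulary = Dictionary([reserved, ["hello", "world"]])
unk = reserved.index("<UNK>")
# "there" is out of vocabulary and comes back as the <UNK> id.
print(vocabulary.doc2idx(["hello", "there", "world"], unknown_word_index=unk))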
Example #7
    val_sent_tensor_pairs = list(zip(english_val_tensors.values, indo_val_tensors.values))
    val_sent_pairs = list(zip(df_val_in['English'], df_val_in['Indonesian']))
    return val_sent_pairs, val_sent_tensor_pairs


val_sent_pairs, val_sent_tensor_pairs = get_validation_pairs(df_val) #MOD Anurag
print(val_sent_pairs[0])

print(val_sent_pairs[-1])

print(val_sent_pairs[154])

print(val_sent_pairs[154][0])

for w in val_sent_pairs[154][0].split(' '):
    print(english_vocab.doc2idx([w]))


class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden
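
A minimal sketch of feeding ids from doc2idx into the encoder one token at a time (the hidden state starts as zeros and the ids below are placeholders):

import torch

hidden_size = 8
encoder = EncoderRNN(input_size=100, hidden_size=hidden_size)
hidden = torch.zeros(1, 1, hidden_size)
for word_id in [3, 17, 42]:        # e.g. ids from english_vocab.doc2idx(...)
    output, hidden = encoder(torch.tensor([word_id]), hidden)
print(output.shape)                # torch.Size([1, 1, 8])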
Example #8
def cooccurence_matrix(infile, total, window, smoothing):
    """
    Generates a co-occurrence matrix using symmetric-window skip-grams of
    length window.  Then generates a PPMI transform using smoothed probabilities.

    :param infile: bz2-compressed file to read.
    :param total: the total number of files, if known, for TQDM to use.
    :param window: symmetric window size to use.
    :param smoothing: smoothing value for smoothed prior distributions
    :param no_below: no_below arg for Gensim dict.
    :param no_above: no_above arg for Gensim dict.
    :return: SVD vectors
    """

    with bz2.open(infile, "r") as F:
        # gensim Dictionary for word<->id mappings
        vocab = Dictionary(i.split()[1:] for i in tqdm(
            F, total=total, desc=f"{infile}: {'Gathering Vocabulary':<25s}"))
        vocab.compactify()
        sleep(.5)
        print("\nVOCAB SIZE: {}".format(len(vocab)))
        sleep(.5)

    with bz2.open(infile, "r") as F:
        INDS = Counter((
            DOC[i], DOC[i + j]
        ) for DOC in (np.array(vocab.doc2idx(J.split()[1:])) for J in tqdm(
            F, total=total, desc=f"{infile}: {'Co-occurrence Matrix':<25s}"))
                       for i in range(1, len(DOC))
                       for j in range(min(window,
                                          len(DOC) - i)))

    # Convert {(A, B):C} dict structure to np.array([C, A, B]) for
    # sparse matrix construction.
    INDS = np.array([[
        INDS[I], I[0], I[1]
    ] for I in tqdm(INDS.keys(), desc=f"{infile}: {'Generating Indices':<25s}")
                     if I[0] != I[1] and I[0] > 0 and I[1] > 0])
    print(INDS.shape)
    ppmi_mat = csr_matrix((INDS[:, 0], (INDS[:, 1], INDS[:, 2])),
                          shape=(len(vocab), len(vocab)))

    print("PPMI matrix shape: {}".format(ppmi_mat.shape))
    del INDS
    # ppmi_mat.eliminate_zeros()
    # Add transpose, since PPMI is symmetric--PPMI(i,j) = PPMI(j,i)
    ppmi_mat = ppmi_mat + ppmi_mat.transpose()

    ### PPMI TRANSFORMATION ###
    print("Generating matrices for PPMI transform...")
    # We'll use these more than once, so only calculate them the one time
    POW = ppmi_mat.power(smoothing)
    TOT = np.sum(ppmi_mat)
    p_i_star = np.array(np.sum(ppmi_mat, axis=1) / TOT).astype(
        np.float32).reshape((-1, ))
    p_star_j = np.array(np.sum(POW, axis=0) / np.sum(POW)).astype(
        np.float32).reshape((-1, ))
    ppmi_mat = ppmi_mat / TOT

    ### PPMI TRANSFORM ###
    data = ppmi_mat.data.astype(np.float32)
    indices = ppmi_mat.indices.astype(np.int32)
    indptr = ppmi_mat.indptr.astype(np.int32)
    for i in trange(indptr.shape[0] - 1,
                    desc=f"{infile}: {'PPMI Transform':<25s}"):
        data[indptr[i]:indptr[i+1]] = \
            np.maximum(
                0,
                np.log2(data[indptr[i]:indptr[i+1]] / (p_i_star[i] * p_star_j[indices[indptr[i]:indptr[i+1]]]))
        )
    ppmi_mat = csr_matrix((data, indices, indptr))
    ppmi_mat.eliminate_zeros()

    ### SVD ###
    sleep(.5)
    print("SVD...")
    # per https://web.stanford.edu/~jurafsky/slp3/16.pdf we only
    # use the raw left singular values as the word embedding vectors
    U = svds(ppmi_mat, k=300, return_singular_vectors="u")[0]

    return U, vocab
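
The vocabulary pass in isolation, assuming (as above) that each line starts with a document id followed by its tokens:

from gensim.corpora import Dictionary

lines = ["doc1 the cat sat", "doc2 the dog sat"]
vocab = Dictionary(line.split()[1:] for line in lines)
print(vocab.doc2idx("the cat barked".split()))  # OOV words map to -1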
Example #9
class MultiVectorizer():

    reserved = ["<PAD>", "<UNK>"]
    embedding_matrix = None
    embedding_word_vector = {}
    glove = False

    def __init__(self, reserved=None, min_occur=1, use_bert=False, glove_path=None, tokenizer=None, embedding_size=300):

        self.min_occur = min_occur
        self.embedding_size = embedding_size
        self.use_bert = use_bert

        self.nlp = spacy.load("en")
        if tokenizer is None:
            self.tokenizer = English().Defaults.create_tokenizer(self.nlp)
        else:
            self.tokenizer = tokenizer

        if glove_path is not None:
            self.load_glove(glove_path)
            self.glove = True

        if reserved is not None:
            # list.extend returns None, so extend first and then build the
            # Dictionary from the updated reserved token list.
            self.reserved.extend(reserved)
        self.vocabulary = Dictionary([self.reserved])

    def get_vocabulary_size(self):
        if not self.use_bert:
            return len(self.vocabulary.token2id.items())
        else:
            return len(self.tokenizer.vocab.keys())

    def load_glove(self, glove_file_path):
        with open(glove_file_path, encoding="utf-8") as f:
            for line in tqdm(f):
                value = line.split(" ")
                word = value[0]
                coef = np.array(value[1:], dtype='float32')
                self.embedding_word_vector[word] = coef

    def get_embedding_matrix(self):
        return self.embedding_matrix

    def is_word(self, string_value):
        return string_value in self.embedding_word_vector

    def get_vocabulary(self):
        if not self.use_bert:
            return self.vocabulary
        else:
            return self.tokenizer.vocab

    def get_word_id(self, word):
        if not self.use_bert:
            return self.vocabulary.token2id[word]
        else:
            return self.tokenizer.vocab[word]


    def get_word_from_id(self, index):
        if not self.use_bert:
            return self.vocabulary.id2token[index]
        else:
            return self.tokenizer.inv_vocab[index]

    def fit_document(self, documents):
        document_tokens = []
        for document in documents:
            section_tokens = []
            for section in document:
                sentence_tokens = []
                for sentence in section:
                    tokens = self.tokenizer(sentence.lower())
                    word_str_tokens = list(map(convert_to_string, tokens))
                    sentence_tokens.append(word_str_tokens)
                    self.vocabulary.add_documents(sentence_tokens)
                section_tokens.append(sentence_tokens)
            document_tokens.append(section_tokens)
        return document_tokens

    def fit_bert_sentences(self, samples, remove_stop_words=True):
        output_tokens = []
        vocab = []
        stop_words = set(stopwords.words('english'))
        for sample in tqdm(samples):
            sentence_tokens = []
            for sentence in sample:
                tokens = self.tokenizer.tokenize(sentence.lower())
                tokens = [w for w in tokens if not w in stop_words]
                tokens = ["[CLS]"] + tokens + ["[SEP]"]
                sentence_tokens.append(tokens)
                vocab.append(tokens)
            output_tokens.append(sentence_tokens)
        #self.vocabulary.add_documents(vocab)
        return output_tokens

    def fit_samples_with_sentences(self, samples, remove_stop_words=True):
        output_tokens = []
        vocab = []
        for sample in tqdm(samples):
            sentence_tokens = []
            for sentence in sample:
                tokens = self.tokenizer(sentence.lower())
                if remove_stop_words:
                    tokens = [token for token in tokens if not token.is_stop]
                word_str_tokens = list(map(convert_to_string, tokens))
                sentence_tokens.append(word_str_tokens)
                vocab.append(word_str_tokens)
            output_tokens.append(sentence_tokens)
        self.vocabulary.add_documents(vocab)
        return output_tokens

    def fit(self, X, remove_stop_words=True, list_of_lists=False):
        if list_of_lists:
            if not self.use_bert:
                x_tokens = self.fit_samples_with_sentences(X,remove_stop_words=remove_stop_words) #self.fit_document(X)
            else:
                x_tokens = self.fit_bert_sentences(X, remove_stop_words=remove_stop_words)
        else:
            x_tokens = self.fit_text(X)

        self.vocabulary.filter_extremes(no_below=self.min_occur, no_above=1.0, keep_tokens=self.reserved)
        unknown_words = []
        if self.glove:
            #spell = Spellchecker()
            print("Vocabulary Size:",self.get_vocabulary_size())
            self.embedding_matrix = np.zeros((self.get_vocabulary_size(), self.embedding_size))
            for word, i in tqdm(self.vocabulary.token2id.items()):
                if word == "<PAD>":
                    embedding_value = np.zeros((1, self.embedding_size))
                elif word == "<UNK>":
                    sd =  1/np.sqrt(self.embedding_size)
                    np.random.seed(seed=42)
                    embedding_value = np.random.normal(0, scale=sd, size=[1, self.embedding_size])
                else:
                    embedding_value = self.embedding_word_vector.get(word)
                    if embedding_value is None:
                        embedding_value = self.embedding_word_vector.get(self.correct_word(word))
                        if embedding_value is None:
                            unknown_words.append(word)
                            embedding_value = self.embedding_word_vector.get("<UNK>")

                if embedding_value is not None:
                    self.embedding_matrix[i] = embedding_value
        print("Number of unknown words:",len(unknown_words))
        unknown_words_df = pd.DataFrame()
        unknown_words_df["Unknown Words"] = unknown_words
        encoded_tokens = self.transform(x_tokens, list_of_lists=list_of_lists)
        return  encoded_tokens

    def fit_text(self, X, remove_stop_words=True):
        output_tokens = []
        for sample in tqdm(X):
            tokens = self.tokenizer(sample.lower())
            if remove_stop_words:
                tokens = [token for token in tokens if not token.is_stop]
            word_str_tokens = list(map(convert_to_string, tokens))
            output_tokens.append(word_str_tokens)
        self.vocabulary.add_documents(output_tokens)
        return output_tokens

    def correct_word(self, word):
        return word

    def transform(self, X, list_of_lists=False):
        if list_of_lists:
            if not self.use_bert:
                return self.transform_list_of_list(X)
            else:
                return self.transform_bert(X)
        else:
            return self.transform_text(X)

    def transform_list_of_list(self, samples):
        samples_tokens = []
        for sample in samples:
            encoded_tokens = self.transform_text(sample)
            samples_tokens.append(encoded_tokens)
        return samples_tokens

    def transform_document(self, documents):
        document_tokens = []
        for document in documents:
            section_tokens = []
            encoded_tokens = []
            for section in document:
                if type(section) == str:
                    encoded_tokens.append(section)
                    if len(encoded_tokens) == len(document):
                        section_tokens.append(encoded_tokens)
                        section_tokens = self.transform_text(section_tokens)
                else:
                    encoded_tokens = self.transform_text(section)
                    section_tokens.append(encoded_tokens)
            document_tokens.append(section_tokens)
        return document_tokens

    def transform_bert(self, samples):
        samples_tokens = []
        for sample in samples:
            encoded_sentences = []
            for sentence_tokens in sample:
                encoded_tokens = self.tokenizer.convert_tokens_to_ids(sentence_tokens)
                encoded_sentences.append(encoded_tokens)
            samples_tokens.append(encoded_sentences)
        return samples_tokens

    def transform_text(self, X):
        if hasattr(self, "limit"):
            return [[i if i < self.limit else self.reserved.index("<UNK>")
                     for i in self.vocabulary.doc2idx(x, unknown_word_index=self.reserved.index("<UNK>"))]
                    for x in X]
        else:
            return [self.vocabulary.doc2idx(x, unknown_word_index=self.reserved.index("<UNK>")) for x in X]

    def inverse_transform(self, X):
        return [[ self.vocabulary[i] for i in x ] for x in X]

    def save(self, file_path="./vectorizer.vec"):
        with open(file_path, "wb") as handle:
            pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return file_path

    @classmethod
    def load(cls, file_path):
        with open(file_path, "rb") as handle:
            self = pickle.load(handle)
        return self
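
The use_bert branch expects a WordPiece tokenizer exposing .tokenize, .vocab and .convert_tokens_to_ids; a sketch of the token-to-id step with the Hugging Face BertTokenizer as a stand-in (note it does not provide the inv_vocab attribute used by get_word_from_id):

from transformers import BertTokenizer

bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokens = ["[CLS]"] + bert_tokenizer.tokenize("the movie was great .") + ["[SEP]"]
print(bert_tokenizer.convert_tokens_to_ids(tokens))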
Example #10
    test_texts = [
        text_to_word_sequence(data['text'])
        for data in tqdm(imdb_dataset(test=True))
    ]
    test_labels = [
        sentiment[data['sentiment']] for data in imdb_dataset(test=True)
    ]

    # test = imdb_dataset(test=True)

    all_texts = np.concatenate((train_texts, test_texts)).tolist()

    vocabulary = Dictionary(documents=all_texts)
    vocabulary.save('imdb_vocabulary')

    train_x = np.asarray([
        np.asarray(vocabulary.doc2idx(doc), dtype=np.int32) + 1
        for doc in tqdm(train_texts)
    ])
    train_y = np.asarray(train_labels, dtype=np.int32)

    test_x = np.asarray([
        np.asarray(vocabulary.doc2idx(doc), dtype=np.int32) + 1
        for doc in tqdm(test_texts)
    ])
    test_y = np.asarray(test_labels, dtype=np.int32)

    np.save('train_x.npy', train_x)
    np.save('train_y.npy', train_y)
    np.save('test_x.npy', test_x)
    np.save('test_y.npy', test_y)
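
The "+ 1" exists because doc2idx marks out-of-vocabulary words with -1; shifting by one maps them to 0 and known words to 1..len(vocabulary), as this small sketch shows:

from gensim.corpora import Dictionary
import numpy as np

vocabulary = Dictionary([["good", "movie"], ["bad", "plot"]])
ids = np.asarray(vocabulary.doc2idx(["good", "unseen", "plot"]), dtype=np.int32) + 1
print(ids)  # the unseen word becomes 0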
Example #11
class TextData(Iterator):
    def __init__(self,
                 filename=None,
                 vocab_size=None,
                 max_len=None,
                 chunk_size=10**5,
                 delimiter=None,
                 size_mb=4024,
                 pad_symbol='<pad>',
                 start_symbol='<s>',
                 end_symbol='</s>',
                 unknown_symbol='<unk>',
                 default_pad_start=False,
                 default_pad_end=True,
                 filter_on=None,
                 prune_at=10**10,
                 encoding='utf8',
                 **kwargs):
        """
        This is the object to store text and read them into vocabulary
        indices. The object is an iterable that yields vocabulary indices of the
        tokens in the sentences.
        :param filename: Text file that contains the source sentences.
        :type filename: str
        :param vocab_size: Max no. of words to keep in the source vocab.
        :type vocab_size: int
        :param chunk_size: Limits the no. of sentences loaded at a time when populating the vocabulary.
        :type chunk_size: int
        :param delimiter: Delimiter to split on when "tokenizing".
        :type delimiter: str
        :param size_mb: Memory footprint of the bounter object used to count the vocab.
        :type size_mb: int
        :param start_symbol: Start symbol used for padding.
        :type start_symbol: str
        :param end_symbol: End symbol used for padding.
        :type end_symbol: str
        :param unknown_symbol: Unknown symbol for OOV words.
        :type unknown_symbol: str
        :param default_pad_start: Whether to pad sentences with <s> by default when vectorizing.
        :type default_pad_start: bool
        :param default_pad_end: Whether to pad sentences with </s> by default when vectorizing.
        :type default_pad_end: bool
        :param filter_on: Option to filter on term-freq ('tf') or doc-freq ('df')
        :type filter_on: str
        :param prune_at: *prune_at* parameter used by gensim.Dictionary
        :type prune_at: int
        """
        if 'loadfrom' not in kwargs:  # Creating.

            self.filename = absolute_path(filename)

            # Check that inputs are not None.
            assert Path(self.filename).exists(), \
                "File {filename} does not exist".format(filename=filename)

            # Initialize encoding.
            self.encoding = encoding

            # Initialize the pad, start, end and unknown symbols.
            self.PAD, self.PAD_IDX = pad_symbol, 0
            self.START, self.START_IDX = start_symbol, 1
            self.END, self.END_IDX = end_symbol, 2
            self.UNK, self.UNK_IDX = unknown_symbol, 3
            self.default_pad_start = default_pad_start
            self.default_pad_end = default_pad_end

            # Save the user-specific delimiter
            self.delimiter = delimiter

            # Gensim related attribute to keep the pruning cap.
            self.prune_at = prune_at

            # Populate the source vocabulary.
            print('Creating Vocabulary...', end='\n', file=sys.stderr)
            self.vocab = Dictionary(
                [[pad_symbol], [start_symbol], [end_symbol], [unknown_symbol]],
                prune_at=self.prune_at)
            self.counter = bounter(size_mb=size_mb)

            print('Building source vocab and counter...',
                  end=' ',
                  file=sys.stderr)
            self.populate_dictionary(self.filename, self.vocab, self.counter,
                                     chunk_size)
            # Use the user-specified source/target vocab size if set,
            # else use the full vocab_size.
            self.vocab_size = min(len(
                self.vocab), vocab_size) if vocab_size else len(self.vocab)

            # Keep the vocabulary to a max set by user.
            if filter_on and self.vocab_size < len(self.vocab):
                print('Filtering least frequent words in vocab.',
                      end='\n',
                      file=sys.stderr)
                if filter_on == 'tf':
                    self.filter_n_least_frequent(
                        self.vocab,
                        self.counter,
                        self.vocab_size,
                        keep_tokens=['<pad>', '<s>', '</s>', '<unk>'])
                elif filter_on == 'df':
                    self.vocab.filter_extremes(
                        no_below=1,
                        no_above=self.prune_at,
                        keep_n=self.vocab_size,
                        keep_tokens=['<pad>', '<s>', '</s>', '<unk>'])

            self.iterable = self._iterate()

        else:  # Loading.
            self.load(kwargs['loadfrom'], filename,
                      kwargs.get('load_counter', False))
            self.iterable = self._iterate()

    @timing
    def load(self, loadfrom=None, filename=None, load_counter=False):
        """
        The load function.

        :param filename: Path to the filename of the corpus to read,
                         this will overwrite filename in the TextData.json.
        :type filename: str

        :param loadfrom: The path to load the directory for the ParallelData.
        :type loadfrom: str

        :param load_counter: Whether to load the src and trg bounter objects.
        :type load_counter: bool
        """
        assert loadfrom is not None
        config_file = loadfrom + '/TextData.json'
        if not Path(config_file).exists():
            raise DataError('{} config file not found!!'.format(config_file))
        else:
            print('Loading TextData from {}'.format(config_file),
                  end=' ',
                  file=sys.stderr)
            with open(config_file) as fin:
                self.__dict__ = json.load(fin)

            # If the data is saved with TextData.save(copy_data=True),
            # it will appear in self.__dict__ and
            # we set the filename from relative to absolute path
            # if data is copied when saved, i.e. `filename` in self.__dict__
            if 'filename' in self.__dict__:
                self.filename = os.path.join(loadfrom, self.filename)
            # If user specified filename when loading the TextData, e.g.
            #   TextData(filename='path/to/textfile', loadfrom='...'),
            # then we overwrite the filename.
            elif filename:
                self.filename = filename
            else:
                raise DataError(
                    "You need to set the filename when loading TextData, e.g.\n"
                    "\tTextData(loadfrom='path/to/textdata', filename='inputfile.txt')"
                )
            # Check if the filename exists.
            if not os.path.isfile(self.filename):
                raise DataError(
                    "The text file at {} doesn't exist!!".format(self.filename))

            try:
                with open(os.path.join(loadfrom, self.vocab), 'rb') as fin:
                    self.vocab = pickle.load(fin)
            except Exception:
                raise DataError("{}/vocab.pkl isn't found".format(loadfrom))

            if load_counter:
                if ('counter' not in self.__dict__):
                    raise DataError('TextData counter not found!!')
                with open(os.path.join(loadfrom, self.counter), 'rb') as fin:
                    self.counter = pickle.load(fin)

    @timing
    def save(self, saveto, save_counter=False, copy_data=False):
        """
        The save function.
        :param saveto: The path to save the directory for the TextData.
        :type saveto: str
        :param save_counter: Whether to save the bounter objects.
        :type save_counter: bool
        :param copy_data: Make a local copy of the data.
        :type copy_data: bool
        """
        print("Saving TextData to {saveto}".format(saveto=saveto),
              end=' ',
              file=sys.stderr)
        # Create the directory if it doesn't exist.
        if not Path(saveto).exists():
            os.makedirs(saveto)

        # Save the vocab files.
        with open(saveto + '/vocab.pkl', 'wb') as fout:
            pickle.dump(self.vocab, fout)
        with open(saveto + '/vocab.tsv', 'w') as fout:
            for idx, word in self.vocab.items():
                print('\t'.join([str(idx), word]), end='\n', file=fout)

        # Initialize the config file.
        config_json = {
            'delimiter': self.delimiter,
            'encoding': self.encoding,
            'PAD': self.PAD,
            'PAD_IDX': self.PAD_IDX,
            'START': self.START,
            'START_IDX': self.START_IDX,
            'END': self.END,
            'END_IDX': self.END_IDX,
            'UNK': self.UNK,
            'UNK_IDX': self.UNK_IDX,
            'vocab_size': self.vocab_size,
            'vocab': 'vocab.pkl',
            'default_pad_start': self.default_pad_start,
            'default_pad_end': self.default_pad_end
        }

        # Check whether we should save the counter.
        if save_counter:
            with open(saveto + '/counter.pkl', 'wb') as fout:
                pickle.dump(self.counter, fout)
            with open(saveto + '/counter.tsv', 'w') as fout:
                for word, count in self.counter.items():
                    print('\t'.join([str(word), str(count)]),
                          end='\n',
                          file=fout)

        config_json['counter'] = 'counter.pkl' if save_counter else None

        if copy_data:
            _, _filename = os.path.split(
                self.filename)  # Filename without path.
            new_filename = os.path.join(saveto, _filename)
            print('\n\tCopying {} \n\tto {}'.format(self.filename,
                                                    new_filename),
                  end='\n',
                  file=sys.stderr)
            copyfile(absolute_path(self.filename), new_filename)
            config_json['filename'] = _filename

        # Dump the config file.
        with open(saveto + '/TextData.json', 'w') as fout:
            json.dump(config_json, fout, indent=2)

    def split_tokens(self, s):
        """
        A "tokenizer" that splits on space. If the delimiter is set to an empty
        string, it will read characters as tokens.
        :param s: The input string.
        :type s: str
        """
        if self.delimiter == '':  # Character models.
            return list(s.strip())
        else:  # Word models.
            return s.strip().split(self.delimiter)

    @timing
    def populate_dictionary(self, filename, vocab, counter, chunk_size):
        with open(filename, encoding=self.encoding) as fin:
            for chunk in tqdm(per_chunk(fin, chunk_size)):
                if all(c is None for c in chunk):
                    break
                chunk_list_of_tokens = [
                    self.split_tokens(s) for s in chunk if s
                ]
                vocab.add_documents(chunk_list_of_tokens, self.prune_at)
                counter.update(chain(*chunk_list_of_tokens))

    def filter_n_least_frequent(self,
                                vocab,
                                counter,
                                n,
                                keep_tokens=['<pad>', '<s>', '</s>', '<unk>']):
        """
        Remove the least frequent items from the vocabulary.
        :param vocab: self.src_vocab or self.trg_vocab
        :type vocab: gensim.Dictionary
        :param counter: self.src_counter or self.trg_counter
        :type counter: bounter
        :param n: The upper limit of how many items to keep in the vocabulary
        :type n: int
        """
        # If n is bigger than user specified size, don't filter anything.
        if n < len(vocab.token2id):
            good_ids = [
                vocab.token2id[token]
                for token, _ in sorted(counter.items(), key=itemgetter(1))[-n:]
                if token in vocab.token2id
            ]
            good_ids += [self.vocab.token2id[_keep] for _keep in keep_tokens]
            print(good_ids)
            vocab.filter_tokens(good_ids=good_ids)

    def vectorize(self, sent, pad_start=True, pad_end=True):
        """
        Vectorize the sentence, converts it into a list of the indices based on
        the vocabulary. This is used by the `variable_from_sent()`.
        :param sent: The input sentence to convert to vocabulary indices
        :type sent: list(str)
        :param pad_start: Pad the start with the START_IDX [default: True]
        :type pad_start: bool
        :param pad_end: Pad the end with the END_IDX [default: True]
        :type pad_end: bool
        """
        sent = self.split_tokens(sent) if type(sent) == str else sent
        vsent = self.vocab.doc2idx(sent, unknown_word_index=self.UNK_IDX)
        if pad_start:
            vsent = [self.START_IDX] + vsent
        if pad_end:
            vsent = vsent + [self.END_IDX]
        return vsent

    def unvectorize(self, vector, unpad_start=True, unpad_end=True):
        """
        Convert the vector to the natural text sentence.
        """
        tokens = [self.vocab[idx] for idx in map(int, chain(*vector))]
        start = 1 if unpad_start else 0
        end = -1 if unpad_end else None
        return ' '.join(tokens[start:end])

    def reset(self):
        """
        Resets the iterator to the 0th item.
        """
        self.iterable = self._iterate()

    def lines(self):
        """
        The function to iterate through the source and target file.
        """
        with open(self.filename) as fin:
            for line in fin:
                yield line.strip()

    def _iterate(self):
        """
        The helper function to iterate through the source and target file
        and convert the lines into vocabulary indices.
        """
        for line in self.lines():
            sent = self.vectorize(line, self.default_pad_start,
                                  self.default_pad_end)
            yield sent

    def __next__(self):
        return next(self.iterable)

    def shuffle(self):
        return iter(sorted(self, key=lambda k: random.random()))
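
The vectorize() logic in isolation (a sketch): the four reserved symbols get the fixed ids 0-3 by being added as the first four single-token documents, and doc2idx maps everything else, with OOV words falling back to the <unk> id:

from gensim.corpora import Dictionary

vocab = Dictionary([['<pad>'], ['<s>'], ['</s>'], ['<unk>']])
vocab.add_documents([['hello', 'world']])
sent = vocab.doc2idx(['hello', 'there'], unknown_word_index=3)
print([1] + sent + [2])  # <s> ... </s>, with "there" mapped to <unk> (id 3)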
Example #12
class Corpora(Loader):
    """

    """
    is_built = False

    def __init__(self,
                 data_path: str,
                 prefix: str = None,
                 iterator: str = 'token',
                 parsing: str = 'simple',
                 word_up_limit: float = 0.75,
                 word_low_limit: int = 20,
                 dictionary: str = None,
                 shuffle: bool = False,
                 seed: int = 42,
                 document_minimum_length: int = 5,
                 stopwords: str = None):

        iter_map = dict(token=self.tokenize,
                        bow=self.bowize,
                        sentences=self.sentences)
        self.iterator = iter_map[iterator]

        self.word_low_limit = word_low_limit
        self.word_up_limit = word_up_limit

        if stopwords:
            self.stopwords = [w.strip() for w in open(stopwords).readlines()]
        else:
            self.stopwords = []

        if not dictionary:
            self.dictionary = Dictionary()
        else:
            self.dictionary = Dictionary.load_from_text(dictionary)
            if self.stopwords:
                self.dictionary.filter_tokens(
                    bad_ids=self.dictionary.doc2idx(self.stopwords))
            self.is_built = True

        self.shuffle = shuffle
        if self.shuffle:
            np.random.seed(seed)

        self.document_minimum_length = document_minimum_length

        corpus = self.init_corpus(data_path, prefix, parsing)

        super(Corpora, self).__init__(corpus=corpus)

    def __enter__(self):
        if not self.is_built:
            self.build()

        return super(Corpora, self).__enter__()

    def __exit__(self, *args):
        self.clear()
        return super(Corpora, self).__exit__(*args)

    def __iter__(self):
        for v in self.iterator():
            yield v

    def __getitem__(self, key):
        return self.iterator(index=key)

    def init_corpus(self, path: str, prefix: str, parsing: str):
        """

        """
        directory = [os.path.join(path, f) for f in os.listdir(path)]
        folders = list(filter(lambda p: os.path.isdir(p), directory))
        if prefix:
            folders = list(filter(lambda p: prefix in p, folders))

        corpus = [Corpus(path=p, parsing=parsing).load() for p in folders]
        self.__paths = {c.path: c for c in corpus}

        return corpus

    def load_vectors(self, path: str):
        """

        """
        if not path.endswith('.csv'):
            raise AssertionError(
                'Expected the word vectors to be provided as a .csv file.')
        #TODO Use dask in case of too large word vector maps.
        return pd.read_csv(path)

    def build(self):
        """

        """
        if self.is_built:
            logging.warning('Attempted to build an already built Corpora.')
            return

        for c in self.corpus:
            self.dictionary.add_documents(c.tokens)
            c.clear()

        self.dictionary.filter_extremes(no_below=self.word_low_limit,
                                        no_above=self.word_up_limit)

        return self

    def clear(self):
        """

        """
        self.dictionary = Dictionary()

    def bowize(self, index=None):
        """

        """
        N = len(self)

        iterable = self._iterator(index)

        for idx in self._indices(iterable):
            corpus = iterable[idx]
            tokens = corpus.tokens

            for ind in self._indices(tokens):
                doc_tokens = tokens[ind]
                bow = self.dictionary.doc2bow(doc_tokens)
                if len(bow) > self.document_minimum_length:
                    yield bow, N
                else:
                    logging.warning(
                        f'Document at {corpus.documents[ind]} is too short, skipping.'
                    )
                    corpus.mark_empty(ind)
            corpus.clear()

    def tokenize(self, index=None):
        """

        """
        N = len(self)

        iterable = self._iterator(index)

        for idx in self._indices(iterable):
            corpus = iterable[idx]
            tokens = corpus.tokens
            self._move()
            for ind in self._indices(tokens):
                doc_tokens = tokens[ind]
                if len(doc_tokens) > self.document_minimum_length:
                    yield doc_tokens, N
                else:
                    logging.warning(
                        f'Document at {corpus.documents[ind]} is too short, skipping.'
                    )
                    corpus.mark_empty(ind)
            corpus.clear()

    def sentences(self, index=None):
        """

        """
        iterable = self._iterator(index=index)
        for ind in self._indices(iterable=iterable):
            corpus = iterable[ind]
            for sentence in corpus.sentences:
                if len(sentence) > self.document_minimum_length:
                    yield sentence
                else:
                    logging.warning(
                        f'Sentence in {corpus.documents[ind]} is too short, skipping.'
                    )

    def documents(self, index=None):
        """

        """
        for c in self.corpus:
            if len(c) > 1:
                yield c.documents
            else:
                for doc in c.documents:
                    yield doc

    @property
    def years(self):
        """

        """
        return sorted([int(c.year) for c in self.corpus])

    def _iterator(self, index=None):
        """

        """
        iterator = self.corpus
        if index:
            if isinstance(index, int):
                iterator = [self.corpus[index]]  #TODO: Handle indices as slice
            elif isinstance(index, str):
                iterator = [self.__paths[index]]
        return iterator

    def _indices(self, iterable):
        """

        """
        if self.shuffle:
            indices = np.random.permutation(len(iterable))
        else:
            indices = range(len(iterable))
        return indices
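
The stop-word handling in __init__ relies on doc2idx to turn the stop-word strings into ids that filter_tokens can drop; the same trick in isolation:

from gensim.corpora import Dictionary

dictionary = Dictionary([["the", "cat", "sat", "on", "the", "mat"]])
stopword_list = ["the", "on"]
dictionary.filter_tokens(bad_ids=dictionary.doc2idx(stopword_list))
print(sorted(dictionary.token2id))  # ['cat', 'mat', 'sat']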