class Corpus(object):
    def __init__(self, path, dict_path):
        self.dictionary = Dictionary()
        add_to_dict = True
        if dict_path and os.path.exists(dict_path):
            print('loading dictionary')
            self.dictionary = self.dictionary.load(dict_path)
            add_to_dict = False
        self.train = self.tokenize(os.path.join(path, 'train.txt'), add_to_dict)
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'), add_to_dict)
        self.test = self.tokenize(os.path.join(path, 'test.txt'), add_to_dict)
        if dict_path and not os.path.exists(dict_path):
            self.dictionary.save(dict_path)

    def tokenize(self, path, add_to_dict):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        all_words = list(
            chain.from_iterable([
                sent.split() + ['<eos>']
                for sent in open(path).read().split('\n')
            ]))
        if add_to_dict:
            self.dictionary.add_documents([all_words])
        return torch.LongTensor(self.dictionary.doc2idx(all_words))
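A minimal, hypothetical usage sketch for the Corpus class above. It assumes a Penn-Treebank-style directory with train.txt / valid.txt / test.txt, and that os, torch, itertools.chain and gensim's Dictionary are imported as in the original module; the paths are illustrative, not from the source.

# Hypothetical paths; the dictionary is built and saved on the first run.
corpus = Corpus(path='data/ptb', dict_path='data/ptb/vocab.dict')
print(corpus.train.size())                       # 1-D LongTensor of token ids
print(corpus.dictionary[int(corpus.train[0])])   # map the first id back to its token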
def make_item_descriptions(max_sentence_length=None):
    descriptions = pd.read_csv(os.path.join(
        'data', 'descriptions.csv')).rename(columns={'movie': 'item'})
    texts = descriptions.description
    texts = texts.apply(lambda x: x.strip().split())
    dictionary = Dictionary(texts.values)
    dictionary.filter_extremes()
    eos_id = len(dictionary.keys())

    # to index list
    texts = texts.apply(
        lambda x: dictionary.doc2idx(x, unknown_word_index=eos_id))
    texts = texts.apply(lambda x: np.array([a for a in x if a != eos_id]))
    max_sentence_length = max(
        texts.apply(len)) if max_sentence_length is None else min(
            max(texts.apply(len)), max_sentence_length)

    # padding
    texts = texts.apply(lambda x: x[:max_sentence_length])
    texts = texts.apply(lambda x: np.pad(x, (0, max_sentence_length - len(x)),
                                         'constant',
                                         constant_values=(0, eos_id)))

    # change types
    texts = texts.apply(lambda x: x.astype(np.int32))
    descriptions.id = descriptions.id.astype(np.int32)
    return descriptions.id.values, texts.values, len(dictionary.keys()) + 1
def preprocess_phrase(self, phrases: Iterable[str],
                      dictionary: Dictionary) -> List[List[int]]:
    numerized_phrases: List[List[int]] = []
    for phrase in phrases:
        phrase = preprocess_string(phrase, self.custom_filter)
        phrase_idx = dictionary.doc2idx(phrase, None)
        phrase_idx = [x for x in phrase_idx if x is not None]
        numerized_phrases.append(phrase_idx)
    return numerized_phrases
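A quick illustration of what Dictionary.doc2idx does with unknown_word_index=None, which preprocess_phrase above relies on; the toy dictionary and tokens are assumptions, not from the source.

from gensim.corpora import Dictionary

d = Dictionary([['deep', 'learning', 'models']])
print(d.doc2idx(['deep', 'quantum', 'models'], None))  # e.g. [0, None, 2]; OOV tokens map to None
# preprocess_phrase then filters out the None entries, dropping OOV tokens.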
class Lda:
    def __init__(self):
        self.model = None
        self.common_dictionary = None

    def train(self, common_texts, num_topics):
        self.common_dictionary = Dictionary(common_texts)
        common_corpus = [
            self.common_dictionary.doc2bow(text) for text in common_texts
        ]
        self.model = LdaModel(common_corpus,
                              num_topics=num_topics,
                              alpha='auto',
                              eval_every=5)

    def get_topics(self, words=None):
        s = self.model.get_topics().T
        if words is not None:
            common_corpus = self.common_dictionary.doc2idx(words)
            s = s[common_corpus]
        return s
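A small, hypothetical example of the Lda wrapper above; the toy texts and topic count are illustrative only, and gensim's Dictionary/LdaModel are assumed to be imported as in the original module.

common_texts = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response', 'time'],
    ['graph', 'trees', 'minors'],
]
lda = Lda()
lda.train(common_texts, num_topics=2)
print(lda.get_topics().shape)                        # (vocab_size, num_topics)
print(lda.get_topics(words=['computer', 'graph']))   # topic rows for the two chosen words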
# Collect per-abstract tokens (these lists are filled in the loop below).
lemma_s_list = []
lemma_d_list = []
abstract_id_list = []

target = len(infoDF)
lt = LoopTimer(update_after=10, avg_length=1000, target=target)
for abstract_id, row in infoDF.iterrows():
    doc = Doc(vocab).from_disk(
        os.path.join(path_to_annotations, f"{abstract_id}.spacy"))
    doc = replace_cluster_in_doc(doc, replace_dic, sorted_mentions, nlp)

    lemma_s_list.append(doc_2_token(doc, split_sentences=True))
    lemma_d_list.append(doc_2_token(doc, split_sentences=False))
    abstract_id_list.append(abstract_id)

    breaker = lt.update(f"Create Pandas - {len(lemma_d_list)}")

dictionary = Dictionary(lemma_d_list)

id_d_list = [dictionary.doc2idx(document) for document in lemma_d_list]
id_s_list = [[dictionary.doc2idx(sentence) for sentence in document]
             for document in lemma_s_list]

corpus = {
    "abstract_id": abstract_id_list,
    "lemma_sentence": lemma_s_list,
    "lemma_document": lemma_d_list,
    "lemma_id_sentence": id_s_list,
    "lemma_id_document": id_d_list
}

with open(os.path.join(path_to_pandas, corpus_file_name), "wb") as handle:
    pickle.dump(corpus, handle)

dictionary.save(os.path.join(path_to_pandas, dictionary_file_name))
class MultiVectorizer():
    reserved = ["<PAD>", "<UNK>"]
    embedding_matrix = None
    embedding_word_vector = {}
    glove = False

    def __init__(self, reserved=None, min_occur=1, glove_path=None,
                 tokenizer=None, embedding_size=300):
        self.min_occur = min_occur
        self.embedding_size = embedding_size
        self.nlp = spacy.load("en")
        if tokenizer is None:
            self.tokenizer = English().Defaults.create_tokenizer(self.nlp)
        else:
            self.tokenizer = tokenizer
        if glove_path is not None:
            self.load_glove(glove_path)
            self.glove = True
        if reserved is not None:
            # list.extend() returns None, so build the combined list explicitly.
            self.reserved = self.reserved + reserved
        self.vocabulary = Dictionary([self.reserved])

    def get_vocabulary_size(self):
        return len(self.vocabulary.token2id.items())

    def load_glove(self, glove_file_path):
        f = open(glove_file_path, encoding="utf-8")
        for line in tqdm(f):
            value = line.split(" ")
            word = value[0]
            coef = np.array(value[1:], dtype='float32')
            self.embedding_word_vector[word] = coef
        f.close()

    def get_embedding_matrix(self):
        return self.embedding_matrix

    def is_word(self, string_value):
        if self.embedding_word_vector.get(string_value):
            return True

    def get_vocabulary(self):
        return self.vocabulary

    def get_word_id(self, word):
        return self.vocabulary.token2id[word]

    def get_word_from_id(self, index):
        return self.vocabulary.id2token[index]

    def fit_document(self, documents):
        document_tokens = []
        for document in documents:
            section_tokens = []
            for section in document:
                sentence_tokens = []
                for sentence in section:
                    tokens = self.tokenizer(sentence.lower())
                    word_str_tokens = list(map(convert_to_string, tokens))
                    sentence_tokens.append(word_str_tokens)
                self.vocabulary.add_documents(sentence_tokens)
                section_tokens.append(sentence_tokens)
            document_tokens.append(section_tokens)
        return document_tokens

    def fit_samples_with_sentences(self, samples):
        output_tokens = []
        for sample in samples:
            sentence_tokens = []
            for sentence in sample:
                tokens = self.tokenizer(sentence.lower())
                word_str_tokens = list(map(convert_to_string, tokens))
                sentence_tokens.append(word_str_tokens)
            self.vocabulary.add_documents(sentence_tokens)
            output_tokens.append(sentence_tokens)
        return output_tokens

    def fit(self, X):
        if type(X[0]) == list:
            x_tokens = self.fit_samples_with_sentences(X)  # self.fit_document(X)
        else:
            x_tokens = self.fit_text(X)
        self.vocabulary.filter_extremes(no_below=self.min_occur, no_above=1.0,
                                        keep_tokens=self.reserved)
        if self.glove:
            print("Vocabulary Size:", self.get_vocabulary_size())
            self.embedding_matrix = np.zeros((self.get_vocabulary_size(),
                                              self.embedding_size))
            for word, i in tqdm(self.vocabulary.token2id.items()):
                if word == "<PAD>":
                    embedding_value = np.zeros((1, self.embedding_size))
                elif word == "<UNK>":
                    sd = 1 / np.sqrt(self.embedding_size)
                    np.random.seed(seed=42)
                    embedding_value = np.random.normal(
                        0, scale=sd, size=[1, self.embedding_size])
                else:
                    embedding_value = self.embedding_word_vector.get(word)
                    if embedding_value is None:
                        embedding_value = self.embedding_word_vector.get("<UNK>")
                if embedding_value is not None:
                    self.embedding_matrix[i] = embedding_value
        return self.transform(x_tokens)

    def fit_text(self, X):
        x_tokens = []
        for x in X:
            if x is not None:
                # x_tokens.append(word_tokenize(x.lower()))
                tokens = self.tokenizer(x.lower())
                word_str_tokens = list(map(convert_to_string, tokens))
                x_tokens.append(word_str_tokens)
        self.vocabulary.add_documents(x_tokens)
        return x_tokens

    def transform(self, X):
        return self.transform_list_of_list(X)

    def transform_list_of_list(self, samples):
        samples_tokens = []
        for sample in samples:
            encoded_tokens = self.transform_section(sample)
            samples_tokens.append(encoded_tokens)
        return samples_tokens

    def transform_document(self, documents):
        document_tokens = []
        for document in documents:
            section_tokens = []
            encoded_tokens = []
            for section in document:
                if type(section) == str:
                    encoded_tokens.append(section)
                    if len(encoded_tokens) == len(document):
                        section_tokens.append(encoded_tokens)
                        section_tokens = self.transform_section(section_tokens)
                else:
                    encoded_tokens = self.transform_section(section)
                    section_tokens.append(encoded_tokens)
            document_tokens.append(section_tokens)
        return document_tokens

    def transform_section(self, X):
        if hasattr(self, "limit"):
            return [[i if i < self.limit else self.reserved.index("<UNK>")
                     for i in self.vocabulary.doc2idx(
                         x, unknown_word_index=self.reserved.index("<UNK>"))]
                    for x in X]
        else:
            return [self.vocabulary.doc2idx(
                        x, unknown_word_index=self.reserved.index("<UNK>"))
                    for x in X]

    def inverse_transform(self, X):
        return [[self.vocabulary[i] for i in x] for x in X]

    def save(self, file_path="./vectorizer.vec"):
        with open(file_path, "wb") as handle:
            pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return file_path

    @classmethod
    def load(cls, file_path):
        with open(file_path, "rb") as handle:
            self = pickle.load(handle)
        return self
def get_validation_pairs(df_val_in):
    # (earlier lines of this function, which build english_val_tensors and
    #  indo_val_tensors from df_val_in, are not shown in this excerpt)
    val_sent_tensor_pairs = list(zip(english_val_tensors.values,
                                     indo_val_tensors.values))
    val_sent_pairs = list(zip(df_val_in['English'], df_val_in['Indonesian']))
    return val_sent_pairs, val_sent_tensor_pairs


val_sent_pairs, val_sent_tensor_pairs = get_validation_pairs(df_val)  # MOD Anurag

print(val_sent_pairs[0])
print(val_sent_pairs[-1])
print(val_sent_pairs[154])
print(val_sent_pairs[154][0])
for w in val_sent_pairs[154][0].split(' '):
    print(english_vocab.doc2idx([w]))


class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden
def cooccurence_matrix(infile, total, window, smoothing):
    """
    Generates a co-occurrence matrix using symmetric-window skip-grams of
    length window, then applies a PPMI transform using smoothed probabilities.

    :param infile: bz2-compressed file to read.
    :param total: the total number of lines, if known, for TQDM to use.
    :param window: symmetric window size to use.
    :param smoothing: smoothing value for smoothed prior distributions.
    :return: SVD vectors and the gensim Dictionary.
    """
    with bz2.open(infile, "r") as F:
        # gensim Dictionary for word<->id mappings
        vocab = Dictionary(i.split()[1:] for i in tqdm(
            F, total=total, desc=f"{infile}: {'Gathering Vocabulary':<25s}"))
    vocab.compactify()
    sleep(.5)
    print("\nVOCAB SIZE: {}".format(len(vocab)))
    sleep(.5)

    with bz2.open(infile, "r") as F:
        INDS = Counter(
            (DOC[i], DOC[i + j])
            for DOC in (np.array(vocab.doc2idx(J.split()[1:])) for J in tqdm(
                F, total=total,
                desc=f"{infile}: {'Co-occurrence Matrix':<25s}"))
            for i in range(1, len(DOC))
            for j in range(min(window, len(DOC) - i)))

    # Convert {(A, B): C} dict structure to np.array([C, A, B]) for
    # sparse matrix construction.
    INDS = np.array([[INDS[I], I[0], I[1]]
                     for I in tqdm(INDS.keys(),
                                   desc=f"{infile}: {'Generating Indices':<25s}")
                     if I[0] != I[1] and I[0] > 0 and I[1] > 0])
    print(INDS.shape)

    ppmi_mat = csr_matrix((INDS[:, 0], (INDS[:, 1], INDS[:, 2])),
                          shape=(len(vocab), len(vocab)))
    print("PPMI matrix shape: {}".format(ppmi_mat.shape))
    del INDS
    # ppmi_mat.eliminate_zeros()

    # Add transpose, since PPMI is symmetric--PPMI(i,j) = PPMI(j,i)
    ppmi_mat = ppmi_mat + ppmi_mat.transpose()

    ### PPMI TRANSFORMATION ###
    print("Generating matrices for PPMI transform...")
    # We'll use these more than once, so only calculate them the one time
    POW = ppmi_mat.power(smoothing)
    TOT = np.sum(ppmi_mat)
    p_i_star = np.array(np.sum(ppmi_mat, axis=1) / TOT).astype(
        np.float32).reshape((-1, ))
    p_star_j = np.array(np.sum(POW, axis=0) / np.sum(POW)).astype(
        np.float32).reshape((-1, ))
    ppmi_mat = ppmi_mat / TOT

    ### PPMI TRANSFORM ###
    data = ppmi_mat.data.astype(np.float32)
    indices = ppmi_mat.indices.astype(np.int32)
    indptr = ppmi_mat.indptr.astype(np.int32)
    for i in trange(indptr.shape[0] - 1,
                    desc=f"{infile}: {'PPMI Transform':<25s}"):
        data[indptr[i]:indptr[i+1]] = \
            np.maximum(
                0,
                np.log2(data[indptr[i]:indptr[i+1]] /
                        (p_i_star[i] *
                         p_star_j[indices[indptr[i]:indptr[i+1]]]))
            )
    ppmi_mat = csr_matrix((data, indices, indptr))
    ppmi_mat.eliminate_zeros()

    ### SVD ###
    sleep(.5)
    print("SVD...")
    # per https://web.stanford.edu/~jurafsky/slp3/16.pdf we only
    # use the raw left singular values as the word embedding vectors
    U = svds(ppmi_mat, k=300, return_singular_vectors="u")[0]
    return U, vocab
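A hypothetical invocation of cooccurence_matrix() above; the corpus path, window and smoothing values are illustrative (0.75 is a commonly used PPMI smoothing exponent), not taken from the source, and the probe word assumes 'the' appears in the corpus.

U, vocab = cooccurence_matrix('corpus.txt.bz2', total=None, window=5, smoothing=0.75)
print(U.shape)                        # (len(vocab), 300) SVD word vectors
print(U[vocab.token2id['the']][:5])   # first few dimensions of one word's vector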
class MultiVectorizer():
    reserved = ["<PAD>", "<UNK>"]
    embedding_matrix = None
    embedding_word_vector = {}
    glove = False

    def __init__(self, reserved=None, min_occur=1, use_bert=False,
                 glove_path=None, tokenizer=None, embedding_size=300):
        self.min_occur = min_occur
        self.embedding_size = embedding_size
        self.use_bert = use_bert
        self.nlp = spacy.load("en")
        if tokenizer is None:
            self.tokenizer = English().Defaults.create_tokenizer(self.nlp)
        else:
            self.tokenizer = tokenizer
        if glove_path is not None:
            self.load_glove(glove_path)
            self.glove = True
        if reserved is not None:
            # list.extend() returns None, so build the combined list explicitly.
            self.reserved = self.reserved + reserved
        self.vocabulary = Dictionary([self.reserved])

    def get_vocabulary_size(self):
        if not self.use_bert:
            return len(self.vocabulary.token2id.items())
        else:
            return len(self.tokenizer.vocab.keys())

    def load_glove(self, glove_file_path):
        f = open(glove_file_path, encoding="utf-8")
        for line in tqdm(f):
            value = line.split(" ")
            word = value[0]
            coef = np.array(value[1:], dtype='float32')
            self.embedding_word_vector[word] = coef
        f.close()

    def get_embedding_matrix(self):
        return self.embedding_matrix

    def is_word(self, string_value):
        if self.embedding_word_vector.get(string_value):
            return True

    def get_vocabulary(self):
        if not self.use_bert:
            return self.vocabulary
        else:
            return self.tokenizer.vocab

    def get_word_id(self, word):
        if not self.use_bert:
            return self.vocabulary.token2id[word]
        else:
            return self.tokenizer.vocab[word]

    def get_word_from_id(self, index):
        if not self.use_bert:
            return self.vocabulary.id2token[index]
        else:
            return self.tokenizer.inv_vocab[index]

    def fit_document(self, documents):
        document_tokens = []
        for document in documents:
            section_tokens = []
            for section in document:
                sentence_tokens = []
                for sentence in section:
                    tokens = self.tokenizer(sentence.lower())
                    word_str_tokens = list(map(convert_to_string, tokens))
                    sentence_tokens.append(word_str_tokens)
                self.vocabulary.add_documents(sentence_tokens)
                section_tokens.append(sentence_tokens)
            document_tokens.append(section_tokens)
        return document_tokens

    def fit_bert_sentences(self, samples, remove_stop_words=True):
        output_tokens = []
        vocab = []
        stop_words = set(stopwords.words('english'))
        for sample in tqdm(samples):
            sentence_tokens = []
            for sentence in sample:
                tokens = self.tokenizer.tokenize(sentence.lower())
                tokens = [w for w in tokens if not w in stop_words]
                tokens = ["[CLS]"] + tokens + ["[SEP]"]
                sentence_tokens.append(tokens)
                vocab.append(tokens)
            output_tokens.append(sentence_tokens)
        # self.vocabulary.add_documents(vocab)
        return output_tokens

    def fit_samples_with_sentences(self, samples, remove_stop_words=True):
        output_tokens = []
        vocab = []
        for sample in tqdm(samples):
            sentence_tokens = []
            for sentence in sample:
                tokens = self.tokenizer(sentence.lower())
                if remove_stop_words:
                    tokens = [token for token in tokens if not token.is_stop]
                word_str_tokens = list(map(convert_to_string, tokens))
                sentence_tokens.append(word_str_tokens)
                vocab.append(word_str_tokens)
            output_tokens.append(sentence_tokens)
        self.vocabulary.add_documents(vocab)
        return output_tokens

    def fit(self, X, remove_stop_words=True, list_of_lists=False):
        if list_of_lists:
            if not self.use_bert:
                x_tokens = self.fit_samples_with_sentences(
                    X, remove_stop_words=remove_stop_words)  # self.fit_document(X)
            else:
                x_tokens = self.fit_bert_sentences(
                    X, remove_stop_words=remove_stop_words)
        else:
            x_tokens = self.fit_text(X)
        self.vocabulary.filter_extremes(no_below=self.min_occur, no_above=1.0,
                                        keep_tokens=self.reserved)
        unknown_words = []
        if self.glove:
            # spell = Spellchecker()
            print("Vocabulary Size:", self.get_vocabulary_size())
            self.embedding_matrix = np.zeros((self.get_vocabulary_size(),
                                              self.embedding_size))
            for word, i in tqdm(self.vocabulary.token2id.items()):
                if word == "<PAD>":
                    embedding_value = np.zeros((1, self.embedding_size))
                elif word == "<UNK>":
                    sd = 1 / np.sqrt(self.embedding_size)
                    np.random.seed(seed=42)
                    embedding_value = np.random.normal(
                        0, scale=sd, size=[1, self.embedding_size])
                else:
                    embedding_value = self.embedding_word_vector.get(word)
                    if embedding_value is None:
                        embedding_value = self.embedding_word_vector.get(
                            self.correct_word(word))
                        if embedding_value is None:
                            unknown_words.append(word)
                            embedding_value = self.embedding_word_vector.get("<UNK>")
                if embedding_value is not None:
                    self.embedding_matrix[i] = embedding_value
            print("Number of unknown words:", len(unknown_words))
            unknown_words_df = pd.DataFrame()
            unknown_words_df["Unknown Words"] = unknown_words
        encoded_tokens = self.transform(x_tokens, list_of_lists=list_of_lists)
        return encoded_tokens

    def fit_text(self, X, remove_stop_words=True):
        output_tokens = []
        for sample in tqdm(X):
            tokens = self.tokenizer(sample.lower())
            if remove_stop_words:
                tokens = [token for token in tokens if not token.is_stop]
            word_str_tokens = list(map(convert_to_string, tokens))
            output_tokens.append(word_str_tokens)
        self.vocabulary.add_documents(output_tokens)
        return output_tokens

    def correct_word(self, word):
        return word

    def transform(self, X, list_of_lists=False):
        if list_of_lists:
            if not self.use_bert:
                return self.transform_list_of_list(X)
            else:
                return self.transform_bert(X)
        else:
            return self.transform_text(X)

    def transform_list_of_list(self, samples):
        samples_tokens = []
        for sample in samples:
            encoded_tokens = self.transform_text(sample)
            samples_tokens.append(encoded_tokens)
        return samples_tokens

    def transform_document(self, documents):
        document_tokens = []
        for document in documents:
            section_tokens = []
            encoded_tokens = []
            for section in document:
                if type(section) == str:
                    encoded_tokens.append(section)
                    if len(encoded_tokens) == len(document):
                        section_tokens.append(encoded_tokens)
                        section_tokens = self.transform_text(section_tokens)
                else:
                    encoded_tokens = self.transform_text(section)
                    section_tokens.append(encoded_tokens)
            document_tokens.append(section_tokens)
        return document_tokens

    def transform_bert(self, samples):
        samples_tokens = []
        for sample in samples:
            encoded_sentences = []
            for sentence_tokens in sample:
                encoded_tokens = self.tokenizer.convert_tokens_to_ids(sentence_tokens)
                encoded_sentences.append(encoded_tokens)
            samples_tokens.append(encoded_sentences)
        return samples_tokens

    def transform_text(self, X):
        if hasattr(self, "limit"):
            return [[i if i < self.limit else self.reserved.index("<UNK>")
                     for i in self.vocabulary.doc2idx(
                         x, unknown_word_index=self.reserved.index("<UNK>"))]
                    for x in X]
        else:
            return [self.vocabulary.doc2idx(
                        x, unknown_word_index=self.reserved.index("<UNK>"))
                    for x in X]

    def inverse_transform(self, X):
        return [[self.vocabulary[i] for i in x] for x in X]

    def save(self, file_path="./vectorizer.vec"):
        with open(file_path, "wb") as handle:
            pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return file_path

    @classmethod
    def load(cls, file_path):
        with open(file_path, "rb") as handle:
            self = pickle.load(handle)
        return self
test_texts = [
    text_to_word_sequence(data['text'])
    for data in tqdm(imdb_dataset(test=True))
]
test_labels = [
    sentiment[data['sentiment']] for data in imdb_dataset(test=True)
]
# test = imdb_dataset(test=True)

all_texts = np.concatenate((train_texts, test_texts)).tolist()
vocabulary = Dictionary(documents=all_texts)
vocabulary.save('imdb_vocabulary')

train_x = np.asarray([
    np.asarray(vocabulary.doc2idx(doc), dtype=np.int32) + 1
    for doc in tqdm(train_texts)
])
train_y = np.asarray(train_labels, dtype=np.int32)
test_x = np.asarray([
    np.asarray(vocabulary.doc2idx(doc), dtype=np.int32) + 1
    for doc in tqdm(test_texts)
])
test_y = np.asarray(test_labels, dtype=np.int32)

np.save('train_x.npy', train_x)
np.save('train_y.npy', train_y)
np.save('test_x.npy', test_x)
np.save('test_y.npy', test_y)
class TextData(Iterator):
    def __init__(self, filename=None, vocab_size=None, max_len=None,
                 chunk_size=10**5, delimiter=None, size_mb=4024,
                 pad_symbol='<pad>', start_symbol='<s>', end_symbol='</s>',
                 unknown_symbol='<unk>', default_pad_start=False,
                 default_pad_end=True, filter_on=None, prune_at=10**10,
                 encoding='utf8', **kwargs):
        """
        This is the object to store text and read it into vocabulary indices.
        The object is an iterable that yields the vocabulary indices of the
        tokens in the sentences.

        :param filename: Textfile that contains source sentences.
        :type filename: str
        :param vocab_size: Max no. of words to keep in the source vocab.
        :type vocab_size: int
        :param chunk_size: Use to limit no. of sentences to load at a time
            when populating the vocabulary.
        :type chunk_size: int
        :param delimiter: Delimiter to split on when "tokenizing".
        :type delimiter: str
        :param size_mb: Memory footprint of the bounter object used to count
            the vocab.
        :type size_mb: int
        :param start_symbol: Start symbol used for padding.
        :type start_symbol: str
        :param end_symbol: End symbol used for padding.
        :type end_symbol: str
        :param unknown_symbol: Unknown symbol for OOV words.
        :type unknown_symbol: str
        :param default_pad_start: By default, pad <s> to the sentence when
            vectorizing.
        :type default_pad_start: bool
        :param default_pad_end: By default, pad </s> to the sentence when
            vectorizing.
        :type default_pad_end: bool
        :param filter_on: Option to filter on term-freq ('tf') or
            doc-freq ('df').
        :type filter_on: str
        :param prune_at: *prune_at* parameter used by gensim.Dictionary.
        :type prune_at: int
        """
        if 'loadfrom' not in kwargs:  # Creating.
            self.filename = absolute_path(filename)
            # Check that inputs are not None.
            assert Path(self.filename).exists(
            ), "File {filename} does not exist".format(filename=filename)
            # Initialize encoding.
            self.encoding = encoding
            # Initialize the pad, start, end and unknown symbols.
            self.PAD, self.PAD_IDX = pad_symbol, 0
            self.START, self.START_IDX = start_symbol, 1
            self.END, self.END_IDX = end_symbol, 2
            self.UNK, self.UNK_IDX = unknown_symbol, 3

            self.default_pad_start = default_pad_start
            self.default_pad_end = default_pad_end

            # Save the user-specific delimiter
            self.delimiter = delimiter

            # Gensim related attribute to keep the pruning cap.
            self.prune_at = prune_at

            # Populate the source vocabulary.
            print('Creating Vocabulary...', end='\n', file=sys.stderr)
            self.vocab = Dictionary(
                [[pad_symbol], [start_symbol], [end_symbol], [unknown_symbol]],
                prune_at=self.prune_at)
            self.counter = bounter(size_mb=size_mb)
            print('Building source vocab and counter...', end=' ',
                  file=sys.stderr)
            self.populate_dictionary(self.filename, self.vocab, self.counter,
                                     chunk_size)

            # Use the user-specified source/target vocab size if set,
            # else use the full vocab_size.
            self.vocab_size = min(len(
                self.vocab), vocab_size) if vocab_size else len(self.vocab)

            # Keep the vocabulary to a max set by user.
            if filter_on and self.vocab_size < len(self.vocab):
                print('Filtering least frequent words in vocab.', end='\n',
                      file=sys.stderr)
                if filter_on == 'tf':
                    self.filter_n_least_frequent(
                        self.vocab, self.counter, self.vocab_size,
                        keep_tokens=['<pad>', '<s>', '</s>', '<unk>'])
                elif filter_on == 'df':
                    self.vocab.filter_extremes(
                        no_below=1, no_above=self.prune_at,
                        keep_n=self.vocab_size,
                        keep_tokens=['<pad>', '<s>', '</s>', '<unk>'])
            self.iterable = self._iterate()
        else:  # Loading.
            self.load(kwargs['loadfrom'], filename,
                      kwargs.get('load_counter', False))
            self.iterable = self._iterate()

    @timing
    def load(self, loadfrom=None, filename=None, load_counter=False):
        """
        The load function.

        :param filename: Path to the filename of the corpus to read; this
            will overwrite the filename in TextData.json.
        :type filename: str
        :param loadfrom: The path to the directory to load the TextData from.
        :type loadfrom: str
        :param load_counter: Whether to load the bounter object.
        :type load_counter: bool
        """
        assert loadfrom is not None
        config_file = loadfrom + '/TextData.json'
        if not Path(config_file).exists():
            raise DataError('{} config file not found!!'.format(config_file))
        else:
            print('Loading TextData from {}'.format(config_file), end=' ',
                  file=sys.stderr)
            with open(config_file) as fin:
                self.__dict__ = json.load(fin)
            # If the data is saved with TextData.save(copy_data=True),
            # it will appear in self.__dict__ and
            # we set the filename from relative to absolute path
            # if data is copied when saved, i.e. `filename` in self.__dict__
            if 'filename' in self.__dict__:
                self.filename = os.path.join(loadfrom, self.filename)
            # If user specified filename when loading the TextData, e.g.
            # TextData(filename='path/to/textfile', loadfrom='...'),
            # then we overwrite the filename.
            elif filename:
                self.filename = filename
            else:
                raise DataError(
                    "You need to set the filename when loading TextData, e.g.\n"
                    "\tTextData(loadfrom='path/to/textdata', filename='inputfile.txt')"
                )
            # Check if the filename exists.
            if not os.path.isfile(self.filename):
                raise DataError(
                    "The text file at {} doesn't exist!!".format(self.filename))
            try:
                with open(os.path.join(loadfrom, self.vocab), 'rb') as fin:
                    self.vocab = pickle.load(fin)
            except:
                raise DataError("{}/vocab.pkl isn't found".format(loadfrom))
            if load_counter:
                if 'counter' not in self.__dict__:
                    raise DataError('TextData counter not found!!')
                with open(os.path.join(loadfrom, self.counter), 'rb') as fin:
                    self.counter = pickle.load(fin)

    @timing
    def save(self, saveto, save_counter=False, copy_data=False):
        """
        The save function.

        :param saveto: The path to the directory to save the TextData to.
        :type saveto: str
        :param save_counter: Whether to save the bounter object.
        :type save_counter: bool
        :param copy_data: Make a local copy of the data.
        :type copy_data: bool
        """
        print("Saving TextData to {saveto}".format(saveto=saveto), end=' ',
              file=sys.stderr)
        # Create the directory if it doesn't exist.
        if not Path(saveto).exists():
            os.makedirs(saveto)
        # Save the vocab files.
        with open(saveto + '/vocab.pkl', 'wb') as fout:
            pickle.dump(self.vocab, fout)
        with open(saveto + '/vocab.tsv', 'w') as fout:
            for idx, word in self.vocab.items():
                print('\t'.join([str(idx), word]), end='\n', file=fout)
        # Initialize the config file.
        config_json = {
            'delimiter': self.delimiter,
            'encoding': self.encoding,
            'PAD': self.PAD,
            'PAD_IDX': self.PAD_IDX,
            'START': self.START,
            'START_IDX': self.START_IDX,
            'END': self.END,
            'END_IDX': self.END_IDX,
            'UNK': self.UNK,
            'UNK_IDX': self.UNK_IDX,
            'vocab_size': self.vocab_size,
            'vocab': 'vocab.pkl',
            'default_pad_start': self.default_pad_start,
            'default_pad_end': self.default_pad_end
        }
        # Check whether we should save the counter.
        if save_counter:
            with open(saveto + '/counter.pkl', 'wb') as fout:
                pickle.dump(self.counter, fout)
            with open(saveto + '/counter.tsv', 'w') as fout:
                for word, count in self.counter.items():
                    print('\t'.join([str(word), str(count)]), end='\n',
                          file=fout)
        config_json['counter'] = 'counter.pkl' if save_counter else None
        if copy_data:
            _, _filename = os.path.split(self.filename)  # Filename without path.
            new_filename = os.path.join(saveto, _filename)
            print('\n\tCopying {} \n\tto {}'.format(self.filename, new_filename),
                  end='\n', file=sys.stderr)
            copyfile(absolute_path(self.filename), new_filename)
            config_json['filename'] = _filename
        # Dump the config file.
        with open(saveto + '/TextData.json', 'w') as fout:
            json.dump(config_json, fout, indent=2)

    def split_tokens(self, s):
        """
        A "tokenizer" that splits on space. If the delimiter is set to an
        empty string, it will read characters as tokens.

        :param s: The input string.
        :type s: str
        """
        if self.delimiter == '':  # Character models.
            return list(s.strip())
        else:  # Word models.
            return s.strip().split(self.delimiter)

    @timing
    def populate_dictionary(self, filename, vocab, counter, chunk_size):
        with open(filename, encoding=self.encoding) as fin:
            for chunk in tqdm(per_chunk(fin, chunk_size)):
                if all(c == None for c in chunk):
                    break
                chunk_list_of_tokens = [
                    self.split_tokens(s) for s in chunk if s
                ]
                vocab.add_documents(chunk_list_of_tokens, self.prune_at)
                counter.update(chain(*chunk_list_of_tokens))

    def filter_n_least_frequent(self, vocab, counter, n,
                                keep_tokens=['<pad>', '<s>', '</s>', '<unk>']):
        """
        Remove the least frequent items from the vocabulary.

        :param vocab: self.src_vocab or self.trg_vocab
        :type vocab: gensim.Dictionary
        :param counter: self.src_counter or self.trg_counter
        :type counter: bounter
        :param n: The upper limit of how many items to keep in the vocabulary
        :type n: int
        """
        # If n is bigger than user specified size, don't filter anything.
        if n < len(vocab.token2id):
            good_ids = [
                vocab.token2id[token]
                for token, _ in sorted(counter.items(), key=itemgetter(1))[-n:]
                if token in vocab.token2id
            ]
            good_ids += [self.vocab.token2id[_keep] for _keep in keep_tokens]
            print(good_ids)
            vocab.filter_tokens(good_ids=good_ids)

    def vectorize(self, sent, pad_start=True, pad_end=True):
        """
        Vectorize the sentence, i.e. convert it into a list of indices based
        on the vocabulary. This is used by `variable_from_sent()`.

        :param sent: The input sentence to convert to vocabulary indices.
        :type sent: list(str)
        :param pad_start: Pad the start with the START_IDX [default: True].
        :type pad_start: bool
        :param pad_end: Pad the end with the END_IDX [default: True].
        :type pad_end: bool
        """
        sent = self.split_tokens(sent) if type(sent) == str else sent
        vsent = self.vocab.doc2idx(sent, unknown_word_index=self.UNK_IDX)
        if pad_start:
            vsent = [self.START_IDX] + vsent
        if pad_end:
            vsent = vsent + [self.END_IDX]
        return vsent

    def unvectorize(self, vector, unpad_start=True, unpad_end=True):
        """
        Convert the vector to the natural text sentence.
        """
        # Strip the start/end padding indices if requested.
        start = 1 if unpad_start else 0
        end = -1 if unpad_end else None
        return ' '.join(
            [self.vocab[idx] for idx in map(int, chain(*vector))][start:end])

    def reset(self):
        """
        Resets the iterator to the 0th item.
        """
        self.iterable = self._iterate()

    def lines(self):
        """
        The function to iterate through the source and target file.
        """
        with open(self.filename) as fin:
            for line in fin:
                yield line.strip()

    def _iterate(self):
        """
        The helper function to iterate through the source and target file
        and convert the lines into vocabulary indices.
        """
        for line in self.lines():
            sent = self.vectorize(line, self.default_pad_start,
                                  self.default_pad_end)
            yield sent

    def __next__(self):
        return next(self.iterable)

    def shuffle(self):
        return iter(sorted(self, key=lambda k: random.random()))
class Corpora(Loader):
    """ """
    is_built = False

    def __init__(self, data_path: str, prefix: str = None,
                 iterator: str = 'token', parsing: str = 'simple',
                 word_up_limit: float = 0.75, word_low_limit: int = 20,
                 dictionary: str = None, shuffle: bool = False,
                 seed: int = 42, document_minimum_length: int = 5,
                 stopwords: str = None):
        iter_map = dict(token=self.tokenize, bow=self.bowize,
                        sentences=self.sentences)
        self.iterator = iter_map[iterator]
        self.word_low_limit = word_low_limit
        self.word_up_limit = word_up_limit
        if stopwords:
            self.stopwords = [w.strip() for w in open(stopwords).readlines()]
        else:
            self.stopwords = []
        if not dictionary:
            self.dictionary = Dictionary()
        else:
            self.dictionary = Dictionary.load_from_text(dictionary)
            if self.stopwords:
                self.dictionary.filter_tokens(
                    bad_ids=self.dictionary.doc2idx(self.stopwords))
            self.is_built = True
        self.shuffle = shuffle
        if self.shuffle:
            np.random.seed(seed)
        self.document_minimum_length = document_minimum_length
        corpus = self.init_corpus(data_path, prefix, parsing)
        super(Corpora, self).__init__(corpus=corpus)

    def __enter__(self):
        if not self.is_built:
            self.build()
        return super(Corpora, self).__enter__()

    def __exit__(self, *args):
        self.clear()
        return super(Corpora, self).__exit__(*args)

    def __iter__(self):
        for v in self.iterator():
            yield v

    def __getitem__(self, key):
        return self.iterator(index=key)

    def init_corpus(self, path: str, prefix: str, parsing: str):
        """ """
        directory = [os.path.join(path, f) for f in os.listdir(path)]
        folders = list(filter(lambda p: os.path.isdir(p), directory))
        if prefix:
            folders = list(filter(lambda p: prefix in p, folders))
        corpus = [Corpus(path=p, parsing=parsing).load() for p in folders]
        self.__paths = {c.path: c for c in corpus}
        return corpus

    def load_vectors(self, path: str):
        """ """
        if not path.endswith('.csv'):
            raise AssertionError(
                'Asserted the vectors to be provided with csv.')
        # TODO: Use dask in case of too large word vector maps.
        return pd.read_csv(path)

    def build(self):
        """ """
        if self.is_built:
            logging.warn('Attempted to build already built Corpora.')
            return
        for c in self.corpus:
            self.dictionary.add_documents(c.tokens)
            c.clear()
        self.dictionary.filter_extremes(no_below=self.word_low_limit,
                                        no_above=self.word_up_limit)
        return self

    def clear(self):
        """ """
        self.dictionary = Dictionary()

    def bowize(self, index=None):
        """ """
        N = len(self)
        iterable = self._iterator(index)
        for idx in self._indices(iterable):
            corpus = iterable[idx]
            tokens = corpus.tokens
            for ind in self._indices(tokens):
                doc_tokens = tokens[ind]
                bow = self.dictionary.doc2bow(doc_tokens)
                if len(bow) > self.document_minimum_length:
                    yield bow, N
                else:
                    logging.warn(
                        f'Received empty file at {corpus.documents[ind]}, skipping.'
                    )
                    corpus.mark_empty(ind)
            corpus.clear()

    def tokenize(self, index=None):
        """ """
        N = len(self)
        iterable = self._iterator(index)
        for idx in self._indices(iterable):
            corpus = iterable[idx]
            tokens = corpus.tokens
            self._move()
            for ind in self._indices(tokens):
                doc_tokens = tokens[ind]
                if len(doc_tokens) > self.document_minimum_length:
                    yield doc_tokens, N
                else:
                    logging.warn(
                        f'Received empty file at {corpus.documents[ind]}, skipping.'
                    )
                    corpus.mark_empty(ind)
            corpus.clear()

    def sentences(self, index=None):
        """ """
        iterable = self._iterator(index=index)
        for ind in self._indices(iterable=iterable):
            corpus = iterable[ind]
            for sentence in corpus.sentences:
                if len(sentence) > self.document_minimum_length:
                    yield sentence
                else:
                    logging.warn(
                        f'Received empty file at {corpus.documents[ind]}, skipping.'
                    )

    def documents(self, index=None):
        """ """
        for c in self.corpus:
            if len(c) > 1:
                yield c.documents
            else:
                for doc in c.documents:
                    yield doc

    @property
    def years(self):
        """ """
        return sorted([int(c.year) for c in self.corpus])

    def _iterator(self, index=None):
        """ """
        iterator = self.corpus
        if index:
            if isinstance(index, int):
                iterator = [self.corpus[index]]
                # TODO: Handle indices as slice
            elif isinstance(index, str):
                iterator = [self.__paths[index]]
        return iterator

    def _indices(self, iterable):
        """ """
        if self.shuffle:
            indices = np.random.permutation(len(iterable))
        else:
            indices = range(len(iterable))
        return indices