def run(self):
    folder = join(config.paths["rawdata"], "atd")

    # List txt files
    try:
        files = listdir(folder)
    except FileNotFoundError:
        raise ImporterError(self.info, 'Directory "{}" does not exist'.format(folder))

    # Keep only .txt files
    files = [file for file in files if file.split(".")[-1] == "txt"]

    # Check if files exist
    if len(files) == 0:
        raise ImporterError(self.info, 'There are no valid files in the folder.')

    # Add files one by one
    with data.document_writer(self.info) as document_writer:
        docinfo = DocumentInfo(document_writer)
        for filename in ProgressIterator(files, doc_progress_label):
            with open(join(folder, filename), "r", encoding="utf8") as file:
                text = file.read()
            docinfo.add_document(text)

    # Print Meta Information
    docinfo.save_meta(self.info)
def tokenize(info):
    if not check_requirements(info):
        nbprint('Skipping Tokenizer (requirements not satisfied)')
        return

    if config.skip_existing and data.tokenized_document_exists(info):
        nbprint('Skipping Tokenizer (file exists)')
        return

    try:
        current_tokenizer = get_tokenizer(info)
        with data.document_reader(info) as documents:
            with data.tokenized_document_writer(info) as tokenized_documents:
                for document in ProgressIterator(documents, "Documents"):
                    tokens = current_tokenizer.tokenize(document['text'])
                    token_str = join_tokens(tokens)
                    tokenized_document = {
                        'id': document['id'],
                        'tokens': token_str,
                        'class_id': document['class_id'],
                    }
                    tokenized_documents.write(tokenized_document)
    except EmbeddingError as err:
        nbprint(err)
        data.clear_file(data.tokenized_document_filename(info))
        return

    nbprint('Tokenizer: success')
def make_term_doc_mat_count(info, runvars):
    counts, i, j, mat_ids = [], [], [], []
    idx, excluded = 0, 0
    vocab = data.load_vocab_dict(info)

    with data.tokenized_document_reader(info) as documents:
        for document in ProgressIterator(documents, 'Documents'):
            tokens = split_tokens(document['tokens'])
            tokencnt = Counter(tokens).most_common()
            num_tokens = 0
            for token, count in tokencnt:
                if token in vocab:
                    counts.append(count)
                    i.append(vocab[token]['id'])
                    j.append(idx)
                    num_tokens += count
            if num_tokens > 0:
                idx += 1
                mat_ids.append(document['id'])
            else:
                excluded += 1

    nbprint("Documents {}, Excluded {} empty documents".format(idx, excluded))
    term_doc_mat_shape = (len(vocab), idx)
    runvars['term_doc_mat_count'] = sparse.coo_matrix(
        (counts, (i, j)), shape=term_doc_mat_shape).tocsc()
    runvars['mat_ids'] = mat_ids
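# Hedged, self-contained sketch (not part of the pipeline): it illustrates how the
# COO triplets collected above become a vocab-by-documents count matrix via
# scipy.sparse.coo_matrix(...).tocsc(). The toy vocabulary and documents are
# invented for illustration only.
from collections import Counter
from scipy import sparse

toy_vocab = {'cat': 0, 'dog': 1, 'fish': 2}                    # token -> row index
toy_docs = [['cat', 'dog', 'cat'], ['fish'], ['dog', 'dog']]   # one list per document

counts, rows, cols = [], [], []
for doc_idx, tokens in enumerate(toy_docs):
    for token, count in Counter(tokens).items():
        if token in toy_vocab:
            counts.append(count)
            rows.append(toy_vocab[token])
            cols.append(doc_idx)

term_doc_mat = sparse.coo_matrix(
    (counts, (rows, cols)), shape=(len(toy_vocab), len(toy_docs))).tocsc()
print(term_doc_mat.toarray())
# [[2 0 0]
#  [1 0 2]
#  [0 1 0]]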
def make_cbow_mat_tf_idf(info, runvars):
    # Create tf-idf matrix
    make_term_doc_mat_tf_idf(info, runvars)
    tf_idf_mat = runvars['term_doc_mat_tf_idf']

    # Load vocabulary and wordembedding vectors
    vocab_list = data.load_vocab_list(info)
    model = get_model(info)
    embedding_function = model.embedding_function

    # Create a zero matrix
    cbow_tf_idf_shape = (model.vector_size, tf_idf_mat.shape[1])
    cbow_tf_idf = np.zeros(cbow_tf_idf_shape)

    # Iterate over all nonzero entries of the tf-idf matrix:
    nonzeros = zip(*sparse.find(tf_idf_mat))
    for token_idx, doc_idx, value in ProgressIterator(nonzeros, length=tf_idf_mat.nnz, print_every=5000):
        # Add each entry times the corresponding vector to the matrix
        try:
            cbow_tf_idf[:, doc_idx] += value * embedding_function(vocab_list[token_idx])
        except OOVException:
            pass

    # Return the matrix
    runvars['cbow_mat'] = cbow_tf_idf
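# Hedged toy sketch (illustration only): a document's CBOW vector is the
# tf-idf-weighted sum of the embeddings of its tokens. The 2-d embeddings and
# weights below are invented; the real code uses model.embedding_function and
# the tf-idf matrix produced by make_term_doc_mat_tf_idf.
import numpy as np
from scipy import sparse

toy_embeddings = {'cat': np.array([1.0, 0.0]),
                  'dog': np.array([0.0, 1.0])}
vocab_list = ['cat', 'dog']

# tf-idf matrix: rows = tokens, columns = documents
tf_idf_mat = sparse.csc_matrix(np.array([[0.5, 0.0],
                                         [0.2, 0.9]]))

cbow = np.zeros((2, tf_idf_mat.shape[1]))
for token_idx, doc_idx, value in zip(*sparse.find(tf_idf_mat)):
    cbow[:, doc_idx] += value * toy_embeddings[vocab_list[token_idx]]

print(cbow)   # column 0 = 0.5*cat + 0.2*dog, column 1 = 0.9*dog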
def clustering_metrics():
    metric_fcts = load_metric_fcts('clustering')
    clustering_data = data.load_metric_data('clustering')

    # First evaluate H matrices, taking the column-wise maximum as cluster index
    nbprint('H Matrix').push()
    h_mat_infos = data.get_all_h_mat_infos(labeled_only=True)
    for info in ProgressIterator(h_mat_infos, print_every=1):
        # Grab the corresponding entry from the clustering data
        metric_data_entry = grab_metric_data_entry(clustering_data, info)
        # Iterate all metric functions and store the result in the entry
        for metric_id, metric_fct in metric_fcts.items():
            # Skip metric if it already exists:
            if metric_id in metric_data_entry:
                continue
            # Compute the metric
            labels_true = load_ground_truth_classes(info)
            labels_pred = load_class_array_from_h_mat(info)
            metric_data_entry[metric_id] = metric_fct(labels_true, labels_pred)
        # Save intermediate results
        data.save_metric_data(clustering_data, 'clustering')

    # Then take cluster indices directly from c
    nbprint.pop()('C Vector').push()
    c_vec_infos = data.get_all_c_vec_infos(labeled_only=True)
    for info in ProgressIterator(c_vec_infos, print_every=1):
        # Grab the corresponding entry from the clustering data
        metric_data_entry = grab_metric_data_entry(clustering_data, info)
        # Iterate all metric functions and store the result in the entry
        for metric_id, metric_fct in metric_fcts.items():
            # Skip metric if it already exists:
            if metric_id in metric_data_entry:
                continue
            # Compute the metric
            labels_true = load_ground_truth_classes(info)
            labels_pred = data.load_c_vec(info)
            metric_data_entry[metric_id] = metric_fct(labels_true, labels_pred)
        # Save intermediate results
        data.save_metric_data(clustering_data, 'clustering')
    nbprint.pop()
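# Hedged example (assumptions: the metric functions returned by
# load_metric_fcts('clustering') take (labels_true, labels_pred) pairs like the
# sklearn clustering scores below, and "column-wise maximum as cluster index"
# means an argmax over topics per document; the actual metrics are not shown here).
import numpy as np
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

h_mat = np.array([[0.9, 0.1, 0.2],
                  [0.1, 0.8, 0.7]])          # topics x documents
labels_pred = h_mat.argmax(axis=0)           # [0, 1, 1]
labels_true = [0, 1, 1]

print(adjusted_rand_score(labels_true, labels_pred))           # 1.0
print(normalized_mutual_info_score(labels_true, labels_pred))  # 1.0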
def load_documents(self):
    text_class_pairs = zip(self.rawdata.data, self.rawdata.target)
    for text, class_idx in ProgressIterator(text_class_pairs, doc_progress_label,
                                            length=len(self.rawdata.data)):
        classname = self.rawdata.target_names[class_idx]
        class_id = self.classinfo.increase_class_count(classname)
        self.docinfo.add_document(text, class_id)
def import_archive(self):
    # Iterate all files in archive
    with zipfile.ZipFile(self.archivepath) as archive:
        filenames = [info.filename for info in archive.infolist()]
        for filename in ProgressIterator(filenames):
            if filename.endswith('.txt'):
                with archive.open(filename, 'r') as txtfile:
                    text = txtfile.read().decode('utf-8')
                self.docinfo.add_document(text)
def parse_file(self, jsonfile):
    for line in ProgressIterator(jsonfile, 'Parsing tweets'):
        tweet = json.loads(line)
        if 'extended_tweet' in tweet:
            text = tweet['extended_tweet']['full_text']
        elif 'text' in tweet:
            text = tweet['text']
        else:
            continue
        self.docinfo.add_document(text)
def load_data(self, file):
    min_length = self.info['data_info']['min_length']
    cr = csv.reader(file)
    next(cr)  # skip the header row
    for row in ProgressIterator(cr):
        classname = row[2]
        text = row[5]
        if len(text) >= min_length and classname in self.valid_classes:
            class_id = self.classinfo.increase_class_count(classname)
            self.docinfo.add_document(text, class_id)
def build_lookup(self, vocab):
    self.lookup = {}
    for token in ProgressIterator(vocab, print_every=10000):
        words = token.split('_')
        if '' in words or len(words) == 0:
            continue
        current_lookup = self.lookup
        for word in words:
            lower_word = word.lower()
            if lower_word not in current_lookup:
                current_lookup[lower_word] = {}
            current_lookup = current_lookup[lower_word]
        if exists_token not in current_lookup:
            current_lookup[exists_token] = token
def _run(self, info):
    # Maximum number of iterations
    self.max_iter = info['model_info'].get('max_iter', 200)
    # If the mean of the differences between two iterates of H falls below this threshold, the algorithm stops
    self.threshold = info['model_info'].get('eps', 1e-4)

    # Check if kmeans exists
    kmeans_info = info.copy()
    kmeans_info['model_name'] = 'kmeans'
    if data.c_vec_exists(kmeans_info):
        nbprint('Loading k-means for initial H')
        c = data.load_c_vec(kmeans_info)
    else:
        nbprint('Running k-means for initial H')
        model = KMeansSklearn(n_clusters=info["num_topics"],
                              init='k-means++',
                              random_state=42,
                              verbose=0)
        c = model.fit_predict(self.input_mat.transpose())

    # Construct H from c
    self.H = np.full((info["num_topics"], self.input_mat.shape[1]), 0.2)
    for doc, topic in enumerate(c):
        self.H[topic, doc] += 1

    # Iterate updates
    nbprint('Running updates')
    for iteration in ProgressIterator(range(self.max_iter), print_every=1):
        # Update W
        HHT = self.H @ self.H.T
        try:
            HHTinv = np.linalg.inv(HHT)
        except LinAlgError:
            HHTinv = np.linalg.pinv(HHT)
        W = self.input_mat @ self.H.T @ HHTinv

        # Update H
        XTW = self.input_mat.T @ W
        WTW = W.T @ W
        frac = ((self.plus(XTW) + self.H.T @ self.minus(WTW)) /
                (self.minus(XTW) + self.H.T @ self.plus(WTW)))
        Hpre = self.H.copy()
        self.H = (self.H.T * np.sqrt(frac)).T

        # Check for convergence
        mean_h_change = np.mean(np.abs(self.H - Hpre))
        if mean_h_change < self.threshold:
            nbprint('Converged after {} iterations. (Threshold = {})'.format(iteration + 1, self.threshold))
            return

    nbprint('Did not converge after {} iterations with last change {} for threshold {}'.format(
        self.max_iter, mean_h_change, self.threshold))
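# Hedged, self-contained sketch of one update step of the kind used above
# (semi-NMF-style multiplicative updates with a positive/negative matrix split).
# The plus/minus helpers stand in for self.plus / self.minus, and the toy data
# is invented; this is an illustration, not the pipeline code.
import numpy as np

def plus(A):
    return (np.abs(A) + A) / 2    # positive part of A

def minus(A):
    return (np.abs(A) - A) / 2    # negative part of A

rng = np.random.default_rng(0)
X = rng.normal(size=(5, 8))            # data: features x documents (may be negative)
H = np.abs(rng.normal(size=(3, 8)))    # nonnegative topic-document matrix

# W update: unconstrained least-squares solution given H
W = X @ H.T @ np.linalg.pinv(H @ H.T)

# H update: multiplicative rule that keeps H nonnegative
XTW = X.T @ W
WTW = W.T @ W
frac = (plus(XTW) + H.T @ minus(WTW)) / (minus(XTW) + H.T @ plus(WTW))
H = (H.T * np.sqrt(frac)).T
assert (H >= 0).all()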
def make_phrase_mat(info, runvars):
    model = get_model(info)
    embedding_function = model.embedding_function
    batch = []
    batchsize = 0
    min_batchsize = 4096
    current_idx = 0

    # Count documents
    num_documents = 0
    with data.document_reader(info) as documents:
        for document in ProgressIterator(documents, 'Counting Documents'):
            num_documents += 1

    # Create a zero matrix
    phrase_mat_shape = (model.vector_size, num_documents)
    phrase_mat = np.zeros(phrase_mat_shape)

    # Embed the documents in batches
    with data.document_reader(info) as documents:
        progress_iterator = ProgressIterator(documents, 'Vectorizing Documents')
        for document in progress_iterator:
            batch.append(document['text'])
            batchsize += 1
            if batchsize >= min_batchsize:
                phrase_mat[:, current_idx:current_idx + batchsize] = embedding_function(batch)
                current_idx += batchsize
                batchsize = 0
                batch = []
        # Embed the remaining documents
        if batchsize > 0:
            phrase_mat[:, current_idx:current_idx + batchsize] = embedding_function(batch)

    runvars['phrase_mat'] = phrase_mat
def _wenmf(self):
    self.errors = []
    self.Ht = normalize(self.H, axis=0).T
    for iteration in ProgressIterator(range(self.max_iter), print_every=1):
        HHT = np.dot(self.Ht.T, self.Ht)

        # Update W column by column
        W_old = np.copy(self.W)
        for w in range(self.W_update_num):
            for r in range(self.num_topics):
                hr = self.Ht[:, r]
                idx = [i for i in range(self.num_topics) if i != r]
                wr = 1 / HHT[r, r] * (self.input_mat @ hr - self.W[:, idx] @ HHT[idx, r])
                wr = self._iter_w_update(r, wr)
                wr = np.maximum(wr, self.eps).T
                self.W[:, r] = wr
        mean_w_change = np.mean(np.abs((self.W - W_old) / W_old))

        # Update H column by column
        VTVW = np.dot(self.v_mat.T, np.dot(self.v_mat, self.W))
        WTVTVW = np.dot(self.W.T, VTVW)
        Ht_old = np.copy(self.Ht)
        for h in range(self.H_update_num):
            for r in range(self.num_topics):
                VTVwr = VTVW[:, r]
                idx = [i for i in range(self.num_topics) if i != r]
                hr = 1 / WTVTVW[r, r] * (VTVwr.T @ self.input_mat).T - self.Ht[:, idx] @ WTVTVW[idx, r]
                hr = np.maximum(hr, self.eps)
                self.Ht[:, r] = hr
        mean_h_change = np.mean(np.abs((self.Ht - Ht_old) / Ht_old))

        # Optionally track the relative reconstruction error in the embedded space
        if self.log_error or self.print_error:
            VX = self.v_mat @ self.input_mat
            VW = self.v_mat @ self.W
            err = np.linalg.norm(VX - VW @ self.Ht.T) / np.linalg.norm(VX)
            if self.print_error:
                nbprint('Error: {}'.format(err))
            if self.log_error:
                self.errors.append(err)

        # Check for convergence
        nbprint('mean_w_change={}, mean_h_change={} ({})'.format(
            mean_w_change, mean_h_change, self.threshold))
        if (iteration + 1 >= self.min_iter and
                mean_w_change < self.threshold and
                mean_h_change < self.threshold):
            nbprint('Converged after {} iterations. (threshold = {})'.format(
                iteration + 1, self.threshold))
            break

    self.H = self.Ht.T
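# Hedged toy sketch (illustration only) of the error tracked above: the relative
# Frobenius error of the factorization X ~ W H^T, measured after projecting both
# sides through the embedding matrix V. All matrices below are random stand-ins.
import numpy as np

rng = np.random.default_rng(0)
V = rng.normal(size=(4, 10))             # embedding matrix: vector_size x vocab
X = np.abs(rng.normal(size=(10, 6)))     # term-document matrix
W = np.abs(rng.normal(size=(10, 3)))     # term-topic factors
Ht = np.abs(rng.normal(size=(6, 3)))     # document-topic factors (H^T)

VX = V @ X
err = np.linalg.norm(VX - V @ W @ Ht.T) / np.linalg.norm(VX)
print(err)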
def count_tokens(info, runvars):
    rawcounts = {}
    num_docs = 0
    with data.tokenized_document_reader(info) as documents:
        for document in ProgressIterator(documents, 'Counting Tokens'):
            num_docs += 1
            tokens = split_tokens(document['tokens'])
            # Total frequency: count every occurrence
            for token in tokens:
                try:
                    rawcounts[token].increase_total()
                except KeyError:
                    rawcounts[token] = VocabItem(token, total=1)
            # Document frequency: count each token at most once per document
            for token in set(tokens):
                rawcounts[token].increase_document()
    runvars['rawcounts'] = rawcounts
    runvars['num_docs'] = num_docs
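# Hedged toy sketch (illustration only) of the two frequencies collected above:
# total frequency counts every occurrence, document frequency counts each token
# at most once per document. The toy documents are invented.
from collections import Counter

docs = [['cat', 'cat', 'dog'], ['dog'], ['cat']]
total = Counter(t for doc in docs for t in doc)
document = Counter(t for doc in docs for t in set(doc))

print(total['cat'], document['cat'])   # 3 2
print(total['dog'], document['dog'])   # 2 2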
def build_lookup(self, vocab):
    self.lookup = {}
    for token in ProgressIterator(vocab, print_every=10000):
        words = token.replace("_", " ").split()
        if len(words) == 0:
            continue
        current_lookup = self.lookup
        for word in words:
            lower_word = word.lower()
            if lower_word not in current_lookup:
                current_lookup[lower_word] = {}
            current_lookup = current_lookup[lower_word]
        if exists_token in current_lookup:
            current_lookup[exists_token].append(token)
        else:
            current_lookup[exists_token] = [token, ]
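# Hedged toy sketch (illustration only) of the nested-dict trie built above:
# each word maps to a deeper dict, and a sentinel key records which vocabulary
# tokens end at that node. 'exists_token' is assumed to be such a sentinel;
# any key that cannot collide with a real word works.
exists_token = '<TOKEN>'

lookup = {}
for token in ['New_York', 'New_York_City', 'new']:
    node = lookup
    for word in token.replace('_', ' ').split():
        node = node.setdefault(word.lower(), {})
    node.setdefault(exists_token, []).append(token)

# Walking the trie word by word finds multi-word vocabulary entries
node = lookup['new']['york']
print(node[exists_token])            # ['New_York']
print(node['city'][exists_token])    # ['New_York_City']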
def load_classes(self, file):
    self.valid_classes = ClassInfo()
    min_length = self.info['data_info']['min_length']
    cr = csv.reader(file)
    next(cr)  # skip the header row

    # Count documents per class
    for row in ProgressIterator(cr):
        classname = row[2]
        text = row[5]
        if len(text) >= min_length:
            self.valid_classes.increase_class_count(classname)

    # Keep only classes with enough documents
    min_class_size = self.info['data_info']['min_class_size']
    self.valid_classes = [
        c['info'] for c in self.valid_classes.make_class_list()
        if c['count'] > min_class_size
    ]
def parse_files(self, jsonfile):
    nbprint("Loading documents")
    for line in ProgressIterator(jsonfile):
        tweet = json.loads(line)
        text = tweet["full_text"]
        tweet_id = int(tweet["id_str"])  # the numeric 'id' field is incorrect/rounded
        classname = self.id_to_classname[tweet_id]
        if (self.max_docs_per_cls is not None and
                self.classinfo.classes.get(classname, (0, 0))[1] >= self.max_docs_per_cls):
            continue
        class_id = self.classinfo.increase_class_count(classname)
        self.docinfo.add_document(text, class_id)
def _pre_algorithm(self):
    # Load the embeddings
    embedding_model = get_model(self.info)
    embeddings = embedding_model.get_embeddings()
    vector_size = embedding_model.vector_size()

    # Load the vocab
    vocab = data.load_vocab_list(self.info)

    # Construct V
    v_shape = (vector_size, len(vocab))
    self.v_mat = np.zeros(v_shape)
    for idx, token in enumerate(vocab):
        try:
            self.v_mat[:, idx] = embeddings[token]
        except:
            pass  # tokens without an embedding keep a zero column

    # Find elements in the nullspace of VTV
    if self.null is not None:
        nbprint('Finding {} elements in ker(VTV)'.format(self.num_kernel))
        self.kernelvectors = []
        for i in ProgressIterator(range(2 * self.num_kernel), print_every=1):
            op = LinearOperator(
                (len(vocab), len(vocab)),
                matvec=lambda x: self.v_mat.transpose() @ (self.v_mat @ x))
            try:
                w, v = eigs(op, k=1, which='SM', maxiter=100)
                w = np.real(w[0])
                v = np.real(v[:, 0])
                if w < 1e-10:
                    v = v / np.sqrt(np.sum(np.square(v)))
                    self.kernelvectors.append(v)
                    if len(self.kernelvectors) >= self.num_kernel:
                        break
            except ArpackNoConvergence:
                nbprint('eigs did not converge')
        self.v_sums = [np.sum(v) for v in self.kernelvectors]

    # Initialize W and H from NMF
    nbprint('Initial NMF')
    nmf_model = NMFSklearn(self.num_topics, init='nndsvd')
    self.W = np.maximum(nmf_model.fit_transform(self.input_mat), self.eps)
    self.H = np.maximum(nmf_model.components_, self.eps)
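# Hedged, self-contained sketch (illustration only) of the kernel search above:
# a matrix-free LinearOperator for x -> V^T (V x) is handed to ARPACK's eigs with
# which='SM', so eigenvectors with (near-)zero eigenvalue, i.e. elements of
# ker(V^T V), can be picked out. V below is a small random stand-in; as in the
# original, non-convergence is simply reported.
import numpy as np
from scipy.sparse.linalg import ArpackNoConvergence, LinearOperator, eigs

rng = np.random.default_rng(0)
V = rng.normal(size=(2, 6))        # wide matrix, so V^T V is rank deficient

op = LinearOperator((6, 6), matvec=lambda x: V.T @ (V @ x))
try:
    w, v = eigs(op, k=1, which='SM', maxiter=1000)
    w, v = np.real(w[0]), np.real(v[:, 0])
    if w < 1e-10:                   # eigenvalue ~ 0 => v lies in ker(V^T V)
        v /= np.linalg.norm(v)
        print(np.linalg.norm(V @ v))    # close to 0 (up to ARPACK tolerance)
except ArpackNoConvergence:
    print('eigs did not converge')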
def load_id_to_classname(self, folderpath, filename):
    nbprint("Extracting tsv")
    self.id_to_classname = {}
    max_depth = self.info['data_info']['maxdepth']
    tarfilename = join(folderpath, filename + ".tar.bz2")
    with tarfile.open(tarfilename, "r:bz2") as tar:
        tsvfile = tar.extractfile(filename + ".tsv")
        for line in ProgressIterator(tsvfile):
            fields = line.decode().split()
            tweet_id = int(fields[0])
            classname = fields[3]
            classname = classname.strip("*")
            classhierarchy = classname.split("/")
            classhierarchy = classhierarchy[1:max_depth + 1]
            classname = "/".join(classhierarchy)
            self.id_to_classname[tweet_id] = classname
def load_documents(self):
    for file in ProgressIterator(reuters.fileids(), doc_progress_label):
        categories = reuters.categories(file)
        if len(categories) > 1:
            continue
        classname = categories[0]
        if classname not in self.valid_classes:
            continue
        class_id = self.classinfo.increase_class_count(classname)

        # Re-join the tokenized Reuters text and undo the most common
        # tokenization artifacts around numbers and punctuation
        text = " ".join(reuters.words(file))
        text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
        text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)
        text = re.sub(r" \.", ".", text)
        text = re.sub(r" \,", ",", text)
        text = re.sub(r" \)", ")", text)
        text = re.sub(r"\( ", "(", text)
        text = re.sub(r" \' ", "'", text)
        self.docinfo.add_document(text, class_id)
def make_cbow_mat_minmaxmean(info, runvars):
    # Get count matrix
    count_mat = runvars['term_doc_mat_count']

    # Load vocabulary and wordembedding vectors
    vocab_list = data.load_vocab_list(info)
    model = get_model(info)
    embedding_function = model.embedding_function

    # Create the min/max/mean accumulators
    cbow_m_shape = (model.vector_size, count_mat.shape[1])
    cbow_min = np.full(cbow_m_shape, np.inf)
    cbow_max = np.full(cbow_m_shape, -np.inf)
    cbow_mean = np.zeros(cbow_m_shape)
    column_sum = np.zeros(count_mat.shape[1])

    # Iterate over all nonzero entries of the count matrix:
    nonzeros = zip(*sparse.find(count_mat))
    for token_idx, doc_idx, value in ProgressIterator(nonzeros, length=count_mat.nnz, print_every=5000):
        try:
            embedding_vector = embedding_function(vocab_list[token_idx])
        except OOVException:
            continue
        # Entry wise minimum with the embedding vector
        cbow_min[:, doc_idx] = np.minimum(cbow_min[:, doc_idx], embedding_vector)
        # Entry wise maximum with the embedding vector
        cbow_max[:, doc_idx] = np.maximum(cbow_max[:, doc_idx], embedding_vector)
        # Sum up all embedding vectors and the total number of tokens in the document
        cbow_mean[:, doc_idx] = cbow_mean[:, doc_idx] + value * embedding_vector
        column_sum[doc_idx] = column_sum[doc_idx] + value

    # Divide sum by number of tokens
    cbow_mean = cbow_mean * sparse.diags(1 / np.maximum(1, column_sum))

    # Stack all matrices and return
    cbow_mat = np.vstack((cbow_min, cbow_max, cbow_mean))
    cbow_mat[np.invert(np.isfinite(cbow_mat))] = 0
    runvars['cbow_mat'] = cbow_mat
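# Hedged toy sketch (illustration only): per-document min/max/mean pooling of
# word embeddings, stacked into one feature vector, as done column by column in
# the function above. The 2-d embeddings are invented.
import numpy as np

doc_vectors = np.array([[1.0, -2.0],      # one row per token occurrence
                        [3.0,  0.5],
                        [2.0,  1.0]])

pooled = np.concatenate([doc_vectors.min(axis=0),
                         doc_vectors.max(axis=0),
                         doc_vectors.mean(axis=0)])
print(pooled)   # min = [1, -2], max = [3, 1], mean = [2, -0.1667]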
def classification_metrics():
    metric_fcts = load_metric_fcts('classification')
    classification_data = data.load_metric_data('classification')
    h_mat_infos = data.get_all_h_mat_infos(labeled_only=True)
    for info in ProgressIterator(h_mat_infos, print_every=1):
        nbprint(info)
        # Grab the corresponding entry from the classification data
        metric_data_entry = grab_metric_data_entry(classification_data, info)
        # Iterate all metric functions and store the result in the entry
        for metric_id, metric_fct in metric_fcts.items():
            # Skip metric if it already exists:
            if metric_id in metric_data_entry:
                continue
            # Compute the metric
            labels_true = load_ground_truth_classes(info)
            h_mat = data.load_h_mat(info)
            metric_data_entry[metric_id] = metric_fct(labels_true, h_mat)
        # Save intermediate results
        data.save_metric_data(classification_data, 'classification')
def fv_build_mat(info, runvars):
    # Get matrices
    count_mat = runvars['term_doc_mat_count']
    mean_vec = runvars['mean_vec']
    var_vec = runvars['var_vec']

    # Load vocabulary and wordembedding vectors
    vocab_list = data.load_vocab_list(info)
    model = get_model(info)
    embedding_function = model.embedding_function

    # Create a zero matrix
    dimension = model.vector_size
    fv_m_shape = (dimension * 2, count_mat.shape[1])
    fv_mat = np.zeros(fv_m_shape)
    fv_num_tokens_shape = (1, count_mat.shape[1])
    fv_num_tokens = np.zeros(fv_num_tokens_shape)

    # Iterate all nonzero entries
    nonzeros = zip(*sparse.find(count_mat))
    for token_idx, doc_idx, value in ProgressIterator(nonzeros, length=count_mat.nnz, print_every=5000):
        try:
            embedding_vector = embedding_function(vocab_list[token_idx])
        except OOVException:
            continue
        fv_mat[:dimension, doc_idx] += value * (embedding_vector - mean_vec) / var_vec
        fv_mat[dimension:, doc_idx] += value * (np.square(embedding_vector - mean_vec) /
                                                (var_vec * np.sqrt(var_vec)) - (1 / np.sqrt(var_vec)))
        fv_num_tokens[0, doc_idx] += value

    # Normalize
    fv_num_tokens[fv_num_tokens == 0] = 1
    fv_mat *= np.power(fv_num_tokens, -0.5)
    fv_mat[:dimension, :] = (fv_mat[:dimension, :].transpose() *
                             np.nan_to_num(np.power(1 / var_vec, -0.5))).transpose()
    fv_mat[dimension:, :] = (fv_mat[dimension:, :].transpose() *
                             np.nan_to_num(np.power(2 / var_vec, -0.5))).transpose()
    runvars['cbow_mat'] = fv_mat
def load_documents(self):
    for filename in ProgressIterator(self.files, doc_progress_label):
        classname = filename.split(".")[0]
        class_id = self.classinfo.increase_class_count(classname)
        text = self.load_file(filename)
        self.docinfo.add_document(text, class_id)