def get_corpus(wiki_raw_filename, token_version, dictionary):
    nbprint('Loading Corpus')
    wiki_raw_path = join(config.paths["rawdata"], 'wiki/' + wiki_raw_filename)
    tokenizer_func = get_tokenizer_func(token_version).tokenize
    return WikiCorpus(wiki_raw_path, dictionary=dictionary, tokenizer_func=tokenizer_func)
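# Hypothetical usage sketch, not part of the original module: how the corpus returned by
# get_corpus could be consumed. The dump filename and the 'BC' token version below are
# placeholder assumptions; WikiCorpus.get_texts() yields one token list per article.
def _example_iterate_wiki_corpus(dictionary):
    corpus = get_corpus('enwiki-latest-pages-articles.xml.bz2', 'BC', dictionary)
    for tokens in corpus.get_texts():
        # each `tokens` is the token list of a single Wikipedia article
        print(len(tokens))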
def run(self):
    # Set the NLTK path (http://www.nltk.org/_modules/nltk/data.html)
    nltk_path.append(join(config.paths["rawdata"], "nltk"))
    try:
        # Check which classes are valid depending on min_docs_per_class
        nbprint('Loading classes')
        self.load_valid_classes()
        # Load the documents
        with data.document_writer(self.info) as document_writer:
            # Initialize info classes
            self.classinfo = ClassInfo()
            self.docinfo = DocumentInfo(document_writer)
            # Load documents and store class information in classinfo
            self.load_documents()
        # Save meta information
        self.docinfo.save_meta(self.info)
        self.classinfo.save_meta(self.info)
    except (LookupError, FileNotFoundError):
        raise ImporterError(
            self.info,
            'Directory "{}" does not contain the required corpus.'.format(nltk_path))
    # Save the classes
    classes = self.classinfo.make_class_list()
    data.save_classes(classes, self.info)
def coherence_metrics():
    metric_fcts = load_metric_fcts('coherence')
    if len(metric_fcts) == 0:
        nbprint('No metrics active.')
        return
    topiclist_infos = data.get_all_topiclist_infos()
    if len(topiclist_infos) == 0:
        nbprint('No topics found.')
        return
    # Group the infos into batches based on token_version and add num_tokens
    topiclist_info_batches = defaultdict(list)
    for info in topiclist_infos:
        for num_tokens in config.metrics['num_tokens']:
            extended_info = info.copy()
            extended_info['num_tokens'] = num_tokens
            if 'second_info' in info:
                token_version = info['second_info']['token_version']
            else:
                token_version = info['token_version']
            topiclist_info_batches[token_version].append(extended_info)
    for token_version, batch in topiclist_info_batches.items():
        nbprint('Batch {}'.format(token_version)).push()
        for metric_id, fct in metric_fcts.items():
            start = time.time()
            nbprint('Metric: {}'.format(
                config.metrics['coherence'][metric_id]['name'])).push()
            coherence_metric_batch(token_version, batch, metric_id, fct)
            end = time.time()
            nbprint('Runtime: {} minutes'.format((end - start) / 60)).pop()
        nbprint.pop()
def filter_total_size(info, runvars):
    max_tokens = get_option(info, 'max_tokens')
    if max_tokens:
        old_length = len(runvars['counts'])
        runvars['counts'][:] = runvars['counts'][:max_tokens]
        nbprint('Removed {} tokens to limit vocabulary size to {}'.format(
            old_length - len(runvars['counts']), max_tokens))
def _run(self, info):
    nbprint('Running k-means')
    model = KMeansSklearn(n_clusters=info["num_topics"],
                          init='k-means++',
                          random_state=42,
                          verbose=0)
    self.c = model.fit_predict(self.input_mat.transpose())
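# Standalone sketch (an assumption, not code from this repo): fit_predict on the
# transposed term-document matrix yields one cluster label per document (columns become
# rows), which can be turned into a hard one-hot document-topic matrix.
import numpy as np
from scipy import sparse
from sklearn.cluster import KMeans

term_doc = sparse.random(100, 30, density=0.1, format='csc')  # terms x documents
labels = KMeans(n_clusters=4, init='k-means++', random_state=42).fit_predict(term_doc.T)
H = np.zeros((4, term_doc.shape[1]))
H[labels, np.arange(term_doc.shape[1])] = 1  # one topic per document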
def make_term_doc_mat_count(info, runvars):
    counts, i, j, mat_ids = [], [], [], []
    idx, excluded = 0, 0
    vocab = data.load_vocab_dict(info)
    with data.tokenized_document_reader(info) as documents:
        for document in ProgressIterator(documents, 'Documents'):
            tokens = split_tokens(document['tokens'])
            tokencnt = Counter(tokens).most_common()
            num_tokens = 0
            for token, count in tokencnt:
                if token in vocab:
                    counts.append(count)
                    i.append(vocab[token]['id'])
                    j.append(idx)
                    num_tokens += count
            if num_tokens > 0:
                idx += 1
                mat_ids.append(document['id'])
            else:
                excluded += 1
    nbprint("Documents {}, Excluded {} empty documents".format(idx, excluded))
    term_doc_mat_shape = (len(vocab), idx)
    runvars['term_doc_mat_count'] = sparse.coo_matrix(
        (counts, (i, j)), shape=term_doc_mat_shape).tocsc()
    runvars['mat_ids'] = mat_ids
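# Minimal standalone sketch (assumption, not from the original repo) of the sparse
# construction above: (count, row=token id, col=document index) triplets are collected in
# coordinate (COO) form and converted to CSC so that per-document columns slice cheaply.
from scipy import sparse

counts = [2, 1, 3]          # token occurrence counts
i = [0, 4, 4]               # row indices: vocabulary ids
j = [0, 0, 1]               # column indices: document positions
mat = sparse.coo_matrix((counts, (i, j)), shape=(5, 2)).tocsc()
print(mat[:, 1].toarray())  # count vector of the second document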
def _run(self, info):
    self.info = info
    # Settings
    self.num_topics = self.info["num_topics"]
    # Algorithm for optimizing over the nullspace
    self.null = self.info["model_info"].get('null', None)
    # Minimum number of iterations
    self.min_iter = info['model_info'].get('min_iter', 15)
    # Maximum number of iterations
    self.max_iter = info['model_info'].get('max_iter', 100)
    # Minimum value of W and H entries, a small positive value for stability
    self.eps = info['model_info'].get('eps', 1e-16)
    # How often W gets updated in each iteration
    self.W_update_num = info['model_info'].get('W_update_num', 2)
    # How often H gets updated in each iteration
    self.H_update_num = info['model_info'].get('H_update_num', 1)
    # If the mean of the relative differences between two iterates falls below this threshold, the algorithm stops
    self.threshold = info['model_info'].get('threshold', 1e-3)
    # If all c values are below c_threshold in one iteration, the iterations stop (original NMF)
    self.c_threshold = info['model_info'].get('c_threshold', 1e-8)
    # Number of kernel elements used for nullspace updates
    self.num_kernel = info['model_info'].get('num_kernel', 20)
    # Compute and log the error after each iteration
    self.log_error = info['model_info'].get('log_error', False)
    # Compute and print the error after each iteration
    self.print_error = info['model_info'].get('print_error', False)

    self._pre_algorithm()
    nbprint('WeNMFN')
    self._wenmf()
    self.meta = {'errors': self.errors}
def load_documents(self):
    nbprint('Loading xml file')
    self.documents = []
    filename = join(config.paths["rawdata"], "yahooL5/manner.xml")
    #i, max = 0, 100000
    current_doc = None
    for event, elem in etree.iterparse(filename, events=('start', 'end'), recover=True):
        #i = i + 1
        #if i % math.floor(max/10) == 0:
        #    print(i/max)
        #if i > max:
        #    break
        if elem.tag == "document":
            if event == "start":
                current_doc = Document()
            elif event == "end":
                if current_doc.complete():
                    self.documents.append(current_doc)
                current_doc = None
        elif event == "end" and current_doc is not None:
            current_doc.add_elem(elem)
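# Small lxml sketch (assumption, not from the original repo) of how the start/end events
# used above pair up: a <document> element is opened on its 'start' event and finalized on
# its 'end' event, while child elements are only handled on their 'end' events.
import io
from lxml import etree

xml = io.BytesIO(b'<docs><document><subject>hello</subject></document></docs>')
for event, elem in etree.iterparse(xml, events=('start', 'end')):
    print(event, elem.tag)
# start docs, start document, start subject, end subject, end document, end docs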
def import_archive(self):
    # Iterate over all files in the archive
    with zipfile.ZipFile(self.archivepath) as archive:
        filenames = [info.filename for info in archive.infolist()]
        for filename in filenames:
            nbprint(filename)
            with archive.open(filename) as jsonfile:
                self.parse_file(jsonfile)
def run_importer(info=None):
    nbprint('Importer').push()
    if info is None:
        iterate(["data"], [import_data], depth=0)
    else:
        import_data(info)
    nbprint.pop()
def run_tokenizer(info=None):
    nbprint('Tokenizer').push()
    if info is None:
        iterate(['token:BC', 'data'], tokenize)
    else:
        tokenize(info)
    nbprint.pop()
def filter(self):
    if self._filter is None:
        nbprint('Loading word2vec filter...')
        if self.fast_filter:
            self._filter = FastWord2VecFilter(self.vocab, self.track_exclusion)
        else:
            self._filter = Word2VecFilter(self.vocab, self.track_exclusion)
        nbprint.clear_last()
    return self._filter
def check_requirements(info):
    # Check if the documents file exists
    if not data.documents_exists(info):
        # Run the importer
        nbprint('Documents missing.')
        run_importer(info)
        # Check if it was successful
        return data.documents_exists(info)
    return True
def check_requirements(info):
    # Check if the tokens file exists
    if not data.tokenized_document_exists(info):
        # Run the tokenizer
        nbprint('Tokens missing.')
        run_tokenizer(info)
        # Check if it was successful
        return data.tokenized_document_exists(info)
    return True
def _run(self, info):
    nbprint('Running c-means')
    # Fuzzy c-means clustering; u is the fuzzy membership (partition) matrix,
    # which serves as the document-topic matrix H.
    cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(self.input_mat,
                                                     info["num_topics"],
                                                     1.1,
                                                     error=0.0005,
                                                     maxiter=100,
                                                     init=None)
    self.H = u
def count_mat(info):
    # Reset runvars
    global runvars
    runvars = None
    # Check if vocab and tokens exist
    if not check_requirements(info):
        nbprint('Skipping Vectorizer (requirements not satisfied)')
        raise BreakIteration()
def filter_max_word_length(info, runvars):
    max_word_length = get_option(info, 'max_word_length')
    if max_word_length:
        old_length = len(runvars['counts'])
        runvars['counts'][:] = [
            vocab_item for vocab_item in runvars['counts']
            if len(vocab_item.token) <= max_word_length
        ]
        nbprint('Removed {} tokens with length greater than {}'.format(
            old_length - len(runvars['counts']), max_word_length))
def vocab(self):
    if self._vocab is None:
        nbprint('Loading embedding vocab...')
        if data.embedding_meta_exists('vocab', self.info):
            self._vocab = data.load_embedding_meta('vocab', self.info)
        else:
            self._vocab = self._load_vocab()
            data.save_embedding_meta(self._vocab, 'vocab', self.info)
        nbprint.clear_last()
    return self._vocab
def run_model(info):
    info['num_topics'] = convert_num_topics(info, info['num_topics'])
    # Check if the model output already exists
    if config.skip_existing and info['model'].output_exists(info):
        nbprint('Skipping Model (file(s) exist)')
        return
    info['model'].run(info)
    info['model'].save(info)
    nbprint('Model: success')
def run_vocab(info=None):
    nbprint('Vocab').push()
    if info is None:
        iterate(["data", "token", "vocab"], [check_tokens, build_vocab])
    else:
        check_tokens(info)
        build_vocab(info)
    nbprint.pop()
def check_requirements(info):
    # Check if the input matrix exists
    if not data.input_mat_exists(info):
        # Run the vectorizer
        nbprint('Input mat missing.')
        run_vectorizer(info)
        # Check if it was successful
        if not data.input_mat_exists(info):
            return False
    return True
def filter_min_count(info, runvars):
    min_count = get_option(info, 'min_count')
    if min_count:
        old_length = len(runvars['counts'])
        runvars['counts'][:] = [
            vocab_item for vocab_item in runvars['counts']
            if vocab_item.total >= min_count
        ]
        nbprint('Removed {} tokens occurring less than {} times in total.'.format(
            old_length - len(runvars['counts']), min_count))
def filter_max_docs(info, runvars):
    max_docs = get_option(info, 'max_docs')
    if max_docs:
        max_docs = to_abs(max_docs, runvars['num_docs'])
        old_length = len(runvars['counts'])
        runvars['counts'][:] = [
            vocab_item for vocab_item in runvars['counts']
            if vocab_item.document <= max_docs
        ]
        nbprint('Removed {} tokens occurring in more than {} documents'.format(
            old_length - len(runvars['counts']), max_docs))
def run_distiller():
    global rejector
    rejector = Rejector(0.99)
    nbprint('Distiller').push()
    iterate(['distiller', 'distillerinputs'], add_second_info, print_iterates=False)
    nbprint.pop()
def filter_stopwords(info, runvars):
    stopwords_corpus_name = get_option(info, 'stopwords')
    if stopwords_corpus_name:
        stopword_corpus = set(stopwords.words(stopwords_corpus_name))
        old_length = len(runvars['counts'])
        runvars['counts'][:] = [
            vocab_item for vocab_item in runvars['counts']
            if vocab_item.token not in stopword_corpus
        ]
        nbprint('Removed {} tokens in the {} stopword corpus'.format(
            old_length - len(runvars['counts']), stopwords_corpus_name))
def _run(self, info):
    nbprint('Running LDA')
    vocab = data.load_vocab(info)
    id2word = {e['id']: e['token'] for e in vocab}
    corpus = Sparse2Corpus(self.input_mat)
    lda = LdaModel(corpus, id2word=id2word, num_topics=info["num_topics"])
    self.W = lda.get_topics().T
    self.H = np.zeros((info["num_topics"], self.input_mat.shape[1]))
    for idx, doc in enumerate(corpus):
        weights = lda[doc]
        for topic, value in weights:
            self.H[topic, idx] = value
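# Hypothetical shape check (assumption, not from the original repo) for the transpose
# above: gensim's LdaModel.get_topics() returns an array of shape (num_topics, vocab_size),
# so W = get_topics().T has shape (vocab_size, num_topics), matching a term-document
# input matrix whose H is (num_topics, num_docs).
import numpy as np
from scipy import sparse
from gensim.matutils import Sparse2Corpus
from gensim.models import LdaModel

rng = np.random.default_rng(0)
term_doc = sparse.csc_matrix(rng.integers(0, 3, size=(50, 10)))  # 50 terms, 10 documents
corpus = Sparse2Corpus(term_doc)                                  # documents are columns
id2word = {idx: 'w{}'.format(idx) for idx in range(50)}
lda = LdaModel(corpus, id2word=id2word, num_topics=3)
print(lda.get_topics().T.shape)                                   # (50, 3)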
def build_vocab(info):
    # Check if the vocab already exists
    if config.skip_existing and data.vocab_exists(info):
        nbprint('Skipping Vocab (file exists)')
        return
    # Build the vocab
    current_vocab_builder = get_vocab_builder(info)
    current_vocab_builder.build_vocab()
    vocab = current_vocab_builder.get_vocab()
    # Save the vocab
    data.save_vocab(vocab, info)
def save_documents(self):
    nbprint('Saving documents')
    self.classinfo = ClassInfo()
    # Open the document writer
    with data.document_writer(self.info) as document_writer:
        self.docinfo = DocumentInfo(document_writer)
        for doc in self.documents:
            text = doc.content['subject']
            class_id = self.classinfo.increase_class_count(doc.content['maincat'])
            self.docinfo.add_document(text, class_id)
def save(self):
    if self.topiclist is None:
        nbprint('Distiller did not produce topiclist.')
        return
    if self.topic_token_version is None:
        nbprint('Distiller did not set "topic_token_version", discarding result.')
        return
    self.runinfo['topic_token_version'] = self.topic_token_version
    data.save_topiclist(self.topiclist, self.runinfo)
def phrase(info):
    # Check if the documents exist
    if not check_phrase_requirements(info):
        nbprint('Skipping Vectorizer (requirements not satisfied)')
        return
    # Check if the input mat already exists
    if config.skip_existing and data.input_mat_exists(info):
        nbprint('Skipping Vectorizer (file exists)')
        return
    make_phrase_mat(info, runvars)
    data.save_dense_input_mat(runvars['phrase_mat'], info)