Example #1
def get_corpus(wiki_raw_filename, token_version, dictionary):
    nbprint('Loading Corpus')
    wiki_raw_path = join(config.paths["rawdata"], 'wiki/' + wiki_raw_filename)
    tokenizer_func = get_tokenizer_func(token_version).tokenize
    return WikiCorpus(wiki_raw_path,
                      dictionary=dictionary,
                      tokenizer_func=tokenizer_func)
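
All of these examples call an nbprint helper that prints a message, returns an object whose push() and pop() adjust an indentation level, and also exposes nbprint.pop() and nbprint.clear_last() at the top level. The snippet below is only a minimal sketch of such an interface so the examples can be read on their own; the class name, the indentation scheme, and the no-op clear_last are assumptions, not the project's actual implementation.

class _NBPrint:
    # Minimal stand-in for the nbprint helper used throughout these examples.

    def __init__(self):
        self.level = 0

    def __call__(self, message):
        # Print the message at the current indentation level; returning self
        # allows chained calls such as nbprint('...').push().
        print('    ' * self.level + str(message))
        return self

    def push(self):
        # Indent subsequent messages one level deeper.
        self.level += 1
        return self

    def pop(self):
        # Return to the previous indentation level.
        self.level = max(0, self.level - 1)
        return self

    def clear_last(self):
        # Placeholder: in a notebook this would clear the last printed output.
        return self

nbprint = _NBPrint()

# Usage mirroring Example #10 below:
nbprint('Importer').push()
nbprint('Documents missing.')
nbprint.pop()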
Example #2
    def run(self):
        # Set the NLTK path (http://www.nltk.org/_modules/nltk/data.html)
        nltk_path.append(join(config.paths["rawdata"], "nltk"))

        try:
            # Check which classes are valid depending on min_docs_per_class
            nbprint('Loading classes')
            self.load_valid_classes()

            # Load the documents
            with data.document_writer(self.info) as document_writer:
                # Initialize info classes
                self.classinfo = ClassInfo()
                self.docinfo = DocumentInfo(document_writer)

                # Load documents and store class information in classinfo
                self.load_documents()

            # Print Meta Information
            self.docinfo.save_meta(self.info)
            self.classinfo.save_meta(self.info)

        except (LookupError, FileNotFoundError):
            # Raised when the required NLTK corpus is missing from the data path
            raise ImporterError(
                self.info,
                'Directory "{}" does not contain the required corpus.'.format(
                    nltk_path))

        # Save the classes
        classes = self.classinfo.make_class_list()
        data.save_classes(classes, self.info)
Example #3
def coherence_metrics():
    metric_fcts = load_metric_fcts('coherence')
    if len(metric_fcts) == 0:
        nbprint('No metrics active.')
        return

    topiclist_infos = data.get_all_topiclist_infos()
    if len(topiclist_infos) == 0:
        nbprint('No topics found.')
        return

    # Group them into batches based on token_version and add num_tokens
    topiclist_info_batches = defaultdict(list)
    for info in topiclist_infos:
        for num_tokens in config.metrics['num_tokens']:
            extended_info = info.copy()
            extended_info['num_tokens'] = num_tokens
            if 'second_info' in info:
                token_version = info['second_info']['token_version']
            else:
                token_version = info['token_version']
            topiclist_info_batches[token_version].append(extended_info)

    for token_version, batch in topiclist_info_batches.items():
        nbprint('Batch {}'.format(token_version)).push()
        for metric_id, fct in metric_fcts.items():
            start = time.time()
            nbprint('Metric: {}'.format(
                config.metrics['coherence'][metric_id]['name'])).push()
            coherence_metric_batch(token_version, batch, metric_id, fct)
            end = time.time()
            nbprint('Runtime: {} minutes'.format((end - start) / 60)).pop()
        nbprint.pop()
Example #4
def filter_total_size(info, runvars):
    max_tokens = get_option(info, 'max_tokens')
    if max_tokens:
        old_length = len(runvars['counts'])
        runvars['counts'][:] = runvars['counts'][:max_tokens]
        nbprint('Removed {} tokens to limit vocabulary size to {}'.format(
            old_length - len(runvars['counts']), max_tokens))
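
The filter_* helpers in these examples (this one and the later filter_max_word_length, filter_min_count, filter_max_docs, and filter_stopwords snippets) all rely on the same idiom: assigning to runvars['counts'][:] rewrites the existing list in place, so every other reference to that list sees the filtered result. Below is a self-contained sketch of that idiom; the VocabItem tuple and the sample values are made up for illustration.

from collections import namedtuple

VocabItem = namedtuple('VocabItem', ['token', 'total', 'document'])

runvars = {'counts': [VocabItem('a', 5, 3),
                      VocabItem('ab', 2, 1),
                      VocabItem('abcdef', 9, 4)]}
counts = runvars['counts']   # a second reference to the same list object

# Drop tokens longer than three characters, in place.
max_word_length = 3
counts[:] = [v for v in counts if len(v.token) <= max_word_length]

print(runvars['counts'])     # the shared list was mutated, so this is filtered too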
Example #5
    def _run(self, info):
        nbprint('Running k-means')
        model = KMeansSklearn(n_clusters=info["num_topics"],
                              init='k-means++',
                              random_state=42,
                              verbose=0)
        self.c = model.fit_predict(self.input_mat.transpose())
Example #6
def make_term_doc_mat_count(info, runvars):
    counts, i, j, mat_ids = [], [], [], []
    idx, excluded = 0, 0
    vocab = data.load_vocab_dict(info)

    with data.tokenized_document_reader(info) as documents:
        for document in ProgressIterator(documents, 'Documents'):
            tokens = split_tokens(document['tokens'])
            tokencnt = Counter(tokens).most_common()
            num_tokens = 0
            for token, count in tokencnt:
                if token in vocab:
                    counts.append(count)
                    i.append(vocab[token]['id'])
                    j.append(idx)
                    num_tokens += count
            if num_tokens > 0:
                idx += 1
                mat_ids.append(document['id'])
            else:
                excluded += 1
    nbprint("Documents {}, Excluded {} empty documents".format(idx, excluded))
    term_doc_mat_shape = (len(vocab), idx)
    runvars['term_doc_mat_count'] = sparse.coo_matrix(
        (counts, (i, j)), shape=term_doc_mat_shape).tocsc()
    runvars['mat_ids'] = mat_ids
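
For context, here is a toy version of the same counting scheme, assuming a plain {token: id} dictionary in place of the project's vocabulary structure; scipy's coo_matrix((data, (rows, cols)), shape=...) assembles the term-document count matrix exactly as in the example above.

from collections import Counter
from scipy import sparse

vocab = {'cat': 0, 'dog': 1, 'fish': 2}
documents = [['cat', 'cat', 'dog'], ['fish'], ['bird']]   # 'bird' is out of vocabulary

counts, rows, cols, kept = [], [], [], 0
for tokens in documents:
    in_vocab = [(t, c) for t, c in Counter(tokens).items() if t in vocab]
    if not in_vocab:
        continue            # exclude documents with no known tokens
    for token, count in in_vocab:
        counts.append(count)
        rows.append(vocab[token])
        cols.append(kept)
    kept += 1

term_doc = sparse.coo_matrix((counts, (rows, cols)),
                             shape=(len(vocab), kept)).tocsc()
print(term_doc.toarray())   # [[2 0]
                            #  [1 0]
                            #  [0 1]]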
Example #7
    def _run(self, info):
        self.info = info

        # Settings
        self.num_topics = self.info["num_topics"]
        # Algorithm for optimizing over the nullspace
        self.null = self.info["model_info"].get('null', None)
        # Minimum number of iterations
        self.min_iter = info['model_info'].get('min_iter', 15)
        # Maximum number of iterations
        self.max_iter = info['model_info'].get('max_iter', 100)
        # Minimum value of W and H entries; small positive value for stability
        self.eps = info['model_info'].get('eps', 1e-16)
        # How often does W get updated each iteration
        self.W_update_num = info['model_info'].get('W_update_num', 2)
        # How often does H get updated each iteration
        self.H_update_num = info['model_info'].get('H_update_num', 1)
        # If the mean of the relative differences between two iterates falls below this threshold, the algorithm stops
        self.threshold = info['model_info'].get('threshold', 1e-3)
        # If all c values are below c_threshold in one iteration, the iterations stop (original NMF)
        self.c_threshold = info['model_info'].get('c_threshold', 1e-8)
        # Kernel elements used for nullspace updates
        self.num_kernel = info['model_info'].get('num_kernel', 20)
        # Compute and log error after each iteration
        self.log_error = info['model_info'].get('log_error', False)
        # Compute and print error after each iteration
        self.print_error = info['model_info'].get('print_error', False)

        self._pre_algorithm()
        nbprint('WeNMFN')
        self._wenmf()

        self.meta = {'errors': self.errors}
Example #8
    def load_documents(self):
        nbprint('Loading xml file')

        self.documents = []
        filename = join(config.paths["rawdata"], "yahooL5/manner.xml")
        #i, max = 0, 100000
        current_doc = None

        for event, elem in etree.iterparse(filename,
                                           events=('start', 'end'),
                                           recover=True):
            #i = i+1
            #if i % math.floor(max/10) == 0:
            #    print(i/max)
            #if i > max:
            #    break;

            if elem.tag == "document":
                if event == "start":
                    current_doc = Document()
                elif event == "end":
                    if current_doc.complete():
                        self.documents.append(current_doc)
                    current_doc = None
            elif event == "end" and current_doc is not None:
                current_doc.add_elem(elem)
Example #9
    def import_archive(self):
        # Iterate all files in archive
        with zipfile.ZipFile(self.archivepath) as zip_file:
            filenames = [info.filename for info in zip_file.infolist()]
            for filename in filenames:
                nbprint(filename)
                with zip_file.open(filename) as jsonfile:
                    self.parse_file(jsonfile)
Example #10
def run_importer(info=None):
    nbprint('Importer').push()

    if info is None:
        iterate(["data"], [import_data], depth=0)
    else:
        import_data(info)

    nbprint.pop()
Example #11
def run_tokenizer(info=None):
    nbprint('Tokenizer').push()

    if info is None:
        iterate(['token:BC', 'data'], tokenize)
    else:
        tokenize(info)

    nbprint.pop()
Example #12
    def filter(self):
        if self._filter is None:
            nbprint('Loading word2vec filter...')
            if self.fast_filter:
                self._filter = FastWord2VecFilter(self.vocab, self.track_exclusion)
            else:
                self._filter = Word2VecFilter(self.vocab, self.track_exclusion)
            nbprint.clear_last()
        return self._filter
Example #13
def check_requirements(info):
    # Check if documents file exists
    if not data.documents_exists(info):
        # Run importer
        nbprint('Documents missing.')
        run_importer(info)
        # Check if it was successful
        return data.documents_exists(info)
    return True
Example #14
def check_requirements(info):
    # Check if tokens file exists
    if not data.tokenized_document_exists(info):
        # Run Tokenizer
        nbprint('Tokens missing.')
        run_tokenizer(info)
        # Check if it was successful
        return data.tokenized_document_exists(info)
    return True
Example #15
    def _run(self, info):
        nbprint('Running c-means')
        cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(self.input_mat,
                                                         info["num_topics"],
                                                         1.1,
                                                         error=0.0005,
                                                         maxiter=100,
                                                         init=None)
        self.H = u
Example #16
def count_mat(info):
    # Reset runvars
    global runvars
    runvars = None

    # Check if Vocab and Tokens exist
    if not check_requirements(info):
        nbprint('Skipping Vectorizer (requirements not satisfied)')
        raise BreakIteration()
Example #17
def filter_max_word_length(info, runvars):
    max_word_length = get_option(info, 'max_word_length')
    if max_word_length:
        old_length = len(runvars['counts'])
        runvars['counts'][:] = [
            vocab_item for vocab_item in runvars['counts']
            if len(vocab_item.token) <= max_word_length
        ]
        nbprint('Removed {} tokens with length greater than {}'.format(
            old_length - len(runvars['counts']), max_word_length))
Example #18
    def vocab(self):
        if self._vocab is None:
            nbprint('Loading embedding vocab...')
            if data.embedding_meta_exists('vocab', self.info):
                self._vocab = data.load_embedding_meta('vocab', self.info)
            else:
                self._vocab = self._load_vocab()
                data.save_embedding_meta(self._vocab, 'vocab', self.info)
            nbprint.clear_last()
        return self._vocab
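
The vocab accessor above follows a load-from-cache-or-build-then-cache pattern. Below is a minimal sketch of the same idea with a plain pickle file as the store; the EmbeddingWrapper class, the cache file name, and the toy _load_vocab are hypothetical stand-ins for the project's data module.

import os
import pickle

class EmbeddingWrapper:
    def __init__(self, cache_path='vocab_cache.pkl'):
        self.cache_path = cache_path
        self._vocab = None

    def _load_vocab(self):
        # Stand-in for the expensive build step (e.g. reading an embedding file).
        return {'cat': 0, 'dog': 1}

    @property
    def vocab(self):
        if self._vocab is None:
            if os.path.exists(self.cache_path):
                # Reuse the cached vocabulary if it was saved before.
                with open(self.cache_path, 'rb') as f:
                    self._vocab = pickle.load(f)
            else:
                # Build once, then cache to disk for the next run.
                self._vocab = self._load_vocab()
                with open(self.cache_path, 'wb') as f:
                    pickle.dump(self._vocab, f)
        return self._vocab

print(EmbeddingWrapper().vocab)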
Example #19
def run_model(info):
    info['num_topics'] = convert_num_topics(info, info['num_topics'])

    # Check if the model output already exists
    if config.skip_existing and info['model'].output_exists(info):
        nbprint('Skipping Model (file(s) exists)')
        return
    info['model'].run(info)
    info['model'].save(info)
    nbprint('Model: success')
Example #20
def run_vocab(info=None):
    nbprint('Vocab').push()

    if info is None:
        iterate(["data", "token", "vocab"], [check_tokens, build_vocab])
    else:
        check_tokens(info)
        build_vocab(info)

    nbprint.pop()
Example #21
def check_requirements(info):
    # Check if the input matrix exists
    if not data.input_mat_exists(info):
        # Run Vectorizer
        nbprint('Input mat missing.')
        run_vectorizer(info)
        # Check if it was successful
        if not data.input_mat_exists(info):
            return False
    return True
Example #22
def filter_min_count(info, runvars):
    min_count = get_option(info, 'min_count')
    if min_count:
        old_length = len(runvars['counts'])
        runvars['counts'][:] = [
            vocab_item for vocab_item in runvars['counts']
            if vocab_item.total >= min_count
        ]
        nbprint(
            'Removed {} tokens occurring less than {} times in total.'.format(
                old_length - len(runvars['counts']), min_count))
Example #23
def filter_max_docs(info, runvars):
    max_docs = get_option(info, 'max_docs')
    if max_docs:
        max_docs = to_abs(max_docs, runvars['num_docs'])
        old_length = len(runvars['counts'])
        runvars['counts'][:] = [
            vocab_item for vocab_item in runvars['counts']
            if vocab_item.document <= max_docs
        ]
        nbprint('Removed {} tokens occurring in more than {} documents'.format(
            old_length - len(runvars['counts']), max_docs))
Example #24
def run_distiller():
    global rejector
    rejector = Rejector(0.99)

    nbprint('Distiller').push()

    iterate(['distiller', 'distillerinputs'],
            add_second_info,
            print_iterates=False)

    nbprint.pop()
Example #25
def filter_stopwords(info, runvars):
    stopwords_corpus_name = get_option(info, 'stopwords')
    if stopwords_corpus_name:
        stopword_corpus = set(stopwords.words(stopwords_corpus_name))
        old_length = len(runvars['counts'])
        runvars['counts'][:] = [
            vocab_item for vocab_item in runvars['counts']
            if vocab_item.token not in stopword_corpus
        ]
        nbprint('Removed {} tokens in the {} stopword corpus'.format(
            old_length - len(runvars['counts']), stopwords_corpus_name))
Example #26
    def _run(self, info):
        nbprint('Running LDA')
        vocab = data.load_vocab(info)
        id2word = {e['id']: e['token'] for e in vocab}
        corpus = Sparse2Corpus(self.input_mat)
        lda = LdaModel(corpus, id2word=id2word, num_topics=info["num_topics"])
        self.W = lda.get_topics().T
        self.H = np.zeros((info["num_topics"], self.input_mat.shape[1]))
        for idx, doc in enumerate(corpus):
            weights = lda[doc]
            for topic, value in weights:
                self.H[topic, idx] = value
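
A self-contained toy run of the same gensim calls, assuming a tiny term-document count matrix with terms as rows and documents as columns (Sparse2Corpus's default orientation); the matrix and vocabulary here are invented for illustration.

import numpy as np
from scipy import sparse
from gensim.matutils import Sparse2Corpus
from gensim.models import LdaModel

term_doc = sparse.csc_matrix(np.array([[2, 0, 1],
                                       [0, 3, 1],
                                       [1, 1, 0]]))
id2word = {0: 'cat', 1: 'dog', 2: 'fish'}

corpus = Sparse2Corpus(term_doc)   # documents are the columns
lda = LdaModel(corpus, id2word=id2word, num_topics=2, random_state=42)

W = lda.get_topics().T                    # terms x topics
H = np.zeros((2, term_doc.shape[1]))      # topics x documents
for idx, doc in enumerate(corpus):
    # lda[doc] yields (topic, weight) pairs above gensim's minimum_probability
    for topic, value in lda[doc]:
        H[topic, idx] = value

print(W.shape, H.shape)   # (3, 2) (2, 3)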
Example #27
def build_vocab(info):
    # Check if vocab exists
    if config.skip_existing and data.vocab_exists(info):
        nbprint('Skipping Vocab (file exists)')
        return

    # Build vocab
    current_vocab_builder = get_vocab_builder(info)
    current_vocab_builder.build_vocab()
    vocab = current_vocab_builder.get_vocab()

    # Save Vocab
    data.save_vocab(vocab, info)
Example #28
    def save_documents(self):
        nbprint('Saving documents')

        self.classinfo = ClassInfo()

        # Open Writer
        with data.document_writer(self.info) as document_writer:
            self.docinfo = DocumentInfo(document_writer)
            for doc in self.documents:
                text = doc.content['subject']
                class_id = self.classinfo.increase_class_count(
                    doc.content['maincat'])
                self.docinfo.add_document(text, class_id)
Example #29
    def save(self):
        if self.topiclist is None:
            nbprint('Distiller did not produce topiclist.')
            return

        if self.topic_token_version is None:
            nbprint(
                'Distiller did not set "topic_token_version", discarding result.'
            )
            return

        self.runinfo['topic_token_version'] = self.topic_token_version
        data.save_topiclist(self.topiclist, self.runinfo)
Example #30
def phrase(info):
    # Check if Documents exist
    if not check_phrase_requirements(info):
        nbprint('Skipping Vectorizer (requirements not satisfied)')
        return

    # Check if input mat exists
    if config.skip_existing and data.input_mat_exists(info):
        nbprint('Skipping Vectorizer (file exists)')
        return

    make_phrase_mat(info, runvars)
    data.save_dense_input_mat(runvars['phrase_mat'], info)