Example #1
    def run(self):
        folder = join(config.paths["rawdata"], "atd")

        # List txt files
        try:
            files = listdir(folder)
        except FileNotFoundError:
            raise ImporterError(self.info,
                                'Directory "{}" does not exist'.format(folder))

        # Keep only .txt files
        files = [file for file in files if file.split(".")[-1] == "txt"]

        # Check if files exist
        if len(files) == 0:
            raise ImporterError(self.info,
                                'There are no valid files in the folder.')

        # Add files one by one
        with data.document_writer(self.info) as document_writer:
            docinfo = DocumentInfo(document_writer)
            for filename in ProgressIterator(files, doc_progress_label):
                if filename.split(".")[-1] != "txt":
                    continue
                with open(join(folder, filename), "r",
                          encoding="utf8") as file:
                    text = file.read()
                    docinfo.add_document(text)
            # Print Meta Information
            docinfo.save_meta(self.info)
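
Every example on this page wraps its main loop in ProgressIterator, whose implementation is not shown here. Judging from the call sites, it takes an iterable, an optional label, an optional explicit length (for generators), and a print_every interval. A minimal sketch under those assumptions, not the actual class:

class ProgressIterator:
    # Hypothetical minimal version inferred from the call sites on this page.
    def __init__(self, iterable, label='', length=None, print_every=1000):
        if length is None and hasattr(iterable, '__len__'):
            length = len(iterable)
        self.iterable = iterable
        self.label = label
        self.length = length
        self.print_every = print_every

    def __iter__(self):
        for count, item in enumerate(self.iterable, start=1):
            if count % self.print_every == 0:
                total = '/{}'.format(self.length) if self.length else ''
                print('{}: {}{}'.format(self.label or 'Progress', count, total))
            yield item
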
Example #2
def tokenize(info):
    if not check_requirements(info):
        nbprint('Skipping Tokenizer (requirements not satisfied)')
        return
    if config.skip_existing and data.tokenized_document_exists(info):
        nbprint('Skipping Tokenizer (file exists)')
        return

    try:
        current_tokenizer = get_tokenizer(info)

        with data.document_reader(info) as documents:
            with data.tokenized_document_writer(info) as tokenized_documents:
                for document in ProgressIterator(documents, "Documents"):
                    tokens = current_tokenizer.tokenize(document['text'])
                    token_str = join_tokens(tokens)
                    tokenized_document = {
                        'id': document['id'],
                        'tokens': token_str,
                        'class_id': document['class_id']
                    }
                    tokenized_documents.write(tokenized_document)

    except EmbeddingError as err:
        nbprint(err)
        data.clear_file(data.tokenized_document_filename(info))
        return

    nbprint('Tokenizer: success')
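
The tokenizer uses join_tokens here, and the matrix-building examples below use split_tokens to reverse it; neither helper is shown on this page. A minimal sketch, assuming the tokens are simply joined on whitespace:

def join_tokens(tokens):
    # Assumed inverse of split_tokens: serialize the token list into one string.
    return ' '.join(tokens)

def split_tokens(token_str):
    # Recover the token list from the serialized string.
    return token_str.split()
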
Example #3
def make_term_doc_mat_count(info, runvars):
    counts, i, j, mat_ids = [], [], [], []
    idx, excluded = 0, 0
    vocab = data.load_vocab_dict(info)

    with data.tokenized_document_reader(info) as documents:
        for document in ProgressIterator(documents, 'Documents'):
            tokens = split_tokens(document['tokens'])
            tokencnt = Counter(tokens).most_common()
            num_tokens = 0
            for token, count in tokencnt:
                if token in vocab:
                    counts.append(count)
                    i.append(vocab[token]['id'])
                    j.append(idx)
                    num_tokens += count
            if num_tokens > 0:
                idx += 1
                mat_ids.append(document['id'])
            else:
                excluded += 1
    nbprint("Documents {}, Excluded {} empty documents".format(idx, excluded))
    term_doc_mat_shape = (len(vocab), idx)
    runvars['term_doc_mat_count'] = sparse.coo_matrix(
        (counts, (i, j)), shape=term_doc_mat_shape).tocsc()
    runvars['mat_ids'] = mat_ids
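
For reference, the (counts, (i, j)) triplets above follow the standard scipy.sparse.coo_matrix convention: entry k contributes counts[k] at row i[k] (the token id) and column j[k] (the document index). A toy illustration with made-up numbers:

from scipy import sparse

# Toy data: token 0 appears twice in document 0, token 2 once in document 0,
# and token 1 three times in document 1.
counts = [2, 1, 3]
i = [0, 2, 1]   # token (row) ids
j = [0, 0, 1]   # document (column) indices
mat = sparse.coo_matrix((counts, (i, j)), shape=(3, 2)).tocsc()
print(mat.toarray())
# [[2 0]
#  [0 3]
#  [1 0]]
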
Example #4
def make_cbow_mat_tf_idf(info, runvars):
    # Create tf-idf matrix
    make_term_doc_mat_tf_idf(info, runvars)
    tf_idf_mat = runvars['term_doc_mat_tf_idf']
    
    # Load vocabulary and wordembedding vectors
    vocab_list = data.load_vocab_list(info)
    model = get_model(info)
    embedding_function = model.embedding_function
    
    # Create a zero matrix
    cbow_tf_idf_shape = (model.vector_size, tf_idf_mat.shape[1])
    cbow_tf_idf = np.zeros(cbow_tf_idf_shape)
    
    # Iterate over all nonzero entries of the tf-idf matrix:
    nonzeros = zip(*sparse.find(tf_idf_mat))
    for token_idx, doc_idx, value in ProgressIterator(nonzeros, length = tf_idf_mat.nnz, print_every = 5000):
        # Add each entry times the corresponding vector to the matrix
        try:
            cbow_tf_idf[:,doc_idx] = cbow_tf_idf[:,doc_idx] + value * embedding_function(vocab_list[token_idx])
        except OOVException:
            pass
        
    # Return the matrix
    runvars['cbow_mat'] = cbow_tf_idf
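
The loop above is the column-by-column form of a single matrix product: with E the (vector_size, vocab_size) matrix whose columns are the word vectors (zero columns for out-of-vocabulary tokens), cbow_tf_idf equals E @ tf_idf_mat. A hedged sketch of that equivalent formulation, reusing vocab_list, embedding_function and OOVException from the example above:

import numpy as np

def build_embedding_matrix(vocab_list, embedding_function, vector_size):
    # Hypothetical helper: word vectors as columns, zeros for OOV tokens.
    E = np.zeros((vector_size, len(vocab_list)))
    for idx, token in enumerate(vocab_list):
        try:
            E[:, idx] = embedding_function(token)
        except OOVException:
            pass
    return E

# E = build_embedding_matrix(vocab_list, embedding_function, model.vector_size)
# cbow_tf_idf = E @ tf_idf_mat   # same result as the loop, up to float rounding
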
Example #5
def clustering_metrics():
    metric_fcts = load_metric_fcts('clustering')
    clustering_data = data.load_metric_data('clustering')

    # First everything by taking the column wise maximum as cluster idx
    nbprint('H Matrix').push()
    h_mat_infos = data.get_all_h_mat_infos(labeled_only=True)
    for info in ProgressIterator(h_mat_infos, print_every=1):
        # Grab the corresponding entry from clustering data
        metric_data_entry = grab_metric_data_entry(clustering_data, info)

        # Iterate all metric functions and store result in entry
        for metric_id, metric_fct in metric_fcts.items():
            # Skip metric if it already exists:
            if metric_id in metric_data_entry:
                continue

            # Compute the metric
            labels_true = load_ground_truth_classes(info)
            labels_pred = load_class_array_from_h_mat(info)
            metric_data_entry[metric_id] = metric_fct(labels_true, labels_pred)

            # Save everything in between
            data.save_metric_data(clustering_data, 'clustering')

    # Taking indices directly from c
    nbprint.pop()('C Vector').push()
    c_vec_infos = data.get_all_c_vec_infos(labeled_only=True)
    for info in ProgressIterator(c_vec_infos, print_every=1):
        # Grab the corresponding entry from clustering data
        metric_data_entry = grab_metric_data_entry(clustering_data, info)

        # Iterate all metric functions and store result in entry
        for metric_id, metric_fct in metric_fcts.items():
            # Skip metric if it already exists:
            if metric_id in metric_data_entry:
                continue

            # Compute the metric
            labels_true = load_ground_truth_classes(info)
            labels_pred = data.load_c_vec(info)
            metric_data_entry[metric_id] = metric_fct(labels_true, labels_pred)

            # Save everything in between
            data.save_metric_data(clustering_data, 'clustering')
    nbprint.pop()
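
nbprint is used throughout these examples as a printer with indentation levels: calling it prints a message and returns the printer itself, so expressions like nbprint('H Matrix').push() and nbprint.pop()('C Vector').push() chain printing with indent changes. A minimal sketch of such an object, purely as an assumption about its interface:

class _NbPrint:
    # Hypothetical stand-in for the nbprint object used on this page.
    def __init__(self):
        self.level = 0

    def __call__(self, message):
        print('    ' * self.level + str(message))
        return self                       # allows nbprint('...').push()

    def push(self):
        self.level += 1                   # increase indentation
        return self

    def pop(self):
        self.level = max(0, self.level - 1)
        return self                       # allows nbprint.pop()('...')

nbprint = _NbPrint()
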
Example #6
 def load_documents(self):
     text_class_pairs = zip(self.rawdata.data, self.rawdata.target)
     for text, class_idx in ProgressIterator(text_class_pairs,
                                             doc_progress_label,
                                             length=len(self.rawdata.data)):
         classname = self.rawdata.target_names[class_idx]
         class_id = self.classinfo.increase_class_count(classname)
         self.docinfo.add_document(text, class_id)
Example #7
 def import_archive(self):
     # Iterate all files in archive
     with zipfile.ZipFile(self.archivepath) as zip:
         filenames = [info.filename for info in zip.infolist()]
         for filename in ProgressIterator(filenames): 
             if filename.endswith('.txt'):
                 with zip.open(filename, 'r') as txtfile:
                     text = txtfile.read().decode('utf-8')
                     self.docinfo.add_document(text)
Example #8
 def parse_file(self, jsonfile):
     for line in ProgressIterator(jsonfile, 'Parsing tweets'):
         tweet = json.loads(line)
         if 'extended_tweet' in tweet:
             text = tweet['extended_tweet']['full_text']
         elif 'text' in tweet:
             text = tweet['text']
         else:
             continue
         self.docinfo.add_document(text)
Example #9
    def load_data(self, file):
        min_length = self.info['data_info']['min_length']
        cr = csv.reader(file)
        next(cr)  # skip the header row
        for row in ProgressIterator(cr):
            classname = row[2]
            text = row[5]

            if len(text) >= min_length and classname in self.valid_classes:
                class_id = self.classinfo.increase_class_count(classname)
                self.docinfo.add_document(text, class_id)
Example #10
 def build_lookup(self, vocab):
     self.lookup = {}
     for token in ProgressIterator(vocab, print_every=10000):
         words = token.split('_')
         if '' in words or len(words) == 0:
             continue
         current_lookup = self.lookup
         for word in words:
             lower_word = word.lower()
             if not lower_word in current_lookup:
                 current_lookup[lower_word] = {}
             current_lookup = current_lookup[lower_word]
         if exists_token not in current_lookup:
             current_lookup[exists_token] = token
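
build_lookup creates a nested dictionary (a word-level trie) keyed by lowercased words, with the sentinel exists_token marking where a complete vocabulary token ends. As an illustration of how such a structure might be consumed, here is a hypothetical greedy matcher that returns the longest vocabulary token starting at a given position in a word list:

def longest_match(lookup, words, start, exists_token):
    # Walk the nested dict built by build_lookup above (hypothetical helper).
    node, best = lookup, None
    for pos in range(start, len(words)):
        node = node.get(words[pos].lower())
        if node is None:
            break
        if exists_token in node:
            best = node[exists_token]   # longest token seen so far
    return best
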
Example #11
 def _run(self, info):
     # Maximum number of iterations
     self.max_iter = info['model_info'].get('max_iter', 200)      
     # If the mean of the differences between two iterates of H falls below this threshold, the algorithm stops
     self.threshold = info['model_info'].get('eps', 1e-4)
     
     # Check if kmeans exists
     kmeans_info = info.copy()
     kmeans_info['model_name'] = 'kmeans'
     if data.c_vec_exists(kmeans_info):
         nbprint('Loading k-means for initial H')
         c = data.load_c_vec(kmeans_info)
     else:
         nbprint('Running k-means for initial H')
         model = KMeansSklearn(n_clusters=info["num_topics"], 
                               init='k-means++', 
                               random_state=42,
                               verbose=0)
         c = model.fit_predict(self.input_mat.transpose())
     
     # Construct H from c
     self.H = np.full((info["num_topics"],self.input_mat.shape[1]), 0.2)
     for doc, topic in enumerate(c):
         self.H[topic, doc] += 1
     
     # Iterate updates
     nbprint('Running updates')
     for iteration in ProgressIterator(range(self.max_iter), print_every = 1):
         # Update W
         HHT = self.H @ self.H.T
         try:
             HHTinv = np.linalg.inv(HHT)
         except LinAlgError:
             HHTinv = np.linalg.pinv(HHT)
         W = self.input_mat @ self.H.T @ HHTinv
         
         # Update H
         XTW = self.input_mat.T @ W
         WTW = W.T @ W
         frac = ((self.plus(XTW)  + self.H.T @ self.minus(WTW)) / 
                 (self.minus(XTW) + self.H.T @ self.plus(WTW)))
         Hpre = self.H.copy()
         self.H = (self.H.T * np.sqrt(frac)).T
         
         mean_h_change = np.mean(np.abs(self.H - Hpre))
         if mean_h_change < self.threshold:
             nbprint('Converged after {} iterations. (Threshold = {})'.format(iteration+1, self.threshold))
             return
     nbprint('Did not converge after {} iterations with last change {} for threshold {}'.format(self.max_iter, mean_h_change, self.threshold))
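
The H update above relies on self.plus and self.minus, which are not defined in this snippet. In multiplicative update rules of this form they usually denote the elementwise positive and negative parts of a matrix; a minimal sketch under that assumption (these would sit on the same class as _run):

import numpy as np

def plus(self, A):
    # Elementwise positive part: max(A, 0), written as (|A| + A) / 2
    return (np.abs(A) + A) / 2

def minus(self, A):
    # Elementwise negative part: max(-A, 0), written as (|A| - A) / 2
    return (np.abs(A) - A) / 2
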
Example #12
def make_phrase_mat(info, runvars):
    model = get_model(info)
    embedding_function = model.embedding_function
    batch = []
    batchsize = 0
    min_batchsize = 4096
    current_idx = 0

    # Count documents
    num_documents = 0
    with data.document_reader(info) as documents:
        for document in ProgressIterator(documents, 'Counting Documents'):
            num_documents += 1

    # Create a zero matrix
    phrase_mat_shape = (model.vector_size, num_documents)
    phrase_mat = np.zeros(phrase_mat_shape)

    with data.document_reader(info) as documents:
        progress_iterator = ProgressIterator(documents,
                                             'Vectorizing Documents')
        for document in progress_iterator:
            batch.append(document['text'])
            batchsize += 1

            if batchsize >= min_batchsize:
                phrase_mat[:, current_idx:current_idx +
                           batchsize] = embedding_function(batch)
                current_idx += batchsize
                batchsize = 0
                batch = []

        if batchsize > 0:
            phrase_mat[:, current_idx:current_idx +
                       batchsize] = embedding_function(batch)
    runvars['phrase_mat'] = phrase_mat
Example #13
    def _wenmf(self):
        self.errors = []
        self.Ht = normalize(self.H, axis=0).T

        for iteration in ProgressIterator(range(self.max_iter), print_every=1):
            HHT = np.dot(self.Ht.T, self.Ht)
            W_old = np.copy(self.W)
            for w in range(self.W_update_num):
                for r in range(self.num_topics):
                    hr = self.Ht[:, r]
                    idx = [i for i in range(self.num_topics) if i != r]
                    wr = 1 / HHT[r, r] * (self.input_mat @ hr -
                                          self.W[:, idx] @ HHT[idx, r])
                    wr = self._iter_w_update(r, wr)
                    wr = np.maximum(wr, self.eps).T
                    self.W[:, r] = wr
            mean_w_change = np.mean(np.abs((self.W - W_old) / W_old))

            VTVW = np.dot(self.v_mat.T, np.dot(self.v_mat, self.W))
            WTVTVW = np.dot(self.W.T, VTVW)
            Ht_old = np.copy(self.Ht)
            for h in range(self.H_update_num):
                for r in range(self.num_topics):
                    VTVwr = VTVW[:, r]
                    idx = [i for i in range(self.num_topics) if i != r]
                    hr = 1 / WTVTVW[r, r] * (
                        VTVwr.T
                        @ self.input_mat).T - self.Ht[:, idx] @ WTVTVW[idx, r]
                    hr = np.maximum(hr, self.eps)
                    self.Ht[:, r] = hr
            mean_h_change = np.mean(np.abs((self.Ht - Ht_old) / Ht_old))

            if self.log_error or self.print_error:
                VX = self.v_mat @ self.input_mat
                VW = self.v_mat @ self.W
                err = np.linalg.norm(VX - VW @ self.Ht.T) / np.linalg.norm(VX)
                if self.print_error:
                    nbprint('Error: {}'.format(err))
                if self.log_error:
                    self.errors.append(err)
            nbprint('mean_w_change={}, mean_h_change={} ({})'.format(
                mean_w_change, mean_h_change, self.threshold))
            if iteration + 1 >= self.min_iter and mean_w_change < self.threshold and mean_h_change < self.threshold:
                nbprint(
                    'Converged after {} iterations. (threshold = {})'.format(
                        iteration + 1, self.threshold))
                break
        self.H = self.Ht.T
Example #14
def count_tokens(info, runvars):
    rawcounts = {}
    num_docs = 0
    with data.tokenized_document_reader(info) as documents:
        for document in ProgressIterator(documents, 'Counting Tokens'):
            num_docs += 1
            tokens = split_tokens(document['tokens'])
            for token in tokens:
                try:
                    rawcounts[token].increase_total()
                except KeyError:
                    rawcounts[token] = VocabItem(token, total=1)
            for token in set(tokens):
                rawcounts[token].increase_document()
    runvars['rawcounts'] = rawcounts
    runvars['num_docs'] = num_docs
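
count_tokens assumes a VocabItem class that tracks a total count and a document frequency per token; its definition is not part of this page. A minimal sketch matching the calls used above (hypothetical, not the actual implementation):

class VocabItem:
    # Hypothetical minimal counterpart to the calls in count_tokens above.
    def __init__(self, token, total=0, document=0):
        self.token = token
        self.total = total          # total number of occurrences
        self.document = document    # number of documents containing the token

    def increase_total(self):
        self.total += 1

    def increase_document(self):
        self.document += 1
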
Example #15
 def build_lookup(self, vocab):
     self.lookup = {}
     for token in ProgressIterator(vocab, print_every=10000):
         words = token.replace("_"," ").split()
         if len(words) == 0:
             continue
         current_lookup = self.lookup
         for word in words:
             lower_word = word.lower()
             if not lower_word in current_lookup:
                 current_lookup[lower_word] = {}
             current_lookup = current_lookup[lower_word]
         if exists_token in current_lookup:
             current_lookup[exists_token].append(token)
         else:
             current_lookup[exists_token] = [token,]
Example #16
    def load_classes(self, file):
        self.valid_classes = ClassInfo()
        min_length = self.info['data_info']['min_length']
        cr = csv.reader(file)
        next(cr)  # skip the header row
        for row in ProgressIterator(cr):
            classname = row[2]
            text = row[5]

            if len(text) >= min_length:
                self.valid_classes.increase_class_count(classname)
        min_class_size = self.info['data_info']['min_class_size']
        self.valid_classes = [
            c['info'] for c in self.valid_classes.make_class_list()
            if c['count'] > min_class_size
        ]
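
ClassInfo (also used by the importer examples above) appears to map class names to an id and a running document count, with increase_class_count returning the id and make_class_list exposing the counts. A minimal sketch consistent with that usage, not the actual implementation:

class ClassInfo:
    # Hypothetical minimal counterpart to the usage on this page.
    def __init__(self):
        self.classes = {}   # classname -> [class_id, count]

    def increase_class_count(self, classname):
        if classname not in self.classes:
            self.classes[classname] = [len(self.classes) + 1, 0]
        self.classes[classname][1] += 1
        return self.classes[classname][0]

    def make_class_list(self):
        return [{'info': name, 'count': count}
                for name, (class_id, count) in self.classes.items()]
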
Example #17
    def parse_files(self, jsonfile):
        nbprint("Loading documents")
        for line in ProgressIterator(jsonfile):

            tweet = json.loads(line)
            text = tweet["full_text"]

            id = int(tweet["id_str"])  # the numeric "id" field is rounded/incorrect, so use "id_str"
            classname = self.id_to_classname[id]

            if (self.max_docs_per_cls is not None
                    and self.classinfo.classes.get(
                        classname, (0, 0))[1] >= self.max_docs_per_cls):
                continue
            else:
                class_id = self.classinfo.increase_class_count(classname)
                self.docinfo.add_document(text, class_id)
Example #18
    def _pre_algorithm(self):
        # Load the embeddings
        embedding_model = get_model(self.info)
        embeddings = embedding_model.get_embeddings()
        vector_size = embedding_model.vector_size()

        # Load the vocab
        vocab = data.load_vocab_list(self.info)

        # construct V
        v_shape = (vector_size, len(vocab))
        self.v_mat = np.zeros(v_shape)
        for idx, token in enumerate(vocab):
            try:
                self.v_mat[:, idx] = embeddings[token]
            except KeyError:
                # Token has no embedding vector; leave its column at zero
                pass

        # find elements in the nullspace of VTV
        if self.null is not None:
            nbprint('Finding {} elements in ker(VTV)'.format(self.num_kernel))
            self.kernelvectors = []
            for i in ProgressIterator(range(2 * self.num_kernel),
                                      print_every=1):
                op = LinearOperator(
                    (len(vocab), len(vocab)),
                    matvec=lambda x: self.v_mat.transpose() @ (self.v_mat @ x))
                try:
                    w, v = eigs(op, k=1, which='SM', maxiter=100)
                    w = np.real(w[0])
                    v = np.real(v[:, 0])
                    if w < 1e-10:
                        v = v / np.sqrt(np.sum(np.square(v)))
                        self.kernelvectors.append(v)
                        if len(self.kernelvectors) >= self.num_kernel:
                            break
                except ArpackNoConvergence:
                    nbprint('eigs did not converge')
            self.v_sums = [np.sum(v) for v in self.kernelvectors]

        # Initialize W and H from NMF
        nbprint('Initial NMF')
        nmf_model = NMFSklearn(self.num_topics, init='nndsvd')
        self.W = np.maximum(nmf_model.fit_transform(self.input_mat), self.eps)
        self.H = np.maximum(nmf_model.components_, self.eps)
Example #19
    def load_id_to_classname(self, folderpath, filename):
        nbprint("Extracting tsv")

        self.id_to_classname = {}
        max_depth = self.info['data_info']['maxdepth']
        tarfilename = join(folderpath, filename + ".tar.bz2")

        with tarfile.open(tarfilename, "r:bz2") as tar:
            tsvfile = tar.extractfile(filename + ".tsv")
            for line in ProgressIterator(tsvfile):
                fields = line.decode().split()
                id = int(fields[0])
                classname = fields[3]

                classname = classname.strip("*")
                classhierarchy = classname.split("/")
                classhierarchy = classhierarchy[1:max_depth + 1]
                classname = "/".join(classhierarchy)

                self.id_to_classname[id] = classname
Example #20
    def load_documents(self):
        for file in ProgressIterator(reuters.fileids(), doc_progress_label):
            categories = reuters.categories(file)
            if len(categories) > 1:
                continue
            classname = categories[0]
            if not classname in self.valid_classes:
                continue
            class_id = self.classinfo.increase_class_count(classname)

            text = " ".join(reuters.words(file))
            text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
            text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)
            text = re.sub(r" \.", ".", text)
            text = re.sub(r" \.", ".", text)
            text = re.sub(r" \,", ",", text)
            text = re.sub(r" \)", ")", text)
            text = re.sub(r"\( ", "(", text)
            text = re.sub(r" \' ", "'", text)

            self.docinfo.add_document(text, class_id)
Example #21
def make_cbow_mat_minmaxmean(info, runvars):
    # Get count matrix
    count_mat = runvars['term_doc_mat_count']
    
    # Load vocabulary and wordembedding vectors
    vocab_list = data.load_vocab_list(info)
    model = get_model(info)
    embedding_function = model.embedding_function
    
    # Create a zero matrix
    cbow_m_shape = (model.vector_size, count_mat.shape[1])
    cbow_min = np.full(cbow_m_shape, np.inf)
    cbow_max = np.full(cbow_m_shape, -np.inf)
    cbow_mean = np.zeros(cbow_m_shape)
    column_sum = np.zeros(count_mat.shape[1])
    
    # Iterate over all nonzero entries of the count matrix:
    nonzeros = zip(*sparse.find(count_mat))
    for token_idx, doc_idx, value in ProgressIterator(nonzeros, length = count_mat.nnz, print_every = 5000):
        try:
            embedding_vector = embedding_function(vocab_list[token_idx])
        except OOVException:
            continue
        # Entry wise minimum with the embedding vector
        cbow_min[:,doc_idx] = np.minimum(cbow_min[:,doc_idx], embedding_vector)
        # Entry wise maximum with the embedding vector
        cbow_max[:,doc_idx] = np.maximum(cbow_max[:,doc_idx], embedding_vector)
        # Sum up all embedding vectors and the total number of tokens in the document
        cbow_mean[:,doc_idx] = cbow_mean[:,doc_idx] + value * embedding_vector
        column_sum[doc_idx] = column_sum[doc_idx] + value
        
    # Divide sum by number of tokens
    cbow_mean = cbow_mean * sparse.diags(1/np.maximum(1, column_sum))
    
    # Stack all matrices and return
    cbow_mat = np.vstack((cbow_min,cbow_max,cbow_mean))
    cbow_mat[np.invert(np.isfinite(cbow_mat))] = 0
    runvars['cbow_mat'] = cbow_mat
Example #22
def classification_metrics():
    metric_fcts = load_metric_fcts('classification')
    classification_data = data.load_metric_data('classification')

    h_mat_infos = data.get_all_h_mat_infos(labeled_only=True)
    for info in ProgressIterator(h_mat_infos, print_every=1):
        nbprint(info)
        # Grab the corresponding entry from classification data
        metric_data_entry = grab_metric_data_entry(classification_data, info)

        # Iterate all metric functions and store result in entry
        for metric_id, metric_fct in metric_fcts.items():
            # Skip metric if it already exists:
            if metric_id in metric_data_entry:
                continue

            # Compute the metric
            labels_true = load_ground_truth_classes(info)
            h_mat = data.load_h_mat(info)
            metric_data_entry[metric_id] = metric_fct(labels_true, h_mat)

            # Save everything in between
            data.save_metric_data(classification_data, 'classification')
Example #23
def fv_build_mat(info, runvars):    
    # Get matrices
    count_mat = runvars['term_doc_mat_count']
    mean_vec = runvars['mean_vec']
    var_vec = runvars['var_vec']
    
    # Load vocabulary and wordembedding vectors
    vocab_list = data.load_vocab_list(info)
    model = get_model(info)
    embedding_function = model.embedding_function
    
    # Create a zero matrix
    dimension = model.vector_size
    fv_m_shape = (dimension*2, count_mat.shape[1])
    fv_mat = np.zeros(fv_m_shape)
    fv_num_tokens_shape = (1, count_mat.shape[1])
    fv_num_tokens = np.zeros(fv_num_tokens_shape)
    
    # iterate all nonzero entries
    nonzeros = zip(*sparse.find(count_mat))
    for token_idx, doc_idx, value in ProgressIterator(nonzeros, length = count_mat.nnz, print_every = 5000):
        try:
            embedding_vector = embedding_function(vocab_list[token_idx])
        except OOVException:
            continue
        fv_mat[:dimension, doc_idx] += value * (embedding_vector - mean_vec) / var_vec
        fv_mat[dimension:, doc_idx] += value * (np.square(embedding_vector - mean_vec) / (var_vec * np.sqrt(var_vec)) - (1 / np.sqrt(var_vec)))
        fv_num_tokens[0,doc_idx] += value
        
    # normalize
    fv_num_tokens[fv_num_tokens == 0] = 1
    fv_mat *= np.power(fv_num_tokens, -0.5)
    fv_mat[:dimension,:] = (fv_mat[:dimension,:].transpose() * np.nan_to_num(np.power(1 / var_vec, -0.5))).transpose()
    fv_mat[dimension:,:] = (fv_mat[dimension:,:].transpose() * np.nan_to_num(np.power(2 / var_vec, -0.5))).transpose()
        
    runvars['cbow_mat'] = fv_mat
Example #24
 def load_documents(self):
     for filename in ProgressIterator(self.files, doc_progress_label):
         classname = filename.split(".")[0]
         class_id = self.classinfo.increase_class_count(classname)
         text = self.load_file(filename)
         self.docinfo.add_document(text, class_id)