Code Example #1
File: format_20ng_data.py Project: anukat2015/ARKcat
def export_20ng(remove_headers=False, remove_footers=False, remove_quotes=False, categories=None):
    output_dir = os.path.join('..', 'datasets', '20ng', 'data')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    remove = []
    if remove_headers:
        remove.append('headers')
    if remove_footers:
        remove.append('footers')
    if remove_quotes:
        remove.append('quotes')

    print(categories)

    ng_train = fetch_20newsgroups(subset='train', remove=remove, categories=categories)
    keys = ['train' + str(i) for i in range(len(ng_train.data))]
    print(len(keys))
    train_text = dict(zip(keys, ng_train.data))
    fh.write_to_json(train_text, os.path.join(output_dir, 'train.json'))

    train_labels = pd.DataFrame(ng_train.target, columns=['target'], index=keys)
    train_labels.to_csv(os.path.join(output_dir, 'train.csv'))
    print(train_labels.shape)

    ng_test = fetch_20newsgroups(subset='test', remove=remove, categories=categories)
    keys = ['test' + str(i) for i in range(len(ng_test.data))]
    test_text = dict(zip(keys, ng_test.data))
    fh.write_to_json(test_text, os.path.join(output_dir, 'test.json'))

    test_labels = pd.DataFrame(ng_test.target, columns=['target'], index=keys)
    test_labels.to_csv(os.path.join(output_dir, 'test.csv'))
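
Every example in this listing calls helpers from an fh (file handling) module that the snippets themselves do not show. As a point of reference, here is a minimal sketch of what fh.write_to_json and fh.read_json are assumed to look like, treating them as thin wrappers around the standard json module; the actual helpers in these projects may differ (e.g. in encoding, indentation, or default arguments).

import codecs
import json


def write_to_json(data, output_filename, indent=2, sort_keys=True):
    # assumed helper: serialize a Python object to a UTF-8 JSON file
    with codecs.open(output_filename, 'w', encoding='utf-8') as output_file:
        json.dump(data, output_file, indent=indent, sort_keys=sort_keys)


def read_json(input_filename):
    # assumed helper: load a JSON file back into a Python object
    with codecs.open(input_filename, 'r', encoding='utf-8') as input_file:
        return json.load(input_file)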
Code Example #2
    def extract_features(self, source, write_to_file=True, vocab_source=None):
        print "Extracting ngram tokens"

        # read in a dict of {document_key: text}
        data = fh.read_json(source)
        all_items = data.keys()

        tokens = self.extract_tokens_from_text(data)

        if vocab_source is None:
            vocab = self.make_vocabulary(tokens, all_items)
            vocab.prune(self.min_df)
            self.vocab = vocab
        else:
            vocab = self.load_vocabulary(vocab_source)
            self.vocab = vocab

        feature_counts, index = self.extract_feature_counts(all_items, tokens, vocab)

        if write_to_file:
            vocab.write_to_file(self.get_vocab_filename())
            fh.write_to_json(index, self.get_index_filename(), sort_keys=False)
            fh.pickle_data(feature_counts, self.get_feature_filename())

        self.feature_counts = feature_counts
        self.index = index
        self.column_names = np.array(self.vocab.index2token)
        self.do_transformations()
Code Example #3
    def write_to_file(self, filename):
        json_obj = {
            'index2token': self.index2token,
            'counts': self.counts,
            'doc_counts': self.doc_counts
        }
        fh.write_to_json(json_obj, filename, sort_keys=False)
Code Example #4
def preprocess_data(train_infile, test_infile, output_dir, vocab_size, use_mallet_stopwords=False, replace_num=False, lemmatize=False, log_transform=False, keep_nonalphanum=False, only_alpha=False, min_length=1):

    print("Loading SpaCy")
    parser = English()
    train_X, train_vocab, train_indices, train_y, label_list, word_freqs, train_dat, train_mallet_strings, train_sage_output, train_svm_strings = load_and_process_data(train_infile, vocab_size, parser, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, lemmatize=lemmatize, log_transform=log_transform, keep_nonalphanum=keep_nonalphanum, only_alpha=only_alpha, min_length=min_length)
    test_X, _, test_indices, test_y, _, _, test_dat, test_mallet_strings, test_sage_output, test_svm_strings = load_and_process_data(test_infile, vocab_size, parser, vocab=train_vocab, label_list=label_list, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, lemmatize=lemmatize, log_transform=log_transform, keep_nonalphanum=keep_nonalphanum, only_alpha=only_alpha, min_length=min_length)
    fh.save_sparse(train_X, os.path.join(output_dir, 'train.npz'))
    fh.write_to_json(train_vocab, os.path.join(output_dir, 'train.vocab.json'))
    fh.write_to_json(train_indices, os.path.join(output_dir, 'train.indices.json'))
    fh.save_sparse(train_y, os.path.join(output_dir, 'train.labels.npz'))
    fh.save_sparse(test_X, os.path.join(output_dir, 'test.npz'))
    fh.write_to_json(test_indices, os.path.join(output_dir, 'test.indices.json'))
    fh.save_sparse(test_y, os.path.join(output_dir, 'test.labels.npz'))
    n_labels = len(label_list)
    label_dict = dict(zip(range(n_labels), label_list))
    fh.write_to_json(label_dict, os.path.join(output_dir, 'train.label_list.json'))
    fh.write_to_json(list(word_freqs.tolist()), os.path.join(output_dir, 'train.word_freq.json'))

    # save output for David Blei's lda-c code
    fh.write_list_to_text(train_dat, os.path.join(output_dir, 'train.dat'))
    fh.write_list_to_text(test_dat, os.path.join(output_dir, 'test.dat'))

    # save output for Mallet
    fh.write_list_to_text(train_mallet_strings, os.path.join(output_dir, 'train.mallet.txt'))
    fh.write_list_to_text(test_mallet_strings, os.path.join(output_dir, 'test.mallet.txt'))

    # save output for Jacob Eisenstein's SAGE code:
    train_sage_output['te_data'] = test_sage_output['tr_data']
    train_sage_output['te_aspect'] = test_sage_output['tr_aspect']
    savemat(os.path.join(output_dir, 'sage.mat'), train_sage_output)

    # save output in SVM format
    fh.write_list_to_text(train_svm_strings, os.path.join(output_dir, 'train.svm.txt'))
    fh.write_list_to_text(test_svm_strings, os.path.join(output_dir, 'test.svm.txt'))
Code Example #5
def main():
    usage = "%prog infiles.jsonlist[.gz]"
    parser = OptionParser(usage=usage)
    parser.add_option('-o',
                      dest='output_dir',
                      default='output',
                      help='Output_dir: default=%default')
    parser.add_option('-w',
                      dest='target_word',
                      default='mass shooting',
                      help='Target word: default=%default')

    (options, args) = parser.parse_args()
    infiles = args
    output_dir = options.output_dir
    target_word = options.target_word

    if not os.path.exists(output_dir):
        fh.makedirs(output_dir)

    n_articles_per_day = defaultdict(int)
    target_count_per_day = defaultdict(int)
    for f in infiles:
        print(f)
        articles = fh.read_jsonlist(f)
        print(len(articles))
        for i, article in enumerate(articles):
            if i % 10000 == 0 and i > 0:
                print(i)
            year = int(article['year'])
            month = int(article['month'])
            day = int(article['day'])
            date = datetime.date(year=year, month=month, day=day)
            ordinal_date = date.toordinal()
            n_articles_per_day[ordinal_date] += 1
            text = ''
            if 'headline' in article:
                text += article['headline'] + '\n'
            if 'body' in article:
                text += article['body']
            if 'text' in article:
                text += article['text']

            text = ' ' + clean_text(text, lower=True) + ' '
            if target_word in text:
                if 'film' not in text and 'game' not in text:
                    target_count_per_day[ordinal_date] += 1

    fh.write_to_json(n_articles_per_day,
                     os.path.join(output_dir, 'articles_per_day.json'))
    fh.write_to_json(target_count_per_day,
                     os.path.join(output_dir, 'target_counts_per_day.json'))
Code Example #6
def save_weights(output_dir, beta, bg, feature_names, sparsity_threshold=1e-5):
    np.savez(os.path.join(output_dir, 'beta.npz'), beta=beta)
    if bg is not None:
        np.savez(os.path.join(output_dir, 'bg.npz'), bg=bg)
    fh.write_to_json(feature_names, os.path.join(output_dir, 'vocab.json'), sort_keys=False)

    topics_file = os.path.join(output_dir, 'topics.txt')
    lines = []
    for i in range(len(beta)):
        order = list(np.argsort(beta[i]))
        order.reverse()
        pos_words = [feature_names[j] for j in order[:100] if beta[i][j] > sparsity_threshold]
        output = ' '.join(pos_words)
        lines.append(output)

    fh.write_list_to_text(lines, topics_file)
Code Example #7
def export_20ng(remove_headers=False,
                remove_footers=False,
                remove_quotes=False,
                categories=None):
    output_dir = os.path.join('..', 'datasets', '20ng', 'data')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    remove = []
    if remove_headers:
        remove.append('headers')
    if remove_footers:
        remove.append('footers')
    if remove_quotes:
        remove.append('quotes')

    print(categories)

    ng_train = fetch_20newsgroups(subset='train',
                                  remove=remove,
                                  categories=categories)
    keys = ['train' + str(i) for i in range(len(ng_train.data))]
    print(len(keys))
    train_text = dict(zip(keys, ng_train.data))
    fh.write_to_json(train_text, os.path.join(output_dir, 'train.json'))

    train_labels = pd.DataFrame(ng_train.target,
                                columns=['target'],
                                index=keys)
    train_labels.to_csv(os.path.join(output_dir, 'train.csv'))
    print(train_labels.shape)

    ng_test = fetch_20newsgroups(subset='test',
                                 remove=remove,
                                 categories=categories)
    keys = ['test' + str(i) for i in range(len(ng_test.data))]
    test_text = dict(zip(keys, ng_test.data))
    fh.write_to_json(test_text, os.path.join(output_dir, 'test.json'))

    test_labels = pd.DataFrame(ng_test.target, columns=['target'], index=keys)
    test_labels.to_csv(os.path.join(output_dir, 'test.csv'))
Code Example #8
    def extract_features(self, source, write_to_file=True):
        print "Extracting ngram tokens:"

        # read in a dict of {document_key: text}
        data = fh.read_json(source)
        all_items = data.keys()

        tokens = self.extract_tokens_from_file(data, self.get_n())

        vocab = self.make_vocabulary(tokens, all_items)

        feature_counts, oov_counts = self.extract_feature_counts(all_items, tokens, vocab)

        if write_to_file:
            vocab.write_to_file(self.get_vocab_filename())
            fh.write_to_json(all_items, self.get_index_filename(), sort_keys=False)
            fh.pickle_data(feature_counts, self.get_feature_filename())
            fh.write_to_json(oov_counts, self.get_oov_count_filename(), sort_keys=False)

        self.feature_counts = feature_counts
        self.index = all_items
        self.vocab = vocab
Code Example #9
def preprocess_data(train_infile, output_dir, vocab_size, label_type, test_prop, use_mallet_stopwords=False, replace_num=False, group_size=1, only_alpha=False, min_length=3):

    print("Loading SpaCy")
    parser = English()

    with codecs.open(train_infile, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    n_items = len(lines)
    n_test = int(test_prop * n_items)
    n_train = n_items - n_test
    train_indices = np.random.choice(range(n_items), n_train, replace=False)
    test_indices = list(set(range(n_items)) - set(train_indices))

    train_X, train_vocab, train_indices, train_y, label_list, word_freqs, train_dat, train_mallet_strings, train_sage_output, train_svm_strings, label_index = load_and_process_data(train_infile, vocab_size, parser, label_type, train_indices, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, group_size=group_size, only_alpha=only_alpha, min_length=min_length)
    test_X, _, test_indices, test_y, _, _, test_dat, test_mallet_strings, test_sage_output, test_svm_strings, _ = load_and_process_data(train_infile, vocab_size, parser, label_type, test_indices, vocab=train_vocab, label_list=label_list, label_index=label_index, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, group_size=group_size, only_alpha=only_alpha, min_length=min_length)
    fh.save_sparse(train_X, os.path.join(output_dir, 'train.npz'))
    fh.write_to_json(train_vocab, os.path.join(output_dir, 'train.vocab.json'))
    fh.write_to_json(train_indices, os.path.join(output_dir, 'train.indices.json'))
    fh.save_sparse(train_y, os.path.join(output_dir, 'train.labels.npz'))
    fh.save_sparse(test_X, os.path.join(output_dir, 'test.npz'))
    fh.write_to_json(test_indices, os.path.join(output_dir, 'test.indices.json'))
    fh.save_sparse(test_y, os.path.join(output_dir, 'test.labels.npz'))
    fh.write_to_json(list(word_freqs.tolist()), os.path.join(output_dir, 'train.word_freq.json'))
    fh.write_list_to_text(train_dat, os.path.join(output_dir, 'train.dat'))
    n_labels = len(label_list)
    label_dict = dict(zip(range(n_labels), label_list))
    fh.write_to_json(label_dict, os.path.join(output_dir, 'train.label_list.json'))

    fh.write_list_to_text(train_mallet_strings, os.path.join(output_dir, 'train.mallet.txt'))
    fh.write_list_to_text(test_mallet_strings, os.path.join(output_dir, 'test.mallet.txt'))

    train_sage_output['te_data'] = test_sage_output['tr_data']
    train_sage_output['te_aspect'] = test_sage_output['tr_aspect']
    savemat(os.path.join(output_dir, 'sage.mat'), train_sage_output)

    fh.write_list_to_text(train_svm_strings, os.path.join(output_dir, 'train.svm.txt'))
    fh.write_list_to_text(test_svm_strings, os.path.join(output_dir, 'test.svm.txt'))
Code Example #10
    def write_to_file(self, filename):
        fh.write_to_json(self.index2token, filename, sort_keys=False)
Code Example #11
               
               'accuracy_value': float(acc['value']),
               'accuracy_epoch': int(acc.get('epoch', 0)),
            },
            index=[run],
        )

        results.to_csv(
            Path(run_args.o, "dev_metrics.csv"),
            mode='a',
            header=run==0, # only save header for the first run
        )

        if run_args.store_all:
            seed_path = Path(run_args.o, str(seed))
            if not seed_path.exists():
                seed_path.mkdir()
            for fpath in Path(run_args.o).glob("*"):
                if fpath.name not in ['torch_model.pt', 'dev_metrics.csv'] and fpath.is_file():
                    shutil.copyfile(fpath, Path(seed_path, fpath.name))

        # stop entirely if run was very bad
        if npmi['value'] < run_args.min_acceptable_npmi:
            with open(Path(run_args.o, "stopped_due_to_low_npmi.txt"), "w") as outfile:
                outfile.write("")
            print(f"Stopped: NPMI of {npmi['value']:0.4f} < {run_args.min_acceptable_npmi}")
            break

    # Save the arguments
    fh.write_to_json(checkpoint["options"].__dict__, Path(run_args.o, "args.json"))
Code Example #12
def preprocess_data(train_infile, test_infile, output_dir, train_prefix, test_prefix, min_doc_count=0, max_doc_freq=1.0, vocab_size=None, stopwords=None, tokenized=False, keep_num=False, keep_alphanum=False, strip_html=False, lower=True, min_length=3, label_fields=None):

    if stopwords == 'mallet':
        print("Using Mallet stopwords")
        stopword_list = fh.read_text(os.path.join('stopwords', 'mallet_stopwords.txt'))
    elif stopwords == 'snowball':
        print("Using snowball stopwords")
        stopword_list = fh.read_text(os.path.join('stopwords', 'snowball_stopwords.txt'))
    elif stopwords is not None:
        print("Using custom stopwords")
        stopword_list = fh.read_text(os.path.join('stopwords', stopwords + '_stopwords.txt'))
    else:
        stopword_list = []
    stopword_set = {s.strip() for s in stopword_list}

    print("Reading data files")
    train_items = fh.read_jsonlist(train_infile)
    n_train = len(train_items)
    print("Found {:d} training documents".format(n_train))

    if test_infile is not None:
        test_items = fh.read_jsonlist(test_infile)
        n_test = len(test_items)
        print("Found {:d} test documents".format(n_test))
    else:
        test_items = []
        n_test = 0

    all_items = train_items + test_items
    n_items = n_train + n_test

    label_lists = {}
    if label_fields is not None:
        if ',' in label_fields:
            label_fields = label_fields.split(',')
        else:
            label_fields = [label_fields]
        for label_name in label_fields:
            label_set = set()
            for i, item in enumerate(all_items):
                if label_name is not None:
                    label_set.add(item[label_name])
            label_list = list(label_set)
            label_list.sort()
            n_labels = len(label_list)
            print("Found label %s with %d classes" % (label_name, n_labels))
            label_lists[label_name] = label_list
    else:
        label_fields = []

    # make vocabulary
    train_parsed = []
    test_parsed = []

    print("Parsing %d documents" % n_items)
    if (tokenized):
        print("Using tokenized_text element as the input")
    word_counts = Counter()
    doc_counts = Counter()
    count = 0

    vocab = None
    for i, item in enumerate(all_items):
        if i % 1000 == 0 and count > 0:
            print(i)

        if (tokenized):
            text = item['tokenized_text']
            tokens = text.split()
        else:
            text = item['text']
            tokens, _ = tokenize(text, strip_html=strip_html, lower=lower, keep_numbers=keep_num, keep_alphanum=keep_alphanum, min_length=min_length, stopwords=stopword_set, vocab=vocab)

        # store the parsed documents
        if i < n_train:
            train_parsed.append(tokens)
        else:
            test_parsed.append(tokens)

        # keep track of the number of documents with each word
        word_counts.update(tokens)
        doc_counts.update(set(tokens))

    print("Size of full vocabulary=%d" % len(word_counts))

    print("Selecting the vocabulary")
    most_common = doc_counts.most_common()
    words, doc_counts = zip(*most_common)
    doc_freqs = np.array(doc_counts) / float(n_items)
    vocab = [word for i, word in enumerate(words) if doc_counts[i] >= min_doc_count and doc_freqs[i] <= max_doc_freq]
    most_common = [word for i, word in enumerate(words) if doc_freqs[i] > max_doc_freq]
    if max_doc_freq < 1.0:
        print("Excluding words with frequency > {:0.2f}:".format(max_doc_freq), most_common)

    print("Vocab size after filtering = %d" % len(vocab))
    if vocab_size is not None:
        if len(vocab) > int(vocab_size):
            vocab = vocab[:int(vocab_size)]

    vocab_size = len(vocab)
    print("Final vocab size = %d" % vocab_size)

    print("Most common words remaining:", ' '.join(vocab[:10]))
    vocab.sort()

    fh.write_to_json(vocab, os.path.join(output_dir, train_prefix + '.vocab.json'))

    train_X_sage, tr_aspect, tr_no_aspect, tr_widx, vocab_for_sage = process_subset(train_items, train_parsed, label_fields, label_lists, vocab, output_dir, train_prefix)
    if n_test > 0:
        test_X_sage, te_aspect, te_no_aspect, _, _= process_subset(test_items, test_parsed, label_fields, label_lists, vocab, output_dir, test_prefix)

    train_sum = np.array(train_X_sage.sum(axis=0))
    print("%d words missing from training data" % np.sum(train_sum == 0))

    if n_test > 0:
        test_sum = np.array(test_X_sage.sum(axis=0))
        print("%d words missing from test data" % np.sum(test_sum == 0))

    sage_output = {'tr_data': train_X_sage, 'tr_aspect': tr_aspect, 'widx': tr_widx, 'vocab': vocab_for_sage}
    if n_test > 0:
        sage_output['te_data'] = test_X_sage
        sage_output['te_aspect'] = te_aspect
    savemat(os.path.join(output_dir, 'sage_labeled.mat'), sage_output)
    sage_output['tr_aspect'] = tr_no_aspect
    if n_test > 0:
        sage_output['te_aspect'] = te_no_aspect
    savemat(os.path.join(output_dir, 'sage_unlabeled.mat'), sage_output)

    print("Done!")
Code Example #13
def main():
    usage = "%prog infile.txt output_dir output_prefix"
    parser = OptionParser(usage=usage)
    parser.add_option(
        '-m',
        dest='max_lines',
        default=None,
        help=
        'Quit after processing this many lines (documents): default=%default')
    #parser.add_option('--lower', action="store_true", dest="lower", default=False,
    #                  help='Lower case words: default=%default')

    (options, args) = parser.parse_args()

    infile = args[0]
    output_dir = args[1]
    output_prefix = args[2]

    max_lines = options.max_lines
    if max_lines is not None:
        max_lines = int(max_lines)

    vocab = []
    vocab_index = {}

    counter = Counter()

    # start by converting each document into a dict of word counts, building a vocab as we go
    rows = []
    cols = []
    values = []
    n_docs = 0
    print("Counting words...")
    with codecs.open(infile, 'r', encoding='utf-8') as f:
        for line_i, line in enumerate(f):
            line = line.strip()
            if len(line) > 0:
                if max_lines is not None and line_i >= max_lines:
                    print("Quitting after processing %d lines" % (line_i + 1))
                    break
                if n_docs % 1000 == 0 and n_docs > 0:
                    print(n_docs)
                # split on white space
                words = line.split()
                # filter out everything that's not just letters, and lower case
                words = [
                    word.lower() for word in words
                    if re.match('^[a-zA-Z]*$', word) is not None
                ]
                # look for new words and add them to the vocabulary
                new_words = [word for word in words if word not in vocab_index]
                if len(new_words) > 0:
                    vocab_size = len(vocab)
                    #print("Adding %d words to vocab" % len(new_words))
                    #print("New total should be %d" % (vocab_size + len(new_words)))
                    vocab.extend(new_words)
                    vocab_index.update(
                        dict(
                            zip(new_words,
                                range(vocab_size,
                                      vocab_size + len(new_words)))))
                indices = [vocab_index[word] for word in words]
                counter.clear()
                counter.update(indices)
                keys = counter.keys()
                counts = counter.values()
                rows.extend([line_i] * len(keys))
                cols.extend(keys)
                values.extend(counts)
                n_docs += 1

    print("Processed %d documents" % n_docs)
    print("Size of final vocab = %d" % len(vocab))
    print("Saving counts...")

    # now convert these count vectors in to a giant sparse matrix
    counts = sparse.coo_matrix((values, (rows, cols)),
                               shape=(n_docs, len(vocab)))
    fh.save_sparse(counts, os.path.join(output_dir, output_prefix + '.npz'))
    fh.write_to_json(vocab,
                     os.path.join(output_dir, output_prefix + '.vocab.json'))
    print("Done")
Code Example #14
def preprocess_data(train_infile,
                    test_infile,
                    output_dir,
                    train_prefix,
                    test_prefix,
                    min_doc_count=0,
                    max_doc_freq=1.0,
                    vocab_size=None,
                    stopwords=None,
                    keep_num=False,
                    keep_alphanum=False,
                    strip_html=False,
                    min_length=3,
                    label_name=None,
                    output_plaintext=False,
                    bigrams=False):

    if stopwords == 'mallet':
        print("Using Mallet stopwords")
        stopword_list = fh.read_text(
            os.path.join('stopwords', 'mallet_stopwords.txt'))
    elif stopwords == 'snowball':
        print("Using snowball stopwords")
        stopword_list = fh.read_text(
            os.path.join('stopwords', 'snowball_stopwords.txt'))
    elif stopwords is not None:
        print("Using custom stopwords")
        stopword_list = fh.read_text(
            os.path.join('stopwords', stopwords + '_stopwords.txt'))
    else:
        stopword_list = []
    stopword_set = {s.strip() for s in stopword_list}

    print("Reading data files")
    train_items = fh.read_jsonlist(train_infile)
    n_train = len(train_items)

    if test_infile is not None:
        test_items = fh.read_jsonlist(test_infile)
        n_test = len(test_items)
    else:
        test_items = []
        n_test = 0

    all_items = train_items + test_items
    n_items = n_train + n_test

    # determine labels and metadata from data
    metadata_keys = set()
    print("Dealing with labels and metadata")
    # find all the metadata keys present
    for i, item in enumerate(all_items):
        if 'text' not in item:
            print("Text field not found for item %d" % i)
            sys.exit()
        if 'metadata' in item:
            for key in item['metadata'].keys():
                metadata_keys.add(key)

    # only keep the ones that are present everywhere
    if len(metadata_keys) > 0:
        for i, item in enumerate(all_items):
            if 'metadata' not in item:
                print('metadata not found for item %d' % i)
            for key in metadata_keys:
                if key not in item['metadata']:
                    print(
                        'dropping metadata field %s (not found for item %d)' %
                        (key, i))
                    metadata_keys.remove(key)

    metadata_keys = list(metadata_keys)
    metadata_keys.sort()
    if len(metadata_keys) > 0:
        print("Metadata keys:", metadata_keys)

    label_set = set()
    for i, item in enumerate(all_items):
        if label_name is not None:
            label_set.add(item[label_name])

    label_list = list(label_set)
    label_list.sort()
    n_labels = len(label_list)
    if label_name is not None:
        print("Using label %s with %d classes" % (label_name, n_labels))

    # make vocabulary
    train_parsed = []
    test_parsed = []

    print("Parsing %d documents" % n_items)
    word_counts = Counter()
    doc_counts = Counter()
    count = 0
    for i, item in enumerate(all_items):
        if i % 1000 == 0 and count > 0:
            print(i)

        text = item['text']
        tokens = tokenize(text,
                          strip_html,
                          keep_numbers=keep_num,
                          keep_alphanum=keep_alphanum,
                          min_length=min_length,
                          stopwords=stopword_set,
                          bigrams=bigrams)

        # store the parsed documents
        if i < n_train:
            train_parsed.append(tokens)
        else:
            test_parsed.append(tokens)

        # keep track of the number of documents with each word
        word_counts.update(tokens)
        doc_counts.update(set(tokens))

    print("Size of full vocabulary=%d" % len(word_counts))

    print("Selecting the vocabulary")
    most_common = doc_counts.most_common()
    words, doc_counts = zip(*most_common)
    doc_freqs = np.array(doc_counts) / float(n_items)
    vocab = [
        word for i, word in enumerate(words)
        if doc_counts[i] >= min_doc_count and doc_freqs[i] <= max_doc_freq
    ]
    most_common = [
        word for i, word in enumerate(words) if doc_freqs[i] > max_doc_freq
    ]
    print("Excluding most common:", most_common)

    print("Vocab size after filtering = %d" % len(vocab))
    if vocab_size is not None:
        if len(vocab) > int(vocab_size):
            vocab = vocab[:int(vocab_size)]

    vocab_size = len(vocab)
    print("Final vocab size = %d" % vocab_size)

    print("Most common words remaining:", ' '.join(vocab[:10]))
    vocab.sort()

    fh.write_to_json(vocab,
                     os.path.join(output_dir, train_prefix + '.vocab.json'))

    train_X_sage, tr_aspect, tr_no_aspect, tr_widx, vocab_for_sage = process_subset(
        train_items,
        train_parsed,
        label_name,
        label_list,
        vocab,
        metadata_keys,
        output_dir,
        train_prefix,
        output_plaintext=output_plaintext)
    if n_test > 0:
        test_X_sage, te_aspect, te_no_aspect, _, _ = process_subset(
            test_items,
            test_parsed,
            label_name,
            label_list,
            vocab,
            metadata_keys,
            output_dir,
            test_prefix,
            output_plaintext=output_plaintext)

    train_sum = np.array(train_X_sage.sum(axis=0))
    print("%d words missing from training data" % np.sum(train_sum == 0))

    if n_test > 0:
        test_sum = np.array(test_X_sage.sum(axis=0))
        print("%d words missing from test data" % np.sum(test_sum == 0))

    sage_output = {
        'tr_data': train_X_sage,
        'tr_aspect': tr_aspect,
        'widx': tr_widx,
        'vocab': vocab_for_sage
    }
    if n_test > 0:
        sage_output['te_data'] = test_X_sage
        sage_output['te_aspect'] = te_aspect
    savemat(os.path.join(output_dir, 'sage_labeled.mat'), sage_output)
    sage_output['tr_aspect'] = tr_no_aspect
    if n_test > 0:
        sage_output['te_aspect'] = te_no_aspect
    savemat(os.path.join(output_dir, 'sage_unlabeled.mat'), sage_output)
Code Example #15
def process_subset(
    items,
    ids,
    parsed,
    labels,
    label_fields,
    label_lists,
    vocab,
    output_dir,
    output_prefix,
    count_dtype=np.int,
):
    n_items = len(items)
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))

    if not ids or len(ids) != n_items:
        ids = [str(i) for i in range(n_items)]

    # create a label index using string representations
    if labels:
        labels_df = pd.DataFrame.from_records(labels, index=ids)

        for label_field in label_fields:
            labels_df_subset = pd.get_dummies(labels_df[label_field])

            # for any classes not present in the subset, add 0 columns
            # (handles case where classes appear in only one of train or test)
            for category in label_lists[label_field]:
                if category not in labels_df_subset:
                    labels_df_subset[category] = 0

            labels_df_subset.to_csv(
                os.path.join(output_dir,
                             output_prefix + "." + label_field + ".csv"))
            if labels_df[label_field].nunique() == 2:
                labels_df_subset.iloc[:, 1].to_csv(
                    os.path.join(
                        output_dir,
                        output_prefix + "." + label_field + "_vector.csv"),
                    header=[label_field],
                )
            # used later
            label_index = dict(
                zip(labels_df_subset.columns, range(len(labels_df_subset))))
    X = np.zeros([n_items, vocab_size], dtype=count_dtype)

    dat_strings = []
    dat_labels = []
    mallet_strings = []
    fast_text_lines = []

    counter = Counter()
    word_counter = Counter()
    doc_lines = []
    print("Converting to count representations")
    for i, words in enumerate(parsed):
        # get the vocab indices of words that are in the vocabulary
        words = words.split()

        indices = [vocab_index[word] for word in words if word in vocab_index]
        word_subset = [word for word in words if word in vocab_index]

        counter.clear()
        counter.update(indices)
        word_counter.clear()
        word_counter.update(word_subset)

        if len(counter.keys()) > 0:
            # update the counts
            mallet_strings.append(
                str(i) + "\t" + "en" + "\t" + " ".join(word_subset))

            dat_string = str(int(len(counter))) + " "
            dat_string += " ".join([
                str(k) + ":" + str(int(v))
                for k, v in zip(list(counter.keys()), list(counter.values()))
            ])
            dat_strings.append(dat_string)

            # for dat format, assume just one label is given
            if len(label_fields) > 0:
                label = labels[i][label_fields[-1]]
                dat_labels.append(str(label_index[str(label)]))
            values = np.array(list(counter.values()), dtype=count_dtype)
            X[np.ones(len(counter.keys()), dtype=int) * i,
              list(counter.keys())] += values

    # convert to a sparse representation
    sparse_X = sparse.csr_matrix(X)
    fh.save_sparse(sparse_X, os.path.join(output_dir, output_prefix + ".npz"))

    print("Size of {:s} document-term matrix:".format(output_prefix),
          sparse_X.shape)

    fh.write_to_json(ids, os.path.join(output_dir,
                                       output_prefix + ".ids.json"))

    # save output for Mallet
    fh.write_list_to_text(
        mallet_strings, os.path.join(output_dir,
                                     output_prefix + ".mallet.txt"))

    # save output for David Blei's LDA/SLDA code
    fh.write_list_to_text(
        dat_strings, os.path.join(output_dir, output_prefix + ".data.dat"))
    if len(dat_labels) > 0:
        fh.write_list_to_text(
            dat_labels,
            os.path.join(output_dir,
                         output_prefix + "." + label_field + ".dat"),
        )

    # save output for Jacob Eisenstein's SAGE code:
    sparse_X_sage = sparse.csr_matrix(X, dtype=float)
    vocab_for_sage = np.zeros((vocab_size, ), dtype=np.object)
    vocab_for_sage[:] = vocab

    # for SAGE, assume only a single label has been given
    if len(label_fields) > 0:
        # convert array to vector of labels for SAGE
        sage_aspect = (
            np.argmax(np.array(labels_df_subset.values, dtype=float), axis=1) +
            1)
    else:
        sage_aspect = np.ones([n_items, 1], dtype=float)
    sage_no_aspect = np.array([n_items, 1], dtype=float)
    widx = np.arange(vocab_size, dtype=float) + 1

    return sparse_X_sage, sage_aspect, sage_no_aspect, widx, vocab_for_sage
Code Example #16
def preprocess_data(
    train_infile,
    test_infile,
    output_dir,
    train_prefix,
    test_prefix,
    min_doc_count=0,
    max_doc_freq=1.0,
    ngram_range=(1, 1),
    vocab_size=None,
    stopwords=None,
    keep_num=False,
    keep_alphanum=False,
    strip_html=False,
    lower=True,
    min_word_length=3,
    max_doc_length=5000,
    label_fields=None,
    workers=4,
    proc_multiplier=500,
):

    if stopwords == "mallet":
        print("Using Mallet stopwords")
        stopword_list = fh.read_text(
            os.path.join("stopwords", "mallet_stopwords.txt"))
    elif stopwords == "snowball":
        print("Using snowball stopwords")
        stopword_list = fh.read_text(
            os.path.join("stopwords", "snowball_stopwords.txt"))
    elif stopwords is not None:
        print("Using custom stopwords")
        stopword_list = fh.read_text(
            os.path.join("stopwords", stopwords + "_stopwords.txt"))
    else:
        stopword_list = []
    stopword_set = {s.strip() for s in stopword_list}

    print("Reading data files")
    train_items = fh.LazyJsonlistReader(train_infile)
    n_train = len(train_items)
    print("Found {:d} training documents".format(n_train))

    if test_infile is not None:
        test_items = fh.LazyJsonlistReader(test_infile)
        n_test = len(test_items)
        print("Found {:d} test documents".format(n_test))
    else:
        test_items = []
        n_test = 0

    n_items = n_train + n_test

    if label_fields:
        label_lists = {}
        if "," in label_fields:
            label_fields = label_fields.split(",")
        else:
            label_fields = [label_fields]
    if label_fields is None:
        label_fields = []

    # make vocabulary
    train_ids, train_parsed, train_labels = [], [], []
    test_ids, test_parsed, test_labels = [], [], []

    print("Parsing documents")
    word_counts = Counter()
    doc_counts = Counter()

    vocab = None

    # process in blocks
    pool = multiprocessing.Pool(workers)
    chunksize = proc_multiplier * workers

    kwargs = {
        "strip_html": strip_html,
        "lower": lower,
        "keep_numbers": keep_num,
        "keep_alphanum": keep_alphanum,
        "min_length": min_word_length,
        "stopwords": stopword_set,
        "ngram_range": ngram_range,
        "vocab": vocab,
        "label_fields": label_fields,
    }

    # these two loops below do the majority of the preprocessing. unfortunately, without
    # a major refactor, they cannot be turned into generators, so the results of
    # tokenization must be appended to a list, which implies a large
    # memory footprint
    for i, group in enumerate(chunkize(iter(train_items),
                                       chunksize=chunksize)):
        print(f"On training chunk {i} of {len(train_items) // chunksize}",
              end="\r")
        for ids, tokens, labels in pool.imap(partial(_process_item, **kwargs),
                                             group):
            # store the parsed documents
            if ids is not None:
                train_ids.append(ids)
            if labels is not None:
                train_labels.append(labels)
            tokens = tokens[:max_doc_length]

            # keep track of the number of documents with each word
            word_counts.update(tokens)
            doc_counts.update(set(tokens))
            train_parsed.append(" ".join(tokens))  # more efficient storage

    print("Train set processing complete")

    for i, group in enumerate(chunkize(iter(test_items), chunksize=chunksize)):
        print(f"On testing chunk {i} of {len(test_items) // chunksize}",
              end="\r")
        for ids, tokens, labels in pool.imap(partial(_process_item, **kwargs),
                                             group):
            # store the parsed documents
            if ids is not None:
                test_ids.append(ids)
            if labels is not None:
                test_labels.append(labels)
            tokens = tokens[:max_doc_length]

            # keep track of the number of documents with each word
            word_counts.update(tokens)
            doc_counts.update(set(tokens))
            test_parsed.append(" ".join(tokens))  # more efficient storage

    print("Test set processing complete")
    pool.terminate()

    print("Size of full vocabulary=%d" % len(word_counts))

    # store possible label values
    if label_fields:
        labels_df = pd.DataFrame.from_records(train_labels + test_labels)
    for label_name in label_fields:
        label_list = sorted(labels_df[label_name].unique().tolist())
        n_labels = len(label_list)
        print("Found label %s with %d classes" % (label_name, n_labels))
        label_lists[label_name] = label_list

    print("Selecting the vocabulary")
    most_common = doc_counts.most_common()
    words, doc_counts = zip(*most_common)
    doc_freqs = np.array(doc_counts) / float(n_items)
    vocab = [
        word for i, word in enumerate(words)
        if doc_counts[i] >= min_doc_count and doc_freqs[i] <= max_doc_freq
    ]
    most_common = [
        word for i, word in enumerate(words) if doc_freqs[i] > max_doc_freq
    ]
    if max_doc_freq < 1.0:
        print(
            "Excluding words with frequency > {:0.2f}:".format(max_doc_freq),
            most_common,
        )

    print("Vocab size after filtering = %d" % len(vocab))
    if vocab_size is not None:
        if len(vocab) > int(vocab_size):
            vocab = vocab[:int(vocab_size)]

    vocab_size = len(vocab)
    print("Final vocab size = %d" % vocab_size)

    print("Most common words remaining:", " ".join(vocab[:10]))
    vocab.sort()

    fh.write_to_json(vocab,
                     os.path.join(output_dir, train_prefix + ".vocab.json"))

    count_dtype = np.uint16 if max_doc_length < np.iinfo(
        np.uint16).max else np.int

    train_X_sage, tr_aspect, tr_no_aspect, tr_widx, vocab_for_sage = process_subset(
        train_items,
        train_ids,
        train_parsed,
        train_labels,
        label_fields,
        label_lists,
        vocab,
        output_dir,
        train_prefix,
        count_dtype=count_dtype,
    )
    if n_test > 0:
        test_X_sage, te_aspect, te_no_aspect, _, _ = process_subset(
            test_items,
            test_ids,
            test_parsed,
            test_labels,
            label_fields,
            label_lists,
            vocab,
            output_dir,
            test_prefix,
            count_dtype=count_dtype,
        )

    train_sum = np.array(train_X_sage.sum(axis=0))
    print("%d words missing from training data" % np.sum(train_sum == 0))

    if n_test > 0:
        test_sum = np.array(test_X_sage.sum(axis=0))
        print("%d words missing from test data" % np.sum(test_sum == 0))

    sage_output = {
        "tr_data": train_X_sage,
        "tr_aspect": tr_aspect,
        "widx": tr_widx,
        "vocab": vocab_for_sage,
    }
    if n_test > 0:
        sage_output["te_data"] = test_X_sage
        sage_output["te_aspect"] = te_aspect
    savemat(os.path.join(output_dir, "sage_labeled.mat"), sage_output)
    sage_output["tr_aspect"] = tr_no_aspect
    if n_test > 0:
        sage_output["te_aspect"] = te_no_aspect
    savemat(os.path.join(output_dir, "sage_unlabeled.mat"), sage_output)

    print("Done!")
Code Example #17
def process_subset(items, parsed, label_field, label_list, vocab, output_dir,
                   output_prefix):
    n_items = len(items)
    n_labels = len(label_list)
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))

    ids = []
    for i, item in enumerate(items):
        if 'id' in item:
            ids.append(item['id'])
    if len(ids) != n_items:
        ids = [str(i) for i in range(n_items)]

    # create a label index using string representations
    label_list_strings = [str(label) for label in label_list]
    label_index = dict(zip(label_list_strings, range(n_labels)))

    # convert labels to a data frame
    if n_labels > 0:
        label_matrix = np.zeros([n_items, n_labels], dtype=int)
        label_vector = np.zeros(n_items, dtype=int)

        for i, item in enumerate(items):
            id = ids[i]
            label = item[label_field]
            label_matrix[i, label_index[str(label)]] = 1
            label_vector[i] = label_index[str(label)]

        labels_df = pd.DataFrame(label_matrix,
                                 index=ids,
                                 columns=label_list_strings)
        labels_df.to_csv(
            os.path.join(output_dir,
                         output_prefix + '.' + label_field + '.csv'))
        label_vector_df = pd.DataFrame(label_vector,
                                       index=ids,
                                       columns=[label_field])
        label_vector_df.to_csv(
            os.path.join(output_dir, output_prefix + '.label_vector.csv'))

    else:
        print("No labels found")

    X = np.zeros([n_items, vocab_size], dtype=int)

    dat_strings = []
    dat_labels = []
    mallet_strings = []
    fast_text_lines = []

    counter = Counter()
    word_counter = Counter()
    doc_lines = []
    print("Converting to count representations")
    for i, words in enumerate(parsed):
        # get the vocab indices of words that are in the vocabulary
        indices = [vocab_index[word] for word in words if word in vocab_index]
        word_subset = [word for word in words if word in vocab_index]

        counter.clear()
        counter.update(indices)
        word_counter.clear()
        word_counter.update(word_subset)

        if len(counter.keys()) > 0:
            # update the counts
            mallet_strings.append(
                str(i) + '\t' + 'en' + '\t' + ' '.join(word_subset))

            dat_string = str(int(len(counter))) + ' '
            dat_string += ' '.join([
                str(k) + ':' + str(int(v))
                for k, v in zip(list(counter.keys()), list(counter.values()))
            ])
            dat_strings.append(dat_string)

            if label_field is not None:
                label = items[i][label_field]
                dat_labels.append(str(label_index[str(label)]))

            values = list(counter.values())
            X[np.ones(len(counter.keys()), dtype=int) * i,
              list(counter.keys())] += values

    # convert to a sparse representation
    sparse_X = sparse.csr_matrix(X)
    fh.save_sparse(sparse_X, os.path.join(output_dir, output_prefix + '.npz'))

    print(sparse_X.shape)
    print(len(dat_strings))

    fh.write_to_json(ids, os.path.join(output_dir,
                                       output_prefix + '.ids.json'))

    # save output for Mallet
    fh.write_list_to_text(
        mallet_strings, os.path.join(output_dir,
                                     output_prefix + '.mallet.txt'))

    # save output for David Blei's LDA/SLDA code
    fh.write_list_to_text(
        dat_strings, os.path.join(output_dir, output_prefix + '.data.dat'))
    if len(dat_labels) > 0:
        fh.write_list_to_text(
            dat_labels,
            os.path.join(output_dir,
                         output_prefix + '.' + label_field + '.dat'))

    # save output for Jacob Eisenstein's SAGE code:
    sparse_X_sage = sparse.csr_matrix(X, dtype=float)
    vocab_for_sage = np.zeros((vocab_size, ), dtype=np.object)
    vocab_for_sage[:] = vocab
    if n_labels > 0:
        # convert array to vector of labels for SAGE
        sage_aspect = np.argmax(np.array(labels_df.values, dtype=float),
                                axis=1) + 1
    else:
        sage_aspect = np.ones([n_items, 1], dtype=float)
    sage_no_aspect = np.array([n_items, 1], dtype=float)
    widx = np.arange(vocab_size, dtype=float) + 1

    return sparse_X_sage, sage_aspect, sage_no_aspect, widx, vocab_for_sage
Code Example #18
File: vocabulary_with_counts.py Project: benbo/botc
    def write_to_file(self, filename):
        json_obj = {'index2token': self.index2token, 'counts': self.counts, 'doc_counts': self.doc_counts}
        fh.write_to_json(json_obj, filename, sort_keys=False)
Code Example #19
def process_subset(items, parsed, label_fields, label_lists, vocab, output_dir, output_prefix):
    n_items = len(items)
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))

    ids = []
    for i, item in enumerate(items):
        if 'id' in item:
            ids.append(item['id'])
    if len(ids) != n_items:
        ids = [str(i) for i in range(n_items)]

    # create a label index using string representations
    for label_field in label_fields:
        label_list = label_lists[label_field]
        n_labels = len(label_list)
        label_list_strings = [str(label) for label in label_list]
        label_index = dict(zip(label_list_strings, range(n_labels)))

        # convert labels to a data frame
        if n_labels > 0:
            label_matrix = np.zeros([n_items, n_labels], dtype=int)
            label_vector = np.zeros(n_items, dtype=int)

            for i, item in enumerate(items):
                label = item[label_field]
                label_matrix[i, label_index[str(label)]] = 1
                label_vector[i] = label_index[str(label)]

            labels_df = pd.DataFrame(label_matrix, index=ids, columns=label_list_strings)
            labels_df.to_csv(os.path.join(output_dir, output_prefix + '.' + label_field + '.csv'))
            label_vector_df = pd.DataFrame(label_vector, index=ids, columns=[label_field])
            if n_labels == 2:
                label_vector_df.to_csv(os.path.join(output_dir, output_prefix + '.' + label_field + '_vector.csv'))

    rows = []
    cols = []
    vals = []

    dat_strings = []
    dat_labels = []
    mallet_strings = []
    fast_text_lines = []

    counter = Counter()
    word_counter = Counter()
    doc_lines = []
    print("Converting to count representations")
    for i, words in enumerate(parsed):
        # get the vocab indices of words that are in the vocabulary
        indices = [vocab_index[word] for word in words if word in vocab_index]
        word_subset = [word for word in words if word in vocab_index]

        counter.clear()
        counter.update(indices)
        word_counter.clear()
        word_counter.update(word_subset)

        if len(counter.keys()) > 0:
            # update the counts
            mallet_strings.append(str(i) + '\t' + 'en' + '\t' + ' '.join(word_subset))

            dat_string = str(int(len(counter))) + ' '
            dat_string += ' '.join([str(k) + ':' + str(int(v)) for k, v in zip(list(counter.keys()), list(counter.values()))])
            dat_strings.append(dat_string)

            # for dat format, assume just one label is given
            if len(label_fields) > 0:
                label = items[i][label_fields[-1]]
                dat_labels.append(str(label_index[str(label)]))

            values = list(counter.values())
            rows.extend([i] * len(counter))
            token_indices = sorted(counter.keys())
            cols.extend(list(token_indices))
            vals.extend([counter[k] for k in token_indices])

    # convert to a sparse representation
    sparse_X = sparse.coo_matrix((vals, (rows, cols)), shape=(n_items, vocab_size)).tocsr()
    fh.save_sparse(sparse_X, os.path.join(output_dir, output_prefix + '.npz'))

    print("Size of {:s} document-term matrix:".format(output_prefix), sparse_X.shape)

    fh.write_to_json(ids, os.path.join(output_dir, output_prefix + '.ids.json'))

    # save output for Mallet
    fh.write_list_to_text(mallet_strings, os.path.join(output_dir, output_prefix + '.mallet.txt'))

    # save output for David Blei's LDA/SLDA code
    fh.write_list_to_text(dat_strings, os.path.join(output_dir, output_prefix + '.data.dat'))
    if len(dat_labels) > 0:
        fh.write_list_to_text(dat_labels, os.path.join(output_dir, output_prefix + '.' + label_field + '.dat'))

    # save output for Jacob Eisenstein's SAGE code:
    #sparse_X_sage = sparse.csr_matrix(X, dtype=float)
    vocab_for_sage = np.zeros((vocab_size,), dtype=np.object)
    vocab_for_sage[:] = vocab

    # for SAGE, assume only a single label has been given
    if len(label_fields) > 0:
        # convert array to vector of labels for SAGE
        sage_aspect = np.argmax(np.array(labels_df.values, dtype=float), axis=1) + 1
    else:
        sage_aspect = np.ones([n_items, 1], dtype=float)
    sage_no_aspect = np.array([n_items, 1], dtype=float)
    widx = np.arange(vocab_size, dtype=float) + 1

    return sparse_X, sage_aspect, sage_no_aspect, widx, vocab_for_sage
Code Example #20
File: vocabulary.py Project: anukat2015/ARKcat
    def write_to_file(self, filename):
        fh.write_to_json(self.index2token, filename, sort_keys=False)