def export_20ng(remove_headers=False, remove_footers=False, remove_quotes=False, categories=None):
    output_dir = os.path.join('..', 'datasets', '20ng', 'data')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    remove = []
    if remove_headers:
        remove.append('headers')
    if remove_footers:
        remove.append('footers')
    if remove_quotes:
        remove.append('quotes')
    print(categories)

    ng_train = fetch_20newsgroups(subset='train', remove=remove, categories=categories)
    keys = ['train' + str(i) for i in range(len(ng_train.data))]
    print(len(keys))
    train_text = dict(zip(keys, ng_train.data))
    fh.write_to_json(train_text, os.path.join(output_dir, 'train.json'))
    train_labels = pd.DataFrame(ng_train.target, columns=['target'], index=keys)
    train_labels.to_csv(os.path.join(output_dir, 'train.csv'))
    print(train_labels.shape)

    ng_test = fetch_20newsgroups(subset='test', remove=remove, categories=categories)
    keys = ['test' + str(i) for i in range(len(ng_test.data))]
    test_text = dict(zip(keys, ng_test.data))
    fh.write_to_json(test_text, os.path.join(output_dir, 'test.json'))
    test_labels = pd.DataFrame(ng_test.target, columns=['target'], index=keys)
    test_labels.to_csv(os.path.join(output_dir, 'test.csv'))
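# Hedged usage sketch (not part of the original source): assumes this function is run from a
# location where '../datasets/20ng/data' is a sensible output path, and that os, pandas (pd),
# sklearn.datasets.fetch_20newsgroups, and the local file-handling helper `fh` are imported
# at module level. It exports two newsgroup categories with headers and footers stripped.
if __name__ == '__main__':
    export_20ng(remove_headers=True, remove_footers=True,
                categories=['sci.space', 'soc.religion.christian'])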
def extract_features(self, source, write_to_file=True, vocab_source=None):
    print("Extracting ngram tokens")

    # read in a dict of {document_key: text}
    data = fh.read_json(source)
    all_items = list(data.keys())

    tokens = self.extract_tokens_from_text(data)

    if vocab_source is None:
        vocab = self.make_vocabulary(tokens, all_items)
        vocab.prune(self.min_df)
        self.vocab = vocab
    else:
        vocab = self.load_vocabulary(vocab_source)
        self.vocab = vocab

    feature_counts, index = self.extract_feature_counts(all_items, tokens, vocab)

    if write_to_file:
        vocab.write_to_file(self.get_vocab_filename())
        fh.write_to_json(index, self.get_index_filename(), sort_keys=False)
        fh.pickle_data(feature_counts, self.get_feature_filename())

    self.feature_counts = feature_counts
    self.index = index
    self.column_names = np.array(self.vocab.index2token)
    self.do_transformations()
def write_to_file(self, filename):
    json_obj = {'index2token': self.index2token,
                'counts': self.counts,
                'doc_counts': self.doc_counts}
    fh.write_to_json(json_obj, filename, sort_keys=False)
def preprocess_data(train_infile, test_infile, output_dir, vocab_size, use_mallet_stopwords=False,
                    replace_num=False, lemmatize=False, log_transform=False, keep_nonalphanum=False,
                    only_alpha=False, min_length=1):
    print("Loading SpaCy")
    parser = English()

    train_X, train_vocab, train_indices, train_y, label_list, word_freqs, train_dat, \
        train_mallet_strings, train_sage_output, train_svm_strings = \
        load_and_process_data(train_infile, vocab_size, parser,
                              use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num,
                              lemmatize=lemmatize, log_transform=log_transform,
                              keep_nonalphanum=keep_nonalphanum, only_alpha=only_alpha,
                              min_length=min_length)
    test_X, _, test_indices, test_y, _, _, test_dat, \
        test_mallet_strings, test_sage_output, test_svm_strings = \
        load_and_process_data(test_infile, vocab_size, parser, vocab=train_vocab,
                              label_list=label_list, use_mallet_stopwords=use_mallet_stopwords,
                              replace_num=replace_num, lemmatize=lemmatize,
                              log_transform=log_transform, keep_nonalphanum=keep_nonalphanum,
                              only_alpha=only_alpha, min_length=min_length)

    fh.save_sparse(train_X, os.path.join(output_dir, 'train.npz'))
    fh.write_to_json(train_vocab, os.path.join(output_dir, 'train.vocab.json'))
    fh.write_to_json(train_indices, os.path.join(output_dir, 'train.indices.json'))
    fh.save_sparse(train_y, os.path.join(output_dir, 'train.labels.npz'))
    fh.save_sparse(test_X, os.path.join(output_dir, 'test.npz'))
    fh.write_to_json(test_indices, os.path.join(output_dir, 'test.indices.json'))
    fh.save_sparse(test_y, os.path.join(output_dir, 'test.labels.npz'))

    n_labels = len(label_list)
    label_dict = dict(zip(range(n_labels), label_list))
    fh.write_to_json(label_dict, os.path.join(output_dir, 'train.label_list.json'))
    fh.write_to_json(list(word_freqs.tolist()), os.path.join(output_dir, 'train.word_freq.json'))

    # save output for David Blei's lda-c code
    fh.write_list_to_text(train_dat, os.path.join(output_dir, 'train.dat'))
    fh.write_list_to_text(test_dat, os.path.join(output_dir, 'test.dat'))

    # save output for Mallet
    fh.write_list_to_text(train_mallet_strings, os.path.join(output_dir, 'train.mallet.txt'))
    fh.write_list_to_text(test_mallet_strings, os.path.join(output_dir, 'test.mallet.txt'))

    # save output for Jacob Eisenstein's SAGE code
    train_sage_output['te_data'] = test_sage_output['tr_data']
    train_sage_output['te_aspect'] = test_sage_output['tr_aspect']
    savemat(os.path.join(output_dir, 'sage.mat'), train_sage_output)

    # save output in SVM format
    fh.write_list_to_text(train_svm_strings, os.path.join(output_dir, 'train.svm.txt'))
    fh.write_list_to_text(test_svm_strings, os.path.join(output_dir, 'test.svm.txt'))
def main(): usage = "%prog infiles.jsonlist[.gz]" parser = OptionParser(usage=usage) parser.add_option('-o', dest='output_dir', default='output', help='Output_dir: default=%default') parser.add_option('-w', dest='target_word', default='mass shooting', help='Target word: default=%default') (options, args) = parser.parse_args() infiles = args output_dir = options.output_dir target_word = options.target_word if not os.path.exists(output_dir): fh.makedirs(output_dir) n_articles_per_day = defaultdict(int) target_count_per_day = defaultdict(int) for f in infiles: print(f) articles = fh.read_jsonlist(f) print(len(articles)) for i, article in enumerate(articles): if i % 10000 == 0 and i > 0: print(i) year = int(article['year']) month = int(article['month']) day = int(article['day']) date = datetime.date(year=year, month=month, day=day) ordinal_date = date.toordinal() n_articles_per_day[ordinal_date] += 1 text = '' if 'headline' in article: text += article['headline'] + '\n' if 'body' in article: text += article['body'] if 'text' in article: text += article['text'] text = ' ' + clean_text(text, lower=True) + ' ' if target_word in text: if 'film' not in text and 'game' not in text: target_count_per_day[ordinal_date] += 1 fh.write_to_json(n_articles_per_day, os.path.join(output_dir, 'articles_per_day.json')) fh.write_to_json(target_count_per_day, os.path.join(output_dir, 'target_counts_per_day.json'))
def save_weights(output_dir, beta, bg, feature_names, sparsity_threshold=1e-5):
    np.savez(os.path.join(output_dir, 'beta.npz'), beta=beta)
    if bg is not None:
        np.savez(os.path.join(output_dir, 'bg.npz'), bg=bg)
    fh.write_to_json(feature_names, os.path.join(output_dir, 'vocab.json'), sort_keys=False)

    topics_file = os.path.join(output_dir, 'topics.txt')
    lines = []
    for i in range(len(beta)):
        order = list(np.argsort(beta[i]))
        order.reverse()
        pos_words = [feature_names[j] for j in order[:100] if beta[i][j] > sparsity_threshold]
        output = ' '.join(pos_words)
        lines.append(output)
    fh.write_list_to_text(lines, topics_file)
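# Hedged usage sketch (illustrative, not from the original source): beta is assumed to have
# shape (n_topics, vocab_size) and feature_names to be a list of vocab_size strings, so each
# line of topics.txt lists the highest-weight words (up to 100) of one topic.
#   import numpy as np
#   beta = np.random.dirichlet(np.ones(6), size=3)   # 3 topics over a 6-word vocabulary
#   save_weights('output', beta, bg=None,
#                feature_names=['apple', 'banana', 'cherry', 'date', 'elder', 'fig'],
#                sparsity_threshold=1e-5)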
def extract_features(self, source, write_to_file=True):
    print("Extracting ngram tokens:")

    # read in a dict of {document_key: text}
    data = fh.read_json(source)
    all_items = list(data.keys())

    tokens = self.extract_tokens_from_file(data, self.get_n())
    vocab = self.make_vocabulary(tokens, all_items)
    feature_counts, oov_counts = self.extract_feature_counts(all_items, tokens, vocab)

    if write_to_file:
        vocab.write_to_file(self.get_vocab_filename())
        fh.write_to_json(all_items, self.get_index_filename(), sort_keys=False)
        fh.pickle_data(feature_counts, self.get_feature_filename())
        fh.write_to_json(oov_counts, self.get_oov_count_filename(), sort_keys=False)

    self.feature_counts = feature_counts
    self.index = all_items
    self.vocab = vocab
def preprocess_data(train_infile, output_dir, vocab_size, label_type, test_prop,
                    use_mallet_stopwords=False, replace_num=False, group_size=1,
                    only_alpha=False, min_length=3):
    print("Loading SpaCy")
    parser = English()

    with codecs.open(train_infile, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    n_items = len(lines)
    n_test = int(test_prop * n_items)
    n_train = n_items - n_test
    train_indices = np.random.choice(range(n_items), n_train, replace=False)
    test_indices = list(set(range(n_items)) - set(train_indices))

    train_X, train_vocab, train_indices, train_y, label_list, word_freqs, train_dat, \
        train_mallet_strings, train_sage_output, train_svm_strings, label_index = \
        load_and_process_data(train_infile, vocab_size, parser, label_type, train_indices,
                              use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num,
                              group_size=group_size, only_alpha=only_alpha, min_length=min_length)
    test_X, _, test_indices, test_y, _, _, test_dat, \
        test_mallet_strings, test_sage_output, test_svm_strings, _ = \
        load_and_process_data(train_infile, vocab_size, parser, label_type, test_indices,
                              vocab=train_vocab, label_list=label_list, label_index=label_index,
                              use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num,
                              group_size=group_size, only_alpha=only_alpha, min_length=min_length)

    fh.save_sparse(train_X, os.path.join(output_dir, 'train.npz'))
    fh.write_to_json(train_vocab, os.path.join(output_dir, 'train.vocab.json'))
    fh.write_to_json(train_indices, os.path.join(output_dir, 'train.indices.json'))
    fh.save_sparse(train_y, os.path.join(output_dir, 'train.labels.npz'))
    fh.save_sparse(test_X, os.path.join(output_dir, 'test.npz'))
    fh.write_to_json(test_indices, os.path.join(output_dir, 'test.indices.json'))
    fh.save_sparse(test_y, os.path.join(output_dir, 'test.labels.npz'))
    fh.write_to_json(list(word_freqs.tolist()), os.path.join(output_dir, 'train.word_freq.json'))
    fh.write_list_to_text(train_dat, os.path.join(output_dir, 'train.dat'))

    n_labels = len(label_list)
    label_dict = dict(zip(range(n_labels), label_list))
    fh.write_to_json(label_dict, os.path.join(output_dir, 'train.label_list.json'))

    fh.write_list_to_text(train_mallet_strings, os.path.join(output_dir, 'train.mallet.txt'))
    fh.write_list_to_text(test_mallet_strings, os.path.join(output_dir, 'test.mallet.txt'))

    train_sage_output['te_data'] = test_sage_output['tr_data']
    train_sage_output['te_aspect'] = test_sage_output['tr_aspect']
    savemat(os.path.join(output_dir, 'sage.mat'), train_sage_output)

    fh.write_list_to_text(train_svm_strings, os.path.join(output_dir, 'train.svm.txt'))
    fh.write_list_to_text(test_svm_strings, os.path.join(output_dir, 'test.svm.txt'))
def write_to_file(self, filename):
    fh.write_to_json(self.index2token, filename, sort_keys=False)
        'accuracy_value': float(acc['value']),
        'accuracy_epoch': int(acc.get('epoch', 0)),
    },
    index=[run],
)
results.to_csv(
    Path(run_args.o, "dev_metrics.csv"),
    mode='a',
    header=run == 0,  # only save the header for the first run
)

if run_args.store_all:
    seed_path = Path(run_args.o, str(seed))
    if not seed_path.exists():
        seed_path.mkdir()
    for fpath in Path(run_args.o).glob("*"):
        if fpath.name not in ['torch_model.pt', 'dev_metrics.csv'] and fpath.is_file():
            shutil.copyfile(fpath, Path(seed_path, fpath.name))

# stop entirely if the run was very bad
if npmi['value'] < run_args.min_acceptable_npmi:
    with open(Path(run_args.o, "stopped_due_to_low_npmi.txt"), "w") as outfile:
        outfile.write("")
    print(f"Stopped: NPMI of {npmi['value']:0.4f} < {run_args.min_acceptable_npmi}")
    break

# Save the arguments
fh.write_to_json(checkpoint["options"].__dict__, Path(run_args.o, "args.json"))
def preprocess_data(train_infile, test_infile, output_dir, train_prefix, test_prefix,
                    min_doc_count=0, max_doc_freq=1.0, vocab_size=None, stopwords=None,
                    tokenized=False, keep_num=False, keep_alphanum=False, strip_html=False,
                    lower=True, min_length=3, label_fields=None):
    if stopwords == 'mallet':
        print("Using Mallet stopwords")
        stopword_list = fh.read_text(os.path.join('stopwords', 'mallet_stopwords.txt'))
    elif stopwords == 'snowball':
        print("Using snowball stopwords")
        stopword_list = fh.read_text(os.path.join('stopwords', 'snowball_stopwords.txt'))
    elif stopwords is not None:
        print("Using custom stopwords")
        stopword_list = fh.read_text(os.path.join('stopwords', stopwords + '_stopwords.txt'))
    else:
        stopword_list = []
    stopword_set = {s.strip() for s in stopword_list}

    print("Reading data files")
    train_items = fh.read_jsonlist(train_infile)
    n_train = len(train_items)
    print("Found {:d} training documents".format(n_train))

    if test_infile is not None:
        test_items = fh.read_jsonlist(test_infile)
        n_test = len(test_items)
        print("Found {:d} test documents".format(n_test))
    else:
        test_items = []
        n_test = 0

    all_items = train_items + test_items
    n_items = n_train + n_test

    label_lists = {}
    if label_fields is not None:
        if ',' in label_fields:
            label_fields = label_fields.split(',')
        else:
            label_fields = [label_fields]
        for label_name in label_fields:
            label_set = set()
            for i, item in enumerate(all_items):
                if label_name is not None:
                    label_set.add(item[label_name])
            label_list = list(label_set)
            label_list.sort()
            n_labels = len(label_list)
            print("Found label %s with %d classes" % (label_name, n_labels))
            label_lists[label_name] = label_list
    else:
        label_fields = []

    # make vocabulary
    train_parsed = []
    test_parsed = []

    print("Parsing %d documents" % n_items)
    if tokenized:
        print("Using tokenized_text element as the input")
    word_counts = Counter()
    doc_counts = Counter()
    vocab = None
    for i, item in enumerate(all_items):
        if i % 1000 == 0 and i > 0:
            print(i)
        if tokenized:
            text = item['tokenized_text']
            tokens = text.split()
        else:
            text = item['text']
            tokens, _ = tokenize(text, strip_html=strip_html, lower=lower, keep_numbers=keep_num,
                                 keep_alphanum=keep_alphanum, min_length=min_length,
                                 stopwords=stopword_set, vocab=vocab)

        # store the parsed documents
        if i < n_train:
            train_parsed.append(tokens)
        else:
            test_parsed.append(tokens)

        # keep track of the number of documents containing each word
        word_counts.update(tokens)
        doc_counts.update(set(tokens))

    print("Size of full vocabulary=%d" % len(word_counts))

    print("Selecting the vocabulary")
    most_common = doc_counts.most_common()
    words, doc_counts = zip(*most_common)
    doc_freqs = np.array(doc_counts) / float(n_items)
    vocab = [word for i, word in enumerate(words)
             if doc_counts[i] >= min_doc_count and doc_freqs[i] <= max_doc_freq]
    most_common = [word for i, word in enumerate(words) if doc_freqs[i] > max_doc_freq]
    if max_doc_freq < 1.0:
        print("Excluding words with frequency > {:0.2f}:".format(max_doc_freq), most_common)

    print("Vocab size after filtering = %d" % len(vocab))
    if vocab_size is not None:
        if len(vocab) > int(vocab_size):
            vocab = vocab[:int(vocab_size)]

    vocab_size = len(vocab)
    print("Final vocab size = %d" % vocab_size)

    print("Most common words remaining:", ' '.join(vocab[:10]))
    vocab.sort()

    fh.write_to_json(vocab, os.path.join(output_dir, train_prefix + '.vocab.json'))

    train_X_sage, tr_aspect, tr_no_aspect, tr_widx, vocab_for_sage = \
        process_subset(train_items, train_parsed, label_fields, label_lists, vocab,
                       output_dir, train_prefix)
    if n_test > 0:
        test_X_sage, te_aspect, te_no_aspect, _, _ = \
            process_subset(test_items, test_parsed, label_fields, label_lists, vocab,
                           output_dir, test_prefix)

    train_sum = np.array(train_X_sage.sum(axis=0))
    print("%d words missing from training data" % np.sum(train_sum == 0))

    if n_test > 0:
        test_sum = np.array(test_X_sage.sum(axis=0))
        print("%d words missing from test data" % np.sum(test_sum == 0))

    sage_output = {'tr_data': train_X_sage, 'tr_aspect': tr_aspect, 'widx': tr_widx,
                   'vocab': vocab_for_sage}
    if n_test > 0:
        sage_output['te_data'] = test_X_sage
        sage_output['te_aspect'] = te_aspect
    savemat(os.path.join(output_dir, 'sage_labeled.mat'), sage_output)
    sage_output['tr_aspect'] = tr_no_aspect
    if n_test > 0:
        sage_output['te_aspect'] = te_no_aspect
    savemat(os.path.join(output_dir, 'sage_unlabeled.mat'), sage_output)

    print("Done!")
def main(): usage = "%prog infile.txt output_dir output_prefix" parser = OptionParser(usage=usage) parser.add_option( '-m', dest='max_lines', default=None, help= 'Quit after processing this many lines (documents): default=%default') #parser.add_option('--lower', action="store_true", dest="lower", default=False, # help='Lower case words: default=%default') (options, args) = parser.parse_args() infile = args[0] output_dir = args[1] output_prefix = args[2] max_lines = options.max_lines if max_lines is not None: max_lines = int(max_lines) vocab = [] vocab_index = {} counter = Counter() # start by converting each document into a dict of word counts, building a vocab as we go rows = [] cols = [] values = [] n_docs = 0 print("Counting words...") with codecs.open(infile, 'r', encoding='utf-8') as f: for line_i, line in enumerate(f): line = line.strip() if len(line) > 0: if max_lines is not None and line_i >= max_lines: print("Quitting after processing %d lines" % (line_i + 1)) break if n_docs % 1000 == 0 and n_docs > 0: print(n_docs) # split on white space words = line.split() # filter out everything that's not just letters, and lower case words = [ word.lower() for word in words if re.match('^[a-zA-Z]*$', word) is not None ] # look for new words and add them to the vocabulary new_words = [word for word in words if word not in vocab_index] if len(new_words) > 0: vocab_size = len(vocab) #print("Adding %d words to vocab" % len(new_words)) #print("New total should be %d" % (vocab_size + len(new_words))) vocab.extend(new_words) vocab_index.update( dict( zip(new_words, range(vocab_size, vocab_size + len(new_words))))) indices = [vocab_index[word] for word in words] counter.clear() counter.update(indices) keys = counter.keys() counts = counter.values() rows.extend([line_i] * len(keys)) cols.extend(keys) values.extend(counts) n_docs += 1 print("Processed %d documents" % n_docs) print("Size of final vocab = %d" % len(vocab)) print("Saving counts...") # now convert these count vectors in to a giant sparse matrix counts = sparse.coo_matrix((values, (rows, cols)), shape=(n_docs, len(vocab))) fh.save_sparse(counts, os.path.join(output_dir, output_prefix + '.npz')) fh.write_to_json(vocab, os.path.join(output_dir, output_prefix + '.vocab.json')) print("Done")
def preprocess_data(train_infile, test_infile, output_dir, train_prefix, test_prefix,
                    min_doc_count=0, max_doc_freq=1.0, vocab_size=None, stopwords=None,
                    keep_num=False, keep_alphanum=False, strip_html=False, min_length=3,
                    label_name=None, output_plaintext=False, bigrams=False):
    if stopwords == 'mallet':
        print("Using Mallet stopwords")
        stopword_list = fh.read_text(os.path.join('stopwords', 'mallet_stopwords.txt'))
    elif stopwords == 'snowball':
        print("Using snowball stopwords")
        stopword_list = fh.read_text(os.path.join('stopwords', 'snowball_stopwords.txt'))
    elif stopwords is not None:
        print("Using custom stopwords")
        stopword_list = fh.read_text(os.path.join('stopwords', stopwords + '_stopwords.txt'))
    else:
        stopword_list = []
    stopword_set = {s.strip() for s in stopword_list}

    print("Reading data files")
    train_items = fh.read_jsonlist(train_infile)
    n_train = len(train_items)

    if test_infile is not None:
        test_items = fh.read_jsonlist(test_infile)
        n_test = len(test_items)
    else:
        test_items = []
        n_test = 0

    all_items = train_items + test_items
    n_items = n_train + n_test

    # determine labels and metadata from data
    metadata_keys = set()
    print("Dealing with labels and metadata")

    # find all the metadata keys present
    for i, item in enumerate(all_items):
        if 'text' not in item:
            print("Text field not found for item %d" % i)
            sys.exit()
        if 'metadata' in item:
            for key in item['metadata'].keys():
                metadata_keys.add(key)

    # only keep the ones that are present everywhere
    if len(metadata_keys) > 0:
        for i, item in enumerate(all_items):
            if 'metadata' not in item:
                print('metadata not found for item %d' % i)
            # iterate over a copy so keys can be removed from the set while looping
            for key in list(metadata_keys):
                if key not in item.get('metadata', {}):
                    print('dropping metadata field %s (not found for item %d)' % (key, i))
                    metadata_keys.remove(key)

    metadata_keys = list(metadata_keys)
    metadata_keys.sort()
    if len(metadata_keys) > 0:
        print("Metadata keys:", metadata_keys)

    label_set = set()
    for i, item in enumerate(all_items):
        if label_name is not None:
            label_set.add(item[label_name])

    label_list = list(label_set)
    label_list.sort()
    n_labels = len(label_list)
    if label_name is not None:
        print("Using label %s with %d classes" % (label_name, n_labels))

    # make vocabulary
    train_parsed = []
    test_parsed = []

    print("Parsing %d documents" % n_items)
    word_counts = Counter()
    doc_counts = Counter()
    for i, item in enumerate(all_items):
        if i % 1000 == 0 and i > 0:
            print(i)
        text = item['text']
        tokens = tokenize(text, strip_html, keep_numbers=keep_num, keep_alphanum=keep_alphanum,
                          min_length=min_length, stopwords=stopword_set, bigrams=bigrams)

        # store the parsed documents
        if i < n_train:
            train_parsed.append(tokens)
        else:
            test_parsed.append(tokens)

        # keep track of the number of documents containing each word
        word_counts.update(tokens)
        doc_counts.update(set(tokens))

    print("Size of full vocabulary=%d" % len(word_counts))

    print("Selecting the vocabulary")
    most_common = doc_counts.most_common()
    words, doc_counts = zip(*most_common)
    doc_freqs = np.array(doc_counts) / float(n_items)
    vocab = [word for i, word in enumerate(words)
             if doc_counts[i] >= min_doc_count and doc_freqs[i] <= max_doc_freq]
    most_common = [word for i, word in enumerate(words) if doc_freqs[i] > max_doc_freq]
    print("Excluding most common:", most_common)
    print("Vocab size after filtering = %d" % len(vocab))

    if vocab_size is not None:
        if len(vocab) > int(vocab_size):
            vocab = vocab[:int(vocab_size)]

    vocab_size = len(vocab)
    print("Final vocab size = %d" % vocab_size)

    print("Most common words remaining:", ' '.join(vocab[:10]))
    vocab.sort()

    fh.write_to_json(vocab, os.path.join(output_dir, train_prefix + '.vocab.json'))

    train_X_sage, tr_aspect, tr_no_aspect, tr_widx, vocab_for_sage = \
        process_subset(train_items, train_parsed, label_name, label_list, vocab, metadata_keys,
                       output_dir, train_prefix, output_plaintext=output_plaintext)
    if n_test > 0:
        test_X_sage, te_aspect, te_no_aspect, _, _ = \
            process_subset(test_items, test_parsed, label_name, label_list, vocab, metadata_keys,
                           output_dir, test_prefix, output_plaintext=output_plaintext)

    train_sum = np.array(train_X_sage.sum(axis=0))
    print("%d words missing from training data" % np.sum(train_sum == 0))

    if n_test > 0:
        test_sum = np.array(test_X_sage.sum(axis=0))
        print("%d words missing from test data" % np.sum(test_sum == 0))

    sage_output = {'tr_data': train_X_sage, 'tr_aspect': tr_aspect, 'widx': tr_widx,
                   'vocab': vocab_for_sage}
    if n_test > 0:
        sage_output['te_data'] = test_X_sage
        sage_output['te_aspect'] = te_aspect
    savemat(os.path.join(output_dir, 'sage_labeled.mat'), sage_output)
    sage_output['tr_aspect'] = tr_no_aspect
    if n_test > 0:
        sage_output['te_aspect'] = te_no_aspect
    savemat(os.path.join(output_dir, 'sage_unlabeled.mat'), sage_output)
def process_subset(items, ids, parsed, labels, label_fields, label_lists, vocab, output_dir,
                   output_prefix, count_dtype=int):
    n_items = len(items)
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))

    if not ids or len(ids) != n_items:
        ids = [str(i) for i in range(n_items)]

    # create a label index using string representations
    if labels:
        labels_df = pd.DataFrame.from_records(labels, index=ids)

        for label_field in label_fields:
            labels_df_subset = pd.get_dummies(labels_df[label_field])

            # for any classes not present in the subset, add 0 columns
            # (handles case where classes appear in only one of train or test)
            for category in label_lists[label_field]:
                if category not in labels_df_subset:
                    labels_df_subset[category] = 0

            labels_df_subset.to_csv(os.path.join(output_dir, output_prefix + "." + label_field + ".csv"))

            if labels_df[label_field].nunique() == 2:
                labels_df_subset.iloc[:, 1].to_csv(
                    os.path.join(output_dir, output_prefix + "." + label_field + "_vector.csv"),
                    header=[label_field],
                )
            # used later
            label_index = dict(zip(labels_df_subset.columns, range(len(labels_df_subset.columns))))

    X = np.zeros([n_items, vocab_size], dtype=count_dtype)

    dat_strings = []
    dat_labels = []
    mallet_strings = []
    fast_text_lines = []

    counter = Counter()
    word_counter = Counter()
    doc_lines = []
    print("Converting to count representations")
    for i, words in enumerate(parsed):
        # get the vocab indices of words that are in the vocabulary
        words = words.split()
        indices = [vocab_index[word] for word in words if word in vocab_index]
        word_subset = [word for word in words if word in vocab_index]

        counter.clear()
        counter.update(indices)
        word_counter.clear()
        word_counter.update(word_subset)

        if len(counter.keys()) > 0:
            # update the counts
            mallet_strings.append(str(i) + "\t" + "en" + "\t" + " ".join(word_subset))

            dat_string = str(int(len(counter))) + " "
            dat_string += " ".join([str(k) + ":" + str(int(v))
                                    for k, v in zip(list(counter.keys()), list(counter.values()))])
            dat_strings.append(dat_string)

            # for the dat format, assume just one label is given
            if len(label_fields) > 0:
                label = labels[i][label_fields[-1]]
                dat_labels.append(str(label_index[str(label)]))

            values = np.array(list(counter.values()), dtype=count_dtype)
            X[np.ones(len(counter.keys()), dtype=int) * i, list(counter.keys())] += values

    # convert to a sparse representation
    sparse_X = sparse.csr_matrix(X)
    fh.save_sparse(sparse_X, os.path.join(output_dir, output_prefix + ".npz"))
    print("Size of {:s} document-term matrix:".format(output_prefix), sparse_X.shape)

    fh.write_to_json(ids, os.path.join(output_dir, output_prefix + ".ids.json"))

    # save output for Mallet
    fh.write_list_to_text(mallet_strings, os.path.join(output_dir, output_prefix + ".mallet.txt"))

    # save output for David Blei's LDA/SLDA code
    fh.write_list_to_text(dat_strings, os.path.join(output_dir, output_prefix + ".data.dat"))
    if len(dat_labels) > 0:
        fh.write_list_to_text(dat_labels,
                              os.path.join(output_dir, output_prefix + "." + label_field + ".dat"))

    # save output for Jacob Eisenstein's SAGE code:
    sparse_X_sage = sparse.csr_matrix(X, dtype=float)
    vocab_for_sage = np.zeros((vocab_size,), dtype=object)
    vocab_for_sage[:] = vocab

    # for SAGE, assume only a single label has been given
    if len(label_fields) > 0:
        # convert array to vector of labels for SAGE
        sage_aspect = np.argmax(np.array(labels_df_subset.values, dtype=float), axis=1) + 1
    else:
        sage_aspect = np.ones([n_items, 1], dtype=float)
    sage_no_aspect = np.array([n_items, 1], dtype=float)
    widx = np.arange(vocab_size, dtype=float) + 1

    return sparse_X_sage, sage_aspect, sage_no_aspect, widx, vocab_for_sage
def preprocess_data(train_infile, test_infile, output_dir, train_prefix, test_prefix,
                    min_doc_count=0, max_doc_freq=1.0, ngram_range=(1, 1), vocab_size=None,
                    stopwords=None, keep_num=False, keep_alphanum=False, strip_html=False,
                    lower=True, min_word_length=3, max_doc_length=5000, label_fields=None,
                    workers=4, proc_multiplier=500):
    if stopwords == "mallet":
        print("Using Mallet stopwords")
        stopword_list = fh.read_text(os.path.join("stopwords", "mallet_stopwords.txt"))
    elif stopwords == "snowball":
        print("Using snowball stopwords")
        stopword_list = fh.read_text(os.path.join("stopwords", "snowball_stopwords.txt"))
    elif stopwords is not None:
        print("Using custom stopwords")
        stopword_list = fh.read_text(os.path.join("stopwords", stopwords + "_stopwords.txt"))
    else:
        stopword_list = []
    stopword_set = {s.strip() for s in stopword_list}

    print("Reading data files")
    train_items = fh.LazyJsonlistReader(train_infile)
    n_train = len(train_items)
    print("Found {:d} training documents".format(n_train))

    if test_infile is not None:
        test_items = fh.LazyJsonlistReader(test_infile)
        n_test = len(test_items)
        print("Found {:d} test documents".format(n_test))
    else:
        test_items = []
        n_test = 0

    n_items = n_train + n_test

    label_lists = {}
    if label_fields:
        if "," in label_fields:
            label_fields = label_fields.split(",")
        else:
            label_fields = [label_fields]
    if label_fields is None:
        label_fields = []

    # make vocabulary
    train_ids, train_parsed, train_labels = [], [], []
    test_ids, test_parsed, test_labels = [], [], []

    print("Parsing documents")
    word_counts = Counter()
    doc_counts = Counter()
    vocab = None

    # process in blocks
    pool = multiprocessing.Pool(workers)
    chunksize = proc_multiplier * workers

    kwargs = {
        "strip_html": strip_html,
        "lower": lower,
        "keep_numbers": keep_num,
        "keep_alphanum": keep_alphanum,
        "min_length": min_word_length,
        "stopwords": stopword_set,
        "ngram_range": ngram_range,
        "vocab": vocab,
        "label_fields": label_fields,
    }

    # The two loops below do the majority of the preprocessing. Unfortunately, without a major
    # refactor, they cannot be turned into generators and the results of tokenization must be
    # appended to a list, which implies a large memory footprint.
    for i, group in enumerate(chunkize(iter(train_items), chunksize=chunksize)):
        print(f"On training chunk {i} of {len(train_items) // chunksize}", end="\r")
        for ids, tokens, labels in pool.imap(partial(_process_item, **kwargs), group):
            # store the parsed documents
            if ids is not None:
                train_ids.append(ids)
            if labels is not None:
                train_labels.append(labels)
            tokens = tokens[:max_doc_length]
            # keep track of the number of documents with each word
            word_counts.update(tokens)
            doc_counts.update(set(tokens))
            train_parsed.append(" ".join(tokens))  # more efficient storage
    print("Train set processing complete")

    for i, group in enumerate(chunkize(iter(test_items), chunksize=chunksize)):
        print(f"On testing chunk {i} of {len(test_items) // chunksize}", end="\r")
        for ids, tokens, labels in pool.imap(partial(_process_item, **kwargs), group):
            # store the parsed documents
            if ids is not None:
                test_ids.append(ids)
            if labels is not None:
                test_labels.append(labels)
            tokens = tokens[:max_doc_length]
            # keep track of the number of documents with each word
            word_counts.update(tokens)
            doc_counts.update(set(tokens))
            test_parsed.append(" ".join(tokens))  # more efficient storage
    print("Test set processing complete")
    pool.terminate()

    print("Size of full vocabulary=%d" % len(word_counts))

    # store possible label values
    if label_fields:
        labels_df = pd.DataFrame.from_records(train_labels + test_labels)
        for label_name in label_fields:
            label_list = sorted(labels_df[label_name].unique().tolist())
            n_labels = len(label_list)
            print("Found label %s with %d classes" % (label_name, n_labels))
            label_lists[label_name] = label_list

    print("Selecting the vocabulary")
    most_common = doc_counts.most_common()
    words, doc_counts = zip(*most_common)
    doc_freqs = np.array(doc_counts) / float(n_items)
    vocab = [word for i, word in enumerate(words)
             if doc_counts[i] >= min_doc_count and doc_freqs[i] <= max_doc_freq]
    most_common = [word for i, word in enumerate(words) if doc_freqs[i] > max_doc_freq]
    if max_doc_freq < 1.0:
        print("Excluding words with frequency > {:0.2f}:".format(max_doc_freq), most_common)

    print("Vocab size after filtering = %d" % len(vocab))
    if vocab_size is not None:
        if len(vocab) > int(vocab_size):
            vocab = vocab[:int(vocab_size)]

    vocab_size = len(vocab)
    print("Final vocab size = %d" % vocab_size)

    print("Most common words remaining:", " ".join(vocab[:10]))
    vocab.sort()

    fh.write_to_json(vocab, os.path.join(output_dir, train_prefix + ".vocab.json"))

    count_dtype = np.uint16 if max_doc_length < np.iinfo(np.uint16).max else int

    train_X_sage, tr_aspect, tr_no_aspect, tr_widx, vocab_for_sage = process_subset(
        train_items, train_ids, train_parsed, train_labels, label_fields, label_lists,
        vocab, output_dir, train_prefix, count_dtype=count_dtype,
    )
    if n_test > 0:
        test_X_sage, te_aspect, te_no_aspect, _, _ = process_subset(
            test_items, test_ids, test_parsed, test_labels, label_fields, label_lists,
            vocab, output_dir, test_prefix, count_dtype=count_dtype,
        )

    train_sum = np.array(train_X_sage.sum(axis=0))
    print("%d words missing from training data" % np.sum(train_sum == 0))

    if n_test > 0:
        test_sum = np.array(test_X_sage.sum(axis=0))
        print("%d words missing from test data" % np.sum(test_sum == 0))

    sage_output = {
        "tr_data": train_X_sage,
        "tr_aspect": tr_aspect,
        "widx": tr_widx,
        "vocab": vocab_for_sage,
    }
    if n_test > 0:
        sage_output["te_data"] = test_X_sage
        sage_output["te_aspect"] = te_aspect
    savemat(os.path.join(output_dir, "sage_labeled.mat"), sage_output)
    sage_output["tr_aspect"] = tr_no_aspect
    if n_test > 0:
        sage_output["te_aspect"] = te_no_aspect
    savemat(os.path.join(output_dir, "sage_unlabeled.mat"), sage_output)

    print("Done!")
def process_subset(items, parsed, label_field, label_list, vocab, output_dir, output_prefix):
    n_items = len(items)
    n_labels = len(label_list)
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))

    ids = []
    for i, item in enumerate(items):
        if 'id' in item:
            ids.append(item['id'])
    if len(ids) != n_items:
        ids = [str(i) for i in range(n_items)]

    # create a label index using string representations
    label_list_strings = [str(label) for label in label_list]
    label_index = dict(zip(label_list_strings, range(n_labels)))

    # convert labels to a data frame
    if n_labels > 0:
        label_matrix = np.zeros([n_items, n_labels], dtype=int)
        label_vector = np.zeros(n_items, dtype=int)

        for i, item in enumerate(items):
            label = item[label_field]
            label_matrix[i, label_index[str(label)]] = 1
            label_vector[i] = label_index[str(label)]

        labels_df = pd.DataFrame(label_matrix, index=ids, columns=label_list_strings)
        labels_df.to_csv(os.path.join(output_dir, output_prefix + '.' + label_field + '.csv'))
        label_vector_df = pd.DataFrame(label_vector, index=ids, columns=[label_field])
        label_vector_df.to_csv(os.path.join(output_dir, output_prefix + '.label_vector.csv'))
    else:
        print("No labels found")

    X = np.zeros([n_items, vocab_size], dtype=int)

    dat_strings = []
    dat_labels = []
    mallet_strings = []
    fast_text_lines = []

    counter = Counter()
    word_counter = Counter()
    doc_lines = []
    print("Converting to count representations")
    for i, words in enumerate(parsed):
        # get the vocab indices of words that are in the vocabulary
        indices = [vocab_index[word] for word in words if word in vocab_index]
        word_subset = [word for word in words if word in vocab_index]

        counter.clear()
        counter.update(indices)
        word_counter.clear()
        word_counter.update(word_subset)

        if len(counter.keys()) > 0:
            # update the counts
            mallet_strings.append(str(i) + '\t' + 'en' + '\t' + ' '.join(word_subset))

            dat_string = str(int(len(counter))) + ' '
            dat_string += ' '.join([str(k) + ':' + str(int(v))
                                    for k, v in zip(list(counter.keys()), list(counter.values()))])
            dat_strings.append(dat_string)

            if label_field is not None:
                label = items[i][label_field]
                dat_labels.append(str(label_index[str(label)]))

            values = list(counter.values())
            X[np.ones(len(counter.keys()), dtype=int) * i, list(counter.keys())] += values

    # convert to a sparse representation
    sparse_X = sparse.csr_matrix(X)
    fh.save_sparse(sparse_X, os.path.join(output_dir, output_prefix + '.npz'))
    print(sparse_X.shape)
    print(len(dat_strings))

    fh.write_to_json(ids, os.path.join(output_dir, output_prefix + '.ids.json'))

    # save output for Mallet
    fh.write_list_to_text(mallet_strings, os.path.join(output_dir, output_prefix + '.mallet.txt'))

    # save output for David Blei's LDA/SLDA code
    fh.write_list_to_text(dat_strings, os.path.join(output_dir, output_prefix + '.data.dat'))
    if len(dat_labels) > 0:
        fh.write_list_to_text(dat_labels,
                              os.path.join(output_dir, output_prefix + '.' + label_field + '.dat'))

    # save output for Jacob Eisenstein's SAGE code:
    sparse_X_sage = sparse.csr_matrix(X, dtype=float)
    vocab_for_sage = np.zeros((vocab_size,), dtype=object)
    vocab_for_sage[:] = vocab

    if n_labels > 0:
        # convert array to vector of labels for SAGE
        sage_aspect = np.argmax(np.array(labels_df.values, dtype=float), axis=1) + 1
    else:
        sage_aspect = np.ones([n_items, 1], dtype=float)
    sage_no_aspect = np.array([n_items, 1], dtype=float)
    widx = np.arange(vocab_size, dtype=float) + 1

    return sparse_X_sage, sage_aspect, sage_no_aspect, widx, vocab_for_sage
def write_to_file(self, filename):
    json_obj = {'index2token': self.index2token,
                'counts': self.counts,
                'doc_counts': self.doc_counts}
    fh.write_to_json(json_obj, filename, sort_keys=False)
def process_subset(items, parsed, label_fields, label_lists, vocab, output_dir, output_prefix):
    n_items = len(items)
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))

    ids = []
    for i, item in enumerate(items):
        if 'id' in item:
            ids.append(item['id'])
    if len(ids) != n_items:
        ids = [str(i) for i in range(n_items)]

    # create a label index using string representations
    for label_field in label_fields:
        label_list = label_lists[label_field]
        n_labels = len(label_list)
        label_list_strings = [str(label) for label in label_list]
        label_index = dict(zip(label_list_strings, range(n_labels)))

        # convert labels to a data frame
        if n_labels > 0:
            label_matrix = np.zeros([n_items, n_labels], dtype=int)
            label_vector = np.zeros(n_items, dtype=int)

            for i, item in enumerate(items):
                label = item[label_field]
                label_matrix[i, label_index[str(label)]] = 1
                label_vector[i] = label_index[str(label)]

            labels_df = pd.DataFrame(label_matrix, index=ids, columns=label_list_strings)
            labels_df.to_csv(os.path.join(output_dir, output_prefix + '.' + label_field + '.csv'))
            label_vector_df = pd.DataFrame(label_vector, index=ids, columns=[label_field])
            if n_labels == 2:
                label_vector_df.to_csv(os.path.join(output_dir, output_prefix + '.' + label_field + '_vector.csv'))

    rows = []
    cols = []
    vals = []

    dat_strings = []
    dat_labels = []
    mallet_strings = []
    fast_text_lines = []

    counter = Counter()
    word_counter = Counter()
    doc_lines = []
    print("Converting to count representations")
    for i, words in enumerate(parsed):
        # get the vocab indices of words that are in the vocabulary
        indices = [vocab_index[word] for word in words if word in vocab_index]
        word_subset = [word for word in words if word in vocab_index]

        counter.clear()
        counter.update(indices)
        word_counter.clear()
        word_counter.update(word_subset)

        if len(counter.keys()) > 0:
            # update the counts
            mallet_strings.append(str(i) + '\t' + 'en' + '\t' + ' '.join(word_subset))

            dat_string = str(int(len(counter))) + ' '
            dat_string += ' '.join([str(k) + ':' + str(int(v))
                                    for k, v in zip(list(counter.keys()), list(counter.values()))])
            dat_strings.append(dat_string)

            # for the dat format, assume just one label is given
            if len(label_fields) > 0:
                label = items[i][label_fields[-1]]
                dat_labels.append(str(label_index[str(label)]))

            rows.extend([i] * len(counter))
            token_indices = sorted(counter.keys())
            cols.extend(list(token_indices))
            vals.extend([counter[k] for k in token_indices])

    # convert to a sparse representation
    sparse_X = sparse.coo_matrix((vals, (rows, cols)), shape=(n_items, vocab_size)).tocsr()
    fh.save_sparse(sparse_X, os.path.join(output_dir, output_prefix + '.npz'))
    print("Size of {:s} document-term matrix:".format(output_prefix), sparse_X.shape)

    fh.write_to_json(ids, os.path.join(output_dir, output_prefix + '.ids.json'))

    # save output for Mallet
    fh.write_list_to_text(mallet_strings, os.path.join(output_dir, output_prefix + '.mallet.txt'))

    # save output for David Blei's LDA/SLDA code
    fh.write_list_to_text(dat_strings, os.path.join(output_dir, output_prefix + '.data.dat'))
    if len(dat_labels) > 0:
        fh.write_list_to_text(dat_labels,
                              os.path.join(output_dir, output_prefix + '.' + label_field + '.dat'))

    # save output for Jacob Eisenstein's SAGE code:
    #sparse_X_sage = sparse.csr_matrix(X, dtype=float)
    vocab_for_sage = np.zeros((vocab_size,), dtype=object)
    vocab_for_sage[:] = vocab

    # for SAGE, assume only a single label has been given
    if len(label_fields) > 0:
        # convert array to vector of labels for SAGE
        sage_aspect = np.argmax(np.array(labels_df.values, dtype=float), axis=1) + 1
    else:
        sage_aspect = np.ones([n_items, 1], dtype=float)
    sage_no_aspect = np.array([n_items, 1], dtype=float)
    widx = np.arange(vocab_size, dtype=float) + 1

    return sparse_X, sage_aspect, sage_no_aspect, widx, vocab_for_sage