def main(args):
    data = model.data.Dataset(args.input)

    with open(args.vocab, 'r') as f:
        vocab = [w.strip() for w in f.readlines()]
    vocab_to_id = dict(zip(vocab, range(len(vocab))))

    if not os.path.isdir(args.output):
        os.mkdir(args.output)

    labels = {}
    for collection in data.collections:
        output_path = os.path.join(args.output, '{}.csv'.format(collection))
        #with open(output_path, 'w', newline='') as f:
        with open(output_path, 'w') as f:
            w = csv.writer(f, delimiter=',')
            for y, x in data.rows(collection, num_epochs=1):
                try:
                    if y not in labels:
                        labels[y] = len(labels)
                    w.writerow((labels[y], preprocess(x, vocab_to_id, args.dataset_type)))
                except:
                    import pdb
                    pdb.set_trace()

    with open(os.path.join(args.output, 'labels.txt'), 'w') as f:
        f.write('\n'.join([k for k in sorted(labels, key=labels.get)]))
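# `preprocess` is defined elsewhere in this repo. The sketch below is only an assumption
# about its contract, inferred from the call sites above and in the variants that follow:
# it appears to map the raw text `x` onto space-separated vocabulary ids (so each CSV row
# becomes "label,id id id ..."), and the later variants treat a return value of None as
# "no in-vocabulary tokens left". The name, token splitting, and None convention here are
# illustrative, not the repo's actual implementation.
def preprocess_sketch(text, vocab_to_id, dataset_type):
    # dataset_type ("docnade"/"lstm") presumably selects variant-specific handling
    tokens = str(text).lower().strip().split()
    ids = [str(vocab_to_id[t]) for t in tokens if t in vocab_to_id]
    if not ids:
        return None  # caller records the row index and skips the document
    return ' '.join(ids)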
def main(args):
    args.split_train_val = str2bool(args.split_train_val)

    doc_train_filename = args.training_file
    doc_val_filename = args.validation_file
    doc_test_filename = args.test_file

    train_csv_filename = os.path.join(args.data_output, "training.csv")
    val_csv_filename = os.path.join(args.data_output, "validation.csv")
    test_csv_filename = os.path.join(args.data_output, "test.csv")

    if not os.path.exists(args.data_output):
        os.makedirs(args.data_output)

    docnade_vocabulary = args.vocab_size
    docnade_vocab_filename = os.path.join(args.data_output, "vocab_docnade.vocab")
    lstm_vocab_filename = os.path.join(args.data_output, "vocab_lstm.vocab")
    mapping_dict_filename = os.path.join(args.data_output, "mapping_dict.pkl")

    train_docs, train_docs_labels = load_file(doc_train_filename)
    test_docs, test_docs_labels = load_file(doc_test_filename)
    #if not args.split_train_val:
    val_docs, val_docs_labels = load_file(doc_val_filename)

    print(np.unique(train_docs_labels))

    train_docs, train_docs_labels = shuffle(train_docs, train_docs_labels, random_state=123)
    val_docs, val_docs_labels = shuffle(val_docs, val_docs_labels, random_state=123)
    #test_docs, test_docs_labels = shuffle(test_docs, test_docs_labels, random_state=123)

    ###########################################################################
    # Prepare CSV file

    new_train_docs = []
    with open(train_csv_filename, 'w', newline='') as csvfile:
        filewriter = csv.writer(csvfile, delimiter=',')
        for doc, label in zip(train_docs, train_docs_labels):
            new_doc_tokens = tokens(str(doc).lower().strip())
            new_doc_tokens = [token for token in new_doc_tokens if token not in cachedStopWords]
            new_doc = ' '.join(new_doc_tokens)
            li = [str(label).lower().strip(), str(new_doc).lower().strip()]
            filewriter.writerow(li)
            new_train_docs.append(str(new_doc).lower().strip())

    new_val_docs = []
    with open(val_csv_filename, 'w', newline='') as csvfile:
        filewriter = csv.writer(csvfile, delimiter=',')
        for doc, label in zip(val_docs, val_docs_labels):
            new_doc_tokens = tokens(str(doc).lower().strip())
            new_doc_tokens = [token for token in new_doc_tokens if token not in cachedStopWords]
            new_doc = ' '.join(new_doc_tokens)
            li = [str(label).lower().strip(), str(new_doc).lower().strip()]
            filewriter.writerow(li)
            new_val_docs.append(str(new_doc).lower().strip())

    new_test_docs = []
    with open(test_csv_filename, 'w', newline='') as csvfile:
        filewriter = csv.writer(csvfile, delimiter=',')
        for doc, label in zip(test_docs, test_docs_labels):
            new_doc_tokens = tokens(str(doc).lower().strip())
            new_doc_tokens = [token for token in new_doc_tokens if token not in cachedStopWords]
            new_doc = ' '.join(new_doc_tokens)
            li = [str(label).lower().strip(), str(new_doc).lower().strip()]
            filewriter.writerow(li)
            new_test_docs.append(str(new_doc).lower().strip())

    total_docs = []
    total_docs.extend(new_train_docs)
    total_docs.extend(new_val_docs)
    #total_docs.extend(new_test_docs)

    # Saving docnade vocabulary
    #representer = TF(total_docs, max_features=docnade_vocabulary)
    representer = TF(total_docs, max_features=None)
    vocab_dict_docnade = representer.get_feature_names()
    with open(docnade_vocab_filename, "w") as f:
        f.write('\n'.join(vocab_dict_docnade))

    # Preparing CSV files for DocNADE Tensorflow
    data = model.data.Dataset(args.data_output)

    with open(docnade_vocab_filename, 'r') as f:
        vocab = [w.strip() for w in f.readlines()]
    vocab_to_id = dict(zip(vocab, range(len(vocab))))

    if not os.path.isdir(args.data_output):
        os.mkdir(args.data_output)

    labels = {}
    removed_indices = {"training": [], "test": [], "validation": []}
    for collection in data.collections:
        output_path = os.path.join(args.data_output, '{}_docnade.csv'.format(collection))
        #with open(output_path, 'w') as f:
        with open(output_path, 'w', newline='') as f:
            w = csv.writer(f, delimiter=',')
            count = -1
            for y, x in data.rows(collection, num_epochs=1):
                count += 1
                try:
                    pre = preprocess(x, vocab_to_id, "docnade")
                    if pre is None:
                        removed_indices[str(collection).lower()].append(count)
                        continue
                    if ':' in y:
                        temp_labels = y.split(':')
                        new_label = []
                        for label in temp_labels:
                            if label not in labels:
                                labels[label] = len(labels)
                            new_label.append(str(labels[label]))
                        temp_label = ':'.join(new_label)
                        w.writerow((temp_label, pre))
                    else:
                        if y not in labels:
                            labels[y] = len(labels)
                        w.writerow((labels[y], pre))
                except:
                    import pdb
                    pdb.set_trace()

    with open(os.path.join(args.data_output, 'labels.txt'), 'w') as f:
        f.write('\n'.join([k for k in sorted(labels, key=labels.get)]))
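# `TF(total_docs, max_features=...)` together with `get_feature_names()` suggests a thin
# wrapper around scikit-learn's CountVectorizer, which keeps the `max_features` most
# frequent terms and returns the vocabulary in alphabetical order. The class below is a
# hedged sketch of such a wrapper, not the repo's actual TF implementation; note that
# scikit-learn >= 1.0 renames `get_feature_names()` to `get_feature_names_out()`.
from sklearn.feature_extraction.text import CountVectorizer

class TFSketch:
    def __init__(self, docs, max_features=None):
        self._vectorizer = CountVectorizer(max_features=max_features)
        self._vectorizer.fit(docs)

    def get_feature_names(self):
        # list of vocabulary terms, alphabetically sorted
        return list(self._vectorizer.get_feature_names_out())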
def main(args):
    args.split_train_val = str2bool(args.split_train_val)

    doc_train_filename = args.training_file
    doc_val_filename = args.validation_file
    doc_test_filename = args.test_file

    train_csv_filename = os.path.join(args.data_output, "training.csv")
    val_csv_filename = os.path.join(args.data_output, "validation.csv")
    test_csv_filename = os.path.join(args.data_output, "test.csv")

    if not os.path.exists(args.data_output):
        os.makedirs(args.data_output)

    docnade_vocabulary = args.vocab_size
    docnade_vocab_filename = os.path.join(args.data_output, "vocab_docnade.vocab")
    lstm_vocab_filename = os.path.join(args.data_output, "vocab_lstm.vocab")
    mapping_dict_filename = os.path.join(args.data_output, "mapping_dict.pkl")

    train_docs, train_docs_labels = load_file(doc_train_filename)
    test_docs, test_docs_labels = load_file(doc_test_filename)
    if not args.split_train_val:
        val_docs, val_docs_labels = load_file(doc_val_filename)

    print(np.unique(train_docs_labels))

    ###########################################################################
    # Prepare CSV file

    if args.split_train_val:
        from sklearn.model_selection import train_test_split
        indices = np.arange(len(train_docs))
        #val_size = len(train_docs) * args.split_ratio
        val_size = args.split_num
        # NOTE: the split size below is hard-coded to 50 documents; val_size is currently unused
        train_docs, val_docs, train_docs_labels, val_docs_labels, split_index_train, split_index_dev = train_test_split(
            train_docs, train_docs_labels, indices, test_size=50, random_state=1234)

    new_train_docs = []
    with open(train_csv_filename, 'w') as csvfile:
        filewriter = csv.writer(csvfile, delimiter=',')
        for doc, label in zip(train_docs, train_docs_labels):
            new_doc_tokens = tokenizer.tokenize(str(doc).lower().strip())
            new_doc = ' '.join(new_doc_tokens)
            #doc_tokens = tokenize(str(doc).lower().strip())
            li = [str(label).lower().strip(), str(new_doc).lower().strip()]
            filewriter.writerow(li)
            new_train_docs.append(str(new_doc).lower().strip())

    new_val_docs = []
    with open(val_csv_filename, 'w') as csvfile:
        filewriter = csv.writer(csvfile, delimiter=',')
        for doc, label in zip(val_docs, val_docs_labels):
            new_doc_tokens = tokenizer.tokenize(str(doc).lower().strip())
            new_doc = ' '.join(new_doc_tokens)
            # write the tokenized text so all three splits are formatted consistently
            li = [str(label).lower().strip(), str(new_doc).lower().strip()]
            filewriter.writerow(li)
            new_val_docs.append(str(new_doc).lower().strip())

    new_test_docs = []
    with open(test_csv_filename, 'w') as csvfile:
        filewriter = csv.writer(csvfile, delimiter=',')
        for doc, label in zip(test_docs, test_docs_labels):
            new_doc_tokens = tokenizer.tokenize(str(doc).lower().strip())
            new_doc = ' '.join(new_doc_tokens)
            li = [str(label).lower().strip(), str(new_doc).lower().strip()]
            filewriter.writerow(li)
            new_test_docs.append(str(new_doc).lower().strip())

    total_docs = []
    total_docs.extend(new_train_docs)
    total_docs.extend(new_val_docs)

    # Saving docnade vocabulary
    representer = TF(total_docs, max_features=docnade_vocabulary)
    vocab_dict_docnade = representer.get_feature_names()
    with open(docnade_vocab_filename, "w") as f:
        f.write('\n'.join(vocab_dict_docnade))

    # Saving lstm vocabulary
    representer = TF(total_docs, max_features=None)
    vocab_dict_lstm = representer.get_feature_names()
    with open(lstm_vocab_filename, "w") as f:
        f.write('\n'.join(vocab_dict_lstm))

    # Creating mapping dictionary (DocNADE vocab id -> LSTM vocab id);
    # a linear-time alternative is sketched after this function
    mapping_dict = {}
    for i, word in enumerate(vocab_dict_docnade):
        mapping_dict[int(i)] = int(vocab_dict_lstm.index(str(word)))
    with open(mapping_dict_filename, "wb") as f:
        pickle.dump(mapping_dict, f)
    print("Mapping dictionary created.")

    # Preparing CSV files for DocNADE Tensorflow
    data = model.data.Dataset(args.data_output)

    with open(docnade_vocab_filename, 'r') as f:
        vocab = [w.strip() for w in f.readlines()]
    vocab_to_id = dict(zip(vocab, range(len(vocab))))

    if not os.path.isdir(args.data_output):
        os.mkdir(args.data_output)

    labels = {}
    removed_indices = {"training": [], "test": [], "validation": []}
    for collection in data.collections:
        output_path = os.path.join(args.data_output, '{}_docnade.csv'.format(collection))
        #with open(output_path, 'w', newline='') as f:
        with open(output_path, 'w') as f:
            w = csv.writer(f, delimiter=',')
            count = -1
            for y, x in data.rows(collection, num_epochs=1):
                count += 1
                try:
                    pre = preprocess(x, vocab_to_id, "docnade")
                    if pre is None:
                        # document lost all tokens after vocabulary filtering; skip it
                        removed_indices[str(collection).lower()].append(count)
                        continue
                    if ':' in y:
                        # multi-label document: map each label to an integer id
                        temp_labels = y.split(':')
                        new_label = []
                        for label in temp_labels:
                            if label not in labels:
                                labels[label] = len(labels)
                            new_label.append(str(labels[label]))
                        temp_label = ':'.join(new_label)
                        w.writerow((temp_label, pre))
                    else:
                        if y not in labels:
                            labels[y] = len(labels)
                        w.writerow((labels[y], pre))
                except:
                    import pdb
                    pdb.set_trace()

    with open(os.path.join(args.data_output, 'labels.txt'), 'w') as f:
        f.write('\n'.join([k for k in sorted(labels, key=labels.get)]))

    # Preparing CSV files for LSTM Tensorflow
    with open(lstm_vocab_filename, 'r') as f:
        vocab = [w.strip() for w in f.readlines()]
    vocab_to_id = dict(zip(vocab, range(len(vocab))))

    labels = {}
    for collection in data.collections:
        removed_indices_collection = removed_indices[str(collection).lower()]
        output_path = os.path.join(args.data_output, '{}_lstm.csv'.format(collection))
        #with open(output_path, 'w', newline='') as f:
        with open(output_path, 'w') as f:
            w = csv.writer(f, delimiter=',')
            count = -1
            for y, x in data.rows(collection, num_epochs=1):
                count += 1
                try:
                    pre = preprocess(x, vocab_to_id, "lstm")
                    if count in removed_indices_collection:
                        # keep the DocNADE and LSTM CSVs row-aligned
                        continue
                    if ':' in y:
                        temp_labels = y.split(':')
                        new_label = []
                        for label in temp_labels:
                            if label not in labels:
                                labels[label] = len(labels)
                            new_label.append(str(labels[label]))
                        temp_label = ':'.join(new_label)
                        w.writerow((temp_label, pre))
                    else:
                        if y not in labels:
                            labels[y] = len(labels)
                        w.writerow((labels[y], pre))
                except:
                    import pdb
                    pdb.set_trace()
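# The mapping loop above calls list.index() once per DocNADE word, which is quadratic in
# the vocabulary size. An equivalent linear-time construction is sketched below (same
# resulting dict, assuming every DocNADE word also appears in the LSTM vocabulary, which
# holds here because both vocabularies are built from the same documents).
lstm_word_to_id = {word: i for i, word in enumerate(vocab_dict_lstm)}
mapping_dict = {i: lstm_word_to_id[word] for i, word in enumerate(vocab_dict_docnade)}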
def main(args):
    args.split_train_val = str2bool(args.split_train_val)

    doc_train_filename = args.training_file
    doc_val_filename = args.validation_file
    doc_test_filename = args.test_file

    train_csv_filename = os.path.join(args.data_output, "training.csv")
    val_csv_filename = os.path.join(args.data_output, "validation.csv")
    test_csv_filename = os.path.join(args.data_output, "test.csv")

    docnade_vocabulary = args.vocab_size
    docnade_vocab_filename = os.path.join(args.data_output, "vocab_docnade.vocab")
    lstm_vocab_filename = os.path.join(args.data_output, "vocab_lstm.vocab")
    mapping_dict_filename = os.path.join(args.data_output, "mapping_dict.pkl")

    train_docs, train_docs_labels = load_file(doc_train_filename)
    test_docs, test_docs_labels = load_file(doc_test_filename)
    if not args.split_train_val:
        val_docs, val_docs_labels = load_file(doc_val_filename)

    print(np.unique(train_docs_labels))

    ################### Calculate average document length #####################
    dataset_name = str(args.data_output.split('/')[-1])

    total_docs = []
    total_docs.extend(train_docs)
    total_docs.extend(val_docs)
    total_docs.extend(test_docs)

    doc_lengths = []
    for doc in total_docs:
        doc_tokens = str(doc).lower().strip().split()
        doc_length = len(doc_tokens)
        doc_lengths.append(doc_length)
    average_doc_length = np.mean(doc_lengths)
    print("Average doc length for dataset [ %s ] is = %f" % (dataset_name, average_doc_length))
    #exit()

    ###########################################################################
    # Prepare CSV file

    if args.split_train_val:
        from sklearn.model_selection import train_test_split
        indices = np.arange(len(train_docs))
        #val_size = len(train_docs) * args.split_ratio
        val_size = args.split_num
        # NOTE: the split size below is hard-coded to 50 documents; val_size is currently unused
        train_docs, val_docs, train_docs_labels, val_docs_labels, split_index_train, split_index_dev = train_test_split(
            train_docs, train_docs_labels, indices, test_size=50, random_state=1234)

    new_train_docs = []
    with open(train_csv_filename, 'w') as csvfile:
        filewriter = csv.writer(csvfile, delimiter=',')
        for doc, label in zip(train_docs, train_docs_labels):
            new_doc_tokens = tokenizer.tokenize(str(doc).lower().strip())
            new_doc = ' '.join(new_doc_tokens)
            #doc_tokens = tokenize(str(doc).lower().strip())
            li = [str(label).lower().strip(), str(new_doc).lower().strip()]
            filewriter.writerow(li)
            new_train_docs.append(str(new_doc).lower().strip())

    new_val_docs = []
    with open(val_csv_filename, 'w') as csvfile:
        filewriter = csv.writer(csvfile, delimiter=',')
        for doc, label in zip(val_docs, val_docs_labels):
            new_doc_tokens = tokenizer.tokenize(str(doc).lower().strip())
            new_doc = ' '.join(new_doc_tokens)
            # write the tokenized text so all three splits are formatted consistently
            li = [str(label).lower().strip(), str(new_doc).lower().strip()]
            filewriter.writerow(li)
            new_val_docs.append(str(new_doc).lower().strip())

    new_test_docs = []
    with open(test_csv_filename, 'w') as csvfile:
        filewriter = csv.writer(csvfile, delimiter=',')
        for doc, label in zip(test_docs, test_docs_labels):
            new_doc_tokens = tokenizer.tokenize(str(doc).lower().strip())
            new_doc = ' '.join(new_doc_tokens)
            li = [str(label).lower().strip(), str(new_doc).lower().strip()]
            filewriter.writerow(li)
            new_test_docs.append(str(new_doc).lower().strip())

    total_docs = []
    total_docs.extend(new_train_docs)
    total_docs.extend(new_val_docs)

    """
    ########################################################
    # Check GloVe coverage of the corpus tokens
    total_tokens = []
    for doc in total_docs:
        total_tokens.extend(doc.split(' '))

    glove_embeddings = loadGloveModel(hidden_size=200)
    glove_keys = glove_embeddings.keys()

    total_missing = 0
    for token in total_tokens:
        if str(token).lower().strip() in glove_keys:
            continue
        else:
            total_missing += 1
    print("Total tokens missing %s / %s" % (total_missing, len(total_tokens)))
    exit()
    ########################################################
    """

    # Saving docnade vocabulary
    representer = TF(total_docs, max_features=docnade_vocabulary)
    vocab_dict_docnade = representer.get_feature_names()
    with open(docnade_vocab_filename, "w") as f:
        f.write('\n'.join(vocab_dict_docnade))

    """
    # Saving lstm vocabulary
    representer = TF(total_docs, max_features=None)
    vocab_dict_lstm = representer.get_feature_names()
    with open(lstm_vocab_filename, "w") as f:
        f.write('\n'.join(vocab_dict_lstm))

    # Creating mapping dictionary
    mapping_dict = {}
    for i, word in enumerate(vocab_dict_docnade):
        mapping_dict[int(i)] = int(vocab_dict_lstm.index(str(word)))
    with open(mapping_dict_filename, "wb") as f:
        pickle.dump(mapping_dict, f)
    print("Mapping dictionary created.")
    """

    # Preparing CSV files for DocNADE Tensorflow
    data = model.data.Dataset(args.data_output)

    with open(docnade_vocab_filename, 'r') as f:
        vocab = [w.strip() for w in f.readlines()]
    vocab_to_id = dict(zip(vocab, range(len(vocab))))

    if not os.path.isdir(args.data_output):
        os.mkdir(args.data_output)

    labels = {}
    removed_indices = {"training": [], "test": [], "validation": []}
    for collection in data.collections:
        output_path = os.path.join(args.data_output, '{}_docnade.csv'.format(collection))
        #with open(output_path, 'w', newline='') as f:
        with open(output_path, 'w') as f:
            w = csv.writer(f, delimiter=',')
            count = -1
            for y, x in data.rows(collection, num_epochs=1):
                count += 1
                try:
                    pre = preprocess(x, vocab_to_id, "docnade")
                    if pre is None:
                        # document lost all tokens after vocabulary filtering; skip it
                        removed_indices[str(collection).lower()].append(count)
                        continue
                    if ':' in y:
                        # multi-label document: map each label to an integer id
                        temp_labels = y.split(':')
                        new_label = []
                        for label in temp_labels:
                            if label not in labels:
                                labels[label] = len(labels)
                            new_label.append(str(labels[label]))
                        temp_label = ':'.join(new_label)
                        w.writerow((temp_label, pre))
                    else:
                        if y not in labels:
                            labels[y] = len(labels)
                        w.writerow((labels[y], pre))
                except:
                    import pdb
                    pdb.set_trace()

    with open(os.path.join(args.data_output, 'labels.txt'), 'w') as f:
        f.write('\n'.join([k for k in sorted(labels, key=labels.get)]))

    with open(os.path.join(args.data_output, 'removed_indices.pkl'), "wb") as f:
        pickle.dump(removed_indices, f)
    print(removed_indices)
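# The later `main(args)` variants above expect an argparse-style namespace. The repo's
# actual flag names are not shown in this section, so the parser below is only a hedged
# sketch inferred from the attributes accessed above (args.training_file,
# args.validation_file, args.test_file, args.data_output, args.vocab_size,
# args.split_train_val, args.split_num); flag spellings and defaults are assumptions.
# `str2bool` mirrors the common argparse idiom used because `type=bool` would treat any
# non-empty string (including "false") as True.
import argparse

def str2bool(v):
    return str(v).lower().strip() in ('yes', 'true', 't', '1')

def parse_args_sketch():
    parser = argparse.ArgumentParser()
    parser.add_argument('--training-file', dest='training_file', type=str, required=True)
    parser.add_argument('--validation-file', dest='validation_file', type=str, default='')
    parser.add_argument('--test-file', dest='test_file', type=str, required=True)
    parser.add_argument('--data-output', dest='data_output', type=str, required=True)
    parser.add_argument('--vocab-size', dest='vocab_size', type=int, default=2000)
    parser.add_argument('--split-train-val', dest='split_train_val', type=str, default='false')
    parser.add_argument('--split-num', dest='split_num', type=int, default=50)
    return parser.parse_args()

# Example invocation (script name and paths are hypothetical):
#   python preprocess.py --training-file data/train.txt --validation-file data/val.txt \
#       --test-file data/test.txt --data-output datasets/20NS --vocab-size 2000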