Example #1
import csv
import os

import model.data  # project-local dataset wrapper; `preprocess` is assumed
                   # to be defined alongside


def main(args):
    data = model.data.Dataset(args.input)
    with open(args.vocab, 'r') as f:
        vocab = [w.strip() for w in f.readlines()]
    # Map each word to its line number, which serves as its integer id.
    vocab_to_id = dict(zip(vocab, range(len(vocab))))

    if not os.path.isdir(args.output):
        os.mkdir(args.output)

    labels = {}
    for collection in data.collections:
        output_path = os.path.join(args.output, '{}.csv'.format(collection))
        # newline='' prevents csv.writer from inserting blank rows on Windows.
        with open(output_path, 'w', newline='') as f:
            w = csv.writer(f, delimiter=',')
            for y, x in data.rows(collection, num_epochs=1):
                # Assign each label an integer id in order of first appearance.
                if y not in labels:
                    labels[y] = len(labels)
                w.writerow((labels[y],
                            preprocess(x, vocab_to_id, args.dataset_type)))

    # Persist label names ordered by their integer ids.
    with open(os.path.join(args.output, 'labels.txt'), 'w') as f:
        f.write('\n'.join(sorted(labels, key=labels.get)))
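All four examples call a project-local preprocess(x, vocab_to_id, ...) helper that is never shown. A minimal sketch of what it plausibly does, assuming it maps a raw document to space-separated vocabulary ids and signals an empty result with None (the `pre is None` checks in the later examples rely on that):

def preprocess(text, vocab_to_id, dataset_type):
    """Hypothetical stand-in for the project's preprocess helper."""
    # dataset_type ('docnade'/'lstm') is accepted but ignored in this sketch.
    # Keep only tokens present in the vocabulary and replace them by their id.
    ids = [str(vocab_to_id[t]) for t in str(text).lower().split()
           if t in vocab_to_id]
    # None signals "no usable tokens"; callers skip such rows.
    return ' '.join(ids) if ids else None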
Example #2
import csv
import os

import numpy as np
from sklearn.utils import shuffle

import model.data  # project-local; str2bool, load_file, tokens,
                   # cachedStopWords, TF and preprocess are assumed to be
                   # defined alongside


def main(args):
    args.split_train_val = str2bool(args.split_train_val)

    doc_train_filename = args.training_file
    doc_val_filename = args.validation_file
    doc_test_filename = args.test_file

    train_csv_filename = os.path.join(args.data_output, "training.csv")
    val_csv_filename = os.path.join(args.data_output, "validation.csv")
    test_csv_filename = os.path.join(args.data_output, "test.csv")

    if not os.path.exists(args.data_output):
        os.makedirs(args.data_output)

    docnade_vocabulary = args.vocab_size
    docnade_vocab_filename = os.path.join(args.data_output,
                                          "vocab_docnade.vocab")
    lstm_vocab_filename = os.path.join(args.data_output, "vocab_lstm.vocab")

    mapping_dict_filename = os.path.join(args.data_output, "mapping_dict.pkl")

    train_docs, train_docs_labels = load_file(doc_train_filename)
    test_docs, test_docs_labels = load_file(doc_test_filename)
    val_docs, val_docs_labels = load_file(doc_val_filename)

    print(np.unique(train_docs_labels))

    train_docs, train_docs_labels = shuffle(train_docs,
                                            train_docs_labels,
                                            random_state=123)
    val_docs, val_docs_labels = shuffle(val_docs,
                                        val_docs_labels,
                                        random_state=123)
    # The test set is intentionally left unshuffled.

    ###########################################################################
    # Prepare CSV file

    def write_split_csv(csv_filename, docs, doc_labels):
        """Lowercase, tokenize and stopword-filter each doc, write
        (label, doc) rows, and return the cleaned docs."""
        cleaned_docs = []
        with open(csv_filename, 'w', newline='') as csvfile:
            filewriter = csv.writer(csvfile, delimiter=',')
            for doc, label in zip(docs, doc_labels):
                doc_tokens = tokens(str(doc).lower().strip())
                doc_tokens = [t for t in doc_tokens
                              if t not in cachedStopWords]
                new_doc = ' '.join(doc_tokens)
                filewriter.writerow([str(label).lower().strip(), new_doc])
                cleaned_docs.append(new_doc)
        return cleaned_docs

    new_train_docs = write_split_csv(train_csv_filename, train_docs,
                                     train_docs_labels)
    new_val_docs = write_split_csv(val_csv_filename, val_docs,
                                   val_docs_labels)
    new_test_docs = write_split_csv(test_csv_filename, test_docs,
                                    test_docs_labels)

    # Build the vocabulary from train + validation docs only; test docs are
    # deliberately excluded.
    total_docs = []
    total_docs.extend(new_train_docs)
    total_docs.extend(new_val_docs)

    # Save the DocNADE vocabulary (unrestricted here; pass
    # max_features=docnade_vocabulary to cap it at args.vocab_size).
    representer = TF(total_docs, max_features=None)
    vocab_dict_docnade = representer.get_feature_names()

    with open(docnade_vocab_filename, "w") as f:
        f.write('\n'.join(vocab_dict_docnade))

    # Preparing CSV files for DocNADE Tensorflow
    data = model.data.Dataset(args.data_output)

    with open(docnade_vocab_filename, 'r') as f:
        vocab = [w.strip() for w in f.readlines()]
    vocab_to_id = dict(zip(vocab, range(len(vocab))))

    labels = {}
    removed_indices = {"training": [], "test": [], "validation": []}
    for collection in data.collections:
        output_path = os.path.join(args.data_output,
                                   '{}_docnade.csv'.format(collection))
        with open(output_path, 'w', newline='') as f:
            w = csv.writer(f, delimiter=',')
            for count, (y, x) in enumerate(data.rows(collection,
                                                     num_epochs=1)):
                pre = preprocess(x, vocab_to_id, "docnade")
                if pre is None:
                    # Record the positions of skipped rows.
                    removed_indices[str(collection).lower()].append(count)
                    continue
                if ':' in y:
                    # Multi-label row: map each label to its integer id.
                    new_label = []
                    for label in y.split(':'):
                        if label not in labels:
                            labels[label] = len(labels)
                        new_label.append(str(labels[label]))
                    w.writerow((':'.join(new_label), pre))
                else:
                    if y not in labels:
                        labels[y] = len(labels)
                    w.writerow((labels[y], pre))

    # Persist label names ordered by their integer ids.
    with open(os.path.join(args.data_output, 'labels.txt'), 'w') as f:
        f.write('\n'.join(sorted(labels, key=labels.get)))
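TF above is not a standard-library class; from the way it is used (TF(docs, max_features=...) followed by .get_feature_names()) it looks like a thin wrapper over scikit-learn's CountVectorizer. A sketch under that assumption:

from sklearn.feature_extraction.text import CountVectorizer

class TF:
    """Hypothetical term-frequency vocabulary builder assumed by the examples."""

    def __init__(self, docs, max_features=None):
        # With max_features set, CountVectorizer keeps the most frequent terms.
        self._vectorizer = CountVectorizer(max_features=max_features)
        self._vectorizer.fit(docs)

    def get_feature_names(self):
        # Alphabetically sorted vocabulary, as CountVectorizer returns it.
        return list(self._vectorizer.get_feature_names_out())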
Example #3
import csv
import os
import pickle

import numpy as np

import model.data  # project-local; str2bool, load_file, tokenizer, TF and
                   # preprocess are assumed to be defined alongside


def main(args):
    args.split_train_val = str2bool(args.split_train_val)

    doc_train_filename = args.training_file
    doc_val_filename = args.validation_file
    doc_test_filename = args.test_file

    train_csv_filename = os.path.join(args.data_output, "training.csv")
    val_csv_filename = os.path.join(args.data_output, "validation.csv")
    test_csv_filename = os.path.join(args.data_output, "test.csv")

    if not os.path.exists(args.data_output):
        os.makedirs(args.data_output)

    docnade_vocabulary = args.vocab_size
    docnade_vocab_filename = os.path.join(args.data_output,
                                          "vocab_docnade.vocab")
    lstm_vocab_filename = os.path.join(args.data_output, "vocab_lstm.vocab")

    mapping_dict_filename = os.path.join(args.data_output, "mapping_dict.pkl")

    train_docs, train_docs_labels = load_file(doc_train_filename)
    test_docs, test_docs_labels = load_file(doc_test_filename)
    if not args.split_train_val:
        val_docs, val_docs_labels = load_file(doc_val_filename)

    print(np.unique(train_docs_labels))

    ###########################################################################
    # Prepare CSV file

    if args.split_train_val:
        from sklearn.model_selection import train_test_split
        indices = np.arange(len(train_docs))
        # Hold out args.split_num documents for validation.
        val_size = args.split_num
        (train_docs, val_docs,
         train_docs_labels, val_docs_labels,
         split_index_train, split_index_dev) = train_test_split(
            train_docs,
            train_docs_labels,
            indices,
            test_size=val_size,
            random_state=1234)

    def write_split_csv(csv_filename, docs, doc_labels):
        """Lowercase and tokenize each doc, write (label, doc) rows, and
        return the cleaned docs."""
        cleaned_docs = []
        with open(csv_filename, 'w', newline='') as csvfile:
            filewriter = csv.writer(csvfile, delimiter=',')
            for doc, label in zip(docs, doc_labels):
                new_doc = ' '.join(
                    tokenizer.tokenize(str(doc).lower().strip()))
                filewriter.writerow([str(label).lower().strip(), new_doc])
                cleaned_docs.append(new_doc)
        return cleaned_docs

    new_train_docs = write_split_csv(train_csv_filename, train_docs,
                                     train_docs_labels)
    new_val_docs = write_split_csv(val_csv_filename, val_docs,
                                   val_docs_labels)
    new_test_docs = write_split_csv(test_csv_filename, test_docs,
                                    test_docs_labels)

    # Build the vocabularies from train + validation docs only.
    total_docs = []
    total_docs.extend(new_train_docs)
    total_docs.extend(new_val_docs)

    # Saving docnade vocabulary
    representer = TF(total_docs, max_features=docnade_vocabulary)
    vocab_dict_docnade = representer.get_feature_names()

    with open(docnade_vocab_filename, "w") as f:
        f.write('\n'.join(vocab_dict_docnade))

    # Saving lstm vocabulary
    representer = TF(total_docs, max_features=None)
    vocab_dict_lstm = representer.get_feature_names()

    with open(lstm_vocab_filename, "w") as f:
        f.write('\n'.join(vocab_dict_lstm))

    # Create the mapping dictionary: DocNADE vocab id -> LSTM vocab id of the
    # same word (an index lookup table avoids a quadratic list.index scan).
    lstm_word_to_id = {word: i for i, word in enumerate(vocab_dict_lstm)}
    mapping_dict = {i: lstm_word_to_id[word]
                    for i, word in enumerate(vocab_dict_docnade)}

    with open(mapping_dict_filename, "wb") as f:
        pickle.dump(mapping_dict, f)

    print("Mapping dictionary created.")

    # Preparing CSV files for DocNADE Tensorflow
    data = model.data.Dataset(args.data_output)

    with open(docnade_vocab_filename, 'r') as f:
        vocab = [w.strip() for w in f.readlines()]
    vocab_to_id = dict(zip(vocab, range(len(vocab))))

    labels = {}
    removed_indices = {"training": [], "test": [], "validation": []}
    for collection in data.collections:
        output_path = os.path.join(args.data_output,
                                   '{}_docnade.csv'.format(collection))
        with open(output_path, 'w', newline='') as f:
            w = csv.writer(f, delimiter=',')
            for count, (y, x) in enumerate(data.rows(collection,
                                                     num_epochs=1)):
                pre = preprocess(x, vocab_to_id, "docnade")
                if pre is None:
                    # Remember skipped rows so the LSTM pass below drops the
                    # same documents.
                    removed_indices[str(collection).lower()].append(count)
                    continue
                if ':' in y:
                    # Multi-label row: map each label to its integer id.
                    new_label = []
                    for label in y.split(':'):
                        if label not in labels:
                            labels[label] = len(labels)
                        new_label.append(str(labels[label]))
                    w.writerow((':'.join(new_label), pre))
                else:
                    if y not in labels:
                        labels[y] = len(labels)
                    w.writerow((labels[y], pre))

    # Persist label names ordered by their integer ids.
    with open(os.path.join(args.data_output, 'labels.txt'), 'w') as f:
        f.write('\n'.join(sorted(labels, key=labels.get)))

    # Preparing CSV files for LSTM Tensorflow
    with open(lstm_vocab_filename, 'r') as f:
        vocab = [w.strip() for w in f.readlines()]
    vocab_to_id = dict(zip(vocab, range(len(vocab))))

    labels = {}
    for collection in data.collections:
        removed = set(removed_indices[str(collection).lower()])
        output_path = os.path.join(args.data_output,
                                   '{}_lstm.csv'.format(collection))
        with open(output_path, 'w', newline='') as f:
            w = csv.writer(f, delimiter=',')
            for count, (y, x) in enumerate(data.rows(collection,
                                                     num_epochs=1)):
                # Skip the documents the DocNADE pass dropped so the two
                # CSV files stay row-aligned.
                if count in removed:
                    continue
                pre = preprocess(x, vocab_to_id, "lstm")
                if ':' in y:
                    # Multi-label row: map each label to its integer id.
                    new_label = []
                    for label in y.split(':'):
                        if label not in labels:
                            labels[label] = len(labels)
                        new_label.append(str(labels[label]))
                    w.writerow((':'.join(new_label), pre))
                else:
                    if y not in labels:
                        labels[y] = len(labels)
                    w.writerow((labels[y], pre))
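load_file and str2bool are also project-local. Hypothetical sketches, assuming the input files hold one "label<TAB>document" pair per line:

def str2bool(value):
    """Parse a truthy CLI string such as 'true', '1' or 'yes'."""
    return str(value).lower().strip() in ('true', '1', 'yes', 'y')


def load_file(filename):
    """Hypothetical loader: one 'label<TAB>document' pair per line."""
    docs, labels = [], []
    with open(filename, 'r') as f:
        for line in f:
            label, _, doc = line.strip().partition('\t')
            labels.append(label)
            docs.append(doc)
    return docs, labels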
Example #4
import csv
import os
import pickle

import numpy as np

import model.data  # project-local; str2bool, load_file, tokenizer, TF and
                   # preprocess are assumed to be defined alongside


def main(args):
	args.split_train_val = str2bool(args.split_train_val)

	doc_train_filename = args.training_file
	doc_val_filename = args.validation_file
	doc_test_filename = args.test_file
	
	train_csv_filename = os.path.join(args.data_output, "training.csv")
	val_csv_filename = os.path.join(args.data_output, "validation.csv")
	test_csv_filename = os.path.join(args.data_output, "test.csv")

	# Create the output directory before any CSV is written into it.
	if not os.path.exists(args.data_output):
		os.makedirs(args.data_output)

	docnade_vocabulary = args.vocab_size
	docnade_vocab_filename = os.path.join(args.data_output, "vocab_docnade.vocab")
	lstm_vocab_filename = os.path.join(args.data_output, "vocab_lstm.vocab")

	mapping_dict_filename = os.path.join(args.data_output, "mapping_dict.pkl")
	

	train_docs, train_docs_labels = load_file(doc_train_filename)
	test_docs, test_docs_labels = load_file(doc_test_filename)
	if not args.split_train_val:
		val_docs, val_docs_labels = load_file(doc_val_filename)

	print(np.unique(train_docs_labels))

	################### Calculate average document length #####################
	dataset_name = str(args.data_output.split('/')[-1])

	total_docs = []
	total_docs.extend(train_docs)
	if not args.split_train_val:
		# val_docs only exists when a separate validation file was loaded.
		total_docs.extend(val_docs)
	total_docs.extend(test_docs)

	doc_lengths = [len(str(doc).lower().strip().split()) for doc in total_docs]

	average_doc_length = np.mean(doc_lengths)
	print("Average doc length for dataset [ %s ] is = %f"
	      % (dataset_name, average_doc_length))

	###########################################################################
	# Prepare CSV file

	if args.split_train_val:
		from sklearn.model_selection import train_test_split
		indices = np.arange(len(train_docs))
		# Hold out args.split_num documents for validation.
		val_size = args.split_num
		(train_docs, val_docs,
		 train_docs_labels, val_docs_labels,
		 split_index_train, split_index_dev) = train_test_split(
			train_docs, train_docs_labels, indices,
			test_size=val_size, random_state=1234)

	def write_split_csv(csv_filename, docs, doc_labels):
		"""Lowercase and tokenize each doc, write (label, doc) rows, and
		return the cleaned docs."""
		cleaned_docs = []
		with open(csv_filename, 'w', newline='') as csvfile:
			filewriter = csv.writer(csvfile, delimiter=',')
			for doc, label in zip(docs, doc_labels):
				new_doc = ' '.join(tokenizer.tokenize(str(doc).lower().strip()))
				filewriter.writerow([str(label).lower().strip(), new_doc])
				cleaned_docs.append(new_doc)
		return cleaned_docs

	new_train_docs = write_split_csv(train_csv_filename, train_docs,
	                                 train_docs_labels)
	new_val_docs = write_split_csv(val_csv_filename, val_docs, val_docs_labels)
	new_test_docs = write_split_csv(test_csv_filename, test_docs,
	                                test_docs_labels)

	# Build the vocabulary from train + validation docs only.
	total_docs = []
	total_docs.extend(new_train_docs)
	total_docs.extend(new_val_docs)

	"""
	########################################################
	total_tokens = []
	for doc in total_docs:
		total_tokens.extend(doc.split(' '))

	glove_embeddings = loadGloveModel(hidden_size=200)
	glove_keys = glove_embeddings.keys()

	total_missing = 0
	for token in total_tokens:
		if str(token).lower().strip() in glove_keys:
			continue
		else:
			total_missing += 1

	print("Total tokens missing %s / %s" % (total_missing, len(total_tokens)))

	exit()
	########################################################
	"""

	# Saving docnade vocabulary
	representer = TF(total_docs, max_features=docnade_vocabulary)
	vocab_dict_docnade = representer.get_feature_names()

	with open(docnade_vocab_filename, "w") as f:
		f.write('\n'.join(vocab_dict_docnade))
	"""
	# Saving lstm vocabulary
	representer = TF(total_docs, max_features=None)
	vocab_dict_lstm = representer.get_feature_names()

	with open(lstm_vocab_filename, "w") as f:
		f.write('\n'.join(vocab_dict_lstm))

	# Creating mapping dictionary
	mapping_dict = {}

	for i, word in enumerate(vocab_dict_docnade):
		mapping_dict[int(i)] = int(vocab_dict_lstm.index(str(word)))

	with open(mapping_dict_filename, "wb") as f:
		pickle.dump(mapping_dict, f)

	print("Mapping dictionary created.")
	"""
	# Preparing CSV files for DocNADE Tensorflow
	data = model.data.Dataset(args.data_output)

	with open(docnade_vocab_filename, 'r') as f:
		vocab = [w.strip() for w in f.readlines()]
	vocab_to_id = dict(zip(vocab, range(len(vocab))))

	labels = {}
	removed_indices = {"training": [], "test": [], "validation": []}
	for collection in data.collections:
		output_path = os.path.join(args.data_output,
		                           '{}_docnade.csv'.format(collection))
		with open(output_path, 'w', newline='') as f:
			w = csv.writer(f, delimiter=',')
			# In this variant data.rows() yields (index, (label, text)) pairs.
			for index, (y, x) in data.rows(collection, num_epochs=1):
				pre = preprocess(x, vocab_to_id, "docnade")
				if pre is None:
					removed_indices[str(collection).lower()].append(index)
					continue
				if ':' in y:
					# Multi-label row: map each label to its integer id.
					new_label = []
					for label in y.split(':'):
						if label not in labels:
							labels[label] = len(labels)
						new_label.append(str(labels[label]))
					w.writerow((':'.join(new_label), pre))
				else:
					if y not in labels:
						labels[y] = len(labels)
					w.writerow((labels[y], pre))

	# Persist label names ordered by their integer ids, plus the indices of
	# the rows that were dropped.
	with open(os.path.join(args.data_output, 'labels.txt'), 'w') as f:
		f.write('\n'.join(sorted(labels, key=labels.get)))

	with open(os.path.join(args.data_output, 'removed_indices.pkl'), "wb") as f:
		pickle.dump(removed_indices, f)

	print(removed_indices)
	"""