def split_dataset_category(input_file):
    input_data = read_csv(input_file, skip_header=True)

    # Dict where key = category, value = [real questions in that category,
    # author-generated questions in that category]
    category_to_questions = {}

    # Output dictionaries where key is question, value is category
    train_dict = {}
    testA_dict = {}
    testB_dict = {}

    # Group questions by category, separating real from author-generated ones
    for row in input_data:
        category = row[0].split(' - ')[0]
        question = row[2]
        source = row[3]

        real, generated = category_to_questions.setdefault(category, [[], []])

        if source == 'Author Generated':
            generated.append(question)
        else:
            real.append(question)

    for category, questions in category_to_questions.items():
        if category in ('', 'Other'):
            continue

        real_questions, generated_questions = questions

        print(category, len(real_questions))

        random.shuffle(real_questions)
        random.shuffle(generated_questions)

        # All author-generated questions go to test set B
        for question in generated_questions:
            testB_dict[question] = category

        # The first 20 unique real questions per category go to the training
        # set; the remainder go to test set A
        counter = 0
        for question in real_questions:
            if counter < 20:
                if question not in train_dict:
                    counter += 1
                    train_dict[question] = category
            elif question not in testA_dict:
                counter += 1
                testA_dict[question] = category

    return [train_dict, testA_dict, testB_dict]
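
# The helpers used throughout these snippets (read_csv, write_dict_to_csv,
# get_embedding, read_pickle, save_to_pickle) are not defined here. Below is a
# minimal sketch of read_csv, assuming it returns the parsed rows as a list
# and takes a skip_header flag (name and signature are inferred from the call
# sites above, not confirmed); the snippets also assume `import random` and
# `import numpy as np` at module level.
import csv


def read_csv(input_path, skip_header=False):
    """Read a CSV file into a list of rows (each row is a list of strings)."""
    with open(input_path, newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        if skip_header:
            next(reader, None)  # Drop the header row
        return list(reader)
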

def get_questions_to_id(input_file):
    result = {}     # Dictionary where key is question, value is ID
    input_data = read_csv(input_file, skip_header=True)

    for row in input_data:
        result[row[0]] = int(row[1])

    return result


def get_questions_to_category(input_path):
    result = {}     # Dictionary where key is question, value is category
    input_data = read_csv(input_path, skip_header=False)

    for row in input_data:
        result[row[0]] = row[1]

    return result

def get_id_to_questions(input_file):
    result = {}     # Dictionary where key is ID, value is list of questions
    # 2D array where first column is question, second column is question ID
    input_data = read_csv(input_file, skip_header=True)

    for row in input_data:
        current_id = int(row[1])
        if current_id not in result:
            result[current_id] = []

        result[current_id].append(row[0])

    return result

def get_all_embeddings(input_csv, tokenizer, model):
    # Dictionary where key = question and value = BERT embedding for that question
    result = {}

    reader = read_csv(input_csv, skip_header=True)

    for row in reader:
        question = row[2]
        embedding = get_embedding(question, tokenizer, model)
        # Key by an ASCII-sanitised copy of the question so later lookups
        # against the cleaned datasets match
        question = ''.join([i if ord(i) < 128 else ' ' for i in question])

        result[question] = embedding

    return result
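
# get_embedding is likewise not shown. Below is a minimal sketch of one common
# way to build a sentence embedding with Hugging Face transformers
# (mean-pooling the last hidden layer); this is an assumption about the
# helper, not its confirmed implementation.
import torch


def get_embedding(question, tokenizer, model):
    """Return a mean-pooled BERT embedding for one question string."""
    inputs = tokenizer(question, return_tensors='pt', truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Average the last-layer token vectors into a single sentence vector
    return outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy()
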
def combine_with_augmented_dataset(original_pickle_path, new_pickle_path,
                                   augmented_dataset_path, tokenizer, model):
    original_data = read_pickle(original_pickle_path)
    augmented_data = read_csv(augmented_dataset_path, skip_header=False)

    for row in augmented_data:
        question = row[0]
        embedding = get_embedding(question, tokenizer, model)
        question = ''.join([i if ord(i) < 128 else ' ' for i in question])

        original_data[question] = embedding

    save_to_pickle(original_data, new_pickle_path)

    return original_data
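
# read_pickle and save_to_pickle are simple persistence helpers; a plausible
# sketch using the standard pickle module (names inferred from the calls
# above, not confirmed):
import pickle


def read_pickle(input_path):
    """Load a pickled object, e.g. a question -> embedding dict."""
    with open(input_path, 'rb') as f:
        return pickle.load(f)


def save_to_pickle(data, output_path):
    """Pickle an object to disk."""
    with open(output_path, 'wb') as f:
        pickle.dump(data, f)
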

def get_embedding_data(input_file, category_to_num, question_to_embedding):
    input_data = read_csv(input_file, skip_header=False)

    embeddings = []
    labels = []

    for row in input_data:
        # Skip rows that have no category label
        if row[1] == '':
            continue

        embedding = question_to_embedding[row[0]]
        category_number = category_to_num[row[1]]
        embeddings.append(embedding)
        labels.append(category_number)

    return np.asarray(embeddings), np.asarray(labels)
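
# A hedged usage sketch for get_embedding_data: category_to_num is assumed to
# map category names to integer class labels. One way to derive it from the
# question -> category dict above (the file path is only illustrative):
#
#   question_to_category = get_questions_to_category('dataset_categories/train20.csv')
#   category_to_num = {category: index for index, category in
#                      enumerate(sorted(set(question_to_category.values())))}
#   X, y = get_embedding_data('dataset_categories/train20.csv',
#                             category_to_num, question_to_embedding)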

    # Tail of the eda() text-augmentation function used in the MAIN block
    # below; it assumes augmented_sentences (the list of candidate
    # augmentations) and sentence (the original) were built earlier in the
    # function.
    # Trim so that we have the desired number of augmented sentences
    if num_aug >= 1:
        augmented_sentences = augmented_sentences[:num_aug]
    else:
        # Fractional num_aug: keep each candidate independently with
        # probability num_aug / len(augmented_sentences)
        keep_prob = num_aug / len(augmented_sentences)
        augmented_sentences = [
            s for s in augmented_sentences if random.uniform(0, 1) < keep_prob
        ]

    # Append the original sentence
    augmented_sentences.append(sentence)

    return augmented_sentences
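
# For example: with 32 candidate augmentations, num_aug=16 keeps the first 16
# (plus the original sentence), while num_aug=0.5 would instead keep each
# candidate independently with probability 0.5/32.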


#			MAIN			#
input_data = read_csv('dataset_categories/train20.csv', skip_header=False)
new_result = {}

for row in input_data:
    question = row[0]
    label = row[1]
    # Generate 16 augmented variants of each question via EDA
    augmented_questions = eda(question, num_aug=16)

    for augmented_question in augmented_questions:
        new_result[augmented_question] = label

    new_result[question] = label

write_dict_to_csv('train20_augmented.csv', new_result)
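
# write_dict_to_csv is the counterpart helper that persists these dicts; a
# minimal sketch assuming one key,value pair per row (name and layout inferred
# from usage, not confirmed; reuses the csv import from the read_csv sketch):
def write_dict_to_csv(output_path, data):
    """Write a dict to a CSV file with one (key, value) pair per row."""
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for key, value in data.items():
            writer.writerow([key, value])
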

    def do_readcsv(self, _arg):
        try:
            methods.read_csv()
        except FileNotFoundError:
            print('File not found. Try fixing the CSV path.')

    # Core of the split_dataset() function called in the MAIN block below. It
    # assumes id_to_questions (ID -> [real questions, generated questions]),
    # the output dicts train_dict/testA_dict/testB_dict, and counter were all
    # initialised earlier in the function.
    for current_id in id_to_questions.keys():
        real_questions = id_to_questions[current_id][0]
        generated_questions = id_to_questions[current_id][1]

        # Only IDs with at least 4 real questions are kept
        if len(real_questions) >= 4:
            random.shuffle(real_questions)

            # Three real questions per ID go to the training set
            for question in real_questions[:3]:
                train_dict[question] = current_id

            # The remaining real questions go to test set A
            for question in real_questions[3:]:
                testA_dict[question] = current_id

            # All author-generated questions go to test set B
            for question in generated_questions:
                testB_dict[question] = current_id

            counter += 1

    print(f"{counter} question IDs with at least 4 real questions")

    return [train_dict, testA_dict, testB_dict]


#           MAIN            #
master_data = read_csv('data/final_master_dataset.csv', skip_header=True)
output_names = ('train3.csv', 'testA.csv', 'testB.csv')

sub_datasets = split_dataset(master_data)

# Write each sub-dataset (train, test A, test B) to its own CSV file
for name, sub_dataset in zip(output_names, sub_datasets):
    write_dict_to_csv(name, sub_dataset)