Esempio n. 1
0
    for filename in filenames:
        samples = json_load(filename)
        output = []
        for sample in tqdm(samples):
            topic = sample['topic']
            if (topic not in stats_dict):
                stats_dict[topic] = 1
            else:
                stats_dict[topic] += 1

    stats_dict_sorted = {
        k: v
        for k, v in sorted(
            stats_dict.items(), key=lambda item: item[1], reverse=True)
    }
    for topic in stats_dict_sorted:
        output.append({"topic": topic, "amount": stats_dict_sorted[topic]})

    return output


if __name__ == '__main__':
    # Usage: <count> <input_1> ... <input_count> <output_file>
    file_count = int(sys.argv[1])
    filenames = []
    # The input paths sit at argv[2] .. argv[file_count + 1]; echo each one
    # as it is collected.
    for position in range(2, file_count + 2):
        path = sys.argv[position]
        print(path)
        filenames.append(path)
    # The output path is the argument right after the last input path.
    output_file = sys.argv[file_count + 2]

    topic_stats = convert_samples(filenames)
    json_dump(topic_stats, output_file)
import sys
from tqdm import tqdm
from data.utils import json_load, json_dump
'''
Command-line arguments: <input_json> <output_json>. Example pairs:

data\empatheticdialogue\train_json.json data\empatheticdialogue\train_json_classification.json
data\empatheticdialogue\test_json.json data\empatheticdialogue\test_json_classification.json
data\empatheticdialogue\valid_json.json data\empatheticdialogue\valid_json_classification.json
'''


def convert_samples(filename):
    """Flatten dialogue samples into one classification record per turn.

    Loads the samples from ``filename`` with ``json_load``. Each sample's
    ``'context'`` value is used as the topic label and is paired with every
    item of that sample's ``'content'``.

    Returns:
        A list of ``{'text': ..., 'topic': ...}`` dicts, ordered by sample
        and then by turn.
    """
    records = []
    for entry in tqdm(json_load(filename)):
        label = entry['context']
        records.extend(
            {'text': utterance, 'topic': label}
            for utterance in entry['content'])
    return records


if __name__ == '__main__':
    # Usage: <input_json> <output_json>
    source_path, target_path = sys.argv[1], sys.argv[2]

    converted = convert_samples(source_path)
    json_dump(converted, target_path)
        txt, act, emotion = txt.split(
            ' __eou__ '), act.split(), emotion.split()

        content = []
        for t, a, e in zip(txt, act, emotion):
            utterance = {
                'text': t.rstrip('__eou__').strip(),
                'act': acts_dict[int(a)],
                'emotion': emotions_dict[int(e)]
            }
            content.append(utterance)
        sample = {'topic': topic, 'length': len(content), 'content': content}
        samples.append(sample)
    samples.sort(key=lambda x: x['length'])
    return samples


if __name__ == '__main__':
    # Usage: <overall_file> <topic_file> <text_file> <act_file>
    #        <emotion_file> <output_json>
    overall_filename = sys.argv[1]
    topic_filename = sys.argv[2]

    # Map every entry read from the overall file to the integer topic id
    # found at the same position in the topic file.
    overall_entries = get_text(overall_filename)
    topic_entries = get_text(topic_filename)
    topics_list = {}
    for overall, topic_id in zip(overall_entries, topic_entries):
        topics_list[overall] = int(topic_id)

    input_text = sys.argv[3]
    input_act = sys.argv[4]
    input_emotion = sys.argv[5]
    output_filename = sys.argv[6]

    json_dump(
        get_samples(input_text, input_act, input_emotion, topics_list),
        output_filename)
    s = re.sub(r'[.]+[\n]+[,]', ".\n", s)
    s = s.split()
    return ' '.join(s), len(s)


def get_samples(filename, word_threshold=100):
    """Collect short CSV entries as neutral-emotion samples.

    Reads ``filename`` as a UTF-8 CSV file, skips its header row, and passes
    the stripped first column of every remaining row through ``clean``. Rows
    whose cleaned length is below ``word_threshold`` are kept.

    Args:
        filename: Path of the CSV file to read.
        word_threshold: Exclusive upper bound on the length reported by
            ``clean`` for a row to be kept.

    Returns:
        A list of ``{'emotion': 'neutral', 'content': <cleaned text>}``
        dicts in file order; empty when the file has no data rows.
    """
    samples = []

    # newline='' lets the csv module handle line endings itself, so quoted
    # fields that contain newlines are parsed correctly.
    with open(filename, 'r', encoding="utf8", newline='') as f:
        data = csv.reader(f)
        # overview,headline,text,sectionLabel,title
        # Skip the header row; the default avoids StopIteration escaping
        # when the file is completely empty.
        next(data, None)

        for row in tqdm(data):
            if not row:
                # csv.reader yields [] for blank lines; there is no first
                # column to read.
                continue
            # currently using overview (not sure which is the best choice)
            # some seems to have repetition
            content, length = clean(row[0].strip())
            if length < word_threshold:
                samples.append({'emotion': 'neutral', 'content': content})

    return samples


if __name__ == '__main__':
    # Usage: <input_csv> <output_json>
    source_path, target_path = sys.argv[1], sys.argv[2]

    collected = get_samples(source_path)
    json_dump(collected, target_path)
    return output


def convert_samples(filename):
    """Split dialogue turns into two topic-labelled sets.

    Loads samples from ``filename`` with ``json_load`` and, for every turn
    in each sample's ``'content'``, builds a ``{'text', 'topic'}`` record
    from the turn's ``'text'`` and the sample's ``'topic'``. The global
    ``random`` generator is seeded with 9 and one draw per turn picks the
    set: draws of at least 0.2 go to the first list, the rest to the second.

    Returns:
        A two-element list ``[first_set, second_set]``.
    """
    loaded = json_load(filename)
    majority, minority = [], []
    # Fixed seed so the split is reproducible between runs.
    random.seed(9)
    for entry in tqdm(loaded):
        label = entry['topic']
        for turn in entry['content']:
            # Draw first, then build the record for the chosen set.
            bucket = majority if random.random() >= 0.2 else minority
            bucket.append({'text': turn['text'], 'topic': label})

    return [majority, minority]


if __name__ == '__main__':
    # Usage: <input_json> <output_json>
    input_file, output_file = sys.argv[1], sys.argv[2]
    # Topic names handed to convert_samples_with_filter.
    filter_list = [
        'Politics',
        'Attitude & Emotion',
        'Relationship',
        'Health',
    ]

    filtered = convert_samples_with_filter(input_file, filter_list)
    json_dump(filtered, output_file)
    # Unfiltered alternative:
    # json_dump(convert_samples(input_file), output_file)