-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataset_analysis.py
executable file
·77 lines (60 loc) · 2.51 KB
/
dataset_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import os
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import describe
from transformers import GPT2Tokenizer, GPT2Config
from numpy import percentile
from utils import get_dataset
"""
Most of the analysis being done here is to identify
potential bottlenecks in running modeling experiments.
One key reason is that the models can't handle long inputs. For example,
running GPT2-medium on an input of size (4 * 2 * 960) crashes Colab.
Useful things to find out:
1. Distribution of conversation lengths (can use this to find a sensible num. history turns)
2. Average num of tokens per turn (Need this to find out how many tokens are present in an average input)
3. Expected context size: E[num_history_tokens]
4. Average output size
5. Distribution of knowledge length
6. Distribution of input size: E[knowledge_tokens + num_history_tokens + response_tokens]
"""
def analyze_split(split, meta):
    """Compute length statistics for one dataset split and write them to a file.

    Every item of ``split`` is unpacked as ``(history, reply, fact)``. For each
    item the following lengths are recorded:

    * ``history_length``     -- ``len(history)``
    * ``num_history_tokens`` -- sum of ``len(turn)`` over the entries of ``history``
    * ``reply_token_count``  -- ``len(reply)``
    * ``fact_token_count``   -- ``len(fact)``
    * ``num_total_tokens``   -- history tokens + reply length + fact length

    For every metric the report contains ``scipy.stats.describe`` output and
    the 50/75/90/95/99th percentiles. The report is written to
    ``analysis_results/dataset_analysis_<title>.txt``, relative to the current
    working directory.

    Args:
        split: Iterable of ``(history, reply, fact)`` triples. Presumably
            ``history`` is a list of tokenized turns and ``reply`` / ``fact``
            are token sequences -- confirm against ``utils.get_dataset``.
        meta: Mapping with a ``"title"`` entry; the title is used both in the
            report header and in the output file name.

    Returns:
        None. The result is the report file plus a confirmation line on stdout.
    """
    meta_info = defaultdict(list)
    for history, reply, fact in split:
        meta_info["history_length"].append(len(history))

        num_history_tokens = sum(len(toks) for toks in history)
        meta_info["num_history_tokens"].append(num_history_tokens)

        reply_len = len(reply)
        meta_info["reply_token_count"].append(reply_len)

        fact_len = len(fact)
        meta_info["fact_token_count"].append(fact_len)

        meta_info["num_total_tokens"].append(num_history_tokens + reply_len + fact_len)

    results_dir = 'analysis_results'
    # open(..., 'w') does not create missing parent directories, so a fresh
    # checkout would fail with FileNotFoundError without this.
    os.makedirs(results_dir, exist_ok=True)
    results_path = os.path.join(results_dir, f'dataset_analysis_{meta["title"]}.txt')

    # Explicit encoding keeps the report identical across platforms.
    with open(results_path, 'w', encoding='utf-8') as data_file:
        data_file.write(f"Split: {meta['title']}\n")
        for metric, values in meta_info.items():
            data_file.write(f"{metric}\n")
            data_file.write(f"{describe(values)}\n")
            data_file.write(f"{percentile(values, [50, 75, 90, 95, 99])}\n\n")

    print(f"Results saved to {results_path}")
def analyze_dataset(dataset):
    """Run ``analyze_split`` on the ``train`` and ``valid_freq`` splits.

    Args:
        dataset: Mapping indexed by split name; it must provide the keys
            ``"train"`` and ``"valid_freq"``.
    """
    # (key in ``dataset``, human-readable title used in the report)
    split_titles = (
        ("train", "Train"),
        ("valid_freq", "Valid Frequent"),
    )
    for split_key, title in split_titles:
        analyze_split(dataset[split_key], {"title": title})
if __name__ == '__main__':
    # Script entry point: load the GPT-2 tokenizer, fetch the dataset through
    # the project's ``get_dataset`` helper, then write the per-split reports.
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
    # Arguments are, in order: tokenizer, dataset path, dataset cache path.
    loaded_dataset = get_dataset(gpt2_tokenizer, 'processed_output', './dataset_cache')
    analyze_dataset(loaded_dataset)