def build_corpus(self):
    """Load train/dev/test treebanks and build the word and label vocabularies.

    Side effects: sets self.word_vocab, self.label_vocab and the three
    self.*_treebank attributes, and prints progress plus corpus statistics.
    """

    def read_trees(path):
        # One bracketed tree per line in the file.
        with open(path) as f:
            return [fromstring(line.strip()) for line in f]

    print(f'Loading training trees from `{self.train_path}`...')
    if self.multitask == 'ccg':
        train_treebank = ccg.fromfile(self.train_path)
    else:
        train_treebank = read_trees(self.train_path)

    print(f'Loading development trees from `{self.dev_path}`...')
    dev_treebank = read_trees(self.dev_path)

    print(f'Loading test trees from `{self.test_path}`...')
    test_treebank = read_trees(self.test_path)

    if self.multitask == 'spans':
        # need trees with span-information
        train_treebank = [tree.convert() for tree in train_treebank]
        dev_treebank = [tree.convert() for tree in dev_treebank]
        test_treebank = [tree.convert() for tree in test_treebank]

    print("Constructing vocabularies...")
    if self.vocab_path is not None:
        print(f'Using word vocabulary specified in `{self.vocab_path}`')
        with open(self.vocab_path) as f:
            vocab = json.load(f)
        # Repeat each word `count` times so Vocabulary.fromlist sees the
        # same frequencies as it would from raw text.
        words = [word for word, count in vocab.items() for _ in range(count)]
    else:
        words = [word for tree in train_treebank for word in tree.words()]

    if self.multitask == 'none':
        labels = []
        words = [UNK, START] + words
    else:
        labels = [label for tree in train_treebank for label in tree.labels()]
        words = [UNK, START, STOP] + words

    word_vocab = Vocabulary.fromlist(words, unk_value=UNK)
    label_vocab = Vocabulary.fromlist(labels)

    self.word_vocab = word_vocab
    self.label_vocab = label_vocab
    self.train_treebank = train_treebank
    self.dev_treebank = dev_treebank
    self.test_treebank = test_treebank

    print('\n'.join((
        'Corpus statistics:',
        f'Vocab: {word_vocab.size:,} words, {label_vocab.size:,} nonterminals',
        f'Train: {len(train_treebank):,} sentences',
        f'Dev: {len(dev_treebank):,} sentences',
        f'Test: {len(test_treebank):,} sentences')))
def build_corpus(self):
    """Load train/dev/test treebanks and build word/label/action vocabularies.

    Applies the preprocessing the configured model needs (unlabelize, CNF,
    sentence-length and label-count filtering of the training set), then
    sets self.word_vocab, self.label_vocab, self.action_vocab and the three
    self.*_treebank attributes, printing progress and corpus statistics.
    """
    print(f'Loading training trees from `{self.train_path}`...')
    with open(self.train_path) as f:
        train_treebank = [fromstring(line.strip()) for line in f]

    print(f'Loading development trees from `{self.dev_path}`...')
    with open(self.dev_path) as f:
        dev_treebank = [fromstring(line.strip()) for line in f]

    print(f'Loading test trees from `{self.test_path}`...')
    with open(self.test_path) as f:
        test_treebank = [fromstring(line.strip()) for line in f]

    if self.unlabeled:
        print('Converting trees to unlabeled form...')
        for tree in train_treebank:
            tree.unlabelize()

    if self.model_type == 'crf':
        print('Converting trees to CNF...')
        train_treebank = [tree.cnf() for tree in train_treebank]
        if self.unlabeled:
            for tree in train_treebank:
                tree.remove_chains()

    print("Constructing vocabularies...")
    if self.vocab_path is not None:
        print(f'Using word vocabulary specified in `{self.vocab_path}`')
        with open(self.vocab_path) as f:
            vocab = json.load(f)
        # Repeat each word `count` times so Vocabulary.fromlist sees the
        # same frequencies as it would from raw text.
        words = [
            word for word, count in vocab.items() for _ in range(count)
        ]
    else:
        words = [word for tree in train_treebank for word in tree.words()]

    if self.max_sent_len > 0:
        filtered_treebank = [
            tree for tree in train_treebank
            if len(tree.words()) <= self.max_sent_len
        ]
        print(
            "Using sentences with length <= {}: {:.1%} of all training trees."
            .format(self.max_sent_len,
                    len(filtered_treebank) / len(train_treebank)))
        train_treebank = filtered_treebank

    if self.min_label_count > 1:
        counted_labels = Counter(
            [label for tree in train_treebank for label in tree.labels()])
        filtered_labels = [
            label for label, count in counted_labels.most_common()
            if count >= self.min_label_count
        ]
        # Membership test with a set: the original list made this loop
        # O(trees * labels * |filtered_labels|).
        kept_labels = set(filtered_labels)
        filtered_treebank = [
            tree for tree in train_treebank
            if all(label in kept_labels for label in tree.labels())
        ]
        print(
            "Using labels with count >= {}: {}/{} ({:.1%}) of all labels and {:.1%} of all training trees."
            .format(self.min_label_count, len(filtered_labels),
                    len(counted_labels),
                    len(filtered_labels) / len(counted_labels),
                    len(filtered_treebank) / len(train_treebank)))
        train_treebank = filtered_treebank

    labels = [label for tree in train_treebank for label in tree.labels()]

    # The CRF model additionally needs sentence-boundary tokens.
    if self.model_type == 'crf':
        words = [UNK, START, STOP] + words
    else:
        words = [UNK] + words

    word_vocab = Vocabulary.fromlist(words, unk_value=UNK)
    label_vocab = Vocabulary.fromlist(labels)

    if self.model_type.endswith('rnng'):
        # Order is very important! See DiscParser/GenParser classes to know why.
        if self.model_type == 'disc-rnng':
            actions = [SHIFT, REDUCE] + [NT(label) for label in label_vocab]
        elif self.model_type == 'gen-rnng':
            actions = [REDUCE] + [NT(label) for label in label_vocab
                                  ] + [GEN(word) for word in word_vocab]
        action_vocab = Vocabulary()
        for action in actions:
            action_vocab.add(action)
    else:
        action_vocab = Vocabulary()

    self.word_vocab = word_vocab
    self.label_vocab = label_vocab
    self.action_vocab = action_vocab
    self.train_treebank = train_treebank
    self.dev_treebank = dev_treebank
    self.test_treebank = test_treebank

    print('\n'.join((
        'Corpus statistics:',
        f'Vocab: {word_vocab.size:,} words, {label_vocab.size:,} nonterminals, {action_vocab.size:,} actions',
        f'Train: {len(train_treebank):,} sentences',
        f'Dev: {len(dev_treebank):,} sentences',
        f'Test: {len(test_treebank):,} sentences')))