def _generate_examples(self, files):
    """Yield ``(line_num, example)`` pairs for every block of every file.

    Each block produced by ``iter_blocks`` is transposed into exactly ten
    parallel columns (id, form, lemma, upos, xpos, feats, head, deprel,
    deps, misc). When ``self.config.deps`` is truthy, the deps column is
    expanded into one flat list of ``{'id', 'head', 'rel'}`` dicts, and a
    block is skipped if any of its head indices is greater than or equal
    to the number of words in the block.
    """
    for path in files:
        logging.info("⏳ Generating examples from = %s", path)
        for line_num, block in iter_blocks(filename=path):
            # Transpose rows into columns; exactly ten columns are expected.
            columns = [list(column) for column in zip(*block)]
            (ids, words, lemma, upos, xpos, feats, head, deprel, deps,
             misc) = columns

            if self.config.deps:
                arcs = []
                for token_idx, cell in enumerate(deps):
                    # A cell holds '|'-separated labels; each label is split
                    # once on ':' into a head part and a relation part.
                    for label in cell.split('|'):
                        parts = label.split(':', maxsplit=1)
                        arcs.append({
                            'id': token_idx,
                            'head': int(parts[0]),
                            'rel': parts[-1],
                        })
                deps = arcs
                word_count = len(words)
                # Drop blocks that reference a head beyond the word list.
                if any(arc['head'] >= word_count for arc in deps):
                    continue

            example = {
                "id": ids,
                "form": words,
                "lemma": lemma,
                "upos": upos,
                "xpos": xpos,
                "feats": feats,
                "head": head,
                "deprel": deprel,
                "deps": deps,
                "misc": misc,
            }
            yield line_num, example
def _generate_examples(self, filepath):
    """Yield ``(line_num, example)`` pairs for every block in *filepath*.

    Each block from ``iter_blocks`` is transposed into exactly two
    columns, which are emitted under the keys ``"words"`` and ``"bio"``.
    """
    logging.info("⏳ Generating examples from = %s", filepath)
    for line_num, block in iter_blocks(filename=filepath):
        # Transpose rows into columns; exactly two columns are expected.
        columns = [list(column) for column in zip(*block)]
        words, bio = columns
        example = {"words": words, "bio": bio}
        yield line_num, example
def iter(self, filename: str, fields, multi_field: str = None, split=None, strip=None, proxy_property: dict = None):
    """Yield one ``Example`` per processed item of every block in *filename*.

    ``self.build_slice`` supplies the field list and one column selector
    per field. When *proxy_property* is given, each proxy field present in
    the field list reuses the selector of its source field. Every block is
    transposed into columns, narrowed by the selectors and passed to
    ``self.post_fn``, which returns ``(processed, more)``: when *more* is
    truthy *processed* is iterated and each item becomes an example,
    otherwise *processed* itself is a single example. Failures while
    building an example are printed with the block's line number and the
    item is skipped.
    """
    fields, fields_slices = self.build_slice(fields, multi_field, proxy_property)

    if proxy_property is not None:
        position_of = {field[0]: pos for pos, field in enumerate(fields)}
        for proxy, source in proxy_property.items():
            if proxy not in position_of:
                continue
            # The proxy field reads the same columns as its source field.
            fields_slices[position_of[proxy]] = fields_slices[position_of[source]]

    # Materialised up front so the progress bar receives a sized sequence.
    blocks = list(iter_blocks(filename, split, strip))
    for line_num, block in tqdm(blocks):
        columns = [list(column) for column in zip(*block)]
        selected = [columns[field_slice] for field_slice in fields_slices]
        processed, more = self.post_fn(selected)

        # Treat the single-example case as a one-item batch.
        candidates = processed if more else [processed]
        for item in candidates:
            try:
                yield Example.fromlist(item, fields)
            except Exception as e:
                print(line_num, e)
def build_vocabs(data_dir, train_file, dev_file=None, test_file=None, min_freq=5):
    """Count column values in the given files and write one vocab file each.

    Vocabulary files are written to ``<data_dir>/vocabs/<feature>.txt``,
    one sorted entry per line. If any of those files already exists the
    function returns immediately without reading or writing anything.

    Args:
        data_dir: Directory that contains the data files and receives the
            ``vocabs`` sub-directory.
        train_file: File name (relative to *data_dir*) to read.
        dev_file: Optional second file name; skipped when ``None``.
        test_file: Optional third file name; skipped when ``None``.
        min_freq: Entries of the ``word`` and ``word_char`` vocabularies
            are kept only when their count is strictly greater than this.
    """
    # feature name -> (column index in a transposed block, counter)
    counters = {
        'word': (1, Counter()),
        'lemma': (2, Counter()),
        'upos': (3, Counter()),
        'xpos': (4, Counter()),
        'feats': (5, Counter()),
        'deprel': (7, Counter()),
        # Character-level counts taken from the word column.
        'word_char': (1, Counter()),
        # Relation labels taken from the deps column.
        'deps': (8, Counter()),
    }

    vocab_dir = os.path.join(data_dir, 'vocabs')
    if any(os.path.exists(os.path.join(vocab_dir, f'{key}.txt'))
           for key in counters):
        return
    os.makedirs(vocab_dir, exist_ok=True)

    for file_name in (train_file, dev_file, test_file):
        # dev_file/test_file default to None; joining None onto a path
        # raises TypeError, so absent files are skipped.
        if file_name is None:
            continue
        for _, block in iter_blocks(filename=os.path.join(data_dir, file_name)):
            values = [list(value) for value in zip(*block)]
            for name, (row, counter) in counters.items():
                if 'char' in name:
                    # Count every character of every word.
                    counter.update(itertools.chain(*values[row]))
                elif name == 'deps':
                    try:
                        # Each cell holds '|'-separated 'head:rel' labels;
                        # only the part after the first ':' is counted.
                        rels = [
                            label.split(':', maxsplit=1)[1]
                            for dep in values[row]
                            for label in dep.split('|')
                        ]
                    except IndexError:
                        # A label without ':' (or a missing deps column):
                        # count the placeholder instead of the block.
                        counter.update('_')
                    else:
                        counter.update(rels)
                else:
                    counter.update(values[row])

    for feat, (_, counter) in counters.items():
        if 'word' in feat:
            # Applies to both 'word' and 'word_char'.
            kept = [token for token, count in counter.items()
                    if count > min_freq]
        else:
            kept = list(counter)
        with open(os.path.join(vocab_dir, f'{feat}.txt'), mode='w') as f:
            f.write('\n'.join(sorted(kept)))
def _generate_examples(self, files):
    """Yield ``(line_num, example)`` pairs for every block of every file.

    Each block from ``iter_blocks`` is transposed into columns. The first
    column is emitted as ``"form"``, the second as ``"predicate"`` and all
    remaining columns (possibly none) as the list ``"arguments"``.
    """
    for path in files:
        logging.info("⏳ Generating examples from = %s", path)
        for line_num, block in iter_blocks(filename=path):
            columns = [list(column) for column in zip(*block)]
            # At least two columns are required; the rest are role columns.
            words, predicate, *roles = columns
            example = {
                "form": words,
                "predicate": predicate,
                "arguments": roles,
            }
            yield line_num, example
def build_vocabs(data_dir, *files):
    """Build ``predicate.txt`` and ``arguments.txt`` under ``<data_dir>/vocabs``.

    Column 1 of every block is counted as predicates and columns 2 onward
    as argument tags. If either vocabulary file already exists the
    function returns immediately. Each tag list is fully computed before
    its file is opened, so a failure cannot leave behind an empty file
    that would make later calls return early.

    Args:
        data_dir: Directory that receives the ``vocabs`` sub-directory.
        *files: Paths passed one by one to ``iter_blocks``.
    """
    # feature name -> (column selector in a transposed block, counter)
    counters = {
        'predicate': (1, Counter()),
        'arguments': (slice(2, None), Counter()),
    }

    vocab_dir = os.path.join(data_dir, 'vocabs')
    if any(os.path.exists(os.path.join(vocab_dir, f'{key}.txt'))
           for key in counters):
        return
    os.makedirs(vocab_dir, exist_ok=True)

    for filename in files:
        for _, block in iter_blocks(filename=filename):
            values = [list(value) for value in zip(*block)]
            for row, counter in counters.values():
                current = values[row]
                if not current:
                    continue
                if isinstance(current[0], list):
                    # A slice selector yields a list of columns.
                    for column in current:
                        counter.update(column)
                else:
                    counter.update(current)

    for feat, (_, counter) in counters.items():
        if feat == 'predicate':
            # '_' always comes first, whether or not it occurred in the data.
            tags = ['_'] + sorted(tag for tag in counter if tag != '_')
        elif feat == 'arguments':
            seen = set(counter)
            seen.discard('O')
            specials = ['O']
            if 'B-V' in seen:
                seen.discard('B-V')
                specials.append('B-V')
            # Drop the two-character 'B-'/'I-' prefix to get the role names,
            # then emit every role in both B- and I- form.
            # NOTE(review): an 'I-V' tag in the data yields role 'V', so
            # 'B-V' would appear twice in the output - confirm intended.
            roles = sorted({tag[2:] for tag in seen})
            tags = (specials
                    + [f'B-{role}' for role in roles]
                    + [f'I-{role}' for role in roles])
        else:
            tags = ['_']
        with open(os.path.join(vocab_dir, f'{feat}.txt'), mode='w') as f:
            f.write('\n'.join(tags))
def build_vocabs(data_dir, *files):
    """Build ``<data_dir>/vocabs/bio.txt`` from column 1 of the given files.

    The file lists ``O`` first, followed by every other tag seen in the
    data in sorted order, one per line. If ``bio.txt`` already exists the
    function returns immediately. The tag list is computed before the
    file is opened and ``O`` is emitted even when it never occurs, so a
    data set without ``O`` no longer raises after creating an empty file.

    Args:
        data_dir: Directory that receives the ``vocabs`` sub-directory.
        *files: Paths passed one by one to ``iter_blocks``.
    """
    vocab_dir = os.path.join(data_dir, 'vocabs')
    vocab_path = os.path.join(vocab_dir, 'bio.txt')
    if os.path.exists(vocab_path):
        return
    os.makedirs(vocab_dir, exist_ok=True)

    counter = Counter()
    for filename in files:
        for _, block in iter_blocks(filename=filename):
            columns = [list(value) for value in zip(*block)]
            counter.update(columns[1])

    tags = ['O'] + sorted(tag for tag in counter if tag != 'O')
    with open(vocab_path, mode='w') as f:
        f.write('\n'.join(tags))