def parse_file(f):
    """
    Parses a corpus vertical file (or its stripped version containing
    only tags) and collects one metadata record per closed 'opus' tag.

    Each line is classified via vp.parse_line(), which yields a
    (tag, start, attrs) triple. Lines with start == True are handled as
    opening tags, lines with start == False as closing tags, and all other
    lines are handled as tokens and passed to vp.parse_word().

    arguments:
    f -- an iterable of lines (e.g. an opened vertical file)

    returns:
    a tuple of dicts, one per closing 'opus' tag; each dict holds the
    attributes of the opening 'opus' tag(s) seen since the previous record
    plus two computed keys:
      'wordcount' -- number of distinct values returned by vp.parse_word()
      'poscount'  -- total number of token lines
    """
    opus_list = []
    metadata = {}
    # Only the number of distinct tokens and the total number of tokens
    # are needed, so we keep a set and a counter instead of storing every
    # token of the (possibly very large) structure in a list.
    unique_words = set()
    pos_count = 0
    for line in f:
        tag, start, attrs = vp.parse_line(line)
        if start is True:
            if tag == 'opus':
                # NOTE(review): this output is kept as-is because callers
                # may rely on it; it looks like progress/debug information.
                print('t = %s, start: %s, attrs: %s' % (tag, start, attrs))
                metadata.update(attrs)
        elif start is False:
            if tag == 'opus':
                metadata['wordcount'] = len(unique_words)
                metadata['poscount'] = pos_count
                opus_list.append(metadata)
                # Counters are reset only here, i.e. any token lines found
                # before the next opening 'opus' tag are attributed to the
                # next closed 'opus'.
                metadata = {}
                unique_words = set()
                pos_count = 0
        else:
            unique_words.add(vp.parse_word(line))
            pos_count += 1
    return tuple(opus_list)
def parse_file(f):
    """
    Parses a corpus vertical file (or its stripped version containing
    only tags) and collects one metadata record per closed 'div' tag.

    Each line is classified via vp.parse_line(), which yields a
    (tag, start, attrs) triple. Lines with start == True are handled as
    opening tags, lines with start == False as closing tags, and any other
    line is counted as a single position.

    arguments:
    f -- an iterable of lines (e.g. an opened vertical file)

    returns:
    a tuple of dicts, one per closing 'div' tag; each dict holds the
    attributes of the opening 'div' tag(s) seen since the previous record
    with the following keys set by this function:
      '__doc__'  -- dict of attributes of the enclosing 'doc' tag
      'group'    -- the 'group' attribute of the enclosing 'doc' (or None)
      'id'       -- the div's 'id' passed through normalize_div_id()
      'poscount' -- number of position lines counted since the previous
                    closing 'div' (or since the start of the input)
    """
    finished_divs = []
    doc_attrs = {}
    div_record = {}
    num_positions = 0

    for row in f:
        tag, start, attrs = vp.parse_line(row)

        # Anything that is neither an opening nor a closing tag
        # is counted as a position.
        if start is not True and start is not False:
            num_positions += 1
            continue

        if tag == 'doc':
            if start:
                doc_attrs.update(attrs)
            else:
                # Rebind (do not clear) so that the divs already collected
                # keep the attributes of their own document.
                doc_attrs = {}
        elif tag == 'div':
            if start:
                # Values derived from the enclosing document override
                # same-named attributes of the div itself.
                attrs['__doc__'] = doc_attrs
                attrs['group'] = doc_attrs.get('group')
                attrs['id'] = normalize_div_id(attrs['id'])
                div_record.update(attrs)
            else:
                div_record['poscount'] = num_positions
                finished_divs.append(div_record)
                div_record = {}
                num_positions = 0

    return tuple(finished_divs)