def load_metas(path, pattern, offset, count, load):
    """Yield every item that ``parse_meta`` produces for the selected members.

    The pipeline is: ``load(path, offset)`` -> ``match_names(..., pattern)``
    -> ``head(..., count)``; each surviving member's ``file`` attribute is
    then handed to ``parse_meta`` and its items are yielded one by one.

    Args:
        path: Passed through to ``load`` as the first argument.
        pattern: Passed to ``match_names`` to select members.
        offset: Passed positionally to ``load`` as the second argument.
        count: Passed to ``head`` to limit the number of members.
        load: Callable invoked as ``load(path, offset)``.
    """
    selected = head(match_names(load(path, offset), pattern), count)
    for member in selected:
        for meta in parse_meta(member.file):
            yield meta
def load_taiga_social(path, offset=3985892864, count=4):
    """Yield the items ``parse_social`` produces for matching tar members.

    Members come from ``load_tar(path, offset=offset)``, are filtered with
    ``match_names`` using the ``'*/texts/*.txt'`` pattern and limited with
    ``head(..., count)``. For each member, ``parse_filename_id`` is applied
    to its name and the result is used as a key into ``NETWORKS``; the
    looked-up value is passed to ``parse_social`` together with the
    member's ``file``.

    Args:
        path: Passed through to ``load_tar``.
        offset: Forwarded to ``load_tar`` as the ``offset`` keyword.
        count: Passed to ``head`` to limit the number of members.
    """
    members = load_tar(path, offset=offset)
    members = match_names(members, '*/texts/*.txt')
    for member in head(members, count):
        # Distinct names for the lookup key and the looked-up value.
        network_key = parse_filename_id(member.name)
        network = NETWORKS[network_key]
        for item in parse_social(member.file, network):
            yield item
def load_texts(path, pattern, offset, count, parse_id, load, encoding='utf8'):
    """Yield one ``TaigaRecord`` per selected member, carrying its decoded text.

    Members come from ``load(path, offset=offset)``, are filtered with
    ``match_names(..., pattern)`` and limited with ``head(..., count)``.
    Each member's ``file`` is wrapped in a ``TextIOWrapper`` using
    ``encoding`` and read in full.

    Args:
        path: Passed through to ``load``.
        pattern: Passed to ``match_names`` to select members.
        offset: Forwarded to ``load`` as the ``offset`` keyword.
        count: Passed to ``head`` to limit the number of members.
        parse_id: Callable applied to each member's ``name`` to get the id.
        load: Callable invoked as ``load(path, offset=offset)``.
        encoding: Text encoding handed to ``TextIOWrapper``.

    Yields:
        ``TaigaRecord(id=..., meta=None, text=...)`` for each member.
    """
    selected = head(match_names(load(path, offset=offset), pattern), count)
    for member in selected:
        record_id = parse_id(member.name)
        # Keep the wrapper bound to a local for the duration of the
        # iteration; it is not closed explicitly here.
        reader = TextIOWrapper(member.file, encoding)
        text = reader.read()
        yield TaigaRecord(id=record_id, meta=None, text=text)