def parse_toloka_lrwc(lines):
    """Yield LRWCRecord items from a Toloka LRWC TSV stream.

    The first line is a header and is discarded; judgement and confidence
    columns are converted via the module's parse_judgement/parse_confidence
    helpers.
    """
    skip_header(lines)
    for hyponym, hypernym, genitive, raw_judgement, raw_confidence in parse_tsv(lines):
        yield LRWCRecord(
            hyponym, hypernym, genitive,
            parse_judgement(raw_judgement),
            parse_confidence(raw_confidence)
        )
def parse_buriy(lines, max_text=10000000):
    """Yield BuriyRecord items from a Buriy news dump CSV stream.

    max_text raises the csv field-size limit so very long article bodies
    parse without error. Empty or '-' editions and empty topics are
    normalized to None via maybe_none.
    """
    rows = parse_csv(lines, max_field=max_text)
    skip_header(rows)
    for timestamp, url, edition, topics, title, text in rows:
        yield BuriyRecord(
            timestamp=parse_timestamp(timestamp),
            url=url,
            edition=maybe_none(edition, ('', '-')),
            topics=maybe_none(topics),
            title=title,
            text=text
        )
def parse_news(lines):
    """Yield NewsRecord items (with social-share Stats) from a news CSV stream.

    NOTE(review): a second ``parse_news`` is defined later in this file and
    shadows this one at import time — confirm which is meant to be public.
    """
    # tass raises "field larger than field limit"
    rows = parse_csv(lines, max_field=100000000)
    skip_header(rows)
    for row in rows:
        (timestamp, url, edition, topics, authors, title, text,
         fb, vk, ok, twitter, lj, tg, likes, views, comments) = none_row(row)
        timestamp = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
        authors = authors.split(',') if authors else authors
        # empty texts in meduza
        text = text or ''
        counters = (fb, vk, ok, twitter, lj, tg, likes, views, comments)
        stats = Stats(*(maybe_int(value) for value in counters))
        yield NewsRecord(
            timestamp, url, edition, topics,
            authors, title, text, stats
        )
def parse_news(lines):
    """Yield NewsRecord items from a CSV stream that needs line repair.

    Rows whose cell count does not match the header (plus one trailing
    empty cell) are dropped; embedded newlines in text fields are repaired
    with fix_new_line.
    """
    rows = parse_csv(fix_csv(lines))
    header = skip_header(rows)
    expected = len(header) + 1
    for row in rows:
        cells = list(none_row(row))
        if len(cells) != expected:
            # extra , before EOL
            # rare Д.Акулинин, а также М.Кузовлев.\n\",-,-,-,-,-,-,-,-,-
            continue
        (timestamp, url, edition, topics, authors, title, text,
         fb, vk, ok, twitter, lj, tg, likes, views, comments, _) = cells
        timestamp = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
        authors = authors.split(',') if authors else authors
        counters = (fb, vk, ok, twitter, lj, tg, likes, views, comments)
        stats = Stats(*(maybe_int(value) for value in counters))
        yield NewsRecord(
            timestamp, url,
            fix_new_line(edition), fix_new_line(topics),
            authors,
            fix_new_line(title), fix_new_line(text),
            stats
        )
def parse_meta(file, encoding='utf8'):
    """Yield one dict per TSV row, keyed by the header columns.

    file is a binary file object; it is wrapped for text decoding with the
    given encoding. Kept as a generator so no line is consumed before the
    caller starts iterating.
    """
    text_stream = TextIOWrapper(file, encoding)
    rows = parse_tsv(text_stream)
    header = skip_header(rows)
    for values in rows:
        yield dict(zip(header, values))
def parse_lenta(lines):
    """Yield LentaRecord items from a Lenta.ru CSV stream.

    Consistency fix: this duplicated the body of parse_lenta_; it now
    delegates with LentaRecord as the record class. parse_lenta_ is a
    generator function, so evaluation stays lazy exactly as before.
    """
    return parse_lenta_(lines, LentaRecord)
def parse_lenta_(lines, lenta_class):
    """Yield one lenta_class record per CSV row, header skipped.

    Generalized Lenta parser: lenta_class is any record constructor that
    accepts the row cells positionally.
    """
    rows = parse_csv(lines)
    skip_header(rows)
    for row in rows:
        yield lenta_class(*row)
def parse_ruadrect(lines):
    """Yield RuADReCTRecord items from a TSV stream, header skipped."""
    rows = parse_tsv(lines)
    skip_header(rows)
    for row in rows:
        yield RuADReCTRecord(*row)
def parse_simlex(lines):
    """Yield SimlexRecord(word1, word2, score) from a SimLex TSV stream.

    The score column is converted from string to float.
    """
    skip_header(lines)
    for left, right, raw_score in parse_tsv(lines):
        yield SimlexRecord(left, right, float(raw_score))