def parse_russe(lines):
    # RUSSE semantic similarity: each record is a word pair with a similarity score
    records = parse_csv(lines)
    items = dict_csv(records)
    for item in items:
        word1 = item['word1']
        word2 = item['word2']
        sim = float(item['sim'])
        yield RusseSemRecord(word1, word2, sim)

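# parse_csv and dict_csv are assumed above but not defined in this section.
# A minimal sketch of what they could look like, given the call sites: thin
# wrappers over the stdlib csv module. Treat the signatures as assumptions,
# not the library's actual code.
import csv

def parse_csv(lines, delimiter=',', max_field=None):
    # optionally raise the field size limit for corpora with huge text cells
    if max_field:
        csv.field_size_limit(max_field)
    return csv.reader(lines, delimiter=delimiter)

def dict_csv(rows):
    # pair every row with the header, yielding one dict per record
    header = next(rows)
    for row in rows:
        yield dict(zip(header, row))
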
def parse_buriy(lines, max_text=10000000):
    # article texts can be huge, raise the csv field limit
    rows = parse_csv(lines, max_field=max_text)
    skip_header(rows)
    for row in rows:
        timestamp, url, edition, topics, title, text = row
        timestamp = parse_timestamp(timestamp)
        edition = maybe_none(edition, ('', '-'))  # '' and '-' both mean "no edition"
        topics = maybe_none(topics)
        yield BuriyRecord(
            timestamp=timestamp,
            url=url,
            edition=edition,
            topics=topics,
            title=title,
            text=text
        )

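# skip_header, maybe_none and parse_timestamp are also assumed. Hedged
# sketches consistent with how parse_buriy calls them; in particular the
# timestamp format in the Buriy dumps is a guess and may differ.
from datetime import datetime

def skip_header(rows):
    # consume and return the header row so callers iterate over data rows only
    return next(rows)

def maybe_none(value, none=('',)):
    # map placeholder cells to None; returns None implicitly when value matches
    if value not in none:
        return value

def parse_timestamp(value, format='%Y-%m-%d %H:%M:%S'):
    # assumed layout, for illustration only
    return datetime.strptime(value, format)
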
def parse_news(lines):
    rows = parse_csv(fix_csv(lines))
    header = skip_header(rows)
    for row in rows:
        row = list(none_row(row))
        if len(row) != len(header) + 1:
            # every data line carries an extra , before EOL, so valid rows
            # have one cell more than the header; skip the rare rows where an
            # unescaped quote breaks the cell count, e.g.
            # Д.Акулинин, а также М.Кузовлев.\n\",-,-,-,-,-,-,-,-,-
            continue
        (timestamp, url, edition, topics, authors, title, text,
         fb, vk, ok, twitter, lj, tg, likes, views, comments, _) = row
        timestamp = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
        stats = Stats(
            maybe_int(fb), maybe_int(vk), maybe_int(ok),
            maybe_int(twitter), maybe_int(lj), maybe_int(tg),
            maybe_int(likes), maybe_int(views), maybe_int(comments)
        )
        if authors:
            authors = authors.split(',')
        yield NewsRecord(
            timestamp, url,
            fix_new_line(edition), fix_new_line(topics),
            authors,
            fix_new_line(title), fix_new_line(text),
            stats
        )

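# none_row and fix_new_line are referenced above without definitions. Hedged
# sketches matching the call sites: '-' cells become None, and escaped newline
# sequences inside cells become real newlines. fix_csv, which presumably
# repairs broken quoting before the csv reader sees the lines, is too
# dump-specific to guess at and is left out.
def none_row(row):
    # this dump uses '-' as its missing-value marker
    for cell in row:
        if cell == '-':
            cell = None
        yield cell

def fix_new_line(text):
    # assumed behavior: texts store newlines as the two characters '\' + 'n';
    # must tolerate None, since none_row may have blanked the cell
    if text is not None:
        return text.replace('\\n', '\n')
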
def parse_news(lines):
    # tass raises "field larger than field limit" with the default csv limit
    rows = parse_csv(lines, max_field=100000000)
    skip_header(rows)
    for row in rows:
        (timestamp, url, edition, topics, authors, title, text,
         fb, vk, ok, twitter, lj, tg, likes, views, comments) = none_row(row)
        timestamp = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
        if authors:
            authors = authors.split(',')
        # meduza has empty texts, normalize None to ''
        text = text or ''
        stats = Stats(
            maybe_int(fb), maybe_int(vk), maybe_int(ok),
            maybe_int(twitter), maybe_int(lj), maybe_int(tg),
            maybe_int(likes), maybe_int(views), maybe_int(comments)
        )
        yield NewsRecord(
            timestamp, url, edition, topics,
            authors, title, text, stats
        )

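# The two parse_news functions above share a name, so they presumably live in
# separate source modules, one per news dump format. The containers they fill
# are not defined in this section either; a hedged sketch using plain
# namedtuples with the field order the constructor calls imply, plus
# maybe_int, which coerces count cells to int while passing through the None
# left by none_row.
from collections import namedtuple

Stats = namedtuple('Stats', [
    'fb', 'vk', 'ok', 'twitter', 'lj', 'tg',
    'likes', 'views', 'comments'
])

NewsRecord = namedtuple('NewsRecord', [
    'timestamp', 'url', 'edition', 'topics',
    'authors', 'title', 'text', 'stats'
])

def maybe_int(value):
    # counts arrive as strings; missing cells are already None
    if value is not None:
        return int(value)
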
def parse_lenta(lines):
    rows = parse_csv(lines)
    skip_header(rows)
    for cells in rows:
        yield LentaRecord(*cells)

def parse_lenta_(lines, lenta_class):
    rows = parse_csv(lines)
    skip_header(rows)
    for cells in rows:
        yield lenta_class(*cells)

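# Usage sketch. parse_lenta_ exists so that different Lenta dump versions,
# which differ only in their columns, can share one parsing loop: the caller
# passes a record class whose fields match the CSV header. The record class,
# file name and gzip handling below are assumptions for illustration.
import gzip
from collections import namedtuple

LentaRecord = namedtuple('LentaRecord', ['url', 'title', 'text', 'topic', 'tags'])

def load_lenta(path):
    # assume the dump ships as a gzipped CSV; stream it lazily
    with gzip.open(path, mode='rt', encoding='utf8') as file:
        yield from parse_lenta(file)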