Example #1
0
def parse_russe(lines):
    """Parse RUSSE word-similarity CSV lines.

    Yields one RusseSemRecord(word1, word2, sim) per row, with the
    similarity score converted to float.
    """
    for record in dict_csv(parse_csv(lines)):
        yield RusseSemRecord(
            record['word1'],
            record['word2'],
            float(record['sim']),
        )
Example #2
0
def parse_buriy(lines, max_text=10000000):
    """Parse the buriy news-corpus CSV, yielding BuriyRecord items.

    max_text raises the csv field-size limit so very long article
    bodies are accepted. Empty/placeholder edition and topics cells
    are normalized to None via maybe_none.
    """
    reader = parse_csv(lines, max_field=max_text)
    skip_header(reader)
    for timestamp, url, edition, topics, title, text in reader:
        yield BuriyRecord(
            timestamp=parse_timestamp(timestamp),
            url=url,
            edition=maybe_none(edition, ('', '-')),
            topics=maybe_none(topics),
            title=title,
            text=text,
        )
Example #3
0
def parse_news(lines):
    """Parse a news CSV dump into NewsRecord items.

    Valid rows carry one extra trailing comma, so a row is expected to
    have len(header) + 1 cells; rows of any other width are skipped as
    malformed. Share counters are collected into a Stats tuple and the
    comma-separated authors cell is split into a list when non-empty.
    """
    reader = parse_csv(fix_csv(lines))
    header = skip_header(reader)
    width = len(header) + 1  # extra , before EOL on every valid row
    for raw in reader:
        cells = list(none_row(raw))
        if len(cells) != width:
            # rare broken rows, e.g.
            # Д.Акулинин, а также М.Кузовлев.\n\",-,-,-,-,-,-,-,-,-
            continue

        (timestamp, url, edition, topics, authors, title, text,
         fb, vk, ok, twitter, lj, tg, likes, views, comments, _) = cells
        stats = Stats(*(
            maybe_int(value)
            for value in (fb, vk, ok, twitter, lj, tg,
                          likes, views, comments)
        ))
        author_list = authors.split(',') if authors else authors
        yield NewsRecord(
            datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S'),
            url,
            fix_new_line(edition),
            fix_new_line(topics),
            author_list,
            fix_new_line(title),
            fix_new_line(text),
            stats,
        )
Example #4
0
def parse_news(lines):
    """Parse a news CSV dump (tass flavour) into NewsRecord items.

    The field-size limit is raised because tass otherwise raises
    "field larger than field limit". Empty article bodies (seen in
    meduza) are normalized to the empty string.
    """
    reader = parse_csv(lines, max_field=100000000)
    skip_header(reader)
    for raw in reader:
        (timestamp, url, edition, topics, authors, title, text,
         fb, vk, ok, twitter, lj, tg, likes, views, comments) = none_row(raw)

        author_list = authors.split(',') if authors else authors
        stats = Stats(*(
            maybe_int(value)
            for value in (fb, vk, ok, twitter, lj, tg,
                          likes, views, comments)
        ))
        yield NewsRecord(
            datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S'),
            url,
            edition,
            topics,
            author_list,
            title,
            text or '',  # empty texts in meduza
            stats,
        )
Example #5
0
def parse_lenta(lines):
    """Parse a lenta.ru CSV dump, yielding one LentaRecord per data row.

    Thin wrapper over the generic parse_lenta_ with the record class
    fixed to LentaRecord; the header row is skipped there.
    """
    # Delegate to the generic parser instead of duplicating its body.
    yield from parse_lenta_(lines, LentaRecord)
Example #6
0
def parse_lenta_(lines, lenta_class):
    """Parse a lenta-style CSV, building one lenta_class per data row.

    The header row is consumed before records are produced; each
    remaining row's cells become positional arguments to lenta_class.
    """
    reader = parse_csv(lines)
    skip_header(reader)
    yield from (lenta_class(*cells) for cells in reader)