Ejemplo n.º 1
0
def parse_toloka_lrwc(lines):
    """Yield an LRWCRecord per data row of a Toloka LRWC TSV stream.

    The header line is consumed first; judgement and confidence columns
    are converted via the module's parse helpers.
    """
    skip_header(lines)
    for hyponym, hypernym, genitive, judgement, confidence in parse_tsv(lines):
        yield LRWCRecord(
            hyponym,
            hypernym,
            genitive,
            parse_judgement(judgement),
            parse_confidence(confidence)
        )
Ejemplo n.º 2
0
def parse_buriy(lines, max_text=10000000):
    """Yield a BuriyRecord per CSV row of a Buriy news dump.

    max_text raises the CSV field-size limit so very long article
    bodies are not rejected by the parser.
    """
    rows = parse_csv(lines, max_field=max_text)
    skip_header(rows)
    for timestamp, url, edition, topics, title, text in rows:
        record = BuriyRecord(
            timestamp=parse_timestamp(timestamp),
            url=url,
            # empty string and '-' both mean "no edition"
            edition=maybe_none(edition, ('', '-')),
            topics=maybe_none(topics),
            title=title,
            text=text
        )
        yield record
Ejemplo n.º 3
0
Archivo: ods.py Proyecto: natasha/corus
def parse_news(lines):
    """Yield a NewsRecord per row of the ODS news CSV dump.

    Social counters are folded into a Stats value; missing counters
    become None via maybe_int.
    """
    # tass raises "field larger than field limit"
    rows = parse_csv(lines, max_field=100000000)
    skip_header(rows)
    for row in rows:
        (timestamp, url, edition, topics, authors, title, text,
         fb, vk, ok, twitter, lj, tg,
         likes, views, comments) = none_row(row)

        timestamp = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')

        authors = authors.split(',') if authors else authors

        # empty texts in meduza
        if not text:
            text = ''

        counters = (fb, vk, ok, twitter, lj, tg, likes, views, comments)
        stats = Stats(*(maybe_int(value) for value in counters))
        yield NewsRecord(timestamp, url, edition, topics, authors, title,
                         text, stats)
Ejemplo n.º 4
0
def parse_news(lines):
    """Yield NewsRecord items from a repaired news CSV stream.

    Rows whose cell count does not match the header plus one trailing
    empty cell are skipped as malformed.
    """
    rows = parse_csv(fix_csv(lines))
    header = skip_header(rows)
    expected = len(header) + 1  # extra , before EOL
    for row in rows:
        cells = list(none_row(row))
        if len(cells) != expected:
            # rare Д.Акулинин, а также М.Кузовлев.\n\",-,-,-,-,-,-,-,-,-
            continue

        (timestamp, url, edition, topics, authors, title, text,
         fb, vk, ok, twitter, lj, tg,
         likes, views, comments, _) = cells
        counters = (fb, vk, ok, twitter, lj, tg, likes, views, comments)
        stats = Stats(*(maybe_int(value) for value in counters))
        authors = authors.split(',') if authors else authors
        yield NewsRecord(
            datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S'),
            url,
            fix_new_line(edition),
            fix_new_line(topics),
            authors,
            fix_new_line(title),
            fix_new_line(text),
            stats
        )
Ejemplo n.º 5
0
def parse_meta(file, encoding='utf8'):
    """Yield one dict per metadata row, keyed by the TSV header columns.

    `file` is a binary stream; it is wrapped for text decoding with the
    given encoding before TSV parsing.
    """
    text_stream = TextIOWrapper(file, encoding)
    rows = parse_tsv(text_stream)
    columns = skip_header(rows)
    for values in rows:
        yield dict(zip(columns, values))
Ejemplo n.º 6
0
def parse_lenta(lines):
    """Yield a LentaRecord for each data row of a Lenta CSV stream."""
    rows = parse_csv(lines)
    skip_header(rows)
    for row in rows:
        yield LentaRecord(*row)
Ejemplo n.º 7
0
def parse_lenta_(lines, lenta_class):
    """Yield one `lenta_class` instance per data row of a Lenta CSV stream.

    `lenta_class` is the record constructor, applied positionally to
    each row's cells.
    """
    rows = parse_csv(lines)
    skip_header(rows)
    for row in rows:
        yield lenta_class(*row)
Ejemplo n.º 8
0
def parse_ruadrect(lines):
    """Yield a RuADReCTRecord for each data row of a RuADReCT TSV stream."""
    rows = parse_tsv(lines)
    skip_header(rows)
    for row in rows:
        yield RuADReCTRecord(*row)
Ejemplo n.º 9
0
def parse_simlex(lines):
    """Yield a SimlexRecord per SimLex TSV row, with the score as float."""
    skip_header(lines)
    for word1, word2, score in parse_tsv(lines):
        yield SimlexRecord(word1, word2, float(score))