Beispiel #1
0
def load_wiki(path):
    """Yield a ``WikiRecord`` for every page found in the file at *path*.

    The lines produced by ``load_bz2_lines(path)`` are grouped into page
    tuples by ``pages_from``. Each tuple must have exactly six fields; the
    fourth and fifth are ignored. The remaining fields are handed to
    ``Extractor_``, and the result of its ``extract_()`` call is converted
    with ``WikiRecord.from_json``.

    This is a generator: nothing is read until iteration starts.
    """
    for page_id, revision, title, _, _, page in pages_from(load_bz2_lines(path)):
        extracted = Extractor_(page_id, revision, title, page).extract_()
        yield WikiRecord.from_json(extracted)
Beispiel #2
0
def load_lenta2(path):
    """Return ``parse_lenta2`` applied to the lines of the file at *path*.

    The lines are obtained from ``load_bz2_lines(path)`` and passed on
    unchanged; whatever ``parse_lenta2`` returns is returned as-is.
    """
    return parse_lenta2(load_bz2_lines(path))
Beispiel #3
0
def load_wikiner(path):
    """Yield the parsed record for each line of the file at *path*.

    Every line from ``load_bz2_lines(path)`` is passed to ``parse_wikiner``.
    Falsy results (e.g. ``None`` or an empty value) are skipped.

    This is a generator: nothing is read until iteration starts.
    """
    for raw_line in load_bz2_lines(path):
        parsed = parse_wikiner(raw_line)
        if not parsed:
            # Lines that parse to nothing are dropped.
            continue
        yield parsed
Beispiel #4
0
def load_lenta2(path):
    """Load the Lenta.Ru dataset v1.1 from the file at *path*.

    The lines obtained from ``load_bz2_lines(path)`` are passed to
    ``parse_lenta2``, and its result is returned unchanged.
    """
    return parse_lenta2(load_bz2_lines(path))