Ejemplo n.º 1
0
def generate_url_updates():
    """Yield one JSON update document per run of lines sharing a paper id.

    Lines come from ``read_gzip_lines(PAPERURLS_FILE, _ENCODING)`` and are
    split on tabs into exactly three fields: the id, an ignored middle
    field, and the URL (stripped of surrounding whitespace).  Consecutive
    lines with the same id are merged into a single document of the form
    ``{'id': <id>, 'urls': {'set': [<url>, ...]}}`` serialized with
    ``json.dumps``.

    Grouping is by adjacency only: an id that reappears after a different
    id produces a second, separate document.

    Yields:
        str: JSON text for each run of lines, including the final run in
        the input.

    Raises:
        ValueError: if a line does not split into exactly three
            tab-separated fields.
    """
    current_id = None
    urls = []
    for line in read_gzip_lines(PAPERURLS_FILE, _ENCODING):
        next_id, _, url = line.split('\t')
        if next_id != current_id:
            # The id changed: emit what was collected for the previous id.
            if urls:
                yield json.dumps({'id': current_id, 'urls': {'set': urls}})
            current_id = next_id
            urls = []
        urls.append(url.strip())
    # Flush the last group; without this the URLs of the final id in the
    # input would never be emitted, because no later id change triggers it.
    if urls:
        yield json.dumps({'id': current_id, 'urls': {'set': urls}})
Ejemplo n.º 2
0
def generate_reference_updates():
    """Yield one JSON update document per run of lines sharing a paper id.

    Lines come from ``read_gzip_lines(REFERENCES_FILE, _ENCODING)`` and are
    split on tabs into exactly two fields: the paper id and a reference
    (stripped of surrounding whitespace).  Consecutive lines with the same
    id are merged into a single document of the form
    ``{'PaperId': <id>, 'References': {'set': [<ref>, ...]}}`` serialized
    with ``json.dumps``.

    Grouping is by adjacency only: an id that reappears after a different
    id produces a second, separate document.

    Yields:
        str: JSON text for each run of lines, including the final run in
        the input.

    Raises:
        ValueError: if a line does not split into exactly two
            tab-separated fields.
    """
    current_id = None
    references = []
    for line in read_gzip_lines(REFERENCES_FILE, _ENCODING):
        next_id, ref = line.split('\t')
        if next_id != current_id:
            # The id changed: emit what was collected for the previous id.
            if references:
                yield json.dumps({
                    'PaperId': current_id,
                    'References': {
                        'set': references
                    }
                })
            current_id = next_id
            references = []
        references.append(ref.strip())
    # Flush the last group; without this the references of the final paper
    # in the input would never be emitted.
    if references:
        yield json.dumps({
            'PaperId': current_id,
            'References': {
                'set': references
            }
        })
Ejemplo n.º 3
0
def generate_paper_author_affiliations():
    """Yield ``(paper_id, author_ids)`` for each run of records of a paper.

    Records are produced by ``generate_json_dict`` over
    ``read_gzip_lines(PAPER_AUTHOR_FILE, _ENCODING)`` and each one is turned
    into a ``PaperAuthorAffiliation``.  Consecutive records with the same
    ``PaperId`` are grouped; within a group, records are keyed by
    ``int(AuthorSequenceNumber)``, so a later record with the same sequence
    number replaces an earlier one.

    A progress counter is printed to stdout every 10,000 records, with a
    line break every 100,000 records.

    Yields:
        tuple: ``(PaperId, [AuthorId, ...])`` with author ids ordered by
        ascending sequence number, for every group including the final one.
    """
    current_paper_id = None
    rels = dict()
    # NOTE(review): the schema name passed here is plural while the record
    # class used below is singular -- confirm both names exist as intended.
    for idx, thing in enumerate(
            generate_json_dict(PaperAuthorAffiliations,
                               read_gzip_lines(PAPER_AUTHOR_FILE, _ENCODING))):
        if 0 == idx % 10_000:
            print(f'{idx}', end=' ')
        if 0 == idx % 100_000:
            print()
        rel = PaperAuthorAffiliation(**thing)
        if rel.PaperId != current_paper_id:
            # The paper changed: emit the authors gathered for the last one.
            if rels:
                yield current_paper_id, [
                    rels[seq].AuthorId for seq in sorted(rels)
                ]
            current_paper_id = rel.PaperId
            rels = dict()
        rels[int(rel.AuthorSequenceNumber)] = rel
    # Flush the last group; without this the authors of the final paper in
    # the input would never be emitted.
    if rels:
        yield current_paper_id, [
            rels[seq].AuthorId for seq in sorted(rels)
        ]
Ejemplo n.º 4
0
def read_all():
    """Yield every item from each file returned by ``file_list()``, in order.

    Each file is read through ``read_gzip_lines`` with ``_ENCODING``.
    """
    for path in file_list():
        lines = read_gzip_lines(path, _ENCODING)
        yield from lines
Ejemplo n.º 5
0
def read_all():
    """Yield every item that ``read_gzip_lines`` produces for ``_DATA``.

    ``_ENCODING`` is passed through as the second argument.
    """
    lines = read_gzip_lines(_DATA, _ENCODING)
    yield from lines
Ejemplo n.º 6
0
def generate_denormailzed_papers():
    """Yield every item ``read_gzip_lines`` produces for ``DENORM_PAPER_FILE``.

    ``_ENCODING`` is passed through as the second argument; items are
    forwarded unchanged.
    """
    lines = read_gzip_lines(DENORM_PAPER_FILE, _ENCODING)
    yield from lines
Ejemplo n.º 7
0
def generate_conference_instance_updates():
    """Yield every item ``read_gzip_lines`` produces for ``CONF_INST_UPDATE_FILE``.

    ``_ENCODING`` is passed through as the second argument; items are
    forwarded unchanged.
    """
    lines = read_gzip_lines(CONF_INST_UPDATE_FILE, _ENCODING)
    yield from lines
Ejemplo n.º 8
0
def generate_conference_series_updates():
    """Yield every item ``read_gzip_lines`` produces for ``CONF_SER_UPDATE_FILE``.

    ``_ENCODING`` is passed through as the second argument; items are
    forwarded unchanged.
    """
    lines = read_gzip_lines(CONF_SER_UPDATE_FILE, _ENCODING)
    yield from lines
Ejemplo n.º 9
0
def generate_journal_updates():
    """Yield every item ``read_gzip_lines`` produces for ``JOURNAL_UPDATE_FILE``.

    ``_ENCODING`` is passed through as the second argument; items are
    forwarded unchanged.
    """
    lines = read_gzip_lines(JOURNAL_UPDATE_FILE, _ENCODING)
    yield from lines
Ejemplo n.º 10
0
def generate_author_updates():
    """Yield every item ``read_gzip_lines`` produces for ``AUTHOR_UPDATE_FILE``.

    ``_ENCODING`` is passed through as the second argument; items are
    forwarded unchanged.
    """
    lines = read_gzip_lines(AUTHOR_UPDATE_FILE, _ENCODING)
    yield from lines
Ejemplo n.º 11
0
def generate_conference_series():
    """Yield the output of ``generate_json_string`` for ``ConferenceSeries``.

    The input handed to ``generate_json_string`` is
    ``read_gzip_lines(CONF_SERIES_FILE, _ENCODING)``.
    """
    raw_lines = read_gzip_lines(CONF_SERIES_FILE, _ENCODING)
    yield from generate_json_string(ConferenceSeries, raw_lines)
Ejemplo n.º 12
0
def generate_conference_instances():
    """Yield the output of ``generate_json_string`` for ``ConferenceInstances``.

    The input handed to ``generate_json_string`` is
    ``read_gzip_lines(CONF_INST_FILE, _ENCODING)``.
    """
    raw_lines = read_gzip_lines(CONF_INST_FILE, _ENCODING)
    yield from generate_json_string(ConferenceInstances, raw_lines)
Ejemplo n.º 13
0
def generate_journals():
    """Yield the output of ``generate_json_string`` for ``Journals``.

    The input handed to ``generate_json_string`` is
    ``read_gzip_lines(JOURNALS_FILE, _ENCODING)``.
    """
    raw_lines = read_gzip_lines(JOURNALS_FILE, _ENCODING)
    yield from generate_json_string(Journals, raw_lines)
Ejemplo n.º 14
0
def generate_papers():
    """Yield the output of ``generate_json_string`` for ``Papers``.

    The input handed to ``generate_json_string`` is
    ``read_gzip_lines(PAPERS_FILE, _ENCODING)``.
    """
    raw_lines = read_gzip_lines(PAPERS_FILE, _ENCODING)
    yield from generate_json_string(Papers, raw_lines)
Ejemplo n.º 15
0
def generate_authors():
    """Yield the output of ``generate_json_string`` for ``Authors``.

    The input handed to ``generate_json_string`` is
    ``read_gzip_lines(AUTHORS_FILE, _ENCODING)``.
    """
    raw_lines = read_gzip_lines(AUTHORS_FILE, _ENCODING)
    yield from generate_json_string(Authors, raw_lines)
Ejemplo n.º 16
0
def read_gzip_lines_utf8(path):
    """Yield every item ``read_gzip_lines`` produces for *path*.

    The module-level ``_ENCODING`` is supplied as the ``encoding`` keyword
    argument; items are forwarded unchanged.
    """
    lines = read_gzip_lines(path, encoding=_ENCODING)
    yield from lines