コード例 #1
0
def load_xml_docs(docpath, min_words=256):
    """Yield (index, filename, word list) for each nxml document under *docpath*.

    Documents whose word list is shorter than *min_words* are skipped and
    do not consume an index, so yielded indices stay consecutive.
    """
    doc_index = 1
    for xml_file in parser.list_xml_path(docpath):
        doc = NXMLDocument(doc_index, xml_file)
        # Guard clause: too-short documents are skipped without advancing the index.
        if len(doc.wordlist) < min_words:
            continue
        print('\t{0:03d} -> {1}'.format(doc.id, doc.filename))
        yield (doc_index, doc.filename, doc.wordlist)
        doc_index += 1
コード例 #2
0
def load_xml_docs(docpath, min_words=256):
    """
    Generator that walks every nxml document in *docpath* and yields a
    tuple of (document index, filename, word list).

    Documents with fewer than *min_words* words are skipped; the index
    only advances for documents that are actually yielded.
    """
    doc_index = 1
    for xml_file in parser.list_xml_path(docpath):
        doc = Document(doc_index, xml_file)
        # Skip short documents without consuming an index.
        if len(doc.wordlist) < min_words:
            continue
        print('\t{0:03d} -> {1}'.format(doc.id, doc.filename))
        yield (doc_index, doc.filename, doc.wordlist)
        doc_index += 1
コード例 #3
0
def process_file(date_update, fraction=0.01):
    """Process unzipped Pubmed Open-Access folder to parquet file.

    Parameters
    ----------
    date_update : datetime.date or datetime.datetime
        Date stamped into the names of the output parquet folders.
    fraction : float, optional
        Fraction of the XML files to process; values >= 1 process all files.

    Writes three parquet outputs under ``save_dir``: the article metadata,
    one row per author, and one row per affiliation.
    """
    import shutil

    print("Process Pubmed Open-Access file to parquet with fraction = %s" %
          str(fraction))
    date_update_str = date_update.strftime("%Y_%m_%d")
    # Remove stale output folders if they still exist.  NOTE: the previous
    # `subprocess.call(['rm', '-rf', 'pubmed_oa_*.parquet'])` was a no-op --
    # without a shell the glob pattern is never expanded, and the path was
    # relative to the CWD rather than save_dir.
    for stale_path in glob(os.path.join(save_dir, 'pubmed_oa_*.parquet')):
        shutil.rmtree(stale_path, ignore_errors=True)

    path_all = pp.list_xml_path(unzip_dir)
    if fraction < 1:
        # Take a sorted random sample of the file paths.
        n_sample = int(fraction * len(path_all))
        rand_index = random.sample(range(len(path_all)), n_sample)
        rand_index.sort()
        path_sample = [path_all[i] for i in rand_index]
    else:
        path_sample = path_all

    path_rdd = sc.parallelize(path_sample,
                              numSlices=10000)  # use only example path
    parse_results_rdd = path_rdd.map(
        lambda x: Row(file_name=os.path.basename(x), **pp.parse_pubmed_xml(x)))
    pubmed_oa_df = parse_results_rdd.toDF()
    pubmed_oa_df_sel = pubmed_oa_df[[
        'full_title', 'abstract', 'doi', 'file_name', 'pmc', 'pmid',
        'publication_year', 'publisher_id', 'journal', 'subjects'
    ]]
    pubmed_oa_df_sel.write.parquet(os.path.join(
        save_dir, 'pubmed_oa_%s.parquet' % date_update_str),
                                   mode='overwrite')

    # Author table: parse_name returns a collection of rows (or None) per
    # article, so flatten after filtering out the failures.
    parse_name_rdd = parse_results_rdd.map(lambda x: parse_name(x)).\
        filter(lambda x: x is not None).\
        flatMap(lambda xs: [x for x in xs])
    parse_name_df = parse_name_rdd.toDF()
    parse_name_df.write.parquet(os.path.join(
        save_dir, 'pubmed_oa_author_%s.parquet' % date_update_str),
                                mode='overwrite')

    # Affiliation table, same shape as the author pipeline above.
    parse_affil_rdd = parse_results_rdd.map(lambda x: parse_affiliation(x)).\
        filter(lambda x: x is not None).\
        flatMap(lambda xs: [x for x in xs])
    parse_affil_df = parse_affil_rdd.toDF()
    parse_affil_df.write.parquet(os.path.join(
        save_dir, 'pubmed_oa_affiliation_%s.parquet' % date_update_str),
                                 mode='overwrite')
    print('Finished parsing Pubmed Open-Access subset')
コード例 #4
0
def process_file(date_update, fraction=0.01):
    """Process unzipped Pubmed Open-Access folder to parquet file.

    Parameters
    ----------
    date_update : datetime.date or datetime.datetime
        Date stamped into the names of the output parquet folders.
    fraction : float, optional
        Fraction of the XML files to process; values >= 1 process all files.

    Writes three parquet outputs under ``save_dir``: article metadata,
    one row per author, and one row per affiliation.
    """
    import shutil

    print("Process Pubmed Open-Access file to parquet with fraction = %s" % str(fraction))
    date_update_str = date_update.strftime("%Y_%m_%d")
    # Remove stale output folders if they still exist.  NOTE: the previous
    # `subprocess.call(['rm', '-rf', 'pubmed_oa_*.parquet'])` never worked --
    # without a shell the glob is not expanded, and the path was relative to
    # the CWD instead of save_dir.
    for stale_path in glob(os.path.join(save_dir, 'pubmed_oa_*.parquet')):
        shutil.rmtree(stale_path, ignore_errors=True)

    path_all = pp.list_xml_path(unzip_dir)
    if fraction < 1:
        # Take a sorted random sample of the file paths.
        n_sample = int(fraction * len(path_all))
        rand_index = random.sample(range(len(path_all)), n_sample)
        rand_index.sort()
        path_sample = [path_all[i] for i in rand_index]
    else:
        path_sample = path_all

    path_rdd = sc.parallelize(path_sample, numSlices=10000) # use only example path
    parse_results_rdd = path_rdd.map(lambda x: Row(file_name=os.path.basename(x), **pp.parse_pubmed_xml(x)))
    pubmed_oa_df = parse_results_rdd.toDF()
    pubmed_oa_df_sel = pubmed_oa_df[['full_title', 'abstract', 'doi',
                                     'file_name', 'pmc', 'pmid',
                                     'publication_year', 'publisher_id',
                                     'journal', 'subjects']]
    pubmed_oa_df_sel.write.parquet(os.path.join(save_dir, 'pubmed_oa_%s.parquet' % date_update_str),
                                   mode='overwrite')

    # Author table: parse_name yields a collection of rows (or None) per
    # article, so filter the failures and flatten.
    parse_name_rdd = parse_results_rdd.map(lambda x: parse_name(x)).\
        filter(lambda x: x is not None).\
        flatMap(lambda xs: [x for x in xs])
    parse_name_df = parse_name_rdd.toDF()
    parse_name_df.write.parquet(os.path.join(save_dir, 'pubmed_oa_author_%s.parquet' % date_update_str),
                                mode='overwrite')

    # Affiliation table, same shape as the author pipeline above.
    parse_affil_rdd = parse_results_rdd.map(lambda x: parse_affiliation(x)).\
        filter(lambda x: x is not None).\
        flatMap(lambda xs: [x for x in xs])
    parse_affil_df = parse_affil_rdd.toDF()
    # BUG FIX: the affiliation parquet previously wrote parse_name_df (the
    # author table) instead of parse_affil_df.
    parse_affil_df.write.parquet(os.path.join(save_dir, 'pubmed_oa_affiliation_%s.parquet' % date_update_str),
                                 mode='overwrite')
    print('Finished parsing Pubmed Open-Access subset')