def load_xml_docs(docpath, min_words=256):
    idx = 1
    path_xml = parser.list_xml_path(docpath)
    for filename in path_xml:
        document = NXMLDocument(idx, filename)
        if len(document.wordlist) >= min_words:
            print('\t{0:03d} -> {1}'.format(document.id, document.filename))
            yield (idx, document.filename, document.wordlist)
            idx += 1
def load_xml_docs(docpath, min_words=256): """ Generator function that goes through all nxml documents in the path and returns a tuple of document index, filename and word list. """ idx = 1 path_xml = parser.list_xml_path(docpath) for filename in path_xml: document = Document(idx, filename) if len(document.wordlist) >= min_words: print('\t{0:03d} -> {1}'.format(document.id, document.filename)) yield (idx, document.filename, document.wordlist) idx += 1
def process_file(date_update, fraction=0.01): """Process unzipped Pubmed Open-Access folder to parquet file""" print("Process Pubmed Open-Access file to parquet with fraction = %s" % str(fraction)) date_update_str = date_update.strftime("%Y_%m_%d") if glob(os.path.join(save_dir, 'pubmed_oa_*.parquet')): subprocess.call(['rm', '-rf', 'pubmed_oa_*.parquet' ]) # remove if folder still exist path_all = pp.list_xml_path(unzip_dir) if fraction < 1: n_sample = int(fraction * len(path_all)) rand_index = random.sample(range(len(path_all)), n_sample) rand_index.sort() path_sample = [path_all[i] for i in rand_index] else: path_sample = path_all path_rdd = sc.parallelize(path_sample, numSlices=10000) # use only example path parse_results_rdd = path_rdd.map( lambda x: Row(file_name=os.path.basename(x), **pp.parse_pubmed_xml(x))) pubmed_oa_df = parse_results_rdd.toDF() pubmed_oa_df_sel = pubmed_oa_df[[ 'full_title', 'abstract', 'doi', 'file_name', 'pmc', 'pmid', 'publication_year', 'publisher_id', 'journal', 'subjects' ]] pubmed_oa_df_sel.write.parquet(os.path.join( save_dir, 'pubmed_oa_%s.parquet' % date_update_str), mode='overwrite') parse_name_rdd = parse_results_rdd.map(lambda x: parse_name(x)).\ filter(lambda x: x is not None).\ flatMap(lambda xs: [x for x in xs]) parse_name_df = parse_name_rdd.toDF() parse_name_df.write.parquet(os.path.join( save_dir, 'pubmed_oa_author_%s.parquet' % date_update_str), mode='overwrite') parse_affil_rdd = parse_results_rdd.map(lambda x: parse_affiliation(x)).\ filter(lambda x: x is not None).\ flatMap(lambda xs: [x for x in xs]) parse_affil_df = parse_affil_rdd.toDF() # change to parse_affil_df parse_affil_df.write.parquet(os.path.join( save_dir, 'pubmed_oa_affiliation_%s.parquet' % date_update_str), mode='overwrite') print('Finished parsing Pubmed Open-Access subset')
def process_file(date_update, fraction=0.01): """Process unzipped Pubmed Open-Access folder to parquet file""" print("Process Pubmed Open-Access file to parquet with fraction = %s" % str(fraction)) date_update_str = date_update.strftime("%Y_%m_%d") if glob(os.path.join(save_dir, 'pubmed_oa_*.parquet')): subprocess.call(['rm', '-rf', 'pubmed_oa_*.parquet']) # remove if folder still exist path_all = pp.list_xml_path(unzip_dir) if fraction < 1: n_sample = int(fraction * len(path_all)) rand_index = random.sample(range(len(path_all)), n_sample) rand_index.sort() path_sample = [path_all[i] for i in rand_index] else: path_sample = path_all path_rdd = sc.parallelize(path_sample, numSlices=10000) # use only example path parse_results_rdd = path_rdd.map(lambda x: Row(file_name=os.path.basename(x), **pp.parse_pubmed_xml(x))) pubmed_oa_df = parse_results_rdd.toDF() pubmed_oa_df_sel = pubmed_oa_df[['full_title', 'abstract', 'doi', 'file_name', 'pmc', 'pmid', 'publication_year', 'publisher_id', 'journal', 'subjects']] pubmed_oa_df_sel.write.parquet(os.path.join(save_dir, 'pubmed_oa_%s.parquet' % date_update_str), mode='overwrite') parse_name_rdd = parse_results_rdd.map(lambda x: parse_name(x)).\ filter(lambda x: x is not None).\ flatMap(lambda xs: [x for x in xs]) parse_name_df = parse_name_rdd.toDF() parse_name_df.write.parquet(os.path.join(save_dir, 'pubmed_oa_author_%s.parquet' % date_update_str), mode='overwrite') parse_affil_rdd = parse_results_rdd.map(lambda x: parse_affiliation(x)).\ filter(lambda x: x is not None).\ flatMap(lambda xs: [x for x in xs]) parse_affil_df = parse_affil_rdd.toDF() parse_name_df.write.parquet(os.path.join(save_dir, 'pubmed_oa_affiliation_%s.parquet' % date_update_str), mode='overwrite') print('Finished parsing Pubmed Open-Access subset')