def pubmed_parser(path_xml):
    """Parse a PubMed OA XML file into a metadata dict with paragraphs.

    Args:
        path_xml: path string, or a file-like object containing the XML
            (file-like objects are rewound between the two parse passes).

    Returns:
        dict: article metadata with an added ``'paragraphs'`` list and
        ``'publication_date'`` converted to a ``datetime.datetime``
        (falls back to 2000-01-01 when the date cannot be parsed).
    """
    ar = pp.parse_pubmed_xml(path_xml)
    # A file-like object was consumed by the first parse; rewind it.
    if not isinstance(path_xml, str):
        path_xml.seek(0)
    # Drop per-paragraph identifiers that duplicate article-level metadata.
    # pop() with a default (vs the original bare del) tolerates paragraphs
    # that are missing either key instead of raising KeyError.
    paragraphs = []
    for p in pp.parse_pubmed_paragraph(path_xml):
        p.pop('pmc', None)
        p.pop('pmid', None)
        paragraphs.append(p)
    ar['paragraphs'] = paragraphs
    num(ar, 'publication_year')
    try:
        ar['publication_date'] = datetime.datetime.strptime(
            ar['publication_date'], "%d-%m-%Y")
    except ValueError:
        try:
            # assume error in 'day' and retry with the first day of the month
            ar['publication_date'] = datetime.datetime.strptime(
                "01" + ar['publication_date'][2:], "%d-%m-%Y")
        except ValueError:
            # a workaround, until we have a robust parser
            ar['publication_date'] = datetime.datetime(2000, 1, 1)
    return ar
def pmc2txt(xml_in, pmcid, job_size):
    """Extract the full text of a PMC article and write it to a sharded dir.

    Args:
        xml_in: path to the article's nxml file.
        pmcid: PMC identifier, e.g. 'PMC123456'.
        job_size: number of shard sub-directories under 'pmcinput/'.
    """
    pubmed_out = pp.parse_pubmed_xml(xml_in)
    ft_out = pp.parse_pubmed_paragraph(xml_in, all_paragraph=False)
    print('PMC2Txt', xml_in)
    # Shard output over job_size sub-directories keyed by the numeric PMC id.
    pmcid_no = pmcid.replace('PMC', '')
    sub_dir = 'pmcinput/%d' % (int(pmcid_no) % job_size)
    full_text = ''
    for paragraph in ft_out:
        if 'text' in paragraph:
            full_text += paragraph['text']
    full_text = u2a_convert(pmcid, full_text, 'fulltext')
    if not os.path.exists(sub_dir):
        os.makedirs(sub_dir)
    # BUG FIX: the original named the file '%s.txt' % pmid, but `pmid` is
    # never defined in this function -- the identifier on hand is `pmcid`.
    f_tmp_in_fn = '%s/%s.txt' % (sub_dir, pmcid)
    with open(f_tmp_in_fn, 'w') as f_tmp_in:
        f_tmp_in.write(full_text)
def parse_single_nxml(fpath, fname):
    """Parse one nxml file into the special text format.

    Args:
        fpath - the whole path of the file
        fname - the file name

    Return:
        text - the formatted text, or None when the abstract is empty
    """
    stem = fname[:-5]  # drop the '.nxml' extension
    parsed = pp.parse_pubmed_xml(fpath)
    abstract = parsed["abstract"]
    if not abstract:
        return None
    full_title = parsed["full_title"]
    title = full_title if len(full_title) > 1 else EMPTY_TITLE
    pieces = [
        START_TOKEN + "{}\n".format(stem),
        "{}\n".format(title.strip()),
        abstract + "\n",
    ]
    return "".join(pieces)
def extract_content_from_file(filepath: str = None) -> List[str]:
    """
    Return the textual content of a PMC OpenAccess Article

    :param filepath: nxml filepath
    :type filepath: str
    :return: list of text chunks
    :rtype: List[str]
    """
    chunks = []
    # Title and abstract come from the metadata parse; a TypeError from the
    # parser (e.g. unparseable input) is treated as "no metadata".
    try:
        metadata = pp.parse_pubmed_xml(filepath)
        for field in ("full_title", "abstract"):
            value = metadata.get(field)
            if value is not None:
                chunks.append(value.strip("\n "))
    except TypeError:
        pass
    # Body paragraphs, best-effort for the same reason.
    try:
        for par in pp.parse_pubmed_paragraph(filepath):
            text = par.get("text")
            if text is not None:
                chunks.append(text.strip("\n "))
    except TypeError:
        pass
    return chunks
def pmc2txt(xml_in, pmcid, job_size, dest_dir):
    """Extract the full text of a PMC article into <dest_dir>/<shard>/<pmcid>.txt.

    Args:
        xml_in: path to the article's nxml file.
        pmcid: PMC identifier, e.g. 'PMC123456'.
        job_size: number of shard sub-directories under dest_dir.
        dest_dir: root output directory.
    """
    try:
        pubmed_out = pp.parse_pubmed_xml(xml_in)
        ft_out = pp.parse_pubmed_paragraph(xml_in, all_paragraph=False)
    # BUG FIX: the original 'except Error' referenced an undefined name, which
    # would raise NameError the moment any parse failure occurred.
    except Exception:
        print('Error in parsing nxml file %s' % xml_in)
        return
    # Shard output over job_size sub-directories keyed by the numeric PMC id.
    pmcid_no = pmcid.replace('PMC', '')
    sub_dir = '%s/%d' % (dest_dir, int(pmcid_no) % job_size)
    full_text = ''
    for paragraph in ft_out:
        if 'text' in paragraph:
            full_text += paragraph['text']
    full_text = u2a_convert(pmcid, full_text, 'fulltext')
    if not os.path.exists(sub_dir):
        os.makedirs(sub_dir)
    f_tmp_in_fn = '%s/%s.txt' % (sub_dir, pmcid)
    with open(f_tmp_in_fn, 'w') as f_tmp_in:
        f_tmp_in.write(full_text)
def query_data(fin_loc):
    """Dump a parsed article as JSON when it has a usable abstract.

    Writes <fout_path>/<pmid>.json for articles whose abstract is at least
    5 characters long; otherwise appends the pmid and abstract to the
    'missing_abstract' log.  `fout_path` is module-level state -- TODO confirm.

    Args:
        fin_loc: path to the article's nxml file.
    """
    dict_out = pp.parse_pubmed_xml(fin_loc)
    # Some publications don't have an abstract; filter by string length.
    # (The original also kept `count`/`dump_count` locals that were never
    # read -- removed as dead code.)
    if len(dict_out["abstract"]) >= 5:
        with open(os.path.join(fout_path, f"{dict_out['pmid']}.json"), "w") as out_f:
            json.dump(dict_out, out_f)
    else:
        with open(f"{fout_path}/missing_abstract", "a+") as f:
            f.write(dict_out["pmid"] + ": " + dict_out["abstract"])
def extract_text(file_path):
    """Write an article's abstract plus body text to ../data/text/<basename>."""
    abstract = pp.parse_pubmed_xml(file_path)['abstract']
    paragraphs = pp.parse_pubmed_paragraph(file_path)
    # Each paragraph is followed by a single space, as in the original.
    body_text = ''.join(par['text'] + ' ' for par in paragraphs)
    out_name = file_path[file_path.rfind('/') + 1:]
    with open('../data/text/' + out_name, 'w') as out_file:
        out_file.write(abstract + ' ' + body_text)
def test_parse_pubmed_xml():
    """
    Test parsing metadata from a PubMed XML file
    """
    result = pp.parse_pubmed_xml(os.path.join("data", "pone.0046493.nxml"))
    assert isinstance(result, dict)
    # Non-empty text fields.
    for field in ("abstract", "full_title"):
        assert len(result.get(field)) > 0
    # Exact identifier values.
    expected = {"pmc": "3460867", "doi": "10.1371/journal.pone.0046493"}
    for key, value in expected.items():
        assert result.get(key) == value
def test_parse_pubmed_xml():
    """
    Test parsing metadata from a PubMed XML file
    """
    result = pp.parse_pubmed_xml(os.path.join("data", "pone.0046493.nxml"))
    assert isinstance(result, dict)
    # Non-empty text fields.
    for field in ("abstract", "full_title"):
        assert len(result.get(field)) > 0
    # Exact metadata values, including the full subjects string.
    expected = {
        "pmc": "3460867",
        "doi": "10.1371/journal.pone.0046493",
        "subjects": "Research Article; Biology; Biochemistry; Enzymes; Enzyme Metabolism; Lipids; Fatty Acids; Glycerides; Lipid Metabolism; Neutral Lipids; Metabolism; Lipid Metabolism; Proteins; Globular Proteins; Protein Classes; Recombinant Proteins; Biotechnology; Microbiology; Bacterial Pathogens; Bacteriology; Emerging Infectious Diseases; Host-Pathogen Interaction; Microbial Growth and Development; Microbial Metabolism; Microbial Pathogens; Microbial Physiology; Proteomics; Sequence Analysis; Spectrometric Identification of Proteins",
    }
    for key, value in expected.items():
        assert result.get(key) == value
def extractAbstract(full_path, paper_dir, nxml_file):
    """
    Extract abstract from nxml_file and write as .txt to paper_dir
    """
    # Parse the article metadata, then persist just the abstract.
    parsed = pp.parse_pubmed_xml(f'{full_path}/{nxml_file}')
    out_path = f'{full_path}/{paper_dir}_abstract.txt'
    with open(out_path, 'w') as text_file:
        text_file.write(parsed['abstract'])
    print('\nAbstract written!')
def pmc2redis(xml_in, pmcid, redis_server):
    # Store a PMC article's metadata and text in Redis under '<pmcid>:<field>'
    # keys.  Python 2 code: strings are encoded to UTF-8 byte strings before
    # storage.
    # Parse article metadata and the body paragraphs from the nxml file.
    pubmed_out = pp.parse_pubmed_xml(xml_in)
    ft_out = pp.parse_pubmed_paragraph(xml_in, all_paragraph=False)
    # One Redis connection per call; all SETs are batched in a pipeline and
    # flushed by the single execute() at the end.
    r = redis.StrictRedis(host='%s' % redis_server, port=6379, db=0)
    pipe = r.pipeline()
    print 'PMC2Redis', xml_in
    # Normalize title: UTF-8 bytes, newlines flattened, then the project's
    # unicode-to-ascii conversion helper.
    # NOTE(review): pubmed_parser's OA parser typically exposes 'full_title';
    # confirm the 'title' / 'author' / 'pubdate' keys exist in this version.
    title = pubmed_out['title'].encode('utf-8').replace('\n', ' ')
    title = u2a_convert(pmcid, title, 'title')
    abstract = ''
    if pubmed_out['abstract'] is not None:
        abstract = pubmed_out['abstract'].encode('utf-8').replace('\n', ' ')
        abstract = u2a_convert(pmcid, abstract, 'abstract')
    else:
        print 'Cannot find abstract for PMCID %s' % pmcid
    # Concatenate every paragraph's text into one full-text string.
    full_text = ''
    for paragraph in ft_out:
        if 'text' in paragraph:
            full_text += paragraph['text']
    full_text = u2a_convert(pmcid, full_text, 'fulltext')
    # affiliation: corresponding author's affiliation
    # authors: authors, each separated by ;
    # mesh_terms: list of MeSH terms, each separated by ;
    # keywords: list of keywords, each separated by ;
    # pubdate: Publication date. Defaults to year information only.
    year = pubmed_out['pubdate']
    author = pubmed_out['author']
    keywords = pubmed_out['keywords']
    mesh_terms = pubmed_out['mesh_terms']
    affiliation = pubmed_out['affiliation']
    journal = pubmed_out['journal']
    # The 'pubtator' key holds title/abstract in PubTator's
    # '<id>|t|...' / '<id>|a|...' line format.
    pipe.set('%s:title' % pmcid, '%s' % title)
    pipe.set('%s:abstract' % pmcid, '%s' % abstract)
    pipe.set('%s:fulltext' % pmcid, '%s' % full_text)
    pipe.set('%s:pubtator' % pmcid, '%s|t|%s\n%s|a|%s' % (pmcid, title, pmcid, abstract))
    pipe.set('%s:pubdate' % pmcid, year)
    pipe.set('%s:author' % pmcid, author)
    pipe.set('%s:mesh_terms' % pmcid, mesh_terms)
    pipe.set('%s:keywords' % pmcid, keywords)
    pipe.set('%s:affiliation' % pmcid, affiliation)
    pipe.set('%s:journal' % pmcid, journal)
    pipe.execute()
def parse_pubmed_article(xml_path, section_keywords):
    """
    Parse pubmed xml file into human-readable format
    :param xml_path: path to xml file to parse
    :param section_keywords: keyword strings to filter pubmed article sections
    :return: dictionary file containing relevant fields
    """
    body = pp.parse_pubmed_paragraph(xml_path)
    # Metadata is needed on both branches; the original parsed the XML twice
    # (once per branch) -- hoisted to a single parse.
    metadata = pp.parse_pubmed_xml(xml_path)
    if check_for_section(body, section_keywords):
        return build_pubmed_dict(metadata, body, section_keywords)
    # No matching section: return a stub record flagged as unparsed.
    return {
        'pmid': metadata['pmid'],
        'pmcid': metadata['pmc'],
        'title': metadata['full_title'],
        'journal': metadata['journal'],
        'parsed': False
    }
def get_words(self):
    """Return the article's title, abstract and paragraph text as one
    ASCII-clean string (non-ASCII characters become spaces).

    Note: despite the name, this returns the raw text; callers split it
    into words themselves.
    """
    # (The original initialized an unused `words = []` local -- removed.)
    pubmed_dict = parser.parse_pubmed_xml(self.filename)
    parts = [pubmed_dict['full_title'], ' ', pubmed_dict['abstract']]
    for paras in parser.parse_pubmed_paragraph(self.filename):
        parts.append(paras['text'])
    # join() avoids the quadratic repeated-concatenation of the original.
    text = ''.join(parts)
    # encodes the unicode string to ascii and replaces the xml entity character
    # references with '?' symbols. decode() then converts this byte string to a
    # regular string for later processing - strip(punctuation) fails otherwise.
    # replace() gets rid of all '?' symbols and replaces with a space. Later
    # the text is split into words.
    text = text.encode('ascii', 'replace').decode('ascii').replace('?', ' ')
    return text
def pmc2pubtator(xml_in, pmcid, job_size, dest_dir):
    """Convert a PMC nxml article to PubTator format at <dest_dir>/<pmcid>.txt.

    Args:
        xml_in: path to the article's nxml file.
        pmcid: PMC identifier, e.g. 'PMC123456'.
        job_size: kept for interface compatibility (unused here).
        dest_dir: output directory.

    Returns:
        1 on success or when a non-empty output file already exists
        (resume support); -1 when parsing fails.
    """
    try:
        pubmed_out = pp.parse_pubmed_xml(xml_in)
        ft_out = pp.parse_pubmed_paragraph(xml_in, all_paragraph=False)
    # BUG FIX: the original 'except Error' referenced an undefined name, which
    # would raise NameError the moment any parse failure occurred.
    except Exception:
        print('Error in parsing nxml file %s ' % xml_in)
        return -1
    full_text = ''
    for paragraph in ft_out:
        if 'text' in paragraph:
            full_text += paragraph['text']
    full_text = u2a_convert(pmcid, full_text, 'fulltext')
    pmcnumber = pubmed_out['pmc']
    ttle = u2a_convert(pmcid, pubmed_out['full_title'], 'title')
    abst = u2a_convert(pmcid, pubmed_out['abstract'], 'abstract')
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    f_tmp_in_fn = '%s/%s.txt' % (dest_dir, pmcid)
    # Skip files that were already written non-empty (resume support).
    if os.path.exists(f_tmp_in_fn):
        with open(f_tmp_in_fn) as existing:
            if len(existing.readlines()) > 0:
                return 1
    # PubTator format: '<id>|t|<title>' line, then '<id>|a|<abstract+body>'.
    with open(f_tmp_in_fn, 'w') as f_tmp_in:
        f_tmp_in.write(pmcnumber + '|t|' + ttle.strip())
        f_tmp_in.write('\n')
        f_tmp_in.write(pmcnumber + '|a|' + abst.strip() + full_text.strip())
        f_tmp_in.write('\n')
        f_tmp_in.write('\n')
    return 1
def main():
    """Parse every .nxml file under directory_path_chunk into the database.

    A failure in one file is reported and skipped, instead of the original
    behaviour where a single bare 'except Exception: pass' silently aborted
    the entire walk.
    """
    for subdir, _, files in os.walk(directory_path_chunk):
        for file in files:
            if not file.endswith('.nxml'):
                continue
            print(file)
            filename = os.path.join(subdir, file)
            try:
                dict_out = pp.parse_pubmed_xml(filename)
                # (The original also built an unused `xml_json` string via
                # json.dumps -- removed as dead code.)
                document_info = parse_document(dict_out)
                parse_scientist(dict_out, document_info)
            except Exception as exc:
                # Report and continue with the remaining files.
                print(f'Failed to process {filename}: {exc}')
def process_file(date_update, fraction=0.01):
    """Process unzipped Pubmed Open-Access folder to parquet file

    :param date_update: datetime used to stamp the output parquet names
    :param fraction: fraction of the available nxml files to sample
    """
    print("Process Pubmed Open-Access file to parquet with fraction = %s" % str(fraction))
    date_update_str = date_update.strftime("%Y_%m_%d")
    # BUG FIX: the original passed the literal pattern 'pubmed_oa_*.parquet'
    # to `rm` without a shell, so the glob was never expanded (and the path
    # was relative to the cwd, not save_dir) -- a silent no-op.  Expand the
    # pattern ourselves and remove the matched paths.
    stale = glob(os.path.join(save_dir, 'pubmed_oa_*.parquet'))
    if stale:
        subprocess.call(['rm', '-rf'] + stale)  # remove if folder still exist
    path_all = pp.list_xml_path(unzip_dir)
    if fraction < 1:
        # Deterministically ordered random subsample of the file list.
        n_sample = int(fraction * len(path_all))
        rand_index = random.sample(range(len(path_all)), n_sample)
        rand_index.sort()
        path_sample = [path_all[i] for i in rand_index]
    else:
        path_sample = path_all
    path_rdd = sc.parallelize(path_sample, numSlices=10000)  # use only example path
    parse_results_rdd = path_rdd.map(
        lambda x: Row(file_name=os.path.basename(x), **pp.parse_pubmed_xml(x)))
    pubmed_oa_df = parse_results_rdd.toDF()
    pubmed_oa_df_sel = pubmed_oa_df[[
        'full_title', 'abstract', 'doi', 'file_name', 'pmc', 'pmid',
        'publication_year', 'publisher_id', 'journal', 'subjects'
    ]]
    pubmed_oa_df_sel.write.parquet(os.path.join(
        save_dir, 'pubmed_oa_%s.parquet' % date_update_str),
        mode='overwrite')
    # Authors, one row per (article, author).
    parse_name_rdd = parse_results_rdd.map(lambda x: parse_name(x)).\
        filter(lambda x: x is not None).\
        flatMap(lambda xs: [x for x in xs])
    parse_name_df = parse_name_rdd.toDF()
    parse_name_df.write.parquet(os.path.join(
        save_dir, 'pubmed_oa_author_%s.parquet' % date_update_str),
        mode='overwrite')
    # Affiliations, one row per (article, affiliation).
    parse_affil_rdd = parse_results_rdd.map(lambda x: parse_affiliation(x)).\
        filter(lambda x: x is not None).\
        flatMap(lambda xs: [x for x in xs])
    parse_affil_df = parse_affil_rdd.toDF()
    parse_affil_df.write.parquet(os.path.join(
        save_dir, 'pubmed_oa_affiliation_%s.parquet' % date_update_str),
        mode='overwrite')
    print('Finished parsing Pubmed Open-Access subset')
def extract(path):
    """Build a tokenized document record for *path*, or None when its pmid
    is not in the module-level ``pmids`` collection."""
    article = pp.parse_pubmed_xml(path)
    pmid = article['pmid']
    title = article['full_title']
    abstract = article['abstract']
    if pmid not in pmids:
        return None
    # Tokenize with the shared spaCy pipeline.
    def _tokens(text):
        return [tok.text for tok in nlp(text)]
    return {
        'pmid': pmid,
        'title': title,
        'abstract': abstract,
        'toks': {
            'title': _tokens(title),
            'abstract': _tokens(abstract),
        },
    }
def merge(self):
    """Merge PubMed medline XML abstracts and PMC OA nxml content
    (abstracts, image captions, paragraphs) into one output file.
    """
    print('PubMed path:', self.pubmed_path)
    with open(self.output_filename, mode='w', newline='\n') as ofile:
        # PubMed
        for filename in glob.glob(os.path.join(self.pubmed_path, '**/*.xml'),
                                  recursive=self.recursive):
            print('file:', filename)
            dicts_out = pmp.parse_medline_xml(filename)
            self.write_dicts(dicts_out, 'abstract', ofile, 'title', 'pubmed_abstract')
        # PMC
        for filename in glob.glob(os.path.join(self.pubmed_path, '**/*.nxml'),
                                  recursive=self.recursive):
            print('file:', filename)
            # OA abstract
            # BUG FIX: the original bare 'except:' clauses also swallowed
            # KeyboardInterrupt/SystemExit; narrowed to Exception while
            # keeping the deliberate best-effort skip behaviour.
            try:
                dicts_out = [pmp.parse_pubmed_xml(filename)]
                self.write_dicts(dicts_out, 'abstract', ofile, 'full_title', 'pmc_oa_abstract')
            except Exception:
                pass
            # OA image caption
            try:
                dicts_out = pmp.parse_pubmed_caption(filename)
                self.write_dicts(dicts_out, 'fig_caption', ofile, 'fig_label', 'pmc_oa_image-caption')
            except Exception:
                pass
            # OA Paragraph
            try:
                dicts_out = pmp.parse_pubmed_paragraph(filename, all_paragraph=True)
                self.write_dicts(dicts_out, 'text', ofile, 'reference_ids', 'pmc_oa_paragraph')
            except Exception:
                pass
def parse_oa_xml(xml_file, output_file, mode):
    """Import pubmed open access XML file into prophet database.

    :param xml_file: path to the OA nxml file
    :param output_file: path of the JSON file to write
    :param mode: one of 'paper', 'paragraphs', 'references', 'tables',
        'figures'
    :raises ValueError: for an unknown mode (the original fell through and
        crashed with NameError on `dicts_out` instead)
    """
    # For open access
    import pubmed_parser as pp
    if mode == 'paper':
        dicts_out = pp.parse_pubmed_xml(xml_file)
    elif mode == 'paragraphs':
        dicts_out = pp.parse_pubmed_paragraph(xml_file, all_paragraph=True)
    elif mode == 'references':
        dicts_out = pp.parse_pubmed_references(xml_file)
    elif mode == 'tables':
        dicts_out = pp.parse_pubmed_table(xml_file, return_xml=False)
    elif mode == 'figures':
        dicts_out = pp.parse_pubmed_caption(xml_file)
    else:
        raise ValueError('unknown mode: %r' % (mode,))
    with open(output_file, 'w') as fp:
        json.dump(dicts_out, fp, cls=DateEncoder)
def build_case_report_json(xml_path: str) -> dict:
    """Makes and returns a JSON object from pubmed XML files

    Args:
        xml_path (str): path to input XML file

    Returns:
        dict: annotation-ready case-report record; list-valued fields start
        empty and are filled in downstream.
    """
    pubmed_xml = pp.parse_pubmed_xml(xml_path)
    pubmed_paragraph = pp.parse_pubmed_paragraph(xml_path)
    # NOTE: the original also ran pp.parse_pubmed_references(xml_path) but
    # never used the result ('references' below is deliberately empty), so
    # that dead parse has been dropped.
    subjects = pubmed_get_subjects(pubmed_xml)
    keywords = get_keywords(subjects)
    article_type = get_article_type(subjects)
    case_report = {
        "pmID": pubmed_xml.get("pmid"),
        "doi": pubmed_xml.get("doi"),
        "title": pubmed_xml.get("full_title"),
        "messages": [],
        "source_files": [],
        "modifications": [],
        "normalizations": [],
        # ctime : 1351154734.5055847,
        "text": pubmed_get_text(pubmed_paragraph),
        "entities": [],
        "attributes": [],
        # date : { type: Date, default: Date.now }
        "relations": [],
        "triggers": [],
        "events": [],
        "equivs": [],
        "comments": [],
        # sentence_offsets : [],
        # token_offsets : [],
        "action": None,
        "abstract": pubmed_xml.get("abstract"),
        "authors": pubmed_get_authors(pubmed_xml),
        "keywords": keywords,
        "introduction": None,
        "discussion": None,
        "references": [],
        "journal": pubmed_xml.get("journal"),
        "article_type": article_type,  # For filtering.
    }
    return case_report
def BibParser(item):
    '''
    Function is designed to take a nxml file with file path, parse it, and
    then return a list containing the specific values of interest.
    :param item: string containing the path to a nxml file
    :return: list of specific information parsed from the nxml file
    '''
    record = pp.parse_pubmed_xml(item)
    # Publication identifiers and bibliographic fields.
    pmid = record['pmid']
    pmcid = record['pmc']
    title = record['full_title'].encode('utf8')
    doi = record['doi']
    date = record['publication_year']
    journal = record['journal']
    # Author entries keep all but their trailing element.
    authors = [entry[:-1] for entry in record['author_list']]
    return pmcid, pmid, title, doi, date, journal, authors
def process_file(date_update, fraction=0.01):
    """Process unzipped Pubmed Open-Access folder to parquet file

    :param date_update: datetime used to stamp the output parquet names
    :param fraction: fraction of the available nxml files to sample
    """
    print("Process Pubmed Open-Access file to parquet with fraction = %s" % str(fraction))
    date_update_str = date_update.strftime("%Y_%m_%d")
    # BUG FIX: the original passed the literal pattern 'pubmed_oa_*.parquet'
    # to `rm` without a shell, so the glob was never expanded (and the path
    # was relative to the cwd, not save_dir) -- a silent no-op.  Expand the
    # pattern ourselves and remove the matched paths.
    stale = glob(os.path.join(save_dir, 'pubmed_oa_*.parquet'))
    if stale:
        subprocess.call(['rm', '-rf'] + stale)  # remove if folder still exist
    path_all = pp.list_xml_path(unzip_dir)
    if fraction < 1:
        # Deterministically ordered random subsample of the file list.
        n_sample = int(fraction * len(path_all))
        rand_index = random.sample(range(len(path_all)), n_sample)
        rand_index.sort()
        path_sample = [path_all[i] for i in rand_index]
    else:
        path_sample = path_all
    path_rdd = sc.parallelize(path_sample, numSlices=10000)  # use only example path
    parse_results_rdd = path_rdd.map(lambda x: Row(file_name=os.path.basename(x),
                                                   **pp.parse_pubmed_xml(x)))
    pubmed_oa_df = parse_results_rdd.toDF()
    pubmed_oa_df_sel = pubmed_oa_df[['full_title', 'abstract', 'doi',
                                     'file_name', 'pmc', 'pmid',
                                     'publication_year', 'publisher_id',
                                     'journal', 'subjects']]
    pubmed_oa_df_sel.write.parquet(os.path.join(save_dir, 'pubmed_oa_%s.parquet' % date_update_str),
                                   mode='overwrite')
    # Authors, one row per (article, author).
    parse_name_rdd = parse_results_rdd.map(lambda x: parse_name(x)).\
        filter(lambda x: x is not None).\
        flatMap(lambda xs: [x for x in xs])
    parse_name_df = parse_name_rdd.toDF()
    parse_name_df.write.parquet(os.path.join(save_dir, 'pubmed_oa_author_%s.parquet' % date_update_str),
                                mode='overwrite')
    # Affiliations, one row per (article, affiliation).
    parse_affil_rdd = parse_results_rdd.map(lambda x: parse_affiliation(x)).\
        filter(lambda x: x is not None).\
        flatMap(lambda xs: [x for x in xs])
    parse_affil_df = parse_affil_rdd.toDF()
    # BUG FIX: the affiliation parquet was written from parse_name_df,
    # duplicating the author table and discarding the affiliation data.
    parse_affil_df.write.parquet(os.path.join(save_dir, 'pubmed_oa_affiliation_%s.parquet' % date_update_str),
                                 mode='overwrite')
    print('Finished parsing Pubmed Open-Access subset')
def process_paper(self, file, db):
    """
    Loads a pdf file in the folder, and extracts its content into an XML file,
    as well as into the mongodb database
    :param file: the name of the paper to be processed
    :param db: mongo db
    :return:
    """
    try:
        xml, t = self.get_xml(file)
        if t == 'grobid':
            NS = {'tei': 'http://www.tei-c.org/ns/1.0'}
            result = grobid_mapping.tei_to_dict(xml)
            mongo_set_dict = dict()
            if 'abstract' in result:
                mongo_set_dict["content.abstract"] = result["abstract"]
            if 'notes' in result:
                mongo_set_dict["content.notes"] = result["notes"]
            if 'fulltext' in result:
                mongo_set_dict["content.fulltext"] = result["fulltext"]
                # BUG FIX: the original did `print(result["fulltext"])` inside
                # this with-block, dumping the text to stdout and leaving the
                # opened file empty; write it to the file instead.
                with open(cfg.folder_content_xml + file + ".txt", 'w') as f:
                    f.write(result["fulltext"])
            if 'chapters' in result:
                mongo_set_dict["content.chapters"] = result["chapters"]
            mongo_result = db.publications.update_one({'_id': file[:10]},
                                                      {'$set': result},
                                                      upsert=True)
            print(mongo_result)
            logging.info("Processed " + file + ' with new xml')
        if t == 'pmc':
            # Parse the original PMC nxml: metadata, references and body text.
            filename_xml = cfg.source_xml + file[:-4] + ".nxml"
            meta = pp.parse_pubmed_xml(filename_xml)
            ref = pp.parse_pubmed_references(filename_xml)
            article_text = pp.parse_pubmed_paragraph(filename_xml, all_paragraph=True)
            result = dict()
            fulltext = []
            for par in article_text:
                fulltext.append(par['text'])
            result['title'] = meta['full_title']
            result['authors'] = meta['author_list']
            result['journal'] = meta['journal']
            result['year'] = meta['publication_year']
            result['type'] = meta['subjects']
            result['domain'] = 'biomedical'
            result['license'] = 'open_access'
            result['content.abstract'] = meta['abstract']
            result['content.keywords'] = meta['keywords']
            result['content.references'] = ref
            result['content.fulltext'] = ''.join(fulltext)
            # Group paragraphs into chapters keyed by their (punctuation-
            # stripped) section title.
            translator = str.maketrans('', '', string.punctuation)
            chapters = defaultdict(list)
            for par in article_text:
                section = par['section']
                section = section.translate(translator)
                chapters[section].append(par['text'])
            chapters_par = []
            for key in chapters:
                chapter_paragraphs = {
                    'paragraphs': chapters[key],
                    'title': key
                }
                chapters_par.append([chapter_paragraphs])
            result['content.chapters'] = chapters_par
            mongo_result = db.publications.update_one(
                {'_id': 'PMC_' + meta['pmc']},
                {'$set': result},
                upsert=True)
            print(mongo_result)
            logging.info("Processed " + file + ' with original nxml')
    except Exception:
        # BUG FIX: logging.exception('Cannot process paper', file, ...) passed
        # `file` as a %-format arg with no placeholder in the message, which
        # makes the logging module emit a formatting error instead of the
        # intended record; use a lazy %s placeholder (exception() already
        # records exc_info).
        logging.exception('Cannot process paper %s', file)
def parse_pubmed(src):
    """Parse pubmed xml article data and return metadata and text."""
    metadata = pubmed_parser.parse_pubmed_xml(src)
    paragraphs = pubmed_parser.parse_pubmed_paragraph(src, all_paragraph=True)
    joined = ' '.join(par['text'] for par in paragraphs)
    # Collapse every run of whitespace into a single space.
    text = ' '.join(joined.split())
    return metadata, text
import pubmed_parser as pp


def walk(path='./sample'):
    """Yield every file path under *path* whose name ends with 'xml'.

    NOTE(review): endswith('xml') also matches '.nxml' files -- presumably
    intentional for PMC OA folders; confirm.
    """
    for parent, _, file_lst in os.walk(path):
        for file_name in file_lst:
            if file_name.endswith('xml'):
                yield os.path.join(parent, file_name)


if __name__ == '__main__':
    corpus = []
    # Read text.
    for path in walk():
        doc = pp.parse_pubmed_xml(path)
        text = doc['abstract']
        corpus.append(text)

    ################################################
    # Example with n-grams for n in [1, 2, 3].
    ################################################
    print('\n\n\nExample with n-grams for n in [1, 2, 3].')
    vectorizer = TfidfVectorizer(ngram_range=(1, 3))
    X = vectorizer.fit_transform(corpus)
    # BUG FIX: get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() (available since 1.0) is the supported
    # replacement and iterates the same vocabulary.
    ngrams = vectorizer.get_feature_names_out()
    print('# of n-grams:')
    print(collections.Counter([len(x.split()) for x in ngrams]))
    # Counter({3: 618, 2: 550, 1: 295})
""" Must install pubmed_parser. pip install git+git://github.com/titipata/pubmed_parser.git Source: https://github.com/titipata/pubmed_parser """ import os import pubmed_parser as pp path = './sample/Am_J_Speech_Lang_Pathol/PMC6802915.nxml' dict_out = pp.parse_pubmed_xml(path) print(dict_out.keys())