def extract_sentences(paper_path, para_yes):
    """Extract sentences from a paper into two lists, given that para_yes contains a list of
    document element numbers corresponding to paragraphs manually identified as those containing
    synthesis information."""
    f = open(paper_path, 'rb')
    doc = Document.from_file(f, readers=[HtmlReader()])
    sen_yes_arr = list()
    sen_no_arr = list()
    elem_all = np.arange(0, len(doc))
    para_no = np.delete(elem_all, para_yes)
    for i in para_no:
        if type(doc.elements[i]) == chemdataextractor.doc.text.Paragraph:
            for sentence in doc.elements[i]:
                sen_no_arr.append(sentence)
    for i in para_yes:
        if type(doc.elements[i]) == chemdataextractor.doc.text.Paragraph:
            for sentence in doc.elements[i]:
                sen_yes_arr.append(sentence)
    return sen_yes_arr, sen_no_arr
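# Usage sketch for extract_sentences (not part of the original snippet). It assumes the imports
# used above (numpy as np, chemdataextractor, Document, HtmlReader) are in scope; 'paper.html'
# and the paragraph indices are hypothetical placeholders for a manually annotated HTML paper.
synthesis_sents, other_sents = extract_sentences('paper.html', [12, 13, 17])
print(len(synthesis_sents), 'synthesis sentences;', len(other_sents), 'other sentences')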
def read_html_paper(paper_path):
    """Open an HTML paper and store it as a chemdataextractor Document."""
    f = open(paper_path, 'rb')
    doc = Document.from_file(f, readers=[HtmlReader()])
    return doc
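# Usage sketch (assumption, not in the original snippet): 'paper.html' is a hypothetical path;
# requires `from chemdataextractor import Document` and `from chemdataextractor.reader import HtmlReader`.
doc = read_html_paper('paper.html')
print(len(doc.elements), 'elements parsed')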
def get_img(self, doc):
    """Get images from doc using chemdataextractor."""
    # Load document image data from file
    tem_images = []
    cde_doc = Document.from_file(open(doc[1], "rb"))
    log.info('This article is : %s' % doc[0])
    imgs = cde_doc.figures
    del cde_doc

    # Identify relevant images from records
    for img in imgs:
        detected = False  # Used to avoid processing images twice
        records = img.records
        caption = img.caption
        for record in records:
            if detected is True:
                break
            rec = record.serialize()
            if [self.img_type] in rec.values():
                detected = True
                log.info('%s instance found!' % self.img_type)
                tem_images.append((doc[0], img.id, img.url, caption.text.replace('\n', ' ')))

    if len(tem_images) != 0:
        return tem_images
    else:
        return None
def extract():
    """Extract melting points from patents."""
    Paragraph.parsers = [CompoundParser(), ChemicalLabelParser(), MpParser()]
    Table.parsers = []
    patents = []
    for root, dirs, files in os.walk('../examples/mp/grants'):
        for filename in files:
            if not filename.endswith('.xml'):
                continue
            path = os.path.abspath(os.path.join(root, filename))
            size = os.path.getsize(path)
            patents.append((path, filename, size))

    patents = sorted(patents, key=lambda p: p[2])

    for path, filename, size in patents:
        print(path)
        shutil.copyfile(path, '../examples/mp/used/%s' % filename)
        with open(path) as f:
            d = Document.from_file(f)
        if os.path.isfile('../examples/mp/results/%s.json' % filename):
            continue
        records = [r.serialize() for r in d.records if len(r.melting_points) == 1]
        # Write the JSON as UTF-8 text (writing encoded bytes to a text-mode file fails on Python 3)
        with open('../examples/mp/results/%s.json' % filename, 'w', encoding='utf8') as fout:
            fout.write(json.dumps(records, ensure_ascii=False, indent=2))
def test_document_usage(self):
    """Test SpringerHtmlReader used via Document.from_file."""
    fname = '1752-153X-5-55.html'
    f = io.open(os.path.join(os.path.dirname(__file__), 'data', 'springer', fname), 'rb')
    d = Document.from_file(f, readers=[SpringerHtmlReader()])
    self.assertEqual(len(d.elements), 97)
def test_document_usage(self):
    """Test ElsevierHtmlReader used via Document.from_file."""
    fname = 'S0143720816310816.html'
    f = io.open(os.path.join(os.path.dirname(__file__), 'data', 'elsevier', fname), 'rb')
    d = Document.from_file(f, readers=[ElsevierHtmlReader()])
    self.assertEqual(len(d.elements), 246)
def test_document_usage(self):
    """Test RscHtmlReader used via Document.from_file."""
    fname = '10.1039_C6OB02074G.html'
    f = io.open(os.path.join(os.path.dirname(__file__), 'data', 'rsc', fname), 'rb')
    d = Document.from_file(f, readers=[RscHtmlReader()])
    self.assertEqual(len(d.elements), 61)
def test_document_usage(self):
    """Test UsptoXmlReader used via Document.from_file."""
    fname = 'US06840965B2.xml'
    f = io.open(os.path.join(os.path.dirname(__file__), 'data', 'uspto', fname), 'rb')
    d = Document.from_file(f, readers=[UsptoXmlReader()])
    self.assertEqual(len(d.elements), 112)
def test_document_usage(self):
    """Test AcsHtmlReader used via Document.from_file."""
    fname = 'acs.jmedchem.6b00723.html'
    f = io.open(os.path.join(os.path.dirname(__file__), 'data', 'acs', fname), 'rb')
    d = Document.from_file(f, readers=[AcsHtmlReader()])
    self.assertEqual(len(d.elements), 198)
def extract_document(filename, extract_all=True, allow_wildcards=False,
                     output=os.path.join(os.path.dirname(os.getcwd()), 'csd')):
    """Extracts chemical records from a document and identifies chemical schematic diagrams.
    Then substitutes in if the label was found in a record.

    :param filename: Location of document to be extracted
    :param extract_all: Boolean to determine whether output is combined with chemical records
    :param allow_wildcards: Bool to indicate whether results containing wildcards are permitted
    :param output: Directory to store extracted images

    :return: Dictionary of chemical records with diagram SMILES strings, or list of label candidates and SMILES
    """
    log.info('Extracting from %s ...' % filename)

    # Extract the raw records from CDE
    doc = Document.from_file(filename)
    figs = doc.figures

    # Identify image candidates
    csds = find_image_candidates(figs, filename)

    # Download figures locally
    fig_paths = download_figs(csds, output)
    log.info("All relevant figures from %s downloaded successfully" % filename)

    # When diagrams are not found, return results without CSR extraction
    if extract_all and not fig_paths:
        log.info('No chemical diagrams detected. Returning chemical records.')
        return doc.records.serialize()
    elif not extract_all and not fig_paths:
        log.info('No chemical diagrams detected. Returning empty list.')
        return []

    log.info('Chemical diagram(s) detected. Running ChemSchematicResolver...')

    # Run CSR
    results = []
    for path in fig_paths:
        try:
            results.append(extract_image(path, allow_wildcards=allow_wildcards))
        except Exception:
            log.error('Could not extract image at %s' % path)

    if not extract_all:
        return results

    records = doc.records.serialize()

    # Substitute SMILES for labels
    combined_results = substitute_labels(records, results)
    log.info('All diagram results extracted and combined with chemical records.')
    return combined_results
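# Hedged usage sketch (not from the original source): extract_document relies on the
# ChemSchematicResolver-style helpers defined alongside it; 'paper.html' is a hypothetical
# input document and './csd' a hypothetical directory for downloaded figures.
records_with_smiles = extract_document('paper.html', extract_all=True, output='./csd')
for record in records_with_smiles:
    print(record)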
def list_chemicals(foin):
    fchem = open(foin, 'rb')
    docchem = Document.from_file(fchem)
    ct = 0
    t = PrettyTable(['Filename', 'Entity_count', 'Start', 'End', 'Entity'])
    for i in docchem.cems:
        ct = ct + 1
        t.add_row([foin, ct, i.start, i.end, i.text])
    t.align = 'l'
    t.border = False
    return t
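# Usage sketch (assumption): requires `from prettytable import PrettyTable` and
# `from chemdataextractor import Document`; 'paper.html' is a hypothetical input file.
print(list_chemicals('paper.html'))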
def get_result(f, fname):
    try:
        document = Document.from_file(f, fname=fname)
    except Exception:
        return {}
    records = document.records.serialize()
    records = natsort.natsorted(
        records,
        lambda x: x.get('labels', ['ZZZ%s' % (99 - len(x.get('names', [])))])[0])
    result = {
        'records': records,
        'abbreviations': document.abbreviation_definitions
    }
    return result
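# Usage sketch (assumption): fname lets ChemDataExtractor pick a reader from the file extension;
# the path below is a hypothetical placeholder.
with open('paper.html', 'rb') as f:
    result = get_result(f, 'paper.html')
print(len(result.get('records', [])), 'records extracted')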
def cde_read_pdfs(argv, pdf_path="./pdfs"):
    try:
        pdf_path = argv[0]
    except IndexError:
        print("missing arguments" + "\n -string pdf files path")
        return
    pdf_dir = Path(pdf_path)
    files_list = get_files_list(pdf_dir)
    print(files_list)
    for a_file in files_list:
        file_name = a_file.name
        pdf_f = open(a_file, 'rb')
        doc = Document.from_file(pdf_f)
        uniques = get_uniques(doc)
        max_lbl, max_val = get_max(uniques)
        print(file_name, "Unique entities:", len(uniques), "Most common entity:", max_lbl, max_val)
def extract_all_chem_names(read_path="", write_in_file=False):
    start_time = time.time()
    doc = None

    # Always open files as binary
    print(f"\nLoading the information from: {read_path}.....")
    with open(read_path, "rb") as f:
        doc = Document.from_file(f)
    print("Document is created")

    print("\nExtracting all chemical names in the document.....")
    # records ==> records of all mentions and abbreviations, properties and spectra
    # Concatenate all the chemical names in the list
    all_chemicals = []
    for compound in doc.records:
        comp = compound.serialize()
        if "names" in comp:
            all_chemicals += comp["names"]
    all_chemicals.sort()

    total_chems = len(all_chemicals)
    print(f"The total number of chemical names extracted is {total_chems}")

    if write_in_file:
        write_path = read_path.split(".")[0] + "_chem_list.txt"
        print(f"\nWriting the information to file {write_path}")
        with open(write_path, "w+") as f:
            for chemical in all_chemicals:
                f.write(chemical + "\n")

    tot_time = round(time.time() - start_time, 3)
    print(f"\nTime taken to extract all the chemical names from the doc is {tot_time} seconds")
    return all_chemicals
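# Usage sketch (assumption): 'paper.html' is a hypothetical input; with write_in_file=True the
# names would also be written to a '_chem_list.txt' file alongside the input.
chemicals = extract_all_chem_names('paper.html', write_in_file=False)
print(chemicals[:10])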
def ligandfinder(docpath, index):
    # update to handle list of synonyms
    ligand_terms = ('ligand', 'ligands', 'coordinating', 'surfactant', 'surfactants')
    found_chems = []
    # import document
    f = open(docpath, 'rb')
    doc = Document.from_file(f)
    no_elements = len(doc.elements)
    for i in range(0, no_elements):
        check = False
        if isinstance(doc.elements[i], cde.doc.text.Paragraph):  # will do tables later
            # scrape for sentences/paragraphs with ligand synonyms
            for sentence in doc.elements[i].raw_sentences:
                # mark the paragraph if any ligand synonym appears among its tokens
                if any(term in cwt.tokenize(sentence) for term in ligand_terms):
                    check = True
            if check:
                if not doc.elements[i].cems == True:
                    # scrape those sentences/paragraphs for ligand names
                    for chemical in doc.elements[i].cems:
                        if chemical.text not in found_chems and chemical.text not in cf.element_filter:
                            # add ligand names to chem_records_df
                            found_chems.append((chemical.text, index))
    return found_chems
import csv
import os
import sys

from chemdataextractor import Document

if len(sys.argv) == 1:
    print('No file path provided. Terminating program.')
    sys.exit()
file_path = sys.argv[1]
if not os.path.isfile(file_path):
    print('Path does not refer to a valid file. Terminating program.')
    sys.exit()

with open(file_path, 'rb') as f:
    # Convert to Document
    full_text_doc = Document.from_file(f)

# Extract lists of records from Documents
print("Extracting... (This may take a minute or two)")
doc_records = full_text_doc.records.serialize()
print("Extracted")
print(doc_records)

concentration_matrix = []
cleaned_doc_records = [record for record in doc_records if 'measured_concentrations' in record]
for record in cleaned_doc_records:
    name = record['names'][0]
    for concentration in record['measured_concentrations']:
        concentration_matrix.append([name, *concentration.values()])
req_head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
}

# get the page content
html_response = requests.get(article_url, headers=req_head)

# save the content as a temporary file on the local disk; write the raw bytes so the
# HTML is not wrapped in a "b'...'" string literal
f = open("temp.html", "wb")
f.write(html_response.content)
f.close()

# open the temporary file and read in binary mode
f = open("temp.html", 'rb')

# create a document object from the file
doc = Document.from_file(f)

for element in doc.elements:
    print(element)

para = doc.elements[14]
print(para)
print("Sentences:", len(para.sentences))
print("Tokens:", para.tokens)
print("Tokens:", len(para.tokens))
print("Tokens:", len(para.tokens[0]))

# list of unique occurrences
uniques = []
for chement in doc.cems:
    # assumed completion (the original snippet is truncated here): keep each unique entity mention
    if chement.text not in uniques:
        uniques.append(chement.text)
class SoParser(BaseParser):
    root = so

    def interpret(self, result, start, end):
        compound = Compound(solubility=[
            Solubility(value=first(result.xpath('./value/text()')),
                       units=first(result.xpath('./units/text()')))
        ])
        yield compound


# In[40]:

Paragraph.parsers = [SoParser()]

# In[42]:

d = Document(
    Heading(u'Synthesis of 2,4,6-trinitrotoluene (3a)'),
    Paragraph(u'The procedure was followed to yield a pale yellow solid (solubility is 28 mg/mL)')
)
d.records.serialize()

# In[12]:

d = Document.from_file("../../test1.htm")
d.records.serialize()

# In[ ]:
print('Chemical entities (cems):\n', doc.cems)
print('Abbreviation definitions:\n', doc.abbreviation_definitions)
print('Records:\n', doc.records)
print('Record 0:\n', doc.records[0].serialize())
print('Record 1:\n', doc.records[1].serialize())

filepath = "pdfs/c8cp05975f.pdf"
f = open(filepath, 'rb')
doc = Document.from_file(f)  # , readers=[PdfReader()])

for element in doc.elements:
    print(element)

para = doc.elements[14]
print(para)
print("Sentences:", len(para.sentences))
print("Tokens:", para.tokens)
print("Tokens:", len(para.tokens))
print("Tokens:", len(para.tokens[0]))

# list of unique occurrences
uniques = []
for chement in doc.cems:
    # assumed completion (the original snippet is truncated here): keep each unique entity mention
    if chement.text not in uniques:
        uniques.append(chement.text)
def paragraph_extract(self):
    no_pargas = []
    if self.journal == "Springer" or self.journal == "NaturePublishingGroup":
        html_file = os.listdir(self.html_path)
        number_file = len(os.listdir(self.html_path))
        no_pargas = []
        success_extracted = []
        content_exist = []
        content_label = r"Sec\d+\S*"
        for file_i in range(0, number_file):
            sole_file = html_file[file_i]
            file = open(self.html_path + '/' + sole_file, 'rb')
            doc = Document.from_file(file)
            paragraphs = doc.paragraphs
            all_parag = ''
            content_find = None
            for parag in paragraphs:
                if "Abs" in str(parag.id):
                    text = parag.text
                    if text[0].isupper() and len(text) > 300:
                        all_parag += '\n'
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        text = text.replace("\n", '')
                        all_parag += text
                        all_parag += " "
                re_d = re.findall(content_label, str(parag.id))
                if re_d:
                    text = parag.text
                    if text[0].isupper() and len(text) > 300:
                        all_parag += '\n'
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        text = text.replace("\n", '')
                        all_parag += text
                        all_parag += " "
                    content_find = True
            if content_find:
                content_exist.append(sole_file)
            # If no paragraph label was found anywhere in the document
            if not all_parag:
                self.log_wp.print_log("No paragraph label:%s", sole_file)
                no_pargas.append(sole_file)
                for parag in paragraphs:
                    text = parag.text
                    if text[0].isupper() and len(text) > 300:
                        all_parag += '\n'
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        text = text.replace("\n", '')
                        all_parag += text
                        all_parag += " "
            else:
                success_extracted.append(sole_file)
            txt_name = str(sole_file).replace(".html", ".txt")
            path = self.out_path + '/' + txt_name
            self.log_wp.write_totxt_log(path, str(all_parag))

    if self.journal == "Tandfonline":
        html_file = os.listdir(self.html_path)
        number_file = len(os.listdir(self.html_path))
        no_pargas = []
        success_extracted = []
        content_exist = []
        content_label = r"[Ss]\d{3}\S*"
        for file_i in range(0, number_file):
            sole_file = html_file[file_i]
            file = open(self.html_path + '/' + sole_file, 'rb')
            doc = Document.from_file(file)
            elements = doc.elements
            parags = doc.paragraphs
            all_parag = ''
            abs_search = None
            content_find = None
            for ele in elements:
                if str(ele.id) == 'abstract':
                    abs_search = 1
                if abs_search == 1:
                    abstract = ele.text
                    if abstract[0].isupper() and len(abstract) > 300:
                        all_parag += '\n'
                        refs = re.findall(r"\[\d+[^\[]*\]", abstract)
                        for ref in refs:
                            abstract = abstract.replace(ref, '')
                        abstract = abstract.replace("\n", '')
                        all_parag += abstract
                        all_parag += " "
                        break
            for parag in parags:
                re_d = re.findall(content_label, str(parag.id))
                if re_d:
                    text = parag.text
                    if text[0].isupper() and len(text) > 300:
                        all_parag += '\n'
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        text = text.replace("\n", '')
                        all_parag += text
                        all_parag += " "
                    content_find = True
            if content_find:
                content_exist.append(sole_file)
            # If no paragraph label was found anywhere in the document
            if not all_parag:
                self.log_wp.print_log("No paragraph label:%s", sole_file)
                no_pargas.append(sole_file)
                for parag in parags:
                    text = parag.text
                    if text[0].isupper() and len(text) > 300:
                        all_parag += '\n'
                        text = text.replace("\n", '')
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        all_parag += text
                        all_parag += " "
            else:
                success_extracted.append(sole_file)
            txt_name = str(sole_file).replace(".html", ".txt")
            path = self.out_path + '/' + txt_name
            self.log_wp.write_totxt_log(path, str(all_parag))

    if self.journal == "WileyBlackwell":
        html_file = os.listdir(self.html_path)
        number_file = len(os.listdir(self.html_path))
        no_pargas = []
        success_extracted = []
        content_exist = []
        content_label = "sec"
        for file_i in range(0, number_file):
            sole_file = html_file[file_i]
            file = open(self.html_path + '/' + sole_file, 'rb')
            doc = Document.from_file(file)
            parags = doc.paragraphs
            all_parag = ''
            content_find = None
            for parag in parags:
                # Skip reference entries so they are not added to the results
                if content_label in str(parag.id) and "reference" not in str(parag.id):
                    text = parag.text
                    if text[0].isupper() and len(text) > 300:
                        all_parag += '\n'
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        text = text.replace("\n", '')
                        all_parag += text
                        all_parag += " "
                    content_find = True
            if content_find:
                content_exist.append(sole_file)
            # If no paragraph label was found anywhere in the document
            if not all_parag:
                self.log_wp.print_log("No paragraph label:%s", sole_file)
                no_pargas.append(sole_file)
                for parag in parags:
                    text = parag.text
                    if text[0].isupper() and len(text) > 300:
                        all_parag += '\n'
                        text = text.replace("\n", '')
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        all_parag += text
                        all_parag += " "
            else:
                success_extracted.append(sole_file)
            txt_name = str(sole_file).replace(".html", ".txt")
            path = self.out_path + '/' + txt_name
            self.log_wp.write_totxt_log(path, str(all_parag))

    if self.journal == "ASME":
        html_file = os.listdir(self.html_path)
        number_file = len(os.listdir(self.html_path))
        no_pargas = []
        success_extracted = []
        content_exist = []
        content_label = "ContentTab"
        for file_i in range(0, number_file):
            sole_file = html_file[file_i]
            file = open(self.html_path + '/' + sole_file, 'rb')
            doc = Document.from_file(file)
            parags = doc.paragraphs
            all_parag = ''
            content_find = None
            for parag in parags:
                if content_label == str(parag.id):
                    text = parag.text
                    if text == "References":
                        break
                    if text[0].isupper() and len(text) > 300:
                        all_parag += '\n'
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        text = text.replace("\n", '')
                        all_parag += text
                        all_parag += " "
                    content_find = True
            if content_find:
                content_exist.append(sole_file)
            # If no paragraph label was found anywhere in the document
            if not all_parag:
                self.log_wp.print_log("No paragraph label:%s", sole_file)
                no_pargas.append(sole_file)
                for parag in parags:
                    text = parag.text
                    if text[0].isupper() and len(text) > 300:
                        all_parag += '\n'
                        text = text.replace("\n", '')
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        all_parag += text
                        all_parag += " "
            else:
                success_extracted.append(sole_file)
            txt_name = str(sole_file).replace(".html", ".txt")
            path = self.out_path + '/' + txt_name
            self.log_wp.write_totxt_log(path, str(all_parag))

    if self.journal == "MDPI":
        html_file = os.listdir(self.html_path)
        number_file = len(os.listdir(self.html_path))
        no_pargas = []
        success_extracted = []
        content_exist = []
        content_label = r"sec\d+\S*"
        content_label_2 = r"^\d+[A-Z]\S*"
        for file_i in range(0, number_file):
            sole_file = html_file[file_i]
            file = open(self.html_path + '/' + sole_file, 'rb')
            doc = Document.from_file(file)
            paragraphs = doc.paragraphs
            all_parag = ''
            content_find = None
            for parag in paragraphs:
                if "Abs" in str(parag.id) or "abs" in str(parag.id):
                    text = parag.text
                    if text[0].isupper() and len(text) > 300:
                        all_parag += '\n'
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        text = text.replace("\n", '')
                        all_parag += text
                        all_parag += " "
                re_d = re.findall(content_label, str(parag.id))
                re_d_2 = re.findall(content_label_2, str(parag.id))
                if re_d or re_d_2:
                    text = parag.text
                    if text[0].isupper() and len(text) > 300:
                        all_parag += '\n'
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        text = text.replace("\n", '')
                        all_parag += text
                        all_parag += " "
                    content_find = True
            if content_find:
                content_exist.append(sole_file)
            # If no paragraph label was found anywhere in the document
            if not all_parag:
                self.log_wp.print_log("No paragraph label:%s", sole_file)
                no_pargas.append(sole_file)
                for parag in paragraphs:
                    text = parag.text
                    if text[0].isupper() and len(text) > 300:
                        all_parag += '\n'
                        refs = re.findall(r"\[\d+[^\[]*\]", text)
                        for ref in refs:
                            text = text.replace(ref, '')
                        text = text.replace("\n", '')
                        all_parag += text
                        all_parag += " "
            else:
                success_extracted.append(sole_file)
            txt_name = str(sole_file).replace(".html", ".txt")
            path = self.out_path + '/' + txt_name
            self.log_wp.write_totxt_log(path, str(all_parag))

    return no_pargas, success_extracted, content_exist
""" Script to run the ChemDataExtractor on the 200 pmids. http://chemdataextractor.org/docs/cem https://pubs.acs.org/doi/abs/10.1021/acs.jcim.6b00207 """ import os from chemdataextractor import Document from chemdataextractor.reader import PlainTextReader cde_annotations = open("tool_annotations/ChemDataExtractor_annotations.txt", "w", encoding="utf8") for d in os.listdir("../citations"): with open("../citations/{}".format(d), "rb") as f: citation = Document.from_file(f, readers=[PlainTextReader()]) pmid = d.split(".")[0] cde_annotations.write("\n{}\n".format(pmid)) for ann in citation.cems: cde_annotations.write("{}\n".format(ann)) cde_annotations.close()