Beispiel #1
0
def test_parse_pubmed_caption():
    """
    Check that captions parsed from a PubMed XML file come back as a
    list of dicts with one entry per figure of the sample article.
    """
    nxml_path = os.path.join("data", "pone.0046493.nxml")
    captions = pp.parse_pubmed_caption(nxml_path)

    # Container type first, then the type of an individual entry.
    assert isinstance(captions, list)
    first_caption = captions[0]
    assert isinstance(first_caption, dict)

    # The sample article is expected to yield four figure captions.
    n_captions = len(captions)
    assert n_captions == 4, \
        "Expected number of figures/captions to have a length of 4"
Beispiel #2
0
    def merge(self):
        """Parse every PubMed/PMC file under ``self.pubmed_path`` into one file.

        Opens ``self.output_filename`` for writing (truncating it) and makes
        two passes over ``self.pubmed_path``:

        1. ``**/*.xml`` files are parsed with ``pmp.parse_medline_xml`` and
           handed to ``self.write_dicts``. Errors in this pass propagate.
        2. ``**/*.nxml`` files are parsed three times (article abstract,
           figure captions, paragraphs). Each of these steps is best-effort:
           a failure is reported on stdout and the remaining steps/files are
           still processed.

        NOTE(review): when ``self.recursive`` is false, ``glob`` treats ``**``
        like ``*``, so only files exactly one directory below
        ``self.pubmed_path`` are matched -- confirm this is intended.
        """
        print('PubMed path:', self.pubmed_path)

        # One entry per PMC Open Access section:
        #   (parser, 2nd write_dicts argument, 4th argument, 5th argument).
        # The string arguments are presumably the content key, the title key
        # and a source tag -- confirm against write_dicts. Parsers are wrapped
        # in lambdas so that ``pmp`` attributes are only looked up inside the
        # try block below.
        pmc_sections = (
            # OA abstract (single dict, wrapped in a list)
            (lambda path: [pmp.parse_pubmed_xml(path)],
             'abstract', 'full_title', 'pmc_oa_abstract'),
            # OA image caption
            (lambda path: pmp.parse_pubmed_caption(path),
             'fig_caption', 'fig_label', 'pmc_oa_image-caption'),
            # OA paragraph
            (lambda path: pmp.parse_pubmed_paragraph(path,
                                                     all_paragraph=True),
             'text', 'reference_ids', 'pmc_oa_paragraph'),
        )

        with open(self.output_filename, mode='w', newline='\n') as ofile:

            # PubMed
            for filename in glob.glob(os.path.join(self.pubmed_path,
                                                   '**/*.xml'),
                                      recursive=self.recursive):
                print('file:', filename)
                dicts_out = pmp.parse_medline_xml(filename)

                self.write_dicts(dicts_out, 'abstract', ofile, 'title',
                                 'pubmed_abstract')

            # PMC
            for filename in glob.glob(os.path.join(self.pubmed_path,
                                                   '**/*.nxml'),
                                      recursive=self.recursive):
                print('file:', filename)

                for parse, content_key, title_key, tag in pmc_sections:
                    try:
                        dicts_out = parse(filename)
                        self.write_dicts(dicts_out, content_key, ofile,
                                         title_key, tag)
                    except Exception as exc:
                        # Best-effort: skip this section but say why.
                        # ``Exception`` (not a bare except) lets Ctrl-C and
                        # SystemExit stop the run.
                        print('skipped:', tag, filename, repr(exc))
def parse_oa_xml(xml_file, output_file, mode):
    """Parse a PubMed Open Access XML file and write the result as JSON.

    Args:
        xml_file: Input passed straight to the selected ``pubmed_parser``
            function.
        output_file: Path of the JSON file to write (opened with ``'w'``).
        mode: Which part of the article to extract. One of ``'paper'``,
            ``'paragraphs'``, ``'references'``, ``'tables'`` or
            ``'figures'``.

    Raises:
        ValueError: If ``mode`` is not one of the supported values. This is
            checked before anything is parsed or written, so ``output_file``
            is left untouched.
    """
    # mode -> (pubmed_parser function name, extra keyword arguments)
    parsers = {
        'paper': ('parse_pubmed_xml', {}),
        'paragraphs': ('parse_pubmed_paragraph', {'all_paragraph': True}),
        'references': ('parse_pubmed_references', {}),
        'tables': ('parse_pubmed_table', {'return_xml': False}),
        'figures': ('parse_pubmed_caption', {}),
    }
    if mode not in parsers:
        raise ValueError(
            'Unknown mode %r; expected one of: %s'
            % (mode, ', '.join(sorted(parsers))))

    # For open access
    import pubmed_parser as pp

    func_name, extra_kwargs = parsers[mode]
    dicts_out = getattr(pp, func_name)(xml_file, **extra_kwargs)

    with open(output_file, 'w') as fp:
        # DateEncoder is supplied by the surrounding module.
        json.dump(dicts_out, fp, cls=DateEncoder)
Beispiel #4
0
    def extract(self, tar_buffer):
        """Pull image bytes and the parsed figure captions out of a tar archive.

        Args:
            tar_buffer: Seekable binary file object containing a tar archive.
                It is rewound to position 0 before reading.

        Returns:
            A ``(imgs, caption)`` tuple. ``imgs`` maps each image member's
            base name without extension to its raw bytes. ``caption`` is the
            result of ``pp.parse_pubmed_caption`` on the decoded ``.nxml``
            member, or ``None`` when the archive contains no such member.
            If several ``.nxml`` members exist, the last one wins.
        """
        tar_buffer.seek(0)

        # The dot must precede *every* extension; the previous pattern only
        # required it for "gif", so names merely ending in "png", "jpg", ...
        # were treated as images.
        imgs_files = re.compile(r'.*\.(gif|jpe?g|tiff?|png|webp|bmp)$')
        text_file = re.compile(r'.*(\.nxml)$')

        imgs = {}
        # Stays None when no .nxml member is found, instead of leaving the
        # name unbound at the return statement.
        caption = None

        # The context manager closes the TarFile; a caller-supplied fileobj
        # is left open.
        with tarfile.open(fileobj=tar_buffer) as tar:
            for mem in tar.getmembers():
                is_image = imgs_files.match(mem.name)
                is_text = text_file.match(mem.name)
                if not (is_image or is_text):
                    continue

                handle = tar.extractfile(mem)
                if handle is None:
                    # Not a regular file or link (e.g. a directory entry).
                    continue
                payload = handle.read()

                if is_image:
                    img_ref = os.path.splitext(os.path.basename(mem.name))[0]
                    imgs[img_ref] = payload
                else:
                    text = payload.decode('utf-8')
                    caption = pp.parse_pubmed_caption(text)

        return imgs, caption