def _add_references(self, rec):
    """Append the parsed references to *rec* as MARC ``999C5`` fields.

    For each reference tuple yielded by ``self._get_references()`` one
    999C5 field is added.  When a raw text reference is present it is
    additionally run through refextract and the resulting subfields are
    merged in, tagged with a ``("9", "refextract")`` provenance marker.

    :param rec: the record (BibRecord structure) to add fields to
    """
    for label, ref_type, text_ref, ext_link, authors, year, source, \
            volume, page in self._get_references():
        subfields = []
        if label:
            subfields.append(("o", label))
        if text_ref:
            ref_xml = extract_references_from_string_xml(text_ref)
            dom = parseString(ref_xml)
            datafields = dom.getElementsByTagName("datafield")
            # refextract may produce no datafield at all for a short or
            # unparsable reference string; guard against IndexError
            # instead of crashing, and fall back to the raw subfields.
            if datafields:
                for field in datafields[0].getElementsByTagName("subfield"):
                    data = field.firstChild.data
                    code = field.getAttribute("code")
                    subfields.append((code, data))
                subfields.append(("9", "refextract"))
        if ref_type:
            subfields.append(("d", ref_type))
        if text_ref:
            subfields.append(("m", text_ref))
        if ext_link:
            subfields.append(("u", ext_link))
        for author in authors:
            subfields.append(("h", author))
        if year:
            subfields.append(("y", year))
        # Build the "s" (journal,volume,page) subfield from whichever
        # combination of source/volume/page is available.
        if source and volume and page:
            subfields.append(("s", source + "," + volume + "," + page))
        elif source and volume:
            subfields.append(("s", source + "," + volume))
        elif source and page:
            subfields.append(("s", source + "," + page))
        elif source:
            subfields.append(("s", source))
        record_add_field(rec, "999", ind1="C", ind2="5",
                         subfields=subfields)
def extract_references_txt(self, req, form):
    """Extract references from plain text submitted via the form."""
    check_login(req)
    if 'txt' not in form:
        return 'No text specified'
    return extract_references_from_string_xml(form['txt'].value)
def extract_references_txt(self, req, form):
    """Run reference extraction on a posted plain-text field."""
    check_login(req)
    try:
        txt = form['txt'].value
    except KeyError:
        return 'No text specified'
    return extract_references_from_string_xml(txt)
def extract_one(config, pdf_path):
    """Extract references from one file.

    :param config: task options; when ``config.treat_as_reference_section``
        is true, *pdf_path* is treated as a UTF-8 text file that already
        contains only the reference section.
    :param pdf_path: path to the input document
    :return: reference XML produced by refextract
    """
    if config.treat_as_reference_section:
        # Use a context manager so the file handle is closed even if
        # reading or decoding fails (the original leaked the handle).
        with open(pdf_path) as docfile:
            docbody = docfile.read().decode('utf-8')
        out = extract_references_from_string_xml(docbody)
    else:
        write_message("* processing pdffile: %s" % pdf_path, verbose=2)
        out = extract_references_from_file_xml(pdf_path)
    return out
def extract_one(config, pdf_path):
    """Extract references from one file.

    :param config: task options; when ``config.treat_as_reference_section``
        is true, *pdf_path* is treated as a UTF-8 text file that already
        contains only the reference section.
    :param pdf_path: path to the input document
    :return: reference XML produced by refextract
    """
    if config.treat_as_reference_section:
        # Context manager guarantees the file handle is closed
        # (the original `open(...).read()` leaked it).
        with open(pdf_path) as docfile:
            docbody = docfile.read().decode('utf-8')
        out = extract_references_from_string_xml(docbody)
    else:
        write_message("* processing pdffile: %s" % pdf_path, verbose=2)
        out = extract_references_from_file_xml(pdf_path)
    return out
def extract_references(self, req, form):
    """References extraction page.

    This page can be used by authors to test their PDFs against our
    references extraction process.
    """
    user_info = collect_user_info(req)

    references_xml = None
    # Dispatch on the POST parameters, in order of precedence:
    # uploaded PDF, arXiv id, URL, raw text.
    if 'pdf' in form and form['pdf'].value:
        references_xml = extract_from_pdf_string(form['pdf'].value)
    elif 'arxiv' in form and form['arxiv'].value:
        arxiv_url = make_arxiv_url(arxiv_id=form['arxiv'].value)
        references_xml = extract_references_from_url_xml(arxiv_url)
    elif 'url' in form and form['url'].value:
        references_xml = extract_references_from_url_xml(form['url'].value)
    elif 'txt' in form and form['txt'].value:
        references_xml = extract_references_from_string_xml(form['txt'].value)

    if references_xml:
        out = """ <style type="text/css"> #referenceinp_link { display: none; } </style> """
        out += format_record(0, 'hdref',
                             xml_record=references_xml.encode('utf-8'),
                             user_info=user_info)
    else:
        # Nothing uploaded yet: display the upload form.
        out = self.extract_references_template()

    # Render the page (including header, footer).
    return page(title='References Extractor',
                body=out,
                uid=user_info['uid'],
                req=req)
def extract_references(self, req, form):
    """References extraction page.

    Authors can use this page to test their PDFs against our
    references extraction process.
    """
    user_info = collect_user_info(req)

    # Inspect the POST parameters; the first non-empty one wins.
    if 'pdf' in form and form['pdf'].value:
        pdf_data = form['pdf'].value
        references_xml = extract_from_pdf_string(pdf_data)
    elif 'arxiv' in form and form['arxiv'].value:
        pdf_url = make_arxiv_url(arxiv_id=form['arxiv'].value)
        references_xml = extract_references_from_url_xml(pdf_url)
    elif 'url' in form and form['url'].value:
        references_xml = extract_references_from_url_xml(form['url'].value)
    elif 'txt' in form and form['txt'].value:
        references_xml = extract_references_from_string_xml(form['txt'].value)
    else:
        references_xml = None

    if not references_xml:
        # Nothing uploaded yet: show the form allowing us to do so.
        out = self.extract_references_template()
    else:
        out = """ <style type="text/css"> #referenceinp_link { display: none; } </style> """
        out += format_record(0, 'hdref',
                             xml_record=references_xml.encode('utf-8'),
                             user_info=user_info)

    # Render the full page (header and footer included).
    return page(title='References Extractor', body=out,
                uid=user_info['uid'], req=req)
def replace_references(recid, uid=None, txt=None, url=None):
    """Replace references for a record.

    The record itself is not updated; the MARC XML of the document
    with updated references is returned (or ``None`` when no
    references were found).

    :param recid: the id of the record
    :param uid: user id owning the cache entry
    :param txt: references in text mode, if supplied
    :param url: fulltext URL to extract references from, if supplied
    """
    # Parse references from whichever source was provided.
    if txt is not None:
        references_xml = extract_references_from_string_xml(
            txt, is_only_references=True)
    elif url is not None:
        references_xml = extract_references_from_url_xml(url)
    else:
        references_xml = extract_references_from_record_xml(recid)
    references = create_record(references_xml.encode('utf-8'))

    dummy1, dummy2, record, dummy3, dummy4, dummy5, dummy6 = \
        get_cache_file_contents(recid, uid)

    references_to_add = record_get_field_instances(
        references[0], tag='999', ind1='C', ind2='5')
    refextract_status = record_get_field_instances(
        references[0], tag='999', ind1='C', ind2='6')

    out_xml = None
    if references_to_add:
        # Swap out any existing 999 fields for the freshly extracted ones.
        record_delete_fields(record, '999')
        record_add_fields(record, '999', references_to_add)
        record_add_fields(record, '999', refextract_status)
        # Serialise the record with its updated references.
        out_xml = record_xml_output(record)

    return out_xml
def extract(self, req, form):
    """References extraction page.

    This page can be used by authors to test their PDFs against our
    references extraction process.
    """
    user_info = collect_user_info(req)

    references_xml = None
    # Handle the POST parameters: pdf upload, arXiv id, URL or raw text.
    if 'pdf' in form and form['pdf'].value:
        references_xml = extract_from_pdf_string(form['pdf'].value.strip())
    elif 'arxiv' in form and form['arxiv'].value:
        arxiv_url = make_arxiv_url(arxiv_id=form['arxiv'].value.strip())
        references_xml = extract_references_from_url_xml(arxiv_url)
    elif 'url' in form and form['url'].value:
        target_url = form['url'].value.strip()
        try:
            references_xml = extract_references_from_url_xml(target_url)
        except (FullTextNotAvailable, ConnectionError, HTTPError, Timeout):
            # Document could not be fetched; fall through to the form.
            references_xml = None
    elif 'txt' in form and form['txt'].value:
        references_xml = extract_references_from_string_xml(
            form['txt'].value.decode('utf-8', 'ignore'))

    if references_xml:
        references_html = format_record(0, 'hdref',
                                        xml_record=references_xml,
                                        user_info=user_info)
        out = docextract_templates.tmpl_web_result(references_html)
    else:
        # Nothing uploaded yet: display the upload form.
        out = docextract_templates.tmpl_web_form()

    # Render the page (including header, footer).
    return page(title='References Extractor', body=out,
                uid=user_info['uid'], req=req)
def replace_references(recid, uid=None, txt=None, url=None):
    """Replace references for a record.

    The record itself is not updated; the MARC XML of the document
    with updated references is returned, or ``None`` when extraction
    yielded no references.

    :param recid: the id of the record
    :param uid: user id owning the cache entry
    :param txt: references in text mode, if supplied
    :param url: fulltext URL to extract references from, if supplied
    """
    # Choose the extraction source: explicit text, then URL, then the
    # record's own fulltext.
    if txt is not None:
        references_xml = extract_references_from_string_xml(
            txt, is_only_references=True)
    elif url is not None:
        references_xml = extract_references_from_url_xml(url)
    else:
        references_xml = extract_references_from_record_xml(recid)
    references = create_record(references_xml)

    dummy1, dummy2, record, dummy3, dummy4, dummy5, dummy6 = \
        get_cache_contents(recid, uid)

    references_to_add = record_get_field_instances(
        references[0], tag='999', ind1='C', ind2='5')
    refextract_status = record_get_field_instances(
        references[0], tag='999', ind1='C', ind2='6')

    out_xml = None
    if references_to_add:
        # Replace the existing 999 fields with the new references plus
        # the refextract status field.
        record_delete_fields(record, '999')
        record_add_fields(record, '999', references_to_add)
        record_add_fields(record, '999', refextract_status)
        out_xml = record_xml_output(record)

    return out_xml
def extract(self, req, form):
    """References extraction page.

    Authors can use this page to test their PDFs against our
    references extraction process.
    """
    user_info = collect_user_info(req)

    # Dispatch on the POST parameters; first non-empty one wins.
    if 'pdf' in form and form['pdf'].value:
        references_xml = extract_from_pdf_string(form['pdf'].value)
    elif 'arxiv' in form and form['arxiv'].value:
        pdf_url = make_arxiv_url(arxiv_id=form['arxiv'].value)
        references_xml = extract_references_from_url_xml(pdf_url)
    elif 'url' in form and form['url'].value:
        references_xml = extract_references_from_url_xml(form['url'].value)
    elif 'txt' in form and form['txt'].value:
        decoded_txt = form['txt'].value.decode('utf-8', 'ignore')
        references_xml = extract_references_from_string_xml(decoded_txt)
    else:
        references_xml = None

    if not references_xml:
        # Nothing uploaded yet: show the upload form.
        out = docextract_templates.tmpl_web_form()
    else:
        references_html = format_record(0, 'hdref',
                                        xml_record=references_xml,
                                        user_info=user_info)
        out = docextract_templates.tmpl_web_result(references_html)

    # Render the page (header and footer included).
    return page(title='References Extractor', body=out,
                uid=user_info['uid'], req=req)
def extract(self, req, form):
    """Refrences extraction page This page can be used for authors to
    test their pdfs against our refrences extraction process

    Besides reference extraction, this variant also runs plot
    extraction (pdfplotextractor / LaTeX tarball merging) and renders
    the extracted figures below the references.
    """
    user_info = collect_user_info(req)
    plots = None
    list_image_names = []
    list_caption = []
    # Web-visible directory where extracted plot images are copied.
    plots_dir = os.path.join(CFG_PREFIX, "var/www/img/plots/")
    # unique folder name
    # Handle the 3 POST parameters
    if 'pdf' in form and form['pdf'].value:
        pdf = form['pdf'].value
        references_xml = extract_from_pdf_string(pdf)
        # Persist the uploaded PDF to a temp file so the external
        # plot-extractor binary can read it.
        pdf_string = form['pdf'].file.read()
        pdf = safe_mkstemp('extract.pdf')
        f = open(pdf, 'w')
        f.write(pdf_string)
        f.close()
        plots = 'File pdf: ' + str(pdf) + '<br />'
        # NOTE(review): shell command built by string concatenation —
        # paths containing spaces/metacharacters would break this.
        (exit_code, output_buffer, stderr_output_buffer) = \
            run_shell_command(CFG_PDFPLOTEXTRACTOR_PATH + ' ' + pdf)
        plotextracted_pdf_path = pdf + ".extracted/extracted.json"
        code, figures, extracted = merging_articles(None,
                                                    plotextracted_pdf_path)
        id_fulltext = ""
        marc_path = create_MARCXML(figures, id_fulltext, code, extracted,
                                   write_file=True)
        plots += marc_path + '<br />'
        f = open(marc_path, 'r')
        record_xml = f.read()
        f.close()
        #plots_dir = "/opt/invenio/var/www/img/plots/"
        # Recreate the plots directory from scratch for this request.
        if os.path.exists(plots_dir):
            shutil.rmtree(plots_dir)
        os.mkdir(plots_dir)
        re_list = REGEXP_RECORD.findall(record_xml)
        for r in re_list:
            re_subfield = REGEXP_SUBFIELD_A.findall(r)
            for index, image_path in enumerate(re_subfield):
                # Only the first subfield of each record is an image path.
                if index == 0:
                    run_shell_command('cp ' + image_path + ' ' + plots_dir)
    elif 'arxiv' in form and form['arxiv'].value:
        plots = ""
        url_pdf = make_arxiv_url(arxiv_id=form['arxiv'].value)
        references_xml = extract_references_from_url_xml(url_pdf)
        # Also fetch the LaTeX source tarball for plot extraction.
        url_tarball = make_arxiv_tar_url(arxiv_id=form['arxiv'].value)
        plotextracted_xml_path, plotextracted_pdf_path = \
            extract_plots_from_latex_and_pdf(url_tarball, url_pdf)
        plots += 'TAR: ' + plotextracted_xml_path + '<br />'
        plots += 'PDF: ' + plotextracted_pdf_path + '<br />'
        # NOTE(review): the following triple-quoted string is dead
        # commented-out code kept verbatim from the original.
        ''' code, figures, extracted = merging_latex_pdf(plotextracted_xml_path, None, "", ) id_fulltext = "" marc_path = create_MARCXML(figures, id_fulltext, code, extracted, write_file=True) '''
        dest_dir = os.path.join(CFG_TMPDIR, 'textmining')
        try:
            os.mkdir(dest_dir)
        except OSError:
            # Directory already exists — that is fine.
            pass
        code, message, figures, marc_path = merging_latex_pdf(
            plotextracted_xml_path, "", "", dest_dir)
        plots += 'OUTPUT: ' + marc_path + '<br />'
        f = open(marc_path, 'r')
        record_xml = f.read()
        f.close()
        # Recreate the plots directory from scratch for this request.
        if os.path.exists(plots_dir):
            shutil.rmtree(plots_dir)
        os.mkdir(plots_dir)
        re_list = REGEXP_RECORD.findall(record_xml)
        for r in re_list:
            re_subfield = REGEXP_SUBFIELD_A.findall(r)
            re_subfield_caption = REGEXP_SUBFIELD_D.findall(r)
            for index, image_path in enumerate(re_subfield):
                if index == 0:
                    run_shell_command('cp ' + image_path + ' ' + plots_dir)
                # Record name/caption pairs for rendering below.
                list_image_names.append(os.path.split(image_path)[1])
                list_caption.append(re_subfield_caption[index])
    elif 'url' in form and form['url'].value:
        url = form['url'].value
        references_xml = extract_references_from_url_xml(url)
        plots = "ME3"
    elif 'txt' in form and form['txt'].value:
        txt = form['txt'].value
        references_xml = extract_references_from_string_xml(txt)
    else:
        references_xml = None
    # If we have not uploaded anything yet
    # Display the form that allows us to do so
    if not references_xml:
        out = self.extract_references_template()
    else:
        out = """ <style type="text/css"> #referenceinp_link { display: none; } /*img.plot { width: 250px; height: 250px; }*/ </style> """
        out += format_record(0, 'hdref',
                             xml_record=references_xml.encode('utf-8'),
                             user_info=user_info)
        if plots:
            out += "<h2>Plots</h2>"
            out += plots
            # Render each copied image with its caption (when known).
            dirList = os.listdir(plots_dir)
            for i, fname in enumerate(dirList):
                out += '<h3>Figure ' + str(i+1) + '</h3> <p><img src="/img/plots/' + fname + '" class="plot"></p>'
                index = list_image_names.index(fname)
                out += '<p>' + list_caption[index] + '</p>'
    # Render the page (including header, footer)
    return page(title='Document Extractor',
                body=out,
                uid=user_info['uid'],
                req=req)
def _add_references(self, xml_doc, rec):
    """Add the references found in *xml_doc* to *rec* as MARC 999C5 fields.

    Two code paths exist: the CONSYN feed path (reference tuples carry
    journal/publisher/editor metadata) and the legacy path with a
    shorter tuple layout.
    """
    if self.CONSYN:
        for label, authors, doi, issue, page, title, volume, year,\
                textref, ext_link, isjournal, comment, journal, publisher,\
                editors, book_title in self.get_references(xml_doc):
            subfields = []
            if textref and not authors:
                # No structured authors: run the raw reference text
                # through refextract and keep its subfields.
                textref = textref.replace('\"', '\'')
                ref_xml = extract_references_from_string_xml(textref)
                dom = xml.dom.minidom.parseString(ref_xml)
                fields = dom.getElementsByTagName("datafield")[0]
                fields = fields.getElementsByTagName("subfield")
                for field in fields:
                    data = field.firstChild.data
                    code = field.getAttribute("code")
                    if code == 's':
                        # Normalise the journal part of the
                        # "journal,volume,page" subfield via the
                        # mapping table; fall back to the raw data
                        # when it does not split into enough parts.
                        try:
                            journal = data.split(',')[0]
                            journal, vol = fix_journal_name(
                                journal, self.journal_mappings)
                            vol += data.split(',')[1]
                            try:
                                page = data.split(',')[2]
                                journal = journal + "," + vol + "," + page
                                subfields.append(('s', journal))
                            except IndexError:
                                # No page component present.
                                journal = journal + "," + vol
                                subfields.append(('s', journal))
                        except IndexError:
                            # Not in "journal,volume[,page]" form.
                            subfields.append(('s', data))
                    else:
                        subfields.append((code, data))
                if label:
                    # Strip brackets, dots and closing parens from the label.
                    label = re.sub("[\[\].)]", "", label)
                    subfields.append(('o', label))
                if subfields:
                    record_add_field(rec, '999', ind1='C', ind2='5',
                                     subfields=subfields)
            else:
                # Structured reference metadata is available; map each
                # element to its MARC subfield code.
                if doi:
                    subfields.append(('a', doi))
                for author in authors:
                    subfields.append(('h', author))
                if issue:
                    subfields.append(('n', issue))
                if ext_link:
                    subfields.append(('r', ext_link))
                if title:
                    subfields.append(('t', title))
                elif textref:
                    subfields.append(('m', textref))
                if publisher:
                    subfields.append(('p', publisher))
                if volume:
                    subfields.append(('v', volume))
                if year:
                    subfields.append(('y', year))
                if comment:
                    subfields.append(('m', comment))
                for editor in editors:
                    subfields.append(('e', editor))
                if book_title:
                    subfields.append(('q', book_title))
                if label:
                    label = re.sub("[\[\].)]", "", label)
                    subfields.append(('o', label))
                if journal:
                    # Normalise the journal name and merge the mapped
                    # volume prefix with the reference's own volume.
                    journal, vol = fix_journal_name(journal,
                                                    self.journal_mappings)
                    volume = vol + volume
                    if volume and page:
                        journal = journal + "," + volume + "," + page
                        subfields.append(('s', journal))
                    elif volume:
                        journal = journal + "," + volume
                        subfields.append(('s', journal))
                    else:
                        subfields.append(('s', journal))
                if subfields:
                    record_add_field(rec, '999', ind1='C', ind2='5',
                                     subfields=subfields)
    else:
        # Legacy (non-CONSYN) tuple layout.
        for label, authors, doi, issue, page, title, volume, year,\
                textref, ext_link in self.get_references(xml_doc):
            subfields = []
            if doi:
                subfields.append(('a', doi))
            for author in authors:
                subfields.append(('h', author))
            if issue:
                subfields.append(('n', issue))
            if label:
                subfields.append(('o', label))
            if page:
                subfields.append(('p', page))
            if ext_link:
                subfields.append(('r', ext_link))
            if title and volume and year and page:
                subfields.append(
                    ('s', '%s %s (%s) %s' % (title, volume, year, page)))
            elif textref:
                subfields.append(('m', textref))
            if title:
                subfields.append(('t', title))
            if volume:
                subfields.append(('v', volume))
            if year:
                subfields.append(('y', year))
            if subfields:
                record_add_field(
                    rec, '999', ind1='C', ind2='5', subfields=subfields)
def _get_record(self, link):
    """Scrape one Springer chapter page and build its MARC record.

    :param link: a parsed HTML node containing the ``<a>`` element
        whose ``href`` points (relative to ``self.base_url``) at the
        chapter's abstract page.
    :return: the root ``record`` DOM element of the generated MARC XML,
        with refextract-processed 999 fields appended.

    Side effect: sets ``self.content`` to the page's main content div
    (``self._find`` presumably searches within it — TODO confirm).
    """
    link = link.find('a')['href']
    url = urlparse.urljoin(self.base_url, link)
    page = urllib2.urlopen(url)
    page = BeautifulSoup(page)
    self.content = page.body.find('div', attrs={'id': 'content'})
    publication_title = self.content.find('div',
                                          {'id': 'publication-title'})
    if publication_title:
        publication_title = publication_title.find('a').text
    else:
        publication_title = ''
    series_title = self._find('a', {'id': 'series-title'})
    # Normalise this one series name to its INSPIRE abbreviation.
    if series_title == 'NATO Science Series':
        series_title = 'NATO Sci.Ser.'
    title = self._find('h1', {'id': 'title'})
    # Volume: keep digits only; fall back to the volume-range element.
    volume = self._find('span', {'id': 'book-volume'})
    if volume:
        volume = re.sub(r'\D', '', volume)
    else:
        volume = self._find('span', {'id': 'volume-range'})
        volume = re.sub(r'\D', '', volume)
    issue = self._find('a', {'id': 'issue-range'})
    if issue:
        issue = issue.split()[1]
    # Year: digits from the copyright year, else first 4 digits of the
    # cover date.
    year = self._find('span', {'id': 'copyright-year'})
    year = re.sub(r'\D', '', year)
    if not year:
        year = self._find('dd', {'id': 'abstract-about-cover-date'})
        year = re.sub(r'\D', '', year)[:4]
    abstract = self._find('div', {'class': 'abstract-content formatted'})
    page_range = self._find('span', {'id': 'page-range'})
    if page_range:
        page_range = page_range.replace('pp', '').strip()
    # NOTE(review): publisher/issn/subtitle/isbn values are collected
    # below but never added to the record.
    publisher = self._find('dd', {'id': 'abstract-about-publisher'})
    copyright_holder = self._find(
        'dd', {'id': 'abstract-about-book-copyright-holder'})
    issn = self._find('dd',
                      {'id': 'abstract-about-book-series-print-issn'})
    doi = self._find('dd', {'class': 'doi'})
    subtitle = self._find('dd',
                          {'id': 'abstract-about-book-series-subtitle'})
    online_isbn = self._find('dd',
                             {'id': 'abstract-about-book-online-isbn'})
    print_isbn = self._find('dd',
                            {'id': 'abstract-about-book-print-isbn'})
    # Collect editors and their affiliations (empty string when the
    # <sup> marker or its title attribute is missing).
    editors = []
    editors_affiliations = []
    for editor in self.content.findAll('li', attrs={'itemprop': 'editor'}):
        editors.append(editor.find('a').text)
        try:
            editors_affiliations.append(editor.find('sup')['title'])
        except KeyError:
            editors_affiliations.append('')
        except TypeError:
            editors_affiliations.append('')
    # Collect authors, reordered to "Surname, Given Names" with
    # collapsed initials, plus their affiliations.
    authors = []
    authors_affiliations = []
    summary = self.content.find('div', attrs={'class': 'summary'})
    for author in summary.findAll('li', attrs={'itemprop': 'author'}):
        author_name = author.find('a').text
        author_names = []
        author_names.append(author_name.split()[-1] + ",")
        author_names += author_name.split()[:-1]
        author_name = " ".join(author_names)
        author_name = collapse_initials(author_name)
        authors.append(author_name)
        try:
            authors_affiliations.append(author.find('sup')['title'])
        except KeyError:
            authors_affiliations.append('')
        except TypeError:
            authors_affiliations.append('')
    # Fulltext PDF link (TypeError when the anchor is absent).
    try:
        attrs = {'id': 'abstract-actions-download-chapter-pdf-link'}
        fulltext = self.content.find('a', attrs=attrs)
        fulltext = urlparse.urljoin(self.base_url, fulltext['href'])
    except TypeError:
        fulltext = ''
    #create marc record
    rec = {}
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    # First author goes to 100, the rest to 700.
    first_author = True
    for i in range(len(authors)):
        subfields = [('a', '%s' % (authors[i]))]
        if authors_affiliations[i]:
            subfields.append(('v', authors_affiliations[i]))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract), ('9', 'Springer')])
    if copyright_holder:
        record_add_field(rec, '542',
                         subfields=[('f', copyright_holder), ('g', year)])
    if not series_title:
        series_title = publication_title
    # 773: host item (series, volume, issue, pages, year).
    subfields = []
    if series_title:
        subfields.append(('p', series_title))
    if volume:
        subfields.append(('v', volume))
    if issue:
        subfields.append(('n', issue))
    if page_range:
        subfields.append(('c', page_range))
    if year:
        subfields.append(('y', year))
    record_add_field(rec, '773', subfields=subfields)
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    record_add_field(rec, '980', subfields=[('a', 'BookChapter')])
    if fulltext:
        record_add_field(rec, 'FFT', subfields=[('a', fulltext),
                                                ('t', 'Springer'),
                                                ('d', 'Fulltext')])
    recordString = record_xml_output(rec)
    #removes whitespaces except spaces
    recordString = re.sub(r'[\n\t\r\f\v]', '', recordString)
    #removes two or more consecutive spaces
    recordString = re.sub(r' {2,}', '', recordString)
    record = parseString(recordString)
    # Run each scraped reference through refextract and append the
    # resulting 999 datafields to the record DOM.
    references = []
    ref_fields = []
    references_container = self.content.find(
        'div', attrs={'id': 'abstract-references'})
    if references_container:
        references = references_container.findAll('li')
    for reference in references:
        ref = xml_to_text(parseString(reference.decode()))
        #removes the space between hep-th/ and the identifier
        ref = re.sub(r'hep-th/\s(\d*)', r'hep-th/\1', ref)
        ref = extract_references_from_string_xml(ref)
        ref = parseString(ref)
        for field in ref.childNodes:
            # Clean the "m" (misc text) subfield: drop arXiv:/CrossRef
            # noise; remove the subfield entirely if nothing remains.
            for subfield in field.getElementsByTagName('subfield'):
                if subfield.getAttribute('code') == 'm':
                    text = subfield.firstChild.data
                    text = re.sub(r'\[?arXiv:', '', text)
                    text = text.replace('CrossRef', '')
                    if text.startswith(': '):
                        text = text[2:]
                    if text:
                        subfield.firstChild.data = text
                    else:
                        parentNode = subfield.parentNode
                        parentNode.removeChild(subfield)
            ref_fields.append(field.firstChild)
    for field in ref_fields:
        record.firstChild.appendChild(field)
    return record.firstChild
def _get_record(self, link):
    """Scrape one Springer chapter page and build its MARC record.

    Variant with extra fallbacks (ChapterTitle, chapter copyright year,
    chapter page ranges) and a graceful degradation to a plain 999
    "m" subfield when the refextract API is not importable.

    :param link: parsed HTML node whose ``<a href>`` points (relative
        to ``self.base_url``) at the chapter's abstract page.
    :return: the root ``record`` DOM element of the generated MARC XML.

    Side effect: sets ``self.content`` to the page's main content div
    (``self._find`` presumably searches within it — TODO confirm).
    """
    link = link.find('a')['href']
    url = urlparse.urljoin(self.base_url, link)
    page = urllib2.urlopen(url)
    page = BeautifulSoup(page)
    self.content = page.body.find('div', attrs={'id': 'content'})
    publication_title = self.content.find('div',
                                          {'id': 'publication-title'})
    if publication_title:
        publication_title = publication_title.find('a').text
    else:
        publication_title = ''
    series_title = self._find('a', {'id': 'series-title'})
    # Normalise this one series name to its INSPIRE abbreviation.
    if series_title == 'NATO Science Series':
        series_title = 'NATO Sci.Ser.'
    title = self._find('h1', {'id': 'title'})
    if not title:
        title = self._find('h1', {'class': 'ChapterTitle'})
    # Volume: keep digits only; fall back to the volume-range element.
    volume = self._find('span', {'id': 'book-volume'})
    if volume:
        volume = re.sub(r'\D', '', volume)
    else:
        volume = self._find('span', {'id': 'volume-range'})
        volume = re.sub(r'\D', '', volume)
    issue = self._find('a', {'id': 'issue-range'})
    if issue:
        issue = issue.split()[1]
    # Year: copyright year, then chapter copyright year, then the
    # first 4 digits of the cover date.
    year = self._find('span', {'id': 'copyright-year'})
    if not year:
        year = self._find(
            'dd', {'id': 'abstract-about-book-chapter-copyright-year'})
    year = re.sub(r'\D', '', year)
    if not year:
        year = self._find('dd', {'id': 'abstract-about-cover-date'})
        year = re.sub(r'\D', '', year)[:4]
    abstract = self._find('div', {'class': 'abstract-content formatted'})
    page_range = self._find('span', {'id': 'page-range'})
    if not page_range:
        page_range = self._find(
            'dd', {'id': 'abstract-about-book-chapter-page-ranges'})
    if page_range:
        page_range = page_range.replace('pp', '').strip()
    #publisher = self._find('dd', {'id': 'abstract-about-publisher'})
    copyright_holder = self._find(
        'dd', {'id': 'abstract-about-book-copyright-holder'})
    #issn = self._find('dd', {'id': 'abstract-about-book-series-print-issn'})
    doi = self._find('dd', {'class': 'doi'})
    #subtitle = self._find('dd', {'id': 'abstract-about-book-series-subtitle'})
    #online_isbn = self._find('dd', {'id': 'abstract-about-book-online-isbn'})
    #print_isbn = self._find('dd', {'id': 'abstract-about-book-print-isbn'})
    # Collect editors and their affiliations (empty string when the
    # <sup> marker or its title attribute is missing).
    editors = []
    editors_affiliations = []
    for editor in self.content.findAll('li', attrs={'itemprop': 'editor'}):
        editors.append(editor.find('a').text)
        try:
            editors_affiliations.append(editor.find('sup')['title'])
        except KeyError:
            editors_affiliations.append('')
        except TypeError:
            editors_affiliations.append('')
    # Collect authors, reordered to "Surname, Given Names" with
    # collapsed initials, plus their affiliations.
    authors = []
    authors_affiliations = []
    summary = self.content.find('div', attrs={'class': 'summary'})
    for author in summary.findAll('li', attrs={'itemprop': 'author'}):
        author_name = author.find('a').text
        author_names = []
        author_names.append(author_name.split()[-1] + ",")
        author_names += author_name.split()[:-1]
        author_name = " ".join(author_names)
        author_name = collapse_initials(author_name)
        authors.append(author_name)
        try:
            authors_affiliations.append(author.find('sup')['title'])
        except KeyError:
            authors_affiliations.append('')
        except TypeError:
            authors_affiliations.append('')
    # Fulltext PDF link (TypeError when the anchor is absent).
    try:
        attrs = {'id': 'abstract-actions-download-chapter-pdf-link'}
        fulltext = self.content.find('a', attrs=attrs)
        fulltext = urlparse.urljoin(self.base_url, fulltext['href'])
    except TypeError:
        fulltext = ''
    #create Marc record
    rec = create_record()
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    # First author goes to 100, the rest to 700.
    first_author = True
    for i in range(len(authors)):
        subfields = [('a', '%s' % (authors[i]))]
        if authors_affiliations[i]:
            subfields.append(('v', authors_affiliations[i]))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract), ('9', 'Springer')])
    if copyright_holder:
        record_add_field(rec, '542',
                         subfields=[('f', copyright_holder), ('g', year)])
    if not series_title:
        series_title = publication_title
    # 773: host item (series, volume, issue, pages, year).
    subfields = []
    if series_title:
        subfields.append(('p', series_title))
    if volume:
        subfields.append(('v', volume))
    if issue:
        subfields.append(('n', issue))
    if page_range:
        subfields.append(('c', page_range))
    if year:
        subfields.append(('y', year))
    record_add_field(rec, '773', subfields=subfields)
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    record_add_field(rec, '980', subfields=[('a', 'BookChapter')])
    if fulltext:
        record_add_field(rec, 'FFT', subfields=[('a', fulltext),
                                                ('t', 'Springer'),
                                                ('d', 'Fulltext')])
    recordString = record_xml_output(rec)
    #removes whitespace except spaces
    recordString = re.sub(r'[\n\t\r\f\v]', '', recordString)
    #removes two or more consecutive spaces
    recordString = re.sub(r' {2,}', '', recordString)
    record = parseString(recordString)
    # Run each scraped reference through refextract and append the
    # resulting 999 datafields; when refextract cannot be imported,
    # store the raw reference text as a 999C5 "m" subfield instead.
    references = []
    ref_fields = []
    references_container = self.content.find(
        'div', attrs={'id': 'abstract-references'})
    if references_container:
        references = references_container.findAll('li')
    for reference in references:
        try:
            from invenio.refextract_api import (
                extract_references_from_string_xml)
            ref = xml_to_text(parseString(reference.decode()))
            #removes the space between hep-th/ and the identifier
            ref = re.sub(r'hep-th/\s(\d*)', r'hep-th/\1', ref)
            ref = extract_references_from_string_xml(ref)
            ref = parseString(ref)
            for field in ref.childNodes:
                # Clean the "m" (misc text) subfield: drop arXiv:/
                # CrossRef noise; remove it entirely if empty after.
                for subfield in field.getElementsByTagName('subfield'):
                    if subfield.getAttribute('code') == 'm':
                        text = subfield.firstChild.data
                        text = re.sub(r'\[?arXiv:', '', text)
                        text = text.replace('CrossRef', '')
                        if text.startswith(': '):
                            text = text[2:]
                        if text:
                            subfield.firstChild.data = text
                        else:
                            parentNode = subfield.parentNode
                            parentNode.removeChild(subfield)
                ref_fields.append(field.firstChild)
        except ImportError:
            record_add_field(rec, '999', ind1='C', ind2='5',
                            subfields=[('m', reference.decode())])
    for field in ref_fields:
        record.firstChild.appendChild(field)
    return record.firstChild
def get_record_rich(self, filename):
    """ Gets the Marc xml of the files in xaml_rich directory

    :param fileName: the name of the file to parse.
    :type fileName: string

    :returns: a string with the marc xml version of the file.
    """
    self.document = parse(filename)
    rec = create_record()
    articles = self.document.getElementsByTagName("ArticleID")
    for article in articles:
        article_type = article.getAttribute("Type")
        # Anything that is not a regular article is skipped entirely.
        if not article_type == "Article":
            return ""
        doi = get_value_in_tag(self.document, "DOI")
        # Publication date: prefer the "Accepted" date, fall back to
        # the "OnlineDate".
        date = ""
        for tag in self.document.getElementsByTagName("Accepted"):
            year = get_value_in_tag(tag, "Year")
            month = get_value_in_tag(tag, "Month").zfill(2)
            day = get_value_in_tag(tag, "Day").zfill(2)
            date = "%s-%s-%s" % (year, month, day)
        if not date:
            for tag in self.document.getElementsByTagName("OnlineDate"):
                year = get_value_in_tag(tag, "Year")
                month = get_value_in_tag(tag, "Month").zfill(2)
                day = get_value_in_tag(tag, "Day").zfill(2)
                date = "%s-%s-%s" % (year, month, day)
        first_page = get_value_in_tag(article, "FirstPage")
        last_page = get_value_in_tag(article, "LastPage")
        subjects = article.getElementsByTagName("Keyword")
        subjects = map(xml_to_text, subjects)
        subject = ", ".join(subjects)
        copyright_statement = get_value_in_tag(article, "Copyright")
    # Journal name is normalised via the mapping table; the mapped
    # volume prefix is then extended with the issue's volume number.
    journal = get_value_in_tag(self.document, "JournalTitle")
    journal, volume = fix_journal_name(journal, self.journal_mappings)
    issues = self.document.getElementsByTagName("IssueID")
    for issue in issues:
        volume += get_value_in_tag(issue, "Volume")
        year = get_value_in_tag(issue, "Year")
    title = get_value_in_tag(self.document, "Title")
    authors = self.document.getElementsByTagName("Author")
    affiliations = self.document.getElementsByTagName("Affiliation")

    # Maps an Affiliation node to an (ID, text) pair.
    def affiliation_pair(a):
        return a.getAttribute("ID"), get_value_in_tag(
            a, "UnstructuredAffiliation")

    affiliations = map(affiliation_pair, affiliations)
    affiliations = dict(affiliations)

    # Maps an Author node to a ("Surname, Given [Middle]", affiliation)
    # pair; missing AffiliationID or unknown label yields "".
    def author_pair(a):
        surname = get_value_in_tag(a, "LastName")
        first_name = get_value_in_tag(a, "FirstName")
        middle_name = get_value_in_tag(a, "MiddleName")
        if middle_name:
            name = "%s, %s %s" % (surname, first_name, middle_name)
        else:
            name = "%s, %s" % (surname, first_name)
        try:
            affid = a.getElementsByTagName(
                "AffiliationID")[0].getAttribute("Label")
            affiliation = affiliations[affid]
        except IndexError:
            affiliation = ""
        except KeyError:
            affiliation = ""
        return name, affiliation

    authors = map(author_pair, authors)
    abstract = get_value_in_tag(self.document, "Abstract")
    # References: structured Biblioset parts become y/s subfields; the
    # remaining free text is run through refextract.
    references = self.document.getElementsByTagName("Bibliomixed")
    for reference in references:
        subfields = []
        label = reference.getAttribute("N")
        if label:
            subfields.append(("o", label))
        bibliosets = reference.getElementsByTagName("Biblioset")
        for tag in bibliosets:
            ref_year = get_value_in_tag(tag, "Date")
            ref_journal = get_value_in_tag(tag, "JournalShortTitle")
            ref_journal, ref_volume = fix_journal_name(
                ref_journal, self.journal_mappings)
            ref_volume += get_value_in_tag(tag, "Volume")
            ref_page = get_value_in_tag(tag, "ArtPageNums")
            if ref_year:
                subfields.append(("y", ref_year))
            if ref_journal and ref_volume and ref_page:
                subfields.append(("s", "%s,%s,%s" % (ref_journal,
                                                     ref_volume,
                                                     ref_page)))
            # Remove the structured part so only free text remains.
            reference.removeChild(tag)
        text_ref = xml_to_text(reference)
        ref_xml = extract_references_from_string_xml(text_ref)
        dom = parseString(ref_xml)
        fields = dom.getElementsByTagName("datafield")[0]
        fields = fields.getElementsByTagName("subfield")
        if fields:
            subfields.append(("9", "refextract"))
            for field in fields:
                data = field.firstChild.data
                code = field.getAttribute("code")
                # Skip the raw-text subfield when structured data
                # already covered it.
                if code == "m" and bibliosets:
                    continue
                else:
                    subfields.append((code, data))
        if subfields:
            record_add_field(rec, "999", ind1="C", ind2="5",
                             subfields=subfields)
    if title:
        record_add_field(rec, "245", subfields=[("a", title)])
    if date:
        record_add_field(rec, "260", subfields=[("c", date),
                                                ("t", "published")])
    if doi:
        record_add_field(rec, "024", ind1="7", subfields=[("a", doi),
                                                          ("2", "DOI")])
    if abstract:
        record_add_field(rec, "520", subfields=[("a", abstract),
                                                ("9", "EDPSciences")])
    # First author goes to 100, the rest to 700.
    first_author = True
    for author in authors:
        if first_author:
            subfields = [("a", author[0])]
            if author[1]:
                subfields.append(("v", author[1]))
            record_add_field(rec, "100", subfields=subfields)
            first_author = False
        else:
            subfields = [("a", author[0])]
            if author[1]:
                subfields.append(("v", author[1]))
            record_add_field(rec, "700", subfields=subfields)
    subfields = []
    if journal and volume and first_page:
        subfields.append(("s", "%s,%s,%s" % (journal, volume, first_page)))
    if first_page and last_page:
        try:
            # NOTE(review): computed as last - first (not +1) and the
            # variable is misspelled; kept verbatim — confirm intent.
            nuber_of_pages = int(last_page) - int(first_page)
            record_add_field(rec, "300",
                             subfields=[("a", str(nuber_of_pages))])
        except ValueError:
            # Non-numeric page numbers: skip the page count field.
            pass
        subfields.append(("c", "%s-%s" % (first_page, last_page)))
    if year:
        subfields.append(("y", year))
    record_add_field(rec, "773", subfields=subfields)
    record_add_field(rec, "980", subfields=[("a", "HEP")])
    if copyright_statement:
        record_add_field(rec, "542",
                         subfields=[("f", copyright_statement)])
    if subject:
        record_add_field(rec, "650", ind1="1", ind2="7",
                         subfields=[("2", "EDPSciences"), ("a", subject)])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""