def test_format_record(self):
        """bibformat - correct formatting

        Renders two XML fixtures with output format "test3" and checks
        the value returned by bibformat_engine.format_record.
        """
        # Disabled during migration: output format that has no match.
        # result = bibformat_engine.format_record(recID=None, of="test2", xml_record=self.xml_text_2)
        # self.assertEqual(result.replace("\n", ""), "")

        # Output format that links to an unknown template: once the
        # newlines are stripped nothing may be left.
        unknown_template_args = dict(recID=None,
                                     of="test3",
                                     xml_record=self.xml_text_2)
        rendered = bibformat_engine.format_record(**unknown_template_args)
        self.assertEqual(rendered.replace("\n", ""), "")

        # Disabled during migration: unknown output format.
        # result = bibformat_engine.format_record(recID=None, of="unkno", xml_record=self.xml_text_3)
        # self.assertEqual(result.replace("\n", ""), "")

        # Default formatting, requested in French.
        rendered = bibformat_engine.format_record(
            recID=None, ln='fr', of="test3", xml_record=self.xml_text_3)
        expected = '''<h1>hi</h1> this is my template\ntest<bfe_non_existing_element must disappear/><test_1  non prefixed element must stay as any normal tag/>tfrgarbage\n<br/>test me!&lt;b&gt;ok&lt;/b&gt;a default valueeditor\n<br/>test me!<b>ok</b>a default valueeditor\n<br/>test me!&lt;b&gt;ok&lt;/b&gt;a default valueeditor\n'''
        self.assertEqual(rendered, expected)
    def test_format_record(self):
        """bibformat - correct formatting

        Points the engine at the test configuration paths, then renders
        two XML fixtures with output format "test3".
        """
        # Copy the module-level test paths onto the engine module.
        path_overrides = (
            ("CFG_BIBFORMAT_OUTPUTS_PATH", CFG_BIBFORMAT_OUTPUTS_PATH),
            ("CFG_BIBFORMAT_ELEMENTS_PATH", CFG_BIBFORMAT_ELEMENTS_PATH),
            ("CFG_BIBFORMAT_ELEMENTS_IMPORT_PATH", CFG_BIBFORMAT_ELEMENTS_IMPORT_PATH),
            ("CFG_BIBFORMAT_TEMPLATES_PATH", CFG_BIBFORMAT_TEMPLATES_PATH),
        )
        for setting_name, setting_value in path_overrides:
            setattr(bibformat_engine, setting_name, setting_value)

        # Disabled during migration: output format that has no match.
        # result = bibformat_engine.format_record(recID=None, of="test2", xml_record=self.xml_text_2)
        # self.assertEqual(result.replace("\n", ""), "")

        # Output format that links to an unknown template: once the
        # newlines are stripped nothing may be left.
        rendered = bibformat_engine.format_record(
            recID=None,
            of="test3",
            xml_record=self.xml_text_2,
        )
        self.assertEqual(rendered.replace("\n", ""), "")

        # Disabled during migration: unknown output format.
        # result = bibformat_engine.format_record(recID=None, of="unkno", xml_record=self.xml_text_3)
        # self.assertEqual(result.replace("\n", ""), "")

        # Default formatting, requested in French.
        rendered = bibformat_engine.format_record(
            recID=None,
            ln="fr",
            of="test3",
            xml_record=self.xml_text_3,
        )
        expected = """<h1>hi</h1> this is my template\ntest<bfe_non_existing_element must disappear/><test_1  non prefixed element must stay as any normal tag/>tfrgarbage\n<br/>test me!&lt;b&gt;ok&lt;/b&gt;a default valueeditor\n<br/>test me!<b>ok</b>a default valueeditor\n<br/>test me!&lt;b&gt;ok&lt;/b&gt;a default valueeditor\n"""
        self.assertEqual(rendered, expected)
def clean(r):
    """Return the "hlxu" rendering of ``r`` with its line markup flattened.

    ``r`` is handed to ``format_record`` as its first argument.  The first
    element of the returned value has ``<br>``, ``<br />``, ``<pre>`` and
    ``</pre>`` turned into newlines, ``&nbsp;`` removed and ``%```` reduced
    to two backticks.
    """
    # NOTE(review): the result of this French-language call is discarded,
    # exactly as in the original code; confirm whether the call is needed
    # for a side effect or can be dropped.
    format_record(r, "hlxu", "fr")
    text = format_record(r, "hlxu")[0]
    # Applied strictly in this order.
    substitutions = (
        ('<br>', '\n'),
        ('<pre>', '\n'),
        ('<br />', '\n'),
        ('</pre>', '\n'),
        ('&nbsp;', ''),
        ('%``', '``'),
    )
    for needle, replacement in substitutions:
        text = text.replace(needle, replacement)
    return text
def clean(r):
    """Return the "tlcv" rendering of ``r`` with its line markup flattened.

    ``r`` is handed to ``format_record`` as its first argument.  The first
    element of the returned value has ``<br>``, ``<br />``, ``<pre>`` and
    ``</pre>`` turned into newlines, ``&nbsp;`` removed and ``%```` reduced
    to two backticks.
    """
    # NOTE(review): the result of this "hlxu"/French call is discarded,
    # exactly as in the original code; confirm whether the call is needed
    # for a side effect or can be dropped.
    format_record(r, "hlxu", "fr")
    text = format_record(r, "tlcv")[0]
    # Applied strictly in this order.
    substitutions = (
        ('<br>', '\n'),
        ('<pre>', '\n'),
        ('<br />', '\n'),
        ('</pre>', '\n'),
        ('&nbsp;', ''),
        ('%``', '``'),
    )
    for needle, replacement in substitutions:
        text = text.replace(needle, replacement)
    return text
Exemple #5
0
def afs_sync(modified_records, time_estimator, tot, now):
    """Sync to AFS.

    Writes every record of ``modified_records``, formatted as 'xme', into
    a gzipped MARCXML collection file named after ``CFG_OUTPUT_PATH`` plus
    the timestamp of ``now``.  The file is then appended to the tar
    archive ``CFG_OUTPUT_PATH + '.tar'`` and removed.

    @param modified_records: iterable of record IDs to dump
    @param time_estimator: passed through to ``shall_sleep``
    @param tot: passed through to ``shall_sleep``
    @param now: object providing ``strftime`` (presumably a datetime),
        used to timestamp the dump file name
    """
    write_message("Appending output to %s" % CFG_OUTPUT_PATH)
    prodsyncname = CFG_OUTPUT_PATH + now.strftime("%Y%m%d%H%M%S") + '.xml.gz'
    r = gzip.open(prodsyncname, "w")
    # try/finally so the gzip handle is released even when formatting or
    # writing a record raises.
    try:
        print >> r, '<collection xmlns="http://www.loc.gov/MARC21/slim">'
        for i, recid in enumerate(modified_records):
            record = format_record(recid, 'xme', user_info=ADMIN_USER_INFO)[0]
            if not record:
                write_message("Error formatting record {0} as 'xme': {1}".format(
                    recid, record
                ))
            else:
                print >> r, record
            if shall_sleep(recid, i, tot, time_estimator):
                # Push buffered data out before the task may be suspended.
                r.flush()
                task_sleep_now_if_required()
        print >> r, '</collection>'
    finally:
        r.close()
    prodsync_tarname = CFG_OUTPUT_PATH + '.tar'
    write_message("Adding %s to %s" % (prodsyncname, prodsync_tarname))
    prodsync_tar = tarfile.open(prodsync_tarname, 'a')
    try:
        prodsync_tar.add(prodsyncname)
    finally:
        prodsync_tar.close()
    # The dump is only removed once it has been added to the archive.
    os.remove(prodsyncname)
 def test_empty_formatting(self):
     """bibformat - formatting empty record"""
     # Even at the highest verbosity, the empty-record fixture must
     # render to an empty string in the 'hb' format.
     formatted = bibformat_engine.format_record(
         recID=0, of='hb', verbose=9, xml_record=self.empty_record_xml)
     self.assertEqual(formatted, '')
Exemple #7
0
def afs_sync(modified_records, time_estimator, tot, now):
    """Sync to AFS.

    Writes every record of ``modified_records``, formatted as 'xme' inside
    a ``run_ro_on_slave_db()`` context, into a gzipped MARCXML collection
    file named after ``CFG_OUTPUT_PATH`` plus the timestamp of ``now``.
    The file is then appended to the tar archive
    ``CFG_OUTPUT_PATH + '.tar'`` and removed.

    @param modified_records: iterable of record IDs to dump
    @param time_estimator: passed through to ``shall_sleep``
    @param tot: passed through to ``shall_sleep``
    @param now: object providing ``strftime`` (presumably a datetime),
        used to timestamp the dump file name
    """
    write_message("Appending output to %s" % CFG_OUTPUT_PATH)
    prodsyncname = CFG_OUTPUT_PATH + now.strftime("%Y%m%d%H%M%S") + '.xml.gz'
    r = gzip.open(prodsyncname, "w")
    # try/finally so the gzip handle is released even when formatting or
    # writing a record raises.
    try:
        print >> r, '<collection xmlns="http://www.loc.gov/MARC21/slim">'
        for i, recid in enumerate(modified_records):
            with run_ro_on_slave_db():
                record = format_record(recid, 'xme', user_info=ADMIN_USER_INFO)[0]
            if not record:
                write_message("Error formatting record {0} as 'xme': {1}".format(
                    recid, record))
            else:
                print >> r, record
            if shall_sleep(recid, i, tot, time_estimator):
                # Push buffered data out before the task may be suspended.
                r.flush()
                task_sleep_now_if_required()
        print >> r, '</collection>'
    finally:
        r.close()
    prodsync_tarname = CFG_OUTPUT_PATH + '.tar'
    write_message("Adding %s to %s" % (prodsyncname, prodsync_tarname))
    prodsync_tar = tarfile.open(prodsync_tarname, 'a')
    try:
        prodsync_tar.add(prodsyncname)
    finally:
        prodsync_tar.close()
    # The dump is only removed once it has been added to the archive.
    os.remove(prodsyncname)
 def test_empty_formatting(self):
     """bibformat - formatting empty record"""
     # Even at the highest verbosity, the empty-record fixture must
     # render to an empty string in the 'hb' format.
     formatted = bibformat_engine.format_record(
         recID=0, of='hb', verbose=9, xml_record=self.empty_record_xml)
     self.assertEqual(formatted, '')
 def test_format_translations_no_2nd_pass_en(self):
     """bibformat - English translations without a second pass"""
     expected = 'Title en\n<input type="button" value="Record"/>'
     call_args = dict(recID=None,
                      of="test7",
                      xml_record=self.xml_text_2,
                      ln='en')
     rendered, second_pass_needed = bibformat_engine.format_record(**call_args)
     self.assertEqual(rendered.strip(), expected)
     # Format "test7" is expected to be complete after a single pass.
     self.assertEqual(second_pass_needed, False)
Exemple #10
0
 def test_format_translations_no_2nd_pass_fr(self):
     """bibformat - French translations without a second pass"""
     language = 'fr'
     call_args = dict(recID=None,
                      of="test7",
                      xml_record=self.xml_text_2,
                      ln=language)
     rendered, second_pass_needed = bibformat_engine.format_record(**call_args)
     # The button label is looked up through the same translation
     # function the formatter is expected to use for this language.
     translate = gettext_set_language(language)
     expected = 'Titre fr\n<input type="button" value="%s"/>' % translate('Record')
     self.assertEqual(rendered.strip(), expected)
     self.assertEqual(second_pass_needed, False)
Exemple #11
0
    def test_format_2_passes_manually(self):
        """bibformat - two formatting passes triggered by hand"""
        # First pass: the element tag is left in place and a second
        # pass is requested.
        first_pass, second_pass_needed = bibformat_engine.format_record(
            recID=None,
            of="test6",
            xml_record=self.xml_text_2,
        )
        self.assertEqual(first_pass, "<bfe_test_6 />\n")
        self.assertEqual(second_pass_needed, True)

        # Second pass: feed the first-pass output back in as template.
        second_pass = bibformat_engine.format_record_2nd_pass(
            recID=None, template=first_pass)
        self.assertEqual(second_pass, "helloworld\n")
 def test_format_translations_no_2nd_pass_fr(self):
     """bibformat - French translations without a second pass"""
     language = 'fr'
     rendered, second_pass_needed = bibformat_engine.format_record(
         recID=None, of="test7", xml_record=self.xml_text_2, ln=language)
     # The button label is looked up through the same translation
     # function the formatter is expected to use for this language.
     translate = gettext_set_language(language)
     expected = 'Titre fr\n<input type="button" value="%s"/>' % translate('Record')
     self.assertEqual(rendered.strip(), expected)
     self.assertEqual(second_pass_needed, False)
    def test_format_record(self):
        """bibformat - correct formatting

        Renders two XML fixtures with output format "test3" and checks
        both the text and the second-pass flag that come back.
        """
        # Disabled during migration: output format that has no match.
        # result = bibformat_engine.format_record(recID=None, of="test2", xml_record=self.xml_text_2)
        # self.assertEqual(result.replace("\n", ""), "")

        # Output format that links to an unknown template: once the
        # newlines are stripped nothing may be left.
        rendered, second_pass_needed = bibformat_engine.format_record(
            recID=None,
            of="test3",
            xml_record=self.xml_text_2,
        )
        self.assertEqual(rendered.replace("\n", ""), "")
        self.assertEqual(second_pass_needed, False)

        # Disabled during migration: unknown output format.
        # result = bibformat_engine.format_record(recID=None, of="unkno", xml_record=self.xml_text_3)
        # self.assertEqual(result.replace("\n", ""), "")

        # Default formatting, requested in French.
        rendered, second_pass_needed = bibformat_engine.format_record(
            recID=None,
            ln='fr',
            of="test3",
            xml_record=self.xml_text_3,
        )
        expected = '''<h1>hi</h1> this is my template\ntest<bfe_non_existing_element must disappear/><test_1  non prefixed element must stay as any normal tag/>tfrgarbage\n<br/>test me!&lt;b&gt;ok&lt;/b&gt;a default valueeditor\n<br/>test me!<b>ok</b>a default valueeditor\n<br/>test me!&lt;b&gt;ok&lt;/b&gt;a default valueeditor\n'''
        self.assertEqual(rendered, expected)
        self.assertEqual(second_pass_needed, False)
    def test_format_2_passes_manually(self):
        """bibformat - two formatting passes triggered by hand"""
        # First pass: the element tag is left in place and a second
        # pass is requested.
        first_pass_args = dict(recID=None,
                               of="test6",
                               xml_record=self.xml_text_2)
        first_pass, second_pass_needed = bibformat_engine.format_record(
            **first_pass_args)
        self.assertEqual(first_pass, "<bfe_test_6 />\n")
        self.assertEqual(second_pass_needed, True)

        # Second pass: feed the first-pass output back in as template.
        second_pass = bibformat_engine.format_record_2nd_pass(
            recID=None, template=first_pass)
        self.assertEqual(second_pass, "helloworld\n")
Exemple #15
0
def bst_dump_records():
    """Dump every exported collection as gzipped 'xme' XML plus an MD5 file.

    For each collection in ``CFG_EXPORTED_COLLECTIONS`` the records are
    written to a hidden (dot-prefixed) file under ``CFG_WEBDIR/dumps``,
    its checksum is written next to it, and both files are then renamed to
    their public names.  An HTML index linking to all dumps is built the
    same way: written to a hidden file first and renamed at the very end.
    """
    try:
        os.makedirs(os.path.join(CFG_WEBDIR, 'dumps'))
    except OSError:
        # Best effort (typically: the directory already exists).  A real
        # problem surfaces when the index file is opened just below.
        pass
    html_index = open(os.path.join(CFG_WEBDIR, 'dumps', '.inspire-dump.html'),
                      "w")
    # try/finally so the index handle is released even if a dump fails.
    try:
        print >> html_index, "<html><head><title>INSPIRE Dump</title></head><body><ul>"
        for collection in CFG_EXPORTED_COLLECTIONS:
            task_update_progress(collection)
            print >> html_index, """
<li><a href="%(prefix)s/dumps/%(collection)s-records.xml.gz">%(collection)s</a>
(<a href="%(prefix)s/dumps/%(collection)s-records.xml.gz.md5">MD5</a>): %(date)s</li>""" % {
                'prefix': CFG_SITE_URL,
                'collection': collection,
                'date': time.ctime()
            }
            write_message("Preparing %s-records.xml.gz" % collection)
            output_path = os.path.join(CFG_WEBDIR, 'dumps',
                                       '.%s-records.xml.gz' % collection)
            output = gzip.open(output_path, "w")
            try:
                print >> output, "<collection>"
                reclist = get_collection_reclist(collection)
                tot = len(reclist)
                time_estimator = get_time_estimator(tot)
                for i, recid in enumerate(reclist):
                    with run_ro_on_slave_db():
                        print >> output, format_record(recid, 'xme', user_info={})[0]
                    # Called on every record, not only when progress is
                    # reported below.
                    time_estimation = time_estimator()[1]
                    if (i + 1) % 100 == 0:
                        task_update_progress(
                            "%s %s (%s%%) -> %s" %
                            (collection, recid, (i + 1) * 100 / tot,
                             time.strftime("%Y-%m-%d %H:%M:%S",
                                           time.localtime(time_estimation))))
                        task_sleep_now_if_required()
                print >> output, "</collection>"
            finally:
                output.close()
            write_message("Computing checksum")
            # Close the checksum file explicitly so its content is flushed
            # to disk before the file is renamed.
            md5_file = open(output_path + '.md5', "w")
            try:
                print >> md5_file, calculate_md5(output_path)
            finally:
                md5_file.close()
            os.rename(
                output_path,
                os.path.join(CFG_WEBDIR, 'dumps',
                             '%s-records.xml.gz' % collection))
            os.rename(
                output_path + '.md5',
                os.path.join(CFG_WEBDIR, 'dumps',
                             '%s-records.xml.gz.md5' % collection))
            write_message("DONE")
        print >> html_index, "</ul></body></html>"
    finally:
        html_index.close()
    # Only reached when every collection was dumped successfully.
    os.rename(os.path.join(CFG_WEBDIR, 'dumps', '.inspire-dump.html'),
              os.path.join(CFG_WEBDIR, 'dumps', 'inspire-dump.html'))
Exemple #16
0
def redis_sync(modified_records, time_estimator, tot):
    """Sync to redis.

    Formats each record of ``modified_records`` as 'xme' inside a
    ``run_ro_on_slave_db()`` context and pushes the zlib-compressed result
    onto the 'legacy_records' list of the Redis instance at
    ``CFG_REDIS_HOST_LABS``.  Records with an empty rendering are reported
    through ``write_message`` and not pushed.
    """
    client = redis.StrictRedis.from_url(CFG_REDIS_HOST_LABS)
    for position, recid in enumerate(modified_records):
        with run_ro_on_slave_db():
            formatted = format_record(
                recid, 'xme', user_info=ADMIN_USER_INFO)[0]
        if formatted:
            client.rpush('legacy_records', zlib.compress(formatted))
        else:
            problem = "Error formatting record {0} as 'xme': {1}".format(
                recid, formatted)
            write_message(problem)
        if shall_sleep(recid, position, tot, time_estimator):
            task_sleep_now_if_required()
Exemple #17
0
def redis_sync(modified_records, time_estimator, tot):
    """Sync to redis.

    Formats each record of ``modified_records`` as 'xme' and pushes the
    zlib-compressed result onto the 'legacy_records' list of the Redis
    instance at ``CFG_REDIS_HOST_LABS``.  Records with an empty rendering
    are reported through ``write_message`` and not pushed.
    """
    client = redis.StrictRedis.from_url(CFG_REDIS_HOST_LABS)
    for position, recid in enumerate(modified_records):
        formatted = format_record(
            recid, 'xme', user_info=ADMIN_USER_INFO)[0]
        if formatted:
            client.rpush('legacy_records', zlib.compress(formatted))
        else:
            problem = "Error formatting record {0} as 'xme': {1}".format(
                recid, formatted)
            write_message(problem)
        if shall_sleep(recid, position, tot, time_estimator):
            task_sleep_now_if_required()
    def test_format_translations_with_2nd_pass_en(self):
        """bibformat - English translations resolved in the second pass"""
        # First pass: language block, element tag and translation marker
        # are all still unresolved.
        first_pass, second_pass_needed = bibformat_engine.format_record(
            recID=None, of="test8", xml_record=self.xml_text_2, ln='en')
        expected_first_pass = '<lang>\n  <en>Title en</en>\n  <fr>Titre fr</fr>\n</lang>\n<bfe_test_6 />\n<input type="button" value="_(Record)_"/>'
        self.assertEqual(first_pass.strip(), expected_first_pass)
        self.assertEqual(second_pass_needed, True)

        # Second pass: the unstripped first-pass output is the template.
        second_pass = bibformat_engine.format_record_2nd_pass(
            recID=None, template=first_pass, ln='en')
        expected_second_pass = 'Title en\nhelloworld\n<input type="button" value="Record"/>'
        self.assertEqual(second_pass, expected_second_pass)
Exemple #19
0
    def test_format_translations_with_2nd_pass_en(self):
        """bibformat - English translations resolved in the second pass"""
        # First pass: language block, element tag and translation marker
        # are all still unresolved.
        first_pass_args = dict(recID=None,
                               of="test8",
                               xml_record=self.xml_text_2,
                               ln='en')
        first_pass, second_pass_needed = bibformat_engine.format_record(
            **first_pass_args)
        expected_first_pass = '<lang>\n  <en>Title en</en>\n  <fr>Titre fr</fr>\n</lang>\n<bfe_test_6 />\n<input type="button" value="_(Record)_"/>'
        self.assertEqual(first_pass.strip(), expected_first_pass)
        self.assertEqual(second_pass_needed, True)

        # Second pass: the unstripped first-pass output is the template.
        second_pass = bibformat_engine.format_record_2nd_pass(
            recID=None, template=first_pass, ln='en')
        expected_second_pass = 'Title en\nhelloworld\n<input type="button" value="Record"/>'
        self.assertEqual(second_pass, expected_second_pass)
Exemple #20
0
def bst_dump_records():
    """Dump every exported collection as gzipped 'xme' XML plus an MD5 file.

    For each collection in ``CFG_EXPORTED_COLLECTIONS`` the records are
    written to a hidden (dot-prefixed) file under ``CFG_WEBDIR/dumps``,
    its checksum is written next to it, and both files are then renamed to
    their public names.  An HTML index linking to all dumps is built the
    same way: written to a hidden file first and renamed at the very end.
    """
    try:
        os.makedirs(os.path.join(CFG_WEBDIR, "dumps"))
    except OSError:
        # Best effort (typically: the directory already exists).  A real
        # problem surfaces when the index file is opened just below.
        pass
    html_index = open(os.path.join(CFG_WEBDIR, "dumps", ".inspire-dump.html"), "w")
    # try/finally so the index handle is released even if a dump fails.
    try:
        print >> html_index, "<html><head><title>INSPIRE Dump</title></head><body><ul>"
        for collection in CFG_EXPORTED_COLLECTIONS:
            task_update_progress(collection)
            print >> html_index, """
<li><a href="%(prefix)s/dumps/%(collection)s-records.xml.gz">%(collection)s</a>
(<a href="%(prefix)s/dumps/%(collection)s-records.xml.gz.md5">MD5</a>): %(date)s</li>""" % {
                "prefix": CFG_SITE_URL,
                "collection": collection,
                "date": time.ctime(),
            }
            write_message("Preparing %s-records.xml.gz" % collection)
            output_path = os.path.join(CFG_WEBDIR, "dumps", ".%s-records.xml.gz" % collection)
            output = gzip.open(output_path, "w")
            try:
                print >> output, "<collection>"
                reclist = get_collection_reclist(collection)
                tot = len(reclist)
                time_estimator = get_time_estimator(tot)
                for i, recid in enumerate(reclist):
                    print >> output, format_record(recid, "xme", user_info={})[0]
                    # Called on every record, not only when progress is
                    # reported below.
                    time_estimation = time_estimator()[1]
                    if (i + 1) % 100 == 0:
                        task_update_progress(
                            "%s %s (%s%%) -> %s"
                            % (
                                collection,
                                recid,
                                (i + 1) * 100 / tot,
                                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time_estimation)),
                            )
                        )
                        task_sleep_now_if_required()
                print >> output, "</collection>"
            finally:
                output.close()
            write_message("Computing checksum")
            # Close the checksum file explicitly so its content is flushed
            # to disk before the file is renamed.
            md5_file = open(output_path + ".md5", "w")
            try:
                print >> md5_file, calculate_md5(output_path)
            finally:
                md5_file.close()
            os.rename(output_path, os.path.join(CFG_WEBDIR, "dumps", "%s-records.xml.gz" % collection))
            os.rename(output_path + ".md5", os.path.join(CFG_WEBDIR, "dumps", "%s-records.xml.gz.md5" % collection))
            write_message("DONE")
        print >> html_index, "</ul></body></html>"
    finally:
        html_index.close()
    # Only reached when every collection was dumped successfully.
    os.rename(
        os.path.join(CFG_WEBDIR, "dumps", ".inspire-dump.html"), os.path.join(CFG_WEBDIR, "dumps", "inspire-dump.html")
    )
def _entry_data_preview(data, recformat='hd'):
    """
    Formats the data using format_record
    """
    if recformat == 'hd' or recformat == 'xm':
        try:
            data = format_record(recID=None, of=recformat, xml_record=data)
        except:
            print "This is not a XML string"

    if data is "" or data is None:
        print 'NAI EINAI ADEIO'
        data = 'Could not render data'
    else:
        pass
    return data
    def extract_references(self, req, form):
        """References extraction page

        This page can be used by authors to test their PDFs against our
        references extraction process.  The input is taken from the first
        non-empty form field among 'pdf', 'arxiv', 'url' and 'txt'.
        """
        user_info = collect_user_info(req)

        # Form fields in priority order, each paired with the routine that
        # turns the submitted value into references XML.
        extractors = (
            ('pdf', extract_from_pdf_string),
            ('arxiv', lambda arxiv_id: extract_references_from_url_xml(
                make_arxiv_url(arxiv_id=arxiv_id))),
            ('url', extract_references_from_url_xml),
            ('txt', extract_references_from_string_xml),
        )
        references_xml = None
        for field, extract in extractors:
            if field in form and form[field].value:
                references_xml = extract(form[field].value)
                break

        if references_xml:
            # Something was extracted: show it, hiding the input link.
            body = """
            <style type="text/css">
                #referenceinp_link { display: none; }
            </style>
            """
            body += format_record(0,
                                  'hdref',
                                  xml_record=references_xml.encode('utf-8'),
                                  user_info=user_info)
        else:
            # Nothing uploaded yet: display the submission form.
            body = self.extract_references_template()

        # Render the page (including header, footer)
        return page(title='References Extractor',
                    body=body,
                    uid=user_info['uid'],
                    req=req)
    def extract_references(self, req, form):
        """References extraction page

        This page can be used by authors to test their PDFs against our
        references extraction process.  The input is taken from the first
        non-empty form field among 'pdf', 'arxiv', 'url' and 'txt'.
        """
        user_info = collect_user_info(req)

        # Form fields in priority order, each paired with the routine that
        # turns the submitted value into references XML.
        extractors = (
            ('pdf', extract_from_pdf_string),
            ('arxiv', lambda arxiv_id: extract_references_from_url_xml(
                make_arxiv_url(arxiv_id=arxiv_id))),
            ('url', extract_references_from_url_xml),
            ('txt', extract_references_from_string_xml),
        )
        references_xml = None
        for field, extract in extractors:
            if field in form and form[field].value:
                references_xml = extract(form[field].value)
                break

        if references_xml:
            # Something was extracted: show it, hiding the input link.
            body = """
            <style type="text/css">
                #referenceinp_link { display: none; }
            </style>
            """
            body += format_record(0,
                                  'hdref',
                                  xml_record=references_xml.encode('utf-8'),
                                  user_info=user_info)
        else:
            # Nothing uploaded yet: display the submission form.
            body = self.extract_references_template()

        # Render the page (including header, footer)
        return page(title='References Extractor',
                    body=body,
                    uid=user_info['uid'],
                    req=req)
Exemple #24
0
def format_record(recID,
                  of,
                  ln=CFG_SITE_LANG,
                  verbose=0,
                  search_pattern=None,
                  xml_record=None,
                  user_info=None,
                  on_the_fly=False):
    """
    Format a record in given output format.

    Return a formatted version of the record in the specified
    language, search pattern, and with the specified output format.
    The function will define which format template must be applied.

    The record to be formatted can be specified with its ID (with
    'recID' parameter) or given as XML representation (with
    'xml_record' parameter). If 'xml_record' is specified 'recID' is
    ignored (but should still be given for reference. A dummy recid 0
    or -1 could be used).

    'user_info' allows to grant access to some functionalities on a
    page depending on the user's privileges. The 'user_info' object
    makes sense only in the case of on-the-fly formatting. 'user_info'
    is the same object as the one returned by
    'webuser.collect_user_info(req)'

    If live formatting raises, the exception is registered and the
    record is rendered with the websearch templates instead: the
    detailed template for output format 'hd', the brief one otherwise.

    @param recID: the ID of record to format.
    @type recID: int
    @param of: an output format code (or short identifier for the output format)
    @type of: string
    @param ln: the language to use to format the record
    @type ln: string
    @param verbose: the level of verbosity from 0 to 9 (0: silent,
                                                       5: errors,
                                                       7: errors and warnings, stop if error in format elements
                                                       9: errors and warnings, stop if error (debug mode ))
    @type verbose: int
    @param search_pattern: list of strings representing the user request in web interface
    @type search_pattern: list(string)
    @param xml_record: an xml string representation of the record to format
    @type xml_record: string or None
    @param user_info: the information of the user who will view the formatted page (if applicable)
    @param on_the_fly: if False, try to return an already preformatted version of the record in the database
    @type on_the_fly: boolean
    @return: formatted record
    @rtype: string
    """
    from invenio.search_engine import record_exists
    if search_pattern is None:
        search_pattern = []

    out = ""

    if verbose == 9:
        out += """\n<span class="quicknote">
        Formatting record %i with output format %s.
        </span>""" % (recID, of)
    ############### FIXME: REMOVE WHEN MIGRATION IS DONE ###############
    if CFG_BIBFORMAT_USE_OLD_BIBFORMAT and CFG_PATH_PHP:
        return bibformat_engine.call_old_bibformat(recID,
                                                   of=of,
                                                   on_the_fly=on_the_fly)
    ############################# END ##################################
    if not on_the_fly and \
       (ln == CFG_SITE_LANG or \
        of.lower() == 'xm' or \
        CFG_BIBFORMAT_USE_OLD_BIBFORMAT or \
        (of.lower() in CFG_BIBFORMAT_DISABLE_I18N_FOR_CACHED_FORMATS)) and \
        record_exists(recID) != -1:
        # Try to fetch preformatted record. Only possible for records
        # formatted in CFG_SITE_LANG language (other are never
        # stored), or of='xm' which does not depend on language.
        # Exceptions are made for output formats defined in
        # CFG_BIBFORMAT_DISABLE_I18N_FOR_CACHED_FORMATS, which are
        # always served from the same cache for any language.  Also,
        # do not fetch from DB when record has been deleted: we want
        # to return an "empty" record in that case
        res = bibformat_dblayer.get_preformatted_record(recID, of)
        if res is not None:
            # record 'recID' is formatted in 'of', so return it
            if verbose == 9:
                last_updated = bibformat_dblayer.get_preformatted_record_date(
                    recID, of)
                out += """\n<br/><span class="quicknote">
                Found preformatted output for record %i (cache updated on %s).
                </span><br/>""" % (recID, last_updated)
            if of.lower() == 'xm':
                res = filter_hidden_fields(res, user_info)
            # try to replace language links in pre-cached res, if applicable:
            if ln != CFG_SITE_LANG and \
               of.lower() in CFG_BIBFORMAT_DISABLE_I18N_FOR_CACHED_FORMATS:
                # The following statements try to quickly replace any
                # language arguments in URL links.  Not an exact
                # science, but should work most of the time for most
                # of the formats, with not too many false positives.
                # We don't have time to parse output much here.
                res = res.replace('?ln=' + CFG_SITE_LANG, '?ln=' + ln)
                res = res.replace('&ln=' + CFG_SITE_LANG, '&ln=' + ln)
                res = res.replace('&amp;ln=' + CFG_SITE_LANG, '&amp;ln=' + ln)
            out += res
            return out
        else:
            if verbose == 9:
                out += """\n<br/><span class="quicknote">
                No preformatted output found for record %s.
                </span>""" % recID

    # Live formatting of records in all other cases
    if verbose == 9:
        out += """\n<br/><span class="quicknote">
        Formatting record %i on-the-fly.
        </span>""" % recID

    try:
        out += bibformat_engine.format_record(recID=recID,
                                              of=of,
                                              ln=ln,
                                              verbose=verbose,
                                              search_pattern=search_pattern,
                                              xml_record=xml_record,
                                              user_info=user_info)
        if of.lower() == 'xm':
            out = filter_hidden_fields(out, user_info)
        return out
    except Exception as e:
        register_exception(prefix="An error occured while formatting record %i in %s" % \
                           (recID, of),
                           alert_admin=True)
        #Failsafe execution mode
        import invenio.template
        websearch_templates = invenio.template.load('websearch')
        if verbose == 9:
            out += """\n<br/><span class="quicknote">
            An error occured while formatting record %i. (%s)
            </span>""" % (recID, str(e))
        if of.lower() == 'hd':
            if verbose == 9:
                out += """\n<br/><span class="quicknote">
                Formatting record %i with websearch_templates.tmpl_print_record_detailed.
                </span><br/>""" % recID
            # The detailed fallback applies to every failed 'hd' request.
            # It used to sit inside the verbose check above, so outside
            # debug mode 'hd' silently fell through to the brief template.
            return out + websearch_templates.tmpl_print_record_detailed(
                ln=ln,
                recID=recID,
            )
        if verbose == 9:
            out += """\n<br/><span class="quicknote">
            Formatting record %i with websearch_templates.tmpl_print_record_brief.
            </span><br/>""" % recID
        return out + websearch_templates.tmpl_print_record_brief(
            ln=ln,
            recID=recID,
        )
Exemple #25
0
 def test_format_translations_no_2nd_pass_en(self):
     """bibformat - English translations without a second pass"""
     expected = 'Title en\n<input type="button" value="Record"/>'
     call_args = dict(recID=None,
                      of="test7",
                      xml_record=self.xml_text_2,
                      ln='en')
     rendered, second_pass_needed = bibformat_engine.format_record(**call_args)
     self.assertEqual(rendered.strip(), expected)
     # Format "test7" is expected to be complete after a single pass.
     self.assertEqual(second_pass_needed, False)
Exemple #26
0
def format_record(recID, of, ln=CFG_SITE_LANG, verbose=0, search_pattern=None,
                  xml_record=None, user_info=None, on_the_fly=False):
    """
    Format a record in given output format.

    Return a formatted version of the record in the specified
    language, search pattern, and with the specified output format.
    The function will define which format template must be applied.

    The record to be formatted can be specified with its ID (with
    'recID' parameter) or given as XML representation (with
    'xml_record' parameter). If 'xml_record' is specified 'recID' is
    ignored (but should still be given for reference. A dummy recid 0
    or -1 could be used).

    'user_info' allows to grant access to some functionalities on a
    page depending on the user's priviledges. The 'user_info' object
    makes sense only in the case of on-the-fly formatting. 'user_info'
    is the same object as the one returned by
    'webuser.collect_user_info(req)'

    @param recID: the ID of record to format.
    @type recID: int
    @param of: an output format code (or short identifier for the output format)
    @type of: string
    @param ln: the language to use to format the record
    @type ln: string
    @param verbose: the level of verbosity from 0 to 9 (O: silent,
                                                       5: errors,
                                                       7: errors and warnings, stop if error in format elements
                                                       9: errors and warnings, stop if error (debug mode ))
    @type verbose: int
    @param search_pattern: list of strings representing the user request in web interface
    @type search_pattern: list(string)
    @param xml_record: an xml string represention of the record to format
    @type xml_record: string or None
    @param user_info: the information of the user who will view the formatted page (if applicable)
    @param on_the_fly: if False, try to return an already preformatted version of the record in the database
    @type on_the_fly: boolean
    @return: formatted record
    @rtype: string
    """
    from invenio.search_engine import record_exists
    if search_pattern is None:
        search_pattern = []

    out = ""

    if verbose == 9:
        out += """\n<span class="quicknote">
        Formatting record %i with output format %s.
        </span>""" % (recID, of)
    ############### FIXME: REMOVE WHEN MIGRATION IS DONE ###############
    if CFG_BIBFORMAT_USE_OLD_BIBFORMAT and CFG_PATH_PHP:
        return bibformat_engine.call_old_bibformat(recID, of=of, on_the_fly=on_the_fly)
    ############################# END ##################################
    if not on_the_fly and \
       (ln == CFG_SITE_LANG or \
        of.lower() == 'xm' or \
        CFG_BIBFORMAT_USE_OLD_BIBFORMAT or \
        (of.lower() in CFG_BIBFORMAT_DISABLE_I18N_FOR_CACHED_FORMATS)) and \
        record_exists(recID) != -1:
        # Try to fetch preformatted record. Only possible for records
        # formatted in CFG_SITE_LANG language (other are never
        # stored), or of='xm' which does not depend on language.
        # Exceptions are made for output formats defined in
        # CFG_BIBFORMAT_DISABLE_I18N_FOR_CACHED_FORMATS, which are
        # always served from the same cache for any language.  Also,
        # do not fetch from DB when record has been deleted: we want
        # to return an "empty" record in that case
        res = bibformat_dblayer.get_preformatted_record(recID, of)
        if res is not None:
            # record 'recID' is formatted in 'of', so return it
            if verbose == 9:
                last_updated = bibformat_dblayer.get_preformatted_record_date(recID, of)
                out += """\n<br/><span class="quicknote">
                Found preformatted output for record %i (cache updated on %s).
                </span><br/>""" % (recID, last_updated)
            if of.lower() == 'xm':
                res = filter_hidden_fields(res, user_info)
            # try to replace language links in pre-cached res, if applicable:
            if ln != CFG_SITE_LANG and of.lower() in CFG_BIBFORMAT_DISABLE_I18N_FOR_CACHED_FORMATS:
                # The following statements try to quickly replace any
                # language arguments in URL links.  Not an exact
                # science, but should work most of the time for most
                # of the formats, with not too many false positives.
                # We don't have time to parse output much here.
                res = res.replace('?ln=' + CFG_SITE_LANG, '?ln=' + ln)
                res = res.replace('&ln=' + CFG_SITE_LANG, '&ln=' + ln)
                res = res.replace('&amp;ln=' + CFG_SITE_LANG, '&amp;ln=' + ln)
            out += res
            return out
        else:
            if verbose == 9:
                out += """\n<br/><span class="quicknote">
                No preformatted output found for record %s.
                </span>"""% recID


    # Live formatting of records in all other cases
    if verbose == 9:
        out += """\n<br/><span class="quicknote">
        Formatting record %i on-the-fly.
        </span>""" % recID

    try:
        out += bibformat_engine.format_record(recID=recID,
                                              of=of,
                                              ln=ln,
                                              verbose=verbose,
                                              search_pattern=search_pattern,
                                              xml_record=xml_record,
                                              user_info=user_info)
        if of.lower() == 'xm':
            out = filter_hidden_fields(out, user_info)
        return out
    except Exception, e:
        register_exception(prefix="An error occured while formatting record %i in %s" % \
                           (recID, of),
                           alert_admin=True)
        #Failsafe execution mode
        import invenio.template
        websearch_templates = invenio.template.load('websearch')
        if verbose == 9:
            out += """\n<br/><span class="quicknote">
            An error occured while formatting record %i. (%s)
            </span>""" % (recID, str(e))
        if of.lower() == 'hd':
            if verbose == 9:
                out += """\n<br/><span class="quicknote">
                Formatting record %i with websearch_templates.tmpl_print_record_detailed.
                </span><br/>""" % recID
                return out + websearch_templates.tmpl_print_record_detailed(
                    ln = ln,
                    recID = recID,
                    )
        if verbose == 9:
            out += """\n<br/><span class="quicknote">
            Formatting record %i with websearch_templates.tmpl_print_record_brief.
            </span><br/>""" % recID
        return out + websearch_templates.tmpl_print_record_brief(ln = ln,
                                                                 recID = recID,
                                                                 )
    def extract(self, req, form):
        """References extraction page

        This page can be used for authors to test their pdfs against our
        references extraction process.

        The first non-empty form field among 'pdf', 'arxiv', 'url' and
        'txt' selects the input. For 'pdf' and 'arxiv' the plot
        extraction is run as well and the first image of each extracted
        record is published under the web plots directory.

        @param req: the request object, passed to collect_user_info() and page()
        @param form: the submitted form fields
        @return: the rendered page, as returned by page()
        """
        user_info = collect_user_info(req)
        plots = None
        # (image file name, caption or None) for each plot published
        # during this request
        published_plots = []
        plots_dir = os.path.join(CFG_PREFIX, "var/www/img/plots/")

        # Handle the POST parameters
        if 'pdf' in form and form['pdf'].value:
            pdf = form['pdf'].value
            references_xml = extract_from_pdf_string(pdf)

            pdf_string = form['pdf'].file.read()
            pdf = safe_mkstemp('extract.pdf')
            # PDF content is binary: write it untranslated, and close
            # the file even if the write fails.
            f = open(pdf, 'wb')
            try:
                f.write(pdf_string)
            finally:
                f.close()

            plots = 'File pdf: ' + str(pdf) + '<br />'
            run_shell_command(CFG_PDFPLOTEXTRACTOR_PATH + ' ' + pdf)
            plotextracted_pdf_path = pdf + ".extracted/extracted.json"

            code, figures, extracted = merging_articles(None, plotextracted_pdf_path)
            id_fulltext = ""
            marc_path = create_MARCXML(figures, id_fulltext, code, extracted, write_file=True)
            plots += marc_path + '<br />'

            published_plots = self._publish_first_plots(marc_path, plots_dir)

        elif 'arxiv' in form and form['arxiv'].value:
            plots = ""
            url_pdf = make_arxiv_url(arxiv_id=form['arxiv'].value)
            references_xml = extract_references_from_url_xml(url_pdf)
            url_tarball = make_arxiv_tar_url(arxiv_id=form['arxiv'].value)

            plotextracted_xml_path, plotextracted_pdf_path = extract_plots_from_latex_and_pdf(url_tarball, url_pdf)
            plots += 'TAR: ' + plotextracted_xml_path + '<br />'
            plots += 'PDF: ' + plotextracted_pdf_path + '<br />'

            dest_dir = os.path.join(CFG_TMPDIR, 'textmining')
            try:
                os.mkdir(dest_dir)
            except OSError:
                # Best effort: the directory usually exists already.
                pass
            code, message, figures, marc_path = merging_latex_pdf(plotextracted_xml_path, "", "", dest_dir)

            plots += 'OUTPUT: ' + marc_path + '<br />'

            published_plots = self._publish_first_plots(marc_path, plots_dir)

        elif 'url' in form and form['url'].value:
            url = form['url'].value
            references_xml = extract_references_from_url_xml(url)
            # NOTE(review): "ME3" looks like a debugging leftover; it is
            # printed under the "Plots" heading -- confirm and remove.
            plots = "ME3"
        elif 'txt' in form and form['txt'].value:
            txt = form['txt'].value
            references_xml = extract_references_from_string_xml(txt)
        else:
            references_xml = None

        # If we have not uploaded anything yet
        # Display the form that allows us to do so
        if not references_xml:
            out = self.extract_references_template()
        else:
            out = """
            <style type="text/css">
                #referenceinp_link { display: none; }
                /*img.plot { width: 250px; height: 250px; }*/
            </style>
            """
            out += format_record(0,
                                'hdref',
                                xml_record=references_xml.encode('utf-8'),
                                user_info=user_info)
            if plots:
                out += "<h2>Plots</h2>"
                out += plots
                # Only list what this request published: scanning the
                # shared directory would pick up stale files from other
                # requests, for which no caption is known.
                for i, (fname, caption) in enumerate(published_plots):
                    out += '<h3>Figure ' + str(i+1) + '</h3> <p><img src="/img/plots/' + fname + '" class="plot"></p>'
                    if caption is not None:
                        out += '<p>' + caption + '</p>'

        # Render the page (including header, footer)
        return page(title='Document Extractor',
                    body=out,
                    uid=user_info['uid'],
                    req=req)

    def _publish_first_plots(self, marc_path, plots_dir):
        """Copy the first image of each extracted record into plots_dir.

        The directory is emptied first, so it only ever holds the plots
        of the latest extraction.

        @param marc_path: path of the MARCXML file describing the plots
        @param plots_dir: directory in which the images are published
        @return: list of (image file name, caption or None) tuples, one
            per image that was successfully copied
        """
        f = open(marc_path, 'r')
        try:
            record_xml = f.read()
        finally:
            f.close()

        if os.path.exists(plots_dir):
            shutil.rmtree(plots_dir)
        os.mkdir(plots_dir)

        published = []
        for record in REGEXP_RECORD.findall(record_xml):
            image_paths = REGEXP_SUBFIELD_A.findall(record)
            if not image_paths:
                continue
            image_path = image_paths[0]
            try:
                # Copy without going through a shell: the path comes
                # from extracted document content and must not be
                # interpreted as part of a command line.
                shutil.copy(image_path, plots_dir)
            except (IOError, OSError):
                # Best effort, like the previous 'cp': an image that
                # cannot be copied is simply not displayed.
                continue
            captions = REGEXP_SUBFIELD_D.findall(record)
            if captions:
                caption = captions[0]
            else:
                caption = None
            published.append((os.path.split(image_path)[1], caption))
        return published
Exemple #28
0
def format_record(recID, of, ln=CFG_SITE_LANG, verbose=0, search_pattern=None,
                  xml_record=None, user_info=None, on_the_fly=False):
    """
    Formats a record given output format.

    Returns a formatted version of the record in the specified
    language, search pattern, and with the specified output format.
    The function will define which format template must be applied.

    The record to be formatted can be specified with its ID (with
    'recID' parameter) or given as XML representation(with
    'xml_record' parameter). If both are specified 'recID' is ignored.

    'user_info' allows to grant access to some functionalities on a
    page depending on the user's priviledges. The 'user_info' object
    makes sense only in the case of on-the-fly formatting. 'user_info'
    is the same object as the one returned by
    'webuser.collect_user_info(req)'

    @param recID: the ID of record to format
    @param of: an output format code (or short identifier for the output format)
    @param ln: the language to use to format the record
    @param verbose: the level of verbosity from 0 to 9 (O: silent,
                                                       5: errors,
                                                       7: errors and warnings, stop if error in format elements
                                                       9: errors and warnings, stop if error (debug mode ))
    @param search_pattern: list of strings representing the user request in web interface
    @param xml_record: an xml string represention of the record to format
    @param user_info: the information of the user who will view the formatted page (if applicable)
    @param on_the_fly: if False, try to return an already preformatted version of the record in the database
    @return: formatted record
    """
    from invenio.search_engine import record_exists
    if search_pattern is None:
        search_pattern = []

    out = ""

    if verbose == 9:
        out += """\n<span class="quicknote">
        Formatting record %i with output format %s.
        </span>""" % (recID, of)
    ############### FIXME: REMOVE WHEN MIGRATION IS DONE ###############
    if CFG_BIBFORMAT_USE_OLD_BIBFORMAT and CFG_PATH_PHP:
        return bibformat_engine.call_old_bibformat(recID, format=of, on_the_fly=on_the_fly)
    ############################# END ##################################
    if not on_the_fly and \
       (ln == CFG_SITE_LANG or \
        of.lower() == 'xm' or \
        CFG_BIBFORMAT_USE_OLD_BIBFORMAT or \
        (CFG_BIBFORMAT_ENABLE_I18N_BRIEF_FORMAT == False and of.lower() == 'hb')) and \
        record_exists(recID) != -1:
        # Try to fetch preformatted record Only possible for records
        # formatted in CFG_SITE_LANG language (other are never
        # stored), or of='xm' which does not depend on language.
        # Also, when formatting in HB, and when
        # CFG_BIBFORMAT_ENABLE_I18N_BRIEF_FORMAT is set to False,
        # ignore other languages and fetch the preformatted output.
        # Also, do not fetch from DB when record has been deleted: we
        # want to return an "empty" record in that case
        res = bibformat_dblayer.get_preformatted_record(recID, of)
        if res is not None:
            # record 'recID' is formatted in 'of', so return it
            if verbose == 9:
                last_updated = bibformat_dblayer.get_preformatted_record_date(recID, of)
                out += """\n<br/><span class="quicknote">
                Found preformatted output for record %i (cache updated on %s).
                </span><br/>""" % (recID, last_updated)
            if of.lower() == 'xm':
                res = filter_hidden_fields(res, user_info)
            out += res
            return out
        else:
            if verbose == 9:
                out += """\n<br/><span class="quicknote">
                No preformatted output found for record %s.
                </span>"""% recID


    # Live formatting of records in all other cases
    if verbose == 9:
        out += """\n<br/><span class="quicknote">
        Formatting record %i on-the-fly.
        </span>""" % recID

    try:
        out += bibformat_engine.format_record(recID=recID,
                                              of=of,
                                              ln=ln,
                                              verbose=verbose,
                                              search_pattern=search_pattern,
                                              xml_record=xml_record,
                                              user_info=user_info)
        if of.lower() == 'xm':
            out = filter_hidden_fields(out, user_info)
        return out
    except Exception, e:
        register_exception(prefix="An error occured while formatting record %i in %s" % \
                           (recID, of),
                           alert_admin=True)
        #Failsafe execution mode
        import invenio.template
        websearch_templates = invenio.template.load('websearch')
        if verbose == 9:
            out += """\n<br/><span class="quicknote">
            An error occured while formatting record %i. (%s)
            </span>""" % (recID, str(e))
        if of.lower() == 'hd':
            if verbose == 9:
                out += """\n<br/><span class="quicknote">
                Formatting record %i with websearch_templates.tmpl_print_record_detailed.
                </span><br/>""" % recID
                return out + websearch_templates.tmpl_print_record_detailed(
                    ln = ln,
                    recID = recID,
                    )
        if verbose == 9:
            out += """\n<br/><span class="quicknote">
            Formatting record %i with websearch_templates.tmpl_print_record_brief.
            </span><br/>""" % recID
        return out + websearch_templates.tmpl_print_record_brief(ln = ln,
                                                                 recID = recID,
                                                                 )
Exemple #29
0
def format_records(recIDs,
                   of,
                   ln=CFG_SITE_LANG,
                   verbose=0,
                   search_pattern=None,
                   xml_records=None,
                   user_info=None,
                   record_prefix=None,
                   record_separator=None,
                   record_suffix=None,
                   prologue="",
                   epilogue="",
                   req=None,
                   on_the_fly=False):
    """
    Format records given by a list of record IDs or a list of records
    as xml.  Adds a prefix before each record, a suffix after each
    record, plus a separator between records.

    Also add optional prologue and epilogue to the complete formatted
    list.

    You can either specify a list of record IDs to format, or a list
    of xml records, but not both (if both are specified recIDs is
    ignored).

    'record_separator' is a function that returns a string as
    separator between records.  The function must take an integer as
    unique parameter, which is the index in recIDs (or xml_records) of
    the record that has just been formatted. For example separator(i)
    must return the separator between recID[i] and recID[i+1].
    Alternatively separator can be a single string (byte string or
    unicode), which will be used to separate all formatted records.
    The same applies to 'record_prefix' and 'record_suffix'.

    'req' is an optional parameter on which the result of the function
    are printed lively (prints records after records) if it is given.
    Note that you should set 'req' content-type by yourself, and send
    http header before calling this function as it will not do it.

    This function takes the same parameters as 'format_record' except for:
    @param recIDs: a list of record IDs
    @type recIDs: list(int)
    @param of: an output format code (or short identifier for the output format)
    @type of: string
    @param ln: the language to use to format the record
    @type ln: string
    @param verbose: the level of verbosity from 0 to 9 (0: silent,
                                                        5: errors,
                                                        7: errors and warnings, stop if error in format elements
                                                        9: errors and warnings, stop if error (debug mode ))
    @type verbose: int
    @param search_pattern: list of strings representing the user request in web interface
    @type search_pattern: list(string)
    @param user_info: the information of the user who will view the formatted page (if applicable)
    @param xml_records: a list of xml string representations of the records to format
    @type xml_records: list(string)
    @param record_prefix: a string (or function returning one) printed before B{each} formatted records (n times)
    @type record_prefix: string or function
    @param record_suffix: a string (or function returning one) printed after B{each} formatted records (n times)
    @type record_suffix: string or function
    @param prologue: a string printed at the beginning of the complete formatted records (1x)
    @type prologue: string
    @param epilogue: a string printed at the end of the complete formatted output (1x)
    @type epilogue: string
    @param record_separator: either a string or a function that returns string to join formatted records
    @type record_separator: string or function
    @param req: an optional request object where to print records
    @param on_the_fly: if False, try to return an already preformatted version of the record in the database
    @type on_the_fly: boolean
    @return: prologue, formatted records (with prefixes, suffixes and separators) and epilogue, concatenated
    @rtype: string
    """
    # Chunks are accumulated in a list and joined once at the end,
    # instead of growing a string record after record.
    chunks = []

    def emit(text):
        """Record 'text' for the returned value and stream it to 'req'."""
        chunks.append(text)
        if req is not None:
            req.write(text)

    def resolve(decoration, index):
        """Return the text of a prefix/suffix/separator for record 'index'.

        Anything callable is called with the index; anything else is
        used as is, so unicode strings work as well as byte strings.
        """
        if callable(decoration):
            return decoration(index)
        return decoration

    if req is not None:
        req.write(prologue)

    #Fill one of the lists with Nones
    if xml_records is not None:
        recIDs = [None for dummy in xml_records]
    else:
        xml_records = [None for dummy in recIDs]

    last_index = len(recIDs) - 1
    for i, (recID, xml_record) in enumerate(zip(recIDs, xml_records)):
        #Print prefix
        if record_prefix is not None:
            emit(resolve(record_prefix, i))

        #Print formatted record
        emit(format_record(recID, of, ln, verbose,
                           search_pattern, xml_record,
                           user_info, on_the_fly))

        #Print suffix
        if record_suffix is not None:
            emit(resolve(record_suffix, i))

        #Print separator if needed (never after the last record)
        if record_separator is not None and i != last_index:
            emit(resolve(record_separator, i))

    if req is not None:
        req.write(epilogue)

    return prologue + ''.join(chunks) + epilogue