Example #1
   def add_url(self, url, lastmod=datetime(1900, 1, 1), changefreq="", priority="", alternate=False):
       """ create a new url node. Returns the number of url nodes in sitemap"""
       self.num_urls += 1
       canonical_url, alternate_urls = get_canonical_and_alternates_urls(url, drop_ln=not alternate)
       url_node = u"""
 <url>
   <loc>%s</loc>%s
 </url>"""
       optional = ''
       if lastmod:
           optional += u"""
   <lastmod>%s</lastmod>""" % lastmod.strftime('%Y-%m-%dT%H:%M:%S' + \
                                               DEFAULT_TIMEZONE)
       if changefreq:
           optional += u"""
   <changefreq>%s</changefreq>""" % changefreq
       if priority:
           optional += u"""
   <priority>%s</priority>""" % priority
       if alternate:
           for ln, alternate_url in iteritems(alternate_urls):
               ln = ln.replace('_', '-') ## zh_CN -> zh-CN
               optional += u"""
   <xhtml:link rel="alternate" hreflang="%s" href="%s" />""" % (ln, encode_for_xml(alternate_url, quote=True))
       url_node %= (encode_for_xml(canonical_url), optional)
       self.file_size += len(url_node)
       self.filedescriptor.write(url_node)
       return self.num_urls
Example #2
   def add_url(self,
               url,
               lastmod=datetime(1900, 1, 1),
               changefreq="",
               priority="",
               alternate=False):
       """ create a new url node. Returns the number of url nodes in sitemap"""
       self.num_urls += 1
       canonical_url, alternate_urls = get_canonical_and_alternates_urls(
           url, drop_ln=not alternate)
       url_node = u"""
 <url>
   <loc>%s</loc>%s
 </url>"""
       optional = ''
       if lastmod:
           optional += u"""
   <lastmod>%s</lastmod>""" % lastmod.strftime('%Y-%m-%dT%H:%M:%S' + \
                                               DEFAULT_TIMEZONE)
       if changefreq:
           optional += u"""
   <changefreq>%s</changefreq>""" % changefreq
       if priority:
           optional += u"""
   <priority>%s</priority>""" % priority
       if alternate:
           for ln, alternate_url in iteritems(alternate_urls):
               ln = ln.replace('_', '-')  ## zh_CN -> zh-CN
               optional += u"""
   <xhtml:link rel="alternate" hreflang="%s" href="%s" />""" % (
                   ln, encode_for_xml(alternate_url, quote=True))
       url_node %= (encode_for_xml(canonical_url), optional)
       self.file_size += len(url_node)
       self.filedescriptor.write(url_node)
       return self.num_urls
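Both variants above fill in the same url_node template. A rough usage sketch follows (the writer object, URL and values are invented for illustration, and it assumes encode_for_xml escapes at least '&' and '<'):

# Hypothetical call against a sitemap writer exposing the add_url method above.
from datetime import datetime

writer.add_url('https://example.org/record/1?ln=en&of=hd',
               lastmod=datetime(2024, 1, 31),
               changefreq='weekly',
               priority='0.8')
# Appends roughly the following node to the open sitemap file, with the '&'
# in the query string escaped by encode_for_xml:
#   <url>
#     <loc>https://example.org/record/1?ln=en&amp;of=hd</loc>
#     <lastmod>2024-01-31T00:00:00...</lastmod>
#     <changefreq>weekly</changefreq>
#     <priority>0.8</priority>
#   </url>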
Example #3
def _bibconvert_escape(dummy_ctx, value):
    """Bridge to lxml to escape the provided value."""
    try:
        if isinstance(value, str):
            string_value = value
        elif isinstance(value, (int, long)):
            string_value = str(value)
        elif isinstance(value, list):
            value = value[0]
            if isinstance(value, str):
                string_value = value
            elif isinstance(value, (int, long)):
                string_value = str(value)
            else:
                string_value = value.text
        else:
            string_value = value.text

        return encode_for_xml(string_value)

    except Exception as err:
        print("Error during formatting function evaluation: {0}".format(err),
              file=sys.stderr)

    return ''
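The (dummy_ctx, value) signature is what lxml passes to registered XPath extension functions. A minimal, hypothetical registration sketch (the namespace URI and function name are invented; Invenio's XSLT bridge does the real wiring elsewhere):

from lxml import etree

# Register the escape helper under a made-up extension namespace.
ns = etree.FunctionNamespace('http://example.org/bibconvert')
ns['escape'] = _bibconvert_escape
# A stylesheet declaring xmlns:fn="http://example.org/bibconvert" can then
# call fn:escape(...) and receive the XML-escaped text of the selected value.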
Example #4
 def __new__(cls, original_string='', escape_quotes=False):
     if isinstance(original_string, EscapedString):
         escaped_string = str(original_string)
     else:
         if original_string and not str(original_string).strip():
             escaped_string = '&nbsp;'
         else:
             escaped_string = encode_for_xml(str(original_string), wash=True, quote=escape_quotes)
     obj = str.__new__(cls, escaped_string)
     obj.original_string = original_string
     obj.escape_quotes = escape_quotes
     return obj
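A rough sketch of how the constructor behaves (values invented; it assumes encode_for_xml escapes at least '&' and '<', and quotes only when escape_quotes=True):

s = EscapedString('<b>AT&T</b>', escape_quotes=True)
# str(s) is the escaped form: '&' becomes '&amp;', '<' becomes '&lt;', etc.
# s.original_string is still '<b>AT&T</b>' and s.escape_quotes is True.

blank = EscapedString('   ')
# A non-empty, whitespace-only input is replaced by '&nbsp;'.

again = EscapedString(s)
# Passing an EscapedString back in does not escape it a second time.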
Example #5
def create_ill_record(book_info):
    """
    Create a new ILL record

    @param book_info: book's information
    @type book_info: tuple

    @return MARC record
    """

    (title, author, place, publisher, year, edition, isbn) = book_info

    ill_record = """
        <record>
            <datafield tag="020" ind1=" " ind2=" ">
                <subfield code="a">%(isbn)s</subfield>
            </datafield>
            <datafield tag="100" ind1=" " ind2=" ">
                <subfield code="a">%(author)s</subfield>
            </datafield>
            <datafield tag="245" ind1=" " ind2=" ">
                <subfield code="a">%(title)s</subfield>
            </datafield>
            <datafield tag="250" ind1=" " ind2=" ">
                <subfield code="a">%(edition)s</subfield>
            </datafield>
            <datafield tag="260" ind1=" " ind2=" ">
                <subfield code="a">%(place)s</subfield>
                <subfield code="b">%(publisher)s</subfield>
                <subfield code="c">%(year)s</subfield>
            </datafield>
            <datafield tag="980" ind1=" " ind2=" ">
                <subfield code="a">ILLBOOK</subfield>
            </datafield>
        </record>
  """ % {'isbn':      encode_for_xml(isbn),
         'author':    encode_for_xml(author),
         'title':     encode_for_xml(title),
         'edition':   encode_for_xml(edition),
         'place':     encode_for_xml(place),
         'publisher': encode_for_xml(publisher),
         'year':      encode_for_xml(year)}

    file_path = '%s/%s_%s.xml' % (CFG_TMPDIR, 'bibcirculation_ill_book',
                                  time.strftime("%Y%m%d_%H%M%S"))

    xml_file = open(file_path, 'w')
    xml_file.write(ill_record)
    xml_file.close()

    # Pass XML file to BibUpload.
    task_low_level_submission('bibupload', 'bibcirculation',
                              '-P', '5', '-i', file_path)

    return ill_record
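A hedged example of calling it (the bibliographic values are invented; the tuple order is exactly what the function unpacks):

book_info = ('An Example Title', 'Doe, Jane', 'Geneva', 'Example Press',
             '2004', '2nd ed.', '9780000000000')
marcxml = create_ill_record(book_info)
# Each value is passed through encode_for_xml, so a title such as
# 'Dungeons & Dragons' is stored as 'Dungeons &amp; Dragons'; the record is
# also written under CFG_TMPDIR and queued for BibUpload.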
Example #6
 def __new__(cls, original_string='', escape_quotes=False):
     if isinstance(original_string, EscapedString):
         escaped_string = str(original_string)
     else:
         if original_string and not str(original_string).strip():
             escaped_string = '&nbsp;'
         else:
             escaped_string = encode_for_xml(str(original_string),
                                             wash=True,
                                             quote=escape_quotes)
     obj = str.__new__(cls, escaped_string)
     obj.original_string = original_string
     obj.escape_quotes = escape_quotes
     return obj
Example #7
def format_element(bfo, type='xm', encodeForXML='yes'):
    """Print the complete current record as XML.

    :param type: the type of xml. Can be 'xml', 'oai_dc', 'marcxml', 'xd'
    :param encodeForXML: if 'yes', replace all < > and & with their
        corresponding HTML-escaped characters.
    """
    assert type == 'xm'
    from invenio.modules.records.api import get_record
    from invenio.utils.text import encode_for_xml

    out = get_record(bfo.recID).legacy_export_as_marc()

    if encodeForXML.lower() == 'yes':
        return encode_for_xml(out)
    else:
        return out
Example #8
def format_element(bfo, type='xml', encodeForXML='yes'):
    """
    Prints the complete current record as XML.

    @param type: the type of xml. Can be 'xml', 'oai_dc', 'marcxml', 'xd'
    @param encodeForXML: if 'yes', replace all < > and & with their corresponding HTML-escaped characters.
    """
    from invenio.modules.formatter.utils import record_get_xml
    from invenio.utils.text import encode_for_xml
    # Can be used to output various xml flavours.

    out = record_get_xml(bfo.recID, format=type, on_the_fly=True)

    if encodeForXML.lower() == 'yes':
        return encode_for_xml(out)
    else:
        return out
Example #9
def _output_marc(output_complete,
                 categories,
                 kw_field=bconfig.CFG_MAIN_FIELD,
                 auth_field=bconfig.CFG_AUTH_FIELD,
                 acro_field=bconfig.CFG_ACRON_FIELD,
                 provenience='BibClassify'):
    """Output the keywords in the MARCXML format.

    :var skw_matches: list of single keywords
    :var ckw_matches: list of composite keywords
    :var author_keywords: dictionary of extracted author keywords
    :var acronyms: dictionary of acronyms
    :var spires: boolean, True=generate spires output - BUT NOTE: it is
            here only not to break compatibility, in fact spires output
            should never be used for xml because if we read marc back
            into the KeywordToken objects, we would not find them
    :keyword provenience: string that identifies source (authority) that
        assigned the contents of the field
    :return: string, formatted MARC"""

    kw_template = ('<datafield tag="%s" ind1="%s" ind2="%s">\n'
                   '    <subfield code="2">%s</subfield>\n'
                   '    <subfield code="a">%s</subfield>\n'
                   '    <subfield code="n">%s</subfield>\n'
                   '    <subfield code="9">%s</subfield>\n'
                   '</datafield>\n')

    output = []

    tag, ind1, ind2 = _parse_marc_code(kw_field)
    for keywords in (output_complete["Single keywords"],
                     output_complete["Core keywords"]):
        for kw in keywords:
            output.append(kw_template %
                          (tag, ind1, ind2, encode_for_xml(provenience),
                           encode_for_xml(kw), keywords[kw],
                           encode_for_xml(categories[kw])))

    for field, keywords in ((auth_field, output_complete["Author keywords"]),
                            (acro_field, output_complete["Acronyms"])):
        if keywords and len(
                keywords) and field:  # field='' we shall not save the keywords
            tag, ind1, ind2 = _parse_marc_code(field)
            for kw, info in keywords.items():
                output.append(
                    kw_template %
                    (tag, ind1, ind2, encode_for_xml(provenience),
                     encode_for_xml(kw), '', encode_for_xml(categories[kw])))

    return "".join(output)
Example #10
def _output_marc(output_complete, categories,
                 kw_field=cfg["CLASSIFIER_RECORD_KEYWORD_FIELD"],
                 auth_field=cfg["CLASSIFIER_RECORD_KEYWORD_AUTHOR_FIELD"],
                 acro_field=cfg["CLASSIFIER_RECORD_KEYWORD_ACRONYM_FIELD"],
                 provenience='Classifier'):
    """Output the keywords in the MARCXML format.

    :var skw_matches: list of single keywords
    :var ckw_matches: list of composite keywords
    :var author_keywords: dictionary of extracted author keywords
    :var acronyms: dictionary of acronyms
    :var spires: boolean, True=generate spires output - BUT NOTE: it is
            here only not to break compatibility, in fact spires output
            should never be used for xml because if we read marc back
            into the KeywordToken objects, we would not find them
    :keyword provenience: string that identifies source (authority) that
        assigned the contents of the field
    :return: string, formatted MARC
    """
    kw_template = ('<datafield tag="%s" ind1="%s" ind2="%s">\n'
                   '    <subfield code="2">%s</subfield>\n'
                   '    <subfield code="a">%s</subfield>\n'
                   '    <subfield code="n">%s</subfield>\n'
                   '    <subfield code="9">%s</subfield>\n'
                   '</datafield>\n')

    output = []

    tag, ind1, ind2 = _parse_marc_code(kw_field)
    for keywords in (output_complete["Single keywords"],
                     output_complete["Core keywords"]):
        for kw in keywords:
            output.append(kw_template % (tag, ind1, ind2,
                                         encode_for_xml(provenience),
                                         encode_for_xml(kw), keywords[kw],
                                         encode_for_xml(categories[kw])))

    for field, keywords in ((auth_field, output_complete["Author keywords"]),
                            (acro_field, output_complete["Acronyms"])):
        # field='' we shall not save the keywords
        if keywords and len(keywords) and field:
            tag, ind1, ind2 = _parse_marc_code(field)
            for kw, info in keywords.items():
                output.append(kw_template % (tag, ind1, ind2,
                                             encode_for_xml(provenience),
                                             encode_for_xml(kw), '',
                                             encode_for_xml(categories[kw])))

    return "".join(output)
Example #11
def bibconvert_escape_libxslt(dummy_ctx, value):
    """
    Bridge to libxslt to escape the provided value.
    """
    try:
        if isinstance(value, str):
            string_value = value
        elif isinstance(value, (int, long)):
            string_value = str(value)
        else:
            string_value = libxml2.xmlNode(_obj=value[0]).serialize('utf8')

        return encode_for_xml(string_value)

    except Exception as err:
        sys.stderr.write("Error during formatting function evaluation: " + \
                         str(err) + \
                         '\n')

    return ''
Example #12
def _output_marc(skw_matches, ckw_matches, author_keywords, acronyms, spires=False,
                 kw_field=bconfig.CFG_MAIN_FIELD, auth_field=bconfig.CFG_AUTH_FIELD,
                 acro_field=bconfig.CFG_ACRON_FIELD, provenience='BibClassify'):
    """Outputs the keywords in the MARCXML format.
    @var skw_matches: list of single keywords
    @var ckw_matches: list of composite keywords
    @var author_keywords: dictionary of extracted author keywords
    @var acronyms: dictionary of acronyms
    @var spires: boolean, True=generate spires output - BUT NOTE: it is
            here only not to break compatibility, in fact spires output
            should never be used for xml because if we read marc back
            into the KeywordToken objects, we would not find them
    @keyword provenience: string that identifies source (authority) that
        assigned the contents of the field
    @return: string, formatted MARC"""

    kw_template = ('<datafield tag="%s" ind1="%s" ind2="%s">\n'
                   '    <subfield code="2">%s</subfield>\n'
                   '    <subfield code="a">%s</subfield>\n'
                   '    <subfield code="n">%s</subfield>\n'
                   '    <subfield code="9">%s</subfield>\n'
                   '</datafield>\n')

    output = []

    tag, ind1, ind2 = _parse_marc_code(kw_field)
    for keywords in (skw_matches, ckw_matches):
        if keywords and len(keywords):
            for kw, info in keywords:
                output.append(kw_template % (tag, ind1, ind2, encode_for_xml(provenience),
                                             encode_for_xml(kw.output(spires)), len(info[0]),
                                             encode_for_xml(kw.getType())))

    for field, keywords in ((auth_field, author_keywords), (acro_field, acronyms)):
        if keywords and len(keywords) and field: # field='' we shall not save the keywords
            tag, ind1, ind2 = _parse_marc_code(field)
            for kw, info in keywords.items():
                output.append(kw_template % (tag, ind1, ind2, encode_for_xml(provenience),
                                             encode_for_xml(kw), '', encode_for_xml(kw.getType())))

    return "".join(output)
Example #13
def record_get_xml(recID, format='xm', decompress=zlib.decompress,
                   on_the_fly=False):
    """
    Returns an XML string of the record given by recID.

    The function builds the XML directly from the database,
    without using the standard formatting process.

    'format' allows one to define the flavour of XML:
        - 'xm' for standard XML
        - 'marcxml' for MARC XML
        - 'oai_dc' for OAI Dublin Core
        - 'xd' for XML Dublin Core

    If the record does not exist, returns an empty string.
    If the record is deleted, returns an empty MARCXML record (with the recid
    controlfield, OAI ID fields and 980__c=DELETED).

    @param recID: the id of the record to retrieve
    @param format: the format to use
    @param on_the_fly: if False, try to fetch precreated one in database
    @param decompress: the library to use to decompress cache from DB
    @return: the xml string of the record
    """
    from invenio.legacy.search_engine import record_exists

    def get_creation_date(recID, fmt="%Y-%m-%d"):
        "Returns the creation date of the record 'recID'."
        out = ""
        res = run_sql("SELECT DATE_FORMAT(creation_date,%s) FROM bibrec WHERE id=%s", (fmt, recID), 1)
        if res:
            out = res[0][0]
        return out

    def get_modification_date(recID, fmt="%Y-%m-%d"):
        "Returns the date of last modification for the record 'recID'."
        out = ""
        res = run_sql("SELECT DATE_FORMAT(modification_date,%s) FROM bibrec WHERE id=%s", (fmt, recID), 1)
        if res:
            out = res[0][0]
        return out

    #_ = gettext_set_language(ln)

    out = ""

    # sanity check:
    record_exist_p = record_exists(recID)
    if record_exist_p == 0: # doesn't exist
        return out

    # print record opening tags, if needed:
    if format == "marcxml" or format == "oai_dc":
        out += "  <record>\n"
        out += "   <header>\n"

        for identifier in get_fieldvalues(recID, CFG_OAI_ID_FIELD):
            out += "    <identifier>%s</identifier>\n" % identifier
        out += "    <datestamp>%s</datestamp>\n" % get_modification_date(recID)
        out += "   </header>\n"
        out += "   <metadata>\n"

    if format.startswith("xm") or format == "marcxml":
        res = None
        if on_the_fly is False:
            # look for cached format existence:
            query = """SELECT value FROM bibfmt WHERE
            id_bibrec='%s' AND format='%s'""" % (recID, format)
            res = run_sql(query, None, 1)
        if res and record_exist_p == 1:
            # record 'recID' is formatted in 'format', so print it
            out += "%s" % decompress(res[0][0])
        else:
            # record 'recID' is not formatted in 'format' -- they are
            # not in "bibfmt" table; so fetch all the data from
            # "bibXXx" tables:
            if format == "marcxml":
                out += """    <record xmlns="http://www.loc.gov/MARC21/slim">\n"""
                out += "        <controlfield tag=\"001\">%d</controlfield>\n" % int(recID)
            elif format.startswith("xm"):
                out += """    <record>\n"""
                out += "        <controlfield tag=\"001\">%d</controlfield>\n" % int(recID)
            if record_exist_p == -1:
                # deleted record, so display only OAI ID and 980:
                oai_ids = get_fieldvalues(recID, CFG_OAI_ID_FIELD)
                if oai_ids:
                    out += "<datafield tag=\"%s\" ind1=\"%s\" ind2=\"%s\"><subfield code=\"%s\">%s</subfield></datafield>\n" % \
                           (CFG_OAI_ID_FIELD[0:3],
                            CFG_OAI_ID_FIELD[3:4],
                            CFG_OAI_ID_FIELD[4:5],
                            CFG_OAI_ID_FIELD[5:6],
                            oai_ids[0])
                out += "<datafield tag=\"980\" ind1=\" \" ind2=\" \"><subfield code=\"c\">DELETED</subfield></datafield>\n"
                from invenio.legacy.search_engine import get_merged_recid
                merged_recid = get_merged_recid(recID)
                if merged_recid: # record was deleted but merged to other record, so display this information:
                    out += "<datafield tag=\"970\" ind1=\" \" ind2=\" \"><subfield code=\"d\">%d</subfield></datafield>\n" % merged_recid
            else:
                # controlfields
                query = "SELECT b.tag,b.value,bb.field_number FROM bib00x AS b, bibrec_bib00x AS bb "\
                        "WHERE bb.id_bibrec='%s' AND b.id=bb.id_bibxxx AND b.tag LIKE '00%%' "\
                        "ORDER BY bb.field_number, b.tag ASC" % recID
                res = run_sql(query)
                for row in res:
                    field, value = row[0], row[1]
                    value = encode_for_xml(value)
                    out += """        <controlfield tag="%s">%s</controlfield>\n""" % \
                           (encode_for_xml(field[0:3]), value)
                # datafields
                i = 1 # Do not process bib00x and bibrec_bib00x, as
                      # they are controlfields. So start at bib01x and
                      # bibrec_bib01x (and set i = 0 at the end of
                      # first loop)
                for digit1 in range(0, 10):
                    for digit2 in range(i, 10):
                        bx = "bib%d%dx" % (digit1, digit2)
                        bibx = "bibrec_bib%d%dx" % (digit1, digit2)
                        query = "SELECT b.tag,b.value,bb.field_number FROM %s AS b, %s AS bb "\
                                "WHERE bb.id_bibrec='%s' AND b.id=bb.id_bibxxx AND b.tag LIKE '%s%%' "\
                                "ORDER BY bb.field_number, b.tag ASC" % (bx,
                                                                         bibx,
                                                                         recID,
                                                                         str(digit1)+str(digit2))
                        res = run_sql(query)
                        field_number_old = -999
                        field_old = ""
                        for row in res:
                            field, value, field_number = row[0], row[1], row[2]
                            ind1, ind2 = field[3], field[4]
                            if ind1 == "_" or ind1 == "":
                                ind1 = " "
                            if ind2 == "_" or ind2 == "":
                                ind2 = " "
                            # print field tag
                            if field_number != field_number_old or \
                                   field[:-1] != field_old[:-1]:
                                if field_number_old != -999:
                                    out += """        </datafield>\n"""
                                out += """        <datafield tag="%s" ind1="%s" ind2="%s">\n""" % \
                                       (encode_for_xml(field[0:3]),
                                        encode_for_xml(ind1),
                                        encode_for_xml(ind2))
                                field_number_old = field_number
                                field_old = field
                            # print subfield value
                            value = encode_for_xml(value)
                            out += """            <subfield code="%s">%s</subfield>\n""" % \
                                   (encode_for_xml(field[-1:]), value)

                        # all fields/subfields printed in this run, so close the tag:
                        if field_number_old != -999:
                            out += """        </datafield>\n"""
                    i = 0 # Following digit1 iterations should start looking at bibX0x and bibrec_bibX0x
            # we are at the end of printing the record:
            out += "    </record>\n"

    elif format == "xd" or format == "oai_dc":
        # XML Dublin Core format, possibly OAI -- select only some bibXXx fields:
        out += """    <dc xmlns="http://purl.org/dc/elements/1.1/"
                         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                         xsi:schemaLocation="http://purl.org/dc/elements/1.1/
                                             http://www.openarchives.org/OAI/1.1/dc.xsd">\n"""
        if record_exist_p == -1:
            out += ""
        else:
            for f in get_fieldvalues(recID, "041__a"):
                out += "        <language>%s</language>\n" % f

            for f in get_fieldvalues(recID, "100__a"):
                out += "        <creator>%s</creator>\n" % encode_for_xml(f)

            for f in get_fieldvalues(recID, "700__a"):
                out += "        <creator>%s</creator>\n" % encode_for_xml(f)

            for f in get_fieldvalues(recID, "245__a"):
                out += "        <title>%s</title>\n" % encode_for_xml(f)

            for f in get_fieldvalues(recID, "65017a"):
                out += "        <subject>%s</subject>\n" % encode_for_xml(f)

            for f in get_fieldvalues(recID, "8564_u"):
                out += "        <identifier>%s</identifier>\n" % encode_for_xml(f)

            for f in get_fieldvalues(recID, "520__a"):
                out += "        <description>%s</description>\n" % encode_for_xml(f)

            out += "        <date>%s</date>\n" % get_creation_date(recID)
        out += "    </dc>\n"


    # print record closing tags, if needed:
    if format == "marcxml" or format == "oai_dc":
        out += "   </metadata>\n"
        out += "  </record>\n"

    return out
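A hedged usage sketch (the record id is arbitrary):

marcxml = record_get_xml(10, format='xm', on_the_fly=True)
# Rebuilds the record from the bibXXx tables, passing every tag, indicator and
# value through encode_for_xml; an empty string means the record does not exist.
oai_dc = record_get_xml(10, format='oai_dc')
# The same record as Dublin Core, wrapped in <record>/<header>/<metadata>.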
Example #14
 def encode_for_marcxml(value):
     from invenio.utils.text import encode_for_xml
     if isinstance(value, unicode):
         value = value.encode('utf8')
     return encode_for_xml(str(value))
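Illustrative behaviour, assuming the Python 2 unicode/str semantics used in the snippet:

encode_for_marcxml(u'M\u00fcller & S\u00f6hne')
# The unicode value is encoded to UTF-8 bytes first, then '&' is escaped,
# giving something like 'M\xc3\xbcller &amp; S\xc3\xb6hne'.
encode_for_marcxml(42)
# Non-string values are stringified first, so this simply returns '42'.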
Example #15
def document_upload(req=None,
                    folder="",
                    matching="",
                    mode="",
                    exec_date="",
                    exec_time="",
                    ln=CFG_SITE_LANG,
                    priority="1",
                    email_logs_to=None):
    """ Take files from the given directory and upload them with the appropiate mode.
    @parameters:
        + folder: Folder where the files to upload are stored
        + matching: How to match file names with record fields (report number, barcode,...)
        + mode: Upload mode (append, revise, replace)
    @return: tuple (file, error code)
        file: file name causing the error to notify the user
        error code:
            1 - More than one possible recID, ambiguous behaviour
            2 - No records match that file name
            3 - File already exists
    """
    import sys
    from invenio.legacy.bibdocfile.api import BibRecDocs, file_strip_ext
    from invenio.utils.hash import md5
    import shutil
    from invenio.legacy.search_engine import perform_request_search, \
                                      search_pattern, \
                                      guess_collection_of_a_record
    _ = gettext_set_language(ln)
    errors = []
    info = [0, []]  # Number of files read, name of the files
    try:
        files = os.listdir(folder)
    except OSError as error:
        errors.append(("", error))
        return errors, info
    err_desc = {
        1: _("More than one possible recID, ambiguous behaviour"),
        2: _("No records match that file name"),
        3: _("File already exists"),
        4: _("A file with the same name and format already exists")
    }
    # Create directory DONE/ if doesn't exist
    folder = (folder[-1] == "/") and folder or (folder + "/")
    files_done_dir = folder + "DONE/"
    try:
        os.mkdir(files_done_dir)
    except OSError:
        # Directory exists or no write permission
        pass
    for docfile in files:
        if os.path.isfile(os.path.join(folder, docfile)):
            info[0] += 1
            identifier = file_strip_ext(docfile)
            extension = docfile[len(identifier):]
            rec_id = None
            if identifier:
                rec_id = search_pattern(p=identifier, f=matching, m='e')
            if not rec_id:
                errors.append((docfile, err_desc[2]))
                continue
            elif len(rec_id) > 1:
                errors.append((docfile, err_desc[1]))
                continue
            else:
                rec_id = str(list(rec_id)[0])
            rec_info = BibRecDocs(rec_id)
            if rec_info.bibdocs:
                for bibdoc in rec_info.bibdocs:
                    attached_files = bibdoc.list_all_files()
                    file_md5 = md5(
                        open(os.path.join(folder, docfile),
                             "rb").read()).hexdigest()
                    num_errors = len(errors)
                    for attached_file in attached_files:
                        if attached_file.checksum == file_md5:
                            errors.append((docfile, err_desc[3]))
                            break
                        elif attached_file.get_full_name() == docfile:
                            errors.append((docfile, err_desc[4]))
                            break
                if len(errors) > num_errors:
                    continue
            # Check if user has rights to upload file
            if req is not None:
                file_collection = guess_collection_of_a_record(int(rec_id))
                auth_code, auth_message = acc_authorize_action(
                    req, 'runbatchuploader', collection=file_collection)
                if auth_code != 0:
                    error_msg = _(
                        "No rights to upload to collection '%(x_name)s'",
                        x_name=file_collection)
                    errors.append((docfile, error_msg))
                    continue
            # Move document to be uploaded to temporary folder
            (fd, tmp_file) = tempfile.mkstemp(
                prefix=identifier + "_" +
                time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_",
                suffix=extension,
                dir=CFG_TMPSHAREDDIR)
            shutil.copy(os.path.join(folder, docfile), tmp_file)
            # Create MARC temporary file with FFT tag and call bibupload
            (fd, filename) = tempfile.mkstemp(prefix=identifier + '_',
                                              dir=CFG_TMPSHAREDDIR)
            filedesc = os.fdopen(fd, 'w')
            marc_content = """ <record>
                                    <controlfield tag="001">%(rec_id)s</controlfield>
                                        <datafield tag="FFT" ind1=" " ind2=" ">
                                            <subfield code="n">%(name)s</subfield>
                                            <subfield code="a">%(path)s</subfield>
                                        </datafield>
                               </record> """ % {
                'rec_id': rec_id,
                'name': encode_for_xml(identifier),
                'path': encode_for_xml(tmp_file),
            }
            filedesc.write(marc_content)
            filedesc.close()
            info[1].append(docfile)
            user = ""
            if req is not None:
                user_info = collect_user_info(req)
                user = user_info['nickname']
            if not user:
                user = "******"
            # Execute bibupload with the appropriate mode

            task_arguments = ('bibupload', user, "--" + mode,
                              "--priority=" + priority, "-N", "batchupload")

            if exec_date:
                date = '--runtime=' + "\'" + exec_date + ' ' + exec_time + "\'"
                task_arguments += (date, )
            if email_logs_to:
                task_arguments += ("--email-logs-to", email_logs_to)
            task_arguments += (filename, )

            jobid = task_low_level_submission(*task_arguments)

            # write batch upload history
            run_sql(
                """INSERT INTO hstBATCHUPLOAD (user, submitdate,
                    filename, execdate, id_schTASK, batch_mode)
                    VALUES (%s, NOW(), %s, %s, %s, "document")""",
                (user_info['nickname'], docfile, exec_date != "" and
                 (exec_date + ' ' + exec_time)
                 or time.strftime("%Y-%m-%d %H:%M:%S"), str(jobid)))

            # Move file to DONE folder
            done_filename = docfile + "_" + time.strftime(
                "%Y%m%d%H%M%S", time.localtime()) + "_" + str(jobid)
            try:
                os.rename(os.path.join(folder, docfile),
                          os.path.join(files_done_dir, done_filename))
            except OSError:
                errors.append('MoveError')
    return errors, info
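A hedged usage sketch (the request object, folder and matching field are invented):

errors, info = document_upload(req=request, folder='/data/batchupload/pdfs',
                               matching='reportnumber', mode='append')
# info   -> [number_of_files_seen, [names of the files queued for bibupload]]
# errors -> list of (file_name, reason) pairs for files that could not be
#           matched to exactly one record or that were already attached.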
Example #16
def assemble_caption(begin_line, begin_index, end_line, end_index, lines):
    """
    Take information about the caption of a picture and put it all together
    in a nice way.  If it spans multiple lines, put it on one line.  If it
    contains controlled characters, strip them out.  If it has tags we don't
    want to worry about, get rid of them, etc.

    @param: begin_line (int): the index of the line where the caption begins
    @param: begin_index (int): the index within the line where the caption
        begins
    @param: end_line (int): the index of the line where the caption ends
    @param: end_index (int): the index within the line where the caption ends
    @param: lines ([string, string, ...]): the line strings of the text

    @return: caption (string): the caption, nicely formatted and pieced together
    """

    # stuff we don't like
    label_head = '\\label{'

    # reassemble that sucker
    if end_line > begin_line:
        # our caption spanned multiple lines
        caption = lines[begin_line][begin_index:]

        for included_line_index in range(begin_line + 1, end_line):
            caption = caption + ' ' + lines[included_line_index]

        caption = caption + ' ' + lines[end_line][:end_index]
        caption = caption.replace('\n', ' ')
        caption = caption.replace('  ', ' ')
    else:
        # it fit on one line
        caption = lines[begin_line][begin_index:end_index]

    # clean out a label tag, if there is one
    label_begin = caption.find(label_head)
    if label_begin > -1:
        # we know that our caption is only one line, so if there's a label
        # tag in it, it will be all on one line.  so we make up some args
        dummy_start, dummy_start_line, label_end, dummy_end = \
                find_open_and_close_braces(0, label_begin, '{', [caption])
        caption = caption[:label_begin] + caption[label_end + 1:]

    # clean out characters not allowed in MARCXML
    # not allowed: & < >
    try:
        caption = wash_for_utf8(caption)
        caption = encode_for_xml(caption.encode('utf-8', 'xmlcharrefreplace'), wash=True)
    except: # that damn encode thing threw an error on astro-ph/0601014
        sys.stderr.write(caption)
        sys.stderr.write(' cannot be processed\n')
        caption = caption.replace('&', '&amp;').replace('<', '&lt;')
        caption = caption.replace('>', '&gt;')

    caption = caption.strip()

    if len(caption) > 1 and caption[0] == '{' and caption[-1] == '}':
        caption = caption[1:-1]

    return caption
Example #17
def document_upload(req=None, folder="", matching="", mode="", exec_date="", exec_time="", ln=CFG_SITE_LANG, priority="1", email_logs_to=None):
    """ Take files from the given directory and upload them with the appropiate mode.
    @parameters:
        + folder: Folder where the files to upload are stored
        + matching: How to match file names with record fields (report number, barcode,...)
        + mode: Upload mode (append, revise, replace)
    @return: tuple (file, error code)
        file: file name causing the error to notify the user
        error code:
            1 - More than one possible recID, ambiguous behaviour
            2 - No records match that file name
            3 - File already exists
    """
    import sys
    from invenio.legacy.bibdocfile.api import BibRecDocs, file_strip_ext
    from invenio.utils.hash import md5
    import shutil
    from invenio.legacy.search_engine import perform_request_search, \
                                      search_pattern, \
                                      guess_collection_of_a_record
    _ = gettext_set_language(ln)
    errors = []
    info = [0, []] # Number of files read, name of the files
    try:
        files = os.listdir(folder)
    except OSError as error:
        errors.append(("", error))
        return errors, info
    err_desc = {1: _("More than one possible recID, ambiguous behaviour"), 2: _("No records match that file name"),
                3: _("File already exists"), 4: _("A file with the same name and format already exists")}
    # Create directory DONE/ if doesn't exist
    folder = (folder[-1] == "/") and folder or (folder + "/")
    files_done_dir = folder + "DONE/"
    try:
        os.mkdir(files_done_dir)
    except OSError:
        # Directory exists or no write permission
        pass
    for docfile in files:
        if os.path.isfile(os.path.join(folder, docfile)):
            info[0] += 1
            identifier = file_strip_ext(docfile)
            extension = docfile[len(identifier):]
            rec_id = None
            if identifier:
                rec_id = search_pattern(p=identifier, f=matching, m='e')
            if not rec_id:
                errors.append((docfile, err_desc[2]))
                continue
            elif len(rec_id) > 1:
                errors.append((docfile, err_desc[1]))
                continue
            else:
                rec_id = str(list(rec_id)[0])
            rec_info = BibRecDocs(rec_id)
            if rec_info.bibdocs:
                for bibdoc in rec_info.bibdocs:
                    attached_files = bibdoc.list_all_files()
                    file_md5 = md5(open(os.path.join(folder, docfile), "rb").read()).hexdigest()
                    num_errors = len(errors)
                    for attached_file in attached_files:
                        if attached_file.checksum == file_md5:
                            errors.append((docfile, err_desc[3]))
                            break
                        elif attached_file.get_full_name() == docfile:
                            errors.append((docfile, err_desc[4]))
                            break
                if len(errors) > num_errors:
                    continue
            # Check if user has rights to upload file
            if req is not None:
                file_collection = guess_collection_of_a_record(int(rec_id))
                auth_code, auth_message = acc_authorize_action(req, 'runbatchuploader', collection=file_collection)
                if auth_code != 0:
                    error_msg = _("No rights to upload to collection '%(x_name)s'", x_name=file_collection)
                    errors.append((docfile, error_msg))
                    continue
            # Move document to be uploaded to temporary folder
            (fd, tmp_file) = tempfile.mkstemp(prefix=identifier + "_" + time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_", suffix=extension, dir=CFG_TMPSHAREDDIR)
            shutil.copy(os.path.join(folder, docfile), tmp_file)
            # Create MARC temporary file with FFT tag and call bibupload
            (fd, filename) = tempfile.mkstemp(prefix=identifier + '_', dir=CFG_TMPSHAREDDIR)
            filedesc = os.fdopen(fd, 'w')
            marc_content = """ <record>
                                    <controlfield tag="001">%(rec_id)s</controlfield>
                                        <datafield tag="FFT" ind1=" " ind2=" ">
                                            <subfield code="n">%(name)s</subfield>
                                            <subfield code="a">%(path)s</subfield>
                                        </datafield>
                               </record> """ % {'rec_id': rec_id,
                                                'name': encode_for_xml(identifier),
                                                'path': encode_for_xml(tmp_file),
                                                }
            filedesc.write(marc_content)
            filedesc.close()
            info[1].append(docfile)
            user = ""
            if req is not None:
                user_info = collect_user_info(req)
                user = user_info['nickname']
            if not user:
                user = "******"
            # Execute bibupload with the appropriate mode

            task_arguments = ('bibupload', user, "--" + mode,
                              "--priority=" + priority, "-N", "batchupload")

            if exec_date:
                date = '--runtime=' + "\'" + exec_date + ' ' + exec_time + "\'"
                task_arguments += (date, )
            if email_logs_to:
                task_arguments += ("--email-logs-to", email_logs_to)
            task_arguments += (filename, )

            jobid = task_low_level_submission(*task_arguments)

            # write batch upload history
            run_sql("""INSERT INTO hstBATCHUPLOAD (user, submitdate,
                    filename, execdate, id_schTASK, batch_mode)
                    VALUES (%s, NOW(), %s, %s, %s, "document")""",
                    (user_info['nickname'], docfile,
                    exec_date != "" and (exec_date + ' ' + exec_time)
                    or time.strftime("%Y-%m-%d %H:%M:%S"), str(jobid)))

            # Move file to DONE folder
            done_filename = docfile + "_" + time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_" + str(jobid)
            try:
                os.rename(os.path.join(folder, docfile), os.path.join(files_done_dir, done_filename))
            except OSError:
                errors.append('MoveError')
    return errors, info