def add_url(self, url, lastmod=datetime(1900, 1, 1), changefreq="", priority="", alternate=False):
    """Write one <url> node to the sitemap file.

    The canonical URL (and, when ``alternate`` is true, the alternate
    language URLs) are obtained from get_canonical_and_alternates_urls().
    ``lastmod``, ``changefreq`` and ``priority`` each contribute an optional
    child element when they are truthy.

    The node is written to ``self.filedescriptor``; ``self.num_urls`` is
    incremented and ``self.file_size`` grows by the length of the node.

    Returns the number of url nodes in sitemap.
    """
    self.num_urls += 1
    canonical_url, alternate_urls = get_canonical_and_alternates_urls(
        url, drop_ln=not alternate)

    # Optional children, collected in output order and joined afterwards.
    pieces = []
    if lastmod:
        pieces.append(u""" <lastmod>%s</lastmod>""" % lastmod.strftime(
            '%Y-%m-%dT%H:%M:%S' + DEFAULT_TIMEZONE))
    if changefreq:
        pieces.append(u""" <changefreq>%s</changefreq>""" % changefreq)
    if priority:
        pieces.append(u""" <priority>%s</priority>""" % priority)
    if alternate:
        for lang, alt_url in alternate_urls.iteritems():
            hreflang = lang.replace('_', '-')  # zh_CN -> zh-CN
            pieces.append(
                u""" <xhtml:link rel="alternate" hreflang="%s" href="%s" />""" % (
                    hreflang, encode_for_xml(alt_url, quote=True)))
    optional = ''.join(pieces)

    url_node = u""" <url> <loc>%s</loc>%s </url>""" % (
        encode_for_xml(canonical_url), optional)
    self.file_size += len(url_node)
    self.filedescriptor.write(url_node)
    return self.num_urls
def add_url(self, url, lastmod=datetime(1900, 1, 1), changefreq="", priority="", alternate=False):
    """Write one <url> node to the sitemap file.

    The canonical URL (and, when ``alternate`` is true, the alternate
    language URLs) are obtained from get_canonical_and_alternates_urls().
    ``lastmod``, ``changefreq`` and ``priority`` each contribute an optional
    child element when they are truthy.

    The node is written to ``self.filedescriptor``; ``self.num_urls`` is
    incremented and ``self.file_size`` grows by the length of the node.

    Returns the number of url nodes in sitemap.
    """
    self.num_urls += 1
    canonical_url, alternate_urls = get_canonical_and_alternates_urls(
        url, drop_ln=not alternate)

    # Optional children, collected in output order and joined afterwards.
    pieces = []
    if lastmod:
        pieces.append(u""" <lastmod>%s</lastmod>""" % lastmod.strftime(
            '%Y-%m-%dT%H:%M:%S' + DEFAULT_TIMEZONE))
    if changefreq:
        pieces.append(u""" <changefreq>%s</changefreq>""" % changefreq)
    if priority:
        pieces.append(u""" <priority>%s</priority>""" % priority)
    if alternate:
        for lang, alt_url in alternate_urls.iteritems():
            hreflang = lang.replace('_', '-')  # zh_CN -> zh-CN
            pieces.append(
                u""" <xhtml:link rel="alternate" hreflang="%s" href="%s" />""" % (
                    hreflang, encode_for_xml(alt_url, quote=True)))
    optional = ''.join(pieces)

    url_node = u""" <url> <loc>%s</loc>%s </url>""" % (
        encode_for_xml(canonical_url), optional)
    self.file_size += len(url_node)
    self.filedescriptor.write(url_node)
    return self.num_urls
def bibconvert_escape_lxml(dummy_ctx, value):
    """
    Bridge to lxml to escape the provided value.

    ``value`` may be a str, an int/long, an object exposing a ``text``
    attribute (presumably an lxml element -- confirm against callers), or a
    list, in which case only its first item is considered.

    Returns the result of encode_for_xml().  Any exception is reported on
    stderr and the function then returns None implicitly.
    """
    try:
        # A list is represented by its first item only.
        if isinstance(value, list):
            item = value[0]
        else:
            item = value

        if isinstance(item, str):
            text = item
        elif isinstance(item, (int, long)):
            text = str(item)
        else:
            text = item.text
        return encode_for_xml(text)
    except Exception as err:
        message = "Error during formatting function evaluation: " + \
                  str(err) + \
                  '\n'
        sys.stderr.write(message)
def assemble_caption(begin_line, begin_index, end_line, end_index, lines):
    """
    Take information about the caption of a picture and put it all together
    in a nice way.  If it spans multiple lines, put it on one line.  If it
    contains controlled characters, strip them out.  If it has tags we don't
    want to worry about, get rid of them, etc.

    @param: begin_line (int): the index of the line where the caption begins
    @param: begin_index (int): the index within the line where the caption
        begins
    @param: end_line (int): the index of the line where the caption ends
    @param: end_index (int): the index within the line where the caption ends
    @param: lines ([string, string, ...]): the line strings of the text

    @return: caption (string): the caption, nicely formatted and pieced
        together
    """
    # stuff we don't like
    label_head = "\\label{"

    # reassemble the caption
    if end_line > begin_line:
        # the caption spans several lines: tail of the first line, every
        # line in between, head of the last line
        parts = [lines[begin_line][begin_index:]]
        parts.extend(lines[begin_line + 1:end_line])
        parts.append(lines[end_line][:end_index])
        caption = " ".join(parts)
        caption = caption.replace("\n", " ")
        # joining and newline removal leave doubled spaces behind
        caption = caption.replace("  ", " ")
    else:
        # it fit on one line
        caption = lines[begin_line][begin_index:end_index]

    # clean out a label tag, if there is one
    label_begin = caption.find(label_head)
    if label_begin > -1:
        # the caption is a single string by now, so a label tag in it is
        # entirely inside that string; hand it over as a one-line text
        dummy_start, dummy_start_line, label_end, dummy_end = \
            find_open_and_close_braces(0, label_begin, "{", [caption])
        caption = caption[:label_begin] + caption[label_end + 1:]

    # clean out characters not allowed in MARCXML
    # not allowed: & < >
    try:
        caption = wash_for_utf8(caption)
        caption = encode_for_xml(caption.encode("utf-8", "xmlcharrefreplace"), wash=True)
    except Exception:
        # the encode step is known to fail on some input
        # (e.g. astro-ph/0601014); fall back to escaping by hand
        sys.stderr.write(caption)
        sys.stderr.write(" cannot be processed\n")
        # '&' must be escaped first so the entities added next stay intact
        caption = caption.replace("&", "&amp;").replace("<", "&lt;")
        caption = caption.replace(">", "&gt;")

    caption = caption.strip()

    # drop one pair of enclosing braces
    if len(caption) > 1 and caption[0] == "{" and caption[-1] == "}":
        caption = caption[1:-1]

    return caption
def assemble_caption(begin_line, begin_index, end_line, end_index, lines):
    """
    Take information about the caption of a picture and put it all together
    in a nice way.  If it spans multiple lines, put it on one line.  If it
    contains controlled characters, strip them out.  If it has tags we don't
    want to worry about, get rid of them, etc.

    @param: begin_line (int): the index of the line where the caption begins
    @param: begin_index (int): the index within the line where the caption
        begins
    @param: end_line (int): the index of the line where the caption ends
    @param: end_index (int): the index within the line where the caption ends
    @param: lines ([string, string, ...]): the line strings of the text

    @return: caption (string): the caption, nicely formatted and pieced
        together
    """
    # stuff we don't like
    label_head = '\\label{'

    # reassemble the caption
    if end_line > begin_line:
        # the caption spans several lines: tail of the first line, every
        # line in between, head of the last line
        parts = [lines[begin_line][begin_index:]]
        parts.extend(lines[begin_line + 1:end_line])
        parts.append(lines[end_line][:end_index])
        caption = ' '.join(parts)
        caption = caption.replace('\n', ' ')
        # joining and newline removal leave doubled spaces behind
        caption = caption.replace('  ', ' ')
    else:
        # it fit on one line
        caption = lines[begin_line][begin_index:end_index]

    # clean out a label tag, if there is one
    label_begin = caption.find(label_head)
    if label_begin > -1:
        # the caption is a single string by now, so a label tag in it is
        # entirely inside that string; hand it over as a one-line text
        dummy_start, dummy_start_line, label_end, dummy_end = \
            find_open_and_close_braces(0, label_begin, '{', [caption])
        caption = caption[:label_begin] + caption[label_end + 1:]

    # clean out characters not allowed in MARCXML
    # not allowed: & < >
    try:
        caption = encode_for_xml(caption.encode('utf-8', 'xmlcharrefreplace'), wash=True)
    except Exception:
        # the encode step is known to fail on some input
        # (e.g. astro-ph/0601014); fall back to escaping by hand
        sys.stderr.write(caption)
        sys.stderr.write(' cannot be processed\n')
        # '&' must be escaped first so the entities added next stay intact
        caption = caption.replace('&', '&amp;').replace('<', '&lt;')
        caption = caption.replace('>', '&gt;')

    caption = caption.strip()

    # drop one pair of enclosing braces
    if len(caption) > 1 and caption[0] == '{' and caption[-1] == '}':
        caption = caption[1:-1]

    return caption
def create_ill_record(book_info):
    """
    Create a new ILL record, write it to an XML file under CFG_TMPDIR and
    submit that file to BibUpload.

    @param book_info: book's information, unpacked as
        (title, author, place, publisher, year, edition, isbn)
    @type book_info: tuple

    @return MARC record (the MARCXML string that was written to the file)
    """
    (title, author, place, publisher, year, edition, isbn) = book_info

    # One piece per element; the pieces concatenate to a single template.
    ill_record_template = (
        ' <record>'
        ' <datafield tag="020" ind1=" " ind2=" ">'
        ' <subfield code="a">%(isbn)s</subfield>'
        ' </datafield>'
        ' <datafield tag="100" ind1=" " ind2=" ">'
        ' <subfield code="a">%(author)s</subfield>'
        ' </datafield>'
        ' <datafield tag="245" ind1=" " ind2=" ">'
        ' <subfield code="a">%(title)s</subfield>'
        ' </datafield>'
        ' <datafield tag="250" ind1=" " ind2=" ">'
        ' <subfield code="a">%(edition)s</subfield>'
        ' </datafield>'
        ' <datafield tag="260" ind1=" " ind2=" ">'
        ' <subfield code="a">%(place)s</subfield>'
        ' <subfield code="b">%(publisher)s</subfield>'
        ' <subfield code="c">%(year)s</subfield>'
        ' </datafield>'
        ' <datafield tag="980" ind1=" " ind2=" ">'
        ' <subfield code="a">ILLBOOK</subfield>'
        ' </datafield>'
        ' </record> '
    )
    ill_record = ill_record_template % {
        'isbn': encode_for_xml(isbn),
        'author': encode_for_xml(author),
        'title': encode_for_xml(title),
        'edition': encode_for_xml(edition),
        'place': encode_for_xml(place),
        'publisher': encode_for_xml(publisher),
        'year': encode_for_xml(year),
    }

    file_path = '%s/%s_%s.xml' % (CFG_TMPDIR,
                                  'bibcirculation_ill_book',
                                  time.strftime("%Y%m%d_%H%M%S"))

    # 'with' guarantees the handle is closed even when write() raises.
    with open(file_path, 'w') as xml_file:
        xml_file.write(ill_record)

    # Pass XML file to BibUpload.
    task_low_level_submission('bibupload', 'bibcirculation', '-P', '5',
                              '-i', file_path)

    return ill_record
def __new__(cls, original_string='', escape_quotes=False):
    """Create the escaped string instance.

    - an EscapedString is taken over through str() without being escaped
      a second time;
    - a truthy value whose str() is only whitespace becomes ' ';
    - anything else goes through encode_for_xml(str(...), wash=True,
      quote=escape_quotes).

    The unescaped input and the quoting flag are kept on the instance as
    ``original_string`` and ``escape_quotes``.
    """
    if isinstance(original_string, EscapedString):
        text = str(original_string)
    elif original_string and not str(original_string).strip():
        text = ' '
    else:
        text = encode_for_xml(str(original_string),
                              wash=True,
                              quote=escape_quotes)

    instance = str.__new__(cls, text)
    instance.original_string = original_string
    instance.escape_quotes = escape_quotes
    return instance
def create_ill_record(book_info):
    """
    Create a new ILL record, write it to an XML file under CFG_TMPDIR and
    submit that file to BibUpload.

    @param book_info: book's information, unpacked as
        (title, author, place, publisher, year, edition, isbn)
    @type book_info: tuple

    @return MARC record (the MARCXML string that was written to the file)
    """
    (title, author, place, publisher, year, edition, isbn) = book_info

    # One piece per element; the pieces concatenate to a single template.
    ill_record_template = (
        ' <record>'
        ' <datafield tag="020" ind1=" " ind2=" ">'
        ' <subfield code="a">%(isbn)s</subfield>'
        ' </datafield>'
        ' <datafield tag="100" ind1=" " ind2=" ">'
        ' <subfield code="a">%(author)s</subfield>'
        ' </datafield>'
        ' <datafield tag="245" ind1=" " ind2=" ">'
        ' <subfield code="a">%(title)s</subfield>'
        ' </datafield>'
        ' <datafield tag="250" ind1=" " ind2=" ">'
        ' <subfield code="a">%(edition)s</subfield>'
        ' </datafield>'
        ' <datafield tag="260" ind1=" " ind2=" ">'
        ' <subfield code="a">%(place)s</subfield>'
        ' <subfield code="b">%(publisher)s</subfield>'
        ' <subfield code="c">%(year)s</subfield>'
        ' </datafield>'
        ' <datafield tag="980" ind1=" " ind2=" ">'
        ' <subfield code="a">ILLBOOK</subfield>'
        ' </datafield>'
        ' </record> '
    )
    ill_record = ill_record_template % {
        "isbn": encode_for_xml(isbn),
        "author": encode_for_xml(author),
        "title": encode_for_xml(title),
        "edition": encode_for_xml(edition),
        "place": encode_for_xml(place),
        "publisher": encode_for_xml(publisher),
        "year": encode_for_xml(year),
    }

    file_path = "%s/%s_%s.xml" % (CFG_TMPDIR,
                                  "bibcirculation_ill_book",
                                  time.strftime("%Y%m%d_%H%M%S"))

    # 'with' guarantees the handle is closed even when write() raises.
    with open(file_path, "w") as xml_file:
        xml_file.write(ill_record)

    # Pass XML file to BibUpload.
    task_low_level_submission("bibupload", "bibcirculation", "-P", "5",
                              "-i", file_path)

    return ill_record
def create_ill_record(book_info):
    """
    Create a new ILL record, write it to an XML file under CFG_TMPDIR and
    submit that file to BibUpload.

    @param book_info: book's information, unpacked as
        (title, author, place, publisher, year, edition, isbn)
    @type book_info: tuple

    @return MARC record (the MARCXML string that was written to the file)
    """
    (title, author, place, publisher, year, edition, isbn) = book_info

    # One piece per element; the pieces concatenate to a single template.
    ill_record_template = (
        ' <record>'
        ' <datafield tag="020" ind1=" " ind2=" ">'
        ' <subfield code="a">%(isbn)s</subfield>'
        ' </datafield>'
        ' <datafield tag="100" ind1=" " ind2=" ">'
        ' <subfield code="a">%(author)s</subfield>'
        ' </datafield>'
        ' <datafield tag="245" ind1=" " ind2=" ">'
        ' <subfield code="a">%(title)s</subfield>'
        ' </datafield>'
        ' <datafield tag="250" ind1=" " ind2=" ">'
        ' <subfield code="a">%(edition)s</subfield>'
        ' </datafield>'
        ' <datafield tag="260" ind1=" " ind2=" ">'
        ' <subfield code="a">%(place)s</subfield>'
        ' <subfield code="b">%(publisher)s</subfield>'
        ' <subfield code="c">%(year)s</subfield>'
        ' </datafield>'
        ' <datafield tag="980" ind1=" " ind2=" ">'
        ' <subfield code="a">ILLBOOK</subfield>'
        ' </datafield>'
        ' </record> '
    )
    ill_record = ill_record_template % {
        'isbn': encode_for_xml(isbn),
        'author': encode_for_xml(author),
        'title': encode_for_xml(title),
        'edition': encode_for_xml(edition),
        'place': encode_for_xml(place),
        'publisher': encode_for_xml(publisher),
        'year': encode_for_xml(year),
    }

    file_path = '%s/%s_%s.xml' % (CFG_TMPDIR,
                                  'bibcirculation_ill_book',
                                  time.strftime("%Y%m%d_%H%M%S"))

    # 'with' guarantees the handle is closed even when write() raises.
    with open(file_path, 'w') as xml_file:
        xml_file.write(ill_record)

    # Pass XML file to BibUpload.
    task_low_level_submission('bibupload', 'bibcirculation', '-P', '5',
                              '-i', file_path)

    return ill_record
def bibconvert_escape_libxslt(dummy_ctx, value):
    """
    Bridge to libxslt to escape the provided value.

    A str is escaped as is, an int/long is converted with str() first, and
    anything else is indexed at [0], wrapped in a libxml2.xmlNode and
    serialized as "utf8" before being escaped.

    Returns the result of encode_for_xml().  Any exception is reported on
    stderr and the function then returns None implicitly.
    """
    try:
        if isinstance(value, str):
            return encode_for_xml(value)
        if isinstance(value, (int, long)):
            return encode_for_xml(str(value))
        node = libxml2.xmlNode(_obj=value[0])
        return encode_for_xml(node.serialize("utf8"))
    except Exception as err:
        message = "Error during formatting function evaluation: " + str(err) + "\n"
        sys.stderr.write(message)
def format_element(bfo, type='xml', encodeForXML='yes'):
    """
    Prints the complete current record as XML.

    @param type: the type of xml. Can be 'xml', 'oai_dc', 'marcxml', 'xd'
    @param encodeForXML: if 'yes', replace all < > and & with html corresponding escaped characters.
    """
    from invenio.bibformat_utils import record_get_xml
    from invenio.textutils import encode_for_xml

    # Can be used to output various xml flavours.
    record_xml = record_get_xml(bfo.recID, format=type, on_the_fly=True)

    # The flag is compared case-insensitively; anything but 'yes' means raw.
    if encodeForXML.lower() != 'yes':
        return record_xml
    return encode_for_xml(record_xml)
def bibconvert_escape_libxslt(dummy_ctx, value):
    """
    Bridge to libxslt to escape the provided value.

    A str is escaped as is, an int/long is converted with str() first, and
    anything else is indexed at [0], wrapped in a libxml2.xmlNode and
    serialized as 'utf8' before being escaped.

    Returns the result of encode_for_xml().  Any exception is reported on
    stderr and the function then returns None implicitly.
    """
    try:
        if isinstance(value, str):
            return encode_for_xml(value)
        if isinstance(value, (int, long)):
            return encode_for_xml(str(value))
        node = libxml2.xmlNode(_obj=value[0])
        return encode_for_xml(node.serialize('utf8'))
    except Exception as err:
        message = "Error during formatting function evaluation: " + \
                  str(err) + \
                  '\n'
        sys.stderr.write(message)
def _output_marc(skw_matches, ckw_matches, author_keywords, acronyms, spires=False,
                 kw_field=bconfig.CFG_MAIN_FIELD, auth_field=bconfig.CFG_AUTH_FIELD,
                 acro_field=bconfig.CFG_ACRON_FIELD, provenience='BibClassify'):
    """Outputs the keywords in the MARCXML format.

    @var skw_matches: list of single keywords
    @var ckw_matches: list of composite keywords
    @var author_keywords: dictionary of extracted author keywords
    @var acronyms: dictionary of acronyms
    @var spires: boolean, True=generate spires output - BUT NOTE: it is
        here only not to break compatibility, in fact spires output
        should never be used for xml because if we read marc back
        into the KeywordToken objects, we would not find them
    @keyword provenience: string that identifies source (authority) that
        assigned the contents of the field
    @return: string, formatted MARC"""

    # One datafield per keyword: provenience ($2), keyword ($a),
    # count ($n) and keyword type ($9).
    kw_template = ('<datafield tag="%s" ind1="%s" ind2="%s">\n'
                   ' <subfield code="2">%s</subfield>\n'
                   ' <subfield code="a">%s</subfield>\n'
                   ' <subfield code="n">%s</subfield>\n'
                   ' <subfield code="9">%s</subfield>\n'
                   '</datafield>\n')

    output = []

    # single and composite keywords share the main keyword field
    tag, ind1, ind2 = _parse_marc_code(kw_field)
    for matches in (skw_matches, ckw_matches):
        if not (matches and len(matches)):
            continue
        for kw, info in matches:
            values = (tag, ind1, ind2,
                      encode_for_xml(provenience),
                      encode_for_xml(kw.output(spires)),
                      len(info[0]),
                      encode_for_xml(kw.getType()))
            output.append(kw_template % values)

    # author keywords and acronyms each go to their own field
    for field, mapping in ((auth_field, author_keywords),
                           (acro_field, acronyms)):
        # field='' we shall not save the keywords
        if not (mapping and len(mapping) and field):
            continue
        tag, ind1, ind2 = _parse_marc_code(field)
        for kw, info in mapping.items():
            values = (tag, ind1, ind2,
                      encode_for_xml(provenience),
                      encode_for_xml(kw),
                      '',
                      encode_for_xml(kw.getType()))
            output.append(kw_template % values)

    return "".join(output)
def _print_record_marcxml_on_the_fly(sysno, record_open_tag, leader_tag):
    """Build the <marc:record> element of record 'sysno' from the bibXXx tables."""
    chunks = [" " + record_open_tag,
              " " + leader_tag,
              " <marc:controlfield tag=\"001\">%d</marc:controlfield>\n" % int(sysno)]

    def closing_tag(field):
        """Return the closing tag matching the element opened for 'field'."""
        if field[0:2] == "00":
            return " </marc:controlfield>\n"
        return " </marc:datafield>\n"

    for digit1 in range(0, 10):
        for digit2 in range(0, 10):
            bibbx = "bib%d%dx" % (digit1, digit2)
            bibx = "bibrec_bib%d%dx" % (digit1, digit2)
            query = "SELECT b.tag,b.value,bb.field_number FROM %s AS b, %s AS bb "\
                    "WHERE bb.id_bibrec=%%s AND b.id=bb.id_bibxxx AND b.tag LIKE %%s "\
                    "ORDER BY bb.field_number, b.tag ASC" % (bibbx, bibx)
            res = run_sql(query, (sysno, '%d%d%%' % (digit1, digit2)))
            field_number_old = -999
            field_old = ""
            for row in res:
                field, value, field_number = row[0], row[1], row[2]
                ind1, ind2 = field[3], field[4]
                if ind1 == "_":
                    ind1 = " "
                if ind2 == "_":
                    ind2 = " "
                # print field tag
                if field_number != field_number_old or field[:-1] != field_old[:-1]:
                    # a new field starts: close the previous one, if any
                    if field_number_old != -999:
                        chunks.append(closing_tag(field_old))
                    if field[0:2] == "00":
                        chunks.append(" <marc:controlfield tag=\"%s\">\n" % (
                            encode_for_xml(field[0:3]),))
                    else:
                        chunks.append(" <marc:datafield tag=\"%s\" ind1=\"%s\" ind2=\"%s\">\n" % (
                            encode_for_xml(field[0:3]),
                            encode_for_xml(ind1).lower(),
                            encode_for_xml(ind2).lower()))
                    field_number_old = field_number
                    field_old = field
                # print subfield value
                value = encode_for_xml(value)
                if field[0:2] == "00":
                    chunks.append(" %s\n" % (value,))
                else:
                    chunks.append(" <marc:subfield code=\"%s\">%s</marc:subfield>\n" % (
                        encode_for_xml(field[-1:]), value))
            # all fields/subfields printed in this run, so close the tag:
            if field_number_old != -999:
                chunks.append(closing_tag(field_old))
    chunks.append(" </marc:record>\n")
    return "".join(chunks)


def print_record(sysno, format='marcxml', record_exists_result=None):
    """Prints record 'sysno' formatted according to 'format'.

    - if record does not exist, return nothing (None).

    - if record has been deleted and CFG_OAI_DELETED_POLICY is
      'transient' or 'persistent', then return only header, with status
      'deleted'.

    - if record has been deleted and CFG_OAI_DELETED_POLICY has any
      other value (e.g. 'no'), then return nothing (None).

    Formats 'dc' and 'oai_dc' are treated as 'xd'.

    Optional parameter 'record_exists_result' has the value of the result
    of the record_exists(sysno) function (in order not to call that function
    again if already done.)
    """
    # sanity check:
    if record_exists_result is not None:
        _record_exists = record_exists_result
    else:
        _record_exists = record_exists(sysno)

    if not _record_exists:
        return

    if format in ("dc", "oai_dc"):
        format = "xd"

    deleted = (_record_exists == -1)
    if deleted and CFG_OAI_DELETED_POLICY not in ("persistent", "transient"):
        return

    # Infoscience modification:
    # custom validator (schema location) from Swiss librarians
    record_open_tag = ("<marc:record xmlns:marc=\"http://www.loc.gov/MARC21/slim\" "
                       "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
                       "xsi:schemaLocation=\"http://www.loc.gov/MARC21/slim "
                       "http://ead.nb.admin.ch/web/standards/slb/MARC21/MARC21slim.xsd\" "
                       "type=\"Bibliographic\">")
    # A MARC 21 leader is exactly 24 characters long; positions 08 and 09
    # are both blanks, hence the two spaces after "coc".
    leader_tag = "<marc:leader>00000coc  2200000uu 4500</marc:leader>"

    # print record opening tags:
    out = " <record>\n"
    if deleted:
        out += " <header status=\"deleted\">\n"
    else:
        out += " <header>\n"

    for ident in get_field(sysno, CFG_OAI_ID_FIELD):
        out = "%s <identifier>%s</identifier>\n" % (out, escape_space(ident))
    out = "%s <datestamp>%s</datestamp>\n" % (out, get_modification_date(sysno))
    for set_spec in get_field(sysno, CFG_OAI_SET_FIELD):
        if set_spec:
            # Print only if field not empty
            out = "%s <setSpec>%s</setSpec>\n" % (out, set_spec)
    out += " </header>\n"

    # a deleted record consists of its header only
    if not deleted:
        out += " <metadata>\n"

        if format == "marcxml":
            formatted_record = get_preformatted_record(sysno, 'xm')
            if formatted_record is not None:
                ## MARCXML is already preformatted. Adapt it if needed:
                ## put every element into the marc: namespace.
                marc_record_head = record_open_tag + "\n " + leader_tag
                replacements = (
                    ("<record>", marc_record_head),
                    ("<record xmlns=\"http://www.loc.gov/MARC21/slim\">", marc_record_head),
                    ("</record", "</marc:record"),
                    ("<controlfield", "<marc:controlfield"),
                    ("</controlfield", "</marc:controlfield"),
                    ("<datafield", "<marc:datafield"),
                    ("</datafield", "</marc:datafield"),
                    ("<subfield", "<marc:subfield"),
                    ("</subfield", "</marc:subfield"),
                )
                for old, new in replacements:
                    formatted_record = formatted_record.replace(old, new)
                out += formatted_record
            else:
                ## MARCXML is not formatted in the database, so produce it.
                out += _print_record_marcxml_on_the_fly(sysno, record_open_tag, leader_tag)
        elif format == "xd":
            out += format_record(sysno, 'xoaidc')

        # print record closing tags:
        out += " </metadata>\n"

    out += " </record>\n"

    return out
def _print_record_marcxml_on_the_fly(sysno, record_open_tag, leader_tag):
    """Build the <marc:record> element of record 'sysno' from the bibXXx tables."""
    chunks = [" " + record_open_tag,
              " " + leader_tag,
              " <marc:controlfield tag=\"001\">%d</marc:controlfield>\n" % int(sysno)]

    def closing_tag(field):
        """Return the closing tag matching the element opened for 'field'."""
        if field[0:2] == "00":
            return " </marc:controlfield>\n"
        return " </marc:datafield>\n"

    for digit1 in range(0, 10):
        for digit2 in range(0, 10):
            bibbx = "bib%d%dx" % (digit1, digit2)
            bibx = "bibrec_bib%d%dx" % (digit1, digit2)
            query = "SELECT b.tag,b.value,bb.field_number FROM %s AS b, %s AS bb "\
                    "WHERE bb.id_bibrec=%%s AND b.id=bb.id_bibxxx AND b.tag LIKE %%s "\
                    "ORDER BY bb.field_number, b.tag ASC" % (bibbx, bibx)
            res = run_sql(query, (sysno, '%d%d%%' % (digit1, digit2)))
            field_number_old = -999
            field_old = ""
            for row in res:
                field, value, field_number = row[0], row[1], row[2]
                ind1, ind2 = field[3], field[4]
                if ind1 == "_":
                    ind1 = " "
                if ind2 == "_":
                    ind2 = " "
                # print field tag
                if field_number != field_number_old or field[:-1] != field_old[:-1]:
                    # a new field starts: close the previous one, if any
                    if field_number_old != -999:
                        chunks.append(closing_tag(field_old))
                    if field[0:2] == "00":
                        chunks.append(" <marc:controlfield tag=\"%s\">\n" % (
                            encode_for_xml(field[0:3]),))
                    else:
                        chunks.append(" <marc:datafield tag=\"%s\" ind1=\"%s\" ind2=\"%s\">\n" % (
                            encode_for_xml(field[0:3]),
                            encode_for_xml(ind1).lower(),
                            encode_for_xml(ind2).lower()))
                    field_number_old = field_number
                    field_old = field
                # print subfield value
                value = encode_for_xml(value)
                if field[0:2] == "00":
                    chunks.append(" %s\n" % (value,))
                else:
                    chunks.append(" <marc:subfield code=\"%s\">%s</marc:subfield>\n" % (
                        encode_for_xml(field[-1:]), value))
            # all fields/subfields printed in this run, so close the tag:
            if field_number_old != -999:
                chunks.append(closing_tag(field_old))
    chunks.append(" </marc:record>\n")
    return "".join(chunks)


def print_record(sysno, format='marcxml', record_exists_result=None):
    """Prints record 'sysno' formatted according to 'format'.

    - if record does not exist, return nothing (None).

    - if record has been deleted and CFG_OAI_DELETED_POLICY is
      'transient' or 'persistent', then return only header, with status
      'deleted'.

    - if record has been deleted and CFG_OAI_DELETED_POLICY has any
      other value (e.g. 'no'), then return nothing (None).

    Formats 'dc' and 'oai_dc' are treated as 'xd'.

    Optional parameter 'record_exists_result' has the value of the result
    of the record_exists(sysno) function (in order not to call that function
    again if already done.)
    """
    # sanity check:
    if record_exists_result is not None:
        _record_exists = record_exists_result
    else:
        _record_exists = record_exists(sysno)

    if not _record_exists:
        return

    if format in ("dc", "oai_dc"):
        format = "xd"

    deleted = (_record_exists == -1)
    if deleted and CFG_OAI_DELETED_POLICY not in ("persistent", "transient"):
        return

    record_open_tag = ("<marc:record xmlns:marc=\"http://www.loc.gov/MARC21/slim\" "
                       "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
                       "xsi:schemaLocation=\"http://www.loc.gov/MARC21/slim "
                       "http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd\" "
                       "type=\"Bibliographic\">")
    # A MARC 21 leader is exactly 24 characters long; positions 08 and 09
    # are both blanks, hence the two spaces after "coc".
    leader_tag = "<marc:leader>00000coc  2200000uu 4500</marc:leader>"

    # print record opening tags:
    out = " <record>\n"
    if deleted:
        out += " <header status=\"deleted\">\n"
    else:
        out += " <header>\n"

    for ident in get_field(sysno, CFG_OAI_ID_FIELD):
        out = "%s <identifier>%s</identifier>\n" % (out, escape_space(ident))
    out = "%s <datestamp>%s</datestamp>\n" % (out, get_modification_date(sysno))
    for set_spec in get_field(sysno, CFG_OAI_SET_FIELD):
        if set_spec:
            # Print only if field not empty
            out = "%s <setSpec>%s</setSpec>\n" % (out, set_spec)
    out += " </header>\n"

    # a deleted record consists of its header only
    if not deleted:
        out += " <metadata>\n"

        if format == "marcxml":
            formatted_record = get_preformatted_record(sysno, 'xm')
            if formatted_record is not None:
                ## MARCXML is already preformatted. Adapt it if needed:
                ## put every element into the marc: namespace.
                marc_record_head = record_open_tag + "\n " + leader_tag
                replacements = (
                    ("<record>", marc_record_head),
                    ("<record xmlns=\"http://www.loc.gov/MARC21/slim\">", marc_record_head),
                    ("</record", "</marc:record"),
                    ("<controlfield", "<marc:controlfield"),
                    ("</controlfield", "</marc:controlfield"),
                    ("<datafield", "<marc:datafield"),
                    ("</datafield", "</marc:datafield"),
                    ("<subfield", "<marc:subfield"),
                    ("</subfield", "</marc:subfield"),
                )
                for old, new in replacements:
                    formatted_record = formatted_record.replace(old, new)
                out += formatted_record
            else:
                ## MARCXML is not formatted in the database, so produce it.
                out += _print_record_marcxml_on_the_fly(sysno, record_open_tag, leader_tag)
        elif format == "xd":
            out += format_record(sysno, 'xoaidc')

        # print record closing tags:
        out += " </metadata>\n"

    out += " </record>\n"

    return out
def encode_for_marcxml(value):
    """Return str(value) escaped by invenio.textutils.encode_for_xml()."""
    from invenio.textutils import encode_for_xml

    text = str(value)
    return encode_for_xml(text)
dir=CFG_TMPSHAREDDIR) shutil.copy(os.path.join(folder, docfile), tmp_file) # Create MARC temporary file with FFT tag and call bibupload (fd, filename) = tempfile.mkstemp(prefix=identifier + '_', dir=CFG_TMPSHAREDDIR) filedesc = os.fdopen(fd, 'w') marc_content = """ <record> <controlfield tag="001">%(rec_id)s</controlfield> <datafield tag="FFT" ind1=" " ind2=" "> <subfield code="n">%(name)s</subfield> <subfield code="a">%(path)s</subfield> </datafield> </record> """ % { 'rec_id': rec_id, 'name': encode_for_xml(identifier), 'path': encode_for_xml(tmp_file), } filedesc.write(marc_content) filedesc.close() info[1].append(docfile) user = "" if req is not None: user_info = collect_user_info(req) user = user_info['nickname'] if not user: user = "******" # Execute bibupload with the appropiate mode task_arguments = ('bibupload', user, "--" + mode, "--priority=" + priority, "-N", "batchupload")
def record_get_xml(recID, format='xm', decompress=zlib.decompress, on_the_fly=False):
    """
    Returns an XML string of the record given by recID.

    The function builds the XML directly from the database,
    without using the standard formatting process.

    'format' allows to define the flavour of XML:
        - 'xm' for standard XML
        - 'marcxml' for MARC XML
        - 'oai_dc' for OAI Dublin Core
        - 'xd' for XML Dublin Core

    If record does not exist, returns empty string.
    If the record is deleted, returns an empty MARCXML (with recid
    controlfield, OAI ID fields and 980__c=DELETED)

    @param recID: the id of the record to retrieve
    @param format: the XML flavour, see above
    @param decompress: function applied to a cached value from 'bibfmt'
    @param on_the_fly: if False, try to fetch precreated one in database
    @return: the xml string of the record
    """
    from invenio.search_engine import record_exists

    def get_fieldvalues(recID, tag):
        """Return list of field values for field TAG inside record RECID."""
        if tag == "001___":
            # we have asked for recID that is not stored in bibXXx tables
            return [str(recID)]
        # we are going to look inside bibXXx tables
        digit = tag[0:2]
        bx = "bib%sx" % digit
        bibx = "bibrec_bib%sx" % digit
        # Table names cannot be bound as parameters; the values are.
        query = "SELECT bx.value FROM %s AS bx, %s AS bibx " \
                "WHERE bibx.id_bibrec=%%s AND bx.id=bibx.id_bibxxx AND bx.tag LIKE %%s " \
                "ORDER BY bibx.field_number, bx.tag ASC" % (bx, bibx)
        return [row[0] for row in run_sql(query, (recID, tag))]

    def get_creation_date(recID, fmt="%Y-%m-%d"):
        "Returns the creation date of the record 'recID'."
        res = run_sql("SELECT DATE_FORMAT(creation_date,%s) FROM bibrec WHERE id=%s",
                      (fmt, recID), 1)
        if res:
            return res[0][0]
        return ""

    def get_modification_date(recID, fmt="%Y-%m-%d"):
        "Returns the date of last modification for the record 'recID'."
        res = run_sql("SELECT DATE_FORMAT(modification_date,%s) FROM bibrec WHERE id=%s",
                      (fmt, recID), 1)
        if res:
            return res[0][0]
        return ""

    out = ""

    # sanity check:
    record_exist_p = record_exists(recID)
    if record_exist_p == 0:  # doesn't exist
        return out

    # print record opening tags, if needed:
    if format == "marcxml" or format == "oai_dc":
        out += " <record>\n"
        out += " <header>\n"
        for identifier in get_fieldvalues(recID, CFG_OAI_ID_FIELD):
            out += " <identifier>%s</identifier>\n" % identifier
        out += " <datestamp>%s</datestamp>\n" % get_modification_date(recID)
        out += " </header>\n"
        out += " <metadata>\n"

    if format.startswith("xm") or format == "marcxml":
        res = None
        # Kept as an equality test so that only False/0 enable the cache
        # lookup, exactly as before (None does not).
        if on_the_fly == False:
            # look for cached format existence:
            res = run_sql("SELECT value FROM bibfmt WHERE id_bibrec=%s AND format=%s",
                          (recID, format), 1)
        if res and record_exist_p == 1:
            # record 'recID' is formatted in 'format', so print it
            out += "%s" % decompress(res[0][0])
        else:
            # record 'recID' is not formatted in 'format' -- they are
            # not in "bibfmt" table; so fetch all the data from
            # "bibXXx" tables:
            if format == "marcxml":
                out += """ <record xmlns="http://www.loc.gov/MARC21/slim">\n"""
                out += " <controlfield tag=\"001\">%d</controlfield>\n" % int(recID)
            elif format.startswith("xm"):
                out += """ <record>\n"""
                out += " <controlfield tag=\"001\">%d</controlfield>\n" % int(recID)
            if record_exist_p == -1:
                # deleted record, so display only OAI ID and 980:
                oai_ids = get_fieldvalues(recID, CFG_OAI_ID_FIELD)
                if oai_ids:
                    out += "<datafield tag=\"%s\" ind1=\"%s\" ind2=\"%s\"><subfield code=\"%s\">%s</subfield></datafield>\n" % \
                        (CFG_OAI_ID_FIELD[0:3],
                         CFG_OAI_ID_FIELD[3:4],
                         CFG_OAI_ID_FIELD[4:5],
                         CFG_OAI_ID_FIELD[5:6],
                         oai_ids[0])
                out += "<datafield tag=\"980\" ind1=\" \" ind2=\" \"><subfield code=\"c\">DELETED</subfield></datafield>\n"
            else:
                # controlfields
                # ('%%' is a literal '%' once the driver binds the parameter)
                query = "SELECT b.tag,b.value,bb.field_number FROM bib00x AS b, bibrec_bib00x AS bb "\
                        "WHERE bb.id_bibrec=%s AND b.id=bb.id_bibxxx AND b.tag LIKE '00%%' "\
                        "ORDER BY bb.field_number, b.tag ASC"
                res = run_sql(query, (recID,))
                for row in res:
                    field, value = row[0], row[1]
                    value = encode_for_xml(value)
                    out += """ <controlfield tag="%s">%s</controlfield>\n""" % \
                        (encode_for_xml(field[0:3]), value)
                # datafields
                i = 1  # Do not process bib00x and bibrec_bib00x, as
                       # they are controlfields. So start at bib01x and
                       # bibrec_bib01x (and set i = 0 at the end of
                       # first loop)
                for digit1 in range(0, 10):
                    for digit2 in range(i, 10):
                        bx = "bib%d%dx" % (digit1, digit2)
                        bibx = "bibrec_bib%d%dx" % (digit1, digit2)
                        query = "SELECT b.tag,b.value,bb.field_number FROM %s AS b, %s AS bb "\
                                "WHERE bb.id_bibrec=%%s AND b.id=bb.id_bibxxx AND b.tag LIKE %%s "\
                                "ORDER BY bb.field_number, b.tag ASC" % (bx, bibx)
                        res = run_sql(query, (recID, "%d%d%%" % (digit1, digit2)))
                        field_number_old = -999
                        field_old = ""
                        for row in res:
                            field, value, field_number = row[0], row[1], row[2]
                            ind1, ind2 = field[3], field[4]
                            if ind1 == "_" or ind1 == "":
                                ind1 = " "
                            if ind2 == "_" or ind2 == "":
                                ind2 = " "
                            # print field tag
                            if field_number != field_number_old or \
                                   field[:-1] != field_old[:-1]:
                                if field_number_old != -999:
                                    out += """ </datafield>\n"""
                                out += """ <datafield tag="%s" ind1="%s" ind2="%s">\n""" % \
                                    (encode_for_xml(field[0:3]),
                                     encode_for_xml(ind1),
                                     encode_for_xml(ind2))
                                field_number_old = field_number
                                field_old = field
                            # print subfield value
                            value = encode_for_xml(value)
                            out += """ <subfield code="%s">%s</subfield>\n""" % \
                                (encode_for_xml(field[-1:]), value)

                        # all fields/subfields printed in this run, so close the tag:
                        if field_number_old != -999:
                            out += """ </datafield>\n"""
                    i = 0  # Next loop should start looking at bib%0 and bibrec_bib00x
            # we are at the end of printing the record:
            out += " </record>\n"

    elif format == "xd" or format == "oai_dc":
        # XML Dublin Core format, possibly OAI -- select only some bibXXx fields:
        out += """ <dc xmlns="http://purl.org/dc/elements/1.1/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://purl.org/dc/elements/1.1/ http://www.openarchives.org/OAI/1.1/dc.xsd">\n"""
        # a deleted record gets an empty <dc> element
        if record_exist_p != -1:
            for f in get_fieldvalues(recID, "041__a"):
                out += " <language>%s</language>\n" % f

            for f in get_fieldvalues(recID, "100__a"):
                out += " <creator>%s</creator>\n" % encode_for_xml(f)

            for f in get_fieldvalues(recID, "700__a"):
                out += " <creator>%s</creator>\n" % encode_for_xml(f)

            for f in get_fieldvalues(recID, "245__a"):
                out += " <title>%s</title>\n" % encode_for_xml(f)

            for f in get_fieldvalues(recID, "65017a"):
                out += " <subject>%s</subject>\n" % encode_for_xml(f)

            for f in get_fieldvalues(recID, "8564_u"):
                out += " <identifier>%s</identifier>\n" % encode_for_xml(f)

            for f in get_fieldvalues(recID, "520__a"):
                out += " <description>%s</description>\n" % encode_for_xml(f)

            out += " <date>%s</date>\n" % get_creation_date(recID)
        out += " </dc>\n"

    # print record closing tags, if needed:
    if format == "marcxml" or format == "oai_dc":
        out += " </metadata>\n"
        out += " </record>\n"

    return out
def record_get_xml(recID, format='xm', decompress=zlib.decompress,
                   on_the_fly=False):
    """
    Return an XML string of the record given by recID.

    The function builds the XML directly from the database, without
    using the standard formatting process.

    'format' allows to define the flavour of XML:
        - 'xm' for standard XML
        - 'marcxml' for MARC XML
        - 'oai_dc' for OAI Dublin Core
        - 'xd' for XML Dublin Core

    If the record does not exist, returns an empty string.
    If the record is deleted, the MARC flavours contain only the recid
    controlfield, the first OAI ID (if any) and 980__c=DELETED; the
    Dublin Core flavours contain an empty <dc> element.

    Every value read from the bibXXx tables is passed through
    encode_for_xml() before being inserted into the output, and every
    value sent to the database is passed as a query parameter (only the
    bibXXx table names, which are derived from two-digit tag prefixes,
    are interpolated into the SQL text).

    @param recID: the id of the record to retrieve
    @param format: the XML flavour to produce (see above)
    @param decompress: callable applied to the cached value found in
        the bibfmt table before it is appended to the output
    @param on_the_fly: if False, try to fetch precreated one in database
    @return: the xml string of the record
    """
    from invenio.search_engine import record_exists

    def get_fieldvalues(recID, tag):
        """Return list of field values for field TAG inside record RECID."""
        if tag == "001___":
            # we have asked for recID that is not stored in bibXXx tables
            return [str(recID)]
        # we are going to look inside bibXXx tables
        digit = tag[0:2]
        bx = "bib%sx" % digit
        bibx = "bibrec_bib%sx" % digit
        # Table names cannot be bound as parameters, so they are
        # interpolated; '%%s' survives as the '%s' placeholder that
        # run_sql fills in with the record id and the tag pattern.
        query = "SELECT bx.value FROM %s AS bx, %s AS bibx " \
                "WHERE bibx.id_bibrec=%%s AND bx.id=bibx.id_bibxxx " \
                "AND bx.tag LIKE %%s " \
                "ORDER BY bibx.field_number, bx.tag ASC" % (bx, bibx)
        return [row[0] for row in run_sql(query, (recID, tag))]

    def get_creation_date(recID, fmt="%Y-%m-%d"):
        "Returns the creation date of the record 'recID'."
        res = run_sql(
            "SELECT DATE_FORMAT(creation_date,%s) FROM bibrec WHERE id=%s",
            (fmt, recID), 1)
        if res:
            return res[0][0]
        return ""

    def get_modification_date(recID, fmt="%Y-%m-%d"):
        "Returns the date of last modification for the record 'recID'."
        res = run_sql(
            "SELECT DATE_FORMAT(modification_date,%s) FROM bibrec WHERE id=%s",
            (fmt, recID), 1)
        if res:
            return res[0][0]
        return ""

    # sanity check:
    record_exist_p = record_exists(recID)
    if record_exist_p == 0:  # doesn't exist
        return ""

    # Output fragments are collected here and joined once at the end.
    parts = []

    # print record opening tags, if needed:
    if format == "marcxml" or format == "oai_dc":
        parts.append(" <record>\n")
        parts.append(" <header>\n")
        for identifier in get_fieldvalues(recID, CFG_OAI_ID_FIELD):
            parts.append(" <identifier>%s</identifier>\n" %
                         encode_for_xml(identifier))
        parts.append(" <datestamp>%s</datestamp>\n" %
                     get_modification_date(recID))
        parts.append(" </header>\n")
        parts.append(" <metadata>\n")

    if format.startswith("xm") or format == "marcxml":
        res = None
        if not on_the_fly:
            # look for cached format existence:
            res = run_sql(
                "SELECT value FROM bibfmt WHERE id_bibrec=%s AND format=%s",
                (recID, format), 1)
        if res and record_exist_p == 1:
            # record 'recID' is formatted in 'format', so print it
            parts.append("%s" % decompress(res[0][0]))
        else:
            # record 'recID' is not formatted in 'format' -- they are
            # not in "bibfmt" table; so fetch all the data from
            # "bibXXx" tables:
            if format == "marcxml":
                parts.append(
                    """ <record xmlns="http://www.loc.gov/MARC21/slim">\n""")
                parts.append(
                    " <controlfield tag=\"001\">%d</controlfield>\n" %
                    int(recID))
            elif format.startswith("xm"):
                parts.append(""" <record>\n""")
                parts.append(
                    " <controlfield tag=\"001\">%d</controlfield>\n" %
                    int(recID))
            if record_exist_p == -1:
                # deleted record, so display only OAI ID and 980:
                oai_ids = get_fieldvalues(recID, CFG_OAI_ID_FIELD)
                if oai_ids:
                    parts.append(
                        "<datafield tag=\"%s\" ind1=\"%s\" ind2=\"%s\">"
                        "<subfield code=\"%s\">%s</subfield></datafield>\n" %
                        (CFG_OAI_ID_FIELD[0:3],
                         CFG_OAI_ID_FIELD[3:4],
                         CFG_OAI_ID_FIELD[4:5],
                         CFG_OAI_ID_FIELD[5:6],
                         encode_for_xml(oai_ids[0])))
                parts.append(
                    "<datafield tag=\"980\" ind1=\" \" ind2=\" \">"
                    "<subfield code=\"c\">DELETED</subfield></datafield>\n")
            else:
                # controlfields
                query = "SELECT b.tag,b.value,bb.field_number " \
                        "FROM bib00x AS b, bibrec_bib00x AS bb " \
                        "WHERE bb.id_bibrec=%s AND b.id=bb.id_bibxxx " \
                        "AND b.tag LIKE %s " \
                        "ORDER BY bb.field_number, b.tag ASC"
                for row in run_sql(query, (recID, "00%")):
                    field, value = row[0], row[1]
                    parts.append(
                        """ <controlfield tag="%s">%s</controlfield>\n""" %
                        (encode_for_xml(field[0:3]), encode_for_xml(value)))
                # datafields
                for digit1 in range(10):
                    for digit2 in range(10):
                        if digit1 == 0 and digit2 == 0:
                            # bib00x and bibrec_bib00x hold the
                            # controlfields, already printed above.
                            continue
                        bx = "bib%d%dx" % (digit1, digit2)
                        bibx = "bibrec_bib%d%dx" % (digit1, digit2)
                        query = "SELECT b.tag,b.value,bb.field_number " \
                                "FROM %s AS b, %s AS bb " \
                                "WHERE bb.id_bibrec=%%s " \
                                "AND b.id=bb.id_bibxxx " \
                                "AND b.tag LIKE %%s " \
                                "ORDER BY bb.field_number, b.tag ASC" % \
                                (bx, bibx)
                        res = run_sql(
                            query, (recID, "%d%d%%" % (digit1, digit2)))
                        field_number_old = -999
                        field_old = ""
                        for row in res:
                            field, value, field_number = \
                                row[0], row[1], row[2]
                            # Slices (not indexes) so that a tag shorter
                            # than five characters yields "" instead of
                            # raising IndexError; "" is mapped to a blank
                            # indicator just below.
                            ind1, ind2 = field[3:4], field[4:5]
                            if ind1 == "_" or ind1 == "":
                                ind1 = " "
                            if ind2 == "_" or ind2 == "":
                                ind2 = " "
                            # print field tag
                            if field_number != field_number_old or \
                                    field[:-1] != field_old[:-1]:
                                if field_number_old != -999:
                                    parts.append(""" </datafield>\n""")
                                parts.append(
                                    """ <datafield tag="%s" ind1="%s" ind2="%s">\n""" %
                                    (encode_for_xml(field[0:3]),
                                     encode_for_xml(ind1),
                                     encode_for_xml(ind2)))
                                field_number_old = field_number
                                field_old = field
                            # print subfield value
                            parts.append(
                                """ <subfield code="%s">%s</subfield>\n""" %
                                (encode_for_xml(field[-1:]),
                                 encode_for_xml(value)))
                        # all fields/subfields printed in this run, so
                        # close the tag:
                        if field_number_old != -999:
                            parts.append(""" </datafield>\n""")
            # we are at the end of printing the record:
            parts.append(" </record>\n")

    elif format == "xd" or format == "oai_dc":
        # XML Dublin Core format, possibly OAI -- select only some
        # bibXXx fields:
        parts.append(
            ' <dc xmlns="http://purl.org/dc/elements/1.1/" \n'
            'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
            'xsi:schemaLocation="http://purl.org/dc/elements/1.1/ '
            'http://www.openarchives.org/OAI/1.1/dc.xsd">\n')
        if record_exist_p != -1:
            # (MARC tag, Dublin Core element) pairs, in output order.
            dc_fields = (
                ("041__a", "language"),
                ("100__a", "creator"),
                ("700__a", "creator"),
                ("245__a", "title"),
                ("65017a", "subject"),
                ("8564_u", "identifier"),
                ("520__a", "description"),
            )
            for tag, element in dc_fields:
                for f in get_fieldvalues(recID, tag):
                    parts.append(" <%s>%s</%s>\n" %
                                 (element, encode_for_xml(f), element))
            parts.append(" <date>%s</date>\n" % get_creation_date(recID))
        parts.append(" </dc>\n")

    # print record closing tags, if needed:
    if format == "marcxml" or format == "oai_dc":
        parts.append(" </metadata>\n")
        parts.append(" </record>\n")

    return "".join(parts)
def encode_for_marcxml(value):
    """
    Return VALUE converted to a byte string and passed through
    invenio.textutils.encode_for_xml().

    Unicode input is first encoded as UTF-8; any other input is
    converted with str() as is.

    @param value: the value to encode
    @return: whatever encode_for_xml() returns for the string form of
        VALUE
    """
    # Imported here rather than at module level, as in the original.
    from invenio.textutils import encode_for_xml
    raw = value.encode('utf8') if isinstance(value, unicode) else value
    return encode_for_xml(str(raw))
continue tempfile.tempdir = CFG_TMPSHAREDDIR # Move document to be uploaded to temporary folder tmp_file = tempfile.mktemp(prefix=identifier + "_" + time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_", suffix=extension) shutil.copy(os.path.join(folder, docfile), tmp_file) # Create MARC temporary file with FFT tag and call bibupload filename = tempfile.mktemp(prefix=identifier + '_') filedesc = open(filename, 'w') marc_content = """ <record> <controlfield tag="001">%(rec_id)s</controlfield> <datafield tag="FFT" ind1=" " ind2=" "> <subfield code="n">%(name)s</subfield> <subfield code="a">%(path)s</subfield> </datafield> </record> """ % {'rec_id': rec_id, 'name': encode_for_xml(identifier), 'path': encode_for_xml(tmp_file), } filedesc.write(marc_content) filedesc.close() info[1].append(docfile) user = "" if req is not None: user_info = collect_user_info(req) user = user_info['nickname'] if not user: user = "******" # Execute bibupload with the appropiate mode task_arguments = ('bibupload', user, "--" + mode, "--name=" + docfile, "--priority=" + priority)