def add_url(self, url, lastmod=datetime(1900, 1, 1), changefreq="", priority="", alternate=False):
    """Write one <url> node for `url` to the sitemap file.

    The node always carries the canonical URL in <loc>.  <lastmod>,
    <changefreq> and <priority> are added only when the matching argument
    is truthy, and one xhtml:link element per alternate URL is added when
    `alternate` is true.  self.num_urls is incremented and self.file_size
    grows by the length of the written node.

    @return: the number of url nodes in the sitemap after this call
    """
    self.num_urls += 1
    canonical_url, alternate_urls = get_canonical_and_alternates_urls(
        url, drop_ln=not alternate)

    extras = []
    if lastmod:
        timestamp = lastmod.strftime('%Y-%m-%dT%H:%M:%S' + DEFAULT_TIMEZONE)
        extras.append(u""" <lastmod>%s</lastmod>""" % timestamp)
    if changefreq:
        extras.append(u""" <changefreq>%s</changefreq>""" % changefreq)
    if priority:
        extras.append(u""" <priority>%s</priority>""" % priority)
    if alternate:
        for lang_code, lang_url in iteritems(alternate_urls):
            # hreflang uses a dash-separated code: zh_CN -> zh-CN
            hreflang = lang_code.replace('_', '-')
            extras.append(
                u""" <xhtml:link rel="alternate" hreflang="%s" href="%s" />""" % (
                    hreflang, encode_for_xml(lang_url, quote=True)))

    node_text = u""" <url> <loc>%s</loc>%s </url>""" % (
        encode_for_xml(canonical_url), u''.join(extras))
    self.file_size += len(node_text)
    self.filedescriptor.write(node_text)
    return self.num_urls
def add_url(self, url, lastmod=datetime(1900, 1, 1), changefreq="", priority="", alternate=False):
    """Create a new url node and write it to the sitemap file.

    Optional children (<lastmod>, <changefreq>, <priority>) are emitted
    only for truthy arguments; alternate-language links are emitted only
    when `alternate` is true.

    @return: the number of url nodes in the sitemap
    """
    self.num_urls += 1
    canonical_url, alternate_urls = get_canonical_and_alternates_urls(
        url, drop_ln=not alternate)

    lastmod_xml = u""" <lastmod>%s</lastmod>""" % lastmod.strftime(
        '%Y-%m-%dT%H:%M:%S' + DEFAULT_TIMEZONE) if lastmod else u''
    changefreq_xml = u""" <changefreq>%s</changefreq>""" % changefreq if changefreq else u''
    priority_xml = u""" <priority>%s</priority>""" % priority if priority else u''

    links_xml = u''
    if alternate:
        # Language codes are rewritten from zh_CN to zh-CN for hreflang.
        links_xml = u''.join(
            u""" <xhtml:link rel="alternate" hreflang="%s" href="%s" />""" % (
                code.replace('_', '-'), encode_for_xml(link, quote=True))
            for code, link in iteritems(alternate_urls))

    optional = lastmod_xml + changefreq_xml + priority_xml + links_xml
    rendered = u""" <url> <loc>%s</loc>%s </url>""" % (
        encode_for_xml(canonical_url), optional)

    self.file_size += len(rendered)
    self.filedescriptor.write(rendered)
    return self.num_urls
def _bibconvert_escape(dummy_ctx, value):
    """Bridge to lxml to escape the provided value.

    A str is escaped as is and an int/long is converted with str() first.
    A list contributes only its first element, which is treated by the
    same rules.  Anything else contributes its ``text`` attribute.

    Any exception is reported on stderr and '' is returned instead.
    """
    try:
        subject = value
        # `long` is only looked up once the str check has failed.
        is_scalar = isinstance(subject, str) or isinstance(subject, (int, long))
        if not is_scalar and isinstance(subject, list):
            subject = subject[0]

        if isinstance(subject, str):
            text = subject
        elif isinstance(subject, (int, long)):
            text = str(subject)
        else:
            text = subject.text
        return encode_for_xml(text)
    except Exception as err:
        print("Error during formatting function evaluation: {0}".format(err),
              file=sys.stderr)
        return ''
def __new__(cls, original_string='', escape_quotes=False):
    """Build the escaped str value and remember how it was produced.

    An EscapedString argument is taken over without being escaped again.
    A truthy argument whose str() form is only whitespace becomes ' '.
    Everything else goes through encode_for_xml with wash=True and
    quote=escape_quotes.  The original value and the escape_quotes flag
    are stored as attributes on the new object.
    """
    if isinstance(original_string, EscapedString):
        escaped_string = str(original_string)
    elif original_string and not str(original_string).strip():
        escaped_string = ' '
    else:
        escaped_string = encode_for_xml(
            str(original_string), wash=True, quote=escape_quotes)

    instance = str.__new__(cls, escaped_string)
    instance.original_string = original_string
    instance.escape_quotes = escape_quotes
    return instance
def create_ill_record(book_info):
    """
    Create a new ILL record, write it to a temporary XML file and submit
    that file to BibUpload.

    @param book_info: book's information, in the order
                      (title, author, place, publisher, year, edition, isbn)
    @type book_info: tuple

    @return MARC record (the XML string that was written to the file)
    """
    (title, author, place, publisher, year, edition, isbn) = book_info

    # Adjacent literals are concatenated by the compiler into one template.
    record_template = (
        ' <record>'
        ' <datafield tag="020" ind1=" " ind2=" ">'
        ' <subfield code="a">%(isbn)s</subfield>'
        ' </datafield>'
        ' <datafield tag="100" ind1=" " ind2=" ">'
        ' <subfield code="a">%(author)s</subfield>'
        ' </datafield>'
        ' <datafield tag="245" ind1=" " ind2=" ">'
        ' <subfield code="a">%(title)s</subfield>'
        ' </datafield>'
        ' <datafield tag="250" ind1=" " ind2=" ">'
        ' <subfield code="a">%(edition)s</subfield>'
        ' </datafield>'
        ' <datafield tag="260" ind1=" " ind2=" ">'
        ' <subfield code="a">%(place)s</subfield>'
        ' <subfield code="b">%(publisher)s</subfield>'
        ' <subfield code="c">%(year)s</subfield>'
        ' </datafield>'
        ' <datafield tag="980" ind1=" " ind2=" ">'
        ' <subfield code="a">ILLBOOK</subfield>'
        ' </datafield>'
        ' </record> '
    )
    ill_record = record_template % {
        'isbn': encode_for_xml(isbn),
        'author': encode_for_xml(author),
        'title': encode_for_xml(title),
        'edition': encode_for_xml(edition),
        'place': encode_for_xml(place),
        'publisher': encode_for_xml(publisher),
        'year': encode_for_xml(year),
    }

    file_path = '%s/%s_%s.xml' % (CFG_TMPDIR,
                                  'bibcirculation_ill_book',
                                  time.strftime("%Y%m%d_%H%M%S"))

    # The context manager closes the file even when write() raises.
    with open(file_path, 'w') as xml_file:
        xml_file.write(ill_record)

    # Pass XML file to BibUpload.
    task_low_level_submission('bibupload', 'bibcirculation', '-P', '5',
                              '-i', file_path)

    return ill_record
def format_element(bfo, type='xm', encodeForXML='yes'):
    """Print the complete current record as XML.

    :param type: the type of xml; this implementation asserts that it
        is 'xm'
    :param encodeForXML: if 'yes' (case-insensitive), the exported record
        is passed through encode_for_xml before being returned
    """
    assert type == 'xm'

    from invenio.modules.records.api import get_record
    from invenio.utils.text import encode_for_xml

    marc_export = get_record(bfo.recID).legacy_export_as_marc()
    wants_escaping = encodeForXML.lower() == 'yes'
    return encode_for_xml(marc_export) if wants_escaping else marc_export
def format_element(bfo, type='xml', encodeForXML='yes'):
    """
    Prints the complete current record as XML.

    The XML is always generated on the fly by record_get_xml.

    @param type: the xml flavour, passed to record_get_xml as its format
    @param encodeForXML: if 'yes' (case-insensitive), the XML is passed
                         through encode_for_xml before being returned
    """
    from invenio.modules.formatter.utils import record_get_xml
    from invenio.utils.text import encode_for_xml

    # Can be used to output various xml flavours.
    record_xml = record_get_xml(bfo.recID, format=type, on_the_fly=True)

    if encodeForXML.lower() != 'yes':
        return record_xml
    return encode_for_xml(record_xml)
def _output_marc(output_complete, categories, kw_field=bconfig.CFG_MAIN_FIELD, auth_field=bconfig.CFG_AUTH_FIELD, acro_field=bconfig.CFG_ACRON_FIELD, provenience='BibClassify'):
    """Output the keywords in the MARCXML format.

    :param output_complete: mapping read under the keys "Single keywords",
        "Core keywords", "Author keywords" and "Acronyms"
    :param categories: mapping from keyword to the value written into
        subfield 9
    :param kw_field: MARC code used for single and core keywords
    :param auth_field: MARC code used for author keywords; a false value
        means they are not written
    :param acro_field: MARC code used for acronyms; a false value means
        they are not written
    :keyword provenience: string that identifies source (authority) that
        assigned the contents of the field
    :return: string, formatted MARC"""
    kw_template = ''.join((
        '<datafield tag="%s" ind1="%s" ind2="%s">\n',
        ' <subfield code="2">%s</subfield>\n',
        ' <subfield code="a">%s</subfield>\n',
        ' <subfield code="n">%s</subfield>\n',
        ' <subfield code="9">%s</subfield>\n',
        '</datafield>\n',
    ))

    chunks = []

    # Single and core keywords: subfield n holds the value stored per keyword.
    tag, ind1, ind2 = _parse_marc_code(kw_field)
    for counted in (output_complete["Single keywords"],
                    output_complete["Core keywords"]):
        chunks.extend(
            kw_template % (tag, ind1, ind2,
                           encode_for_xml(provenience),
                           encode_for_xml(kw),
                           counted[kw],
                           encode_for_xml(categories[kw]))
            for kw in counted)

    # Author keywords and acronyms: subfield n is left empty.
    for field, keywords in ((auth_field, output_complete["Author keywords"]),
                            (acro_field, output_complete["Acronyms"])):
        # field='' we shall not save the keywords
        if not (keywords and len(keywords) and field):
            continue
        tag, ind1, ind2 = _parse_marc_code(field)
        for kw, dummy_info in keywords.items():
            chunks.append(kw_template % (tag, ind1, ind2,
                                         encode_for_xml(provenience),
                                         encode_for_xml(kw),
                                         '',
                                         encode_for_xml(categories[kw])))

    return "".join(chunks)
def _output_marc(output_complete, categories, kw_field=cfg["CLASSIFIER_RECORD_KEYWORD_FIELD"], auth_field=cfg["CLASSIFIER_RECORD_KEYWORD_AUTHOR_FIELD"], acro_field=cfg["CLASSIFIER_RECORD_KEYWORD_ACRONYM_FIELD"], provenience='Classifier'):
    """Output the keywords in the MARCXML format.

    :param output_complete: mapping read under the keys "Single keywords",
        "Core keywords", "Author keywords" and "Acronyms"
    :param categories: mapping from keyword to the value written into
        subfield 9
    :param kw_field: MARC code used for single and core keywords
    :param auth_field: MARC code for author keywords (skipped when false)
    :param acro_field: MARC code for acronyms (skipped when false)
    :keyword provenience: string that identifies source (authority) that
        assigned the contents of the field
    :return: string, formatted MARC
    """
    kw_template = ('<datafield tag="%s" ind1="%s" ind2="%s">\n'
                   ' <subfield code="2">%s</subfield>\n'
                   ' <subfield code="a">%s</subfield>\n'
                   ' <subfield code="n">%s</subfield>\n'
                   ' <subfield code="9">%s</subfield>\n'
                   '</datafield>\n')
    rendered = []

    tag, ind1, ind2 = _parse_marc_code(kw_field)
    main_groups = (output_complete["Single keywords"],
                   output_complete["Core keywords"])
    for group in main_groups:
        rendered += [
            kw_template % (tag, ind1, ind2,
                           encode_for_xml(provenience),
                           encode_for_xml(keyword),
                           group[keyword],
                           encode_for_xml(categories[keyword]))
            for keyword in group
        ]

    extra_groups = ((auth_field, output_complete["Author keywords"]),
                    (acro_field, output_complete["Acronyms"]))
    for field, group in extra_groups:
        # field='' we shall not save the keywords
        if group and len(group) and field:
            tag, ind1, ind2 = _parse_marc_code(field)
            rendered += [
                kw_template % (tag, ind1, ind2,
                               encode_for_xml(provenience),
                               encode_for_xml(keyword),
                               '',
                               encode_for_xml(categories[keyword]))
                for keyword, dummy_info in group.items()
            ]

    return "".join(rendered)
def bibconvert_escape_libxslt(dummy_ctx, value):
    """
    Bridge to libxslt to escape the provided value.

    A str is escaped as is and an int/long is converted with str() first.
    For any other value, its first item is wrapped in a libxml2 node and
    serialized as utf8 before escaping.  Any exception is reported on
    stderr and '' is returned instead.
    """
    try:
        if isinstance(value, str):
            return encode_for_xml(value)
        if isinstance(value, (int, long)):
            return encode_for_xml(str(value))
        node = libxml2.xmlNode(_obj=value[0])
        return encode_for_xml(node.serialize('utf8'))
    except Exception as err:
        message = "Error during formatting function evaluation: " + str(err) + '\n'
        sys.stderr.write(message)
        return ''
def _output_marc(skw_matches, ckw_matches, author_keywords, acronyms, spires=False, kw_field=bconfig.CFG_MAIN_FIELD, auth_field=bconfig.CFG_AUTH_FIELD, acro_field=bconfig.CFG_ACRON_FIELD, provenience='BibClassify'):
    """Outputs the keywords in the MARCXML format.

    @var skw_matches: list of single keywords, iterated as (keyword, info)
        pairs
    @var ckw_matches: list of composite keywords, iterated the same way
    @var author_keywords: dictionary of extracted author keywords
    @var acronyms: dictionary of acronyms
    @var spires: boolean, True=generate spires output - BUT NOTE: it is
        here only not to break compatibility, in fact spires output
        should never be used for xml because if we read marc back
        into the KeywordToken objects, we would not find them
    @keyword provenience: string that identifies source (authority) that
        assigned the contents of the field
    @return: string, formatted MARC"""
    kw_template = ''.join([
        '<datafield tag="%s" ind1="%s" ind2="%s">\n',
        ' <subfield code="2">%s</subfield>\n',
        ' <subfield code="a">%s</subfield>\n',
        ' <subfield code="n">%s</subfield>\n',
        ' <subfield code="9">%s</subfield>\n',
        '</datafield>\n',
    ])
    parts = []

    tag, ind1, ind2 = _parse_marc_code(kw_field)
    for matches in (skw_matches, ckw_matches):
        if not (matches and len(matches)):
            continue
        for keyword, info in matches:
            # subfield n carries the length of the first info item
            parts.append(kw_template % (tag, ind1, ind2,
                                        encode_for_xml(provenience),
                                        encode_for_xml(keyword.output(spires)),
                                        len(info[0]),
                                        encode_for_xml(keyword.getType())))

    for field, extras in ((auth_field, author_keywords),
                          (acro_field, acronyms)):
        # field='' we shall not save the keywords
        if not (extras and len(extras) and field):
            continue
        tag, ind1, ind2 = _parse_marc_code(field)
        for keyword, dummy_info in extras.items():
            parts.append(kw_template % (tag, ind1, ind2,
                                        encode_for_xml(provenience),
                                        encode_for_xml(keyword),
                                        '',
                                        encode_for_xml(keyword.getType())))

    return "".join(parts)
def record_get_xml(recID, format='xm', decompress=zlib.decompress, on_the_fly=False):
    """
    Returns an XML string of the record given by recID.

    The function builds the XML directly from the database,
    without using the standard formatting process.

    'format' allows to define the flavour of XML:
        - 'xm' for standard XML
        - 'marcxml' for MARC XML
        - 'oai_dc' for OAI Dublin Core
        - 'xd' for XML Dublin Core

    If record does not exist, returns empty string.
    If the record is deleted, returns an empty MARCXML (with recid
    controlfield, OAI ID fields and 980__c=DELETED)

    All record values, including OAI identifiers and language codes, are
    XML-escaped, and all record/format values reach the database as query
    parameters rather than being interpolated into the SQL text.

    @param recID: the id of the record to retrieve
    @param format: the format to use
    @param on_the_fly: if False, try to fetch precreated one in database
    @param decompress: the library to use to decompress cache from DB
    @return: the xml string of the record
    """
    from invenio.legacy.search_engine import record_exists

    def get_creation_date(recID, fmt="%Y-%m-%d"):
        "Returns the creation date of the record 'recID'."
        out = ""
        res = run_sql("SELECT DATE_FORMAT(creation_date,%s) FROM bibrec WHERE id=%s", (fmt, recID), 1)
        if res:
            out = res[0][0]
        return out

    def get_modification_date(recID, fmt="%Y-%m-%d"):
        "Returns the date of last modification for the record 'recID'."
        out = ""
        res = run_sql("SELECT DATE_FORMAT(modification_date,%s) FROM bibrec WHERE id=%s", (fmt, recID), 1)
        if res:
            out = res[0][0]
        return out

    out = ""

    # sanity check:
    record_exist_p = record_exists(recID)
    if record_exist_p == 0:  # doesn't exist
        return out

    # print record opening tags, if needed:
    if format == "marcxml" or format == "oai_dc":
        out += " <record>\n"
        out += " <header>\n"
        for identifier in get_fieldvalues(recID, CFG_OAI_ID_FIELD):
            out += " <identifier>%s</identifier>\n" % encode_for_xml(identifier)
        out += " <datestamp>%s</datestamp>\n" % get_modification_date(recID)
        out += " </header>\n"
        out += " <metadata>\n"

    if format.startswith("xm") or format == "marcxml":
        res = None
        if on_the_fly is False:
            # look for cached format existence; `format` comes from the
            # caller, so it is passed as a parameter, never interpolated:
            res = run_sql("SELECT value FROM bibfmt WHERE "
                          "id_bibrec=%s AND format=%s", (recID, format), 1)
        if res and record_exist_p == 1:
            # record 'recID' is formatted in 'format', so print it
            out += "%s" % decompress(res[0][0])
        else:
            # record 'recID' is not formatted in 'format' -- they are
            # not in "bibfmt" table; so fetch all the data from
            # "bibXXx" tables:
            if format == "marcxml":
                out += """ <record xmlns="http://www.loc.gov/MARC21/slim">\n"""
                out += " <controlfield tag=\"001\">%d</controlfield>\n" % int(recID)
            elif format.startswith("xm"):
                out += """ <record>\n"""
                out += " <controlfield tag=\"001\">%d</controlfield>\n" % int(recID)
            if record_exist_p == -1:
                # deleted record, so display only OAI ID and 980:
                oai_ids = get_fieldvalues(recID, CFG_OAI_ID_FIELD)
                if oai_ids:
                    out += "<datafield tag=\"%s\" ind1=\"%s\" ind2=\"%s\"><subfield code=\"%s\">%s</subfield></datafield>\n" % \
                           (CFG_OAI_ID_FIELD[0:3],
                            CFG_OAI_ID_FIELD[3:4],
                            CFG_OAI_ID_FIELD[4:5],
                            CFG_OAI_ID_FIELD[5:6],
                            encode_for_xml(oai_ids[0]))
                out += "<datafield tag=\"980\" ind1=\" \" ind2=\" \"><subfield code=\"c\">DELETED</subfield></datafield>\n"
                from invenio.legacy.search_engine import get_merged_recid
                merged_recid = get_merged_recid(recID)
                if merged_recid:
                    # record was deleted but merged to other record, so
                    # display this information:
                    out += "<datafield tag=\"970\" ind1=\" \" ind2=\" \"><subfield code=\"d\">%d</subfield></datafield>\n" % merged_recid
            else:
                # controlfields
                query = "SELECT b.tag,b.value,bb.field_number FROM bib00x AS b, bibrec_bib00x AS bb "\
                        "WHERE bb.id_bibrec=%s AND b.id=bb.id_bibxxx AND b.tag LIKE %s "\
                        "ORDER BY bb.field_number, b.tag ASC"
                res = run_sql(query, (recID, '00%'))
                for row in res:
                    field, value = row[0], row[1]
                    value = encode_for_xml(value)
                    out += """ <controlfield tag="%s">%s</controlfield>\n""" % \
                           (encode_for_xml(field[0:3]), value)
                # datafields
                i = 1  # Do not process bib00x and bibrec_bib00x, as
                       # they are controlfields. So start at bib01x and
                       # bibrec_bib00x (and set i = 0 at the end of
                       # first loop)
                for digit1 in range(0, 10):
                    for digit2 in range(i, 10):
                        bx = "bib%d%dx" % (digit1, digit2)
                        bibx = "bibrec_bib%d%dx" % (digit1, digit2)
                        # Table names cannot be bound as parameters; they
                        # are built from loop digits only.  The doubled
                        # %%s survive this formatting as %s placeholders
                        # for run_sql.
                        query = "SELECT b.tag,b.value,bb.field_number FROM %s AS b, %s AS bb "\
                                "WHERE bb.id_bibrec=%%s AND b.id=bb.id_bibxxx AND b.tag LIKE %%s "\
                                "ORDER BY bb.field_number, b.tag ASC" % (bx, bibx)
                        tag_pattern = "%d%d%%" % (digit1, digit2)
                        res = run_sql(query, (recID, tag_pattern))
                        field_number_old = -999
                        field_old = ""
                        for row in res:
                            field, value, field_number = row[0], row[1], row[2]
                            ind1, ind2 = field[3], field[4]
                            if ind1 == "_" or ind1 == "":
                                ind1 = " "
                            if ind2 == "_" or ind2 == "":
                                ind2 = " "
                            # print field tag
                            if field_number != field_number_old or \
                                   field[:-1] != field_old[:-1]:
                                if field_number_old != -999:
                                    out += """ </datafield>\n"""
                                out += """ <datafield tag="%s" ind1="%s" ind2="%s">\n""" % \
                                       (encode_for_xml(field[0:3]),
                                        encode_for_xml(ind1),
                                        encode_for_xml(ind2))
                                field_number_old = field_number
                                field_old = field
                            # print subfield value
                            value = encode_for_xml(value)
                            out += """ <subfield code="%s">%s</subfield>\n""" % \
                                   (encode_for_xml(field[-1:]), value)

                        # all fields/subfields printed in this run, so close the tag:
                        if field_number_old != -999:
                            out += """ </datafield>\n"""
                    i = 0  # Next loop should start looking at bib%0 and bibrec_bib00x
            # we are at the end of printing the record:
            out += " </record>\n"

    elif format == "xd" or format == "oai_dc":
        # XML Dublin Core format, possibly OAI -- select only some bibXXx fields:
        out += """ <dc xmlns="http://purl.org/dc/elements/1.1/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://purl.org/dc/elements/1.1/ http://www.openarchives.org/OAI/1.1/dc.xsd">\n"""
        # a deleted record (-1) gets an empty <dc> element
        if record_exist_p != -1:
            for f in get_fieldvalues(recID, "041__a"):
                out += " <language>%s</language>\n" % encode_for_xml(f)

            for f in get_fieldvalues(recID, "100__a"):
                out += " <creator>%s</creator>\n" % encode_for_xml(f)

            for f in get_fieldvalues(recID, "700__a"):
                out += " <creator>%s</creator>\n" % encode_for_xml(f)

            for f in get_fieldvalues(recID, "245__a"):
                out += " <title>%s</title>\n" % encode_for_xml(f)

            for f in get_fieldvalues(recID, "65017a"):
                out += " <subject>%s</subject>\n" % encode_for_xml(f)

            for f in get_fieldvalues(recID, "8564_u"):
                out += " <identifier>%s</identifier>\n" % encode_for_xml(f)

            for f in get_fieldvalues(recID, "520__a"):
                out += " <description>%s</description>\n" % encode_for_xml(f)

            out += " <date>%s</date>\n" % get_creation_date(recID)
        out += " </dc>\n"

    # print record closing tags, if needed:
    if format == "marcxml" or format == "oai_dc":
        out += " </metadata>\n"
        out += " </record>\n"

    return out
def encode_for_marcxml(value):
    """Return `value` escaped for use inside MARCXML.

    A unicode value is first encoded to UTF-8; the result is converted
    with str() and passed to encode_for_xml.
    """
    from invenio.utils.text import encode_for_xml

    raw = value.encode('utf8') if isinstance(value, unicode) else value
    return encode_for_xml(str(raw))
def document_upload(req=None, folder="", matching="", mode="", exec_date="", exec_time="", ln=CFG_SITE_LANG, priority="1", email_logs_to=None):
    """
    Take files from the given directory and upload them with the
    appropiate mode.

    @param req: request object; when given, it is used for the
                authorization check and to find the submitting user
    @param folder: Folder where the files to upload are stored
    @param matching: How to match file names with record fields
                     (report number, barcode,...)
    @param mode: Upload mode (append, revise, replace)
    @param exec_date: date at which the task should run ("" = not set)
    @param exec_time: time at which the task should run
    @param ln: language used for the error messages
    @param priority: priority passed to bibupload
    @param email_logs_to: address passed to bibupload --email-logs-to

    @return: tuple (errors, info)
             errors: list of (file name, error description) tuples; a
                     failed move to DONE/ is recorded as the bare string
                     'MoveError'
             info: [number of files read, list of submitted file names]
             Possible error descriptions:
                1 - More than one possible recID, ambiguous behaviour
                2 - No records match that file name
                3 - File already exists
                4 - A file with the same name and format already exists
    """
    import shutil
    from invenio.legacy.bibdocfile.api import BibRecDocs, file_strip_ext
    from invenio.utils.hash import md5
    from invenio.legacy.search_engine import search_pattern, \
        guess_collection_of_a_record

    _ = gettext_set_language(ln)
    errors = []
    info = [0, []]  # Number of files read, name of the files
    try:
        files = os.listdir(folder)
    except OSError as error:
        errors.append(("", error))
        return errors, info
    err_desc = {
        1: _("More than one possible recID, ambiguous behaviour"),
        2: _("No records match that file name"),
        3: _("File already exists"),
        4: _("A file with the same name and format already exists"),
    }
    # Create directory DONE/ if doesn't exist
    folder = folder if folder.endswith("/") else folder + "/"
    files_done_dir = folder + "DONE/"
    try:
        os.mkdir(files_done_dir)
    except OSError:
        # Directory exists or no write permission
        pass

    for docfile in files:
        source_path = os.path.join(folder, docfile)
        if not os.path.isfile(source_path):
            continue
        info[0] += 1
        identifier = file_strip_ext(docfile)
        extension = docfile[len(identifier):]
        rec_id = None
        if identifier:
            rec_id = search_pattern(p=identifier, f=matching, m='e')
        if not rec_id:
            errors.append((docfile, err_desc[2]))
            continue
        elif len(rec_id) > 1:
            errors.append((docfile, err_desc[1]))
            continue
        rec_id = str(list(rec_id)[0])

        rec_info = BibRecDocs(rec_id)
        if rec_info.bibdocs:
            # The checksum does not depend on the bibdoc: read the file
            # once and close it right away.
            with open(source_path, "rb") as source_file:
                file_md5 = md5(source_file.read()).hexdigest()
            # Count errors before looking at any bibdoc, so a clash found
            # in an earlier bibdoc is not forgotten by a later one.
            num_errors = len(errors)
            for bibdoc in rec_info.bibdocs:
                for attached_file in bibdoc.list_all_files():
                    if attached_file.checksum == file_md5:
                        errors.append((docfile, err_desc[3]))
                        break
                    elif attached_file.get_full_name() == docfile:
                        errors.append((docfile, err_desc[4]))
                        break
            if len(errors) > num_errors:
                continue

        # Check if user has rights to upload file
        if req is not None:
            file_collection = guess_collection_of_a_record(int(rec_id))
            auth_code, dummy_message = acc_authorize_action(
                req, 'runbatchuploader', collection=file_collection)
            if auth_code != 0:
                error_msg = _("No rights to upload to collection '%(x_name)s'",
                              x_name=file_collection)
                errors.append((docfile, error_msg))
                continue

        # Move document to be uploaded to temporary folder
        (fd, tmp_file) = tempfile.mkstemp(
            prefix=identifier + "_" + time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_",
            suffix=extension, dir=CFG_TMPSHAREDDIR)
        # Only the path is needed: shutil.copy reopens the file, so the
        # descriptor returned by mkstemp must be closed here.
        os.close(fd)
        shutil.copy(source_path, tmp_file)

        # Create MARC temporary file with FFT tag and call bibupload
        marc_content = (
            ' <record>'
            ' <controlfield tag="001">%(rec_id)s</controlfield>'
            ' <datafield tag="FFT" ind1=" " ind2=" ">'
            ' <subfield code="n">%(name)s</subfield>'
            ' <subfield code="a">%(path)s</subfield>'
            ' </datafield>'
            ' </record> '
        ) % {
            'rec_id': rec_id,
            'name': encode_for_xml(identifier),
            'path': encode_for_xml(tmp_file),
        }
        (fd, filename) = tempfile.mkstemp(prefix=identifier + '_',
                                          dir=CFG_TMPSHAREDDIR)
        with os.fdopen(fd, 'w') as filedesc:
            filedesc.write(marc_content)
        info[1].append(docfile)

        nickname = None
        if req is not None:
            nickname = collect_user_info(req)['nickname']
        user = nickname or "******"
        # The history row keeps the raw nickname when a request is
        # available; without a request there is no user info, so the
        # task user is recorded instead.
        history_user = nickname if req is not None else user

        # Execute bibupload with the appropiate mode
        task_arguments = ('bibupload', user, "--" + mode,
                          "--priority=" + priority, "-N", "batchupload")
        if exec_date:
            date = '--runtime=' + "\'" + exec_date + ' ' + exec_time + "\'"
            task_arguments += (date, )
        if email_logs_to:
            task_arguments += ("--email-logs-to", email_logs_to)
        task_arguments += (filename, )
        jobid = task_low_level_submission(*task_arguments)

        # write batch upload history
        if exec_date != "":
            execdate = exec_date + ' ' + exec_time
        else:
            execdate = time.strftime("%Y-%m-%d %H:%M:%S")
        run_sql("""INSERT INTO hstBATCHUPLOAD (user, submitdate, filename, execdate, id_schTASK, batch_mode) VALUES (%s, NOW(), %s, %s, %s, "document")""",
                (history_user, docfile, execdate, str(jobid)))

        # Move file to DONE folder
        done_filename = docfile + "_" + time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_" + str(jobid)
        try:
            os.rename(source_path, os.path.join(files_done_dir, done_filename))
        except OSError:
            errors.append('MoveError')
    return errors, info
def assemble_caption(begin_line, begin_index, end_line, end_index, lines):
    """
    Take information about the caption of a picture and put it all together
    in a nice way.  If it spans multiple lines, put it on one line.  If it
    contains characters not allowed in MARCXML, escape them.  If it has a
    label tag, get rid of it.

    @param: begin_line (int): the index of the line where the caption begins
    @param: begin_index (int): the index within the line where the caption
        begins
    @param: end_line (int): the index of the line where the caption ends
    @param: end_index (int): the index within the line where the caption ends
    @param: lines ([string, string, ...]): the line strings of the text

    @return: caption (string): the caption, nicely formatted and pieced
        together
    """
    # stuff we don't like
    label_head = '\\label{'

    # reassemble that sucker
    if end_line > begin_line:
        # our caption spanned multiple lines
        pieces = [lines[begin_line][begin_index:]]
        pieces.extend(lines[begin_line + 1:end_line])
        pieces.append(lines[end_line][:end_index])
        caption = ' '.join(pieces)
        caption = caption.replace('\n', ' ')
        # joining lines that end in a newline leaves doubled spaces
        caption = caption.replace('  ', ' ')
    else:
        # it fit on one line
        caption = lines[begin_line][begin_index:end_index]

    # clean out a label tag, if there is one
    label_begin = caption.find(label_head)
    if label_begin > -1:
        # we know that our caption is only one line, so if there's a label
        # tag in it, it will be all on one line.  so we make up some args
        dummy_start, dummy_start_line, label_end, dummy_end = \
            find_open_and_close_braces(0, label_begin, '{', [caption])
        caption = caption[:label_begin] + caption[label_end + 1:]

    # clean out characters not allowed in MARCXML
    # not allowed: & < >
    try:
        caption = wash_for_utf8(caption)
        caption = encode_for_xml(caption.encode('utf-8', 'xmlcharrefreplace'),
                                 wash=True)
    except Exception:
        # that damn encode thing threw an error on astro-ph/0601014
        sys.stderr.write(caption)
        sys.stderr.write(' cannot be processed\n')
        # Fall back to escaping by hand; '&' goes first so the entities
        # produced for '<' and '>' are not escaped a second time.
        caption = caption.replace('&', '&amp;').replace('<', '&lt;')
        caption = caption.replace('>', '&gt;')

    caption = caption.strip()

    # drop one pair of braces wrapping the whole caption
    if len(caption) > 1 and caption[0] == '{' and caption[-1] == '}':
        caption = caption[1:-1]

    return caption
def document_upload(req=None, folder="", matching="", mode="", exec_date="", exec_time="", ln=CFG_SITE_LANG, priority="1", email_logs_to=None):
    """
    Take files from the given directory and upload them with the
    appropiate mode.

    @param req: request object; when given, it is used for the
                authorization check and to find the submitting user
    @param folder: Folder where the files to upload are stored
    @param matching: How to match file names with record fields
                     (report number, barcode,...)
    @param mode: Upload mode (append, revise, replace)
    @param exec_date: date at which the task should run ("" = not set)
    @param exec_time: time at which the task should run
    @param ln: language used for the error messages
    @param priority: priority passed to bibupload
    @param email_logs_to: address passed to bibupload --email-logs-to

    @return: tuple (errors, info)
             errors: list of (file name, error description) tuples; a
                     failed move to DONE/ is recorded as the bare string
                     'MoveError'
             info: [number of files read, list of submitted file names]
             Possible error descriptions:
                1 - More than one possible recID, ambiguous behaviour
                2 - No records match that file name
                3 - File already exists
                4 - A file with the same name and format already exists
    """
    import shutil
    from invenio.legacy.bibdocfile.api import BibRecDocs, file_strip_ext
    from invenio.utils.hash import md5
    from invenio.legacy.search_engine import search_pattern, \
        guess_collection_of_a_record

    _ = gettext_set_language(ln)
    errors = []
    info = [0, []]  # Number of files read, name of the files
    try:
        files = os.listdir(folder)
    except OSError as error:
        errors.append(("", error))
        return errors, info
    err_desc = {
        1: _("More than one possible recID, ambiguous behaviour"),
        2: _("No records match that file name"),
        3: _("File already exists"),
        4: _("A file with the same name and format already exists"),
    }
    # Create directory DONE/ if doesn't exist
    folder = folder if folder.endswith("/") else folder + "/"
    files_done_dir = folder + "DONE/"
    try:
        os.mkdir(files_done_dir)
    except OSError:
        # Directory exists or no write permission
        pass

    for docfile in files:
        source_path = os.path.join(folder, docfile)
        if not os.path.isfile(source_path):
            continue
        info[0] += 1
        identifier = file_strip_ext(docfile)
        extension = docfile[len(identifier):]
        rec_id = None
        if identifier:
            rec_id = search_pattern(p=identifier, f=matching, m='e')
        if not rec_id:
            errors.append((docfile, err_desc[2]))
            continue
        elif len(rec_id) > 1:
            errors.append((docfile, err_desc[1]))
            continue
        rec_id = str(list(rec_id)[0])

        rec_info = BibRecDocs(rec_id)
        if rec_info.bibdocs:
            # The checksum does not depend on the bibdoc: read the file
            # once and close it right away.
            with open(source_path, "rb") as source_file:
                file_md5 = md5(source_file.read()).hexdigest()
            # Count errors before looking at any bibdoc, so a clash found
            # in an earlier bibdoc is not forgotten by a later one.
            num_errors = len(errors)
            for bibdoc in rec_info.bibdocs:
                for attached_file in bibdoc.list_all_files():
                    if attached_file.checksum == file_md5:
                        errors.append((docfile, err_desc[3]))
                        break
                    elif attached_file.get_full_name() == docfile:
                        errors.append((docfile, err_desc[4]))
                        break
            if len(errors) > num_errors:
                continue

        # Check if user has rights to upload file
        if req is not None:
            file_collection = guess_collection_of_a_record(int(rec_id))
            auth_code, dummy_message = acc_authorize_action(
                req, 'runbatchuploader', collection=file_collection)
            if auth_code != 0:
                error_msg = _("No rights to upload to collection '%(x_name)s'",
                              x_name=file_collection)
                errors.append((docfile, error_msg))
                continue

        # Move document to be uploaded to temporary folder
        (fd, tmp_file) = tempfile.mkstemp(
            prefix=identifier + "_" + time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_",
            suffix=extension, dir=CFG_TMPSHAREDDIR)
        # Only the path is needed: shutil.copy reopens the file, so the
        # descriptor returned by mkstemp must be closed here.
        os.close(fd)
        shutil.copy(source_path, tmp_file)

        # Create MARC temporary file with FFT tag and call bibupload
        marc_content = (
            ' <record>'
            ' <controlfield tag="001">%(rec_id)s</controlfield>'
            ' <datafield tag="FFT" ind1=" " ind2=" ">'
            ' <subfield code="n">%(name)s</subfield>'
            ' <subfield code="a">%(path)s</subfield>'
            ' </datafield>'
            ' </record> '
        ) % {
            'rec_id': rec_id,
            'name': encode_for_xml(identifier),
            'path': encode_for_xml(tmp_file),
        }
        (fd, filename) = tempfile.mkstemp(prefix=identifier + '_',
                                          dir=CFG_TMPSHAREDDIR)
        with os.fdopen(fd, 'w') as filedesc:
            filedesc.write(marc_content)
        info[1].append(docfile)

        nickname = None
        if req is not None:
            nickname = collect_user_info(req)['nickname']
        user = nickname or "******"
        # The history row keeps the raw nickname when a request is
        # available; without a request there is no user info, so the
        # task user is recorded instead.
        history_user = nickname if req is not None else user

        # Execute bibupload with the appropiate mode
        task_arguments = ('bibupload', user, "--" + mode,
                          "--priority=" + priority, "-N", "batchupload")
        if exec_date:
            date = '--runtime=' + "\'" + exec_date + ' ' + exec_time + "\'"
            task_arguments += (date, )
        if email_logs_to:
            task_arguments += ("--email-logs-to", email_logs_to)
        task_arguments += (filename, )
        jobid = task_low_level_submission(*task_arguments)

        # write batch upload history
        if exec_date != "":
            execdate = exec_date + ' ' + exec_time
        else:
            execdate = time.strftime("%Y-%m-%d %H:%M:%S")
        run_sql("""INSERT INTO hstBATCHUPLOAD (user, submitdate, filename, execdate, id_schTASK, batch_mode) VALUES (%s, NOW(), %s, %s, %s, "document")""",
                (history_user, docfile, execdate, str(jobid)))

        # Move file to DONE folder
        done_filename = docfile + "_" + time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_" + str(jobid)
        try:
            os.rename(source_path, os.path.join(files_done_dir, done_filename))
        except OSError:
            errors.append('MoveError')
    return errors, info