def check_records(records):
    """Rebuild the author fields (100/700) of Springer records.

    For each record whose attached XML file parses as JATS or NLM, the
    existing 100/700 fields are dropped and re-added from the authors
    extracted from the XML.  Records whose XML fails to parse get a
    warning and are left untouched.
    """
    for record in records:
        if is_springer(record):
            rec_doc = BibRecDocs(int(record.record_id))
            rec_docs = rec_doc.list_latest_files()
            for doc in rec_docs:
                if doc.get_format() == '.xml':
                    # NOTE(review): file handle is never closed — relies on
                    # refcounting; consider a with-block. TODO confirm.
                    f = open(doc.get_full_path())
                    content = f.read()
                    try:
                        # Drop existing author fields; mark record amended.
                        del record['100']
                        del record['700']
                        record.amended = True
                    except:
                        # Best effort: fields may be absent already.
                        pass
                    first_author = True
                    try:
                        # JATS DTDs announce themselves in the doctype;
                        # anything else is handled by the NLM parser.
                        if "-//NLM//DTD JATS" in content:
                            jats = JATSParser()
                            authors = jats.get_authors(parseString(content))
                        else:
                            app = NLMParser()
                            authors = app.get_authors(parseString(content))
                    except:
                        record.warn('Problem with parsing XML.')
                        continue
                    for author in authors:
                        if author.get('surname'):
                            subfields = [('a', '%s, %s' % (author.get('surname'), author.get('given_name') or author.get('initials', '')))]
                        else:
                            # No surname: fall back to the unstructured name.
                            subfields = [('a', '%s' % (author.get('name', '')))]
                        if 'orcid' in author:
                            subfields.append(('j', author['orcid']))
                        if 'affiliation' in author:
                            for aff in author["affiliation"]:
                                subfields.append(('v', aff))
                            # Derive nation subfields from the affiliations.
                            add_nations_field(subfields)
                        if author.get('email'):
                            subfields.append(('m', author['email']))
                        if first_author:
                            record.add_field('100__', value='', subfields=subfields)
                            first_author = False
                        else:
                            record.add_field('700__', value='', subfields=subfields)
def get_rawtext_from_record_id(record_id):
    """Return the extracted fulltext of the latest PDF of a record.

    @param record_id: ID of the record to read
    @return: the extracted text, or '' when no text can be retrieved
    """
    bibrec = BibRecDocs(record_id)
    bibdoc = get_latest_pdf(bibrec.list_latest_files())
    try:
        rawtext = bibdoc.bibdoc.get_text()
    # Was a bare `except:`, which also swallowed SystemExit and
    # KeyboardInterrupt; keep the best-effort behaviour but only for
    # genuine errors (e.g. no text file, no PDF attached at all).
    except Exception:
        return ''
    return rawtext
def format_element(bfo, width="", caption="yes", max="-1"):
    """
    Display image of the plot if we are in selected plots collections

    @param width: the width of the returned image (Eg: '100px')
    @param caption: display the captions or not?
    @param max: the maximum number of plots to display (-1 is all plots)
    """
    ## To achieve this, we take the pngs associated with this document
    img_files = []
    # `max` is kept in the signature for backward compatibility with
    # existing templates; avoid shadowing the builtin internally.
    max_plots = int(max)
    bibarchive = BibRecDocs(bfo.recID)
    if width != "":
        width = 'width="%s"' % width
    for doc in bibarchive.list_bibdocs():
        for _file in doc.list_latest_files():
            if _file.get_type() == "Plot":
                try:
                    # Description is "NNNNNcaption": 5-digit index + text.
                    caption_text = _file.get_description()[5:]
                    index = int(_file.get_description()[:5])
                    img_location = _file.get_url()
                except Exception:
                    # FIXME: we have hit probably a plot context file,
                    # so ignore this document; but it would be safer
                    # to check subformat type, so that we don't mask
                    # other eventual errors here.
                    continue
                img = '<img src="%s" title="%s" %s/>' % \
                    (img_location, caption_text, width)
                link = create_html_link(
                    urlbase='%s/record/%s/plots#%d' % (CFG_SITE_URL,
                                                       bfo.recID, index),
                    urlargd={},
                    link_label=img)
                img_files.append((index, link))
    # Sort plots by their numeric index, then apply the display limit.
    img_files = sorted(img_files, key=lambda x: x[0])
    if max_plots > 0:
        img_files = img_files[:max_plots]
    if not img_files:
        return ''
    links = [link for (_dummy, link) in img_files]
    # Fixed invalid inline CSS: was `width=100%`, which browsers ignore;
    # CSS declarations use `property:value`.
    return '<div style="overflow-x:scroll;width:100%;white-space:nowrap">' + \
        " ".join(links) + '</div>'
def get_filetypes(recid):
    """
    Returns filetypes extensions associated with given record.

    Takes as a parameter the recid of a record.
    @param recid: recid of a record
    """
    latest_files = BibRecDocs(recid).list_latest_files()
    return [_get_filetype(docfile.format) for docfile in latest_files]
def Add_Files(parameters, curdir, form, user_info=None):
    """DEPRECATED: Use FFT instead.

    Attach every file found in `<curdir>/files` to the current record
    (global `sysno`) as a 'Main' document, skipping files that already
    exist in the record.
    """
    files_dir = "%s/files" % curdir
    if os.path.exists(files_dir):
        bibrecdocs = BibRecDocs(sysno)
        # Renamed loop variable: `file` shadowed the builtin.
        for filename in os.listdir(files_dir):
            fullpath = "%s/files/%s" % (curdir, filename)
            if not bibrecdocs.check_file_exists(fullpath):
                bibrecdocs.add_new_file(fullpath, "Main", never_fail=True)
    return ""
def get_pdf(recid):
    """Return (checksum, url, format) of the first PDF of a record.

    Prefers the plain '.pdf' format over '.pdf;pdfa' for each document.
    Returns (None, None, None) when the record has no PDF at all.
    """
    for bibdoc in BibRecDocs(recid).list_bibdocs():
        for fmt in ('.pdf', '.pdf;pdfa'):
            if bibdoc.format_already_exists_p(fmt):
                docfile = bibdoc.get_file(fmt)
                return docfile.checksum, docfile.url, fmt
    return None, None, None
def list_pdfs(recid):
    """Yield every PDF-like docfile ('pdf', 'pdfa', 'PDF') of a record."""
    for document in BibRecDocs(recid).list_bibdocs():
        for extension in ('pdf', 'pdfa', 'PDF'):
            try:
                yield document.get_file(extension)
            except InvenioBibDocFileError:
                # This document has no file with that extension.
                continue
def dump_bibdoc(recid, from_date, **kwargs):
    """Dump all BibDoc metadata of a record.

    :param recid: record ID whose BibDocs should be dumped
    :param from_date: Dump only BibDoc revisions newer than this date
        ('%Y-%m-%d %H:%M:%S').
    :returns: List of versions of the BibDocs, each formatted as a dict
    """
    BibRecDocs, BibDoc = _import_bibdoc()
    bibdocfile_dump = []
    date = datetime.datetime.strptime(from_date, '%Y-%m-%d %H:%M:%S')
    for bibdoc in BibRecDocs(recid).list_bibdocs():
        for version in bibdoc.list_versions():
            bibdoc_version = bibdoc.list_version_files(version)
            for f in bibdoc_version:
                if f.is_icon() or f.md < date:
                    # Don't care about icons
                    # Don't care about files not modified since from_date
                    continue
                bibdocfile_dump.append(
                    dict(
                        bibdocid=f.get_bibdocid(),
                        checksum=f.get_checksum(),
                        comment=f.get_comment(),
                        # Optional accessors: older BibDocFile versions
                        # lack copyright/license getters.
                        copyright=(f.get_copyright() if hasattr(
                            f, 'get_copyright') else None),
                        creation_date=datetime_toutc(f.cd).isoformat(),
                        description=f.get_description(),
                        encoding=f.encoding,
                        etag=f.etag,
                        flags=f.flags,
                        format=f.get_format(),
                        full_name=f.get_full_name(),
                        full_path=f.get_full_path(),
                        hidden=f.hidden,
                        license=(f.get_license() if hasattr(f, 'get_license')
                                 else None),
                        modification_date=datetime_toutc(f.md).isoformat(),
                        name=f.get_name(),
                        mime=f.mime,
                        path=f.get_path(),
                        recid=f.get_recid(),
                        recids_doctype=f.recids_doctypes,
                        size=f.get_size(),
                        status=f.get_status(),
                        subformat=f.get_subformat(),
                        superformat=f.get_superformat(),
                        type=f.get_type(),
                        url=f.get_url(),
                        version=f.get_version(),
                    ))
    return bibdocfile_dump
def get_pdf_snippets(recID, patterns):
    """
    Extract text snippets around 'patterns' from the newest PDF file of
    'recID'.  The search is case-insensitive.
    The snippets are meant to look like in the results of the popular search
    engine: using " ... " between snippets.
    For empty patterns it returns ""

    @param recID: record ID to consider
    @param patterns: list of patterns to retrieve
    @return: snippet HTML (empty string when nothing matches)
    """
    from invenio.bibdocfile import BibRecDocs
    text_path = ""
    text_path_courtesy = ""
    for bd in BibRecDocs(recID).list_bibdocs():
        if bd.get_text():
            text_path = bd.get_text_path()
            text_path_courtesy = bd.get_status()
            if CFG_INSPIRE_SITE and not text_path_courtesy:
                # get courtesy from doctype, since docstatus was empty:
                text_path_courtesy = bd.get_type()
                if text_path_courtesy == 'INSPIRE-PUBLIC':
                    # but ignore 'INSPIRE-PUBLIC' doctype
                    text_path_courtesy = ''
            break  # stop at the first good PDF textable file
    nb_chars = CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS.get('', 0)
    # Bug fix: the default snippet count was read from the *CHARS* table;
    # it must come from CFG_WEBSEARCH_FULLTEXT_SNIPPETS like the
    # courtesy-specific value below.
    max_snippets = CFG_WEBSEARCH_FULLTEXT_SNIPPETS.get('', 0)
    if text_path_courtesy in CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS:
        nb_chars = CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS[text_path_courtesy]
    if text_path_courtesy in CFG_WEBSEARCH_FULLTEXT_SNIPPETS:
        max_snippets = CFG_WEBSEARCH_FULLTEXT_SNIPPETS[text_path_courtesy]
    if text_path and nb_chars and max_snippets:
        out = get_text_snippets(text_path, patterns, nb_chars, max_snippets)
        if not out:
            # no hit, so check stemmed versions:
            from invenio.bibindex_engine_stemmer import stem
            stemmed_patterns = [stem(p, 'en') for p in patterns]
            out = get_text_snippets(text_path, stemmed_patterns, nb_chars,
                                    max_snippets)
        if out:
            out_courtesy = ""
            if text_path_courtesy:
                out_courtesy = '<strong>Snippets courtesy of ' + \
                    text_path_courtesy + '</strong><br>'
            return """<div class="snippetbox">%s%s</div>""" % (out_courtesy,
                                                               out)
        else:
            return ""
    else:
        return ""
def fix_recid(recid, logfile): """Fix a given recid.""" print "Upgrading record %s ->" % recid, print >> logfile, "Upgrading record %s:" % recid bibrec = BibRecDocs(recid) print >> logfile, bibrec docnames = bibrec.get_bibdoc_names() try: for docname in docnames: print docname, new_bibdocs = bibrec.fix(docname) new_bibdocnames = [bibdoc.get_docname() for bibdoc in new_bibdocs] if new_bibdocnames: print "(created bibdocs: '%s')" % "', '".join(new_bibdocnames), print >> logfile, "(created bibdocs: '%s')" % "', '".join(new_bibdocnames) except InvenioWebSubmitFileError, e: print >> logfile, BibRecDocs(recid) print "%s -> ERROR", e return False
def look_for_fulltext(recid):
    """Look for fulltext pdf (bibdocfile) for a given recid"""
    for document in BibRecDocs(recid).list_bibdocs():
        for docfile in document.list_all_files():
            if docfile.get_format().strip('.') not in ('pdf', 'pdfa', 'PDF'):
                continue
            try:
                yield document, docfile
            except InvenioBibDocFileError:
                # Exception thrown into the generator at the yield point.
                pass
def Add_Files(parameters, curdir, form, user_info=None):
    """DEPRECATED: Use FFT instead."""
    files_dir = "%s/files" % curdir
    if not os.path.exists(files_dir):
        return ""
    bibrecdocs = BibRecDocs(sysno)
    for current_file in os.listdir(files_dir):
        fullpath = "%s/files/%s" % (curdir, current_file)
        dummy, filename, extension = decompose_file(current_file)
        # Normalise the extension to always carry a leading dot.
        if extension and not extension.startswith("."):
            extension = '.' + extension
        if not bibrecdocs.check_file_exists(fullpath, extension):
            bibrecdocs.add_new_file(fullpath, "Main", never_fail=True)
    return ""
def bst_fix_ffts(debug=0):
    """Rebuild FFTs for broken records and upload the corrections.

    @param debug: truthy int/string enables bibupload debug mode
    """
    debug = bool(int(debug))
    corrections = {}
    for recid in get_broken_recids():
        task_sleep_now_if_required(can_stop_too=True)
        write_message("Fixing %s" % recid)
        try:
            last_pdf = get_last_pdf_for_record(BibRecDocs(recid))
            corrections[recid] = build_fft(last_pdf)
        except:
            # Best effort per record; failures are reported to the admin.
            register_exception(alert_admin=True)
    write_message("Uploading corrections")
    bibupload_ffts(corrections, append=True, do_debug=debug,
                   interactive=False)
    return True
def get_pdfa_record(self, path=None):
    """Build a MARCXML upload record attaching the PDF/A from `path`.

    Matches the article's DOI against existing records; when found, the
    record is corrected (001), otherwise a new record carrying the DOI
    (0247) is created.  Returns the MARCXML string.
    """
    from invenio.search_engine import perform_request_search
    xml_doc = self.get_article(path)
    rec = create_record()
    dummy, dummy, dummy, dummy, dummy, dummy, dummy,\
        dummy, doi = self.get_publication_information(xml_doc)
    # Look up an existing, non-deleted record with this DOI.
    recid = perform_request_search(p='0247_a:"%s" AND NOT 980:"DELETED"' % (doi, ))
    if recid:
        record_add_field(rec, '001', controlfield_value=recid[0])
    else:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
        message = ('Adding PDF/A. No paper with this DOI: '
                   '%s. Trying to add it anyway.') % (doi, )
        self.logger.error(message)
    try:
        # Prefer the dedicated PDF/A file; fall back to the plain PDF.
        if exists(join(path, 'main_a-2b.pdf')):
            record_add_field(rec, 'FFT',
                             subfields=[('a', join(path, 'main_a-2b.pdf')),
                                        ('n', 'main'),
                                        ('f', '.pdf;pdfa')])
            self.logger.debug('Adding PDF/A to record: %s' % (doi, ))
        elif exists(join(path, 'main.pdf')):
            record_add_field(rec, 'FFT',
                             subfields=[('a', join(path, 'main.pdf'))])
            message = 'No PDF/A in VTEX package for record: ' + doi
            self.logger.debug(message)
        else:
            message = "Record %s doesn't contain PDF file." % (doi, )
            raise MissingFFTError(message)
    except MissingFFTError:
        message = "Elsevier paper: %s is missing PDF." % (doi, )
        register_exception(alert_admin=True, prefix=message)
        self.logger.warning(message)
    ## copy other formats to bibupload file
    if recid:
        from invenio.bibdocfile import BibRecDocs
        record = BibRecDocs(recid[0])
        for bibfile in record.list_latest_files():
            if bibfile.get_format() != '.pdf;pdfa':
                record_add_field(rec, 'FFT',
                                 subfields=[('a', bibfile.get_full_path()),
                                            ('n', bibfile.get_name()),
                                            ('f', bibfile.get_format())])
    return record_xml_output(rec)
def test_BibRecDocs(self):
    """bibdocfile - BibRecDocs functions"""
    from invenio.bibdocfile import BibRecDocs
    my_bibrecdoc = BibRecDocs(2)
    #add bibdoc
    my_bibrecdoc.add_new_file(CFG_PREFIX + '/lib/webtest/invenio/test.jpg',
                              'Main', 'img_test', False, 'test add new file',
                              'test', '.jpg')
    my_bibrecdoc.add_bibdoc(doctype='Main', docname='file', never_fail=False)
    self.assertEqual(len(my_bibrecdoc.list_bibdocs()), 3)
    my_added_bibdoc = my_bibrecdoc.get_bibdoc('file')
    #add bibdocfile in empty bibdoc
    my_added_bibdoc.add_file_new_version(
        CFG_PREFIX + '/lib/webtest/invenio/test.gif', \
        description='added in empty bibdoc', comment=None, docformat=None,
        flags=['PERFORM_HIDE_PREVIOUS'])
    #propose unique docname
    self.assertEqual(my_bibrecdoc.propose_unique_docname('file'), 'file_2')
    #has docname
    self.assertEqual(my_bibrecdoc.has_docname_p('file'), True)
    #merge 2 bibdocs
    my_bibrecdoc.merge_bibdocs('img_test', 'file')
    self.assertEqual(
        len(my_bibrecdoc.get_bibdoc("img_test").list_all_files()), 2)
    #check file exists
    self.assertEqual(
        my_bibrecdoc.check_file_exists(
            CFG_PREFIX + '/lib/webtest/invenio/test.jpg', '.jpg'), True)
    #get bibdoc names
    # we can not rely on the order !
    names = set([
        my_bibrecdoc.get_bibdoc_names('Main')[0],
        my_bibrecdoc.get_bibdoc_names('Main')[1]
    ])
    self.assertTrue('0104007_02' in names)
    self.assertTrue('img_test' in names)
    #get total size
    self.assertEqual(my_bibrecdoc.get_total_size(), 1647591)
    #get total size latest version
    self.assertEqual(my_bibrecdoc.get_total_size_latest_version(), 1647591)
    #display
    #value = my_bibrecdoc.display(docname='img_test', version='', doctype='', ln='en', verbose=0, display_hidden=True)
    #self.assert_("<small><b>Main</b>" in value)
    #get xml 8564
    value = my_bibrecdoc.get_xml_8564()
    self.assert_('/' + CFG_SITE_RECORD + '/2/files/img_test.jpg</subfield>'
                 in value)
    #check duplicate docnames
    self.assertEqual(my_bibrecdoc.check_duplicate_docnames(), True)
def generate_keywords(req, recid, store_keywords=True):
    """Classify a record's PDFs against the HEP taxonomy.

    Writes a progress message to `req`, extracts keywords from every PDF
    attached to `recid`, optionally stores them as MARCXML in a temp file,
    and returns the list of [keyword, count] pairs.
    """
    req.write(
        "Please be patient while the keywords classification is running...")
    bibdocfiles = BibRecDocs(recid).list_latest_files()
    keywords = []
    for doc in bibdocfiles:
        # Get the keywords for each PDF document contained in the record.
        if is_pdf(doc.get_full_path()):
            fulltext = doc.get_full_path()
            from invenio.bibclassify_engine import get_keywords_from_local_file
            single_keywords, composite_keywords = get_keywords_from_local_file(
                fulltext, taxonomy='HEP', with_author_keywords=True)
            for keyword, spans in single_keywords.items():
                keywords.append([keyword.concept, len(spans)])
            for keyword, num, components in composite_keywords:
                keywords.append([keyword.concept, num])
    if keywords and store_keywords:
        # NOTE(review): single_keywords/composite_keywords below hold only
        # the *last* PDF's results, while `keywords` accumulates all PDFs —
        # confirm whether multi-PDF records should merge before output.
        output = [
            '<collection><record>\n'
            '<controlfield tag="001">%s</controlfield>' % recid
        ]
        output.append(
            output_marc(single_keywords,
                        composite_keywords,
                        spires=False,
                        taxonomy='HEP'))
        output.append('</record></collection>')
        tmp_directory = "%s/bibclassify" % CFG_TMPDIR
        filename = "bibclassifyd_%s.xml" % time.strftime(
            "%Y%m%d%H%M%S", time.localtime())
        abs_path = os.path.join(tmp_directory, filename)
        if not os.path.isdir(tmp_directory):
            os.mkdir(tmp_directory)
        file_desc = open(abs_path, "w")
        file_desc.write('\n'.join(output))
        file_desc.close()
        # Upload deliberately disabled; the MARCXML is only written to disk.
        #cmd = "%s/bibupload -n -c '%s' " % (CFG_BINDIR, abs_path)
        #os.system(cmd)
    return keywords
def filter_fulltexts(recids):
    """ returns list of records having a fulltext of type fulltext_type.
    If fulltext_type is empty, return all records having a fulltext"""
    wanted = dict(recids)
    minimum_timestamp = get_minimum_timestamp()
    query = """SELECT id_bibrec, max(modification_date) FROM bibrec_bibdoc LEFT JOIN bibdoc ON bibrec_bibdoc.id_bibdoc=bibdoc.id GROUP BY id_bibrec"""
    result = []
    for recid, lastmod in run_sql(query):
        if recid not in wanted:
            continue
        # Keep only records that still expose at least one visible file.
        if BibRecDocs(recid).list_latest_files(list_hidden=False):
            result.append((recid, max(lastmod, minimum_timestamp)))
    return result
def format_element(bfo):
    """
    Prints buttons to download all photos for each size
    """
    current_bibrecdoc = BibRecDocs(bfo.recID)
    if len(current_bibrecdoc.bibdocs) < 2:
        # If we have less than 2 photos, there is no point in displaying the
        # "Download all" buttons
        return
    # Inline stylesheet + opening markup for the button strip.
    wrapper = '''<style> #downloadallphotos { clear: both; font-size: small; color: #555444; margin-left: 10px; } #downloadallphotos a { border-radius: 5px; box-shadow: 1px 1px 1px 1px #CCCCCC; color: #222222; display: inline-block; margin: 2px 5px; padding: 3px; text-decoration: none; background-color: #E6E6FA; } #downloadallphotos a:hover { background: -moz-linear-gradient(center top , #3A3A3A 0%, #7D7E7D 100%) repeat scroll 0 0 rgba(0, 0, 0, 0); color: #fff; } </style>'''
    wrapper += '''<div id="downloadallphotos">Download all pictures:'''
    buttons = ''
    # One button per configured subformat size, labelled with total size.
    for (size, internal_size) in CFG_BIBDOCFILE_SUBFORMATS_TRANSLATIONS:
        total = current_bibrecdoc.get_total_size_latest_version(
            bfo.user_info, internal_size)
        # don't display the button if the size will be 0
        if total:
            buttons += '<a %(original)s href="%(site)s/record/%(recID)s/files/allfiles-%(size)s">%(size)s (%(total)s)</a>' \
                % {'original': size == 'original' and 'data-size="Original"' or '',
                   'site': CFG_SITE_URL,
                   'recID': bfo.recID,
                   'size': size,
                   'total': nice_size(total)}
    # If there are no buttons to display, don't display the rest of the HTML
    if buttons:
        return wrapper + buttons
def get_pdf_snippets(recID, patterns,
                     nb_words_around=CFG_WEBSEARCH_FULLTEXT_SNIPPETS_WORDS,
                     max_snippets=CFG_WEBSEARCH_FULLTEXT_SNIPPETS):
    """
    Extract text snippets around 'patterns' from the newest PDF file of
    'recID'.  The search is case-insensitive.
    The snippets are meant to look like in the results of the popular search
    engine: using " ... " between snippets.
    For empty patterns it returns ""

    @param recID: record ID to consider
    @param patterns: list of patterns to retrieve
    @param nb_words_around: max number of words around the matched pattern
    @param max_snippets: max number of snippets to include
    @return: snippet HTML, or "" when no extracted text or no hit
    """
    from invenio.bibdocfile import BibRecDocs
    text_path = ""
    text_path_courtesy = ""
    for bd in BibRecDocs(recID).list_bibdocs():
        if bd.get_text():
            text_path = bd.get_text_path()
            text_path_courtesy = bd.get_status()
            if CFG_INSPIRE_SITE and not text_path_courtesy:
                # get courtesy from doctype, since docstatus was empty:
                text_path_courtesy = bd.get_type()
                if text_path_courtesy == 'INSPIRE-PUBLIC':
                    # but ignore 'INSPIRE-PUBLIC' doctype
                    text_path_courtesy = ''
            break  # stop at the first good PDF textable file
    if text_path:
        out = get_text_snippets(text_path, patterns, nb_words_around,
                                max_snippets)
        if not out:
            # no hit, so check stemmed versions:
            from invenio.bibindex_engine_stemmer import stem
            stemmed_patterns = [stem(p, 'en') for p in patterns]
            out = get_text_snippets(text_path, stemmed_patterns,
                                    nb_words_around, max_snippets, False)
        if out:
            out_courtesy = ""
            if text_path_courtesy:
                out_courtesy = '<strong>Snippets courtesy of ' + \
                    text_path_courtesy + '</strong><br>'
            return """<div class="snippetbox">%s%s</div>""" % (out_courtesy,
                                                               out)
        else:
            return ""
    else:
        return ""
def _analyze_documents(records, ontology, collection):
    """For each collection, parse the documents attached to the records
    in collection with the corresponding ontology.

    Returns MARCXML fragments (one <record> per recid) as a single string,
    or False when `records` is empty.  Updates the global _INDEX counter
    and the bibsched task progress as a side effect.
    """
    global _INDEX
    if not records:
        # No records could be found.
        write_message("WARNING: No record were found in collection %s." %
                      collection, stream=sys.stderr, verbose=2)
        return False
    # Process records:
    output = []
    for record in records:
        bibdocfiles = BibRecDocs(record).list_latest_files()
        output.append('<record>')
        output.append('<controlfield tag="001">%s</controlfield>' % record)
        for doc in bibdocfiles:
            # Get the keywords for each PDF document contained in the record.
            if is_pdf(doc.get_full_path()):
                write_message('INFO: Generating keywords for record %d.' %
                              record, stream=sys.stderr, verbose=3)
                fulltext = doc.get_full_path()
                output.append(
                    output_keywords_for_local_file(
                        fulltext,
                        taxonomy=ontology,
                        output_mode="marcxml",
                        output_limit=3,
                        match_mode="partial",
                        with_author_keywords=True,
                        verbose=task_get_option('verbose')))
        _INDEX += 1
        output.append('</record>')
        task_update_progress('Done %d out of %d.' % (_INDEX, _RECIDS_NUMBER))
        task_sleep_now_if_required(can_stop_too=False)
    return '\n'.join(output)
def get_filenames(recid):
    """
    Returns names of the files associated with specific record and
    their derivatives.  Takes as a parameter the recid of a record.

    Example:
    input: recID 999 (record with files ['thesis.ps.gz', 'random.pdf'])
    output: ['thesis.ps.gz', 'thesis.ps', 'thesis', 'random.pdf', 'random']

    @param recid: recid of a record
    """
    docs = BibRecDocs(recid)
    names = [
        _get_filenames(d.name + d.format) for d in docs.list_latest_files()
    ]
    # Flatten.  The initial value fixes a crash: reduce() without an
    # initializer raises TypeError for records that have no files.
    return reduce(lambda x, y: x + y, names, [])
def upload_fulltext(recid, path):
    '''
    This method save the uploaded file to associated record
    @param recid: id of the record
    @param path: uploaded document to store
    '''
    # upload the file to the record
    bibarchiv = BibRecDocs(recid)
    basename = path.split('/')[-1]
    # docname: filename up to the first dot; doctype: last extension
    # with any ";subformat" suffix stripped.
    docname = basename.split('.')[0]
    doctype = path.split('.')[-1].split(';')[0]
    bibarchiv.add_new_file(path, CFG_DOCTYPE_UPLOAD_COLLECTION, docname,
                           format=doctype)
    return ''
def look_for_fulltext(recid):
    """Return the full path of a PDF attached to `recid`.

    Each document is probed for 'pdf', 'pdfa' and 'PDF' files in that
    order; the path from the last document that has one wins.  Returns
    False when no document has a PDF at all.
    """
    path = False
    for doc in BibRecDocs(recid).list_bibdocs():
        for extension in ('pdf', 'pdfa', 'PDF'):
            try:
                path = doc.get_file(extension).get_full_path()
                break
            except InvenioWebSubmitFileError:
                pass
    return path
def solr_add_range(lower_recid, upper_recid):
    """
    Adds the regarding field values of all records from the lower recid
    to the upper one to Solr.  It preserves the fulltext information.

    Every field is fetched best-effort: a failure to read any of abstract,
    author, fulltext, keyword or title degrades that field to "" rather
    than skipping the record.
    """
    for recid in range(lower_recid, upper_recid + 1):
        if record_exists(recid):
            try:
                abstract = unicode(
                    get_fieldvalues(recid, CFG_MARC_ABSTRACT)[0], 'utf-8')
            except:
                abstract = ""
            try:
                # Concatenate first author with all additional authors.
                first_author = get_fieldvalues(recid, CFG_MARC_AUTHOR_NAME)[0]
                additional_authors = reduce(
                    lambda x, y: x + " " + y,
                    get_fieldvalues(recid, CFG_MARC_ADDITIONAL_AUTHOR_NAME),
                    '')
                author = unicode(first_author + " " + additional_authors,
                                 'utf-8')
            except:
                author = ""
            try:
                bibrecdocs = BibRecDocs(recid)
                fulltext = unicode(bibrecdocs.get_text(), 'utf-8')
            except:
                fulltext = ""
            try:
                keyword = unicode(
                    reduce(lambda x, y: x + " " + y,
                           get_fieldvalues(recid, CFG_MARC_KEYWORD), ''),
                    'utf-8')
            except:
                keyword = ""
            try:
                title = unicode(
                    get_fieldvalues(recid, CFG_MARC_TITLE)[0], 'utf-8')
            except:
                title = ""
            solr_add(recid, abstract, author, fulltext, keyword, title)
    # Commit once per range, not per record.
    SOLR_CONNECTION.commit()
    task_sleep_now_if_required(can_stop_too=True)
def uncook_files(webdeposit_json, recid=None, json_reader=None):
    """Populate webdeposit_json['files'] from either the json_reader URLs
    (no recid) or the latest BibDocFiles of the record (recid given)."""
    webdeposit_json.setdefault('files', [])
    if recid is None:
        # Derive file names from the URLs listed in the reader.
        for entry in json_reader['url']:
            name = entry['url'].split('/')[-1]
            webdeposit_json['files'].append({'name': name})
    else:
        for docfile in BibRecDocs(recid,
                                  human_readable=True).list_latest_files():
            webdeposit_json['files'].append({
                'name': docfile.get_full_name(),
                'file': docfile.get_path(),
                'size': docfile.get_size(),
            })
    return webdeposit_json
def hide_old_ffts():
    """Hide every format of every non-latest revision of each record's
    first document."""
    for recid in perform_request_search(p="", of='intbitset'):
        bibdoc = BibRecDocs(recid).list_bibdocs()[0]
        latest_rev = bibdoc.get_latest_version()
        for revision in range(1, latest_rev):
            # Collect the distinct formats present in this revision.
            formats = []
            for docfile in bibdoc.list_version_files(revision):
                if docfile.format not in formats:
                    formats.append(docfile.format)
            for file_type in formats:
                write_message("Record %s: hiding format %s in revision %s" %
                              (recid, file_type, revision))
                bibdoc.set_flag(CFG_BIBDOCFILE_AVAILABLE_FLAGS[3], file_type,
                                revision)
def get_preferred_posterframe_url(recid, icon_p=True):
    """
    Returns the posteframe that might have been manually uploaded for
    this record.

    @param recid: current record ID
    @param icon_p: if True, return icon version (if exists). Else return
        original image
    @return: URL of the preferred posterframe, of None if does not exist
    """
    posterframes = BibRecDocs(recid).list_bibdocs(doctype='posterframe')
    if not posterframes:
        return None
    if icon_p:
        return posterframes[0].get_icon().get_url()
    # Original image requested: first non-icon file of the first document.
    for docfile in posterframes[0].list_latest_files():
        if not docfile.is_icon():
            return docfile.get_url()
    return None
def solr_add_range(lower_recid, upper_recid, tags_to_index,
                   next_commit_counter):
    """
    Adds the regarding field values of all records from the lower recid
    to the upper one to Solr.  It preserves the fulltext information.

    @param tags_to_index: mapping used by get_field_content_in_utf8
    @param next_commit_counter: running counter for batched Solr commits
    @return: the updated commit counter
    """
    for recid in range(lower_recid, upper_recid + 1):
        if record_exists(recid):
            abstract = get_field_content_in_utf8(recid, 'abstract',
                                                 tags_to_index)
            author = get_field_content_in_utf8(recid, 'author',
                                               tags_to_index)
            keyword = get_field_content_in_utf8(recid, 'keyword',
                                                tags_to_index)
            title = get_field_content_in_utf8(recid, 'title', tags_to_index)
            # Bug fix: leftover blocks from the older MARC-based variant
            # re-fetched abstract/author and blanked the already-correct
            # tag-based values whenever fulltext extraction failed, and the
            # first fulltext assignment was always overwritten anyway.
            # Only the fulltext is fetched best-effort here.
            try:
                fulltext = unicode(get_entire_fulltext(recid), 'utf-8')
            except Exception:
                fulltext = ''
            solr_add(recid, abstract, author, fulltext, keyword, title)
            next_commit_counter = solr_commit_if_necessary(
                next_commit_counter, recid=recid)
    return next_commit_counter
def migrate_bibdoc_status(recid, is_public, access_right):
    """Set a firerole-based status on every document of `recid` derived
    from its visibility and access-right value."""
    from invenio.search_engine import get_fieldvalues
    from invenio.bibdocfile import BibRecDocs

    # Generate firerole
    fft_status = []
    if is_public:
        email = get_fieldvalues(recid, "8560_f")[0]
        if access_right == 'open':
            # Access to everyone
            fft_status = ['allow any', ]
        elif access_right == 'embargoed':
            # Access to submitted, Deny everyone else until embargo date,
            # then allow all
            date = get_fieldvalues(recid, "942__a")[0]
            fft_status = [
                'allow email "%s"' % email,
                'deny until "%s"' % date,
                'allow any',
            ]
        elif access_right in ('closed', 'restricted', ):
            # Access to submitter, deny everyone else
            fft_status = [
                'allow email "%s"' % email,
                'deny all',
            ]
        else:
            # Access to submitter, deny everyone else
            fft_status = None
    if fft_status:
        status_line = "firerole: %s" % "\n".join(fft_status)
        for document in BibRecDocs(recid).list_bibdocs():
            document.set_status(status_line)
def check_records(records):
    """Align the 980__c subfield of Hindawi records with the article
    subject declared in the attached XML.

    Editorial-like subjects overwrite/append 980__c in upper case;
    Review/Research/Retraction subjects are left alone; any other
    subject raises, as it is unknown to the SCOAP3 system.
    """
    for record in records:
        ## Stupid hack because bibcheck filters does not work as expected
        if record_get_field_value(record, '980', code='b') == "Hindawi":
            record.warn("Working on this record")
            recdoc = BibRecDocs(int(record.record_id))
            # First attached document is assumed to carry the article XML.
            doc = recdoc.get_bibdoc(recdoc.get_bibdoc_names()[0])
            try:
                xml_file = open(doc.get_file("xml").get_full_path())
            except:
                record.warn("No document can be found")
                continue
            xml2 = xml.dom.minidom.parseString(xml_file.read())
            subject = get_value_in_tag(xml2, "subject")
            if subject in [
                    "Editorial", "Erratum", "Corrigendum", "Addendum",
                    "Letter to the Editor"
            ]:
                field = record_get_field_value(record, '980', code='c')
                if field:
                    # A 980__c already exists: only amend it when it holds
                    # one of the known editorial-type values.
                    if field in [
                            'ERRATUM', 'ADDENDUM', 'EDITORIAL', 'CORRIGENDUM',
                            'LETTER TO THE EDITOR'
                    ]:
                        for position, value in record.iterfield('980__c'):
                            record.amend_field(position, subject.upper())
                            break
                    else:
                        for position, value in record.iterfield('980__%'):
                            record.add_subfield(position, 'c', subject.upper())
                            break
                else:
                    # No 980__c yet: add it to the first 980 field.
                    for position, value in record.iterfield('980__%'):
                        record.add_subfield(position, 'c', subject.upper())
                        break
            elif subject not in [
                    "Review Article", "Research Article", "Retraction"
            ]:
                raise Exception(
                    "This subject: %s does not exit in SCOAP3 system" %
                    (subject, ))