def get_media_from_recid(recid):
    """Build the list of media dicts for the latest files of a record.

    (The previous docstring claimed this fetched "the file in the given
    url", which does not match the code: it lists the record's files.)

    @param recid: identifier of the record whose latest files are listed
    @return: list of dicts with keys name/file/type/path/collection/
        size/loaded/selected, one per latest bibdocfile
    """
    medias = []
    bibarchiv = BibRecDocs(recid)
    for bibdocfile in bibarchiv.list_latest_files():
        collection = bibdocfile.get_type()
        bibfile = {
            'name': bibdocfile.get_full_name(),
            'file': '',
            # crude MIME guess: "application/" + superformat extension
            'type': 'application/%s' %
                bibdocfile.get_superformat().split(".")[-1],
            'path': bibdocfile.get_full_path(),
            'collection': collection,
            'size': bibdocfile.get_size(),
            'loaded': False,
            # files of the "Main" collection come pre-selected in the UI
            'selected': 'checked=yes' if collection == "Main" else '',
        }
        medias.append(bibfile)
    return medias
def get_media_from_recid(recid):
    """Collect metadata dicts for the latest files attached to a record.

    @param recid: identifier of the record whose files are listed
    @return: list of dicts (name/file/type/path/collection/size/loaded/
        selected) describing each latest bibdocfile
    """
    # NOTE(review): this appears to be a duplicate definition of
    # get_media_from_recid; at import time the later one shadows the earlier.
    medias = []
    bibarchiv = BibRecDocs(recid)
    bibdocs = bibarchiv.list_latest_files()
    for bibdocfile in bibdocs:
        bibfile = {'name': bibdocfile.get_full_name(),
                   'file': '',
                   # crude MIME guess from the superformat extension
                   'type': 'application/%s' % \
                       bibdocfile.get_superformat().split(".")[-1],
                   'path': bibdocfile.get_full_path(),
                   'collection': bibdocfile.get_type(),
                   'size': bibdocfile.get_size(),
                   'loaded': False,
                   'selected': ''}
        # files in the "Main" collection come pre-selected in the UI
        if bibfile['collection'] == "Main":
            bibfile['selected'] = 'checked=yes'
        medias.append(bibfile)
    return medias
def download_one(recid, version):
    """Download given version of the PDF from arxiv.

    @param recid: record whose arXiv identifier(s) are looked up
    @param version: arXiv PDF version to request
    @raise PdfNotAvailable: arXiv served a "PDF unavailable" placeholder
    @raise FoundExistingPdf: the stored PDF already matches the download
    """
    write_message('fetching %s' % recid)
    for count, arxiv_id in enumerate(extract_arxiv_ids_from_recid(recid)):
        if count != 0:
            # only the first arXiv id is processed; extras are reported
            write_message("Warning: %s has multiple arxiv #" % recid)
            continue

        url_for_pdf = build_arxiv_url(arxiv_id, version)
        filename_arxiv_id = arxiv_id.replace('/', '_')
        temp_file = NamedTemporaryFile(prefix="arxiv-pdf-checker",
                                       dir=CFG_TMPSHAREDDIR,
                                       suffix="%s.pdf" % filename_arxiv_id)
        write_message('downloading pdf from %s' % url_for_pdf)
        path = download_external_url(url_for_pdf,
                                     temp_file.name,
                                     content_type='pdf')

        # Check if it is not an html not found page
        filesize = os.path.getsize(path)
        if filesize < 25000:
            # small download: likely an HTML error page — scan for marker
            # (was open()/try/finally; `with` guarantees the close)
            with open(path) as f:
                for line in f:
                    if 'PDF unavailable' in line:
                        raise PdfNotAvailable()

        docs = BibRecDocs(recid)
        bibdocfiles = docs.list_latest_files(doctype="arXiv")

        needs_update = False
        try:
            bibdocfile = bibdocfiles[0]
        except IndexError:
            # no arXiv file stored yet: always attach the new one
            bibdocfile = None
            needs_update = True
        else:
            existing_md5 = calculate_md5(bibdocfile.fullpath)
            # NOTE(review): the path is utf-8 encoded before hashing —
            # presumably calculate_md5 expects a byte filename; confirm.
            new_md5 = calculate_md5(path.encode('utf-8'))
            if new_md5 != existing_md5:
                write_message('md5 differs updating')
                needs_update = True
            else:
                write_message('md5 matches existing pdf, skipping')

        if needs_update:
            if bibdocfiles:
                write_message('adding as new version')
                docs.add_new_version(path, docname=bibdocfile.name)
            else:
                write_message('adding as new file')
                docs.add_new_file(path,
                                  doctype="arXiv",
                                  docname="arXiv:%s" % filename_arxiv_id)
        else:
            raise FoundExistingPdf()
def get_filetypes(recid):
    """Return the filetype extension of every latest file of a record.

    @param recid: identifier of the record to inspect
    @return: list of filetype strings, one per latest attached file
    """
    from invenio.legacy.bibdocfile.api import BibRecDocs

    record_docs = BibRecDocs(recid)
    extensions = []
    for docfile in record_docs.list_latest_files():
        extensions.append(_get_filetype(docfile.format))
    return extensions
def download_one(recid, version):
    """Download given version of the PDF from arxiv.

    @param recid: record whose arXiv identifier(s) are looked up
    @param version: arXiv PDF version to request
    @raise PdfNotAvailable: arXiv served a "PDF unavailable" placeholder
    @raise FoundExistingPdf: the stored PDF already matches the download
    """
    write_message("fetching %s" % recid)
    for count, arxiv_id in enumerate(extract_arxiv_ids_from_recid(recid)):
        if count != 0:
            # only the first arXiv id is processed; extras are reported
            write_message("Warning: %s has multiple arxiv #" % recid)
            continue

        url_for_pdf = build_arxiv_url(arxiv_id, version)
        filename_arxiv_id = arxiv_id.replace("/", "_")
        temp_file = NamedTemporaryFile(
            prefix="arxiv-pdf-checker",
            dir=CFG_TMPSHAREDDIR,
            suffix="%s.pdf" % filename_arxiv_id,
        )
        write_message("downloading pdf from %s" % url_for_pdf)
        path = download_external_url(url_for_pdf, temp_file.name, content_type="pdf")

        # Check if it is not an html not found page
        filesize = os.path.getsize(path)
        if filesize < 25000:
            # small download: likely an HTML error page — scan for marker
            # (was open()/try/finally; `with` guarantees the close)
            with open(path) as f:
                for line in f:
                    if "PDF unavailable" in line:
                        raise PdfNotAvailable()

        docs = BibRecDocs(recid)
        bibdocfiles = docs.list_latest_files(doctype="arXiv")

        needs_update = False
        try:
            bibdocfile = bibdocfiles[0]
        except IndexError:
            # no arXiv file stored yet: always attach the new one
            bibdocfile = None
            needs_update = True
        else:
            existing_md5 = calculate_md5(bibdocfile.fullpath)
            # NOTE(review): the path is utf-8 encoded before hashing —
            # presumably calculate_md5 expects a byte filename; confirm.
            new_md5 = calculate_md5(path.encode("utf-8"))
            if new_md5 != existing_md5:
                write_message("md5 differs updating")
                needs_update = True
            else:
                write_message("md5 matches existing pdf, skipping")

        if needs_update:
            if bibdocfiles:
                write_message("adding as new version")
                docs.add_new_version(path, docname=bibdocfile.name)
            else:
                write_message("adding as new file")
                docs.add_new_file(path, doctype="arXiv", docname="arXiv:%s" % filename_arxiv_id)
        else:
            raise FoundExistingPdf()
def get_files_from_bibdoc(recid):
    """Retrieve, using BibDoc, all the files related to a given record.

    @param recid: record identifier; falsy or negative values yield []
    @return: list of dicts with the BibDoc metadata of each latest file,
        or the empty list when the record has no files attached
    """
    if not recid or recid < 0:
        return []

    from invenio.legacy.bibdocfile.api import BibRecDocs, \
        InvenioBibDocFileError

    try:
        bibrecdocs = BibRecDocs(int(recid))
    except InvenioBibDocFileError:
        # record has no document store: report "no files"
        return []

    # output key -> accessor method name on the bibdocfile object
    accessors = [('comment', 'get_comment'),
                 ('description', 'get_description'),
                 ('eformat', 'get_format'),
                 ('full_name', 'get_full_name'),
                 ('full_path', 'get_full_path'),
                 ('magic', 'get_magic'),
                 ('name', 'get_name'),
                 ('path', 'get_path'),
                 ('size', 'get_size'),
                 ('status', 'get_status'),
                 ('subformat', 'get_subformat'),
                 ('superformat', 'get_superformat'),
                 ('type', 'get_type'),
                 ('url', 'get_url'),
                 ('version', 'get_version')]

    return [dict((key, getattr(afile, getter)())
                 for key, getter in accessors)
            for afile in bibrecdocs.list_latest_files()]
def get_files_from_bibdoc(recid):
    """
    Retrieves using BibDoc all the files related with a given record

    @param recid
    @return List of dictionaries containing all the information stored
            inside BibDoc if the current record has files attached, the
            empty list otherwise
    """
    # NOTE(review): this appears to be a duplicate definition of
    # get_files_from_bibdoc; at import time the later one shadows the earlier.
    # guard: invalid/absent record ids produce an empty result
    if not recid or recid < 0:
        return []

    from invenio.legacy.bibdocfile.api import BibRecDocs, \
        InvenioBibDocFileError
    files = []
    try:
        bibrecdocs = BibRecDocs(int(recid))
    except InvenioBibDocFileError:
        # record has no document store: report "no files"
        return []
    latest_files = bibrecdocs.list_latest_files()
    for afile in latest_files:
        # copy every BibDoc accessor into a plain dict, one key per getter
        file_dict = {}
        file_dict['comment'] = afile.get_comment()
        file_dict['description'] = afile.get_description()
        file_dict['eformat'] = afile.get_format()
        file_dict['full_name'] = afile.get_full_name()
        file_dict['full_path'] = afile.get_full_path()
        file_dict['magic'] = afile.get_magic()
        file_dict['name'] = afile.get_name()
        file_dict['path'] = afile.get_path()
        file_dict['size'] = afile.get_size()
        file_dict['status'] = afile.get_status()
        file_dict['subformat'] = afile.get_subformat()
        file_dict['superformat'] = afile.get_superformat()
        file_dict['type'] = afile.get_type()
        file_dict['url'] = afile.get_url()
        file_dict['version'] = afile.get_version()
        files.append(file_dict)
    return files
def get_record_details(recid, curr_user_email=None):
    """Assemble a REST-API dict describing one record and its files.

    @param recid: identifier of the record to describe
    @param curr_user_email: email used for the open-access restriction
        check; defaults to the currently logged-in user's email
    @return: dict with record_id, files, basic metadata fields, PID,
        checksum, domain and domain_metadata; {} if BibRecDocs fails
    """
    from invenio.legacy.bibdocfile.api import BibRecDocs
    try:
        recdocs = BibRecDocs(recid)
    except Exception:
        # was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit
        # are no longer swallowed; failure is still logged and mapped to {}
        current_app.logger.error("REST API: Error while building BibRecDocs "
                                 "for record %d" % (recid,))
        return {}

    latest_files = recdocs.list_latest_files()
    if len(latest_files) == 0:
        current_app.logger.error("REST API: BibRecDocs reports 0 files "
                                 "for record %d" % (recid,))

    # bibformat uses get_record, usually is one db
    # hit per object; should be fastest
    from invenio.modules.formatter import engine as bibformat_engine
    bfo = bibformat_engine.BibFormatObject(recid)

    # first put the record_id and list of files
    ret = {
        'record_id': recid,
        'files': [{
            'name': afile.get_full_name().decode('utf-8'),
            'size': afile.get_size(),
            'url': afile.get_full_url(),
        } for afile in latest_files
        ],
    }

    if not curr_user_email:
        curr_user_email = current_user['email']

    # add basic metadata fields
    for fieldname in basic_fields_meta:
        if fieldname == "open_access":
            open_access = (read_basic_metadata_field_from_marc(bfo, fieldname)
                           == "open")
            ret[fieldname] = open_access
            if not open_access:
                # closed-access files are only listed for the uploader
                if read_basic_metadata_field_from_marc(
                        bfo, "uploaded_by") != curr_user_email:
                    ret['files'] = "RESTRICTED"
        else:
            ret[fieldname] = read_basic_metadata_field_from_marc(bfo,
                                                                 fieldname)

    # add 'PID' and 'checksum'
    for fx in bfo.fields('0247_'):
        if fx.get('2') in ["PID", "checksum"]:
            ret[fx.get('2')] = fx.get('a')

    # add 'domain'
    domain = read_basic_metadata_field_from_marc(bfo, 'domain')
    ret['domain'] = domain

    # add domain-specific metadata fields
    if domain not in metadata_classes():
        current_app.logger.error("Bad domain metadata class for record %d"
                                 % (recid,))
    else:
        domain_class = metadata_classes()[domain]()
        for fieldset in domain_class.fieldsets:
            if fieldset.name != 'Generic':
                ret['domain_metadata'] = get_domain_metadata(domain_class,
                                                             fieldset,
                                                             bfo)
    return ret
def tarballs_by_recids(recids, sdir, docname=None, doctype=None,
                       docformat=None):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids. By default look for files with names matching
    the report number and with source field 'arXiv'. This can be changed
    with C{docname}, C{doctype}, C{docformat}

    @param: recids (string): the record id or ids
    @param: sdir (string): where the tarballs should live
    @param docname: select tarball for given recid(s) that match docname
    @param doctype: select tarball for given recid(s) that match doctype
    @param docformat: select tarball for given recid(s) that match docformat

    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    if not recids:
        return []

    # expand "a,b,c" lists and "low-high" ranges into individual recids
    list_of_ids = []
    if ',' in recids:
        for token in recids.split(','):
            if '-' in token:
                low, high = token.split('-')
                # NOTE(review): range() excludes `high`, so "5-10" covers
                # 5..9 — looks like an off-by-one for an inclusive span;
                # preserved as-is, confirm intent before changing.
                list_of_ids.extend(range(int(low), int(high)))
            else:
                list_of_ids.append(int(token))
    else:
        if '-' in recids:
            low, high = recids.split('-')
            list_of_ids = range(int(low), int(high))
        else:
            list_of_ids = [int(recids)]

    arXiv_ids = []
    local_files = []
    for recid in list_of_ids:
        rec = get_record(recid)
        if not (doctype or docname or docformat):
            # default mode: collect arXiv report numbers from MARC 037
            for afieldinstance in record_get_field_instances(rec, tag='037'):
                sources = field_get_subfield_values(afieldinstance, '9')
                if sources and sources[0] == 'arXiv':
                    arXiv_ids.append(
                        field_get_subfield_values(afieldinstance, 'a')[0])
        else:
            # filter mode: look at the record's attached files instead
            bibarchive = BibRecDocs(recid)
            all_files = bibarchive.list_latest_files()
            if doctype:
                all_files = [docfile for docfile in all_files
                             if docfile.get_type() == doctype]
            if docname:
                all_files = [docfile for docfile in all_files
                             if docfile.get_name() == docname]
            if docformat:
                all_files = [docfile for docfile in all_files
                             if docfile.get_format() == docformat]
            local_files.extend([(docfile.get_path(), recid)
                                for docfile in all_files])

    if doctype or docname or docformat:
        return local_files
    return tarballs_by_arXiv_id(arXiv_ids, sdir)
def record_selection(self, recid):
    """Select the record identified by *recid* and cache its latest files."""
    self.recid = recid
    self.docfiles = BibRecDocs(recid).list_latest_files()
def tarballs_by_recids(recids, sdir, docname=None, doctype=None,
                       docformat=None):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids. By default look for files with names matching
    the report number and with source field 'arXiv'. This can be changed
    with C{docname}, C{doctype}, C{docformat}

    @param: recids (string): the record id or ids
    @param: sdir (string): where the tarballs should live
    @param docname: select tarball for given recid(s) that match docname
    @param doctype: select tarball for given recid(s) that match doctype
    @param docformat: select tarball for given recid(s) that match docformat

    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    # NOTE(review): this appears to be a duplicate definition of
    # tarballs_by_recids; at import time the later one shadows the earlier.
    if not recids:
        return []
    # expand "a,b,c" lists and "low-high" ranges into individual recids
    list_of_ids = []
    if ',' in recids:
        recids = recids.split(',')
        for recid in recids:
            if '-' in recid:
                low, high = recid.split('-')
                # NOTE(review): range() excludes `high`, so "5-10" covers
                # 5..9 — possible off-by-one for an inclusive span; confirm.
                recid = range(int(low), int(high))
                list_of_ids.extend(recid)
            else:
                recid = int(recid)
                list_of_ids.append(recid)
    else:
        if '-' in recids:
            low, high = recids.split('-')
            list_of_ids = range(int(low), int(high))
        else:
            list_of_ids = [int(recids)]

    arXiv_ids = []
    local_files = []
    for recid in list_of_ids:
        rec = get_record(recid)
        if not doctype and not docname and not docformat:
            # default mode: collect arXiv report numbers from MARC 037
            for afieldinstance in record_get_field_instances(rec, tag='037'):
                if len(field_get_subfield_values(afieldinstance, '9')) > 0:
                    if 'arXiv' == field_get_subfield_values(
                            afieldinstance, '9')[0]:
                        arXiv_id = field_get_subfield_values(
                            afieldinstance, 'a')[0]
                        arXiv_ids.append(arXiv_id)
        else:
            # filter mode: look at the record's attached files instead
            bibarchive = BibRecDocs(recid)
            all_files = bibarchive.list_latest_files()
            if doctype:
                all_files = [
                    docfile for docfile in all_files
                    if docfile.get_type() == doctype
                ]
            if docname:
                all_files = [
                    docfile for docfile in all_files
                    if docfile.get_name() == docname
                ]
            if docformat:
                all_files = [
                    docfile for docfile in all_files
                    if docfile.get_format() == docformat
                ]
            local_files.extend([(docfile.get_path(), recid)
                                for docfile in all_files])
    if doctype or docname or docformat:
        return local_files
    return tarballs_by_arXiv_id(arXiv_ids, sdir)