Exemple #1
0
def get_media_from_recid(recid):
    '''
        Collect metadata for the latest files attached to a record.
        @param recid: identifier of the record whose files are listed
    '''

    collected = []

    record_docs = BibRecDocs(recid)
    latest_files = record_docs.list_latest_files()

    for docfile in latest_files:
        doc_collection = docfile.get_type()
        media = {
            'name': docfile.get_full_name(),
            'file': '',
            'type': 'application/%s'
                    % docfile.get_superformat().split(".")[-1],
            'path': docfile.get_full_path(),
            'collection': doc_collection,
            'size': docfile.get_size(),
            'loaded': False,
            # The main file is pre-selected in the form.
            'selected': 'checked=yes' if doc_collection == "Main" else '',
        }
        collected.append(media)

    return collected
def get_media_from_recid(recid):
    '''
        Build a description of every latest file attached to a record.
        @param recid: identifier of the record to inspect
    '''

    def _describe(docfile):
        # One metadata dictionary per attached file.
        entry = {'name': docfile.get_full_name(),
                 'file': '',
                 'type': 'application/%s'
                         % docfile.get_superformat().split(".")[-1],
                 'path': docfile.get_full_path(),
                 'collection': docfile.get_type(),
                 'size': docfile.get_size(),
                 'loaded': False,
                 'selected': ''}
        # Pre-select the main file in the upload form.
        if entry['collection'] == "Main":
            entry['selected'] = 'checked=yes'
        return entry

    archive = BibRecDocs(recid)
    return [_describe(docfile) for docfile in archive.list_latest_files()]
Exemple #3
0
def download_one(recid, version):
    """Download the given version of a record's PDF from arXiv.

    Only the first arXiv identifier attached to the record is fetched; any
    additional identifiers are reported and skipped.

    @param recid: id of the record whose PDF should be fetched
    @param version: arXiv version number to download
    @raise PdfNotAvailable: when arXiv serves its "PDF unavailable" page
    @raise FoundExistingPdf: when the stored PDF already matches the download
    """
    write_message('fetching %s' % recid)
    for count, arxiv_id in enumerate(extract_arxiv_ids_from_recid(recid)):
        if count != 0:
            # Records occasionally carry several arXiv ids; only the first
            # one is downloaded, the rest are merely reported.
            write_message("Warning: %s has multiple arxiv #" % recid)
            continue

        url_for_pdf = build_arxiv_url(arxiv_id, version)
        filename_arxiv_id = arxiv_id.replace('/', '_')
        temp_file = NamedTemporaryFile(prefix="arxiv-pdf-checker",
                                       dir=CFG_TMPSHAREDDIR,
                                       suffix="%s.pdf" % filename_arxiv_id)
        write_message('downloading pdf from %s' % url_for_pdf)
        path = download_external_url(url_for_pdf,
                                     temp_file.name,
                                     content_type='pdf')

        # A very small download is suspicious: arXiv answers with a small
        # HTML "PDF unavailable" page instead of an HTTP error.
        filesize = os.path.getsize(path)
        if filesize < 25000:
            # ``with`` replaces the manual try/finally close.
            with open(path) as f:
                for line in f:
                    if 'PDF unavailable' in line:
                        raise PdfNotAvailable()

        docs = BibRecDocs(recid)
        bibdocfiles = docs.list_latest_files(doctype="arXiv")

        needs_update = False
        try:
            bibdocfile = bibdocfiles[0]
        except IndexError:
            # No arXiv document attached yet: always store the download.
            bibdocfile = None
            needs_update = True
        else:
            existing_md5 = calculate_md5(bibdocfile.fullpath)
            new_md5 = calculate_md5(path.encode('utf-8'))
            if new_md5 != existing_md5:
                write_message('md5 differs updating')
                needs_update = True
            else:
                write_message('md5 matches existing pdf, skipping')

        if needs_update:
            if bibdocfiles:
                write_message('adding as new version')
                docs.add_new_version(path, docname=bibdocfile.name)
            else:
                write_message('adding as new file')
                docs.add_new_file(path,
                                  doctype="arXiv",
                                  docname="arXiv:%s" % filename_arxiv_id)
        else:
            raise FoundExistingPdf()
def get_filetypes(recid):
    """
        Return the filetype extensions associated with the given record.

        Takes as a parameter the recid of a record.
        @param recid: recid of a record
    """
    from invenio.legacy.bibdocfile.api import BibRecDocs
    record_docs = BibRecDocs(recid)
    latest = record_docs.list_latest_files()
    return [_get_filetype(docfile.format) for docfile in latest]
Exemple #5
0
def download_one(recid, version):
    """Download the given version of a record's PDF from arXiv.

    Only the first arXiv identifier of the record is used; extra identifiers
    are reported and skipped.

    @param recid: id of the record whose PDF should be fetched
    @param version: arXiv version number to download
    @raise PdfNotAvailable: when arXiv serves its "PDF unavailable" page
    @raise FoundExistingPdf: when the stored PDF already matches the download
    """
    write_message("fetching %s" % recid)
    for count, arxiv_id in enumerate(extract_arxiv_ids_from_recid(recid)):
        if count != 0:
            write_message("Warning: %s has multiple arxiv #" % recid)
            continue

        url_for_pdf = build_arxiv_url(arxiv_id, version)
        filename_arxiv_id = arxiv_id.replace("/", "_")
        temp_file = NamedTemporaryFile(
            prefix="arxiv-pdf-checker", dir=CFG_TMPSHAREDDIR, suffix="%s.pdf" % filename_arxiv_id
        )
        write_message("downloading pdf from %s" % url_for_pdf)
        path = download_external_url(url_for_pdf, temp_file.name, content_type="pdf")

        # Check if it is not an html not found page: a tiny file is likely
        # arXiv's HTML "PDF unavailable" answer rather than a real PDF.
        filesize = os.path.getsize(path)
        if filesize < 25000:
            # ``with`` guarantees the handle is closed (was try/finally).
            with open(path) as f:
                for line in f:
                    if "PDF unavailable" in line:
                        raise PdfNotAvailable()

        docs = BibRecDocs(recid)
        bibdocfiles = docs.list_latest_files(doctype="arXiv")

        needs_update = False
        try:
            bibdocfile = bibdocfiles[0]
        except IndexError:
            # No arXiv document attached yet: store the download.
            bibdocfile = None
            needs_update = True
        else:
            existing_md5 = calculate_md5(bibdocfile.fullpath)
            new_md5 = calculate_md5(path.encode("utf-8"))
            if new_md5 != existing_md5:
                write_message("md5 differs updating")
                needs_update = True
            else:
                write_message("md5 matches existing pdf, skipping")

        if needs_update:
            if bibdocfiles:
                write_message("adding as new version")
                docs.add_new_version(path, docname=bibdocfile.name)
            else:
                write_message("adding as new file")
                docs.add_new_file(path, doctype="arXiv", docname="arXiv:%s" % filename_arxiv_id)
        else:
            raise FoundExistingPdf()
def get_files_from_bibdoc(recid):
    """
    Retrieves using BibDoc all the files related with a given record

    @param recid

    @return List of dictionaries containing all the information stored
            inside BibDoc if the current record has files attached, the
            empty list otherwise
    """
    if not recid or recid < 0:
        return []

    from invenio.legacy.bibdocfile.api import BibRecDocs, InvenioBibDocFileError
    try:
        record_docs = BibRecDocs(int(recid))
    except InvenioBibDocFileError:
        return []

    # One dictionary per latest file, built in a single expression.
    return [{'comment': docfile.get_comment(),
             'description': docfile.get_description(),
             'eformat': docfile.get_format(),
             'full_name': docfile.get_full_name(),
             'full_path': docfile.get_full_path(),
             'magic': docfile.get_magic(),
             'name': docfile.get_name(),
             'path': docfile.get_path(),
             'size': docfile.get_size(),
             'status': docfile.get_status(),
             'subformat': docfile.get_subformat(),
             'superformat': docfile.get_superformat(),
             'type': docfile.get_type(),
             'url': docfile.get_url(),
             'version': docfile.get_version()}
            for docfile in record_docs.list_latest_files()]
def get_files_from_bibdoc(recid):
    """
    Retrieves using BibDoc all the files related with a given record

    @param recid

    @return List of dictionaries containing all the information stored
            inside BibDoc if the current record has files attached, the
            empty list otherwise
    """
    if not recid or recid < 0:
        return []

    from invenio.legacy.bibdocfile.api import BibRecDocs, InvenioBibDocFileError
    # Dictionary key -> BibDocFile accessor used to fill it in.
    accessors = (('comment', 'get_comment'),
                 ('description', 'get_description'),
                 ('eformat', 'get_format'),
                 ('full_name', 'get_full_name'),
                 ('full_path', 'get_full_path'),
                 ('magic', 'get_magic'),
                 ('name', 'get_name'),
                 ('path', 'get_path'),
                 ('size', 'get_size'),
                 ('status', 'get_status'),
                 ('subformat', 'get_subformat'),
                 ('superformat', 'get_superformat'),
                 ('type', 'get_type'),
                 ('url', 'get_url'),
                 ('version', 'get_version'))
    try:
        bibrecdocs = BibRecDocs(int(recid))
    except InvenioBibDocFileError:
        return []
    files = []
    for afile in bibrecdocs.list_latest_files():
        files.append(dict((key, getattr(afile, getter)())
                          for key, getter in accessors))
    return files
Exemple #8
0
def get_record_details(recid, curr_user_email=None):
    """Assemble the REST-API description of a record.

    @param recid: identifier of the record to describe
    @param curr_user_email: email used to decide access to restricted files;
        defaults to the e-mail of the currently logged-in user
    @return dict with record id, file list (or "RESTRICTED"), basic
        metadata, PID/checksum, domain and domain-specific metadata;
        empty dict when the record's documents cannot be loaded
    """
    from invenio.legacy.bibdocfile.api import BibRecDocs
    try:
        recdocs = BibRecDocs(recid)
    # Narrowed from a bare ``except:`` so SystemExit/KeyboardInterrupt are
    # no longer silently swallowed; genuine failures are still logged.
    except Exception:
        current_app.logger.error("REST API: Error while building BibRecDocs for record %d" % (recid,))
        return {}

    latest_files = recdocs.list_latest_files()
    if len(latest_files) == 0:
        current_app.logger.error("REST API: BibRecDocs reports 0 files for record %d" % (recid,))

    # bibformat uses get_record, usually is one db
    # hit per object; should be fastest
    from invenio.modules.formatter import engine as bibformat_engine
    bfo = bibformat_engine.BibFormatObject(recid)

    # first put the record_id and list of files
    ret = {
        'record_id': recid,
        'files': [{
                        'name': afile.get_full_name().decode('utf-8'),
                        'size': afile.get_size(),
                        'url': afile.get_full_url(),
                  } for afile in latest_files ],
    }

    if not curr_user_email:
        curr_user_email = current_user['email']

    # add basic metadata fields
    for fieldname in basic_fields_meta:
        if fieldname == "open_access":
            open_access = (read_basic_metadata_field_from_marc(bfo, fieldname) == "open")
            ret[fieldname] = open_access
            if not open_access:
                # Closed access: only the uploader may see the file list.
                if read_basic_metadata_field_from_marc(bfo, "uploaded_by") != curr_user_email:
                    ret['files'] = "RESTRICTED"
        else:
            ret[fieldname] = read_basic_metadata_field_from_marc(bfo, fieldname)

    # add 'PID' and 'checksum'
    for fx in bfo.fields('0247_'):
        if fx.get('2') in ["PID", "checksum"]:
            ret[fx.get('2')] = fx.get('a')

    # add 'domain'
    domain = read_basic_metadata_field_from_marc(bfo, 'domain')
    ret['domain'] = domain

    # add domain-specific metadata fields
    if domain not in metadata_classes():
        current_app.logger.error("Bad domain metadata class for record %d" % (recid,))
    else:
        domain_class = metadata_classes()[domain]()
        for fieldset in domain_class.fieldsets:
            if fieldset.name != 'Generic':
                # NOTE(review): each iteration overwrites
                # ret['domain_metadata'], so only the last non-Generic
                # fieldset survives — confirm this is intended.
                ret['domain_metadata'] = get_domain_metadata(domain_class,
                                                             fieldset,
                                                             bfo)

    return ret
Exemple #9
0
def tarballs_by_recids(recids, sdir, docname=None, doctype=None, docformat=None):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids. By default look for files with names matching
    the report number and with source field 'arXiv'. This can be changed
    with C{docname}, C{doctype}, C{docformat}

    @param: recids (string): the record id or ids
    @param: sdir (string): where the tarballs should live
    @param docname: select tarball for given recid(s) that match docname
    @param doctype: select tarball for given recid(s) that match doctype
    @param docformat: select tarball for given recid(s) that match docformat
    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    if not recids:
        return []

    def _expand(token):
        # "low-high" expands to range(low, high).
        # NOTE(review): the upper bound is exclusive, matching the original
        # behaviour — confirm whether "12-15" should include 15.
        if '-' in token:
            low, high = token.split('-')
            return list(range(int(low), int(high)))
        return [int(token)]

    list_of_ids = []
    tokens = recids.split(',') if ',' in recids else [recids]
    for token in tokens:
        list_of_ids.extend(_expand(token))

    filtered_by_docs = doctype or docname or docformat
    arXiv_ids = []
    local_files = []
    for recid in list_of_ids:
        rec = get_record(recid)
        if not filtered_by_docs:
            # Default: collect arXiv report numbers from MARC field 037.
            for afieldinstance in record_get_field_instances(rec, tag='037'):
                sources = field_get_subfield_values(afieldinstance, '9')
                if sources and sources[0] == 'arXiv':
                    arXiv_ids.append(
                        field_get_subfield_values(afieldinstance, 'a')[0])
        else:
            bibarchive = BibRecDocs(recid)
            candidates = bibarchive.list_latest_files()
            if doctype:
                candidates = [docfile for docfile in candidates
                              if docfile.get_type() == doctype]
            if docname:
                candidates = [docfile for docfile in candidates
                              if docfile.get_name() == docname]
            if docformat:
                candidates = [docfile for docfile in candidates
                              if docfile.get_format() == docformat]
            local_files.extend([(docfile.get_path(), recid)
                                for docfile in candidates])

    if filtered_by_docs:
        return local_files

    return tarballs_by_arXiv_id(arXiv_ids, sdir)
Exemple #10
0
 def record_selection(self, recid):
     """Select the record identified by ``recid`` and cache its latest files."""
     self.recid = recid
     self.docfiles = BibRecDocs(recid).list_latest_files()
Exemple #11
0
def tarballs_by_recids(recids,
                       sdir,
                       docname=None,
                       doctype=None,
                       docformat=None):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids. By default look for files with names matching
    the report number and with source field 'arXiv'. This can be changed
    with C{docname}, C{doctype}, C{docformat}

    @param: recids (string): the record id or ids
    @param: sdir (string): where the tarballs should live
    @param docname: select tarball for given recid(s) that match docname
    @param doctype: select tarball for given recid(s) that match doctype
    @param docformat: select tarball for given recid(s) that match docformat
    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    if not recids:
        return []

    # Expand "a,b,c" lists and "low-high" spans into concrete recids.
    # NOTE(review): the span's upper bound is exclusive (original
    # behaviour preserved) — confirm whether it should be inclusive.
    list_of_ids = []
    for token in (recids.split(',') if ',' in recids else [recids]):
        if '-' in token:
            low, high = token.split('-')
            list_of_ids.extend(range(int(low), int(high)))
        else:
            list_of_ids.append(int(token))

    use_docfilters = doctype or docname or docformat
    arXiv_ids = []
    local_files = []
    for recid in list_of_ids:
        rec = get_record(recid)
        if use_docfilters:
            # Keep only files passing every requested filter.
            bibarchive = BibRecDocs(recid)
            for docfile in bibarchive.list_latest_files():
                if doctype and docfile.get_type() != doctype:
                    continue
                if docname and docfile.get_name() != docname:
                    continue
                if docformat and docfile.get_format() != docformat:
                    continue
                local_files.append((docfile.get_path(), recid))
        else:
            # Default: harvest arXiv ids from MARC field 037.
            for afieldinstance in record_get_field_instances(rec, tag='037'):
                sources = field_get_subfield_values(afieldinstance, '9')
                if sources and sources[0] == 'arXiv':
                    arXiv_ids.append(
                        field_get_subfield_values(afieldinstance, 'a')[0])

    if use_docfilters:
        return local_files

    return tarballs_by_arXiv_id(arXiv_ids, sdir)