def format_element(bfo, separator='<br/>', width="800px", height="480px"):
    """
    Display the Flash (SWF) panoramas attached to this record. Only files
    attached with format '.swf' and doctype 'panorama' are considered.

    @param separator: printed between each panorama
    @param width: width of each panorama
    @param height: height of each panorama
    """
    out = ""
    panoramas = []
    bibarchive = BibRecDocs(bfo.recID)
    # Prepare the Javascripts
    for bibdocfile in bibarchive.list_latest_files(doctype='panorama'):
        if bibdocfile.get_format() == '.swf':
            pano_index = len(panoramas)
            panoramas.append('embedpano({swf:"%(swf_file)s", target:"panoramabox%(pano_index)s", width:"%(width)s", height:"%(height)s"});' \
                            % {'swf_file': bibdocfile.get_url(),
                               'pano_index': pano_index,
                               'width': width,
                               'height': height})
    if panoramas:
        out = separator.join(['<div id="panoramabox%i" style="margin:auto"></div>' %i for i in xrange(len(panoramas))])
        out += '<script type="text/javascript" src="/js/swfkrpano.js"></script>'
        out += '<script type="text/javascript">' + \
               ''.join(panoramas) + \
               '</script>'

    return out
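For orientation, a sketch (in comments; the file URLs are made up) of the markup this element emits for a record carrying two '.swf' panoramas:

# Illustrative output of format_element() for two attached '.swf' panoramas:
#
#   <div id="panoramabox0" style="margin:auto"></div><br/>
#   <div id="panoramabox1" style="margin:auto"></div>
#   <script type="text/javascript" src="/js/swfkrpano.js"></script>
#   <script type="text/javascript">
#     embedpano({swf:"/record/1/files/pano0.swf", target:"panoramabox0", width:"800px", height:"480px"});
#     embedpano({swf:"/record/1/files/pano1.swf", target:"panoramabox1", width:"800px", height:"480px"});
#   </script>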
def get_media_from_recid(recid):
    '''
        Return the list of files attached to the given record.
        @param recid: id of the record whose files should be listed
        @return: list of dictionaries, one per attached file, carrying its
                 name, MIME type, path, collection, size and selection flag
    '''

    medias = []

    bibarchiv = BibRecDocs(recid)
    bibdocs = bibarchiv.list_latest_files()

    for bibdocfile in bibdocs :

        bibfile = {'name': bibdocfile.get_full_name(),
                   'file' : '',
                   'type': 'application/%s' % \
                       bibdocfile.get_superformat().split(".")[-1],
                   'path': bibdocfile.get_full_path(),
                   'collection' : bibdocfile.get_type(),
                   'size': bibdocfile.get_size(),
                   'loaded' : False,
                   'selected' : ''}

        if bibfile['collection'] == "Main" :
            bibfile['selected'] = 'checked=yes'

        medias.append(bibfile)

    return medias
def get_media_from_recid(recid):
    '''
        Return the list of files attached to the given record.
        @param recid: id of the record whose files should be listed
    '''

    medias = []

    bibarchiv = BibRecDocs(recid)
    bibdocs = bibarchiv.list_latest_files()

    for bibdocfile in bibdocs:

        bibfile = {'name': bibdocfile.get_full_name(),
                   'file': '',
                   'type': 'application/%s' % \
                       bibdocfile.get_superformat().split(".")[-1],
                   'path': bibdocfile.get_full_path(),
                   'collection': bibdocfile.get_type(),
                   'size': bibdocfile.get_size(),
                   'loaded': False,
                   'selected': ''}

        if bibfile['collection'] == "Main":
            bibfile['selected'] = 'checked=yes'

        medias.append(bibfile)

    return medias
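A minimal usage sketch, assuming a record with attached files (the recid 123 is illustrative):

media = get_media_from_recid(123)
for m in media:
    # 'selected' is non-empty only for files in the 'Main' collection
    print "%s (%s, %d bytes) %s" % (m['name'], m['type'], m['size'], m['selected'])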
Example 4
def record_has_arxiv_pdf(recid=None):
    if recid is None:
        return False
    brd = BibRecDocs(recid)
    for bdf in brd.list_latest_files(doctype="arXiv"):
        if bdf.format.lower() in ('.pdf', '.pdfa'):
            return True
    return False
Example 5
def get_rawtext_from_record(record):
    bibrec = BibRecDocs(record.record_id)
    bibdoc = get_latest_pdf(bibrec.list_latest_files())
    try:
        rawtext = bibdoc.bibdoc.get_text()
    except:
        return ''
    return rawtext
Example 6
def download_one(recid, version):
    """Download given version of the PDF from arxiv"""
    write_message('fetching %s' % recid)
    for count, arxiv_id in enumerate(extract_arxiv_ids_from_recid(recid)):
        if count != 0:
            write_message("Warning: %s has multiple arxiv #" % recid)
            continue

        url_for_pdf = build_arxiv_url(arxiv_id, version)
        filename_arxiv_id = arxiv_id.replace('/', '_')
        temp_file = NamedTemporaryFile(prefix="arxiv-pdf-checker",
                                       dir=CFG_TMPSHAREDDIR,
                                       suffix="%s.pdf" % filename_arxiv_id)
        write_message('downloading pdf from %s' % url_for_pdf)
        path = download_external_url(url_for_pdf,
                                     temp_file.name,
                                     content_type='pdf')

        # Check if it is not an html not found page
        filesize = os.path.getsize(path)
        if filesize < 25000:
            f = open(path)
            try:
                for line in f:
                    if 'PDF unavailable' in line:
                        raise PdfNotAvailable()
            finally:
                f.close()

        docs = BibRecDocs(recid)
        bibdocfiles = docs.list_latest_files(doctype="arXiv")

        needs_update = False
        try:
            bibdocfile = bibdocfiles[0]
        except IndexError:
            bibdocfile = None
            needs_update = True
        else:
            existing_md5 = calculate_md5(bibdocfile.fullpath)
            new_md5 = calculate_md5(path.encode('utf-8'))
            if new_md5 != existing_md5:
                write_message('md5 differs updating')
                needs_update = True
            else:
                write_message('md5 matches existing pdf, skipping')

        if needs_update:
            if bibdocfiles:
                write_message('adding as new version')
                docs.add_new_version(path, docname=bibdocfile.name)
            else:
                write_message('adding as new file')
                docs.add_new_file(path,
                                  doctype="arXiv",
                                  docname="arXiv:%s" % filename_arxiv_id)
        else:
            raise FoundExistingPdf()
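A sketch of how this helper might be driven for a batch of records; the recids and version are illustrative, and the two exceptions raised above are treated as non-fatal:

for recid in (1234, 1235):
    try:
        download_one(recid, version=1)
    except FoundExistingPdf:
        write_message('%s: attached PDF already matches arXiv' % recid)
    except PdfNotAvailable:
        write_message('%s: arXiv reports the PDF as unavailable' % recid)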
def check_records(records):
    for record in records:
        if is_springer(record):
            rec_doc = BibRecDocs(int(record.record_id))
            rec_docs = rec_doc.list_latest_files()
            for doc in rec_docs:
                if doc.get_format() == '.xml':
                    f = open(doc.get_full_path())
                    content = f.read()
                    try:
                        del record['100']
                        del record['700']
                        record.amended = True
                    except:
                        pass

                    first_author = True
                    try:
                        if "-//NLM//DTD JATS" in content:
                            jats = JATSParser()
                            authors = jats.get_authors(parseString(content))
                        else:
                            app = NLMParser()
                            authors = app.get_authors(parseString(content))
                    except:
                        record.warn('Problem with parsing XML.')
                        continue

                    for author in authors:
                        if author.get('surname'):
                            subfields = [
                                ('a',
                                 '%s, %s' % (author.get('surname'),
                                             author.get('given_name')
                                             or author.get('initials', '')))
                            ]
                        else:
                            subfields = [('a', '%s' % (author.get('name', '')))
                                         ]
                        if 'orcid' in author:
                            subfields.append(('j', author['orcid']))
                        if 'affiliation' in author:
                            for aff in author["affiliation"]:
                                subfields.append(('v', aff))

                        add_nations_field(subfields)

                        if author.get('email'):
                            subfields.append(('m', author['email']))
                        if first_author:
                            record.add_field('100__',
                                             value='',
                                             subfields=subfields)
                            first_author = False
                        else:
                            record.add_field('700__',
                                             value='',
                                             subfields=subfields)
Example 9
def get_filetypes(recid):
    """
        Returns the file type extensions associated with the given record.

        Takes as a parameter the recid of a record.
        @param recid: recid of a record
    """
    docs = BibRecDocs(recid)
    return [_get_filetype(d.format) for d in docs.list_latest_files()]
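The helper _get_filetype is not shown on this page; judging from its use it presumably reduces a BibDocFile format string to a bare extension. A hypothetical sketch:

def _get_filetype(docformat):
    # hypothetical helper: '.pdf;pdfa' -> 'pdf', '.PS.GZ' -> 'ps.gz'
    return docformat.split(';')[0].lstrip('.').lower()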
Example 10
def get_rawtext_from_record_id(record_id):
    bibrec = BibRecDocs(record_id)
    bibdoc = get_latest_pdf(bibrec.list_latest_files())
    try:
        rawtext = bibdoc.bibdoc.get_text()
    except:
        return ''

    return rawtext
Example 11
def get_filetypes(recid):
    """
        Returns the file type extensions associated with the given record.

        Takes as a parameter the recid of a record.
        @param recid: recid of a record
    """
    docs = BibRecDocs(recid)
    return [_get_filetype(d.format) for d in docs.list_latest_files()]
Example 12
    def get_pdfa_record(self, path=None):
        from invenio.search_engine import perform_request_search
        xml_doc = self.get_article(path)
        rec = create_record()
        dummy, dummy, dummy, dummy, dummy, dummy, dummy,\
            dummy, doi = self.get_publication_information(xml_doc)
        recid = perform_request_search(p='0247_a:"%s" AND NOT 980:"DELETED"' %
                                       (doi, ))
        if recid:
            record_add_field(rec, '001', controlfield_value=recid[0])
        else:
            record_add_field(rec,
                             '024',
                             ind1='7',
                             subfields=[('a', doi), ('2', 'DOI')])
            message = ('Adding PDF/A. No paper with this DOI: '
                       '%s. Trying to add it anyway.') % (doi, )
            self.logger.error(message)
        try:
            if exists(join(path, 'main_a-2b.pdf')):
                record_add_field(rec,
                                 'FFT',
                                 subfields=[('a', join(path, 'main_a-2b.pdf')),
                                            ('n', 'main'), ('f', '.pdf;pdfa')])
                self.logger.debug('Adding PDF/A to record: %s' % (doi, ))
            elif exists(join(path, 'main.pdf')):
                record_add_field(rec,
                                 'FFT',
                                 subfields=[('a', join(path, 'main.pdf'))])
                message = 'No PDF/A in VTEX package for record: ' + doi
                self.logger.debug(message)
            else:
                message = "Record %s doesn't contain PDF file." % (doi, )
                raise MissingFFTError(message)
        except MissingFFTError:
            message = "Elsevier paper: %s is missing PDF." % (doi, )
            register_exception(alert_admin=True, prefix=message)
            self.logger.warning(message)

        ## copy other formats to bibupload file
        if recid:
            from invenio.bibdocfile import BibRecDocs
            record = BibRecDocs(recid[0])
            for bibfile in record.list_latest_files():
                if bibfile.get_format() != '.pdf;pdfa':
                    record_add_field(rec,
                                     'FFT',
                                     subfields=[('a', bibfile.get_full_path()),
                                                ('n', bibfile.get_name()),
                                                ('f', bibfile.get_format())])
        return record_xml_output(rec)
Example 13
    def get_pdfa_record(self, path=None):
        from invenio.search_engine import perform_request_search
        xml_doc = self.get_article(path)
        rec = create_record()
        dummy, dummy, dummy, dummy, dummy, dummy, dummy,\
            dummy, doi = self.get_publication_information(xml_doc)
        recid = perform_request_search(p='0247_a:"%s" AND NOT 980:"DELETED"' % (doi,))
        if recid:
            record_add_field(rec, '001', controlfield_value=recid[0])
        else:
            record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                              ('2', 'DOI')])
            message = ('Adding PDF/A. No paper with this DOI: '
                       '%s. Trying to add it anyway.') % (doi,)
            self.logger.error(message)
        try:
            if exists(join(path, 'main_a-2b.pdf')):
                record_add_field(
                    rec, 'FFT', subfields=[('a', join(path, 'main_a-2b.pdf')),
                                ('n', 'main'),
                        ('f', '.pdf;pdfa')])
                self.logger.debug('Adding PDF/A to record: %s' % (doi,))
            elif exists(join(path, 'main.pdf')):
                record_add_field(
                    rec, 'FFT', subfields=[('a', join(path, 'main.pdf'))])
                message = 'No PDF/A in VTEX package for record: ' + doi
                self.logger.debug(message)
            else:
                message = "Record %s doesn't contain PDF file." % (doi,)
                raise MissingFFTError(message)
        except MissingFFTError:
            message = "Elsevier paper: %s is missing PDF." % (doi,)
            register_exception(alert_admin=True, prefix=message)
            self.logger.warning(message)

        ## copy other formats to bibupload file
        if recid:
            from invenio.bibdocfile import BibRecDocs
            record = BibRecDocs(recid[0])
            for bibfile in record.list_latest_files():
                if bibfile.get_format() != '.pdf;pdfa':
                    record_add_field(rec,
                                     'FFT',
                                     subfields=[('a', bibfile.get_full_path()),
                                                ('n', bibfile.get_name()),
                                                ('f', bibfile.get_format())]
                                     )
        return record_xml_output(rec)
Example 14
def check_records(records):
    for record in records:
        if is_springer(record):
            rec_doc = BibRecDocs(int(record.record_id))
            rec_docs = rec_doc.list_latest_files()
            for doc in rec_docs:
                if doc.get_format() == '.xml':
                    f = open(doc.get_full_path())
                    content = f.read()
                    try:
                        del record['100']
                        del record['700']
                        record.amended = True
                    except:
                        pass

                    first_author = True
                    try:
                        if "-//NLM//DTD JATS" in content:
                            jats = JATSParser()
                            authors = jats.get_authors(parseString(content))
                        else:
                            app = NLMParser()
                            authors = app.get_authors(parseString(content))
                    except:
                        record.warn('Problem with parsing XML.')
                        continue

                    for author in authors:
                        if author.get('surname'):
                            subfields = [('a', '%s, %s' % (author.get('surname'), author.get('given_name') or author.get('initials', '')))]
                        else:
                            subfields = [('a', '%s' % (author.get('name', '')))]
                        if 'orcid' in author:
                            subfields.append(('j', author['orcid']))
                        if 'affiliation' in author:
                            for aff in author["affiliation"]:
                                subfields.append(('v', aff))

                        add_nations_field(subfields)

                        if author.get('email'):
                                subfields.append(('m', author['email']))
                        if first_author:
                            record.add_field('100__', value='', subfields=subfields)
                            first_author = False
                        else:
                            record.add_field('700__', value='', subfields=subfields)
Example 15
def get_filenames(recid):
    """
        Returns the names of the files associated with a specific record
        and their derivatives. Takes as a parameter the recid of a
        record.

        Example:
        input: recID 999 (record with files ['thesis.ps.gz', 'random.pdf'])
        output: ['thesis.ps.gz', 'thesis.ps', 'thesis',
                 'random.pdf', 'random']
        @param recid: recid of a record
    """
    docs = BibRecDocs(recid)
    names = [_get_filenames(d.name + d.format)
                for d in docs.list_latest_files()]
    return reduce(lambda x,y: x+y, names)
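The helper _get_filenames is likewise not shown here; based on the docstring example it presumably expands a filename into itself plus every name obtained by stripping one extension at a time. A hypothetical sketch:

def _get_filenames(filename):
    # hypothetical helper: 'thesis.ps.gz' -> ['thesis.ps.gz', 'thesis.ps', 'thesis']
    names = [filename]
    while '.' in filename:
        filename = filename.rsplit('.', 1)[0]
        names.append(filename)
    return names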
Example 16
def get_filenames(recid):
    """
        Returns the names of the files associated with a specific record
        and their derivatives. Takes as a parameter the recid of a
        record.

        Example:
        input: recID 999 (record with files ['thesis.ps.gz', 'random.pdf'])
        output: ['thesis.ps.gz', 'thesis.ps', 'thesis',
                 'random.pdf', 'random']
        @param recid: recid of a record
    """
    docs = BibRecDocs(recid)
    names = [
        _get_filenames(d.name + d.format) for d in docs.list_latest_files()
    ]
    return reduce(lambda x, y: x + y, names)
Example 17
def has_or_had_format(recid, format):
    doc = BibRecDocs(recid)
    formats = []
    ret = 0
    for d in doc.list_latest_files():
        formats.append(d.format)

    if format in formats:
        ret = 1
    else:
        for d in doc.list_bibdocs():
            for dd in d.docfiles:
                if format == dd.format:
                    ret = 2

    if ret == 0:
        return "<b>NO</b>"
    elif ret == 1:
        return "yes"
    elif ret == 2:
        return "<b>diff. v.</b>"
def get_files_from_bibdoc(recid):
    """
    Retrieves, using BibDoc, all the files related to a given record.

    @param recid: id of the record

    @return: List of dictionaries containing all the information stored
            inside BibDoc if the current record has files attached, the
            empty list otherwise
    """
    if not recid or recid < 0:
        return []

    from invenio.bibdocfile import BibRecDocs, InvenioBibDocFileError
    files = []
    try:
        bibrecdocs = BibRecDocs(int(recid))
    except InvenioBibDocFileError:
        return []
    latest_files = bibrecdocs.list_latest_files()
    for afile in latest_files:
        file_dict = {}
        file_dict['comment'] = afile.get_comment()
        file_dict['description'] = afile.get_description()
        file_dict['eformat'] = afile.get_format()
        file_dict['full_name'] = afile.get_full_name()
        file_dict['full_path'] = afile.get_full_path()
        file_dict['magic'] = afile.get_magic()
        file_dict['name'] = afile.get_name()
        file_dict['path'] = afile.get_path()
        file_dict['size'] = afile.get_size()
        file_dict['status'] = afile.get_status()
        file_dict['subformat'] = afile.get_subformat()
        file_dict['superformat'] = afile.get_superformat()
        file_dict['type'] = afile.get_type()
        file_dict['url'] = afile.get_url()
        file_dict['version'] = afile.get_version()
        files.append(file_dict)
    return files
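A minimal usage sketch (the recid is illustrative):

for afile in get_files_from_bibdoc(123):
    print afile['full_name'], afile['size'], afile['url']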
Example 20
def _get_fulltext_args_from_recids(recids, task_info):
    """Get list of fulltext locations for input recids
    @param recids: (list) list of recids
    @param task_info: (dict) task information carrying the 'last_updated'
                      timestamp of the previous run, or None
    @return: (list) list of strings of the form 'recid:fulltext dir'
    """
    fulltext_arguments = []
    last_updated = None
    if task_info:
        last_updated = task_info['last_updated']

    if recids:
        if last_updated:
            q_get_outdated = "SELECT id FROM bibrec WHERE id IN (%s) AND " \
                             "modification_date > '%s';" % \
                             (",".join(map(lambda r: str(r), recids)), last_updated)
            ## Get records for reference extraction
            changed_records = run_sql(q_get_outdated)
        else:
            ## Make list of lists of input recids
            changed_records = [[r] for r in recids]
        if changed_records:
            for record_row in changed_records:
                record = record_row[0]
                bibrecdoc = BibRecDocs(record)
                ## Get the latest 'document items' for this record
                bibdocfiles = bibrecdoc.list_latest_files()
                if bibdocfiles:
                    doc_types = {
                        'pdf': [],
                        'pdfa': [],
                        'text': [],
                    }

                    bibdoc = bibrecdoc.list_bibdocs()
                    ## Get the text file for this record
                    if bibdoc and bibdoc[0].has_text():
                        doc_types['text'].append(bibdoc[0].get_text_path())

                    ## For each file, of a record
                    for doc in bibdocfiles:
                        pipe_gfile = \
                               os.popen("%s '%s'" \
                                        % (CFG_PATH_GFILE, doc.get_full_path().replace("'", "\\'")), "r")
                        res_gfile = pipe_gfile.readline()
                        pipe_gfile.close()

                        ## Look for : 1. Unstamped, original uploaded-by-user, pdf files
                        ## 2. Stamped, processed, pdf files
                        ## 3. Text files
                        if (res_gfile.lower().find('pdfa') != -1):
                            doc_types['pdfa'].append(doc.get_full_path())
                        elif (res_gfile.lower().find('pdf') != -1):
                            doc_types['pdf'].append(doc.get_full_path())

                    ## Choose the type in this order of priority
                    type_of_choice = doc_types['text'] or doc_types[
                        'pdf'] or doc_types['pdfa']
                    if type_of_choice:
                        fulltext_arguments.append(
                            str(record).rstrip(".") + ':' + type_of_choice[0])
                    else:
                        write_message("W: No pdf/text file for recid %s" % \
                                      str(record), stream=sys.stdout, verbose=0)
                else:
                    write_message("W: No files exist for recid %s" % \
                                  str(record), stream=sys.stdout, verbose=0)
        elif task_info:
            ## In the event that no records have been modified since the
            ## last reference extraction
            write_message("No newly modified records for extraction-job '%s'." \
                          % task_info['name'], stream=sys.stdout, verbose=0)
    return fulltext_arguments
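A usage sketch, assuming a task with no previous run recorded; the recids and the returned path are illustrative:

args = _get_fulltext_args_from_recids([1234, 1235],
                                      {'name': 'weekly-refextract',
                                       'last_updated': None})
# e.g. ['1234:/opt/invenio/var/data/files/g0/1234/main.pdf', ...]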
def bst_scoap3_importer():
    task_sleep_now_if_required(can_stop_too=True)
    f = urllib.urlopen('http://repo.scoap3.org/ffts_for_inspire.py/csv')

    fd_update, name_update = mkstemp(suffix='.xml',
                                     prefix='bibupload_scoap3_',
                                     dir=CFG_TMPSHAREDDIR)
    out_update = fdopen(fd_update, 'w')
    fd_new, name_new = mkstemp(suffix='.xml',
                               prefix='bibupload_scoap3_',
                               dir=CFG_TMPSHAREDDIR)
    out_new = fdopen(fd_new, 'w')
    print >> out_update, "<collection>"
    print >> out_new, "<collection>"

    line_count_new = 0  # to avoid empty bibupload
    line_count_update = 0  # to avoid empty bibupload
    f.readline()  ## Let's strip the header line

    for d in f:
        task_sleep_now_if_required(can_stop_too=True)
        recid, arxiv_id, cr_date, checksum, link, type, doi = [
            x.strip() for x in d.split(',')
        ]
        write_message(d.strip())
        if checksum == "None":
            write_message("... no PDF. Skipping")
            continue
        if arxiv_id == "None":
            inspire_record = perform_request_search(p="doi:%s" % (doi, ),
                                                    cc="HEP")
        else:
            inspire_record = perform_request_search(p="037:%s or doi:%s" %
                                                    (arxiv_id, doi),
                                                    cc="HEP")
        if len(inspire_record) > 1:
            write_message(
                "ERROR: more than one INSPIRE record matched %s and %s for SCOAP3 record %s: %s"
                % (arxiv_id, doi, recid, list(inspire_record)),
                stream=sys.stderr)
            continue
        elif not inspire_record:
            write_message(
                "WARNING: no INSPIRE record matched %s or %s for SCOAP3 record %s"
                % (arxiv_id, doi, recid),
                stream=sys.stderr)
            continue
        action = None  # do nothing
        rec = {}
        inspire_record = inspire_record[0]
        record = BibRecDocs(inspire_record)
        for doc in record.list_latest_files():
            if doc.format in ('.pdf', '.pdf;pdfa'):
                if doc.bibdoc.doctype == 'SCOAP3':
                    if doc.checksum == checksum:
                        write_message(
                            "... OK: file alredy attached to INSPIRE record %s (doc.checksum=%s, checksum=%s)"
                            % (inspire_record, doc.checksum, checksum))
                    else:
                        write_message(
                            "... OK: new revision available for INSPIRE record %s (doc.checksum=%s, checksum=%s)"
                            % (inspire_record, doc.checksum, checksum))
                        action = "UPDATE"
                    break
        else:
            write_message("... OK: need to add new file to INSPIRE record %s" %
                          inspire_record)
            action = "APPEND"
        if action:
            if type == '.pdf;pdfa':
                record_add_field(rec,
                                 'FFT',
                                 subfields=[('a', link),
                                            ('n', 'scoap3-fulltext'),
                                            ('f', '.pdf;pdfa'),
                                            ('t', 'SCOAP3'),
                                            ('d', 'Article from SCOAP3')])
            else:
                record_add_field(rec,
                                 'FFT',
                                 subfields=[('a', link),
                                            ('n', 'scoap3-fulltext'),
                                            ('t', 'SCOAP3'),
                                            ('d', 'Article from SCOAP3')])

            record_add_field(rec,
                             '001',
                             controlfield_value=str(inspire_record))
        if action == "UPDATE":
            line_count_update += 1
            print >> out_update, record_xml_output(rec)
        elif action == "APPEND":
            line_count_new += 1
            print >> out_new, record_xml_output(rec)
    print >> out_update, "</collection>"
    print >> out_new, "</collection>"
    out_new.close()
    out_update.close()

    if line_count_new:
        id = task_low_level_submission("bibupload", "admin", "-N",
                                       "SCOAP3-import", "-a", name_new)
        write_message("Scheduled bibupload --append %s with ID #%s" %
                      (name_new, id))
    else:
        remove(name_new)
    if line_count_update:
        id = task_low_level_submission("bibupload", "admin", "-N",
                                       "SCOAP3-import", "-c", name_update)
        write_message("Scheduled bibupload --correct %s with ID #%s" %
                      (name_update, id))
    else:
        remove(name_update)
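The CSV consumed above is expected to contain one header line followed by comma-separated rows matching the unpacking in the loop; an illustrative row (all values are made up):

# recid,arxiv_id,cr_date,checksum,link,type,doi
# 12345,1501.00001,2015-01-01,d41d8cd98f00b204e9800998ecf8427e,http://repo.scoap3.org/record/12345/files/scoap3-fulltext.pdf,.pdf;pdfa,10.1000/xyz123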
def format_element(bfo, oai=0):
    """Produce MARCXML with enhanced fields.

    Adds 100/700 $x with Record ID of linked HepName,
         701/702 $y with True/False if the signature is claimed
                 $z with Record ID of institution
                 $w with BAI of linked Profile
         371/110 $z with Record ID of institution
         119/502 $z with Record ID of institution
         999C5   $0 with on the fly discovered Record IDs (not for books)
         773     $0 with Record ID of corresponding Book or Proceeding or Report
                 $1 with Record ID of corresponding Journal
                 $2 with Record ID of corresponding Conference
         693/710 $0 with Record ID of corresponding experiment
    """
    record = bfo.get_record()
    recid = bfo.recID

    # Let's filter hidden fields
    if acc_authorize_action(bfo.user_info, "runbibedit")[0]:
        # not authorized
        for tag in CFG_BIBFORMAT_HIDDEN_TAGS:
            if tag in record:
                del record[tag]
    else:
        # Let's add bibdoc info
        bibrecdocs = BibRecDocs(recid)
        for bibdocfile in bibrecdocs.list_latest_files():
            fft = [
                ("a", bibdocfile.fullpath),
                ("d", bibdocfile.description or ""),
                ("f", bibdocfile.format or ""),
                ("n", bibdocfile.name or ""),
                ("r", bibdocfile.status or ""),
                ("s", bibdocfile.cd.strftime("%Y-%m-%d %H:%M:%S")),
                ("t", bibdocfile.get_type()),
                ("v", str(bibdocfile.version)),
                ("z", bibdocfile.comment or ""),
            ]
            for flag in bibdocfile.flags:
                fft.append(("o", flag))
            record_add_field(record, "FFT", subfields=fft)

    is_institution = "INSTITUTION" in [collection.upper() for collection in bfo.fields("980__a")]

    if "100" in record or "700" in record:
        signatures = dict(
            (name, (personid, flag))
            for name, personid, flag in run_sql(
                "SELECT name, personid, flag FROM aidPERSONIDPAPERS WHERE bibrec=%s AND flag>-2", (recid,)
            )
        )

    # Let's add signatures
    for field in (
        record_get_field_instances(record, "100")
        + record_get_field_instances(record, "700")
        + record_get_field_instances(record, "701")
        + record_get_field_instances(record, "702")
    ):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if "a" in subfield_dict:
            author_name = subfield_dict["a"]
            if "i" in subfield_dict:
                inspire_id = subfield_dict["i"]
                hepname_id = get_hepname_id_from_inspire_id(inspire_id)
                if hepname_id:
                    subfields.append(("x", "%i" % hepname_id))
                    subfields.append(("y", "1"))
            else:
                personid, flag = signatures.get(author_name, (None, None))
                bai = get_personid_canonical_id().get(personid)
                if bai:
                    subfields.append(("w", bai))
                    hepname_id = get_hepname_id(personid)
                    if hepname_id:
                        subfields.append(("x", "%i" % hepname_id))
                    subfields.append(("y", "%i" % (flag == 2)))

        # And matched affiliations
        if "u" in subfield_dict:
            for code, value in subfields:
                if code == "u":
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(("z", "%i" % ids[0]))

    # Thesis institution
    for field in record_get_field_instances(record, "502"):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if "c" in subfield_dict:
            for code, value in subfields:
                if code == "c":
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(("z", "%i" % ids[0]))

    # Enhance affiliation in Experiments
    for field in record_get_field_instances(record, "119"):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if "u" in subfield_dict:
            for code, value in subfields:
                if code == "u":
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(("z", "%i" % ids[0]))

    # Enhance affiliation in HepNames and Jobs and Institutions
    for field in record_get_field_instances(record, "371"):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if "a" in subfield_dict:
            for code, value in subfields:
                if code == "a":
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(("z", "%i" % ids[0]))

    for field in record_get_field_instances(record, "110"):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if is_institution:
            # We try to resolve obsolete ICNs
            if "x" in subfield_dict:
                for code, value in subfields:
                    if code == "x":
                        ids = get_institution_ids(value)
                        if len(ids) == 1:
                            subfields.append(("z", "%i" % ids[0]))
        else:
            # In other collections institution is in a
            if "a" in subfield_dict:
                for code, value in subfields:
                    if code == "a":
                        ids = get_institution_ids(value)
                        if len(ids) == 1:
                            subfields.append(("z", "%i" % ids[0]))

    # Enhance citation
    for field in record_get_field_instances(record, "999", ind1="C", ind2="5"):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if "0" not in subfield_dict:
            matched_id = get_matched_id(subfields)
            if matched_id:
                subfields.append(("0", str(matched_id)))

    # Enhance CNUMs and Journals
    for field in record_get_field_instances(record, "773"):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == "w":
                # Conference CNUMs
                recids = perform_request_search(p='111__g:"%s"' % value, cc="Conferences")
                if len(recids) == 1:
                    subfields.append(("2", str(recids.pop())))
                recids = perform_request_search(p='773__w:"%s" 980:PROCEEDINGS' % value)
                if recid in recids:
                    # We remove this very record, since it can be a proceedings
                    recids.remove(recid)
                if len(recids) == 1:
                    subfields.append(("0", str(recids.pop())))
            elif code == "p":
                # Journal title
                recids = perform_request_search(p='711__a:"%s"' % value, cc="Journals")
                if len(recids) == 1:
                    subfields.append(("1", str(recids.pop())))
            elif code == "z":
                # ISBN
                recids = find_isbn({"ISBN": value})
                if len(recids) == 1:
                    subfields.append(("0", str(recids.pop())))
            elif code == "r":
                # Report
                recids = perform_request_search(p='reportnumber:"%s"' % value)
                if len(recids) == 1:
                    subfields.append(("0", str(recids.pop())))

    # Enhance Experiments
    for field in record_get_field_instances(record, "693"):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == "e":
                recids = perform_request_search(p='119__a:"%s"' % value, cc="Experiments")
                if len(recids) == 1:
                    subfields.append(("0", str(recids.pop())))

    # Enhance Experiments
    for field in record_get_field_instances(record, "710"):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == "g":
                recids = perform_request_search(p='119__a:"%s"' % value, cc="Experiments")
                if len(recids) == 1:
                    subfields.append(("0", str(recids.pop())))

    # Add Creation date:
    if "961" in record:
        del record["961"]
    creation_date, modification_date = run_sql(
        "SELECT creation_date, modification_date FROM bibrec WHERE id=%s", (recid,)
    )[0]
    record_add_field(
        record,
        "961",
        subfields=[("c", creation_date.strftime("%Y-%m-%d")), ("x", modification_date.strftime("%Y-%m-%d"))],
    )

    formatted_record = record_xml_output(record)
    if oai:
        formatted_record = formatted_record.replace(
            "<record>",
            '<marc:record xmlns:marc="http://www.loc.gov/MARC21/slim" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd" type="Bibliographic">\n     <marc:leader>00000coc  2200000uu 4500</marc:leader>',
        )
        formatted_record = formatted_record.replace(
            '<record xmlns="http://www.loc.gov/MARC21/slim">',
            '<marc:record xmlns:marc="http://www.loc.gov/MARC21/slim" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd" type="Bibliographic">\n     <marc:leader>00000coc  2200000uu 4500</marc:leader>',
        )
        formatted_record = formatted_record.replace("</record", "</marc:record")
        formatted_record = formatted_record.replace("<controlfield", "<marc:controlfield")
        formatted_record = formatted_record.replace("</controlfield", "</marc:controlfield")
        formatted_record = formatted_record.replace("<datafield", "<marc:datafield")
        formatted_record = formatted_record.replace("</datafield", "</marc:datafield")
        formatted_record = formatted_record.replace("<subfield", "<marc:subfield")
        formatted_record = formatted_record.replace("</subfield", "</marc:subfield")
    return formatted_record
def tarballs_by_recids(recids,
                       sdir,
                       docname=None,
                       doctype=None,
                       docformat=None):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids. By default look for files with names matching
    the report number and with source field 'arXiv'. This can be changed
    with C{docname}, C{doctype}, C{docformat}

    @param recids: (string) the record id or ids
    @param sdir: (string) where the tarballs should live
    @param docname: select tarball for given recid(s) that match docname
    @param doctype: select tarball for given recid(s) that match doctype
    @param docformat: select tarball for given recid(s) that match docformat
    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    if not recids:
        return []

    list_of_ids = []

    if ',' in recids:
        recids = recids.split(',')
        for recid in recids:
            if '-' in recid:
                low, high = recid.split('-')
                recid = range(int(low), int(high))
                list_of_ids.extend(recid)
            else:
                recid = int(recid)
                list_of_ids.append(recid)

    else:
        if '-' in recids:
            low, high = recids.split('-')
            list_of_ids = range(int(low), int(high))
        else:
            list_of_ids = [int(recids)]

    arXiv_ids = []
    local_files = []
    for recid in list_of_ids:
        rec = get_record(recid)
        if not doctype and not docname and not docformat:
            for afieldinstance in record_get_field_instances(rec, tag='037'):
                if len(field_get_subfield_values(afieldinstance, '9')) > 0:
                    if 'arXiv' == field_get_subfield_values(
                            afieldinstance, '9')[0]:
                        arXiv_id = field_get_subfield_values(
                            afieldinstance, 'a')[0]
                        arXiv_ids.append(arXiv_id)
        else:
            bibarchive = BibRecDocs(recid)
            all_files = bibarchive.list_latest_files()
            if doctype:
                all_files = [
                    docfile for docfile in all_files
                    if docfile.get_type() == doctype
                ]
            if docname:
                all_files = [
                    docfile for docfile in all_files
                    if docfile.get_name() == docname
                ]
            if docformat:
                all_files = [
                    docfile for docfile in all_files
                    if docfile.get_format() == docformat
                ]
            local_files.extend([(docfile.get_path(), recid)
                                for docfile in all_files])

    if doctype or docname or docformat:
        return local_files

    return tarballs_by_arXiv_id(arXiv_ids, sdir)
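Illustrative calls showing the accepted recid syntaxes ('/tmp/tarballs' is an arbitrary scratch directory):

tarballs_by_recids('1234', '/tmp/tarballs')
tarballs_by_recids('1200-1210,1500', '/tmp/tarballs')
tarballs_by_recids('1234', '/tmp/tarballs', doctype='arXiv', docformat='.tar.gz')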
Example 24
def get_formats(recid):
    doc = BibRecDocs(recid)
    formats = []
    for d in doc.list_latest_files():
        formats.append(d.format)
    return formats
Example 25
def bst_scoap3_importer():
    f = urllib.urlopen('http://repo.scoap3.org/ffts_for_inspire.py/csv')

    fd1, name1 = mkstemp(suffix='.xml', prefix='bibupload_scoap3_', dir=CFG_TMPSHAREDDIR)
    out_update = fdopen(fd1, 'w')
    fd2, name2 = mkstemp(suffix='.xml', prefix='bibupload_scoap3_', dir=CFG_TMPSHAREDDIR)
    out_new = fdopen(fd2, 'w')
    print >> out_update, "<collection>"
    print >> out_new, "<collection>"

    line_count_new = 0  # to avoid empty bibupload
    line_count_update = 0  # to avoid empty bibupload
    for d in f:
        d = [x.strip() for x in d.split(',')]
        print d
        if d[0] not in ["recid", ''] and d[4] != "no pdf":
            inspire_record = perform_request_search(p="037:%s" % (d[1],), cc="HEP")
            try:
                if not len(inspire_record):
                    raise IndexError
                elif len(inspire_record) > 1:
                    raise IndexError
                else:
                    action = 0  # do nothing
                    rec = {}
                    record = BibRecDocs(inspire_record[0])
                    for doc in record.list_latest_files():
                        if doc.format in ('.pdf', '.pdf;pdfa'):
                            if doc.bibdoc.doctype == 'SCOAP3':
                                if doc.checksum == d[3]:
                                    print "File already attached"
                                else:
                                    action = 1  # update
                            else:
                                action = 2  # new

                    if action:
                        if d[5] == '.pdf;pdfa':
                            record_add_field(rec, 'FFT', subfields=[('a', d[4]), ('n', 'scoap3-fulltext'), ('f', '.pdf;pdfa'), ('t', 'SCOAP3'), ('d', 'Article from SCOAP3')])
                        else:
                            record_add_field(rec, 'FFT', subfields=[('a', d[4]), ('n', 'scoap3-fulltext'), ('t', 'SCOAP3'), ('d', 'Article from SCOAP3')])

                        record_add_field(rec, '001', controlfield_value=inspire_record[0])
                    if action == 1:
                        line_count_update = line_count_update + 1
                        print >> out_update, record_xml_output(rec)
                    elif action == 2:
                        line_count_new = line_count_new + 1
                        print >> out_new, record_xml_output(rec)
            except IndexError:
                register_exception(alert_admin=True, prefix="ERROR - PDF import from SCOAP3. No record with: %s" % (d[1],))
                continue
            except:
                register_exception(alert_admin=True, prefix="ERROR - PDF import from SCOAP3.")
                continue

    print >> out_update, "</collection>"
    print >> out_new, "</collection>"
    out_new.close()
    out_update.close()

    if line_count_new:
        task_low_level_submission("bibupload", "admin", "-N", "SCOAP3-import", "-a", name2)
    if line_count_update:
        task_low_level_submission("bibupload", "admin", "-N", "SCOAP3-import", "-c", name1)
def format_element(bfo, oai=0):
    """Produce MARCXML with enhanced fields.

    Adds 100/700 $x with Record ID of linked HepName,
         701/702 $y with True/False if the signature is claimed
                 $z with Record ID of institution
                 $w with BAI of linked Profile
         371/110 $z with Record ID of institution
         119/502 $z with Record ID of institution
         999C5   $0 with on the fly discovered Record IDs (not for books)
         773     $0 with Record ID of corresponding Book or Proceeding or Report
                 $1 with Record ID of corresponding Journal
                 $2 with Record ID of corresponding Conference
         693/710 $0 with Record ID of corresponding experiment
    """
    can_see_hidden_stuff = not acc_authorize_action(bfo.user_info,
                                                    'runbibedit')[0]
    recid = bfo.recID
    if can_see_hidden_stuff and is_record_deleted(bfo):
        record = salvage_deleted_record_from_history(recid)
    else:
        record = bfo.get_record()

    # Let's filter hidden fields
    if can_see_hidden_stuff:
        # Let's add bibdoc info
        bibrecdocs = BibRecDocs(recid)
        for bibdocfile in bibrecdocs.list_latest_files():
            fft = [
                ('a', bibdocfile.fullpath),
                ('d', bibdocfile.description or ''),
                ('f', bibdocfile.format or ''),
                ('n', bibdocfile.name or ''),
                ('r', bibdocfile.status or ''),
                ('s', bibdocfile.cd.strftime('%Y-%m-%d %H:%M:%S')),
                ('t', bibdocfile.bibdoc.doctype),
                ('v', str(bibdocfile.version)),
                ('z', bibdocfile.comment or ''),
            ]
            for flag in bibdocfile.flags:
                fft.append(('o', flag))
            record_add_field(record, 'FFT', subfields=fft)
    else:
        # not authorized
        for tag in CFG_BIBFORMAT_HIDDEN_TAGS:
            if tag in record:
                del record[tag]

    is_institution = 'INSTITUTION' in [
        collection.upper() for collection in bfo.fields('980__a')
    ]

    signatures = {}
    if '100' in record or '700' in record:
        signatures = dict((
            name, (personid, flag)
        ) for name, personid, flag in run_sql(
            "SELECT name, personid, flag FROM aidPERSONIDPAPERS WHERE bibrec=%s AND flag>-2",
            (recid, )))

    # Let's add signatures
    for field in record_get_field_instances(
            record, '100') + record_get_field_instances(
                record, '700') + record_get_field_instances(
                    record, '701') + record_get_field_instances(record, '702'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict:
            author_name = subfield_dict['a']
            personid, flag = signatures.get(author_name, (None, None))
            bai = get_personid_canonical_id().get(personid)
            if bai:
                subfields.append(('w', bai))
                hepname_id = get_hepname_id(personid)
                if hepname_id:
                    subfields.append(('x', '%i' % hepname_id))
                subfields.append(('y', '%i' % (flag == 2)))

        # And matched affiliations
        if 'u' in subfield_dict:
            for code, value in subfields:
                if code == 'u':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))

    # Thesis institution
    for field in record_get_field_instances(record, '502'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'c' in subfield_dict:
            for code, value in subfields:
                if code == 'c':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))

    # Related institution
    for field in record_get_field_instances(record, '510'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict and not '0' in subfield_dict:
            ids = get_institution_ids(subfield_dict['a'])
            if len(ids) == 1:
                subfields.append(('0', '%i' % ids[0]))

    # Related journal
    for field in record_get_field_instances(record, '530'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict and not '0' in subfield_dict:
            ids = get_institution_ids(subfield_dict['a'])
            if len(ids) == 1:
                subfields.append(('0', '%i' % ids[0]))

    # Enhance affiliation in Experiments
    for field in record_get_field_instances(record, '119'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'u' in subfield_dict:
            for code, value in subfields:
                if code == 'u':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))

    # Enhance affiliation in HepNames and Jobs and Institutions and
    # naked affiliations in HEP
    for field in record_get_field_instances(
            record, '371') + record_get_field_instances(record, '902'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict:
            for code, value in subfields:
                if code == 'a':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))

    for field in record_get_field_instances(record, '110'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if is_institution:
            # We try to resolve obsolete ICNs
            if 'x' in subfield_dict:
                for code, value in subfields:
                    if code == 'x':
                        ids = get_institution_ids(value)
                        if len(ids) == 1:
                            subfields.append(('z', '%i' % ids[0]))
        else:
            # In other collections institution is in a
            if 'a' in subfield_dict:
                for code, value in subfields:
                    if code == 'a':
                        ids = get_institution_ids(value)
                        if len(ids) == 1:
                            subfields.append(('z', '%i' % ids[0]))

    # Enhance citation
    for field in record_get_field_instances(record, '999', ind1='C', ind2='5'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if '0' in subfield_dict:
            # Already available recid
            subfields.append(('z', '1'))
        else:
            matched_id = get_matched_id(subfields)
            if matched_id:
                subfields.append(('0', str(matched_id)))

    # Enhance related records
    for field in (
            record_get_field_instances(record, '780', ind1='0', ind2='2') +
            record_get_field_instances(record, '785', ind1='0', ind2='2') +
            record_get_field_instances(record, '787', ind1='0', ind2='8')):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        subfield_citation = []
        if subfield_dict.get('r'):  # Reportnumber
            subfield_citation.append(('r', subfield_dict['r']))
        if subfield_dict.get('z'):  # ISBN
            subfield_citation.append(('i', subfield_dict['z']))
        if 'w' not in subfield_dict and subfield_citation:
            matched_id = get_matched_id(subfield_citation)
            if matched_id:
                subfields.append(('w', str(matched_id)))

    # Enhance CNUMs and Journals
    for field in record_get_field_instances(record, '773'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        for code, value in subfields:
            if code == 'w':
                # Conference CNUMs
                recids = perform_request_search(p='111__g:"%s"' % value,
                                                cc='Conferences')
                if len(recids) == 1:
                    subfields.append(('2', str(recids.pop())))
                if '0' not in subfield_dict:
                    recids = perform_request_search(
                        p='773__w:"%s" 980:PROCEEDINGS' % value)
                    if recid in recids:
                        # We remove this very record, since it can be a proceedings
                        recids.remove(recid)
                    if len(recids) == 1:
                        subfields.append(('0', str(recids.pop())))
            elif code == 'p':
                # Journal title
                recids = perform_request_search(p='711__a:"%s"' % value,
                                                cc='Journals')
                if len(recids) == 1:
                    subfields.append(('1', str(recids.pop())))
            elif code == 'z' and '0' not in subfield_dict:
                # ISBN
                recids = find_isbn({'ISBN': value})
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
            elif code == 'r' and '0' not in subfield_dict:
                # Report
                recids = perform_request_search(p='reportnumber:"%s"' % value)
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))

    # Enhance Experiments
    for field in record_get_field_instances(record, '693'):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == 'e':
                recids = perform_request_search(p='119__a:"%s"' % value,
                                                cc='Experiments')
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
            elif code == 'a':
                recids = perform_request_search(p='119__b:"%s"' % value,
                                                cc='Experiments')
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))

    # Enhance Experiments
    for field in record_get_field_instances(record, '710'):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == 'g':
                recids = perform_request_search(p='119__a:"%s"' % value,
                                                cc='Experiments')
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))

    # Add Creation date:
    if '961' in record:
        del record['961']
    creation_date, modification_date = run_sql(
        "SELECT creation_date, modification_date FROM bibrec WHERE id=%s",
        (recid, ))[0]
    record_add_field(record,
                     '961',
                     subfields=[('x', creation_date.strftime('%Y-%m-%d')),
                                ('c', modification_date.strftime('%Y-%m-%d'))])

    formatted_record = record_xml_output(record)
    if oai:
        formatted_record = formatted_record.replace(
            "<record>",
            "<marc:record xmlns:marc=\"http://www.loc.gov/MARC21/slim\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd\" type=\"Bibliographic\">\n     <marc:leader>00000coc  2200000uu 4500</marc:leader>"
        )
        formatted_record = formatted_record.replace(
            "<record xmlns=\"http://www.loc.gov/MARC21/slim\">",
            "<marc:record xmlns:marc=\"http://www.loc.gov/MARC21/slim\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd\" type=\"Bibliographic\">\n     <marc:leader>00000coc  2200000uu 4500</marc:leader>"
        )
        formatted_record = formatted_record.replace("</record",
                                                    "</marc:record")
        formatted_record = formatted_record.replace("<controlfield",
                                                    "<marc:controlfield")
        formatted_record = formatted_record.replace("</controlfield",
                                                    "</marc:controlfield")
        formatted_record = formatted_record.replace("<datafield",
                                                    "<marc:datafield")
        formatted_record = formatted_record.replace("</datafield",
                                                    "</marc:datafield")
        formatted_record = formatted_record.replace("<subfield",
                                                    "<marc:subfield")
        formatted_record = formatted_record.replace("</subfield",
                                                    "</marc:subfield")
    return formatted_record
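
For illustration, here is a minimal standalone sketch of the namespace wrapping performed above when oai is set: the plain MARCXML tags are rewritten with the marc: prefix and the MARC21 slim namespace header. The sample record is made up; only the replacement logic mirrors the element.

MARC_NS_OPEN = ('<marc:record xmlns:marc="http://www.loc.gov/MARC21/slim" '
                'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
                'xsi:schemaLocation="http://www.loc.gov/MARC21/slim '
                'http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd" '
                'type="Bibliographic">\n'
                '     <marc:leader>00000coc  2200000uu 4500</marc:leader>')

def to_oai_marcxml(xml):
    # Same string-replacement approach as above: swap the opening <record>
    # for the namespaced header, then prefix the remaining MARCXML tags.
    xml = xml.replace("<record>", MARC_NS_OPEN)
    xml = xml.replace("</record", "</marc:record")
    for tag in ("controlfield", "datafield", "subfield"):
        xml = xml.replace("<" + tag, "<marc:" + tag)
        xml = xml.replace("</" + tag, "</marc:" + tag)
    return xml

sample = ('<record><controlfield tag="001">12345</controlfield>'
          '<datafield tag="245" ind1=" " ind2=" ">'
          '<subfield code="a">A made-up title</subfield>'
          '</datafield></record>')
print(to_oai_marcxml(sample))
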
Example 28
def tarballs_by_recids(recids, sdir, docname=None, doctype=None, docformat=None):
    """
    Take a string representing one recid or several and get the associated
    tarballs for those ids. By default look for files with names matching
    the report number and with source field 'arXiv'. This can be changed
    with C{docname}, C{doctype}, C{docformat}

    @param recids: (string) the record id or ids
    @param sdir: (string) where the tarballs should live
    @param docname: select tarball for given recid(s) that match docname
    @param doctype: select tarball for given recid(s) that match doctype
    @param docformat: select tarball for given recid(s) that match docformat
    @return: tarballs ([string, string, ...]): locations of tarballs
    """
    if not recids:
        return []

    list_of_ids = []

    if ',' in recids:
        recids = recids.split(',')
        for recid in recids:
            if '-' in recid:
                low, high = recid.split('-')
                recid = range(int(low), int(high))
                list_of_ids.extend(recid)
            else:
                recid = int(recid)
                list_of_ids.append(recid)

    else:
        if '-' in recids:
            low, high = recids.split('-')
            list_of_ids = range(int(low), int(high))
        else:
            list_of_ids = [int(recids)]

    arXiv_ids = []
    local_files = []
    for recid in list_of_ids:
        rec = get_record(recid)
        if not doctype and not docname and not docformat:
            for afieldinstance in record_get_field_instances(rec, tag='037'):
                if len(field_get_subfield_values(afieldinstance, '9')) > 0:
                    if 'arXiv' == field_get_subfield_values(afieldinstance, '9')[0]:
                        arXiv_id = field_get_subfield_values(afieldinstance, 'a')[0]
                        arXiv_ids.append(arXiv_id)
        else:
            bibarchive = BibRecDocs(recid)
            all_files = bibarchive.list_latest_files()
            if doctype:
                all_files = [docfile for docfile in all_files if
                             docfile.get_type() == doctype]
            if docname:
                all_files = [docfile for docfile in all_files if
                             docfile.get_name() == docname]
            if docformat:
                all_files = [docfile for docfile in all_files if
                             docfile.get_format() == docformat]
            local_files.extend([(docfile.get_path(), recid) for docfile in all_files])

    if doctype or docname or docformat:
        return local_files

    return tarballs_by_arXiv_id(arXiv_ids, sdir)
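
A small standalone sketch of the recid-string formats accepted above (single id, comma-separated list, dash ranges). parse_recids is a hypothetical helper that only mirrors the parsing logic; note that, as in the function, a range such as "5-8" is expanded with range(low, high) and therefore excludes the upper bound.

def parse_recids(recids):
    # Hypothetical standalone helper mirroring the parsing above.
    list_of_ids = []
    for chunk in recids.split(','):
        if '-' in chunk:
            low, high = chunk.split('-')
            # Like the function above, range(low, high) excludes `high`.
            list_of_ids.extend(range(int(low), int(high)))
        else:
            list_of_ids.append(int(chunk))
    return list_of_ids

print(parse_recids("42"))        # [42]
print(parse_recids("10,12,14"))  # [10, 12, 14]
print(parse_recids("5-8,20"))    # [5, 6, 7, 20]
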
Example 29
def bst_scoap3_importer():
    """Import from SCOAP3."""
    try:
        request = requests.get(
            'http://repo.scoap3.org/ffts_for_inspire.py/csv')
    except (HTTPError, ConnectionError, Timeout):
        register_exception()
        return
    task_sleep_now_if_required(can_stop_too=True)

    fd_update, name_update = mkstemp(suffix='.xml',
                                     prefix='bibupload_scoap3_',
                                     dir=CFG_TMPSHAREDDIR)

    out_update = fdopen(fd_update, 'w')
    fd_new, name_new = mkstemp(suffix='.xml',
                               prefix='bibupload_scoap3_',
                               dir=CFG_TMPSHAREDDIR)
    out_new = fdopen(fd_new, 'w')

    print >> out_update, "<collection>"
    print >> out_new, "<collection>"

    line_count_new = 0  # to avoid empty bibupload
    line_count_update = 0  # to avoid empty bibupload

    # We strip the first line.
    for line in request.text.split("\n")[1:]:
        if not line.strip():
            continue
        task_sleep_now_if_required(can_stop_too=True)
        recid, arxiv_id, cr_date, checksum, link, file_format, doi = [
            x.strip() for x in line.split(',')
        ]
        write_message(line.strip())
        if checksum == "None":
            write_message("... no PDF. Skipping")
            continue
        if arxiv_id == "None":
            inspire_record = perform_request_search(p="doi:%s" % (doi, ),
                                                    cc="HEP")
        else:
            inspire_record = perform_request_search(p="037:%s or doi:%s" %
                                                    (arxiv_id, doi),
                                                    cc="HEP")
        if len(inspire_record) > 1:
            write_message(
                "ERROR: more than one INSPIRE record matched %s and %s for SCOAP3 record %s: %s"
                % (arxiv_id, doi, recid, list(inspire_record)),
                stream=sys.stderr)
            continue
        elif not inspire_record:
            write_message(
                "WARNING: no INSPIRE record matched %s or %s for SCOAP3 record %s"
                % (arxiv_id, doi, recid),
                stream=sys.stderr)
            continue
        action = None  # do nothing
        rec = {}
        inspire_record = inspire_record[0]
        record = BibRecDocs(inspire_record)
        for doc in record.list_latest_files('SCOAP3'):
            if doc.format == file_format:
                if doc.checksum == checksum:
                    write_message(
                        "... OK: file alredy attached to INSPIRE record %s (doc.checksum=%s, checksum=%s)"
                        % (inspire_record, doc.checksum, checksum))
                else:
                    write_message(
                        "... OK: new revision available for INSPIRE record %s (doc.checksum=%s, checksum=%s)"
                        % (inspire_record, doc.checksum, checksum))
                    action = "UPDATE"
                break
        else:
            write_message("... OK: need to add new file to INSPIRE record %s" %
                          inspire_record)
            action = "APPEND"
        if action:
            if file_format == '.pdf;pdfa':
                record_add_field(rec,
                                 'FFT',
                                 subfields=[('a', link),
                                            ('n', 'scoap3-fulltext'),
                                            ('f', '.pdf;pdfa'),
                                            ('t', 'SCOAP3'),
                                            ('d', 'Article from SCOAP3')])
            else:
                record_add_field(rec,
                                 'FFT',
                                 subfields=[('a', link),
                                            ('n', 'scoap3-fulltext'),
                                            ('t', 'SCOAP3'),
                                            ('d', 'Article from SCOAP3')])

            record_add_field(rec,
                             '001',
                             controlfield_value=str(inspire_record))
        if action == "UPDATE":
            line_count_update += 1
            print >> out_update, record_xml_output(rec)
        elif action == "APPEND":
            line_count_new += 1
            print >> out_new, record_xml_output(rec)
    print >> out_update, "</collection>"
    print >> out_new, "</collection>"
    out_new.close()
    out_update.close()

    if line_count_new:
        # We use correct here instead of append to deal with potential sync issues.
        # Basically BibUpload should handle "new" corrections as "append" if it is not there.
        id = task_low_level_submission("bibupload", "admin", "-N",
                                       "SCOAP3-import", "-c", name_new)
        write_message("Scheduled bibupload --correct %s with ID #%s" %
                      (name_new, id))
    else:
        remove(name_new)
    if line_count_update:
        id = task_low_level_submission("bibupload", "admin", "-N",
                                       "SCOAP3-import", "-c", name_update)
        write_message("Scheduled bibupload --correct %s with ID #%s" %
                      (name_update, id))
    else:
        remove(name_update)
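
For reference, a sketch of how one line of the SCOAP3 CSV feed is unpacked by the task above. The column order comes from the unpacking in the code; the sample values are invented.

sample_line = ("123456,arXiv:1234.5678,2013-01-01,d41d8cd98f00b204e9800998ecf8427e,"
               "http://repo.scoap3.org/record/123456/files/article.pdf,"
               ".pdf;pdfa,10.1000/example-doi")
recid, arxiv_id, cr_date, checksum, link, file_format, doi = [
    x.strip() for x in sample_line.split(',')
]
print("recid=%s doi=%s format=%s" % (recid, doi, file_format))
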
Example 30
def bst_scoap3_importer():
    """Import from SCOAP3."""
    try:
        request = requests.get('http://repo.scoap3.org/ffts_for_inspire.py/csv')
    except (HTTPError, ConnectionError, Timeout):
        register_exception()
        return
    task_sleep_now_if_required(can_stop_too=True)

    fd_update, name_update = mkstemp(
        suffix='.xml',
        prefix='bibupload_scoap3_',
        dir=CFG_TMPSHAREDDIR
    )

    out_update = fdopen(fd_update, 'w')
    fd_new, name_new = mkstemp(
        suffix='.xml',
        prefix='bibupload_scoap3_',
        dir=CFG_TMPSHAREDDIR
    )
    out_new = fdopen(fd_new, 'w')

    print >> out_update, "<collection>"
    print >> out_new, "<collection>"

    line_count_new = 0  # to avoid empty bibupload
    line_count_update = 0  # to avoid empty bibupload

    # We strip the first line.
    for line in request.text.split("\n")[1:]:
        if not line.strip():
            continue
        task_sleep_now_if_required(can_stop_too=True)
        recid, arxiv_id, cr_date, checksum, link, file_format, doi = [x.strip() for x in line.split(',')]
        write_message(line.strip())
        if checksum == "None":
            write_message("... no PDF. Skipping")
            continue
        if arxiv_id == "None":
            inspire_record = perform_request_search(p="doi:%s" % (doi, ), cc="HEP")
        else:
            inspire_record = perform_request_search(p="037:%s or doi:%s" % (arxiv_id, doi), cc="HEP")
        if len(inspire_record) > 1:
            write_message("ERROR: more than one INSPIRE record matched %s and %s for SCOAP3 record %s: %s" % (arxiv_id, doi, recid, list(inspire_record)), stream=sys.stderr)
            continue
        elif not inspire_record:
            write_message("WARNING: no INSPIRE record matched %s or %s for SCOAP3 record %s" % (arxiv_id, doi, recid), stream=sys.stderr)
            continue
        action = None  # do nothing
        rec = {}
        inspire_record = inspire_record[0]
        record = BibRecDocs(inspire_record)
        for doc in record.list_latest_files():
            if doc.format in ('.pdf', '.pdf;pdfa'):
                if doc.bibdoc.doctype == 'SCOAP3':
                    if doc.checksum == checksum:
                        write_message("... OK: file alredy attached to INSPIRE record %s (doc.checksum=%s, checksum=%s)" % (inspire_record, doc.checksum, checksum))
                    else:
                        write_message("... OK: new revision available for INSPIRE record %s (doc.checksum=%s, checksum=%s)" % (inspire_record, doc.checksum, checksum))
                        action = "UPDATE"
                    break
        else:
            write_message("... OK: need to add new file to INSPIRE record %s" % inspire_record)
            action = "APPEND"
        if action:
            if file_format == '.pdf;pdfa':
                record_add_field(rec, 'FFT', subfields=[('a', link), ('n', 'scoap3-fulltext'), ('f', '.pdf;pdfa'), ('t', 'SCOAP3'), ('d', 'Article from SCOAP3')])
            else:
                record_add_field(rec, 'FFT', subfields=[('a', link), ('n', 'scoap3-fulltext'), ('t', 'SCOAP3'), ('d', 'Article from SCOAP3')])

            record_add_field(rec, '001', controlfield_value=str(inspire_record))
        if action == "UPDATE":
            line_count_update += 1
            print >> out_update, record_xml_output(rec)
        elif action == "APPEND":
            line_count_new += 1
            print >> out_new, record_xml_output(rec)
    print >> out_update, "</collection>"
    print >> out_new, "</collection>"
    out_new.close()
    out_update.close()

    if line_count_new:
        id = task_low_level_submission("bibupload", "admin", "-N", "SCOAP3-import", "-a", name_new)
        write_message("Scheduled bibupload --append %s with ID #%s" % (name_new, id))
    else:
        remove(name_new)
    if line_count_update:
        id = task_low_level_submission("bibupload", "admin", "-N", "SCOAP3-import", "-c", name_update)
        write_message("Scheduled bibupload --correct %s with ID #%s" % (name_new, id))
    else:
        remove(name_update)
def format_element(bfo, oai=0):
    """Produce MARCXML with enhanced fields.

    Adds 100/700 $x with Record ID of linked HepName,
         701/702 $y with True/False if the signature is claimed
                 $z with Record ID of institution
                 $w with BAI of linked Profile
         371/110 $z with Record ID of institution
         119/502 $z with Record ID of institution
         999C5   $0 with on the fly discovered Record IDs (not for books)
         773     $0 with Record ID of corresponding Book or Proceeding or Report
                 $1 with Record ID of corresponding Journal
                 $2 with Record ID of corresponding Conference
         693/710 $0 with Record ID of corresponding experiment
    """
    can_see_hidden_stuff = not acc_authorize_action(bfo.user_info, 'runbibedit')[0]
    recid = bfo.recID
    if can_see_hidden_stuff and is_record_deleted(bfo):
        record = salvage_deleted_record_from_history(recid)
    else:
        record = bfo.get_record()

    # Add bibdoc info for users allowed to see hidden material; otherwise filter hidden fields
    if can_see_hidden_stuff:
        # Let's add bibdoc info
        bibrecdocs = BibRecDocs(recid)
        for bibdocfile in bibrecdocs.list_latest_files():
            fft = [
                ('a', bibdocfile.fullpath),
                ('d', bibdocfile.description or ''),
                ('f', bibdocfile.format or ''),
                ('n', bibdocfile.name or ''),
                ('r', bibdocfile.status or ''),
                ('s', bibdocfile.cd.strftime('%Y-%m-%d %H:%M:%S')),
                ('t', bibdocfile.bibdoc.doctype),
                ('v', str(bibdocfile.version)),
                ('z', bibdocfile.comment or ''),
            ]
            for flag in bibdocfile.flags:
                fft.append(('o', flag))
            record_add_field(record, 'FFT', subfields=fft)
    else:
        # not authorized
        for tag in CFG_BIBFORMAT_HIDDEN_TAGS:
            if tag in record:
                del record[tag]


    is_institution = 'INSTITUTION' in [collection.upper() for collection in bfo.fields('980__a')]

    signatures = {}
    if '100' in record or '700' in record:
        signatures = dict((name, (personid, flag)) for name, personid, flag in run_sql("SELECT name, personid, flag FROM aidPERSONIDPAPERS WHERE bibrec=%s AND flag>-2", (recid, )))

    # Let's add signatures
    for field in record_get_field_instances(record, '100') + record_get_field_instances(record, '700') + record_get_field_instances(record, '701') + record_get_field_instances(record, '702'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict:
            author_name = subfield_dict['a']
            personid, flag = signatures.get(author_name, (None, None))
            bai = get_personid_canonical_id().get(personid)
            if bai:
                subfields.append(('w', bai))
                hepname_id = get_hepname_id(personid)
                if hepname_id:
                    subfields.append(('x', '%i' % hepname_id))
                subfields.append(('y', '%i' % (flag == 2)))

        # And matched affiliations
        if 'u' in subfield_dict:
            for code, value in subfields:
                if code == 'u':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))

    # Thesis institution
    for field in record_get_field_instances(record, '502'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'c' in subfield_dict:
            for code, value in subfields:
                if code == 'c':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))

    # Related institution
    for field in record_get_field_instances(record, '510'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict and '0' not in subfield_dict:
            ids = get_institution_ids(subfield_dict['a'])
            if len(ids) == 1:
                subfields.append(('0', '%i' % ids[0]))

    # Related journal
    for field in record_get_field_instances(record, '530'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict and '0' not in subfield_dict:
            ids = get_institution_ids(subfield_dict['a'])
            if len(ids) == 1:
                subfields.append(('0', '%i' % ids[0]))

    # Enhance affiliation in Experiments
    for field in record_get_field_instances(record, '119'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'u' in subfield_dict:
            for code, value in subfields:
                if code == 'u':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))

    # Enhance affiliation in HepNames and Jobs and Institutions and
    # naked affiliations in HEP
    for field in record_get_field_instances(record, '371') + record_get_field_instances(record, '902'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict:
            for code, value in subfields:
                if code == 'a':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))

    for field in record_get_field_instances(record, '110'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if is_institution:
            # We try to resolve obsolete ICNs
            if 'x' in subfield_dict:
                for code, value in subfields:
                    if code == 'x':
                        ids = get_institution_ids(value)
                        if len(ids) == 1:
                            subfields.append(('z', '%i' % ids[0]))
        else:
            # In other collections institution is in a
            if 'a' in subfield_dict:
                for code, value in subfields:
                    if code == 'a':
                        ids = get_institution_ids(value)
                        if len(ids) == 1:
                            subfields.append(('z', '%i' % ids[0]))

    # Enhance citation
    for field in record_get_field_instances(record, '999', ind1='C', ind2='5'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if '0' in subfield_dict:
            # Already available recid
            subfields.append(('z', '1'))
        else:
            matched_id = get_matched_id(subfields)
            if matched_id:
                subfields.append(('0', str(matched_id)))

    # Enhance related records
    for field in (record_get_field_instances(record, '780', ind1='0', ind2='2') +
                  record_get_field_instances(record, '785', ind1='0', ind2='2') +
                  record_get_field_instances(record, '787', ind1='0', ind2='8')):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        subfield_citation = []
        if subfield_dict.get('r'): # Reportnumber
            subfield_citation.append(('r', subfield_dict['r']))
        if subfield_dict.get('z'): # ISBN
            subfield_citation.append(('i', subfield_dict['z']))
        if 'w' not in subfield_dict and subfield_citation:
            matched_id = get_matched_id(subfield_citation)
            if matched_id:
                subfields.append(('w', str(matched_id)))

    # Enhance CNUMs and Journals
    for field in record_get_field_instances(record, '773'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        for code, value in subfields:
            if code == 'w':
                # Conference CNUMs
                recids = perform_request_search(p='111__g:"%s"' % value, cc='Conferences')
                if len(recids) == 1:
                    subfields.append(('2', str(recids.pop())))
                if '0' not in subfield_dict:
                    recids = perform_request_search(p='773__w:"%s" 980:PROCEEDINGS' % value)
                    if recid in recids:
                        # We remove this very record, since it can be a proceedings
                        recids.remove(recid)
                    if len(recids) == 1:
                        subfields.append(('0', str(recids.pop())))
            elif code == 'p':
                # Journal title
                recids = perform_request_search(p='711__a:"%s"' % value, cc='Journals')
                if len(recids) == 1:
                    subfields.append(('1', str(recids.pop())))
            elif code == 'z' and '0' not in subfield_dict:
                # ISBN
                recids = find_isbn({'ISBN': value})
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
            elif code == 'r' and '0' not in subfield_dict:
                # Report
                recids = perform_request_search(p='reportnumber:"%s"' % value)
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))

    # Enhance Experiments
    for field in record_get_field_instances(record, '693'):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == 'e':
                recids = perform_request_search(p='119__a:"%s"' % value, cc='Experiments')
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
            elif code == 'a':
                recids = perform_request_search(p='119__b:"%s"' % value, cc='Experiments')
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))


    # Enhance Experiments
    for field in record_get_field_instances(record, '710'):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == 'g':
                recids = perform_request_search(p='119__a:"%s"' % value, cc='Experiments')
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))

    # Add Creation date:
    if '961' in record:
        del record['961']
    creation_date, modification_date = run_sql("SELECT creation_date, modification_date FROM bibrec WHERE id=%s", (recid,))[0]
    record_add_field(record, '961', subfields=[('x', creation_date.strftime('%Y-%m-%d')), ('c', modification_date.strftime('%Y-%m-%d'))])

    formatted_record = record_xml_output(record)
    if oai:
        formatted_record = formatted_record.replace("<record>", "<marc:record xmlns:marc=\"http://www.loc.gov/MARC21/slim\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd\" type=\"Bibliographic\">\n     <marc:leader>00000coc  2200000uu 4500</marc:leader>")
        formatted_record = formatted_record.replace("<record xmlns=\"http://www.loc.gov/MARC21/slim\">", "<marc:record xmlns:marc=\"http://www.loc.gov/MARC21/slim\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd\" type=\"Bibliographic\">\n     <marc:leader>00000coc  2200000uu 4500</marc:leader>")
        formatted_record = formatted_record.replace("</record", "</marc:record")
        formatted_record = formatted_record.replace("<controlfield", "<marc:controlfield")
        formatted_record = formatted_record.replace("</controlfield", "</marc:controlfield")
        formatted_record = formatted_record.replace("<datafield", "<marc:datafield")
        formatted_record = formatted_record.replace("</datafield", "</marc:datafield")
        formatted_record = formatted_record.replace("<subfield", "<marc:subfield")
        formatted_record = formatted_record.replace("</subfield", "</marc:subfield")
    return formatted_record
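
A hypothetical way to exercise the element above from an Invenio 1.x Python shell. The BibFormatObject import path and constructor are the usual ones but may differ between versions, and the record id is made up.

# Hypothetical invocation (requires a running Invenio installation).
from invenio.bibformat_engine import BibFormatObject

bfo = BibFormatObject(123456)        # made-up record id
enhanced_xml = format_element(bfo, oai=1)
print(enhanced_xml[:200])            # peek at the marc:-prefixed output
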
Example 32
def _get_fulltext_args_from_recids(recids, task_info):
    """Get list of fulltext locations for input recids
    @param recids: (list) list of recids
    @param task_info: (dict) task information; expected to carry the
        'last_updated' timestamp and the task 'name'
    @return: (list) list of strings of the form 'recid:fulltext file path'
    """
    fulltext_arguments = []
    last_updated = None
    if task_info:
        last_updated = task_info['last_updated']

    if recids:
        if last_updated:
            q_get_outdated = "SELECT id FROM bibrec WHERE id IN (%s) AND " \
                             "modification_date > '%s';" % \
                             (",".join(map(lambda r: str(r), recids)), last_updated)
            ## Get records for reference extraction
            changed_records = run_sql(q_get_outdated)
        else:
            ## Make list of lists of input recids
            changed_records = [[r] for r in recids]
        if changed_records:
            for record_row in changed_records:
                record = record_row[0]
                bibrecdoc = BibRecDocs(record)
                ## Get the latest 'document items' for this record
                bibdocfiles = bibrecdoc.list_latest_files()
                if bibdocfiles:
                    doc_types = {'pdf': [],
                                 'pdfa': [],
                                 'text': []}

                    bibdoc = bibrecdoc.list_bibdocs()
                    ## Get the text file for this record
                    if bibdoc and bibdoc[0].has_text():
                        doc_types['text'].append(bibdoc[0].get_text_path())

                    ## For each file, of a record
                    for doc in bibdocfiles:
                        pipe_gfile = \
                               os.popen("%s '%s'" \
                                        % (CFG_PATH_GFILE, doc.get_full_path().replace("'", "\\'")), "r")
                        res_gfile = pipe_gfile.readline()
                        pipe_gfile.close()

                        ## Look for : 1. Unstamped, original uploaded-by-user, pdf files
                        ## 2. Stamped, processed, pdf files
                        ## 3. Text files
                        if 'pdfa' in res_gfile.lower():
                            doc_types['pdfa'].append(doc.get_full_path())
                        elif 'pdf' in res_gfile.lower():
                            doc_types['pdf'].append(doc.get_full_path())

                    ## Choose the type in this order of priority
                    type_of_choice = doc_types['text'] or doc_types['pdf'] or doc_types['pdfa']
                    if type_of_choice:
                        fulltext_arguments.append(str(record).rstrip(".")+':'+type_of_choice[0])
                    else:
                        write_message("W: No pdf/text file for recid %s" % \
                                      str(record), stream=sys.stdout, verbose=0)
                else:
                    write_message("W: No files exist for recid %s" % \
                                  str(record), stream=sys.stdout, verbose=0)
        elif task_info:
            ## In the event that no records have been modified since the
            ## last reference extraction
            write_message("No newly modified records for extraction-job '%s'." \
                          % task_info['name'], stream=sys.stdout, verbose=0)
    return fulltext_arguments
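
A toy illustration of the selection priority and output format produced above: extracted text is preferred over plain PDF, which is preferred over PDF/A, and each chosen file is emitted as "recid:path". All values below are made up.

doc_types = {'pdf': ['/opt/invenio/files/g0/12345/fulltext.pdf'],
             'pdfa': ['/opt/invenio/files/g0/12345/fulltext.pdf;pdfa'],
             'text': []}
record = 12345

# Same priority as above: text, then pdf, then pdfa.
type_of_choice = doc_types['text'] or doc_types['pdf'] or doc_types['pdfa']
if type_of_choice:
    print("%s:%s" % (record, type_of_choice[0]))   # -> 12345:/opt/invenio/files/g0/12345/fulltext.pdf
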
def retrieve_random_sample(possible_ids, directory):
    """retrieves a sample document from a given set and return the id"""
    while len(possible_ids) > 0:
        recid = possible_ids.pop()
        brd = BibRecDocs(recid)

        # Keep the last attached file whose format is ".pdf" (None if there is none).
        pdf_bibdocfile = None
        for bdf in brd.list_latest_files():
            if bdf.format == ".pdf":
                pdf_bibdocfile = bdf
        if pdf_bibdocfile:
            file_to_save = os.path.join(directory, "%i.pdf" % (recid,))
            f = open(file_to_save, "w")
            f.write(pdf_bibdocfile.get_content())
            f.close()
            return recid
    return None  # there were no more samples
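
A hypothetical usage sketch: draw one sample PDF from a set of candidate record ids and save it into a scratch directory. This needs a running Invenio instance so that BibRecDocs can resolve attached files; the recids below are made up.

import tempfile

candidate_ids = set([1001, 1002, 1003])              # made-up record ids
sample_dir = tempfile.mkdtemp(prefix="sample_pdfs_")
picked = retrieve_random_sample(candidate_ids, sample_dir)
if picked is None:
    print("no candidate record had a PDF attached")
else:
    print("saved %s/%i.pdf" % (sample_dir, picked))
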