def check_records(records):
    """Rebuild the author fields of Springer records from their XML files.

    For each record accepted by ``is_springer``, every latest attached file
    whose format is ``.xml`` is read and parsed: with ``JATSParser`` when the
    content mentions the JATS DTD, with ``NLMParser`` otherwise.  When parsing
    succeeds, the existing ``100`` and ``700`` fields are dropped and replaced
    by the parsed authors (first author in ``100__``, all others in
    ``700__``).  When parsing fails, a warning is attached to the record and
    its current author fields are left untouched.

    @param records: iterable of record objects exposing ``record_id``,
        ``warn``, ``add_field``, ``amended`` and item deletion by tag.
    """
    for record in records:
        if not is_springer(record):
            continue
        rec_doc = BibRecDocs(int(record.record_id))
        for doc in rec_doc.list_latest_files():
            if doc.get_format() != '.xml':
                continue

            # Context manager: the handle is closed even if reading fails.
            with open(doc.get_full_path()) as xml_file:
                content = xml_file.read()

            # Parse *before* touching the record, so that an unparsable file
            # does not leave the record stripped of its authors.
            try:
                if "-//NLM//DTD JATS" in content:
                    parser = JATSParser()
                else:
                    parser = NLMParser()
                authors = parser.get_authors(parseString(content))
            except Exception:
                record.warn('Problem with parsing XML.')
                continue

            # Drop each author tag independently: a record without '100'
            # must still lose its stale '700' fields.
            for tag in ('100', '700'):
                try:
                    del record[tag]
                except Exception:
                    # Best effort: the tag is presumably absent.
                    continue
                record.amended = True

            for position, author in enumerate(authors):
                if author.get('surname'):
                    display_name = '%s, %s' % (
                        author.get('surname'),
                        author.get('given_name')
                        or author.get('initials', ''))
                else:
                    display_name = '%s' % (author.get('name', ''))
                subfields = [('a', display_name)]

                if 'orcid' in author:
                    subfields.append(('j', author['orcid']))
                if 'affiliation' in author:
                    for aff in author["affiliation"]:
                        subfields.append(('v', aff))

                add_nations_field(subfields)

                if author.get('email'):
                    subfields.append(('m', author['email']))

                # Only the very first author goes to the main-entry field.
                if position == 0:
                    field_tag = '100__'
                else:
                    field_tag = '700__'
                record.add_field(field_tag, value='', subfields=subfields)
Esempio n. 2
0
def get_rawtext_from_record_id(record_id):
    """Return the text of the latest PDF attached to a record.

    @param record_id: record identifier handed to ``BibRecDocs``
    @return: the result of ``get_text()`` on the document selected by
        ``get_latest_pdf``, or ``''`` when that text cannot be obtained
    """
    bibrec = BibRecDocs(record_id)
    bibdoc = get_latest_pdf(bibrec.list_latest_files())
    try:
        return bibdoc.bibdoc.get_text()
    except Exception:
        # Best effort: any failure to reach or extract the text yields an
        # empty string, but interpreter-exit signals are no longer swallowed.
        return ''
Esempio n. 3
0
def format_element(bfo, width="", caption="yes", max="-1"):
    """
    Display image of the plot if we are in selected plots collections

    Every latest file of type "Plot" is rendered as an <img> wrapped in a
    link to the record's plots page.  The file description is expected to
    start with a 5-character numeric index followed by the caption text;
    files whose description does not fit that layout are skipped.

    @param width: the width of the returned image (Eg: '100px')
    @param caption: display the captions or not? (not used by this body)
    @param max: the maximum number of plots to display (-1 is all plots)
    @return: HTML snippet, or '' when there is no plot to display
    """
    ## To achieve this, we take the pngs associated with this document

    limit = int(max)

    bibarchive = BibRecDocs(bfo.recID)

    if width != "":
        width = 'width="%s"' % width

    indexed_links = []
    for doc in bibarchive.list_bibdocs():
        for _file in doc.list_latest_files():
            if _file.get_type() != "Plot":
                continue

            try:
                description = _file.get_description()
                caption_text = description[5:]
                index = int(description[:5])
                img_location = _file.get_url()
            except Exception:
                # FIXME: we have hit probably a plot context file,
                # so ignore this document; but it would be safer
                # to check subformat type, so that we don't mask
                # other eventual errors here.
                continue

            # NOTE(review): caption and URL are interpolated unescaped into
            # HTML attributes -- confirm they are trusted or escape them.
            img = '<img src="%s" title="%s" %s/>' % \
                  (img_location, caption_text, width)

            link = create_html_link(urlbase='%s/record/%s/plots#%d' %
                                            (CFG_SITE_URL, bfo.recID, index),
                                    urlargd={},
                                    link_label=img)

            indexed_links.append((index, link))

    # Stable sort on the numeric index only.
    indexed_links.sort(key=lambda item: item[0])
    if limit > 0:
        indexed_links = indexed_links[:limit]

    if not indexed_links:
        return ''

    links = [link for (_index, link) in indexed_links]

    # "width:100%" (the original "width=100%" is not a valid CSS declaration).
    return '<div style="overflow-x:scroll;width:100%;white-space:nowrap">' +\
           " ".join(links) + '</div>'
Esempio n. 4
0
def get_filetypes(recid):
    """
        Return the file types of the latest files attached to a record.

        @param recid: recid of a record
        @return: list holding one ``_get_filetype`` result per latest file
    """
    filetypes = []
    for docfile in BibRecDocs(recid).list_latest_files():
        filetypes.append(_get_filetype(docfile.format))
    return filetypes
Esempio n. 5
0
def Add_Files(parameters, curdir, form, user_info=None):
    """DEPRECATED: Use FFT instead.

    Attach to the record ``sysno`` every entry of ``<curdir>/files`` that the
    record does not already hold, as a "Main" document.  Always returns "".
    """
    files_dir = "%s/files" % curdir
    if not os.path.exists(files_dir):
        return ""

    bibrecdocs = BibRecDocs(sysno)
    for entry in os.listdir(files_dir):
        fullpath = "%s/files/%s" % (curdir, entry)
        if bibrecdocs.check_file_exists(fullpath):
            # Already attached to the record.
            continue
        bibrecdocs.add_new_file(fullpath, "Main", never_fail=True)
    return ""
Esempio n. 6
0
def get_pdf(recid):
    """Find a PDF among the documents of a record.

    For each document, the plain ``.pdf`` format is tried before
    ``.pdf;pdfa``; the first hit wins.

    @param recid: record identifier
    @return: ``(checksum, url, format)`` of the file found, or
        ``(None, None, None)`` when no document holds either format
    """
    for bibdoc in BibRecDocs(recid).list_bibdocs():
        for docformat in (".pdf", ".pdf;pdfa"):
            if bibdoc.format_already_exists_p(docformat):
                docfile = bibdoc.get_file(docformat)
                return docfile.checksum, docfile.url, docformat
    return None, None, None
Esempio n. 7
0
def list_pdfs(recid):
    """Yield the PDF files of every document attached to a record.

    For each document the formats 'pdf', 'pdfa' and 'PDF' are requested in
    that order; formats for which ``get_file`` raises
    ``InvenioBibDocFileError`` are silently skipped.

    @param recid: record identifier
    """
    wanted_formats = ('pdf', 'pdfa', 'PDF')
    for bibdoc in BibRecDocs(recid).list_bibdocs():
        for docformat in wanted_formats:
            try:
                docfile = bibdoc.get_file(docformat)
            except InvenioBibDocFileError:
                continue
            yield docfile
Esempio n. 8
0
def dump_bibdoc(recid, from_date, **kwargs):
    """Dump the metadata of the files attached to a record.

    Every version of every BibDoc of the record is visited; icons and files
    whose modification date is older than ``from_date`` are left out.

    :param recid: Record ID whose BibDocs are dumped.
    :param from_date: ``'%Y-%m-%d %H:%M:%S'`` string; only files modified at
        or after this date are dumped.

    :returns: List of dicts, one per dumped file.
    """
    BibRecDocs, BibDoc = _import_bibdoc()

    threshold = datetime.datetime.strptime(from_date, '%Y-%m-%d %H:%M:%S')
    dumped = []

    for bibdoc in BibRecDocs(recid).list_bibdocs():
        for version in bibdoc.list_versions():
            for docfile in bibdoc.list_version_files(version):
                if docfile.is_icon() or docfile.md < threshold:
                    # Icons and files untouched since from_date are skipped.
                    continue
                dumped.append({
                    'bibdocid': docfile.get_bibdocid(),
                    'checksum': docfile.get_checksum(),
                    'comment': docfile.get_comment(),
                    # Not every file object offers get_copyright().
                    'copyright': (docfile.get_copyright()
                                  if hasattr(docfile, 'get_copyright')
                                  else None),
                    'creation_date': datetime_toutc(docfile.cd).isoformat(),
                    'description': docfile.get_description(),
                    'encoding': docfile.encoding,
                    'etag': docfile.etag,
                    'flags': docfile.flags,
                    'format': docfile.get_format(),
                    'full_name': docfile.get_full_name(),
                    'full_path': docfile.get_full_path(),
                    'hidden': docfile.hidden,
                    # Not every file object offers get_license().
                    'license': (docfile.get_license()
                                if hasattr(docfile, 'get_license')
                                else None),
                    'modification_date':
                        datetime_toutc(docfile.md).isoformat(),
                    'name': docfile.get_name(),
                    'mime': docfile.mime,
                    'path': docfile.get_path(),
                    'recid': docfile.get_recid(),
                    'recids_doctype': docfile.recids_doctypes,
                    'size': docfile.get_size(),
                    'status': docfile.get_status(),
                    'subformat': docfile.get_subformat(),
                    'superformat': docfile.get_superformat(),
                    'type': docfile.get_type(),
                    'url': docfile.get_url(),
                    'version': docfile.get_version(),
                })

    return dumped
Esempio n. 9
0
def get_pdf_snippets(recID, patterns):

    """
    Extract text snippets around 'patterns' from the newest PDF file of 'recID'
    The search is case-insensitive.
    The snippets are meant to look like in the results of the popular search
    engine: using " ... " between snippets.
    For empty patterns it returns ""

    The number of characters and the number of snippets are looked up in
    CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS and CFG_WEBSEARCH_FULLTEXT_SNIPPETS
    under the "courtesy" of the document (its status, or on INSPIRE its
    doctype), falling back to the entry stored under the '' key.

    @param recID: record ID to consider
    @param patterns: list of patterns to retrieve
    @return: snippet
    """
    from invenio.bibdocfile import BibRecDocs

    text_path = ""
    text_path_courtesy = ""
    for bd in BibRecDocs(recID).list_bibdocs():
        if bd.get_text():
            text_path = bd.get_text_path()
            text_path_courtesy = bd.get_status()
            if CFG_INSPIRE_SITE and not text_path_courtesy:
                # get courtesy from doctype, since docstatus was empty:
                text_path_courtesy = bd.get_type()
                if text_path_courtesy == 'INSPIRE-PUBLIC':
                    # but ignore 'INSPIRE-PUBLIC' doctype
                    text_path_courtesy = ''
            break # stop at the first good PDF textable file

    # Defaults live under the '' key of each setting.  The snippet count
    # default must come from CFG_WEBSEARCH_FULLTEXT_SNIPPETS, not from the
    # per-snippet character setting.
    nb_chars = CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS.get('', 0)
    max_snippets = CFG_WEBSEARCH_FULLTEXT_SNIPPETS.get('', 0)
    if text_path_courtesy in CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS:
        nb_chars = CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS[text_path_courtesy]
    if text_path_courtesy in CFG_WEBSEARCH_FULLTEXT_SNIPPETS:
        max_snippets = CFG_WEBSEARCH_FULLTEXT_SNIPPETS[text_path_courtesy]

    if not (text_path and nb_chars and max_snippets):
        return ""

    out = get_text_snippets(text_path, patterns, nb_chars, max_snippets)
    if not out:
        # no hit, so check stemmed versions:
        from invenio.bibindex_engine_stemmer import stem
        stemmed_patterns = [stem(p, 'en') for p in patterns]
        out = get_text_snippets(text_path, stemmed_patterns, nb_chars, max_snippets)

    if not out:
        return ""

    out_courtesy = ""
    if text_path_courtesy:
        out_courtesy = '<strong>Snippets courtesy of ' + text_path_courtesy + '</strong><br>'
    return """<div class="snippetbox">%s%s</div>""" % (out_courtesy, out)
def fix_recid(recid, logfile):
    """Fix a given recid."""
    print "Upgrading record %s ->" % recid,
    print >> logfile, "Upgrading record %s:" % recid

    bibrec = BibRecDocs(recid)
    print >> logfile, bibrec
    docnames = bibrec.get_bibdoc_names()
    try:
        for docname in docnames:
            print docname,
            new_bibdocs = bibrec.fix(docname)
            new_bibdocnames = [bibdoc.get_docname() for bibdoc in new_bibdocs]
            if new_bibdocnames:
                print "(created bibdocs: '%s')" % "', '".join(new_bibdocnames),
                print >> logfile, "(created bibdocs: '%s')" % "', '".join(new_bibdocnames)
    except InvenioWebSubmitFileError, e:
        print >> logfile, BibRecDocs(recid)
        print "%s -> ERROR", e
        return False
        def look_for_fulltext(recid):
            """Look for fulltext pdf (bibdocfile) for a given recid"""
            rec_info = BibRecDocs(recid)
            docs = rec_info.list_bibdocs()

            for doc in docs:
                for d in doc.list_all_files():
                    if d.get_format().strip('.') in ['pdf', 'pdfa', 'PDF']:
                        try:
                            yield doc, d
                        except InvenioBibDocFileError:
                            pass
Esempio n. 12
0
def Add_Files(parameters, curdir, form, user_info=None):
    """DEPRECATED: Use FFT instead.

    Attach to the record ``sysno`` every entry of ``<curdir>/files`` that the
    record does not already hold (checked by path and dot-prefixed
    extension), as a "Main" document.  Always returns "".
    """
    files_dir = "%s/files" % curdir
    if not os.path.exists(files_dir):
        return ""

    bibrecdocs = BibRecDocs(sysno)
    for entry in os.listdir(files_dir):
        fullpath = "%s/files/%s" % (curdir, entry)
        _dirname, _basename, extension = decompose_file(entry)
        # Make sure a non-empty extension carries its leading dot.
        if extension and not extension.startswith("."):
            extension = '.' + extension
        if bibrecdocs.check_file_exists(fullpath, extension):
            continue
        bibrecdocs.add_new_file(fullpath, "Main", never_fail=True)
    return ""
Esempio n. 13
0
def bst_fix_ffts(debug=0):
    """Rebuild and upload the FFT of every broken record.

    For each record returned by ``get_broken_recids`` an FFT is built from
    its last PDF.  Records for which this fails are reported through
    ``register_exception`` and skipped.  The collected FFTs are uploaded in
    one go at the end.

    @param debug: flag (anything ``int()`` accepts) forwarded as ``do_debug``
        to ``bibupload_ffts``
    @return: True
    """
    debug = bool(int(debug))
    ffts = {}
    for recid in get_broken_recids():
        task_sleep_now_if_required(can_stop_too=True)
        write_message("Fixing %s" % recid)
        try:
            ffts[recid] = build_fft(get_last_pdf_for_record(BibRecDocs(recid)))
        except Exception:
            # Keep going with the other records, but let interpreter-exit
            # signals (SystemExit, KeyboardInterrupt) propagate.
            register_exception(alert_admin=True)
    write_message("Uploading corrections")
    bibupload_ffts(ffts, append=True, do_debug=debug, interactive=False)
    return True
Esempio n. 14
0
    def get_pdfa_record(self, path=None):
        """Build the MARCXML that attaches the package PDF to its record.

        The article found in ``path`` is matched to an existing record by
        DOI.  A matching record is referenced through its 001 field;
        otherwise the DOI is written in a 0247 field and an error is logged.
        The PDF/A (``main_a-2b.pdf``) is preferred, with ``main.pdf`` as a
        fallback; a package with neither is reported through
        ``register_exception``.  Files already attached to the matched
        record, other than its ``.pdf;pdfa``, are added as FFT fields too.

        @param path: directory of the package
        @return: the record serialized by ``record_xml_output``
        """
        from invenio.search_engine import perform_request_search
        xml_doc = self.get_article(path)
        rec = create_record()
        # Only the last of the nine returned values, the DOI, is needed here.
        (_, _, _, _, _, _, _, _,
         doi) = self.get_publication_information(xml_doc)
        matches = perform_request_search(
            p='0247_a:"%s" AND NOT 980:"DELETED"' % (doi, ))
        if not matches:
            record_add_field(rec,
                             '024',
                             ind1='7',
                             subfields=[('a', doi), ('2', 'DOI')])
            self.logger.error(('Adding PDF/A. No paper with this DOI: '
                               '%s. Trying to add it anyway.') % (doi, ))
        else:
            record_add_field(rec, '001', controlfield_value=matches[0])

        pdfa_path = join(path, 'main_a-2b.pdf')
        try:
            if exists(pdfa_path):
                pdfa_subfields = [('a', pdfa_path), ('n', 'main'),
                                  ('f', '.pdf;pdfa')]
                record_add_field(rec, 'FFT', subfields=pdfa_subfields)
                self.logger.debug('Adding PDF/A to record: %s' % (doi, ))
            elif exists(join(path, 'main.pdf')):
                record_add_field(rec,
                                 'FFT',
                                 subfields=[('a', join(path, 'main.pdf'))])
                self.logger.debug('No PDF/A in VTEX package for record: ' +
                                  doi)
            else:
                # Raised and caught right below so that register_exception
                # runs while an exception is being handled.
                raise MissingFFTError(
                    "Record %s doesn't contain PDF file." % (doi, ))
        except MissingFFTError:
            message = "Elsevier paper: %s is missing PDF." % (doi, )
            register_exception(alert_admin=True, prefix=message)
            self.logger.warning(message)

        ## copy other formats to bibupload file
        if matches:
            from invenio.bibdocfile import BibRecDocs
            for bibfile in BibRecDocs(matches[0]).list_latest_files():
                if bibfile.get_format() == '.pdf;pdfa':
                    continue
                record_add_field(rec,
                                 'FFT',
                                 subfields=[('a', bibfile.get_full_path()),
                                            ('n', bibfile.get_name()),
                                            ('f', bibfile.get_format())])
        return record_xml_output(rec)
    def test_BibRecDocs(self):
        """bibdocfile - BibRecDocs functions"""
        # Exercises BibRecDocs on record 2: adding, merging and inspecting
        # documents.  The statements depend on each other's side effects, so
        # their order matters.
        from invenio.bibdocfile import BibRecDocs
        my_bibrecdoc = BibRecDocs(2)
        #add bibdoc
        my_bibrecdoc.add_new_file(CFG_PREFIX + '/lib/webtest/invenio/test.jpg',
                                  'Main', 'img_test', False,
                                  'test add new file', 'test', '.jpg')
        my_bibrecdoc.add_bibdoc(doctype='Main',
                                docname='file',
                                never_fail=False)
        self.assertEqual(len(my_bibrecdoc.list_bibdocs()), 3)
        my_added_bibdoc = my_bibrecdoc.get_bibdoc('file')
        #add bibdocfile in empty bibdoc
        my_added_bibdoc.add_file_new_version(CFG_PREFIX + '/lib/webtest/invenio/test.gif', \
        description= 'added in empty bibdoc', comment=None, docformat=None, flags=['PERFORM_HIDE_PREVIOUS'])
        #propose unique docname
        self.assertEqual(my_bibrecdoc.propose_unique_docname('file'), 'file_2')
        #has docname
        self.assertEqual(my_bibrecdoc.has_docname_p('file'), True)
        #merge 2 bibdocs
        my_bibrecdoc.merge_bibdocs('img_test', 'file')
        self.assertEqual(
            len(my_bibrecdoc.get_bibdoc("img_test").list_all_files()), 2)
        #check file exists
        self.assertEqual(
            my_bibrecdoc.check_file_exists(
                CFG_PREFIX + '/lib/webtest/invenio/test.jpg', '.jpg'), True)
        #get bibdoc names
        # we can not rely on the order !
        main_names = my_bibrecdoc.get_bibdoc_names('Main')
        names = set(main_names[:2])
        self.assertTrue('0104007_02' in names)
        self.assertTrue('img_test' in names)

        #get total size
        self.assertEqual(my_bibrecdoc.get_total_size(), 1647591)
        #get total size latest version
        self.assertEqual(my_bibrecdoc.get_total_size_latest_version(), 1647591)
        #display
        #value = my_bibrecdoc.display(docname='img_test', version='', doctype='', ln='en', verbose=0, display_hidden=True)
        #self.assert_("<small><b>Main</b>" in value)
        #get xml 8564
        value = my_bibrecdoc.get_xml_8564()
        # assertTrue instead of the deprecated assert_ alias, matching the
        # assertions used above.
        self.assertTrue('/' + CFG_SITE_RECORD +
                        '/2/files/img_test.jpg</subfield>' in value)
        #check duplicate docnames
        self.assertEqual(my_bibrecdoc.check_duplicate_docnames(), True)
Esempio n. 16
0
def generate_keywords(req, recid, store_keywords=True):
    """Classify the PDFs of a record and optionally store the keywords.

    @param req: request-like object; a progress message is written to it
    @param recid: ID of the record whose latest files are classified
    @param store_keywords: when true and keywords were found, a MARCXML file
        ``bibclassifyd_<timestamp>.xml`` is written under
        ``CFG_TMPDIR/bibclassify``
    @return: list of ``[concept, count]`` pairs gathered over all the PDFs
    """
    req.write(
        "Please be patient while the keywords classification is running...")

    bibdocfiles = BibRecDocs(recid).list_latest_files()

    keywords = []
    for doc in bibdocfiles:
        # Get the keywords for each PDF document contained in the record.
        fulltext = doc.get_full_path()
        if not is_pdf(fulltext):
            continue
        from invenio.bibclassify_engine import get_keywords_from_local_file
        single_keywords, composite_keywords = get_keywords_from_local_file(
            fulltext, taxonomy='HEP', with_author_keywords=True)

        for keyword, spans in single_keywords.items():
            keywords.append([keyword.concept, len(spans)])
        for keyword, num, components in composite_keywords:
            keywords.append([keyword.concept, num])

    if keywords and store_keywords:
        output = [
            '<collection><record>\n'
            '<controlfield tag="001">%s</controlfield>' % recid
        ]

        # NOTE(review): single_keywords/composite_keywords hold the result of
        # the *last* PDF only, while ``keywords`` accumulates every PDF --
        # confirm whether the stored MARC should cover all of them.
        output.append(
            output_marc(single_keywords,
                        composite_keywords,
                        spires=False,
                        taxonomy='HEP'))

        output.append('</record></collection>')

        tmp_directory = "%s/bibclassify" % CFG_TMPDIR
        filename = "bibclassifyd_%s.xml" % time.strftime(
            "%Y%m%d%H%M%S", time.localtime())
        abs_path = os.path.join(tmp_directory, filename)

        if not os.path.isdir(tmp_directory):
            os.mkdir(tmp_directory)

        # Context manager: the file is closed even when the write fails.
        with open(abs_path, "w") as file_desc:
            file_desc.write('\n'.join(output))

        #cmd = "%s/bibupload -n -c '%s' " % (CFG_BINDIR, abs_path)
        #os.system(cmd)

    return keywords
Esempio n. 17
0
def filter_fulltexts(recids):
    """ Return the requested records that have at least one visible file.

    @param recids: anything ``dict()`` accepts; its keys are the record IDs
        to keep
    @return: list of ``(recid, timestamp)`` pairs, where the timestamp is the
        latest document modification date of the record, but never earlier
        than ``get_minimum_timestamp()``"""
    wanted = dict(recids)
    floor_timestamp = get_minimum_timestamp()
    query = """SELECT id_bibrec, max(modification_date)
                FROM bibrec_bibdoc
                LEFT JOIN bibdoc ON bibrec_bibdoc.id_bibdoc=bibdoc.id
                GROUP BY id_bibrec"""
    kept = []
    for (recid, lastmod) in run_sql(query):
        if recid not in wanted:
            continue
        if not BibRecDocs(recid).list_latest_files(list_hidden=False):
            # No visible file: not a fulltext record.
            continue
        kept.append((recid, max(lastmod, floor_timestamp)))
    return kept
Esempio n. 18
0
def format_element(bfo):
    """
    Prints buttons to download all photos for each size

    One button is produced for every entry of
    CFG_BIBDOCFILE_SUBFORMATS_TRANSLATIONS whose total size for the record is
    not zero.

    @return: the HTML snippet, or None when the record holds fewer than two
        documents or when no button would be shown
    """
    current_bibrecdoc = BibRecDocs(bfo.recID)
    if len(current_bibrecdoc.bibdocs) < 2:
        # If we have less than 2 photos, there is no point in displaying the
        # "Download all" buttons
        return None
    wrapper = '''<style>
                #downloadallphotos {
                    clear: both;
                    font-size: small;
                    color: #555444;
                    margin-left: 10px;
                }
                #downloadallphotos a {
                    border-radius: 5px;
                    box-shadow: 1px 1px 1px 1px #CCCCCC;
                    color: #222222;
                    display: inline-block;
                    margin: 2px 5px;
                    padding: 3px;
                    text-decoration: none;
                    background-color: #E6E6FA;
                }
                #downloadallphotos a:hover {
                    background: -moz-linear-gradient(center top , #3A3A3A 0%, #7D7E7D 100%) repeat scroll 0 0 rgba(0, 0, 0, 0);
                    color: #fff;
                }
                </style>'''
    wrapper += '''<div id="downloadallphotos">Download all pictures:'''
    buttons = []
    for (size, internal_size) in CFG_BIBDOCFILE_SUBFORMATS_TRANSLATIONS:
        total = current_bibrecdoc.get_total_size_latest_version(
            bfo.user_info, internal_size)
        # don't display the button if the size will be 0
        if not total:
            continue
        if size == 'original':
            original_attr = 'data-size="Original"'
        else:
            original_attr = ''
        buttons.append(
            '<a %(original)s href="%(site)s/record/%(recID)s/files/allfiles-%(size)s">%(size)s (%(total)s)</a>'
            % {'original': original_attr,
               'site': CFG_SITE_URL,
               'recID': bfo.recID,
               'size': size,
               'total': nice_size(total)})
    # If there are no buttons to display, don't display the rest of the HTML
    if not buttons:
        return None
    # Close the <div> opened in the wrapper so that the markup following this
    # element is not swallowed by #downloadallphotos.
    return wrapper + ''.join(buttons) + '</div>'
def get_pdf_snippets(recID,
                     patterns,
                     nb_words_around=CFG_WEBSEARCH_FULLTEXT_SNIPPETS_WORDS,
                     max_snippets=CFG_WEBSEARCH_FULLTEXT_SNIPPETS):
    """
    Return an HTML box of text snippets matching 'patterns', taken from the
    first document of 'recID' that has extracted text.

    When the patterns give no hit, their English stems are tried.  The
    snippets are prefixed by a "courtesy" line when the document has a status
    (or, on INSPIRE, a doctype other than 'INSPIRE-PUBLIC').

    Returns "" when no text is available or nothing matches.
    """
    from invenio.bibdocfile import BibRecDocs

    text_path = ""
    courtesy = ""
    for bibdoc in BibRecDocs(recID).list_bibdocs():
        if not bibdoc.get_text():
            continue
        # First document with extracted text: use it and stop looking.
        text_path = bibdoc.get_text_path()
        courtesy = bibdoc.get_status()
        if CFG_INSPIRE_SITE and not courtesy:
            # Empty docstatus: fall back on the doctype ...
            courtesy = bibdoc.get_type()
            if courtesy == 'INSPIRE-PUBLIC':
                # ... unless it is the plain public doctype.
                courtesy = ''
        break

    if not text_path:
        return ""

    snippets = get_text_snippets(text_path, patterns, nb_words_around,
                                 max_snippets)
    if not snippets:
        # Nothing found with the raw patterns: retry with their stems.
        from invenio.bibindex_engine_stemmer import stem
        stemmed_patterns = [stem(p, 'en') for p in patterns]
        snippets = get_text_snippets(text_path, stemmed_patterns,
                                     nb_words_around, max_snippets, False)
    if not snippets:
        return ""

    courtesy_header = ""
    if courtesy:
        courtesy_header = '<strong>Snippets courtesy of ' + courtesy + '</strong><br>'
    return """<div class="snippetbox">%s%s</div>""" % (courtesy_header,
                                                       snippets)
Esempio n. 20
0
def _analyze_documents(records, ontology, collection):
    """Generate the keyword MARCXML of the PDFs attached to some records.

    Each record yields a <record> element holding its 001 controlfield and
    the keywords output of each of its PDF files.  The module-level _INDEX
    counter and the task progress are updated after every record.

    Returns the joined XML string, or False when 'records' is empty."""
    global _INDEX

    if not records:
        # Nothing to process for this collection.
        write_message("WARNING: No record were found in collection %s." %
                      collection,
                      stream=sys.stderr,
                      verbose=2)
        return False

    xml_parts = []
    for recid in records:
        latest_files = BibRecDocs(recid).list_latest_files()
        xml_parts.extend([
            '<record>',
            '<controlfield tag="001">%s</controlfield>' % recid,
        ])
        for docfile in latest_files:
            fulltext = docfile.get_full_path()
            if not is_pdf(fulltext):
                # Only PDF documents are classified.
                continue
            write_message('INFO: Generating keywords for record %d.' %
                          recid,
                          stream=sys.stderr,
                          verbose=3)
            marcxml = output_keywords_for_local_file(
                fulltext,
                taxonomy=ontology,
                output_mode="marcxml",
                output_limit=3,
                match_mode="partial",
                with_author_keywords=True,
                verbose=task_get_option('verbose'))
            xml_parts.append(marcxml)

        _INDEX += 1

        xml_parts.append('</record>')

        task_update_progress('Done %d out of %d.' % (_INDEX, _RECIDS_NUMBER))
        task_sleep_now_if_required(can_stop_too=False)

    return '\n'.join(xml_parts)
Esempio n. 21
0
def get_filenames(recid):
    """
        Returns names of the files associated with specific record
        and their derivatives. Takes as a parameter the recid of a
        record.

        Example:
        input: recID 999 (record with files ['thesis.ps.gz', 'random.pdf'])
        output: ['thesis.ps.gz', 'thesis.ps', 'thesis',
                 'random.pdf', 'random']
        @param recid: recid of a record
        @return: flat list of names; empty when the record has no file
    """
    docs = BibRecDocs(recid)
    names = []
    # Accumulate explicitly: reduce() without an initial value raises
    # TypeError on a record that has no file at all.
    for docfile in docs.list_latest_files():
        names.extend(_get_filenames(docfile.name + docfile.format))
    return names
def upload_fulltext(recid, path):
    '''
        Attach the file found at 'path' to the given record.

        The document name is the part of the file name before its first dot;
        the format is what follows the last dot of the path, cut at the first
        ';'.

        @param recid: id of the record
        @param path: uploaded document to store
        @return: always ''
    '''
    basename = path.rsplit('/', 1)[-1]
    docname = basename.split('.', 1)[0]
    last_suffix = path.rsplit('.', 1)[-1]
    docformat = last_suffix.split(';', 1)[0]

    BibRecDocs(recid).add_new_file(path,
                                   CFG_DOCTYPE_UPLOAD_COLLECTION,
                                   docname,
                                   format=docformat)

    return ''
Esempio n. 23
0
def look_for_fulltext(recid):
    """Return the full path of a PDF attached to the record.

    Every document of the record is inspected, trying the formats 'pdf',
    'pdfa' and 'PDF' in that order.  The path found in the last document
    that has one is returned; False when no document has any.
    """
    found_path = False
    for bibdoc in BibRecDocs(recid).list_bibdocs():
        for docformat in ('pdf', 'pdfa', 'PDF'):
            try:
                found_path = bibdoc.get_file(docformat).get_full_path()
            except InvenioWebSubmitFileError:
                # This format is not available: try the next one.
                continue
            # First available format wins for this document.
            break

    return found_path
Esempio n. 24
0
def solr_add_range(lower_recid, upper_recid):
    """
    Adds the regarding field values of all records from the lower recid to the upper one to Solr.
    It preserves the fulltext information.

    Each value is extracted on a best-effort basis: when it is missing or
    cannot be converted to unicode, an empty string is indexed in its place.
    The Solr connection is committed once, after the whole range.

    @param lower_recid: first record ID of the range (inclusive)
    @param upper_recid: last record ID of the range (inclusive)
    """
    for recid in range(lower_recid, upper_recid + 1):
        if not record_exists(recid):
            continue

        # ``except Exception`` keeps the best-effort behaviour without
        # swallowing SystemExit / KeyboardInterrupt.
        try:
            abstract = unicode(
                get_fieldvalues(recid, CFG_MARC_ABSTRACT)[0], 'utf-8')
        except Exception:
            abstract = ""

        try:
            first_author = get_fieldvalues(recid, CFG_MARC_AUTHOR_NAME)[0]
            # Every additional author is prefixed by a single space.
            additional_authors = "".join(
                " " + name for name in
                get_fieldvalues(recid, CFG_MARC_ADDITIONAL_AUTHOR_NAME))
            author = unicode(first_author + " " + additional_authors,
                             'utf-8')
        except Exception:
            author = ""

        try:
            bibrecdocs = BibRecDocs(recid)
            fulltext = unicode(bibrecdocs.get_text(), 'utf-8')
        except Exception:
            fulltext = ""

        try:
            keyword = unicode(
                "".join(" " + value for value in
                        get_fieldvalues(recid, CFG_MARC_KEYWORD)),
                'utf-8')
        except Exception:
            keyword = ""

        try:
            title = unicode(
                get_fieldvalues(recid, CFG_MARC_TITLE)[0], 'utf-8')
        except Exception:
            title = ""

        solr_add(recid, abstract, author, fulltext, keyword, title)

    SOLR_CONNECTION.commit()
    task_sleep_now_if_required(can_stop_too=True)
def uncook_files(webdeposit_json, recid=None, json_reader=None):
    """Append file descriptions to ``webdeposit_json['files']``.

    Without a ``recid`` the names come from the last path segment of each
    ``json_reader['url']`` entry; with a ``recid`` the name, path and size of
    every latest file of the record are used.  The 'files' list is created
    when missing.

    @return: the same ``webdeposit_json`` object, updated in place
    """
    files = webdeposit_json.setdefault('files', [])

    if recid is not None:
        for docfile in BibRecDocs(recid,
                                  human_readable=True).list_latest_files():
            files.append({'name': docfile.get_full_name(),
                          'file': docfile.get_path(),
                          'size': docfile.get_size()})
    else:
        for entry in json_reader['url']:
            files.append({'name': entry['url'].split('/')[-1]})

    return webdeposit_json
Esempio n. 26
0
def hide_old_ffts():
    """Flag the files of all old revisions of each record's first document.

    For every record returned by an empty-pattern search, each format found
    in each revision older than the latest one of the record's first BibDoc
    gets the flag ``CFG_BIBDOCFILE_AVAILABLE_FLAGS[3]`` (presumably the
    "hidden" flag -- confirm against the configuration).  Records without
    any document are skipped.
    """
    ids = perform_request_search(p="", of='intbitset')
    for one_id in ids:
        bibdocs = BibRecDocs(one_id).list_bibdocs()
        if not bibdocs:
            # No attached document: nothing to hide (indexing [0] on the
            # empty list would abort the whole run with IndexError).
            continue
        bibdoc = bibdocs[0]
        latest_rev = bibdoc.get_latest_version()

        # Revisions are numbered from 1; the latest one stays untouched.
        for revision in range(1, latest_rev):
            rev_file_types = []
            for docfile in bibdoc.list_version_files(revision):
                if docfile.format not in rev_file_types:
                    rev_file_types.append(docfile.format)
            for file_type in rev_file_types:
                write_message("Record %s: hiding format %s in revision %s" %
                              (one_id, file_type, revision))
                bibdoc.set_flag(CFG_BIBDOCFILE_AVAILABLE_FLAGS[3], file_type,
                                revision)
Esempio n. 27
0
def get_preferred_posterframe_url(recid, icon_p=True):
    """
    Returns the URL of the posterframe that might have been manually
    uploaded for this record.

    Only the first document of doctype 'posterframe' is considered.

    @param recid: current record ID
    @param icon_p: if True, return icon version (if exists). Else return original image
    @return: URL of the preferred posterframe, or None if it does not exist
    """
    posterframe_bibdocs = BibRecDocs(recid).list_bibdocs(doctype='posterframe')
    if not posterframe_bibdocs:
        return None

    posterframe = posterframe_bibdocs[0]
    if icon_p:
        # NOTE(review): assumes get_icon() returns None when the document
        # has no icon -- confirm against the bibdocfile API.
        icon = posterframe.get_icon()
        if icon is not None:
            return icon.get_url()
        # No icon available: fall through to the original image rather
        # than failing on None.get_url().

    for bibdoc_file in posterframe.list_latest_files():
        if not bibdoc_file.is_icon():
            return bibdoc_file.get_url()

    return None
Esempio n. 28
0
def solr_add_range(lower_recid, upper_recid, tags_to_index,
                   next_commit_counter):
    """
    Adds the field values of all existing records from the lower recid to
    the upper one (both included) to Solr.

    For each record the abstract, author, keyword, title and fulltext are
    gathered and handed to solr_add; solr_commit_if_necessary is then
    called with the running commit counter.

    @param lower_recid: first record ID of the range
    @param upper_recid: last record ID of the range (included)
    @param tags_to_index: passed to get_field_content_in_utf8 for the
        abstract, author, keyword and title lookups
    @param next_commit_counter: counter passed to, and replaced by the
        result of, solr_commit_if_necessary after each added record
    @return: the last value returned by solr_commit_if_necessary, or the
        unchanged next_commit_counter when no record in the range exists
    """
    for recid in range(lower_recid, upper_recid + 1):
        if record_exists(recid):
            # NOTE(review): of these four lookups only keyword and title
            # reach solr_add unchanged; abstract and author are always
            # reassigned by the try/except blocks below.
            abstract = get_field_content_in_utf8(recid, 'abstract',
                                                 tags_to_index)
            author = get_field_content_in_utf8(recid, 'author', tags_to_index)
            keyword = get_field_content_in_utf8(recid, 'keyword',
                                                tags_to_index)
            title = get_field_content_in_utf8(recid, 'title', tags_to_index)
            # NOTE(review): the fulltext assigned in this block is always
            # overwritten further down, and any failure here (including in
            # BibRecDocs/get_text) resets the abstract to "" -- confirm
            # whether this coupling is intended.
            try:
                bibrecdocs = BibRecDocs(recid)
                fulltext = unicode(bibrecdocs.get_text(), 'utf-8')
                abstract = unicode(
                    get_fieldvalues(recid, CFG_MARC_ABSTRACT)[0], 'utf-8')
            except:
                abstract = ""
            # Author string: first author followed by the space-joined
            # additional authors; empty when any lookup or decoding fails
            # (e.g. no first author value).
            try:
                first_author = get_fieldvalues(recid, CFG_MARC_AUTHOR_NAME)[0]
                additional_authors = reduce(
                    lambda x, y: x + " " + y,
                    get_fieldvalues(recid, CFG_MARC_ADDITIONAL_AUTHOR_NAME),
                    '')
                author = unicode(first_author + " " + additional_authors,
                                 'utf-8')
            except:
                author = ""
            # This is the fulltext value actually sent to Solr.
            try:
                fulltext = unicode(get_entire_fulltext(recid), 'utf-8')
            except:
                fulltext = ''

            solr_add(recid, abstract, author, fulltext, keyword, title)
            next_commit_counter = solr_commit_if_necessary(next_commit_counter,
                                                           recid=recid)

    return next_commit_counter
Esempio n. 29
0
def migrate_bibdoc_status(recid, is_public, access_right):
    """
    Set a firerole status on every bibdoc of a public record, derived
    from the record's access right.

    @param recid: ID of the record whose documents are updated
    @param is_public: when false, no status is set at all
    @param access_right: 'open', 'embargoed', 'closed' or 'restricted';
        any other value leaves the documents untouched
    """
    from invenio.search_engine import get_fieldvalues
    from invenio.bibdocfile import BibRecDocs

    if not is_public:
        # No firerole is generated for non-public records.
        return

    # Submitter e-mail; looked up for every public record, whatever the
    # access right turns out to be.
    submitter = get_fieldvalues(recid, "8560_f")[0]

    if access_right == 'open':
        # Access to everyone
        rules = ['allow any']
    elif access_right == 'embargoed':
        # Access to submitter, deny everyone else until the embargo date,
        # then allow all
        embargo_date = get_fieldvalues(recid, "942__a")[0]
        rules = [
            'allow email "%s"' % submitter,
            'deny until "%s"' % embargo_date,
            'allow any',
        ]
    elif access_right in ('closed', 'restricted'):
        # Access to submitter, deny everyone else
        rules = [
            'allow email "%s"' % submitter,
            'deny all',
        ]
    else:
        # Unknown access right: no rules, hence nothing to set.
        return

    status = "firerole: %s" % "\n".join(rules)
    for bibdoc in BibRecDocs(recid).list_bibdocs():
        bibdoc.set_status(status)
def check_records(records):
    """
    Align the 980 'c' subfield of Hindawi records with the subject found
    in their attached XML file.

    Only records whose 980 'b' value is "Hindawi" are processed.  The
    subject is read from the "xml" file of the record's first bibdoc.
    For editorial-like subjects the upper-cased subject is written to the
    first 980__c field when it already holds one of the known values, and
    is otherwise added as a 'c' subfield to the first 980 field.  Records
    whose document or XML file cannot be obtained get a warning and are
    skipped.

    @param records: iterable of records to check
    @raise Exception: when the subject is neither editorial-like nor one
        of "Review Article", "Research Article", "Retraction"
    """
    for record in records:
        ## Stupid hack because bibcheck filters does not work as expected
        if record_get_field_value(record, '980', code='b') == "Hindawi":
            record.warn("Working on this record")
            recdoc = BibRecDocs(int(record.record_id))
            try:
                # The first-document lookup lives inside the handler so
                # that a record with no document is reported, not fatal.
                doc = recdoc.get_bibdoc(recdoc.get_bibdoc_names()[0])
                xml_file = open(doc.get_file("xml").get_full_path())
            except Exception:
                record.warn("No document can be found")
                continue
            # Release the file handle as soon as its content is read.
            try:
                xml_content = xml_file.read()
            finally:
                xml_file.close()
            xml2 = xml.dom.minidom.parseString(xml_content)
            subject = get_value_in_tag(xml2, "subject")
            if subject in [
                    "Editorial", "Erratum", "Corrigendum", "Addendum",
                    "Letter to the Editor"
            ]:
                field = record_get_field_value(record, '980', code='c')
                # An empty/missing value is never in the list, so it takes
                # the "add subfield" path like any unknown value.
                if field in [
                        'ERRATUM', 'ADDENDUM', 'EDITORIAL', 'CORRIGENDUM',
                        'LETTER TO THE EDITOR'
                ]:
                    for position, value in record.iterfield('980__c'):
                        record.amend_field(position, subject.upper())
                        break
                else:
                    for position, value in record.iterfield('980__%'):
                        record.add_subfield(position, 'c', subject.upper())
                        break
            elif subject not in [
                    "Review Article", "Research Article", "Retraction"
            ]:
                raise Exception(
                    "This subject: %s does not exit in SCOAP3 system" %
                    (subject, ))