Exemple #1
0
 def __init__(self, *args, **kwargs):
     """Initialise the form and populate the knowledge-base driven choices.

     The ``subject`` and ``degree_type`` fields take their choices from the
     knowledge bases named by the ``DEPOSIT_INSPIRE_SUBJECTS_KB`` and
     ``DEPOSIT_INSPIRE_DEGREE_KB`` configuration entries. ``degree_type``
     additionally starts with an empty choice.
     """
     super(LiteratureForm, self).__init__(*args, **kwargs)
     from invenio_knowledge.api import get_kb_mappings

     def _choices(kb_config_key):
         # Each mapping value serves as both elements of the choice pair.
         pairs = []
         for mapping in get_kb_mappings(cfg[kb_config_key]):
             pairs.append((mapping['value'], mapping['value']))
         return pairs

     self.subject.choices = _choices("DEPOSIT_INSPIRE_SUBJECTS_KB")
     self.degree_type.choices = [('', '')] + _choices(
         "DEPOSIT_INSPIRE_DEGREE_KB"
     )
 def prepare_data_cache(self):
     """Build a word -> values index of the knowledge base.

     Every mapping key of the knowledge base returned by ``get_kbname()``
     is split into words by ``clean_and_split_words_and_stem`` (called with
     ``stem_p=True``). Each resulting word maps to the list of distinct
     mapping values whose key produced it, in first-seen order.

     :return: dict mapping each word to a list of values.
     """
     index = {}
     for entry in get_kb_mappings(self.get_kbname()):
         kb_key = entry["key"]
         kb_value = entry["value"]
         for token in clean_and_split_words_and_stem(
                 kb_key, CFG_SITE_LANG, stem_p=True):
             values = index.setdefault(token, [])
             # Keep each value only once per word.
             if kb_value not in values:
                 values.append(kb_value)
     return index
    def _convert_files(obj, eng):
        """Convert the extracted files dated inside the harvest window.

        The ``to_date`` and ``from_date`` arguments are read from
        ``obj.extra_data['args']``, completed with defaults and written
        back. Every file in ``obj.data['extracted_files']`` whose date lies
        within the window is converted through ``WorldScientific``; each
        non-empty result is written into the storage folder. The written
        paths end up in ``obj.data['insert']`` and the folder itself in
        ``obj.data['result_path']``.
        """
        from invenio_knowledge.api import get_kb_mappings
        journal_mappings = {
            item['key']: item['value']
            for item in get_kb_mappings('JOURNALS')
        }
        ws = WorldScientific(journal_mappings)

        target_folder_full = get_storage_path(suffix=target_folder)

        args = obj.extra_data['args']

        # The upper bound defaults to today.
        to_date = args.get("to_date")
        if not to_date:
            to_date = datetime.now().strftime('%Y-%m-%d')

        # The lower bound falls back to the cached date and, as a last
        # resort, to one week ago.
        from_date = args.get("from_date")
        if not from_date:
            from_date = cache.get(date_key)
        if not from_date:
            week_ago = datetime.now() - timedelta(days=7)
            from_date = week_ago.strftime('%Y-%m-%d')

        obj.extra_data['args']["to_date"] = to_date
        obj.extra_data['args']["from_date"] = from_date

        insert_files = []
        for source_path in obj.data['extracted_files']:
            record_date = ws.get_date(source_path)
            if not (from_date <= record_date <= to_date):
                continue
            marc = ws.get_record(source_path)
            if not marc:
                continue
            destination = join(target_folder_full, basename(source_path))
            insert_files.append(destination)
            with open(destination, 'w') as outfile:
                outfile.write(marc)

        summary = "Converted {0} articles between {1} to {2}".format(
            len(insert_files), from_date, to_date
        )
        obj.log.info(summary)

        obj.data['insert'] = insert_files
        obj.data["result_path"] = target_folder_full

        obj.log.debug("Saved converted files to {0}".format(target_folder_full))
        obj.log.debug("{0} files to add".format(len(obj.data["insert"])))
Exemple #4
0
    def process_sip_metadata(cls, deposition, metadata):
        """Turn the submitted form data in *metadata* into a literature record.

        *metadata* is modified in place: it is filtered, converted with
        ``literature.do`` and then enriched with fields computed from the
        original form values (collections, title sources, notes, page count,
        language, acquisition source, references and the journal title looked
        up in the journals knowledge base).

        :param deposition: deposition being processed; its ``user_id`` and
            ``id`` are read.
        :param metadata: form data, replaced in place by the converted record.
        """
        from ..dojson.model import literature

        # Keep the raw form values: the conversion below replaces the
        # content of ``metadata``.
        form_fields = copy.deepcopy(metadata)

        filter_empty_elements(metadata)
        record = literature.do(metadata)
        metadata.clear()
        metadata.update(record)

        # ---------------- Collections ----------------
        metadata['collections'] = [{'primary': "HEP"}]
        if form_fields['type_of_doc'] == 'thesis':
            metadata['collections'].append({'primary': "THESIS"})

        # A subject term with the "arXiv" scheme means an arXiv import.
        imported_from_arxiv = "subject_terms" in metadata and any(
            [term["scheme"] == "arXiv" for term in metadata["subject_terms"]]
        )
        if imported_from_arxiv:
            metadata['collections'].append({'primary': "arXiv"})
            metadata['collections'].append({'primary': "Citeable"})
            # Record arXiv as the source of the first abstract.
            if metadata.get("abstracts"):
                metadata['abstracts'][0]['source'] = 'arXiv'
            arxiv_id = form_fields.get("arxiv_id")
            if arxiv_id:
                metadata['external_system_numbers'] = [{
                    'value': 'oai:arXiv.org:' + arxiv_id,
                    'institute': 'arXiv'
                }]

        if "publication_info" in metadata:
            metadata['collections'].append({'primary': "Published"})

        # ---------------- Title source ----------------
        title_source = form_fields.get('title_source')
        if title_source:
            metadata['titles'][0]['source'] = title_source

        # ---------------- Title from arXiv ----------------
        arxiv_title = form_fields.get('title_arXiv')
        if arxiv_title:
            already_listed = any(
                entry['title'] == arxiv_title for entry in metadata['titles']
            )
            if not already_listed:
                metadata['titles'].append({
                    'title': arxiv_title,
                    'source': 'arXiv'
                })

        # ---------------- Conference name ----------------
        if 'conf_name' in form_fields:
            metadata.setdefault("hidden_notes", []).append({
                "value": form_fields['conf_name']
            })
            if 'nonpublic_note' in form_fields:
                metadata['hidden_notes'].append({
                    'value': form_fields['nonpublic_note']
                })
            metadata['collections'].append({'primary': "ConferencePaper"})

        # ---------------- Page range ----------------
        page_artid = None
        if 'page_nr' not in metadata:
            page_artid = metadata.get("publication_info", {}).get("page_artid")
        if page_artid:
            bounds = page_artid.split('-')
            if len(bounds) == 2:
                try:
                    metadata['page_nr'] = int(bounds[1]) - int(bounds[0]) + 1
                except ValueError:
                    # Non-numeric page identifiers: leave the count unset.
                    pass

        # ---------------- Language ----------------
        if metadata.get("language") == "oth" and \
                form_fields.get("other_language"):
            metadata["language"] = form_fields["other_language"]

        # ---------------- arXiv category in report number ----------------
        if metadata.get("_categories"):
            del metadata["_categories"]

        # ---------------- Date of defense ----------------
        defense_date = form_fields.get('defense_date')
        if defense_date:
            metadata.setdefault("public_notes", []).append({
                'value': 'Presented on ' + defense_date
            })

        # ---------------- Owner info ----------------
        owner_id = deposition.user_id
        owner = UserInfo(owner_id)
        email = owner.info.get('email', '')
        try:
            source = UserEXT.query.filter_by(
                id_user=owner_id, method='orcid'
            ).one()
        except NoResultFound:
            source = ''
        if source:
            source = source.method + ':' + source.id
        metadata['acquisition_source'] = {
            'source': source,
            'email': email,
            'method': "submission",
            'submission_number': deposition.id,
        }

        # ---------------- References ----------------
        references = form_fields.get('references')
        if references:
            metadata['references'] = references

        # ---------------- Extra comments ----------------
        extra_comments = form_fields.get('extra_comments')
        if extra_comments:
            metadata.setdefault('hidden_notes', []).append({
                'value': extra_comments,
                'source': 'submitter'
            })

        # ---------------- Journal name Knowledge Base conversion ----------
        if metadata.get("publication_info", {}).get("journal_title"):
            kb_name = cfg.get("DEPOSIT_INSPIRE_JOURNALS_KB")
            journals_kb = {
                mapping['key'].lower(): mapping['value']
                for mapping in get_kb_mappings(kb_name)
            }
            journal_title = metadata['publication_info']['journal_title']
            # Fall back to the submitted title when the KB has no entry.
            metadata['publication_info']['journal_title'] = journals_kb.get(
                journal_title.lower(), journal_title
            )

        # Store a single publication info as a one-element list.
        publication_info = metadata.get("publication_info")
        if publication_info and not isinstance(publication_info, list):
            metadata["publication_info"] = [publication_info]
    def _convert_files(obj, eng):
        """Convert the extracted files dated inside the harvest window.

        The ``to_date`` and ``from_date`` arguments are read from
        ``obj.extra_data['args']``, completed with defaults and written
        back as ``'%Y-%m-%d'`` strings. Depending on the ``reharvest``
        argument, either all or only the newly extracted files are
        considered. A file is converted through ``WorldScientific`` when its
        date is unknown (``None``) or lies within the window; each non-empty
        result is written into the storage folder. The written paths end up
        in ``obj.data['insert']`` and the folder in
        ``obj.data['result_path']``.
        """
        from invenio_knowledge.api import get_kb_mappings
        mappings = {
            item['key']: item['value']
            for item in get_kb_mappings('JOURNALS')
        }
        ws = WorldScientific(mappings)

        target_folder_full = get_storage_path(suffix=target_folder)

        args = obj.extra_data['args']

        # By default, the to date is today.
        to_date = args.get("to_date") or datetime.now().strftime('%Y-%m-%d')

        from_date = args.get("from_date")

        if not from_date:
            if args.get("reharvest"):
                # Since the "beginning" of time when not specified. Kept as
                # a string so it has the same type as to_date in the range
                # check below.
                from_date = "1900-01-01"
            else:
                # Dynamic date in the past when not specified and not
                # reharvesting. The parentheses matter: strftime must be
                # called on the resulting datetime, not on the timedelta.
                from_date = (
                    datetime.now() - timedelta(weeks=weeks_threshold)
                ).strftime('%Y-%m-%d')

        obj.extra_data['args']["to_date"] = to_date
        obj.extra_data['args']["from_date"] = from_date

        insert_files = []
        if args.get("reharvest"):
            filenames = obj.data['all_extracted_files']
        else:
            filenames = obj.data['newly_extracted_files']

        for filename in filenames:
            date = ws.get_date(filename)
            # Files without a date are never filtered out.
            if date is None or (from_date <= date <= to_date):
                marc = ws.get_record(filename)
                if marc:
                    target = join(target_folder_full, basename(filename))
                    insert_files.append(target)
                    with open(target, 'w') as outfile:
                        outfile.write(marc)
            else:
                obj.log.info("Filtered out {0} ({1})".format(filename, date))

        obj.log.info("Converted {0}/{1} articles between {2} to {3}".format(
            len(insert_files),
            len(filenames),
            from_date,
            to_date
        ))

        obj.data['insert'] = insert_files
        obj.data["result_path"] = target_folder_full

        obj.log.debug("Saved converted files to {0}".format(target_folder_full))
        obj.log.debug("{0} files to add".format(
            len(obj.data["insert"]),
        ))
def get_value(kb_name, list_of_keys):
    """Return the value of the first key that has a mapping in *kb_name*.

    Keys are tried in the order given. ``None`` is returned when none of
    them is registered in the knowledge base.
    """
    for candidate in list_of_keys:
        if not kb_mapping_exists(kb_name, candidate):
            continue
        matches = get_kb_mappings(kb_name=kb_name, key=candidate)
        return matches[0].get("value")
    return None
    def process_sip_metadata(cls, deposition, metadata):
        """Turn the submitted form data in *metadata* into a literature record.

        *metadata* is modified in place: it is filtered, converted with
        ``literature.do`` and then enriched with fields computed from the
        original form values (collections, title clean-up and sources,
        notes, page count, languages, acquisition source, references, extra
        comments and the journal title looked up in the journals knowledge
        base).

        A record without any title is tolerated: imported arXiv/CrossRef
        titles are then added as the only titles, and no default source is
        set when there is no title at all.

        :param deposition: deposition being processed; its ``user_id`` and
            ``id`` are read.
        :param metadata: form data, replaced in place by the converted record.
        """
        from ..dojson.model import literature
        # Keep the raw form values: the conversion below replaces the
        # content of ``metadata``.
        form_fields = copy.deepcopy(metadata)

        filter_empty_elements(metadata)
        converted = literature.do(metadata)
        metadata.clear()
        metadata.update(converted)

        # Add extra fields that need to be computed or depend on other
        # fields.
        #
        # ============================
        # Collection
        # ============================
        metadata['collections'] = [{'primary': "HEP"}]
        if form_fields['type_of_doc'] == 'thesis':
            metadata['collections'].append({'primary': "THESIS"})
        if "subject_terms" in metadata:
            # Check if it was imported from arXiv
            if any(x["scheme"] == "arXiv" for x in metadata["subject_terms"]):
                metadata['collections'].extend([{'primary': "arXiv"},
                                                {'primary': "Citeable"}])
                # Add arXiv as source
                if metadata.get("abstracts"):
                    metadata['abstracts'][0]['source'] = 'arXiv'
                if form_fields.get("arxiv_id"):
                    metadata['external_system_numbers'] = [{
                        'value': 'oai:arXiv.org:' + form_fields['arxiv_id'],
                        'institute': 'arXiv'
                    }]
        if "publication_info" in metadata:
            required_keys = ('year', 'journal_issue', 'journal_volume',
                             'page_artid')
            if all(key in metadata['publication_info']
                   for key in required_keys):
                # NOTE: Only peer reviewed journals should have this collection
                # we are adding it here but ideally should be manually added
                # by a curator.
                metadata['collections'].append({'primary': "Published"})
                # Add Citeable collection if not present
                collections = [x['primary'] for x in metadata['collections']]
                if "Citeable" not in collections:
                    metadata['collections'].append({'primary': "Citeable"})
        # ============================
        # Title source and cleanup
        # ============================
        try:
            # Clean up all extra spaces in title
            metadata['titles'][0]['title'] = " ".join(
                metadata['titles'][0]['title'].split()
            )
            title = metadata['titles'][0]['title']
        except (KeyError, IndexError):
            # No usable primary title; the code below must not assume
            # that metadata['titles'][0] exists.
            title = ""
        imported_titles = (
            ('title_arXiv', "arXiv"),
            ('title_crossref', "CrossRef"),
        )
        for field_name, source_name in imported_titles:
            if not form_fields.get(field_name):
                continue
            imported_title = " ".join(form_fields.get(field_name).split())
            if metadata.get('titles') and title == imported_title:
                # Same as the primary title: just record where it came from.
                metadata['titles'][0]["source"] = source_name
            else:
                metadata.setdefault('titles', []).append({
                    'title': imported_title,
                    'source': source_name
                })
        if metadata.get('titles') and 'source' not in metadata['titles'][0]:
            # Title has no source, so should be the submitter
            metadata['titles'][0]['source'] = "submitter"

        # ============================
        # Conference name
        # ============================
        if 'conf_name' in form_fields:
            metadata.setdefault("hidden_notes", []).append({
                "value": form_fields['conf_name']
            })
            if 'nonpublic_note' in form_fields:
                metadata['hidden_notes'].append({
                    'value': form_fields['nonpublic_note']
                })
            metadata['collections'].extend([{'primary': "ConferencePaper"}])

        # ============================
        # Page range
        # ============================
        if 'page_nr' not in metadata:
            if metadata.get("publication_info", {}).get("page_artid"):
                pages = metadata['publication_info']['page_artid'].split('-')
                if len(pages) == 2:
                    try:
                        metadata['page_nr'] = int(pages[1]) - int(pages[0]) + 1
                    except ValueError:
                        # Non-numeric page identifiers: leave the count unset.
                        pass
        # ============================
        # Language
        # ============================
        if metadata.get("languages", []) and metadata["languages"][0] == "oth":
            if form_fields.get("other_language"):
                metadata["languages"] = [form_fields["other_language"]]

        # ===============================
        # arXiv category in report number
        # ===============================
        if metadata.get("_categories"):
            del metadata["_categories"]

        # ============================
        # Date of defense
        # ============================
        if form_fields.get('defense_date'):
            defense_note = {
                'value': 'Presented on ' + form_fields['defense_date']
            }
            metadata.setdefault("public_notes", []).append(defense_note)
        # ==========
        # Owner Info
        # ==========
        userid = deposition.user_id
        user = UserInfo(userid)
        email = user.info.get('email', '')
        try:
            source = UserEXT.query.filter_by(id_user=userid, method='orcid').one()
        except NoResultFound:
            source = ''
        if source:
            source = source.method + ':' + source.id
        metadata['acquisition_source'] = dict(
            source=source,
            email=email,
            method="submission",
            submission_number=deposition.id,
        )
        # ==============
        # References
        # ==============
        if form_fields.get('references'):
            metadata['references'] = form_fields.get('references')
        # ==============
        # Extra comments
        # ==============
        if form_fields.get('extra_comments'):
            metadata.setdefault('hidden_notes', []).append(
                {
                    'value': form_fields['extra_comments'],
                    'source': 'submitter'
                }
            )
            metadata["extra_comments"] = form_fields.get("extra_comments")
        # ======================================
        # Journal name Knowledge Base conversion
        # ======================================
        if metadata.get("publication_info", {}).get("journal_title"):
            journals_kb = {
                x['key'].lower(): x['value']
                for x in get_kb_mappings(cfg.get("DEPOSIT_INSPIRE_JOURNALS_KB"))
            }
            journal_title = metadata['publication_info']['journal_title']
            # Fall back to the submitted title when the KB has no entry.
            metadata['publication_info']['journal_title'] = journals_kb.get(
                journal_title.lower(), journal_title
            )
        # Store a single publication info as a one-element list.
        if metadata.get("publication_info") and not isinstance(metadata['publication_info'], list):
            metadata["publication_info"] = [metadata['publication_info']]