def __init__(self, *args, **kwargs):
    """Constructor: load subject and degree-type choices from knowledge bases."""
    super(LiteratureForm, self).__init__(*args, **kwargs)
    from invenio_knowledge.api import get_kb_mappings

    subject_choices = [
        (mapping['value'], mapping['value'])
        for mapping in get_kb_mappings(cfg["DEPOSIT_INSPIRE_SUBJECTS_KB"])
    ]
    self.subject.choices = subject_choices

    degree_choices = [
        (mapping['value'], mapping['value'])
        for mapping in get_kb_mappings(cfg["DEPOSIT_INSPIRE_DEGREE_KB"])
    ]
    # Degree type gets a leading blank option so the field can start empty.
    self.degree_type.choices = [('', '')] + degree_choices
class JournalForm(WTFormDefault):
    """Journal Form: journal name plus volume and page fields."""

    # The choice list is wrapped in a LocalProxy so the knowledge base is
    # queried lazily at request time instead of once at import time.
    name = SelectField(
        label='',
        choices=LocalProxy(
            lambda: [('', _('Any journal'))] + [
                (entry['key'], entry['value'])
                for entry in get_kb_mappings('EJOURNALS')
            ]
        ),
        coerce=unicode,
    )
    vol = StringField(_('Vol'))
    page = StringField(_('Pg'))
def prepare_data_cache(self):
    """*Index* knowledge base and cache it."""
    cache = {}
    for mapping in get_kb_mappings(self.get_kbname()):
        value = mapping["value"]
        stemmed_words = clean_and_split_words_and_stem(
            mapping["key"], CFG_SITE_LANG, stem_p=True)
        for word in stemmed_words:
            # One bucket of values per stemmed word, without duplicates.
            bucket = cache.setdefault(word, [])
            if value not in bucket:
                bucket.append(value)
    return cache
def _convert_files(obj, eng):
    """Convert extracted World Scientific files to MARC within a date window.

    Reads harvest arguments from ``obj.extra_data['args']``, converts every
    file in ``obj.data['extracted_files']`` whose date falls in the window,
    writes the MARC output under the target folder, and records the written
    paths in ``obj.data['insert']``.

    :param obj: workflow object carrying ``data``, ``extra_data`` and ``log``.
    :param eng: workflow engine (unused here; required by the task signature).
    """
    from invenio_knowledge.api import get_kb_mappings
    # Journal-name mapping (key -> value) used by the converter.
    mappings = dict(
        map(
            lambda item: (item['key'], item['value']),
            get_kb_mappings('JOURNALS')
        )
    )
    ws = WorldScientific(mappings)
    # NOTE(review): ``target_folder``, ``cache`` and ``date_key`` are
    # module-level names not visible in this chunk — presumably the harvest
    # output folder and a cached last-harvest date; confirm at module scope.
    target_folder_full = get_storage_path(suffix=target_folder)
    args = obj.extra_data['args']
    # By default, we set the to date as today
    to_date = args.get("to_date") or datetime.now().strftime('%Y-%m-%d')
    # By last resort, we set the from date a week before
    from_date = args.get("from_date") or cache.get(date_key) \
        or (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
    # Persist the effective window back into the workflow arguments.
    obj.extra_data['args']["to_date"] = to_date
    obj.extra_data['args']["from_date"] = from_date
    insert_files = []
    filenames = obj.data['extracted_files']
    for filename in filenames:
        date = ws.get_date(filename)
        # ISO-formatted date strings compare correctly lexicographically.
        if from_date <= date <= to_date:
            marc = ws.get_record(filename)
            if marc:
                # Rebase the file name into the target folder and write out.
                filename = basename(filename)
                filename = join(target_folder_full, filename)
                insert_files.append(filename)
                with open(filename, 'w') as outfile:
                    outfile.write(marc)
    obj.log.info("Converted {0} articles between {1} to {2}".format(
        len(insert_files), from_date, to_date
    ))
    obj.data['insert'] = insert_files
    obj.data["result_path"] = target_folder_full
    obj.log.debug("Saved converted files to {0}".format(target_folder_full))
    obj.log.debug("{0} files to add".format(
        len(obj.data["insert"]),
    ))
def prepare_data_cache(self):
    """*Index* knowledge base and cache it."""
    index = {}
    for entry in get_kb_mappings(self.get_kbname()):
        value = entry['value']
        tokens = clean_and_split_words_and_stem(entry['key'],
                                                CFG_SITE_LANG,
                                                stem_p=True)
        for token in tokens:
            if token not in index:
                index[token] = []
            # Avoid duplicate values under the same stemmed token.
            if value not in index[token]:
                index[token].append(value)
    return index
def process_sip_metadata(cls, deposition, metadata):
    """Convert submitted form fields into the final SIP record metadata.

    ``metadata`` arrives holding the raw form fields; it is converted in
    place through the ``literature`` dojson model and then enriched with
    computed fields (collections, titles, notes, acquisition source, ...).

    :param deposition: deposition object providing ``user_id`` and ``id``.
    :param metadata: dict of submitted form fields; mutated in place.
    """
    from ..dojson.model import literature
    # Keep a copy of the raw form input: several enrichments below still
    # need the original form values after ``metadata`` has been converted.
    form_fields = copy.deepcopy(metadata)
    filter_empty_elements(metadata)
    converted = literature.do(metadata)
    metadata.clear()
    metadata.update(converted)
    # Add extra fields that need to be computed or depend on other
    # fields.
    #
    # ============================
    # Collection
    # ============================
    metadata['collections'] = [{'primary': "HEP"}]
    if form_fields['type_of_doc'] == 'thesis':
        metadata['collections'].append({'primary': "THESIS"})
    if "subject_terms" in metadata:
        # Check if it was imported from arXiv
        if any([x["scheme"] == "arXiv" for x in metadata["subject_terms"]]):
            metadata['collections'].extend([{'primary': "arXiv"},
                                            {'primary': "Citeable"}])
            # Add arXiv as source
            if metadata.get("abstracts"):
                metadata['abstracts'][0]['source'] = 'arXiv'
            if form_fields.get("arxiv_id"):
                metadata['external_system_numbers'] = [{
                    'value': 'oai:arXiv.org:' + form_fields['arxiv_id'],
                    'institute': 'arXiv'
                }]
    if "publication_info" in metadata:
        metadata['collections'].append({'primary': "Published"})
    # ============================
    # Title source
    # ============================
    if 'title_source' in form_fields and form_fields['title_source']:
        metadata['titles'][0]['source'] = form_fields['title_source']
    # ============================
    # Title from arXiv
    # ============================
    if 'title_arXiv' in form_fields and form_fields['title_arXiv']:
        # ``for ... else``: the else clause only runs when no existing
        # title matched, i.e. the arXiv title is not yet present.
        for title in metadata['titles']:
            if title['title'] == form_fields['title_arXiv']:
                break
        else:
            metadata['titles'].append({
                'title': form_fields['title_arXiv'],
                'source': 'arXiv'
            })
    # ============================
    # Conference name
    # ============================
    if 'conf_name' in form_fields:
        if 'nonpublic_note' in form_fields:
            metadata.setdefault("hidden_notes", []).append({
                "value": form_fields['conf_name']
            })
            metadata['hidden_notes'].append({
                'value': form_fields['nonpublic_note']
            })
        else:
            metadata.setdefault("hidden_notes", []).append({
                "value": form_fields['conf_name']
            })
        metadata['collections'].extend([{'primary': "ConferencePaper"}])
    # ============================
    # Page range
    # ============================
    if 'page_nr' not in metadata:
        if metadata.get("publication_info", {}).get("page_artid"):
            pages = metadata['publication_info']['page_artid'].split('-')
            if len(pages) == 2:
                try:
                    metadata['page_nr'] = int(pages[1]) - int(pages[0]) + 1
                except ValueError:
                    # Non-numeric page/artid values: leave page_nr unset.
                    pass
    # ============================
    # Language
    # ============================
    if metadata.get("language") == "oth":
        if form_fields.get("other_language"):
            metadata["language"] = form_fields["other_language"]
    # ===============================
    # arXiv category in report number
    # ===============================
    if metadata.get("_categories"):
        del metadata["_categories"]
    # ============================
    # Date of defense
    # ============================
    if form_fields.get('defense_date'):
        defense_note = {
            'value': 'Presented on ' + form_fields['defense_date']
        }
        metadata.setdefault("public_notes", []).append(defense_note)
    # ==========
    # Owner Info
    # ==========
    userid = deposition.user_id
    user = UserInfo(userid)
    email = user.info.get('email', '')
    try:
        # Prefer the submitter's ORCID identifier as the source, if linked.
        source = UserEXT.query.filter_by(id_user=userid, method='orcid').one()
    except NoResultFound:
        source = ''
    if source:
        source = source.method + ':' + source.id
    metadata['acquisition_source'] = dict(
        source=source,
        email=email,
        method="submission",
        submission_number=deposition.id,
    )
    # ==============
    # References
    # ==============
    if form_fields.get('references'):
        metadata['references'] = form_fields.get('references')
    # ==============
    # Extra comments
    # ==============
    if form_fields.get('extra_comments'):
        metadata.setdefault('hidden_notes', []).append(
            {
                'value': form_fields['extra_comments'],
                'source': 'submitter'
            }
        )
    # ======================================
    # Journal name Knowledge Base conversion
    # ======================================
    if metadata.get("publication_info", {}).get("journal_title"):
        # Map the submitted journal title to its canonical KB value,
        # falling back to the submitted title when there is no match.
        journals_kb = dict([(x['key'].lower(), x['value'])
                            for x in get_kb_mappings(
                                cfg.get("DEPOSIT_INSPIRE_JOURNALS_KB"))])
        metadata['publication_info']['journal_title'] = journals_kb.get(
            metadata['publication_info']['journal_title'].lower(),
            metadata['publication_info']['journal_title'])
    # The record schema expects publication_info to be a list.
    if metadata.get("publication_info") and not isinstance(
            metadata['publication_info'], list):
        metadata["publication_info"] = [metadata['publication_info']]
def _convert_files(obj, eng):
    """Convert extracted World Scientific files to MARC within a date window.

    Harvest arguments are read from ``obj.extra_data['args']``: ``to_date``
    defaults to today, and ``from_date`` defaults either to ``1900-01-01``
    (full reharvest) or to ``weeks_threshold`` weeks in the past.  Converted
    records are written under the target folder and the resulting paths are
    stored in ``obj.data['insert']``.

    :param obj: workflow object carrying ``data``, ``extra_data`` and ``log``.
    :param eng: workflow engine (unused here; required by the task signature).
    """
    from invenio_knowledge.api import get_kb_mappings
    # Journal-name mapping (key -> value) used by the converter.
    mappings = dict(
        map(
            lambda item: (item['key'], item['value']),
            get_kb_mappings('JOURNALS')
        )
    )
    ws = WorldScientific(mappings)
    target_folder_full = get_storage_path(suffix=target_folder)
    args = obj.extra_data['args']

    # By default, we set the to date as today.
    to_date = args.get("to_date") or datetime.now().strftime('%Y-%m-%d')

    from_date = args.get("from_date")
    if not from_date:
        if args.get("reharvest"):
            # Since "beginning" of time when not specified.  Kept as a plain
            # ISO string so it orders consistently with ``to_date`` and the
            # per-file dates below; the previous ``datetime.strptime(...)``
            # produced a datetime object that cannot be compared to the
            # string dates used in the filter.
            from_date = "1900-01-01"
        else:
            # Dynamic date in the past when not specified and not reharvest.
            # Fix: parenthesize the subtraction so ``strftime`` is applied
            # to the resulting datetime; the original bound ``.strftime``
            # to the timedelta, which has no such method (AttributeError).
            from_date = (
                datetime.now() - timedelta(weeks=weeks_threshold)
            ).strftime('%Y-%m-%d')

    # Persist the effective window back into the workflow arguments.
    obj.extra_data['args']["to_date"] = to_date
    obj.extra_data['args']["from_date"] = from_date

    insert_files = []
    if args.get("reharvest"):
        filenames = obj.data['all_extracted_files']
    else:
        filenames = obj.data['newly_extracted_files']
    for filename in filenames:
        date = ws.get_date(filename)
        # Undated files are kept; dated files must fall inside the window
        # (ISO date strings compare correctly lexicographically).
        if date is None or (from_date <= date <= to_date):
            marc = ws.get_record(filename)
            if marc:
                # Rebase the file name into the target folder and write out.
                filename = basename(filename)
                filename = join(target_folder_full, filename)
                insert_files.append(filename)
                with open(filename, 'w') as outfile:
                    outfile.write(marc)
        else:
            obj.log.info("Filtered out {0} ({1})".format(filename, date))

    obj.log.info("Converted {0}/{1} articles between {2} to {3}".format(
        len(insert_files), len(filenames), from_date, to_date
    ))
    obj.data['insert'] = insert_files
    obj.data["result_path"] = target_folder_full
    obj.log.debug("Saved converted files to {0}".format(target_folder_full))
    obj.log.debug("{0} files to add".format(
        len(obj.data["insert"]),
    ))
def get_value(kb_name, list_of_keys):
    """Get the value registered with at least one of the keys.

    Returns ``None`` implicitly when no key is present in the KB.
    """
    for candidate in list_of_keys:
        if not kb_mapping_exists(kb_name, candidate):
            continue
        matches = get_kb_mappings(kb_name=kb_name, key=candidate)
        return matches[0].get("value")
def inner(dummy_form, dummy_field, term, limit=50):
    """Look up *term* in the closed-over KB and optionally map each hit."""
    from invenio_knowledge.api import get_kb_mappings
    matches = get_kb_mappings(name, '', term, limit=limit)[:limit]
    if mapper is None:
        return matches
    return map(mapper, matches)
def process_sip_metadata(cls, deposition, metadata):
    """Convert submitted form fields into the final SIP record metadata.

    ``metadata`` arrives holding the raw form fields; it is converted in
    place through the ``literature`` dojson model and then enriched with
    computed fields (collections, cleaned titles, notes, acquisition
    source, ...).

    :param deposition: deposition object providing ``user_id`` and ``id``.
    :param metadata: dict of submitted form fields; mutated in place.
    """
    from ..dojson.model import literature
    # Keep a copy of the raw form input: several enrichments below still
    # need the original form values after ``metadata`` has been converted.
    form_fields = copy.deepcopy(metadata)
    filter_empty_elements(metadata)
    converted = literature.do(metadata)
    metadata.clear()
    metadata.update(converted)
    # Add extra fields that need to be computed or depend on other
    # fields.
    #
    # ============================
    # Collection
    # ============================
    metadata['collections'] = [{'primary': "HEP"}]
    if form_fields['type_of_doc'] == 'thesis':
        metadata['collections'].append({'primary': "THESIS"})
    if "subject_terms" in metadata:
        # Check if it was imported from arXiv
        if any([x["scheme"] == "arXiv" for x in metadata["subject_terms"]]):
            metadata['collections'].extend([{'primary': "arXiv"},
                                            {'primary': "Citeable"}])
            # Add arXiv as source
            if metadata.get("abstracts"):
                metadata['abstracts'][0]['source'] = 'arXiv'
            if form_fields.get("arxiv_id"):
                metadata['external_system_numbers'] = [{
                    'value': 'oai:arXiv.org:' + form_fields['arxiv_id'],
                    'institute': 'arXiv'
                }]
    if "publication_info" in metadata:
        # A record only counts as published when the full set of journal
        # reference fields is present.
        if all([key in metadata['publication_info'].keys()
                for key in ('year', 'journal_issue',
                            'journal_volume', 'page_artid')]):
            # NOTE: Only peer reviewed journals should have this collection
            # we are adding it here but ideally should be manually added
            # by a curator.
            metadata['collections'].append({'primary': "Published"})
            # Add Citeable collection if not present
            collections = [x['primary'] for x in metadata['collections']]
            if "Citeable" not in collections:
                metadata['collections'].append({'primary': "Citeable"})
    # ============================
    # Title source and cleanup
    # ============================
    try:
        # Clean up all extra spaces in title
        metadata['titles'][0]['title'] = " ".join(
            metadata['titles'][0]['title'].split()
        )
        title = metadata['titles'][0]['title']
    except (KeyError, IndexError):
        title = ""
    if form_fields.get('title_arXiv'):
        title_arxiv = " ".join(form_fields.get('title_arXiv').split())
        if title == title_arxiv:
            metadata['titles'][0]["source"] = "arXiv"
        else:
            metadata['titles'].append({
                'title': title_arxiv,
                'source': "arXiv"
            })
    if form_fields.get('title_crossref'):
        title_crossref = " ".join(
            form_fields.get('title_crossref').split()
        )
        if title == title_crossref:
            metadata['titles'][0]["source"] = "CrossRef"
        else:
            metadata['titles'].append({
                'title': title_crossref,
                'source': "CrossRef"
            })
    try:
        # NOTE(review): only KeyError is caught here — an empty
        # ``metadata['titles']`` list would raise IndexError; confirm
        # whether titles is guaranteed non-empty at this point.
        metadata['titles'][0]['source']
    except KeyError:
        # Title has no source, so should be the submitter
        metadata['titles'][0]['source'] = "submitter"
    # ============================
    # Conference name
    # ============================
    if 'conf_name' in form_fields:
        if 'nonpublic_note' in form_fields:
            metadata.setdefault("hidden_notes", []).append({
                "value": form_fields['conf_name']
            })
            metadata['hidden_notes'].append({
                'value': form_fields['nonpublic_note']
            })
        else:
            metadata.setdefault("hidden_notes", []).append({
                "value": form_fields['conf_name']
            })
        metadata['collections'].extend([{'primary': "ConferencePaper"}])
    # ============================
    # Page range
    # ============================
    if 'page_nr' not in metadata:
        if metadata.get("publication_info", {}).get("page_artid"):
            pages = metadata['publication_info']['page_artid'].split('-')
            if len(pages) == 2:
                try:
                    metadata['page_nr'] = int(pages[1]) - int(pages[0]) + 1
                except ValueError:
                    # Non-numeric page/artid values: leave page_nr unset.
                    pass
    # ============================
    # Language
    # ============================
    if metadata.get("languages", []) and metadata["languages"][0] == "oth":
        if form_fields.get("other_language"):
            metadata["languages"] = [form_fields["other_language"]]
    # ===============================
    # arXiv category in report number
    # ===============================
    if metadata.get("_categories"):
        del metadata["_categories"]
    # ============================
    # Date of defense
    # ============================
    if form_fields.get('defense_date'):
        defense_note = {
            'value': 'Presented on ' + form_fields['defense_date']
        }
        metadata.setdefault("public_notes", []).append(defense_note)
    # ==========
    # Owner Info
    # ==========
    userid = deposition.user_id
    user = UserInfo(userid)
    email = user.info.get('email', '')
    try:
        # Prefer the submitter's ORCID identifier as the source, if linked.
        source = UserEXT.query.filter_by(id_user=userid, method='orcid').one()
    except NoResultFound:
        source = ''
    if source:
        source = source.method + ':' + source.id
    metadata['acquisition_source'] = dict(
        source=source,
        email=email,
        method="submission",
        submission_number=deposition.id,
    )
    # ==============
    # References
    # ==============
    if form_fields.get('references'):
        metadata['references'] = form_fields.get('references')
    # ==============
    # Extra comments
    # ==============
    if form_fields.get('extra_comments'):
        metadata.setdefault('hidden_notes', []).append(
            {
                'value': form_fields['extra_comments'],
                'source': 'submitter'
            }
        )
        metadata["extra_comments"] = form_fields.get("extra_comments")
    # ======================================
    # Journal name Knowledge Base conversion
    # ======================================
    if metadata.get("publication_info", {}).get("journal_title"):
        # Map the submitted journal title to its canonical KB value,
        # falling back to the submitted title when there is no match.
        journals_kb = dict([(x['key'].lower(), x['value'])
                            for x in get_kb_mappings(
                                cfg.get("DEPOSIT_INSPIRE_JOURNALS_KB"))])
        metadata['publication_info']['journal_title'] = journals_kb.get(
            metadata['publication_info']['journal_title'].lower(),
            metadata['publication_info']['journal_title'])
    # The record schema expects publication_info to be a list.
    if metadata.get("publication_info") and not isinstance(
            metadata['publication_info'], list):
        metadata["publication_info"] = [metadata['publication_info']]