def iterIsoRecords(iso_file_name, isis_json_type):
    """Yield one dictionary per record found in an ISO-2709 file.

    Each yielded dictionary maps a field tag (as a string, without leading
    zeroes) to the list of occurrences of that field in the record.

    :param iso_file_name: name of the .iso file handed to ``IsoFile``.
    :param isis_json_type: selects how each occurrence is stored:
        1 -> the decoded field content,
        2 -> ``expand(content)``,
        3 -> ``dict(expand(content))``.
    :raises NotImplementedError: when a field is processed and
        ``isis_json_type`` is not 1, 2 or 3.

    The file is closed when iteration finishes, when the consumer stops
    early (generator closed or garbage collected) and when an error is
    raised while reading.
    """
    from iso2709 import IsoFile
    from subfield import expand

    iso = IsoFile(iso_file_name)
    try:
        for record in iso:
            fields = {}
            for field in record.directory:
                field_key = str(int(field.tag))  # remove leading zeroes
                field_occurrences = fields.setdefault(field_key, [])
                # Undecodable bytes are replaced instead of aborting the run.
                content = field.value.decode(INPUT_ENCODING, "replace")
                if isis_json_type == 1:
                    field_occurrences.append(content)
                elif isis_json_type == 2:
                    field_occurrences.append(expand(content))
                elif isis_json_type == 3:
                    field_occurrences.append(dict(expand(content)))
                else:
                    raise NotImplementedError(
                        "ISIS-JSON type %s conversion not yet implemented for .iso input"
                        % isis_json_type
                    )
            yield fields
    finally:
        # Runs on normal exhaustion, on errors and on generator close().
        iso.close()
def load_journals(self, json_file):
    """
    Function: load_journals

    Build the journals dictionary (``self._journals``) with the attributes
    needed when issues are created. Some data is transferred from the
    journal database to the issue database, and this dictionary is what
    is used to carry it.

    Field loaded into the dictionary:
    540 - use license (stored only when field 541, the license code, is
    present; otherwise ``use_license`` is set to ``False``).

    Every record is indexed by field 400 and, when present, by field 935.
    Field 935 is optional: it is only read after checking that the record
    has it.
    """
    # regex used to find the URL inside the disclaimer text
    href_pattern = re.compile('href=\\"[^ \t\n\r\f\v]*\\"')

    for record in json_file:
        journal_keys = [record['400'][0]]
        if '935' in record:
            # When equal to field 400 this addresses (and overwrites) the
            # same entry.
            journal_keys.append(record['935'][0])

        for key in journal_keys:
            self._journals[key] = {}

        if '541' not in record:
            for key in journal_keys:
                self._journals[key]['use_license'] = False
            continue

        f540 = subfield.CompositeField(subfield.expand(record['540'][0]))
        disclaimer = f540['t']
        url_match = href_pattern.search(disclaimer)
        # Strip the leading 'href="' (6 characters) and the closing quote.
        reference_url = url_match.group()[6:-1] if url_match else ''

        for key in journal_keys:
            # A fresh dictionary per key, so entries never share state.
            self._journals[key]['use_license'] = {
                'license_code': record['541'][0],
                'reference_url': reference_url,
                'disclaimer': disclaimer,
            }
def load_historic(self, journal, historicals):
    """Attach one ``JournalHist`` entry to ``journal`` per history event.

    Each item of ``historicals`` is expanded into subfields; subfields
    ``a``/``b`` and ``c``/``d`` are read as (date, status) pairs. Dates go
    through ``self.iso_format`` and are used as keys, so a repeated date
    keeps only the last status seen. Entries are added in ascending date
    order.

    :returns: ``True`` when every entry was added; ``False`` as soon as a
        subfield is missing (``history_error_field`` is counted) or a
        ``ValidationError`` is raised while adding an entry
        (``history_error_data`` is counted).
    """
    lifecycles = {}

    for i in historicals:
        parsed_subfields = subfield.CompositeField(subfield.expand(i))
        try:
            lifecycles[self.iso_format(parsed_subfields['a'])] = parsed_subfields['b']
            lifecycles[self.iso_format(parsed_subfields['c'])] = parsed_subfields['d']
        except KeyError:
            self.charge_summary("history_error_field")
            return False

    for cyclekey, cyclevalue in sorted(lifecycles.items()):
        try:
            journalhist = JournalHist()
            journalhist.date = cyclekey
            journalhist.status = cyclevalue
            journal.journalhist_set.add(journalhist)
            self.charge_summary("life_cycle")
        except exceptions.ValidationError:
            self.charge_summary("history_error_data")
            return False

    return True
def load_historic(self, journal, historicals):
    """Create one ``JournalPublicationEvents`` row per history event.

    Subfields ``a``/``b`` and ``c``/``d`` of every item in ``historicals``
    are read as (date, status) pairs. A missing subfield only increments
    the ``history_error_field`` counter; processing goes on. Events are
    saved in ascending date order, with the status translated through
    ``self.trans_pub_status`` (falling back to ``'inprogress'``).

    :returns: ``True`` on success, ``False`` as soon as a
        ``ValidationError`` is raised while saving an event.
    """
    events_by_date = {}

    for raw_entry in historicals:
        parsed = dict(subfield.expand(raw_entry))
        for date_key, status_key in (('a', 'b'), ('c', 'd')):
            try:
                events_by_date[self.iso_format(parsed[date_key])] = parsed[status_key]
            except KeyError:
                self.charge_summary("history_error_field")

    for event_date, raw_status in sorted(events_by_date.items()):
        try:
            event = JournalPublicationEvents()
            event.created_at = event_date
            event.status = self.trans_pub_status.get(raw_status.lower(), 'inprogress')
            event.journal = journal
            event.changed_by_id = 1
            event.save()
            # Saving gives created_at an automatic value, so the real date
            # is assigned again and the row is saved a second time.
            event.created_at = event_date
            event.save()
            self.charge_summary("publication_events")
        except exceptions.ValidationError:
            self.charge_summary("publications_events_error_data")
            return False

    return True
def load_section(self, record, collection):
    """Create the sections (and their translated titles) of a journal.

    The journal is looked up with ``self.load_journal`` using field 35 and
    ``collection``. Occurrences of field 49 are grouped as
    ``{section code: {language: title}}``; for a repeated (code, language)
    pair the first title wins. One ``Section`` is saved per code, with the
    title taken from 'pt', then 'en', then 'es' (empty when none exists),
    plus one ``SectionTitle`` per language.

    :returns: the last ``Section`` created, or ``""`` when none was.
    """
    section = ""
    titles_by_code = {}
    journal = self.load_journal(record['35'][0], collection=collection)

    if '49' not in record:
        print(u"Periódico %s não tem seções definidas" % record['35'][0])
        self.charge_summary('journals_without_sections')
    else:
        for occurrence in record['49']:
            # Build the organised dictionary of sections
            parsed = subfield.CompositeField(subfield.expand(occurrence))
            titles = titles_by_code.setdefault(parsed['c'], {})  # new section
            if parsed['l'] not in titles:
                titles[parsed['l']] = parsed['t']

    for code, titles in titles_by_code.items():
        if journal is None:
            print('Invalid Journal: {}'.format(record['35'][0]))
            continue

        section = Section()
        section.title = ''
        for preferred in ('pt', 'en', 'es'):
            if preferred in titles:
                section.title = titles[preferred]
                break
        section.legacy_code = code
        section.journal = journal
        section.creation_date = datetime.now()
        section.save(force_insert=True)
        self.charge_summary('sections')

        lang_dict = LANG_DICT
        for iso_code, title in titles.items():
            try:
                language = Language.objects.get(iso_code=iso_code)
            except Language.DoesNotExist:
                language = Language.objects.create(
                    iso_code=iso_code,
                    name=lang_dict.get(iso_code, '###NOT FOUND###'))
            SectionTitle(section=section, title=title, language=language).save()
            self.charge_summary('translations')

    return section
def load_journals(self, json_file):
    """
    Function: load_journals

    Create the dictionary of journals (``self._journals``) with the
    attributes required to create issues. Some records are transferred
    from the journal databases to the issue database, and in those cases
    this dictionary is used to load the data.

    Field loaded into the dictionary:
    540 - use license (only when field 541, the license code, is present;
    otherwise ``use_license`` is ``False``).

    Entries are keyed by field 400 and, if the record has it, by field
    935. Field 935 is never read without first checking it exists.
    """
    # regex used to find the URL inside the disclaimer text
    href_pattern = re.compile('href=\\"[^ \t\n\r\f\v]*\\"')

    for record in json_file:
        has_935 = '935' in record

        self._journals[record['400'][0]] = {}
        if has_935:
            # If equal to field 400 this overwrites the same entry.
            self._journals[record['935'][0]] = {}

        if '541' in record:
            f540 = subfield.CompositeField(subfield.expand(record['540'][0]))
            url_match = href_pattern.search(f540['t'])
            if url_match:
                # Drop the leading 'href="' and the trailing quote.
                reference_url = url_match.group()[6:-1]
            else:
                reference_url = ''

            def build_license():
                # One independent dictionary per journal key.
                return {
                    'license_code': record['541'][0],
                    'reference_url': reference_url,
                    'disclaimer': f540['t'],
                }

            self._journals[record['400'][0]]['use_license'] = build_license()
            if has_935:
                self._journals[record['935'][0]]['use_license'] = build_license()
        else:
            self._journals[record['400'][0]]['use_license'] = False
            if has_935:
                self._journals[record['935'][0]]['use_license'] = False
def load_mission(self, journal, missions):
    """Add one ``JournalMission`` to ``journal`` per item of ``missions``.

    Every item is expanded into subfields: subfield ``l`` becomes the
    mission language and subfield ``_`` its description. The ``mission``
    summary counter is incremented once per mission added.
    """
    for raw_mission in missions:
        fields = subfield.CompositeField(subfield.expand(raw_mission))

        entry = JournalMission()
        entry.language = fields['l']
        entry.description = fields['_']

        journal.journalmission_set.add(entry)
        self.charge_summary("mission")
def load_section(self, record):
    """Create the sections (and their translated titles) of a journal.

    The journal is obtained from ``self.load_journal`` using field 35.
    Occurrences of field 49 are grouped as
    ``{section code: {language: title}}`` (first title wins for a repeated
    code/language pair). One ``Section`` is saved per code, titled with
    the 'pt', 'en' or 'es' text in that order of preference (empty when
    none exists), followed by one ``SectionTitle`` per language.

    :returns: the last ``Section`` created, or ``""`` when none was.
    """
    section = ""
    section_by_language = {}
    journal = self.load_journal(record['35'][0])

    if '49' in record:
        for sec in record['49']:
            # Build the organised dictionary of sections
            parsed_subfields = subfield.CompositeField(subfield.expand(sec))
            # Create the section entry on first sight of its code
            titles = section_by_language.setdefault(parsed_subfields['c'], {})
            if parsed_subfields['l'] not in titles:
                titles[parsed_subfields['l']] = parsed_subfields['t']
    else:
        print(u"Periódico " + record['35'][0] + u" não tem seções definidas")
        self.charge_summary('journals_without_sections')

    for sec_key, sec in section_by_language.items():
        if journal is None:
            print('Invalid Journal: {}'.format(record['35'][0]))
            continue

        section = Section()
        section.title = ''
        for lang in ('pt', 'en', 'es'):
            if lang in sec:
                section.title = sec[lang]
                break
        section.code = sec_key
        section.journal = journal
        section.creation_date = datetime.now()
        section.save(force_insert=True)
        self.charge_summary('sections')

        lang_dict = LANG_DICT
        for trans_key, trans in sec.items():
            try:
                language = Language.objects.get(iso_code=trans_key)
            except Language.DoesNotExist:
                language = Language.objects.create(
                    iso_code=trans_key,
                    name=lang_dict.get(trans_key, '###NOT FOUND###'))
            section_title = SectionTitle(section=section, title=trans, language=language)
            section_title.save()
            self.charge_summary('translations')

    return section
def load_use_license(self, code, disclaimer):
    """Return the ``UseLicense`` for ``code``, creating it when needed.

    ``disclaimer`` is expanded into subfields; when subfield ``t`` is
    present its value is stored as the license disclaimer. The license is
    saved before being returned.

    :param code: value used as ``license_code`` in the lookup/creation.
    :param disclaimer: raw field content passed to ``subfield.expand``.
    :returns: the saved ``UseLicense`` instance.
    """
    parsed_subfields_disclaimer = dict(subfield.expand(disclaimer))

    # get_or_create returns (object, created); only the object is needed.
    use_license = UseLicense.objects.get_or_create(license_code=code)[0]

    if 't' in parsed_subfields_disclaimer:
        use_license.disclaimer = parsed_subfields_disclaimer['t']

    use_license.save()
    return use_license
def iterIsoRecords(iso_file_name, isis_json_type):
    """Iterate over the records of an ISO-2709 file as dictionaries.

    Every yielded dictionary maps a field tag (string form of the integer
    tag, so without leading zeroes) to a list with one entry per
    occurrence of that field.

    :param iso_file_name: name of the .iso file handed to ``IsoFile``.
    :param isis_json_type: representation used for each occurrence:
        1 -> the decoded content,
        2 -> ``expand(content)``,
        3 -> ``dict(expand(content))``.
    :raises NotImplementedError: when a field is processed and
        ``isis_json_type`` is none of the values above.

    The underlying file is always closed: after the last record, when the
    generator is closed early, and when reading fails.
    """
    from iso2709 import IsoFile
    from subfield import expand

    iso = IsoFile(iso_file_name)
    try:
        for record in iso:
            fields = {}
            for field in record.directory:
                field_key = str(int(field.tag))  # remove leading zeroes
                field_occurrences = fields.setdefault(field_key, [])
                # Bytes that cannot be decoded are replaced, not fatal.
                content = field.value.decode(INPUT_ENCODING, 'replace')
                if isis_json_type == 1:
                    field_occurrences.append(content)
                elif isis_json_type == 2:
                    field_occurrences.append(expand(content))
                elif isis_json_type == 3:
                    field_occurrences.append(dict(expand(content)))
                else:
                    raise NotImplementedError(
                        'ISIS-JSON type %s conversion not yet implemented for .iso input'
                        % isis_json_type)
            yield fields
    finally:
        # Reached on exhaustion, on errors and on generator close().
        iso.close()
def load_section(self, record):
    """Create the sections of a journal and register their translations.

    The journal is obtained from ``self.load_journal`` using field 35.
    Occurrences of field 49 are grouped as
    ``{section code: {language: title}}`` (first title wins for a repeated
    code/language pair). One ``Section`` is saved per code, titled with
    the 'pt', 'en' or 'es' text in that order of preference (empty when
    none exists). For each language of a section one ``TranslatedData``
    row is saved and linked through ``section.title_translations``.

    :returns: the last ``Section`` created, or ``""`` when none was.
    """
    section = ""
    section_by_language = {}
    journal = self.load_journal(record['35'][0])

    if '49' in record:
        for sec in record['49']:
            # Build the organised dictionary of sections
            parsed_subfields = subfield.CompositeField(
                subfield.expand(sec))
            # Create the section entry on first sight of its code
            titles = section_by_language.setdefault(parsed_subfields['c'], {})
            if parsed_subfields['l'] not in titles:
                titles[parsed_subfields['l']] = parsed_subfields['t']
    else:
        print(u"Periódico " + record['35'][0] + u" não tem seções definidas")
        self.charge_summary('journals_without_sections')

    for sec_key, sec in section_by_language.items():
        if journal is None:
            print('Invalid Journal: {}'.format(record['35'][0]))
            continue

        section = Section()
        section.title = ''
        for lang in ('pt', 'en', 'es'):
            if lang in sec:
                section.title = sec[lang]
                break
        section.code = sec_key
        section.journal = journal
        section.creation_date = datetime.now()
        section.save(force_insert=True)
        self.charge_summary('sections')

        # NOTE(review): only the language code is stored here; the
        # translated title text itself is not written to TranslatedData.
        # Confirm this is intended.
        for trans_key in sec:
            translation = TranslatedData()
            translation.language = trans_key
            translation.field = 'code'
            translation.model = 'section'
            translation.save(force_insert=True)
            section.title_translations.add(translation)
            self.charge_summary('translations')

    return section
def load_historic(self, collection, journal, user, historicals):
    """Record the publication timeline of ``journal`` in ``collection``.

    Subfields ``a``/``b`` and ``c``/``d`` of every item in ``historicals``
    are read as (date, status) pairs; a missing subfield only increments
    the ``history_error_field`` counter. For each date, in ascending
    order, a ``JournalTimeline`` and a ``Membership`` are fetched or
    created with the status translated through ``self.trans_pub_status``
    (default ``'inprogress'``) and ``user`` as ``created_by`` for new rows.

    Failures while writing either row are counted under
    ``timeline_invalid_date`` and do not stop the loop.

    :returns: always ``True``.
    """
    lifecycles = {}

    for i in historicals:
        parsed_subfields = dict(subfield.expand(i))
        for date_key, status_key in (('a', 'b'), ('c', 'd')):
            try:
                lifecycles[self.iso_format(parsed_subfields[date_key])] = parsed_subfields[status_key]
            except KeyError:
                self.charge_summary("history_error_field")

    for cycledate, cyclestatus in sorted(lifecycles.items()):
        defaults = {
            'created_by': user,
        }
        status = self.trans_pub_status.get(
            cyclestatus.lower(), 'inprogress'
        )

        try:
            JournalTimeline.objects.get_or_create(
                journal=journal,
                collection=collection,
                since=cycledate,
                status=status,
                defaults=defaults)
            self.charge_summary("timeline")
        except exceptions.ValidationError:
            self.charge_summary("timeline_invalid_date")

        try:
            Membership.objects.get_or_create(
                journal=journal,
                collection=collection,
                since=cycledate,
                status=status,
                defaults=defaults
            )
        except Exception:
            # Best effort, but KeyboardInterrupt/SystemExit must still be
            # able to stop the import.
            self.charge_summary("timeline_invalid_date")

    return True
def load_mission(self, journal, missions):
    """Add one ``JournalMission`` to ``journal`` per item of ``missions``.

    Every item is expanded into subfields: subfield ``_`` is the mission
    description and subfield ``l`` the ISO code of its language. The
    ``Language`` is looked up by ISO code and created when missing, named
    after ``LANG_DICT`` (``'###NOT FOUND###'`` for unknown codes).

    Resolving the language is best effort: if it fails the mission is
    still added, just without a language.
    """
    from sectionimport import LANG_DICT as lang_dict

    for i in missions:
        parsed_subfields = subfield.CompositeField(subfield.expand(i))
        mission = JournalMission()

        try:
            iso_code = parsed_subfields['l']
            # The name only matters when creating the row, so it goes in
            # ``defaults`` and is not part of the lookup.
            mission.language = Language.objects.get_or_create(
                iso_code=iso_code,
                defaults={'name': lang_dict.get(iso_code, '###NOT FOUND###')}
            )[0]
        except Exception:
            # Deliberately tolerant, but no longer swallows
            # KeyboardInterrupt/SystemExit as the bare ``except`` did.
            pass

        mission.description = parsed_subfields['_']
        journal.missions.add(mission)
        self.charge_summary("mission")
def get_last_status(self, historicals):
    """Return the status attached to the latest date in ``historicals``.

    Subfields ``a``/``b`` and ``c``/``d`` of every item are read as
    (date, status) pairs, with dates converted by ``self.iso_format``.
    A missing subfield increments the ``history_error_field`` counter and
    is otherwise ignored.

    :raises IndexError: when no (date, status) pair could be collected.
    """
    status_by_date = {}

    for raw_entry in historicals:
        parsed = dict(subfield.expand(raw_entry))
        for date_key, status_key in (('a', 'b'), ('c', 'd')):
            try:
                status_by_date[self.iso_format(parsed[date_key])] = parsed[status_key]
            except KeyError:
                self.charge_summary("history_error_field")

    ordered = sorted(status_by_date.items())
    latest = ordered[-1]
    return latest[1]
def load_section(self, record):
    """Create the sections of a journal and register their translations.

    Field 35 identifies the journal (resolved by ``self.load_journal``).
    Field 49 occurrences are grouped by section code (subfield ``c``) and
    language (subfield ``l``), keeping the first title (subfield ``t``)
    seen for each pair. A ``Section`` is saved for every code, using the
    'pt' title when available, then 'en', then 'es', else an empty title.
    One ``TranslatedData`` row per language is saved and attached to
    ``section.title_translations``.

    :returns: the last ``Section`` created, or ``""`` when none was.
    """
    section = ""
    section_by_language = {}
    journal = self.load_journal(record['35'][0])

    if '49' not in record:
        print(u"Periódico " + record['35'][0] + u" não tem seções definidas")
        self.charge_summary('journals_without_sections')
    else:
        for sec in record['49']:
            # Build the organised dictionary of sections
            parsed_subfields = subfield.CompositeField(subfield.expand(sec))
            if parsed_subfields['c'] not in section_by_language:
                section_by_language[parsed_subfields['c']] = {}  # new section
            if parsed_subfields['l'] not in section_by_language[parsed_subfields['c']]:
                section_by_language[parsed_subfields['c']][parsed_subfields['l']] = parsed_subfields['t']

    for sec_key, sec in section_by_language.items():
        if journal is None:
            print('Invalid Journal: {}'.format(record['35'][0]))
            continue

        section = Section()
        if 'pt' in sec:
            section.title = sec['pt']
        elif 'en' in sec:
            section.title = sec['en']
        elif 'es' in sec:
            section.title = sec['es']
        else:
            section.title = ''
        section.code = sec_key
        section.journal = journal
        section.creation_date = datetime.now()
        section.save(force_insert=True)
        self.charge_summary('sections')

        # NOTE(review): the translated title text is not stored in
        # TranslatedData, only the language code. Confirm this is intended.
        for trans_key in sec:
            translation = TranslatedData()
            translation.language = trans_key
            translation.field = 'code'
            translation.model = 'section'
            translation.save(force_insert=True)
            section.title_translations.add(translation)
            self.charge_summary('translations')

    return section
def load_sections(self, issue, record):
    """Link ``issue`` to the sections listed in field 49 of ``record``.

    Each occurrence of field 49 is expanded into subfields; when its
    section code (subfield ``c``) is known to ``self._sections`` the
    corresponding ``Section`` is fetched by id and added to
    ``issue.section``. A section id that no longer exists in the database
    is reported and skipped.

    :returns: ``issue_sections``, a list that this method never fills,
        so it is always empty.
    """
    issue_sections = []

    if '49' not in record:
        return issue_sections

    for code in record['49']:
        parsed_subfields = dict(subfield.expand(code))
        if parsed_subfields['c'] not in self._sections:
            continue

        section_id = self._sections[parsed_subfields['c']]
        try:
            # The lookup is the call that can raise ObjectDoesNotExist, so
            # it has to live inside the try block.
            section = Section.objects.get(id=section_id)
            issue.section.add(section)
        except ObjectDoesNotExist:
            print("Inconsistência nos dados carregando seção")

    return issue_sections
def pega_dados(self, dados, campos, datas):
    """Extract and rename subfields from ``dados``, converting dates.

    :param dados: raw field content, passed to ``expand``.
    :param campos: mapping of subfield key -> output field name; subfields
        absent from ``dados`` come out as ``None``.
    :param datas: names of output fields to convert with ``data_br``,
        falling back to ``data_ano`` when that raises ``ValueError``; an
        ``AttributeError`` leaves the value untouched.
    :returns: dictionary keyed by output field name, without the ``'xxx'``
        entry.
    """
    subcampos = dict(expand(dados))

    resultado = {}
    for chave_campo, nome_campo in campos.items():
        resultado[nome_campo] = subcampos.get(chave_campo)

    for campo in datas:
        valor = resultado[campo]
        try:
            resultado[campo] = data_br(valor)
        except ValueError:
            resultado[campo] = data_ano(valor)
        except AttributeError:
            pass

    # Remove empty fields
    resultado.pop('xxx', None)

    # NOTE(review): this tests whether None is a *key* of the dictionary,
    # not whether some value is None - confirm that is the intent.
    if None in resultado:
        self.error = True

    return resultado
def pega_dados(self, dados, campos, datas):
    """Build a dictionary of named fields out of the subfields of ``dados``.

    ``dados`` is expanded into subfields and each subfield key listed in
    ``campos`` is copied under its output name (``None`` when the subfield
    is absent). Fields named in ``datas`` are then converted with
    ``data_br``; if that raises ``ValueError`` ``data_ano`` is used
    instead, and an ``AttributeError`` leaves the value as it is. The
    ``'xxx'`` entry, if any, is dropped before returning.

    :returns: the resulting dictionary.
    """
    origem = dict(expand(dados))
    extraidos = dict(
        (nome_campo, origem[chave_campo] if chave_campo in origem else None)
        for chave_campo, nome_campo in campos.items()
    )

    for campo in datas:
        try:
            convertido = data_br(extraidos[campo])
        except ValueError:
            convertido = data_ano(extraidos[campo])
        except AttributeError:
            continue
        extraidos[campo] = convertido

    # Remove empty fields
    extraidos.pop('xxx', None)

    # NOTE(review): membership is checked against the keys, so this only
    # fires when an output name is None - verify against the callers.
    if None in extraidos:
        self.error = True
    return extraidos
def load_issue(self, record):
    """
    Function: load_issue

    Build an ``Issue`` from ``record``, save it and return it.

    The journal is looked up in ``self._collection`` by field 35, first as
    print ISSN and then as electronic ISSN.

    Returns ``False`` instead of an issue when:
    - no journal matches the ISSN, or
    - the issue number (field 32) is "ahead": in that case the document
      total is copied to the journal (current or previous year counter)
      and no issue is stored.
    """
    issue = Issue()
    issn = record['35'][0]

    try:
        journal = Journal.objects.get(print_issn=issn, collection=self._collection.id)
    except ObjectDoesNotExist:
        try:
            journal = Journal.objects.get(eletronic_issn=issn, collection=self._collection.id)
        except ObjectDoesNotExist:
            print(u"Inconsistência de dados tentando encontrar periódico com ISSN: %s" % issn)
            return False

    if '32' in record:
        issue.number = record['32'][0]

    if '41' in record and record['41'][0] == 'pr':
        issue.is_press_release = True

    # Start month defaults to 0 so it is always defined, including when
    # field 43 is absent but field 65 is present.
    month_start = 0
    if '43' in record:
        subfields_43 = dict(subfield.expand(record['43'][0]))
        if 'm' in subfields_43:
            month_key = subfields_43['m'][:3].lower()
            if month_key in self._monthtoindex:
                month_start = self._monthtoindex[month_key]

    if '122' in record:
        issue.total_documents = record['122'][0]
    else:
        issue.total_documents = 0

    if '65' in record:
        # Field 65 starts with the year (4 chars) followed by the month.
        year = record['65'][0][0:4]
        month_end = record['65'][0][4:6]
        if month_end == '00':
            month_end = '01'
        issue.publication_start_month = month_start
        issue.publication_end_month = month_end
        issue.publication_year = int(year)
    else:
        print(u'Fasciculo %s %s %s não possui data de publicação' % (record['35'][0], record['31'][0], record['32'][0]))
        issue.publication_start_month = 0
        issue.publication_end_month = 0
        issue.publication_year = 0

    current_year = datetime.now().year
    previous_year = current_year - 1

    if issue.number.lower() == 'ahead':
        # "ahead" issues are not stored; only their totals are kept.
        if int(issue.publication_year) == int(current_year):
            journal.current_ahead_documents = issue.total_documents
            print(u"ahead {0} de {1} removido da lista de issues, o total ({2}) de documentos foi transferido para models.journal.current_ahead_documents".format(journal.title, issue.publication_year, issue.total_documents))
        if int(issue.publication_year) == int(previous_year):
            journal.previous_ahead_documents = issue.total_documents
            print(u"ahead {0} de {1} removido da lista de issues, o total ({2}) de documentos foi transferido para models.journal.previous_ahead_documents".format(journal.title, issue.publication_year, issue.total_documents))
        journal.save()
        return False

    issue.journal = journal
    issue.creation_date = datetime.now()

    if '31' in record:
        issue.volume = record['31'][0]
    if '131' in record:
        issue.suppl_volume = record['131'][0]
    if '132' in record:
        issue.suppl_number = record['132'][0]
    if '33' in record:
        issue.title = record['33'][0]
    if '36' in record:
        try:
            # The first four characters are skipped; the rest is the order.
            issue.order = int(str(record['36'][0])[4:])
        except ValueError:
            print(record)
    if '200' in record:
        issue.is_marked_up = int(record['200'][0]) == 1
    if '62' in record:
        issue.publisher_fullname = record['62'][0]
    if '85' in record:
        issue.ctrl_vocabulary = record['85'][0]
    if '117' in record:
        issue.editorial_standard = record['117'][0]

    use_license = self.load_use_license(record['35'][0])
    if use_license:
        issue.use_license = use_license

    try:
        issue.save(force_insert=True)
    except DatabaseError as e:
        print("error({0}), input data: {1}".format(e.message, issue.__dict__))

    if '91' in record:
        # Field 91 carries year and month; day and time are fixed.
        created = u'%s-%s-01T01:01:01' % (record['91'][0][0:4], record['91'][0][4:6])
        issue.created = datetime.strptime(created, "%Y-%m-%dT%H:%M:%S")
        issue.save()

    self.load_sections(issue, record)
    self.charge_summary('issues')

    return issue
def load_issue(self, record):
    """
    Function: load_issue

    Create an ``Issue`` from ``record``, save it in the database and
    return it.

    The journal is searched among those of ``self._collection`` using
    field 35, first as print ISSN and then as electronic ISSN.

    ``False`` is returned instead of an issue when:
    - no journal matches the ISSN, or
    - the issue number (field 32) is "ahead": the document total is then
      copied to the journal (current or previous year counter) and the
      issue itself is not stored.
    """
    issue = Issue()
    issn = record['35'][0]

    try:
        journal = Journal.objects.get(print_issn=issn, collections=self._collection.id)
    except ObjectDoesNotExist:
        try:
            journal = Journal.objects.get(eletronic_issn=issn, collections=self._collection.id)
        except ObjectDoesNotExist:
            print(u"Inconsistência de dados tentando encontrar periódico com ISSN: %s" % issn)
            return False

    if '32' in record:
        issue.number = record['32'][0]

    if '41' in record and record['41'][0] == 'pr':
        issue.is_press_release = True

    # Defined up front so that a record with field 65 but without field 43
    # does not fail on an unbound ``month_start``.
    month_start = 0
    if '43' in record:
        subfields_43 = dict(subfield.expand(record['43'][0]))
        if 'm' in subfields_43:
            month_key = subfields_43['m'][:3].lower()
            if month_key in self._monthtoindex:
                month_start = self._monthtoindex[month_key]

    if '122' in record:
        issue.total_documents = record['122'][0]
    else:
        issue.total_documents = 0

    if '65' in record:
        # Field 65 starts with the year (4 chars) followed by the month.
        year = record['65'][0][0:4]
        month_end = record['65'][0][4:6]
        if month_end == '00':
            month_end = '01'
        issue.publication_start_month = month_start
        issue.publication_end_month = month_end
        issue.publication_year = int(year)
    else:
        print(u'Fasciculo %s %s %s não possui data de publicação' % (
            record['35'][0], record['31'][0], record['32'][0]))
        issue.publication_start_month = 0
        issue.publication_end_month = 0
        issue.publication_year = 0

    current_year = datetime.now().year
    previous_year = current_year - 1

    if issue.number.lower() == 'ahead':
        # "ahead" issues are not stored; only their totals are kept.
        if int(issue.publication_year) == int(current_year):
            journal.current_ahead_documents = issue.total_documents
            print(u"ahead {0} de {1} removido da lista de issues, o total ({2}) de documentos foi transferido para models.journal.current_ahead_documents".format(
                journal.title, issue.publication_year, issue.total_documents))
        if int(issue.publication_year) == int(previous_year):
            journal.previous_ahead_documents = issue.total_documents
            print(u"ahead {0} de {1} removido da lista de issues, o total ({2}) de documentos foi transferido para models.journal.previous_ahead_documents".format(
                journal.title, issue.publication_year, issue.total_documents))
        journal.save()
        return False

    issue.journal = journal
    issue.creation_date = datetime.now()

    if '31' in record:
        issue.volume = record['31'][0]
    if '131' in record:
        issue.suppl_volume = record['131'][0]
    if '132' in record:
        issue.suppl_number = record['132'][0]
    if '33' in record:
        issue.title = record['33'][0]
    if '36' in record:
        try:
            # The first four characters are skipped; the rest is the order.
            issue.order = int(str(record['36'][0])[4:])
        except ValueError:
            print(record)
    if '200' in record:
        issue.is_marked_up = int(record['200'][0]) == 1
    if '62' in record:
        issue.publisher_fullname = record['62'][0]
    if '85' in record:
        issue.ctrl_vocabulary = record['85'][0]
    if '117' in record:
        issue.editorial_standard = record['117'][0]

    use_license = self.load_use_license(record['35'][0])
    if use_license:
        issue.use_license = use_license

    try:
        issue.save(force_insert=True)
    except DatabaseError as e:
        print("error({0}), input data: {1}".format(e.message, issue.__dict__))

    if '91' in record:
        # Field 91 carries year and month; day and time are fixed.
        created = u'%s-%s-01T01:01:01' % (record['91'][0][0:4],
                                          record['91'][0][4:6])
        issue.created = datetime.strptime(created, "%Y-%m-%dT%H:%M:%S")
        issue.save()

    self.load_sections(issue, record)
    self.charge_summary('issues')

    return issue