def parse_video_format(self, record):
    """
    Extract format info from av entity and returns a VideoFormat props.

    Reads the optional ``gauge``, ``aspectRatio``, ``sound`` and ``colour``
    children of ``./efg:avManifestation/efg:format`` and resolves each of
    them against its codelist.

    :param record: XML element to search; it must support
        ``find(path, namespaces)``.
    :return: a dict holding any of the keys ``gauge``, ``aspect_ratio``,
        ``sound`` and ``colour`` (only the ones that resolved), or ``None``
        when the record has no format element.

    Values that cannot be resolved are reported in ``self.warnings`` and
    left out of the result. Missing elements, elements without text and
    the placeholder 'n/a' (any case, surrounding whitespace ignored) are
    silently skipped.
    """
    node = record.find('./efg:avManifestation/efg:format', self.ns)
    if node is None:
        return None

    # (child tag, output key, label used in the warning, codelist lookup).
    # Lookups are wrapped in lambdas so each codelist is only touched when
    # there actually is a value to resolve.
    fields = (
        ('efg:gauge', 'gauge', 'gauge',
         lambda v: codelists.fromCode(v, codelists.GAUGE)),
        ('efg:aspectRatio', 'aspect_ratio', 'aspect ratio',
         lambda v: codelists.fromCode(v, codelists.ASPECT_RATIO)),
        ('efg:sound', 'sound', 'format sound',
         lambda v: codelists.fromDescription(v, codelists.VIDEO_SOUND)),
        ('efg:colour', 'colour', 'format colour',
         lambda v: codelists.fromDescription(v, codelists.COLOUR)),
    )

    video_format = {}
    for tag, key, label, lookup in fields:
        element = node.find(tag, self.ns)
        if element is None or element.text is None:
            continue
        # Strip BEFORE the placeholder test so that ' N/A ' is recognised.
        value = element.text.strip()
        if value.lower() == 'n/a':
            continue
        code_el = lookup(value)
        if code_el is None:
            self.warnings.append('Invalid ' + label + ' for: ' + value)
        else:
            video_format[key] = code_el[0]

    log.debug(video_format)
    return video_format
def parse_descriptions(self, record):
    """
    Extract the descriptions found directly under the record.

    :param record: XML element to search; it must support
        ``findall(path, namespaces)``.
    :return: a list of dicts, one per ``efg:description`` child. Each dict
        always has ``source_ref`` (the ``source`` attribute, possibly
        ``None``) and ``text`` (stripped element text, ``''`` for an empty
        element). ``description_type`` and ``language`` are added only
        when the ``type`` / ``lang`` attributes resolve against their
        codelists.

    Unresolvable ``type`` / ``lang`` attributes are reported in
    ``self.warnings``; the placeholder 'n/a' is ignored. Since October
    2018 descriptions are optional, so an empty list is a valid result.
    """
    descriptions = []
    for node in record.findall("efg:description", self.ns):
        description = {}

        dtype = node.get('type')
        if dtype is not None and dtype.lower() != 'n/a':
            code_el = codelists.fromDescription(
                dtype, codelists.DESCRIPTION_TYPES)
            if code_el is None:
                self.warnings.append(
                    'Invalid description type for: ' + dtype)
            else:
                description['description_type'] = code_el[0]

        lang = node.get('lang')
        if lang is not None and lang.lower() != 'n/a':
            lang_code = codelists.fromCode(lang.lower(), codelists.LANGUAGE)
            if lang_code is None:
                self.warnings.append(
                    'Invalid description language for: ' + lang)
            else:
                description['language'] = lang_code[0]

        description['source_ref'] = node.get('source')
        # An empty element has text None; treat it like whitespace-only
        # text instead of failing with AttributeError.
        description['text'] = (node.text or '').strip()
        log.debug('description: {}'.format(description))
        descriptions.append(description)

    return descriptions
def parse_record_sources(self, record, audio_visual=False):
    """
    Returns a list of sources in the form of:
    [[<recordsource 'dict'>, <provider 'dict'>], etc.]

    :param record: XML element to search for ``./efg:recordSource``.
    :param audio_visual: forwarded unchanged to
        ``self.get_record_source_url``.
    :return: a list of ``[record_source, provider]`` pairs.
        ``record_source`` has ``source_id`` and, for the first pair only,
        ``is_shown_at``; ``provider`` has ``name``, ``identifier``
        (upper-cased ``id`` attribute) and ``scheme``.
    :raises ValueError: when the source ID, the provider, or the provider
        ``id`` attribute is missing, or when the provider scheme cannot be
        resolved against the codelist.
    """
    record_sources = []
    for node in record.findall("./efg:recordSource", self.ns):
        source_id_el = node.find('efg:sourceID', self.ns)
        if source_id_el is None or source_id_el.text is None:
            raise ValueError('Record source ID is missing')
        rs = {'source_id': source_id_el.text.strip()}
        log.debug('record source [ID]: %s' % rs['source_id'])

        # record provider
        provider_el = node.find('efg:provider', self.ns)
        if provider_el is None or provider_el.text is None:
            raise ValueError('Record source provider is missing')
        provider_id = provider_el.get('id')
        if provider_id is None:
            raise ValueError('Record source provider id is missing')
        p_scheme = provider_el.get('schemeID')
        scheme = codelists.fromDescription(p_scheme,
                                           codelists.PROVIDER_SCHEMES)
        if scheme is None:
            raise ValueError('Invalid provider scheme value for [%s]' %
                             p_scheme)
        provider = {
            'name': provider_el.text.strip(),
            'identifier': provider_id.upper(),
            'scheme': scheme[0],
        }
        log.debug('Record Provider: {}'.format(provider))

        # Bind the url only to the first element. This is a naive solution
        # but enough, because we expect ONLY one record_source here (that
        # of the archive).
        if not record_sources:
            rs['is_shown_at'] = self.get_record_source_url(
                record, audio_visual)
        record_sources.append([rs, provider])
    return record_sources
def get_colour(self, record):
    """
    Return the colour code of a non-AV manifestation, if any.

    :param record: XML element to search for
        ``./efg:nonAVManifestation/efg:colour``.
    :return: the first item of the matching ``codelists.COLOUR`` entry, or
        ``None`` when the element is missing, empty, set to the
        placeholder 'n/a', or cannot be resolved.

    An unresolvable value is reported in ``self.warnings``.
    """
    node = record.find('./efg:nonAVManifestation/efg:colour', self.ns)
    if node is None or node.text is None:
        return None

    value = node.text.strip()
    # Same placeholder convention as the colour of an AV format.
    if value.lower() == 'n/a':
        return None

    code_el = codelists.fromDescription(value, codelists.COLOUR)
    if code_el is None:
        # Report the offending text of the XML node; code_el is None in
        # this branch and has nothing to report.
        self.warnings.append('Invalid format colour for: ' + value)
        return None
    return code_el[0]
def parse_related_agents(self, record):
    """
    Extract related agents as a list of Agent props with their related
    contribution activities in the creation.
    e.g. [[<type 'dict'>, ['Director', 'Screenplay']], etc.]

    :param record: XML element to search for ``./efg:relPerson`` and
        ``./efg:relCorporate`` children.
    :return: a list of ``[props, activities]`` pairs. ``props`` has
        ``names`` (a one-item list) and ``agent_type`` ('P' for a person,
        'C' for a corporate body); ``activities`` is the list of activity
        descriptions that resolved against ``codelists.TYPE_OF_ACTIVITY``.
        Agents sharing a name are merged into the first occurrence and
        their activities are accumulated there.
    :raises ValueError: when an agent has no name.

    Unresolvable activities are reported in ``self.warnings``.
    """
    # Tag every node with its agent type according to the query that
    # produced it. This avoids inspecting element tags, which carry the
    # namespace URI and never equal a bare 'relPerson'.
    typed_nodes = [(node, 'P')
                   for node in record.findall('./efg:relPerson', self.ns)]
    typed_nodes.extend(
        (node, 'C')
        for node in record.findall('./efg:relCorporate', self.ns))

    agents = []
    for agent_node, agent_type in typed_nodes:
        name_el = agent_node.find('efg:name', self.ns)
        if name_el is None or name_el.text is None:
            raise ValueError('Agent name is missing')
        name = name_el.text.strip()
        props = {'names': [name], 'agent_type': agent_type}

        activities = []
        type_el = agent_node.find('efg:type', self.ns)
        if type_el is not None and type_el.text is not None:
            activity = type_el.text.strip()
            if activity.lower() != 'n/a':
                code_el = codelists.fromDescription(
                    activity, codelists.TYPE_OF_ACTIVITY)
                if code_el is None:
                    self.warnings.append(
                        'Invalid agent activity for: ' + activity)
                else:
                    activities.append(activity)

        # de-duplicate agents
        for item in agents:
            if name in item[0]['names']:
                log.debug('FOUND agent: ' + name)
                item[1].extend(activities)
                log.debug('added activities: {}'.format(activities))
                break
        else:
            agents.append([props, activities])

    log.debug([item[0]['names'][0] for item in agents])
    return agents
def get_rights_status(self, record, audio_visual=False):
    """
    Return the rights status code of the manifestation.

    :param record: XML element to search.
    :param audio_visual: when true the status is read from
        ``efg:avManifestation``, otherwise from ``efg:nonAVManifestation``.
    :return: the first item of the matching ``codelists.RIGHTS_STATUS``
        entry.
    :raises ValueError: when the ``efg:rightsStatus`` element is missing
        or empty, or when its text cannot be resolved against the
        codelist.
    """
    inpath = 'efg:avManifestation' if audio_visual else 'efg:nonAVManifestation'
    node = record.find("./" + inpath + "/efg:rightsStatus", self.ns)
    # An empty element (text is None) is as good as a missing one.
    if node is None or node.text is None:
        raise ValueError("Rights status is missing")

    value = node.text.strip()
    code_el = codelists.fromDescription(value, codelists.RIGHTS_STATUS)
    if code_el is None:
        raise ValueError('Invalid rights status description for: ' + value)
    return code_el[0]
def get_non_av_specific_type(self, record):
    """
    Return the specific type code of a non-AV manifestation.

    :param record: XML element to search for
        ``./efg:nonAVManifestation/efg:specificType``.
    :return: the first item of the matching
        ``codelists.NON_AV_SPECIFIC_TYPES`` entry.
    :raises ValueError: when the element is missing or empty, or when its
        text cannot be resolved against the codelist.
    """
    node = record.find('./efg:nonAVManifestation/efg:specificType', self.ns)
    # An empty element (text is None) is as good as a missing one.
    if node is None or node.text is None:
        raise ValueError('Non-AV specific type is missing')

    value = node.text.strip()
    code_el = codelists.fromDescription(value,
                                        codelists.NON_AV_SPECIFIC_TYPES)
    if code_el is None:
        raise ValueError('Invalid Non-AV specific type for: ' + value)
    return code_el[0]
def parse_languages(self, record, audio_visual=False):
    """
    Extract language and usage if any.

    It returns an array of arrays as in the following example:
    [['fr','03'],['fr','25'],['ca','25']]
    The second nested element corresponds to the usage code in the
    controlled codelist.

    :param record: XML element to search.
    :param audio_visual: when true languages are read from
        ``efg:avManifestation``, otherwise from ``efg:nonAVManifestation``.
    :return: a list of ``[language, usage]`` pairs; ``usage`` is ``None``
        when the ``usage`` attribute is absent, 'n/a' or unresolvable.

    Empty elements and the placeholder 'n/a' are skipped. Languages and
    usages that cannot be resolved are reported in ``self.warnings``; an
    unresolvable language drops the whole pair.
    """
    inpath = 'efg:avManifestation' if audio_visual else 'efg:nonAVManifestation'
    languages = []
    for node in record.findall("./" + inpath + "/efg:language", self.ns):
        # An empty element has text None: nothing to resolve.
        if node.text is None:
            continue
        raw_lang = node.text.strip()
        lang = raw_lang.lower()
        if lang == 'n/a':
            continue
        lang_code = codelists.fromCode(lang, codelists.LANGUAGE)
        if lang_code is None:
            self.warnings.append('Invalid language for: ' + raw_lang)
            continue
        lang = lang_code[0]

        usage = node.get('usage')
        if usage is not None:
            if usage.lower() == 'n/a':
                usage = None
            else:
                code_el = codelists.fromDescription(
                    usage, codelists.LANGUAGE_USAGES)
                if code_el is None:
                    self.warnings.append(
                        'Invalid language usage for: ' + usage)
                    usage = None
                else:
                    usage = code_el[0]

        log.debug("lang code: {}, usage code: {}".format(lang, usage))
        languages.append([lang, usage])
    return languages
def parse_keywords(self, record):
    """
    Extract the keyword terms found directly under the record.

    :param record: XML element to search for ``efg:keywords`` children,
        each holding ``efg:term`` elements.
    :return: a list of dicts, one per accepted term. Each dict always has
        ``term``, ``termID`` (the ``id`` attribute when it is an integer
        literal, else ``None``) and ``schemeID`` (the ``scheme`` attribute
        of the enclosing ``efg:keywords``). ``keyword_type`` and
        ``language`` are added only when the ``type`` / ``lang``
        attributes of the enclosing element resolve against their
        codelists.

    Groups of type 'Project' are filtered out entirely. For groups of type
    'Form' every term must resolve against ``codelists.FORM``; 'n/a' and
    unresolvable terms are dropped. Problems are reported in
    ``self.warnings``, once per ``efg:keywords`` element for its
    attributes and once per offending term.
    """
    keywords = []
    for node in record.findall("efg:keywords", self.ns):
        ktype = node.get('type')
        # filter ktype with value 'Project'
        if ktype == 'Project':
            continue
        terms = node.findall('efg:term', self.ns)
        if not terms:
            continue

        # Properties shared by every term of this group; resolved once so
        # that a bad attribute is reported once, not once per term.
        shared = {}
        if ktype is not None and ktype.lower() != 'n/a':
            code_el = codelists.fromDescription(ktype,
                                                codelists.KEYWORD_TYPES)
            if code_el is None:
                self.warnings.append('Invalid keyword type for: ' + ktype)
            else:
                shared['keyword_type'] = code_el[0]
                log.debug('keyword [type]: %s' % shared['keyword_type'])

        lang = node.get('lang')
        if lang is not None and lang.lower() != 'n/a':
            lang_code = codelists.fromCode(lang.lower(), codelists.LANGUAGE)
            if lang_code is None:
                # lang is the attribute string itself, not an element.
                self.warnings.append(
                    'Invalid keyword language for: ' + lang)
            else:
                shared['language'] = lang_code[0]
                log.debug('language: {}'.format(shared['language']))

        scheme_id = node.get('scheme')

        for term in terms:
            if term.text is None:
                self.warnings.append('Missing keyword term text')
                continue
            value = term.text.strip()
            keyword = dict(shared)

            if ktype == 'Form':
                # check term from a controlled IMC list
                if value.lower() == 'n/a':
                    continue
                code_el = codelists.fromCode(value, codelists.FORM)
                if code_el is None:
                    self.warnings.append('Invalid form type for: ' + value)
                    continue
                keyword['term'] = code_el[0]
            else:
                keyword['term'] = value
            log.debug('keyword: {}'.format(keyword['term']))

            # The term id is optional but must be an integer when given;
            # the key is always present so consumers see a stable shape.
            keyword['termID'] = None
            term_id = term.get('id')
            if term_id is not None:
                try:
                    int(term_id)
                except ValueError:
                    self.warnings.append('Invalid keyword term id for: ' +
                                         term_id + '. Expected integer.')
                else:
                    keyword['termID'] = term_id

            keyword['schemeID'] = scheme_id
            keywords.append(keyword)
    return keywords