def _add_tal_graph(self, graph):
    '''
    Add the TAL source chain to `graph` and extend it via `add_tal_chain`.

    The chain is MatchingFiles -> CurriedXMLReader -> RecordCounter ->
    _xml_element_to_dict, all bound to the 'fs.data.aata' filesystem name.
    Returns whatever `self.add_tal_chain` returns.
    '''
    fs_name = 'fs.data.aata'
    file_finder = MatchingFiles(path='/', pattern=self.tal_pattern, fs=fs_name)
    xml_reader = CurriedXMLReader(xpath='/auth_TAL_XML/record', fs=fs_name, limit=self.limit)
    counter = RecordCounter(name='tal', verbose=self.debug)
    tal_records = graph.add_chain(file_finder, xml_reader, counter, _xml_element_to_dict)
    return self.add_tal_chain(graph, tal_records)
def _add_series_graph(self, graph):
    '''
    Add the series source chain to `graph` and extend it via `add_series_chain`.

    The chain is MatchingFiles -> CurriedXMLReader -> RecordCounter ->
    _xml_element_to_dict, all bound to the 'fs.data.aata' filesystem name.
    Returns whatever `self.add_series_chain` returns.
    '''
    fs_name = 'fs.data.aata'
    file_finder = MatchingFiles(path='/', pattern=self.series_pattern, fs=fs_name)
    xml_reader = CurriedXMLReader(xpath='/series_XML/record', fs=fs_name, limit=self.limit)
    counter = RecordCounter(name='series', verbose=self.debug)
    series_records = graph.add_chain(file_finder, xml_reader, counter, _xml_element_to_dict)
    return self.add_series_chain(graph, series_records)
def _add_people_graph(self, graph):
    '''
    Add the people source chain to `graph` and extend it via `add_people_chain`.

    The chain is MatchingFiles -> CurriedXMLReader -> RecordCounter ->
    _xml_element_to_dict, all bound to the 'fs.data.aata' filesystem name.
    Returns whatever `self.add_people_chain` returns.
    '''
    fs_name = 'fs.data.aata'
    file_finder = MatchingFiles(path='/', pattern=self.people_pattern, fs=fs_name)
    xml_reader = CurriedXMLReader(xpath='/auth_person_XML/record', fs=fs_name, limit=self.limit)
    counter = RecordCounter(name='people', verbose=self.debug)
    people_records = graph.add_chain(file_finder, xml_reader, counter, _xml_element_to_dict)
    return self.add_people_chain(graph, people_records)
def _add_abstracts_graph(self, graph):
    '''
    Add the abstracts source chain to `graph` and extend it via `add_articles_chain`.

    The chain is MatchingFiles -> CurriedXMLReader -> RecordCounter ->
    _xml_element_to_dict, all bound to the 'fs.data.aata' filesystem name.
    Returns whatever `self.add_articles_chain` returns.
    '''
    fs_name = 'fs.data.aata'
    file_finder = MatchingFiles(path='/', pattern=self.abstracts_pattern, fs=fs_name)
    xml_reader = CurriedXMLReader(xpath='/AATA_XML/record', fs=fs_name, limit=self.limit)
    counter = RecordCounter(name='abstracts', verbose=self.debug)
    records = graph.add_chain(file_finder, xml_reader, counter, _xml_element_to_dict)
    return self.add_articles_chain(graph, records)
def _add_geog_graph(self, graph):
    '''
    Add the geog source chain to `graph` and extend it via `add_geog_chain`.

    The chain is MatchingFiles -> CurriedXMLReader -> _xml_element_to_dict,
    bound to the 'fs.data.aata' filesystem name. Returns whatever
    `self.add_geog_chain` returns.
    '''
    # NOTE(review): no RecordCounter node in this chain, unlike the tal/series/
    # people/abstracts loaders in this file -- confirm that is intentional.
    fs_name = 'fs.data.aata'
    file_finder = MatchingFiles(path='/', pattern=self.geog_pattern, fs=fs_name)
    xml_reader = CurriedXMLReader(xpath='/auth_geog_XML/record', fs=fs_name, limit=self.limit)
    geog_records = graph.add_chain(file_finder, xml_reader, _xml_element_to_dict)
    return self.add_geog_chain(graph, geog_records)
def _add_journals_graph(self, graph):
    '''
    Add the journals source chain to `graph` and extend it via `add_journals_chain`.

    The chain is MatchingFiles -> CurriedXMLReader -> _xml_element_to_dict,
    bound to the 'fs.data.aata' filesystem name. Returns whatever
    `self.add_journals_chain` returns.
    '''
    # NOTE(review): no RecordCounter node in this chain, unlike the tal/series/
    # people/abstracts loaders in this file -- confirm that is intentional.
    fs_name = 'fs.data.aata'
    file_finder = MatchingFiles(path='/', pattern=self.journals_pattern, fs=fs_name)
    xml_reader = CurriedXMLReader(xpath='/journal_XML/record', fs=fs_name, limit=self.limit)
    journal_records = graph.add_chain(file_finder, xml_reader, _xml_element_to_dict)
    return self.add_journals_chain(graph, journal_records)
def _construct_graph(self, services=None):
    '''
    Build the bonobo.Graph for the whole pipeline and store it on `self.graph`.

    A single source chain reads the contents CSV files from the
    'fs.data.people' filesystem name, groups the listed columns under a
    'person' key (renaming some of them), extracts that 'person' value and
    passes it to AddPerson. The resulting chain is then handed to
    `add_person_or_group_chain` and `add_places_chain`, both with
    serialize=True.

    `services` is accepted but not used by this method. Nothing is returned.
    '''
    fs_name = 'fs.data.people'

    # Source column name -> key name used under the grouped 'person' value.
    person_renames = {
        'person_authority': 'auth_name',
        'person_auth_disp': 'auth_display_name',
        'ulan_id': 'ulan',
        'birth_date': 'birth',
        'death_date': 'death',
        'notes': 'internal_notes'
    }

    # Columns gathered into the 'person' group.
    person_properties = (
        'star_record_no',
        'person_authority',
        'person_auth_disp',
        'variant_names',
        'type',
        'project',
        'birth_date',
        'death_date',
        'period_active',
        'century_active',
        'active_city_date',
        'nationality',
        'location',
        'address',
        'subjects_painted',
        'source',
        'medal_received',
        'text',
        'notes',
        'brief_notes',
        'working_notes',
        'bibliography',
        'ulan_id',
        'segment',
    )

    key_operations = [
        {
            'group': {
                'person': {
                    'rename_keys': person_renames,
                    'properties': person_properties,
                }
            }
        }
    ]

    pipeline = bonobo.Graph()
    contents_records = pipeline.add_chain(
        MatchingFiles(path='/', pattern=self.contents_files_pattern, fs=fs_name),
        CurriedCSVReader(fs=fs_name, limit=self.limit, field_names=self.contents_headers),
        KeyManagement(operations=key_operations),
        ExtractKeyedValue(key='person'),
        AddPerson(helper=self.helper),
    )

    # Both downstream chains branch off the same upstream records chain;
    # their return values are not needed here.
    self.add_person_or_group_chain(pipeline, contents_records, serialize=True)
    self.add_places_chain(pipeline, contents_records, key='places', serialize=True)

    self.graph = pipeline