def do_import(options):
    """Import London Design Museum people records into Solr.

    Reads a (optionally bz2-compressed) CSV of people from
    ``options.people``, builds one Solr document per row and posts them to
    ``options.solr`` in batches of 1000. With ``options.purge`` set, any
    previously imported records for this collection are deleted first.
    """
    people = options.people

    if options.uncompressed:
        fh = open(people, 'r')
    else:
        fh = bz2.BZ2File(people, 'r')

    reader = unicodecsv.UnicodeReader(fh)

    docs = []
    endpoint = options.solr
    solr = pysolr.Solr(endpoint)

    if options.purge:
        # BUG FIX: documents below are indexed with collection 'ldm', but the
        # purge previously queried 'collection:london-design-museum', which
        # matches nothing we write — old records were never deleted. Query the
        # value we actually index. (If the intent was the long name, change
        # the doc's 'collection' field instead — confirm against the schema.)
        solr.delete(q="collection:ldm")

    try:
        for row in reader:
            doc = {
                'uri': 'x-urn:ldm:id=%s' % row['id'],
                'collection': 'ldm',
                'collection_id': row['id'],
                'name': row['name'],
            }

            # Collect any "<namespace>:id" columns as machine-tag concordances.
            concordances = []

            for k, v in row.items():
                if not v:
                    continue
                if k == 'confidence':
                    continue

                parts = k.split(':')
                if len(parts) == 2 and parts[1] == 'id':
                    concordances.append("=".join((k, v)))

            if concordances:
                doc['concordances'] = concordances
                doc['concordances_machinetags'] = utils.generate_concordances_machinetags(concordances)
                doc['concordances_machinetags_hierarchy'] = utils.generate_concordances_machinetags_hierarchy(concordances)

            docs.append(doc)

            # Flush to Solr in batches of 1000 to bound memory use.
            if len(docs) == 1000:
                solr.add(docs)
                docs = []

        # Flush the final partial batch.
        if docs:
            solr.add(docs)
    finally:
        # Previously leaked: the handle was never closed.
        fh.close()

    solr.optimize()
def do_import(options):
    """Import Walker Art Center people records into Solr.

    Reads a (optionally bz2-compressed) CSV of people from
    ``options.people``, builds one Solr document per row (keyed by the
    ULAN id) and posts them to ``options.solr`` in batches of 1000. With
    ``options.purge`` set, existing records for this collection are
    deleted first.
    """
    people = options.people

    if options.uncompressed:
        fh = open(people, 'r')
    else:
        fh = bz2.BZ2File(people, 'r')

    reader = unicodecsv.UnicodeReader(fh)

    docs = []
    endpoint = options.solr
    solr = pysolr.Solr(endpoint)

    if options.purge:
        solr.delete(q="collection:walkerartcenter")

    try:
        for row in reader:
            # Renamed from 'id', which shadowed the builtin.
            ulan_id = row['ulanid']

            doc = {
                'uri': 'x-urn:wac:id=%s' % ulan_id,
                'collection': 'walkerartcenter',
                'collection_id': ulan_id,
                'name': row['preferredlabel'],
            }

            if row.get('birthdate', ''):
                doc['year_birth'] = row['birthdate']

            if row.get('deathdate', ''):
                doc['year_death'] = row['deathdate']

            # Every row has exactly one concordance (its ULAN id), so the
            # original "if len(concordances)" guard was always true.
            concordances = ['ulan:id=%s' % ulan_id]

            doc['concordances'] = concordances
            doc['concordances_machinetags'] = utils.generate_concordances_machinetags(concordances)
            doc['concordances_machinetags_hierarchy'] = utils.generate_concordances_machinetags_hierarchy(concordances)

            docs.append(doc)

            # Flush to Solr in batches of 1000 to bound memory use.
            if len(docs) == 1000:
                solr.add(docs)
                docs = []

        # Flush the final partial batch.
        if docs:
            solr.add(docs)
    finally:
        # Previously leaked: the handle was never closed.
        fh.close()

    solr.optimize()
def do_import(options):
    """Import Indianapolis Museum of Art people records into Solr.

    Reads a (optionally bz2-compressed) CSV of people from
    ``options.people``, builds one Solr document per row and posts them to
    ``options.solr`` in batches of 1000. Rows without a display name are
    skipped; birth/death dates are only kept when they look like
    "YYYY" or "YYYY - YYYY". With ``options.purge`` set, existing records
    for this collection are deleted first.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence
    # (DeprecationWarning, and a SyntaxWarning/error in newer Pythons).
    # Matches "YYYY" or "YYYY - YYYY" (anchored at the end only, as before).
    dt = re.compile(r"\d{4}(?:\s?-\s?\d{4})?$")

    people = options.people

    if options.uncompressed:
        fh = open(people, 'r')
    else:
        fh = bz2.BZ2File(people, 'r')

    reader = unicodecsv.UnicodeReader(fh)

    docs = []
    endpoint = options.solr
    solr = pysolr.Solr(endpoint)

    if options.purge:
        solr.delete(q="collection:imamuseum")

    try:
        for row in reader:
            # Skip rows with no usable name.
            if row['display_name'] == '':
                continue

            doc = {
                'uri': 'x-urn:imamuseum:id=%s' % row['irn'],
                'collection': 'imamuseum',
                'collection_id': row['irn'],
                'name': row['display_name'],
            }

            # Source dates are free-text; only accept "YYYY" or "YYYY - YYYY".
            # For death dates, "YYYY - YYYY" means the second year is the
            # death year; a lone "YYYY" is taken as-is.
            for prop in ('birth_date', 'death_date'):
                date = row[prop].strip()

                if not dt.match(date):
                    continue

                parts = date.split('-')

                if prop == 'birth_date':
                    doc['year_birth'] = parts[0].strip()
                elif len(parts) == 1:
                    doc['year_death'] = parts[0].strip()
                else:
                    doc['year_death'] = parts[1].strip()

            concordances = []

            if row['ulan:id'] != '':
                concordances.append('ulan:id=%s' % row['ulan:id'])

            if concordances:
                doc['concordances'] = concordances
                doc['concordances_machinetags'] = utils.generate_concordances_machinetags(concordances)
                doc['concordances_machinetags_hierarchy'] = utils.generate_concordances_machinetags_hierarchy(concordances)

            docs.append(doc)

            # Flush to Solr in batches of 1000 to bound memory use.
            if len(docs) == 1000:
                solr.add(docs)
                docs = []

        # Flush the final partial batch.
        if docs:
            solr.add(docs)
    finally:
        # Previously leaked: the handle was never closed.
        fh.close()

    solr.optimize()