def import_into_xapian(): client = Client(settings.XAPIAN_BASE_URL, settings.XAPIAN_SPECIES_DB) client.newdb([{ 'field_name': 'common_name', 'store': True, 'freetext': {'language': 'en'} # language used for stemming }, { 'field_name': 'scientific_name', 'store': True, 'freetext': {'language': 'en'} # Remove when stemming bug is fixed }, { 'field_name': 'freebase_id', 'store': True, 'freetext': {'language': 'en'} }], overwrite=True) # replaces existing index if there is one # We have a database! # Now we create documents queue = [] count = 0 for row in import_from_file(): if not row['scientific_name']: continue count += 1 doc = Document() # doc.id = 'X' will over-ride auto ID /AND/ cause replace if exists doc.extend([ ('common_name', row['name']), ('scientific_name', row['scientific_name']), ('freebase_id', row['id']), ]) # client.add(doc) - would work here queue.append(doc) if len(queue) >= 1000: client.bulkadd(queue) queue = [] print "Imported %d" % count # Catch the remainder if queue: client.bulkadd(queue)
def import_into_xapian(): client = Client( settings.XAPIAN_BASE_URL, settings.XAPIAN_LOCATION_DB ) try: client.deldb() except: # BAD: Naked except pass client.newdb([{ 'field_name': 'place_name', 'store': True, 'freetext': {'language': 'en'} # language used for stemming }, { 'field_name': 'county', # Maps to admin_name2 'store': True, 'freetext': {} }, { 'field_name': 'country_code', 'store': True, 'freetext': {} # TODO: Use exact match here, not yet implemented }, { 'field_name': 'postal_code', 'store': True, 'freetext': {} # TODO: Can we do prefix search only? }, { 'field_name': 'description', 'store': True, # stored but not indexed }, { 'field_name': 'latlon', 'store': True, 'type': 'geo', 'geo': {}, # no options yet }]) # We have a database! # We throw away anything that results in a description that we have # already used for something else. There are only 213 (out of 27,000) # where a duplicate description has more than one lat/lon pair - so # we've chosen to just discard those. seen_descriptions = set() # Now we create documents queue = [] count = 0 for row in import_from_file(): # Some (3) of them don't have lat or lon - ignore those if not (row['latitude'] and row['longitude']): continue description = make_description(row) if description in seen_descriptions: continue seen_descriptions.add(description) count += 1 doc = Document() # doc.id = 'X' will over-ride auto ID /AND/ cause replace if exists doc.extend([ ('place_name', row['place_name']), ('county', row['admin_name2']), ('postal_code', row['postal_code']), ('country_code', row['country_code']), ('description', description), ('latlon', '%s %s' % ( row['latitude'], row['longitude'], )), # TODO: Ignoring accuracy field for the moment ]) # client.add(doc) - would work here queue.append(doc) if len(queue) >= 1000: client.bulkadd(queue) queue = [] print "Imported %d" % count # Catch the remainder if queue: client.bulkadd(queue)