def handle(self, *args, **options):
    """Command handle."""
    if self.has_filter(options):
        self.filter_indices(options)
    else:
        # Process all indices.
        index_builder.build()

def handle(self, *args, **options):
    """Command handle."""
    verbosity = int(options["verbosity"])

    if self.has_filter(options):
        self.filter_indices(options, verbosity)
    else:
        # Process all indices.
        index_builder.build()

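The handle() variants above call has_filter() and filter_indices(), which are defined elsewhere on the command class. For orientation only, a minimal hypothetical sketch of what such helpers could look like; the 'index' option name and the index_builder.indexes attribute are assumptions, not the actual implementation:

def has_filter(self, options):
    """Return True if any index filter was given (hypothetical helper)."""
    return bool(options.get('index'))

def filter_indices(self, options, verbosity=0):
    """Rebuild only the indices named in the (assumed) 'index' option."""
    for index in index_builder.indexes:  # assumed registry of index instances
        if type(index).__name__ in options['index']:
            index.build()
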
def setUp(self):
    self.collection1 = Collection.objects.get(pk=1)
    self.resource_name = 'collection'
    self.viewset = CollectionViewSet

    self.post_data = {
        'name': 'Test collection',
        'slug': 'test_collection',
    }

    super().setUp()

    # Reindex data objects as they are loaded in fixtures.
    # TODO: Remove this when we get rid of fixtures.
    from resolwe.elastic.builder import index_builder
    index_builder.build()

def setUp(self):
    self.data1 = Data.objects.get(pk=1)
    self.resource_name = 'data'
    self.viewset = DataViewSet

    self.data = {
        'name': 'New data',
        'slug': 'new_data',
        'collections': ['1'],
        'process': 'test_process',
    }

    super().setUp()

    # Reindex data objects as they are loaded in fixtures.
    # TODO: Remove this when we get rid of fixtures.
    from resolwe.elastic.builder import index_builder
    index_builder.build()

def setUp(self):
    self.data1 = Data.objects.get(pk=1)
    self.resource_name = "data"
    self.viewset = DataViewSet

    self.data = {
        "name": "New data",
        "slug": "new_data",
        "collection": {"id": 1},
        "process": {"slug": "test_process"},
    }

    super().setUp()

    # Reindex data objects as they are loaded in fixtures.
    # TODO: Remove this when we get rid of fixtures.
    from resolwe.elastic.builder import index_builder
    index_builder.build()

def test_bulk_indexing(self):
    from .test_app.models import TestModel
    from .test_app.elastic_indexes import TestSearchDocument

    first_obj = TestModel.objects.create(name='First name', number=42)
    TestModel.objects.create(name='Second name', number=43)

    # Delete whole index.
    index_builder.delete()
    es_objects = TestSearchDocument.search().execute()
    self.assertEqual(len(es_objects), 0)

    # Build empty queryset.
    index_builder.build(queryset=TestModel.objects.none())
    es_objects = TestSearchDocument.search().execute()
    self.assertEqual(len(es_objects), 0)

    # Build only the subset of the queryset defined in the index.
    index_builder.build(queryset=TestModel.objects.filter(pk=first_obj.pk))
    es_objects = TestSearchDocument.search().execute()
    self.assertEqual(len(es_objects), 1)

    # Delete whole index.
    index_builder.delete()
    es_objects = TestSearchDocument.search().execute()
    self.assertEqual(len(es_objects), 0)

    # Build only a single object.
    index_builder.build(obj=first_obj)
    es_objects = TestSearchDocument.search().execute()
    self.assertEqual(len(es_objects), 1)

    # Delete whole index.
    index_builder.delete()
    es_objects = TestSearchDocument.search().execute()
    self.assertEqual(len(es_objects), 0)

    # Build the whole queryset defined in the index.
    index_builder.build()
    es_objects = TestSearchDocument.search().execute()
    self.assertEqual(len(es_objects), 2)

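The test above depends on a test_app with a model and a registered search index. The following is a rough sketch of what those definitions might look like, inferred only from the imports in the test; the BaseDocument/BaseIndex base classes, the field types, and the 'test_search' index name are assumptions:

# test_app/elastic_indexes.py (hypothetical sketch)
from elasticsearch_dsl import field as dsl_field

from resolwe.elastic import indices

from .models import TestModel


class TestSearchDocument(indices.BaseDocument):  # assumed base class
    name = dsl_field.Text()
    number = dsl_field.Integer()

    class Meta:
        index = 'test_search'  # assumed index name


class TestSearchIndex(indices.BaseIndex):  # assumed base class
    queryset = TestModel.objects.all()  # the "whole queryset defined in the index"
    object_type = TestModel
    document_class = TestSearchDocument
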
def handle(self, *args, **options):
    """Command handle."""
    count_total, count_inserted = 0, 0
    to_index = []
    relation_type_choices = list(zip(*Mapping.RELATION_TYPE_CHOICES))[0]

    for tab_file_name, _, tab_file in decompress(options['file_name']):
        logger.info(__("Importing mappings from \"{}\"...", tab_file_name))

        mappings = set()
        for row in csv.DictReader(tab_file, delimiter=str('\t')):
            if row['relation_type'] not in relation_type_choices:
                raise ValidationError(
                    "Unknown relation type: {}".format(row['relation_type'])
                )

            # NOTE: For performance reasons this is a tuple instead of a dict.
            # A tuple can be hashed, so it can be used in the `in` operation,
            # and it is serialized to a JSON list. Make sure that any changes
            # also reflect in the SQL query below.
            mapping = (
                row['relation_type'],
                row['source_db'],
                row['source_id'],
                row['source_species'],
                row['target_db'],
                row['target_id'],
                row['target_species'],
            )

            if mapping in mappings:
                raise ValidationError(
                    "Duplicated mapping (relation type: '{}', source db: '{}', source id: "
                    "'{}', source species: {}, target db: '{}', target id: '{}', "
                    "target species: {}) found in '{}'".format(
                        row['relation_type'], row['source_db'], row['source_id'],
                        row['source_species'], row['target_db'], row['target_id'],
                        row['target_species'], tab_file_name
                    )
                )

            mappings.add(mapping)

        with connection.cursor() as cursor:
            # Insert only mappings that are not already present: the LEFT JOIN
            # keeps rows from the JSON payload that have no match in the table.
            cursor.execute(
                """
                WITH tmp AS (
                    INSERT INTO {table_name} (
                        relation_type, source_db, source_id, source_species,
                        target_db, target_id, target_species
                    )
                    SELECT
                        value->>0, value->>1, value->>2, value->>3,
                        value->>4, value->>5, value->>6
                    FROM json_array_elements(%s)
                    LEFT JOIN {table_name} ON
                        value->>0 = {table_name}.relation_type
                        AND value->>1 = {table_name}.source_db
                        AND value->>2 = {table_name}.source_id
                        AND value->>3 = {table_name}.source_species
                        AND value->>4 = {table_name}.target_db
                        AND value->>5 = {table_name}.target_id
                        AND value->>6 = {table_name}.target_species
                    WHERE {table_name}.relation_type IS NULL
                    RETURNING id
                )
                SELECT
                    COALESCE(array_agg(id), ARRAY[]::INTEGER[]) AS ids,
                    COUNT(*) AS count_inserted
                FROM tmp;
                """.format(
                    table_name=Mapping._meta.db_table,  # pylint: disable=no-member,protected-access
                ),
                params=[json.dumps(list(mappings))]
            )

            result = cursor.fetchone()
            to_index.extend(result[0])
            count_total += len(mappings)
            count_inserted += result[1]

    index_builder.build(queryset=Mapping.objects.filter(id__in=to_index))

    logger.info(  # pylint: disable=logging-not-lazy
        "Total mappings: %d. Inserted %d, unchanged %d." %
        (count_total, count_inserted, count_total - count_inserted)
    )

def handle(self, *args, **options):
    """Command handle."""
    count_total, count_inserted, count_updated = 0, 0, 0
    to_index = []
    type_choices = list(zip(*Feature.TYPE_CHOICES))[0]
    subtype_choices = list(zip(*Feature.SUBTYPE_CHOICES))[0]

    for tab_file_name, tab_file in decompress(options['file_name']):
        logger.info(__("Importing features from \"{}\"...", tab_file_name))

        features = []
        unique_features = set()
        for row in csv.DictReader(tab_file, delimiter=str('\t')):
            sub_type = SUBTYPE_MAP.get(row['Gene type'], 'other')

            if row['Type'] not in type_choices:
                raise ValidationError("Unknown type: {}".format(row['Type']))
            if sub_type not in subtype_choices:
                raise ValidationError("Unknown subtype: {}".format(sub_type))

            aliases_text = row['Aliases'].strip()
            aliases = []
            if aliases_text and aliases_text != '-':
                aliases = aliases_text.split(',')

            if (row['Source'], row['ID']) in unique_features:
                raise ValidationError(
                    "Duplicated feature (source: '{}', id: '{}') found in '{}'".format(
                        row['Source'], row['ID'], tab_file_name
                    )
                )

            # NOTE: For performance reasons this is a list instead of a dict.
            # Make sure that any changes also reflect in the SQL query below.
            features.append([
                row['Source'],
                row['ID'],
                row['Species'],
                row['Type'],
                sub_type,
                row['Name'],
                row['Full name'],
                row['Description'],
                aliases,
            ])
            unique_features.add((row['Source'], row['ID']))

        with connection.cursor() as cursor:
            # Upsert the features. In the final SELECT, xmax = 0 identifies
            # freshly inserted rows, while a non-zero xmax marks rows that
            # were updated by the ON CONFLICT clause.
            cursor.execute(
                """
                WITH tmp AS (
                    INSERT INTO {table_name} (
                        source, feature_id, species, type, sub_type,
                        name, full_name, description, aliases
                    )
                    SELECT
                        value->>0, value->>1, value->>2, value->>3, value->>4,
                        value->>5, value->>6, value->>7,
                        ARRAY(SELECT json_array_elements_text(value->8))
                    FROM json_array_elements(%s)
                    ON CONFLICT (species, source, feature_id) DO UPDATE SET
                        type = EXCLUDED.type,
                        sub_type = EXCLUDED.sub_type,
                        name = EXCLUDED.name,
                        full_name = EXCLUDED.full_name,
                        description = EXCLUDED.description,
                        aliases = EXCLUDED.aliases
                    WHERE (
                        {table_name}.type, {table_name}.sub_type, {table_name}.name,
                        {table_name}.full_name, {table_name}.description, {table_name}.aliases
                    ) IS DISTINCT FROM (
                        EXCLUDED.type, EXCLUDED.sub_type, EXCLUDED.name,
                        EXCLUDED.full_name, EXCLUDED.description, EXCLUDED.aliases
                    )
                    RETURNING id, xmax
                )
                SELECT
                    COALESCE(array_agg(id), ARRAY[]::INTEGER[]) AS ids,
                    COUNT(CASE WHEN xmax = 0 THEN 1 END) AS count_inserted,
                    COUNT(CASE WHEN xmax != 0 THEN 1 END) AS count_updated
                FROM tmp;
                """.format(
                    table_name=Feature._meta.db_table,  # pylint: disable=no-member,protected-access
                ),
                params=[json.dumps(features)]
            )

            result = cursor.fetchone()
            to_index.extend(result[0])
            count_total += len(features)
            count_inserted += result[1]
            count_updated += result[2]

    index_builder.build(queryset=Feature.objects.filter(id__in=to_index))

    logger.info(  # pylint: disable=logging-not-lazy
        "Total features: %d. Inserted %d, updated %d, unchanged %d." %
        (count_total, count_inserted, count_updated,
         count_total - count_inserted - count_updated)
    )

def handle(self, *args, **options):
    """Command handle."""
    count_total, count_inserted = 0, 0
    to_index = []
    relation_type_choices = list(zip(*Mapping.RELATION_TYPE_CHOICES))[0]

    for tab_file_name, tab_file in decompress(options['file_name']):
        logger.info(__("Importing mappings from \"{}\"...", tab_file_name))

        mappings = set()
        for row in csv.DictReader(tab_file, delimiter=str('\t')):
            if row['relation_type'] not in relation_type_choices:
                raise ValidationError(
                    "Unknown relation type: {}".format(row['relation_type'])
                )

            # NOTE: For performance reasons this is a tuple instead of a dict.
            # A tuple can be hashed, so it can be used in the `in` operation,
            # and it is serialized to a JSON list. Make sure that any changes
            # also reflect in the SQL query below.
            mapping = (
                row['relation_type'],
                row['source_db'],
                row['source_id'],
                row['source_species'],
                row['target_db'],
                row['target_id'],
                row['target_species'],
            )

            if mapping in mappings:
                raise ValidationError(
                    "Duplicated mapping (relation type: '{}', source db: '{}', source id: "
                    "'{}', source species: {}, target db: '{}', target id: '{}', "
                    "target species: {}) found in '{}'".format(
                        row['relation_type'], row['source_db'], row['source_id'],
                        row['source_species'], row['target_db'], row['target_id'],
                        row['target_species'], tab_file_name
                    )
                )

            mappings.add(mapping)

        with connection.cursor() as cursor:
            cursor.execute(
                """
                WITH tmp AS (
                    INSERT INTO {table_name} (
                        relation_type, source_db, source_id, source_species,
                        target_db, target_id, target_species
                    )
                    SELECT
                        value->>0, value->>1, value->>2, value->>3,
                        value->>4, value->>5, value->>6
                    FROM json_array_elements(%s)
                    ON CONFLICT DO NOTHING  -- conflict means that the mapping is already present
                    RETURNING id
                )
                SELECT
                    COALESCE(array_agg(id), ARRAY[]::INTEGER[]) AS ids,
                    COUNT(*) AS count_inserted
                FROM tmp;
                """.format(
                    table_name=Mapping._meta.db_table,  # pylint: disable=no-member,protected-access
                ),
                params=[json.dumps(list(mappings))]
            )

            result = cursor.fetchone()
            to_index.extend(result[0])
            count_total += len(mappings)
            count_inserted += result[1]

    index_builder.build(queryset=Mapping.objects.filter(id__in=to_index))

    logger.info(  # pylint: disable=logging-not-lazy
        "Total mappings: %d. Inserted %d, unchanged %d." %
        (count_total, count_inserted, count_total - count_inserted)
    )

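In both mapping importers, the payload bound to %s is a JSON list of seven-element rows, so value->>0 through value->>6 address the columns positionally. A small illustration with made-up values:

import json

# Positions must line up with the value->>N expressions in the SQL above.
mapping = (
    'ortholog',            # relation_type (made-up value)
    'ENSEMBL',             # source_db
    'ENSG00000000003',     # source_id
    'Homo sapiens',        # source_species
    'ENSEMBL',             # target_db
    'ENSMUSG00000067377',  # target_id
    'Mus musculus',        # target_species
)
print(json.dumps([mapping]))
# [["ortholog", "ENSEMBL", "ENSG00000000003", "Homo sapiens", ...]]
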
def handle(self, *args, **options):
    """Command handle."""
    index_builder.build(push=False)
    index_builder.push()
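
This last variant splits indexing into two phases: build(push=False) prepares all documents without sending them, and push() then ships everything to Elasticsearch in one step. Assuming it lives in a Django management command (the command name here is an assumption), it could be invoked programmatically like this:

from django.core.management import call_command

# Hypothetical: 'elastic_index' is an assumed command name.
call_command('elastic_index')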