def importer_submit(request):
    """Entry point for the importer wizard's submit step.

    Reads JSON-encoded 'source' and 'destination' payloads from the POST body
    and dispatches on the destination output format:
      - 'index':    index the source file into Solr, either through a batch
                    indexer job or directly via SolrClient,
      - 'database': create a database,
      - otherwise:  create a table.

    Returns a JsonResponse wrapping the backend's job handle.
    Raises PopupException when the source file exceeds MAX_UPLOAD_SIZE.
    """
    source = json.loads(request.POST.get('source', '{}'))
    destination = json.loads(request.POST.get('destination', '{}'))
    # Historical workaround ("a very weird bug"): downstream code reads the
    # misspelled key 'ouputFormat', so mirror the real value under it.  The
    # original parsed the 'destination' JSON three times to do this; a single
    # parse is equivalent.
    destination['ouputFormat'] = destination['outputFormat']
    start_time = json.loads(request.POST.get('start_time', '-1'))

    if destination['ouputFormat'] == 'index':
        source['columns'] = destination['columns']
        index_name = destination["name"]

        if destination['indexerRunJob']:
            _convert_format(source["format"], inverse=True)
            job_handle = _index(request, source, index_name, start_time=start_time,
                                lib_path=destination['indexerJobLibPath'])
        else:
            client = SolrClient(request.user)
            # BUG FIX: the primary key must drive Solr's uniqueKey and the
            # default field must drive 'df'; the original assignments had the
            # two destination keys swapped.
            unique_key_field = destination['indexerPrimaryKey'] and destination['indexerPrimaryKey'][0] or None
            df = destination['indexerDefaultField'] and destination['indexerDefaultField'][0] or None
            kwargs = {}

            stats = request.fs.stats(source["path"])
            if stats.size > MAX_UPLOAD_SIZE:
                raise PopupException(_('File size is too large to handle!'))

            indexer = MorphlineIndexer(request.user, request.fs)
            fields = indexer.get_kept_field_list(source['columns'])
            if not unique_key_field:
                # No explicit key: synthesize one and ask the client to fill
                # it in with generated row ids.
                unique_key_field = 'hue_id'
                fields += [{"name": unique_key_field, "type": "string"}]
                kwargs['rowid'] = unique_key_field

            if not client.exists(index_name):
                client.create_index(name=index_name, fields=fields,
                                    unique_key_field=unique_key_field, df=df)

            data = request.fs.read(source["path"], 0, MAX_UPLOAD_SIZE)
            client.index(name=index_name, data=data, **kwargs)
            job_handle = {
                'status': 0,
                'on_success_url': reverse('search:browse', kwargs={'name': index_name}),
            }
    elif destination['ouputFormat'] == 'database':
        job_handle = _create_database(request, source, destination, start_time)
    else:
        job_handle = _create_table(request, source, destination, start_time)

    return JsonResponse(job_handle)
def index(request):
    """Index the posted 'data' payload into the Solr collection named by
    'name', replying with a status/message JSON envelope."""
    collection_name = request.POST.get('name')
    payload = request.POST.get('data')

    solr = SolrClient(request.user)
    solr.index(collection_name, payload)

    return JsonResponse({
        'status': 0,
        'message': _('Data added'),
    })
def update_data_from_hive(self, collection_or_core_name, columns, fetch_handle):
    """Stream rows from a Hive result handle into a Solr collection/core.

    Fetches up to MAX_ROWS rows in FETCH_BATCH-sized pages via
    ``fetch_handle(batch_size, start_over)``, builds a CSV batch per page
    (header row = ``columns``) and posts it to Solr.

    Raises PopupException if indexing fails or any error occurs.
    """
    MAX_ROWS = 10000
    ROW_COUNT = 0
    FETCH_BATCH = 1000

    has_more = True
    client = SolrClient(self.user)

    try:
        while ROW_COUNT < MAX_ROWS and has_more:
            # Second argument restarts the fetch on the first iteration only.
            result = fetch_handle(FETCH_BATCH, ROW_COUNT == 0)
            has_more = result['has_more']

            if result['data']:
                dataset = tablib.Dataset()
                dataset.append(columns)

                for i, row in enumerate(result['data']):
                    # Prepend a synthetic running row id; coerce falsy cells to
                    # 0 for numbers and '' otherwise so the CSV stays rectangular.
                    dataset.append([ROW_COUNT + i] + [
                        cell if cell else (0 if isinstance(cell, numbers.Number) else '')
                        for cell in row
                    ])

                # BUG FIX: dropped the always-empty 'kwargs' dict the original
                # threaded through the call; the call is identical without it.
                if not client.index(name=collection_or_core_name, data=dataset.csv):
                    raise PopupException(
                        _('Could not update index. Check error logs for more info.'))

                ROW_COUNT += len(dataset)
    # BUG FIX: 'except Exception, e' is Python-2-only syntax and a SyntaxError
    # under Python 3, which the rest of this file targets.
    except Exception as e:
        raise PopupException(_('Could not update index: %s') % e)
def update_data_from_hive(self, collection_or_core_name, columns, fetch_handle, indexing_options=None):
    """Push Hive result rows into a Solr collection/core in CSV batches.

    Pages through ``fetch_handle(batch_size, start_over)`` until either
    MAX_ROWS rows have been indexed or the handle reports no more data.
    ``indexing_options`` (optional dict) is forwarded as keyword arguments
    to SolrClient.index().

    Returns the number of rows indexed; raises PopupException on failure.
    """
    MAX_ROWS = 10000
    FETCH_BATCH = 1000

    indexed = 0
    more_rows = True
    options = {} if indexing_options is None else indexing_options
    solr = SolrClient(self.user)

    try:
        while more_rows and indexed < MAX_ROWS:
            # Restart the fetch only on the very first page.
            result = fetch_handle(FETCH_BATCH, indexed == 0)
            more_rows = result['has_more']
            rows = result['data']

            if rows:
                batch = tablib.Dataset()
                batch.append(columns)
                for row in rows:
                    # Falsy cells become 0 for numbers, '' for everything else.
                    batch.append([
                        cell if cell else (0 if isinstance(cell, numbers.Number) else '')
                        for cell in row
                    ])

                ok = solr.index(name=collection_or_core_name, data=batch.csv, **options)
                if not ok:
                    raise PopupException(
                        _('Could not index the data. Check error logs for more info.'))

                indexed += len(batch)
    except Exception as e:
        raise PopupException(_('Could not update index: %s') % e)

    return indexed
class Command(BaseCommand):
    """
    Install examples but do not overwrite them.
    """

    # Per-collection install configuration: log labels, column type overrides
    # that must not be inferred from the CSV, and the default search field
    # ('df') used when creating the index.  All collections use 'id' as the
    # unique key and load from
    # apps/search/examples/collections/solr_configs_<name>/index_data.csv.
    _COLLECTIONS = {
        'twitter_demo': {
            'log_label': 'twitter',
            'done_label': 'Twitter',
            # NOTE(review): '******' looks like a redaction artifact rather
            # than a real Solr field type — confirm; preserved as-is.
            'fieldtypes': {'source': 'string', 'username': '******'},
            'df': 'text',
        },
        'yelp_demo': {
            'log_label': 'yelp',
            'done_label': 'Yelp',
            'fieldtypes': {'name': 'string'},
            'df': 'text',
        },
        'log_analytics_demo': {
            'log_label': 'logs',
            'done_label': 'Logs',
            'fieldtypes': {'region_code': 'string', 'referer': 'string', 'user_agent': 'string'},
            'df': 'record',
        },
    }

    def handle(self, *args, **options):
        """Install the sample collection named by options['data'].

        The original code repeated the same fetch-path / parse-fields /
        create-and-load sequence once per collection; it is now driven by
        the _COLLECTIONS table.  Unknown names are ignored, matching the
        original's behavior when no 'if' branch applied.
        """
        self.user = install_sample_user()
        self.client = SolrClient(self.user)

        collection = options['data']
        config = self._COLLECTIONS.get(collection)
        if config is None:
            return

        LOG.info("Installing %s collection" % config['log_label'])
        path = os.path.abspath(os.path.join(
            os.path.dirname(__file__),
            '../../../../../../../apps/search/examples/collections/solr_configs_%s/index_data.csv' % collection))
        self._setup_collection_from_csv({
            'name': collection,
            'fields': self._parse_fields(path, fieldtypes=config['fieldtypes']),
            'uniqueKeyField': 'id',
            'df': config['df'],
        }, path)
        LOG.info("%s collection successfully installed" % config['done_label'])

    def _setup_collection_from_csv(self, collection, path):
        """Create the index if it does not already exist, then load the CSV
        file at ``path`` into it (never overwrites an existing index)."""
        if not self.client.exists(collection['name']):
            self.client.create_index(
                name=collection['name'],
                fields=collection['fields'],
                unique_key_field=collection['uniqueKeyField'],
                df=collection['df'])

        with open(path) as fh:
            self.client.index(collection['name'], fh.read())

    def _parse_fields(self, path, separator=',', quote_character='"', fieldtypes=None):
        """Infer a [{'name': ..., 'type': ...}] field list from a separated file.

        ``fieldtypes`` maps column name -> type and overrides the inferred
        type for that column (falsy overrides fall through to the inferred
        type, preserving the original and/or idiom).

        BUG FIX: the original signature used a mutable default argument
        (``fieldtypes={}``); ``None`` with an in-body default is equivalent
        for every existing caller and safe.
        """
        if fieldtypes is None:
            fieldtypes = {}

        with open(path) as fh:
            field_generator = utils.field_values_from_separated_file(fh, separator, quote_character)
            row = next(field_generator)
            field_names = list(row.keys())
            # Sample up to 51 rows (the first row re-chained in) to infer types.
            field_types = utils.get_field_types(
                (list(row.values()) for row in itertools.chain([row], field_generator)),
                iterations=51)
            return [{
                'name': name,
                'type': name in fieldtypes and fieldtypes[name] or inferred
            } for name, inferred in zip(field_names, field_types)]