def index_column(self, column, index_config):
    """Index one column document and register its semantic type.

    Calls ``column.prepare_data()`` first, then stores a document built from
    the column's attributes under the doc type ``column.semantic_type``.
    Afterwards a ``{'semantic_type': ...}`` marker document is stored under
    the ``'semantic'`` doc type, unless ``search_exists`` reports that one is
    already there.

    Args:
        column: Object exposing ``prepare_data()``, ``is_numeric()`` and the
            attributes read below (``name``, ``semantic_type``,
            ``content_length``, ``value_list``, ``histogram_list`` and either
            ``numeric_list``/``sample_list`` or ``value_text``).
        index_config: Passed to ``Utils.get_index_name`` to resolve the index.
    """
    column.prepare_data()

    column_doc = {
        'name': column.name,
        'semantic_type': column.semantic_type,
        'content_length': column.content_length,
        'data_size': len(column.value_list),
        'values': column.value_list,
        'histogram': column.histogram_list,
    }
    # Numeric columns carry their numeric values plus a sample; all other
    # columns carry their textual representation instead.
    if column.is_numeric():
        column_doc.update(numeric=column.numeric_list,
                          sample_numeric=column.sample_list)
    else:
        column_doc.update(textual=column.value_text)

    index_name = Utils.get_index_name(index_config)
    self.es.index(index=index_name,
                  doc_type=column.semantic_type,
                  body=column_doc)

    # NOTE(review): the same dict is used both as the search_exists body and
    # as the stored document; confirm the client accepts it as a query.
    semantic_doc = {'semantic_type': column.semantic_type}
    if self.es.search_exists(index=index_name,
                             doc_type='semantic',
                             body=semantic_doc):
        return
    self.es.index(index=index_name, doc_type='semantic', body=semantic_doc)
def search_types_data(self, index_config, semantic_types):
    """Collect the stored field values of the given semantic types.

    Runs a ``match_all`` search restricted to the doc types listed in
    *semantic_types* and merges the ``_source`` of the returned hits per
    doc type.

    Args:
        index_config: Passed to ``Utils.get_index_name`` to resolve the index.
        semantic_types: Iterable of doc type names; joined with commas into
            the ``doc_type`` argument of the search.

    Returns:
        A dict mapping each doc type present in the hits to a dict of
        ``{field name: list of values}``. Each list concatenates that field's
        values over all hits of the type; scalar values are wrapped in a
        one-element list before concatenation.
    """
    # NOTE(review): no ``size`` is passed, so only the hits returned by a
    # single default-sized search response are aggregated -- confirm intent.
    result = self.es.search(index=Utils.get_index_name(index_config),
                            doc_type=','.join(semantic_types),
                            body={"query": {"match_all": {}}})

    def explode(hit):
        # One ((doc type, field), values) record per field of the hit.
        # Built with plain Python: the SparkContext must not be used inside
        # a transformation, so no nested ``sc.parallelize`` here.
        doc_type = hit['_type']
        return [((doc_type, field),
                 value if isinstance(value, list) else [value])
                for field, value in hit['_source'].items()]

    def merge_fields(left, right):
        # Combine two per-type field dicts without mutating either input.
        merged = dict(left)
        merged.update(right)
        return merged

    return (sc.parallelize(result['hits']['hits'])
            .flatMap(explode)
            # Concatenate the value lists of the same (doc type, field).
            .reduceByKey(lambda first, second: first + second)
            .map(lambda record: (record[0][0], {record[0][1]: record[1]}))
            # Merge all fields of a doc type into one dict so collectAsMap
            # does not keep just one field per type.
            .reduceByKey(merge_fields)
            .collectAsMap())
def is_index_exist(self, index_config):
    """Return whether the index resolved from *index_config* exists.

    Uses the indices API of the client; the top-level ``exists`` call checks
    for a single document and needs a document id, so it cannot answer
    whether the index itself is present.
    NOTE(review): assumes ``self.es`` is an elasticsearch-py client exposing
    ``indices.exists`` -- confirm against where ``self.es`` is created.

    Args:
        index_config: Passed to ``Utils.get_index_name`` to resolve the index.

    Returns:
        The result of ``self.es.indices.exists`` for that index name.
    """
    return self.es.indices.exists(index=Utils.get_index_name(index_config))
def search_all_types(self, index_config):
    """List the semantic types returned by a search of the 'semantic' docs.

    Runs a ``match_all`` search on the ``'semantic'`` doc type and extracts
    the ``semantic_type`` field from each hit's ``_source``.

    Args:
        index_config: Passed to ``Utils.get_index_name`` to resolve the index.

    Returns:
        A list with the ``semantic_type`` value of every returned hit, in
        the order the hits appear in the response.
    """
    # NOTE(review): no ``size`` is passed, so only the hits of one
    # default-sized response are listed -- confirm intent.
    result = self.es.search(index=Utils.get_index_name(index_config),
                            doc_type='semantic',
                            body={"query": {"match_all": {}}})
    # Document fields live under '_source' in each hit, not on the hit
    # envelope itself.
    return [hit['_source']['semantic_type'] for hit in result['hits']['hits']]