コード例 #1
0
    def index_column(self, column, index_config):
        """Index a column document and make sure its semantic type is registered.

        ``column.prepare_data()`` is invoked first. A document built from the
        column's attributes is then indexed under ``doc_type=column.semantic_type``.
        Finally a ``{'semantic_type': ...}`` marker document is indexed under the
        ``'semantic'`` doc type, unless ``self.es.search_exists`` returns a truthy
        value for that same body.

        Args:
            column: Object exposing ``prepare_data()``, ``is_numeric()`` and the
                attributes read below.
            index_config: Passed to ``Utils.get_index_name`` to resolve the index.
        """
        column.prepare_data()

        # Fields stored for every column, regardless of its kind.
        column_doc = {
            'name': column.name,
            'semantic_type': column.semantic_type,
            'content_length': column.content_length,
            'data_size': len(column.value_list),
            'values': column.value_list,
            'histogram': column.histogram_list
        }

        # Numeric columns carry numeric payloads; all others carry text.
        if not column.is_numeric():
            column_doc['textual'] = column.value_text
        else:
            column_doc['numeric'] = column.numeric_list
            column_doc['sample_numeric'] = column.sample_list

        self.es.index(
            index=Utils.get_index_name(index_config),
            doc_type=column.semantic_type,
            body=column_doc,
        )

        # Marker document recording that this semantic type has been seen.
        type_marker = {'semantic_type': column.semantic_type}

        already_registered = self.es.search_exists(
            index=Utils.get_index_name(index_config),
            doc_type='semantic',
            body=type_marker,
        )
        if already_registered:
            return

        self.es.index(
            index=Utils.get_index_name(index_config),
            doc_type='semantic',
            body=type_marker,
        )
コード例 #2
0
    def search_types_data(self, index_config, semantic_types):
        """Fetch all documents of the given semantic types and merge them per type.

        Args:
            index_config: Passed to ``Utils.get_index_name`` to resolve the index.
            semantic_types: Iterable of doc-type names; joined with commas and
                sent as the ``doc_type`` of a ``match_all`` search.

        Returns:
            A dict mapping each hit's ``_type`` to a dict ``{field: [values]}``.
            For every field of ``_source``, list values from all hits of that
            type are concatenated and non-list values are wrapped in a list
            first.
        """
        result = self.es.search(index=Utils.get_index_name(index_config), doc_type=','.join(semantic_types),
                                body={"query": {"match_all": {}}})

        def explode(hit):
            # Emit one ((type, field), [values]) pair per source field. This is
            # plain Python on purpose: a SparkContext cannot be used inside a
            # transformation, so no nested ``sc.parallelize`` here.
            return [((hit['_type'], field), value if isinstance(value, list) else [value])
                    for field, value in hit['_source'].items()]

        def merge_fields(left, right):
            # Combine the per-field dicts of one type without mutating inputs.
            merged = dict(left)
            merged.update(right)
            return merged

        # NOTE(review): merging all fields per type is the presumed intent; the
        # previous final step kept only one field per type -- confirm with callers.
        return (sc.parallelize(result['hits']['hits'])
                .flatMap(explode)
                .reduceByKey(lambda x, y: x + y)
                .map(lambda pair: (pair[0][0], {pair[0][1]: pair[1]}))
                .reduceByKey(merge_fields)
                .collectAsMap())
コード例 #3
0
 def is_index_exist(self, index_config):
     """Return whether the index named by ``index_config`` exists.

     Args:
         index_config: Passed to ``Utils.get_index_name`` to resolve the index.

     Returns:
         The result of the indices-exists call on ``self.es``.
     """
     # Index existence is answered by the indices API; the top-level
     # ``exists`` call is the document-exists API and needs a document id.
     return self.es.indices.exists(index=Utils.get_index_name(index_config))
コード例 #4
0
    def search_all_types(self, index_config):
        """Return the semantic types recorded under the ``'semantic'`` doc type.

        Args:
            index_config: Passed to ``Utils.get_index_name`` to resolve the index.

        Returns:
            A list with the ``semantic_type`` field of every hit returned by a
            ``match_all`` search, in the order the hits were returned.
        """
        # The keyword is ``doc_type`` (as in the other search/index calls on
        # ``self.es``); ``doctype`` is not a recognised argument.
        result = self.es.search(index=Utils.get_index_name(index_config), doc_type='semantic',
                                body={"query": {"match_all": {}}})

        # Stored fields live under ``_source`` in each hit. The hits are already
        # a local list, so a comprehension is enough to extract them.
        return [hit['_source']['semantic_type'] for hit in result['hits']['hits']]