Example #1
import logging

from pyelasticsearch import ElasticSearch, exceptions


def update_process_datetime(doc_id, timestamp):
    ''' Updates the last_update_date for the document id passed into function.
        The document id will be the name of another index in the cluster.
    '''
    connection_string = 'http://localhost:9200'
    process_index = 'openfdametadata'
    _type = 'last_run'
    _map = {
        _type: {
            'properties': {
                'last_update_date': {
                    'type': 'date',
                    'format': 'dateOptionalTime'
                }
            }
        }
    }

    es = ElasticSearch(connection_string)
    try:
        es.create_index(process_index)
        logging.info('Creating index %s', process_index)
    except exceptions.IndexAlreadyExistsError:
        logging.info('%s already exists', process_index)

    try:
        es.put_mapping(process_index, doc_type=_type, mapping=_map)
        logging.info('Successfully created mapping')
    except exceptions.ElasticHttpError:
        logging.fatal('Could not create the mapping')

    new_doc = {'last_update_date': timestamp}
    es.index(process_index,
             doc_type=_type,
             id=doc_id,
             doc=new_doc,
             overwrite_existing=True)
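
For context, a minimal call to the function above (argument values are illustrative; the timestamp just needs to satisfy the dateOptionalTime format):

# Illustrative values: any doc id and a dateOptionalTime-compatible timestamp.
update_process_datetime('drugevent', '2015-06-01')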
Example #2
def cli(index_name, delete_index, mapping_file, settings_file, doc_type,
        import_file, delimiter, tab, host, docs_per_chunk, bytes_per_chunk,
        parallel, quiet):
    """
    Bulk import a delimited file into a target Elasticsearch instance. Common
    delimited files include things like CSV and TSV.

    \b
    Load a CSV file:
      csv2es --index-name potatoes --doc-type potato --import-file potatoes.csv
    \b
    For a TSV file, note the tab delimiter option:
      csv2es --index-name tomatoes --doc-type tomato \
             --import-file tomatoes.tsv --tab
    \b
    For a nifty pipe-delimited file (delimiters must be one character):
      csv2es --index-name pipes --doc-type pipe --import-file pipes.psv \
             --delimiter '|'

    """

    echo('Using host: ' + host, quiet)
    es = ElasticSearch(host)

    if delete_index:
        try:
            es.delete_index(index_name)
            echo('Deleted: ' + index_name, quiet)
        except ElasticHttpNotFoundError:
            echo('Index ' + index_name + ' not found, nothing to delete',
                 quiet)

    try:
        if settings_file:
            echo('Applying settings from: ' + settings_file, quiet)
            with open(settings_file) as f:
                settings = json.loads(f.read())
            es.create_index(index_name, settings)
        else:
            es.create_index(index_name)
        echo('Created new index: ' + index_name, quiet)
    except ElasticHttpError as e:
        if e.error['type'] == 'index_already_exists_exception':
            echo('Index ' + index_name + ' already exists', quiet)
        else:
            raise

    echo('Using document type: ' + doc_type, quiet)
    if mapping_file:
        echo('Applying mapping from: ' + mapping_file, quiet)
        with open(mapping_file) as f:
            mapping = json.loads(f.read())
        es.put_mapping(index_name, doc_type, mapping)

    target_delimiter = sanitize_delimiter(delimiter, tab)
    documents = documents_from_file(es, import_file, target_delimiter, quiet)
    perform_bulk_index(host, index_name, doc_type, documents, docs_per_chunk,
                       bytes_per_chunk, parallel)
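
The file passed via --mapping-file is plain JSON keyed by doc type. A minimal sketch for the potato example above (field names are illustrative, not part of the tool):

import json

# A hypothetical potato mapping, keyed by the --doc-type value.
potato_mapping = {
    "potato": {
        "properties": {
            "name": {"type": "string", "index": "not_analyzed"},
            "weight_g": {"type": "integer"}
        }
    }
}

with open('potatoes_mapping.json', 'w') as f:
    json.dump(potato_mapping, f, indent=2)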
Example #3
def cli(index_name, delete_index, mapping_file, settings_file, doc_type, host,
        docs_per_chunk, bytes_per_chunk, parallel, quiet, parser, config_file,
        user, passwd):

    with open(config_file, "rb") as f:
        con = json.loads(f.read())
    host = con['es_config']['host']
    echo('Using host: ' + host, quiet)
    es = ElasticSearch(host)

    if con['db']['type'] == "oracle":
        db = import_module('cx_Oracle')
        collection = db.connect(user, passwd, con['db']['con_str'])
    else:
        db = import_module('MySQLdb')
        collection = db.connect(con['db']['con_str'][0],
                                user,
                                passwd,
                                con['db']['con_str'][1],
                                charset=con['db']['con_str'][2])

    if delete_index:  # delete the existing index first
        try:
            es.delete_index(index_name)
            echo('Deleted: ' + index_name, quiet)
        except ElasticHttpNotFoundError:
            echo('Index ' + index_name + ' not found, nothing to delete',
                 quiet)

    try:
        if settings_file:
            with open(settings_file, 'r') as f:
                settings_json = json.loads(f.read())
            es.create_index(index_name, settings=settings_json)
        else:
            es.create_index(index_name)
        echo('Created new index: ' + index_name, quiet)
    except Exception:
        echo('Index ' + index_name + ' already exists', quiet)

    echo('Using document type: ' + doc_type, quiet)

    es.put_mapping(index_name, doc_type, con['mapping'])

    parser_fun = None
    if parser is not None:
        # load the parser function
        parser_fun = import_module(PARSER_PATH + '.' + parser)

    documents = documents_from_file(es, collection, quiet, parser_fun, con)

    perform_bulk_index(host, index_name, doc_type, documents, docs_per_chunk,
                       bytes_per_chunk, parallel)
    print "end:" + time.strftime(
        ISOTIMEFORMAT, time.localtime()) + '/n all records import complete.'
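
The shape of config_file implied by the lookups above, sketched as a Python dict (all values illustrative; for MySQL the code reads con_str as [host, database, charset], while Oracle expects a single DSN string):

# Hypothetical config; keys mirror the lookups in cli() above.
con = {
    "es_config": {"host": "http://localhost:9200/"},
    "db": {
        "type": "mysql",
        "con_str": ["127.0.0.1", "mydb", "utf8"]
    },
    "mapping": {"mytype": {"properties": {"name": {"type": "string"}}}}
}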
Example #4
def cli(index_name, delete_index, mapping_file, doc_type, import_file,
        delimiter, tab, host, docs_per_chunk, bytes_per_chunk, parallel, quiet,
        document_id_in_file):
    """
    Bulk import a delimited file into a target Elasticsearch instance. Common
    delimited files include things like CSV and TSV.

    \b
    Load a CSV file:
      csv2es --index-name potatoes --doc-type potato --import-file potatoes.csv
    \b
    For a TSV file, note the tab delimiter option:
      csv2es --index-name tomatoes --doc-type tomato --import-file tomatoes.tsv --tab
    \b
    For a nifty pipe-delimited file (delimiters must be one character):
      csv2es --index-name pipes --doc-type pipe --import-file pipes.psv --delimiter '|'

    """

    echo('Using host: ' + host, quiet)
    es = ElasticSearch(host)

    if delete_index:
        try:
            es.delete_index(index_name)
            echo('Deleted: ' + index_name, quiet)
        except ElasticHttpNotFoundError:
            echo('Index ' + index_name + ' not found, nothing to delete',
                 quiet)

    try:
        es.create_index(index_name)
        echo('Created new index: ' + index_name, quiet)
    except IndexAlreadyExistsError:
        echo('Index ' + index_name + ' already exists', quiet)
    except ElasticHttpError as exception:
        echo(
            'Error creating index %s. ElasticHttpError [%s]' %
            (index_name, exception.error), quiet)

    echo('Using document type: ' + doc_type, quiet)
    if mapping_file:
        echo('Applying mapping from: ' + mapping_file, quiet)
        with open(mapping_file) as f:
            mapping = json.loads(f.read())
        es.put_mapping(index_name, doc_type, mapping)

    target_delimiter = sanitize_delimiter(delimiter, tab)
    documents = documents_from_file(es, import_file, target_delimiter, quiet,
                                    document_id_in_file)
    perform_bulk_index(host, index_name, doc_type, documents, docs_per_chunk,
                       bytes_per_chunk, parallel)
Example #5
def init_schema():
    """Should be called at application startup. Makes sure the mappings and
    index exist."""
    es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)
    try:
        es.create_index(settings.ELASTIC_SEARCH_INDEX)
    except IndexAlreadyExistsError:
        pass

    #   Does not replace if exact mapping already exists
    es.put_mapping(settings.ELASTIC_SEARCH_INDEX, 'reg_tree', {
        'reg_tree': {'properties': NODE_SEARCH_SCHEMA}
    })
    es.put_mapping(settings.ELASTIC_SEARCH_INDEX, 'layer', {
        'layer': {'properties': LAYER_SCHEMA}
    })
    es.put_mapping(settings.ELASTIC_SEARCH_INDEX, 'notice', {
        'notice': {'properties': LAYER_SCHEMA}
    })
    es.put_mapping(settings.ELASTIC_SEARCH_INDEX, 'diff', {
        'diff': {'properties': DIFF_SCHEMA}
    })
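
NODE_SEARCH_SCHEMA, LAYER_SCHEMA, and DIFF_SCHEMA are property dictionaries defined elsewhere in the project; a minimal stand-in to make the expected shape concrete (fields illustrative, not the project's real schema):

# Illustrative stand-in; the real schemas live elsewhere in the project.
NODE_SEARCH_SCHEMA = {
    'text': {'type': 'string'},
    'label': {'type': 'string', 'index': 'not_analyzed'},
    'version': {'type': 'string', 'index': 'not_analyzed'}
}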
Example #6
class ElasticConnector(Connector):
    """
    Class for connectors that operate on an Elasticsearch database
    """
    MAX_SIZE = 1000

    def __init__(self, database, host='http://localhost:9200/'):
        self.client = ElasticSearch(host)
        self.index = database
        self.create_index()

    def query_to_id(self, query):
        """
        Returns an id representation of the specified query.
        This is a temporary stand-in for a proper Elasticsearch query search.
        """
        return "_".join(str(k) + "_" + str(v)
                        for k, v in query.items()).replace("/", "_")

    def create_index(self):
        """
        Creates the specified index, or swallows the exception if it has
        already been created
        """
        try:
            self.client.create_index(self.index)
        except Exception:
            pass

    def set_dynamic_mapping(self, collection):
        """
        Sets dynamic mapping for a specified document type
        """
        self.client.put_mapping(self.index, collection, {'dynamic': True})

    def save_block(self, block):
        """
        Saves operation info in the database
        """
        super().save_block(block)
        collection = block.get_collection()
        query = block.get_query()
        self.update_by_query(collection, query, block)

    def update_by_query(self, collection, query, document):
        """
        Sets dynamic mapping for the specified collection,
        then derives a new id for the document from its query
        and saves the object in the database as a new document.
        """
        try:
            self.set_dynamic_mapping(collection)
            document_body = document.to_dict()
            if "_id" in document_body:
                del document_body['_id']
            self.client.index(self.index,
                              collection,
                              document_body,
                              id=self.query_to_id(query))
        except Exception as e:
            print(e)

    def find_last_block(self):
        """
        Finds the last block index as the value field of a document
        in the status collection with the specified id
        """
        try:
            document = self.client.get(self.index, 'status',
                                       'height_all_tsx')['_source']
            return document['value']
        except ElasticHttpNotFoundError:
            return 0

    def update_last_block(self, last_block):
        """
      Updates last block index as a value field of a document 
      in a status collection with specified id
    """
        self.client.index(self.index,
                          'status', {'value': last_block},
                          id='height_all_tsx')

    def save_instance(self, instance):
        """
      Saves account or comment object
    """
        self.update_by_query(instance.get_collection(), instance.get_query(),
                             instance)

    def get_instances_to_update(self, collection):
        """
      Finds and returns all dictionaries with objects that should be updated
    """
        hits = self.client.search("need_update:true",
                                  index=self.index,
                                  doc_type=collection,
                                  size=self.MAX_SIZE)['hits']['hits']
        return [{**hit['_source'], **{"_id": hit["_id"]}} for hit in hits]

    def update_instances(self, collection, instances):
        """
      Resets need_update flag for all instances in a list by their ids in _id field
    """
        for instance in instances:
            self.client.update(self.index,
                               collection,
                               instance["_id"],
                               doc={'need_update': False})
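
query_to_id flattens a query dict into a deterministic document id, so saving the same query twice overwrites one document. A sketch, assuming an instance named connector:

# connector = ElasticConnector('mydb')  # hypothetical instance
# {'author': 'alice', 'permlink': 'my/post'} -> 'author_alice_permlink_my_post'
doc_id = connector.query_to_id({'author': 'alice', 'permlink': 'my/post'})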
Example #7
s = ElasticSearch("http://localhost:9200")

if "init" in sys.argv:
    try:
        s.delete_index("flights")
    except Exception as e:
        print(e)
    try:
        s.create_index("flights")
    except Exception as e:
        print(e)
    else:
        print("Created flights")

    s.put_mapping("flights", "flight", simplejson.loads('''{
        "flight": {
            "properties": {
                "datum": {"type": "string", "index": "not_analyzed",
                          "omit_norms": true, "index_options": "docs"},
                "type": {"type": "string", "index": "not_analyzed"},
                "duration": {"type": "double"},
                "end": {"properties": {
                    "alt": {"type": "integer"},
                    "dist": {"type": "float"},
                    "speed": {"type": "integer"},
                    "time": {"type": "date", "format": "dateOptionalTime"},
                    "town": {"type": "string", "analyzer": "keyword"},
                    "country": {"type": "string", "analyzer": "keyword"}}},
                "flight": {"type": "string", "store": true, "analyzer": "keyword"},
                "hex": {"type": "string", "store": true, "analyzer": "keyword"},
                "id": {"type": "string", "store": true},
                "radar": {"type": "string", "store": true, "analyzer": "keyword"},
                "reg": {"type": "string", "store": true, "analyzer": "keyword"},
                "route": {"properties": {
                    "coordinates": {"type": "double"},
                    "type": {"type": "string"}}},
                "start": {"properties": {
                    "alt": {"type": "integer"},
                    "dist": {"type": "float"},
                    "speed": {"type": "integer"},
                    "time": {"type": "date", "format": "dateOptionalTime"},
                    "town": {"type": "string", "analyzer": "keyword"},
                    "country": {"type": "string", "analyzer": "keyword"}}}
            }
        }
    }'''))



def md(a) :
    a["datum"]=a["starttime"][:10]
    return a
    
    
def makets(a) :
    for f in ("starttime","endtime") :
        a[f]=maket(a[f])
    return a
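
A document matching the mapping above would then be indexed roughly as follows (a sketch; field values are illustrative):

# Illustrative document; see the mapping above for field types.
s.index("flights", "flight", {
    "datum": "2013-06-01",
    "flight": "BA123",
    "duration": 5400.0,
    "start": {"town": "London", "country": "UK",
              "time": "2013-06-01T10:00:00"},
    "end": {"town": "Berlin", "country": "DE",
            "time": "2013-06-01T11:30:00"}
}, id="ba123-20130601")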


Example #8
class ElasticSearchProvider(SearchProvider):
    def __init__(self, config, db=None, authnz_wrapper=None, io_loop=None):
        self.debug = False
        self.config = config
        if db is not None:
            self.db = db
        self.syncES = ElasticSearch(
            '%(ELASTIC_SEARCH_PROTOCOL)s://%(ELASTIC_SEARCH_HOST)s:%(ELASTIC_SEARCH_PORT)s' % config
        )
        self.asyncES = ESConnection(
            host=config.get('ELASTIC_SEARCH_HOST'),
            port=config.get('ELASTIC_SEARCH_PORT'),
            io_loop=io_loop,
            protocol=config.get('ELASTIC_SEARCH_PROTOCOL'),
        )
        self.index = config.get('ELASTIC_SEARCH_INDEX')
        self.max_retries = config.get('ELASTIC_SEARCH_MAX_RETRIES')

    def activate_debug(self):
        self.debug = True

    def connect_to_db(self):
        from sqlalchemy import create_engine
        from sqlalchemy.orm import scoped_session, sessionmaker
        conn_string = self.config.get('SQLALCHEMY_CONNECTION_STRING')
        engine = create_engine(
            conn_string,
            convert_unicode=True,
            pool_size=1,
            max_overflow=0,
            echo=self.debug
        )
        maker = sessionmaker(bind=engine, autoflush=True)
        self.db = scoped_session(maker)

    def _assemble_inner_query(self, domain=None, page_filter=None):
        if page_filter and domain:
            page_prefix = '%s/%s' % (domain.url, page_filter)
        else:
            page_prefix = None

        if page_prefix:
            return {
                'prefix': {
                    'page_url': page_prefix
                }
            }
        else:
            return {
                'match_all': {}
            }

    def _assemble_outer_query(self, inner_query, filter_terms):
        return {
            'filtered': {
                'query': inner_query,
                'filter': {
                    'and': [{
                        'term': filter_term
                    } for filter_term in filter_terms]
                }
            }
        }

    def _assemble_filter_terms(self, key_id=None, domain=None):
        filter_terms = []

        if key_id:
            filter_terms.append({'keys.id': key_id})

        if domain:
            filter_terms.append({'domain_id': domain.id})

        return filter_terms

    def gen_doc(self, review):
        return {
            'keys': [{'id': violation.key_id} for violation in review.violations],
            'uuid': str(review.uuid),
            'completed_date': review.completed_date,
            'violation_count': review.violation_count,
            'page_id': review.page_id,
            'page_uuid': str(review.page.uuid),
            'page_url': review.page.url,
            'page_last_review_date': review.page.last_review_date,
            'domain_id': review.domain_id,
            'domain_name': review.domain.name,
        }

    def index_review(self, review):
        for attempt in range(self.max_retries):
            try:
                self.syncES.send_request(
                    method='POST',
                    path_components=[self.index, 'review', review.page_id],
                    body=dumps(self.gen_doc(review)),
                    encode_body=False
                )
                break
            except (Timeout, ConnectionError, ElasticHttpError, InvalidJsonResponseError) as e:
                values = review.id, review.page_id, str(e)
                logging.error('Could not index review (review_id:{0}, page_id:{1}): {2}'.format(*values))
                time.sleep(1)
                if attempt >= self.max_retries - 1:
                    raise

    def index_reviews(self, reviewed_pages, reviews_count, batch_size):
        action = {'index': {'_type': 'review'}}

        for i in range(0, reviews_count, batch_size):
            body_bits = []

            for page in reviewed_pages[i:i + batch_size]:
                doc = self.gen_doc(page.last_review)

                action['index']['_id'] = doc['page_id']

                body_bits.append(dumps(action))
                body_bits.append(dumps(doc))

            # Yes, that trailing newline IS necessary
            body = '\n'.join(body_bits) + '\n'

            self.syncES.send_request(
                method='POST',
                path_components=[self.index, '_bulk'],
                body=body,
                encode_body=False
            )

        logging.info('Done!')

    @return_future
    def get_by_violation_key_name(self, key_id, current_page=1, page_size=10, domain=None, page_filter=None, callback=None):
        def treat_response(response):
            if response.error is None:
                try:
                    hits = loads(response.body).get('hits', {'hits': []})

                    reviews_data = []
                    for hit in hits['hits']:
                        completedAt = datetime.utcfromtimestamp(hit['_source']['completed_date'])
                        reviews_data.append({
                            'uuid': hit['_source']['uuid'],
                            'page': {
                                'uuid': hit['_source']['page_uuid'],
                                'url': hit['_source']['page_url'],
                                'completedAt': completedAt
                            },
                            'domain': hit['_source']['domain_name']
                        })

                    reviews_count = hits.get('total', 0)

                    callback({
                        'reviews': reviews_data,
                        'reviewsCount': reviews_count
                    })
                except Exception as e:
                    reason = 'ElasticSearchProvider: invalid response (%s [%s])' % (type(e), e.message)
                    logging.error(reason)
                    callback({'error': {'status_code': 500, 'reason': reason}})
            else:
                reason = 'ElasticSearchProvider: erroneous response (%s [%s])' % (response.error.message, response.body)
                logging.error(reason)
                callback({'error': {'status_code': 500, 'reason': reason}})

        inner_query = self._assemble_inner_query(domain, page_filter)
        filter_terms = self._assemble_filter_terms(key_id, domain)

        query = self._assemble_outer_query(inner_query, filter_terms)

        sort_ = [{
            'completed_date': {
                'order': 'desc'
            }
        }, {
            'violation_count': {
                'order': 'desc'
            }
        }]

        source = {'query': query, 'sort': sort_}

        self.asyncES.search(
            callback=treat_response,
            index=self.index,
            type='review',
            source=source,
            page=current_page,
            size=page_size,
        )

    @return_future
    def get_domain_active_reviews(self, domain, current_page=1, page_size=10, page_filter=None, callback=None):
        def treat_response(response):
            if response.error is None:
                try:
                    hits = loads(response.body).get('hits', {'hits': []})

                    pages = []
                    for hit in hits['hits']:
                        completedAt = datetime.utcfromtimestamp(hit['_source']['completed_date'])
                        pages.append({
                            'url': hit['_source']['page_url'],
                            'uuid': hit['_source']['page_uuid'],
                            'violationCount': len(hit['_source']['keys']),
                            'completedAt': completedAt,
                            'reviewId': hit['_source']['uuid']
                        })

                    reviews_count = hits.get('total', 0)

                    callback({
                        'reviewsCount': reviews_count,
                        'pages': pages
                    })
                except Exception as e:
                    reason = 'ElasticSearchProvider: invalid response (%s [%s])' % (type(e), e.message)
                    logging.error(reason)
                    callback({'error': {'status_code': 500, 'reason': reason}})
            else:
                reason = 'ElasticSearchProvider: erroneous response (%s [%s])' % (response.error.message, response.body)
                logging.error(reason)
                callback({'error': {'status_code': 500, 'reason': reason}})

        inner_query = self._assemble_inner_query(domain=domain, page_filter=page_filter)
        filter_terms = self._assemble_filter_terms(domain=domain)

        query = self._assemble_outer_query(inner_query, filter_terms)

        sort_ = [{
            'violation_count': {
                'order': 'desc'
            }
        }, {
            'completed_date': {
                'order': 'desc'
            }
        }]

        source = {'query': query, 'sort': sort_}

        self.asyncES.search(
            callback=treat_response,
            index=self.index,
            type='review',
            source=source,
            page=current_page,
            size=page_size,
        )

    def refresh(self):
        try:
            self.syncES.refresh(index=self.index)
        except Exception as e:
            logging.error('Could not refresh index (%s)' % e)

    @classmethod
    def get_index_settings(cls):
        return {
            'index': {
                'number_of_shards': 4
            }
        }

    @classmethod
    def get_index_mapping(cls):
        return {
            'review': {
                'properties': {
                    'keys': {
                        'properties': {
                            'id': {
                                'type': 'integer'
                            }
                        }
                    },
                    'uuid': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                    'completed_date': {
                        'type': 'integer'
                    },
                    'violation_count': {
                        'type': 'float'
                    },
                    'page_id': {
                        'type': 'integer'
                    },
                    'page_uuid': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                    'page_url': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                    'page_last_review_date': {
                        'type': 'integer'
                    },
                    'domain_id': {
                        'type': 'integer'
                    },
                    'domain_name': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    }
                }
            }
        }

    def setup_index(self):
        try:
            settings = self.get_index_settings()
            self.syncES.create_index(index=self.index, settings=settings)
            mapping = self.get_index_mapping()
            self.syncES.put_mapping(index=self.index, doc_type='review', mapping=mapping)
            logging.info('Index %s created.' % self.index)
        except Exception as e:
            raise e

    def delete_index(self):
        try:
            self.syncES.delete_index(index=self.index)
            logging.info('Index %s deleted.' % self.index)
        except Exception as e:
            raise e

    def _get_max_page_id_from_index(self, must_have_domain_name=False):
        if must_have_domain_name:
            inner_query = {
                'constant_score': {
                    'filter': {
                        'not': {
                            'missing': {
                                'field': 'domain_name'
                            }
                        }
                    }
                }
            }
        else:
            inner_query = {
                'match_all': {}
            }

        query = {
            'query': inner_query,
            'sort': [{
                'page_id': {
                    'order': 'desc'
                }
            }]
        }

        results = self.syncES.search(query, index=self.index, doc_type='review')
        if results['hits']['total'] > 0:
            return results['hits']['hits'][0]['_id'] or 0
        return 0

    def index_all_reviews(self, keys=None, batch_size=200, replace=False):
        logging.info('Querying database...')
        self.connect_to_db()

        if keys is not None:
            keys = [k.id for k in self.db.query(Key.id).filter(Key.name.in_(keys)).all()]

        try:
            max_page_id = self._get_max_page_id_from_index(must_have_domain_name=True)
        except Exception:
            logging.error('Could not retrieve max page_id! Use with --replace (with caution)')
            return

        def apply_filters(query):
            if keys is not None:
                query = query \
                    .filter(Violation.review_id == Page.last_review_id) \
                    .filter(Violation.key_id.in_(keys))

            if not replace:
                query = query.filter(Page.id > max_page_id)

            return query.filter(Page.last_review_id != None)

        reviews_count = apply_filters(self.db.query(func.count(Page))).scalar()

        query = self.db.query(Page).options(joinedload('last_review'))
        reviewed_pages = apply_filters(query).order_by(Page.id.asc())

        logging.info('Indexing %d reviews...' % reviews_count)

        self.index_reviews(reviewed_pages, reviews_count, batch_size)

    @classmethod
    def new_instance(cls, config):
        return ElasticSearchProvider(config)

    @classmethod
    def main(cls):
        import sys

        parser = cls.argparser()
        args = parser.parse_args()

        config = {}
        host = None
        port = None
        index = None
        es = None

        levels = ['ERROR', 'WARNING', 'INFO', 'DEBUG']
        log_level = levels[args.verbose]
        logging.basicConfig(level=getattr(logging, log_level), format='%(levelname)s - %(message)s')

        if not (args.create or args.recreate or args.delete or args.keys or args.all_keys):
            parser.print_help()
            sys.exit(1)

        if args.conf:
            from derpconf.config import ConfigurationError
            from holmes.config import Config
            try:
                config = Config().load(args.conf[0])
                host = config['ELASTIC_SEARCH_HOST']
                port = config['ELASTIC_SEARCH_PORT']
                index = config['ELASTIC_SEARCH_INDEX']
            except ConfigurationError:
                logging.error('Could not load config! Use --conf conf_file')
                sys.exit(1)
            except KeyError:
                logging.error('Could not parse config! Check its contents')
                sys.exit(1)

        if args.server:
            try:
                host, port = args.server[0].split(':')
                config['ELASTIC_SEARCH_HOST'] = host
                config['ELASTIC_SEARCH_PORT'] = port
            except Exception:
                logging.error('Could not parse server host and port! Use --server host:port')
                sys.exit(1)

        if args.index:
            index = args.index[0]
            config['ELASTIC_SEARCH_INDEX'] = index

        from pyelasticsearch.exceptions import IndexAlreadyExistsError, ElasticHttpNotFoundError, InvalidJsonResponseError
        from requests.exceptions import ConnectionError
        try:
            if args.create or args.recreate or args.delete:
                if host is None or port is None:
                    logging.error('Need either a host and port or a config file to perform such operation!')
                    sys.exit(1)
                if index is None:
                    logging.error('Need either an index name or a config file to perform such operation!')
                    sys.exit(1)
                else:
                    es = cls.new_instance(config)
                    if args.recreate or args.delete:
                        try:
                            es.delete_index()
                        except ElasticHttpNotFoundError:
                            pass
                        except InvalidJsonResponseError as e:
                            logging.error('Invalid response! Reason: %s' % e)
                            sys.exit(1)
                    if args.create or args.recreate:
                        es.setup_index()

            if args.keys or args.all_keys:
                if config is None:
                    logging.error('Need a config file to perform such operation! Use --conf conf_file')
                else:
                    batch_size = args.batch_size[0] if args.batch_size else 200
                    es = cls.new_instance(config) if not es else es
                    try:
                        if args.verbose > 2:
                            es.activate_debug()
                        if args.keys:
                            es.index_all_reviews(args.keys, replace=args.replace, batch_size=batch_size)
                        elif args.all_keys:
                            es.index_all_reviews(replace=args.replace, batch_size=batch_size)
                    except InvalidJsonResponseError as e:
                        logging.error('Invalid response! Reason: %s' % e)
                        sys.exit(1)

        except IndexAlreadyExistsError:
            logging.error('Index %s already exists! Use --recreate (with caution) to recreate' % index)
        except ConnectionError:
            logging.error('Could not connect to server at %s:%s' % (host, port))
        except KeyError:
            logging.error('Could not get host nor port! Use either -conf or --server')
            sys.exit(1)
Example #9
                "type": "float",
                "null_value": 0.0
            },
            "Runtime": {
                "type": "integer"
            },
            "Type": {
                "type": "string",
                "index": "not_analyzed"
            },
            "Rated": {
                "type": "string",
                "index": "not_analyzed"
            },
            "imdbID": {
                "type": "string",
                "index": "not_analyzed"
            },
            "metadata": {
                "type": "string",
                "index": "not_analyzed"
            },
            "queue": {
                "type": "string",
                "index": "not_analyzed"
            }
        }
    }
}
print(es.put_mapping("prime", "video", mapping))
Example #10
if len(sys.argv) < 2:
    usage()
    sys.exit(1)
else:
    qname = sys.argv[1]

from pyelasticsearch import ElasticSearch
es = ElasticSearch(elasticsearch)

try:
    s = es.status('oplog')
except Exception:
    print("Creating index: oplog")
    try:
        s = es.create_index('oplog')
        print("sleeping for 5 to ensure index exists")
        time.sleep(5)
    except Exception:
        print("ERROR: index creation failed!")
        sys.exit()

print "Creating queue: %s" % qname
try:
	es.put_mapping('oplog',qname,{"properties" : { "from" : {"type" : "string", "null_value" : "na"}, "sent" : {"type" : "string", "null_value" : "na"}, "submitted" : {"type" : "date"}, "subject" : {"type" : "string", "null_value" : "na"}, "message" : {"type" : "string", "null_value" : "na"} }})
	print "Created queue with mapping:"
	print es.get_mapping('oplog',qname)
except:
	print "ERROR: queue creation failed!"
Example #11
			"state": {"type" : "string"},
			"country": {"type" : "string"},
			"name": {"type" : "string"},
			"description": {"type" : "string"},
			"logo": {"type" : "string"},
			"twitter": {"type" : "string"},
			"station_site": {"type" : "string"},
			"primary_genre": {"type" : "string", "index" : "not_analyzed"},
			"frequency": {"type" : "string"},
			"shoutcast_url": {"type" : "string"},
			"location": {"type" : "geo_point"},
			"geojson": {"type" : "string", "index" : "no"},
        }
    }
}
es.put_mapping(INDEX_NAME, "station", STATION_MAPPING)


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36'
}

failures = 0
pk = 0
while failures < 200:
    pk += 1
    r = requests.get("http://www.iheart.com/a/live/station/%d/" % pk, headers=headers)

    if r.status_code != 200:
        if r.status_code > 500:
            print("[%d] %d" % (r.status_code, pk))
Example #12
class Elastic(DataLayer):
    """ElasticSearch data layer."""

    serializers = {
        'integer': int,
        'datetime': parse_date
    }

    def init_app(self, app):
        app.config.setdefault('ELASTICSEARCH_URL', 'http://localhost:9200/')
        app.config.setdefault('ELASTICSEARCH_INDEX', 'eve')
        self.es = ElasticSearch(app.config['ELASTICSEARCH_URL'])
        self.index = app.config['ELASTICSEARCH_INDEX']

    def _get_field_mapping(self, schema):
        """Get mapping for given field schema."""
        if schema['type'] == 'datetime':
            return {'type': 'date'}
        elif schema['type'] == 'string' and schema.get('unique'):
            return {'type': 'string', 'index': 'not_analyzed'}
        elif schema['type'] == 'string':
            return {'type': 'string'}

    def put_mapping(self, app):
        """Put mapping for elasticsearch for current schema.

        It's not called automatically now, but rather left for user to call it whenever it makes sense.
        """
        for resource, resource_config in app.config['DOMAIN'].items():
            properties = {}
            properties[config.DATE_CREATED] = self._get_field_mapping({'type': 'datetime'})
            properties[config.LAST_UPDATED] = self._get_field_mapping({'type': 'datetime'})

            for field, schema in resource_config['schema'].items():
                field_mapping = self._get_field_mapping(schema)
                if field_mapping:
                    properties[field] = field_mapping

            datasource = (resource, )  # TODO: config.SOURCES not available yet (self._datasource_ex(resource))
            mapping = {}
            mapping[datasource[0]] = {'properties': properties}
            self.es.put_mapping(self.index, datasource[0], mapping)

    def find(self, resource, req, sub_resource_lookup):
        """
        TODO: implement sub_resource_lookup
        """
        query = {
            'query': {
                'query_string': {
                    'query': request.args.get('q', '*'),
                    'default_field': request.args.get('df', '_all'),
                    'default_operator': 'AND'
                }
            }
        }

        if not req.sort and self._default_sort(resource):
            req.sort = self._default_sort(resource)

        # skip sorting when there is a query to use score
        if req.sort and 'q' not in request.args:
            query['sort'] = []
            sort = ast.literal_eval(req.sort)
            for (key, sortdir) in sort:
                sort_dict = dict([(key, 'asc' if sortdir > 0 else 'desc')])
                query['sort'].append(sort_dict)

        if req.where:
            where = json.loads(req.where)
            if where:
                query['filter'] = {
                    'term': where
                }

        if req.max_results:
            query['size'] = req.max_results

        if req.page > 1:
            query['from'] = (req.page - 1) * req.max_results

        source_config = config.SOURCES[resource]
        if 'facets' in source_config:
            query['facets'] = source_config['facets']

        try:
            args = self._es_args(resource)
            args['es_fields'] = self._fields(resource)
            return self._parse_hits(self.es.search(query, **args), resource)
        except es_exceptions.ElasticHttpError:
            return ElasticCursor()

    def find_one(self, resource, **lookup):
        args = self._es_args(resource)
        args['es_fields'] = self._fields(resource)

        if config.ID_FIELD in lookup:
            try:
                hit = self.es.get(id=lookup[config.ID_FIELD], **args)
            except es_exceptions.ElasticHttpNotFoundError:
                return

            if not hit['exists']:
                return

            doc = hit.get('fields', hit.get('_source', {}))
            doc['_id'] = hit.get('_id')
            convert_dates(doc, self._dates(resource))
            return doc
        else:
            query = {
                'query': {
                    'constant_score': {
                        'filter': {
                            'term': lookup
                        }
                    }
                }
            }

            try:
                args['size'] = 1
                docs = self._parse_hits(self.es.search(query, **args), resource)
                return docs.first()
            except es_exceptions.ElasticHttpNotFoundError:
                return None

    def find_list_of_ids(self, resource, ids, client_projection=None):
        args = self._es_args(resource)
        args['es_fields'] = self._fields(resource)
        return self._parse_hits(self.es.multi_get(ids, **args), resource)

    def insert(self, resource, doc_or_docs, **kwargs):
        ids = []
        kwargs.update(self._es_args(resource))
        for doc in doc_or_docs:
            doc.update(self.es.index(doc=doc, id=doc.get('_id'), **kwargs))
            ids.append(doc['_id'])
        self.es.refresh(self.index)
        return ids

    def update(self, resource, id_, updates):
        args = self._es_args(resource, refresh=True)
        return self.es.update(id=id_, doc=updates, **args)

    def replace(self, resource, id_, document):
        args = self._es_args(resource, refresh=True)
        args['overwrite_existing'] = True
        return self.es.index(doc=document, id=id_, **args)

    def remove(self, resource, id_=None):
        args = self._es_args(resource, refresh=True)
        if id_:
            return self.es.delete(id=id_, **args)
        else:
            try:
                return self.es.delete_all(**args)
            except es_exceptions.ElasticHttpNotFoundError:
                return

    def _parse_hits(self, hits, resource):
        """Parse hits response into documents."""
        return ElasticCursor(hits, self._dates(resource))

    def _es_args(self, resource, refresh=None):
        """Get index and doctype args."""
        datasource = self._datasource(resource)
        args = {
            'index': self.index,
            'doc_type': datasource[0],
        }
        if refresh:
            args['refresh'] = refresh
        return args

    def _fields(self, resource):
        """Get projection fields for given resource."""
        datasource = self._datasource(resource)
        keys = datasource[2].keys()
        return ','.join(keys)

    def _default_sort(self, resource):
        datasource = self._datasource(resource)
        return datasource[3]

    def _dates(self, resource):
        dates = [config.LAST_UPDATED, config.DATE_CREATED]
        datasource = self._datasource(resource)
        schema = config.DOMAIN[datasource[0]]['schema']
        for field, field_schema in schema.items():
            if field_schema['type'] == 'datetime':
                dates.append(field)
        return dates
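
For a hypothetical resource items with a unique string field sku, put_mapping above would send a body shaped like this (assuming Eve's default _created/_updated date fields):

# Hypothetical 'items' resource with a unique 'sku' string field.
mapping = {
    'items': {
        'properties': {
            '_created': {'type': 'date'},
            '_updated': {'type': 'date'},
            'sku': {'type': 'string', 'index': 'not_analyzed'}
        }
    }
}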
Example #13
import os
import sys

from pyelasticsearch import ElasticSearch


os.environ.setdefault("DJANGO_SETTINGS_MODULE", "timesketch.settings")
from django.contrib.auth.models import User

from timesketch.apps.sketch.models import Timeline


user = User.objects.get(id=2)
es_server = sys.argv[1]
es_port = sys.argv[2]
name = sys.argv[3]
index = sys.argv[4]

es = ElasticSearch("http://%s:%s" % (es_server, es_port))

mapping = {
    "plaso_event": {
        "properties": {
            "timesketch_label": {
                "type": "nested"
            }
        }
    }
}

es.put_mapping(index, "plaso_event", mapping)
timeline = Timeline.objects.create(owner=user, title=name, description=name,
                                   datastore_index=index)
timeline.make_public()
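
Because timesketch_label is mapped as nested, later searches have to go through ES's nested query; roughly (a sketch — the inner field names are assumptions, not taken from this snippet):

# Hypothetical label fields; adjust to the real document layout.
label_query = {
    "query": {
        "nested": {
            "path": "timesketch_label",
            "query": {"term": {"timesketch_label.name": "__ts_star"}}
        }
    }
}
results = es.search(label_query, index=index, doc_type="plaso_event")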
Example #14
class IbbdElasticSearch:
    """
    ES operations.
    Docs: http://pyelasticsearch.readthedocs.io/en/latest/
    """
    es = None
    config = {}

    mapping_is_set = False  # whether the ES mapping has been set yet

    def __init__(self, config):
        """
        Initialize ES.
        Config keys:
        host: ES connection string
        indexName: name of the index
        deleteIndex: whether to delete an existing index; defaults to false
        settings: index settings. See the ES docs for the options.
        settingsFile: index settings as a JSON file. See the ES docs.
        mappings: mappings configuration. See the ES docs for the options.
        mappingsFile: mappings configuration as a JSON file. See the ES docs.
        idField: the id field, for data that carries its own id

        Note: at most one of settings and settingsFile may be given,
        and likewise at most one of mappings and mappingsFile.
        """
        self.es = ElasticSearch(config['host'])

        if 'docType' not in config:
            config['docType'] = config['indexName']
        self.config = config

        if 'deleteIndex' in config and config['deleteIndex']:
            try:
                self.es.delete_index(config['indexName'])

                print('delete index ' + config['indexName'] + ' success!')
            except ElasticHttpNotFoundError:  # if it did not exist, a notice is enough
                print('Index ' + config['indexName'] \
                                + ' not found, nothing to delete!')
            except Exception:
                raise Exception('Index ' + config['indexName'] + ' delete error!')

        try:
            if 'settings' in config:
                self.es.create_index(config['indexName'],
                                     settings=config['settings'])
            elif 'settingsFile' in config:
                with open(config['settingsFile'], 'r') as f:
                    config['settings'] = json.loads(f.read())
                self.es.create_index(config['indexName'],
                                     settings=config['settings'])
            else:
                self.es.create_index(config['indexName'])

            print('create index ' + config['indexName'] + ' success!')
        except Exception:
            raise Exception("create index " + config['indexName'] + ' error!')

    def _putMapping(self, row):
        """
        Sets the ES mapping.
        A default configuration could be generated from row.
        """
        try:
            if 'mappingsFile' in self.config:
                with open(self.config['mappingsFile'], 'r') as f:
                    self.config['mappings'] = json.loads(f.read())

            if 'mappings' in self.config:
                self.es.put_mapping(self.config['indexName'],
                                    self.config['docType'],
                                    self.config['mappings'])
            print("put mapping " + self.config['indexName'] + ' success!')
        except Exception:
            raise Exception("put mapping " + self.config['indexName'] + ' error!')

    def read(self):
        pass

    def batchRead(self):
        pass

    def write(self, row):
        """
        写入单行记录
        """
        return self.batchWrite([row])

    def batchWrite(self, rows):
        """
        写入多行记录
        """
        if not self.mapping_is_set:   # 设置mapping
            self.mapping_is_set = True
            self._putMapping(rows[0])

        docs = ()
        if 'idField' in self.config:
            docs = (self.es.index_op(doc, id=doc.pop(self.config['idField'])) \
                    for doc in rows)
        else:
            docs = (self.es.index_op(doc) for doc in rows)

        self.es.bulk(docs,
                     index=self.config['indexName'],
                     doc_type=self.config['docType'])

        return True
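
A hypothetical config for the class above, matching the keys the __init__ docstring describes (all values illustrative):

# All values illustrative.
writer = IbbdElasticSearch({
    'host': 'http://localhost:9200/',
    'indexName': 'ibbd_demo',
    'deleteIndex': True,
    'mappings': {'properties': {'title': {'type': 'string'}}},
    'idField': 'id'
})
writer.write({'id': 1, 'title': 'hello'})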
Example #15
class LBRest():

    def __init__(self, base=None, idx_exp_url=None, 
                 txt_mapping=None, cfg_idx=None):
        """Serve para cosumir o LBG e o ES."""

        self.base = base
        self.idx_exp_url = idx_exp_url
        if self.idx_exp_url is not None:
            self.idx_exp_host = idx_exp_url.split('/')[2]
            self.idx_exp_index = idx_exp_url.split('/')[3]
            self.idx_exp_type = idx_exp_url.split('/')[4]
            self.es = ElasticSearch("http://" + self.idx_exp_host)
        self.txt_mapping = txt_mapping
        self.cfg_idx = cfg_idx
        self.con_refsd = False

    def get_index(self, bases_list):
        """Obter a a configuração de indexação p/ as bases."""

        bases_indexes = []
        for base in bases_list:
            idx_exp_url = base['metadata']['idx_exp_url']
            nm_idx = idx_exp_url.split('/')[3]
            url_txt_idx = config.REST_URL + "/_txt_idx/" + nm_idx
            req = None
            try:
                req = requests.get(url_txt_idx)
                req.raise_for_status()
                idx_resp = req.json()
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 404:

                    # NOTE: For cases where there is no indexing
                    # configuration set on the "_txt_idx" route! By Questor
                    idx_resp = None
                else:
                    fail_content = None
                    if req is not None:
                        fail_content = req._content
                    else:
                        fail_content = str(e)
                    logger.error("Falha HTTP ao tentar obter configuração de "\
                    "índice textual! URL: %s. FALHA: %s" % 
                    (config.REST_URL, fail_content))
                    return []
            except Exception as e:
                fail_content = None
                if req is not None:
                    fail_content = req._content
                else:
                    fail_content = str(e)
                logger.error("Erro ao tentar obter a configuração de índice "\
                "textual! URL: %s. FALHA: %s" % 
                (config.REST_URL, fail_content))
                return []
            bases_indexes.append({"base": base, "index": idx_resp})
        return bases_indexes

    def get_bases(self):
        """Get all bases which has to index registries."""

        # NOTE: The construction below is a bit crude. The goal is to
        # check whether the "lb_base" table's data structure already
        # has the (struct field) attribute and the "txt_mapping" field.
        # If not, it falls back to fetching the base with all fields.
        # This is a workaround; the right fix is to keep the data
        # structure in the "lb_base" table up to date! By Questor
        bases = [ ]
        req = None
        try:
            params = """{
                "select": [
                    "name",
                    "idx_exp_time",
                    "idx_exp_url",
                    "txt_mapping"
                ],
                "literal": "idx_exp is true",
                "limit": null
            }"""
            req = requests.get(config.REST_URL, params={'$$':params})
            if config.FORCE_INDEX == True:
                data = [ ]
                results = dict({
                    u'metadata' : {
                        u'idx_exp_url'  : u''+config.ES_URL+'',
                        u'name'         : u''+config.NM_BASE+'',
                        u'idx_exp_time' : u''+config.TIME_IDX+''
                    }
                })
                data.append(results)
                bases = data
            else:
                req.raise_for_status()
                response = req.json()
                bases = response["results"]
        except Exception as e:
            bases = [ ]
            req = None
            try:
                params = """{
                    "literal": "idx_exp is true",
                    "limit": null
                }"""
                req = requests.get(config.REST_URL, params={'$$':params})
                req.raise_for_status()
                response = req.json()
                bases = response["results"]
            except Exception as e:
                # NOTE: The instance variable "self.con_refsd" keeps
                # the warning below from being printed repeatedly and
                # flooding the log! By Questor
                if self.con_refsd:
                    return bases

                # NOTE: I'm using '"Connection refused" in str(e)'
                # because "raise_for_status()" above does not raise a
                # "requests.exceptions.HTTPError", so we cannot use the
                # "status_code" to handle the error more specifically!
                # By Questor
                if "Connection refused" in str(e) and not self.con_refsd:
                    logger.error('Error fetching the list of bases to '\
                    'index. URL: %s. FAILURE: server unavailable! '\
                    'HTTPCode: 502 (Connection refused)!' % (config.REST_URL))
                    self.con_refsd = True
                    return bases
                self.con_refsd = False
                fail_content = None
                if req is not None:
                    fail_content = req._content
                else:
                    fail_content = str(e)
                logger.error(
                    ("Error fetching the list of bases to index. "
                        "URL: %s. FAILURE: %s") % (
                        config.REST_URL,
                        fail_content))
        return bases

    def get_passed_registries(self):
        """Return registries from the indexing-error log base.
        Only "id_doc_orig" and "dt_last_up_orig".
        """

        # NOTE: Create the log base if it does not exist! By Questor
        self.create_log_base()

        registries = [ ]
        params = {'$$':"""{
            "select":["id_doc_orig", "dt_last_up_orig"],
            "literal": "nm_base = '%s'",
            "limit": null
            }""" % self.base}
        url = config.REST_URL + '/log_lbindex/doc'

        req = None
        try:
            req = requests.get(url, params=params)
            req.raise_for_status()
            response = req.json()
            registries = response["results"]
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            logger.error("""
                1 Erro ao recuperar registros da base %s'. FALHA: %s
            """ % ('log_lbindex', fail_content))

        resp = {}
        for reg in registries:
            resp[reg['id_doc_orig']] = reg['dt_last_up_orig']
        return resp

    def get_registries(self):
        """Return registries to be indexed that, under certain criteria,
        have not failed in the past.
        """

        # NOTE: Fetch the registries to be indexed! By Questor
        registries = [ ]
        if config.FORCE_INDEX:
            params = {'$$':'{"select":["id_doc", "dt_last_up"], "limit": %d}'}
        else:
            params = {
                '$$':'{"select":["id_doc", "dt_last_up"], \
                "literal":"dt_idx is null", "limit": %d}'
            }

        params.update(result_count='false')
        params['$$'] = params['$$'] % config.DEFAULT_LIMIT

        url = config.REST_URL + '/' + self.base + '/doc'

        req = None
        try:
            req = requests.get(url, params=params)
            req.raise_for_status()
            response = req.json()
            registries = response["results"]
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            logger.error("""
                Erro ao recuperar registros da base %s'. FALHA: %s
            """ % (self.base, fail_content))

        '''
        TODO: This logic could be more efficient... Offhand I see
        two solutions:
        1 - Keep a cache (more complicated);
        2 - Fetch only the registries (id_doc) involved in the
        current indexing run.
        By Questor
        '''

        '''
        TODO: Should this "self.get_passed_registries()" method be
        called every time? Even when the operation is "create"?
        Needs a closer look... By Questor
        '''

        # NOTE: Fetch registries from the error log base! Registries
        # that have failed in the past! By Questor
        passed = self.get_passed_registries()

        _registries = []
        for reg in registries:
            if reg['_metadata']['id_doc'] in passed:
                '''
                NOTE: The goal here is to check whether the registry
                is in the error log (registries we tried to index in
                the past) and, if so, skip it, unless the "update"
                date recorded in the log base differs from the
                registry's current date, in which case LBIndex will
                try again! (See the standalone sketch after this
                class.) By Questor
                '''

                '''
                NOTE: The "passed" dict holds only the value of the
                "dt_last_up_orig" field from the "log_lbindex" base!
                By Questor
                '''
                dt_last_up = passed[reg['_metadata']['id_doc']]

                if dt_last_up != reg['_metadata']['dt_last_up']:
                    _registries.append(reg)
            else:
                _registries.append(reg)

        return _registries

    def get_full_reg(self, id, dt_last_up):
        """Obtêm o registro doc mais textos extraídos dos arquivos anexos se 
        houverem.
        """

        # TODO: Log this action every single time? By Questor
        logger.info('Retrieving registry %s from base %s ...' %
            (str(id), self.base))

        response = None
        url = config.REST_URL + '/' + self.base + '/doc/' + str(id) + '/full'

        req = None
        try:
            req = requests.get(url)
            req.raise_for_status()
            response = req.json()
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req.text
            else:
                fail_content = str(e)
            error_msg = """
                Error retrieving registry %s from base %s. FAILURE: %s
            """ % (str(id), self.base, fail_content)

            # TODO: Why two log calls? By Questor
            logger.error(error_msg)
            self.write_error(id, dt_last_up, error_msg)
        return response

    def es_create_mapping(self):
        """Cria um mapping p/ uma base se houver configuração p/ isso."""

        response_0 = None
        response_0_json = None
        index_url = None
        try:
            index_url = ("http://" + self.idx_exp_host + "/" + 
                self.idx_exp_index + "/" + self.idx_exp_type)
            response_0 = requests.get(index_url + "/_mapping")
            response_0.raise_for_status()
            response_0_json = response_0.json()
        except requests.exceptions.HTTPError as e:

            # NOTE: We normally land in this block when the index
            # does not exist! By Questor
            self.es_create_index()
        except requests.exceptions.RequestException as e:
            raise Exception("Problem in the mapping provider! " + str(e))
        except Exception as e:
            raise Exception("Mapping operation. Program error! " + str(e))

        if (response_0 is not None and response_0.status_code == 200 and
                not response_0_json and self.txt_mapping):
            response_1 = None
            try:
                response_1 = self.es.put_mapping(
                    index=self.idx_exp_index,
                    doc_type=self.idx_exp_type,
                    mapping=self.txt_mapping)

                if (response_1 is None or
                        response_1.get("acknowledged") is not True):
                    raise Exception("Unexpected server response when "
                        "creating the mapping! " + str(response_1))
            except Exception as e:
                raise Exception("Mapping creation error! " + str(e))

    def es_create_index(self):
        """Criar um índice p/ a base com as configurações setadas, não havendo 
        criar um índice genérico.
        """

        response_0 = None
        try:
            cfg_idx_holder = None

            # NOTE: If no indexing configuration is set, the system
            # creates a default one! By Questor
            if self.cfg_idx:
                cfg_idx_holder = self.cfg_idx
            else:
                cfg_idx_holder = {
                    "settings": {
                        "analysis": {
                            "analyzer": {
                                "default": {
                                    "tokenizer": "standard",
                                    "filter": [
                                        "lowercase",
                                        "asciifolding"
                                    ]
                                }
                            }
                        }
                    }
                }

            response_0 = self.es.create_index(index=self.idx_exp_index,
                                              settings=cfg_idx_holder)

            if (response_0 is None or
                    response_0.get("acknowledged") is not True):
                raise Exception("Unexpected server response when "
                    "creating the index! " + str(response_0))

            self.es_create_mapping()
        except IndexAlreadyExistsError as e:
            self.es_create_mapping()
        except Exception as e:
            raise Exception("Index creation error! " + str(e))

    def index_member(self, registry, id, dt_last_up):
        """Criar o índice textual para cada registro."""

        logger.info(
            'Indexing registry %s from base %s at url %s ...' % (
                str(id),
                self.base, self.idx_exp_url))

        try:

            # NOTE: Handles and creates the mappings and textual
            # indexes! By Questor
            self.es_create_mapping()
            self.es.index(self.idx_exp_index, self.idx_exp_type, 
                          registry, id=id)
            return True
        except Exception as e:
            error_msg = ("Erro ao indexar registro %s da base %s na url %s'. "
                "Mensagem de erro: %s") % (
                str(id), 
                self.base, self.idx_exp_url, str(e))
            logger.error(error_msg)

            # TODO: Pq dois logs? By Questor
            self.write_error(id, dt_last_up, error_msg)
            return False

    def update_dt_index(self, id, dt_last_up):
        """Atualizar a data de atualização da indexação textual do registro."""

        logger.info('Alterando data de indexacao do '\
            'registro %s da base %s ...' % (str(id), self.base))
        params = {'value': datetime.datetime.now().\
            strftime('%d/%m/%Y %H:%M:%S')}
        url = (config.REST_URL + '/' + self.base + '/doc/' + str(id) + 
            '/_metadata/dt_idx')

        req = None
        try:
            req = requests.put(url, params=params)
            req.raise_for_status()
            return True
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req.text
            else:
                fail_content = str(e)
            error_msg = 'Error updating the indexing date of registry %s in '\
                'base %s. FAILURE: %s' % (str(id), self.base, fail_content)
            logger.error(error_msg)
            self.write_error(id, dt_last_up, error_msg)
        return False

    def write_error(self, id_doc, dt_last_up, error_msg):
        """Write errors to LightBase."""

        error = {
            'nm_base': self.base,
            'id_doc_orig': id_doc,
            'error_msg': error_msg,
            'dt_error': datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S'),
            'dt_last_up_orig': dt_last_up
        }
        url = config.REST_URL + '/log_lbindex/doc'
        data = {'value': json.dumps(error)}
        req = None
        try:
            req = requests.post(url, data=data)
            req.raise_for_status()
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req.text
            else:
                fail_content = str(e)
            logger.error("""
                Error trying to write the error to LightBase. FAILURE: %s
            """ % fail_content)

    def get_errors(self):
        """Get all bases which has to index registries."""

        errors = []
        params = """{
            "literal": "base = '%s'",
            "limit": 250
        }""" % (self.base)
        url = config.REST_URL + '/_index_error'

        req = None
        try:
            req = requests.get(url, params={'$$':params})
            req.raise_for_status()
            response = req.json()
            errors = response["results"]
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req.text
            else:
                fail_content = str(e)
            logger.error("""
                Error trying to retrieve index errors. URL: %s. FAILURE: %s
            """ % (url, fail_content))
        return errors

    # TODO: This method exists to create/update a default
    # indexing (index). It is currently misused, since simply
    # deleting the index is enough for it to be recreated with
    # the indexing set on the "_txt_idx" route. I don't think
    # this method makes much sense here. I suggest removing it!
    # By Questor
    def create_index(self):
        """Cria índice com as opções de mapeamento padrão
        Atualiza o índice se já estiver criado.
        """

        settings = {
            "settings":{
                "analysis":{
                    "analyzer":{
                        "default":{
                            "tokenizer":"standard",
                            "filter":[
                                "lowercase",
                                "asciifolding"
                            ]
                        }
                    }
                }
            }
        }

        http, space, address, _index, _type = self.idx_exp_url.split('/')

        try:
            result = self.es.create_index(
                index=_index,
                settings=settings
            )
        except IndexAlreadyExistsError as e:
            logger.info("Index already exists. Trying to update its settings...")
            self.es.close_index(index=_index)
            result = self.es.update_settings(
                index=_index,
                settings=settings
            )
            logger.info("Settings updated successfully. Reopening the index...")
            self.es.open_index(index=_index)
            logger.info("Index reopened successfully!")

    def delete_index(self, registry):
        """Deletar registros no index."""

        id = registry['id_doc']
        try:
            http, space, address, _index, _type = self.idx_exp_url.split('/')
            self.es.delete(_index, _type, id=id)
            return True

        except ElasticHttpNotFoundError as e:
            return True

        except Exception as e:
            error_msg = 'Error deleting index entry %s from base %s at url %s. '\
                'Error message: %s' % \
                (str(id), self.base, self.idx_exp_url, str(e))
            logger.error(error_msg)
            return False

    def delete_error(self, registry):
        """Deletar registro de erros na rota '_index_error'."""

        url = (config.REST_URL + 
            """/_index_error?$$={"literal":"base = '%s' and id_doc = %d"}""")
        url = url % (registry['base'], registry['id_doc'])
        logger.info('Deleting index-error registry at url %s' % url)

        req = None
        try:
            req = requests.delete(url)
            req.raise_for_status()
            return True
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req.text
            else:
                fail_content = str(e)
            error_msg = """
                Error deleting the index error. FAILURE: %s
            """ % (fail_content)
            logger.error(error_msg)
        return False

    @staticmethod
    def create_log_base():
        """Cria base de log do LBIndex caso não exista."""

        log_base = model.LogBase()
        response = log_base.get_base()
        if not response:

            # NOTE: Create the base, since it does not exist!
            logger.info("Creating the index log base...")
            result = log_base.create_base()
            if result is None:
                # NOTE: "response" is falsy on this path, so its body
                # cannot be included in the message.
                logger.error("Error creating the log base!")
                return False
            else:
                logger.info("Log base created successfully!")
        return True
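
# A minimal standalone sketch (not part of the original class) of the retry
# filter implemented in get_registries() above: a registry that failed
# before is skipped unless its "dt_last_up" changed since the failure was
# logged. The "_metadata" shape mirrors the LightBase responses used in
# this example; the sample data below is hypothetical.
def filter_failed(registries, passed):
    kept = []
    for reg in registries:
        id_doc = reg['_metadata']['id_doc']
        if (id_doc in passed and
                passed[id_doc] == reg['_metadata']['dt_last_up']):
            continue  # Same timestamp as the logged failure: skip it.
        kept.append(reg)
    return kept

registries = [
    {'_metadata': {'id_doc': 1, 'dt_last_up': '01/01/2015 10:00:00'}},
    {'_metadata': {'id_doc': 2, 'dt_last_up': '02/01/2015 11:30:00'}},
]
passed = {2: '01/01/2015 09:00:00'}  # Doc 2 failed, but changed since then.
print(filter_failed(registries, passed))  # Both registries are kept.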
Example #18
0
                    "type": "string"
                }
            },
            "_timestamp": {
                "enabled": True
            },
            "_parent": {
                "type": "station"
            },
            "_ttl": {
                "enabled": True,
                "default": "7d"
            }
        }
    }
    es.put_mapping(settings.ES_INDEX, 'play', play_mapping)

    query = {
        "query": {
            "filtered": {
                "query": {
                    "match_all": {}
                },
                "filter": {
                    "bool": {
                        "must": {},
                        "should": {},
                        "must_not": {
                            "missing": {
                                "field": "shoutcast_url",
                                "existence": True,
Example #19
0
from pyelasticsearch import ElasticSearch

es = ElasticSearch("http://localhost:9200/")
mapping = {
    "video" : {
        "properties" : {
            "title" : { "type" : "string" },
            "year" : { "type" : "integer" },
            "image_url" : { "type" : "string", "index" : "not_analyzed" },
            "amazon_url"  : { "type" : "string", "index" : "not_analyzed" },
            "Genre" : { "type" : "string", "index" : "not_analyzed" },
            "Metascore" : { "type" : "float", "null_value" : 0.0 },
            "imdbRating" : { "type" : "float", "null_value" : 0.0 },
            "Runtime" : { "type" : "integer" },
            "Type" : { "type" : "string", "index" : "not_analyzed" },
            "Rated" : { "type" : "string", "index" : "not_analyzed" },
            "imdbID" : { "type" : "string", "index" : "not_analyzed" },
            "metadata" : { "type" : "string", "index" : "not_analyzed" },
            "queue" : { "type" : "string", "index" : "not_analyzed" }
        }
    }
}
print(es.put_mapping("prime", "video", mapping))
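
# A minimal follow-up sketch (not part of the original example): index one
# document against the mapping above and read it back. The movie data and
# id below are made up.
doc = {
    "title": "Example Movie",
    "year": 2014,
    "Genre": "Drama",
    "Metascore": 71.0,
    "imdbRating": 7.4,
    "Runtime": 120,
    "Type": "movie",
    "imdbID": "tt0000000"
}
es.index("prime", "video", doc, id="tt0000000", overwrite_existing=True)
es.refresh("prime")
print(es.get("prime", "video", "tt0000000"))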

Example #20
0
# NOTE: Imports added so the snippet is self-contained.
from pyelasticsearch import ElasticSearch, IndexAlreadyExistsError


class SearchIndex(object):
    def __init__(self, model):
        self.es = ElasticSearch()
        self.model = model

    def put_mapping(self, index, doc_type):
        mapping = {
            doc_type: {
                "properties": {
                    "location": {
                        "type": "geo_point"
                    },
                }
            }
        }
        self.es.put_mapping(index=index, doc_type=doc_type, mapping=mapping)

    def bulk_items(self, index, doc_type):
        for m in self.model.objects.all():
            self.es.bulk([
                self.es.index_op({
                    "pk": m.pk,
                    "name": m.name,
                    "rating": m.rating,
                    "address": m.address,
                    "description": m.description,
                    "location": {
                        "lon": m.longitude,
                        "lat": m.latitude
                    }
                }),
                ],
                doc_type=doc_type,
                index=index)

    def search(self, index, question, longitude, latitude, size=10):
        #self.es.delete_index(index)
        try:
            self.es.create_index(index)
            self.put_mapping(index, "place")
            self.bulk_items(index, "place")
        except IndexAlreadyExistsError:
            pass

        query = {
            "query": {
                "function_score": {
                    "query": {
                        "bool": {
                            "should": [
                                {"match": {"name": question}},
                                {"match": {"_all": {
                                    "query": question,
                                    "operator": "or",
                                    "fuzziness": "auto",
                                    "zero_terms_query": "all"
                                    }}}
                                ]
                            }
                        },
                    "functions": [
                        {"exp": {"rating": {"origin": 5, "scale": 1, "offset": 0.1}}},
                    ]
                    }
                }
            }

        if longitude is not None and latitude is not None:
            query['query']['function_score']['functions'] = [
                {'gauss': {
                    "location": {"origin": {"lat": latitude, "lon": longitude}, "offset": "550m", "scale": "1km"}
                    }},
                {'gauss': {
                    "location": {"origin": {"lat": latitude, "lon": longitude}, "offset": "500m", "scale": "2km"}
                    }},
            ]

        results = self.es.search(query, index=index, size=size)

        self.es.refresh()

        return results
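
# A hypothetical usage sketch for the SearchIndex class above. It only
# needs an object whose .objects.all() yields items with the fields read
# in bulk_items(), so a small stand-in replaces a real Django model here;
# all names and values below are made up.
class _Place(object):
    def __init__(self, pk, name, rating, address, description, lon, lat):
        self.pk, self.name, self.rating = pk, name, rating
        self.address, self.description = address, description
        self.longitude, self.latitude = lon, lat

class _PlaceManager(object):
    def all(self):
        return [_Place(1, "Cafe Central", 4.5, "Main St 1",
                       "Coffee and snacks", -46.63, -23.55)]

class PlaceModel(object):
    objects = _PlaceManager()

index = SearchIndex(PlaceModel)
results = index.search("places", "coffee", longitude=-46.63, latitude=-23.55)
for hit in results['hits']['hits']:
    print(hit['_source']['name'], hit['_score'])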
Example #21
0
                "itunesTrackId": {"type": "long"},
                "song_spot": {"type": "string"}
            },
            "_timestamp": {
                "enabled": True
            },
            "_parent": {
                "type": "station"
            },
            "_ttl": {
                "enabled": True,
                "default" : "7d"
            }
        }
    }
    es.put_mapping(settings.ES_INDEX, 'play', play_mapping)

    query = {
        "query":{
            "filtered":{
                "query":{
                    "match_all":{}
                },
                "filter":{
                    "bool":{
                        "must":{},
                        "should":{},
                        "must_not":{
                            "missing":{
                                "field": "shoutcast_url",
                                "existence": True,