def restore_tokens():
    connections.create_connection(hosts=ES_NODES)
    Index(INDEX_NAME).delete()

    class Token(DocType):
        username = String()
        token = String()
        expires = Date()
        read = Boolean()
        write = Boolean()
        revoked = Boolean()
        acl = String()
        groups = String()
        admin = Boolean()
        last_activity_at = Date()

        class Meta:
            index = INDEX_NAME

    Token.init()
    reindex_results = connections.get_connection().reindex(body={"source": {"index": BACKUP_INDEX_NAME}, "dest": {"index": INDEX_NAME}}, request_timeout=3600)
    if reindex_results.get('created') + reindex_results.get('updated') == reindex_results.get('total'):
        return ('Tokens restored to previous schema successfully!')
    else:
        return ('Tokens did not restore from backup properly')
 def handle(self, *args, **options):
     s3 = boto3.resource('s3', aws_access_key_id=settings.AWS_ACCESS_KEY, aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY)
     dt = options['start_date'].replace(hour=0, minute=0, second=0, microsecond=0)
     if not options['to_stdout']:
         connections.create_connection(hosts=[options['es_url']], **settings.ES_CONNECTION_PARAMS)
         CRECDoc.init()
     while dt < options['end_date']:
         logger.info('Processing files for {0}.'.format(dt))
         try:
             response = s3.Object(
                 options['source_bucket'],
                 crec_s3_key('mods.xml', dt)
             ).get()
         except botocore.exceptions.ClientError as e:
             logger.info('Could not find mods file for {0}.'.format(dt))
             response = None
         if response is not None and response.get('Body'):
             try:
                 crecs = extract_crecs_from_mods(response['Body'])
                 logger.info('Found {0} new records.'.format(len(crecs)))
                 if options['to_stdout']:
                     logger.info('Using stdout:')
                 for crec in crecs:
                     if not crec.is_skippable():
                         if options['to_stdout']:
                             logger.info(crec.to_es_doc())
                         else:
                             es_doc = crec.to_es_doc()
                             es_doc.save()
                         upload_speaker_word_counts(crec)
             except Exception as e:
                 logger.exception('Error processing data for {0}.'.format(dt.strftime('%Y-%m-%d')))
         dt += timedelta(days=1)
Example #3
0
    def __init__(self, nodes=ES_NODES, **kwargs):

        if type(nodes) == str:
            nodes = nodes.split(',')

        if not nodes:
            nodes = ES_NODES

        self.indicators_prefix = kwargs.get('indicators_prefix', 'indicators')
        self.tokens_prefix = kwargs.get('tokens_prefix', 'tokens')

        logger.info('setting es nodes {}'.format(nodes))

        connections.create_connection(hosts=nodes)

        self._alive = False

        while not self._alive:
            if not self._health_check():
                logger.warning('ES cluster not accessible')
                logger.info('retrying connection in 30s')
                sleep(30)
                continue

            self._alive = True

        logger.info('ES connection successful')
        self.tokens = TokenManager()
        self.indicators = IndicatorManager()
Example #4
0
 def update_all(self):
     """Check with Proxy source and update authors and articles.
     
     IMPORTANT: Will lock if unable to connect to MediaWiki server!
     """
     # authors
     connections.create_connection(hosts=config.DOCSTORE_HOSTS)
     index = Index(config.DOCSTORE_INDEX)
     mw_authors = Proxy.authors(cached_ok=False)
     es_authors = self.authors()
     authors_new,authors_delete = self.authors_to_update(mw_authors, es_authors)
     
     for n,title in enumerate(authors_delete):
         logging.debug('%s/%s %s' % (n, len(authors_delete), title))
         author = Author.get(url_title=title)
         author.delete()
         
     for n,title in enumerate(authors_new):
         logging.debug('%s/%s %s' % (n, len(authors_new), title))
         mwauthor = Proxy.page(title)
         author = Author.from_mw(mwauthor)
         author.save()
     
     # articles
     connections.create_connection(hosts=config.DOCSTORE_HOSTS)
     index = Index(config.DOCSTORE_INDEX)
     # authors need to be refreshed
     mw_authors = Proxy.authors(cached_ok=False)
     mw_articles = Proxy.articles_lastmod()
     es_authors = self.authors()
     es_articles = self.articles()
     articles_update,articles_delete = self.articles_to_update(
         mw_authors, mw_articles, es_authors, es_articles)
     self.delete_articles(titles=articles_delete)
     self.index_articles(titles=articles_update)
Example #5
0
    def setUp(self):
        from django.conf import settings
        SEARCH = getattr(settings, 'SEARCH')

        connections.create_connection('testing', **SEARCH['default']['connections'])
        self.index = Index(SEARCH['default']['index'], using='testing')
        # This is needed for test_documents, but has side effects in all running tests
        doctypes_list = (
            value for name, value
            in inspect.getmembers(documents)
            if not name.startswith('_') and
            inspect.isclass(value) and
            issubclass(value, DocType) and
            name != DocType.__name__
        )

        for doctype in doctypes_list:
            # Remove assigned index
            doctype._doc_type.index = None
            # Associate docs with test index
            self.index.doc_type(doctype)

        if self.index.exists():
            self.index.delete(ignore=404)
        self.index.create()

        self.search = Search(index=SEARCH['default']['index'])
Example #6
0
    def test_it_does_raise_if_bad_connection_is_queried(self, request):
        request.addfinalizer(remove_connection)

        connections.create_connection(alias='foobar', hosts=['localhost:2323'])

        with pytest.raises(ConnectionError):
            Index('whatever', using='foobar').exists()
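
The remove_connection finalizer registered above is not shown in this snippet; a minimal sketch of it, assuming it only needs to drop the extra alias so later tests start clean, could be:

from elasticsearch_dsl.connections import connections

def remove_connection():
    # Drop the 'foobar' alias registered by the test.
    connections.remove_connection('foobar')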
Example #7
0
    def __init__(self, nodes=ES_NODES, **kwargs):
        self.logger = logging.getLogger(__name__)

        if type(nodes) == str:
            nodes = nodes.split(',')

        self.logger.info('setting es nodes {}'.format(nodes))
        connections.create_connection(hosts=nodes)
Example #8
0
    def __init__(self):
        """Do nothing, by default."""
        from elasticsearch_dsl.connections import connections
        from tg import config

        connections.create_connection(hosts=[config.get('elasticsearch.host')],
                                      send_get_body_as="POST",
                                      timeout=20)
    def __init__(self, remote='localhost:9200', index='indicators', **kwargs):
        super(_ElasticSearch, self).__init__(remote)

        self.index = index
        if isinstance(self.remote, str):
            self.remote = self.remote.split(',')

        connections.create_connection(hosts=self.remote)
Example #10
0
    def __init__(self):

        if not self.index:
            raise ValueError("No index specified")

        if not self.doc_types:
            raise ValueError("No doc_types specified")

        connections.create_connection(hosts=settings.ELASTIC_SEARCH_HOSTS)
Example #11
0
    def __init__(self, **kwargs):
        self.hosts = kwargs.get('hosts', 'localhost')
        self.client = Elasticsearch(self.hosts)

        timeout = kwargs.get('timeout', 10)
        os.environ['LOGGO_REQUEST_TIMEOUT'] = str(timeout)

        max_retries = kwargs.get('max_retries', 2)
        connections.create_connection(hosts=self.hosts, connection_class=CustomUrllib3HttpConnection, max_retries=max_retries)

        index_name = kwargs.get('index', None)
        self.create_index_if_not_exists(index_name)
Example #12
0
def app():
    print "Running notifications app..."

    # Define a default Elasticsearch client
    connections.create_connection(hosts=[ES_SERVER])

    # App logic runs here
    #rabbitmq_conf()
    sample_data()

    # Display cluster health
    print(connections.get_connection().cluster.health())
Example #13
0
    def __init__(self):

        if not self.index:
            raise ValueError("No index specified")

        if not self.doc_types:
            raise ValueError("No doc_types specified")

        connections.create_connection(
            hosts=settings.ELASTIC_SEARCH_HOSTS,
            # sniff_on_start=True,
            retry_on_timeout=True,
        )
    def _configure(self):
        if "endpoints" in self.config:
            self.endpoints = self.config["endpoints"]
        else:
            self.host = self.config["host"]
            self.port = self.config["port"]
            self.endpoints = ["{}:{}".format(self.host, self.port)]
        connections.create_connection(hosts=self.endpoints,
                                      timeout=self.config.get("timeout",15),
                                      retry_on_timeout=True,
                                      maxsize=25)

        return
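
For reference, the config consumed by _configure() can take either shape; both of these (values are hypothetical) resolve to a list of endpoints:

config_with_endpoints = {"endpoints": ["es-1:9200", "es-2:9200"], "timeout": 30}
config_with_host_port = {"host": "localhost", "port": 9200}  # becomes ["localhost:9200"]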
Example #15
0
def setup_mapping(command, conf, vars):
    # Setup Elasticsearch's database mapping
    print("Setting up Elasticsearch's model")

    connections.create_connection(
        hosts=[config.get('elasticsearch.host')], send_get_body_as='POST')

    # Setup the jobs index
    _setup_index(model.JobElastic)
    # Setup the company index
    _setup_index(model.CompanyElastic)
    # Setup the geocomplete index
    _setup_index(model.Geocomplete)
Example #16
0
def create_connections():
    """Create connections to elasticsearch as defined in settings.py."""
    for alias, params in CONNECTIONS.items():
        processed_params = {}
        for param_name, param_value in params.items():

            if param_name == 'serializer' and isinstance(param_value, str):
                serializer_class = import_string(param_value)
                param_value = serializer_class()

            processed_params[param_name] = param_value

        connections.create_connection(alias, **processed_params)
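
A hypothetical CONNECTIONS value in settings.py that create_connections() could consume; the dotted 'serializer' path is resolved with import_string() and instantiated before being passed to create_connection():

CONNECTIONS = {
    'default': {
        'hosts': ['localhost:9200'],
        'timeout': 20,
    },
    'secondary': {
        'hosts': ['other-cluster:9200'],
        # resolved to a class by import_string() and instantiated
        'serializer': 'myproject.search.serializers.CustomSerializer',
    },
}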
def reindex_tokens():
    connections.create_connection(hosts=ES_NODES)
    TokenBackup.init()
    backup_results = connections.get_connection().reindex(body={"source": {"index": INDEX_NAME}, "dest": {"index": BACKUP_INDEX_NAME}}, request_timeout=3600)
    if backup_results.get('created') + backup_results.get('updated') == backup_results.get('total'):
        Index(INDEX_NAME).delete()
    else:
        return ('Tokens did not backup properly')
    time.sleep(1)
    Token.init()
    reindex_results = connections.get_connection().reindex(body={"source": {"index": BACKUP_INDEX_NAME}, "dest": {"index": INDEX_NAME}}, request_timeout=3600)
    if reindex_results.get('created') + reindex_results.get('updated') == reindex_results.get('total'):
        return ('Tokens reindexed successfully!')
    else:
        return ('Tokens did not reindex from backup properly')
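
A hypothetical way these two helpers might be combined: run the reindex, then fall back to restore_tokens() from the first snippet if it does not report success.

result = reindex_tokens()
print(result)
if result != 'Tokens reindexed successfully!':
    # Roll the tokens index back to the backed-up schema.
    print(restore_tokens())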
Example #18
0
def prepare_connection():
    """Set dafault connection for ElasticSearch.

    .. warning::

        In case of using multiprocessing/multithreading, connection will
        be probably initialized in the main process/thread and the same
        connection (socket) will be used in all processes/threads. This
        will cause some unexpected timeouts of pushes to Elasticsearch.
        So make sure that this function is called again in each
        process/thread to make sure that unique connection will be used.
    """
    elasticsearch_host = getattr(settings, 'ELASTICSEARCH_HOST', 'localhost')
    elasticsearch_port = getattr(settings, 'ELASTICSEARCH_PORT', 9200)
    connections.create_connection(hosts=['{}:{}'.format(elasticsearch_host, elasticsearch_port)])
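
Following the warning in the docstring, a minimal sketch of using prepare_connection() with multiprocessing, assuming the documents to index are already split into batches:

from multiprocessing import Process

def index_batch(docs):
    # Re-create the connection inside each child process so the socket is not
    # shared with the parent process.
    prepare_connection()
    for doc in docs:
        doc.save()

if __name__ == '__main__':
    batches = [[], []]  # hypothetical pre-split batches of DocType instances
    workers = [Process(target=index_batch, args=(batch,)) for batch in batches]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()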
Example #19
0
    def __init__(self, config_file='config.cfg'):
        super(Elastic, self).__init__()

        self.percentage = 10.0
        self.minimum_occurrences = 250

        # The ConfigParser documentation points out that there is no way to force
        # default config options outside the "DEFAULT" section.
        config = ConfigParser()
        config.read(config_file)
        if not config.has_section('elastic'):
            config.add_section('elastic')
        
        for option, value in {'use_ssl': 'True', 'host': '127.0.0.1', 'version': '2', 'index': 'nxapi', 'doc_type': 'events'}.items():
            if not config.has_option('elastic', option):
                config.set('elastic', option, value)

        self.version = config.getint('elastic', 'version')
        self.index = config.get('elastic', 'index')
        use_ssl = config.getboolean('elastic', 'use_ssl')
        host = config.get('elastic', 'host')
        self.doc_type = config.get('elastic', 'doc_type')
        self.client = connections.create_connection(hosts=[host], use_ssl=use_ssl, index=self.index, version=self.version, doc_type=self.doc_type, timeout=30, retry_on_timeout=True )

        Event.init(index=self.index)
        index = Index(self.index, using=self.client)
        index.doc_type(Event)
        self.initialize_search()
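
A hypothetical config.cfg for the Elastic class above; any option missing from the [elastic] section falls back to the defaults injected in __init__:

SAMPLE_CONFIG = """\
[elastic]
use_ssl = False
host = 10.0.0.5
version = 2
index = nxapi
doc_type = events
"""

with open('config.cfg', 'w') as fh:
    fh.write(SAMPLE_CONFIG)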
Example #20
0
    def __init__(self):
        es_url = app.config['ELASTICSEARCH_URL']
        es_port = app.config['ELASTICSEARCH_PORT']
        logstash_host = app.config['LOGSTASH_HOST']
        logstash_port = int(app.config['LOGSTASH_PORT'])

        self.measure = Measure(app.config['ELASTICSEARCH_CLIENT'], (logstash_host, logstash_port))
        self.es = connections.create_connection(hosts=[es_url + ':' + es_port])
Example #21
0
def get_conn(*, verify=True, verify_indices=None):
    """
    Lazily create the connection.

    Args:
        verify (bool): If true, check the presence of indices and mappings
        verify_indices (list of str): If set, check the presence of these indices. Else use the defaults.

    Returns:
        elasticsearch.client.Elasticsearch: An Elasticsearch client
    """
    # pylint: disable=global-statement
    global _CONN
    global _CONN_VERIFIED

    do_verify = False
    if _CONN is None:
        http_auth = settings.ELASTICSEARCH_HTTP_AUTH
        use_ssl = http_auth is not None
        _CONN = connections.create_connection(
            hosts=[settings.ELASTICSEARCH_URL],
            http_auth=http_auth,
            use_ssl=use_ssl,
            # make sure we verify SSL certificates (off by default)
            verify_certs=use_ssl
        )
        # Verify connection on first connect if verify=True.
        do_verify = verify

    if verify and not _CONN_VERIFIED:
        # If we have a connection but haven't verified before, do it now.
        do_verify = True

    if not do_verify:
        if not verify:
            # We only skip verification if we're reindexing or
            # deleting the index. Make sure we verify next time we connect.
            _CONN_VERIFIED = False
        return _CONN

    # Make sure everything exists.
    if verify_indices is None:
        verify_indices = set()
        for index_type in ALL_INDEX_TYPES:
            verify_indices = verify_indices.union(
                get_aliases(index_type)
            )
    for verify_index in verify_indices:
        if not _CONN.indices.exists(verify_index):
            raise ReindexException("Unable to find index {index_name}".format(
                index_name=verify_index
            ))

    _CONN_VERIFIED = True
    return _CONN
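
Typical calls into get_conn(), with a hypothetical index name:

# Skip verification while recreating indices (e.g. during a reindex) ...
conn = get_conn(verify=False)
# ... then check that a specific set of indices exists afterwards.
conn = get_conn(verify_indices=['course_catalog'])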
Example #22
0
 def applyConfig(self):
     try:
         print("Connecting to '%s', index '%s'" % (self.confESHost, self.confESIndex))
         res = connections.create_connection(hosts=[self.confESHost])
         idx = Index(self.confESIndex)
         idx.doc_type(DocHTTPRequestResponse)
         DocHTTPRequestResponse.init()
         try:
             idx.create()
         except:
             pass
     except Exception as e:
         JOptionPane.showMessageDialog(self.panel, "<html><p style='width: 300px'>Error while initializing ElasticSearch: %s</p></html>" % (str(e)), "Error", JOptionPane.ERROR_MESSAGE)
def get_es_client(enforce_new=False, retry=True):
    """Returns the singleton Elasticsearch-client object connected to ES server specified by environment variable ES_HOST with default timeout specified by environment variable ES_TIMEOUT
    """
    global CLIENT
    if enforce_new or not CLIENT:
        CLIENT = \
        connections.create_connection(hosts=[os.environ['ES_HOST']], 
                                      timeout=os.environ['ES_TIMEOUT'])
    if CLIENT.ping():
        return CLIENT
    elif retry:
        return get_es_client(enforce_new=True, retry=False)
    else:
        return CLIENT    
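
A minimal sketch of driving get_es_client() through its environment variables (the values here are placeholders):

import os

os.environ.setdefault('ES_HOST', 'localhost:9200')
os.environ.setdefault('ES_TIMEOUT', '30')

client = get_es_client()                  # cached after the first call
fresh = get_es_client(enforce_new=True)   # force a brand new connection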
Example #24
0
    def form_sinks(self):
        model_modules = [c for c in listdir(self.model_location) if
                         isfile(join(self.model_location, c)) if c != '__init__.py']

        model_modules = [m for m in model_modules if PY_FILE_REGEX.match(m)]
        for model_module in model_modules:
            # get the name of the class
            model_module = model_module.split('.')[0]
            try:
                module_path = self.load_path + '.' + model_module

                model_class = model_module
                module = importlib.import_module(module_path)
                self.models.append(ModelIdentifier(index=module.index, mapping=module.mapping,
                                                   model_class=module.model_class))
            except Exception as e:
                raise RuntimeError("Error importing module {0}: {1}".format(module_path, e))

        for model in self.models:
            model_name = model.index + "." + model.mapping
            connections.create_connection(model_name, hosts=[ELASTIC_HOST], port=ELASTIC_PORT)
            data_sink = ElasticDataSink(model_name, connections.get_connection(model_name), model)
            self.data_sinks[model_name] = data_sink
Example #25
0
    def registerExtenderCallbacks(self, callbacks):
        self.callbacks = callbacks
        self.helpers = callbacks.getHelpers()
        callbacks.setExtensionName("Storing HTTP Requests/Responses into ElasticSearch")
        self.callbacks.registerHttpListener(self)
        self.callbacks.registerContextMenuFactory(self)
        self.out = callbacks.getStdout()

        res = connections.create_connection(hosts=[ES_host])
        idx = Index(ES_index)
        idx.doc_type(DocHTTPRequestResponse)
        try:
            idx.create()
        except:
            print("Index already exists")
Example #26
0
File: utils.py Project: olabi/lore
def get_conn(verify=True):
    """
    Lazily create the connection.
    """
    # pylint: disable=global-statement
    # This is ugly. Any suggestions on a way that doesn't require "global"?
    global _CONN
    global _CONN_VERIFIED

    do_verify = False
    if _CONN is None:
        _CONN = connections.create_connection(hosts=[URL])
        # Verify connection on first connect if verify=True.
        do_verify = verify

    if verify and not _CONN_VERIFIED:
        # If we have a connection but haven't verified before, do it now.
        do_verify = True

    if not do_verify:
        if not verify:
            # We only skip verification if we're reindexing or
            # deleting the index. Make sure we verify next time we connect.
            _CONN_VERIFIED = False
        return _CONN

    # Make sure everything exists.
    if not _CONN.indices.exists(INDEX_NAME):
        raise ReindexException("Unable to find index {index_name}".format(
            index_name=INDEX_NAME
        ))

    mapping = _CONN.indices.get_mapping()
    if INDEX_NAME not in mapping:
        raise ReindexException(
            "No mappings found in index {index_name}".format(
                index_name=INDEX_NAME
            )
        )

    mappings = _CONN.indices.get_mapping()[INDEX_NAME]["mappings"]
    if DOC_TYPE not in mappings.keys():
        raise ReindexException("Mapping {doc_type} not found".format(
            doc_type=DOC_TYPE
        ))

    _CONN_VERIFIED = True
    return _CONN
Example #27
0
 def applyConfig(self):
     try:
         print("Connecting to '%s', index '%s'" % (self.confESHost, self.confESIndex))
         self.es = connections.create_connection(hosts=[self.confESHost])
         self.idx = Index(self.confESIndex)
         self.idx.doc_type(DocHTTPRequestResponse)
         if self.idx.exists():
             self.idx.open()
         else:
             self.idx.create()
         self.callbacks.saveExtensionSetting("elasticburp.host", self.confESHost)
         self.callbacks.saveExtensionSetting("elasticburp.index", self.confESIndex)
         self.callbacks.saveExtensionSetting("elasticburp.tools", str(self.confBurpTools))
         self.callbacks.saveExtensionSetting("elasticburp.onlyresp", str(int(self.confBurpOnlyResp)))
     except Exception as e:
         JOptionPane.showMessageDialog(self.panel, "<html><p style='width: 300px'>Error while initializing ElasticSearch: %s</p></html>" % (str(e)), "Error", JOptionPane.ERROR_MESSAGE)
Example #28
0
def test_event_send_and_store(options):
    inputServer=options['loginput']
    esServer = options['esserver']
    uuids=[]
    
    #create a sample test event
    anevent=json.loads(r'''{
        "category": "pytest",
        "processid": "0",
        "severity": "DEBUG",
        "utctimestamp": "",
        "hostname": "testhost.pytest.com",
        "summary": "a test event for pytest from test_basic_event_send",
        "eventsource": "pytest",
        "details": {
          "processid": "14148",
          "hostname": "testvictim.pytest.com",
          "program": "pytest",
          "sourceipaddress": "10.1.2.3"
        }
      }''')
    #send events
    for i in range(0,5):
        anevent['timestamp']=datetime.utcnow().isoformat()
        anevent['details']['uuid']=str(uuid.uuid1())
        uuids.append(anevent['details']['uuid'])
        if options["verbose"]:
            print('sending {0}'.format(anevent))
        r=requests.put(url="http://{0}/events".format(inputServer),data=json.dumps(anevent))
        if options["verbose"]:
            print(r)    
        assert r.status_code == 200
        
    #search for events to have landed in ES
    es=connections.create_connection(hosts=['{0}'.format(esServer)])
    
    for u in uuids:
        for hit in scan(es,
                        query={"query":{"match":{"details.uuid":"{0}".format(u)}}},
                        index="events",
                        doc_type="event"):
            assert u == hit['_source']['details']['uuid']
Example #29
0
def _init():
    es_url = settings.ELASTIC_SEARCH["url"]
    if not es_url:
        return

    connection = connections.create_connection(
        hosts=[es_url],
        verify_certs=es_url.startswith("https"),
        ca_certs=certifi.where(),
        timeout=20)

    # Create any indices that are missing
    indices = connection.indices.get("*")
    for item in doctypes():
        if item._doc_type.index not in indices:
            item.init()
        connection.indices.put_mapping(doc_type=item._doc_type.name,
                                       index=item._doc_type.index,
                                       body={"_routing": {"required": True}})
    return connection
Example #30
0
def setup_database(config):
    settings = dictset(config.registry.settings).mget('elasticsearch')
    params = {}
    params['chunk_size'] = settings.get('chunk_size', 500)
    params['hosts'] = []
    for hp in split_strip(settings['hosts']):
        h, p = split_strip(hp, ':')
        params['hosts'].append(dict(host=h, port=p))
    if settings.asbool('sniff'):
        params['sniff_on_start'] = True
        params['sniff_on_connection_fail'] = True

    # XXX if this connection has to deal with mongo and sqla objects,
    # then we'll need to use their es serializers instead. should
    # probably clean up that part of the engine interface - there's
    # lots of repeated code, plus other engines shouldn't have to know
    # about es - they should just know how to serialize their
    # documents to JSON.
    conn = connections.create_connection(
        serializer=JSONSerializer(), **params)
    setup_index(conn, settings)
Example #31
0
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


import hashlib
import os
import sys
sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../../../../")

from med_base.storage.es.models import EntityDisease, EntityBodypart, EntityDepartment, \
                            EntityDrug, EntityExam, EntityOperation, EntitySymptom

from elasticsearch_dsl.connections import connections
from conf.settings import ES_HOST

connections.create_connection(hosts=[ES_HOST])


from jk39.items import DiseaseItem, ExamItem, DrugItem, OperationItem, SymptomItem


class EntityItemPipeline(object):

    def process_item(self, item, spider):
        if isinstance(item, DiseaseItem):
            spider.logger.info('====== SAVE A Entity Disease: name={} ======'.format(item.get('name', '').strip()))
            meta_dict = {}
            for key in ['name', 'describe', 'is_infect', 'highrisk_group', 
                        'source_url', 'treatment_cycle', 'treatment_cost']:
                if item.get(key, '').strip():
                    meta_dict[key] = item.get(key, '').strip()
Example #32
0
 def setUp(self):
     from django.conf import settings
     self.settings = getattr(settings, 'SEARCH')
     connections.create_connection(
         'testing', **self.settings['default']['connections'])
Example #33
0
from elasticsearch_dsl import Document, Text, Keyword, Date, analyzer
from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=['127.0.0.1'])
ik_analyzer = analyzer('ik_max_word')


class CnblogsType(Document):
    title = Text(analyzer='ik_max_word')
    description = Text(analyzer='ik_max_word')
    url = Keyword()
    riqi = Date()

    class Index:
        name = 'cnblog_text'
        settings = {
            'number_of_shards': 5,
        }


es = connections.create_connection(CnblogsType)

if __name__ == '__main__':
    CnblogsType.init()
Example #34
0
    def build_index(self, document_parquet, section_parquet, tables_parquet,
                    figures_parquet, equations_parquet):
        if self.awsauth is not None:
            connections.create_connection(
                hosts=self.hosts,
                http_auth=self.awsauth,
                use_ssl=True,
                verify_certs=True,
                connection_class=RequestsHttpConnection)
        else:
            connections.create_connection(hosts=self.hosts)
        logger.info('Building elastic index')
        Object.init()
        FullDocument.init()
        # This is a parquet file to load from
        df = pd.read_parquet(document_parquet)
        for ind, row in df.iterrows():
            FullDocument(name=row['pdf_name'],
                         dataset_id=row['dataset_id'],
                         content=row['content']).save()
        logger.info('Done building document index')
        df = pd.read_parquet(section_parquet)
        for ind, row in df.iterrows():
            Object(
                cls='Section',
                dataset_id=row['dataset_id'],
                content=row['content'],
                header_content=row['section_header'],
                area=50,
                detect_score=row['detect_score'],
                postprocess_score=row['postprocess_score'],
                pdf_name=row['pdf_name'],
            ).save()
        logger.info('Done building section index')

        if tables_parquet != '':
            df = pd.read_parquet(tables_parquet)
            for ind, row in df.iterrows():
                Object(
                    cls='Table',
                    dataset_id=row['dataset_id'],
                    content=row['content'],
                    header_content=row['caption_content'],
                    area=50,
                    detect_score=row['detect_score'],
                    postprocess_score=row['postprocess_score'],
                    pdf_name=row['pdf_name'],
                    img_pth=row['img_pth'],
                ).save()
            logger.info('Done building tables index')
        if figures_parquet != '':
            df = pd.read_parquet(figures_parquet)
            for ind, row in df.iterrows():
                Object(
                    cls='Figure',
                    dataset_id=row['dataset_id'],
                    content=row['content'],
                    header_content=row['caption_content'],
                    area=50,
                    detect_score=row['detect_score'],
                    postprocess_score=row['postprocess_score'],
                    pdf_name=row['pdf_name'],
                    img_pth=row['img_pth'],
                ).save()
            logger.info('Done building figures index')

        if equations_parquet != '':
            df = pd.read_parquet(equations_parquet)
            for ind, row in df.iterrows():
                Object(
                    cls='Equation',
                    dataset_id=row['dataset_id'],
                    content=row['content'],
                    header_content='',
                    area=50,
                    detect_score=row['detect_score'],
                    postprocess_score=row['postprocess_score'],
                    pdf_name=row['pdf_name'],
                    img_pth=row['img_pth'],
                ).save()
            logger.info('Done building equations index')

        logger.info('Done building object index')
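
The awsauth object checked at the top of build_index() is not constructed here; one common way to build it for an Amazon Elasticsearch domain (with placeholder credentials) is via requests_aws4auth:

from requests_aws4auth import AWS4Auth

# Placeholder credentials/region; in practice these usually come from boto3 or
# environment variables.
awsauth = AWS4Auth('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'us-east-1', 'es')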
Example #35
0
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
import pymysql.cursors
from elasticsearch_dsl.connections import connections
# adbapi (provided by Twisted) makes the MySQL inserts asynchronous
from twisted.enterprise import adbapi

from ArticleSpider.models.es_types import ArticleType
from ArticleSpider import settings

es = connections.create_connection(hosts=[settings.ES_ADDRESS])


class ArticlespiderPipeline(object):
    def process_item(self, item, spider):

        return item


class MysqlTwistPipeline(object):
    """
    Insert scraped items into MySQL asynchronously.
    """
    def __init__(self, dbpool):
        self.dbpool = dbpool
Example #36
0
def index(
    buildroot: Path,
    url: str,
    update=False,
    no_progressbar=False,
):
    # We can confidently use a single host here because we're not searching
    # a cluster.
    connections.create_connection(hosts=[url], retry_on_timeout=True)
    connection = connections.get_connection()
    health = connection.cluster.health()
    status = health["status"]
    if status not in ("green", "yellow"):
        raise click.ClickException(f"status {status} not green or yellow")

    count_todo = 0
    for file in walk(buildroot):
        count_todo += 1

    click.echo(f"Found {count_todo:,} (potential) documents to index")

    if update:
        for name in connection.indices.get_alias():
            if name.startswith(f"{INDEX_ALIAS_NAME}_"):
                document_index = Index(name)
                break
        else:
            raise IndexAliasError(
                f"Unable to find an index called {INDEX_ALIAS_NAME}_*")

    else:
        # Confusingly, `._index` is actually not a private API.
        # It's the documented way you're supposed to reach it.
        document_index = Document._index
        click.echo("Deleting any possible existing index "
                   f"and creating a new one called {document_index._name!r}")
        document_index.delete(ignore=404)
        document_index.create()

    skipped = []

    def generator():
        root = Path(buildroot)
        for doc in walk(root):
            # The reason for specifying the exact index name is that we might
            # be doing an update and if you don't specify it, elasticsearch_dsl
            # will fall back to using whatever Document._meta.Index automatically
            # becomes in this moment.
            search_doc = to_search(doc, _index=document_index._name)
            if search_doc:
                yield search_doc.to_dict(True)
            else:
                # The reason something might be chosen to be skipped is because
                # there's logic that kicks in only when the `index.json` file
                # has been opened and parsed.
                # Keep a count of all of these. It's used to make sure the
                # progressbar, if used, ticks as many times as the estimate
                # count was.
                skipped.append(1)

    def get_progressbar():
        if no_progressbar:
            return VoidProgressBar()
        return click.progressbar(length=count_todo, label="Indexing", width=0)

    count_done = count_worked = count_errors = 0
    count_shards_worked = count_shards_failed = 0
    errors_counter = Counter()
    t0 = time.time()
    with get_progressbar() as bar:
        for success, info in parallel_bulk(
                connection,
                generator(),
                # If the bulk indexing failed, it will by default raise a BulkIndexError.
                # Setting this to 'False' will suppress that.
                raise_on_exception=False,
                # If the bulk operation failed for some other reason, like a ReadTimeoutError,
                # it will raise whatever the error is by default.
                # We prefer to swallow all errors under the assumption that the holes
                # will hopefully be fixed in the next attempt.
                raise_on_error=False,
        ):
            if success:
                count_shards_worked += info["index"]["_shards"]["successful"]
                count_shards_failed += info["index"]["_shards"]["failed"]
                count_worked += 1
            else:
                count_errors += 1
                errors_counter[info["index"]["error"]] += 1
            count_done += 1
            bar.update(1)

        for skip in skipped:
            bar.update(1)

    # Now when the index has been filled, we need to make sure we
    # correct any previous indexes.
    if update:
        # When you do an update, Elasticsearch will internally delete the
        # previous docs (based on the _id primary key we set).
        # Normally, Elasticsearch will do this when you restart the cluster
        # but that's not something we usually do.
        # See https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-forcemerge.html
        document_index.forcemerge()
    else:
        # Now we're going to bundle the change to set the alias to point
        # to the new index and delete all old indexes.
        # The reason for doing this together in one update is to make it atomic.
        alias_updates = [{
            "add": {
                "index": document_index._name,
                "alias": INDEX_ALIAS_NAME
            }
        }]
        for index_name in connection.indices.get_alias():
            if index_name.startswith(f"{INDEX_ALIAS_NAME}_"):
                if index_name != document_index._name:
                    alias_updates.append(
                        {"remove_index": {
                            "index": index_name
                        }})
                    click.echo(f"Delete old index {index_name!r}")

        connection.indices.update_aliases({"actions": alias_updates})
        click.echo(f"Reassign the {INDEX_ALIAS_NAME!r} alias from old index "
                   f"to {document_index._name}")

    t1 = time.time()
    took = t1 - t0
    rate = count_done / took
    click.echo(f"Took {format_time(took)} to index {count_done:,} documents. "
               f"Approximately {rate:.1f} docs/second")
    click.echo(f"Count shards - successful: {count_shards_worked:,} "
               f"failed: {count_shards_failed:,}")
    click.echo(f"Counts - worked: {count_worked:,} errors: {count_errors:,}")
    if errors_counter:
        click.echo("Most common errors....")
        for error, count in errors_counter.most_common():
            click.echo(f"{count:,}\t{error[:80]}")
"""
Elasticsearch base configuration.
"""

import os
from elasticsearch import Elasticsearch
from elasticsearch_dsl.connections import connections

ELASTICSEARCH_AVAILABLE = False
ES_CLIENT = None

es_host = os.environ.get('ELASTICSEARCH_URL')
if es_host:
    ES_CLIENT = Elasticsearch(['{}'.format(es_host)])
    connections.create_connection(hosts=['{}'.format(es_host)])
    ELASTICSEARCH_AVAILABLE = True
Example #38
0
import os

import elasticsearch
import tqdm
from nltk.tokenize import word_tokenize
from jinja2 import Environment, PackageLoader
from elasticsearch_dsl.connections import connections

from qanta.wikipedia.cached_wikipedia import Wikipedia
from qanta.datasets.abstract import QuestionText
from qanta.guesser.abstract import AbstractGuesser
from qanta.spark import create_spark_context
from qanta.config import conf
from qanta.util.io import get_tmp_dir, safe_path
from qanta import qlogging

log = qlogging.get(__name__)
ES_PARAMS = 'es_params.pickle'
connections.create_connection(hosts=['localhost'])


def create_es_config(output_path, host='localhost', port=9200, tmp_dir=None):
    if tmp_dir is None:
        tmp_dir = get_tmp_dir()
    data_dir = safe_path(os.path.join(tmp_dir, 'elasticsearch/data/'))
    log_dir = safe_path(os.path.join(tmp_dir, 'elasticsearch/log/'))
    env = Environment(loader=PackageLoader('qanta', 'templates'))
    template = env.get_template('elasticsearch.yml')
    config_content = template.render({
        'host': host,
        'port': port,
        'log_dir': log_dir,
        'data_dir': data_dir
    })
Example #39
0
import time

import elasticsearch.client
from django.conf import settings
from elasticsearch_dsl import Document, InnerDoc, Date, Integer, Long, Text, Object, GeoPoint, Keyword, Boolean
from elasticsearch_dsl.connections import connections

from blog.models import Article

ELASTICSEARCH_ENABLED = hasattr(settings, 'ELASTICSEARCH_DSL')

if ELASTICSEARCH_ENABLED:
    connections.create_connection(
        hosts=[settings.ELASTICSEARCH_DSL['default']['hosts']])
    from elasticsearch import Elasticsearch

    es = Elasticsearch(settings.ELASTICSEARCH_DSL['default']['hosts'])
    from elasticsearch.client import IngestClient

    c = IngestClient(es)
    try:
        c.get_pipeline('geoip')
    except elasticsearch.exceptions.NotFoundError:
        c.put_pipeline('geoip',
                       body='''{
              "description" : "Add geoip info",
              "processors" : [
                {
                  "geoip" : {
                    "field" : "ip"
                  }
Example #40
0
# -*- coding:utf-8 -*-
import random
import hashlib

from elasticsearch_dsl import DocType, Keyword
from elasticsearch_dsl.connections import connections

index_prefix = 'skyeye_cloud_sandbox_s3_index_%s'
connections.create_connection(hosts=['10.95.166.208', '10.95.166.209', '10.95.166.210'])

class Doc(DocType):
    task_id = Keyword()
    file_name = Keyword()

    class Meta:
        index = 'skyeye_cloud_sandbox_s3_index_*'

    def save(self, **kwargs):
        # Route the document to an index generated from the index prefix template
        index = index_prefix % (self.task_id[0])
        return super(Doc, self).save(index=index, **kwargs)

if __name__ == '__main__':
    for i in range(100):
        s3index = Doc(task_id=str(hashlib.md5(str(random.randint(0, 100)).encode()).hexdigest()),
                      file_name=str(hashlib.md5(str(random.randint(0, 100)).encode()).hexdigest()))
        s3index.save()
        print('finish done!')
Example #41
0
from datetime import datetime
from elasticsearch_dsl import DocType, Date, Nested, Boolean, \
    analyzer, InnerObjectWrapper, Completion, Keyword, Text

from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer
from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=["140.143.211.106"])


class CustomAnalyzer(_CustomAnalyzer):
    def get_analysis_definition(self):
        return {}


ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])


class BaiduType(DocType):
    suggest = Completion(analyzer=ik_analyzer)
    url = Keyword()
    title = Text(analyzer="ik_max_word")
    summary = Text(analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word")

    class Meta:
        index = "baidu"
        doc_type = "baike"


def gen_suggest(index, info_tuple):
Example #42
0
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy
import redis
from tools.es_models import NewsClsType
from elasticsearch_dsl.connections import connections

es = connections.create_connection(NewsClsType._doc_type.using)

redis_cli = redis.StrictRedis(host="127.0.0.1")


def gen_suggest(index, info_tuple):
    # Generate a list of search-suggestion entries from the given strings
    user_words = set()
    suggests = []
    for text, weight in info_tuple:
        if text:
            # Call the ES analyze API to tokenize the string
            words = es.indices.analyze(index=index,
                                       analyzer="ik_max_word",
                                       params={'filter': ["lowercase"]},
                                       body=text)
            # analyzed_words = set([r["token"] for r in words if len(r["token"]) > 1])
            analyzed_words = set(
                [r["token"] for r in words["tokens"] if len(r["token"]) > 1])
Example #43
0
# coding=utf8
# Load the Jobbole (伯乐在线) article data from MySQL into Elasticsearch
import MySQLdb
from ArticleSpider.moudles.es_types import ArticleType
# Connect to the database and get a cursor
connection = MySQLdb.connect('127.0.0.1', 'root', '111111', 'spider', charset="utf8", use_unicode=True)
cursor = connection.cursor()
from elasticsearch_dsl.connections import connections
from w3lib.html import remove_tags
es = connections.create_connection(ArticleType._doc_type.using)

def gen_suggests(index, info_tuple):
    # Generate the search-suggestion list from the given strings
    used_words = set()
    suggests = []
    for text, weight in info_tuple:
        if text:
            # Call the ES analyze API to tokenize the string
            words = es.indices.analyze(index=index, analyzer="ik_max_word", params={'filter': ["lowercase"]}, body=text)
            anylyzed_words = set([r["token"] for r in words["tokens"] if len(r["token"]) > 1])
            new_words = anylyzed_words - used_words
        else:
            new_words = set()

        if new_words:
            suggests.append({"input": list(new_words), "weight": weight})

    return suggests

cursor.execute("select title, url, create_date,  praise_nums, comment_nums, fav_nums, front_image_url, tags, content from article")
Example #44
0
# Elasticsearch

# aws, localhost, or govuk-paas
ELASTICSEARCH_PROVIDER = env.str('ELASTICSEARCH_PROVIDER', 'aws').lower()

if ELASTICSEARCH_PROVIDER == 'govuk-paas':
    services = {
        item['instance_name']: item
        for item in VCAP_SERVICES['elasticsearch']
    }
    ELASTICSEARCH_INSTANCE_NAME = env.str(
        'ELASTICSEARCH_INSTANCE_NAME',
        VCAP_SERVICES['elasticsearch'][0]['instance_name'])
    connections.create_connection(
        alias='default',
        hosts=[services[ELASTICSEARCH_INSTANCE_NAME]['credentials']['uri']],
        connection_class=RequestsHttpConnection,
    )
elif ELASTICSEARCH_PROVIDER == 'localhost':
    connections.create_connection(alias='default',
                                  hosts=['localhost:9200'],
                                  use_ssl=False,
                                  verify_certs=False,
                                  connection_class=RequestsHttpConnection)
else:
    raise NotImplementedError()

ELASTICSEARCH_COMPANY_INDEX_ALIAS = env.str(
    'ELASTICSEARCH_COMPANY_INDEX_ALIAS', 'ch-companies')

# health check
Example #45
0
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl import DocType, Text, Date, Search
from elasticsearch.helpers import bulk
from elasticsearch import Elasticsearch
from . import models

connections.create_connection(hosts=['192.168.99.100'], port=32771)

class ResourceIndex(DocType):
    Name = Text()
    created_date = Date()
    description = Text()
    link = Text()
    image = Text()
    class Meta:
        index = 'takethekids'

def bulk_indexing():
    ResourceIndex.init()
    es = Elasticsearch([
        {'host': '192.168.99.100', 'port': 32771}
    ])
    bulk(client=es, actions=(b.indexing() for b in models.Resource.objects.all().iterator()))
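
bulk_indexing() assumes each models.Resource instance provides an indexing() method that returns one bulk action; a sketch of what that method might look like in models.py (field names mirror ResourceIndex above, and are assumptions):

from django.db import models

class Resource(models.Model):
    Name = models.CharField(max_length=200)
    description = models.TextField()
    link = models.URLField()
    image = models.URLField()
    created_date = models.DateTimeField(auto_now_add=True)

    def indexing(self):
        # Build the ES document and return a dict usable as a bulk() action.
        doc = ResourceIndex(
            meta={'id': self.id},
            Name=self.Name,
            created_date=self.created_date,
            description=self.description,
            link=self.link,
            image=self.image,
        )
        return doc.to_dict(include_meta=True)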
Example #46
0
        """
        The Meta class must be redeclared with the doc_type and index name exactly as below
        """
        doc_type = 'CivilArticle'
        index = index_name

    class Index:
        """
        From version 0.10.2 onward, an Index class whose name attribute is the Elasticsearch index name must also be declared, as below
        """
        name = index_name


# Create connection
es = Elasticsearch()
connections.create_connection(hosts=['localhost'], timeout=20)
connections.add_connection('CivilArticle', es)
CivilArticle.init(index_name)


def preprocess_content(content):
    res = []
    lines = content.split('\n')
    for line in lines:
        line = remove_numbering(line)
        words = pre_process_text(line)
        res.append(' '.join(words))

    return ' '.join(res)

Example #47
0
    def _initialize(self):
        """
        Initialize a connection to an ES cluster and creates an index template if it does not exist.
        """
        if not self._initialized:
            http_auth = None
            if self._access_key and self._secret_key and self._aws_region:
                http_auth = AWS4Auth(self._access_key, self._secret_key,
                                     self._aws_region, "es")
            elif self._access_key and self._secret_key:
                http_auth = (self._access_key, self._secret_key)
            else:
                logger.warn("Connecting to Elasticsearch without HTTP auth")

            self._client = connections.create_connection(
                hosts=[{
                    "host": self._host,
                    "port": self._port
                }],
                http_auth=http_auth,
                use_ssl=self._use_ssl,
                verify_certs=True,
                connection_class=RequestsHttpConnection,
                timeout=ELASTICSEARCH_DEFAULT_CONNECTION_TIMEOUT,
            )

            # Create a second connection with a timeout of 60s vs 10s.
            # For some reason the PUT template API can take anywhere between
            # 10s and 30s on the test cluster.
            # This only needs to be done once to initialize the index template
            connections.create_connection(
                alias=ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS,
                hosts=[{
                    "host": self._host,
                    "port": self._port
                }],
                http_auth=http_auth,
                use_ssl=self._use_ssl,
                verify_certs=True,
                connection_class=RequestsHttpConnection,
                timeout=ELASTICSEARCH_TEMPLATE_CONNECTION_TIMEOUT,
            )

            try:
                force_template_update = ELASTICSEARCH_FORCE_INDEX_TEMPLATE_UPDATE.lower(
                ) == "true"
                self._client.indices.get_template(self._index_prefix)
                LogEntry.init(
                    self._index_prefix,
                    self._index_settings,
                    skip_template_init=not force_template_update,
                )
            except NotFoundError:
                LogEntry.init(self._index_prefix,
                              self._index_settings,
                              skip_template_init=False)
            finally:
                try:
                    connections.remove_connection(
                        ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS)
                except KeyError as ke:
                    logger.exception(
                        "Elasticsearch connection not found to remove %s: %s",
                        ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS,
                        ke,
                    )

            self._initialized = True
Example #48
0
from django_elasticsearch_dsl import Index, fields
from django_elasticsearch_dsl.documents import Document
from api.models.article_model import ArticleOfInterest
from elasticsearch_dsl.connections import connections
from django_elasticsearch_dsl.registries import registry
from elasticsearchapp.custom_analyzers import greek_analyzer

connections.create_connection()

article_index = Index('articles')

article_index.settings(number_of_shards=1, number_of_replicas=0)


@registry.register_document
@article_index.document
class ArticleDocument(Document):
    title = fields.TextField(analyzer=greek_analyzer)
    date = fields.DateField()
    body = fields.TextField(analyzer=greek_analyzer)
    tags = fields.TextField(analyzer=greek_analyzer)
    author = fields.TextField()
    link = fields.TextField()
    type = fields.TextField()
    scope = fields.TextField()

    class Django:
        model = ArticleOfInterest
Example #49
0
from elasticsearch_dsl import DocType, String, Boolean, Long, FacetedSearch, Date
from elasticsearch_dsl.connections import connections

from config import parse_config

connections.create_connection(hosts=[parse_config("db")['url']])


class Function(DocType):
    function_id = String(index='not_analyzed')
    tenant_id = String(index='not_analyzed')
    user_id = String(index='not_analyzed')
    image_id = String(index='not_analyzed')
    name = String(index='not_analyzed')
    description = String(index='not_analyzed')
    type = String(index='not_analyzed')
    event = String(index='not_analyzed')
    public = Boolean()
    endpoint = String(index='not_analyzed')
    runtime = String(index='not_analyzed')
    memory = Long()
    zip_location = String(index='not_analyzed')
    tags = String()
    status = String(index='not_analyzed')

    class Meta:
        index = 'pratai'


class FunctionSearch(FacetedSearch):
    doc_types = [
Example #50
0
# -*- coding: UTF-8 -*-
from elasticsearch_dsl.connections import connections
from ultis.commons import ComFunc

# server connect
ELASTIC_HOST = "192.168.9.199"
ELASTIC_PORT = 9200
connections.create_connection(
    hosts=['{0}:{1}'.format(ELASTIC_HOST, ELASTIC_PORT)])


class CommonEs(object):
    """
    Shared helper functions for Elasticsearch.
    """

    time_zone = "Asia/Shanghai"

    @classmethod
    def debug_query(cls, s):
        """
        debug search query dict
        :param s: search object
        :return:
        """
        print('=' * 30)
        print(ComFunc.to_json_string(s.to_dict()))
        print('=' * 30)
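
Hypothetical usage of CommonEs.debug_query(): dump the query body of a Search object before executing it.

from elasticsearch_dsl import Search

s = Search(index='some_index').query('match', title='es')
CommonEs.debug_query(s)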
Example #51
0
from django.db import models

# Create your models here.
from datetime import datetime
from elasticsearch_dsl import DocType, Date, Nested, Boolean, \
    analyzer, InnerObjectWrapper, Completion, Keyword, Text

from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer

from elasticsearch_dsl.connections import connections
connections.create_connection(hosts=["localhost"])  #连接服务器


# Avoid analyzer errors by returning an empty analysis definition
class CustonAnalyzer(_CustomAnalyzer):
    def get_analysis_definition(self):
        return {}


ik_analyzer = CustonAnalyzer("ik_max_word", filter=["lowercase"])


class LagouType(DocType):
    # Lagou (拉勾) job posting document type
    suggest = Completion(analyzer=ik_analyzer)  # add a completion suggester
    url = Keyword()
    url_object_id = Keyword()
    title = Text(analyzer="ik_max_word")
    salary = Text(analyzer="ik_max_word")
    job_city = Text(analyzer="ik_max_word")
    work_years = Text(analyzer="ik_max_word")
    def __init__(self, url_path, query_dict):
        self.url_path = url_path
        self.query_dict = query_dict

        # create ES connection
        connections.create_connection()
Example #53
0
def enable_es():
    ES_URL = 'http://search:9200'
    connections.create_connection('default', hosts=[ES_URL])
Example #54
0
import os
from urllib.parse import urlparse

import tldextract
from redis import Redis
from scrapy.crawler import CrawlerProcess
from elasticsearch_dsl import Index, Search, Mapping
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl import Document, DateRange, Keyword, Range, Text

# Initiate the elasticsearch connection
# hosts = "localhost"
# http_auth = ("elastic", "changeme")
# port = "9200"

hosts = [os.getenv("HOST")]
http_auth = (os.getenv("USERNAME"), os.getenv("PASSWORD"))
port = os.getenv("PORT")
client = connections.create_connection(hosts=hosts,
                                       http_auth=http_auth,
                                       port=port)

# initiate Redis connection
# redis_conn = Redis("127.0.0.1", os.getenv("REDIS_PORT", 6379))

redis_conn = Redis(os.getenv("REDIS_HOST", "redis"),
                   os.getenv("REDIS_PORT", 6379))


def domains(url):
    """
    Get the domain of the url.
    """
    return tldextract.extract(url).registered_domain
Example #55
0
import os
import sys
import time
import pickle
from elasticsearch_dsl.connections import connections

from okcom_tokenizer.tokenizers import CCEmojiJieba, UniGram
from marginalbear_elastic.query import post_search, post_multifield_query
from marginalbear_elastic.utils import concat_tokens
from marginalbear_elastic.ranking import avg_pmi

client = connections.create_connection(hosts=['elastic:changeme@localhost'],
                                       timeout=20)
ccjieba = CCEmojiJieba()
unigram = UniGram()

package_dir = os.path.dirname(os.path.realpath(__name__))


def query_ccjieba(input_sentence, pairs_cnt, total_pairs_cnt):
    query = ccjieba.cut(input_sentence.strip())
    results = post_search(client,
                          index='post',
                          tokenizer='ccjieba',
                          query=concat_tokens(query, pos=False),
                          top=100)
    tokenized_query = [str(i['word']) for i in query]
    sorted_ans = avg_pmi(tokenized_query,
                         results,
                         pairs_cnt,
                         total_pairs_cnt,
Example #56
0
# -*- coding: utf-8 -*-
from datetime import datetime
from elasticsearch_dsl import DocType, Date, Nested, Boolean, \
    analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer
from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer
from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=["localhost"])

class CustomAnalyzer(_CustomAnalyzer):
    def get_analysis_definition(self):
        return {}

ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])

class ArticleType(DocType):
    # Jobbole (伯乐在线) article document type
    suggest = Completion(analyzer=ik_analyzer)
    title = Text(analyzer="ik_max_word")
    create_date = Date()
    url = Keyword()
    url_obj_id = Keyword()
    img_url = Keyword()
    img_path = Keyword()
    praise_count = Integer()
    comment_count = Integer()
    collect_count = Integer()
    tags = Text(analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word")

    class Meta:
Example #57
0
# _*_ coding:utf-8 _*_

from es_config import host
from elasticsearch_dsl import DocType, Text, Date, Integer, Q, Keyword, analyzer, token_filter, tokenizer
from elasticsearch_dsl.connections import connections
# Define a default Elasticsearch client
connections.create_connection(hosts=[host])

pinyin_analyzer = analyzer('pinyin_analyzer',
                           tokenizer=tokenizer('my_pinyin',
                                               type='pinyin',
                                               lowercase=True))
local_dynamic_synonym_filter = token_filter(name_or_instance="local_synonym",
                                            type="dynamic_synonym",
                                            synonyms_path="synonyms.txt",
                                            interval=60)
local_synonym = token_filter(name_or_instance="local_synonym",
                             type="dynamic_synonym",
                             synonyms_path="synonyms.txt",
                             interval=30)

ik_synonym_analyzer = analyzer("remote_ik_synonym_analyzer ",
                               tokenizer='ik_max_word',
                               filter=[local_synonym])
ik_smart_synonym = analyzer("ik_smart_synonym",
                            tokenizer='ik_smart',
                            filter=[local_dynamic_synonym_filter])
ik_max_word_synonym = analyzer("ik_max_word_synonym",
                               tokenizer='ik_max_word',
                               filter=[local_dynamic_synonym_filter])
Example #58
0
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from .models.es_types import CompanyType

from elasticsearch_dsl.connections import connections
es = connections.create_connection(CompanyType._doc_type.using)


class HuijuCompanyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class CompanyItem(scrapy.Item):

    company_name = scrapy.Field()  # company name
    legal_person = scrapy.Field()  # legal representative
    registered_capital = scrapy.Field()  # registered capital
    telephone_number = scrapy.Field()  # telephone number
    email = scrapy.Field()  # email
    company_url = scrapy.Field()  # company website
    address = scrapy.Field()  # address
    registration_time = scrapy.Field()  # registration date
    company_state = scrapy.Field()  # company status
from elasticsearch_dsl import Document, Date, Integer
from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=['elastic:[email protected]'])


class ReservationDocument(Document):
    reserved_by = Integer()
    room = Integer()
    settle_date = Date()
    leave_date = Date()

    class Index:
        name = 'reservation'
        settings = {
            "number_of_shards": 1,
        }

    def save(self, **kwargs):
        return super(ReservationDocument, self).save(**kwargs)
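
A short usage sketch for the document above, assuming the default connection from create_connection() is reachable (values are placeholders):

from datetime import date

ReservationDocument.init()  # create the 'reservation' index and mapping
ReservationDocument(
    reserved_by=1,
    room=101,
    settle_date=date(2021, 1, 1),
    leave_date=date(2021, 1, 3),
).save()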
Example #60
0
"""
This file generates an Elasticsearch index. Run this before running web_app.py.
Make sure that Elasticsearch is running in the background beforehand.
"""

import csv
import time

from elasticsearch import Elasticsearch
from elasticsearch import helpers
from elasticsearch_dsl import Index, Document, Text
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl.analysis import analyzer

# Connect to local host server
connections.create_connection(hosts=['127.0.0.1'])

# Create elasticsearch object
es = Elasticsearch()

# Analyzers for both the predicate and arguments.
predicate_analyzer = analyzer(
    'covid_predicate_analyzer',
    tokenizer='whitespace',
    # Use a stemmer to capture predicates with similar stems.
    filter=['lowercase', 'stemmer'])

argument_analyzer = analyzer('covid_argument_analyzer',
                             tokenizer='whitespace',
                             filter=['lowercase'])