Code Example #1
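A test fixture that registers a MagicMock under the alias "mock", with search() returning a canned dummy_response, so code under test resolves the mock through the elasticsearch_dsl connection registry.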
def mock_client(dummy_response):
    """Returns elasticsearch mock client"""

    client = MagicMock()
    client.search.return_value = dummy_response
    add_connection("mock", client)
    yield client
Code Example #2
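A similar fixture that additionally resets the registry after the test by clearing the private _conn and _kwargs dictionaries (a public-API alternative is sketched below).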
def mock_client(dummy_response):
    client = Mock()
    client.search.return_value = dummy_response
    add_connection('mock', client)
    yield client
    connections._conn = {}
    connections._kwargs = {}
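The fixture above reaches into the private _conn / _kwargs dictionaries to reset state. A minimal sketch of the same fixture using only public API, assuming a recent elasticsearch_dsl that exposes connections.remove_connection, plus a pre-existing dummy_response fixture:

from unittest.mock import Mock

import pytest
from elasticsearch_dsl.connections import add_connection, connections


@pytest.fixture
def mock_client(dummy_response):
    """Register a mocked client under the 'mock' alias and drop it afterwards."""
    client = Mock()
    client.search.return_value = dummy_response
    add_connection("mock", client)
    yield client
    # public counterpart of clearing connections._conn / connections._kwargs
    connections.remove_connection("mock")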
Code Example #3
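A fixture that connects to a real test cluster via get_test_client, registers it as "default", and skips the test when no cluster is available; setting WAIT_FOR_ES in the environment makes it wait for the node instead of failing fast.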
def client():
    try:
        connection = get_test_client(nowait='WAIT_FOR_ES' not in os.environ)
        add_connection('default', connection)
        return connection
    except SkipTest:
        skip()
Code Example #4
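The same idea as a yielding fixture that also deletes all test-* indices once the test is done.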
def elastic_client():
    try:
        connection = get_test_client()
        add_connection("default", connection)
        yield connection
        connection.indices.delete("test-*", ignore=404)
    except SkipTest:
        skip()
Code Example #5
File: indexer.py, Project: wwwzrb/wsm
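A Python 2 indexer entry point: it loads crawled JSON files, collects every URL and outlink, and, when use_elasticsearch is set, registers the client under the 'book' alias and saves one Book document per JSON file into book-index (the pyLucene branch is cut off at the end of this snippet).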
def main(use_elasticsearch = True, calculate_PageRank = False, tele_const = 0.2):
    """
    main entry for the indexer module.
    """
    jsons_root_dir = 'JSONs201806101057/'

    # list of addresses of all json files
    all_json_dirs = glob.glob(unicode(jsons_root_dir + '*.json'))

    # first reading all json files
    jsons = []
    for jdir in all_json_dirs:
        with open(jdir, 'r') as f:
            jsn = json.load(f)
            jsons.append(jsn)
    print len(jsons), ' json files imported.'

    # now creating a set of all links and then a list of all links in json files
    print 'creating a list of all links'
    links_set = set()
    for js in jsons:
        links_set.add(js["url"])
        for l in js["outlinks"]:
            links_set.add(l)
    print len(links_set), ' links found'
    links = list(links_set)

    ## if user has selected to index documents using Elasticsearch
    # Note that when using Elasticsearch, page rank is ignored
    if use_elasticsearch:
        from elasticsearch import Elasticsearch
        from elasticsearch_dsl import Search, document, field, connections, Q
        from elasticsearch_dsl.connections import connections

        print 'Using Elasticsearch for indexing, PageRank is ignored'
        es = Elasticsearch(serializer=JSONSerializerPython2())
        es.indices.create(index='book-index', ignore=[400, 404])
        connections.create_connection(hosts=['localhost'], timeout=20)
        connections.add_connection('book', es)
        Book.init('book-index')

        ## adding all document to the index 'book-index'
        for idx, js in enumerate(jsons):
            if len(js['type']) == 0:
                js['type'] = ['missing']
                print idx
                print js['title']
            book = Book(average=js['average'],
                        cover=js['cover'],
                        description=js['description'].encode('utf-8', "replace"),
                        ratings=js['ratings'],
                        reviews=js['reviews'],
                        title=js['title'],
                        url=js['url'],
                        outlinks=js['outlinks'],
                        type=js['type'])
            book.add_authors(js['authors'])

            book.add_userreviews(js['userreviews'])
            book.id = idx
            book.save()
        print 'Elasticsearch index created'

    ### use pyLucene instead
    else:
        """
Code Example #6
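A small factory returning either an AsyncElasticsearch or a plain Elasticsearch client; only the synchronous client is registered under the "default" alias.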
def init_es(
        settings: Settings,
        use_async: bool = True) -> Union[AsyncElasticsearch, Elasticsearch]:
    """Instantiate an elastic search client."""
    if use_async:
        return AsyncElasticsearch([settings.ES_HOST])
    else:
        client = Elasticsearch([settings.ES_HOST])
        add_connection("default", client)
        return client
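A short usage sketch for the factory above (the settings object and its ES_HOST attribute come from the snippet; everything else is illustrative): once the synchronous client is registered, any other module can fetch it by alias instead of importing it.

from elasticsearch_dsl.connections import get_connection

client = init_es(settings, use_async=False)  # registers the "default" alias
assert get_connection("default") is client   # resolvable anywhere in the code base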
Code Example #7
File: searcher.py, Project: wwwzrb/wsm
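A web.py GET handler for a book detail page: it queries book-index by title and author name and renders the first hit, falling back to placeholder review fields when a hit has no user reviews.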
    def GET(self):
        render = web.template.render('templates/')
        details = []
        data_input = web.input()
        query_author=""
        query_title=""
        if "author" in data_input:
            query_author = data_input["authors"]
        query_title=data_input["title"]


        if use_elasticsearch:
            # importing libraries for Elasticsearch
            from elasticsearch import Elasticsearch
            from elasticsearch_dsl import Search, document, field, connections, Q
            from elasticsearch_dsl.connections import connections
            from booktype import Book

            es = Elasticsearch()
            es.indices.create(index='book-index', ignore=[400, 404])
            connections.create_connection(hosts=['localhost'], timeout=20)
            connections.add_connection('book', es)
            # print(connections.get_connection().cluster.health())
            #s = Search(using=es, index='book-index').doc_type('book').query(Q('match', title=query.strip()) | Q('match', description=query.strip()) | Q("match", userreviews_userReview=query.strip()))
            s = Search(using=es).index('book-index').doc_type('book').query(
                Q('match', title=query_title) | Q('match', authors_name=query_author))
            ## This damn statement took half an hour from me! Nowhere in the documentation indicated that this statement should be before s.execute()
            response = s.execute()
            userreviews_userName=["None"]
            userreviews_userURL=["#"]
            userreviews_userReview=["None"]
            userreviews_userReviewDate=["None"]
            # print 'total number of hits: ', response.hits.total
            for res in response:
                authors = zip(res.authors_name, res.authors_url)
                try:
                    reviews = zip(res.userreviews_userName, res.userreviews_userURL,
                              res.userreviews_userReview, res.userreviews_userReviewDate)
                except:
                    reviews = zip(userreviews_userName, userreviews_userURL,
                                  userreviews_userReview, userreviews_userReviewDate)

                details.append({'title': res.title, 'description': res.description.encode('utf-8'), 'url': res.url, 'cover':res.cover, 'authors':authors, 'reviews': reviews, 'types': res.type})
                break
        else:
            try:
                detail = details[0]
            except Exception as exp:
                print "get detail error"

        return render.details(details)
Code Example #8
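A constructor that reuses an already registered "default" connection, substitutes a FakeElasticsearch during unit tests, and otherwise creates a real connection with optional HTTP basic auth taken from Django settings.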
 def __init__(self):
     try:
         self._conn = connections.get_connection(alias='default')
     except KeyError:
         # if there is no existing connection create new one
         if is_testing():
             # fake Elasticsearch with unittests
             self._conn = FakeElasticsearch()
             connections.add_connection(alias='default', conn=self._conn)
         else:
             try:
                 http_auth = {'http_auth': (settings.ELASTICSEARCH_USERNAME, settings.ELASTICSEARCH_PASSWORD,)} \
                     if settings.ELASTICSEARCH_USERNAME else {}
             except AttributeError:
                 http_auth = {}
             self._conn = connections.create_connection(
                 alias='default',
                 hosts=[settings.ELASTICSEARCH_HOST],
                 **http_auth)
Code Example #9
File: searcher.py, Project: wwwzrb/wsm
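A web.py handler that searches books by author name with simple ten-per-page offset pagination.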
    def GET(self, query):
        data_input = web.input()
        page = 0
        if "page" in data_input:
            page = int(data_input["page"])
        render = web.template.render('templates/')
        anses = []
        num_pages = 0
        # print 'query content:', query
        if use_elasticsearch:
            # importing libraries for Elasticsearch
            from elasticsearch import Elasticsearch
            from elasticsearch_dsl import Search, document, field, connections, Q
            from elasticsearch_dsl.connections import connections
            from booktype import Book

            es = Elasticsearch()
            es.indices.create(index='book-index', ignore=[400, 404])
            connections.create_connection(hosts=['localhost'], timeout=20)
            connections.add_connection('book', es)
            # print(connections.get_connection().cluster.health())
            s = Search(using=es, index='book-index').doc_type('book').query(Q("match", authors_name=query.strip()))
            ## This damn statement took half an hour from me! Nowhere in the documentation indicated that this statement should be before s.execute()
            s = s[page * 10: page * 10 + 10]
            response = s.execute()
            # print 'total number of hits: ', response.hits.total
            num_pages = (response.hits.total / 10) + 1
            for res in response:
                authors = zip(res.authors_name, res.authors_url)
                anses.append({'title': res.title, 'description': res.description.encode('utf-8'), 'url': res.url,
                              'cover': res.cover, 'authors': authors, 'types': res.type})
        else:
            try:
                anse = anses[0]
            except Exception as exp:
                print "get genre error"


        return render.index(anses, query, num_pages)
Code Example #10
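The tail of a Book document class plus a __main__ block that creates book-index, registers the 'book' alias, and indexes the first ten crawled JSON files (the snippet breaks off mid call).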
        self.userreviews_userReview = [rev['userReview'] for rev in reviews]
        self.userreviews_userReviewDate = [
            rev['userReviewDate'] for rev in reviews
        ]
        self.userreviews_userURL = [rev['userURL'] for rev in reviews]

    class Meta:
        doc_type = 'book'
        index = 'book-index'


if __name__ == '__main__':
    es = Elasticsearch()
    es.indices.create(index='book-index', ignore=[400, 404])
    connections.create_connection(hosts=['localhost'], timeout=20)
    connections.add_connection('book', es)
    print(connections.get_connection().cluster.health())

    all_json_dirs = glob.glob('JSONs/*.json')
    all_jsons = []
    for jdir in all_json_dirs[:10]:
        with open(jdir, 'r') as f:
            jsn = json.load(f)
            all_jsons.append(jsn)
    print len(all_jsons)

    Book.init('book-index')
    for idx, js in enumerate(all_jsons):
        book = Book(average=js['average'],
                    cover=js['cover'],
                    description=js['description'].encode('utf-8'),
Code Example #11
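Module-level setup for the classic blog example: a client registered under a 'dev' alias, an html_strip analyzer, and Document/InnerDoc classes for posts and comments (the Post class is cut off).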
from datetime import datetime

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from elasticsearch_dsl import Document, Date, Nested, Boolean, \
    analyzer, InnerDoc, Completion, Keyword, Text, connections

HOSTS = ['localhost:9200']
CONNECTION_ALIAS = 'dev'
INDEX = 'blog'
es_client = Elasticsearch(HOSTS, timeout=5)
connections.add_connection(CONNECTION_ALIAS, es_client)

html_strip = analyzer('html_strip',
                      tokenizer="standard",
                      filter=["standard", "lowercase", "stop", "snowball"],
                      char_filter=["html_strip"])


class Comment(InnerDoc):
    author = Text(fields={'raw': Keyword()})
    content = Text(analyzer='snowball')
    created_at = Date()

    def age(self):
        return datetime.now() - self.created_at


class Post(Document):
    title = Text()
    title_suggest = Completion()
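The Post definition above is cut off. As a hypothetical sketch of how such a document could be written through the non-default 'dev' alias registered earlier (Document.init() and save() both accept index= and using= keywords):

if __name__ == '__main__':
    Post.init(index=INDEX, using=CONNECTION_ALIAS)   # create the 'blog' mapping on the 'dev' connection
    post = Post(title='Hello world', title_suggest={'input': ['Hello world']})
    post.save(index=INDEX, using=CONNECTION_ALIAS)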
Code Example #12
File: documents.py, Project: timo955/kitsune
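Document definitions from Kitsune that register an Elasticsearch 7 client under the project's default alias at import time.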
from django.db.models import Count, Prefetch, Q
from elasticsearch_dsl import InnerDoc, connections, field

from kitsune.forums.models import Post
from kitsune.questions.models import Answer, Question
from kitsune.search import config
from kitsune.search.v2.base import SumoDocument
from kitsune.search.v2.es7_utils import es7_client
from kitsune.search.v2.fields import SumoLocaleAwareKeywordField, SumoLocaleAwareTextField
from kitsune.users.models import Profile
from kitsune.wiki import models as wiki_models
from kitsune.wiki.config import CANNED_RESPONSES_CATEGORY, REDIRECT_HTML, TEMPLATES_CATEGORY

connections.add_connection(config.DEFAULT_ES7_CONNECTION, es7_client())


class WikiDocument(SumoDocument):
    updated = field.Date()

    product_ids = field.Keyword(multi=True)
    topic_ids = field.Keyword(multi=True)
    category = field.Keyword()

    # Document specific fields (locale aware)
    title = SumoLocaleAwareTextField()
    content = SumoLocaleAwareTextField(store=True,
                                       term_vector="with_positions_offsets")
    summary = SumoLocaleAwareTextField(store=True,
                                       term_vector="with_positions_offsets")
    # store keywords in a text field so they're stemmed:
    keywords = SumoLocaleAwareTextField()
Code Example #13
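A Typer CLI module that registers a client object (referenced as Client; its import sits in the cut-off header) as the default connection and defines documents for transcription alternatives.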
    Index,
    Text,
    Float,
    Document,
    Nested,
    InnerDoc,
)

from transcriptor.amazon import AmazonJob
from pathlib import Path

import typer

app = typer.Typer()

conn = connections.add_connection(conn=Client, alias='default')


class Alternate(InnerDoc):
    content = Text()


class Alternative(Document):
    content = Text()
    confidence = Float()
    alternate = Nested(Alternate)

    class Index:
        name = 'alternatives'

Code Example #14
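A thin wrapper class around Elasticsearch and Search, with a module-level default connection registered on import.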
Authors:
Sapir Nahum
Shmuel Eliasyan
"""
import logging
"""
===================================================================================================
Imports
===================================================================================================
"""
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, connections
import calendar
import time

connections.add_connection('default', Elasticsearch())  # register a client instance, not the class


class elasticsearch():
    """
    ===================================================================================================
    Init
    ===================================================================================================
    """
    def __init__(self):
        self.client = Elasticsearch()
        self.s = Search(using=self.client)

    """
    ===================================================================================================
    Functions
Code Example #15
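Connection bootstrap for an AWS-hosted cluster using signed requests; the resulting client is registered as "default" and the media index names are read from configuration.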
        aws_access_key=settings.AWS_ACCESS_KEY_ID,
        aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
        aws_host=es_url,
        aws_region=es_aws_region,
        aws_service="es",
    )
    auth.encode = lambda x: bytes(x.encode("utf-8"))
    _es = Elasticsearch(
        host=es_url,
        port=es_port,
        connection_class=RequestsHttpConnection,
        timeout=10,
        max_retries=1,
        retry_on_timeout=True,
        http_auth=auth,
        wait_for_status="yellow",
    )
    _es.info()
    return _es


ES = _elasticsearch_connect()
"""Elasticsearch client, also aliased to connection 'default'"""
connections.add_connection("default", ES)

MEDIA_INDEX_MAPPING = {
    media_type: config(f"{media_type.upper()}_INDEX_NAME", default=media_type)
    for media_type in MEDIA_TYPES
}
"""mapping of media types to Elasticsearch index names"""
Code Example #16
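The full search handler: the Elasticsearch branch matches the query against title, description and user reviews, while the else branch runs the same search through PyLucene term queries.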
    def GET(self, query):
        data_input = web.input()
        page = 0
        if "page" in data_input:
            page = int(data_input["page"])
        render = web.template.render('templates/')
        anses = []
        num_pages = 0
        if use_elasticsearch:
            # importing libraries for Elasticsearch
            from elasticsearch import Elasticsearch
            from elasticsearch_dsl import Search, document, field, connections, Q
            from elasticsearch_dsl.connections import connections
            from booktype import Book

            es = Elasticsearch()
            es.indices.create(index='book-index', ignore=[400, 404])
            connections.create_connection(hosts=['localhost'], timeout=20)
            connections.add_connection('book', es)
            # print(connections.get_connection().cluster.health())
            s = Search(es).index('book-index').doc_type('book').query(
                Q('match', title=query.strip())
                | Q('match', description=query.strip())
                | Q("match", userreviews_userReview=query.strip()))
            ## This damn statement took half an hour from me! Nowhere in the documentation indicated that this statement should be before s.execute()
            s = s[page * 10:page * 10 + 10]
            response = s.execute()
            # print 'total number of hits: ', response.hits.total
            num_pages = (response.hits.total / 10) + 1
            for res in response:
                authors = zip(res.authors_name, res.authors_url)
                anses.append({
                    'title': res.title,
                    'description': res.description.encode('utf-8'),
                    'url': res.url,
                    'cover': res.cover,
                    'authors': authors
                })
        else:
            # importing libraries for Lucene
            import lucene
            from java.io import File
            from org.apache.lucene.index import DirectoryReader, Term
            from org.apache.lucene.queryparser.classic import QueryParser
            from org.apache.lucene.store import SimpleFSDirectory
            from org.apache.lucene.search import IndexSearcher, BooleanClause, BooleanQuery, TermQuery
            from org.apache.lucene.util import Version
            from org.apache.lucene.analysis.standard import StandardAnalyzer
            import os

            # fields
            title_field = 'title'
            description_field = 'description'
            cover_field = 'cover'
            authors_name_field = 'authors_name'
            authors_url_field = 'authors_url'
            url_field = 'url'

            index_folder = '.'
            index_name = 'lucene.index'
            index_path = os.path.join(index_folder, index_name)

            lucene.initVM()
            version = Version.LUCENE_CURRENT
            directory = SimpleFSDirectory(File(index_path))
            searcher = IndexSearcher(DirectoryReader.open(directory))
            analyzer = StandardAnalyzer(version)

            title_tq = TermQuery(Term(title_field, query))
            desc_tq = TermQuery(Term(description_field, query))
            # use a new name so the original query string stays available for the template below
            bool_query = BooleanQuery()
            bool_query.add(BooleanClause(title_tq, BooleanClause.Occur.SHOULD))
            bool_query.add(BooleanClause(desc_tq, BooleanClause.Occur.SHOULD))
            scoreDocs = searcher.search(bool_query, 1000).scoreDocs
            num_pages = (len(scoreDocs) / 10) + 1

            for scoreDoc in scoreDocs[page * 10:page * 10 + 10]:
                doc = searcher.doc(scoreDoc.doc)
                authors = zip([doc.get(authors_name_field)],
                              [doc.get(authors_url_field)])
                anses.append({
                    'title':
                    doc.get(title_field),
                    'description':
                    doc.get(description_field).encode('utf-8'),
                    'url':
                    doc.get(url_field),
                    'cover':
                    doc.get(cover_field),
                    'authors':
                    authors
                })

        return render.index(anses, query, num_pages)
Code Example #17
File: conftest.py, Project: City-of-Helsinki/helerm
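The shortest form of the test fixture: create a test client, register it as "default", and yield it.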
def es_connection():
    es_connection = get_test_client()
    add_connection("default", es_connection)
    yield es_connection
Code Example #18
File: indexer.py, Project: mikalv/GoodSearcher
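Another copy of the indexer's main(), this time with the complete pyLucene branch that defines per-field types, applies optional PageRank boosts, and writes a Lucene index to disk.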
def main(use_elasticsearch=True, calculate_PageRank=False, tele_const=0.2):
    """
    main entry for the indexer module.
    """
    jsons_root_dir = 'JSONs/'

    # list of addresses of all json files
    all_json_dirs = glob.glob(jsons_root_dir + '*.json')

    # first reading all json files
    jsons = []
    for jdir in all_json_dirs:
        with open(jdir, 'r') as f:
            jsn = json.load(f)
            jsons.append(jsn)
    print len(jsons), ' json files imported.'

    # now creating a set of all links and then a list of all links in json files
    print 'creating a list of all links'
    links_set = set()
    for js in jsons:
        links_set.add(js["url"])
        for l in js["outlinks"]:
            links_set.add(l)
    print len(links_set), ' links found'
    links = list(links_set)

    ## if user has selected to index documents using Elasticsearch
    # Note that when using Elasticsearch, page rank is ignored
    if use_elasticsearch:
        from elasticsearch import Elasticsearch
        from elasticsearch_dsl import Search, document, field, connections, Q
        from elasticsearch_dsl.connections import connections

        print 'Using Elasticsearch for indexing, PageRank is ignored'
        es = Elasticsearch()
        es.indices.create(index='book-index', ignore=[400, 404])
        connections.create_connection(hosts=['localhost'], timeout=20)
        connections.add_connection('book', es)
        Book.init('book-index')

        ## adding all document to the index 'book-index'
        for idx, js in enumerate(jsons):
            book = Book(average=js['average'],
                        cover=js['cover'],
                        description=js['description'].encode('utf-8'),
                        ratings=js['ratings'],
                        reviews=js['reviews'],
                        title=js['title'],
                        url=js['url'],
                        outlinks=js['outlinks'])
            book.add_authors(js['authors'])
            book.add_userreviews(js['userreviews'])
            book.id = idx
            book.save()
        print 'Elasticsearch index created'

    ### use pyLucene instead
    else:
        import lucene
        from java.io import File
        from org.apache.lucene.index import IndexWriterConfig, IndexWriter, FieldInfo
        from org.apache.lucene.document import Document, Field, FieldType, IntField, FloatField
        from org.apache.lucene.store import SimpleFSDirectory
        from org.apache.lucene.util import Version
        from org.apache.lucene.analysis.standard import StandardAnalyzer

        print 'Using Lucene for indexing'
        ## if user has selected to calculate the PageRank
        if calculate_PageRank:
            # now creating the unnormalized adjacency matrix
            print 'creating the unnormalized adjacency matrix.'
            adjacency = np.zeros((len(links_set), len(links_set)))
            for js in jsons:
                node_idx = links.index(js["url"])
                for l in js["outlinks"]:
                    out_idx = links.index(l)
                    adjacency[node_idx, out_idx] += 1
            print 'the unnormalized adjacency matrix created.'

            print 'normalizing the adjacency matrix with teleporting constant value of ', tele_const
            norm_mat = Normalize(adjacency, tele_const)
            print 'calculating the PageRank scores'
            pr_scores = PageRankScore(norm_mat)

        ## here goes the pyLucene code, which means I should switch to the damn Ubuntu
        index_folder = '.'
        index_name = 'lucene.index'
        index_path = os.path.join(index_folder, index_name)
        print 'initializing Lucene VM'
        lucene.initVM()
        print 'lucene version ', lucene.VERSION
        version = Version.LUCENE_CURRENT
        index_store = SimpleFSDirectory(File(index_path))
        analyzer = StandardAnalyzer(version)
        config = IndexWriterConfig(version, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(index_store, config)

        # Options
        TokenizeFields = True

        # Title field type
        title_field = 'title'
        tft = FieldType()
        tft.setIndexed(True)
        tft.setStored(True)
        tft.setTokenized(TokenizeFields)
        tft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS
                            )  #only index the document and frequency data

        # Authors name field type
        authors_name_field = 'authors_name'
        anft = FieldType()
        anft.setIndexed(True)
        anft.setStored(True)
        anft.setTokenized(TokenizeFields)
        anft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        # Authors url field type
        authors_url_field = 'authors_url'
        auft = FieldType()
        auft.setIndexed(False)
        auft.setStored(True)

        # Average rating field type
        average_field = 'average'

        # Cover Image URL field type
        cover_field = 'cover'
        cft = FieldType()
        cft.setIndexed(False)
        cft.setStored(True)

        # Book description field type
        description_field = 'description'
        descft = FieldType()
        descft.setIndexed(True)
        descft.setStored(True)
        descft.setTokenized(TokenizeFields)
        descft.setIndexOptions(
            FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        # Outlinks field type
        outlinks_field = "outlinks"
        outft = FieldType()
        outft.setIndexed(False)
        outft.setStored(True)

        # Ratings count field type
        ratings_field = 'ratings'

        # Reviews count field type
        reviews_field = 'reviews'

        # URL field type
        url_field = 'url'
        uft = FieldType()
        uft.setIndexed(False)
        uft.setStored(True)

        # userreviews.userName field type
        userreviews_userName_field = 'userreviews_userName'
        usunft = FieldType()
        usunft.setIndexed(False)
        usunft.setStored(True)

        #userreviews.userReview field type
        userreviews_userReview_field = 'userreviews_userReview'
        usurft = FieldType()
        usurft.setIndexed(True)
        usurft.setStored(False)
        usurft.setTokenized(TokenizeFields)
        usurft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        #userreviews.userReviewDate field type
        userreviews_userReviewDate_field = 'userreviews_userReviewDate'
        usudft = FieldType()
        usudft.setIndexed(False)
        usudft.setStored(True)

        #userreviews.userURL field type
        userreviews_userURL_field = 'userreviews_userURL'
        usuuft = FieldType()
        usuuft.setIndexed(False)
        usuuft.setStored(True)

        docid_field = 'docid'

        for idx, js in enumerate(jsons):
            boostVal = js['average']
            if calculate_PageRank:
                boostVal *= pr_scores[links.index(js['url'])]
            doc = Document()
            for author in js['authors']:
                doc.add(Field(authors_name_field, author['name'], anft))
                doc.add(Field(authors_url_field, author['url'], auft))
            doc.add(
                FloatField(average_field, float(js['average']),
                           Field.Store.YES))
            doc.add(Field(cover_field, js['cover'], cft))
            df = Field(description_field, js['description'], descft)
            df.setBoost(boostVal)
            doc.add(df)
            for u in js['outlinks']:
                doc.add(Field(outlinks_field, u, outft))
            doc.add(IntField(ratings_field, js['ratings'], Field.Store.YES))
            doc.add(IntField(reviews_field, js['reviews'], Field.Store.YES))
            tf = Field(title_field, js['title'], tft)
            tf.setBoost(boostVal)
            doc.add(tf)
            doc.add(Field(url_field, js['url'], uft))

            for rev in js['userreviews']:
                doc.add(
                    Field(userreviews_userName_field, rev['userName'], usunft))
                doc.add(
                    Field(userreviews_userReview_field, rev['userReview'],
                          usurft))
                doc.add(
                    Field(userreviews_userReviewDate_field,
                          rev['userReviewDate'], usurft))
                doc.add(
                    Field(userreviews_userURL_field, rev['userURL'], usuuft))
            doc.add(IntField(docid_field, idx, Field.Store.YES))

            writer.addDocument(doc)
        print 'lucene index created'
        writer.commit()
        writer.close()
        print 'writing lucene indexing finished'