def mock_client(dummy_response): """Returns elasticsearch mock client""" client = MagicMock() client.search.return_value = dummy_response add_connection("mock", client) yield client
def mock_client(dummy_response):
    client = Mock()
    client.search.return_value = dummy_response
    add_connection('mock', client)
    yield client
    # teardown: clear the private connection registry after the test
    connections._conn = {}
    connections._kwargs = {}
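# A minimal sketch (not part of the fixtures above) of a test that consumes
# such a mock fixture. It assumes mock_client and dummy_response are registered
# as pytest fixtures; Search(using="mock") resolves the client through the
# alias registered by add_connection, so no real cluster is needed.
from elasticsearch_dsl import Search

def test_search_goes_through_mock(mock_client):
    s = Search(using="mock", index="test-index")
    s.execute()
    # the mocked client, not a real cluster, received the request
    mock_client.search.assert_called_once()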
def client():
    try:
        connection = get_test_client(nowait='WAIT_FOR_ES' not in os.environ)
        add_connection('default', connection)
        return connection
    except SkipTest:
        skip()
def elastic_client():
    try:
        connection = get_test_client()
        add_connection("default", connection)
        yield connection
        # teardown: remove any indices created during the test
        connection.indices.delete("test-*", ignore=404)
    except SkipTest:
        skip()
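# Sketch of a test built on the elastic_client fixture above; the index name
# "test-demo" is hypothetical but matches the "test-*" pattern, so the
# fixture's teardown line deletes it after the test finishes.
def test_roundtrip(elastic_client):
    elastic_client.index(index="test-demo", id=1, body={"field": "value"}, refresh=True)
    doc = elastic_client.get(index="test-demo", id=1)
    assert doc["_source"]["field"] == "value"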
def main(use_elasticsearch=True, calculate_PageRank=False, tele_const=0.2):
    """ main entry for the indexer module. """
    jsons_root_dir = 'JSONs201806101057/'
    # list of addresses of all json files
    all_json_dirs = glob.glob(unicode(jsons_root_dir + '*.json'))
    # first reading all json files
    jsons = []
    for jdir in all_json_dirs:
        with open(jdir, 'r') as f:
            jsn = json.load(f)
            jsons.append(jsn)
    print len(jsons), ' json files imported.'
    # now creating a set of all links and then a list of all links in json files
    print 'creating a list of all links'
    links_set = set()
    for js in jsons:
        links_set.add(js["url"])
        for l in js["outlinks"]:
            links_set.add(l)
    print len(links_set), ' links found'
    links = list(links_set)
    ## if user has selected to index documents using Elasticsearch
    # Note that when using Elasticsearch, page rank is ignored
    if use_elasticsearch:
        from elasticsearch import Elasticsearch
        from elasticsearch_dsl import Search, document, field, connections, Q
        from elasticsearch_dsl.connections import connections
        print 'Using Elasticsearch for indexing, PageRank is ignored'
        es = Elasticsearch(serializer=JSONSerializerPython2())
        es.indices.create(index='book-index', ignore=[400, 404])
        connections.create_connection(hosts=['localhost'], timeout=20)
        connections.add_connection('book', es)
        Book.init('book-index')
        ## adding all documents to the index 'book-index'
        for idx, js in enumerate(jsons):
            if len(js['type']) == 0:
                js['type'] = ['missing']
            print idx
            print js['title']
            book = Book(average=js['average'],
                        cover=js['cover'],
                        description=js['description'].encode('utf-8', "replace"),
                        ratings=js['ratings'],
                        reviews=js['reviews'],
                        title=js['title'],
                        url=js['url'],
                        outlinks=js['outlinks'],
                        type=js['type'])
            book.add_authors(js['authors'])
            book.add_userreviews(js['userreviews'])
            book.id = idx
            book.save()
        print 'Elasticsearch index created'
    ### use pyLucene instead
    else:
        """
def init_es(
        settings: Settings,
        use_async: bool = True) -> Union[AsyncElasticsearch, Elasticsearch]:
    """Instantiate an elastic search client."""
    if use_async:
        return AsyncElasticsearch([settings.ES_HOST])
    else:
        client = Elasticsearch([settings.ES_HOST])
        add_connection("default", client)
        return client
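# Hedged usage sketch for init_es above. Settings and ES_HOST come from the
# snippet itself, but constructing Settings with a keyword argument is an
# assumption (the real class may load from the environment). In the sync case
# the returned client is also registered as the "default" elasticsearch_dsl
# connection.
settings = Settings(ES_HOST="http://localhost:9200")  # hypothetical value
es = init_es(settings, use_async=False)
es.ping()  # plain elasticsearch-py call on the same client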
def GET(self):
    render = web.template.render('templates/')
    details = []
    data_input = web.input()
    query_author = ""
    query_title = ""
    # note: check the same key that is read below ("authors", not "author")
    if "authors" in data_input:
        query_author = data_input["authors"]
        query_title = data_input["title"]
    if use_elasticsearch:
        # importing libraries for Elasticsearch
        from elasticsearch import Elasticsearch
        from elasticsearch_dsl import Search, document, field, connections, Q
        from elasticsearch_dsl.connections import connections
        from booktype import Book
        es = Elasticsearch()
        es.indices.create(index='book-index', ignore=[400, 404])
        connections.create_connection(hosts=['localhost'], timeout=20)
        connections.add_connection('book', es)
        # print(connections.get_connection().cluster.health())
        #s = Search(using=es, index='book-index').doc_type('book').query(Q('match', title=query.strip()) | Q('match', description=query.strip()) | Q("match", userreviews_userReview=query.strip()))
        s = Search(using=es).index('book-index').doc_type('book').query(
            Q('match', title=query_title) | Q('match', authors_name=query_author))
        ## This damn statement took half an hour from me! Nowhere in the documentation indicated that this statement should be before s.execute()
        response = s.execute()
        userreviews_userName = ["None"]
        userreviews_userURL = ["#"]
        userreviews_userReview = ["None"]
        userreviews_userReviewDate = ["None"]
        # print 'total number of hits: ', response.hits.total
        for res in response:
            authors = zip(res.authors_name, res.authors_url)
            try:
                reviews = zip(res.userreviews_userName, res.userreviews_userURL,
                              res.userreviews_userReview, res.userreviews_userReviewDate)
            except:
                reviews = zip(userreviews_userName, userreviews_userURL,
                              userreviews_userReview, userreviews_userReviewDate)
            details.append({'title': res.title,
                            'description': res.description.encode('utf-8'),
                            'url': res.url,
                            'cover': res.cover,
                            'authors': authors,
                            'reviews': reviews,
                            'types': res.type})
            break
    else:
        try:
            detail = details[0]
        except Exception as exp:
            print "get detail error"
    return render.details(details)
def __init__(self):
    try:
        self._conn = connections.get_connection(alias='default')
    except KeyError:
        # if there is no existing connection create new one
        if is_testing():
            # fake Elasticsearch with unittests
            self._conn = FakeElasticsearch()
            connections.add_connection(alias='default', conn=self._conn)
        else:
            try:
                http_auth = {'http_auth': (settings.ELASTICSEARCH_USERNAME,
                                           settings.ELASTICSEARCH_PASSWORD,)} \
                    if settings.ELASTICSEARCH_USERNAME else {}
            except AttributeError:
                http_auth = {}
            self._conn = connections.create_connection(
                alias='default',
                hosts=[settings.ELASTICSEARCH_HOST],
                **http_auth)
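# Minimal sketch (not from the original module) of what the registration above
# buys: any later elasticsearch_dsl consumer can fetch the very same client by
# its alias instead of passing it around.
from elasticsearch_dsl import connections

same_conn = connections.get_connection(alias='default')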
def GET(self, query):
    data_input = web.input()
    page = 0
    if "page" in data_input:
        page = int(data_input["page"])
    render = web.template.render('templates/')
    anses = []
    num_pages = 0
    # print 'query content:', query
    if use_elasticsearch:
        # importing libraries for Elasticsearch
        from elasticsearch import Elasticsearch
        from elasticsearch_dsl import Search, document, field, connections, Q
        from elasticsearch_dsl.connections import connections
        from booktype import Book
        es = Elasticsearch()
        es.indices.create(index='book-index', ignore=[400, 404])
        connections.create_connection(hosts=['localhost'], timeout=20)
        connections.add_connection('book', es)
        # print(connections.get_connection().cluster.health())
        s = Search(using=es, index='book-index').doc_type('book').query(
            Q("match", authors_name=query.strip()))
        ## This damn statement took half an hour from me! Nowhere in the documentation indicated that this statement should be before s.execute()
        s = s[page * 10: page * 10 + 10]
        response = s.execute()
        # print 'total number of hits: ', response.hits.total
        num_pages = (response.hits.total / 10) + 1
        for res in response:
            authors = zip(res.authors_name, res.authors_url)
            anses.append({'title': res.title,
                          'description': res.description.encode('utf-8'),
                          'url': res.url,
                          'cover': res.cover,
                          'authors': authors,
                          'types': res.type})
    else:
        try:
            anse = anses[0]
        except Exception as exp:
            print "get genre error"
    return render.index(anses, query, num_pages)
        self.userreviews_userReview = [rev['userReview'] for rev in reviews]
        self.userreviews_userReviewDate = [
            rev['userReviewDate'] for rev in reviews
        ]
        self.userreviews_userURL = [rev['userURL'] for rev in reviews]

    class Meta:
        doc_type = 'book'
        index = 'book-index'


if __name__ == '__main__':
    es = Elasticsearch()
    es.indices.create(index='book-index', ignore=[400, 404])
    connections.create_connection(hosts=['localhost'], timeout=20)
    connections.add_connection('book', es)
    print(connections.get_connection().cluster.health())
    all_json_dirs = glob.glob('JSONs/*.json')
    all_jsons = []
    for jdir in all_json_dirs[:10]:
        with open(jdir, 'r') as f:
            jsn = json.load(f)
            all_jsons.append(jsn)
    print len(all_jsons)
    Book.init('book-index')
    for idx, js in enumerate(all_jsons):
        book = Book(average=js['average'],
                    cover=js['cover'],
                    description=js['description'].encode('utf-8'),
from datetime import datetime

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from elasticsearch_dsl import Document, Date, Nested, Boolean, \
    analyzer, InnerDoc, Completion, Keyword, Text, connections

HOSTS = ['localhost:9200']
CONNECTION_ALIAS = 'dev'
INDEX = 'blog'

es_client = Elasticsearch(HOSTS, timeout=5)
connections.add_connection(CONNECTION_ALIAS, es_client)

html_strip = analyzer('html_strip',
                      tokenizer="standard",
                      filter=["standard", "lowercase", "stop", "snowball"],
                      char_filter=["html_strip"])


class Comment(InnerDoc):
    author = Text(fields={'raw': Keyword()})
    content = Text(analyzer='snowball')
    created_at = Date()

    def age(self):
        return datetime.now() - self.created_at


class Post(Document):
    title = Text()
    title_suggest = Completion()
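# A sketch (assuming Post's truncated body adds no conflicting Index/using
# settings) of how the 'dev' alias registered above can be used explicitly
# when creating the index and saving a document:
if __name__ == '__main__':
    Post.init(index=INDEX, using=CONNECTION_ALIAS)
    post = Post(title='Hello world', title_suggest='Hello world')
    post.meta.id = 1
    post.save(index=INDEX, using=CONNECTION_ALIAS)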
from django.db.models import Count, Prefetch, Q
from elasticsearch_dsl import InnerDoc, connections, field

from kitsune.forums.models import Post
from kitsune.questions.models import Answer, Question
from kitsune.search import config
from kitsune.search.v2.base import SumoDocument
from kitsune.search.v2.es7_utils import es7_client
from kitsune.search.v2.fields import SumoLocaleAwareKeywordField, SumoLocaleAwareTextField
from kitsune.users.models import Profile
from kitsune.wiki import models as wiki_models
from kitsune.wiki.config import CANNED_RESPONSES_CATEGORY, REDIRECT_HTML, TEMPLATES_CATEGORY

connections.add_connection(config.DEFAULT_ES7_CONNECTION, es7_client())


class WikiDocument(SumoDocument):
    updated = field.Date()
    product_ids = field.Keyword(multi=True)
    topic_ids = field.Keyword(multi=True)
    category = field.Keyword()

    # Document specific fields (locale aware)
    title = SumoLocaleAwareTextField()
    content = SumoLocaleAwareTextField(store=True, term_vector="with_positions_offsets")
    summary = SumoLocaleAwareTextField(store=True, term_vector="with_positions_offsets")
    # store keywords in a text field so they're stemmed:
    keywords = SumoLocaleAwareTextField()
from elasticsearch_dsl import (
    connections,
    Index,
    Text,
    Float,
    Document,
    Nested,
    InnerDoc,
)
from transcriptor.amazon import AmazonJob
from pathlib import Path
import typer

app = typer.Typer()
conn = connections.add_connection(conn=Client, alias='default')


class Alternate(InnerDoc):
    content = Text()


class Alternative(Document):
    content = Text()
    confidence = Float()
    alternate = Nested(Alternate)

    class Index:
        name = 'alternatives'
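# Hypothetical save using the Alternative document above; it assumes the
# 'default' connection registered by add_connection is a live client instance
# and that the 'alternatives' index exists or may be created implicitly.
alt = Alternative(content='hello world', confidence=0.87)
alt.alternate.append(Alternate(content='hello word'))
alt.save()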
Authors:
    Sapir Nahum
    Shmuel Eliasyan
"""
import logging
"""
===================================================================================================
                                            Imports
===================================================================================================
"""
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, connections
import calendar
import time

# register an instance (not the class itself) as the default connection
connections.add_connection('default', Elasticsearch())


class elasticsearch():
    """
    ===================================================================================================
                                                Init
    ===================================================================================================
    """
    def __init__(self):
        self.client = Elasticsearch()
        self.s = Search(using=self.client)

    """
    ===================================================================================================
                                            Functions
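# Hypothetical use of the wrapper class above (only __init__ is shown in the
# snippet, so this drives the stored Search object directly; the index and
# field names are assumptions):
es = elasticsearch()
response = es.s.index('my-index').query('match', message='error').execute()
for hit in response:
    print(hit.meta.score)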
        aws_access_key=settings.AWS_ACCESS_KEY_ID,
        aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
        aws_host=es_url,
        aws_region=es_aws_region,
        aws_service="es",
    )
    auth.encode = lambda x: bytes(x.encode("utf-8"))
    _es = Elasticsearch(
        host=es_url,
        port=es_port,
        connection_class=RequestsHttpConnection,
        timeout=10,
        max_retries=1,
        retry_on_timeout=True,
        http_auth=auth,
        wait_for_status="yellow",
    )
    _es.info()
    return _es


ES = _elasticsearch_connect()
"""Elasticsearch client, also aliased to connection 'default'"""

connections.add_connection("default", ES)

MEDIA_INDEX_MAPPING = {
    media_type: config(f"{media_type.upper()}_INDEX_NAME", default=media_type)
    for media_type in MEDIA_TYPES
}
"""mapping of media types to Elasticsearch index names"""
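# Hypothetical lookup using the mapping above; "image" is an assumed media
# type (the real MEDIA_TYPES comes from the surrounding project). The globally
# registered ES client serves the query directly.
index_name = MEDIA_INDEX_MAPPING.get("image", "image")
results = ES.search(index=index_name, body={"query": {"match_all": {}}})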
def GET(self, query):
    data_input = web.input()
    page = 0
    if "page" in data_input:
        page = int(data_input["page"])
    render = web.template.render('templates/')
    anses = []
    num_pages = 0
    if use_elasticsearch:
        # importing libraries for Elasticsearch
        from elasticsearch import Elasticsearch
        from elasticsearch_dsl import Search, document, field, connections, Q
        from elasticsearch_dsl.connections import connections
        from booktype import Book
        es = Elasticsearch()
        es.indices.create(index='book-index', ignore=[400, 404])
        connections.create_connection(hosts=['localhost'], timeout=20)
        connections.add_connection('book', es)
        # print(connections.get_connection().cluster.health())
        s = Search(es).index('book-index').doc_type('book').query(
            Q('match', title=query.strip()) |
            Q('match', description=query.strip()) |
            Q("match", userreviews_userReview=query.strip()))
        ## This damn statement took half an hour from me! Nowhere in the documentation indicated that this statement should be before s.execute()
        s = s[page * 10:page * 10 + 10]
        response = s.execute()
        # print 'total number of hits: ', response.hits.total
        num_pages = (response.hits.total / 10) + 1
        for res in response:
            authors = zip(res.authors_name, res.authors_url)
            anses.append({
                'title': res.title,
                'description': res.description.encode('utf-8'),
                'url': res.url,
                'cover': res.cover,
                'authors': authors
            })
    else:
        # importing libraries for Lucene
        import lucene
        from java.io import File
        from org.apache.lucene.index import DirectoryReader, Term
        from org.apache.lucene.queryparser.classic import QueryParser
        from org.apache.lucene.store import SimpleFSDirectory
        from org.apache.lucene.search import IndexSearcher, BooleanClause, BooleanQuery, TermQuery
        from org.apache.lucene.util import Version
        from org.apache.lucene.analysis.standard import StandardAnalyzer
        import os

        # fields
        title_field = 'title'
        description_field = 'description'
        cover_field = 'cover'
        authors_name_field = 'authors_name'
        authors_url_field = 'authors_url'
        url_field = 'url'

        index_folder = '.'
        index_name = 'lucene.index'
        index_path = os.path.join(index_folder, index_name)

        lucene.initVM()
        version = Version.LUCENE_CURRENT
        directory = SimpleFSDirectory(File(index_path))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        analyzer = StandardAnalyzer(version)

        title_tq = TermQuery(Term(title_field, query))
        desc_tq = TermQuery(Term(description_field, query))
        bool_query = BooleanQuery()  # avoid clobbering the query string used for rendering
        bool_query.add(BooleanClause(title_tq, BooleanClause.Occur.SHOULD))
        bool_query.add(BooleanClause(desc_tq, BooleanClause.Occur.SHOULD))
        scoreDocs = searcher.search(bool_query, 1000).scoreDocs
        num_pages = (len(scoreDocs) / 10) + 1
        for scoreDoc in scoreDocs[page * 10:page * 10 + 10]:
            doc = searcher.doc(scoreDoc.doc)
            authors = zip([doc.get(authors_name_field)], [doc.get(authors_url_field)])
            anses.append({
                'title': doc.get(title_field),
                'description': doc.get(description_field).encode('utf-8'),
                'url': doc.get(url_field),
                'cover': doc.get(cover_field),
                'authors': authors
            })
    return render.index(anses, query, num_pages)
def es_connection():
    es_connection = get_test_client()
    add_connection("default", es_connection)
    yield es_connection
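# Sketch of a test consuming the es_connection fixture above (assumes
# get_test_client returned a live client, as in the other fixtures here):
def test_cluster_reachable(es_connection):
    assert es_connection.ping()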
def main(use_elasticsearch=True, calculate_PageRank=False, tele_const=0.2):
    """ main entry for the indexer module. """
    jsons_root_dir = 'JSONs/'
    # list of addresses of all json files
    all_json_dirs = glob.glob(jsons_root_dir + '*.json')
    # first reading all json files
    jsons = []
    for jdir in all_json_dirs:
        with open(jdir, 'r') as f:
            jsn = json.load(f)
            jsons.append(jsn)
    print len(jsons), ' json files imported.'
    # now creating a set of all links and then a list of all links in json files
    print 'creating a list of all links'
    links_set = set()
    for js in jsons:
        links_set.add(js["url"])
        for l in js["outlinks"]:
            links_set.add(l)
    print len(links_set), ' links found'
    links = list(links_set)
    ## if user has selected to index documents using Elasticsearch
    # Note that when using Elasticsearch, page rank is ignored
    if use_elasticsearch:
        from elasticsearch import Elasticsearch
        from elasticsearch_dsl import Search, document, field, connections, Q
        from elasticsearch_dsl.connections import connections
        print 'Using Elasticsearch for indexing, PageRank is ignored'
        es = Elasticsearch()
        es.indices.create(index='book-index', ignore=[400, 404])
        connections.create_connection(hosts=['localhost'], timeout=20)
        connections.add_connection('book', es)
        Book.init('book-index')
        ## adding all documents to the index 'book-index'
        for idx, js in enumerate(jsons):
            book = Book(average=js['average'],
                        cover=js['cover'],
                        description=js['description'].encode('utf-8'),
                        ratings=js['ratings'],
                        reviews=js['reviews'],
                        title=js['title'],
                        url=js['url'],
                        outlinks=js['outlinks'])
            book.add_authors(js['authors'])
            book.add_userreviews(js['userreviews'])
            book.id = idx
            book.save()
        print 'Elasticsearch index created'
    ### use pyLucene instead
    else:
        import lucene
        from java.io import File
        from org.apache.lucene.index import IndexWriterConfig, IndexWriter, FieldInfo
        from org.apache.lucene.document import Document, Field, FieldType, IntField, FloatField
        from org.apache.lucene.store import SimpleFSDirectory
        from org.apache.lucene.util import Version
        from org.apache.lucene.analysis.standard import StandardAnalyzer
        print 'Using Lucene for indexing'
        ## if user has selected to calculate the PageRank
        if calculate_PageRank:
            # now creating the unnormalized adjacency matrix
            print 'creating the unnormalized adjacency matrix.'
            adjacency = np.zeros((len(links_set), len(links_set)))
            for js in jsons:
                node_idx = links.index(js["url"])
                for l in js["outlinks"]:
                    out_idx = links.index(l)
                    adjacency[node_idx, out_idx] += 1
            print 'the unnormalized adjacency matrix created.'
            print 'normalizing the adjacency matrix with teleporting constant value of ', tele_const
            norm_mat = Normalize(adjacency, tele_const)
            print 'calculating the PageRank scores'
            pr_scores = PageRankScore(norm_mat)
        ## here goes the pyLucene code, which means I should switch to the damn Ubuntu
        index_folder = '.'
        index_name = 'lucene.index'
        index_path = os.path.join(index_folder, index_name)
        print 'initializing Lucene VM'
        lucene.initVM()
        print 'lucene version ', lucene.VERSION
        version = Version.LUCENE_CURRENT
        index_store = SimpleFSDirectory(File(index_path))
        analyzer = StandardAnalyzer(version)
        config = IndexWriterConfig(version, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(index_store, config)

        # Options
        TokenizeFields = True

        # Title field type
        title_field = 'title'
        tft = FieldType()
        tft.setIndexed(True)
        tft.setStored(True)
        tft.setTokenized(TokenizeFields)
        tft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)  # only index the document and frequency data

        # Authors name field type
        authors_name_field = 'authors_name'
        anft = FieldType()
        anft.setIndexed(True)
        anft.setStored(True)
        anft.setTokenized(TokenizeFields)
        anft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        # Authors url field type
        authors_url_field = 'authors_url'
        auft = FieldType()
        auft.setIndexed(False)
        auft.setStored(True)

        # Average rating field type
        average_field = 'average'

        # Cover Image URL field type
        cover_field = 'cover'
        cft = FieldType()
        cft.setIndexed(False)
        cft.setStored(True)

        # Book description field type
        description_field = 'description'
        descft = FieldType()
        descft.setIndexed(True)
        descft.setStored(True)
        descft.setTokenized(TokenizeFields)
        descft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        # Outlinks field type
        outlinks_field = "outlinks"
        outft = FieldType()
        outft.setIndexed(False)
        outft.setStored(True)

        # Ratings count field type
        ratings_field = 'ratings'

        # Reviews count field type
        reviews_field = 'reviews'

        # URL field type
        url_field = 'url'
        uft = FieldType()
        uft.setIndexed(False)
        uft.setStored(True)

        # userreviews.userName field type
        userreviews_userName_field = 'userreviews_userName'
        usunft = FieldType()
        usunft.setIndexed(False)
        usunft.setStored(True)

        # userreviews.userReview field type
        userreviews_userReview_field = 'userreviews_userReview'
        usurft = FieldType()
        usurft.setIndexed(True)
        usurft.setStored(False)
        usurft.setTokenized(TokenizeFields)
        usurft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        # userreviews.userReviewDate field type
        userreviews_userReviewDate_field = 'userreviews_userReviewDate'
        usudft = FieldType()
        usudft.setIndexed(False)
        usudft.setStored(True)

        # userreviews.userURL field type
        userreviews_userURL_field = 'userreviews_userURL'
        usuuft = FieldType()
        usuuft.setIndexed(False)
        usuuft.setStored(True)

        docid_field = 'docid'

        for idx, js in enumerate(jsons):
            boostVal = js['average']
            if calculate_PageRank:
                boostVal *= pr_scores[links.index(js['url'])]
            doc = Document()
            for author in js['authors']:
                doc.add(Field(authors_name_field, author['name'], anft))
                doc.add(Field(authors_url_field, author['url'], auft))
            doc.add(FloatField(average_field, float(js['average']), Field.Store.YES))
            doc.add(Field(cover_field, js['cover'], cft))
            df = Field(description_field, js['description'], descft)
            df.setBoost(boostVal)
            doc.add(df)
            for u in js['outlinks']:
                doc.add(Field(outlinks_field, u, outft))
            doc.add(IntField(ratings_field, js['ratings'], Field.Store.YES))
            doc.add(IntField(reviews_field, js['reviews'], Field.Store.YES))
            tf = Field(title_field, js['title'], tft)
            tf.setBoost(boostVal)
            doc.add(tf)
            doc.add(Field(url_field, js['url'], uft))
            for rev in js['userreviews']:
                doc.add(Field(userreviews_userName_field, rev['userName'], usunft))
                doc.add(Field(userreviews_userReview_field, rev['userReview'], usurft))
                # use the date field type defined above for the review date
                doc.add(Field(userreviews_userReviewDate_field, rev['userReviewDate'], usudft))
                doc.add(Field(userreviews_userURL_field, rev['userURL'], usuuft))
            doc.add(IntField(docid_field, idx, Field.Store.YES))
            writer.addDocument(doc)
        print 'lucene index created'
        writer.commit()
        writer.close()
        print 'writing lucene indexing finished'