Example #1
def load_pie(
    filename, index_name, type_name, category, name, zone="France", sep=";", display="pie", source="", description=""
):

    f = open(filename, mode="r")
    es = ElasticSearch(CONTEXT["datahub-store"])

    categories = {}
    for line in f:
        key, string_value = line.split(sep, 2)
        value = cjson.decode(string_value)

        categories[key] = value

    serie = {
        "name": name,
        "owner": "public",
        "display": display,
        "zone": zone,
        "category": category,
        "source": source,
        "description": description % (key),
        "data": {"categories": categories.keys(), "series": [{"data": categories.values()}]},
    }
    es.index(index_name, display, serie)

    es.refresh(index_name)
    f.close()
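
A hedged invocation sketch for load_pie: the file name, index name and labels below are made up for illustration, and CONTEXT["datahub-store"] is assumed to point at a reachable cluster, as in the original module.

# Hypothetical call; expects each line of the file to look like '18-25;{"count": 42}'.
load_pie(
    "age_pie.csv",               # hypothetical data file
    "datahub",                   # hypothetical index name
    "serie",                     # type_name (not used by the loader itself)
    "demography",                # category
    "Population by age group",   # serie name
    description="Breakdown computed for bucket %s",
)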

Example #2
def main():
    #Load the pre-trained Naive Bayes Classifier from the pickled model
    f=open('./data_set/naivebayes_trained_model.pickle')
    NBClassifier=pickle.load(f)

    #ElasticSearch- Call the es_indexer file to create 'sentiment_analysis' index and store
    #the contents of the tweet file in that Index
    
    es=ElasticSearch('http://localhost:9200/')
    es_indexer()
    ############Indexing into Elasticsearch############
    i=0
    for each in tweet_data():
        i+=1
        testTweet= each
        processedTestTweet=process_tweet(testTweet)
        sentiment=NBClassifier.classify(extract_features(build_feature_vector(processedTestTweet)))
    
        
        es.index("sentiment_analysis","document",{
                     "text": testTweet,
                     "sentiment": sentiment
                         },id=i)
    print "Indexing completed."

    es.refresh(index="sentiment_analysis")
    print "Index refreshed."

    f.close()
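
A hedged read-back sketch for the index populated above; it reuses the es client and index name from main(), while the "positive" label is only an assumed classifier output.

def count_positive_tweets(es):
    # Illustrative query: count and print tweets the classifier tagged as positive.
    query = {"query": {"match": {"sentiment": "positive"}}}
    res = es.search(query, index="sentiment_analysis", size=10)
    print "Positive tweets:", res['hits']['total']
    for hit in res['hits']['hits']:
        print hit['_source']['text']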
Example #3
class Indexer(object):
  
  def __init__(self, input):
    self.input = input
    self.es = ElasticSearch()
    self.index_name = "psim"
    self.doc_type = 'book'
    
  def delete_index(self):
    # Delete index if already found one
    try:
      self.es.delete_index(index = self.index_name)
    except Exception:
      pass
  
  def create_index(self):
    self.es.create_index(index=self.index_name, settings = self.get_index_settings())
    
  def get_index_settings(self):
    settings = {
                        "mappings": {
                           "book": {
                             "_all" : {"enabled" : "false"},       
                             "properties": {
                                "codes": {"type": "string",
                                         "term_vector": "yes",
                                         "store": "true"},
                                "pid" : {"type" : "string"},
                                "embedding": {"type": "float",
                                              "store": "true"},
                                "magnitude": {"type": "float", "store": "true"}
                             }     
                           }
                        }
               }
    return settings
  
  def documents(self):
    with open(self.input) as input_file:
      for line in input_file:
        json_doc = json.loads(line)
        yield self.es.index_op(json_doc, doc_type=self.doc_type)
    
  def index(self):
    self.delete_index()
    self.create_index()
    for chunk in bulk_chunks(self.documents(), docs_per_chunk=1000):
      self.es.bulk(chunk, index = self.index_name, doc_type = self.doc_type)
    self.es.refresh(self.index_name)
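
A short usage sketch for the Indexer class above, assuming a newline-delimited JSON file of book documents; the file name and the sample query are illustrative only.

# Hypothetical driver: "books.jsonl" holds one JSON book document per line.
indexer = Indexer("books.jsonl")
indexer.index()

# Sanity check against the freshly built index; the code value is arbitrary.
res = indexer.es.search({"query": {"match": {"codes": "A01"}}},
                        index=indexer.index_name, doc_type=indexer.doc_type)
print res['hits']['total']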
Example #4
def resource_create(request, index):
    data = json.loads(request.POST['data'])
    data['new'] = True
    # random identifier
    data['uri'] = ''.join(random.choice('0123456789ABCDEF') for i in range(32))

    # store data in elasticsearch
    es = ElasticSearch(settings.ELASTICSEARCH_URL)
    if index == 'persons':
        es.index(index, "person", data)
    elif index == 'institutes':
        es.index(index, "institute", data)
    es.refresh(index)

    return JsonResponse(data)
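
A hedged companion sketch showing how a resource created above might be read back by its random uri; the view name and the query are illustrative, while ElasticSearch, settings and JsonResponse are reused from the snippet.

def resource_lookup(request, index, uri):
    # Illustrative read path: match the stored 'uri' field and return the document.
    es = ElasticSearch(settings.ELASTICSEARCH_URL)
    res = es.search({"query": {"match": {"uri": uri}}}, index=index, size=1)
    hits = res['hits']['hits']
    return JsonResponse(hits[0]['_source'] if hits else {})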
Example #6
    def test_cluster_size_3(self):
        cluster = self._make_one(size=3)
        cluster.start()
        self.assertEqual(len(cluster), 3)
        self.assertEqual(len(cluster.hosts), 3)
        self.assertEqual(len(os.listdir(cluster.working_path)), 3)
        self.assertEqual(len(cluster.urls), 3)
        client = ElasticSearch(cluster.urls, max_retries=2)
        self.assertEqual(client.health()['number_of_nodes'], 3)
        # test if routing works and data is actually distributed across nodes
        client.create_index('test_shards', settings={
            'number_of_shards': 1,
            'number_of_replicas': 2,
        })
        client.index('test_shards', 'spam', {'eggs': 'bacon'})
        client.refresh('test_shards')
        shard_info = client.status()['indices']['test_shards']['shards']['0']
        nodes = set([s['routing']['node'] for s in shard_info])
        self.assertTrue(len(nodes) > 1)
Example #7
def load_wordcloud(
    filename,
    index_name,
    type_name,
    category,
    name,
    zone="France",
    sep=";",
    display="wordcloud",
    source="",
    description="",
):

    f = open(filename, mode="r")
    es = ElasticSearch(CONTEXT["datahub-store"])

    categories = []
    for line in f:
        key, string_value = line.split(sep, 2)
        value = cjson.decode(string_value)

        categories.append((value["label"], value["norm_count"] / 100.0))

    serie = {
        "name": name,
        "owner": "public",
        "display": display,
        "zone": zone,
        "category": category,
        "source": source,
        "description": unicode(description) % (name),
        "data": {
            "categories": map(lambda item: item[0], categories),
            "series": [{"data": map(lambda item: item[1], categories)}],
        },
    }
    es.index(index_name, display, serie)

    es.refresh(index_name)
    f.close()
                    "coordinates" : coords,  # 4, 5
                    "feature_class" : row[6],
                    "feature_code" : row[7],
                    "country_code2" : row[8],
                    "country_code3" : country_code3,
                    "cc2" : row[9],
                    "admin1_code" : row[10],
                    "admin2_code" : row[11],
                    "admin3_code" : row[12],
                    "admin4_code" : row[13],
                    "population" : row[14],
                    "elevation" : row[15],
                    "dem" : row[16],
                    "timzeone" :  row[17],
                    "modification_date" : "2014-01-01"
                   }
            yield es.index_op(doc, index='geonames', doc_type='geoname')
        except:
            count += 1

    print 'Exception count:', count


chunk_count = 0
for chunk in bulk_chunks(documents(reader, es), docs_per_chunk=500):
    es.bulk(chunk)
    chunk_count += 1
    print 'Chunk count:', chunk_count

es.refresh('geonames')
Example #9
    }
}


es.health(wait_for_status='yellow')
es.delete_index('write-ads')
es.create_index('write-ads', settings={'mappings': ad_mapping})

dateYMD = args["date"]
prepareDataFromDB(dateYMD)

dir = DATA_FILES_JSON + '/' + dateYMD
for filename in os.listdir(dir):
    if filename.endswith('.json'):
        with open(dir + '/' + filename) as open_file:
            json_docs = json.load(open_file)
            es.bulk((es.index_op(doc) for doc in json_docs),
                index='write-ads',
                doc_type='ad')

es.refresh("write-ads")

res = es.search('website:com', index='write-ads')
print("Got %d Hits for .com websites" % res['hits']['total'])
for hit in res['hits']['hits']:
    print (hit["_source"])
res = es.search('website:in', index='write-ads')
print("Got %d Hits for .in websites" % res['hits']['total'])
res = es.search('category:entertainment', index='write-ads')
print("Got %d Hits for category:Entertainment" % res['hits']['total'])
Example #10
    def save(self, force_insert=False, force_update=False, **kwargs):
        es = ElasticSearch(ELASTIC_SEARCH_URL)
        if self.id:
            location = self.get_location()
            location_es = "{0},{1}".format(location.y, location.x)
            es.update('glamazer', 'modelresult', 'listings.listing.{0}'.format(self.id),
                script="ctx._source.listing_id = listing;" +
                "ctx._source.artist_id = artist;" +
                "ctx._source.artist_avatar = artist_avatar;" +
                "ctx._source.title = title;" +
                "ctx._source.location = location;" +
                "ctx._source.description = description;" +
                "ctx._source.get_picture = get_picture;" +
                "ctx._source.metadata = metadata;" +
                "ctx._source.price = price;" +
                "ctx._source.likes = likes;" +
                "ctx._source.comments = comments;" +
                "ctx._source.tags = tags;" +
                "ctx._source.status = status;" +
                "ctx._source.style = style;" +
                "ctx._source.rating = rating",
                params={
                    'listing':self.id, 
                    'artist':self.get_artist_id(),
                    'artist_avatar':self.get_artist_avatar(),
                    'title':self.title,
                    'location':location_es,
                    'description':self.description, 
                    'get_picture':self.get_picture(),
                    'metadata':self.metadata,
                    'price':self.price,
                    'likes':self.likes,
                    'comments':self.comments,
                    'tags':self.get_tags(),
                    'status':self.status,
                    'style':self.get_style(),
                    'rating':self.get_rating()
                    })
            super(Listing, self).save(force_insert, force_update)
        else:
            super(Listing, self).save(force_insert, force_update)

            artist_user = self.artist.user
            artist_name = artist_user.first_name
            followers = Followers.objects.select_related().filter(artist=self.artist)
            for follower in followers:
                Notification.objects.create(
                    sender = artist_user,
                    receiver = follower.user,
                    time = current_time(),
                    short_text = NOTIFICATIONS_SHORT[10].format(artist=artist_name),
                    long_text = NOTIFICATIONS_LONG[10].format(artist=artist_name, listing=self.title, user_id=self.artist_id, metadata=self.id),
                )

            location = self.get_location()
            location_es = "{0},{1}".format(location.y, location.x)
            es.index('glamazer', 'modelresult', 
                {
                    'listing_id': self.id,
                    'artist_id': self.artist_id,
                    'artist_avatar':self.get_artist_avatar(),
                    'title': self.title,
                    'location': location_es,
                    'description': self.description,
                    'get_picture': self.get_picture(),
                    'metadata': self.metadata,
                    'price': self.price,
                    'likes': self.likes,
                    'comments':self.comments,
                    'tags': self.get_tags(),
                    'status':self.status,
                    'style':self.get_style(),
                    'rating':self.get_rating()
                }, id='listings.listing.{0}'.format(self.id))
            es.refresh('glamazer')
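
A hedged lookup sketch for the listings indexed by save() above; the helper name and the title query are illustrative, while the index, doc type and ELASTIC_SEARCH_URL come from the snippet.

def search_listings(question, size=10):
    # Illustrative full-text lookup against the 'glamazer' index.
    es = ElasticSearch(ELASTIC_SEARCH_URL)
    query = {"query": {"match": {"title": question}}}
    return es.search(query, index='glamazer', doc_type='modelresult', size=size)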
                "coordinates": coords,  # 4, 5
                "feature_class": row[6],
                "feature_code": row[7],
                "country_code2": row[8],
                "country_code3": country_code3,
                "cc2": row[9],
                "admin1_code": row[10],
                "admin2_code": row[11],
                "admin3_code": row[12],
                "admin4_code": row[13],
                "population": row[14],
                "elevation": row[15],
                "dem": row[16],
                "timzeone": row[17],
                "modification_date": "2014-01-01"
            }
            yield es.index_op(doc, index='geonames', doc_type='geoname')
        except:
            count += 1

    print 'Exception count:', count


chunk_count = 0
for chunk in bulk_chunks(documents(reader, es), docs_per_chunk=500):
    es.bulk(chunk)
    chunk_count += 1
    print 'Chunk count:', chunk_count

es.refresh('geonames')
Example #12
    'id': 2,
    'name': 'Jessica Coder',
    'age': 31,
    'title': 'Programmer'
}, {
    'id': 3,
    'name': 'Freddy Coder抽',
    'age': 29,
    'title': 'Office Assistant'
}]

es.bulk((es.index_op(doc, id=doc.pop('id')) for doc in docs),
        index='test',
        doc_type='test')

es.refresh('test')

res1 = es.get('test', 'test', 1)

# Full-text match; note that Chinese and English are tokenized differently.
# https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html

res8 = es.search(index='test',
                 size=2,
                 query={"query": {
                     "query_string": {
                         "query": "抽"
                     }
                 }})

# Prefix match query; accepts lowercase input only.
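
As a hedged illustration only (not part of the original snippet), a prefix query over the same test index might look like this.

res9 = es.search(index='test',
                 query={"query": {
                     "prefix": {
                         "name": "jes"   # lowercase only, per the comment above
                     }
                 }})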
Example #13
def main():
    #Open the below file for writing
    #fhand = open('./data_set/full_feature_list.txt', 'w')
    #Below is the list of tuples tweet words, sentiment
    # tweets = []
    # #featureList=[]
    # input_tweets = csv.reader(open('./data_set/full_training_dataset _main.csv', 'rb'), delimiter=',')#, quotechar='|')
    # #initial_tweets=open('./data_set/train_data.csv', 'rU')
    # #input_tweets = csv.reader((line.replace('\0','') for line in initial_tweets), delimiter=",")
    # start6=time.clock()
    # print 'start-time','->',start6
    # for row in input_tweets:
    #     sentiment = row[0]
    #     tweet = row[1]
    #     #Callin process_tweet method to pre process the tweet
    #     processed_tweet = process_tweet(tweet)
    #     #Calling build_feature_list that returns the list of feature vector
    #     featureVector = build_feature_vector(processed_tweet)
    #     #featureList.append(featureVector)
    #     # for each in featureVector:
    #     #     fhand.write(each)
    #     #     fhand.write('\n')
    #     tweets.append((featureVector, sentiment))
    # print 'end-time','->',time.clock()-start6
    # #fhand.close()

    #Extract feature vectors for all tweets. The apply_features call below takes the extract_features function
    #and the full list of tweets; extract_features picks one tweet from the list and creates its feature vector.
    # start7=time.clock()
    # print 'start-time-training-time','->',start7
    # training_set=apply_features(extract_features,tweets)
    # print type(training_set)
    # print 'end-time-training-time','->',time.clock()-start7
    #Train the Naive Bayes Classifier
    start8=time.clock()
    print 'start-time-NB','->',start8
    f=open('./data_set/naivebayes_trained_model.pickle')
    NBClassifier=pickle.load(f)
    print 'end-time-NB','->',time.clock()-start8
    #NBClassifier =  nltk.NaiveBayesClassifier.train(training_set)
    #f=open('./data_set/NBClassifier_trained_21k_tweets.pickle','wb')
    #pickle.dump(NBClassifier,f)
    #print 'end-time-NB','->',time.clock()-start8
    #Print most informative features about the classifier
    #print NBClassifier.show_most_informative_features(10)

    #Test the  classifier

    #ElasticSearch- Call the es_indexer file to create 'sentiment_analysis' index
    #es_indexer()
    es=ElasticSearch('http://localhost:9200/')
    i=0
    #testTweet = 'I will get internship happy!!!'
    for each in tweet_data():
        i+=1
        testTweet= each
        processedTestTweet=process_tweet(testTweet)
        sentiment=NBClassifier.classify(extract_features(build_feature_vector(processedTestTweet)))
        #print sentiment
    ############Indexing into Elasticsearch############
        es.index("sentiment_analysis","document",{
                     "text": testTweet,
                     "sentiment": sentiment
                         },id=i)
    print "Indexing completed."

    es.refresh(index="sentiment_analysis")
    print "Index refreshed."


    #print extract_features(tweets[0][0])
    f.close()
Example #14
class TestClient(unittest.TestCase):

    def setUp(self):
        super(TestClient, self).setUp()
        docs = []

        self.es_host = os.environ.get('ES_HOST', 'http://*****:*****')

    @mock.patch('monolith.client.util.iterweeks')
    def test_datetime_ranges(self, _mock):
        "Test datetime ranges get converted to dates."
        client = self._make_one()
        start = datetime.datetime(2012, 1, 1, 12, 34, 56)
        end = datetime.datetime(2012, 1, 31, 12, 34, 56)
        list(client('downloads_count', start, end, interval='week'))
        self.assertEqual(_mock.call_args[0][0], datetime.date(2012, 1, 1))
        assert not isinstance(_mock.call_args[0][0], datetime.datetime)
        self.assertEqual(_mock.call_args[0][1], datetime.date(2012, 1, 31))
        assert not isinstance(_mock.call_args[0][1], datetime.datetime)

    def test_date_order(self):
        # Ensure fill doesn't change date ordering.
        client = self._make_one()
        prev_date = datetime.date(2000, 1, 1)

        # Addon 1 doesn't have downloads for every month and the client will
        # fill zeroes for the missing dates.
        hits = list(client('downloads_count', START, '2012-05-01',
                           interval='month', add_on='1'))
        for hit in hits:
            d = hit['date']
            assert prev_date < d
            prev_date = d
Example #15
                return data

es = ElasticSearch('http://localhost:9200/')
es.delete_index('pet')
spider = Spider()
breeds = spider.getPetBreeds()
p = Pinyin()
for breed in breeds:
    flg = 1
    page = 1
    pet_list = []
    while(flg):
        pets = spider.getPets(breed, (page - 1) * spider.limit)
        if not pets:
            flg = 0
        else:
            page = page + 1
            for pet in pets:
                pet_obj = {}
                pet_obj['name'] = pet['name']
                pet_obj['img'] = pet['img']
                pet_obj['type'] = breed['ename'] 
                pet_list.append(pet_obj)
                #print pet['name'] + '\t' + p.get_pinyin(pet['name'], '')
    print breed['ename'] + '\n'
    if not pet_list:
        continue
    doc_type = p.get_pinyin(breed['ename'].replace('宠物', ''), '')
    es.bulk((es.index_op(pet_obj) for pet_obj in pet_list), doc_type=doc_type, index = 'pet')
es.refresh('pet')
Example #16
def scrape(url):
    """
    Hit the page, save the contents
    """

    if USE_ES:
        es = ElasticSearch(ES_URL)

    page_num = 1
    while True:
        # Replace the wildcard with the current page number
        r = requests.get(url.replace('*', str(page_num)),
            # Be a good net citizen
            headers={
                'User-Agent': 'python-wbawsearch'
            }
        )

        # We have hit the last page, stop
        if r.status_code == 404:
            return

        # Show where we are
        print r.url

        # Save text for processing
        soup = BeautifulSoup(r.text)
        tables = soup.find_all(width=625)

        # Go through each table
        for table in tables:
            # Find all <td>
            cols = table.find_all('td')

            # Treat list of <td> as matrix,
            # pluck out cells we want
            for ii in range(5):
                # First cell
                page_url = str(cols[ii + 0].find('a').get('href')).strip()

                if page_url == 'indpages/wbawnowatch.html':
                    # Empty cell
                    break

                image_thumb_url = str(cols[ii + 0].find('img').get('src')).strip()
                image_full_url = image_thumb_url.replace('th', '').strip()

                # Second cell
                titles = cols[ii + 5].find_all('font')
                title = str(titles[0].find(text=True)).strip()
                short_description = str(titles[3].string).strip()

                # Third cell
                stk = str(cols[ii + 10].find(text=True)).strip()

                # Fourth cell
                price = str(cols[ii + 15].find('font').string).strip()

                # Fifth cell
                status = str(cols[ii + 20].find('font').string).strip()

                # Skip anything already sold
                if price.lower() != 'sold' and status.lower() != 'sold':

                    print page_url

                    # Get the long description
                    long_description = scrape_page(BASE_URL + '/' + page_url)

                    if USE_ES:
                        # Get the raw stock number
                        id = stk.split('#')[1].strip()

                        # Convert images to data URIs
                        #image_full_data = image_to_uri(
                        #    image_full_url
                        #)
                        #image_thumb_data = image_to_uri(
                        #    image_thumb_url
                        #)

                        # Add the document to the index
                        es.index(
                            ES_INDEX, "watch",
                            dict(
                                id=id,
                                page_url=page_url,
                                image_thumb_url=image_thumb_url,
                                image_full_url=image_full_url,
                                title=title,
                                short_description=short_description,
                                long_description=long_description,
                                stk=stk,
                                price=price,
                                status=status,

                                #image_full_data=image_full_data,
                                #image_thumb_data=image_thumb_data
                            ), id=id
                        )
                    else:
                        # Create a new watch object
                        watch = Watch(
                            page_url=page_url,
                            image_thumb_url=image_thumb_url,
                            image_full_url=image_full_url,
                            title=title,
                            short_description=short_description,
                            long_description=long_description,
                            stk=stk,
                            price=price,
                            status=status
                        )

                        # Add and commit
                        session.add(watch)
                        session.commit()

        # Next page
        page_num += 1

        # Sleep so this doesn't crash
        time.sleep(1)

    if USE_ES:
        es.refresh(ES_INDEX)
Example #17
for term in terms:
    
    print term
    words = iterativeChildren(terms[term]['data'])
    for word in words:
        terms[term]['closure'].append(word)

    d = iterativeDev(terms[term]['data_with_develops_from'])
    for dd in d:
        terms[term]['closure_with_develops_from'].append(dd)
    
    terms[term]['closure'] = list(set(terms[term]['closure']))
    terms[term]['closure'].append(term)
    
    terms[term]['closure_with_develops_from'] = list(set(terms[term]['closure_with_develops_from']))
    terms[term]['closure_with_develops_from'].append(term)

    terms[term]['systems'] = getSystemSlims(term)
    terms[term]['organs'] = getOrganSlims(term)
    terms[term]['developmental'] = getDevelopmentSlims(term)
    
    # Indexing the data in ElasticSearch
    connection.index(index_name, doc_type_name, terms[term], id=term)
    if count % 1000 == 0:
        connection.flush(index=index_name)
    connection.refresh()
    count = count + 1

print
print "Total GO Terms indexed " + str(count)
Example #18
class SearchIndex(object):
    def __init__(self, model):
        self.es = ElasticSearch()
        self.model = model

    def put_mapping(self, index, doc_type):
        mapping = {
            doc_type: {
                "properties": {
                    "location": {
                        "type": "geo_point"
                    },
                }
            }
        }
        self.es.put_mapping(index=index, doc_type=doc_type, mapping=mapping)

    def bulk_items(self, index, doc_type):
        for m in self.model.objects.all():
            self.es.bulk([
                self.es.index_op({
                    "pk": m.pk,
                    "name": m.name,
                    "rating": m.rating,
                    "address": m.address,
                    "description": m.description,
                    "location": {
                        "lon": m.longitude,
                        "lat": m.latitude
                    }
                }),
                ],
                doc_type=doc_type,
                index=index)

    def search(self, index, question, longitude, latitude, size=10):
        #self.es.delete_index(index)
        try:
            self.es.create_index(index)
            self.put_mapping(index, "place")
            self.bulk_items(index, "place")
        except IndexAlreadyExistsError:
            pass

        query = {
            "query": {
                "function_score": {
                    "query": {
                        "bool": {
                            "should": [
                                {"match": {"name": question}},
                                {"match": {"_all": {
                                    "query": question,
                                    "operator": "or",
                                    "fuzziness": "auto",
                                    "zero_terms_query": "all"
                                    }}}
                                ]
                            }
                        },
                    "functions": [
                        {"exp": {"rating": {"origin": 5, "scale": 1, "offset": 0.1}}},
                    ]
                    }
                }
            }

        if longitude and longitude is not None:
            query['query']['function_score']['functions'] = [
                {'gauss': {
                    "location": {"origin": {"lat": latitude, "lon": longitude}, "offset": "550m", "scale": "1km"}
                    }},
                {'gauss': {
                    "location": {"origin": {"lat": latitude, "lon": longitude}, "offset": "500m", "scale": "2km"}
                    }},
            ]

        results = self.es.search(query, index=index, size=size)

        self.es.refresh()

        return results
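
A brief usage sketch for SearchIndex; Place stands in for any Django model exposing the pk, name, rating, address, description, longitude and latitude attributes that bulk_items() expects, and the coordinates are arbitrary.

# Hypothetical wiring against a Place model.
index = SearchIndex(Place)
results = index.search('places', 'coffee', longitude=13.40, latitude=52.52)
for hit in results['hits']['hits']:
    print hit['_source']['name'], hit['_source']['rating']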
Example #19
def documents_from_mails(mails):
    """Build document from mail"""
    for mail in mails:
        if 'Date' in mail.headers:  # Some mails seem broken.
            yield {
                '@source': 'stuff://',
                '@type': 'mailadmin',
                '@tags': [mail.headers['From']],
                '@fields': mail.headers,
                '@timestamp': parse_date(mail.headers['Date']),
                '@source_host': 'localhost',
                '@source_path': 'mail/admin ',
                '@message': mail.body,
                'id': mail.headers['Message-Id']
            }

if __name__ == '__main__':
    # Instantiate it with an url
    es = ElasticSearch(sys.argv[1])
    # Kibana need this kind of name
    NAME = 'logstash-2013.06.13'
    try:
        es.delete_index(NAME)
    except ElasticHttpNotFoundError:
        pass  # Nobody cares
    emails = mbox(sys.argv[2])
    for n, docs in enumerate(bulk_iterate(documents_from_mails(emails), 100)):
        es.bulk_index(NAME, 'mailadmin', docs)
        print(n)
    print es.refresh(NAME)
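
A hedged follow-up for the mbox loader above: once the documents are in, they can be queried back from the logstash-style index; the search term is arbitrary.

# Illustrative query against the index created above.
res = es.search({"query": {"match": {"@message": "meeting"}}}, index=NAME, size=20)
print res['hits']['total'], 'matching mails'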
Example #20
def getFeeds():
    print "getting feeds"
    es = ElasticSearch('http://fisensee.ddns.net:9200/')

    query = {"query": {"range": {"date": {"lte": "now-1w/w"}}}}
    oldFeeds = es.search(query, size=300, index='feeds')

    if (len(oldFeeds['hits']['hits']) != 0):
        es.bulk(
            es.delete_op(id=feed['_id'], index='feeds', doc_type='feed')
            for feed in oldFeeds['hits']['hits'])

    feedSources = FeedSource.objects.all()
    feeds = []
    defaultText = 'undefined'
    defaultDate = datetime.datetime.now().isoformat()
    utc = pytz.utc
    berlin = pytz.timezone('Europe/Berlin')
    now = datetime.datetime.today()
    dateThreshold = now - datetime.timedelta(weeks=2)

    allUrls = []
    for feedSource in feedSources:
        allUrls.append(feedSource.sourceUrl)

    urls = set(allUrls)
    for url in urls:
        source = feedparser.parse(url)
        for entry in source['items']:
            feed = {
                'title': defaultText,
                'description': defaultText,
                'link': defaultText,
                'date': defaultDate,
                'url': defaultText
            }
            if ('title' in entry):
                feed['title'] = entry['title']
            if ('description' in entry):
                feed['description'] = entry['description']
            if ('link' in entry):
                feed['link'] = entry['link']
            if ('published_parsed' in entry):
                date = datetime.datetime.fromtimestamp(
                    time.mktime(entry['published_parsed']))
                if (date < dateThreshold):
                    break
                utcDate = utc.localize(date)
                feed['date'] = utcDate.astimezone(berlin).isoformat()
            #id creation should be enough for now, but it's made to fail
            if ('title' or 'published_parsed' in entry):
                feed['id'] = base64.urlsafe_b64encode(
                    hashlib.sha256((feed['title'] +
                                    feed['date']).encode('utf8')).hexdigest())
            else:
                feed['id'] = base64.urlsafe_b64encode(
                    hashlib.sha256((feed['title']).encode('utf8')).hexdigest())
            feed['url'] = url
            feeds.append(feed)

    es.bulk((es.index_op(feed, **{'id': feed.pop('id')}) for feed in feeds),
            index='feeds',
            doc_type='feed')
    print es.refresh('feeds')
Example #21
class Elastic(DataLayer):
    """ElasticSearch data layer."""

    serializers = {
        'integer': int,
        'datetime': parse_date
    }

    def init_app(self, app):
        app.config.setdefault('ELASTICSEARCH_URL', 'http://localhost:9200/')
        app.config.setdefault('ELASTICSEARCH_INDEX', 'eve')
        self.es = ElasticSearch(app.config['ELASTICSEARCH_URL'])
        self.index = app.config['ELASTICSEARCH_INDEX']

    def _get_field_mapping(self, schema):
        """Get mapping for given field schema."""
        if schema['type'] == 'datetime':
            return {'type': 'date'}
        elif schema['type'] == 'string' and schema.get('unique'):
            return {'type': 'string', 'index': 'not_analyzed'}
        elif schema['type'] == 'string':
            return {'type': 'string'}

    def put_mapping(self, app):
        """Put mapping for elasticsearch for current schema.

        It's not called automatically now, but rather left for user to call it whenever it makes sense.
        """
        for resource, resource_config in app.config['DOMAIN'].items():
            properties = {}
            properties[config.DATE_CREATED] = self._get_field_mapping({'type': 'datetime'})
            properties[config.LAST_UPDATED] = self._get_field_mapping({'type': 'datetime'})

            for field, schema in resource_config['schema'].items():
                field_mapping = self._get_field_mapping(schema)
                if field_mapping:
                    properties[field] = field_mapping

            datasource = (resource, )  # TODO: config.SOURCES not available yet (self._datasource_ex(resource))
            mapping = {}
            mapping[datasource[0]] = {'properties': properties}
            self.es.put_mapping(self.index, datasource[0], mapping)

    def find(self, resource, req, sub_resource_lookup):
        """
        TODO: implement sub_resource_lookup
        """
        query = {
            'query': {
                'query_string': {
                    'query': request.args.get('q', '*'),
                    'default_field': request.args.get('df', '_all'),
                    'default_operator': 'AND'
                }
            }
        }

        if not req.sort and self._default_sort(resource):
            req.sort = self._default_sort(resource)

        # skip sorting when there is a query to use score
        if req.sort and 'q' not in request.args:
            query['sort'] = []
            sort = ast.literal_eval(req.sort)
            for (key, sortdir) in sort:
                sort_dict = dict([(key, 'asc' if sortdir > 0 else 'desc')])
                query['sort'].append(sort_dict)

        if req.where:
            where = json.loads(req.where)
            if where:
                query['filter'] = {
                    'term': where
                }

        if req.max_results:
            query['size'] = req.max_results

        if req.page > 1:
            query['from'] = (req.page - 1) * req.max_results

        source_config = config.SOURCES[resource]
        if 'facets' in source_config:
            query['facets'] = source_config['facets']

        try:
            args = self._es_args(resource)
            args['es_fields'] = self._fields(resource)
            return self._parse_hits(self.es.search(query, **args), resource)
        except es_exceptions.ElasticHttpError:
            return ElasticCursor()

    def find_one(self, resource, **lookup):
        args = self._es_args(resource)
        args['es_fields'] = self._fields(resource)

        if config.ID_FIELD in lookup:
            try:
                hit = self.es.get(id=lookup[config.ID_FIELD], **args)
            except es_exceptions.ElasticHttpNotFoundError:
                return

            if not hit['exists']:
                return

            doc = hit.get('fields', hit.get('_source', {}))
            doc['_id'] = hit.get('_id')
            convert_dates(doc, self._dates(resource))
            return doc
        else:
            query = {
                'query': {
                    'constant_score': {
                        'filter': {
                            'term': lookup
                        }
                    }
                }
            }

            try:
                args['size'] = 1
                docs = self._parse_hits(self.es.search(query, **args), resource)
                return docs.first()
            except es_exceptions.ElasticHttpNotFoundError:
                return None

    def find_list_of_ids(self, resource, ids, client_projection=None):
        args = self._es_args(resource)
        args['es_fields'] = self._fields(resource)
        return self._parse_hits(self.es.multi_get(ids, **args), resource)

    def insert(self, resource, doc_or_docs, **kwargs):
        ids = []
        kwargs.update(self._es_args(resource))
        for doc in doc_or_docs:
            doc.update(self.es.index(doc=doc, id=doc.get('_id'), **kwargs))
            ids.append(doc['_id'])
        self.es.refresh(self.index)
        return ids

    def update(self, resource, id_, updates):
        args = self._es_args(resource, refresh=True)
        return self.es.update(id=id_, doc=updates, **args)

    def replace(self, resource, id_, document):
        args = self._es_args(resource, refresh=True)
        args['overwrite_existing'] = True
        return self.es.index(document=document, id=id_, **args)

    def remove(self, resource, id_=None):
        args = self._es_args(resource, refresh=True)
        if id_:
            return self.es.delete(id=id_, **args)
        else:
            try:
                return self.es.delete_all(**args)
            except es_exceptions.ElasticHttpNotFoundError:
                return

    def _parse_hits(self, hits, resource):
        """Parse hits response into documents."""
        return ElasticCursor(hits, self._dates(resource))

    def _es_args(self, resource, refresh=None):
        """Get index and doctype args."""
        datasource = self._datasource(resource)
        args = {
            'index': self.index,
            'doc_type': datasource[0],
            }
        if refresh:
            args['refresh'] = refresh
        return args

    def _fields(self, resource):
        """Get projection fields for given resource."""
        datasource = self._datasource(resource)
        keys = datasource[2].keys()
        return ','.join(keys)

    def _default_sort(self, resource):
        datasource = self._datasource(resource)
        return datasource[3]

    def _dates(self, resource):
        dates = [config.LAST_UPDATED, config.DATE_CREATED]
        datasource = self._datasource(resource)
        schema = config.DOMAIN[datasource[0]]['schema']
        for field, field_schema in schema.items():
            if field_schema['type'] == 'datetime':
                dates.append(field)
        return dates
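
A minimal wiring sketch, assuming this class is plugged into Eve as its data layer in the usual way; the resource schemas and settings file are whatever the surrounding project defines.

# Hypothetical bootstrap for the Elastic data layer above.
from eve import Eve

app = Eve(data=Elastic)       # Eve instantiates the layer and calls init_app()
app.data.put_mapping(app)     # push datetime/string mappings for all resources
app.run()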
Example #22
class ElasticSearchProvider(SearchProvider):
    def __init__(self, config, db=None, authnz_wrapper=None, io_loop=None):
        self.debug = False
        self.config = config
        if db is not None:
            self.db = db
        self.syncES = ElasticSearch(
            '%(ELASTIC_SEARCH_PROTOCOL)s://%(ELASTIC_SEARCH_HOST)s:%(ELASTIC_SEARCH_PORT)s' % config
        )
        self.asyncES = ESConnection(
            host=config.get('ELASTIC_SEARCH_HOST'),
            port=config.get('ELASTIC_SEARCH_PORT'),
            io_loop=io_loop,
            protocol=config.get('ELASTIC_SEARCH_PROTOCOL'),
        )
        self.index = config.get('ELASTIC_SEARCH_INDEX')
        self.max_retries = config.get('ELASTIC_SEARCH_MAX_RETRIES')

    def activate_debug(self):
        self.debug = True

    def connect_to_db(self):
        from sqlalchemy import create_engine
        from sqlalchemy.orm import scoped_session, sessionmaker
        conn_string = self.config.get('SQLALCHEMY_CONNECTION_STRING')
        engine = create_engine(
            conn_string,
            convert_unicode=True,
            pool_size=1,
            max_overflow=0,
            echo=self.debug
        )
        maker = sessionmaker(bind=engine, autoflush=True)
        self.db = scoped_session(maker)

    def _assemble_inner_query(self, domain=None, page_filter=None):
        if page_filter and domain:
            page_prefix = '%s/%s' % (domain.url, page_filter)
        else:
            page_prefix = None

        if page_prefix:
            return {
                'prefix': {
                    'page_url': page_prefix
                }
            }
        else:
            return {
                'match_all': {}
            }

    def _assemble_outer_query(self, inner_query, filter_terms):
        return {
            'filtered': {
                'query': inner_query,
                'filter': {
                    'and': [{
                        'term': filter_term
                    } for filter_term in filter_terms]
                }
            }
        }

    def _assemble_filter_terms(self, key_id=None, domain=None):
        filter_terms = []

        if key_id:
            filter_terms.append({'keys.id': key_id})

        if domain:
            filter_terms.append({'domain_id': domain.id})

        return filter_terms

    def gen_doc(self, review):
        return {
            'keys': [{'id': violation.key_id} for violation in review.violations],
            'uuid': str(review.uuid),
            'completed_date': review.completed_date,
            'violation_count': review.violation_count,
            'page_id': review.page_id,
            'page_uuid': str(review.page.uuid),
            'page_url': review.page.url,
            'page_last_review_date': review.page.last_review_date,
            'domain_id': review.domain_id,
            'domain_name': review.domain.name,
        }

    def index_review(self, review):
        for attempt in range(self.max_retries):
            try:
                self.syncES.send_request(
                    method='POST',
                    path_components=[self.index, 'review', review.page_id],
                    body=dumps(self.gen_doc(review)),
                    encode_body=False
                )
                break
            except (Timeout, ConnectionError, ElasticHttpError, InvalidJsonResponseError) as e:
                values = review.id, review.page_id, str(e)
                logging.error('Could not index review (review_id:{0}, page_id:{1}): {2}'.format(*values))
                time.sleep(1)
                if attempt >= self.max_retries - 1:
                    raise
            else:
                raise

    def index_reviews(self, reviewed_pages, reviews_count, batch_size):
        action = {'index': {'_type': 'review'}}

        for i in range(0, reviews_count, batch_size):
            body_bits = []

            for page in reviewed_pages[i:i + batch_size]:
                doc = self.gen_doc(page.last_review)

                action['index']['_id'] = doc['page_id']

                body_bits.append(dumps(action))
                body_bits.append(dumps(doc))

            # Yes, that trailing newline IS necessary
            body = '\n'.join(body_bits) + '\n'

            self.syncES.send_request(
                method='POST',
                path_components=[self.index, '_bulk'],
                body=body,
                encode_body=False
            )

        logging.info('Done!')

    @return_future
    def get_by_violation_key_name(self, key_id, current_page=1, page_size=10, domain=None, page_filter=None, callback=None):
        def treat_response(response):
            if response.error is None:
                try:
                    hits = loads(response.body).get('hits', {'hits': []})

                    reviews_data = []
                    for hit in hits['hits']:
                        completedAt = datetime.utcfromtimestamp(hit['_source']['completed_date'])
                        reviews_data.append({
                            'uuid': hit['_source']['uuid'],
                            'page': {
                                'uuid': hit['_source']['page_uuid'],
                                'url': hit['_source']['page_url'],
                                'completedAt': completedAt
                            },
                            'domain': hit['_source']['domain_name']
                        })

                    reviews_count = hits.get('total', 0)

                    callback({
                        'reviews': reviews_data,
                        'reviewsCount': reviews_count
                    })
                except Exception as e:
                    reason = 'ElasticSearchProvider: invalid response (%s [%s])' % (type(e), e.message)
                    logging.error(reason)
                    callback({'error': {'status_code': 500, 'reason': reason}})
            else:
                reason = 'ElasticSearchProvider: erroneous response (%s [%s])' % (response.error.message, response.body)
                logging.error(reason)
                callback({'error': {'status_code': 500, 'reason': reason}})

        inner_query = self._assemble_inner_query(domain, page_filter)
        filter_terms = self._assemble_filter_terms(key_id, domain)

        query = self._assemble_outer_query(inner_query, filter_terms)

        sort_ = [{
            'completed_date': {
                'order': 'desc'
            }
        }, {
            'violation_count': {
                'order': 'desc'
            }
        }]

        source = {'query': query, 'sort': sort_}

        self.asyncES.search(
            callback=treat_response,
            index=self.index,
            type='review',
            source=source,
            page=current_page,
            size=page_size,
        )

    @return_future
    def get_domain_active_reviews(self, domain, current_page=1, page_size=10, page_filter=None, callback=None):
        def treat_response(response):
            if response.error is None:
                try:
                    hits = loads(response.body).get('hits', {'hits': []})

                    pages = []
                    for hit in hits['hits']:
                        completedAt = datetime.utcfromtimestamp(hit['_source']['completed_date'])
                        pages.append({
                            'url': hit['_source']['page_url'],
                            'uuid': hit['_source']['page_uuid'],
                            'violationCount': len(hit['_source']['keys']),
                            'completedAt': completedAt,
                            'reviewId': hit['_source']['uuid']
                        })

                    reviews_count = hits.get('total', 0)

                    callback({
                        'reviewsCount': reviews_count,
                        'pages': pages
                    })
                except Exception as e:
                    reason = 'ElasticSearchProvider: invalid response (%s [%s])' % (type(e), e.message)
                    logging.error(reason)
                    callback({'error': {'status_code': 500, 'reason': reason}})
            else:
                reason = 'ElasticSearchProvider: erroneous response (%s [%s])' % (response.error.message, response.body)
                logging.error(reason)
                callback({'error': {'status_code': 500, 'reason': reason}})

        inner_query = self._assemble_inner_query(domain=domain, page_filter=page_filter)
        filter_terms = self._assemble_filter_terms(domain=domain)

        query = self._assemble_outer_query(inner_query, filter_terms)

        sort_ = [{
            'violation_count': {
                'order': 'desc'
            }
        }, {
            'completed_date': {
                'order': 'desc'
            }
        }]

        source = {'query': query, 'sort': sort_}

        self.asyncES.search(
            callback=treat_response,
            index=self.index,
            type='review',
            source=source,
            page=current_page,
            size=page_size,
        )

    def refresh(self):
        try:
            self.syncES.refresh(index=self.index)
        except Exception as e:
            logging.error('Could not refresh index (%s)' % e)

    def get_index_settings(cls):
        return {
            'index': {
                'number_of_shards': 4
            }
        }

    def get_index_mapping(cls):
        return {
            'review': {
                'properties': {
                    'keys': {
                        'properties': {
                            'id': {
                                'type': 'integer'
                            }
                        }
                    },
                    'uuid': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                    'completed_date': {
                        'type': 'integer'
                    },
                    'violation_count': {
                        'type': 'float'
                    },
                    'page_id': {
                        'type': 'integer'
                    },
                    'page_uuid': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                    'page_url': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                    'page_last_review_date': {
                        'type': 'integer'
                    },
                    'domain_id': {
                        'type': 'integer'
                    },
                    'domain_name': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    }
                }
            }
        }

    def setup_index(self):
        try:
            settings = self.get_index_settings()
            self.syncES.create_index(index=self.index, settings=settings)
            mapping = self.get_index_mapping()
            self.syncES.put_mapping(index=self.index, doc_type='review', mapping=mapping)
            logging.info('Index %s created.' % self.index)
        except Exception as e:
            raise e

    def delete_index(self):
        try:
            self.syncES.delete_index(index=self.index)
            logging.info('Index %s deleted.' % self.index)
        except Exception as e:
            raise e

    def _get_max_page_id_from_index(self, must_have_domain_name=False):
        if must_have_domain_name:
            inner_query = {
                'constant_score': {
                    'filter': {
                        'not': {
                            'missing': {
                                'field': 'domain_name'
                            }
                        }
                    }
                }
            }
        else:
            inner_query = {
                'match_all': {}
            }

        query = {
            'query': inner_query,
            'sort': [{
                'page_id': {
                    'order': 'desc'
                }
            }]
        }

        results = self.syncES.search(query, index=self.index, doc_type='review')
        if results['hits']['total'] > 0:
            return results['hits']['hits'][0]['_id'] or 0
        return 0

    def index_all_reviews(self, keys=None, batch_size=200, replace=False):
        logging.info('Querying database...')
        self.connect_to_db()

        if keys is not None:
            keys = [k.id for k in self.db.query(Key.id).filter(Key.name.in_(keys)).all()]

        try:
            max_page_id = self._get_max_page_id_from_index(must_have_domain_name=True)
        except Exception:
            logging.error('Could not retrieve max page_id! Use with --replace (with caution)')
            return

        def apply_filters(query):
            if keys is not None:
                query = query \
                    .filter(Violation.review_id == Page.last_review_id) \
                    .filter(Violation.key_id.in_(keys))

            if not replace:
                query = query.filter(Page.id > max_page_id)

            return query.filter(Page.last_review_id != None)

        reviews_count = apply_filters(self.db.query(func.count(Page))).scalar()

        query = self.db.query(Page).options(joinedload('last_review'))
        reviewed_pages = apply_filters(query).order_by(Page.id.asc())

        logging.info('Indexing %d reviews...' % reviews_count)

        self.index_reviews(reviewed_pages, reviews_count, batch_size)

    @classmethod
    def new_instance(cls, config):
        return ElasticSearchProvider(config)

    @classmethod
    def main(cls):
        import sys

        parser = cls.argparser()
        args = parser.parse_args()

        config = {}
        host = None
        port = None
        index = None
        es = None

        levels = ['ERROR', 'WARNING', 'INFO', 'DEBUG']
        log_level = levels[args.verbose]
        logging.basicConfig(level=getattr(logging, log_level), format='%(levelname)s - %(message)s')

        if not (args.create or args.recreate or args.delete or args.keys or args.all_keys):
            parser.print_help()
            sys.exit(1)

        if args.conf:
            from derpconf.config import ConfigurationError
            from holmes.config import Config
            try:
                config = Config().load(args.conf[0])
                host = config['ELASTIC_SEARCH_HOST']
                port = config['ELASTIC_SEARCH_PORT']
                index = config['ELASTIC_SEARCH_INDEX']
            except ConfigurationError:
                logging.error('Could not load config! Use --conf conf_file')
                sys.exit(1)
            except KeyError:
                logging.error('Could not parse config! Check its contents')
                sys.exit(1)

        if args.server:
            try:
                host, port = args.server[0].split(':')
                config['ELASTIC_SEARCH_HOST'] = host
                config['ELASTIC_SEARCH_PORT'] = port
            except Exception:
                logging.error('Could not parse server host and port! Use --server host:port')
                sys.exit(1)

        if args.index:
            index = args.index[0]
            config['ELASTIC_SEARCH_INDEX'] = index

        from pyelasticsearch.exceptions import IndexAlreadyExistsError, ElasticHttpNotFoundError, InvalidJsonResponseError
        from requests.exceptions import ConnectionError
        try:

            if args.create or args.recreate or args.delete:
                if host is None or port is None:
                    logging.error('Need either a host and port or a config file to perform such operation!')
                    sys.exit(1)
                if index is None:
                    logging.error('Need either an index name or a config file to perform such operation!')
                    sys.exit(1)
                else:
                    es = cls.new_instance(config)
                    if args.recreate or args.delete:
                        try:
                            es.delete_index()
                        except ElasticHttpNotFoundError:
                            pass
                        except InvalidJsonResponseError as e:
                            logging.error('Invalid response! Reason: %s' % e)
                            sys.exit(1)
                    if args.create or args.recreate:
                        es.setup_index()

            if args.keys or args.all_keys:
                if config is None:
                    logging.error('Need a config file to perform such operation! Use --conf conf_file')
                else:
                    batch_size = args.batch_size[0] if args.batch_size else 200
                    es = cls.new_instance(config) if not es else es
                    try:
                        if args.verbose > 2:
                            es.activate_debug()
                        if args.keys:
                            es.index_all_reviews(args.keys, replace=args.replace, batch_size=batch_size)
                        elif args.all_keys:
                            es.index_all_reviews(replace=args.replace, batch_size=batch_size)
                    except InvalidJsonResponseError as e:
                        logging.error('Invalid response! Reason: %s' % e)
                        sys.exit(1)

        except IndexAlreadyExistsError:
            logging.error('Index %s already exists! Use --recreate (with caution) to recreate' % index)
        except ConnectionError:
            logging.error('Could not connect to server at %s:%s' % (host, port))
        except KeyError:
            logging.error('Could not get host nor port! Use either -conf or --server')
            sys.exit(1)
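
A hedged programmatic setup sketch for the provider above, bypassing its CLI entry point; every config value below is a placeholder.

# Hypothetical direct use of ElasticSearchProvider.
config = {
    'ELASTIC_SEARCH_PROTOCOL': 'http',
    'ELASTIC_SEARCH_HOST': 'localhost',
    'ELASTIC_SEARCH_PORT': 9200,
    'ELASTIC_SEARCH_INDEX': 'holmes',
    'ELASTIC_SEARCH_MAX_RETRIES': 3,
}

provider = ElasticSearchProvider(config)
provider.setup_index()   # create the index and put the 'review' mapping
provider.refresh()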
Example #23
0
def getFeeds():
    print "getting feeds"
    es = ElasticSearch('http://fisensee.ddns.net:9200/')

    query = {"query": {"range": {"date": {"lte": "now-1w/w"}}}}
    oldFeeds = es.search(query, size=300, index='feeds')

    if(len(oldFeeds['hits']['hits']) is not 0):
        es.bulk(es.delete_op(id=feed['_id'], index='feeds',
        doc_type='feed') for feed in oldFeeds['hits']['hits'])


    feedSources = FeedSource.objects.all()
    feeds = []
    defaultText = 'undefined'
    defaultDate = datetime.datetime.now().isoformat()
    utc = pytz.utc
    berlin = pytz.timezone('Europe/Berlin')
    now = datetime.datetime.today()
    dateThreshold = now - datetime.timedelta(weeks=2)

    allUrls = []
    for feedSource in feedSources:
        allUrls.append(feedSource.sourceUrl)

    urls = set(allUrls)
    for url in urls:
        source = feedparser.parse(url)
        for entry in source['items']:
            feed = {
                'title':defaultText,
                'description':defaultText,
                'link':defaultText,
                'date':defaultDate,
                'url': defaultText
            }
            if('title' in entry):
                feed['title'] = entry['title']
            if('description' in entry):
                feed['description'] = entry['description']
            if('link' in entry):
                feed['link'] = entry['link']
            if('published_parsed' in entry):
                date = datetime.datetime.fromtimestamp(time.mktime(entry['published_parsed']))
                if(date < dateThreshold):
                    break
                utcDate = utc.localize(date)
                feed['date'] = utcDate.astimezone(berlin).isoformat()
            # id creation should be enough for now, but it's liable to fail
            if 'title' in entry or 'published_parsed' in entry:
                feed['id'] = base64.urlsafe_b64encode(hashlib.sha256((feed['title'] + feed['date']).encode('utf8')).hexdigest())
            else:
                feed['id'] = base64.urlsafe_b64encode(hashlib.sha256((feed['title']).encode('utf8')).hexdigest())
            feed['url'] = url
            feeds.append(feed)



    es.bulk((es.index_op(feed, **{'id': feed.pop('id')}) for feed in feeds),
        index = 'feeds',
        doc_type = 'feed')
    print es.refresh('feeds')
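
getFeeds() leans on pyelasticsearch's bulk helpers: delete_op() and index_op() build individual bulk actions, and es.bulk() ships a whole generator of them in one request. A minimal, self-contained sketch of that pattern, reusing the 'feeds' index and 'feed' doc type from the code above (the URL is a placeholder):

from pyelasticsearch import ElasticSearch

def purge_and_reindex(old_ids, new_feeds, es_url='http://localhost:9200/'):
    # old_ids: iterable of document ids to drop.
    # new_feeds: iterable of dicts, each carrying its own 'id' key.
    es = ElasticSearch(es_url)

    # One bulk request deleting every expired document.
    if old_ids:
        es.bulk(es.delete_op(id=doc_id, index='feeds', doc_type='feed')
                for doc_id in old_ids)

    # One bulk request indexing the new documents under their own ids.
    es.bulk((es.index_op(feed, id=feed.pop('id')) for feed in new_feeds),
            index='feeds', doc_type='feed')

    # Make the new documents visible to search immediately.
    es.refresh('feeds')
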
Example #24
0
    def save(self, force_insert=False, force_update=False, **kwargs):
        es = ElasticSearch(ELASTIC_SEARCH_URL)
        if self.id:
            location = self.get_location()
            location_es = "{0},{1}".format(location.y, location.x)
            es.update('glamazer',
                      'modelresult',
                      'listings.listing.{0}'.format(self.id),
                      script="ctx._source.listing_id = listing;" +
                             "ctx._source.artist_id = artist;" +
                             "ctx._source.artist_avatar = artist_avatar;" +
                             "ctx._source.artist_name = artist_name;" +
                             "ctx._source.salon_id = salon;" +
                             "ctx._source.salon_avatar = salon_avatar;" +
                             "ctx._source.salon_name = salon_name;" +
                             "ctx._source.title = title;" +
                             "ctx._source.location = location;" +
                             "ctx._source.description = description;" +
                             "ctx._source.get_picture = get_picture;" +
                             "ctx._source.metadata = metadata;" +
                             "ctx._source.gender = gender;" +
                             "ctx._source.price = price;" +
                             "ctx._source.currency = currency;" +
                             "ctx._source.likes = likes;" +
                             "ctx._source.comments = comments;" +
                             "ctx._source.tags = tags;" +
                             "ctx._source.status = status;" +
                             "ctx._source.style = style;" +
                             "ctx._source.rating = rating",
                      params={
                             'listing': self.id,
                             'artist': self.get_artist_id(),
                             'artist_avatar': self.get_artist_avatar(),
                             'artist_name': self.get_artist_name(),
                             'salon': self.get_salon_id(),
                             'salon_avatar': self.get_salon_avatar(),
                             'salon_name': self.get_salon_name(),
                             'title': self.title,
                             'location': location_es,
                             'description': self.description,
                             'get_picture': self.get_picture(),
                             'metadata': self.metadata,
                             'gender': self.gender,
                             'price': self.price,
                             'currency': self.currency,
                             'likes': self.likes,
                             'comments': self.comments,
                             'tags': self.get_tags(),
                             'status': self.status,
                             'style': self.get_style(),
                             'rating': self.get_rating()
                        })
            super(Listing, self).save(force_insert, force_update)
        else:
            super(Listing, self).save(force_insert, force_update)

            location = self.get_location()
            location_es = "{0},{1}".format(location.y, location.x)
            es.index('glamazer', 'modelresult', {
                'listing_id': self.id,
                'artist_id': self.artist_id,
                'artist_avatar': self.get_artist_avatar(),
                'artist_name': self.get_artist_name(),
                'salon_id': self.get_salon_id(),
                'salon_avatar': self.get_salon_avatar(),
                'salon_name': self.get_salon_name(),
                'title': self.title,
                'location': location_es,
                'description': self.description,
                'get_picture': self.get_picture(),
                'metadata': self.metadata,
                'gender': self.gender,
                'price': self.price,
                'currency': self.currency,
                'likes': self.likes,
                'comments': self.comments,
                'tags': self.get_tags(),
                'status': self.status,
                'style': self.get_style(),
                'rating': self.get_rating()
                }, id='listings.listing.{0}'.format(self.id))
            es.refresh('glamazer')
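
A listing indexed by save() above can be fetched back from the 'glamazer' index with a term query on listing_id. A small sketch, assuming the same layout ('modelresult' doc type, document ids of the form 'listings.listing.<id>'); the function name is hypothetical:

from pyelasticsearch import ElasticSearch

def find_listing(es_url, listing_id):
    # Look the listing up by the listing_id field written in save() above.
    es = ElasticSearch(es_url)
    query = {"query": {"term": {"listing_id": listing_id}}}
    result = es.search(query, index='glamazer', doc_type='modelresult')
    hits = result['hits']['hits']
    return hits[0]['_source'] if hits else None
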
Example #25
0
class TestClient(unittest.TestCase):
    def setUp(self):
        super(TestClient, self).setUp()
        docs = []

        self.es_host = os.environ.get('ES_HOST', 'http://*****:*****')

    @mock.patch('monolith.client.util.iterweeks')
    def test_datetime_ranges(self, _mock):
        "Test datetime ranges get converted to dates."
        client = self._make_one()
        start = datetime.datetime(2012, 1, 1, 12, 34, 56)
        end = datetime.datetime(2012, 1, 31, 12, 34, 56)
        list(client('downloads_count', start, end, interval='week'))
        self.assertEqual(_mock.call_args[0][0], datetime.date(2012, 1, 1))
        assert not isinstance(_mock.call_args[0][0], datetime.datetime)
        self.assertEqual(_mock.call_args[0][1], datetime.date(2012, 1, 31))
        assert not isinstance(_mock.call_args[0][1], datetime.datetime)

    def test_date_order(self):
        # Ensure fill doesn't change date ordering.
        client = self._make_one()
        prev_date = datetime.date(2000, 1, 1)

        # Addon 1 doesn't have downloads for every month and the client will
        # fill zeroes for the missing dates.
        hits = list(
            client('downloads_count',
                   START,
                   '2012-05-01',
                   interval='month',
                   add_on='1'))
        for hit in hits:
            d = hit['date']
            assert prev_date < d
            prev_date = d
Example #26
0
class ESWrapper(BaseDB):
    def __init__(self,
                 index_name,
                 doc_type,
                 host='http://localhost',
                 port=9200):
        self.eserver = ElasticSearch(urls=host,
                                     port=port,
                                     timeout=60,
                                     max_retries=3)
        #self._base_query = {"query": {"bool": {"must": {"match": {}}}}}
        #self._base_query = {"query": {"bool": {}}}
        self._geo_filter = {"distance": "20km", "coordinates": {}}
        self._population_filter = {'population': {'gte': 5000}}
        self._index = index_name
        self._doctype = doc_type

    def getByid(self, geonameId):
        maincondition = {"match": {"id": geonameId}}
        q = {"query": {"bool": {"must": maincondition}}}
        return self.eserver.search(
            q, index=self._index,
            doc_type=self._doctype)['hits']['hits'][0]['_source']

    def _query(self, qkey, **kwargs):
        q = {"query": {"bool": {}}}
        query_name = "should"
        q["query"]["bool"]["minimum_number_should_match"] = 1
        kwargs.pop("qtype", "")

        placetokens = [
            l.strip() for l in tokenizer.split(qkey)
            if l and l not in STOP_WORDS and l[-1] != '.'
        ]
        if placetokens:
            reduced_placename = u" ".join(placetokens[0:])
            if len(placetokens[0]) < 3 and len(
                    placetokens) > 1 and 3.0 / len(placetokens) >= .5:
                reduced_placename = u" ".join(placetokens[1:])
        else:
            reduced_placename = qkey

        # print "qkey", qkey, "reduced", reduced_placename
        maincondition = [
            {
                "bool": {
                    "must": [{
                        "multi_match": {
                            "query":
                            qkey,
                            "fields":
                            ["name.raw^5", "asciiname^5", "alternatenames"],
                            "operator":
                            "and"
                        }
                    }, {
                        "terms": {
                            "featureClass": ["a", "p"]
                        }
                    }],
                }
            },
            {
                "term": {
                    "name.raw": {
                        "value": qkey
                    }
                }
            },
            {
                "term": {
                    "asciiname.raw": {
                        "value": qkey
                    }
                }
            },
            {
                "term": {
                    "normalized_asciiname": {
                        "value": qkey
                    }
                }
            },
            # {"term": {"alternatenames": {"value": qkey[1:]}}},
            {
                "term": {
                    "alternatenames": {
                        "value": qkey
                    }
                }
            },
            # {"multi_match": {"query": reduced_placename if 'fuzzy' in kwargs else unicode(unidecode(reduced_placename)),
            {
                "multi_match": {
                    "query":
                    reduced_placename if 'fuzzy' in kwargs else unicode(
                        unidecode(reduced_placename)),
                    'fuzziness':
                    kwargs.pop("fuzzy", 0),
                    "max_expansions":
                    kwargs.pop("max_expansion", 10),
                    "prefix_length":
                    kwargs.pop("prefix_length", 1),
                    'operator':
                    kwargs.pop("operator", "and"),
                    "fields": [
                        "name^3", "asciiname^3", "alternatenames",
                        "normalized_asciiname^3"
                    ]
                }
            }
        ]

        q["query"]["bool"][query_name] = maincondition

        if kwargs:
            filter_cond = []
            if 'min_popln' in kwargs:
                popln = kwargs.pop("min_popln")
                if popln is not None:
                    filter_cond.append(
                        {"range": {
                            "population": {
                                "gte": popln
                            }
                        }})

            for key, val in kwargs.viewitems():
                if not isinstance(val, basestring):
                    val = list([(v) for v in val])
                    filter_cond.append({"terms": {key: val}})
                else:
                    filter_cond.append({"term": {key: (val)}})

            q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}}

        q['from'] = 0
        q['size'] = 50
        return self.eserver.search(q,
                                   index=self._index,
                                   doc_type=self._doctype)

    def query(self, qkey, min_popln=None, **kwargs):
        #res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']['hits']
        res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']
        #max_score = sum([r['_score'] for r in res])
        max_score = res['max_score']  #sum([r['_score'] for r in res])
        #for t in res:
        # print(max_score)
        gps = []
        if max_score == 0.0:
            ## no results were obtained from Elasticsearch; instead it returned
            ## a random, very low-scoring one
            res['hits'] = []

        for t in res['hits']:
            t['_source']['geonameid'] = t["_source"]["id"]
            #t['_source']['_score'] = t[1] / max_score
            t['_source']['_score'] = t['_score'] / max_score
            pt = GeoPoint(**t["_source"])
            if t['_source']['featureCode'].lower() == "cont":
                gps = [pt]
                break

            gps.append(pt)

        if len(gps) == 1:
            gps[0]._score = (min(float(len(gps[0].name)), float(len(qkey))) /
                             max(float(len(gps[0].name)), float(len(qkey))))

        return gps

    def _oldquery(self,
                  qkey,
                  qtype="exact",
                  analyzer=None,
                  min_popln=None,
                  size=10,
                  **kwargs):
        """
        qtype values are exact, relaxed or combined
        Always limit results to 10
        """
        q = {"query": {"bool": {}}}
        query_name = kwargs.pop('query_name', 'must')
        query_name = "should"
        if query_name == "should":
            q["query"]["bool"]["minimum_number_should_match"] = 1

        maincondition = {}
        if qtype == "exact":
            maincondition = [{
                "term": {
                    "name.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "asciiname.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "alternatenames": {
                        "value": qkey
                    }
                }
            }]
            if analyzer:
                maincondition["match"]["name.raw"]["analyzer"] = analyzer

        elif qtype == "relaxed":
            maincondition["match"] = {"alternatenames": {"query": qkey}}
            if analyzer:
                maincondition["match"]["alternatenames"]["analyzer"] = analyzer

            #q["query"]["bool"][query_name]["match"].pop("name.raw", "")
        elif qtype == "combined":
            maincondition = [{
                "bool": {
                    "must": {
                        "multi_match": {
                            "query": qkey,
                            "fields":
                            ["name.raw", "asciiname", "alternatenames"]
                        }
                    },
                    "filter": {
                        "bool": {
                            "should": [{
                                "range": {
                                    "population": {
                                        "gte": 5000
                                    }
                                }
                            }, {
                                "terms": {
                                    "featureCode": [
                                        "pcla", "pcli", "cont", "rgn", "admd",
                                        "adm1", "adm2"
                                    ]
                                }
                            }]
                        }
                    }
                }
            }, {
                "term": {
                    "name.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "asciiname.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "alternatenames": {
                        "value": qkey[1:]
                    }
                }
            }, {
                "match": {
                    "alternatenames": {
                        "query": qkey,
                        'fuzziness': kwargs.pop("fuzzy", 0),
                        "max_expansions": kwargs.pop("max_expansion", 5),
                        "prefix_length": kwargs.pop("prefix_length", 1)
                    }
                }
            }]

        if maincondition:
            q["query"]["bool"][query_name] = maincondition

            if min_popln:
                filter_cond = [{"range": {"population": {"gte": min_popln}}}]
            else:
                filter_cond = []

            if kwargs:
                #filter_cond = [{"range": {"population": {"gte": min_popln}}}]
                filter_cond += [{
                    "term": {
                        key: val
                    }
                } for key, val in kwargs.viewitems()]
                # print(filter_cond)
                q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}}
            elif min_popln:
                filter_cond = [{
                    "range": {
                        "population": {
                            "gte": min_popln
                        }
                    }
                }, {
                    "terms": {
                        "featureCode": ["ppla", "pplx"]
                    }
                }]

                q["query"]["bool"]["filter"] = {
                    "bool": {
                        "should": filter_cond
                    }
                }

        return self.eserver.search(q,
                                   index=self._index,
                                   doc_type=self._doctype)

    def oldquery(self, qkey, min_popln=None, **kwargs):
        #res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']['hits']
        res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']
        #max_score = sum([r['_score'] for r in res])
        max_score = res['max_score']  #sum([r['_score'] for r in res])
        #for t in res:
        gps = []
        if max_score == 0.0:
            ## no results were obtained from Elasticsearch; instead it returned
            ## a random, very low-scoring one
            res['hits'] = []

        for t in res['hits']:
            t['_source']['geonameid'] = t["_source"]["id"]
            #t['_source']['_score'] = t[1] / max_score
            t['_source']['_score'] = t['_score'] / max_score
            pt = GeoPoint(**t["_source"])
            if t['_source']['featureCode'].lower() == "cont":
                gps = [pt]
                break

            gps.append(pt)

        if len(gps) == 1:
            gps[0]._score = (min(float(len(gps[0].name)), float(len(qkey))) /
                             max(float(len(gps[0].name)), float(len(qkey))))

        return gps

    def near_geo(self, geo_point, min_popln=5000, **kwargs):
        q2 = {
            "query": {
                "bool": {
                    "must": {
                        "match_all": {}
                    },
                    "filter": [
                        {
                            "geo_distance": {
                                "distance": "30km",
                                "coordinates": geo_point
                            }
                        },
                        {
                            "terms":
                            # {"featureCode":
                            #  ["pcli", "ppl", "ppla2", "adm3"]}
                            {
                                "featureClass": ["a", "h", "l", "t", "p", "v"]
                            }
                        }
                    ]
                }
            },
            "sort": {
                "population": "desc"
            }
        }
        if kwargs:
            for key in kwargs:
                q2['query']['bool']['filter'].append(
                    {"term": {
                        key: kwargs[key]
                    }})

        res = self.eserver.search(
            q2, index=self._index,
            doc_type=self._doctype)['hits']['hits'][0]['_source']
        res['confidence'] = 1.0
        return [GeoPoint(**res)]

    def create(self, datacsv, confDir="../data/"):
        with open(os.path.join(confDir, "es_settings.json")) as jf:
            settings = json.load(jf)
            settings['mappings'][self._doctype] = settings['mappings'].pop(
                'places')

        try:
            self.eserver.create_index(index=self._index, settings=settings)
        except:
            self.eserver.delete_index(self._index)
            self.eserver.create_index(index=self._index, settings=settings)

        for chunk in bulk_chunks(self._opLoader(datacsv, confDir),
                                 docs_per_chunk=1000):
            self.eserver.bulk(chunk, index=self._index, doc_type=self._doctype)
            print "..",

        self.eserver.refresh(self._index)

    def _opLoader(self, datacsv, confDir):
        ere = re.compile(r"[^\sa-zA-Z0-9]")
        with DataReader(datacsv, os.path.join(confDir,
                                              'geonames.conf')) as reader:
            cnt = 0
            for row in reader:
                try:
                    row['coordinates'] = [
                        float(row['longitude']),
                        float(row['latitude'])
                    ]
                    try:
                        row['population'] = int(row["population"])
                    except:
                        row['population'] = -1

                    try:
                        row['elevation'] = int(row['elevation'])
                    except:
                        row['elevation'] = -1

                    del (row['latitude'])
                    del (row['longitude'])
                    #print row['name']
                    row['alternatenames'] = row['alternatenames'].lower(
                    ).split(",")
                    row['normalized_asciiname'] = (re.sub(
                        r'\s+', r' ', ere.sub("", row['asciiname']))).strip()
                    cnt += 1
                    yield self.eserver.index_op(row,
                                                index=self._index,
                                                doc_type=self._doctype)
                except:
                    print json.dumps(row)
                    continue

    def remove_dynamic_stopwords(self, term):
        # cc = {}
        # ttl = 0
        words = [w for t in term.split("-") for w in t.split() if len(w) > 1]

        if len(words) == 1:
            return term

        stopword_removed = ""
        for word in words:
            try:
                t = self.eserver.count(word)['count']
                if t >= 20000:
                    continue
            except:
                pass

            stopword_removed += (word + " ")
            # else:
            #     print(term, "stopword ", word)

        return stopword_removed.strip()
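
A short usage sketch for the ESWrapper geocoder above; the index name, doc type and place name are placeholders, and the index is assumed to have been populated with create():

# Hypothetical usage; assumes a geonames index built by ESWrapper.create().
wrapper = ESWrapper(index_name='geonames', doc_type='places')

# Full-text lookup: returns GeoPoint objects with scores normalised by max_score.
for place in wrapper.query(u"new delhi", min_popln=5000):
    print place.name

# Reverse lookup: the most populous place within 30km of a coordinate, given
# as [longitude, latitude] to match the 'coordinates' field loaded above.
nearby = wrapper.near_geo([77.21, 28.61])
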
Example #27
0
class ElasticSearch(object):
    conn = None
    url = settings.ELASTICSEARCH_URL
    index_name = settings.ELASTICSEARCH_INDEX_NAME
    stdout = None
    stderr = None

    def __init__(self, index_name=None, stdout=None, stderr=None):
        self.conn = PyElasticSearch()
        if index_name:
            self.index_name = index_name
        if stdout:
            self.stdout = stdout
        if stderr:
            self.stderr = stderr

    def create_index(self, delete=True):
        if delete:
            try:
                self.conn.delete_index(self.index_name)
            except ElasticHttpNotFoundError as e:
                pass
        mappings = dict(
            (k, v) for k, v in get_elasticsearch_properties().items())
        self.conn.create_index(self.index_name,
                               settings={'mappings': mappings})

    def index_activity_by_id(self, activity_id):
        activity = HistoricalActivity.objects.get(pk=activity_id)
        return self.index_activity(activity)

    def delete_activity_by_id(self, activity_id):
        activity = HistoricalActivity.objects.get(pk=activity_id)
        return self.delete_activity(activity)

    def index_activity(self, activity):
        for doc_type in DOC_TYPES_ACTIVITY:
            docs = self.get_activity_documents(activity, doc_type=doc_type)
            if len(docs) > 0:
                try:
                    self.conn.bulk((self.conn.index_op(
                        doc, id=doc.pop('id'), parent=doc.pop('_parent', None))
                                    for doc in docs),
                                   index=self.index_name,
                                   doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    def index_investor(self, investor):
        for doc_type in DOC_TYPES_INVESTOR:
            docs = self.get_investor_documents(investor, doc_type=doc_type)
            if len(docs) > 0:
                try:
                    self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id'))
                                    for doc in docs),
                                   index=self.index_name,
                                   doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    def index_activity_documents(self, activity_identifiers=[]):
        activity_identifiers = activity_identifiers or HistoricalActivity.objects.filter(
            fk_status__in=(
                HistoricalActivity.STATUS_ACTIVE,
                HistoricalActivity.STATUS_PENDING,
                HistoricalActivity.STATUS_OVERWRITTEN,
                HistoricalActivity.STATUS_DELETED)).distinct().values_list(
                    'activity_identifier', flat=True).distinct()

        for doc_type in DOC_TYPES_ACTIVITY:
            docs = []
            # Collect documents
            self.stdout and self.stdout.write(
                'Collect %ss for %i deals...' %
                (doc_type, len(activity_identifiers)))
            for activity_identifier in activity_identifiers:
                for activity in self.get_activity_versions(
                        activity_identifier):
                    docs.extend(
                        self.get_activity_documents(activity,
                                                    doc_type=doc_type))
            # Bulk index documents
            self.stdout and self.stdout.write('Index %i %ss...' %
                                              (len(docs), doc_type))
            if len(docs) > 0:
                paginator = Paginator(docs, 1000)
                for page in paginator.page_range:
                    try:
                        self.conn.bulk(
                            (self.conn.index_op(doc,
                                                id=doc.pop('id'),
                                                parent=doc.pop(
                                                    '_parent', None))
                             for doc in paginator.page(page)),
                            index=self.index_name,
                            doc_type=doc_type)
                    except BulkError as e:
                        for error in e.errors:
                            msg = '%s: %s on ID %s' % (
                                error['index']['error']['type'],
                                error['index']['error']['reason'],
                                error['index']['_id'])
                            if 'caused_by' in error['index']['error']:
                                msg += ' (%s: %s)' % (error['index']['error']
                                                      ['caused_by']['type'],
                                                      error['index']['error']
                                                      ['caused_by']['reason'])
                            self.stderr and self.stderr.write(msg)
                    self.conn.refresh()

    def index_investor_documents(self):
        investors = Investor.objects.public().order_by(
            'investor_identifier', '-id').distinct('investor_identifier')

        for doc_type in DOC_TYPES_INVESTOR:
            docs = []
            # Collect documents
            self.stdout and self.stdout.write(
                'Collect %ss for %i investors...' %
                (doc_type, investors.count()))
            for investor in investors:
                docs.extend(
                    self.get_investor_documents(investor, doc_type=doc_type))
            # Bulk index documents
            self.stdout and self.stdout.write('Index %i %ss...' %
                                              (len(docs), doc_type))
            if len(docs) > 0:
                try:
                    self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id'))
                                    for doc in docs),
                                   index=self.index_name,
                                   doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    #def index_activity_by_version(self, activity_identifier):
    #    for doc_type in get_elasticsearch_properties().keys():
    #        docs = self.get_documents_for_activity_version(activity_identifier, doc_type=doc_type)
    #        if len(docs) > 0:
    #            try:
    #                self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id')) for doc in docs),
    #                    index=self.index_name,
    #                    doc_type=doc_type)
    #            except BulkError as e:
    #                for error in e.errors:
    #                    stderr and stderr.write('%s: %s (caused by %s: %s, ID: %s)' % (
    #                            error['index']['error']['type'],
    #                            error['index']['error']['reason'],
    #                            error['index']['error']['caused_by']['type'],
    #                            error['index']['error']['caused_by']['reason'],
    #                            error['index']['_id']
    #                          ))

    def get_activity_versions(self, activity_identifier):
        versions = []
        # get the newest non-pending, readable historic version:
        try:
            newest = HistoricalActivity.objects.filter(
                activity_identifier=activity_identifier,
                fk_status__in=(
                    HistoricalActivity.STATUS_ACTIVE,
                    HistoricalActivity.STATUS_OVERWRITTEN,
                    HistoricalActivity.STATUS_DELETED)).distinct().latest()
            if newest and not newest.fk_status_id == HistoricalActivity.STATUS_DELETED:
                versions.append(newest)
        except HistoricalActivity.DoesNotExist:
            newest = None

        # get newer pendings
        pendings = HistoricalActivity.objects.filter(
            activity_identifier=activity_identifier,
            fk_status_id=HistoricalActivity.STATUS_PENDING).distinct()
        if newest:
            pendings = pendings.filter(history_date__gt=newest.history_date)
        versions.extend(pendings)

        return versions

    def get_activity_documents(self, activity, doc_type='deal'):
        docs = []
        deal_attrs = {
            'id': activity.id,
            'activity_identifier': activity.activity_identifier,
            'historical_activity_id': activity.id,
            'status': activity.fk_status_id,
        }

        # Todo: Is there a nice way to prevent this extra Activity query?
        # e.g. if we save is_public/deal_scope as ActivityAttributes
        public_activity = Activity.objects.filter(
            activity_identifier=activity.activity_identifier).order_by(
                '-id').first()
        if public_activity:
            deal_attrs.update({
                'is_public':
                public_activity.is_public,
                'deal_scope':
                public_activity.deal_scope,
                'deal_size':
                public_activity.deal_size,
                'current_negotiation_status':
                public_activity.negotiation_status,
                'top_investors':
                public_activity.top_investors,
                'fully_updated_date':
                public_activity.fully_updated_date,
            })
        else:
            # Fixme: This should not happen
            self.stderr and self.stderr.write(
                _('Missing activity for historical activity %i (Activity identifier: #%i)'
                  % (activity.id, activity.activity_identifier)))
        #except Activity.MultipleObjectsReturned:
        #    # Fixme: This should not happen
        #    self.stderr and self.stderr.write(_('Too much activities for historical activity %i (Activity identifier: #%i)' % (
        #        activity.id,
        #        activity.activity_identifier
        #    )))

        for a in activity.attributes.select_related('fk_group__name').order_by(
                'fk_group__name'):
            # do not include the django object id
            if a.name == 'id':
                continue
            attribute = None
            attribute_key = '%s_attr' % a.name
            if attribute_key in get_elasticsearch_properties(
            )['deal']['properties'].keys():
                attribute = {
                    'value': a.value,
                    'value2': a.value2,
                    'date': a.date,
                    'is_current': a.is_current,
                }
            value = a.value

            # Area field?
            if a.name and 'area' in a.name and a.polygon is not None:
                # Get polygon
                #value = json.loads(a.polygon.json)
                # Apparently this is case sensitive: MultiPolygon as provided by the GeoJSON does not work
                #value['type'] = 'multipolygon'
                value = a.polygon.json or ''
            # do not include empty values
            if value is None or value == '':
                continue

            # Doc types: location, data_source or contract
            group_match = a.fk_group and a.fk_group.name or ''
            group_match = re.match(
                r'(?P<doc_type>location|data_source|contract)_(?P<count>\d+)',
                group_match)
            if group_match:
                dt, count = group_match.groupdict()['doc_type'], int(
                    group_match.groupdict()['count'])
                if doc_type == dt:
                    while len(docs) < count:
                        docs.append({
                            '_parent': activity.activity_identifier,
                            'id': a.id,  #'%i_%i' % (a.id, count),
                        })
                    docs[count - 1][a.name] = [
                        value,
                    ]
                # Set doc type counter within deal doc type (for location/data_source/contract)
                elif doc_type == 'deal':
                    # Set counter
                    key = '%s_count' % dt
                    if key not in deal_attrs.keys():
                        deal_attrs[key] = count
                    elif deal_attrs[key] < count:
                        deal_attrs[key] = count

                    # Create list with correct length to ensure formset values have the same index
                    if not a.name in deal_attrs:
                        deal_attrs[a.name] = [''] * count
                        if attribute:
                            deal_attrs[attribute_key] = [''] * count
                    else:
                        while len(deal_attrs[a.name]) < count:
                            deal_attrs[a.name].append('')
                            if attribute:
                                deal_attrs[attribute_key].append('')
                    deal_attrs[a.name][count - 1] = value
                    if attribute:
                        deal_attrs['%s_attr' % a.name][count - 1] = attribute

            # Doc type: deal and not formset
            elif doc_type == 'deal':
                if a.name in deal_attrs:
                    deal_attrs[a.name].append(value)
                    if '%s_attr' % a.name in get_elasticsearch_properties(
                    )['deal']['properties'].keys():
                        deal_attrs['%s_attr' % a.name].append(attribute)
                else:
                    deal_attrs[a.name] = [
                        value,
                    ]
                    if '%s_attr' % a.name in get_elasticsearch_properties(
                    )['deal']['properties'].keys():
                        deal_attrs['%s_attr' % a.name] = [
                            attribute,
                        ]

        if doc_type == 'deal':
            # Additionally save operational company attributes
            oc = Investor.objects.filter(
                investoractivityinvolvement__fk_activity__activity_identifier=
                activity.activity_identifier)
            if oc.count() > 0:
                oc = oc.first()
                for field in Investor._meta.fields:
                    if isinstance(field, ForeignKey):
                        deal_attrs['operational_company_%s' %
                                   field.name] = getattr(
                                       oc, '%s_id' % field.name)
                    else:
                        deal_attrs['operational_company_%s' %
                                   field.name] = getattr(oc, field.name)
            else:
                pass
                #self.stderr and self.stderr.write("Missing operational company for deal #%i" % activity.activity_identifier)

        # Create single document for each location
        # FIXME: Saving single deals for each location might be deprecated since we have doc_type location now?
        spatial_names = list(get_spatial_properties())
        for i in range(deal_attrs.get('location_count', 0)):
            doc = deal_attrs.copy()
            for name in spatial_names:
                if not name in doc:
                    continue
                if len(deal_attrs[name]) > i:
                    doc[name] = deal_attrs[name][i]
                else:
                    doc[name] = ''
            # Set unique ID for location (deals can have multiple locations)
            doc['id'] = '%s_%i' % (doc['id'], i)
            point_lat = doc.get('point_lat', None)
            point_lon = doc.get('point_lon', None)
            if point_lat and point_lon:
                # Parse values
                try:
                    parsed_lat, parsed_lon = float(point_lat), float(point_lon)
                    doc['geo_point'] = '%s,%s' % (point_lat, point_lon)
                except ValueError:
                    doc['geo_point'] = '0,0'
            else:
                doc['point_lat'] = '0'
                doc['point_lon'] = '0'
                doc['geo_point'] = '0,0'
            # FIXME: we dont really need 'point_lat' and 'point_lon' here,
            # so we should pop them from doc when adding 'geo_point'
            docs.append(doc)

        # Update docs with export values
        for doc in docs:
            doc.update(self.get_export_properties(doc, doc_type=doc_type))

        return docs

    def get_export_properties(self, doc, doc_type='deal'):
        if doc_type == 'investor':
            return ExportInvestorForm.export(doc)
        elif doc_type == 'involvement':
            return InvestorVentureInvolvementForm.export(doc)
        else:
            properties = {
                'deal_scope_export':
                doc.get('deal_scope', ''),
                'is_public_export':
                doc.get('is_public', False) and str(_('Yes')) or str(_('No')),
                'deal_size_export':
                doc.get('deal_size', ''),
                'current_negotiation_status_export':
                doc.get('current_negotiation_status', ''),
                'top_investors_export':
                doc.get('top_investors', ''),
                'fully_updated_date_export':
                doc.get('fully_updated_date', ''),
            }
            # Doc types: deal, location, contract and data_source
            for form in ChangeDealView.FORMS:
                formset_name = hasattr(form, "form") and form.Meta.name or None
                form = formset_name and form.form or form
                properties.update(form.export(doc, formset=formset_name))
            properties.update(
                ExportInvestorForm.export(doc, prefix='operational_company_'))
            return properties

    def get_investor_documents(self, investor, doc_type='investor'):
        docs = []
        # Doc types: involvement and investor
        if doc_type == 'involvement':
            ivis = InvestorVentureInvolvement.objects.filter(
                Q(fk_venture=investor) | Q(fk_investor=investor))
            for ivi in ivis:
                doc = {}
                for field in ivi._meta.local_fields:
                    if isinstance(field, ForeignKey):
                        doc[field.name] = getattr(ivi, '%s_id' % field.name)
                    else:
                        doc[field.name] = getattr(ivi, field.name)
                docs.append(doc)
        elif doc_type == 'investor':
            doc = {}
            for field in investor._meta.local_fields:
                if isinstance(field, ForeignKey):
                    doc[field.name] = getattr(investor, '%s_id' % field.name)
                else:
                    doc[field.name] = getattr(investor, field.name)
            docs.append(doc)

        # Update docs with export values
        for doc in docs:
            doc.update(self.get_export_properties(doc, doc_type=doc_type))

        return docs

    def refresh_index(self):
        self.conn.refresh(self.index_name)

    def search(self, elasticsearch_query, doc_type='deal', sort=[]):
        """ Executes paginated queries until all results have been retrieved. 
            @return: The full list of hits. """
        start = 0
        size = 10000  # 10000 is the default Elasticsearch index.max_result_window (pagination is cheap, so more is not necessarily better)
        raw_result_list = []

        done = False
        while not done:
            query = {
                'query': elasticsearch_query,
                'from': start,
                'size': size,
            }
            if sort:
                query['sort'] = sort
            query_result = self.conn.search(query,
                                            index=self.index_name,
                                            doc_type=doc_type)
            raw_result_list.extend(query_result['hits']['hits'])
            results_total = query_result['hits']['total']

            if len(raw_result_list) >= results_total:
                done = True
            else:
                start = len(raw_result_list)

        print('\nElasticsearch returned %i documents from a total of %i \n\n' %
              (len(raw_result_list), query_result['hits']['total']))
        return raw_result_list

    def delete_activity(self, activity):
        for doc_type in DOC_TYPES_ACTIVITY:
            try:
                if doc_type == 'deal':
                    self.conn.delete(id=activity.activity_identifier,
                                     index=self.index_name,
                                     doc_type=doc_type)
                else:
                    self.conn.delete_by_query(query={
                        "parent_id": {
                            "type": "deal",
                            "id": str(activity.activity_identifier),
                        }
                    },
                                              index=self.index_name,
                                              doc_type=doc_type)
            except ElasticHttpNotFoundError as e:
                pass

    def get_deals_by_activity_identifier(self,
                                         activity_identifier,
                                         doc_type='deal'):
        return self.search({
            "constant_score": {
                "filter": {
                    "term": {
                        "activity_identifier": activity_identifier
                    }
                }
            }
        })
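
A minimal driver sketch for the wrapper class above, using only methods it defines; the index name and query are placeholders, and the Django models and settings it relies on are assumed to be configured:

# Hypothetical reindex run.
es = ElasticSearch(index_name='landmatrix')

# Drop and recreate the index with the mappings from get_elasticsearch_properties().
es.create_index(delete=True)

# Index all deal and investor documents, then make them searchable.
es.index_activity_documents()
es.index_investor_documents()
es.refresh_index()

# search() wraps the query dict and paginates until all hits are returned.
deals = es.search({"term": {"is_public": True}}, doc_type='deal')
print('%i public deals indexed' % len(deals))
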
Example #28
0
    def save(self, force_insert=False, force_update=False, **kwargs):
        es = ElasticSearch(ELASTIC_SEARCH_URL)
        if self.id:
            location = self.get_location()
            location_es = "{0},{1}".format(location.y, location.x)
            es.update(
                'glamazer',
                'modelresult',
                'listings.listing.{0}'.format(self.id),
                script="ctx._source.listing_id = listing;" +
                "ctx._source.artist_id = artist;" +
                "ctx._source.artist_avatar = artist_avatar;" +
                "ctx._source.artist_name = artist_name;" +
                "ctx._source.salon_id = salon;" +
                "ctx._source.salon_avatar = salon_avatar;" +
                "ctx._source.salon_name = salon_name;" +
                "ctx._source.title = title;" +
                "ctx._source.location = location;" +
                "ctx._source.description = description;" +
                "ctx._source.get_picture = get_picture;" +
                "ctx._source.metadata = metadata;" +
                "ctx._source.gender = gender;" + "ctx._source.price = price;" +
                "ctx._source.currency = currency;" +
                "ctx._source.likes = likes;" +
                "ctx._source.comments = comments;" +
                "ctx._source.tags = tags;" + "ctx._source.status = status;" +
                "ctx._source.style = style;" + "ctx._source.rating = rating",
                params={
                    'listing': self.id,
                    'artist': self.get_artist_id(),
                    'artist_avatar': self.get_artist_avatar(),
                    'artist_name': self.get_artist_name(),
                    'salon': self.get_salon_id(),
                    'salon_avatar': self.get_salon_avatar(),
                    'salon_name': self.get_salon_name(),
                    'title': self.title,
                    'location': location_es,
                    'description': self.description,
                    'get_picture': self.get_picture(),
                    'metadata': self.metadata,
                    'gender': self.gender,
                    'price': self.price,
                    'currency': self.currency,
                    'likes': self.likes,
                    'comments': self.comments,
                    'tags': self.get_tags(),
                    'status': self.status,
                    'style': self.get_style(),
                    'rating': self.get_rating()
                })
            super(Listing, self).save(force_insert, force_update)
        else:
            super(Listing, self).save(force_insert, force_update)

            location = self.get_location()
            location_es = "{0},{1}".format(location.y, location.x)
            es.index('glamazer',
                     'modelresult', {
                         'listing_id': self.id,
                         'artist_id': self.artist_id,
                         'artist_avatar': self.get_artist_avatar(),
                         'artist_name': self.get_artist_name(),
                         'salon_id': self.get_salon_id(),
                         'salon_avatar': self.get_salon_avatar(),
                         'salon_name': self.get_salon_name(),
                         'title': self.title,
                         'location': location_es,
                         'description': self.description,
                         'get_picture': self.get_picture(),
                         'metadata': self.metadata,
                         'gender': self.gender,
                         'price': self.price,
                         'currency': self.currency,
                         'likes': self.likes,
                         'comments': self.comments,
                         'tags': self.get_tags(),
                         'status': self.status,
                         'style': self.get_style(),
                         'rating': self.get_rating()
                     },
                     id='listings.listing.{0}'.format(self.id))
            es.refresh('glamazer')
Example #29
0
class ESWrapper(BaseDB):
    def __init__(self,
                 index_name,
                 doc_type,
                 host='http://localhost',
                 port=9200):
        self.eserver = ElasticSearch(urls=host,
                                     port=port,
                                     timeout=60,
                                     max_retries=3)
        #self._base_query = {"query": {"bool": {"must": {"match": {}}}}}
        #self._base_query = {"query": {"bool": {}}}
        self._geo_filter = {"distance": "20km", "coordinates": {}}
        self._population_filter = {'population': {'gte': 5000}}
        self._index = index_name
        self._doctype = doc_type

    def getByid(self, geonameId):
        maincondition = {"match": {"id": geonameId}}
        q = {"query": {"bool": {"must": maincondition}}}
        return self.eserver.search(
            q, index=self._index,
            doc_type=self._doctype)['hits']['hits'][0]['_source']

    def _query(self,
               qkey,
               qtype="exact",
               analyzer=None,
               min_popln=None,
               size=10,
               **kwargs):
        """
        qtype values are exact, relaxed or combined
        Always limit results to 10
        """
        q = {"query": {"bool": {}}}
        query_name = kwargs.pop('query_name', 'must')
        query_name = "should"
        if query_name == "should":
            q["query"]["bool"]["minimum_number_should_match"] = 1

        maincondition = {}
        if qtype == "exact":
            maincondition = [{
                "term": {
                    "name.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "asciiname.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "alternatenames": {
                        "value": qkey
                    }
                }
            }]
            #maincondition["match"] = {"name.raw": {"query": qkey}}
            if analyzer:
                maincondition["match"]["name.raw"]["analyzer"] = analyzer

        elif qtype == "relaxed":
            maincondition["match"] = {"alternatenames": {"query": qkey}}
            if analyzer:
                maincondition["match"]["alternatenames"]["analyzer"] = analyzer

            #q["query"]["bool"][query_name]["match"].pop("name.raw", "")
        elif qtype == "combined":
            maincondition = [{
                "bool": {
                    "must": {
                        "multi_match": {
                            "query": qkey,
                            "fields":
                            ["name.raw", "asciiname", "alternatenames"]
                        }
                    },
                    "filter": {
                        "bool": {
                            "should": [{
                                "range": {
                                    "population": {
                                        "gte": 5000
                                    }
                                }
                            }, {
                                "terms": {
                                    "featureCode": [
                                        "pcla", "pcli", "cont", "rgn", "admd",
                                        "adm1", "adm2"
                                    ]
                                }
                            }]
                        }
                    }
                }
            }, {
                "term": {
                    "name.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "asciiname.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "alternatenames": {
                        "value": qkey[1:]
                    }
                }
            }, {
                "match": {
                    "alternatenames": {
                        "query": qkey,
                        'fuzziness': kwargs.pop("fuzzy", 0),
                        "max_expansions": kwargs.pop("max_expansion", 5),
                        "prefix_length": kwargs.pop("prefix_length", 1)
                    }
                }
            }]

        if maincondition:
            q["query"]["bool"][query_name] = maincondition

            if min_popln:
                filter_cond = [{"range": {"population": {"gte": min_popln}}}]
            else:
                filter_cond = []

            if kwargs:
                #filter_cond = [{"range": {"population": {"gte": min_popln}}}]
                filter_cond += [{
                    "term": {
                        key: val
                    }
                } for key, val in kwargs.viewitems()]
                # print(filter_cond)
                q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}}
            elif min_popln:
                filter_cond = [{
                    "range": {
                        "population": {
                            "gte": min_popln
                        }
                    }
                }, {
                    "terms": {
                        "featureCode": ["ppla", "pplx"]
                    }
                }]

                q["query"]["bool"]["filter"] = {
                    "bool": {
                        "should": filter_cond
                    }
                }

        return self.eserver.search(q,
                                   index=self._index,
                                   doc_type=self._doctype)

    def query(self, qkey, min_popln=None, **kwargs):
        #res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']['hits']
        res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']
        #max_score = sum([r['_score'] for r in res])
        max_score = res['max_score']  #sum([r['_score'] for r in res])
        #for t in res:
        gps = []
        if max_score == 0.0:
            ## no results were obtained from Elasticsearch; instead it returned
            ## a random, very low-scoring one
            res['hits'] = []

        for t in res['hits']:
            t['_source']['geonameid'] = t["_source"]["id"]
            #t['_source']['_score'] = t[1] / max_score
            t['_source']['_score'] = t['_score'] / max_score
            pt = GeoPoint(**t["_source"])
            if t['_source']['featureCode'].lower() == "cont":
                gps = [pt]
                break

            gps.append(pt)

        if len(gps) == 1:
            gps[0]._score = (min(float(len(gps[0].name)), float(len(qkey))) /
                             max(float(len(gps[0].name)), float(len(qkey))))

        return gps

    def near_geo(self, geo_point, min_popln=5000, **kwargs):
        q2 = {
            "query": {
                "bool": {
                    "must": {
                        "match_all": {}
                    },
                    "filter": [{
                        "geo_distance": {
                            "distance": "30km",
                            "coordinates": geo_point
                        }
                    }, {
                        "terms": {
                            "featureCode": ["pcli", "ppl", "ppla2", "adm3"]
                        }
                    }]
                }
            },
            "sort": {
                "population": "desc"
            }
        }

        res = self.eserver.search(q2,
                                  index=self._index,
                                  doc_type=self._doctype,
                                  **kwargs)['hits']['hits'][0]['_source']
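        # Hits are sorted by population (descending), so this is the most
        # populous place within 30km; report it with full confidence.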
        res['confidence'] = 1.0
        return [GeoPoint(**res)]

    def create(self, datacsv, confDir="../data/"):
        with open(os.path.join(confDir, "es_settings.json")) as jf:
            settings = json.load(jf)
            # The settings file defines its mapping under 'places'; rename it to
            # the doc_type this wrapper was configured with.
            settings['mappings'][self._doctype] = settings['mappings'].pop(
                'places')

        try:
            self.eserver.create_index(index=self._index, settings=settings)
        except Exception:
            # The index most likely already exists: drop it and recreate it
            # with the new settings.
            self.eserver.delete_index(self._index)
            self.eserver.create_index(index=self._index, settings=settings)

        for chunk in bulk_chunks(self._opLoader(datacsv, confDir),
                                 docs_per_chunk=1000):
            self.eserver.bulk(chunk, index=self._index, doc_type=self._doctype)
            print "..",

        self.eserver.refresh(self._index)

    def _opLoader(self, datacsv, confDir):
        with DataReader(datacsv, os.path.join(confDir,
                                              'geonames.conf')) as reader:
            for row in reader:
                try:
                    # GeoJSON ordering: [longitude, latitude].
                    row['coordinates'] = [
                        float(row['longitude']),
                        float(row['latitude'])
                    ]
                    try:
                        row['population'] = int(row["population"])
                    except (ValueError, TypeError):
                        row['population'] = -1

                    try:
                        row['elevation'] = int(row['elevation'])
                    except (ValueError, TypeError):
                        row['elevation'] = -1

                    del row['latitude']
                    del row['longitude']
                    row['alternatenames'] = row['alternatenames'].split(",")
                    yield self.eserver.index_op(row,
                                                index=self._index,
                                                doc_type=self._doctype)
                except Exception:
                    # Log the offending row and keep indexing the rest.
                    print json.dumps(row)
                    continue
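A minimal driver sketch for the wrapper above (the class name and constructor are not shown in this excerpt, so GeonamesGazetteer and its argument are assumptions made purely for illustration):

# Sketch only: the class name and constructor below are assumptions, since the
# excerpt above starts mid-class; the method calls mirror the code shown above.
if __name__ == "__main__":
    gdb = GeonamesGazetteer(index_name="geonames")      # hypothetical constructor
    gdb.create("allCountries.txt", confDir="../data/")  # bulk-load a GeoNames dump

    # Forward geocoding: place name -> scored GeoPoint candidates.
    for pt in gdb.query("Blacksburg", min_popln=5000):
        print pt.name

    # Reverse lookup: most populous known place within 30km of [lon, lat].
    print gdb.near_geo([-80.41, 37.23])[0].name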
Example #30
0
class ESWrapper(BaseDB):
    def __init__(self, index_name, host='http://localhost', port=9200):
        self.eserver = ElasticSearch(urls=host,
                                     port=port,
                                     timeout=60,
                                     max_retries=3)
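        # Reusable query/filter templates; query() copies _base_query and
        # near_geo() rebuilds the geo filter, so these stay unmodified between calls.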
        self._base_query = {
            "query": {
                "bool": {
                    "must": {
                        "match": {
                            "name.raw": ""
                        }
                    }
                }
            }
        }
        self._geo_filter = {
            "geo_distance": {
                "distance": "20km",
                "coordinates": {}
            }
        }
        self._index = index_name
        self._doctype = "places"

    def query(self, qkey, qtype="exact"):
        """
        qtype values are exact, relaxed or geo_distance
        """
        # Deep-copy the template (requires `import copy` at module level): the
        # branches below mutate nested dicts, and a shallow dict.copy() would
        # silently corrupt the shared self._base_query.
        q = copy.deepcopy(self._base_query)
        if qtype == "exact":
            q["query"]["bool"]["must"]["match"]["name.raw"] = qkey
        elif qtype == "relaxed":
            q["query"]["bool"]["must"]["match"]["name"] = qkey
            q["query"]["bool"]["must"]["match"].pop("name.raw")
        elif qtype == "geo_distance":
            q = {
                "query": {
                    "bool": {
                        "must": {
                            "match_all": {}
                        }
                    },
                    "filter": {
                        "geo_distance": {
                            "distance": "20km",
                            "coordinates": qkey
                        }
                    }
                }
            }

        return self.eserver.search(q,
                                   index=self._index,
                                   doc_type=self._doctype)

    def near_geo(self, geo_point):
        # Build the geo filter inline so the shared self._geo_filter template is
        # never mutated and the coordinates end up inside the filter clause that
        # is actually sent to Elasticsearch.
        q = {
            "query": {
                "bool": {
                    "must": {
                        "match_all": {}
                    },
                    "filter": {
                        "geo_distance": {
                            "distance": self._geo_filter["geo_distance"]["distance"],
                            "coordinates": geo_point
                        }
                    }
                }
            }
        }
        return self.eserver.search(q,
                                   index=self._index,
                                   doc_type=self._doctype)

    def create(self, datacsv, confDir="../data/"):
        with open(os.path.join(confDir, "es_settings.json")) as jf:
            settings = json.load(jf)

        self.eserver.create_index(index=self._index, settings=settings)
        for chunk in bulk_chunks(self._opLoader(datacsv, confDir),
                                 docs_per_chunk=1000):
            self.eserver.bulk(chunk, index=self._index, doc_type=self._doctype)
            print "..",

        self.eserver.refresh(self._index)

    def _opLoader(self, datacsv, confDir):
        with DataReader(datacsv, os.path.join(confDir,
                                              'geonames.conf')) as reader:
            for row in reader:
                # GeoJSON ordering: [longitude, latitude].
                row['coordinates'] = [
                    float(row['longitude']),
                    float(row['latitude'])
                ]
                del row['latitude']
                del row['longitude']
                row['alternatenames'] = row['alternatenames'].split(",")
                yield self.eserver.index_op(row,
                                            index=self._index,
                                            doc_type=self._doctype)