Code Example #1
def IndexData(request):
    es = ElasticSearch(settings.ELASTIC_SEARCH)
    for file in fileHolder:
        index = file['segment_name'].lower()
        rawfiles = file['rawfiles']
        data_for_es = file['dataFrames']
        try:
            es.delete_index(index.replace(" ", ""))
        except:
            pass
        es.create_index(index.replace(" ", ""))

        ## Loop over the dataframe and index each record into Elasticsearch
        docs = json.loads(data_for_es.to_json(orient='records'))
        es.bulk((es.index_op(doc) for doc in docs),
                index=index.replace(" ", ""),
                doc_type=index)

        ## Create segment template
        file_names = []
        for rawfile in rawfiles:
            file_names.append(rawfile.name)

        segment = Segments(name=index,
                           files_added=",".join(file_names),
                           es_index=index.replace(" ", ""))
        segment.save()

    segment = Segments.objects.get(name=index)

    return render(request, 'analyse.html', {'segment': segment})
Code Example #2
File: add_documents.py Project: brooksgod/memex
def add_document(entries):
    es_server = 'http://localhost:9200/'
    if os.environ.get('ELASTICSEARCH_SERVER'):
        es_server = os.environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    es.bulk([es.index_op(doc) for doc in entries],
            index='memex',
            doc_type='page')
Code Example #3
File: add_documents.py Project: ViDA-NYU/memex
def add_document(entries):
    es_server = 'http://localhost:9200/'
    if os.environ.get('ELASTICSEARCH_SERVER'):
        es_server = os.environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    es.bulk([es.index_op(doc) for doc in entries],
            index=os.environ['ELASTICSEARCH_INDEX'] if os.environ.get('ELASTICSEARCH_INDEX') else 'memex', 
            doc_type=os.environ['ELASTICSEARCH_DOC_TYPE'] if os.environ.get('ELASTICSEARCH_DOC_TYPE') else 'page')
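Examples #2 and #3 differ only in that the second reads the index and doc type from the environment. A hypothetical way to exercise that path, assuming the add_document above is importable (the document fields here are purely illustrative):

import os

# Fallbacks inside add_document are 'memex' / 'page' when these are unset.
os.environ['ELASTICSEARCH_SERVER'] = 'http://localhost:9200/'
os.environ['ELASTICSEARCH_INDEX'] = 'memex'
os.environ['ELASTICSEARCH_DOC_TYPE'] = 'page'

add_document([{'url': 'http://example.com/', 'title': 'Example page'}])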
Code Example #4
File: bulk_index_docs.py Project: dapurv5/es-scripts
class Indexer(object):
  
  def __init__(self, input):
    self.input = input
    self.es = ElasticSearch()
    self.index_name = "psim"
    self.doc_type = 'book'
    
  def delete_index(self):
    # Delete index if already found one
    try:
      self.es.delete_index(index = self.index_name)
    except Exception:
      pass
  
  def create_index(self):
    self.es.create_index(index=self.index_name, settings = self.get_index_settings())
    
  def get_index_settings(self):
    settings = {
      "mappings": {
        "book": {
          "_all": {"enabled": "false"},
          "properties": {
            "codes": {"type": "string",
                      "term_vector": "yes",
                      "store": "true"},
            "pid": {"type": "string"},
            "embedding": {"type": "float",
                          "store": "true"},
            "magnitude": {"type": "float", "store": "true"}
          }
        }
      }
    }
    return settings
  
  def documents(self):
    with open(self.input) as input_file:
      for line in input_file:
        json_doc = json.loads(line)
        yield self.es.index_op(json_doc, doc_type=self.doc_type)
    
  def index(self):
    self.delete_index()
    self.create_index()
    for chunk in bulk_chunks(self.documents(), docs_per_chunk=1000):
      self.es.bulk(chunk, index = self.index_name, doc_type = self.doc_type)
    self.es.refresh(self.index_name)
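A minimal sketch of driving the Indexer above, assuming its input is a newline-delimited JSON file with one document per line, which is what documents() expects (the file name is hypothetical):

indexer = Indexer('books.jsonl')
# Drops any existing 'psim' index, recreates it, bulk-loads in chunks of 1000 docs, then refreshes.
indexer.index()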
Code Example #5
File: add_documents.py Project: ViDA-NYU/memex
def update_document(entries):
    es_server = 'http://localhost:9200/'
    if os.environ.get('ELASTICSEARCH_SERVER'):
        es_server = os.environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)
    
    # es.update(index=os.environ['ELASTICSEARCH_INDEX'] if os.environ.get('ELASTICSEARCH_INDEX') else 'memex', 
    #           doc_type=os.environ['ELASTICSEARCH_DOC_TYPE'] if os.environ.get('ELASTICSEARCH_DOC_TYPE') else 'page',
    #           id=url,
    #           script=doc,
    #           upsert=True
    #       )
    es.bulk([es.update_op(doc, id=doc['url'], upsert=True) for doc in entries],
            index=os.environ['ELASTICSEARCH_INDEX'] if os.environ.get('ELASTICSEARCH_INDEX') else 'memex', 
            doc_type=os.environ['ELASTICSEARCH_DOC_TYPE'] if os.environ.get('ELASTICSEARCH_DOC_TYPE') else 'page')
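Because update_op takes doc['url'] as the document id and upserts, every entry passed to update_document must carry its own 'url'. A hypothetical call (field names other than 'url' are illustrative):

update_document([
    {'url': 'http://example.com/page1', 'title': 'Example page', 'retrieved': '2016-01-01'},
    {'url': 'http://example.com/page2', 'title': 'Another page', 'retrieved': '2016-01-02'},
])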
Code Example #6
def update_document(entries):
    es_server = 'http://localhost:9200/'
    if os.environ.get('ELASTICSEARCH_SERVER'):
        es_server = os.environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)

    # es.update(index=os.environ['ELASTICSEARCH_INDEX'] if os.environ.get('ELASTICSEARCH_INDEX') else 'memex',
    #           doc_type=os.environ['ELASTICSEARCH_DOC_TYPE'] if os.environ.get('ELASTICSEARCH_DOC_TYPE') else 'page',
    #           id=url,
    #           script=doc,
    #           upsert=True
    #       )
    es.bulk([es.update_op(doc, id=doc['url'], upsert=True) for doc in entries],
            index=os.environ['ELASTICSEARCH_INDEX']
            if os.environ.get('ELASTICSEARCH_INDEX') else 'memex',
            doc_type=os.environ['ELASTICSEARCH_DOC_TYPE']
            if os.environ.get('ELASTICSEARCH_DOC_TYPE') else 'page')
Code Example #7
File: ESSession.py Project: visgence/teleceptor
    def commit(self):
        if len(self.buffer) > 0:
            logging.debug("Inserting {} to elasticsearch".format(len(self.buffer)))

            es = ElasticSearch(ELASTICSEARCH_URI)

            docs = []
            for doc in self.buffer:
                t = time.gmtime(int(doc['@timestamp']/1000))
                index = ELASTICSEARCH_INDEX + "-" + str(t.tm_year).zfill(2) + "." + str(t.tm_mon).zfill(2) + "." + str(t.tm_mday).zfill(2)
                docs.append(es.index_op(doc, index=index, doc_type=ELASTICSEARCH_DOC))
            if len(docs) > 0:
                try:
                    es.bulk(docs)
                    logging.debug("inserted %d records" % (len(docs)))
                    self.buffer = []
                except Exception as e:
                    logging.error("Insert Exception " + str(e))
Code Example #8
File: signals.py Project: drager/toerh
def update_index(sender, created, **kwargs):
    """
    A signal for indexing new coffeehouses
    upon creation
    """
    es = ElasticSearch()
    if created:
        m = sender.objects.last()
        es.bulk([
            es.index_op({
                "pk": m.pk,
                "name": m.name,
                "rating": m.rating,
                "location": {
                    "lon": m.position.longitude,
                    "lat": m.position.latitude
                }
            }),
            ],
            doc_type="place",
            index="toerh_coffee")
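The handler above is written as a Django post_save receiver; wiring it up might look like the sketch below (CoffeeHouse is a placeholder for whatever model the project actually registers, not a name taken from the source):

from django.db.models.signals import post_save

post_save.connect(update_index, sender=CoffeeHouse)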
Code Example #9
File: elk.py Project: kamakazikamikaze/patheng
def bulkpush(sendto, offline, queue, errorqueue, debug=False):
    '''
    Send data in a bulk document to target ElasticSearch clusters
    If a cluster is unreachable, data will be offloaded to a temporary directory until it is back online

    Keyword arguments:
    sendto -- list of online clusters to send data to
    offline -- list of offline clusters to withhold data for
    queue -- multiprocessing queue of documents ready to be sent
    errorqueue -- multiprocessing queue of error documents ready to be sent
    '''
    docs = []
    errordocs = []
    while not queue.empty():
        docs.append(queue.get())
    while not errorqueue.empty():
        errordocs.append(errorqueue.get())
    for cluster in sendto:
        # if debug:
        #	pprint(cluster)
        es = ElasticSearch(cluster['url'])
        if docs:
            r = es.bulk((es.index_op(doc) for doc in docs),
                        index=cluster['data index'],
                        doc_type=cluster['data index type'])
        if errordocs:
            r = es.bulk((es.index_op(doc) for doc in errordocs),
                        index=cluster['error index'],
                        doc_type=cluster['error index type'])
        if debug:  # TODO: add try/except statement with informative errors
            if r['errors'] == False:  # TODO: dump data to be sent next time the script is run
                print('\n\t', 'Bulk package was received by', cluster['name'])
            else:
                print('\n\t', 'Bulk package was not accepted by',
                      cluster['name'])
    if offline:
        _localoffload(offline=offline,
                      docs=docs,
                      errordocs=errordocs,
                      debug=debug)
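The cluster dictionaries read inside bulkpush need 'name', 'url' and the four index/type keys. A minimal sketch of a call, with illustrative values and standard multiprocessing queues:

from multiprocessing import Queue

queue, errorqueue = Queue(), Queue()
queue.put({'host': 'sw01', 'status': 'ok'})  # illustrative document

clusters = [{
    'name': 'primary',
    'url': 'http://localhost:9200/',
    'data index': 'metrics',
    'data index type': 'doc',
    'error index': 'metrics-errors',
    'error index type': 'doc',
}]

bulkpush(sendto=clusters, offline=[], queue=queue, errorqueue=errorqueue, debug=True)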
Code Example #10
                    "coordinates" : coords,  # 4, 5
                    "feature_class" : row[6],
                    "feature_code" : row[7],
                    "country_code2" : row[8],
                    "country_code3" : country_code3,
                    "cc2" : row[9],
                    "admin1_code" : row[10],
                    "admin2_code" : row[11],
                    "admin3_code" : row[12],
                    "admin4_code" : row[13],
                    "population" : row[14],
                    "elevation" : row[15],
                    "dem" : row[16],
                    "timzeone" :  row[17],
                    "modification_date" : "2014-01-01"
                   }
            yield es.index_op(doc, index='geonames', doc_type='geoname')
        except:
            count += 1

    print 'Exception count:', count


chunk_count = 0
for chunk in bulk_chunks(documents(reader, es), docs_per_chunk=500):
    es.bulk(chunk)
    chunk_count += 1
    print 'Chunk count:', chunk_count

es.refresh('geonames')
Code Example #11
    def __init__(self,start,**kwargs):
        """
        Invoke a Downloader object to get data from
        the Record. It will check to see if the necessary
        files are already downloaded and use those instead of
        querying FDSys. Downloaders are the endpoint for raw data.

        Required arguments:

        start : In form 'YYYY-MM-DD.' This is the day/start day you want.

        Optional arguments:

        parse : Defaults to True. This tells the downloader whether you just want
                the raw files, or if you also want it to extract data from the HTML.
                (Default means yes, give me the data.)


        end : Same form as start. This is the end date.

        outpath : Output path RELATIVE TO the present working directory. Defaults
                  to 'output' and works fine when you run it from the repo's root
                  directory.

        do_mode : Specify what kind of data you want from the parser.
                  If do_mode is not set, the downloader will do absolutely zilch.
                  do_mode can take the following values:

                  json : write json files in a /json directory for that
                         day of the Record.

                  es : Specify the URL and index of an ElasticSearch cluster with
                       arguments es_url and index, and it will pass each file to
                       that cluster for indexing. WARNING: This doesn't handle any
                       mappings, and it doesn't check to see if records are already
                       there, so it will overwrite old files in the same index
                       WITHOUT versioning.

                       also specify:
                       es_url : ElasticSearch cluster url
                       index  : ElasticSearch cluster index

                  yield : For each day of the Record the user specifies,
                          the downloader acts like a generator, yielding that day's
                          "crfile" dictionary.
        """

        self.status = 'idle'
        logging.debug('Downloader object ready with params:')
        logging.debug(','.join(['='.join([key,value]) for key,value in kwargs.items()]))
        if 'outpath' in kwargs.keys():
            outpath = kwargs['outpath']
        else:
            outpath = 'output'
        if kwargs['do_mode'] == 'es':
            es = ElasticSearch(kwargs['es_url'])
            for chunk in bulk_chunks((es.index_op(crfile.crdoc,id=crfile.crdoc.pop('id')) for crfile
                                        in self.bulkdownload(start,**kwargs)),
                                        docs_per_chunk=100):
                es.bulk(chunk,index=kwargs['index'],doc_type='crdoc')
        elif kwargs['do_mode'] == 'json':
            # outpath called so often to make it easy to follow
            # the idea that we're traversing a directory tree
            for crfile in self.bulkdownload(start,**kwargs):
                filename = os.path.split(crfile.filepath)[-1].split('.')[0] + '.json'
                outpath = os.path.split(crfile.filepath)[0]
                outpath = os.path.split(outpath)[0]
                if 'json' not in os.listdir(outpath):
                    os.mkdir(os.path.join(outpath,'json'))
                outpath = os.path.join(outpath,'json',filename)
                with open(outpath,'w') as out_json:
                    json.dump(crfile.crdoc,out_json)
        elif kwargs['do_mode'] == 'yield':
            self.yielded = self.bulkdownload(start,parse=True,**kwargs)
        elif kwargs['do_mode'] == 'noparse':
            self.bulkdownload(start,parse=False,**kwargs)

        else:
            return None
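Going by the docstring, an Elasticsearch run of this downloader passes do_mode='es' plus es_url and index. A hypothetical invocation, assuming the __init__ above belongs to a class named Downloader and the rest of the class (bulkdownload, etc.) is present; all argument values are illustrative:

dl = Downloader('2017-01-03',
                end='2017-01-04',
                do_mode='es',
                es_url='http://localhost:9200/',
                index='congressionalrecord')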
Code Example #12
File: fill.py Project: rainum/dronevision
    }
}
es.create_index(ELASTICSEARCH_INDEX, settings=index_settings)

for filename in FILES:
    print "Processing %s" % filename

    sf = shapefile.Reader(filename)

    shapes = sf.shapes()
    for i, shape in enumerate(shapes, start=1):
        points = [(p[0], p[1]) for p in shape.points]

        data = {
            'filename': filename,
            'location': {
                'type': 'polygon',
                'coordinates': [points]
            }
        }

        if points[-1] != points[0]:
            points.append(points[0])

        try:
            es.bulk([es.index_op(data)],
                    doc_type=ELASTICSEARCH_DOC,
                    index=ELASTICSEARCH_INDEX)
        except:
            print "Exception"
Code Example #13
File: timer.py Project: fiesensee/CompREST
def getFeeds():
    print "getting feeds"
    es = ElasticSearch('http://fisensee.ddns.net:9200/')

    query = {"query": {"range": {"date": {"lte": "now-1w/w"}}}}
    oldFeeds = es.search(query, size=300, index='feeds')

    if len(oldFeeds['hits']['hits']) != 0:
        es.bulk(
            es.delete_op(id=feed['_id'], index='feeds', doc_type='feed')
            for feed in oldFeeds['hits']['hits'])

    feedSources = FeedSource.objects.all()
    feeds = []
    defaultText = 'undefined'
    defaultDate = datetime.datetime.now().isoformat()
    utc = pytz.utc
    berlin = pytz.timezone('Europe/Berlin')
    now = datetime.datetime.today()
    dateThreshold = now - datetime.timedelta(weeks=2)

    allUrls = []
    for feedSource in feedSources:
        allUrls.append(feedSource.sourceUrl)

    urls = set(allUrls)
    for url in urls:
        source = feedparser.parse(url)
        for entry in source['items']:
            feed = {
                'title': defaultText,
                'description': defaultText,
                'link': defaultText,
                'date': defaultDate,
                'url': defaultText
            }
            if ('title' in entry):
                feed['title'] = entry['title']
            if ('description' in entry):
                feed['description'] = entry['description']
            if ('link' in entry):
                feed['link'] = entry['link']
            if ('published_parsed' in entry):
                date = datetime.datetime.fromtimestamp(
                    time.mktime(entry['published_parsed']))
                if (date < dateThreshold):
                    break
                utcDate = utc.localize(date)
                feed['date'] = utcDate.astimezone(berlin).isoformat()
            #id creation should be enough for now, but it's made to fail
            if 'title' in entry or 'published_parsed' in entry:
                feed['id'] = base64.urlsafe_b64encode(
                    hashlib.sha256((feed['title'] +
                                    feed['date']).encode('utf8')).hexdigest())
            else:
                feed['id'] = base64.urlsafe_b64encode(
                    hashlib.sha256((feed['title']).encode('utf8')).hexdigest())
            feed['url'] = url
            feeds.append(feed)

    es.bulk((es.index_op(feed, **{'id': feed.pop('id')}) for feed in feeds),
            index='feeds',
            doc_type='feed')
    print es.refresh('feeds')
Code Example #14
                "coordinates": coords,  # 4, 5
                "feature_class": row[6],
                "feature_code": row[7],
                "country_code2": row[8],
                "country_code3": country_code3,
                "cc2": row[9],
                "admin1_code": row[10],
                "admin2_code": row[11],
                "admin3_code": row[12],
                "admin4_code": row[13],
                "population": row[14],
                "elevation": row[15],
                "dem": row[16],
                "timzeone": row[17],
                "modification_date": "2014-01-01"
            }
            yield es.index_op(doc, index='geonames', doc_type='geoname')
        except:
            count += 1

    print 'Exception count:', count


chunk_count = 0
for chunk in bulk_chunks(documents(reader, es), docs_per_chunk=500):
    es.bulk(chunk)
    chunk_count += 1
    print 'Chunk count:', chunk_count

es.refresh('geonames')
Code Example #15
File: sync.py Project: xhook7/py-mysql-es
class MySync(object):
	ts = 0	#last chunk time
	log_file = None
	log_pos  = None
	
	def __init__(self):
		self.config = yaml.load(open('./etc/config.yaml'))
		self.mark_path = self.config['binlog']['mark']
		self.bulk_size = self.config['es']['bulk_size']
		self.excludes_fields = self.config['slave']['excludes_fields']
		self.es = ElasticSearch('http://{host}:{port}/'.format(
			host=self.config['es']['host'], 
			port=self.config['es']['port']
		))

		#set logger
		logging.basicConfig(
			level=logging.DEBUG,
			format='%(asctime)s %(levelname)s %(message)s',
			datefmt='%Y-%m-%d %H:%M:%S',
			filename=self.config['log']['run']
		)
		logging.getLogger('elasticsearch').setLevel(logging.INFO)
		logging.getLogger('elasticsearch.trace').setLevel(logging.INFO)
		logging.getLogger('elasticsearch.trace').addHandler(logging.StreamHandler())

		#resume stream
		if os.path.isfile(self.mark_path):		
			with open(self.mark_path, 'r') as y:
				mark = yaml.load(y)
				self.log_file = mark.get('log_file')
				self.log_pos  = mark.get('log_pos')
				logging.info('resume stream >> file:%s, pos:%s' % (self.log_file, self.log_pos))
	
	"""
	mark binlog position
	"""
	def mark_binlog(self):
		if self.log_file and self.log_pos:
			with open(self.mark_path, 'w') as y:
				logging.info('mark binlog >> file:%s, pos:%s' % (self.log_file, self.log_pos))
				yaml.safe_dump({'log_file':self.log_file, 'log_pos':self.log_pos}, y, default_flow_style=False)
	

	"""
	format fields
	"""
	def _format(self, dat):
		for k,v in dat.items():
			if isinstance(v, datetime):
				dat[k] = v.strftime('%Y-%m-%d %H:%M:%S')
			elif isinstance(v, date):
				dat[k] = v.strftime('%Y-%m-%d')
			if k in self.excludes_fields:
				del dat[k]
		return dat
	

	"""
	mysql binlog event handle
	"""
	def proc_binlog(self):
		stream = BinLogStreamReader(
			connection_settings = self.config['mysql'],
			server_id = self.config['slave']['server_id'],
			log_file = self.log_file,
			log_pos = self.log_pos,
			only_schemas = self.config['slave']['schemas'],
			blocking = True,
			resume_stream = bool(self.log_file and self.log_pos),
			only_events=[WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent]
		)
		for binlogevent in stream:
			#binlogevent.dump()
			self.log_file = stream.log_file
			self.log_pos  = stream.log_pos
			for row in binlogevent.rows:
				pk = binlogevent.primary_key
				table = binlogevent.table
				schema = binlogevent.schema
				if isinstance(binlogevent, WriteRowsEvent):
					yield self.es.index_op(self._format(row['values']), doc_type=table, index=schema, id=row['values'][pk])
				elif isinstance(binlogevent, UpdateRowsEvent):
					yield self.es.update_op(self._format(row['after_values']), doc_type=table, index=schema, id=row['after_values'][pk])
				elif isinstance(binlogevent, DeleteRowsEvent):
					yield self.es.delete_op(doc_type=table, index=schema, id=row['values'][pk])
				else:
					continue

		stream.close()
	

	"""
	notify exception
	"""
	def send_email(self, msg):
		import smtplib
		from email.mime.text import MIMEText
		msg = MIMEText(msg, 'plain', 'utf-8')
		msg['From'] = self.config['email']['from']['user']
		msg['To'] = ','.join(self.config['email']['to'])
		msg['Subject'] = 'Binlog Sync Exception:'
		try:
			s = smtplib.SMTP()
			s.connect(self.config['email']['host'], self.config['email']['port'])
			s.ehlo()
			s.starttls()
			s.login(user=self.config['email']['from']['user'], password=self.config['email']['from']['passwd'])
			s.sendmail(msg['From'], self.config['email']['to'], msg.as_string())
			s.quit()
		except Exception:
			import traceback
			logging.error(traceback.format_exc())

	"""
	bulk chunk check every second
	"""
	def bulk_chunks(self, actions, docs_per_chunk=300, bytes_per_chunk=None):
		chunk = []
		docs = bytes = 0
		for action in actions:
			next_len = len(action) + 1  #+1 for \n
			if chunk and (
				(docs_per_chunk and docs >= docs_per_chunk) or
				(bytes_per_chunk and bytes + next_len > bytes_per_chunk) or
				(self.ts+1 < int(time.time()))
			):
				#print(">>>chunk:%d" % len(chunk))
				yield chunk
				chunk = []
				docs = bytes = 0
				self.ts = int(time.time())

			chunk.append(action)
			docs += 1
			bytes += next_len

		if chunk:
			yield chunk

	"""
	run entry
	"""
	def run(self):
		try:
			for chunk in self.bulk_chunks(self.proc_binlog(), docs_per_chunk=self.bulk_size):
				#time.sleep(1)
				self.es.bulk(chunk)
				self.mark_binlog()
		except KeyboardInterrupt:
			pass
		except Exception:
			import traceback
			logging.error(traceback.format_exc())
			self.send_email(msg=traceback.format_exc())
			raise
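The YAML loaded from ./etc/config.yaml is expected to deserialize into roughly the structure below; the keys are inferred from the attribute accesses in the class and every value is illustrative. With such a file in place, the sync loop is simply MySync().run().

# Inferred shape of ./etc/config.yaml (illustrative values only).
expected_config = {
    'mysql': {'host': '127.0.0.1', 'port': 3306, 'user': 'repl', 'passwd': 'secret'},
    'slave': {'server_id': 100, 'schemas': ['shop'], 'excludes_fields': ['password']},
    'es': {'host': '127.0.0.1', 'port': 9200, 'bulk_size': 300},
    'binlog': {'mark': './etc/binlog.mark'},
    'log': {'run': './log/sync.log'},
    'email': {
        'host': 'smtp.example.com', 'port': 587,
        'from': {'user': 'ops@example.com', 'passwd': 'secret'},
        'to': ['admin@example.com'],
    },
}

MySync().run()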
Code Example #16
File: spider.py Project: wangqiuyi/0517
                return data

es = ElasticSearch('http://localhost:9200/')
es.delete_index('pet')
spider = Spider()
breeds = spider.getPetBreeds()
p = Pinyin()
for breed in breeds:
    flg = 1
    page = 1
    pet_list = []
    while(flg):
        pets = spider.getPets(breed, (page - 1) * spider.limit)
        if not pets:
            flg = 0
        else:
            page = page + 1
            for pet in pets:
                pet_obj = {}
                pet_obj['name'] = pet['name']
                pet_obj['img'] = pet['img']
                pet_obj['type'] = breed['ename'] 
                pet_list.append(pet_obj)
                #print pet['name'] + '\t' + p.get_pinyin(pet['name'], '')
    print breed['ename'] + '\n'
    if not pet_list:
        continue
    doc_type = p.get_pinyin(breed['ename'].replace('宠物', ''), '')
    es.bulk((es.index_op(pet_obj) for pet_obj in pet_list), doc_type=doc_type, index = 'pet')
es.refresh('pet')
Code Example #17
class ElasticSearch(object):
    conn = None
    url = settings.ELASTICSEARCH_URL
    index_name = settings.ELASTICSEARCH_INDEX_NAME
    stdout = None
    stderr = None

    def __init__(self, index_name=None, stdout=None, stderr=None):
        self.conn = PyElasticSearch()
        if index_name:
            self.index_name = index_name
        if stdout:
            self.stdout = stdout
        if stderr:
            self.stderr = stderr

    def create_index(self, delete=True):
        if delete:
            try:
                self.conn.delete_index(self.index_name)
            except ElasticHttpNotFoundError as e:
                pass
        mappings = dict(
            (k, v) for k, v in get_elasticsearch_properties().items())
        self.conn.create_index(self.index_name,
                               settings={'mappings': mappings})

    def index_activity_by_id(self, activity_id):
        activity = HistoricalActivity.objects.get(pk=activity_id)
        return self.index_activity(activity)

    def delete_activity_by_id(self, activity_id):
        activity = HistoricalActivity.objects.get(pk=activity_id)
        return self.delete_activity(activity)

    def index_activity(self, activity):
        for doc_type in DOC_TYPES_ACTIVITY:
            docs = self.get_activity_documents(activity, doc_type=doc_type)
            if len(docs) > 0:
                try:
                    self.conn.bulk((self.conn.index_op(
                        doc, id=doc.pop('id'), parent=doc.pop('_parent', None))
                                    for doc in docs),
                                   index=self.index_name,
                                   doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    def index_investor(self, investor):
        for doc_type in DOC_TYPES_INVESTOR:
            docs = self.get_investor_documents(investor, doc_type=doc_type)
            if len(docs) > 0:
                try:
                    self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id'))
                                    for doc in docs),
                                   index=self.index_name,
                                   doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    def index_activity_documents(self, activity_identifiers=[]):
        activity_identifiers = activity_identifiers or HistoricalActivity.objects.filter(
            fk_status__in=(
                HistoricalActivity.STATUS_ACTIVE,
                HistoricalActivity.STATUS_PENDING,
                HistoricalActivity.STATUS_OVERWRITTEN,
                HistoricalActivity.STATUS_DELETED)).distinct().values_list(
                    'activity_identifier', flat=True).distinct()

        for doc_type in DOC_TYPES_ACTIVITY:
            docs = []
            # Collect documents
            self.stdout and self.stdout.write(
                'Collect %ss for %i deals...' %
                (doc_type, len(activity_identifiers)))
            for activity_identifier in activity_identifiers:
                for activity in self.get_activity_versions(
                        activity_identifier):
                    docs.extend(
                        self.get_activity_documents(activity,
                                                    doc_type=doc_type))
            # Bulk index documents
            self.stdout and self.stdout.write('Index %i %ss...' %
                                              (len(docs), doc_type))
            if len(docs) > 0:
                paginator = Paginator(docs, 1000)
                for page in paginator.page_range:
                    try:
                        self.conn.bulk(
                            (self.conn.index_op(doc,
                                                id=doc.pop('id'),
                                                parent=doc.pop(
                                                    '_parent', None))
                             for doc in paginator.page(page)),
                            index=self.index_name,
                            doc_type=doc_type)
                    except BulkError as e:
                        for error in e.errors:
                            msg = '%s: %s on ID %s' % (
                                error['index']['error']['type'],
                                error['index']['error']['reason'],
                                error['index']['_id'])
                            if 'caused_by' in error['index']['error']:
                                msg += ' (%s: %s)' % (error['index']['error']
                                                      ['caused_by']['type'],
                                                      error['index']['error']
                                                      ['caused_by']['reason'])
                            self.stderr and self.stderr.write(msg)
                    self.conn.refresh()

    def index_investor_documents(self):
        investors = Investor.objects.public().order_by(
            'investor_identifier', '-id').distinct('investor_identifier')

        for doc_type in DOC_TYPES_INVESTOR:
            docs = []
            # Collect documents
            self.stdout and self.stdout.write(
                'Collect %ss for %i investors...' %
                (doc_type, investors.count()))
            for investor in investors:
                docs.extend(
                    self.get_investor_documents(investor, doc_type=doc_type))
            # Bulk index documents
            self.stdout and self.stdout.write('Index %i %ss...' %
                                              (len(docs), doc_type))
            if len(docs) > 0:
                try:
                    self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id'))
                                    for doc in docs),
                                   index=self.index_name,
                                   doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    #def index_activity_by_version(self, activity_identifier):
    #    for doc_type in get_elasticsearch_properties().keys():
    #        docs = self.get_documents_for_activity_version(activity_identifier, doc_type=doc_type)
    #        if len(docs) > 0:
    #            try:
    #                self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id')) for doc in docs),
    #                    index=self.index_name,
    #                    doc_type=doc_type)
    #            except BulkError as e:
    #                for error in e.errors:
    #                    stderr and stderr.write('%s: %s (caused by %s: %s, ID: %s)' % (
    #                            error['index']['error']['type'],
    #                            error['index']['error']['reason'],
    #                            error['index']['error']['caused_by']['type'],
    #                            error['index']['error']['caused_by']['reason'],
    #                            error['index']['_id']
    #                          ))

    def get_activity_versions(self, activity_identifier):
        versions = []
        # get the newest non-pending, readable historic version:
        try:
            newest = HistoricalActivity.objects.filter(
                activity_identifier=activity_identifier,
                fk_status__in=(
                    HistoricalActivity.STATUS_ACTIVE,
                    HistoricalActivity.STATUS_OVERWRITTEN,
                    HistoricalActivity.STATUS_DELETED)).distinct().latest()
            if newest and not newest.fk_status_id == HistoricalActivity.STATUS_DELETED:
                versions.append(newest)
        except HistoricalActivity.DoesNotExist:
            newest = None

        # get newer pendings
        pendings = HistoricalActivity.objects.filter(
            activity_identifier=activity_identifier,
            fk_status_id=HistoricalActivity.STATUS_PENDING).distinct()
        if newest:
            pendings.filter(history_date__gt=newest.history_date)
        versions.extend(pendings)

        return versions

    def get_activity_documents(self, activity, doc_type='deal'):
        docs = []
        deal_attrs = {
            'id': activity.id,
            'activity_identifier': activity.activity_identifier,
            'historical_activity_id': activity.id,
            'status': activity.fk_status_id,
        }

        # Todo: Is there a nice way to prevent this extra Activity query?
        # e.g. if we save is_public/deal_scope as ActivityAttributes
        public_activity = Activity.objects.filter(
            activity_identifier=activity.activity_identifier).order_by(
                '-id').first()
        if public_activity:
            deal_attrs.update({
                'is_public':
                public_activity.is_public,
                'deal_scope':
                public_activity.deal_scope,
                'deal_size':
                public_activity.deal_size,
                'current_negotiation_status':
                public_activity.negotiation_status,
                'top_investors':
                public_activity.top_investors,
                'fully_updated_date':
                public_activity.fully_updated_date,
            })
        else:
            # Fixme: This should not happen
            self.stderr and self.stderr.write(
                _('Missing activity for historical activity %i (Activity identifier: #%i)'
                  % (activity.id, activity.activity_identifier)))
        #except Activity.MultipleObjectsReturned:
        #    # Fixme: This should not happen
        #    self.stderr and self.stderr.write(_('Too much activities for historical activity %i (Activity identifier: #%i)' % (
        #        activity.id,
        #        activity.activity_identifier
        #    )))

        for a in activity.attributes.select_related('fk_group__name').order_by(
                'fk_group__name'):
            # do not include the django object id
            if a.name == 'id':
                continue
            attribute = None
            attribute_key = '%s_attr' % a.name
            if attribute_key in get_elasticsearch_properties(
            )['deal']['properties'].keys():
                attribute = {
                    'value': a.value,
                    'value2': a.value2,
                    'date': a.date,
                    'is_current': a.is_current,
                }
            value = a.value

            # Area field?
            if a.name and 'area' in a.name and a.polygon is not None:
                # Get polygon
                #value = json.loads(a.polygon.json)
                # Apparently this is case sensitive: MultiPolygon as provided by the GeoJSON does not work
                #value['type'] = 'multipolygon'
                value = a.polygon.json or ''
            # do not include empty values
            if value is None or value == '':
                continue

            # Doc types: location, data_source or contract
            group_match = a.fk_group and a.fk_group.name or ''
            group_match = re.match(
                '(?P<doc_type>location|data_source|contract)_(?P<count>\d+)',
                group_match)
            if group_match:
                dt, count = group_match.groupdict()['doc_type'], int(
                    group_match.groupdict()['count'])
                if doc_type == dt:
                    while len(docs) < count:
                        docs.append({
                            '_parent': activity.activity_identifier,
                            'id': a.id,  #'%i_%i' % (a.id, count),
                        })
                    docs[count - 1][a.name] = [
                        value,
                    ]
                # Set doc type counter within deal doc type (for location/data_source/contract)
                elif doc_type == 'deal':
                    # Set counter
                    key = '%s_count' % dt
                    if key not in deal_attrs.keys():
                        deal_attrs[key] = count
                    elif deal_attrs[key] < count:
                        deal_attrs[key] = count

                    # Create list with correct length to ensure formset values have the same index
                    if not a.name in deal_attrs:
                        deal_attrs[a.name] = [''] * count
                        if attribute:
                            deal_attrs[attribute_key] = [''] * count
                    else:
                        while len(deal_attrs[a.name]) < count:
                            deal_attrs[a.name].append('')
                            if attribute:
                                deal_attrs[attribute_key].append('')
                    deal_attrs[a.name][count - 1] = value
                    if attribute:
                        deal_attrs['%s_attr' % a.name][count - 1] = attribute

            # Doc type: deal and not formset
            elif doc_type == 'deal':
                if a.name in deal_attrs:
                    deal_attrs[a.name].append(value)
                    if '%s_attr' % a.name in get_elasticsearch_properties(
                    )['deal']['properties'].keys():
                        deal_attrs['%s_attr' % a.name].append(attribute)
                else:
                    deal_attrs[a.name] = [
                        value,
                    ]
                    if '%s_attr' % a.name in get_elasticsearch_properties(
                    )['deal']['properties'].keys():
                        deal_attrs['%s_attr' % a.name] = [
                            attribute,
                        ]

        if doc_type == 'deal':
            # Additionally save operational company attributes
            oc = Investor.objects.filter(
                investoractivityinvolvement__fk_activity__activity_identifier=
                activity.activity_identifier)
            if oc.count() > 0:
                oc = oc.first()
                for field in Investor._meta.fields:
                    if isinstance(field, ForeignKey):
                        deal_attrs['operational_company_%s' %
                                   field.name] = getattr(
                                       oc, '%s_id' % field.name)
                    else:
                        deal_attrs['operational_company_%s' %
                                   field.name] = getattr(oc, field.name)
            else:
                pass
                #self.stderr and self.stderr.write("Missing operational company for deal #%i" % activity.activity_identifier)

        # Create single document for each location
        # FIXME: Saving single deals for each location might be deprecated since we have doc_type location now?
        spatial_names = list(get_spatial_properties())
        for i in range(deal_attrs.get('location_count', 0)):
            doc = deal_attrs.copy()
            for name in spatial_names:
                if not name in doc:
                    continue
                if len(deal_attrs[name]) > i:
                    doc[name] = deal_attrs[name][i]
                else:
                    doc[name] = ''
            # Set unique ID for location (deals can have multiple locations)
            doc['id'] = '%s_%i' % (doc['id'], i)
            point_lat = doc.get('point_lat', None)
            point_lon = doc.get('point_lon', None)
            if point_lat and point_lon:
                # Parse values
                try:
                    parsed_lat, parsed_lon = float(point_lat), float(point_lon)
                    doc['geo_point'] = '%s,%s' % (point_lat, point_lon)
                except ValueError:
                    doc['geo_point'] = '0,0'
            else:
                doc['point_lat'] = '0'
                doc['point_lon'] = '0'
                doc['geo_point'] = '0,0'
            # FIXME: we dont really need 'point_lat' and 'point_lon' here,
            # so we should pop them from doc when adding 'geo_point'
            docs.append(doc)

        # Update docs with export values
        for doc in docs:
            doc.update(self.get_export_properties(doc, doc_type=doc_type))

        return docs

    def get_export_properties(self, doc, doc_type='deal'):
        if doc_type == 'investor':
            return ExportInvestorForm.export(doc)
        elif doc_type == 'involvement':
            return InvestorVentureInvolvementForm.export(doc)
        else:
            properties = {
                'deal_scope_export':
                doc.get('deal_scope', ''),
                'is_public_export':
                doc.get('is_public', False) and str(_('Yes')) or str(_('No')),
                'deal_size_export':
                doc.get('deal_size', ''),
                'current_negotiation_status_export':
                doc.get('current_negotiation_status', ''),
                'top_investors_export':
                doc.get('top_investors', ''),
                'fully_updated_date_export':
                doc.get('fully_updated_date', ''),
            }
            # Doc types: deal, location, contract and data_source
            for form in ChangeDealView.FORMS:
                formset_name = hasattr(form, "form") and form.Meta.name or None
                form = formset_name and form.form or form
                properties.update(form.export(doc, formset=formset_name))
            properties.update(
                ExportInvestorForm.export(doc, prefix='operational_company_'))
            return properties

    def get_investor_documents(self, investor, doc_type='investor'):
        docs = []
        # Doc types: involvement and investor
        if doc_type == 'involvement':
            ivis = InvestorVentureInvolvement.objects.filter(
                Q(fk_venture=investor) | Q(fk_investor=investor))
            for ivi in ivis:
                doc = {}
                for field in ivi._meta.local_fields:
                    if isinstance(field, ForeignKey):
                        doc[field.name] = getattr(ivi, '%s_id' % field.name)
                    else:
                        doc[field.name] = getattr(ivi, field.name)
                docs.append(doc)
        elif doc_type == 'investor':
            doc = {}
            for field in investor._meta.local_fields:
                if isinstance(field, ForeignKey):
                    doc[field.name] = getattr(investor, '%s_id' % field.name)
                else:
                    doc[field.name] = getattr(investor, field.name)
            docs.append(doc)

        # Update docs with export values
        for doc in docs:
            doc.update(self.get_export_properties(doc, doc_type=doc_type))

        return docs

    def refresh_index(self):
        self.conn.refresh(self.index_name)

    def search(self, elasticsearch_query, doc_type='deal', sort=[]):
        """ Executes paginated queries until all results have been retrieved. 
            @return: The full list of hits. """
        start = 0
        size = 10000  # 10000 is the default elasticsearch max_window_size (pagination is cheap, so more is not necessarily better)
        raw_result_list = []

        done = False
        while not done:
            query = {
                'query': elasticsearch_query,
                'from': start,
                'size': size,
            }
            if sort:
                query['sort'] = sort
            query_result = self.conn.search(query,
                                            index=self.index_name,
                                            doc_type=doc_type)
            raw_result_list.extend(query_result['hits']['hits'])
            results_total = query_result['hits']['total']

            if len(raw_result_list) >= results_total:
                done = True
            else:
                start = len(raw_result_list)

        print('\nElasticsearch returned %i documents from a total of %i \n\n' %
              (len(raw_result_list), query_result['hits']['total']))
        return raw_result_list

    def delete_activity(self, activity):
        for doc_type in DOC_TYPES_ACTIVITY:
            try:
                if doc_type == 'deal':
                    self.conn.delete(id=activity.activity_identifier,
                                     index=self.index_name,
                                     doc_type=doc_type)
                else:
                    self.conn.delete_by_query(query={
                        "parent_id": {
                            "type": "deal",
                            "id": str(activity.activity_identifier),
                        }
                    },
                                              index=self.index_name,
                                              doc_type=doc_type)
            except ElasticHttpNotFoundError as e:
                pass

    def get_deals_by_activity_identifier(self,
                                         activity_identifier,
                                         doc_type='deal'):
        return self.search({
            "constant_score": {
                "filter": {
                    "term": {
                        "activity_identifier": activity_identifier
                    }
                }
            }
        })
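A hypothetical full reindex with the wrapper above (note that it shadows pyelasticsearch's ElasticSearch name and holds the real client in self.conn); index and connection settings come from the project's Django settings:

es_index = ElasticSearch()  # the wrapper class defined above
es_index.create_index(delete=True)
es_index.index_activity_documents()
es_index.index_investor_documents()
es_index.refresh_index()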
Code Example #18
    Rthandler.setFormatter(formatter)
    logging.getLogger().addHandler(Rthandler)


def get_para_5m_raw_data():
    examples = []
    lines = io.open(PATH, 'r', encoding='utf-8').readlines()
    for i in lines:
        s1 = i.split("\t")[0].lower()
        s2 = i.split("\t")[1].lower()
        examples.append({'content': s1, 'type': 'origin'})
        examples.append({'content': s2, 'type': 'para'})
    return examples


def document(sentences):
    for s in sentences:
        dic = {'content': s['content'], 'type': s['type']}
        yield es.index_op(dic)


if __name__ == '__main__':
    # init_log()
    sentences = get_para_5m_raw_data()
    for chunk in bulk_chunks(document(sentences),
                             docs_per_chunk=1000,
                             bytes_per_chunk=100000):
        es.bulk(chunk, doc_type='sentence', index='para-nmt-50m')
        doc_num += 1000
        print("indexed " + str(doc_num) + " docs")
        logging.info("indexed " + str(doc_num) + " docs")
Code Example #19
File: fill.py Project: rainum/dronevision
    }
}
es.create_index(ELASTICSEARCH_INDEX, settings=index_settings)

for filename in FILES:
    print "Processing %s" % filename

    sf = shapefile.Reader(filename)

    shapes = sf.shapes()
    for i, shape in enumerate(shapes, start=1):
        points = [(p[0], p[1]) for p in shape.points]

        data = {
            'filename': filename,
            'location': {
                'type': 'polygon',
                'coordinates': [points]
            }
        }

        if points[-1] != points[0]:
            points.append(points[0])

        try:
            es.bulk([es.index_op(data)],
                    doc_type=ELASTICSEARCH_DOC,
                    index=ELASTICSEARCH_INDEX)
        except:
            print "Exception"
Code Example #20
File: indexes.py Project: drager/toerh
class SearchIndex(object):
    def __init__(self, model):
        self.es = ElasticSearch()
        self.model = model

    def put_mapping(self, index, doc_type):
        mapping = {
            doc_type: {
                "properties": {
                    "location": {
                        "type": "geo_point"
                    },
                }
            }
        }
        self.es.put_mapping(index=index, doc_type=doc_type, mapping=mapping)

    def bulk_items(self, index, doc_type):
        for m in self.model.objects.all():
            self.es.bulk([
                self.es.index_op({
                    "pk": m.pk,
                    "name": m.name,
                    "rating": m.rating,
                    "address": m.address,
                    "description": m.description,
                    "location": {
                        "lon": m.longitude,
                        "lat": m.latitude
                    }
                }),
                ],
                doc_type=doc_type,
                index=index)

    def search(self, index, question, longitude, latitude, size=10):
        #self.es.delete_index(index)
        try:
            self.es.create_index(index)
            self.put_mapping(index, "place")
            self.bulk_items(index, "place")
        except IndexAlreadyExistsError:
            pass

        query = {
            "query": {
                "function_score": {
                    "query": {
                        "bool": {
                            "should": [
                                {"match": {"name": question}},
                                {"match": {"_all": {
                                    "query": question,
                                    "operator": "or",
                                    "fuzziness": "auto",
                                    "zero_terms_query": "all"
                                    }}}
                                ]
                            }
                        },
                    "functions": [
                        {"exp": {"rating": {"origin": 5, "scale": 1, "offset": 0.1}}},
                    ]
                    }
                }
            }

        if longitude and longitude is not None:
            query['query']['function_score']['functions'] = [
                {'gauss': {
                    "location": {"origin": {"lat": latitude, "lon": longitude}, "offset": "550m", "scale": "1km"}
                    }},
                {'gauss': {
                    "location": {"origin": {"lat": latitude, "lon": longitude}, "offset": "500m", "scale": "2km"}
                    }},
            ]

        results = self.es.search(query, index=index, size=size)

        self.es.refresh()

        return results
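A hypothetical call against the class above; Place stands in for whatever Django model carries the fields read in bulk_items(), and the query text and coordinates are illustrative:

index = SearchIndex(Place)
results = index.search('toerh_coffee', 'espresso bar',
                       longitude=18.07, latitude=59.33, size=5)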
Code Example #21
File: indexer.py Project: cervere/scrapemall
    }
}


es.health(wait_for_status='yellow')
es.delete_index('write-ads')
es.create_index('write-ads', settings={'mappings': ad_mapping})

dateYMD = args["date"]
prepareDataFromDB(dateYMD)

dir = DATA_FILES_JSON + '/' + dateYMD
for filename in os.listdir(dir):
    if filename.endswith('.json'):
        with open(dir + '/' + filename) as open_file:
            json_docs = json.load(open_file)
            es.bulk((es.index_op(doc) for doc in json_docs),
                index='write-ads',
                doc_type='ad')

es.refresh("write-ads")

res = es.search('website:com', index='write-ads')
print("Got %d Hits for .com websites" % res['hits']['total'])
for hit in res['hits']['hits']:
    print (hit["_source"])
res = es.search('website:in', index='write-ads')
print("Got %d Hits for .in websites" % res['hits']['total'])
res = es.search('category:entertainment', index='write-ads')
print("Got %d Hits for category:Entertainment" % res['hits']['total'])
Code Example #22
File: sync.py Project: muke5hy/py-mysql-es
class MySync(object):
	log_file = None
	log_pos  = None
	
	def __init__(self):
		print '[INFO] starting ...'
		self.config = yaml.load(open('./etc/config.yaml'))
		self.mark_path = self.config['binlog']['mark']
		self.bulk_size = self.config['es']['bulk_size']
		self.excludes_fields = self.config['slave']['excludes_fields']
		self.es = ElasticSearch('http://{host}:{port}/'.format(host=self.config['es']['host'], port=self.config['es']['port']))
		"""
		resume stream
		"""
		if os.path.isfile(self.mark_path):		
			with open(self.mark_path, 'r') as y:
				mark = yaml.load(y)
				self.log_file = mark.get('log_file')
				self.log_pos  = mark.get('log_pos')
				logging.info("resume stream : file: {file}, pos: {pos}".format(file=self.log_file,pos=self.log_pos))	
	
	def mark_binlog(self):
		if self.log_file and self.log_pos:
			with open(self.mark_path, 'w') as y:
				logging.info("mark binlog: binlog_file: {file}, pos: {pos}".format(file=self.log_file, pos=self.log_pos))
				yaml.safe_dump({"log_file": self.log_file, "log_pos": self.log_pos}, y, default_flow_style=False)
	
	def _format(self, dat):
		for k,v in dat.items():
			if isinstance(v, datetime):
				dat[k] = v.strftime('%Y-%m-%d %H:%M:%S')				
			elif isinstance(v, date):
				dat[k] = v.strftime('%Y-%m-%d')
			if k in self.excludes_fields:
				del dat[k]
		return dat
	
	def proc_binlog(self):
		stream = BinLogStreamReader(
			connection_settings = self.config['mysql'],
			server_id = self.config['slave']['server_id'],
			log_file = self.log_file,
			log_pos = self.log_pos,
			only_schemas = self.config['slave']['schemas'],
			blocking = True,
			resume_stream = bool(self.log_file and self.log_pos),
			only_events=[WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent]
		)
		for binlogevent in stream:
			self.log_file = stream.log_file
			self.log_pos  = stream.log_pos
			for row in binlogevent.rows:		
				pk     = binlogevent.primary_key			
				table  = binlogevent.table
				schema = binlogevent.schema

				if isinstance(binlogevent, WriteRowsEvent):
					yield self.es.index_op(self._format(row['values']), doc_type=table, index=schema, id=row['values'][pk])
				elif isinstance(binlogevent, UpdateRowsEvent):
					yield self.es.update_op(self._format(row['after_values']), doc_type=table, index=schema, id=row['after_values'][pk])
				elif isinstance(binlogevent, DeleteRowsEvent):
					yield self.es.delete_op(doc_type=table, index=schema, id=row['values'][pk])
				else:
					logging.warning("unsupport event type")
					continue
	
		stream.close()
		
	def send_email(self, msg):
		import smtplib
		from email.mime.text import MIMEText
		msg = MIMEText(msg, 'plain', 'utf-8')
		msg['From']    = self.config['email']['from']['user']
		msg['To']      = ','.join(self.config['email']['to'])
		msg['Subject'] = 'Binlog Sync Exception:'
		try:
			s = smtplib.SMTP();
			s.connect(self.config['email']['host'], self.config['email']['port'])
			s.ehlo()
			s.starttls()
			s.login(user=self.config['email']['from']['user'], password=self.config['email']['from']['passwd'])
			s.sendmail(msg['From'], self.config['email']['to'], msg.as_string())
			s.quit()
		except Exception:
			import traceback
			logging.error(traceback.format_exc())
	
	def run(self):
		try:
			if self.bulk_size < 2:
				for action in self.proc_binlog():
					self.es.bulk([action])
					self.mark_binlog()
			else:
				for chunk in bulk_chunks(self.proc_binlog(), docs_per_chunk=self.bulk_size):
					self.es.bulk(chunk)
					self.mark_binlog()
		except KeyboardInterrupt:
			pass
		except Exception:
			import traceback
			logging.error(traceback.format_exc())
			self.send_email(msg=traceback.format_exc())
			raise
Code Example #23
File: timer.py Project: fiesensee/CompREST
def getFeeds():
    print "getting feeds"
    es = ElasticSearch('http://fisensee.ddns.net:9200/')

    query = {"query": {"range": {"date": {"lte": "now-1w/w"}}}}
    oldFeeds = es.search(query, size=300, index='feeds')

    if len(oldFeeds['hits']['hits']) != 0:
        es.bulk(es.delete_op(id=feed['_id'], index='feeds',
        doc_type='feed') for feed in oldFeeds['hits']['hits'])


    feedSources = FeedSource.objects.all()
    feeds = []
    defaultText = 'undefined'
    defaultDate = datetime.datetime.now().isoformat()
    utc = pytz.utc
    berlin = pytz.timezone('Europe/Berlin')
    now = datetime.datetime.today()
    dateThreshold = now - datetime.timedelta(weeks=2)

    allUrls = []
    for feedSource in feedSources:
        allUrls.append(feedSource.sourceUrl)

    urls = set(allUrls)
    for url in urls:
        source = feedparser.parse(url)
        for entry in source['items']:
            feed = {
                'title':defaultText,
                'description':defaultText,
                'link':defaultText,
                'date':defaultDate,
                'url': defaultText
            }
            if('title' in entry):
                feed['title'] = entry['title']
            if('description' in entry):
                feed['description'] = entry['description']
            if('link' in entry):
                feed['link'] = entry['link']
            if('published_parsed' in entry):
                date = datetime.datetime.fromtimestamp(time.mktime(entry['published_parsed']))
                if(date < dateThreshold):
                    break
                utcDate = utc.localize(date)
                feed['date'] = utcDate.astimezone(berlin).isoformat()
            # id creation should be enough for now, but it is bound to fail eventually
            if 'title' in entry or 'published_parsed' in entry:
                feed['id'] = base64.urlsafe_b64encode(hashlib.sha256((feed['title'] + feed['date']).encode('utf8')).hexdigest())
            else:
                feed['id'] = base64.urlsafe_b64encode(hashlib.sha256((feed['title']).encode('utf8')).hexdigest())
            feed['url'] = url
            feeds.append(feed)



    es.bulk((es.index_op(feed, **{'id': feed.pop('id')}) for feed in feeds),
        index = 'feeds',
        doc_type = 'feed')
    print es.refresh('feeds')
コード例 #24
0
ファイル: elasticsearch.py プロジェクト: ibbd-dev/ibbdETL
class IbbdElasticSearch:
    """
    es操作
    文档:http://pyelasticsearch.readthedocs.io/en/latest/
    """
    es = None
    config = {}

    mapping_is_set = False  # whether the ES mapping has already been set

    def __init__(self, config):
        """
        es初始化
        配置参数:
        host: es连接字符串
        indexName: index的名字
        deleteIndex: 是否删除已经存在的index,默认为false,不删除
        settings: index的配置。具体的配置项,请看es的文档。
        settingsFile: index的配置,json文件。具体的配置项,请看es的文档。
        mappings: mappings的配置。具体的配置项,请看es的文档。
        mappingsFile: mappings的配置,json文件。具体的配置项,请看es的文档。
        idField: id字段。有些数据是包含id字段的

        说明:settings和settingsFile最多只能有一项
        mappings和mappingsFile最多也只能有一项
        """
        self.es = ElasticSearch(config['host'])

        if 'docType' not in config:
            config['docType'] = config['indexName']
        self.config = config

        if 'deleteIndex' in config and config['deleteIndex']:
            try:
                self.es.delete_index(config['indexName'])

                print('delete index ' + config['indexName'] + ' success!')
            except ElasticHttpNotFoundError:  # if it did not exist, just print a notice
                print('Index ' + config['indexName'] \
                                + ' not found, nothing to delete!')
            except:
                raise Exception('Index ' + config['indexName'] + ' delete error!')

        try:
            if 'settings' in config:
                self.es.create_index(config['indexName'],
                                     settings=config['settings'])
            elif 'settingsFile' in config:
                with open(config['settingsFile'], 'r') as f:
                    config['settings'] = json.loads(f.read())
                self.es.create_index(config['indexName'],
                                     settings=config['settings'])
            else:
                self.es.create_index(config['indexName'])

            print('create index ' + config['indexName'] + ' success!')
        except Exception:
            raise Exception("create index " + config['indexName'] + ' error!')

    def _putMapping(self, row):
        """
        设置es的mapping。
        可以根据row生成默认配置, 生成配置规则如下:
        """
        try:
            if 'mappingsFile' in self.config:
                with open(self.config['mappingsFile'], 'r') as f:
                    self.config['mappings'] = json.loads(f.read())

            if 'mappings' in self.config:
                self.es.put_mapping(self.config['indexName'],
                                    self.config['docType'],
                                    self.config['mappings'])
            print("put mapping " + self.config['indexName'] + ' success!')
        except Exception:
            raise Exception("put mapping " + self.config['indexName'] + ' error!')

    def read(self):
        pass

    def batchRead(self):
        pass

    def write(self, row):
        """
        Write a single record.
        """
        return self.batchWrite([row])

    def batchWrite(self, rows):
        """
        Write multiple records.
        """
        if not self.mapping_is_set:   # set the mapping before the first write
            self.mapping_is_set = True
            self._putMapping(rows[0])

        docs = ()
        if 'idField' in self.config:
            docs = (self.es.index_op(doc, id=doc.pop(self.config['idField'])) \
                    for doc in rows)
        else:
            docs = (self.es.index_op(doc) for doc in rows)

        self.es.bulk(docs,
                     index=self.config['indexName'],
                     doc_type=self.config['docType'])

        return True
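
A usage sketch for the wrapper above; the config keys follow the constructor's docstring, while the index name, file paths, and sample rows are placeholders.

# Sketch only: every key below is documented in __init__ above; values are illustrative.
config = {
    'host': 'http://localhost:9200/',
    'indexName': 'orders',
    'docType': 'order',                      # defaults to indexName when omitted
    'deleteIndex': True,                     # drop any existing index first
    'settingsFile': 'conf/es_settings.json',
    'mappingsFile': 'conf/es_mappings.json',
    'idField': 'order_id',                   # becomes the document id via index_op(id=...)
}

writer = IbbdElasticSearch(config)
writer.batchWrite([
    {'order_id': '1', 'amount': 12.5},
    {'order_id': '2', 'amount': 30.0},
])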
コード例 #25
0
    def __init__(self,start,**kwargs):
        """
        Invoke a Downloader object to get data from
        the Record. It will check to see if the necessary
        files are already downloaded and use those instead of
        querying FDSys. Downloaders are the endpoint for raw data.

        Required arguments:

        start : In form 'YYYY-MM-DD.' This is the day/start day you want.

        Optional arguments:

        parse : Defaults to True. This tells the downloader whether you just want
                the raw files, or if you also want it to extract data from the HTML.
                (Default means yes, give me the data.)


        end : Same form as start. This is the end date.

        outpath : Output path RELATIVE TO the present working directory. Defaults
                  to 'output' and works fine when you run it from the repo's root
                  directory.

        do_mode : Specify what kind of data you want from the parser.
                  If do_mode is not set, the downloader will do absolutely zilch.
                  do_mode can take the following values:

                  json : write json files in a /json directory for that
                         day of the Record.

                  es : Specify the URL and index of an ElasticSearch cluster with
                       arguments es_url and index, and it will pass each file to
                       that cluster for indexing. WARNING: This doesn't handle any
                       mappings, and it doesn't check to see if records are already
                       there, so it will overwrite old files in the same index
                       WITHOUT versioning.

                       also specify:
                       es_url : ElasticSearch cluster url
                       index  : ElasticSearch cluster index

                  yield : For each day of the Record the user specifies,
                          the downloader acts like a generator, yielding that day's
                          "crfile" dictionary. 
        """
        self.status = 'idle'
        logging.debug('Downloader object ready with params:')
        logging.debug(','.join(['='.join([key, str(value)]) for key, value in list(kwargs.items())]))
        if 'outpath' in list(kwargs.keys()):
            outpath = kwargs['outpath']
        else:
            outpath = 'output'
        if kwargs['do_mode'] == 'es':
            es = ElasticSearch(kwargs['es_url'])
            for chunk in bulk_chunks((es.index_op(crfile.crdoc,id=crfile.crdoc.pop('id')) for crfile
                                        in self.bulkdownload(start,**kwargs)),
                                        docs_per_chunk=100):
                es.bulk(chunk,index=kwargs['index'],doc_type='crdoc')
        elif kwargs['do_mode'] == 'json':
            # outpath called so often to make it easy to follow
            # the idea that we're traversing a directory tree
            for crfile in self.bulkdownload(start,**kwargs):
                filename = os.path.split(crfile.filepath)[-1].split('.')[0] + '.json'
                outpath = os.path.split(crfile.filepath)[0]
                outpath = os.path.split(outpath)[0]
                if 'json' not in os.listdir(outpath):
                    os.mkdir(os.path.join(outpath,'json'))
                outpath = os.path.join(outpath,'json',filename)
                with open(outpath,'w') as out_json:
                    json.dump(crfile.crdoc,out_json)
        elif kwargs['do_mode'] == 'yield':
            self.yielded = self.bulkdownload(start,parse=True,**kwargs)
        elif kwargs['do_mode'] == 'noparse':
            self.bulkdownload(start,parse=False,**kwargs)

        else:
            return None
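
Going by the docstring above, the es and yield modes might be driven like this; the class name Downloader comes from the docstring, and the dates, URL, and index name are placeholders.

# Sketch under the docstring's assumptions: do_mode='es' bulk-indexes each parsed
# day of the Record into the given cluster, 100 docs per bulk request.
dl = Downloader('2017-01-03',
                end='2017-01-06',
                do_mode='es',
                es_url='http://localhost:9200/',
                index='congressional-record')

# Or consume the parsed days directly:
# dl = Downloader('2017-01-03', do_mode='yield')
# for crfile in dl.yielded:
#     print(crfile.crdoc['id'])   # crdoc is the parsed document dict used above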
コード例 #26
0
ファイル: elk.py プロジェクト: kamakazikamikaze/patheng
def loadlocal(debug=False):
    """
    Check for data offloaded to disk and retry sending if cluster(s) are now online
    """
    # TODO: check each cluster on a per-cluster basis instead of per JSON dump file.
    # TODO: give the user control over where the data and tmp folders are stored
    basedir = './cfg/tmp/'
    datadir = basedir + 'data/'
    checkmakedir(basedir)
    checkmakedir(datadir)
    sendto = []
    dumpconfigs = [
        basedir + file for file in os.listdir(basedir)
        if file.endswith('.json')
    ]
    if dumpconfigs:
        for configfile in dumpconfigs:
            try:
                cluster = {}
                with open(configfile, 'r') as f:
                    cluster = json.load(f)
                name = cluster['name']
                cluster['alive'] = False
                i = 1  # i is cast to a string for dict lookups but incremented as an integer
                while not cluster['alive']:
                    if 'http://' not in cluster[str(i)]:
                        if debug:
                            print('Cluster', name, 'was missing \'http://\'')
                        cluster[str(i)] = 'http://' + cluster[str(i)]
                    if cluster[str(i)][-1] != '/':
                        if debug:
                            print('Cluster', name, 'was missing \'/\'')
                        cluster[str(i)] += '/'
                    try:
                        if requests.get(cluster[str(i)]).status_code == 200:
                            cluster['url'] = cluster[str(i)]
                            cluster['data index'] = make_index(
                                cluster['url'], cluster['data index'])
                            if cluster['log errors to index']:
                                make_index(cluster['url'],
                                           cluster['error index'])
                            cluster['alive'] = True
                            cluster['dumpconfigs'] = configfile
                            sendto.append(cluster)

                            if debug:
                                print('\nDumped cluster', name, 'will send to',
                                      cluster[str(i)] + cluster['data index'])
                                if cluster['log errors to index']:
                                    print(
                                        'Errors will be sent to',
                                        cluster[str(i)] +
                                        cluster['error index'])
                        else:
                            if debug:
                                print('\nDumped cluster', name, 'master', i,
                                      'cannot be reached. Trying next...')
                            i += 1
                    except Exception as e:
                        if debug:
                            print(e)
                            print('Dumped cluster', name, 'master', i,
                                  'cannot be reached. Trying next...')
                        i += 1
            except Exception as e:
                if debug:
                    print(e)
                    print(
                        'Dumped cluster', name,
                        'has zero master nodes to send to! Skipping for now.')
        for cluster in sendto:
            try:
                es = ElasticSearch(cluster['url'])
                for pickleid in cluster['pickle']:
                    with open(datadir + pickleid, 'rb') as f:
                        docpile = pickle.load(f)
                    r = es.bulk((es.index_op(doc) for doc in docpile),
                                index=cluster['data index'],
                                doc_type=cluster['data index type'])
                for errpickleid in cluster['err_pickle']:
                    with open(datadir + errpickleid, 'rb') as f:
                        errdocpile = pickle.load(f)
                    r = es.bulk((es.index_op(doc) for doc in errdocpile),
                                index=cluster['error index'],
                                doc_type=cluster['error index type'])
                    if debug:
                        if r['errors']:
                            print(r['errors'])
                            raise Exception
                os.remove(cluster['dumpconfigs'])
            except Exception as e:
                print(e)
                print("thought the cluster was up but it really isn't")
        _cleanupdump(debug)
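
For reference, a sketch of the per-cluster dump file that loadlocal() expects to find as JSON under ./cfg/tmp/; the key names are the ones read above, the values are placeholders.

# Illustrative only -- each key below is read somewhere in loadlocal().
dumped_cluster = {
    'name': 'prod-metrics',
    '1': 'es-master-1.example.com:9200',      # candidate master URLs, tried in order
    '2': 'es-master-2.example.com:9200',
    'data index': 'metrics-2018.06',
    'data index type': 'doc',
    'log errors to index': True,
    'error index': 'metrics-errors',
    'error index type': 'doc',
    'pickle': ['metrics-batch-0.pkl'],        # pickled doc batches under ./cfg/tmp/data/
    'err_pickle': [],                         # pickled error-doc batches
}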
コード例 #27
0
    'age': 32,
    'title': '抽象tv Coder'
}, {
    'id': 2,
    'name': 'Jessica Coder',
    'age': 31,
    'title': 'Programmer'
}, {
    'id': 3,
    'name': 'Freddy Coder抽',
    'age': 29,
    'title': 'Office Assistant'
}]

es.bulk((es.index_op(doc, id=doc.pop('id')) for doc in docs),
        index='test',
        doc_type='test')

es.refresh('test')

res1 = es.get('test', 'test', 1)

# Full-text match; note that Chinese and English are tokenized differently.
# https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html

res8 = es.search(index='test',
                 size=2,
                 query={"query": {
                     "query_string": {
                         "query": "抽"
                     }
コード例 #28
0
ファイル: dbManager.py プロジェクト: sauravcsvt/geocoding
class ESWrapper(BaseDB):
    def __init__(self, index_name, host='http://localhost', port=9200):
        self.eserver = ElasticSearch(urls=host,
                                     port=port,
                                     timeout=60,
                                     max_retries=3)
        self._base_query = {
            "query": {
                "bool": {
                    "must": {
                        "match": {
                            "name.raw": ""
                        }
                    }
                }
            }
        }
        self._geo_filter = {
            "geo_distance": {
                "distance": "20km",
                "coordinates": {}
            }
        }
        self._index = index_name
        self._doctype = "places"

    def query(self, qkey, qtype="exact"):
        """
        qtype values are exact, relaxed or geo_distance
        """
        q = self._base_query.copy()
        if qtype == "exact":
            q["query"]["bool"]["must"]["match"]["name.raw"] = qkey
        elif qtype == "relaxed":
            q["query"]["bool"]["must"]["match"]["name"] = qkey
            q["query"]["bool"]["must"]["match"].pop("name.raw")
        elif qtype == "geo_distance":
            q = {
                "query": {
                    "bool": {
                        "must": {
                            "match_all": {}
                        }
                    },
                    "filter": {
                        "geo_distance": {
                            "distance": "20km",
                            "coordinates": qkey
                        }
                    }
                }
            }

        return self.eserver.search(q,
                                   index=self._index,
                                   doc_type=self._doctype)

    def near_geo(self, geo_point):
        q = {
            "query": {
                "bool": {
                    "must": {
                        "match_all": {}
                    }
                },
                "filter": self._geo_filter
            }
        }
        q["query"]["bool"]["geo_distance"]["coordinates"] = geo_point
        return self.eserver.search(q,
                                   index=self._index,
                                   doc_type=self._doctype)

    def create(self, datacsv, confDir="../data/"):
        with open(os.path.join(confDir, "es_settings.json")) as jf:
            settings = json.load(jf)

        self.eserver.create_index(index='geonames', settings=settings)
        for chunk in bulk_chunks(self._opLoader(datacsv, confDir),
                                 docs_per_chunk=1000):
            self.eserver.bulk(chunk, index='geonames', doc_type='places')
            print "..",

        self.eserver.refresh('geonames')

    def _opLoader(self, datacsv, confDir):
        with DataReader(datacsv, os.path.join(confDir,
                                              'geonames.conf')) as reader:
            cnt = 0
            for row in reader:
                row['coordinates'] = [
                    float(row['longitude']),
                    float(row['latitude'])
                ]
                del (row['latitude'])
                del (row['longitude'])
                row['alternatenames'] = row['alternatenames'].split(",")
                cnt += 1
                #if cnt > 100:
                #break
                yield self.eserver.index_op(row,
                                            index="geonames",
                                            doc_type="places")
コード例 #29
0
class ESWrapper(BaseDB):
    def __init__(self,
                 index_name,
                 doc_type,
                 host='http://localhost',
                 port=9200):
        self.eserver = ElasticSearch(urls=host,
                                     port=port,
                                     timeout=60,
                                     max_retries=3)
        #self._base_query = {"query": {"bool": {"must": {"match": {}}}}}
        #self._base_query = {"query": {"bool": {}}}
        self._geo_filter = {"distance": "20km", "coordinates": {}}
        self._population_filter = {'population': {'gte': 5000}}
        self._index = index_name
        self._doctype = doc_type

    def getByid(self, geonameId):
        maincondition = {"match": {"id": geonameId}}
        q = {"query": {"bool": {"must": maincondition}}}
        return self.eserver.search(
            q, index=self._index,
            doc_type=self._doctype)['hits']['hits'][0]['_source']

    def _query(self, qkey, **kwargs):
        q = {"query": {"bool": {}}}
        query_name = "should"
        q["query"]["bool"]["minimum_number_should_match"] = 1
        kwargs.pop("qtype", "")

        placetokens = [
            l.strip() for l in tokenizer.split(qkey)
            if l and l not in STOP_WORDS and l[-1] != '.'
        ]
        if placetokens:
            reduced_placename = u" ".join(placetokens[0:])
            if len(placetokens[0]) < 3 and len(
                    placetokens) > 1 and 3.0 / len(placetokens) >= .5:
                reduced_placename = u" ".join(placetokens[1:])
        else:
            reduced_placename = qkey

        # print "qkey", qkey, "reduced", reduced_placename
        maincondition = [
            {
                "bool": {
                    "must": [{
                        "multi_match": {
                            "query":
                            qkey,
                            "fields":
                            ["name.raw^5", "asciiname^5", "alternatenames"],
                            "operator":
                            "and"
                        }
                    }, {
                        "terms": {
                            "featureClass": ["a", "p"]
                        }
                    }],
                }
            },
            {
                "term": {
                    "name.raw": {
                        "value": qkey
                    }
                }
            },
            {
                "term": {
                    "asciiname.raw": {
                        "value": qkey
                    }
                }
            },
            {
                "term": {
                    "normalized_asciiname": {
                        "value": qkey
                    }
                }
            },
            # {"term": {"alternatenames": {"value": qkey[1:]}}},
            {
                "term": {
                    "alternatenames": {
                        "value": qkey
                    }
                }
            },
            # {"multi_match": {"query": reduced_placename if 'fuzzy' in kwargs else unicode(unidecode(reduced_placename)),
            {
                "multi_match": {
                    "query":
                    reduced_placename if 'fuzzy' in kwargs else unicode(
                        unidecode(reduced_placename)),
                    'fuzziness':
                    kwargs.pop("fuzzy", 0),
                    "max_expansions":
                    kwargs.pop("max_expansion", 10),
                    "prefix_length":
                    kwargs.pop("prefix_length", 1),
                    'operator':
                    kwargs.pop("operator", "and"),
                    "fields": [
                        "name^3", "asciiname^3", "alternatenames",
                        "normalized_asciiname^3"
                    ]
                }
            }
        ]

        q["query"]["bool"][query_name] = maincondition

        if kwargs:
            filter_cond = []
            if 'min_popln' in kwargs:
                popln = kwargs.pop("min_popln")
                if popln is not None:
                    filter_cond.append(
                        {"range": {
                            "population": {
                                "gte": popln
                            }
                        }})

            for key, val in kwargs.viewitems():
                if not isinstance(val, basestring):
                    val = list([(v) for v in val])
                    filter_cond.append({"terms": {key: val}})
                else:
                    filter_cond.append({"term": {key: (val)}})

            q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}}

        q['from'] = 0
        q['size'] = 50
        return self.eserver.search(q,
                                   index=self._index,
                                   doc_type=self._doctype)

    def query(self, qkey, min_popln=None, **kwargs):
        #res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']['hits']
        res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']
        #max_score = sum([r['_score'] for r in res])
        max_score = res['max_score']  #sum([r['_score'] for r in res])
        #for t in res:
        # print(max_score)
        gps = []
        if max_score == 0.0:
            ## Elasticsearch found no real match and only returned random,
            ## very low-scoring hits
            res['hits'] = []

        for t in res['hits']:
            t['_source']['geonameid'] = t["_source"]["id"]
            #t['_source']['_score'] = t[1] / max_score
            t['_source']['_score'] = t['_score'] / max_score
            pt = GeoPoint(**t["_source"])
            if t['_source']['featureCode'].lower() == "cont":
                gps = [pt]
                break

            gps.append(pt)

        if len(gps) == 1:
            gps[0]._score = (min(float(len(gps[0].name)), float(len(qkey))) /
                             max(float(len(gps[0].name)), float(len(qkey))))

        return gps

    def _oldquery(self,
                  qkey,
                  qtype="exact",
                  analyzer=None,
                  min_popln=None,
                  size=10,
                  **kwargs):
        """
        qtype values are exact, relaxed or geo_distance
        Always limit results to 10
        """
        q = {"query": {"bool": {}}}
        query_name = kwargs.pop('query_name', 'must')
        query_name = "should"
        if query_name == "should":
            q["query"]["bool"]["minimum_number_should_match"] = 1

        maincondition = {}
        if qtype == "exact":
            maincondition = [{
                "term": {
                    "name.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "asciiname.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "alternatenames": {
                        "value": qkey
                    }
                }
            }]
            if analyzer:
                maincondition["match"]["name.raw"]["analyzer"] = analyzer

        elif qtype == "relaxed":
            maincondition["match"] = {"alternatenames": {"query": qkey}}
            if analyzer:
                maincondition["match"]["alternatenames"]["analyzer"] = analyzer

            #q["query"]["bool"][query_name]["match"].pop("name.raw", "")
        elif qtype == "combined":
            maincondition = [{
                "bool": {
                    "must": {
                        "multi_match": {
                            "query": qkey,
                            "fields":
                            ["name.raw", "asciiname", "alternatenames"]
                        }
                    },
                    "filter": {
                        "bool": {
                            "should": [{
                                "range": {
                                    "population": {
                                        "gte": 5000
                                    }
                                }
                            }, {
                                "terms": {
                                    "featureCode": [
                                        "pcla", "pcli", "cont", "rgn", "admd",
                                        "adm1", "adm2"
                                    ]
                                }
                            }]
                        }
                    }
                }
            }, {
                "term": {
                    "name.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "asciiname.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "alternatenames": {
                        "value": qkey[1:]
                    }
                }
            }, {
                "match": {
                    "alternatenames": {
                        "query": qkey,
                        'fuzziness': kwargs.pop("fuzzy", 0),
                        "max_expansions": kwargs.pop("max_expansion", 5),
                        "prefix_length": kwargs.pop("prefix_length", 1)
                    }
                }
            }]

        if maincondition:
            q["query"]["bool"][query_name] = maincondition

            if min_popln:
                filter_cond = [{"range": {"population": {"gte": min_popln}}}]
            else:
                filter_cond = []

            if kwargs:
                #filter_cond = [{"range": {"population": {"gte": min_popln}}}]
                filter_cond += [{
                    "term": {
                        key: val
                    }
                } for key, val in kwargs.viewitems()]
                # print(filter_cond)
                q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}}
            elif min_popln:
                filter_cond = [{
                    "range": {
                        "population": {
                            "gte": min_popln
                        }
                    }
                }, {
                    "terms": {
                        "featureCode": ["ppla", "pplx"]
                    }
                }]

                q["query"]["bool"]["filter"] = {
                    "bool": {
                        "should": filter_cond
                    }
                }

        return self.eserver.search(q,
                                   index=self._index,
                                   doc_type=self._doctype)

    def oldquery(self, qkey, min_popln=None, **kwargs):
        #res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']['hits']
        res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']
        #max_score = sum([r['_score'] for r in res])
        max_score = res['max_score']  #sum([r['_score'] for r in res])
        #for t in res:
        gps = []
        if max_score == 0.0:
            ## Elasticsearch found no real match and only returned random,
            ## very low-scoring hits
            res['hits'] = []

        for t in res['hits']:
            t['_source']['geonameid'] = t["_source"]["id"]
            #t['_source']['_score'] = t[1] / max_score
            t['_source']['_score'] = t['_score'] / max_score
            pt = GeoPoint(**t["_source"])
            if t['_source']['featureCode'].lower() == "cont":
                gps = [pt]
                break

            gps.append(pt)

        if len(gps) == 1:
            gps[0]._score = (min(float(len(gps[0].name)), float(len(qkey))) /
                             max(float(len(gps[0].name)), float(len(qkey))))

        return gps

    def near_geo(self, geo_point, min_popln=5000, **kwargs):
        q2 = {
            "query": {
                "bool": {
                    "must": {
                        "match_all": {}
                    },
                    "filter": [
                        {
                            "geo_distance": {
                                "distance": "30km",
                                "coordinates": geo_point
                            }
                        },
                        {
                            "terms":
                            # {"featureCode":
                            #  ["pcli", "ppl", "ppla2", "adm3"]}
                            {
                                "featureClass": ["a", "h", "l", "t", "p", "v"]
                            }
                        }
                    ]
                }
            },
            "sort": {
                "population": "desc"
            }
        }
        if kwargs:
            for key in kwargs:
                q2['query']['bool']['filter'].append(
                    {"term": {
                        key: kwargs[key]
                    }})

        res = self.eserver.search(
            q2, index=self._index,
            doc_type=self._doctype)['hits']['hits'][0]['_source']
        res['confidence'] = 1.0
        return [GeoPoint(**res)]

    def create(self, datacsv, confDir="../data/"):
        with open(os.path.join(confDir, "es_settings.json")) as jf:
            settings = json.load(jf)
            settings['mappings'][self._doctype] = settings['mappings'].pop(
                'places')

        try:
            self.eserver.create_index(index=self._index, settings=settings)
        except:
            self.eserver.delete_index(self._index)
            self.eserver.create_index(index=self._index, settings=settings)

        for chunk in bulk_chunks(self._opLoader(datacsv, confDir),
                                 docs_per_chunk=1000):
            self.eserver.bulk(chunk, index=self._index, doc_type=self._doctype)
            print "..",

        self.eserver.refresh(self._index)

    def _opLoader(self, datacsv, confDir):
        ere = re.compile(r"[^\sa-zA-Z0-9]")
        with DataReader(datacsv, os.path.join(confDir,
                                              'geonames.conf')) as reader:
            cnt = 0
            for row in reader:
                try:
                    row['coordinates'] = [
                        float(row['longitude']),
                        float(row['latitude'])
                    ]
                    try:
                        row['population'] = int(row["population"])
                    except:
                        row['population'] = -1

                    try:
                        row['elevation'] = int(row['elevation'])
                    except:
                        row['elevation'] = -1

                    del (row['latitude'])
                    del (row['longitude'])
                    #print row['name']
                    row['alternatenames'] = row['alternatenames'].lower(
                    ).split(",")
                    row['normalized_asciiname'] = (re.sub(
                        r'\s+', r' ', ere.sub("", row['asciiname']))).strip()
                    cnt += 1
                    yield self.eserver.index_op(row,
                                                index=self._index,
                                                doc_type=self._doctype)
                except:
                    print json.dumps(row)
                    continue

    def remove_dynamic_stopwords(self, term):
        # cc = {}
        # ttl = 0
        words = [w for t in term.split("-") for w in t.split() if len(w) > 1]

        if len(words) == 1:
            return term

        stopword_removed = ""
        for word in words:
            try:
                t = self.eserver.count(word)['count']
                if t >= 20000:
                    continue
            except:
                pass

            stopword_removed += (word + " ")
            # else:
            #     print(term, "stopword ", word)

        return stopword_removed.strip()
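
A usage sketch for this variant, which takes the doc type explicitly and exposes fuzziness, a population floor, and extra term filters through keyword arguments; the values below are illustrative.

# Sketch only: the keyword names (fuzzy, min_popln, featureCode, ...) come from
# _query() and near_geo() above; the values are placeholders.
gdb = ESWrapper('geonames', 'places')
pts = gdb.query(u'San Fransico', fuzzy=1, min_popln=50000)  # misspelled on purpose to exercise fuzziness
capitals = gdb.query(u'Paris', featureCode='pplc')          # leftover kwargs become term filters
nearby = gdb.near_geo([-122.42, 37.77], featureCode='ppl')  # 30km radius, sorted by population desc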
コード例 #30
0
ファイル: dbManager.py プロジェクト: astorfi/geocoding
class ESWrapper(BaseDB):
    def __init__(self,
                 index_name,
                 doc_type,
                 host='http://localhost',
                 port=9200):
        self.eserver = ElasticSearch(urls=host,
                                     port=port,
                                     timeout=60,
                                     max_retries=3)
        #self._base_query = {"query": {"bool": {"must": {"match": {}}}}}
        #self._base_query = {"query": {"bool": {}}}
        self._geo_filter = {"distance": "20km", "coordinates": {}}
        self._population_filter = {'population': {'gte': 5000}}
        self._index = index_name
        self._doctype = doc_type

    def getByid(self, geonameId):
        maincondition = {"match": {"id": geonameId}}
        q = {"query": {"bool": {"must": maincondition}}}
        return self.eserver.search(
            q, index=self._index,
            doc_type=self._doctype)['hits']['hits'][0]['_source']

    def _query(self,
               qkey,
               qtype="exact",
               analyzer=None,
               min_popln=None,
               size=10,
               **kwargs):
        """
        qtype values are exact, relaxed or geo_distance
        Always limit results to 10
        """
        q = {"query": {"bool": {}}}
        query_name = kwargs.pop('query_name', 'must')
        query_name = "should"
        if query_name == "should":
            q["query"]["bool"]["minimum_number_should_match"] = 1

        maincondition = {}
        if qtype == "exact":
            maincondition = [{
                "term": {
                    "name.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "asciiname.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "alternatenames": {
                        "value": qkey
                    }
                }
            }]
            #maincondition["match"] = {"name.raw": {"query": qkey}}
            if analyzer:
                maincondition["match"]["name.raw"]["analyzer"] = analyzer

        elif qtype == "relaxed":
            maincondition["match"] = {"alternatenames": {"query": qkey}}
            if analyzer:
                maincondition["match"]["alternatenames"]["analyzer"] = analyzer

            #q["query"]["bool"][query_name]["match"].pop("name.raw", "")
        elif qtype == "combined":
            maincondition = [{
                "bool": {
                    "must": {
                        "multi_match": {
                            "query": qkey,
                            "fields":
                            ["name.raw", "asciiname", "alternatenames"]
                        }
                    },
                    "filter": {
                        "bool": {
                            "should": [{
                                "range": {
                                    "population": {
                                        "gte": 5000
                                    }
                                }
                            }, {
                                "terms": {
                                    "featureCode": [
                                        "pcla", "pcli", "cont", "rgn", "admd",
                                        "adm1", "adm2"
                                    ]
                                }
                            }]
                        }
                    }
                }
            }, {
                "term": {
                    "name.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "asciiname.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "alternatenames": {
                        "value": qkey[1:]
                    }
                }
            }, {
                "match": {
                    "alternatenames": {
                        "query": qkey,
                        'fuzziness': kwargs.pop("fuzzy", 0),
                        "max_expansions": kwargs.pop("max_expansion", 5),
                        "prefix_length": kwargs.pop("prefix_length", 1)
                    }
                }
            }]

        if maincondition:
            q["query"]["bool"][query_name] = maincondition

            if min_popln:
                filter_cond = [{"range": {"population": {"gte": min_popln}}}]
            else:
                filter_cond = []

            if kwargs:
                #filter_cond = [{"range": {"population": {"gte": min_popln}}}]
                filter_cond += [{
                    "term": {
                        key: val
                    }
                } for key, val in kwargs.viewitems()]
                # print(filter_cond)
                q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}}
            elif min_popln:
                filter_cond = [{
                    "range": {
                        "population": {
                            "gte": min_popln
                        }
                    }
                }, {
                    "terms": {
                        "featureCode": ["ppla", "pplx"]
                    }
                }]

                q["query"]["bool"]["filter"] = {
                    "bool": {
                        "should": filter_cond
                    }
                }

        return self.eserver.search(q,
                                   index=self._index,
                                   doc_type=self._doctype)

    def query(self, qkey, min_popln=None, **kwargs):
        #res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']['hits']
        res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']
        #max_score = sum([r['_score'] for r in res])
        max_score = res['max_score']  #sum([r['_score'] for r in res])
        #for t in res:
        gps = []
        if max_score == 0.0:
            ## Elasticsearch found no real match and only returned random,
            ## very low-scoring hits
            res['hits'] = []

        for t in res['hits']:
            t['_source']['geonameid'] = t["_source"]["id"]
            #t['_source']['_score'] = t[1] / max_score
            t['_source']['_score'] = t['_score'] / max_score
            pt = GeoPoint(**t["_source"])
            if t['_source']['featureCode'].lower() == "cont":
                gps = [pt]
                break

            gps.append(pt)

        if len(gps) == 1:
            gps[0]._score = (min(float(len(gps[0].name)), float(len(qkey))) /
                             max(float(len(gps[0].name)), float(len(qkey))))

        return gps

    def near_geo(self, geo_point, min_popln=5000, **kwargs):
        q2 = {
            "query": {
                "bool": {
                    "must": {
                        "match_all": {}
                    },
                    "filter": [{
                        "geo_distance": {
                            "distance": "30km",
                            "coordinates": geo_point
                        }
                    }, {
                        "terms": {
                            "featureCode": ["pcli", "ppl", "ppla2", "adm3"]
                        }
                    }]
                }
            },
            "sort": {
                "population": "desc"
            }
        }

        res = self.eserver.search(q2,
                                  index=self._index,
                                  doc_type=self._doctype,
                                  **kwargs)['hits']['hits'][0]['_source']
        res['confidence'] = 1.0
        return [GeoPoint(**res)]

    def create(self, datacsv, confDir="../data/"):
        with open(os.path.join(confDir, "es_settings.json")) as jf:
            settings = json.load(jf)
            settings['mappings'][self._doctype] = settings['mappings'].pop(
                'places')

        try:
            self.eserver.create_index(index=self._index, settings=settings)
        except:
            self.eserver.delete_index(self._index)
            self.eserver.create_index(index=self._index, settings=settings)

        for chunk in bulk_chunks(self._opLoader(datacsv, confDir),
                                 docs_per_chunk=1000):
            self.eserver.bulk(chunk, index=self._index, doc_type=self._doctype)
            print "..",

        self.eserver.refresh(self._index)

    def _opLoader(self, datacsv, confDir):
        with DataReader(datacsv, os.path.join(confDir,
                                              'geonames.conf')) as reader:
            cnt = 0
            for row in reader:
                try:
                    row['coordinates'] = [
                        float(row['longitude']),
                        float(row['latitude'])
                    ]
                    try:
                        row['population'] = int(row["population"])
                    except:
                        row['population'] = -1

                    try:
                        row['elevation'] = int(row['elevation'])
                    except:
                        row['elevation'] = -1

                    del (row['latitude'])
                    del (row['longitude'])
                    #print row['name']
                    row['alternatenames'] = row['alternatenames'].split(",")
                    cnt += 1
                    yield self.eserver.index_op(row,
                                                index=self._index,
                                                doc_type=self._doctype)
                except:
                    print json.dumps(row)
                    continue