Example #1
from datetime import datetime, date

from pyelasticsearch import ElasticSearch

# IEventHandler is assumed to be defined by the surrounding project.


class MysqlEvElasticsearchHandler(IEventHandler):
    def __init__(self, config=None, es_instance=None):
        # Default so _format() works even when only es_instance is given.
        self.excludes_fields = ()
        if es_instance:
            self.es = es_instance
        else:
            self.config = config
            self.excludes_fields = self.config['excludes_fields']
            self.es = ElasticSearch('http://{host}:{port}/'.format(
                host=self.config['host'], port=self.config['port']))

    def _format(self, dat):
        # Copy the items so excluded keys can be deleted while iterating.
        for k, v in list(dat.items()):
            if isinstance(v, datetime):
                dat[k] = v.strftime('%Y-%m-%d %H:%M:%S')
            elif isinstance(v, date):
                dat[k] = v.strftime('%Y-%m-%d')
            if k in self.excludes_fields:
                del dat[k]
        return dat

    def on_insert_raw(self, ev_id, ev):
        for row in ev.rows:
            pk = ev.primary_key
            table = ev.table
            schema = ev.schema
            yield self.es.index_op(self._format(row['values']),
                                   doc_type=table,
                                   index=schema,
                                   id=row['values'][pk])

    def on_update_raw(self, ev_id, ev):
        for row in ev.rows:
            pk = ev.primary_key
            table = ev.table
            schema = ev.schema
            yield self.es.update_op(self._format(row['after_values']),
                                    doc_type=table,
                                    index=schema,
                                    id=row['after_values'][pk])

    def on_delete_raw(self, ev_id, ev):
        for row in ev.rows:
            pk = ev.primary_key
            table = ev.table
            schema = ev.schema
            yield self.es.delete_op(doc_type=table,
                                    index=schema,
                                    id=row['values'][pk])
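The three handlers emit pyelasticsearch bulk operations as generators instead of executing them, which leaves batching to the caller. Below is a minimal driver sketch, assuming python-mysql-replication's BinLogStreamReader (the same reader used in Example #3); the connection settings and server_id are placeholder assumptions.

# Hypothetical driver: turns binlog events into a stream of bulk ops.
from pymysqlreplication import BinLogStreamReader
from pymysqlreplication.row_event import (
    DeleteRowsEvent, UpdateRowsEvent, WriteRowsEvent)

def stream_ops(handler, mysql_settings):
    stream = BinLogStreamReader(
        connection_settings=mysql_settings,
        server_id=1,  # placeholder: any id unused by real replicas
        only_events=[WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
        blocking=True)
    for ev_id, ev in enumerate(stream):
        if isinstance(ev, WriteRowsEvent):
            for op in handler.on_insert_raw(ev_id, ev):
                yield op
        elif isinstance(ev, UpdateRowsEvent):
            for op in handler.on_update_raw(ev_id, ev):
                yield op
        elif isinstance(ev, DeleteRowsEvent):
            for op in handler.on_delete_raw(ev_id, ev):
                yield op

# The ops can then be flushed in batches, e.g.:
#   for chunk in bulk_chunks(stream_ops(handler, settings), docs_per_chunk=300):
#       handler.es.bulk(chunk)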
Example #2
import base64
import datetime
import hashlib
import time

import feedparser
import pytz
from pyelasticsearch import ElasticSearch

# FeedSource is assumed to be a Django model from the surrounding project.


def getFeeds():
    print "getting feeds"
    es = ElasticSearch('http://fisensee.ddns.net:9200/')

    query = {"query": {"range": {"date": {"lte": "now-1w/w"}}}}
    oldFeeds = es.search(query, size=300, index='feeds')

    if oldFeeds['hits']['hits']:
        es.bulk(es.delete_op(id=feed['_id'], index='feeds', doc_type='feed')
                for feed in oldFeeds['hits']['hits'])


    feedSources = FeedSource.objects.all()
    feeds = []
    defaultText = 'undefined'
    defaultDate = datetime.datetime.now().isoformat()
    utc = pytz.utc
    berlin = pytz.timezone('Europe/Berlin')
    now = datetime.datetime.today()
    dateThreshold = now - datetime.timedelta(weeks=2)

    allUrls = []
    for feedSource in feedSources:
        allUrls.append(feedSource.sourceUrl)

    urls = set(allUrls)
    for url in urls:
        source = feedparser.parse(url)
        for entry in source['items']:
            feed = {
                'title': defaultText,
                'description': defaultText,
                'link': defaultText,
                'date': defaultDate,
                'url': defaultText
            }
            if 'title' in entry:
                feed['title'] = entry['title']
            if 'description' in entry:
                feed['description'] = entry['description']
            if 'link' in entry:
                feed['link'] = entry['link']
            if 'published_parsed' in entry:
                date = datetime.datetime.fromtimestamp(
                    time.mktime(entry['published_parsed']))
                if date < dateThreshold:
                    # stop at the first stale entry (entries assumed newest-first)
                    break
                utcDate = utc.localize(date)
                feed['date'] = utcDate.astimezone(berlin).isoformat()
            # id creation should be enough for now, but it's made to fail
            if 'title' in entry or 'published_parsed' in entry:
                feed['id'] = base64.urlsafe_b64encode(hashlib.sha256(
                    (feed['title'] + feed['date']).encode('utf8')).hexdigest())
            else:
                feed['id'] = base64.urlsafe_b64encode(hashlib.sha256(
                    feed['title'].encode('utf8')).hexdigest())
            feed['url'] = url
            feeds.append(feed)

    es.bulk((es.index_op(feed, **{'id': feed.pop('id')}) for feed in feeds),
            index='feeds',
            doc_type='feed')
    print es.refresh('feeds')
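Indexing every feed in a single es.bulk call grows without bound as sources are added. pyelasticsearch ships a bulk_chunks helper for exactly this; below is a sketch of the same indexing step with chunking, where the chunk limits are assumptions.

from pyelasticsearch import bulk_chunks

# Same indexing step as above, split into bounded bulk requests.
ops = (es.index_op(feed, **{'id': feed.pop('id')}) for feed in feeds)
for chunk in bulk_chunks(ops, docs_per_chunk=300, bytes_per_chunk=1024 * 1024):
    es.bulk(chunk, index='feeds', doc_type='feed')
es.refresh('feeds')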
Example #3
import logging
import os
import time
from datetime import datetime, date

import yaml
from pyelasticsearch import ElasticSearch
from pymysqlreplication import BinLogStreamReader
from pymysqlreplication.row_event import (
	DeleteRowsEvent, UpdateRowsEvent, WriteRowsEvent)


class MySync(object):
	ts = 0  # last chunk time
	log_file = None
	log_pos  = None
	
	def __init__(self):
		with open('./etc/config.yaml') as f:
			self.config = yaml.safe_load(f)
		self.mark_path = self.config['binlog']['mark']
		self.bulk_size = self.config['es']['bulk_size']
		self.excludes_fields = self.config['slave']['excludes_fields']
		self.es = ElasticSearch('http://{host}:{port}/'.format(
			host=self.config['es']['host'], 
			port=self.config['es']['port']
		))

		#set logger
		logging.basicConfig(
			level=logging.DEBUG,
			format='%(asctime)s %(levelname)s %(message)s',
			datefmt='%Y-%m-%d %H:%M:%S',
			filename=self.config['log']['run']
		)
		logging.getLogger('elasticsearch').setLevel(logging.INFO)
		logging.getLogger('elasticsearch.trace').setLevel(logging.INFO)
		logging.getLogger('elasticsearch.trace').addHandler(logging.StreamHandler())

		#resume stream
		if os.path.isfile(self.mark_path):		
			with open(self.mark_path, 'r') as y:
				mark = yaml.safe_load(y)
				self.log_file = mark.get('log_file')
				self.log_pos  = mark.get('log_pos')
				logging.info('resume stream >> file:%s, pos:%s' % (self.log_file, self.log_pos))
	
	"""
	mark binlog position
	"""
	def mark_binlog(self):
		if self.log_file and self.log_pos:
			with open(self.mark_path, 'w') as y:
				logging.info('mark binlog >> file:%s, pos:%s' % (self.log_file, self.log_pos))
				yaml.safe_dump({'log_file':self.log_file, 'log_pos':self.log_pos}, y, default_flow_style=False)
	

	"""
	format fields
	"""
	def _format(self, dat):
		for k,v in dat.items():
			if isinstance(v, datetime):
				dat[k] = v.strftime('%Y-%m-%d %H:%M:%S')
			elif isinstance(v, date):
				dat[k] = v.strftime('%Y-%m-%d')
			if k in self.excludes_fields:
				del dat[k]
		return dat
	

	"""
	mysql binlog event handle
	"""
	def proc_binlog(self):
		stream = BinLogStreamReader(
			connection_settings = self.config['mysql'],
			server_id = self.config['slave']['server_id'],
			log_file = self.log_file,
			log_pos = self.log_pos,
			only_schemas = self.config['slave']['schemas'],
			blocking = True,
			resume_stream = bool(self.log_file and self.log_pos),
			only_events=[WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent]
		)
		for binlogevent in stream:
			#binlogevent.dump()
			self.log_file = stream.log_file
			self.log_pos  = stream.log_pos
			for row in binlogevent.rows:
				pk = binlogevent.primary_key
				table = binlogevent.table
				schema = binlogevent.schema
				if isinstance(binlogevent, WriteRowsEvent):
					yield self.es.index_op(self._format(row['values']), doc_type=table, index=schema, id=row['values'][pk])
				elif isinstance(binlogevent, UpdateRowsEvent):
					yield self.es.update_op(self._format(row['after_values']), doc_type=table, index=schema, id=row['after_values'][pk])
				elif isinstance(binlogevent, DeleteRowsEvent):
					yield self.es.delete_op(doc_type=table, index=schema, id=row['values'][pk])
				else:
					continue

		stream.close()
	

	"""
	notify exception
	"""
	def send_email(self, msg):
		import smtplib
		from email.mime.text import MIMEText
		msg = MIMEText(msg, 'plain', 'utf-8')
		msg['From'] = self.config['email']['from']['user']
		msg['To'] = ','.join(self.config['email']['to'])
		msg['Subject'] = 'Binlog Sync Exception:'
		try:
			s = smtplib.SMTP()
			s.connect(self.config['email']['host'], self.config['email']['port'])
			s.ehlo()
			s.starttls()
			s.login(user=self.config['email']['from']['user'], password=self.config['email']['from']['passwd'])
			s.sendmail(msg['From'], self.config['email']['to'], msg.as_string())
			s.quit()
		except Exception:
			import traceback
			logging.error(traceback.format_exc())

	"""
	bulk chunk check every second
	"""
	def bulk_chunks(self, actions, docs_per_chunk=300, bytes_per_chunk=None):
		chunk = []
		docs = bytes = 0
		for action in actions:
			next_len = len(action) + 1  #+1 for \n
			if chunk and (
				(docs_per_chunk and docs >= docs_per_chunk) or
				(bytes_per_chunk and bytes + next_len > bytes_per_chunk) or
				(self.ts+1 < int(time.time()))
			):
				#print(">>>chunk:%d" % len(chunk))
				yield chunk
				chunk = []
				docs = bytes = 0
				self.ts = int(time.time())

			chunk.append(action)
			docs += 1
			bytes += next_len

		if chunk:
			yield chunk

	"""
	run entry
	"""
	def run(self):
		try:
			for chunk in self.bulk_chunks(self.proc_binlog(), docs_per_chunk=self.bulk_size):
				#time.sleep(1)
				self.es.bulk(chunk)
				self.mark_binlog()
		except KeyboardInterrupt:
			pass
		except Exception:
			import traceback
			logging.error(traceback.format_exc())
			self.send_email(msg=traceback.format_exc())
			raise
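MySync wires the generator from proc_binlog into timed, size-capped bulk requests and persists the binlog position only after each successful flush. A minimal entry-point sketch follows, with the config layout the class reads; every value here is a placeholder assumption, not a setting from the source.

# Hypothetical ./etc/config.yaml matching the keys MySync reads:
#
# mysql:  {host: 127.0.0.1, port: 3306, user: repl, passwd: secret}
# slave:  {server_id: 100, schemas: [mydb], excludes_fields: [password]}
# es:     {host: 127.0.0.1, port: 9200, bulk_size: 300}
# binlog: {mark: ./etc/binlog.mark}
# log:    {run: ./log/run.log}
# email:
#   host: smtp.example.com
#   port: 587
#   from: {user: sync@example.com, passwd: secret}
#   to: [ops@example.com]

if __name__ == '__main__':
	MySync().run()

Note that the MySQL account needs REPLICATION SLAVE and REPLICATION CLIENT privileges for BinLogStreamReader to connect as a replica.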
Example #4
import logging
import os
from datetime import datetime, date

import yaml
from pyelasticsearch import ElasticSearch, bulk_chunks
from pymysqlreplication import BinLogStreamReader
from pymysqlreplication.row_event import (
	DeleteRowsEvent, UpdateRowsEvent, WriteRowsEvent)


class MySync(object):
	log_file = None
	log_pos  = None
	
	def __init__(self):
		print '[INFO] starting ...'
		with open('./etc/config.yaml') as f:
			self.config = yaml.safe_load(f)
		self.mark_path = self.config['binlog']['mark']
		self.bulk_size = self.config['es']['bulk_size']
		self.excludes_fields = self.config['slave']['excludes_fields']
		self.es = ElasticSearch('http://{host}:{port}/'.format(
			host=self.config['es']['host'],
			port=self.config['es']['port']
		))
		# resume stream
		if os.path.isfile(self.mark_path):		
			with open(self.mark_path, 'r') as y:
				mark = yaml.safe_load(y)
				self.log_file = mark.get('log_file')
				self.log_pos  = mark.get('log_pos')
				logging.info("resume stream : file: {file}, pos: {pos}".format(file=self.log_file,pos=self.log_pos))	
	
	def mark_binlog(self):
		if self.log_file and self.log_pos:
			with open(self.mark_path, 'w') as y:
				logging.info("mark binlog: binlog_file: {file}, pos: {pos}".format(file=self.log_file, pos=self.log_pos))
				yaml.safe_dump({"log_file": self.log_file, "log_pos": self.log_pos}, y, default_flow_style=False)
	
	def _format(self, dat):
		# Copy the items so excluded keys can be deleted while iterating.
		for k, v in list(dat.items()):
			if isinstance(v, datetime):
				dat[k] = v.strftime('%Y-%m-%d %H:%M:%S')				
			elif isinstance(v, date):
				dat[k] = v.strftime('%Y-%m-%d')
			if k in self.excludes_fields:
				del dat[k]
		return dat
	
	def proc_binlog(self):
		stream = BinLogStreamReader(
			connection_settings = self.config['mysql'],
			server_id = self.config['slave']['server_id'],
			log_file = self.log_file,
			log_pos = self.log_pos,
			only_schemas = self.config['slave']['schemas'],
			blocking = True,
			resume_stream = bool(self.log_file and self.log_pos),
			only_events=[WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent]
		)
		for binlogevent in stream:
			self.log_file = stream.log_file
			self.log_pos  = stream.log_pos
			for row in binlogevent.rows:		
				pk     = binlogevent.primary_key			
				table  = binlogevent.table
				schema = binlogevent.schema

				if isinstance(binlogevent, WriteRowsEvent):
					yield self.es.index_op(self._format(row['values']), doc_type=table, index=schema, id=row['values'][pk])
				elif isinstance(binlogevent, UpdateRowsEvent):
					yield self.es.update_op(self._format(row['after_values']), doc_type=table, index=schema, id=row['after_values'][pk])
				elif isinstance(binlogevent, DeleteRowsEvent):
					yield self.es.delete_op(doc_type=table, index=schema, id=row['values'][pk])
				else:
					logging.warning("unsupported event type")
					continue
	
		stream.close()
		
	def send_email(self, msg):
		import smtplib
		from email.mime.text import MIMEText
		msg = MIMEText(msg, 'plain', 'utf-8')
		msg['From']    = self.config['email']['from']['user']
		msg['To']      = ','.join(self.config['email']['to'])
		msg['Subject'] = 'Binlog Sync Exception:'
		try:
			s = smtplib.SMTP()
			s.connect(self.config['email']['host'], self.config['email']['port'])
			s.ehlo()
			s.starttls()
			s.login(user=self.config['email']['from']['user'], password=self.config['email']['from']['passwd'])
			s.sendmail(msg['From'], self.config['email']['to'], msg.as_string())
			s.quit()
		except Exception:
			import traceback
			logging.error(traceback.format_exc())
	
	def run(self):
		try:
			if self.bulk_size < 2:
				for action in self.proc_binlog():
					self.es.bulk([action])
					self.mark_binlog()
			else:
				for chunk in bulk_chunks(self.proc_binlog(), docs_per_chunk=self.bulk_size):
					self.es.bulk(chunk)
					self.mark_binlog()
		except KeyboardInterrupt:
			pass
		except Exception:
			import traceback
			logging.error(traceback.format_exc())
			self.send_email(msg=traceback.format_exc())
			raise
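Crash safety in both MySync variants rests on mark_binlog running only after a successful es.bulk, so on restart the stream resumes from the last acknowledged position. A minimal sketch of the mark-file round trip; the file name and position are placeholder values.

import yaml

mark_path = './etc/binlog.mark'
# what mark_binlog writes after a successful flush
with open(mark_path, 'w') as y:
	yaml.safe_dump({'log_file': 'mysql-bin.000042', 'log_pos': 15937}, y,
		default_flow_style=False)

# what __init__ reads back on the next start
with open(mark_path) as y:
	mark = yaml.safe_load(y)
print mark['log_file'], mark['log_pos']  # resume point for BinLogStreamReader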