def bulk_queue_push(self, data, case_id, source=None, machine=None, data_type=None, data_path=None, chunk_size=500, kjson=False):
    case_id = case_id.lower()
    bulk_queue = []

    # build the bulk actions; when kjson is True, the source, data_type and
    # data_path are taken from each record instead of the arguments
    for d in data:
        di = {"_index": case_id, "_source": {}, '_id': str(uuid.uuid4())}
        di['_source']['Data'] = d['Data'] if kjson else d
        source = d['data_source'] if kjson else source
        data_type = d['data_type'] if kjson else data_type
        data_path = d['data_path'] if kjson else data_path
        if source is not None:
            di['_source']['data_source'] = source
        if machine is not None:
            di['_source']['machine'] = machine
        if data_type is not None:
            di['_source']['data_type'] = data_type
        if data_path is not None:
            di['_source']['data_path'] = data_path
        bulk_queue.append(di)

    logger.logger(level=logger.DEBUG, type="elasticsearch",
                  message="Index [" + case_id + "]: Pushing [" + str(len(bulk_queue)) + "] records")

    push_es = self.bulk_to_elasticsearch(bulk_queue, case_id, chunk_size)
    if push_es[0]:
        logger.logger(level=logger.INFO, type="elasticsearch",
                      message="Index [" + case_id + "]: Pushed [" + str(len(bulk_queue) - len(push_es[2])) + "] records successfully")
        return [True, "Pushed [" + str(len(bulk_queue)) + "] records", push_es[2], push_es[3]]
    else:
        logger.logger(level=logger.ERROR, type="elasticsearch",
                      message="Index [" + case_id + "]: Failed pushing [" + str(len(bulk_queue)) + "] records",
                      reason=push_es[1])
        return [False, 'Failed to bulk data to Elasticsearch: ' + str(push_es[1]), bulk_queue, push_es[3]]
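# --- Usage sketch (illustrative only; the instance name "db", the record values and
# the field names below are assumptions, not part of this module). It shows the record
# shape bulk_queue_push() expects when kjson=True and how its returned
# [status, message, failed, successed] list can be consumed.
def _example_bulk_queue_push(db):
    records = [{
        "Data": {"EventID": 4624, "Channel": "Security"},    # parsed artifact record
        "data_source": "winevtx",                             # read because kjson=True
        "data_type": "security_events",
        "data_path": "C/Windows/System32/winevt/Logs/Security.evtx",
    }]
    status, message, failed, successed = db.bulk_queue_push(
        records, "case_001", machine="WS01", kjson=True)
    return status, message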
from app.brain import twitter
from app.brain.classifier import Classifier
from app.brain.markov import Markov
from app.models import Muse, Tweet, Doc
from app.config import config

from tweepy.error import TweepError
from mongoengine.errors import NotUniqueError, OperationError
from pymongo.errors import DuplicateKeyError

import random

# Logging
from app.logger import logger
logger = logger(__name__)

# Load the classifier and markov.
# Loaded here so we can keep it in memory,
# accessible via app.brain.CLS or app.brain.MKV.
CLS = Classifier()
MKV = Markov(ramble=config().ramble,
             ngram_size=config().ngram_size,
             spasm=config().spasm)


def ponder():
    """
    Fetch tweets from the Muses and memorize them;
    i.e. train classifier or Markov on them.
    """
def bulk_to_elasticsearch_fix_errors(self, indx, errors):
    logger.logger(level=logger.WARNING, type="elasticsearch",
                  message="Index [" + indx + "]: Failed pushing [" + str(len(errors)) + "] records [BulkIndexError], retrying to fix the issue")

    # check the returned error for each document and try to solve it
    fixed_data = []
    nonfixed_data = []
    limit_fields_increased = False
    for _id, doc in errors.items():
        record_msg_info = "Indx[" + indx + "]"
        if 'machine' in doc['index']['data'].keys():
            record_msg_info += ", machine[" + doc['index']['data']['machine'] + "]"
        if 'data_type' in doc['index']['data'].keys():
            record_msg_info += ", data_type[" + doc['index']['data']['data_type'] + "]"
        if '_id' in doc['index'].keys():
            record_msg_info += ", rec_id[" + doc['index']['_id'] + "]"

        try:
            doc_reason = doc['index']['error']['reason']
            logger.logger(level=logger.WARNING, type="elasticsearch",
                          message=record_msg_info + ": record failed",
                          reason=doc_reason)

            # === if the error is the limit on the total number of fields, add 1000 to the limit and try again
            if "Limit of total fields" in doc_reason and not limit_fields_increased:
                new_limit = int(self.get_total_fields_limit(indx)) + 1000
                inc = self.es_db.indices.put_settings(
                    index=indx,
                    body='{"index.mapping.total_fields.limit": ' + str(new_limit) + '}')
                if inc["acknowledged"]:
                    logger.logger(level=logger.INFO, type="elasticsearch",
                                  message=record_msg_info + " : The total_fields.limit has been increased to " + str(new_limit))
                    limit_fields_increased = True
                else:
                    logger.logger(level=logger.ERROR, type="elasticsearch",
                                  message=record_msg_info + " : failed to increase total_fields.limit")

            # === if the limit of total fields has already been increased, queue the record for a retry
            if "Limit of total fields" in doc_reason and limit_fields_increased:
                fixed_data.append({
                    "_index": doc['index']['_index'],
                    "_type": doc['index']['_type'],
                    "_id": doc['index']['_id'],
                    "_source": doc['index']['data']
                })
                continue

            # === a text field exceeded the maximum number of characters (32766 by default)
            match = re.match(
                r'Document contains at least one immense term in field="(.+)" \(whose UTF8 encoding is longer than the max length ([0-9]+)\), all of which were skipped.* original message: bytes can be at most ([0-9]+) in length; got ([0-9]+)',
                doc_reason)
            if match is not None:
                field = match.groups()[0]
                current_max = int(match.groups()[1])
                data_length = int(match.groups()[3])
                logger.logger(level=logger.ERROR, type="elasticsearch",
                              message=record_msg_info + " : field data longer than the specified maximum",
                              reason="field " + field + ", defined max length [" + str(current_max) + "], field data [" + str(data_length) + "]")

            # ==== an object was expected but the field holds a concrete value
            match = re.match(
                r"object mapping for \[(.*)\] tried to parse field \[(.*)\] as (.*), but found a concrete value",
                doc_reason)
            if match is not None:
                match = match.groups()
                failed_field = match[0]
                # the datatype is object but a concrete value was found
                if match[2] == 'object':
                    d = json_get_val_by_path(doc['index']['data'], failed_field)
                    if d[0]:
                        # the field type is object but the string "None" was found
                        if d[1] == 'None':
                            if json_update_val_by_path(doc['index']['data'], failed_field, None)[0]:
                                fixed_data.append({
                                    "_index": doc['index']['_index'],
                                    "_type": doc['index']['_type'],
                                    "_id": doc['index']['_id'],
                                    "_source": doc['index']['data']
                                })
                                continue
                        # the field type is object but a plain string was found, wrap it in an object
                        if isinstance(d[1], str):
                            if json_update_val_by_path(doc['index']['data'], failed_field, {'value': d[1]})[0]:
                                fixed_data.append({
                                    "_index": doc['index']['_index'],
                                    "_type": doc['index']['_type'],
                                    "_id": doc['index']['_id'],
                                    "_source": doc['index']['data']
                                })
                                continue

            # ==== failed to parse a field as its mapped type
            match = re.match(
                r"failed to parse field \[(.*)\] of type \[(.*)\] in document with id .*",
                doc_reason)
            if match is not None:
                match = match.groups()
                failed_field = match[0]
                failed_field_type = match[1]
                # the field is mapped as date: replace the value with a placeholder date
                if failed_field_type == 'date':
                    if json_update_val_by_path(doc['index']['data'], failed_field, '1700-01-01T00:00:00')[0]:
                        fixed_data.append({
                            "_index": doc['index']['_index'],
                            "_type": doc['index']['_type'],
                            "_id": doc['index']['_id'],
                            "_source": doc['index']['data']
                        })
                        continue
                # the field is mapped as text: flatten list or dict values into a string
                if failed_field_type == 'text':
                    d = json_get_val_by_path(doc['index']['data'], failed_field)
                    if d[0]:
                        d = d[1]
                        try:
                            if isinstance(d, list):
                                res_str = '\n'.join([str(x) for x in d])
                                if json_update_val_by_path(doc['index']['data'], failed_field, res_str)[0]:
                                    fixed_data.append({
                                        "_index": doc['index']['_index'],
                                        "_type": doc['index']['_type'],
                                        "_id": doc['index']['_id'],
                                        "_source": doc['index']['data']
                                    })
                                    continue
                            elif isinstance(d, dict):
                                res_str = "\n".join([str(k) + "=" + str(d[k]) for k in d.keys()])
                                if json_update_val_by_path(doc['index']['data'], failed_field, res_str)[0]:
                                    fixed_data.append({
                                        "_index": doc['index']['_index'],
                                        "_type": doc['index']['_type'],
                                        "_id": doc['index']['_id'],
                                        "_source": doc['index']['data']
                                    })
                                    continue
                        except Exception:
                            pass

            # no known fix matched this record
            logger.logger(level=logger.ERROR, type="elasticsearch",
                          message=record_msg_info + " : No fix found for failed record [" + doc['index']['_id'] + "] data",
                          reason=doc['index']['data'])
            nonfixed_data.append({
                "_index": doc['index']['_index'],
                "_type": doc['index']['_type'],
                "_id": doc['index']['_id'],
                "_source": doc['index']['data']
            })

        except Exception as e:
            logger.logger(level=logger.ERROR, type="elasticsearch",
                          message=record_msg_info + " : unexpected error while fixing the record issue",
                          reason=str(e))
            nonfixed_data.append({
                "_index": doc['index']['_index'],
                "_type": doc['index']['_type'],
                "_id": doc['index']['_id'],
                "_source": doc['index']['data']
            })

    return fixed_data, nonfixed_data
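# --- Data-shape sketch (illustrative only; the values are invented placeholders) ---
# bulk_to_elasticsearch_fix_errors() receives "errors" keyed by record _id, where each
# value is the item reported by elasticsearch.helpers.parallel_bulk() for a failed
# action, with the original "_source" injected under ['index']['data'] by
# bulk_to_elasticsearch() before the call.
_EXAMPLE_FAILED_RECORD = {
    "3f2c9a7e-0000-0000-0000-000000000000": {
        "index": {
            "_index": "case_001",
            "_type": "_doc",
            "_id": "3f2c9a7e-0000-0000-0000-000000000000",
            "error": {
                "type": "mapper_parsing_exception",
                "reason": "failed to parse field [Data.timestamp] of type [date] in document with id ..."
            },
            "data": {"Data": {"timestamp": "not-a-date"}, "machine": "WS01", "data_type": "security_events"}
        }
    }
}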
def bulk_to_elasticsearch(self, bulk_queue, indx, chunk_size):
    try:
        errors = {}     # dictionary of failed records (original data and error info)
        failed = []     # IDs of the failed records
        successed = []  # IDs of the successfully pushed records
        logger.logger(level=logger.DEBUG, type="elasticsearch",
                      message="Index [" + indx + "]: bulk push to ES, default chunk[" + str(chunk_size) + "]: ",
                      reason="number of records: " + str(len(bulk_queue)))

        # use helpers to push the data to elasticsearch
        for ok, item in helpers.parallel_bulk(self.es_db, bulk_queue,
                                              chunk_size=chunk_size,
                                              raise_on_error=False,
                                              raise_on_exception=False):
            if not ok:
                errors[item['index']['_id']] = item
                logger.logger(level=logger.WARNING, type="elasticsearch",
                              message="Index [" + indx + "]: Failed pushing record: ",
                              reason=str(item))
                failed.append(item['index']['_id'])
            else:
                successed.append(item['index']['_id'])

        if len(failed):
            logger.logger(level=logger.WARNING, type="elasticsearch",
                          message="Index [" + indx + "]: Failed pushing [" + str(len(failed)) + "] records, trying to fix the issue")

            # attach the original data to each failed record by its ID
            for data in bulk_queue:
                try:
                    errors[data['_id']]['index']['data'] = data['_source']
                    logger.logger(level=logger.DEBUG, type="elasticsearch",
                                  message="Index [" + indx + "]: get data for failed record [" + data['_id'] + "]",
                                  reason=str(errors[data['_id']]))
                except KeyError:
                    # the record is not in the errors list, skip it
                    continue
                logger.logger(level=logger.WARNING, type="elasticsearch",
                              message="Index [" + indx + "]: Failed pushing record: ",
                              reason=str(data['_id']))

            fixed_errors, nonfixed_errors = self.bulk_to_elasticsearch_fix_errors(indx, errors)
            failed = nonfixed_errors

            # retry the records whose issues could be fixed
            if len(fixed_errors):
                logger.logger(level=logger.DEBUG, type="elasticsearch",
                              message="Index [" + indx + "]: fixed issue of [" + str(len(fixed_errors)) + "] records, retrying to push them")
                repush_failed_errors = self.bulk_to_elasticsearch(fixed_errors, indx, chunk_size)
                if repush_failed_errors[0]:
                    successed += repush_failed_errors[3]
                    failed += repush_failed_errors[2]

        return [True, "Pushed [" + str(len(successed)) + "] records to [" + indx + "] index", failed, successed]

    # if the connection to Elasticsearch timed out, retry the whole batch
    except elasticsearch.exceptions.ConnectionTimeout:
        logger.logger(level=logger.WARNING, type="elasticsearch",
                      message="Index [" + indx + "]: Failed to push the records, retrying",
                      reason="Connection to Elasticsearch timed out")
        return self.bulk_to_elasticsearch(bulk_queue, indx, chunk_size)

    except Exception as e:
        logger.logger(level=logger.ERROR, type="elasticsearch",
                      message="Failed pushing the records, unexpected error",
                      reason=str(e))
        return [False, "Failed pushing [" + str(len(bulk_queue)) + "] records to [" + indx + "] index", bulk_queue, []]
def query(self, indexname, body, count=3):
    count -= 1
    indexname = indexname.lower()
    body["track_total_hits"] = True
    logger.logger(level=logger.DEBUG, type="elasticsearch",
                  message="Query to index [" + indexname + "]",
                  reason=json.dumps(body))

    filter_path = ['hits.hits._source.Data', 'hits.total.value', 'aggregations.*.buckets']
    try:
        # search_res = self.es_db.search(index=indexname, body=body, filter_path=filter_path)
        search_res = self.es_db.search(index=indexname, body=body)
        return [True, search_res]

    except elasticsearch.RequestError as e:
        reason = e.info['error']['reason']
        logger.logger(level=logger.WARNING, type="elasticsearch",
                      message="Query [" + indexname + "] failed [RequestError]",
                      reason=reason)

        # if the problem is in the shards
        if reason == "all shards failed":
            for shard in e.info['error']['failed_shards']:
                if 'caused_by' in shard['reason'].keys():
                    shard_reason = shard['reason']['caused_by']['reason']
                else:
                    shard_reason = shard['reason']['reason']

                # if the sort field is a text field and not sortable, retry using its ".keyword" sub-field
                if shard_reason.startswith("Text fields are not optimised for operations that require per-document field data like aggregations and sorting, so these operations are disabled by default"):
                    if "sort" in body.keys():
                        field = list(body['sort'].keys())[0]
                        order = body['sort'][field]['order']
                        body['sort'] = {field + ".keyword": {'order': order}}
                        logger.logger(level=logger.INFO, type="elasticsearch",
                                      message="Query [" + indexname + "], the sort field is not sortable, retrying with the .keyword sub-field")
                        return self.query(indexname, body, count)

                # if the query expands to more fields than the default limit, increase the limit
                match = re.match(r'field expansion (for \[.*\] )?matches too many fields, limit: ([0-9]+), got: ([0-9]+)', shard_reason)
                if match is not None:
                    max_field_num = int(match.groups()[1]) + 100
                    inc = self.es_db.indices.put_settings(
                        index=indexname,
                        body='{ "index" : { "query": { "default_field" : ' + str(max_field_num) + '} } }')
                    if inc["acknowledged"]:
                        logger.logger(level=logger.INFO, type="elasticsearch",
                                      message="Query [" + indexname + "] max query fields number increased to " + str(max_field_num))
                        if count != 0:
                            return self.query(indexname, body, count)
                        else:
                            return [False, "exceeded the number of tries to fix the issue, field expansion matches too many fields"]
                    else:
                        logger.logger(level=logger.ERROR, type="elasticsearch",
                                      message="Query [" + indexname + "] Failed increasing the max query fields number")
                    continue

                # if the result window is too large, increase the max_result_window
                match = re.match(r'Result window is too large, from \+ size must be less than or equal to: \[([0-9]+)\] but was \[([0-9]+)\].*', shard_reason)
                if match is not None:
                    max_result_window = int(match.groups()[1]) + 1000
                    inc = self.es_db.indices.put_settings(
                        index=indexname,
                        body='{ "index" : { "max_result_window" : ' + str(max_result_window) + ' } }')
                    if inc["acknowledged"]:
                        logger.logger(level=logger.INFO, type="elasticsearch",
                                      message="Query [" + indexname + "] result window increased to " + str(self.get_max_result_window(indexname)))
                        if count != 0:
                            return self.query(indexname, body, count)
                        else:
                            return [False, "exceeded the number of tries to fix the issue, Result window is too large"]
                    else:
                        logger.logger(level=logger.ERROR, type="elasticsearch",
                                      message="Query [" + indexname + "] Failed increasing the result window")
                    continue
                else:
                    # none of the known shard failures matched
                    logger.logger(level=logger.ERROR, type="elasticsearch",
                                  message="Query [" + indexname + "] failed [RequestError]",
                                  reason=shard_reason)
        else:
            logger.logger(level=logger.ERROR, type="elasticsearch",
                          message="Query [" + indexname + "] failed [RequestError]",
                          reason=json.dumps(e.info))
        res = [False, reason]

    except elasticsearch.ConnectionError as e:
        logger.logger(level=logger.ERROR, type="elasticsearch",
                      message="Query [" + indexname + "] failed [ConnectionError]",
                      reason=e.info)
        res = [False, 'Failed to connect to elasticsearch']

    except elasticsearch.TransportError as e:
        reason = str(e)
        logger.logger(level=logger.ERROR, type="elasticsearch",
                      message="Query [" + indexname + "] failed [TransportError]",
                      reason=reason)
        logger.logger(level=logger.ERROR, type="elasticsearch",
                      message="Query [" + indexname + "] failed [TransportError]",
                      reason=json.dumps(e.info))
        res = [False, reason]

    except elasticsearch.ElasticsearchException as e:
        reason = str(e)
        logger.logger(level=logger.ERROR, type="elasticsearch",
                      message="Query [" + indexname + "] failed [ElasticsearchException]",
                      reason=reason)
        logger.logger(level=logger.ERROR, type="elasticsearch",
                      message="Query [" + indexname + "] failed [ElasticsearchException]",
                      reason=json.dumps(e.info))
        res = [False, reason]

    except Exception as e:
        print(str(e))
        res = [False, str(e)]
        logger.logger(level=logger.ERROR, type="elasticsearch",
                      message="Query [" + indexname + "] failed [Exception]",
                      reason=str(e))

    return res
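# --- Usage sketch (illustrative only; the instance name "db", the index and the field
# names are assumptions, not part of this module). It builds a body of the shape
# query() expects (query() adds "track_total_hits" itself) and unpacks the
# [status, result] list it returns.
def _example_query(db):
    body = {
        "query": {"query_string": {"query": "Data.EventID:4624"}},
        "sort": {"Data.@timestamp": {"order": "desc"}},   # retried as .keyword if not sortable
        "from": 0,
        "size": 10,
    }
    status, result = db.query("case_001", body)
    if status:
        return result["hits"]["hits"]   # list of matching documents
    return []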
from flask import Blueprint, render_template, redirect, request, url_for, jsonify, flash
from flask.views import MethodView
from flask_mongoengine.wtf import model_form

from app import app, brain, db
from app.models import Muse, Tweet, Config, Doc
from app.auth import requires_auth
from app.forms import TweetingForm

# Logging
from app.logger import logger
logger = logger(__name__)


# Landing page
@app.route('/')
@app.route('/index')
def index():
    return render_template('index.html', speech=brain.MKV.generate())


@app.route('/generate')
def generate():
    return render_template('generate.html', speech=brain.MKV.generate())


@app.route('/generate_', methods=['GET', 'POST'])
@requires_auth
def generate_():
    form = TweetingForm()
    if form.validate_on_submit():
        flash('Tweet twoot')
        brain.twitter.tweet(form.tweet.data)
        return redirect('/generate_')
    return render_template('generate_.html', form=form, speech=brain.MKV.generate())