def compute_similarities(text, models, count=None):
    """Finds items that are similar to the specified text

    The simhash of ``text`` is compared against the simhash of every
    entry in ``models``. Entries whose hash value equals that of the
    text are skipped, and so are entries whose similarity score is below
    the module-level ``similarity_threshold``.

    :param text: The text to be used for comparison
    :param models: The list of models to be compared against text
                   Each of the entries should have ``id`` and ``simhash``
                   properties and an ``as_dict()`` method whose result
                   contains a ``simhash`` key
    :param count: The maximum no. of similar items to return. When
                  ``None``, the module-level ``max_similar_messages``
                  is used
    :returns: A list of dicts (the ``as_dict()`` form of each matching
              model, minus ``simhash``, plus a ``score`` entry) ordered
              by descending score. Fewer than ``count`` items are
              returned when fewer candidates qualify
    """
    # Get the simhash of the submitted message
    _hash = simhash(util.unicodeToAscii(text))
    # Loop invariant: convert the reference hash only once
    hash_value = long(_hash)

    candidates, scores = {}, []
    # TODO: Investigate ways of speeding this up - complexity is O(n)
    for model in models:
        target = simhash(hash=long(model.simhash))
        # Entries with exactly the same hash value are not reported
        if long(target) == hash_value:
            continue

        similarity = _hash.similarity(target)
        if similarity >= similarity_threshold:
            scores.append((model.id, similarity))
            candidates[model.id] = model

    if not scores:
        return []

    # Best matches first
    scores.sort(key=lambda x: x[1], reverse=True)

    result_size = max_similar_messages if count is None else count

    retval = []
    # Slice rather than index by range(result_size): there may be fewer
    # candidates than the requested size, which used to raise IndexError.
    # max(..., 0) keeps a negative size meaning "no results".
    for model_id, score in scores[:max(result_size, 0)]:
        message_dict = candidates[model_id].as_dict()
        del message_dict['simhash']
        message_dict['score'] = score
        retval.append(message_dict)
    return retval
def add_message(deployment_id):
    """Adds a new message for the deployment in :deployment_id

    The input parameters (JSON request body) are:
        origin_message_id: id of the message in the originating system
        content: string - the text of the message

    The request is aborted with a 400 when the JSON body is missing or
    empty, or when either input parameter is absent, and with a 404 when
    the deployment cannot be found.

    :param deployment_id: the id of the deployment
    :returns: the JSON representation of the newly created message
    """
    if not request.json:
        abort(400)

    _post = request.json
    # Both fields are read below, so the request is invalid when EITHER
    # one is missing (the previous ``and`` only rejected requests that
    # lacked both, leading to a KeyError further down)
    if 'origin_message_id' not in _post or 'content' not in _post:
        abort(400)

    # Does the deployment exist
    deployment = Deployment.by_id(deployment_id)
    if deployment is None:
        abort(404)

    # Compute the simhash on the message content
    _hash = simhash(util.unicodeToAscii(_post['content']))
    message = Message(deployment_id=deployment_id,
                      origin_message_id=_post['origin_message_id'],
                      content=_post['content'],
                      simhash=str(_hash))
    message.create()
    return jsonify(message.as_dict())
def add_report(deployment_id):
    """Adds a new report to the deployment specified by the
    ``deployment_id`` parameter

    Input parameters (JSON request body):
        origin_report_id: id of the report in the originating system
        title: string - Title of the report
        description: string - Description of the report
        categories: array of integers - category ids

    The request is aborted with a 400 when the JSON body is missing or
    empty, when any input parameter is absent, when the report has
    already been registered for the deployment, or when none of the
    specified categories is registered for the deployment.

    :param deployment_id: the id of the deployment
    :returns: the JSON representation of the newly created report
    """
    verify_deployment(deployment_id)

    # Reject requests without a JSON body up front; the field checks
    # below would otherwise fail with a TypeError on ``None``
    if not request.json:
        abort(400)

    errors = {}
    _post = request.json
    # Check for fields
    if 'origin_report_id' not in _post:
        errors['origin_report_id'] = 'The report id is missing'
    if 'title' not in _post:
        errors['title'] = 'The report title is missing'
    if 'description' not in _post:
        errors['description'] = 'The report description is missing'
    if 'categories' not in _post or len(_post['categories']) == 0:
        errors['categories'] = 'The report categories must be specified'

    # Did we encounter any errors?
    if errors:
        app.logger.error("There are some errors in the request %r" % errors)
        abort(400)

    # Does the specified report already exist?
    _report = db.session.query(Report).\
        filter(Report.origin_report_id == _post['origin_report_id'],
               Report.deployment_id == deployment_id).first()
    if _report is not None:
        app.logger.error("The report %s has already been registered" %
                         _post['origin_report_id'])
        abort(400)

    # Get the categories
    categories = db.session.query(Category).\
        filter(Category.deployment_id == deployment_id,
               Category.origin_category_id.in_(_post['categories'])).all()

    # Have the specified category ids been registered?
    if not categories:
        app.logger.error("The specified categories are invalid")
        abort(400)

    # Compute the simhash on the report description
    _hash = simhash(util.unicodeToAscii(_post['description']))
    report = Report(deployment_id=deployment_id,
                    origin_report_id=_post['origin_report_id'],
                    title=_post['title'],
                    description=_post['description'],
                    simhash=str(_hash))
    # Create the report
    report.create()

    # Save the report categories; the report must be created first
    # because each link row is built from ``report.id``
    report_categories = []
    for category in categories:
        rc = ReportCategory(report_id=report.id,
                            category_id=category.id)
        report_categories.append(rc)
    ReportCategory.create_all(report_categories)

    return jsonify(report.as_dict())