def invalid_extraction(entity_type, entity_value, domain):
    user = session_helper.get_user()
    user_name = user.get_user_name()
    org = session_helper.get_org()
    # the db helpers return 0 on success, hence the "== 0" convention used throughout this module
    success = db.mark_invalid_extracted_entity(user_name, entity_type, entity_value, domain, org) == 0
    return json.dumps(dict(success=success))
def add_irrelevant_trail_entity(domain, trail, entity):
    org = helper.get_org()
    if not db.does_irrelevant_entity_exist(org, domain, trail, entity):
        success = db.add_irrelevant_trail_entity(org, domain, trail, entity.encode("utf-8")) == 0
        if success:
            # notify downstream consumers that this trail term was flagged irrelevant
            kafka_producer.send_trail_term_message(org, domain, trail, entity, False)
        return json.dumps(dict(success=success))
    # entity was already flagged irrelevant; treat the repeat request as a no-op success
    return json.dumps(dict(success=True))
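# Expected responses from add_irrelevant_trail_entity (illustrative, values made up):
#   new entity, insert succeeds  -> '{"success": true}'  and a Kafka trail-term message is sent
#   entity already flagged       -> '{"success": true}'  with no insert and no Kafka message
#   insert fails (nonzero code)  -> '{"success": false}'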
def get_trail_entity_links(domain, trail):
    org = helper.get_org()
    links = json.dumps(dict(
        visited=db.get_visited_trail_entity_links(org, domain, trail),
        notVisited=db.get_not_visited_trail_entity_links(org, domain, trail)))
    return links
def getTimeWindow(users, trail=u'*'):
    org = helper.get_org()
    if trail == u'':
        trail = u'*'
    # log via tangelo rather than a bare print, matching the rest of the module
    tangelo.log('getTimeWindow(' + str(users) + ',' + str(trail) + ')')
    if len(users) > 0:
        users = users.split(",")
    else:
        users = []
    return json.dumps(datawake_mysql.getTimeWindow(org, users, trail))
def get(domain, trail):
    org = helper.get_org()
    trail_report = {}

    # get all starred urls for the trail
    for (url, rank) in db.getRankedUrls(org, trail, domain):
        trail_report[url] = {'rank': rank}

    # get the list of invalid entities for the domain
    markedEntities = set([])
    for (type, value) in db.get_marked_entities_for_domain(org, domain):
        markedEntities.add(value)

    # for each url get all extracted entities, dropping any marked invalid
    entity_data_connector = factory.get_entity_data_connector()
    all_entities = entity_data_connector.get_extracted_entities_from_urls(trail_report.keys())
    for url, featureDict in all_entities.iteritems():
        for type, values in featureDict.iteritems():
            filtered_values = []
            for value in values:
                if value not in markedEntities:
                    filtered_values.append(value)
            if len(filtered_values) > 0:
                try:
                    if 'auto_features' not in trail_report[url]:
                        trail_report[url]['auto_features'] = {}
                    trail_report[url]['auto_features'][type] = filtered_values
                except Exception:
                    tangelo.log("report generation error. skipping url.")
                    continue

    # for each url get any manually extracted entities
    for url in trail_report.keys():
        for featureObj in db.get_feedback_entities(org, domain, url):
            if 'manual_features' not in trail_report[url]:
                trail_report[url]['manual_features'] = {}
            if featureObj['type'] not in trail_report[url]['manual_features']:
                trail_report[url]['manual_features'][featureObj['type']] = []
            trail_report[url]['manual_features'][featureObj['type']].append(featureObj['value'])

    # for each url get any highlighted text
    for url in trail_report.keys():
        selections = db.getSelections(domain, trail, url, org)
        if len(selections) > 0:
            trail_report[url]['selections'] = selections

    result = {'trail': trail, 'urls': trail_report}
    return json.dumps(result, sort_keys=True, indent=4, separators=(',', ':'))
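# Shape of the trail report returned above (illustrative values only; the
# field names match the code, the data itself is made up):
#
# {
#     "trail": "trail-1",
#     "urls": {
#         "http://example.com/page": {
#             "rank": 3,
#             "auto_features": {"phone": ["555-0100"]},
#             "manual_features": {"email": ["jdoe@example.com"]},
#             "selections": ["highlighted text"]
#         }
#     }
# }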
def get_chart(users=u"", trail=u"*", domain=u""): org = helper.get_org() # tangelo.log('dataservice-get org=' + org + ' users=' + users + ' trail= ' + trail + ' domain=' + domain) if trail == u"": trail = u"*" if len(users) > 0: users = users.split(",") else: users = [] result = datawake_mysql.getHourlyBrowsePathCounts(org, users, trail, domain=domain) return json.dumps(dict(data=result))
def get(domain, trail, stars, newdomain):
    org = helper.get_org().upper()
    if not db.domain_exists(newdomain):
        db.add_new_domain(newdomain, 'auto generated domain from trail: ' + trail)

    features = set([])
    url_set = set([])
    stars = int(stars)

    # get all starred urls for the trail; with no star filter, include every browsed url
    for (url, rank) in db.getRankedUrls(org, trail, domain):
        url_set.add(url)
    if stars < 1:
        urls = db.getBrowsePathUrls(org, trail)
        for url in urls:
            url_set.add(url)

    # get the list of invalid entities for the domain
    markedEntities = set([])
    for (type, value) in db.get_marked_entities_for_domain(org, domain):
        markedEntities.add(value)

    # for each url get all extracted entities, skipping any marked invalid
    entity_data_connector = factory.get_entity_data_connector()
    all_entities = entity_data_connector.get_extracted_entities_from_urls(url_set)
    for url, featureDict in all_entities.iteritems():
        for type, values in featureDict.iteritems():
            type = type.replace(',', ' ')
            for value in values:
                if value not in markedEntities:
                    value = value.replace(',', ' ')
                    features.add(type + "\0" + value)

    # for each url get any manually extracted entities
    for url in url_set:
        for featureObj in db.get_feedback_entities(org, domain, url):
            type = featureObj['type'].replace(',', ' ')
            value = featureObj['value'].replace(',', ' ')
            features.add(type + "\0" + value)

    entity_data_connector.add_new_domain_items(map(lambda x: newdomain + '\0' + x, features))
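# The features set above holds "\0"-delimited type/value pairs, e.g.
# "phone\0555-0100" (values illustrative). add_new_domain_items then prefixes
# each pair with the new domain name, so stored items take the form
# "newdomain\0type\0value". Commas are replaced with spaces beforehand,
# presumably because the backing store treats commas as field delimiters.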
def get_trails(domain):
    org = helper.get_org()
    return get_trails_for_domain_and_org(org, domain)
def get_trail_based_entities(domain, trail):
    org = helper.get_org()
    entities = db.get_trail_based_entities(org, domain, trail)
    irrelevantEntities = db.get_irrelevant_trail_based_entities(org, domain, trail)
    return json.dumps(dict(entities=entities, irrelevantEntities=irrelevantEntities))
def fetch_entities(domain, url):
    org = session_helper.get_org()
    entities = db.get_feedback_entities(org, domain, url)
    return json.dumps(dict(entities=entities))
def marked_entities(domain):
    user = session_helper.get_user()
    user_name = user.get_user_name()
    org = session_helper.get_org()
    marked_entities_list = db.get_marked_entities(org, domain, user_name)
    return json.dumps(dict(marked_entities=marked_entities_list))
def get_url_entities(domain, trail, url):
    org = helper.get_org()
    results = json.dumps(dict(entities=db.get_entities_on_url(org, domain, trail, url)))
    tangelo.log_info(results)
    return results
def good_extraction(raw_text, entity_type, entity_value, url, domain):
    org = session_helper.get_org()
    success = db.add_extractor_feedback(org, domain, raw_text, entity_type, entity_value, url) == 0
    return json.dumps(dict(success=success))
def deleteUser(users, startdate, enddate):
    org = helper.get_org()
    tangelo.log('deleteUser(' + users + ',' + startdate + ',' + enddate + ')')
    datawake_mysql.deleteUserData(org, users, startdate, enddate)
    return json.dumps(dict(success=True))
def getTrails():
    org = helper.get_org()
    results = datawake_mysql.getTrailsWithUserCounts(org)
    # prepend an empty entry, presumably a blank/default row for the client dropdown
    results.insert(0, {})
    return json.dumps(results)
def listUsers():
    org = helper.get_org()
    return json.dumps(datawake_mysql.getActiveUsers(org))
def delete_link_from_trail(domain, trail, url):
    org = helper.get_org()
    success = db.delete_link_from_trail(org, domain, trail, url) == 0
    return json.dumps(dict(success=success))
def get_selections(domain, trail, url):
    org = helper.get_org()
    return json.dumps(dict(selections=db.getSelections(domain, trail, url, org)))
def getGraph(name, startdate=u'', enddate=u'', users=u'', trail=u'*', domain=u''):
    org = helper.get_org()
    if trail == u'':
        trail = u'*'
    userlist = map(lambda x: x.replace('"', '').strip(), users.split(','))
    userlist = filter(lambda x: len(x) > 0, userlist)
    # tangelo.log('getGraph( ' + str(name) + ',' + str(startdate) + ',' + str(enddate) + ',' + str(userlist) + ',' + str(trail) + ',' + str(domain) + ')')

    if name == 'browse path':
        graph = graph_helper.getBrowsePathEdges(org, startdate, enddate, userlist, trail, domain)
        return json.dumps(graph_helper.processEdges(graph['edges'], graph['nodes']))
    if name == 'browse path - with adjacent urls':
        graph = graph_helper.getBrowsePathAndAdjacentWebsiteEdgesWithLimit(org, startdate, enddate, 1, userlist, trail, domain)
        return json.dumps(graph_helper.processEdges(graph['edges'], graph['nodes']))
    if name == 'browse path - with adjacent urls min degree 2':
        graph = graph_helper.getBrowsePathAndAdjacentWebsiteEdgesWithLimit(org, startdate, enddate, 2, userlist, trail, domain)
        return json.dumps(graph_helper.processEdges(graph['edges'], graph['nodes']))
    if name == 'browse path - with adjacent phone #\'s':
        graph = graph_helper.getBrowsePathAndAdjacentPhoneEdgesWithLimit(org, startdate, enddate, 1, userlist, trail, domain)
        return json.dumps(graph_helper.processEdges(graph['edges'], graph['nodes']))
    if name == 'browse path - with adjacent email #\'s':
        graph = graph_helper.getBrowsePathAndAdjacentEmailEdgesWithLimit(org, startdate, enddate, 1, userlist, trail, domain)
        return json.dumps(graph_helper.processEdges(graph['edges'], graph['nodes']))
    if name == 'browse path - with text selections':
        graph = graph_helper.getBrowsePathWithTextSelections(org, startdate, enddate, userlist, trail, domain)
        return json.dumps(graph_helper.processEdges(graph['edges'], graph['nodes']))
    if name == 'browse path- with look ahead':
        graph = graph_helper.getBrowsePathWithLookAhead(org, startdate, enddate, userlist, trail, domain)
        return json.dumps(graph_helper.processEdges(graph['edges'], graph['nodes']))
    if name == 'browse path - with adjacent info':
        graph = graph_helper.getBrowsePathAndAdjacentInfoEdges(org, startdate, enddate, 1, userlist, trail, domain)
        return json.dumps(graph_helper.processEdges(graph['edges'], graph['nodes']))
    if name == 'OculusForensicRequest':
        rows = graph_helper.getOculusForensicGraph(org, startdate, enddate, userlist, trail, domain)
        return json.dumps(rows)
    return json.dumps(dict(nodes=[], links=[]))
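# The if-chain in getGraph could be collapsed into a dispatch table. A minimal
# sketch, assuming only the graph_helper functions already used above; the
# *WithLimit helpers take their degree as the fourth positional argument, so a
# lambda binds it. 'OculusForensicRequest' is left out because it returns rows
# rather than an edges/nodes graph. This is an illustrative refactor, not part
# of the original module; get_graph_via_dispatch is a hypothetical helper name.
GRAPH_BUILDERS = {
    'browse path':
        graph_helper.getBrowsePathEdges,
    'browse path - with adjacent urls':
        lambda o, s, e, u, t, d: graph_helper.getBrowsePathAndAdjacentWebsiteEdgesWithLimit(o, s, e, 1, u, t, d),
    'browse path - with adjacent urls min degree 2':
        lambda o, s, e, u, t, d: graph_helper.getBrowsePathAndAdjacentWebsiteEdgesWithLimit(o, s, e, 2, u, t, d),
    'browse path - with adjacent phone #\'s':
        lambda o, s, e, u, t, d: graph_helper.getBrowsePathAndAdjacentPhoneEdgesWithLimit(o, s, e, 1, u, t, d),
    'browse path - with adjacent email #\'s':
        lambda o, s, e, u, t, d: graph_helper.getBrowsePathAndAdjacentEmailEdgesWithLimit(o, s, e, 1, u, t, d),
    'browse path - with text selections':
        graph_helper.getBrowsePathWithTextSelections,
    'browse path- with look ahead':
        graph_helper.getBrowsePathWithLookAhead,
    'browse path - with adjacent info':
        lambda o, s, e, u, t, d: graph_helper.getBrowsePathAndAdjacentInfoEdges(o, s, e, 1, u, t, d),
}

def get_graph_via_dispatch(name, org, startdate, enddate, userlist, trail, domain):
    # look up the builder for the requested graph name; unknown names fall
    # through to the same empty-graph response getGraph returns
    builder = GRAPH_BUILDERS.get(name)
    if builder is None:
        return json.dumps(dict(nodes=[], links=[]))
    graph = builder(org, startdate, enddate, userlist, trail, domain)
    return json.dumps(graph_helper.processEdges(graph['edges'], graph['nodes']))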