def __init__(self, k, wipe_segs=False):
    self.total_edges_inserted = 0
    self.k = k  # max nodes per segment
    self.consumer = kafka.KafkaConsumer('se')
    self.producer = kafka.KafkaProducer()
    self.gremlin = gremlin_query.Runner()
    self.next_seg_id = 1 + self.max_seg_id()
    self.next_node_id = 1 + self.max_node_id()
    if wipe_segs:
        self.drop_all_existing_segments()
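# Usage sketch, not from the original source: "Segmenter" is a hypothetical
# name for the class enclosing __init__ above (this excerpt does not show it),
# and a Kafka broker plus a Titan/Gremlin server must be reachable for
# construction to succeed.
def demo_segmenter():
    seg = Segmenter(k=50, wipe_segs=False)  # hypothetical class name
    print('next segment id:', seg.next_seg_id)
    print('next node id:', seg.next_node_id)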
def drop_all():
    # Even for very small transactions, like
    # "g.V().order().limit(20).drop().iterate()", this sometimes reports
    #   GremlinServerError: Code [597]: SCRIPT_EVALUATION.
    #   The vertex or type has been removed [v[1831104]]
    # In that case, verify no background tasks are inserting, and if
    # necessary then stop, run /opt/titan/bin/titan.sh clean, and start again.
    with gremlin_query.Runner() as gremlin:
        for x in ["V().has('segment:name',within('byPID','byTime')).has(label,'Segment')"]:
            for q in ['g.%s.drop().iterate()' % x,
                      'graph.tx().commit()']:
                print(q, gremlin.fetch(q))
def get_label_counts(with_edges=False):
    '''Queries titan with read throughput of ~2700 nodes/sec.'''
    queries = ['g.V().groupCount().by(label())']
    if with_edges:
        queries.append('g.E().groupCount().by(label())')
    cnt = {}
    with gremlin_query.Runner() as gremlin:
        for query in queries:
            for msg in gremlin.fetch(query):
                if msg.data:
                    assert len(msg.data) == 1
                    cnt.update(msg.data[0])
    cnt['total'] = sum(cnt.values())
    return sorted(['%6d %s' % (cnt[k], k) for k in cnt.keys()])
def get_eventType_counts():
    queries = ["g.V().has('eventType').groupCount().by('eventType')"]
    cnt = {}
    with gremlin_query.Runner() as gremlin:
        for query in queries:
            for msg in gremlin.fetch(query):
                if msg.data:
                    assert len(msg.data) == 1
                    cnt.update(msg.data[0])
    cnt['total'] = sum(cnt.values())
    return sorted([
        '%6d %s' % (cnt[k],
                    k if k == 'total' else cdm.enums.Event(int(k)).name)
        for k in cnt.keys()
    ])
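# Usage sketch, not from the original source: prints the two histograms above,
# assuming a Titan/Gremlin server is reachable by gremlin_query.Runner. Each
# line looks like "  1234 Segment" or "  1234 EVENT_READ", with a final
# "total" row.
def demo_counts():
    for line in get_label_counts(with_edges=True):
        print(line)
    for line in get_eventType_counts():
        print(line)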
def renderSegments(verbose=False):
    with gremlin_query.Runner() as gremlin:
        # QUERYV and QUERYE are module-level query strings defined elsewhere
        # in this module; QUERYV selects segment vertices and QUERYE,
        # formatted with a vertex id, presumably selects its outgoing edges.
        vertices = gremlin_properties.fetch(gremlin, QUERYV)
        graph = {}
        for v in vertices:
            val = {}
            val['criteria'] = v['pid']
            val['name'] = v['segment:name']
            edges = gremlin_properties.fetch(gremlin, QUERYE.format(v.getId()))
            val['edges_out'] = [e.getId() for e in edges]
            graph[v.getId()] = val
        dot = toDot(graph)
        if verbose:
            print(dot)
        dot.format = 'svg'
        dot.render('static/seggraph.dot', view=False)
import datetime
import os
import sys

sys.path.append(os.path.expanduser('~/adapt/tools'))
import gremlin_query


def report(gremlin):
    stamps = []
    # Dates before 1971 fail the sanity check and are rejected.
    sane = 365 * 86400 * 1e6  # one year past the epoch, in microseconds
    queries = [
        "g.V().values('startedAtTime').is(gt(%d)).min()" % sane,
        "g.V().values('startedAtTime').max()",
    ]
    for query in queries:
        for msg in gremlin.fetch(query):
            if msg.data:
                for usec in msg.data:
                    stamp = datetime.datetime.utcfromtimestamp(usec / 1e6)
                    print(stamp)
                    stamps.append(stamp)
    delta = stamps[1] - stamps[0]
    print(delta, 'elapsed time')


if __name__ == '__main__':
    with gremlin_query.Runner() as gremlin:
        report(gremlin)
def execute_graph_query(query):
    with gremlin_query.Runner() as g:
        return str(g.fetch(query))
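# Usage sketch, not from the original source: fetch() yields raw response
# messages, so the string returned above is a repr meant for debugging,
# not for parsing.
def demo_graph_query():
    print(execute_graph_query('g.V().count()'))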
def compute_view_and_save(self):
    # Extract features: assemble one large Groovy script that first binds
    # IDS to the view's node ids, then computes one sub-list per feature.
    keys = sorted(self.features_queries.keys())
    QUERY = self.node_ids_query + ";if(IDS!=[]){"
    for i in range(len(keys)):
        if isinstance(self.features_queries[keys[i]], dict):
            # Compound features carry 'first'/'second'/'third' sub-queries
            # that are later combined by an 'operator'.
            QUERY += self.split_Q_to_many(
                "yf{}".format(i), self.features_queries[keys[i]]['first'])
            if 'second' in self.features_queries[keys[i]]:
                QUERY += self.split_Q_to_many(
                    "ys{}".format(i), self.features_queries[keys[i]]['second'])
            if 'third' in self.features_queries[keys[i]]:
                QUERY += self.split_Q_to_many(
                    "yt{}".format(i), self.features_queries[keys[i]]['third'])
        else:
            QUERY += self.split_Q_to_many("x{}".format(i),
                                          self.features_queries[keys[i]])
    # The script's value is [IDS, feature0, feature1, ...]; compound
    # features contribute a nested [yf, ys, yt] sub-list.
    QUERY += "[IDS"
    for i in range(len(keys)):
        if isinstance(self.features_queries[keys[i]], dict):
            QUERY += ",[yf{}".format(i)
            if 'second' in self.features_queries[keys[i]]:
                QUERY += ",ys{}".format(i)
            if 'third' in self.features_queries[keys[i]]:
                QUERY += ",yt{}".format(i)
            QUERY += "]"
        else:
            QUERY += ",x{}".format(i)
    QUERY += "]}else [];"

    log.info("Extracting features for " + self.view_type + "...")
    with gremlin_query.Runner() as gremlin:
        try:
            # NOTE: `bindings` is assumed to be defined elsewhere in this module.
            result = gremlin.fetch_data(QUERY, bindings=bindings)
        except Exception:
            log.exception("Exception at query:" + QUERY)
            return False
    if result == []:
        log.info("Found 0 " + self.view_type + " nodes")
        return False

    log.info("Writing " + self.view_type + " view features to file: "
             + self.feature_file)
    with open(self.feature_file, "w") as f:
        f.write("id")
        for k in keys:
            f.write("," + k)
        f.write("\n")
        for i in range(len(result[0])):
            f.write(str(result[0][i]))
            j = 1
            for k in keys:
                res = 0
                try:
                    if isinstance(self.features_queries[k], dict):
                        op = self.features_queries[k]['operator']
                        if op == 'subTime':
                            # Difference of two microsecond timestamps, in seconds.
                            res = (result[j][0][i] - result[j][1][i]) / 1.0e6
                        elif op == 'RELUTime':
                            # Same as subTime, but clamped at zero.
                            res = (result[j][0][i] - result[j][1][i]) / 1.0e6
                            if res < 0:
                                res = 0
                        elif op == 'div(SubTime)':
                            res = result[j][0][i] / (
                                (result[j][1][i] - result[j][2][i]) / 1.0e6)
                        elif op == '(SubTime)div':
                            res = ((result[j][0][i] - result[j][1][i])
                                   / 1.0e6) / result[j][2][i]
                        else:
                            log.info("Unrecognized operator: " + op)
                    else:
                        res = result[j][i]
                except Exception:
                    log.exception("Exception: i=" + str(i) + ", j=" + str(j)
                                  + ", k=" + k)
                f.write(',' + str(res))
                j += 1
            f.write('\n')
    log.info("Writing " + self.feature_file + " Finished")
    return True
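# Illustration, not from the original source: with one plain feature query
# ("x0") and one compound query carrying 'first' and 'second' parts ("yf1",
# "ys1"), the Groovy script assembled above has roughly this shape
# (whitespace added; the actual sub-queries come from split_Q_to_many):
#
#   IDS = ...node_ids_query...;
#   if (IDS != []) {
#       x0 = ...; yf1 = ...; ys1 = ...;
#       [IDS, x0, [yf1, ys1]]
#   } else [];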
def attach_scores_to_db(self, view_stats, percentage=5.0):
    # Count data rows; the CSV's header line is excluded because
    # enumerate() starts at 0. Initializing i guards against an empty file.
    i = 0
    with open(self.score_file) as f:
        for i, _ in enumerate(f):
            pass
    total_nodes = i
    view_stats.number_nodes = total_nodes
    # Attach scores to the top percentage of nodes, capped at 500.
    cutoff = min(math.ceil(total_nodes * (percentage / 100.0)), 500)
    view_stats.number_nodes_attached = cutoff
    max_score = 0
    max_id = 0
    max_feature = None
    cnt = 0
    QUERY = ""
    binds = {
        'atype': 'anomalyType',
        'ascore': 'anomalyScore',
        'sin': 'segment:includes',
        'afeature': 'anomalyFeature'
    }
    with gremlin_query.Runner() as gremlin:
        with open(self.score_file, 'r') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                QUERY += ("x={id};t='{type}';s={score};"
                          "g.V(x).property(atype,t).next();"
                          "g.V(x).property(ascore,s).next();").format(
                              id=row['id'],
                              type=self.view_type,
                              score=row['anomaly_score'])
                # Build a human-readable feature summary like
                # "Rnk:1/5000[featA:0.1,featB:0.2]".
                feature = "["
                for k in sorted(row.keys()):
                    if k != 'id' and k != 'anomaly_score':
                        if feature != "[":
                            feature += ","
                        feature += k + ":" + str(row[k])
                feature += "]"
                feature = "Rnk:" + str(cnt + 1) + "/" + str(total_nodes) + feature
                QUERY += "f='{feat}';g.V(x).property(afeature,f).next();".format(
                    feat=feature)
                log.info("Adding anomaly scores to id " + row['id'] + " ("
                         + self.view_type + ", " + row['anomaly_score'] + ")")
                if float(row['anomaly_score']) > max_score:
                    max_score = float(row['anomaly_score'])
                    max_id = row['id']
                    max_feature = feature
                cnt = cnt + 1
                if cnt >= cutoff:
                    break
                # Flush the accumulated script before it grows too large
                # for a single request.
                if len(QUERY) > 20000:
                    log.info("size of QUERY = " + str(len(QUERY)))
                    try:
                        log.info('Attaching anomaly score for ' + str(cnt) + ' nodes')
                        gremlin.fetch_data(QUERY, binds)
                        log.info('Anomaly score attachment done for '
                                 + str(cnt) + ' nodes')
                    except Exception:
                        log.exception("Exception at query:" + QUERY)
                    QUERY = ""
        # Propagate the single highest score up to any segments that
        # include the most anomalous node.
        QUERY += ("x={id};t='{type}';s={score};f='{feat}';"
                  "IDS=g.V(x).in(sin).id().toList().toArray();"
                  "if(IDS!=[]){{"
                  "g.V(IDS).property(atype,t).next();"
                  "g.V(IDS).property(ascore,s).next();"
                  "g.V(IDS).property(afeature,f).next();}};").format(
                      id=max_id, type=self.view_type,
                      score=max_score, feat=max_feature)
        log.info("size of QUERY = " + str(len(QUERY)))
        try:
            gremlin.fetch_data(QUERY, binds)
            log.info('Anomaly score attachment done for view ' + self.view_type)
        except Exception:
            log.exception("Exception at query:" + QUERY)
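# Illustration, not from the original source: each CSV row appends a Groovy
# fragment of roughly this shape to QUERY (the binds dict above maps atype,
# ascore, and afeature to the real property-key names; the id, type, and
# score values here are hypothetical):
#
#   x=1234;t='netflow';s=0.97;
#   g.V(x).property(atype,t).next();
#   g.V(x).property(ascore,s).next();
#   f='Rnk:1/5000[featA:0.1,featB:0.2]';
#   g.V(x).property(afeature,f).next();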