def dumpcmd(args):
    log = logging.getLogger("dumpcmd")
    model = CommunitiesModel().load(args.input)
    log.info("Initializing the sha1 resolver")
    communities = BatchedCommunityResolver(model, args.batch, get_db(args),
                                           args.tables["meta"])
    stream_template(args.template, sys.stdout, communities=communities, model=model,
                    model_path=os.path.abspath(args.input))
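
def _example_dumpcmd_usage():
    # Hypothetical illustration, not part of the original module: how dumpcmd
    # might be driven outside the CLI. The Namespace fields mirror the
    # attributes the function reads; the values shown (paths, batch size,
    # table names) are assumptions, and get_db() will additionally need
    # whatever connection settings it expects on `args`.
    from argparse import Namespace
    args = Namespace(
        input="communities.asdf",        # path to a saved CommunitiesModel (assumed)
        batch=1024,                      # resolver batch size (assumed)
        template="community.md.jinja2",  # template consumed by stream_template (assumed)
        tables={"meta": "meta"},         # table name mapping (assumed)
    )
    dumpcmd(args)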
def find_connected_components(args):
    log = logging.getLogger("graph")
    session = get_db(args)
    table = args.tables["hashtables"]
    rows = session.execute("SELECT DISTINCT hashtable FROM %s" % table)
    hashtables = sorted(r.hashtable for r in rows)
    log.info("Detected %d hashtables", len(hashtables))
    # Read buckets from the database
    buckets = []
    element_ids = {}
    prev_len = 0
    for hashtable in hashtables:
        rows = session.execute(
            "SELECT sha1, value FROM %s WHERE hashtable=%d" % (table, hashtable))
        band = None
        bucket = []
        for row in rows:
            eid = element_ids.setdefault(row.sha1, len(element_ids))
            if row.value != band:
                # Rows come back grouped by value, so a change of value
                # starts a new bucket
                if band is not None:
                    buckets.append(bucket.copy())
                    bucket.clear()
                band = row.value
            bucket.append(eid)
        if bucket:
            buckets.append(bucket)
        log.info("Hashtable %d: fetched %d buckets", hashtable, len(buckets) - prev_len)
        prev_len = len(buckets)
    element_to_buckets = [[] for _ in range(len(element_ids))]
    for i, bucket in enumerate(buckets):
        for element in bucket:
            element_to_buckets[element].append(i)
    # Statistics about the buckets; violated sanity checks are logged as errors
    levels = (logging.ERROR, logging.INFO)
    log.info("Number of buckets: %d", len(buckets))
    log.log(levels[len(element_ids) >= len(buckets[0])],
            "Number of elements: %d", len(element_ids))
    epb = sum(map(len, buckets)) / len(buckets)
    log.log(levels[epb >= 1], "Average number of elements per bucket: %.1f", epb)
    nb = min(map(len, element_to_buckets))
    log.log(levels[nb == len(hashtables)], "Min number of buckets per element: %s", nb)
    nb = max(map(len, element_to_buckets))
    log.log(levels[nb == len(hashtables)], "Max number of buckets per element: %s", nb)
    log.info("Running CC analysis")
    # Connect components
    connected_components_element = _find_connected_component(buckets, element_to_buckets)
    log.info("CC number: %d", len(connected_components_element))
    log.info("Writing %s", args.output)
    ConnectedComponentsModel() \
        .construct(connected_components_element, element_to_buckets, element_ids) \
        .save(args.output)
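
# `_find_connected_component` is defined elsewhere in the module. The sketch
# below is a hypothetical reference implementation, assuming it returns a
# mapping {component id: [element ids]}, so that its len() is the number of
# connected components, as the log message above suggests.
def _find_connected_component_sketch(buckets, element_to_buckets):
    from collections import deque

    unvisited = set(range(len(element_to_buckets)))
    components = {}
    while unvisited:
        # Breadth-first search over the element<->bucket bipartite graph:
        # two elements are connected if they share a bucket, directly or
        # transitively.
        component = []
        queue = deque((unvisited.pop(),))
        while queue:
            element = queue.popleft()
            component.append(element)
            for i in element_to_buckets[element]:
                for neighbor in buckets[i]:
                    if neighbor in unvisited:
                        unvisited.remove(neighbor)
                        queue.append(neighbor)
        components[len(components)] = component
    return components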
def query(args):
    log = logging.getLogger("query")
    session = get_db(args)
    tables = args.tables
    if args.id:
        rows = session.execute(
            "SELECT hashtable, value FROM %s WHERE sha1='%s'"
            % (tables["hashtables2"], args.id))
        bands = [(r.hashtable, r.value) for r in rows]
    else:
        # args.file
        if not args.feature:
            log.critical(
                "-f / --feature must be specified at least once in file query mode")
            return 1
        if not args.params:
            log.critical("-p / --params must be specified in file query mode")
            return 1
        wmh, bag = hash_file(args)
        htnum, band_size = calc_hashtable_params(
            args.threshold, len(wmh), args.false_positive_weight,
            args.false_negative_weight)
        log.info("Number of hash tables: %d", htnum)
        log.info("Band size: %d", band_size)
        bands = [(i, bytearray(wmh[i * band_size:(i + 1) * band_size].data))
                 for i in range(htnum)]
    similar = set()
    log.info("Looking for similar items")
    for i, band in bands:
        rows = session.execute(
            "SELECT sha1 FROM %s WHERE hashtable=%d AND value=0x%s"
            % (tables["hashtables"], i, codecs.encode(band, "hex").decode()))
        similar.update(r.sha1 for r in rows)
    log.info("Fetched %d items", len(similar))
    if args.precise:
        # Precise bags
        vocab = OrderedDocumentFrequencies().load(args.docfreq)
        log.info("Calculating the precise result")
        if args.id:
            rows = session.execute(
                "SELECT item, value FROM %s WHERE sha1='%s'"
                % (tables["bags"], args.id))
            bag = numpy.zeros(len(vocab), dtype=numpy.float32)
            for row in rows:
                bag[vocab.order[row.item]] = row.value
        # Fetch the other bags from the DB
        precise = []
        for x in similar:
            rows = session.execute(
                "SELECT item, value FROM %s WHERE sha1='%s'" % (tables["bags"], x))
            other_bag = numpy.zeros(len(vocab), dtype=numpy.float32)
            for row in rows:
                other_bag[vocab.order[row.item]] = row.value
            if weighted_jaccard(bag, other_bag) >= args.threshold:
                precise.append(x)
        log.info("Survived: %.2f", len(precise) / len(similar))
        similar = precise
    if args.id:
        try:
            similar.remove(args.id)
        except (KeyError, ValueError):
            # similar is a set without --precise and a list with it
            pass
    similar = [s.split("@")[1] for s in similar]
    stream_template(args.template, sys.stdout,
                    size=len(similar),
                    origin=args.id if args.id else os.path.abspath(args.file),
                    items=BatchedHashResolver(similar, args.batch, session,
                                              tables["meta"]))
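
# `weighted_jaccard` is imported from elsewhere. Assuming it follows the
# conventional definition, a minimal sketch over the dense float32 bag vectors
# built above: J_w(a, b) = sum_i min(a_i, b_i) / sum_i max(a_i, b_i).
def weighted_jaccard_sketch(vec1, vec2):
    import numpy

    # Element-wise minima over element-wise maxima; equals 1.0 for identical
    # vectors and 0.0 for disjoint supports.
    return numpy.minimum(vec1, vec2).sum() / numpy.maximum(vec1, vec2).sum()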