Example #1
def dumpcmd(args):
    """Load a communities model and stream it through a template to stdout."""
    log = logging.getLogger("dumpcmd")
    model = CommunitiesModel().load(args.input)
    log.info("Initializing the sha1 resolver")
    # Resolve community members' sha1-s to their metadata in batches over the DB.
    communities = BatchedCommunityResolver(model, args.batch, get_db(args), args.tables["meta"])
    stream_template(args.template, sys.stdout, communities=communities, model=model,
                    model_path=os.path.abspath(args.input))
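The command reads everything it needs from a single `args` object. Below is a hypothetical invocation for illustration only: the attribute names mirror what `dumpcmd` accesses above, the values are placeholders, and the real CLI builds this namespace from argparse flags (plus whatever connection settings `get_db` expects).

from argparse import Namespace

# All values below are placeholders, not the project's real defaults.
args = Namespace(
    input="communities.asdf",         # model path for CommunitiesModel().load()
    batch=100,                        # batch size for BatchedCommunityResolver
    template="report.md.jinja2",      # template consumed by stream_template()
    tables={"meta": "meta"},          # table name mapping used by the command
)
dumpcmd(args)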
Example #2
def find_connected_components(args):
    log = logging.getLogger("graph")
    session = get_db(args)
    table = args.tables["hashtables"]
    rows = session.execute("SELECT DISTINCT hashtable FROM %s" % table)
    hashtables = sorted(r.hashtable for r in rows)
    log.info("Detected %d hashtables", len(hashtables))

    # Read the buckets from the database; rows within one hashtable are
    # assumed to arrive clustered by `value`, so equal values are adjacent.
    buckets = []
    element_ids = {}
    prev_len = 0
    for hashtable in hashtables:
        rows = session.execute(
            "SELECT sha1, value FROM %s WHERE hashtable=%d" % (table, hashtable))
        band = None
        bucket = []
        for row in rows:
            # Assign a dense integer id to each distinct sha1.
            eid = element_ids.setdefault(row.sha1, len(element_ids))
            if row.value != band:
                # The band value changed: flush the finished bucket.
                if band is not None:
                    buckets.append(bucket)
                    bucket = []
                band = row.value
            bucket.append(eid)
        if bucket:
            buckets.append(bucket)
        log.info("Fetched %d, %d buckets", hashtable, len(buckets) - prev_len)
        prev_len = len(buckets)

    element_to_buckets = [[] for _ in range(len(element_ids))]
    for i, bucket in enumerate(buckets):
        for element in bucket:
            element_to_buckets[element].append(i)

    # Statistics about the buckets; `levels[cond]` logs at INFO when the
    # sanity check holds and at ERROR otherwise.
    levels = (logging.ERROR, logging.INFO)
    log.info("Number of buckets: %d", len(buckets))
    log.log(levels[len(element_ids) >= len(buckets[0])],
            "Number of elements: %d", len(element_ids))
    epb = sum(map(len, buckets)) / len(buckets)
    log.log(levels[epb >= 1], "Average number of elements per bucket: %.1f", epb)
    # Every element is expected to fall into exactly one bucket per hashtable.
    nb = min(map(len, element_to_buckets))
    log.log(levels[nb == len(hashtables)], "Min number of buckets per element: %s", nb)
    nb = max(map(len, element_to_buckets))
    log.log(levels[nb == len(hashtables)], "Max number of buckets per element: %s", nb)
    log.info("Running CC analysis")

    # Connect components
    connected_components_element = _find_connected_component(buckets, element_to_buckets)
    log.info("CC number: %d", len(connected_components_element))

    log.info("Writing %s", args.output)
    ConnectedComponentsModel() \
        .construct(connected_components_element, element_to_buckets, element_ids) \
        .save(args.output)
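The helper `_find_connected_component` is not part of this listing. Below is a minimal sketch of a compatible implementation, assuming it performs a flood fill over the bipartite element/bucket graph built above; this is an illustration under that assumption, not the project's actual helper.

def _find_connected_component(buckets, element_to_buckets):
    # Flood fill over the bipartite element/bucket graph: two elements end up
    # in the same component iff they are linked by a chain of shared buckets.
    unvisited = set(range(len(buckets)))
    components = []
    while unvisited:
        pending = {unvisited.pop()}
        elements = set()
        while pending:
            bucket_index = pending.pop()
            for element in buckets[bucket_index]:
                elements.add(element)
                for linked in element_to_buckets[element]:
                    if linked in unvisited:
                        unvisited.remove(linked)
                        pending.add(linked)
        components.append(elements)
    return components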
Example #3
def query(args):
    log = logging.getLogger("query")
    session = get_db(args)
    tables = args.tables
    if args.id:
        rows = session.execute(
            "SELECT hashtable, value FROM %s WHERE sha1='%s'" %
            (tables["hashtables2"], args.id))
        bands = [(r.hashtable, r.value) for r in rows]
    else:
        # File query mode (args.file): hash the input on the fly.
        if not args.feature:
            log.critical(
                "-f / --feature must be specified at least once in file query mode"
            )
            return 1
        if not args.params:
            log.critical("-p / --params must be specified in file query mode")
            return 1
        wmh, bag = hash_file(args)
        htnum, band_size = calc_hashtable_params(args.threshold, len(wmh),
                                                 args.false_positive_weight,
                                                 args.false_negative_weight)
        log.info("Number of hash tables: %d", htnum)
        log.info("Band size: %d", band_size)
        bands = [(i, bytearray(wmh[i * band_size:(i + 1) * band_size].data))
                 for i in range(htnum)]
    similar = set()
    log.info("Looking for similar items")
    for i, band in bands:
        rows = session.execute(
            "SELECT sha1 FROM %s WHERE hashtable=%d AND value=0x%s" %
            (tables["hashtables"], i, codecs.encode(band, "hex").decode()))
        similar.update(r.sha1 for r in rows)
    log.info("Fetched %d items", len(similar))
    if args.precise:
        # Precise mode: re-check the candidates with the exact weighted Jaccard.
        vocab = OrderedDocumentFrequencies().load(args.docfreq)
        log.info("Calculating the precise result")
        if args.id:
            rows = session.execute(
                "SELECT item, value FROM %s WHERE sha1='%s'" %
                (tables["bags"], args.id))
            bag = numpy.zeros(len(vocab), dtype=numpy.float32)
            for row in rows:
                bag[vocab.order[row.item]] = row.value
        # Fetch other bags from the DB
        precise = []
        for x in similar:
            rows = session.execute(
                "SELECT item, value FROM %s WHERE sha1='%s'" %
                (tables["bags"], x))
            other_bag = numpy.zeros(len(vocab), dtype=numpy.float32)
            for row in rows:
                other_bag[vocab.order[row.item]] = row.value
            if weighted_jaccard(bag, other_bag) >= args.threshold:
                precise.append(x)
        log.info("Survived: %.2f", len(precise) / len(similar))
        similar = precise
    if args.id:
        try:
            similar.remove(args.id)
        except (KeyError, ValueError):
            # `similar` is a set or, after the precise pass, a list;
            # the queried id itself may be missing from the candidates.
            pass

    similar = [s.split("@")[1] for s in similar]
    stream_template(args.template,
                    sys.stdout,
                    size=len(similar),
                    origin=args.id if args.id else os.path.abspath(args.file),
                    items=BatchedHashResolver(similar, args.batch, session,
                                              tables["meta"]))