Example #1
def deduplicate_other_old(file_prefix, input_dir, output_dir, threshold,
                          permutations):
    """
    Removes all documents from a set of minhashed documents (3 files with the
    same minhash prefix) that occur in other batches in input_dir. Only
    batches whose number is higher than the batch in question are considered
    (i.e. upper triangular matrix).

    Warning: only works for full documents at this point!
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=permutations)
    file_base = op.basename(file_prefix)
    logging.info('Processing batch {}...'.format(file_base))

    # First, load the (already deduplicated) batch...
    for input_file, results in read_batch(file_prefix):
        for doc_id, minhash in zip(results['id'], results['minhash']):
            lsh.insert('\t'.join(doc_id), minhash)

    initial_len = len(lsh.keys)
    to_match_with = find_all_batches(input_dir,
                                     int(file_prefix.rpartition(os.sep)[-1]))

    # Now, remove all documents in it that are contained in other batches
    # to the "right" of it (with greater batch numbers)
    for batch in to_match_with:
        initial_batch_len = len(lsh.keys)
        for _, results in read_batch(batch):
            for i, minhash in enumerate(results['minhash']):
                for duplicate in lsh.query(minhash):
                    lsh.remove(duplicate)
        logging.info(
            'Cross-deduplicated batch {} with batch {}: {} -> {} documents.'.
            format(file_base, op.basename(batch), initial_batch_len,
                   len(lsh.keys)))

    # Finally, we write out the documents that are left. Unfortunately, in
    # order to keep the format, we have to read the original batch again.
    with closing(
            BatchWriter(sys.maxsize, output_dir, len(file_base),
                        int(file_base))) as bw:
        # OK, we need to re-read the batch unfortunately
        for input_file, results in read_batch(file_prefix):
            doc_ids, minhashes = [], []
            for doc_id, minhash in zip(results['id'], results['minhash']):
                if '\t'.join(doc_id) in lsh:
                    doc_ids.append(doc_id)
                    minhashes.append(minhash)
            bw.write_results(input_file, {'id': doc_ids, 'minhash': minhashes})
    logging.info('Processed batch {}; kept {} out of {} documents.'.format(
        file_base, len(lsh.keys), initial_len))
    return len(lsh.keys), initial_len
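The function above boils down to one datasketch pattern: index the main batch's documents by key, query the index with every minhash from the other batches, and remove whatever matches. A minimal self-contained sketch of that pattern, using two hypothetical in-memory batches instead of the read_batch()/BatchWriter plumbing:

from datasketch import MinHash, MinHashLSH

def _minhash(tokens, num_perm=128):
    """Build a MinHash from an iterable of string tokens."""
    m = MinHash(num_perm=num_perm)
    for t in tokens:
        m.update(t.encode('utf8'))
    return m

# Hypothetical batches: {document key: MinHash}
main_batch = {'doc1': _minhash('a b c d e'.split()),
              'doc2': _minhash('f g h i j'.split())}
other_batch = {'doc9': _minhash('a b c d e'.split())}  # same content as doc1

lsh = MinHashLSH(threshold=0.9, num_perm=128)
for key, mh in main_batch.items():
    lsh.insert(key, mh)

# Drop every main-batch document that also occurs in the other batch
for mh in other_batch.values():
    for duplicate in lsh.query(mh):
        lsh.remove(duplicate)

print('doc1' in lsh, 'doc2' in lsh)   # doc1 removed, doc2 kept
print(len(lsh.keys))                  # 1 document left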
def learn_duplicates(name, f, verbose=False):
    print(name)
    logging.basicConfig(level=logging.DEBUG)
    texts_sample = [
        item['extracted_text'] for item in item_reader(f, name, limit=300)]
    dupe_predictor = DupePredictor(texts_sample)

    lsh = MinHashLSH(threshold=0.9, num_perm=128)  # separate from dupe_predictor
    too_common_shingles = dupe_predictor.too_common_shingles
    threshold = 0.98
    y_pred, y_true = [], []
    def _report_pr():
        tp = sum(p > threshold and d for p, d in zip(y_pred, y_true))
        fp = sum(p > threshold and not d for p, d in zip(y_pred, y_true))
        fn = sum(p < threshold and d for p, d in zip(y_pred, y_true))
        n_dup = tp + fn
        print('precision: %.3f, recall %.3f at %.2f threshold '
                '(%d duplicates)' % (
            tp / (tp + fp) if tp else 0.,
            tp / n_dup if n_dup else 0., threshold, n_dup))
    for i, item in enumerate(item_reader(f, name)):
        dupe_prob = dupe_predictor.get_dupe_prob(item['url'])
        y_pred.append(dupe_prob)
        min_hash = get_min_hash(item['extracted_text'], too_common_shingles)
        if dupe_prob < threshold:
            duplicates = [url for url, _ in dupe_predictor.update_model(
                item['url'], item['extracted_text'])]
        else:
            # We think this is a duplicate: replicate crawling
            # and do not update the model.
            duplicates = list(lsh.query(min_hash))
        key = canonicalize_url(item['url'])
        if key in lsh:
            lsh.remove(key)
        lsh.insert(key, min_hash)
        y_true.append(bool(duplicates))
        if verbose:
            if duplicates and dupe_prob < threshold:
                path = _full_path(item['url'])
                sample = [url for url in duplicates
                          if _full_path(url) == path] or duplicates
                print('false negative %s (%s, %d more)' % (
                    item['url'], sample[0], len(sample) - 1))
            elif not duplicates and dupe_prob > threshold:
                print('false positive', item['url'])
        if i % 100 == 0:
            _report_pr()
    _report_pr()
class DuplicateChecker:
    def __init__(self):
        self.minhashes = {}
        self.lsh = MinHashLSH(threshold=THRESHOLD)

    def create_minhashes_reading_articles(self, start_date, end_date):
        """Fills the minhashes dict with the files paths as the keys and the minhashes from the articles bodies as
         the values"""
        for category in read_categories_from_file():
            for date_between in get_dates_between(start_date, end_date):
                try:
                    date_between = date_between.strftime('%Y/%m/%d')
                    current_dir_path = f'{DUMP_DIR}/{category}/{date_between}'
                    for filename in os.listdir(current_dir_path):
                        self._create_minhash_from_file(current_dir_path,
                                                       filename)
                except FileNotFoundError:
                    pass

    def _create_minhash_from_file(self, current_dir_path, filename):
        file_path = f'{current_dir_path}/{filename}'
        with open(file_path) as f:
            article = Article(**json.load(f))
            if not article.body:
                os.remove(file_path)
                return

            minhash = MinHash()
            for word in article.body.split(' '):
                minhash.update(word.encode('utf8'))
            lean_minhash = LeanMinHash(minhash)
            self.minhashes[file_path] = lean_minhash
            self.lsh.insert(file_path, lean_minhash)

    def find_similar_articles(self):
        """Finds every similar article from the LSH index, and removes it from the index itself as well as the file from
        the disk"""
        for path, minhash in self.minhashes.items():
            # The LSH will find at least the path itself, so we need to filter it
            for similar_article_path in [
                    x for x in self.lsh.query(minhash) if x != path
            ]:
                print(
                    f'\tremoving similar article from {similar_article_path}')
                self.lsh.remove(similar_article_path)
                with contextlib.suppress(FileNotFoundError):
                    os.remove(similar_article_path)
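The class above stores LeanMinHash objects rather than full MinHash objects, which keeps the in-memory minhashes dict small while remaining queryable through the LSH index. A stand-alone sketch of that conversion, with a placeholder article body and file path:

from datasketch import MinHash, MinHashLSH, LeanMinHash

lsh = MinHashLSH(threshold=0.9, num_perm=128)
minhashes = {}

body = 'example article body text'                    # placeholder content
file_path = 'dump/category/2021/01/01/article.json'   # placeholder path

mh = MinHash(num_perm=128)
for word in body.split(' '):
    mh.update(word.encode('utf8'))
lean = LeanMinHash(mh)   # frozen, memory-cheap copy of the MinHash state

minhashes[file_path] = lean
lsh.insert(file_path, lean)
print(lsh.query(lean))   # a query finds at least the key itself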
def analyze_file(name, f, verbose=False):
    urls = []
    Doc = namedtuple('Doc', ['item', 'min_hash'])
    documents = {} # key -> Doc
    lsh = MinHashLSH(threshold=0.9, num_perm=128)
    too_common = get_too_common_shingles(f, name, limit=300)
    for i, item in enumerate(item_reader(f, name)):
        urls.append(item['url'])
        min_hash = get_min_hash(item['extracted_text'], too_common)
        key = 'item_{}'.format(i)
        item = {'url': item['url']}
        documents[key] = Doc(item, min_hash)
        if key in lsh:
            lsh.remove(key)
        lsh.insert(key, min_hash)
    paths = [''.join([p.netloc, p.path]) for p in map(urlsplit, urls)]
    duplicates = get_duplicates(lsh, documents, verbose=verbose)
    print(name.ljust(40), '\t'.join(map(str, [
        len(urls), len(set(urls)), len(set(paths)),
        n_unique(documents, duplicates),
        ])))
Example #5
class DupePredictor(object):
    """ Learn to predict if the content is duplicate by the URL.
    """
    def __init__(self, texts_sample=None, jaccard_threshold=0.9, num_perm=128):
        """ Initialize DupePredictor.
        :param jaccard_threshold: a minimal jaccard similarity when pages
        are considered duplicates (intersection of content / union of content).
        :param texts_sample: a list of texts to calculate too_common_shingles
        - this allows a more precise duplicate detection, because now
        we know which parts are common to all pages, and which are unique
        for each page.
        """
        self.jaccard_threshold = jaccard_threshold
        self.num_perm = num_perm
        self.lsh = MinHashLSH(
            threshold=self.jaccard_threshold, num_perm=self.num_perm)
        self.too_common_shingles = set()
        if texts_sample:
            self.too_common_shingles = get_too_common_shingles(texts_sample)

        self.seen_urls = {}  # url: URLMeta
        self.urls_by_path = defaultdict(set)  # path: {url}
        self.urls_by_path_q = defaultdict(set)  # (path, q): {url}
        self.urls_by_path_qwp = defaultdict(set)  # (path, param, q): {url}
        self.params_by_path = defaultdict(set)  # path: {param}
        self.param_values = defaultdict(set)  # (path, param): {value}

        # Duplicate hypotheses:
        # (1) All items with same path are duplicates. Key is (path,)
        self.path_dupstats = defaultdict(DupStat)
        # (2) All items with same path that differ only in given param are
        # duplicates. Key is (param,)
        self.param_dupstats = defaultdict(DupStat)
        # (3) Same but conditioned by path, key is (path, param)
        self.path_param_dupstats = defaultdict(DupStat)
        # (4) Same but conditioned by path + the rest of the query
        # Key is (path, query, param)
        self.path_query_param_dupstats = defaultdict(DupStat)
        # (5) All items with same path with only added param=value are duplicates
        # Key is (param, value)
        self.param_value_dupstats = defaultdict(DupStat)
        # (6) Same but conditioned by path, key is (path, param, value)
        self.path_param_value_dupstats = defaultdict(DupStat)
        # TODO - more powerful hypotheses:
        # - param + value without path
        # - more than one get param

    def get_dupe_prob(self, url):
        """ A probability of given url being a duplicate of some content
        that has already been seem.
        """
        path, query = _parse_url(url)
        dupestats = []
        extend_ds = lambda x: dupestats.extend([_f for _f in (
            ds_dict.get(key) for ds_dict, key in x) if _f])
        if self.urls_by_path.get(path):
            extend_ds([(self.path_dupstats, path)])
        # If param is in the query
        for param, value in list(query.items()):
            qwp_key = _q_key(_without_key(query, param))
            # Have we seen the query with param changed or removed?
            has_changed = self.urls_by_path_qwp.get((path, param, qwp_key))
            has_removed = self.urls_by_path_q.get((path, qwp_key))
            if has_changed or has_removed:
                extend_ds(self._param_dupstats(path, param, qwp_key))
            if has_removed:
                extend_ds(self._param_value_dupstats(path, param, value))
        # If param is not in the query, but we've crawled a page when it is
        q_key = _q_key(query)
        for param in (self.params_by_path.get(path, set()) - set(query)):
            if self.urls_by_path_qwp.get((path, param, q_key)):
                extend_ds(self._param_dupstats(path, param, q_key))
                # FIXME - this could be a long list of param values,
                # it's better to somehow store only high-probability values?
                for value in self.param_values.get((path, param), set()):
                    extend_ds(self._param_value_dupstats(path, param, value))
        return max(ds.get_prob() for ds in dupestats) if dupestats else 0.

    def update_model(self, url, text):
        """ Update prediction model with a page by given url and text content.
        Return a list of item duplicates (for testing purposes).
        """
        min_hash = get_min_hash(text, self.too_common_shingles, self.num_perm)
        item_url = canonicalize_url(url)
        item_path, item_query = _parse_url(item_url)
        all_duplicates = [
            (url, self.seen_urls[url]) for url in self.lsh.query(min_hash)]
        duplicates = [(url, m.query) for url, m in all_duplicates
                      if m.path == item_path]
        # Hypothesis (1) - just paths
        n_path_nodup = self._nodup_filter(min_hash, (
            self.urls_by_path.get(item_path, set())
            .difference(url for url, _ in duplicates)))
        self.path_dupstats[item_path].update(len(duplicates), n_path_nodup)
        # Other hypotheses, if param is in the query
        for param, value in list(item_query.items()):
            self._update_with_param(
                duplicates, min_hash, item_path, item_query, param, [value])
        # Other hypotheses, if param is not in the query
        for param in (
                self.params_by_path.get(item_path, set()) - set(item_query)):
            self._update_with_param(
                duplicates, min_hash, item_path, item_query, param,
                self.param_values.get((item_path, param), set()))
        # Update indexes
        for param, value in list(item_query.items()):
            self.urls_by_path_q[item_path, _q_key(item_query)].add(item_url)
            item_qwp_key = _q_key(_without_key(item_query, param))
            self.urls_by_path_qwp[item_path, param, item_qwp_key].add(item_url)
            self.params_by_path[item_path].add(param)
            self.param_values[item_path, param].add(value)
        if not item_query:
            self.urls_by_path_q[item_path, ()].add(item_url)
        self.urls_by_path[item_path].add(item_url)
        if item_url in self.lsh:
            self.lsh.remove(item_url)
        self.lsh.insert(item_url, min_hash)
        self.seen_urls[item_url] = URLMeta(item_path, item_query, min_hash)
        if len(self.seen_urls) % 100 == 0:
            self.log_dupstats()
        return all_duplicates

    def _update_with_param(self, duplicates, min_hash, item_path, item_query,
                           param, values):
        # qwp = "query without param"
        item_qwp = _without_key(item_query, param)
        item_qwp_key = _q_key(item_qwp)

        q_dup = {url for url, q in duplicates
                 if _without_key(q, param) == item_qwp}
        n_q_nodup = self._nodup_filter(min_hash, (
            self.urls_by_path_qwp.get((item_path, param, item_qwp_key), set())
            .union(self.urls_by_path_q.get((item_path, item_qwp_key), set()))
            .difference(q_dup)))
        if q_dup or n_q_nodup:
            for ds_dict, key in self._param_dupstats(
                    item_path, param, item_qwp_key):
                ds_dict[key].update(len(q_dup), n_q_nodup)
        if values:
            if param in item_query:
                qv_dup = {url for url, q in duplicates if q == item_qwp}
                n_qv_nodup = self._nodup_filter(min_hash, (
                    self.urls_by_path_q.get((item_path, item_qwp_key), set())
                    .difference(qv_dup)))
            # FIXME - this could be a long list of param values,
            # it's better to somehow store only high-probability values?
            for value in values:
                if param not in item_query:
                    qv_dup = {url for url, q in duplicates
                        if q.get(param) == value and
                        _without_key(q, param) == item_qwp}
                    qap_key = _q_key(_with_key_val(item_query, param, value))
                    n_qv_nodup = self._nodup_filter(min_hash, (
                        self.urls_by_path_q.get((item_path, qap_key), set())
                        .difference(qv_dup)))
                if qv_dup or n_qv_nodup:
                    for ds_dict, key in self._param_value_dupstats(
                            item_path, param, value):
                        ds_dict[key].update(len(qv_dup), n_qv_nodup)

    def _param_dupstats(self, path, param, qwp_key):
        return [
            (self.param_dupstats, param),
            (self.path_param_dupstats, (path, param)),
            (self.path_query_param_dupstats, (path, param, qwp_key)),
            ]

    def _param_value_dupstats(self, path, param, value):
        return [
            (self.param_value_dupstats, (param, value)),
            (self.path_param_value_dupstats, (path, param, value)),
            ]

    def _nodup_filter(self, min_hash, all_urls, max_sample=200):
        """ This filters results that are considered not duplicates.
        But we really need to check that, because lsh.query does not always
        return ALL duplicates, esp. when there are a lot of them, so
        here we double-check and return only urls that are NOT duplicates.
        Return estimated number of not duplicates.
        """
        if not all_urls:
            return 0
        # random.sample requires a sequence (sets are rejected in Python 3.11+)
        urls = (random.sample(list(all_urls), max_sample)
                if len(all_urls) > max_sample else all_urls)
        filtered = [
            url for url in urls
            if min_hash.jaccard(self.seen_urls[url].min_hash) <
            self.jaccard_threshold]
        return int(len(filtered) / len(urls) * len(all_urls))

    def log_dupstats(self, min_dup=100):
        for ds, name in [
                (self.path_dupstats, 'Path dupstats'),
                (self.param_dupstats, 'Param dupstats'),
                (self.path_param_dupstats, 'Path-param dupstats'),
                (self.path_query_param_dupstats, 'Path-query-param dupstats'),
                (self.param_value_dupstats, 'Param-value dupstats'),
                (self.path_param_value_dupstats, 'Path-param-value dupstats'),
                ]:
            _log_dupstats(ds, name, min_dup=min_dup)
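A hedged usage sketch of DupePredictor, mirroring the learn_duplicates() helper earlier on this page; the item stream with 'url' and 'extracted_text' fields, the sample size and the probability threshold are assumptions, not part of the class itself:

def flag_probable_duplicates(items, sample_size=300, threshold=0.98):
    # items: hypothetical list of dicts with 'url' and 'extracted_text' keys
    predictor = DupePredictor(
        texts_sample=[it['extracted_text'] for it in items[:sample_size]])
    flagged = []
    for it in items:
        if predictor.get_dupe_prob(it['url']) > threshold:
            flagged.append(it['url'])   # likely a duplicate: could be skipped
        else:
            predictor.update_model(it['url'], it['extracted_text'])
    return flagged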
# Minimal datasketch usage example. data1, data2 and data3 are assumed to be
# lists of string tokens; their definitions are not part of this snippet.
from datasketch import MinHash, MinHashLSH
import numpy as np

m1 = MinHash(num_perm=128)
m2 = MinHash(num_perm=128)
m3 = MinHash(num_perm=128)
for d in data1:
    m1.update(d.encode('utf8'))
for d in data2:
    m2.update(d.encode('utf8'))
for d in data3:
    m3.update(d.encode('utf8'))
print(m1.hashvalues)
print(m2.hashvalues)
print(m3.hashvalues)
print(np.shape(m1.hashvalues))
# Create a MinHashLSH index optimized for Jaccard threshold 0.5
# that accepts MinHash objects with 128 permutation functions
lsh = MinHashLSH(threshold=0.5, num_perm=128)

# Insert m2 and m3 into the index
lsh.insert("m2", m2)
lsh.insert("m3", m3)

# Check for membership using the key
print("m2" in lsh)
print("m3" in lsh)

# Using m1 as the query, retrieve the keys of the qualifying datasets
result = lsh.query(m1)
print("Candidates with Jaccard similarity > 0.5", result)

# Remove key from lsh
lsh.remove("m2")
Example #7
    def summarize(self, dataset):
        lilm = self.sm
        N = lilm.shape[0]
        M = (lilm.nnz + np.sum(lilm.diagonal() != 0)) // 2
        nnz = M

        degs = np.array(lilm.sum(axis=1))
        degs = np.squeeze(degs).tolist()
        pt = PriorityTree(((d, i) for i, d in enumerate(degs)))
        sizes = defaultdict(lambda: 1)
        nodes = {}
        for n in range(N):
            nodes[n] = {n}

        deg_len = sum(LN(d) for d in degs)
        length = LN(N)
        length += deg_len
        length += N * LN(1)
        length += c_MDL.LnU(N * (N + 1) // 2, M)
        length += M * LN(1)
        logger.info(f"Init length: {length}")

        # Initialize MinHash LSH
        lsh = MinHashLSH(threshold=0.4, num_perm=64)
        minhashes = [None] * N
        for n in range(N):
            m = MinHash(num_perm=64, hashfunc=hash, seed=1024)
            neighbors = ssp.find(lilm[n])[1]
            for nei in neighbors:
                m.update(nei)
            minhashes[n] = m
            lsh.insert(str(n), m)

        start_time = time.time()
        cnt = N
        end = False
        non_gain = 0

        while not end:
            logger.debug(f"Iteration {N-cnt}")

            while True:
                du, u = pt.pop()
                if du == math.inf:
                    logger.debug(f"Choose node {u} and break")
                    end = True
                    break
                if du != degs[u]:
                    logger.warning(f"Degree not match! {du} != {degs[u]}")
                candidates = lsh.query(minhashes[u])
                candidates = candidates[:min(self.C, len(candidates))]

                if len(candidates) == 1:
                    non_gain += 1
                    if non_gain >= cnt:
                        end = True
                        break
                    logger.debug(f"No candidate for node {u}({du})")
                    continue
                if not end:
                    logger.debug(
                        f"Choose node: {u} with degree {du}, {len(candidates)} candidates."
                    )
                break
            if end:
                break

            neiu = set(ssp.find(lilm[u])[1])
            max_gain, v = 0, -1

            len_matrix = c_MDL.LnU(cnt * (cnt + 1) // 2, nnz)
            g_new_nnz = 0
            neiv = set()
            for c in candidates:
                c = int(c)
                if c == u:
                    continue
                dc = degs[c]

                gain = xlogx(du) + xlogx(dc)
                gain -= xlogx(du + dc)

                gain2 = 0
                common_nei = 0
                if lilm[u, u] != 0:
                    gain += xlogx(lilm[u, u]) / 2
                if lilm[c, c] != 0:
                    gain += xlogx(lilm[c, c]) / 2
                new_weight = lilm[u, u] + lilm[c, c] + 2 * lilm[u, c]
                if new_weight != 0:
                    gain -= xlogx(new_weight) / 2
                gain2 += LN(lilm[u, u]) + LN(lilm[c, c]) - LN(new_weight)

                neic = set(ssp.find(lilm[c])[1])
                for nei in neiu:
                    if nei not in neic:
                        continue
                    if nei == u or nei == v:
                        continue
                    gain -= xlogx(lilm[u, nei])
                    gain -= xlogx(lilm[c, nei])
                    new_weight = lilm[u, nei] + lilm[c, nei]
                    gain += xlogx(new_weight)
                    gain2 += LN(lilm[u, nei]) + \
                        LN(lilm[c, nei]) - LN(new_weight)

                gain += gain2
                gain += LN(cnt) - LN(cnt - 1)
                size_u, size_v = sizes[u], sizes[v]
                gain += LN(size_u) + LN(size_v) - LN(size_u + size_v)
                gain += c_MDL.log_comb(size_u + size_v, size_u)
                common_nei = len(neiu & neic)
                new_nnz = nnz - common_nei
                if (u in neiu) and (c in neic) and (c not in neiu):
                    new_nnz -= 1
                gain += LN(nnz) - LN(new_nnz)
                # gain += common_nei * self.B
                gain += len_matrix
                gain -= c_MDL.LnU(cnt * (cnt - 1) // 2, new_nnz)
                if cnt * (cnt - 1) // 2 < new_nnz:
                    logger.warning(f"Wrong parameter: {cnt}, {new_nnz}")
                    end = True
                    break

                if gain > max_gain:
                    max_gain = gain
                    g_new_nnz = new_nnz
                    v = c
                    neiv = neic
            if end:
                break
            if v == -1:
                logger.debug(f"No non-negative gain for node {u}")
                non_gain += 1
                if non_gain >= cnt:
                    end = True
                continue
            non_gain = 0

            # Merge u and v
            logger.debug(f"Merge {u} and {v}, gain: {max_gain}")
            nodes[u] = nodes[u] | nodes[v]
            if v in nodes:
                del nodes[v]
            length -= max_gain
            logger.debug(f"Current length: {length}")

            # Update degree and sizes
            pt.update(u, (degs[u] + degs[v], u))
            degs[u] = degs[u] + degs[v]
            pt.update(v, (math.inf, v))
            degs[v] = 0
            sizes[u] = sizes[u] + sizes[v]
            sizes[v] = 0

            lilm[u] = lilm[u] + lilm[v]
            lilm[u, u] += lilm[u, v]
            for nei in ssp.find(lilm[u])[1]:
                if nei != u and nei != v:
                    lilm[nei, u] = lilm[u, nei]
            lilm[v] = 0
            lilm[:, v] = 0
            cnt -= 1
            nnz = g_new_nnz
            mu = minhashes[u]
            for nei in (neiu | neiv):
                if nei not in neiu:
                    mu.update(nei)
                if nei == u or nei == v:
                    continue
                # Leave v in nei's MinHash
                if nei not in neiu:
                    m = minhashes[nei]
                    m.update(u)
                    if str(nei) in lsh:
                        lsh.remove(str(nei))
                    lsh.insert(str(nei), m)
                    minhashes[nei] = m

            if str(u) in lsh:
                lsh.remove(str(u))
            if str(v) in lsh:
                lsh.remove(str(v))
            lsh.insert(str(u), mu)
            minhashes[u] = mu
            minhashes[v] = None

        elapsed = time.time() - start_time
        logger.info(
            f"Summarize {N} nodes to {cnt} nodes, costs {elapsed} seconds, final length: {length}"
        )

        return lilm, nodes
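In the method above, LSH is only used to propose merge candidates: each node's MinHash is built from its neighbor set, so nodes whose neighborhoods overlap heavily land in the same buckets. A stripped-down sketch of just that indexing step, over a small hypothetical adjacency list:

from datasketch import MinHash, MinHashLSH

# Hypothetical undirected graph as an adjacency list
adj = {
    0: [1, 2, 3, 4, 5],
    1: [0, 2, 3, 4, 5],
    2: [0, 1],
    3: [0, 1],
    4: [0, 1],
    5: [0, 1],
}

lsh = MinHashLSH(threshold=0.4, num_perm=64)
minhashes = {}
for node, neighbors in adj.items():
    m = MinHash(num_perm=64, hashfunc=hash, seed=1024)  # hashfunc=hash lets us feed ints
    for nei in neighbors:
        m.update(nei)
    minhashes[node] = m
    lsh.insert(str(node), m)

# Nodes 0 and 1 share most of their neighbors, so each is likely to
# propose the other as a merge candidate
print(lsh.query(minhashes[0]))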
Example #8
    data = {jsonFile["id_str"] : [lot, lat] }
    data_dict.update(data)
    
    if counter <= upper_limit:
        counter = counter + 1
        geolocated_list.append(jsonFile["id_str"])
        
assigned_lot, assigned_lat = 0, 0
start, end = [], []
haversine_distances = list()
# each JSON document in the collection is read and tokenized
for jsonFile in LHDB.db_collection.find():
    splitted_word = jsonFile["text"].split(' ')
    m1 = MinHash(num_perm=128)  # build a fresh MinHash for every document
    for tokenized in splitted_word:
        m1.update(tokenized.encode('utf8'))
    result1 = lsh1.query(m1)
    # assign longitude and latitude values to the non-geolocated items
    for item in result1:
        if(item not in geolocated_list):
            assigned_lot = data_dict[jsonFile["id_str"]][0]
            assigned_lat = data_dict[jsonFile["id_str"]][1]
            lot = data_dict[item][0]
            lat = data_dict[item][1]
            start = (lat,lot)
            end = (assigned_lat,assigned_lot)
            distance = haversine(start, end)
            haversine_distances.append(distance)
        lsh1.remove(item)
        
print(haversine_distances)
Example #9
    def randomize_lsh_search(arr,
                             seq_all,
                             valid_events,
                             th_lsh,
                             sample_size=128,
                             beta=1):

        # initialize lsh
        # calculate occur time of every event
        vector_all = {}
        for sid in arr:
            vector_all[sid] = arr[sid]['pattern']

        m_all = {}
        for sid in vector_all:
            m_all[sid] = MinHash(num_perm=128)
            for d in vector_all[sid]:
                m_all[sid].update(str(d).encode('utf8'))

        lsh = MinHashLSH(threshold=th_lsh, num_perm=sample_size)
        for sid in m_all:
            # 'sid' is the key stored in the index; m_all[sid] is the MinHash
            lsh.insert(str(sid), m_all[sid])

        # main loop
        rs = {}
        count = 0
        # find the longest sequence
        longestSeq = 0
        id_to_merge = ""
        for userkey in arr:
            if len(arr[userkey]['pattern']) > longestSeq:
                longestSeq = len(arr[userkey]['pattern'])
                id_to_merge = userkey
        while (len(arr) > 0):

            # find the candidates ------ similar in minHash
            lsh_candidates = lsh.query(m_all[id_to_merge])
            del lsh_candidates[lsh_candidates.index(id_to_merge)]

            cost_min = 0
            cost_temp = 0
            el = {}
            el_temp = {}
            Tag = {}
            bound = {}

            if len(arr) == 1:
                rs[id_to_merge] = arr[id_to_merge]
                break

            for id_candidates in lsh_candidates:
                if arr[id_to_merge]['size'] == 1 and arr[id_candidates][
                        'size'] == 1:
                    el_temp, cost_temp, Tag[id_candidates] = merge(
                        arr[id_to_merge],
                        arr[id_candidates],
                        seq_all,
                        valid_events,
                        beta=beta)
                    bound[id_candidates] = [cost_temp, cost_temp]
                else:
                    bound[id_candidates], Tag[id_candidates] = calbound(
                        arr[id_to_merge],
                        arr[id_candidates],
                        seq_all,
                        valid_events,
                        beta=beta)
            if len(bound) > 0:
                minBound = min([bound[key][1] for key in bound])

            # find out which candidate is the best for merging
            for id_candidates in lsh_candidates:
                # prune candidates whose lower bound already exceeds the
                # smallest upper bound (unless their Tag flag is set)
                if (bound[id_candidates][0] -
                        minBound) > 0.001 and Tag[id_candidates] == False:
                    continue

                count += 1
                el_temp, cost_temp, overlapTag = merge(arr[id_to_merge],
                                                       arr[id_candidates],
                                                       seq_all,
                                                       valid_events,
                                                       beta=beta)

                if cost_temp < cost_min:
                    cost_min = cost_temp
                    el = el_temp
                    id_merged = id_candidates

            if cost_min < 0:
                del arr[id_to_merge]
                del arr[id_merged]
                del m_all[id_to_merge]
                del m_all[id_merged]
                lsh.remove(id_to_merge)
                lsh.remove(id_merged)

                arr[el['id'][0]] = el

                vector_temp = el['pattern']

                m_all[el['id'][0]] = MinHash(num_perm=128)
                for d in vector_temp:
                    m_all[el['id'][0]].update(str(d).encode('utf8'))
                lsh.insert(el['id'][0], m_all[el['id'][0]])

            else:
                rs[id_to_merge] = arr[id_to_merge]
                del arr[id_to_merge]
                del m_all[id_to_merge]
                lsh.remove(id_to_merge)

                # find the longest sequence
                longestSeq = 0
                id_to_merge = ""
                for userkey in arr:
                    if len(arr[userkey]['pattern']) > longestSeq:
                        longestSeq = len(arr[userkey]['pattern'])
                        id_to_merge = userkey

        return rs
Example #10
    shingle_set = get_k_shingles(exp, shingle_size)
    mh = MinHash()  # create MinHash for exp
    for s in shingle_set:
        mh.update(s.encode('utf8'))  # convert shingle s into MinHash
    minhash_dict[idx] = LeanMinHash(mh)
print(now_time() + 'Created Minhash')
del sentences  # to save memory


for sim_threshold in sim_thresholds:  # the MinHashes are built once, even when testing multiple similarity thresholds
    lsh = MinHashLSH(threshold=sim_threshold)  # create LSH index
    for idx, mh in minhash_dict.items():
        lsh.insert(str(idx), mh)
    print(now_time() + 'Created LSH for similarity {}'.format(sim_threshold))

    queried_ids = set()  # way more efficient than list
    exp_id_groups = []
    for idx, mh in minhash_dict.items():
        if idx in queried_ids:
            continue
        one_group_ids_str = lsh.query(mh)  # id list of one group of duplicate sentences
        for i in one_group_ids_str:
            lsh.remove(i)  # for efficiency
        one_group_ids_int = [int(i) for i in one_group_ids_str]
        if len(one_group_ids_int) > group_size:
            exp_id_groups.append(one_group_ids_int)  # only keep a group with enough sentences
        for i in one_group_ids_int:
            queried_ids.add(i)
    pickle.dump(exp_id_groups, open(directory + 'groups{}.pickle'.format(sim_threshold), 'wb'))
    print(now_time() + 'Saved a file for similarity {}'.format(sim_threshold))
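The query-then-remove loop above is a common way to turn an LSH index into groups of near-duplicates: each query pulls out one cluster, and removing its members keeps them from being reported again (and speeds up later queries). A compact, self-contained sketch of that idiom with three in-line sentences instead of the pickled corpus:

from datasketch import MinHash, MinHashLSH

def _minhash(text, num_perm=128):
    m = MinHash(num_perm=num_perm)
    for token in text.split():
        m.update(token.encode('utf8'))
    return m

sentences = {
    0: 'great camera and long battery life',
    1: 'great camera and long battery life !',
    2: 'terrible customer service experience',
}

lsh = MinHashLSH(threshold=0.5, num_perm=128)
for idx, text in sentences.items():
    lsh.insert(str(idx), _minhash(text))

groups = []
for idx, text in sentences.items():
    if str(idx) not in lsh:            # already consumed by an earlier group
        continue
    group = lsh.query(_minhash(text))
    for key in group:
        lsh.remove(key)                # report every member only once
    groups.append(sorted(int(k) for k in group))
print(groups)                          # e.g. [[0, 1], [2]]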
Example #11
class FrequentCollector:
    """
    Parts of the frequent paragraph collection algorithm in
    :func:`collect_frequent` have been moved here to make the code more
    readable.
    """
    # The default (dummy) bootstrap tuple used when there is no bootstrap data
    BOOTSTRAP_TUPLE = (None, 0, [])

    def __init__(self,
                 threshold: float,
                 permutations: int,
                 decay: float,
                 min_freq: int,
                 bootstrap: Union[RandomPDataReader, None] = None,
                 decay_filter: str = 'score < 0.5',
                 wrap_filter: str = 'count >= min_freq'):
        self.threshold = threshold
        self.permutations = permutations
        self.decay = decay
        self.min_freq = min_freq
        self.bootstrap = bootstrap or {}
        self.decay_filter = Filter(decay_filter)
        self.wrap_filter = Filter(wrap_filter)
        logging.debug('Decay filter: {}'.format(decay_filter))
        logging.debug('Wrap filter: {}'.format(wrap_filter))

    def reset(self, domain):
        """Resets the bookkeeping and statistics objects."""
        self.lsh = MinHashLSH(threshold=self.threshold,
                              num_perm=self.permutations)
        self.freq_ps = {}  # type: Dict[str, PData]
        self.num_dup = 0
        # Bootstrap the domain frequency counts if previous data is available
        _, docs, pdatas = self.bootstrap.get(domain, self.BOOTSTRAP_TUPLE)
        self.stats = CollectStats(domains=1, docs=docs)
        for pdata_id, pdata in enumerate(pdatas, start=1):
            self.lsh.insert(str(pdata_id), pdata.minhash)
            self.freq_ps[str(pdata_id)] = pdata

    def collect_from_doc(self, url: str, paragraphs: List[Any]):
        """
        Runs the algorithm in MMDS (TODO) on a document, does the bookkeeping
        and updates the statistics in the object.

        :param url: the URL of the document (used as key in LSH).
        :param paragraphs: the minhashes of the paragraphs of the document.
        """
        # Step 1: decrease score of all paragraphs
        for pdata in self.freq_ps.values():
            pdata *= self.decay

        # Step 2: add new paragraphs to the roster
        already_increased = set()  # type: Set[str]
        for p, mh in enumerate(paragraphs, start=1):
            found_dup = False
            for duplicate in self.lsh.query(mh):
                # Ensure that the paragraph counter is increased by
                # at most one per document
                if duplicate not in already_increased:
                    self.freq_ps[duplicate] += 1
                    already_increased.add(duplicate)
                    if not found_dup:
                        found_dup = True
                        self.num_dup += 1
            if not found_dup:
                # OK, this is a new paragraph
                key = url + '_' + str(p)
                self.lsh.insert(key, mh)
                self.freq_ps[key] = PData(mh)
                already_increased.add(key)
        self.stats.docs += 1
        self.stats.ps += p

        # Step 3: drop paragraphs with low score
        to_drop = [
            key for key, pdata in self.freq_ps.items()
            if self.decay_filter(score=pdata.score, count=pdata.count)
        ]
        for key in to_drop:
            self.freq_ps.pop(key)
            self.lsh.remove(key)

    def wrap_up_domain(self):
        """
        Drops all frequent candidates that are below the minimum frequency and
        updates the statistics.
        """
        # Get rid of paragraphs that occurred too rarely (below the wrap filter)
        self.freq_ps = {
            key: pdata
            for key, pdata in self.freq_ps.items()
            if self.wrap_filter(score=pdata.score,
                                count=pdata.count,
                                min_freq=self.min_freq,
                                docs=self.stats.docs)
        }
        self.stats.frequents = len(self.freq_ps)
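Stripped of the PData/Filter machinery (which is not shown on this page), the collector keeps a decaying score per frequent-paragraph candidate: every document first multiplies all scores by decay, LSH hits then bump the matching candidates, and anything whose score falls below a cutoff is evicted from both the dict and the index. A toy sketch of that bookkeeping under those assumptions:

from datasketch import MinHash, MinHashLSH

def paragraph_minhash(text, num_perm=256):
    m = MinHash(num_perm=num_perm)
    for token in text.split():
        m.update(token.encode('utf8'))
    return m

lsh = MinHashLSH(threshold=0.9, num_perm=256)
scores = {}                  # key -> (score, count)
DECAY, MIN_SCORE = 0.5, 0.5

def collect(doc_key, paragraphs):
    # Step 1: decay the score of every known candidate
    for k in list(scores):
        s, c = scores[k]
        scores[k] = (s * DECAY, c)
    # Step 2: match the document's paragraphs against the candidates
    for p, text in enumerate(paragraphs, start=1):
        mh = paragraph_minhash(text)
        hits = lsh.query(mh)
        if hits:
            for key in hits:
                s, c = scores[key]
                scores[key] = (s + 1, c + 1)
        else:
            key = '{}_{}'.format(doc_key, p)
            lsh.insert(key, mh)
            scores[key] = (1.0, 1)
    # Step 3: evict candidates whose score decayed below the cutoff
    for key in [k for k, (s, _) in scores.items() if s < MIN_SCORE]:
        del scores[key]
        lsh.remove(key)

collect('doc1', ['same boilerplate footer text here', 'a unique paragraph'])
collect('doc2', ['same boilerplate footer text here', 'another unique paragraph'])
collect('doc3', ['same boilerplate footer text here'])
print(scores)   # the repeated footer keeps a high score; one-off paragraphs fade out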
Example #12
def deduplicate_other(main_batch, batches_to_subtract, output_dir, threshold,
                      permutations):
    """
    Removes all documents from a set of minhashed documents (3 files with the
    same minhash prefix) that occur in other batches. Both main_batch and
    batches_to_subtract should be batch prefixes.

    Warning: only works for full documents at this point!
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=permutations)
    main_base = op.basename(main_batch)
    logging.info('Processing input batch {}...'.format(main_base))

    # First, load the (already deduplicated) batch...
    for input_file, results in read_batch(main_batch):
        for doc_id, minhash in zip(results['id'], results['minhash']):
            lsh.insert('\t'.join(doc_id), minhash)
    initial_len = len(lsh.keys)

    # Now, remove all documents in it that are contained in the batches
    # to subtract
    content_duplicates, url_duplicates = 0, 0
    for batch in batches_to_subtract:
        batch_content_duplicates, batch_url_duplicates = 0, 0
        initial_batch_len = len(lsh.keys)
        for _, results in read_batch(batch):
            for doc_id, minhash in zip(results['id'], results['minhash']):
                key = '_'.join(doc_id)
                if key in lsh:
                    batch_url_duplicates += 1
                    lsh.remove(key)
                else:
                    for duplicate in lsh.query(minhash):
                        lsh.remove(duplicate)
                        batch_content_duplicates += 1
        logging.info(
            'Cross-deduplicated input batch {} with cross batch {}: {} -> {} '
            'documents (removed {} by url, {} by content).'.format(
                main_base, op.basename(batch), initial_batch_len,
                len(lsh.keys), batch_url_duplicates, batch_content_duplicates))
        content_duplicates += batch_content_duplicates
        url_duplicates += batch_url_duplicates

    # Finally, we write out the documents that are left. Unfortunately, in
    # order to keep the format, we have to read the original batch again.
    with closing(
            BatchWriter(sys.maxsize, output_dir, len(main_base),
                        int(main_base))) as bw:
        # OK, we need to re-read the batch unfortunately
        for input_file, results in read_batch(main_batch):
            doc_ids, minhashes = [], []
            for doc_id, minhash in zip(results['id'], results['minhash']):
                if '\t'.join(doc_id) in lsh:
                    doc_ids.append(doc_id)
                    minhashes.append(minhash)
            bw.write_results(input_file, {'id': doc_ids, 'minhash': minhashes})
    logging.info('Processed input batch {}; kept {} out of {} documents '
                 '(removed {} by url, {} by content).'.format(
                     main_base, len(lsh.keys), initial_len, url_duplicates,
                     content_duplicates))
    return len(lsh.keys), initial_len
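Compared with deduplicate_other_old() at the top of this page, the variant above first tries an exact removal by document key (URL duplicate) and only falls back to a MinHash query (content duplicate) when the key is unknown. A hedged miniature of that two-step removal, with made-up keys and token lists:

from datasketch import MinHash, MinHashLSH

def _minhash(tokens, num_perm=128):
    m = MinHash(num_perm=num_perm)
    for t in tokens:
        m.update(t.encode('utf8'))
    return m

lsh = MinHashLSH(threshold=0.9, num_perm=128)
lsh.insert('https://a.example/1', _minhash('the quick brown fox jumps'.split()))
lsh.insert('https://a.example/2', _minhash('an entirely different document'.split()))

# A document from a batch to subtract: same key as /1, so remove by url...
key, mh = 'https://a.example/1', _minhash('the quick brown fox jumps'.split())
if key in lsh:
    lsh.remove(key)              # url duplicate
else:
    # ...otherwise fall back to removing whatever the content matches
    for duplicate in lsh.query(mh):
        lsh.remove(duplicate)    # content duplicate stored under another key

print('https://a.example/1' in lsh, 'https://a.example/2' in lsh)  # False True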