Example No. 1
def extract_next_links(url, resp) -> list:
    defrag=urldefrag(url)[0]
    print(defrag)
    if resp.status == 200:
        print("Scanning")
        content = resp.raw_response.text
        if defrag not in urls:
            data=getVisibleText(content)
            simmed=Simhash(data)
            if simmed.value not in sims:
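                # rebuild the index from every previously accepted (url, simhash) pair; k=3 is the Hamming-distance tolerance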
                index=SimhashIndex(objs,k=3)
                if len(index.get_near_dups(simmed))==0:
                    urls.add(defrag)
                    sims.add(simmed.value)
                    objs.append((url,simmed))
                    print(len(urls),len(sims),len(objs))
                    with open("data_dump.txt", "a", errors="ignore") as file:
                        to_write = url + " \n " + data + "\n" + str(simmed.value) + "\n\n"
                        file.write(to_write)
            #urls[defrag].add(getVisibleText(content))
            #print(urls[defrag])
        return getAllUrls(url,content)
    else:
        print("Cant scan")
        return []
Example No. 2
 def __init__(self, vocab_to_freq, f=64, k=32):
   self.vocab_to_freq = vocab_to_freq
   self.simhash_index = SimhashIndex([], f=f, k=k)
   self.f = f
   self.k = k
   
   simhash_index = self.simhash_index
   for w in vocab_to_freq:
     sh = Simhash(w, f=f)
     simhash_index.add(w, sh)
Example No. 3
def clustering():
    fout = open('cluster.txt', 'w', encoding='UTF-8')
    cursor = conn.cursor()
    cursor.execute(
        'SELECT id, title, cluster, sim_count, link, simhash FROM entries where cluster=0'
    )
    entrylist = cursor.fetchall()
    objs = []
    entrydic = {}
    for item in entrylist:
        if not is_en(item[1]):
            if not item[4].startswith("https://weibo.com"):
                sim = Simhash(get_features_cn(item[1]))
                objs.append((str(item[0]), sim))
                entrydic[str(item[0])] = {
                    'title': item[1],
                    'cluster': 0,
                    'sim_count': 0,
                    'link': item[4],
                    'simhash': sim.value
                }
        else:
            sim = Simhash(get_features(item[1]))
            objs.append((str(item[0]), sim))
            entrydic[str(item[0])] = {
                'title': item[1],
                'cluster': 0,
                'sim_count': 0,
                'link': item[4],
                'simhash': sim.value
            }

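    # build the simhash index over the collected entries; k (tolerance) is the maximum Hamming distance for near-duplicates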
    index = SimhashIndex(objs, k=tolerance)
    cluster_num = last_cluster_num
    for key in entrydic:
        if entrydic[key]['cluster'] == 0:
            sims = index.get_near_dups(
                Simhash(get_features_cn(entrydic[key]['title'])))
            for item in sims:
                entrydic[item]['cluster'] = cluster_num
                # if len(sims) > 1:
                entrydic[item]['sim_count'] = len(sims) - 1
                if len(sims) > 1:
                    fout.write(item + '\t' + str(entrydic[item]['cluster']) +
                               '\t' + entrydic[item]['title'] + '\n')
                cursor.execute(
                    'UPDATE entries SET cluster=%s, sim_count=%s, simhash=%s where id = %s',
                    (entrydic[item]['cluster'], entrydic[item]['sim_count'],
                     str(entrydic[item]['simhash']), item))
                # conn.commit()
                # fout.write(item + '\t' + str(entrydic[item]['cluster']) + '\t' + entrydic[item]['title'] + '\t' + entrydic[item]['link'] + '\n')
            cluster_num += 1
    # cursor.execute('UPDATE somevariables SET last_cluster=%s', (cluster_num,))
    # conn.commit()
    conn.close()
Example No. 4
def main():
    # user_query = input()
    DOCID = 0


    numPartial = 1 

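    # start with an empty index; a document is added below only if it has no near-duplicate already indexed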
    index = SimhashIndex([])

    totaldocs = 0
    docnum = 0

    validDocFile = open('validDocs2', 'w')

    for root, dirs, files in os.walk(DEVPATH):
        for fname in files:
            if not fname.endswith(".json"):
                continue
            totaldocs += 1
            h2t = html2text.HTML2Text()

            file = open(root + "/" + fname)

            pageDict = json.loads(file.read())

            # close file to get memory back
            file.close()

            # get html formated content
            htmlContent = pageDict['content']

            print(pageDict['url'])

            plainContent = h2t.handle(htmlContent)

            feat = get_features(plainContent)

            sim = Simhash(feat)

            if len(index.get_near_dups(sim)) > 0:
                continue

            print(docnum, totaldocs)

            index.add(str(docnum), sim)

            validDocFile.write(root + "/" + fname + "\n")

            docnum+=1


    validDocFile.close()
Example No. 5
class SpellingCorrector(object):
  def __init__(self, vocab_to_freq, f=64, k=32):
    self.vocab_to_freq = vocab_to_freq
    self.simhash_index = SimhashIndex([], f=f, k=k)
    self.f = f
    self.k = k
    
    simhash_index = self.simhash_index
    for w in vocab_to_freq:
      sh = Simhash(w, f=f)
      simhash_index.add(w, sh)
  
  def add_valid_word(self, word):
    if word not in self.vocab_to_freq:
      sh = Simhash(word, self.f)
      self.simhash_index.add(word, sh)
    self.vocab_to_freq[word] = self.vocab_to_freq.get(word, 0) + 1
    
  def correct_word(self, word):
    
    if word in self.vocab_to_freq:
      return word
    
    #Edit distance between
    sh = Simhash(word, f=self.f)
    candidates = self.simhash_index.get_near_dups(sh)
    
    if not candidates:
      #No near dups. Oh well. This word will go as it is.
      print('no candidates')
      return word
    
    if len(candidates) == 1:
      #Only one candidate, so assume this is the correction
      return candidates[0]
      
    lev_dist_gen = ((other_w, levenshtein(other_w, word)) for other_w in candidates)
    closest_words, dists = zip(*all_min_or_max(lev_dist_gen, min, lambda item: item[1]))
    
    if len(closest_words) == 1:
      #One of the candidates had the best edit distance. Return that.
      return closest_words[0]
    
    #OK, there are multiple closest words. Rely on word frequency to choose the right one.
    vocab_to_freq = self.vocab_to_freq
    word_freq_gen = ((other_w, vocab_to_freq[other_w]) for other_w in closest_words)
    most_freq_words, freqs = zip(*all_min_or_max(word_freq_gen, max, lambda item: item[1]))
    
    #using choice because at this point there's no other way to narrow it down, unless we
    #track higher order ngrams.
    return choice(most_freq_words)
Example No. 6
def get_near_dups(query_simhash, candidates_simhash, k):
    res = [0] * len(candidates_simhash)
    query = Simhash(value=query_simhash)

    for i in range(len(candidates_simhash)):
        candidates_simhash[i] = (str(i), Simhash(value=candidates_simhash[i]))
    index = SimhashIndex(candidates_simhash, k=k)
    near_dups = index.get_near_dups(query)

    for dup in near_dups:
        res[int(dup)] = 1

    return res
Example No. 7
def sim_merge(finaldb_cut, simdb):
    d = {}
    index_list = []
    hashurl2sim = {}
    max_distance = 10
    with open(finaldb_cut, 'r') as f:
        for line in f:
            if not line:
                break
            # hashurl  title  author  images  links  text  pub_time
            # 1        2      3       4       5      6     7
            # jushi  shouji  zujin  dizhi  ditie  url  crawl_time  source  ext
            # 8      9       10     11     12     13   14          15      16
            array = line.rstrip('\r\n').split('\t')
            hashurl=array[0]     #string,key
            title=array[1]       #string
            text=array[5]        #string
            pub_time=array[6]    #string 
            url=array[12]        #string 

            sim = Simhash((title+text).decode('utf-8'))
            d.update({
                hashurl:(title, url, pub_time)
            })
            index_list.append((hashurl, sim))
            hashurl2sim.update({hashurl:sim})

    index = SimhashIndex(index_list, k=max_distance)
    merged = {}
    while d:
        hashurl, (title, url, pub_time) = d.popitem()
        merged[hashurl] = (title, url, pub_time)
        sim_list = index.get_near_dups(hashurl2sim[hashurl])
        buf_list = []
        for h in sim_list:
            if h != hashurl:
                if h in d:
                    title2, url2, pub_time2 = d.pop(h)
                    merged[h] = (title2, url2, pub_time2)
                else:
                    title2, url2, pub_time2 = merged[h]
            else:
                title2, url2, pub_time2 = title, url, pub_time
            buf_list.append((h, title2, url2, pub_time2))
        if len(buf_list) > 1:
            buf_list = sorted(buf_list, key=lambda i:i[3], reverse=True)
            simdb.insert('\t'.join(
                [buf_list[0][0], json.dumps(buf_list[1:])]
            ))
Example No. 8
def get_simHashindex(hash_list):
    """
    功能:创建Simhash索引
    参数:SimHash列表
    返回值:SimHash索
    """
    return SimhashIndex(hash_list, k=5)  #创建索引
Example No. 9
def simhash_clustering(
    signatures: List[int],
    hamming_distance: int = 3,
    # num_blocks: Optional[int] = 5,
) -> List[List[int]]:

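    # index each signature under its position, then query every signature to collect the indices of its near-duplicates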
    index = SimhashIndex([(i, Simhash(value=signature))
                          for i, signature in enumerate(signatures)],
                         k=hamming_distance)

    neighbors: List[List[int]] = []
    for signature in signatures:
        neighbors.append(
            list(map(int, index.get_near_dups(Simhash(value=signature)))))

    return neighbors
Example No. 10
def _build_index():
    global _INDEX
    index_list = []
    for domain in _TRAIN.keys():
        sim = Simhash(domain)
        index_list.append((domain, sim))
    _INDEX = SimhashIndex(index_list, k=100)
Example No. 11
 def setUp(self):
     data = {
         1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
         2: u'How are you i am fine. blar blar blar blar blar than',
         3: u'This is simhash test.',
     }
     objs = [(str(k), Simhash(v)) for k, v in data.items()]
     self.index = SimhashIndex(objs)
Example No. 12
    def process_graph(self, project_id):
        visits = defaultdict(list)
        p = 0
        hashtags_db = Hashtag.objects.filter(project_id=project_id)

        logger.info("Total hashtags to process " + str(len(hashtags_db)))
        for hashtag_entry in hashtags_db:
            visits[hashtag_entry.user_id].append(hashtag_entry.hashtag)
            p += 1

        logger.info("Hashtag read")
        logger.info("Hashtag processed " + str(p))
        logger.info("Visits count " + str(len(visits)))

        objs = []
        cant_users = 0
        cant_processed = 0
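        # empty index over f1-bit simhashes with tolerance k1; users with enough hashtags are added below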
        index = SimhashIndex(objs, f=f1, k=k1)
        for user, hashtags in visits.iteritems():
            if len(hashtags) > MIN_HASHTAG_PER_USER:
                simhash = Simhash(hashtags, f=f1)
                index.add(user, simhash)
                cant_processed += 1
            cant_users += 1
            if cant_users % 10000 == 0:
                logger.info("%s processed" % cant_users)

        logger.info("Simash index build for %i out of %i users" %
                    (cant_processed, len(visits)))
        cant_processed = 0
        for user, hashtags in visits.iteritems():
            near_dups = index.get_near_dups(Simhash(hashtags, f=f1))
            for user_near_dups in near_dups:
                user_near_dups = long(user_near_dups)
                if user_near_dups != long(user):
                    hashtag_near_dups = visits[user_near_dups]
                    intersect = set(hashtags).intersection(hashtag_near_dups)
                    ratio = len(intersect) * 1.0 / len(hashtag_near_dups)
                    if ratio >= 0.1:
                        hashtag_graph = HashtagGraph(user_oid_i=user,
                                                     user_oid_j=user_near_dups,
                                                     ratio=ratio)
                        hashtag_graph.save()
            cant_processed += 1
            if cant_processed % 10000 == 0:
                logger.info("%i processed" % cant_processed)
Example No. 13
def sim_merge(finaldb_cut, simdb):
    d = {}
    index_list = []
    hashurl2sim = {}
    max_distance = 10
    with open(finaldb_cut, 'r') as f:
        for line in f:
            if not line:
                break
            # hashurl  title  author  images  links  text  pub_time
            # 1        2      3       4       5      6     7
            # jushi  shouji  zujin  dizhi  ditie  url  crawl_time  source  ext
            # 8      9       10     11     12     13   14          15      16
            array = line.rstrip('\r\n').split('\t')
            hashurl = array[0]  #string,key
            title = array[1]  #string
            text = array[5]  #string
            pub_time = array[6]  #string
            url = array[12]  #string

            sim = Simhash((title + text).decode('utf-8'))
            d.update({hashurl: (title, url, pub_time)})
            index_list.append((hashurl, sim))
            hashurl2sim.update({hashurl: sim})

    index = SimhashIndex(index_list, k=max_distance)
    merged = {}
    while d:
        hashurl, (title, url, pub_time) = d.popitem()
        merged[hashurl] = (title, url, pub_time)
        sim_list = index.get_near_dups(hashurl2sim[hashurl])
        buf_list = []
        for h in sim_list:
            if h != hashurl:
                if h in d:
                    title2, url2, pub_time2 = d.pop(h)
                    merged[h] = (title2, url2, pub_time2)
                else:
                    title2, url2, pub_time2 = merged[h]
            else:
                title2, url2, pub_time2 = title, url, pub_time
            buf_list.append((h, title2, url2, pub_time2))
        if len(buf_list) > 1:
            buf_list = sorted(buf_list, key=lambda i: i[3], reverse=True)
            simdb.insert('\t'.join([buf_list[0][0], json.dumps(buf_list[1:])]))
Example No. 14
    def process_graph(self, project_id):
        visits = defaultdict(list)
        processed = 0
        urls_db = Urls.objects.filter(project_id=project_id)

        logger.info("Total urls to process " + str(len(urls_db)))
        for url_entry in urls_db:
            visits[url_entry.user_id].append(url_entry.url)
            processed += 1
        logger.info("Urls read")
        logger.info("Urls processed " + str(processed))
        logger.info("Visits count " + str(len(visits)))

        objs = []
        cant_users = 0
        cant_processed = 0
        index = SimhashIndex(objs, f=f1, k=k1)
        for user, urls in visits.iteritems():
            if len(urls) > MIN_URLS_PER_USER:
                simhash = Simhash(urls, f=f1)
                index.add(user, simhash)
                cant_processed += 1
            cant_users += 1
            if cant_users % 10000 == 0:
                logger.info("%s processed" % cant_users)

        logger.info("Simash index build for %i out of %i users" %
                    (cant_processed, len(visits)))
        cant_processed = 0
        for user, urls in visits.iteritems():
            near_dups = index.get_near_dups(Simhash(urls, f=f1))
            for user_near_dups in near_dups:
                user_near_dups = long(user_near_dups)
                if user_near_dups != long(user):
                    urls_near_dups = visits[user_near_dups]
                    intersect = set(urls).intersection(urls_near_dups)
                    ratio = len(intersect) * 1.0 / len(urls_near_dups)
                    if ratio >= 0.1:
                        url_graph = UrlsGraph(user_oid_i=user,
                                              user_oid_j=user_near_dups,
                                              ratio=ratio)
                        url_graph.save()
            cant_processed += 1
            if cant_processed % 10000 == 0:
                logger.info("%i processed" % cant_processed)
Example No. 15
def simhashsort(datadic, entryset):
    objs = [(id, Simhash(sent)) for id, sent in datadic.items()]
    index = SimhashIndex(objs, k=tolerance)  # k is the tolerance: the larger k is, the more similar texts are retrieved
    kind = 1  # cluster number
    sorted = set()
    for id in datadic:
        if str(id) in sorted:  # skip items that are already classified
            continue
        # collect the set of near-duplicates
        similiarlist = index.get_near_dups(Simhash(datadic[id]))
        similiarlist.append(str(id))
        # write the similarity-set information back into entryset
        for id in similiarlist:
            sorted.add(id)
        for entry in entryset:
            if str(entry["id"]) in similiarlist:
                entry["cluster"] = kind
        kind += 1
Example No. 16
class TestSimhashIndex(TestCase):
    def setUp(self):
        data = {
            1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
            2: u'How are you i am fine. blar blar blar blar blar than',
            3: u'This is simhash test.',
        }
        objs = [(str(k), Simhash(v)) for k, v in data.items()]
        self.index = SimhashIndex(objs)

    def test_bucket_size(self):
        self.assertEqual(self.index.bucket_size(), 6)

    def test_get_near_dup(self):
        s1 = Simhash(u'How are you i am fine. blar blar blar blar blar thank')
        dups = self.index.get_near_dups(s1)

        self.assertEqual(len(dups), 2)
Example No. 17
def save_duplicates(save_path, text2hash_dict, k=5):
    """Group similar docs' title"""
    # Construct SimhashIndex object for similar docs detection. k is tolerance.
    index = SimhashIndex(text2hash_dict, k=k)

    done = list()
    with tqdm(total=len(text2hash_dict)) as pbar:
        with open(save_path, 'w', encoding='utf8') as file:
            for i in range(len(text2hash_dict) - 1):
                # get near duplicates
                near_dups = index.get_near_dups(text2hash_dict[i][1])
                # near dups includes origin title, len > 1 requested
                if len(near_dups) > 1 and text2hash_dict[i][0] not in done:
                    for title in near_dups:
                        file.write(title)
                        file.write('\n')
                    file.write('#' * 5 + '\n')
                    done.extend(near_dups)
                pbar.update()
Example No. 18
    def process_graph(self, project_id):
        visits = defaultdict(list)
        p = 0;
        hashtags_db = Hashtag.objects.filter(project_id=project_id)

        logger.info("Total hashtags to process "+str(len(hashtags_db)))
        for hashtag_entry in hashtags_db:
            visits[hashtag_entry.user_id].append(hashtag_entry.hashtag)
            p +=1

        logger.info("Hashtag read")
        logger.info("Hashtag processed " + str(p))
        logger.info("Visits count " + str(len(visits)))

        objs = []
        cant_users = 0
        cant_processed = 0
        index = SimhashIndex(objs, f=f1, k=k1)
        for user, hashtags in visits.iteritems():
            if len(hashtags) > MIN_HASHTAG_PER_USER:
                simhash = Simhash(hashtags, f=f1)
                index.add(user, simhash)
                cant_processed += 1
            cant_users += 1
            if cant_users % 10000 == 0:
                logger.info("%s processed" % cant_users)

        logger.info("Simash index build for %i out of %i users" % (cant_processed, len(visits)))
        cant_processed = 0
        for user, hashtags in visits.iteritems():
            near_dups = index.get_near_dups(Simhash(hashtags, f=f1))
            for user_near_dups in near_dups:
                user_near_dups = long(user_near_dups)
                if user_near_dups != long(user):
                    hashtag_near_dups = visits[user_near_dups]
                    intersect = set(hashtags).intersection(hashtag_near_dups)
                    ratio = len(intersect)*1.0/len(hashtag_near_dups)
                    if ratio >= 0.1:
                        hashtag_graph = HashtagGraph(user_oid_i=user, user_oid_j=user_near_dups, ratio=ratio)
                        hashtag_graph.save()
            cant_processed += 1
            if cant_processed % 10000 == 0:
                    logger.info("%i processed" % cant_processed)
Example No. 19
 def __init__(self, hash_size=64, hash_tol=3, num_words_to_complete=10):
   """
   Params:
     hash_size : The number of output bits of the hash function used in SimHash.
                 Higher values -> able to handle more noise.
     hash_tol  : The number of bits that can differ for a candidate near-match in Simhash
     
     num_words_to_complete : The number of words to complete given a context when a new
                             document is encountered in get_best_match
   """
   
   self.num_words_to_complete = num_words_to_complete
   self.hash_size = hash_size
   self.hash_tol = hash_tol
   
   #This implementation of simhash stores the index in RAM, but it could easily be
   # put on disk.
   self.simhash_index = SimhashIndex(objs=[], f=self.hash_size, k=self.hash_tol)
   self.author_identifier = LanguageModelAuthorIdentifier()
   self.author_semantic_models = SemanticLanguageModels()
Example No. 20
def simhashSort2(datadic, entryset):
    objs = []
    for entry in datadic:
        objs.append((entry[0], Simhash(entry[1])))
    index = SimhashIndex(objs, k=tolerance)  # k is the tolerance: the larger k is, the more similar texts are retrieved
    kind = 1  # cluster number
    sorted = set()
    for item in datadic:
        if str(item[0]) in sorted:  # skip items that are already classified
            continue
        # collect the set of near-duplicates
        similiarlist = index.get_near_dups(Simhash(item[1]))
        similiarlist.append(str(item[0]))
        # write the similarity-set information back into entryset
        for id in similiarlist:
            sorted.add(id)
        for entry in entryset:
            if str(entry["id"]) in similiarlist:
                entry["sim_count"] = kind
        kind += 1
Example No. 21
def create_test_data():
    """For 1 million records, it takes 5 minutes.
    """
    complexity = 1000**2
    print("creat data ...")
    data = [rand_str(8) for i in range(complexity)]
    print("calculate simhash ...")
    objs = [(i, Simhash(item)) for i, item in enumerate(data)]
    print("creat index ...")
    index = SimhashIndex(objs, k=3)
    safe_dump_pk(data, datafile)
    safe_dump_pk(index, indexfile)
Example No. 22
File: clust.py Project: johnb30/ark
def add_to_redis(CONN, hashes):
    logger.info(type(hashes))
    objs = []
    for k, v in hashes.iteritems():
        a = Simhash('a')
        a.value = int(k)
        objs.append((v, a))
    logger.info(objs[0])
    logger.info('Number of objects: {}'.format(len(objs)))
    index = SimhashIndex(CONN, objs, k=3)

    return index
Example No. 23
def simhash_1(labels, targets, query, query_url, dataset, k=2, width=5):
    dictionary = dict(zip(labels, targets))
    objs = [(str(k), Simhash(get_features(v, width)))
            for k, v in dictionary.items()]
    index = SimhashIndex(objs, k=k)
    query_simhash = Simhash(get_features(query, width))
    near_dups = index.get_near_dups(query_simhash)

    # Save fingerprints for future use
    appendToFingerprints(
        dataset, './dataset/fingerprints.csv', {
            "query": str(query_simhash.value),
            "duplicates": ' '.join([str(obj[1].value) for obj in objs])
        })
    # print("QUERY: {}".format(query_url))
    # pp(near_dups)

    return {
        "dataset": dataset,
        "query": query_url,
        "duplicates": ' '.join(near_dups)
    }
Example No. 24
def simhash_test():
    data = {
        1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: u'How are you i am fine. blar blar blar blar blar than',
        3: u'This is simhash test.',
    }
    for k, v in data.items(): print k, get_phrases(v)
    for k, v in data.items(): print k, Simhash(get_phrases(v)).value

    objs = [(str(k), Simhash(get_phrases(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)

    print index.bucket_size()

    s1 = Simhash(get_phrases(u'How are you i am fine. blar blar blar blar blar thank'))
    print index.get_near_dups(s1)

    index.add('4', s1)
    print index.get_near_dups(s1)
Example No. 25
async def gen_simhash_index(conf):
    m = 0
    n = 0
    objs = []
    simhash_answer_index = {}
    for items in conf.DEMO_QUESTION:
        for item in items:
            objs.append((n, Simhash(await _tokenization(conf, item))))
            simhash_answer_index[n] = m
            n += 1
        m += 1

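    # k=6 tolerance; simhash_answer_index maps each indexed question id to the index of its question group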
    simhash_index = SimhashIndex(objs, k=6)
    return simhash_index, simhash_answer_index
Example No. 26
def test(n):
    import time
    import distance
    from simhash import Simhash, SimhashIndex

    WIDTH = 3

    def gg():
        import random
        from random import randint
        from simhash import Simhash, SimhashIndex
        from itertools import groupby
        # text = str(bin(randint(2**63, 2**64-1)))[2:]
        # tokens = [text[i:i + WIDTH] for i in range(max(len(text) - WIDTH + 1, 1))]
        # return text, Simhash({k: sum(1 for _ in g) for k, g in groupby(sorted(tokens))})
        text = ''.join([random.choice('0123456789abcdef') for _ in range(36)])
        return text, Simhash(text)

    hashes = [gg() for _ in range(n)]
    d1, d2 = [], []
    test_string, test_hash = gg()

    start = time.time()
    for s, h in hashes:
        d1.append([distance.hamming(test_string, s), s])
    print time.time() - start

    start = time.time()
    index = SimhashIndex(hashes, k=5)
    for st in index.get_near_dups(test_hash):
        d2.append([distance.hamming(test_string, st), st])
    print time.time() - start

    print len(d1), len(d2)

    for a, b in zip(sorted(d1)[:20], sorted(d2)):
        print a[1] == b[1], '\t', a, '\t', b
Example No. 27
def test(n):
    import time
    import distance
    from simhash import Simhash, SimhashIndex

    WIDTH = 3

    def gg():
        import random
        from random import randint
        from simhash import Simhash, SimhashIndex
        from itertools import groupby
        # text = str(bin(randint(2**63, 2**64-1)))[2:]
        # tokens = [text[i:i + WIDTH] for i in range(max(len(text) - WIDTH + 1, 1))]
        # return text, Simhash({k: sum(1 for _ in g) for k, g in groupby(sorted(tokens))})
        text = ''.join([random.choice('0123456789abcdef') for _ in range(36)])
        return text, Simhash(text)

    hashes = [gg() for _ in range(n)]
    d1, d2 = [], []
    test_string, test_hash = gg()

    start = time.time()
    for s, h in hashes:
        d1.append([distance.hamming(test_string, s), s])
    print time.time() - start

    start = time.time()
    index = SimhashIndex(hashes, k=5)
    for st in index.get_near_dups(test_hash):
        d2.append([distance.hamming(test_string, st), st])
    print time.time() - start

    print len(d1), len(d2)

    for a, b in zip(sorted(d1)[:20], sorted(d2)):
        print a[1] == b[1], '\t', a, '\t', b
Example No. 28
def find_near_matches(session, collection, index_size, probability_index_near_match):
    from simhash import Simhash, SimhashIndex
    logging.getLogger().setLevel(logging.CRITICAL)

    tweet_id_simhash_value = session.execute(
        sa.select([model.Tweet.tweet_id, model.Tweet.features['filter','simhash']])
        .where(model.Tweet.collection == collection)
    )

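    # tweets are added to the index only when they have no near match; once index_size is exceeded, the oldest entry is evicted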
    simhash_index = SimhashIndex([], k=7)

    insert_relation_stmt = pg.insert(model.relation)
    # insert_tweet_near_matches_stmt = insert_tweet_near_matches_stmt.on_conflict_do_update(
    #     index_elements=['tweet_id', 'collection'],
    #     set_={
    #         'earliest_near_match_id': insert_tweet_near_matches_stmt.excluded.earliest_near_match_id
    #     }
    # )

    indexed_tweet_ids = []

    for i, (tweet_id, simhash_value) in enumerate(tweet_id_simhash_value):

        if (i % 100000) == 1000:
            logger.info('Processed %s tweets. Committing.', i)
            session.commit()

        simhash = Simhash(simhash_value)

        near_matches_ids = simhash_index.get_near_dups(simhash)

        if not near_matches_ids:
            simhash_index.add(tweet_id, simhash)
            indexed_tweet_ids.append((tweet_id, simhash))

            if len(indexed_tweet_ids) > index_size:
                simhash_index.delete(*indexed_tweet_ids.pop(0))

        if near_matches_ids:
            near_match_id = min(near_matches_ids)

            logger.debug('A near match %s for tweet %s', near_match_id, tweet_id)
            session.execute(
                insert_relation_stmt.values(
                    [(tweet_id, collection, 'near_match', near_match_id)]
                )
            )

    session.commit()
Example No. 29
def use_simhash_index():
    data = {
        1: "How are you? I Am fine. blar blar blar blar blar Thanks.",
        2: "How are you i am fine. blar blar blar blar blar than",
        3: "This is simhash test.",
    }
    objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)

    print(index.bucket_size())

    s1 = Simhash(
        get_features(u"How are you i am fine. blar blar blar blar blar thank"))
    print(index.get_near_dups(s1))

    index.add("4", s1)
    print(index.get_near_dups(s1))
Example No. 30
def use_simhash_index():
    data = {
        1: "How are you? I Am fine. blar blar blar blar blar Thanks.",
        2: "How are you i am fine. blar blar blar blar blar than",
        3: "This is simhash test.",
    }
    objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)
    
    print(index.bucket_size())
    
    s1 = Simhash(get_features(u"How are you i am fine. blar blar blar blar blar thank"))
    print(index.get_near_dups(s1))
    
    index.add("4", s1)
    print(index.get_near_dups(s1))
Example No. 31
def console_test():
    from simhash import Simhash, SimhashIndex
    data = {
        1: 'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: 'How are you i am fine. blar blar blar blar blar than',
        3: 'This is simhash test.',
        4: 'How are you i am fine. blar blar blar blar blar thank1',
    }
    objs = [(str(k), Simhash(v)) for k, v in data.items()]
    index = SimhashIndex(objs, k=10)
    s1 = Simhash(
        u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank'
    )
    dups = index.get_near_dups(s1)
    dups = index.get_near_dups2(s1, 5)
    index.remove(s1)
Example No. 32
 def __init__(self, config, worker=None):
     self.config = config
     self.host, self.port = config.cache_server
     #self.robots = list of banned paths
     self.robots = {}
     self.simhashes = SimhashIndex([])
     self.link = 1
     self.worker = worker
     self.maxWords = (
         "", 0
     )  # maxWords[0] is the URL, maxWords[1] is the number of words in it
     self.wordCounter = Counter(
     )  # a dictionary that keeps track of the # of words
     self.stopWords = [
         '1', 'a', 'about', 'above', 'after', 'again', 'against', 'all',
         'also', 'am', 'an', 'and', 'any', 'are', 'are', "aren't", 'as',
         'at', 'b', 'be', 'because', 'been', 'before', 'being', 'below',
         'between', 'both', 'but', 'by', 'can', 'can', "can't", 'cannot',
         'could', "couldn't", 'd', 'did', "didn't", 'do', 'does', "doesn't",
         'doing', "don't", 'down', 'during', 'each', 'few', 'for', 'from',
         'further', 'had', "hadn't", 'has', 'has', "hasn't", "hasn't",
         'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her',
         'herself', 'him', 'himself', 'his', 'how', "how's", 'i', "i'd",
         "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it',
         "it's", 'its', 'itself', "let's", "ll", 'm', 'may', 'me', 'more',
         'most', "mustn't", 'my', 'myself', 'next', 'no', 'nor', 'not',
         'of', 'off', 'on', 'once', 'once', 'one', 'only', 'or', 'other',
         'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 's',
         'same', 'say', 'says', "shan't", 'she', "she'd", "she'll", "she's",
         'should', "shouldn't", 'so', 'some', 'such', 't', 'than', 'that',
         "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then',
         'there', "there's", 'these', 'they', "they'd", "they'll",
         "they're", "they've", 'this', 'those', 'through', 'to', 'too',
         'under', 'under', 'until', 'until', 'up', 've', 'very', 'was',
         "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were',
         "weren't", 'what', "what's", 'when', "when's", 'where', 'which',
         'while', 'who', "who's", 'whom', 'why', "why's", 'will', 'with',
         "won't", 'would', "wouldn't", 'x', 'y', 'you', "you'd", "you'll",
         "you're", "you've", 'your', 'yourself', 'yourselves'
     ]
Example No. 33
def main(path):
    corpuses = readFiles.normalize(path)
    results = []
    for corpus in corpuses:
        hashset = {}
        listofitems = []
        for item in corpus.keys():
            if item == 'desc': continue
            z = Simhash(corpus[item])
            hashset[item] = z
            listofitems += [(item, z)]

        l = SimhashIndex(listofitems)
        #print(l.get_near_dups(hashset['../corpus/bbc/tech1/001.txt']))
        hashlist = {}
        for i, item1 in enumerate(hashset.keys()):
            hashlist[item1] = []
            for j, item2 in enumerate(hashset.keys()):
                if j < i:
                    hashlist[item1] += [' ']
                    continue
                hashlist[item1] += [hashset[item1].distance(hashset[item2])]
                #print item1, item2, hashset[item1].distance(hashset[item2])
        results += [[hashset, hashlist, corpus['desc']]]

    with open('results.csv', 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='{')
        for hashset, hashlist, desc in results:
            writer.writerow([" "])
            writer.writerow([i for i in desc.split()])
            record = []
            record += [['Table'] + [key for key in hashset.keys()]]
            for k in hashset.keys():
                record += [[k] + hashlist[k]]
            for item in record:
                writer.writerow(item)
Example No. 34
 def clear(self):
   self.simhash_index = SimhashIndex(objs=[], f=self.hash_size, k=self.hash_tol)
Example No. 35
s3 = 'This is simhash test.'.decode('utf-8', 'ignore')

# print get_features(s1)
#
# print Simhash(get_features('How are you? I am fine. Thanks.')).value


sh1 = Simhash(s1)
sh2 = Simhash(s2)
sh3 = Simhash(s3)

# print sh.value


# print sh1.distance(sh2)

shIndex = SimhashIndex([], k=3)
shIndex.add('1', sh1)
shIndex.add('2', sh2)
# shIndex.add('3', sh3)

if shIndex.get_near_dups(sh3):
    print 'YES'
else:
    print 'NO'

# print shIndex.get_near_dups(sh2)


Example No. 36
def init_index(url, initial_data):
    data[url] = initial_data
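    # re-hash every stored page and rebuild the global index with tolerance k=3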
    objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
    global index 
    index = SimhashIndex(objs, k=3)
Example No. 37
        f_stop.close()
    f_stop_seg_list = f_stop_text.split('\n')
    for myword in liststr.split('/'):
        if not (myword.strip() in f_stop_seg_list) and len(myword.strip()) > 1:
            mywordlist.append(myword)
    return ''.join(mywordlist)


#data.head()['content'].apply(lambda x:jiebaclearText(str(x)))

data['content'] = data['content'].apply(lambda x: jiebaclearText(str(x)))
data['simhash'] = data['content'].apply(lambda x: Simhash(x).value)

train = data.loc[data['source'] == 'train']
test = data.loc[data['source'] == 'test']

train.drop('source', axis=1, inplace=True)
test.drop([
    'source',
], axis=1, inplace=True)

objs = [(row["id"], Simhash(row["content"]))
        for index, row in train.iterrows()]

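# index the train contents; for every test content, collect the ids of train rows within Hamming distance 12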
index = SimhashIndex(objs, k=12)
test['result'] = test['content'].apply(
    lambda x: index.get_near_dups(Simhash(x)))

sub['result'] = test['result']
sub.to_csv('../output/simhash.csv', index=False)
Example No. 38
		sys.exit(-1)

	logger.info('connect mongo ok.' )

	try:
		logger.info('{create_time:{$gte:%ld,$lt:%ld} }' %(lasttimestamp,curtimestamp) )
		status_count = weibocollection.find({'create_time':{'$gte':lasttimestamp,'$lt':curtimestamp} }).count()
		logger.info('status_count: %d' %status_count)
		if status_count < 10:
			connection.close();mylogger.close()
			sys.exit(0)
		stopwords = loadstopwords(stopwordsfilename)
		fdoc=open(docfile,'w');fcut=open(cutfile,'w')
		num=0;simnum=0;cutnum=0
		#simhash
		index = SimhashIndex({})
		for one in weibocollection.find({'create_time':{'$gte':lasttimestamp,'$lt':curtimestamp} }):
			weibo_id = str(one['_id'])
			weibo_text = one['data']['text'].strip()
			text_sh = Simhash(weibo_text )
			if len(index.get_near_dups(text_sh) ) == 0: #not find sim
				#cut
				text_seg = jieba.cut(weibo_text)
				text_result = list(set(text_seg) - stopwords)
				content = ' 1 '.join(text_result)
				if content != '':
					fdoc.write(weibo_id+'\t'+weibo_text.encode('utf-8')+'\n');fcut.write(content.encode('utf-8')+' 1\n')
					cutnum += 1
				simnum += 1
			num += 1
			index.add(num,text_sh)
Example No. 39
print(Simhash('aa').distance(Simhash('aa')))


def get_features(s):
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]

data = {
    1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar than',
    3: u'This is simhash test.',
}
objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
index = SimhashIndex(objs, k=3)

print(index.bucket_size())

s1 = Simhash(get_features(u'How are you i am fine. blar blar blar blar blar thank'))
print(index.get_near_dups(s1))

index.add('4', s1)
print(index.get_near_dups(s1))


def main():
    pass


if __name__ == '__main__':
    main()
Example No. 40
 def setUp(self):
     objs = [(str(k), Simhash(v)) for k, v in self.data.items()]
     self.index = SimhashIndex(objs, k=10)
Example No. 41
class TestSimhashIndex(TestCase):
    data = {
        1: 'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: 'How are you i am fine. blar blar blar blar blar than',
        3: 'This is simhash test.',
        4: 'How are you i am fine. blar blar blar blar blar thank1',
    }

    def setUp(self):
        objs = [(str(k), Simhash(v)) for k, v in self.data.items()]
        self.index = SimhashIndex(objs, k=10)

    def test_get_near_dup(self):
        s1 = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank')
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 3)

        self.index.delete('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 2)

        self.index.delete('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 2)

        self.index.add('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 3)

        self.index.add('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 3)
Example No. 42
class DocCollection(object):
  def __init__(self, hash_size=64, hash_tol=3, num_words_to_complete=10):
    """
    Params:
      hash_size : The number of output bits of the hash function used in SimHash.
                  Higher values -> able to handle more noise.
      hash_tol  : The number of bits that can differ for a candidate near-match in Simhash
      
      num_words_to_complete : The number of words to complete given a context when a new
                              document is encountered in get_best_match
    """
    
    self.num_words_to_complete = num_words_to_complete
    self.hash_size = hash_size
    self.hash_tol = hash_tol
    
    #This implementation of simhash stores the index in RAM, but it could easily be
    # put on disk.
    self.simhash_index = SimhashIndex(objs=[], f=self.hash_size, k=self.hash_tol)
    self.author_identifier = LanguageModelAuthorIdentifier()
    self.author_semantic_models = SemanticLanguageModels()
  
  def generate_simhash(self, tokens):
    #Generate a Simhash from Spacy tokens.
    sh = Simhash(u'', f=self.hash_size) #silly interface...
    sh.build_by_features(tokens)
    return sh
    
  def add(self, doc, title, author):
    add_to_index = self.simhash_index.add
    
    #Index each paragraph in the document into the simhash index
    paras = extract_paragraphs(doc)
    
    #Update the word shape language model for this author
    para_toks = [tokenize(p) for p in paras]
    flat_tokens = [item for sublist in para_toks for item in sublist]
    self.author_semantic_models.add_doc(flat_tokens, author)
    
    #Update the semantic model for this author
    self.author_identifier.add_doc(flat_tokens, author)
    
    #Add each paragraph to the simhash index
    for para_num, tokens in enumerate(para_toks, 1):
      if not tokens:
        continue
      sh = self.generate_simhash(tokens)
      self.simhash_index.add((tokens, title, author, para_num), sh)
        
  def get_best_match(self, snippet):
    get_near_dups = self.simhash_index.get_near_dups
    generate_simhash = self.generate_simhash
    title_author_to_count = {}
    
    paras = extract_paragraphs(snippet)
    
    #evenly distribute the corrupted paragraphs
    #shuffle(paras)
    
    #For each paragraph, get the closest matching previously encountered paragraphs.
    #If multiple matches, prune via edit distance.
    #The work of art that matches the most paragraphs is the winner (if it matches enough)
    paras_done = 0
    for para in paras:
      tokens = tokenize(para)
      if not tokens:
        continue
      paras_done += 1
      sh = generate_simhash(tokens)
      candidates = [make_tuple(match) for match in get_near_dups(sh)]
      
      #Increment the count of these works
      for candidate in candidates:
        _, title, author, para_num = candidate
        k = (title, author)
        title_author_to_count[k] = title_author_to_count.get(k, 0) + 1
    
    if title_author_to_count:
      #OK, what work was the most frequent, and what was that frequency?
      (title, author), f = max(title_author_to_count.iteritems(), key=lambda item: item[1])
                  
      score = 1.*f/paras_done
      if score >= 0.1:
        return {'title': title, 'author': author, 
                'score': score, 'author_score': None, 
                'completion': None}
    
    #This is either so corrupt that we can't tell what it is, or is a new work.
    #Guess the author
    tokens = [item for sublist in [tokenize(p) for p in paras] for item in sublist]
    author_guess, author_score = self.author_identifier.predict_author(tokens)
    completion = self.author_semantic_models.complete(author_guess, tokens, self.num_words_to_complete, 1)
    
    return {'title': None, 'author': author_guess, 
            'score': None, 'author_score': author_score, 
            'completion': completion}

  def clear(self):
    self.simhash_index = SimhashIndex(objs=[], f=self.hash_size, k=self.hash_tol)
Example No. 43
test_index = [(u[0], Simhash(u[0])) for u in urls]

# simhash_results_a.txt : k=20 (subset)
# simhash_results_b.txt

with open('testdata/solr_20150320/simhash_results_k10.txt', 'w') as f:
    f.write('')

start_time = time.time()

for index, (test_url, test_simhash) in enumerate(test_index):
    i_start_time = time.time()
    if index % 50 == 0:
        print 'completed {0} of {1}'.format(index, len(urls))

    duplicates = []

    for i in xrange(0, len(test_index), 300):
        chunk_index = SimhashIndex(test_index[i:i + 300], k=10)
        dupes = chunk_index.get_near_dups(test_simhash)

        if len(dupes) > 0:
            duplicates += dupes

    print '\t{0} takes {1}'.format(len(duplicates), time.time() - i_start_time)

    with open('testdata/solr_20150320/simhash_results_k10.txt', 'a') as f:
        f.write(json.dumps({test_url: duplicates}) + '\n')

print 'takes:', time.time() - start_time
Example No. 44
class NearDuplicate:
    def __init__(self, filenames, k=2, metadata_dictionary=None):
        self.filenames = filenames
        self.simhash_index = None 
        self.image_dictionary = {}
        self.metadata_dictionary = metadata_dictionary
        self.k = k 
        # Need to store the image hashes in some fashion
        # Possibly cluster the hashes (k-means) 
    
    def tika_metadata(self, filename):
        """Use the tika-py module to grab metadata for a file"""
        parsed = parser.from_file(filename)
        return parsed.get("metadata", {})

    def exifread_metadata(self, filename):
        """Use the exifread module to grab metadata for a file"""
        f = open(filename, 'rb')
        tags = exifread.process_file(f)
        return tags

    def generate_features_from_dict(self, filename):
        """ Use this function when we provide json metadata information from
            the tika java module"""

        # Find the metadata object from the json metadata file for the image_file named 'filename'
        metadata = self.metadata_dictionary.get(filename, {})
       
        # The tags or type of metadata we want
        feature_tags = ["Image Height", "Image Width", "File Size", "Content-Type", "Image Bytes", "File Name Suffix"]

        # Create a feature array using these metadata values
        features = []

        feature_weight_dict = {
                "Image Height" : 1, 
                "Image Width" : 1,
                "Files Size" : 2,
                "Content-Type" : 3,
                "Image Bytes" : 6, 
                "File Name Suffix" :2 
        }

        # Grab the bytes of the entire file
        image_bytes = "NONE"
        try:
            image_bytes = open(filename, 'rb').read()
        except OSError:
            image_bytes = "NONE"

        # Get the central bytes 
        image_bytes_str = unicode( str(image_bytes), 'utf-8', "ignore")
        byte_offset = len(image_bytes_str)//4
        filename_suffix = filename[-10:]

        modified_metadata = {
                "Image Height" : metadata.get("Image Height", "NONE"), 
                "Image Width" : metadata.get("Image Width", "NONE"),
                "File Size" : metadata.get("File Size", "NONE"),
                "Content-Type" : metadata.get("Content-Type", "NONE"),
                "Image Bytes" : image_bytes_str[byte_offset:-byte_offset], 
                "File Name Suffix" : filename_suffix
        }
       
        # Create an array of (token, weight) tuples. These are our features and weights
        # to be used for the Simhash
        for (feature_tag, weight), (meta_tag, meta_value) in zip(feature_weight_dict.items(), 
                modified_metadata.items()):
            features.append((meta_tag + ":" + meta_value, weight))

        return features


    def generate_features(self, filename):
        """Given an image generate a feature vector"""

        """ 
            Since Tika-Py requires a server call (i.e. slower)
            Do native image metadata grabbing, and fallback on tika if the
            image can't be opened (i.e., it's an svg or gif)
        """
        im, use_tika = None, False 
        try:
            im = Image.open(filename)
            use_tika = False
        except IOError:
            use_tika = True
            
        # Grab the metadata for the image
        metadata = {} 
        
        # We'll store features to use for simhash in a tuple array [(token, weight)]
        features = []

        if use_tika:
            # Use only metadata from tika
            # The image file can't be opened using PIL.Image, so that means
            # a diff type of image besides jpg, png
            metadata = self.tika_metadata(filename)

            # Grab the bytes of the entire file
            image_bytes = open(filename).read()

            # Get the central bytes 

            image_bytes_str = unicode( str(image_bytes), 'utf-8', "ignore")
            #image_bytes_str = str(image_bytes)
            byte_offset = len(image_bytes_str)//4
            metadata["Image Bytes"] = image_bytes_str[byte_offset:-byte_offset] 
            feature_tags = ["Image Height", "Image Width", "File Size", "Content-Type", "Image Bytes"]
            features = [tag + ":" + metadata.get(tag,"NONE") for tag in feature_tags]
            return features

        """ 
            FEATURES
                We'll resize the image so all images are normalized to a certain size 
                Also make sure to retain aspect ratio

                Features to use (in order of importance)
                    - center region bytes 
                    - color histogram
                    - content type
                    - image width
                    - image height

            We can take subregions of the image, and hash those
        """

        
        # Resize the image so all images are normalized
        width = im.size[0]
        height = im.size[1]
        resize_width = 30 
        resize_height = resize_width*height/width
        resize_im = None
        histogram_bytes, histogram_weight = "", 0
        center_region_bytes, center_region_weight = "", 5
        extension = ""
        try :
            resize_im = im.resize((resize_width, resize_height), Image.ANTIALIAS)
            # Crop sub regions
            height_padding, width_padding = resize_height/5, resize_width/5
            box = (width_padding, height_padding, resize_width - width_padding, 
                    resize_height - height_padding)
            sub_region = resize_im.crop(box)
            
            # Generate a histogram
            histogram_bytes, histogram_weight = str(resize_im.histogram()), 4
            center_region_bytes, center_region_weight = str(list(sub_region.getdata())), 3
        except OSError:
            
            # Couldn't resize the image. Let's
            print >> sys.stderr, "Couldn't resize the image. Prob an eps or svg"
            resize_im = im
            resize_width = im.size[0]
            resize_height = im.size[1]
            sub_region = im

            # Grab the bytes of the entire file
            image_bytes = open(filename).read()
            # Get the central bytes 
            #image_bytes_str = str(image_bytes)
            histogram_bytes = "NONE"
            image_bytes_str = unicode( str(image_bytes), 'utf-8', "ignore")
            byte_offset = len(image_bytes_str)//4
            center_region_bytes = image_bytes_str[byte_offset:-byte_offset] 
         
        extension = resize_im.format if resize_im.format !=  None else os.path.splitext(filename)[1]
         
        # Figure out the content type (png, jpg, etc.)
        content_type = "image/" + str(extension.lower())
        
        
        feature_weight_dict = {
                "Image Height" : 1, 
                "Image Width" : 1,
                "Image Histogram" : histogram_weight,
                "Content-Type" : 5,
                "Center Region Bytes" : center_region_weight 
        }

        metadata = {
                "Image Height" : str(width), 
                "Image Width" : str(height),
                "Image Histogram" : histogram_bytes,
                "Content-Type" : content_type,
                "Center Region Bytes" : center_region_bytes 
        }
       
        # Create an array of (token, weight) tuples. These are our features and weights
        # to be used for the Simhash
        for (feature_tag, weight), (meta_tag, meta_value) in zip(feature_weight_dict.items(), 
                metadata.items()):
            features.append((meta_tag + ":" + meta_value, weight))

        return features 


    def merge_near_duplicate_dictionaries(self, nd):
        """Merge the current near duplicate instance with another near duplicate instance"""

        smaller_nd = self if len(self.image_dictionary) <= len(nd.image_dictionary) else nd
        larger_nd = self if len(self.image_dictionary) > len(nd.image_dictionary) else nd
        final_dict = larger_nd.image_dictionary

        # Iterate over the smaller near duplicate instance
        for key in smaller_nd.image_dictionary.keys():
            

            # If an exact duplicate exists, just grab it and merge them 
            if larger_nd.image_dictionary.get(key, None) != None:
                arr = smaller_nd.image_dictionary.get(key, []) +\
                        larger_nd.image_dictionary.get(key, [])
                final_dict[key] = arr
                continue

            # Find the closest near duplicate in the larger dictionary by
            # using it's index
            simhash_obj = smaller_nd.image_dictionary[key][0]["hash_object"]

            near_duplicates_keys = larger_nd.simhash_index.get_near_dups(simhash_obj)
            
            # If a near duplicate exists 
            if len(near_duplicates_keys) > 0:
                # grab the array of images at that key in the larger dictionary
                # Merge it the array of images in the smaller dictionary 
                near_dup_key = near_duplicates_keys[0]
                arr = smaller_nd.image_dictionary.get(key, []) +\
                        larger_nd.image_dictionary.get(near_dup_key, [])

                # create an entry in the new dictionary
                final_dict[near_dup_key] = arr
                continue
                
            # Otherwise we should just add this key-object from the dictionary
            # to this array
            final_dict[key] = smaller_nd.image_dictionary[key] 

            # Add this simhash to the Index for efficient searching
            larger_nd.simhash_index.add(key, simhash_obj)

        self.image_dictionary = final_dict
        self.simhash_index = larger_nd.simhash_index

        nd.image_dictionary = final_dict
        nd.simhash_index = larger_nd.simhash_index

        # Now simply return this final dict 
        return final_dict


    def simhash_value_to_key(self, simhash):
        """Given a simhash object, convert it's value to a hexadecimal key 
            This key will be used in our image_file dictionary
        """
        return str(hex(simhash.value))


    def deduplicate_images(self):
        """
            Given a list of image files "self.filenames", deduplicate the images using
            near deduplication 
        """
        # Iterate through our files
        for image_file in self.filenames:
            feature_array = []
            if self.metadata_dictionary != None:
                # Will use a java tika program to generate metadata 
                # Metadata will be a json file with {filename : metadata} objects
                feature_array = self.generate_features_from_dict(image_file)
            else:
                # Use our own function for grabbing metadata
                # Create a list of features
                feature_array = self.generate_features(image_file)
        
            # Simhash this list of features
            sHash = Simhash(feature_array)
            if self.simhash_index == None:
                # First image, so we create the index add it to the dictionary
                # And move on to next iteration
                key = self.simhash_value_to_key(sHash)

                # We will use this index to speed up the process for finding
                # nearby simhashes
                self.simhash_index = SimhashIndex([(key, sHash)], k=self.k)
                self.image_dictionary[key] = [{
                    "filename" : image_file, 
                    "hash_key" : key, 
                    "hash_object": sHash
                }] 
                continue

            near_duplicates_keys = self.simhash_index.get_near_dups(sHash)

            if len(near_duplicates_keys) > 0:
                # There are duplicates, so we should add them to the corresponding entry
                # in the file dictionary

                # Get the key for the nearest duplicate image
                near_dup_simhash_key = near_duplicates_keys[0] 

                # Get the key for this current image 
                current_simhash_key = self.simhash_value_to_key(sHash) 

                # Create an object comprised of the image filename and key
                # We'll store this in a dictionary to be used in our merge step
                current_simhash_object = {
                    "filename" : image_file, 
                    "hash_key" : current_simhash_key,
                    "hash_object" : sHash
                }
                self.image_dictionary[near_dup_simhash_key].append(current_simhash_object)
            else:
                # No duplicates, so let's create an entry in our image filename dictionary
                key = self.simhash_value_to_key(sHash)

                # Add this simhash to the Index for efficient searching
                self.simhash_index.add(key, sHash)

                # Create an object in our image file dictionary
                self.image_dictionary[key] = [{
                    "filename" : image_file, 
                    "hash_key" : key,
                    "hash_object" : sHash
                }]
Example No. 45
    def deduplicate_images(self):
        """
            Given a list of image files "self.filenames", deduplicate the images using
            near deduplication 
        """
        # Iterate through our files
        for image_file in self.filenames:
            feature_array = []
            if self.metadata_dictionary != None:
                # Will use a java tika program to generate metadata 
                # Metadata will be a json file with {filename : metadata} objects
                feature_array = self.generate_features_from_dict(image_file)
            else:
                # Use our own function for grabbing metadata
                # Create a list of features
                feature_array = self.generate_features(image_file)
        
            # Simhash this list of features
            sHash = Simhash(feature_array)
            if self.simhash_index == None:
                # First image, so we create the index add it to the dictionary
                # And move on to next iteration
                key = self.simhash_value_to_key(sHash)

                # We will use this index to speed up the process for finding
                # nearby simhashes
                self.simhash_index = SimhashIndex([(key, sHash)], k=self.k)
                self.image_dictionary[key] = [{
                    "filename" : image_file, 
                    "hash_key" : key, 
                    "hash_object": sHash
                }] 
                continue

            near_duplicates_keys = self.simhash_index.get_near_dups(sHash)

            if len(near_duplicates_keys) > 0:
                # There are duplicates, so we should add them to the corresponding entry
                # in the file dictionary

                # Get the key for the nearest duplicate image
                near_dup_simhash_key = near_duplicates_keys[0] 

                # Get the key for this current image 
                current_simhash_key = self.simhash_value_to_key(sHash) 

                # Create an object comprised of the image filename and key
                # We'll store this in a dictionary to be used in our merge step
                current_simhash_object = {
                    "filename" : image_file, 
                    "hash_key" : current_simhash_key,
                    "hash_object" : sHash
                }
                self.image_dictionary[near_dup_simhash_key].append(current_simhash_object)
            else:
                # No duplicates, so let's create an entry in our image filename dictionary
                key = self.simhash_value_to_key(sHash)

                # Add this simhash to the Index for efficient searching
                self.simhash_index.add(key, sHash)

                # Create an object in our image file dictionary
                self.image_dictionary[key] = [{
                    "filename" : image_file, 
                    "hash_key" : key,
                    "hash_object" : sHash
                }]
Example No. 46
class TestSimhashIndex(TestCase):
    data = {
        1: 'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: 'How are you i am fine. blar blar blar blar blar than',
        3: 'This is simhash test.',
        4: 'How are you i am fine. blar blar blar blar blar thank1',
    }

    def setUp(self):
        objs = [(str(k), Simhash(v)) for k, v in self.data.items()]
        self.index = SimhashIndex(objs, k=10)

    def test_get_near_dup(self):
        s1 = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank')
        dups = self.index.get_near_dups(s1)
        self.assertEqual(3, len(dups))

        self.index.delete('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(2, len(dups))

        self.index.delete('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(2, len(dups))

        self.index.add('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(3, len(dups))

        self.index.add('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(3, len(dups))
Example No. 47
        for j in range(n, n+4):
            fourgram[url].append(words[j])



#duplicate detection
keys = fourgram.keys()
f1 = open('rezFinalNoDuplicates.txt', 'w')
objs = []
for k in fourgram:
    try:
        objs.append((k, Simhash(fourgram[k])))
    except Exception as e:
        print e
#objs = [(k, Simhash(fourgram[k])) for k in fourgram]
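# index every URL's four-gram simhash; each URL is then queried for near-duplicate URLs within Hamming distance 3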
index = SimhashIndex(objs, k=3)

print "bucket_size", index.bucket_size()

for key in keys:
    s1 = Simhash(fourgram[key])
    duplicates = ", ".join(index.get_near_dups(s1))
    f1.write(key + "\t" + duplicates+"\n")
    print key, duplicates

'''
while len(keys) > 0:
    key = keys.pop()
    keysJ = list(keys)
    f1.write(key + '\t' + text[key])
Example No. 48
 def setUp(self):
     objs = [(str(k), Simhash(v)) for k, v in self.data.items()]
     self.index = SimhashIndex(objs, k=10)
Example No. 49
#########################################################################################################################################

if(args.near.upper() == 'Y'):
    print '---------------------------------'
    print ' MatchMeta.Info Database Fuzzing'
    print '---------------------------------'

    def get_features(s):
        width = 3
        s = s.lower()
        s = re.sub(r'[^\w]+', '', s)
        return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]

    data = {}
    objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)

    if os.path.isfile(args.db):
        print 'MatchMeta.Info Database Located'
        print 'Patience...Loading Index...'
        conn = sqlite3.connect(args.db)
        meta = conn.execute("SELECT path FROM MatchMeta WHERE path NOT LIKE '%winsxs%'")
        count = 1

        for line in meta:
            item = Simhash(get_features(unicode(line[0])))
            count = count+1
            index.add(count,item)

        print index.bucket_size()
        print 'Excluding the WINSXS Directory'