Code example #1
def get_simHashindex(hash_list):
    """
    功能:创建Simhash索引
    参数:SimHash列表
    返回值:SimHash索
    """
    return SimhashIndex(hash_list, k=5)  #创建索引
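A minimal usage sketch for the index this helper returns, assuming the upstream simhash package and a hash_list of (key, Simhash) pairs; the keys and texts below are made-up examples:

from simhash import Simhash, SimhashIndex

hash_list = [("doc1", Simhash("the quick brown fox")),
             ("doc2", Simhash("an unrelated document"))]
index = get_simHashindex(hash_list)                         # SimhashIndex with tolerance k=5
print(index.get_near_dups(Simhash("the quick brown fox")))  # keys whose fingerprints differ by at most 5 bits
index.add("doc3", Simhash("yet another document"))          # the index can also grow incrementally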
Code example #2
def _build_index():
    global _INDEX
    index_list = []
    for domain in _TRAIN.keys():
        sim = Simhash(domain)
        index_list.append((domain, sim))
    _INDEX = SimhashIndex(index_list, k=100)
Code example #3
def extract_next_links(url, resp) -> list:
    defrag = urldefrag(url)[0]
    print(defrag)
    if resp.status == 200:
        print("Scanning")
        content = resp.raw_response.text
        if defrag not in urls:
            data = getVisibleText(content)
            simmed = Simhash(data)
            if simmed.value not in sims:
                index = SimhashIndex(objs, k=3)
                if len(index.get_near_dups(simmed)) == 0:
                    urls.add(defrag)
                    sims.add(simmed.value)
                    objs.append((url, simmed))
                    print(len(urls), len(sims), len(objs))
                    # append the page text and its fingerprint to a dump file
                    with open("data_dump.txt", "a", errors="ignore") as file:
                        file.write(url + " \n " + data + "\n" + str(simmed.value) + "\n\n")
            #urls[defrag].add(getVisibleText(content))
            #print(urls[defrag])
        return getAllUrls(url, content)
    else:
        print("Can't scan")
        return []
Code example #4
 def __init__(self, vocab_to_freq, f=64, k=32):
   self.vocab_to_freq = vocab_to_freq
   self.simhash_index = SimhashIndex([], f=f, k=k)
   self.f = f
   self.k = k
   
   simhash_index = self.simhash_index
   for w in vocab_to_freq:
     sh = Simhash(w, f=f)
     simhash_index.add(w, sh)
Code example #5
def clustering():
    fout = open('cluster.txt', 'w', encoding='UTF-8')
    cursor = conn.cursor()
    cursor.execute(
        'SELECT id, title, cluster, sim_count, link, simhash FROM entries where cluster=0'
    )
    entrylist = cursor.fetchall()
    objs = []
    entrydic = {}
    for item in entrylist:
        if not is_en(item[1]):
            if not item[4].startswith("https://weibo.com"):
                sim = Simhash(get_features_cn(item[1]))
                objs.append((str(item[0]), sim))
                entrydic[str(item[0])] = {
                    'title': item[1],
                    'cluster': 0,
                    'sim_count': 0,
                    'link': item[4],
                    'simhash': sim.value
                }
        else:
            sim = Simhash(get_features(item[1]))
            objs.append((str(item[0]), sim))
            entrydic[str(item[0])] = {
                'title': item[1],
                'cluster': 0,
                'sim_count': 0,
                'link': item[4],
                'simhash': sim.value
            }

    index = SimhashIndex(objs, k=tolerance)
    cluster_num = last_cluster_num
    for key in entrydic:
        if entrydic[key]['cluster'] == 0:
            sims = index.get_near_dups(
                Simhash(get_features_cn(entrydic[key]['title'])))
            for item in sims:
                entrydic[item]['cluster'] = cluster_num
                # if len(sims) > 1:
                entrydic[item]['sim_count'] = len(sims) - 1
                if len(sims) > 1:
                    fout.write(item + '\t' + str(entrydic[item]['cluster']) +
                               '\t' + entrydic[item]['title'] + '\n')
                cursor.execute(
                    'UPDATE entries SET cluster=%s, sim_count=%s, simhash=%s where id = %s',
                    (entrydic[item]['cluster'], entrydic[item]['sim_count'],
                     str(entrydic[item]['simhash']), item))
                # conn.commit()
                # fout.write(item + '\t' + str(entrydic[item]['cluster']) + '\t' + entrydic[item]['title'] + '\t' + entrydic[item]['link'] + '\n')
            cluster_num += 1
    # cursor.execute('UPDATE somevariables SET last_cluster=%s', (cluster_num,))
    # conn.commit()
    conn.close()
Code example #6
File: clust.py  Project: johnb30/ark
def add_to_redis(CONN, hashes):
    logger.info(type(hashes))
    objs = []
    for k, v in hashes.iteritems():
        a = Simhash('a')
        a.value = int(k)
        objs.append((v, a))
    logger.info(objs[0])
    logger.info('Number of objects: {}'.format(len(objs)))
    index = SimhashIndex(CONN, objs, k=3)

    return index
Code example #7
def create_test_data():
    """For 1 million records, it takes 5 minutes.
    """
    complexity = 1000**2
    print("creat data ...")
    data = [rand_str(8) for i in range(complexity)]
    print("calculate simhash ...")
    objs = [(i, Simhash(item)) for i, item in enumerate(data)]
    print("creat index ...")
    index = SimhashIndex(objs, k=3)
    safe_dump_pk(data, datafile)
    safe_dump_pk(index, indexfile)
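A companion sketch for reading those artifacts back in the same module; safe_load_pk is a hypothetical counterpart to safe_dump_pk, and datafile/indexfile are the same paths used above:

data = safe_load_pk(datafile)    # hypothetical loader mirroring safe_dump_pk
index = safe_load_pk(indexfile)  # the pickled SimhashIndex
print(index.get_near_dups(Simhash(data[0])))  # a record should at least match itself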
Code example #8
def main():
    # user_query = input()
    DOCID = 0


    numPartial = 1 

    index = SimhashIndex([])

    totaldocs = 0
    docnum = 0

    validDocFile = open('validDocs2', 'w')

    for root, dirs, files in os.walk(DEVPATH):
        for fname in files:
            if not fname.endswith(".json"):
                continue
            totaldocs += 1
            h2t = html2text.HTML2Text()

            file = open(root + "/" + fname)

            pageDict = json.loads(file.read())

            # close file to get memory back
            file.close()

            # get html formated content
            htmlContent = pageDict['content']

            print(pageDict['url'])

            plainContent = h2t.handle(htmlContent)

            feat = get_features(plainContent)

            sim = Simhash(feat)

            if len(index.get_near_dups(sim)) > 0:
                continue

            print(docnum, totaldocs)

            index.add(str(docnum), sim)

            validDocFile.write(root + "/" + fname + "\n")

            docnum+=1


    validDocFile.close()
Code example #9
async def gen_simhash_index(conf):
    m = 0
    n = 0
    objs = []
    simhash_answer_index = {}
    for items in conf.DEMO_QUESTION:
        for item in items:
            objs.append((n, Simhash(await _tokenization(conf, item))))
            simhash_answer_index[n] = m
            n += 1
        m += 1

    simhash_index = SimhashIndex(objs, k=6)
    return simhash_index, simhash_answer_index
Code example #10
def get_near_dups(query_simhash, candidates_simhash, k):
    res = [0] * len(candidates_simhash)
    query = Simhash(value=query_simhash)

    # build (key, Simhash) pairs without mutating the caller's list
    objs = [(str(i), Simhash(value=value)) for i, value in enumerate(candidates_simhash)]
    index = SimhashIndex(objs, k=k)
    near_dups = index.get_near_dups(query)

    for dup in near_dups:
        res[int(dup)] = 1

    return res
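A hypothetical call to the helper above, using integer fingerprints produced by the same 64-bit Simhash; the sample texts are made up:

from simhash import Simhash

values = [Simhash("first sample text").value,
          Simhash("a completely different text").value]
flags = get_near_dups(Simhash("first sample text").value, values, k=3)
print(flags)  # e.g. [1, 0]; a 1 marks a candidate within k differing bits of the query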
Code example #11
def find_near_matches(session, collection, index_size, probability_index_near_match):
    from simhash import Simhash, SimhashIndex
    logging.getLogger().setLevel(logging.CRITICAL)

    tweet_id_simhash_value = session.execute(
        sa.select([model.Tweet.tweet_id, model.Tweet.features['filter','simhash']])
        .where(model.Tweet.collection == collection)
    )

    simhash_index = SimhashIndex([], k=7)

    insert_relation_stmt = pg.insert(model.relation)
    # insert_tweet_near_matches_stmt = insert_tweet_near_matches_stmt.on_conflict_do_update(
    #     index_elements=['tweet_id', 'collection'],
    #     set_={
    #         'earliest_near_match_id': insert_tweet_near_matches_stmt.excluded.earliest_near_match_id
    #     }
    # )

    indexed_tweet_ids = []

    for i, (tweet_id, simhash_value) in enumerate(tweet_id_simhash_value):

        if (i % 100000) == 1000:
            logger.info('Processed %s tweets. Committing.', i)
            session.commit()

        simhash = Simhash(simhash_value)

        near_matches_ids = simhash_index.get_near_dups(simhash)

        if not near_matches_ids:
            simhash_index.add(tweet_id, simhash)
            indexed_tweet_ids.append((tweet_id, simhash))

            if len(indexed_tweet_ids) > index_size:
                simhash_index.delete(*indexed_tweet_ids.pop(0))

        if near_matches_ids:
            near_match_id = min(near_matches_ids)

            logger.debug('A near match %s for tweet %s', near_match_id, tweet_id)
            session.execute(
                insert_relation_stmt.values(
                    [(tweet_id, collection, 'near_match', near_match_id)]
                )
            )

    session.commit()
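The bounded-index idea above (add new fingerprints, evict the oldest once index_size is exceeded) can be sketched without the SQLAlchemy plumbing; rolling_near_match and its stream argument are illustrative names, not part of the original code:

from simhash import Simhash, SimhashIndex

def rolling_near_match(stream, index_size=100000, k=7):
    """Yield (item_id, earliest_near_match_id) over a stream of (item_id, text) pairs."""
    index = SimhashIndex([], k=k)
    window = []                               # (item_id, Simhash) pairs currently indexed
    for item_id, text in stream:
        sh = Simhash(text)
        near = index.get_near_dups(sh)
        if near:
            yield item_id, min(near)          # report the earliest matching key, as above
        else:
            index.add(item_id, sh)
            window.append((item_id, sh))
            if len(window) > index_size:
                index.delete(*window.pop(0))  # evict the oldest entry to bound memory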
Code example #12
File: nn.py  Project: ChenghaoMou/text-dedup
def simhash_clustering(
    signatures: List[int],
    hamming_distance: int = 3,
    # num_blocks: Optional[int] = 5,
) -> List[List[int]]:

    index = SimhashIndex([(i, Simhash(value=signature))
                          for i, signature in enumerate(signatures)],
                         k=hamming_distance)

    neighbors: List[List[int]] = []
    for signature in signatures:
        neighbors.append(
            list(map(int, index.get_near_dups(Simhash(value=signature)))))

    return neighbors
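A small usage sketch for simhash_clustering, assuming the input signatures are 64-bit fingerprints computed with the default Simhash settings:

from simhash import Simhash

sigs = [Simhash("hello world").value,
        Simhash("hello world again").value,
        Simhash("something else entirely").value]
print(simhash_clustering(sigs, hamming_distance=3))
# one inner list per signature, holding the indices whose fingerprints differ by at most 3 bits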
Code example #13
def console_test():
    from simhash import Simhash, SimhashIndex
    data = {
        1: 'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: 'How are you i am fine. blar blar blar blar blar than',
        3: 'This is simhash test.',
        4: 'How are you i am fine. blar blar blar blar blar thank1',
    }
    objs = [(str(k), Simhash(v)) for k, v in data.items()]
    index = SimhashIndex(objs, k=10)
    s1 = Simhash(
        u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank'
    )
    dups = index.get_near_dups(s1)
    dups = index.get_near_dups2(s1, 5)
    index.remove(s1)
Code example #14
    def process_graph(self, project_id):
        visits = defaultdict(list)
        p = 0
        hashtags_db = Hashtag.objects.filter(project_id=project_id)

        logger.info("Total hashtags to process " + str(len(hashtags_db)))
        for hashtag_entry in hashtags_db:
            visits[hashtag_entry.user_id].append(hashtag_entry.hashtag)
            p += 1

        logger.info("Hashtag read")
        logger.info("Hashtag processed " + str(p))
        logger.info("Visits count " + str(len(visits)))

        objs = []
        cant_users = 0
        cant_processed = 0
        index = SimhashIndex(objs, f=f1, k=k1)
        for user, hashtags in visits.iteritems():
            if len(hashtags) > MIN_HASHTAG_PER_USER:
                simhash = Simhash(hashtags, f=f1)
                index.add(user, simhash)
                cant_processed += 1
            cant_users += 1
            if cant_users % 10000 == 0:
                logger.info("%s processed" % cant_users)

        logger.info("Simash index build for %i out of %i users" %
                    (cant_processed, len(visits)))
        cant_processed = 0
        for user, hashtags in visits.iteritems():
            near_dups = index.get_near_dups(Simhash(hashtags, f=f1))
            for user_near_dups in near_dups:
                user_near_dups = long(user_near_dups)
                if user_near_dups != long(user):
                    hashtag_near_dups = visits[user_near_dups]
                    intersect = set(hashtags).intersection(hashtag_near_dups)
                    ratio = len(intersect) * 1.0 / len(hashtag_near_dups)
                    if ratio >= 0.1:
                        hashtag_graph = HashtagGraph(user_oid_i=user,
                                                     user_oid_j=user_near_dups,
                                                     ratio=ratio)
                        hashtag_graph.save()
            cant_processed += 1
            if cant_processed % 10000 == 0:
                logger.info("%i processed" % cant_processed)
Code example #15
File: test.py  Project: MacHu-GWU/simhash-guide
def use_simhash_index():
    data = {
        1: "How are you? I Am fine. blar blar blar blar blar Thanks.",
        2: "How are you i am fine. blar blar blar blar blar than",
        3: "This is simhash test.",
    }
    objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)

    print(index.bucket_size())

    s1 = Simhash(
        get_features(u"How are you i am fine. blar blar blar blar blar thank"))
    print(index.get_near_dups(s1))

    index.add("4", s1)
    print(index.get_near_dups(s1))
Code example #16
File: sim.py  Project: vicever/zufang-1
def sim_merge(finaldb_cut, simdb):
    d = {}
    index_list = []
    hashurl2sim = {}
    max_distance = 10
    with open(finaldb_cut, 'r') as f:
        for line in f:
            if not line:
                break
            # hashurl  title  author  images  links  text  pub_time
            # 1        2      3       4       5      6     7
            # jushi  shouji  zujin  dizhi  ditie  url  crawl_time  source  ext
            # 8      9       10     11     12     13   14          15      16
            array = line.rstrip('\r\n').split('\t')
            hashurl = array[0]  #string,key
            title = array[1]  #string
            text = array[5]  #string
            pub_time = array[6]  #string
            url = array[12]  #string

            d.update({hashurl: (title, url, pub_time)})
            sim = Simhash((title + text).decode('utf-8'))
            index_list.append((hashurl, sim))
            hashurl2sim.update({hashurl: sim})

    index = SimhashIndex(index_list, k=max_distance)
    merged = {}
    while d:
        hashurl, (title, url, pub_time) = d.popitem()
        merged[hashurl] = (title, url, pub_time)
        sim_list = index.get_near_dups(hashurl2sim[hashurl])
        buf_list = []
        for h in sim_list:
            if h != hashurl:
                if d.has_key(h):
                    title2, url2, pub_time2 = d.pop(h)
                    merged[h] = (title2, url2, pub_time2)
                else:
                    title2, url2, pub_time2 = merged[h]
            else:
                title2, url2, pub_time2 = title, url, pub_time
            buf_list.append((h, title2, url2, pub_time2))
        if len(buf_list) > 1:
            buf_list = sorted(buf_list, key=lambda i: i[3], reverse=True)
            simdb.insert('\t'.join([buf_list[0][0], json.dumps(buf_list[1:])]))
Code example #17
File: urlAnalyzer.py  Project: mriverov/tip.twitter
    def process_graph(self, project_id):
        visits = defaultdict(list)
        processed = 0
        urls_db = Urls.objects.filter(project_id=project_id)

        logger.info("Total urls to process " + str(len(urls_db)))
        for url_entry in urls_db:
            visits[url_entry.user_id].append(url_entry.url)
            processed += 1
        logger.info("Urls read")
        logger.info("Urls processed " + str(processed))
        logger.info("Visits count " + str(len(visits)))

        objs = []
        cant_users = 0
        cant_processed = 0
        index = SimhashIndex(objs, f=f1, k=k1)
        for user, urls in visits.iteritems():
            if len(urls) > MIN_URLS_PER_USER:
                simhash = Simhash(urls, f=f1)
                index.add(user, simhash)
                cant_processed += 1
            cant_users += 1
            if cant_users % 10000 == 0:
                logger.info("%s processed" % cant_users)

        logger.info("Simash index build for %i out of %i users" %
                    (cant_processed, len(visits)))
        cant_processed = 0
        for user, urls in visits.iteritems():
            near_dups = index.get_near_dups(Simhash(urls, f=f1))
            for user_near_dups in near_dups:
                user_near_dups = long(user_near_dups)
                if user_near_dups != long(user):
                    urls_near_dups = visits[user_near_dups]
                    intersect = set(urls).intersection(urls_near_dups)
                    ratio = len(intersect) * 1.0 / len(urls_near_dups)
                    if ratio >= 0.1:
                        url_graph = UrlsGraph(user_oid_i=user,
                                              user_oid_j=user_near_dups,
                                              ratio=ratio)
                        url_graph.save()
            cant_processed += 1
            if cant_processed % 10000 == 0:
                logger.info("%i processed" % cant_processed)
Code example #18
def simhashsort(datadic, entryset):
    objs = [(id, Simhash(sent)) for id, sent in datadic.items()]
    index = SimhashIndex(objs, k=tolerance)  # k is the tolerance; the larger k is, the more near-duplicates are returned
    kind = 1  # cluster id
    sorted = set()
    for id in datadic:
        if str(id) in sorted:  # skip ids that are already clustered
            continue
        # collect the near-duplicate set
        similiarlist = index.get_near_dups(Simhash(datadic[id]))
        similiarlist.append(str(id))
        # write the cluster id of this near-duplicate set back into entryset
        for id in similiarlist:
            sorted.add(id)
        for entry in entryset:
            if str(entry["id"]) in similiarlist:
                entry["cluster"] = kind
        kind += 1
Code example #19
def save_duplicates(save_path, text2hash_dict, k=5):
    """Group similar docs' title"""
    # Construct SimhashIndex object for similar docs detection. k is tolerance.
    index = SimhashIndex(text2hash_dict, k=k)

    done = list()
    with tqdm(total=len(text2hash_dict)) as pbar:
        with open(save_path, 'w', encoding='utf8') as file:
            for i in range(len(text2hash_dict) - 1):
                # get near duplicates
                near_dups = index.get_near_dups(text2hash_dict[i][1])
                # near dups includes origin title, len > 1 requested
                if len(near_dups) > 1 and text2hash_dict[i][0] not in done:
                    for title in near_dups:
                        file.write(title)
                        file.write('\n')
                    file.write('#' * 5 + '\n')
                    done.extend(near_dups)
                pbar.update()
Code example #20
 def __init__(self, hash_size=64, hash_tol=3, num_words_to_complete=10):
   """
   Params:
     hash_size : The number of output bits of the hash function used in SimHash.
                 Higher values -> able to handle more noise.
     hash_tol  : The number of bits that can differ for a candidate near-match in Simhash
     
     num_words_to_complete : The number of words to complete given a context when a new
                             document is encountered in get_best_match
   """
   
   self.num_words_to_complete = num_words_to_complete
   self.hash_size = hash_size
   self.hash_tol = hash_tol
   
   #This implementation of simhash stores the index in RAM, but it could easily be
   # put on disk.
   self.simhash_index = SimhashIndex(objs=[], f=self.hash_size, k=self.hash_tol)
   self.author_identifier = LanguageModelAuthorIdentifier()
   self.author_semantic_models = SemanticLanguageModels()
Code example #21
def simhashSort2(datadic, entryset):
    objs = []
    for entry in datadic:
        objs.append((entry[0], Simhash(entry[1])))
    index = SimhashIndex(objs, k=tolerance)  # k is the tolerance; the larger k is, the more near-duplicates are returned
    kind = 1  # cluster id
    sorted = set()
    for item in datadic:
        if str(item[0]) in sorted:  # skip ids that are already clustered
            continue
        # collect the near-duplicate set
        similiarlist = index.get_near_dups(Simhash(item[1]))
        similiarlist.append(str(item[0]))  # include this item's own id
        # write the cluster id of this near-duplicate set back into entryset
        for id in similiarlist:
            sorted.add(id)
        for entry in entryset:
            if str(entry["id"]) in similiarlist:
                entry["sim_count"] = kind
        kind += 1
Code example #22
 def __init__(self, config, worker=None):
     self.config = config
     self.host, self.port = config.cache_server
     #self.robots = list of banned paths
     self.robots = {}
     self.simhashes = SimhashIndex([])
     self.link = 1
     self.worker = worker
     self.maxWords = (
         "", 0
     )  # maxWords[0] is the URL, maxWords[1] is the number of words in it
     self.wordCounter = Counter(
     )  # a dictionary that keeps track of the # of words
     self.stopWords = [
         '1', 'a', 'about', 'above', 'after', 'again', 'against', 'all',
         'also', 'am', 'an', 'and', 'any', 'are', 'are', "aren't", 'as',
         'at', 'b', 'be', 'because', 'been', 'before', 'being', 'below',
         'between', 'both', 'but', 'by', 'can', 'can', "can't", 'cannot',
         'could', "couldn't", 'd', 'did', "didn't", 'do', 'does', "doesn't",
         'doing', "don't", 'down', 'during', 'each', 'few', 'for', 'from',
         'further', 'had', "hadn't", 'has', 'has', "hasn't", "hasn't",
         'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her',
         'herself', 'him', 'himself', 'his', 'how', "how's", 'i', "i'd",
         "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it',
         "it's", 'its', 'itself', "let's", "ll", 'm', 'may', 'me', 'more',
         'most', "mustn't", 'my', 'myself', 'next', 'no', 'nor', 'not',
         'of', 'off', 'on', 'once', 'once', 'one', 'only', 'or', 'other',
         'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 's',
         'same', 'say', 'says', "shan't", 'she', "she'd", "she'll", "she's",
         'should', "shouldn't", 'so', 'some', 'such', 't', 'than', 'that',
         "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then',
         'there', "there's", 'these', 'they', "they'd", "they'll",
         "they're", "they've", 'this', 'those', 'through', 'to', 'too',
         'under', 'under', 'until', 'until', 'up', 've', 'very', 'was',
         "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were',
         "weren't", 'what', "what's", 'when', "when's", 'where', 'which',
         'while', 'who', "who's", 'whom', 'why', "why's", 'will', 'with',
         "won't", 'would', "wouldn't", 'x', 'y', 'you', "you'd", "you'll",
         "you're", "you've", 'your', 'yourself', 'yourselves'
     ]
Code example #23
def simhash_1(labels, targets, query, query_url, dataset, k=2, width=5):
    dictionary = dict(zip(labels, targets))
    objs = [(str(k), Simhash(get_features(v, width)))
            for k, v in dictionary.items()]
    index = SimhashIndex(objs, k=k)
    query_simhash = Simhash(get_features(query, width))
    near_dups = index.get_near_dups(query_simhash)

    # Save fingerprints for future use
    appendToFingerprints(
        dataset, './dataset/fingerprints.csv', {
            "query": str(query_simhash.value),
            "duplicates": ' '.join([str(obj[1].value) for obj in objs])
        })
    # print("QUERY: {}".format(query_url))
    # pp(near_dups)

    return {
        "dataset": dataset,
        "query": query_url,
        "duplicates": ' '.join(near_dups)
    }
Code example #24
def test(n):
    import time
    import distance
    from simhash import Simhash, SimhashIndex

    WIDTH = 3

    def gg():
        import random
        from random import randint
        from simhash import Simhash, SimhashIndex
        from itertools import groupby
        # text = str(bin(randint(2**63, 2**64-1)))[2:]
        # tokens = [text[i:i + WIDTH] for i in range(max(len(text) - WIDTH + 1, 1))]
        # return text, Simhash({k: sum(1 for _ in g) for k, g in groupby(sorted(tokens))})
        text = ''.join([random.choice('0123456789abcdef') for _ in range(36)])
        return text, Simhash(text)

    hashes = [gg() for _ in range(n)]
    d1, d2 = [], []
    test_string, test_hash = gg()

    start = time.time()
    for s, h in hashes:
        d1.append([distance.hamming(test_string, s), s])
    print time.time() - start

    start = time.time()
    index = SimhashIndex(hashes, k=5)
    for st in index.get_near_dups(test_hash):
        d2.append([distance.hamming(test_string, st), st])
    print time.time() - start

    print len(d1), len(d2)

    for a, b in zip(sorted(d1)[:20], sorted(d2)):
        print a[1] == b[1], '\t', a, '\t', b
Code example #25
File: hashtest.py  Project: vedmathai/nearduplicate
def main(path):
    corpuses = readFiles.normalize(path)
    results = []
    for corpus in corpuses:
        hashset = {}
        listofitems = []
        for item in corpus.keys():
            if item == 'desc': continue
            z = Simhash(corpus[item])
            hashset[item] = z
            listofitems += [(item, z)]

        l = SimhashIndex(listofitems)
        #print(l.get_near_dups(hashset['../corpus/bbc/tech1/001.txt']))
        hashlist = {}
        for i, item1 in enumerate(hashset.keys()):
            hashlist[item1] = []
            for j, item2 in enumerate(hashset.keys()):
                if j < i:
                    hashlist[item1] += [' ']
                    continue
                hashlist[item1] += [hashset[item1].distance(hashset[item2])]
                #print item1, item2, hashset[item1].distance(hashset[item2])
        results += [[hashset, hashlist, corpus['desc']]]

    with open('results.csv', 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='{')
        for hashset, hashlist, desc in results:
            writer.writerow([" "])
            writer.writerow([i for i in desc.split()])
            record = []
            record += [['Table'] + [key for key in hashset.keys()]]
            for k in hashset.keys():
                record += [[k] + hashlist[k]]
            for item in record:
                writer.writerow(item)
Code example #26
 def setUp(self):
     objs = [(str(k), Simhash(v)) for k, v in self.data.items()]
     self.index = SimhashIndex(objs, k=10)
Code example #27
def init_index(url, initial_data):
    data[url] = initial_data
    objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
    global index 
    index = SimhashIndex(objs, k=3)
Code example #28
        f_stop.close()
    f_stop_seg_list = f_stop_text.split('\n')
    for myword in liststr.split('/'):
        if not (myword.strip() in f_stop_seg_list) and len(myword.strip()) > 1:
            mywordlist.append(myword)
    return ''.join(mywordlist)


#data.head()['content'].apply(lambda x:jiebaclearText(str(x)))

data['content'] = data['content'].apply(lambda x: jiebaclearText(str(x)))
data['simhash'] = data['content'].apply(lambda x: Simhash(x).value)

train = data.loc[data['source'] == 'train']
test = data.loc[data['source'] == 'test']

train.drop('source', axis=1, inplace=True)
test.drop([
    'source',
], axis=1, inplace=True)

objs = [(row["id"], Simhash(row["content"]))
        for index, row in train.iterrows()]

index = SimhashIndex(objs, k=12)
test['result'] = test['content'].apply(
    lambda x: index.get_near_dups(Simhash(x)))

sub['result'] = test['result']
sub.to_csv('../output/simhash.csv', index=False)
Code example #29
import ast
import time
import mysql.connector
from sumy.utils import get_stop_words
import nltk
from config import *
# Note: set password to your root password:
conn = mysql.connector.connect(user='******',
                               password=sql_password,
                               database='test')
conn.autocommit = True

last_cluster_num = 0

objs = []
index = SimhashIndex(objs, k=tolerance)


def restore_simhash():
    global last_cluster_num
    cursor = conn.cursor()
    cursor.execute('select id, simhash from entries where simhash > 0')
    entries = cursor.fetchall()
    for entry in entries:
        index.add(str(entry[0]), Simhash(int(entry[1])))

    cursor.execute('select max(cluster) from entries')
    last_cluster_num = cursor.fetchone()[0] + 1  # no need to add 1 again


def is_en(s):
Code example #30
    for file in os.listdir(news_dir):
        news_file = os.path.join(news_dir, file)
        with open(news_file) as f:
            for line in f:
                news = json.loads(line)
                title_features, content_features = get_news_feature(news)
                print(title_features)
                print(content_features)
                title_data.append((str(news_id), Simhash(title_features)))
                content_data.append((str(news_id), Simhash(content_features)))

                news_id += 1
                if news_id % 1000 == 0:
                    logging.info('{} has finished'.format(news_id))

    title_index = SimhashIndex(title_data)
    content_index = SimhashIndex(content_data)
    # saving
    with open('title_index.pkl', 'wb') as f1:
        pickle.dump(title_index, f1)
    with open('content_index.pkl','wb') as f2:
        pickle.dump(content_index, f2)

    # loading
    # with open('title_index.pkl','rb') as f1:
    #     title_index = pickle.load(f1)
    # with open('content_index.pkl','rb') as f2:
    #     content_index = pickle.load(f2)
    #
    # print(title_index.bucket_size)
    # print(content_index.bucket_size)