Esempio n. 1
0
def resort(companies, artifacts, news):
    """Assign every artifact and news item to its closest company by simhash.

    Each item is rendered to text (``getArtiStr`` / ``getNewsStr``), turned
    into a simhash and compared with the simhash of every company's text
    (``getComStr``). The company with the smallest distance wins; on a tie
    the company that comes first in ``companies`` is kept.

    :param companies: iterable of company objects accepted by ``getComStr``.
    :param artifacts: iterable of hashable artifact objects.
    :param news: iterable of hashable news objects.
    :return: ``{'artifact': {artifact: company}, 'news': {news: company}}``.
        When ``companies`` is empty every item maps to ``''``.
    """
    # Company fingerprints do not depend on the item being matched, so they
    # are built once here instead of once per artifact and per news item.
    company_hashes = [
        (c, simhash.Simhash(simhash.get_features(getComStr(c))))
        for c in companies
    ]

    def _closest_company(text):
        """Return the company whose fingerprint is nearest to ``text``."""
        target = simhash.Simhash(simhash.get_features(text))
        minDistance = 99999999
        minCompany = ''
        for company, company_hash in company_hashes:
            distance = target.distance(company_hash)
            # Strict '<' keeps the earliest company on equal distances.
            if distance < minDistance:
                minDistance = distance
                minCompany = company
        return minCompany

    dic = {'artifact': {}, 'news': {}}
    for a in artifacts:
        dic['artifact'][a] = _closest_company(getArtiStr(a))

    for n in news:
        dic['news'][n] = _closest_company(getNewsStr(n))

    return dic
Esempio n. 2
0
def get_simhash_dis(str1, str2):
    """Return a blended similarity score for two texts.

    The score is the arithmetic mean of three measures:

    * simhash similarity: ``1 - hamming_distance / 64``
      (64 is presumably the fingerprint width in bits -- confirm),
    * ``Levenshtein.ratio(str1, str2)``,
    * ``Levenshtein.jaro(str1, str2)``.

    :param str1: first text.
    :param str2: second text.
    :return: the averaged similarity as a float.
    """
    simhash_str1 = simhash.Simhash(str1)
    simhash_str2 = simhash.Simhash(str2)
    # Divide by a float: under Python 2 an int / int division truncates, which
    # would collapse this term to 1 for every distance below 64.
    dis_simhash = 1 - simhash_str1.distance(simhash_str2) / 64.0
    dis_ratio = Levenshtein.ratio(str1, str2)
    dis_jaro = Levenshtein.jaro(str1, str2)
    res = (dis_simhash + dis_ratio + dis_jaro) / 3
    return res
Esempio n. 3
0
def sketchesSimhash(sketches):
    """Build a 64-bit simhash for every entry of ``sketches``.

    :param sketches: mapping of sketch key to the value that is handed to
        ``simhash.Simhash`` (named ``asProb`` in this code).
    :return: dict mapping each sketch key to its ``Simhash`` object.
    """
    # .items() exists on Python 2 and Python 3 mappings alike, whereas
    # .iteritems() raises AttributeError on Python 3.
    return {
        sketch: simhash.Simhash(asProb, f=64)
        for sketch, asProb in sketches.items()
    }
Esempio n. 4
0
 def compute_simhash(self):
     """Store the hexadecimal simhash of this object's text in ``self.simhash``.

     The hashed text is ``self.enunciado`` followed by the ``enunciado`` and
     ``respuesta`` of every entry in ``self.cuestiones``, joined by newlines.
     """
     partes = [self.enunciado]
     for cuestion in self.cuestiones:
         partes.extend((cuestion.enunciado, cuestion.respuesta))
     huella = simhash.Simhash("\n".join(partes))
     self.simhash = "%x" % huella.value
Esempio n. 5
0
 def __init__(
     self,
     setURLIter,
     maxFileBytes,
     setStemmer=UT.Stemmer_killPunct()
 ):
     """Initialise the parser state.

     :param setURLIter: stored as ``self.urlIter``; presumably an iterator
         over the URLs to process -- confirm against callers.
     :param maxFileBytes: stored as ``self.maxFile``; presumably a cap on
         the bytes written per output file -- confirm.
     :param setStemmer: stored as ``self.stemmer``.

     NOTE(review): the default ``UT.Stemmer_killPunct()`` is evaluated once,
     when this function is defined, so every instance created without an
     explicit stemmer shares the same object. Confirm the stemmer keeps no
     per-instance state.
     """
     self.urlIter = setURLIter
     self.stemmer = setStemmer
     self.simhash = SH.Simhash()
     # File writing
     self.writer = UT.FileWriter(True)  # True turns logging on
     self.contentWriter = UT.FileWriter()  # For unlogged content
     # Passing None makes uq resume after last uq id
     # Defeat this by saving a 1 to uqgen.txt
     # Call uq.saveState() after parseAll()
     self.uq = UT.UQGen(None)
     self.maxFile = maxFileBytes
     # Running byte counter, starts at zero.
     self.bytesWritten = 0
     # Logging
     self.badList = []
     # Current data
     self.currTitle = ""
     self.currRaw = None
     self.currContent = None
     self.procContent = None
     self.currAttr = None
Esempio n. 6
0
 def get_simhash_value(contents):
     """Return the simhash value of the joined content, as a decimal string.

     Only entries whose ``"content"`` is non-blank after stripping are used;
     they are concatenated unstripped, in order, with no separator.
     """
     fragments = [
         entry["content"]
         for entry in contents
         if entry["content"].strip() != ""
     ]
     fingerprint = simhash.Simhash(simhash.get_features("".join(fragments)))
     return str(fingerprint.value)
Esempio n. 7
0
    def get_distance(self, newsId1, newsId2):
        """Return the simhash distance between two stored news documents.

        Both documents are loaded from ``self.collection`` by ``_id``. For
        each one the non-blank ``content`` strings of its ``contents`` list
        are concatenated and fingerprinted.

        :param newsId1: ``_id`` of the first document.
        :param newsId2: ``_id`` of the second document.
        :return: the distance reported by ``Simhash.distance``.
        :raises TypeError: if either document is not found (``find_one``
            returned ``None``).
        """
        news1 = self.collection.find_one({'_id': newsId1})
        news2 = self.collection.find_one({'_id': newsId2})

        def get_simhash(contents):
            """Fingerprint the concatenated non-blank ``content`` fields."""
            main = "".join(
                content["content"]
                for content in contents
                if content["content"].strip() != "")
            return simhash.Simhash(simhash.get_features(main))

        # Compare the fingerprints directly. Converting the value to a decimal
        # string and wrapping it in another Simhash would hash those digits as
        # text, so the distance would no longer relate to the content.
        simhash1 = get_simhash(news1['contents'])
        simhash2 = get_simhash(news2['contents'])
        return simhash1.distance(simhash2)
def hash_simhash(text):
    """
    Calculate a coarsened SimHash value for the given text.

    :param text: The text to fingerprint.
    :return: The SimHash value floor-divided by 100, as an integer.
    """

    # Floor division keeps the arithmetic in exact integers. With true
    # division (``/`` on Python 3) the value would pass through a float and
    # lose the low-order bits of a large fingerprint.
    return int(simhash.Simhash(text).value // 100)
Esempio n. 9
0
def process():
    """Collect the indices of all texts that have a near-duplicate.

    Every unordered pair of entries in the module-level ``texts`` is
    compared; when the simhash distance is below 3 both indices are added to
    the module-level ``finallist`` (once each, in discovery order). The
    module-level ``count`` is incremented once per compared pair, and
    ``finallist`` is printed at the end.
    """
    global count
    global finallist
    # Fingerprint each text once instead of rebuilding both hashes for every
    # pair inside the nested loop.
    hashes = [simhash.Simhash(text) for text in texts]
    total = len(hashes)
    for i in range(total):
        for j in range(i + 1, total):
            count += 1
            if hashes[i].distance(hashes[j]) < 3:
                if i not in finallist:
                    finallist.append(i)
                if j not in finallist:
                    finallist.append(j)

    # Parenthesised form prints the same on Python 2 and is valid on Python 3.
    print(finallist)
Esempio n. 10
0
def check_same_act(act):
    """Report whether a stored activity is a near-duplicate of ``act``.

    Candidates are the ``collection_news`` documents of type 60002 that
    share ``act``'s ``beginDate``, ``endDate`` and ``city``. A candidate
    matches when the distance between its simhash value and ``act``'s is
    below 6.

    :param act: document with the keys ``simhashValue``, ``beginDate``,
        ``endDate``, ``city``, ``title`` and ``link``.
    :return: ``True`` on the first matching candidate, otherwise ``False``.
    """
    # int() accepts arbitrarily large values on Python 2 and 3; long() only
    # exists on Python 2.
    v = int(act["simhashValue"])
    acts1 = list(
        collection_news.find({
            "type": 60002,
            "beginDate": act["beginDate"],
            "endDate": act["endDate"],
            "city": act["city"]
        }))
    for act1 in acts1:
        # Skip candidates without a fingerprint. The guard must look at the
        # candidate (act1); act's own value was already parsed above.
        if "simhashValue" not in act1 or act1["simhashValue"] is None:
            continue
        logger.info("same title: %s", act["title"])
        v1 = int(act1["simhashValue"])
        dis = simhash.Simhash(v).distance(simhash.Simhash(v1))
        if dis < 6:
            logger.info("Same act!!! %s, %s, %s, %s, %s", dis, act["title"],
                        act1["title"], act["link"], act1["link"])
            return True
    return False
Esempio n. 11
0
 def _detect_internal(self, fpath):
     """Classify one file by looking for near-duplicate known samples.

     The file's features are loaded, fingerprinted and looked up in
     ``self.detector``. A hit is logged as a warning on both loggers; a miss
     is logged at info level on ``self.cmdx_logger`` only.

     :param fpath: path of the file to inspect.
     :return: ``{fpath: True}`` when matches were found, else
         ``{fpath: False}``.
     """
     with open(fpath, 'rb') as sample_file:
         features = self._load_features_from_file(sample_file)
         fingerprint = simhash.Simhash(features)
         near_dups = self.detector.get_near_dups(fingerprint)
         is_webshell = len(near_dups) > 0
         if is_webshell:
             message = '[Webshell] > {0} with matches: {1}'.format(
                 fpath, near_dups)
             self.cmdx_logger.warning(message)
             self.file_logger.warning(message + '\r\n')
         else:
             message = 'NormalPage > {0}.'.format(fpath)
             self.cmdx_logger.info(message)
         return {fpath: is_webshell}
Esempio n. 12
0
    def add_url(self, url, **kwargs):
        """Register ``url`` in the exact filter and, if enabled, the simhash index.

        The URL is reduced to a key via ``_prehandle_path`` and
        ``_concat_url``. Keys already in ``self.bfilter`` are ignored. New
        keys are added to the filter and, when ``self.distance`` is truthy,
        indexed under a random UUID unless a near-duplicate already exists.
        """
        raw_path = urlparse(url).path or "/"
        normalized = self._prehandle_path(raw_path)
        key = self._concat_url(url, normalized, **kwargs)

        if key in self.bfilter:
            return
        self.bfilter.add(key)

        if not self.distance:
            return
        fingerprint = simhash.Simhash(key)
        if not self._simindex.get_near_dups(fingerprint):
            self._simindex.add(uuid.uuid4(), fingerprint)
Esempio n. 13
0
    def url_is_duplicate(self, url, **kwargs):
        """Tell whether ``url`` was seen before, exactly or approximately.

        Returns ``True`` when the URL's key is already in ``self.bfilter``.
        Otherwise, if ``self.distance`` is truthy, returns whether the
        simhash index holds a near-duplicate of the key; if it is falsy,
        returns ``False``. Nothing is added to the filter or the index.
        """
        raw_path = urlparse(url).path or "/"
        normalized = self._prehandle_path(raw_path)
        key = self._concat_url(url, normalized, **kwargs)

        if key in self.bfilter:
            return True
        if not self.distance:
            return False

        near_dups = self._simindex.get_near_dups(simhash.Simhash(key))
        return bool(near_dups)
Esempio n. 14
0
 def run(self):
     """Crawl URLs from the frontier until it reports none are left.

     Each URL is downloaded and scraped. A page is reported, indexed and
     has its links queued only when its tokens are not the empty string and
     the frontier's simhash index holds no near-duplicate. Every URL is
     marked complete afterwards. When the frontier is empty the thread is
     ended, and the report is printed if the thread count has reached zero.
     """
     while True:
         tbd_url = self.frontier.get_tbd_url()
         if not tbd_url:
             self.logger.info("Frontier is empty. Stopping Crawler.")
             self.frontier.end_thread()
             if self.frontier.threadCount == 0:
                 self.report.print_report()
             return

         resp = download(tbd_url, self.config, self.logger)
         self.logger.info(
             f"Downloaded {tbd_url}, status <{resp.status}>, "
             f"using cache {self.config.cache_server}.")
         scraped_urls, tokens = scraper(tbd_url, resp)

         # The near-duplicate lookup only runs for non-empty pages.
         is_new_page = tokens != '' and not (
             self.frontier.simhashIndex.get_near_dups(
                 simhash.Simhash(tokens)))
         if is_new_page:
             self.report.store_report(tbd_url, tokens)
             self.frontier.add_simhash(tbd_url, tokens)
             for scraped_url in scraped_urls:
                 self.frontier.add_url(scraped_url)

         self.frontier.mark_url_complete(tbd_url)
Esempio n. 15
0
    return text


# In[21]:

# Add a normalized copy of every row's text as the "norm_text" column.
extracted["norm_text"] = extracted["text"].apply(normalize_text)

# In[22]:

# Notebook preview of the first rows.
extracted.head()

# In[25]:

# Notebook spot check of one normalized text.
extracted.iloc[48]["norm_text"]

# In[29]:


def get_features(s):
    """Split ``s`` into overlapping 3-character shingles.

    The text is lower-cased and stripped of every non-word character first.
    Inputs shorter than three characters after cleaning yield a single
    (possibly empty) shingle.
    """
    shingle_len = 3
    cleaned = re.sub(r'[^\w]+', '', s.lower())
    shingle_count = max(len(cleaned) - shingle_len + 1, 1)
    return [
        cleaned[start:start + shingle_len]
        for start in range(shingle_count)
    ]


# In[30]:

# Store each row's simhash value in a "content_hash" column. Item assignment
# is required here: assigning to a not-yet-existing name via attribute access
# (extracted.content_hash = ...) sets a plain Python attribute on the
# DataFrame instead of creating a column.
extracted["content_hash"] = extracted["norm_text"].apply(
    lambda X: simhash.Simhash(get_features(X)).value)
extracted.head()
Esempio n. 16
0
# Token pattern: a run of non-whitespace characters that starts and ends on a
# word boundary.
_WRE = re.compile(r'\b\S+\b')

# Every token found in TEXT (TEXT is defined outside this excerpt).
words = _WRE.findall(TEXT)

# Benchmark parameters.
MIN_SAMPLE_WORDS = 10  # smallest number of words drawn per sample
MAX_SAMPLE_WORDS = 25  # largest number of words drawn per sample
NUM_SAMPLES = 1000  # how many random samples are generated
TOLERANCE = 6  # <--- This one strongly affects performance!
SEARCHES = 100  # not used in the visible code

# Each sample is a random selection of distinct positions in `words`, with a
# random size between MIN_SAMPLE_WORDS and MAX_SAMPLE_WORDS inclusive.
samples = [
    random.sample(words, random.randint(MIN_SAMPLE_WORDS, MAX_SAMPLE_WORDS))
    for _ in range(NUM_SAMPLES)
]
# One Simhash per sample, in the same order as `samples`.
simhashes = [simhash.Simhash(s) for s in samples]

# Index of all samples, keyed by the space-joined sample text, built with
# k=TOLERANCE.
shi = simhash.SimhashIndex([(' '.join(s), sh)
                            for s, sh in zip(samples, simhashes)],
                           k=TOLERANCE)
# Only let ERROR and above through the index's logger.
shi.log.setLevel(logging.ERROR)


def test_dummy():
    """Brute-force lookup: scan all samples for neighbours of a random hash.

    Picks one simhash at random and returns every sample whose simhash is
    within ``TOLERANCE`` of it, in sample order.
    """
    probe = random.choice(simhashes)
    return [
        sample
        for idx, sample in enumerate(samples)
        if probe.distance(simhashes[idx]) <= TOLERANCE
    ]
Esempio n. 17
0
def ham_dist(text1, text2):
    """Return the distance between the ``f=8`` simhashes of two texts."""
    first = simhash.Simhash(text1, f=8)
    second = simhash.Simhash(text2, f=8)
    return first.distance(second)
Esempio n. 18
0
 def sim(str):
     """Return the ``Simhash`` built from ``simhash.get_features(str)``."""
     features = simhash.get_features(str)
     return simhash.Simhash(features)
Esempio n. 19
0
    def createRegisterChain(self, p, ea):
        """Record per-register mnemonic chains for the function at ``ea``.

        For every basic block of the function, each instruction head is
        inspected: operands 0 and 1 whose operand type is 1 (presumably
        IDA's register operand type -- confirm) and whose text is non-empty
        get the instruction mnemonic appended to their chain, both
        function-wide (``self.functionRegisterChains[function name]``) and
        for the current block.

        For every per-block chain a fingerprint string
        ``"<register> <mnemonic> <mnemonic> ..."`` is built. Its simhash
        value is appended to ``self.simhashList`` together with the chain
        length. Fingerprints with at least six space-separated tokens also
        get a minhash signature, which is appended to
        ``self.registerChainMinhash`` and stored, with the block metadata,
        in the ``BinAuthor.Choice18`` MongoDB collection. Shorter
        fingerprints are appended to ``self.registerChainMinhash`` without a
        signature and are not stored.

        :param p: when falsy, only the function's entry in
            ``self.functionRegisterChains`` is ensured; no block is processed.
        :param ea: an address inside the function to analyse.
        """
        f = idaapi.FlowChart(idaapi.get_func(ea))

        functionName = idaapi.get_func_name(ea)
        client = MongoClient('localhost', 27017)
        try:
            collection = client.BinAuthor.Choice18

            functionChains = self.functionRegisterChains.setdefault(
                functionName, {})
            if not p:
                return

            for block in f:
                registerChain = {}
                for address in Heads(block.startEA, block.endEA):
                    # Both operand slots are handled identically.
                    for operandIndex in (0, 1):
                        if idc.GetOpType(address, operandIndex) != 1:
                            continue
                        operand = idc.GetOpnd(address, operandIndex)
                        if operand == "":
                            continue
                        mnemonic = idc.GetMnem(address)
                        functionChains.setdefault(operand,
                                                  []).append(mnemonic)
                        registerChain.setdefault(operand,
                                                 []).append(mnemonic)

                for register, instructions in registerChain.items():
                    fingerPrint = " ".join(
                        [str(register)] +
                        [str(instruction) for instruction in instructions])

                    # Hash once and reuse the value for the list and the
                    # stored document.
                    simhashValue = simhash.Simhash(fingerPrint).value
                    self.simhashList.append([len(instructions), simhashValue])

                    if len(fingerPrint.split(" ")) >= 6:
                        # Computed once so the in-memory list and the stored
                        # document carry the same signature.
                        signature = minhash.minHash(
                            minhash.createShingles(fingerPrint))
                        self.registerChainMinhash.append(
                            [fingerPrint, signature])
                        # NOTE(review): Collection.insert is the legacy
                        # PyMongo API; confirm the installed version still
                        # provides it.
                        collection.insert({
                            "FunctionName": functionName,
                            "FileName": self.fileName,
                            "FileMD5": self.fileMD5,
                            "Author Name": self.authorName,
                            "BlockStartEA": block.startEA,
                            "register": register,
                            "registerChain": instructions,
                            "SimHashSignature": str(simhashValue),
                            "MinHashSignature": signature,
                        })
                    else:
                        self.registerChainMinhash.append([fingerPrint])
        finally:
            # The client is created per call, so release its connections.
            client.close()
Esempio n. 20
0
def main():
    """Build a Simhash of ``TEXT`` and discard it.

    ``TEXT`` is decoded as UTF-8 and passed with ``reg=RE_WORD`` and a hash
    function that takes the first element of ``mmh3.hash64``.
    """
    def first_hash64(token):
        return mmh3.hash64(token)[0]

    decoded = unicode(TEXT, 'utf-8')
    simhash.Simhash(decoded, reg=RE_WORD, hashfunc=first_hash64)
Esempio n. 21
0
def hash_ad_creative_text(text):
    """Return the simhash value of the features ``_get_features`` extracts from ``text``."""
    features = _get_features(text)
    fingerprint = simhash.Simhash(features)
    return fingerprint.value
Esempio n. 22
0
    seed = 5381
    for i in s:
        seed = ((seed << 5) + seed) + ord(i)
    return ctypes.c_long(seed).value


def convert_n_bytes(n, b):
    """Wrap ``n`` into the signed two's-complement range of ``b`` bytes.

    The result lies in ``[-2**(8*b - 1), 2**(8*b - 1) - 1]``.
    """
    width = b * 8
    half_range = 2 ** (width - 1)
    return (n + half_range) % (2 ** width) - half_range


def convert_4_bytes(n):
    """Wrap ``n`` into the signed 4-byte range via ``convert_n_bytes``."""
    byte_width = 4
    return convert_n_bytes(n, byte_width)


def get_hashcode(s):
    """Return the polynomial base-31 hash of ``s`` as a signed 4-byte integer.

    Computes ``sum(ord(s[i]) * 31**(len(s) - 1 - i))`` wrapped by
    ``convert_4_bytes``; the empty string hashes to 0.

    :param s: the string to hash.
    :return: the wrapped hash value.
    """
    h = 0
    for c in s:
        # Horner's rule gives the same polynomial without a big-integer power
        # per character. Masking to 32 bits keeps the intermediate small and
        # does not change the final wrapped result.
        h = (31 * h + ord(c)) & 0xFFFFFFFF
    return convert_4_bytes(h)


# Show the simhash distance and values of s1 and s2, followed by their
# djb_hash and get_hashcode results.
simhash_s1 = sh.Simhash(s1)
simhash_s2 = sh.Simhash(s2)
print(simhash_s1.distance(simhash_s2))
for fingerprint in (simhash_s1, simhash_s2):
    print(fingerprint.value)
for hash_fn in (djb_hash, get_hashcode):
    for text in (s1, s2):
        print(hash_fn(text))
Esempio n. 23
0
 def _load_sample_with_features(self, fpath):
     """Fingerprint the sample at ``fpath`` and add it to ``self.black_list``.

     The entry added is the tuple ``(fpath, Simhash of the file's features)``.
     """
     with open(fpath, 'rb') as sample_file:
         features = self._load_features_from_file(sample_file)
         fingerprint = simhash.Simhash(features)
         self.black_list.add((fpath, fingerprint))
Esempio n. 24
0
 def add_simhash(self, url, page):
     """Add the simhash of ``page`` to ``self.simhashIndex`` under ``url``.

     Both the hashing and the insertion happen while ``self.lock`` is held.
     """
     with self.lock:
         self.simhashIndex.add(url, simhash.Simhash(page))
Esempio n. 25
0
    return False


if __name__ == "__main__":
    acts = list(
        collection_news.find({
            "type": 60002,
            "title": "微链投递“直通车”,助力高效融资!(9月份超值福利)"
        }))
    aa = 0
    for act in acts:
        if act.has_key("simhashValue") is False or act["simhashValue"] is None:
            contents = get_contents(act["_id"])
            #logger.info(contents)
            if contents is not None:
                a = simhash.Simhash(simhash.get_features(contents))
                logger.info("*****%s, value: %s", act["title"], a.value)
                v = a.value
                collection_news.update_one(
                    {"_id": act["_id"]},
                    {"$set": {
                        "simhashValue": str(a.value)
                    }})
            else:
                logger.info("No content for title: %s", act["title"])
                continue
        else:
            #continue
            v = long(act["simhashValue"])
            acts1 = list(
                collection_news.find({