Example 1
	def getTextualHash(self, driver):
		"""
		returns a Nilsimsa hex digest of the page source loaded in the given driver
		"""
		#get the html source
		html_source = driver.page_source
		#create and update our nilsimsa object with the source
		nilsimsaObj = Nilsimsa()
		nilsimsaObj.update(html_source)
		return nilsimsaObj.hexdigest()
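Digests produced this way can be compared directly. A minimal sketch (assuming the nilsimsa package exports compare_digests, as the later examples here use, and accepts str input as the unicode test below suggests): identical digests score the maximum of 128, and unrelated inputs score near 0.

from nilsimsa import Nilsimsa, compare_digests

# two near-identical page sources should yield a score close to 128
digest_a = Nilsimsa("<html><body>hello world</body></html>").hexdigest()
digest_b = Nilsimsa("<html><body>hello world!</body></html>").hexdigest()
print(compare_digests(digest_a, digest_b))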
Example 2
def test_nilsimsa():
    """
    tests the nilsimsa hash by choosing a random test file,
    computing the nilsimsa digest, and comparing it to the true
    value stored in the pickled sid_to_nil dictionary
    """
    # dircache was removed in Python 3; os.listdir is the replacement
    fname = random.choice(os.listdir(test_data_dir))
    with open(os.path.join(test_data_dir, fname), "rb") as f:
        nil = Nilsimsa(f.read())
    assert nil.hexdigest() == sid_to_nil[fname.split(".")[0]]
Example 3
def ToHash(h1, h2, classes):
  # compute tlsh, sdhash, nilsimsa, and ssdeep digests for the raw input,
  # falling back to the string 'None' when an algorithm fails or the input
  # is below its minimum size
  try:
    th = tlsh.hash(h1)
  except Exception:
    th = 'None'

  try:
    sh = fuzzyhashlib.sdhash(h1).hexdigest().rstrip()
  except Exception:
    sh = 'None'

  try:
    nil = Nilsimsa(h1).hexdigest()
  except Exception:
    nil = 'None'

  try:
    ss = fuzzyhashlib.ssdeep(h1).hexdigest()
  except Exception:
    ss = 'None'

  ch = []
  if classes is not None:
    for c in classes:
      name = c[0]
      content = c[1]
      try:
        cnil = Nilsimsa(content).hexdigest()
      except Exception:
        cnil = 'None'

      try:
        css = fuzzyhashlib.ssdeep(content).hexdigest()
      except Exception:
        css = 'None'

      try:
        csh = 'None'
        if len(content) >= 512:  # sdhash requires at least 512 bytes of input
          csh = fuzzyhashlib.sdhash(content).hexdigest().rstrip()
      except Exception:
        csh = 'None'

      try:
        cth = 'None'
        if len(content) >= 256:  # tlsh also needs a minimum input size
          cth = tlsh.hash(content)
      except Exception:
        cth = 'None'
      ch.append((name, cth, csh, cnil, css))
  return th, sh, nil, ss, ch
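The repeated try/except blocks above all follow one pattern: hash if the input is large enough, otherwise record 'None'. A hedged sketch of a generic helper (the name safe_hash is hypothetical; the 256- and 512-byte thresholds mirror the tlsh and sdhash guards in ToHash):

def safe_hash(hash_fn, data, min_len=0):
    # hypothetical helper: returns 'None' for undersized input or any failure
    if len(data) < min_len:
        return 'None'
    try:
        return hash_fn(data)
    except Exception:
        return 'None'

# e.g. cth = safe_hash(tlsh.hash, content, min_len=256)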
Example 4
 def __init__(self, string, parent, parent_filename, lines, type):
     self.string = string                    # string dump of TC/KW from parser
     self.parent = parent                    # TC/KW name
     self.parent_filename = parent_filename  # parent filename where TC/KW is located
     self.lines = lines                      # line numbers of TC/KW
     self.type = type                        # type - KW or TC
     self.digest = Nilsimsa(self.string)     # hash
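Note that self.digest holds the Nilsimsa object itself rather than a hex string, so two records can be scored against each other later. A hedged usage sketch (the record names are hypothetical):

from nilsimsa import compare_digests

# hypothetical: score two stored TC/KW records against each other
score = compare_digests(record_a.digest.hexdigest(), record_b.digest.hexdigest())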
Example 5
	def getNilsimsaHash(self, url, call_phantom=True):
		if call_phantom:
			self.setUpGetter(url)
		# if the output file does not exist, the page failed to load
		if not os.path.isfile("{}-output.txt".format(self.id)):
			return -1
		# create and update our nilsimsa object with the source
		try:
			with open("{}-output.txt".format(self.id), "rb") as f:
				nilsimsaObj = Nilsimsa(f.read())
			self.nilsimsa_hash = nilsimsaObj.hexdigest()
		except Exception as e:
			print(e)
		finally:
			# always remove the old file, even if an exception is thrown
			os.remove("{}-output.txt".format(self.id))
		return self.nilsimsa_hash
Example 6
def get_hash(text, hash_function="ssdeep"):
    """
    Generates hashed text using one of several available hashing functions.

    :param text: The string to hash
    :type text: str
    :param hash_function: The specific algorithm to use; options are ``'nilsimsa'``, ``'md5'``, and ``'ssdeep'`` \
    (default)
    :type hash_function: str
    :return: A hashed representation of the provided string
    :rtype: str

    .. note:: The string will be passed through :py:func:`pewtils.decode_text` and the returned value will be used \
    instead of the original value if it runs successfully, in order to ensure consistent hashing in both Python 2 and \
    3. By default the function uses the :py:mod:`ssdeep` algorithm, which generates context-sensitive hashes that are \
    useful for computing document similarities at scale.

    .. note:: Using `hash_function='ssdeep'` requires the :py:mod:`ssdeep` library, which is not installed by default \
    because it requires the installation of additional system libraries on certain operating systems. For help \
    installing ssdeep, refer to the pewtils documentation installation section, which provides OS-specific instructions.

    Usage::

        from pewtils import get_hash

        >>> text = 'test_string'
        >>> get_hash(text)
        '3:HI2:Hl'
    """

    decoded_text = decode_text(text).encode("utf8").strip()
    if not decoded_text:
        decoded_text = text
    text = decoded_text
    if hash_function == "nilsimsa":
        from nilsimsa import Nilsimsa

        hashed = Nilsimsa(text).hexdigest()
    elif hash_function == "md5":
        hashed = md5(text).hexdigest()
    else:
        try:
            import ssdeep
        except ImportError:
            raise Exception("""
                To use get_hash with hash_function='ssdeep' you need to install the ssdeep package. Try running: 
                    >> BUILD_LIB=1 pip install ssdeep
                If you encounter installation problems, refer to the pewtils documentation for troubleshooting help.
            """)
        hashed = ssdeep.hash(text)

    return hashed
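For the nilsimsa branch above, the call mirrors the ssdeep usage shown in the docstring; the return value is a 64-character hex digest (value omitted here):

get_hash("test_string", hash_function="nilsimsa")  # -> 64-char hex digest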
Example 7
def test_compatability():
    """
    tests compatibility with the deprecated version by comparing nilsimsa
    digests of 5 randomly selected documents from the test corpus
    and asserting that both implementations give the same hexdigest
    """
    names = os.listdir(test_data_dir)
    fnames = set(random.choice(names) for i in range(5))
    for fname in fnames:
        with open(os.path.join(test_data_dir, fname), "rb") as f:
            text = f.read()
        assert Nilsimsa(text).hexdigest() == orig_Nilsimsa(text).hexdigest()
Example 8
def test_nilsimsa_speed():
    """
    computes the nilsimsa hash for all test files and prints the speed
    """
    corpus = []
    for fname in listdir(test_data_dir):
        with open(os.path.join(test_data_dir, fname), "rb") as f:
            corpus.append(f.read())
    start = time.time()
    for text in corpus:
        Nilsimsa(text)
    elapsed = time.time() - start
    print("%d in %f --> %f per second" % (
        len(corpus), elapsed, len(corpus) / elapsed))
Example 9
def get_nilsimsa():
    dbconn = MySQLdb.connect('10.141.221.73', 'root', 'root', 'fdroid')
    dbcursor = dbconn.cursor()
    # sql = 'select block_id,block_code from fdroid.cc_block where detection_id=1 and detection_tp = "20150101--20150131"'
    sql = 'select detection_tp,block_id,block_code from fdroid.cc_block where detection_id=1'
    dbcursor.execute(sql)
    f = open('nilsimsa1.txt', 'w')
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    for i in dbcursor.fetchall():
        f.write(
            str(i[0]) + '   ' + str(i[1]) + '   ' +
            Nilsimsa(i[2]).hexdigest() + '\n')
        print(i[1])

    end = time.perf_counter()
    f.write(str(end - start))
    f.close()
    dbcursor.close()
    dbconn.close()
Example 10
def get_hash(text, hash_function="ssdeep"):

    """
    Generates hashed text using one of several available hashing functions.

    :param text: The string to hash
    :type text: str
    :param hash_function: The specific algorithm to use; options are ``'nilsimsa'``, ``'md5'``, and ``'ssdeep'`` \
    (default)
    :type hash_function: str
    :return: A hashed representation of the provided string
    :rtype: str

    .. note:: The string will be passed through :py:func:`pewtils.decode_text` and the returned value will be used \
    instead of the original value if it runs successfully, in order to ensure consistent hashing in both Python 2 and \
    3. By default the function uses the :py:mod:`ssdeep` algorithm, which generates context-sensitive hashes that are \
    useful for computing document similarities at scale.

    Usage::

        from pewtils import get_hash

        >>> text = 'test_string'
        >>> get_hash(text)
        '3:HI2:Hl'
    """

    decoded_text = decode_text(text).encode("utf8").strip()
    if not decoded_text:
        decoded_text = text
    text = decoded_text
    if hash_function == "nilsimsa":
        from nilsimsa import Nilsimsa

        hashed = Nilsimsa(text).hexdigest()
    elif hash_function == "md5":
        hashed = md5(text).hexdigest()
    else:
        import ssdeep

        hashed = ssdeep.hash(text)

    return hashed
Example 11
def nilsimsa_hash(text):
    # encode text to UTF-8 bytes so str input hashes consistently
    if isinstance(text, str):
        text = text.encode('utf8')
    return Nilsimsa(text).hexdigest()
Example 12
import extractToken
import getCodeFragment
import os
try:
    import cPickle as pickle
except ImportError:
    import pickle

from nilsimsa import Nilsimsa, compare_digests

test_data_dir = os.path.join(os.path.dirname(__file__),
                             "nilsimsa", "test_data")
test_data = "test_dict.p"
test_dict = os.path.join(test_data_dir, test_data)
with open(test_dict, "rb") as f:
    sid_to_nil = pickle.load(f)

nil = Nilsimsa('0' * 64)
s1 = nil.hexdigest()
nil = Nilsimsa('0' * 63 + '1')
s2 = nil.hexdigest()
print(s1, s2)
print(compare_digests(s1, s2))

# for i in range(1, 30):
#     cloneGroup = getCodeFragment.getCloneClass('1.2.txt', i)
#     s1 = Nilsimsa(cloneGroup[0]).hexdigest()
#     s2 = Nilsimsa(cloneGroup[1]).hexdigest()
#     print(compare_digests(s1, s2))
#     if compare_digests(s1, s2) < 0:
#         getCodeFragment.printCloneClass('1.2.txt', i)
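For context on the scores printed above: compare_digests scores two 256-bit digests by bit agreement, so identical digests score the maximum of 128 while digests of unrelated inputs land near 0. A quick sanity check:

# identical digests always score the maximum of 128
assert compare_digests(s1, s1) == 128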
Example 13
def get_ads(base_url):
    c = conn.cursor()

    page = download.get(base_url + "/search/cpg")

    for p in page.select(".row"):
        pid = p['data-pid']

        a_tag = p.find('a', class_='hdrlnk')
        ad_href = a_tag['href']
        ad_title = a_tag.text

        dt = p.find('time')['datetime']
        dt = datetime.datetime.strptime(dt, "%Y-%m-%d %H:%M")
        dt = int(dt.strftime("%s"))

        c.execute("SELECT * FROM ad WHERE id = ?", (pid, ))
        row = c.fetchone()

        if row is None:
            url = ad_href
            if not ad_href.startswith('http'):
                url = base_url + ad_href

            time.sleep(0.5)
            ad = download.get(url)

            print(url)
            ad_text = ad.find(id='postingbody')
            if ad_text is None:
                if ad.find(id='has_been_removed'):
                    continue
                else:
                    raise "malformed body"
            ad_text = ad_text.text.strip()

            ad_text = ''.join(c for c in ad_text if c in string.printable)
            nilsimsa = Nilsimsa(ad_text)
            lshash = nilsimsa.hexdigest()

            # c.execute("SELECT * FROM ad")
            # row = c.fetchone()
            # while row:
            #     diff = nilsimsa.compare(row[4], True)
            #     if diff < 10:
            #         print diff
            #         print cache.get("text:" + row[0])
            #         print "----"
            #         print ad_text
            #         sys.exit()

            seen = generate_word_counts(ad_text)

            cache.write("text:" + pid, ad_text)

            row = (pid, url, ad_title, dt, lshash)
            c.execute(
                "INSERT INTO ad (id, url, title, posted, lshash) " +
                " VALUES (?,?,?,?,?)", row)

            for word in seen:
                if word not in stopwords:
                    row = (pid, word, seen[word])
                    c.execute(
                        "INSERT INTO tf (id, word, cnt) " + "VALUES (?,?,?)",
                        row)
            conn.commit()
Example 14
            },
        }

    if data['last-modified']:
        try:
            last_modified = int(
                datetime.datetime(
                    *eut.parsedate(data['last-modified'])[:6]).strftime('%s'))
        except Exception as exc:
            logger.info('failed to parse last-modified=%r',
                        data['last-modified'])
            last_modified = 0
    else:
        last_modified = 0
    doc_id = md5(data['content-location']).hexdigest()
    content_hash = Nilsimsa(data['body']).hexdigest()
    file_id = (doc_id, last_modified, content_hash)
    file_id_str = '%s-%d-%s' % file_id

    kvlclient.setup_namespace(highlights_kvlayer_tables)
    if data['store'] is False:
        kvlclient.delete('files', (file_id[0], ))
        kvlclient.delete('highlights', (file_id[0], ))
        logger.info('cleared all store records related to doc_id=%r',
                    file_id[0])
    else:  # storing is allowed
        payload_strs = list(kvlclient.get('highlights', file_id))
        if payload_strs and payload_strs[0][1]:
            payload_str = payload_strs[0][1]
            try:
                payload = json.loads(payload_str)
Example 15
 def calc_nilsimsa(self, gold_surface_form, comp_surface_form):
     nil_0 = Nilsimsa(gold_surface_form)
     nil_1 = Nilsimsa(comp_surface_form)
     nil = compare_digests(nil_0.hexdigest(), nil_1.hexdigest())
     return nil
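A hedged usage sketch for the method above (the instance and argument names are hypothetical): a score of 128 means the two surface forms hash identically, and small edits stay close to that maximum.

# hypothetical usage: near-identical surface forms score close to 128
score = matcher.calc_nilsimsa("Barack Obama", "Barak Obama")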
Example 16
 def compute_hash(self, text):
     from nilsimsa import Nilsimsa
     # hexdigest() already returns a str, so no extra conversion is needed
     return Nilsimsa(data=text).hexdigest()
Example 17
def test_unicode():
    """
    ensures that feeding unicode to Nilsimsa behaves gracefully
    """
    # \u takes exactly four hex digits, so u'\u1F631' would parse as two
    # characters; \U0001F631 is the intended single astral code point
    nil = Nilsimsa(u'\U0001F631')
    assert nil.hexdigest()
Example 18
def get_nilsimsa(string):
    return Nilsimsa(string).hexdigest()
Example 19
hashes = [
    [],  # nilsimsa
    [],  # tlsh
    []  # ssdeep
]

compare_results = [
    [],  # nilsimsa
    [],  # tlsh
    []  # ssdeep
]

for filename in os.listdir('.'):
    if filename.startswith("prog"):
        with open(filename, "rb") as file:
            file_data = file.read()

            # nilsimsa
            hashes[0].append(Nilsimsa(file_data))

            # tlsh
            hashes[1].append((filename, tlsh.hash(file_data)))

            # ssdeep
            output = subprocess.Popen(["ssdeep.exe", filename],
                                      stdout=subprocess.PIPE).communicate()[0]
            hashes[2].append((filename, output))
            #hashes[2].append((filename, str(output.splitlines()[2]).split("\'", 1)[1].split(",", 1)[0]))

print("")
print("nilsimsa (different 0 - 128 similar)")
for e in hashes[0]:
    print(str(e.hexdigest()))
    compare_results[0].append(hashes[0][0].compare(e.hexdigest(), True))
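The excerpt above only fills in the nilsimsa comparisons. A hedged sketch of the analogous tlsh pass (assuming py-tlsh's tlsh.diff, where lower scores mean more similar and 0 means identical):

print("tlsh (0 = identical, higher = more different)")
for name, digest in hashes[1]:
    print(name, digest)
    # compare every file's tlsh digest against the first file's digest
    compare_results[1].append(tlsh.diff(hashes[1][0][1], digest))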