def getTextualHash(self, driver):
    """Return the Nilsimsa hex digest of the HTML source currently loaded in *driver*."""
    hasher = Nilsimsa()
    # feed the rendered page source straight into the hash object
    hasher.update(driver.page_source)
    return hasher.hexdigest()
def test_nilsimsa():
    """
    Tests the nilsimsa hash by choosing a random test file,
    computing the nilsimsa digest and comparing to the true value
    stored in the pickled sid_to_nil dictionary.
    """
    # dircache was deprecated in Python 2 and removed in Python 3;
    # os.listdir gives the same names (order is irrelevant for random.choice).
    fname = random.choice(os.listdir(test_data_dir))
    # 'with' guarantees the handle is closed even if Nilsimsa raises
    with open(os.path.join(test_data_dir, fname), "rb") as f:
        nil = Nilsimsa(f.read())
    # the sid (file name without extension) keys the expected digest
    assert nil.hexdigest() == sid_to_nil[fname.split(".")[0]]
def getNilsimsaHash(self, url, call_phantom=True):
    """
    Compute the Nilsimsa hash of the page at *url* from the phantom
    output file '<id>-output.txt'.

    Returns -1 if the page failed to load (no output file); otherwise
    returns the hex digest, caching it on self.nilsimsa_hash.
    The output file is always removed, even on error.
    """
    if call_phantom:
        self.setUpGetter(url)
    out_path = "{}-output.txt".format(self.id)
    # if no output file exists, then the page failed to load
    if not os.path.isfile(out_path):
        return -1
    # BUG FIX: if hashing raised on the first call, self.nilsimsa_hash was
    # never set and the final return raised AttributeError. Default to the
    # same -1 sentinel used for a missing file, preserving any prior value.
    self.nilsimsa_hash = getattr(self, "nilsimsa_hash", -1)
    try:
        with open(out_path, "rb") as f:
            self.nilsimsa_hash = Nilsimsa(f.read()).hexdigest()
    except Exception as e:
        # best-effort: report and fall through to cleanup
        print(e)
    finally:
        # always remove the old file even if an exception is thrown
        os.remove(out_path)
    return self.nilsimsa_hash
def calc_nilsimsa(self, gold_surface_form, comp_surface_form):
    """Return the Nilsimsa digest-comparison score between the two surface forms."""
    gold_digest = Nilsimsa(gold_surface_form).hexdigest()
    comp_digest = Nilsimsa(comp_surface_form).hexdigest()
    return compare_digests(gold_digest, comp_digest)
def get_ads(base_url):
    """
    Scrape the 'cpg' search listing under *base_url*, and for every ad not
    already in the 'ad' table: fetch its page, store its text in the cache,
    insert the ad row (with its Nilsimsa hash) and per-word counts into the
    'tf' table, committing after each ad.

    Relies on module-level conn, download, cache, generate_word_counts,
    and stopwords.
    """
    c = conn.cursor()
    page = download.get(base_url + "/search/cpg")
    for p in page.select(".row"):
        pid = p['data-pid']
        a_tag = p.find('a', class_='hdrlnk')
        ad_href = a_tag['href']
        ad_title = a_tag.text
        # listing timestamp -> unix epoch seconds
        dt = p.find('time')['datetime']
        dt = datetime.datetime.strptime(dt, "%Y-%m-%d %H:%M")
        dt = int(dt.strftime("%s"))
        c.execute("SELECT * FROM ad WHERE id = ?", (pid, ))
        row = c.fetchone()
        if row is None:
            # relative hrefs need the site prefix
            url = ad_href
            if not ad_href.startswith('http'):
                url = base_url + ad_href
            time.sleep(0.5)  # throttle so we do not hammer the server
            ad = download.get(url)
            print(url)
            ad_text = ad.find(id='postingbody')
            if ad_text is None:
                if ad.find(id='has_been_removed'):
                    continue
                else:
                    # BUG FIX: 'raise "malformed body"' raised a string, which
                    # is a TypeError at runtime; raise a real exception.
                    raise ValueError("malformed body")
            ad_text = ad_text.text.strip()
            # strip non-printable characters; join-comprehension returns a str
            # on both Python 2 and 3 (filter() on py3 returned an iterator)
            ad_text = "".join(ch for ch in ad_text if ch in string.printable)
            nilsimsa = Nilsimsa(ad_text)
            lshash = nilsimsa.hexdigest()
            seen = generate_word_counts(ad_text)
            cache.write("text:" + pid, ad_text)
            row = (pid, url, ad_title, dt, lshash)
            c.execute(
                "INSERT INTO ad (id, url, title, posted, lshash) " +
                " VALUES (?,?,?,?,?)", row)
            # term-frequency rows, skipping stopwords
            for word in seen:
                if word not in stopwords:
                    row = (pid, word, seen[word])
                    c.execute(
                        "INSERT INTO tf (id, word, cnt) " +
                        "VALUES (?,?,?)", row)
            conn.commit()
def compute_hash(self, text):
    """Return the Nilsimsa hex digest of *text*, coerced to str."""
    from nilsimsa import Nilsimsa
    digest = Nilsimsa(data=text).hexdigest()
    return str(digest)
def test_unicode():
    """
    Ensures that feeding unicode to Nilsimsa behaves gracefully.
    """
    # BUG FIX: u'\u1F631' parsed as u'\u1F63' followed by the literal '1',
    # because \u consumes exactly 4 hex digits; the intended code point
    # U+1F631 (face screaming in fear) requires the 8-digit \U escape.
    nil = Nilsimsa(u'\U0001F631')
    assert nil.hexdigest()
import getCodeFragment import os try: import cPickle as pickle except ImportError: import pickle test_data_dir = os.path.join(os.path.dirname(__file__), "nilsimsa\\test_data\\") test_data = "test_dict.p" test_dict = os.path.join(test_data_dir, test_data) sid_to_nil = pickle.load(open(test_dict, "rb")) # print sid_to_nil nil = Nilsimsa('0' * 64) s1 = nil.hexdigest() nil = Nilsimsa('0' * 63 + '1') s2 = nil.hexdigest() print s1, s2 print compare_digests(s1, s2) # for i in range(1,30): # cloneGroup = getCodeFragment.getCloneClass('1.2.txt', i) # s1 = Nilsimsa(cloneGroup[0]).hexdigest() # s2 = Nilsimsa(cloneGroup[1]).hexdigest() # #print s1,s2 # print compare_digests(s1,s2) # if compare_digests(s1,s2) <0: # getCodeFragment.printCloneClass('1.2.txt', i) # for i in range(1,50):