def merge_pages(self):
    # merge an unbiased and a biased page in a subtle manner
    final_pages = []
    # generate the positions of the two insertions based on the query
    first = simhash.hash(self.query, len(self.true_pages))
    second = simhash.hash(self.query * 2, len(self.true_pages))
    # make sure we don't replace the first result
    if first == 0:
        first = (first + 1) % len(self.true_pages)
    if second == 0:
        second = (second + 1) % len(self.true_pages)
    # make sure that our more relevant result is listed first
    if first == second:
        if second == len(self.true_pages) - 1 and second != 0:
            first -= 1
        elif first != 0:
            second += 1
    elif second < first:
        first, second = second, first
    # insert the pages
    for i in range(len(self.true_pages)):
        if i == first:
            final_pages.append(self.cat_pages[0])
        elif i == second and len(self.cat_pages) > 1:
            final_pages.append(self.cat_pages[1])
        final_pages.append(self.true_pages[i])
    return final_pages
def assertNotMatch(self, a, b, threshold=3):
    a_h = simhash.hash(a)
    b_h = simhash.hash(b)
    diff = self.diff_bits(a_h, b_h)
    self.assertGreater(diff, threshold,
        'Expected (%i) "%s" to NOT match (%i) "%s" (%i)' % (
            a_h, a[0:50], b_h, b[0:50], diff))
def get_simhash_similarity(str1, str2):
    hash1 = simhash.hash(str1)
    hash2 = simhash.hash(str2)
    # hash1 = simhash(str1)
    # hash2 = simhash(str2)
    # return hash1.similarity(hash2)
    return diff_bits(hash1, hash2)
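# Note: diff_bits() is referenced above (and as self.diff_bits in the test
# helper) but is not defined in these snippets. A minimal sketch of such a
# helper, under the assumption that both fingerprints are plain integers of
# the same width, would count differing bit positions via XOR and popcount:
def diff_bits(hash1, hash2):
    # XOR leaves a set bit at every position where the two fingerprints disagree
    x = hash1 ^ hash2
    count = 0
    while x:
        x &= x - 1  # clear the lowest set bit
        count += 1
    return count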
def __init__(self, file_id, text, use_zlib, bit_size):
    self.id = file_id
    self.tokens = text.split()
    token_counter = counter.Counter(self.tokens)
    self.exact_fingerprint = self._adler_32(text, use_zlib)
    self.near_fingerprint = simhash.hash(token_counter, bit_size)
    self.near_fingerprint_buckets = []
    self.plateau_fingerprint = None
    self.plateau_fingerprint_buckets = []
    plateau = finn.find_plateau(self.tokens)
    if plateau is not None:
        plateau_counter = counter.Counter(plateau)
        self.plateau_fingerprint = simhash.hash(plateau_counter, bit_size)
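# Note: _adler_32() and finn.find_plateau() are defined elsewhere in the
# original module and are not shown here. A rough sketch of what an _adler_32
# helper might look like, assuming (this is an assumption, not the original
# code) that use_zlib only toggles between zlib's built-in checksum and a
# pure-Python Adler-32 fallback:
import zlib

def _adler_32(self, text, use_zlib):
    data = text if isinstance(text, bytes) else text.encode('utf-8')
    if use_zlib:
        return zlib.adler32(data) & 0xffffffff
    # pure-Python Adler-32 (modulo 65521, per RFC 1950)
    a, b = 1, 0
    for byte in bytearray(data):
        a = (a + byte) % 65521
        b = (b + a) % 65521
    return (b << 16) | a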
def parse_page(self, response):
    hxs = HtmlXPathSelector(response)
    body = ''.join(hxs.select('//p[@class="paragraph"]//text()').extract()).strip()
    item = GenericItem()
    item['body'] = body
    item['url'] = response.url
    item['simhash'] = str(simhash.hash(body))
    return item
def parse_page(self, response):
    hxs = HtmlXPathSelector(response)
    body = ''.join(hxs.select('//span[@itemprop="articleBody"]//text()').extract()).strip()
    item = GenericItem()
    item['body'] = body
    item['url'] = response.url
    item['simhash'] = str(simhash.hash(body))
    return item
def parse_page(self, response):
    hxs = HtmlXPathSelector(response)
    body = ''
    body += ''.join(hxs.select('//h1[@itemprop="headline"]//text()').extract()).strip() + '\n'
    body += ''.join(hxs.select('//p[@itemprop="description"]/text()').extract()).strip() + '\n'
    body += ''.join(hxs.select('//div[@itemprop="articleBody"]/p/text()').extract()).strip()
    item = GenericItem()
    item['body'] = body
    item['url'] = response.url
    item['simhash'] = str(simhash.hash(body))
    return item
def parse_page(self, response):
    hxs = HtmlXPathSelector(response)
    body = ''
    body += ''.join(hxs.select('//h2[@class="article-title"]//text()').extract()).strip() + '\n'
    body += ''.join(hxs.select('//p[@class="article-intro"]//text()').extract()).strip() + '\n'
    text = hxs.select('//div[@class="article-section clearfix"]/p//text()').extract()
    body += '\n'.join([x for x in [l.strip() for l in text] if not x.startswith(('<!--', '//'))])
    item = GenericItem()
    item['body'] = body
    item['url'] = response.url
    item['simhash'] = str(simhash.hash(body))
    return item
import sys
import sqlite3
import simhash

# first arg is path to db
db = sys.argv[1]
conn = sqlite3.connect(db)
c = conn.cursor()

for chunk in xrange(30):
    c.execute("SELECT url, body FROM data WHERE simhash IS NULL LIMIT 7000")
    rows = c.fetchall()
    for row in rows:
        url = row[0]
        body = row[1]
        c.execute('UPDATE data SET simhash=? WHERE url=?',
                  (str(simhash.hash(body)), url))
    conn.commit()

conn.close()
def search(self, new_query, new_rpp):
    # Main search function
    # Start timer
    self.start = time.time()
    # Set variables
    self.query = new_query
    self.rpp = int(new_rpp)
    # Grab JSON from bing
    # Use hash to create bias
    self.bias = self.terms[simhash.hash(self.query, len(self.terms))]
    self.true_json = self.get_dict2(self.query)
    self.cat_json = self.get_dict2(self.query + ' ' + self.bias)
    # Empty the stock faroo structure
    self.faroo_results['iurl'] = ''
    self.faroo_results['author'] = ''
    self.faroo_results['votes'] = ''
    self.faroo_results['related'] = ''
    self.faroo_results['content'] = ''
    self.faroo_results['date'] = ''
    # Convert the bing results into the faroo structure
    self.true_json = self.convert_to_faroo(self.true_json['results'])
    self.cat_json = self.convert_to_faroo(self.cat_json['results'])
    # Check for zero results
    if len(self.true_json['results']) == 0:
        # convert back to JSON and print/return
        return json.dumps(self.true_json)
    if len(self.cat_json['results']) == 0:
        # convert true_json back to JSON and print/return
        return json.dumps(self.true_json)
    # Populate classes
    # Crawl each page as we create
    temp = self.query.split(' ') + self.terms
    self.true_pages = self.create_pages(self.true_json, False, temp)
    #self.cat_pages = self.create_pages(self.cat_json, True, temp)
    self.cat_pages = self.create_pages(self.cat_json, True, self.query.split(' '))
    # Rank all pages
    for pg in self.true_pages:
        pg.calc_value(self.query + ' ' + self.bias)
        #pg.calc_value(self.query + ' ' + ' '.join(self.terms))
    for pg in self.cat_pages:
        pg.calc_value(self.query)
        #pg.calc_value(self.query + ' ' + ' '.join(self.terms))
    # Sort the pages by rank
    self.true_pages.sort()
    self.cat_pages.sort()
    # Create final results list
    self.final_pages = self.merge_pages()
    # Convert back to correct obj
    self.final_results_json = []
    i = 0
    while i < self.rpp:
        if i == len(self.final_pages):
            break
        self.final_results_json.append(self.final_pages[i].get_json())
        i += 1
    # Update the final dictionary
    self.final_dict = self.true_json
    self.final_dict['count'] = i
    self.final_dict['results'] = self.final_results_json
    self.final_dict['time'] = self.true_json['time'] + self.cat_json['time'] + (time.time() - self.start)
    # Convert to JSON and print
    return json.dumps(self.final_dict)
import sys
import simhash

with open('../data.csv', 'rb') as f:
    companies = [c.strip() for c in f]

hashes = [simhash.hash(x) for x in companies]
d = {h: c for h, c in zip(hashes, companies)}

corpus = simhash.Corpus(32, 16)
corpus.insert_bulk(hashes)
matches = corpus.find_all_bulk(hashes)
results = filter(lambda e: len(set(e)) > 1, matches)

print >> sys.stderr, 'There are %d candidate duplicates' % len(results)

seen = set()
for l in results:
    s = set(l)
    for m in s:
        if m not in seen:
            print d[m]
            seen.add(m)
    print '----'