Example 1
 def merge_pages(self):
     # merge the unbiased and biased pages in a subtle manner
     final_pages = []
     # generate the positions of two insertions based on the query
     first = simhash.hash(self.query, len(self.true_pages))
     second = simhash.hash(self.query*2, len(self.true_pages))
 
     # make sure we don't replace the first result
     if first == 0:
         first = (first + 1) % len(self.true_pages)
     if second == 0:
         second = (second + 1) % len(self.true_pages)
 
     # make sure that our more relevant result is listed first
     if first == second:
         if second == len(self.true_pages)-1 and second != 0:
             first -= 1
         elif first != 0:
             second += 1
     elif second < first:
         first, second = second, first
 
     # insert the pages
     for i in range(len(self.true_pages)):
         if i == first:
             final_pages.append(self.cat_pages[0])
         elif i == second and len(self.cat_pages) > 1:
             final_pages.append(self.cat_pages[1])
         final_pages.append(self.true_pages[i])
     return final_pages
Example 2
 def assertNotMatch(self, a, b, threshold=3):
     a_h = simhash.hash(a)
     b_h = simhash.hash(b)
     diff = self.diff_bits(a_h, b_h)
     self.assertGreater(diff, threshold,
         'Expected (%i) "%s" to NOT match (%i) "%s" (%i)' % (
             a_h, a[0:50], b_h, b[0:50], diff))
Example 3
def get_simhash_similarity(str1, str2):
    hash1 = simhash.hash(str1)
    hash2 = simhash.hash(str2)
    # hash1 = simhash(str1)
    # hash2 = simhash(str2)
    # return hash1.similarity(hash2)
    return diff_bits(hash1, hash2)
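Examples 2 and 3 both rely on a diff_bits helper (a method in Example 2, a module-level function here) that is not shown in these snippets. A minimal sketch, assuming the fingerprints are plain integers and that diff_bits is their Hamming distance, which is the standard way to compare simhash values:

def diff_bits(hash1, hash2):
    # Hamming distance between two simhash fingerprints: the number of
    # bit positions in which the two integers differ.
    return bin(hash1 ^ hash2).count('1')

With 64-bit fingerprints, a small distance indicates near-duplicate inputs; the test in Example 2 treats anything above its threshold (3 by default) as not matching.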
Example 4
    def __init__(self, file_id, text, use_zlib, bit_size):
        self.id = file_id

        self.tokens = text.split()
        token_counter = counter.Counter(self.tokens)

        self.exact_fingerprint = self._adler_32(text, use_zlib)

        self.near_fingerprint = simhash.hash(token_counter, bit_size)
        self.near_fingerprint_buckets = []

        self.plateau_fingerprint = None
        self.plateau_fingerprint_buckets = []
        plateau = finn.find_plateau(self.tokens)
        if plateau is not None:
            plateau_counter = counter.Counter(plateau)
            self.plateau_fingerprint = simhash.hash(plateau_counter, bit_size)
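This constructor calls a _adler_32 method that is not included in the snippet. A minimal sketch of the checksum it presumably computes, assuming Python 3 and that use_zlib toggles between zlib.adler32 and a pure-Python fallback (the flag's meaning and the implementation are assumptions):

import zlib

def adler_32(text, use_zlib=True):
    # Adler-32 checksum of the text, used here as an exact-duplicate fingerprint.
    data = text.encode('utf-8')  # assumes Python 3 str input
    if use_zlib:
        return zlib.adler32(data)
    # Pure-Python fallback: the standard Adler-32 recurrence, modulo 65521.
    a, b = 1, 0
    for byte in data:  # iterating over bytes yields ints in Python 3
        a = (a + byte) % 65521
        b = (b + a) % 65521
    return (b << 16) | a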
Example 5
  def __init__(self, file_id, text, use_zlib, bit_size):
    self.id = file_id

    self.tokens = text.split()
    token_counter = counter.Counter(self.tokens)

    self.exact_fingerprint = self._adler_32(text, use_zlib)

    self.near_fingerprint = simhash.hash(token_counter, bit_size)
    self.near_fingerprint_buckets = []

    self.plateau_fingerprint = None
    self.plateau_fingerprint_buckets = []
    plateau = finn.find_plateau(self.tokens)
    if plateau is not None:
      plateau_counter = counter.Counter(plateau)
      self.plateau_fingerprint = simhash.hash(plateau_counter, bit_size)
Example 6
    def parse_page(self, response):
        hxs = HtmlXPathSelector(response)
        body = ''.join(hxs.select('//p[@class="paragraph"]//text()').extract()).strip()

        item = GenericItem()
        item['body'] = body
        item['url'] = response.url
        item['simhash'] = str(simhash.hash(body))
        return item
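This and the following parse_page callbacks populate a GenericItem whose definition is not part of these snippets. A minimal sketch, assuming it is an ordinary Scrapy item with exactly the three fields used here (the class body is an assumption):

from scrapy.item import Item, Field

class GenericItem(Item):
    # fields filled in by the parse_page callbacks
    body = Field()
    url = Field()
    simhash = Field()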
Example 7
    def parse_page(self, response):
        hxs = HtmlXPathSelector(response)
        body = ''.join(hxs.select('//span[@itemprop="articleBody"]//text()').extract()).strip()

        item = GenericItem()
        item['body'] = body
        item['url'] = response.url
        item['simhash'] = str(simhash.hash(body))
        return item
Example 8
    def parse_page(self, response):
        hxs = HtmlXPathSelector(response)
        body = ''.join(
            hxs.select(
                '//span[@itemprop="articleBody"]//text()').extract()).strip()

        item = GenericItem()
        item['body'] = body
        item['url'] = response.url
        item['simhash'] = str(simhash.hash(body))
        return item
Example 9
    def parse_page(self, response):
        hxs = HtmlXPathSelector(response)
        body = ''
        body += ''.join(hxs.select('//h1[@itemprop="headline"]//text()').extract()).strip() + '\n'
        body += ''.join(hxs.select('//p[@itemprop="description"]/text()').extract()).strip() + '\n'
        body += ''.join(hxs.select('//div[@itemprop="articleBody"]/p/text()').extract()).strip()

        item = GenericItem()
        item['body'] = body
        item['url'] = response.url
        item['simhash'] = str(simhash.hash(body))
        return item
Example 10
    def parse_page(self, response):
        hxs = HtmlXPathSelector(response)
        body = ''
        body += ''.join(hxs.select('//h2[@class="article-title"]//text()').extract()).strip() + '\n'
        body += ''.join(hxs.select('//p[@class="article-intro"]//text()').extract()).strip() + '\n'
        text = hxs.select('//div[@class="article-section clearfix"]/p//text()').extract()
        body += '\n'.join([x for x in [l.strip() for l in text] if not x.startswith(('<!--', '//'))])

        item = GenericItem()
        item['body'] = body
        item['url'] = response.url
        item['simhash'] = str(simhash.hash(body))
        return item
Example 11
    def parse_page(self, response):
        hxs = HtmlXPathSelector(response)
        body = ''
        body += ''.join(
            hxs.select(
                '//h1[@itemprop="headline"]//text()').extract()).strip() + '\n'
        body += ''.join(
            hxs.select('//p[@itemprop="description"]/text()').extract()).strip(
            ) + '\n'
        body += ''.join(
            hxs.select(
                '//div[@itemprop="articleBody"]/p/text()').extract()).strip()

        item = GenericItem()
        item['body'] = body
        item['url'] = response.url
        item['simhash'] = str(simhash.hash(body))
        return item
Example 12
    def parse_page(self, response):
        hxs = HtmlXPathSelector(response)
        body = ''
        body += ''.join(
            hxs.select('//h2[@class="article-title"]//text()').extract()
        ).strip() + '\n'
        body += ''.join(
            hxs.select('//p[@class="article-intro"]//text()').extract()).strip(
            ) + '\n'
        text = hxs.select(
            '//div[@class="article-section clearfix"]/p//text()').extract()
        body += '\n'.join([
            x for x in [l.strip() for l in text]
            if not x.startswith(('<!--', '//'))
        ])

        item = GenericItem()
        item['body'] = body
        item['url'] = response.url
        item['simhash'] = str(simhash.hash(body))
        return item
Example 13
import sys
import sqlite3
import simhash

# first arg is path to db
db = sys.argv[1]

conn = sqlite3.connect(db)
c = conn.cursor()

# make 30 passes, each hashing up to 7,000 rows that do not yet have a simhash
for chunk in xrange(30):
    c.execute("SELECT url, body FROM data WHERE simhash IS NULL LIMIT 7000")
    rows = c.fetchall()
    for row in rows:
        url = row[0]
        body = row[1]
        c.execute('UPDATE data SET simhash=? WHERE url=?', (str(simhash.hash(body)), url))
        conn.commit()

conn.close()
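This script expects an existing SQLite table named data with url, body, and simhash columns. A minimal schema sketch consistent with the queries above (the file name and column types are assumptions):

import sqlite3

conn = sqlite3.connect('pages.db')  # hypothetical database file
conn.execute(
    'CREATE TABLE IF NOT EXISTS data ('
    'url TEXT PRIMARY KEY, '
    'body TEXT, '
    'simhash TEXT)')
conn.commit()
conn.close()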
Example 14
    def search(self, new_query, new_rpp):
        # Main search function
        # Start timer
        self.start = time.time()
        
        # Set variables
        self.query = new_query
        self.rpp = int(new_rpp)

        # Grab JSON from bing
        # Use hash to create bias
        self.bias = self.terms[simhash.hash(self.query, len(self.terms))]
        self.true_json = self.get_dict2(self.query)
        self.cat_json = self.get_dict2(self.query + ' ' + self.bias)
        
        # Empty the stock faroo structure
        self.faroo_results['iurl'] = ''
        self.faroo_results['author'] = ''
        self.faroo_results['votes'] = ''
        self.faroo_results['related'] = ''
        self.faroo_results['content'] = ''
        self.faroo_results['date'] = ''

        # Convert the bing results into the faroo structure
        self.true_json = self.convert_to_faroo(self.true_json['results'])
        self.cat_json = self.convert_to_faroo(self.cat_json['results'])

        # Check for zero results
        if len(self.true_json['results']) == 0:
            # convert back to JSON and print/return
            return json.dumps(self.true_json)
        if len(self.cat_json['results']) == 0:
            # convert true_json back to JSON and print/return
            return json.dumps(self.true_json)

        # Populate classes
        # Crawl each page as we create
        temp = self.query.split(' ') + self.terms
        self.true_pages = self.create_pages(self.true_json, False, temp)
        #self.cat_pages = self.create_pages(self.cat_json, True, temp)
        self.cat_pages = self.create_pages(self.cat_json, True, self.query.split(' '))

        # Rank all pages
        for pg in self.true_pages:
            pg.calc_value(self.query + ' ' + self.bias)
            #pg.calc_value(self.query + ' ' + ' '.join(self.terms))
        for pg in self.cat_pages:
            pg.calc_value(self.query)
            #pg.calc_value(self.query + ' ' + ' '.join(self.terms))

        # Sort the pages by rank
        self.true_pages.sort()
        self.cat_pages.sort()

        # Create final results list
        self.final_pages = self.merge_pages()

        # Convert back to correct obj
        self.final_results_json = []
        i = 0
        while i < self.rpp:
            if i == len(self.final_pages):
                break
            self.final_results_json.append(self.final_pages[i].get_json())
            i += 1

        # Update the final dictionary
        self.final_dict = self.true_json
        self.final_dict['count'] = i
        self.final_dict['results'] = self.final_results_json
        self.final_dict['time'] = self.true_json['time'] + self.cat_json['time'] + (time.time()-self.start)

        # Convert to JSON and print
        return json.dumps(self.final_dict)
Example 15
import sys
import sqlite3
import simhash

# first arg is path to db
db = sys.argv[1]

conn = sqlite3.connect(db)
c = conn.cursor()

# make 30 passes, each hashing up to 7,000 rows that do not yet have a simhash
for chunk in xrange(30):
    c.execute("SELECT url, body FROM data WHERE simhash IS NULL LIMIT 7000")
    rows = c.fetchall()
    for row in rows:
        url = row[0]
        body = row[1]
        c.execute('UPDATE data SET simhash=? WHERE url=?',
                  (str(simhash.hash(body)), url))
        conn.commit()

conn.close()
Example 16
import sys
import simhash

with open('../data.csv', 'rb') as f:
    companies = [c.strip() for c in f]

hashes = [simhash.hash(x) for x in companies]
d = {h: c for h, c in zip(hashes, companies)}

corpus = simhash.Corpus(32, 16)
corpus.insert_bulk(hashes)

matches = corpus.find_all_bulk(hashes)

results = filter(lambda e: len(set(e)) > 1, matches)

print >> sys.stderr, 'There are %d candidate duplicates' % len(results)

seen = set()

for l in results:
    s = set(l)
    for m in s:
        if m not in seen:
            print d[m]
            seen.add(m)
    print '----'