def cluster(similarity=pearson):
    """Score every unordered pair of pins and persist each score.

    Each pin is converted into a word-count vector over the distinct labels
    returned by ``Words().set()``.  For every pair of pins ``(a, b)`` where
    ``a`` sits at a lower offset than ``b``, ``1.0 - similarity(vec_a, vec_b)``
    is stored through a fresh ``Clusters`` record with the keys ``pin_id_a``,
    ``pin_id_b`` and ``score``.

    Args:
        similarity: Callable taking two equal-length lists of word counts and
            returning a number.  Defaults to ``pearson``.

    Returns:
        None.  One progress line per pair is printed to stdout.
    """
    # NOTE(review): this cursor is never read in this function; it is kept in
    # case opening it has a side effect the mappers rely on -- confirm.
    db = SQLite3().cursor()
    pins = Pins()
    words = Words()

    maxcount = pins.size()
    if maxcount < 2:
        # Fewer than two pins means there is no pair to score.
        return

    # Build the vocabulary once: it does not change between pins, and every
    # vector must use the same ordering to be comparable.
    vocabulary = list(set(r[0] for r in words.set()))

    # Number of unordered pairs, i.e. 1 + 2 + ... + (maxcount - 1).
    total_pairs = maxcount * (maxcount - 1) // 2
    n = 1

    # The last pin has no later partner, so the outer loop stops one short.
    for offset in range(maxcount - 1):
        pin_a = pins.find_by_offset(offset)
        pin_a_wordcount = _pin_word_counts(words, pin_a[0], vocabulary)

        for i in range(offset + 1, maxcount):
            pin_b = pins.find_by_offset(i)
            pin_b_wordcount = _pin_word_counts(words, pin_b[0], vocabulary)

            # Single parenthesised expression: prints identically on Python 2
            # and stays valid syntax on Python 3.
            print('[%s] %s / %s calculate score of %s and %s' % (
                datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
                n, total_pairs, pin_a[0], pin_b[0]))

            # Stored value is a distance: one minus the similarity.
            sim = 1.0 - similarity(pin_a_wordcount, pin_b_wordcount)

            # Save the distance to the database so it can be reused later.
            clusters = Clusters()
            clusters.data['pin_id_a'] = pin_a[0]
            clusters.data['pin_id_b'] = pin_b[0]
            clusters.data['score'] = sim
            clusters.save()
            n += 1


def _pin_word_counts(words, pin_id, vocabulary):
    """Return, for each vocabulary entry, how often it occurs in the pin's words."""
    pin_words = [w[0] for w in words.find_by_pinid(pin_id)]
    return [pin_words.count(label) for label in vocabulary]
#!/usr/bin/python
# CGI endpoint: look up the pins that best match the ``pin_id`` query
# parameter and write them to stdout as a JSON array of
# {"score", "uri", "pin_id"} objects (all values stringified).
import os
import sys
import cgi
import json
import site

# Make the project's core package importable before importing from it.
site.addsitedir(os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../core'))
from db.mapper import Pins, Clusters


def _send_json(payload, status=None):
    """Write a complete CGI response with a JSON body to stdout.

    Args:
        payload: JSON-serialisable object used as the response body.
        status: Optional value for the CGI ``Status`` header, for example
            ``'400 Bad Request'``.  When omitted no ``Status`` header is sent.
    """
    out = sys.stdout
    if status is not None:
        out.write('Status: %s\n' % status)
    # Exactly one blank line ends the header block; anything after it is body.
    out.write('Content-Type: application/json\n\n')
    out.write(json.dumps(payload))
    out.write('\n')


# An absent QUERY_STRING is treated like an empty one (parses to {}).
# NOTE(review): cgi.parse_qs is a deprecated alias of urlparse.parse_qs on
# Python 2.6+; switch when this script is ported.
query = cgi.parse_qs(os.environ.get('QUERY_STRING', ''))

# parse_qs drops blank values, so this also catches "?pin_id=".
pin_ids = query.get('pin_id')

if not pin_ids:
    # Answer with an explicit client error rather than an uncaught KeyError.
    _send_json({'error': 'missing required query parameter: pin_id'},
               status='400 Bad Request')
else:
    clusters = Clusters()
    # NOTE(review): 50 and 0.5 are presumably a result limit and a score
    # threshold -- confirm against Clusters.get_top_matches.
    pins = clusters.get_top_matches(pin_ids[0], 50, 0.5)
    res = [
        {'score': str(pin[0]), 'uri': str(pin[2]), 'pin_id': str(pin[3])}
        for pin in pins
    ]
    _send_json(res)