    def test_delete_vector_multiple_hash(self):
        hashes = [UniBucket('name_hash_%d' % k) for k in range(10)]
        engine = Engine(self.dim, lshashes=hashes)
        self.fill_engine(engine)
        engine.delete_vector(self.removed_value)
        self.check_delete(engine)

    def test_delete_vector_with_provided_value(self):
        engine = Engine(self.dim, lshashes=[UniBucket('testHash')])
        self.fill_engine(engine)
        engine.delete_vector(self.removed_value, self.removed_vector)
        self.check_delete(engine)

    def test_delete_vector_single_hash(self):
        engine = Engine(self.dim, lshashes=[UniBucket('testHash')])
        self.fill_engine(engine)
        engine.delete_vector(self.removed_value)
        self.check_delete(engine)
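
# Illustrative sketch (not part of the original test suite): how a NearPy-style
# Engine handles delete_vector, mirroring the tests above. The helper name and
# the sample key 'doc-42' are invented for illustration; fill_engine() and
# check_delete() are defined elsewhere in the test class.
def _delete_vector_sketch(dim=8):
    import numpy as np
    engine = Engine(dim, lshashes=[UniBucket('sketchHash')])
    v = np.random.randn(dim)
    engine.store_vector(v, 'doc-42')   # index the vector under the key 'doc-42'
    engine.delete_vector('doc-42')     # remove every bucket entry stored under that key
    # after deletion the key should no longer appear among the neighbours
    assert all(data != 'doc-42' for _, data, _ in engine.neighbours(v))
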
class ClusterAnalyser:
    def __init__(self):
        from multiprocessing import Pool
        self.resetClusters()
        self.TUNE_INTERVAL = 1000
        self.ncnttot = 0
        self.ncntq = 0
        self.store_cnt = 20
        self.store_join_cnt = 20
        self.p = Pool(20)
        self.entropyLikelyhood = True
        self.tuneClusters = True
        self.cutters = [[0, None, 'getRandomContractionsMinCut']]
        self.simgraphparams = dict(usedropout=False)
        self.max_runtime = 1200
        self.start_time = None
        self.overrun = False
        self.min_lines_per_second = 20

    def resetClusters(self):
        # Every new cluster gets a unique id which is the key for this dictionary
        self.clusters = {}
        self.next_cluster_id = FI_CLUSTER_ID_OFFSET if opt_lang == 'fi' else 0

        # Locality Sensitive Hashing
        self.lsh_engine = Engine(vecs.dim,
                                 lshashes=[RandomBinaryProjections('rpb', HYPERPLANE_COUNT) for i in range(HASH_LAYERS)],
                                 distance=lsh_distance_func)

    # Returns closest clusters to a given sentence, in a sorted list of (distance, cluster) tuples.
    def query_clusters(self, query, idfs):
        doc_vec = document_to_vector(query.split(' '), idfs)

        if doc_vec is None:
            return None

        return sorted([(1 - doc_vec.dot(c.center) / c.norm, c) for id, c in self.clusters.iteritems()])

    # look for the nearest cluster
    def lookupNearest(self, doc_vec, keywords=None, similarity=None):
        lowest_index = -1
        ncnt = self.lsh_engine.candidate_count(doc_vec)
        self.ncnttot += ncnt
        self.ncntq += 1
        if similarity is not None:
            nearest_neighbours = list(filter(lambda x: filterKeywordSimilarity(self.clusters[x[1]], keywords),
                                             self.lsh_engine.neighbours(doc_vec)))
        else:
            nearest_neighbours = list(filter(lambda x: filterKeywords(self.clusters[x[1]], keywords),
                                             self.lsh_engine.neighbours(doc_vec)))
        #nearest_neighbours = self.lsh_engine.neighbours(doc_vec)

        if len(nearest_neighbours) > 0:
            # get the closest one from the tuple (cluster vector, cluster index, distance)
            nn = min(nearest_neighbours, key=lambda x: (x[2] / self.clusters[x[1]].power))
            #if nn[2] < (CLUSTER_THRESHOLD * self.clusters[nn[1]].power):
            lowest_index = nn[1]
        return lowest_index

    def initNewCluster(self, doc_vec, tweet_data, line, tweet_time, lang, tweet_post_time):
        c = makeNewCluster(self.next_cluster_id, doc_vec, tweet_data, line, tweet_time, lang, tweet_post_time)
        self.addCluster(c)

    def addCluster(self, c):
        self.lsh_engine.store_vector(c.center, self.next_cluster_id)
        self.clusters[self.next_cluster_id] = c
        self.next_cluster_id += 1

    def tuneClustersCall(self):
        line = self.line
        deleted_clusters = []

        print('parallel preprocessing ...')
        #parallel preprocessing
        dlist = list(self.clusters.iteritems())
        params = [[self.line - self.TUNE_INTERVAL, self.entropyLikelyhood, self.cutters, self.simgraphparams]] * len(dlist)
        split_test_out = dict(self.p.map(doAnalyseSplit, zip(dlist, params)))
        print('done')

        for c_idx, c in list(self.clusters.iteritems()):
            if c_idx in deleted_clusters:
                continue
            #print([c_idx, c])
            if c.last_update > line - self.TUNE_INTERVAL:
                if len(c.documents) > 10:
                    if split_test_out[c_idx]['result']:
                        a = split_test_out[c_idx]['a']
                        b = split_test_out[c_idx]['b']
                        probJoin = split_test_out[c_idx]['probJoin']
                        probSplit = split_test_out[c_idx]['probSplit']
                        c.documents = list(map(lambda x: x[0], a))
                        c.text_data = list(map(lambda x: x[1], a))
                        c.word_index = dict()
                        for t in c.text_data:
                            for w in list(filter(lambda x: len(x) > 3, t[0][0].split(' ')[2:])):
                                c.word_index[w] = ''
                        self.lsh_engine.delete_vector(c_idx)
                        c.center = np.mean(c.documents, axis=0)
                        c.norm = np.linalg.norm(c.center)
                        c.updatePower()
                        self.lsh_engine.store_vector(c.center, c_idx)
                        # copy time parameters for now
                        print("Split cluster %d into %d and %d %f < %f" % (c_idx, len(a), len(b), probJoin, probSplit))
                        self.initNewCluster(list(map(lambda x: x[0], b)), list(map(lambda x: x[1][0], b)),
                                            c.last_update, c.created_at, c.lang, list(map(lambda x: x[1][1], b)))
                        if self.store_cnt > 0:
                            pickle.dump(dict(a=a, b=b, probJoin=probJoin, probSplit=probSplit),
                                        open('stored_split_cases_%d.pckl' % self.store_cnt, 'wb'))
                            self.store_cnt -= 1

                if len(c.documents) > 30:
                    # Test merge with random nearest
                    nearest_neighbour_clusters = list(filter(lambda x: filterKeywordSimilarity(self.clusters[x[1]], c.word_index),
                                                             self.lsh_engine.neighbours(c.center)))  #self.lsh_engine.neighbours(c.center)
                    nearest_neighbour_clusters.sort(key=lambda x: x[2])
                    maxrnd = min(len(nearest_neighbour_clusters), 6)
                    if len(nearest_neighbour_clusters) > 1:
                        ann, bnn = random.sample(nearest_neighbour_clusters[:maxrnd], 2)
                        a = zip(self.clusters[ann[1]].documents, self.clusters[ann[1]].text_data)
                        b = zip(self.clusters[bnn[1]].documents, self.clusters[bnn[1]].text_data)

                        if len(a) < 20 and (not self.entropyLikelyhood):  #or len(a) > 500:
                            continue
                        if len(b) < 20 and (not self.entropyLikelyhood):  #or len(b) > 500:
                            continue
                        if self.clusters[ann[1]].lang != self.clusters[bnn[1]].lang:
                            continue

                        if self.entropyLikelyhood:
                            c = makeNewCluster(self.next_cluster_id, list(map(lambda x: x[0], a + b)),
                                               list(map(lambda x: x[1][0], a + b)),
                                               max(self.clusters[bnn[1]].last_update, self.clusters[ann[1]].last_update),
                                               max(self.clusters[bnn[1]].created_at, self.clusters[ann[1]].created_at),
                                               self.clusters[ann[1]].lang, list(map(lambda x: x[1][1], a + b)))
                            probJoin = computeEntropyLikelyhood(c, idfs)
                            wa = len(a) / (float(len(a)) + len(b))
                            probSplit = ((wa * computeEntropyLikelyhood(self.clusters[ann[1]], idfs) +
                                          (1 - wa) * computeEntropyLikelyhood(self.clusters[bnn[1]], idfs)) +
                                         (wa * math.log(wa) / math.log(2) +
                                          (1 - wa) * math.log((1 - wa)) / math.log(2)) + random.random())
                        else:
                            probJoin = computeNormalLikelyhood(a + b)
                            probSplit = computeNormalLikelyhood(a) * computeNormalLikelyhood(b)

                        if probJoin > probSplit:
                            deleted_clusters.append(ann[1])
                            deleted_clusters.append(bnn[1])
                            print("Join clusters %d (%d) and %d (%d) %f > %f" % (ann[1], len(a), bnn[1], len(b), probJoin, probSplit))
                            if self.store_join_cnt > 0:
                                pickle.dump(dict(a=a, b=b, probJoin=probJoin, probSplit=probSplit),
                                            open('stored_join_cases_%d.pckl' % self.store_join_cnt, 'wb'))
                                self.store_join_cnt -= 1
                            if self.entropyLikelyhood:
                                self.addCluster(c)
                            else:
                                self.initNewCluster(list(map(lambda x: x[0], a + b)),
                                                    list(map(lambda x: x[1][0], a + b)),
                                                    max(self.clusters[bnn[1]].last_update, self.clusters[ann[1]].last_update),
                                                    max(self.clusters[bnn[1]].created_at, self.clusters[ann[1]].created_at),
                                                    self.clusters[ann[1]].lang,
                                                    list(map(lambda x: x[1][1], a + b)))
                            self.lsh_engine.delete_vector(ann[1])
                            self.clusters.pop(ann[1])
                            self.lsh_engine.delete_vector(bnn[1])
                            self.clusters.pop(bnn[1])

    def purgeClusters(self):
        line = self.line
        to_be_removed = []
        for k, c in self.clusters.iteritems():
            if line - c.last_update > (100000 * len(c.documents)) and len(c.documents) < 10:
                to_be_removed.append((k, c.center))

        for t in to_be_removed:
            self.lsh_engine.delete_vector(t[0])
            self.clusters.pop(t[0])

        if len(to_be_removed) > 0:
            print("Removed %d stagnant clusters" % len(to_be_removed))

    def calcGrowthRate(self):
        line = self.line
        tweet_time = self.tweet_time
        time_since_last_growth = self.time_since_last_growth

        for id, c in self.clusters.iteritems():
            #if (c.created_at < 1405555200000):  # 17/07/2014 00:00:00
            #    continue
            c.calculateGrowthAndSentiment()

            ## calculate growth for first 12h
            #if len(c.hourly_growth_rate) < 12:
            #    growth_rate = (len(c.text_data) - c.last_size) / float(time_since_last_growth) * 1000 * 60 * 60
            #    if len(c.hourly_growth_rate) == 0:
            #        c.first_growth_time = tweet_time
            #    c.hourly_growth_rate.append(growth_rate)

            #    ## calculate sentiment for new tweets
            #    if len(c.documents) > c.last_size:
            #        cluster_vector = np.mean(c.documents[c.last_size:], axis=0)
            #        sentiment = getSentiment(cluster_vector)
            #    else:
            #        sentiment = 0
            #    c.hourly_sentiment.append(sentiment)

            #    ## calculate total sentiment so far
            #    sentiment = getSentiment(np.mean(c.documents, axis=0))
            #    c.hourly_accum_sentiment.append(sentiment)

            #    c.last_size = len(c.text_data)
            #    c.hourly_keywords.append(cluster_exporter.get_keywords(c, idfs)[:3])  #['three','random','words']

            #    ## print quickly growing ones with high enough entropy
            #    ##if growth_rate < 10:
            #    #    continue

            #    entropy = cluster_exporter.calculate_cluster_entropy(c)
            #    if entropy < ENTROPY_THRESHOLD:
            #        continue

            #    print('Quickly growing cluster %d: %d tweets, %d tweets/h, entropy %.2f\n' % (id, len(c.text_data), int(growth_rate), entropy))
            #    print('\n'.join(list(map(lambda x: x[0], random.sample(c.text_data, min(len(c.text), 8))))))
            #    print('\n\n')

    # Every line in the input file should start with a timestamp in ms and id of document,
    # followed by the whole document, all separated with spaces and without newlines.
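    # Hypothetical example of one such document line (values invented for
    # illustration only):
    #   1405555200000 487654321098765432 some normalized tweet text tokens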
    #
    # Note: minimum word frequency is often implemented by the vector model already
    def construct_clusters(self, filename, from_line=0, from_date=None, to_date=None, idfs=None, lang=None):
        self.start_time = time.time()

        if lang != 'ru' and lang != 'fi':
            print("Lang must be 'ru' or 'fi'")
            return

        tweet_file = open(filename)

        try:
            self.line = 0

            # performance counter
            self.last_print_line = 0
            self.last_print_time = time.time()

            # used for calculating hourly growth in tweet time
            self.last_growth_calc = 0
            self.tweet_time = 0
            self.tweet_time_notz = datetime.utcfromtimestamp(0)

            for twlineesc in tweet_file:
                if time.time() - self.start_time > self.max_runtime:
                    self.overrun = True
                    break

                twline = twlineesc.decode('unicode-escape').encode('utf-8')
                if len(twline) < 2:
                    continue
                twsplit = twline.split(',')
                try:
                    unix_tweet_time = int(time.mktime(datetime.strptime(twsplit[0], '%a %b %d %X +0000 %Y').timetuple()) * 1000)
                except:
                    print(twline)
                    print(twsplit[0])
                    raise Exception()
                tweet = " ".join([str(unix_tweet_time), twsplit[1], twsplit[4]])

                self.line += 1
                if self.line < from_line:
                    continue

                if self.tuneClusters:
                    if self.line % self.TUNE_INTERVAL == 0:
                        #pr.disable()
                        self.tuneClustersCall()
                        #pr.enable()

                # save periodically
                if False:  #self.line % 1000000 == 0 and self.line != 0:
                    save_results(filename + '_' + str(self.line))

                # remove really old clusters with a small amount of documents
                if self.line % 100000 == 0:
                    self.purgeClusters()

                # print status
                if self.line % 1000 == 0:
                    #pr.disable()
                    new_time = time.time()
                    lps = int((self.line - self.last_print_line) / (new_time - self.last_print_time))
                    print("Line: %d, Date: %s, Clusters: %d, %d lines/s AVG candidates: %d" %
                          (self.line, self.tweet_time_notz, len(self.clusters), lps, int(self.ncnttot / (self.ncntq + 0.0000001))))
                    #if int((self.line - self.last_print_line) / (new_time - self.last_print_time)) < 50:
                    #    s = StringIO.StringIO()
                    #    sortby = 'cumulative'
                    #    ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
                    #    ps.print_stats()
                    #    print(s.getvalue())
                    self.last_print_line = self.line
                    self.last_print_time = new_time
                    self.ncnttot = 0
                    self.ncntq = 0
                    if time.time() - self.start_time > self.max_runtime or lps < self.min_lines_per_second:
                        self.overrun = True
                        break
                    #pr.enable()

                # calculate growth rate
                #self.time_since_last_growth = self.tweet_time - self.last_growth_calc
                #if self.time_since_last_growth > 1000 * 60 * 60:
                #    self.last_growth_calc = self.tweet_time
                #    self.calcGrowthRate()

                self.tweet_time = unix_tweet_time
                tweet_parts = tweet.strip().split(' ')
                #try:
                #    self.tweet_time = int(tweet_parts[0])
                #except ValueError:
                #    print('Invalid document on line %d: %s' % (self.line, tweet))
                #    continue

                self.tweet_time_notz = datetime.utcfromtimestamp(self.tweet_time * 0.001)
                tweet_time_utc = utc.localize(self.tweet_time_notz)

                if from_date is not None and tweet_time_utc < from_date:
                    continue

                if to_date is not None and tweet_time_utc > to_date:
                    break

                # TEMP ignore gameinsight spam and short tweets
                if len(tweet_parts) < 6 or tweet.find('gameinsight') != -1:
                    continue

                # allocate tweet to cluster
                doc_vec = document_to_vector(tweet_parts[2:], idfs)

                if doc_vec is None:
                    continue

                keywords = list(filter(lambda x: len(x) > 4, tweet.strip().split(' ')[2:]))

                # ignore short tweets
                if len(keywords) < 6:
                    continue

                lowest_index = self.lookupNearest(doc_vec, keywords, similarity=True)

                if lowest_index != -1:
                    c = self.clusters[lowest_index]
                    c.appendTweet(doc_vec, [[tweet.strip(), twsplit[3], twsplit[2]], self.tweet_time], self.line)
                    #c.documents.append(doc_vec)
                    #c.text_data.append([[tweet.strip(), twsplit[3], twsplit[2]], self.tweet_time])
                    #c.last_update = self.line

                    # update the cluster center if the cluster is small
                    if len(c.documents) > 0:
                        if len(c.documents) < 5:
                            self.lsh_engine.delete_vector(lowest_index)
                            c.center = np.mean(c.documents, axis=0)
                            c.norm = np.linalg.norm(c.center)
                            self.lsh_engine.store_vector(c.center, lowest_index)
                        else:
                            if len(c.documents) < 100:
                                c.power = np.mean(np.std(c.documents, axis=0))
                else:
                    # no cluster found, construct new one
                    self.initNewCluster([doc_vec], [[tweet.strip(), twsplit[3], twsplit[2]]],
                                        self.line, self.tweet_time, lang, [self.tweet_time])

        except KeyboardInterrupt:
            print("Line: %d Clusters: %d" % (self.line, len(self.clusters)))
            print("Cancelled")

        self.p.close()
        self.p.join()
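
# Minimal usage sketch (assumptions: 'tweets.csv' is a placeholder path; the
# module-level globals this class depends on, such as vecs, idfs, opt_lang and
# the document_to_vector/makeNewCluster helpers, are set up elsewhere in this
# repository):
#
#   analyser = ClusterAnalyser()
#   analyser.construct_clusters('tweets.csv', idfs=idfs, lang='fi')
#   print("%d clusters, overrun=%s" % (len(analyser.clusters), analyser.overrun))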