def addPoint(self, point):
    if len(self.clusters) == 0:
        cl = cluster()
        cl.addPoint(point)
        self.clusters.append(cl)
    else:
        neighbouringClusters = self.numberOfNeighbouingClusters(point)
        if neighbouringClusters == 0:
            cl = cluster()
            cl.addPoint(point)
            self.clusters.append(cl)
            return
        elif neighbouringClusters == 1:
            for cl in self.clusters:
                if cl.isInside(point):
                    cl.addPoint(point)
                    return
        else:
            newCluster = cluster()
            newCluster.addPoint(point)
            for cl in self.clusters[:]:
                if cl.isInside(point):
                    for pnt in cl.points:
                        newCluster.addCheckedPoint(pnt)
                    self.clusters.remove(cl)
            self.clusters.append(newCluster)
def dbscan(self, D, eps, MinPts):
    self.dataSet = D
    title(r'dbscan_demo Algorithm', fontsize=18)
    xlabel(r'Dim 1', fontsize=17)
    ylabel(r'Dim 2', fontsize=17)
    C = -1
    Noise = cluster('Noise')
    for point in D:
        if point not in self.visited:
            self.visited.append(point)
            NeighbourPoints = self.regionQuery(point, eps)
            if len(NeighbourPoints) < MinPts:
                Noise.addPoint(point)
            else:
                name = 'Cluster' + str(self.count)
                C = cluster(name)
                self.count += 1
                self.expandCluster(point, NeighbourPoints, C, eps, MinPts)
                plot(C.getX(), C.getY(), 'o', label=name)
                hold(True)
    if len(Noise.getPoints()) != 0:
        plot(Noise.getX(), Noise.getY(), 'x', label='Noise')
    hold(False)
    legend(loc='lower left')
    grid(True)
    show()
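# A minimal sketch (not from the original code) of the two helpers the dbscan()
# method above assumes. regionQuery() returns every dataset point within eps of a
# point and expandCluster() grows cluster C from a core point; euclidean() and the
# exact membership check are illustrative assumptions, and both helpers would live
# on the same class as dbscan().
def euclidean(a, b):
    return sum((ai - bi) ** 2 for ai, bi in zip(a, b)) ** 0.5

def regionQuery(self, point, eps):
    return [p for p in self.dataSet if euclidean(p, point) <= eps]

def expandCluster(self, point, NeighbourPoints, C, eps, MinPts):
    C.addPoint(point)
    for p in NeighbourPoints:
        if p not in self.visited:
            self.visited.append(p)
            newNeighbours = self.regionQuery(p, eps)
            if len(newNeighbours) >= MinPts:
                NeighbourPoints.extend(newNeighbours)
        if p not in C.getPoints():
            C.addPoint(p)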
def dbscan(self, MinPts, eps):
    '''
    :param data: The input data points or features
    :param MinPts:
    :param eps:
    :return:
    '''
    self.cluster_count = 0
    Noise = cluster.cluster("Noise")
    for point in self.dataset:
        if point not in self.visited_points:
            self.visited_points.append(point)
        else:
            continue
        neighbour_points = self.region_query(point, eps)
        if len(neighbour_points) < MinPts:
            Noise.add(point)
        else:
            cluster_name = "cluster " + str(self.cluster_count)
            C = cluster.cluster(cluster_name)
            self.cluster_count += 1
            self.expand_cluster(point, neighbour_points, C, eps, MinPts)
            plt.plot(C.get_syllable(), C.get_usage(), 'o', label=cluster_name, hold=True)
    if len(Noise.getPoints()) != 0:
        plt.plot(Noise.get_syllable(), Noise.get_usage(), 'x', label='Noise', hold=False)
    plt.show()
def ellipseDetect(input):
    img, data = input
    params = cv.SimpleBlobDetector_Params()
    area = img.shape[0] * img.shape[1]
    minArea = area * 0.000125
    maxArea = area * 0.002
    print(area, minArea, maxArea)
    bestcount = 0
    # use varying maximum threshold, keep the largest number
    # of blobs found (if it's less than 8)
    for maxthresh in range(20, 225, 10):
        params.thresholdStep = 10.0
        params.minThreshold = 10
        params.maxThreshold = maxthresh  # 220.0
        params.filterByArea = True
        params.minArea = minArea
        params.maxArea = maxArea
        params.filterByColor = False
        params.filterByCircularity = True
        params.minCircularity = 0.7
        params.filterByConvexity = True
        params.minConvexity = 0.8
        params.filterByInertia = True
        params.minInertiaRatio = 0.5
        params.minRepeatability = 2
        params.minDistBetweenBlobs = 10.0
        detector = cv.SimpleBlobDetector_create(params)
        keypoints = detector.detect(img)
        keypoints = cluster.cluster(keypoints, 5)
        if keypoints is not None:
            count = len(keypoints)
            for p in keypoints:
                print(p.pt, p.size)
            if count <= 8 and count > bestcount:
                bestcount = count
                bestpoints = keypoints
    if bestcount > 0:
        keypoints = bestpoints
        print(keypoints)
        img = cv.drawKeypoints(img, keypoints, None, (255, 0, 0),
                               cv.DrawMatchesFlags_DRAW_RICH_KEYPOINTS)
    else:
        keypoints = list()
        print("No ellipses found")
    return (img, keypoints, True, ("blob ellipses", "%d keypoints" % bestcount))
def operate(self, routes, address_field, city_field, province_field, zip_field, start, end):
    done = 0
    err = 0
    for i, route in enumerate(routes):
        columns = self.__read_columns(
            route, [address_field, city_field, province_field, zip_field], start, end)
        if columns.shape[0] < 1:
            err += 1
            continue
        addresses = [
            Address(columns.iloc[i, 0], columns.iloc[i, 1],
                    columns.iloc[i, 2], columns.iloc[i, 3]).return_val()
            for i in range(columns.shape[0])
        ]
        labels, self.km[route] = cluster(addresses)
        columns["Label"] = [f"L{label}" for label in labels]
        self.__write_df(columns, f"{route}_sheet", close=i + 1 == len(routes))
        done += 1
    return done, err
def optimal_ponctual_charge(self, atomindex):
    atom, index = atomindex
    nb1 = self.cluster_1.get_global_atom_index(atom, index)
    pop = self.cluster_1.lowdin_charge(nb1) - self.cluster_1[atomindex][0]
    coords = self.cluster_1[atomindex][1]
    project_temp = cluster.cluster(self.cluster_1.wfn)
    project_temp.atoms_coords = self.cluster_1.atoms_coords
    project_temp.atoms_charges = self.cluster_1.atoms_charges
    project_temp.atoms_names = self.cluster_1.atoms_names
    project_temp.atoms_indexes = self.cluster_1.atoms_indexes
    project_temp.atoms_nb = self.cluster_1.atoms_nb
    project_temp['BQ', None] = (-self.cluster_1[atomindex][0], coords)
    del project_temp[atomindex]
    g = [pop * (1 / 3)]
    d = [pop * (2 / 3)]
    project_temp['MP', None] = (g[0], coords)
    nb2 = len(np.where(self.cluster_1.env_names == 'BQ')[0])
    g.append(pop - project_temp.lowdin_charge(nb1 + nb2))
    index_mp = len(np.where(self.cluster_1.env_names == 'MP')[0])
    print(index_mp)
    project_temp['MP', index_mp] = (d[0], coords)
    d.append(pop - project_temp.lowdin_charge(nb2))
    a = (d[1] - g[1]) / (d[0] - g[0])
    b = d[1] - d[0] * a
    print(nb1)
    print(coords)
    print(pop)
    print(nb2)
    print(-b / a)
    return -b / a
def eval_func_confidences(self, feature_weights):
    weights_sum = float(sum(feature_weights))
    # normalize the weights and make sure none are equal to 0
    feature_weights = [max(0.00001, x / weights_sum) for x in feature_weights]
    IU = IntrinsicUtility()
    all_test_files = IU.get_n_training_files(n=self.num_documents,
                                             first_doc_num=self.first_doc_num,
                                             min_len=35000, pct_plag=1)
    reduced_docs = _get_reduced_docs(self.atom_type, all_test_files, session)
    actuals = []
    confidences = []
    confidence_vectors = []
    for feature, weight in zip(self.features, feature_weights):
        vi = 0
        for doc in reduced_docs:
            feature_vectors = doc.get_feature_vectors([feature], session)
            confs = cluster(self.cluster_type, 2, feature_vectors)
            for i, confidence in enumerate(confs, 0):
                if len(confidence_vectors) <= vi:
                    confidence_vectors.append([])
                confidence_vectors[vi].append(confidence * weight)
                vi += 1
    for doc in reduced_docs:
        for span in doc._spans:
            actual = 1 if doc.span_is_plagiarized(span) else 0
            actuals.append(actual)
    for vec in confidence_vectors:
        confidences.append(min(1, sum(vec)))
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(actuals, confidences, pos_label=1)
    roc_auc = sklearn.metrics.auc(fpr, tpr)
    print 'evaluated:', roc_auc, [w for w in feature_weights]
    return roc_auc
def simulate(N, M, D, S, G, dt):
    """Simulate function from Computation book modified to take in user-specified
    variables N, M, D, S, G, dt and output a user-specified file name.

    :param N: The number of particles.
    :param M: The number of clusters.
    :param D: The number of dimensions.
    :param S: The number of time steps.
    :param G: Gravitational constant.
    :param dt: The time step.
    :return: Simulation complete message.
    """
    # create cluster object; 500 defines the maximum position coordinate of a cluster
    x = cl.cluster(N, M, D, 500)
    x0, v0, m = x.cluster()  # x0, v0, m = book.initial_cond(N, D) <-- old way to initialize
    for s in range(S):
        with open("clusterdata" + str(s + 1) + ".dat", "w") as myfile:
            for i in range(M):
                x1, v1 = book.timestep(x0[i], v0[i], G, m[i], dt)
                x0[i], v0[i] = x1, v1
                myfile.write(str(x0[i]).replace('[', '').replace(']', '') + "\n")
                myfile.flush()
    return '\nSimulation complete. Your data has been saved as clusterdata*.dat\n'
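# Hedged usage sketch (not part of the original source): an example invocation of
# simulate() with illustrative values only; the cl and book modules imported by the
# surrounding project are assumed to be available.
if __name__ == '__main__':
    message = simulate(N=100, M=4, D=3, S=10, G=6.674e-11, dt=0.01)
    print(message)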
def ksplit2(data_list, ks_crit):
    centroids, labels_init = kmeans2(data_list, 2, minit='points')
    p = 0
    l = 2
    while p < l + 1:
        r = data_list[labels_init == p]
        ks, centroids, labels = cluster(r)
        if ks > ks_crit:
            # This is the condition for a good cluster
            p += 1
            continue
        elif ks == -1:
            break
        else:
            m = where(labels_init == p)
            for i in range(2):
                n = where(labels == i)
                labels[n[0]] = l
                l += 1
            p += 1
            labels_init[m[0]] = labels
    return centroids, labels_init, ks_crit
def run_each_frame(file, topo):
    alld, pbc = sca.get_coord(file, topo)
    res_d = {}  # contains information for each frame
    avg_dict = {}
    count = 0
    for frame in alld:
        a, b, c = pbc[frame]
        d, links_h, links_o, hbonds, obonds = data.data(alld[frame], a, b, c)
        coord = np.array([d[i][1:] for i in d])
        temp_d = cluster.cluster(coord, file)
        for i in temp_d:
            if i not in avg_dict:
                avg_dict[i] = temp_d[i]
            else:
                avg_dict[i] += temp_d[i]
        count += 1
        res_d[frame] = temp_d
    print file, 'Number of frames', len(res_d)
    for i in avg_dict:
        avg_dict[i] = avg_dict[i] / count
    np.save(file[:-6] + '_CMS_perFrame.npy', res_d)
    np.save(file[:-6] + '_CMS_avg.npy', avg_dict)
def filterClusters(self):
    if len(self.pairwiseDict) == 0:
        self.pairwise()
    amap = AAmap()
    for i in xrange(0, len(self.atoms)):
        c = cluster(self.pdb, self.top, self.pfam, '', '', self.seqheader, '', '',
                    self.center, self.cutoff, self.scutoff, self.flag, 1.0, self.desc)
        c.addNeighbor(amap, self.atoms[i], i)  # put itself in first
        nbnum = 0
        for j in xrange(0, len(self.atoms)):
            key = "%d-%d" % (i, j)
            if (self.pairwiseDict[key] <= self.cutoff) and (abs(i - j) >= self.scutoff):
                c.addNeighbor(amap, self.atoms[j], j)
                nbnum = nbnum + 1
                c.thetaPhi.append(self.calculateThetaPhi(self.atoms[i], self.atoms[j]))
        if nbnum < self.nbcutoff:
            continue
        c.pdbidx = c.pdbidx.lstrip()  # will change meanDist
        c.pdbResSeq = c.pdbResSeq.lstrip()
        meanDist = self.clusterMeanDist(c)
        if meanDist < 5.8:
            print ('%s,%0.2f,%s,%s,%s,%s') % (self.pdb, meanDist, ''.join(sorted(c.str)),
                                              ''.join(sorted(c.typeStr)), c.pdbResSeq,
                                              self.getSphericalStr(c))
            self.clusters.append(c)
def kMean(data):
    global globalSse
    global globalCs
    # Step 1 - Pick K random points as cluster centers called centroids.
    cs = []
    idx = np.random.randint(data.shape[0], size=K)
    centers = np.take(X, idx, axis=0)
    count = 0
    lastScore = 0
    for center in centers:
        cs.append(cluster(center))
    while True:
        # Step 2 - Assign each xi to the nearest cluster by calculating its distance to each centroid.
        for c in cs:
            c.clearData()
        for idx, x in enumerate(X):
            cIndex = minDistCentroid(cs, x)
            cs[cIndex].data.append(idx)
        # Step 3 - Find new centroids from the new clusters
        for c in cs:
            newCentroid = np.mean(np.take(X, c.data, 0), axis=0)
            c.updateCenter(newCentroid)
        sse = str(wc(cs, data))
        # Step 4 - break if the iteration limit is reached or the score stops changing
        count += 1
        if count == 10000 or lastScore == sse:
            printResult(cs, sse)
            globalSse = sse
            globalCs = cs
            break
        lastScore = sse
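# A minimal sketch, under assumptions, of the two helpers kMean() relies on but that
# are not shown here: minDistCentroid() returns the index of the nearest centroid and
# wc() computes the within-cluster sum of squared errors. Both assume the cluster
# objects expose a .center attribute and a .data index list, as used above; the real
# project may define them differently.
def minDistCentroid(cs, x):
    dists = [np.linalg.norm(x - c.center) for c in cs]
    return int(np.argmin(dists))

def wc(cs, data):
    sse = 0.0
    for c in cs:
        pts = np.take(X, c.data, 0)
        sse += float(np.sum((pts - c.center) ** 2))
    return sse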
def getRecmmendListController(uid):
    # read user features from the database
    users = getAllUserInfo()
    # vectorize the features
    userData = userFeatureVectorize(users)
    # cluster the users
    labels, centers = cluster(userData, 4)
    # printUserLabel(users, labels)
    # return
    # find the cluster that uid belongs to
    pos = findUserPos(uid, users)
    if pos == -1:
        return 'uid is not in db'
    userLabel = labels[pos]
    # get the ids of all users in that group
    ids = getGroup(labels, userLabel, users)
    # get the keyword search counts of all users
    usersSearchInfo = getAllUserSearchInfo()
    usersInfo = geneUserModel(users, usersSearchInfo, userData, labels)
    # get the keyword search counts of the users in the group
    group = getGroupInfo(ids, usersInfo)
    # collaborative filtering to estimate the user's keyword search counts
    recList = collFilter(uid, group, userData)
    # assemble the result for the user
    allKey = getAllKeyInfo()
    jsonStr = geneResult(recList, allKey)
    return jsonStr
def identify_cluster(X_data_df, threshold=0.7, correlation_id_method='pearson'):
    cor = X_data_df.corr(method=correlation_id_method)
    clusters = []
    for j, col in enumerate(cor.columns):
        for i, row in enumerate(cor.columns[0:j]):
            if abs(cor.iloc[i, j]) > threshold:
                current_pair = (col, row, cor.iloc[i, j])
                current_pair_added = False
                for _c in clusters:
                    if _c.can_accept(current_pair):
                        _c.update_with(current_pair)
                        current_pair_added = True
                if current_pair_added == False:
                    clusters.append(cluster(pairs=[current_pair]))
    final_clusters = []
    # It is possible to have clusters with shared nodes, which is not desirable.
    # Here we merge the clusters that share nodes.
    for _cluster in clusters:
        added_to_final = False
        for final_c in final_clusters:
            if _cluster.nodes.intersection(final_c.nodes) != set():
                final_c.merge_with_cluster(_cluster)
                added_to_final = True
        if added_to_final == False:
            final_clusters.append(_cluster)
    for i, _cluster in enumerate(final_clusters):
        _cluster.name = f'cluster_{i}'
    return final_clusters
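# Hedged usage sketch: identify_cluster() expects a pandas DataFrame of numeric
# features. The column names and random data below are illustrative only, and the
# cluster class used above is assumed to be importable.
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': np.random.rand(100)})
df['b'] = df['a'] * 2 + 0.01 * np.random.rand(100)  # strongly correlated with 'a'
df['c'] = np.random.rand(100)                       # roughly independent
correlated_groups = identify_cluster(df, threshold=0.7)
# 'a' and 'b' should end up in the same correlation cluster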
def __init__(self, width, height):
    self.width = width
    self.height = height
    self.origin = np.array([self.width / 2, self.height / 2])
    self.root = entity(
        point(self.origin, np.array([0., 0.]), np.array([0., 0.])),
        np.array([0., 0., 0.]))
    self.cluster = cluster(self.root)
def index():
    global clusters
    if request.method == 'POST':
        eps = float(request.form.get('eps', 20))
        min_samples = int(request.form.get('min_samples', 2))
        clusters = cluster(mat, fnames, eps, min_samples)
        print(clusters)
    return render_template('index.html', clusters=clusters)
def agglomerative(data):
    # first, every single point is a cluster
    global globalSse
    global globalCs
    global distDictionary
    cs = []
    heap = distHeap()
    for idx, x in enumerate(X):
        c = cluster(x)
        c.data.append(idx)
        cs.append(c)
    for i, x in enumerate(cs):
        for j in range(i + 1, len(cs)):
            dist = cluster_distance(cs[i], cs[j])
            clusters = (i, j)
            distDictionary[clusters] = dist
            # print "dist" + str(dist) + " " + str(i) + " " + str(j)
            heap.add_clusters(cs[i], cs[j], dist)
    while True:
        # print len(cs)
        if len(cs) <= K:
            sse = str(wc(cs, data))
            printResult(cs, sse)
            globalSse = sse
            globalCs = cs
            break
        c1, c2 = heap.min_dist_clusters()
        # merge c1 and c2
        newData = np.concatenate((c1.data, c2.data))
        # Append the new cluster into the new cs
        c = cluster(np.mean(np.take(X, newData, 0), axis=0))
        c.data = newData
        # remove associated c1 and c2
        heap.remove_cluster(c1, c2)
        cs.remove(c1)
        cs.remove(c2)
        # add new distances
        for c_ind in cs:
            # print 'add new dist'
            dist = cluster_distance_cached(c, c_ind)
            heap.add_clusters(c, c_ind, dist)
        # print c1.data
        # print c2.data
        cs.append(c)
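# A minimal sketch, assuming a centroid-linkage style helper for agglomerative():
# cluster_distance() here is the Euclidean distance between cluster centroids,
# computed from the global X used above. This is an assumption for illustration;
# the real project may use single or complete linkage instead.
def cluster_distance(c1, c2):
    centroid1 = np.mean(np.take(X, c1.data, 0), axis=0)
    centroid2 = np.mean(np.take(X, c2.data, 0), axis=0)
    return float(np.linalg.norm(centroid1 - centroid2))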
def cluster_recs():
    native_lang = request.json['native_lang']
    uid = request.json['uid']
    clust_num = request.json['clust_num']
    percent = request.json['percent']
    pop_clusters = cluster(native_lang, uid, clust_num, percent)
    return json.dumps(pop_clusters)
def run(self, num_processes, sample=None):
    # assigns the same cluster id to records that are the same business entity
    return cluster.cluster(self.df, cluster='cluster', to='to', match='match').run(
        self.get_matches(num_processes, sample))
def main():
    hotelStarDict = {
        '五星级': 5, '五星型': 5,
        '四星级': 4, '四星型': 4,
        '三星级': 3, '三星型': 3,
        '二星级': 2, '二星型': 2,
        '经济级': 1, '经济型': 1
    }
    sexDict = {'男': 0, '女': 1}
    hobbyDict = {}
    ff = open('userInfo.txt')
    lines = ff.readlines()
    count = 0
    for line in lines:
        id, userName, age, sex, hobby, consumLevel, hotelStar = line.strip()[1:-1].split(',')
        if hobby not in hobbyDict:
            hobbyDict[hobby] = count
            count += 1
    hobbyFeat = [0] * len(hobbyDict)
    userData = []
    userIds = []
    for line in lines:
        id, userName, age, sex, hobby, consumLevel, hotelStar = line.strip()[1:-1].split(',')
        age = int(age)
        sex = sexDict[sex]
        hobbyFeature = hobbyFeat[:]
        hobbyFeature[hobbyDict[hobby]] = 1
        hobby = hobbyFeature
        consumLevel = int(consumLevel)
        hotelStar = hotelStarDict[hotelStar]
        feature = [age, sex, consumLevel, hotelStar]
        feature.extend(hobby)
        userData.append(feature)
        userIds.append([id, userName])
    ff.close()
    userData = utils.matNorm(userData)
    '''for data in userData:
        print data
    return'''
    labels, centers = cluster.cluster(userData, 4)
    for i, id in enumerate(userIds):
        print id[1], labels[i]
def run(filename, min_size, max_size, dist):
    noun_dict = noun_extractor.get_nouns(filename)
    (synset_list, synset_dict) = cluster.get_synset_list(noun_dict)
    matrix = cluster.gen_sim_matrix(synset_list)
    clustering = cluster.format_clustering(cluster.cluster(matrix), synset_list)
    clusters = cluster.get_clusters(clustering, synset_list, min_size, max_size, dist=dist)
    clusters = filter(lambda x: x[0] is not None, clusters)
    cluster_counts = cluster.get_cluster_counts(clusters, synset_dict)
    # sort clusters by noun counts, most frequent first
    sorted_clusters = [x[1] for x in sorted(enumerate(clusters),
                                            key=lambda x: cluster_counts[x[0]],
                                            reverse=True)]
    hypernyms = filter(lambda x: x is not None, map(lambda x: lca(x), sorted_clusters))
    return Classification(noun_dict, synset_list, synset_dict, matrix, clustering,
                          sorted_clusters, hypernyms)
def main():
    calib_path = raw_input('Enter folder path for calibration parameters:\n')
    if not os.path.isdir(calib_path):
        print('Invalid path!')
        return
    in_path = raw_input('Enter folder path for input images:\n')
    if not os.path.isdir(in_path):
        print('Invalid path!')
        return
    out_path = raw_input('Enter folder path for output images:\n')
    if not os.path.isdir(out_path):
        os.mkdir(out_path)
    undistort.undistort(calib_path, in_path, out_path)
    cluster.cluster(os.path.join(out_path, undistort), out_path)
    find_contour.find_contour(os.path.join(out_path, cluster), out_path)
def categorize(self, topics):
    self.seen.append(' '.join(topics))
    if len(self.seen) > 100:
        self.seen = self.seen[1:]
    cluster.updateLanguage(self.seen[-1].split(' '))
    if len(self.seen) < 10:
        return self.boot.categorize(topics)
    cats = getLabels(cluster.cluster(self.seen))
    for cat in cats:
        if self.seen[-1] in cats[cat]:
            return cat
    return 'I dunno'
def top_local_stop_structure_gt(self, threshold):
    global_threshold = len(self.pages) * threshold
    gt_clusters = []
    for item in set(self.ground_truth):
        gt_clusters.append(cluster())
        for i in range(len(self.ground_truth)):
            if self.ground_truth[i] == item:
                gt_clusters[item].addPage(self.pages[i])
        print str(item) + "\t" + str(len(gt_clusters[item].pages))
    print "number of gt cluster is " + str(len(gt_clusters))
    print "number of cluster 5 is " + str(len(gt_clusters[4].pages))
    gt_clusters[4].find_local_stop_structure(self.df, global_threshold)
def mainSetK(peaks, expectedK):
    # means = seed.pickInitMeans(peaks, expectedK)
    #
    # numMeans = expectedK
    # numMatrixSeeds = int(0.75 * numMeans) + 1
    # means = matrixSeed.pickInitSeeds(peaks, numMatrixSeeds)
    # means += seed.pickInitMeans(peaks, numMeans - numMatrixSeeds)
    # means = matrixSeed.pickInitSeeds(peaks, expectedK)
    print peaks[0]
    # The extra list at the beginning is for outliers, and is initialized with all peaks
    clusters = [peaks] + clustrifyMeans(means)
    alignmentMatrix = align.generate_align_matrix(peaks, means)
    clusterVariances = [0] * 5  # just something so that the first Welch's test doesn't cause termination
    print 'first runthrough of clustering'
    (means, clusters) = cluster.cluster(peaks, means, alignmentMatrix)
    varAlignmentMatrix = align.generate_var_align_matrix(clusters)
    print 'starting welch\'s t-test clustering with centroid means'
    (p_val, clusterVariances) = welchTest(clusters, varAlignmentMatrix, clusterVariances)
    while p_val < probabilityThreshold:
        means = paring.paredMeans(means, varAlignmentMatrix)
        numNewMeans = guessNewMeansSetK(peaks, means, p_val, expectedK)
        # currently, no correlation between how many means duplicated/dropped in paring
        # and how many and from where they are added in mean picking
        # means += seed.pickMeans(peaks, numNewMeans)
        means += seed.pickNewMeansOutliersToRandom(clusters, numNewMeans)
        # means += seed.pickNewMeans(clusters, numNewMeans, clusterVariances)
        alignmentMatrix = align.generate_align_matrix(peaks, means)
        (means, clusters) = cluster.cluster(peaks, means, alignmentMatrix)
        varAlignmentMatrix = align.generate_var_align_matrix(clusters)
        (p_val, clusterVariances) = welchTest(clusters, varAlignmentMatrix, clusterVariances)
    print 'finished clustering of subsequent k guess'
    return clusters
def get_plagiarism(text, atom_type, features, cluster_method, k):
    '''
    Return a list of tuples of the form [((0, 18), .5), ((20, 45), .91), ...]
    In each tuple there is a span tuple and a confidence. The span tuple corresponds
    to an atom of the document and the confidence value corresponds to how confident
    we are that that span was plagiarized.
    '''
    # Create a FeatureExtractor
    feature_extractor = FeatureExtractor(text)
    # get feature vectors
    feature_vecs = feature_extractor.get_feature_vectors(features, atom_type)
    # cluster
    confidences = cluster(cluster_method, k, feature_vecs)
    # Return it
    # should feature extractor have a method that returns the spans it used instead?
    return zip(tokenize(text, atom_type), confidences)
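# Hedged usage sketch: one way get_plagiarism() could be called, following the
# docstring above. The file name, feature names, and cluster method are
# illustrative assumptions, not part of the original project.
text = open('suspicious_document.txt').read()
results = get_plagiarism(text, 'paragraph',
                         ['average_word_length', 'stopword_percentage'],
                         'kmeans', 2)
# each item looks like ((start_index, end_index), confidence)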
def p_a_comp_post(PARequest):  # noqa: E501
    """Request the execution of a placement algorithm.

    Request the execution of a placement algorithm. The caller needs to implement a
    callback function and supply the relevant URI so that the PA can post there the
    result of its execution. # noqa: E501

    :param PARequest: Placement algorithm request information.
    :type PARequest: dict | bytes

    :rtype: PAResponse
    """
    if connexion.request.is_json:
        pa_req = connexion.request.get_json()
        pa_req = cluster(pa_req)
        return best_garrote(pa_req)
def _get_clusters(self, nbins=36):
    clusters = cluster.cluster(self.path, key=self.key, nbins=nbins)

    def _name_cluster(idx):
        if idx < 10:
            return str(idx)
        elif idx < nbins:
            return chr(idx - 10 + 65)
        elif idx == nbins:
            return '|'

    out = {}
    for idx, segs in clusters.items():
        out[_name_cluster(int(idx))] = [self._segments[int(aidx)] for st, dur, aidx in segs]
    return out
def construct_confidence_vectors_dataset(self, reduced_docs, features, session):
    from cluster import cluster
    conf_dataset = SupervisedDataSet(len(features), 1)
    confidence_vectors = []
    num_trues = 0
    for feature in features:
        vi = 0
        for doc in reduced_docs:
            feature_vectors = doc.get_feature_vectors([feature], session)
            confidences = cluster("outlier", 2, feature_vectors, center_at_mean=True,
                                  num_to_ignore=1, impurity=.2)
            for i, confidence in enumerate(confidences, 0):
                if len(confidence_vectors) <= vi:
                    confidence_vectors.append([[], 0])
                if doc.span_is_plagiarized(doc._spans[i]):
                    t = 1
                    num_trues += 1
                else:
                    t = 0
                confidence_vectors[vi][0].append(confidence)
                confidence_vectors[vi][1] = t
                vi += 1
    num_plagiarised = num_trues / len(features)
    print num_plagiarised
    shuffle(confidence_vectors)
    for vec in confidence_vectors:
        if vec[1] == 0:
            num_plagiarised -= 1
        if not (vec[1] == 0 and num_plagiarised <= 0):
            conf_dataset.addSample(vec[0], vec[1])
    f = open(self.dataset_filepath, 'wb')
    cPickle.dump(conf_dataset, f)
    print 'dumped dataset file'
    return conf_dataset
def train(self, pos_samples):
    def sig_gen_cb(left, right):
        samples = [pos_samples[s] for s in left['samples'] + right['samples']]
        new_sig = bayes.Bayes(minlen=self.minlen, kmin=self.kmin, kfrac=self.kfrac,
                              prune=True, statsfile=self.statsfile,
                              threshold_style='min', max_fpos=self.max_fp_count,
                              training_trace=self.fpos_training_streams)
        new_sig.train(samples)
        # score = min([new_sig.score(s) for s in samples])
        score = new_sig.threshold
        token_scores = new_sig.token_scores.values()
        if self.max_tokens_in_est:
            token_scores.sort(lambda x, y: cmp(y, x))
            score = sum(token_scores[:self.max_tokens_in_est])
        else:
            score = sum(token_scores)
        return (new_sig, score)

    import cluster
    self.clusters = cluster.cluster(sig_gen_cb, self.spec_threshold, pos_samples,
                                    max_fp_count=self.max_fp_count,
                                    fpos_training_streams=self.fpos_training_streams,
                                    bound_similarity=self.bound_similarity)
    if self.threshold_style != 'min':
        for c in self.clusters:
            if c['sig']:
                c['sig'].set_threshold()
    sigs = []
    for c in self.clusters:
        if len(c['samples']) >= self.min_cluster_size:
            sigs.append(c['sig'])
    return sigs
def clustering(fcallstacks_pool, show_plot, total_time, delta, bound):
    import math
    ''' 1. Preparing data '''
    data = []
    for cs in fcallstacks_pool:
        data.append([cs.repetitions[cs.rank], cs.instants_distances_mean])
    normdata = normalize_data(data)
    # plot_data(normdata)
    ''' 2. Perform clustering '''
    db = DBSCAN(eps=constants._eps, min_samples=constants._min_samples).fit(normdata)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    ''' 3. Creating cluster objects '''
    nclusters = len(set(labels)) - (1 if -1 in labels else 0)
    clusters_pool = []
    for i in range(0, nclusters):
        clusters_pool.append(cluster(i))
    assert len(labels) == len(fcallstacks_pool)
    for i in range(0, len(labels)):
        callstack_cluster_id = labels[i]
        fcallstacks_pool[i].cluster_id = callstack_cluster_id
        if fcallstacks_pool[i].cluster_id == -1:
            # Owned by no cluster
            continue
        clusters_pool[callstack_cluster_id].add_callstack(fcallstacks_pool[i])
    ''' 4. Show plots '''
    # if show_plot:
    show_plot_thread = multiprocessing.Process(
        target=show_clustering,
        args=(data, fcallstacks_pool, labels, core_samples_mask, nclusters,
              total_time, delta, bound))
    # show_plot_thread.start()
    return clusters_pool, show_plot_thread
def get_plagiarism_passages(text, atom_type, features, cluster_method='none', k=2):
    '''
    Return a list of passages, each of which contains a starting/ending index,
    its text, its atom_type, and a dictionary of its features
    '''
    # Extract passage objects (including their feature vectors)
    feature_extractor = FeatureExtractor(text)
    passages = feature_extractor.get_passages(features, atom_type)
    feature_vecs = [p.features.values() for p in passages]
    # If just testing feature extraction, don't cluster passages
    if cluster_method != 'none':
        # Cluster the passages and set their confidences
        confidences = cluster(cluster_method, k, feature_vecs)
        for psg, conf in zip(passages, confidences):
            psg.set_plag_confidence(conf)
    # List of passages with plag. conf. set
    return passages
def main():
    # matplotlib.use('qt5agg')
    import matplotlib.pyplot as plt
    from matplotlib.patches import Rectangle
    init_model()
    MAT_DIR = './mat/test'
    LABEL_DIR = './label/test'
    for dirpath, dirnames, filenames in os.walk(MAT_DIR):
        print(dirpath)
        for filename in filenames:
            if filename == 'full.mat':
                data = sio.loadmat(os.path.join(dirpath, filename))
                img = data['data']
                centers = detection(img)
                img_id = dirpath.split('/')[-1]
                label_file = os.path.join(LABEL_DIR, img_id + '.mat')
                labels = sio.loadmat(label_file)['label']
                distance = (lambda x1, y1, x2, y2: abs(x1 - x2) + abs(y1 - y2))
                centers = cluster(centers)
                TP = 0
                for x, y in labels:
                    for x_, y_ in centers:
                        if distance(x, y, x_, y_) < 36:
                            TP += 1
                            break
                precision = float(TP) / len(centers)
                recall = float(TP) / len(labels)
                f_score = 2 * (precision * recall) / (precision + recall)
                six.print_(precision, recall, f_score)
                f = open(dirpath.split('/')[-1] + '-predict.txt', 'w')
                for x, y in centers:
                    f.write(str(x) + ' ' + str(y) + '\n')
                f.close()
                f = open(dirpath.split('/')[-1] + '-label.txt', 'w')
                for x, y in labels:
                    f.write(str(x) + ' ' + str(y) + '\n')
                f.close()
def test(count=1300, k=3, mx=True, fc_done=None, nxt=False):
    global dts
    # dts = [[tdt[0] ** 2, tdt[1] ** 2] for tdt in dts]
    # dt.show_pt(dts, c=dt.color(0), s=1)
    if fc_done is None:
        fc_done = cut_done  # cluster.DefaultDone(count)
    rst = cluster.k_means(dts, k, fc_done, cluster.default_dst, mx)
    avgs = rst[0]
    dtss = rst[1]
    # avgs = rst[2]
    print len(avgs), len(dtss)
    for i in xrange(len(avgs)):
        avg = avgs[i]
        dt.draw([avg], c=dt.color(i), s=30)
        tdts = dtss[i]
        print len(tdts)
        if len(tdts) == 0:
            continue
        dt.draw(tdts, c=dt.color(i), s=1)
    dt.show()
    if not nxt:
        return None
    # return None
    tree = cluster.cluster(dts, k, cluster.min_dsts)
    print "done tree"
    sets = cluster.tree2sets(tree)
    # sets = cluster.cut_tree(tree, k, cluster.min_cost)
    print "done cut"
    print len(sets)
    for i in xrange(len(sets)):
        tdts = sets[i]
        print len(tdts)
        if len(tdts) == 0:
            continue
        dt.draw(tdts, c=dt.color(i), s=1)
    dt.show()
    return sets
def MProgram(Engine, Load, Key, Length, Put):
    print u'Basic modules loaded; you searched for:', Key
    iey = Key.decode("GB18030").encode("UTF-8")
    from search import search
    group = search(Engine, iey, Length)
    key = ['' for col in range(len(group[0]))]
    if Load == 'Goose':
        from load_Goose import load
    else:
        from load_BeautifulSoup import load
    ret = load(group[0], 4)
    from cluster import cluster
    type = cluster(ret[0])
    Key = Key.decode("GB18030")
    if Put == "SQL":
        from sql import input
        input(type, ret[1], group[1], Engine, Load, Key, Length)
        print u'Results have been written to the database'
    elif Put == "Text":
        from output import output
        output(type, ret[1], group[1], Engine, Load, Key, Length)
        print u'Results have been written to results\Index_' + Engine + '_' + Load + '_' + Key + '_' + Length + '.htm'
def main():
    data_from_pickle = 0
    projection_from_pickle = 1
    clusters_from_pickle = 1
    if data_from_pickle:
        uber_df, street_df, street_graph, node_coord_dict, coord_node_dict, \
            edge_dict, coord_lookup, transition_graph, trans_edge_dict, \
            edge_trans_dict, trans_dict = load_data.from_pickle()
    else:
        uber_df, street_df, street_graph, node_coord_dict, coord_node_dict, \
            edge_dict, coord_lookup, transition_graph, trans_edge_dict, \
            edge_trans_dict, trans_dict = load_data.load_fresh()
    ''' apply Kalman filter first pass here, fix large errors '''
    if projection_from_pickle:
        print 'reading in projected uber_df from pickle...'
        uber_df = pickle.load(open('../pickles/uber_df_projected.pkl'))
        print 'read projected uber_df'
    else:
        uber_df = project.project(uber_df, street_graph, transition_graph,
                                  node_coord_dict, edge_dict, edge_trans_dict,
                                  coord_lookup)
        pickle.dump(uber_df, open('../pickles/uber_df_projected.pkl', 'wb'))
    ''' apply Kalman filter second pass here? fix small errors and re-project onto edges? '''
    if clusters_from_pickle:
        uber_df, centroids = cluster.from_pickle()
    else:
        uber_df, centroids = cluster.cluster(uber_df)
    transition_graph = cluster_graphs.preadjust_transweights(uber_df, edge_dict, transition_graph)
    pickle.dump(transition_graph, open('../pickles/transition_graph_update.pkl', 'wb'))
    cgraphs = cluster_graphs.make_cluster_graphs(centroids, transition_graph, uber_df, edge_dict)
    pickle.dump(cgraphs, open('../pickles/cgraphs.pkl', 'wb'))
from xlwt import *
from re import *
import re
import os
import shutil
import textExtractor
import cluster

# input
# the path where the jobs are located must be specified here
clusterName = 'Tsinghua100'
clusterPath = '/WORK/newGroupAdditivityFrog2/banana/validation_g_M06'
jobsPerSlot = 12

# constants
cluster1 = cluster.cluster(clusterName, clusterPath)
cluster1._g09D01 = True
pattern_logFile = re.compile('^(C[0-9]*H[0-9]*_*[0-9]*_+[r0-9]+_+[CO0-9]+).*\.log$')
pattern_fileConf = re.compile('^(C[0-9]*H[0-9]*_[0-9]*_[0-9]+)_[0-9]+_.*$')
pattern_gjfFile = re.compile('^(C[0-9]*H[0-9]*_*[0-9]*_+[r0-9]+_+[CO0-9]+).*\.gjf$')
pattern_multi = re.compile('^.*Multiplicity = ([0-9]+).*$')
pattern_optimized = re.compile('^.*Optimized Parameters.*$')
pattern_standard = re.compile('^.*Standard orientation:.*$')
pattern_input = re.compile('^.*Input orientation:.*$')
pattern_endline = re.compile('^.*---------------------------------------------------------------------.*$')
# pattern_energy = re.compile('^.*Sum of electronic and zero-point Energies= *(-?[0-9]+\.[0-9]+).*$')
pattern_energy = re.compile('^.*SCF Done: E\([RU]B3LYP\) = *([\-\.0-9Ee]+) +A\.U\. after.*$')
pattern_end = re.compile('^.*Normal termination of Gaussian 09.*$')

# variables
import matplotlib as mpl
import BUM
import neuropower
import cluster
import peakdistribution
import simul_multisubject_fmri_dataset
import model

EXDIR = sys.argv[1]
FIGDIR = sys.argv[2]

exc = 2
maskfile = os.path.join(EXDIR, "Mask.nii")
SPM = nib.load(os.path.join(EXDIR, "Zstat1.nii")).get_data()
peaks = cluster.cluster(SPM, exc)

# compute P-values
pvalues = np.exp(-exc * (np.array(peaks.peak) - exc))
pvalues = [max(10**(-6), t) for t in pvalues]
peaks['pval'] = pvalues

# estimate model
bum = BUM.bumOptim(peaks['pval'].tolist(), starts=10)
modelfit = neuropower.modelfit(peaks.peak, bum['pi1'], exc=exc, starts=10, method="RFT")

# predict power
thresholds = neuropower.threshold(peaks.peak, peaks.pval, FWHM=8, mask=nib.load(maskfile),
                                  alpha=0.05, exc=exc)
import numm
import random
import cluster

R = 44100
# PADDING = R / 4  # frames between segments
# SOURCE = 'snd/Dance_A.wav'
SOURCE = 'snd/Duran_A.wav'
NBINS = 50

cur_cluster = 0
cluster_idx = 0
paused = False
frame_idx = 0

audio = numm.sound2np(SOURCE)
clusters = cluster.cluster(SOURCE, NBINS)
for c in clusters.values():
    random.shuffle(c)


def get_segment(cluster, idx):
    idx = idx % len(clusters[cluster])
    start, duration = clusters[cluster][idx]
    return audio[int(R * start):int(R * (start + duration))]


def audio_out(a):
    global frame_idx, cluster_idx, paused
    if paused:
        paused = False
        return
__author__ = 'Nhuy'
import sys
from cluster.cluster import *

cluster(sys.argv[1], sys.argv[2], int(sys.argv[3]))
def main(argv):
    ###########################################################################
    # figure out the input
    ###########################################################################
    try:
        opts, args = getopt.getopt(argv, "hptk:d:s:l:")
    except getopt.GetoptError:
        print "Usage: main.py [-h] [-p] [-t] [-k #clusters] "\
              "[-d downsample rate] [-s save file] [-l load file] directory"
        sys.exit(2)
    # initial parameters for specific methods
    show = False
    trans = False
    k = 4
    down = 1
    save = None
    load = None
    ml = None
    # options
    for opt, arg in opts:
        # help function
        if opt == '-h':
            print "Usage: main.py [-h] [-p] [-t] [-k #clusters] "\
                  "[-d downsample rate] [-s save file] [-l load file] directory"
            sys.exit()
        # show pictures of clustering and classification
        elif opt == '-p':
            show = True
        # translate NITFs
        elif opt == '-t':
            trans = True
        # number of clusters
        elif opt == '-k':
            if int(arg) <= 0:
                print "Error: k must be positive"
                sys.exit(2)
            k = int(arg)
        # downsample rate
        elif opt == '-d':
            if int(arg) < 0:
                print "Error: downsample rate cannot be negative"
                sys.exit(2)
            down = int(arg)
        # save file
        elif opt == '-s':
            if arg == '':
                print "Error: save file must have a name"
                sys.exit(2)
            save = arg
        # load file
        elif opt == '-l':
            if arg == '':
                print "Error: load file must have a name"
                sys.exit(2)
            load = arg
        # unhandled option
        else:
            assert False, "Error: unhandled option"
    # image directory
    if len(args) != 1:
        print "Usage: main.py [-h] [-p] [-t] [-k #clusters] "\
              "[-d downsample rate] [-s save file] [-l load file] directory"
        sys.exit(2)
    folder = args[0]

    ###########################################################################
    # translate the NITFs to TIFs if needed
    ###########################################################################
    if trans:
        # translate
        high = translate(folder)
        # give user time to make first batch of training/testing data
        print "Waiting for user to draw first batch of training/testing data."
        raw_input("Press Enter to continue...")
        # cluster the TIFs
        allImages = cluster(folder, high, show=True)
        # classify crop fields
        allImages, results, ml = classify(folder, allImages, high, k=k, down=down)
        # save the file
        if save is not None:
            saveImages(folder, allImages, save)
        # if the classification is satisfactory return results. Otherwise,
        # unsatisfactory results so create masks.
        answer = raw_input("Is this classification satisfactory? y/n\n")
        while True:
            if answer == 'y':
                return results
            elif answer == 'n':
                createMask(folder, allImages)
                break
            else:
                answer = raw_input("Please input 'y' or 'n' and press Enter\n")

    ###########################################################################
    # or if there is a save file ready for use
    ###########################################################################
    elif load is not None and save is None:
        # load the image list
        allImages = loadImages(folder, load)
        # print the error rates from before the save
        for img in allImages:
            # dont print anything if no error rates are calculated
            if np.sum(img.error) == 0.0:
                continue
            # print error rates for each cluster
            print "Error rates of " + img.name + ": "
            for i in xrange(k):
                print "cluster" + str(i) + ": " + str(img.error[i])
            # total error rate for image
            totalError = np.sum(img.error)
            print "Total error rate: " + str(totalError) + '\n'

    ###########################################################################
    # or if there is not a save file ready, and/or translation is not required
    ###########################################################################
    else:
        # give user time to make first batch of training/testing data
        print "Waiting for user to draw first batch of training/testing data."
        raw_input("Press Enter to continue...")
        # cluster the TIFs
        allImages = cluster(folder, show=show, k=k, down=down)
        # classify crop fields
        allImages, results, ml = classify(folder, allImages, k=k, down=down, show=show)
        # save the files
        if save is not None:
            saveImages(folder, allImages, save)
        # if the classification is satisfactory return results. Otherwise,
        # unsatisfactory results so create masks.
        answer = raw_input("Is this classification satisfactory? y/n\n")
        while True:
            if answer == 'y':
                return results
            elif answer == 'n':
                createMask(folder, allImages)
                break
            else:
                answer = raw_input("Please input 'y' or 'n' and press Enter\n")

    ###########################################################################
    # iteratively classify new training data based on errors
    ###########################################################################
    while True:
        # calculate new number of polygons required for each cluster
        image.calculatePolygons(allImages, k, N=20)
        # wait for the user to create more T and Q files based on calculations
        print "Waiting for user to draw new training/testing data."
        raw_input("Press Enter to continue...")
        # reclassify
        if trans:
            allImages, results, ml = classify(folder, allImages, high, k=k, ml=ml, show=show)
        else:
            allImages, results, ml = classify(folder, allImages, k=k, ml=ml, show=show)
        # if classification is satisfactory, then return results
        answer = raw_input("Is this classification satisfactory? y/n\n")
        while True:
            if answer == 'y':
                return results
            elif answer == 'n':
                break
            else:
                answer = raw_input("Please input 'y' or 'n' and press Enter\n")
def main(fileNames, target='bank'):
    maxFeatures = 10
    windowSize = 20
    freqCount = {}
    targetAppearances = []
    wordVectors = {}
    ##
    ## Choose featureset by analyzing data.
    ## First go through dataset and create a list of
    ## TargetAppearance objects
    ##
    #filePathPrefix = basedir  #'.'
    #try:
    #    os.chdir(filePathPrefix)
    #except OSError, e:
    #    print e
    #
    #fileNames = os.listdir(os.curdir)
    ##
    ## perform the analysis for each datafile in the directory
    ## during this step we also collect all the TargetAppearance
    ## objects and contextWindows
    ##
    startTime = time.time()
    for file in fileNames:
        if not file.endswith('.txt'):
            continue
        print 'opening file: ', file
        f = open(file, 'r')
        targetAppearances = findAppearances(f, target, windowSize, targetAppearances)
        f.close()  ## play nice with others...
    stopTime = time.time()
    print ('**Training Phase --> datafile analysis time: %.3f seconds.' % (stopTime - startTime))
    print ('**Training Phase --> found ', len(targetAppearances), ' instances of target word: ', target)
    ## once we have analyzed all the data files, we analyze to choose
    ## the features (# of dimensions) of the word vector space
    freqCount = analyzeLocalDistribution(f, target, windowSize, targetAppearances)
    ## freqList is a purely frequency driven selection
    if freqCount:
        featureList = freqBasedLocalSelection(freqCount, maxFeatures)
    else:
        print 'No occurrences of target word: ', target, '.'
    ##
    ## word vector creation
    ##
    startTime = time.time()
    for file in fileNames:
        if not file.endswith('.txt'):
            continue
        tempWordVectors = {}
        f = open(file, 'r')
        print 'wordVector: opening ', file
        ## derive the word vectors for that file
        tempWordVectors = createWordVector(f, windowSize, featureList)
        f.close()
        ## add the resulting vectors to our total
        if tempWordVectors:
            if wordVectors:
                for key, val in tempWordVectors.items():
                    sumVectors(wordVectors[str(key)], tempWordVectors[str(key)])
            else:
                wordVectors = tempWordVectors
    stopTime = time.time()
    print ('**Training Phase --> word vector creation time: %.3f seconds.' % (stopTime - startTime))
    ##
    ## context vector creation
    ##
    for tA in targetAppearances:
        tA.setContextVector(wordVectors)
    ##
    ## sense cluster creation
    ##
    print '**Training Phase --> sense clustering'
    senses = []
    senses = cluster.cluster(targetAppearances)
    ###################################
    ##
    ## testing phase
    ##
    ###################################
    tempWordVectors = {}
    testWordVectors = {}
    testAppearances = []
    startTime = time.time()
    for file in fileNames:
        if not file.endswith('.tst'):
            continue
        print 'opening file: ', file
        f = open(file, 'r')
        ## first find the appearances of the target word in the text
        testAppearances = findAppearances(f, target, windowSize, testAppearances)
        tempWordVectors = createWordVector(f, windowSize, featureList)
        f.close()  ## play nice with others...
        if tempWordVectors:
            if wordVectors:
                for key, val in tempWordVectors.items():
                    testWordVectors = sumVectors(testWordVectors[str(key)], tempWordVectors[str(key)])
    stopTime = time.time()
    print ('**Testing Phase --> file analysis time: %.3f seconds.' % (stopTime - startTime))
    ##
    ## context vector creation
    ##
    for tA in testAppearances:
        tA.setContextVector(wordVectors)
def ksplit(data_list, ks_crit):
    centroids, labels_init = kmeans2(data_list, 2, minit='points')
    count = 1
    labels = labels_init
    cont = ones(1)
    llist = zeros((5000, 5000))
    mask = where(llist == 0)
    llist[mask] = -1
    list_list = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2**12]
    l = 0
    for w in range(len(list_list)):
        for k in range(list_list[w]):
            llist[w, k] = l
            l += 1
    j = 0
    p = 2
    # Runs until the sum of cont == 0; when cont > 1 this means that there are
    # still clusters to be clustered
    while sum(cont) >= 1:
        cont = zeros(2**count)
        m = 0
        for i in llist[j]:
            # Using the -1 mask on llist allows the preset cluster numbers to reset.
            if i == -1:
                break
            # Sets r = positions of the ith cluster
            r = data_list[labels_init == i]
            if len(r) == 0:
                continue
            # Feeds the positions of the ith cluster to ks_means which splits the
            # data into 2 further clusters, then the KS test is used and a p-val
            # is returned.
            ks, centroids_, labels = cluster(r)
            if ks > ks_crit:
                # This is the condition for a good cluster
                continue
            elif ks == -1:
                break
            else:
                # Re-labels the bifurcated clusters because of non-gaussianity
                a = where(labels_init == i)
                for w in range(2):
                    b = where(labels == w)
                    labels[b] = p
                    p += 1
                centroids = vstack((centroids, centroids_))
                labels_init[a] = labels
                cont[m] = 1
                m += 1
        count += 1
        j += 1
    no_delete = sort(list(set(labels_init)))
    reverse_count = range(max(no_delete) + 1)
    for k in reverse_count[::-1]:
        kill = 1
        for n in no_delete:
            if k == n:
                break
            else:
                if kill == len(no_delete):
                    centroids = delete(centroids, k, 0)
                kill += 1
                continue
    return centroids, labels_init, ks_crit
# get some help!
if sys.argv[-1] == '--help':
    print("""Usage:\n \tmakeSnpFile.py inputFile.out --option\n
Options:
--help\t\tget this menu
--snpdata\tprint out information about all snps
--stats\t\tget stats for each cluster
--dadi\t\tprint a formatted dadi snp file to stdout (use > file.txt to save to a file)""")

# just for fun, a way to print out data for all snps
if sys.argv[-1] == '--snpdata':
    for i in goodData:
        print(cluster(i).snpData)

# and for the 'stats' line
if sys.argv[-1] == '--stats':
    for i in goodData:
        print(cluster(i).stats)

header = "Inpop\tOutpop\tAllele1\t1\t2\t3\tAllele2\t1\t2\t3\tClstr\tPosition"
if sys.argv[-1] == '--dadi':
    print(header)
    for i in goodData:
        currentCluster = cluster(i).outputDadi()
        if currentCluster != None:
            print(currentCluster)
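# Hedged usage sketch: typical invocations of this script from a shell, assuming it
# is saved as makeSnpFile.py as the usage text above suggests.
#   python makeSnpFile.py inputFile.out --stats
#   python makeSnpFile.py inputFile.out --dadi > dadi_snps.txt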
def train(self, pos_samples):
    if self.tokenize_all:
        pos_samples = self._tokenize_samples(pos_samples)
    if self.do_cluster:
        def sig_gen_cb(left, right):
            lsig = left['sig']
            rsig = right['sig']
            # tokenize if possible
            if not lsig and not rsig and self.tokenize_pairs:
                lsig = pos_samples[left['samples'][0]]
                rsig = pos_samples[right['samples'][0]]
                (lsig, rsig) = self._tokenize_samples([lsig, rsig])
            else:
                if lsig:
                    lsig = lsig.lcs
                else:
                    lsig = list(pos_samples[left['samples'][0]])
                if rsig:
                    rsig = rsig.lcs
                else:
                    rsig = list(pos_samples[right['samples'][0]])
            # find the common subsequence
            lcs = self._find_lcs(lsig, rsig)
            t = self._lcs_to_tuple(lcs)
            sig = TupleSig(lcs, t)
            # print self._lcs_to_regex(sig)
            # calculate a score for the resulting signature
            scores = []
            for token in t:
                # prob = sigprob.regex_prob(token, 1000, stats=self.statsfile)[-1]
                prob = sig_gen.est_fpos_rate(token, self.fpos_training_streams)
                scores.append(-math.log(prob + 1e-300) / math.log(10))
            # using all the token scores overly favors signatures
            # with many tokens. Current fix is to only use most distinctive
            # tokens to calculate the score.
            if self.max_tokens_in_est:
                scores.sort(lambda x, y: cmp(y, x))
                score = sum(scores[:self.max_tokens_in_est])
            else:
                score = sum(scores)
            return (sig, score)

        import cluster
        clusters = cluster.cluster(sig_gen_cb, self.spec_threshold, pos_samples,
                                   max_fp_count=self.max_fp_count,
                                   fpos_training_streams=self.fpos_training_streams,
                                   min_cluster_size=self.min_cluster_size,
                                   bound_similarity=self.bound_similarity)
        # return the tuple signatures for the final clusters
        self.tuple_list = []
        sigs = []
        for c in clusters:
            if len(c['samples']) >= self.min_cluster_size:
                self.tuple_list.append(c['sig'].tuplesig)
                sigs.append(c['sig'])
        self.clusters = clusters
        return sigs
    else:
        # Find a subsequence common to all the samples
        self.lcs = pos_samples[0]
        for sample in pos_samples[1:]:
            self.lcs = self._find_lcs(self.lcs, sample)
        # Return the final signature
        regex_string = self._lcs_to_regex(self.lcs)
        if self.use_fixed_gaps:
            return [RegexSig(self._lcs_to_regex(self.lcs))]
        else:
            return [TupleSig(self.lcs, self._lcs_to_tuple(self.lcs))]
        x = self.Y[:, 0]
        y = self.Y[:, 1]
        assert len(self.UP_pages.pages) == x.size
        for i in range(x.size):
            write_file.write(self.filename2Url(self.UP_pages.pages[i].path) + "\t" +
                             str(group_list[i]) + "\t" + str(x[i]) + "\t" + str(y[i]) + "\n")

    def filename2Url(self, filename):
        return filename.replace("_", "/")


if __name__ == '__main__':
    #UP_pages = allPages(["../Crawler/toy_data/users/","../Crawler/toy_data/questions/","../Crawler/toy_data/lists/"])
    #UP_pages = allPages(["../Crawler/crawl_data/Users/","../Crawler/crawl_data/Outlinks_U/","../Crawler/crawl_data/Noise/"])
    UP_pages = allPages(["../Crawler/crawl_data/Questions/"])
    v = visualizer(UP_pages)
    user_group = cluster()
    for i in range(len(UP_pages.ground_truth)):
        if UP_pages.ground_truth[i] == 1:
            page = UP_pages.pages[i]
            user_group.addPage(page)
    global_threshold = len(UP_pages.pages) * 0.9
    print len(user_group.pages)
    user_group.find_local_stop_structure(UP_pages.nidf, global_threshold)
    v.show(v.UP_pages.ground_truth, "ground_truth.test")
    '''
    UP_pages = allPages(["../Crawler/crawl_data/Questions/"])
    feature_matrix = []
    for page in UP_pages.pages:
        tfidf_vector = []
def problem3(docs):
    doc_cluster = cluster(docs)
    predictions = doc_cluster.hierarchical()
    return predictions
# encoding: utf-8
import os
import sys
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/lib')
import graphs
import cluster
import hamming

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print "usage: ./homework2.py [1|2|3] <inputfile>"
        sys.exit(0)
    question = sys.argv[1]
    filename = sys.argv[2]
    if len(sys.argv) > 3:
        param = sys.argv[3]
    if question == '1':
        graph = graphs.parse(filename, undirected=True)
        mindist, clust = cluster.cluster(graph, int(param))
        for key, values in clust.items():
            print "{}:".format(key)
            for v in values:
                print "\t{}".format(v)
        print "Shortest distance: {}".format(mindist)
    if question == '2':
        vertices, radix = hamming.parse(open(filename, 'r'))
        clustering = hamming.cluster(vertices, radix)
        print "Found {} clusters".format(clustering.clusters)
spm = spm - enn.mu
ps = spm.flatten()
ps = [x for x in ps if x == x]

xn = np.arange(-10, 10, 0.01)
twocol = Paired_12.mpl_colors
plt.figure(figsize=(7, 5))
plt.hist(ps, lw=0, facecolor=twocol[0], normed=True, bins=np.arange(-2, 10, 0.3),
         label="observed distribution")
plt.xlim([-2, 10])
plt.ylim([0, 0.5])
plt.plot(xn, stats.norm.pdf(xn), color=twocol[1], lw=3, label="null distribution")
plt.show()

peaks = cluster.cluster(spm)
peaks['pval'] = peakdistribution.peakp(peaks.peak.tolist())
bum = BUM.bumOptim(peaks["pval"].tolist(), starts=10)
modelfit = neuropower.TFpeakfit(peaks['peak'].tolist(), bum['pi1'])

xn = np.arange(-10, 10, 0.01)
twocol = Paired_12.mpl_colors
plt.figure(figsize=(7, 5))
plt.hist(peaks['peak'].tolist(), lw=0, facecolor=twocol[0], normed=True,
         bins=np.arange(-2, 10, 0.3), label="observed distribution")
plt.xlim([-2, 10])
plt.ylim([0, 0.5])
plt.plot(xn, [(1 - bum["pi1"]) * peakdistribution.peakdens3D(p, 1) for p in xn],
         color=twocol[3], lw=3, label="null distribution")
plt.plot(xn, [bum["pi1"] * peakdistribution.peakdens3D(p - modelfit['delta'], 1) for p in xn],
         color=twocol[5], lw=3, label="alternative distribution")
plt.plot(xn, neuropower.mixprobdens(modelfit["delta"], bum["pi1"], xn),
         color=twocol[1], lw=3, label="fitted distribution")
x2 = randn(members) + i
y2 = randn(members)
x = append(x1, x2)
y = append(y1, y2)
data_list = []
for m in range(len(x)):
    data = array([x[m], y[m]])
    data_list.append(data)
data_list = vstack(data_list)
ks = kstest(x, 'norm')
ks1 = kstest(y1, 'norm')
ks, ad, cent, labels = cluster(data_list)
KSstat = append(KSstat, ks)
ADstat = append(ADstat, ad)
AD_mu = mean(ADstat)
KS_mu = mean(KSstat)
ADmean = append(ADmean, AD_mu)
KSmean = append(KSmean, KS_mu)

ADmean = ADmean / max(abs(ADmean))
KSmean = KSmean / max(abs(KSmean))
pylab.figure()
pylab.subplot(2, 1, 1)
pylab.plot(dist, ADmean, label='AD')