def read_clusters(clusters_filename):
    """
    Rebuild a sekitei object and per-cluster feature lists from a text dump.

    The file is parsed line by line:

    * a line starting with ``---`` is a cluster header; after the dashes the
      first token is the cluster id and the third token is the number of
      feature lines that follow;
    * while feature lines are still expected, each line must consist of
      exactly two whitespace-separated tokens: the feature and its integer
      index;
    * any other line whose first token is ``n_features=`` sets the feature
      count and resets ``tags_order`` to that many empty strings.

    All remaining lines are ignored.

    Returns ``(mysekitei, regexpes)`` where ``regexpes`` is a list of
    ``[class, freq_features, their_indices]`` entries, one per cluster that
    declared at least one feature.
    """
    model = sekitei([], alpha=0.01)
    parsed = []
    cluster_id, remaining = 0, 0
    features, positions = [], []

    with open(clusters_filename, 'r') as src:
        for line in src:
            # Cluster header: "--- <id> = <count>".
            if line.startswith('---'):
                header = line[3:].split()
                cluster_id, remaining = int(header[0]), int(header[2])
                continue

            # Feature line belonging to the cluster announced by the header.
            if remaining:
                tag, pos = line.split()
                pos = int(pos)
                model.tags.add(tag)
                model.tags_order[pos] = tag
                features.append(tag)
                positions.append(pos)
                remaining -= 1
                if remaining == 0:
                    # Cluster complete: store it and start fresh accumulators.
                    parsed.append([cluster_id, features, positions])
                    features, positions = [], []
                continue

            # Outside of a cluster only the "n_features=" line is meaningful.
            tokens = line.split()
            if tokens and tokens[0] == 'n_features=':
                model.n_features = int(tokens[1])
                model.tags_order = [''] * model.n_features

    return model, parsed
def get_clusters(good_urls, urls, n_urls=500, my_dbs=False, verbose=False): """ """ random.shuffle(good_urls) random.shuffle(urls) fit_urls = good_urls[:n_urls] + urls[:n_urls] mysekitei = sekitei(fit_urls, alpha=0.01) mysekitei.fit() X = mysekitei.most_freq_features() if my_dbs: py = dbscan().fit_predict(X) else: py = DBSCAN().fit_predict(X) hist = [] clusters = list( set(py) ) with open('data/clusters_features.txt', 'w') as file: print >>file, mysekitei.n_features print >>file, '\n\n\n', '\n'.join(mysekitei.tags_order[:mysekitei.n_features]), '\n\n' for c in clusters: hist.append(len([p for p in py if p == c])) # print >>f, c, ':', hist[-1] vizualize_clusters(X, ([1] * n_urls + [0] * n_urls), py, hist) regexpes = mysekitei.get_clusters_regexpes(X, py) with open('data/clusters_freq_features.txt', 'w') as file: print 'n_features=', mysekitei.n_features, '\n\n' print >>file, 'n_features=', mysekitei.n_features, '\n\n' for c,f,i in regexpes: print '---', c, '=', str(len(f)) print '\n'.join([fi + '\t\t\t ' + str(ii) for fi,ii in zip(f,i)]), '\n' print >>file, '---', c, '=', str(len(f)) print >>file, '\n'.join([fi + '\t\t\t ' + str(ii) for fi,ii in zip(f,i)]), '\n' with open('data/united_regexpes.txt', 'w') as file: for k,f,i in regexpes: rex = '^' for r in f[:-1]: rex += '(?=%s)' % r.strip('^').rstrip('$') rex += '%s' % f[-1].strip('^') print >>file, k, '=', rex return mysekitei, regexpes
def get_clusters(good_urls, urls, n_urls=500, my_dbs=False, verbose=False): """ """ random.shuffle(good_urls) random.shuffle(urls) fit_urls = good_urls[:n_urls] + urls[:n_urls] mysekitei = sekitei(fit_urls, alpha=0.01) mysekitei.fit() X = mysekitei.most_freq_features() if my_dbs: py = dbscan().fit_predict(X) else: py = DBSCAN().fit_predict(X) regexpes = mysekitei.get_clusters_regexpes(X, py) print 'n_features=', mysekitei.n_features, '\n\n' for c,f,i in regexpes: print '---', c, '=', str(len(f)) print '\n'.join([fi + '\t\t\t ' + str(ii) for fi,ii in zip(f,i)]), '\n' return mysekitei, regexpes