def main(argv=None): index = ii.InvertedIndex() ii.load("index/fullindex.csv", index) indexer = wi.WebIndexer() indexer.load() vSpace = vs.VectorSpace(index, indexer) vSpace.buildVectors() minK = 2 maxK = 15 n = 2 results = [] if not os.path.exists("index/kmeans"): for k in range(minK, maxK + 1): print k w, u, rss = vSpace.kMeansBestOfN(k, n * k) results.append(rss) pickle.dump(results, open("index/kmeans", "wb")) else: results = pickle.load(open("index/kmeans", "rb")) print "{" for i in range(minK, maxK + 1): sys.stdout.write('\t"' + str(i) + '" : ' + str(results[i - minK])) if i != (maxK): print "," print "\n}"
def main(argv=None): """ # Sample: Indexing the collection index = ii.InvertedIndex() indexer = wi.WebIndexer() tokeniser = tk.Tokeniser() indexer.spimi(index, tokeniser) """ # Sample: Loading the index (don't need to if you just indexed, see above) index = ii.InvertedIndex() ii.load("index/fullindex.csv", index) indexer = wi.WebIndexer() indexer.load() # Sample: Generating the vector space vSpace = vs.VectorSpace(index, indexer) vSpace.buildVectors() # Sample: Simple K-means """ k = 3 # w: List of K clusters [ [docId, docId, ...], [docId, docId, ...], [docId, docId, ...] ] # u: List of K centroids [ vSpace.centroid(w[0]), ..., vSpace.centroid(w[k-1]) ] # rss: total RSS value for this clustering scheme w, u, rss = vSpace.kMeans(k) """ # Sample: K-means with the smallest RSS using N different seeds k = 8 n = 1 w, u, rss = vSpace.kMeansBestOfN(k, n) # Sample: Tokenise input tokeniser = tk.Tokeniser() userInput = raw_input("> ").strip() terms = tokeniser.tokenise(userInput) print terms # Sample: Edit distance terms = [sc.correct(term) for term in terms] print terms queryVector = vSpace.buildQueryVector(terms) docList = vSpace.nearestCluster(w, u, queryVector) # Sample: getting a list of URLs from a list of doc IDs urlList = [indexer.urls[docId] for docId in docList] count = 1 for url in urlList: print count, ":", url count += 1