コード例 #1
0
ファイル: data.py プロジェクト: kaiserahmed/CubeRoot
def main(argv=None):
    """Print the k-means RSS for every cluster count k from 2 to 15.

    Loads the inverted index from "index/fullindex.csv" and the web
    indexer, builds the vector space, and then obtains one RSS value per k.
    The list of RSS values is cached with pickle in "index/kmeans": when
    that file exists it is loaded instead of re-running the clustering,
    otherwise the clustering runs (printing each k as it starts) and the
    file is written afterwards.

    The result is written to stdout as a brace-delimited block with one
    tab-indented '"k" : rss' entry per line, separated by commas.

    Args:
        argv: Unused.

    Raises:
        IndexError: If a cached result list holds fewer entries than the
            number of k values being reported.
    """
    index = ii.InvertedIndex()
    ii.load("index/fullindex.csv", index)
    indexer = wi.WebIndexer()
    indexer.load()

    vSpace = vs.VectorSpace(index, indexer)
    vSpace.buildVectors()

    minK = 2
    maxK = 15
    n = 2
    cachePath = "index/kmeans"

    if os.path.exists(cachePath):
        # NOTE: pickle.load can execute arbitrary code; this file is expected
        # to be the cache written by the branch below, never external data.
        with open(cachePath, "rb") as cacheFile:
            results = pickle.load(cacheFile)
    else:
        results = []
        for k in range(minK, maxK + 1):
            print(k)  # progress indicator; single-argument form works on Python 2 and 3
            # Only the RSS is kept; the clusters and centroids are discarded.
            _, _, rss = vSpace.kMeansBestOfN(k, n * k)
            results.append(rss)
        # The context manager guarantees the cache is flushed and closed.
        with open(cachePath, "wb") as cacheFile:
            pickle.dump(results, cacheFile)

    # Index explicitly (rather than zip) so a too-short cache still fails loudly.
    entries = [
        '\t"' + str(k) + '" : ' + str(results[k - minK])
        for k in range(minK, maxK + 1)
    ]
    sys.stdout.write("{\n" + ",\n".join(entries) + "\n}\n")
コード例 #2
0
ファイル: main.py プロジェクト: TheSisb/CubeRoot
def main(argv=None):
    """Run a one-shot interactive query against the clustered collection.

    Loads the inverted index from "index/fullindex.csv" and the web
    indexer, builds the vector space, clusters it with kMeansBestOfN
    (k = 8, n = 1), then reads a single query from the "> " prompt. The
    tokenised terms are printed, passed one by one through sc.correct and
    printed again. The corrected terms are turned into a query vector, the
    documents returned by nearestCluster are mapped to URLs through
    indexer.urls, and the URLs are printed as a list numbered from 1.

    Args:
        argv: Unused.
    """
    # Sample: indexing the collection from scratch (not needed when the
    # saved index is loaded as below):
    #     index = ii.InvertedIndex()
    #     indexer = wi.WebIndexer()
    #     tokeniser = tk.Tokeniser()
    #     indexer.spimi(index, tokeniser)

    # Load the saved index and indexer state.
    invertedIndex = ii.InvertedIndex()
    ii.load("index/fullindex.csv", invertedIndex)
    indexer = wi.WebIndexer()
    indexer.load()

    # Generate the vector space.
    space = vs.VectorSpace(invertedIndex, indexer)
    space.buildVectors()

    # Sample: simple K-means (alternative to the best-of-N call below):
    #     k = 3
    #     w, u, rss = vSpace.kMeans(k)
    # where
    #     w:   list of K clusters, each a list of doc IDs
    #     u:   list of K centroids [vSpace.centroid(w[0]), ..., vSpace.centroid(w[k-1])]
    #     rss: total RSS value for this clustering scheme

    # K-means with the smallest RSS using N different seeds.
    clusterCount = 8
    seedCount = 1
    clusters, centroids, _rss = space.kMeansBestOfN(clusterCount, seedCount)

    # Tokenise the user's query.
    tokeniser = tk.Tokeniser()
    rawQuery = raw_input("> ")
    terms = tokeniser.tokenise(rawQuery.strip())
    print(terms)

    # Edit distance: replace every term with its corrected form.
    corrected = []
    for term in terms:
        corrected.append(sc.correct(term))
    terms = corrected
    print(terms)

    queryVector = space.buildQueryVector(terms)
    matchingDocs = space.nearestCluster(clusters, centroids, queryVector)

    # Resolve every doc ID to its URL before printing anything.
    urlList = [indexer.urls[docId] for docId in matchingDocs]

    for position, url in enumerate(urlList, 1):
        print("%s : %s" % (position, url))