Example #1
def get_ltuo_hashtag_and_model_rank_accuracy_and_random_rank_accuracy(file):
    # Parse a comma-separated file into a list of
    # [hashtag, model_rank_accuracy, random_rank_accuracy] tuples.
    ltuo_hashtag_and_model_rank_accuracy_and_random_rank_accuracy = []
    for data in FileIO.iterateLinesFromFile(file):
        # Take columns 2-4, then reorder so the hashtag field leads.
        data = data.split(',')[2:5]
        ltuo_hashtag_and_model_rank_accuracy_and_random_rank_accuracy.append(
            [float(i) for i in [data[2], data[0], data[1]]])
    return ltuo_hashtag_and_model_rank_accuracy_and_random_rank_accuracy
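
# For context, a minimal sketch of the column layout this parser assumes;
# the sample line and field meanings below are hypothetical, inferred only
# from the slicing above.
sample_line = "key,count,0.82,0.41,12345"
fields = sample_line.split(',')[2:5]
# Reordered to [hashtag, model_rank_accuracy, random_rank_accuracy],
# matching the function's return layout.
row = [float(fields[2]), float(fields[0]), float(fields[1])]
print(row)  # [12345.0, 0.82, 0.41]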
Example #2

def iterateFrequentLocationsFromFIMahout(
    minLocationsTheUserHasCheckedin,
    minUniqueUsersCheckedInTheLocation,
    minCalculatedSupport,
    minLocationsInItemset=0,
    extraMinSupport=minSupport,  # minSupport is a module-level default in the surrounding project
    yieldSupport=False,
    lids=False,
):
    for line in FileIO.iterateLinesFromFile(
        locationsFIMahoutOutputFile
        % (minLocationsTheUserHasCheckedin, minUniqueUsersCheckedInTheLocation, minCalculatedSupport)
    ):
        if line.startswith("Key:"):
            # Each "Key:" line carries a "Value: (itemset,support)" payload.
            data = line.split("Value: ")[1][1:-1].split(",")
            if not lids:
                # Resolve each lid (which appears to encode "lat_lng") into a location.
                locationItemset = [getLocationFromLid(i.replace("_", " ")) for i in data[0][1:-1].split()]
            else:
                # Keep raw lid strings.
                locationItemset = [i.replace("_", " ") for i in data[0][1:-1].split()]
            support = int(data[1])
            if support >= extraMinSupport and len(locationItemset) >= minLocationsInItemset:
                if not yieldSupport:
                    # Assumes locationItemset holds coordinate pairs (lids=False).
                    yield [location for location in locationItemset if isWithinBoundingBox(location, us_boundary)]
                else:
                    # Assumes locationItemset holds lid strings (lids=True).
                    yield [
                        location
                        for location in locationItemset
                        if isWithinBoundingBox(getLocationFromLid(location), us_boundary)
                    ], support
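
# The iterator leans on project helpers that are not shown here. Below is a
# minimal sketch of plausible stand-ins, purely illustrative; the real
# signatures and the exact lid format live in the surrounding project.
def getLocationFromLid(lid):
    # Assumes a lid is a "lat lng" string, e.g. "40.71 -74.00".
    lat, lng = lid.split()
    return [float(lat), float(lng)]

def isWithinBoundingBox(point, boundary):
    # Assumes boundary = [[min_lat, min_lng], [max_lat, max_lng]].
    (min_lat, min_lng), (max_lat, max_lng) = boundary
    return min_lat <= point[0] <= max_lat and min_lng <= point[1] <= max_lng

us_boundary = [[24.52, -124.77], [49.38, -66.95]]  # rough continental-US box
print(isWithinBoundingBox(getLocationFromLid("40.71 -74.00"), us_boundary))  # True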
Example #3
def streamingLSHClusteringDemo():
    clustering_settings = {
        'dimensions': 53,
        'signature_length': 13,
        'number_of_permutations': 5,
        'threshold_for_document_to_be_in_cluster': 0.2,
    }
    clustering = StreamingLSHClustering(**clustering_settings)
    docId = 0
    docsToOriginalClusterMap = {}
    for line in FileIO.iterateLinesFromFile('../data/streaming.dat'):
        document = createDocumentFromLine(docId, line)
        docsToOriginalClusterMap[docId] = document.clusterId
        docId += 1
        clustering.getClusterAndUpdateExistingClusters(document)
    # Collect, per discovered cluster, the ground-truth labels of its documents.
    clusterLabels = []
    for k, cluster in clustering.clusters.items():
        clusterLabels.append([docsToOriginalClusterMap[doc.docId]
                              for doc in cluster.iterateDocumentsInCluster()])
    return EvaluationMetrics.getValueForClusters(clusterLabels, EvaluationMetrics.purity)
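
# EvaluationMetrics comes from the same project. As a reference point, here
# is a self-contained sketch of the standard purity computation the demo
# reports; the function name and call shape are illustrative, not the
# project's API.
def purity(cluster_labels):
    # cluster_labels: one list of ground-truth labels per discovered cluster.
    # Purity = (sum over clusters of the majority-label count) / total docs.
    total = sum(len(labels) for labels in cluster_labels)
    majority = sum(max(labels.count(l) for l in set(labels))
                   for labels in cluster_labels if labels)
    return majority / float(total)

print(purity([['a', 'a', 'b'], ['b', 'b']]))  # 0.8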
nns_settings = {
    'dimensions': 53,
    'signature_length': 13,
    'number_of_permutations': 5,
    'signature_type': 'signature_type_lists',
    'nearest_neighbor_threshold': 0.2,
}

def createDocumentFromLine(docId, line):
    # First token of the line is the document label; the rest are term counts.
    # Note: the docId argument is unused; words[0] doubles as the id.
    vector, words = Vector(), line.split()
    for word in words[1:]:
        if word not in vector:
            vector[word] = 1
        else:
            vector[word] += 1
    return Document(words[0], vector)
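
# The input format is inferable from the parsing: a label token followed by
# a bag of words. A hypothetical line and the resulting term counts:
sample = "cluster_7 lsh lsh hashing streams"
tokens = sample.split()
counts = {}
for word in tokens[1:]:
    counts[word] = counts.get(word, 0) + 1
print(tokens[0], counts)  # cluster_7 {'lsh': 2, 'hashing': 1, 'streams': 1}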
i = 0
documents = []
for line in FileIO.iterateLinesFromFile('../data/streaming.dat'):
    documents.append(createDocumentFromLine(None, line))
    i += 1
    if i == 10:
        break

class NearestNeighborUsingLSHTests(unittest.TestCase):
    def setUp(self):
        self.nnsLSH = NearestNeighborUsingLSH(**nns_settings)

#    def test_nns(self):
#        for d in documents:
#            self.nnsLSH.update(d)
#            self.assertEqual(d.docId, self.nnsLSH.getNearestDocument(d))

    def test_getNearestDocumentWithReplacement(self):
        for d in documents:
            self.nnsLSH.update(d)
        for d in documents:
            print(d.docId, self.nnsLSH.getNearestDocumentWithReplacement(d))
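
# To exercise the test class directly, run the snippet's module through
# unittest's standard entry point:
if __name__ == '__main__':
    unittest.main()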
        
    
Example #6
from collections import defaultdict
from operator import itemgetter


def offlineLSHClusteringDemo():
    wordToDimensionMap = {}

    def createDocumentFromLine(docId, line):
        vector = Vector()
        words = line.split()
        for word in words[1:]:
            if word not in wordToDimensionMap:
                wordToDimensionMap[word] = len(wordToDimensionMap)
            wordDimension = wordToDimensionMap[word]
            if wordDimension not in vector:
                vector[wordDimension] = 1
            else:
                vector[wordDimension] += 1
        return Document(docId, vector, clusterId=words[0])

    dimensions = 53
    signatureLength = 13
    numberOfPermutations = 5

    unitVector = RandomGaussianUnitVector(dimensions=dimensions, mu=0, sigma=1)
    vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, dimensions, unitVector)
    signaturePermutations = [
        SignaturePermutationWithTrie(signatureLength)
        for i in range(numberOfPermutations)
    ]

    permutatedUnitVectors = [
        unitVector.getPermutedVector(r) for r in vectorPermutations
    ]

    # Build the LSH model.
    # Read training documents.
    trainingDocumentsMap = {}
    for docId, l in enumerate(
            FileIO.iterateLinesFromFile('../data/train_offline.dat')):
        trainingDocumentsMap[docId] = createDocumentFromLine(docId, l)
    # Construct cluster vectors: one mean vector per ground-truth cluster.
    clusterToDocumentsMap = defaultdict(list)
    for document in trainingDocumentsMap.values():
        clusterToDocumentsMap[document.clusterId].append(document)
    clusterMap = {}
    for k, v in clusterToDocumentsMap.items():
        clusterMap[k] = Document(docId=k,
                                 vector=Vector.getMeanVector(v),
                                 clusterId=k)

    # Create signatures for all the cluster vectors, then index them
    # in every signature permutation.
    for document in clusterMap.values():
        document.setSignatureUsingVectors(permutatedUnitVectors)
    for permutation in signaturePermutations:
        for document in clusterMap.values():
            permutation.addDocument(document)

    # Testing the model.
    # Read testing documents.
    testDocumentsMap = {}
    for docId, l in enumerate(
            FileIO.iterateLinesFromFile('../data/test_offline.dat')):
        testDocumentsMap[docId] = createDocumentFromLine(docId, l)
    # Create signatures for the test documents.
    for document in testDocumentsMap.values():
        document.setSignatureUsingVectors(permutatedUnitVectors)

    predicted, labels = [], []
    for t in testDocumentsMap.values():
        # Union of candidate clusters proposed by each signature permutation.
        possibleNearestClusters = set()
        for permutation in signaturePermutations:
            possibleNearestClusters.update(permutation.getNearestDocuments(t))
        # Among the candidates, pick the cluster with the highest cosine similarity.
        predictedClass = max(
            ((clusterId, clusterMap[clusterId].cosineSimilarity(t))
             for clusterId in possibleNearestClusters),
            key=itemgetter(1))
        predicted.append(predictedClass[0])
        labels.append(t.clusterId)
    return EvaluationMetrics.purity(predicted, labels)
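
# The signature machinery above (RandomGaussianUnitVector,
# SignaturePermutationWithTrie, ...) comes from the surrounding project. As a
# minimal illustration of the underlying random-hyperplane idea: each
# signature bit is the sign of a dot product with a random Gaussian vector,
# so nearby vectors tend to share signature bits. Every name below is a
# hypothetical stand-alone sketch, not the project's API.
import random

def random_hyperplanes(dimensions, signature_length, seed=0):
    rng = random.Random(seed)
    return [[rng.gauss(0, 1) for _ in range(dimensions)]
            for _ in range(signature_length)]

def signature(vector, hyperplanes):
    # One bit per hyperplane: 1 if the vector lies on its positive side.
    return tuple(int(sum(v * h for v, h in zip(vector, plane)) > 0)
                 for plane in hyperplanes)

planes = random_hyperplanes(dimensions=5, signature_length=13)
a = [1.0, 0.9, 0.0, 0.1, 0.0]
b = [0.9, 1.0, 0.1, 0.0, 0.0]   # similar to a
c = [0.0, 0.0, 1.0, -0.8, 0.3]  # dissimilar to a
agree_ab = sum(x == y for x, y in zip(signature(a, planes), signature(b, planes)))
agree_ac = sum(x == y for x, y in zip(signature(a, planes), signature(c, planes)))
print(agree_ab, agree_ac)  # similar vectors typically agree on more bits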