Exemple #1
0
def run():
    """Cluster the tweets stored under ``processed_tweets/`` using LSH only.

    Every file found by walking ``processed_tweets/`` is read line by line;
    each line is parsed as a JSON object with ``id`` and ``text`` keys.  Each
    time ``size`` tweets have accumulated they are converted to a TF-IDF
    matrix, and every row is looked up in the LSH index:

    * if a neighbour is returned and its score is ``<= cos_threshold`` the
      tweet joins the neighbour's cluster;
    * otherwise the tweet starts a new cluster.

    The row is then added to the index together with ``(tweet id, cluster id)``.

    Side effects:
        * creates/truncates ``output.txt`` (nothing is ever written to it);
        * appends the processing time of every batch to ``time.txt``;
        * every 100000 tweets appends a JSON snapshot of the clusters, the
          vocabulary and the idf vector to ``result.txt``, ``vocab.txt`` and
          ``vectorizer.txt``.

    Raises:
        ValueError: if the TF-IDF matrix has more columns than ``dimension``.

    NOTE(review): tweets left over when the total number of tweets is not a
    multiple of ``size`` are never clustered -- confirm whether a final
    partial batch should be processed.
    """
    initial = True
    size = 2000
    tweet_ids = []
    tweet_text = []
    counter = 0
    num_hashtables = 5      ## recompute the random vectors if this is changed
    dimension = 5000000     ## recompute the random vectors if this is changed
    hash_size = 13          ## length of the LSHash of the tweets
    bucket_size = 100       ## size of the queue for each hash in the hash tables
    comparisons = 50        ## upper bound on the number of comparisons (dot product) to find the nearest neighbor
    cos_threshold = .5      ## threshold for the similarity of two tweets

    ## initialize the tf-idf vectorizer
    vectorizer = onlineTfidfVectorizer(min_df=1, smooth_idf=True, stop_words='english', min_dict_size=dimension)
    ## initialize the hash tables: hash size, number of hash tables and queue size
    lsh = LSHash(hash_size=hash_size, input_dim=dimension, num_hashtables=num_hashtables, max_queue_size=bucket_size)

    clusters = {}           ## cluster id -> list of tweet ids
    num_clusters = 0

    ## The original opened output.txt for writing and never used or closed it.
    ## Keep the create/truncate side effect but release the handle at once.
    open("output.txt", 'w').close()

    loc = "processed_tweets/"
    for root, dirs, filenames in os.walk(loc):
        for f in filenames:
            ## join with `root` (not `loc`) so files in sub-directories open too
            with open(os.path.join(root, f)) as infile:
                for line in infile:
                    tweet = json.loads(line)
                    tweet_ids.append(tweet['id'])
                    tweet_text.append(tweet['text'])
                    counter = counter + 1

                    ## keep accumulating until a full batch of `size` tweets is loaded
                    if counter % size != 0:
                        continue

                    t1 = time.clock()

                    ## X holds the tf-idf scores of the batch, one sparse row per tweet
                    if initial:
                        X = vectorizer.fit_transform(tweet_text)
                    else:
                        X = vectorizer.transform(tweet_text)
                    ## single-argument print(...) behaves the same on Python 2 and 3
                    print(X.get_shape())
                    print(len(vectorizer.vocabulary_))

                    ## the number of keywords is the column count (shape[1]);
                    ## shape[0] is merely the number of tweets in the batch
                    if X.get_shape()[1] > dimension:
                        print(X.get_shape())
                        print("dimension exceeded")
                        raise ValueError("dimension exceeded")

                    for i in range(X.get_shape()[0]):
                        temp_tweet = X.getrow(i)

                        ## query the lsh tables for the nearest neighbor
                        nn = lsh.arpoxNN(temp_tweet, L=comparisons)

                        ## 2 acts as "no neighbour": it is above the threshold
                        distance = 2
                        cluster_id = -1
                        if nn is not None:
                            ## the inner pair mirrors the extra_data stored below:
                            ## (tweet id, cluster id); first item is unused here
                            ((_, (_, neighbour_cluster)), distance) = nn
                            if distance <= cos_threshold:
                                cluster_id = neighbour_cluster
                                clusters.setdefault(neighbour_cluster, []).append(tweet_ids[i])

                        ## no usable neighbour: the tweet opens a new cluster
                        if nn is None or distance > cos_threshold:
                            cluster_id = num_clusters
                            clusters.setdefault(num_clusters, []).append(tweet_ids[i])
                            num_clusters = num_clusters + 1

                        ## index the tweet into the hash tables
                        lsh.index(input_point=temp_tweet, extra_data=(tweet_ids[i], cluster_id))

                    initial = False
                    tweet_ids = []
                    tweet_text = []
                    print(counter)
                    print(time.clock() - t1)
                    with open('time.txt', 'a') as timing_log:
                        timing_log.write(str(time.clock() - t1) + '\n')

                    ## periodic snapshot of the clustering state
                    if counter % 100000 == 0:
                        with open('result.txt', 'a') as result_file:
                            result_file.write(json.dumps(clusters) + "\n")
                        with open('vocab.txt', 'a') as vocab_file:
                            vocab_file.write(json.dumps(vectorizer.vocabulary_) + "\n")
                        with open('vectorizer.txt', 'a') as idf_file:
                            idf_file.write(json.dumps(vectorizer.idf_) + "\n")
Exemple #2
0
def _closest_row(rows, tweet_row):
    """Return ``(index, distance)`` of the row of ``rows`` nearest to ``tweet_row``.

    ``rows`` is a sparse matrix and ``tweet_row`` a single sparse row with the
    same number of columns.  The distance is
    ``1 - dot / sqrt(|row|^2 + |tweet|^2)``.

    NOTE(review): the denominator adds the two squared norms; a cosine
    distance would multiply them.  Kept as in the original -- confirm intent.
    """
    column = tweet_row.transpose()
    dots = rows.dot(column).toarray()
    row_sq = sp.csc_matrix(rows.multiply(rows).sum(axis=1)).toarray()
    tweet_sq = sp.csr_matrix(column.multiply(column).sum()).toarray()

    def distance(j):
        return 1 - float(dots[j][0]) / ((row_sq[j][0] + tweet_sq[0][0]) ** .5)

    best = min(range(rows.shape[0]), key=distance)
    return best, distance(best)


def run():
    """Cluster tweets with LSH, falling back to an exact search of recent tweets.

    Every file found by walking ``loc`` is read line by line; each line is a
    JSON object with ``id`` and ``text`` keys.  Each time ``size`` tweets have
    accumulated they are converted to a TF-IDF matrix and processed row by row:

    1. The LSH index is queried; a neighbour with score ``<= cos_threshold``
       decides the cluster.
    2. Otherwise the earlier rows of the current batch are searched exactly.
    3. If that fails too (and this is not the first batch, nor the last row
       of the batch) rows ``i:`` of the previous batch are searched.
    4. If nothing is close enough the tweet starts a new cluster.

    ``clusters`` maps cluster id -> tweet ids and ``inv_index`` maps tweet id
    -> cluster id.

    Side effects:
        * creates/truncates ``output.txt`` (nothing is ever written to it);
        * appends the processing time of every batch to ``time.txt``;
        * every 10000 tweets appends snapshots to ``result.txt``,
          ``vocab.txt`` and ``vectorizer.txt``;
        * at the end overwrites those three files and ``inv_index.txt`` with
          the final state.

    Raises:
        ValueError: if the TF-IDF matrix has more columns than ``dimension``.

    NOTE(review): tweets left over when the total number of tweets is not a
    multiple of ``size`` are never clustered -- confirm whether a final
    partial batch should be processed.
    """
    initial = True
    size = 2000
    tweet_ids = []
    tweet_text = []
    counter = 0
    num_hashtables = 13     ## recompute the random vectors if this is changed
    dimension = 50000       ## recompute the random vectors if this is changed
    hash_size = 13          ## length of the LSHash of the tweets
    bucket_size = 100       ## size of the queue for each hash in the hash tables
    comparisons = 50        ## upper bound on the number of comparisons (dot product) to find the nearest neighbor
    cos_threshold = .7      ## threshold for the similarity of two tweets

    ## initialize the tf-idf vectorizer
    vectorizer = onlineTfidfVectorizer(min_df=1, smooth_idf=True, stop_words='english', min_dict_size=dimension)
    ## initialize the hash tables: hash size, number of hash tables and queue size
    lsh = LSHash(hash_size=hash_size, input_dim=dimension, num_hashtables=num_hashtables, max_queue_size=bucket_size)

    clusters = {}           ## cluster id -> list of tweet ids
    num_clusters = 0
    inv_index = {}          ## inverse mapping from tweet_id to cluster id
    prev_X = None           ## tf-idf matrix of the previous batch
    prev_ids = None         ## tweet ids of the previous batch

    ## The original opened output.txt for writing and never used or closed it.
    ## Keep the create/truncate side effect but release the handle at once.
    open("output.txt", 'w').close()

    loc = "/Users/dilpreet/Documents/mtp_documents/markedData/data/"
    for root, dirs, filenames in os.walk(loc):
        for f in filenames:
            ## join with `root` (not `loc`) so files in sub-directories open too
            with open(os.path.join(root, f)) as infile:
                for line in infile:
                    tweet = json.loads(line)
                    tweet_ids.append(tweet['id'])
                    tweet_text.append(tweet['text'])
                    counter = counter + 1

                    ## keep accumulating until a full batch of `size` tweets is loaded
                    if counter % size != 0:
                        continue

                    t1 = time.clock()

                    ## X holds the tf-idf scores of the batch, one sparse row per tweet
                    if initial:
                        X = vectorizer.fit_transform(tweet_text)
                    else:
                        X = vectorizer.transform(tweet_text)
                    ## single-argument print(...) behaves the same on Python 2 and 3
                    print(X.get_shape())
                    print(len(vectorizer.vocabulary_))

                    ## the number of keywords is the column count (shape[1]);
                    ## shape[0] is merely the number of tweets in the batch
                    if X.get_shape()[1] > dimension:
                        print(X.get_shape())
                        print("dimension exceeded")
                        raise ValueError("dimension exceeded")

                    for i in range(X.get_shape()[0]):
                        temp_tweet = X.getrow(i)
                        tweet_id = tweet_ids[i]

                        ## query the lsh tables for the nearest neighbor
                        nn = lsh.arpoxNN(temp_tweet, L=comparisons)

                        ## 2 acts as "no neighbour": it is above the threshold
                        distance = 2
                        if nn is not None:
                            ## second item of the inner pair is the extra_data
                            ## stored below, i.e. the neighbour's tweet id
                            ((_, neighbour_id), distance) = nn
                            if distance <= cos_threshold:
                                inv_index[tweet_id] = inv_index[neighbour_id]
                                clusters.setdefault(inv_index[neighbour_id], []).append(tweet_id)

                        ## LSH found nothing close enough: search linearly
                        if nn is None or distance > cos_threshold:
                            search_previous = False
                            new_cluster = False

                            if i == 0:
                                ## no earlier row in this batch to compare with
                                if initial:
                                    new_cluster = True
                                else:
                                    search_previous = True
                            else:
                                best, best_distance = _closest_row(X[:i], temp_tweet)
                                match_id = tweet_ids[best]
                                if best_distance > cos_threshold:
                                    if not initial and i != size - 1:
                                        search_previous = True
                                    else:
                                        new_cluster = True
                                else:
                                    inv_index[tweet_id] = inv_index[match_id]
                                    clusters.setdefault(inv_index[match_id], []).append(tweet_id)

                            if search_previous:
                                ## rows i: of the previous batch
                                best, best_distance = _closest_row(prev_X[i:], temp_tweet)
                                match_id = prev_ids[best + i]
                                if best_distance < cos_threshold:
                                    inv_index[tweet_id] = inv_index[match_id]
                                    ## the original only updated inv_index here, so the
                                    ## tweet was missing from `clusters`; record it too
                                    clusters.setdefault(inv_index[match_id], []).append(tweet_id)
                                else:
                                    new_cluster = True

                            if new_cluster:
                                inv_index[tweet_id] = num_clusters
                                clusters.setdefault(num_clusters, []).append(tweet_id)
                                num_clusters = num_clusters + 1

                        ## index the tweet into the hash tables
                        lsh.index(input_point=temp_tweet, extra_data=tweet_id)

                    initial = False
                    prev_X = X
                    prev_ids = tweet_ids[:]
                    tweet_ids = []
                    tweet_text = []
                    print(counter)
                    print(time.clock() - t1)
                    with open('time.txt', 'a') as timing_log:
                        timing_log.write(str(time.clock() - t1) + '\n')

                    ## periodic snapshot of the clustering state
                    if counter % 10000 == 0:
                        with open('result.txt', 'a') as result_file:
                            result_file.write(json.dumps(clusters) + "\n")
                        with open('vocab.txt', 'a') as vocab_file:
                            vocab_file.write(json.dumps(vectorizer.vocabulary_) + "\n")
                        with open('vectorizer.txt', 'a') as idf_file:
                            idf_file.write(json.dumps(vectorizer.idf_) + "\n")

    ## final state: overwrite the snapshot files
    with open('result.txt', 'w') as result_file:
        result_file.write(json.dumps(clusters) + "\n")
    with open('vocab.txt', 'w') as vocab_file:
        vocab_file.write(json.dumps(vectorizer.vocabulary_) + "\n")
    with open('vectorizer.txt', 'w') as idf_file:
        idf_file.write(json.dumps(vectorizer.idf_) + "\n")
    with open('inv_index.txt', 'w') as inv_file:
        inv_file.write(json.dumps(inv_index))
def _dump_json_file(path, payload):
    """Overwrite ``path`` with the JSON encoding of ``payload``."""
    with open(path, 'w') as handle:
        handle.write(json.dumps(payload))


def _ensure_snapshot_dirs(base_dir, stamp):
    """Create ``base_dir/<stamp>`` and ``base_dir/current`` if they are missing."""
    for path in (base_dir + '/' + str(stamp), base_dir + '/current'):
        if not os.path.exists(path):
            os.makedirs(path)


def _publish_ranking(base_dir, stamp, ranked_ids, clusters, tweets_dump):
    """Write every ranked cluster plus the ``list.txt`` index files under ``base_dir``."""
    for position in range(len(ranked_ids)):
        ## write_clusters is defined elsewhere; presumably it writes the
        ## cluster at `position` of `ranked_ids` -- verify against its definition
        write_clusters(position, ranked_ids, clusters, tweets_dump, stamp, base_dir)
    listing = list(ranked_ids)
    _dump_json_file(base_dir + '/' + str(stamp) + '/list.txt', listing)
    _dump_json_file(base_dir + '/current/list.txt', listing)
    ## running log of every published snapshot
    with open(base_dir + '/list.txt', 'a') as handle:
        handle.write(str(stamp) + '\n')


def _cluster_tweet_batch(X, tweet_ids, lsh, clusters, num_clusters, comparisons, cos_threshold):
    """Assign each row of ``X`` to a cluster, index it, and return the new cluster count.

    ``tweet_ids[i]`` is the id of row ``i``.  ``clusters`` is updated in place.
    """
    for i in range(X.get_shape()[0]):
        temp_tweet = X.getrow(i)

        ## query the lsh tables for the nearest neighbor
        nn = lsh.arpoxNN(temp_tweet, L=comparisons)

        ## 2 acts as "no neighbour": it is above the threshold
        distance = 2
        cluster_id = -1
        if nn is not None:
            ## the inner pair mirrors the extra_data stored below:
            ## (tweet id, cluster id); first item is unused here
            ((_, (_, neighbour_cluster)), distance) = nn
            if distance <= cos_threshold:
                cluster_id = neighbour_cluster
                clusters.setdefault(neighbour_cluster, []).append(tweet_ids[i])

        ## no usable neighbour: the tweet opens a new cluster
        if nn is None or distance > cos_threshold:
            cluster_id = num_clusters
            clusters.setdefault(num_clusters, []).append(tweet_ids[i])
            num_clusters = num_clusters + 1

        ## index the tweet into the hash tables
        lsh.index(input_point=temp_tweet, extra_data=(tweet_ids[i], cluster_id))
    return num_clusters


def run():
    """Continuously cluster newly arrived tweet files and publish the results.

    Runs forever.  On each pass it:

    1. lists the files under ``/tmp/tweets_tmp/`` that are not yet recorded in
       ``/tmp/completed_tmp.txt`` (sleeping 3000 s when there are none);
    2. reads them line by line (JSON objects with ``id``, ``text`` and
       ``filtered_text`` keys) and, each time the running tweet counter hits a
       multiple of ``size``, vectorizes the accumulated tweets and clusters
       them through the LSH index;
    3. records each finished file in ``/tmp/completed_tmp.txt``;
    4. publishes, under ``/home/y/share/htdocs/clusters`` and
       ``/home/y/share/htdocs/ratio_clusters``, the cluster sizes, the
       clusters with at least 10 tweets ordered by size, and the top 300
       clusters ordered by size relative to the previous pass;
    5. resets the clusters and sleeps for the rest of the hour.

    Also appends the processing time of every batch to ``time.txt``.

    Raises:
        ValueError: if the TF-IDF matrix has more columns than ``dimension``.

    NOTE(review): tweets accumulated since the last full batch are discarded
    when a pass ends (the buffers are reset at the top of each pass) although
    their files are marked completed -- confirm whether they should be carried
    over or flushed.
    NOTE(review): files are opened as ``/tmp/tweets_tmp/<name>`` even though
    ``os.walk`` also reports names from sub-directories.
    """
    initial = True
    size = 200000
    counter = 0
    num_hashtables = 4      ## recompute the random vectors if this is changed
    dimension = 5000000     ## recompute the random vectors if this is changed
    hash_size = 13          ## length of the LSHash of the tweets
    bucket_size = 100       ## size of the queue for each hash in the hash tables
    comparisons = 50        ## upper bound on the number of comparisons (dot product) to find the nearest neighbor
    cos_threshold = .7      ## threshold for the similarity of two tweets

    ## initialize the tf-idf vectorizer
    vectorizer = onlineTfidfVectorizer(min_df=1, smooth_idf=True, stop_words='english', min_dict_size=dimension)
    ## initialize the hash tables: hash size, number of hash tables and queue size
    lsh = LSHash(hash_size=hash_size, input_dim=dimension, num_hashtables=num_hashtables, max_queue_size=bucket_size)

    clusters = {}           ## cluster id -> tweet ids seen during the current pass
    num_clusters = 0

    ## names of the files already processed; a missing log (first run) simply
    ## means nothing has been processed yet
    completed = set()
    if os.path.exists('/tmp/completed_tmp.txt'):
        with open('/tmp/completed_tmp.txt') as handle:
            completed = set([x.replace('\n', '') for x in handle])

    ## sizes published by the previous pass.  This must live outside the loop:
    ## resetting it on every pass (as the original did) makes every growth
    ## ratio below equal to the plain cluster size.
    clusters_size_prev = {}

    clusters_base = '/home/y/share/htdocs/clusters'
    ratio_base = '/home/y/share/htdocs/ratio_clusters'

    while True:
        files = []
        for root, dirs, filenames in os.walk('/tmp/tweets_tmp/'):
            for fname in filenames:
                if fname != '.DS_Store':
                    files.append(fname)
        files = set(files) - completed
        if not files:
            ## single-argument print(...) behaves the same on Python 2 and 3
            print('sleeping')
            time.sleep(3000)
            print('checking')
            continue

        tweets_dump = {}    ## str(tweet id) -> original text, for this pass
        tweet_ids = []
        tweet_text = []
        time_sleep = time.time()
        for fn in files:
            print(fn)
            time_tmp2 = time.time()
            with open('/tmp/tweets_tmp/' + fn) as infile:
                for line in infile:
                    tweet = json.loads(line)
                    tweet_ids.append(tweet['id'])
                    tweet_text.append(tweet['filtered_text'])
                    tweets_dump[str(tweet['id'])] = tweet['text']
                    counter = counter + 1

                    ## keep accumulating until the counter reaches a multiple of `size`
                    if counter % size != 0:
                        continue

                    t1 = time.clock()

                    ## X holds the tf-idf scores of the batch, one sparse row per tweet
                    if initial:
                        X = vectorizer.fit_transform(tweet_text)
                    else:
                        X = vectorizer.transform(tweet_text)

                    ## the number of keywords is the column count (shape[1]);
                    ## shape[0] is merely the number of tweets in the batch
                    if X.get_shape()[1] > dimension:
                        print(X.get_shape())
                        print("dimension exceeded")
                        raise ValueError("dimension exceeded")

                    num_clusters = _cluster_tweet_batch(
                        X, tweet_ids, lsh, clusters, num_clusters, comparisons, cos_threshold)

                    initial = False
                    tweet_ids = []
                    tweet_text = []
                    with open('time.txt', 'a') as timing_log:
                        timing_log.write(str(time.clock() - t1) + '\n')

            print('done')
            print(counter)
            print(str(time.time() - time_tmp2))
            with open('/tmp/completed_tmp.txt', 'a') as handle:
                handle.write(fn + '\n')
            completed.add(fn)
        print("all done")

        ## ---- publish the clusters of this pass --------------------------
        time_temp = time.time()
        _ensure_snapshot_dirs(clusters_base, time_temp)

        clusters_size = {}
        for cid in clusters:
            clusters_size[cid] = len(clusters[cid])
        _dump_json_file(clusters_base + '/' + str(time_temp) + '/sizes.txt', clusters_size)
        _dump_json_file(clusters_base + '/current/sizes.txt', clusters_size)

        ## clusters with at least 10 tweets, largest first
        by_size = sorted(clusters_size, key=lambda cid: -1 * clusters_size[cid])
        large = [cid for cid in by_size if clusters_size[cid] >= 10]
        _publish_ranking(clusters_base, time_temp, large, clusters, tweets_dump)

        ## ---- publish the clusters ranked by growth ----------------------
        _ensure_snapshot_dirs(ratio_base, time_temp)

        ratio = {}
        for cid in clusters_size:
            if clusters_size[cid] >= 10:
                ## unseen clusters and clusters that were empty divide by 1
                previous = 1
                if cid in clusters_size_prev and clusters_size_prev[cid] != 0:
                    previous = clusters_size_prev[cid]
                ratio[cid] = clusters_size[cid] * 1.0 / previous
        fastest = sorted(ratio, key=lambda cid: -1 * ratio[cid])[:300]
        _publish_ranking(ratio_base, time_temp, fastest, clusters, tweets_dump)

        ## remember this pass's sizes for the next ratio computation
        clusters_size_prev = dict(clusters_size)

        clusters = {}
        ## wait for the remainder of the hour counted from the start of the pass
        time.sleep(max(0, 3600 - (time.time() - time_sleep)))