def main():
    hourlyInterval = 0      # are we building hourly or daily histograms?
    rawOccurrenceModel = {} # keyed by term pairing

    # Did they provide the correct args?
    if len(sys.argv) != 4:
        usage()
        sys.exit(-1)

    # Parse command line
    if sys.argv[1] == "hourly":
        hourlyInterval = 1
    elif sys.argv[1] == "daily":
        pass
    else:
        usage()
        sys.exit(-1)

    # Pull lines
    with open(sys.argv[2], "r") as f:
        tweets = f.readlines()

    print "tweets: %d" % len(tweets)

    # --------------------------------------------------------------------------
    # Process tweets
    for i in tweets:
        info = tweetclean.extract(i)
        if info is None:
            sys.exit(-1)

        # Build day string
        # This needs to return -1 on error, so I'll need to test it.
        if hourlyInterval:
            date = tweetdate.buildDateInt(info[0])
        else:
            date = tweetdate.buildDateDayInt(info[0])

        # Do some cleanup
        newTweet = tweetclean.cleanup(info[1])

        rawOccurrenceModel = languagemodel.update_matrix(
            rawOccurrenceModel,
            languagemodel.build_matrix(newTweet, "-&"))

    # --------------------------------------------------------------------------
    # Debug: dump the raw occurrences (not finalized).
    for k, v in rawOccurrenceModel.items():
        print "%s:%d" % (k, v)
def data_pull(database_file, query):
    """Pull the data from the database."""
    user_tweets = {}

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row

    for row in conn.cursor().execute(query):
        if row['text'] is not None:
            data = tweetclean.cleanup(row['text'], True, True)
            try:
                user_tweets[row['owner']].append(data)
            except KeyError:
                user_tweets[row['owner']] = []
                user_tweets[row['owner']].append(data)

    conn.close()

    return user_tweets
def data_pull(database_file, query):
    """Pull the data from the database."""
    user_tweets = {}

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row

    for row in conn.cursor().execute(query):
        if row['text'] is not None:
            data = cleanup(row['text'], True, True)
            try:
                user_tweets[row['owner']].append(data)
            except KeyError:
                user_tweets[row['owner']] = []
                user_tweets[row['owner']].append(data)

    conn.close()

    return user_tweets
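The try/except KeyError dance in both data_pull variants works, but the standard library's collections.defaultdict expresses the same per-owner accumulation more directly. A minimal sketch of an equivalent pull, assuming the same owner/text columns and the same cleanup(text, True, True) call used above:

import sqlite3
from collections import defaultdict

import tweetclean  # assumed available, as in the originals


def data_pull(database_file, query):
    """Pull cleaned tweets from the database, grouped by owner (sketch)."""
    user_tweets = defaultdict(list)  # missing owners start as empty lists

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row

    for row in conn.cursor().execute(query):
        if row['text'] is not None:
            user_tweets[row['owner']].append(
                tweetclean.cleanup(row['text'], True, True))

    conn.close()

    return dict(user_tweets)  # plain dict, same shape as the original versions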
def thread_main(database_file, output_folder, users, stopwords, start, cnt):
    """Cluster each of this thread's users' tweets into topic centroids."""

    query_tweets = "select id, contents as text from tweets where owner = %d;"
    users_tweets = {}

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row

    # --------------------------------------------------------------------------
    # Process this thread's users.
    for u in xrange(start, start + cnt):
        user_id = users[u]
        users_tweets = {}
        output = "%d\t%d\t%d\t%fm"

        # Don't reuse `start` for the timer; it is the base index of this
        # thread's slice of users.
        timer_start = time.clock()

        for row in conn.cursor().execute(query_tweets % user_id):
            if row['text'] is not None:
                users_tweets[row['id']] = \
                    tweetclean.cleanup(row['text'], True, True)

        curr_cnt = len(users_tweets)

        doc_tfidf, ignore = vectorspace.build_doc_tfidf(users_tweets, stopwords)

        # ----------------------------------------------------------------------
        centroids = centroid.cluster_documents(doc_tfidf)

        duration = (time.clock() - timer_start) / 60  # in minutes

        print output % (user_id, curr_cnt, len(centroids), duration)

        with open(os.path.join(output_folder, "%d.topics" % user_id), "w") as f:
            f.write("user: %d\n#topics: %d\n" % (user_id, len(centroids)))
            # Might be better if I just implement __str__ for Centroids.
            for cen in centroids:
                f.write("%s\n" % str(centroids[cen]))
            f.write("-------------------------------------------------------\n")

    conn.close()
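The "implement __str__ for Centroids" note above is easy to act on, but the Centroid class itself is not shown in this section; the only hint is that the clustering script later constructs centroid.Centroid(str(doc), vec) from a name and a tf-idf vector. A minimal sketch under that assumption, not the project's actual class:

class Centroid(object):
    """Sketch only: assumes a name plus a sparse term -> tf-idf weight dict."""

    def __init__(self, name, vector):
        self.name = name
        self.vector = vector  # dict of term -> tf-idf weight

    def __str__(self):
        # Show the ten heaviest terms, mirroring what the caller writes today.
        top = sorted(self.vector.items(), key=lambda kv: kv[1], reverse=True)[:10]
        return "%s: %s" % (self.name,
                           ", ".join("%s (%.4f)" % (t, w) for t, w in top))

With something like this in place, the write loop can stay f.write("%s\n" % centroids[cen]) and the formatting lives next to the data it describes.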
def data_pull(database_file, query):
    """Pull the data from the database."""
    user_data = {}

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row

    for row in conn.cursor().execute(query):
        if row['text'] is not None:
            data = cleanup(row['text'], True, True)
            twt = TweetTime(row['created'])
            uid = row['owner']

            # could probably get away with pushing this up -- like in c++.
            mdv = twt.get_month()["day_val"]

            try:
                user_data[uid].add_data(mdv, data)
            except KeyError:
                user_data[uid] = frame.FrameUser(uid, mdv, data)

    conn.close()

    return user_data
def main():
    cleanTweets = {}  # dictionary of the tweets by id
    docFreq = {}      # dictionary of in how many documents the "word" appears
    invdocFreq = {}   # dictionary of the inverse document frequencies
    docTermFreq = {}  # dictionary of term frequencies by tweet id
    docTfIdf = {}     # similar to docTermFreq, but holds the tf-idf values

    # Did they provide the correct args?
    if len(sys.argv) != 3:
        usage()
        sys.exit(-1)

    # Pull lines
    with codecs.open(sys.argv[1], "r", 'utf-8') as f:
        tweets = f.readlines()

    # Pull stop words
    with open(sys.argv[2], "r") as f:
        stopwords = f.readlines()

    # clean them up!
    for i in xrange(0, len(stopwords)):
        stopwords[i] = stopwords[i].strip()

    # --------------------------------------------------------------------------
    # Process tweets
    for i in tweets:
        # Each tweet has <id>DATE-TIME</id> and <text>DATA</text>, so we build
        # a dictionary {"id": "contents"}.
        info = tweetclean.extract_id(i)
        if info is None:
            sys.stderr.write("Invalid tweet hit\n")
            sys.exit(-1)

        # Add this tweet to the collection of clean ones.
        cleanTweets[info[0]] = tweetclean.cleanup(info[1], True, True)

    docLength = {}

    # --------------------------------------------------------------------------
    # Process the collected tweets
    for id in cleanTweets.keys():
        # Calculate term frequencies for this id/document.
        # Skip 1 letter words and stop words.
        pruned = [w for w in cleanTweets[id].split(' ')
                  if len(w) > 1 and w not in stopwords]

        # skip documents that only have one word.
        if len(pruned) < 2:
            continue

        docTermFreq[id] = {}  # Prepare the dictionary for that document.

        for w in pruned:
            try:
                docLength[id] += 1
            except KeyError:
                docLength[id] = 1

            try:
                docTermFreq[id][w] += 1
            except KeyError:
                docTermFreq[id][w] = 1

        # Contribute to the document frequencies.
        for w in docTermFreq[id]:
            try:
                docFreq[w] += 1
            except KeyError:
                docFreq[w] = 1

    # --------------------------------------------------------------------------
    # Dump how many unique terms were identified by space splitting.
    print "Total Count of Terms: %s" % docLength
    print "Unique Terms: %d" % len(docFreq)
    print "How many Documents: %d" % len(docTermFreq)

    # --------------------------------------------------------------------------
    # Remove singletons -- standard practice.
    # Skipped with tweets for now...

    # Calculate the inverse document frequencies.
    invdocFreq = vectorspace.calculate_invdf(len(docTermFreq), docFreq)

    # Calculate the tf-idf values.
    docTfIdf = vectorspace.calculate_tfidf(docLength, docTermFreq, invdocFreq)

    # --------------------------------------------------------------------------
    # Recap of everything we have stored:
    #   docLength is the total count of terms per document
    #   cleanTweets is the dictionary of the tweets by id as string
    #   docFreq is the dictionary of in how many documents the "word" appears
    #   invdocFreq is the dictionary of the inverse document frequencies
    #   docTermFreq is the dictionary of term frequencies by tweet id
    #   docTfIdf is similar to docTermFreq, but holds the tf-idf values

    # --------------------------------------------------------------------------
    # Build the centroid list.
    centroids = []

    for doc, vec in docTfIdf.iteritems():
        centroids.append(centroid.Centroid(str(doc), vec))

    similarities = centroid.get_sims(centroids)
    average_sim = centroid.find_avg(centroids, True, similarities)
    stddev_sim = centroid.find_std(centroids, True, similarities)

    print "mean: %.10f\tstd: %.10f" % (average_sim, stddev_sim)

    # --------------------------------------------------------------------------
    # Repeatedly merge the pair of centroids with the highest similarity, as
    # long as that similarity is at least the threshold.
    threshold = average_sim + stddev_sim

    while len(centroids) > 1:
        i, j, sim = centroid.find_max(centroids)

        # @warning: This is fairly crap.
        if sim >= threshold:
            centroids[i].add_centroid(centroids[j])
            del centroids[j]
            print "merged with sim: %.10f" % sim
        else:
            break

    print "len(centroids): %d" % len(centroids)
    print "avg(centroids): %.10f" % average_sim
    print "std(centroids): %.10f" % stddev_sim

    for cen in centroids:
        print centroid.topTerms(cen, 10)
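The merge loop stops as soon as the best remaining pair falls below threshold = average_sim + stddev_sim, i.e., one standard deviation above the mean pairwise similarity. centroid.add_centroid itself is not shown in this section; in a typical centroid-based agglomerative merge it folds one tf-idf vector into another, weighted by how many documents each centroid already represents. A sketch of that idea under that assumption, not the project's actual implementation:

def merge_vectors(vec_a, count_a, vec_b, count_b):
    """Weighted average of two sparse term -> weight vectors (sketch).

    Assumes each centroid tracks how many documents it currently holds.
    """
    total = float(count_a + count_b)
    merged = {}
    for term in set(vec_a) | set(vec_b):
        merged[term] = (vec_a.get(term, 0.0) * count_a +
                        vec_b.get(term, 0.0) * count_b) / total
    return merged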
def main():
    # Did they provide the correct args?
    if len(sys.argv) != 6:
        usage()
        sys.exit(-1)

    database_file = sys.argv[1]
    user_id = int(sys.argv[2])
    stop_file = sys.argv[3]
    outputvocab = sys.argv[4]
    outputdata = sys.argv[5]

    # --------------------------------------------------------------------------
    # Pull stop words
    stopwords = tweetclean.importStopWords(stop_file)

    # --------------------------------------------------------------------------
    # Read in the database
    query_tweets = "select id, contents as text from tweets where owner = %d;"
    users_tweets = {}
    docTermFreq = {}  # dictionary of term frequencies by tweet id
    vocab = []        # array of terms

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row
    c = conn.cursor()

    for row in c.execute(query_tweets % user_id):
        users_tweets[row['id']] = row['text']

    conn.close()

    # --------------------------------------------------------------------------
    # Process tweets
    for id in users_tweets:
        if users_tweets[id] is None:  # this happens, lol.
            continue

        users_tweets[id] = tweetclean.cleanup(users_tweets[id], True, True)

        # Calculate term frequencies for this id/document.
        # Skip 1 letter words and stop words.
        pruned = [w for w in users_tweets[id].split(' ')
                  if len(w) > 1 and w not in stopwords]

        # skip documents that only have one word.
        if len(pruned) < 2:
            continue

        docTermFreq[id] = {}  # Prepare the dictionary for that document.

        for w in pruned:
            try:
                docTermFreq[id][w] += 1
            except KeyError:
                docTermFreq[id][w] = 1

            # slow linear search... maybe switch to a sorted method?
            if w not in vocab:
                vocab.append(w)

    vocab.sort()

    # --------------------------------------------------------------------------
    # Build the vocab.txt file
    with open(outputvocab, 'w') as f:
        f.write("\n".join(vocab))

    # --------------------------------------------------------------------------
    # Given the vocab array, build the document term index + counts:
    sorted_tweets = sorted(users_tweets.keys())

    data = ""

    for id in sorted_tweets:
        try:
            lens = len(docTermFreq[id])
        except KeyError:
            continue

        print "%d" % id

        data += "%d " % lens

        for term in docTermFreq[id]:
            indx = getIndx(vocab, term)
            if indx == -1:
                sys.exit(-1)
            data += "%d:%d " % (indx, docTermFreq[id][term])

        data += "\n"

    with open(outputdata, "w") as f:
        f.write(data)
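getIndx is not defined in this section; the call site only tells us it returns the term's position in vocab, or -1 when the term is missing. Since vocab.sort() has already run by the time it is used, a binary search satisfies that contract without the linear scan the comment above worries about. A sketch under that assumed contract:

from bisect import bisect_left


def getIndx(vocab, term):
    """Return the index of term in the sorted vocab list, or -1 if missing."""
    pos = bisect_left(vocab, term)
    if pos < len(vocab) and vocab[pos] == term:
        return pos
    return -1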
def main(): """.""" # Did they provide the correct args? if len(sys.argv) != 6: usage() sys.exit(-1) cpus = multiprocessing.cpu_count() # -------------------------------------------------------------------------- # Parse the parameters. database_file = sys.argv[1] minimum = int(sys.argv[2]) maximum = int(sys.argv[3]) stop_file = sys.argv[4] output_folder = sys.argv[5] if minimum >= maximum: usage() sys.exit(-2) # Pull stop words stopwords = tweetclean.import_stopwords(stop_file) kickoff = \ """ ------------------------------------------------------------------- parameters : database : %s minimum : %d maximum : %d output : %s stop : %s ------------------------------------------------------------------- """ print kickoff % (database_file, minimum, maximum, output_folder, stop_file) # this won't return the 3 columns we care about. query_collect = \ "select owner from tweets group by owner having count(*) >= %d and count(*) < %d" # "select id, contents as text from tweets where owner = %d;" query_prefetch = \ "select owner, id, contents as text from tweets where owner in (%s);" conn = sqlite3.connect(database_file) conn.row_factory = sqlite3.Row c = conn.cursor() print "#cpus: %d" % cpus # -------------------------------------------------------------------------- # Search the database file for users. users = [] users_tweets = {} start = time.clock() query = query_prefetch % query_collect for row in c.execute(query % (minimum, maximum)): uid = row['owner'] if uid not in users: users.append(uid) if row['text'] is not None: data = tweetclean.cleanup(row['text'], True, True) try: users_tweets[uid][row['id']] = data except KeyError: users_tweets[uid] = {} users_tweets[uid][row['id']] = data print "query time: %fm" % ((time.clock() - start) / 60) print "users: %d\n" % len(users) conn.close() # -------------------------------------------------------------------------- # Process those tweets by user set. print "usr\tcnt\tavg\tstd\tend\tdur" cnt = int(math.ceil((float(len(users)) / cpus))) remains = len(users) threads = [] for i in range(0, cpus): start = i * cnt if cnt > remains: cnt = remains print "launching thread: %d, %d" % (start, cnt) t = threading.Thread(target=thread_main, args=( output_folder, users, users_tweets, stopwords, start, cnt, )) threads.append(t) t.start() remains -= cnt
def main():
    daysTweets = {}  # dictionary of the tweets by date as integer
    docFreq = {}     # dictionary of document frequencies
    daysHisto = {}   # dictionary of the n-grams by date as integer

    # Did they provide the correct args?
    if len(sys.argv) != 2:
        usage()
        sys.exit(-1)

    # Pull lines
    with open(sys.argv[1], "r") as f:
        tweets = f.readlines()

    print "tweets: %d" % len(tweets)

    # --------------------------------------------------------------------------
    # Process tweets
    for i in tweets:
        # Each tweet has <created>DATE-TIME</created> and <text>DATA</text>,
        # so we build a dictionary {"date": "contents"} and just append each
        # new tweet to the end of that date's string.
        info = tweetclean.extract(i)
        if info is None:
            sys.exit(-1)

        # Build day string
        # This needs to return -1 on error, so I'll need to test it.
        date = tweetdate.buildDateInt(info[0])

        # Do some cleanup
        newTweet = tweetclean.cleanup(info[1])

        # Add this tweet to the collective tweet for the day.
        if date in daysTweets:
            daysTweets[date] += " " + newTweet
        else:
            daysTweets[date] = newTweet

    # --------------------------------------------------------------------------
    # Process the collected tweets
    print "tweet days: %d" % len(daysTweets)

    gramSize = 3

    for day in sorted(daysTweets.keys()):
        daysHisto[day] = {}  # initialize the sub-dictionary
        totalDaysTerms = 0   # for normalizing the term frequencies, so days
                             # with more tweets don't skew values.

        # Stepping by gramSize gives offsets 0, gramSize, 2*gramSize, ... over
        # the day's text; a step of 1 would give a sliding window instead.
        for j in range(0, len(daysTweets[day]), gramSize):
            # This does not do a sliding window; it chunks the text into
            # non-overlapping grams.
            w = daysTweets[day][j:j + gramSize]

            # wu is a special format that will not screw with whitespace
            wu = "_%s_" % w
            totalDaysTerms += 1

            try:
                daysHisto[day][wu] += 1
            except KeyError:
                daysHisto[day][wu] = 1

            try:
                docFreq[wu] += 1
            except KeyError:
                docFreq[wu] = 1

        # Normalize each gram count by the size of the day's document.
        for gram in daysHisto[day]:
            daysHisto[day][gram] = float(daysHisto[day][gram]) / totalDaysTerms

    # daysHisto contains normalized term frequencies, not tf-idf values;
    # normalized to account for the length of the document. It would not be
    # difficult to modify it to contain tf-idf values, it would just have to
    # wait until all processing is complete.

    # Dump the matrix.
    print vectorspace.dumpMatrix(docFreq, daysHisto) + "\n"
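The comment in the inner loop notes that stepping by gramSize chunks the text rather than sliding over it. A small side-by-side example makes the difference concrete:

def chunked_grams(text, n):
    """Non-overlapping n-character chunks -- what the loop above produces."""
    return [text[j:j + n] for j in range(0, len(text), n)]


def sliding_grams(text, n):
    """Overlapping character n-grams -- what a step of 1 would produce."""
    return [text[j:j + n] for j in range(0, len(text) - n + 1)]

# chunked_grams("tweets", 3) -> ['twe', 'ets']
# sliding_grams("tweets", 3) -> ['twe', 'wee', 'eet', 'ets']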
def main():
    docLength = {}   # count of terms per day
    daysTweets = {}  # dictionary of the tweets by date as integer
    invdocFreq = {}  # dictionary of the inverse document frequencies
    docFreq = {}     # dictionary of document frequencies
    daysHisto = {}   # dictionary of the n-grams by date as integer

    # Did they provide the correct args?
    if len(sys.argv) != 2:
        usage()
        sys.exit(-1)

    # Pull lines
    with open(sys.argv[1], "r") as f:
        tweets = f.readlines()

    print "tweets: %d" % len(tweets)

    # --------------------------------------------------------------------------
    # Process tweets
    for i in tweets:
        info = tweetclean.extract(i)
        if info is None:
            sys.exit(-1)

        # Build day string
        # This needs to return -1 on error, so I'll need to test it.
        date = tweetdate.buildDateInt(info[0])

        # Do some cleanup
        newTweet = tweetclean.cleanup(info[1])

        # Add this tweet to the collective tweet for the day.
        if date in daysTweets:
            daysTweets[date] += " " + newTweet
        else:
            daysTweets[date] = newTweet

    # --------------------------------------------------------------------------
    # Process the collected tweets
    print "tweet days: %d" % len(daysTweets)

    gramSize = 3

    for day in sorted(daysTweets.keys()):
        daysHisto[day] = {}  # initialize the sub-dictionary

        # Stepping by gramSize gives offsets 0, gramSize, 2*gramSize, ... over
        # the day's text; a step of 1 would give a sliding window instead.
        for j in range(0, len(daysTweets[day]), gramSize):
            # This does not do a sliding window; it chunks the text into
            # non-overlapping grams.
            w = daysTweets[day][j:j + gramSize]

            # wu is a special format that will not screw with whitespace
            wu = "_%s_" % w

            try:
                docLength[day] += 1
            except KeyError:
                docLength[day] = 1

            try:
                daysHisto[day][wu] += 1
            except KeyError:
                daysHisto[day][wu] = 1

            try:
                docFreq[wu] += 1
            except KeyError:
                docFreq[wu] = 1

    # Calculate the inverse document frequencies.
    invdocFreq = vectorspace.calculate_invdf(len(daysHisto), docFreq)

    # Calculate the tf-idf values.
    daysHisto = vectorspace.calculate_tfidf(docLength, daysHisto, invdocFreq)

    # Dump the matrix.
    #print vectorspace.dumpMatrix(docFreq, daysHisto) + "\n"

    # Compute cosine similarities between sequential days.
    sorted_days = sorted(daysHisto.keys())

    for i in range(0, len(sorted_days) - 1):
        print "similarity(%s, %s) = " % (str(sorted_days[i]),
                                         str(sorted_days[i + 1])),
        print vectorspace.cosineCompute(daysHisto[sorted_days[i]],
                                        daysHisto[sorted_days[i + 1]])
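vectorspace.cosineCompute is not listed in this section. For two sparse term -> tf-idf dictionaries like the per-day histograms above, cosine similarity is the dot product over shared terms divided by the product of the vector norms. A sketch under that assumption, not the module's actual code:

import math


def cosine_similarity(vec_a, vec_b):
    """Cosine similarity of two sparse term -> weight dictionaries (sketch)."""
    shared = set(vec_a) & set(vec_b)
    dot = sum(vec_a[t] * vec_b[t] for t in shared)
    norm_a = math.sqrt(sum(w * w for w in vec_a.values()))
    norm_b = math.sqrt(sum(w * w for w in vec_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)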
def thread_main(database_file, output_folder, users, stopwords, start, cnt):
    """
    Process the users in your range!

    Each thread gets its own hook into the database, so they don't interfere.

    I could use the whole Queue thing... but I don't feel like trying to get
    that to work as well.
    """
    query_tweets = "select id, contents as text from tweets where owner = %d;"

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row
    c = conn.cursor()

    # --------------------------------------------------------------------------
    # Process this thread's users.
    for j in xrange(start, start + cnt):
        user_id = users[j]
        print "processing: %d" % user_id

        # Reset per user so earlier users' tweets don't leak into this model.
        users_tweets = {}

        for row in c.execute(query_tweets % user_id):
            if row['text'] is not None:
                users_tweets[row['id']] = \
                    tweetclean.cleanup(row['text'], True, True)

        # only words that are greater than one letter and not in the stopword
        # list.
        texts = [[word for word in users_tweets[uid].split()
                  if word not in stopwords and len(word) > 1]
                 for uid in users_tweets]

        # ----------------------------------------------------------------------
        # remove words that appear only once
        all_tokens = sum(texts, [])
        tokens_once = set(word for word in set(all_tokens)
                          if all_tokens.count(word) == 1)
        texts = [[word for word in text if word not in tokens_once]
                 for text in texts]

        dictionary = corpora.Dictionary(texts)
        # store the dictionary, for future reference
        #dictionary.save(os.path.join("lda_out", '%d.dict' % user_id))

        corpus = [dictionary.doc2bow(text) for text in texts]
        # store to disk, for later use
        #corpora.MmCorpus.serialize(
        #    os.path.join(output_folder, '%d.mm' % user_id), corpus)

        # ----------------------------------------------------------------------
        # is this different...
        #corpus = \
        #    corpora.MmCorpus(os.path.join(output_folder, '%d.mm' % user_id))

        lda = models.ldamodel.LdaModel(
            corpus,
            id2word=dictionary,
            chunksize=100,
            passes=20,
            num_topics=100)
        #lda.save('%d.lda' % user_id)

        # ----------------------------------------------------------------------
        topic_strings = lda.show_topics(topics=-1, formatted=True)

        # The threads used to share one output file and could interrupt each
        # other, so each user gets an individual file instead.
        with open(os.path.join(output_folder, "%d.topics" % user_id), "w") as f:
            f.write("user: %d\n#topics: %d\n" % (user_id, len(topic_strings)))
            for topic in topic_strings:  # could use .join
                f.write("%s\n" % str(topic))

    conn.close()
def main():
    # Weirdly in Python, you have free access to globals from within main().
    hourlyInterval = 0  # are we building hourly or daily histograms?
    daysTweets = {}     # dictionary of the tweets by date (or date-hour) as integer
    docFreq = {}        # dictionary of in how many documents the "word" appears
    invdocFreq = {}     # dictionary of the inverse document frequencies
    docTermFreq = {}    # dictionary of term frequencies by date as integer
    docTfIdf = {}       # similar to docTermFreq, but holds the tf-idf values

    # Did they provide the correct args?
    if len(sys.argv) != 5:
        usage()
        sys.exit(-1)

    # Parse command line
    if sys.argv[1] == "hourly":
        hourlyInterval = 1
    elif sys.argv[1] == "daily":
        pass
    else:
        usage()
        sys.exit(-1)

    # Pull lines
    with open(sys.argv[2], "r") as f:
        tweets = f.readlines()

    print "tweets: %d" % len(tweets)

    # --------------------------------------------------------------------------
    # Process tweets
    for i in tweets:
        # Each tweet has <created>DATE-TIME</created> and <text>DATA</text>,
        # so we build a dictionary {"date": "contents"} and just append each
        # new tweet to the end of that date's string.
        info = tweetclean.extract(i)
        if info is None:
            sys.exit(-1)

        # Build day string
        # This needs to return -1 on error, so I'll need to test it.
        if hourlyInterval:
            date = tweetdate.buildDateInt(info[0])
        else:
            date = tweetdate.buildDateDayInt(info[0])

        # Do some cleanup
        newTweet = tweetclean.cleanup(info[1])

        # Add this tweet to the collective tweet for the day.
        if date in daysTweets:
            daysTweets[date] += " " + newTweet
        else:
            daysTweets[date] = newTweet

    # --------------------------------------------------------------------------
    # Process the collected tweets
    print "tweet days: %d" % len(daysTweets)

    docLength = {}  # count of terms per day

    for day in daysTweets.keys():
        docTermFreq[day] = {}  # Prepare the dictionary for that document.

        # Calculate term frequencies for this day/document.
        # Skip 1 letter words.
        for w in daysTweets[day].split(' '):
            if len(w) > 1:
                try:
                    docLength[day] += 1
                except KeyError:
                    docLength[day] = 1

                try:
                    docTermFreq[day][w] += 1
                except KeyError:
                    docTermFreq[day][w] = 1

        # Contribute to the document frequencies.
        for w in docTermFreq[day]:
            try:
                docFreq[w] += 1
            except KeyError:
                docFreq[w] = 1

    # --------------------------------------------------------------------------
    # Dump how many days of tweets we collected, and for each day, how many
    # unique terms were identified by space splitting.
    #print "sizeof documents: %s" % docLength
    print "sizeof docFreq: %d" % len(docFreq)          # how many unique terms
    print "sizeof docTermFreq: %d" % len(docTermFreq)  # how many days

    for day in docTermFreq:
        # how many unique terms were in that day
        print "sizeof docTermFreq[%s]: %d" % (str(day), len(docTermFreq[day]))
        #print docTermFreq[day]

    # --------------------------------------------------------------------------
    # Remove singletons -- standard practice.
    # Skipped with tweets for now...

    # Calculate the inverse document frequencies.
    invdocFreq = vectorspace.calculate_invdf(len(docTermFreq), docFreq)

    # Calculate the tf-idf values.
    docTfIdf = vectorspace.calculate_tfidf(docLength, docTermFreq, invdocFreq)

    # Recap of everything we have stored:
    #   docLength is the total count of terms per day
    #   daysTweets is the dictionary of the tweets by date as integer
    #   docFreq is the dictionary of in how many documents the "word" appears
    #   invdocFreq is the dictionary of the inverse document frequencies
    #   docTermFreq is the dictionary of term frequencies by date as integer
    #   docTfIdf is similar to docTermFreq, but holds the tf-idf values

    # Sort the lists by decreasing value and dump the information.
    # TODO: Upgrade this to print the top 15-20 or so.
    sorted_keys = sorted(docTfIdf.keys())

    print "token:weight"

    for day in sorted_keys:
        print str(day) + ":---"
        sorted_tokens = sorted(
            docTfIdf[day].items(),
            key=operator.itemgetter(1),  # (1) is the value
            reverse=True)
        for k, v in sorted_tokens:
            print k + ":" + str(v)

    # Dump the matrix.
    with open(sys.argv[3], "w") as f:
        f.write(vectorspace.dumpMatrix(docFreq, docTfIdf) + "\n")

    # Compute cosine similarities between sequential days.
    sorted_days = sorted(docTfIdf.keys())

    with open(sys.argv[4], "w") as f:
        # -1 because each iteration looks one day ahead.
        for i in xrange(0, len(sorted_days) - 1):
            f.write("similarity(%s, %s) = " % (str(sorted_days[i]),
                                               str(sorted_days[i + 1])))
            f.write(str(vectorspace.cosineCompute(
                docTfIdf[sorted_days[i]],
                docTfIdf[sorted_days[i + 1]])) + "\n")
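The vectorspace.calculate_invdf and calculate_tfidf helpers are not listed in this section. The standard definitions they presumably follow are idf(w) = log(N / df(w)) and tfidf(d, w) = (count(d, w) / |d|) * idf(w), with |d| taken from docLength. A sketch under those assumptions; the real module may differ, for example by smoothing:

import math


def calculate_invdf(doc_count, doc_freq):
    """idf(w) = log(N / df(w)) for every term w (assumed definition)."""
    return dict((w, math.log(float(doc_count) / df))
                for w, df in doc_freq.items())


def calculate_tfidf(doc_length, doc_term_freq, inv_doc_freq):
    """Length-normalized term frequency times idf, per document (assumed)."""
    tfidf = {}
    for doc, terms in doc_term_freq.items():
        tfidf[doc] = dict(
            (w, (float(cnt) / doc_length[doc]) * inv_doc_freq[w])
            for w, cnt in terms.items())
    return tfidf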
def main():
    # Did they provide the correct args?
    if len(sys.argv) != 5:
        usage()
        sys.exit(-1)

    database_file = sys.argv[1]
    minimum = int(sys.argv[2])
    stop_file = sys.argv[3]
    output_folder = sys.argv[4]

    # --------------------------------------------------------------------------
    # Pull stop words
    stopwords = tweetclean.importStopWords(stop_file)

    # --------------------------------------------------------------------------
    # Read in the database
    query_collect = "select owner from tweets group by owner having count(*) >= %d;"
    query_tweets = "select id, contents as text from tweets where owner = %d;"

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row
    c = conn.cursor()

    users = []
    for row in c.execute(query_collect % (minimum)):
        users.append(row['owner'])

    # --------------------------------------------------------------------------
    # Process those tweets by user set.
    for u in users:
        users_tweets = {}
        docTermFreq = {}  # dictionary of term frequencies by tweet id
        vocab = []        # array of terms

        for row in c.execute(query_tweets % u):
            users_tweets[row['id']] = row['text']

        # ----------------------------------------------------------------------
        # Process tweets
        for id in users_tweets:
            if users_tweets[id] is None:  # this happens, lol.
                continue

            users_tweets[id] = tweetclean.cleanup(users_tweets[id], True, True)

            # Calculate term frequencies for this id/document.
            # Skip 1 letter words and stop words.
            pruned = [w for w in users_tweets[id].split(' ')
                      if len(w) > 1 and w not in stopwords]

            # skip documents that only have one word.
            if len(pruned) < 2:
                continue

            docTermFreq[id] = {}  # Prepare the dictionary for that document.

            for w in pruned:
                try:
                    docTermFreq[id][w] += 1
                except KeyError:
                    docTermFreq[id][w] = 1

                # slow linear search... maybe switch to a sorted method?
                if w not in vocab:
                    vocab.append(w)

        vocab.sort()

        # ----------------------------------------------------------------------
        # Build the vocab.txt file
        with open(os.path.join(output_folder, "%d.vocab" % u), 'w') as f:
            f.write("\n".join(vocab))

        # ----------------------------------------------------------------------
        # Given the vocab array, build the document term index + counts:
        sorted_tweets = sorted(docTermFreq.keys())

        data = ""

        for id in sorted_tweets:
            print "%d" % id

            data += "%d " % len(docTermFreq[id])

            for term in docTermFreq[id]:
                indx = getIndx(vocab, term)
                if indx == -1:
                    sys.exit(-1)
                data += "%d:%d " % (indx, docTermFreq[id][term])

            data += "\n"

        with open(os.path.join(output_folder, "%d.dat" % u), "w") as f:
            f.write(data)
    # end for each user.

    # --------------------------------------------------------------------------
    # Done.
    conn.close()
def main():
    # Did they provide the correct args?
    if len(sys.argv) != 4:
        usage()
        sys.exit(-1)

    database_file = sys.argv[1]
    user_id = int(sys.argv[2])
    stop_file = sys.argv[3]

    # --------------------------------------------------------------------------
    # Pull stop words
    stopwords = tweetclean.import_stopwords(stop_file)

    # --------------------------------------------------------------------------
    # Read in the database
    query_tweets = "select id, contents as text from tweets where owner = %d;"
    users_tweets = {}

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row
    c = conn.cursor()

    for row in c.execute(query_tweets % user_id):
        if row['text'] is not None:
            users_tweets[row['id']] = \
                tweetclean.cleanup(row['text'], True, True)

    conn.close()

    # only words that are greater than one letter and not in the stopword list.
    texts = [[word for word in users_tweets[uid].split()
              if word not in stopwords and len(word) > 1]
             for uid in users_tweets]

    # remove words that appear only once
    all_tokens = sum(texts, [])
    tokens_once = set(word for word in set(all_tokens)
                      if all_tokens.count(word) == 1)
    texts = [[word for word in text if word not in tokens_once]
             for text in texts]

    dictionary = corpora.Dictionary(texts)
    # store the dictionary, for future reference
    dictionary.save('%d.dict' % user_id)

    corpus = [dictionary.doc2bow(text) for text in texts]
    # store to disk, for later use
    corpora.MmCorpus.serialize('%d.mm' % user_id, corpus)

    # is this different...
    corpus = corpora.MmCorpus('%d.mm' % user_id)

    model = models.ldamodel.LdaModel(
        corpus,
        id2word=dictionary,
        chunksize=100,
        passes=20,
        num_topics=100)
    model.save('%d.lda' % user_id)

    lda = models.ldamodel.LdaModel.load('%d.lda' % user_id)
    #lda.show_topics(topics=1, topn=1, log=False, formatted=True)

    # Unlike what the documentation might have you believe, you have to pull it
    # back as a string if you want to use it.
    topic_strings = lda.show_topics(topics=-1, formatted=True)

    print "#topics: %d" % len(topic_strings)

    for topic in topic_strings:
        print topic