import math
import multiprocessing
import sqlite3
import sys
import threading
import time

import tweetclean


def main():
    # Did they provide the correct args?
    if len(sys.argv) != 6:
        usage()
        sys.exit(-1)

    cpus = multiprocessing.cpu_count()

    database_file = sys.argv[1]
    minimum = int(sys.argv[2])
    maximum = int(sys.argv[3])
    stop_file = sys.argv[4]
    output_folder = sys.argv[5]

    if minimum >= maximum:
        usage()
        sys.exit(-2)

    kickoff = \
"""
-------------------------------------------------------------------
parameters :
    database : %s
    minimum  : %d
    maximum  : %d
    output   : %s
    stop     : %s
-------------------------------------------------------------------
"""

    print kickoff % (database_file, minimum, maximum, output_folder, stop_file)

    # --------------------------------------------------------------------------
    # Pull stop words
    stopwords = tweetclean.import_stopwords(stop_file)

    # --------------------------------------------------------------------------
    # Read in the database
    query_collect = \
        "select owner from tweets group by owner having count(*) >= %d and count(*) < %d;"

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row
    c = conn.cursor()

    # --------------------------------------------------------------------------
    # Search the database file for users.
    users = []
    start_time = time.clock()

    for row in c.execute(query_collect % (minimum, maximum)):
        users.append(row['owner'])

    print "%fs" % (time.clock() - start_time)

    conn.close()

    # --------------------------------------------------------------------------
    # Process those tweets by user set.  Each thread gets a contiguous slice
    # of roughly len(users) / cpus users; the final slice may be shorter.
    cnt = int(math.ceil(float(len(users)) / cpus))
    remains = len(users)
    threads = []

    for i in range(0, cpus):
        start = i * cnt

        if cnt > remains:
            cnt = remains

        t = threading.Thread(
            target=thread_main,
            args=(database_file, output_folder, users, stopwords, start, cnt))
        threads.append(t)
        t.start()

        remains -= cnt

    # --------------------------------------------------------------------------
    # Done.
    for t in threads:
        t.join()
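# thread_main is referenced above but defined elsewhere in this repo; the
# sketch below is a hypothetical stand-in inferred from the call site, not
# the project's actual worker.  Each thread opens its own sqlite3 connection
# (connections must not be shared across threads) and walks its slice
# users[start:start + cnt].
def thread_main(database_file, output_folder, users, stopwords, start, cnt):
    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row
    c = conn.cursor()

    for user in users[start:start + cnt]:
        # Fetch this user's tweets; clean them against stopwords and write
        # per-user output under output_folder.
        for row in c.execute(
                "select id, contents as text from tweets where owner = %d;" % user):
            pass  # placeholder: per-user processing goes here

    conn.close()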
def main(): """Main.""" if len(sys.argv) != 6: usage() sys.exit(-1) # -------------------------------------------------------------------------- # Parse the parameters. database_file = sys.argv[1] minimum = int(sys.argv[2]) maximum = int(sys.argv[3]) stop_file = sys.argv[4] output_file = sys.argv[5] if minimum >= maximum: print "minimum is larger than maximum" usage() sys.exit(-2) # Pull stop words stopwords = tweetclean.import_stopwords(stop_file) kickoff = \ """ ------------------------------------------------------------------- parameters : database : %s minimum : %d maximum : %d output : %s stop : %s ------------------------------------------------------------------- """ print kickoff % (database_file, minimum, maximum, output_file, stop_file) # this won't return the 3 columns we care about. query_collect = "select owner from tweets group by owner having count(*) >= %d and count(*) < %d" query_prefetch = "select owner, id, contents as text from tweets where owner in (%s);" query = query_prefetch % query_collect user_tweets = data_pull(database_file, query % (minimum, maximum)) print "data pulled" print "user count: %d" % len(user_tweets) # -------------------------------------------------------------------------- # Convert to a documents into one document per user. docperuser = {} # array representing all the tweets for each user. for user_id in user_tweets: docperuser[user_id] = " ".join(user_tweets[user_id]) if len(docperuser) == 1: sys.stderr.write("Insufficient data for tf-idf, only 1 document\n") sys.exit(-3) tfidf, dictionary = vectorspace.build_doc_tfidf(docperuser, stopwords, True) # -------------------------------------------------------------------------- # Build Centroid List centroids = [] for doc, vec in tfidf.iteritems(): centroids.append(centroid.Centroid(str(doc), vec)) similarities = centroid.get_sims(centroids) average_sim = centroid.find_avg(centroids, True, similarities) stddev_sim = centroid.find_std(centroids, True, similarities) print "mean: %.10f\tstd: %.10f" % (average_sim, stddev_sim) # -------------------------------------------------------------------------- # Merge centroids by highest similarity of at least threshold # the standard deviation is a distance, for the value you must position it. threshold = (average_sim + stddev_sim) while len(centroids) > 1: print "centroids: %d" % len(centroids) i, j, sim = centroid.find_max(centroids) print "\t%d, %d, %f" % (i, j, sim) # @warning: This is fairly crap. if sim >= threshold: centroids[i].add_centroid(centroids[j]) del centroids[j] print "merged with sim: %.10f" % sim else: break print "len(centroids): %d" % len(centroids) print "avg(centroids): %.10f" % average_sim print "std(centroids): %.10f" % stddev_sim for cen in centroids: print centroid.top_term_tuples(cen, 10) with open(output_file, "w") as fout: for cen in centroids: fout.write("%s\n" % cen) sys.exit(0) # Maybe I should determine the top tf-idf values per document and then make # that my dictionary of terms. =) # # Originally, I intended to use clustering to get topics, but really those # are just high tf-idf terms that are common among certain documents... top_dict = set() for doc_id in tfidf: terms = vectorspace.top_terms(tfidf[doc_id], 250) #print "terms of %d: %s" % (doc_id, terms) for term in terms: top_dict.add(term) print "total top terms (not the set): %d" % (250 * len(tfidf)) print "top dict: %d" % len(top_dict) # Dump the matrix. 
with open(output_file, "w") as fout: #fout.write(vectorspace.dump_raw_matrix(dictionary, tfidf) + "\n") fout.write(vectorspace.dump_raw_matrix(top_dict, tfidf) + "\n")
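# data_pull is called above but defined elsewhere; this is a minimal sketch
# inferred from the call site and from the inline prefetch loop in the
# threaded variant of this script, not the project's actual helper.  It runs
# the query and buckets cleaned tweet text by owner.
def data_pull(database_file, query):
    """Return {owner: [cleaned tweet text, ...]} for the given query."""
    user_tweets = {}

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row

    for row in conn.cursor().execute(query):
        if row['text'] is not None:
            data = tweetclean.cleanup(row['text'], True, True)
            user_tweets.setdefault(row['owner'], []).append(data)

    conn.close()

    return user_tweets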
def main(): """Main.""" if len(sys.argv) != 6: usage() sys.exit(-1) # -------------------------------------------------------------------------- # Parse the parameters. database_file = sys.argv[1] minimum = int(sys.argv[2]) maximum = int(sys.argv[3]) stop_file = sys.argv[4] output_file = sys.argv[5] if minimum >= maximum: print "minimum is larger than maximum" usage() sys.exit(-2) # Pull stop words stopwords = import_stopwords(stop_file) kickoff = \ """ ------------------------------------------------------------------- parameters : database : %s minimum : %d maximum : %d output : %s stop : %s ------------------------------------------------------------------- """ print kickoff % (database_file, minimum, maximum, output_file, stop_file) # this won't return the 3 columns we care about. query_collect = "select owner from tweets group by owner having count(*) >= %d and count(*) < %d" query_prefetch = "select owner, id, contents as text from tweets where owner in (%s);" query = query_prefetch % query_collect user_tweets = data_pull(database_file, query % (minimum, maximum)) print "data pulled" print "user count: %d" % len(user_tweets) # -------------------------------------------------------------------------- # Convert to a documents into one document per user. docperuser = {} # array representing all the tweets for each user. for user_id in user_tweets: docperuser[user_id] = " ".join(user_tweets[user_id]) if len(docperuser) == 1: sys.stderr.write("Insufficient data for tf-idf, only 1 document\n") sys.exit(-3) tfidf, dictionary = build_doc_tfidf(docperuser, stopwords, True) # Maybe I should determine the top tf-idf values per document and then make # that my dictionary of terms. =) # # Originally, I intended to use clustering to get topics, but really those # are just high tf-idf terms that are common among certain documents... top_dict = set() for doc_id in tfidf: terms = top_terms(tfidf[doc_id], 250) for term in terms: top_dict.add(term) print "total top terms (not the set): %d" % (250 * len(tfidf)) print "top dict: %d" % len(top_dict) # Dump the matrix. with open(output_file, "w") as fout: fout.write(dump_raw_matrix(top_dict, tfidf) + "\n")
def main(): """Main.""" if len(sys.argv) != 2: usage() sys.exit(-1) # -------------------------------------------------------------------------- # Parse the parameters. config = SafeConfigParser() config.read(sys.argv[1]) database_file = config.get('input', 'database_file') year_val = config.getint('input', 'year') month_str = config.get('input', 'month') stop_file = config.get('input', 'stopwords') remove_singletons = config.getboolean('input', 'remove_singletons') build_images = {} build_images['rgb'] = config.getboolean('input', 'build_rgb_images') build_images['grey'] = config.getboolean('input', 'build_grey_images') build_csv_files = config.getboolean('input', 'build_csv_files') full_users_only = config.getboolean('input', 'full_users') # XXX: If full_users_only is not set to True, the images and such have # varying dimensions... which is bad. So, there is a bug here and I have # yet to fully investigate it. if month_str not in MONTHS: usage() sys.exit(-2) output_set = {} for section in config.sections(): if section.startswith("run"): output_folder = config.get(section, 'output_folder') output_set[section] = \ Output( output_folder, config.getint(section, 'request_value')) try: stat(output_folder) except OSError: mkdir(output_folder) # -------------------------------------------------------------------------- # Pull stop words stopwords = import_stopwords(stop_file) kickoff = \ """ ------------------------------------------------------------------- parameters : database : %s date : %s output : %s stop : %s count : %s remove : %s output : %s full only : %s ------------------------------------------------------------------- """ print kickoff % \ (database_file, (month_str, year_val), str([output_set[output].get_folder() for output in output_set]), stop_file, str([output_set[output].get_request() for output in output_set]), remove_singletons, build_images, full_users_only) # now that it's an integer lookup that can be more readily searched and # indexed versus a text field search with like. query_prefetch = \ "select owner, created, contents as text from tweets where yyyymm = %d;" # -------------------------------------------------------------------------- # Build a set of documents, per user, per day. num_days = monthrange(year_val, int(MONTHS[month_str]))[1] user_data = \ data_pull( database_file, query_prefetch % \ int(str_yearmonth(year_val, int(MONTHS[month_str])))) if len(user_data) < 2: print "empty dataset." sys.exit(-3) # you want full users only if you're running matrix completion stuff. if full_users_only: users = frame.find_full_users(user_data, stopwords, num_days) else: users = frame.find_valid_users(user_data, stopwords) print "data pulled" print "user count: %d\tframe users: %d" % (len(user_data), len(users)) # this is only an issue at present. mind you, because for the video # analysis code the users don't have to be full. if len(users) < 2: print "no full users" sys.exit(-4) # -------------------------------------------------------------------------- # I don't build a master tf-idf set because the tf-idf values should... # evolve. albeit, I don't think I'm correctly adjusting them -- I'm just # recalculating then. # # Calculate daily tf-idf; then build frame from top terms over the period # of days. frames = {} for day in range(1, num_days + 1): # This is run once per day overall. frames[day] = frame.build_full_frame(users, user_data, day) frames[day].calculate_tfidf(stopwords, remove_singletons) if frames[day].tfidf_len() == 0: print "weird data error." 
sys.exit(-5) # This is run once per day per output. for output in output_set: out = output_set[output] out.add_terms(frames[day].top_terms_overall(out.get_request())) # get_range() is just whatever the last time you ran # top_terms_overall new_range = frames[day].get_range() # This way the images are created with the correct range to cover # all of them. if out.max_range < new_range: out.max_range = new_range #break #if day == 3: #break # just do first day. print "Frames created" # len(overall_terms) should be at most 250 * num_users * num_days -- if # there is no overlap of high value terms over the period of days between # the users. If there is literally no overlap then each user will have # their own 250 terms each day. # -------------------------------------------------------------------------- # Dump the matrix. output_matrix(frames, output_set, build_csv_files, build_images)
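# An illustrative config file for the script above.  The section and key
# names are taken from the config.get* calls; every value is made up, and the
# month format depends on the keys of the MONTHS table (not shown here).
#
#   [input]
#   database_file = tweets.db
#   year = 2011
#   month = jan
#   stopwords = stopwords.txt
#   remove_singletons = no
#   build_rgb_images = yes
#   build_grey_images = no
#   build_csv_files = yes
#   full_users = yes
#
#   [run1]
#   output_folder = run1_output
#   request_value = 250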
def main(): """.""" # Did they provide the correct args? if len(sys.argv) != 6: usage() sys.exit(-1) cpus = multiprocessing.cpu_count() # -------------------------------------------------------------------------- # Parse the parameters. database_file = sys.argv[1] minimum = int(sys.argv[2]) maximum = int(sys.argv[3]) stop_file = sys.argv[4] output_folder = sys.argv[5] if minimum >= maximum: usage() sys.exit(-2) # Pull stop words stopwords = tweetclean.import_stopwords(stop_file) kickoff = \ """ ------------------------------------------------------------------- parameters : database : %s minimum : %d maximum : %d output : %s stop : %s ------------------------------------------------------------------- """ print kickoff % (database_file, minimum, maximum, output_folder, stop_file) # this won't return the 3 columns we care about. query_collect = \ "select owner from tweets group by owner having count(*) >= %d and count(*) < %d" # "select id, contents as text from tweets where owner = %d;" query_prefetch = \ "select owner, id, contents as text from tweets where owner in (%s);" conn = sqlite3.connect(database_file) conn.row_factory = sqlite3.Row c = conn.cursor() print "#cpus: %d" % cpus # -------------------------------------------------------------------------- # Search the database file for users. users = [] users_tweets = {} start = time.clock() query = query_prefetch % query_collect for row in c.execute(query % (minimum, maximum)): uid = row['owner'] if uid not in users: users.append(uid) if row['text'] is not None: data = tweetclean.cleanup(row['text'], True, True) try: users_tweets[uid][row['id']] = data except KeyError: users_tweets[uid] = {} users_tweets[uid][row['id']] = data print "query time: %fm" % ((time.clock() - start) / 60) print "users: %d\n" % len(users) conn.close() # -------------------------------------------------------------------------- # Process those tweets by user set. print "usr\tcnt\tavg\tstd\tend\tdur" cnt = int(math.ceil((float(len(users)) / cpus))) remains = len(users) threads = [] for i in range(0, cpus): start = i * cnt if cnt > remains: cnt = remains print "launching thread: %d, %d" % (start, cnt) t = threading.Thread(target=thread_main, args=( output_folder, users, users_tweets, stopwords, start, cnt, )) threads.append(t) t.start() remains -= cnt
import sqlite3
import sys

from gensim import corpora, models

import tweetclean


def main():
    # Did they provide the correct args?
    if len(sys.argv) != 4:
        usage()
        sys.exit(-1)

    database_file = sys.argv[1]
    user_id = int(sys.argv[2])
    stop_file = sys.argv[3]

    # --------------------------------------------------------------------------
    # Pull stop words
    stopwords = tweetclean.import_stopwords(stop_file)

    # --------------------------------------------------------------------------
    # Read in the database
    query_tweets = "select id, contents as text from tweets where owner = %d;"
    users_tweets = {}

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row
    c = conn.cursor()

    for row in c.execute(query_tweets % user_id):
        if row['text'] is not None:
            users_tweets[row['id']] = \
                tweetclean.cleanup(row['text'], True, True)

    conn.close()

    # Keep only words that are longer than one letter and not in the stopword
    # list.
    texts = [[word for word in users_tweets[uid].split()
              if word not in stopwords and len(word) > 1]
             for uid in users_tweets]

    # Remove words that appear only once.
    all_tokens = sum(texts, [])
    tokens_once = set(word for word in set(all_tokens)
                      if all_tokens.count(word) == 1)
    texts = [[word for word in text if word not in tokens_once]
             for text in texts]

    dictionary = corpora.Dictionary(texts)
    # Store the dictionary, for future reference.
    dictionary.save('%d.dict' % user_id)

    corpus = [dictionary.doc2bow(text) for text in texts]
    # Store to disk, for later use.
    corpora.MmCorpus.serialize('%d.mm' % user_id, corpus)

    # Re-read the corpus from disk; MmCorpus streams documents rather than
    # holding the whole list in memory.
    corpus = corpora.MmCorpus('%d.mm' % user_id)

    model = models.ldamodel.LdaModel(
        corpus,
        id2word=dictionary,
        chunksize=100,
        passes=20,
        num_topics=100)
    model.save('%d.lda' % user_id)

    lda = models.ldamodel.LdaModel.load('%d.lda' % user_id)
    #lda.show_topics(topics=1, topn=1, log=False, formatted=True)

    # Unlike what the documentation might have you believe, you have to pull
    # the topics back as formatted strings if you want to print them.
    topic_strings = lda.show_topics(topics=-1, formatted=True)
    print "#topics: %d" % len(topic_strings)
    for topic in topic_strings:
        print topic
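# Usage sketch: the dictionary, corpus, and model saved above can be reloaded
# later without retraining (standard gensim API; '1234567' stands in for a
# real user id).
dictionary = corpora.Dictionary.load('1234567.dict')
corpus = corpora.MmCorpus('1234567.mm')
lda = models.ldamodel.LdaModel.load('1234567.lda')

for topic in lda.show_topics(formatted=True):
    print topic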