Example 1
def main():

    # Did they provide the correct args?
    if len(sys.argv) != 6:
        usage()
        sys.exit(-1)

    cpus = multiprocessing.cpu_count()

    database_file = sys.argv[1]
    minimum = int(sys.argv[2])
    maximum = int(sys.argv[3])
    stop_file = sys.argv[4]
    output_folder = sys.argv[5]

    if minimum >= maximum:
        usage()
        sys.exit(-2)

    kickoff = \
"""
-------------------------------------------------------------------
parameters  :
    database  : %s
    minimum   : %d
    maximum   : %d
    output    : %s
    stop      : %s
-------------------------------------------------------------------
"""

    print kickoff % (database_file, minimum, maximum, output_folder, stop_file) 

    # --------------------------------------------------------------------------
    # Pull stop words
    stopwords = tweetclean.import_stopwords(stop_file)

    # --------------------------------------------------------------------------
    # Read in the database
    query_collect = \
        "select owner from tweets group by owner having count(*) >= %d and count(*) < %d;"

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row

    c = conn.cursor()

    # --------------------------------------------------------------------------
    # Search the database file for users.
    users = []
    start_time = time.clock()

    for row in c.execute(query_collect % (minimum, maximum)):
        users.append(row['owner'])

    print "%fs" % (time.clock() - start_time)

    conn.close()

    # --------------------------------------------------------------------------
    # Process those tweets by user set.

    cnt = int(math.ceil((float(len(users)) / cpus)))
    remains = len(users)
    threads = []

    for i in range(0, cpus):
        start = i * cnt

        if cnt > remains:
            cnt = remains

        t = threading.Thread(
                             target=thread_main,
                             args=(
                                   database_file,
                                   output_folder,
                                   users,
                                   stopwords,
                                   start,
                                   cnt,))
        threads.append(t)
        t.start()

        remains -= cnt

    # --------------------------------------------------------------------------
    # Done.

    for t in threads:
        t.join()
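Example 1 sizes its worker count with multiprocessing.cpu_count() but then launches threading.Thread workers, which CPython's GIL keeps from running CPU-bound work in parallel. A minimal sketch of the same chunked fan-out done with multiprocessing.Process instead; the worker argument stands in for thread_main, which is not shown in the listing:

import math
import multiprocessing

def launch_workers(database_file, output_folder, users, stopwords, worker):
    """Split users into one chunk per CPU and run worker over each chunk.

    worker is assumed to take the same arguments as thread_main above.
    """
    cpus = multiprocessing.cpu_count()
    cnt = int(math.ceil(float(len(users)) / cpus))
    remains = len(users)
    procs = []

    for i in range(cpus):
        start = i * cnt

        if cnt > remains:
            cnt = remains

        p = multiprocessing.Process(
            target=worker,
            args=(database_file, output_folder, users, stopwords, start, cnt))
        procs.append(p)
        p.start()

        remains -= cnt

    for p in procs:
        p.join()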
Example 2
def main():
    """Main."""

    if len(sys.argv) != 6:
        usage()
        sys.exit(-1)

    # --------------------------------------------------------------------------
    # Parse the parameters.
    database_file = sys.argv[1]
    minimum = int(sys.argv[2])
    maximum = int(sys.argv[3])
    stop_file = sys.argv[4]
    output_file = sys.argv[5]

    if minimum >= maximum:
        print "minimum is larger than maximum"
        usage()
        sys.exit(-2)

    # Pull stop words
    stopwords = tweetclean.import_stopwords(stop_file)

    kickoff = \
"""
-------------------------------------------------------------------
parameters  :
  database  : %s
  minimum   : %d
  maximum   : %d
  output    : %s
  stop      : %s
-------------------------------------------------------------------
"""

    print kickoff % (database_file, minimum, maximum, output_file, stop_file) 

    # query_collect alone won't return the 3 columns we care about, so it is
    # nested inside query_prefetch.
    query_collect = "select owner from tweets group by owner having count(*) >= %d and count(*) < %d"
    query_prefetch = "select owner, id, contents as text from tweets where owner in (%s);"

    query = query_prefetch % query_collect

    user_tweets = data_pull(database_file, query % (minimum, maximum))

    print "data pulled"
    print "user count: %d" % len(user_tweets)

    # --------------------------------------------------------------------------
    # Combine each user's tweets into one document per user.

    docperuser = {}  # dict: user id -> one concatenated document

    for user_id in user_tweets:
        docperuser[user_id] = " ".join(user_tweets[user_id])

    if len(docperuser) == 1:
        sys.stderr.write("Insufficient data for tf-idf, only 1 document\n")
        sys.exit(-3)

    tfidf, dictionary = vectorspace.build_doc_tfidf(docperuser, stopwords, True)
    
    # --------------------------------------------------------------------------
    # Build Centroid List
    centroids = []

    for doc, vec in tfidf.iteritems():
        centroids.append(centroid.Centroid(str(doc), vec))

    similarities = centroid.get_sims(centroids)
    average_sim = centroid.find_avg(centroids, True, similarities)
    stddev_sim = centroid.find_std(centroids, True, similarities)
    
    print "mean: %.10f\tstd: %.10f" % (average_sim, stddev_sim)
    
    # --------------------------------------------------------------------------
    # Merge centroids by highest similarity of at least threshold
    # the standard deviation is a distance; add it to the mean to position the threshold.
    threshold = (average_sim + stddev_sim)

    while len(centroids) > 1:
        print "centroids: %d" % len(centroids)
        i, j, sim = centroid.find_max(centroids)
        print "\t%d, %d, %f" % (i, j, sim)

        # @warning: This is fairly crap.
        if sim >= threshold:
            centroids[i].add_centroid(centroids[j])
            del centroids[j]
            print "merged with sim: %.10f" % sim
        else:
            break

    print "len(centroids): %d" % len(centroids)
    print "avg(centroids): %.10f" % average_sim
    print "std(centroids): %.10f" % stddev_sim
    
    for cen in centroids:
        print centroid.top_term_tuples(cen, 10)

    with open(output_file, "w") as fout:
        for cen in centroids:
            fout.write("%s\n" % cen)

    sys.exit(0)  # everything past this exit is unreachable

    # Maybe I should determine the top tf-idf values per document and then make
    # that my dictionary of terms. =)
    #
    # Originally, I intended to use clustering to get topics, but really those
    # are just high tf-idf terms that are common among certain documents...

    top_dict = set()

    for doc_id in tfidf:
        terms = vectorspace.top_terms(tfidf[doc_id], 250)
        #print "terms of %d: %s" % (doc_id, terms)
        for term in terms:
            top_dict.add(term)

    print "total top terms (not the set): %d" % (250 * len(tfidf))
    print "top dict: %d" % len(top_dict)

    # Dump the matrix.
    with open(output_file, "w") as fout:
        #fout.write(vectorspace.dump_raw_matrix(dictionary, tfidf) + "\n")
        fout.write(vectorspace.dump_raw_matrix(top_dict, tfidf) + "\n")
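The centroid module used above (Centroid, get_sims, find_avg, find_std, find_max) is not included in these listings. As a rough, self-contained sketch of the same merging rule -- repeatedly fold the most similar pair together while their similarity stays at or above mean + stddev of the initial pairwise similarities -- using plain dict vectors and cosine similarity in place of the real Centroid class:

import math

def cosine(a, b):
    # a, b: {term: weight} dicts.
    dot = sum(a[t] * b[t] for t in a if t in b)
    norm = math.sqrt(sum(v * v for v in a.values())) * \
           math.sqrt(sum(v * v for v in b.values()))
    return dot / norm if norm else 0.0

def merge_by_threshold(vectors):
    # vectors: list of {term: weight} dicts, e.g. one tf-idf vector per user.
    if len(vectors) < 2:
        return vectors

    # The threshold is fixed from the initial similarities, as in the example above.
    sims = [cosine(vectors[i], vectors[j])
            for i in range(len(vectors))
            for j in range(i + 1, len(vectors))]
    mean = sum(sims) / len(sims)
    std = math.sqrt(sum((s - mean) ** 2 for s in sims) / len(sims))
    threshold = mean + std

    while len(vectors) > 1:
        # find the currently most similar pair
        i, j, best = max(
            ((a, b, cosine(vectors[a], vectors[b]))
             for a in range(len(vectors))
             for b in range(a + 1, len(vectors))),
            key=lambda t: t[2])

        if best < threshold:
            break

        # merge j into i by summing term weights, then drop j
        for term, weight in vectors[j].items():
            vectors[i][term] = vectors[i].get(term, 0.0) + weight
        del vectors[j]

    return vectors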
Example 3
def main():
    """Main."""

    if len(sys.argv) != 6:
        usage()
        sys.exit(-1)

    # --------------------------------------------------------------------------
    # Parse the parameters.
    database_file = sys.argv[1]
    minimum = int(sys.argv[2])
    maximum = int(sys.argv[3])
    stop_file = sys.argv[4]
    output_file = sys.argv[5]

    if minimum >= maximum:
        print "minimum is larger than maximum"
        usage()
        sys.exit(-2)

    # Pull stop words
    stopwords = import_stopwords(stop_file)

    kickoff = \
"""
-------------------------------------------------------------------
parameters  :
  database  : %s
  minimum   : %d
  maximum   : %d
  output    : %s
  stop      : %s
-------------------------------------------------------------------
"""

    print kickoff % (database_file, minimum, maximum, output_file, stop_file)

    # query_collect alone won't return the 3 columns we care about, so it is
    # nested inside query_prefetch.
    query_collect = "select owner from tweets group by owner having count(*) >= %d and count(*) < %d"
    query_prefetch = "select owner, id, contents as text from tweets where owner in (%s);"

    query = query_prefetch % query_collect

    user_tweets = data_pull(database_file, query % (minimum, maximum))

    print "data pulled"
    print "user count: %d" % len(user_tweets)

    # --------------------------------------------------------------------------
    # Combine each user's tweets into one document per user.

    docperuser = {}  # dict: user id -> one concatenated document

    for user_id in user_tweets:
        docperuser[user_id] = " ".join(user_tweets[user_id])

    if len(docperuser) == 1:
        sys.stderr.write("Insufficient data for tf-idf, only 1 document\n")
        sys.exit(-3)

    tfidf, dictionary = build_doc_tfidf(docperuser, stopwords, True)

    # Maybe I should determine the top tf-idf values per document and then make
    # that my dictionary of terms. =)
    #
    # Originally, I intended to use clustering to get topics, but really those
    # are just high tf-idf terms that are common among certain documents...

    top_dict = set()

    for doc_id in tfidf:
        terms = top_terms(tfidf[doc_id], 250)
        for term in terms:
            top_dict.add(term)

    print "total top terms (not the set): %d" % (250 * len(tfidf))
    print "top dict: %d" % len(top_dict)

    # Dump the matrix.
    with open(output_file, "w") as fout:
        fout.write(dump_raw_matrix(top_dict, tfidf) + "\n")
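dump_raw_matrix is not shown in these listings. Purely to illustrate the shape of output this last step could produce -- one row per term in top_dict, one column per document, 0.0 where a document lacks the term -- a hypothetical CSV-style dump might look like this (not the project's actual format):

def dump_matrix_sketch(terms, tfidf):
    # terms: iterable of term strings; tfidf: {doc_id: {term: weight}}.
    doc_ids = sorted(tfidf)
    lines = ["term," + ",".join(str(doc) for doc in doc_ids)]

    for term in sorted(terms):
        row = [str(tfidf[doc].get(term, 0.0)) for doc in doc_ids]
        lines.append(term + "," + ",".join(row))

    return "\n".join(lines)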
Example 4
def main():
    """Main."""

    if len(sys.argv) != 2:
        usage()
        sys.exit(-1)

    # --------------------------------------------------------------------------
    # Parse the parameters.
    config = SafeConfigParser()
    config.read(sys.argv[1])

    database_file = config.get('input', 'database_file')
    year_val = config.getint('input', 'year')
    month_str = config.get('input', 'month')
    stop_file = config.get('input', 'stopwords')
    remove_singletons = config.getboolean('input', 'remove_singletons')
    build_images = {}
    build_images['rgb'] = config.getboolean('input', 'build_rgb_images')
    build_images['grey'] = config.getboolean('input', 'build_grey_images')
    build_csv_files = config.getboolean('input', 'build_csv_files')
    full_users_only = config.getboolean('input', 'full_users')

    # XXX: If full_users_only is not set to True, the images and such have
    # varying dimensions... which is bad.  So, there is a bug here and I have
    # yet to fully investigate it.

    if month_str not in MONTHS:
        usage()
        sys.exit(-2)

    output_set = {}

    for section in config.sections():
        if section.startswith("run"):
            output_folder = config.get(section, 'output_folder')

            output_set[section] = \
                Output(
                       output_folder,
                       config.getint(section, 'request_value'))

            try:
                stat(output_folder)
            except OSError:
                mkdir(output_folder)

    # --------------------------------------------------------------------------
    # Pull stop words
    stopwords = import_stopwords(stop_file)

    kickoff = \
"""
-------------------------------------------------------------------
parameters  :
  database  : %s
  date      : %s
  output    : %s
  stop      : %s
  count     : %s
  remove    : %s
  output    : %s
  full only : %s
-------------------------------------------------------------------
"""

    print kickoff % \
        (database_file,
         (month_str, year_val),
         str([output_set[output].get_folder() for output in output_set]),
         stop_file,
         str([output_set[output].get_request() for output in output_set]),
         remove_singletons,
         build_images,
         full_users_only)

    # The date is now stored as an integer (yyyymm), which can be more readily
    # searched and indexed than a text-field search with LIKE.
    query_prefetch = \
    "select owner, created, contents as text from tweets where yyyymm = %d;"

    # --------------------------------------------------------------------------
    # Build a set of documents, per user, per day.
    num_days = monthrange(year_val, int(MONTHS[month_str]))[1]
    user_data = \
        data_pull(
                  database_file,
                  query_prefetch % \
                    int(str_yearmonth(year_val, int(MONTHS[month_str]))))

    if len(user_data) < 2:
        print "empty dataset."
        sys.exit(-3)

    # you want full users only if you're running matrix completion stuff.
    if full_users_only:
        users = frame.find_full_users(user_data, stopwords, num_days)
    else:
        users = frame.find_valid_users(user_data, stopwords)

    print "data pulled"
    print "user count: %d\tframe users: %d" % (len(user_data), len(users))

    # This is only an issue at present because, for the video analysis code,
    # the users don't have to be full.
    if len(users) < 2:
        print "no full users"
        sys.exit(-4)

    # --------------------------------------------------------------------------
    # I don't build a master tf-idf set because the tf-idf values should
    # evolve over the period -- although I don't think I'm correctly adjusting
    # them, I'm just recalculating them.
    #
    # Calculate daily tf-idf; then build frame from top terms over the period
    # of days.
    frames = {}

    for day in range(1, num_days + 1):
        # This is run once per day overall.
        frames[day] = frame.build_full_frame(users, user_data, day)
        frames[day].calculate_tfidf(stopwords, remove_singletons)

        if frames[day].tfidf_len() == 0:
            print "weird data error."
            sys.exit(-5)

        # This is run once per day per output.
        for output in output_set:
            out = output_set[output]
            out.add_terms(frames[day].top_terms_overall(out.get_request()))

            # get_range() returns whatever range was computed the last time
            # top_terms_overall was run.
            new_range = frames[day].get_range()

            # This way the images are created with the correct range to cover
            # all of them.
            if out.max_range < new_range:
                out.max_range = new_range

        #break
        #if day == 3:
        #break # just do first day.

    print "Frames created"

    # len(overall_terms) should be at most 250 * num_users * num_days -- if
    # there is no overlap of high value terms over the period of days between
    # the users.  If there is literally no overlap then each user will have
    # their own 250 terms each day.

    # --------------------------------------------------------------------------
    # Dump the matrix.
    output_matrix(frames, output_set, build_csv_files, build_images)
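This variant reads everything from an INI file through SafeConfigParser instead of positional arguments. A guessed example of what that file could look like, based only on the keys read above; the values, section name and month format are placeholders, not taken from the project:

EXAMPLE_CONFIG = """
[input]
database_file = tweets.db
year = 2011
month = jan
stopwords = stopwords.txt
remove_singletons = yes
build_rgb_images = yes
build_grey_images = no
build_csv_files = yes
full_users = yes

[run1]
output_folder = output/run1
request_value = 250
"""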
Example 5
def main():
    """Main."""

    if len(sys.argv) != 6:
        usage()
        sys.exit(-1)

    # --------------------------------------------------------------------------
    # Parse the parameters.
    database_file = sys.argv[1]
    minimum = int(sys.argv[2])
    maximum = int(sys.argv[3])
    stop_file = sys.argv[4]
    output_file = sys.argv[5]

    if minimum >= maximum:
        print "minimum is larger than maximum"
        usage()
        sys.exit(-2)

    # Pull stop words
    stopwords = import_stopwords(stop_file)

    kickoff = \
"""
-------------------------------------------------------------------
parameters  :
  database  : %s
  minimum   : %d
  maximum   : %d
  output    : %s
  stop      : %s
-------------------------------------------------------------------
"""

    print kickoff % (database_file, minimum, maximum, output_file, stop_file) 

    # query_collect alone won't return the 3 columns we care about, so it is
    # nested inside query_prefetch.
    query_collect = "select owner from tweets group by owner having count(*) >= %d and count(*) < %d"
    query_prefetch = "select owner, id, contents as text from tweets where owner in (%s);"

    query = query_prefetch % query_collect

    user_tweets = data_pull(database_file, query % (minimum, maximum))

    print "data pulled"
    print "user count: %d" % len(user_tweets)

    # --------------------------------------------------------------------------
    # Combine each user's tweets into one document per user.

    docperuser = {}  # dict: user id -> one concatenated document

    for user_id in user_tweets:
        docperuser[user_id] = " ".join(user_tweets[user_id])

    if len(docperuser) == 1:
        sys.stderr.write("Insufficient data for tf-idf, only 1 document\n")
        sys.exit(-3)

    tfidf, dictionary = build_doc_tfidf(docperuser, stopwords, True)

    # Maybe I should determine the top tf-idf values per document and then make
    # that my dictionary of terms. =)
    #
    # Originally, I intended to use clustering to get topics, but really those
    # are just high tf-idf terms that are common among certain documents...

    top_dict = set()

    for doc_id in tfidf:
        terms = top_terms(tfidf[doc_id], 250)
        for term in terms:
            top_dict.add(term)

    print "total top terms (not the set): %d" % (250 * len(tfidf))
    print "top dict: %d" % len(top_dict)

    # Dump the matrix.
    with open(output_file, "w") as fout:
        fout.write(dump_raw_matrix(top_dict, tfidf) + "\n")
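data_pull is called here and in Examples 2-4 but never listed. Judging from how its result is used (user_tweets[user_id] is fed to " ".join) and from the inline query loop in Example 7, a hypothetical reconstruction returning a dict of owner -> list of cleaned tweet texts could look like this; the real function may differ:

import sqlite3

import tweetclean  # assumed to provide cleanup(), as in Examples 7 and 8

def data_pull(database_file, query):
    # Hypothetical reconstruction: run the query and group the cleaned tweet
    # text by owner.
    user_tweets = {}

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row
    c = conn.cursor()

    for row in c.execute(query):
        if row['text'] is None:
            continue
        text = tweetclean.cleanup(row['text'], True, True)
        user_tweets.setdefault(row['owner'], []).append(text)

    conn.close()

    return user_tweets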
Example 6
def main():

    # Did they provide the correct args?
    if len(sys.argv) != 6:
        usage()
        sys.exit(-1)

    cpus = multiprocessing.cpu_count()

    database_file = sys.argv[1]
    minimum = int(sys.argv[2])
    maximum = int(sys.argv[3])
    stop_file = sys.argv[4]
    output_folder = sys.argv[5]

    if minimum >= maximum:
        usage()
        sys.exit(-2)

    kickoff = \
"""
-------------------------------------------------------------------
parameters  :
    database  : %s
    minimum   : %d
    maximum   : %d
    output    : %s
    stop      : %s
-------------------------------------------------------------------
"""

    print kickoff % (database_file, minimum, maximum, output_folder, stop_file)

    # --------------------------------------------------------------------------
    # Pull stop words
    stopwords = tweetclean.import_stopwords(stop_file)

    # --------------------------------------------------------------------------
    # Read in the database
    query_collect = \
        "select owner from tweets group by owner having count(*) >= %d and count(*) < %d;"

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row

    c = conn.cursor()

    # --------------------------------------------------------------------------
    # Search the database file for users.
    users = []
    start_time = time.clock()

    for row in c.execute(query_collect % (minimum, maximum)):
        users.append(row['owner'])

    print "%fs" % (time.clock() - start_time)

    conn.close()

    # --------------------------------------------------------------------------
    # Process those tweets by user set.

    cnt = int(math.ceil((float(len(users)) / cpus)))
    remains = len(users)
    threads = []

    for i in range(0, cpus):
        start = i * cnt

        if cnt > remains:
            cnt = remains

        t = threading.Thread(target=thread_main,
                             args=(
                                 database_file,
                                 output_folder,
                                 users,
                                 stopwords,
                                 start,
                                 cnt,
                             ))
        threads.append(t)
        t.start()

        remains -= cnt

    # --------------------------------------------------------------------------
    # Done.

    for t in threads:
        t.join()
Example 7
def main():
    """."""

    # Did they provide the correct args?
    if len(sys.argv) != 6:
        usage()
        sys.exit(-1)

    cpus = multiprocessing.cpu_count()

    # --------------------------------------------------------------------------
    # Parse the parameters.
    database_file = sys.argv[1]
    minimum = int(sys.argv[2])
    maximum = int(sys.argv[3])
    stop_file = sys.argv[4]
    output_folder = sys.argv[5]

    if minimum >= maximum:
        usage()
        sys.exit(-2)

    # Pull stop words
    stopwords = tweetclean.import_stopwords(stop_file)

    kickoff = \
"""
-------------------------------------------------------------------
parameters  :
    database  : %s
    minimum   : %d
    maximum   : %d
    output    : %s
    stop      : %s
-------------------------------------------------------------------
"""

    print kickoff % (database_file, minimum, maximum, output_folder, stop_file)

    # query_collect alone won't return the 3 columns we care about, so it is
    # nested inside query_prefetch below.
    query_collect = \
        "select owner from tweets group by owner having count(*) >= %d and count(*) < %d"
    # "select id, contents as text from tweets where owner = %d;"
    query_prefetch = \
        "select owner, id, contents as text from tweets where owner in (%s);"

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row

    c = conn.cursor()

    print "#cpus: %d" % cpus

    # --------------------------------------------------------------------------
    # Search the database file for users.
    users = []
    users_tweets = {}

    start = time.clock()

    query = query_prefetch % query_collect

    for row in c.execute(query % (minimum, maximum)):
        uid = row['owner']
        if uid not in users:
            users.append(uid)
        if row['text'] is not None:
            data = tweetclean.cleanup(row['text'], True, True)
            try:
                users_tweets[uid][row['id']] = data
            except KeyError:
                users_tweets[uid] = {}
                users_tweets[uid][row['id']] = data

    print "query time: %fm" % ((time.clock() - start) / 60)
    print "users: %d\n" % len(users)

    conn.close()

    # --------------------------------------------------------------------------
    # Process those tweets by user set.

    print "usr\tcnt\tavg\tstd\tend\tdur"

    cnt = int(math.ceil((float(len(users)) / cpus)))
    remains = len(users)
    threads = []

    for i in range(0, cpus):
        start = i * cnt

        if cnt > remains:
            cnt = remains

        print "launching thread: %d, %d" % (start, cnt)

        t = threading.Thread(target=thread_main,
                             args=(
                                 output_folder,
                                 users,
                                 users_tweets,
                                 stopwords,
                                 start,
                                 cnt,
                             ))
        threads.append(t)
        t.start()

        remains -= cnt
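The try/except KeyError block that fills users_tweets above can be written more directly with collections.defaultdict. A small self-contained sketch of the same grouping step, with rows standing in for the sqlite cursor results:

from collections import defaultdict

def group_tweets(rows):
    # rows: iterable of (owner, tweet_id, text) tuples; None texts are skipped.
    grouped = defaultdict(dict)

    for owner, tweet_id, text in rows:
        if text is not None:
            grouped[owner][tweet_id] = text

    return grouped

# group_tweets([(1, 10, "hello"), (1, 11, "world"), (2, 12, None)]) groups
# owner 1's two tweets by id and drops owner 2's empty tweet entirely.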
Example 8
def main():

    # Did they provide the correct args?
    if len(sys.argv) != 4:
        usage()
        sys.exit(-1)

    database_file = sys.argv[1]
    user_id = int(sys.argv[2])
    stop_file = sys.argv[3]

    # --------------------------------------------------------------------------
    # Pull stop words
    stopwords = tweetclean.import_stopwords(stop_file)

    # --------------------------------------------------------------------------
    # Read in the database
    query_tweets = "select id, contents as text from tweets where owner = %d;"
    users_tweets = {}

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row

    c = conn.cursor()

    for row in c.execute(query_tweets % user_id):
        if row['text'] is not None:
            users_tweets[row['id']] = \
                tweetclean.cleanup(row['text'], True, True)

    conn.close()

    # keep only words that are longer than one letter and not in the stopword list.
    texts = [[word for word in users_tweets[uid].split() \
              if word not in stopwords and len(word) > 1] \
                for uid in users_tweets]

    # remove words that appear only once
    all_tokens = sum(texts, [])
    tokens_once = set(word for word in set(all_tokens) \
                      if all_tokens.count(word) == 1)
    texts = [[word for word in text \
              if word not in tokens_once] for text in texts]

    dictionary = corpora.Dictionary(texts)
    # store the dictionary, for future reference
    dictionary.save('%d.dict' % user_id)

    corpus = [dictionary.doc2bow(text) for text in texts]
    # store to disk, for later use
    corpora.MmCorpus.serialize('%d.mm' % user_id, corpus)

    # is this different...
    corpus = corpora.MmCorpus('%d.mm' % user_id)

    model = models.ldamodel.LdaModel(corpus,
                                     id2word=dictionary,
                                     chunksize=100,
                                     passes=20,
                                     num_topics=100)
    model.save('%d.lda' % user_id)

    lda = models.ldamodel.LdaModel.load('%d.lda' % user_id)

    #lda.show_topics(topics=1, topn=1, log=False, formatted=True)
    # Unlike what the documentation might have you believe, you have to pull it
    # back as a string if you want to use it.
    topic_strings = lda.show_topics(topics=-1, formatted=True)
    print "#topics: %d" % len(topic_strings)
    for topic in topic_strings:
        print topic
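Once the dictionary and LDA model are saved, the usual gensim pattern for scoring a new piece of text is to map it through the same dictionary and index the model with the resulting bag-of-words. A brief sketch; the '1234' filenames are placeholders for the per-user files written above, and the sample text is made up:

from gensim import corpora, models

dictionary = corpora.Dictionary.load('1234.dict')  # saved as '%d.dict' % user_id
lda = models.ldamodel.LdaModel.load('1234.lda')    # saved as '%d.lda' % user_id

bow = dictionary.doc2bow("some new tweet text".lower().split())
print lda[bow]  # list of (topic_id, probability) pairs for this document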