def main():
    """Seed the backing store: build the city corpora, populate the
    words table, and record per-city and total tweet counts plus
    selected features.

    NOTE(review): depends on module-level ``data``, ``feature_selection``,
    and ``cities`` defined elsewhere in this file -- confirm they are
    initialized before this runs.
    """
    data.city_corpus_dict()
    #seeds words table with city, word, prob(w/city)
    data.seed_words_table()
    data.city_tweet_corpus_dict()
    for city in cities:
        # Per-city pass: tweet count first, then feature population.
        data.create_region_tweet_count(city)
        feature_selection.populate_db_with_features(city)
    data.create_tweet_total_count()
def main():
    """Seed the backing store: build the city corpora, populate the
    words table, and record per-city and total tweet counts plus
    selected features.

    Depends on module-level ``data``, ``feature_selection``, and
    ``cities`` defined elsewhere in this file.
    """
    data.city_corpus_dict()
    # Seeds the words table with (city, word, P(word | city)) rows.
    data.seed_words_table()
    data.city_tweet_corpus_dict()
    for city in cities:
        # Loop body re-indented with 4 spaces: the original mixed tabs
        # into a space-indented function, which is a TabError in Python 3.
        data.create_region_tweet_count(city)
        feature_selection.populate_db_with_features(city)
    data.create_tweet_total_count()
def classify_text():
    """Classify the POSTed tweet, time each stage, and render the map view.

    Reads the tweet from ``request.form``; relies on module-level ``data``,
    ``feature_selection``, and Flask's ``request``/``render_template``.
    The timing prints are deliberate lightweight instrumentation.
    """
    tweet = request.form['tweet']

    start = datetime.datetime.now()
    rankings = data.create_ranking(tweet)
    end = datetime.datetime.now()
    # print-statement syntax replaced with a call (valid in Python 2 and 3;
    # the tab-indented lines of the original body were a TabError in Py3).
    print('getting city rankings takes: %s' % (end - start))

    start = datetime.datetime.now()
    top_5_words = feature_selection.top_words_in_tweet(rankings[0][0], tweet)
    end = datetime.datetime.now()
    print('getting top 5 words takes: %s' % (end - start))

    start = datetime.datetime.now()
    cty_corpus_dict = data.city_corpus_dict()
    # NOTE(review): word_count_dict is never used below; the lookup is kept
    # only so the self-described "bogus" timing print still measures it.
    word_count_dict = cty_corpus_dict[rankings[0][0].name]
    end = datetime.datetime.now()
    print('getting bogus word count dict takes: %s' % (end - start))

    start = datetime.datetime.now()
    # list() copy replaces the manual append loop.
    final_result = list(top_5_words)
    # Comprehension replaces the range(len(...)) index loop.
    names = [ranking[0].name for ranking in rankings]
    end = datetime.datetime.now()
    print('generating lists takes: %s' % (end - start))
    return render_template("map.html", tweet=tweet, names=names,
                           rankings=rankings, final_result=final_result)
def get_corpus():
    """Return the distinct words appearing in any city's corpus.

    Under Python 2, ``dict.keys()`` is a plain list, so callers receive a
    list of unique words. Relies on module-level ``data`` and ``cities``.
    """
    # Body re-indented with 4 spaces: the original mixed tabs into a
    # space-indented function, which is a TabError in Python 3.
    corpus = {}
    city_to_dict_of_words_to_counts = data.city_corpus_dict()
    for city in cities:
        # Iterating a dict yields its keys directly; no .keys() needed.
        for word in city_to_dict_of_words_to_counts[city.name]:
            corpus[word] = True
    return corpus.keys()
def get_corpus():
    """Collect every distinct word found across all city corpora.

    Uses a dict as a de-duplicating set and returns its keys, matching
    the surrounding code's Python 2 conventions.
    """
    per_city_words = data.city_corpus_dict()
    merged = {}
    # Fold each city's vocabulary into one dict keyed by word.
    for current in cities:
        merged.update(dict.fromkeys(per_city_words[current.name].keys(), True))
    return merged.keys()
def classify_text():
    """Classify the POSTed tweet against every city and render the map view.

    Gathers, per city: corpus length, tweet count, and the feature strings
    matched in the tweet. Relies on module-level ``data``,
    ``feature_selection``, ``cities``, and Flask's
    ``request``/``render_template``. Timing prints are deliberate
    lightweight instrumentation.
    """
    tweet = request.form['tweet']

    start = datetime.datetime.now()
    rankings = data.create_ranking(tweet)
    end = datetime.datetime.now()
    # print-statement syntax replaced with a call (valid in Python 2 and 3).
    print('getting city rankings takes: %s' % (end - start))

    start = datetime.datetime.now()
    feature_strings_dict = {}
    city_corpus_leng_dict = {}
    city_tweet_count_dict = {}
    for city in cities:
        city_corpus_leng_dict[city.name] = data.find_leng_city_corpus(city)
        city_tweet_count_dict[city.name] = data.create_region_tweet_count(city)
        feature_strings_dict[city.name] = (
            feature_selection.included_feature_strings(city, tweet))
    end = datetime.datetime.now()
    # NOTE(review): stale label -- this section gathers per-city features,
    # not "top 5 words"; message kept verbatim to preserve output.
    print('getting top 5 words takes: %s' % (end - start))

    start = datetime.datetime.now()
    cty_corpus_dict = data.city_corpus_dict()
    # NOTE(review): word_count_dict is never used below; the lookup is kept
    # only so the self-described "bogus" timing print still measures it.
    word_count_dict = cty_corpus_dict[rankings[0][0].name]
    end = datetime.datetime.now()
    print('getting bogus word count dict takes: %s' % (end - start))

    start = datetime.datetime.now()
    # Comprehension replaces the range(len(...)) index loop.
    names = [ranking[0].name for ranking in rankings]
    end = datetime.datetime.now()
    print('generating lists takes: %s' % (end - start))
    return render_template("map.html",
                           tweet=tweet,
                           city_tweet_count_dict=city_tweet_count_dict,
                           names=names,
                           city_corpus_leng_dict=city_corpus_leng_dict,
                           feature_strings_dict=feature_strings_dict,
                           rankings=rankings)
def rank(city):
    """Score every word in *city*'s corpus by mutual information and
    return up to 30 top-scoring non-stop-words, highest score first.

    Relies on module-level ``data``, ``get_tweet_word_counts``, and
    ``mutual_info_score`` defined elsewhere in this file.
    """
    # Body re-indented with 4 spaces (tab/space mix is a TabError in Py3);
    # the unused ``count`` accumulator was dropped.
    word_dict = data.city_corpus_dict()[city.name]
    feature_weight_list = []
    for word in word_dict:
        wf = get_tweet_word_counts(word, city)
        score = mutual_info_score(wf['N11'], wf['N10'], wf['N01'], wf['N00'])
        feature_weight_list.append((word, score))
    feature_weight_list.sort(key=operator.itemgetter(1), reverse=True)
    # frozenset replaces the {word: 1} dict (identical membership test);
    # the duplicate 'at' and 'for' keys are collapsed.
    stop_words = frozenset([
        'through', 'our', 'about', 'before', 'between', 'by', 'during',
        'except', 'for', 'with', 'without', 'in', 'how', 'his', 'took',
        'could', 'would', 'will', 'at', 'should', 'can', 'we', 'us', 'as',
        'him', 'to', 'sometimes', 'you', 'were', 'i', 'my', 'her', 'he',
        'me', 'this', 'was', 'had', 'all', 'the', 'but', 'or', 'and',
        'there', 'it', 'is', 'then', 'a', 'an', 'be', 'of', 'what', 'when',
        'why', 'where', 'are', 'am', 'because', 'they',
    ])
    winner_list = []
    for word, _score in feature_weight_list:
        if word not in stop_words:
            winner_list.append(word)
            if len(winner_list) >= 30:
                # Original kept scanning after 30 winners; stop early --
                # the result is identical.
                break
    return winner_list
def classify_text():
    """Classify the POSTed tweet against every city and render the map view.

    Gathers, per city: corpus length, tweet count, and the feature strings
    matched in the tweet. Relies on module-level ``data``,
    ``feature_selection``, ``cities``, and Flask's
    ``request``/``render_template``.
    """
    tweet = request.form['tweet']

    start = datetime.datetime.now()
    rankings = data.create_ranking(tweet)
    end = datetime.datetime.now()
    # print-statement syntax replaced with a call (valid in Python 2 and 3;
    # the tab-indented lines of the original body were a TabError in Py3).
    print('getting city rankings takes: %s' % (end - start))

    start = datetime.datetime.now()
    feature_strings_dict = {}
    city_corpus_leng_dict = {}
    city_tweet_count_dict = {}
    for city in cities:
        city_corpus_leng_dict[city.name] = data.find_leng_city_corpus(city)
        city_tweet_count_dict[city.name] = data.create_region_tweet_count(city)
        feature_strings_dict[city.name] = (
            feature_selection.included_feature_strings(city, tweet))
    end = datetime.datetime.now()
    # NOTE(review): stale label -- this section gathers per-city features,
    # not "top 5 words"; message kept verbatim to preserve output.
    print('getting top 5 words takes: %s' % (end - start))

    start = datetime.datetime.now()
    cty_corpus_dict = data.city_corpus_dict()
    # NOTE(review): word_count_dict is never used below; the lookup is kept
    # only so the self-described "bogus" timing print still measures it.
    word_count_dict = cty_corpus_dict[rankings[0][0].name]
    end = datetime.datetime.now()
    print('getting bogus word count dict takes: %s' % (end - start))

    start = datetime.datetime.now()
    # Comprehension replaces the range(len(...)) index loop.
    names = [ranking[0].name for ranking in rankings]
    end = datetime.datetime.now()
    print('generating lists takes: %s' % (end - start))
    return render_template("map.html", tweet=tweet,
                           city_tweet_count_dict=city_tweet_count_dict,
                           names=names,
                           city_corpus_leng_dict=city_corpus_leng_dict,
                           feature_strings_dict=feature_strings_dict,
                           rankings=rankings)
def rank(city):
    """Score every word in *city*'s corpus by mutual information and
    return every non-stop-word, highest score first (no 30-word cap in
    this variant).

    Relies on module-level ``data``, ``get_tweet_word_counts``, and
    ``mutual_info_score`` defined elsewhere in this file.
    """
    word_dict = data.city_corpus_dict()[city.name]
    # The unused ``count`` accumulator was dropped.
    feature_weight_list = []
    for word in word_dict:
        wf = get_tweet_word_counts(word, city)
        score = mutual_info_score(wf['N11'], wf['N10'], wf['N01'], wf['N00'])
        feature_weight_list.append((word, score))
    feature_weight_list.sort(key=operator.itemgetter(1), reverse=True)
    # frozenset replaces the {word: 1} dict (identical membership test);
    # the duplicate 'at' and 'for' keys are collapsed.
    stop_words = frozenset([
        'every', 'got', 'through', 'our', 'especially', 'about', 'before',
        'between', 'by', 'during', 'except', 'for', 'with', 'without',
        'in', 'how', 'his', 'took', 'could', 'would', 'will', 'at',
        'should', 'can', 'we', 'us', 'as', 'him', 'to', 'sometimes',
        'you', 'were', 'i', 'my', 'her', 'he', 'me', 'this', 'was',
        'had', 'all', 'the', 'but', 'or', 'and', 'there', 'it', 'is',
        'then', 'a', 'an', 'be', 'of', 'what', 'when', 'why', 'where',
        'are', 'am', 'because', 'they',
    ])
    # Comprehension replaces the filter-and-append loop.
    return [word for word, _score in feature_weight_list
            if word not in stop_words]