from os import path

import tornadoredis
from tornado.web import Application, StaticFileHandler, url

# IndexHandler, ThreadHandler, WsHandler, and DataManager come from this
# project's own modules.


def make_app():
    base_dir = path.dirname(path.abspath(__file__))
    media_path = path.join(base_dir, "files")
    debug = True
    # redis = tornadoredis.ConnectionPool(max_connections=10, wait_for_available=True)
    redis = tornadoredis.Client()
    redis.connect()
    cache = DataManager()
    cache.redis = redis
    # cache.initialize()
    global_vars = dict(cache=cache)
    return Application(
        [
            url(r"/", IndexHandler, global_vars, name="index"),
            url(r"/thread", ThreadHandler, global_vars, name="thread"),
            url(r"/ws", WsHandler, global_vars, name="ws"),
            url(r'/files/(.*)', StaticFileHandler, {'path': media_path}, name="files"),
            # url(r"/(?P<param1>.*)", HelloHandler, global_vars, name='home'),
        ],
        debug=debug,
        xsrf_cookies=False,
        template_path=path.join(base_dir, "templates"),
        static_path=path.join(base_dir, "static"),
        media_path=media_path,
        cookie_secret='secret',
        redis=redis,
        cache=cache,
    )
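# A minimal sketch of how make_app() would typically be served. The port
# and the use of IOLoop.instance() follow the standard pre-5.0 Tornado
# pattern and are assumptions, not taken from this project.
from tornado.ioloop import IOLoop

if __name__ == "__main__":
    app = make_app()
    app.listen(8888)  # hypothetical port
    IOLoop.instance().start()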
def __init__(self):
    db_mgr = DataManager(self.DATABASE)
    self.train_tweets, self.train_labels = db_mgr.select_wikipedia_train()
    self.vectorizer = get_vectorizer("tfidf", min_df=1)
    self.nb = Classifier(classifier="nb")
    self.train_data = self.vectorizer.fit_transform(self.train_tweets)
    self.nb.fit(self.train_data, self.train_labels)
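# A hedged usage sketch of a method that could sit alongside __init__
# above. Only fit() appears in this codebase; predict() is an assumption
# that Classifier mirrors scikit-learn's estimator API.
def classify(self, tweet):
    features = self.vectorizer.transform([tweet])  # reuse the fitted vectorizer
    return self.nb.predict(features)[0]  # assumed sklearn-style predict()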
import requests
from codecs import open as codecs_open
from time import sleep

from requests.exceptions import ConnectionError

# DataManager and HOME_DIR come from this project's own modules.


def collect_source_data():
    dbm = DataManager()
    for pid, source_url in dbm.get_project_source():
        print "working on project %s" % pid
        print "url: %s" % source_url
        try:
            r = requests.get(source_url)
            fp = codecs_open("%s/html/%s.html" % (HOME_DIR, pid), "w", "utf-8")
            fp.write(r.text)
            fp.close()
            sleep(1)  # be polite between requests
        except ConnectionError, e:
            print "failed"
            print e.message
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3 API


def scrape_ciid():
    dbm = DataManager()
    for pid, source_url in dbm.get_ciid_projects():
        print '\n-----\n'
        print "working on project %s" % pid
        print "url: %s" % source_url
        print "====="
        try:
            fp = codecs_open("%s/html/ciid/%s.html" % (HOME_DIR, pid), "r", "utf-8")
            html = fp.read()
            fp.close()
            soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
            fp = codecs_open("%s/html/processed/%s.txt" % (HOME_DIR, pid), "w", "utf-8")
            text = " ".join(x.text for x in soup.find("div", {'class': "post"}).findAll("p"))
            fp.write(text)
            fp.close()
            print text
            sleep(2)
        except Exception, e:
            print "failed"
            print e.message
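# A hypothetical entry point for the two scrapers above; whether they are
# actually run together, and in this order, is an assumption.
if __name__ == "__main__":
    collect_source_data()  # download raw HTML pages
    scrape_ciid()          # extract post text from the saved CIID pages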
from labels import *
from metrics import *
from features import *
from classifiers import Classifier
from db import DataManager

N_TIMES = 1

for i in range(0, N_TIMES):
    print i + 1, "times"
    DATABASE = "us_twitter.db"
    split = 0.8
    db_mgr = DataManager(DATABASE)
    train_tweets, train_labels = db_mgr.select_wikipedia_train()
    test_tweets, test_labels, dummy1, dummy2 = db_mgr.select_tweets(
        limit=10, state_fips=True, table="us_tweets", label=state_fips)
    results = get("results.json")
    vectorizer = get_vectorizer("tfidf", min_df=1)
    classifiers = {
        "BernoulliNB": Classifier(classifier="bnb"),
        "MultinomialNB": Classifier(classifier="nb"),
        "KNN-1000": Classifier(classifier="knn", k=1000),
        "KNN-2000": Classifier(classifier="knn", k=2000),
        # "SVC": Classifier(classifier="svm", params={"C": 1.0, "kernel": 'linear', 'verbose': True}),
        "SVC": Classifier(load="classifier-SVC"),
    }
["tweets", "preprocess", "state_label",state_fips,True] # ["wiki", "preprocess", "county_label",county_label,True] # ["tweets", "preprocess", "grid_1_label",grid_1_degree,True], # ["tweets", "preprocess", "grid_5_label",grid_5_degree,True], # ["tweets", "preprocess", "grid_10_label",grid_10_degree,True] ] for p in range(0,len(params)): print params[p] TRAINING, PREPROCESSING, LABEL_FUNC, label_func, preprocess = params[p] for i in range(0,N_TIMES): print i+1, "times" DATABASE = "us_twitter.db" split = 0.8 db_mgr = DataManager(DATABASE) if TRAINING == "tweets": train_tweets, train_labels, test_tweets, test_labels = db_mgr.select_tweets(limit=SIZE, preprocess=preprocess, table="us_tweets", split=0.8, label=label_func) else: train_tweets, train_labels = db_mgr.select_wikipedia_train() test_tweets, test_labels, dummy1, dummy2 = db_mgr.select_tweets(limit=(SIZE * 0.2), state_fips=True, table="us_tweets", label=label_func) # print "Train Size:", len(train_tweets) # print "Test Size:", len(test_tweets) vectorizer = get_vectorizer(VECTORIZER, min_df=1) classifiers = { "BernoulliNB": Classifier(classifier="bnb"), "MultinomialNB": Classifier(classifier="nb"),
import pickle

from lib import *
from labels import *
from metrics import *
from features import *
from db import DataManager
from classifiers import Classifier

DATABASE = "us_twitter.db"
db_mgr = DataManager(DATABASE)
train_data, train_labels = db_mgr.select_wikipedia_train()

vectorizers = {
    "count": get_vectorizer("count", min_df=1),
    "tfidf": get_vectorizer("tfidf", min_df=1),
}

print "Vectorizing Training Data..."
count_data = vectorizers["count"].fit_transform(train_data)
tf_idf_data = vectorizers["tfidf"].fit_transform(train_data)

classifiers = {
    "BernoulliNB": {
        "count": Classifier(classifier="bnb"),
        "tfidf": Classifier(classifier="bnb"),
    },
    "MultinomialNB": {
        "count": Classifier(classifier="nb"),
        "tfidf": Classifier(classifier="nb"),
    },
}
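# pickle is imported above but unused in this fragment; a plausible
# (assumed) follow-up is persisting the fitted vectorizers for reuse.
# The file-name pattern here is hypothetical.
for vec_name, vec in vectorizers.items():
    with open("vectorizer-%s.pkl" % vec_name, "wb") as fp:
        pickle.dump(vec, fp)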