Example #1
def make_app():
    base_dir = path.dirname(path.abspath(__file__))
    media_path = path.join(base_dir, "files")
    debug = True
    #redis = tornadoredis.ConnectionPool(max_connections=10, wait_for_available=True)
    redis = tornadoredis.Client()
    redis.connect()
    cache = DataManager()
    cache.redis = redis
    # cache.initialize()
    global_vars = dict(cache=cache)

    return Application([
        url(r"/", IndexHandler,global_vars, name="index"),
        url(r"/thread", ThreadHandler, global_vars,  name="thread"),
        url(r"/ws", WsHandler, global_vars, name="ws"),
        url(r'/files/(.*)', StaticFileHandler,
            {'path': media_path}, name="files"),
        # url(r"/(?P<param1>.*)", HelloHandler, global_vars, name='home'),
    ],
        debug=debug,
        xsrf_cookies=False,
        template_path=path.join(base_dir, "templates"),
        static_path=path.join(base_dir, "static"),
        media_path=media_path,
        cookie_secret='secret',
        redis=redis,
        cache=cache,
    )
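A factory like make_app() is typically handed to Tornado's event loop to actually serve requests. The launcher below is a minimal sketch, not part of the example; the port number and entry-point wiring are assumptions.

from tornado.ioloop import IOLoop

def main():
    app = make_app()
    app.listen(8888)           # hypothetical port, not taken from the example
    IOLoop.current().start()   # serve until the process is stopped

if __name__ == "__main__":
    main()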
Example #2
def __init__(self):
    db_mgr = DataManager(self.DATABASE)
    # Wikipedia-derived text serves as the training corpus
    self.train_tweets, self.train_labels = db_mgr.select_wikipedia_train()
    self.vectorizer = get_vectorizer("tfidf", min_df=1)
    self.nb = Classifier(classifier="nb")
    # learn the vocabulary and train the naive Bayes model once, up front
    self.train_data = self.vectorizer.fit_transform(self.train_tweets)
    self.nb.fit(self.train_data, self.train_labels)
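Because __init__ fits both the vectorizer and the model up front, a companion prediction method would only transform new text, never re-fit. This is a hypothetical sketch, assuming Classifier and the object returned by get_vectorizer follow the scikit-learn transform/predict conventions the code above suggests.

def predict(self, tweets):
    # reuse the vocabulary learned during __init__ (transform, not fit_transform)
    features = self.vectorizer.transform(tweets)
    return self.nb.predict(features)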
Example #3
import requests
from time import sleep
from codecs import open as codecs_open
from requests.exceptions import ConnectionError

def collect_source_data():
    # HOME_DIR is a module-level constant in the original project
    dbm = DataManager()
    for pid, source_url in dbm.get_project_source():
        print("working on project %s" % pid)
        print("url: %s" % source_url)
        try:
            r = requests.get(source_url)
            # cache the raw page under HOME_DIR/html/<pid>.html
            with codecs_open("%s/html/%s.html" % (HOME_DIR, pid), "w", "utf-8") as fp:
                fp.write(r.text)
            sleep(1)  # pause between requests
        except ConnectionError as e:
            print("failed")
            print(e)
Example #4
from time import sleep
from codecs import open as codecs_open
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3 import, matching convertEntities below

def scrape_ciid():
    dbm = DataManager()
    for pid, source_url in dbm.get_ciid_projects():
        print('\n-----\n')
        print("working on project %s" % pid)
        print("url: %s" % source_url)
        print("=====")
        try:
            with codecs_open("%s/html/ciid/%s.html" % (HOME_DIR, pid), "r", "utf-8") as fp:
                html = fp.read()
            # BeautifulSoup 3 call: decode HTML entities while parsing
            soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
            # keep only the paragraph text from the post body
            text = " ".join(x.text for x in soup.find("div", {'class': "post"}).findAll("p"))
            with codecs_open("%s/html/processed/%s.txt" % (HOME_DIR, pid), "w", "utf-8") as fp:
                fp.write(text)
            print(text)
            sleep(2)
        except Exception as e:
            print("failed")
            print(e)  # the original called e.message(), but message is an attribute, not callable
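The convertEntities call above is the BeautifulSoup 3 API, which only runs on Python 2. A port to bs4 is an assumption on my part, not part of the original project, but the equivalent parsing step would look like this sketch; bs4 decodes entities automatically, so the argument simply disappears.

from bs4 import BeautifulSoup  # hypothetical bs4 port

soup = BeautifulSoup(html, "html.parser")
text = " ".join(p.get_text() for p in soup.find("div", {"class": "post"}).find_all("p"))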
Example #5
from labels import *
from metrics import *
from features import *
from classifiers import Classifier

from db import DataManager

N_TIMES = 1

for i in range(N_TIMES):
  print(i + 1, "times")
  DATABASE = "us_twitter.db"

  split = 0.8

  db_mgr = DataManager(DATABASE)

  train_tweets, train_labels = db_mgr.select_wikipedia_train()
  test_tweets, test_labels, dummy1, dummy2 = db_mgr.select_tweets(limit=10, state_fips=True, table="us_tweets", label=state_fips)

  results = get("results.json")

  vectorizer = get_vectorizer("tfidf", min_df=1)

  classifiers = {
    "BernoulliNB": Classifier(classifier="bnb"),
    "MultinomialNB": Classifier(classifier="nb"),
    "KNN-1000": Classifier(classifier="knn", k=1000),
    "KNN-2000": Classifier(classifier="knn", k=2000),
    # "SVC": Classifier(classifier="svm", params={"C" : 1.0,"kernel" : 'linear','verbose':True})
    "SVC": Classifier(load="classifier-SVC")
Example #6
          ["tweets", "preprocess", "state_label",state_fips,True]
          # ["wiki", "preprocess", "county_label",county_label,True]
          # ["tweets", "preprocess", "grid_1_label",grid_1_degree,True],
          # ["tweets", "preprocess", "grid_5_label",grid_5_degree,True],
          # ["tweets", "preprocess", "grid_10_label",grid_10_degree,True]
         ]
for p in range(len(params)):
  print(params[p])
  TRAINING, PREPROCESSING, LABEL_FUNC, label_func, preprocess = params[p]
  for i in range(N_TIMES):
    print(i + 1, "times")
    DATABASE = "us_twitter.db"

    split = 0.8

    db_mgr = DataManager(DATABASE)

    if TRAINING == "tweets":
      train_tweets, train_labels, test_tweets, test_labels = db_mgr.select_tweets(limit=SIZE, preprocess=preprocess, table="us_tweets", split=0.8, label=label_func)
    else:
      train_tweets, train_labels = db_mgr.select_wikipedia_train()
      test_tweets, test_labels, dummy1, dummy2 = db_mgr.select_tweets(limit=(SIZE * 0.2), state_fips=True, table="us_tweets", label=label_func)
    # print "Train Size:", len(train_tweets)
    # print "Test Size:", len(test_tweets)


    vectorizer = get_vectorizer(VECTORIZER, min_df=1)

    classifiers = {
      "BernoulliNB": Classifier(classifier="bnb"),
      "MultinomialNB": Classifier(classifier="nb"),
Example #7
import pickle

from lib import *
from labels import *
from metrics import *
from features import *

from db import DataManager
from classifiers import Classifier

DATABASE = "us_twitter.db"
db_mgr = DataManager(DATABASE)
train_data, train_labels = db_mgr.select_wikipedia_train()

vectorizers = {
  "count": get_vectorizer("count", min_df=1),
  "tfidf": get_vectorizer("tfidf", min_df=1)
}

print "Vectorizing Training Data..."
count_data = vectorizers["count"].fit_transform(train_data)
tf_idf_data = vectorizers["tfidf"].fit_transform(train_data)

classifiers = {
  "BernoulliNB": {
    "count":Classifier(classifier="bnb"),
    "tfidf":Classifier(classifier="bnb")
  },
  "MultinomialNB": {
    "count":Classifier(classifier="nb"),
    "tfidf":Classifier(classifier="nb")