Ejemplo n.º 1
0
 def train(self):
     """Train the category classifier and persist it to disk.

     For every category returned by ``util.list_categories()``, the files
     in the matching directory under ``util.data_dir`` are read, one
     document per file. A TF-IDF vectorizer and a one-vs-rest linear SVM
     are fitted on those documents, and both are dumped with joblib into
     ``util.classifiers_dir``.

     Returns:
         True once both artifacts have been written.
     """
     data = []
     labels = []
     for cdir in util.list_categories():
         path = os.path.join(util.data_dir, cdir)
         # NOTE: os.listdir returns entries in arbitrary order, so which
         # files fall under the per-category cap is not deterministic.
         fs = os.listdir(path)
         self.logger.info("%s users for category %s", len(fs), cdir)
         for ind, f in enumerate(fs):
             # The cap counts files examined, not documents kept.
             if ind == util.users_per_category:
                 break
             parts = []
             # The context manager closes the handle even if reading fails;
             # a bare open() here would leak one descriptor per file.
             with open(os.path.join(path, f)) as handle:
                 for line in handle:
                     # Drop the first space-separated field (the tweet id).
                     text = line.partition(" ")[2]
                     if text.strip():
                         parts.append(text)
             # Files with no usable text contribute no training sample.
             if parts:
                 data.append("".join(parts))
                 labels.append(cdir)
     vectorizer = TfidfVectorizer(stop_words='english',
                                  token_pattern=r'[a-zA-Z]{4,}',
                                  min_df=4)
     X_train = vectorizer.fit_transform(data)
     clf = OneVsRestClassifier(LinearSVC(random_state=0)).fit(
         X_train, labels)
     # The vectorizer is saved next to the classifier so that both can be
     # loaded from the same directory later.
     filename = os.path.join(util.classifiers_dir, 'categories.joblib.pkl')
     joblib.dump(clf, filename, compress=9)
     filename = os.path.join(util.classifiers_dir, 'vect.pkl')
     joblib.dump(vectorizer, filename, compress=9)
     return True
 def train(self):
     """Train the category classifier and persist it to disk.

     For every category returned by ``util.list_categories()``, the files
     in the matching directory under ``util.data_dir`` are read, one
     document per file. A TF-IDF vectorizer and a one-vs-rest linear SVM
     are fitted on those documents, and both are dumped with joblib into
     ``util.classifiers_dir``.

     Returns:
         True once both artifacts have been written.
     """
     data = []
     labels = []
     for cdir in util.list_categories():
         path = os.path.join(util.data_dir, cdir)
         # NOTE: os.listdir returns entries in arbitrary order, so which
         # files fall under the per-category cap is not deterministic.
         fs = os.listdir(path)
         self.logger.info("%s users for category %s", len(fs), cdir)
         for ind, f in enumerate(fs):
             # The cap counts files examined, not documents kept.
             if ind == util.users_per_category:
                 break
             parts = []
             # The context manager closes the handle even if reading fails;
             # a bare open() here would leak one descriptor per file.
             with open(os.path.join(path, f)) as handle:
                 for line in handle:
                     # Drop the first space-separated field (the tweet id).
                     text = line.partition(" ")[2]
                     if text.strip():
                         parts.append(text)
             # Files with no usable text contribute no training sample.
             if parts:
                 data.append("".join(parts))
                 labels.append(cdir)
     vectorizer = TfidfVectorizer(stop_words='english', token_pattern=r'[a-zA-Z]{4,}', min_df=4)
     X_train = vectorizer.fit_transform(data)
     clf = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_train, labels)
     # The vectorizer is saved next to the classifier so that both can be
     # loaded from the same directory later.
     filename = os.path.join(util.classifiers_dir, 'categories.joblib.pkl')
     joblib.dump(clf, filename, compress=9)
     filename = os.path.join(util.classifiers_dir, 'vect.pkl')
     joblib.dump(vectorizer, filename, compress=9)
     return True
Ejemplo n.º 3
0
Archivo: test.py Proyecto: lemori/study
 def POST(self):
     """Handle the form submission and render the index page.

     Reads the ``choice`` and ``item`` fields from the request input:

     * ``choice == '1'``: render the result of ``util.list_categories()``.
     * ``choice == '2'``: render the result of
       ``util.search_for_goods(item)``, or a prompt asking for a product
       name when ``item`` is missing or empty.
     * anything else: render a prompt asking the user to pick an option.
     """
     form = web.input()
     choice = form.get('choice')
     item = form.get('item')
     if choice == '1':
         return render.index(util.list_categories())
     if choice == '2':
         # An absent field comes back as None; treat it like an empty one
         # instead of passing None on to the search helper.
         if not item:
             return render.index('请输入商品名称')
         return render.index(util.search_for_goods(item))
     return render.index('请选择一个选项')
Ejemplo n.º 4
0
 def __init__(self):
     """Store the category list and the ``"tclas"`` logger on the instance."""
     category_list = util.list_categories()
     tclas_logger = logging.getLogger("tclas")
     self.categories = category_list
     self.logger = tclas_logger
 def __init__(self):
     """Store the category list and the ``"tclas"`` logger on the instance."""
     category_list = util.list_categories()
     tclas_logger = logging.getLogger("tclas")
     self.categories = category_list
     self.logger = tclas_logger