def train(self):
    """Train the category classifier and persist it with its vectorizer.

    For every category name returned by ``util.list_categories()`` the
    directory ``util.data_dir/<category>`` is listed and at most
    ``util.users_per_category`` of its files are read. Each file becomes
    one training document labelled with the category name: the first
    space-separated token of every line is dropped and the non-blank
    remainders are concatenated. Files that yield no text are skipped.

    The documents are vectorized with TF-IDF and used to fit a
    one-vs-rest linear SVM. The classifier is written to
    ``categories.joblib.pkl`` and the vectorizer to ``vect.pkl``, both
    inside ``util.classifiers_dir``.

    Returns:
        True once both files have been written.
    """
    data = []
    labels = []
    for cdir in util.list_categories():
        path = os.path.join(util.data_dir, cdir)
        fs = os.listdir(path)
        self.logger.info("%s users for category %s", len(fs), cdir)
        for ind, f in enumerate(fs):
            # Cap the number of files read per category.
            if ind == util.users_per_category:
                break
            p = os.path.join(path, f)
            # Context manager so the handle is closed even if reading fails.
            with open(p) as handle:
                # Drop the first token of each line (the tweet id).
                texts = [" ".join(line.split(" ")[1:]) for line in handle]
            doc = "".join(text for text in texts if text.strip() != "")
            if doc.strip() != "":
                data.append(doc)
                labels.append(cdir)

    # Tokens are runs of at least four ASCII letters; terms appearing in
    # fewer than four documents are discarded.
    vectorizer = TfidfVectorizer(stop_words='english',
                                 token_pattern=r'[a-zA-Z]{4,}',
                                 min_df=4)
    X_train = vectorizer.fit_transform(data)
    clf = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_train, labels)

    filename = os.path.join(util.classifiers_dir, 'categories.joblib.pkl')
    joblib.dump(clf, filename, compress=9)
    filename = os.path.join(util.classifiers_dir, 'vect.pkl')
    joblib.dump(vectorizer, filename, compress=9)
    return True
def train(self):
    """Train the category classifier and persist it with its vectorizer.

    For every category name returned by ``util.list_categories()`` the
    directory ``util.data_dir/<category>`` is listed and at most
    ``util.users_per_category`` of its files are read. Each file becomes
    one training document labelled with the category name: the first
    space-separated token of every line is dropped and the non-blank
    remainders are concatenated. Files that yield no text are skipped.

    The documents are vectorized with TF-IDF and used to fit a
    one-vs-rest linear SVM. The classifier is written to
    ``categories.joblib.pkl`` and the vectorizer to ``vect.pkl``, both
    inside ``util.classifiers_dir``.

    Returns:
        True once both files have been written.
    """
    data = []
    labels = []
    for cdir in util.list_categories():
        path = os.path.join(util.data_dir, cdir)
        fs = os.listdir(path)
        self.logger.info("%s users for category %s", len(fs), cdir)
        for ind, f in enumerate(fs):
            # Cap the number of files read per category.
            if ind == util.users_per_category:
                break
            p = os.path.join(path, f)
            # Context manager so the handle is closed even if reading fails.
            with open(p) as handle:
                # Drop the first token of each line (the tweet id).
                texts = [" ".join(line.split(" ")[1:]) for line in handle]
            doc = "".join(text for text in texts if text.strip() != "")
            if doc.strip() != "":
                data.append(doc)
                labels.append(cdir)

    # Tokens are runs of at least four ASCII letters; terms appearing in
    # fewer than four documents are discarded.
    vectorizer = TfidfVectorizer(stop_words='english',
                                 token_pattern=r'[a-zA-Z]{4,}',
                                 min_df=4)
    X_train = vectorizer.fit_transform(data)
    clf = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_train, labels)

    filename = os.path.join(util.classifiers_dir, 'categories.joblib.pkl')
    joblib.dump(clf, filename, compress=9)
    filename = os.path.join(util.classifiers_dir, 'vect.pkl')
    joblib.dump(vectorizer, filename, compress=9)
    return True
def POST(self):
    """Handle a form POST and render the index page with the outcome.

    Reads the ``choice`` and ``item`` fields from ``web.input()``:

    * ``choice == '1'``: render the result of ``util.list_categories()``.
    * ``choice == '2'``: render ``util.search_for_goods(item)``, or a
      prompt asking for a product name when ``item`` is missing or empty.
    * any other value: render a prompt asking the user to pick an option.
    """
    i = web.input()
    choice = i.get('choice', None)
    item = i.get('item', None)

    if choice == '1':
        return render.index(util.list_categories())

    if choice == '2':
        # An absent field comes back as None; treat it like an empty
        # string instead of handing None to the search helper.
        if not item:
            # Message text: "Please enter a product name".
            return render.index('请输入商品名称')
        return render.index(util.search_for_goods(item))

    # Message text: "Please choose an option".
    return render.index('请选择一个选项')
def __init__(self):
    """Store the category list and the "tclas" logger on the instance."""
    # Categories are fetched once at construction time.
    category_names = util.list_categories()
    self.categories = category_names

    tclas_logger = logging.getLogger("tclas")
    self.logger = tclas_logger