def run_classify(self, train_path, test_path, train_set, test_set, output_path) :
    loader = PickleMarket()
    # read train
    feature_names = loader.load_market(train_path)[0]
    train_articles = list()
    for type in train_set.split('#') :
        path = train_path.replace(u'car', type)
        train_articles.extend(loader.load_market(path)[1:])
    train_dataset = np.array([np.array(article[1:30000], dtype=float) for article in train_articles])
    print train_dataset.shape
    train_label = np.array([np.array(int(article[-1])) for article in train_articles])
    # read test
    test_articles = list()
    for type in test_set.split('#') :
        path = test_path.replace(u'car', type)
        test_articles.extend(loader.load_market(path)[1:])
    test_dataset = np.array([np.array(article[1:30000]) for article in test_articles])
    print test_dataset.shape
    test_label = np.array([np.array(int(article[-1])) for article in test_articles])
    # train cls
    classifier = LrClassifier()
    train_dataset = classifier.normalize(train_dataset, method='mapminmax')
    test_dataset = classifier.normalize(test_dataset, method='mapminmax')
    classifier.training(train_dataset, train_label, c=10, kernel='linear')
    # test cls
    test_prob = classifier.testing(test_dataset, type='prob')
    test_class = classifier.testing(test_dataset, type='label')
    evls, fprs, tprs = classifier.evaluation(test_label, test_prob, test_class)
    print 'performance is', evls
    # pair each false-positive rate with its true-positive rate (ROC points)
    ftprs = [[fpr, tprs[idx]] for idx, fpr in enumerate(fprs)]
    file_operator = TextFileOperator()
    file_operator.writing(ftprs, output_path)
    print 'finish'
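# The 'mapminmax' normalizer above is project code; as a rough sketch, assuming
# it is column-wise min-max scaling to [0, 1], it would look like this
# (hypothetical helper, not the project's implementation; assumes numpy is
# imported as np, as elsewhere in this module):
def _mapminmax_sketch(dataset, eps=1e-12) :
    lo = dataset.min(axis=0)
    hi = dataset.max(axis=0)
    # eps guards columns whose min and max coincide
    return (dataset - lo) / (hi - lo + eps)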
def run_feature_select(self, article_market_path, dictionary_path, \
                       feature_market_path) :
    loader = PickleMarket()
    articles = loader.load_market(article_market_path)
    [word2id, id2word] = loader.load_market(dictionary_path)
    dim = len(word2id)
    featuresets = list()
    length = len(articles) - 1
    for idx, article in enumerate(articles) :
        # bag-of-words count vector over title and content words
        feature = [0] * dim
        for word in article['participle_title'] :
            word = word.to_string()
            if word in word2id :
                feature[word2id[word]] += 1
        for word in article['participle_content'] :
            word = word.to_string()
            if word in word2id :
                feature[word2id[word]] += 1
        featuresets.append([article['id']] + feature + [article['label']])
        if idx % 100 == 0 :
            print 'finish rate is %.2f%%\r' % (100.0*idx/length),
    print 'finish rate is %.2f%%\r' % (100.0*idx/length)
    file_operator = TextFileOperator()
    # file_operator.writing(featuresets, feature_path)
    loader.dump_market(featuresets, feature_market_path)
    print 'finish'
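# For reference, the counting loop above is a plain bag-of-words encoder; a
# self-contained toy version with a hypothetical three-word dictionary and
# plain strings instead of the project's word objects:
def _bag_of_words_sketch(words, word2id) :
    feature = [0] * len(word2id)
    for word in words :
        if word in word2id :
            feature[word2id[word]] += 1
    return feature

# _bag_of_words_sketch([u'price', u'engine', u'price'],
#                      {u'engine': 0, u'price': 1, u'safety': 2})  -> [1, 2, 0]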
def run_classify(self, train_path, test_path, train_set, test_set):
    loader = PickleMarket()
    # read train
    feature_names = loader.load_market(train_path)[0]
    train_articles = list()
    for type in train_set.split("#"):
        path = train_path.replace(u"car", type)
        train_articles.extend(loader.load_market(path)[1:])
    train_dataset = np.array([np.array(article[1:-1], dtype=float) for article in train_articles])
    print train_dataset.shape
    train_label = np.array([np.array(int(article[-1])) for article in train_articles])
    # read test
    test_articles = list()
    for type in test_set.split("#"):
        path = test_path.replace(u"car", type)
        test_articles.extend(loader.load_market(path)[1:])
    test_dataset = np.array([np.array(article[1:-1]) for article in test_articles])
    print test_dataset.shape
    test_label = np.array([np.array(int(article[-1])) for article in test_articles])
    # train cls
    classifier = SvmClassifier()
    train_dataset = classifier.normalize(train_dataset, method="mapminmax")
    test_dataset = classifier.normalize(test_dataset, method="mapminmax")
    classifier.training(train_dataset, train_label, c=10, kernel="linear")
    # test cls
    test_prob = classifier.testing(test_dataset, type="prob")
    test_class = classifier.testing(test_dataset, type="label")
    print "performance is", classifier.evaluation(test_label, test_prob, test_class)
    print "finish"
def run_robot(self, tag_tree_path, sentences_market_path, tags_path) :
    robot = Robot()
    loader = PickleMarket()
    file_operator = TextFileOperator()
    cmd_list = file_operator.reading(tag_tree_path)
    tag_tree = TagTree(cmd_list)
    sentences = loader.load_market(sentences_market_path)
    tags = loader.load_market(tags_path)
    print 'start'
    # console input arrives as a gb18030-encoded byte string
    string = raw_input().decode('gb18030')
    # string = u'我想要毛衣'  ("I want a sweater")
    sentences = robot.question_and_answer(string, sentences, tags, tag_tree)
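# raw_input() returns a byte string on Python 2, so the decode step matters on
# a console that emits gb18030 bytes; a minimal round-trip illustration using
# the sample query commented out above (kept as comments so this module needs
# no extra coding declaration):
#
# query = u'我想要毛衣'
# raw = query.encode('gb18030')      # what raw_input() hands back
# assert raw.decode('gb18030') == query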
def run_create_dictionary(self, article_market_path, dictionary_path, dict_set) :
    loader = PickleMarket()
    word2id, id2word = dict(), dict()
    index = 0
    for type in dict_set.split('#') :
        path = article_market_path.replace(u'car', type)
        articles = loader.load_market(path)
        length = len(articles) - 1
        for idx, article in enumerate(articles) :
            # assign the next free id to every unseen word in title and content
            for word in article['participle_title'] :
                word = word.to_string()
                if word not in word2id :
                    word2id[word] = index
                    id2word[index] = word
                    index += 1
            for word in article['participle_content'] :
                word = word.to_string()
                if word not in word2id :
                    word2id[word] = index
                    id2word[index] = word
                    index += 1
            if idx % 100 == 0 :
                print 'finish rate is %.2f%%\r' % (100.0*idx/length),
        print 'finish rate is %.2f%%\r' % (100.0*idx/length)
    loader = PickleMarket()
    loader.dump_market([word2id, id2word], dictionary_path)
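# Example invocation (hypothetical paths and `pipeline` instance): the u'car'
# placeholder in the path template is replaced by each domain from the
# '#'-separated dict_set, so one dictionary covers all three corpora.
#
# pipeline.run_create_dictionary(u'market/articles_car.pkl',
#                                u'market/dictionary.pkl',
#                                u'car#finance#web')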
def run_feature_select(
    self, article_market_path, pos_path, punc_path, klword_path, feature_path, feature_market_path
):
    loader = PickleMarket()
    pos_selector = selector.PosExtractor(pos_path, w=15, combined=False)
    token_selector = selector.TokenExtractor(punc_path)
    word_selector = selector.WordExtractor(klword_path, weight=1)
    articles = loader.load_market(article_market_path)
    length = len(articles) - 1
    for idx, article in enumerate(articles):
        article["features"] = list()
        article["features"].extend(pos_selector.extract_feature_windows(article["participle_content"]))
        article["features"].extend(
            token_selector.extract_feature(
                article["title"], article["content"], article["participle_title"], article["participle_content"]
            )
        )
        article["features"].extend(
            word_selector.extract_feature(article["participle_title"], article["participle_content"])
        )
        print "finish rate is %.2f%%\r" % (100.0 * idx / length),
    print "finish rate is %.2f%%\r" % (100.0 * idx / length)
    # first row holds the feature names, then one [id, features..., label] row per article
    featuresets = [pos_selector.names + token_selector.names + word_selector.names]
    featuresets.extend([[article["id"]] + article["features"] + [article["label"]] for article in articles])
    file_operator = TextFileOperator()
    file_operator.writing(featuresets, feature_path)
    loader.dump_market(featuresets, feature_market_path)
    print "finish"
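# The featuresets rows written above are [id, feature..., label] with a header
# row of feature names in front; a hypothetical reader that splits them back
# apart (mirrors the [1:-1] and [-1] slicing used by run_classify below):
def _split_featuresets_sketch(featuresets) :
    names = featuresets[0]
    ids = [row[0] for row in featuresets[1:]]
    X = [row[1:-1] for row in featuresets[1:]]
    y = [int(row[-1]) for row in featuresets[1:]]
    return names, ids, X, y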
def run_classify(self, train_path, test_path) :
    loader = PickleMarket()
    articles = list()
    # for type in [u'car', u'finance', u'web'] :
    #     path = train_path.replace(u'all', type)
    articles.extend(loader.load_market(train_path))
    train_dataset = np.array([np.array(article[1:-1]) for article in articles])
    print train_dataset.shape
    train_label = np.array([np.array(int(article[-1])) for article in articles])
    articles = loader.load_market(test_path)
    test_dataset = np.array([np.array(article[1:-1]) for article in articles])
    test_label = np.array([np.array(int(article[-1])) for article in articles])
    classifier = SvmClassifier()
    train_dataset = classifier.normalize(train_dataset, method='mapminmax')
    test_dataset = classifier.normalize(test_dataset, method='mapminmax')
    classifier.training(train_dataset, train_label, cset=range(10, 100, 10), kernel='linear')
    test_prob = classifier.testing(test_dataset, type='prob')
    test_class = classifier.testing(test_dataset, type='label')
    print 'performance is', classifier.evaluation(test_label, test_prob, test_class)
    print 'finish'
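# SvmClassifier.training with cset=range(10, 100, 10) appears to search over
# the SVM cost parameter C; for comparison, the equivalent search written with
# scikit-learn rather than the project's classifier (sketch only):
def _grid_search_svm_sketch(train_dataset, train_label) :
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC
    grid = GridSearchCV(SVC(kernel='linear', probability=True),
                        param_grid={'C': range(10, 100, 10)}, cv=3)
    grid.fit(train_dataset, train_label)
    return grid.best_estimator_, grid.best_params_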
def run_create_word2vec(self, sentences_path, word_embedding_path, word_embedding_market_path) :
    loader = PickleMarket()
    sentences = list()
    for type in [u'_car'] :  # , u'_finance', u'_web'] :
        sentences.extend(loader.load_market(sentences_path + type))
    print 'import finish ...'
    embeddor = WordEmbed()
    print sentences[0]
    model = embeddor.word_to_vector(type='create', sentences=sentences[0:100], path=word_embedding_market_path)
    data_list = embeddor.get_word2vec_model(model)
    file_operator = TextFileOperator()
    file_operator.writing(data_list, word_embedding_path)
    print 'create word2vec finished ...'
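# WordEmbed wraps a word2vec implementation; done with gensim directly, model
# creation and export look roughly like this (parameter names follow older
# gensim releases, where newer ones use vector_size; sketch only):
def _word2vec_sketch(sentences, path) :
    from gensim.models import Word2Vec
    model = Word2Vec(sentences, size=100, window=5, min_count=1, workers=4)
    model.save(path)
    return model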
def run_tag_sentences(self, tag_tree_path, sentences_market_path, tags_market_path, dict_market_path) :
    file_operator = TextFileOperator()
    loader = PickleMarket()
    sentences = loader.load_market(sentences_market_path)
    cmd_list = file_operator.reading(tag_tree_path)
    tag_tree = TagTree(cmd_list)
    robot = Robot()
    tags, tags_show, untag_sentences = robot.tag_sentences(tag_tree, sentences[0:])
    loader = JsonMarket()
    loader.dump_market(tags, tags_market_path)
    loader.dump_market(tag_tree.dict_tuple, dict_market_path)
    # report how many sentences received at least one tag
    tagged = len([tag for tag in tags_show if len(tag) >= 1])
    print '%.2f%% of articles have >= 1 tag, number is %d.' \
        % (100.0 * tagged / len(sentences), tagged)
def run_test(self, tag_tree_path, sentences_market_path, tags_path, \
             tags_market_path, untag_sentence_path) :
    file_operator = TextFileOperator()
    loader = PickleMarket()
    sentences = loader.load_market(sentences_market_path)
    cmd_list = file_operator.reading(tag_tree_path)
    tag_tree = TagTree(cmd_list)
    robot = Robot()
    tags, tags_show, untag_sentences = robot.tag_sentences(tag_tree, sentences[0:])
    loader = JsonMarket()
    self.write_tags(sentences, tags_show, tags_path)
    loader.dump_market(tags, tags_market_path)
    file_operator.writing(untag_sentences, untag_sentence_path)
    # loader.dump_market(untag_sentences, sentences_market_path)
    print '%.2f%% of articles have >= 1 tag' \
        % (100.0 * len([tag for tag in tags_show if len(tag) >= 1]) / len(sentences))
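# The final print reports tag coverage; factored out, the statistic is simply
# (hypothetical helper, also usable by run_tag_sentences above):
def _tag_coverage_sketch(tags_show, sentences, min_tags=1) :
    tagged = len([tag for tag in tags_show if len(tag) >= min_tags])
    return 100.0 * tagged / len(sentences)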
def run_feature_select(self, article_market_path, pos_path, punc_path, klword_path, \
                       feature_path, feature_market_path) :
    loader = PickleMarket()
    pos_selector = selector.PosExtractor(pos_path, w=5, combined=True)
    token_selector = selector.TokenExtractor(punc_path)
    word_selector = selector.WordExtractor(klword_path, weight=1)
    articles = loader.load_market(article_market_path)
    length = len(articles) - 1
    for idx, article in enumerate(articles) :
        article['features'] = list()
        article['features'].extend(pos_selector.extract_feature(article['participle_content']))
        article['features'].extend(token_selector.extract_feature(article['title'], article['content'], \
                                                                  article['participle_title'], \
                                                                  article['participle_content']))
        article['features'].extend(word_selector.extract_feature(article['participle_title'], \
                                                                 article['participle_content']))
        print 'finish rate is %.2f%%\r' % (100.0*idx/length),
    print 'finish rate is %.2f%%\r' % (100.0*idx/length)
    featuresets = [[article['id']] + article['features'] + [article['label']] for article in articles]
    file_operator = TextFileOperator()
    file_operator.writing(featuresets, feature_path)
    loader.dump_market(featuresets, feature_market_path)
    print 'finish'
def run_optimize_params(self, train_article_market_path, test_article_market_path, \
                        pos_path, punc_path, klword_path, logger_path) :
    loader = PickleMarket()
    logger = list()
    logger.append(['w', 'combined', 'weight', 'kernel', 'c', 'norm', 'car_car', \
                   'car_finance', 'car_web', 'finance_car', 'finance_finance', 'finance_web', \
                   'web_car', 'web_finance', 'web_web', 'merge_car', 'merge_finance', 'merge_web'])
    domains = [u'car', u'finance', u'web']
    # parameter grid
    wset = [5, 10, 15, 20]
    combinedset = [True, False]
    weightset = [1, 2, 5]
    kernelset = ['linear', 'poly', 'rbf']
    cset = [range(10, 100, 10), range(100, 1000, 100)]
    normset = ['mapminmax', 'zscore']
    token_selector = selector.TokenExtractor(punc_path)
    for w in wset :
        for combined in combinedset :
            pos_selector = selector.PosExtractor(pos_path, w=w, combined=combined)
            for weight in weightset :
                word_selector = selector.WordExtractor(klword_path, weight=weight)
                # extract features once per (w, combined, weight) setting
                train_featuresets, test_featuresets = list(), list()
                for step in range(len(domains)) :
                    train_featuresets.append(list())
                    test_featuresets.append(list())
                for index, domain in enumerate(domains) :
                    train_articles = loader.load_market(train_article_market_path.replace(u'all', domain))
                    test_articles = loader.load_market(test_article_market_path.replace(u'all', domain))
                    # train
                    length = len(train_articles) - 1
                    for idx, article in enumerate(train_articles) :
                        article['features'] = list()
                        article['features'].extend(pos_selector.extract_feature(article['participle_content']))
                        article['features'].extend(token_selector.extract_feature(article['title'], article['content'], \
                                                                                  article['participle_title'], \
                                                                                  article['participle_content']))
                        article['features'].extend(word_selector.extract_feature(article['participle_title'], \
                                                                                 article['participle_content']))
                        print 'finish rate is %.2f%%\r' % (100.0*idx/length),
                    print 'finish rate is %.2f%%\r' % (100.0*idx/length)
                    train_featuresets[index] = [[article['id']] + article['features'] + [article['label']] for article in train_articles]
                    # test
                    length = len(test_articles) - 1
                    for idx, article in enumerate(test_articles) :
                        article['features'] = list()
                        article['features'].extend(pos_selector.extract_feature(article['participle_content']))
                        article['features'].extend(token_selector.extract_feature(article['title'], article['content'], \
                                                                                  article['participle_title'], \
                                                                                  article['participle_content']))
                        article['features'].extend(word_selector.extract_feature(article['participle_title'], \
                                                                                 article['participle_content']))
                        print 'finish rate is %.2f%%\r' % (100.0*idx/length),
                    print 'finish rate is %.2f%%\r' % (100.0*idx/length)
                    test_featuresets[index] = [[article['id']] + article['features'] + [article['label']] for article in test_articles]
                for kernel in kernelset :
                    for c in cset :
                        for norm in normset :
                            evl = list()
                            # train on each single domain, test on every domain
                            for train_idx in range(0, len(domains)) :
                                for test_idx in range(0, len(domains)) :
                                    train_dataset = np.array([np.array(article[1:-1]) for article in train_featuresets[train_idx]])
                                    train_label = np.array([np.array(int(article[-1])) for article in train_featuresets[train_idx]])
                                    test_dataset = np.array([np.array(article[1:-1]) for article in test_featuresets[test_idx]])
                                    test_label = np.array([np.array(int(article[-1])) for article in test_featuresets[test_idx]])
                                    classifier = SvmClassifier()
                                    train_dataset = classifier.normalize(train_dataset, method=norm)
                                    test_dataset = classifier.normalize(test_dataset, method=norm)
                                    classifier.training(train_dataset, train_label, cset=c, kernel=kernel)
                                    test_prob = classifier.testing(test_dataset, type='prob')
                                    test_class = classifier.testing(test_dataset, type='label')
                                    evl.append(classifier.evaluation(test_label, test_prob, test_class)[1])
                            print 'single finished ...'
                            # merge: train on all domains together, test on each domain
                            articles = list()
                            for train_idx in range(0, len(domains)) :
                                articles.extend(train_featuresets[train_idx])
                            train_dataset = np.array([np.array(article[1:-1]) for article in articles])
                            train_label = np.array([np.array(int(article[-1])) for article in articles])
                            for test_idx in range(0, len(domains)) :
                                test_dataset = np.array([np.array(article[1:-1]) for article in test_featuresets[test_idx]])
                                test_label = np.array([np.array(int(article[-1])) for article in test_featuresets[test_idx]])
                                classifier = SvmClassifier()
                                train_dataset = classifier.normalize(train_dataset, method=norm)
                                test_dataset = classifier.normalize(test_dataset, method=norm)
                                classifier.training(train_dataset, train_label, cset=c, kernel=kernel)
                                test_prob = classifier.testing(test_dataset, type='prob')
                                test_class = classifier.testing(test_dataset, type='label')
                                evl.append(classifier.evaluation(test_label, test_prob, test_class)[1])
                            print 'merge finished ...'
                            print 'performance is', 1.0*sum(evl)/len(evl)
                            log = [w, combined, weight, kernel, c[0], norm]
                            log.extend(evl)
                            logger.append(log)
    file_operator = TextFileOperator()
    file_operator.writing(logger, logger_path)
    print 'finish'
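# The nested loops above walk the full parameter grid; the same traversal with
# itertools.product makes the two-stage structure explicit (sketch reusing the
# grids defined in run_optimize_params):
import itertools

def _grid_walk_sketch() :
    wset, combinedset, weightset = [5, 10, 15, 20], [True, False], [1, 2, 5]
    kernelset = ['linear', 'poly', 'rbf']
    cset = [range(10, 100, 10), range(100, 1000, 100)]
    normset = ['mapminmax', 'zscore']
    for w, combined, weight in itertools.product(wset, combinedset, weightset) :
        # feature extraction depends only on (w, combined, weight),
        # so it runs once out here and is reused below
        for kernel, c, norm in itertools.product(kernelset, cset, normset) :
            pass  # normalize, train, and evaluate with (kernel, c, norm)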