Example #1
0
 def run_classify(self, train_path, test_path, train_set, test_set, output_path) :
     loader = PickleMarket()
     # read train
     feature_names = loader.load_market(train_path)[0]
     train_articles = list()
     for type in train_set.split('#') :
         path = train_path.replace(u'car', type)
         train_articles.extend(loader.load_market(path)[1:])
     train_dataset = np.array([np.array(article[1:30000], dtype=float) for article in train_articles])
     print train_dataset.shape
     train_label = np.array([np.array(int(article[-1])) for article in train_articles])
     # read test
     test_articles = list()
     for type in test_set.split('#') :
         path = test_path.replace(u'car', type)
         test_articles.extend(loader.load_market(path)[1:])
     test_dataset = np.array([np.array(article[1:30000]) for article in test_articles])
     print test_dataset.shape
     test_label = np.array([np.array(int(article[-1])) for article in test_articles])
     # train cls
     classifier = LrClassifier()
     train_dataset = classifier.normalize(train_dataset, method='mapminmax')
     test_dataset = classifier.normalize(test_dataset, method='mapminmax')
     classifier.training(train_dataset, train_label, c=10, kernel='linear')
     # test cls
     test_prob = classifier.testing(test_dataset, type='prob')
     test_class = classifier.testing(test_dataset, type='label')
     evls, fprs, tprs = classifier.evaluation(test_label, test_prob, test_class)
     print 'performance is', evls
     ftprs = [[fpr, tprs[idx]] for idx, fpr in enumerate(fprs)]
     file_operator = TextFileOperator()
     file_operator.writing(ftprs, output_path)
     print 'finish'
Example #2
0
 def run_convert_sentences(self, sentences_path, sentences_market_path) :
     file_operator = TextFileOperator()
     sentences = file_operator.reading(sentences_path)
     sentences = [[word.split('<:>')[0] for word in sentence] for sentence in sentences]
     loader = PickleMarket()
     loader.dump_market(sentences, sentences_market_path)
     print 'converting sentences finished ...'
Example #3
0
 def run_feature_select(self, article_market_path, dictionary_path, \
                        feature_market_path) :
     loader = PickleMarket()
     articles = loader.load_market(article_market_path)
     [word2id, id2word] = loader.load_market(dictionary_path)
     dim = len(word2id)
     featuresets = list()
     length = len(articles) - 1
     for idx, article in enumerate(articles) :
         feature = [0] * dim
         for word in article['participle_title'] :
             word = word.to_string()
             if word in word2id :
                 feature[word2id[word]] += 1
         for word in article['participle_content'] :
             word = word.to_string()
             if word in word2id :
                 feature[word2id[word]] += 1
         featuresets.append([article['id']] + feature + [article['label']])
         if idx % 100 == 0 :
             print 'finish rate is %.2f%%\r' % (100.0*idx/length),
     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     file_operator = TextFileOperator()
     # file_operator.writing(featuresets, feature_path)
     loader.dump_market(featuresets, feature_market_path)
     print 'finish'
Example #4
0
 def run_create_dictionary(self, article_market_path, dictionary_path, dict_set) :
     loader = PickleMarket()
     word2id, id2word = dict(), dict()
     index = 0
     for type in dict_set.split('#') :
         path = article_market_path.replace(u'car', type)
         articles = loader.load_market(path)
         length = len(articles) - 1
         for idx, article in enumerate(articles) :
             for word in article['participle_title'] :
                 word =   word.to_string()
                 if word not in word2id :
                     word2id[word] = index
                     id2word[index] = word
                     index += 1
             for word in article['participle_content'] :
                 word = word.to_string()
                 if word not in word2id :
                     word2id[word] = index
                     id2word[index] = word
                     index += 1
             if idx % 100 == 0 :
                 print 'finish rate is %.2f%%\r' % (100.0*idx/length),
         print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     loader = PickleMarket()
     loader.dump_market([word2id, id2word], dictionary_path)
Example #5
0
 def run_classify(self, train_path, test_path, train_set, test_set):
     loader = PickleMarket()
     # read train
     feature_names = loader.load_market(train_path)[0]
     train_articles = list()
     for type in train_set.split("#"):
         path = train_path.replace(u"car", type)
         train_articles.extend(loader.load_market(path)[1:])
     train_dataset = np.array([np.array(article[1:-1], dtype=float) for article in train_articles])
     print train_dataset.shape
     train_label = np.array([np.array(int(article[-1])) for article in train_articles])
     # read test
     test_articles = list()
     for type in test_set.split("#"):
         path = test_path.replace(u"car", type)
         test_articles.extend(loader.load_market(path)[1:])
     test_dataset = np.array([np.array(article[1:-1]) for article in test_articles])
     print test_dataset.shape
     test_label = np.array([np.array(int(article[-1])) for article in test_articles])
     # train cls
     classifier = SvmClassifier()
     train_dataset = classifier.normalize(train_dataset, method="mapminmax")
     test_dataset = classifier.normalize(test_dataset, method="mapminmax")
     classifier.training(train_dataset, train_label, c=10, kernel="linear")
     # test cls
     test_prob = classifier.testing(test_dataset, type="prob")
     test_class = classifier.testing(test_dataset, type="label")
     print "performance is", classifier.evaluation(test_label, test_prob, test_class)
     print "finish"
Example #6
0
 def run_feature_select(
     self, article_market_path, pos_path, punc_path, klword_path, feature_path, feature_market_path
 ):
     loader = PickleMarket()
     pos_selector = selector.PosExtractor(pos_path, w=15, combined=False)
     token_selector = selector.TokenExtractor(punc_path)
     word_selector = selector.WordExtractor(klword_path, weight=1)
     articles = loader.load_market(article_market_path)
     length = len(articles) - 1
     for idx, article in enumerate(articles):
         article["features"] = list()
         article["features"].extend(pos_selector.extract_feature_windows(article["participle_content"]))
         article["features"].extend(
             token_selector.extract_feature(
                 article["title"], article["content"], article["participle_title"], article["participle_content"]
             )
         )
         article["features"].extend(
             word_selector.extract_feature(article["participle_title"], article["participle_content"])
         )
         print "finish rate is %.2f%%\r" % (100.0 * idx / length),
     print "finish rate is %.2f%%\r" % (100.0 * idx / length)
     featuresets = [pos_selector.names + token_selector.names + word_selector.names]
     featuresets.extend([[article["id"]] + article["features"] + [article["label"]] for article in articles])
     file_operator = TextFileOperator()
     file_operator.writing(featuresets, feature_path)
     loader.dump_market(featuresets, feature_market_path)
     print "finish"
Example #7
0
 def run_robot(self, tag_tree_path, sentences_market_path, tags_path) :
     robot = Robot()
     loader = PickleMarket()
     file_operator = TextFileOperator()
     cmd_list = file_operator.reading(tag_tree_path)
     tag_tree = TagTree(cmd_list)
     sentences = loader.load_market(sentences_market_path)
     tags = loader.load_market(tags_path)
     print 'start'
     string = raw_input().decode('gb18030')
     # string = u'我想要毛衣'
     sentences = robot.question_and_answer(string, sentences, tags, tag_tree)
Example #8
0
 def run_create_word2vec(self, sentences_path, word_embedding_path, word_embedding_market_path) :
     loader = PickleMarket()
     sentences = list()
     for type in [u'_car']:#, u'_finance', u'_web'] :
         sentences.extend(loader.load_market(sentences_path + type))
     print 'import finish ...'
     embeddor = WordEmbed()
     print sentences[0]
     model = embeddor.word_to_vector(type='create', sentences=sentences[0:100], path=word_embedding_market_path)
     data_list = embeddor.get_word2vec_model(model)
     file_operator = TextFileOperator()
     file_operator.writing(data_list, word_embedding_path)
     print 'create word2vec finished ...'
Example #9
0
 def run_test(self, tag_tree_path, sentences_market_path, tags_path, \
     tags_martket_path, untag_sentence_path) :
     file_operator = TextFileOperator()
     loader = PickleMarket()
     sentences = loader.load_market(sentences_market_path)
     cmd_list = file_operator.reading(tag_tree_path)
     tag_tree = TagTree(cmd_list)
     robot = Robot()
     tags, tags_show, untag_sentences = robot.tag_sentences(tag_tree, sentences[0:])
     loader = JsonMarket()
     self.write_tags(sentences, tags_show, tags_path)
     loader.dump_market(tags, tags_martket_path)
     file_operator.writing(untag_sentences, untag_sentence_path)
     # loader.dump_market(untag_sentences, sentences_market_path)
     # print '%.2f%% article >= 2 tags' % (100.0 * len([tag for tag in tags_show if len(tag) >= 1]) / len(sentences))
     print '%.2f%% article >= 3 tags' % (100.0 * len([tag for tag in tags_show if len(tag) >= 1]) / len(sentences))
Example #10
0
 def run_classify(self, train_path, test_path) :
     loader = PickleMarket()
     articles = list()
     # for type in [u'car', u'finance', u'web'] :
     #     path = train_path.replace(u'all', type)
     articles.extend(loader.load_market(train_path))
     train_dataset = np.array([np.array(article[1:-1]) for article in articles])
     print train_dataset.shape
     train_label = np.array([np.array(int(article[-1])) for article in articles])
     articles = loader.load_market(test_path)
     test_dataset = np.array([np.array(article[1:-1]) for article in articles])
     test_label = np.array([np.array(int(article[-1])) for article in articles])
     classifier = SvmClassifier()
     train_dataset = classifier.normalize(train_dataset, method='mapminmax')
     test_dataset = classifier.normalize(test_dataset, method='mapminmax')
     classifier.training(train_dataset, train_label, cset=range(10, 100, 10), kernel='linear')
     test_prob = classifier.testing(test_dataset, type='prob')
     test_class = classifier.testing(test_dataset, type='label')
     print 'performance is', classifier.evaluation(test_label, test_prob, test_class)
     print 'finish'
Example #11
0
 def run_feature_select(self, article_market_path, pos_path, punc_path, klword_path, \
     feature_path, feature_market_path) :
     loader = PickleMarket()
     pos_selector = selector.PosExtractor(pos_path, w=5, combined=True)
     token_selector = selector.TokenExtractor(punc_path)
     word_selector = selector.WordExtractor(klword_path, weight=1)
     articles = loader.load_market(article_market_path)
     length = len(articles) - 1
     for idx, article in enumerate(articles) :
         article['features'] = list()
         article['features'].extend(pos_selector.extract_feature(article['participle_content']))
         article['features'].extend(token_selector.extract_feature(article['title'], article['content'], \
                                                                     article['participle_title'], \
                                                                     article['participle_content']))
         article['features'].extend(word_selector.extract_feature(article['participle_title'], \
                                                                     article['participle_content']))
         print 'finish rate is %.2f%%\r' % (100.0*idx/length),
     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     featuresets = [[article['id']] + article['features'] + [article['label']] for article in articles]
     file_operator = TextFileOperator()
     file_operator.writing(featuresets, feature_path)
     loader.dump_market(featuresets, feature_market_path)
     print 'finish'
Example #12
0
 def run_tag_sentences(self, tag_tree_path, sentences_market_path, tags_martket_path, dict_market_path) :
     file_operator = TextFileOperator()
     loader = PickleMarket()
     sentences = loader.load_market(sentences_market_path)
     cmd_list = file_operator.reading(tag_tree_path)
     tag_tree = TagTree(cmd_list)
     robot = Robot()
     tags, tags_show, untag_sentences = robot.tag_sentences(tag_tree, sentences[0:])
     loader = JsonMarket()
     loader.dump_market(tags, tags_martket_path)
     loader.dump_market(tag_tree.dict_tuple, dict_market_path)
     print '%.2f%% article >= 1 tags, number is, %d.' \
         % (100.0 * len([tag for tag in tags_show if len(tag) >= 1]) / len(sentences)) \
         % len([tag for tag in tags_show if len(tag) >= 1])
Example #13
0
 def run_convert_sentences(self, sentences_path, sentences_market_path) :
     """Read the sentences with ``self.read_sentences`` and dump them.

     Args:
         sentences_path: source handed to ``self.read_sentences``.
         sentences_market_path: destination of the ``PickleMarket`` dump.
     """
     sentences = self.read_sentences(sentences_path)
     loader = PickleMarket()
     loader.dump_market(sentences, sentences_market_path)
Example #14
0
 def run_optimize_params(self, train_article_market_path, test_article_market_path, \
     pos_path, punc_path, klword_path, logger_path) :
     loader = PickleMarket()
     logger = list()
     logger.append(['w', 'combined', 'weight', 'kernel', 'c', 'norm', 'car_car', \
         'car_finance', 'car_web', 'finance_car', 'finance_finance', 'finance_web', \
         'web_car', 'web_fiannce', 'web_web', 'merge_car', 'merge_finance', 'merge_web'])
     domains = [u'car', u'finance', u'web']
     wset = [5, 10, 15, 20]
     combinedset = [True, False]
     weightset = [1, 2, 5]
     kernelset = ['linear', 'poly', 'rbf']
     cset = [range(10, 100, 10), range(100, 1000, 100)]
     normset = ['mapminmax', 'zscore']
     token_selector = selector.TokenExtractor(punc_path)
     for w in wset :
         for combined in combinedset :
             pos_selector = selector.PosExtractor(pos_path, w=w, combined=combined)
             for weight in weightset :
                 word_selector = selector.WordExtractor(klword_path, weight=weight)
                 train_featuresets, test_featuresets = list(), list()
                 for step in range(len(domains)) :
                     train_featuresets.append(list())
                     test_featuresets.append(list())
                 for index, domain in enumerate(domains) :
                     train_articles = loader.load_market(train_article_market_path.replace(u'all', domain))
                     test_articles = loader.load_market(test_article_market_path.replace(u'all', domain))
                     # train
                     length = len(train_articles) - 1
                     for idx, article in enumerate(train_articles) :
                         article['features'] = list()
                         article['features'].extend(pos_selector.extract_feature(article['participle_content']))
                         article['features'].extend(token_selector.extract_feature(article['title'], article['content'], \
                                                                                     article['participle_title'], \
                                                                                     article['participle_content']))
                         article['features'].extend(word_selector.extract_feature(article['participle_title'], \
                                                                                     article['participle_content']))
                         print 'finish rate is %.2f%%\r' % (100.0*idx/length),
                     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
                     train_featuresets[index] = [[article['id']] + article['features'] + [article['label']] for article in train_articles]
                     # test
                     length = len(test_articles) - 1
                     for idx, article in enumerate(test_articles) :
                         article['features'] = list()
                         article['features'].extend(pos_selector.extract_feature(article['participle_content']))
                         article['features'].extend(token_selector.extract_feature(article['title'], article['content'], \
                                                                                     article['participle_title'], \
                                                                                     article['participle_content']))
                         article['features'].extend(word_selector.extract_feature(article['participle_title'], \
                                                                                     article['participle_content']))
                         print 'finish rate is %.2f%%\r' % (100.0*idx/length),
                     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
                     test_featuresets[index] = [[article['id']] + article['features'] + [article['label']] for article in test_articles]
                 for kernel in kernelset :
                     for c in cset :
                         for norm in normset :
                             evl = list()
                             for train_idx in range(0, len(domains)) :
                                 for test_idx in range(0, len(domains)) :
                                     train_dataset = np.array([np.array(article[1:-1]) for article in train_featuresets[train_idx]])
                                     train_label = np.array([np.array(int(article[-1])) for article in train_featuresets[train_idx]])
                                     test_dataset = np.array([np.array(article[1:-1]) for article in test_featuresets[test_idx]])
                                     test_label = np.array([np.array(int(article[-1])) for article in test_featuresets[test_idx]])
                                     classifier = SvmClassifier()
                                     train_dataset = classifier.normalize(train_dataset, method=norm)
                                     test_dataset = classifier.normalize(test_dataset, method=norm)
                                     classifier.training(train_dataset, train_label, cset=c, kernel=kernel)
                                     test_prob = classifier.testing(test_dataset, type='prob')
                                     test_class = classifier.testing(test_dataset, type='label')
                                     evl.append(classifier.evaluation(test_label, test_prob, test_class)[1])
                             print 'single finished ...'
                             # merge
                             articles = list()
                             for train_idx in range(0, len(domains)) :
                                 articles.extend(train_featuresets[train_idx])
                             train_dataset = np.array([np.array(article[1:-1]) for article in articles])
                             train_label = np.array([np.array(int(article[-1])) for article in articles])
                             for test_idx in range(0, len(domains)) :
                                 test_dataset = np.array([np.array(article[1:-1]) for article in test_featuresets[test_idx]])
                                 test_label = np.array([np.array(int(article[-1])) for article in test_featuresets[test_idx]])
                                 classifier = SvmClassifier()
                                 train_dataset = classifier.normalize(train_dataset, method=norm)
                                 test_dataset = classifier.normalize(test_dataset, method=norm)
                                 classifier.training(train_dataset, train_label, cset=c, kernel=kernel)
                                 test_prob = classifier.testing(test_dataset, type='prob')
                                 test_class = classifier.testing(test_dataset, type='label')
                                 evl.append(classifier.evaluation(test_label, test_prob, test_class)[1])
                             print 'merge finished ...'
                             print 'performance is', 1.0*sum(evl)/len(evl)
                             log = [w, combined, weight, kernel, c[0], norm]
                             log.extend(evl)
                             logger.append(log)
     file_operator = TextFileOperator()
     file_operator.writing(logger, logger_path)
     print 'finish'
Example #15
0
 def run_convert_article(self,  article_path, article_market_path) :
     articles = self.read_article(article_path)
     # articles = self.read_participle(articles, participle_path)
     loader = PickleMarket()
     loader.dump_market(articles, article_market_path)
     print 'finish.'
Example #16
0
 def run_optimize_params(
     self, train_article_market_path, test_article_market_path, pos_path, punc_path, klword_path, logger_path
 ):
     loader = PickleMarket()
     logger = list()
     logger.append(
         [
             "w",
             "combined",
             "weight",
             "kernel",
             "c",
             "norm",
             "car_car",
             "car_finance",
             "car_web",
             "finance_car",
             "finance_finance",
             "finance_web",
             "web_car",
             "web_fiannce",
             "web_web",
             "merge_car",
             "merge_finance",
             "merge_web",
         ]
     )
     domains = [u"car", u"finance", u"web"]
     wset = [5, 10, 15, 20]
     combinedset = [True, False]
     weightset = [1, 2, 5]
     kernelset = ["linear", "poly", "rbf"]
     cset = [range(10, 100, 10), range(100, 1000, 100)]
     normset = ["mapminmax", "zscore"]
     token_selector = selector.TokenExtractor(punc_path)
     for w in wset:
         for combined in combinedset:
             pos_selector = selector.PosExtractor(pos_path, w=w, combined=combined)
             for weight in weightset:
                 word_selector = selector.WordExtractor(klword_path, weight=weight)
                 train_featuresets, test_featuresets = list(), list()
                 for step in range(len(domains)):
                     train_featuresets.append(list())
                     test_featuresets.append(list())
                 for index, domain in enumerate(domains):
                     train_articles = loader.load_market(train_article_market_path.replace(u"all", domain))
                     test_articles = loader.load_market(test_article_market_path.replace(u"all", domain))
                     # train
                     length = len(train_articles) - 1
                     for idx, article in enumerate(train_articles):
                         article["features"] = list()
                         article["features"].extend(pos_selector.extract_feature(article["participle_content"]))
                         article["features"].extend(
                             token_selector.extract_feature(
                                 article["title"],
                                 article["content"],
                                 article["participle_title"],
                                 article["participle_content"],
                             )
                         )
                         article["features"].extend(
                             word_selector.extract_feature(
                                 article["participle_title"], article["participle_content"]
                             )
                         )
                         print "finish rate is %.2f%%\r" % (100.0 * idx / length),
                     print "finish rate is %.2f%%\r" % (100.0 * idx / length)
                     train_featuresets[index] = [
                         [article["id"]] + article["features"] + [article["label"]] for article in train_articles
                     ]
                     # test
                     length = len(test_articles) - 1
                     for idx, article in enumerate(test_articles):
                         article["features"] = list()
                         article["features"].extend(pos_selector.extract_feature(article["participle_content"]))
                         article["features"].extend(
                             token_selector.extract_feature(
                                 article["title"],
                                 article["content"],
                                 article["participle_title"],
                                 article["participle_content"],
                             )
                         )
                         article["features"].extend(
                             word_selector.extract_feature(
                                 article["participle_title"], article["participle_content"]
                             )
                         )
                         print "finish rate is %.2f%%\r" % (100.0 * idx / length),
                     print "finish rate is %.2f%%\r" % (100.0 * idx / length)
                     test_featuresets[index] = [
                         [article["id"]] + article["features"] + [article["label"]] for article in test_articles
                     ]
                 for kernel in kernelset:
                     for c in cset:
                         for norm in normset:
                             evl = list()
                             for train_idx in range(0, len(domains)):
                                 for test_idx in range(0, len(domains)):
                                     train_dataset = np.array(
                                         [np.array(article[1:-1]) for article in train_featuresets[train_idx]]
                                     )
                                     train_label = np.array(
                                         [np.array(int(article[-1])) for article in train_featuresets[train_idx]]
                                     )
                                     test_dataset = np.array(
                                         [np.array(article[1:-1]) for article in test_featuresets[test_idx]]
                                     )
                                     test_label = np.array(
                                         [np.array(int(article[-1])) for article in test_featuresets[test_idx]]
                                     )
                                     classifier = SvmClassifier()
                                     train_dataset = classifier.normalize(train_dataset, method=norm)
                                     test_dataset = classifier.normalize(test_dataset, method=norm)
                                     classifier.training(train_dataset, train_label, cset=c, kernel=kernel)
                                     test_prob = classifier.testing(test_dataset, type="prob")
                                     test_class = classifier.testing(test_dataset, type="label")
                                     evl.append(classifier.evaluation(test_label, test_prob, test_class)[1])
                             print "single finished ..."
                             # merge
                             articles = list()
                             for train_idx in range(0, len(domains)):
                                 articles.extend(train_featuresets[train_idx])
                             train_dataset = np.array([np.array(article[1:-1]) for article in articles])
                             train_label = np.array([np.array(int(article[-1])) for article in articles])
                             for test_idx in range(0, len(domains)):
                                 test_dataset = np.array(
                                     [np.array(article[1:-1]) for article in test_featuresets[test_idx]]
                                 )
                                 test_label = np.array(
                                     [np.array(int(article[-1])) for article in test_featuresets[test_idx]]
                                 )
                                 classifier = SvmClassifier()
                                 train_dataset = classifier.normalize(train_dataset, method=norm)
                                 test_dataset = classifier.normalize(test_dataset, method=norm)
                                 classifier.training(train_dataset, train_label, cset=c, kernel=kernel)
                                 test_prob = classifier.testing(test_dataset, type="prob")
                                 test_class = classifier.testing(test_dataset, type="label")
                                 evl.append(classifier.evaluation(test_label, test_prob, test_class)[1])
                             print "merge finished ..."
                             print "performance is", 1.0 * sum(evl) / len(evl)
                             log = [w, combined, weight, kernel, c[0], norm]
                             log.extend(evl)
                             logger.append(log)
     file_operator = TextFileOperator()
     file_operator.writing(logger, logger_path)
     print "finish"