def run_classify(self, train_path, test_path, train_set, test_set, output_path) :
    """ Train an LrClassifier on stored feature rows and write [fpr, tpr] pairs.

    Every market file is read with PickleMarket.load_market; its first row is
    skipped (treated as a header) and each remaining row is used as
    [<skipped column>, feature..., label].

    train_path, test_path : path templates; u'car' is replaced by each
        domain name to locate that domain's market file.
    train_set, test_set : '#'-separated domain names.
    output_path : destination handed to TextFileOperator.writing; receives
        one [fpr, tpr] row per element of the evaluation result.
    """
    # at most this many leading feature columns are kept per row
    max_features = 29999
    loader = PickleMarket()

    def load_rows(path_template, domain_set) :
        # collect the data rows (first row skipped) of every requested domain
        rows = list()
        for domain in domain_set.split('#') :
            market = loader.load_market(path_template.replace(u'car', domain))
            rows.extend(market[1:])
        return rows

    def to_arrays(rows) :
        # row[-1] is the label; slicing [1:-1] BEFORE applying the cap keeps
        # the label out of the features even when a row is shorter than the cap
        dataset = np.array([np.array(row[1:-1][:max_features], dtype=float) for row in rows])
        print dataset.shape
        label = np.array([np.array(int(row[-1])) for row in rows])
        return dataset, label

    # read train
    train_dataset, train_label = to_arrays(load_rows(train_path, train_set))
    # read test
    test_dataset, test_label = to_arrays(load_rows(test_path, test_set))
    # train cls
    classifier = LrClassifier()
    train_dataset = classifier.normalize(train_dataset, method='mapminmax')
    test_dataset = classifier.normalize(test_dataset, method='mapminmax')
    classifier.training(train_dataset, train_label, c=10, kernel='linear')
    # test cls
    test_prob = classifier.testing(test_dataset, type='prob')
    test_class = classifier.testing(test_dataset, type='label')
    evls, fprs, tprs = classifier.evaluation(test_label, test_prob, test_class)
    print 'performance is', evls
    # pair the two evaluation sequences element by element
    ftprs = [[fpr, tpr] for fpr, tpr in zip(fprs, tprs)]
    file_operator = TextFileOperator()
    file_operator.writing(ftprs, output_path)
    print 'finish'
def run_create_sentences(self, article_path, participle_title_path, sentences_path) :
    """ Split the content of de-duplicated articles into sentences and write them.

    article_path : passed to self.read_article.
    participle_title_path : passed to self.read_participle_title; the
        'participle_title' values are de-duplicated with Unique.unique and
        only articles whose 'id' survives are kept.
    sentences_path : destination handed to TextFileOperator.writing; every
        written row holds a single sentence.
    """
    articles = self.read_article(article_path)
    titles = self.read_participle_title(participle_title_path)
    # remove duplications
    processor = Unique()
    unique_positions = processor.unique([title['participle_title'] for title in titles])
    kept_ids = set(titles[position]['id'] for position in unique_positions)
    remained_articles = [article for article in articles if article['id'] in kept_ids]
    print 'remove duplications finished ...'
    # create sentences
    segmentor = ContentSegementor()
    sentences = list()
    last_index = len(remained_articles) - 1
    for idx, article in enumerate(remained_articles) :
        segmented_content = segmentor.segement(article['content'])
        sentences.extend([[sentence] for sentence in segmented_content])
        if idx % 100 == 0 :
            # last_index is 0 when there is exactly one article; avoid dividing by it
            rate = 100.0 * idx / last_index if last_index else 100.0
            print 'finish rate is %.2f%%\r' % rate,
    # the loop is complete here, also when there were no articles at all
    print 'finish rate is %.2f%%\r' % 100.0
    file_operator = TextFileOperator()
    file_operator.writing(sentences, sentences_path)
    print 'writing sentences finished ...'
def run_feature_select(
    self, article_market_path, pos_path, punc_path, klword_path, feature_path, feature_market_path
):
    """Extract features for every stored article and save them twice.

    The articles are loaded with PickleMarket.load_market. Each one gets a
    "features" list built from three extractors, in this order:
    PosExtractor.extract_feature_windows (w=15, combined=False),
    TokenExtractor.extract_feature and WordExtractor.extract_feature (weight=1).

    The result starts with a row made of the three extractors' ``names``,
    followed by one ``[id] + features + [label]`` row per article. It is
    written to ``feature_path`` with TextFileOperator.writing and dumped to
    ``feature_market_path`` with PickleMarket.dump_market.
    """
    loader = PickleMarket()
    pos_selector = selector.PosExtractor(pos_path, w=15, combined=False)
    token_selector = selector.TokenExtractor(punc_path)
    word_selector = selector.WordExtractor(klword_path, weight=1)
    articles = loader.load_market(article_market_path)
    last_index = len(articles) - 1
    for idx, article in enumerate(articles):
        features = list()
        features.extend(pos_selector.extract_feature_windows(article["participle_content"]))
        features.extend(
            token_selector.extract_feature(
                article["title"], article["content"], article["participle_title"], article["participle_content"]
            )
        )
        features.extend(
            word_selector.extract_feature(article["participle_title"], article["participle_content"])
        )
        article["features"] = features
        # last_index is 0 for a single article; avoid dividing by it
        rate = 100.0 * idx / last_index if last_index else 100.0
        print "finish rate is %.2f%%\r" % rate,
    # the loop is complete here, also when there were no articles at all
    print "finish rate is %.2f%%\r" % 100.0
    # the names are read after extraction, exactly where the rows are assembled
    featuresets = [pos_selector.names + token_selector.names + word_selector.names]
    featuresets.extend([[article["id"]] + article["features"] + [article["label"]] for article in articles])
    file_operator = TextFileOperator()
    file_operator.writing(featuresets, feature_path)
    loader.dump_market(featuresets, feature_market_path)
    print "finish"
def run_create_word2vec(self, sentences_path, word_embedding_path, word_embedding_market_path, \
        domain_suffixes=(u'_car',), max_sentences=100) :
    """ Create a word2vec model from stored sentences and write it out.

    sentences_path : path prefix; each suffix in domain_suffixes is appended
        to it and the result is loaded with PickleMarket.load_market.
    word_embedding_path : destination handed to TextFileOperator.writing for
        the output of WordEmbed.get_word2vec_model.
    word_embedding_market_path : passed as ``path`` to WordEmbed.word_to_vector.
    domain_suffixes : suffixes of the sentence markets to load. Defaults to
        (u'_car',); further suffixes such as u'_finance' or u'_web' may be given.
    max_sentences : only the first max_sentences sentences are passed to
        word_to_vector (default 100); None passes all of them.
    """
    loader = PickleMarket()
    sentences = list()
    for suffix in domain_suffixes :
        sentences.extend(loader.load_market(sentences_path + suffix))
    print 'import finish ...'
    embeddor = WordEmbed()
    # show a sample sentence, but only when something was loaded
    if sentences :
        print sentences[0]
    model = embeddor.word_to_vector(type='create', sentences=sentences[:max_sentences], \
                                    path=word_embedding_market_path)
    data_list = embeddor.get_word2vec_model(model)
    file_operator = TextFileOperator()
    file_operator.writing(data_list, word_embedding_path)
    print 'create word2vec finished ...'
def run_test(self, tag_tree_path, sentences_market_path, tags_path, \
        tags_martket_path, untag_sentence_path) :
    """ Tag stored sentences with a tag tree and write the results.

    tag_tree_path : read with TextFileOperator.reading; the result is used to
        build the TagTree.
    sentences_market_path : sentences loaded with PickleMarket.load_market.
    tags_path : destination of self.write_tags(sentences, tags_show, ...).
    tags_martket_path : destination of JsonMarket.dump_market for the tags.
    untag_sentence_path : destination of TextFileOperator.writing for the
        untagged sentences returned by Robot.tag_sentences.

    Finally prints the percentage of sentences whose tags_show entry holds at
    least min_tags items.
    """
    # threshold used by the summary line printed at the end
    min_tags = 1
    file_operator = TextFileOperator()
    pickle_loader = PickleMarket()
    sentences = pickle_loader.load_market(sentences_market_path)
    cmd_list = file_operator.reading(tag_tree_path)
    tag_tree = TagTree(cmd_list)
    robot = Robot()
    # sentences[0:] hands the robot a slice of the sentences, as before
    tags, tags_show, untag_sentences = robot.tag_sentences(tag_tree, sentences[0:])
    json_loader = JsonMarket()
    self.write_tags(sentences, tags_show, tags_path)
    json_loader.dump_market(tags, tags_martket_path)
    file_operator.writing(untag_sentences, untag_sentence_path)
    tagged = len([tag for tag in tags_show if len(tag) >= min_tags])
    total = len(sentences)
    # no sentences means nothing was tagged; avoid dividing by zero
    rate = 100.0 * tagged / total if total else 0.0
    # report the threshold that was actually applied
    print '%.2f%% article >= %d tags' % (rate, min_tags)
def write_article(self, articles, article_path) :
    """ Write source articles.

    The first written row is the header ['id', 'url', 'title', 'content'].
    Every following row holds those four values of one article, in that order.

    articles : sequence of mappings providing the four keys above.
    article_path : destination handed to TextFileOperator.writing.
    """
    columns = ['id', 'url', 'title', 'content']
    data_list = list()
    data_list.append(columns)
    last_index = len(articles) - 1
    for idx, article in enumerate(articles) :
        data_list.append([article[column] for column in columns])
        if idx % 100 == 0 :
            # last_index is 0 for a single article; avoid dividing by it
            rate = 100.0 * idx / last_index if last_index else 100.0
            print 'finish rate is %.2f%%\r' % rate,
    # the loop is complete here, also when there were no articles at all
    print 'finish rate is %.2f%%\r' % 100.0
    file_operator = TextFileOperator()
    file_operator.writing(data_list, article_path)
def write_tags(self, sentences, tags, tags_path) :
    """ Write tagged sentences.

    The first written row is the header ['sentence', 'tag'].
    Every following row belongs to one entry of tags that holds at least two
    <attribute, value> pairs: column 0 is sentences[idx][1], column 1 is the
    pairs rendered as '<attribute,value> ' one after another.

    sentences : indexable; sentences[idx][1] is written for tags[idx].
    tags : sequence whose entries are sequences of (attribute, value) pairs.
    tags_path : destination handed to TextFileOperator.writing.
    """
    file_operator = TextFileOperator()
    data_list = list()
    data_list.append(['sentence', 'tag'])
    last_index = len(tags) - 1
    for idx, term in enumerate(tags) :
        if len(term) >= 2 :
            # every pair is followed by a space, the last one included
            tag_str = u''.join(u'<' + attr + u',' + value + u'>' + ' ' for attr, value in term)
            data_list.append([sentences[idx][1], tag_str])
        if idx % 100 == 0 :
            # last_index is 0 for a single entry; avoid dividing by it
            rate = 100.0 * idx / last_index if last_index else 100.0
            print 'finish rate is %.2f%%\r' % rate,
    # the loop is complete here, also when tags was empty
    print 'finish rate is %.2f%%\r' % 100.0
    file_operator.writing(data_list, tags_path)
def run_feature_select(self, article_market_path, pos_path, punc_path, klword_path, \
        feature_path, feature_market_path) :
    """ Extract features for every stored article and save them twice.

    The articles are loaded with PickleMarket.load_market. Each one gets a
    'features' list built from three extractors, in this order:
    PosExtractor.extract_feature (w=5, combined=True),
    TokenExtractor.extract_feature and WordExtractor.extract_feature (weight=1).

    One [id] + features + [label] row per article is written to feature_path
    with TextFileOperator.writing and dumped to feature_market_path with
    PickleMarket.dump_market. No header row is written.
    """
    loader = PickleMarket()
    pos_selector = selector.PosExtractor(pos_path, w=5, combined=True)
    token_selector = selector.TokenExtractor(punc_path)
    word_selector = selector.WordExtractor(klword_path, weight=1)
    articles = loader.load_market(article_market_path)
    last_index = len(articles) - 1
    for idx, article in enumerate(articles) :
        features = list()
        features.extend(pos_selector.extract_feature(article['participle_content']))
        features.extend(token_selector.extract_feature(article['title'], article['content'], \
                                                       article['participle_title'], \
                                                       article['participle_content']))
        features.extend(word_selector.extract_feature(article['participle_title'], \
                                                      article['participle_content']))
        article['features'] = features
        # last_index is 0 for a single article; avoid dividing by it
        rate = 100.0 * idx / last_index if last_index else 100.0
        print 'finish rate is %.2f%%\r' % rate,
    # the loop is complete here, also when there were no articles at all
    print 'finish rate is %.2f%%\r' % 100.0
    featuresets = [[article['id']] + article['features'] + [article['label']] for article in articles]
    file_operator = TextFileOperator()
    file_operator.writing(featuresets, feature_path)
    loader.dump_market(featuresets, feature_market_path)
    print 'finish'
def run_optimize_params(self, train_article_market_path, test_article_market_path, \ pos_path, punc_path, klword_path, logger_path) : loader = PickleMarket() logger = list() logger.append(['w', 'combined', 'weight', 'kernel', 'c', 'norm', 'car_car', \ 'car_finance', 'car_web', 'finance_car', 'finance_finance', 'finance_web', \ 'web_car', 'web_fiannce', 'web_web', 'merge_car', 'merge_finance', 'merge_web']) domains = [u'car', u'finance', u'web'] wset = [5, 10, 15, 20] combinedset = [True, False] weightset = [1, 2, 5] kernelset = ['linear', 'poly', 'rbf'] cset = [range(10, 100, 10), range(100, 1000, 100)] normset = ['mapminmax', 'zscore'] token_selector = selector.TokenExtractor(punc_path) for w in wset : for combined in combinedset : pos_selector = selector.PosExtractor(pos_path, w=w, combined=combined) for weight in weightset : word_selector = selector.WordExtractor(klword_path, weight=weight) train_featuresets, test_featuresets = list(), list() for step in range(len(domains)) : train_featuresets.append(list()) test_featuresets.append(list()) for index, domain in enumerate(domains) : train_articles = loader.load_market(train_article_market_path.replace(u'all', domain)) test_articles = loader.load_market(test_article_market_path.replace(u'all', domain)) # train length = len(train_articles) - 1 for idx, article in enumerate(train_articles) : article['features'] = list() article['features'].extend(pos_selector.extract_feature(article['participle_content'])) article['features'].extend(token_selector.extract_feature(article['title'], article['content'], \ article['participle_title'], \ article['participle_content'])) article['features'].extend(word_selector.extract_feature(article['participle_title'], \ article['participle_content'])) print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) train_featuresets[index] = [[article['id']] + article['features'] + [article['label']] for article in train_articles] # test length 
= len(test_articles) - 1 for idx, article in enumerate(test_articles) : article['features'] = list() article['features'].extend(pos_selector.extract_feature(article['participle_content'])) article['features'].extend(token_selector.extract_feature(article['title'], article['content'], \ article['participle_title'], \ article['participle_content'])) article['features'].extend(word_selector.extract_feature(article['participle_title'], \ article['participle_content'])) print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) test_featuresets[index] = [[article['id']] + article['features'] + [article['label']] for article in test_articles] for kernel in kernelset : for c in cset : for norm in normset : evl = list() for train_idx in range(0, len(domains)) : for test_idx in range(0, len(domains)) : train_dataset = np.array([np.array(article[1:-1]) for article in train_featuresets[train_idx]]) train_label = np.array([np.array(int(article[-1])) for article in train_featuresets[train_idx]]) test_dataset = np.array([np.array(article[1:-1]) for article in test_featuresets[test_idx]]) test_label = np.array([np.array(int(article[-1])) for article in test_featuresets[test_idx]]) classifier = SvmClassifier() train_dataset = classifier.normalize(train_dataset, method=norm) test_dataset = classifier.normalize(test_dataset, method=norm) classifier.training(train_dataset, train_label, cset=c, kernel=kernel) test_prob = classifier.testing(test_dataset, type='prob') test_class = classifier.testing(test_dataset, type='label') evl.append(classifier.evaluation(test_label, test_prob, test_class)[1]) print 'single finished ...' 
# merge articles = list() for train_idx in range(0, len(domains)) : articles.extend(train_featuresets[train_idx]) train_dataset = np.array([np.array(article[1:-1]) for article in articles]) train_label = np.array([np.array(int(article[-1])) for article in articles]) for test_idx in range(0, len(domains)) : test_dataset = np.array([np.array(article[1:-1]) for article in test_featuresets[test_idx]]) test_label = np.array([np.array(int(article[-1])) for article in test_featuresets[test_idx]]) classifier = SvmClassifier() train_dataset = classifier.normalize(train_dataset, method=norm) test_dataset = classifier.normalize(test_dataset, method=norm) classifier.training(train_dataset, train_label, cset=c, kernel=kernel) test_prob = classifier.testing(test_dataset, type='prob') test_class = classifier.testing(test_dataset, type='label') evl.append(classifier.evaluation(test_label, test_prob, test_class)[1]) print 'merge finished ...' print 'performance is', 1.0*sum(evl)/len(evl) log = [w, combined, weight, kernel, c[0], norm] log.extend(evl) logger.append(log) file_operator = TextFileOperator() file_operator.writing(logger, logger_path) print 'finish'
def run_optimize_params(
    self, train_article_market_path, test_article_market_path, pos_path, punc_path, klword_path, logger_path
):
    """Try every parameter combination and log the evaluation scores.

    For each (w, combined, weight) the features of the train and test
    articles of every domain are extracted again. Then, for each
    (kernel, c, norm), an SvmClassifier is trained on every single domain and
    tested on every domain, and afterwards trained on the rows of all domains
    merged and tested on every domain. Element [1] of each
    classifier.evaluation result is collected.

    train_article_market_path, test_article_market_path: path templates;
        u"all" is replaced by the domain name before loading with
        PickleMarket.load_market.
    pos_path, punc_path, klword_path: passed to PosExtractor, TokenExtractor
        and WordExtractor respectively.
    logger_path: destination handed to TextFileOperator.writing; receives a
        header row and one row per parameter combination.
    """
    loader = PickleMarket()
    logger = list()
    # NOTE(review): "web_fiannce" looks like a typo of "web_finance"; left
    # unchanged because it is written to the log file - confirm before fixing.
    logger.append(
        [
            "w",
            "combined",
            "weight",
            "kernel",
            "c",
            "norm",
            "car_car",
            "car_finance",
            "car_web",
            "finance_car",
            "finance_finance",
            "finance_web",
            "web_car",
            "web_fiannce",
            "web_web",
            "merge_car",
            "merge_finance",
            "merge_web",
        ]
    )
    domains = [u"car", u"finance", u"web"]
    wset = [5, 10, 15, 20]
    combinedset = [True, False]
    weightset = [1, 2, 5]
    kernelset = ["linear", "poly", "rbf"]
    cset = [range(10, 100, 10), range(100, 1000, 100)]
    normset = ["mapminmax", "zscore"]
    token_selector = selector.TokenExtractor(punc_path)

    def build_featureset(articles, pos_selector, word_selector):
        # attach a "features" list to every article and return the
        # [id] + features + [label] rows
        last_index = len(articles) - 1
        for idx, article in enumerate(articles):
            features = list()
            features.extend(pos_selector.extract_feature(article["participle_content"]))
            features.extend(
                token_selector.extract_feature(
                    article["title"],
                    article["content"],
                    article["participle_title"],
                    article["participle_content"],
                )
            )
            features.extend(
                word_selector.extract_feature(article["participle_title"], article["participle_content"])
            )
            article["features"] = features
            # last_index is 0 for a single article; avoid dividing by it
            rate = 100.0 * idx / last_index if last_index else 100.0
            print "finish rate is %.2f%%\r" % rate,
        # the loop is complete here, also when there were no articles at all
        print "finish rate is %.2f%%\r" % 100.0
        return [[article["id"]] + article["features"] + [article["label"]] for article in articles]

    def evaluate(train_rows, test_rows, kernel, c, norm):
        # the arrays are rebuilt from the rows on every call, so each
        # evaluation normalizes data that was never normalized before
        train_dataset = np.array([np.array(row[1:-1]) for row in train_rows])
        train_label = np.array([np.array(int(row[-1])) for row in train_rows])
        test_dataset = np.array([np.array(row[1:-1]) for row in test_rows])
        test_label = np.array([np.array(int(row[-1])) for row in test_rows])
        classifier = SvmClassifier()
        train_dataset = classifier.normalize(train_dataset, method=norm)
        test_dataset = classifier.normalize(test_dataset, method=norm)
        classifier.training(train_dataset, train_label, cset=c, kernel=kernel)
        test_prob = classifier.testing(test_dataset, type="prob")
        test_class = classifier.testing(test_dataset, type="label")
        return classifier.evaluation(test_label, test_prob, test_class)[1]

    for w in wset:
        for combined in combinedset:
            pos_selector = selector.PosExtractor(pos_path, w=w, combined=combined)
            for weight in weightset:
                word_selector = selector.WordExtractor(klword_path, weight=weight)
                train_featuresets, test_featuresets = list(), list()
                for domain in domains:
                    train_articles = loader.load_market(train_article_market_path.replace(u"all", domain))
                    test_articles = loader.load_market(test_article_market_path.replace(u"all", domain))
                    # train
                    train_featuresets.append(build_featureset(train_articles, pos_selector, word_selector))
                    # test
                    test_featuresets.append(build_featureset(test_articles, pos_selector, word_selector))
                # rows of all train domains together, used by the merge runs
                merged_rows = list()
                for rows in train_featuresets:
                    merged_rows.extend(rows)
                for kernel in kernelset:
                    for c in cset:
                        for norm in normset:
                            evl = list()
                            for train_rows in train_featuresets:
                                for test_rows in test_featuresets:
                                    evl.append(evaluate(train_rows, test_rows, kernel, c, norm))
                            print "single finished ..."
                            # merge
                            for test_rows in test_featuresets:
                                evl.append(evaluate(merged_rows, test_rows, kernel, c, norm))
                            print "merge finished ..."
                            print "performance is", 1.0 * sum(evl) / len(evl)
                            # only the first value of the c range is logged
                            log = [w, combined, weight, kernel, c[0], norm]
                            log.extend(evl)
                            logger.append(log)
    file_operator = TextFileOperator()
    file_operator.writing(logger, logger_path)
    print "finish"