def append_data(src, dest, classes): """ read class c from src, append result to dest :param src: url :param dest: url :param classes: classes a list :return: """ for c in classes: if c not in EMOTION_CLASS.keys(): raise ValueError("%s is not support class" % c) src_tree = None dest_tree = None try: src_tree = ET.parse(src) dest_tree = ET.parse(dest) except IOError: print "cannot parse file" exit(-1) if src_tree and dest_tree: src_root = src_tree.getroot() dest_root = dest_tree.getroot() l = [src_root.findall("weibo[@emotion-type='%s']" % c) for c in classes] l = flatten(l) random.shuffle(l) [dest_root.append(l1) for l1 in l] # write to file dest_tree.write(dest, encoding="utf-8") print "append data is done."
def __load(url, ratio, direction=True, subjective=True, balance=False): """ Loading Training Data Except Objective Sentence :param url: :param direction: 默认从上往下取 :param subjective: 加载主观句还是客观句,默认加载主观数据 True: 加载多类别,即情绪标签 False: 加载二类别,即主客观 :param balance: 是否需要平衡加载数据集,默认以非平衡的方式加载 :return: """ # 若是加载客观的数据,也就没有平衡加载的概念 # if not subjective and balance: # raise AttributeError("can not load data which is objective and use balanced way!") tree = None try: tree = ET.parse(url) except IOError: print "cannot parse file" exit(-1) if tree is not None: # get the root root = tree.getroot() # get the direct child # 若非平衡加载,只需要将所有的 weibo 看成一类,即可复用代码 # todo # ElementTree XPath 貌似不支持 not、!= 操作,所有暂时采用以下方案代替 each_class = [[sentence for sentence in root.findall("weibo") if sentence.get("emotion-type") != "none"]] if not subjective: each_class = [root.findall("weibo[@emotion-type]")] if balance: each_class = [root.findall("weibo[@emotion-type='%s']" % c) for c in EMOTION_CLASS.keys()] if not subjective: each_class = Load.partition(root.findall("weibo[@emotion-type]"), lambda x: x.get("emotion-type") == "none") each_class_size = [len(c) for c in each_class] each_class_range = [slice(int(n * ratio)) for n in each_class_size] if not direction: _reverse_ratio = 1 - ratio each_class_range = [slice(int(n * _reverse_ratio), n) for n in each_class_size] sentences = [] for i, each in enumerate(each_class): _range = each_class_range[i] sentences.append([Load.integrate(sentence) for sentence in each[_range]]) # shuffle sentences = flatten(sentences) # random.shuffle(sentences) return [{"sentence": sentence.text.encode("utf_8"), "emotion-tag": sentence.get("emotion_tag"), "emotion-1-type": sentence.get("emotion-type"), "emotion-2-type": sentence.get("emotion-2-type")} for sentence in sentences]
def append_data(src, dest, classes): """ read class c from src, append result to dest :param src: url :param dest: url :param classes: classes a list :return: """ for c in classes: if c not in EMOTION_CLASS.keys(): raise ValueError("%s is not support class" % c) src_tree = None dest_tree = None try: src_tree = ET.parse(src) dest_tree = ET.parse(dest) except IOError: print "cannot parse file" exit(-1) if src_tree and dest_tree: src_root = src_tree.getroot() dest_root = dest_tree.getroot() l = [ src_root.findall("weibo[@emotion-type='%s']" % c) for c in classes ] l = flatten(l) random.shuffle(l) [dest_root.append(l1) for l1 in l] # write to file dest_tree.write(dest, encoding="utf-8") print "append data is done."
def getclasses(self):
    """Return the supported emotion class names.

    :return: the keys of EMOTION_CLASS
    """
    supported = EMOTION_CLASS.keys()
    return supported
# NOTE(review): script fragment — ``out``, ``train``, ``class_label`` and
# ``crossvalidate`` are defined earlier in the file, outside this excerpt.
# Regenerate the persisted test-index file if it is missing or empty.
if not FileUtil.isexist(out) or FileUtil.isempty(out):
    clf0 = Classification()
    clf0.cross_validation(train, class_label, score="recall")
# Carve the held-out test set out of the training data via the saved indices.
test_index = np.loadtxt(out, dtype=int)
test = train[test_index]
test_label = np.asanyarray(class_label)[test_index].tolist()
# Parallel option tuples: one label/linestyle per method being compared.
method_options = ("second", "four", "five")
method_options_0 = ("B", "C", "D")
linestyle = (':', '--', '-')
plot.get_instance()
for i in range(len(method_options)):
    # A fresh incremental-Bayes classifier per compared method.
    bayes = IncrBayes()
    clf = Classification(bayes=bayes)
    clf.get_classificator(train, class_label,
                          iscrossvalidate=crossvalidate,
                          isbalance=False,
                          minority_target=EMOTION_CLASS.keys())
    # clf.get_classificator(train, class_label, isbalance=True, minority_target=["anger", "fear", "surprise"])
    if(i == 0):
        # Print the baseline ("origin") metrics exactly once, on the first
        # iteration, before any incremental update is applied.
        pred = clf.predict(test)
        pred_unknow = clf.predict_unknow(test)
        print "origin precision:", clf.metrics_precision(test_label, pred_unknow)
        print "origin recall:", clf.metrics_recall(test_label, pred_unknow)
        print "origin f1:", clf.metrics_f1(test_label, pred_unknow)
        print "origin accuracy:", clf.metrics_accuracy(test_label, pred_unknow)
        print "origin zero_one_loss:", clf.metrics_zero_one_loss(test_label, pred_unknow)
        test_proba = clf.predict_max_proba(test)
        print "origin my_zero_one_loss:", clf.metrics_my_zero_one_loss(test_proba)
        print clf.metrics_correct(test_label, pred_unknow)
        # plot.plot_roc(test_label, clf.predict_proba(test), classes=clf.bayes.classes_.tolist(), text='origin')
def __load(url, ratio, direction=True, subjective=True, balance=False): """ Loading Training Data Except Objective Sentence :param url: :param direction: 默认从上往下取 :param subjective: 加载主观句还是客观句,默认加载主观数据 True: 加载多类别,即情绪标签 False: 加载二类别,即主客观 :param balance: 是否需要平衡加载数据集,默认以非平衡的方式加载 :return: """ # 若是加载客观的数据,也就没有平衡加载的概念 # if not subjective and balance: # raise AttributeError("can not load data which is objective and use balanced way!") tree = None try: tree = ET.parse(url) except IOError: print "cannot parse file" exit(-1) if tree is not None: # get the root root = tree.getroot() # get the direct child # 若非平衡加载,只需要将所有的 weibo 看成一类,即可复用代码 # todo # ElementTree XPath 貌似不支持 not、!= 操作,所有暂时采用以下方案代替 each_class = [[ sentence for sentence in root.findall("weibo") if sentence.get("emotion-type") != "none" ]] if not subjective: each_class = [root.findall("weibo[@emotion-type]")] if balance: each_class = [ root.findall("weibo[@emotion-type='%s']" % c) for c in EMOTION_CLASS.keys() ] if not subjective: each_class = Load.partition( root.findall("weibo[@emotion-type]"), lambda x: x.get("emotion-type") == "none") each_class_size = [len(c) for c in each_class] each_class_range = [slice(int(n * ratio)) for n in each_class_size] if not direction: _reverse_ratio = 1 - ratio each_class_range = [ slice(int(n * _reverse_ratio), n) for n in each_class_size ] sentences = [] for i, each in enumerate(each_class): _range = each_class_range[i] sentences.append( [Load.integrate(sentence) for sentence in each[_range]]) # shuffle sentences = flatten(sentences) # random.shuffle(sentences) return [{ "sentence": sentence.text.encode("utf_8"), "emotion-tag": sentence.get("emotion_tag"), "emotion-1-type": sentence.get("emotion-type"), "emotion-2-type": sentence.get("emotion-2-type") } for sentence in sentences]
def getclasses(self):
    """Return the class labels matching this loader's mode.

    :return: keys of EMOTION_CLASS when ``self.subjective`` is true,
             otherwise keys of OBJECTIVE_CLASS
    """
    if self.subjective:
        return EMOTION_CLASS.keys()
    return OBJECTIVE_CLASS.keys()
clf = Classification() crossvalidate = False # 若不交叉验证 记得修改 load_sample.py 中加载 train 的比例 if crossvalidate: out = os.path.join(TEXT_OUT, "best_train_test_index/test_index.txt") if not FileUtil.isexist(out) or FileUtil.isempty(out): clf.cross_validation(train, class_label, score="recall") test_index = np.loadtxt(out, dtype=int) test = train[test_index] test_label = np.asanyarray(class_label)[test_index].tolist() clf.get_classificator(train, class_label, iscrossvalidate=crossvalidate, isbalance=False, minority_target=EMOTION_CLASS.keys()) pred = clf.predict(test) pred_unknow = clf.predict_unknow(test) # print pred print "precision:", clf.metrics_precision(test_label, pred_unknow) print "recall:", clf.metrics_recall(test_label, pred_unknow) print "f1:", clf.metrics_f1(test_label, pred_unknow) print "accuracy:", clf.metrics_accuracy(test_label, pred_unknow) print "zero_one_loss:", clf.metrics_zero_one_loss(test_label, pred_unknow) test_proba = clf.predict_max_proba(test) print "my_zero_one_loss:", clf.metrics_my_zero_one_loss(test_proba) print clf.metrics_correct(test_label, pred_unknow) plot.get_instance() plot.plot_roc(test_label,
def __each_class_text(datas, c):
    """Return the sentence texts in ``datas`` that belong to emotion class ``c``.

    :param datas: list of dicts as produced by the loader; each is expected
                  to carry "sentence" and "emotion-1-type" keys
    :param c: emotion class name; must be a key of EMOTION_CLASS
    :return: list of "sentence" values whose "emotion-1-type" equals ``c``
    :raises ValueError: if ``c`` is not a supported emotion class
    """
    if c not in EMOTION_CLASS:
        # Name the offending class in the message, consistent with the
        # validation in append_data.
        raise ValueError("%s is not support class" % c)
    return [data.get("sentence") for data in datas
            if data.get("emotion-1-type") == c]