def get_instances(self, label_file, xml_file):
    """Build Instance objects from paired XML sentences and score lines.

    Each line of *label_file* is "<id> <s0> <s1> ... <s5>" where the six
    scores correspond to the six emotions below; the emotion with the
    highest score becomes the instance label.

    Args:
        label_file: path to the whitespace-separated id/score file.
        xml_file: path to the XML file whose sentence order matches it.

    Returns:
        (instances, labels_final): list of Instance objects and the set of
        integer label indices that were seen.
    """
    instances = []
    labels_final = set()
    # Load the NLTK perceptron model once, outside the loop, to speed up tagging.
    tagger = PerceptronTagger()
    labels_dict = {0: "anger", 1: "disgust", 2: "fear",
                   3: "joy", 4: "sadness", 5: "surprise"}
    tree = ET.parse(xml_file)
    root = tree.getroot()
    with open(label_file) as f:
        # zip replaces Python-2-only itertools.izip; it is lazy in Python 3.
        for sent, line in zip(root, f):
            # attrib.values() is a non-indexable view in Python 3, so take
            # the first value via an iterator instead of [0].
            id_xml = next(iter(sent.attrib.values()))
            id_labels = line.rstrip().split()
            id_file = id_labels[0]
            if id_xml == id_file:
                # Join all text fragments; the original kept only the last
                # fragment, which loses text when a sentence has sub-elements.
                text = "".join(sent.itertext())
                # Argmax over the six scores: index of the highest score.
                scores = [int(s) for s in id_labels[1:]]
                label = scores.index(max(scores))
                inst = Instance(text, labels_dict[label])
                inst_tokenized = word_tokenize(text)
                inst_tagged = tagger.tag(inst_tokenized)
                for word, tag in inst_tagged:
                    inst.add_token(Token(word, tag))
                instances.append(inst)
                labels_final.add(label)
    return instances, labels_final
def get_instances(self, folder):
    """Build Instance objects from a space-separated emotion corpus file.

    Each line is "<code> <id> <text>" where <code> is a two-letter emotion
    code; lines tagged "ne" (no emotion) are skipped.

    Args:
        folder: path to the corpus file (despite the name, a file path).

    Returns:
        (instances, labels): list of Instance objects and the set of raw
        two-letter label codes that were seen.
    """
    # Map corpus codes to emotion names; "hp" (happiness) maps to "joy"
    # so labels line up with the other corpora. TODO confirm this mapping.
    labels_dict = {"hp": "joy", "sd": "sadness", "ag": "anger",
                   "dg": "disgust", "sp": "surprise", "fr": "fear"}
    instances = []
    labels = set()
    # Load the NLTK perceptron model once, outside the loop, to speed up tagging.
    tagger = PerceptronTagger()
    with open(folder) as f:
        for line in f:
            # Split on the first two spaces only so the text keeps its spaces.
            # `inst_id` avoids shadowing the builtin `id`.
            label, inst_id, text = line.strip().split(" ", 2)
            if label == "ne":  # ignore no emotion
                continue
            inst = Instance(text, labels_dict[label])
            inst_tokenized = word_tokenize(text)
            for word, tag in tagger.tag(inst_tokenized):
                inst.add_token(Token(word, tag))
            instances.append(inst)
            labels.add(label)
    return instances, labels
def get_instances(self, folder):
    """Build Instance objects from per-author agreement/POS corpus folders.

    Expects *folder* to contain one subfolder per author with "agree-sent/"
    (lines "<sent_id>@<label>@<text>") and "pos/" (one pre-tagged line per
    sentence, "(TAG token):(TAG token):...") subdirectories.

    Args:
        folder: root directory of the corpus.

    Returns:
        (instances, labels): list of Instance objects and the set of
        integer labels that were seen.
    """
    instances = []
    labels = set()
    for author in os.listdir(folder):
        path = folder + "/" + author + "/agree-sent/"
        path_pos = folder + "/" + author + "/pos/"
        if os.path.exists(path) and os.path.exists(path_pos):
            for af in os.listdir(path):
                current = os.path.join(path, af)
                current_pos = os.path.join(
                    path_pos, af.split('.')[0] + '.sent.okpuncs.props.pos')
                if os.path.isfile(current) and os.path.isfile(current_pos):
                    # Text mode + context managers: the original opened both
                    # files in "rb" and never closed them (str.split on bytes
                    # also fails under Python 3).
                    with open(current) as agree_data, \
                            open(current_pos) as pos_file:
                        pos_data = pos_file.readlines()
                        for x in agree_data:
                            # Split once, keeping any "@" inside the text
                            # (the original re-split per field and truncated
                            # text at the next "@").
                            sent_id, label, text = x.strip().split("@", 2)
                            sent_id = int(sent_id)
                            label = int(label)
                            y = pos_data[sent_id].strip()
                            inst = Instance(text, label)
                            for tagtoken in y.split("):("):
                                parts = tagtoken.split(" ")
                                tag = parts[0].lstrip("(")
                                inst.add_token(Token(parts[1], tag))
                            instances.append(inst)
                            labels.add(label)
    return instances, labels
def get_instances(self, folder):
    """Build Instance objects from a UTF-8 tab-separated corpus file.

    Each valid line is "<id>:\t<text>\t<label>"; malformed lines are
    skipped. '#' characters are stripped from the text (hashtag markers)
    and the label is reduced to its lowercase letters.

    Args:
        folder: path to the TSV corpus file (despite the name, a file path).

    Returns:
        (instances, labels): list of Instance objects and the set of
        cleaned label strings that were seen.
    """
    instances = []
    labels = set()
    # Load the NLTK perceptron model once, outside the loop, to speed up tagging.
    tagger = PerceptronTagger()
    with io.open(folder, encoding="utf-8") as f:
        for line in f:
            line_split = line.rstrip().split("\t")
            if len(line_split) != 3:  # skip malformed lines
                continue
            # `inst_id` avoids shadowing the builtin `id`.
            inst_id, text, label = line_split
            inst_id = inst_id.rstrip(":")
            text = re.sub('[#]', '', text.rstrip())   # drop hashtag markers
            label = re.sub('[^a-z]', '', label)       # keep lowercase letters only
            inst = Instance(text, label)
            inst_tokenized = word_tokenize(text)
            for word, tag in tagger.tag(inst_tokenized):
                inst.add_token(Token(word, tag))
            instances.append(inst)
            labels.add(label)
    return instances, labels