def main(): weights_file = open(sys.argv[1], "rb") feature_weights = pickle.load(weights_file) weights_file.close() perceptron = percepclassify.perceptron_classify() sys.stdin = codecs.getreader("latin-1")(sys.stdin.detach(), errors="ignore") sys.stdout = codecs.getwriter("latin-1")(sys.stdout.detach(), errors="ignore") for line in sys.stdin: tagged_tokens = [] tags = [] pos = [] tags.append("") tags.append("") pos.append("") pos.append("") # line is tokens separated by space new_line = " ".join(["B_O_S", "B_O_S", line.rstrip(), "E_O_S", "E_O_S"]) tokens = re.split(r"\s+", new_line) for i, token in enumerate(tokens[2:-2]): curr = i + 2 word_pos = token.rpartition("/") word = word_pos[0] pos = word_pos[1] prev2_word_pos = tokens[curr - 2].rpartition("/") prev2_word = prev2_word_pos[0].lower() prev2_pos = prev2_word_pos[2] prev2_tag = tags[curr - 2] prev1_word_pos = tokens[curr - 1].rpartition("/") prev1_word = prev1_word_pos[0].lower() prev1_pos = prev1_word_pos[1] prev1_tag = tags[curr - 1] features = " ".join( [ word.lower(), "w_pos:" + pos, "w1_tag:" + prev1_tag, "w1_pos:" + prev1_pos, "w2_tag:" + prev2_tag, "w2_pos:" + prev2_pos, "w_shape:" + wordshape(word), ] ) tags.append(perceptron.classify(features, feature_weights)) tagged_tokens.append(str(token + "/" + tags[curr])) tagged_sequence = " ".join(tagged_tokens) sys.stdout.write(tagged_sequence + "\n") sys.stdout.flush return
def main():
    # Load the pickled feature weights from the model file given as argv[1].
    with open(sys.argv[1], "rb") as weights_file:
        feature_weights = pickle.load(weights_file)
    perceptron = percepclassify.perceptron_classify()

    sys.stdin = codecs.getreader("latin-1")(sys.stdin.detach(), errors="ignore")
    sys.stdout = codecs.getwriter("latin-1")(sys.stdout.detach(), errors="ignore")

    for line in sys.stdin:
        tagged_tokens = []
        # Dummy tags for the two boundary tokens.
        tags = ["", ""]

        # Each line is plain tokens separated by spaces; pad it with two
        # begin- and end-of-sentence markers on each side.
        new_line = " ".join(["B_O_S", "B_O_S", line.rstrip(), "E_O_S", "E_O_S"])
        tokens = re.split(r"\s+", new_line)

        for i, token in enumerate(tokens[2:-2]):
            curr = i + 2
            prev2_word = tokens[curr - 2].lower()
            prev2_tag = tags[curr - 2]
            prev1_word = tokens[curr - 1].lower()
            prev1_tag = tags[curr - 1]
            next1_word = tokens[curr + 1].lower()
            next2_word = tokens[curr + 2].lower()

            features = " ".join(
                [
                    token.lower(),
                    "w1_prev:" + prev1_word,
                    "w1_tag:" + prev1_tag,
                    "w2_prev:" + prev2_word,
                    "w2_tag:" + prev2_tag,
                    "w1_next:" + next1_word,
                    "w2_next:" + next2_word,
                    "w_shape:" + wordshape(token),
                ]
            )
            tags.append(perceptron.classify(features, feature_weights))
            tagged_tokens.append(token + "/" + tags[curr])

        sys.stdout.write(" ".join(tagged_tokens) + "\n")
        sys.stdout.flush()
    return
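# Both taggers above call a wordshape() helper that is not shown in this
# section. Below is a minimal sketch of what such a feature function commonly
# computes; the exact mapping is an assumption, not the author's verified
# implementation.
def wordshape(word):
    # Hypothetical word-shape feature: uppercase -> "X", lowercase -> "x",
    # digit -> "d", anything else kept as-is, with runs of repeated shape
    # characters collapsed ("McDonald" -> "XxXx", "1984" -> "d").
    shape = []
    for ch in word:
        if ch.isupper():
            shape.append("X")
        elif ch.islower():
            shape.append("x")
        elif ch.isdigit():
            shape.append("d")
        else:
            shape.append(ch)
    return re.sub(r"(.)\1+", r"\1", "".join(shape))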
def learn(self, trainingfile, modelfile, devfile):
    # Initialize the weight maps.
    feature_weights = defaultdict(int)
    avg_feature_weights = defaultdict(int)
    best_avg_vector = defaultdict(int)

    # Start from the worst possible error rate so the first dev score always wins.
    prev_error = 1.0

    # Preprocess the training file to collect all the labels and initialize
    # the feature vectors and average feature vectors.
    training_file = codecs.open(trainingfile, "r+", encoding="latin-1", errors="ignore")

    i = 0       # number of lines read from the training file
    lines = []  # tokenized copy of every training line
    for line in training_file:
        tokens = re.split(r"\s+", line.rstrip())
        lines.append(tokens)
        i += 1
        # The first token on each line is the gold label; the rest are features.
        if tokens[0] not in self.labels:
            self.labels.append(tokens[0])
            self.feature_vector[tokens[0]] = {}
            self.avg_feature_vector[tokens[0]] = {}
        for token in tokens[1:]:
            if token not in feature_weights:
                feature_weights[token] = 0.0
                avg_feature_weights[token] = 0.0
    training_file.close()

    # If no dev file is provided, hold out the last 20% of the training set
    # as a temporary dev set.
    if devfile:
        train_lines = i
    else:
        train_lines = int(i * 0.8)
        devfile = "_temp_dev"
        dev_file = codecs.open(devfile, "w+", encoding="latin-1", errors="ignore")
        for line in lines[train_lines:]:
            dev_file.write(" ".join(line) + "\n")
        dev_file.close()

    # Give every label its own copy of the zero-initialized weight maps.
    for label in self.labels:
        self.feature_vector[label] = dict(feature_weights)
        self.avg_feature_vector[label] = dict(avg_feature_weights)

    c = 1  # update counter, used for weight averaging
    for i in range(self.maxIter):
        iter_start = time.time()
        print("Iteration:", i + 1)
        for line in lines[:train_lines]:
            # Score each label as the sum of its weights for the line's features.
            calculated_weights = {}
            for label in self.labels:
                label_weights = self.feature_vector[label]
                weight_calc = 0
                for word in line[1:]:
                    weight_calc += label_weights[word]
                calculated_weights[label] = weight_calc

            # The label with the highest score is the prediction.
            classified_label = max(calculated_weights, key=calculated_weights.get)
            actual_label = line[0]

            # On a misclassification, penalize the predicted label's weights
            # and reward the true label's weights.
            if actual_label != classified_label:
                for word in line[1:]:
                    self.update_feature_weights(c, classified_label, word, -1.0)
                    self.update_feature_weights(c, actual_label, word, 1.0)
            c += 1

        # After each iteration, evaluate against the dev set.
        classifier = percepclassify.perceptron_classify()
        if devfile:
            # Flush pending averaged updates (a delta of 0 only refreshes the average).
            for label in self.labels:
                for feature in self.feature_vector[label]:
                    self.update_feature_weights(c, label, feature, 0)
            dev_error = classifier.check_dev_error(devfile, self.feature_vector)
            print("Error:", dev_error)
            # Remember the averaged vector with the lowest dev error so far.
            if prev_error > dev_error:
                prev_error = dev_error
                best_avg_vector = dict(self.avg_feature_vector)

        iter_end = time.time()
        print("Time:", iter_end - iter_start)

    # Write the learned model to the modelfile.
    if devfile:
        self.write_weights_file(best_avg_vector, modelfile)
        return
    for label in self.labels:
        for feature in self.feature_vector[label]:
            self.update_feature_weights(c, label, feature, 0)
    self.write_weights_file(self.avg_feature_vector, modelfile)
    return
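# learn() delegates the weight bookkeeping to update_feature_weights(), which
# is not shown here. The update counter c suggests the standard averaged-
# perceptron "lazy averaging" trick (keep a raw weight plus an accumulator of
# c-weighted updates, so the average is raw - accumulator / c). The sketch
# below is an assumed reconstruction under that reading; the _accumulator
# attribute is a hypothetical name, not part of the original code.
def update_feature_weights(self, c, label, feature, delta):
    # Apply the raw perceptron update.
    self.feature_vector[label][feature] = (
        self.feature_vector[label].get(feature, 0.0) + delta
    )
    # Accumulate c * delta so the running average can be recovered lazily.
    acc = self._accumulator.setdefault(label, {})
    acc[feature] = acc.get(feature, 0.0) + c * delta
    # Averaged weight after c updates; calling with delta == 0 (as learn()
    # does at the end of each iteration) just refreshes this value.
    self.avg_feature_vector[label][feature] = (
        self.feature_vector[label][feature] - acc[feature] / c
    )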