Example #1
import codecs
import pickle
import re
import sys

import percepclassify  # project-local module; wordshape() is assumed defined in this file


def main():

    weights_file = open(sys.argv[1], "rb")
    feature_weights = pickle.load(weights_file)
    weights_file.close()

    perceptron = percepclassify.perceptron_classify()

    sys.stdin = codecs.getreader("latin-1")(sys.stdin.detach(), errors="ignore")
    sys.stdout = codecs.getwriter("latin-1")(sys.stdout.detach(), errors="ignore")

    for line in sys.stdin:
        tagged_tokens = []
        # pad the tag history for the two B_O_S tokens so that
        # tags[curr - 1] and tags[curr - 2] exist on the first real word
        tags = ["", ""]

        # line is tokens separated by space

        new_line = " ".join(["B_O_S", "B_O_S", line.rstrip(), "E_O_S", "E_O_S"])

        tokens = re.split(r"\s+", new_line)

        for i, token in enumerate(tokens[2:-2]):
            curr = i + 2
            word_pos = token.rpartition("/")
            word = word_pos[0]
            pos = word_pos[2]  # rpartition gives (head, sep, tail); the tag follows the final "/"

            prev2_word_pos = tokens[curr - 2].rpartition("/")
            prev2_word = prev2_word_pos[0].lower()
            prev2_pos = prev2_word_pos[2]
            prev2_tag = tags[curr - 2]
            prev1_word_pos = tokens[curr - 1].rpartition("/")
            prev1_word = prev1_word_pos[0].lower()
            prev1_pos = prev1_word_pos[2]
            prev1_tag = tags[curr - 1]

            features = " ".join(
                [
                    word.lower(),
                    "w_pos:" + pos,
                    "w1_tag:" + prev1_tag,
                    "w1_pos:" + prev1_pos,
                    "w2_tag:" + prev2_tag,
                    "w2_pos:" + prev2_pos,
                    "w_shape:" + wordshape(word),
                ]
            )
            tags.append(perceptron.classify(features, feature_weights))
            tagged_tokens.append(token + "/" + tags[curr])

        tagged_sequence = " ".join(tagged_tokens)
        sys.stdout.write(tagged_sequence + "\n")
        sys.stdout.flush()

    return
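
Both tagger examples call a wordshape() helper that is not shown. A minimal sketch of what such a helper might look like, assuming the usual shape encoding (uppercase to "X", lowercase to "x", digit to "d", runs collapsed):

import re

def wordshape(word):
    # Hypothetical reconstruction; the real helper may differ.
    # Map each character to a shape class: uppercase -> "X",
    # lowercase -> "x", digit -> "d", anything else kept as-is.
    shape = []
    for ch in word:
        if ch.isupper():
            shape.append("X")
        elif ch.islower():
            shape.append("x")
        elif ch.isdigit():
            shape.append("d")
        else:
            shape.append(ch)
    # Collapse repeated symbols so e.g. "McDonald" and "DiCaprio"
    # share the shape "XxXx".
    return re.sub(r"(.)\1+", r"\1", "".join(shape))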
Example #2
import codecs
import pickle
import re
import sys

import percepclassify  # project-local module, as in Example #1


def main():

    weights_file = open(sys.argv[1], "rb")
    feature_weights = pickle.load(weights_file)
    weights_file.close()

    perceptron = percepclassify.perceptron_classify()

    sys.stdin = codecs.getreader("latin-1")(sys.stdin.detach(), errors="ignore")
    sys.stdout = codecs.getwriter("latin-1")(sys.stdout.detach(), errors="ignore")

    for line in sys.stdin:
        tagged_tokens = []
        # pad the tag history for the two B_O_S tokens
        tags = ["", ""]

        # line is tokens separated by space

        new_line = " ".join(["B_O_S", "B_O_S", line.rstrip(), "E_O_S", "E_O_S"])

        tokens = re.split(r"\s+", new_line)

        for i, token in enumerate(tokens[2:-2]):
            curr = i + 2
            prev2_word = tokens[curr - 2].lower()
            prev2_tag = tags[curr - 2]
            prev1_word = tokens[curr - 1].lower()
            prev1_tag = tags[curr - 1]
            next1_word = tokens[curr + 1].lower()
            next2_word = tokens[curr + 2].lower()

            features = " ".join(
                [
                    token.lower(),
                    "w1_prev:" + prev1_word,
                    "w1_tag:" + prev1_tag,
                    "w2_prev:" + prev2_word,
                    "w2_tag:" + prev2_tag,
                    "w1_next:" + next1_word,
                    "w2_next:" + next2_word,
                    "w_shape:" + wordshape(token),
                ]
            )
            tags.append(perceptron.classify(features, feature_weights))
            tagged_tokens.append(token + "/" + tags[curr])

        tagged_sequence = " ".join(tagged_tokens)
        sys.stdout.write(tagged_sequence + "\n")
        sys.stdout.flush()

    return
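
The percepclassify module used by both main() functions is external to these snippets. Judging from the call sites (classify(features, feature_weights), where features is a space-separated feature string and feature_weights maps each label to a feature-weight dict, as built in Example #3), a minimal sketch might be:

class perceptron_classify:
    # Hypothetical reconstruction; the method and argument names come
    # from the call sites above, the body is assumed.
    def classify(self, features, feature_weights):
        feats = features.split()
        best_label, best_score = None, float("-inf")
        for label, weights in feature_weights.items():
            # score = sum of this label's weights over the active features
            score = sum(weights.get(f, 0.0) for f in feats)
            if score > best_score:
                best_label, best_score = label, score
        return best_label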
Example #3
    # Requires codecs, re, time, collections.defaultdict, and the
    # project-local percepclassify module to be imported at file scope.
    def learn(self, trainingfile, modelfile, devfile):

        # initialize the weight maps
        feature_weights = defaultdict(int)
        avg_feature_weights = defaultdict(int)
        best_avg_vector = defaultdict(int)

        # best dev error seen so far; starts at the worst case, 1.0
        prev_error = 1.0

        # Preprocess the training file to collect the label set and
        # initialize the feature_vector and avg_feature_vector maps.

        # open the training file
        training_file = codecs.open(trainingfile, "r", encoding="latin-1", errors="ignore")

        # i counts the lines in the training file
        i = 0
        # each tokenized training line is stored here
        lines = []

        for line in training_file:

            tokens = re.split(r"\s+", line.rstrip())
            lines.append(tokens)
            i += 1

            if tokens[0] not in self.labels:
                self.labels.append(tokens[0])
                self.feature_vector[tokens[0]] = {}
                self.avg_feature_vector[tokens[0]] = {}

            for token in tokens[1:]:
                if token not in feature_weights:
                    feature_weights[token] = 0.0
                    avg_feature_weights[token] = 0.0

        #close the training file
        training_file.close()

        # If no dev file is provided, split the training set instead,
        # holding out the last 20% as a temporary dev set.

        train_lines = 0
        if devfile:
            train_lines = i
        else:
            train_lines = int(i * .8)
            # create a temporary dev file from the held-out lines
            devfile = "_temp_dev"
            dev_file = codecs.open(devfile, "w+", encoding="latin-1", errors="ignore")
            for line in lines[train_lines:]:
                dev_file.write(" ".join(line) + "\n")

            dev_file.close()

        for label in self.labels:
            self.feature_vector[label] = dict(feature_weights)
            self.avg_feature_vector[label] = dict(avg_feature_weights)

        # main training loop: maxIter passes over the training portion

        c = 1  # update counter, used for weight averaging

        for i in range(self.maxIter):

            iter_start = time.time()  # timer
            print("Iteration :", i + 1)

            for line in lines[:train_lines]:

                # score each label: sum of its weights over the line's features
                calculated_weights = {}

                for label in self.labels:

                    feature_weights = self.feature_vector[label]
                    weight_calc = 0

                    for word in line[1:]:
                        weight_calc += feature_weights[word]

                    calculated_weights[label] = weight_calc

                # the label with the highest score is the prediction
                classified_label = max(calculated_weights, key=calculated_weights.get)

                # the gold label is the first token of the line
                actual_label = line[0]

                # on a misclassification, demote the predicted label's
                # weights and promote the gold label's weights
                if actual_label != classified_label:

                    for word in line[1:]:
                        self.update_feature_weights(c, classified_label, word, -1.0)
                        self.update_feature_weights(c, actual_label, word, 1.0)

                c += 1

            #print("Iteration time: ",iter_end-iter_start)

            #Test the performance of the average weight vector using the classifier

            #initialize the perceptron classifier
            classifier = percepclassify.perceptron_classify()

            #After each Iteration test against the Dev set if provided
            if (devfile):

                #update the average weights
                for label in self.labels:
                    for feature in self.feature_vector[label]:
                        self.update_feature_weights(c, label, feature, 0)

                #Call the Perceptron classifier to check for the error rate on dev set
                dev_error = classifier.check_dev_error(devfile, self.feature_vector)
                print("Error:", dev_error)
                #if current error is less than the previous error we set this as the best average vector
                if prev_error > dev_error:
                    prev_error = dev_error
                    best_avg_vector = dict(self.avg_feature_vector)

        #Iteration ends here
            iter_end = time.time()
            print("Time:",iter_end-iter_start)
        # devfile is always set by this point (either passed in or created
        # by the split above), so the best averaged vector is what gets written
        if devfile:
            self.write_weights_file(best_avg_vector, modelfile)
            return

        # fallback path (unreachable while the auto-split above sets devfile):
        # flush the averages and write the full averaged vector
        for label in self.labels:
            for feature in self.feature_vector[label]:
                self.update_feature_weights(c, label, feature, 0)

        # write the learned model to the modelfile
        self.write_weights_file(self.avg_feature_vector, modelfile)

        return
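
learn() also leans on an update_feature_weights(c, label, feature, delta) method that is not shown. The counter c threaded through every call, plus the zero-delta "flush" calls after training, suggest the lazy-update trick for the averaged perceptron; a sketch under that assumption (the _last_update map is invented here for illustration):

    def update_feature_weights(self, c, label, feature, delta):
        # Hypothetical sketch of the lazy averaged-perceptron update.
        # Credit the current weight for every step it survived unchanged,
        # then apply delta; a delta of 0 just flushes the running average.
        last = self._last_update.get((label, feature), 0)
        weight = self.feature_vector[label].get(feature, 0.0)
        avg = self.avg_feature_vector[label].get(feature, 0.0)
        self.avg_feature_vector[label][feature] = avg + (c - last) * weight
        self.feature_vector[label][feature] = weight + delta
        self._last_update[(label, feature)] = c

The true average would be the accumulated sum divided by the final c; whether write_weights_file performs that division is not visible from this excerpt.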