Example #1
def __init__(self, labeled):
    self.labeled = labeled
    self.numIterations = 5
    self.posTypes = []
    self.labelTypes = []
    self.model = PerceptronModel(self.labeled)
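A minimal usage sketch, assuming the train/parse interface shown in the fuller examples below and CoNLL-formatted treebank files (the paths and the labeled flag are placeholders):

parser = Parser(labeled=True)   # True: predict dependency labels as well as heads
parser.train('train.conll')     # hypothetical path to a CoNLL training treebank
parser.parse('test.conll')      # hypothetical path; parsed sentences are printed to stdout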
Example #2
import sys
from collections import defaultdict

# C (CoNLL column indices), Transition, Oracle, and PerceptronModel are
# project-local modules/classes that this example references but does not define.


class Parser:
    def __init__(self, labeled):
        self.labeled = labeled
        self.numIterations = 5
        self.posTypes = []
        self.labelTypes = []
        self.model = PerceptronModel(self.labeled)
        # self.model = DeepLearningModel(self.labeled, self.posTypes, self.labelTypes)
        # self.model = SvmModel(self.labeled)

    def initialize(self, sentence):
        #            ID    FORM   LEMMA  CPOSTAG  POSTAG  FEATS   HEAD  DEPREL  PHEAD   PDEPREL
        self.root = ['0', 'ROOT', 'ROOT', 'ROOT', 'ROOT', 'ROOT', '-1', 'ROOT', 'ROOT', 'ROOT']
        self.buff = [self.root] + list(reversed(sentence))  # buff = [ROOT, N, N-1, ..., 1]
        self.stack = list()
        self.arcs = {}  # {dependentId: headId}
        self.labels = {}  # {dependentId: dependentLabel}
        self.wordsThatHaveHeads = {}  # {wordId: word}
        self.transitions = list()
        self.dependentIDs = defaultdict(list)  # {headId: [dependentId, ...]}

        # Record the gold dependents of each head so the oracle can tell when a
        # word has collected all of its children. At test time this data is not used.
        for word in sentence:
            self.dependentIDs[word[C.HEAD]].append(word[C.ID])

    # Record the POS tags and dependency labels seen in the data
    def trackTypes(self, sentence):
        for word in sentence:
            if word[C.POSTAG] not in self.posTypes:
                self.posTypes.append(word[C.POSTAG])
            if word[C.DEPREL] not in self.labelTypes:
                self.labelTypes.append(word[C.DEPREL])

    # Apply a single transition to the current parser state (nothing is returned).
    def execute_transition(self, transition):
        if transition.transitionType == Transition.Shift:
            self.stack.append(self.buff.pop())
        else:
            if transition.transitionType == Transition.LeftArc:
                head, dependent = self.stack[-1], self.stack[-2]
            else:
                head, dependent = self.stack[-2], self.stack[-1]
            self.arcs[dependent[C.ID]] = head[C.ID]
            self.labels[dependent[C.ID]] = transition.label
            self.stack.remove(dependent)
            self.wordsThatHaveHeads[dependent[C.ID]] = dependent
        self.transitions.append(transition)

    @staticmethod
    def load_corpus(filename):
        print('Loading treebank from %s' % filename, file=sys.stderr)
        corpus = []
        sentence = []
        with open(filename, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    if sentence:  # skip stray blank lines
                        corpus.append(sentence)
                        sentence = []
                else:
                    word = line.split('\t')
                    sentence.append(word)
        if sentence:  # keep the last sentence if the file lacks a trailing blank line
            corpus.append(sentence)
        print('Loaded %d sentences' % len(corpus), file=sys.stderr)
        return corpus

    def output(self, sentence):
        for word in sentence:
            word[C.HEAD] = self.arcs.get(word[C.ID], '0')
            label = self.labels.get(word[C.ID], '_')
            word[C.DEPREL] = label if label is not None else '_'
            print('\t'.join(word))
        print()  # blank line between sentences

    def train(self, trainingSet):
        for i in range(self.numIterations):
            corpus = Parser.load_corpus(trainingSet)
            oracle = Oracle()
            for sentence in corpus:
                self.initialize(sentence)
                self.trackTypes(sentence)
                while len(self.buff) > 0 or len(self.stack) > 1:
                    transition = oracle.getTransition(self.stack, self.buff,
                                                      self.dependentIDs,
                                                      self.arcs, self.labeled)
                    self.model.learn(transition, self.stack, self.buff,
                                     self.labels, self.transitions, self.arcs,
                                     self.wordsThatHaveHeads)
                    self.execute_transition(transition)

    def parse(self, testSet):
        corpus = Parser.load_corpus(testSet)
        for sentence in corpus:
            self.initialize(sentence)
            while len(self.buff) > 0 or len(self.stack) > 1:
                _, transition = self.model.predict(self.stack, self.buff,
                                                   self.labels,
                                                   self.transitions, self.arcs,
                                                   self.wordsThatHaveHeads)
                self.execute_transition(transition)
            self.output(sentence)
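The class above leans on a project-local module C of CoNLL-X column indices and a Transition class, neither of which appears in these examples. A minimal sketch of what they might look like, inferred from the column comment in initialize() and the attribute accesses in execute_transition() (all names and values here are assumptions):

# Hypothetical CoNLL-X column indices matching the comment in initialize().
class C:
    ID, FORM, LEMMA, CPOSTAG, POSTAG, FEATS, HEAD, DEPREL, PHEAD, PDEPREL = range(10)


# Hypothetical transition object with the attributes the parser reads.
class Transition:
    Shift, LeftArc, RightArc = range(3)   # assumed transition type constants

    def __init__(self, transitionType, label=None):
        self.transitionType = transitionType
        self.label = label                # dependency label for arc transitions, else None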
Example #3
import sys
from collections import defaultdict


class Parser:
    def __init__(self, labeled):
        self.labeled = labeled
        self.numIterations = 5
        self.posTypes = []
        self.labelTypes = []
        self.model = PerceptronModel(self.labeled)
        # self.model = DeepLearningModel(self.labeled, self.posTypes, self.labelTypes)
        # self.model = SvmModel(self.labeled)

    def initialize(self, sentence):
        #            ID    FORM   LEMMA  CPOSTAG  POSTAG  FEATS   HEAD  DEPREL  PHEAD   PDEPREL
        self.root = ["0", "ROOT", "ROOT", "ROOT", "ROOT", "ROOT", "-1", "ROOT", "ROOT", "ROOT"]
        self.buff = [self.root] + list(reversed(sentence))  # buff = [ROOT, N, N-1, ... 1]
        self.stack = list()
        self.arcs = {}  # {dependentId: headId}
        self.labels = {}  # {dependentId: dependentLabel}
        self.wordsThatHaveHeads = {}  # {wordId: word}
        self.transitions = list()
        self.dependentIDs = defaultdict(list)  # {headId: [dependentId, ...]}

        # Record the gold dependents of each head so the oracle can tell when a
        # word has collected all of its children. At test time this data is not used.
        for word in sentence:
            self.dependentIDs[word[C.HEAD]].append(word[C.ID])

    # Record the POS tags and dependency labels seen in the data
    def trackTypes(self, sentence):
        for word in sentence:
            if word[C.POSTAG] not in self.posTypes:
                self.posTypes.append(word[C.POSTAG])
            if word[C.DEPREL] not in self.labelTypes:
                self.labelTypes.append(word[C.DEPREL])

    # Apply a single transition to the current parser state (nothing is returned).
    def execute_transition(self, transition):
        if transition.transitionType == Transition.Shift:
            self.stack.append(self.buff.pop())
        else:
            head = self.stack[-1] if transition.transitionType == Transition.LeftArc else self.stack[-2]
            dependent = self.stack[-2] if transition.transitionType == Transition.LeftArc else self.stack[-1]
            self.arcs[dependent[C.ID]] = head[C.ID]
            self.labels[dependent[C.ID]] = transition.label
            self.stack.remove(dependent)
            self.wordsThatHaveHeads[dependent[C.ID]] = dependent
        self.transitions.append(transition)

    @staticmethod
    def load_corpus(filename):
        print("Loading treebank from %s" % filename, file=sys.stderr)
        corpus = []
        sentence = []
        with open(filename, "r") as f:
            for line in f:
                line = line.strip()
                if not line:
                    if sentence:  # skip stray blank lines
                        corpus.append(sentence)
                        sentence = []
                else:
                    word = line.split("\t")
                    sentence.append(word)
        if sentence:  # keep the last sentence if the file lacks a trailing blank line
            corpus.append(sentence)
        print("Loaded %d sentences" % len(corpus), file=sys.stderr)
        return corpus

    def output(self, sentence):
        for word in sentence:
            word[C.HEAD] = self.arcs.get(word[C.ID], "0")
            label = self.labels.get(word[C.ID], "_")
            word[C.DEPREL] = label if label is not None else "_"
            print "\t".join(word)
        print

    def train(self, trainingSet):
        for i in range(self.numIterations):
            corpus = Parser.load_corpus(trainingSet)
            oracle = Oracle()
            for sentence in corpus:
                self.initialize(sentence)
                self.trackTypes(sentence)
                while len(self.buff) > 0 or len(self.stack) > 1:
                    transition = oracle.getTransition(self.stack, self.buff, self.dependentIDs, self.arcs, self.labeled)
                    self.model.learn(
                        transition,
                        self.stack,
                        self.buff,
                        self.labels,
                        self.transitions,
                        self.arcs,
                        self.wordsThatHaveHeads,
                    )
                    self.execute_transition(transition)

    def parse(self, testSet):
        corpus = Parser.load_corpus(testSet)
        for sentence in corpus:
            self.initialize(sentence)
            while len(self.buff) > 0 or len(self.stack) > 1:
                _, transition = self.model.predict(
                    self.stack, self.buff, self.labels, self.transitions, self.arcs, self.wordsThatHaveHeads
                )
                self.execute_transition(transition)
            self.output(sentence)
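The training loop above asks an Oracle for the next transition given the gold dependentIDs and the arcs built so far. The real Oracle class is not shown in these examples; below is a minimal sketch of a static arc-standard oracle that is consistent with how getTransition() is called, assuming the C and Transition definitions sketched earlier:

class Oracle:
    def getTransition(self, stack, buff, dependentIDs, arcs, labeled):
        # Sketch of a static arc-standard oracle (an assumption, not the project's code).
        # buff is unused here; it is kept only to match the call in train().
        if len(stack) >= 2:
            top, below = stack[-1], stack[-2]
            # LeftArc: the word below the top of the stack is a gold dependent of the top word.
            if below[C.HEAD] == top[C.ID]:
                return Transition(Transition.LeftArc, below[C.DEPREL] if labeled else None)
            # RightArc: the top word depends on the word below it, and all of the
            # top word's own gold dependents have already been attached.
            if top[C.HEAD] == below[C.ID] and all(d in arcs for d in dependentIDs[top[C.ID]]):
                return Transition(Transition.RightArc, top[C.DEPREL] if labeled else None)
        # Otherwise move the next word from the buffer onto the stack.
        return Transition(Transition.Shift)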