Beispiel #1
0
    def first_train(self, data, Y, do_grid=False):
        """
        Model::first_train()

        Purpose: Train the first pass classifiers (for IOB chunking)

        Every line is routed to either the prose or the nonprose partition
        according to the flag returned by extract_IOB_features(). For each
        partition the existing vectorizer object is fit on the flattened
        features and a classifier is trained with crf.train() (when
        self.crf_enabled is true) or sci.train() (otherwise). The results are
        stored on self only after both partitions have trained.

        @param data      A list of split sentences    (1 sent = 1 line from file)
        @param Y         A list of list of IOB labels (1:1 mapping with data)
        @param do_grid   A boolean indicating whether to perform a grid search

        @return          None

        @raise Exception If either the prose or the nonprose partition ends up
                         with no training examples.
        """

        # A parenthesised single-argument print behaves the same as the print
        # statement under Python 2 and also parses under Python 3.
        print('\textracting  features (pass one)')

        # Create object that is a wrapper for the features
        feat_obj = FeatureWrapper(data)

        # Partition into prose v. nonprose. Features stay grouped per line,
        # labels are flattened into one long sequence per partition.
        prose = []
        nonprose = []
        pchunks = []
        nchunks = []
        for line, labels in zip(data, Y):
            isProse, feats = feat_obj.extract_IOB_features(line)
            if isProse:
                prose.append(feats)
                pchunks += labels
            else:
                nonprose.append(feats)
                nchunks += labels

        # One (name, features, labels, vectorizer) record per partition.
        # The previously trained classifiers are deliberately not read here:
        # they were never used and would only force the attributes to exist
        # before the first training run.
        partitions = [
            ('prose', prose, pchunks, self.first_prose_vec),
            ('nonprose', nonprose, nchunks, self.first_nonprose_vec),
        ]

        trained = []
        for flabel, fset, chunks, dvect in partitions:

            if not fset:
                raise Exception(
                    'Training data must have %s training examples' % flabel)

            print('\tvectorizing features (pass one) ' + flabel)

            # Vectorize IOB labels (kept in a local so that the Y argument
            # is not rebound half-way through the method)
            targets = [IOB_labels[y] for y in chunks]

            # Vectorize features
            flattened = [item for sublist in fset for item in sublist]
            X = dvect.fit_transform(flattened)

            print('\ttraining classifiers (pass one) ' + flabel)

            if self.crf_enabled:
                # CRF needs the per-line grouping back: compute the
                # (start, end) row range of every line once and reuse it for
                # both the feature rows and the labels.
                bounds = []
                start = 0
                for sublist in fset:
                    end = start + len(sublist)
                    bounds.append((start, end))
                    start = end

                rows = list(X)
                X = [rows[i:j] for i, j in bounds]
                targets = [targets[i:j] for i, j in bounds]
                lib = crf
            else:
                lib = sci

            # Train classifier for this partition
            trained.append((dvect, lib.train(X, targets, do_grid)))

        # Save vectorizers and classifiers (only reached when both
        # partitions trained without raising)
        prose_model, nonprose_model = trained
        self.first_prose_vec, self.first_prose_clf = prose_model
        self.first_nonprose_vec, self.first_nonprose_clf = nonprose_model
Beispiel #2
0
    def first_train(self, data, Y, do_grid=False):

        """
        Model::first_train()

        Purpose: Train the first pass classifiers (for IOB chunking)

        Lines are split into a prose and a nonprose group using the flag
        returned by extract_IOB_features(). Each group gets its own
        vectorizer (fit via fit_transform on the flattened features) and its
        own classifier, trained through crf.train() when self.crf_enabled is
        true and through sci.train() otherwise.

        @param data      A list of split sentences    (1 sent = 1 line from file)
        @param Y         A list of list of IOB labels (1:1 mapping with data)
        @param do_grid   A boolean indicating whether to perform a grid search

        @return          None

        @raise Exception When the prose group or the nonprose group contains
                         no training examples.
        """

        # Single-argument print(...) is equivalent to the print statement in
        # Python 2 and is also valid Python 3.
        print('\textracting  features (pass one)')


        # Create object that is a wrapper for the features
        feat_obj = FeatureWrapper(data)


        # Partition into prose v. nonprose
        prose    = []
        nonprose = []
        pchunks  = []
        nchunks  = []
        for line, labels in zip(data, Y):
            isProse, feats = feat_obj.extract_IOB_features(line)
            if isProse:
                prose.append(feats)
                pchunks += labels
            else:
                nonprose.append(feats)
                nchunks += labels


        # Train both prose & nonprose
        flabels    = ['prose'             , 'nonprose'             ]
        fsets      = [prose               , nonprose               ]
        chunksets  = [pchunks             , nchunks                ]
        dvects     = [self.first_prose_vec, self.first_nonprose_vec]

        vectorizers = []
        classifiers = []

        for flabel, fset, chunks, dvect in zip(flabels, fsets, chunksets, dvects):

            if len(fset) == 0:
                raise Exception('Training data must have %s training examples' % flabel)

            print('\tvectorizing features (pass one) ' + flabel)

            # Vectorize IOB labels. A separate local is used so the Y
            # parameter keeps referring to the caller's labels.
            label_ids = [IOB_labels[y] for y in chunks]

            # Vectorize features
            flattened = [item for sublist in fset for item in sublist]
            X = dvect.fit_transform(flattened)
            vectorizers.append(dvect)


            print('\ttraining classifiers (pass one) ' + flabel)

            # CRF needs reconstructed lists
            if self.crf_enabled:
                # Running totals of the per-line feature counts give the end
                # index of every line in the flattened data.
                offsets = [len(sublist) for sublist in fset]
                for k in range(1, len(offsets)):
                    offsets[k] += offsets[k-1]

                # (start, end) pairs; materialised once because they are
                # consumed twice below.
                spans = list(zip([0] + offsets, offsets))

                rows      = list(X)
                X         = [rows[i:j]      for i, j in spans]
                label_ids = [label_ids[i:j] for i, j in spans]
                lib = crf
            else:
                lib = sci

            # Train classifiers
            classifiers.append(lib.train(X, label_ids, do_grid))


        # Save vectorizers
        self.first_prose_vec    = vectorizers[0]
        self.first_nonprose_vec = vectorizers[1]

        # Save classifiers
        self.first_prose_clf    = classifiers[0]
        self.first_nonprose_clf = classifiers[1]
Beispiel #3
0
    def first_predict(self, data):
        """
        Model::first_predict()

        Purpose: Predict IOB chunks on data

        Each line is routed to the prose or the nonprose model using the flag
        returned by extract_IOB_features(). The line numbers recorded during
        that routing are then used to put the predictions back in input
        order, so the prose/nonprose decision is made exactly once per line.

        @param data   A list of split sentences    (1 sent = 1 line from file)
        @return       A 3-tuple (iobs, prose_iobs, nonprose_iobs):
                        iobs           list of list of IOB labels (1:1
                                       mapping with data)
                        prose_iobs     the label lists of the prose lines
                        nonprose_iobs  the label lists of the nonprose lines
                      The inner lists of iobs are the same objects that appear
                      in prose_iobs / nonprose_iobs.
        """

        # A parenthesised single-argument print behaves the same as the print
        # statement under Python 2 and also parses under Python 3.
        print('\textracting  features (pass one)')

        # Create object that is a wrapper for the features
        feat_obj = FeatureWrapper(data)

        # Separate prose and nonprose data, remembering where each line
        # came from so the predictions can be re-interleaved afterwards.
        prose = []
        nonprose = []
        plinenos = []
        nlinenos = []
        for lineno, line in enumerate(data):
            isProse, feats = feat_obj.extract_IOB_features(line)
            if isProse:
                prose.append(feats)
                plinenos.append(lineno)
            else:
                nonprose.append(feats)
                nlinenos.append(lineno)

        # One (name, features, vectorizer, classifier) record per partition
        partitions = [
            ('prose', prose, self.first_prose_vec, self.first_prose_clf),
            ('nonprose', nonprose,
             self.first_nonprose_vec, self.first_nonprose_clf),
        ]

        preds = []
        for flabel, fset, dvect, clf in partitions:

            # If nothing to predict, skip actual prediction
            if not fset:
                preds.append([])
                continue

            print('\tvectorizing features (pass one) ' + flabel)

            # Record the (start, end) range of every line in the flattened
            # data so the per-line structure can be rebuilt afterwards.
            bounds = []
            start = 0
            for sublist in fset:
                end = start + len(sublist)
                bounds.append((start, end))
                start = end

            # Vectorize features
            flattened = [item for sublist in fset for item in sublist]
            X = dvect.transform(flattened)

            print('\tpredicting    labels (pass one) ' + flabel)

            # CRF requires the per-line lists to be reconstructed
            if self.crf_enabled:
                rows = list(X)
                X = [rows[i:j] for i, j in bounds]
                lib = crf
            else:
                lib = sci

            # Predict IOB labels
            out = lib.predict(clf, X)

            # Regroup the output per line
            preds.append([out[i:j] for i, j in bounds])

        # Translate IOB labels into a readable format. List comprehensions
        # (rather than map) guarantee real lists on every Python version.
        readable = [
            [[reverse_IOB_labels[int(label)] for label in sent]
             for sent in pred]
            for pred in preds
        ]
        prose_iobs, nonprose_iobs = readable

        # Stitch prose and nonprose data back together by original line
        # number: linear time, and no second prose/nonprose classification
        # that could disagree with the one used for routing.
        iobs = [None] * (len(plinenos) + len(nlinenos))
        for linenos, labelled in ((plinenos, prose_iobs),
                                  (nlinenos, nonprose_iobs)):
            for lineno, sent_labels in zip(linenos, labelled):
                iobs[lineno] = sent_labels

        # list of list of IOB labels (plus the per-partition views)
        return iobs, prose_iobs, nonprose_iobs
Beispiel #4
0
    def first_predict(self, data):

        """
        Model::first_predict()

        Purpose: Predict IOB chunks on data

        Lines are split into prose and nonprose using the flag returned by
        extract_IOB_features(), predicted with the matching vectorizer and
        classifier, and then merged back into input order using the line
        numbers recorded during the split.

        @param data   A list of split sentences    (1 sent = 1 line from file)
        @return       A 3-tuple (iobs, prose_iobs, nonprose_iobs) where iobs
                      is a list of list of IOB labels (1:1 mapping with data)
                      and the other two hold the label lists of only the
                      prose / only the nonprose lines. The inner lists are
                      shared between iobs and the per-partition lists.
        """

        # Single-argument print(...) is equivalent to the print statement in
        # Python 2 and is also valid Python 3.
        print('\textracting  features (pass one)')


        # Create object that is a wrapper for the features
        feat_obj = FeatureWrapper(data)

        # separate prose and nonprose data; the line numbers are kept so the
        # predictions can be put back in input order at the end
        prose    = []
        nonprose = []
        plinenos = []
        nlinenos = []
        for idx, line in enumerate(data):
            isProse, feats = feat_obj.extract_IOB_features(line)
            if isProse:
                prose.append(feats)
                plinenos.append(idx)
            else:
                nonprose.append(feats)
                nlinenos.append(idx)


        # Classify both prose & nonprose
        flabels = ['prose'             , 'nonprose'             ]
        fsets   = [prose               , nonprose               ]
        dvects  = [self.first_prose_vec, self.first_nonprose_vec]
        clfs    = [self.first_prose_clf, self.first_nonprose_clf]
        preds   = []

        for flabel, fset, dvect, clf in zip(flabels, fsets, dvects, clfs):

            # If nothing to predict, skip actual prediction
            if len(fset) == 0:
                preds.append([])
                continue


            print('\tvectorizing features (pass one) ' + flabel)

            # Save list structure to reconstruct after vectorization:
            # running totals give the end index of each line
            offsets = [len(sublist) for sublist in fset]
            for k in range(1, len(offsets)):
                offsets[k] += offsets[k-1]

            # (start, end) pairs; built once, used for both X and the output
            spans = list(zip([0] + offsets, offsets))

            # Vectorize features
            flattened = [item for sublist in fset for item in sublist]
            X = dvect.transform(flattened)


            print('\tpredicting    labels (pass one) ' + flabel)

            # CRF requires reconstruct lists
            if self.crf_enabled:
                rows = list(X)
                X    = [rows[i:j] for i, j in spans]
                lib  = crf
            else:
                lib  = sci

            # Predict IOB labels
            out = lib.predict(clf, X)

            # Format labels from output
            preds.append([out[i:j] for i, j in spans])


        # Recover predictions and translate IOB labels into a readable
        # format (explicit lists, so the result does not depend on map()
        # returning a list)
        prose_iobs    = [[reverse_IOB_labels[int(lab)] for lab in sent]
                         for sent in preds[0]]
        nonprose_iobs = [[reverse_IOB_labels[int(lab)] for lab in sent]
                         for sent in preds[1]]


        # Stitch prose and nonprose data back together using the recorded
        # line numbers. This avoids re-classifying every sentence (which
        # could disagree with the split above) and the O(n) cost of
        # repeatedly popping from the front of a list.
        iobs = [None] * (len(plinenos) + len(nlinenos))
        for idx, sent_labels in zip(plinenos, prose_iobs):
            iobs[idx] = sent_labels
        for idx, sent_labels in zip(nlinenos, nonprose_iobs):
            iobs[idx] = sent_labels


        # list of list of IOB labels
        return iobs, prose_iobs, nonprose_iobs