Code example #1
File: model.py  Project: aussina/CliNER
    def second_predict(self, data, inds_list):
        """
        Model::second_predict()

        Purpose: Predict concept labels for the chunks flagged by pass one

        @param data      A list of list of strings.
                           - A string is a chunked phrase
                           - An inner list corresponds to one line from the file
        @param inds_list A list of list of integer indices
                           - one line of 'inds_list' contains a list of indices
                               into the corresponding line for 'data'

        @return          A list of classification tuples:
                           (concept, lineno, start word, end word)
        """

        # If first pass predicted no concepts, then skip
        # NOTE: Special case because SVM cannot have empty input
        if sum([len(inds) for inds in inds_list]) == 0:
            print "first pass predicted no concepts, skipping second pass"
            return []

        # Create object that is a wrapper for the features
        feat_o = FeatureWrapper()

        print '\textracting  features (pass two)'

        # Extract features
        X = [
            feat_o.concept_features(s, inds)
            for s, inds in zip(data, inds_list)
        ]
        X = reduce(concat, X)

        print '\tvectorizing features (pass two)'

        # Vectorize features
        X = self.second_vec.transform(X)

        print '\tpredicting    labels (pass two)'

        # Predict concept labels
        out = sci.predict(self.second_clf, X)

        # Line-by-line processing
        o = list(out)
        classifications = []
        for lineno, inds in enumerate(inds_list):

            # Skip empty line
            if not inds: continue

            # For each concept
            for ind in inds:

                # Get next concept
                concept = reverse_concept_labels[o.pop(0)]

                # Get start position (ex. 7th word of line)
                start = 0
                for i in range(ind):
                    start += len(data[lineno][i].split())

                # Length of chunk
                length = len(data[lineno][ind].split())

                # Classification token
                classifications.append(
                    (concept, lineno + 1, start, start + length - 1))

        # Return classifications
        return classifications
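
For orientation, here is a minimal usage sketch of this method. The input values and the 'problem' label are illustrative only, and `model` is assumed to be an already-trained Model instance:

# Hypothetical usage; 'chunked' holds one list of chunked phrases per line,
# and 'inds_list' holds the chunk indices that pass one flagged as concepts.
chunked   = [['The patient', 'denies', 'chest pain']]
inds_list = [[2]]                      # only the chunk 'chest pain' was flagged

for concept, lineno, start, end in model.second_predict(chunked, inds_list):
    # With the data above this would yield e.g. ('problem', 1, 3, 4):
    # words 3-4 of line 1 classified as 'problem'.
    print('%s: line %d, words %d-%d' % (concept, lineno, start, end))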
Code example #2
File: model.py  Project: aussina/CliNER
    def second_train(self, data, inds_list, Y, do_grid=False):
        """
        Model::second_train()

        Purpose: Train the second pass classifier (for concept labeling)

        @param data      A list of list of strings.
                           - A string is a chunked phrase
                           - An inner list corresponds to one line from the file
        @param inds_list A list of list of integer indices
                           - assertion: len(data) == len(inds_list)
                           - one line of 'inds_list' contains a list of indices
                               into the corresponding line for 'data'
        @param Y         A list of concept labels
                           - assertion: there are sum(len(inds) for inds in inds_list) labels,
                               i.e. each index from inds_list maps to a label
        @param do_grid   A boolean indicating whether to perform a grid search

        @return          None
        """

        print '\textracting  features (pass two)'

        # Create object that is a wrapper for the features
        feat_o = FeatureWrapper()

        # Extract features
        X = [
            feat_o.concept_features(s, inds)
            for s, inds in zip(data, inds_list)
        ]
        X = reduce(concat, X)

        print '\tvectorizing features (pass two)'

        # Vectorize labels
        Y = [concept_labels[y] for y in Y]

        # Vectorize features
        X = self.second_vec.fit_transform(X)

        print '\ttraining  classifier (pass two)'

        # Train the model
        self.second_clf = sci.train(X, Y, do_grid)
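
The `reduce(concat, X)` step above flattens the per-line feature lists into one flat list whose order matches `Y`. A standard-library-only illustration of that step (the feature values are placeholders):

from functools import reduce   # reduce lives in functools on Python 2 and 3
from operator import concat

per_line = [['f1', 'f2'], [], ['f3']]   # per-chunk features, grouped by line
flat = reduce(concat, per_line)         # ['f1', 'f2', 'f3']
assert flat == [feat for line in per_line for feat in line]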
Code example #3
File: model.py  Project: aussina/CliNER
    def first_train(self, data, Y, do_grid=False):
        """
        Model::first_train()

        Purpose: Train the first pass classifiers (for IOB chunking)

        @param data      A list of split sentences    (1 sent = 1 line from file)
        @param Y         A list of list of IOB labels (1:1 mapping with data)
        @param do_grid   A boolean indicating whether to perform a grid search

        @return          None
        """

        print '\textracting  features (pass one)'

        # Create object that is a wrapper for the features
        feat_obj = FeatureWrapper(data)

        # Partition into prose vs. nonprose
        prose = []
        nonprose = []
        pchunks = []
        nchunks = []
        for line, labels in zip(data, Y):
            isProse, feats = feat_obj.extract_IOB_features(line)
            if isProse:
                prose.append(feats)
                pchunks += labels
            else:
                nonprose.append(feats)
                nchunks += labels

        # Classify both prose & nonprose
        flabels = ['prose', 'nonprose']
        fsets = [prose, nonprose]
        chunksets = [pchunks, nchunks]
        dvects = [self.first_prose_vec, self.first_nonprose_vec]
        clfs = [self.first_prose_clf, self.first_nonprose_clf]

        vectorizers = []
        classifiers = []

        for flabel, fset, chunks, dvect, clf in zip(flabels, fsets, chunksets,
                                                    dvects, clfs):

            if len(fset) == 0:
                raise Exception(
                    'Training data must have %s training examples' % flabel)

            print '\tvectorizing features (pass one) ' + flabel

            # Vectorize IOB labels
            Y = [IOB_labels[y] for y in chunks]

            # Save list structure to reconstruct after vectorization
            offsets = [len(sublist) for sublist in fset]
            for i in range(1, len(offsets)):
                offsets[i] += offsets[i - 1]

            # Vectorize features
            flattened = [item for sublist in fset for item in sublist]
            X = dvect.fit_transform(flattened)
            vectorizers.append(dvect)

            print '\ttraining classifiers (pass one) ' + flabel

            # CRF needs reconstructed lists
            if self.crf_enabled:
                X = list(X)
                X = [X[i:j] for i, j in zip([0] + offsets, offsets)]
                Y = [Y[i:j] for i, j in zip([0] + offsets, offsets)]
                lib = crf
            else:
                lib = sci

            # Train classifiers
            clf = lib.train(X, Y, do_grid)
            classifiers.append(clf)

        # Save vectorizers
        self.first_prose_vec = vectorizers[0]
        self.first_nonprose_vec = vectorizers[1]

        # Save classifiers
        self.first_prose_clf = classifiers[0]
        self.first_nonprose_clf = classifiers[1]
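
The `offsets` bookkeeping above is what lets the code flatten per-sentence feature lists for the vectorizer and then rebuild the original grouping for the CRF. The same round trip in isolation:

# Flatten grouped items, then rebuild the grouping from cumulative offsets.
grouped = [['a', 'b'], ['c'], ['d', 'e', 'f']]

offsets = [len(sub) for sub in grouped]
for i in range(1, len(offsets)):
    offsets[i] += offsets[i - 1]        # cumulative sums: [2, 3, 6]

flat = [item for sub in grouped for item in sub]

# zip([0] + offsets, offsets) yields the (start, end) slice bounds.
rebuilt = [flat[i:j] for i, j in zip([0] + offsets, offsets)]
assert rebuilt == grouped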
Code example #4
File: model.py  Project: aussina/CliNER
    def first_predict(self, data):
        """
        Model::first_predict()

        Purpose: Predict IOB chunks on data

        @param data   A list of split sentences    (1 sent = 1 line from file)
        @return       A list of list of IOB labels (1:1 mapping with data)
        """

        print '\textracting  features (pass one)'

        # Create object that is a wrapper for the features
        feat_obj = FeatureWrapper(data)

        # Separate prose and nonprose data
        prose = []
        nonprose = []
        plinenos = []
        nlinenos = []
        for i, line in enumerate(data):
            isProse, feats = feat_obj.extract_IOB_features(line)
            if isProse:
                prose.append(feats)
                plinenos.append(i)
            else:
                nonprose.append(feats)
                nlinenos.append(i)

        # Classify both prose & nonprose
        flabels = ['prose', 'nonprose']
        fsets = [prose, nonprose]
        dvects = [self.first_prose_vec, self.first_nonprose_vec]
        clfs = [self.first_prose_clf, self.first_nonprose_clf]
        preds = []

        for flabel, fset, dvect, clf in zip(flabels, fsets, dvects, clfs):

            # If nothing to predict, skip actual prediction
            if len(fset) == 0:
                preds.append([])
                continue

            print '\tvectorizing features (pass one) ' + flabel

            # Save list structure to reconstruct after vectorization
            offsets = [len(sublist) for sublist in fset]
            for i in range(1, len(offsets)):
                offsets[i] += offsets[i - 1]

            # Vectorize features
            flattened = [item for sublist in fset for item in sublist]
            X = dvect.transform(flattened)

            print '\tpredicting    labels (pass one) ' + flabel

            # CRF requires reconstructed lists
            if self.crf_enabled:
                X = list(X)
                X = [X[i:j] for i, j in zip([0] + offsets, offsets)]
                lib = crf
            else:
                lib = sci

            # Predict IOB labels
            out = lib.predict(clf, X)

            # Format labels from output
            pred = [out[i:j] for i, j in zip([0] + offsets, offsets)]
            preds.append(pred)

        # Recover predictions
        plist = preds[0]
        nlist = preds[1]

        # Stitch prose and nonprose data back together
        # translate IOB labels into a readable format
        prose_iobs = []
        nonprose_iobs = []
        iobs = []
        trans = lambda l: reverse_IOB_labels[int(l)]
        for sentence in data:
            if is_prose_sentence(sentence):
                prose_iobs.append(plist.pop(0))
                prose_iobs[-1] = map(trans, prose_iobs[-1])
                iobs.append(prose_iobs[-1])
            else:
                nonprose_iobs.append(nlist.pop(0))
                nonprose_iobs[-1] = map(trans, nonprose_iobs[-1])
                iobs.append(nonprose_iobs[-1])

        # list of list of IOB labels
        return iobs, prose_iobs, nonprose_iobs
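
Taken together, the two passes form a pipeline: first_predict tags tokens with IOB labels, those tags are grouped into chunks, and second_predict assigns a concept label to each flagged chunk. A hedged sketch of the call sequence; `merge_iob_chunks` is an illustrative placeholder, not a CliNER function:

# Hypothetical end-to-end flow over tokenized lines in 'data'.
iobs, _, _ = model.first_predict(data)             # pass one: IOB tag per token
chunked, inds_list = merge_iob_chunks(data, iobs)  # group B/I runs into phrases
classifications = model.second_predict(chunked, inds_list)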