コード例 #1
0
    def second_predict(self, data, inds_list):
        """
        Model::second_predict()

        Purpose: Predict a concept label for every chunk selected by
                 'inds_list' (second pass).

        @param data      A list of list of strings.
                           - An inner list corresponds to one line
                           - Each string is split on whitespace to count tokens
        @param inds_list A list of list of integer indices
                           - one line of 'inds_list' contains a list of indices
                               into the corresponding line for 'data'

        @return          A list of (concept, lineno, start, end) tuples
                           - lineno is 1-based
                           - start/end are 0-based, inclusive whitespace-token
                               positions of the chunk within its line
                           - an empty list if 'inds_list' selects nothing
        """

        # If first pass predicted no concepts, then skip
        # NOTE: Special case because SVM cannot have empty input
        if sum(len(inds) for inds in inds_list) == 0:
            print("first pass predicted no concepts, skipping second pass")
            return []

        # Create object that is a wrapper for the features
        feat_o = FeatureWrapper()

        print('\textracting  features (pass two)')

        # Extract features (one result per line, then combined)
        X = [
            feat_o.concept_features(s, inds)
            for s, inds in zip(data, inds_list)
        ]
        X = reduce(concat, X)

        print('\tvectorizing features (pass two)')

        # Vectorize features with the already-fitted vectorizer
        X = self.second_vec.transform(X)

        print('\tpredicting    labels (pass two)')

        # Predict concept labels
        out = sci.predict(self.second_clf, X)

        # Predictions are consumed in order, one per index in 'inds_list'.
        # An index cursor replaces list.pop(0), which shifts the whole list
        # on every call; running out of labels still raises IndexError.
        labels = list(out)
        next_label = 0

        classifications = []
        for lineno, inds in enumerate(inds_list):

            # Skip empty line
            if not inds:
                continue

            # Split every chunk of this line once, not once per concept
            token_counts = [len(chunk.split()) for chunk in data[lineno]]

            # For each concept
            for ind in inds:

                # Get next concept
                concept = reverse_concept_labels[labels[next_label]]
                next_label += 1

                # Start position = number of tokens in the chunks before 'ind'
                # (ex. 7th word of line)
                start = sum(token_counts[i] for i in range(ind))

                # Length of chunk
                length = token_counts[ind]

                # Classification token
                classifications.append(
                    (concept, lineno + 1, start, start + length - 1))

        # Return classifications
        return classifications
コード例 #2
0
ファイル: model.py プロジェクト: aussina/CliNER
    def second_train(self, data, inds_list, Y, do_grid=False):

        """
        Model::second_train()

        Purpose: Train the second pass classifier (concept labels for chunks)

        @param data      A list of list of strings.
                           - A string is a chunked phrase
                           - An inner list corresponds to one line from the file
        @param inds_list A list of list of integer indices
                           - assertion: len(data) == len(inds_list)
                           - one line of 'inds_list' contains a list of indices
                               into the corresponding line for 'data'
        @param Y         A list of concept labels
                           - every label must be a key of 'concept_labels'
                           - assertion: there are sum(len(inds_list)) labels
                               AKA each index from inds_list maps to a label
        @param do_grid   A boolean indicating whether to perform a grid search
                           (passed through to sci.train)

        @return          None
                           - fits self.second_vec
                           - stores the trained classifier in self.second_clf
        """

        print('\textracting  features (pass two)')

        # Wrapper object that knows how to build the features
        extractor = FeatureWrapper()

        # One feature result per line, then combine them with concat
        per_line_feats = []
        for line, inds in zip(data, inds_list):
            per_line_feats.append(extractor.concept_features(line, inds))
        all_feats = reduce(concat, per_line_feats)

        print('\tvectorizing features (pass two)')

        # Map each concept label through the 'concept_labels' lookup
        label_ids = [concept_labels[label] for label in Y]

        # Fit the vectorizer on the features and transform them
        feat_matrix = self.second_vec.fit_transform(all_feats)

        print('\ttraining  classifier (pass two)')

        # Train the model and keep it for second_predict
        self.second_clf = sci.train(feat_matrix, label_ids, do_grid)
コード例 #3
0
    def second_train(self, data, inds_list, Y, do_grid=False):
        """
        Model::second_train()

        Purpose: Train the second pass classifier, which assigns a concept
                 label to each chunk selected by 'inds_list'

        @param data      A list of list of strings.
                           - A string is a chunked phrase
                           - An inner list corresponds to one line from the file
        @param inds_list A list of list of integer indices
                           - assertion: len(data) == len(inds_list)
                           - one line of 'inds_list' contains a list of indices
                               into the corresponding line for 'data'
        @param Y         A list of concept labels (keys of 'concept_labels')
                           - assertion: there are sum(len(inds_list)) labels
                               AKA each index from inds_list maps to a label
        @param do_grid   A boolean indicating whether to perform a grid search
                           (forwarded to sci.train)

        @return          None (self.second_vec is fitted and the trained
                           classifier is stored in self.second_clf)
        """

        print('\textracting  features (pass two)')

        # Feature extraction goes through the wrapper object
        wrapper = FeatureWrapper()

        # Build the per-line feature results, then combine them with concat
        line_features = [
            wrapper.concept_features(phrases, indices)
            for phrases, indices in zip(data, inds_list)
        ]
        combined = reduce(concat, line_features)

        print('\tvectorizing features (pass two)')

        # Labels -> values from the 'concept_labels' lookup
        encoded_labels = []
        for label in Y:
            encoded_labels.append(concept_labels[label])

        # Features -> vectorized form (fits the vectorizer as a side effect)
        vectorized = self.second_vec.fit_transform(combined)

        print('\ttraining  classifier (pass two)')

        # Train and remember the second pass model
        self.second_clf = sci.train(vectorized, encoded_labels, do_grid)
コード例 #4
0
ファイル: model.py プロジェクト: aussina/CliNER
    def second_predict(self, data, inds_list):
        """
        Model::second_predict()

        Purpose: Run the second pass: predict a concept label for each chunk
                 that 'inds_list' points at.

        @param data      A list of list of strings.
                           - An inner list corresponds to one line
                           - Each string is split on whitespace to count tokens
        @param inds_list A list of list of integer indices
                           - one line of 'inds_list' contains a list of indices
                               into the corresponding line for 'data'

        @return          A list of (concept, lineno, start, end) tuples
                           - lineno is 1-based
                           - start/end are 0-based, inclusive whitespace-token
                               positions of the chunk within its line
                           - an empty list if 'inds_list' selects nothing
        """

        # If first pass predicted no concepts, then skip
        # NOTE: Special case because SVM cannot have empty input
        if sum(len(inds) for inds in inds_list) == 0:
            print("first pass predicted no concepts, skipping second pass")
            return []

        # Create object that is a wrapper for the features
        feat_o = FeatureWrapper()

        print('\textracting  features (pass two)')

        # Extract features (one result per line, then combined)
        X = [feat_o.concept_features(s, inds) for s, inds in zip(data, inds_list)]
        X = reduce(concat, X)

        print('\tvectorizing features (pass two)')

        # Vectorize features with the already-fitted vectorizer
        X = self.second_vec.transform(X)

        print('\tpredicting    labels (pass two)')

        # Predict concept labels
        out = sci.predict(self.second_clf, X)

        # Line-by-line processing.
        # Predictions come back in the same order as the indices, so walk them
        # with a cursor instead of list.pop(0) (which is O(n) per call).
        # Indexing past the end still raises IndexError like pop(0) did.
        predicted = list(out)
        cursor = 0

        classifications = []
        for lineno, inds in enumerate(inds_list):

            # Skip empty line
            if not inds:
                continue

            # Token count of every chunk on this line, computed once
            chunk_sizes = [len(chunk.split()) for chunk in data[lineno]]

            # For each concept
            for ind in inds:

                # Get next concept
                concept = reverse_concept_labels[predicted[cursor]]
                cursor += 1

                # Get start position (ex. 7th word of line):
                # total tokens in the chunks that precede 'ind'
                start = sum(chunk_sizes[i] for i in range(ind))

                # Length of chunk
                length = chunk_sizes[ind]

                # Classification token
                classifications.append(
                    (concept, lineno + 1, start, start + length - 1))

        # Return classifications
        return classifications