import csv
import re

import numpy as np
from scipy import sparse
from sklearn import (cluster, decomposition, ensemble, kernel_approximation,
                     preprocessing)
from sklearn.base import BaseEstimator
from sklearn.feature_extraction import text


class TitleFeatureExtractor(BaseEstimator):
    def __init__(self, n_clusters=50, pca_n_components=20,
                 kmpca_n_components=3, kernel_n_components=30):
        ## binary bag of 1-2 grams over job titles
        self.counter = text.CountVectorizer(stop_words='english',
                                            ngram_range=(1, 2),
                                            min_df=30, binary=True,
                                            lowercase=True)
        self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters, n_init=10,
                                          batch_size=10000, verbose=1)
        ## RandomizedPCA is scikit-learn < 0.20 API; on current versions
        ## decomposition.TruncatedSVD is the sparse-input equivalent
        self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
        self.kmpca = decomposition.RandomizedPCA(n_components=kmpca_n_components)
        self.rbf = kernel_approximation.RBFSampler(n_components=kernel_n_components)
        self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                         max_depth=5, n_jobs=4)
        self.X_names = ['Title_CounterX', 'Title_ClusterdX', 'Title_KmX',
                        'Title_PCAX', 'Title_PCAClusterdX', 'Title_RbfX',
                        'Title_TreeX']
        self.linear_feature_selector = None
    ## X is the corpus - list of sentences
    def fit(self, X, y=None):
        CounterX = self.counter.fit_transform(X)
        self.km.fit(CounterX)
        labels = self.km.labels_.reshape(-1, 1)
        ## fit the one-hot encoder once so train and test expansions share
        ## the same columns (assumes every cluster id occurs in training)
        self.onehot = preprocessing.OneHotEncoder()
        ClusterdX = self.onehot.fit_transform(labels)
        self.pca.fit(CounterX)
        KmX = self.km.transform(CounterX)
        self.kmpca.fit(KmX)
        self.rbf.fit(CounterX)
        self.tree_hasher.fit(ClusterdX.todense())
        if y is not None:
            ## only the first len(y) rows of the corpus are labeled
            self.linear_feature_selector = CorrelationFeatureSelector(pvalue_threshold=0.8)
            self.linear_feature_selector.fit(CounterX.tocsr()[:y.shape[0], :], y)
        return self
    def transform(self, X):
        CounterX = self.counter.transform(X)
        labels = self.km.predict(CounterX).reshape(-1, 1)
        ClusterdX = self.onehot.transform(labels)
        KmX = self.km.transform(CounterX)
        PCAX = self.pca.transform(CounterX)
        PCAClusterdX = self.kmpca.transform(KmX)
        RbfX = self.rbf.transform(CounterX)
        TreeX = self.tree_hasher.transform(ClusterdX.todense())
        if self.linear_feature_selector:
            CounterX = self.linear_feature_selector.transform(CounterX)
        Xs = [CounterX, ClusterdX, KmX, PCAX, PCAClusterdX, RbfX, TreeX]
        ## name every output column after the feature block it came from
        self.feature_names_ = ['%s_%d' % (name, fi)
                               for name, M in zip(self.X_names, Xs)
                               for fi in range(M.shape[1])]
        ## stack all feature blocks into one sparse result matrix
        return sparse.hstack(Xs)

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
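
A minimal usage sketch, assuming `train_titles` and `test_titles` are lists of job-title strings and `train_salaries` is the target array, all loaded elsewhere (the hard-coded min_df=30 presumes a corpus of thousands of titles):

## hypothetical driver code -- the variable names stand in for the Title
## column of the training/test sets and the salary target
extractor = TitleFeatureExtractor(n_clusters=50, pca_n_components=20)
train_features = extractor.fit(train_titles, y=train_salaries).transform(train_titles)
test_features = extractor.transform(test_titles)  ## same column layout as training
print(train_features.shape, test_features.shape)
print(extractor.feature_names_[:5])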
class LocationFeatureExtractor(BaseEstimator):
    def __init__(self, n_clusters=100, pca_n_components=10, kmpca_n_components=7, kernel_n_components=30):
        self.counter = text.CountVectorizer(
            stop_words="english", ngram_range=(1, 1), min_df=2, max_df=0.8, binary=True, lowercase=True
        )
        self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters, n_init=10, batch_size=10000, verbose=1)
        self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
        self.kmpca = decomposition.RandomizedPCA(n_components=kmpca_n_components)
        self.rbf = kernel_approximation.RBFSampler(n_components=kernel_n_components)
        self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30, max_depth=5, n_jobs=4)
        self.X_names = [
            "Loc_CounterX",
            "Loc_ClusterdX",
            "Loc_KmX",
            "Loc_PCAX",
            "Loc_PCAClusterdX",
            "Loc_RbfX",
            "Loc_TreeX",
        ]
        self.linear_feature_selector = None
        ## BUILD dictionary based on location_tree - faster for search
        location_tree = [row[0].lower().split("~")[::-1] for row in csv.reader(open(LOCATION_TREE_FILE))]
        self.location_dict = {}
        for locs in location_tree:
            for i in range(len(locs)):
                if locs[i] not in self.location_dict:
                    self.location_dict[locs[i]] = locs[i:]
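        ## e.g. a tree row "UK~England~London" (assumed file format) expands
        ## to paths up the tree:
        ##   location_dict['london']  == ['london', 'england', 'uk']
        ##   location_dict['england'] == ['england', 'uk']
        ##   location_dict['uk']      == ['uk']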

    ## X is a pair of parallel corpora: (LocationRaw, LocationNormalized)
    def _standardize_locations(self, X):
        raw_locations, locations = X
        ## strategy: take the first term of LocationRaw -- it is more
        ## informative than the normalized location; if it appears in the
        ## location tree use it, otherwise fall back to LocationNormalized
        pattern = re.compile(r"\b[\w\s]+")
        matches = [pattern.findall(s) for s in raw_locations]
        locations_initial = [
            m[0] if (m and m[0] in self.location_dict) else locations[i]
            for i, m in enumerate(matches)
        ]
        ## join each location's full tree path into one string;
        ## capital 'UK' marks locations not found in the tree
        return np.array(
            [
                " ".join(self.location_dict[initial]) if initial in self.location_dict else "UK"
                for initial in locations_initial
            ]
        )

    def fit(self, X, y=None):
        X = self._standardize_locations(X)
        CounterX = self.counter.fit_transform(X)
        self.km.fit(CounterX)
        labels = self.km.labels_.reshape(-1, 1)
        ## fit the one-hot encoder once so train and test expansions share
        ## the same columns (assumes every cluster id occurs in training)
        self.onehot = preprocessing.OneHotEncoder()
        ClusterdX = self.onehot.fit_transform(labels)
        self.pca.fit(CounterX)
        KmX = self.km.transform(CounterX)
        self.kmpca.fit(KmX)
        self.rbf.fit(CounterX)
        self.tree_hasher.fit(ClusterdX.todense())
        if y is not None:
            ## only the first len(y) rows of the corpus are labeled
            self.linear_feature_selector = CorrelationFeatureSelector(pvalue_threshold=0.8)
            self.linear_feature_selector.fit(CounterX.tocsr()[:y.shape[0], :], y)
        return self

    def transform(self, X):
        X = self._standardize_locations(X)
        CounterX = self.counter.transform(X)
        labels = self.km.predict(CounterX).reshape(-1, 1)
        ClusterdX = self.onehot.transform(labels)
        KmX = self.km.transform(CounterX)
        PCAX = self.pca.transform(CounterX)
        PCAClusterdX = self.kmpca.transform(KmX)
        RbfX = self.rbf.transform(CounterX)
        TreeX = self.tree_hasher.transform(ClusterdX.todense())
        if self.linear_feature_selector:
            CounterX = self.linear_feature_selector.transform(CounterX)
        Xs = [CounterX, ClusterdX, KmX, PCAX, PCAClusterdX, RbfX, TreeX]
        ## name every output column after the feature block it came from
        self.feature_names_ = ['%s_%d' % (name, fi)
                               for name, M in zip(self.X_names, Xs)
                               for fi in range(M.shape[1])]
        ## stack all feature blocks into one sparse result matrix
        return sparse.hstack(Xs)

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
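
The location extractor takes a pair of parallel lists rather than a single corpus. A sketch with assumed variable names (raw_locs/norm_locs standing in for the LocationRaw and LocationNormalized columns; LOCATION_TREE_FILE must already point at the '~'-delimited location-tree CSV):

## hypothetical driver code
extractor = LocationFeatureExtractor(n_clusters=100)
train_features = extractor.fit_transform((raw_locs, norm_locs))
test_features = extractor.transform((test_raw_locs, test_norm_locs))
print(train_features.shape, extractor.feature_names_[:3])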
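
CorrelationFeatureSelector is referenced by both extractors but defined elsewhere in the project; only its interface is visible here (fit(X, y), transform(X), and a pvalue_threshold parameter). A minimal stand-in consistent with that interface might look like the following -- the Pearson-correlation p-value test is an assumption, not the original implementation:

from scipy import stats

class CorrelationFeatureSelector(BaseEstimator):
    ## hypothetical stand-in: keep the columns of X whose Pearson correlation
    ## with y has a p-value below pvalue_threshold
    def __init__(self, pvalue_threshold=0.8):
        self.pvalue_threshold = pvalue_threshold

    def fit(self, X, y):
        X = sparse.csr_matrix(X)
        pvalues = np.array([
            stats.pearsonr(np.asarray(X[:, j].todense()).ravel(), y)[1]
            for j in range(X.shape[1])
        ])
        self.support_ = np.where(pvalues < self.pvalue_threshold)[0]
        return self

    def transform(self, X):
        return sparse.csr_matrix(X)[:, self.support_]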