Code example #1
File: features.py Project: hexterisk/static-malwired
    def process_raw_features(self, raw_obj):

        sections = raw_obj["sections"]
        general = [
            len(sections),  # total number of sections
            # number of sections with zero size
            sum(1 for s in sections if s["size"] == 0),
            # number of sections with an empty name
            sum(1 for s in sections if s["name"] == ""),
            # number of readable and executable (RX) sections
            sum(1 for s in sections
                if "MEM_READ" in s["props"] and "MEM_EXECUTE" in s["props"]),
            # number of writable (W) sections
            sum(1 for s in sections if "MEM_WRITE" in s["props"])
        ]
        # gross characteristics of each section
        section_sizes = [(s["name"], s["size"]) for s in sections]
        section_sizes_hashed = FeatureHasher(50, input_type="pair").transform(
            [section_sizes]).toarray()[0]
        section_entropy = [(s["name"], s["entropy"]) for s in sections]
        section_entropy_hashed = FeatureHasher(
            50, input_type="pair").transform([section_entropy]).toarray()[0]
        section_vsize = [(s["name"], s["vsize"]) for s in sections]
        section_vsize_hashed = FeatureHasher(50, input_type="pair").transform(
            [section_vsize]).toarray()[0]
        entry_name_hashed = FeatureHasher(50, input_type="string").transform(
            [raw_obj["entry"]]).toarray()[0]
        characteristics = [
            p for s in sections for p in s["props"]
            if s["name"] == raw_obj["entry"]
        ]
        characteristics_hashed = FeatureHasher(
            50, input_type="string").transform([characteristics]).toarray()[0]

        return np.hstack([
            general, section_sizes_hashed, section_entropy_hashed,
            section_vsize_hashed, entry_name_hashed, characteristics_hashed
        ]).astype(np.float32)
Code example #2
def test_model_train_explicit():
    raw_corpus = load_raw_corpus(False)
    sent_corpus = load_sentence_corpus(False)

    features = flatten([[span.span_features() for span in Doc(raw).spans]
                        for raw in raw_corpus])
    y = flatten([[is_eos(span, sent['sentences']) for span in Doc(raw).spans]
                 for raw, sent in zip(raw_corpus, sent_corpus)])

    assert len(features) == len(y)

    pipeline = Pipeline([('hasher', FeatureHasher()), ('pa', PA())])

    pipeline.fit(features, y)
Code example #3
File: pe_exports_features.py Project: ucds-sg/h2o-ai
    def exports_features(self, lief_binary):
        from sklearn.feature_extraction import FeatureHasher

        exports = sorted(lief_binary.exported_functions)
        
        features_hashed = {}
        if exports:
            for i, x in enumerate(FeatureHasher(128, input_type='string').transform(exports).toarray()[0]):
                features_hashed.update({f'Exports_functions_hash_{i}': x})
        else:
            for i in range(128):
                features_hashed.update({f'Exports_functions_hash_{i}': 0})

        return features_hashed
Code example #4
def test_hasher_alternate_sign():
    X = [list("Thequickbrownfoxjumped")]

    Xt = FeatureHasher(alternate_sign=True,
                       non_negative=False,
                       input_type='string').fit_transform(X)
    assert Xt.data.min() < 0 and Xt.data.max() > 0

    Xt = FeatureHasher(alternate_sign=True,
                       non_negative=True,
                       input_type='string').fit_transform(X)
    assert Xt.data.min() > 0

    Xt = FeatureHasher(alternate_sign=False,
                       non_negative=True,
                       input_type='string').fit_transform(X)
    assert Xt.data.min() > 0
    Xt_2 = FeatureHasher(alternate_sign=False,
                         non_negative=False,
                         input_type='string').fit_transform(X)
    # With initially positive features, the non_negative option should
    # have no impact when alternate_sign=False
    assert_array_equal(Xt.data, Xt_2.data)
Code example #5
def generate_X(df: pd.core.frame.DataFrame,
               feature_names: Tuple[str],
               n_features: int = 2**20,
               add_bias: bool = True) -> csr_matrix:
    D = df.filter(items=feature_names).to_dict(orient='records')
    for d in D:
        # split `usertag` string
        # e.g.
        # {'usertag': '10059,10052,10063'}
        # will become as follows
        # {'usertag=10059': 1, 'usertag=10052': 1, 'usertag=10063': 1}
        if 'usertag' in d:
            for usertag in d['usertag'].split(','):
                d['usertag={}'.format(usertag)] = 1
            # delete original `usertag`
            del d['usertag']
    if add_bias is True:
        X = FeatureHasher(n_features=n_features - 1).transform(D)
        X = utils.add_bias(X)
    else:
        X = FeatureHasher(n_features=n_features).transform(D)
    del D
    return X
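
The comment inside generate_X shows how each `usertag` string is expanded into indicator keys before hashing. Below is a minimal, self-contained sketch of just that step; the toy record and the n_features value are illustrative assumptions, not taken from the project above.

# Illustrative sketch: how the expanded usertag=... keys end up as one sparse
# row. The toy record and the small n_features value are assumptions.
from sklearn.feature_extraction import FeatureHasher

record = {'weekday': 3, 'usertag': '10059,10052,10063'}
for usertag in record.pop('usertag').split(','):
    record['usertag={}'.format(usertag)] = 1   # one indicator key per tag

hasher = FeatureHasher(n_features=2**10)       # small hash space for readability
row = hasher.transform([record])               # 1 x 1024 scipy sparse matrix
print(row.shape, row.nnz)                      # e.g. (1, 1024) 4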
Code example #6
def test_hasher_invalid_input():
    raw_X = [[], (), iter(range(0))]

    feature_hasher = FeatureHasher(input_type="gobbledygook")
    with pytest.raises(ValueError):
        feature_hasher.transform(raw_X)
    feature_hasher = FeatureHasher(n_features=-1)
    with pytest.raises(ValueError):
        feature_hasher.transform(raw_X)
    feature_hasher = FeatureHasher(n_features=0)
    with pytest.raises(ValueError):
        feature_hasher.transform(raw_X)
    feature_hasher = FeatureHasher(n_features="ham")
    with pytest.raises(TypeError):
        feature_hasher.transform(raw_X)

    feature_hasher = FeatureHasher(n_features=np.uint16(2 ** 6))
    with pytest.raises(ValueError):
        feature_hasher.transform([])
    with pytest.raises(Exception):
        feature_hasher.transform([[5.5]])
    with pytest.raises(Exception):
        feature_hasher.transform([[None]])
Code example #7
def predict(text):
  clf = getModel()
  featuresValues = process(text)
  vector = FeatureHasher(n_features=6).transform([featuresValues]).toarray()
  result = clf.predict(vector)
  fake_result = sum([value > 0 for value in featuresValues.values()])

  # tweak for correct Boolean JSON serialization
  if result[0] and sum(featuresValues.values()):
    jsonRes = True
  else:
    jsonRes = False

  return {'result': jsonRes, 'values': featuresValues, 'fake_result': fake_result}
Code example #8
    def initialize(self):
        if self.model_class == 'scikit':
            self.model = SGDRegressor(loss='squared_loss',
                                      alpha=0.1,
                                      n_iter=10,
                                      shuffle=True,
                                      eta0=0.0001)
            self.feature_constructor = FeatureHasher(n_features=200,
                                                     dtype=np.float64,
                                                     non_negative=False,
                                                     input_type='dict')

        elif self.model_class == 'lookup':
            self.model = {}
Code example #9
def load_conll(f, features, n_features=(2**16), split=False):
    """Load CoNLL file, extract features on the tokens and vectorize them.

    The CoNLL file format is a line-oriented text format that describes
    sequences in a space-separated format, separating the sequences with
    blank lines. Typically, the last space-separated part is a label.

    Since the space-separated parts are usually tokens (and maybe things like
    part-of-speech tags) rather than feature vectors, a function must be
    supplied that does the actual feature extraction. This function has access
    to the entire sequence, so that it can extract context features.

    A ``sklearn.feature_extraction.FeatureHasher`` (the "hashing trick")
    is used to map symbolic input feature names to columns, so this function
    does not remember the actual input feature names.

    Parameters
    ----------
    f : {string, file-like}
        Input file.
    features : callable
        Feature extraction function. Must take a list of tokens l that
        represent a single sequence and an index i into this list, and must
        return an iterator over strings that represent the features of l[i].
    n_features : integer, optional
        Number of columns in the output.
    split : boolean, default=False
        Whether to split lines on whitespace beyond what is needed to parse
        out the labels. This is useful for CoNLL files that have extra columns
        containing information like part of speech tags.

    Returns
    -------
    X : scipy.sparse matrix, shape (n_samples, n_features)
        Samples (feature vectors), as a single sparse matrix.
    y : np.ndarray, dtype np.string, shape n_samples
        Per-sample labels.
    lengths : np.ndarray, dtype np.int32, shape n_sequences
        Lengths of sequences within (X, y). The sum of these is equal to
        n_samples.
    """
    fh = FeatureHasher(n_features=n_features, input_type="string")
    labels = []
    lengths = []

    with _open(f) as f:
        raw_X = _conll_sequences(f, features, labels, lengths, split)
        X = fh.transform(raw_X)

    return X, np.asarray(labels), np.asarray(lengths, dtype=np.int32)
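
The docstring above describes the contract of the `features` callable but the listing never shows one. Here is a hedged sketch of such a callable; it is not part of the module above, and the file path in the commented-out call is hypothetical.

# Illustrative feature function matching the docstring's contract: it receives
# the whole token sequence plus an index and yields feature strings for l[i].
def token_features(tokens, i):
    yield "word=" + tokens[i]
    yield "word.lower=" + tokens[i].lower()
    if i == 0:
        yield "BOS"                      # beginning-of-sequence marker
    else:
        yield "prev=" + tokens[i - 1]    # simple context feature

# Hypothetical call; any CoNLL-formatted file would work here.
# X, y, lengths = load_conll("train.conll", token_features)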
Code example #10
    def transform(self, df):
        from sklearn.feature_extraction import FeatureHasher

        print('encode ...')
        single_space = isinstance(self.hash_space, int)
        if single_space:
            result = np.zeros((df.shape[0], self.hash_space))
            hash_space = [self.hash_space for _ in self.columns]
        else:
            result = []
            hash_space = self.hash_space

        total = len(self.columns) + len(self.array_columns)
        for idx, hash_column in enumerate(zip(self.columns, hash_space)):
            column, n_features = hash_column
            h = FeatureHasher(n_features=n_features, input_type='string', alternate_sign=self.alternate_sign)
            salt = str(hash(column))
            f = h.transform(df[column].astype(str).apply(lambda x: [x + salt]))  # FeatureHasher requires vectors
            if single_space:
                result = result + f.toarray()
            else:
                result.append(f.toarray())

        for idx, hash_column in enumerate(zip(self.array_columns, hash_space)):
            column, n_features = hash_column
            h = FeatureHasher(n_features=n_features, input_type='string', alternate_sign=self.alternate_sign)
            f = h.transform(df[column])
            if single_space:
                result = result + f.toarray()
            else:
                result.append(f.toarray())

        if single_space:
            return result

        # if self.sparse:
        #     return hstack(result)
        return np.concatenate(result, axis=1)
Code example #11
    def initialize(self):
        if self.model_class == 'scikit':
            self.model = SGDRegressor(loss='squared_loss',
                                      alpha=0.1,
                                      n_iter=10,
                                      shuffle=True,
                                      eta0=0.0001)
            self.feature_constructor = FeatureHasher(n_features=200,
                                                     dtype=np.float64,
                                                     non_negative=False,
                                                     input_type='dict')

        elif self.model_class == 'lookup':
            self.model = {}

        # This thing crawls; too much Python overhead for subprocess and pipe
        elif self.model_class == 'vw':
            self.model = None
            self.model_path = self.base_folder_name + "/model.vw"
            self.cache_path = self.base_folder_name + "/temp.cache"
            self.f1 = open(self.base_folder_name + "/train.vw", 'a')

            self.train_vw_cmd = [
                '/usr/local/bin/vw', '--save_resume', '--holdout_off', '-c',
                '--cache_file', self.cache_path, '-f', self.model_path,
                '--passes', '20', '--loss_function', 'squared'
            ]
            self.train_vw_resume_cmd = [
                '/usr/local/bin/vw', '--save_resume', '-i', self.model_path,
                '-f', self.model_path
            ]

            # self.remove_vw_files()

        elif self.model_class == 'vw_python':
            # TODO interactions, lrq, dropout etc commands go here
            # TODO Need to pass model path and throw finish somewhere to store the final model
            self.model_path = self.base_folder_name + "/model.vw"
            self.cache_path = self.base_folder_name + "/temp.cache"
            #self.f1 = open(self.base_folder_name + "/train.vw", 'a')
            self.model = pyvw.vw(quiet=True,
                                 l2=0.00000001,
                                 loss_function='squared',
                                 passes=1,
                                 holdout_off=True,
                                 cache=self.cache_path,
                                 f=self.model_path,
                                 lrq='sdsd7',
                                 lrqdropout=True)
Code example #12
def clasificador():
    data = read_all_documents('./training')
    documents = data['docs']
    labels = data['labels']

    vectorizer = DictVectorizer()
    vectorizer.fit_transform(tokens_frequency(d) for d in documents)

    vectorizer.get_feature_names()

    #Sparse matrices
    hasher = FeatureHasher(n_features=2**8, input_type="string")
    X = hasher.transform(tokens(d) for d in documents)

    #Train a text classifier using K-Means clustering

    clf = joblib.load('modelo_entrenado.pkl')  # Load the trained model.

    prepositions = [
        'a', 'ante', 'bajo', 'cabe', 'con', 'contra', 'de', 'desde', 'en',
        'entre', 'hacia', 'hasta', 'para', 'por', 'según', 'sin', 'so',
        'sobre', 'tras'
    ]
    prep_alike = [
        'durante', 'mediante', 'excepto', 'salvo', 'incluso', 'más', 'menos'
    ]
    adverbs = ['no', 'si', 'sí']
    articles = [
        'el', 'la', 'los', 'las', 'un', 'una', 'unos', 'unas', 'este', 'esta',
        'estos', 'estas', 'aquel', 'aquella', 'aquellos', 'aquellas'
    ]
    aux_verbs = [
        'he', 'has', 'ha', 'hemos', 'habéis', 'han', 'había', 'habías',
        'habíamos', 'habíais', 'habían'
    ]
    tfid = TfidfVectorizer(stop_words=prepositions + prep_alike + adverbs +
                           articles + aux_verbs)

    X_train = tfid.fit_transform(documents)
    y_train = labels

    #Predict categories for new articles

    test = read_all_documents('prueba')
    X_test = tfid.transform(test['docs'])
    y_test = test['labels']
    pred = clf.predict(X_test)
    cat = str(pred[0])
    return (cat)
Code example #13
    def predict(self, files):
        '''
        return a vector of predicted values for the set of files specified.
        Assume convention, 0=Benign, 1=Malware.
        '''
        assert self.model is not None

        # now extract features from file, hash them and use self.model to return predictions

        start_time = time()

        completed_files = 0
        feature_dictionary_list = []
        print('Starting feature extraction')

        for _file in files:
            feature_dictionary_list.append(get_frequency_map(_file))
            completed_files += 1
            print('Completed extracting features from ' +
                  str(completed_files) + ' files',
                  end='\r')

        print('')

        end_time = time()

        print('Feature extraction completed in ' + str(end_time - start_time) +
              ' seconds')

        print('Starting testing')

        start_time = time()

        features = 7000
        hasher = FeatureHasher(n_features=features)
        features_x = hasher.transform(feature_dictionary_list).toarray()
        y = self.model.predict(features_x)

        end_time = time()

        print('Testing completed in ' + str(end_time - start_time) +
              ' seconds')

        f = lambda x: 1 if x > 0 else 0

        def transform(x):
            return np.fromiter((f(a) for a in x), x.dtype)

        return transform(y)
Code example #14
    def train(self, files, labels, save=None):
        '''
        Train the model on files whose file paths have been specified as
        a list. Save the trained model parameters in default location, or if
        specified at a custom location.

        Labels need to be multiclass
        '''

        if not save:
            save = self.model_filename

        assert len(files) == len(labels)

        feature_dictionary_list = []

        print('Starting feature extraction')
        start_time = time()
        completed_files = 0

        for _file in files:
            feature_dictionary_list.append(get_frequency_map(_file))
            completed_files += 1
            print('Completed extracting features from ' +
                  str(completed_files) + ' files',
                  end='\r')

        print('')
        end_time = time()
        print('Feature extraction completed in ' + str(end_time - start_time) +
              ' seconds')

        print('Starting training model')
        start_time = time()

        features = 7000
        hasher = FeatureHasher(n_features=features)
        features_x = hasher.transform(feature_dictionary_list).toarray()
        features_y = np.array(labels)
        clf = RandomForestClassifier()
        clf.fit(features_x, features_y)

        end_time = time()
        print('Training completed in ' + str(end_time - start_time) +
              ' seconds')

        pickle.dump(clf, open(save, 'wb'))
        if save == self.model_filename:
            self.model = clf
Code example #15
def test_feature_hasher_pairs():
    raw_X = (iter(d.items()) for d in [{
        "foo": 1,
        "bar": 2
    }, {
        "baz": 3,
        "quux": 4,
        "foo": -1
    }])
    h = FeatureHasher(n_features=16, input_type="pair")
    x1, x2 = h.transform(raw_X).toarray()
    x1_nz = sorted(np.abs(x1[x1 != 0]))
    x2_nz = sorted(np.abs(x2[x2 != 0]))
    assert [1, 2] == x1_nz
    assert [1, 3, 4] == x2_nz
Code example #16
def hashFeatures(feature_dict, features_vol=20):

    feature_array = []

    for i in range(features_vol):
        h = FeatureHasher(n_features=(i + 1))
        f = h.transform(feature_dict)

        hashed_features = f.toarray()
        hashed_features = pd.DataFrame(hashed_features)

        feature_array.append(hashed_features)
        print("\tWorking on feature " + str(i) + " of " + str(features_vol))

    return feature_array
Code example #17
def transform_data(ip, cpf, qtd_access_ip_last_days, qtd_access_cpf_last_days):
    groups = ip.split(".")
    # zero-pad each IP octet to three digits so all addresses hash consistently
    equalize_group_length = "".join(group.zfill(3) for group in groups)
    h = FeatureHasher(n_features=10, input_type='string')
    
    ip = h.transform([equalize_group_length]).toarray()[0]
    cpf = h.transform([cpf]).toarray()[0]
    
    data = np.concatenate((ip, cpf))
    data = list(data)
    
    data.append(qtd_access_ip_last_days)
    data.append(qtd_access_cpf_last_days)
    
    return data
Code example #18
def get_hashed_matrix_and_hasher(data, column_name, num_features):
    """
    1. If not str, convert to string
    2. Then will hash data and return matrix
    3. Also returns the fitted hash
    """

    data = make_column_hashable(data, column_name)

    hasher = FeatureHasher(n_features=num_features,
                           non_negative=False,
                           input_type='string')
    matrix = hasher.fit_transform(data[column_name])

    return matrix, hasher
Code example #19
File: preprocessor.py Project: ryanbekabe/Invincea
def pe_import(pe):
    if hasattr(pe, 'DIRECTORY_ENTRY_IMPORT'):
        # collect the name of every imported library (not just the last one)
        import_info = []
        for entry in pe.DIRECTORY_ENTRY_IMPORT:
            if isinstance(entry.dll, bytes):
                libname = entry.dll.decode().lower()
            else:
                libname = entry.dll.lower()
            import_info.append(libname)
        libraries_hashed = FeatureHasher(256, input_type="string").transform(
            [import_info]).toarray()[0]
        total = libraries_hashed.sum()
        return libraries_hashed / total
    else:
        return np.zeros(256, dtype=np.float64)
Code example #20
    def _fe_category_feature_hashing(self, X, column_name, n_features):
        """
		Description
		:param name: description
		:return: Description
		"""
        fh = FeatureHasher(n_features=n_features, input_type='string')
        x_features_arr = fh.fit_transform(
            X[column_name].astype('str')).toarray()
        column_names = np.array([])
        for i in range(n_features):
            column_names = np.append(column_names,
                                     column_name + '_' + str(i + 1))
        return pd.concat(
            [X, pd.DataFrame(x_features_arr, columns=column_names)], axis=1)
Code example #21
    def get_section_info(self):
        section = self.report["section"]
        sections = section["sections"]
        vector = [
            len(sections),  # total number of sections
            # number of sections with zero size
            sum(1 for s in sections if s["size"] == 0),
            # number of sections with an empty name
            sum(1 for s in sections if s["name"] == ""),
            # number of readable and executable (RX) sections
            sum(1 for s in sections if 'MEM_READ' in s['props'] and 'MEM_EXECUTE' in s['props']),
            # number of writable (W) sections
            sum(1 for s in sections if 'MEM_WRITE' in s['props'])
        ]

        section_sizes = [(s['name'], s['size']) for s in sections] 
        vector.extend(FeatureHasher(50, input_type="pair").transform([section_sizes]).toarray()[0]) # section_sizes_hashed
        section_entropy = [(s['name'], s['entropy']) for s in sections]
        vector.extend(FeatureHasher(50, input_type="pair").transform([section_entropy]).toarray()[0]) # section_entropy_hashed
        section_vsize = [(s['name'], s['vsize']) for s in sections]
        vector.extend(FeatureHasher(50, input_type="pair").transform([section_vsize]).toarray()[0]) # section_vsize_hashed
        vector.extend(FeatureHasher(50, input_type="string").transform([section['entry']]).toarray()[0]) # entry_name_hashed
        characteristics = [p for s in sections for p in s['props'] if s['name'] == section['entry']]
        vector.extend(FeatureHasher(50, input_type="string").transform([characteristics]).toarray()[0]) # characteristics_hashed
        return vector
Code example #22
    def __init__(self,
                 gamma=0.9,
                 lr=1e-3,
                 state_size=1000,
                 action_size=1000,
                 hidden_size=200):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.gamma = gamma
        self.lr = lr

        self.state_size = state_size
        self.action_size = action_size
        self.hidden_size = hidden_size

        self.state_hasher = FeatureHasher(n_features=self.state_size)
        self.action_hasher = FeatureHasher(n_features=self.action_size)
        self.value_net = ACValueNet(self.state_size, self.hidden_size)
        self.action_net = ACActionNet(self.action_size, self.hidden_size,
                                      self.hidden_size)

        params = (list(self.value_net.parameters()) +
                  list(self.action_net.parameters()))
        self.optimizer = torch.optim.Adam(params, lr=self.lr)
Code example #23
 def _vectorize_union(self, words):
     #TODO
     strs = 'classifier/train_data/' + self.identify + '_' + self.type + '_tfidf.txt'
     #strs = 'classifier/train_data/muqin_VotingClassifier_tfidf.txt'
     print(strs)
     if self.test:
         tv = pickle.load(open(strs, 'rb'))
         vocabulary = tv.vocabulary_
         strs = None
     else:
         vocabulary = None
     pipeline = Pipeline([
         ('SentenceDep', SentenceDepExtractor()),
         (
             'union',
             FeatureUnion(
                 transformer_list=[
                     ('line',
                      Pipeline([
                          ('selector', ItemSelector(key='line')),
                          ('dict', DictVectorizer()),
                      ])),
                     ('dep',
                      Pipeline([
                          ('selector', ItemSelector(key='dep')),
                          ('tfidf',
                           FeatureHasher(n_features=2**7,
                                         input_type='dict')),
                      ])),
                     ('sentence',
                      Pipeline([
                          ('selector', ItemSelector(key='sentence')),
                          ('tfidf',
                           SaveTfidfVectorizer(strs,
                                               vocabulary=vocabulary,
                                               stop_words=self.stop)),
                          ('best', TruncatedSVD(n_components=100)),
                      ])),
                 ],
                 # weight components in FeatureUnion
                 transformer_weights={
                     'line': 0.4,
                     'sentence': 0.1,
                     'dep': 0.5,
                 },
             )),
     ])
     return pipeline.fit_transform(words)
Code example #24
def test():

    from sklearn.feature_extraction import FeatureHasher
    from sklearn.ensemble import RandomForestClassifier
    from sklearn import metrics

    feat = 7000
    h = FeatureHasher(n_features=feat)

    start_time = time()

    TX = h.transform(pickle.load(open(test_feature_list_filename,
                                      'rb'))).toarray()
    Ty = np.array(pickle.load(open(test_predict_filename, 'rb')))

    clf = pickle.load(open('model2_parameters.sav', 'rb'))

    prediction_values = clf.predict(TX)

    f = lambda x: 1 if x > 0 else 0

    def fromiter(x):
        return np.fromiter((f(xi) for xi in x), x.dtype)

    prediction_values = fromiter(prediction_values)
    Ty = fromiter(Ty)

    print("features:", feat)
    print("accuracy:", metrics.accuracy_score(prediction_values, Ty))
    print("f1 score:", metrics.f1_score(prediction_values, Ty,
                                        average='micro'))
    print("precision score:",
          metrics.precision_score(prediction_values, Ty, average='micro'))
    print("recall score:",
          metrics.recall_score(prediction_values, Ty, average='micro'))
    print("f1 score (macro):",
          metrics.f1_score(prediction_values, Ty, average='macro'))
    print("precision score (macro):",
          metrics.precision_score(prediction_values, Ty, average='macro'))
    print("recall score (macro):",
          metrics.recall_score(prediction_values, Ty, average='macro'))

    print("prediction is", prediction_values.tolist())
    print("y is", Ty.tolist())

    end_time = time()

    print('Testing complete in ' + str(end_time - start_time) + ' seconds')
Code example #25
File: test_featureset.py Project: monkidea/skll
def featureset_creation_from_dataframe_helper(with_labels, use_feature_hasher):
    """
    Helper function for the two unit tests for FeatureSet.from_data_frame().
    Since labels are optional, run two tests, one with, one without.
    """
    import pandas

    # First, setup the test data.
    # get a 100 instances with 4 features each
    X, y = make_classification(n_samples=100, n_features=4,
                               n_informative=4, n_redundant=0,
                               n_classes=3, random_state=1234567890)

    # Not using 0 - 100 here because that would be pandas' default index names anyway.
    # So let's make sure pandas is using the ids we supply.
    ids = list(range(100, 200))

    featureset_name = 'test'

    # if use_feature_hasher is True, run these tests with a FeatureHasher vectorizer
    feature_bins = 4
    vectorizer = (FeatureHasher(n_features=feature_bins)
                  if use_feature_hasher else None)

    # convert the features into a list of dictionaries
    feature_names = ['f{}'.format(n) for n in range(1, 5)]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))

    # Now, create a FeatureSet object.
    if with_labels:
        expected = FeatureSet(featureset_name, ids, features=features, labels=y,
                              vectorizer=vectorizer)
    else:
        expected = FeatureSet(featureset_name, ids, features=features,
                              vectorizer=vectorizer)

    # Also create a DataFrame and then create a FeatureSet from it.
    df = pandas.DataFrame(features, index=ids)
    if with_labels:
        df['y'] = y
        current = FeatureSet.from_data_frame(df, featureset_name, labels_column='y',
                                             vectorizer=vectorizer)
    else:
        current = FeatureSet.from_data_frame(df, featureset_name, vectorizer=vectorizer)

    return (expected, current)
Code example #26
File: criteo_ml.py Project: tneller/criteo
def xgb_categorical_hashing(hash_size):
    print('Using Criteo count predictors and {}-hashed categorical features:'.
          format(hash_size))
    df = file_to_dataframe(train_filename)
    y = df[[target_name]]
    x_noncat = df[noncategorical_predictor_names]
    x_cat = df[categorical_predictor_names]
    # DataFrame.dtypes for data must be int, float or bool, so one common approach to categorical data is to
    # prepend the field name to the category string and hash it to an index (e.g. 0 - 999,999), so that each
    # categorical results in a one-hot hashed encoding. Collisions may occur, but with enough indices, the trick
    # works well in practice.

    # This code is based on the "hashing trick" used in a number of CTR prediction competitions and discussed here:
    # https://blog.myyellowroad.com/using-categorical-data-in-machine-learning-with-python-from-dummy-variables-to-deep-category-66041f734512
    x_cat_hash = copy.copy(x_cat)
    for i in range(x_cat_hash.shape[1]):
        x_cat_hash.iloc[:, i] = x_cat_hash.columns[
            i] + ':' + x_cat_hash.iloc[:, i].astype('str')
    h = FeatureHasher(n_features=hash_size, input_type="string")
    x_cat_hash = pd.SparseDataFrame(h.transform(x_cat_hash.values))
    x = x_noncat.to_sparse(fill_value=None).join(x_cat_hash)
    # x = sparse.csr_matrix(x.to_coo())
    num_folds = 10
    early_stop_rounds = 5
    max_rounds = 5000
    # params documentation: https://xgboost.readthedocs.io/en/latest/python/python_api.html
    params = {
        'objective': 'binary:logistic',
        'silent': 1,
        'eval_metric': 'logloss',
        'nthread': num_threads
    }
    xg_train = xgb.DMatrix(
        x,
        label=y,
    )
    print(
        "{}-fold cross validation with logloss metric, early stopping after {} non-decreasing logloss iterations."
        .format(num_folds, early_stop_rounds))
    cv = xgb.cv(params,
                xg_train,
                max_rounds,
                nfold=num_folds,
                early_stopping_rounds=early_stop_rounds,
                verbose_eval=1)
    # Note: cv is a pandas DataFrame with each row representing a round's logloss results. I only print the last.
    # The test-logloss-mean is the main measure of interest for our comparison.
    print(cv[-1:])
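
The comments above describe the field-name-prefixed hashing trick inside a full training pipeline. Below is a small self-contained sketch of just that encoding step; the toy DataFrame and the hash size are illustrative assumptions, not data from the project.

# Self-contained sketch of the "field:value" hashing trick described above.
# The toy columns and n_features value are assumptions, not project data.
import pandas as pd
from sklearn.feature_extraction import FeatureHasher

x_cat = pd.DataFrame({'site': ['a.com', 'b.com'], 'device': ['ios', 'android']})
# Prefix each value with its column name so equal strings in different
# columns hash to different buckets.
as_strings = x_cat.apply(lambda col: col.name + ':' + col.astype(str))
h = FeatureHasher(n_features=2**6, input_type='string')
hashed = h.transform(as_strings.values)   # one sparse row per input row
print(hashed.shape)                       # (2, 64)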
Code example #27
def hashing_encoding(df, cols, data_percent=0.85, verbose=False):
    for i in cols:
        val_counts = df[i].value_counts(dropna=False)
        s = sum(val_counts.values)
        h = val_counts.values / s
        c_sum = np.cumsum(h)
        c_sum = pd.Series(c_sum)
        n = c_sum[c_sum > data_percent].index[0]
        if verbose:
            print("n hashing para ", i, ":", n)
        if n > 0:
            fh = FeatureHasher(n_features=n, input_type='string')
            hashed_features = fh.fit_transform(
                df[i].astype(str).values.reshape(-1, 1)).todense()
            df = df.join(pd.DataFrame(hashed_features).add_prefix(i + '_'))
    return df.drop(columns=cols)
Code example #28
    def build_x_vectors(self, ent_couple_objects):
        '''
        :param ent_couple_objects: list of tuples (sen_id, ent1 name, ent2 name, x)
        :return: (list of tuples (sen_id, ent1 name, ent2 name, hashed x), hashed feature matrix)
        '''
        if not self.feature_hasher:
            self.feature_hasher = FeatureHasher(n_features=len(
                self.features_set),
                                                input_type='string')

        x_data = self.feature_hasher.transform(
            [t[3] for t in ent_couple_objects])
        converted_ent_objects = [(t[0], t[1], t[2], x_data[i])
                                 for i, t in enumerate(ent_couple_objects)]
        return converted_ent_objects, x_data
Code example #29
    def hashNgram(self, listOfSentences, n, numberOfFeatures, finNgram=None):
        hasher = FeatureHasher(n_features=numberOfFeatures)

        def sentToNgram(listOfSentences):
            for sent in listOfSentences:
                sentDic = {}
                sentNgrams = Counter(ngrams(sent, n))
                for ngramElement in sentNgrams:
                    if finNgram:
                        if ngramElement in finNgram:
                            sentDic[str(ngramElement)] = sentNgrams[ngramElement]
                    else:
                        sentDic[str(ngramElement)] = sentNgrams[ngramElement]
                yield sentDic

        return hasher.transform(sentToNgram(listOfSentences)).tolil()
Code example #30
    def _hash(self, data_column, num_hash_features):
        """Convert a categorical feature to numerical representation using hashing.

        Args:
          data_column: a pandas series representing a DataFrame column
          num_hash_features: the number of hashing features

        Returns:
          hash_columns: a pandas DataFrame of the hashed columns
        """
        hasher = FeatureHasher(n_features=num_hash_features,
                               input_type='string')
        data_column = data_column.fillna('null')
        hashed_matrix = hasher.transform(data_column).toarray()
        hash_columns = pd.DataFrame(hashed_matrix)
        return hash_columns