def process_raw_features(self, raw_obj):
    sections = raw_obj["sections"]
    general = [
        len(sections),  # total number of sections
        # number of sections with a zero size
        sum(1 for s in sections if s["size"] == 0),
        # number of sections with an empty name
        sum(1 for s in sections if s["name"] == ""),
        # number of readable and executable (RX) sections
        sum(1 for s in sections if "MEM_READ" in s["props"] and "MEM_EXECUTE" in s["props"]),
        # number of writable (W) sections
        sum(1 for s in sections if "MEM_WRITE" in s["props"])
    ]
    # gross characteristics of each section
    section_sizes = [(s["name"], s["size"]) for s in sections]
    section_sizes_hashed = FeatureHasher(50, input_type="pair").transform(
        [section_sizes]).toarray()[0]
    section_entropy = [(s["name"], s["entropy"]) for s in sections]
    section_entropy_hashed = FeatureHasher(50, input_type="pair").transform(
        [section_entropy]).toarray()[0]
    section_vsize = [(s["name"], s["vsize"]) for s in sections]
    section_vsize_hashed = FeatureHasher(50, input_type="pair").transform(
        [section_vsize]).toarray()[0]
    entry_name_hashed = FeatureHasher(50, input_type="string").transform(
        [raw_obj["entry"]]).toarray()[0]
    characteristics = [
        p for s in sections for p in s["props"] if s["name"] == raw_obj["entry"]
    ]
    characteristics_hashed = FeatureHasher(50, input_type="string").transform(
        [characteristics]).toarray()[0]
    return np.hstack([
        general, section_sizes_hashed, section_entropy_hashed,
        section_vsize_hashed, entry_name_hashed, characteristics_hashed
    ]).astype(np.float32)

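# Hedged usage sketch for process_raw_features above. The raw_obj layout shown here
# (section dicts with name/size/vsize/entropy/props plus an "entry" section name) is an
# assumption inferred from the method body, and `extractor` stands in for an instance of
# whatever class defines the method. The result is 5 general stats plus five 50-bin
# hashes, i.e. a 255-dimensional float32 vector.
raw_obj_example = {
    "entry": ".text",
    "sections": [
        {"name": ".text", "size": 4096, "vsize": 4096, "entropy": 6.2,
         "props": ["MEM_READ", "MEM_EXECUTE"]},
        {"name": ".data", "size": 1024, "vsize": 2048, "entropy": 3.1,
         "props": ["MEM_READ", "MEM_WRITE"]},
    ],
}
# vec = extractor.process_raw_features(raw_obj_example)
# assert vec.shape == (255,) and vec.dtype == np.float32
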
def test_model_train_explicit():
    raw_corpus = load_raw_corpus(False)
    sent_corpus = load_sentence_corpus(False)

    features = flatten([[span.span_features() for span in Doc(raw).spans]
                        for raw in raw_corpus])
    y = flatten([[is_eos(span, sent['sentences']) for span in Doc(raw).spans]
                 for raw, sent in zip(raw_corpus, sent_corpus)])

    assert len(features) == len(y)

    pipeline = Pipeline([('hasher', FeatureHasher()), ('pa', PA())])
    pipeline.fit(features, y)

def exports_features(self, lief_binary):
    from sklearn.feature_extraction import FeatureHasher

    exports = sorted(lief_binary.exported_functions)
    features_hashed = {}
    if exports:
        # hash the full list of exported function names into 128 buckets
        hashed = FeatureHasher(128, input_type='string').transform([exports]).toarray()[0]
        for i, x in enumerate(hashed):
            features_hashed.update({f'Exports_functions_hash_{i}': x})
    else:
        for i in range(128):
            features_hashed.update({f'Exports_functions_hash_{i}': 0})
    return features_hashed

def test_hasher_alternate_sign():
    X = [list("Thequickbrownfoxjumped")]

    Xt = FeatureHasher(alternate_sign=True, non_negative=False,
                       input_type='string').fit_transform(X)
    assert Xt.data.min() < 0 and Xt.data.max() > 0

    Xt = FeatureHasher(alternate_sign=True, non_negative=True,
                       input_type='string').fit_transform(X)
    assert Xt.data.min() > 0

    Xt = FeatureHasher(alternate_sign=False, non_negative=True,
                       input_type='string').fit_transform(X)
    assert Xt.data.min() > 0

    Xt_2 = FeatureHasher(alternate_sign=False, non_negative=False,
                         input_type='string').fit_transform(X)
    # With initially positive features, the non_negative option should
    # have no impact when alternate_sign=False
    assert_array_equal(Xt.data, Xt_2.data)

def generate_X(df: pd.core.frame.DataFrame,
               feature_names: Tuple[str],
               n_features: int = 2**20,
               add_bias: bool = True) -> csr_matrix:
    D = df.filter(items=feature_names).to_dict(orient='records')
    for d in D:
        # split the `usertag` string, e.g.
        # {'usertag': '10059,10052,10063'}
        # becomes
        # {'usertag=10059': 1, 'usertag=10052': 1, 'usertag=10063': 1}
        if 'usertag' in d:
            for usertag in d['usertag'].split(','):
                d['usertag={}'.format(usertag)] = 1
            # delete the original `usertag`
            del d['usertag']

    if add_bias is True:
        X = FeatureHasher(n_features=n_features - 1).transform(D)
        X = utils.add_bias(X)
    else:
        X = FeatureHasher(n_features=n_features).transform(D)
    del D
    return X

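# Hedged usage sketch for generate_X above; the column names and tag values are invented,
# and add_bias=False sidesteps the project-specific utils.add_bias helper.
df_example = pd.DataFrame({
    "region": ["1", "2"],
    "usertag": ["10059,10052,10063", "10063"],
})
X_example = generate_X(df_example, feature_names=("region", "usertag"),
                       n_features=2**10, add_bias=False)
# X_example is a scipy.sparse matrix of shape (2, 1024); each usertag becomes its own
# hashed indicator, e.g. 'usertag=10059'.
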
def test_hasher_invalid_input():
    raw_X = [[], (), iter(range(0))]

    feature_hasher = FeatureHasher(input_type="gobbledygook")
    with pytest.raises(ValueError):
        feature_hasher.transform(raw_X)
    feature_hasher = FeatureHasher(n_features=-1)
    with pytest.raises(ValueError):
        feature_hasher.transform(raw_X)
    feature_hasher = FeatureHasher(n_features=0)
    with pytest.raises(ValueError):
        feature_hasher.transform(raw_X)
    feature_hasher = FeatureHasher(n_features="ham")
    with pytest.raises(TypeError):
        feature_hasher.transform(raw_X)

    feature_hasher = FeatureHasher(n_features=np.uint16(2 ** 6))
    with pytest.raises(ValueError):
        feature_hasher.transform([])
    with pytest.raises(Exception):
        feature_hasher.transform([[5.5]])
    with pytest.raises(Exception):
        feature_hasher.transform([[None]])

def predict(text):
    clf = getModel()
    featuresValues = process(text)
    vector = FeatureHasher(n_features=6).transform([featuresValues]).toarray()
    result = clf.predict(vector)
    fake_result = sum([value > 0 for value in featuresValues.values()])

    # tweak for correct Boolean json serialization
    if result[0] and sum(featuresValues.values()):
        jsonRes = True
    else:
        jsonRes = False

    return {'result': jsonRes, 'values': featuresValues, 'fake_result': fake_result}

def initialize(self):
    if self.model_class == 'scikit':
        self.model = SGDRegressor(loss='squared_loss', alpha=0.1, n_iter=10,
                                  shuffle=True, eta0=0.0001)
        self.feature_constructor = FeatureHasher(n_features=200, dtype=np.float64,
                                                 non_negative=False, input_type='dict')
    elif self.model_class == 'lookup':
        self.model = {}

def load_conll(f, features, n_features=(2**16), split=False):
    """Load CoNLL file, extract features on the tokens and vectorize them.

    The CoNLL file format is a line-oriented text format that describes
    sequences in a space-separated format, separating the sequences with
    blank lines. Typically, the last space-separated part is a label.

    Since the space-separated parts are usually tokens (and maybe things like
    part-of-speech tags) rather than feature vectors, a function must be
    supplied that does the actual feature extraction. This function has access
    to the entire sequence, so that it can extract context features.

    A ``sklearn.feature_extraction.FeatureHasher`` (the "hashing trick") is
    used to map symbolic input feature names to columns, so this function does
    not remember the actual input feature names.

    Parameters
    ----------
    f : {string, file-like}
        Input file.
    features : callable
        Feature extraction function. Must take a list of tokens l that
        represent a single sequence and an index i into this list, and must
        return an iterator over strings that represent the features of l[i].
    n_features : integer, optional
        Number of columns in the output.
    split : boolean, default=False
        Whether to split lines on whitespace beyond what is needed to parse
        out the labels. This is useful for CoNLL files that have extra columns
        containing information like part of speech tags.

    Returns
    -------
    X : scipy.sparse matrix, shape (n_samples, n_features)
        Samples (feature vectors), as a single sparse matrix.
    y : np.ndarray, dtype np.string, shape n_samples
        Per-sample labels.
    lengths : np.ndarray, dtype np.int32, shape n_sequences
        Lengths of sequences within (X, y). The sum of these is equal to
        n_samples.
    """
    fh = FeatureHasher(n_features=n_features, input_type="string")
    labels = []
    lengths = []

    with _open(f) as f:
        raw_X = _conll_sequences(f, features, labels, lengths, split)
        X = fh.transform(raw_X)

    return X, np.asarray(labels), np.asarray(lengths, dtype=np.int32)

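# A hedged sketch of the `features` callable load_conll expects: it receives the whole
# token sequence plus an index and yields string feature names for that position.
# The feature names and the file path below are illustrative, not taken from any project.
def token_features(tokens, i):
    token = tokens[i]
    yield "word=" + token
    yield "word.lower=" + token.lower()
    yield "word.isupper=" + str(token.isupper())
    if i > 0:
        yield "prev_word=" + tokens[i - 1]
    if i + 1 < len(tokens):
        yield "next_word=" + tokens[i + 1]

# X, y, lengths = load_conll("train.conll", token_features)
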
def transform(self, df):
    print('encode ...')
    single_space = type(self.hash_space) == int
    if single_space:
        result = np.zeros((df.shape[0], self.hash_space))
        hash_space = [self.hash_space for _ in self.columns]
    else:
        result = []
        hash_space = self.hash_space

    total = len(self.columns) + len(self.array_columns)

    for idx, hash_column in enumerate(zip(self.columns, hash_space)):
        column, n_features = hash_column
        from sklearn.feature_extraction import FeatureHasher
        h = FeatureHasher(n_features=n_features, input_type='string',
                          alternate_sign=self.alternate_sign)
        salt = str(hash(column))
        # FeatureHasher requires vectors
        f = h.transform(df[column].astype(str).apply(lambda x: [x + salt]))
        if single_space:
            result = result + f.toarray()
        else:
            result.append(f.toarray())

    for idx, hash_column in enumerate(zip(self.array_columns, hash_space)):
        column, n_features = hash_column
        h = FeatureHasher(n_features=n_features, input_type='string',
                          alternate_sign=self.alternate_sign)
        f = h.transform(df[column])
        if single_space:
            result = result + f.toarray()
        else:
            result.append(f.toarray())

    if single_space:
        return result
    # if self.sparse:
    #     return hstack(result)
    return np.concatenate(result, axis=1)

def initialize(self):
    if self.model_class == 'scikit':
        self.model = SGDRegressor(loss='squared_loss', alpha=0.1, n_iter=10,
                                  shuffle=True, eta0=0.0001)
        self.feature_constructor = FeatureHasher(n_features=200, dtype=np.float64,
                                                 non_negative=False, input_type='dict')

    elif self.model_class == 'lookup':
        self.model = {}

    # This thing crawls; too much Python overhead for subprocess and pipe
    elif self.model_class == 'vw':
        self.model = None
        self.model_path = self.base_folder_name + "/model.vw"
        self.cache_path = self.base_folder_name + "/temp.cache"
        self.f1 = open(self.base_folder_name + "/train.vw", 'a')
        self.train_vw_cmd = [
            '/usr/local/bin/vw', '--save_resume', '--holdout_off', '-c',
            '--cache_file', self.cache_path, '-f', self.model_path,
            '--passes', '20', '--loss_function', 'squared'
        ]
        self.train_vw_resume_cmd = [
            '/usr/local/bin/vw', '--save_resume',
            '-i', self.model_path, '-f', self.model_path
        ]
        # self.remove_vw_files()

    elif self.model_class == 'vw_python':
        # TODO interactions, lrq, dropout etc commands go here
        # TODO Need to pass model path and throw finish somewhere to store the final model
        self.model_path = self.base_folder_name + "/model.vw"
        self.cache_path = self.base_folder_name + "/temp.cache"
        # self.f1 = open(self.base_folder_name + "/train.vw", 'a')
        self.model = pyvw.vw(quiet=True, l2=0.00000001, loss_function='squared',
                             passes=1, holdout_off=True, cache=self.cache_path,
                             f=self.model_path, lrq='sdsd7', lrqdropout=True)

def clasificador():
    data = read_all_documents('./training')
    documents = data['docs']
    labels = data['labels']

    vectorizer = DictVectorizer()
    vectorizer.fit_transform(tokens_frequency(d) for d in documents)
    vectorizer.get_feature_names()

    # Sparse matrices
    hasher = FeatureHasher(n_features=2**8, input_type="string")
    X = hasher.transform(tokens(d) for d in documents)

    # Train a text classifier using K-Means clustering
    clf = joblib.load('modelo_entrenado.pkl')  # Load the trained model.

    prepositions = [
        'a', 'ante', 'bajo', 'cabe', 'con', 'contra', 'de', 'desde', 'en',
        'entre', 'hacia', 'hasta', 'para', 'por', 'según', 'sin', 'so',
        'sobre', 'tras'
    ]
    prep_alike = [
        'durante', 'mediante', 'excepto', 'salvo', 'incluso', 'más', 'menos'
    ]
    adverbs = ['no', 'si', 'sí']
    articles = [
        'el', 'la', 'los', 'las', 'un', 'una', 'unos', 'unas', 'este', 'esta',
        'estos', 'estas', 'aquel', 'aquella', 'aquellos', 'aquellas'
    ]
    aux_verbs = [
        'he', 'has', 'ha', 'hemos', 'habéis', 'han', 'había', 'habías',
        'habíamos', 'habíais', 'habían'
    ]

    tfid = TfidfVectorizer(stop_words=prepositions + prep_alike + adverbs +
                           articles + aux_verbs)
    X_train = tfid.fit_transform(documents)
    y_train = labels

    # Predict categories for new articles
    test = read_all_documents('prueba')
    X_test = tfid.transform(test['docs'])
    y_test = test['labels']
    pred = clf.predict(X_test)
    cat = str(pred[0])
    return cat

def predict(self, files):
    '''
    Return a vector of predicted values for the set of files specified.
    Assume convention, 0=Benign, 1=Malware.
    '''
    assert self.model is not None

    # now extract features from the files, hash them and use self.model to return predictions
    start_time = time()
    completed_files = 0
    feature_dictionary_list = []
    print('Starting feature extraction')
    for _file in files:
        feature_dictionary_list.append(get_frequency_map(_file))
        completed_files += 1
        print('Completed extracting features from ' + str(completed_files) + ' files', end='\r')
    print('')
    end_time = time()
    print('Feature extraction completed in ' + str(end_time - start_time) + ' seconds')

    print('Starting testing')
    start_time = time()
    features = 7000
    hasher = FeatureHasher(n_features=features)
    features_x = hasher.transform(feature_dictionary_list).toarray()
    y = self.model.predict(features_x)
    end_time = time()
    print('Testing completed in ' + str(end_time - start_time) + ' seconds')

    f = lambda x: 1 if x > 0 else 0

    def transform(x):
        return np.fromiter((f(a) for a in x), x.dtype)

    return transform(y)

def train(self, files, labels, save=None):
    '''
    Train the model on files whose file paths have been specified as a list.
    Save the trained model parameters in the default location or, if specified,
    at a custom location. Labels need to be multiclass.
    '''
    if not save:
        save = self.model_filename
    assert len(files) == len(labels)

    feature_dictionary_list = []
    print('Starting feature extraction')
    start_time = time()
    completed_files = 0
    for _file in files:
        feature_dictionary_list.append(get_frequency_map(_file))
        completed_files += 1
        print('Completed extracting features from ' + str(completed_files) + ' files', end='\r')
    print('')
    end_time = time()
    print('Feature extraction completed in ' + str(end_time - start_time) + ' seconds')

    print('Starting training model')
    start_time = time()
    features = 7000
    hasher = FeatureHasher(n_features=features)
    features_x = hasher.transform(feature_dictionary_list).toarray()
    features_y = np.array(labels)
    clf = RandomForestClassifier()
    clf.fit(features_x, features_y)
    end_time = time()
    print('Training completed in ' + str(end_time - start_time) + ' seconds')

    pickle.dump(clf, open(save, 'wb'))
    if save == self.model_filename:
        self.model = clf

def test_feature_hasher_pairs():
    raw_X = (iter(d.items()) for d in [{"foo": 1, "bar": 2},
                                       {"baz": 3, "quux": 4, "foo": -1}])
    h = FeatureHasher(n_features=16, input_type="pair")
    x1, x2 = h.transform(raw_X).toarray()
    x1_nz = sorted(np.abs(x1[x1 != 0]))
    x2_nz = sorted(np.abs(x2[x2 != 0]))
    assert [1, 2] == x1_nz
    assert [1, 3, 4] == x2_nz

def hashFeatures(feature_dict, features_vol=20):
    feature_array = []
    for i in range(features_vol):
        h = FeatureHasher(n_features=(i + 1))
        f = h.transform(feature_dict)
        hashed_features = f.toarray()
        hashed_features = pd.DataFrame(hashed_features)
        feature_array.append(hashed_features)
        print("\tWorking on feature " + str(i) + " of " + str(features_vol))
    return feature_array

def transform_data(ip, cpf, qtd_access_ip_last_days, qtd_access_cpf_last_days):
    # zero-pad each IP octet to three digits so every address has the same length
    groups = ip.split(".")
    equalize_group_length = "".join(map(lambda group: group.zfill(3), groups))

    h = FeatureHasher(n_features=10, input_type='string')
    ip = h.transform([equalize_group_length]).toarray()[0]
    cpf = h.transform([cpf]).toarray()[0]

    data = np.concatenate((ip, cpf))
    data = list(data)
    data.append(qtd_access_ip_last_days)
    data.append(qtd_access_cpf_last_days)
    return data

def get_hashed_matrix_and_hasher(data, column_name, num_features):
    """
    1. If the column is not str, convert it to string
    2. Hash the column data and return the matrix
    3. Also return the fitted hasher
    """
    data = make_column_hashable(data, column_name)
    hasher = FeatureHasher(n_features=num_features, non_negative=False,
                           input_type='string')
    matrix = hasher.fit_transform(data[column_name])
    return matrix, hasher

def pe_import(pe):
    if hasattr(pe, 'DIRECTORY_ENTRY_IMPORT'):
        # collect the lower-cased names of all imported libraries
        import_info = []
        for entry in pe.DIRECTORY_ENTRY_IMPORT:
            if isinstance(entry.dll, bytes):
                libname = entry.dll.decode().lower()
            else:
                libname = entry.dll.lower()
            import_info.append(libname)
        # hash the library names into 256 buckets and normalize by their sum
        libraries_hashed = FeatureHasher(256, input_type="string").transform(
            [import_info]).toarray()[0]
        total = libraries_hashed.sum()
        return libraries_hashed / total
    else:
        return np.zeros(256, dtype=np.float64)

def _fe_category_feature_hashing(self, X, column_name, n_features):
    """
    Hash the categorical column `column_name` into `n_features` numeric columns
    and append them to X.

    :param X: input DataFrame
    :param column_name: name of the categorical column to hash
    :param n_features: number of hashed output columns
    :return: X with the hashed columns appended
    """
    fh = FeatureHasher(n_features=n_features, input_type='string')
    x_features_arr = fh.fit_transform(X[column_name].astype('str')).toarray()

    column_names = np.array([])
    for i in range(n_features):
        column_names = np.append(column_names, column_name + '_' + str(i + 1))

    return pd.concat(
        [X, pd.DataFrame(x_features_arr, columns=column_names)], axis=1)

def get_section_info(self):
    section = self.report["section"]
    sections = section["sections"]
    vector = [
        len(sections),
        sum(1 for s in sections if s["size"] == 0),
        # number of sections with an empty name
        sum(1 for s in sections if s["name"] == ""),
        # number of RX
        sum(1 for s in sections if 'MEM_READ' in s['props'] and 'MEM_EXECUTE' in s['props']),
        # number of W
        sum(1 for s in sections if 'MEM_WRITE' in s['props'])
    ]

    section_sizes = [(s['name'], s['size']) for s in sections]
    vector.extend(FeatureHasher(50, input_type="pair").transform(
        [section_sizes]).toarray()[0])  # section_sizes_hashed

    section_entropy = [(s['name'], s['entropy']) for s in sections]
    vector.extend(FeatureHasher(50, input_type="pair").transform(
        [section_entropy]).toarray()[0])  # section_entropy_hashed

    section_vsize = [(s['name'], s['vsize']) for s in sections]
    vector.extend(FeatureHasher(50, input_type="pair").transform(
        [section_vsize]).toarray()[0])  # section_vsize_hashed

    vector.extend(FeatureHasher(50, input_type="string").transform(
        [section['entry']]).toarray()[0])  # entry_name_hashed

    characteristics = [p for s in sections for p in s['props'] if s['name'] == section['entry']]
    vector.extend(FeatureHasher(50, input_type="string").transform(
        [characteristics]).toarray()[0])  # characteristics_hashed

    return vector

def __init__(self, gamma=0.9, lr=1e-3, state_size=1000, action_size=1000,
             hidden_size=200):
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.gamma = gamma
    self.lr = lr
    self.state_size = state_size
    self.action_size = action_size
    self.hidden_size = hidden_size

    self.state_hasher = FeatureHasher(n_features=self.state_size)
    self.action_hasher = FeatureHasher(n_features=self.action_size)

    self.value_net = ACValueNet(self.state_size, self.hidden_size)
    self.action_net = ACActionNet(self.action_size, self.hidden_size,
                                  self.hidden_size)

    params = (list(self.value_net.parameters()) +
              list(self.action_net.parameters()))
    self.optimizer = torch.optim.Adam(params, lr=self.lr)

def _vectorize_union(self, words):
    # TODO
    strs = 'classifier/train_data/' + self.identify + '_' + self.type + '_tfidf.txt'
    # strs = 'classifier/train_data/muqin_VotingClassifier_tfidf.txt'
    print(strs)
    if self.test:
        tv = pickle.load(open(strs, 'rb'))
        vocabulary = tv.vocabulary_
        strs = None
    else:
        vocabulary = None

    pipeline = Pipeline([
        ('SentenceDep', SentenceDepExtractor()),
        ('union', FeatureUnion(
            transformer_list=[
                ('line', Pipeline([
                    ('selector', ItemSelector(key='line')),
                    ('dict', DictVectorizer()),
                ])),
                ('dep', Pipeline([
                    ('selector', ItemSelector(key='dep')),
                    ('tfidf', FeatureHasher(n_features=2**7, input_type='dict')),
                ])),
                ('sentence', Pipeline([
                    ('selector', ItemSelector(key='sentence')),
                    ('tfidf', SaveTfidfVectorizer(strs, vocabulary=vocabulary,
                                                  stop_words=self.stop)),
                    ('best', TruncatedSVD(n_components=100)),
                ])),
            ],
            # weight components in FeatureUnion
            transformer_weights={
                'line': 0.4,
                'sentence': 0.1,
                'dep': 0.5,
            },
        )),
    ])
    return pipeline.fit_transform(words)

def test():
    from sklearn.feature_extraction import FeatureHasher
    from sklearn.ensemble import RandomForestClassifier
    from sklearn import metrics

    feat = 7000
    h = FeatureHasher(n_features=feat)
    start_time = time()

    TX = h.transform(pickle.load(open(test_feature_list_filename, 'rb'))).toarray()
    Ty = np.array(pickle.load(open(test_predict_filename, 'rb')))

    clf = pickle.load(open('model2_parameters.sav', 'rb'))
    prediction_values = clf.predict(TX)

    f = lambda x: 1 if x > 0 else 0

    def fromiter(x):
        return np.fromiter((f(xi) for xi in x), x.dtype)

    prediction_values = fromiter(prediction_values)
    Ty = fromiter(Ty)

    print("features:", feat)
    print("accuracy:", metrics.accuracy_score(prediction_values, Ty))
    print("f1 score:", metrics.f1_score(prediction_values, Ty, average='micro'))
    print("precision score:", metrics.precision_score(prediction_values, Ty, average='micro'))
    print("recall score:", metrics.recall_score(prediction_values, Ty, average='micro'))
    print("f1 score (macro):", metrics.f1_score(prediction_values, Ty, average='macro'))
    print("precision score (macro):", metrics.precision_score(prediction_values, Ty, average='macro'))
    print("recall score (macro):", metrics.recall_score(prediction_values, Ty, average='macro'))
    print("prediction is", prediction_values.tolist())
    print("y is", Ty.tolist())

    end_time = time()
    print('Testing complete in ' + str(end_time - start_time) + ' seconds')

def featureset_creation_from_dataframe_helper(with_labels, use_feature_hasher):
    """
    Helper function for the two unit tests for FeatureSet.from_data_frame().
    Since labels are optional, run two tests, one with, one without.
    """
    import pandas

    # First, set up the test data.
    # Get 100 instances with 4 features each.
    X, y = make_classification(n_samples=100, n_features=4, n_informative=4,
                               n_redundant=0, n_classes=3,
                               random_state=1234567890)

    # Not using 0 - 100 here because that would be pandas' default index names anyway.
    # So let's make sure pandas is using the ids we supply.
    ids = list(range(100, 200))

    featureset_name = 'test'

    # If use_feature_hasher, run these tests with a hashing vectorizer.
    feature_bins = 4
    vectorizer = (FeatureHasher(n_features=feature_bins)
                  if use_feature_hasher else None)

    # Convert the features into a list of dictionaries.
    feature_names = ['f{}'.format(n) for n in range(1, 5)]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))

    # Now, create a FeatureSet object.
    if with_labels:
        expected = FeatureSet(featureset_name, ids, features=features,
                              labels=y, vectorizer=vectorizer)
    else:
        expected = FeatureSet(featureset_name, ids, features=features,
                              vectorizer=vectorizer)

    # Also create a DataFrame and then create a FeatureSet from it.
    df = pandas.DataFrame(features, index=ids)
    if with_labels:
        df['y'] = y
        current = FeatureSet.from_data_frame(df, featureset_name,
                                             labels_column='y',
                                             vectorizer=vectorizer)
    else:
        current = FeatureSet.from_data_frame(df, featureset_name,
                                             vectorizer=vectorizer)

    return (expected, current)

def xgb_categorical_hashing(hash_size):
    print('Using Criteo count predictors and {}-hashed categorical features:'.format(hash_size))

    df = file_to_dataframe(train_filename)
    y = df[[target_name]]
    x_noncat = df[noncategorical_predictor_names]
    x_cat = df[categorical_predictor_names]

    # DataFrame.dtypes for data must be int, float or bool, so one common approach to categorical
    # data is to prepend the field name to the category string and hash it to an index
    # (e.g. 0 - 999,999), so that each categorical results in a one-hot hashed encoding.
    # Collisions may occur, but with enough indices, the trick works well in practice.
    # This code is based on the "hashing trick" used in a number of CTR prediction competitions
    # and discussed here:
    # https://blog.myyellowroad.com/using-categorical-data-in-machine-learning-with-python-from-dummy-variables-to-deep-category-66041f734512
    x_cat_hash = copy.copy(x_cat)
    for i in range(x_cat_hash.shape[1]):
        x_cat_hash.iloc[:, i] = x_cat_hash.columns[i] + ':' + x_cat_hash.iloc[:, i].astype('str')
    h = FeatureHasher(n_features=hash_size, input_type="string")
    x_cat_hash = pd.SparseDataFrame(h.transform(x_cat_hash.values))
    x = x_noncat.to_sparse(fill_value=None).join(x_cat_hash)
    # x = sparse.csr_matrix(x.to_coo())

    num_folds = 10
    early_stop_rounds = 5
    max_rounds = 5000

    # params documentation: https://xgboost.readthedocs.io/en/latest/python/python_api.html
    params = {
        'objective': 'binary:logistic',
        'silent': 1,
        'eval_metric': 'logloss',
        'nthread': num_threads
    }

    xg_train = xgb.DMatrix(x, label=y)

    print("{}-fold cross validation with logloss metric, early stopping after {} "
          "non-decreasing logloss iterations.".format(num_folds, early_stop_rounds))
    cv = xgb.cv(params, xg_train, max_rounds, nfold=num_folds,
                early_stopping_rounds=early_stop_rounds, verbose_eval=1)

    # Note: cv is a pandas DataFrame with each row representing a round's logloss results.
    # I only print the last. The test-logloss-mean is the main measure of interest for our comparison.
    print(cv[-1:])

def hashing_encoding(df, cols, data_percent=0.85, verbose=False):
    for i in cols:
        val_counts = df[i].value_counts(dropna=False)
        s = sum(val_counts.values)
        h = val_counts.values / s
        c_sum = np.cumsum(h)
        c_sum = pd.Series(c_sum)
        n = c_sum[c_sum > data_percent].index[0]
        if verbose:
            print("hashing n for", i, ":", n)

        if n > 0:
            fh = FeatureHasher(n_features=n, input_type='string')
            hashed_features = fh.fit_transform(
                df[i].astype(str).values.reshape(-1, 1)).todense()
            df = df.join(pd.DataFrame(hashed_features).add_prefix(i + '_'))
    return df.drop(columns=cols)

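# Hedged usage sketch for hashing_encoding above with a toy frame; the column and values
# are invented. With data_percent=0.85, 'city' gets as many hash columns as it takes for
# the cumulative value frequencies to exceed 85% (here n=2), and the original column is dropped.
df_example = pd.DataFrame({
    "city": ["NY", "NY", "NY", "LA", "LA", "SF"],
    "price": [1, 2, 3, 4, 5, 6],
})
df_encoded = hashing_encoding(df_example, cols=["city"], data_percent=0.85, verbose=True)
# df_encoded keeps 'price' plus the hashed columns 'city_0' and 'city_1'.
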
def build_x_vectors(self, ent_couple_objects):
    '''
    :param ent_couple_objects: tuples of (sen_id, ent1 name, ent2 name, x)
    :return: tuples of (sen_id, ent1 name, ent2 name, x), plus the hashed x matrix
    '''
    if not self.feature_hasher:
        self.feature_hasher = FeatureHasher(n_features=len(self.features_set),
                                            input_type='string')
    x_data = self.feature_hasher.transform([t[3] for t in ent_couple_objects])
    converted_ent_objects = [(t[0], t[1], t[2], x_data[i])
                             for i, t in enumerate(ent_couple_objects)]
    return converted_ent_objects, x_data

def hashNgram(self, listOfSentences, n, numberOfFeatures, finNgram=None):
    hasher = FeatureHasher(n_features=numberOfFeatures)

    def sentToNgram(listOfSentences):
        for sent in listOfSentences:
            sentDic = {}
            sentNgrams = Counter(ngrams(sent, n))
            for ngramElement in sentNgrams:
                if finNgram:
                    if ngramElement in finNgram:
                        sentDic[str(ngramElement)] = sentNgrams[ngramElement]
                else:
                    sentDic[str(ngramElement)] = sentNgrams[ngramElement]
            yield sentDic

    return hasher.transform(sentToNgram(listOfSentences)).tolil()

def _hash(self, data_column, num_hash_features):
    """Convert a categorical feature to numerical representation using hashing.

    Args:
        data_column: a pandas Series representing a DataFrame column
        num_hash_features: the number of hashing features

    Returns:
        hash_columns: a pandas DataFrame of the hashed columns
    """
    hasher = FeatureHasher(n_features=num_hash_features, input_type='string')
    data_column = data_column.fillna('null')
    hashed_matrix = hasher.transform(data_column).toarray()
    hash_columns = pd.DataFrame(hashed_matrix)
    return hash_columns
