# --- Embedding features for the train/test split, then classifier setup ---

# DLL token lists -> embedding features (vectorizer fitted on train only).
dlls_vectorizer.fit(dlls_train)
dlls_train = dlls_vectorizer.transform(dlls_train)
dlls_test = dlls_vectorizer.transform(dlls_test)

# Load the pre-trained symbols Word2Vec model and build a word -> vector map.
# NOTE(review): `index2word`/`syn0` are the pre-gensim-4 attribute names;
# gensim >= 4 exposes them via `model.wv` — confirm the installed version.
symbols_model = gensim.models.Word2Vec.load('w2v/symbols.100.5.model')
symbols_w2v = dict(zip(symbols_model.index2word, symbols_model.syn0))
#symbols_vectorizer = MeanEmbeddingVectorizer(symbols_w2v)
symbols_vectorizer = TfidfEmbeddingVectorizer(symbols_w2v)
symbols_vectorizer.fit(symbols_train)
symbols_train = symbols_vectorizer.transform(symbols_train)
symbols_test = symbols_vectorizer.transform(symbols_test)

# concatenate numerical and textual features
X_train = concatenate_features(
    features_train, [identifiers_train, dlls_train, symbols_train])
X_test = concatenate_features(
    features_test, [identifiers_test, dlls_test, symbols_test])

# Normalize features: scale to [-1, 1] for the RBF SVM, [0, 1] otherwise.
# The scaler is fitted on the training split only (no test-set leakage).
normalization = (MinMaxScaler(feature_range=(-1, 1))
                 if classifier == "svcrbf"
                 else MinMaxScaler())
normalization.fit(X_train)
X_train = normalization.transform(X_train)
X_test = normalization.transform(X_test)

# use in a classifier
if classifier == "svc":
    # NOTE(review): SVC's `probability` parameter expects a bool; passing
    # THRESHOLD relies on its truthiness — confirm this is intentional.
    clf = SVC(kernel="linear", probability=THRESHOLD)
# --- Export the normalized TF-IDF feature matrix (plus label/md5) to CSV ---

# csv header: numerical feature names, "Time", one column per TF-IDF
# dimension of each textual feature, then the label and md5 columns.
h_tf = []
# NOTE(review): range(0, i_dlls - 1) stops at index i_dlls - 2, so the
# feature at i_dlls - 1 gets no named column — presumably "Time" stands in
# for it; confirm this off-by-one is intentional.
for i in range(0, i_dlls - 1):
    h_tf.append(USED_FEATURES[i])
h_tf.append("Time")
for i in range(1, dlls_tf.shape[1] + 1):
    h_tf.append("dlls" + str(i))
for i in range(1, symbols_tf.shape[1] + 1):
    h_tf.append("symbols" + str(i))
h_tf.append("label")
h_tf.append("md5")

num_features = np.array(num_features)

# concatenate numerical and textual features
X_tf = concatenate_features(num_features, [dlls_tf, symbols_tf])

# normalization to [0, 1] (fitted on the full matrix — this is an export,
# not a train/test experiment, so no split is involved here)
normalization_tfidf = MinMaxScaler()
normalization_tfidf.fit(X_tf)
X_tf = normalization_tfidf.transform(X_tf)

# write features csv
# Fix: context manager guarantees the file is flushed and closed even on
# error (the original never closed it); newline='' is the csv-module
# documented requirement to avoid blank/doubled line endings on Windows.
# NOTE(review): mode "a" appends, so the header row is re-written on every
# run — confirm append (vs. "w") is intended.
with open('mw_features_full_sorted_2.csv', "a", newline='') as csv_ordered:
    c_ordered = csv.writer(csv_ordered)
    c_ordered.writerow(h_tf)
    for i in range(X_tf.shape[0]):
        # Each row: normalized feature values, then label and md5 appended.
        c_ordered.writerow(
            np.concatenate((X_tf[i], [labels[i]], [md5s[i]]), axis=0))
# --- Vectorize the textual features: TF-IDF first, then Word2Vec ---

# Train one TF-IDF vectorizer per textual feature, vocabulary capped at
# N_FEATURES terms; fit_transform == fit followed by transform here.
identifiers_tfidf = TfidfVectorizer(max_features=N_FEATURES)
identifiers_tf = identifiers_tfidf.fit_transform(identifiers)

dlls_tfidf = TfidfVectorizer(max_features=N_FEATURES)
dlls_tf = dlls_tfidf.fit_transform(dlls)

symbols_tfidf = TfidfVectorizer(max_features=N_FEATURES)
symbols_tf = symbols_tfidf.fit_transform(symbols)

# concatenate numerical and textual features
X_tf = concatenate_features(features, [identifiers_tf, dlls_tf, symbols_tf])

# W2V: load pre-trained 100-dim models and turn each document into the
# mean of its word vectors.
# NOTE(review): `index2word`/`syn0` are the pre-gensim-4 attribute names;
# gensim >= 4 exposes them via `model.wv` — confirm the installed version.
identifiers_model = gensim.models.Word2Vec.load('w2v/identifiers.100.5.model')
identifiers_w2v = dict(
    zip(identifiers_model.index2word, identifiers_model.syn0))
identifiers_vectorizer = MeanEmbeddingVectorizer(identifiers_w2v)
identifiers_vectorizer.fit(identifiers)
identifiers_w2v = identifiers_vectorizer.transform(identifiers)

dlls_model = gensim.models.Word2Vec.load('w2v/dlls.100.5.model')
dlls_w2v = dict(zip(dlls_model.index2word, dlls_model.syn0))
dlls_vectorizer = MeanEmbeddingVectorizer(dlls_w2v)
dlls_vectorizer.fit(dlls)
dlls_w2v = dlls_vectorizer.transform(dlls)