def save_feature(self, file_name, save_file_name):
    """Write a per-opcode count table for the four MinGW builds of a file.

    The opcode sequences stored under ``file_name + '_MinGW_O0'`` through
    ``file_name + '_MinGW_O3'`` are fetched from the database, and a CSV
    named ``save_file_name`` is written inside ``self.csv_save_dir_name``.

    CSV layout:
        * header row: ``file_name, O0, O1, O2, O3``
        * one row per opcode returned by ``extract_opcode_variety()``,
          holding how often that opcode occurs in each sequence
        * a final ``Sum`` row holding the length of each sequence

    Args:
        file_name: Base name used to look up the four stored sequences;
            it is also written into the first header cell.
        save_file_name: Name of the CSV file to create.
    """
    db_handler = DatabaseHandler()
    opcode_variety = db_handler.extract_opcode_variety()

    # One stored sequence per build variant, fetched in O0..O3 order.
    variants = ['O0', 'O1', 'O2', 'O3']
    sequences = [
        db_handler.extract_opcode_sequence(
            file_name=file_name + '_MinGW_' + variant)
        for variant in variants
    ]

    target_path = self.csv_save_dir_name + os.sep + save_file_name
    # NOTE(review): binary mode matches the Python 2 csv module's
    # expectations; revisit if this file is ever ported to Python 3.
    with open(target_path, 'wb') as f:
        writer = csv.writer(f)
        writer.writerow([file_name] + variants)
        for opcode in opcode_variety:
            counts = [sequence.count(opcode) for sequence in sequences]
            writer.writerow([opcode] + counts)
        totals = [len(sequence) for sequence in sequences]
        writer.writerow(['Sum'] + totals)
def extract_feature_vector(self, file_id, extraction_method):
    """Build the feature vector of one stored file with the chosen method.

    Args:
        file_id: Identifier handed to the database handler to fetch the
            file's opcode sequence (and, for ``'proposed'``, its
            subroutine and location sequences).
        extraction_method: One of ``'bag-of-opcodes'``, ``'2-gram'``,
            ``'3-gram'`` or ``'proposed'``.

    Returns:
        Whatever ``extract_bag_of_opcodes`` / ``extract_ngram`` return for
        the file's opcode sequence.

    Raises:
        NotImplementedError: For ``'proposed'``, whose feature vector is
            not assembled yet.
        SystemExit: With status 1 when ``extraction_method`` is unknown.
    """
    db_handler = DatabaseHandler()
    # Load the opcode vocabulary lazily, the first time it is needed.
    if self.opcode_variety_ is None:
        self.set_opcode_variety_from_database()

    opcode_sequence = db_handler.extract_opcode_sequence(file_id)
    if extraction_method == 'bag-of-opcodes':
        feature_vector = self.extract_bag_of_opcodes(opcode_sequence)
    elif extraction_method == '2-gram':
        feature_vector = self.extract_ngram(opcode_sequence, 2)
    elif extraction_method == '3-gram':
        feature_vector = self.extract_ngram(opcode_sequence, 3)
    elif extraction_method == 'proposed':
        subroutine_sequence = db_handler.extract_subroutine_sequence(file_id)
        average_subroutine_length = self.extract_average_subroutine_length(subroutine_sequence)
        location_sequence = db_handler.extract_location_sequence(file_id)
        average_basicblock_length = self.extract_average_basicblock_length(location_sequence)
        # TODO: construct feature_vector here from the two averages above.
        # Until that is done, fail explicitly; previously this branch fell
        # through to the return and died with an UnboundLocalError.
        raise NotImplementedError(
            'extraction method "proposed" does not build a feature vector yet')
    else:
        sys.stderr.write('Error: no extraction method "' + extraction_method + '" found.\n')
        # Non-zero status so calling scripts can tell the run failed.
        sys.exit(1)
    return feature_vector
if __name__ == '__main__':
    # Ad-hoc exploration: plot what follows the `cmp` opcode in one stored
    # sample (500 is presumably a file id -- verify against the database).
    db_handler = DatabaseHandler()
    opcode_sequence = db_handler.extract_opcode_sequence(500)

    # Bigram variant of the same experiment, kept for reference:
    # bigrams = nltk.bigrams(opcode_sequence)
    # fd = nltk.FreqDist(bigrams)
    # cfd = nltk.ConditionalFreqDist(bigrams)
    # cfd[u'cmp'].plot(50)

    # Materialise the trigrams once: newer NLTK versions return a generator,
    # which printing would exhaust before the distribution is built.
    trigrams = list(nltk.trigrams(opcode_sequence))
    print(trigrams)
    # fd = nltk.FreqDist(trigrams)

    # ConditionalFreqDist consumes (condition, sample) pairs, so each triple
    # is split into its first opcode (the condition looked up below) and the
    # pair of opcodes that follow it.
    cfd = nltk.ConditionalFreqDist(
        (first, (second, third)) for first, second, third in trigrams)
    cfd[u'cmp'].plot(50)