def cyber_object_miner(file_name):
    """Run AllenNLP SRL over each sentence in a file and collect cyber objects and action verbs."""
    sents = readfile(file_name)
    cyber_objects_words = list()
    cyber_objects_arg1 = list()
    action_verbs = list()
    for sentence in sents:
        extraction_list = list()
        try:
            predict = configuration.model_AllenNLP_SRL.predict(sentence=sentence)
            if len(predict['verbs']) == 0:
                # AllenNLP could not extract any predicate; fall back to the raw sentence.
                cyber_objects_words.append(helper.remove_stopwords(sentence))
            else:
                for extractions in predict['verbs']:
                    verb = extractions['verb']
                    action_verbs.append(verb)
                    # Optional filter against the ontology verb list:
                    # if helper.is_dictionay_key(configuration.preprocessOntologies.verb_dict,
                    #                            configuration.stemmer.stem(verb)):
                    description = extractions['description']
                    single_extraction_dict = AllenNLP.process_single_AllenNLP_description(description)
                    extraction_list.append(single_extraction_dict)

                # Post-process all extractions for the sentence
                extraction_list = AllenNLP.analyze_whole_sentence(extraction_list)
                extraction_list = AllenNLP.delete_extra_args(extraction_list)
                for single_extraction_dict in extraction_list:
                    try:
                        cyber_objects_arg1.append(
                            helper.remove_stopwords(single_extraction_dict['where']))
                    except KeyError:
                        # ARG1 ('where') not present in this extraction
                        pass
        except Exception:
            # Skip sentences the SRL model cannot process
            pass
    return cyber_objects_arg1, cyber_objects_words, action_verbs
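# Hypothetical usage sketch (the file name is illustrative, not from the original project):
# mine candidate cyber objects and action verbs from a plain-text threat report.
arg1_objects, fallback_words, verbs = cyber_object_miner('reports/sample_threat_report.txt')
print(len(arg1_objects), 'ARG1 objects,', len(fallback_words), 'fallback sentences,', len(verbs), 'verbs')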
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam

# helper and MODEL are project-local modules, imported elsewhere in the project.


def main():
    # Hyper-parameters
    input_size = 10000
    embedding_size = 24
    output_size = 5
    learning_rate = 0.01
    oov_token = '<OOV>'
    loss = 'sparse_categorical_crossentropy'
    optimizer = Adam(learning_rate=learning_rate)
    epochs = 1
    train_val_split = 0.2

    # Load, clean, tokenize, and pad the data
    sentences, sentiments = helper.get_data('data/train.tsv')
    sentences = helper.remove_stopwords(sentences, 'data/stopwords')
    max_length = len(max(sentences, key=len))
    tokenizer = helper.get_tokenizer(input_size, oov_token, sentences)
    padded_sentences = helper.convert_to_sequences(tokenizer, sentences, max_length)

    train_padded_sentences, validation_padded_sentences, train_sentiments, validation_sentiments = \
        train_test_split(
            padded_sentences,
            sentiments,
            test_size=train_val_split,
            random_state=42
        )

    train_padded_sentences = np.array(train_padded_sentences)
    train_sentiments = np.array(train_sentiments)
    validation_padded_sentences = np.array(validation_padded_sentences)
    validation_sentiments = np.array(validation_sentiments)

    layers = [
        tf.keras.layers.Embedding(input_size, embedding_size, input_length=max_length),
        # Alternative layers experimented with:
        # tf.keras.layers.LSTM(32),
        # tf.keras.layers.Conv1D(filters=64, kernel_size=5, activation='relu'),
        # tf.keras.layers.MaxPooling1D(pool_size=4),
        # tf.keras.layers.Dropout(0.2),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(units=24, activation='relu'),
        tf.keras.layers.Dense(units=output_size, activation='softmax')
    ]

    model = MODEL(input_size, output_size, layers, loss, optimizer, epochs)
    model.__train__(train_padded_sentences, train_sentiments,
                    validation_padded_sentences, validation_sentiments)
    model.__plot_graph__('accuracy')
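# A minimal sketch of the MODEL wrapper that main() assumes; the constructor arguments and the
# __train__ / __plot_graph__ method names follow the calls above, but the implementation itself is
# an assumption (the real class lives elsewhere in the project).
import matplotlib.pyplot as plt
import tensorflow as tf


class MODEL:
    def __init__(self, input_size, output_size, layers, loss, optimizer, epochs):
        self.input_size = input_size
        self.output_size = output_size
        self.epochs = epochs
        self.history = None
        self.model = tf.keras.Sequential(layers)
        self.model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

    def __train__(self, x_train, y_train, x_val, y_val):
        self.history = self.model.fit(
            x_train, y_train,
            epochs=self.epochs,
            validation_data=(x_val, y_val))

    def __plot_graph__(self, metric):
        # Plot training vs. validation curves for the requested metric (e.g. 'accuracy')
        plt.plot(self.history.history[metric], label=metric)
        plt.plot(self.history.history['val_' + metric], label='val_' + metric)
        plt.xlabel('epoch')
        plt.legend()
        plt.show()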
def combine_cyber_object(files=cyber_object_files):
    """Merge cyber-object lists from multiple files, deduplicate by stem/lemma, and write the result."""
    text = ''
    for file in files:
        text += helper.read_file(file) + '\n'
    text = text.split('\n')

    dict_cyber = dict()
    for tt in text:
        # Strip punctuation (keeping '-' and '/') and lowercase
        tt = tt.translate(str.maketrans('', '', '!"#$%&\\()*+,.:;<=>?@[\\]^_`{|}~')).lower()
        key = helper.stem_and_lemmatize(tt, isRemoveStopword=True)
        dict_cyber[key] = helper.remove_stopwords(tt)

    what_list = sorted(set(dict_cyber.values()))
    helper.write_file(output_file, what_list)
    return
def read_action_what(file_name='action_what.txt'):
    """Read the action/what list, normalize each line, deduplicate by stem/lemma, and write the cleaned list."""
    text = helper.read_file(file_name)
    text = text.split('\n')

    dict_cyber = dict()
    for tt in text:
        # Strip punctuation (keeping '-' and '/') and lowercase
        tt = tt.translate(str.maketrans('', '', '!"#$%&\\()*+,.:;<=>?@[\\]^_`{|}~')).lower()
        key = helper.stem_and_lemmatize(tt, isRemoveStopword=True)
        dict_cyber[key] = helper.remove_stopwords(tt)

    what_list = sorted(set(dict_cyber.values()))
    for entry in what_list:
        print(entry)
    print(len(what_list))

    helper.write_file('ontology/temp.txt', what_list)
    return what_list
def analyze_single_extraction(extracted_dictionary):
    """Map a single SRL extraction (PropBank-style arguments) to a compact attack vector."""
    if len(extracted_dictionary) <= 1:
        return False, create_attack_vector()

    args_list = list(extracted_dictionary.keys())
    compact_attack_vector = create_attack_vector()

    # Discard negative sentences
    if 'ARGM-NEG' in args_list:
        compact_attack_vector['what'] = 'ARGM-NEG'
        return False, compact_attack_vector

    # Subject tracking: skip first-person subjects (e.g. report authors writing "we ...")
    if 'ARG0' in args_list and extracted_dictionary['ARG0'].strip() == 'we':
        return False, compact_attack_vector

    # V and ARG1 give the action (what) and its target (where)
    if all(elem in args_list for elem in ['V', 'ARG1']):
        compact_attack_vector['what'] = helper.remove_stopwords(extracted_dictionary['V'])
        if ('itself' in extracted_dictionary['ARG1'].split()
                or 'it' in extracted_dictionary['ARG1'].split()) and 'ARG0' in args_list:
            # Reflexive or pronominal object: resolve it back to the subject
            compact_attack_vector['where'] = helper.remove_stopwords(extracted_dictionary['ARG0'])
        else:
            compact_attack_vector['where'] = helper.remove_stopwords(extracted_dictionary['ARG1'])

    # Purpose --> why
    isWhyFound = True  # True means no purpose argument has been found yet
    if 'ARGM-PRP' in args_list:
        compact_attack_vector['why'] = helper.remove_stopwords(extracted_dictionary['ARGM-PRP'])
        isWhyFound = False

    # Fall back to ARG2 for purpose; full coverage would need PropBank frame lookup
    if 'ARG2' in args_list and isWhyFound:
        if 'to' in extracted_dictionary['ARG2'].split():
            compact_attack_vector['why'] = helper.remove_stopwords(extracted_dictionary['ARG2'])

    # Manner --> how
    if 'ARGM-MNR' in args_list:
        compact_attack_vector['how'] = helper.remove_stopwords(extracted_dictionary['ARGM-MNR'])

    # Temporal --> when
    if 'ARGM-TMP' in args_list:
        compact_attack_vector['when'] = extracted_dictionary['ARGM-TMP']

    return True, compact_attack_vector
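# Illustrative only: a hypothetical SRL extraction for "The malware encrypts user files to demand
# ransom", showing the argument keys analyze_single_extraction() consumes; the exact spans depend
# on the upstream AllenNLP parse and on helper.remove_stopwords.
example_extraction = {
    'ARG0': 'The malware',
    'V': 'encrypts',
    'ARG1': 'user files',
    'ARGM-PRP': 'to demand ransom',
}
ok, attack_vector = analyze_single_extraction(example_extraction)
# Roughly: what -> 'encrypts', where -> 'user files', why -> the purpose clause (after stopword removal)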
def get_important_relations(self, dep_tree, sentence):
    """Collect 5W1H bags of words from the dependency relations of a parsed sentence."""
    extracted_words = dict()
    what_bagofwords = set()
    where_bagofwords = set()
    where_attribute_bagofwords = set()
    how_bagofwords = set()
    why_bagofwords = set()
    when_bagofwords = set()
    subject_bagofwords = set()
    action_bagofwords = set()  # currently left empty; verb+object pairs could be added here

    for node in dep_tree[0]:
        # Direct object and subject carry the action (what) and the actor (subject)
        self.get_relation(node, 'dobj', what_bagofwords, where_bagofwords)
        self.get_relation(node, 'nsubj', what_bagofwords, subject_bagofwords)

        # Modifiers that describe location/attribute and purpose
        self.get_relation(node, 'nmod:on', what_bagofwords, where_attribute_bagofwords)
        self.get_relation(node, 'nmod:in', where_attribute_bagofwords, where_attribute_bagofwords)
        self.get_relation(node, 'advcl:to', what_bagofwords, why_bagofwords)

        # Relations whose dependents describe the target (where)
        self.get_relation(node, 'compound', where_bagofwords, where_bagofwords)
        self.get_relation(node, 'nsubjpass', where_bagofwords, where_bagofwords)
        self.get_relation(node, 'nmod:agent', where_bagofwords, where_bagofwords)
        self.get_relation(node, 'nmod:from', where_bagofwords, where_bagofwords)
        self.get_relation(node, 'nmod:to', where_bagofwords, where_bagofwords)
        self.get_relation(node, 'nmod:with', where_bagofwords, where_bagofwords)
        self.get_relation(node, 'nmod:via', where_bagofwords, where_bagofwords)
        self.get_relation(node, 'nmod:over', where_bagofwords, where_bagofwords)
        self.get_relation(node, 'nmod:for', where_bagofwords, where_bagofwords)
        self.get_relation(node, 'nmod:through', where_bagofwords, where_bagofwords)
        self.get_relation(node, 'nmod:using', where_bagofwords, where_bagofwords)
        self.get_relation(node, 'nmod:into', where_bagofwords, where_bagofwords)

    extracted_words['what'] = helper.remove_stopwords(what_bagofwords)
    extracted_words['where'] = helper.remove_stopwords(where_bagofwords)
    extracted_words['where_attribute'] = helper.remove_stopwords(where_attribute_bagofwords)
    extracted_words['why'] = helper.remove_stopwords(why_bagofwords)
    extracted_words['when'] = helper.remove_stopwords(when_bagofwords)
    extracted_words['how'] = helper.remove_stopwords(how_bagofwords)
    extracted_words['subject'] = helper.remove_stopwords(subject_bagofwords)
    extracted_words['action'] = helper.remove_stopwords(action_bagofwords)
    extracted_words['text'] = sentence
    return extracted_words
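# Illustrative assumption about the dep_tree shape the loop above expects: a list with one entry
# per sentence, each entry a list of (relation, governor, dependent) tuples. The real structure
# depends on the dependency parser wrapper used in this project.
dep_tree_example = [[
    ('nsubj', 'exploits', 'attacker'),
    ('dobj', 'exploits', 'vulnerability'),
    ('nmod:via', 'exploits', 'phishing'),
]]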
# START DATA PREPROCESSING
# Raw-text features: body length (excluding spaces) and punctuation percentage
data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(' '))
data['punct%'] = data['body_text'].apply(lambda x: count_punctuation(x))

# Clean, tokenize, remove stopwords, then stem and lemmatize
data['body_text_clean'] = data['body_text'].apply(lambda x: remove_punctuation(x))
data['body_text_tokenized'] = data['body_text_clean'].apply(lambda x: tokenize(x))

stopwords = nltk.corpus.stopwords.words('english')
data['body_text_nonstop'] = data['body_text_tokenized'].apply(lambda x: remove_stopwords(x, stopwords))

stemmer = nltk.PorterStemmer()
data['body_text_stemmed'] = data['body_text_nonstop'].apply(lambda x: stemming(x, stemmer))

wn = nltk.WordNetLemmatizer()
data['body_text_lemmatized'] = data['body_text_nonstop'].apply(lambda x: lemmatizing(x, wn))
# END DATA PREPROCESSING

# START VECTORIZATION OF DATA
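# A minimal sketch of one way this vectorization step could look, assuming scikit-learn's
# TfidfVectorizer and a default integer index on `data`; combining TF-IDF with the body_len and
# punct% features is an assumption here, not something shown in the original snippet.
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

tfidf_vect = TfidfVectorizer(analyzer=lambda tokens: tokens)  # tokens are already cleaned lists
X_tfidf = tfidf_vect.fit_transform(data['body_text_lemmatized'])

# Optionally concatenate the hand-crafted features with the TF-IDF matrix
X_features = pd.concat(
    [data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())],
    axis=1)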
data = documents.copy()

# Remove email-style tokens containing '@'
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]
# Collapse consecutive whitespace into a single space
data = [re.sub(r'\s+', ' ', sent) for sent in data]
# Remove single-quote characters
data = [re.sub(r"\'", "", sent) for sent in data]

data_words = list(he.sent_to_words(data))

print('Building Bigrams')
# Build the bigram model - the higher the threshold, the fewer the phrases
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

print('Removing Stopwords')
data_words_nostops = he.remove_stopwords(data_words, stop_words)

print('Forming Bigrams')
data_words_bigrams = he.make_bigrams(data_words_nostops, bigram_mod)

print('Lemmatizing Data')
data_lemmatized = he.lemmatization(
    data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# The keep_n parameter controls the size of the vocabulary.
# At this stage we have to experiment manually with different vocabulary sizes to see what works best;
# I found that roughly 8-10% of the number of documents is a good size.
# For Digital India, I used a vocabulary size of 1,000 (12,412 documents);
# for GST, a vocabulary size of 1,500 (about 15,000 documents).
print('Creating Dictionary')
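# A minimal sketch of the dictionary/corpus creation the comments above describe, using gensim's
# standard API; keep_n=1000 mirrors the Digital India example and is illustrative, not prescriptive.
import gensim.corpora as corpora

id2word = corpora.Dictionary(data_lemmatized)
# filter_extremes drops very rare/very common tokens and then caps the vocabulary at keep_n entries
id2word.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)
corpus = [id2word.doc2bow(text) for text in data_lemmatized]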