def predict(self):
    """Interactively predict on 'input.js' until the user enters an exit keyword.

    Each round: wait for the user to edit the file, extract path contexts,
    run the model, and print name predictions, attention paths, and
    (optionally) the code vector.
    """
    # TODO: MAKE IT BETTER
    source_file = 'input.js'
    print('Starting interactive prediction...')
    while True:
        print(
            'Modify the file: "%s" and press any key when ready, or "q" / "quit" / "exit" to exit' % source_file)
        if input().lower() in self.exit_keywords:
            print('Exiting...')
            return
        try:
            contexts, hash_to_name = self.path_extractor.extract_paths(source_file)
        except ValueError as err:
            # Extraction failed (e.g. unparsable file): report and re-prompt.
            print(err)
            continue
        raw_results = self.model.predict(contexts)
        parsed_results = common.parse_prediction_results(
            raw_results, hash_to_name,
            self.model.vocabs.target_vocab.special_words, topk=SHOW_TOP_CONTEXTS)
        for raw_result, parsed in zip(raw_results, parsed_results):
            print('Original name:\t' + parsed.original_name)
            for pair in parsed.predictions:
                print('\t(%f) predicted: %s' % (pair['probability'], pair['name']))
            print('Attention:')
            for attn in parsed.attention_paths:
                print('%f\tcontext: %s,%s,%s' % (
                    attn['score'], attn['token1'], attn['path'], attn['token2']))
            if self.config.EXPORT_CODE_VECTORS:
                print('Code vector:')
                print(' '.join(map(str, raw_result.code_vector)))
def predict(self):
    """Export one code vector per Java-method file in the dump folder to a CSV.

    Each CSV row is ``[filename, space-separated vector]``. Paths are
    hard-coded to one developer's Windows machine.
    """
    basepath = 'C:\\Users\\User\\Documents\\SP2019-DNC\\nodeJS_Rest\\dumpFolder\\JavaMethods\\'
    _, _, files = next(os.walk(basepath))
    print(len(files))  # how many method files will be processed
    # NOTE: the original CSV path used a single backslash ('Documents\SP2019-DNC');
    # '\S' is not an escape sequence so the runtime string is identical, but the
    # doubled backslash avoids Python's invalid-escape warning.
    csv_path = 'C:\\Users\\User\\Documents\\SP2019-DNC\\nodeJS_Rest\\dumpFolder\\c2vVector.csv'
    with open(csv_path, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        # BUG FIX: the original loop variable was also named 'file', shadowing
        # the open CSV handle; the trailing 'file.close()' then ran on a
        # filename *string* and raised AttributeError. The loop variable is
        # renamed and the 'with' block now owns closing the file.
        for method_file in files:
            input_filename = basepath + str(method_file)
            try:
                predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(input_filename)
            except ValueError as e:
                print(e)
                continue
            raw_prediction_results = self.model.predict(predict_lines)
            method_prediction_results = common.parse_prediction_results(
                raw_prediction_results, hash_to_string_dict,
                self.model.vocabs.target_vocab.special_words, topk=SHOW_TOP_CONTEXTS)
            for raw_prediction, _method_prediction in zip(raw_prediction_results,
                                                          method_prediction_results):
                if self.config.EXPORT_CODE_VECTORS:
                    writer.writerow([str(method_file),
                                     ' '.join(map(str, raw_prediction.code_vector))])
    return
def predict(self):
    """Run one non-interactive prediction over the hard-coded Input.java path
    and print predictions, attention paths, and (optionally) the code vector."""
    source_path = '/home/joaorura/PycharmProjects/API_REST_Code2Algo/external_projects/code2vec/Input.java'
    print('Starting prediction...')
    try:
        contexts, hash_to_name = self.path_extractor.extract_paths(source_path)
    except ValueError as err:
        # Nothing to predict on if extraction fails.
        print(err)
        return
    raw_results = self.model.predict(contexts)
    parsed_results = common.parse_prediction_results(
        raw_results, hash_to_name,
        self.model.vocabs.target_vocab.special_words, topk=SHOW_TOP_CONTEXTS)
    for raw_result, parsed in zip(raw_results, parsed_results):
        print('Original name:\t' + parsed.original_name)
        for pair in parsed.predictions:
            print('\t(%f) predicted: %s' % (pair['probability'], pair['name']))
        print('Attention:')
        for attn in parsed.attention_paths:
            print('%f\tcontext: %s,%s,%s' % (attn['score'], attn['token1'],
                                             attn['path'], attn['token2']))
        if self.config.EXPORT_CODE_VECTORS:
            print('Code vector:')
            print(' '.join(map(str, raw_result.code_vector)))
def predict_input_files(self):
    """Print one space-separated code-vector line per method for every file
    listed in config.INPUT_FILES; files that fail extraction are skipped."""
    for source in self.config.INPUT_FILES:
        try:
            contexts, hash_to_name = self.path_extractor.extract_paths(source)
        except ValueError as err:
            print(err)
            continue
        raw_results = self.model.predict(contexts)
        parsed_results = common.parse_prediction_results(
            raw_results, hash_to_name,
            self.model.vocabs.target_vocab.special_words, topk=SHOW_TOP_CONTEXTS)
        # Only the raw code vector is emitted; the parsed side of the zip is
        # unused but kept so the pairing (and its length-truncation) matches.
        for raw_result, _parsed in zip(raw_results, parsed_results):
            print(' '.join(map(str, raw_result.code_vector)))
def predict(self):
    """Walk 'labeled_reports', predict on every .java file, and write each
    file's method names + code vectors to a sibling .csv (via pandas)."""
    print('Starting file processing...')
    path_to_report = "labeled_reports"
    for root, dirs, files in os.walk(path_to_report):
        for file in files:
            # BUG FIX: the original used 'break' here, which abandoned ALL
            # remaining files in a directory as soon as one non-.java file
            # appeared; 'continue' skips only that file.
            if not file.endswith('.java'):
                continue
            input_filename = root + "/" + file
            try:
                predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(input_filename)
            except ValueError as e:
                print(e)
                continue
            raw_prediction_results = self.model.predict(predict_lines)
            method_prediction_results = common.parse_prediction_results(
                raw_prediction_results, hash_to_string_dict,
                self.model.vocabs.target_vocab.special_words)
            methods_names = []
            methods_vectors = []
            for raw_prediction, method_prediction in zip(raw_prediction_results,
                                                         method_prediction_results):
                # original_name is '|'-separated subtokens; join into one identifier.
                methods_names.append(''.join(method_prediction.original_name.split('|')))
                if self.config.EXPORT_CODE_VECTORS:
                    methods_vectors.append(raw_prediction.code_vector)
            df = pd.DataFrame(data=methods_vectors)
            df['method'] = methods_names
            # splitext (instead of split('.')[0]) survives paths containing
            # extra dots; for a plain '<name>.java' it is identical.
            df.to_csv(os.path.splitext(input_filename)[0] + '.csv')
            print(input_filename + " - done")
def predict(self):
    """For every app listed in ../javaPahts.json, compute code vectors for all
    of its Java files and dump them to csv/<app>.csv (one row per method:
    name, source path, vector components). Prints total elapsed time."""
    try:
        os.mkdir('csv/')
    except FileExistsError:
        pass  # output directory already present
    with open('../javaPahts.json') as f:
        data = json.load(f)
    print('Starting interactive prediction...')
    start_time = time.time()
    for app in data:
        app_rows = []
        for java_path in data[app]:
            source = os.path.join("..", java_path)
            try:
                contexts, hash_to_name = self.path_extractor.extract_paths(source)
            except ValueError as err:
                print(err)
                continue
            raw_results = self.model.predict(contexts)
            parsed_results = common.parse_prediction_results(
                raw_results, hash_to_name,
                self.model.vocabs.target_vocab.special_words, topk=SHOW_TOP_CONTEXTS)
            for raw_result, parsed in zip(raw_results, parsed_results):
                if self.config.EXPORT_CODE_VECTORS:
                    # Row layout: method name, relative path, then the vector.
                    app_rows.append([parsed.original_name] + [java_path] +
                                    list(raw_result.code_vector))
        pd.DataFrame(app_rows).to_csv('csv/' + app + '.csv', index=False)
    print("--- %s seconds ---" % (time.time() - start_time))
def predict(self):
    """Predict vulnerability labels for 'function.c' and write them to result.txt.

    If the top prediction is 'NONE' with probability < 0.80, the next three
    candidates are reported instead of the top three.
    """
    input_filename = 'function.c'
    print('Starting interactive prediction for c...')
    try:
        predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(input_filename)
    except ValueError as e:
        print(e)
        # BUG FIX: the original fell through here ('continue' was commented
        # out — and would be illegal outside a loop anyway) and then crashed
        # with a NameError on the undefined 'predict_lines'. Bail out instead.
        return
    raw_prediction_results = self.model.predict(predict_lines)
    method_prediction_results = common.parse_prediction_results(
        raw_prediction_results, hash_to_string_dict,
        self.model.vocabs.target_vocab.special_words, topk=SHOW_TOP_CONTEXTS)
    with open('result.txt', 'w+') as resultfile:
        for raw_prediction, method_prediction in zip(raw_prediction_results,
                                                     method_prediction_results):
            print('Original-name: ' + method_prediction.original_name + '\n')
            print(method_prediction)
            resultfile.writelines('Function Name - ' + method_prediction.original_name + '\n')
            # Pick which three candidates to report (the two original branches
            # differed only in this range, so they are merged here).
            top = method_prediction.predictions[0]
            if 'NONE' in top['name'] and float(top['probability']) < 0.80:
                candidate_range = range(1, 4)  # skip the low-confidence NONE
            else:
                candidate_range = range(0, 3)
            for number in candidate_range:
                prediction = method_prediction.predictions[number]
                prediction['name'] = map_vulnerability_names(prediction['name'])
                print('prediction for NONE is - ', prediction)
                resultfile.writelines(
                    prediction['name'] + ' ^ ' +
                    str(float(prediction['probability'] * 100)) + '%' + '\n')
            for name_prob_pair in method_prediction.predictions:
                print('\t(%f) predicted: %s' % (name_prob_pair['probability'],
                                                name_prob_pair['name']))
def predict(self):
    """Batch-predict every .java file under SRC_PATH and record each method's
    name + code vector as a line in a .txt file under DST_PATH."""
    print(
        'NOTICE: The cusomized version of predict() in interactive_predict.py was called!'
    )
    targets = [SRC_PATH]
    if not SRC_PATH.match('*.java'):
        # SRC_PATH is a directory: process every .java file inside it.
        targets = sorted(SRC_PATH.glob('*.java'))
    for source in targets:
        print(source)
        try:
            contexts, hash_to_name = self.path_extractor.extract_paths(source.as_posix())
        except ValueError as err:
            print(err)
            continue
        raw_results = self.model.predict(contexts)
        parsed_results = common.parse_prediction_results(
            raw_results, hash_to_name,
            self.model.vocabs.target_vocab.special_words, topk=SHOW_TOP_CONTEXTS)
        out_path = DST_PATH
        if not DST_PATH.match('*.txt'):
            # DST_PATH is a directory: emit one .txt per input file.
            DST_PATH.mkdir(parents=True, exist_ok=True)
            out_path = DST_PATH / f'{source.stem}.txt'
        with out_path.open(mode='w') as out:
            for raw_result, parsed in zip(raw_results, parsed_results):
                print('Original name:\t' + parsed.original_name)
                if self.config.EXPORT_CODE_VECTORS:
                    vector_text = ' '.join(map(str, raw_result.code_vector))
                    print('Code vector:')
                    print(vector_text)
                    # Persist "name,vector" for this method.
                    out.write('{},{}\n'.format(parsed.original_name, vector_text))
def predict(self):
    """Print the code vector for every '.block' file under the hard-coded
    defects4j GoodBlock directory; unparsable files are skipped silently."""
    for block_path in glob.glob('/Users/jianyumao/defects4j/GoodBlock/*.block'):
        try:
            contexts, hash_to_name = self.path_extractor.extract_paths(block_path)
        except ValueError:
            # Matches the original: no message, just move to the next file.
            continue
        raw_results = self.model.predict(contexts)
        parsed_results = common.parse_prediction_results(
            raw_results, hash_to_name,
            self.model.vocabs.target_vocab.special_words, topk=SHOW_TOP_CONTEXTS)
        for raw_result, _parsed in zip(raw_results, parsed_results):
            if self.config.EXPORT_CODE_VECTORS:
                print('Code vector:')
                print(' '.join(map(str, raw_result.code_vector)))
def predict_file(self, input_filename, output_filename):
    """Predict on one file, write the input prefixed with a /* report */ block
    comment to output_filename, and return whether the (last method's) top
    prediction equals its real name.

    Returns None if path extraction fails (after prompting for a key press).
    """
    print(input_filename, '--->', output_filename)
    report = []
    try:
        contexts, hash_to_name = self.path_extractor.extract_paths(input_filename)
    except ValueError as err:
        print(err)
        print('press a key to continue...')
        input()
        return
    raw_results = self.model.predict(contexts)
    parsed_results = common.parse_prediction_results(
        raw_results, hash_to_name,
        self.model.vocabs.target_vocab.special_words, topk=SHOW_TOP_CONTEXTS)
    for raw_result, parsed in zip(raw_results, parsed_results):
        # NOTE: like the original, these survive the loop — the return value
        # reflects only the *last* method in the file.
        prediction = parsed.predictions[0]['name'][0]
        actual = parsed.original_name
        report.append('Prediction:\t' + prediction)
        report.append('Actual:\t' + actual)
        for pair in parsed.predictions:
            report.append('\t(%f) predicted: %s' % (pair['probability'], pair['name']))
        report.append('Attention:')
        for attn in parsed.attention_paths:
            report.append('%f\tcontext: %s,%s,%s' % (attn['score'], attn['token1'],
                                                     attn['path'], attn['token2']))
        if self.config.EXPORT_CODE_VECTORS:
            report.append('Code vector:')
            report.append(' '.join(map(str, raw_result.code_vector)))
    body = '\n'.join(report)
    Path(output_filename).write_text('/*\n' + body + '\n*/' + '\n\n' +
                                     Path(input_filename).read_text())
    return prediction == actual
def predict(self):
    """Run prediction over every SARD sample under sard/data/ and dump each
    code vector to sard/vector/<name>.vector."""
    ## Iterates over the files produced by the SARD utility.
    sard_file_path = 'sard/data/'
    for input_filename in os.listdir(sard_file_path):
        print('Starting interactive prediction...')
        print(input_filename)
        try:
            contexts, hash_to_name = self.path_extractor.extract_paths(sard_file_path + input_filename)
            # Debug dump of the extracted contexts.
            print('*****************************************************')
            print(contexts)
            print(hash_to_name)
            print('*****************************************************')
        except ValueError as err:
            print(err)
            continue
        raw_results = self.model.predict(contexts)
        parsed_results = common.parse_prediction_results(
            raw_results, hash_to_name,
            self.model.vocabs.target_vocab.special_words, topk=SHOW_TOP_CONTEXTS)
        for raw_result, parsed in zip(raw_results, parsed_results):
            print('Original name:\t' + parsed.original_name)
            for pair in parsed.predictions:
                print('\t(%f) predicted: %s' % (pair['probability'], pair['name']))
            print('Attention:')
            for attn in parsed.attention_paths:
                print('%f\tcontext: %s,%s,%s' % (attn['score'], attn['token1'],
                                                 attn['path'], attn['token2']))
            if self.config.EXPORT_CODE_VECTORS:
                print('Code vector:')
                ## Persist the code vector for this sample.
                with open('sard/vector/' + input_filename + '.vector', 'w') as out:
                    out.write(' '.join(map(str, raw_result.code_vector)))
def predict(self, predict_lines, hash_to_string_dict, target_method, top_attentions=3):
    """Return the code vector of the method whose joined subtoken name matches
    target_method (case-insensitively), or None if no method matches.

    predict_lines / hash_to_string_dict are assumed to come from a prior
    path-extraction step. top_attentions is currently unused.
    """
    raw_results = self.model.predict(predict_lines)
    parsed_results = common.parse_prediction_results(
        raw_results, hash_to_string_dict,
        self.model.vocabs.target_vocab.special_words, topk=SHOW_TOP_CONTEXTS)
    wanted = target_method.lower()
    for raw_result, parsed in zip(raw_results, parsed_results):
        # original_name is '|'-separated subtokens; strip the separators before comparing.
        if wanted == parsed.original_name.replace('|', ''):
            return raw_result.code_vector
    # Falls through (returns None) when no method name matches.
    # Do we want to get a context as well? But how to use it?
def train(config: Config, model):
    """Print code vectors for the files listed in a CSV of path pairs.

    The single input file is a CSV with rows 'path1,path2,label'; only
    'path1' of each row (after the first) is extracted and predicted on.

    Args:
        config: run configuration; INPUT_FILES must hold exactly one CSV path.
        model: a loaded code2vec model exposing predict() and vocabs.

    Raises:
        ValueError: if config.INPUT_FILES does not contain exactly one path.
    """
    path_extractor = Extractor(config,
                               jar_path=JAR_PATH,
                               max_path_length=MAX_PATH_LENGTH,
                               max_path_width=MAX_PATH_WIDTH)
    if len(config.INPUT_FILES) != 1:
        # FIX: the original raised a bare, message-less Exception.
        raise ValueError('expected exactly one input file, got %d' %
                         len(config.INPUT_FILES))
    data_file = config.INPUT_FILES[0]
    data = pd.read_csv(data_file, names=['path1', 'path2', 'label'])
    features = data.copy()
    # Drop the label column; its values are not used here (the original bound
    # it to an unused variable, with redundant parentheses around 'label').
    features.pop('label')
    for i, paths in features.iterrows():
        if i == 0:
            continue  # skip the first row (presumably a header — TODO confirm)
        predict_lines, hash_to_string_dict = path_extractor.extract_paths(
            paths['path1'])
        raw_prediction_results = model.predict(predict_lines)
        method_prediction_results = common.parse_prediction_results(
            raw_prediction_results, hash_to_string_dict,
            model.vocabs.target_vocab.special_words, topk=SHOW_TOP_CONTEXTS)
        print()  # blank separator line between files
        for raw_prediction, method_prediction in zip(raw_prediction_results,
                                                     method_prediction_results):
            print(' '.join(map(str, raw_prediction.code_vector)))
def predict(self):
    """Compute code vectors for every file in the NABATS dataset CSV and append
    them (filename, method name, vector) to a fixed output text file.

    NOTE(review): dataset CSV, path-rewriting rules, and the output location
    are all hard-coded to one developer's machine.
    """
    print('Starting interactive prediction...')
    # Make sure not to analyze the same file twice when I have to re-run
    #Config for buggy methods
    #test_files_path = "/home/kilby/Documents/code/c2v_models/files/"+self.config.PROJECT+"/buggy_methods"
    #vector_data_path = "/home/kilby/Documents/code/c2v_models/files/"+self.config.PROJECT+"/vector_data.txt"
    #Config for clean methods of buggy files
    #test_files_path = "/home/kilby/Documents/code/c2v_models/files/buggy_files_modified/"+self.config.PROJECT
    #vector_data_path = "/home/kilby/Documents/code/c2v_models/files/buggy_files_modified/"+self.config.PROJECT+"/vector_data.txt"
    #.. config continued for both configs above...
    #test_files = [f for f in listdir(test_files_path) if isfile(join(test_files_path, f))]
    #ignore_files = set()
    #CHANGE THIS BACK!
    #with open(vector_data_path, 'r') as vector_file:
    #for line in vector_file:
    #line = line.split(',')
    #ignore_files.add(line[0])
    #test_files = set(test_files) - ignore_files
    #Config to collect all methods from all cloned repos
    test_files_path = ""  # empty: the paths gathered below are already absolute
    vector_data_path = "/home/kilby/Documents/code/c2v_models/files/all_target_file_vectors.txt"
    import pandas as pd
    import os
    df = pd.read_csv(
        "/home/kilby/Documents/code/c2v_models/files/nabats_dataset.csv")
    test_files = []
    for i, r in df.iterrows():
        # Rewrite the dataset's recorded path to this machine's checkout layout.
        path = r['filepath']
        path = path.replace('NABATS', 'code').replace('kjbaron', 'kilby')
        path = path.split('/')
        path.insert(7, r['project'])  # splice the project name in as a path segment
        path = "/".join(path)
        if os.path.isfile(path):  # keep only files that actually exist locally
            test_files.append(path)
    counter = 0
    total = len(test_files)
    for input_filename in test_files:
        print(counter, "/", total, input_filename)  # progress indicator
        counter += 1
        try:
            # predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(input_filename)
            # NOTE(review): test_files_path is "" and input_filename is absolute,
            # so this yields "//home/..." — harmless on POSIX but looks unintended.
            predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(
                test_files_path + '/' + input_filename)
        except ValueError as e:
            print(e)
            print(input_filename)
            continue
        raw_prediction_results = self.model.predict(predict_lines)
        method_prediction_results = common.parse_prediction_results(
            raw_prediction_results, hash_to_string_dict,
            self.model.vocabs.target_vocab.special_words, topk=SHOW_TOP_CONTEXTS)
        for raw_prediction, method_prediction in zip(
                raw_prediction_results, method_prediction_results):
            # print('Original name:\t' + method_prediction.original_name)
            # for name_prob_pair in method_prediction.predictions:
            # print('\t(%f) predicted: %s' % (name_prob_pair['probability'], name_prob_pair['name']))
            # print('Attention:')
            # for attention_obj in method_prediction.attention_paths:
            # print('%f\tcontext: %s,%s,%s' % (
            # attention_obj['score'], attention_obj['token1'], attention_obj['path'], attention_obj['token2']))
            # MODIFIED BY KILBY TO COLLECT VECTORS ALONG WITH FILE AND METHOD NAMES
            if self.config.EXPORT_CODE_VECTORS:
                # print('Code vector:')
                # print(' '.join(map(str, raw_prediction.code_vector)))
                # Append one line: <file>,<original method name>,<vector>
                with open(vector_data_path, 'a') as f:
                    f.write(
                        input_filename + "," + raw_prediction.original_name +
                        "," + ' '.join(map(str, raw_prediction.code_vector)) +
                        "\n")
# Batch prediction driver: read one JSON sample per line plus its extracted
# context paths, run the model on each, and write "<idx>\t<mapped prediction>"
# per sample. json_file / context_paths / predictions_file / model / dicti are
# expected to be defined earlier in the file.
with open(json_file) as sample_file, open(
        context_paths) as contexts_file, open(predictions_file,
                                              "w") as predictions:
    for sample, function in zip(sample_file, contexts_file):
        sample = json.loads(sample.strip())
        # Context line format: "<method_name> <w1,path,w2> <w1,path,w2> ..."
        parts = function.rstrip().split(' ')
        method_name = parts[0]
        current_result_line_parts = [method_name]
        contexts = parts[1:]
        for context in contexts[:200]:  # model input is capped at 200 contexts
            context_parts = context.split(',')
            context_word1 = context_parts[0]
            context_path = context_parts[1]
            context_word2 = context_parts[2]
            current_result_line_parts += [
                '%s,%s,%s' % (context_word1, context_path, context_word2)
            ]
        # Pad with blanks so every line carries exactly 200 context slots
        # (negative padding collapses to '' when there are more than 200).
        space_padding = ' ' * (200 - len(contexts))
        result_line = ' '.join(current_result_line_parts) + space_padding
        raw_prediction_results = model.predict([result_line])
        # NOTE(review): unlike the other call sites in this file, this call
        # passes no hash-to-string dict — confirm it matches the signature of
        # common.parse_prediction_results in use here.
        method_prediction_results = common.parse_prediction_results(
            raw_prediction_results, model.vocabs.target_vocab.special_words,
            topk=SHOW_TOP_CONTEXTS)
        for raw_prediction, method_prediction in zip(
                raw_prediction_results, method_prediction_results):
            # 'dicti' presumably maps the top predicted name token to the label
            # that is written out — verify against its definition.
            predictions.write(
                f"{sample['idx']}\t{dicti[method_prediction.predictions[0]['name'][0]]}\n"
            )