def _okay(js, ast, predictor):
    """Return True iff every evidence item recorded in `js` (apicalls, types,
    keywords) can be re-derived from the calls appearing in the given AST."""
    resolved = [predictor.callmap[c['_call']] for c in gather_calls(ast['ast'])]

    def _collect(extract):
        # Flatten per-call evidence lists into one de-duplicated list.
        return list(set(chain.from_iterable(extract(c) for c in resolved)))

    apicalls = _collect(
        bayou.models.low_level_evidences.evidence.APICalls.from_call)
    types = _collect(
        bayou.models.low_level_evidences.evidence.Types.from_call)
    keywords = _collect(
        bayou.models.low_level_evidences.evidence.Keywords.from_call)

    return (all(c in apicalls for c in js['apicalls'])
            and all(t in types for t in js['types'])
            and all(k in keywords for k in js['keywords']))
def read_data(self, filename, save=None):
    """Read programs from a JSON file and build (evidence, AST-path) training
    pairs; optionally pickle the collected call map into `save`/callmap.pkl.

    Returns a pair (evidences, targets) of parallel tuples.
    """
    with open(filename) as f:
        js = json.load(f)

    data_points = []
    callmap = {}
    done = ignored = 0
    for program in js['programs']:
        if 'ast' not in program:
            continue
        try:
            evidence = [ev.read_data_point(program)
                        for ev in self.config.evidence]
            ast_paths = self.get_ast_paths(program['ast']['_nodes'])
            self.validate_sketch_paths(program, ast_paths)
            for path in ast_paths:
                # Every stored path is rooted at a DSubTree node.
                path.insert(0, ('DSubTree', CHILD_EDGE))
                data_points.append((evidence, path))
            # Remember the first occurrence of each distinct call.
            for call in gather_calls(program['ast']):
                callmap.setdefault(call['_call'], call)
        except (TooLongPathError, InvalidSketchError):
            ignored += 1
        done += 1

    print('{:8d} programs in training data'.format(done))
    print('{:8d} programs ignored by given config'.format(ignored))
    print('{:8d} data points total'.format(len(data_points)))

    # randomly shuffle to avoid bias towards initial data points during training
    random.shuffle(data_points)
    evidences, targets = zip(*data_points)

    # save callmap if save location is given
    if save is not None:
        with open(os.path.join(save, 'callmap.pkl'), 'wb') as f:
            pickle.dump(callmap, f)

    return evidences, targets
def extract_evidence(fileName, expNumber):
    """Pick one random valid program from `fileName` and return it, as a JSON
    string, augmented with API-call/type/keyword evidences plus evidences
    gathered from its surrounding methods.

    Returns '{}' when no program in the file passes sketch-path validation.
    `expNumber` is accepted for interface compatibility but unused here.
    """
    with open(fileName) as f:
        js = json.load(f)

    # programs_dict: method name -> [returnType, formalParam, longest call seq]
    programs_dict = dict()
    valid = []  # 1/0 flag per program, in file order

    ignored = 0
    for program in js['programs']:
        try:
            _ast_graph, ast_paths = ast_extractor.get_ast_paths(
                program['ast']['_nodes'])
            ast_extractor.validate_sketch_paths(program, ast_paths,
                                                max_ast_depth)

            method_name = program['method']
            returnType = program['returnType'] \
                if 'returnType' in program else "__Constructor__"
            formalParam = program['formalParam'] \
                if 'formalParam' in program else []

            sequences = program['sequences']
            sequences = [[shorten(call) for call in json_seq['calls']]
                         for json_seq in sequences]
            sequences.sort(key=len, reverse=True)

            programs_dict[method_name] = [returnType, formalParam,
                                          sequences[0]]
            valid.append(1)
        except (ast_extractor.TooLongPathError,
                ast_extractor.InvalidSketchError):
            ignored += 1
            valid.append(0)

    if sum(valid) == 0:
        return json.dumps({}, indent=4)

    # Sample one valid program uniformly at random (replaces the original
    # rejection-sampling `while True` loop).
    choice = random.choice([i for i, ok in enumerate(valid) if ok == 1])
    program = js['programs'][choice]

    calls = gather_calls(program['ast'])
    apicalls = list(set(chain.from_iterable(
        [bayou.models.low_level_evidences.evidence.APICalls.from_call(call)
         for call in calls])))
    types = list(set(chain.from_iterable(
        [bayou.models.low_level_evidences.evidence.Types.from_call(call)
         for call in calls])))
    keywords = list(set(chain.from_iterable(
        [bayou.models.low_level_evidences.evidence.Keywords.from_call(call)
         for call in calls])))

    sample = dict(program)
    sample['apicalls'] = apicalls
    sample['types'] = types
    sample['keywords'] = keywords
    sample['body'] = stripJavaDoc(sample['body'])

    method_name = program['method']
    sequences = program['sequences']
    sequences = [[shorten(call) for call in json_seq['calls']]
                 for json_seq in sequences]
    sequences.sort(key=len, reverse=True)
    sample['sequences'] = sequences[0]

    # Take in classTypes and sample a few
    sample['classTypes'] = list(set(program['classTypes'])) \
        if 'classTypes' in program else []
    if len(sample['classTypes']) == 0:
        del sample['classTypes']

    sample['sorrreturntype'] = []
    sample['sorrformalparam'] = []
    sample['sorrsequences'] = []

    for method in programs_dict:
        # Ignore the current method from the list of surrounding methods.
        if method == method_name:
            continue
        # BUG FIX: this loop variable was previously named `choice`, which
        # clobbered the randomly chosen program index above.
        for ev_value, ev_key in zip(
                programs_dict[method],
                ['sorrreturntype', 'sorrformalparam', 'sorrsequences']):
            sample[ev_key].append(ev_value)

    ## SORR RET
    sample['sorrreturntype'] = list(set(sample['sorrreturntype']))
    if len(sample['sorrreturntype']) == 0:
        del sample['sorrreturntype']

    ## SORR FP — drop empty parameter lists, tuple-ify for hashing
    filteredSorrFP = [tuple(fp) for fp in sample['sorrformalparam']
                      if len(fp) > 0]
    filteredSorrFP.sort(key=len, reverse=True)
    sample['sorrformalparam'] = list(set(filteredSorrFP))
    if len(sample['sorrformalparam']) == 0:
        del sample['sorrformalparam']

    ## SORR SEQ — same treatment for surrounding call sequences
    oldSorrSeq = sample['sorrsequences']
    oldSorrSeq.sort(key=len, reverse=True)
    filteredSorrSeq = [tuple(seq) for seq in oldSorrSeq if len(seq) > 0]
    sample['sorrsequences'] = list(set(filteredSorrSeq))
    if len(sample['sorrsequences']) == 0:
        del sample['sorrsequences']

    return json.dumps(sample, indent=2)
def extract_evidence(clargs):
    """Two-pass evidence extraction over a large JSON programs file.

    Pass 1 streams the file (ijson) to collect, per source file, each
    method's [returnType, formalParam, call-sequence] triple and to count
    return-type / formal-parameter-type frequencies.  Pass 2 re-streams the
    file and emits one evidence sample per valid program, mapping any type
    outside the 1000 most frequent to '__UDT__', then writes a ~70/30
    train/test split (split by source file) to <output>_train.json and
    <output>_test.json.
    """
    print('Loading data file...')
    f = open(clargs.input_file[0], 'rb')
    print('Done')

    programs_dict = dict()  # file -> {method -> [returnType, formalParam, seq]}
    returnDict = dict()     # returnType -> frequency
    FP_Dict = dict()        # formal-parameter type -> frequency

    done = 0
    ignored = 0
    for program in ijson.items(f, 'programs.item'):
        if 'ast' not in program:
            continue
        try:
            _ast_graph, ast_paths = ast_extractor.get_ast_paths(
                program['ast']['_nodes'])
            ast_extractor.validate_sketch_paths(program, ast_paths,
                                                clargs.max_ast_depth)

            file_name = program['file']
            method_name = program['method']

            sequences = program['sequences']
            sequences = [[shorten(call) for call in json_seq['calls']]
                         for json_seq in sequences]
            sequences.sort(key=len, reverse=True)
            sequences = sequences[0]  # keep only the longest sequence

            if 'returnType' not in program:
                continue
            if program['returnType'] == 'None':
                program['returnType'] = '__Constructor__'
            returnType = program['returnType']
            returnDict[returnType] = returnDict.get(returnType, 0) + 1

            formalParam = program['formalParam'] \
                if 'formalParam' in program else []
            for typ in formalParam:  # renamed from `type` (builtin shadow)
                FP_Dict[typ] = FP_Dict.get(typ, 0) + 1

            if file_name not in programs_dict:
                programs_dict[file_name] = dict()
            if method_name in programs_dict[file_name]:
                print('Hit Found')
            programs_dict[file_name][method_name] = [returnType, formalParam,
                                                     sequences]
        except (ast_extractor.TooLongPathError,
                ast_extractor.InvalidSketchError):
            ignored += 1
        done += 1
        if done % 100000 == 0:
            print('Extracted evidences of sorrounding features for {} programs'
                  .format(done), end='\n')

    print('')
    print('{:8d} programs/asts in training data'.format(done))
    print('{:8d} programs/asts ignored by given config'.format(ignored))
    print('{:8d} programs/asts to search over'.format(done - ignored))

    # ~70% of source files go to the training split.
    trainProgDict = set()
    for program_file_name in programs_dict.keys():
        if random.uniform(0, 1) <= 0.7:
            trainProgDict.add(program_file_name)

    train_programs = []
    test_programs = []

    # Keep only the 1000 most frequent return / formal-parameter types.
    topRetKeys = {w: returnDict[w]
                  for w in sorted(returnDict, key=returnDict.get,
                                  reverse=True)[:1000]}
    topFPKeys = {w: FP_Dict[w]
                 for w in sorted(FP_Dict, key=FP_Dict.get,
                                 reverse=True)[:1000]}

    # Rewind for the second pass (ijson consumes the stream).
    f.close()
    f = open(clargs.input_file[0], 'rb')

    done = 0
    for program in ijson.items(f, 'programs.item'):
        if 'ast' not in program:
            continue
        try:
            _ast_graph, ast_paths = ast_extractor.get_ast_paths(
                program['ast']['_nodes'])
            ast_extractor.validate_sketch_paths(program, ast_paths,
                                                clargs.max_ast_depth)

            file_name = program['file']
            method_name = program['method']

            sequences = program['sequences']
            sequences = [[shorten(call) for call in json_seq['calls']]
                         for json_seq in sequences]
            sequences.sort(key=len, reverse=True)
            program['sequences'] = sequences[0]

            if 'returnType' not in program:
                continue
            if program['returnType'] == 'None':
                program['returnType'] = '__Constructor__'
            if program['returnType'] not in topRetKeys:
                program['returnType'] = '__UDT__'
            returnType = program['returnType']

            formalParam = program['formalParam'] \
                if 'formalParam' in program else []
            newFP = [typ if typ in topFPKeys else '__UDT__'
                     for typ in formalParam]

            sample = dict(program)
            calls = gather_calls(program['ast'])
            apicalls = list(set(chain.from_iterable(
                [bayou.models.low_level_evidences.evidence.APICalls
                 .from_call(call) for call in calls])))
            types = list(set(chain.from_iterable(
                [bayou.models.low_level_evidences.evidence.Types
                 .from_call(call) for call in calls])))
            keywords = list(set(chain.from_iterable(
                [bayou.models.low_level_evidences.evidence.Keywords
                 .from_call(call) for call in calls])))
            random.shuffle(apicalls)
            random.shuffle(types)
            random.shuffle(keywords)
            sample['apicalls'] = apicalls
            sample['types'] = types
            sample['keywords'] = keywords
            sample['returnType'] = returnType
            sample['formalParam'] = newFP

            classTypes = list(set(program['classTypes'])) \
                if 'classTypes' in program else []
            # BUG FIX: the original condition tested `topRetKeys` twice
            # (`t in topRetKeys or t in topRetKeys`); the second operand was
            # presumably meant to be topFPKeys — confirm against intent.
            filteredClassTypes = [t for t in classTypes
                                  if t in topRetKeys or t in topFPKeys]
            sample['classTypes'] = filteredClassTypes
            if len(filteredClassTypes) == 0:
                del sample['classTypes']

            sample['sorrreturntype'] = []
            sample['sorrformalparam'] = []
            sample['sorrsequences'] = []

            # Gather surrounding-method evidences from the same source file.
            otherMethods = list(programs_dict[file_name].keys())
            random.shuffle(otherMethods)
            for method in otherMethods:
                # Ignore the current method from list of sorrounding methods
                if method == method_name:
                    continue
                for ev_value, ev_key in zip(
                        programs_dict[file_name][method],
                        ['sorrreturntype', 'sorrformalparam',
                         'sorrsequences']):
                    sample[ev_key].append(ev_value)

            ## SORR RET — keep only frequent return types
            sample['sorrreturntype'] = list(set(
                t for t in sample['sorrreturntype'] if t in topRetKeys))
            if len(sample['sorrreturntype']) == 0:
                del sample['sorrreturntype']

            ## SORR FP — keep only frequent parameter types, drop empties
            filteredSorrFP = []
            for FP in sample['sorrformalparam']:
                kept = [t for t in FP if t in topFPKeys]
                if len(kept) > 0:
                    filteredSorrFP.append(tuple(kept))
            filteredSorrFP.sort(key=len, reverse=True)
            sample['sorrformalparam'] = list(set(filteredSorrFP))
            if len(sample['sorrformalparam']) == 0:
                del sample['sorrformalparam']

            ## SORR SEQ — tuple-ify non-empty sequences for hashing
            oldSorrSeq = sample['sorrsequences']
            oldSorrSeq.sort(key=len, reverse=True)
            filteredSorrSeq = [tuple(seq) for seq in oldSorrSeq
                               if len(seq) > 0]
            sample['sorrsequences'] = list(set(filteredSorrSeq))
            if len(sample['sorrsequences']) == 0:
                del sample['sorrsequences']

            if file_name in trainProgDict:
                train_programs.append(sample)
            else:
                test_programs.append(sample)
        except (ast_extractor.TooLongPathError,
                ast_extractor.InvalidSketchError):
            ignored += 1
        done += 1
        if done % 100000 == 0:
            print('Extracted evidence [API/Type/Keywords/Sorrounding Evidences] for {} programs'
                  .format(done), end='\n')

    f.close()  # FIX: the second handle was never closed

    random.shuffle(train_programs)

    print('\nWriting to {}...'.format(clargs.output_file[0]), end='')
    outFile = clargs.output_file[0]
    outFile = outFile.split(".")[0]
    with open(outFile + "_train.json", 'w') as fout:
        json.dump({'programs': train_programs}, fp=fout, indent=2)
    with open(outFile + "_test.json", 'w') as fout:
        json.dump({'programs': test_programs}, fp=fout, indent=2)
    print('done')
def extract_evidence(clargs):
    """Extract API-call / type / keyword evidences for every program in the
    input file and write the augmented programs back out as JSON.

    When clargs.num_samples is non-zero, each program is replaced by one or
    more down-sampled copies whose evidences are drawn either by an
    observability percentage or from an explicit size distribution.
    """
    print('Loading data file...', end='')
    with open(clargs.input_file[0]) as f:
        js = json.load(f)
    print('done')

    done = 0
    programs = []
    for program in js['programs']:
        sequences = program['sequences']
        # Skip programs with too many sequences or any over-long sequence.
        too_many = len(sequences) > clargs.max_seqs
        too_long = any(len(s['calls']) > clargs.max_seq_length
                       for s in sequences)
        if too_many or too_long:
            continue

        calls = gather_calls(program['ast'])
        apicalls = list(set(chain.from_iterable(
            [bayou.models.low_level_evidences.evidence.APICalls
             .from_call(call) for call in calls])))
        types = list(set(chain.from_iterable(
            [bayou.models.low_level_evidences.evidence.Types
             .from_call(call) for call in calls])))
        keywords = list(set(chain.from_iterable(
            [bayou.models.low_level_evidences.evidence.Keywords
             .from_call(call) for call in calls])))

        if clargs.num_samples == 0:
            # No sampling: attach the full evidence sets in place.
            program['apicalls'] = apicalls
            program['types'] = types
            program['keywords'] = keywords
            programs.append(program)
        else:
            # put all evidences in the same bag (to avoid bias during sampling)
            evidences = ([(e, 'apicalls') for e in apicalls]
                         + [(e, 'types') for e in types]
                         + [(e, 'keywords') for e in keywords])
            if clargs.num_samples > 0:
                num_samples = clargs.num_samples
            else:
                # Negative value means "one sample per |num_samples| evidences".
                num_samples = math.ceil(len(evidences) / -clargs.num_samples)

            for _ in range(num_samples):
                sample = dict(program)
                sample['apicalls'] = []
                sample['types'] = []
                sample['keywords'] = []
                if clargs.observability is not None:
                    obs = (clargs.observability
                           if clargs.observability > 0
                           else random.randint(1, 100))
                    choices = random.sample(
                        evidences, math.ceil(len(evidences) * obs / 100))
                elif clargs.distribution is not None:
                    random.shuffle(evidences)
                    num = np.random.choice(range(len(clargs.distribution)),
                                           p=clargs.distribution)
                    choices = evidences[:num + 1]
                else:
                    raise ValueError('Invalid option for sampling')
                for ev_value, ev_key in choices:
                    sample[ev_key].append(ev_value)
                programs.append(sample)

        done += 1
        print('Extracted evidence for {} programs'.format(done), end='\r')

    print('\nWriting to {}...'.format(clargs.output_file[0]), end='')
    with open(clargs.output_file[0], 'w') as f:
        json.dump({'programs': programs}, fp=f, indent=2)
    print('done')