コード例 #1
0
ファイル: ast_server.py プロジェクト: winnerineast/bayou
def _okay(js, ast, predictor):
    """Check that every evidence item listed in ``js`` occurs in ``ast``.

    The calls gathered from ``ast['ast']`` are mapped through
    ``predictor.callmap`` (keyed by each call's ``'_call'`` entry), and the
    API-call, type and keyword evidences are derived from the mapped calls.

    Args:
        js: mapping with ``'apicalls'``, ``'types'`` and ``'keywords'``
            iterables (the evidence that must be present).
        ast: mapping whose ``'ast'`` entry is handed to ``gather_calls``.
        predictor: object exposing a ``callmap`` mapping.

    Returns:
        True only if all three evidence lists of ``js`` are contained in the
        evidences extracted from the AST.
    """
    evidence_mod = bayou.models.low_level_evidences.evidence
    mapped_calls = [
        predictor.callmap[node['_call']] for node in gather_calls(ast['ast'])
    ]

    def unique_evidence(evidence_cls):
        # Flatten the per-call evidence lists and drop duplicates.
        per_call = [evidence_cls.from_call(call) for call in mapped_calls]
        return list(set(chain.from_iterable(per_call)))

    apicalls = unique_evidence(evidence_mod.APICalls)
    types = unique_evidence(evidence_mod.Types)
    keywords = unique_evidence(evidence_mod.Keywords)

    # Guard clauses keep the original left-to-right short-circuit order.
    if not all(c in apicalls for c in js['apicalls']):
        return False
    if not all(t in types for t in js['types']):
        return False
    return all(k in keywords for k in js['keywords'])
コード例 #2
0
    def read_data(self, filename, save=None):
        """Load training data points from a JSON corpus.

        Programs without an ``'ast'`` entry are skipped silently. For every
        other program the configured evidences are read, the AST paths are
        extracted and validated, and one ``(evidence, path)`` data point is
        produced per path (each path gets a ``('DSubTree', CHILD_EDGE)``
        prefix). Programs raising ``TooLongPathError`` or
        ``InvalidSketchError`` are counted as ignored and contribute nothing.

        Args:
            filename: path of the JSON file holding a ``'programs'`` list.
            save: optional directory; when given, the map from each call's
                ``'_call'`` value to the first call seen with that value is
                pickled to ``callmap.pkl`` inside it.

        Returns:
            A pair ``(evidences, targets)`` of tuples, shuffled in unison.
        """
        with open(filename) as f:
            js = json.load(f)

        samples = []
        callmap = {}
        num_ignored = 0
        num_seen = 0

        for program in js['programs']:
            if 'ast' not in program:
                continue
            try:
                evidence = [
                    ev.read_data_point(program) for ev in self.config.evidence
                ]
                ast_paths = self.get_ast_paths(program['ast']['_nodes'])
                self.validate_sketch_paths(program, ast_paths)
                for path in ast_paths:
                    path.insert(0, ('DSubTree', CHILD_EDGE))
                    samples.append((evidence, path))
                # Keep only the first call object seen for each '_call' key.
                for call in gather_calls(program['ast']):
                    callmap.setdefault(call['_call'], call)
            except (TooLongPathError, InvalidSketchError):
                num_ignored += 1
            num_seen += 1

        print('{:8d} programs in training data'.format(num_seen))
        print('{:8d} programs ignored by given config'.format(num_ignored))
        print('{:8d} data points total'.format(len(samples)))

        # randomly shuffle to avoid bias towards initial data points during training
        random.shuffle(samples)
        evidences, targets = zip(*samples)

        # save callmap if save location is given
        if save is not None:
            callmap_path = os.path.join(save, 'callmap.pkl')
            with open(callmap_path, 'wb') as f:
                pickle.dump(callmap, f)

        return evidences, targets
コード例 #3
0
ファイル: processJSON.py プロジェクト: rohanmukh/bayou
def extract_evidence(fileName, expNumber):
    """Pick one valid program at random and return its evidence as JSON.

    All programs in the file are validated first; for every valid one the
    return type, formal parameters and longest shortened call sequence are
    remembered under its method name. One valid program is then chosen
    uniformly at random and augmented with API-call / type / keyword
    evidence plus the "surrounding" evidence (``sorr*`` keys) taken from
    all *other* remembered methods.

    Args:
        fileName: path of a JSON file with a top-level ``'programs'`` list.
        expNumber: accepted for interface compatibility; not used here.

    Returns:
        A JSON string: ``'{}'`` when no program is valid, otherwise the
        chosen program's sample dumped with ``indent=2``.
    """
    with open(fileName) as f:
        js = json.load(f)

    def unique_in_order(items):
        # Deduplicate while keeping first-seen order, so a preceding sort
        # by length survives (``list(set(...))`` would scramble it).
        seen = set()
        unique = []
        for item in items:
            if item not in seen:
                seen.add(item)
                unique.append(item)
        return unique

    def sorted_sequences(program):
        # Shortened call sequences of one program, longest first.
        sequences = [[shorten(call) for call in json_seq['calls']]
                     for json_seq in program['sequences']]
        sequences.sort(key=len, reverse=True)
        return sequences

    # Key = method name,
    # Value = [return type, list of formal params, longest call sequence]
    programs_dict = dict()

    # valid[i] is 1 when js['programs'][i] passed validation, else 0.
    valid = []

    # This part collects the surrounding evidences.
    for program in js['programs']:
        try:
            _, ast_paths = ast_extractor.get_ast_paths(program['ast']['_nodes'])
            # NOTE(review): max_ast_depth is not defined in this function;
            # presumably a module-level setting -- confirm.
            ast_extractor.validate_sketch_paths(program, ast_paths, max_ast_depth)

            method_name = program['method']
            returnType = program['returnType'] if 'returnType' in program else "__Constructor__"
            formalParam = program['formalParam'] if 'formalParam' in program else []

            programs_dict[method_name] = [returnType, formalParam, sorted_sequences(program)[0]]
            valid.append(1)
        except (ast_extractor.TooLongPathError, ast_extractor.InvalidSketchError):
            valid.append(0)

    if sum(valid) == 0:
        return json.dumps({}, indent=4)

    # Rejection sampling over all indices (kept so the sequence of random
    # draws is unchanged); terminates because at least one entry is valid.
    while True:
        chosen_index = random.randint(0, len(valid) - 1)
        if valid[chosen_index] == 1:
            break

    # Index the chosen program directly instead of scanning the whole list.
    program = js['programs'][chosen_index]

    calls = gather_calls(program['ast'])
    evidence_mod = bayou.models.low_level_evidences.evidence
    apicalls = list(set(chain.from_iterable([evidence_mod.APICalls.from_call(call)
                                             for call in calls])))
    types = list(set(chain.from_iterable([evidence_mod.Types.from_call(call)
                                          for call in calls])))
    keywords = list(set(chain.from_iterable([evidence_mod.Keywords.from_call(call)
                                             for call in calls])))

    sample = dict(program)
    sample['apicalls'] = apicalls
    sample['types'] = types
    sample['keywords'] = keywords

    sample['body'] = stripJavaDoc(sample['body'])

    method_name = program['method']
    sample['sequences'] = sorted_sequences(program)[0]

    # Take in classTypes; drop the key entirely when there are none.
    sample['classTypes'] = list(set(program['classTypes'])) if 'classTypes' in program else []
    if len(sample['classTypes']) == 0:
        del sample['classTypes']

    sorr_keys = ['sorrreturntype', 'sorrformalparam', 'sorrsequences']
    for key in sorr_keys:
        sample[key] = []

    for method in programs_dict:  # each key is a method name
        # Ignore the current method in the list of surrounding methods.
        if method == method_name:
            continue
        # The loop variable is deliberately not called ``choice``: the
        # original reused that name and clobbered the selected index.
        for value, key in zip(programs_dict[method], sorr_keys):
            sample[key].append(value)

    ## SORR RET
    sample['sorrreturntype'] = list(set(sample['sorrreturntype']))
    if len(sample['sorrreturntype']) == 0:
        del sample['sorrreturntype']

    ## SORR FP: non-empty parameter lists, longest first, duplicates removed
    filteredSorrFP = [tuple(FP) for FP in sample['sorrformalparam'] if len(FP) > 0]
    filteredSorrFP.sort(key=len, reverse=True)
    sample['sorrformalparam'] = unique_in_order(filteredSorrFP)
    if len(sample['sorrformalparam']) == 0:
        del sample['sorrformalparam']

    ## SORR SEQ: non-empty sequences, longest first, duplicates removed
    filteredSorrSeq = [tuple(seq) for seq in sample['sorrsequences'] if len(seq) > 0]
    filteredSorrSeq.sort(key=len, reverse=True)
    sample['sorrsequences'] = unique_in_order(filteredSorrSeq)
    if len(sample['sorrsequences']) == 0:
        del sample['sorrsequences']

    return json.dumps(sample, indent=2)
コード例 #4
0
ファイル: evidence_extractor.py プロジェクト: rohanmukh/bayou
def extract_evidence(clargs):
    """Build train/test evidence JSON files from a streamed program corpus.

    The input file is read twice with ``ijson``:

    1. The first pass validates each program and records, per source file
       and method name, ``[return type, formal params, longest shortened
       call sequence]``. It also counts how often every return type and
       formal-parameter type occurs.
    2. The second pass builds one sample per valid program: API-call, type
       and keyword evidence (shuffled), return type / formal parameters
       with anything outside the 1000 most frequent replaced by
       ``'__UDT__'``, filtered class types, and "surrounding" evidence
       (``sorr*`` keys) taken from the other methods of the same file.

    Each source file is assigned to the training split with probability
    0.7, otherwise to the test split. Results are written to
    ``<base>_train.json`` and ``<base>_test.json``, where ``<base>`` is the
    output path with everything after the first dot of its file name
    removed.

    Args:
        clargs: object providing ``input_file`` and ``output_file``
            (sequences whose first element is a path) and
            ``max_ast_depth``.
    """
    import os  # function-scope import: only used to derive output names

    sorr_keys = ['sorrreturntype', 'sorrformalparam', 'sorrsequences']
    evidence_mod = bayou.models.low_level_evidences.evidence

    def sorted_sequences(program):
        # Shortened call sequences of one program, longest first.
        sequences = [[shorten(call) for call in json_seq['calls']]
                     for json_seq in program['sequences']]
        sequences.sort(key=len, reverse=True)
        return sequences

    def unique_evidence(evidence_cls, calls):
        # Flatten the per-call evidence lists and drop duplicates.
        return list(set(chain.from_iterable(
            [evidence_cls.from_call(call) for call in calls])))

    def unique_in_order(items):
        # Deduplicate while keeping first-seen order, so a preceding sort
        # by length survives (``list(set(...))`` would scramble it).
        seen = set()
        unique = []
        for item in items:
            if item not in seen:
                seen.add(item)
                unique.append(item)
        return unique

    print('Loading data file...')

    # Key = file name, Value = dict(Key = method name,
    #   Value = [return type, list of formal params, longest call sequence])
    programs_dict = dict()

    returnDict = dict()  # return type -> number of occurrences
    FP_Dict = dict()     # formal-parameter type -> number of occurrences

    # ---- Pass 1: collect surrounding evidences and type frequencies ----
    done = 0
    ignored = 0
    with open(clargs.input_file[0], 'rb') as f:
        print('Done')
        for program in ijson.items(f, 'programs.item'):
            if 'ast' not in program:
                continue
            try:
                _, ast_paths = ast_extractor.get_ast_paths(
                    program['ast']['_nodes'])
                ast_extractor.validate_sketch_paths(program, ast_paths,
                                                    clargs.max_ast_depth)

                file_name = program['file']
                method_name = program['method']
                sequences = sorted_sequences(program)[0]

                if 'returnType' not in program:
                    continue

                if program['returnType'] == 'None':
                    program['returnType'] = '__Constructor__'

                returnType = program['returnType']
                returnDict[returnType] = returnDict.get(returnType, 0) + 1

                formalParam = program[
                    'formalParam'] if 'formalParam' in program else []
                for param_type in formalParam:
                    FP_Dict[param_type] = FP_Dict.get(param_type, 0) + 1

                if file_name not in programs_dict:
                    programs_dict[file_name] = dict()

                if method_name in programs_dict[file_name]:
                    print('Hit Found')

                programs_dict[file_name][method_name] = [
                    returnType, formalParam, sequences
                ]

            except (ast_extractor.TooLongPathError,
                    ast_extractor.InvalidSketchError):
                ignored += 1

            done += 1
            if done % 100000 == 0:
                print(
                    'Extracted evidences of sorrounding features for {} programs'.
                    format(done),
                    end='\n')

    print('')

    print('{:8d} programs/asts in training data'.format(done))
    print('{:8d} programs/asts ignored by given config'.format(ignored))
    print('{:8d} programs/asts to search over'.format(done - ignored))

    # Split by source file so all methods of a file land in the same set.
    trainProgDict = set()
    for program_file_name in programs_dict.keys():
        rand = random.uniform(0, 1)
        if rand <= 0.7:
            trainProgDict.add(program_file_name)

    train_programs = []
    test_programs = []

    # Keep only the 1000 most frequent return / formal-parameter types.
    topRetKeys = {
        w: returnDict[w]
        for w in sorted(returnDict, key=returnDict.get, reverse=True)[:1000]
    }
    topFPKeys = {
        w: FP_Dict[w]
        for w in sorted(FP_Dict, key=FP_Dict.get, reverse=True)[:1000]
    }

    # ---- Pass 2: build the samples ----
    done = 0
    with open(clargs.input_file[0], 'rb') as f:
        for program in ijson.items(f, 'programs.item'):
            if 'ast' not in program:
                continue
            try:
                _, ast_paths = ast_extractor.get_ast_paths(
                    program['ast']['_nodes'])
                ast_extractor.validate_sketch_paths(program, ast_paths,
                                                    clargs.max_ast_depth)

                file_name = program['file']
                method_name = program['method']

                program['sequences'] = sorted_sequences(program)[0]

                if 'returnType' not in program:
                    continue
                if program['returnType'] == 'None':
                    program['returnType'] = '__Constructor__'

                if program['returnType'] not in topRetKeys:
                    program['returnType'] = '__UDT__'

                returnType = program['returnType']

                formalParam = program[
                    'formalParam'] if 'formalParam' in program else []
                newFP = [
                    param_type if param_type in topFPKeys else '__UDT__'
                    for param_type in formalParam
                ]

                sample = dict(program)
                calls = gather_calls(program['ast'])
                apicalls = unique_evidence(evidence_mod.APICalls, calls)
                types = unique_evidence(evidence_mod.Types, calls)
                keywords = unique_evidence(evidence_mod.Keywords, calls)
                random.shuffle(apicalls)
                random.shuffle(types)
                random.shuffle(keywords)
                sample['apicalls'] = apicalls
                sample['types'] = types
                sample['keywords'] = keywords
                sample['returnType'] = returnType
                sample['formalParam'] = newFP

                classTypes = list(set(
                    program['classTypes'])) if 'classTypes' in program else []
                # Fix: the original tested topRetKeys twice; the second
                # operand was evidently meant to be topFPKeys.
                filteredClassTypes = [
                    class_type for class_type in classTypes
                    if class_type in topRetKeys or class_type in topFPKeys
                ]

                sample['classTypes'] = filteredClassTypes
                if len(filteredClassTypes) == 0:
                    del sample['classTypes']

                for key in sorr_keys:
                    sample[key] = []

                otherMethods = list(programs_dict[file_name].keys())
                random.shuffle(otherMethods)

                for method in otherMethods:  # each entry is a method name
                    # Ignore the current method itself.
                    if method == method_name:
                        continue

                    for value, key in zip(programs_dict[file_name][method],
                                          sorr_keys):
                        sample[key].append(value)

                ## SORR RET: keep frequent return types only
                filteredSorrRet = [
                    ret_type for ret_type in sample['sorrreturntype']
                    if ret_type in topRetKeys
                ]
                sample['sorrreturntype'] = list(set(filteredSorrRet))
                if len(sample['sorrreturntype']) == 0:
                    del sample['sorrreturntype']

                ## SORR FP: frequent types only, longest first, deduplicated
                filteredSorrFP = []
                for FP in sample['sorrformalparam']:
                    kept = tuple(param_type for param_type in FP
                                 if param_type in topFPKeys)
                    if len(kept) > 0:
                        filteredSorrFP.append(kept)

                filteredSorrFP.sort(key=len, reverse=True)
                sample['sorrformalparam'] = unique_in_order(filteredSorrFP)

                if len(sample['sorrformalparam']) == 0:
                    del sample['sorrformalparam']

                ## SORR SEQ: non-empty sequences, longest first, deduplicated
                filteredSorrSeq = [
                    tuple(seq) for seq in sample['sorrsequences']
                    if len(seq) > 0
                ]
                filteredSorrSeq.sort(key=len, reverse=True)
                sample['sorrsequences'] = unique_in_order(filteredSorrSeq)
                if len(sample['sorrsequences']) == 0:
                    del sample['sorrsequences']

                if file_name in trainProgDict:
                    train_programs.append(sample)
                else:
                    test_programs.append(sample)

            except (ast_extractor.TooLongPathError,
                    ast_extractor.InvalidSketchError):
                ignored += 1

            done += 1
            if done % 100000 == 0:
                print(
                    'Extracted evidence [API/Type/Keywords/Sorrounding Evidences] for {} programs'
                    .format(done),
                    end='\n')

    random.shuffle(train_programs)

    print('\nWriting to {}...'.format(clargs.output_file[0]), end='')
    # Strip the extension from the file name only: splitting the whole path
    # on "." would truncate paths such as "./out.json" to an empty string.
    out_dir, out_name = os.path.split(clargs.output_file[0])
    outFile = os.path.join(out_dir, out_name.split(".")[0])

    with open(outFile + "_train.json", 'w') as f:
        json.dump({'programs': train_programs}, fp=f, indent=2)

    with open(outFile + "_test.json", 'w') as f:
        json.dump({'programs': test_programs}, fp=f, indent=2)

    print('done')
コード例 #5
0
def extract_evidence(clargs):
    """Attach API-call / type / keyword evidence to programs and save them.

    Programs with more than ``clargs.max_seqs`` sequences, or with any
    sequence holding more than ``clargs.max_seq_length`` calls, are skipped.
    With ``clargs.num_samples == 0`` each remaining program is written once
    with its full evidence. Otherwise several copies are written, each with
    a random subset of the pooled evidence, selected either by
    ``clargs.observability`` (a percentage; non-positive values draw a
    random percentage from 1 to 100) or by ``clargs.distribution``
    (probabilities over how many evidences to keep).

    Args:
        clargs: object providing ``input_file`` and ``output_file``
            (sequences whose first element is a path), ``max_seqs``,
            ``max_seq_length``, ``num_samples``, ``observability`` and
            ``distribution``.

    Raises:
        ValueError: when sampling is requested but neither
            ``observability`` nor ``distribution`` is set.
    """
    print('Loading data file...', end='')
    with open(clargs.input_file[0]) as f:
        js = json.load(f)
    print('done')

    processed = 0
    programs = []
    for program in js['programs']:
        sequences = program['sequences']
        if len(sequences) > clargs.max_seqs:
            continue
        if any([len(seq['calls']) > clargs.max_seq_length for seq in sequences]):
            continue

        calls = gather_calls(program['ast'])
        evidence_mod = bayou.models.low_level_evidences.evidence

        api_lists = [evidence_mod.APICalls.from_call(call) for call in calls]
        apicalls = list(set(chain.from_iterable(api_lists)))
        type_lists = [evidence_mod.Types.from_call(call) for call in calls]
        types = list(set(chain.from_iterable(type_lists)))
        keyword_lists = [evidence_mod.Keywords.from_call(call) for call in calls]
        keywords = list(set(chain.from_iterable(keyword_lists)))

        if clargs.num_samples == 0:
            # No sampling requested: annotate the program itself.
            program['apicalls'] = apicalls
            program['types'] = types
            program['keywords'] = keywords
            programs.append(program)
        else:
            # put all evidences in the same bag (to avoid bias during sampling)
            evidences = [(item, 'apicalls') for item in apicalls]
            evidences = evidences + [(item, 'types') for item in types]
            evidences = evidences + [(item, 'keywords') for item in keywords]

            if clargs.num_samples > 0:
                num_samples = clargs.num_samples
            else:
                # A negative value means one sample per |num_samples| evidences.
                num_samples = math.ceil(len(evidences) / -clargs.num_samples)

            for _ in range(num_samples):
                sample = dict(program)
                for key in ('apicalls', 'types', 'keywords'):
                    sample[key] = []

                if clargs.observability is not None:
                    if clargs.observability > 0:
                        observability = clargs.observability
                    else:
                        observability = random.randint(1, 100)
                    how_many = math.ceil(len(evidences) * observability / 100)
                    choices = random.sample(evidences, how_many)
                elif clargs.distribution is not None:
                    random.shuffle(evidences)
                    num = np.random.choice(range(len(clargs.distribution)),
                                           p=clargs.distribution)
                    choices = evidences[:num + 1]
                else:
                    raise ValueError('Invalid option for sampling')

                for item, key in choices:
                    sample[key].append(item)
                programs.append(sample)

        processed += 1
        print('Extracted evidence for {} programs'.format(processed), end='\r')

    print('\nWriting to {}...'.format(clargs.output_file[0]), end='')
    with open(clargs.output_file[0], 'w') as f:
        json.dump({'programs': programs}, fp=f, indent=2)
    print('done')