Example #1
    def train(self, model):
        if self.train_params['reload'] > 0:
            self.load_model(model, self.train_params['reload'])
        valid_every = self.train_params.get('valid_every', None)
        save_every = self.train_params.get('save_every', None)
        batch_size = self.train_params.get('batch_size', 128)
        nb_epoch = self.train_params.get('nb_epoch', 10)
        split = self.train_params.get('validation_split', 0)

        val_loss = {'loss': 1., 'epoch': 0}
        chunk_size = self.train_params.get('chunk_size', 100000)

        for i in range(self.train_params['reload'] + 1, nb_epoch):
            print('Epoch %d :: \n' % i, end='')

            logger.debug('loading data chunk..')
            offset = (i - 1) * chunk_size  # reuse the chunk_size read above

            names = data_loader.load_hdf5(
                self.data_path + self.data_params['train_methname'], offset,
                chunk_size)
            apis = data_loader.load_hdf5(
                self.data_path + self.data_params['train_apiseq'], offset,
                chunk_size)
            tokens = data_loader.load_hdf5(
                self.data_path + self.data_params['train_tokens'], offset,
                chunk_size)
            descs = data_loader.load_hdf5(
                self.data_path + self.data_params['train_desc'], offset,
                chunk_size)

            logger.debug('padding data..')
            methnames = pad(names, self.data_params['methname_len'])
            apiseqs = pad(apis, self.data_params['apiseq_len'])
            tokens = pad(tokens, self.data_params['tokens_len'])
            good_descs = pad(descs, self.data_params['desc_len'])
            bad_descs = [desc for desc in descs]
            random.shuffle(bad_descs)
            bad_descs = pad(bad_descs, self.data_params['desc_len'])

            hist = model.fit(
                [methnames, apiseqs, tokens, good_descs, bad_descs],
                epochs=1,
                batch_size=batch_size,
                validation_split=split)

            # val_loss is only reported when validation_split > 0
            if split > 0 and hist.history['val_loss'][0] < val_loss['loss']:
                val_loss = {'loss': hist.history['val_loss'][0], 'epoch': i}
            print('Best: Loss = {}, Epoch = {}'.format(val_loss['loss'],
                                                       val_loss['epoch']))

            if save_every is not None and i % save_every == 0:
                self.save_model(model, i)

            if valid_every is not None and i % valid_every == 0:
                acc, mrr, map, ndcg = self.valid(model, 1000, 1)
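The training loop above relies on a pad helper that is not shown in the snippet. A minimal sketch, assuming it simply wraps Keras' pad_sequences with a fixed target length (the actual project may pad differently):

    # Hypothetical sketch of the pad() helper used above, not the project's own code.
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    def pad(data, maxlen=None):
        # Right-pad / truncate each integer sequence to exactly maxlen entries.
        return pad_sequences(data, maxlen=maxlen, padding='post',
                             truncating='post', value=0)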
Example #2
    def repr_code(self, model):
        logger.info('Loading the use data ..')
        methnames = data_loader.load_hdf5(
            self.data_path + self.data_params['use_methname'], 0, -1)
        apiseqs = data_loader.load_hdf5(
            self.data_path + self.data_params['use_apiseq'], 0, -1)
        tokens = data_loader.load_hdf5(
            self.data_path + self.data_params['use_tokens'], 0, -1)
        methnames = pad(methnames, self.data_params['methname_len'])
        apiseqs = pad(apiseqs, self.data_params['apiseq_len'])
        tokens = pad(tokens, self.data_params['tokens_len'])

        logger.info('Representing code ..')
        vecs = model.repr_code([methnames, apiseqs, tokens], batch_size=1000)
        vecs = vecs.astype('float32')
        vecs = normalize(vecs)
        return vecs
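Because repr_code L2-normalizes the vectors, cosine similarity between a query description vector and all code vectors reduces to a dot product. An illustrative ranking helper (names are hypothetical, not from the repository):

    import numpy as np

    def top_k(query_vec, code_vecs, k=10):
        # code_vecs: (N, d) unit-norm code vectors, query_vec: (d,) unit-norm query
        sims = code_vecs.dot(query_vec)   # cosine similarity via dot product
        return np.argsort(-sims)[:k]      # indices of the k most similar snippets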
Example #3
    def repr_code(self, model):
        logger.info('Loading the use data ..')
        methnames = data_loader.load_hdf5(
            self.data_path + self.data_params['use_methname'], 0, -1)
        methnames = pad(methnames, self.data_params['methname_len'])

        logger.info('Representing code ..')
        vecs = model.repr_code([methnames], batch_size=10000)
        vecs = vecs.astype('float32')  # np.float was removed in NumPy 1.24; use an explicit dtype
        vecs = normalize(vecs)
        return vecs
Example #4
    def valid(self, model, poolsize, K):
        """
        Validate in a code pool.
        param: poolsize - size of the code pool; if -1, load the whole test set
        """
        def ACC(real, predict):
            sum = 0.0
            for val in real:
                try:
                    index = predict.index(val)
                except ValueError:
                    index = -1
                if index != -1: sum = sum + 1
            return sum / float(len(real))

        def MAP(real, predict):
            sum = 0.0
            for id, val in enumerate(real):
                try:
                    index = predict.index(val)
                except ValueError:
                    index = -1
                if index != -1: sum = sum + (id + 1) / float(index + 1)
            return sum / float(len(real))

        def MRR(real, predict):
            sum = 0.0
            for val in real:
                try:
                    index = predict.index(val)
                except ValueError:
                    index = -1
                if index != -1: sum = sum + 1.0 / float(index + 1)
            return sum / float(len(real))

        def NDCG(real, predict):
            dcg = 0.0
            idcg = IDCG(len(real))
            for i, predictItem in enumerate(predict):
                if predictItem in real:
                    itemRelevance = 1
                    rank = i + 1
                    dcg += (math.pow(2, itemRelevance) -
                            1.0) * (math.log(2) / math.log(rank + 1))
            return dcg / float(idcg)

        def IDCG(n):
            idcg = 0
            itemRelevance = 1
            for i in range(n):
                idcg += (math.pow(2, itemRelevance) - 1.0) * (math.log(2) /
                                                              math.log(i + 2))
            return idcg

        #load valid dataset
        if self._eval_sets is None:
            methnames = data_loader.load_hdf5(
                self.data_path + self.data_params['valid_methname'], 0,
                poolsize)
            apiseqs = data_loader.load_hdf5(
                self.data_path + self.data_params['valid_apiseq'], 0, poolsize)
            tokens = data_loader.load_hdf5(
                self.data_path + self.data_params['valid_tokens'], 0, poolsize)
            descs = data_loader.load_hdf5(
                self.data_path + self.data_params['valid_desc'], 0, poolsize)
            self._eval_sets = {
                'methnames': methnames,
                'apiseqs': apiseqs,
                'tokens': tokens,
                'descs': descs
            }

        acc, mrr, map, ndcg = 0, 0, 0, 0
        data_len = len(self._eval_sets['descs'])
        for i in tqdm(range(data_len)):
            desc = self._eval_sets['descs'][i]  #good desc
            descs = pad([desc] * data_len, self.data_params['desc_len'])
            methnames = pad(self._eval_sets['methnames'],
                            self.data_params['methname_len'])
            apiseqs = pad(self._eval_sets['apiseqs'],
                          self.data_params['apiseq_len'])
            tokens = pad(self._eval_sets['tokens'],
                         self.data_params['tokens_len'])
            n_results = K
            sims = model.predict([methnames, apiseqs, tokens, descs],
                                 batch_size=data_len).flatten()
            negsims = np.negative(sims)
            predict = np.argsort(
                negsims)  #predict = np.argpartition(negsims, kth=n_results-1)
            predict = predict[:n_results]
            predict = [int(k) for k in predict]
            real = [i]
            acc += ACC(real, predict)
            mrr += MRR(real, predict)
            map += MAP(real, predict)
            ndcg += NDCG(real, predict)
        acc = acc / float(data_len)
        mrr = mrr / float(data_len)
        map = map / float(data_len)
        ndcg = ndcg / float(data_len)
        logger.info(f'ACC={acc}, MRR={mrr}, MAP={map}, nDCG={ndcg}')
        return acc, mrr, map, ndcg
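A quick sanity check of the ranking metrics defined above, using toy values (one relevant item returned at rank 2; these numbers are illustrative, not from the project):

    import math

    real, predict = [3], [5, 3, 7]       # single relevant item, returned at rank 2
    rank = predict.index(real[0]) + 1    # -> 2
    acc = 1.0                            # the item appears in the top-K list
    mrr = 1.0 / rank                     # -> 0.5
    ndcg = (math.pow(2, 1) - 1.0) * (math.log(2) / math.log(rank + 1))  # IDCG(1) == 1, so NDCG ~= 0.63
    print(acc, mrr, ndcg)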
Example #5
####################################################################################################################################################################
    data_param = config.get('data_params', dict())
    
    ### Loading training set
    data = pd.read_csv('use.rawcode.txt', sep="\n", header=None)
    
    for i in range(len(data[0])):
        method_string = data[0][i]
        code_embedding_DeepCS.replace_methname(method_string, i)
        
    methname = pd.read_csv('new_methname.txt', sep="\n", header=None)

    methnames = code_embedding_DeepCS.text_to_array(methname[0])
    
    
    descs = data_loader.load_hdf5(data_path + data_param['train_desc'], 0, args.chunk_size)
    good_descs = pad(descs, data_param['desc_len'])
    bad_descs = [desc for desc in descs]
    random.shuffle(bad_descs)
    bad_descs = pad(bad_descs, data_param['desc_len'])
    
    
    
    #refactoring
    refactored_mn = TestSuiteGenerator.generateTestSuite(model, methnames, args.chunk_size, args.gen, args.mutation_rate)

    refactored_mn = np.array(refactored_mn).reshape(args.chunk_size, 6)
    methnames = np.concatenate((methnames, refactored_mn), axis=0)

    good_descs = np.concatenate((good_descs, good_descs), axis=0)
    bad_descs = np.concatenate((bad_descs, bad_descs), axis=0)
Example #6
    def valid(self, model, poolsize, K):
        """
        Validate in a code pool.
        param: poolsize - size of the code pool; if -1, load the whole test set
        """
        def ACC(real, predict):
            sum = 0.0
            for val in real:
                try:
                    index = predict.index(val)
                except ValueError:
                    index = -1
                if index != -1: sum = sum + 1
            return sum / float(len(real))

        def MAP(real, predict):
            sum = 0.0
            for id, val in enumerate(real):
                try:
                    index = predict.index(val)
                except ValueError:
                    index = -1
                if index != -1: sum = sum + (id + 1) / float(index + 1)
            return sum / float(len(real))

        def MRR(real, predict):
            sum = 0.0
            for val in real:
                try:
                    index = predict.index(val)
                except ValueError:
                    index = -1
                if index != -1: sum = sum + 1.0 / float(index + 1)
            return sum / float(len(real))

        def NDCG(real, predict):
            dcg = 0.0
            idcg = IDCG(len(real))
            for i, predictItem in enumerate(predict):
                if predictItem in real:
                    itemRelevance = 1
                    rank = i + 1
                    dcg += (math.pow(2, itemRelevance) -
                            1.0) * (math.log(2) / math.log(rank + 1))
            return dcg / float(idcg)

        def IDCG(n):
            idcg = 0
            itemRelevance = 1
            for i in range(n):
                idcg += (math.pow(2, itemRelevance) - 1.0) * (math.log(2) /
                                                              math.log(i + 2))
            return idcg

        #load valid dataset

        print('self._eval_sets,', self._eval_sets)
        if self._eval_sets is None:
            methnames = data_loader.load_hdf5(
                self.data_path + self.data_params['valid_methname'], 0,
                poolsize)
            descs = data_loader.load_hdf5(
                self.data_path + self.data_params['valid_desc'], 0, poolsize)
            self._eval_sets = {'methnames': methnames, 'descs': descs}

        accs, mrrs, maps, ndcgs = [], [], [], []
        data_len = len(self._eval_sets['descs'])
        print('data_len', data_len)
        for i in tqdm(range(data_len)):
            desc = self._eval_sets['descs'][i]  #good desc
            descs = pad([desc] * data_len, self.data_params['desc_len'])
            methnames = pad(self._eval_sets['methnames'],
                            self.data_params['methname_len'])
            n_results = K
            sims = model.predict([methnames, descs],
                                 batch_size=data_len).flatten()
            negsims = np.negative(sims)
            predict = np.argpartition(negsims, kth=n_results - 1)
            predict = predict[:n_results]
            predict = [int(k) for k in predict]
            real = [i]
            accs.append(ACC(real, predict))
            mrrs.append(MRR(real, predict))
            maps.append(MAP(real, predict))
            ndcgs.append(NDCG(real, predict))
        logger.info(
            f'ACC={np.mean(accs)}, MRR={np.mean(mrrs)}, MAP={np.mean(maps)}, nDCG={np.mean(ndcgs)}'
        )
        return accs, mrrs, maps, ndcgs
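Note that np.argpartition only guarantees the position of the kth element, so the first n_results indices it returns are not sorted by similarity; with K > 1 the rank-sensitive metrics (MRR, MAP, NDCG) can therefore be distorted. A common pattern that keeps argpartition's speed while restoring the order within the top-K (illustrative, not from the repository):

    import numpy as np

    negsims = np.random.rand(1000)                   # stand-in for the negated similarities
    k = 10
    topk = np.argpartition(negsims, kth=k - 1)[:k]   # unordered top-k indices
    topk = topk[np.argsort(negsims[topk])]           # sort those k by (negated) similarity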
Example #7
    mode = 'test'
    filename = './log_folder/record.txt'
    threshold_CC = 0
    threshold_MC = 0.7
    symbols_SQ = 2
    seq = '[0 ,1]'
    seq = re.findall(r"\d+\.?\d*", seq)
    TargMetri = None
    CoverageStop = 0.9
    TestCaseNum = 2000
    minimalTest = 0
    r = record(filename, time.time())
    data_param = config.get('data_params', dict())

    ### Loading training set
    names = data_loader.load_hdf5(data_path + data_param['train_methname'], 0,
                                  data_param.get('chunk_size', 100000))
    methnames = pad(names, data_param['methname_len'])
    print(methnames[0])
    z = methnames[0].reshape(1, 6)
    print(z)
    z[0][1] = 0
    print(z)
    apis = data_loader.load_hdf5(
        data_path + data_param['train_apiseq'],
        0,
        data_param.get('chunk_size', 100000),
    )
    tokens = data_loader.load_hdf5(
        data_path + data_param['train_tokens'],
        0,
        data_param.get('chunk_size', 100000),