Exemple #1
0
def predict(
        model_path="/home/bohong/文档/mygit/cdpensearch/cdpensearch/oneEncoder/seqmodel.hdf5",
        generated_path="generatetag.txt"):
    """Score previously generated text against the holdout comments.

    Loads the saved seq2seq model and the code/comment text processors,
    reads every line of ``generated_path`` and prints whatever
    ``Seq2Seq_Inference.evaluate_model`` returns for those lines and the
    holdout comments.

    Args:
        model_path: Location of the saved Keras model. The default is the
            machine-specific path this function previously hard-coded.
        generated_path: Text file whose lines are passed to
            ``evaluate_model`` as the first argument.

    Returns:
        None. The score is printed to stdout.
    """
    # Only the holdout comments are needed for scoring; the other three
    # splits are still unpacked so a wrong-sized return fails loudly.
    _train_code, _holdout_code, _train_comment, holdout_comment = (
        read_training_files('../../data/processed_data/'))
    seq2seq_Model = load_model(model_path)

    # load_text_processor returns (token_count, preprocessor); the counts
    # are not used here.
    _, enc_pp = load_text_processor(OUTPUT_PATH / 'py_code_proc_v2.dpkl')
    _, dec_pp = load_text_processor(OUTPUT_PATH / 'py_comment_proc_v2.dpkl')
    seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=enc_pp,
                                    decoder_preprocessor=dec_pp,
                                    seq2seq_model=seq2seq_Model)

    # Read everything up front inside a context manager so the handle is
    # released even if evaluation fails.
    with open(generated_path) as generated_file:
        generated_lines = generated_file.readlines()

    score = seq2seq_inf.evaluate_model(generated_lines,
                                       holdout_comment,
                                       max_len=None)
    print(score)
 def __init__(self):
     """Assemble ``self.model`` from the pickled preprocessors and saved model.

     Reads ``body_pp.dpkl`` (encoder side) and ``title_pp.dpkl`` (decoder
     side) from the current working directory, loads
     ``seq2seq_model_tutorial.h5`` and wraps all three in a
     ``Seq2Seq_Inference`` object.
     """
     # NOTE(review): dpickle is presumably pickle-compatible, in which case
     # these files must come from a trusted source - confirm.
     with open('body_pp.dpkl', 'rb') as body_file:
         encoder_pp = dpickle.load(body_file)
     with open('title_pp.dpkl', 'rb') as title_file:
         decoder_pp = dpickle.load(title_file)

     trained_model = load_model('seq2seq_model_tutorial.h5')
     self.model = Seq2Seq_Inference(encoder_preprocessor=encoder_pp,
                                    decoder_preprocessor=decoder_pp,
                                    seq2seq_model=trained_model)
Exemple #3
0
  def evaluate_keras(self):
    """Score the model on the holdout frame and return the BLEU score.

    Builds a ``Seq2Seq_Inference`` from the preprocessors and model stored
    on ``self``, evaluates it on the ``body`` / ``issue_title`` columns of
    ``self.test_df`` with titles capped at 12, logs the result and returns
    it.
    """
    inference = Seq2Seq_Inference(
        encoder_preprocessor=self.body_pp,
        decoder_preprocessor=self.title_pp,
        seq2seq_model=self.seq2seq_Model,
    )

    bodies = self.test_df.body.tolist()
    titles = self.test_df.issue_title.tolist()
    bleu_score = inference.evaluate_model(holdout_bodies=bodies,
                                          holdout_titles=titles,
                                          max_len_title=12)
    logging.info("Bleu score: %s", bleu_score)
    return bleu_score
 def load_seq2seq_model(self):
     """Load the code-summary model and store it as ``self.seq2seq_inf``.

     Clears the Keras session, then reads the model and both text
     processors from ``self.seq2seq_path`` and wraps them in a
     ``Seq2Seq_Inference`` object.
     """
     K.clear_session()

     model_file = str(self.seq2seq_path / 'code_summary_seq2seq_model.h5')
     trained_model = load_model(model_file)

     # load_text_processor returns (token_count, preprocessor); only the
     # preprocessor is needed here.
     _, code_pp = load_text_processor(
         self.seq2seq_path / 'py_code_proc_v2.dpkl')
     _, comment_pp = load_text_processor(
         self.seq2seq_path / 'py_comment_proc_v2.dpkl')

     self.seq2seq_inf = Seq2Seq_Inference(
         encoder_preprocessor=code_pp,
         decoder_preprocessor=comment_pp,
         seq2seq_model=trained_model,
     )
Exemple #5
0
    def __init__(self):
        """Build ``self.model`` from files named by environment variables.

        ``BODY_PP_FILE``, ``TITLE_PP_FILE`` and ``MODEL_FILE`` select the
        encoder preprocessor, decoder preprocessor and Keras model; each
        falls back to a file name in the working directory when unset.
        Every chosen path is printed before it is read.
        """
        # NOTE(review): dpickle is presumably pickle-compatible, so paths
        # taken from the environment must point at trusted files - confirm.
        body_pp_file = os.getenv('BODY_PP_FILE', 'body_pp.dpkl')
        print('body_pp file {0}'.format(body_pp_file))
        with open(body_pp_file, 'rb') as handle:
            encoder_pp = dpickle.load(handle)

        title_pp_file = os.getenv('TITLE_PP_FILE', 'title_pp.dpkl')
        print('title_pp file {0}'.format(title_pp_file))
        with open(title_pp_file, 'rb') as handle:
            decoder_pp = dpickle.load(handle)

        model_file = os.getenv('MODEL_FILE', 'seq2seq_model_tutorial.h5')
        print('model file {0}'.format(model_file))
        trained_model = load_model(model_file)
        self.model = Seq2Seq_Inference(
            encoder_preprocessor=encoder_pp,
            decoder_preprocessor=decoder_pp,
            seq2seq_model=trained_model,
        )
Exemple #6
0
def detect(inputs, input_model_h5, input_title_preprocessor_dpkl,
           input_body_preprocessor_dpkl):
    """Generate an issue title for the first element of ``inputs``.

    Args:
        inputs: Indexable collection; only ``inputs[0]`` is used.
        input_model_h5: Path of the saved Keras model.
        input_title_preprocessor_dpkl: Path of the decoder (title)
            text processor.
        input_body_preprocessor_dpkl: Path of the encoder (body)
            text processor.

    Returns:
        Whatever ``Seq2Seq_Inference.generate_issue_title`` returns for
        ``inputs[0]``.

    Raises:
        IndexError: If ``inputs`` is an empty sequence.
    """
    # Load model, preprocessors. load_text_processor returns
    # (token_count, preprocessor); the counts are not needed here.
    seq2seq_Model = keras.models.load_model(input_model_h5)
    _, body_pp = load_text_processor(input_body_preprocessor_dpkl)
    _, title_pp = load_text_processor(input_title_preprocessor_dpkl)

    # Prepare inference.
    seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                    decoder_preprocessor=title_pp,
                                    seq2seq_model=seq2seq_Model)

    # Use the ``inputs`` parameter: indexing the builtin ``input``
    # function here raised TypeError on every call.
    return seq2seq_inf.generate_issue_title(inputs[0])
def load_summarizer(seq2seq_model_path, text_processor_path):
    """
    Load the code summarizer and return it ready for inference.

    Input:
        seq2seq_model_path: string prefix of the directory holding
            'py_func_sum_v9_.epoch16-val2.55276.hdf5'.
        text_processor_path: string prefix of the directory holding
            'py_code_proc_v2.dpkl' and 'py_comment_proc_v2.dpkl'.

    Returns: tuple of (Seq2Seq_Inference object, the default TensorFlow
        graph as returned by tf.get_default_graph()).

    Side effects: lowers TensorFlow log verbosity, sets
        TF_CPP_MIN_LOG_LEVEL and silences Python warnings process-wide.

    Author: Tyler Medlin

    """
    # The upstream code relies on many soon-to-be-deprecated functions,
    # so suppress the deprecation noise before loading anything.
    tf.logging.set_verbosity('ERROR')
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
    warnings.filterwarnings("ignore")

    # Trained model.
    logging.warning('Loading pre-trained model...')
    model_file = (seq2seq_model_path +
                  '/py_func_sum_v9_.epoch16-val2.55276.hdf5')
    trained_model = load_model(model_file)

    # Encoder (code) pre-processor; the token count is not needed.
    logging.warning('Loading text processor (encoder)...')
    _, code_pp = load_text_processor(
        text_processor_path + '/py_code_proc_v2.dpkl')

    # Decoder (docstrings/comments) pre-processor.
    logging.warning('Loading text processor (decoder)...')
    _, comment_pp = load_text_processor(
        text_processor_path + '/py_comment_proc_v2.dpkl')

    # Capture the graph before building the inference wrapper, matching
    # the original statement order.
    default_graph = tf.get_default_graph()

    summarizer = Seq2Seq_Inference(
        encoder_preprocessor=code_pp,
        decoder_preprocessor=comment_pp,
        seq2seq_model=trained_model,
    )
    return summarizer, default_graph
 def create_autotag(self, postgres, file_id):
     """Predict a tag for every paragraph of a file and persist the results.

     Reloads the code-summary model into ``self.seq2seq_inf``, fetches the
     paragraphs of ``file_id`` through ``postgres``, runs
     ``demo_model_predictions`` on them, then writes each predicted tag to
     ``without_docstrings.autotag`` under ``self.data_path`` and stores it
     via ``postgres.update_autotag`` for the paragraph id at the same
     position. The Keras session is cleared before and after.
     """
     K.clear_session()
     model_file = str(self.seq2seq_path / 'code_summary_seq2seq_model.h5')
     trained_model = load_model(model_file)
     # load_text_processor returns (token_count, preprocessor); only the
     # preprocessor is needed here.
     _, code_pp = load_text_processor(
         self.seq2seq_path / 'py_code_proc_v2.dpkl')
     _, comment_pp = load_text_processor(
         self.seq2seq_path / 'py_comment_proc_v2.dpkl')
     self.seq2seq_inf = Seq2Seq_Inference(
         encoder_preprocessor=code_pp,
         decoder_preprocessor=comment_pp,
         seq2seq_model=trained_model,
     )

     # The existing auto/manual tags returned by the query are not used.
     paras, paraids, _autotags, _manualtags = (
         postgres.get_paragraphs_fileid(file_id))
     snippets = list(map(str, paras))
     print("no_docstring_paraids = ", paraids)
     print("size of paragraphs = ", len(snippets))
     print("size of paraids = ", len(paraids))

     demo_testdf = pd.DataFrame({'code': snippets, 'comment': '', 'ref': ''})
     auto_tag = self.seq2seq_inf.demo_model_predictions(n=15,
                                                        df=demo_testdf)
     print("size of auto_tag = ", len(auto_tag))

     out_path = self.data_path / 'without_docstrings.autotag'
     with open(out_path, 'w', encoding='utf-8') as out_file:
         for position, tag in enumerate(auto_tag):
             out_file.write("%s\n" % tag)
             # Tags and paragraph ids are paired by position.
             postgres.update_autotag(paraids[position], tag)
     K.clear_session()
 def load_models(self):
     """Load the summary and embedding models onto ``self``.

     After clearing the Keras session this sets:
       * ``self.seq2seq_inf`` - inference wrapper around the code-summary
         model and its code/comment text processors;
       * ``self.code2emb_model`` - model read from ``self.code2emb_path``
         without compiling;
       * ``self.num_encoder_tokens_vector`` / ``self.enc_pp_vector`` - the
         pair returned by loading the code text processor a second time.
     A progress line is printed before each load.
     """
     K.clear_session()

     print("Going to load 'code_summary_seq2seq_model.h5'")
     summary_model_file = str(
         self.seq2seq_path / 'code_summary_seq2seq_model.h5')
     summary_model = load_model(summary_model_file)

     print("Going to load 'py_code_proc_v2.dpkl'")
     _, code_pp = load_text_processor(
         self.seq2seq_path / 'py_code_proc_v2.dpkl')

     print("Going to load 'py_comment_proc_v2.dpkl'")
     _, comment_pp = load_text_processor(
         self.seq2seq_path / 'py_comment_proc_v2.dpkl')

     print("Going to load 'Seq2Seq_Inference'")
     self.seq2seq_inf = Seq2Seq_Inference(
         encoder_preprocessor=code_pp,
         decoder_preprocessor=comment_pp,
         seq2seq_model=summary_model,
     )

     print("Going to load 'code2emb_model.hdf5'")
     embedding_model_file = str(self.code2emb_path / 'code2emb_model.hdf5')
     self.code2emb_model = load_model(embedding_model_file,
                                      custom_objects=None,
                                      compile=False)

     # NOTE(review): this reads the processor from seq2seq_path, not
     # code2emb_path - confirm that is intended.
     print("Going to load 'py_code_proc_v2.dpkl'")
     vector_processor = load_text_processor(
         self.seq2seq_path / 'py_code_proc_v2.dpkl')
     self.num_encoder_tokens_vector, self.enc_pp_vector = vector_processor
import os
import cPickle as pickle
from keras.models import load_model
import pandas as pd


# Demo script: load pickled GitHub-issue data and a saved Keras model, then
# show one prediction. The file imports cPickle, so it targets Python 2.
# NOTE: unpickling can execute arbitrary code; only load trusted .pkl files.

# Bundle of five objects; decoder_target_data is unpacked but not used below.
with open('github_issues-bundle.pkl', 'rb') as fp:
    X, Y, decoder_target_data, idx2word, word2idx = pickle.load(fp)

# Train/test split; only the shapes are reported.
with open('github_issues.pkl', 'rb') as fp:
    train_dataset, test_dataset = pickle.load(fp)

# NOTE(review): under Python 2 without print_function these print a tuple,
# e.g. ('Train', (rows, cols)) - confirm that is the intended output.
print('Train',train_dataset.shape)
print('Test',test_dataset.shape)

seq2seq_Model = load_model('seq2seq_model_keras.h5')


from seq2seq_utils import Seq2Seq_Inference
# X and Y from the bundle are passed as the encoder/decoder preprocessors,
# together with the index<->word lookups from the same bundle.
seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=X,
                                decoder_preprocessor=Y,
                                seq2seq_model=seq2seq_Model,
                                idx2word= idx2word,
                                word2idx= word2idx)

# Show predictions for n=1 row. X is passed as issue_df here.
# NOTE(review): presumably rows are sampled at random - confirm in seq2seq_utils.
seq2seq_inf.demo_model_predictions(n=1, issue_df=X, threshold=1)


# End-of-program marker.
print('EOP')
from seq2seq_utils import Seq2Seq_Inference

# Command-line flags: four plain path options plus the number of rows to
# predict (integer, 50 when omitted). Registration order is preserved.
parser = argparse.ArgumentParser()
for _flag in ("--input_model_h5",
              "--input_body_preprocessor_dpkl",
              "--input_title_preprocessor_dpkl",
              "--input_testdf_csv"):
    parser.add_argument(_flag)
parser.add_argument("--input_prediction_count", type=int, default=50)
args = parser.parse_args()
print(args)

# Test data frame read from the CSV given on the command line.
testdf = pd.read_csv(args.input_testdf_csv)

# Trained model and the body (encoder) / title (decoder) preprocessors.
seq2seq_Model = keras.models.load_model(args.input_model_h5)
num_encoder_tokens, body_pp = load_text_processor(
    args.input_body_preprocessor_dpkl
)
num_decoder_tokens, title_pp = load_text_processor(
    args.input_title_preprocessor_dpkl
)

# Inference wrapper around the model and both preprocessors.
seq2seq_inf = Seq2Seq_Inference(
    encoder_preprocessor=body_pp,
    decoder_preprocessor=title_pp,
    seq2seq_model=seq2seq_Model,
)

# Output predictions for the requested number of rows in the test set.
seq2seq_inf.demo_model_predictions(
    n=args.input_prediction_count,
    issue_df=testdf,
)
Exemple #12
0
# 
# To evaluate this model we are going to do two things:
# 
# 1.  Manually inspect the results of predicted docstrings for code snippets, to make sure they look sensible.
# 2.  Calculate the [BLEU Score](https://en.wikipedia.org/wiki/BLEU) so that we can quantitatively benchmark different iterations of this algorithm and guide hyper-parameter tuning.

# ### Manually Inspect Results (on holdout set)

# In[15]:


from seq2seq_utils import Seq2Seq_Inference
import pandas as pd

# Wrap the trained model and both preprocessors for inference.
seq2seq_inf = Seq2Seq_Inference(
    encoder_preprocessor=enc_pp,
    decoder_preprocessor=dec_pp,
    seq2seq_model=seq2seq_Model,
)

# Holdout code/comment pairs with an empty 'ref' column, then predictions
# for n=15 rows of that frame.
demo_testdf = pd.DataFrame({
    'code': holdout_code,
    'comment': holdout_comment,
    'ref': '',
})
seq2seq_inf.demo_model_predictions(n=15, df=demo_testdf)


# ### Comment on manual inspection of results:
# 
# The predicted code summaries are not perfect, but we can see that the model has learned to extract some semantic meaning from the code.  That's all we need to get reasonable results in this case.  

# ### Calculate BLEU Score (on holdout set)
# 
# BLEU Score is described [in this wikipedia article](https://en.wikipedia.org/wiki/BLEU), and is a way to measure the efficacy of summarization/translation such as the one we conducted here.  This metric is useful if you wish to conduct extensive hyper-parameter tuning and try to improve the seq2seq model.

# In[24]: