# NOTE: Evidence (the abstract base class for evidence types) and LDA (a
# wrapper around a trained topic model) are assumed to be defined elsewhere
# in this project; the imports below cover everything used in this file.
import os
import re
from itertools import chain

import numpy as np
import tensorflow as tf


class Types(Evidence):

    def load_embedding(self, save_dir):
        embed_save_dir = os.path.join(save_dir, 'embed_types')
        self.lda = LDA(from_file=os.path.join(embed_save_dir, 'model.pkl'))

    def read_data_point(self, program):
        types = program['types'] if 'types' in program else []
        return list(set(types))

    def wrangle(self, data):
        return np.array(self.lda.infer(data), dtype=np.float32)

    def placeholder(self, config):
        return tf.placeholder(tf.float32, [config.batch_size, self.lda.model.n_components])

    def encode(self, inputs, config):
        with tf.variable_scope('types'):
            encoding = tf.layers.dense(inputs, config.units)
            return encoding

    @staticmethod
    def from_call(call):
        split = list(reversed([q for q in call.split('(')[0].split('.')[:-1] if q[0].isupper()]))
        types = [split[1], split[0]] if len(split) > 1 else [split[0]]
        types = [re.sub('<.*', '', t) for t in types]  # ignore generic types in evidence

        args = call.split('(')[1].split(')')[0].split(',')
        args = [arg.split('.')[-1] for arg in args]
        args = [re.sub('<.*', '', arg) for arg in args]  # remove generics
        args = [re.sub(r'\[\]', '', arg) for arg in args]  # remove array type
        types_args = [arg for arg in args if not arg == '' and not arg.startswith('Tau_')]

        return types + types_args
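# --- Usage sketch for Types.from_call (hypothetical input, not from the
# project's data): the receiver type is read off the qualified prefix and
# the argument types off the parameter list, with generics stripped.
if __name__ == '__main__':
    example = 'java.util.HashMap<K,V>.put(java.lang.Object,java.lang.Object)'
    print(Types.from_call(example))  # prints: ['HashMap', 'Object', 'Object']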
class APICalls(Evidence):

    def load_embedding(self, save_dir):
        embed_save_dir = os.path.join(save_dir, 'embed_apicalls')
        self.lda = LDA(from_file=os.path.join(embed_save_dir, 'model.pkl'))

    def read_data_point(self, program):
        apicalls = program['apicalls'] if 'apicalls' in program else []
        return list(set(apicalls))

    def wrangle(self, data):
        return np.array(self.lda.infer(data), dtype=np.float32)

    def placeholder(self, config):
        return tf.placeholder(tf.float32, [config.batch_size, self.lda.model.n_components])

    def encode(self, inputs, config):
        with tf.variable_scope('apicalls'):
            encoding = tf.layers.dense(inputs, config.units)
            return encoding

    @staticmethod
    def from_call(call):
        split = call.split('(')[0].split('.')
        cls, name = split[-2:]
        return [name] if not cls == name else []
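# --- Usage sketch (hypothetical call strings): from_call keeps only the
# method name and drops constructor-style calls, where the last two
# qualified components coincide.
if __name__ == '__main__':
    print(APICalls.from_call('java.io.BufferedReader.readLine()'))        # ['readLine']
    print(APICalls.from_call('java.io.BufferedReader.BufferedReader()'))  # []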
class APICalls(Evidence):

    def load_embedding(self, save_dir):
        embed_save_dir = os.path.join(save_dir, 'embed_apicalls')
        self.lda = LDA(from_file=os.path.join(embed_save_dir, 'model.pkl'))

    def read_data_point(self, program):
        apicalls = program['apicalls'] if 'apicalls' in program else []
        return list(set(apicalls))

    def wrangle(self, data):
        return np.array(self.lda.infer(data), dtype=np.float32)

    def placeholder(self, config):
        return tf.placeholder(tf.float32, [config.batch_size, self.lda.model.n_components])

    def exists(self, inputs):
        return tf.not_equal(tf.count_nonzero(inputs, axis=1), 0)

    def init_sigma(self, config):
        with tf.variable_scope('apicalls'):
            self.sigma = tf.get_variable('sigma', [])

    def encode(self, inputs, config):
        with tf.variable_scope('apicalls'):
            encoding = tf.layers.dense(inputs, self.units, activation=tf.nn.tanh)
            for i in range(self.num_layers - 1):
                encoding = tf.layers.dense(encoding, self.units, activation=tf.nn.tanh)
            w = tf.get_variable('w', [self.units, config.latent_size])
            b = tf.get_variable('b', [config.latent_size])
            latent_encoding = tf.nn.xw_plus_b(encoding, w, b)
            return latent_encoding

    def evidence_loss(self, psi, encoding, config):
        sigma_sq = tf.square(self.sigma)
        loss = 0.5 * (config.latent_size * tf.log(2 * np.pi * sigma_sq + 1e-10)
                      + tf.square(encoding - psi) / sigma_sq)
        return loss

    @staticmethod
    def from_call(callnode):
        call = callnode['_call']
        call = re.sub(r'^\$.*\$', '', call)  # get rid of predicates
        split = call.split('(')[0].split('.')
        cls, name = split[-2:]
        cls = cls.split('<')[0]  # class name might be generic but method name is never
        return [name] if not cls == name else []
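# evidence_loss above follows the form of a Gaussian negative log-likelihood
# with mean psi and a learned scalar deviation sigma (the config.latent_size
# factor scales the log-normalization term, and 1e-10 guards against log(0)).
# A minimal numpy sketch of the same computation; latent_size, sigma, psi and
# encoding are all invented values for illustration:
if __name__ == '__main__':
    latent_size = 4
    sigma = 0.5
    psi = np.zeros(latent_size, dtype=np.float32)
    encoding = np.ones(latent_size, dtype=np.float32)
    sigma_sq = sigma ** 2
    loss = 0.5 * (latent_size * np.log(2 * np.pi * sigma_sq + 1e-10)
                  + (encoding - psi) ** 2 / sigma_sq)
    print(loss)  # one loss term per latent dimension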
class Types(Evidence):

    def load_embedding(self, save_dir):
        embed_save_dir = os.path.join(save_dir, 'embed_types')
        self.lda = LDA(from_file=os.path.join(embed_save_dir, 'model.pkl'))

    def read_data_point(self, program):
        types = program['types'] if 'types' in program else []
        return list(set(types))

    def wrangle(self, data):
        return np.array(self.lda.infer(data), dtype=np.float32)

    def placeholder(self, config):
        return tf.placeholder(tf.float32, [config.batch_size, self.lda.model.n_components])

    def exists(self, inputs):
        return tf.not_equal(tf.count_nonzero(inputs, axis=1), 0)

    def init_sigma(self, config):
        with tf.variable_scope('types'):
            self.sigma = tf.get_variable('sigma', [])

    def encode(self, inputs, config):
        with tf.variable_scope('types'):
            encoding = tf.layers.dense(inputs, self.units)
            w = tf.get_variable('w', [self.units, config.latent_size])
            b = tf.get_variable('b', [config.latent_size])
            latent_encoding = tf.nn.xw_plus_b(encoding, w, b)
            return latent_encoding

    def evidence_loss(self, psi, encoding, config):
        sigma_sq = tf.square(self.sigma)
        loss = 0.5 * (config.latent_size * tf.log(2 * np.pi * sigma_sq + 1e-10)
                      + tf.square(encoding - psi) / sigma_sq)
        return loss

    @staticmethod
    def from_call(call):
        split = list(reversed([q for q in call.split('(')[0].split('.')[:-1] if q[0].isupper()]))
        types = [split[1], split[0]] if len(split) > 1 else [split[0]]
        types = [re.sub('<.*', '', t) for t in types]  # ignore generic types in evidence

        args = call.split('(')[1].split(')')[0].split(',')
        args = [arg.split('.')[-1] for arg in args]
        args = [re.sub('<.*', '', arg) for arg in args]  # remove generics
        args = [re.sub(r'\[\]', '', arg) for arg in args]  # remove array type
        types_args = [arg for arg in args if not arg == '' and not arg.startswith('Tau_')]

        return types + types_args
class Keywords(Evidence):

    # CoreNLP English stop words
    STOP_WORDS = {
        "'ll", "'s", "'m", "a", "about", "above", "after", "again", "against",
        "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be",
        "because", "been", "before", "being", "below", "between", "both", "but",
        "by", "can", "can't", "cannot", "could", "couldn't", "did", "didn't",
        "do", "does", "doesn't", "doing", "don't", "down", "during", "each",
        "few", "for", "from", "further", "had", "hadn't", "has", "hasn't",
        "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her",
        "here", "here's", "hers", "herself", "him", "himself", "his", "how",
        "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is",
        "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most",
        "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on",
        "once", "only", "or", "other", "ought", "our", "ours", "ourselves",
        "out", "over", "own", "same", "shan't", "she", "she'd", "she'll",
        "she's", "should", "shouldn't", "so", "some", "such", "than", "that",
        "that's", "the", "their", "theirs", "them", "themselves", "then",
        "there", "there's", "these", "they", "they'd", "they'll", "they're",
        "they've", "this", "those", "through", "to", "too", "under", "until",
        "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've",
        "were", "weren't", "what", "what's", "when", "when's", "where",
        "where's", "which", "while", "who", "who's", "whom", "why", "why's",
        "with", "won't", "would", "wouldn't", "you", "you'd", "you'll",
        "you're", "you've", "your", "yours", "yourself", "yourselves",
        "return", "arent", "cant", "couldnt", "didnt", "doesnt", "dont",
        "hadnt", "hasnt", "havent", "hes", "heres", "hows", "im", "isnt",
        "its", "lets", "mustnt", "shant", "shes", "shouldnt", "thats",
        "theres", "theyll", "theyre", "theyve", "wasnt", "were", "werent",
        "whats", "whens", "wheres", "whos", "whys", "wont", "wouldnt", "youd",
        "youll", "youre", "youve"
    }

    def load_embedding(self, save_dir):
        embed_save_dir = os.path.join(save_dir, 'embed_keywords')
        self.lda = LDA(from_file=os.path.join(embed_save_dir, 'model.pkl'))

    def read_data_point(self, program):
        keywords = program['keywords'] if 'keywords' in program else []
        return list(set(keywords))

    def wrangle(self, data):
        return np.array(self.lda.infer(data), dtype=np.float32)

    def placeholder(self, config):
        return tf.placeholder(tf.float32, [config.batch_size, self.lda.model.n_components])

    def exists(self, inputs):
        return tf.not_equal(tf.count_nonzero(inputs, axis=1), 0)

    def init_sigma(self, config):
        with tf.variable_scope('keywords'):
            self.sigma = tf.get_variable('sigma', [])

    def encode(self, inputs, config):
        with tf.variable_scope('keywords'):
            encoding = tf.layers.dense(inputs, self.units, activation=tf.nn.tanh)
            for i in range(self.num_layers - 1):
                encoding = tf.layers.dense(encoding, self.units, activation=tf.nn.tanh)
            w = tf.get_variable('w', [self.units, config.latent_size])
            b = tf.get_variable('b', [config.latent_size])
            latent_encoding = tf.nn.xw_plus_b(encoding, w, b)
            return latent_encoding

    def evidence_loss(self, psi, encoding, config):
        sigma_sq = tf.square(self.sigma)
        loss = 0.5 * (config.latent_size * tf.log(2 * np.pi * sigma_sq + 1e-10)
                      + tf.square(encoding - psi) / sigma_sq)
        return loss

    @staticmethod
    def split_camel(s):
        s = re.sub('(.)([A-Z][a-z]+)', r'\1#\2', s)   # UC followed by LC
        s = re.sub('([a-z0-9])([A-Z])', r'\1#\2', s)  # LC followed by UC
        return s.split('#')

    @staticmethod
    def from_call(callnode):
        call = callnode['_call']
        call = re.sub(r'^\$.*\$', '', call)  # get rid of predicates
        qualified = call.split('(')[0]
        qualified = re.sub('<.*>', '', qualified).split('.')  # remove generics for keywords

        # add qualified names (java, util, xml, etc.), API calls and types
        keywords = list(chain.from_iterable([Keywords.split_camel(s) for s in qualified])) + \
            list(chain.from_iterable([Keywords.split_camel(c) for c in APICalls.from_call(callnode)])) + \
            list(chain.from_iterable([Keywords.split_camel(t) for t in Types.from_call(callnode)]))

        # convert to lower case, omit stop words and take the set
        return list(set([k.lower() for k in keywords if k.lower() not in Keywords.STOP_WORDS]))
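# --- Usage sketch for camel-case splitting (hypothetical identifiers):
# split_camel inserts '#' at case transitions and splits on it. A full
# Keywords.from_call example appears after the final Types definition below,
# since from_call dispatches to the call-node variants of APICalls.from_call
# and Types.from_call at call time.
if __name__ == '__main__':
    print(Keywords.split_camel('BufferedReader'))   # ['Buffered', 'Reader']
    print(Keywords.split_camel('readXMLDocument'))  # ['read', 'XML', 'Document']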
class Types(Evidence):

    def load_embedding(self, save_dir):
        embed_save_dir = os.path.join(save_dir, 'embed_types')
        self.lda = LDA(from_file=os.path.join(embed_save_dir, 'model.pkl'))

    def read_data_point(self, program):
        types = program['types'] if 'types' in program else []
        return list(set(types))

    def wrangle(self, data):
        return np.array(self.lda.infer(data), dtype=np.float32)

    def placeholder(self, config):
        return tf.placeholder(tf.float32, [config.batch_size, self.lda.model.n_components])

    def exists(self, inputs):
        return tf.not_equal(tf.count_nonzero(inputs, axis=1), 0)

    def init_sigma(self, config):
        with tf.variable_scope('types'):
            self.sigma = tf.get_variable('sigma', [])

    def encode(self, inputs, config):
        with tf.variable_scope('types'):
            encoding = tf.layers.dense(inputs, self.units, activation=tf.nn.tanh)
            for i in range(self.num_layers - 1):
                encoding = tf.layers.dense(encoding, self.units, activation=tf.nn.tanh)
            w = tf.get_variable('w', [self.units, config.latent_size])
            b = tf.get_variable('b', [config.latent_size])
            latent_encoding = tf.nn.xw_plus_b(encoding, w, b)
            return latent_encoding

    def evidence_loss(self, psi, encoding, config):
        sigma_sq = tf.square(self.sigma)
        loss = 0.5 * (config.latent_size * tf.log(2 * np.pi * sigma_sq + 1e-10)
                      + tf.square(encoding - psi) / sigma_sq)
        return loss

    @staticmethod
    def from_call(callnode):
        call = callnode['_call']
        call = re.sub(r'^\$.*\$', '', call)  # get rid of predicates
        split = list(reversed([q for q in call.split('(')[0].split('.')[:-1] if q[0].isupper()]))
        types = [split[1], split[0]] if len(split) > 1 else [split[0]]
        types = [re.sub('<.*', '', t) for t in types]  # ignore generic types in evidence

        args = call.split('(')[1].split(')')[0].split(',')
        args = [arg.split('.')[-1] for arg in args]
        args = [re.sub('<.*', '', arg) for arg in args]  # remove generics
        args = [re.sub(r'\[\]', '', arg) for arg in args]  # remove array type
        types_args = [arg for arg in args if not arg == '' and not arg.startswith('Tau_')]

        if '_throws' in callnode:
            throws = [throw.split('.')[-1] for throw in callnode['_throws']]
            throws = [re.sub('<.*', '', throw) for throw in throws]  # remove generics
            throws = [re.sub(r'\[\]', '', throw) for throw in throws]  # remove array type
            throws = [throw for throw in throws if not throw.startswith('Tau_')]
        else:
            throws = []

        if '_returns' in callnode:
            ret = callnode['_returns'].split('.')[-1]
            ret = re.sub('<.*', '', ret)  # remove generics
            ret = re.sub(r'\[\]', '', ret)  # remove array type
            returns = [] if ret.startswith('Tau_') or ret == 'void' else [ret]
        else:
            returns = []

        return types + types_args + throws + returns
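# --- Usage sketch (hypothetical call node, invented for illustration) ---
# With the call-node variants above in scope, Types.from_call also folds in
# exception and return types, and Keywords.from_call aggregates all three
# evidences into lower-cased, stop-word-filtered keywords.
if __name__ == '__main__':
    callnode = {
        '_call': 'java.io.BufferedReader.readLine()',
        '_throws': ['java.io.IOException'],
        '_returns': 'java.lang.String',
    }
    print(Types.from_call(callnode))
    # ['BufferedReader', 'IOException', 'String']
    print(sorted(Keywords.from_call(callnode)))
    # ['buffered', 'exception', 'io', 'java', 'line', 'read', 'reader', 'string']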