def run_train(model_class: Type[Model],
              train_data_dirs: List[RichPath],
              valid_data_dirs: List[RichPath],
              save_folder: str,
              hyperparameters: Dict[str, Any],
              azure_info_path: Optional[str],
              run_name: str,
              quiet: bool = False,
              max_files_per_dir: Optional[int] = None,
              parallelize: bool = True) -> RichPath:
    assert parallelize  # Sequential data loading is currently unsupported.
    model = model_class(hyperparameters,
                        run_name=run_name,
                        model_save_dir=save_folder,
                        log_save_dir=save_folder)
    if os.path.exists(model.model_save_path):
        model = model_restore_helper.restore(RichPath.create(model.model_save_path),
                                             is_train=True)
        model.train_log("Resuming training run %s of model %s with following hypers:\n%s"
                        % (run_name, model.__class__.__name__, str(hyperparameters)))
        resume = True
    else:
        model.train_log("Tokenizing and building vocabulary for code snippets and queries. "
                        "This step may take several hours.")
        model.load_metadata(train_data_dirs,
                            max_files_per_dir=max_files_per_dir,
                            parallelize=parallelize)
        model.make_model(is_train=True)
        model.train_log("Starting training run %s of model %s with following hypers:\n%s"
                        % (run_name, model.__class__.__name__, str(hyperparameters)))
        resume = False

    philly_job_id = os.environ.get('PHILLY_JOB_ID')
    if philly_job_id is not None:
        # We are running on Philly; write out the model name in an auxiliary file.
        with open(os.path.join(save_folder, philly_job_id + '.job'), 'w') as f:
            f.write(os.path.basename(model.model_save_path))

    wandb.config.update(model.hyperparameters)
    model.train_log("Loading training and validation data.")
    train_data = model.load_data_from_dirs(train_data_dirs,
                                           is_test=False,
                                           max_files_per_dir=max_files_per_dir,
                                           parallelize=parallelize)
    valid_data = model.load_data_from_dirs(valid_data_dirs,
                                           is_test=False,
                                           max_files_per_dir=max_files_per_dir,
                                           parallelize=parallelize)
    model.train_log("Begin Training.")
    model_path = model.train(train_data, valid_data, azure_info_path,
                             quiet=quiet, resume=resume)
    return model_path
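# A minimal usage sketch for run_train (illustrative only: NeuralBoWModel, the
# data-directory layout, and get_default_hyperparameters() are assumptions about
# the surrounding repository, not part of this excerpt):
def example_train() -> None:
    train_dirs = [RichPath.create('../resources/data/python/final/jsonl/train')]
    valid_dirs = [RichPath.create('../resources/data/python/final/jsonl/valid')]
    saved_model_path = run_train(model_class=NeuralBoWModel,  # hypothetical Model subclass
                                 train_data_dirs=train_dirs,
                                 valid_data_dirs=valid_dirs,
                                 save_folder='./trained_models',
                                 hyperparameters=NeuralBoWModel.get_default_hyperparameters(),
                                 azure_info_path=None,
                                 run_name='example-run')
    print("Trained model saved to %s" % saved_model_path)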
def load_model(self):
    model_path = RichPath.create(self.local_model_path, None)
    print("Restoring model from %s" % model_path)
    self.model = model_restore_helper.restore(path=model_path,
                                              is_train=False,
                                              hyper_overrides={})
    for language in ['python', 'go', 'javascript', 'java', 'php', 'ruby']:
        print("Loading language: %s" % language)
        with open('../resources/data/{}_dedupe_definitions_v2.pkl'.format(language), 'rb') as f:
            self.definitions[language] = pickle.load(f)
        if os.path.exists('/datadrive/{}.ann'.format(language)):
            # Reuse a previously built index from disk.
            self.indices[language] = AnnoyIndex(128, 'angular')
            self.indices[language].load('/datadrive/{}.ann'.format(language))
        else:
            # Embed every function definition and build a fresh index.
            indexes = [{'code_tokens': d['function_tokens'], 'language': d['language']}
                       for d in tqdm(self.definitions[language])]
            code_representations = self.model.get_code_representations(indexes)
            self.indices[language] = AnnoyIndex(code_representations[0].shape[0], 'angular')
            for index, vector in tqdm(enumerate(code_representations)):
                assert vector is not None
                self.indices[language].add_item(index, vector)
            self.indices[language].build(1000)
            self.indices[language].save('/datadrive/{}.ann'.format(language))
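# A sketch of how the indices built above might be queried at serving time. It
# assumes the model's get_query_representations accepts records with
# 'docstring_tokens' and 'language' keys (as the evaluation code elsewhere in
# this excerpt suggests); the method name query_nearest is hypothetical.
def query_nearest(self, query: str, language: str, top_k: int = 10):
    query_vector = self.model.get_query_representations(
        [{'docstring_tokens': query.split(), 'language': language}])[0]
    neighbor_ids, distances = self.indices[language].get_nns_by_vector(
        query_vector, top_k, include_distances=True)
    # Map index positions back to the deduplicated definition records.
    return [(self.definitions[language][idx]['url'], dist)
            for idx, dist in zip(neighbor_ids, distances)]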
def __init__(self,
             model_path: RichPath,
             test_batch_size: int = 1000,
             distance_metric: str = 'cosine',
             quiet: bool = False,
             hypers_override: Optional[Dict[str, Any]] = None) -> None:
    self.__model = model_restore_helper.restore(path=model_path,
                                                is_train=False,
                                                hyper_overrides=hypers_override)
    self.__quiet = quiet
    self.__test_batch_size = test_batch_size
    self.__distance_metric = distance_metric
def run(arguments) -> None:
    azure_info_path = arguments.get('--azure-info', None)
    model_path = RichPath.create(arguments['MODEL_PATH'], azure_info_path=azure_info_path)
    model = model_restore_helper.restore(path=model_path, is_train=False)

    if arguments['--query']:
        embeddings, elements = model.get_query_token_embeddings()
    else:
        embeddings, elements = model.get_code_token_embeddings(arguments['--language'])

    max_num_elements = int(arguments['--lim-items'])
    if max_num_elements > 0:
        embeddings, elements = embeddings[:max_num_elements], elements[:max_num_elements]

    print(f'Collected {len(elements)} elements to visualize.')
    embeddings = model.sess.run(fetches=embeddings)

    if arguments['plot-tsne']:
        emb_2d = TSNE(n_components=2, verbose=1,
                      metric=arguments['--distance-metric']).fit_transform(embeddings)
        plt.scatter(emb_2d[:, 0], emb_2d[:, 1])
        for i in range(len(elements)):
            plt.annotate(elements[i], xy=(emb_2d[i, 0], emb_2d[i, 1]))
        plt.show()
    elif arguments['print-nns']:
        flat_distances = pdist(embeddings, arguments['--distance-metric'])
        num_nns = int(arguments['--num-nns'])
        for i, element in enumerate(elements):
            distance_from_i = np.fromiter(
                (flat_distances[square_to_condensed(i, j, len(elements))] if i != j else float('inf')
                 for j in range(len(elements))),
                dtype=float)
            nns = [int(k) for k in np.argsort(distance_from_i)[:num_nns]]  # indices of the num_nns nearest neighbors
            if distance_from_i[nns[0]] > float(arguments['DISTANCE_THRESHOLD']):
                continue
            try:
                print(f'{element} --> '
                      + ', '.join(f'{elements[n]} ({distance_from_i[n]:.2f})' for n in nns))
            except Exception:  # e.g. tokens the terminal encoding cannot print
                print('Error printing token for nearest neighbors pair.')
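# square_to_condensed is used above but not defined in this excerpt. A sketch of
# the standard implementation: it maps a square-matrix coordinate (i, j), i != j,
# to its position in the condensed distance array that
# scipy.spatial.distance.pdist returns for n points.
def square_to_condensed(i: int, j: int, n: int) -> int:
    # pdist stores only the upper triangle row by row, so order the pair first.
    if i > j:
        i, j = j, i
    return n * i - (i * (i + 1)) // 2 + (j - i - 1)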
          file=sys.stderr)
    sys.exit(1)

print("Fetching run files from W&B...")
gz_run_files = [f for f in run.files() if f.name.endswith('gz')]
if not gz_run_files:
    print("ERROR: Run contains no model-like files")
    sys.exit(1)
model_file = gz_run_files[0].download(replace=True)
local_model_path = model_file.name
run_id = args_wandb_run_id.split('/')[-1]

model_path = RichPath.create(local_model_path, None)
print("Restoring model from %s" % model_path)
model = model_restore_helper.restore(path=model_path,
                                     is_train=False,
                                     hyper_overrides={})

predictions = []
for language in ('python', 'go', 'javascript', 'java', 'php', 'ruby'):
    print("Evaluating language: %s" % language)
    with open('../resources/data/{}_dedupe_definitions_v2.pkl'.format(language), 'rb') as f:
        definitions = pickle.load(f)
    # Each definition record carries the keys:
    # 'nwo', 'sha', 'path', 'language', 'identifier', 'parameters', 'argument_list',
    # 'return_statement', 'docstring', 'docstring_summary', 'docstring_tokens',
    # 'function', 'function_tokens', 'url', 'score'
    indexes = [{'code': d['function'],
                'code_tokens': d['function_tokens'],
                'language': d['language']
def run(arguments) -> None:
    azure_info_path = arguments.get('--azure-info', None)
    data_path = RichPath.create(arguments['DATA_PATH'], azure_info_path)
    assert data_path.is_dir(), "%s is not a folder" % (data_path,)

    hypers_override = arguments.get('--hypers-override')
    if hypers_override is not None:
        hypers_override = json.loads(hypers_override)
    else:
        hypers_override = {}

    model_path = RichPath.create(arguments['MODEL_PATH'], azure_info_path=azure_info_path)
    model = model_restore_helper.restore(path=model_path,
                                         is_train=False,
                                         hyper_overrides=hypers_override)

    num_elements_to_take = int(arguments['--max-num-items'])
    data = chain(*(list(f.read_by_file_suffix())
                   for f in data_path.iterate_filtered_files_in_dir('*.jsonl.gz')))
    if num_elements_to_take == 0:  # Take all elements.
        data = list(data)
    else:
        assert num_elements_to_take > 0
        data = take(num_elements_to_take, data)

    num_nns = int(arguments['--num-nns'])
    if arguments['--code']:
        representations = model.get_code_representations(data)
    elif arguments['--query']:
        representations = model.get_query_representations(data)
    else:
        code_representations = model.get_code_representations(data)
        query_representations = model.get_query_representations(data)
        representations = np.concatenate([code_representations, query_representations], axis=-1)

    # Drop samples for which no representation could be computed.
    filtered_representations = []
    filtered_data = []  # type: List[Dict[str, Any]]
    for i, representation in enumerate(representations):
        if representation is None:
            continue
        filtered_representations.append(representation)
        filtered_data.append(data[i])

    filtered_representations = np.stack(filtered_representations, axis=0)
    flat_distances = pdist(filtered_representations, arguments['--distance-metric'])
    for i, sample in enumerate(filtered_data):
        distance_from_i = np.fromiter(
            (flat_distances[square_to_condensed(i, j, len(filtered_data))] if i != j else float('inf')
             for j in range(len(filtered_data))),
            dtype=float)
        nns = [int(k) for k in np.argsort(distance_from_i)[:num_nns]]  # indices of the num_nns nearest neighbors
        if distance_from_i[nns[0]] > float(arguments['--distance-threshold']):
            continue

        print('===============================================================')
        print(f"{sample['repo']}:{sample['path']}:{sample['lineno']}")
        print(to_string(sample['original_string'], language=sample['language']))
        for j in range(num_nns):
            print()
            print(f'Nearest Neighbour {j + 1}: '
                  f'{filtered_data[nns[j]]["repo"]}:{filtered_data[nns[j]]["path"]}:{filtered_data[nns[j]]["lineno"]} '
                  f'(distance {distance_from_i[nns[j]]})')
            print(to_string(filtered_data[nns[j]]['original_string'],
                            language=filtered_data[nns[j]]['language']))
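# take (used above) is not defined in this excerpt; this sketch assumes it
# follows the standard itertools recipe of the same name:
from itertools import islice

def take(n: int, iterable):
    """Return the first n items of the iterable as a list."""
    return list(islice(iterable, n))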