def get_staqc_dataset(path: RichPath) -> List[Dict[str, Any]]:
    # Load the pickled StaQC snippets and question titles, both keyed by question id.
    codes = path.get_filtered_files_in_dir('python*qid_by*code.pickle')[0].read_as_pickle()
    titles = path.get_filtered_files_in_dir('python*qid_by*title.pickle')[0].read_as_pickle()
    data = chain([{'code': code,
                   'code_tokens': tokenize_python_from_string(code, func_only=False).code_tokens,
                   'docstring': titles[_id],
                   'docstring_tokens': tokenize_docstring_from_string(titles[_id]),
                   'language': 'python'}
                  for _id, code in codes.items()])
    filtered_data = filter_untokenizable_code(data)
    log_row_count_diff(original_data=codes.items(), filtered_data=filtered_data, label='StaQC')

    assert len(filtered_data) > 0, 'No code tokens retrieved after applying filters for StaQC.'
    return filtered_data
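# The helpers `filter_untokenizable_code` and `log_row_count_diff` used by this loader
# (and by get_conala_dataset below) are defined elsewhere. A minimal sketch of their
# assumed behaviour -- drop rows whose code produced no tokens, and report the loss;
# the implementations here are illustrative, not the project's actual code:
from typing import Any, Dict, Iterable, List

def filter_untokenizable_code(data: Iterable[Dict[str, Any]]) -> List[Dict[str, Any]]:
    # Keep only rows where tokenization yielded a non-empty token list.
    return [row for row in data if row['code_tokens']]

def log_row_count_diff(original_data, filtered_data: List[Dict[str, Any]], label: str) -> None:
    original_count = len(list(original_data))
    print(f'{label}: kept {len(filtered_data)} of {original_count} rows after filtering.')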
def run_train(model_class: Type[Model],
              train_data_path: RichPath,
              valid_data_path: RichPath,
              save_folder: str,
              hyperparameters: Dict[str, Any],
              run_name: Optional[str] = None,
              quiet: bool = False) -> RichPath:
    train_data_chunk_paths = train_data_path.get_filtered_files_in_dir('chunk_*')
    valid_data_chunk_paths = valid_data_path.get_filtered_files_in_dir('valid_chunk_*')

    model = model_class(hyperparameters, run_name=run_name,
                        model_save_dir=save_folder, log_save_dir=save_folder)
    if os.path.exists(model.model_save_path):
        # A checkpoint for this run already exists: restore it and resume training.
        model = model_restore_helper.restore(RichPath.create(model.model_save_path), is_train=True)
        model.train_log("Resuming training run %s of model %s with following hypers:\n%s"
                        % (hyperparameters['run_id'], model.__class__.__name__,
                           json.dumps(hyperparameters)))
        resume = True
    else:
        # Fresh run: load dataset metadata and build the model graph from scratch.
        model.load_existing_metadata(train_data_path.join('metadata.pkl.gz'))
        model.make_model(is_train=True)
        model.train_log("Starting training run %s of model %s with following hypers:\n%s"
                        % (hyperparameters['run_id'], model.__class__.__name__,
                           json.dumps(hyperparameters)))
        resume = False

    model_path = model.train(train_data_chunk_paths, valid_data_chunk_paths,
                             quiet=quiet, resume=resume)
    return model_path
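# Illustrative call of run_train (the model class, paths and hyperparameter values are
# assumptions for demonstration, not the project's defaults; 'run_id' is required by
# the logging above):
from dpu_utils.utils import RichPath

trained_model_path = run_train(
    model_class=Model,  # any concrete Model subclass
    train_data_path=RichPath.create('data/train'),
    valid_data_path=RichPath.create('data/valid'),
    save_folder='trained_models',
    hyperparameters={'run_id': 'demo-run'},
    run_name='demo',
    quiet=False)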
def run_export(model_path: RichPath, test_data_path: RichPath, output_folder: str):
    test_hyper_overrides = {
        'run_id': 'exporting',
        'dropout_keep_rate': 1.0,
    }

    data_chunks = test_data_path.get_filtered_files_in_dir('*gz')

    # Restore model
    model = model_restore_helper.restore(model_path, is_train=False,
                                         hyper_overrides=test_hyper_overrides)

    exporting = model.export_representations(data_chunks)

    os.makedirs(output_folder, exist_ok=True)
    with open(os.path.join(output_folder, 'vectors.tsv'), 'w') as vectors_file, \
            open(os.path.join(output_folder, 'metadata.tsv'), 'w') as metadata_file:
        metadata_file.write('varname\ttype\tkind\tprovenance\n')
        for annot in exporting:
            metadata_file.write(
                f'{assert_valid_str(annot.name)}\t{assert_valid_str(annot.type_annotation)}\t'
                f'{assert_valid_str(annot.kind)}\t{assert_valid_str(annot.provenance)}\n')
            vectors_file.write('\t'.join(str(e) for e in annot.representation))
            vectors_file.write('\n')
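# The two TSV files written above follow the tab-separated layout accepted by embedding
# visualisation tools such as the TensorBoard Embedding Projector (one vector per row,
# one metadata row per vector plus a header). They can also be reloaded directly; the
# path here is illustrative:
import numpy as np

vectors = np.loadtxt('output/vectors.tsv', delimiter='\t')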
def run_test(model_path: RichPath, test_data_path: RichPath, output_folder: str, num_processes: int):
    test_run_id = "_".join([time.strftime("%Y-%m-%d-%H-%M-%S"), str(os.getpid())])
    test_hyper_overrides = {
        'run_id': test_run_id,
        'cx_max_num_types': 50,
        'cg_max_num_types': 50,
        'eg_propagation_substeps': 100,
        'eg_max_variable_choices': 15,
        'dropout_keep_rate': 1.0,
    }

    test_data_chunks = test_data_path.get_filtered_files_in_dir('*gz')

    # Distribute the test chunks across worker processes, one job per chunk group.
    test_jobs = [(model_path, test_hyper_overrides, output_folder, chunk_id, chunk_data_paths)
                 for chunk_id, chunk_data_paths in enumerate(chunkify(test_data_chunks, num_processes))]
    with Pool(processes=num_processes) as pool:
        num_samples, token_perplexities, correct_at_1, correct_at_5 = \
            zip(*pool.starmap(test_on_raw_chunks, test_jobs))
    # Serial fallback, useful for debugging:
    # num_samples, token_perplexities, correct_at_1, correct_at_5 = \
    #     zip(*[test_on_raw_chunks(*job) for job in test_jobs])

    num_samples = sum(num_samples)
    token_perplexities = np.concatenate(token_perplexities, axis=0)
    correct_at_1 = sum(correct_at_1)
    correct_at_5 = sum(correct_at_5)

    print('Num samples: %i (%i before filtering)' % (len(token_perplexities), num_samples))
    print('Avg Sample Perplexity: %.2f' % np.mean(token_perplexities))
    print('Std Sample Perplexity: %.2f' % np.std(token_perplexities))
    print('Accuracy@1: %.4f%%' % (float(correct_at_1) / num_samples * 100))
    print('Accuracy@5: %.4f%%' % (float(correct_at_5) / num_samples * 100))
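# `chunkify` (used above) is assumed to split the chunk list into `num_chunks` roughly
# equal groups, one per worker process; groups may be empty when there are fewer files
# than workers. A minimal sketch under that assumption -- the actual helper may differ:
from typing import List, TypeVar

T = TypeVar('T')

def chunkify(items: List[T], num_chunks: int) -> List[List[T]]:
    # Round-robin assignment keeps group sizes within one element of each other.
    return [items[i::num_chunks] for i in range(num_chunks)]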
def get_conala_dataset(path: RichPath) -> List[Dict[str, Any]]:
    data_files = sorted(path.get_filtered_files_in_dir('*.json'), key=lambda p: p.path)
    # Keep only samples that have a human-rewritten intent to serve as the docstring.
    raw_data = [row
                for row in flatten(list(f.read_as_json() for f in data_files))
                if row['rewritten_intent']]
    data = chain([{'code': row['snippet'],
                   'code_tokens': tokenize_python_from_string(row['snippet'], func_only=False).code_tokens,
                   'docstring': row['rewritten_intent'],
                   'docstring_tokens': tokenize_docstring_from_string(row['rewritten_intent']),
                   'language': 'python'}
                  for row in raw_data])
    filtered_data = filter_untokenizable_code(data)
    log_row_count_diff(original_data=raw_data, filtered_data=filtered_data, label='CoNaLa')

    assert len(filtered_data) > 0, 'No code tokens retrieved after applying filters for CoNaLa.'
    return filtered_data
def run_test(model_path: RichPath, test_data_path: RichPath, type_lattice_path: RichPath,
             alias_metadata_path: RichPath, print_predictions: bool = False):
    test_run_id = "_".join([time.strftime("%Y-%m-%d-%H-%M-%S"), str(os.getpid())])
    test_hyper_overrides = {
        'run_id': test_run_id,
        'dropout_keep_rate': 1.0,
    }

    test_data_chunks = test_data_path.get_filtered_files_in_dir('*gz')

    # Restore model
    model = model_restore_helper.restore(model_path, is_train=False,
                                         hyper_overrides=test_hyper_overrides)

    evaluator = TypePredictionEvaluator(type_lattice_path, alias_metadata_path)

    all_annotations = model.annotate(test_data_chunks)
    for annotation in all_annotations:
        if ignore_type_annotation(annotation.original_annotation):
            continue
        # The predicted type is the one with the highest log-probability.
        predicted_annotation = max(annotation.predicted_annotation_logprob_dist,
                                   key=lambda x: annotation.predicted_annotation_logprob_dist[x])
        if print_predictions:
            print(f'{annotation.provenance} -- {annotation.name}: '
                  f'{annotation.original_annotation} -> {predicted_annotation} '
                  f'({math.exp(annotation.predicted_annotation_logprob_dist[predicted_annotation]) * 100:.1f}%)')
        evaluator.add_sample(ground_truth=annotation.original_annotation,
                             predicted_dist=annotation.predicted_annotation_logprob_dist)

    print(json.dumps(evaluator.metrics(), indent=2, sort_keys=True))
def get_data_files_from_directory(data_dir: RichPath,
                                  max_num_files: Optional[int] = None) -> List[RichPath]:
    files = data_dir.get_filtered_files_in_dir('*.gz')
    if max_num_files is None:
        return files
    else:
        return sorted(files)[:int(max_num_files)]
def start(model_path: RichPath, index_data_train: RichPath, index_data_valid: RichPath):
    test_hyper_overrides = {
        'run_id': 'indexing',
        'dropout_keep_rate': 1.0,
    }

    train_data_chunks = index_data_train.get_filtered_files_in_dir('*.jsonl.gz')
    valid_data_chunks = index_data_valid.get_filtered_files_in_dir('*.jsonl.gz')

    # Restore model
    model = model_restore_helper.restore(model_path, is_train=False,
                                         hyper_overrides=test_hyper_overrides)

    # Build the model's search index over both splits and persist it alongside the model.
    model.create_index(train_data_chunks + valid_data_chunks)
    model.save(model_path)
def load_from_folder(path: RichPath, shuffle: bool, rank: int, world_size: int):
    # Shard the files across workers: worker `rank` takes every `world_size`-th file.
    all_files = [p
                 for i, p in enumerate(path.get_filtered_files_in_dir("*.jsonl.gz"))
                 if i % world_size == rank]
    if shuffle:
        random.shuffle(all_files)
    for file in all_files:
        yield from file.read_as_jsonl()
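# Illustrative use of the sharded loader above (the path and the two-worker setup are
# assumptions for demonstration): rank 0 of 2 streams every second file, so workers
# read disjoint subsets. Note that only the file order is shuffled; rows within each
# file keep their stored order.
from dpu_utils.utils import RichPath

worker_stream = load_from_folder(RichPath.create('data/train'), shuffle=True, rank=0, world_size=2)
first_row = next(worker_stream)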
def split_many_files(input_dir: RichPath, output_dir: RichPath,
                     train_ratio: float, valid_ratio: float, test_ratio: float,
                     test_only_projects: Set[str]) -> None:
    output_paths = {}  # type: Dict[str, RichPath]
    for split_name in ['train', 'valid', 'test', 'test-only']:
        graph_dir_name_for_split_type = input_dir.basename() + '-' + split_name
        graph_dir_for_split_type = output_dir.join(graph_dir_name_for_split_type)
        output_paths[split_name] = graph_dir_for_split_type
        graph_dir_for_split_type.make_as_dir()

    pool = Pool()
    pool.starmap(split_file,
                 [(f, output_paths, train_ratio, valid_ratio, test_ratio, test_only_projects)
                  for f in input_dir.get_filtered_files_in_dir('*')])
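# `split_file`, the per-file worker handed to the pool above, is defined elsewhere.
# A plausible sketch, assuming each row carries a project identifier under a
# 'provenance' key and that rows from `test_only_projects` must never leak into
# train/valid/test (both assumptions, not confirmed by this file):
import random
from typing import Any, Dict, List, Set
from dpu_utils.utils import RichPath

def split_file(input_file: RichPath, output_paths: Dict[str, RichPath],
               train_ratio: float, valid_ratio: float, test_ratio: float,
               test_only_projects: Set[str]) -> None:
    buckets = {name: [] for name in output_paths}  # type: Dict[str, List[Any]]
    for row in input_file.read_as_jsonl():
        if row.get('provenance') in test_only_projects:
            buckets['test-only'].append(row)
        else:
            # Assign the row to a split by sampling against the cumulative ratios.
            r = random.random()
            if r < train_ratio:
                buckets['train'].append(row)
            elif r < train_ratio + valid_ratio:
                buckets['valid'].append(row)
            else:
                buckets['test'].append(row)
    for split_name, rows in buckets.items():
        if rows:
            output_paths[split_name].join(input_file.basename()).save_as_compressed_file(rows)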
def run_predict(model_path: RichPath, test_data_path: RichPath, output_file: RichPath):
    test_run_id = "_".join([time.strftime("%Y-%m-%d-%H-%M-%S"), str(os.getpid())])
    test_hyper_overrides = {
        'run_id': test_run_id,
        'dropout_keep_rate': 1.0,
    }

    test_data_chunks = test_data_path.get_filtered_files_in_dir('*.jsonl.gz')

    # Restore model
    model = model_restore_helper.restore(model_path, is_train=False,
                                         hyper_overrides=test_hyper_overrides)

    def predictions():
        for annotation in model.annotate(test_data_chunks):
            if ignore_annotation(annotation.original_annotation):
                continue
            # Keep only the ten most probable predictions per annotation.
            ordered_annotation_predictions = sorted(
                annotation.predicted_annotation_logprob_dist,
                key=lambda x: -annotation.predicted_annotation_logprob_dist[x])[:10]

            annotation_dict = annotation._asdict()
            logprobs = annotation_dict['predicted_annotation_logprob_dist']
            filtered_logprobs = []
            for annot in ordered_annotation_predictions:
                logprob = float(logprobs[annot])
                if annot == '%UNK%' or annot == '%UNKNOWN%':
                    # Map the unknown-type placeholders onto typing.Any.
                    annot = 'typing.Any'
                filtered_logprobs.append((annot, logprob))
            annotation_dict['predicted_annotation_logprob_dist'] = filtered_logprobs

            yield annotation_dict

    output_file.save_as_compressed_file(predictions())
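# Reading the saved predictions back (illustrative; assumes `output_file` was created
# with a .jsonl.gz extension, so each yielded dict is stored as one JSON line):
from dpu_utils.utils import RichPath

for row in RichPath.create('predictions.jsonl.gz').read_as_jsonl():
    top_type, top_logprob = row['predicted_annotation_logprob_dist'][0]
    print(row['name'], top_type, top_logprob)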
def load_from_folder(path: RichPath, shuffle: bool):
    all_files = path.get_filtered_files_in_dir("*.jsonl.gz")
    if shuffle:
        random.shuffle(all_files)
    for file in all_files:
        yield from file.read_as_jsonl()