Example #1
def get_staqc_dataset(path: RichPath) -> List[Dict[str, Any]]:
    codes = path.get_filtered_files_in_dir(
        'python*qid_by*code.pickle')[0].read_as_pickle()
    titles = path.get_filtered_files_in_dir(
        'python*qid_by*title.pickle')[0].read_as_pickle()
    data = chain([{
        'code': code,
        'code_tokens': tokenize_python_from_string(code, func_only=False).code_tokens,
        'docstring': titles[_id],
        'docstring_tokens': tokenize_docstring_from_string(titles[_id]),
        'language': 'python'
    } for _id, code in codes.items()])

    filtered_data = filter_untokenizable_code(data)
    log_row_count_diff(original_data=codes.items(),
                       filtered_data=filtered_data,
                       label='StaQC')

    assert len(filtered_data) > 0, 'No code tokens retrieved after applying filters for StaQC.'
    return filtered_data
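Each element of the returned list is a plain dict with five fixed keys. For reference, a minimal sketch of one record (the values are illustrative only, not taken from the actual StaQC data):

# Illustrative record shape only; real token lists come from the tokenizers above.
example_record = {
    'code': 'def add(a, b):\n    return a + b',
    'code_tokens': ['def', 'add', '(', 'a', ',', 'b', ')', ':', 'return', 'a', '+', 'b'],
    'docstring': 'How do I add two numbers in Python?',
    'docstring_tokens': ['how', 'do', 'i', 'add', 'two', 'numbers', 'in', 'python'],
    'language': 'python',
}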
Example #2
def run_train(model_class: Type[Model],
              train_data_path: RichPath,
              valid_data_path: RichPath,
              save_folder: str,
              hyperparameters: Dict[str, Any],
              run_name: Optional[str] = None,
              quiet: bool = False) -> RichPath:
    train_data_chunk_paths = train_data_path.get_filtered_files_in_dir('chunk_*')
    valid_data_chunk_paths = valid_data_path.get_filtered_files_in_dir('valid_chunk_*')

    model = model_class(hyperparameters, run_name=run_name, model_save_dir=save_folder, log_save_dir=save_folder)
    if os.path.exists(model.model_save_path):
        model = model_restore_helper.restore(RichPath.create(model.model_save_path), is_train=True)
        model.train_log("Resuming training run %s of model %s with following hypers:\n%s" % (hyperparameters['run_id'],
                                                                                             model.__class__.__name__,
                                                                                             json.dumps(
                                                                                                 hyperparameters)))
        resume = True
    else:
        model.load_existing_metadata(train_data_path.join('metadata.pkl.gz'))
        model.make_model(is_train=True)
        model.train_log("Starting training run %s of model %s with following hypers:\n%s" % (hyperparameters['run_id'],
                                                                                             model.__class__.__name__,
                                                                                             json.dumps(hyperparameters)))
        resume = False
    model_path = model.train(train_data_chunk_paths, valid_data_chunk_paths, quiet=quiet, resume=resume)
    return model_path
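A hypothetical invocation, assuming a concrete Model subclass (MyModel is a placeholder) and local data directories; RichPath.create wraps a plain filesystem path:

# Hypothetical usage; MyModel, the paths, and the hypers are placeholders.
trained_model_path = run_train(
    model_class=MyModel,
    train_data_path=RichPath.create('data/train'),
    valid_data_path=RichPath.create('data/valid'),
    save_folder='trained_models',
    hyperparameters={'run_id': 'my-run', 'dropout_keep_rate': 0.9},
)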
Example #3
def run_export(model_path: RichPath, test_data_path: RichPath,
               output_folder: str):
    test_hyper_overrides = {
        'run_id': 'exporting',
        "dropout_keep_rate": 1.0,
    }

    data_chunks = test_data_path.get_filtered_files_in_dir('*gz')

    # Restore model
    model = model_restore_helper.restore(model_path,
                                         is_train=False,
                                         hyper_overrides=test_hyper_overrides)

    exporting = model.export_representations(data_chunks)

    os.makedirs(output_folder, exist_ok=True)
    with open(os.path.join(output_folder, 'vectors.tsv'), 'w') as vectors_file,\
            open(os.path.join(output_folder, 'metadata.tsv'), 'w') as metadata_file:

        metadata_file.write('varname\ttype\tkind\tprovenance\n')
        for annot in exporting:
            metadata_file.write(
                f'{assert_valid_str(annot.name)}\t{assert_valid_str(annot.type_annotation)}\t{assert_valid_str(annot.kind)}\t{assert_valid_str(annot.provenance)}\n'
            )
            vectors_file.write('\t'.join(str(e) for e in annot.representation))
            vectors_file.write('\n')
Example #4
def run_test(model_path: RichPath, test_data_path: RichPath, output_folder: str, num_processes: int):
    test_run_id = "_".join([time.strftime("%Y-%m-%d-%H-%M-%S"), str(os.getpid())])

    test_hyper_overrides = {
        "run_id": test_run_id,
        "cx_max_num_types": 50,
        "cg_max_num_types": 50,
        "eg_propagation_substeps": 100,
        "eg_max_variable_choices": 15,
        "dropout_keep_rate": 1.0,
    }

    test_data_chunks = test_data_path.get_filtered_files_in_dir('*gz')

    test_jobs = [(model_path, test_hyper_overrides, output_folder, chunk_id, chunk_data_paths)
                 for chunk_id, chunk_data_paths in enumerate(chunkify(test_data_chunks, num_processes))]
    with Pool(processes=num_processes) as pool:
        num_samples, token_perplexities, correct_at_1, correct_at_5 = zip(*pool.starmap(test_on_raw_chunks, test_jobs))
    # Single-process alternative, useful for debugging:
    # num_samples, token_perplexities, correct_at_1, correct_at_5 = zip(*[test_on_raw_chunks(*job) for job in test_jobs])

    num_samples = sum(num_samples)
    token_perplexities = np.concatenate(token_perplexities, axis=0)
    correct_at_1 = sum(correct_at_1)
    correct_at_5 = sum(correct_at_5)

    print('Num samples: %i (%i before filtering)' % (len(token_perplexities), num_samples))
    print('Avg Sample Perplexity: %.2f' % np.mean(token_perplexities))
    print('Std Sample Perplexity: %.2f' % np.std(token_perplexities))
    print('Accuracy@1: %.4f%%' % (float(correct_at_1) / num_samples * 100))
    print('Accuracy@5: %.4f%%' % (float(correct_at_5) / num_samples * 100))
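The chunkify helper is not shown in this example. A minimal sketch of the behaviour the starmap call assumes, i.e. splitting the file list into num_processes similarly sized chunks (this is one plausible implementation, not necessarily the original one):

from typing import List, TypeVar

T = TypeVar('T')

def chunkify(items: List[T], num_chunks: int) -> List[List[T]]:
    # Deal items out round-robin so each worker gets a similar amount of work.
    chunks = [[] for _ in range(num_chunks)]  # type: List[List[T]]
    for i, item in enumerate(items):
        chunks[i % num_chunks].append(item)
    return chunks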
Example #5
def get_conala_dataset(path: RichPath) -> List[Dict[str, Any]]:
    data_files = sorted(path.get_filtered_files_in_dir('*.json'),
                        key=lambda p: p.path)
    raw_data = [
        row for row in flatten(list(f.read_as_json() for f in data_files))
        if row['rewritten_intent']
    ]

    data = chain([{
        'code': row['snippet'],
        'code_tokens': tokenize_python_from_string(row['snippet'], func_only=False).code_tokens,
        'docstring': row['rewritten_intent'],
        'docstring_tokens': tokenize_docstring_from_string(row['rewritten_intent']),
        'language': 'python'
    } for row in raw_data])

    filtered_data = filter_untokenizable_code(data)
    log_row_count_diff(original_data=raw_data,
                       filtered_data=filtered_data,
                       label='CoNaLa')

    assert len(filtered_data) > 0, 'No code tokens retrieved after applying filters for CoNaLa.'
    return filtered_data
Example #6
def run_test(model_path: RichPath, test_data_path: RichPath, type_lattice_path: RichPath,
             alias_metadata_path: RichPath, print_predictions: bool = False):
    test_run_id = "_".join(
        [time.strftime("%Y-%m-%d-%H-%M-%S"), str(os.getpid())])

    test_hyper_overrides = {
        'run_id': test_run_id,
        "dropout_keep_rate": 1.0,
    }

    test_data_chunks = test_data_path.get_filtered_files_in_dir('*gz')

    # Restore model
    model = model_restore_helper.restore(
        model_path, is_train=False, hyper_overrides=test_hyper_overrides)

    evaluator = TypePredictionEvaluator(type_lattice_path, alias_metadata_path)

    all_annotations = model.annotate(test_data_chunks)
    for annotation in all_annotations:
        if ignore_type_annotation(annotation.original_annotation):
            continue
        predicted_annotation = max(annotation.predicted_annotation_logprob_dist,
                                   key=lambda x: annotation.predicted_annotation_logprob_dist[x])
        if print_predictions:
            print(
                f'{annotation.provenance} -- {annotation.name}: {annotation.original_annotation} -> {predicted_annotation} ({math.exp(annotation.predicted_annotation_logprob_dist[predicted_annotation])*100:.1f}%)')
        evaluator.add_sample(ground_truth=annotation.original_annotation,
                             predicted_dist=annotation.predicted_annotation_logprob_dist)

    print(json.dumps(evaluator.metrics(), indent=2, sort_keys=True))
Example #7
def get_data_files_from_directory(
        data_dir: RichPath,
        max_num_files: Optional[int] = None) -> List[RichPath]:
    files = data_dir.get_filtered_files_in_dir('*.gz')
    if max_num_files is None:
        return files
    else:
        return sorted(files)[:int(max_num_files)]
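A hypothetical call, capping the number of files for a quick debugging run (the directory path is a placeholder):

# Hypothetical usage; 'data/train' is a placeholder directory.
all_files = get_data_files_from_directory(RichPath.create('data/train'))
debug_files = get_data_files_from_directory(RichPath.create('data/train'), max_num_files=2)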
Example #8
def start(model_path: RichPath, index_data_train: RichPath,
          index_data_valid: RichPath):
    test_hyper_overrides = {
        'run_id': 'indexing',
        "dropout_keep_rate": 1.0,
    }

    train_data_chunks = index_data_train.get_filtered_files_in_dir(
        '*.jsonl.gz')
    valid_data_chunks = index_data_valid.get_filtered_files_in_dir(
        '*.jsonl.gz')

    # Restore model
    model = model_restore_helper.restore(model_path,
                                         is_train=False,
                                         hyper_overrides=test_hyper_overrides)

    model.create_index(train_data_chunks + valid_data_chunks)
    model.save(model_path)
Example #9
def load_from_folder(path: RichPath, shuffle: bool, rank: int, world_size: int):
    all_files = [
        p
        for i, p in enumerate(path.get_filtered_files_in_dir("*.jsonl.gz"))
        if i % world_size == rank
    ]

    if shuffle:
        random.shuffle(all_files)
    for file in all_files:
        yield from file.read_as_jsonl()
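The i % world_size == rank filter shards the file list deterministically across workers, so every file is read by exactly one rank. A small worked example with hypothetical file names:

# Worked example of the sharding filter above (file names are hypothetical).
files = ['a.jsonl.gz', 'b.jsonl.gz', 'c.jsonl.gz', 'd.jsonl.gz', 'e.jsonl.gz']
world_size = 2
for rank in range(world_size):
    print(rank, [p for i, p in enumerate(files) if i % world_size == rank])
# 0 ['a.jsonl.gz', 'c.jsonl.gz', 'e.jsonl.gz']
# 1 ['b.jsonl.gz', 'd.jsonl.gz']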
Example #10
def split_many_files(input_dir: RichPath, output_dir: RichPath,
                     train_ratio: float, valid_ratio: float, test_ratio: float,
                     test_only_projects: Set[str]) -> None:
    output_paths = {}  # type: Dict[str, RichPath]
    for split_name in ['train', 'valid', 'test', 'test-only']:
        graph_dir_name_for_split_type = input_dir.basename() + '-' + split_name
        graph_dir_for_split_type = output_dir.join(
            graph_dir_name_for_split_type)
        output_paths[split_name] = graph_dir_for_split_type
        graph_dir_for_split_type.make_as_dir()

    with Pool() as pool:
        pool.starmap(split_file,
                     [(f, output_paths, train_ratio, valid_ratio, test_ratio, test_only_projects)
                      for f in input_dir.get_filtered_files_in_dir('*')])
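A hypothetical invocation; the three ratios are presumably expected to sum to 1.0, and test_only_projects looks like the set of project names whose files are routed to the test-only split:

# Hypothetical usage; all paths and project names are placeholders.
split_many_files(
    input_dir=RichPath.create('graphs'),
    output_dir=RichPath.create('graphs-split'),
    train_ratio=0.7,
    valid_ratio=0.1,
    test_ratio=0.2,
    test_only_projects={'some-held-out-project'},
)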
Example #11
def run_predict(model_path: RichPath, test_data_path: RichPath,
                output_file: RichPath):
    test_run_id = "_".join(
        [time.strftime("%Y-%m-%d-%H-%M-%S"),
         str(os.getpid())])

    test_hyper_overrides = {
        'run_id': test_run_id,
        "dropout_keep_rate": 1.0,
    }

    test_data_chunks = test_data_path.get_filtered_files_in_dir('*.jsonl.gz')

    # Restore model
    model = model_restore_helper.restore(model_path,
                                         is_train=False,
                                         hyper_overrides=test_hyper_overrides)

    def predictions():
        for annotation in model.annotate(test_data_chunks):
            if ignore_annotation(annotation.original_annotation):
                continue
            ordered_annotation_predictions = sorted(
                annotation.predicted_annotation_logprob_dist,
                key=lambda x: -annotation.predicted_annotation_logprob_dist[x])[:10]

            annotation_dict = annotation._asdict()
            logprobs = annotation_dict['predicted_annotation_logprob_dist']
            filtered_logprobs = []
            for annot in ordered_annotation_predictions:
                logprob = float(logprobs[annot])
                if annot in ('%UNK%', '%UNKNOWN%'):
                    annot = 'typing.Any'
                filtered_logprobs.append((annot, logprob))
            annotation_dict['predicted_annotation_logprob_dist'] = filtered_logprobs

            yield annotation_dict

    output_file.save_as_compressed_file(predictions())
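Assuming output_file was created with a .jsonl.gz suffix, the saved predictions can be read back with the matching RichPath reader (the path below is a placeholder):

# Hypothetical follow-up: stream the saved prediction dicts back in.
for prediction in RichPath.create('predictions.jsonl.gz').read_as_jsonl():
    print(prediction)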
Example #12
def load_from_folder(path: RichPath, shuffle: bool):
    all_files = path.get_filtered_files_in_dir("*.jsonl.gz")
    if shuffle:
        random.shuffle(all_files)
    for file in all_files:
        yield from file.read_as_jsonl()
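Since load_from_folder is a generator, rows are streamed lazily from each file. A hypothetical way to peek at the first few rows without materialising the whole dataset:

from itertools import islice

# Hypothetical usage; 'data/train' is a placeholder directory.
for row in islice(load_from_folder(RichPath.create('data/train'), shuffle=True), 3):
    print(row)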