Example #1
def run_train(model_class: Type[Model],
              train_data_dirs: List[RichPath],
              valid_data_dirs: List[RichPath],
              save_folder: str,
              hyperparameters: Dict[str, Any],
              azure_info_path: Optional[str],
              run_name: str,
              quiet: bool = False,
              max_files_per_dir: Optional[int] = None,
              parallelize: bool = True) -> RichPath:
    assert parallelize
    model = model_class(hyperparameters,
                        run_name=run_name,
                        model_save_dir=save_folder,
                        log_save_dir=save_folder)
    if os.path.exists(model.model_save_path):
        model = model_restore_helper.restore(
            RichPath.create(model.model_save_path), is_train=True)
        model.train_log(
            "Resuming training run %s of model %s with following hypers:\n%s" %
            (run_name, model.__class__.__name__, str(hyperparameters)))
        resume = True
    else:
        model.train_log(
            "Tokenizing and building vocabulary for code snippets and queries.  This step may take several hours."
        )
        model.load_metadata(train_data_dirs,
                            max_files_per_dir=max_files_per_dir,
                            parallelize=parallelize)
        model.make_model(is_train=True)

        model.train_log(
            "Starting training run %s of model %s with following hypers:\n%s" %
            (run_name, model.__class__.__name__, str(hyperparameters)))
        resume = False

    philly_job_id = os.environ.get('PHILLY_JOB_ID')
    if philly_job_id is not None:
        # We are running on Philly; write out the model name in an auxiliary file.
        with open(os.path.join(save_folder, philly_job_id + '.job'), 'w') as f:
            f.write(os.path.basename(model.model_save_path))

    wandb.config.update(model.hyperparameters)
    model.train_log("Loading training and validation data.")
    train_data = model.load_data_from_dirs(train_data_dirs,
                                           is_test=False,
                                           max_files_per_dir=max_files_per_dir,
                                           parallelize=parallelize)
    valid_data = model.load_data_from_dirs(valid_data_dirs,
                                           is_test=False,
                                           max_files_per_dir=max_files_per_dir,
                                           parallelize=parallelize)
    model.train_log("Begin Training.")
    model_path = model.train(train_data,
                             valid_data,
                             azure_info_path,
                             quiet=quiet,
                             resume=resume)
    return model_path
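
A minimal sketch of invoking run_train is shown below; the model class, data directories, and hyperparameters are illustrative placeholders rather than values from the project, and RichPath is assumed to be the dpu_utils implementation used throughout these examples.

from dpu_utils.utils import RichPath

# Illustrative only: MyEncoderModel stands in for a concrete Model subclass,
# and the paths/hyperparameters below are placeholders.
trained_model_path = run_train(
    model_class=MyEncoderModel,
    train_data_dirs=[RichPath.create('/data/train')],
    valid_data_dirs=[RichPath.create('/data/valid')],
    save_folder='./trained_models',
    hyperparameters={'batch_size': 200},
    azure_info_path=None,
    run_name='example-run',
    quiet=False,
    max_files_per_dir=None,
    parallelize=True)
print('Trained model saved to', trained_model_path)
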
Example #2
    def load_model(self):
        model_path = RichPath.create(self.local_model_path, None)
        print("Restoring model from %s" % model_path)
        self.model = model_restore_helper.restore(path=model_path,
                                                  is_train=False,
                                                  hyper_overrides={})

        for language in ['python', 'go', 'javascript', 'java', 'php', 'ruby']:
            # for language in ['python']:
            print("Loading language: %s" % language)
            with open('../resources/data/{}_dedupe_definitions_v2.pkl'.format(
                    language), 'rb') as definitions_file:
                self.definitions[language] = pickle.load(definitions_file)

            if os.path.exists('/datadrive/{}.ann'.format(language)):
                self.indices[language] = AnnoyIndex(128, 'angular')
                self.indices[language].load(
                    '/datadrive/{}.ann'.format(language))
            else:
                indexes = [{
                    'code_tokens': d['function_tokens'],
                    'language': d['language']
                } for d in tqdm(self.definitions[language])]
                code_representations = self.model.get_code_representations(
                    indexes)
                print(code_representations[0].shape)
                self.indices[language] = AnnoyIndex(
                    code_representations[0].shape[0], 'angular')
                for index, vector in tqdm(enumerate(code_representations)):
                    assert vector is not None
                    self.indices[language].add_item(index, vector)
                self.indices[language].build(1000)
                self.indices[language].save(
                    '/datadrive/{}.ann'.format(language))
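
The indices built above can later be queried for code similar to a natural-language query. Below is a hedged sketch of such a lookup; the query method is not part of the original snippet, and the input format passed to get_query_representations is an assumption inferred from how code samples are fed to get_code_representations above.

    def query(self, query_text: str, language: str = 'python', top_k: int = 10):
        # Embed the query with the restored model (assumed input format).
        query_vector = self.model.get_query_representations(
            [{'docstring_tokens': query_text.split(), 'language': language}])[0]
        # Approximate nearest-neighbour lookup in the per-language Annoy index.
        neighbor_ids, distances = self.indices[language].get_nns_by_vector(
            query_vector, top_k, include_distances=True)
        # Map Annoy item ids back to the deduplicated definitions loaded above.
        return [(self.definitions[language][i]['url'], dist)
                for i, dist in zip(neighbor_ids, distances)]
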
Example #3
    def __init__(self, model_path: RichPath, test_batch_size: int = 1000,
                 distance_metric: str = 'cosine', quiet: bool = False,
                 hypers_override: Optional[Dict[str, Any]] = None) -> None:
        self.__model = model_restore_helper.restore(
            path=model_path, is_train=False, hyper_overrides=hypers_override)
        self.__quiet = quiet
        self.__test_batch_size = test_batch_size
        self.__distance_metric = distance_metric
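
The class this constructor belongs to is not shown; assuming a hypothetical name such as ModelEvaluator, instantiating it would look roughly as follows (the model path and parameter values are illustrative).

from dpu_utils.utils import RichPath

# 'ModelEvaluator' is a placeholder name for the class defining __init__ above.
evaluator = ModelEvaluator(
    model_path=RichPath.create('./trained_models/model.pkl.gz'),
    test_batch_size=500,
    distance_metric='cosine',
    quiet=True,
    hypers_override=None)
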
Example #4
def run(arguments) -> None:
    azure_info_path = arguments.get('--azure-info', None)

    model_path = RichPath.create(arguments['MODEL_PATH'],
                                 azure_info_path=azure_info_path)

    model = model_restore_helper.restore(path=model_path, is_train=False)

    if arguments['--query']:
        embeddings, elements = model.get_query_token_embeddings()
    else:
        embeddings, elements = model.get_code_token_embeddings(
            arguments['--language'])

    max_num_elements = int(arguments['--lim-items'])
    if max_num_elements > 0:
        embeddings = embeddings[:max_num_elements]
        elements = elements[:max_num_elements]

    print(f'Collected {len(elements)} elements to visualize.')

    embeddings = model.sess.run(fetches=embeddings)

    if arguments['plot-tsne']:
        emb_2d = TSNE(
            n_components=2, verbose=1,
            metric=arguments['--distance-metric']).fit_transform(embeddings)

        plt.scatter(emb_2d[:, 0], emb_2d[:, 1])
        for i in range(len(elements)):
            plt.annotate(elements[i], xy=(emb_2d[i, 0], emb_2d[i, 1]))

        plt.show()
    elif arguments['print-nns']:
        flat_distances = pdist(embeddings, arguments['--distance-metric'])
        num_nns = int(arguments['--num-nns'])

        for i, element in enumerate(elements):
            distance_from_i = np.fromiter(
                (flat_distances[square_to_condensed(i, j, len(elements))]
                 if i != j else float('inf')
                 for j in range(len(elements))),
                dtype=float)  # np.float is deprecated; use the builtin float

            # The num_nns nearest neighbours of element i
            nns = [int(k) for k in np.argsort(distance_from_i)[:num_nns]]

            if distance_from_i[nns[0]] > float(
                    arguments['DISTANCE_THRESHOLD']):
                continue
            try:
                print(f'{element} --> ' +
                      ', '.join(f'{elements[n]} ({distance_from_i[n]:.2f})'
                                for n in nns))
            except Exception:
                print('Error printing token for nearest neighbors pair.')
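
square_to_condensed is imported from elsewhere in the repository and maps an (i, j) index pair of the square distance matrix onto its position in the condensed vector returned by scipy's pdist. A plausible stand-in for readers who want to run the snippet in isolation is sketched below; it is an assumption about the helper's behaviour, not the project's actual implementation.

def square_to_condensed(i: int, j: int, n: int) -> int:
    # pdist stores only the upper triangle (i < j) of the n x n distance
    # matrix, row by row, so pair (i, j) lands at this offset in the flat vector.
    assert i != j, "diagonal entries are not stored in condensed form"
    if i > j:
        i, j = j, i
    return n * i - (i * (i + 1)) // 2 + (j - i - 1)
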
Example #5
                  file=sys.stderr)
            sys.exit(1)

        print("Fetching run files from W&B...")
        gz_run_files = [f for f in run.files() if f.name.endswith('gz')]
        if not gz_run_files:
            print("ERROR: Run contains no model-like files")
            sys.exit(1)
        model_file = gz_run_files[0].download(replace=True)
        local_model_path = model_file.name
        run_id = args_wandb_run_id.split('/')[-1]

    model_path = RichPath.create(local_model_path, None)
    print("Restoring model from %s" % model_path)
    model = model_restore_helper.restore(path=model_path,
                                         is_train=False,
                                         hyper_overrides={})

    predictions = []
    for language in ('python', 'go', 'javascript', 'java', 'php', 'ruby'):
        print("Evaluating language: %s" % language)
        with open('../resources/data/{}_dedupe_definitions_v2.pkl'.format(
                language), 'rb') as definitions_file:
            definitions = pickle.load(definitions_file)
        # dict_keys(['nwo', 'sha', 'path', 'language', 'identifier', 'parameters', 'argument_list', 'return_statement',
        # 'docstring', 'docstring_summary', 'docstring_tokens', 'function', 'function_tokens', 'url', 'score'])
        indexes = [{
            'code': d['function'],
            'code_tokens': d['function_tokens'],
            'language': d['language']
Example #6
def run(arguments) -> None:
    azure_info_path = arguments.get('--azure-info', None)
    data_path = RichPath.create(arguments['DATA_PATH'], azure_info_path)
    assert data_path.is_dir(), "%s is not a folder" % (data_path,)

    hypers_override = arguments.get('--hypers-override')
    if hypers_override is not None:
        hypers_override = json.loads(hypers_override)
    else:
        hypers_override = {}

    model_path = RichPath.create(arguments['MODEL_PATH'], azure_info_path=azure_info_path)

    model = model_restore_helper.restore(
        path=model_path,
        is_train=False,
        hyper_overrides=hypers_override)

    num_elements_to_take = int(arguments['--max-num-items'])
    data = chain.from_iterable(
        f.read_by_file_suffix()
        for f in data_path.iterate_filtered_files_in_dir('*.jsonl.gz'))
    if num_elements_to_take == 0:  # Take all
        data = list(data)
    else:
        assert num_elements_to_take > 0
        data = take(num_elements_to_take, data)

    num_nns = int(arguments['--num-nns'])

    if arguments['--code']:
        representations = model.get_code_representations(data)
    elif arguments['--query']:
        representations = model.get_query_representations(data)
    else:
        code_representations = model.get_code_representations(data)
        query_representations = model.get_query_representations(data)
        representations = np.concatenate([code_representations, query_representations], axis=-1)

    filtered_representations = []
    filtered_data = []  # type: List[Dict[str, Any]]
    for i, representation in enumerate(representations):
        if representation is None:
            continue
        filtered_representations.append(representation)
        filtered_data.append(data[i])

    filtered_representations = np.stack(filtered_representations, axis=0)
    flat_distances = pdist(filtered_representations, arguments['--distance-metric'])

    for i, data in enumerate(filtered_data):
        distance_from_i = np.fromiter(
            (flat_distances[square_to_condensed(i, j, len(filtered_data))]
             if i != j else float('inf')
             for j in range(len(filtered_data))),
            dtype=float)  # np.float is deprecated; use the builtin float

        # The num_nns nearest neighbours of element i
        nns = [int(k) for k in np.argsort(distance_from_i)[:num_nns]]

        if distance_from_i[nns[0]] > float(arguments['--distance-threshold']):
            continue

        print('===============================================================')
        print(f"{data['repo']}:{data['path']}:{data['lineno']}")
        print(to_string(data['original_string'], language=data['language']))

        for j in range(num_nns):
            print()
            print(f'Nearest Neighbour {j + 1}: '
                  f'{filtered_data[nns[j]]["repo"]}:{filtered_data[nns[j]]["path"]}:'
                  f'{filtered_data[nns[j]]["lineno"]} '
                  f'(distance {distance_from_i[nns[j]]})')
            print(to_string(filtered_data[nns[j]]['original_string'], language=filtered_data[nns[j]]['language']))
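
For reference, the docopt-style arguments mapping this entry point expects would look roughly like the sketch below. The keys are taken from the accesses in the code above; the values shown are placeholders.

arguments = {
    'DATA_PATH': '/data/valid',                      # placeholder data directory
    'MODEL_PATH': './trained_models/model.pkl.gz',   # placeholder model file
    '--azure-info': None,
    '--hypers-override': None,
    '--max-num-items': '1000',
    '--num-nns': '2',
    '--distance-metric': 'cosine',
    '--distance-threshold': '0.2',
    '--code': True,
    '--query': False,
}
run(arguments)
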