Example 1
import os

from dpu_utils.utils import RichPath

# Project-local helpers (jsonl_to_df, remove_duplicate_code_df, label_folds,
# chunked_save_df_to_jsonl) are assumed to be importable from this repository.

def run(args):
    azure_info_path = args.get('--azure-info', None)
    input_path = RichPath.create(args['INPUT_FILENAME'], azure_info_path)
    output_folder = RichPath.create(args['OUTPUT_FOLDER'], azure_info_path)
    train = float(args['--train-ratio'])
    valid = float(args['--valid-ratio'])
    test = float(args['--test-ratio'])
    holdout = float(args['--holdout-ratio'])

    # get data and process it
    df = jsonl_to_df(input_path)
    print('Removing fuzzy duplicates ... this may take some time.')
    df = remove_duplicate_code_df(df)
    df = df.sample(frac=1, random_state=20181026)  # shuffle order of files
    df = label_folds(df, train_ratio=train, valid_ratio=valid, test_ratio=test, holdout_ratio=holdout)
    splits = ['train', 'valid', 'test', 'holdout']

    for split in splits:
        split_df = df[df.partition == split]

        # save dataframes as chunked jsonl files
        jsonl_save_folder = output_folder.join(f'jsonl/{split}')
        print(f'Uploading data to {str(jsonl_save_folder)}')
        chunked_save_df_to_jsonl(split_df, jsonl_save_folder)

        # Upload dataframes to Azure
        filename = f'/tmp/{split}_df.pkl'
        df_save_path = output_folder.join(f'DataFrame/{split}_df.pkl')
        split_df.to_pickle(filename)
        print(f'Uploading data to {str(df_save_path)}')
        df_save_path.copy_from(RichPath.create(filename))
        os.unlink(filename)
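A minimal usage sketch, assuming the arguments were parsed docopt-style into a dict; the paths and ratio values below are illustrative, not taken from the original script:

# Hypothetical invocation; the keys mirror those read in run() above.
args = {
    'INPUT_FILENAME': 'data/raw.jsonl.gz',
    'OUTPUT_FOLDER': 'data/processed',
    '--azure-info': None,  # or a path to an Azure credentials file
    '--train-ratio': '0.7',
    '--valid-ratio': '0.1',
    '--test-ratio': '0.1',
    '--holdout-ratio': '0.1',
}
run(args)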
Example 2
import os
import shutil
from pathlib import Path

from dpu_utils.utils import RichPath

# Project-local helpers (load_concat_df, chunked_save_df_to_jsonl) are assumed
# to be importable from this repository.

def run(args):
    azure_info_path = args.get('--azure-info', None)
    input_folder = args['INPUT_FOLDERNAME']
    # Pass azure_info_path through so Azure-backed output paths resolve correctly.
    output_folder = RichPath.create(args['OUTPUT_FOLDER'], azure_info_path)
    rate = float(args['RATE'])
    for language in ['python', 'go', 'javascript', 'java', 'ruby', 'php']:
        language_trainings_files = sorted(Path(f'{input_folder}/{language}/final/jsonl/train').glob('**/*.gz'))
        df = load_concat_df(language_trainings_files)
        print(df.count())
        sample = df.sample(frac=rate)
        print(sample.count())
        lang_output_folder = output_folder.join(f'{language}/final/jsonl/train')

        os.makedirs(str(lang_output_folder.to_local_path()), exist_ok=True)
        # Write the down-sampled training set (not the full df) so RATE takes effect.
        chunked_save_df_to_jsonl(sample, lang_output_folder, basefilename=f'{language}_train')

        shutil.copytree(f'{input_folder}/{language}/final/jsonl/test', f'{str(output_folder)}/{language}/final/jsonl/test')
        shutil.copytree(f'{input_folder}/{language}/final/jsonl/valid', f'{str(output_folder)}/{language}/final/jsonl/valid')
        print(df.count())
        print(language_trainings_files)
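A similarly hedged usage sketch for this variant, with illustrative values:

# Hypothetical invocation: keep 10% of each language's training data.
args = {
    'INPUT_FOLDERNAME': '/data/codesearchnet',
    'OUTPUT_FOLDER': '/data/codesearchnet_sampled',
    'RATE': '0.1',
}
run(args)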
Example 3

from multiprocessing import Pool
from typing import List

import pandas as pd
from dpu_utils.utils import RichPath

# Project-local helpers (download_files_into_pandas, load_files_into_pandas,
# parse_raw_data_into_function_list, tokenize_python_from_string,
# tokenize_docstring_from_string, ParsedCode, chunked_save_df_to_jsonl) are
# assumed to be importable from this repository.

def run(args):
    azure_info_path = args.get('--azure-info')
    output_folder = RichPath.create(args['OUTPUT_PATH'], azure_info_path)

    # Download / read the data files:
    if args['--input-folder'] is None:
        print('Downloading data...')
        raw_code_data_df = download_files_into_pandas()
    else:
        print('Loading data...')
        raw_code_data_df = load_files_into_pandas(args['--input-folder'])
    print('Data loaded.')

    # Find all the functions and methods, filter out ones that don't meet requirements,
    # separate the code from the docstring and produce a list of functions that includes the code,
    # the first line of the docstring, and metadata of each:
    with Pool() as pool:
        function_data = pool.map(parse_raw_data_into_function_list,
                                 raw_code_data_df.content.tolist())
    assert len(function_data) == raw_code_data_df.shape[0], \
        f'Row count mismatch. `raw_code_data_df` has {raw_code_data_df.shape[0]} rows; `function_data` has {len(function_data)} rows.'
    raw_code_data_df['function_data'] = function_data
    print(
        f'Split {raw_code_data_df.shape[0]} blobs into {sum(len(fun_data) for fun_data in function_data)} documented individual functions.'
    )

    # Flatten function data out:
    # TODO: We should also have access to the SHA of the objects here.
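    # apply(pd.Series) expands each per-blob list of function records into
    # columns, and stack() pivots those columns into rows, yielding one row
    # per (repo, path, function) instead of one row per blob.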
    raw_code_data_df = raw_code_data_df.set_index(
        ['repo', 'path'])['function_data'].apply(pd.Series).stack()
    raw_code_data_df = raw_code_data_df.reset_index()
    raw_code_data_df.columns = ['repo', 'path', '_', 'function_data']

    # Extract meta-data and format dataframe.
    function_data_df = pd.DataFrame(
        raw_code_data_df.function_data.values.tolist())
    assert len(raw_code_data_df) == len(function_data_df), \
        f'Row count mismatch. `raw_code_data_df` has {len(raw_code_data_df)} rows; `function_data_df` has {len(function_data_df)} rows.'
    function_data_df = pd.concat(
        [raw_code_data_df[['repo', 'path']], function_data_df], axis=1)

    # remove observations where the same code appears more than once
    num_before_dedup = len(function_data_df)
    function_data_df = function_data_df.drop_duplicates(['code'])
    num_after_dedup = len(function_data_df)

    print(
        f'Removed {num_before_dedup - num_after_dedup} exact duplicate rows.')

    print('Tokenizing code, comments and docstrings ...')
    with Pool() as pool:
        code_tokenization_results: List[ParsedCode] = pool.map(
            tokenize_python_from_string, function_data_df['code'].tolist())

        code_tokens_list, comment_tokens_list = list(
            zip(*code_tokenization_results))
        function_data_df['code_tokens'] = code_tokens_list
        function_data_df['comment_tokens'] = comment_tokens_list
        function_data_df['docstring_tokens'] = pool.map(
            tokenize_docstring_from_string,
            function_data_df['docstring'].tolist())
    function_data_df.dropna(
        subset=['code_tokens', 'comment_tokens', 'docstring_tokens'],
        inplace=True)
    function_data_df.reset_index(inplace=True, drop=True)

    cols_to_keep = [
        'repo',
        'path',
        'lineno',
        'func_name',
        'language',
        'code',
        'code_tokens',
        'comment_tokens',
        'docstring',
        'docstring_tokens',
    ]
    # write data to jsonl
    print(f'Count by language:\n{function_data_df.language.value_counts()}')
    chunked_save_df_to_jsonl(df=function_data_df[cols_to_keep],
                             output_folder=output_folder,
                             parallel=True)
    print(f'Wrote {function_data_df.shape[0]} rows to {str(output_folder)}.')
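As above, a hypothetical docopt-style invocation (paths illustrative):

# Hypothetical invocation; leaving --input-folder as None triggers the
# download path instead of reading local files.
args = {
    'OUTPUT_PATH': 'data/python/functions',
    '--azure-info': None,
    '--input-folder': None,
}
run(args)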