Example #1
def split_csvs(notebooks, repos, owners, local):
    # Shuffle repositories.
    repos = repos.sample(frac=1).reset_index(drop=True)

    # Randomly assign repos and the notebooks/owners that go with them.
    partition_repos = np.array_split(
        repos,
        10
    )
    for i in range(10):
        repos_i = partition_repos[i]
        if local:
            repos_i.to_csv('../csv/repos1b_{0}.csv'.format(i))
        else:
            df_to_s3(repos_i, 'csv/repos1b_{0}.csv'.format(i))
        
        notebooks_i = notebooks[
            notebooks.repo_id.isin(repos_i["repo_id"])
        ].reset_index(drop=True)
        if local:
            notebooks_i.to_csv('../csv/notebooks1b_{0}.csv'.format(i))
        else:
            df_to_s3(notebooks_i, 'csv/notebooks1b_{0}.csv'.format(i))
        
        owners_i = owners[
            owners.owner_id.isin(repos_i["owner_id"])
        ]
        if local:
            owners_i.to_csv('../csv/owners1b_{0}.csv'.format(i))
        else:
            df_to_s3(owners_i, 'csv/owners1b_{0}.csv'.format(i))
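The helpers df_to_s3 and s3_to_df used throughout these examples are defined elsewhere in the project, and the snippets assume module-level imports such as numpy as np and pandas as pd. A minimal sketch of what the two helpers might look like, assuming boto3 and the "notebook-research" bucket that appears in the later examples:

import io

import boto3
import pandas as pd

s3 = boto3.resource("s3")
BUCKET = "notebook-research"  # bucket name taken from the later examples

def df_to_s3(df, key):
    # Serialize the DataFrame to an in-memory CSV and upload it to S3.
    buffer = io.StringIO()
    df.to_csv(buffer, index=False)
    s3.Object(BUCKET, key).put(Body=buffer.getvalue())

def s3_to_df(key):
    # Download a CSV object from S3 and parse it into a DataFrame.
    obj = s3.Object(BUCKET, key)
    return pd.read_csv(io.BytesIO(obj.get()["Body"].read()))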
Example #2
def update_split_csvs(notebooks, repos, owners, local):
    all_notebook_files = []
    all_notebooks = []
    for i in range(10):
        notebooks_i = s3_to_df('csv/notebooks1_{0}.csv'.format(i))
        all_notebook_files += list(notebooks_i.file)
        all_notebooks.append(notebooks_i)
    
    # Isolate new notebooks and shuffle
    new_notebooks = notebooks[~notebooks.file.isin(all_notebook_files)]
    print('There are {0} new notebooks to distribute.'.format(len(new_notebooks)))
    new_notebooks = new_notebooks.sample(frac=1).reset_index(drop=True)

    # Split up and add to existing csvs
    partition_new_notebooks = np.array_split(
        new_notebooks,
        10
    )
    for i in range(10):
        new_notebooks_i = partition_new_notebooks[i]
        old_notebooks_i = all_notebooks[i]
        notebooks_i = pd.concat([old_notebooks_i, new_notebooks_i])
        if local:
            notebooks_i.to_csv('../csv/notebooks1b_{0}.csv'.format(i))
        else:
            df_to_s3(notebooks_i, 'csv/notebooks1b_{0}.csv'.format(i))

        repos_i = repos[
            repos.repo_id.isin(notebooks_i.repo_id)
        ].reset_index(drop=True)
        if local:
            repos_i.to_csv('../csv/repos1b_{0}.csv'.format(i))
        else:
            df_to_s3(repos_i, 'csv/repos1b_{0}.csv'.format(i))
        
        owners_i = owners[
            owners.owner_id.isin(notebooks_i.owner_id)
        ].reset_index(drop=True)
        if local:
            owners_i.to_csv('../csv/owners1b_{0}.csv'.format(i))
        else:
            df_to_s3(owners_i, 'csv/owners1b_{0}.csv'.format(i))
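A minimal sketch of how these two splitting functions might be driven, assuming the stage-1 CSVs produced by clean_metadata (next example) already exist on S3; the driver below is hypothetical and not part of the original module:

if __name__ == "__main__":
    # Hypothetical driver: load the stage-1 CSVs and either create the
    # ten partitions from scratch or fold new notebooks into them.
    notebooks = s3_to_df("csv/notebooks1.csv")
    repos = s3_to_df("csv/repos1.csv")
    owners = s3_to_df("csv/owners1.csv")
    split_csvs(notebooks, repos, owners, local=False)
    # update_split_csvs(notebooks, repos, owners, local=False)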
Example #3
def clean_metadata(num_needed, updating, local):
    """ 
    Extract information from metadata JSON files and save to CSVs. 
    Equivalent to Adam's 1_nb_metadata_cleaning.ipynb.
    """

    try:
        if local:
            # Mirror the save paths used at the end of this function.
            notebooks_done = pd.read_csv("{0}/notebooks1.csv".format(PATH))
            owners_done = pd.read_csv("{0}/owners1.csv".format(PATH))
            repos_done = pd.read_csv("{0}/repos1.csv".format(PATH))
        else:
            notebooks_done = s3_to_df("csv/notebooks1.csv")
            owners_done = s3_to_df("csv/owners1.csv")
            repos_done = s3_to_df("csv/repos1.csv")

        notebook_files_done = set(notebooks_done.file)
        owner_ids_done = set(owners_done.owner_id)
        repo_ids_done = set(repos_done.repo_id)

        print(
            'Metadata already processed for {0} notebooks, {1} owners, and {2} repos.'
            .format(len(notebook_files_done), len(owner_ids_done),
                    len(repo_ids_done)))

    except Exception:
        notebook_files_done = set()
        owner_ids_done = set()
        repo_ids_done = set()
        notebooks_done = pd.DataFrame()
        owners_done = pd.DataFrame()
        repos_done = pd.DataFrame()

        print("Metadata not processed for any files.")

    # Get all query files.
    if local:
        nb_search_files = os.listdir(JSON_PATH)
    else:
        nb_search_files = list_s3_dir('json/')

    # Sort query files by size then by page number.
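    # (Assumption) Query file names follow a pattern like
    # "nb_search_<minsize>..<maxsize>_p<page>.json", e.g.
    # "nb_search_0..100_p1.json", so component [2] carries the size
    # range and component [3] carries the page number.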
    nb_search_files = sorted(nb_search_files,
                             key=lambda x:
                             (int(x.split("_")[2].split("..")[0]),
                              int(x.split("_")[3][1:].split(".")[0])))

    debug_print("We have {0} query files.".format(len(nb_search_files)))

    notebooks = {}
    repos = {}
    owners = {}

    for j, json_file_name in enumerate(nb_search_files):
        # Keep track of progress.
        if (j + 1) % (COUNT_TRIGGER // 100) == 0 or j + 1 == len(nb_search_files):
            debug_print("{0} / {1} data files processed".format(
                j + 1, len(nb_search_files)))

        # Parse the file name to get query size and page.
        file_components = json_file_name.replace(".json", "").split("_")
        filesize = file_components[2]
        query_page = int(file_components[3][1:])

        if local:
            with open(JSON_PATH + json_file_name, "r") as json_file:
                file_dict = json.load(json_file)
        else:
            obj = s3.Object("notebook-research",
                            "json/{0}".format(json_file_name))
            file_dict = json.loads(obj.get()["Body"].read().decode("UTF-8"))

        # Report incomplete results.
        if ("incomplete_results" in file_dict
                and file_dict["incomplete_results"]):
            msg = "{0} has incomplete results".format(json_file_name)
            write_to_log("../logs/nb_metadata_cleaning_log.txt", msg)

        days_since = file_dict["days_since"]
        if "items" in file_dict:
            if len(file_dict["items"]) == 0:
                msg = "{0} has 0 items".format(json_file_name)
                write_to_log("../logs/nb_metadata_cleaning_log.txt", msg)

            else:
                # Save data for each item.
                for i in range(len(file_dict["items"])):
                    item = file_dict["items"][i]
                    item_repo = item["repository"]
                    repo_id = item_repo["id"]
                    owner_id = item_repo["owner"]["id"]

                    # Don"t save forked notebooks.
                    if item_repo["fork"]:
                        continue

                    # Full path is unique for each file.
                    name = "{0}/{1}/{2}".format(item_repo["owner"]["login"],
                                                item_repo["name"],
                                                item["path"]).replace(
                                                    "/", "..")

                    if name not in notebook_files_done:
                        notebook = {
                            "file": name,
                            "html_url": item["html_url"],
                            "name": item["name"],
                            "path": item["path"],
                            "repo_id": repo_id,
                            "owner_id": owner_id,
                            "filesize": filesize,
                            "query_page": query_page,
                            "days_since": days_since
                        }
                        notebooks[name] = notebook

                    if repo_id not in repos and repo_id not in repo_ids_done:
                        repo = {
                            "repo_name": item_repo["name"],
                            "owner_id": owner_id,
                            "repo_description": item_repo["description"],
                            "repo_fork": item_repo["fork"],
                            "repo_html_url": item_repo["html_url"],
                            "repo_private": item_repo["private"],
                        }
                        repos[repo_id] = repo

                    if owner_id not in owners and owner_id not in owner_ids_done:
                        owner = {
                            "owner_html_url": item_repo["owner"]["html_url"],
                            "owner_login": item_repo["owner"]["login"],
                        }
                        owners[owner_id] = owner

                    # If updating, we don't always need the full page.
                    if updating and len(notebooks) == num_needed:
                        break
        else:
            msg = "{0} has no items object".format(json_file_name)
            write_to_log("../logs/nb_metadata_cleaning_log.txt", msg)

        if updating and len(notebooks) == num_needed:
            break

    # Display status
    debug_print(("\nAfter processing all query files, "
                 "we have {0} new notebooks.").format(len(notebooks)))
    debug_print("Written by {0} owners.".format(len(owners)))
    debug_print("Held in {0} repositories.".format(len(repos)))

    # Translate dictionaries to DataFrames and save to CSV.
    # Order by days_since; for duplicate files keep the most recent
    # record (keep="last", i.e. the one found more days since 1-1-19).
    notebooks_df = pd.DataFrame(notebooks).transpose()\
        .sort_values(by=["days_since", "file"]).drop_duplicates(
            subset=["file"],
            keep="last"
        )
    owners_df = pd.DataFrame(owners).transpose().reset_index().rename(
        columns={"index": "owner_id"}, index=str)
    repos_df = pd.DataFrame(repos).transpose().reset_index().rename(
        columns={"index": "repo_id"}, index=str)

    if local:
        pd.concat([notebooks_df,
                   notebooks_done]).to_csv("{0}/notebooks1.csv".format(PATH),
                                           index=False)
        pd.concat([owners_df,
                   owners_done]).to_csv("{0}/owners1.csv".format(PATH),
                                        index=False)
        pd.concat([repos_df, repos_done]).to_csv("{0}/repos1.csv".format(PATH),
                                                 index=False)
    else:
        df_to_s3(pd.concat([notebooks_df, notebooks_done]),
                 "csv/notebooks1.csv")
        df_to_s3(pd.concat([owners_df, owners_done]), "csv/owners1.csv")
        df_to_s3(pd.concat([repos_df, repos_done]), "csv/repos1.csv")
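For reference, this is the structure clean_metadata expects inside each query JSON file, inferred from the field accesses above (essentially a GitHub code-search response plus a days_since field added by the querying step); the values below are only illustrative:

example_query_file = {
    "incomplete_results": False,
    "days_since": 42,
    "items": [
        {
            "name": "analysis.ipynb",
            "path": "notebooks/analysis.ipynb",
            "html_url": "https://github.com/some-owner/some-repo/blob/master/notebooks/analysis.ipynb",
            "repository": {
                "id": 123456,
                "name": "some-repo",
                "description": "An illustrative repository",
                "fork": False,
                "private": False,
                "html_url": "https://github.com/some-owner/some-repo",
                "owner": {
                    "id": 7890,
                    "login": "some-owner",
                    "html_url": "https://github.com/some-owner",
                },
            },
        },
    ],
}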
Example #4
def main():
    # Parse command line arguments.
    parser = argparse.ArgumentParser()
    parser.add_argument("--local",
                        action="store_const",
                        dest="local",
                        const=True,
                        default=False,
                        help="Stores results locally instead of using S3.")
    parser.add_argument("--worker",
                        metavar="N",
                        type=int,
                        help=("GITHUB_TOKEN assigned to these sizes (workers "
                              "sorted in alphabetical order: {0}).").format(
                                  list(TOKENS.keys())))
    args = parser.parse_args()
    local = args.local
    worker = args.worker

    # If running in parallel, mark csv files with the worker number.
    global EXTENSION
    EXTENSION = '_{0}'.format(worker) if worker is not None else ''
    print('EXTENSION', EXTENSION)

    start = datetime.datetime.now()

    # List of saved CSV files.
    if local:
        current_csvs = set(os.listdir(PATH))
    else:
        current_csvs = list_s3_dir(S3_PATH)

    # Open basic data from json files (files created in query_git.py).
    if set([
            "notebooks1{0}.csv".format(EXTENSION),
            "repos1{0}.csv".format(EXTENSION),
            "owners1{0}.csv".format(EXTENSION)
    ]).issubset(current_csvs):
        notebooks1 = get_df("notebooks1{0}.csv".format(EXTENSION), local)
        owners1 = get_df("owners1{0}.csv".format(EXTENSION), local)
        repos1 = get_df("repos1{0}.csv".format(EXTENSION), local)
    else:
        debug_print("Notebooks1, Owners1, and Repos1 were not found.")
        sys.exit(0)

    debug_print("Notebooks1, Owners1, and Repos1 were found and opened." +
                BREAK)

    ### Add information for repositories and owners. ##################
    save = False
    if not set([
            "owners2{0}.csv".format(EXTENSION),
            "repos2{0}.csv".format(EXTENSION)
    ]).issubset(current_csvs):
        owners2, repos2 = update_owners_repos(owners1, repos1, local)
        save = True
    else:
        try:
            owners2_old = get_df("owners2{0}.csv".format(EXTENSION), local)
            repos2_old = get_df("repos2{0}.csv".format(EXTENSION), local)
            debug_print(
                "Found and opened data for {0} owners and {1} repos.".format(
                    len(owners2_old), len(repos2_old)))
        except Exception:
            owners2_old = []
            repos2_old = []

        if len(owners2_old) > 0 and len(repos2_old) > 0:
            owners1_new = owners1[~owners1.owner_id.isin(owners2_old.owner_id)]
            repos1_new = repos1[~repos1.repo_id.isin(repos2_old.repo_id)]
        else:
            owners1_new = owners1
            repos1_new = repos1

        debug_print("Collecting data for {0} owners and {1} repos.".format(
            len(owners1_new), len(repos1_new)))

        if len(owners1_new) > 0 and len(repos1_new) > 0:
            owners2_new, repos2_new = update_owners_repos(
                owners1_new, repos1_new, local)

            if len(owners2_new) > 0 and len(repos2_new) > 0:
                owners2 = pd.concat([owners2_old, owners2_new
                                     ]).drop_duplicates(subset='owner_id')
                repos2 = pd.concat([repos2_old, repos2_new
                                    ]).drop_duplicates(subset='repo_id')
            else:
                owners2 = owners2_old
                repos2 = repos2_old
        else:
            owners2 = owners2_old
            repos2 = repos2_old

    ## Save
    if save:
        debug_print("Saving combined data for {0} owners and {1} repos".format(
            len(owners2), len(repos2)))
        if local:
            owners2.to_csv("{0}/owners2{1}.csv".format(PATH, EXTENSION),
                           index=False)
            repos2.to_csv("{0}/repos2{1}.csv".format(PATH, EXTENSION),
                          index=False)
        else:
            df_to_s3(owners2, "{0}/owners2{1}.csv".format(S3_PATH, EXTENSION))
            df_to_s3(repos2, "{0}/repos2{1}.csv".format(S3_PATH, EXTENSION))
        debug_print("Owners2 and Repos2 were created and saved.\n" + BREAK)

    ## Add data on cells within each notebook. #######################
    if not set(["notebooks2{0}.csv".format(EXTENSION)]).issubset(current_csvs):
        print("Notebooks2 not found, creating from scratch.")
        get_all_nb_cells(notebooks1, local, 0)
    else:
        # Get existing data.
        try:
            notebooks2_old = get_df("notebooks2{0}.csv".format(EXTENSION),
                                    local)
            debug_print(
                "Found and opened notebook data for {0} notebooks.".format(
                    len(notebooks2_old)))
        except Exception as e:
            notebooks2_old = []
            print("Notebooks2 could not be opened, creating from scratch.")
            print(type(e), e)

        # Isolate rows of notebooks1 corresponding to new notebooks
        if len(notebooks2_old) > 0:
            notebooks1_new = notebooks1[
                ~notebooks1.file.isin(notebooks2_old.file)]
        else:
            notebooks1_new = notebooks1

        debug_print("Collecting data for {0} notebooks.".format(
            len(notebooks1_new)))

        # If there are new notebooks, add cell data
        if len(notebooks1_new) > 0:
            get_all_nb_cells(notebooks1_new, local, len(notebooks2_old))

        del notebooks2_old

    # Check time and report status.
    end = datetime.datetime.now()
    debug_print("TOTAL TIME: {0}".format(end - start))
Example #5
def get_all_nb_cells(notebooks, local, done):
    """ Get cell and notebook data for each notebook. """
    new_nb_info = {}
    all_cells_info = {}
    missing = []

    for count, row in notebooks.iterrows():
        # Track progress.
        file_name = row["file"]
        data = None
        if count % COUNT_TRIGGER == 0 or count == len(notebooks) - 1:
            print("{0} / {1} notebooks processed for cell data".format(
                count,
                len(notebooks) + done))

            # Save data and reset. (In chunks to avoid MemoryError).
            if count > 0:
                # Transform data to DataFrame.
                notebooks_temp = pd.DataFrame(new_nb_info).transpose()
                cells_temp = pd.DataFrame(
                    all_cells_info).transpose().reset_index(drop=True)

                # Save data to CSV.
                try:
                    if local:
                        notebooks_temp.to_csv(
                            "{0}/notebooks2_{1}_{2}.csv".format(
                                PATH, EXTENSION, count // COUNT_TRIGGER),
                            index=False)
                        cells_temp.to_csv(
                            "{0}/cells1_{1}_{2}.csv".format(
                                PATH, EXTENSION, count // COUNT_TRIGGER),
                            index=False)
                    else:
                        df_to_s3(
                            notebooks_temp,
                            "{0}/notebooks2_{1}_{2}.csv".format(
                                S3_PATH, EXTENSION, count // COUNT_TRIGGER))
                        df_to_s3(
                            cells_temp,
                            "{0}/cells1_{1}_{2}.csv".format(
                                S3_PATH, EXTENSION, count // COUNT_TRIGGER))

                except MemoryError:
                    # Saving in one piece failed; split the data into
                    # smaller chunks and save each chunk separately.
                    nb_chunks = np.array_split(notebooks_temp, 4)
                    cell_chunks = np.array_split(cells_temp, 8)

                    for k, nb_chunk in enumerate(nb_chunks, start=1):
                        name = "notebooks2_{0}_{1}_{2}.csv".format(
                            EXTENSION, count // COUNT_TRIGGER, k)
                        if local:
                            nb_chunk.to_csv("{0}/{1}".format(PATH, name),
                                            index=False)
                        else:
                            df_to_s3(nb_chunk,
                                     "{0}/{1}".format(S3_PATH, name))

                    for k, cell_chunk in enumerate(cell_chunks, start=1):
                        name = "cells1_{0}_{1}_{2}.csv".format(
                            EXTENSION, count // COUNT_TRIGGER, k)
                        if local:
                            cell_chunk.to_csv("{0}/{1}".format(PATH, name),
                                              index=False)
                        else:
                            df_to_s3(cell_chunk,
                                     "{0}/{1}".format(S3_PATH, name))

                # Empty current dictionaries.
                new_nb_info = {}
                all_cells_info = {}
                print("CSVs saved")

        # Initialize row of data.
        nb_info = {
            "file": file_name,
            "google_collab": False,
            "nbformat": "",
            "nbformat_minor": "",
            "num_cells": 0,
            "kernel_lang": "",
            "kernel_name": "",
            "lang_name": "",
            "lang_version": ""
        }

        # Open notebooks as json.
        try:
            obj = s3.Object("notebook-research",
                            "notebooks/{0}".format(file_name))
            data = json.loads(obj.get()["Body"].read().decode("UTF-8"))
        except Exception:
            # Report missed files.
            msg = "Notebook {0} did not open.".format(file_name)
            write_to_log("../logs/repo_metadata_cleaning_log.txt", msg)
            missing.append(file_name)

            # Add row with missing values.
            if file_name not in new_nb_info:
                new_nb_info[file_name] = nb_info

            continue

        # If data was able to load as JSON, extract information.
        if data and isinstance(data, dict):
            keys = data.keys()

            # Get nb top level format metadata.
            if "nbformat" in keys:
                nb_info["nbformat"] = data["nbformat"]
            if "nbformat_minor" in keys:
                nb_info["nbformat_minor"] = data["nbformat_minor"]

            # Get info from the metadata dictionary.
            if ("metadata" in keys and data["metadata"] != None
                    and isinstance(data["metadata"], dict)):
                metadata_keys = data["metadata"].keys()

                # Access language data.
                if ("kernelspec" in metadata_keys
                        and data["metadata"]["kernelspec"] != None
                        and isinstance(data["metadata"]["kernelspec"], dict)):
                    kernel_keys = data["metadata"]["kernelspec"].keys()

                    # If it is a Google Colab notebook, only Python 2.7 or 3.6 are possible.
                    if "colab" in metadata_keys:
                        nb_info["google_collab"] = True
                        if ("name" in kernel_keys
                                and "display_name" in kernel_keys):
                            nb_info["kernel_lang"] = data["metadata"][
                                "kernelspec"]["name"]
                            nb_info["kernel_name"] = data["metadata"][
                                "kernelspec"]["display_name"]
                            if nb_info["kernel_lang"] == "python3":
                                nb_info["lang_name"] = "python"
                                nb_info["lang_version"] = "3.6"
                            elif nb_info["kernel_lang"] == "python2":
                                nb_info["lang_name"] = "python"
                                nb_info["lang_version"] = "2.7"

                    # Not Google colab, access kernel language and display name.
                    else:
                        if "language" in kernel_keys:
                            nb_info["kernel_lang"] = data["metadata"][
                                "kernelspec"]["language"]
                        if "display_name" in kernel_keys:
                            nb_info["kernel_name"] = data["metadata"][
                                "kernelspec"]["display_name"]

                # Access language info.
                if ("language_info" in metadata_keys
                        and "colab" not in metadata_keys):
                    lang_keys = data["metadata"]["language_info"].keys()
                    if "name" in lang_keys and "colab" not in metadata_keys:
                        nb_info["lang_name"] = data["metadata"][
                            "language_info"]["name"]
                    if "version" in lang_keys and "colab" not in metadata_keys:
                        nb_info["lang_version"] = data["metadata"][
                            "language_info"]["version"]
                elif "language" in metadata_keys:
                    nb_info["lang_name"] = data["metadata"]["language"]

            # Get information about individual cells.
            cells_info = {}
            if "cells" in keys:
                nb_info["num_cells"] = len(data["cells"])
                cell_id = 0
                for cell in data["cells"]:
                    cell_info, nb_language = get_single_cell(
                        cell_id, file_name, cell, nb_info["lang_name"])

                    if nb_info["lang_name"] == "":
                        nb_info["lang_name"] = nb_language.lower()

                    if (file_name, cell_id) not in cells_info:
                        cells_info[(file_name, cell_id)] = cell_info

                    cell_id += 1

            elif "worksheets" in keys:
                cell_id = 0
                for w in data["worksheets"]:
                    for cell in w["cells"]:
                        cell_info, nb_language = get_single_cell(
                            cell_id, file_name, cell, nb_info["lang_name"])

                        if nb_info["lang_name"] == "":
                            nb_info["lang_name"] = nb_language.lower()

                        if (file_name, cell_id) not in cells_info:
                            cells_info[(file_name, cell_id)] = cell_info

                        cell_id += 1

        all_cells_info.update(cells_info)

        if file_name not in new_nb_info:
            new_nb_info[file_name] = nb_info

    debug_print("{0} notebooks are missing cell data.".format(len(missing)))
    return new_nb_info, all_cells_info
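get_single_cell is also defined outside these examples; judging from the call sites above, it takes a cell id, the notebook file name, the cell dictionary, and the notebook language, and returns a (cell_info, nb_language) pair. A hypothetical stub with that interface:

def get_single_cell(cell_id, file_name, cell, nb_language):
    # Hypothetical stub matching the call sites above: record a few
    # basic fields for one cell and pass the notebook language through.
    cell_info = {
        "file": file_name,
        "cell_id": cell_id,
        "cell_type": cell.get("cell_type", ""),
        "num_lines": len(cell.get("source", [])),
    }
    return cell_info, nb_language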