Code Example #1
import argparse
from random import randint

import numpy as np
import torch

# Project-local helpers (ES, DataDotGovScraper, SearchClient,
# get_cleaned_dataframe, get_cleaned_dataframe_by_url, and
# one_hot_encode_and_tensorize) are assumed to be importable from
# elsewhere in the package; their module paths are not shown in the
# original snippet.


def main():
    # Create the parser object.
    parser = argparse.ArgumentParser(
        description="nucleai CLI for searching, ingesting, and downloading "
        "datasets.")

    # Define the arguments for the parser object.
    parser.add_argument(
        "-ots",
        "--one_time_setup",
        nargs="*",
        help="Sets up the metadocument index in Elasticsearch. This only "
        "needs to be done once.")

    parser.add_argument(
        "-s",
        "--search",
        nargs=1,
        help="Searches the metadocument index by keywords. Returns the top "
        "10 matches via reverse indexing.")

    parser.add_argument(
        "-d",
        "--download",
        type=str,
        nargs=1,
        help="Downloads and pickles data based on the dataset ID.")

    parser.add_argument(
        "-dt",
        "--download_and_tensorize",
        type=str,
        nargs=1,
        help="Downloads, tensorizes, and pickles data based on the dataset "
        "ID.")

    parser.add_argument(
        "-dttws",
        "--download_tensorize_two_way_split",
        type=str,
        nargs=1,
        help="Downloads, tensorizes, splits across two tensors, and pickles "
        "data based on the dataset ID.")

    # The following arguments should probably live in a separate package,
    # since they exist to support ingestion.
    parser.add_argument(
        "-iq",
        "--ingest_by_query",
        type=str,
        nargs=1,
        help="Ingests data from data.gov into Elasticsearch by keywords.")

    parser.add_argument(
        "-ip",
        "--ingest_by_package",
        type=str,
        nargs=1,
        help="Ingests data from data.gov into Elasticsearch by the name of "
        "the data.gov package.")

    # Parse the arguments from the command line.
    args = parser.parse_args()

    # Call the appropriate handler depending on which argument was given.
    if args.one_time_setup is not None:
        print("INFO: Setting up metadocument index.")
        ES().init_metadocument_index()
    elif args.ingest_by_query is not None:
        query = args.ingest_by_query[0]
        print("INFO: Ingesting all datasets related to %s." % query)
        scraper = DataDotGovScraper(query)
        packages = scraper.get_packages()
        for package in packages:
            try:
                scraper.ingest_dataset(package)
            except Exception as exc:
                print("ERROR: Could not ingest %s: %s" % (package, exc))
    elif args.ingest_by_package is not None:
        package = args.ingest_by_package[0]
        print("INFO: Ingesting dataset called %s." % package)
        DataDotGovScraper(None).ingest_dataset(package)
    elif args.search is not None:
        query = args.search[0]
        print("INFO: Searching our database for %s." % query)
        print(SearchClient().search_by_partial_match(query, 10))
    elif args.download is not None:
        dataset_id = args.download[0]
        cleaned_dataframe = get_cleaned_dataframe(dataset_id)

        print("INFO: Pickling %s dataset." % dataset_id)
        cleaned_dataframe.to_pickle("%s_dataframe.pkl" % dataset_id)
    elif args.download_and_tensorize is not None:
        dataset_id = args.download_and_tensorize[0]
        cleaned_dataframe = get_cleaned_dataframe(dataset_id)
        torch_tensor = one_hot_encode_and_tensorize(dataset_id,
                                                    cleaned_dataframe)

        print("INFO: Pickling %s dataset." % dataset_id)
        torch.save(torch_tensor, "%s.tensor" % dataset_id)
    elif args.download_tensorize_two_way_split is not None:
        # For now, these lines are commented out since Elasticsearch is down:
        # dataset_id = args.download[0]
        # cleaned_dataframe = get_cleaned_dataframe(dataset_id)

        # Temporarily, dataset_id refers to the URL that was passed in.
        dataset_id = args.download_tensorize_two_way_split[0]
        cleaned_dataframe = get_cleaned_dataframe_by_url(dataset_id)

        num_rows = cleaned_dataframe.shape[0]
        num_columns = len(cleaned_dataframe.columns)

        # Use integer division: random.randint requires integer bounds, and
        # num_rows / 2 would be a float in Python 3.
        first_split_rows_sample_size = randint(num_rows // 2 - 1, num_rows)
        second_split_rows_sample_size = randint(num_rows // 2 - 1, num_rows)

        first_split_columns_sample_size = randint(num_columns // 2 - 1,
                                                  num_columns)
        second_split_columns_sample_size = randint(num_columns // 2 - 1,
                                                   num_columns)

        # Randomly subsample rows and columns, then shuffle each column
        # independently, yielding two overlapping views of the dataset.
        first_split_dataframe = cleaned_dataframe.sample(
            first_split_rows_sample_size).sample(
                first_split_columns_sample_size,
                axis=1).apply(np.random.permutation)
        second_split_dataframe = cleaned_dataframe.sample(
            second_split_rows_sample_size).sample(
                second_split_columns_sample_size,
                axis=1).apply(np.random.permutation)

        first_torch_tensor = one_hot_encode_and_tensorize(
            dataset_id, first_split_dataframe)
        second_torch_tensor = one_hot_encode_and_tensorize(
            dataset_id, second_split_dataframe)

        print("INFO: Pickling %s dataset." % dataset_id)
        torch.save(first_torch_tensor, "tmp-%s-1.tensor" % hash(dataset_id))
        torch.save(second_torch_tensor, "tmp-%s-2.tensor" % hash(dataset_id))
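
A minimal entry-point guard so the CLI runs when the module is executed directly; the example invocations in the comments assume the file is saved as cli.py (a hypothetical filename):

if __name__ == "__main__":
    # Example invocations (cli.py is a hypothetical filename):
    #   python cli.py --one_time_setup
    #   python cli.py --search "solar energy"
    #   python cli.py --download <dataset_id>
    main()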
Code Example #2
class SearchClient:  # Class name assumed from its use in Code Example #1.
    def __init__(self):
        self.es = ES()  # Hold a reusable Elasticsearch client.
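
A brief usage sketch for the snippet above, assuming it belongs to the SearchClient class used in Code Example #1 and that an Elasticsearch instance is reachable:

# Hypothetical usage; search_by_partial_match is the method called in
# Code Example #1 and is assumed to be defined on this class.
client = SearchClient()
print(client.search_by_partial_match("education", 10))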