Ejemplo n.º 1
0
def get_parser() -> argparse.ArgumentParser:
    """
    Build and return the command line argument parser.

    The parser exposes a single ``extract`` subcommand whose handler
    (``code2vec_extract_features``) is stored via ``set_defaults``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatterNoNone)

    # sourced.engine args
    subparsers = parser.add_subparsers(help="Commands", dest="command")

    extract_parser = subparsers.add_parser(
        "extract",
        help="Extract features from input repositories",
        formatter_class=ArgumentDefaultsHelpFormatterNoNone)
    extract_parser.set_defaults(handler=code2vec_extract_features)
    add_repo2_args(extract_parser)

    # code2vec specific args: the two path-shape limits share the same shape,
    # so register them from a small table.
    for flag, default_value, description in (
            ("--max-length", 5, "Max path length."),
            ("--max-width", 2, "Max path width.")):
        extract_parser.add_argument(flag,
                                    type=int,
                                    default=default_value,
                                    help=description,
                                    required=False)
    extract_parser.add_argument(
        "-o",
        "--output",
        type=str,
        required=True,
        help="Output path for the Code2VecFeatures model")
    return parser
Ejemplo n.º 2
0
def main():
    """Parse the command line arguments and run the code2vec pipeline."""
    parser = argparse.ArgumentParser()

    # sourced.engine args
    add_repo2_args(parser)

    # code2vec specific args
    parser.add_argument("-g",
                        "--max_length",
                        required=False,
                        type=int,
                        default=5,
                        help="Max path length.")
    parser.add_argument("-w",
                        "--max_width",
                        required=False,
                        type=int,
                        default=2,
                        help="Max path width.")

    code2vec(parser.parse_args())
Ejemplo n.º 3
0
def get_parser() -> argparse.ArgumentParser:
    """
    Create the cmdline argument parser.

    Builds the top-level parser with one subcommand per pipeline stage.
    Each subparser stores its command function in the ``handler`` default,
    so callers can dispatch with ``args.handler(args)`` after parsing.
    """
    parser = argparse.ArgumentParser(
        formatter_class=args.ArgumentDefaultsHelpFormatterNoNone)
    # logging._nameToLevel is private but is the usual way to offer every
    # level name as a choice.
    parser.add_argument("--log-level",
                        default="INFO",
                        choices=logging._nameToLevel,
                        help="Logging verbosity.")
    # Create and construct subparsers
    subparsers = parser.add_subparsers(help="Commands", dest="command")

    def add_parser(name, help_message):
        # Shortcut that applies the shared formatter class to a subcommand.
        return subparsers.add_parser(
            name,
            help=help_message,
            formatter_class=args.ArgumentDefaultsHelpFormatterNoNone)

    # ------------------------------------------------------------------------
    preprocessing_parser = subparsers.add_parser(
        "preprocrepos",
        help="Convert siva to parquet files with extracted information.")
    preprocessing_parser.set_defaults(handler=cmd.preprocess_repos)
    preprocessing_parser.add_argument(
        "-x",
        "--mode",
        choices=Moder.Options.__all__,
        default="file",
        help="What to extract from repositories.")
    args.add_repo2_args(preprocessing_parser)
    preprocessing_parser.add_argument(
        "-o",
        "--output",
        required=True,
        help="[OUT] Path to the parquet files with bag batches.")
    default_fields = ("blob_id", "repository_id", "content", "path",
                      "commit_hash", "uast", "lang")
    preprocessing_parser.add_argument("-f",
                                      "--fields",
                                      nargs="+",
                                      default=default_fields,
                                      help="Fields to save.")
    # ------------------------------------------------------------------------
    repos2bow_parser = add_parser(
        "repos2bow", "Convert source code to the bag-of-words model.")
    repos2bow_parser.set_defaults(handler=cmd.repos2bow)
    args.add_df_args(repos2bow_parser)
    args.add_repo2_args(repos2bow_parser)
    args.add_feature_args(repos2bow_parser)
    args.add_bow_args(repos2bow_parser)
    args.add_repartitioner_arg(repos2bow_parser)
    args.add_cached_index_arg(repos2bow_parser)
    # ------------------------------------------------------------------------
    repos2bow_index_parser = add_parser(
        "repos2bow_index",
        "Creates the index, quant and docfreq model of the bag-of-words model."
    )
    repos2bow_index_parser.set_defaults(handler=cmd.repos2bow_index)
    args.add_df_args(repos2bow_index_parser)
    args.add_repo2_args(repos2bow_index_parser)
    args.add_feature_args(repos2bow_index_parser)
    args.add_repartitioner_arg(repos2bow_index_parser)
    args.add_cached_index_arg(repos2bow_index_parser, create=True)
    # ------------------------------------------------------------------------
    repos2df_parser = add_parser(
        "repos2df",
        "Calculate document frequencies of features extracted from source code."
    )
    repos2df_parser.set_defaults(handler=cmd.repos2df)
    args.add_df_args(repos2df_parser)
    args.add_repo2_args(repos2df_parser)
    args.add_feature_args(repos2df_parser)
    # ------------------------------------------------------------------------
    repos2ids_parser = subparsers.add_parser(
        "repos2ids", help="Convert source code to a bag of identifiers.")
    repos2ids_parser.set_defaults(handler=cmd.repos2ids)
    args.add_repo2_args(repos2ids_parser)
    args.add_split_stem_arg(repos2ids_parser)
    args.add_repartitioner_arg(repos2ids_parser)
    repos2ids_parser.add_argument(
        "-o",
        "--output",
        required=True,
        help="[OUT] output path to the CSV file with identifiers.")
    repos2ids_parser.add_argument(
        "--idfreq",
        action="store_true",
        help="Adds identifier frequencies to the output CSV file. "
        "num_repos is the number of repositories where the identifier appears in. "
        "num_files is the number of files where the identifier appears in. "
        "num_occ is the total number of occurrences of the identifier.")
    # ------------------------------------------------------------------------
    repos2coocc_parser = add_parser(
        "repos2coocc",
        "Convert source code to the sparse co-occurrence matrix of identifiers."
    )
    repos2coocc_parser.set_defaults(handler=cmd.repos2coocc)
    args.add_df_args(repos2coocc_parser)
    args.add_repo2_args(repos2coocc_parser)
    args.add_split_stem_arg(repos2coocc_parser)
    args.add_repartitioner_arg(repos2coocc_parser)
    repos2coocc_parser.add_argument(
        "-o",
        "--output",
        required=True,
        help="[OUT] Path to the Cooccurrences model.")

    # ------------------------------------------------------------------------
    repos2roles_and_ids = add_parser(
        "repos2roleids",
        "Converts a UAST to a list of pairs, where pair is a role and "
        "identifier. Role is merged generic roles where identifier was found.")
    repos2roles_and_ids.set_defaults(handler=cmd.repos2roles_and_ids)
    args.add_repo2_args(repos2roles_and_ids)
    args.add_split_stem_arg(repos2roles_and_ids)
    repos2roles_and_ids.add_argument(
        "-o",
        "--output",
        required=True,
        help="[OUT] Path to the directory where spark should store the result. "
        "Inside the directory you find result is csv format, status file and sumcheck files."
    )
    # ------------------------------------------------------------------------
    repos2identifier_distance = add_parser(
        "repos2id_distance", "Converts a UAST to a list of identifier pairs "
        "and distance between them.")
    repos2identifier_distance.set_defaults(handler=cmd.repos2id_distance)
    args.add_repo2_args(repos2identifier_distance)
    args.add_split_stem_arg(repos2identifier_distance)
    repos2identifier_distance.add_argument(
        "-t",
        "--type",
        required=True,
        choices=extractors.IdentifierDistance.DistanceType.All,
        help="Distance type.")
    repos2identifier_distance.add_argument(
        "--max-distance",
        default=extractors.IdentifierDistance.DEFAULT_MAX_DISTANCE,
        type=int,
        help="Maximum distance to save.")
    repos2identifier_distance.add_argument("-x",
                                           "--mode",
                                           choices=("file", "func"),
                                           default="file",
                                           help="What to extract from "
                                           "repositories.")
    repos2identifier_distance.add_argument(
        "-o",
        "--output",
        required=True,
        help="[OUT] Path to the directory where spark should store the result. "
        "Inside the directory you find result is csv format, status file and sumcheck files."
    )
    # ------------------------------------------------------------------------
    repos2id_sequence = add_parser(
        "repos2idseq",
        "Converts a UAST to sequence of identifiers sorted by order of appearance."
    )
    repos2id_sequence.set_defaults(handler=cmd.repos2id_sequence)
    args.add_repo2_args(repos2id_sequence)
    args.add_split_stem_arg(repos2id_sequence)
    repos2id_sequence.add_argument(
        "--skip-docname",
        default=False,
        action="store_true",
        help="Do not save document name in CSV file, only identifier sequence."
    )
    repos2id_sequence.add_argument("-x",
                                   "--mode",
                                   choices=("file", "func"),
                                   default="file",
                                   help="What to extract from repositories.")
    repos2id_sequence.add_argument(
        "-o",
        "--output",
        required=True,
        help="[OUT] Path to the directory where spark should store the result. "
        "Inside the directory you find result is csv format, status file and sumcheck files."
    )
    # ------------------------------------------------------------------------
    preproc_parser = add_parser(
        "id2vec-preproc",
        "Convert a sparse co-occurrence matrix to the Swivel shards.")
    preproc_parser.set_defaults(handler=cmd.id2vec_preprocess)
    args.add_df_args(preproc_parser)
    preproc_parser.add_argument("-s",
                                "--shard-size",
                                default=4096,
                                type=int,
                                help="The shard (submatrix) size.")
    preproc_parser.add_argument(
        "-i", "--input", help="Concurrence model produced by repos2coocc.")
    preproc_parser.add_argument("-o",
                                "--output",
                                required=True,
                                help="Output directory.")
    # ------------------------------------------------------------------------
    train_parser = add_parser("id2vec-train",
                              "Train identifier embeddings using Swivel.")
    train_parser.set_defaults(handler=cmd.run_swivel)
    # Swivel's own TensorFlow flags are mirrored onto this subparser.
    mirror_tf_args(train_parser)
    # ------------------------------------------------------------------------
    id2vec_postproc_parser = add_parser(
        "id2vec-postproc",
        "Combine row and column embeddings produced by Swivel and write them to an .asdf."
    )
    id2vec_postproc_parser.set_defaults(handler=cmd.id2vec_postprocess)
    id2vec_postproc_parser.add_argument(
        "-i",
        "--swivel-data",
        required=True,
        help="Folder with swivel row and column embeddings data. "
        "You can get it using id2vec_train subcommand.")
    id2vec_postproc_parser.add_argument(
        "-o",
        "--output",
        required=True,
        help="Output directory for Id2Vec model.")
    # ------------------------------------------------------------------------
    id2vec_project_parser = add_parser(
        "id2vec-project", "Present id2vec model in Tensorflow Projector.")
    id2vec_project_parser.set_defaults(handler=cmd.id2vec_project)
    args.add_df_args(id2vec_project_parser, required=False)
    id2vec_project_parser.add_argument("-i",
                                       "--input",
                                       required=True,
                                       help="id2vec model to present.")
    id2vec_project_parser.add_argument("-o",
                                       "--output",
                                       required=True,
                                       help="Projector output directory.")
    id2vec_project_parser.add_argument("--no-browser",
                                       action="store_true",
                                       help="Do not open the browser.")
    # ------------------------------------------------------------------------
    train_id_split_parser = add_parser(
        "train-id-split", "Train a neural network to split identifiers.")
    train_id_split_parser.set_defaults(handler=cmd.train_id_split)
    # common arguments for CNN/RNN models
    train_id_split_parser.add_argument(
        "-i",
        "--input",
        required=True,
        help="Path to the input data in CSV format: "
        "num_files,num_occ,num_repos,token,token_split")
    train_id_split_parser.add_argument(
        "-e",
        "--epochs",
        type=int,
        default=10,
        help="Number of training epochs. The more the better "
        "but the training time is proportional.")
    train_id_split_parser.add_argument(
        "-b",
        "--batch-size",
        type=int,
        default=500,
        help="Batch size. Higher values better utilize GPUs "
        "but may harm the convergence.")
    train_id_split_parser.add_argument("-l",
                                       "--length",
                                       type=int,
                                       default=40,
                                       help="RNN sequence length.")
    train_id_split_parser.add_argument("-o",
                                       "--output",
                                       required=True,
                                       help="Path to store the trained model.")
    train_id_split_parser.add_argument(
        "-t",
        "--test-ratio",
        type=float,
        default=0.2,
        help="Fraction of the dataset to use for evaluation.")
    train_id_split_parser.add_argument(
        "-p",
        "--padding",
        default="post",
        choices=("pre", "post"),
        help="Whether to pad before or after each sequence.")
    train_id_split_parser.add_argument(
        "--optimizer",
        default="Adam",
        choices=("RMSprop", "Adam"),
        help="Algorithm to use as an optimizer for the neural net.")
    train_id_split_parser.add_argument("--lr",
                                       default=0.001,
                                       type=float,
                                       help="Initial learning rate.")
    train_id_split_parser.add_argument(
        "--final-lr",
        default=0.00001,
        type=float,
        help="Final learning rate. The decrease from "
        "the initial learning rate is done linearly.")
    train_id_split_parser.add_argument(
        "--samples-before-report",
        type=int,
        default=5 * 10**6,
        help="Number of samples between each validation report "
        "and training updates.")
    train_id_split_parser.add_argument(
        "--val-batch-size",
        type=int,
        default=2000,
        help="Batch size for validation. "
        "It can be increased to speed up the pipeline but "
        "it proportionally increases the memory consumption.")
    train_id_split_parser.add_argument("--seed",
                                       type=int,
                                       default=1989,
                                       help="Random seed.")
    train_id_split_parser.add_argument(
        "--devices", default="0", help="Device(s) to use. '-1' means CPU.")
    # NOTE(review): the defaults below are column indices although the help
    # text says "column name" — confirm against the handler before changing.
    train_id_split_parser.add_argument(
        "--csv-identifier",
        default=3,
        help="Column name in the CSV file for the raw identifier.")
    train_id_split_parser.add_argument(
        "--csv-identifier-split",
        default=4,
        help="Column name in the CSV file for the split "
        "identifier.")
    train_id_split_parser.add_argument(
        "--include-csv-header",
        action="store_true",
        help="Treat the first line of the input CSV as a regular "
        "line.")
    train_id_split_parser.add_argument(
        "--model",
        type=str,
        choices=("RNN", "CNN"),
        required=True,
        help="Neural Network model to use to learn the identifier "
        "splitting task.")
    train_id_split_parser.add_argument(
        "-s",
        "--stack",
        default=2,
        type=int,
        help="Number of layers stacked on each other.")
    # RNN specific arguments
    train_id_split_parser.add_argument("--type-cell",
                                       default="LSTM",
                                       choices=("GRU", "LSTM", "CuDNNLSTM",
                                                "CuDNNGRU"),
                                       help="Recurrent layer type to use.")
    train_id_split_parser.add_argument("-n",
                                       "--neurons",
                                       default=256,
                                       type=int,
                                       help="Number of neurons on each layer.")
    # CNN specific arguments
    train_id_split_parser.add_argument(
        "-f",
        "--filters",
        default="64,32,16,8",
        help="Number of filters for each kernel size.")
    train_id_split_parser.add_argument("-k",
                                       "--kernel-sizes",
                                       default="2,4,8,16",
                                       help="Sizes for sliding windows.")
    train_id_split_parser.add_argument(
        "--dim-reduction",
        default=32,
        type=int,
        help="Number of 1-d kernels to reduce dimensionality "
        "after each layer.")
    # ------------------------------------------------------------------------
    bow2vw_parser = add_parser(
        "bow2vw",
        "Convert a bag-of-words model to the dataset in Vowpal Wabbit format.")
    bow2vw_parser.set_defaults(handler=cmd.bow2vw)
    bow2vw_parser.add_argument("--bow",
                               help="URL or path to a bag-of-words model.")
    bow2vw_parser.add_argument(
        "--id2vec", help="URL or path to the identifier embeddings.")
    bow2vw_parser.add_argument("-o",
                               "--output",
                               required=True,
                               help="Path to the output file.")
    # ------------------------------------------------------------------------
    bigartm_postproc_parser = add_parser(
        "bigartm2asdf",
        "Convert a human-readable BigARTM model to Modelforge format.")
    bigartm_postproc_parser.set_defaults(handler=cmd.bigartm2asdf)
    bigartm_postproc_parser.add_argument("input")
    bigartm_postproc_parser.add_argument("output")
    # ------------------------------------------------------------------------
    bigartm_parser = add_parser(
        "bigartm", "Install bigartm/bigartm to the current working directory.")
    bigartm_parser.set_defaults(handler=install_bigartm)
    bigartm_parser.add_argument(
        "--tmpdir",
        help="Store intermediate files in this directory instead of /tmp.")
    bigartm_parser.add_argument("--output",
                                default=os.getcwd(),
                                help="Output directory.")

    # ------------------------------------------------------------------------
    merge_df = add_parser("merge-df",
                          "Merge DocumentFrequencies models to a single one.")
    merge_df.set_defaults(handler=cmd.merge_df)
    args.add_min_docfreq(merge_df)
    args.add_vocabulary_size_arg(merge_df)
    merge_df.add_argument("-o",
                          "--output",
                          required=True,
                          help="Path to the merged DocumentFrequencies model.")
    merge_df.add_argument("-i",
                          "--input",
                          required=True,
                          nargs="+",
                          help="DocumentFrequencies models input files. "
                          "Use `-i -` to read input files from stdin.")
    merge_df.add_argument(
        "--ordered",
        action="store_true",
        default=False,
        help="Save OrderedDocumentFrequencies. "
        "If not specified DocumentFrequencies model will be saved")
    # ------------------------------------------------------------------------
    merge_coocc = add_parser("merge-coocc",
                             "Merge several Cooccurrences models together.")
    merge_coocc.set_defaults(handler=cmd.merge_coocc)
    add_spark_args(merge_coocc)
    merge_coocc.add_argument("-o",
                             "--output",
                             required=True,
                             help="Path to the merged Cooccurrences model.")
    merge_coocc.add_argument("-i",
                             "--input",
                             required=True,
                             help="Cooccurrences models input files. "
                             "Use `-i -` to read input files from stdin.")
    merge_coocc.add_argument(
        "--docfreq",
        required=True,
        help="[IN] Specify OrderedDocumentFrequencies model. "
        "Identifiers that are not present in the model will be ignored.")
    merge_coocc.add_argument(
        "--no-spark",
        action="store_true",
        default=False,
        help="Use the local reduction instead of PySpark. "
        "Can be faster and consume less memory if the data fits into RAM.")
    # ------------------------------------------------------------------------
    merge_bow = add_parser("merge-bow", "Merge BOW models to a single one.")
    merge_bow.set_defaults(handler=cmd.merge_bow)
    merge_bow.add_argument("-i",
                           "--input",
                           required=True,
                           nargs="+",
                           help="BOW models input files. "
                           "Use `-i -` to read input files from stdin.")
    merge_bow.add_argument("-o",
                           "--output",
                           required=True,
                           help="Path to the merged BOW model.")
    merge_bow.add_argument(
        "-f",
        "--features",
        nargs="+",
        choices=[ex.NAME for ex in extractors.__extractors__.values()],
        default=None,
        help=
        "To keep only specific features, if not specified all will be kept.")
    # ------------------------------------------------------------------------
    id2role_eval = add_parser(
        "id2role-eval",
        "Compare the embeddings quality on role prediction problem.")
    id2role_eval.set_defaults(handler=cmd.id2role_eval)
    # The stdin shortcut references `-m` here, matching this option's flag.
    id2role_eval.add_argument("-m",
                              "--models",
                              required=True,
                              nargs="+",
                              help="Id2Vec models to compare. "
                              "Use `-m -` to read input files from stdin.")
    id2role_eval.add_argument(
        "-d",
        "--dataset",
        required=True,
        help=
        "Dataset directory. You can collect dataset via repos2roleids command."
    )
    id2role_eval.add_argument("-s",
                              "--seed",
                              default=420,
                              help="Random seed for reproducible results.")
    return parser
Ejemplo n.º 4
0
def get_parser() -> argparse.ArgumentParser:
    """
    Creates the cmdline argument parser.
    """

    parser = argparse.ArgumentParser(
        formatter_class=args.ArgumentDefaultsHelpFormatterNoNone)
    parser.add_argument("--log-level",
                        default="INFO",
                        choices=logging._nameToLevel,
                        help="Logging verbosity.")
    # Create and construct subparsers
    subparsers = parser.add_subparsers(help="Commands", dest="command")

    def add_parser(name, help_message):
        return subparsers.add_parser(
            name,
            help=help_message,
            formatter_class=args.ArgumentDefaultsHelpFormatterNoNone)

    # ------------------------------------------------------------------------
    preprocessing_parser = subparsers.add_parser(
        "preprocrepos",
        help="Convert siva to parquet files with extracted information.")
    preprocessing_parser.set_defaults(handler=cmd.preprocess_repos)
    preprocessing_parser.add_argument(
        "-x",
        "--mode",
        choices=Moder.Options.__all__,
        default="file",
        help="What to extract from repositories.")
    args.add_repo2_args(preprocessing_parser)
    args.add_dzhigurda_arg(preprocessing_parser)
    preprocessing_parser.add_argument(
        "-o",
        "--output",
        required=True,
        help="[OUT] Path to the parquet files with bag batches.")
    default_fields = ("blob_id", "repository_id", "content", "path",
                      "commit_hash", "uast")
    preprocessing_parser.add_argument("-f",
                                      "--fields",
                                      nargs="+",
                                      default=default_fields,
                                      help="Fields to save.")
    # ------------------------------------------------------------------------
    repos2bow_parser = add_parser(
        "repos2bow", "Convert source code to the bag-of-words model.")
    repos2bow_parser.set_defaults(handler=cmd.repos2bow)
    args.add_df_args(repos2bow_parser)
    args.add_repo2_args(repos2bow_parser)
    args.add_feature_args(repos2bow_parser)
    args.add_bow_args(repos2bow_parser)
    args.add_repartitioner_arg(repos2bow_parser)
    # ------------------------------------------------------------------------
    repos2df_parser = add_parser(
        "repos2df",
        "Calculate document frequencies of features extracted from source code."
    )
    repos2df_parser.set_defaults(handler=cmd.repos2df)
    args.add_df_args(repos2df_parser)
    args.add_repo2_args(repos2df_parser)
    args.add_feature_args(repos2df_parser)
    # ------------------------------------------------------------------------
    repos2ids_parser = subparsers.add_parser(
        "repos2ids", help="Convert source code to a bag of identifiers.")
    repos2ids_parser.set_defaults(handler=cmd.repos2ids)
    args.add_repo2_args(repos2ids_parser)
    args.add_split_stem_arg(repos2ids_parser)
    args.add_repartitioner_arg(repos2ids_parser)
    repos2ids_parser.add_argument(
        "-o",
        "--output",
        required=True,
        help="[OUT] output path to the CSV file with identifiers.")
    repos2ids_parser.add_argument(
        "--idfreq",
        action="store_true",
        help="Adds identifier frequencies to the output CSV file."
        "num_repos is the number of repositories where the identifier appears in."
        "num_files is the number of files where the identifier appears in."
        "num_occ is the total number of occurences of the identifier.")
    # ------------------------------------------------------------------------
    repos2coocc_parser = add_parser(
        "repos2coocc",
        "Convert source code to the sparse co-occurrence matrix of identifiers."
    )
    repos2coocc_parser.set_defaults(handler=cmd.repos2coocc)
    args.add_df_args(repos2coocc_parser)
    args.add_repo2_args(repos2coocc_parser)
    args.add_split_stem_arg(repos2coocc_parser)
    args.add_repartitioner_arg(repos2coocc_parser)
    repos2coocc_parser.add_argument(
        "-o",
        "--output",
        required=True,
        help="[OUT] Path to the Cooccurrences model.")

    # ------------------------------------------------------------------------
    repos2roles_and_ids = add_parser(
        "repos2roleids",
        "Converts a UAST to a list of pairs, where pair is a role and "
        "identifier. Role is merged generic roles where identifier was found.")
    repos2roles_and_ids.set_defaults(handler=cmd.repos2roles_and_ids)
    args.add_repo2_args(repos2roles_and_ids)
    args.add_split_stem_arg(repos2roles_and_ids)
    repos2roles_and_ids.add_argument(
        "-o",
        "--output",
        required=True,
        help="[OUT] Path to the directory where spark should store the result. "
        "Inside the direcory you find result is csv format, status file and sumcheck files."
    )
    # ------------------------------------------------------------------------
    repos2identifier_distance = add_parser(
        "repos2id_distance", "Converts a UAST to a list of identifier pairs "
        "and distance between them.")
    repos2identifier_distance.set_defaults(handler=cmd.repos2id_distance)
    args.add_repo2_args(repos2identifier_distance)
    args.add_split_stem_arg(repos2identifier_distance)
    repos2identifier_distance.add_argument(
        "-t",
        "--type",
        required=True,
        choices=IdentifierDistance.DistanceType.All,
        help="Distance type.")
    repos2identifier_distance.add_argument(
        "--max-distance",
        default=IdentifierDistance.DEFAULT_MAX_DISTANCE,
        type=int,
        help="Maximum distance to save.")
    repos2identifier_distance.add_argument(
        "-o",
        "--output",
        required=True,
        help="[OUT] Path to the directory where spark should store the result. "
        "Inside the direcory you find result is csv format, status file and sumcheck files."
    )
    # ------------------------------------------------------------------------
    repos2id_sequence = add_parser(
        "repos2idseq",
        "Converts a UAST to sequence of identifiers sorted by order of appearance."
    )
    repos2id_sequence.set_defaults(handler=cmd.repos2id_sequence)
    args.add_repo2_args(repos2id_sequence)
    args.add_split_stem_arg(repos2id_sequence)
    repos2id_sequence.add_argument(
        "--skip-docname",
        default=False,
        action="store_true",
        help="Do not save document name in CSV file, only identifier sequence."
    )
    repos2id_sequence.add_argument(
        "-o",
        "--output",
        required=True,
        help="[OUT] Path to the directory where spark should store the result. "
        "Inside the direcory you find result is csv format, status file and sumcheck files."
    )
    # ------------------------------------------------------------------------
    preproc_parser = add_parser(
        "id2vec-preproc",
        "Convert a sparse co-occurrence matrix to the Swivel shards.")
    preproc_parser.set_defaults(handler=cmd.id2vec_preprocess)
    args.add_df_args(preproc_parser)
    preproc_parser.add_argument("-s",
                                "--shard-size",
                                default=4096,
                                type=int,
                                help="The shard (submatrix) size.")
    preproc_parser.add_argument(
        "-i", "--input", help="Concurrence model produced by repos2coocc.")
    preproc_parser.add_argument("-o",
                                "--output",
                                required=True,
                                help="Output directory.")
    # ------------------------------------------------------------------------
    train_parser = add_parser("id2vec-train",
                              "Train identifier embeddings using Swivel.")
    train_parser.set_defaults(handler=cmd.run_swivel)
    mirror_tf_args(train_parser)
    # ------------------------------------------------------------------------
    id2vec_postproc_parser = add_parser(
        "id2vec-postproc",
        "Combine row and column embeddings produced by Swivel and write them to an .asdf."
    )
    id2vec_postproc_parser.set_defaults(handler=cmd.id2vec_postprocess)
    id2vec_postproc_parser.add_argument(
        "-i",
        "--swivel-data",
        required=True,
        help="Folder with swivel row and column embeddings data. "
        "You can get it using id2vec_train subcommand.")
    id2vec_postproc_parser.add_argument(
        "-o",
        "--output",
        required=True,
        help="Output directory for Id2Vec model.")
    # ------------------------------------------------------------------------
    id2vec_project_parser = add_parser(
        "id2vec-project", "Present id2vec model in Tensorflow Projector.")
    id2vec_project_parser.set_defaults(handler=cmd.id2vec_project)
    args.add_df_args(id2vec_project_parser, required=False)
    id2vec_project_parser.add_argument("-i",
                                       "--input",
                                       required=True,
                                       help="id2vec model to present.")
    id2vec_project_parser.add_argument("-o",
                                       "--output",
                                       required=True,
                                       help="Projector output directory.")
    id2vec_project_parser.add_argument("--no-browser",
                                       action="store_true",
                                       help="Do not open the browser.")
    # ------------------------------------------------------------------------
    bow2vw_parser = add_parser(
        "bow2vw",
        "Convert a bag-of-words model to the dataset in Vowpal Wabbit format.")
    bow2vw_parser.set_defaults(handler=cmd.bow2vw)
    bow2vw_parser.add_argument("--bow",
                               help="URL or path to a bag-of-words model.")
    bow2vw_parser.add_argument(
        "--id2vec", help="URL or path to the identifier embeddings.")
    bow2vw_parser.add_argument("-o",
                               "--output",
                               required=True,
                               help="Path to the output file.")
    # ------------------------------------------------------------------------
    bigartm_postproc_parser = add_parser(
        "bigartm2asdf",
        "Convert a human-readable BigARTM model to Modelforge format.")
    bigartm_postproc_parser.set_defaults(handler=cmd.bigartm2asdf)
    bigartm_postproc_parser.add_argument("input")
    bigartm_postproc_parser.add_argument("output")
    # ------------------------------------------------------------------------
    bigartm_parser = add_parser(
        "bigartm", "Install bigartm/bigartm to the current working directory.")
    bigartm_parser.set_defaults(handler=install_bigartm)
    bigartm_parser.add_argument(
        "--tmpdir",
        help="Store intermediate files in this directory instead of /tmp.")
    bigartm_parser.add_argument("--output",
                                default=os.getcwd(),
                                help="Output directory.")
    # ------------------------------------------------------------------------
    dump_parser = add_parser("dump", "Dump a model to stdout.")
    dump_parser.set_defaults(handler=cmd.dump_model)
    dump_parser.add_argument("input",
                             help="Path to the model file, URL or UUID.")
    dump_parser.add_argument("--gcs",
                             default=None,
                             dest="gcs_bucket",
                             help="GCS bucket to use.")
    # ------------------------------------------------------------------------
    merge_df = add_parser("merge-df",
                          "Merge DocumentFrequencies models to a singe one.")
    merge_df.set_defaults(handler=cmd.merge_df)
    args.add_filter_arg(merge_df)
    args.add_min_docfreq(merge_df)
    args.add_vocabulary_size_arg(merge_df)
    merge_df.add_argument("-o",
                          "--output",
                          required=True,
                          help="Path to the merged DocumentFrequencies model.")
    merge_df.add_argument("-i",
                          "--input",
                          required=True,
                          nargs="+",
                          help="DocumentFrequencies models input files."
                          "Use `-i -` to read input files from stdin.")
    merge_df.add_argument(
        "--ordered",
        action="store_true",
        default=False,
        help="Save OrderedDocumentFrequencies. "
        "If not specified DocumentFrequencies model will be saved")
    # ------------------------------------------------------------------------
    merge_coocc = add_parser("merge-coocc",
                             "Merge several Cooccurrences models together.")
    merge_coocc.set_defaults(handler=cmd.merge_coocc)
    add_spark_args(merge_coocc)
    args.add_filter_arg(merge_coocc)
    merge_coocc.add_argument("-o",
                             "--output",
                             required=True,
                             help="Path to the merged Cooccurrences model.")
    merge_coocc.add_argument("-i",
                             "--input",
                             required=True,
                             help="Cooccurrences models input files."
                             "Use `-i -` to read input files from stdin.")
    merge_coocc.add_argument(
        "--docfreq",
        required=True,
        help="[IN] Specify OrderedDocumentFrequencies model. "
        "Identifiers that are not present in the model will be ignored.")
    merge_coocc.add_argument(
        "--no-spark",
        action="store_true",
        default=False,
        help="Use the local reduction instead of PySpark. "
        "Can be faster and consume less memory if the data fits into RAM.")
    return parser
Ejemplo n.º 5
0
def get_parser() -> argparse.ArgumentParser:
    """
    Create the command-line argument parser for the similarity-analysis CLI.

    Builds a top-level parser with a ``--log-level`` option and a set of
    subcommands (``warmup``, ``resetdb``, ``preprocess``, ``bags``, ``hash``,
    ``query``, ``cc``, ``dumpcc``, ``cmd``, ``dumpcmd``, ``evalcc``).  Each
    subcommand stores its entry point in the ``handler`` default, so callers
    can dispatch via ``args.handler(args)``.

    :return: the configured :class:`argparse.ArgumentParser`.
    """
    parser = argparse.ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatterNoNone)
    # NOTE(review): logging._nameToLevel is a private CPython mapping; its keys
    # serve as the accepted level names here. Works, but relies on a non-public
    # API — consider logging.getLevelNamesMapping() on Python 3.11+.
    parser.add_argument("--log-level",
                        default="INFO",
                        choices=logging._nameToLevel,
                        help="Logging verbosity.")

    def add_feature_weight_arg(my_parser: argparse.ArgumentParser) -> None:
        # One "--<name>-weight" float option per registered extractor
        # (extractors.__extractors__ is defined elsewhere in the project).
        help_desc = "%s's weight - all features from this extractor will be multiplied by this " \
                  "factor"
        for ex in extractors.__extractors__.values():
            my_parser.add_argument("--%s-weight" % ex.NAME,
                                   default=1,
                                   type=float,
                                   help=help_desc % ex.__name__)

    def add_cassandra_args(my_parser: argparse.ArgumentParser) -> None:
        # Cassandra connection options shared by several subcommands.
        my_parser.add_argument("--cassandra",
                               default="0.0.0.0:9042",
                               help="Cassandra's host:port.")
        my_parser.add_argument("--keyspace",
                               default="apollo",
                               help="Cassandra's key space.")
        my_parser.add_argument(
            "--tables",
            help=
            "Table name mapping (JSON): bags, hashes, hashtables, hashtables2."
        )

    def add_wmh_args(my_parser: argparse.ArgumentParser, params_help: str, add_hash_size: bool,
                     required: bool) -> None:
        # Weighted MinHash options; "hash" requires them, "query" does not,
        # hence the `required` toggle. `add_hash_size` is off for "query"
        # because the hash size is fixed by the stored parameters there.
        if add_hash_size:
            my_parser.add_argument("--size",
                                   type=int,
                                   default=128,
                                   help="Hash size.")
        my_parser.add_argument("-p",
                               "--params",
                               required=required,
                               help=params_help)
        my_parser.add_argument("-t",
                               "--threshold",
                               required=required,
                               type=float,
                               help="Jaccard similarity threshold.")
        my_parser.add_argument(
            "--false-positive-weight",
            type=float,
            default=0.5,
            help="Used to adjust the relative importance of "
            "minimizing false positives count when optimizing "
            "for the Jaccard similarity threshold.")
        my_parser.add_argument(
            "--false-negative-weight",
            type=float,
            default=0.5,
            help="Used to adjust the relative importance of "
            "minimizing false negatives count when optimizing "
            "for the Jaccard similarity threshold.")

    def add_template_args(my_parser: argparse.ArgumentParser, default_template: str) -> None:
        # Report-rendering options; `default_template` names the Jinja2
        # template bundled with each reporting subcommand.
        my_parser.add_argument("--batch",
                               type=int,
                               default=100,
                               help="Number of hashes to query at a time.")
        my_parser.add_argument("--template",
                               default=default_template,
                               help="Jinja2 template to render.")

    # Create and construct subparsers
    subparsers = parser.add_subparsers(help="Commands", dest="command")

    # ------------------------------------------------------------------------
    warmup_parser = subparsers.add_parser("warmup",
                                          help="Initialize source{d} engine.")
    warmup_parser.set_defaults(handler=warmup)
    add_engine_args(warmup_parser, default_packages=[CASSANDRA_PACKAGE])

    # ------------------------------------------------------------------------
    # "resetdb" drops and recreates tables — destructive by design.
    db_parser = subparsers.add_parser(
        "resetdb", help="Destructively initialize the database.")
    db_parser.set_defaults(handler=reset_db)
    add_cassandra_args(db_parser)
    db_parser.add_argument(
        "--hashes-only",
        action="store_true",
        help=
        "Only clear the tables: hashes, hashtables, hashtables2. Do not touch the rest."
    )
    # ------------------------------------------------------------------------
    preprocess_parser = subparsers.add_parser(
        "preprocess",
        help=
        "Creates the index, quant and docfreq model of the bag-of-words model."
    )
    preprocess_parser.set_defaults(handler=preprocess)
    add_df_args(preprocess_parser)
    add_repo2_args(preprocess_parser)
    add_feature_args(preprocess_parser)
    add_repartitioner_arg(preprocess_parser)
    preprocess_parser.add_argument(
        "--cached-index-path",
        default=None,
        help="[OUT] Path to the docfreq model holding the document's index.")
    # ------------------------------------------------------------------------
    source2bags_parser = subparsers.add_parser(
        "bags", help="Convert source code to weighted sets.")
    source2bags_parser.set_defaults(handler=source2bags)
    add_bow_args(source2bags_parser)
    add_dzhigurda_arg(source2bags_parser)
    add_repo2_args(source2bags_parser, default_packages=[CASSANDRA_PACKAGE])
    add_feature_args(source2bags_parser)
    add_cassandra_args(source2bags_parser)
    add_df_args(source2bags_parser)
    add_repartitioner_arg(source2bags_parser)
    # Same option as in "preprocess", but here it is consumed ([IN]) rather
    # than produced ([OUT]).
    source2bags_parser.add_argument(
        "--cached-index-path",
        default=None,
        help="[IN] Path to the docfreq model holding the document's index.")

    # ------------------------------------------------------------------------
    hash_parser = subparsers.add_parser(
        "hash", help="Run MinHashCUDA on the bag batches.")
    hash_parser.set_defaults(handler=hash_batches)
    hash_parser.add_argument("-i",
                             "--input",
                             help="Path to the directory with Parquet files.")
    # NOTE(review): the seed default is evaluated once at parser construction
    # time (int(time())), so all invocations parsed from one parser instance
    # share the same default seed.
    hash_parser.add_argument("--seed",
                             type=int,
                             default=int(time()),
                             help="Random generator's seed.")
    hash_parser.add_argument("--mhc-verbosity",
                             type=int,
                             default=1,
                             help="MinHashCUDA logs verbosity level.")
    hash_parser.add_argument(
        "--devices",
        type=int,
        default=0,
        help="Or-red indices of NVIDIA devices to use. 0 means all.")
    add_wmh_args(hash_parser, "Path to the output file with WMH parameters.",
                 True, True)
    add_cassandra_args(hash_parser)
    add_spark_args(hash_parser, default_packages=[CASSANDRA_PACKAGE])
    add_feature_weight_arg(hash_parser)
    add_repartitioner_arg(hash_parser)

    # ------------------------------------------------------------------------
    query_parser = subparsers.add_parser("query",
                                         help="Query for similar files.")
    query_parser.set_defaults(handler=query)
    # Exactly one of --id / --file must be given: "id mode" vs "file mode".
    mode_group = query_parser.add_mutually_exclusive_group(required=True)
    mode_group.add_argument("-i", "--id", help="Query for this id (id mode).")
    mode_group.add_argument("-c",
                            "--file",
                            help="Query for this file (file mode).")
    query_parser.add_argument(
        "--docfreq", help="Path to OrderedDocumentFrequencies (file mode).")
    query_parser.add_argument(
        "--min-docfreq",
        default=1,
        type=int,
        help="The minimum document frequency of each feature.")
    query_parser.add_argument("--bblfsh",
                              default="localhost:9432",
                              help="Babelfish server's address.")
    query_parser.add_argument("--precise",
                              action="store_true",
                              help="Calculate the precise set.")
    add_wmh_args(query_parser, "Path to the Weighted MinHash parameters.",
                 False, False)
    add_feature_args(query_parser, required=False)
    add_template_args(query_parser, "query.md.jinja2")
    add_cassandra_args(query_parser)

    # ------------------------------------------------------------------------
    cc_parser = subparsers.add_parser(
        "cc",
        help=
        "Load the similar pairs of files and run connected components analysis."
    )
    cc_parser.set_defaults(handler=find_connected_components)
    add_cassandra_args(cc_parser)
    cc_parser.add_argument(
        "-o",
        "--output",
        required=True,
        help="[OUT] Path to connected components ASDF model.")

    # ------------------------------------------------------------------------
    dumpcc_parser = subparsers.add_parser(
        "dumpcc", help="Output the connected components to stdout.")
    dumpcc_parser.set_defaults(handler=dumpcc)
    dumpcc_parser.add_argument("-i",
                               "--input",
                               required=True,
                               help="Path to connected components ASDF model.")
    # ------------------------------------------------------------------------
    # NOTE(review): the subcommand is named "cmd" but runs community
    # detection; "dumpcmd" below dumps its output. Presumably "cmd" stands
    # for "community detection" — the name is kept for CLI compatibility.
    community_parser = subparsers.add_parser(
        "cmd",
        help=
        "Run Community Detection analysis on the connected components from \"cc\"."
    )
    community_parser.set_defaults(handler=detect_communities)
    community_parser.add_argument(
        "-i",
        "--input",
        required=True,
        help="Path to connected components ASDF model.")
    community_parser.add_argument(
        "-o",
        "--output",
        required=True,
        help="[OUT] Path to the communities ASDF model.")
    community_parser.add_argument(
        "--edges",
        choices=("linear", "quadratic", "1", "2"),
        default="linear",
        help="The method to generate the graph's edges: bipartite - "
        "linear and fast, but may not fit some the CD algorithms, "
        "or all to all within a bucket - quadratic and slow, but "
        "surely fits all the algorithms.")
    # Discover available algorithms from igraph's Graph.community_* methods;
    # k[10:] strips the "community_" prefix.
    cmd_choices = [k[10:] for k in dir(Graph) if k.startswith("community_")]
    community_parser.add_argument(
        "-a",
        "--algorithm",
        choices=cmd_choices,
        default="walktrap",
        help="The community detection algorithm to apply.")
    community_parser.add_argument(
        "-p",
        "--params",
        type=json.loads,
        default={},
        help="Parameters for the algorithm (**kwargs, JSON format).")
    community_parser.add_argument("--no-spark",
                                  action="store_true",
                                  help="Do not use Spark.")
    add_spark_args(community_parser)

    # ------------------------------------------------------------------------
    dumpcmd_parser = subparsers.add_parser(
        "dumpcmd", help="Output the detected communities to stdout.")
    dumpcmd_parser.set_defaults(handler=dumpcmd)
    dumpcmd_parser.add_argument("input",
                                help="Path to the communities ASDF model.")
    add_template_args(dumpcmd_parser, "report.md.jinja2")
    add_cassandra_args(dumpcmd_parser)

    # ------------------------------------------------------------------------
    evalcc_parser = subparsers.add_parser(
        "evalcc",
        help=
        "Evaluate the communities: calculate the precise similarity and the "
        "fitness metric.")
    evalcc_parser.set_defaults(handler=evaluate_communities)
    evalcc_parser.add_argument("-t",
                               "--threshold",
                               required=True,
                               type=float,
                               help="Jaccard similarity threshold.")
    evalcc_parser.add_argument("-i",
                               "--input",
                               required=True,
                               help="Path to the communities model.")

    add_spark_args(evalcc_parser, default_packages=[CASSANDRA_PACKAGE])
    add_cassandra_args(evalcc_parser)

    # TODO: retable [.....] -> [.] [.] [.] [.] [.]
    return parser