# Run a full garbage collection before loading the config and pickled data.
gc.collect()

# Parse the JSON settings inside a ``with`` block so the file handle is closed
# as soon as parsing finishes instead of lingering until it is collected.
with open("config.json") as config_file:
    config_raw = json.load(config_file)
config = ConfigHandler(config_raw, "train", raise_error_unknown=True)
# ``config`` is rebound to whatever parseCLIArgs returns for ``args``.
config = parseCLIArgs(args, config)

# NOTE(review): unpickling can execute arbitrary code, so the file named by
# config["tagged_pairs"] must come from a trusted source.
with open(config["tagged_pairs"], "rb") as pairs_file:
    data = pickle.load(pairs_file)

scores = []
# One integer weight per classifier name, handed to config.addArgument under
# the key "classifier_weights" (presumably voting weights -- confirm with the
# code that consumes them).
weights = {
    "Nearest Neighbors": 1,
    "Decision Tree": 3,
    "Random Forest": 2,
    "Neural Net": 2,
    "Naive Bayes": 1,
    "AdaBoost": 2,
    "QDA": 1,
}
config.addArgument("classifier_weights", weights)
 # params = {
 #     'Nearest Neighbors': {
 #         'algorithm': 'ball_tree',
 #         'leaf_size': 10,
 #         'metric': 'manhattan',
 #         'n_neighbors': 9,
 #         'p': 1,
 #         'weights': 'uniform'
 #     },
 #     'Decision Tree': {
 #         'max_depth': 5,
 #         'max_features': 5,
 #         'min_samples_leaf': 4,
 #         'min_samples_split': 2,
 #         'min_weight_fraction_leaf': 0.0
Example #2
0
    # Fetch every stored artifact this step reads with a single loadData call;
    # the individual entries are picked out of the result by name below.
    data = loadData(
        [
            "department_corpus",
            "incomplete_papers",
            "org_corpus",
            "conflicts",
            "parsed_papers",
            "same_names",
            "test_special_keys",
        ],
        config.logger,
        config,
    )
    same_names = data["same_names"]
    # Rebuild each stored record as a Paper by expanding it into keyword
    # arguments, keeping the original keys.
    parsed = {
        key: Paper(**fields)
        for key, fields in data["parsed_papers"].items()
    }
    org_corpus = data["org_corpus"]
    department_corpus = data["department_corpus"]
    incomplete = data["incomplete_papers"]
    special_keys = data["test_special_keys"]
    excluded_dict = data["conflicts"]

    # Keyword arguments forwarded to CreateTrainingData as ``compare_args``.
    compare_authors_args = {
        "company_corpus": org_corpus,
        "department_corpus": department_corpus,
        "threshold": .4,
    }

    # Flatten the conflict groups: keep the first element of every pair and
    # discard the second.
    excluded = [
        entry_id
        for group in excluded_dict.values()
        for entry_id, _ in group
    ]
    config.addArgument("exclude", excluded)

    pair_creator = CreateTrainingData(
        parsed,
        incomplete,
        special_keys,
        compare_args=compare_authors_args,
        **config["CreateTrainingData"]
    )
    # Collect garbage right before invoking the pair creator.
    gc.collect()
    pair_creator(get_info_all=True)