def _get_config_handler():
    """
    Gets the config handler based on whether Docker is used or not.

    Reads the DOCKER environment variable. Docker mode is considered
    disabled when the variable is unset, empty, or one of the usual "off"
    spellings ("0", "false", "no", "off"; case-insensitive, surrounding
    whitespace ignored). Any other value enables Docker mode.

    The previous implementation used ``bool(os.environ.get("DOCKER"))``,
    which is True for every non-empty string, so DOCKER='false' was
    treated as enabled.

    Returns:
        ConfigHandler("/src/config.yml") when Docker mode is enabled,
        otherwise ConfigHandler() built with its default arguments.
    """
    docker_flag = os.environ.get("DOCKER", "").strip().lower()
    if docker_flag in ("", "0", "false", "no", "off"):
        return ConfigHandler()
    return ConfigHandler("/src/config.yml")
def test_init(self):
    """
    Checks that TargetCreator keeps every paper it is constructed with.

    The creator is built twice, once from self.parsed_raw and once from
    self.papers. Every input key must be present in the creator's
    ``papers`` afterwards; for the second build the stored value must
    also compare equal to the value that was passed in.
    """
    print("INFO: Testing init")

    # Create (or truncate) the log file used by this test.
    log_file = os.getcwd() + self.log_path + "init.log"
    with open(log_file, "w"):
        pass

    config = ConfigHandler(self.config_raw, "init")

    # Build from the raw parsed papers.
    from_raw = TargetCreator(
        self.parsed_raw,
        self.id_to_name,
        self.author_papers,
        **config["TargetCreator"]
    )
    for paper_id in self.parsed_raw.keys():
        if paper_id in from_raw.papers:
            continue
        message = "{} is missing from TargetCreator.papers when passed raw papers".format(paper_id)
        print(message)
        self.fail()

    # Build from the dict stored in self.papers.
    from_objects = TargetCreator(
        self.papers,
        self.id_to_name,
        self.author_papers,
        **config["TargetCreator"]
    )
    for paper_id, expected in self.papers.items():
        if paper_id not in from_objects.papers:
            message = "{} is missing from TargetCreator.papers when passed dict of Paper classes".format(paper_id)
            print(message)
            self.fail()
        if expected != from_objects.papers[paper_id]:
            message = "Paper {} does not equal itself in TargetCreator.papers".format(paper_id)
            print(message)
            self.fail()
def test_createTarget(self):
    """
    Checks the result of TargetCreator.createTarget("xuan-jing-huang").

    The returned ids must end in "1", "2", "3" (in that order), the three
    ``new_*`` mappings on the creator must each hold three entries, and
    no paper listed under one new author id may mention any of the other
    returned ids in its authors or affiliations.
    """
    # Create (or truncate) the log file used by this test.
    log_file = os.getcwd() + self.log_path + "handle_target.log"
    with open(log_file, "w"):
        pass

    config = ConfigHandler(self.config_raw, "handle_target")
    creator = TargetCreator(
        self.parsed_raw,
        self.id_to_name,
        self.author_papers,
        **config["TargetCreator"]
    )
    targets = creator.createTarget("xuan-jing-huang")

    suffixes = [target[-1] for target in targets]
    self.assertEqual(["1", "2", "3"], suffixes)
    self.assertEqual(3, len(creator.new_papers))
    self.assertEqual(3, len(creator.new_id_to_name))
    self.assertEqual(3, len(creator.new_author_papers))

    for owner, paper_ids in creator.new_author_papers.items():
        for other in targets:
            # Only the *other* returned ids must be absent from the owner's papers.
            if other != owner:
                for paper_id in paper_ids:
                    paper = creator.new_papers[paper_id]
                    self.assertTrue(other not in paper.authors)
                    self.assertTrue(other not in paper.affiliations)
def test_fillData(self):
    """
    Checks the three mappings returned by TargetCreator.fillData().

    Targets are created for every author in self.test_authors. Afterwards:
      * the original author ids must be absent from the returned
        author-papers and id-to-name mappings,
      * every id returned by createTarget must be present in both,
      * every paper that belonged to a test author must still exist, must
        not list any target id with its last character removed among its
        authors or affiliations, and must appear under at least one target.
    """
    # Create (or truncate) the log file used by this test.
    log_file = os.getcwd() + self.log_path + "fill_data.log"
    with open(log_file, "w"):
        pass

    config = ConfigHandler(self.config_raw, "fill_data")
    creator = TargetCreator(
        self.parsed_raw,
        self.id_to_name,
        self.author_papers,
        **config["TargetCreator"]
    )

    # Papers of the test authors are collected before any target is created.
    expected_papers = [
        paper_id
        for author in self.test_authors
        for paper_id in self.author_papers[author]
    ]
    targets = [
        target
        for author in self.test_authors
        for target in creator.createTarget(author)
    ]

    papers, auth_papers, id_to_name = creator.fillData()

    for author in self.test_authors:
        still_listed = author in auth_papers
        if still_listed:
            print(author)
        self.assertTrue(not still_listed)
        self.assertTrue(author not in id_to_name)

    for target in targets:
        self.assertTrue(target in auth_papers)
        self.assertTrue(target in id_to_name)

    for paper_id in expected_papers:
        if paper_id not in papers:
            print(paper_id)
            self.fail()
        self.assertTrue(paper_id in papers)

        holders = []
        for target in targets:
            # Target ids are the original id plus one trailing character.
            original_id = target[:-1]
            paper = papers[paper_id]
            if original_id in paper.affiliations or original_id in paper.authors:
                self.fail("{} is in paper {} when it should not be".format(
                    original_id, paper_id))
            if paper_id in auth_papers[target]:
                holders.append(target)
        if not holders:
            self.fail("{} was not found in any authors' papers".format(paper_id))
from src.auth_handler import AuthHandler from src.request_handler import RequestHandler from src.os_handler import OSHandler from src.network_handler import NetworkHandler from src.print_handler import PrintHandler from src.prints.distribution_prints import DistributionPrints app = Flask(__name__) episode_job_queue = Queue() # Don't try any of this here - startup configs should fail immediately # Initialize in __main__ if c = ( ConfigHandler() if 'DOCKER' not in os.environ or not bool(os.environ.get("DOCKER")) else ConfigHandler("/src/config.yml") ) # Represents ConfigHandler p = PrintHandler(c) # Represents PrintHandler logger = p.logger # Represents the logger object dp = DistributionPrints(p.Colors()) # Represents the EncodePrints object a = ( AuthHandler(p) if 'DOCKER' not in os.environ or not bool(os.environ.get("DOCKER")) else AuthHandler(p, "/src/auth.yml") ) # Represents AuthHandler def distribute_worker(): """ Represents a single thread that is continuously scanning for a new distribution job to handle. Once it finds one, it takes it and procsses it.
# Register the shared CLI options and the VoteClassifier option group.
createCLIShared(arguments)
createCLIGroup(arguments, "VoteClassifier",
               "Arguments for the VoteClassifier, check the documentation of VoteClassifier to see default "
               "values", VoteClassifier.parameters)

if __name__ == '__main__':
    gc.collect()
    args = arguments.parse_args()

    # Create (or truncate) the log file for this run.
    log_path = os.getcwd() + '/logs/train.log'
    with open(log_path, 'w'):
        pass

    # NOTE(review): this message says "Preprocess Data" although the config
    # below is created for "train" -- looks copy-pasted; confirm before changing.
    print("INFO: Starting Preprocess Data")
    gc.collect()

    # Read the raw config inside a context manager so the handle is closed
    # (the original passed a bare open() to json.load and never closed it).
    with open("config.json") as config_file:
        config_raw = json.load(config_file)
    config = ConfigHandler(config_raw, "train", raise_error_unknown=True)
    config = parseCLIArgs(args, config)

    # NOTE(review): pickle.load can execute arbitrary code from the file it
    # reads; the path comes from config["tagged_pairs"], so make sure that
    # file is trusted.
    with open(config["tagged_pairs"], "rb") as tagged_pairs_file:
        data = pickle.load(tagged_pairs_file)

    scores = []
    # Per-classifier weights, registered on the config as "classifier_weights".
    weights = {
        "Nearest Neighbors": 1,
        "Decision Tree": 3,
        "Random Forest": 2,
        "Neural Net": 2,
        "Naive Bayes": 1,
        "AdaBoost": 2,
        "QDA": 1,
    }
    config.addArgument("classifier_weights", weights)
    # params = {
    # 'Nearest Neighbors': {
def test_updatePapers(self):
    """
    Runs TargetCreator._updatePapers through seven cases in sequence.

    All cases share one TargetCreator, so the expected sizes of
    ``new_papers`` and ``new_author_papers`` are cumulative across cases.
    """
    print("INFO: Testing updatePapers")

    # Create (or truncate) the log file used by this test.
    log_file = os.getcwd() + self.log_path + "update_papers.log"
    with open(log_file, "w"):
        pass

    config = ConfigHandler(self.config_raw, "update_papers")

    # author_papers and papers are copied before being passed in;
    # id_to_name is passed as-is.
    authors_to_papers = deepcopy(self.author_papers)
    paper_objects = {
        key: Paper(**paper.asDict())
        for key, paper in self.papers.items()
    }

    # Each case is the positional argument list for _updatePapers.
    cases = [
        ["qiang-wang", "qiang-wang1", None],  # No papers passed
        ["hua-wu", "hua-wu1", ["P16-1159"]],  # Error papers
        ["yun-chen", "yun-chen1", ["P16-1159"]],  # Not in paper
        ["yun-chen", "yun-chen1", ["P17-1176"]],
        ["victor-ok-li", "victor-ok-li1", ["P17-1176"]],  # Paper already done
        ["xuan-jing-huang", "fail-test", ["P19-1642"]],
        ["fail-test", "yun-huang1", ["S19-2016"]],
    ]

    creator = TargetCreator(
        paper_objects,
        self.id_to_name,
        authors_to_papers,
        **config["TargetCreator"]
    )
    creator.one_per_paper = False
    creator.error_papers = {"P16-1159"}

    def run_case(index):
        # Feed one case to the method under test.
        creator._updatePapers(*cases[index])

    def expect_sizes(paper_total, author_total):
        # Sizes of the two result mappings after the latest case.
        self.assertEqual(paper_total, len(creator.new_papers))
        self.assertEqual(author_total, len(creator.new_author_papers))

    def expect_on_paper(author_id, paper_id):
        # The id must appear in both the authors and affiliations of the paper.
        self.assertTrue(author_id in creator.new_papers[paper_id].authors)
        self.assertTrue(author_id in creator.new_papers[paper_id].affiliations)

    run_case(0)
    expect_sizes(1, 1)
    self.assertTrue("qiang-wang1" in creator.new_author_papers)
    self.assertTrue("W19-4416" in creator.new_papers)
    expect_on_paper("qiang-wang1", "W19-4416")

    run_case(1)
    expect_sizes(1, 1)

    run_case(2)
    expect_sizes(1, 1)

    run_case(3)
    expect_sizes(2, 2)
    expect_on_paper("qiang-wang1", "W19-4416")
    expect_on_paper("yun-chen1", "P17-1176")

    run_case(4)
    expect_sizes(2, 3)
    expect_on_paper("yun-chen1", "P17-1176")
    expect_on_paper("victor-ok-li1", "P17-1176")

    run_case(5)
    expect_sizes(2, 3)

    run_case(6)
    expect_sizes(2, 3)
"Arguments for how to create targets", TargetCreator.parameters) createCLIGroup( arguments, "AuthorDisambiguation", "Arguments for how to disambiguate authors, check author_disambiguation.py for default values", AuthorDisambiguation.parameters) if __name__ == '__main__': args = arguments.parse_args() with open(os.getcwd() + "/logs/disambiguate.log", 'w'): pass log_path = os.getcwd() + "/logs/disambiguate.log" print("INFO: Starting Create Data") gc.collect() config_raw = json.load(open("config.json")) config = ConfigHandler(config_raw, "disambiguate", raise_error_unknown=True) data = loadData([ "department_corpus", "incomplete_papers", "org_corpus", "conflicts", "parsed_papers", "same_names", "test_special_keys", "author_papers", "id_to_name" ], config.logger, config) author_papers = data["author_papers"] id_to_name = data["id_to_name"] same_names = data["same_names"] parsed = data["parsed_papers"] parsed = {x: Paper(**info) for x, info in parsed.items()} org_corpus = data["org_corpus"] department_corpus = data["department_corpus"] incomplete = data["incomplete_papers"] special_keys = data["test_special_keys"]
# Register the shared CLI options and the CreateTrainingData option group.
createCLIShared(arguments)
createCLIGroup(
    arguments, "CreateTrainingData",
    "Arguments for the CreateTrainingData, check the documentation of CreateTrainingData to see default "
    "values", CreateTrainingData.parameters)

if __name__ == "__main__":
    args = arguments.parse_args()

    # Create (or truncate) the log file for this run; the path is built once
    # and reused instead of repeating the expression.
    log_path = os.getcwd() + "/logs/preprocess_data.log"
    with open(log_path, 'w'):
        pass

    print("INFO: Starting Preprocess Data")
    gc.collect()

    # Read the raw config inside a context manager so the handle is closed
    # (the original passed a bare open() to json.load and never closed it).
    with open("config.json") as config_file:
        config_raw = json.load(config_file)
    config = ConfigHandler(config_raw, "preprocess_data", raise_error_unknown=True)
    config = parseCLIArgs(args, config)

    # loadData's result is indexed below by the same keys requested here.
    data = loadData([
        "department_corpus", "incomplete_papers", "org_corpus", "conflicts",
        "parsed_papers", "same_names", "test_special_keys"
    ], config.logger, config)

    same_names = data["same_names"]
    parsed = data["parsed_papers"]
    # Each parsed entry is expanded as keyword arguments into a Paper.
    parsed = {x: Paper(**info) for x, info in parsed.items()}
    org_corpus = data["org_corpus"]
    department_corpus = data["department_corpus"]
    incomplete = data["incomplete_papers"]
    special_keys = data["test_special_keys"]
    excluded_dict = data["conflicts"]
description="Parse Disambiguate targets. You can specify these in config.json instead of using command line arguments", formatter_class=argparse.MetavarTypeHelpFormatter) createCLIShared(arguments) createCLIGroup(arguments, "TargetCreator", "Arguments for how to create targets", TargetCreator.parameters) createCLIGroup(arguments, "AuthorDisambiguation", "Arguments for how to disambiguate authors, check author_disambiguation.py for default values", AuthorDisambiguation.parameters) if __name__ == '__main__': args = arguments.parse_args() with open(os.getcwd() + "/logs/evaluate_disambiguation.log", 'w'): pass log_path = os.getcwd() + "/logs/evaluate_disambiguation.log" print("INFO: Starting Create Data") gc.collect() config_raw = json.load(open("config.json")) config = ConfigHandler(config_raw, "evaluate_disambiguation", raise_error_unknown=True) data = loadData( ["department_corpus", "incomplete_papers", "org_corpus", "conflicts", "parsed_papers", "same_names", "test_special_keys", "author_papers", "id_to_name"], config.logger, config) author_papers = data["author_papers"] id_to_name = data["id_to_name"] same_names = data["same_names"] parsed = data["parsed_papers"] parsed = {x: Paper(**info) for x, info in parsed.items()} org_corpus = data["org_corpus"] department_corpus = data["department_corpus"] incomplete = data["incomplete_papers"] special_keys = data["test_special_keys"] target_creator = TargetCreator(parsed, id_to_name, author_papers, **config["TargetCreator"]) tests = [
# Register the shared CLI options plus the PDFParser and ACLParser option groups.
createCLIShared(arguments)
createCLIGroup(
    arguments, "PDFParser",
    "Arguments for the PDFParser, check the documentation of pdf_parser.py to see default values",
    PDFParserWrapper.parameters)
createCLIGroup(
    arguments, "ACLParser",
    "Arguments for the ACLParser, check the documentation of acl_parser.py to see default values",
    ACLParser.parameters)

if __name__ == '__main__':
    args = arguments.parse_args()

    # Create (or truncate) the log file for this run; the path is built once
    # and reused instead of repeating the expression.
    log_path = os.getcwd() + "/logs/create_data.log"
    with open(log_path, 'w'):
        pass

    print("INFO: Starting Create Data")
    gc.collect()

    # Read the raw config inside a context manager so the handle is closed
    # (the original passed a bare open() to json.load and never closed it).
    with open("config.json") as config_file:
        config_raw = json.load(config_file)
    config = ConfigHandler(config_raw, "create_data", raise_error_unknown=True)
    config = parseCLIArgs(args, config)

    # Run the ACL parser first; loadData is only called after it has finished.
    acl_parser = ACLParser(**config["ACLParser"])
    acl_parser(config["xml_path"], config["name_variants_path"])

    # The loaded data is expanded as keyword arguments into PDFParserWrapper.
    # NOTE(review): override_keys presumably renames "acl_papers" to "papers"
    # in the result -- confirm against loadData.
    data = loadData(["aliases", "acl_papers", "id_to_name", "same_names"],
                    config.logger,
                    config,
                    override_keys={"acl_papers": "papers"})
    parser = PDFParserWrapper(**data, **config["PDFParser"])
    parser(config["parsed_pdf_path"])
    gc.collect()