Example #1
0
def _get_config_handler():
    """
    Build the ConfigHandler that matches the runtime environment.

    Reads the DOCKER environment variable. When it is unset, empty, or one
    of "0", "false", "no", "off" (compared case-insensitively, surrounding
    whitespace ignored), ConfigHandler is created with no arguments.
    Any other value (e.g. DOCKER='true') selects "/src/config.yml".

    Returns:
        The ConfigHandler instance for the detected environment.
    """
    # bool() of any non-empty string is True, so DOCKER="false" used to pick
    # the Docker config. Compare the text itself instead.
    docker_flag = os.environ.get("DOCKER", "").strip().lower()
    if docker_flag in ("", "0", "false", "no", "off"):
        return ConfigHandler()
    return ConfigHandler("/src/config.yml")
Example #2
0
 def test_init(self):
     """
     Check that TargetCreator keeps every paper passed to its constructor.

     One TargetCreator is built from self.parsed_raw and another from
     self.papers. The test fails when an input key is absent from the
     resulting ``papers`` attribute or, for the second instance, when the
     stored value compares unequal to the input value.
     """

     def report_and_fail(template, key):
         # Print the diagnostic first, then abort the test without a message.
         print(template.format(key))
         self.fail()

     print("INFO: Testing init")
     init_log = os.getcwd() + self.log_path + "init.log"
     # Opening in "w" mode creates the log file or empties an existing one.
     with open(init_log, "w"):
         pass
     config = ConfigHandler(self.config_raw, "init")

     from_raw = TargetCreator(self.parsed_raw, self.id_to_name,
                              self.author_papers, **config["TargetCreator"])
     for paper_id in self.parsed_raw:
         if paper_id in from_raw.papers:
             continue
         report_and_fail(
             "{} is missing from TargetCreator.papers when passed raw papers",
             paper_id)

     from_classes = TargetCreator(self.papers, self.id_to_name,
                                  self.author_papers,
                                  **config["TargetCreator"])
     for paper_id, expected in self.papers.items():
         if paper_id not in from_classes.papers:
             report_and_fail(
                 "{} is missing from TargetCreator.papers when passed dict of Paper classes",
                 paper_id)
         if expected != from_classes.papers[paper_id]:
             report_and_fail(
                 "Paper {} does not equal itself in TargetCreator.papers",
                 paper_id)
Example #3
0
 def test_createTarget(self):
     """
     Check createTarget for the author id "xuan-jing-huang".

     Expects the returned ids to end in "1", "2", "3" (in that order), three
     entries in each of new_papers / new_id_to_name / new_author_papers, and
     that no returned id appears in the authors or affiliations of a paper
     listed under a different key of new_author_papers.
     """
     log_file = os.getcwd() + self.log_path + "handle_target.log"
     # Opening in "w" mode creates the log file or empties an existing one.
     with open(log_file, "w"):
         pass
     config = ConfigHandler(self.config_raw, "handle_target")
     target_creator = TargetCreator(self.parsed_raw, self.id_to_name,
                                    self.author_papers,
                                    **config["TargetCreator"])
     targets = target_creator.createTarget("xuan-jing-huang")

     suffixes = [target[-1] for target in targets]
     self.assertEqual(["1", "2", "3"], suffixes)
     for attr in ("new_papers", "new_id_to_name", "new_author_papers"):
         self.assertEqual(3, len(getattr(target_creator, attr)))

     for owner, paper_ids in target_creator.new_author_papers.items():
         for other in targets:
             if other == owner:
                 continue
             for paper_id in paper_ids:
                 self.assertTrue(
                     other not in target_creator.new_papers[paper_id].authors)
                 self.assertTrue(
                     other not in
                     target_creator.new_papers[paper_id].affiliations)
Example #4
0
    def test_fillData(self):
        """
        Check the data returned by fillData after creating targets.

        Targets are created for every id in self.test_authors. The result of
        fillData() must list every created target in auth_papers and
        id_to_name, must not list the original ids in auth_papers, must
        contain every paper of the original authors, and each such paper
        must belong to at least one target while not mentioning any target
        id with its last character stripped.
        """
        log_file = os.getcwd() + self.log_path + "fill_data.log"
        # Opening in "w" mode creates the log file or empties an existing one.
        with open(log_file, "w"):
            pass
        config = ConfigHandler(self.config_raw, "fill_data")
        target_creator = TargetCreator(self.parsed_raw, self.id_to_name,
                                       self.author_papers,
                                       **config["TargetCreator"])

        # Collect the original authors' papers before any target is created.
        test_papers = []
        for author in self.test_authors:
            test_papers += self.author_papers[author]

        created = []
        for author in self.test_authors:
            created += target_creator.createTarget(author)

        papers, auth_papers, id_to_name = target_creator.fillData()

        for author in self.test_authors:
            if author not in auth_papers:
                continue
            print(author)
            # NOTE(review): this point is only reached when the first
            # assertion below fails, so the id_to_name check never runs.
            # Confirm whether both checks were meant to be unconditional.
            self.assertTrue(author not in auth_papers)
            self.assertTrue(author not in id_to_name)

        for target in created:
            self.assertTrue(target in auth_papers)
            self.assertTrue(target in id_to_name)

        for paper_id in test_papers:
            if paper_id not in papers:
                print(paper_id)
                self.fail()
            self.assertTrue(paper_id in papers)

            claimed = False
            for target in created:
                base_id = target[:-1]
                in_affiliations = base_id in papers[paper_id].affiliations
                if in_affiliations or base_id in papers[paper_id].authors:
                    self.fail("{} is in paper {} when it should not be".format(
                        base_id, paper_id))
                if paper_id in auth_papers[target]:
                    claimed = True
            if not claimed:
                self.fail(
                    "{} was not found in any authors' papers".format(paper_id))
Example #5
0
from src.auth_handler import AuthHandler
from src.request_handler import RequestHandler
from src.os_handler import OSHandler
from src.network_handler import NetworkHandler
from src.print_handler import PrintHandler
from src.prints.distribution_prints import DistributionPrints

app = Flask(__name__)

episode_job_queue = Queue()

# Don't try any of this here - startup configs should fail immediately
# Initialize in __main__ if

# Decide once whether the Docker-specific file paths apply. bool() of any
# non-empty string is True, so DOCKER="false" used to select the Docker
# paths; only explicit false-like text (or an unset/empty variable) now
# disables them, and every other value keeps enabling them as before.
_IN_DOCKER = os.environ.get("DOCKER", "").strip().lower() not in (
    "", "0", "false", "no", "off")

# Represents ConfigHandler
c = ConfigHandler("/src/config.yml") if _IN_DOCKER else ConfigHandler()
p = PrintHandler(c)  # Represents PrintHandler
logger = p.logger  # Represents the logger object
dp = DistributionPrints(p.Colors())  # Represents the DistributionPrints object
# Represents AuthHandler
a = AuthHandler(p, "/src/auth.yml") if _IN_DOCKER else AuthHandler(p)


def distribute_worker():
    """
    Represents a single thread that is continuously scanning for a new distribution job to handle.
    Once it finds one, it takes it and procsses it.
# Register the shared CLI options and the VoteClassifier option group.
createCLIShared(arguments)
createCLIGroup(arguments, "VoteClassifier",
               "Arguments for the VoteClassifier, check the documentation of VoteClassifier to see default "
               "values",
               VoteClassifier.parameters)

if __name__ == '__main__':
    gc.collect()
    args = arguments.parse_args()
    log_path = os.getcwd() + '/logs/train.log'
    # Opening in "w" mode creates the log file or empties an existing one.
    with open(log_path, 'w'):
        pass
    # NOTE(review): the message says "Preprocess Data" although the log file
    # is train.log - confirm whether this text is intended.
    print("INFO: Starting Preprocess Data")
    gc.collect()
    # Read the config through a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open("config.json") as config_file:
        config_raw = json.load(config_file)
    config = ConfigHandler(config_raw, "train", raise_error_unknown=True)
    config = parseCLIArgs(args, config)
    # NOTE(review): unpickling can execute arbitrary code; the file named by
    # config["tagged_pairs"] must come from a trusted source.
    with open(config["tagged_pairs"], "rb") as pairs_file:
        data = pickle.load(pairs_file)
    scores = []
    # Per-classifier weights, stored in the config as "classifier_weights".
    weights = {
        "Nearest Neighbors": 1,
        "Decision Tree": 3,
        "Random Forest": 2,
        "Neural Net": 2,
        "Naive Bayes": 1,
        "AdaBoost": 2,
        "QDA": 1,
    }
    config.addArgument("classifier_weights", weights)
    # params = {
    #     'Nearest Neighbors': {
Example #7
0
    def test_updatePapers(self):
        """
        Drive TargetCreator._updatePapers through seven argument sets.

        After each call the sizes of new_papers and new_author_papers are
        compared with the expected counts. For the calls that are expected
        to add an entry, the new id must appear in both the authors and the
        affiliations of the affected paper.
        """
        print("INFO: Testing updatePapers")
        log_file = os.getcwd() + self.log_path + "update_papers.log"
        # Opening in "w" mode creates the log file or empties an existing one.
        with open(log_file, "w"):
            pass
        config = ConfigHandler(self.config_raw, "update_papers")
        # Work on copies so the shared fixtures are not modified.
        author_papers_copy = deepcopy(self.author_papers)
        papers_copy = {
            key: Paper(**paper.asDict())
            for key, paper in self.papers.items()
        }
        # Each entry is unpacked into the positional arguments of
        # _updatePapers.
        tests = [
            ["qiang-wang", "qiang-wang1", None],  # No papers passed
            ["hua-wu", "hua-wu1", ["P16-1159"]],  # Error papers
            ["yun-chen", "yun-chen1", ["P16-1159"]],  # Not in paper
            ["yun-chen", "yun-chen1", ["P17-1176"]],
            ["victor-ok-li", "victor-ok-li1",
             ["P17-1176"]],  # Paper already done
            ["xuan-jing-huang", "fail-test", ["P19-1642"]],
            ["fail-test", "yun-huang1", ["S19-2016"]],
        ]

        target_creator = TargetCreator(papers_copy, self.id_to_name,
                                       author_papers_copy,
                                       **config["TargetCreator"])
        target_creator.one_per_paper = False
        target_creator.error_papers = {"P16-1159"}

        def expect_sizes(paper_count, author_count):
            # Compare the current sizes of both result containers.
            self.assertEqual(paper_count, len(target_creator.new_papers))
            self.assertEqual(author_count,
                             len(target_creator.new_author_papers))

        def expect_listed(new_id, paper_id):
            # The new id must be in the paper's authors and affiliations.
            self.assertTrue(
                new_id in target_creator.new_papers[paper_id].authors)
            self.assertTrue(
                new_id in target_creator.new_papers[paper_id].affiliations)

        # Case 0: no papers passed.
        target_creator._updatePapers(*tests[0])
        expect_sizes(1, 1)
        self.assertTrue("qiang-wang1" in target_creator.new_author_papers)
        self.assertTrue("W19-4416" in target_creator.new_papers)
        expect_listed("qiang-wang1", "W19-4416")

        # Case 1: the paper is in error_papers, so the sizes stay the same.
        target_creator._updatePapers(*tests[1])
        expect_sizes(1, 1)

        # Case 2: the sizes stay the same.
        target_creator._updatePapers(*tests[2])
        expect_sizes(1, 1)

        # Case 3: a second paper and a second author appear.
        target_creator._updatePapers(*tests[3])
        expect_sizes(2, 2)
        expect_listed("qiang-wang1", "W19-4416")
        expect_listed("yun-chen1", "P17-1176")

        # Case 4: same paper as case 3, so only the author count grows.
        target_creator._updatePapers(*tests[4])
        expect_sizes(2, 3)
        expect_listed("yun-chen1", "P17-1176")
        expect_listed("victor-ok-li1", "P17-1176")

        # Cases 5 and 6: the sizes stay the same.
        target_creator._updatePapers(*tests[5])
        expect_sizes(2, 3)

        target_creator._updatePapers(*tests[6])
        expect_sizes(2, 3)
Example #8
0
               "Arguments for how to create targets", TargetCreator.parameters)
# Register the AuthorDisambiguation option group on the CLI parser.
createCLIGroup(
    arguments, "AuthorDisambiguation",
    "Arguments for how to disambiguate authors, check author_disambiguation.py for default values",
    AuthorDisambiguation.parameters)

if __name__ == '__main__':
    # NOTE(review): args is parsed but not used in the visible part of this
    # script - confirm whether it should be applied to the config.
    args = arguments.parse_args()
    # Build the log path once and reuse it for the truncating open below.
    log_path = os.getcwd() + "/logs/disambiguate.log"
    # Opening in "w" mode creates the log file or empties an existing one.
    with open(log_path, 'w'):
        pass
    print("INFO: Starting Create Data")
    gc.collect()
    # Read the config through a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open("config.json") as config_file:
        config_raw = json.load(config_file)
    config = ConfigHandler(config_raw,
                           "disambiguate",
                           raise_error_unknown=True)
    data = loadData([
        "department_corpus", "incomplete_papers", "org_corpus", "conflicts",
        "parsed_papers", "same_names", "test_special_keys", "author_papers",
        "id_to_name"
    ], config.logger, config)
    # Pull the individual data sets out of the loaded mapping.
    author_papers = data["author_papers"]
    id_to_name = data["id_to_name"]
    same_names = data["same_names"]
    # Each parsed entry is expanded as keyword arguments into a Paper.
    parsed = {x: Paper(**info) for x, info in data["parsed_papers"].items()}
    org_corpus = data["org_corpus"]
    department_corpus = data["department_corpus"]
    incomplete = data["incomplete_papers"]
    special_keys = data["test_special_keys"]
Example #9
0
# Register the shared CLI options and the CreateTrainingData option group.
createCLIShared(arguments)
createCLIGroup(
    arguments, "CreateTrainingData",
    "Arguments for the CreateTrainingData, check the documentation of CreateTrainingData to see default "
    "values", CreateTrainingData.parameters)

if __name__ == "__main__":
    args = arguments.parse_args()
    # Build the log path once and reuse it for the truncating open below.
    log_path = os.getcwd() + "/logs/preprocess_data.log"
    # Opening in "w" mode creates the log file or empties an existing one.
    with open(log_path, 'w'):
        pass
    print("INFO: Starting Preprocess Data")
    gc.collect()
    # Read the config through a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open("config.json") as config_file:
        config_raw = json.load(config_file)
    config = ConfigHandler(config_raw,
                           "preprocess_data",
                           raise_error_unknown=True)
    config = parseCLIArgs(args, config)
    data = loadData([
        "department_corpus", "incomplete_papers", "org_corpus", "conflicts",
        "parsed_papers", "same_names", "test_special_keys"
    ], config.logger, config)
    # Pull the individual data sets out of the loaded mapping.
    same_names = data["same_names"]
    # Each parsed entry is expanded as keyword arguments into a Paper.
    parsed = {x: Paper(**info) for x, info in data["parsed_papers"].items()}
    org_corpus = data["org_corpus"]
    department_corpus = data["department_corpus"]
    incomplete = data["incomplete_papers"]
    special_keys = data["test_special_keys"]
    excluded_dict = data["conflicts"]
Example #10
0
    description="Parse Disambiguate targets. You can specify these in config.json instead of using command line arguments",
    formatter_class=argparse.MetavarTypeHelpFormatter)
# Register the shared CLI options plus the TargetCreator and
# AuthorDisambiguation option groups.
createCLIShared(arguments)
createCLIGroup(arguments, "TargetCreator", "Arguments for how to create targets", TargetCreator.parameters)
createCLIGroup(arguments, "AuthorDisambiguation", "Arguments for how to disambiguate authors, check author_disambiguation.py for default values",
               AuthorDisambiguation.parameters)

if __name__ == '__main__':
    # NOTE(review): args is parsed but not used in the visible part of this
    # script - confirm whether it should be applied to the config.
    args = arguments.parse_args()
    # Build the log path once and reuse it for the truncating open below.
    log_path = os.getcwd() + "/logs/evaluate_disambiguation.log"
    # Opening in "w" mode creates the log file or empties an existing one.
    with open(log_path, 'w'):
        pass
    print("INFO: Starting Create Data")
    gc.collect()
    # Read the config through a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open("config.json") as config_file:
        config_raw = json.load(config_file)
    config = ConfigHandler(config_raw, "evaluate_disambiguation", raise_error_unknown=True)
    data = loadData(
        ["department_corpus", "incomplete_papers", "org_corpus", "conflicts", "parsed_papers", "same_names", "test_special_keys", "author_papers",
         "id_to_name"], config.logger, config)
    # Pull the individual data sets out of the loaded mapping.
    author_papers = data["author_papers"]
    id_to_name = data["id_to_name"]
    same_names = data["same_names"]
    # Each parsed entry is expanded as keyword arguments into a Paper.
    parsed = {x: Paper(**info) for x, info in data["parsed_papers"].items()}
    org_corpus = data["org_corpus"]
    department_corpus = data["department_corpus"]
    incomplete = data["incomplete_papers"]
    special_keys = data["test_special_keys"]

    target_creator = TargetCreator(parsed, id_to_name, author_papers, **config["TargetCreator"])
    tests = [
# Register the shared CLI options plus the PDFParser and ACLParser groups.
createCLIShared(arguments)
createCLIGroup(
    arguments, "PDFParser",
    "Arguments for the PDFParser, check the documentation of pdf_parser.py to see default values",
    PDFParserWrapper.parameters)
createCLIGroup(
    arguments, "ACLParser",
    "Arguments for the ACLParser, check the documentation of acl_parser.py to see default values",
    ACLParser.parameters)

if __name__ == '__main__':
    args = arguments.parse_args()
    # Build the log path once and reuse it for the truncating open below.
    log_path = os.getcwd() + "/logs/create_data.log"
    # Opening in "w" mode creates the log file or empties an existing one.
    with open(log_path, 'w'):
        pass
    print("INFO: Starting Create Data")
    gc.collect()
    # Read the config through a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open("config.json") as config_file:
        config_raw = json.load(config_file)
    config = ConfigHandler(config_raw, "create_data", raise_error_unknown=True)
    config = parseCLIArgs(args, config)
    # Run the ACL parser on the configured XML and name-variant paths.
    acl_parser = ACLParser(**config["ACLParser"])
    acl_parser(config["xml_path"], config["name_variants_path"])

    data = loadData(["aliases", "acl_papers", "id_to_name", "same_names"],
                    config.logger,
                    config,
                    override_keys={"acl_papers": "papers"})
    # The loaded data and the PDFParser config section are both expanded as
    # keyword arguments for the wrapper.
    parser = PDFParserWrapper(**data, **config["PDFParser"])
    parser(config["parsed_pdf_path"])
    gc.collect()