Example 1
0
    def test_improvement(pre_settings, post_settings):
        """
		TODO: what this function should do:
		1. run without selected candidates in the train set and evaluate
		2. run with selected candidates in the train set and evaluate
		3. return: stats for the 1st run, stats for the 2nd run and improvement obtained 
		"""
        from citation_extractor.core import citation_extractor
        from citation_extractor.eval import SimpleEvaluator
        from citation_extractor.Utils import aph_corpus
        from citation_extractor.Utils import IO
        # extractor without selected candidates in the train set and evaluate
        pre_extractor = citation_extractor(pre_settings)
        # extractor with selected candidates in the train set and evaluate
        post_extractor = citation_extractor(post_settings)
        # initialise evaluator and evaluate against the test set
        se = SimpleEvaluator([pre_extractor, post_extractor],
                             post_settings.TEST_DIR)
        results = se.eval()
        print "***data***"
        print "pre-active learning TRAIN-SET: %s" % str(pre_settings.DATA_DIRS)
        train_details = aph_corpus.get_collection_details(
            pre_settings.TRAIN_COLLECTIONS)
        print "pre-active learning TRAIN-SET: # tokens = %i; # NEs = %i" % (
            train_details['total_token_count'],
            train_details['ne_token_count'])
        train_details = aph_corpus.get_collection_details(
            post_settings.TRAIN_COLLECTIONS)
        print "post-active learning TRAIN-SET: %s" % str(
            post_settings.DATA_DIRS)
        print "post-active learning TRAIN-SET: # tokens = %i; # NEs = %i" % (
            train_details['total_token_count'],
            train_details['ne_token_count'])
        test_details = aph_corpus.get_collection_details(
            post_settings.TEST_COLLECTIONS)
        print "TEST-SET: %s" % str(post_settings.TEST_DIR)
        print "TEST-SET details: # tokens = %i; # NEs = %i\n" % (
            test_details['total_token_count'], test_details['ne_token_count'])
        print "*** pre-active learning ***"
        pre_al_results = results[str(pre_extractor)][0]
        print "fscore: %f \nprecision: %f\nrecall: %f\n" % (
            pre_al_results["f-score"] * 100, pre_al_results["precision"] * 100,
            pre_al_results["recall"] * 100)
        print "*** post-active learning ***"
        post_al_results = results[str(post_extractor)][0]
        print "fscore: %f \nprecision: %f\nrecall: %f\n" % (
            post_al_results["f-score"] * 100, post_al_results["precision"] *
            100, post_al_results["recall"] * 100)
        print "*** post-active learning gain (%) ***"
        print "fscore: %f \nprecision: %f\nrecall: %f\n" % (
            post_al_results["f-score"] * 100 - pre_al_results["f-score"] * 100,
            post_al_results["precision"] * 100 -
            pre_al_results["precision"] * 100,
            post_al_results["recall"] * 100 - pre_al_results["recall"] * 100)
        IO.write_iob_file(se.output[str(pre_extractor)],
                          "%spre_out.data" % post_settings.OUT_DIR)
        IO.write_iob_file(se.output[str(post_extractor)],
                          "%spost_out.data" % post_settings.OUT_DIR)
Example 2
0
	def select_candidates(settings):
		"""
		Run the ActiveLearner and select a set of effective candidates.

		Writes two report files under settings.TEMP_DIR:
		- ec_details.txt: one "[instance] token -> score" line per candidate
		- ec_list.txt: the pruned candidate ids, numbered "n/total<TAB>id"
		"""
		from citation_extractor.core import citation_extractor
		from citation_extractor.Utils import aph_corpus

		extr = citation_extractor(settings)
		# NOTE(review): ActiveLearner is assumed to be in scope at module
		# level -- it is not imported here; confirm against the full file.
		al = ActiveLearner(extr, 0.2, settings.DEV_DIR, settings.TEST_DIR)
		candidates = al.learn()
		pruned_candidates = al.get_pruned_candidates()
		al.logger.info("Total tokens classified: %i" % al.token_count)

		effective_candidates_detail = "\n".join([
			"[%s] %s -> %f" % (c.instance, c.token, c.ci_score)
			for c in candidates
		])
		# context manager closes the handle even on a write error; the
		# original shadowed the builtin `file` and leaked it on failure
		with open("%sec_details.txt" % settings.TEMP_DIR, "w") as details_file:
			details_file.write(effective_candidates_detail)

		# hoist the invariant total instead of re-querying the learner
		# once per candidate (and avoid shadowing the builtin `id`)
		total = len(pruned_candidates)
		effective_candidate_list = "\n".join([
			"%s/%s\t%s" % (n, total, cand_id)
			for n, cand_id in enumerate(pruned_candidates)
		])
		with open("%sec_list.txt" % settings.TEMP_DIR, "w") as list_file:
			list_file.write(effective_candidate_list)
Example 3
0
    def tag_candidates(settings):
        import glob
        import os
        import codecs
        from citation_extractor.Utils import IO
        from citation_extractor.core import citation_extractor

        extractor = citation_extractor(settings)
        for infile in glob.glob(os.path.join(settings.CANDIDATES_DIR,
                                             '*.iob')):
            print "processing %s" % infile
            instances = IO.file_to_instances(infile)
            string_instances = [[tok[0] for tok in i] for i in instances]
            results = extractor.extract([string_instances])
            out_dir = settings.OUT_DIR
            out_fname = "%s%s" % (out_dir, os.path.basename(infile))
            file = codecs.open(out_fname, 'w', encoding="utf-8")
            instances = [
                "\n".join([
                    "%s\t%s" % (t["token"].decode("utf-8"), t["label"])
                    for t in r
                ]) for r in results
            ]
            file.write("\n\n".join(instances))
            file.close()
            print "output written to %s" % out_fname
Example 4
0
    def select_candidates(settings):
        """
        Run the ActiveLearner and select a set of effective candidates.

        Writes two report files under settings.TEMP_DIR:
        - ec_details.txt: one "[instance] token -> score" line per candidate
        - ec_list.txt: the pruned candidate ids, numbered "n/total<TAB>id"
        """
        from citation_extractor.core import citation_extractor
        from citation_extractor.Utils import aph_corpus

        extr = citation_extractor(settings)
        # NOTE(review): ActiveLearner is assumed to be in scope at module
        # level -- it is not imported here; confirm against the full file.
        al = ActiveLearner(extr, 0.2, settings.DEV_DIR, settings.TEST_DIR)
        candidates = al.learn()
        pruned_candidates = al.get_pruned_candidates()
        al.logger.info("Total tokens classified: %i" % al.token_count)

        effective_candidates_detail = "\n".join([
            "[%s] %s -> %f" % (c.instance, c.token, c.ci_score)
            for c in candidates
        ])
        # context manager closes the handle even on a write error; the
        # original shadowed the builtin `file` and leaked it on failure
        with open("%sec_details.txt" % settings.TEMP_DIR, "w") as details_file:
            details_file.write(effective_candidates_detail)

        # hoist the invariant total instead of re-querying the learner
        # once per candidate (and avoid shadowing the builtin `id`)
        total = len(pruned_candidates)
        effective_candidate_list = "\n".join([
            "%s/%s\t%s" % (n, total, cand_id)
            for n, cand_id in enumerate(pruned_candidates)
        ])
        with open("%sec_list.txt" % settings.TEMP_DIR, "w") as list_file:
            list_file.write(effective_candidate_list)
Example 5
0
	def test_improvement(pre_settings,post_settings):
		"""
		Compare two extractor configurations on the same test set.

		1. evaluate an extractor trained without the selected candidates
		2. evaluate an extractor trained with the selected candidates
		3. print stats for both runs and the improvement (gain) obtained,
		   and dump both IOB outputs under post_settings.OUT_DIR
		"""
		from citation_extractor.core import citation_extractor
		from citation_extractor.eval import SimpleEvaluator
		from citation_extractor.Utils import aph_corpus
		from citation_extractor.Utils import IO
		# extractor without selected candidates in the train set and evaluate
		pre_extractor = citation_extractor(pre_settings)
		# extractor with selected candidates in the train set and evaluate
		post_extractor = citation_extractor(post_settings)
		# initialise evaluator and evaluate both against the same test set
		se = SimpleEvaluator([pre_extractor,post_extractor],post_settings.TEST_DIR)
		results = se.eval()
		# corpus statistics: train sets before/after active learning, then test set
		print "***data***"
		print "pre-active learning TRAIN-SET: %s"%str(pre_settings.DATA_DIRS)
		train_details = aph_corpus.get_collection_details(pre_settings.TRAIN_COLLECTIONS)
		print "pre-active learning TRAIN-SET: # tokens = %i; # NEs = %i"%(train_details['total_token_count'],train_details['ne_token_count'])
		# `train_details` is reused for the post-AL collection stats
		train_details = aph_corpus.get_collection_details(post_settings.TRAIN_COLLECTIONS)
		print "post-active learning TRAIN-SET: %s"%str(post_settings.DATA_DIRS)
		print "post-active learning TRAIN-SET: # tokens = %i; # NEs = %i"%(train_details['total_token_count'],train_details['ne_token_count'])
		test_details = aph_corpus.get_collection_details(post_settings.TEST_COLLECTIONS)
		print "TEST-SET: %s"%str(post_settings.TEST_DIR)
		print "TEST-SET details: # tokens = %i; # NEs = %i\n"%(test_details['total_token_count'],test_details['ne_token_count'])
		# scores are keyed by the extractor's string representation;
		# [0] picks the first (overall) result entry
		print "*** pre-active learning ***"
		pre_al_results = results[str(pre_extractor)][0]
		print "fscore: %f \nprecision: %f\nrecall: %f\n"%(pre_al_results["f-score"]*100,pre_al_results["precision"]*100,pre_al_results["recall"]*100)
		print "*** post-active learning ***"
		post_al_results = results[str(post_extractor)][0]
		print "fscore: %f \nprecision: %f\nrecall: %f\n"%(post_al_results["f-score"]*100,post_al_results["precision"]*100,post_al_results["recall"]*100)
		# gain = post score minus pre score, per metric, in percent points
		print "*** post-active learning gain (%) ***"
		print "fscore: %f \nprecision: %f\nrecall: %f\n"%(post_al_results["f-score"]*100 - pre_al_results["f-score"]*100,post_al_results["precision"]*100 - pre_al_results["precision"]*100,post_al_results["recall"]*100 - pre_al_results["recall"]*100)
		# persist both tagged outputs for manual inspection/diffing
		IO.write_iob_file(se.output[str(pre_extractor)],"%spre_out.data"%post_settings.OUT_DIR)
		IO.write_iob_file(se.output[str(post_extractor)],"%spost_out.data"%post_settings.OUT_DIR)
Example 6
0
def get_extractor(settings):
	"""
	Instantiate, train and return a Citation_Extractor. 
	"""
	import sys
	import citation_extractor as citation_extractor_module
	from citation_extractor.core import citation_extractor
	from citation_extractor.eval import IO
	ce = None
	try:
		logger.info("Using CitationExtractor v. %s"%citation_extractor_module.__version__)
		train_instances = []
		for directory in settings.DATA_DIRS:
		    train_instances += IO.read_iob_files(directory,extension=".txt")
		logger.info("Training data: found %i directories containing %i  sentences and %i tokens"%(len(settings.DATA_DIRS),len(train_instances),IO.count_tokens(train_instances)))
		ce = citation_extractor(settings)
	except Exception, e:
		print e
Example 7
0
	def tag_candidates(settings):
		"""
		Tag every candidate *.iob file in settings.CANDIDATES_DIR with the
		citation extractor and write one tagged output file per input,
		with the same basename, under settings.OUT_DIR.
		"""
		import glob
		import os
		import codecs
		from citation_extractor.Utils import IO
		from citation_extractor.core import citation_extractor
		
		extractor = citation_extractor(settings)
		for infile in glob.glob( os.path.join(settings.CANDIDATES_DIR, '*.iob') ):
			print "processing %s"%infile
			instances = IO.file_to_instances(infile)
			# keep only the token column of each IOB instance
			string_instances = [[tok[0] for tok in i]for i in instances]
			results = extractor.extract([string_instances])
			out_dir = settings.OUT_DIR
			out_fname = "%s%s"%(out_dir,os.path.basename(infile))
			# NOTE(review): `file` shadows the builtin, and the handle is
			# not closed if a write raises -- consider a `with` block
			file = codecs.open(out_fname, 'w',encoding="utf-8")
			# one "token<TAB>label" line per token, blank line between
			# instances; tokens are utf-8 bytes, so decode before writing
			instances = ["\n".join(["%s\t%s"%(t["token"].decode("utf-8"),t["label"]) for t in r]) for r in results]
			file.write("\n\n".join(instances))
			file.close()
			print "output written to %s"%out_fname