Ejemplo n.º 1
0
 def __init__(self,
              extractors,
              iob_directories=None,
              iob_file=None,
              label_index=-1):
     """
     Store the extractors to evaluate and load the test instances.

     Args:
         extractors:
             the list of canonical citation extractors to evaluate
         iob_directories:
             directories whose ".txt" files in IOB format are read as test
             data; only used when `iob_file` is None (default: no
             directories, i.e. no test instances)
         iob_file:
             a single file in IOB format to be used for testing and
             evaluating the extractors; when given, `iob_directories`
             is ignored
         label_index:
             kept as `self.label_index`; presumably the position of the
             label inside each IOB token -- TODO confirm against the
             methods that read it
     """
     import logging
     self.logger = logging.getLogger("CREX.SIMPLEVAL")
     # None stands in for the former mutable `[]` default, so no list
     # object is shared between calls.
     if iob_directories is None:
         iob_directories = []
     if iob_file is None:
         # read the test instances from a list of directories containing
         # the test data
         self.logger.debug(iob_directories)
         data = []
         for directory in iob_directories:
             data += IO.read_iob_files(directory, ".txt")
         self.test_instances = data
     else:
         self.test_instances = IO.file_to_instances(iob_file)
     # let logging do the interpolation only if DEBUG is enabled
     self.logger.debug("Found %i instances for test",
                       len(self.test_instances))
     self.extractors = extractors
     self.output = {}
     self.error_matrix = None
     self.label_index = label_index
Ejemplo n.º 2
0
 def __init__(self, extractors, iob_directories=None, iob_file=None, label_index=-1):
     """
     Store the extractors to evaluate and load the test instances.

     Args:
         extractors:
             the list of canonical citation extractors to evaluate
         iob_directories:
             directories whose ".txt" files in IOB format are read as test
             data; consulted only when `iob_file` is None (default: no
             directories, which yields an empty list of test instances)
         iob_file:
             a single file in IOB format to be used for testing and
             evaluating the extractors; takes precedence over
             `iob_directories` when it is not None
         label_index:
             kept as `self.label_index`; presumably the position of the
             label inside each IOB token -- TODO confirm against the
             methods that read it
     """
     import logging
     self.logger = logging.getLogger("CREX.SIMPLEVAL")
     # A None default avoids sharing one mutable list between all calls.
     if iob_directories is None:
         iob_directories = []
     if iob_file is None:
         # read the test instances from a list of directories containing
         # the test data
         self.logger.debug(iob_directories)
         data = []
         for directory in iob_directories:
             data += IO.read_iob_files(directory, ".txt")
         self.test_instances = data
     else:
         self.test_instances = IO.file_to_instances(iob_file)
     # arguments are passed separately so formatting is deferred to logging
     self.logger.debug("Found %i instances for test", len(self.test_instances))
     self.extractors = extractors
     self.output = {}
     self.error_matrix = None
     self.label_index = label_index
Ejemplo n.º 3
0
def get_extractor(settings):
    """
    Instantiate and return a citation extractor configured by `settings`.

    Args:
        settings:
            configuration object; this function reads `settings.DATA_DIRS`
            (directories scanned for ".txt" IOB files) and
            `settings.CLASSIFIER` (passed to the extractor when not None),
            and hands the whole object to `citation_extractor`.

    Returns:
        the `citation_extractor` instance, or None when anything in the
        set-up raised (the error is logged, not propagated).
    """
    import citation_extractor as citation_extractor_module
    from citation_extractor.core import citation_extractor
    from citation_extractor.Utils import IO
    ce = None
    try:
        logger.info("Using CitationExtractor v. %s",
                    citation_extractor_module.__version__)
        # The instances read here are only counted for the log message
        # below; presumably the extractor reads its own training data
        # from `settings` -- TODO confirm.
        train_instances = []
        for directory in settings.DATA_DIRS:
            train_instances += IO.read_iob_files(directory, extension=".txt")
        logger.info(
            "Training data: found %i directories containing %i  sentences and %i tokens",
            len(settings.DATA_DIRS), len(train_instances),
            IO.count_tokens(train_instances))

        if settings.CLASSIFIER is None:
            ce = citation_extractor(settings)
        else:
            ce = citation_extractor(settings, settings.CLASSIFIER)

    except Exception as e:
        # Best effort, as before: report the failure and fall through with
        # ce still None. `as` works on Python 2.6+ and 3; logging (instead
        # of a bare print) keeps the traceback.
        logger.exception("Unable to create the citation extractor: %s", e)
    return ce
Ejemplo n.º 4
0
def get_extractor(settings):
    """
    Instantiate and return a citation extractor configured by `settings`.

    Args:
        settings:
            configuration object; this function reads `settings.DATA_DIRS`
            (directories scanned for ".txt" IOB files) and hands the whole
            object to `citation_extractor`.

    Returns:
        the `citation_extractor` instance, or None when anything in the
        set-up raised (the error is logged, not propagated).
    """
    import citation_extractor as citation_extractor_module
    from citation_extractor.core import citation_extractor
    from citation_extractor.eval import IO
    ce = None
    try:
        logger.info("Using CitationExtractor v. %s",
                    citation_extractor_module.__version__)
        # The instances read here are only counted for the log message
        # below; presumably the extractor reads its own training data
        # from `settings` -- TODO confirm.
        train_instances = []
        for directory in settings.DATA_DIRS:
            train_instances += IO.read_iob_files(directory, extension=".txt")
        logger.info(
            "Training data: found %i directories containing %i  sentences and %i tokens",
            len(settings.DATA_DIRS), len(train_instances),
            IO.count_tokens(train_instances))
        ce = citation_extractor(settings)
    except Exception as e:
        # Best effort, as before: report the failure and fall through with
        # ce still None. `as` works on Python 2.6+ and 3; logging (instead
        # of a bare print) keeps the traceback.
        logger.exception("Unable to create the citation extractor: %s", e)
    return ce
Ejemplo n.º 5
0
 def read_instances(directories):
     """
     Gather the IOB instances of every directory into one new list.

     Each entry of `directories` is handed to `IO.read_iob_files`; whatever
     it yields is appended, in order, to the returned list.
     """
     return [
         instance
         for folder in directories
         for instance in IO.read_iob_files(folder)
     ]
Ejemplo n.º 6
0
 def read_instances(directories):
     """
     Read the IOB files of each given directory and pool the instances.

     The directories are visited in the order given; the instances that
     `IO.read_iob_files` returns for each one are accumulated in a single
     list, which is returned.
     """
     instances = []
     for path in directories:
         instances.extend(IO.read_iob_files(path))
     return instances