Example #1
def get_extractor(settings):
    """
    Instantiate, train and return a Citation_Extractor.
    """
    import citation_extractor as citation_extractor_module
    from citation_extractor.core import citation_extractor
    from citation_extractor.Utils import IO
    ce = None
    try:
        logger.info("Using CitationExtractor v. %s" %
                    citation_extractor_module.__version__)
        # Gather IOB-formatted training instances from every data directory.
        train_instances = []
        for directory in settings.DATA_DIRS:
            train_instances += IO.read_iob_files(directory, extension=".txt")
        logger.info(
            "Training data: found %i directories containing %i sentences and %i tokens"
            % (len(settings.DATA_DIRS), len(train_instances),
               IO.count_tokens(train_instances)))

        # Pass an explicitly configured classifier through, if there is one.
        if settings.CLASSIFIER is None:
            ce = citation_extractor(settings)
        else:
            ce = citation_extractor(settings, settings.CLASSIFIER)

    except Exception as e:
        print(e)
    return ce
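
A minimal usage sketch, assuming only what the function reads from its argument: a settings object exposing `DATA_DIRS` and `CLASSIFIER`. The namespace and the data path below are hypothetical.

import types

# Hypothetical stand-in for the real settings module.
my_settings = types.SimpleNamespace(
    DATA_DIRS=["data/iob/"],  # hypothetical directory of .txt IOB files
    CLASSIFIER=None,          # None takes the default-classifier branch
)
extractor = get_extractor(my_settings)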
Example #2
def preproc_document(doc_id, inp_dir, interm_dir, out_dir, abbreviations, taggers):
    """
    Returns:

    language, number of sentences, number of tokens

    """
    # Assumes module-level imports: codecs, numpy as np, a configured
    # `logger`, plus the package helpers sentencebreaks_to_newlines,
    # recover_segmentation_errors, detect_language and IO.
    lang, no_sentences, no_tokens = np.nan, np.nan, np.nan
    try:
        # Paths are built by plain concatenation, so the directory
        # arguments are expected to end with a path separator.
        intermediate_out_file = "%s%s" % (interm_dir, doc_id)
        iob_out_file = "%s%s" % (out_dir, doc_id)
        text = codecs.open("%s%s" % (inp_dir, doc_id), 'r', 'utf-8').read()
        # Normalise sentence breaks to newlines, then undo splits caused
        # by known abbreviations.
        intermediate_text = sentencebreaks_to_newlines(text)
        recovered_text = recover_segmentation_errors(intermediate_text,
                                                     abbreviations,
                                                     verbose=False)
        codecs.open(intermediate_out_file, 'w', 'utf-8').write(recovered_text)
        logger.info("Written intermediate output to %s" % intermediate_out_file)
        lang = detect_language(text)
        logger.info("Language detected=\"%s\"" % lang)
        sentences = recovered_text.split('\n')
        logger.info("Document \"%s\" has %i sentences" % (doc_id, len(sentences)))
        # PoS-tag each sentence and keep only (token, PoS) pairs.
        tagged_sentences = taggers[lang].tag_sents(sentences)
        tokenised_text = [[token[:2] for token in line] for line in tagged_sentences]
        IO.write_iob_file(tokenised_text, iob_out_file)
        logger.info("Written IOB output to %s" % iob_out_file)
        no_sentences = len(sentences)
        no_tokens = IO.count_tokens(tokenised_text)
    except Exception as e:
        logger.error("The pre-processing of document %s (lang='%s') failed with error \"%s\""
                     % (doc_id, lang, e))
    return lang, no_sentences, no_tokens
Example #3
def preproc_document(doc_id,
                     inp_dir,
                     interm_dir,
                     out_dir,
                     abbreviations,
                     taggers,
                     split_sentences=True):
    """
    :param doc_id: the input filename
    :param inp_dir: the input directory
    :param interm_dir: the directory where to store intermediate outputs
    :param out_dir: the directory where to store the PoS-tagged and tokenised text
    :param abbreviations:
    :param taggers: the dictionary returned by `get_taggers`
    :param split_sentences: (boolean) whether to split the text into sentences or not.
                            If `False`, text is split on newline characters `\\n`.

    Returns:

    language, number of sentences, number of tokens

    """
    lang, no_sentences, no_tokens = np.nan, np.nan, np.nan
    try:
        intermediate_out_file = "%s%s" % (interm_dir, doc_id)
        iob_out_file = "%s%s" % (out_dir, doc_id)
        text = codecs.open("%s%s" % (inp_dir, doc_id), 'r', 'utf-8').read()
        if split_sentences:
            intermediate_text = sentencebreaks_to_newlines(text)
            text = recover_segmentation_errors(intermediate_text,
                                               abbreviations,
                                               verbose=False)
        else:
            logger.info("Document %s: skipping sentence splitting" % doc_id)
        sentences = text.split('\n')
        logger.info("Document \"%s\" has %i sentences" %
                    (doc_id, len(sentences)))
        codecs.open(intermediate_out_file, 'w', 'utf-8').write(text)
        logger.info("Written intermediate output to %s" %
                    intermediate_out_file)
        lang = detect_language(text)
        logger.info("Language detected=\"%s\"" % lang)
        tagged_sentences = taggers[lang].tag_sents(sentences)
        # Unlike example #2, this keeps the tagger's full output per token.
        tokenised_text = [[token for token in line]
                          for line in tagged_sentences]
        IO.write_iob_file(tokenised_text, iob_out_file)
        logger.info("Written IOB output to %s" % iob_out_file)
        no_sentences = len(sentences)
        no_tokens = IO.count_tokens(tokenised_text)
    except Exception as e:
        logger.error(
            "The pre-processing of document %s (lang='%s') failed with error \"%s\""
            % (doc_id, lang, e))
    return lang, no_sentences, no_tokens
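
For input that already has one sentence per line, the `split_sentences` flag skips the sentence splitter and the function falls back to splitting on newlines. A hypothetical call, reusing the `taggers` dictionary from the sketch above:

# All paths hypothetical; "presplit/" holds one-sentence-per-line files.
lang, n_sents, n_toks = preproc_document(
    "doc001.txt", "presplit/", "intermediate/", "iob/",
    abbreviations=[], taggers=taggers, split_sentences=False)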
Example #4
def get_extractor(settings):
    """
    Instantiate, train and return a Citation_Extractor.
    """
    import citation_extractor as citation_extractor_module
    from citation_extractor.core import citation_extractor
    from citation_extractor.eval import IO
    ce = None
    try:
        logger.info("Using CitationExtractor v. %s" %
                    citation_extractor_module.__version__)
        train_instances = []
        for directory in settings.DATA_DIRS:
            train_instances += IO.read_iob_files(directory, extension=".txt")
        logger.info("Training data: found %i directories containing %i sentences and %i tokens"
                    % (len(settings.DATA_DIRS), len(train_instances),
                       IO.count_tokens(train_instances)))
        # Unlike example #1, this variant always uses the default classifier.
        ce = citation_extractor(settings)
    except Exception as e:
        print(e)
    return ce
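
All four examples report progress through a module-level `logger` that none of the snippets defines; a minimal sketch of the setup they assume:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # module-level logger used above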