def get_extractor(settings):
    """
    Instantiate, train and return a Citation_Extractor.

    :param settings: a settings module; must provide `DATA_DIRS` (iterable of
        directories with IOB training files, extension ".txt") and
        `CLASSIFIER` (a classifier instance, or None for the default one).
    :return: a trained `citation_extractor` instance, or None if
        instantiation/training failed (the error is logged).
    """
    import citation_extractor as citation_extractor_module
    from citation_extractor.core import citation_extractor
    from citation_extractor.Utils import IO
    ce = None
    try:
        logger.info("Using CitationExtractor v. %s" % citation_extractor_module.__version__)
        train_instances = []
        for directory in settings.DATA_DIRS:
            train_instances += IO.read_iob_files(directory, extension=".txt")
        logger.info(
            "Training data: found %i directories containing %i sentences and %i tokens"
            % (len(settings.DATA_DIRS), len(train_instances), IO.count_tokens(train_instances)))
        if settings.CLASSIFIER is None:
            ce = citation_extractor(settings)
        else:
            ce = citation_extractor(settings, settings.CLASSIFIER)
    except Exception as e:
        # route the failure through the module logger instead of bare `print`
        # so it reaches the configured log handlers
        logger.error("Initialising the citation extractor failed with error \"%s\"" % e)
    # the docstring promises the extractor is returned; None signals failure
    return ce
def preproc_document(doc_id, inp_dir, interm_dir, out_dir, abbreviations, taggers):
    """
    Pre-process one document: sentence-split, language-detect, PoS-tag and tokenise.

    :param doc_id: the input filename
    :param inp_dir: the input directory
    :param interm_dir: directory where the intermediate (sentence-split) text is written
    :param out_dir: directory where the PoS-tagged, IOB-formatted output is written
    :param abbreviations: abbreviation list used to recover sentence-segmentation errors
    :param taggers: mapping from language code to a PoS tagger exposing `tag_sents`

    Returns: language, number of sentences, number of tokens
        (all three are NaN for any stage that failed).
    """
    lang, no_sentences, no_tokens = np.nan, np.nan, np.nan
    try:
        intermediate_out_file = "%s%s" % (interm_dir, doc_id)
        iob_out_file = "%s%s" % (out_dir, doc_id)
        # `with` guarantees the file handles are closed (the originals leaked them)
        with codecs.open("%s%s" % (inp_dir, doc_id), 'r', 'utf-8') as input_file:
            text = input_file.read()
        intermediate_text = sentencebreaks_to_newlines(text)
        recovered_text = recover_segmentation_errors(intermediate_text, abbreviations, verbose=False)
        with codecs.open(intermediate_out_file, 'w', 'utf-8') as intermediate_file:
            intermediate_file.write(recovered_text)
        logger.info("Written intermediate output to %s" % intermediate_out_file)
        lang = detect_language(text)
        logger.info("Language detected=\"%s\"" % lang)
        sentences = recovered_text.split('\n')
        logger.info("Document \"%s\" has %i sentences" % (doc_id, len(sentences)))
        tagged_sentences = taggers[lang].tag_sents(sentences)
        # keep only the first two fields (token, PoS tag) of each tagger tuple
        tokenised_text = [[token[:2] for token in line] for line in tagged_sentences]
        IO.write_iob_file(tokenised_text, iob_out_file)
        logger.info("Written IOB output to %s" % iob_out_file)
        # reuse the split already computed instead of re-splitting the text
        no_sentences = len(sentences)
        no_tokens = IO.count_tokens(tokenised_text)
    except Exception as e:
        logger.error("The pre-processing of document %s (lang=\'%s\') failed with error \"%s\"" % (doc_id, lang, e))
    return lang, no_sentences, no_tokens
def preproc_document(doc_id, inp_dir, interm_dir, out_dir, abbreviations, taggers, split_sentences=True):
    """
    Pre-process one document: optionally sentence-split, then PoS-tag and tokenise.

    :param doc_id: the input filename
    :param inp_dir: the input directory
    :param interm_dir: the directory where to store intermediate outputs
    :param out_dir: the directory where to store the PoS-tagged and tokenised text
    :param abbreviations: abbreviation list used to recover sentence-segmentation errors
    :param taggers: the dictionary returned by `get_taggers`
    :param split_sentences: (boolean) whether to split text into sentences or not.
        If `False`, text is split on newline characters `\\n`.

    Returns: language, number of sentences, number of tokens
        (all three are NaN for any stage that failed).
    """
    lang, no_sentences, no_tokens = np.nan, np.nan, np.nan
    try:
        intermediate_out_file = "%s%s" % (interm_dir, doc_id)
        iob_out_file = "%s%s" % (out_dir, doc_id)
        # `with` guarantees the file handles are closed (the originals leaked them)
        with codecs.open("%s%s" % (inp_dir, doc_id), 'r', 'utf-8') as input_file:
            text = input_file.read()
        if split_sentences:
            intermediate_text = sentencebreaks_to_newlines(text)
            text = recover_segmentation_errors(intermediate_text, abbreviations, verbose=False)
        else:
            logger.info("Document %s: skipping sentence splitting" % doc_id)
        # after the branch, `text` is newline-separated sentences in both cases
        sentences = text.split('\n')
        logger.info("Document \"%s\" has %i sentences" % (doc_id, len(sentences)))
        with codecs.open(intermediate_out_file, 'w', 'utf-8') as intermediate_file:
            intermediate_file.write(text)
        logger.info("Written intermediate output to %s" % intermediate_out_file)
        lang = detect_language(text)
        logger.info("Language detected=\"%s\"" % lang)
        tagged_sentences = taggers[lang].tag_sents(sentences)
        # unlike the two-field variant, this version keeps the full tagger tuples;
        # `list(line)` replaces the identity comprehension `[token for token in line]`
        tokenised_text = [list(line) for line in tagged_sentences]
        IO.write_iob_file(tokenised_text, iob_out_file)
        logger.info("Written IOB output to %s" % iob_out_file)
        # reuse the split already computed instead of re-splitting the text
        no_sentences = len(sentences)
        no_tokens = IO.count_tokens(tokenised_text)
    except Exception as e:
        logger.error(
            "The pre-processing of document %s (lang=\'%s\') failed with error \"%s\"" % (doc_id, lang, e))
    return lang, no_sentences, no_tokens
def get_extractor(settings):
    """
    Instantiate, train and return a Citation_Extractor.

    :param settings: a settings module; must provide `DATA_DIRS` (iterable of
        directories with IOB training files, extension ".txt").
    :return: a trained `citation_extractor` instance, or None if
        instantiation/training failed (the error is logged).
    """
    import citation_extractor as citation_extractor_module
    from citation_extractor.core import citation_extractor
    from citation_extractor.eval import IO
    ce = None
    try:
        logger.info("Using CitationExtractor v. %s" % citation_extractor_module.__version__)
        train_instances = []
        for directory in settings.DATA_DIRS:
            train_instances += IO.read_iob_files(directory, extension=".txt")
        logger.info(
            "Training data: found %i directories containing %i sentences and %i tokens"
            % (len(settings.DATA_DIRS), len(train_instances), IO.count_tokens(train_instances)))
        ce = citation_extractor(settings)
    except Exception as e:
        # route the failure through the module logger instead of bare `print`
        # so it reaches the configured log handlers
        logger.error("Initialising the citation extractor failed with error \"%s\"" % e)
    # the docstring promises the extractor is returned; None signals failure
    return ce