예제 #1
0
def preproc_document(doc_id,inp_dir,interm_dir,out_dir,abbreviations,taggers):
	"""
	Sentence-split, PoS-tag and tokenise a single document and store it as IOB.

	The file `inp_dir + doc_id` is read as UTF-8, its sentence breaks are turned
	into newlines (`sentencebreaks_to_newlines`) and then corrected with
	`recover_segmentation_errors`; that text is written to `interm_dir + doc_id`.
	The language is detected on the raw input text, the matching tagger is
	looked up in `taggers` and the tagged sentences (first two fields of every
	token) are written to `out_dir + doc_id` through `IO.write_iob_file`.

	The directory arguments are concatenated with `doc_id` as they are, so they
	have to end with a path separator.

	Any exception is logged and swallowed; the values that were not computed
	before the failure stay `np.nan`.

	Returns:

	language, number of sentences, number of tokens

	"""
	lang, no_sentences, no_tokens = np.nan, np.nan, np.nan
	try:
		intermediate_out_file = "%s%s" % (interm_dir, doc_id)
		iob_out_file = "%s%s" % (out_dir, doc_id)
		# context managers make sure the handles are closed (and the output
		# flushed) even when one of the later steps raises
		with codecs.open("%s%s" % (inp_dir, doc_id), 'r', 'utf-8') as inp_file:
			text = inp_file.read()
		intermediate_text = sentencebreaks_to_newlines(text)
		recovered_text = recover_segmentation_errors(intermediate_text, abbreviations, verbose=False)
		with codecs.open(intermediate_out_file, 'w', 'utf-8') as interm_file:
			interm_file.write(recovered_text)
		logger.info("Written intermediate output to %s" % intermediate_out_file)
		lang = detect_language(text)
		logger.info("Language detected=\"%s\"" % lang)
		sentences = recovered_text.split('\n')
		logger.info("Document \"%s\" has %i sentences" % (doc_id, len(sentences)))
		tagged_sentences = taggers[lang].tag_sents(sentences)
		# keep only the first two fields of each tagged token
		tokenised_text = [[token[:2] for token in line] for line in tagged_sentences]
		IO.write_iob_file(tokenised_text, iob_out_file)
		logger.info("Written IOB output to %s" % iob_out_file)
		no_sentences = len(sentences)
		no_tokens = IO.count_tokens(tokenised_text)
	except Exception as e:
		logger.error("The pre-processing of document %s (lang=\'%s\') failed with error \"%s\"" % (doc_id, lang, e))
	# hand back what the docstring promises; on failure the placeholders are returned
	return lang, no_sentences, no_tokens
예제 #2
0
def extract_citations(extractor,outputdir,filename,iob_sentences,outfilename=None):
	"""
	Run `extractor` over IOB sentences and write the labelled tokens to a file.

	Empty sentences are dropped. For the remaining ones the first field of each
	token is passed to `extractor.extract` as the token and the second one as
	its `z_POS` feature. The output rows are `(token, PoS tag, label)`.

	:param extractor: object exposing `extract(instances, postags)`
	:param outputdir: prefix prepended to the base name of `filename` when
		`outfilename` is not given (plain concatenation, so it has to end with
		a path separator)
	:param filename: path whose base name is reused for the output file
	:param iob_sentences: sentences, each one a sequence of indexable tokens
	:param outfilename: explicit output path; overrides `outputdir`/`filename`
	:return: the tuple `(result, out_fname)` where `result` is whatever
		`extractor.extract` returned
	:raises: any exception raised by the extractor or by `IO.write_iob_file`
		propagates unchanged (with its original traceback)
	"""
	# this is the important bit which performs the citation extraction
	import sys
	import os
	from citation_extractor.eval import IO

	if outfilename is None:
		out_fname = '%s%s' % (outputdir, os.path.split(filename)[1])
	else:
		out_fname = outfilename

	# filter once so that tokens and PoS tags are guaranteed to stay aligned
	non_empty = [instance for instance in iob_sentences if len(instance) > 0]
	postags = [[("z_POS", token[1]) for token in instance] for instance in non_empty]
	instances = [[token[0] for token in instance] for instance in non_empty]
	result = extractor.extract(instances, postags)
	output = [
		[(d_res["token"], postags[i][n][1], d_res["label"]) for n, d_res in enumerate(res)]
		for i, res in enumerate(result)
	]
	IO.write_iob_file(output, out_fname)
	sys.stderr.write("Output successfully written to file \"%s\"\n" % out_fname)
	return result, out_fname
예제 #3
0
    def test_improvement(pre_settings, post_settings):
        """
        Build one extractor per settings object, evaluate both against
        `post_settings.TEST_DIR` and print a report: the sizes of the two train
        sets and of the test set, the scores of each extractor and the
        difference between them. The output of both extractors is finally
        written as IOB files into `post_settings.OUT_DIR`.

        TODO: what this function should do:
        1. run without selected candidates in the train set and evaluate
        2. run with selected candidates in the train set and evaluate
        3. return: stats for the 1st run, stats for the 2nd run and improvement obtained
        (for the time being nothing is returned, the figures are only printed)
        """
        from citation_extractor.core import citation_extractor
        from citation_extractor.eval import SimpleEvaluator
        from citation_extractor.Utils import aph_corpus
        from citation_extractor.Utils import IO

        def percentages(stats):
            # f-score, precision and recall as percentages, in printing order
            return (stats["f-score"] * 100, stats["precision"] * 100,
                    stats["recall"] * 100)

        # one extractor without and one with the selected candidates
        pre_extractor = citation_extractor(pre_settings)
        post_extractor = citation_extractor(post_settings)
        # both are evaluated against the same test set
        se = SimpleEvaluator([pre_extractor, post_extractor],
                             post_settings.TEST_DIR)
        results = se.eval()

        print("***data***")
        print("pre-active learning TRAIN-SET: %s" % str(pre_settings.DATA_DIRS))
        pre_train = aph_corpus.get_collection_details(
            pre_settings.TRAIN_COLLECTIONS)
        print("pre-active learning TRAIN-SET: # tokens = %i; # NEs = %i" % (
            pre_train['total_token_count'], pre_train['ne_token_count']))
        post_train = aph_corpus.get_collection_details(
            post_settings.TRAIN_COLLECTIONS)
        print("post-active learning TRAIN-SET: %s" % str(
            post_settings.DATA_DIRS))
        print("post-active learning TRAIN-SET: # tokens = %i; # NEs = %i" % (
            post_train['total_token_count'], post_train['ne_token_count']))
        test_details = aph_corpus.get_collection_details(
            post_settings.TEST_COLLECTIONS)
        print("TEST-SET: %s" % str(post_settings.TEST_DIR))
        print("TEST-SET details: # tokens = %i; # NEs = %i\n" % (
            test_details['total_token_count'], test_details['ne_token_count']))

        print("*** pre-active learning ***")
        pre_scores = percentages(results[str(pre_extractor)][0])
        print("fscore: %f \nprecision: %f\nrecall: %f\n" % pre_scores)

        print("*** post-active learning ***")
        post_scores = percentages(results[str(post_extractor)][0])
        print("fscore: %f \nprecision: %f\nrecall: %f\n" % post_scores)

        print("*** post-active learning gain (%) ***")
        print("fscore: %f \nprecision: %f\nrecall: %f\n" % (
            post_scores[0] - pre_scores[0],
            post_scores[1] - pre_scores[1],
            post_scores[2] - pre_scores[2]))

        out_dir = post_settings.OUT_DIR
        IO.write_iob_file(se.output[str(pre_extractor)],
                          "%spre_out.data" % out_dir)
        IO.write_iob_file(se.output[str(post_extractor)],
                          "%spost_out.data" % out_dir)
예제 #4
0
def preproc_document(doc_id,
                     inp_dir,
                     interm_dir,
                     out_dir,
                     abbreviations,
                     taggers,
                     split_sentences=True):
    """
    Pre-process one document: optional sentence splitting, language
    detection, PoS tagging and tokenisation; the result is written as IOB.

    The directory arguments are concatenated with `doc_id` as they are, so
    they have to end with a path separator. Any exception is logged and
    swallowed; the values that were not computed before the failure stay
    `np.nan`.

    :param doc_id: the input filename
    :param inp_dir: the input directory
    :param interm_dir: the directory where to store intermediate outputs
    :param out_dir: the directory where to store the PoS-tagged and tokenised text
    :param abbreviations: passed on to `recover_segmentation_errors`
    :param taggers: the dictionary returned by `get_taggers`
    :param split_sentences: (boolean) whether to split text into sentences or not.
                            If `False`, text is split on newline characters `\\n`.

    Returns:

    language, number of sentences, number of tokens

    """
    lang, no_sentences, no_tokens = np.nan, np.nan, np.nan
    try:
        intermediate_out_file = "%s%s" % (interm_dir, doc_id)
        iob_out_file = "%s%s" % (out_dir, doc_id)
        # context managers make sure the handles are closed (and the output
        # flushed) even when one of the later steps raises
        with codecs.open("%s%s" % (inp_dir, doc_id), 'r', 'utf-8') as inp_file:
            text = inp_file.read()
        if split_sentences:
            intermediate_text = sentencebreaks_to_newlines(text)
            text = recover_segmentation_errors(intermediate_text,
                                               abbreviations,
                                               verbose=False)
        else:
            logger.info("Document %s: skipping sentence splitting" % doc_id)
        sentences = text.split('\n')
        logger.info("Document \"%s\" has %i sentences" %
                    (doc_id, len(sentences)))
        with codecs.open(intermediate_out_file, 'w', 'utf-8') as interm_file:
            interm_file.write(text)
        logger.info("Written intermediate output to %s" %
                    intermediate_out_file)
        # note: the language is detected on the (possibly re-segmented) text
        lang = detect_language(text)
        logger.info("Language detected=\"%s\"" % lang)
        tagged_sentences = taggers[lang].tag_sents(sentences)
        tokenised_text = [list(line) for line in tagged_sentences]
        IO.write_iob_file(tokenised_text, iob_out_file)
        logger.info("Written IOB output to %s" % iob_out_file)
        no_sentences = len(sentences)
        no_tokens = IO.count_tokens(tokenised_text)
    except Exception as e:
        logger.error(
            "The pre-processing of document %s (lang=\'%s\') failed with error \"%s\""
            % (doc_id, lang, e))
    # hand back what the docstring promises; on failure the placeholders are returned
    return lang, no_sentences, no_tokens


def main():
    """
    Command line entry point: update an IOB file with stand-off entities.

    Reads the IOB file given as `input`, reads the stand-off annotation for
    the same base name from `--standoff-dir`, passes both to `update` and
    writes the result to `<output-dir><base name>.txt`. `--output-dir` is
    used as a plain prefix, so it has to end with a path separator.

    Progress and errors are reported on stderr. A failure while writing the
    output is reported but not re-raised.
    """
    import argparse
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("input", type=str, help="IOB input file")
    parser.add_argument("--standoff-dir",
                        help="Stand-off directory",
                        type=str,
                        required=True)
    parser.add_argument("--output-dir",
                        help="IOB output directory",
                        type=str,
                        required=True)
    args = parser.parse_args()

    sys.stderr.write("IOB Input: %s\n" % args.input)
    sys.stderr.write("Stand-off input folder:  %s\n" % args.standoff_dir)
    sys.stderr.write("IOB output dir: %s\n" % args.output_dir)

    # base name of the input, cut at the first dot
    fname = os.path.split(args.input)[1].split(".")[0]

    # read the corresponding stand-off annotation
    # NOTE(review): the helper is handed the `.txt` name; presumably it
    # derives the `.ann` path itself -- confirm against read_ann_file
    so_entities, so_relations, so_annotations = read_ann_file(
        "%s.txt" % fname, args.standoff_dir)

    # extract for each token the start and end
    sentences = process(args.input)
    token_start_end = get_start_end(sentences)

    # read IOB from file
    iob_data = IO.file_to_instances(args.input)
    # make sure that data is consistent: same number of tokens per sentence
    iob_lengths = [len(sentence) for sentence in iob_data]
    offset_lengths = [len(sentence) for sentence in token_start_end]
    assert iob_lengths == offset_lengths, \
        "IOB data and token offsets have different sentence lengths"

    # reorder the first two fields and turn the two offsets into integers
    so_entities = [(entity[1], entity[0], int(entity[2]), int(entity[3]))
                   for entity in so_entities.values()]
    updated_iob_instances = update(token_start_end, iob_data, so_entities)

    # built outside the `try` so that the error message can always refer to it
    destination = "%s%s.txt" % (args.output_dir, fname)
    try:
        IO.write_iob_file(updated_iob_instances, destination)
        sys.stderr.write("IOB output written to \'%s\'\n" % destination)
    except Exception as e:
        sys.stderr.write(
            "Writing ouput to \'%s\' failed with error \'%s\'\n" %
            (destination, e))
예제 #6
0
def do_ner(doc_id,inp_dir,interm_dir,out_dir,extractor,so2iob_script):
	"""
	Run the extractor on one IOB document and convert the outcome to stand-off.

	The document `inp_dir + doc_id` is loaded, its empty sentences are skipped,
	the labelled tokens are written to `interm_dir + doc_id` and that file is
	handed to `tostandoff` together with `out_dir` and `so2iob_script`.

	Returns `(doc_id, True)` on success; any exception is logged and turned
	into `(doc_id, False)`.
	"""
	from citation_extractor.Utils import IO
	try:
		sentences = [sent for sent in IO.file_to_instances("%s%s"%(inp_dir,doc_id)) if len(sent)>0]
		# second field of each token is fed to the extractor as "z_POS" feature
		postags = [[("z_POS",tok[1]) for tok in sent] for sent in sentences]
		instances = [[tok[0] for tok in sent] for sent in sentences]
		result = extractor.extract(instances,postags)
		# rebuild (token, PoS tag, label) rows, sentence by sentence
		output = []
		for i,res in enumerate(result):
			sentence_rows = []
			for n,d_res in enumerate(res):
				sentence_rows.append((d_res["token"].decode('utf-8'), postags[i][n][1], d_res["label"]))
			output.append(sentence_rows)
		out_fname = "%s%s"%(interm_dir,doc_id)
		IO.write_iob_file(output,out_fname)
		logger.info("Output successfully written to file \"%s\""%out_fname)
		tostandoff(out_fname,out_dir,so2iob_script)
	except Exception as e:
		logger.error("The NER of document %s failed with error \"%s\""%(doc_id,e))
		return (doc_id,False)
	return (doc_id,True)
예제 #7
0
	def test_improvement(pre_settings,post_settings):
		"""
		Evaluate an extractor built from `pre_settings` and one built from
		`post_settings` against `post_settings.TEST_DIR`, print the size of the
		data sets, the scores of both extractors and the gain, then write the
		output of both extractors as IOB files into `post_settings.OUT_DIR`.

		TODO: what this function should do:
		1. run without selected candidates in the train set and evaluate
		2. run with selected candidates in the train set and evaluate
		3. return: stats for the 1st run, stats for the 2nd run and improvement obtained
		(for the time being nothing is returned, the figures are only printed)
		"""
		from citation_extractor.core import citation_extractor
		from citation_extractor.eval import SimpleEvaluator
		from citation_extractor.Utils import aph_corpus
		from citation_extractor.Utils import IO
		# the three figures reported for every run, in printing order
		metrics = ("f-score","precision","recall")
		# extractor without selected candidates in the train set
		pre_extractor = citation_extractor(pre_settings)
		# extractor with selected candidates in the train set
		post_extractor = citation_extractor(post_settings)
		# initialise evaluator and evaluate against the test set
		se = SimpleEvaluator([pre_extractor,post_extractor],post_settings.TEST_DIR)
		results = se.eval()
		print("***data***")
		print("pre-active learning TRAIN-SET: %s"%str(pre_settings.DATA_DIRS))
		details = aph_corpus.get_collection_details(pre_settings.TRAIN_COLLECTIONS)
		print("pre-active learning TRAIN-SET: # tokens = %i; # NEs = %i"%(details['total_token_count'],details['ne_token_count']))
		details = aph_corpus.get_collection_details(post_settings.TRAIN_COLLECTIONS)
		print("post-active learning TRAIN-SET: %s"%str(post_settings.DATA_DIRS))
		print("post-active learning TRAIN-SET: # tokens = %i; # NEs = %i"%(details['total_token_count'],details['ne_token_count']))
		details = aph_corpus.get_collection_details(post_settings.TEST_COLLECTIONS)
		print("TEST-SET: %s"%str(post_settings.TEST_DIR))
		print("TEST-SET details: # tokens = %i; # NEs = %i\n"%(details['total_token_count'],details['ne_token_count']))
		print("*** pre-active learning ***")
		pre_al_results = results[str(pre_extractor)][0]
		pre_scores = tuple([pre_al_results[metric]*100 for metric in metrics])
		print("fscore: %f \nprecision: %f\nrecall: %f\n"%pre_scores)
		print("*** post-active learning ***")
		post_al_results = results[str(post_extractor)][0]
		post_scores = tuple([post_al_results[metric]*100 for metric in metrics])
		print("fscore: %f \nprecision: %f\nrecall: %f\n"%post_scores)
		print("*** post-active learning gain (%) ***")
		gains = tuple([after - before for after,before in zip(post_scores,pre_scores)])
		print("fscore: %f \nprecision: %f\nrecall: %f\n"%gains)
		IO.write_iob_file(se.output[str(pre_extractor)],"%spre_out.data"%post_settings.OUT_DIR)
		IO.write_iob_file(se.output[str(post_extractor)],"%spost_out.data"%post_settings.OUT_DIR)
예제 #8
0
def do_ner(doc_id, inp_dir, interm_dir, out_dir, extractor, so2iob_script):
    """
    Extract entities from one IOB document and export them as stand-off.

    Loads `inp_dir + doc_id`, ignores empty sentences, runs
    `extractor.extract`, writes the labelled tokens to
    `interm_dir + doc_id` and finally calls `tostandoff` on that file.

    :return: `(doc_id, True)` when every step succeeded, `(doc_id, False)`
             when an exception was raised (the exception is logged, not
             propagated).
    """
    from citation_extractor.Utils import IO

    def label_rows(position, labelled_sentence):
        # one (token, PoS tag, label) row per token of the sentence
        tags = postags[position]
        return [(item["token"].decode('utf-8'), tags[n][1], item["label"])
                for n, item in enumerate(labelled_sentence)]

    try:
        loaded = IO.file_to_instances("%s%s" % (inp_dir, doc_id))
        non_empty = [sentence for sentence in loaded if len(sentence) > 0]
        # the second field of a token becomes its "z_POS" feature
        postags = [[("z_POS", token[1]) for token in sentence]
                   for sentence in non_empty]
        instances = [[token[0] for token in sentence]
                     for sentence in non_empty]
        result = extractor.extract(instances, postags)
        output = [label_rows(i, res) for i, res in enumerate(result)]
        out_fname = "%s%s" % (interm_dir, doc_id)
        IO.write_iob_file(output, out_fname)
        logger.info("Output successfully written to file \"%s\"" % out_fname)
        tostandoff(out_fname, out_dir, so2iob_script)
        outcome = (doc_id, True)
    except Exception as e:
        logger.error("The NER of document %s failed with error \"%s\"" %
                     (doc_id, e))
        outcome = (doc_id, False)
    return outcome
예제 #9
0
    def run(self):
        """
        Evaluate every configured extractor on every fold of the data.

        For each item yielded by `self.dataSets_iterator` a train and a test
        set are assembled and written to `fold_<n>.train` / `fold_<n>.test`
        under `self.evaluation_dir`. Each entry of `self.extractors`
        (a `(name, settings)` pair) is then trained on the train file and
        evaluated on the test file with a `SimpleEvaluator`.

        Returns:
            a tuple `(results, results_by_entity)`; both are dictionaries
            keyed by `"iter-<n>"` and then by extractor name. The first one
            holds element 0 of the evaluation outcome, the second one what
            `SimpleEvaluator.calc_stats_by_entity` computes from element 1.
        """
        iterations = []
        results = {}
        results_by_entity = {}
        # first let's create test and train set for each iteration
        for x, fold in enumerate(self.dataSets_iterator):
            self.logger.info("Iteration %i" % (x + 1))
            train_set = []
            test_set = []
            for y, partition in enumerate(fold):
                # the first partition feeds the train set, any other the test set
                target = train_set if y == 0 else test_set
                for group in partition:
                    target.extend(group)
            iterations.append((train_set, test_set))

        # let's go through all the iterations
        for i, (train_set, test_set) in enumerate(iterations):
            iteration_key = "iter-%i" % (i + 1)
            results[iteration_key] = {}
            results_by_entity[iteration_key] = {}
            train_file = "%sfold_%i.train" % (self.evaluation_dir, i + 1)
            test_file = "%sfold_%i.test" % (self.evaluation_dir, i + 1)
            IO.write_iob_file(train_set, train_file)
            IO.write_iob_file(test_set, test_file)
            # the following is a bit of a workaround:
            # to avoid recomputing the features when training
            # each new classifier, I take them from the file created
            # to train the CRF model (which should always be the first extractor
            # to be evaluated).
            # NOTE(review): the file is read before any extractor is built for
            # this fold inside this method -- presumably it already exists at
            # this point; confirm.
            filename = "%sfold_%i.train.train" % (
                self.extractors[0][1].TEMP_DIR, (i + 1))
            with codecs.open(filename, 'r', 'utf-8') as f:
                data = f.read()
            # instances are separated by a blank line, tokens by a newline and
            # fields by a tab; the last field is the label, the others are
            # the features
            feature_sets = []
            for instance in data.split('\n\n'):
                rows = [token.split('\t') for token in instance.split('\n')]
                feature_sets.append([[fields[:-1], fields[-1:]]
                                     for fields in rows])
            order = FeatureExtractor().get_feature_order()
            labelled_feature_sets = []
            for instance in feature_sets:
                for features, label in instance:
                    named_features = {
                        order[n]: feature
                        for n, feature in enumerate(features)
                    }
                    labelled_feature_sets.append([named_features, label[0]])
            self.logger.info("read %i labelled instances" % len(feature_sets))
            for entry in self.extractors:
                extractor_name = entry[0]
                extractor_settings = entry[1]
                self.logger.info("Running iteration #%i with extractor %s" %
                                 (i + 1, extractor_name))
                self.logger.info(train_file)
                self.logger.info(test_file)
                self.logger.info(extractor_settings)
                extractor_settings.DATA_FILE = train_file
                if extractor_settings.CLASSIFIER is not None:
                    extractor = citation_extractor(
                        extractor_settings, extractor_settings.CLASSIFIER,
                        labelled_feature_sets)
                else:
                    extractor = citation_extractor(extractor_settings)
                self.logger.info(extractor.classifier)
                se = SimpleEvaluator([(extractor_name, extractor)],
                                     iob_file=test_file)
                # evaluate once and reuse the outcome for both result tables
                evaluation = se.eval()[extractor_name]
                results[iteration_key][extractor_name] = evaluation[0]
                results_by_entity[iteration_key][extractor_name] = (
                    SimpleEvaluator.calc_stats_by_entity(evaluation[1]))
        return results, results_by_entity
예제 #10
0
    def run(self):
        """
        Train and evaluate each extractor of `self.extractors` on each fold.

        Every item of `self.dataSets_iterator` is turned into a train set and
        a test set, which are written to `fold_<n>.train` and `fold_<n>.test`
        under `self.evaluation_dir`. Each `(name, settings)` entry of
        `self.extractors` is then trained on the train file and evaluated on
        the test file through a `SimpleEvaluator`.

        Returns:
            `(results, results_by_entity)`: two dictionaries keyed by
            `"iter-<n>"` and then by extractor name. The first holds element
            0 of the evaluation outcome, the second the output of
            `SimpleEvaluator.calc_stats_by_entity` applied to element 1.
        """
        iterations = []
        results = {}
        results_by_entity = {}
        # first let's create test and train set for each iteration
        for x, fold in enumerate(self.dataSets_iterator):
            self.logger.info("Iteration %i"%(x+1))
            train_set = []
            test_set = []
            for y, partition in enumerate(fold):
                # partition 0 is training material, everything else is test material
                destination = train_set if y == 0 else test_set
                for group in partition:
                    destination.extend(group)
            iterations.append((train_set, test_set))

        # let's go through all the iterations
        for i, (train_set, test_set) in enumerate(iterations):
            fold_no = i + 1
            fold_key = "iter-%i"%fold_no
            results[fold_key] = {}
            results_by_entity[fold_key] = {}
            train_file = "%sfold_%i.train"%(self.evaluation_dir, fold_no)
            test_file = "%sfold_%i.test"%(self.evaluation_dir, fold_no)
            IO.write_iob_file(train_set, train_file)
            IO.write_iob_file(test_set, test_file)
            # the following is a bit of a workaround:
            # to avoid recomputing the features when training
            # each new classifier, I take them from the file created
            # to train the CRF model (which should always be the first extractor
            # to be evaluated).
            # NOTE(review): the file is read before any extractor is built for
            # this fold inside this method -- presumably it already exists at
            # this point; confirm.
            filename = "%sfold_%i.train.train"%(self.extractors[0][1].TEMP_DIR, fold_no)
            with codecs.open(filename, 'r', 'utf-8') as f:
                data = f.read()
            # blank line = instance separator, newline = token separator,
            # tab = field separator; the last field of a token is its label
            feature_sets = []
            for instance in data.split('\n\n'):
                parsed_tokens = []
                for token in instance.split('\n'):
                    fields = token.split('\t')
                    parsed_tokens.append([fields[:-1], fields[-1:]])
                feature_sets.append(parsed_tokens)
            order = FeatureExtractor().get_feature_order()
            labelled_feature_sets = []
            for instance in feature_sets:
                for token in instance:
                    temp = [{order[n]: feature for n, feature in enumerate(token[0])}, token[1][0]]
                    labelled_feature_sets.append(temp)
            self.logger.info("read %i labelled instances"%len(feature_sets))
            for entry in self.extractors:
                extractor_name = entry[0]
                extractor_settings = entry[1]
                self.logger.info("Running iteration #%i with extractor %s"%(fold_no, extractor_name))
                self.logger.info(train_file)
                self.logger.info(test_file)
                self.logger.info(extractor_settings)
                extractor_settings.DATA_FILE = train_file
                if extractor_settings.CLASSIFIER is not None:
                    extractor = citation_extractor(extractor_settings, extractor_settings.CLASSIFIER, labelled_feature_sets)
                else:
                    extractor = citation_extractor(extractor_settings)
                self.logger.info(extractor.classifier)
                se = SimpleEvaluator([(extractor_name, extractor),], iob_file=test_file)
                # a single evaluation run serves both result tables
                outcome = se.eval()[extractor_name]
                results[fold_key][extractor_name] = outcome[0]
                results_by_entity[fold_key][extractor_name] = SimpleEvaluator.calc_stats_by_entity(outcome[1])
        return results, results_by_entity