Example no. 1
	def learn(self):
		"""
		What the function does:
			* read dev-set
			* for file in dev-set:
			 * for instance in file:
				* res = extract(instance)
				* for tok in res:
					* cand = Candidate(res) # more complex than this, actually
					* if(is_effective_candidate(cand)):
						* self.candidates.append(cand)
		"""
		import glob
		import os
		import operator
		from citation_extractor.Utils import IO
		
		for infile in glob.glob(os.path.join(self.dev_set, '*.iob')):
			instances = IO.file_to_instances(infile)
			string_instances = [[tok[0] for tok in i] for i in instances]
			results = self.classifier.extract([string_instances])
			for n, r in enumerate(results):
				for tok in r:
					# extract the probability of each tag for this token
					probs = [(tag, tok["probs"][tag]["prob"]) for tag in tok["probs"].keys()]
					probs.sort(key=lambda tup: tup[1], reverse=True)
					self.logger.debug(probs)
					# only the two most likely tags are kept
					cand = Candidate(tok["token"], "%s#%i" % (infile, n), probs[:2])
					if self.is_effective_candidate(cand):
						self.candidates.append(cand)
					self.token_count += 1
		self.candidates.sort(key=operator.attrgetter('ci_score'), reverse=True)
		return self.candidates
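
Candidate and is_effective_candidate are defined elsewhere in the same class/module and are not shown here. As orientation, a minimal stand-in that supports everything learn() relies on might look like the sketch below; the ci_score formula is an assumption for illustration, not the project's actual scoring:

	class Candidate(object):
		"""Hypothetical minimal stand-in for the real Candidate class."""
		def __init__(self, token, position, top_probs):
			self.token = token        # the token string
			self.position = position  # "<file>#<instance index>"
			self.probs = top_probs    # the two most likely (tag, prob) pairs
			# assumed score: gap between the two best tags; a small gap means
			# the classifier was uncertain about this token
			self.ci_score = top_probs[0][1] - top_probs[1][1]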
Example no. 2
 def __init__(self,
              extractors,
              iob_directories=[],
              iob_file=None,
              label_index=-1):
     """
     Args:
         extractors:
             the list of canonical citation extractors to evaluate
         iob_test_file: 
             the file in IOB format to be used for testing and evaluating the extactors
     """
     # read the test instances from a list of directories containing the test data
     import logging
     from citation_extractor.Utils import IO

     self.logger = logging.getLogger("CREX.SIMPLEVAL")
     if iob_file is None:
         self.logger.debug(iob_directories)
         data = []
         for directory in iob_directories:
             data += IO.read_iob_files(directory, ".txt")
         self.test_instances = data
     else:
         self.test_instances = IO.file_to_instances(iob_file)
     self.logger.debug("Found %i instances for test" %
                       len(self.test_instances))
     self.extractors = extractors
     self.output = {}
     self.error_matrix = None
     self.label_index = label_index
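
Note that iob_directories=[] is a mutable default argument; it is harmless here because the list is only iterated, never mutated, but iob_directories=None with a fallback in the body is the more defensive idiom. A hypothetical instantiation (the names SimpleEvaluator and crf_extractor are placeholders, since the snippet shows only __init__):

    # Hypothetical usage; SimpleEvaluator and crf_extractor are placeholders.
    evaluator = SimpleEvaluator(
        extractors=[("CRF", crf_extractor)],
        iob_directories=["testdata/batch1/", "testdata/batch2/"],
    )
    print "%i test instances loaded" % len(evaluator.test_instances)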
Example no. 3
    def tag_candidates(settings):
        import glob
        import os
        import codecs
        from citation_extractor.Utils import IO
        from citation_extractor.core import citation_extractor

        extractor = citation_extractor(settings)
        for infile in glob.glob(os.path.join(settings.CANDIDATES_DIR,
                                             '*.iob')):
            print "processing %s" % infile
            instances = IO.file_to_instances(infile)
            string_instances = [[tok[0] for tok in i] for i in instances]
            results = extractor.extract([string_instances])
            out_dir = settings.OUT_DIR
            out_fname = "%s%s" % (out_dir, os.path.basename(infile))
            # avoid shadowing the builtin `file`
            out_file = codecs.open(out_fname, 'w', encoding="utf-8")
            # serialise each result as tab-separated token/label pairs,
            # one block per instance, blank line between blocks
            instances = [
                "\n".join([
                    "%s\t%s" % (t["token"].decode("utf-8"), t["label"])
                    for t in r
                ]) for r in results
            ]
            out_file.write("\n\n".join(instances))
            out_file.close()
            print "output written to %s" % out_fname
Example no. 4

def main():
    import argparse
    import os
    import sys
    from citation_extractor.Utils import IO
    # read_ann_file, process, get_start_end and update are assumed to be
    # defined or imported at module level elsewhere in this script
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("input", type=str, help="IOB input file")
    parser.add_argument("--standoff-dir",
                        help="Stand-off directory",
                        type=str,
                        required=True)
    parser.add_argument("--output-dir",
                        help="IOB output file",
                        type=str,
                        required=True)
    args = parser.parse_args()

    print >> sys.stderr, "IOB Input:", args.input
    print >> sys.stderr, "Stand-off input folder: ", args.standoff_dir
    print >> sys.stderr, "IOB output dir:", args.output_dir

    fname = os.path.split(args.input)[1].split(".")[0]

    # read the corresponding .ann file with stand-off annotation
    so_entities, so_relations, so_annotations = read_ann_file(
        "%s.txt" % fname, args.standoff_dir)

    # extract for each token the start and end
    sentences = process(args.input)
    token_start_end = get_start_end(sentences)

    # read IOB from file
    iob_data = IO.file_to_instances(args.input)
    # make sure that the IOB data and the token offsets are consistent
    assert ([len(sentence) for sentence in iob_data] ==
            [len(sentence) for sentence in token_start_end])

    so_entities = [(so_entities[ent][1], so_entities[ent][0],
                    int(so_entities[ent][2]), int(so_entities[ent][3]))
                   for ent in so_entities.keys()]
    updated_iob_instances = update(token_start_end, iob_data, so_entities)
    try:
        destination = "%s%s.txt" % (args.output_dir, fname)
        IO.write_iob_file(updated_iob_instances, destination)
        print >> sys.stderr, "IOB output written to \'%s\'" % destination
    except Exception, e:
        print >> sys.stderr, "Writing ouput to \'%s\' failed with error \'%s\'" % (
            destination, e)
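
A hypothetical invocation (the script name and all paths are placeholders):

    # python merge_standoff.py corpus/document1.iob \
    #     --standoff-dir corpus/ann/ --output-dir corpus/iob-out/
    #
    # IOB Input: corpus/document1.iob
    # Stand-off input folder:  corpus/ann/
    # IOB output dir: corpus/iob-out/

Note that the destination is built with plain string concatenation ("%s%s.txt"), so --output-dir must end with a trailing slash.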
Example no. 5
def do_ner(doc_id, inp_dir, interm_dir, out_dir, extractor, so2iob_script):
	"""
	Run the extractor over a single document: read its IOB file from inp_dir,
	write the tagged IOB output to interm_dir, convert it to stand-off format
	into out_dir, and return a (doc_id, success) tuple.
	"""
	from citation_extractor.Utils import IO
	try:
		data = IO.file_to_instances("%s%s" % (inp_dir, doc_id))
		postags = [[("z_POS", token[1]) for token in instance] for instance in data if len(instance) > 0]
		instances = [[token[0] for token in instance] for instance in data if len(instance) > 0]
		result = extractor.extract(instances, postags)
		output = [[(tok["token"].decode('utf-8'), postags[i][n][1], tok["label"]) for n, tok in enumerate(res)] for i, res in enumerate(result)]
		out_fname = "%s%s" % (interm_dir, doc_id)
		IO.write_iob_file(output, out_fname)
		logger.info("Output successfully written to file \"%s\"" % out_fname)
		tostandoff(out_fname, out_dir, so2iob_script)
		return (doc_id, True)
	except Exception, e:
		logger.error("The NER of document %s failed with error \"%s\"" % (doc_id, e))
		return (doc_id, False)
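
Because do_ner reports success per document, a driver can map it over a whole corpus and collect the failures; a minimal sketch under that assumption (do_ner_batch is a hypothetical helper, not part of the module):

	def do_ner_batch(doc_ids, inp_dir, interm_dir, out_dir, extractor, so2iob_script):
		# run do_ner over each document and return the ids of those that failed
		results = [do_ner(doc_id, inp_dir, interm_dir, out_dir, extractor, so2iob_script) for doc_id in doc_ids]
		return [doc_id for doc_id, ok in results if not ok]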