Example no. 1
def reformat_iob(input_fname, output_fname, lang_code):
	"""
	TODO
		* this should go into the Utils module
		* add support for abbreviation file for treetagger, to pass with -a param from cli
		
	Utility function. Reformat an existing IOB file applying a tokenisation based on punctuation instead of white spaces.
	The IOB tags get transferred to the newly created tokens.
	
	Args:
		input_fname:
			a string, being the path to the input file
		output_fname:
			a string, being the path to the output file
		lang_code:
			the language of the file content, important for tokenisation and POS
	"""
	from citation_extractor.Utils import IO
	import re
	import codecs
	result = []
	# read the IOB file and parse it into sentences, i.e. lists of (token, IOB label) tuples
	with codecs.open(input_fname, "r", "utf-8") as infile:
		data = infile.read()
	sentences = IO.read_instances(data)
	# rebuild each sentence as plain text by joining the token surfaces with spaces
	plain_sentences = [" ".join([t[0] for t in s]) for s in sentences]
	for n,sent in enumerate(sentences):
		new_sent = []
		# re-tokenise and POS-tag the plain-text version of the sentence
		wt_sent = tokenise_and_tag(plain_sentences[n], lang_code)
		read = 0 # pointer used to keep the two token streams (original vs. re-tokenised) in sync
		prev_tok = ""
		for tok in wt_sent:
			# make sure the token surface is unicode (the tokeniser may return byte strings)
			if not isinstance(tok[0], unicode):
				try:
					token = tok[0].decode('utf-8')
				except UnicodeDecodeError:
					token = tok[0].decode('latin-1')
			else:
				token = tok[0]
			# take the POS tag from the second field, falling back to the third one when it is empty
			pos_tag = tok[1] if tok[1] != '' else tok[2]
				
			if token == sent[read][0]: # the two tokens are identical
				new_sent.append((tok[0], pos_tag, sent[read][1]))
				read += 1
			elif "%s%s" % (prev_tok, token) == sent[read][0]: # previous + current token together equal the token in the other stream
				label = sent[read][1]
				if re.match(r"B-", sent[read][1]) is not None:
					label = re.sub(r"B-", "I-", sent[read][1])
				new_sent.append((tok[0], pos_tag, label))
				read += 1
			elif token in sent[read][0]: # TODO: the re-tokenised token is only a part of the original token (it was split by the tokeniser)
				if re.match("^%s.*" % re.escape(tok[0]), sent[read][0]):
					new_sent.append((tok[0], pos_tag, sent[read][1]))
				else:
					label = sent[read][1]
					if re.match(r"B-", sent[read][1]) is not None:
						label = re.sub(r"B-", "I-", sent[read][1])
					new_sent.append((tok[0], pos_tag, label))
			else:
				# none of the above matched: move to the next token in the original stream and take its label
				read += 1
				new_sent.append((tok[0], pos_tag, sent[read][1]))
			prev_tok = token # remember the current token so that the merge check above can handle split tokens
		result.append(new_sent)
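
A minimal call sketch, not part of the original source: the file names and the language code below are hypothetical placeholders, and the call assumes that reformat_iob, tokenise_and_tag and the citation_extractor package are importable from the surrounding module.

if __name__ == "__main__":
	# hypothetical input/output paths and language code, shown for illustration only
	reformat_iob("goldset_whitespace.iob", "goldset_punctuation.iob", "la")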