def pos_tag(texts):
    """POS-tag a list of tokenized sentences with the Stanford POS tagger."""
    from nltk.tag.stanford import POSTagger

    # `config.mainpath` and `language` are module-level globals in the
    # surrounding project.
    jar = config.mainpath + "analyze/SPOS/stanford-postagger.jar"
    if language == "german":
        model = config.mainpath + "analyze/SPOS/models/german-fast.tagger"
    elif language == "english":
        model = config.mainpath + "analyze/SPOS/models/english-bidirectional-distsim.tagger"
    tagger = POSTagger(model, path_to_jar=jar, encoding="UTF-8")

    return tagger.tag_sents(texts)
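
A minimal usage sketch, assuming `config.mainpath` and `language` are set at module level and the tagger files exist at the paths above (recent NLTK releases renamed `POSTagger` to `StanfordPOSTagger`):

# Hypothetical call; tag_sents expects pre-tokenized sentences.
texts = [["This", "is", "a", "test", "."],
         ["It", "tags", "each", "sentence", "."]]
tagged = pos_tag(texts)
# -> [[('This', 'DT'), ('is', 'VBZ'), ...], ...]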
Example #2
 def add_POS(self, row_file, target):
     # Requires `re` and `from nltk.tag.stanford import POSTagger` at module level.
     '''
     row_str = '';
     f = open(row_file,'rb');
     for row in f:
         row_str+=row;
     soup = BeautifulSoup(row_str);
     self.soup = soup;
     sentences = soup.find_all('sentence');
     all_token = list();
     for block in sentences:
         text = block.text.strip();
         text_token = self.tf.stanford_tokenize(text);
         all_token.append(text_token);
     '''
     all_token = self.get_token(target)
     stanford_tagger = \
     POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger','../stanford-postagger-full-2015-01-30/stanford-postagger.jar')
     tag_list = list()
     for row in all_token:
         temp_list = list()
         for word in row:
             if len(word) > 1 and re.match(r'^[A-Z]+', word):
                 temp_list.append(word.lower())
             else:
                 temp_list.append(word)
         tag_list.append(temp_list)
     #end for
     tagged_result = stanford_tagger.tag_sents(tag_list)
     '''
     for row in tagged_result:
         index_list = list();
         for num,item in enumerate(row):
             if not re.match(r'.*[\w\d]+',item[0]):
                 index_list.append(num);
         for i in index_list:
             row[i]=(row[i][0],row[i][0]);
     #end for
     '''
     w = open('pos_%s' % target, 'w')
     for num1, row in enumerate(tagged_result):
         for num2, item in enumerate(row):
             w.write(all_token[num1][num2] + ' ' + item[1] + '\n')
         w.write('\n')
     w.close()
     #print tagged_result
     return
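
The loop above lowercases multi-character tokens that begin with an uppercase letter before tagging. A quick illustration of that rule on toy tokens:

import re
tokens = ['The', 'NLP', 'tagger', 'I', 'use']
normed = [w.lower() if len(w) > 1 and re.match(r'^[A-Z]+', w) else w
          for w in tokens]
# -> ['the', 'nlp', 'tagger', 'I', 'use']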
Example #3
 def generate_pos_set(self):
     # Requires `from nltk.tag.stanford import POSTagger` at module level.
     print('Building the positive-set dictionary....')
     pos_dict = dict()
     pos_set = set()
     sentences = list()
     for row in self.train_label:
         for key in row:
             if ' ' in key:
                 sentences.append(self.tk.word_tokenize(key))
             else:
                 pos_dict[key] = pos_dict.setdefault(key, 0) + 1
                 #pos_set.add(key);
     #end for
     st = POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger',
                    '../stanford-postagger-full-2015-01-30/stanford-postagger.jar')
     result = st.tag_sents(sentences)
     for row in result:
         for item in row:
             if item[1].startswith('NN'):
                 pos_dict[item[0]] = pos_dict.setdefault(item[0], 0) + 1
                 #pos_set.add(item[0]);
     #end for
     neg_dict = dict()
     for num, row in enumerate(self.tagged_train_data):
         for item in row:
             if item[1].startswith('NN') and item[0] not in self.train_word_label[num]:
                 neg_dict[item[0]] = neg_dict.setdefault(item[0], 0) + 1
     for key in pos_dict.keys():
         if pos_dict[key] > 1:
             if key in neg_dict:  # has_key() is Python 2 only
                 if neg_dict[key] / pos_dict[key] < 2:
                     pos_set.add(key)
             else:
                 pos_set.add(key)
     self.pos_set = pos_set
     print('Done!')
     return
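
The final loop is a simple precision filter: a term enters pos_set only if it was counted more than once on the positive side and its negative count stays below twice its positive count. A toy illustration of the same rule, with hypothetical counts:

pos_dict = {'battery': 3, 'screen': 2, 'price': 1}
neg_dict = {'battery': 4, 'screen': 7}
pos_set = {k for k, v in pos_dict.items()
           if v > 1 and neg_dict.get(k, 0) / v < 2}
# 'battery' is kept (4/3 < 2); 'screen' is dropped (7/2 >= 2);
# 'price' is dropped (counted only once).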
Example #4
# Requires: os, nltk, and `from nltk.tag.stanford import POSTagger`;
# VoidSelector comes from LEXenstein's selectors module.
class POSTagSelector:

	def __init__(self, pos_model, stanford_tagger, java_path):
		"""
		Creates a POSTagSelector instance.
	
		@param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
		The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param stanford_tagger: Path to the "stanford-postagger.jar" file.
		The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param java_path: Path to the system's "java" executable.
		It can commonly be found at "/usr/bin/java" on Unix/Linux systems, or at "C:/Program Files/Java/jdk_version/java.exe" on Windows systems.
		"""
		os.environ['JAVAHOME'] = java_path
		self.tagger = POSTagger(pos_model, stanford_tagger)

	def selectCandidates(self, substitutions, victor_corpus):
		"""
		Selects which candidates can replace the target complex words in each instance of a VICTOR corpus.
	
		@param substitutions: Candidate substitutions to be filtered.
		It can be in two formats:
		A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions.
		Example: substitutions['perched'] = {'sat', 'roosted'}
		A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector.
		Example: [['sat', 'roosted'], ['easy', 'uncomplicated']]
		@param victor_corpus: Path to a corpus in the VICTOR format.
		For more information about the file's format, refer to the LEXenstein Manual.
		@return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus.
		"""
		selected_substitutions = []

		substitution_candidates = []
		if isinstance(substitutions, list):
			substitution_candidates = substitutions
		elif isinstance(substitutions, dict):
			void = VoidSelector()
			substitution_candidates = void.selectCandidates(substitutions, victor_corpus)
		else:
			print('ERROR: Substitutions are neither a dictionary nor a list!')
			return selected_substitutions
		
		#Read VICTOR corpus:
		lexf = open(victor_corpus)
		sents = []
		targets = []
		heads = []
		words = set([])
		c = -1
		for line in lexf:
			c += 1
			data = line.strip().split('\t')
			sent = data[0].strip().split(' ')
			target = data[1].strip()
			head = int(data[2].strip())
			sents.append(sent)
			targets.append(target)
			heads.append(head)
			words.update(set(substitution_candidates[c]))
		lexf.close()
		
		#Tag sentences:
		tagged_sents = self.tagger.tag_sents(sents)
		
		#Tag words:
		words = list(words)
		words_sents = [[w] for w in words]
		tagged_words = self.tagger.tag_sents(words_sents)
		word_to_tag = {}
		for i in range(0, len(words)):
			word_to_tag[words[i]] = tagged_words[i][0][1]
		
		for i in range(0, len(sents)):
			target = targets[i]
			head = heads[i]
			target_pos = str(tagged_sents[i][head][1])
		
			candidates = set(substitution_candidates[i])
			candidates = self.getCandidatesWithSamePOS(candidates, word_to_tag, target_pos)
		
			selected_substitutions.append(candidates)
		return selected_substitutions
	
	def getTargetPOS(self, sent, target, head):
		pos_data = []
		try:
			pos_data = nltk.pos_tag(sent)
			return pos_data[head][1]
		except UnicodeDecodeError:
			try:
				pos_data = nltk.pos_tag([target])  # pos_tag expects a token list, not a raw string
				return pos_data[0][1]
			except UnicodeDecodeError:
				return 'None'
			
		
	def getCandidatesWithSamePOS(self, candidates, word_to_tag, target_pos):
		result = set([])
		for candidate in candidates:
			if candidate in word_to_tag:
				ctag = word_to_tag[candidate]
				if ctag==target_pos:
					result.add(candidate)
		return result
	
	def toVictorFormat(self, victor_corpus, substitutions, output_path, addTargetAsCandidate=False):
		"""
		Saves a set of selected substitutions in a file in VICTOR format.
	
		@param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected.
		@param substitutions: The vector of substitutions selected for the VICTOR corpus.
		@param output_path: The path in which to save the resulting VICTOR corpus.
		@param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution.
		"""
		o = open(output_path, 'w')
		f = open(victor_corpus)
		for subs in substitutions:
			data = f.readline().strip().split('\t')
			sentence = data[0].strip()
			target = data[1].strip()
			head = data[2].strip()
			
			newline = sentence + '\t' + target + '\t' + head + '\t'
			for sub in subs:
				newline += '0:' + sub + '\t'
			o.write(newline.strip() + '\n')
		f.close()
		o.close()
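
A minimal usage sketch for the selector, with hypothetical paths and a candidate list (the list form skips the VoidSelector dependency):

# Hypothetical paths and data, for illustration only.
selector = POSTagSelector('models/english-bidirectional-distsim.tagger',
                          'stanford-postagger.jar',
                          '/usr/bin/java')
candidates = [['sat', 'roosted'], ['easy', 'uncomplicated']]  # one list per corpus instance
selected = selector.selectCandidates(candidates, 'dataset.victor')
selector.toVictorFormat('dataset.victor', selected, 'selected.victor')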