def tagger(self):
        self.tokenize(self.taggerUse)
        if self.taggerUse == 'standford':
            tagger = StanfordPOSTagger('/home/jmutal/dataMining/stanford-postagger-full-2017-06-09/models/spanish-distsim.tagger',
                               '/home/jmutal/dataMining/stanford-postagger-full-2017-06-09/stanford-postagger-3.8.0.jar')
            tagged_sents = tagger.tag_sents(self.clean_corpus)
        else:
            tagged_sents = self.nlp.pipe(self.clean_corpus, n_threads=8)
            tagged_sents = self.proccess_spacy(tagged_sents)

        return self.dig2num(tagged_sents)
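A minimal standalone sketch of the same call pattern (the model and jar paths here are placeholders, not the ones used above): NLTK's StanfordPOSTagger.tag_sents takes a list of already-tokenized sentences and returns one list of (token, tag) pairs per sentence.

# Hedged sketch; replace the placeholder paths with a real model and jar.
from nltk.tag import StanfordPOSTagger

tagger = StanfordPOSTagger('path/to/spanish-distsim.tagger',   # placeholder model path
                           'path/to/stanford-postagger.jar')   # placeholder jar path
sentences = [['El', 'perro', 'duerme'], ['Hola', 'mundo']]
tagged = tagger.tag_sents(sentences)
# tagged is a list with one [(token, tag), ...] list per input sentence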
Example #2
    def compute_pos(self):
        path = self._get_pos_path()

        if not path.exists():
            logging.info("Computing POS tags from tweets...")
            from nltk.tag.stanford import StanfordPOSTagger
            from nltk.tokenize.casual import TweetTokenizer

            s_path = self.stanford_path

            stanford_tagger = StanfordPOSTagger(
                Path(s_path,
                     'models/english-left3words-distsim.tagger').as_posix(),
                Path(s_path, 'stanford-postagger.jar').as_posix())
            tokenizer = TweetTokenizer()

            tagged_tweets = stanford_tagger.tag_sents(
                [tokenizer.tokenize(text) for text in self.corpus['text']])

            for tagged_tweet in tagged_tweets:
                for token, tag in tagged_tweet:
                    if len(token) > 0:
                        self.token_tags.add((token, tag))

            with path.open('w') as f:
                for token, tag in self.token_tags:
                    f.write('%s\t%s\n' % (token, tag))

            self.token_tags = dict(self.token_tags)

            logging.info("Wrote %d unique pairs (word, pos_tag)" %
                         len(self.token_tags))
        else:
            logging.info(
                "POS tag file already exist. Loading into class instance...")
            with path.open() as f:
                tags = [line.split() for line in f.readlines()]
            self.token_tags = dict(tags)
            logging.info("POS tags loaded.")
Example #3
class POSTagSelector:

	def __init__(self, pos_model, stanford_tagger, java_path):
		"""
		Creates a POSTagSelector instance.
	
		@param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
		The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param stanford_tagger: Path to the "stanford-postagger.jar" file.
		The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param java_path: Path to the system's "java" executable.
		Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems.
		"""
		os.environ['JAVAHOME'] = java_path
		self.tagger = StanfordPOSTagger(pos_model, stanford_tagger)

	def selectCandidates(self, substitutions, victor_corpus):
		"""
		Selects which candidates can replace the target complex words in each instance of a VICTOR corpus.
	
		@param substitutions: Candidate substitutions to be filtered.
		It can be in two formats:
		A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions.
		Example: substitutions['perched'] = {'sat', 'roosted'}
		A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector.
		Example: [['sat', 'roosted'], ['easy', 'uncomplicated']]
		@param victor_corpus: Path to a corpus in the VICTOR format.
		For more information about the file's format, refer to the LEXenstein Manual.
		@return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus.
		"""
		selected_substitutions = []

		substitution_candidates = []
		if isinstance(substitutions, list):
			substitution_candidates = substitutions
		elif isinstance(substitutions, dict):
			void = VoidSelector()
			substitution_candidates = void.selectCandidates(substitutions, victor_corpus)
		else:
			print('ERROR: Substitutions are neither a dictionary nor a list!')
			return selected_substitutions
		
		#Read VICTOR corpus:
		lexf = open(victor_corpus)
		sents = []
		targets = []
		heads = []
		words = set([])
		c = -1
		for line in lexf:
			c += 1
			data = line.strip().split('\t')
			sent = data[0].strip().split(' ')
			target = data[1].strip()
			head = int(data[2].strip())
			sents.append(sent)
			targets.append(target)
			heads.append(head)
			words.update(set(substitution_candidates[c]))
		lexf.close()
		
		#Tag sentences:
		tagged_sents = self.tagger.tag_sents(sents)
		
		#Tag words:
		words = list(words)
		words_sents = [[w] for w in words]
		tagged_words = self.tagger.tag_sents(words_sents)
		word_to_tag = {}
		for i in range(0, len(words)):
			word_to_tag[words[i]] = tagged_words[i][0][1]
		
		for i in range(0, len(sents)):
			target = targets[i]
			head = heads[i]
			target_pos = str(tagged_sents[i][head][1])
		
			candidates = set(substitution_candidates[i])
			candidates = self.getCandidatesWithSamePOS(candidates, word_to_tag, target_pos)
		
			selected_substitutions.append(candidates)
		return selected_substitutions
	
	def getTargetPOS(self, sent, target, head):
		pos_data = []
		try:
			pos_data = nltk.pos_tag(sent)
			return pos_data[head][1]
		except UnicodeDecodeError:
			try:
				pos_data = nltk.pos_tag([target])  # pos_tag expects a token list, not a raw string
				return pos_data[0][1]
			except UnicodeDecodeError:
				return 'None'
			
		
	def getCandidatesWithSamePOS(self, candidates, word_to_tag, target_pos):
		result = set([])
		for candidate in candidates:
			if candidate in word_to_tag:
				ctag = word_to_tag[candidate]
				if ctag==target_pos:
					result.add(candidate)
		return result
	
	def toVictorFormat(self, victor_corpus, substitutions, output_path, addTargetAsCandidate=False):
		"""
		Saves a set of selected substitutions in a file in VICTOR format.
	
		@param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected.
		@param substitutions: The vector of substitutions selected for the VICTOR corpus.
		@param output_path: The path in which to save the resulting VICTOR corpus.
		@param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution.
		"""
		o = open(output_path, 'w')
		f = open(victor_corpus)
		for subs in substitutions:
			data = f.readline().strip().split('\t')
			sentence = data[0].strip()
			target = data[1].strip()
			head = data[2].strip()
			
			newline = sentence + '\t' + target + '\t' + head + '\t'
			for sub in subs:
				newline += '0:'+sub + '\t'
			o.write(newline.strip() + '\n')
		f.close()
		o.close()
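A hedged usage sketch for POSTagSelector, with placeholder paths and file names; it follows the constructor and selectCandidates signatures documented in the docstrings above.

# Placeholder paths; see the docstrings above for what each argument means.
selector = POSTagSelector('models/english-left3words-distsim.tagger',   # POS model
                          'stanford-postagger.jar',                     # tagger jar
                          '/usr/bin/java')                              # java executable
subs = {'perched': {'sat', 'roosted'}}
selected = selector.selectCandidates(subs, 'victor_corpus.txt')   # placeholder VICTOR corpus path
# selected[i] holds the candidates whose POS tag matches the target word's tag in instance i.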
Example #4
            #~ pos_documents.append(pos_words)
            #~
            #~ neg_sentences = ReviewFlat.objects.filter(store_app_id=app_id, date__range=(previous_date, date), star_rating__lt=4).exclude(body=None).values_list('body')
            #~ tagged_neg_sentences = stanford_pos_tag.tag_sents(neg_sentences)
            #~ new_neg_sentences = []
            #~ neg_words = []
            #~ neg_documents = []
            #~ for sentence in tagged_neg_sentences:
            #~ neg_words = [word[0] for word in sentence if word[1] in ['NN', 'NNS', 'NNP', 'RB', 'JJ', 'VB', 'VBG']]
            #~ neg_documents.append(neg_words)

            sentences = ReviewFlat.objects.filter(
                store_app_id=app_id,
                date__range=(previous_date,
                             date)).exclude(body=None).values_list('body')
            tagged_sentences = stanford_pos_tag.tag_sents(sentences)
            new_sentences = []
            words = []
            documents = []
            for sentence in tagged_sentences:
                words = [
                    word[0] for word in sentence if word[1] in
                    ['NN', 'NNS', 'NNP', 'RB', 'JJ', 'VB', 'VBG']
                ]
                documents.append(words)
        previous_date = date
n_dim = 300
#Initialize model and build vocab
app_w2v = Word2Vec(size=n_dim, min_count=10)
app_w2v.build_vocab(documents)
app_w2v.train(documents)
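Note that Word2Vec(size=...) and the bare train(documents) call above follow an older gensim API. A hedged sketch of the equivalent training step against gensim 4.x, where size became vector_size and train requires explicit counts:

# Hedged sketch for gensim >= 4.0; n_dim and documents come from the snippet above.
from gensim.models import Word2Vec

app_w2v = Word2Vec(vector_size=n_dim, min_count=10)
app_w2v.build_vocab(documents)
app_w2v.train(documents, total_examples=app_w2v.corpus_count, epochs=app_w2v.epochs)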
Example #5
      #~ for sentence in tagged_pos_sentences:
        #~ pos_words = [word[0] for word in sentence if word[1] in ['NN', 'NNS', 'NNP', 'RB', 'JJ', 'VB', 'VBG']]
        #~ pos_documents.append(pos_words)
#~ 
      #~ neg_sentences = ReviewFlat.objects.filter(store_app_id=app_id, date__range=(previous_date, date), star_rating__lt=4).exclude(body=None).values_list('body')
      #~ tagged_neg_sentences = stanford_pos_tag.tag_sents(neg_sentences)
      #~ new_neg_sentences = []
      #~ neg_words = []
      #~ neg_documents = []
      #~ for sentence in tagged_neg_sentences:
        #~ neg_words = [word[0] for word in sentence if word[1] in ['NN', 'NNS', 'NNP', 'RB', 'JJ', 'VB', 'VBG']]
        #~ neg_documents.append(neg_words)
    

      sentences = ReviewFlat.objects.filter(store_app_id=app_id, date__range=(previous_date, date)).exclude(body=None).values_list('body')
      tagged_sentences = stanford_pos_tag.tag_sents(sentences)
      new_sentences = []
      words = []
      documents = []
      for sentence in tagged_sentences:
        words = [word[0] for word in sentence if word[1] in ['NN', 'NNS', 'NNP', 'RB', 'JJ', 'VB', 'VBG']]
        documents.append(words)
    previous_date = date
n_dim = 300
#Initialize model and build vocab
app_w2v = Word2Vec(size=n_dim, min_count=10)
app_w2v.build_vocab(documents)
app_w2v.train(documents)

def buildWordVector(text, size):
  vec = np.zeros(size).reshape((1, size))
Example #6
class WordVectorSelector:
	
	def __init__(self, vector_model, pos_model, stanford_tagger, java_path, pos_type='none'):
		"""
		Creates an instance of the WordVectorSelector class.
	
		@param vector_model: Path to a binary word vector model.
		For instructions on how to create the model, please refer to the LEXenstein Manual.
		@param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
		The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param stanford_tagger: Path to the "stanford-postagger.jar" file.
		The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param java_path: Path to the system's "java" executable.
		Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems.
		@param pos_type: The type of POS tags with which the model's words are annotated, if any.
		Values supported: none, treebank, paetzold
		"""
		self.model = gensim.models.word2vec.Word2Vec.load_word2vec_format(vector_model, binary=True)
		self.pos_type = pos_type
		os.environ['JAVAHOME'] = java_path
		self.tagger = StanfordPOSTagger(pos_model, stanford_tagger)
	
	def selectCandidates(self, substitutions, victor_corpus, proportion=1.0, proportion_type='percentage', stop_words_file=None, window=99999, onlyInformative=False, keepTarget=False, onePerWord=False):
		"""
		Selects which candidates can replace the target complex words in each instance of a VICTOR corpus.
	
		@param substitutions: Candidate substitutions to be filtered.
		It can be in two formats:
		A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions.
		Example: substitutions['perched'] = {'sat', 'roosted'}
		A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector.
		Example: [['sat', 'roosted'], ['easy', 'uncomplicated']]
		@param victor_corpus: Path to a corpus in the VICTOR format.
		For more information about the file's format, refer to the LEXenstein Manual.
		@param proportion: Percentage of substitutions to keep.
		If proportion_type is set to "percentage", then this parameter must be a floating point number between 0 and 1.
		If proportion_type is set to "integer", then this parameter must be an integer number.
		@param proportion_type: Type of proportion to be kept.
		Values supported: percentage, integer.
		@param stop_words_file: Path to the file containing stop words of the desired language.
		The file must contain one stop word per line.
		@param window: Number of tokens around the target complex sentence to consider as its context.
		@param onlyInformative: If True, only content words are considered as part of the complex word's context, such as nouns, verbs, adjectives and adverbs.
		@param keepTarget: If True, the complex target word is also included as part of its context.
		@param onePerWord: If True, a word in the complex word's context can only contribute once to its resulting word vector.
		@return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus.
		"""
		#Initialize selected substitutions:
		selected_substitutions = []
		
		#Read stop words:
		stop_words = set([])
		if stop_words_file != None:
			stop_words = set([word.strip() for word in open(stop_words_file)])

		#Configure input:
		substitution_candidates = []
		if isinstance(substitutions, list):
			substitution_candidates = substitutions
		elif isinstance(substitutions, dict):
			void = VoidSelector()
			substitution_candidates = void.selectCandidates(substitutions, victor_corpus)
		else:
			print('ERROR: Substitutions are neither a dictionary nor a list!')
			return selected_substitutions		

		#Parse sentences:
		lexf = open(victor_corpus)
		sents = [line.strip().split('\t')[0].strip().split(' ') for line in lexf]
		lexf.close()
		tagged_sents = self.tagger.tag_sents(sents)
		
		#Transform them to the right format:
		if self.pos_type=='paetzold':
			transformed = []
			for sent in tagged_sents:
				tokens = []
				for token in sent:
					tokens.append((token[0], getGeneralisedPOS(token[1])))
				transformed.append(tokens)
			tagged_sents = transformed
		
		#Rank candidates:
		c = -1
		lexf = open(victor_corpus)
		for line in lexf:
			c += 1
			data = line.strip().split('\t')
			sent = data[0].strip()
			target = data[1].strip()
			head = int(data[2].strip())
			pos_tags = tagged_sents[c]
			target_pos = pos_tags[head][1]
		
			target_vec = self.getSentVec(sent, head, stop_words, window, onlyInformative, keepTarget, onePerWord, pos_tags)
			candidates = substitution_candidates[c]

			candidate_dists = {}
			for candidate in candidates:
				candidate_vec = self.getWordVec(candidate, target_pos)
				try:
					candidate_dists[candidate] = cosine(candidate_vec, target_vec)
				except ValueError:
					pass  # skip candidates whose vector could not be compared with the target vector

			final_candidates = self.getFinalCandidates(candidate_dists, proportion, proportion_type)

			selected_substitutions.append(final_candidates)
		lexf.close()
		return selected_substitutions
		
	def getSentVec(self, sentence, head, stop_words, window, onlyInformative, keepTarget, onePerWord, pos_tokens):
		informative_tags = set([])
		if onlyInformative:
			if self.pos_type=='treebank':
				informative_tags = set(['NN', 'NNS', 'JJ', 'JJS', 'JJR', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'RB', 'RBR', 'RBS'])
			if self.pos_type=='paetzold':
				informative_tags = set(['N', 'V', 'J', 'R'])
		
		tokens = sentence.split(' ')
		
		valid_tokens = []
		if keepTarget:
			valid = tokens[head].strip()
			if self.pos_type!='none':
				valid += '|||' + pos_tokens[head][1]
			valid_tokens.append(valid)
		
		if head>0:
			for i in range(max(0, head-window), head):
				if len(informative_tags)==0 or pos_tokens[i][1].lower().strip() in informative_tags:
					if tokens[i] not in stop_words:
						valid = tokens[i]
						if self.pos_type!='none':
							valid += '|||' + pos_tokens[i][1]
						valid_tokens.append(valid)
		
		if head<len(tokens)-1:
			for i in range(head+1, min(len(tokens), head+1+window)):
				if len(informative_tags)==0 or pos_tokens[i][1].lower().strip() in informative_tags:
					if tokens[i] not in stop_words:
						valid = tokens[i]
						if self.pos_type!='none':
							valid += '|||' + pos_tokens[i][1]
						valid_tokens.append(valid)
						
		if onePerWord:
			valid_tokens = list(set(valid_tokens))
		
		result = []
		for token in valid_tokens:
			if len(result)==0:
				try:
					result = self.model[token]
				except Exception:
					result = []
			else:
				try:
					result = np.add(result, self.model[token])
				except Exception:
					pass  # keep the running sum when a token has no vector in the model
		result = result/float(len(valid_tokens))
		return result
		
	def getWordVec(self, candidate, target_pos):
		cand = None
		if self.pos_type!='none':
			cand = candidate + '|||' + target_pos
		else:
			cand = candidate

		result = np.array([])
		try:
			result = self.model[cand]
		except Exception:
			pass
		return result
				
	def getFinalCandidates(self, candidate_dists, proportion, proportion_type):
		result = sorted(list(candidate_dists.keys()), key=candidate_dists.__getitem__)
		if proportion_type=='percentage':
			return result[0:max(1, int(proportion*float(len(result))))]
		elif proportion_type=='integer':
			if proportion>=len(result):
				return result
			else:
				return result[0:max(1, int(proportion))]
		else:
			print('Unrecognized proportion type.')
			return result
		
	def toVictorFormat(self, victor_corpus, substitutions, output_path, addTargetAsCandidate=False):
		"""
		Saves a set of selected substitutions in a file in VICTOR format.
	
		@param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected.
		@param substitutions: The vector of substitutions selected for the VICTOR corpus.
		@param output_path: The path in which to save the resulting VICTOR corpus.
		@param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution.
		"""
		o = open(output_path, 'w')
		f = open(victor_corpus)
		for subs in substitutions:
			data = f.readline().strip().split('\t')
			sentence = data[0].strip()
			target = data[1].strip()
			head = data[2].strip()
			
			newline = sentence + '\t' + target + '\t' + head + '\t'
			for sub in subs:
				newline += '0:'+sub + '\t'
			o.write(newline.strip() + '\n')
		f.close()
		o.close()
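As with POSTagSelector, a hedged usage sketch with placeholder paths; pos_type must match how the words in the binary vector model were annotated (none, treebank or paetzold):

# Placeholder paths and file names; see the constructor docstring above.
selector = WordVectorSelector('vectors.bin',                                  # binary word2vec model
                              'models/english-left3words-distsim.tagger',    # POS model
                              'stanford-postagger.jar',                      # tagger jar
                              '/usr/bin/java',
                              pos_type='treebank')
subs = {'perched': {'sat', 'roosted'}}
selected = selector.selectCandidates(subs, 'victor_corpus.txt',   # placeholder inputs
                                     proportion=0.5, proportion_type='percentage')
# Keeps, per instance, the half of the candidates closest to the target's context vector (cosine distance).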
Example #7
import os
import re
from nltk.tag.stanford import StanfordPOSTagger
from nltk import word_tokenize
import json
import helpers

if not os.path.exists('scenes'):
    print('Scenes folder doesn\'t exist :( Run prepare.py first')
    raise SystemExit(1)  # without scenes/, the os.listdir call below would fail anyway

pos_tagger = StanfordPOSTagger(r'english-bidirectional-distsim.tagger')

for filename in sorted(os.listdir('scenes'), key=helpers.natural_keys):
    scenefile = open('scenes/' + filename, 'r+')
    scene = json.load(scenefile)
    scene['processed'] = list()
    for sentence in scene['raw']:
        words = word_tokenize(sentence)
        words = list(filter(None,
                            [re.sub(r'\W+', '', word) for word in words]))
        scene['processed'].append(words)
    scene['processed'] = pos_tagger.tag_sents(scene['processed'])
    scenefile.seek(0)
    json.dump(scene, scenefile, indent=2)
    scenefile.truncate()
    scenefile.close()
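After this script runs, each scene file gains a 'processed' field with one tagged sentence per raw sentence; json.dump serializes the (token, tag) tuples as two-element arrays. A hedged sketch of reading one of the updated files back (the file name is a placeholder):

import json

with open('scenes/scene_001.json') as f:   # placeholder file name
    scene = json.load(f)
for token, tag in scene['processed'][0]:   # first tagged sentence of the scene
    print(token, tag)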
# Imports assumed by the TextProcessor class below (not shown in the original snippet):
import re
import string
from collections import Counter, defaultdict
from urllib.parse import urlsplit, urlunsplit

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tag import StanfordPOSTagger
from nltk.tokenize.casual import TweetTokenizer


class TextProcessor:
    def __init__(self, corpus, expanded_urls):
        self.tokenizer = TweetTokenizer()
        self.stemmer = PorterStemmer()
        self.stopwords = stopwords.words('english')
        self.corpus = corpus
        self.expanded_urls = expanded_urls
        self.re_url = r'http\S+'
        self.punctuation = string.punctuation
        self.stanford_pos_pwd = '/Users/mquezada/stanford-postagger-full-2015-12-09/'
        self.stanford_pos = StanfordPOSTagger(
            self.stanford_pos_pwd + 'models/english-left3words-distsim.tagger',
            self.stanford_pos_pwd + 'stanford-postagger.jar')
        self.tag_vocab = defaultdict(Counter)
        self.tag_token = dict()
        self.vocab = defaultdict(set)
        self.tags = Counter()

    def __iter__(self):
        yield from self.process()

    def process(self):
        for tokens in self.stanford_pos.tag_sents(self.tokenseq_generator()):
            #for tokens in self.tokenseq_generator():
            res = []
            for token, tag in tokens:
                #for token in tokens:
                processed = self.process_token(token)
                if processed:
                    #most_similar = self.w2v.most_similar(token)
                    self.tag_vocab[processed].update({tag: 1})
                    self.tag_token[token] = tag
                    self.tags.update({tag: 1})

                    res.append(processed)
            if res:
                yield res

    @staticmethod
    def clean_url(url):
        spl = urlsplit(url)
        spl = urlsplit(spl.geturl())
        return urlunsplit((spl[0], spl[1], spl[2], '', ''))

    def process_token(self, token):
        if re.match(self.re_url, token):
            return TextProcessor.clean_url(self.expanded_urls.get(
                token, token))

        t = token.lower()
        #t = token

        if t in self.stopwords or t in self.punctuation:
            return None

        if len(t) < 3 or t.startswith('@'):
            return None

        if not t.startswith('#'):
            t = t.translate({ord(k): "" for k in self.punctuation})

        t = self.stemmer.stem(t)

        self.vocab[t].add(token)
        return t

    def tokenseq_generator(self):
        for text in self.corpus:
            yield self.tokenizer.tokenize(text)
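A hedged usage sketch for TextProcessor with made-up inputs; iterating over the instance streams the processed token lists while the tag counters fill in as a side effect:

# Made-up corpus and URL map; expanded_urls maps shortened URLs found in tweets to their targets.
corpus = ['Loving this new phone! http://t.co/abc123', 'Battery drains way too fast :(']
expanded = {'http://t.co/abc123': 'http://example.com/review'}
tp = TextProcessor(corpus, expanded)
for tokens in tp:
    print(tokens)                  # stemmed, filtered tokens for one tweet
print(tp.tags.most_common(5))      # POS tag counts accumulated while processing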
	newsent2 = ''
	tokens = sent2.split(' ')
	index2 = -1
	for i in range(0, len(tokens)):
		token = tokens[i]
		if token=='<b>':
			index2 = i
		if token!='<b>' and token!='</b>':
			newsent2 += token + ' '
	newsent2 = newsent2.strip()
	sents2.append(newsent2.split(' '))

	heads2.append(index2)
f.close()

tagged_sents1 = tagger.tag_sents(sents1)
tagged_sents2 = tagger.tag_sents(sents2)

f = open('ratings.txt')
o = open('dataset.txt', 'w')
c = -1
for line in f:
	c += 1
	data = line.strip().split('\t')
	word1 = data[1].strip()
	word2 = data[3].strip()
	sent1 = data[5].strip()
	sent2 = data[6].strip()
	tagged_sent1 = tagged_sents1[c]
	tagged_sent2 = tagged_sents2[c]
	head1 = heads1[c]
Example #10
print "time_cost = %.2fs" % (time.time() - start)

# #############################
# ### CORPUS PRE-PROCESSING ###
# #############################
print "pos..."
start = time.time()
speakers = []
utterances = []
for item in json.load(codecs.open(path, encoding='utf-8')):
    speakers.append(item['role'])
    utterances.append(utils.clean_utterance(item['text'], filler_words))

utterances_tagged = [
    ' '.join(['/'.join(t) for t in sent])
    for sent in pos_tagger.tag_sents([u.split() for u in utterances])
]
print "time_cost = %.2fs" % (time.time() - start)

data = zip(range(len(utterances)), speakers, utterances_tagged)
print "UCD..."
start = time.time()
communities = detection(data, stopwords, config)
print "time_cost = %.2fs" % (time.time() - start)
for c in communities[0]:
    print(utils.remove_tags_from_text(c))

print "MSC..."
start = time.time()
compressions, graphs = compression(communities, stopwords, word_vectors,
                                   language_model, config, language)
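For reference, utterances_tagged above stores each utterance as a single 'token/TAG token/TAG ...' string. A hedged illustration of that formatting step on one made-up utterance (pos_tagger is the tagger already constructed earlier in this script):

# Standalone illustration of the join used above (utterance and resulting tags are illustrative only).
tagged = pos_tagger.tag_sents([['so', 'what', 'do', 'we', 'do', 'next']])
line = ' '.join('/'.join(t) for t in tagged[0])
# line looks like 'so/RB what/WP do/VBP we/PRP do/VB next/JJ'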