def tagger(self):
    self.tokenize(self.taggerUse)
    if self.taggerUse == 'stanford':
        tagger = StanfordPOSTagger(
            '/home/jmutal/dataMining/stanford-postagger-full-2017-06-09/models/spanish-distsim.tagger',
            '/home/jmutal/dataMining/stanford-postagger-full-2017-06-09/stanford-postagger-3.8.0.jar')
        tagged_sents = tagger.tag_sents(self.clean_corpus)
    else:
        tagged_sents = self.nlp.pipe(self.clean_corpus, n_threads=8)
        tagged_sents = self.proccess_spacy(tagged_sents)
    return self.dig2num(tagged_sents)
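# A minimal standalone sketch of the Stanford branch above, assuming the same
# model/jar paths; clean_corpus here is a made-up list of pre-tokenized sentences.
from nltk.tag.stanford import StanfordPOSTagger

clean_corpus = [['El', 'gato', 'duerme'], ['Hola', 'mundo']]
tagger = StanfordPOSTagger(
    '/home/jmutal/dataMining/stanford-postagger-full-2017-06-09/models/spanish-distsim.tagger',
    '/home/jmutal/dataMining/stanford-postagger-full-2017-06-09/stanford-postagger-3.8.0.jar')
# tag_sents returns one list of (token, tag) pairs per input sentence.
print(tagger.tag_sents(clean_corpus))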
def compute_pos(self):
    path = self._get_pos_path()
    if not path.exists():
        logging.info("Computing POS tags from tweets...")
        from nltk.tag.stanford import StanfordPOSTagger
        from nltk.tokenize.casual import TweetTokenizer
        s_path = self.stanford_path
        stanford_tagger = StanfordPOSTagger(
            Path(s_path, 'models/english-left3words-distsim.tagger').as_posix(),
            Path(s_path, 'stanford-postagger.jar').as_posix())
        tokenizer = TweetTokenizer()
        tagged_tweets = stanford_tagger.tag_sents(
            [tokenizer.tokenize(text) for text in self.corpus['text']])
        for tagged_tweet in tagged_tweets:
            for token, tag in tagged_tweet:
                if len(token) > 0:
                    self.token_tags.add((token, tag))
        # Cache the unique (token, tag) pairs as a tab-separated file:
        with path.open('w') as f:
            for token, tag in self.token_tags:
                f.write('%s\t%s\n' % (token, tag))
        self.token_tags = dict(self.token_tags)
        logging.info("Wrote %d unique (word, pos_tag) pairs" % len(self.token_tags))
    else:
        logging.info("POS tag file already exists. Loading into class instance...")
        with path.open() as f:
            tags = [line.split() for line in f.readlines()]
        self.token_tags = dict(tags)
        logging.info("POS tags loaded.")
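# Hedged sketch: the cache written by compute_pos is a two-column "token<TAB>tag"
# file, so it can be reloaded without the class; 'pos_tags.tsv' is a hypothetical
# stand-in for whatever _get_pos_path() returns.
token_tags = {}
with open('pos_tags.tsv') as f:
    for line in f:
        token, tag = line.split()
        token_tags[token] = tag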
class POSTagSelector:

    def __init__(self, pos_model, stanford_tagger, java_path):
        """
        Creates a POSTagSelector instance.

        @param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
        The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
        @param stanford_tagger: Path to the "stanford-postagger.jar" file.
        The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
        @param java_path: Path to the system's "java" executable.
        It can commonly be found in "/usr/bin/java" on Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" on Windows systems.
        """
        os.environ['JAVAHOME'] = java_path
        self.tagger = StanfordPOSTagger(pos_model, stanford_tagger)

    def selectCandidates(self, substitutions, victor_corpus):
        """
        Selects which candidates can replace the target complex words in each instance of a VICTOR corpus.

        @param substitutions: Candidate substitutions to be filtered. It can be in two formats:
        A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions.
        Example: substitutions['perched'] = {'sat', 'roosted'}
        A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector.
        Example: [['sat', 'roosted'], ['easy', 'uncomplicated']]
        @param victor_corpus: Path to a corpus in the VICTOR format.
        For more information about the file's format, refer to the LEXenstein Manual.
        @return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus.
        """
        selected_substitutions = []

        # Configure input:
        substitution_candidates = []
        if isinstance(substitutions, list):
            substitution_candidates = substitutions
        elif isinstance(substitutions, dict):
            void = VoidSelector()
            substitution_candidates = void.selectCandidates(substitutions, victor_corpus)
        else:
            print('ERROR: Substitutions are neither a dictionary nor a list!')
            return selected_substitutions

        # Read VICTOR corpus:
        lexf = open(victor_corpus)
        sents = []
        targets = []
        heads = []
        words = set([])
        c = -1
        for line in lexf:
            c += 1
            data = line.strip().split('\t')
            sent = data[0].strip().split(' ')
            target = data[1].strip()
            head = int(data[2].strip())
            sents.append(sent)
            targets.append(target)
            heads.append(head)
            words.update(set(substitution_candidates[c]))
        lexf.close()

        # Tag sentences:
        tagged_sents = self.tagger.tag_sents(sents)

        # Tag candidate words (each word is tagged as a one-token sentence):
        words = list(words)
        words_sents = [[w] for w in words]
        tagged_words = self.tagger.tag_sents(words_sents)
        word_to_tag = {}
        for i in range(len(words)):
            word_to_tag[words[i]] = tagged_words[i][0][1]

        # Keep only the candidates whose POS tag matches the target's:
        for i in range(len(sents)):
            target = targets[i]
            head = heads[i]
            target_pos = str(tagged_sents[i][head][1])

            candidates = set(substitution_candidates[i])
            candidates = self.getCandidatesWithSamePOS(candidates, word_to_tag, target_pos)

            selected_substitutions.append(candidates)
        return selected_substitutions

    def getTargetPOS(self, sent, target, head):
        try:
            pos_data = nltk.pos_tag(sent)
            return pos_data[head][1]
        except UnicodeDecodeError:
            try:
                # pos_tag expects a token list; wrap the single target word.
                pos_data = nltk.pos_tag([target])
                return pos_data[0][1]
            except UnicodeDecodeError:
                return 'None'

    def getCandidatesWithSamePOS(self, candidates, word_to_tag, target_pos):
        result = set([])
        for candidate in candidates:
            if candidate in word_to_tag:
                ctag = word_to_tag[candidate]
                if ctag == target_pos:
                    result.add(candidate)
        return result

    def toVictorFormat(self, victor_corpus, substitutions, output_path, addTargetAsCandidate=False):
        """
        Saves a set of selected substitutions in a file in VICTOR format.

        @param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected.
        @param substitutions: The vector of substitutions selected for the VICTOR corpus.
        @param output_path: The path in which to save the resulting VICTOR corpus.
        @param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution.
        """
        o = open(output_path, 'w')
        f = open(victor_corpus)
        for subs in substitutions:
            data = f.readline().strip().split('\t')
            sentence = data[0].strip()
            target = data[1].strip()
            head = data[2].strip()
            newline = sentence + '\t' + target + '\t' + head + '\t'
            for sub in subs:
                newline += '0:' + sub + '\t'
            o.write(newline.strip() + '\n')
        f.close()
        o.close()
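# Hedged usage sketch for POSTagSelector; the tagger paths are assumptions and
# 'corpus.victor' is a hypothetical VICTOR-format dataset.
selector = POSTagSelector(
    '/opt/stanford-postagger/models/english-left3words-distsim.tagger',
    '/opt/stanford-postagger/stanford-postagger.jar',
    '/usr/bin/java')
substitutions = {'perched': {'sat', 'roosted'}}
selected = selector.selectCandidates(substitutions, 'corpus.victor')
selector.toVictorFormat('corpus.victor', selected, 'selected.victor')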
#~ for sentence in tagged_pos_sentences:
#~     pos_words = [word[0] for word in sentence if word[1] in ['NN', 'NNS', 'NNP', 'RB', 'JJ', 'VB', 'VBG']]
#~     pos_documents.append(pos_words)
#~
#~ neg_sentences = ReviewFlat.objects.filter(store_app_id=app_id, date__range=(previous_date, date), star_rating__lt=4).exclude(body=None).values_list('body')
#~ tagged_neg_sentences = stanford_pos_tag.tag_sents(neg_sentences)
#~ new_neg_sentences = []
#~ neg_words = []
#~ neg_documents = []
#~ for sentence in tagged_neg_sentences:
#~     neg_words = [word[0] for word in sentence if word[1] in ['NN', 'NNS', 'NNP', 'RB', 'JJ', 'VB', 'VBG']]
#~     neg_documents.append(neg_words)

sentences = ReviewFlat.objects.filter(
    store_app_id=app_id,
    date__range=(previous_date, date)).exclude(body=None).values_list('body')
tagged_sentences = stanford_pos_tag.tag_sents(sentences)

# Keep only content words (nouns, adverbs, adjectives, verbs):
documents = []
for sentence in tagged_sentences:
    words = [word[0] for word in sentence
             if word[1] in ['NN', 'NNS', 'NNP', 'RB', 'JJ', 'VB', 'VBG']]
    documents.append(words)
previous_date = date

n_dim = 300

# Initialize model and build vocab (old gensim API; gensim >= 1.0 requires
# train(documents, total_examples=..., epochs=...)).
app_w2v = Word2Vec(size=n_dim, min_count=10)
app_w2v.build_vocab(documents)
app_w2v.train(documents)


def buildWordVector(text, size):
    # Average the word2vec vectors of the tokens in `text`; the original
    # snippet is truncated after the first line, so the rest of the body is
    # reconstructed from the conventional averaged-vector pattern.
    vec = np.zeros(size).reshape((1, size))
    count = 0
    for word in text:
        try:
            vec += app_w2v[word].reshape((1, size))
            count += 1
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec
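# Hedged usage sketch: stacking the averaged vectors built above into one
# feature matrix (assumes np, documents, n_dim and the trained app_w2v in scope).
doc_vecs = np.concatenate([buildWordVector(doc, n_dim) for doc in documents])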
class WordVectorSelector:

    def __init__(self, vector_model, pos_model, stanford_tagger, java_path, pos_type='none'):
        """
        Creates an instance of the WordVectorSelector class.

        @param vector_model: Path to a binary word vector model.
        For instructions on how to create the model, please refer to the LEXenstein Manual.
        @param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
        The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
        @param stanford_tagger: Path to the "stanford-postagger.jar" file.
        The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
        @param java_path: Path to the system's "java" executable.
        It can commonly be found in "/usr/bin/java" on Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" on Windows systems.
        @param pos_type: The type of POS tags with which the model's words are annotated, if any.
        Values supported: none, treebank, paetzold
        """
        self.model = gensim.models.word2vec.Word2Vec.load_word2vec_format(vector_model, binary=True)
        self.pos_type = pos_type
        os.environ['JAVAHOME'] = java_path
        self.tagger = StanfordPOSTagger(pos_model, stanford_tagger)

    def selectCandidates(self, substitutions, victor_corpus, proportion=1.0, proportion_type='percentage',
                         stop_words_file=None, window=99999, onlyInformative=False, keepTarget=False,
                         onePerWord=False):
        """
        Selects which candidates can replace the target complex words in each instance of a VICTOR corpus.

        @param substitutions: Candidate substitutions to be filtered. It can be in two formats:
        A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions.
        Example: substitutions['perched'] = {'sat', 'roosted'}
        A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector.
        Example: [['sat', 'roosted'], ['easy', 'uncomplicated']]
        @param victor_corpus: Path to a corpus in the VICTOR format.
        For more information about the file's format, refer to the LEXenstein Manual.
        @param proportion: Percentage of substitutions to keep.
        If proportion_type is set to "percentage", then this parameter must be a floating point number between 0 and 1.
        If proportion_type is set to "integer", then this parameter must be an integer number.
        @param proportion_type: Type of proportion to be kept. Values supported: percentage, integer.
        @param stop_words_file: Path to the file containing stop words of the desired language.
        The file must contain one stop word per line.
        @param window: Number of tokens around the target complex word to consider as its context.
        @param onlyInformative: If True, only content words, such as nouns, verbs, adjectives and adverbs, are considered part of the complex word's context.
        @param keepTarget: If True, the complex target word is also included as part of its context.
        @param onePerWord: If True, a word in the complex word's context can only contribute once to its resulting word vector.
        @return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus.
        """
        # Initialize selected substitutions:
        selected_substitutions = []

        # Read stop words:
        stop_words = set([])
        if stop_words_file is not None:
            stop_words = set([word.strip() for word in open(stop_words_file)])

        # Configure input:
        substitution_candidates = []
        if isinstance(substitutions, list):
            substitution_candidates = substitutions
        elif isinstance(substitutions, dict):
            void = VoidSelector()
            substitution_candidates = void.selectCandidates(substitutions, victor_corpus)
        else:
            print('ERROR: Substitutions are neither a dictionary nor a list!')
            return selected_substitutions

        # Parse sentences:
        lexf = open(victor_corpus)
        sents = [line.strip().split('\t')[0].strip().split(' ') for line in lexf]
        lexf.close()
        tagged_sents = self.tagger.tag_sents(sents)

        # Transform them into the right format:
        if self.pos_type == 'paetzold':
            transformed = []
            for sent in tagged_sents:
                tokens = []
                for token in sent:
                    tokens.append((token[0], getGeneralisedPOS(token[1])))
                transformed.append(tokens)
            tagged_sents = transformed

        # Rank candidates:
        c = -1
        lexf = open(victor_corpus)
        for line in lexf:
            c += 1
            data = line.strip().split('\t')
            sent = data[0].strip()
            target = data[1].strip()
            head = int(data[2].strip())

            pos_tags = tagged_sents[c]
            target_pos = pos_tags[head][1]

            target_vec = self.getSentVec(sent, head, stop_words, window, onlyInformative,
                                         keepTarget, onePerWord, pos_tags)
            candidates = substitution_candidates[c]

            candidate_dists = {}
            for candidate in candidates:
                candidate_vec = self.getWordVec(candidate, target_pos)
                try:
                    candidate_dists[candidate] = cosine(candidate_vec, target_vec)
                except ValueError:
                    # Skip candidates whose vectors are missing or of mismatched size.
                    pass

            final_candidates = self.getFinalCandidates(candidate_dists, proportion, proportion_type)
            selected_substitutions.append(final_candidates)
        lexf.close()
        return selected_substitutions

    def getSentVec(self, sentence, head, stop_words, window, onlyInformative, keepTarget, onePerWord, pos_tokens):
        informative_tags = set([])
        if onlyInformative:
            if self.pos_type == 'treebank':
                informative_tags = set(['NN', 'NNS', 'JJ', 'JJS', 'JJR', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'RB', 'RBR', 'RBS'])
            if self.pos_type == 'paetzold':
                informative_tags = set(['N', 'V', 'J', 'R'])

        tokens = sentence.split(' ')
        valid_tokens = []

        # Add the target word itself, if requested:
        if keepTarget:
            valid = tokens[head].strip()
            if self.pos_type != 'none':
                valid += '|||' + pos_tokens[head][1]
            valid_tokens.append(valid)

        # Add context words to the left of the target (tags are matched
        # case-sensitively against the uppercase sets above):
        if head > 0:
            for i in range(max(0, head - window), head):
                if len(informative_tags) == 0 or pos_tokens[i][1].strip() in informative_tags:
                    if tokens[i] not in stop_words:
                        valid = tokens[i]
                        if self.pos_type != 'none':
                            valid += '|||' + pos_tokens[i][1]
                        valid_tokens.append(valid)

        # Add context words to the right of the target:
        if head < len(tokens) - 1:
            for i in range(head + 1, min(len(tokens), head + 1 + window)):
                if len(informative_tags) == 0 or pos_tokens[i][1].strip() in informative_tags:
                    if tokens[i] not in stop_words:
                        valid = tokens[i]
                        if self.pos_type != 'none':
                            valid += '|||' + pos_tokens[i][1]
                        valid_tokens.append(valid)

        if onePerWord:
            valid_tokens = list(set(valid_tokens))

        # Sum the vectors of all valid context tokens, then average:
        result = []
        for token in valid_tokens:
            if len(result) == 0:
                try:
                    result = self.model[token]
                except Exception:
                    result = []
            else:
                try:
                    result = np.add(result, self.model[token])
                except Exception:
                    pass
        if len(valid_tokens) > 0 and len(result) > 0:
            result = result / float(len(valid_tokens))
        return result

    def getWordVec(self, candidate, target_pos):
        if self.pos_type != 'none':
            cand = candidate + '|||' + target_pos
        else:
            cand = candidate
        result = np.array([])
        try:
            result = self.model[cand]
        except Exception:
            pass
        return result

    def getFinalCandidates(self, candidate_dists, proportion, proportion_type):
        # Sort candidates by increasing cosine distance to the context vector:
        result = sorted(list(candidate_dists.keys()), key=candidate_dists.__getitem__)
        if proportion_type == 'percentage':
            return result[0:max(1, int(proportion * float(len(result))))]
        elif proportion_type == 'integer':
            if proportion >= len(result):
                return result
            else:
                return result[0:max(1, int(proportion))]
        else:
            print('Unrecognized proportion type.')
            return result

    def toVictorFormat(self, victor_corpus, substitutions, output_path, addTargetAsCandidate=False):
        """
        Saves a set of selected substitutions in a file in VICTOR format.

        @param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected.
        @param substitutions: The vector of substitutions selected for the VICTOR corpus.
        @param output_path: The path in which to save the resulting VICTOR corpus.
        @param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution.
        """
        o = open(output_path, 'w')
        f = open(victor_corpus)
        for subs in substitutions:
            data = f.readline().strip().split('\t')
            sentence = data[0].strip()
            target = data[1].strip()
            head = data[2].strip()
            newline = sentence + '\t' + target + '\t' + head + '\t'
            for sub in subs:
                newline += '0:' + sub + '\t'
            o.write(newline.strip() + '\n')
        f.close()
        o.close()
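# Hedged usage sketch for WordVectorSelector; every path below is an
# assumption, and 'corpus.victor' is a hypothetical VICTOR-format file.
selector = WordVectorSelector(
    '/opt/embeddings/word_vectors.bin',
    '/opt/stanford-postagger/models/english-left3words-distsim.tagger',
    '/opt/stanford-postagger/stanford-postagger.jar',
    '/usr/bin/java',
    pos_type='treebank')
selected = selector.selectCandidates(
    {'perched': {'sat', 'roosted'}}, 'corpus.victor',
    proportion=0.5, proportion_type='percentage',
    onlyInformative=True, keepTarget=True)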
import os
import re
import json

from nltk.tag.stanford import StanfordPOSTagger
from nltk import word_tokenize

import helpers

if not os.path.exists('scenes'):
    # Without the scenes folder there is nothing to tag, so stop here.
    raise SystemExit('Scenes folder doesn\'t exist :( Run prepare.py first')

# With only a model name given, NLTK locates the tagger jar via the
# CLASSPATH environment variable and the model via STANFORD_MODELS.
pos_tagger = StanfordPOSTagger(r'english-bidirectional-distsim.tagger')

for filename in sorted(os.listdir('scenes'), key=helpers.natural_keys):
    scenefile = open('scenes/' + filename, 'r+')
    scene = json.load(scenefile)
    scene['processed'] = list()
    for sentence in scene['raw']:
        words = word_tokenize(sentence)
        # Strip non-word characters and drop tokens that become empty.
        words = list(filter(None, [re.sub(r'\W+', '', word) for word in words]))
        scene['processed'].append(words)
    scene['processed'] = pos_tagger.tag_sents(scene['processed'])
    # Rewrite the scene file in place with the tagged output.
    scenefile.seek(0)
    json.dump(scene, scenefile, indent=2)
    scenefile.truncate()
    scenefile.close()
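# Hedged setup sketch for the bare model name above: NLTK resolves the tagger
# jar through CLASSPATH and the model through STANFORD_MODELS, so something
# like this (paths are assumptions) must run before the tagger is constructed:
import os
os.environ['CLASSPATH'] = '/opt/stanford-postagger/stanford-postagger.jar'
os.environ['STANFORD_MODELS'] = '/opt/stanford-postagger/models'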
class TextProcessor:

    def __init__(self, corpus, expanded_urls):
        self.tokenizer = TweetTokenizer()
        self.stemmer = PorterStemmer()
        self.stopwords = stopwords.words('english')
        self.corpus = corpus
        self.expanded_urls = expanded_urls
        self.re_url = r'http\S+'
        self.punctuation = string.punctuation

        self.stanford_pos_pwd = '/Users/mquezada/stanford-postagger-full-2015-12-09/'
        self.stanford_pos = StanfordPOSTagger(
            self.stanford_pos_pwd + 'models/english-left3words-distsim.tagger',
            self.stanford_pos_pwd + 'stanford-postagger.jar')

        self.tag_vocab = defaultdict(Counter)
        self.tag_token = dict()
        self.vocab = defaultdict(set)
        self.tags = Counter()

    def __iter__(self):
        yield from self.process()

    def process(self):
        for tokens in self.stanford_pos.tag_sents(self.tokenseq_generator()):
            # for tokens in self.tokenseq_generator():
            res = []
            for token, tag in tokens:
                # for token in tokens:
                processed = self.process_token(token)
                if processed:
                    # most_similar = self.w2v.most_similar(token)
                    self.tag_vocab[processed].update({tag: 1})
                    self.tag_token[token] = tag
                    self.tags.update({tag: 1})
                    res.append(processed)
            if res:
                yield res

    @staticmethod
    def clean_url(url):
        # Drop the query string and fragment, keeping scheme://netloc/path.
        spl = urlsplit(url)
        spl = urlsplit(spl.geturl())
        return urlunsplit((spl[0], spl[1], spl[2], '', ''))

    def process_token(self, token):
        if re.match(self.re_url, token):
            return TextProcessor.clean_url(self.expanded_urls.get(token, token))
        t = token.lower()
        # t = token
        if t in self.stopwords or t in self.punctuation:
            return None
        if len(t) < 3 or t.startswith('@'):
            return None
        if not t.startswith('#'):
            t = t.translate({ord(k): "" for k in self.punctuation})
        t = self.stemmer.stem(t)
        self.vocab[t].add(token)
        return t

    def tokenseq_generator(self):
        for text in self.corpus:
            yield self.tokenizer.tokenize(text)
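# Hedged usage sketch for TextProcessor; the tweets and URL map are made up,
# and the Stanford paths hard-coded in __init__ are assumed to exist.
corpus = ['Check this out http://t.co/abc123 #breaking', 'Nothing to see here...']
expanded_urls = {'http://t.co/abc123': 'http://example.com/story?utm=x'}
processor = TextProcessor(corpus, expanded_urls)
for processed_tokens in processor:
    print(processed_tokens)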
    # (Fragment: this block runs inside a loop over the input file, stripping
    # the <b>...</b> markers from the second sentence and recording the marked
    # token's index.)
    newsent2 = ''
    tokens = sent2.split(' ')
    index2 = -1
    for i in range(0, len(tokens)):
        token = tokens[i]
        if token == '<b>':
            index2 = i
        if token != '<b>' and token != '</b>':
            newsent2 += token + ' '
    newsent2 = newsent2.strip()
    sents2.append(newsent2.split(' '))
    heads2.append(index2)
f.close()

tagged_sents1 = tagger.tag_sents(sents1)
tagged_sents2 = tagger.tag_sents(sents2)

f = open('ratings.txt')
o = open('dataset.txt', 'w')
c = -1
for line in f:
    c += 1
    data = line.strip().split('\t')
    word1 = data[1].strip()
    word2 = data[3].strip()
    sent1 = data[5].strip()
    sent2 = data[6].strip()
    tagged_sent1 = tagged_sents1[c]
    tagged_sent2 = tagged_sents2[c]
    head1 = heads1[c]
print "time_cost = %.2fs" % (time.time() - start) # ############################# # ### CORPUS PRE-PROCESSING ### # ############################# print "pos..." start = time.time() speakers = [] utterances = [] for item in json.load(codecs.open(path, encoding='utf-8'), encoding='utf-8'): speakers.append(item['role']) utterances.append(utils.clean_utterance(item['text'], filler_words)) utterances_tagged = [ ' '.join(['/'.join(t) for t in sent]) for sent in pos_tagger.tag_sents([u.split() for u in utterances]) ] print "time_cost = %.2fs" % (time.time() - start) data = zip(range(len(utterances)), speakers, utterances_tagged) print "UCD..." start = time.time() communities = detection(data, stopwords, config) print "time_cost = %.2fs" % (time.time() - start) for c in communities[0]: print utils.remove_tags_from_text(c) print "MSC..." start = time.time() compressions, graphs = compression(communities, stopwords, word_vectors, language_model, config, language)