Example #1
    def __init__(self, language, sw_files=[], load_default=True):
        self.language = language
        self.stopwords = []

        if load_default:
            wlcr = WordListCorpusReader(data.GzipFileSystemPathPointer(DEFAULT_SW_FILE), [language], encoding="utf-8")
            self.stopwords = wlcr.words(language)
            logging.info("Loaded default stopwords from file %s" % DEFAULT_SW_FILE)

        path = BASE_SW_PATH + language
        for sw_file in sw_files:
            wlcr = WordListCorpusReader(data.FileSystemPathPointer(path), sw_file, encoding="utf-8")
            self.stopwords += wlcr.words(sw_file)
            logging.info("Loaded stopwords from file '%s'" % sw_file)
Example #2
def load_token_list(term_file):
    '''
    Load a stopword list from the corpus.
    '''
    __location__ = '../corpora/'
    tokens = WordListCorpusReader(__location__, term_file)
    return [w.replace('+', r'\+') for w in tokens.words()]
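The '+' escaping in the return value suggests these tokens are later joined into a regular expression; a plausible use of the returned list (an assumption, not taken from the surrounding project) would be:

import re

# Hypothetical usage: join the already-escaped tokens into one alternation
# and strip every occurrence from a text block. The file name is only an
# example of a word list under the corpus root.
tokens = load_token_list('mimetypes.txt')
if tokens:
    pattern = re.compile('|'.join(tokens))
    cleaned = pattern.sub('', 'responses are served as application/json here')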
Example #3
def load_token_list(term_file):
    '''
    Load a stopword list from the corpus.
    '''
    __location__ = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                _corpus_root)
    tokens = WordListCorpusReader(__location__, term_file)
    return [w.replace('+', r'\+') for w in tokens.words()]
Example #4
def load_token_list(term_file):
    '''
    Load a stopword list from the corpus.
    '''
    __location__ = os.path.join(
        os.path.abspath(os.path.dirname(__file__)), _corpus_root)
    tokens = WordListCorpusReader(__location__, term_file)
    return [w.replace('+', r'\+') for w in tokens.words()]
Example #5
def extract_mimetypes(text, do_replace=True):
    '''
    Pull a list of MIME types from a text feature.

    Return the MIME types found in the text block together with
    the text, either stripped of those tokens or unmodified.
    '''
    mimetypes = WordListCorpusReader(_corpus_root, 'mimetypes.txt')

    found_mimetypes = [w for w in mimetypes.words() if w in text]

    if do_replace:
        text = remove_tokens('mimetypes.txt', text)

    return found_mimetypes, text
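Assuming _corpus_root points at a directory containing the mimetypes.txt word list and remove_tokens is the project's helper for stripping those tokens, a call would look roughly like this (invented input, not verified against the original project):

description = 'Data is available as text/csv and application/json downloads.'
found, cleaned = extract_mimetypes(description)
# found   -> the entries from mimetypes.txt that occur in the description
# cleaned -> the description with those tokens removed (do_replace=True)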
Example #6
def extract_mimetypes(text, do_replace=True):
    '''
    Pull a list of MIME types from a text feature.

    Return the MIME types found in the text block together with
    the text, either stripped of those tokens or unmodified.
    '''
    mimetypes = WordListCorpusReader(_corpus_root, 'mimetypes.txt')

    found_mimetypes = [w for w in mimetypes.words() if w in text]

    if do_replace:
        text = remove_tokens('mimetypes.txt', text)

    return found_mimetypes, text
Example #7
def read_stopwords(path):
    '''Read a stopword list with NLTK.
    '''
    root, fileid = os.path.split(path)
    stopwords = WordListCorpusReader(root, [fileid])
    return stopwords.words(fileid)
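A quick usage sketch; the path is a placeholder for any plain-text file with one stopword per line:

stopwords = set(read_stopwords('data/stopwords.txt'))  # hypothetical path
tokens = [t for t in ['this', 'is', 'a', 'sample'] if t not in stopwords]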
Example #8
class OpinionSentenceFinder:
	def __init__(self, features, feature_sentences):
		self.feature_sentences = feature_sentences
		self.opinion_sentences = []
		self.features = features
		self.__init_corpora()
		for sent_index in xrange(len(self.feature_sentences)):
			sent = self.feature_sentences[sent_index]
			self.feature_sentences[sent_index]['opinion_sent'] = []
			for feature in self.features:
				feature = feature[0]
				if feature in sent['nouns'] or feature in sent['noun_phrases']:
					for index in xrange(len(sent['tags'])):
						(w, t) = sent['tags'][index]
						if w.find(feature.split()[0]) > -1:
							JJ = self.get_nearest_JJ(sent['tags'], index)
							self.feature_sentences[sent_index]['opinion_sent'].append((feature, JJ))
							self.opinion_sentences.append((feature, JJ))
		
	def __init_corpora(self):
		self.negation_words = WordListCorpusReader('../data/corpora/', 'negation_words')
		self.sent_ends = WordListCorpusReader('../data/corpora', 'sent_ends')
		self.negative_sentiments = WordListCorpusReader('../data/corpora/sentiment-lexicon', 'negative-words.txt')
		self.positive_sentiments = WordListCorpusReader('../data/corpora/sentiment-lexicon', 'positive-words.txt')

					
	def remove_uncertain_features(self):
		pass
	"""
		Todo: concat consecutive JJ's (Opt.) 
		      Remove meaningless JJ's (95% done.)
		      Implement lemmatizing while checking JJ's
		      Stop scanning for JJ's, after the period or ',' or other sentence ends (done.)
		      Negation of opinions. (done.)
		      (Opt.) Append (RR, RB) to the JJ
		      Special treatment for NOUNS in pros
			Fix neg bug
	"""
	def get_nearest_JJ(self, tags, n_index):
		'''
		Find the adjective nearest to the noun at n_index: scan forward
		first, then backward, stopping at sentence ends and tracking
		negation words and sentiment-lexicon hits.
		Returns a (sentiment, negation_word, adjective) tuple.
		'''
		adj = ''
		neg = ''
		sentiment = None
		for i in xrange(n_index + 1, len(tags)):
			(w, t) = tags[i]
			if w in self.sent_ends.words():
				break
			if w in self.negation_words.words():
				neg = w
			if t in ['JJ', 'JJR', 'JJS']:
				adj = w
			if unicode.encode(w) in self.negative_sentiments.words():
				adj = w
				sentiment = False
			if unicode.encode(w) in self.positive_sentiments.words():
				adj = w
				sentiment = True
				break
		start = n_index
		if len(adj) < 1:
			end = -1
			neg = ''
		else:
			end = n_index - (i - n_index) - 1
		for j in xrange(start, end, -1):
			(w, t) = tags[j] 
			if w in self.sent_ends.words():
				break
			if w in self.negation_words.words():
				neg = w
			if t in ['JJ', 'JJR', 'JJS']:
				adj = w
			if unicode.encode(w) in self.negative_sentiments.words():
				adj = w
				sentiment = False
			if unicode.encode(w) in self.positive_sentiments.words():
				adj = w
				sentiment = True
				break
		if len(neg) > 1:
			sentiment = not sentiment
		return (sentiment, neg, adj) 
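The constructor expects features as (feature, count) tuples and feature_sentences as dicts with 'sentence', 'nouns', 'noun_phrases' and 'tags' keys. A Python 2 sketch of that input shape follows; the data is invented for illustration, and the word lists under ../data/corpora/ must exist for the class to run:

features = [(u'battery', 12)]
feature_sentences = [{
    'sentence': u'The battery life is not great.',
    'nouns': [u'battery', u'life'],
    'noun_phrases': [u'battery life'],
    'tags': [(u'The', 'DT'), (u'battery', 'NN'), (u'life', 'NN'),
             (u'is', 'VBZ'), (u'not', 'RB'), (u'great', 'JJ'), (u'.', '.')],
}]
finder = OpinionSentenceFinder(features, feature_sentences)
# finder.opinion_sentences collects (feature, (sentiment, negation, adjective)) tuples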
Example #9
class OpinionSentenceCollector:
    def __init__(self, features, feature_sentences):
        self.features = features
        self.feature_sentences = feature_sentences
        self.opinion_sentences = []
        self.opinion_features = []

        self.init_corpus()

        for sentence_index in xrange(len(self.feature_sentences)):
            sentence = self.feature_sentences[sentence_index]
            self.feature_sentences[sentence_index]['opinion_sentence'] = []
            for feature in self.features:
                #Extracting the feature from (feature, count) tuple
                feature = feature[0]
                if feature in sentence['nouns'] or feature in sentence['noun_phrases']:
                    for tag_index in xrange(len(sentence['tags'])):
                        (word, tag) = sentence['tags'][tag_index]
                        if word.find(feature.split()[0]) > -1:
                            (sentiment_score, opinion) = self.calculate_sent_score(sentence['tags'], tag_index)
                            if len(opinion) > 0:
                                self.opinion_features.append(feature)
                                self.opinion_sentences.append((feature, sentiment_score, sentence['sentence']))

    def init_corpus(self):
        self.negation_words = WordListCorpusReader('../data/corpus/', 'negation-words.txt')
        self.negative_sentiments = WordListCorpusReader('../data/corpus/', 'negative-words.txt')
        self.positive_sentiments = WordListCorpusReader('../data/corpus/', 'positive-words.txt')


    def calculate_sent_score(self, tags, tag_index):
        '''
        Count positive and negative sentiment hits among the adjectives
        around the feature at tag_index, flipping the polarity of a hit
        when a negation word was seen earlier in the scan, and return
        (positive_count - negative_count, last_matched_adjective).
        '''

        positive_sentiment_score = 0
        negative_sentiment_score = 0
        adjective = ''
        negation_words = ''

        for i in xrange(tag_index + 1, len(tags)):
            (word, tag) = tags[i]
            if word in self.negation_words.words():
                negation_words = word
            if tag in ['JJ', 'JJR', 'JJS']:
                adjective = word
                if word in self.negative_sentiments.words():
                    adjective = word
                    if not len(negation_words) > 0:
                        negative_sentiment_score += 1
                    else:
                        positive_sentiment_score += 1
                if word in self.positive_sentiments.words():
                    adjective = word
                    if not len(negation_words) > 0:
                        positive_sentiment_score += 1
                    else:
                        negative_sentiment_score += 1

        start = 0
        negation_words = ''

        for j in xrange(start, tag_index):
            (word, tag) = tags[j]
            if word in self.negation_words.words():
                negation_words = word
            if tag in ['JJ', 'JJR', 'JJS']:
                adjective = word
                if word in self.negative_sentiments.words():
                    adjective = word
                    if not len(negation_words) > 0:
                        negative_sentiment_score += 1
                    else:
                        positive_sentiment_score += 1
                if word in self.positive_sentiments.words():
                    if not len(negation_words) > 0:
                        positive_sentiment_score += 1
                    else:
                        negative_sentiment_score += 1

        final_score = positive_sentiment_score - negative_sentiment_score

        #print "Sentiment Score", final_score, adjective
        return final_score, adjective
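OpinionSentenceCollector takes the same input shape as the sketch above but stores (feature, score, sentence) triples, where the score is the positive-minus-negative lexicon count with negation flipping polarity. A Python 2 sketch; the word lists under ../data/corpus/ must be present:

collector = OpinionSentenceCollector(features, feature_sentences)
for feature, score, sentence in collector.opinion_sentences:
    print feature, score, sentence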
Example #10
class OpinionSentenceFinder:
    def __init__(self, features, feature_sentences):
        self.feature_sentences = feature_sentences
        self.opinion_sentences = []
        self.features = features
        self.__init_corpora()
        for sent_index in xrange(len(self.feature_sentences)):
            sent = self.feature_sentences[sent_index]
            self.feature_sentences[sent_index]['opinion_sent'] = []
            for feature in self.features:
                feature = feature[0]
                if feature in sent['nouns'] or feature in sent['noun_phrases']:
                    for index in xrange(len(sent['tags'])):
                        (w, t) = sent['tags'][index]
                        if w.find(feature.split()[0]) > -1:
                            JJ = self.get_nearest_JJ(sent['tags'], index)
                            self.feature_sentences[sent_index][
                                'opinion_sent'].append((feature, JJ))
                            self.opinion_sentences.append((feature, JJ))

    def __init_corpora(self):
        self.negation_words = WordListCorpusReader('../data/corpora/',
                                                   'negation_words')
        self.sent_ends = WordListCorpusReader('../data/corpora', 'sent_ends')
        self.negative_sentiments = WordListCorpusReader(
            '../data/corpora/sentiment-lexicon', 'negative-words.txt')
        self.positive_sentiments = WordListCorpusReader(
            '../data/corpora/sentiment-lexicon', 'positive-words.txt')

    def remove_uncertain_features(self):
        pass

    """
		Todo: concat consecutive JJ's (Opt.) 
		      Remove meaningless JJ's (95% done.)
		      Implement lemmatizing while checking JJ's
		      Stop scanning for JJ's, after the period or ',' or other sentence ends (done.)
		      Negation of opinions. (done.)
		      (Opt.) Append (RR, RB) to the JJ
		      Special treatment for NOUNS in pros
			Fix neg bug
	"""

    def get_nearest_JJ(self, tags, n_index):
        adj = ''
        neg = ''
        sentiment = None
        for i in xrange(n_index + 1, len(tags)):
            (w, t) = tags[i]
            if w in self.sent_ends.words():
                break
            if w in self.negation_words.words():
                neg = w
            if t in ['JJ', 'JJR', 'JJS']:
                adj = w
            if unicode.encode(w) in self.negative_sentiments.words():
                adj = w
                sentiment = False
            if unicode.encode(w) in self.positive_sentiments.words():
                adj = w
                sentiment = True
                break
        start = n_index
        if len(adj) < 1:
            end = -1
            neg = ''
        else:
            end = n_index - (i - n_index) - 1
        for j in xrange(start, end, -1):
            (w, t) = tags[j]
            if w in self.sent_ends.words():
                break
            if w in self.negation_words.words():
                neg = w
            if t in ['JJ', 'JJR', 'JJS']:
                adj = w
            if unicode.encode(w) in self.negative_sentiments.words():
                adj = w
                sentiment = False
            if unicode.encode(w) in self.positive_sentiments.words():
                adj = w
                sentiment = True
                break
        if len(neg) > 1:
            sentiment = not sentiment
        return (sentiment, neg, adj)