def load_token_list(term_file):
    '''Load a stopword list from the corpus directory.'''
    __location__ = '../corpora/'
    tokens = WordListCorpusReader(__location__, term_file)
    # Escape '+' so the tokens are safe to embed in a regular expression.
    return [w.replace('+', r'\+') for w in tokens.words()]
def load_token_list(term_file):
    '''Load a stopword list from the corpus directory.'''
    __location__ = os.path.join(
        os.path.abspath(os.path.dirname(__file__)), _corpus_root)
    tokens = WordListCorpusReader(__location__, term_file)
    return [w.replace('+', r'\+') for w in tokens.words()]
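A minimal usage sketch, assuming a wordlist named stopwords.txt exists under the corpus directory (the file name and sample text are invented). The '+'-escaping above makes the tokens safe for a simple alternation pattern, though tokens containing other regex metacharacters would still need re.escape:

import re

stop_tokens = load_token_list('stopwords.txt')  # hypothetical wordlist file
pattern = re.compile(r'\b(?:' + '|'.join(stop_tokens) + r')\b')
cleaned = pattern.sub('', 'the c++ and notes+ tokens get stripped')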
def extract_mimetypes(text, do_replace=True):
    '''Pull a list of mimetypes from some text feature.

    Returns the list of mimetypes found in the text block plus the text
    itself, either stripped of mimetypes or unmodified.
    '''
    mimetypes = WordListCorpusReader(_corpus_root, 'mimetypes.txt')
    found_mimetypes = [w for w in mimetypes.words() if w in text]
    if do_replace:
        text = remove_tokens('mimetypes.txt', text)
    return found_mimetypes, text
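A hedged call sketch (the sample string is invented; do_replace=False sidesteps the remove_tokens helper, which is defined elsewhere in the module):

sample = 'served as application/json rather than text/html'
found, text = extract_mimetypes(sample, do_replace=False)
# found -> ['application/json', 'text/html'], if both appear in mimetypes.txt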
def __init_corpora(self):
    self.negation_words = WordListCorpusReader('../data/corpora/', 'negation_words')
    self.sent_ends = WordListCorpusReader('../data/corpora', 'sent_ends')
    self.negative_sentiments = WordListCorpusReader(
        '../data/corpora/sentiment-lexicon', 'negative-words.txt')
    self.positive_sentiments = WordListCorpusReader(
        '../data/corpora/sentiment-lexicon', 'positive-words.txt')
def __init__(self, language, sw_files=None, load_default=True):
    self.language = language
    self.stopwords = []
    # Avoid a mutable default argument for the optional file list.
    if sw_files is None:
        sw_files = []
    if load_default:
        wlcr = WordListCorpusReader(
            data.GzipFileSystemPathPointer(DEFAULT_SW_FILE),
            [language], encoding="utf-8")
        self.stopwords = wlcr.words(language)
        logging.info("Loaded default stopwords from file %s", DEFAULT_SW_FILE)
    path = BASE_SW_PATH + language
    for sw_file in sw_files:
        wlcr = WordListCorpusReader(
            data.FileSystemPathPointer(path), sw_file, encoding="utf-8")
        self.stopwords += wlcr.words(sw_file)
        logging.info("Loaded stopwords from file '%s'", sw_file)
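Only __init__ is shown, so the enclosing class name is unknown; a hedged driving sketch with a hypothetical name StopwordLoader and an invented extra wordlist file:

# StopwordLoader and domain_stopwords.txt are hypothetical stand-ins.
loader = StopwordLoader('english', sw_files=['domain_stopwords.txt'])
tokens = ['the', 'corpus', 'reader']
content = [t for t in tokens if t not in loader.stopwords]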
@author: ake
@software: PyCharm Community Edition
@time: 2016/4/28 9:32
"""
import re
from gensim import corpora, models, similarities
import xml.etree.ElementTree as Et
from GetData.preprocess import getdata, ltp, ltp_pos
from nltk.corpus import WordListCorpusReader
import jieba
import jieba.posseg as pseg
import logging

# Load the stopword list.
STOP_PATH = r'D:\MyProject\pythonProjects\TopicMine\LDA_T\data\\'
stopwords = set(WordListCorpusReader(STOP_PATH, 'stopwords.txt').words())


def parse_lda_xml(file):
    with open(file, 'r', encoding='utf-8') as f:
        # Raw XML text; it may contain several XML documents, split on blank lines.
        xml_raw = f.read().strip().split('\n\n')
    docs = []  # collected results
    for doc in xml_raw:
        xml = Et.fromstring(doc)
        doc_words = []
        for sentence in xml.findall('./doc/para/sent'):  # iterate over the sentences
            word_list = [words for words in sentence]  # collect the word elements
            wordsall = []  # the sentence's matched words
            for word in word_list:  # parse each word element
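The module wires up jieba plus a stopword set; a minimal sketch of the filtering step this setup is building toward (the sample sentence is invented):

text = '主题模型可以从文档集合中发现潜在的话题'
tokens = [t for t in jieba.cut(text) if t.strip() and t not in stopwords]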
def read_stopwords(path):
    '''Read a stopword list with NLTK's wordlist corpus reader.'''
    root, fileid = os.path.split(path)
    stopwords = WordListCorpusReader(root, [fileid])
    return stopwords.words(fileid)
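A quick call sketch (the path is hypothetical; any newline-delimited wordlist works):

words = read_stopwords('data/stopwords.txt')  # hypothetical path
print(len(words), words[:5])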
class OpinionSentenceFinder:
    def __init__(self, features, feature_sentences):
        self.feature_sentences = feature_sentences
        self.opinion_sentences = []
        self.features = features
        self.__init_corpora()
        for sent_index in range(len(self.feature_sentences)):
            sent = self.feature_sentences[sent_index]
            self.feature_sentences[sent_index]['opinion_sent'] = []
            for feature in self.features:
                # Each feature arrives as a (feature, count) tuple.
                feature = feature[0]
                if feature in sent['nouns'] or feature in sent['noun_phrases']:
                    for index in range(len(sent['tags'])):
                        (w, t) = sent['tags'][index]
                        if w.find(feature.split()[0]) > -1:
                            JJ = self.get_nearest_JJ(sent['tags'], index)
                            self.feature_sentences[sent_index]['opinion_sent'].append((feature, JJ))
                            self.opinion_sentences.append((feature, JJ))

    def __init_corpora(self):
        self.negation_words = WordListCorpusReader('../data/corpora/', 'negation_words')
        self.sent_ends = WordListCorpusReader('../data/corpora', 'sent_ends')
        self.negative_sentiments = WordListCorpusReader(
            '../data/corpora/sentiment-lexicon', 'negative-words.txt')
        self.positive_sentiments = WordListCorpusReader(
            '../data/corpora/sentiment-lexicon', 'positive-words.txt')

    def remove_uncertain_features(self):
        pass

    """
    Todo:
        concat consecutive JJ's
        (Opt.) Remove meaningless JJ's (95% done.)
        Implement lemmatizing while checking JJ's
        Stop scanning for JJ's after the period, ',', or other sentence ends (done.)
        Negation of opinions. (done.)
        (Opt.) Append (RR, RB) to the JJ
        Special treatment for NOUNS in pros
        Fix neg bug
    """

    def get_nearest_JJ(self, tags, n_index):
        adj = ''
        neg = ''
        sentiment = None
        # Scan forward from the noun until a sentence end or a positive hit.
        for i in range(n_index + 1, len(tags)):
            (w, t) = tags[i]
            if w in self.sent_ends.words():
                break
            if w in self.negation_words.words():
                neg = w
            if t in ['JJ', 'JJR', 'JJS']:
                adj = w
            if w in self.negative_sentiments.words():
                adj = w
                sentiment = False
            if w in self.positive_sentiments.words():
                adj = w
                sentiment = True
                break
        start = n_index
        if len(adj) < 1:
            end = -1
            neg = ''
        else:
            # Mirror the forward window when scanning backward.
            end = n_index - (i - n_index) - 1
        for j in range(start, end, -1):
            (w, t) = tags[j]
            if w in self.sent_ends.words():
                break
            if w in self.negation_words.words():
                neg = w
            if t in ['JJ', 'JJR', 'JJS']:
                adj = w
            if w in self.negative_sentiments.words():
                adj = w
                sentiment = False
            if w in self.positive_sentiments.words():
                adj = w
                sentiment = True
                break
        if len(neg) > 1:
            sentiment = not sentiment
        return (sentiment, neg, adj)
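The scanning logic is easier to see in isolation. Below is a self-contained sketch of the same forward scan, with the corpus readers replaced by plain in-memory sets (all word lists and the tagged sentence are invented):

SENT_ENDS = {'.', ',', ';'}
NEGATIONS = {'not', 'never'}
POSITIVE = {'great', 'sharp'}
NEGATIVE = {'poor', 'blurry'}

def nearest_opinion(tags, n_index):
    '''Forward scan from the noun at n_index for the nearest opinion adjective.'''
    adj, neg, sentiment = '', '', None
    for w, t in tags[n_index + 1:]:
        if w in SENT_ENDS:
            break                      # stop at the sentence boundary
        if w in NEGATIONS:
            neg = w                    # remember a negation on the way
        if t in ('JJ', 'JJR', 'JJS'):
            adj = w
        if w in NEGATIVE:
            adj, sentiment = w, False
        if w in POSITIVE:
            adj, sentiment = w, True
            break
    if neg:
        sentiment = not sentiment      # a negation flips the polarity
    return sentiment, neg, adj

tagged = [('screen', 'NN'), ('is', 'VBZ'), ('not', 'RB'), ('great', 'JJ'), ('.', '.')]
print(nearest_opinion(tagged, 0))  # (False, 'not', 'great')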
def init_corpus(self):
    self.negation_words = WordListCorpusReader('../data/corpus/', 'negation-words.txt')
    self.negative_sentiments = WordListCorpusReader('../data/corpus/', 'negative-words.txt')
    self.positive_sentiments = WordListCorpusReader('../data/corpus/', 'positive-words.txt')
class OpinionSentenceCollector:
    def __init__(self, features, feature_sentences):
        self.features = features
        self.feature_sentences = feature_sentences
        self.opinion_sentences = []
        self.opinion_features = []
        self.init_corpus()
        for sentence_index in range(len(self.feature_sentences)):
            sentence = self.feature_sentences[sentence_index]
            self.feature_sentences[sentence_index]['opinion_sentence'] = []
            for feature in self.features:
                # Extract the feature from the (feature, count) tuple.
                feature = feature[0]
                if feature in sentence['nouns'] or feature in sentence['noun_phrases']:
                    for tag_index in range(len(sentence['tags'])):
                        (word, tag) = sentence['tags'][tag_index]
                        if word.find(feature.split()[0]) > -1:
                            (sentiment_score, opinion) = self.calculate_sent_score(
                                sentence['tags'], tag_index)
                            if len(opinion) > 0:
                                self.opinion_features.append(feature)
                                self.opinion_sentences.append(
                                    (feature, sentiment_score, sentence['sentence']))

    def init_corpus(self):
        self.negation_words = WordListCorpusReader('../data/corpus/', 'negation-words.txt')
        self.negative_sentiments = WordListCorpusReader('../data/corpus/', 'negative-words.txt')
        self.positive_sentiments = WordListCorpusReader('../data/corpus/', 'positive-words.txt')

    def calculate_sent_score(self, tags, tag_index):
        positive_sentiment_score = 0
        negative_sentiment_score = 0
        adjective = ''
        negation_words = ''
        # Scan forward from the matched noun.
        for i in range(tag_index + 1, len(tags)):
            (word, tag) = tags[i]
            if word in self.negation_words.words():
                negation_words = word
            if tag in ['JJ', 'JJR', 'JJS']:
                adjective = word
            if word in self.negative_sentiments.words():
                adjective = word
                if not len(negation_words) > 0:
                    negative_sentiment_score += 1
                else:
                    positive_sentiment_score += 1
            if word in self.positive_sentiments.words():
                adjective = word
                if not len(negation_words) > 0:
                    positive_sentiment_score += 1
                else:
                    negative_sentiment_score += 1
        # Scan backward over the words before the noun.
        start = 0
        negation_words = ''
        for j in range(start, tag_index):
            (word, tag) = tags[j]
            if word in self.negation_words.words():
                negation_words = word
            if tag in ['JJ', 'JJR', 'JJS']:
                adjective = word
            if word in self.negative_sentiments.words():
                adjective = word
                if not len(negation_words) > 0:
                    negative_sentiment_score += 1
                else:
                    positive_sentiment_score += 1
            if word in self.positive_sentiments.words():
                if not len(negation_words) > 0:
                    positive_sentiment_score += 1
                else:
                    negative_sentiment_score += 1
        final_score = positive_sentiment_score - negative_sentiment_score
        # print("Sentiment Score", final_score, adjective)
        return final_score, adjective
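The constructor expects a specific shape for its inputs; a sketch of that shape (all values invented, and the relative '../data/corpus/' wordlists must exist for init_corpus to succeed):

features = [('battery', 12), ('screen', 9)]  # (feature, mention count)
feature_sentences = [{
    'sentence': 'The battery life is poor.',
    'nouns': ['battery', 'life'],
    'noun_phrases': ['battery life'],
    'tags': [('The', 'DT'), ('battery', 'NN'), ('life', 'NN'),
             ('is', 'VBZ'), ('poor', 'JJ'), ('.', '.')],
}]
collector = OpinionSentenceCollector(features, feature_sentences)
print(collector.opinion_sentences)  # e.g. [('battery', -1, 'The battery life is poor.')]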
delete = (config.has_option('cooking', 'delete') and
          config.get('cooking', 'delete').split(REGEX_SEPARATOR)) or ''
keywords = (config.has_option('cooking', 'keywords') and
            config.get('cooking', 'keywords').lower().split(',')) or 'movistar'
oficial_users = (config.has_option('cooking', 'oficial_users') and
                 config.get('cooking', 'oficial_users').lower().split(',')) or 'movistar'
languages = (config.has_option('cooking', 'languages') and
             config.get('cooking', 'languages').lower().split(',')) or 'spanish'
steps = config.get('cooking', 'steps').lower().split(',')
text_field_out = (config.has_option('cooking', 'text_field_out') and
                  config.get('cooking', 'text_field_out').lower()) or ''
text_field_in = (config.has_option('cooking', 'text_field_in') and
                 config.get('cooking', 'text_field_in').lower()) or ''

# Loading serving section
output_fields = (config.has_option('serving', 'output_fields') and
                 config.get('serving', 'output_fields').lower().split(',')) or ''
output_separator = (config.has_option('serving', 'output_separator') and
                    config.get('serving', 'output_separator')) or ','

# Reading corpus
data.path = [os.getcwd()] + data.path
stopwords = WordListCorpusReader(
    data.GzipFileSystemPathPointer('stopwords.zip'), languages)

# Loading dependencies
yamlImport = zipimport.zipimporter('yaml.zip')
yaml = yamlImport.load_module('yaml')
nltkImport = zipimport.zipimporter('nltk.zip')
nltk = nltkImport.load_module('nltk')
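Note that the `has_option(...) and ... or default` idiom leaves mismatched types (a missing 'keywords' yields the string 'movistar', not a list). If this runs on Python 3's configparser, the same defaults can be expressed with the fallback keyword, which also keeps the types consistent; a sketch (the .ini file name is hypothetical, section and option names reused from above):

from configparser import ConfigParser

config = ConfigParser()
config.read('cooking.ini')  # hypothetical file name

# fallback is applied before split, so the default becomes ['movistar'].
keywords = config.get('cooking', 'keywords', fallback='movistar').lower().split(',')
languages = config.get('cooking', 'languages', fallback='spanish').lower().split(',')
output_separator = config.get('serving', 'output_separator', fallback=',')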
from .VerbValencyReader import VerbValencyReader
from .DadeganReader import DadeganReader
from .TreebankReader import TreebankReader
from .WikipediaReader import WikipediaReader
from .SentiPersReader import SentiPersReader
from .TNewsReader import TNewsReader
from .Normalizer import Normalizer
from .InformalNormalizer import InformalNormalizer, InformalLemmatizer
from .Stemmer import Stemmer
from .Lemmatizer import Lemmatizer
from .SentenceTokenizer import SentenceTokenizer
from .WordTokenizer import WordTokenizer
from .SequenceTagger import SequenceTagger, IOBTagger
from .POSTagger import POSTagger, StanfordPOSTagger
from .Chunker import Chunker, RuleBasedChunker, tree2brackets
from .DependencyParser import DependencyParser, MaltParser, TurboParser
from .utils import default_stopwords
from nltk.corpus import WordListCorpusReader

stopwords = WordListCorpusReader('', [default_stopwords], encoding='utf8')


def sent_tokenize(text):
    # Lazily create the tokenizer on first use and cache it on the function.
    if not hasattr(sent_tokenize, 'tokenizer'):
        sent_tokenize.tokenizer = SentenceTokenizer()
    return sent_tokenize.tokenizer.tokenize(text)


def word_tokenize(sentence):
    if not hasattr(word_tokenize, 'tokenizer'):
        word_tokenize.tokenizer = WordTokenizer()
    return word_tokenize.tokenizer.tokenize(sentence)
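This reads like a package __init__ (the reader names match the hazm Persian NLP library); a hedged usage sketch assuming the package is importable as hazm (the sample sentence is invented):

import hazm

print(hazm.word_tokenize('متن نمونه برای توکن‌سازی'))
print(len(hazm.stopwords.words()))  # the module-level wordlist reader defined above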