def __init_corpora(self):
    self.negation_words = WordListCorpusReader('../data/corpora/', 'negation_words')
    self.sent_ends = WordListCorpusReader('../data/corpora', 'sent_ends')
    self.negative_sentiments = WordListCorpusReader(
        '../data/corpora/sentiment-lexicon', 'negative-words.txt')
    self.positive_sentiments = WordListCorpusReader(
        '../data/corpora/sentiment-lexicon', 'positive-words.txt')
def __init__(self, language, sw_files=[], load_default=True):
    self.language = language
    self.stopwords = []
    if load_default:
        wlcr = WordListCorpusReader(data.GzipFileSystemPathPointer(DEFAULT_SW_FILE),
                                    [language], encoding="utf-8")
        self.stopwords = wlcr.words(language)
        logging.info("Loaded default stopwords from file %s" % DEFAULT_SW_FILE)
    path = BASE_SW_PATH + language
    for sw_file in sw_files:
        wlcr = WordListCorpusReader(data.FileSystemPathPointer(path), sw_file,
                                    encoding="utf-8")
        self.stopwords += wlcr.words(sw_file)
        logging.info("Loaded stopwords from file '%s'" % sw_file)
def load_token_list(term_file):
    ''' load some stopword list from the corpus '''
    __location__ = os.path.join(os.path.abspath(os.path.dirname(__file__)), _corpus_root)
    tokens = WordListCorpusReader(__location__, term_file)
    return [w.replace('+', r'\+') for w in tokens.words()]
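# Usage sketch (not from the original source): the file name 'stopwords_en.txt'
# is hypothetical, and the regex join is an assumption about why '+' is escaped
# above, presumably so the tokens can be dropped into a pattern safely.
import re

tokens = load_token_list('stopwords_en.txt')
token_pattern = re.compile(r'\b(?:' + '|'.join(tokens) + r')\b')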
def extract_mimetypes(text, do_replace=True):
    '''
    pull a list of mimetypes from some text feature

    return a list of mimetypes in the text block
    and the text, without mimetypes or unmodified
    '''
    mimetypes = WordListCorpusReader(_corpus_root, 'mimetypes.txt')
    found_mimetypes = [w for w in mimetypes.words() if w in text]
    if do_replace:
        text = remove_tokens('mimetypes.txt', text)
    return found_mimetypes, text
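# Usage sketch, assuming 'application/pdf' appears as an entry in mimetypes.txt
# (the sample sentence is illustrative only, not from the original code):
mimes, cleaned = extract_mimetypes('Download the report as application/pdf.',
                                   do_replace=False)
# mimes -> ['application/pdf'] under that assumption; cleaned is the text
# unchanged, because do_replace is False.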
"""
@author: ake
@software: PyCharm Community Edition
@time: 2016/4/28 9:32
"""
import re
from gensim import corpora, models, similarities
import xml.etree.ElementTree as Et
from GetData.preprocess import getdata, ltp, ltp_pos
from nltk.corpus import WordListCorpusReader
import jieba
import jieba.posseg as pseg
import logging

# load stopwords
STOP_PATH = r'D:\MyProject\pythonProjects\TopicMine\LDA_T\data\\'
stopwords = set(WordListCorpusReader(STOP_PATH, 'stopwords.txt').words())


def parse_lda_xml(file):
    with open(file, 'r', encoding='utf-8') as f:
        xml_raw = f.read().strip().split('\n\n')  # raw XML text; may contain several XML documents, split on blank lines
    docs = []  # collected results
    for doc in xml_raw:
        xml = Et.fromstring(doc)
        doc_words = []
        for sentence in xml.findall('./doc/para/sent'):  # iterate over sentences
            word_list = [words for words in sentence]  # collect the word elements
            wordsall = []  # matches for this sentence
            for word in word_list:  # parse each word element
def read_stopwords(path):
    '''Read a stopword list using nltk.'''
    root, fileid = os.path.split(path)
    stopwords = WordListCorpusReader(root, [fileid])
    return stopwords.words(fileid)
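# Minimal usage sketch; the path below is hypothetical and just needs to point
# at a plain one-word-per-line file:
stop_list = read_stopwords('data/stopwords_zh.txt')
stop_set = set(stop_list)  # a set makes membership tests during filtering cheap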
delete = (config.has_option('cooking', 'delete') and
          config.get('cooking', 'delete').split(REGEX_SEPARATOR)) or ''
keywords = (config.has_option('cooking', 'keywords') and
            config.get('cooking', 'keywords').lower().split(',')) or 'movistar'
oficial_users = (config.has_option('cooking', 'oficial_users') and
                 config.get('cooking', 'oficial_users').lower().split(',')) or 'movistar'
languages = (config.has_option('cooking', 'languages') and
             config.get('cooking', 'languages').lower().split(',')) or 'spanish'
steps = config.get('cooking', 'steps').lower().split(',')
text_field_out = (config.has_option('cooking', 'text_field_out') and
                  config.get('cooking', 'text_field_out').lower()) or ''
text_field_in = (config.has_option('cooking', 'text_field_in') and
                 config.get('cooking', 'text_field_in').lower()) or ''

# Loading serving section
output_fields = (config.has_option('serving', 'output_fields') and
                 config.get('serving', 'output_fields').lower().split(',')) or ''
output_separator = (config.has_option('serving', 'output_separator') and
                    config.get('serving', 'output_separator')) or ','

# Reading corpus
data.path = [os.getcwd()] + data.path
stopwords = WordListCorpusReader(
    data.GzipFileSystemPathPointer('stopwords.zip'), languages)

# Loading dependencies
yamlImport = zipimport.zipimporter('yaml.zip')
yalm = yamlImport.load_module('yaml')
nltkImport = zipimport.zipimporter('nltk.zip')
nltk = nltkImport.load_module('nltk')
from .VerbValencyReader import VerbValencyReader
from .DadeganReader import DadeganReader
from .TreebankReader import TreebankReader
from .WikipediaReader import WikipediaReader
from .SentiPersReader import SentiPersReader
from .TNewsReader import TNewsReader
from .Normalizer import Normalizer
from .InformalNormalizer import InformalNormalizer, InformalLemmatizer
from .Stemmer import Stemmer
from .Lemmatizer import Lemmatizer
from .SequenceTagger import SequenceTagger, IOBTagger
from .POSTagger import POSTagger, StanfordPOSTagger
from .Chunker import Chunker, RuleBasedChunker, tree2brackets
from .DependencyParser import DependencyParser, MaltParser, TurboParser
from .utils import default_stopwords
# the two imports below are assumed; the helpers further down reference these
# classes, which are not imported anywhere else in this snippet
from .SentenceTokenizer import SentenceTokenizer
from .WordTokenizer import WordTokenizer
from nltk.corpus import WordListCorpusReader

stopwords = WordListCorpusReader('', [default_stopwords], encoding='utf8')


def sent_tokenize(text):
    if not hasattr(sent_tokenize, 'tokenizer'):
        sent_tokenize.tokenizer = SentenceTokenizer()
    return sent_tokenize.tokenizer.tokenize(text)


def word_tokenize(sentence):
    if not hasattr(word_tokenize, 'tokenizer'):
        word_tokenize.tokenizer = WordTokenizer()
    return word_tokenize.tokenizer.tokenize(sentence)
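# Usage sketch (illustrative only, sample text is not from the original code):
# the helpers above cache their tokenizer on the function object via hasattr,
# so repeated calls reuse a single instance.
for sent in sent_tokenize('This is one sentence. This is another.'):
    print(word_tokenize(sent))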