Code Example #1
    def __init_corpora(self):
        # Load the word-list corpora this class relies on: negation cues,
        # sentence-end markers and the positive/negative sentiment lexicon.
        self.negation_words = WordListCorpusReader('../data/corpora/',
                                                   'negation_words')
        self.sent_ends = WordListCorpusReader('../data/corpora', 'sent_ends')
        self.negative_sentiments = WordListCorpusReader(
            '../data/corpora/sentiment-lexicon', 'negative-words.txt')
        self.positive_sentiments = WordListCorpusReader(
            '../data/corpora/sentiment-lexicon', 'positive-words.txt')
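The method above only constructs the readers; nothing is read from disk until .words() is called. A minimal standalone sketch of the same lexicon being consumed (the paths and the polarity helper are assumptions, not part of the original class):

from nltk.corpus import WordListCorpusReader

# Assumed standalone usage: build lookup sets from the same lexicon files.
lexicon = WordListCorpusReader('../data/corpora/sentiment-lexicon',
                               ['negative-words.txt', 'positive-words.txt'])
negative = set(lexicon.words('negative-words.txt'))
positive = set(lexicon.words('positive-words.txt'))

def polarity(token):
    # Naive lexicon lookup: +1 positive, -1 negative, 0 unknown.
    return (token in positive) - (token in negative)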
Code Example #2
    def __init__(self, language, sw_files=[], load_default=True):
        self.language = language
        self.stopwords = []

        # Optionally seed the list with the bundled default stopwords for
        # this language.
        if load_default:
            wlcr = WordListCorpusReader(data.GzipFileSystemPathPointer(DEFAULT_SW_FILE), [language], encoding="utf-8")
            self.stopwords = wlcr.words(language)
            logging.info("Loaded default stopwords from file %s" % DEFAULT_SW_FILE)

        # Extend with any extra word-list files supplied by the caller.
        path = BASE_SW_PATH + language
        for sw_file in sw_files:
            wlcr = WordListCorpusReader(data.FileSystemPathPointer(path), sw_file, encoding="utf-8")
            self.stopwords += wlcr.words(sw_file)
            logging.info("Loaded stopwords from file '%s'" % sw_file)
Code Example #3
def load_token_list(term_file):
    '''
    Load a word list (e.g. stopwords) from the corpus directory and
    escape '+' so the tokens are safe to embed in regular expressions.
    '''
    __location__ = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                _corpus_root)
    tokens = WordListCorpusReader(__location__, term_file)
    return [w.replace('+', r'\+') for w in tokens.words()]
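_corpus_root and the available word-list files are defined elsewhere in this module. The '+' escaping matters because the returned tokens are typically joined into a regular expression; a hedged usage sketch (the file name is an assumption):

import re

# Hypothetical call; 'stopwords.txt' is assumed to exist under _corpus_root.
stoplist = load_token_list('stopwords.txt')
stop_pattern = re.compile(r'\b(?:' + '|'.join(stoplist) + r')\b')
cleaned = stop_pattern.sub('', 'some text to clean')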
Code Example #4
def extract_mimetypes(text, do_replace=True):
    '''
    Pull known mimetypes out of a block of text.

    Returns the list of mimetypes found in the text block, plus the
    text itself, either with those mimetypes removed (do_replace=True)
    or unmodified.
    '''
    mimetypes = WordListCorpusReader(_corpus_root, 'mimetypes.txt')

    found_mimetypes = [w for w in mimetypes.words() if w in text]

    if do_replace:
        text = remove_tokens('mimetypes.txt', text)

    return found_mimetypes, text
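remove_tokens and _corpus_root come from the same module as this function. A hedged example of the call (the sample text, and the assumption that application/json appears in mimetypes.txt, are illustrative only):

# Illustrative call; assumes 'application/json' is listed in mimetypes.txt.
found, cleaned = extract_mimetypes('payload served as application/json')
# found   -> ['application/json']
# cleaned -> the input text with the mimetype stripped out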
Code Example #5
"""
@author: ake
@software: PyCharm Community Edition
@time: 2016/4/28 9:32
"""
import re
from gensim import corpora, models, similarities
import xml.etree.ElementTree as Et
from GetData.preprocess import getdata, ltp, ltp_pos
from nltk.corpus import WordListCorpusReader
import jieba
import jieba.posseg as pseg
import logging

# Load the stopword list
STOP_PATH = r'D:\MyProject\pythonProjects\TopicMine\LDA_T\data\\'
stopwords = set(WordListCorpusReader(STOP_PATH, 'stopwords.txt').words())


#
def parse_lda_xml(file):
    with open(file, 'r', encoding='utf-8') as f:
        # The file may contain several XML documents separated by blank lines.
        xml_raw = f.read().strip().split('\n\n')
    docs = []  # parsed documents
    for doc in xml_raw:
        xml = Et.fromstring(doc)
        doc_words = []
        for sentence in xml.findall('./doc/para/sent'):  # iterate over sentences
            word_list = [words for words in sentence]  # collect the word elements
            wordsall = []  # matches for this sentence
            for word in word_list:  # parse each word element
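The snippet is cut off inside parse_lda_xml. The stopword set and the jieba import at the top of the module suggest the usual filtering pattern; the helper below is an assumed sketch, not code from the original file:

# Assumed sketch: segment Chinese text with jieba and drop stopwords.
def tokenize_and_filter(text):
    return [tok for tok in jieba.cut(text)
            if tok.strip() and tok not in stopwords]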
Code Example #6
File: nlp.py  Project: tangxiangru/Factoid-QA-subtask
def read_stopwords(path):
    '''Read a stopword list with NLTK.
    '''
    root, fileid = os.path.split(path)
    stopwords = WordListCorpusReader(root, [fileid])
    return stopwords.words(fileid)
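os.path.split separates the directory (used as the corpus root) from the file name (used as the fileid). A hedged call, with an assumed path:

# Hypothetical call; the path to the stopword file is an assumption.
stop_set = set(read_stopwords('data/stopwords.txt'))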
Code Example #7
delete = (config.has_option('cooking', 'delete')
          and config.get('cooking', 'delete').split(REGEX_SEPARATOR)) or ''
keywords = (config.has_option('cooking', 'keywords') and config.get(
    'cooking', 'keywords').lower().split(',')) or 'movistar'
oficial_users = (config.has_option('cooking', 'oficial_users') and config.get(
    'cooking', 'oficial_users').lower().split(',')) or 'movistar'
languages = (config.has_option('cooking', 'languages') and config.get(
    'cooking', 'languages').lower().split(',')) or 'spanish'
steps = config.get('cooking', 'steps').lower().split(',')
text_field_out = (config.has_option('cooking', 'text_field_out')
                  and config.get('cooking', 'text_field_out').lower()) or ''
text_field_in = (config.has_option('cooking', 'text_field_in')
                 and config.get('cooking', 'text_field_in').lower()) or ''

# Loading serving section
output_fields = (config.has_option('serving', 'output_fields') and config.get(
    'serving', 'output_fields').lower().split(',')) or ''
output_separator = (config.has_option('serving', 'output_separator')
                    and config.get('serving', 'output_separator')) or ','

# Reading corpus
data.path = [os.getcwd()] + data.path
stopwords = WordListCorpusReader(
    data.GzipFileSystemPathPointer('stopwords.zip'), languages)

# Loading dependencies
yamlImport = zipimport.zipimporter('yaml.zip')
yalm = yamlImport.load_module('yaml')
nltkImport = zipimport.zipimporter('nltk.zip')
nltk = nltkImport.load_module('nltk')
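With the configuration loaded, the stopwords reader built from stopwords.zip can be queried per language. A minimal sketch (it assumes 'spanish' is one of the fileids packaged in the archive):

# Assumed usage; 'spanish' must be a fileid inside stopwords.zip.
spanish_stopwords = set(stopwords.words('spanish'))

def strip_stopwords(tokens):
    return [t for t in tokens if t.lower() not in spanish_stopwords]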
Code Example #8
from .VerbValencyReader import VerbValencyReader
from .DadeganReader import DadeganReader
from .TreebankReader import TreebankReader
from .WikipediaReader import WikipediaReader
from .SentiPersReader import SentiPersReader
from .TNewsReader import TNewsReader
from .Normalizer import Normalizer
from .InformalNormalizer import InformalNormalizer, InformalLemmatizer
from .Stemmer import Stemmer
from .Lemmatizer import Lemmatizer
from .SequenceTagger import SequenceTagger, IOBTagger
from .POSTagger import POSTagger, StanfordPOSTagger
from .Chunker import Chunker, RuleBasedChunker, tree2brackets
from .DependencyParser import DependencyParser, MaltParser, TurboParser
from .SentenceTokenizer import SentenceTokenizer
from .WordTokenizer import WordTokenizer

from .utils import default_stopwords
from nltk.corpus import WordListCorpusReader
stopwords = WordListCorpusReader('', [default_stopwords], encoding='utf8')


def sent_tokenize(text):
    if not hasattr(sent_tokenize, 'tokenizer'):
        sent_tokenize.tokenizer = SentenceTokenizer()
    return sent_tokenize.tokenizer.tokenize(text)


def word_tokenize(sentence):
    if not hasattr(word_tokenize, 'tokenizer'):
        word_tokenize.tokenizer = WordTokenizer()
    return word_tokenize.tokenizer.tokenize(sentence)
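A short sketch of how the module-level helpers above fit together (assumed usage, not part of the package itself):

# Illustrative only: split text into sentences, tokenize, drop stopwords.
def content_words(text):
    stop_set = set(stopwords.words())
    return [token
            for sent in sent_tokenize(text)
            for token in word_tokenize(sent)
            if token not in stop_set]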