Code Example #1
import nltk
from nltk.corpus.reader import TaggedCorpusReader

from helpers.extensions import sort_dict_by_value, keys_by_value


# Lower-cases the files and saves them to another location
# lower_files('docs\\brown_hw\\Train\\', 'docs\\brown_hw_lowercase\\Train\\')
# lower_files('docs\\brown_hw\\Test\\', 'docs\\brown_hw_lowercase\\Test\\')

# load_multiple_corpus_files('docs\\brown_hw\\Train\\')
# load_multiple_corpus_files('docs\\brown_hw\\Test\\')


# TaggedCorpusReader for train set
train_root = 'docs\\brown_hw_lowercase\\Train'
train_reader = TaggedCorpusReader(train_root, '.*')

train_words = train_reader.words()

train_word_counts = nltk.FreqDist(train_words)

# Words of the train set that occur only once
train_word_counts_1 = keys_by_value(train_word_counts)

# Tagged words of the train set, as mutable [word, tag] lists
tagged_words_with_unk = [list(x) for x in train_reader.tagged_words()]

# Replaces words that occur only once with 'UNK'
for index, tagged_word in enumerate(tagged_words_with_unk):
    if tagged_word[0] in train_word_counts_1:
        tagged_words_with_unk[index][0] = 'UNK'

# Computes tag frequencies and puts them into a dictionary
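
The snippet is cut off where the tag-frequency dictionary is built. A minimal sketch of that step, using nltk.FreqDist over the UNK-smoothed data (an illustration, not the project's own code), could look like this:

# Illustrative sketch: count how often each tag occurs in the UNK-smoothed
# training data and store the counts in a plain dictionary.
tag_counts = nltk.FreqDist(tag for word, tag in tagged_words_with_unk)
tag_frequencies = dict(tag_counts)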
Code Example #2
import nltk

# Plain-text corpus
from nltk.corpus import PlaintextCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoSimples/'
corpus1 = PlaintextCorpusReader(loc, r'.*\.txt')
print(corpus1.fileids())
print(corpus1.sents())
print(corpus1.words())

# Tagged corpus
from nltk.corpus.reader.tagged import TaggedCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoEtiquetas/'
corpus2 = TaggedCorpusReader(loc, r'.*\.txt')
print(corpus2.fileids())
print(corpus2.words())
print("Palavras etiquetadas: ", corpus2.tagged_words())
print(corpus2.tagged_words('003.txt'))
print("Sentencas diretas:")
for s in corpus2.sents():
    print(' '.join(s))

from nltk.corpus.reader import CategorizedPlaintextCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoCategorias/'
corpus3 = CategorizedPlaintextCorpusReader(loc, r'.*\.txt', cat_file="categorias.txt")
print(corpus3.fileids())
print(corpus3.categories())
print(corpus3.words(categories='brasnam'))

# Stopword definition
stopwords = nltk.corpus.stopwords.words('portuguese')
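
As a follow-on illustration (not part of the original snippet), the Portuguese stopword list defined above could be applied to filter the plain-text corpus before any frequency analysis:

# Keep only alphabetic tokens that are not Portuguese stopwords.
content_words = [w.lower() for w in corpus1.words()
                 if w.isalpha() and w.lower() not in stopwords]
print(nltk.FreqDist(content_words).most_common(10))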
Code Example #3
 files = cityFileSet[city]
 currentCity += 1
 trans = []
 wordGroupDict = dict()
 print('\n' + city + '\n')
 
 currentProgress(currentCity, totalCities, 'cities')
 totalReviews = 0
 
 for file in files:
     corpusdir = corpus_path+file[0:-5]+'/'
     if not os.path.isdir(corpusdir):
         missingCorpus(corpusdir)
     hotelcorpus = TaggedCorpusReader(corpusdir, '.*')
     
     stopset = getDefaultStopset(set(hotelcorpus.words("stopset.txt")))
     
     for review in hotelcorpus.fileids():
         if review == "stopset.txt":
             continue
         content = hotelcorpus.tagged_sents(review)
         if len(content) == 0:
             continue
         totalReviews += 1
         trimmedTokens = []
         for sentences in content:            
             for word, pos in sentences:
                 if word.isalpha() and word.lower() not in stopset:
                     trimmedTokens += [(word.lower(), pos)]
             trans += [lemmatize.getLemmas(trimmedTokens)]
             trimmedTokens = []
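
The getDefaultStopset helper used above is not shown in this snippet. One plausible implementation (an assumption about the missing helper, not the project's actual code) would merge NLTK's built-in English stopwords with the per-hotel words read from stopset.txt:

import nltk

def getDefaultStopset(extra_words):
    # Assumed behaviour: NLTK's English stopwords plus any corpus-specific
    # words passed in (here, the contents of each hotel's stopset.txt).
    return set(nltk.corpus.stopwords.words('english')) | set(extra_words)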
Code Example #4
File: oanc_process.py  Project: dbarbella/analogy
import glob
from nltk.corpus.reader import TaggedCorpusReader


def build_file_list(root_directory):
    '''
    Recursively collects every .txt file under root_directory.
    :return: a list of the matching file paths
    '''
    file_list = glob.glob(root_directory + "/**/*.txt", recursive=True)
    return file_list


# Use the ANC tool to build an nltk version of the data here.
oanc_directory = root + "\\corpora\\oanc\\nltk-data\\travel_guides"  # oanc/nltk-data"
oanc_files = build_file_list(oanc_directory)

# See http://www.nltk.org/howto/corpus.html
oanc_corpus = TaggedCorpusReader(
    oanc_directory, oanc_files,
    sep="_")  # Specify that _ is used as a separator.
print(oanc_corpus.fileids())
x = oanc_corpus.words()[:50]
print(x)
y = oanc_corpus.paras()[:10]
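# Illustration (not in the original script): paras() returns a list of
# paragraphs, each a list of sentences, each a list of token strings, so a
# paragraph can be flattened back into readable text for a quick check.
if y:
    print(" ".join(word for sent in y[0] for word in sent))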
"""
This script is an alternative/demo to scrapeWordHunt.py, but is not
used in this folder.
"""
SOURCE_NAME = "OANC-TRAV"

txt_file_name = "analogy_sentences_OANC-TRAV.txt"
csv_file_name = "analogy_names_OANC-TRAV.csv"
output_handler = open(root + "\\corpora\\extractions\\" + txt_file_name,
                      "w",
                      encoding="utf-8")

# Find the indices of all paragraphs that contain the patterns as listed in