Code Example #1
0
File: Statistics.py Project: jplahn/NLP-Capstone
def main():
	current_directory = os.path.dirname(__file__)
	corpus_root = os.path.abspath(current_directory)
	wordlists = PlaintextCorpusReader(corpus_root, 'Islip13Rain/.*\.txt')
	wordlists.fileids()
	ClassEvent = nltk.Text(wordlists.words())
	CEWords = ["Long Island", "Weather Service", "flooding", "August", 
		"heavy rains", "Wednesday", "Suffolk County", "New York", "rainfall",
		"record"]

	# ClassEvent Statistics
	print "--------- CLASS EVENT STATISTICS -------------"
	print "ClassEvent non stopwords", non_stopword_fraction(ClassEvent)	
	print "ClassEvent WORD LENGTH DISTRIBUTIONS:"
	print_word_length_distributions(ClassEvent)
	print "ClassEvent PERCENTAGE OF WORD OCCURRENCES:"
	print_percentage_of_word_in_collection(ClassEvent, CEWords)
	
	ClassEventLettersPerWord = average_letters_per_word(ClassEvent)
	ClassEventWordsPerSent = len(wordlists.words()) / len(wordlists.sents())
	ClassEventARI = (4.71 * ClassEventLettersPerWord) + (0.5 * \
		ClassEventWordsPerSent) - 21.43
	
	print "Average number of letters per word", ClassEventLettersPerWord
	print "Average number of words per sentence:", ClassEventWordsPerSent
	print "Automated Readability Index:", ClassEventARI


	print 

	wordlists_event = PlaintextCorpusReader(corpus_root, "Texas_Wild_Fire/.*\.txt")
	wordlists_event.fileids()
	YourSmall = nltk.Text(wordlists_event.words())
	SmallEventWords = ["Fire", "Wildfire", "Water", "Damage", "Ground", "Burn", 
		"Town", "Heat", "Wind", "Speed", "Size", "City", "People", "Home",
		"Weather", "Debris", "Death", "Smoke", "State", "Ash"]
	

	# YourSmall statistics
	print "--------- YOUR SMALL STATISTICS --------------"
	print "Texas_Wild_Fire", non_stopword_fraction(YourSmall)
	print "YourSmall WORD LENGTH DISTRIBUTIONS:"
	print_word_length_distributions(YourSmall)
	print "YourSmall PERCENTAGE OF WORD OCCURRENCES:"
	print_percentage_of_word_in_collection(YourSmall, SmallEventWords)
	
	YourSmallLettersPerWord = average_letters_per_word(YourSmall)
	YourSmallWordsPerSent = len(wordlists_event.words()) / \
		len(wordlists_event.sents())
	YourSmallARI = (4.71 * YourSmallLettersPerWord) + (0.5 * \
		YourSmallWordsPerSent) - 21.43

	print "Average number of letters per word", YourSmallLettersPerWord
	print "Average number of words per sentence:", YourSmallWordsPerSent
	print "Automated Readability Index", YourSmallARI
Code Example #2
0
File: q1.py Project: cmstewart/galv
def textinfo(path):
    """
    Takes a file path and returns figures about the text file contained therein.
    """
    
    from nltk.corpus import PlaintextCorpusReader
    from nltk import FreqDist
    corpusReader = PlaintextCorpusReader(path, '.*')

    print "Total word count:", len([word for sentence in corpusReader.sents() for word in sentence])
    print "Unique words:", len(set(corpusReader.words()))
    print "Sentences:", len(corpusReader.sents())
    print "Average sentence length in words:", (len([word for sentence in corpusReader.sents() for word in sentence]) / len(corpusReader.sents()))
Code Example #3
0
File: views.py Project: prashaantt/savitri-labs
def stats(request):
    errors = []
    statistics=[]
    if 'q' in request.GET:
        q = request.GET['q']
        if not q:
            errors.append('Enter a Canto Number')
        else:
           cantoname = "canto"+q+".txt"
           w=PlaintextCorpusReader("./",cantoname);
           w.words();
           t=nltk.text.Text(w.words());
           l_lines=len(line_tokenize(w.raw()))
           l_uwords=len(set(w.words()))
           l_words=len(w.words())
           l_sents=len(w.sents())
           l_paras=len(w.paras())
           l_linperpara=l_lines/l_paras
           statistics.append("Number of Words - "+ str(l_words))
           statistics.append("Number of Unique Words - "+ str(l_uwords))
           statistics.append("Number of Setences - "+ str(l_sents))
           statistics.append("Number of Lines - "+ str(l_lines))
           statistics.append("Number of Paras - "+ str(l_paras))
           statistics.append("Number of Lines/Paras - "+ str(l_linperpara))
           lexical_density=l_words/l_uwords
           l_wordpersent = l_words/l_sents
           statistics.append("Lexical Density (Total/Uniq) words- "+ str(lexical_density))
           statistics.append("Words per sentence - "+ str(l_wordpersent))
           return render_to_response('stats.html', {'statistics':statistics})
    return render_to_response('stats.html', {'errors': errors})
Code Example #4
0
def get_coarse_level_features(dataset, output_file):
	# Import the corpus reader
	corpus_root = '/home1/c/cis530/data-hw2/'+dataset
	# Define the folder where the files are situated
	files_dataset = PlaintextCorpusReader(corpus_root, '.*')
	# Open the output_file
	output = open('/home1/c/cis530/data-hw2/'+output_file,'w')
	# Read the stopwlist
	stop_list = open('/home1/c/cis530/data-hw2/'+'stopwlist.txt').read()
	types_stop_list=stop_list.split()
	for fileid in files_dataset.fileids():
		# Output the docid
		output.write(dataset+'/'+fileid+' ')
		# Output the topic_name
		topic_name=fileid.split('/')[0]	
		output.write(topic_name+' ')
		# Output the num_tokens	
		tokens=files_dataset.words(fileid)
		output.write('tok:'+str(len(tokens))+' ')
		# Output the num_types
		types=set(tokens)
		output.write('typ:'+str(len(types))+' ')
		# Output the num_contents
		output.write('con:'+str(len([w for w in tokens if w not in types_stop_list]))+' ')
		# Output the num_sents
		sents = files_dataset.sents(fileid)
		output.write('sen:'+str(len(sents))+' ')
		# Output the avg_slen
		avg_slen=round(float(len(tokens))/float(len(sents)),2)
		output.write('len:'+str(avg_slen)+' ')
		# Output the num_caps
		output.write('cap:'+str(len([w for w in tokens if w[0]>='A' and w[0]<='Z'])))
		output.write('\n')
	output.close()
Code Example #5
0
File: views.py Project: prashaantt/savitri-labs
def compare(request):
    errors = []
    stats=[]
    for x in range(1,3):
           statistics=[]
           cantoname = "canto"+str(x)+".txt"
           w=PlaintextCorpusReader("./",cantoname);
           w.words();
           t=nltk.text.Text(w.words());
           l_lines=len(line_tokenize(w.raw()))
           l_uwords=len(set(w.words()))
           l_words=len(w.words())
           l_sents=len(w.sents())
           l_paras=len(w.paras())
           l_linperpara=l_lines/l_paras
           statistics.append(x)
           statistics.append("Number of Words - "+ str(l_words))
           statistics.append("Number of Unique Words - "+ str(l_uwords))
           statistics.append("Number of Setences - "+ str(l_sents))
           statistics.append("Number of Lines - "+ str(l_lines))
           statistics.append("Number of Paras - "+ str(l_paras))
           statistics.append("Number of Lines/Paras - "+ str(l_linperpara))
           lexical_density=l_words/l_uwords
           l_wordpersent = l_words/l_sents
           statistics.append("Lexical Density (Total/Uniq) words- "+ str(lexical_density))
           statistics.append("Words per sentence - "+ str(l_wordpersent))
           stats.append(statistics)
           
    return render_to_response('compare.html', {'stats':stats})
Code Example #6
0
def main():
    corpus_root = '../posts/'
    newcorpus = PlaintextCorpusReader(corpus_root, '.*',
                                      para_block_reader=read_block_no_metadata)
    corpus_words = [w.lower() for w in newcorpus.words() if w.isalpha()]
    corpus_sentences = newcorpus.sents()
    analyst = TextAnalyst(corpus_words, corpus_sentences, 'french')
    analyst.print_analyze()
Code Example #7
0
File: chunker.py Project: AdamMeyers/The_Termolator
def extractPossibleTerms(root, fileids):
    # get corpus
    #root, filename = os.path.split(path)
    reader = PlaintextCorpusReader(root, fileids)
    # get chunker
    grammar = 'NP: {<JJ>*<NNP>*<NN>*}'
    chunker = RegexpParser(grammar)
    # get terms
    terms = set()
    print len(reader.sents())
    i = 0
    for sent in reader.sents():
        i += 1
        if i%100==0:
            print i
        tree = chunker.parse(pos_tag(sent))
        for t in tree.subtrees(lambda t: t.node!='S'): # exclude Sentence node
            terms.add(' '.join([el[0] for el in t]))
    return terms
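This snippet targets NLTK 2, where subtrees expose a node attribute; in NLTK 3 that attribute was replaced by the label() method, so the subtree filter would read roughly as in this sketch:

for t in tree.subtrees(lambda t: t.label() != 'S'):  # exclude the sentence node
    terms.add(' '.join(el[0] for el in t))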
Code Example #8
0
def get_coarse_level_features(dataset, output_file):
# accessing the corpus
    corpus_root = '/home1/c/cis530/data-hw2/' 
    dataset_path = corpus_root + dataset

# Reading the files from the directories
    files = PlaintextCorpusReader(dataset_path, '.*')
    ids = files.fileids()
    stopFile = PlaintextCorpusReader(corpus_root, 'stopwlist.txt')
    stops = stopFile.words()

#Opening a file that has to be written to
    out = open(output_file, 'w')

    for i in range(0,len(ids)):
#Initializing certain variables
        tokens_count=0
        types = 0
        non_stops_count=0
        sents_count = 0
        avg_sent_len=0
        cap_count = 0

        tokens=files.words(ids[i])
#Computing Number of Tokens
        tokens_count = len(tokens)

#Computing Number of types
        types = len(set(tokens))
        non_stops=[]

#Computing Number of Content Words
        for t in tokens:
            if t not in stops:
                non_stops.append(t)
        non_stops_count = len(non_stops)

#Finding Average Sentence Length
        sent = []
        sent = files.sents(ids[i])
        sents_count = len(sent)
        sent_len=0
        for s in sent:
            sent_len = sent_len + len(s)
        avg_sent_len = sent_len/float(sents_count)

#Computing Number of Captilized Words
        for c in non_stops:
            if c.istitle():
                cap_count = cap_count+1
        current_file = dataset + '/' + ids[i]
        e = current_file.split('/')
        out.write(current_file +' '+ e[-2] + ' tok:' + str(tokens_count) + ' typ:' + \
str(types) + ' con:' + str(non_stops_count) + ' sen:' + str(sents_count) + ' len:' + str(avg_sent_len) + ' cap:' + str(cap_count)+ '\n')
        out.flush()
Code Example #9
0
def train():

   wordlists = PlaintextCorpusReader('', file_path)

   st = stemmer()
   
   # Get blocks of text using NLTK
   words = wordlists.words(file_path)
   sents = wordlists.sents(file_path)
   paras = wordlists.paras(file_path)

   # LOGIC
   #       If a sentence contains a known [posi/nega]tive word, count the instances of words in that sentence as 
   #       [posi/nega]tive

   # Count words
   word_features = []

   # Go through paragraphs
   for p in paras:

      # Classify S
      score_positive_negative = 0
      for s in p:
         for word in s:

            word = st.stem(word)

            if word in words_positive:
               score_positive_negative += 1
            elif word in words_negative:
               score_positive_negative -= 1
   
      # Record class of paragraph for any words present
      for s in p:
         for word in s:

            word = st.stem(word)

            if score_positive_negative > 0:
               word_features.append( ({"word": word}, "+") )
            elif score_positive_negative < 0:
               word_features.append( ({"word": word}, "-") )
            else:
               word_features.append( ({"word": word}, " ") )

   # Create and return classifier
   classifier = nltk.NaiveBayesClassifier.train(word_features)
   return classifier
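A minimal usage sketch for the classifier returned above (stemmer(), file_path and the positive/negative word lists are assumed to be defined elsewhere in this project; "happy" is just an illustrative input):

clf = train()
st = stemmer()
print clf.classify({"word": st.stem("happy")})  # prints "+", "-" or " "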
Code Example #10
0
def main():

   st = stemmer()

   # Get data
   wordlists = PlaintextCorpusReader('', file_path)
   words = wordlists.words(file_path)
   sents = wordlists.sents(file_path)
   paras = wordlists.paras(file_path)

   # Train
   classifier = train()

   # Get class probabilities (for MAP estimation)
   counts = {"P":0, "-":0, "N":0}
   for i in range(0,len(paras)):
      for s in paras[i]:

         score_pos = 0
         score_neg = 0

         # Classify paragraph
         for word in s:

            word = st.stem(word)

            feature = {"word":word}
            classified = classifier.classify(feature)

            if classified == "+":
               score_pos += 1
            elif classified == "-":
               score_neg += 1

         # Record result
         if score_pos > score_neg:
            counts["P"] += 1
         elif score_pos < score_neg:
            counts["N"] += 1
         else:
            counts["-"] += 1

   # Done!
   print counts
Code Example #11
0
    def classifyByYear(self) :
        corpusReader = PlaintextCorpusReader(self.txtDirectory, ".*.txt", encoding = self.codec)

        for journal in corpusReader.fileids() :
            print ("Start " + journal)

            sentList = corpusReader.sents(journal)

            for sent in sentList :
                getMonth = False
                getDOI = False

                line = ''.join(sent)

                if self.doiURLTypes[0] in line :
                    getDOI = True
                    self._extractYearByDOI(self.doiURLTypes[0], journal, line)
                    break
                elif self.doiURLTypes[1] in line :
                    getDOI = True
                    self._extractYearByDOI(self.doiURLTypes[1], journal, line)
                    break

                for word in sent :
                    if getMonth :
                        self._extractYearByMonth(journal, word)
                        break

                    if word.lower() in self.dictMonth :
                        getMonth = True

                if getMonth :
                    getMonth = False
                    break
                elif getDOI :
                    getDOI = False
                    break

            print ("End " + journal)

        print (str(self.yearDirectoryList))
Code Example #12
0
File: utils.py Project: ufal/wiki-error-corpus
def get_sentences_for_text(corpus_root, filename, lang="english"):
    """Segments the given text into sentences.

  Args:
    corpus_root: Directory in which the text file is residing.
    filename: Name of the text file.
    lang: Tokenizer language. For possible values, look at:
    ${NLTK_DATA}/tokenizers/punkt

  Returns:
    Sentences in the given text. 

  """
    tokenizer_path = "tokenizers/punkt/" + lang + ".pickle"
    text = PlaintextCorpusReader(
        corpus_root,
        [filename],
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=nltk.data.LazyLoader(tokenizer_path),
    )
    return text.sents()
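A minimal usage sketch with hypothetical arguments; it prints the first sentence as a list of whitespace-delimited tokens:

sentences = get_sentences_for_text("/tmp/corpus", "article.txt", lang="english")
print(sentences[0])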
Code Example #13
0
File: edgeBundle.py Project: alv53/Alvin_CS398VL_MP3
def network(chapter):
	if(chapter == 0):
		NEs = open("finalNEs/finalNEs.txt").read().split('\n')
		text_raw = open("ofk.txt").read()
	else:
		NEs = open("finalNEs/finalNEs_ch" + str(chapter) + ".txt").read().split('\n')
		text_raw = open("ofk_ch" + str(chapter) + ".txt").read()
	result = [dict(name="", relations=[""])]
	for NE in NEs:
		result.append(dict(name=NE, relations=[""]))

	# The next line is needed because of the extra blank list elements at the beginning and end (the first was added above; the last comes from trailing newlines in finalNEs.txt)
	result = result[1:len(result)-1]
	corpus = PlaintextCorpusReader('.', 'ofk\.txt')
	sentences = corpus.sents()
	for x in range(len(sentences)):
		for NEdict in result:
			if NEdict["name"] in sentences[x]:
	# 			# We are in a sentence with a named entity
				for n in result:
					if n["name"] in sentences[x] and n["name"] != NEdict["name"]:
						NEdict["relations"].append(n["name"])
	for NEdict in result:
		NEdict["relations"] = Set(NEdict["relations"][1:])
	final = [dict(name=r["name"], imports=list(r["relations"]), url=r["name"]+".html") for r in result]
	for finals in final:
		with open("../webpage/" + finals["name"] + ".html", "w") as f1:
			with open("part1.html") as f:
				for line in f:
					f1.write(line)
				f1.write(finals["name"])
			with open("part2.html") as f:
				for line in f:
					f1.write(line)
				f1.write("\tmain(\"data/" + finals["name"] + ".json" + "\");\n</script>")

	with open("../webpage/data/edgeBundle.json",'w') as outfile:
		json.dump(final,outfile, sort_keys = True, indent = 4, ensure_ascii=False)
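Set here presumably comes from Python 2's long-deprecated sets module; with the built-in set type the relation clean-up above would simply be:

	for NEdict in result:
		NEdict["relations"] = set(NEdict["relations"][1:])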
Code Example #14
0
File: graftempo.py Project: barliant/krextown
def build_graph(folder, file_pattern):
    corpus_root = os.getcwd() + "/" + folder
    print "Membuka korpus " + folder + " ..."
    word_lists = PlaintextCorpusReader(corpus_root, file_pattern)

    naskah = word_lists.sents()
    filelists = word_lists.fileids()
    teks = tokenize.sent_tokenize(word_lists.raw(fileids=filelists))

    print folder + " memiliki " + str(len(teks)) + ", " + str(len(naskah)) + " kalimat."

    G_result = nx.Graph()
    print "Membangun graf " + folder + " ..."
    for kalimat in naskah:
        kata = kalimat[0]
        prevToken = kata.lower()
        for idx in range(1, len(kalimat)):
            kata = kalimat[idx]
            token = kata.lower()
            if containsLetter(token) and containsLetter(prevToken):
                G_result.add_edge(prevToken, token)
                prevToken = token

    return G_result
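The inner loop builds edges from adjacent token pairs by index; a sketch of the same idea with nltk.bigrams is shown below (assuming networkx is imported as nx, and containsLetter is the helper already used above). It is not strictly equivalent: the original keeps the previous token when a pair is rejected, so it can bridge across punctuation, while this version only links directly adjacent word tokens.

import nltk
import networkx as nx

def build_graph_bigrams(sentences):
    G = nx.Graph()
    for kalimat in sentences:
        tokens = [kata.lower() for kata in kalimat]
        for prev_token, token in nltk.bigrams(tokens):
            if containsLetter(prev_token) and containsLetter(token):
                G.add_edge(prev_token, token)
    return G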
Code Example #15
0
jacor = PlaintextCorpusReader(corpus_root, 'J.*txt')

# [B3] Print out some basic specs of the two corpora. First off, # of files.
# YOUR CODE BELOW.
print('The length of the Bulgarian essay corpus: ' + str(len(bucor.fileids())))
print('The length of the Japanese essay corpus: ' + str(len(jacor.fileids())))
print()

# [B4] Now, print total # of sentences and # of words.
# YOUR CODE BELOW.
print('The number of words in the Bulgarian essay corpus: ' +
      str(len(bucor.words())))
print('The number of words in the Japanese essay corpus: ' +
      str(len(jacor.words())))
print('The number of sentences in the Bulgarian essay corpus: ' +
      str(len(bucor.sents())))
print('The number of sentences in the Japanese essay corpus: ' +
      str(len(jacor.sents())))

# ------------------------------------------------------------------------
#                                                    BUILDING DATA OBJECTS
print("...Building data objects...")
# ------------------------------------------------------------------------

# [C1] Build lowercased token lists.
# EDIT THE CODE BELOW.
bu_toks = []
for x in bucor.words():
    bu_toks.append(x.lower())

ja_toks = []

Code Example #16
0
for w in sub_words:
    allwords.append(w.lower())

for w in obj_words:
    allwords.append(w.lower())
allwords=nltk.FreqDist(allwords)
word_features=list(allwords.keys())[:300]

#WORDNET DATASET ARCHIVE
from nltk.corpus import wordnet
from nltk.corpus import PlaintextCorpusReader
corpus_root = 'E:\EIGHTH SEMESTER\PROJECT AND THESIS II\SOFTWARE\WORDNET'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
training_data=wordlists.sents('document.txt')


#INPUT TEXTS
sentence = input("Please enter the sentence: ")
print("The input string: ", sentence)
corpora=sentence
wordtoken_test=word_tokenize(sentence)
wordtoken_train=training_data[0]
print("Tokenization of training data: ",wordtoken_train[0])
print("Tokenization of testing data: ",wordtoken_test[0])


#PARTS OF SPEECH TAGGING OF WORDS
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
Code Example #17
0
def read_file(file_path, file_name):

    text = PlaintextCorpusReader(file_path, file_name)
    sents = text.sents()
    words = text.words()
    text_complexity_score(words, sents, file_name)
Code Example #18
0
File: eval.py Project: famaf/PLN_2017
    model_file = str(opts['-i'])

    # Open the file that contains the language model
    f = open(model_file, "rb")

    # Rebuild the object from its byte-string representation
    modelo = pickle.load(f)

    pattern = r'''(?ix)    # set flag to allow verbose regexps
          (?:sr\.|sra\.)
        | (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
        | \w+(?:-\w+)*        # words with optional internal hyphens
        | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
        | \.\.\.            # ellipsis
        | [][.,;"'?():-_`]  # these are separate tokens; includes ]
    '''

    PATH = "./../../Corpus_Language_Modeling"  # Ubicacion del archivo
    FILENAME = "corpus_test.txt"  # Nombre del archivo

    # Load the data
    tokenizer = RegexpTokenizer(pattern)
    corpus = PlaintextCorpusReader(PATH, FILENAME, word_tokenizer=tokenizer)

    sents = corpus.sents()

    print("Perplexity =", modelo.perplexity(sents))

    # Close the file
    f.close()
Code Example #19
0
        if w.isalnum():
            string += ' '
        string += w
    return string


print('Welcome to the extractive single-document summarizer')
iterations = 20  # iterations for textrank - should be 10 or more to converge
n = 2  # output n sentences as summary
corpus_root = './articles/'  # news articles to be summarized
wordlists = PlaintextCorpusReader(corpus_root, '.*')
print_pagerank_values = False

for fileid in wordlists.fileids():
    print('Article: ', fileid)
    sents = wordlists.sents(fileid)
    sim_graph_out = []
    size = len(sents)
    s1_count = 0
    print('Computing sentence similarity graph ...')
    # compute similarity of every sentence to all other sentences
    for i in range(0, size):
        s1 = sents[i]
        edges_out = []
        for j in range(0, size):
            s2 = sents[j]
            sim = similarity(s1, s2)
            if sim > 0 and i != j:
                edges_out.append((i, j, sim))
        sim_graph_out.append(edges_out)
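The similarity function used above is not part of this snippet; a common choice for TextRank-style summarizers is length-normalized word overlap, roughly as in this hypothetical sketch:

import math

def similarity(s1, s2):
    # overlap of the two token sets, normalized by the sentence lengths
    if len(s1) < 2 or len(s2) < 2:
        return 0.0
    overlap = len(set(w.lower() for w in s1) & set(w.lower() for w in s2))
    return overlap / (math.log(len(s1)) + math.log(len(s2)))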
Code Example #20
0
print '7. Obtenir la freqüència d\'aparició de les paraules en el primer fitxer del corpus. Obtindre la freqüència de la paraula \'a\''
ex_7 = freqs.values( )
ex_72 = freqs[ 'a' ]
print '\t', ex_7, '\n\tFreq. aparició \'a\':', ex_72

print '8. Obtenir el nombre de paraules que solament apareixen una volta al primer fitxer del corpus'
ex_8 = len( [ 1 for p in freqs.keys( ) if freqs[ p ] == 1 ] )
print '\t', ex_8

print '9. Obtenir la paraula més freqüent del primer fitxer del corpus'
ex_9 = freqs.max( )
print '\t', ex_9

print '10. Carrega els fitxers "spam.txt", "quijote.txt" i "tirantloblanc.txt" com un corpus propi'
corpus_root = 'C:\\Users\\nrikee\\PycharmProjects\\NLTK'

corpus_spam = PlaintextCorpusReader ( corpus_root, 'spam.txt' )
freqs_spam = FreqDist ( corpus_spam.words() )

corpus_quijote = PlaintextCorpusReader ( corpus_root, 'quijote.txt' )
freqs_quijote = FreqDist ( corpus_quijote.words() )

corpus_tirant = PlaintextCorpusReader ( corpus_root, 'tirantloblanc.txt' )
freqs_tirant = FreqDist ( corpus_tirant.words() )
print '\t', '...fet.'

print '11. Calcula el nombre de paraules, el nombre de paraules distintes i el nombre de frases'
print '\t', 'spam.txt', len( corpus_spam.words() ), len ( freqs_spam.keys() ), len ( corpus_spam.sents() )
print '\t', 'quijote.txt', len( corpus_quijote.words() ), len ( freqs_quijote.keys() ), len ( corpus_quijote.sents() )
print '\t', 'tirantloblanc.txt', len( corpus_tirant.words() ), len ( freqs_tirant.keys() ), len ( corpus_tirant.sents() )
Code Example #21
0
words = gutenberg.words("burgess-busterbrown.txt")
words[1:20]

sents = gutenberg.sents("burgess-busterbrown.txt")
sents[1:20]

# Load your own corpus
from nltk.corpus import PlaintextCorpusReader

corpus_root = 'D:/icwb2-data/training'  # corpus directory
wordlists = PlaintextCorpusReader(corpus_root, ['pku_training.utf8', 'cityu_training.utf8', 'msr_training.utf8',
                                                'pku_training.utf8'])
wordlists.fileids()
print(wordlists.raw('pku_training.utf8'))
print(len(wordlists.words('pku_training.utf8')))
print(len(wordlists.sents('pku_training.utf8')))

#### Conditional frequency distributions ####
# Conditions and events
'''
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',...]
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County'),...]
'''

# Word frequencies by genre
from nltk.corpus import brown

cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
Code Example #22
0
# fdist1.plot( cumulative = True ) #cumulative plot of the frequency distribution
# fdist2 = FreqDist( text2 ) # create a frequency distribution containing the given samples
# fdist1 |= fdist2 # update fdist1 with counts from fdist2
# fdist1 < fdist2	# test if samples in fdist1 occur less frequently than in fdist2
# print( "===" )
# print( "nltk.corpus.gutenberg.fileids() = ", nltk.corpus.gutenberg.fileids() )
# print( "===" )
# emma = nltk.corpus.gutenberg.words('austen-emma.txt')
# print( "len( emma ) = ", len( emma ) )
# print( "===" )
# from nltk.corpus import udhr
# languages = ['Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
# cfd = nltk.ConditionalFreqDist( ( lang, len( word ) ) for lang in languages for word in udhr.words(lang + '-Latin1'))
# cfd.plot( cumulative = True )
# Figure 1.2: Cumulative Word Length Distributions: Six translations of the Universal Declaration of Human Rights are processed; this graph shows that words having 5 or fewer letters account for about 80% of Ibibio text, 60% of German text, and 25% of Inuktitut text.
# print( "===" )
from nltk.corpus import PlaintextCorpusReader
corpus_root = 'c:\\temp\\DDD3262\\'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
print("===")
print("wordlists.fileids() = ", wordlists.fileids())
print("===")
print("wordlists.words( '3262.txt' ) = ", wordlists.words('3262.txt'))
print("wordlists.words( '3262.txt' )[0:1000] = ",
      wordlists.words('3262.txt')[0:1000])
print("===")
print("wordlists.sents( '3262.txt' ) = ", wordlists.sents('3262.txt'))
print("wordlists.sents( '3262.txt' )[0:300] = ",
      wordlists.sents('3262.txt')[0:300])
print("===")
Code Example #23
0
File: SAR-p4-NLTK.py Project: jodoldar/Codigo-Clase
print([fdist[w] for w, f in fdist.most_common()])

# Part 8
print("No de palabras que aparecen una sole vez: ",
      len([w for w, f in fdist.most_common() if fdist[w] == 1]))

# Part 9
print("La palabra más frecuente es", fdist.max())

# Part 10
from nltk.corpus import PlaintextCorpusReader
corpus_root = '.'
wordlists = PlaintextCorpusReader(corpus_root, '.*\.txt')
for element in wordlists.fileids():
    print(element, len(wordlists.words(element)),
          len(set(wordlists.words(element))), len(wordlists.sents(element)))

# EXERCISE 2
from nltk.corpus import brown
from nltk.probability import *

res = []

palabras = ['what', 'when', 'where', 'who', 'why']
for palabra in palabras:
    res.append(palabra)
    lista = []
    for cat in brown.categories():
        pal = FreqDist(brown.words(categories=cat))
        lista.append(cat)
        lista.append(pal[palabra])
Code Example #24
0
File: run.py Project: vasilnv/NLP-Deep-Learning
import grads
import utils
import w2v_sgd
import sampling

#############################################################
#######   Loading the corpus
#############################################################
startToken = '<START>'
endToken = '<END>'

corpus_root = 'JOURNALISM.BG/C-MassMedia'
myCorpus = PlaintextCorpusReader(corpus_root, '.*\.txt')

corpus = [ [startToken] + [w.lower() for w in sent] + [endToken] for sent in myCorpus.sents()]

windowSize = 3
negativesCount = 5
embDim = 50

words, word2ind, freqs = utils.extractDictionary(corpus, limit=20000)
data = utils.extractWordContextPairs(corpus, windowSize, word2ind)

del corpus

U0 = (np.random.rand(len(words), embDim) - 0.5) / embDim
V0 = (np.random.rand(len(words), embDim) - 0.5) / embDim

seq = sampling.createSamplingSequence(freqs)
contextFunction = lambda c: sampling.sampleContext(c, seq, negativesCount)
Code Example #25
0
new_corpus = PlaintextCorpusReader(corpus_root, '.*')
lista_ficheros = new_corpus.fileids()
print(
    "\n--------------------------------------------------------------------------------\n "
)
print("\n\n1.10) Ficheros que componen el corpus: \n" + str(lista_ficheros) +
      "\n")
print(
    "\n-------------IMPORTANTE CAMBIAR LA RUTA DE LOS ARCHIVOS EN EL CÓDIGO------------\n "
)
print("Ruta actual : " + corpus_root)

# Compute the number of words, the number of distinct words and the number of sentences for the three documents
print(
    "\n--------------------------------------------------------------------------------\n1.11) "
)
print("\n" + "Palabras".rjust(35, " ") + "Vocabulario".rjust(12, " ") +
      "Frases".rjust(12, " "))
for fichero in lista_ficheros:
    texto1 = new_corpus.words(fichero)
    fdist1 = FreqDist(texto1)
    numPalabras = len(texto1)
    numPalabrasDistintas = len(fdist1.keys())
    numFrases = len(new_corpus.sents(fichero))
    print("Fichero: " + str(fichero).ljust(20, " ") +
          str(numPalabras).ljust(10, " ") +
          str(numPalabrasDistintas).ljust(15, " ") + str(numFrases))
print(
    "\n\n--------------------------------------------------------------------------------\n"
)
Code Example #26
0
from nltk import download


from languagemodeling.ngram import NGram, AddOneNGram, InterpolatedNGram


models = {
    'ngram': NGram,
    'addone': AddOneNGram,
    'interpolated': InterpolatedNGram,
}


if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the data
    corpus = PlaintextCorpusReader('../../textos/', 'out.txt')
    train_sents = corpus.sents()[0:int(len(corpus.sents())*0.9)]

    # train the model
    n = int(opts['-n'])
    model_class = models[opts['-m']]
    model = model_class(n, train_sents)

    # save it
    filename = opts['-o']
    f = open(filename, 'wb')
    pickle.dump(model, f)
    f.close()
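The script trains on the first 90% of the corpus sentences; a small sketch of how the remaining 10% could be kept aside for later evaluation (variable names are illustrative):

sents = corpus.sents()
split = int(len(sents) * 0.9)
train_sents, held_out_sents = sents[:split], sents[split:]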
Code Example #27
0
    num_words = len(gutenberg.words(fileid))  # number of words
    num_sents = len(gutenberg.sents(fileid))  # number of sentences
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print(int(num_chars / num_words), int(num_words / num_sents),
          int(num_words / num_vocab), fileid)
# average word length, average sentence length, average number of occurrences per word

'Load your own corpus'
from nltk.corpus import PlaintextCorpusReader

corpus_root = 'E:/python shell'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()

ll = wordlists.words('items1.txt')
wordlists.sents('items1.txt')

'Filter stopwords: keep only the words that do not appear in the stopword list'
from nltk.corpus import stopwords

stopwords.words('english')


def content_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')  # not needed if you load your own stopword list
    content = [w for w in text if w.lower() not in stopwords]
    return content


'Stopwords filtered; a synonym list could also be used if necessary'
Code Example #28
0
    non_terminals.add(s)
    grammar = Grammar(non_terminals, terminals, s)
    # This is only to tell me how advanced the process is
    count = 0.0
    len_fileids = len(model.fileids())

    # Get the tokenized corpus
    tokens_location = location + "tokenized"
    print("getting tokens from: " + tokens_location)
    f = open(tokens_location, 'rb')
    tokens = pickle.load(f)
    f.close()

    # Train the grammar model with a context of -+1
    for fileid in model.fileids():
        spanish_sents = model.sents(fileid)
        print(str((count / len_fileids) * 100) + "%")
        count += 1
        # Between training with the entire corpus or just bits I get a small difference of productions, so it's not worth it
        fro = 0.55 * len(spanish_sents)
        to = 0.6 * len(spanish_sents)
        for sent in spanish_sents[int(fro):int(to)]:
            tokenized_sentence = []
            for word in sent:
                ts = tokens[word]
                tokenized_sentence.append(ts)
                grammar.add_terminal(ts)

            i = 0
            for terminal in tokenized_sentence[:-1]:
                # if this is the longest we have generated so far, we will need new non terminals (All the k = i part is
Code Example #29
0
File: pos.py Project: rueshyna/Taipei.py_20130425
#!/usr/bin/env python
# -*- coding: utf-8

import nltk
from nltk.probability import FreqDist
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import LineTokenizer

OUTPUTFILE = './data/tagged_sent'

pair_str_pos = lambda x : '/'.join(x)

corpus_root = './data'
fileids = 'data_title_sample'

corpus = PlaintextCorpusReader(corpus_root,
    fileids,
    sent_tokenizer=LineTokenizer(),
    encoding='utf-8')

output = open(OUTPUTFILE,'w')

for sent in  corpus.sents() :
  tokens = map(pair_str_pos,nltk.pos_tag(sent))
  sent = ' '.join(tokens)
  output.write(sent+"\n")
Code Example #30
0
import re
from nltk.corpus import PlaintextCorpusReader
corpus_root = './texts/'
wordlist = PlaintextCorpusReader(corpus_root, '.*')
print(wordlist.fileids())

# number of words
print(len(wordlist.words('mobydick.txt')))
#number of sentences
print(len(wordlist.sents('mobydick.txt')))

#stores a fileobject into f
f = open("./texts/mobydick.txt")
#stores a string into data from f
data = f.readlines()
#corpus is now a giant string delimited by newline character
corpus = "\n".join(data)
#prints the number of times Ishmael appears in the file.
print(len(re.findall(r"\bIshmael\b", corpus)))
Code Example #31
0
File: ReadSents.py Project: shopuz/hcsvlab-galaxy
    o = open(outp,'w')
    curr = 0
    for sent in sentences:
        times = count_occurences(sent, sent[-1])
        curr = text.find(sent[0], curr)
        end = find_nth(text, sent[-1], times, curr) + len(sent[-1])
        o.write(text[curr:end] + '\n')
        curr = end
    o.close()

def find_nth(string, sub, n, offset):
    start = string.find(sub, offset)
    while start >= 0 and n > 1:
        start = string.find(sub, start+len(sub))
        n -= 1
    return start

def count_occurences(lst, string):
    count = 0
    for item in lst:
        if string in item:
            count += 1
    return count

inp = sys.argv[1]
i = open(inp,'r').read()
corpus = PlaintextCorpusReader(os.path.dirname(inp),os.path.basename(inp))
sents = corpus.sents()
print_out(i, sents)

Code Example #32
0
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
'''Part 1: Open a document and display it on screen'''

corpus_root = '/BUAP/Tareas/EstudioCLaudia/texto-tarea4'
mi_corpus = PlaintextCorpusReader(corpus_root, '.*')
'''
texto = mi_corpus.raw('crimeandpunishment.txt')
print(texto)
'''
'''
Part 2: Split it into sentences and display each sentence numbered
(1 for the first sentence, 2 for the second, etc.),
then ask the user which sentence number they want to select'''

oraciones = mi_corpus.sents('crimeandpunishment.txt')


def separar_numerar_oraciones(texto):
    key = 0
    for sent in oraciones:
        key = key + 1
        if sent:
            print(str(key) + ':' + str(sent))


'''the tokens function belongs to Part 4'''


def tokens(sent, palabra):
    keyword = 0
Code Example #33
0
def textFileToSentList(pathToFileFolder, FileNameWithExtension):
    wordlists = PlaintextCorpusReader(pathToFileFolder, '.*')
    return wordlists.sents(FileNameWithExtension)
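A minimal usage sketch with hypothetical arguments:

sentences = textFileToSentList('./texts', 'mobydick.txt')
print(len(sentences))  # number of sentences found in the file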
Code Example #34
0
        while True:
            sent = predict_next_word(sent, model)
            if sent.split(" ")[-1] == "<END>":
                sent = " ".join(sent.split(" ")[:-1])
                break
        print(sent)


if __name__ == '__main__':
    print("Lab 4 Exercise 2")
    corpus_reader = PlaintextCorpusReader(root="./twitter-files",
                                          fileids=".*\.txt",
                                          word_tokenizer=TweetTokenizer())

    # Convert tweets to tri-grams
    tweets = [tweet for tweet in corpus_reader.sents()]
    tweet_trigrams = [
        list(
            ngrams(sequence=tweet,
                   n=3,
                   pad_left=True,
                   pad_right=True,
                   left_pad_symbol="<START>",
                   right_pad_symbol="<END>")) for tweet in tweets
    ]
    all_trigrams = [gram for tweet in tweet_trigrams for gram in tweet]

    # Initialize the language model
    freq_dist = FreqDist(all_trigrams)
    model = KneserNeyProbDist(freq_dist)
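predict_next_word is called in the generation loop above but is not part of this snippet; a hypothetical greedy version against the Kneser-Ney distribution could look like this (it appends the most probable third word given the last two tokens of the running string):

def predict_next_word(sent, model):
    w1, w2 = sent.split(" ")[-2:]
    candidates = [(model.prob(trigram), trigram[2])
                  for trigram in model.samples()
                  if trigram[0] == w1 and trigram[1] == w2]
    if not candidates:
        return sent + " <END>"
    return sent + " " + max(candidates)[1]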
Code Example #35
0
print("Freq aparición de la preposición a " + str(fdist['a']))

print("#act8")
print("No de palabras que aparecen una sóla vez: " + str(len(fdist.hapaxes())))

print("#act9")
print("La palabra más frecuente es " + fdist.max())

print("#act10")
dir_path = os.path.dirname(os.path.realpath(__file__))
wordlists = PlaintextCorpusReader(corpus_root + "/library", '.*')
print("#act11")
text = wordlists.words(wordlists.fileids()[0])
fdist = FreqDist(text)
for i in wordlists.fileids():
    text = wordlists.words(i)
    fdist = FreqDist(text)
    print((str(i) + " " + str(len(text)) + " " + str(len(fdist.keys())) + " " +
           str(len(wordlists.sents(i)))))
"""
12. ¿Coinciden estos resultados con los de la práctica anterior? Justifica la respuesta.

Los resultados no coinciden por varias razones. La primera es que en la practica 2 solo se tenian en cuenta
las palabras alphabeticas en cambio NLTK incluye las alphanumericas como '-Fpa-' ó '51_por_ciento". Anteriormente tambien
quitabamos las stopwords de las cuentas. Finalmente el criterio de separación de frases de la práctica 2 eran ".", ";" y "\n\n"
dando como resultado más lineas totales.

"""
#EJERCICIO2

#EJERCICIO3
Code Example #36
0
File: extract.py Project: triveni692/hacku
import nltk
from nltk.corpus import PlaintextCorpusReader
corpus_root = '/home/vivkul/Downloads/project'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
# wordlists.fileids()
# wordlists.words('questions.txt')
amrit=wordlists.words('allquestion.txt')
stopwords = nltk.corpus.stopwords.words('english')
from nltk.book import *
fo=open("selectedquestion.txt","wb")
a=wordlists.sents('allquestion.txt')
while(len(amrit)!=0):
	content=[w for w in amrit if w.lower() not in stopwords]
	voc=FreqDist(content)
	# sorted([w for w in set(content) if len(w) > 2 and voc[w] > 3])
	# set_voc_0=FreqDist(a[0])
	# set_voc_1=FreqDist(a[1])
	b=voc.keys()
	i=0
	while(i<len(b)):
		if(len(b[i])>2):
			j=i
			max=b[i]
			break
		i=i+1
	q_no=[]
	k=0
	while(k<len(a)):
		set_voc=FreqDist(a[k])
		if(set_voc[max]>0):
			q_no.append(len([w for w in a[k] if w.lower() not in stopwords]))
Code Example #37
0
File: test1.py Project: vasilnv/NLP-Deep-Learning
    'нак', 'уна', 'ьа', 'фна', 'наф', 'гна', 'ана', 'иа', 'ща', 'нан', 'ьна',
    'ниа', 'рна', 'пна', 'не', 'цна', 'ныа', 'нва', 'нка', 'ну', 'зна', 'оа',
    'нау', 'нйа', 'наш', 'ена', 'яна', 'нба', 'нт', 'ню', 'н', 'нэ', 'нжа',
    'нла', 'нпа', 'но', 'яа', 'нна', 'нж', 'еа', 'нав', 'нац', 'нса', 'нщ',
    'нас', 'жна', 'нал', 'нц', 'нр', 'ина', 'лна', 'па', 'нп', 'нф', 'нс',
    'нащ', 'та', 'чна', 'нча', 'дна', 'йна', 'уа', 'нат', 'нв', 'нач', '-на',
    'ка', 'сна', 'нк', 'нма', 'жа', 'наь', 'нч', 'хна', 'ная', 'ны', 'н ',
    'наж', 'за', 'йа', 'ла'
}

print('Прочитане на корпуса от текстове...')
corpus_root = 'JOURNALISM.BG/C-MassMedia'
myCorpus = PlaintextCorpusReader(corpus_root, '.*\.txt')
fullSentCorpus = [[model.startToken] + [w.lower()
                                        for w in sent] + [model.endToken]
                  for sent in myCorpus.sents()]
print('Готово.')

print('Трениране на Марковски езиков модел...')
M2 = model.MarkovModel(fullSentCorpus, 2)
print('Готово.')

#############################################################################
#### Start of the tests
#### WARNING! These tests are superficial, and passing them is only a prerequisite for acceptance; it does not necessarily mean your program will be accepted. For acceptance, your program will be subjected to a more thorough series of tests.
#############################################################################

#### Test of editDistance
try:
    for s1, s2, d in zip(L1, L2, C):
        signal.alarm(60)
Code Example #38
0
pos_tokens = pos_tag(convote_training.tokenized())
prep_tokens = []

for (word, pos) in pos_tokens:
    if(pos == 'IN'):
        prep_tokens.append(word + '|' + pos)
    else:
        prep_tokens.append(pos)
        
trigram = nltk.trigrams(prep_tokens)
trigram_file = open('../data/pos_trigrams', 'w')

pickle.dump(bigram, outfile)
pickle.dump(trigram, trigram_file)
    
for sents in convote_test.sents():
    for index in range(0, len(sents)):
        if sents[index] == 'in':
            temp = deepcopy(sents)
            temp[index] = '*'
            in_test.append(temp)
        if sents[index] == 'on':
            temp = deepcopy(sents)
            temp[index] = "*"
            on_test.append(temp)
        if sents[index] == 'of':
            temp = deepcopy(sents)
            temp[index] = "*"
            of_test.append(temp)

for sents in convote_dev.sents():
Code Example #39
0
import nltk
import numpy as np
from nltk.corpus import PlaintextCorpusReader

vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

reader = PlaintextCorpusReader('./data/',
                               'reddit-comments-2015-08.csv',
                               encoding='utf-8')
sentences = reader.sents()
words = reader.words()
tokenized_sentences = [[sentence_start_token] + sent + [sentence_end_token]
                       for sent in sentences]

word_freq = nltk.FreqDist(words)
word_freq.plot(30)
print("Unique words", len(word_freq))

vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])

print("Least freq word in vocab is:", f'"{vocab[-1][0]}"', "and it appeared",
      vocab[-1][1], "times")

for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = ([
Code Example #40
0
corpus_root = '/DTU/MSc/Code/Data'
wordlists = PlaintextCorpusReader(corpus_root, [
    'food_holiday_inn_london.txt.data', 'food_swissotel_chicago.txt.data',
    'room_holiday_inn_london.txt.data', 'rooms_swissotel_chicago.txt.data',
    'rooms_swissotel_chicago.txt.data',
    'service_bestwestern_hotel_sfo.txt.data',
    'service_holiday_inn_london.txt.data',
    'service_swissotel_hotel_chicago.txt.data',
    'staff_bestwestern_hotel_sfo.txt.data', 'staff_swissotel_chicago.txt.data'
])
wsj = nltk.corpus.treebank

print wordlists.fileids()
print wsj.fileids()

print(len(wordlists.sents()))
senLengths1 = [len(s) for s in wordlists.sents()]
freqDist1 = nltk.FreqDist(senLengths1)

print(len(wsj.sents()))
senLengths2 = [len(s) for s in wsj.sents()]
freqDist2 = nltk.FreqDist(senLengths2)

propDist1 = nltk.DictionaryProbDist(freqDist1, normalize=True)
propDist2 = nltk.DictionaryProbDist(freqDist2, normalize=True)

myfile = open('../Thesis/wsjdist.dat', 'wb')
wr = csv.writer(myfile, quoting=csv.QUOTE_NONE)

wr.writerow(["x", "y1", "y2"])
Code Example #41
0
fileid = 'neg/cv956_12547.txt'
text = movie_reviews.raw(fileid)
text1= movie_reviews.raw(categories='neg')
movie_reviews.categories(fileid)


#Frequency distribution by creating our own corpus

from nltk.corpus import PlaintextCorpusReader
corpus_root = 'C:/Users/ITRAIN-12/Desktop/Day 2'
fileid = 'gaming.txt'
my_corpus = PlaintextCorpusReader(corpus_root, '.*')
text = my_corpus.raw(fileid)
text
my_corpus.raw(fileid)
my_corpus.words(fileid)
my_corpus.sents(fileid)
distr = nltk.FreqDist(text)
print(distr.most_common(5))


#Reuters
from nltk.corpus import reuters
fileid='training/9865'
text=reuters.raw(fileid)
text

#Load reuters category news
from nltk.corpus import reuters
reuters.fileids()
reuters.categories()
fileid = 'test/16399'
Code Example #42
0
File: small_analysis.py Project: atokop/compling
yoursmall_wordlists = PlaintextCorpusReader(corpus_root, '.*')
classevent_wordlists = PlaintextCorpusReader(classevent_corpus_root, '.*')

# The next step is to show the file names under the directory (optional step)
lemmer = WordNetLemmatizer()

def clean_words(words):
    words = [w.lower() for w in words if w.isalnum()]
    return words

def clean_sents(sents):
    return [clean_words(sent) for sent in sents]

    
classevent_words = clean_words(classevent_wordlists.words())
classevent_sents = clean_sents(classevent_wordlists.sents())
classevent_words = [w.lower() for w in classevent_words if w.isalnum()]
classevent_words = nltk.Text(classevent_words)
classevent_words_lem = [lemmer.lemmatize(w) for w in classevent_words]
print "ClassEvent loaded"

yoursmall_words = yoursmall_wordlists.words()
yoursmall_sents = yoursmall_wordlists.sents()
yoursmall_words = [w.lower() for w in yoursmall_words if w.isalnum()]
yoursmall_words = nltk.Text(yoursmall_words)
yoursmall_words_lem = [lemmer.lemmatize(w) for w in yoursmall_words]
print "YourSmall loaded"

yourwords = ['earthquake', 'seismic', 'aftershocks', 'quake', 'damage', 'magnitude', 'tremor', 'richter', 'epicenter', 'depth', 'fault', 'hypocenter', 'focus', 'dead', 'casualties', 'structural', 'seismometer', 'temblor', 'hazard', 'impact']
yourwords_lem = [lemmer.lemmatize(w.lower()) for w in yourwords]
Code Example #43
0
File: CRF_disease.py Project: EisRoot/LBD_NER
# sents_no=[]
# pmids=[]
# i=0
# for index,row in mydf.iterrows():
#     stence=row['text']
#     sents=stence.split('.')
#     newsents=[]
#     for sent in sents:
#         newsents.append(sent+'.')
#         i=i+1
#         sents_no.append(i)
#         pmids.append(row['pmid'])
#
#     # result= re.findall(r"<category=\".`+?\">(.+?)</category>",stence, re.S)
#     # print(result)
# print(sents_no)
# print(pmids)
str='you are my shine.'
str_list=list(str)
list=[]
for i in str_list:
    list.append({'str':str,"obj":"ss"})
str2=" ".join(list)
print(str2 )
wodslist = PlaintextCorpusReader(cr, '.*')

for i in wodslist.sents('NCBI_corpus_training.txt'):
    text=nltk.word_tokenize(' '.join(i))
    nltk.wordpunct_tokenize
    print(i)
    # print(nltk.pos_tag(text, tagset='universal'))
Code Example #44
0
def nltk_corpora():
    ## 1. PROJECT GUTENBERG << Formal Language - Literature;ebooks 60K++
    emma = nltk.corpus.gutenberg.words("austen-emma.txt")
    emma = nltk.Text(emma)

    len(emma)
    lexical_diversity(emma)

    emma.concordance("brave")
    emma.collocation_list()

    ## traits of the corpus text for each
    def corp_content(corporad):
        print(
            "{0} File {0} \t\tWord len   Sent len   Vocab   Lexical Complexity"
            .format(" " * 6))
        print("{}".format("-" * 100))
        for i, txt in enumerate(corporad.fileids()):
            sents_l = len(corporad.words(txt))
            try:
                sents_l = len(corporad.sents(txt))
            except:
                sents_l = len(corporad.posts(txt))
            w_len = round(len(corporad.raw(txt)) / len(corporad.words(txt)))
            s_len = round(len(corporad.words(txt)) / sents_l)
            voc = len(set(w.lower() for w in corporad.words(txt)))
            # lexp = round( voc / len( [w.lower() for w in gutenberg.words(txt)] ) * 100 )
            lexp = round(voc / len(corporad.words(txt)) * 100)
            print("{}. {} \t\t{}\t{}\t{}\t{}%\t{}".format(
                i, txt, w_len, s_len, voc, lexp,
                corporad.raw(txt)[:30]))
            # print( "{}. {} \t\t{}\t{}\t{}\t{}%\t{}".format(i, txt, w_len, s_len, voc, lexp, corporad.words(txt)[:5] ) )

    # 1. Formal Language - Project Gutenberg ebooks 60K++, 16+ languages
    corp_content(gutenberg)

    # 2. Informal Language - Web content and Chat rooms
    corp_content(webtext)
    corp_content(nps_chat)

    # 3. Brown Corpus - 15+ Multi-genre, 500+ sources, En_lang << http://icame.uib.no/brown/bcm-los.html
    # for studying systematic differences between genres I.E. stylistics
    corp_content(brown)

    brown.categories()
    brown.words(categories="news")
    brown.words(categories=["news", "editorial", "reviews"])

    # example stylistics - modal verbs usage between genres
    def modalz(modals):
        print("\tCategory\t", end=" ")
        for m in modals:
            print("\t{}".format(m), end=" ")
        print("\n" + "-" * 100)
        for i, cat in enumerate(brown.categories()):
            print("{}.{}\t\t".format(i, cat), end=" ")
            fdist = nltk.FreqDist(w.lower()
                                  for w in brown.words(categories=cat))
            for m in modals:
                print("\t{}".format(fdist[m]), end=" ")
            print("")

    modalz(["can", "could", "may", "might", "must", "will"])
    modalz(["should", "ought", "would", "could", "do", "did", "does"])
    modalz(["what", "when", "where", "why", "who"])

    ## ditto using nltk conditional frequency distributions
    cfdist = nltk.ConditionalFreqDist(
        (genre, word) for genre in brown.categories()
        for word in brown.words(categories=genre))

    genz = ["news", "religion", "hobbies", "humor", "romance"]
    modz = ["can", "could", "may", "might", "must", "will"]
    cfdist.tabulate(conditions=genz, samples=modz)

    # 4. Reuters Corpus - news articles, 90 topics, grouped into training and testing sets
    # << Apparent goal is to predict the category/topic of a given article??
    corp_content(reuters)
    # retrieve topic(s) of a given article
    reuters.categories("training/9865")
    reuters.categories(["training/9865", "training/9880"])
    # find articles that cover some topic(s)
    reuters.fileids("barley")
    reuters.fileids(["barley", "corn"])

    # the first words are in all CAPs and are the titles of the article. The rest is the story text
    for i, txt in enumerate(reuters.fileids(["barley", "oil"])):
        print("{}. {}\t{}".format(i, txt, reuters.words(txt)[:10]))

    # 5. Speeches - Inaugral Address Corpus << 55 USA Presidential addresses
    # << interesting in that there's a time horizon element from 1789  to 2009 (first 4 xters of fileid = year) ; can study how language changes with time; could reflect on priorities, culture, ???
    corp_content(inaugural)
    # how America and Citizen ar eused over time
    cfdist = nltk.ConditionalFreqDist((target, fileid[:4])
                                      for fileid in inaugural.fileids()
                                      for w in inaugural.words(fileid)
                                      for target in ['america', 'citizen']
                                      if w.lower().startswith(target))
    cfdist.plot()

    # 6. Annotated Text Corpora
    # annotations: POS, named entities, syntatic structures, semantic roles,

    # 7. Other Languages Corpora
    # includes udhr = Universal Declaration of Human Rights in over 300 languages

    # word length freq by diff languages
    langz = [
        "English", "Chickasaw", "German_Deutsch", "Kinyarwanda",
        "Swahili_Kiswahili"
    ]
    cfdist = nltk.ConditionalFreqDist((lang, len(word)) for lang in langz
                                      for word in udhr.words(lang + "-Latin1"))
    cfdist.plot()
    cfdist.plot(cumulative=True)

    # alphabet freq
    nltk.FreqDist(udhr.raw("Kinyarwanda-Latin1")).plot()

    # 8. Loading your own Corpora
    # << txt files. Use PlaintextCorpusReader. Check dir location
    #
    my_corpus = PlaintextCorpusReader(
        "root_dir_path_here", ".*"
    )  # second param is a list of fileids defined as a list or an ls pattern
    eg_corpus = PlaintextCorpusReader(
        "D:/zRepoz/dataSaysWhat/DocReader/res/txt_corpus", ".txt")
    eg_corpus.fileids()
    eg_corpus.words("example1.txt")
    len(eg_corpus.sents())

    #BracketParseCorpusReader
    my_corpus = nltk.corpus.BracketParseCorpusReader("path", "file_pattern")
Code Example #45
0
File: small_analysis.py Project: atokop/compling
classevent_wordlists = PlaintextCorpusReader(classevent_corpus_root, '.*')
big_wordlists = PlaintextCorpusReader(big_corpus_root, '.*')

# The next step is to show the file names under the directory (optional step)
lemmer = WordNetLemmatizer()

def clean_words(words):
    words = [w.lower() for w in words if w.isalnum()]
    return words

def clean_sents(sents):
    return [clean_words(sent) for sent in sents]

    
classevent_words = classevent_wordlists.words()
classevent_sents = classevent_wordlists.sents()
classevent_words = [w.lower() for w in classevent_words if w.isalnum()]
classevent_words = nltk.Text(classevent_words)
classevent_words_lem = [lemmer.lemmatize(w) for w in classevent_words]
print "ClassEvent loaded"

yoursmall_words = yoursmall_wordlists.words()
yoursmall_sents = yoursmall_wordlists.sents()
yoursmall_words = [str(w).lower() for w in yoursmall_words if w.isalnum()]
yoursmall_words = nltk.Text(yoursmall_words)
yoursmall_words_lem = [lemmer.lemmatize(w) for w in yoursmall_words]
print "YourSmall loaded"

big_words = big_wordlists.words()
big_sents = big_wordlists.sents()
big_words = [w.lower() for w in big_words if w.isalnum()]
Code Example #46
0
File: trainer.py Project: hypecycle/ingvar
###
# Section: train the Naive Bayes classifier
###

print("Lese neutralen Korpus")
neutralCorp = ConllCorpusReader(
    '.',
    'corpora/tiger_release_aug07.corrected.16012013.conll09',
    ['ignore', 'words', 'ignore', 'ignore', 'pos'],
    encoding='utf-8')

print("Lese Ingvar-Korpus")
ingvarCorp = PlaintextCorpusReader(".", "texts/latest.txt")

print("Generiere Wortlisten")
ingvarSentencesLong = ingvarCorp.sents()
neutralSentencesLong = neutralCorp.sents()

smallerSet = min(len(ingvarSentencesLong), len(neutralSentencesLong))

ingvarSentences = ingvarSentencesLong[:smallerSet]
neutralSentences = neutralSentencesLong[:smallerSet]

print(f'Zahl der Sätze limitiert auf kleineres Set mit {smallerSet} Sätzen')

print("Generiere Features")
ingFeats = [(word_feats(f), 'ing') for f in ingvarSentences]
neutFeats = [(word_feats(f), 'neu') for f in neutralSentences]

print("Generiere Cutoff")
ingCutoff = int(len(ingFeats) * 0.9)
Code Example #47
0
    def extract_data(self, filepath, ind_features=_PARAIND_FEAT, dep_features=_PARADEP_FEAT, labels_per_sent=None, labels_per_window=None):
        """Extract features, reduce dimensions with a PCA and return data.

        Exports raw- and PCA-reduced data both in arff- and numpy-format.
        """
        start = time.clock()
        self.dictVectorizer = DictVectorizer(sparse=False)
        filename = os.path.split(filepath)[1]
        directory = os.path.split(filepath)[0]
        plain_reader = PlaintextCorpusReader(
            directory, 
            [filename],
            word_tokenizer=RegexpTokenizer("(-?\d+\.\d+)|[\w']+|["+string.punctuation+"]"),
            sent_tokenizer=LineTokenizer(blanklines="discard"),
            encoding='utf8')

        # create new subdir for extracted data
        if _NEW_SUBDIR is not None:
            path = os.path.join(directory, _NEW_SUBDIR)
            if not os.path.exists(path):
                os.makedirs(path)
            path = os.path.join(path, os.path.splitext(filename)[0])            
            # print "path {}".format(path)
        else:
            path = os.path.splitext(filepath)[0]
            # print "path {}".format(path)

        # filepaths for weka- and numpy-files
        arff_filepath = path + ".arff"
        arff_filepath_pca = path + "_pca95.arff"
        numpy_filepath = path + ".npy"
        numpy_filepath_pca = path + "_pca95.npy"
        
        # print(":time: Reader created, time elapsed {}").format(time.clock() - start)
        paras = plain_reader.paras()
        # print(":time: Paras created, time elapsed {}").format(time.clock() - start)
        sents = plain_reader.sents()
        # print(":time: Sents created, time elapsed {}").format(time.clock() - start)

        # get paragraph boundaries for sliding-window
        self.boundaries = util.get_boundaries(paras)
        boundaries_backup = self.boundaries

        # check if all files necessary exist, if yes - unpickle/load them and return data
        if util.files_already_exist([numpy_filepath_pca,]):
            print "Features already extracted. Calculating clusters...\n"
            matrix_sklearn_pca = numpy.load(numpy_filepath_pca)
            return filepath, self.boundaries, matrix_sklearn_pca, len(sents)

        # save correct target-labels and additional info of current data
        targets_path = open(path + ".tbs", "wb")
        pickle.dump((labels_per_sent, labels_per_window, boundaries_backup, len(sents), _WINDOW_SIZE, _STEP_SIZE), targets_path)

        # print(":time: Boundaries calculated, time elapsed {}").format(time.clock() - start)
        self.data = self.extract_features(sents, _WINDOW_SIZE, _STEP_SIZE, ind_features, dep_features)
        # self.data[year] = self.extract_features_para(paras, ind_features, dep_features)
        # print(":time: Features extracted, time elapsed {}").format(time.clock() - start)
        self.all_features = self.unified_features(self.data)
        # print(":time: Unified features, time elapsed {}").format(time.clock() - start)
        matrix_sklearn = self.feature_matrix_sklearn(self.generator_data(self.data))
        # print(":time: Matrix sklearn created, time elapsed {}").format(time.clock() - start)
        matrix_sklearn = util.normalize(matrix_sklearn)
        # print(":time: Matrix normalized, time elapsed {}").format(time.clock() - start)
        
        print "Exporting raw-data..."
        util.export_arff(matrix_sklearn, self.dictVectorizer.get_feature_names(), arff_filepath, filename+"_RAW", labels_per_window, file_info=None)
        numpy.save(numpy_filepath, matrix_sklearn)
        
        # print "matrix dimensions before pca: {}".format(matrix_sklearn.shape)
        feature_names, feature_names_part = None, None
        if _DO_PCA:
            print "PCA calculation..."
            matrix_sklearn_pca, feature_names = util.pca(matrix_sklearn, self.dictVectorizer.get_feature_names())
            util.export_arff(matrix_sklearn_pca, feature_names, arff_filepath_pca, filename+"_PCA95", labels_per_window, file_info=None)
            numpy.save(numpy_filepath_pca, matrix_sklearn_pca)
            
            del matrix_sklearn
        gc.collect()
        return filepath, boundaries_backup, matrix_sklearn_pca, len(sents)
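util.pca and util.export_arff are helpers from the project's own util module and are not shown here. As a rough sketch of what a 95%-variance PCA reduction can look like with scikit-learn (the function name and placeholder component names are assumptions, not the project's API):

# Sketch of a PCA step that keeps 95% of the variance (an assumption about util.pca's role)
from sklearn.decomposition import PCA

def reduce_to_95_percent_variance(matrix):
    # n_components as a float in (0, 1) selects enough components to explain
    # that fraction of the variance; this requires the full SVD solver.
    pca = PCA(n_components=0.95, svd_solver="full")
    reduced = pca.fit_transform(matrix)
    # Principal components have no direct feature names; use placeholders.
    names = ["pc_%d" % i for i in range(reduced.shape[1])]
    return reduced, names

PCA expects a dense matrix, so a sparse DictVectorizer output would need .toarray() first.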

# CALCULATION OF BINARY FEATURES

# 1: THE TERM APPEARS IN THE SENTENCE, 0: THE TERM DOES NOT APPEAR

import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

corpus_root = r'C:\MyData\PythonPractice\Mycorpus'
wordlists = PlaintextCorpusReader(corpus_root, 'resort.*\.txt')

print('\nFollowing file ids are there in this corpus: \n ')
print(wordlists.fileids())
print("\nNumber of sentences in the file are :")
sencount = len(wordlists.sents(fileids=['resort.txt']))
print(sencount)
print('\n Sentences are : \n')
sentences = wordlists.sents(fileids='resort.txt')
print(sentences)
sample = wordlists.raw("resort.txt")
s = sample.split('.')

#NO OF TERMS
unique_tokens = []

for i in range(sencount):
    print("\n Sentence " + str(i + 1))
    print(s[i])
    #print('\n Tokenization \n')
    word_tokens = word_tokenize(s[i])
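    # (The original snippet is cut off here. The lines below are a hedged,
    #  self-contained sketch of the binary term/sentence matrix described by
    #  the comments above; every name other than s, sencount, word_tokenize
    #  and stopwords is an assumption, not the original author's code.)

stop_words = set(stopwords.words('english'))
tokenized_sents = [[w.lower() for w in word_tokenize(sent)] for sent in s[:sencount]]

unique_tokens = []
for tokens in tokenized_sents:
    for tok in tokens:
        if tok.isalpha() and tok not in stop_words and tok not in unique_tokens:
            unique_tokens.append(tok)

# binary_matrix[i][j] is 1 if term j appears in sentence i, otherwise 0
binary_matrix = [[1 if term in set(tokens) else 0 for term in unique_tokens]
                 for tokens in tokenized_sents]
print(binary_matrix[0])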
Code example #49
File: analyser.py  Project: RichardLitt/politics
from __future__ import division
import urllib2, sys, re, codecs
import nltk, pprint
from BeautifulSoup import BeautifulSoup
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize

# The name of the output file
#input_file_name = 'obama_speeches' 
#g = codecs.open(input_file_name, mode='r+').read()

#g = nltk.Text(nltk.word_tokenize(g))
#print g.concordance('freedom')
#print g.concordance('liberty')

from nltk.corpus import PlaintextCorpusReader
corpus_root = '/Users/richard/Github/politics/speeches/'
ocorpus = PlaintextCorpusReader(corpus_root, '.*')
for fileid in ocorpus.fileids():
	num_chars = len(ocorpus.raw(fileid))
	num_words = len(ocorpus.words(fileid))
	num_sents = len(ocorpus.sents(fileid))
	num_vocab = len(set([w.lower() for w in ocorpus.words(fileid)]))
	#print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid
	print nltk.Text(ocorpus.words(fileid)).concordance('freedom')
	print nltk.Text(ocorpus.words(fileid)).concordance('liberty')
Code example #50
import operator
import pickle
import random

import nltk
import tweepy
from nltk.corpus import brown, PlaintextCorpusReader
auth = tweepy.OAuthHandler("xxx", "xxx")
auth.set_access_token("xxx", "xxx")
api = tweepy.API(auth)

directory = "PATH-TO-DIRECTORY"

bandz = pickle.load(open(directory + "thug_tokens.p", "rb"))
thugtrainer = nltk.NgramModel(3, bandz)

corpus_root = directory + "/songs"
chainzcorpus = PlaintextCorpusReader(corpus_root, '.*')

chainzwords = nltk.probability.FreqDist()
for sent in chainzcorpus.sents():
	for word in sent:
		chainzwords.inc(word.lower())
chainzkeys = chainzwords.keys()

brownwords = nltk.probability.FreqDist()
for sent in brown.sents():
	for word in sent:
		brownwords.inc(word.lower())
brownkeys = brownwords.keys()

stopwords = nltk.corpus.stopwords.words('english')

trends_US = api.trends_place(23424977)

trendlist = []
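nltk.NgramModel and FreqDist.inc() are NLTK 2.x APIs that no longer exist in NLTK 3. If this snippet were ported, the counting and the trigram model could be rebuilt roughly as below (a sketch that assumes thug_tokens.p holds tokenised sentences; it is not the original author's code):

from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.probability import FreqDist

# FreqDist.inc(word) became item assignment
chainzwords = FreqDist()
for sent in chainzcorpus.sents():
    for word in sent:
        chainzwords[word.lower()] += 1

# Trigram model over the pickled lyrics; if bandz is a flat token list
# (as NgramModel accepted), wrap it as [bandz] first.
train_ngrams, vocab = padded_everygram_pipeline(3, bandz)
thugtrainer = MLE(3)
thugtrainer.fit(train_ngrams, vocab)
print(thugtrainer.generate(20, random_seed=7))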
Code example #51
from nltk.corpus import PlaintextCorpusReader

corpus_root = "D:\develop\data\my_nltk"

word_lists = PlaintextCorpusReader(corpus_root, ".*")
print(word_lists.fileids())
print(word_lists.sents("a.txt"))
print(word_lists.words("a.txt"))
Code example #52
# Get the number of words that appear only once in the first file of the corpus.
print("Número de palabras que aparecen solo una vez:")
print(len([w for w in set(text) if fdist[w] == 1]))
# Get the most frequent word in the first file of the corpus.
print("La palabra más frecuente es %s" % fdist.max())
# Load the PoliformaT files ("spam.txt", "quijote.txt" and "tirantloblanc.txt") as a custom corpus.
corpus_root = '.'
wordlists = PlaintextCorpusReader(
    corpus_root, ["spam.txt", "quijote.txt", "tirantloblanc.txt"])
print("corpus cargado")
# Compute the number of words, the number of distinct words and the number of sentences for the three documents.
for i in range(0, 3):
    nombre = wordlists.fileids()[i]
    npalabras = len(wordlists.words(wordlists.fileids()[i]))
    npaldistintas = len(set(wordlists.words(wordlists.fileids()[i])))
    nfrases = len(wordlists.sents(wordlists.fileids()[i]))
    print(
        "fichero: %s num palabras: %d num palabras distintas: %d num frases %d"
        % (nombre, npalabras, npaldistintas, nfrases))

print("Ejercicio 2")
from nltk.corpus import brown
words = ["what", "where", "who", "when", "why"]
mydict = {}
for word in words:
    mydict[word] = []
categoriess = brown.categories()
for word in words:
    for category in categoriess:
        frecuencia = len(
            [w for w in brown.words(categories=category) if w == word])
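The fragment stops before the counts are stored in mydict. NLTK's ConditionalFreqDist does the same bookkeeping in one pass; a possible equivalent (a sketch of the presumed intent, not the original continuation) is:

import nltk

# One (question word, category) pair per matching token in the Brown corpus
cfd = nltk.ConditionalFreqDist(
    (w, category)
    for category in brown.categories()
    for w in brown.words(categories=category)
    if w in words)

# Rows are the question words, columns the Brown categories
cfd.tabulate(conditions=words, samples=brown.categories())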
Code example #53
def read_sents(inp, outp):
    i = open(inp, 'r').read()
    corpus = PlaintextCorpusReader(os.path.dirname(inp), os.path.basename(inp))
    sents = corpus.sents()
    print_out(outp, i, sents)
Code example #54
File: views.py  Project: prashaantt/savitri-labs
def nlp(request):
	w=PlaintextCorpusReader("./","canto1.txt");
	w.words();
	t=nltk.text.Text(w.words())
	return render_to_response('lengths.html', {'word_length':len(set(w.words())), 'sentence_length' : len(w.sents())})
Code example #55
from gensim.models import Word2Vec
from nltk.corpus import brown, movie_reviews, treebank, PlaintextCorpusReader
from six import string_types
from nltk.corpus.reader.util import concat

root_dir = '/home/diego/qdata/techarticles/parsed_articles/'
acr = PlaintextCorpusReader(root_dir, '.*\.txt')
from datetime import datetime

print(acr.fileids())

for fileid in acr.fileids():
    #    num_chars = len(acr.raw(fileid))
    #    num_words = len(acr.words(fileid))
    #    num_sents = len(acr.sents(fileid))
    #    num_vocab = len(set(w.lower() for w in acr.words(fileid)))
    #    print(round(num_chars / num_words), round(num_words / num_sents), round(num_words / num_vocab), fileid)
    print(" ============ " + fileid + " =============================")
    print(acr.words(fileid))
    print(acr.sents(fileid))
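Word2Vec is imported above but never used in the visible part of the snippet; a minimal sketch of the presumable next step, training vectors on the article corpus (parameter names follow gensim 4, and the query word is only an illustration), could be:

# Train word vectors on the parsed articles (sketch; hyperparameters are illustrative)
start = datetime.now()
sentences = [[w.lower() for w in sent] for sent in acr.sents()]
model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)
print("training took", datetime.now() - start)
print(model.wv.most_similar("stock", topn=5))  # example query term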
Code example #56
File: eval.py  Project: acapello/PLN-2015
  -h --help     Show this screen.
"""
#  You must be in the virtualenv ($ workon pln-2015) to run this script
# Attention: you must run this (every) script from PLN-2015/ directory

import pickle
from docopt import docopt
from nltk.corpus import PlaintextCorpusReader
from languagemodeling.ngram import Eval


if __name__ == '__main__':

    opts = docopt(__doc__)

    i = str(opts['-i'])
    f = open(i, 'rb')
    model = pickle.load(f)

    test_corpus = PlaintextCorpusReader('corpus/spanish', 'test.txt')
    test_sents = test_corpus.sents()

    evaluator = Eval(model, test_sents)
    log_prob = evaluator.log_probability
    cross_ent = evaluator.cross_entropy
    perp = evaluator.perplexity

    print("Input filename: %s" % i)
    print(" Log-Probability: %f\n Cross-Entropy: %f\n Perplexity: %f\n"
          % (log_prob, cross_ent, perp))
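Eval lives in the course's languagemodeling package and is not shown here. The three reported numbers are related in the standard way; a sketch of that relationship, assuming the model exposes a per-sentence log2-probability method (the method name is an assumption), is:

def evaluate(model, sents):
    # total log2-probability of the test sentences
    log_prob = sum(model.sent_log_prob(sent) for sent in sents)  # assumed method name
    # tokens counted with one end-of-sentence marker per sentence
    num_tokens = sum(len(sent) + 1 for sent in sents)
    cross_entropy = -log_prob / num_tokens
    perplexity = 2 ** cross_entropy
    return log_prob, cross_entropy, perplexity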
Code example #57
File: u5_ne_chunk.py  Project: jplahn/NLP-Capstone
""" Word and named entity 10 chunks """

import nltk
from nltk.corpus import PlaintextCorpusReader
import pickle

print 'getting files'
corpus_root = 'Texas_Wild_Fire'
english = pickle.load(open('./nltk_data/tokenizers/punkt/english.pickle', 'r'))
yourSmallReader = PlaintextCorpusReader(corpus_root, '.*', sent_tokenizer=english)

print 'getting sentences'
# 10324.txt 17749.txt 17859.txt
sents = yourSmallReader.sents('10324.txt') + yourSmallReader.sents('17749.txt') + yourSmallReader.sents('17859.txt')
# sents = yourSmallReader.sents()
sents = [nltk.pos_tag(sent) for sent in sents]

print 'getting chunks'
chunks = [nltk.ne_chunk(sent) for sent in sents]

# Getting a random assortment of chunks
print chunks[0]
print chunks[10]
print chunks[25]
print chunks[35]
print chunks[50]
print chunks[60]
print chunks[75]
print chunks[80]
print chunks[90]
print chunks[100]
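nltk.ne_chunk returns a Tree whose labelled subtrees mark the entities; a small helper for pulling them out of the chunks printed above could look like this (a sketch; the label set is not exhaustive):

# Collect (label, entity text) pairs from an ne_chunk tree
def extract_entities(tree):
    entities = []
    for subtree in tree.subtrees():
        if subtree.label() in ('PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY'):
            entities.append((subtree.label(),
                             ' '.join(token for token, tag in subtree.leaves())))
    return entities

print extract_entities(chunks[0])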
Code example #58
File: train.py  Project: danniccs/PLN-2019
import pickle

from docopt import docopt
from nltk.corpus import PlaintextCorpusReader

from languagemodeling.ngram import NGram, AddOneNGram, InterpolatedNGram


models = {
    'ngram'  : NGram,
    'addone' : AddOneNGram,
    'inter'  : InterpolatedNGram,
}


if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the corpus
    screenplay_dir = opts['-r']
    my_corpus = PlaintextCorpusReader(screenplay_dir, '.*.txt')

    sents = my_corpus.sents()

    # train the model
    n = int(opts['-n'])
    model_class = models[opts['-m']]
    model = model_class(n, sents)

    # save it
    filename = opts['-o']
    f = open(filename, 'wb')
    pickle.dump(model, f)
    f.close()
Code example #59
File: trigram_tagging.py  Project: atokop/compling
import nltk
from nltk.corpus import PlaintextCorpusReader

# Make a stopset
stopset = set(nltk.corpus.stopwords.words('english'))
f = open("./stopwords.txt", "r")
for line in f.readlines():
    word = line.strip()
    if word not in stopset:
        stopset.add(word)

#Read in corpus
corpus_root = '.././Islip13Rain/'
classevent_wordlists = PlaintextCorpusReader(corpus_root, '.*') 

#sent tokenize
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# CEsents = sent_tokenizer.tokenize(classevent_wordlists.raw())
CEsents = classevent_wordlists.sents()

#tag and filter
def trigram_tag(sentences, default_tagger=get_regexp_tagger(), **kwargs):
    tagged_words = raw_trigram_tag(sentences)
    tagged_words = remove_stopwords_tagged(tagged_words, stopset)
    pos_filtered = remove_tags(["NN", "VB"], tagged_words)
    pos_filtered = remove_non_english(pos_filtered)
    pos_filtered = [(word.lower()) for word in pos_filtered]
    pos_filtered = lemmatize_words(pos_filtered)
    set_pos_filtered = list(set(pos_filtered))
    set_pos_filtered = sorted(set_pos_filtered, key=lambda word: pos_filtered.count(word))
    return set_pos_filtered
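get_regexp_tagger, raw_trigram_tag and the other helpers used above come from elsewhere in the project and are not shown. A hypothetical reconstruction of the tagging part, assuming a Brown-trained trigram tagger with unigram/bigram backoff and a regular-expression tagger as the last resort (a guess at the design, not the project's code), might be:

def get_regexp_tagger():
    # crude suffix-based fallback tagger
    patterns = [
        (r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*es$', 'VBZ'),
        (r'.*ly$', 'RB'), (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), (r'.*', 'NN'),
    ]
    return nltk.RegexpTagger(patterns)

def raw_trigram_tag(sentences, train_sents=None):
    if train_sents is None:
        train_sents = nltk.corpus.brown.tagged_sents()
    t1 = nltk.UnigramTagger(train_sents, backoff=get_regexp_tagger())
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    t3 = nltk.TrigramTagger(train_sents, backoff=t2)
    # flatten to a single list of (word, tag) pairs, as trigram_tag above expects
    return [pair for sent in sentences for pair in t3.tag(sent)]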

Code example #60
File: gen_data.py  Project: colegleason/cs398vl-mp2
import csv
from nltk.probability import FreqDist
from nltk.corpus import PlaintextCorpusReader, stopwords
MAX_WORDS = 4
NUM_WORDS = 100
wordlist = PlaintextCorpusReader('', 'ofk(_chap_[1234])?\.txt')

sents = wordlist.sents('ofk.txt')
seqs = []


def clean_sent(sent):
    sent = filter(lambda w: w.isalpha() or w in ['.', '!', '?'], sent)
    out = []
    for i in range(len(sent)):
        if sent[i] == 't':
            out[-1] += "'t"
        else:
            out.append(sent[i])
    return out


sents = map(clean_sent, sents)

for sent in sents:
    output = []
    for i in range(len(sent)):
        output.append(sent[i])
        if len(output) >= MAX_WORDS:
            break
    if output:
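        # (The original fragment is cut off here; the lines below are a hedged
        #  guess inferred from the unused csv/FreqDist imports and NUM_WORDS,
        #  not the original project code.)
        seqs.append(output)

stopset = set(stopwords.words('english'))
fdist = FreqDist(w.lower() for seq in seqs for w in seq if w.lower() not in stopset)

# Write the NUM_WORDS most frequent non-stopwords to a CSV file
# (the output filename is an assumption).
with open('word_counts.csv', 'w') as out_file:
    writer = csv.writer(out_file)
    writer.writerow(['word', 'count'])
    for w, count in fdist.most_common(NUM_WORDS):
        writer.writerow([w, count])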