Example #1
def main():
	current_directory = os.path.dirname(__file__)
	corpus_root = os.path.abspath(current_directory)
	wordlists = PlaintextCorpusReader(corpus_root, 'Islip13Rain/.*\.txt')
	wordlists.fileids()
	ClassEvent = nltk.Text(wordlists.words())
	CEWords = ["Long Island", "Weather Service", "flooding", "August", 
		"heavy rains", "Wednesday", "Suffolk County", "New York", "rainfall",
		"record"]

	# ClassEvent Statistics
	print "--------- CLASS EVENT STATISTICS -------------"
	print "ClassEvent non stopwords", non_stopword_fraction(ClassEvent)	
	print "ClassEvent WORD LENGTH DISTRIBUTIONS:"
	print_word_length_distributions(ClassEvent)
	print "ClassEvent PERCENTAGE OF WORD OCCURRENCES:"
	print_percentage_of_word_in_collection(ClassEvent, CEWords)
	
	ClassEventLettersPerWord = average_letters_per_word(ClassEvent)
	ClassEventWordsPerSent = float(len(wordlists.words())) / len(wordlists.sents())
	ClassEventARI = (4.71 * ClassEventLettersPerWord) + (0.5 * \
		ClassEventWordsPerSent) - 21.43
	
	print "Average number of letters per word", ClassEventLettersPerWord
	print "Average number of words per sentence:", ClassEventWordsPerSent
	print "Automated Readability Index:", ClassEventARI


	print 

	wordlists_event = PlaintextCorpusReader(corpus_root, "Texas_Wild_Fire/.*\.txt")
	wordlists_event.fileids()
	YourSmall = nltk.Text(wordlists_event.words())
	SmallEventWords = ["Fire", "Wildfire", "Water", "Damage", "Ground", "Burn", 
		"Town", "Heat", "Wind", "Speed", "Size", "City", "People", "Home",
		"Weather", "Debris", "Death", "Smoke", "State", "Ash"]
	

	# YourSmall statistics
	print "--------- YOUR SMALL STATISTICS --------------"
	print "Texas_Wild_Fire", non_stopword_fraction(YourSmall)
	print "YourSmall WORD LENGTH DISTRIBUTIONS:"
	print_word_length_distributions(YourSmall)
	print "YourSmall PERCENTAGE OF WORD OCCURRENCES:"
	print_percentage_of_word_in_collection(YourSmall, SmallEventWords)
	
	YourSmallLettersPerWord = average_letters_per_word(YourSmall)
	YourSmallWordsPerSent = float(len(wordlists_event.words())) / \
		len(wordlists_event.sents())
	YourSmallARI = (4.71 * YourSmallLettersPerWord) + (0.5 * \
		YourSmallWordsPerSent) - 21.43

	print "Average number of letters per word", YourSmallLettersPerWord
	print "Average number of words per sentence:", YourSmallWordsPerSent
	print "Automated Readability Index", YourSmallARI
Example #2
File: q1.py Project: cmstewart/galv
def textinfo(path):
    """
    Takes a file path and returns figures about the text file contained therein.
    """
    
    from nltk.corpus import PlaintextCorpusReader
    from nltk import FreqDist
    corpusReader = PlaintextCorpusReader(path, '.*')  # build the reader from the given path

    print "Total word count:", len([word for sentence in corpusReader.sents() for word in sentence])
    print "Unique words:", len(set(corpusReader.words()))
    print "Sentences:", len(corpusReader.sents())
    print "Average sentence length in words:", (len([word for sentence in corpusReader.sents() for word in sentence]) / len(corpusReader.sents()))
Example #3
def stats(request):
    errors = []
    statistics=[]
    if 'q' in request.GET:
        q = request.GET['q']
        if not q:
            errors.append('Enter a Canto Number')
        else:
           cantoname = "canto"+q+".txt"
           w=PlaintextCorpusReader("./",cantoname);
           w.words();
           t=nltk.text.Text(w.words());
           l_lines=len(line_tokenize(w.raw()))
           l_uwords=len(set(w.words()))
           l_words=len(w.words())
           l_sents=len(w.sents())
           l_paras=len(w.paras())
           l_linperpara=l_lines/l_paras
           statistics.append("Number of Words - "+ str(l_words))
           statistics.append("Number of Unique Words - "+ str(l_uwords))
           statistics.append("Number of Setences - "+ str(l_sents))
           statistics.append("Number of Lines - "+ str(l_lines))
           statistics.append("Number of Paras - "+ str(l_paras))
           statistics.append("Number of Lines/Paras - "+ str(l_linperpara))
           lexical_density=l_words/l_uwords
           l_wordpersent = l_words/l_sents
           statistics.append("Lexical Density (Total/Uniq) words- "+ str(lexical_density))
           statistics.append("Words per sentence - "+ str(l_wordpersent))
           return render_to_response('stats.html', {'statistics':statistics})
    return render_to_response('stats.html', {'errors': errors})
def get_coarse_level_features(dataset, output_file):
	# Import the corpus reader
	corpus_root = '/home1/c/cis530/data-hw2/'+dataset
	# Define the folder where the files are situated
	files_dataset = PlaintextCorpusReader(corpus_root, '.*')
	# Open the output_file
	output = open('/home1/c/cis530/data-hw2/'+output_file,'w')
	# Read the stopwlist
	stop_list = open('/home1/c/cis530/data-hw2/'+'stopwlist.txt').read()
	types_stop_list=stop_list.split()
	for fileid in files_dataset.fileids():
		# Output the docid
		output.write(dataset+'/'+fileid+' ')
		# Output the topic_name
		topic_name=fileid.split('/')[0]	
		output.write(topic_name+' ')
		# Output the num_tokens	
		tokens=files_dataset.words(fileid)
		output.write('tok:'+str(len(tokens))+' ')
		# Output the num_types
		types=set(tokens)
		output.write('typ:'+str(len(types))+' ')
		# Output the num_contents
		output.write('con:'+str(len([w for w in tokens if w not in types_stop_list]))+' ')
		# Output the num_sents
		sents = files_dataset.sents(fileid)
		output.write('sen:'+str(len(sents))+' ')
		# Output the avg_slen
		avg_slen=round(float(len(tokens))/float(len(sents)),2)
		output.write('len:'+str(avg_slen)+' ')
		# Output the num_caps
		output.write('cap:'+str(len([w for w in tokens if w[0]>='A' and w[0]<='Z'])))
		output.write('\n')
	output.close()
Example #5
def compare(request):
    errors = []
    stats=[]
    for x in range(1,3):
           statistics=[]
           cantoname = "canto"+str(x)+".txt"
           w=PlaintextCorpusReader("./",cantoname);
           w.words();
           t=nltk.text.Text(w.words());
           l_lines=len(line_tokenize(w.raw()))
           l_uwords=len(set(w.words()))
           l_words=len(w.words())
           l_sents=len(w.sents())
           l_paras=len(w.paras())
           l_linperpara=l_lines/l_paras
           statistics.append(x)
           statistics.append("Number of Words - "+ str(l_words))
           statistics.append("Number of Unique Words - "+ str(l_uwords))
           statistics.append("Number of Setences - "+ str(l_sents))
           statistics.append("Number of Lines - "+ str(l_lines))
           statistics.append("Number of Paras - "+ str(l_paras))
           statistics.append("Number of Lines/Paras - "+ str(l_linperpara))
           lexical_density=l_words/l_uwords
           l_wordpersent = l_words/l_sents
           statistics.append("Lexical Density (Total/Uniq) words- "+ str(lexical_density))
           statistics.append("Words per sentence - "+ str(l_wordpersent))
           stats.append(statistics)
           
    return render_to_response('compare.html', {'stats': stats})
Example #6
def main():
    corpus_root = '../posts/'
    newcorpus = PlaintextCorpusReader(corpus_root, '.*',
                                      para_block_reader=read_block_no_metadata)
    corpus_words = [w.lower() for w in newcorpus.words() if w.isalpha()]
    corpus_sentences = newcorpus.sents()
    analyst = TextAnalyst(corpus_words, corpus_sentences, 'french')
    analyst.print_analyze()
Example #7
def extractPossibleTerms(root, fileids):
    # get corpus
    #root, filename = os.path.split(path)
    reader = PlaintextCorpusReader(root, fileids)
    # get chunker
    grammar = 'NP: {<JJ>*<NNP>*<NN>*}'
    chunker = RegexpParser(grammar)
    # get terms
    terms = set()
    print len(reader.sents())
    i = 0
    for sent in reader.sents():
        i += 1
        if i%100==0:
            print i
        tree = chunker.parse(pos_tag(sent))
        for t in tree.subtrees(lambda t: t.node!='S'): # exclude Sentence node
            terms.add(' '.join([el[0] for el in t]))
    return terms
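
# A hypothetical call (a sketch; the directory and file pattern are placeholders,
# not from the original):
terms = extractPossibleTerms('/path/to/corpus', r'.*\.txt')
print sorted(terms)[:20]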
def get_coarse_level_features(dataset, output_file):
# accessing the corpus
    corpus_root = '/home1/c/cis530/data-hw2/' 
    dataset_path = corpus_root + dataset

# Reading the files from the directories
    files = PlaintextCorpusReader(dataset_path, '.*')
    ids = files.fileids()
    stopFile = PlaintextCorpusReader(corpus_root, 'stopwlist.txt')
    stops = stopFile.words()

#Opening a file that has to be written to
    out = open(output_file, 'w')

    for i in range(0, len(ids)):  # iterate over every file in the dataset
#Initializing certain variables
        tokens_count=0
        types = 0
        non_stops_count=0
        sents_count = 0
        avg_sent_len=0
        cap_count = 0

        tokens=files.words(ids[i])
#Computing Number of Tokens
        tokens_count = len(tokens)

#Computing Number of types
        types = len(set(tokens))
        non_stops=[]

#Computing Number of Content Words
        for t in tokens:
            if t not in stops:
                non_stops.append(t)
        non_stops_count = len(non_stops)

#Finding Average Sentence Length
        sent = []
        sent = files.sents(ids[i])
        sents_count = len(sent)
        sent_len=0
        for s in sent:
            sent_len = sent_len + len(s)
        avg_sent_len = sent_len/float(sents_count)

#Computing Number of Captilized Words
        for c in non_stops:
            if c.istitle():
                cap_count = cap_count+1
        current_file = dataset + '/' + ids[i]
        e = current_file.split('/')
        out.write(current_file + ' ' + e[-2] + ' tok:' + str(tokens_count) +
                  ' typ:' + str(types) + ' con:' + str(non_stops_count) +
                  ' sen:' + str(sents_count) + ' len:' + str(avg_sent_len) +
                  ' cap:' + str(cap_count) + '\n')
        out.flush()
def train():

   wordlists = PlaintextCorpusReader('', file_path)

   st = stemmer()
   
   # Get blocks of text using NLTK
   words = wordlists.words(file_path)
   sents = wordlists.sents(file_path)
   paras = wordlists.paras(file_path)

   # LOGIC
   #       If a sentence contains a known [posi/nega]tive word, count the instances of words in that sentence as 
   #       [posi/nega]tive

   # Count words
   word_features = []

   # Go through paragraphs
   for p in paras:

      # Classify S
      score_positive_negative = 0
      for s in p:
         for word in s:

            word = st.stem(word)

            if word in words_positive:
               score_positive_negative += 1
            elif word in words_negative:
               score_positive_negative -= 1
   
      # Record class of paragraph for any words present
      for s in p:
         for word in s:

            word = st.stem(word)

            if score_positive_negative > 0:
               word_features.append( ({"word": word}, "+") )
            elif score_positive_negative < 0:
               word_features.append( ({"word": word}, "-") )
            else:
               word_features.append( ({"word": word}, " ") )

   # Create and return classifier
   classifier = nltk.NaiveBayesClassifier.train(word_features)
   return classifier
def main():

   st = stemmer()

   # Get data
   wordlists = PlaintextCorpusReader('', file_path)
   words = wordlists.words(file_path)
   sents = wordlists.sents(file_path)
   paras = wordlists.paras(file_path)

   # Train
   classifier = train()

   # Get class probabilities (for MAP estimation)
   counts = {"P":0, "-":0, "N":0}
   for i in range(0,len(paras)):
      for s in paras[i]:

         score_pos = 0
         score_neg = 0

         # Classify paragraph
         for word in s:

            word = st.stem(word)

            feature = {"word":word}
            classified = classifier.classify(feature)

            if classified == "+":
               score_pos += 1
            elif classified == "-":
               score_neg += 1

         # Record result
         if score_pos > score_neg:
            counts["P"] += 1
         elif score_pos < score_neg:
            counts["N"] += 1
         else:
            counts["-"] += 1

   # Done!
   print counts
    def classifyByYear(self) :
        corpusReader = PlaintextCorpusReader(self.txtDirectory, ".*.txt", encoding = self.codec)

        for journal in corpusReader.fileids() :
            print ("Start " + journal)

            sentList = corpusReader.sents(journal)

            for sent in sentList :
                getMonth = False
                getDOI = False

                line = ''.join(sent)

                if self.doiURLTypes[0] in line :
                    getDOI = True
                    self._extractYearByDOI(self.doiURLTypes[0], journal, line)
                    break
                elif self.doiURLTypes[1] in line :
                    getDOI = True
                    self._extractYearByDOI(self.doiURLTypes[1], journal, line)
                    break

                for word in sent :
                    if getMonth :
                        self._extractYearByMonth(journal, word)
                        break

                    if word.lower() in self.dictMonth :
                        getMonth = True

                if getMonth :
                    getMonth = False
                    break
                elif getDOI :
                    getDOI = False
                    break

            print ("End " + journal)

        print (str(self.yearDirectoryList))
Example #12
def get_sentences_for_text(corpus_root, filename, lang="english"):
    """Segments the given text into sentences.

  Args:
    corpus_root: Directory in which the text file is residing.
    filename: Name of the text file.
    lang: Tokenizer language. For possible values, look at:
    ${NLTK_DATA}/tokenizers/punkt

  Returns:
    Sentences in the given text. 

  """
    tokenizer_path = "tokenizers/punkt/" + lang + ".pickle"
    text = PlaintextCorpusReader(
        corpus_root,
        [filename],
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=nltk.data.LazyLoader(tokenizer_path),
    )
    return text.sents()
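
# A hypothetical call (a sketch; the directory and filename are placeholders,
# not from the original):
for sentence in get_sentences_for_text("/data/corpus", "article.txt"):
    print(" ".join(sentence))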
Example #13
def network(chapter):
	if(chapter == 0):
		NEs = open("finalNEs/finalNEs.txt").read().split('\n')
		text_raw = open("ofk.txt").read()
	else:
		NEs = open("finalNEs/finalNEs_ch" + str(chapter) + ".txt").read().split('\n')
		text_raw = open("ofk_ch" + str(chapter) + ".txt").read()
	result = [dict(name="", relations=[""])]
	for NE in NEs:
		result.append(dict(name=NE, relations=[""]))

	# The next line is needed because of the extra blank list elements at the beginning and end (Beginning I added, end added from newlines in finalNEs.txt)
	result = result[1:len(result)-1]
	corpus = PlaintextCorpusReader('.', 'ofk\.txt')
	sentences = corpus.sents()
	for x in range(len(sentences)):
		for NEdict in result:
			if NEdict["name"] in sentences[x]:
	# 			# We are in a sentence with a named entity
				for n in result:
					if n["name"] in sentences[x] and n["name"] != NEdict["name"]:
						NEdict["relations"].append(n["name"])
	for NEdict in result:
		NEdict["relations"] = Set(NEdict["relations"][1:])
	final = [dict(name=r["name"], imports=list(r["relations"]), url=r["name"]+".html") for r in result]
	for finals in final:
		with open("../webpage/" + finals["name"] + ".html", "w") as f1:
			with open("part1.html") as f:
				for line in f:
					f1.write(line)
				f1.write(finals["name"])
			with open("part2.html") as f:
				for line in f:
					f1.write(line)
				f1.write("\tmain(\"data/" + finals["name"] + ".json" + "\");\n</script>")

	with open("../webpage/data/edgeBundle.json",'w') as outfile:
		json.dump(final,outfile, sort_keys = True, indent = 4, ensure_ascii=False)
Example #14
def build_graph(folder, file_pattern):
    corpus_root = os.getcwd() + "/" + folder
    print "Membuka korpus " + folder + " ..."
    word_lists = PlaintextCorpusReader(corpus_root, file_pattern)

    naskah = word_lists.sents()
    filelists = word_lists.fileids()
    teks = tokenize.sent_tokenize(word_lists.raw(fileids=filelists))

    print folder + " memiliki " + str(len(teks)) + ", " + str(len(naskah)) + " kalimat."

    G_result = nx.Graph()
    print "Membangun graf " + folder + " ..."
    for kalimat in naskah:
        kata = kalimat[0]
        prevToken = kata.lower()
        for idx in range(1, len(kalimat)):
            kata = kalimat[idx]
            token = kata.lower()
            if containsLetter(token) and containsLetter(prevToken):
                G_result.add_edge(prevToken, token)
                prevToken = token

    return G_result
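
# A hypothetical call (a sketch; the folder name and file pattern are placeholders,
# not from the original):
G = build_graph("naskah", r".*\.txt")
print "Graph: " + str(G.number_of_nodes()) + " nodes, " + str(G.number_of_edges()) + " edges"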
jacor = PlaintextCorpusReader(corpus_root, 'J.*txt')

# [B3] Print out some basic specs of the two corpora. First off, # of files.
# YOUR CODE BELOW.
print('The length of the Bulgarian essay corpus: ' + str(len(bucor.fileids())))
print('The length of the Japanese essay corpus: ' + str(len(jacor.fileids())))
print()

# [B4] Now, print total # of sentences and # of words.
# YOUR CODE BELOW.
print('The number of words in the Bulgarian essay corpus: ' +
      str(len(bucor.words())))
print('The number of words in the Japanese essay corpus: ' +
      str(len(jacor.words())))
print('The number of sentences in the Bulgarian essay corpus: ' +
      str(len(bucor.sents())))
print('The number of sentences in the Japanese essay corpus: ' +
      str(len(jacor.sents())))

# ------------------------------------------------------------------------
#                                                    BUILDING DATA OBJECTS
print("...Building data objects...")
# ------------------------------------------------------------------------

# [C1] Build lowercased token lists.
# EDIT THE CODE BELOW.
bu_toks = []
for x in bucor.words():
    bu_toks.append(x.lower())

ja_toks = []

for w in sub_words:
    allwords.append(w.lower())

for w in obj_words:
    allwords.append(w.lower())
allwords=nltk.FreqDist(allwords)
word_features=list(allwords.keys())[:300]

#WORDNET DATASET ARCHIVE
from nltk.corpus import wordnet
from nltk.corpus import PlaintextCorpusReader
corpus_root = r'E:\EIGHTH SEMESTER\PROJECT AND THESIS II\SOFTWARE\WORDNET'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
training_data=wordlists.sents('document.txt')


#INPUT TEXTS
sentence = input("Please enter the sentence: ")
print("The input string: ", sentence)
corpora=sentence
wordtoken_test=word_tokenize(sentence)
wordtoken_train=training_data[0]
print("Tokenization of training data: ",wordtoken_train[0])
print("Tokenization of testing data: ",wordtoken_test[0])


#PARTS OF SPEECH TAGGING OF WORDS
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
Example #17
def read_file(file_path, file_name):

    text = PlaintextCorpusReader(file_path, file_name)
    sents = text.sents()
    words = text.words()
    text_complexity_score(words, sents, file_name)
Example #18
    model_file = str(opts['-i'])

    # Open the file that contains the language model
    f = open(model_file, "rb")

    # Rebuild the object from its byte-string representation
    modelo = pickle.load(f)

    pattern = r'''(?ix)    # set flag to allow verbose regexps
          (?:sr\.|sra\.)
        | (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
        | \w+(?:-\w+)*        # words with optional internal hyphens
        | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
        | \.\.\.            # ellipsis
        | [][.,;"'?():-_`]  # these are separate tokens; includes ]
    '''

    PATH = "./../../Corpus_Language_Modeling"  # Ubicacion del archivo
    FILENAME = "corpus_test.txt"  # Nombre del archivo

    # Load the data
    tokenizer = RegexpTokenizer(pattern)
    corpus = PlaintextCorpusReader(PATH, FILENAME, word_tokenizer=tokenizer)

    sents = corpus.sents()

    print("Perplexity =", modelo.perplexity(sents))

    # Close the file
    f.close()
Example #19
        if w.isalnum():
            string += ' '
        string += w
    return string


print('Welcome to the extractive single-document summarizer')
iterations = 20  # iterations for textrank - should be 10 or more to converge
n = 2  # output n sentences as summary
corpus_root = './articles/'  # news articles to be summarized
wordlists = PlaintextCorpusReader(corpus_root, '.*')
print_pagerank_values = False

for fileid in wordlists.fileids():
    print('Article: ', fileid)
    sents = wordlists.sents(fileid)
    sim_graph_out = []
    size = len(sents)
    s1_count = 0
    print('Computing sentence similarity graph ...')
    # compute similarity of every sentence to all other sentences
    for i in range(0, size):
        s1 = sents[i]
        edges_out = []
        for j in range(0, size):
            s2 = sents[j]
            sim = similarity(s1, s2)
            if sim > 0 and i != j:
                edges_out.append((i, j, sim))
        sim_graph_out.append(edges_out)
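
    # A minimal sketch of the TextRank step that would follow (assumed; the
    # original snippet is cut off here): run `iterations` rounds of a weighted
    # PageRank update over the similarity graph, then print the top-n sentences.
    scores = [1.0 / size] * size
    damping = 0.85
    out_weight = [sum(w for (_, _, w) in edges) for edges in sim_graph_out]
    for _ in range(iterations):
        new_scores = [(1.0 - damping) / size] * size
        for i in range(size):
            if out_weight[i] == 0:
                continue
            for (_, j, w) in sim_graph_out[i]:
                new_scores[j] += damping * scores[i] * w / out_weight[i]
        scores = new_scores
    if print_pagerank_values:
        print(scores)
    top = sorted(range(size), key=lambda k: scores[k], reverse=True)[:n]
    for k in sorted(top):
        print(' '.join(sents[k]))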
Example #20
print '7. Get the frequency of occurrence of the words in the first file of the corpus. Get the frequency of the word \'a\''
ex_7 = freqs.values( )
ex_72 = freqs[ 'a' ]
print '\t', ex_7, '\n\tFreq. of occurrence of \'a\':', ex_72

print '8. Get the number of words that appear only once in the first file of the corpus'
ex_8 = len( [ 1 for p in freqs.keys( ) if freqs[ p ] == 1 ] )
print '\t', ex_8

print '9. Get the most frequent word in the first file of the corpus'
ex_9 = freqs.max( )
print '\t', ex_9

print '10. Load the files "spam.txt", "quijote.txt" and "tirantloblanc.txt" as your own corpus'
corpus_root = 'C:\\Users\\nrikee\\PycharmProjects\\NLTK'

corpus_spam = PlaintextCorpusReader ( corpus_root, 'spam.txt' )
freqs_spam = FreqDist ( corpus_spam.words() )

corpus_quijote = PlaintextCorpusReader ( corpus_root, 'quijote.txt' )
freqs_quijote = FreqDist ( corpus_quijote.words() )

corpus_tirant = PlaintextCorpusReader ( corpus_root, 'tirantloblanc.txt' )
freqs_tirant = FreqDist ( corpus_tirant.words() )
print '\t', '...done.'

print '11. Compute the number of words, the number of distinct words and the number of sentences'
print '\t', 'spam.txt', len( corpus_spam.words() ), len ( freqs_spam.keys() ), len ( corpus_spam.sents() )
print '\t', 'quijote.txt', len( corpus_quijote.words() ), len ( freqs_quijote.keys() ), len ( corpus_quijote.sents() )
print '\t', 'tirantloblanc.txt', len( corpus_tirant.words() ), len ( freqs_tirant.keys() ), len ( corpus_tirant.sents() )
Example #21
words = gutenberg.words("burgess-busterbrown.txt")
words[1:20]

sents = gutenberg.sents("burgess-busterbrown.txt")
sents[1:20]

# Load your own corpus
from nltk.corpus import PlaintextCorpusReader

corpus_root = 'D:/icwb2-data/training'  # corpus directory
wordlists = PlaintextCorpusReader(corpus_root, ['pku_training.utf8', 'cityu_training.utf8', 'msr_training.utf8',
                                                'pku_training.utf8'])
wordlists.fileids()
print(wordlists.raw('pku_training.utf8'))
print(len(wordlists.words('pku_training.utf8')))
print(len(wordlists.sents('pku_training.utf8')))

#### Conditional frequency distributions ####
# Conditions and events
'''
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',...]
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County'),...]
'''

# Word frequencies by genre
from nltk.corpus import brown

cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
Example #22
# fdist1.plot( cumulative = True ) #cumulative plot of the frequency distribution
# fdist2 = FreqDist( text2 ) # create a frequency distribution containing the given samples
# fdist1 |= fdist2 # update fdist1 with counts from fdist2
# fdist1 < fdist2	# test if samples in fdist1 occur less frequently than in fdist2
# print( "===" )
# print( "nltk.corpus.gutenberg.fileids() = ", nltk.corpus.gutenberg.fileids() )
# print( "===" )
# emma = nltk.corpus.gutenberg.words('austen-emma.txt')
# print( "len( emma ) = ", len( emma ) )
# print( "===" )
# from nltk.corpus import udhr
# languages = ['Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
# cfd = nltk.ConditionalFreqDist( ( lang, len( word ) ) for lang in languages for word in udhr.words(lang + '-Latin1'))
# cfd.plot( cumulative = True )
# Figure 1.2: Cumulative Word Length Distributions: Six translations of the Universal Declaration of Human Rights are processed; this graph shows that words having 5 or fewer letters account for about 80% of Ibibio text, 60% of German text, and 25% of Inuktitut text.
# print( "===" )
from nltk.corpus import PlaintextCorpusReader
corpus_root = 'c:\\temp\\DDD3262\\'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
print("===")
print("wordlists.fileids() = ", wordlists.fileids())
print("===")
print("wordlists.words( '3262.txt' ) = ", wordlists.words('3262.txt'))
print("wordlists.words( '3262.txt' )[0:1000] = ",
      wordlists.words('3262.txt')[0:1000])
print("===")
print("wordlists.sents( '3262.txt' ) = ", wordlists.sents('3262.txt'))
print("wordlists.sents( '3262.txt' )[0:300] = ",
      wordlists.sents('3262.txt')[0:300])
print("===")
Example #23
print([fdist[w] for w, f in fdist.most_common()])

# Part 8
print("Number of words that appear only once: ",
      len([w for w, f in fdist.most_common() if fdist[w] == 1]))

# Part 9
print("The most frequent word is", fdist.max())

# Part 10
from nltk.corpus import PlaintextCorpusReader
corpus_root = '.'
wordlists = PlaintextCorpusReader(corpus_root, '.*\.txt')
for element in wordlists.fileids():
    print(element, len(wordlists.words(element)),
          len(set(wordlists.words(element))), len(wordlists.sents(element)))

# EXERCISE 2
from nltk.corpus import brown
from nltk.probability import *

res = []

palabras = ['what', 'when', 'where', 'who', 'why']
for palabra in palabras:
    res.append(palabra)
    lista = []
    for cat in brown.categories():
        pal = FreqDist(brown.words(categories=cat))
        lista.append(cat)
        lista.append(pal[palabra])
Example #24
import grads
import utils
import w2v_sgd
import sampling

#############################################################
#######   Loading the corpus
#############################################################
startToken = '<START>'
endToken = '<END>'

corpus_root = 'JOURNALISM.BG/C-MassMedia'
myCorpus = PlaintextCorpusReader(corpus_root, '.*\.txt')

corpus = [ [startToken] + [w.lower() for w in sent] + [endToken] for sent in myCorpus.sents()]

windowSize = 3
negativesCount = 5
embDim = 50

words, word2ind, freqs = utils.extractDictionary(corpus, limit=20000)
data = utils.extractWordContextPairs(corpus, windowSize, word2ind)

del corpus

U0 = (np.random.rand(len(words), embDim) - 0.5) / embDim
V0 = (np.random.rand(len(words), embDim) - 0.5) / embDim

seq = sampling.createSamplingSequence(freqs)
contextFunction = lambda c: sampling.sampleContext(c, seq, negativesCount)
Example #25
new_corpus = PlaintextCorpusReader(corpus_root, '.*')
lista_ficheros = new_corpus.fileids()
print(
    "\n--------------------------------------------------------------------------------\n "
)
print("\n\n1.10) Ficheros que componen el corpus: \n" + str(lista_ficheros) +
      "\n")
print(
    "\n-------------IMPORTANTE CAMBIAR LA RUTA DE LOS ARCHIVOS EN EL CÓDIGO------------\n "
)
print("Ruta actual : " + corpus_root)

# Compute the number of words, the number of distinct words and the number of sentences of the three documents
print(
    "\n--------------------------------------------------------------------------------\n1.11) "
)
print("\n" + "Palabras".rjust(35, " ") + "Vocabulario".rjust(12, " ") +
      "Frases".rjust(12, " "))
for fichero in lista_ficheros:
    texto1 = new_corpus.words(fichero)
    fdist1 = FreqDist(texto1)
    numPalabras = len(texto1)
    numPalabrasDistintas = len(fdist1.keys())
    numFrases = len(new_corpus.sents(fichero))
    print("Fichero: " + str(fichero).ljust(20, " ") +
          str(numPalabras).ljust(10, " ") +
          str(numPalabrasDistintas).ljust(15, " ") + str(numFrases))
print(
    "\n\n--------------------------------------------------------------------------------\n"
)
Example #26
from nltk import download


from languagemodeling.ngram import NGram, AddOneNGram, InterpolatedNGram


models = {
    'ngram': NGram,
    'addone': AddOneNGram,
    'interpolated': InterpolatedNGram,
}


if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the data
    corpus = PlaintextCorpusReader('../../textos/', 'out.txt')
    train_sents = corpus.sents()[0:int(len(corpus.sents())*0.9)]

    # train the model
    n = int(opts['-n'])
    model_class = models[opts['-m']]
    model = model_class(n, train_sents)

    # save it
    filename = opts['-o']
    f = open(filename, 'wb')
    pickle.dump(model, f)
    f.close()
Example #27
    num_words = len(gutenberg.words(fileid))  # number of words
    num_sents = len(gutenberg.sents(fileid))  # number of sentences
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print(int(num_chars / num_words), int(num_words / num_sents),
          int(num_words / num_vocab), fileid)
# average word length, average sentence length, average occurrences of each word

'Load your own corpus'
from nltk.corpus import PlaintextCorpusReader

corpus_root = 'E:/python shell'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()

ll = wordlists.words('items1.txt')
wordlists.sents('items1.txt')

'Filter stopwords: keep only the words that do not appear in the stopword list'
from nltk.corpus import stopwords

stopwords.words('english')


def content_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')  # not needed if you load your own stopword list
    content = [w for w in text if w.lower() not in stopwords]
    return content


'Stopwords filtered; if needed, a synonym (thesaurus) list could be used as well'
Example #28
    non_terminals.add(s)
    grammar = Grammar(non_terminals, terminals, s)
    # This is only to tell me how advanced the process is
    count = 0.0
    len_fileids = len(model.fileids())

    # Get the tokenized corpus
    tokens_location = location + "tokenized"
    print("getting tokens from: " + tokens_location)
    f = open(tokens_location, 'rb')
    tokens = pickle.load(f)
    f.close()

    # Train the grammar model with a context of -+1
    for fileid in model.fileids():
        spanish_sents = model.sents(fileid)
        print(str((count / len_fileids) * 100) + "%")
        count += 1
        # Between training with the entire corpus or just bits I get a small difference of productions, so it's not worth it
        fro = 0.55 * len(spanish_sents)
        to = 0.6 * len(spanish_sents)
        for sent in spanish_sents[int(fro):int(to)]:
            tokenized_sentence = []
            for word in sent:
                ts = tokens[word]
                tokenized_sentence.append(ts)
                grammar.add_terminal(ts)

            i = 0
            for terminal in tokenized_sentence[:-1]:
                # if this is the longest we have generated so far, we will need new non terminals (All the k = i part is
Example #29
#!/usr/bin/env python
# -*- coding: utf-8

import nltk
from nltk.probability import FreqDist
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import LineTokenizer

OUTPUTFILE = './data/tagged_sent'

pair_str_pos = lambda x : '/'.join(x)

corpus_root = './data'
fileids = 'data_title_sample'

corpus = PlaintextCorpusReader(corpus_root,
    fileids,
    sent_tokenizer=LineTokenizer(),
    encoding='utf-8')

output = open(OUTPUTFILE,'w')

for sent in  corpus.sents() :
  tokens = map(pair_str_pos,nltk.pos_tag(sent))
  sent = ' '.join(tokens)
  output.write(sent+"\n")
Example #30
import re
from nltk.corpus import PlaintextCorpusReader
corpus_root = './texts/'
wordlist = PlaintextCorpusReader(corpus_root, '.*')
print(wordlist.fileids())

#number of words
print(len(wordlist.words('mobydick.txt')))
#number of sentences
print(len(wordlist.sents('mobydick.txt')))

#stores a fileobject into f
f = open("./texts/mobydick.txt")
#stores a string into data from f
data = f.readlines()
#corpus is now a giant string delimited by newline character
corpus = "\n".join(data)
#prints the number of times Ishmael appears in the file.
print(len(re.findall(r"\bIshmael\b", corpus)))
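
# An equivalent count using the corpus reader itself (a sketch, not part of the
# original): a FreqDist over the tokenized words gives the same information
# without re-reading the file.
from nltk import FreqDist
fdist = FreqDist(wordlist.words('mobydick.txt'))
print(fdist['Ishmael'])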
Example #31
    o = open(outp,'w')
    curr = 0
    for sent in sentences:
        times = count_occurences(sent, sent[-1])
        curr = text.find(sent[0], curr)
        end = find_nth(text, sent[-1], times, curr) + len(sent[-1])
        o.write(text[curr:end] + '\n')
        curr = end
    o.close()

def find_nth(string, sub, n, offset):
    start = string.find(sub, offset)
    while start >= 0 and n > 1:
        start = string.find(sub, start+len(sub))
        n -= 1
    return start

def count_occurences(lst, string):
    count = 0
    for item in lst:
        if string in item:
            count += 1
    return count

inp = sys.argv[1]
i = open(inp,'r').read()
corpus = PlaintextCorpusReader(os.path.dirname(inp),os.path.basename(inp))
sents = corpus.sents()
print_out(i, sents)

from nltk import word_tokenize
from nltk.corpus import wordnet as wn
'''Part 1: Open a document and display it on screen'''

corpus_root = '/BUAP/Tareas/EstudioCLaudia/texto-tarea4'
mi_corpus = PlaintextCorpusReader(corpus_root, '.*')
'''
texto = mi_corpus.raw('crimeandpunishment.txt')
print(texto)
'''
'''
Part 2: Split it into sentences and show each sentence numbered
(1 for the first sentence, 2 for the second, etc.);
ask the user which sentence number they want to select'''

oraciones = mi_corpus.sents('crimeandpunishment.txt')


def separar_numerar_oraciones(texto):
    key = 0
    for sent in oraciones:
        key = key + 1
        if sent:
            print(str(key) + ':' + str(sent))


'''the tokens function belongs to part 4'''


def tokens(sent, palabra):
    keyword = 0
Example #33
def textFileToSentList(pathToFileFolder, FileNameWithExtension):
    wordlists = PlaintextCorpusReader(pathToFileFolder, '.*')
    return wordlists.sents(FileNameWithExtension)
Example #34
        while True:
            sent = predict_next_word(sent, model)
            if sent.split(" ")[-1] == "<END>":
                sent = " ".join(sent.split(" ")[:-1])
                break
        print(sent)


if __name__ == '__main__':
    print("Lab 4 Exercise 2")
    corpus_reader = PlaintextCorpusReader(root="./twitter-files",
                                          fileids=".*\.txt",
                                          word_tokenizer=TweetTokenizer())

    # Convert tweets to tri-grams
    tweets = [tweet for tweet in corpus_reader.sents()]
    tweet_trigrams = [
        list(
            ngrams(sequence=tweet,
                   n=3,
                   pad_left=True,
                   pad_right=True,
                   left_pad_symbol="<START>",
                   right_pad_symbol="<END>")) for tweet in tweets
    ]
    all_trigrams = [gram for tweet in tweet_trigrams for gram in tweet]

    # Initialize the language model
    freq_dist = FreqDist(all_trigrams)
    model = KneserNeyProbDist(freq_dist)
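
    # A minimal sketch of the predict_next_word helper called above (assumed;
    # its definition is not part of this excerpt): score every trigram whose
    # first two tokens match the end of the sentence and append the most
    # probable continuation.
    def predict_next_word(sent, model):
        context = tuple(sent.split(" ")[-2:])
        candidates = [(model.prob(tri), tri[2]) for tri in model.samples()
                      if tri[:2] == context]
        if not candidates:
            return sent + " <END>"
        return sent + " " + max(candidates)[1]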
Example #35
print("Freq aparición de la preposición a " + str(fdist['a']))

print("#act8")
print("No de palabras que aparecen una sóla vez: " + str(len(fdist.hapaxes())))

print("#act9")
print("La palabra más frecuente es " + fdist.max())

print("#act10")
dir_path = os.path.dirname(os.path.realpath(__file__))
wordlists = PlaintextCorpusReader(corpus_root + "/library", '.*')
print("#act11")
text = wordlists.words(wordlists.fileids()[0])
fdist = FreqDist(text)
for i in wordlists.fileids():
    text = wordlists.words(i)
    fdist = FreqDist(text)
    print((str(i) + " " + str(len(text)) + " " + str(len(fdist.keys())) + " " +
           str(len(wordlists.sents(i)))))
"""
12. ¿Coinciden estos resultados con los de la práctica anterior? Justifica la respuesta.

Los resultados no coinciden por varias razones. La primera es que en la practica 2 solo se tenian en cuenta
las palabras alphabeticas en cambio NLTK incluye las alphanumericas como '-Fpa-' ó '51_por_ciento". Anteriormente tambien
quitabamos las stopwords de las cuentas. Finalmente el criterio de separación de frases de la práctica 2 eran ".", ";" y "\n\n"
dando como resultado más lineas totales.

"""
#EXERCISE 2

#EXERCISE 3
Example #36
import nltk
from nltk.corpus import PlaintextCorpusReader
corpus_root = '/home/vivkul/Downloads/project'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
# wordlists.fileids()
# wordlists.words('questions.txt')
amrit=wordlists.words('allquestion.txt')
stopwords = nltk.corpus.stopwords.words('english')
from nltk.book import *
fo=open("selectedquestion.txt","wb")
a=wordlists.sents('allquestion.txt')
while(len(amrit)!=0):
	content=[w for w in amrit if w.lower() not in stopwords]
	voc=FreqDist(content)
	# sorted([w for w in set(content) if len(w) > 2 and 4voc[w] > 3])
	# set_voc_0=FreqDist(a[0])
	# set_voc_1=FreqDist(a[1])
	b=voc.keys()
	i=0
	while(i<len(b)):
		if(len(b[i])>2):
			j=i
			max=b[i]
			break
		i=i+1
	q_no=[]
	k=0
	while(k<len(a)):
		set_voc=FreqDist(a[k])
		if(set_voc[max]>0):
			q_no.append(len([w for w in a[k] if w.lower() not in stopwords]))
Example #37
    'нак', 'уна', 'ьа', 'фна', 'наф', 'гна', 'ана', 'иа', 'ща', 'нан', 'ьна',
    'ниа', 'рна', 'пна', 'не', 'цна', 'ныа', 'нва', 'нка', 'ну', 'зна', 'оа',
    'нау', 'нйа', 'наш', 'ена', 'яна', 'нба', 'нт', 'ню', 'н', 'нэ', 'нжа',
    'нла', 'нпа', 'но', 'яа', 'нна', 'нж', 'еа', 'нав', 'нац', 'нса', 'нщ',
    'нас', 'жна', 'нал', 'нц', 'нр', 'ина', 'лна', 'па', 'нп', 'нф', 'нс',
    'нащ', 'та', 'чна', 'нча', 'дна', 'йна', 'уа', 'нат', 'нв', 'нач', '-на',
    'ка', 'сна', 'нк', 'нма', 'жа', 'наь', 'нч', 'хна', 'ная', 'ны', 'н ',
    'наж', 'за', 'йа', 'ла'
}

print('Reading the text corpus...')
corpus_root = 'JOURNALISM.BG/C-MassMedia'
myCorpus = PlaintextCorpusReader(corpus_root, '.*\.txt')
fullSentCorpus = [[model.startToken] + [w.lower()
                                        for w in sent] + [model.endToken]
                  for sent in myCorpus.sents()]
print('Done.')

print('Training a Markov language model...')
M2 = model.MarkovModel(fullSentCorpus, 2)
print('Done.')

#############################################################################
#### Start of the tests
#### WARNING! These tests are superficial and passing them is only a precondition for acceptance; it does not necessarily mean your program will be accepted. For the assignment to be accepted, your program will be subjected to a more thorough series of tests.
#############################################################################

#### Test of editDistance
try:
    for s1, s2, d in zip(L1, L2, C):
        signal.alarm(60)
Example #38
pos_tokens = pos_tag(convote_training.tokenized())
prep_tokens = []

for (word, pos) in pos_tokens:
    if(pos == 'IN'):
        prep_tokens.append(word + '|' + pos)
    else:
        prep_tokens.append(pos)
        
trigram = nltk.trigrams(prep_tokens)
trigram_file = open('../data/pos_trigrams', 'w')

pickle.dump(bigram, outfile)
pickle.dump(trigram, trigram_file)
    
for sents in convote_test.sents():
    for index in range(0, len(sents)):
        if sents[index] == 'in':
            temp = deepcopy(sents)
            temp[index] = '*'
            in_test.append(temp)
        if sents[index] == 'on':
            temp = deepcopy(sents)
            temp[index] = "*"
            on_test.append(temp)
        if sents[index] == 'of':
            temp = deepcopy(sents)
            temp[index] = "*"
            of_test.append(temp)

for sents in convote_dev.sents():
Example #39
import nltk
import numpy as np
from nltk.corpus import PlaintextCorpusReader

vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

reader = PlaintextCorpusReader('./data/',
                               'reddit-comments-2015-08.csv',
                               encoding='utf-8')
sentences = reader.sents()
words = reader.words()
tokenized_sentences = [[sentence_start_token] + sent + [sentence_end_token]
                       for sent in sentences]

word_freq = nltk.FreqDist(words)
word_freq.plot(30)
print("Unique words", len(word_freq))

vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])

print("Least freq word in vocab is:", f'"{vocab[-1][0]}"', "and it appeared",
      vocab[-1][1], "times")

for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = ([
Example #40
corpus_root = '/DTU/MSc/Code/Data'
wordlists = PlaintextCorpusReader(corpus_root, [
    'food_holiday_inn_london.txt.data', 'food_swissotel_chicago.txt.data',
    'room_holiday_inn_london.txt.data', 'rooms_swissotel_chicago.txt.data',
    'rooms_swissotel_chicago.txt.data',
    'service_bestwestern_hotel_sfo.txt.data',
    'service_holiday_inn_london.txt.data',
    'service_swissotel_hotel_chicago.txt.data',
    'staff_bestwestern_hotel_sfo.txt.data', 'staff_swissotel_chicago.txt.data'
])
wsj = nltk.corpus.treebank

print wordlists.fileids()
print wsj.fileids()

print(len(wordlists.sents()))
senLengths1 = [len(s) for s in wordlists.sents()]
freqDist1 = nltk.FreqDist(senLengths1)

print(len(wsj.sents()))
senLengths2 = [len(s) for s in wsj.sents()]
freqDist2 = nltk.FreqDist(senLengths2)

propDist1 = nltk.DictionaryProbDist(freqDist1, normalize=True)
propDist2 = nltk.DictionaryProbDist(freqDist2, normalize=True)

myfile = open('../Thesis/wsjdist.dat', 'wb')
wr = csv.writer(myfile, quoting=csv.QUOTE_NONE)

wr.writerow(["x", "y1", "y2"])
Example #41
fileid = 'neg/cv956_12547.txt'
text = movie_reviews.raw(fileid)
text1= movie_reviews.raw(categories='neg')
movie_reviews.categories(fileid)


#Frequency distribution by creating our own corpus

from nltk.corpus import PlaintextCorpusReader
corpus_root = 'C:/Users/ITRAIN-12/Desktop/Day 2'  # directory that contains gaming.txt
my_corpus = PlaintextCorpusReader(corpus_root, '.*')
fileid = 'gaming.txt'
text = my_corpus.raw(fileid)
text
my_corpus.raw(fileid)
my_corpus.words(fileid)
my_corpus.sents(fileid)
distr = nltk.FreqDist(text)
print(distr.most_common(5))


#Reuters
from nltk.corpus import reuters
fileid='training/9865'
text=reuters.raw(fileid)
text

#Load reuters category news
from nltk.corpus import reuters
reuters.fileids()
reuters.categories()
fileid = 'test/16399'
Example #42
yoursmall_wordlists = PlaintextCorpusReader(corpus_root, '.*')
classevent_wordlists = PlaintextCorpusReader(classevent_corpus_root, '.*')

# The next step is to show the file names under the directory (optional step)
lemmer = WordNetLemmatizer()

def clean_words(words):
    words = [w.lower() for w in words if w.isalnum()]
    return words

def clean_sents(sents):
    return [clean_words(sent) for sent in sents]

    
classevent_words = clean_words(classevent_wordlists.words())
classevent_sents = clean_sents(classevent_wordlists.sents())
classevent_words = [w.lower() for w in classevent_words if w.isalnum()]
classevent_words = nltk.Text(classevent_words)
classevent_words_lem = [lemmer.lemmatize(w) for w in classevent_words]
print "ClassEvent loaded"

yoursmall_words = yoursmall_wordlists.words()
yoursmall_sents = yoursmall_wordlists.sents()
yoursmall_words = [w.lower() for w in yoursmall_words if w.isalnum()]
yoursmall_words = nltk.Text(yoursmall_words)
yoursmall_words_lem = [lemmer.lemmatize(w) for w in yoursmall_words]
print "YourSmall loaded"

yourwords = ['earthquake', 'seismic', 'aftershocks', 'quake', 'damage', 'magnitude', 'tremor', 'richter', 'epicenter', 'depth', 'fault', 'hypocenter', 'focus', 'dead', 'casualties', 'structural', 'seismometer', 'temblor', 'hazard', 'impact']
yourwords_lem = [lemmer.lemmatize(w.lower()) for w in yourwords]
Example #43
# sents_no=[]
# pmids=[]
# i=0
# for index,row in mydf.iterrows():
#     stence=row['text']
#     sents=stence.split('.')
#     newsents=[]
#     for sent in sents:
#         newsents.append(sent+'.')
#         i=i+1
#         sents_no.append(i)
#         pmids.append(row['pmid'])
#
#     # result= re.findall(r"<category=\".`+?\">(.+?)</category>",stence, re.S)
#     # print(result)
# print(sents_no)
# print(pmids)
s = 'you are my shine.'
char_list = list(s)
items = []
for c in char_list:
    items.append({'str': s, "obj": "ss"})
str2 = " ".join(str(d) for d in items)  # join string representations; joining dicts directly raises TypeError
print(str2)
wodslist = PlaintextCorpusReader(cr, '.*')

for i in wodslist.sents('NCBI_corpus_training.txt'):
    text=nltk.word_tokenize(' '.join(i))
    nltk.wordpunct_tokenize
    print(i)
    # print(nltk.pos_tag(text, tagset='universal'))
Example #44
def nltk_corpora():
    ## 1. PROJECT GUTENBERG << Formal Language - Literature;ebooks 60K++
    emma = nltk.corpus.gutenberg.words("austen-emma.txt")
    emma = nltk.Text(emma)

    len(emma)
    lexical_diversity(emma)

    emma.concordance("brave")
    emma.collocation_list()

    ## traits of the corpus text for each
    def corp_content(corporad):
        print(
            "{0} File {0} \t\tWord len   Sent len   Vocab   Lexical Complexity"
            .format(" " * 6))
        print("{}".format("-" * 100))
        for i, txt in enumerate(corporad.fileids()):
            sents_l = len(corporad.words(txt))
            try:
                sents_l = len(corporad.sents(txt))
            except:
                sents_l = len(corporad.posts(txt))
            w_len = round(len(corporad.raw(txt)) / len(corporad.words(txt)))
            s_len = round(len(corporad.words(txt)) / sents_l)
            voc = len(set(w.lower() for w in corporad.words(txt)))
            # lexp = round( voc / len( [w.lower() for w in gutenberg.words(txt)] ) * 100 )
            lexp = round(voc / len(corporad.words(txt)) * 100)
            print("{}. {} \t\t{}\t{}\t{}\t{}%\t{}".format(
                i, txt, w_len, s_len, voc, lexp,
                corporad.raw(txt)[:30]))
            # print( "{}. {} \t\t{}\t{}\t{}\t{}%\t{}".format(i, txt, w_len, s_len, voc, lexp, corporad.words(txt)[:5] ) )

    # 1. Formal Language - Project Gutenberg ebooks 60K++, 16+ languages
    corp_content(gutenberg)

    # 2. Informal Language - Web content and Chat rooms
    corp_content(webtext)
    corp_content(nps_chat)

    # 3. Brown Corpus - 15+ Multi-genre, 500+ sources, En_lang << http://icame.uib.no/brown/bcm-los.html
    # for studying systematic differences between genres I.E. stylistics
    corp_content(brown)

    brown.categories()
    brown.words(categories="news")
    brown.words(categories=["news", "editorial", "reviews"])

    # example stylistics - modal verbs usage between genres
    def modalz(modals):
        print("\tCategory\t", end=" ")
        for m in modals:
            print("\t{}".format(m), end=" ")
        print("\n" + "-" * 100)
        for i, cat in enumerate(brown.categories()):
            print("{}.{}\t\t".format(i, cat), end=" ")
            fdist = nltk.FreqDist(w.lower()
                                  for w in brown.words(categories=cat))
            for m in modals:
                print("\t{}".format(fdist[m]), end=" ")
            print("")

    modalz(["can", "could", "may", "might", "must", "will"])
    modalz(["should", "ought", "would", "could", "do", "did", "does"])
    modalz(["what", "when", "where", "why", "who"])

    ## ditto using nltk conditional frequency distributions
    cfdist = nltk.ConditionalFreqDist(
        (genre, word) for genre in brown.categories()
        for word in brown.words(categories=genre))

    genz = ["news", "religion", "hobbies", "humor", "romance"]
    modz = ["can", "could", "may", "might", "must", "will"]
    cfdist.tabulate(conditions=genz, samples=modz)

    # 4. Reuters Corpus - news articles, 90 topics, grouped into training and testing sets
    # << Apparent goal is to predict the category/topic of a given article??
    corp_content(reuters)
    # retrieve topic(s) of a given article
    reuters.categories("training/9865")
    reuters.categories(["training/9865", "training/9880"])
    # find articles that cover some topic(s)
    reuters.fileids("barley")
    reuters.fileids(["barley", "corn"])

    # the first words are in all CAPs and are the titles of the article. The rest is the story text
    for i, txt in enumerate(reuters.fileids(["barley", "oil"])):
        print("{}. {}\t{}".format(i, txt, reuters.words(txt)[:10]))

    # 5. Speeches - Inaugral Address Corpus << 55 USA Presidential addresses
    # << interesting in that there's a time horizon element from 1789  to 2009 (first 4 xters of fileid = year) ; can study how language changes with time; could reflect on priorities, culture, ???
    corp_content(inaugural)
    # how America and Citizen are used over time
    cfdist = nltk.ConditionalFreqDist((target, fileid[:4])
                                      for fileid in inaugural.fileids()
                                      for w in inaugural.words(fileid)
                                      for target in ['america', 'citizen']
                                      if w.lower().startswith(target))
    cfdist.plot()

    # 6. Annotated Text Corpora
    # annotations: POS, named entities, syntatic structures, semantic roles,

    # 7. Other Languages Corpora
    # includes udhr = Universal Declaration of Human Rights in over 300 languages

    # word length freq by diff languages
    langz = [
        "English", "Chickasaw", "German_Deutsch", "Kinyarwanda",
        "Swahili_Kiswahili"
    ]
    cfdist = nltk.ConditionalFreqDist((lang, len(word)) for lang in langz
                                      for word in udhr.words(lang + "-Latin1"))
    cfdist.plot()
    cfdist.plot(cumulative=True)

    # alphabet freq
    nltk.FreqDist(udhr.raw("Kinyarwanda-Latin1")).plot()

    # 8. Loading your own Corpora
    # << txt files. Use PlaintextCorpusReader. Check dir location
    #
    my_corpus = PlaintextCorpusReader(
        "root_dir_path_here", ".*"
    )  # second param is a list of fileids defined as a list or an ls pattern
    eg_corpus = PlaintextCorpusReader(
        "D:/zRepoz/dataSaysWhat/DocReader/res/txt_corpus", ".txt")
    eg_corpus.fileids()
    eg_corpus.words("example1.txt")
    len(eg_corpus.sents())

    #BracketParseCorpusReader
    my_corpus = nltk.corpus.BracketParseCorpusReader("path", "file_pattern")
Example #45
classevent_wordlists = PlaintextCorpusReader(classevent_corpus_root, '.*')
big_wordlists = PlaintextCorpusReader(big_corpus_root, '.*')

# The next step is to show the file names under the directory (optional step)
lemmer = WordNetLemmatizer()

def clean_words(words):
    words = [w.lower() for w in words if w.isalnum()]
    return words

def clean_sents(sents):
    return [clean_words(sent) for sent in sents]

    
classevent_words = classevent_wordlists.words()
classevent_sents = classevent_wordlists.sents()
classevent_words = [w.lower() for w in classevent_words if w.isalnum()]
classevent_words = nltk.Text(classevent_words)
classevent_words_lem = [lemmer.lemmatize(w) for w in classevent_words]
print "ClassEvent loaded"

yoursmall_words = yoursmall_wordlists.words()
yoursmall_sents = yoursmall_wordlists.sents()
yoursmall_words = [str(w).lower() for w in yoursmall_words if w.isalnum()]
yoursmall_words = nltk.Text(yoursmall_words)
yoursmall_words_lem = [lemmer.lemmatize(w) for w in yoursmall_words]
print "YourSmall loaded"

big_words = big_wordlists.words()
big_sents = big_wordlists.sents()
big_words = [w.lower() for w in big_words if w.isalnum()]
Example #46
###
#Section: training Naive Bayes
###

print("Lese neutralen Korpus")
neutralCorp = ConllCorpusReader(
    '.',
    'corpora/tiger_release_aug07.corrected.16012013.conll09',
    ['ignore', 'words', 'ignore', 'ignore', 'pos'],
    encoding='utf-8')

print("Lese Ingvar-Korpus")
ingvarCorp = PlaintextCorpusReader(".", "texts/latest.txt")

print("Generiere Wortlisten")
ingvarSentencesLong = ingvarCorp.sents()
neutralSentencesLong = neutralCorp.sents()

smallerSet = min(len(ingvarSentencesLong), len(neutralSentencesLong))

ingvarSentences = ingvarSentencesLong[:smallerSet]
neutralSentences = neutralSentencesLong[:smallerSet]

print(f'Number of sentences limited to the smaller set with {smallerSet} sentences')

print("Generating features")
ingFeats = [(word_feats(f), 'ing') for f in ingvarSentences]
neutFeats = [(word_feats(f), 'neu') for f in neutralSentences]

print("Generating cutoff")
ingCutoff = int(len(ingFeats) * 0.9)
Example #47
    def extract_data(self, filepath, ind_features=_PARAIND_FEAT, dep_features=_PARADEP_FEAT, labels_per_sent=None, labels_per_window=None):
        """Extract features, reduce dimensions with a PCA and return data.

        Exports raw- and PCA-reduced data both in arff- and numpy-format.
        """
        start = time.clock()
        self.dictVectorizer = DictVectorizer(sparse=False)
        filename = os.path.split(filepath)[1]
        directory = os.path.split(filepath)[0]
        plain_reader = PlaintextCorpusReader(
            directory, 
            [filename],
            word_tokenizer=RegexpTokenizer("(-?\d+\.\d+)|[\w']+|["+string.punctuation+"]"),
            sent_tokenizer=LineTokenizer(blanklines="discard"),
            encoding='utf8')

        # create new subdir for extracted data
        if _NEW_SUBDIR is not None:
            path = os.path.join(directory, _NEW_SUBDIR)
            if not os.path.exists(path):
                os.makedirs(path)
            path = os.path.join(path, os.path.splitext(filename)[0])            
            # print "path {}".format(path)
        else:
            path = os.path.splitext(filepath)[0]
            # print "path {}".format(path)

        # filepaths for weka- and numpy-files
        arff_filepath = path + ".arff"
        arff_filepath_pca = path + "_pca95.arff"
        numpy_filepath = path + ".npy"
        numpy_filepath_pca = path + "_pca95.npy"
        
        # print(":time: Reader created, time elapsed {}").format(time.clock() - start)
        paras = plain_reader.paras()
        # print(":time: Paras created, time elapsed {}").format(time.clock() - start)
        sents = plain_reader.sents()
        # print(":time: Sents created, time elapsed {}").format(time.clock() - start)

        # get paragraph boundaries for sliding-window
        self.boundaries = util.get_boundaries(paras)
        boundaries_backup = self.boundaries

        # check if all files necessary exist, if yes - unpickle/load them and return data
        if util.files_already_exist([numpy_filepath_pca,]):
            print "Features already extracted. Calculating clusters...\n"
            matrix_sklearn_pca = numpy.load(numpy_filepath_pca)
            return filepath, self.boundaries, matrix_sklearn_pca, len(sents)

        # save correct target-labels and additional info of current data
        targets_path = open(path + ".tbs", "wb")
        pickle.dump((labels_per_sent, labels_per_window, boundaries_backup, len(sents), _WINDOW_SIZE, _STEP_SIZE), targets_path)

        # print(":time: Boundaries calculated, time elapsed {}").format(time.clock() - start)
        self.data = self.extract_features(sents, _WINDOW_SIZE, _STEP_SIZE, ind_features, dep_features)
        # self.data[year] = self.extract_features_para(paras, ind_features, dep_features)
        # print(":time: Features extracted, time elapsed {}").format(time.clock() - start)
        self.all_features = self.unified_features(self.data)
        # print(":time: Unified features, time elapsed {}").format(time.clock() - start)
        matrix_sklearn = self.feature_matrix_sklearn(self.generator_data(self.data))
        # print(":time: Matrix sklearn created, time elapsed {}").format(time.clock() - start)
        matrix_sklearn = util.normalize(matrix_sklearn)
        # print(":time: Matrix normalized, time elapsed {}").format(time.clock() - start)
        
        print "Exporting raw-data..."
        util.export_arff(matrix_sklearn, self.dictVectorizer.get_feature_names(), arff_filepath, filename+"_RAW", labels_per_window, file_info=None)
        numpy.save(numpy_filepath, matrix_sklearn)
        
        # print "matrix dimensions before pca: {}".format(matrix_sklearn.shape)
        feature_names, feature_names_part = None, None
        if _DO_PCA:
            print "PCA calculation..."
            matrix_sklearn_pca, feature_names = util.pca(matrix_sklearn, self.dictVectorizer.get_feature_names())
            util.export_arff(matrix_sklearn_pca, feature_names, arff_filepath_pca, filename+"_PCA95", labels_per_window, file_info=None)
            numpy.save(numpy_filepath_pca, matrix_sklearn_pca)
            
            del matrix_sklearn
        gc.collect()
        return filepath, boundaries_backup, matrix_sklearn_pca, len(sents)
# CALCULATION OF BINARY FEATURES

# 1: IF A TERM APPEARS IN A SENTENCE, 0: IT DOESN'T (a sketch of this computation follows at the end of the snippet)

import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

corpus_root = r'C:\MyData\PythonPractice\Mycorpus'
wordlists = PlaintextCorpusReader(corpus_root, 'resort.*\.txt')

print('\nFollowing file ids are there in this corpus: \n ')
print(wordlists.fileids())
print("\nNumber of sentences in the file are :")
sencount = len(wordlists.sents(fileids=['resort.txt']))
print(sencount)
print('\n Sentences are : \n')
sentences = wordlists.sents(fileids='resort.txt')
print(sentences)
sample = wordlists.raw("resort.txt")
s = sample.split('.')

#NO OF TERMS
unique_tokens = []

for i in range(sencount):
    print("\n Sentence " + str(i + 1))
    print(s[i])
    #print('\n Tokenization \n')
    word_tokens = word_tokenize(s[i])
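The loop above stops after tokenizing each sentence; a minimal sketch of the remaining binary-feature step is shown below. It reuses s and word_tokenize from the snippet, while the vocabulary/binary_matrix names and the terms-by-sentences layout are assumptions.

# Assumed continuation: build the vocabulary, then a 0/1 incidence matrix
# with one row per term and one column per sentence.
sentences = [word_tokenize(sent) for sent in s if sent.strip()]
vocabulary = sorted({w.lower() for sent in sentences for w in sent if w.isalpha()})

binary_matrix = []
for term in vocabulary:
    # 1 if the term appears in the sentence, 0 if it doesn't
    row = [1 if term in [w.lower() for w in sent] else 0 for sent in sentences]
    binary_matrix.append(row)

for term, row in zip(vocabulary, binary_matrix):
    print(term, row)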
Esempio n. 49
0
from __future__ import division
import urllib2, sys, re, codecs
import nltk, pprint
from BeautifulSoup import BeautifulSoup
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize

# The name of the output file
#input_file_name = 'obama_speeches' 
#g = codecs.open(input_file_name, mode='r+').read()

#g = nltk.Text(nltk.word_tokenize(g))
#print g.concordance('freedom')
#print g.concordance('liberty')

from nltk.corpus import PlaintextCorpusReader
corpus_root = '/Users/richard/Github/politics/speeches/'
ocorpus = PlaintextCorpusReader(corpus_root, '.*')
for fileid in ocorpus.fileids():
	num_chars = len(ocorpus.raw(fileid))
	num_words = len(ocorpus.words(fileid))
	num_sents = len(ocorpus.sents(fileid))
	num_vocab = len(set([w.lower() for w in ocorpus.words(fileid)]))
	#print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid
	text = nltk.Text(ocorpus.words(fileid))
	# concordance() prints its matches itself and returns None, so no print statement is needed
	text.concordance('freedom')
	text.concordance('liberty')
Esempio n. 50
0
import operator
import random
import pickle

import nltk
import tweepy
from nltk.corpus import brown, PlaintextCorpusReader

# note: nltk.NgramModel and FreqDist.inc() below exist in NLTK 2.x only;
# both were removed in NLTK 3.
auth = tweepy.OAuthHandler("xxx", "xxx")
auth.set_access_token("xxx", "xxx")
api = tweepy.API(auth)

directory = "PATH-TO-DIRECTORY"

bandz = pickle.load(open(directory + "thug_tokens.p", "rb"))
thugtrainer = nltk.NgramModel(3, bandz)

corpus_root = directory + "/songs"
chainzcorpus = PlaintextCorpusReader(corpus_root, '.*')

chainzwords = nltk.probability.FreqDist()
for sent in chainzcorpus.sents():
	for word in sent:
		chainzwords.inc(word.lower())
chainzkeys = chainzwords.keys()

brownwords = nltk.probability.FreqDist()
for sent in brown.sents():
	for word in sent:
		brownwords.inc(word.lower())
brownkeys = brownwords.keys()

stopwords = nltk.corpus.stopwords.words('english')

trends_US = api.trends_place(23424977)

trendlist = []
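The script stops right after creating trendlist; a minimal continuation sketch is shown below. It assumes the standard shape of the Twitter trends/place response returned by tweepy (a list whose first element has a 'trends' key); the stopword filtering mirrors the variables defined above.

# Assumed continuation: collect trending topic names, skipping English stopwords.
for trend in trends_US[0]['trends']:
    name = trend['name'].lower()
    if name not in stopwords:
        trendlist.append(name)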
Esempio n. 51
0
from nltk.corpus import PlaintextCorpusReader

corpus_root = r"D:\develop\data\my_nltk"

word_lists = PlaintextCorpusReader(corpus_root, ".*")
print(word_lists.fileids())
print(word_lists.sents("a.txt"))
print(word_lists.words("a.txt"))
Esempio n. 52
0
# Get the number of words that appear only once in the first file of the corpus.
print("Number of words that appear only once:")
print(len([w for w in set(text) if fdist[w] == 1]))
# Get the most frequent word in the first file of the corpus.
print("The most frequent word is %s" % fdist.max())
# Load the PoliformaT files ("spam.txt", "quijote.txt" and "tirantloblanc.txt") as a corpus of our own.
corpus_root = '.'
wordlists = PlaintextCorpusReader(
    corpus_root, ["spam.txt", "quijote.txt", "tirantloblanc.txt"])
print("corpus loaded")
# Compute the number of words, the number of distinct words and the number of sentences of the three documents.
for nombre in wordlists.fileids():
    npalabras = len(wordlists.words(nombre))
    npaldistintas = len(set(wordlists.words(nombre)))
    nfrases = len(wordlists.sents(nombre))
    print(
        "file: %s  words: %d  distinct words: %d  sentences: %d"
        % (nombre, npalabras, npaldistintas, nfrases))

print("Exercise 2")
from nltk.corpus import brown
words = ["what", "where", "who", "when", "why"]
mydict = {}
for word in words:
    mydict[word] = []
categoriess = brown.categories()
for word in words:
    for category in categoriess:
        frecuencia = len(
            [w for w in brown.words(categories=category) if w == word])
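Exercise 2 is cut off after computing frecuencia; a minimal, self-contained continuation sketch is shown below. Storing (category, count) pairs in mydict and the final summary print are assumptions about where the exercise was heading.

# Assumed continuation of Exercise 2: record the count of each wh-word per
# Brown category, then report the category where each word is most frequent.
for word in words:
    for category in categoriess:
        frecuencia = len(
            [w for w in brown.words(categories=category) if w == word])
        mydict[word].append((category, frecuencia))

for word in words:
    best_category, best_count = max(mydict[word], key=lambda pair: pair[1])
    print("%s appears most often in '%s' (%d times)" % (word, best_category, best_count))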
Esempio n. 53
0
import os
from nltk.corpus import PlaintextCorpusReader

def read_sents(inp, outp):
    with open(inp, 'r') as f:
        i = f.read()
    corpus = PlaintextCorpusReader(os.path.dirname(inp), os.path.basename(inp))
    sents = corpus.sents()
    print_out(outp, i, sents)
Esempio n. 54
0
def nlp(request):
	w = PlaintextCorpusReader("./", "canto1.txt")
	t = nltk.text.Text(w.words())
	return render_to_response('lengths.html', {'word_length': len(set(w.words())), 'sentence_length': len(w.sents())})
Esempio n. 55
0
from gensim.models import Word2Vec
from nltk.corpus import brown, movie_reviews, treebank, PlaintextCorpusReader
from six import string_types
from nltk.corpus.reader.util import concat

root_dir = '/home/diego/qdata/techarticles/parsed_articles/'
acr = PlaintextCorpusReader(root_dir, r'.*\.txt')
from datetime import datetime

print(acr.fileids())

for fileid in acr.fileids():
    #    num_chars = len(acr.raw(fileid))
    #    num_words = len(acr.words(fileid))
    #    num_sents = len(acr.sents(fileid))
    #    num_vocab = len(set(w.lower() for w in acr.words(fileid)))
    #    print(round(num_chars / num_words), round(num_words / num_sents), round(num_words / num_vocab), fileid)
    print(" ============ " + fileid + " =============================")
    print(acr.words(fileid))
    print(acr.sents(fileid))
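Word2Vec is imported at the top of this example but the training step is not shown; a minimal sketch is given below. The min_count value and the 'market' query term are assumptions (any word that actually occurs in the parsed articles would do).

# Assumed sketch: train a Word2Vec model on the tokenized article sentences.
model = Word2Vec(acr.sents(), min_count=2)

# query the learned vectors for terms similar to a (hypothetical) word
print(model.wv.most_similar('market'))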
Esempio n. 56
0
"""Evaluate a language model on a test corpus.

Usage:
  eval.py -i <file>
  eval.py -h | --help

Options:
  -i <file>     Language model file.
  -h --help     Show this screen.
"""
#  You must be in the virtualenv ($ workon pln-2015) to run this script
# Attention: you must run this (every) script from PLN-2015/ directory

import pickle
from docopt import docopt
from nltk.corpus import PlaintextCorpusReader
from languagemodeling.ngram import Eval


if __name__ == '__main__':

    opts = docopt(__doc__)

    i = str(opts['-i'])
    f = open(i, 'rb')
    model = pickle.load(f)

    test_corpus = PlaintextCorpusReader('corpus/spanish', 'test.txt')
    test_sents = test_corpus.sents()

    evaluator = Eval(model, test_sents)
    log_prob = evaluator.log_probability
    cross_ent = evaluator.cross_entropy
    perp = evaluator.perplexity

    print("Input filename: %s" % i)
    print(" Log-Probability: %f\n Cross-Entropy: %f\n Perplexity: %f\n"
          % (log_prob, cross_ent, perp))
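The Eval class comes from the project's languagemodeling.ngram module and is not shown here; a minimal sketch of the standard quantities it exposes is given below. It assumes the model provides a sent_log_prob(sent) method returning a log2-probability; the real class may differ.

# Assumed sketch of an evaluator exposing log-probability, cross-entropy and perplexity.
class SimpleEval:

    def __init__(self, model, sents):
        # total log2-probability of the test sentences under the model
        self.log_probability = sum(model.sent_log_prob(sent) for sent in sents)
        # token count, with one end-of-sentence marker per sentence
        tokens = sum(len(sent) + 1 for sent in sents)
        # cross-entropy: average negative log2-probability per token
        self.cross_entropy = -self.log_probability / tokens
        # perplexity: 2 raised to the cross-entropy
        self.perplexity = 2 ** self.cross_entropy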
Esempio n. 57
0
""" Word and named entity 10 chunks """

import nltk
from nltk.corpus import PlaintextCorpusReader
import pickle

print 'getting files'
corpus_root = 'Texas_Wild_Fire'
english = pickle.load(open('./nltk_data/tokenizers/punkt/english.pickle', 'rb'))
yourSmallReader = PlaintextCorpusReader(corpus_root, '.*', sent_tokenizer=english)

print 'getting sentences'
# 10324.txt 17749.txt 17859.txt
sents = yourSmallReader.sents('10324.txt') + yourSmallReader.sents('17749.txt') + yourSmallReader.sents('17859.txt')
# sents = yourSmallReader.sents()
sents = [nltk.pos_tag(sent) for sent in sents]

print 'getting chunks'
chunks = [nltk.ne_chunk(sent) for sent in sents]

# Getting a random assortment of chunks
print chunks[0]
print chunks[10]
print chunks[25]
print chunks[35]
print chunks[50]
print chunks[60]
print chunks[75]
print chunks[80]
print chunks[90]
print chunks[100]
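The chunks printed above are full nltk.ne_chunk() trees; a small helper like the one sketched below could pull out just the named entities. It assumes NLTK 3's Tree.label() API and the standard ne_chunk entity labels; the helper itself is not part of the original script.

# Assumed helper: extract (entity, label) pairs from a chunked sentence tree.
def named_entities(chunked_sent):
    entities = []
    for subtree in chunked_sent.subtrees():
        if subtree.label() in ('PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY'):
            entity = ' '.join(word for word, tag in subtree.leaves())
            entities.append((entity, subtree.label()))
    return entities

for chunk in chunks[:10]:
    print(named_entities(chunk))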
Esempio n. 58
0
import pickle

from docopt import docopt
from nltk.corpus import PlaintextCorpusReader

from languagemodeling.ngram import NGram, AddOneNGram, InterpolatedNGram


models = {
    'ngram'  : NGram,
    'addone' : AddOneNGram,
    'inter'  : InterpolatedNGram,
}


if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the corpus
    screenplay_dir = opts['-r']
    my_corpus = PlaintextCorpusReader(screenplay_dir, '.*.txt')

    sents = my_corpus.sents()

    # train the model
    n = int(opts['-n'])
    model_class = models[opts['-m']]
    model = model_class(n, sents)

    # save it
    filename = opts['-o']
    f = open(filename, 'wb')
    pickle.dump(model, f)
    f.close()
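The script above only trains and pickles the model; a minimal sketch of loading it back and scoring a sentence is shown below. The output filename and the sent_log_prob method name are illustrative assumptions, not part of the original script.

# Assumed follow-up: load the pickled model back and score a tokenized sentence
# (reuses the pickle import above).
with open('addone_3.model', 'rb') as f:
    model = pickle.load(f)

print(model.sent_log_prob(['the', 'rain', 'stopped', '.']))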
Esempio n. 59
0
import nltk
from nltk.corpus import PlaintextCorpusReader

# Make a stopset
stopset = set(nltk.corpus.stopwords.words('english'))
f = open("./stopwords.txt", "r")
for line in f.readlines():
    word = line.strip()
    if word not in stopset:
        stopset.add(word)

#Read in corpus
corpus_root = '.././Islip13Rain/'
classevent_wordlists = PlaintextCorpusReader(corpus_root, '.*') 

#sent tokenize
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# CEsents = sent_tokenizer.tokenize(classevent_wordlists.raw())
CEsents = classevent_wordlists.sents()

#tag and filter
def trigram_tag(sentences, default_tagger=get_regexp_tagger(), **kwargs):
    tagged_words = raw_trigram_tag(sentences)
    tagged_words = remove_stopwords_tagged(tagged_words, stopset)
    pos_filtered = remove_tags(["NN", "VB"], tagged_words)
    pos_filtered = remove_non_english(pos_filtered)
    pos_filtered = [(word.lower()) for word in pos_filtered]
    pos_filtered = lemmatize_words(pos_filtered)
    set_pos_filtered = list(set(pos_filtered))
    set_pos_filtered = sorted(set_pos_filtered, key=lambda word: pos_filtered.count(word))
    return set_pos_filtered
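The helpers raw_trigram_tag and get_regexp_tagger used above are not included in this snippet; a minimal sketch of what a regexp-backed trigram tagger pipeline usually looks like in NLTK is given below. The training corpus (Brown) and the specific regexp patterns are assumptions, not the original implementation.

# Assumed sketch of the missing tagger helpers: a trigram tagger with
# bigram/unigram/regexp backoff, trained on the Brown corpus.
from nltk.corpus import brown

def get_regexp_tagger():
    patterns = [
        (r'.*ing$', 'VBG'),   # gerunds
        (r'.*ed$', 'VBD'),    # simple past
        (r'.*s$', 'NNS'),     # plural nouns
        (r'.*', 'NN'),        # default: noun
    ]
    return nltk.RegexpTagger(patterns)

def raw_trigram_tag(sentences, default_tagger=None):
    train_sents = brown.tagged_sents()
    t0 = default_tagger if default_tagger is not None else get_regexp_tagger()
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    t3 = nltk.TrigramTagger(train_sents, backoff=t2)
    # tag every sentence and flatten the result into one list of (word, tag) pairs
    return [pair for sent in sentences for pair in t3.tag(sent)]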

Esempio n. 60
0
import csv
from nltk.probability import FreqDist
from nltk.corpus import PlaintextCorpusReader, stopwords
MAX_WORDS = 4
NUM_WORDS = 100
wordlist = PlaintextCorpusReader('', 'ofk(_chap_[1234])?\.txt')

sents = wordlist.sents('ofk.txt')
seqs = []


def clean_sent(sent):
    # keep only alphabetic tokens and basic sentence-final punctuation
    sent = [w for w in sent if w.isalpha() or w in ['.', '!', '?']]
    out = []
    for i in range(len(sent)):
        if sent[i] == 't' and out:
            # re-attach the "t" of a contraction that the tokenizer split off
            out[-1] += "'t"
        else:
            out.append(sent[i])
    return out


sents = map(clean_sent, sents)

for sent in sents:
    output = []
    for i in range(len(sent)):
        output.append(sent[i])
        if len(output) >= MAX_WORDS:
            break
    if output: