def main():
    current_directory = os.path.dirname(__file__)
    corpus_root = os.path.abspath(current_directory)
    wordlists = PlaintextCorpusReader(corpus_root, 'Islip13Rain/.*\.txt')
    wordlists.fileids()
    ClassEvent = nltk.Text(wordlists.words())
    CEWords = ["Long Island", "Weather Service", "flooding", "August",
               "heavy rains", "Wednesday", "Suffolk County", "New York",
               "rainfall", "record"]

    # ClassEvent statistics
    print "--------- CLASS EVENT STATISTICS -------------"
    print "ClassEvent non stopwords", non_stopword_fraction(ClassEvent)
    print "ClassEvent WORD LENGTH DISTRIBUTIONS:"
    print_word_length_distributions(ClassEvent)
    print "ClassEvent PERCENTAGE OF WORD OCCURRENCES:"
    print_percentage_of_word_in_collection(ClassEvent, CEWords)

    ClassEventLettersPerWord = average_letters_per_word(ClassEvent)
    ClassEventWordsPerSent = len(wordlists.words()) / len(wordlists.sents())
    ClassEventARI = (4.71 * ClassEventLettersPerWord) + (0.5 *
                    ClassEventWordsPerSent) - 21.43

    print "Average number of letters per word", ClassEventLettersPerWord
    print "Average number of words per sentence:", ClassEventWordsPerSent
    print "Automated Readability Index:", ClassEventARI
    print

    wordlists_event = PlaintextCorpusReader(corpus_root, "Texas_Wild_Fire/.*\.txt")
    wordlists_event.fileids()
    YourSmall = nltk.Text(wordlists_event.words())
    SmallEventWords = ["Fire", "Wildfire", "Water", "Damage", "Ground", "Burn",
                       "Town", "Heat", "Wind", "Speed", "Size", "City", "People",
                       "Home", "Weather", "Debris", "Death", "Smoke", "State", "Ash"]

    # YourSmall statistics
    print "--------- YOUR SMALL STATISTICS --------------"
    print "Texas_Wild_Fire", non_stopword_fraction(YourSmall)
    print "YourSmall WORD LENGTH DISTRIBUTIONS:"
    print_word_length_distributions(YourSmall)
    print "YourSmall PERCENTAGE OF WORD OCCURRENCES:"
    print_percentage_of_word_in_collection(YourSmall, SmallEventWords)

    YourSmallLettersPerWord = average_letters_per_word(YourSmall)
    YourSmallWordsPerSent = len(wordlists_event.words()) / len(wordlists_event.sents())
    YourSmallARI = (4.71 * YourSmallLettersPerWord) + (0.5 *
                   YourSmallWordsPerSent) - 21.43

    print "Average number of letters per word", YourSmallLettersPerWord
    print "Average number of words per sentence:", YourSmallWordsPerSent
    print "Automated Readability Index", YourSmallARI
def textinfo(path):
    """
    Takes a file path and returns figures about the text file
    contained therein.
    """
    from nltk.corpus import PlaintextCorpusReader
    from nltk import FreqDist
    corpusReader = PlaintextCorpusReader(path, '.*')
    print "Total word count:", len([word for sentence in corpusReader.sents() for word in sentence])
    print "Unique words:", len(set(corpusReader.words()))
    print "Sentences:", len(corpusReader.sents())
    print "Average sentence length in words:", (len([word for sentence in corpusReader.sents() for word in sentence]) / len(corpusReader.sents()))
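A minimal usage sketch for textinfo(); the directory name below is an assumption, not part of the original snippet.

if __name__ == "__main__":
    # Hypothetical directory of plain-text files; adjust to your own data.
    textinfo("./texts")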
def stats(request):
    errors = []
    statistics = []
    if 'q' in request.GET:
        q = request.GET['q']
        if not q:
            errors.append('Enter a Canto Number')
        else:
            cantoname = "canto" + q + ".txt"
            w = PlaintextCorpusReader("./", cantoname)
            t = nltk.text.Text(w.words())
            l_lines = len(line_tokenize(w.raw()))
            l_uwords = len(set(w.words()))
            l_words = len(w.words())
            l_sents = len(w.sents())
            l_paras = len(w.paras())
            l_linperpara = l_lines / l_paras
            statistics.append("Number of Words - " + str(l_words))
            statistics.append("Number of Unique Words - " + str(l_uwords))
            statistics.append("Number of Sentences - " + str(l_sents))
            statistics.append("Number of Lines - " + str(l_lines))
            statistics.append("Number of Paras - " + str(l_paras))
            statistics.append("Number of Lines/Paras - " + str(l_linperpara))
            lexical_density = l_words / l_uwords
            l_wordpersent = l_words / l_sents
            statistics.append("Lexical Density (Total/Uniq) words - " + str(lexical_density))
            statistics.append("Words per sentence - " + str(l_wordpersent))
            return render_to_response('stats.html', {'statistics': statistics})
    return render_to_response('stats.html', {'errors': errors})
def get_coarse_level_features(dataset, output_file): # Import the corpus reader corpus_root = '/home1/c/cis530/data-hw2/'+dataset # Define the folder where the files are situated files_dataset = PlaintextCorpusReader(corpus_root, '.*') # Open the output_file output = open('/home1/c/cis530/data-hw2/'+output_file,'w') # Read the stopwlist stop_list = open('/home1/c/cis530/data-hw2/'+'stopwlist.txt').read() types_stop_list=stop_list.split() for fileid in files_dataset.fileids(): # Output the docid output.write(dataset+'/'+fileid+' ') # Output the topic_name topic_name=fileid.split('/')[0] output.write(topic_name+' ') # Output the num_tokens tokens=files_dataset.words(fileid) output.write('tok:'+str(len(tokens))+' ') # Output the num_types types=set(tokens) output.write('typ:'+str(len(types))+' ') # Output the num_contents output.write('con:'+str(len([w for w in tokens if w not in types_stop_list]))+' ') # Output the num_sents sents = files_dataset.sents(fileid) output.write('sen:'+str(len(sents))+' ') # Output the avg_slen avg_slen=round(float(len(tokens))/float(len(sents)),2) output.write('len:'+str(avg_slen)+' ') # Output the num_caps output.write('cap:'+str(len([w for w in tokens if w[0]>='A' and w[0]<='Z']))) output.write('\n') output.close()
def compare(request):
    errors = []
    stats = []
    for x in range(1, 3):
        statistics = []
        cantoname = "canto" + str(x) + ".txt"
        w = PlaintextCorpusReader("./", cantoname)
        t = nltk.text.Text(w.words())
        l_lines = len(line_tokenize(w.raw()))
        l_uwords = len(set(w.words()))
        l_words = len(w.words())
        l_sents = len(w.sents())
        l_paras = len(w.paras())
        l_linperpara = l_lines / l_paras
        statistics.append(x)
        statistics.append("Number of Words - " + str(l_words))
        statistics.append("Number of Unique Words - " + str(l_uwords))
        statistics.append("Number of Sentences - " + str(l_sents))
        statistics.append("Number of Lines - " + str(l_lines))
        statistics.append("Number of Paras - " + str(l_paras))
        statistics.append("Number of Lines/Paras - " + str(l_linperpara))
        lexical_density = l_words / l_uwords
        l_wordpersent = l_words / l_sents
        statistics.append("Lexical Density (Total/Uniq) words - " + str(lexical_density))
        statistics.append("Words per sentence - " + str(l_wordpersent))
        stats.append(statistics)
    return render_to_response('compare.html', {'stats': stats})
def main():
    corpus_root = '../posts/'
    newcorpus = PlaintextCorpusReader(corpus_root, '.*',
                                      para_block_reader=read_block_no_metadata)
    corpus_words = [w.lower() for w in newcorpus.words() if w.isalpha()]
    corpus_sentences = newcorpus.sents()
    analyst = TextAnalyst(corpus_words, corpus_sentences, 'french')
    analyst.print_analyze()
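The main() above relies on a custom read_block_no_metadata paragraph reader that is not shown. A minimal sketch of what such a para_block_reader could look like, assuming blank-line-separated paragraphs with leading "Key:" metadata lines; the filtering rule is an assumption, not the original implementation.

from nltk.corpus.reader.util import read_blankline_block

def read_block_no_metadata(stream):
    # A para_block_reader takes a stream and returns a list of paragraph strings.
    # Here we reuse NLTK's blank-line blocks but drop lines whose first word
    # looks like a metadata key (e.g. "Date:", "Author:") -- a guessed heuristic.
    blocks = read_blankline_block(stream)
    cleaned = []
    for block in blocks:
        lines = [line for line in block.splitlines()
                 if not line.split(' ')[0].endswith(':')]
        if lines:
            cleaned.append('\n'.join(lines) + '\n')
    return cleaned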
def extractPossibleTerms(root, fileids):
    # get corpus
    #root, filename = os.path.split(path)
    reader = PlaintextCorpusReader(root, fileids)
    # get chunker
    grammar = 'NP: {<JJ>*<NNP>*<NN>*}'
    chunker = RegexpParser(grammar)
    # get terms
    terms = set()
    print len(reader.sents())
    i = 0
    for sent in reader.sents():
        i += 1
        if i % 100 == 0:
            print i
        tree = chunker.parse(pos_tag(sent))
        for t in tree.subtrees(lambda t: t.node != 'S'):  # exclude Sentence node
            terms.add(' '.join([el[0] for el in t]))
    return terms
def get_coarse_level_features(dataset, output_file): # accessing the corpus corpus_root = '/home1/c/cis530/data-hw2/' dataset_path = corpus_root + dataset # Reading the files from the directories files = PlaintextCorpusReader(dataset_path, '.*') ids = files.fileids() stopFile = PlaintextCorpusReader(corpus_root, 'stopwlist.txt') stops = stopFile.words() #Opening a file that has to be written to out = open(output_file, 'w') for i in range(0,len(ids) - 1): #Initializing certain variables tokens_count=0 types = 0 non_stops_count=0 sents_count = 0 avg_sent_len=0 cap_count = 0 tokens=files.words(ids[i]) #Computing Number of Tokens tokens_count = len(tokens) #Computing Number of types types = len(set(tokens)) non_stops=[] #Computing Number of Content Words for t in tokens: if t not in stops: non_stops.append(t) non_stops_count = len(non_stops) #Finding Average Sentence Length sent = [] sent = files.sents(ids[i]) sents_count = len(sent) sent_len=0 for s in sent: sent_len = sent_len + len(s) avg_sent_len = sent_len/float(sents_count) #Computing Number of Captilized Words for c in non_stops: if c.istitle(): cap_count = cap_count+1 current_file = dataset + '/' + ids[i] e = current_file.split('/') out.write(current_file +' '+ e[-2] + ' tok:' + str(tokens_count) + ' typ:' + \ str(types) + ' con:' + str(non_stops_count) + ' sen:' + str(sents_count) + ' len:' + str(avg_sent_len) + ' cap:' + str(cap_count)+ '\n') out.flush()
def train(): wordlists = PlaintextCorpusReader('', file_path) st = stemmer() # Get blocks of text using NLTK words = wordlists.words(file_path) sents = wordlists.sents(file_path) paras = wordlists.paras(file_path) # LOGIC # If a sentence contains a known [posi/nega]tive word, count the instances of words in that sentence as # [posi/nega]tive # Count words word_features = [] # Go through paragraphs for p in paras: # Classify S score_positive_negative = 0 for s in p: for word in s: word = st.stem(word) if word in words_positive: score_positive_negative += 1 elif word in words_negative: score_positive_negative -= 1 # Record class of paragraph for any words present for s in p: for word in s: word = st.stem(word) if score_positive_negative > 0: word_features.append( ({"word": word}, "+") ) elif score_positive_negative < 0: word_features.append( ({"word": word}, "-") ) else: word_features.append( ({"word": word}, " ") ) # Create and return classifier classifier = nltk.NaiveBayesClassifier.train(word_features) return classifier
def main(): st = stemmer() # Get data wordlists = PlaintextCorpusReader('', file_path) words = wordlists.words(file_path) sents = wordlists.sents(file_path) paras = wordlists.paras(file_path) # Train classifier = train() # Get class probabilities (for MAP estimation) counts = {"P":0, "-":0, "N":0} for i in range(0,len(paras)): for s in paras[i]: score_pos = 0 score_neg = 0 # Classify paragraph for word in s: word = st.stem(word) feature = {"word":word} classified = classifier.classify(feature) if classified == "+": score_pos += 1 elif classified == "-": score_neg += 1 # Record result if score_pos > score_neg: counts["P"] += 1 elif score_pos < score_neg: counts["N"] += 1 else: counts["-"] += 1 # Done! print counts
def classifyByYear(self) : corpusReader = PlaintextCorpusReader(self.txtDirectory, ".*.txt", encoding = self.codec) for journal in corpusReader.fileids() : print ("Start " + journal) sentList = corpusReader.sents(journal) for sent in sentList : getMonth = False getDOI = False line = ''.join(sent) if self.doiURLTypes[0] in line : getDOI = True self._extractYearByDOI(self.doiURLTypes[0], journal, line) break elif self.doiURLTypes[1] in line : getDOI = True self._extractYearByDOI(self.doiURLTypes[1], journal, line) break for word in sent : if getMonth : self._extractYearByMonth(journal, word) break if word.lower() in self.dictMonth : getMonth = True if getMonth : getMonth = False break elif getDOI : getDOI = False break print ("End " + journal) print (str(self.yearDirectoryList))
def get_sentences_for_text(corpus_root, filename, lang="english"):
    """Segments the given text into sentences.

    Args:
        corpus_root: Directory in which the text file is residing.
        filename: Name of the text file.
        lang: Tokenizer language. For possible values, look at:
            ${NLTK_DATA}/tokenizers/punkt

    Returns:
        Sentences in the given text.
    """
    tokenizer_path = "tokenizers/punkt/" + lang + ".pickle"
    text = PlaintextCorpusReader(
        corpus_root, [filename],
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=nltk.data.LazyLoader(tokenizer_path),
    )
    return text.sents()
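A minimal call example for the helper above; the directory and file name are assumptions.

sentences = get_sentences_for_text("./texts", "article.txt", lang="english")
for sent in sentences[:3]:
    # each sentence is a list of whitespace-separated tokens
    print(" ".join(sent))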
def network(chapter): if(chapter == 0): NEs = open("finalNEs/finalNEs.txt").read().split('\n') text_raw = open("ofk.txt").read() else: NEs = open("finalNEs/finalNEs_ch" + str(chapter) + ".txt").read().split('\n') text_raw = open("ofk_ch" + str(chapter) + ".txt").read() result = [dict(name="", relations=[""])] for NE in NEs: result.append(dict(name=NE, relations=[""])) # The next line is needed because of the extra blank list elements at the beginning and end (Beginning I added, end added from newlines in finalNEs.txt) result = result[1:len(result)-1] corpus = PlaintextCorpusReader('.', 'ofk\.txt') sentences = corpus.sents() for x in range(len(sentences)): for NEdict in result: if NEdict["name"] in sentences[x]: # # We are in a sentence with a named entity for n in result: if n["name"] in sentences[x] and n["name"] != NEdict["name"]: NEdict["relations"].append(n["name"]) for NEdict in result: NEdict["relations"] = Set(NEdict["relations"][1:]) final = [dict(name=r["name"], imports=list(r["relations"]), url=r["name"]+".html") for r in result] for finals in final: with open("../webpage/" + finals["name"] + ".html", "w") as f1: with open("part1.html") as f: for line in f: f1.write(line) f1.write(finals["name"]) with open("part2.html") as f: for line in f: f1.write(line) f1.write("\tmain(\"data/" + finals["name"] + ".json" + "\");\n</script>") with open("../webpage/data/edgeBundle.json",'w') as outfile: json.dump(final,outfile, sort_keys = True, indent = 4, ensure_ascii=False)
def build_graph(folder, file_pattern):
    corpus_root = os.getcwd() + "/" + folder
    print "Opening corpus " + folder + " ..."
    word_lists = PlaintextCorpusReader(corpus_root, file_pattern)
    naskah = word_lists.sents()
    filelists = word_lists.fileids()
    teks = tokenize.sent_tokenize(word_lists.raw(fileids=filelists))
    print folder + " has " + str(len(teks)) + ", " + str(len(naskah)) + " sentences."
    G_result = nx.Graph()
    print "Building graph for " + folder + " ..."
    for kalimat in naskah:
        kata = kalimat[0]
        prevToken = kata.lower()
        for idx in range(1, len(kalimat)):
            kata = kalimat[idx]
            token = kata.lower()
            if containsLetter(token) and containsLetter(prevToken):
                G_result.add_edge(prevToken, token)
            prevToken = token
    return G_result
jacor = PlaintextCorpusReader(corpus_root, 'J.*txt') # [B3] Print out some basic specs of the two corpora. First off, # of files. # YOUR CODE BELOW. print('The length of the Bulgarian essay corpus: ' + str(len(bucor.fileids()))) print('The length of the Japanese essay corpus: ' + str(len(jacor.fileids()))) print() # [B4] Now, print total # of sentences and # of words. # YOUR CODE BELOW. print('The number of words in the Bulgarian essay corpus: ' + str(len(bucor.words()))) print('The number of words in the Japanese essay corpus: ' + str(len(jacor.words()))) print('The number of sentences in the Bulgarian essay corpus: ' + str(len(bucor.sents()))) print('The number of sentences in the Japanese essay corpus: ' + str(len(jacor.sents()))) # ------------------------------------------------------------------------ # BUILDING DATA OBJECTS print("...Building data objects...") # ------------------------------------------------------------------------ # [C1] Build lowercased token lists. # EDIT THE CODE BELOW. bu_toks = [] for x in bucor.words(): bu_toks.append(x.lower()) ja_toks = []
for w in sub_words: allwords.append(w.lower()) for w in obj_words: allwords.append(w.lower()) allwords=nltk.FreqDist(allwords) word_features=list(allwords.keys())[:300] #WORDNET DATASET ARCHIVE from nltk.corpus import wordnet from nltk.corpus import PlaintextCorpusReader corpus_root = 'E:\EIGHTH SEMESTER\PROJECT AND THESIS II\SOFTWARE\WORDNET' wordlists = PlaintextCorpusReader(corpus_root, '.*') training_data=wordlists.sents('document.txt') #INPUT TEXTS sentence = input("Please enter the sentence: ") print("The input string: ", sentence) corpora=sentence wordtoken_test=word_tokenize(sentence) wordtoken_train=training_data[0] print("Tokenization of training data: ",wordtoken_train[0]) print("Tokenization of testing data: ",wordtoken_test[0]) #PARTS OF SPEECH TAGGING OF WORDS from nltk.corpus import state_union from nltk.tokenize import PunktSentenceTokenizer
def read_file(file_path, file_name):
    text = PlaintextCorpusReader(file_path, file_name)
    sents = text.sents()
    words = text.words()
    text_complexity_score(words, sents, file_name)
model_file = str(opts['-i'])

# Open the file that contains the language model
f = open(model_file, "rb")

# Rebuild the object from its byte-string representation
modelo = pickle.load(f)

pattern = r'''(?ix)          # set flag to allow verbose regexps
      (?:sr\.|sra\.)
    | (?:[A-Z]\.)+           # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*           # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?     # currency and percentages, e.g. $12.40, 82%
    | \.\.\.                 # ellipsis
    | [][.,;"'?():-_`]       # these are separate tokens; includes ]
'''

PATH = "./../../Corpus_Language_Modeling"  # location of the corpus file
FILENAME = "corpus_test.txt"               # name of the file

# Load the data
tokenizer = RegexpTokenizer(pattern)
corpus = PlaintextCorpusReader(PATH, FILENAME, word_tokenizer=tokenizer)
sents = corpus.sents()

print("Perplexity =", modelo.perplexity(sents))

# Close the file
f.close()
if w.isalnum(): string += ' ' string += w return string print('Welcome to the extractive single-document summarizer') iterations = 20 # iterations for textrank - should be 10 or more to converge n = 2 # output n sentences as summary corpus_root = './articles/' # news articles to be summarized wordlists = PlaintextCorpusReader(corpus_root, '.*') print_pagerank_values = False for fileid in wordlists.fileids(): print('Article: ', fileid) sents = wordlists.sents(fileid) sim_graph_out = [] size = len(sents) s1_count = 0 print('Computing sentence similarity graph ...') # compute similarity of every sentence to all other sentences for i in range(0, size): s1 = sents[i] edges_out = [] for j in range(0, size): s2 = sents[j] sim = similarity(s1, s2) if sim > 0 and i != j: edges_out.append((i, j, sim)) sim_graph_out.append(edges_out)
print '7. Obtain the frequency of each word in the first file of the corpus. Get the frequency of the word \'a\''
ex_7 = freqs.values()
ex_72 = freqs['a']
print '\t', ex_7, '\n\tOccurrence freq. of \'a\':', ex_72

print '8. Obtain the number of words that appear only once in the first file of the corpus'
ex_8 = len([1 for p in freqs.keys() if freqs[p] == 1])
print '\t', ex_8

print '9. Obtain the most frequent word in the first file of the corpus'
ex_9 = freqs.max()
print '\t', ex_9

print '10. Load the files "spam.txt", "quijote.txt" and "tirantloblanc.txt" as a custom corpus'
corpus_root = 'C:\\Users\\nrikee\\PycharmProjects\\NLTK'
corpus_spam = PlaintextCorpusReader(corpus_root, 'spam.txt')
freqs_spam = FreqDist(corpus_spam.words())
corpus_quijote = PlaintextCorpusReader(corpus_root, 'quijote.txt')
freqs_quijote = FreqDist(corpus_quijote.words())
corpus_tirant = PlaintextCorpusReader(corpus_root, 'tirantloblanc.txt')
freqs_tirant = FreqDist(corpus_tirant.words())
print '\t', '...done.'

print '11. Compute the number of words, the number of distinct words and the number of sentences'
print '\t', 'spam.txt', len(corpus_spam.words()), len(freqs_spam.keys()), len(corpus_spam.sents())
print '\t', 'quijote.txt', len(corpus_quijote.words()), len(freqs_quijote.keys()), len(corpus_quijote.sents())
print '\t', 'tirantloblanc.txt', len(corpus_tirant.words()), len(freqs_tirant.keys()), len(corpus_tirant.sents())
words = gutenberg.words("burgess-busterbrown.txt")
words[1:20]
sents = gutenberg.sents("burgess-busterbrown.txt")
sents[1:20]

# Load your own corpus
from nltk.corpus import PlaintextCorpusReader
corpus_root = 'D:/icwb2-data/training'  # file directory
wordlists = PlaintextCorpusReader(corpus_root, ['pku_training.utf8', 'cityu_training.utf8',
                                                'msr_training.utf8', 'pku_training.utf8'])
wordlists.fileids()
print(wordlists.raw('pku_training.utf8'))
print(len(wordlists.words('pku_training.utf8')))
print(len(wordlists.sents('pku_training.utf8')))

#### Conditional frequency distributions ####
# Conditions and events
'''
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',...]
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County'),...]
'''
# Word frequencies by genre
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
# fdist1.plot( cumulative = True ) #cumulative plot of the frequency distribution # fdist2 = FreqDist( text2 ) # create a frequency distribution containing the given samples # fdist1 |= fdist2 # update fdist1 with counts from fdist2 # fdist1 < fdist2 # test if samples in fdist1 occur less frequently than in fdist2 # print( "===" ) # print( "nltk.corpus.gutenberg.fileids() = ", nltk.corpus.gutenberg.fileids() ) # print( "===" ) # emma = nltk.corpus.gutenberg.words('austen-emma.txt') # print( "len( emma ) = ", len( emma ) ) # print( "===" ) # from nltk.corpus import udhr # languages = ['Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik'] # cfd = nltk.ConditionalFreqDist( ( lang, len( word ) ) for lang in languages for word in udhr.words(lang + '-Latin1')) # cfd.plot( cumulative = True ) # Figure 1.2: Cumulative Word Length Distributions: Six translations of the Universal Declaration of Human Rights are processed; this graph shows that words having 5 or fewer letters account for about 80% of Ibibio text, 60% of German text, and 25% of Inuktitut text. # print( "===" ) from nltk.corpus import PlaintextCorpusReader corpus_root = 'c:\\temp\\DDD3262\\' wordlists = PlaintextCorpusReader(corpus_root, '.*') print("===") print("wordlists.fileids() = ", wordlists.fileids()) print("===") print("wordlists.words( '3262.txt' ) = ", wordlists.words('3262.txt')) print("wordlists.words( '3262.txt' )[0:1000] = ", wordlists.words('3262.txt')[0:1000]) print("===") print("wordlists.sents( '3262.txt' ) = ", wordlists.sents('3262.txt')) print("wordlists.sents( '3262.txt' )[0:300] = ", wordlists.sents('3262.txt')[0:300]) print("===")
print([fdist[w] for w, f in fdist.most_common()])

# Part 8
print("Number of words that appear only once: ",
      len([w for w, f in fdist.most_common() if fdist[w] == 1]))

# Part 9
print("The most frequent word is", fdist.max())

# Part 10
from nltk.corpus import PlaintextCorpusReader
corpus_root = '.'
wordlists = PlaintextCorpusReader(corpus_root, '.*\.txt')
for element in wordlists.fileids():
    print(element, len(wordlists.words(element)),
          len(set(wordlists.words(element))), len(wordlists.sents(element)))

# EXERCISE 2
from nltk.corpus import brown
from nltk.probability import *

res = []
palabras = ['what', 'when', 'where', 'who', 'why']
for palabra in palabras:
    res.append(palabra)
    lista = []
    for cat in brown.categories():
        pal = FreqDist(brown.words(categories=cat))
        lista.append(cat)
        lista.append(pal[palabra])
import grads
import utils
import w2v_sgd
import sampling

#############################################################
#######   Loading the corpus
#############################################################

startToken = '<START>'
endToken = '<END>'

corpus_root = 'JOURNALISM.BG/C-MassMedia'
myCorpus = PlaintextCorpusReader(corpus_root, '.*\.txt')
corpus = [[startToken] + [w.lower() for w in sent] + [endToken] for sent in myCorpus.sents()]

windowSize = 3
negativesCount = 5
embDim = 50

words, word2ind, freqs = utils.extractDictionary(corpus, limit=20000)
data = utils.extractWordContextPairs(corpus, windowSize, word2ind)

del corpus

U0 = (np.random.rand(len(words), embDim) - 0.5) / embDim
V0 = (np.random.rand(len(words), embDim) - 0.5) / embDim

seq = sampling.createSamplingSequence(freqs)
contextFunction = lambda c: sampling.sampleContext(c, seq, negativesCount)
new_corpus = PlaintextCorpusReader(corpus_root, '.*')
lista_ficheros = new_corpus.fileids()

print("\n--------------------------------------------------------------------------------\n ")
print("\n\n1.10) Files that make up the corpus: \n" + str(lista_ficheros) + "\n")
print("\n-------------IMPORTANT: CHANGE THE FILE PATH IN THE CODE------------\n ")
print("Current path: " + corpus_root)

# Compute the number of words, the number of distinct words and the number of
# sentences for the three documents
print("\n--------------------------------------------------------------------------------\n1.11) ")
print("\n" + "Words".rjust(35, " ") + "Vocabulary".rjust(12, " ") + "Sentences".rjust(12, " "))
for fichero in lista_ficheros:
    texto1 = new_corpus.words(fichero)
    fdist1 = FreqDist(texto1)
    numPalabras = len(texto1)
    numPalabrasDistintas = len(fdist1.keys())
    numFrases = len(new_corpus.sents(fichero))
    print("File: " + str(fichero).ljust(20, " ") + str(numPalabras).ljust(10, " ") +
          str(numPalabrasDistintas).ljust(15, " ") + str(numFrases))
print("\n\n--------------------------------------------------------------------------------\n")
from nltk import download
from languagemodeling.ngram import NGram, AddOneNGram, InterpolatedNGram

models = {
    'ngram': NGram,
    'addone': AddOneNGram,
    'interpolated': InterpolatedNGram,
}

if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the data
    corpus = PlaintextCorpusReader('../../textos/', 'out.txt')
    train_sents = corpus.sents()[0:int(len(corpus.sents()) * 0.9)]

    # train the model
    n = int(opts['-n'])
    model_class = models[opts['-m']]
    model = model_class(n, train_sents)

    # save it
    filename = opts['-o']
    f = open(filename, 'wb')
    pickle.dump(model, f)
    f.close()
num_words = len(gutenberg.words(fileid))    # number of words
num_sents = len(gutenberg.sents(fileid))    # number of sentences
num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
print(int(num_chars / num_words), int(num_words / num_sents),
      int(num_words / num_vocab), fileid)   # average word length, average sentence length, average occurrences per word

'Load your own corpus'
from nltk.corpus import PlaintextCorpusReader
corpus_root = 'E:/python shell'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
ll = wordlists.words('items1.txt')
wordlists.sents('items1.txt')

'Filter stopwords, so the original list no longer contains words from the stopword list'
from nltk.corpus import stopwords
stopwords.words('english')

def content_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')  # not needed if you load your own stopword list
    content = [w for w in text if w.lower() not in stopwords]
    return content

'Stopwords filtered; if needed, a synonym list could be used as well'
non_terminals.add(s) grammar = Grammar(non_terminals, terminals, s) # This is only to tell me how advanced the process is count = 0.0 len_fileids = len(model.fileids()) # Get the tokenized corpus tokens_location = location + "tokenized" print("getting tokens from: " + tokens_location) f = open(tokens_location, 'rb') tokens = pickle.load(f) f.close() # Train the grammar model with a context of -+1 for fileid in model.fileids(): spanish_sents = model.sents(fileid) print(str((count / len_fileids) * 100) + "%") count += 1 # Between training with the entire corpus or just bits I get a small difference of productions, so it's not worth it fro = 0.55 * len(spanish_sents) to = 0.6 * len(spanish_sents) for sent in spanish_sents[int(fro):int(to)]: tokenized_sentence = [] for word in sent: ts = tokens[word] tokenized_sentence.append(ts) grammar.add_terminal(ts) i = 0 for terminal in tokenized_sentence[:-1]: # if this is the longest we have generated so far, we will need new non terminals (All the k = i part is
#!/usr/bin/env python
# -*- coding: utf-8
import nltk
from nltk.probability import FreqDist
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import LineTokenizer

OUTPUTFILE = './data/tagged_sent'

pair_str_pos = lambda x: '/'.join(x)

corpus_root = './data'
fileids = 'data_title_sample'
corpus = PlaintextCorpusReader(corpus_root, fileids, sent_tokenizer=LineTokenizer(), encoding='utf-8')

output = open(OUTPUTFILE, 'w')
for sent in corpus.sents():
    tokens = map(pair_str_pos, nltk.pos_tag(sent))
    sent = ' '.join(tokens)
    output.write(sent + "\n")
import re
from nltk.corpus import PlaintextCorpusReader

corpus_root = './texts/'
wordlist = PlaintextCorpusReader(corpus_root, '.*')
print(wordlist.fileids())

# number of words
print(len(wordlist.words('mobydick.txt')))
# number of sentences
print(len(wordlist.sents('mobydick.txt')))

# stores a file object into f
f = open("./texts/mobydick.txt")
# stores the list of lines from f into data
data = f.readlines()
# corpus is now a giant string delimited by newline characters
corpus = "\n".join(data)
# prints the number of times Ishmael appears in the file
print(len(re.findall(r"\bIshmael\b", corpus)))
o = open(outp, 'w')
curr = 0
for sent in sentences:
    times = count_occurences(sent, sent[-1])
    curr = text.find(sent[0], curr)
    end = find_nth(text, sent[-1], times, curr) + len(sent[-1])
    o.write(text[curr:end] + '\n')
    curr = end
o.close()

def find_nth(string, sub, n, offset):
    start = string.find(sub, offset)
    while start >= 0 and n > 1:
        start = string.find(sub, start + len(sub))
        n -= 1
    return start

def count_occurences(lst, string):
    count = 0
    for item in lst:
        if string in item:
            count += 1
    return count

inp = sys.argv[1]
i = open(inp, 'r').read()
corpus = PlaintextCorpusReader(os.path.dirname(inp), os.path.basename(inp))
sents = corpus.sents()
print_out(i, sents)
from nltk import word_tokenize
from nltk.corpus import wordnet as wn

'''Part 1: open a document and display it on screen'''
corpus_root = '/BUAP/Tareas/EstudioCLaudia/texto-tarea4'
mi_corpus = PlaintextCorpusReader(corpus_root, '.*')
'''
texto = mi_corpus.raw('crimeandpunishment.txt')
print(texto)
'''

'''Part 2: split it into sentences and show each sentence numbered (1 for the
first sentence, 2 for the second, etc.); ask the user which sentence number
they want to select'''
oraciones = mi_corpus.sents('crimeandpunishment.txt')

def separar_numerar_oraciones(texto):
    key = 0
    for sent in oraciones:
        key = key + 1
        if sent:
            print(str(key) + ':' + str(sent))

'''the tokens function belongs to part 4'''
def tokens(sent, palabra):
    keyword = 0
def textFileToSentList(pathToFileFolder, FileNameWithExtension):
    wordlists = PlaintextCorpusReader(pathToFileFolder, '.*')
    return wordlists.sents(FileNameWithExtension)
    while True:
        sent = predict_next_word(sent, model)
        if sent.split(" ")[-1] == "<END>":
            sent = " ".join(sent.split(" ")[:-1])
            break
    print(sent)

if __name__ == '__main__':
    print("Lab 4 Exercise 2")
    corpus_reader = PlaintextCorpusReader(root="./twitter-files", fileids=".*\.txt",
                                          word_tokenizer=TweetTokenizer())

    # Convert tweets to tri-grams
    tweets = [tweet for tweet in corpus_reader.sents()]
    tweet_trigrams = [
        list(ngrams(sequence=tweet, n=3, pad_left=True, pad_right=True,
                    left_pad_symbol="<START>", right_pad_symbol="<END>"))
        for tweet in tweets
    ]
    all_trigrams = [gram for tweet in tweet_trigrams for gram in tweet]

    # Initialize the language model
    freq_dist = FreqDist(all_trigrams)
    model = KneserNeyProbDist(freq_dist)
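The loop above calls a predict_next_word() helper that is not shown. A hedged sketch of how it could work against the KneserNeyProbDist trigram model; this is an assumption, not the lab's actual implementation.

def predict_next_word(sent, model):
    tokens = sent.split(" ")
    while len(tokens) < 2:                  # left-pad short contexts
        tokens = ["<START>"] + tokens
    context = tuple(tokens[-2:])
    # keep only trigrams whose first two items match the current context
    candidates = [(model.prob(tri), tri[2]) for tri in model.samples()
                  if tri[:2] == context]
    if not candidates:
        return sent + " <END>"
    return sent + " " + max(candidates)[1]  # most probable continuation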
print("Occurrence frequency of the preposition 'a': " + str(fdist['a']))

print("#act8")
print("Number of words that appear only once: " + str(len(fdist.hapaxes())))

print("#act9")
print("The most frequent word is " + fdist.max())

print("#act10")
dir_path = os.path.dirname(os.path.realpath(__file__))
wordlists = PlaintextCorpusReader(corpus_root + "/library", '.*')

print("#act11")
text = wordlists.words(wordlists.fileids()[0])
fdist = FreqDist(text)
for i in wordlists.fileids():
    text = wordlists.words(i)
    fdist = FreqDist(text)
    print((str(i) + " " + str(len(text)) + " " + str(len(fdist.keys())) + " " +
           str(len(wordlists.sents(i)))))

"""
12. Do these results match those of the previous exercise? Justify your answer.

They do not match, for several reasons. The first is that in exercise 2 only
alphabetic words were counted, whereas NLTK also includes alphanumeric tokens
such as '-Fpa-' or '51_por_ciento'. Previously we also removed stopwords from
the counts. Finally, the sentence-splitting criteria in exercise 2 were ".",
";" and "\n\n", which yields more lines in total.
"""

# EXERCISE 2

# EXERCISE 3
import nltk from nltk.corpus import PlaintextCorpusReader corpus_root = '/home/vivkul/Downloads/project' wordlists = PlaintextCorpusReader(corpus_root, '.*') # wordlists.fileids() # wordlists.words('questions.txt') amrit=wordlists.words('allquestion.txt') stopwords = nltk.corpus.stopwords.words('english') from nltk.book import * fo=open("selectedquestion.txt","wb") a=wordlists.sents('allquestion.txt') while(len(amrit)!=0): content=[w for w in amrit if w.lower() not in stopwords] voc=FreqDist(content) # sorted([w for w in set(content) if len(w) > 2 and 4voc[w] > 3]) # set_voc_0=FreqDist(a[0]) # set_voc_1=FreqDist(a[1]) b=voc.keys() i=0 while(i<len(b)): if(len(b[i])>2): j=i max=b[i] break i=i+1 q_no=[] k=0 while(k<len(a)): set_voc=FreqDist(a[k]) if(set_voc[max]>0): q_no.append(len([w for w in a[k] if w.lower() not in stopwords]))
    'нак', 'уна', 'ьа', 'фна', 'наф', 'гна', 'ана', 'иа', 'ща', 'нан', 'ьна',
    'ниа', 'рна', 'пна', 'не', 'цна', 'ныа', 'нва', 'нка', 'ну', 'зна', 'оа',
    'нау', 'нйа', 'наш', 'ена', 'яна', 'нба', 'нт', 'ню', 'н', 'нэ', 'нжа',
    'нла', 'нпа', 'но', 'яа', 'нна', 'нж', 'еа', 'нав', 'нац', 'нса', 'нщ',
    'нас', 'жна', 'нал', 'нц', 'нр', 'ина', 'лна', 'па', 'нп', 'нф', 'нс',
    'нащ', 'та', 'чна', 'нча', 'дна', 'йна', 'уа', 'нат', 'нв', 'нач', '-на',
    'ка', 'сна', 'нк', 'нма', 'жа', 'наь', 'нч', 'хна', 'ная', 'ны', 'н ',
    'наж', 'за', 'йа', 'ла'
}

print('Reading the corpus of texts...')
corpus_root = 'JOURNALISM.BG/C-MassMedia'
myCorpus = PlaintextCorpusReader(corpus_root, '.*\.txt')
fullSentCorpus = [[model.startToken] + [w.lower() for w in sent] + [model.endToken]
                  for sent in myCorpus.sents()]
print('Done.')

print('Training a Markov language model...')
M2 = model.MarkovModel(fullSentCorpus, 2)
print('Done.')

#############################################################################
#### Start of the tests
#### WARNING! These tests are superficial and passing them is only a
#### precondition for acceptance; it does not necessarily mean that your
#### program will be accepted. For the assignment to be accepted, your
#### program will be run against a more thorough series of tests.
#############################################################################

#### Test of editDistance
try:
    for s1, s2, d in zip(L1, L2, C):
        signal.alarm(60)
pos_tokens = pos_tag(convote_training.tokenized()) prep_tokens = [] for (word, pos) in pos_tokens: if(pos == 'IN'): prep_tokens.append(word + '|' + pos) else: prep_tokens.append(pos) trigram = nltk.trigrams(prep_tokens) trigram_file = open('../data/pos_trigrams', 'w') pickle.dump(bigram, outfile) pickle.dump(trigram, trigram_file) for sents in convote_test.sents(): for index in range(0, len(sents)): if sents[index] == 'in': temp = deepcopy(sents) temp[index] = '*' in_test.append(temp) if sents[index] == 'on': temp = deepcopy(sents) temp[index] = "*" on_test.append(temp) if sents[index] == 'of': temp = deepcopy(sents) temp[index] = "*" of_test.append(temp) for sents in convote_dev.sents():
import nltk
import numpy as np
from nltk.corpus import PlaintextCorpusReader

vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

reader = PlaintextCorpusReader('./data/', 'reddit-comments-2015-08.csv', encoding='utf-8')
sentences = reader.sents()
words = reader.words()

tokenized_sentences = [[sentence_start_token] + sent + [sentence_end_token] for sent in sentences]

word_freq = nltk.FreqDist(words)
word_freq.plot(30)
print("Unique words", len(word_freq))

vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])

print("Least freq word in vocab is:", f'"{vocab[-1][0]}"', "and it appeared", vocab[-1][1], "times")

for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = ([
corpus_root = '/DTU/MSc/Code/Data' wordlists = PlaintextCorpusReader(corpus_root, [ 'food_holiday_inn_london.txt.data', 'food_swissotel_chicago.txt.data', 'room_holiday_inn_london.txt.data', 'rooms_swissotel_chicago.txt.data', 'rooms_swissotel_chicago.txt.data', 'service_bestwestern_hotel_sfo.txt.data', 'service_holiday_inn_london.txt.data', 'service_swissotel_hotel_chicago.txt.data', 'staff_bestwestern_hotel_sfo.txt.data', 'staff_swissotel_chicago.txt.data' ]) wsj = nltk.corpus.treebank print wordlists.fileids() print wsj.fileids() print(len(wordlists.sents())) senLengths1 = [len(s) for s in wordlists.sents()] freqDist1 = nltk.FreqDist(senLengths1) print(len(wsj.sents())) senLengths2 = [len(s) for s in wsj.sents()] freqDist2 = nltk.FreqDist(senLengths2) propDist1 = nltk.DictionaryProbDist(freqDist1, normalize=True) propDist2 = nltk.DictionaryProbDist(freqDist2, normalize=True) myfile = open('../Thesis/wsjdist.dat', 'wb') wr = csv.writer(myfile, quoting=csv.QUOTE_NONE) wr.writerow(["x", "y1", "y2"])
fileid = 'neg/cv956_12547.txt'
text = movie_reviews.raw(fileid)
text1 = movie_reviews.raw(categories='neg')
movie_reviews.categories(fileid)

# Frequency distribution by creating our own corpus
from nltk.corpus import PlaintextCorpusReader
corpus_root = 'C:/Users/ITRAIN-12/Desktop/Day 2'
fileid = 'gaming.txt'
my_corpus = PlaintextCorpusReader(corpus_root, '.*')
text = my_corpus.raw(fileid)
text
my_corpus.raw(fileid)
my_corpus.words(fileid)
my_corpus.sents(fileid)
distr = nltk.FreqDist(text)
print(distr.most_common(5))

# Reuters
from nltk.corpus import reuters
fileid = 'training/9865'
text = reuters.raw(fileid)
text

# Load Reuters category news
from nltk.corpus import reuters
reuters.fileids()
reuters.categories()
fileid = 'test/16399'
yoursmall_wordlists = PlaintextCorpusReader(corpus_root, '.*') classevent_wordlists = PlaintextCorpusReader(classevent_corpus_root, '.*') # The next step is to show the file names under the directory (optional step) lemmer = WordNetLemmatizer() def clean_words(words): words = [w.lower() for w in words if w.isalnum()] return words def clean_sents(sents): return [clean_words(sent) for sent in sents] classevent_words = clean_words(classevent_wordlists.words()) classevent_sents = clean_sents(classevent_wordlists.sents()) classevent_words = [w.lower() for w in classevent_words if w.isalnum()] classevent_words = nltk.Text(classevent_words) classevent_words_lem = [lemmer.lemmatize(w) for w in classevent_words] print "ClassEvent loaded" yoursmall_words = yoursmall_wordlists.words() yoursmall_sents = yoursmall_wordlists.sents() yoursmall_words = [w.lower() for w in yoursmall_words if w.isalnum()] yoursmall_words = nltk.Text(yoursmall_words) yoursmall_words_lem = [lemmer.lemmatize(w) for w in yoursmall_words] print "YourSmall loaded" yourwords = ['earthquake', 'seismic', 'aftershocks', 'quake', 'damage', 'magnitude', 'tremor', 'richter', 'epicenter', 'depth', 'fault', 'hypocenter', 'focus', 'dead', 'casualties', 'structural', 'seismometer', 'temblor', 'hazard', 'impact'] yourwords_lem = [lemmer.lemmatize(w.lower()) for w in yourwords]
# sents_no=[] # pmids=[] # i=0 # for index,row in mydf.iterrows(): # stence=row['text'] # sents=stence.split('.') # newsents=[] # for sent in sents: # newsents.append(sent+'.') # i=i+1 # sents_no.append(i) # pmids.append(row['pmid']) # # # result= re.findall(r"<category=\".`+?\">(.+?)</category>",stence, re.S) # # print(result) # print(sents_no) # print(pmids) str='you are my shine.' str_list=list(str) list=[] for i in str_list: list.append({'str':str,"obj":"ss"}) str2=" ".join(list) print(str2 ) wodslist = PlaintextCorpusReader(cr, '.*') for i in wodslist.sents('NCBI_corpus_training.txt'): text=nltk.word_tokenize(' '.join(i)) nltk.wordpunct_tokenize print(i) # print(nltk.pos_tag(text, tagset='universal'))
def nltk_corpora(): ## 1. PROJECT GUTENBERG << Formal Language - Literature;ebooks 60K++ emma = nltk.corpus.gutenberg.words("austen-emma.txt") emma = nltk.Text(emma) len(emma) lexical_diversity(emma) emma.concordance("brave") emma.collocation_list() ## traits of the corpus text for each def corp_content(corporad): print( "{0} File {0} \t\tWord len Sent len Vocab Lexical Complexity" .format(" " * 6)) print("{}".format("-" * 100)) for i, txt in enumerate(corporad.fileids()): sents_l = len(corporad.words(txt)) try: sents_l = len(corporad.sents(txt)) except: sents_l = len(corporad.posts(txt)) w_len = round(len(corporad.raw(txt)) / len(corporad.words(txt))) s_len = round(len(corporad.words(txt)) / sents_l) voc = len(set(w.lower() for w in corporad.words(txt))) # lexp = round( voc / len( [w.lower() for w in gutenberg.words(txt)] ) * 100 ) lexp = round(voc / len(corporad.words(txt)) * 100) print("{}. {} \t\t{}\t{}\t{}\t{}%\t{}".format( i, txt, w_len, s_len, voc, lexp, corporad.raw(txt)[:30])) # print( "{}. {} \t\t{}\t{}\t{}\t{}%\t{}".format(i, txt, w_len, s_len, voc, lexp, corporad.words(txt)[:5] ) ) # 1. Formal Language - Project Gutenberg ebooks 60K++, 16+ languages corp_content(gutenberg) # 2. Informal Language - Web content and Chat rooms corp_content(webtext) corp_content(nps_chat) # 3. Brown Corpus - 15+ Multi-genre, 500+ sources, En_lang << http://icame.uib.no/brown/bcm-los.html # for studying systematic differences between genres I.E. stylistics corp_content(brown) brown.categories() brown.words(categories="news") brown.words(categories=["news", "editorial", "reviews"]) # example stylistics - modal verbs usage between genres def modalz(modals): print("\tCategory\t", end=" ") for m in modals: print("\t{}".format(m), end=" ") print("\n" + "-" * 100) for i, cat in enumerate(brown.categories()): print("{}.{}\t\t".format(i, cat), end=" ") fdist = nltk.FreqDist(w.lower() for w in brown.words(categories=cat)) for m in modals: print("\t{}".format(fdist[m]), end=" ") print("") modalz(["can", "could", "may", "might", "must", "will"]) modalz(["should", "ought", "would", "could", "do", "did", "does"]) modalz(["what", "when", "where", "why", "who"]) ## ditto using nltk conditional frequency distributions cfdist = nltk.ConditionalFreqDist( (genre, word) for genre in brown.categories() for word in brown.words(categories=genre)) genz = ["news", "religion", "hobbies", "humor", "romance"] modz = ["can", "could", "may", "might", "must", "will"] cfdist.tabulate(conditions=genz, samples=modz) # 4. Reuters Corpus - news articles, 90 topics, grouped into training and testing sets # << Apparent goal is to predict the category/topic of a given article?? corp_content(reuters) # retrieve topic(s) of a given article reuters.categories("training/9865") reuters.categories(["training/9865", "training/9880"]) # find articles that cover some topic(s) reuters.fileids("barley") reuters.fileids(["barley", "corn"]) # the first words are in all CAPs and are the titles of the article. The rest is the story text for i, txt in enumerate(reuters.fileids(["barley", "oil"])): print("{}. {}\t{}".format(i, txt, reuters.words(txt)[:10])) # 5. Speeches - Inaugral Address Corpus << 55 USA Presidential addresses # << interesting in that there's a time horizon element from 1789 to 2009 (first 4 xters of fileid = year) ; can study how language changes with time; could reflect on priorities, culture, ??? 
corp_content(inaugural) # how America and Citizen ar eused over time cfdist = nltk.ConditionalFreqDist((target, fileid[:4]) for fileid in inaugural.fileids() for w in inaugural.words(fileid) for target in ['america', 'citizen'] if w.lower().startswith(target)) cfdist.plot() # 6. Annotated Text Corpora # annotations: POS, named entities, syntatic structures, semantic roles, # 7. Other Languages Corpora # includes udhr = Universal Declaration of Human Rights in over 300 languages # word length freq by diff languages langz = [ "English", "Chickasaw", "German_Deutsch", "Kinyarwanda", "Swahili_Kiswahili" ] cfdist = nltk.ConditionalFreqDist((lang, len(word)) for lang in langz for word in udhr.words(lang + "-Latin1")) cfdist.plot() cfdist.plot(cumulative=True) # alphabet freq nltk.FreqDist(udhr.raw("Kinyarwanda-Latin1")).plot() # 8. Loading your own Corpora # << txt files. Use PlaintextCorpusReader. Check dir location # my_corpus = PlaintextCorpusReader( "root_dir_path_here", ".*" ) # second param is a list of fileids defined as a list or an ls pattern eg_corpus = PlaintextCorpusReader( "D:/zRepoz/dataSaysWhat/DocReader/res/txt_corpus", ".txt") eg_corpus.fileids() eg_corpus.words("example1.txt") len(eg_corpus.sents()) #BracketParseCorpusReader my_corpus = nltk.corpus.BracketParseCorpusReader("path", "file_pattern")
classevent_wordlists = PlaintextCorpusReader(classevent_corpus_root, '.*') big_wordlists = PlaintextCorpusReader(big_corpus_root, '.*') # The next step is to show the file names under the directory (optional step) lemmer = WordNetLemmatizer() def clean_words(words): words = [w.lower() for w in words if w.isalnum()] return words def clean_sents(sents): return [clean_words(sent) for sent in sents] classevent_words = classevent_wordlists.words() classevent_sents = classevent_wordlists.sents() classevent_words = [w.lower() for w in classevent_words if w.isalnum()] classevent_words = nltk.Text(classevent_words) classevent_words_lem = [lemmer.lemmatize(w) for w in classevent_words] print "ClassEvent loaded" yoursmall_words = yoursmall_wordlists.words() yoursmall_sents = yoursmall_wordlists.sents() yoursmall_words = [str(w).lower() for w in yoursmall_words if w.isalnum()] yoursmall_words = nltk.Text(yoursmall_words) yoursmall_words_lem = [lemmer.lemmatize(w) for w in yoursmall_words] print "YourSmall loaded" big_words = big_wordlists.words() big_sents = big_wordlists.sents() big_words = [w.lower() for w in big_words if w.isalnum()]
###
# Section: train Naive Bayes
###
print("Reading neutral corpus")
neutralCorp = ConllCorpusReader(
    '.', 'corpora/tiger_release_aug07.corrected.16012013.conll09',
    ['ignore', 'words', 'ignore', 'ignore', 'pos'], encoding='utf-8')
print("Reading Ingvar corpus")
ingvarCorp = PlaintextCorpusReader(".", "texts/latest.txt")

print("Generating word lists")
ingvarSentencesLong = ingvarCorp.sents()
neutralSentencesLong = neutralCorp.sents()
smallerSet = min(len(ingvarSentencesLong), len(neutralSentencesLong))
ingvarSentences = ingvarSentencesLong[:smallerSet]
neutralSentences = neutralSentencesLong[:smallerSet]
print(f'Number of sentences limited to the smaller set of {smallerSet} sentences')

print("Generating features")
ingFeats = [(word_feats(f), 'ing') for f in ingvarSentences]
neutFeats = [(word_feats(f), 'neu') for f in neutralSentences]

print("Generating cutoff")
ingCutoff = int(len(ingFeats) * 0.9)
def extract_data(self, filepath, ind_features=_PARAIND_FEAT, dep_features=_PARADEP_FEAT, labels_per_sent=None, labels_per_window=None): """Extract features, reduce dimensions with a PCA and return data. Exports raw- and PCA-reduced data both in arff- and numpy-format. """ start = time.clock() self.dictVectorizer = DictVectorizer(sparse=False) filename = os.path.split(filepath)[1] directory = os.path.split(filepath)[0] plain_reader = PlaintextCorpusReader( directory, [filename], word_tokenizer=RegexpTokenizer("(-?\d+\.\d+)|[\w']+|["+string.punctuation+"]"), sent_tokenizer=LineTokenizer(blanklines="discard"), encoding='utf8') # create new subdir for extracted data if _NEW_SUBDIR is not None: path = os.path.join(directory, _NEW_SUBDIR) if not os.path.exists(path): os.makedirs(path) path = os.path.join(path, os.path.splitext(filename)[0]) # print "path {}".format(path) else: path = os.path.splitext(filepath)[0] # print "path {}".format(path) # filepaths for weka- and numpy-files arff_filepath = path + ".arff" arff_filepath_pca = path + "_pca95.arff" numpy_filepath = path + ".npy" numpy_filepath_pca = path + "_pca95.npy" # print(":time: Reader created, time elapsed {}").format(time.clock() - start) paras = plain_reader.paras() # print(":time: Paras created, time elapsed {}").format(time.clock() - start) sents = plain_reader.sents() # print(":time: Sents created, time elapsed {}").format(time.clock() - start) # get paragraph boundaries for sliding-window self.boundaries = util.get_boundaries(paras) boundaries_backup = self.boundaries # check if all files necessary exist, if yes - unpickle/load them and return data if util.files_already_exist([numpy_filepath_pca,]): print "Features already extracted. Calculating clusters...\n" matrix_sklearn_pca = numpy.load(numpy_filepath_pca) return filepath, self.boundaries, matrix_sklearn_pca, len(sents) # save correct target-labels and additional info of current data targets_path = open(path + ".tbs", "wb") pickle.dump((labels_per_sent, labels_per_window, boundaries_backup, len(sents), _WINDOW_SIZE, _STEP_SIZE), targets_path) # print(":time: Boundaries calculated, time elapsed {}").format(time.clock() - start) self.data = self.extract_features(sents, _WINDOW_SIZE, _STEP_SIZE, ind_features, dep_features) # self.data[year] = self.extract_features_para(paras, ind_features, dep_features) # print(":time: Features extracted, time elapsed {}").format(time.clock() - start) self.all_features = self.unified_features(self.data) # print(":time: Unified features, time elapsed {}").format(time.clock() - start) matrix_sklearn = self.feature_matrix_sklearn(self.generator_data(self.data)) # print(":time: Matrix sklearn created, time elapsed {}").format(time.clock() - start) matrix_sklearn = util.normalize(matrix_sklearn) # print(":time: Matrix normalized, time elapsed {}").format(time.clock() - start) print "Exporting raw-data..." util.export_arff(matrix_sklearn, self.dictVectorizer.get_feature_names(), arff_filepath, filename+"_RAW", labels_per_window, file_info=None) numpy.save(numpy_filepath, matrix_sklearn) # print "matrix dimensions before pca: {}".format(matrix_sklearn.shape) feature_names, feature_names_part = None, None if _DO_PCA: print "PCA calculation..." 
matrix_sklearn_pca, feature_names = util.pca(matrix_sklearn, self.dictVectorizer.get_feature_names()) util.export_arff(matrix_sklearn_pca, feature_names, arff_filepath_pca, filename+"_PCA95", labels_per_window, file_info=None) numpy.save(numpy_filepath_pca, matrix_sklearn_pca) del matrix_sklearn gc.collect() return filepath, boundaries_backup, matrix_sklearn_pca, len(sents)
# CALCULATION OF BINARY FEATURES
# 1: a term appears in a sentence, 0: it does not appear
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

corpus_root = 'C:\MyData\PythonPractice\Mycorpus'
wordlists = PlaintextCorpusReader(corpus_root, 'resort.*\.txt')
print('\nFollowing file ids are there in this corpus: \n ')
print(wordlists.fileids())

print("\nNumber of sentences in the file are :")
sencount = len(wordlists.sents(fileids=['resort.txt']))
print(sencount)

print('\n Sentences are : \n')
sentences = wordlists.sents(fileids='resort.txt')
print(sentences)

sample = wordlists.raw("resort.txt")
s = sample.split('.')

# NUMBER OF TERMS
unique_tokens = []
for i in range(sencount):
    print("\n Sentence " + str(i + 1))
    print(s[i])
    # print('\n Tokenization \n')
    word_tokens = word_tokenize(s[i])
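The snippet above stops mid-loop. A hedged sketch of how the binary term-in-sentence features could be completed, reusing the variables already defined; the original continuation is not shown, so this is an assumption.

# Collect the unique content terms across all sentences
stop_words = set(stopwords.words('english'))
for i in range(sencount):
    tokens = [w.lower() for w in word_tokenize(s[i])
              if w.isalnum() and w.lower() not in stop_words]
    for t in tokens:
        if t not in unique_tokens:
            unique_tokens.append(t)

# 1 if the term occurs in the sentence, 0 otherwise
binary_features = []
for i in range(sencount):
    sent_tokens = set(w.lower() for w in word_tokenize(s[i]))
    binary_features.append([1 if term in sent_tokens else 0 for term in unique_tokens])
print(binary_features)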
from __future__ import division import urllib2, sys, re, codecs import nltk, pprint from BeautifulSoup import BeautifulSoup from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize # The name of the output file #input_file_name = 'obama_speeches' #g = codecs.open(input_file_name, mode='r+').read() #g = nltk.Text(nltk.word_tokenize(g)) #print g.concordance('freedom') #print g.concordance('liberty') from nltk.corpus import PlaintextCorpusReader corpus_root = '/Users/richard/Github/politics/speeches/' ocorpus = PlaintextCorpusReader(corpus_root, '.*') for fileid in ocorpus.fileids(): num_chars = len(ocorpus.raw(fileid)) num_words = len(ocorpus.words(fileid)) num_sents = len(ocorpus.sents(fileid)) num_vocab = len(set([w.lower() for w in ocorpus.words(fileid)])) #print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid print nltk.Text(ocorpus.words(fileid)).concordance('freedom') print nltk.Text(ocorpus.words(fileid)).concordance('liberty')
import operator import random auth = tweepy.OAuthHandler("xxx", "xxx") auth.set_access_token("xxx", "xxx") api = tweepy.API(auth) directory = "PATH-TO-DIRECTORY" bandz = pickle.load(open(directory + "thug_tokens.p", "rb")) thugtrainer = nltk.NgramModel(3, bandz) corpus_root = directory + "/songs" chainzcorpus = PlaintextCorpusReader(corpus_root, '.*') chainzwords = nltk.probability.FreqDist() for sent in chainzcorpus.sents(): for word in sent: chainzwords.inc(word.lower()) chainzkeys = chainzwords.keys() brownwords = nltk.probability.FreqDist() for sent in brown.sents(): for word in sent: brownwords.inc(word.lower()) brownkeys = brownwords.keys() stopwords = nltk.corpus.stopwords.words('english') trends_US = api.trends_place(23424977) trendlist = []
from nltk.corpus import PlaintextCorpusReader

corpus_root = "D:\develop\data\my_nltk"
word_lists = PlaintextCorpusReader(corpus_root, ".*")
print(word_lists.fileids())
print(word_lists.sents("a.txt"))
print(word_lists.words("a.txt"))
# Get the number of words that appear only once in the first file of the corpus.
print("Number of words that appear only once:")
print(len([w for w in set(text) if fdist[w] == 1]))

# Get the most frequent word of the first file of the corpus.
print("The most frequent word is %s" % fdist.max())

# Load the PoliformaT files ("spam.txt", "quijote.txt" and "tirantloblanc.txt") as a custom corpus.
corpus_root = '.'
wordlists = PlaintextCorpusReader(
    corpus_root, ["spam.txt", "quijote.txt", "tirantloblanc.txt"])
print("corpus loaded")

# Compute the number of words, the number of distinct words and the number of
# sentences for the three documents.
for i in range(0, 3):
    nombre = wordlists.fileids()[i]
    npalabras = len(wordlists.words(wordlists.fileids()[i]))
    npaldistintas = len(set(wordlists.words(wordlists.fileids()[i])))
    nfrases = len(wordlists.sents(wordlists.fileids()[i]))
    print(
        "file: %s  words: %d  distinct words: %d  sentences: %d"
        % (nombre, npalabras, npaldistintas, nfrases))

print("Exercise 2")
from nltk.corpus import brown

words = ["what", "where", "who", "when", "why"]
mydict = {}
for word in words:
    mydict[word] = []
categoriess = brown.categories()
for word in words:
    for category in categoriess:
        frecuencia = len(
            [w for w in brown.words(categories=category) if w == word])
def read_sents(inp, outp):
    i = open(inp, 'r').read()
    corpus = PlaintextCorpusReader(os.path.dirname(inp), os.path.basename(inp))
    sents = corpus.sents()
    print_out(outp, i, sents)
def nlp(request):
    w = PlaintextCorpusReader("./", "canto1.txt")
    t = nltk.text.Text(w.words())
    return render_to_response('lengths.html',
                              {'word_length': len(set(w.words())),
                               'sentence_length': len(w.sents())})
from gensim.models import Word2Vec
from nltk.corpus import brown, movie_reviews, treebank, PlaintextCorpusReader
from six import string_types
from nltk.corpus.reader.util import concat

root_dir = '/home/diego/qdata/techarticles/parsed_articles/'
acr = PlaintextCorpusReader(root_dir, '.*\.txt')

from datetime import datetime

print(acr.fileids())
for fileid in acr.fileids():
    # num_chars = len(acr.raw(fileid))
    # num_words = len(acr.words(fileid))
    # num_sents = len(acr.sents(fileid))
    # num_vocab = len(set(w.lower() for w in acr.words(fileid)))
    # print(round(num_chars / num_words), round(num_words / num_sents), round(num_words / num_vocab), fileid)
    print(" ============ " + fileid + " =============================")
    print(acr.words(fileid))
    print(acr.sents(fileid))
  -h --help     Show this screen.
"""
# You must be in the virtualenv ($ workon pln-2015) to run this script
# Attention: you must run this (every) script from the PLN-2015/ directory
import pickle

from docopt import docopt
from nltk.corpus import PlaintextCorpusReader

from languagemodeling.ngram import Eval

if __name__ == '__main__':
    opts = docopt(__doc__)

    i = str(opts['-i'])
    f = open(i, 'rb')
    model = pickle.load(f)

    test_corpus = PlaintextCorpusReader('corpus/spanish', 'test.txt')
    test_sents = test_corpus.sents()

    evaluator = Eval(model, test_sents)
    log_prob = evaluator.log_probability
    cross_ent = evaluator.cross_entropy
    perp = evaluator.perplexity

    print("Input filename: %s" % i)
    print(" Log-Probability: %f\n Cross-Entropy: %f\n Perplexity: %f\n" %
          (log_prob, cross_ent, perp))
""" Word and named entity 10 chunks """ import nltk from nltk.corpus import PlaintextCorpusReader import pickle print 'getting files' corpus_root = 'Texas_Wild_Fire' english = pickle.load(open('./nltk_data/tokenizers/punkt/english.pickle', 'r')) yourSmallReader = PlaintextCorpusReader(corpus_root, '.*', sent_tokenizer=english) print 'getting sentences' # 10324.txt 17749.txt 17859.txt sents = yourSmallReader.sents('10324.txt') + yourSmallReader.sents('17749.txt') + yourSmallReader.sents('17859.txt') # sents = yourSmallReader.sents() sents = [nltk.pos_tag(sent) for sent in sents] print 'getting chunks' chunks = [nltk.ne_chunk(sent) for sent in sents] # Getting a random assortment of chunks print chunks[0] print chunks[10] print chunks[25] print chunks[35] print chunks[50] print chunks[60] print chunks[75] print chunks[80] print chunks[90] print chunks[100]
from languagemodeling.ngram import NGram, AddOneNGram, InterpolatedNGram

models = {
    'ngram': NGram,
    'addone': AddOneNGram,
    'inter': InterpolatedNGram,
}

if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the corpus
    screenplay_dir = opts['-r']
    my_corpus = PlaintextCorpusReader(screenplay_dir, '.*.txt')
    sents = my_corpus.sents()

    # train the model
    n = int(opts['-n'])
    model_class = models[opts['-m']]
    model = model_class(n, sents)

    # save it
    filename = opts['-o']
    f = open(filename, 'wb')
    pickle.dump(model, f)
    f.close()
#Make a stopset stopset = set(nltk.corpus.stopwords.words('english')) f = open("./stopwords.txt", "r") for line in f.readlines(): word = line.strip() if word not in stopset: stopset.add(word) #Read in corpus corpus_root = '.././Islip13Rain/' classevent_wordlists = PlaintextCorpusReader(corpus_root, '.*') #sent tokenize sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') # CEsents = sent_tokenizer.tokenize(classevent_wordlists.raw()) CEsents = classevent_wordlists.sents() #tag and filter def trigram_tag(sentences, default_tagger=get_regexp_tagger(), **kwargs): tagged_words = raw_trigram_tag(sentences) tagged_words = remove_stopwords_tagged(tagged_words, stopset) pos_filtered = remove_tags(["NN", "VB"], tagged_words) pos_filtered = remove_non_english(pos_filtered) pos_filtered = [(word.lower()) for word in pos_filtered] pos_filtered = lemmatize_words(pos_filtered) set_pos_filtered = list(set(pos_filtered)) set_pos_filtered = sorted(set_pos_filtered, key=lambda word: pos_filtered.count(word)) return set_pos_filtered
import csv
from nltk.probability import FreqDist
from nltk.corpus import PlaintextCorpusReader, stopwords

MAX_WORDS = 4
NUM_WORDS = 100

wordlist = PlaintextCorpusReader('', 'ofk(_chap_[1234])?\.txt')
sents = wordlist.sents('ofk.txt')
seqs = []

def clean_sent(sent):
    sent = filter(lambda w: w.isalpha() or w in ['.', '!', '?'], sent)
    out = []
    for i in range(len(sent)):
        if sent[i] == 't':
            out[-1] += "'t"
        else:
            out.append(sent[i])
    return out

sents = map(clean_sent, sents)

for sent in sents:
    output = []
    for i in range(len(sent)):
        output.append(sent[i])
        if len(output) >= MAX_WORDS:
            break
    if output: