class Corpus(object):
    """A plaintext corpus paired with bibliography metadata.

    Reads every file under *data_root* with NLTK's PlaintextCorpusReader,
    builds an alphabetic word list for frequency statistics, and loads paper
    metadata from ``bib.json`` in the current working directory.
    """

    def __init__(self, data_root):
        self.data_root = data_root
        self.data = PlaintextCorpusReader(data_root, '.*')
        # Keep only purely alphabetic tokens for the frequency statistics.
        self.words = [i for i in self.data.words() if i.isalpha()]
        self.text = Text(self.words)
        # English stopwords plus PDF-extraction noise ('cid') and publishing
        # boilerplate common in these documents.
        self.stop = set(stopwords.words('english')).union({
            'cid', 'et', 'al', 'also', 'and', 'editingboston', 'arxiv',
            'pages', 'trackboston', 'preprint', 'page', 'vol', 'volume',
            'march', 'boston', 'table'
        })
        with open('bib.json') as fi:
            self.bib = json.load(fi)

    def documents(self):
        """Return a sorted list of all document file names in the corpus."""
        return sorted(os.listdir(self.data_root))

    def words_in_file(self, filename):
        """Given a file, return a list of tokenized words.

        Returns an empty list (after printing a message) when the file does
        not exist.  The original fell through and raised NameError on the
        unbound ``text`` variable.
        """
        try:
            text = self.data.open(filename).read()
        except FileNotFoundError:
            print("The file does not exist.")
            return []
        return word_tokenize(text)

    def sentences_in_file(self, filename):
        """Given a file, return a list of sentences.

        Returns an empty list (after printing a message) when the file does
        not exist, instead of crashing on the unbound ``text`` variable.
        """
        try:
            text = self.data.open(filename).read()
        except FileNotFoundError:
            print("The file does not exist.")
            return []
        return sent_tokenize(text)

    def tokenized_sentences_in_file(self, filename):
        """Given a file name, return a list of word tokenized sentences.

        Returns an empty list (after printing a message) when the file does
        not exist, instead of crashing on the unbound ``sent`` variable.
        """
        try:
            text = self.data.open(filename).read()
        except FileNotFoundError:
            print("The file does not exist.")
            return []
        return [word_tokenize(s) for s in sent_tokenize(text)]

    def _is_content(self, word):
        """True for alphabetic, multi-character, non-stopword tokens."""
        return (word.isalpha()
                and len(word) > 1
                and word.lower() not in self.stop)

    def most_frequent_content_words(self, n_words):
        """Return a list with the most frequent content words and their
        frequencies in (word, frequency) pairs ordered by frequency"""
        return FreqDist(
            w for w in self.words if self._is_content(w)
        ).most_common(n_words)

    def most_frequent_bigrams(self, n_bigrams):
        """Return a list with the most frequent bigrams of content words in
        the form of pairs where the first element is the bigram and the
        second is its frequency"""
        bigram_dist = FreqDist(
            k for k in bigrams(self.words)
            if self._is_content(k[0]) and self._is_content(k[1])
        )
        return bigram_dist.most_common(n_bigrams)

    def most_frequent_trigrams(self, n_trigrams):
        """Return the most frequent trigrams of content words as
        (trigram, frequency) pairs ordered by frequency.

        Bug fix: the original only applied the isalpha/length checks to the
        first two tokens of each trigram; all three are now filtered the
        same way, consistent with most_frequent_bigrams.
        """
        trigram_dist = FreqDist(
            k for k in trigrams(self.words)
            if all(self._is_content(w) for w in k)
        )
        return trigram_dist.most_common(n_trigrams)

    def get_info(self, fileID):
        """Return metadata associate with a file indexed by the following
        fields: author, title, booktitle, year, publisher, pages,
        location, doi, url"""
        return self.bib[fileID]

    def print_reference(self, fileID):
        """Print metadata (author, title of paper, title of book, publishing
        year) associated with each file as a reference"""
        d = self.bib[fileID]
        print("%s. %s. %s, %s" % (' '.join(
            d['author'].split('\n')), d['title'], d['booktitle'], d['year']))

    def concordance(self, word):
        """Show a concordance view for *word* in the corpus text."""
        self.text.concordance(word)
# Check that our corpus does exist and the files are correct.
assert os.path.isdir(corpusdir)
for infile, text in zip(sorted(os.listdir(corpusdir)), corpus):
    # Use a context manager so each file handle is closed promptly
    # (the original leaked one open handle per file).
    with open(os.path.join(corpusdir, infile), 'r') as fi:
        assert fi.read().strip() == text.strip()

# Create a new corpus by specifying the parameters
# (1) directory of the new corpus
# (2) the fileids of the corpus
# NOTE: in this case the fileids are simply the filenames.
newcorpus = PlaintextCorpusReader('newcorpus/', '.*')

# Access each file in the corpus.
for infile in sorted(newcorpus.fileids()):
    print(infile)  # The fileids of each file.
    with newcorpus.open(infile) as fin:  # Opens the file.
        print(fin.read().strip())  # Prints the content of the file
    print()

# Access the plaintext; outputs pure string/basestring.
print(newcorpus.raw().strip())
print()

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
# nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph, and
# Each paragraph contains sentence(s), and
# Each sentence contains token(s)
print(newcorpus.paras())
def remove_media(s):
    """Replace every literal 'MEDIA' marker in *s* with a single space."""
    return re.sub(r'MEDIA', ' ', s)


def stem_tokens(tokens, stemmer):
    """Return the stem of each token, using the given stemmer."""
    return [stemmer.stem(item) for item in tokens]


def tokenize(text):
    """Word-tokenize *text* and stem each token with the module stemmer.

    Used as the TfidfVectorizer tokenizer below; relies on the
    module-level ``stemmer`` being defined elsewhere in the file.
    """
    tokens = nltk.word_tokenize(text)
    return stem_tokens(tokens, stemmer)


# Build a {fileid: cleaned text} mapping from the corpus.
for infile in sorted(newcorpus.fileids()):
    print(infile)  # The fileids of each file.
    # Close the handle instead of leaking it (original never closed it).
    with newcorpus.open(infile) as fin:  # Opens the file.
        text = fin.read().strip()
    just_text = remove_websites(remove_media(remove_emojis(text)).lower())
    no_punctuation = remove_punctiation(just_text)
    token_dict[infile] = no_punctuation

tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform(token_dict.values())
print(token_dict)
def try_out_some_functionalities():
    """Walk through the main PlaintextCorpusReader accessors and print each
    result, separated by tilde rules.

    Purely demonstrational: reads from a hard-coded local Enron test
    directory and prints everything to stdout; returns nothing.
    """
    corpusdir = "/media/benzro/OS/Users/benzro/Desktop/Studium Uni/2)" \
                "ZweitesSemester/27)PCL-2/Uebungen/Uebung03/Enron/test/"
    newcorpus = PCR(corpusdir, '.*')
    # Separator rule bound once; same literal as the original prints.
    rule = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"

    print(rule)
    print("access one file in the corpus")
    print(rule)
    # NOTE: the original first assigned corpusdir + "0001....txt" and then
    # immediately overwrote it; the dead assignment was removed.
    infile = "0004.1999-12-14.farmer.ham.txt"
    with newcorpus.open(infile) as fin:
        print(fin.read().strip())

    print(rule)
    print("all file ids")
    print(rule)
    print(newcorpus.fileids())

    print(rule)
    print("access each file in the corpus")
    print(rule)
    # (reduced output: [0:2])
    for infile in sorted(newcorpus.fileids()):
        # the fileids of each file
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print(infile)
        # opens the file and prints its content; close promptly
        with newcorpus.open(infile) as fin:
            print(fin.read().strip())

    print(rule)
    print("access the plaintext; outputs pure string of all files")
    print(rule)
    print(newcorpus.raw().strip())

    print(rule)
    print("Access paragraphs in the corpus. (list of list of list of strings)")
    print(rule)
    # NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
    # nltk.tokenize.word_tokenize.
    #
    # Each element in the outermost list is a paragraph, and
    # Each paragraph contains sentence(s), and
    # Each sentence contains token(s)
    print(newcorpus.paras())

    print(rule)
    print("To access pargraphs of a specific fileid.")
    print(rule)
    print(newcorpus.paras(newcorpus.fileids()[0]))

    print(rule)
    print("Access sentences in the corpus. (list of list of strings)")
    print(rule)
    # NOTE: That the texts are flattened into sentences that contains tokens.
    print(newcorpus.sents())

    print(rule)
    print("To access sentences of a specific fileid.")
    print(rule)
    print(newcorpus.sents(newcorpus.fileids()[0]))

    print(rule)
    print("Access just tokens/words in the corpus. (list of strings)")
    print(rule)
    print(newcorpus.words())

    print(rule)
    print("To access tokens of a specific fileid.")
    print(rule)
    print(newcorpus.words(newcorpus.fileids()[0]))
    print(rule)
import os

from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# Create a new corpus by specifying the parameters
# (1) directory of the new corpus
# (2) the fileids of the corpus
# NOTE: in this case the fileids are simply the filenames.
newcorpus = PlaintextCorpusReader('nltkCorpusAll/', '.*')

# Access each file in the corpus.
for infile in sorted(newcorpus.fileids()):
    print(infile)  # The fileids of each file.
    # Close the handle when done (original leaked one per file).
    with newcorpus.open(infile) as fin:  # Opens the file.
        print(fin.read().strip())  # Prints the content of the file
    print()

# Access the plaintext; outputs pure string/basestring.
print(newcorpus.raw().strip())
print()

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
# nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph, and
# Each paragraph contains sentence(s), and
# Each sentence contains token(s)
print(newcorpus.paras())
print()

# To access pargraphs of a specific fileid.