Example #1
import json
import os

from nltk import FreqDist, Text, bigrams, trigrams
from nltk.corpus import stopwords
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import sent_tokenize, word_tokenize


class Corpus(object):
    def __init__(self, data_root):
        self.data_root = data_root
        self.data = PlaintextCorpusReader(data_root, '.*')
        self.words = [i for i in self.data.words() if i.isalpha()]
        self.text = Text(self.words)
        self.stop = set(stopwords.words('english')).union({
            'cid', 'et', 'al', 'also', 'and', 'editingboston', 'arxiv',
            'pages', 'trackboston', 'preprint', 'page', 'vol', 'volume',
            'march', 'boston', 'table'
        })
        with open('bib.json') as fi:
            self.bib = json.load(fi)

    def documents(self):
        """Return a list of all documents in the corpus"""
        return sorted(os.listdir(self.data_root))

    def words_in_file(self, filename):
        """Given a file, return a list of tokenized words"""
        try:
            text = self.data.open(filename).read()
        except FileNotFoundError:
            print("The file does not exist.")
            return []
        return word_tokenize(text)

    def sentences_in_file(self, filename):
        """Given a file, return a list of sentences"""
        try:
            text = self.data.open(filename).read()
        except FileNotFoundError:
            print("The file does not exist.")
            return []
        return sent_tokenize(text)

    def tokenized_sentences_in_file(self, filename):
        """Given a file name, return a list of word tokenized sentences"""
        try:
            text = self.data.open(filename).read()
        except FileNotFoundError:
            print("The file does not exist.")
            return []
        return [word_tokenize(s) for s in sent_tokenize(text)]

    def most_frequent_content_words(self, n_words):
        """Return a list with the most frequent content words and their
        frequencies in (word, frequency) pairs ordered by frequency"""
        content_words = [
            w for w in self.words
            if w.lower() not in self.stop and w.isalpha() and len(w) > 1
        ]
        content_words_dict = FreqDist(content_words)
        return content_words_dict.most_common(n_words)

    def most_frequent_bigrams(self, n_bigrams):
        """Return a list with the most frequent bigrams of content words
        in the form of pairs where the first element is the bigram and
        the second is its frequency"""
        bigram_dict = FreqDist(
            b for b in bigrams(self.words)
            if all(w.isalpha() and len(w) > 1 and w.lower() not in self.stop
                   for w in b)
        )
        return bigram_dict.most_common(n_bigrams)

    def most_frequent_trigrams(self, n_trigrams):
        """Return a list with the most frequent trigrams of content words
        in the form of pairs where the first element is the trigram and
        the second is its frequency"""
        trigram_dict = FreqDist(
            t for t in trigrams(self.words)
            if all(w.isalpha() and len(w) > 1 and w.lower() not in self.stop
                   for w in t)
        )
        return trigram_dict.most_common(n_trigrams)

    def get_info(self, fileID):
        """Return metadata associate with a file indexed by the following fields:
        author, title, booktitle, year, publisher, pages, location, doi, url"""
        return self.bib[fileID]

    def print_reference(self, fileID):
        """Print metadata (author, title of paper, title of book, publishing year)
        associated with each file as a reference"""
        d = self.bib[fileID]
        print("%s. %s. %s, %s" % (' '.join(
            d['author'].split('\n')), d['title'], d['booktitle'], d['year']))

    def concordance(self, word):
        """Print a concordance view of the given word across the corpus"""
        self.text.concordance(word)
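
A minimal usage sketch for the Corpus class above; the 'papers/' directory and the fileid passed to print_reference are placeholders, and print_reference assumes that filename appears as a key in bib.json:

corpus = Corpus('papers/')                       # hypothetical data_root
print(corpus.documents())                        # sorted filenames in the corpus
print(corpus.most_frequent_content_words(10))    # top 10 (word, count) pairs
print(corpus.most_frequent_bigrams(5))           # top 5 content-word bigrams
corpus.print_reference(corpus.documents()[0])    # formatted citation for that file
corpus.concordance('corpus')                     # concordance lines for a word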
Example #2
import os

from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# Check that the source corpus exists and that its files match our texts.
# NOTE: `corpusdir` and `corpus` are assumed to be defined earlier in the
# original script; they are not part of this snippet.
assert os.path.isdir(corpusdir)
for infile, text in zip(sorted(os.listdir(corpusdir)), corpus):
    assert open(os.path.join(corpusdir, infile)).read().strip() == text.strip()

# Create a new corpus by specifying the parameters
# (1) directory of the new corpus
# (2) the fileids of the corpus
# NOTE: in this case the fileids are simply the filenames.
newcorpus = PlaintextCorpusReader('newcorpus/', '.*')

# Access each file in the corpus.
for infile in sorted(newcorpus.fileids()):
    print(infile)  # The fileid of each file.
    with newcorpus.open(infile) as fin:  # Opens the file.
        print(fin.read().strip())  # Prints the content of the file.
print()

# Access the plaintext; outputs one plain string for the whole corpus.
print(newcorpus.raw().strip())
print()

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
#       nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph, and
# Each paragraph contains sentence(s), and
# Each sentence contains token(s)
print(newcorpus.paras())
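
As a rough sketch of that nesting (not part of the original example), the paragraphs returned by paras() on the newcorpus reader above can be walked like this:

for para in newcorpus.paras():
    for sent in para:          # each paragraph is a list of sentences
        print(' '.join(sent))  # each sentence is a list of token strings
    print()                    # blank line between paragraphs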
Example #3

import re

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer


def remove_media(s):
    """Replace MEDIA placeholder tokens with a space."""
    return re.sub(r'MEDIA', ' ', s)


def stem_tokens(tokens, stemmer):
    """Stem every token in a list with the given stemmer."""
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed


def tokenize(text):
    """Word-tokenize a text and stem each token."""
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems


# NOTE: `newcorpus`, `stemmer`, and the remove_websites / remove_emojis /
# remove_punctiation helpers are assumed to be defined elsewhere in the
# original script; only remove_media is shown above.
token_dict = {}
for infile in sorted(newcorpus.fileids()):
    print(infile)  # The fileid of each file.
    fin = newcorpus.open(infile)  # Opens the file.
    text = fin.read().strip()  # Read the content of the file.
    just_text = remove_websites(remove_media(remove_emojis(text)).lower())
    no_punctuation = remove_punctiation(just_text)
    token_dict[infile] = no_punctuation

tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform(token_dict.values())

print(token_dict)
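
A possible follow-up, not in the original snippet: inspect the highest-weighted terms for the first document in the tf-idf matrix. get_feature_names_out() assumes scikit-learn 1.0 or newer (older releases use get_feature_names()).

import numpy as np

terms = tfidf.get_feature_names_out()    # vocabulary learned by the vectorizer
row = tfs[0].toarray().ravel()           # tf-idf weights of the first document
for idx in np.argsort(row)[::-1][:10]:   # ten highest-weighted terms
    print(terms[idx], row[idx])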
Example #4
from nltk.corpus.reader.plaintext import PlaintextCorpusReader as PCR


def try_out_some_functionalities():

    corpusdir = "/media/benzro/OS/Users/benzro/Desktop/Studium Uni/2)" \
                "ZweitesSemester/27)PCL-2/Uebungen/Uebung03/Enron/test/"
    newcorpus = PCR(corpusdir, '.*')

    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "access one file in the corpus"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    infile = corpusdir + "0001.1999-12-10.farmer.ham.txt"
    infile = "0004.1999-12-14.farmer.ham.txt"
    fin = newcorpus.open(infile)
    print fin.read().strip()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "all file ids"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.fileids()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "access each file in the corpus"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # (reduced output: [0:2])
    for infile in sorted(newcorpus.fileids()):
        # the fileids of each file
        print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
        print infile
        # opens the file
        fin = newcorpus.open(infile)
        # prints the content of the file
        print fin.read().strip()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "access the plaintext; outputs pure string of all files"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.raw().strip()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Access paragraphs in the corpus. (list of list of list of strings)"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
    #       nltk.tokenize.word_tokenize.
    #
    # Each element in the outermost list is a paragraph, and
    # Each paragraph contains sentence(s), and
    # Each sentence contains token(s)
    print newcorpus.paras()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "To access pargraphs of a specific fileid."
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.paras(newcorpus.fileids()[0])
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Access sentences in the corpus. (list of list of strings)"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # NOTE: That the texts are flattened into sentences that contains tokens.
    print newcorpus.sents()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "To access sentences of a specific fileid."
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.sents(newcorpus.fileids()[0])
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Access just tokens/words in the corpus. (list of strings)"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.words()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "To access tokens of a specific fileid."
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.words(newcorpus.fileids()[0])
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
Example #5
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# Create a new corpus by specifying the parameters
# (1) directory of the new corpus
# (2) the fileids of the corpus
# NOTE: in this case the fileids are simply the filenames.
newcorpus = PlaintextCorpusReader('nltkCorpusAll/', '.*')

# Access each file in the corpus.
for infile in sorted(newcorpus.fileids()):
    print(infile)  # The fileid of each file.
    fin = newcorpus.open(infile)  # Opens the file.
    print(fin.read().strip())  # Prints the content of the file.
print()

# Access the plaintext; outputs one plain string for the whole corpus.
print(newcorpus.raw().strip())
print()

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and 
#       nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph, and
# Each paragraph contains sentence(s), and
# Each sentence contains token(s)
print(newcorpus.paras())
print()

# To access paragraphs of a specific fileid.