Example #1
def load_corpus(race_code=None,
                gender_code=None
                ):  #loads corpora into an array based on race and gender

    if race_code is None:  # if none is specified, search all
        race_code = ".."
    if gender_code is None:
        gender_code = ".."

    reader = PlaintextCorpusReader(
        corpus_root, ".*_" + race_code + "_" + gender_code +
        r"\.txt")  # uses filename encoding to load specified texts
    corpora = []

    for fileid in reader.fileids(
    ):  #creates ComedyCorpus object, populates with fileid and name
        new_corpus = ComedyCorpus()
        new_corpus.set_fileid(fileid)
        try:
            new_corpus.set_text(
                reader.raw(fileid))  #gets word content based on fileid
        except UnicodeDecodeError:
            continue
        fileid = re.sub("_" + race_code + "_" + gender_code + r"\.txt", "",
                        fileid)
        #name is fileid without encoding
        fileid = fileid.replace("%20", " ")
        fileid = fileid.replace("_", "; ")
        print(fileid)
        new_corpus.set_name(fileid)
        corpora.append(new_corpus)

    return corpora
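
This function depends on names defined elsewhere in its project (corpus_root, the ComedyCorpus container, re, and the reader import). A minimal sketch of that assumed context, with a stand-in ComedyCorpus class, might look like this:

import re
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

corpus_root = 'comedy_corpus/'  # assumed location of the encoded .txt files


class ComedyCorpus:
    """Minimal stand-in for the project's ComedyCorpus container."""

    def set_fileid(self, fileid):
        self.fileid = fileid

    def set_text(self, text):
        self.text = text

    def set_name(self, name):
        self.name = name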
Example #2
def load_feat_data(dir_array):

    data_list = []

    for direct in dir_array:

        data = []

        corpus_dir = 'dataset/' + direct
        corpus = PlaintextCorpusReader(corpus_dir, '.*\.*')
        file_ids = corpus.fileids()

        for file in file_ids:
            text = corpus.raw(file)
            e = email.message_from_string(text)

            if e.is_multipart():
                for payload in e.get_payload():  # get_payload() returns the list of message parts
                    text = payload.get_payload()

            else:
                text = e.get_payload()

            data.append(extract_features(text, corpus, file))

        data_list.extend(data)

    return data_list
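
Iterating get_payload() only walks the top level of a multipart message. A small, hedged alternative using the standard library's Message.walk(), which also visits nested parts (the function name here is illustrative):

import email

def extract_text(raw_message):
    """Return the concatenated text/plain parts of a raw RFC 2822 message."""
    msg = email.message_from_string(raw_message)
    parts = []
    for part in msg.walk():  # walk() visits the message and every nested part
        if part.get_content_type() == 'text/plain' and not part.is_multipart():
            parts.append(part.get_payload())
    return '\n'.join(parts)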
Example #3
def parseFolder( dirPath ):
    assignments = {}
    draftReader = PlaintextCorpusReader(dirPath, r'\d+draft\d*.*')
    finalReader = PlaintextCorpusReader(dirPath, r'\d+final\d*.*')

    numFiles = len( os.listdir( dirPath ))
    assert numFiles % 2 == 0

    finalIdsSortedList = finalReader.fileids()
    draftIdsSortedList = draftReader.fileids()

    for pid in finalReader.fileids():
        final = finalReader.paras( pid ) #finalIdsSortedList[i] )
        draft = draftReader.paras( pid ) #draftIdsSortedList[i] )
        assn = assignment( draft, final )
        assignments[pid] = assn

    return assignments
Example #4
def parseFolder( dirPath ):
    assignments = []
    draftReader = PlaintextCorpusReader(dirPath, r'\d+draft\d*.*')
    finalReader = PlaintextCorpusReader(dirPath, r'\d+final\d*.*')

    numFiles = len( os.listdir( dirPath ))
    assert numFiles % 2 == 0

    finalIdsSortedList = finalReader.fileids()
    draftIdsSortedList = draftReader.fileids()

    for i in range(len(finalReader.fileids())):
        final = finalReader.paras( finalIdsSortedList[i] )
        draft = draftReader.paras( draftIdsSortedList[i] )
        assn = assignment( draft, final )
        assignments.append( assn )

    return assignments
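
Example #3 looks up drafts by the final file's id, while Example #4 relies on the two sorted lists lining up index by index. A hedged third option, assuming file names such as 12draft.txt / 12final.txt, pairs the files explicitly by their shared numeric prefix:

import re
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

def pair_by_prefix(dirPath):
    draftReader = PlaintextCorpusReader(dirPath, r'\d+draft\d*.*')
    finalReader = PlaintextCorpusReader(dirPath, r'\d+final\d*.*')

    def prefix(fileid):
        return re.match(r'\d+', fileid).group()  # the leading assignment number

    drafts = {prefix(fid): fid for fid in draftReader.fileids()}
    pairs = {}
    for fid in finalReader.fileids():
        draft_fid = drafts.get(prefix(fid))
        if draft_fid is not None:
            pairs[prefix(fid)] = (draftReader.paras(draft_fid), finalReader.paras(fid))
    return pairs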
Example #5
def get_fileid_lst(source_dir):
    '''
    Use NLTK to pull in the list of file ids in the given source directory

    :param {str} source_dir:
        The relative path to the source directory that contains all the data (book) files
    :return {str} fileid_lst:
        List of all file id's ending in '.txt' in the source_dir
    '''
    temp_corp = PlaintextCorpusReader(source_dir, '.*\.txt')
    fileid_lst = temp_corp.fileids()

    return fileid_lst
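
A brief usage sketch, with a hypothetical books/ directory of .txt files:

fileid_lst = get_fileid_lst('books/')  # hypothetical source directory
for fileid in fileid_lst:
    print(fileid)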
Example #6
def main():
    """
    Main function of the program
    """

    corpus_dir = 'NLP_dataset/training_set'  # Directory of corpus.
    new_corpus = PlaintextCorpusReader(corpus_dir, '.*')
    for file_id in new_corpus.fileids():
        file_to_read = open(corpus_dir+"/"+file_id, "r")

        # reading each file to get matched sentences
        matched_sen = match_regular_expressions(file_to_read)

        # writing the matched sentences to files
        write_to_files(matched_sen, file_id)
Example #7
def load_data(dir_label):

    data_list = []
    labels = []

    for dl in dir_label:

        data = []

        directory = dl[0]
        label = dl[1]

        corpus_dir = 'dataset/' + directory
        corpus = PlaintextCorpusReader(corpus_dir, '.*\.*')
        file_ids = corpus.fileids()

        for file in file_ids:

            d = []

            text = corpus.raw(file)
            e = email.message_from_string(text)

            if e.is_multipart():
                for payload in e.get_payload():  # get_payload() returns the list of message parts
                    text = payload.get_payload()
            else:
                text = e.get_payload()

            feats = [
                cf.charac_feats_extractor(text),
                wf.word_feats_extractor(text),
                syf.syntac_feats_extractor(text),
                stf.struct_feats_extractor(corpus, file, text),
                fwf.funct_word_feats_extractor(text)
            ]

            for f in feats:
                d.extend(list(f.values()))

            data.append(d)
            labels.append(label)

        data_list.extend(data)

    return [data_list, labels]
Example #8
def processFile(newCorpusDir):
    if not os.path.isdir(newCorpusDir):
        os.mkdir(newCorpusDir)
    txt1 = getText('sample_feed.txt')
    txt2 = pdf.getTextPDF('VirtualBoxTroubleshooting.pdf')
    txt3 = word.getTextWord('my_doc.docx')

    files = [txt1, txt2, txt3]
    for idx, f in enumerate(files):
        with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
            fout.write(f)

    newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')

    print(newCorpus.words())
    print(newCorpus.sents(newCorpus.fileids()[1]))
    print(newCorpus.paras(newCorpus.fileids()[0]))
Example #9
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# Create a new corpus by specifying the parameters
# (1) directory of the new corpus
# (2) the fileids of the corpus
# NOTE: in this case the fileids are simply the filenames.
newcorpus = PlaintextCorpusReader('nltkCorpusAll/', '.*')

# Access each file in the corpus.
for infile in sorted(newcorpus.fileids()):
    print infile # The fileids of each file.
    fin = newcorpus.open(infile)# Opens the file.
    print fin.read().strip() # Prints the content of the file
print

# Access the plaintext; outputs pure string/basestring.
print newcorpus.raw().strip()
print 

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and 
#       nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph, and
# Each paragraph contains sentence(s), and
# Each sentence contains token(s)
print newcorpus.paras()
print

# To access paragraphs of a specific fileid.
Example #10
#Tried to find misspellings in a corpus of text files. See find_misspellings.py and grouping_docs.py for documentation.
#There are ~30,400 unique words in these 49 communication files
#Rebecca's laptop took too long to make the correlation matrix

import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer
import numpy as np
from numpy import linalg

#make a new corpus
corpusdir = 'communications/small_test_batch' #where the files are
newcorpus = PlaintextCorpusReader(corpusdir, '.*')

fileids = newcorpus.fileids() #list of fileids
j = len(fileids) #number of docs

words_list = [] #['doc', '1', 'words', 'doc', '2', 'words',...]
doc_breaks = [0] #ith entry = index of first word in doc i in words_list
keywords = set() #{'doc', '1', 'words', '2',...}

tokenizer = RegexpTokenizer('\w+') #pick out alphanumeric sequences; discard punctuation, white space

#create set of keywords and list of file texts
for id in fileids:
    raw = newcorpus.raw(id)
    raw2 = ''.join([i if ord(i)<128 else '' for i in raw]) #remove unicode characters
    raw3 = raw2.encode('ascii')
    file_words = map(str.lower,tokenizer.tokenize(raw3)) #list of cleaned words: lower-case, no punct, no whitespace
    words_list = words_list + file_words
    doc_breaks = doc_breaks + [len(file_words)+doc_breaks[len(doc_breaks)-1]]
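
A minimal Python 3 sketch of the same word-collection loop, assuming the same corpus directory; note that in Python 3 map() is lazy, so the per-file word list must be materialized before len() is taken:

from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer

newcorpus = PlaintextCorpusReader('communications/small_test_batch', '.*')
tokenizer = RegexpTokenizer(r'\w+')

words_list = []
doc_breaks = [0]  # ith entry = index of the first word of doc i in words_list

for fileid in newcorpus.fileids():
    raw = newcorpus.raw(fileid)
    ascii_only = ''.join(ch for ch in raw if ord(ch) < 128)  # drop non-ASCII characters
    file_words = [w.lower() for w in tokenizer.tokenize(ascii_only)]  # a real list, not a lazy map
    words_list.extend(file_words)
    doc_breaks.append(doc_breaks[-1] + len(file_words))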
Example #11
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import nltk

# Might need the below line once
# nltk.download('punkt')

corpusDir = 'own_corpus/'

newCorpus = PlaintextCorpusReader(corpusDir, '.*\.txt')

for file in sorted(newCorpus.fileids()):
    words = newCorpus.words(file)
    text = nltk.Text(words)
    print(text)
Example #12
File: nlp.py  Project: ddyson1/nltk
# nltk.download()
# nltk.download('gutenberg')

# text1.concordance("water")
# print(FreqDist(text1).most_common(50))
# FreqDist(text1).plot(50, cumulative=True)
# print(set(text1))

corpus_root = '/Users/devindyson/Desktop/nltk/corpora'
corpora = PlaintextCorpusReader(corpus_root, '.*')

# print(corpora.raw('meditations.txt'))
# print(SentimentIntensityAnalyzer().polarity_scores("NLTK is pretty dope."))

print(sorted(corpora.fileids()))
print(len(corpora.words('meditations.txt')))
print(len(corpora.words('benjamin.txt')))

meditations = Text(corpora.words('meditations.txt'))
benjamin = Text(corpora.words('benjamin.txt'))


def lexical_diversity(text_data):
    word_count = len(text_data)
    vocab_size = len(set(text_data))
    diversity_score = vocab_size / word_count
    return diversity_score


print(lexical_diversity(meditations))
Example #13
                       tokenizer=word_tokenize, postagger=pos_tag,
                       lemmatizer=wnl, stemmer=porter):
    words, lemmas, poss = [], [], []
    for word, pos in postagger(sentence):
        pos = penn2morphy(pos)
        lemmas.append(lemmatize(word.lower(), pos, neverstem,
                                lemmatizer, stemmer))
        poss.append(pos)
        words.append(word)
    if keepWordPOS:
        return words, lemmas, [None if i == '' else i for i in poss]
    return lemmas

regex = re.compile('[_]+')

for f in corpus.fileids():
    outname = args.preprocess + "/" + f + ".out"
    fout = open(outname,"w", encoding="utf8")

splitter = nltk.data.load('tokenizers/punkt/english.pickle')
tokenizer.tokenize(text)

word_tokenize

    for sent in corpus.sents(f):
        s = []
        for w in sent:
                w = regex.sub('',w).lower()
                if (
                        len(w)>2
                        and not w in stop_words
Example #14
from nltk.stem.porter import *
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

#from modifiedtexttiling import TextTilingTokenizer
import modifiedtexttiling

#input as all the documents with preprocessed text.
corpusdir = '/home/abc/Desktop/adm/new_dataset'

#output as all the segmented documents with their corresponding document names as prefix.
corpusdir_p = '/home/abc/Desktop/adm/segments'

newcorpus = PlaintextCorpusReader(corpusdir, '.*')

#sort all the document names alphabetically.
sortedall = sorted(newcorpus.fileids())
#print sortedall

for filename in sortedall:
    #open each document.
    fp = open(corpusdir + "/" + filename)
    #print message.
    print 'processing : ' + filename
    #save document text as string.
    n = fp.read()

    #Create TextTilingTokenizer() object
    t = modifiedtexttiling.TextTilingTokenizer()

    #get the segments as list of strings.
    k = t.tokenize(n)
Example #15
import datetime
import nltk
from nltk import word_tokenize
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus import floresta,mac_morpho
from parser_portuguese_risk import evaluateModel, splitTrainTestModel, simplify_tag
time1 = datetime.datetime.now()

###############################################################################
### ATTENTION: if there are tmp files like .DS_STORE in Mac OS X, they must be removed ###

# Reading corpus
corpusdir = '/Users/marceloschiessl/RDF_text_project/corpus/WikiRisk/test/glossAnnotated/' # Directory of corpus.
#corpusdir = '/Users/marceloschiessl/RDF_text_project/corpus/WikiRisk/test/test1/' # Directory of corpus.   
risco = PlaintextCorpusReader(corpusdir, '.*')
risco.fileids()

raw_text = risco.raw('gloss533.txt')
#print raw_text[0:]

# Some statistics

print 'Number of term: ', len(risco.words())
print 'Number of unique terms: ', len(set(risco.words()))

fd = nltk.FreqDist(risco.words())
print fd.freq('bem')
print fd['bem']

# presenting ngrams of the term
target_word = 'bem como'
Example #16
import os
import word, pdf
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


def getText(txtFileName):
    file = open(txtFileName, 'r')
    return file.read()


# Create a new corpus folder (directory)
newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):
    os.mkdir(newCorpusDir)

txt1 = getText('sample_feed.txt')
txt2 = pdf.getTextPDF('sample-pdf.pdf')
txt3 = word.getTextWord('sample-one-line.docx')

# Write the contents of the three string objects to files on disk (write mode)
files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

# Create a plaintext corpus object from the directory where the files were saved
newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')
print(newCorpus.words())  # print all the words of the corpus
print(newCorpus.sents(newCorpus.fileids()[1]))  # print the sentences of 1.txt
print(newCorpus.paras(newCorpus.fileids()[0]))  # print the paragraphs of 0.txt
Example #17
def try_out_some_functionalities():

    corpusdir ="/media/benzro/OS/Users/benzro/Desktop/Studium Uni/2)" \
           "ZweitesSemester/27)PCL-2/Uebungen/Uebung03/Enron/test/"
    newcorpus = PCR(corpusdir, '.*')

    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "access one file in the corpus"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    infile = corpusdir + "0001.1999-12-10.farmer.ham.txt"
    infile = "0004.1999-12-14.farmer.ham.txt"
    fin = newcorpus.open(infile)
    print fin.read().strip()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "all file ids"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.fileids()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "access each file in the corpus"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # (reduced output: [0:2])
    for infile in sorted(newcorpus.fileids()):
        # the fileids of each file
        print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
        print infile
        # opens the file
        fin = newcorpus.open(infile)
        # prints the content of the file
        print fin.read().strip()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "access the plaintext; outputs pure string of all files"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.raw().strip()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Access paragraphs in the corpus. (list of list of list of strings)"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
    #       nltk.tokenize.word_tokenize.
    #
    # Each element in the outermost list is a paragraph, and
    # Each paragraph contains sentence(s), and
    # Each sentence contains token(s)
    print newcorpus.paras()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "To access pargraphs of a specific fileid."
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.paras(newcorpus.fileids()[0])
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Access sentences in the corpus. (list of list of strings)"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # NOTE: The texts are flattened into sentences that contain tokens.
    print newcorpus.sents()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "To access sentences of a specific fileid."
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.sents(newcorpus.fileids()[0])
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Access just tokens/words in the corpus. (list of strings)"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.words()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "To access tokens of a specific fileid."
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.words(newcorpus.fileids()[0])
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
Example #18
OUTPUT_SIGNATURE = "file*.lemmatized"

import nltk
import os
import glob
from os.path import join
from nltk.collocations import *
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# read in corpus, find all the 3-grams above the min frequency
print "Reading in corpus from", CORPUS_ROOT
my_corpus = PlaintextCorpusReader(CORPUS_ROOT, CORPUS_EXTENSION)
print "Read in " + str(len(my_corpus.fileids())) + " files"
print "Finding 3-grams"
finder_3gram = TrigramCollocationFinder.from_words(my_corpus.words())
print "Filtering out 3-grams of frequency less than", MIN_FREQUENCY
finder_3gram.apply_freq_filter(MIN_FREQUENCY)

# combine all the 3-grams meeting the PMI threshold
print "Looking for 3-grams with a PMI of at least ", MIN_3GRAM_PMI
filelist = [f for f in glob.glob(CORPUS_ROOT + CORPUS_OUTPUT_EXTENSION)]

gen = finder_3gram.above_score(trigram_measures.pmi, MIN_3GRAM_PMI)
processGrams(gen, filelist)

# now let's do the same for the 2-grams
# our previous step altered the corpus so let's read it in again
print "Reading in corpus from", CORPUS_ROOT
Example #19
childStoryCorpusDir = '../resources/child_stories'  # assumed path: this definition was truncated in the excerpt
robotStoryCorpusDir = '../resources/robot_stories'

childStoryCorpus = PlaintextCorpusReader(childStoryCorpusDir, ".*\.txt")
robotStoryCorpus = PlaintextCorpusReader(robotStoryCorpusDir, ".*\.txt")


# average word length, average sentence length, and the number of times each vocabulary item appears in the text on average (our lexical diversity score)
# for fileid in childStoryCorpus.fileids():
#     num_chars = len(childStoryCorpus.raw(fileid))
#     num_words = len(childStoryCorpus.words(fileid))
#     num_sents = len(childStoryCorpus.sents(fileid))
#     num_vocab = len(set([w.lower() for w in childStoryCorpus.words(fileid)]))
#     print ((float(num_chars)/float(num_words)), float(num_words)/float(num_sents), float(num_words)/float(num_vocab), fileid)


for fileid in childStoryCorpus.fileids():

    print (fileid)
    file_path = os.path.join(childStoryCorpusDir, fileid)

    with open(file_path, 'r') as orgf:
        for line in orgf:
            for s in tokenize.sent_tokenize(line):
                print(s)
                #print(st.tag(tokenize.word_tokenize(s)))
                #print(st.tag(s.split()))
                print(list(parser.raw_parse(s)))

                # for line in parser.raw_parse(s):
                #     for sentence in line:
                #         sentence.draw()
Example #20
    with open(corpusdir + str(filename) + '.txt', 'w') as fout:
        print(text, file=fout)

# Check that our corpus does exist and the files are correct.
assert os.path.isdir(corpusdir)
for infile, text in zip(sorted(os.listdir(corpusdir)), corpus):
    assert open(corpusdir + infile, 'r').read().strip() == text.strip()

# Create a new corpus by specifying the parameters
# (1) directory of the new corpus
# (2) the fileids of the corpus
# NOTE: in this case the fileids are simply the filenames.
newcorpus = PlaintextCorpusReader('newcorpus/', '.*')

# Access each file in the corpus.
for infile in sorted(newcorpus.fileids()):
    print(infile)  # The fileids of each file.
    with newcorpus.open(infile) as fin:  # Opens the file.
        print(fin.read().strip())  # Prints the content of the file

# # Access the plaintext; outputs pure string/basestring.
# print(newcorpus.raw().strip())
#
# # Access paragraphs in the corpus. (list of list of list of strings)
# # NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
# #       nltk.tokenize.word_tokenize.
# #
# # Each element in the outermost list is a paragraph, and
# # Each paragraph contains sentence(s), and
# # Each sentence contains token(s)
# print newcorpus.paras()
Example #21
from nltk.text import *
import nltk

if len(sys.argv) != 4:
  print "Usage:", sys.argv[0], "word sense1 sense2"
  exit(-1)

focal_word = sys.argv[1]
senses = [sys.argv[2], sys.argv[3]]
#focal_word = "plant"
#senses = ["manufacturing","life"]
corpus = PlaintextCorpusReader('outcorpus/', '.*')
collocations = [ wsd.BigramLeft(senses, 0), wsd.BigramRight(senses, 1), wsd.BigramScope(senses, 2, [2, 10]) ]
decision_list = wsd.DecisionList()
decision_list.load("senses_bootstrap_" + focal_word + ".csv")
corpus_ids = corpus.fileids()
random.shuffle(corpus_ids)

num_words = 1
num_words_max = 100
tagged = 0
ambiguous = 0
unknown = 0

for infile in corpus_ids:
  if num_words > num_words_max: break

  words = corpus.words(infile)
  text = Text(words)
  c = nltk.ConcordanceIndex(text.tokens)
  offsets = c.offsets(focal_word)
Example #22
# Access the plaintext; outputs pure string/basestring.
print newcorpus.raw().strip()
print

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
#       nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph, and
# Each paragraph contains sentence(s), and
# Each sentence contains token(s)
print newcorpus.paras()
print

# To access paragraphs of a specific fileid.
print newcorpus.paras(newcorpus.fileids()[0])

# Access sentences in the corpus. (list of list of strings)
# NOTE: The texts are flattened into sentences that contain tokens.
print newcorpus.sents()
print

# To access sentences of a specific fileid.
print newcorpus.sents(newcorpus.fileids()[0])

# Access just tokens/words in the corpus. (list of strings)
print newcorpus.words()

# To access tokens of a specific fileid.
print newcorpus.words(newcorpus.fileids()[0])
Example #23
class Contract_Reader():
    def __init__(self, config):
        print('Filepath for texts = ', config.textpath)
        self.corpus = PCR(config.textpath,
                          '.*\.txt',
                          encoding='utf-16',
                          para_block_reader=read_line_block)
        if config.clean_paragraphs == 'yes':
            self.clean(config, mode='para')
        if config.clean_sentences == 'yes':
            self.clean(config, mode='sent')
        #Corpus summaries
        self.corpus_info()
        self.LDA(config.num_topics, config.num_words)
        self.plot(config.num_words)

    def clean(self, config, mode='sent'):
        stop = set(stopwords.words('english'))
        exclude = set(string.punctuation)
        lemma = WNL()
        if mode == 'para':
            #paragraphs are lists of sentences each of which is a list of tokens. Reducing to list of strings.
            self.para_list = [
                list(itertools.chain.from_iterable(para))
                for para in self.corpus.paras()
            ]
            for index, paragraph in enumerate(self.para_list):
                paragraph = " ".join(paragraph)
                stop_free = " ".join(
                    [i for i in paragraph.lower().split() if i not in stop])
                punc_free = ''.join(ch for ch in stop_free
                                    if ch not in exclude)
                normalized = " ".join(
                    lemma.lemmatize(word) for word in punc_free.split())
                self.para_list[index] = normalized
            print(self.para_list[0])
            self.para_list = [para.split() for para in self.para_list]
            print(self.para_list[0])
        if mode == 'sent':
            #Obtain list of strings each one a sentence rather than list of lists.
            self.sents_list = [" ".join(sent) for sent in self.corpus.sents()]
            for index, sentence in enumerate(self.sents_list):
                stop_free = " ".join(
                    [i for i in sentence.lower().split() if i not in stop])
                punc_free = ''.join(ch for ch in stop_free
                                    if ch not in exclude)
                normalized = " ".join(
                    lemma.lemmatize(word) for word in punc_free.split())
                self.sents_list[index] = normalized
            print(self.sents_list[0])
            self.sents_list = [
                sentence.split() for sentence in self.sents_list
            ]
            print(self.sents_list[0])

    def LDA(self, num_topics, num_words):
        dictionary = corpora.Dictionary(self.para_list)
        doc_term_matrix = [dictionary.doc2bow(para) for para in self.para_list]
        path = '/mnt/APPDATA/Project_Mafia/omkhalil/vowpal_binaries/vw-7.20150623'
        self.ldamodel = LdaVowpalWabbit(path,
                                        doc_term_matrix,
                                        num_topics=num_topics,
                                        id2word=dictionary)
        self.ldamodel.save('model/lda_model')
        print(self.ldamodel.print_topics(num_topics=10, num_words=num_words))

    def plot(self, num_words):
        for t in range(self.ldamodel.num_topics):
            plt.figure()
            tuples = [
                reversed(x) for x in self.ldamodel.show_topic(t, num_words)
            ]
            plt.imshow(WordCloud().fit_words(dict(tuples)))
            plt.axis("off")
            plt.title("Topic #" + str(t))
            plt.savefig('plots/topic' + str(t))

    def corpus_info(self):
        """
        Summary information about the status of a corpus.
        """
        fids = len(self.corpus.fileids())
        paras = len(self.corpus.paras())
        sents = len(self.corpus.sents())
        sperp = sum(len(para) for para in self.corpus.paras()) / float(paras)
        tokens = FreqDist(self.corpus.words())
        count = sum(tokens.values())
        vocab = len(tokens)
        lexdiv = float(count) / float(vocab)

        print(
            ("Text corpus contains {} files\n"
             "Composed of {} paragraphs and {} sentences.\n"
             "{:0.3f} sentences per paragraph\n"
             "Word count of {} with a vocabulary of {}\n"
             "lexical diversity is {:0.3f}").format(fids, paras, sents, sperp,
                                                    count, vocab, lexdiv))
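
The Contract_Reader excerpt relies on aliases and imports that were cut from the snippet. A plausible header is sketched below; every name is an assumption about the original project, and LdaVowpalWabbit is only importable from gensim.models.wrappers in gensim versions before 4.0:

# Assumed imports for the excerpt above (not shown in the original snippet)
import string
import itertools
import matplotlib.pyplot as plt
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.corpus.reader.plaintext import PlaintextCorpusReader as PCR
from nltk.corpus.reader.util import read_line_block
from nltk.stem import WordNetLemmatizer as WNL
from gensim import corpora
from gensim.models.wrappers import LdaVowpalWabbit  # gensim < 4.0
from wordcloud import WordCloud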
Example #24
print(a)

from scipy.spatial.distance import cosine
print(cosine(dtm[0].toarray(),dtm[1].toarray()))


from sklearn.feature_extraction.text import TfidfVectorizer

tfid_vectors = TfidfVectorizer()
tfid_vectors = tfid_vectors.fit_transform([sent1,sent2])
print(pd.DataFrame(data = tfid_vectors.toarray()))
a1=pairwise_distances(tfid_vectors[0].toarray(),tfid_vectors[1].toarray(),metric='cosine')
print(a1)

print("________________Tf-idf corpus reader__________________________")

from nltk.corpus.reader.plaintext import PlaintextCorpusReader
path="./text_docs/"

president_corpus = PlaintextCorpusReader(path,".*",encoding="utf-8")
tfid_vectors_corpus = TfidfVectorizer(input='filename')
files= [path+filename for filename in list(president_corpus.fileids())]
tf_idf_matrix = tfid_vectors_corpus.fit_transform(raw_documents=files)
barack = tf_idf_matrix.toarray()[0]
bush = tf_idf_matrix.toarray()[1]
trump = tf_idf_matrix.toarray()[2]

print(cosine(barack,bush))
print(cosine(bush,trump))
print(cosine(trump,barack))
Example #25
    return file.read()


# Create the corpus folder
newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):  # does the corpus folder already exist?
    os.mkdir(newCorpusDir)

# Read the files
# plain text file
txt1 = getText('./Files/sample_feed.txt')
# PDF file
txt2 = pdf.getTextPDF('./Files/sample-pdf.pdf')
# DOCX file
txt3 = word.getTextWord('./Files/sample-one-line.docx')

# Write the files
files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

# Build the custom corpus
# read every file in the folder and create the corpus from them
newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')

# Check that the custom corpus was built correctly
print(newCorpus.words())  # array containing every word in the corpus
print(newCorpus.sents(newCorpus.fileids()[1]))  # print the array of all sentences in 1.txt
print(newCorpus.paras(newCorpus.fileids()[0]))  # print the array of all paragraphs in 0.txt
Example #26
    # if punctuation marks occur in the string, replace them with the empty string
    for x in string.lower():
        if x in punctuations:
            string = string.replace(x, "")
    return string


debug = None
big_paras = []

print_timestamp('\n' * 3 + 'End')
print_timestamp('\n' * 3 + 'Begin')

corpusdir = '..\\Thinkful\\Datafiles/UnsupervisedLearningCapstone\\fiction_corpus\\'
fiction_corpus = PlaintextCorpusReader(corpusdir, '.*.txt')
documents_stat = fiction_corpus.fileids()
if debug:
    print("documents_stat={} and is a {} datatype".format(
        documents_stat, type(documents_stat)))
documents_stat_0 = []
# documents_stat_0.append(documents_stat[0])

if debug:
    print("documents_stat_0 is a {} datatype".format(type(documents_stat)))
item_num = 0

book_block = []
word_counts = {}

for book in documents_stat:
    item_num += 1
Example #27
'''

from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
import string
import csv
from fileinput import filename

corpusdir = 'C:/Users/Advaith GVK/workspace/Trial/src/Pack/New folder' # Directory of corpus.

newcorpus = PlaintextCorpusReader(corpusdir, '.*')

filenames = newcorpus.fileids()
# print newcorpus.sents()

def getWordNetType(tag):
        #print tag
        if tag in ['JJ', 'JJR', 'JJS']:
            return wn.ADJ
        elif tag in ['NN', 'NNS', 'NNP', 'NNPS','POS','FW']:
            return wn.NOUN
        elif tag in ['RB', 'RBR', 'RBS','WRB']:
            return wn.ADV
        elif tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
            return wn.VERB
        return wn.NOUN

Example #28
import os
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import numpy as np
import operator
import os
import sys
#corpus of segments
reload(sys)
sys.setdefaultencoding('Cp1252')
corpusdir = '/home/abc/Desktop/adm/segments'
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
sortedall = sorted(newcorpus.fileids())
tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

doc_a = "/home/rashmi/Documents/adm_project/appended/0.txt"
doc_b = "/home/rashmi/Documents/adm_project/appended/1.txt"
doc_c = "/home/rashmi/Documents/adm_project/appended/2.txt"
doc_d = "/home/rashmi/Documents/adm_project/appended/3.txt"
doc_e = "/home/rashmi/Documents/adm_project/appended/4.txt"

# compile sample documents into a list
Example #29
	if (verbose and j%inc==0): print('Progress:',j,'/',jobs)
	return term_yn

# prepocess the text here
def preprocess(t):
	rem_chars = "[!\"#$%&()*+,:;<=>?@[\\]^_`{|}~0123456789]" # remove these
	rep_chars = "[-./\']" # replace these
	t_temp = re.sub(rem_chars, "", t.lower())
	t_temp = re.sub(rep_chars, " ", t_temp)
	t_strip_lower_filt = [w for w in t_temp.split() if not w in stopwords.words('english')]
	return " ".join(t_strip_lower_filt)

# load the data
corpusdir = 'corpus_txt/' # Directory of corpus.
mycorp_raw = PlaintextCorpusReader(corpusdir, '.*')
file_index = mycorp_raw.fileids()

# preprocess the text (slow)
# uncomment one of the following lines for usual vs parallel processing
#mycorp_proc = nltk.Text([preprocess(mycorp_raw.raw(f)) for f in file_index])
mycorp_proc = Parallel(n_jobs=3,verbose=True)(delayed(preprocess)(mycorp_raw.raw(f)) for f in file_index)


# get ngrams (1-3)
vectorizer_ngrams = CountVectorizer(min_df = 0.05, ngram_range=(1, 3))
mat_ngrams = vectorizer_ngrams.fit_transform(mycorp_proc)
n_df = pd.DataFrame(data = mat_ngrams.A, 
	columns = vectorizer_ngrams.get_feature_names())
n_df['pt_id'] = [i[:-4] for i in file_index]
# write results to file
n_df.to_csv('ngrams_dtm.csv', index = False)
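
The preprocessing script above references names whose imports were truncated from the excerpt; a plausible, assumed header would be:

# Assumed imports for the excerpt above (the original header was truncated)
import re
import pandas as pd
from joblib import Parallel, delayed
from nltk.corpus import stopwords
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from sklearn.feature_extraction.text import CountVectorizer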
Example #30
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import word_tokenize
import re


corpusdir = 'python/' # Directory of corpus.

newcorpus = PlaintextCorpusReader(corpusdir, '.*')
print(newcorpus.fileids()[0])
print(type(newcorpus))
#print newcorpus.raw()
print newcorpus.words(newcorpus.fileids()[0])
print(len(newcorpus.words()))

tokens = word_tokenize(newcorpus.raw())
#type(tokens)
print len(tokens)
print tokens[:50]
#tokens[:10]
print newcorpus.sents()
print

#to remove comments
def removeComments(string):
    string = re.sub(re.compile(r"/\*.*?\*/", re.DOTALL), "", string)  # remove every occurrence of stream comments (/* COMMENT */) from the string
    string = re.sub(re.compile(r"//.*?\n"), "", string)  # remove every occurrence of single-line comments (// COMMENT\n) from the string
    return string

print(removeComments(newcorpus.raw()))
Example #31
#         text_list.append(text)

# preprocessed_docs = []
# for n,t in enumerate(text_list):
#     # print sample of text before and after processing
#     #if n == (len(text_list) - 1):
#     #    print(("Doc {} (before preproc): {}").format(n, t))
#     #    print(("Doc {}: {}").format(n, p))
#     p = preprocess(t)
#     preprocessed_docs.append(p)

# print("Preprocessed docs len:", len(text_list))

texts = PlaintextCorpusReader(d, ".*\.txt")

boc_texts = [extract(texts.raw(fileid)) for fileid in texts.fileids()]

dictionary = gensim.corpora.Dictionary(boc_texts)
#dictionary = gensim.corpora.Dictionary(preprocessed_docs)
#dictionary.filter_extremes(no_below=10,no_above=.5,keep_n=100000)
#bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]
bow_corpus = [dictionary.doc2bow(boc_text) for boc_text in boc_texts]
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

fileids = texts.fileids()

for idx, doc in enumerate(corpus_tfidf):
    new_file.write("Document '{}' key phrases:\n".format(fileids[idx]))
    # Get top 100 terms by TF-IDF score
    for wid, score in heapq.nlargest(100, doc, key=itemgetter(1)):
Example #32
CORPUS_EXTENSION =r'.*\.txt'

import nltk
import os
from os import listdir
from os.path import isfile, join
from nltk.collocations import *
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# read in corpus, find all the 3-grams above the min frequency
print "Reading in corpus from", CORPUS_ROOT
my_corpus = PlaintextCorpusReader(CORPUS_ROOT, CORPUS_EXTENSION)
print "Read in " + str(len(my_corpus.fileids())) + " files"
print "Finding 3-grams"
finder_3gram = TrigramCollocationFinder.from_words(my_corpus.words())
print "Filtering out 3-grams of frequency less than", MIN_FREQUENCY
finder_3gram.apply_freq_filter(MIN_FREQUENCY)

# combine all the 3-grams meeting the PMI threshold
print "Looking for 3-grams with a PMI of at least ", MIN_3GRAM_PMI
filelist = [ join(CORPUS_ROOT,f) for f in listdir(CORPUS_ROOT) if isfile(join(CORPUS_ROOT,f)) ]
gen = finder_3gram.above_score(trigram_measures.pmi, MIN_3GRAM_PMI)
processGrams(gen, filelist)

# now let's do the same for the 2-grams
# our previous step altered the corpus so let's read it in again
print "Reading in corpus from", CORPUS_ROOT
my_corpus = PlaintextCorpusReader(CORPUS_ROOT, CORPUS_EXTENSION)
Example #33
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

#Loading the file you want to Train
corpusdir = r'E:\MTech'  # Directory of corpus.

newcorpus = PlaintextCorpusReader(corpusdir, '.*')
print(newcorpus.fileids())
dictlist=[]

#Converting from word/tag pairs to a list of (word, tag) tuples, i.e. [(word1, tag), (word2, tag)]
for i in newcorpus.fileids():
    tagged_sent=newcorpus.raw(i)
    tagged=tagged_sent.split()
    for t in tagged:
        temp1=nltk.tag.str2tuple(t)
        dictlist.append(temp1)
print(dictlist)
print("This is the length of distinct words")
print(len(set(dictlist)))
fdist=nltk.FreqDist(dictlist)
print("fdist items")
print(fdist.items())
print(fdist.max())




Example #34
rawtext = '''
 '''
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

############################  Read Data  #############################################

#if you run this code with ipython you should enter the address of your file: instead of './txts'
corpus_directory = './txts'
textsfile = PlaintextCorpusReader(corpus_directory, '.*')
ID_files = textsfile.fileids()
print(ID_files, len(ID_files))

##############################  Preprossesing Data  ######################################

stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
Index_of_files = []
texts = []
count = 0

#file with file_ids
for fileid in ID_files:
    texts.append(textsfile.raw(fileids=fileid))
    Index_of_files.append(fileid)
Example #35
if len(sys.argv) != 4:
  print "Usage:", sys.argv[0], "word sense1 sense2"
  exit(-1)

focal_word = sys.argv[1]
senses = [sys.argv[2], sys.argv[3]]
#focal_word = "plant"
#senses = ["manufacturing","life"]
corpus = PlaintextCorpusReader('outcorpus/', '.*')
collocations = [ wsd.BigramLeft(senses, 0), wsd.BigramRight(senses, 1), wsd.BigramScope(senses, 2, [2, 10]) ]
decision_list = wsd.DecisionList()
decision_list.load("senses_bootstrap_" + focal_word + ".csv")    

i = 0
for infile in sorted(corpus.fileids()):
  print i, "/", len(corpus.fileids())
  i += 1
  
  words = corpus.words(infile)
  text = Text(words)
  c = nltk.ConcordanceIndex(text.tokens)
  offsets = c.offsets(focal_word)
  
  for offset in offsets:
    for collocation in collocations:
      tokens = collocation.get_collocation(text, offset)
      if tokens == None: continue
      sense = decision_list.get_sense(tokens, collocation.index)
      if sense == None: continue
      collocation.add_collocation(text, offset, sense)
Example #36
import glob
import os
import string
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from scipy.cluster.hierarchy import ward, dendrogram
from sklearn.manifold import MDS
import matplotlib.pyplot as plt
import scipy.stats as stats

names = []
corpus = []

co = PlaintextCorpusReader("./election", ".*\.txt")

for fileids in co.fileids():
    names.append(fileids)
    corpus.append(co.raw(fileids))

print len(names), 'documents in the corpus'
print names[:30]

for idx in range(len(corpus) - 1, -1, -1):
    print
    print names[idx]
    print corpus[idx][:70].replace('\n', ' ')

vectorizer = TfidfVectorizer(stop_words='english', min_df=2)
dtm = vectorizer.fit_transform(corpus)
print dtm.shape
vocab = vectorizer.get_feature_names(
Example #37
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from decimal import Decimal
from math import pi

if __name__ == '__main__':
    ptcr = PlaintextCorpusReader(r'C:\Users\Jakub\Downloads\pr4\Trzeci plik', ['znormalizowane.txt', 'katy.txt'])
    data = []
    t = ptcr.raw(fileids=ptcr.fileids()[1]).replace(',', '.').replace('\r', '').split('\n')
    t.remove('')
    for x in t:
        data.append(float(Decimal(x)*360/315))
    print data
    data_ = []
    t = ptcr.raw(fileids=ptcr.fileids()[0]).replace(',', '.').replace('\r', '').split('\n')
    t.remove('')
    for x in t:
        data_.append(float(x)/100)
    print data_
Example #38
    with open(corpusdir + str(filename) + '.txt', 'w') as fout:
        print >> fout, text

# Check that our corpus does exist and the files are correct.
assert os.path.isdir(corpusdir)
for infile, text in zip(sorted(os.listdir(corpusdir)), corpus):
    assert open(corpusdir + infile, 'r').read().strip() == text.strip()

# Create a new corpus by specifying the parameters
# (1) directory of the new corpus
# (2) the fileids of the corpus
# NOTE: in this case the fileids are simply the filenames.
newcorpus = PlaintextCorpusReader('newcorpus/', '.*')

# Access each file in the corpus.
for infile in sorted(newcorpus.fileids()):
    print infile  # The fileids of each file.
    with newcorpus.open(infile) as fin:  # Opens the file.
        print fin.read().strip()  # Prints the content of the file
print

# Access the plaintext; outputs pure string/basestring.
print newcorpus.raw().strip()
print

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
#       nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph, and
# Each paragraph contains sentence(s), and
Example #39
# 150 common_words x 6 fileids x sentences(937 total) x words + append/join sentence into string + 6x search fileids
from nltk.tokenize.treebank import TreebankWordDetokenizer

common_words = dict(freq.most_common(150))
print(common_words)

data = []
hmap = {}
detokenized = {}

for word, frequency in common_words.items():
    datum = {'word': word, 'frequency': frequency}
    docs = []
    sents = []

    for key, fileid in enumerate(corpus.fileids()):
        if key not in hmap:
            hmap[key] = {}

        for s_id, sentence in enumerate(corpus.sents(fileid)):
            if key in hmap and s_id in hmap[key]:
                words = hmap[key][s_id]
            else:
                words = [lemmatizer.lemmatize(w.lower()) for w in sentence]
                hmap[key][s_id] = words

            if word in words:
                s_key = f'{key}-{s_id}'
                sent = ''

                if s_key in detokenized:
Example #40
import os
import word, pdf
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


def getText(txtFileName):
    file = open(txtFileName, 'r')
    return file.read()


newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):
    os.mkdir(newCorpusDir)

txt1 = getText('sample_feed.txt')
txt2 = pdf.getTextPDF('sample-pdf.pdf')
txt3 = word.getTextWord('sample-one-line.docx')

files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')

print(newCorpus.words())
print(newCorpus.sents(newCorpus.fileids()[1]))
print(newCorpus.paras(newCorpus.fileids()[0]))
Example #41
    boc_texts = [
        extract(texts.raw(fileid)) for fileid in texts.fileids()
    ]

    # make gensim dictionary and corpus
    dictionary = gensim.corpora.Dictionary(boc_texts)
    corpus = [dictionary.doc2bow(boc_text) for boc_text in boc_texts]

    # transform corpus with tf*idf model
    tfidf = gensim.models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    return corpus_tfidf, dictionary

# Can change this to output just keywords by commenting out 1st, 2nd, and 4th "new_file.write" lines
if __name__ == '__main__':
    tfidfs, id2word = score_keyphrases_by_tfidf(texts)#, 'words')
    fileids = texts.fileids()

    # Print top keywords by TF-IDF
    for idx, doc in enumerate(tfidfs):
        new_file.write("Document '{}' key phrases:\n".format(fileids[idx]))
        # Get top 10 terms by TF-IDF score
        for wid, score in heapq.nlargest(keyphrase_num, doc, key=itemgetter(1)):
            new_file.write("{:0.3f}: {}\n".format(score, id2word[wid]))
            #new_file.write("{}\n".format(id2word[wid]))

        new_file.write("\n")

print("Done! Look for {} in the 'Classifier' directory".format(new_file_name))
Example #42
def create_content(gdocs, graphicsdir, gcontent):
    for file in gdocs:
        with open(graphicsdir + '/' + str(file), 'r') as fh:  # close each file handle promptly
            gcontent.append(fh.read())


# defining the directory path for each category
graphicsdir, autosdir, gunsdir = '20news-bydate/train/comp.graphics', '20news-bydate/train/rec.autos', '20news-bydate/train/talk.politics.guns'
graphicstest, autostest, gunstest = '20news-bydate/test/comp.graphics', '20news-bydate/test/rec.autos', '20news-bydate/test/talk.politics.guns'
graphicscorpus = PlaintextCorpusReader(graphicsdir, '.*')
autoscorpus = PlaintextCorpusReader(autosdir, '.*')
gunscorpus = PlaintextCorpusReader(gunsdir, '.*')
graphicscorpustest = PlaintextCorpusReader(graphicstest, '.*')
autoscorpustest = PlaintextCorpusReader(autostest, '.*')
gunscorpustest = PlaintextCorpusReader(gunstest, '.*')

# initializing the lists
gdocs,adocs,ndocs,gcontent,acontent,ncontent,gwords,awords,nwords,vocab = [],[],[],[],[],[],[],[],[],[]
gtdocs,atdocs,ntdocs,gtcontent,atcontent,ntcontent,gtwords,atwords,ntwords,vtocab = [],[],[],[],[],[],[],[],[],[]
# for train dataset
gdocs.extend(graphicscorpus.fileids()) # for graphics category
adocs.extend(autoscorpus.fileids()) # for autos category
ndocs.extend(gunscorpus.fileids()) # for guns category
# for test dataset
gtdocs.extend(graphicscorpustest.fileids()) # for graphics category
atdocs.extend(autoscorpustest.fileids()) # for autos category
ntdocs.extend(gunscorpustest.fileids()) # for guns category
# retrieving the words for each category
# for train dataset
create_content(gdocs,graphicsdir,gcontent)
create_content(adocs,autosdir,acontent)
create_content(ndocs,gunsdir,ncontent)
# for test dataset
create_content(gtdocs,graphicstest,gtcontent)
create_content(atdocs,autostest,atcontent)
create_content(ntdocs,gunstest,ntcontent)
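
As a side note, the same per-category layout could also be loaded with NLTK's CategorizedPlaintextCorpusReader, which derives each document's category from its folder name. A brief sketch follows; the cat_pattern shown is an assumption about the directory layout, not taken from the example above:

from nltk.corpus.reader import CategorizedPlaintextCorpusReader

# One reader over the whole train split; the category is taken from the folder name.
train = CategorizedPlaintextCorpusReader(
    '20news-bydate/train',
    r'.*/.*',                        # every file inside a category subfolder
    cat_pattern=r'([\w\.]+)/.*')     # e.g. 'comp.graphics/12345' -> category 'comp.graphics'

print(train.categories())                          # e.g. ['comp.graphics', 'rec.autos', ...]
print(len(train.fileids(categories='rec.autos')))  # number of documents in one category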