Code Example #1
def load_corpus(race_code=None,
                gender_code=None
                ):  #loads corpora into an array based on race and gender

    if race_code is None:  # if none is specified, search all
        race_code = ".."
    if gender_code is None:
        gender_code = ".."

    reader = PlaintextCorpusReader(
        corpus_root, ".*_" + race_code + "_" + gender_code +
        r"\.txt")  # uses filename encoding to load specified texts
    corpora = []

    for fileid in reader.fileids(
    ):  #creates ComedyCorpus object, populates with fileid and name
        new_corpus = ComedyCorpus()
        new_corpus.set_fileid(fileid)
        try:
            new_corpus.set_text(
                reader.raw(fileid))  #gets word content based on fileid
        except UnicodeDecodeError:
            continue
        fileid = re.sub("_" + race_code + "_" + gender_code + r"\.txt", "",
                        fileid)
        #name is fileid without encoding
        fileid = fileid.replace("%20", " ")
        fileid = fileid.replace("_", "; ")
        print(fileid)
        new_corpus.set_name(fileid)
        corpora.append(new_corpus)

    return corpora
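
# A minimal usage sketch (the "w"/"f" codes below are hypothetical; in practice
# the codes must match whatever filename encoding the corpus files actually use):
#   everything   = load_corpus()            # no filter: match all race/gender codes
#   white_female = load_corpus("w", "f")    # only files ending in _w_f.txt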
Code Example #2
def load_feat_data(dir_array):

    data_list = []

    for direct in dir_array:

        data = []

        corpus_dir = 'dataset/' + direct
        corpus = PlaintextCorpusReader(corpus_dir, '.*\.*')
        file_ids = corpus.fileids()

        for file in file_ids:
            text = corpus.raw(file)
            e = email.message_from_string(text)

            if e.is_multipart():
                # get_payload() returns the list of sub-parts of a multipart email
                for payload in e.get_payload():
                    text = payload.get_payload()

            else:
                text = e.get_payload()

            data.append(extract_features(text, corpus, file))

        data_list.extend(data)

    return data_list
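
# A minimal usage sketch (the directory names below are hypothetical; each one
# must exist under dataset/ and contain raw email files):
#   rows = load_feat_data(['ham', 'spam'])
#   print(len(rows))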
Code Example #3
File: Parse.py  Project: bflick/FYE-NLP
def parseFolder( dirPath ):
    assignments = {}
    draftReader = PlaintextCorpusReader(dirPath, r'\d+draft\d*.*')
    finalReader = PlaintextCorpusReader(dirPath, r'\d+final\d*.*')

    numFiles = len( os.listdir( dirPath ))
    assert numFiles % 2 == 0

    finalIdsSortedList = finalReader.fileids()
    draftIdsSortedList = draftReader.fileids()

    for pid in finalReader.fileids():
        final = finalReader.paras( pid ) #finalIdsSortedList[i] )
        draft = draftReader.paras( pid ) #draftIdsSortedList[i] )
        assn = assignment( draft, final )
        assignments[pid] = assn

    return assignments
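
# A minimal usage sketch (the folder name is hypothetical; the directory must
# contain matching "<id>draft..." and "<id>final..." files):
#   assignments = parseFolder('essays/')
#   print(len(assignments))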
Code Example #4
File: testEditDist.py  Project: bflick/FYE-NLP
def parseFolder( dirPath ):
    assignments = []
    draftReader = PlaintextCorpusReader(dirPath, r'\d+draft\d*.*')
    finalReader = PlaintextCorpusReader(dirPath, r'\d+final\d*.*')

    numFiles = len( os.listdir( dirPath ))
    assert numFiles % 2 == 0

    finalIdsSortedList = finalReader.fileids()
    draftIdsSortedList = draftReader.fileids()

    for i in range(len(finalReader.fileids())):
        final = finalReader.paras( finalIdsSortedList[i] )
        draft = draftReader.paras( draftIdsSortedList[i] )
        assn = assignment( draft, final )
        assignments.append( assn )

    return assignments
Code Example #5
def get_fileid_lst(source_dir):
    '''
    Use NLTK to pull in the list of file ids in the given source directory

    :param {str} source_dir:
        The relative path to the source directory that contains all the data (book) files
    :return {str} fileid_lst:
        List of all file id's ending in '.txt' in the source_dir
    '''
    temp_corp = PlaintextCorpusReader(source_dir, r'.*\.txt')
    fileid_lst = temp_corp.fileids()

    return fileid_lst
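
# A minimal usage sketch (assumes a 'books/' directory holding .txt files):
#   fileid_lst = get_fileid_lst('books')
#   print(fileid_lst[:5])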
Code Example #6
def main():
    """
    Main function of the program
    """

    corpus_dir = 'NLP_dataset/training_set'  # Directory of corpus.
    new_corpus = PlaintextCorpusReader(corpus_dir, '.*')
    for file_id in new_corpus.fileids():
        # reading each file to get matched sentences
        with open(corpus_dir + "/" + file_id, "r") as file_to_read:
            matched_sen = match_regular_expressions(file_to_read)

        # writing the matched sentences to files
        write_to_files(matched_sen, file_id)
Code Example #7
def load_data(dir_label):

    data_list = []
    labels = []

    for dl in dir_label:

        data = []

        directory = dl[0]
        label = dl[1]

        corpus_dir = 'dataset/' + directory
        corpus = PlaintextCorpusReader(corpus_dir, '.*\.*')
        file_ids = corpus.fileids()

        for file in file_ids:

            d = []

            text = corpus.raw(file)
            e = email.message_from_string(text)

            if e.is_multipart():
                # get_payload() returns the list of sub-parts of a multipart email
                for payload in e.get_payload():
                    text = payload.get_payload()
            else:
                text = e.get_payload()

            feats = [
                cf.charac_feats_extractor(text),
                wf.word_feats_extractor(text),
                syf.syntac_feats_extractor(text),
                stf.struct_feats_extractor(corpus, file, text),
                fwf.funct_word_feats_extractor(text)
            ]

            for f in feats:
                d.extend(list(f.values()))

            data.append(d)
            labels.append(label)

        data_list.extend(data)

    return [data_list, labels]
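
# A minimal usage sketch (the directory/label pairs are hypothetical; each
# directory must exist under dataset/):
#   data, labels = load_data([('ham', 0), ('spam', 1)])
#   print(len(data), len(labels))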
Code Example #8
def processFile(newCorpusDir):
    if not os.path.isdir(newCorpusDir):
        os.mkdir(newCorpusDir)
    txt1 = getText('sample_feed.txt')
    txt2 = pdf.getTextPDF('VirtualBoxTroubleshooting.pdf')
    txt3 = word.getTextWord('my_doc.docx')

    files = [txt1, txt2, txt3]
    for idx, f in enumerate(files):
        with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
            fout.write(f)

    newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')

    print(newCorpus.words())
    print(newCorpus.sents(newCorpus.fileids()[1]))
    print(newCorpus.paras(newCorpus.fileids()[0]))
Code Example #9
File: testNLTK.py  Project: Ousteau/projetPython
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# Create a new corpus by specifying the parameters
# (1) directory of the new corpus
# (2) the fileids of the corpus
# NOTE: in this case the fileids are simply the filenames.
newcorpus = PlaintextCorpusReader('nltkCorpusAll/', '.*')

# Access each file in the corpus.
for infile in sorted(newcorpus.fileids()):
    print infile # The fileids of each file.
    fin = newcorpus.open(infile)# Opens the file.
    print fin.read().strip() # Prints the content of the file
print

# Access the plaintext; outputs pure string/basestring.
print newcorpus.raw().strip()
print 

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and 
#       nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph, and
# Each paragraph contains sentence(s), and
# Each sentence contains token(s)
print newcorpus.paras()
print

# To access paragraphs of a specific fileid.
Code Example #10
#Tried to find misspellings in a corpus of text files. See find_misspellings.py and grouping_docs.py for documentation.
#There are ~30,400 unique words in these 49 communication files
#Rebecca's laptop took too long to make the correlation matrix

import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer
import numpy as np
from numpy import linalg

#make a new corpus
corpusdir = 'communications/small_test_batch' #where the files are
newcorpus = PlaintextCorpusReader(corpusdir, '.*')

fileids = newcorpus.fileids() #list of fileids
j = len(fileids) #number of docs

words_list = [] #['doc', '1', 'words', 'doc', '2', 'words',...]
doc_breaks = [0] #ith entry = index of first word in doc i in words_list
keywords = set() #{'doc', '1', 'words', '2',...}

tokenizer = RegexpTokenizer('\w+') #pick out alphanumeric sequences; discard punctuation, white space

#create set of keywords and list of file texts
for id in fileids:
    raw = newcorpus.raw(id)
    raw2 = ''.join([i if ord(i)<128 else '' for i in raw]) #remove unicode characters
    raw3 = raw2.encode('ascii')
    file_words = map(str.lower,tokenizer.tokenize(raw3)) #list of cleaned words: lower-case, no punct, no whitespace
    words_list = words_list + file_words
    doc_breaks = doc_breaks + [len(file_words)+doc_breaks[len(doc_breaks)-1]]
Code Example #11
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import nltk

# Might need the below line once
# nltk.download('punkt')

corpusDir = 'own_corpus/'

newCorpus = PlaintextCorpusReader(corpusDir, r'.*\.txt')

for file in sorted(newCorpus.fileids()):
    words = newCorpus.words(file)
    text = nltk.Text(words)
    print(text)
Code Example #12
File: nlp.py  Project: ddyson1/nltk
# Imports assumed by this snippet (the original file presumably defines them earlier)
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.text import Text

# nltk.download()
# nltk.download('gutenberg')

# text1.concordance("water")
# print(FreqDist(text1).most_common(50))
# FreqDist(text1).plot(50, cumulative=True)
# print(set(text1))

corpus_root = '/Users/devindyson/Desktop/nltk/corpora'
corpora = PlaintextCorpusReader(corpus_root, '.*')

# print(corpora.raw('meditations.txt'))
# print(SentimentIntensityAnalyzer().polarity_scores("NLTK is pretty dope."))

print(sorted(corpora.fileids()))
print(len(corpora.words('meditations.txt')))
print(len(corpora.words('benjamin.txt')))

meditations = Text(corpora.words('meditations.txt'))
benjamin = Text(corpora.words('benjamin.txt'))


def lexical_diversity(text_data):
    word_count = len(text_data)
    vocab_size = len(set(text_data))
    diversity_score = vocab_size / word_count
    return diversity_score


print(lexical_diversity(meditations))
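# benjamin is loaded above but never scored; a parallel call would be:
print(lexical_diversity(benjamin))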
Code Example #13
                       tokenizer=word_tokenize, postagger=pos_tag,
                       lemmatizer=wnl, stemmer=porter):
    words, lemmas, poss = [], [], []
    for word, pos in postagger(sentence):
        pos = penn2morphy(pos)
        lemmas.append(lemmatize(word.lower(), pos, neverstem,
                                lemmatizer, stemmer))
        poss.append(pos)
        words.append(word)
    if keepWordPOS:
        return words, lemmas, [None if i == '' else i for i in poss]
    return lemmas

regex = re.compile('[_]+')

for f in corpus.fileids():
    outname = args.preprocess + "/" + f + ".out"
    fout = open(outname,"w", encoding="utf8")

splitter = nltk.data.load('tokenizers/punkt/english.pickle')
tokenizer.tokenize(text)

word_tokenize

    for sent in corpus.sents(f):
        s = []
        for w in sent:
                w = regex.sub('',w).lower()
                if (
                        len(w)>2
                        and not w in stop_words
Code Example #14
from nltk.stem.porter import *
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

#from modifiedtexttiling import TextTilingTokenizer
import modifiedtexttiling

#input as all the documents with preprocessed text.
corpusdir = '/home/abc/Desktop/adm/new_dataset'

#output as all the segmented documents with their corresponding document names as prefix.
corpusdir_p = '/home/abc/Desktop/adm/segments'

newcorpus = PlaintextCorpusReader(corpusdir, '.*')

#sort all the document names alphabetically.
sortedall = sorted(newcorpus.fileids())
#print sortedall

for filename in sortedall:
    #open each document.
    fp = open(corpusdir + "/" + filename)
    #print message.
    print 'processing : ' + filename
    #save document text as string.
    n = fp.read()

    #Create TextTilingTokenizer() object
    t = modifiedtexttiling.TextTilingTokenizer()

    #get the segments as list of strings.
    k = t.tokenize(n)
Code Example #15
import datetime
import nltk
from nltk import word_tokenize
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus import floresta,mac_morpho
from parser_portuguese_risk import evaluateModel, splitTrainTestModel, simplify_tag
time1 =datetime.datetime.now()

###############################################################################
### ATTENTION: if we have some tmp files like .DS_STORE in Mac OS X, we must remove them ###

# Reading corpus
corpusdir = '/Users/marceloschiessl/RDF_text_project/corpus/WikiRisk/test/glossAnnotated/' # Directory of corpus.
#corpusdir = '/Users/marceloschiessl/RDF_text_project/corpus/WikiRisk/test/test1/' # Directory of corpus.   
risco = PlaintextCorpusReader(corpusdir, '.*')
risco.fileids()

raw_text = risco.raw('gloss533.txt')
#print raw_text[0:]

# Some statistics

print 'Number of terms: ', len(risco.words())
print 'Number of unique terms: ', len(set(risco.words()))

fd = nltk.FreqDist(risco.words())
print fd.freq('bem')
print fd['bem']

# presenting ngrams of the term
target_word = 'bem como'
Code Example #16
import os
import word, pdf
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


def getText(txtFileName):
    file = open(txtFileName, 'r')
    return file.read()


# Create a new corpus folder (directory)
newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):
    os.mkdir(newCorpusDir)

txt1 = getText('sample_feed.txt')
txt2 = pdf.getTextPDF('sample-pdf.pdf')
txt3 = word.getTextWord('sample-one-line.docx')

# Write the contents of the three string objects to files on disk (write mode)
files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

# Create a plaintext corpus object from the directory where the files were saved
newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')
print(newCorpus.words())  # print all the words of 0.txt
print(newCorpus.sents(newCorpus.fileids()[1]))  # print the sentences of 1.txt
print(newCorpus.paras(newCorpus.fileids()[0]))  # print the paragraphs of 0.txt
Code Example #17
def try_out_some_functionalities():

    corpusdir ="/media/benzro/OS/Users/benzro/Desktop/Studium Uni/2)" \
           "ZweitesSemester/27)PCL-2/Uebungen/Uebung03/Enron/test/"
    newcorpus = PCR(corpusdir, '.*')

    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "access one file in the corpus"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    infile = corpusdir + "0001.1999-12-10.farmer.ham.txt"
    infile = "0004.1999-12-14.farmer.ham.txt"
    fin = newcorpus.open(infile)
    print fin.read().strip()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "all file ids"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.fileids()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "access each file in the corpus"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # (reduced output: [0:2])
    for infile in sorted(newcorpus.fileids()):
        # the fileids of each file
        print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
        print infile
        # opens the file
        fin = newcorpus.open(infile)
        # prints the content of the file
        print fin.read().strip()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "access the plaintext; outputs pure string of all files"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.raw().strip()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Access paragraphs in the corpus. (list of list of list of strings)"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
    #       nltk.tokenize.word_tokenize.
    #
    # Each element in the outermost list is a paragraph, and
    # Each paragraph contains sentence(s), and
    # Each sentence contains token(s)
    print newcorpus.paras()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "To access pargraphs of a specific fileid."
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.paras(newcorpus.fileids()[0])
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Access sentences in the corpus. (list of list of strings)"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # NOTE: The texts are flattened into sentences that contain tokens.
    print newcorpus.sents()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "To access sentences of a specific fileid."
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.sents(newcorpus.fileids()[0])
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Access just tokens/words in the corpus. (list of strings)"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.words()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "To access tokens of a specific fileid."
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.words(newcorpus.fileids()[0])
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
Code Example #18
File: ngram.py  Project: amodig/Matching-System
OUTPUT_SIGNATURE = "file*.lemmatized"

import nltk
import os
import glob
from os.path import join
from nltk.collocations import *
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# read in corpus, find all the 3-grams above the min frequency
print "Reading in corpus from", CORPUS_ROOT
my_corpus = PlaintextCorpusReader(CORPUS_ROOT, CORPUS_EXTENSION)
print "Read in " + str(len(my_corpus.fileids())) + " files"
print "Finding 3-grams"
finder_3gram = TrigramCollocationFinder.from_words(my_corpus.words())
print "Filtering out 3-grams of frequency less than", MIN_FREQUENCY
finder_3gram.apply_freq_filter(MIN_FREQUENCY)

# combine all the 3-grams meeting the PMI threshold
print "Looking for 3-grams with a PMI of at least ", MIN_3GRAM_PMI
filelist = [f for f in glob.glob(CORPUS_ROOT + CORPUS_OUTPUT_EXTENSION)]

gen = finder_3gram.above_score(trigram_measures.pmi, MIN_3GRAM_PMI)
processGrams(gen, filelist)

# now let's do the same for the 2-grams
# our previous step altered the corpus so let's read it in again
print "Reading in corpus from", CORPUS_ROOT
Code Example #19
robotStoryCorpusDir = '../resources/robot_stories'

childStoryCorpus = PlaintextCorpusReader(childStoryCorpusDir, ".*\.txt")
robotStoryCorpus = PlaintextCorpusReader(robotStoryCorpusDir, ".*\.txt")


# average word length, average sentence length, and the number of times each vocabulary item appears in the text on average (our lexical diversity score)
# for fileid in childStoryCorpus.fileids():
#     num_chars = len(childStoryCorpus.raw(fileid))
#     num_words = len(childStoryCorpus.words(fileid))
#     num_sents = len(childStoryCorpus.sents(fileid))
#     num_vocab = len(set([w.lower() for w in childStoryCorpus.words(fileid)]))
#     print ((float(num_chars)/float(num_words)), float(num_words)/float(num_sents), float(num_words)/float(num_vocab), fileid)


for fileid in childStoryCorpus.fileids():

    print (fileid)
    file_path = os.path.join(childStoryCorpusDir, fileid)

    with open(file_path, 'r') as orgf:
        for line in orgf:
            for s in tokenize.sent_tokenize(line):
                print(s)
                #print(st.tag(tokenize.word_tokenize(s)))
                #print(st.tag(s.split()))
                print(list(parser.raw_parse(s)))

                # for line in parser.raw_parse(s):
                #     for sentence in line:
                #         sentence.draw()
Code Example #20
    with open(corpusdir + str(filename) + '.txt', 'w') as fout:
        print >> fout, text

# Check that our corpus does exist and the files are correct.
assert os.path.isdir(corpusdir)
for infile, text in zip(sorted(os.listdir(corpusdir)), corpus):
    assert open(corpusdir + infile, 'r').read().strip() == text.strip()

# Create a new corpus by specifying the parameters
# (1) directory of the new corpus
# (2) the fileids of the corpus
# NOTE: in this case the fileids are simply the filenames.
newcorpus = PlaintextCorpusReader('newcorpus/', '.*')

# Access each file in the corpus.
for infile in sorted(newcorpus.fileids()):
    print(infile)  # The fileids of each file.
    with newcorpus.open(infile) as fin:  # Opens the file.
        print(fin.read().strip())  # Prints the content of the file

# # Access the plaintext; outputs pure string/basestring.
# print(newcorpus.raw().strip())
#
# # Access paragraphs in the corpus. (list of list of list of strings)
# # NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
# #       nltk.tokenize.word_tokenize.
# #
# # Each element in the outermost list is a paragraph, and
# # Each paragraph contains sentence(s), and
# # Each sentence contains token(s)
# print newcorpus.paras()
Code Example #21
File: verify.py  Project: statguy/wsd
from nltk.text import *
import nltk

if len(sys.argv) != 4:
  print "Usage:", sys.argv[0], "word sense1 sense2"
  exit(-1)

focal_word = sys.argv[1]
senses = [sys.argv[2], sys.argv[3]]
#focal_word = "plant"
#senses = ["manufacturing","life"]
corpus = PlaintextCorpusReader('outcorpus/', '.*')
collocations = [ wsd.BigramLeft(senses, 0), wsd.BigramRight(senses, 1), wsd.BigramScope(senses, 2, [2, 10]) ]
decision_list = wsd.DecisionList()
decision_list.load("senses_bootstrap_" + focal_word + ".csv")
corpus_ids = corpus.fileids()
random.shuffle(corpus_ids)

num_words = 1
num_words_max = 100
tagged = 0
ambiguous = 0
unknown = 0

for infile in corpus_ids:
  if num_words > num_words_max: break

  words = corpus.words(infile)
  text = Text(words)
  c = nltk.ConcordanceIndex(text.tokens)
  offsets = c.offsets(focal_word)
Code Example #22
# Access the plaintext; outputs pure string/basestring.
print newcorpus.raw().strip()
print

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
#       nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph, and
# Each paragraph contains sentence(s), and
# Each sentence contains token(s)
print newcorpus.paras()
print

# To access paragraphs of a specific fileid.
print newcorpus.paras(newcorpus.fileids()[0])

# Access sentences in the corpus. (list of list of strings)
# NOTE: The texts are flattened into sentences that contain tokens.
print newcorpus.sents()
print

# To access sentences of a specific fileid.
print newcorpus.sents(newcorpus.fileids()[0])

# Access just tokens/words in the corpus. (list of strings)
print newcorpus.words()

# To access tokens of a specific fileid.
print newcorpus.words(newcorpus.fileids()[0])
Code Example #23
class Contract_Reader():
    def __init__(self, config):
        print('Filepath for texts = ', config.textpath)
        self.corpus = PCR(config.textpath,
                          '.*\.txt',
                          encoding='utf-16',
                          para_block_reader=read_line_block)
        if config.clean_paragraphs == 'yes':
            self.clean(config, mode='para')
        if config.clean_sentences == 'yes':
            self.clean(config, mode='sent')
        #Corpus summaries
        self.corpus_info()
        self.LDA(config.num_topics, config.num_words)
        self.plot(config.num_words)

    def clean(self, config, mode='sent'):
        stop = set(stopwords.words('english'))
        exclude = set(string.punctuation)
        lemma = WNL()
        if mode == 'para':
            #paragraphs are lists of sentences each of which is a list of tokens. Reducing to list of strings.
            self.para_list = [
                list(itertools.chain.from_iterable(para))
                for para in self.corpus.paras()
            ]
            for index, paragraph in enumerate(self.para_list):
                paragraph = " ".join(paragraph)
                stop_free = " ".join(
                    [i for i in paragraph.lower().split() if i not in stop])
                punc_free = ''.join(ch for ch in stop_free
                                    if ch not in exclude)
                normalized = " ".join(
                    lemma.lemmatize(word) for word in punc_free.split())
                self.para_list[index] = normalized
            print(self.para_list[0])
            self.para_list = [para.split() for para in self.para_list]
            print(self.para_list[0])
        if mode == 'sent':
            #Obtain list of strings each one a sentence rather than list of lists.
            self.sents_list = [" ".join(sent) for sent in self.corpus.sents()]
            for index, sentence in enumerate(self.sents_list):
                stop_free = " ".join(
                    [i for i in sentence.lower().split() if i not in stop])
                punc_free = ''.join(ch for ch in stop_free
                                    if ch not in exclude)
                normalized = " ".join(
                    lemma.lemmatize(word) for word in punc_free.split())
                self.sents_list[index] = normalized
            print(self.sents_list[0])
            self.sents_list = [
                sentence.split() for sentence in self.sents_list
            ]
            print(self.sents_list[0])

    def LDA(self, num_topics, num_words):
        dictionary = corpora.Dictionary(self.para_list)
        doc_term_matrix = [dictionary.doc2bow(para) for para in self.para_list]
        path = '/mnt/APPDATA/Project_Mafia/omkhalil/vowpal_binaries/vw-7.20150623'
        self.ldamodel = LdaVowpalWabbit(path,
                                        doc_term_matrix,
                                        num_topics=num_topics,
                                        id2word=dictionary)
        self.ldamodel.save('model/lda_model')
        print(self.ldamodel.print_topics(num_topics=10, num_words=num_words))

    def plot(self, num_words):
        for t in range(self.ldamodel.num_topics):
            plt.figure()
            tuples = [
                reversed(x) for x in self.ldamodel.show_topic(t, num_words)
            ]
            plt.imshow(WordCloud().fit_words(dict(tuples)))
            plt.axis("off")
            plt.title("Topic #" + str(t))
            plt.savefig('plots/topic' + str(t))

    def corpus_info(self):
        """
        Summary information about the status of a corpus.
        """
        fids = len(self.corpus.fileids())
        paras = len(self.corpus.paras())
        sents = len(self.corpus.sents())
        sperp = sum(len(para) for para in self.corpus.paras()) / float(paras)
        tokens = FreqDist(self.corpus.words())
        count = sum(tokens.values())
        vocab = len(tokens)
        lexdiv = float(count) / float(vocab)

        print(
            ("Text corpus contains {} files\n"
             "Composed of {} paragraphs and {} sentences.\n"
             "{:0.3f} sentences per paragraph\n"
             "Word count of {} with a vocabulary of {}\n"
             "lexical diversity is {:0.3f}").format(fids, paras, sents, sperp,
                                                    count, vocab, lexdiv))
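
# A minimal usage sketch (all config values below are hypothetical; the real
# config object is built elsewhere in the project). Note that LDA() reads
# self.para_list, so clean_paragraphs should be 'yes':
#   from types import SimpleNamespace
#   cfg = SimpleNamespace(textpath='contracts/', clean_paragraphs='yes',
#                         clean_sentences='no', num_topics=10, num_words=10)
#   reader = Contract_Reader(cfg)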
Code Example #24
print(a)

from scipy.spatial.distance import cosine
print(cosine(dtm[0].toarray(),dtm[1].toarray()))


from sklearn.feature_extraction.text import TfidfVectorizer

tfid_vectors = TfidfVectorizer()
tfid_vectors = tfid_vectors.fit_transform([sent1,sent2])
print(pd.DataFrame(data = tfid_vectors.toarray()))
a1=pairwise_distances(tfid_vectors[0].toarray(),tfid_vectors[1].toarray(),metric='cosine')
print(a1)

print("________________Tf-idf corpus reader__________________________")

from nltk.corpus.reader.plaintext import PlaintextCorpusReader
path="./text_docs/"

president_corpus = PlaintextCorpusReader(path,".*",encoding="utf-8")
tfid_vectors_corpus = TfidfVectorizer(input='filename')
files= [path+filename for filename in list(president_corpus.fileids())]
tf_idf_matrix = tfid_vectors_corpus.fit_transform(raw_documents=files)
barack = tf_idf_matrix.toarray()[0]
bush = tf_idf_matrix.toarray()[1]
trump = tf_idf_matrix.toarray()[2]

print(cosine(barack,bush))
print(cosine(bush,trump))
print(cosine(trump,barack))
Code Example #25
import os
import word, pdf
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


def getText(txtFileName):
    file = open(txtFileName, 'r')
    return file.read()


# Create the corpus folder
newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):  # does the corpus folder already exist?
    os.mkdir(newCorpusDir)

# Read the files
# plain text file
txt1 = getText('./Files/sample_feed.txt')
# PDF file
txt2 = pdf.getTextPDF('./Files/sample-pdf.pdf')
# DOCX file
txt3 = word.getTextWord('./Files/sample-one-line.docx')

# Write the files
files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

# Build a custom corpus
# Read every file in the folder and create a corpus from those files
newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')

# Check that the custom corpus was built correctly
print(newCorpus.words())  # array containing every word in the corpus
print(newCorpus.sents(newCorpus.fileids()[1]))  # print the array of all sentences in 1.txt
print(newCorpus.paras(newCorpus.fileids()[0]))  # print the array of all paragraphs in 0.txt
Code Example #26
    # if punctuation marks occur, replace them with ""
    for x in string.lower():
        if x in punctuations:
            string = string.replace(x, "")
    return string


debug = None
big_paras = []

print_timestamp('\n' * 3 + 'End')
print_timestamp('\n' * 3 + 'Begin')

corpusdir = '..\\Thinkful\\Datafiles/UnsupervisedLearningCapstone\\fiction_corpus\\'
fiction_corpus = PlaintextCorpusReader(corpusdir, '.*.txt')
documents_stat = fiction_corpus.fileids()
if debug:
    print("documents_stat={} and is a {} datatype".format(
        documents_stat, type(documents_stat)))
documents_stat_0 = []
# documents_stat_0.append(documents_stat[0])

if debug:
    print("documents_stat_0 is a {} datatype".format(type(documents_stat)))
item_num = 0

book_block = []
word_counts = {}

for book in documents_stat:
    item_num += 1
Code Example #27
File: reader.py  Project: advaith2/Data
'''

from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
import string
import csv
from fileinput import filename

corpusdir = 'C:/Users/Advaith GVK/workspace/Trial/src/Pack/New folder' # Directory of corpus.

newcorpus = PlaintextCorpusReader(corpusdir, '.*')

filenames = newcorpus.fileids()
# print newcorpus.sents()

def getWordNetType(tag):
        #print tag
        if tag in ['JJ', 'JJR', 'JJS']:
            return wn.ADJ
        elif tag in ['NN', 'NNS', 'NNP', 'NNPS','POS','FW']:
            return wn.NOUN
        elif tag in ['RB', 'RBR', 'RBS','WRB']:
            return wn.ADV
        elif tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
            return wn.VERB
        return wn.NOUN
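# For example, getWordNetType('VBD') returns wn.VERB, and any unrecognised tag
# falls back to wn.NOUN.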

Code Example #28
import os
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import numpy as np
import operator
import os
import sys
#corpus of segments
reload(sys)
sys.setdefaultencoding('Cp1252')
corpusdir = '/home/abc/Desktop/adm/segments'
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
sortedall = sorted(newcorpus.fileids())
tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

doc_a = "/home/rashmi/Documents/adm_project/appended/0.txt"
doc_b = "/home/rashmi/Documents/adm_project/appended/1.txt"
doc_c = "/home/rashmi/Documents/adm_project/appended/2.txt"
doc_d = "/home/rashmi/Documents/adm_project/appended/3.txt"
doc_e = "/home/rashmi/Documents/adm_project/appended/4.txt"

# compile sample documents into a list
Code Example #29
	if (verbose and j%inc==0): print('Progress:',j,'/',jobs)
	return term_yn

# prepocess the text here
def preprocess(t):
	rem_chars = "[!\"#$%&()*+,:;<=>?@[\\]^_`{|}~0123456789]" # remove these
	rep_chars = "[-./\']" # replace these
	t_temp = re.sub(rem_chars, "", t.lower())
	t_temp = re.sub(rep_chars, " ", t_temp)
	t_strip_lower_filt = [w for w in t_temp.split() if not w in stopwords.words('english')]
	return " ".join(t_strip_lower_filt)

# load the data
corpusdir = 'corpus_txt/' # Directory of corpus.
mycorp_raw = PlaintextCorpusReader(corpusdir, '.*')
file_index = mycorp_raw.fileids()

# preprocess the text (slow)
# uncomment one of the following lines for usual vs parallel processing
#mycorp_proc = nltk.Text([preprocess(mycorp_raw.raw(f)) for f in file_index])
mycorp_proc = Parallel(n_jobs=3,verbose=True)(delayed(preprocess)(mycorp_raw.raw(f)) for f in file_index)


# get ngrams (1-3)
vectorizer_ngrams = CountVectorizer(min_df = 0.05, ngram_range=(1, 3))
mat_ngrams = vectorizer_ngrams.fit_transform(mycorp_proc)
n_df = pd.DataFrame(data = mat_ngrams.A, 
	columns = vectorizer_ngrams.get_feature_names())
n_df['pt_id'] = [i[:-4] for i in file_index]
# write results to file
n_df.to_csv('ngrams_dtm.csv', index = False)
Code Example #30
File: qwe.py  Project: dilip-dmk/vdsc
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import word_tokenize
import re


corpusdir = 'python/' # Directory of corpus.

newcorpus = PlaintextCorpusReader(corpusdir, '.*')
print(newcorpus.fileids()[0])
print(type(newcorpus))
#print newcorpus.raw()
print newcorpus.words(newcorpus.fileids()[0])
print(len(newcorpus.words()))

tokens = word_tokenize(newcorpus.raw())
#type(tokens)
print len(tokens)
print tokens[:50]
#tokens[:10]
print newcorpus.sents()
print

#to remove comments
def removeComments(string):
    string = re.sub(re.compile("/\*.*?\*/",re.DOTALL ) ,"" ,string) # remove all occurance streamed comments (/*COMMENT */) from string fdf
    string = re.sub(re.compile("//.*?\n" ) ,"" ,string) # remove all occurance singleline comments (//COMMENT\n ) from string
    return string
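# For example, removeComments("int x; // counter\n") returns "int x; ".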

print(removeComments(newcorpus.raw()))
Code Example #31
File: tf-idf.py  Project: elisa-lj11/tier2019
#         text_list.append(text)

# preprocessed_docs = []
# for n,t in enumerate(text_list):
#     # print sample of text before and after processing
#     #if n == (len(text_list) - 1):
#     #    print(("Doc {} (before preproc): {}").format(n, t))
#     #    print(("Doc {}: {}").format(n, p))
#     p = preprocess(t)
#     preprocessed_docs.append(p)

# print("Preprocessed docs len:", len(text_list))

texts = PlaintextCorpusReader(d, ".*\.txt")

boc_texts = [extract(texts.raw(fileid)) for fileid in texts.fileids()]

dictionary = gensim.corpora.Dictionary(boc_texts)
#dictionary = gensim.corpora.Dictionary(preprocessed_docs)
#dictionary.filter_extremes(no_below=10,no_above=.5,keep_n=100000)
#bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]
bow_corpus = [dictionary.doc2bow(boc_text) for boc_text in boc_texts]
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

fileids = texts.fileids()

for idx, doc in enumerate(corpus_tfidf):
    new_file.write("Document '{}' key phrases:\n".format(fileids[idx]))
    # Get top 100 terms by TF-IDF score
    for wid, score in heapq.nlargest(100, doc, key=itemgetter(1)):
Code Example #32
CORPUS_EXTENSION =r'.*\.txt'

import nltk
import os
from os import listdir
from os.path import isfile, join
from nltk.collocations import *
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# read in corpus, find all the 3-grams above the min frequency
print "Reading in corpus from", CORPUS_ROOT
my_corpus = PlaintextCorpusReader(CORPUS_ROOT, CORPUS_EXTENSION)
print "Read in " + str(len(my_corpus.fileids())) + " files"
print "Finding 3-grams"
finder_3gram = TrigramCollocationFinder.from_words(my_corpus.words())
print "Filtering out 3-grams of frequency less than", MIN_FREQUENCY
finder_3gram.apply_freq_filter(MIN_FREQUENCY)

# combine all the 3-grams meeting the PMI threshold
print "Looking for 3-grams with a PMI of at least ", MIN_3GRAM_PMI
filelist = [ join(CORPUS_ROOT,f) for f in listdir(CORPUS_ROOT) if isfile(join(CORPUS_ROOT,f)) ]
gen = finder_3gram.above_score(trigram_measures.pmi, MIN_3GRAM_PMI)
processGrams(gen, filelist)

# now let's do the same for the 2-grams
# our previous step altered the corpus so let's read it in again
print "Reading in corpus from", CORPUS_ROOT
my_corpus = PlaintextCorpusReader(CORPUS_ROOT, CORPUS_EXTENSION)
Code Example #33
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

#Loading the file you want to Train
corpusdir = r'E:\MTech' # Directory of corpus.

newcorpus = PlaintextCorpusReader(corpusdir, '.*')
print(newcorpus.fileids())
dictlist=[]

#Converting from word/tag pairs to a list of (word, tag) tuples, i.e. [(word1,tag),(word2,tag)]
for i in newcorpus.fileids():
    tagged_sent=newcorpus.raw(i)
    tagged=tagged_sent.split()
    for t in tagged:
        temp1=nltk.tag.str2tuple(t)
        dictlist.append(temp1)
print(dictlist)
print("This is the length of distinct words")
print(len(set(dictlist)))
fdist=nltk.FreqDist(dictlist)
print("fdist items")
print(fdist.items())
print(fdist.max())




rawtext = '''
 '''
Code Example #34
# Additional imports assumed by this snippet
import nltk
from nltk import download
from nltk.corpus import stopwords
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('wordnet')
download('punkt')
download('stopwords')

from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

############################  Read Data  #############################################

# if you run this code with IPython, enter the path to your own text folder instead of './txts'
corpus_directory = './txts'
textsfile = PlaintextCorpusReader(corpus_directory, '.*')
ID_files = textsfile.fileids()
print(ID_files, len(ID_files))

##############################  Preprossesing Data  ######################################

stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
Index_of_files = []
texts = []
count = 0

#file with file_ids
for fileid in ID_files:
    texts.append(textsfile.raw(fileids=fileid))
    Index_of_files.append(fileid)
Code Example #35
File: tag_senses_bootstrap.py  Project: statguy/wsd
if len(sys.argv) != 4:
  print "Usage:", sys.argv[0], "word sense1 sense2"
  exit(-1)

focal_word = sys.argv[1]
senses = [sys.argv[2], sys.argv[3]]
#focal_word = "plant"
#senses = ["manufacturing","life"]
corpus = PlaintextCorpusReader('outcorpus/', '.*')
collocations = [ wsd.BigramLeft(senses, 0), wsd.BigramRight(senses, 1), wsd.BigramScope(senses, 2, [2, 10]) ]
decision_list = wsd.DecisionList()
decision_list.load("senses_bootstrap_" + focal_word + ".csv")    

i = 0
for infile in sorted(corpus.fileids()):
  print i, "/", len(corpus.fileids())
  i += 1
  
  words = corpus.words(infile)
  text = Text(words)
  c = nltk.ConcordanceIndex(text.tokens)
  offsets = c.offsets(focal_word)
  
  for offset in offsets:
    for collocation in collocations:
      tokens = collocation.get_collocation(text, offset)
      if tokens is None: continue
      sense = decision_list.get_sense(tokens, collocation.index)
      if sense is None: continue
      collocation.add_collocation(text, offset, sense)
Code Example #36
File: NMF.py  Project: shirishruu/Tweet-analysis
import glob
import os
import string
import nltk
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from scipy.cluster.hierarchy import ward, dendrogram
from sklearn.manifold import MDS
import matplotlib.pyplot as plt
import scipy.stats as stats

names = []
corpus = []

co = PlaintextCorpusReader("./election", ".*\.txt")

for fileids in co.fileids():
    names.append(fileids)
    corpus.append(co.raw(fileids))

print len(names), 'documents in the corpus'
print names[:30]

for idx in range(len(corpus) - 1, -1, -1):
    print
    print names[idx]
    print corpus[idx][:70].replace('\n', ' ')

vectorizer = TfidfVectorizer(stop_words='english', min_df=2)
dtm = vectorizer.fit_transform(corpus)
print dtm.shape
vocab = vectorizer.get_feature_names(
Code Example #37
File: to_matlab.py  Project: Vivaq/scripts
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from decimal import Decimal
from math import pi

if __name__ == '__main__':
    ptcr = PlaintextCorpusReader(r'C:\Users\Jakub\Downloads\pr4\Trzeci plik', ['znormalizowane.txt', 'katy.txt'])
    data = []
    t = ptcr.raw(fileids=ptcr.fileids()[1]).replace(',', '.').replace('\r', '').split('\n')
    t.remove('')
    for x in t:
        data.append(float(Decimal(x)*360/315))
    print data
    data_ = []
    t = ptcr.raw(fileids=ptcr.fileids()[0]).replace(',', '.').replace('\r', '').split('\n')
    t.remove('')
    for x in t:
        data_.append(float(x)/100)
    print data_
Code Example #38
    with open(corpusdir + str(filename) + '.txt', 'w') as fout:
        print >> fout, text

# Check that our corpus does exist and the files are correct.
assert os.path.isdir(corpusdir)
for infile, text in zip(sorted(os.listdir(corpusdir)), corpus):
    assert open(corpusdir + infile, 'r').read().strip() == text.strip()

# Create a new corpus by specifying the parameters
# (1) directory of the new corpus
# (2) the fileids of the corpus
# NOTE: in this case the fileids are simply the filenames.
newcorpus = PlaintextCorpusReader('newcorpus/', '.*')

# Access each file in the corpus.
for infile in sorted(newcorpus.fileids()):
    print infile  # The fileids of each file.
    with newcorpus.open(infile) as fin:  # Opens the file.
        print fin.read().strip()  # Prints the content of the file
print

# Access the plaintext; outputs pure string/basestring.
print newcorpus.raw().strip()
print

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
#       nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph, and
# Each paragraph contains sentence(s), and
Code Example #39
# 150 common_words x 6 fileids x sentences(937 total) x words + append/join sentence into string + 6x search fileids
from nltk.tokenize.treebank import TreebankWordDetokenizer

common_words = dict(freq.most_common(150))
print(common_words)

data = []
hmap = {}
detokenized = {}

for word, frequency in common_words.items():
    datum = {'word': word, 'frequency': frequency}
    docs = []
    sents = []

    for key, fileid in enumerate(corpus.fileids()):
        if key not in hmap:
            hmap[key] = {}

        for s_id, sentence in enumerate(corpus.sents(fileid)):
            if key in hmap and s_id in hmap[key]:
                words = hmap[key][s_id]
            else:
                words = [lemmatizer.lemmatize(w.lower()) for w in sentence]
                hmap[key][s_id] = words

            if word in words:
                s_key = f'{key}-{s_id}'
                sent = ''

                if s_key in detokenized:
Code Example #40
import os
import word, pdf
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


def getText(txtFileName):
    file = open(txtFileName, 'r')
    return file.read()


newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):
    os.mkdir(newCorpusDir)

txt1 = getText('sample_feed.txt')
txt2 = pdf.getTextPDF('sample-pdf.pdf')
txt3 = word.getTextWord('sample-one-line.docx')

files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')

print(newCorpus.words())
print(newCorpus.sents(newCorpus.fileids()[1]))
print(newCorpus.paras(newCorpus.fileids()[0]))
Code Example #41
    boc_texts = [
        extract(texts.raw(fileid)) for fileid in texts.fileids()
    ]

    # make gensim dictionary and corpus
    dictionary = gensim.corpora.Dictionary(boc_texts)
    corpus = [dictionary.doc2bow(boc_text) for boc_text in boc_texts]

    # transform corpus with tf*idf model
    tfidf = gensim.models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    return corpus_tfidf, dictionary

# Can change this to output just keywords by commenting out 1st, 2nd, and 4th "new_file.write" lines
if __name__ == '__main__':
    tfidfs, id2word = score_keyphrases_by_tfidf(texts)#, 'words')
    fileids = texts.fileids()

    # Print top keywords by TF-IDF
    for idx, doc in enumerate(tfidfs):
        new_file.write("Document '{}' key phrases:\n".format(fileids[idx]))
        # Get top 10 terms by TF-IDF score
        for wid, score in heapq.nlargest(keyphrase_num, doc, key=itemgetter(1)):
            new_file.write("{:0.3f}: {}\n".format(score, id2word[wid]))
            #new_file.write("{}\n".format(id2word[wid]))

        new_file.write("\n")

print("Done! Look for {} in the 'Classifier' directory".format(new_file_name))
Code Example #42
def create_content (gdocs,graphicsdir,gcontent):
    for file in gdocs:
        with open(graphicsdir + '/' + str(file), 'r') as fh:
            gcontent.append(fh.read())


# defining the directory path for each category
graphicsdir,autosdir,gunsdir = '20news-bydate/train/comp.graphics','20news-bydate/train/rec.autos','20news-bydate/train/talk.politics.guns'
graphicstest,autostest,gunstest = '20news-bydate/test/comp.graphics','20news-bydate/test/rec.autos','20news-bydate/test/talk.politics.guns'
graphicscorpus,autoscorpus,gunscorpus = PlaintextCorpusReader(graphicsdir, '.*'),PlaintextCorpusReader(autosdir, '.*'),PlaintextCorpusReader(gunsdir, '.*')
graphicscorpustest,autoscorpustest,gunscorpustest = PlaintextCorpusReader(graphicstest, '.*'),PlaintextCorpusReader(autostest, '.*'),PlaintextCorpusReader(gunstest, '.*')

# initializing the lists
gdocs,adocs,ndocs,gcontent,acontent,ncontent,gwords,awords,nwords,vocab = [],[],[],[],[],[],[],[],[],[]
gtdocs,atdocs,ntdocs,gtcontent,atcontent,ntcontent,gtwords,atwords,ntwords,vtocab = [],[],[],[],[],[],[],[],[],[]
# for train dataset
gdocs.extend(graphicscorpus.fileids()) # for graphics category
adocs.extend(autoscorpus.fileids()) # for autos category
ndocs.extend(gunscorpus.fileids()) # for guns category
# for test dataset
gtdocs.extend(graphicscorpustest.fileids()) # for graphics category
atdocs.extend(autoscorpustest.fileids()) # for autos category
ntdocs.extend(gunscorpustest.fileids()) # for guns category
# retrieving the words for each category
# for train dataset
create_content(gdocs,graphicsdir,gcontent)
create_content(adocs,autosdir,acontent)
create_content(ndocs,gunsdir,ncontent)
# for test dataset
create_content(gtdocs,graphicstest,gtcontent)
create_content(atdocs,autostest,atcontent)
create_content(ntdocs,gunstest,ntcontent)