Example 1
 def __init__(self, *args, **kwargs):
     if 'element_class' in kwargs:
         self.element_class = kwargs['element_class']
         del kwargs['element_class']
     else:
         self.element_class = Essay
     CategorizedPlaintextCorpusReader.__init__(self, *args, **kwargs)
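A minimal usage sketch for this pattern, assuming a hypothetical Essay element class and corpus layout (class names, paths and patterns below are assumptions, not from the snippet):

from nltk.corpus.reader import CategorizedPlaintextCorpusReader

class Essay(object):
    """Hypothetical placeholder for the Essay element class used above."""
    pass

class EssayCorpusReader(CategorizedPlaintextCorpusReader):
    def __init__(self, *args, **kwargs):
        # Pop the custom keyword before delegating, as in the snippet above.
        self.element_class = kwargs.pop('element_class', Essay)
        CategorizedPlaintextCorpusReader.__init__(self, *args, **kwargs)

# reader = EssayCorpusReader('corpus_root', r'.*\.txt',
#                            cat_pattern=r'(\w+)/.*', element_class=Essay)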
Example 2
def main():
    articles = CategorizedPlaintextCorpusReader(corpusdir, '.*', cat_pattern = r'(.*)[/]')
    feats = {}
    trainfeats = []
    testfeats = []
    for cat in articles.categories():
        wow = len([f for f in articles.fileids(cat)]) # such variable name
        print "for category", cat, ":", wow
        feats[cat] = [(word_feats(articles.words(fileids = [f])), cat) for f in articles.fileids(cat)]
        cutoff = wow - hold_back(wow)
        trainfeats.append(feats[cat][:cutoff])
        testfeats.append(feats[cat][cutoff:])

    train = [item for sublist in trainfeats for item in sublist]
    test = [item for sublist in testfeats for item in sublist]

    print 'train on %d instances, test on %d instances' % (len(train), len(test))

    classifier = NaiveBayesClassifier.train(train)
    print 'accuracy:', nltk.classify.util.accuracy(classifier, test)
    classifier.show_most_informative_features() # I don't understand the output for more than 2 categories :(

    # load with:
    # import pickle
    # f = open('my_classifier.pickle', 'rb')
    # classifier = pickle.load(f)
    # f.close()
    with open('../data/classifier.pickle', 'wb') as f:
        pickle.dump(classifier, f)
Example 3
 def create_categorized_corpus(self, categories_directory):
     boolean_list = []
     boolean_for_categories_test = ''
     reader = CategorizedPlaintextCorpusReader(categories_directory, r'\.txt.*wordtype_(\w+)', cat_pattern=r'\.txt.*wordtype_(\w+)')
     for category in reader.categories():
         boolean_list.append(category != '') 
     if False in boolean_list:
         boolean_for_categories_test = False
     else:
         boolean_for_categories_test = True
     return reader, boolean_for_categories_test
 def __init__(self, rootLocation = config.POLARITY_DATASET,reader=None):
     super(PolarityDataReader, self).__init__()
     if reader == None:
         self.reader = Reader(rootLocation,r'.*/.*', cat_pattern=r'(.*)/.*')
     else:
         self.reader = reader
     self.setStopWords()
     self.documents = None;
     self.terms = None;
Example 5
 def __init__(self, rootLocation=config.POLARITY_DATASET, reader=None):
     super(PolarityDataReader, self).__init__()
     if reader == None:
         self.reader = Reader(rootLocation,
                              r'.*/.*',
                              cat_pattern=r'(.*)/.*')
     else:
         self.reader = reader
     self.setStopWords()
     self.documents = None
     self.terms = None
Example 6
import nltk, random, string, os
from nltk.collocations import *
from nltk.corpus.reader import CategorizedPlaintextCorpusReader 
from nltk.corpus import stopwords



bigram_measures = nltk.collocations.BigramAssocMeasures()
#print reader.categories()
for name in os.listdir("."):
	if os.path.isdir(name):
		reader = CategorizedPlaintextCorpusReader(name, r'.*\.txt', cat_pattern=r'(\w+)/*')
	#	reader = CategorizedPlaintextCorpusReader(name, r'./raw_reviews/\.txt', cat_pattern=r'(\w+)/*')

		print reader.fileids()
		table = string.maketrans("","")
		stopwords = nltk.corpus.stopwords.words('english')
		filtered_words = [w for w in reader.words() if not w in stopwords]
		filtered_words_nopunc = [w for w in filtered_words if not w in string.punctuation]
		#all_words = nltk.FreqDist(w.lower() for w in filtered_words_nopunc)
		finder = BigramCollocationFinder.from_words(filtered_words_nopunc)
		#scored = finder.score_ngrams(bigram_measures.raw_freq)
		#a = sorted(bigram for bigram, score in scored) 
		 	
		finder.apply_freq_filter(3)
		a = finder.nbest(bigram_measures.pmi, 5) 
		#b = finder.score_ngrams(bigram_measures.pmi)
		print a

Example 7
categ_dict = {
    'test_1.txt': 'Regulatory Update',
    'test_2.txt': 'Press Release',
    'test_3.txt': 'Regulatory Update',
    'test_4.txt': 'Regulatory Update',
    'test_5.txt': 'Stock Update',
    'test_6.txt': 'Press Release',
    'test_7.txt': 'Market Opinion'
}
art_i = []
class_i = []
#Conversion of Train Data into Single Input File
corpus_root = 'Train_set'

newcorpus = CategorizedPlaintextCorpusReader(corpus_root,
                                             r'.*\.txt',
                                             cat_pattern=r'(\w+)/*')

myfile = open('Input_Article_Data.csv', 'wb')
wr = csv.writer(myfile, quoting=csv.QUOTE_ALL, lineterminator="\n")

for category in newcorpus.categories():
    for fileid in newcorpus.fileids(category):
        #print fileid,category
        data1 = (newcorpus.raw(fileid).encode('utf-8')).replace(",", " ")
        data_list = [data1, category]
        wr.writerow(data_list)

myfile.close()

#Reading of Train Data as Lists
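The excerpt stops before the read-back step it announces; a minimal sketch, assuming the Input_Article_Data.csv written above and the art_i/class_i lists declared earlier:

import csv

# Sketch (assumption): read the CSV written above back into parallel lists.
with open('Input_Article_Data.csv', 'rb') as infile:
    for row in csv.reader(infile):
        art_i.append(row[0])    # article text
        class_i.append(row[1])  # category label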
Example 8
    #print(bigrams)
    cfd = nltk.ConditionalFreqDist(bigrams)
    print(cfd[word])
    print(generate_model(cfd, word))


def generate_model(cfdist, word, num=15):
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()


# 1. Build a categorized text corpus
locPT = 'ch02/ES'
corpusPT = CategorizedPlaintextCorpusReader(locPT,
                                            '.*\.txt',
                                            cat_file="cat.txt")

print(corpusPT.fileids())
print(corpusPT.categories())
print(corpusPT.words(categories='ciencia'))
#print(corpusPT.raw())

vocab = set(w.lower() for w in corpusPT.words())
print('Tamanho Vocabulario:', len(vocab))
corpusCom = corpusPT.raw()
corpusComList = corpusCom.split()
print('Tamanho Total de palabras:', len(corpusComList))

# 2. Compute simple statistical measures
'''
data = get_data()
print(len(data))
evrth, maindict = tags_assignment(data)

# Save new final dictionary as well as the mapping for categories-numbers
listingssss = json.dumps(evrth)
with open("FinalCleanJuly1.json", "w") as f:
    f.write(listingssss)
dictionaries = json.dumps(maindict)
with open("CorpusCatMapJuly1.json", "w") as f:
    f.write(dictionaries)

#### This is IMPORTANT - CHOOSE ! ##### default is key2
#### Choose the label you want to have for naming!
### two options:
### 1) key1 with format: docID + _(i) where i numerated number of category e.g. -doc-_cr14021.txt
### 2) key2 with format country name + year + _(i) e.g. Albania2015_1.txt
### if you want to change--> line 90: "key2: taglist" to key1
### line 121: filename=evrth[i]['key2'] to key1
create_corpus(evrth)

#### Check if working
reader = CategorizedPlaintextCorpusReader('corpusCategory/',
                                          r'\w+\d+_.*\.txt',
                                          cat_map=maindict)
print(reader.categories())  #print all categories in a list
print(reader.fileids(categories=['Fiscal']))  #check docIDs in fiscal category

#Good reference - https://www.packtpub.com/books/content/python-text-processing-nltk-20-creating-custom-corpora
#They have options for creating chunked (by words, sentences, paragraphs and even customized paragraphs) corpora, tagged corpora etc
Example 10
from nltk.tokenize import LineTokenizer, RegexpTokenizer
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from nltk.stem import SnowballStemmer
from processor import Processor as Proc

data_folder = './data'
encoding = 'UTF8'
language = 'italian'

wordTok = RegexpTokenizer(
    r'(\w+|@\w+|<3|(\:\-?\))|(\:\-?\()|(\;\-?\))|((\:|(X|x))\-?(D|d)))')
sentTok = LineTokenizer()
reader = CategorizedPlaintextCorpusReader(data_folder,
                                          r'SENTIPOLC-.*\.txt',
                                          cat_pattern=r'SENTIPOLC-(\w+)\.txt',
                                          encoding=encoding,
                                          word_tokenizer=wordTok,
                                          sent_tokenizer=sentTok)

pos_tweets = reader.sents(reader.fileids('pos'))
neg_tweets = reader.sents(reader.fileids('neg'))

# Inspection
rndP = random.randrange(len(pos_tweets))
rndN = random.randrange(len(neg_tweets))
print 'Pos:\n', pos_tweets[rndP:rndP + 3], '\nNeg:\n', neg_tweets[rndN:rndN +
                                                                  3], '\n'

# All lowercase
pos_tweets = Proc.lowerize(pos_tweets)
neg_tweets = Proc.lowerize(neg_tweets)
            output = open('airline_review\\pos\\' + \
                          filename + '.txt', 'w')
            output.write(row['reviewcontent'])
            output.close()
            count_pos += 1
            
        elif row['rating_overall'] in ("1", "2", "3", "4", "5") and count_neg < 15000:
            output = open('airline_review\\neg\\' + \
                          filename + '.txt', 'w')
            output.write(row['reviewcontent'])
            output.close()
            count_neg += 1
        

os.chdir("E:/Documents/GSU/Python Development/Unstructured Data/Team Project/machine_learning_text_analysis")
reader = CategorizedPlaintextCorpusReader('./airline_review', r'.*\.txt',
                                          cat_pattern = r'(\w+)/*') # file name format

# Positive reviews file ids
pos_ids = reader.fileids('pos')

# Negative reviews file ids
neg_ids = reader.fileids('neg')

'''Generating word feature list'''
def word_feats(words):
    return dict([(word, True) for word in words])


'''Building positive and negative feature lists. Each 
item is the positive/negative word features for a review file'''
pos_feat = [(word_feats(reader.words(fileids = f)), 'pos')
            for f in pos_ids]
Example 12
from nltk.corpus import stopwords
from nltk_trainer.classification.featx import bag_of_words
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
import collections
import pickle

from nltk.corpus.reader import CategorizedPlaintextCorpusReader
reader = CategorizedPlaintextCorpusReader(
    '/home/arjun/nltk_data/health/diabetes',
    r'health.*?[0-9]+.txt',
    cat_pattern=r'health(.*?)[0-9]+.txt')

#print reader.categories()


#takes a corpus .. creates labelled feature sets
def label_feats_from_corpus(corp, feature_detector=bag_of_words):
    label_feats = collections.defaultdict(list)
    for label in corp.categories():
        for fileid in corp.fileids(categories=[label]):
            feats = feature_detector(corp.words(fileids=[fileid]))
            label_feats[label].append(feats)
    return label_feats


#creates test and train features
def split_label_feats(lfeats, split=0.75):
    train_feats = []
    test_feats = []
    for label, feats in lfeats.items():
Example 13
def process_plaintext(dir_path):
    reader = CategorizedPlaintextCorpusReader(dir_path,
                    r'.*\.txt', cat_pattern=r'.+_.+_(.*)\.txt')
    facilitator_files = reader.fileids(categories='facilitator')
    participant_files = reader.fileids(categories='participant')
    print facilitator_files, participant_files

    #print reader.categories()
    #print len(reader.words())
    #print len(reader.sents())

    fac_words = [word for word in reader.words(facilitator_files)]
    par_words = [word for word in reader.words(participant_files)]

    fac_words = edit_tokens(fac_words)
    par_words = edit_tokens(par_words)

    speakers = (
        [(word, 'facilitator') for word in reader.words(facilitator_files)] +
        [(word, 'participant') for word in reader.words(participant_files)]
    )

    features = get_features(speakers)

    size = int(len(features) * 0.3)
    nb_train = features[size:]
    nb_test = features[:size]

    classifier = nltk.NaiveBayesClassifier.train(nb_train)
    print "Classifier labels:", classifier.labels()
    print classifier.show_most_informative_features()
    print "Clasify test:", nltk.classify.accuracy(classifier, nb_test)
    #print classifier.classify(get_features(["Yolo", "bag", "sp"], False))
    
    #random.shuffle(speakers)
    three_quarters = int(len(speakers) * 0.75)
    train = speakers[:three_quarters]
    test = speakers[three_quarters:]

    est = lambda fdist, bins: nltk.probability.LaplaceProbDist(fdist)
    un_lm = NgramModel(1, train, estimator=est)
    bi_lm = NgramModel(2, train, estimator=est)
    tr_lm = NgramModel(3, train, estimator=est)
    qu_lm = NgramModel(4, train, estimator=est)
    pe_lm = NgramModel(5, train, estimator=est)
    print un_lm
    print bi_lm
    print tr_lm
    print qu_lm
    print pe_lm
    print "1 gram Perplexity:", un_lm.perplexity(test)
    print "2 gram Perplexity:", bi_lm.perplexity(test)
    print "3 gram Perplexity:", tr_lm.perplexity(test)
    print "4 gram Perplexity:", qu_lm.perplexity(test)
    print "5 gram Perplexity:", pe_lm.perplexity(test)

    print bi_lm.generate(10, ["uh", "sp"])

    fd_fac = nltk.FreqDist(fac_words)
    vocab_fac = fd_fac.keys()

    fd_par = nltk.FreqDist(par_words)
    vocab_par = fd_par.keys()

    print "Fac Vocab: " , len(vocab_fac)
    print "Fac Tokens: " , len(fac_words)
    print vocab_fac[:20]
    print "Par Vocab: " , len(vocab_par)
    print "Par Tokens: " , len(par_words)
    print vocab_par[:20]
    fd_par.plot(50)
Example 14
def process_plaintext(dir_path):
    reader = CategorizedPlaintextCorpusReader(dir_path,
                                              r'.*\.txt',
                                              cat_pattern=r'.+_.+_(.*)\.txt')
    facilitator_files = reader.fileids(categories='facilitator')
    participant_files = reader.fileids(categories='participant')
    print facilitator_files, participant_files

    #print reader.categories()
    #print len(reader.words())
    #print len(reader.sents())

    fac_words = [word for word in reader.words(facilitator_files)]
    par_words = [word for word in reader.words(participant_files)]

    fac_words = edit_tokens(fac_words)
    par_words = edit_tokens(par_words)

    speakers = ([(word, 'facilitator')
                 for word in reader.words(facilitator_files)] +
                [(word, 'participant')
                 for word in reader.words(participant_files)])

    features = get_features(speakers)

    size = int(len(features) * 0.3)
    nb_train = features[size:]
    nb_test = features[:size]

    classifier = nltk.NaiveBayesClassifier.train(nb_train)
    print "Classifier labels:", classifier.labels()
    print classifier.show_most_informative_features()
    print "Clasify test:", nltk.classify.accuracy(classifier, nb_test)
    #print classifier.classify(get_features(["Yolo", "bag", "sp"], False))

    #random.shuffle(speakers)
    three_quarters = int(len(speakers) * 0.75)
    train = speakers[:three_quarters]
    test = speakers[three_quarters:]

    est = lambda fdist, bins: nltk.probability.LaplaceProbDist(fdist)
    un_lm = NgramModel(1, train, estimator=est)
    bi_lm = NgramModel(2, train, estimator=est)
    tr_lm = NgramModel(3, train, estimator=est)
    qu_lm = NgramModel(4, train, estimator=est)
    pe_lm = NgramModel(5, train, estimator=est)
    print un_lm
    print bi_lm
    print tr_lm
    print qu_lm
    print pe_lm
    print "1 gram Perplexity:", un_lm.perplexity(test)
    print "2 gram Perplexity:", bi_lm.perplexity(test)
    print "3 gram Perplexity:", tr_lm.perplexity(test)
    print "4 gram Perplexity:", qu_lm.perplexity(test)
    print "5 gram Perplexity:", pe_lm.perplexity(test)

    print bi_lm.generate(10, ["uh", "sp"])

    fd_fac = nltk.FreqDist(fac_words)
    vocab_fac = fd_fac.keys()

    fd_par = nltk.FreqDist(par_words)
    vocab_par = fd_par.keys()

    print "Fac Vocab: ", len(vocab_fac)
    print "Fac Tokens: ", len(fac_words)
    print vocab_fac[:20]
    print "Par Vocab: ", len(vocab_par)
    print "Par Tokens: ", len(par_words)
    print vocab_par[:20]
    fd_par.plot(50)
Example 15
        features['contains({})'.format(bigram)] = (bigram in article_bigrams)

    article_words = set(article_words)
    for word in word_features:
        features['contains({})'.format(word)] = (word in article_words)
    return features


if __name__ == '__main__':
    #set up path to data
    data_folder_name = sys.argv[1]
    data_path = os.path.join(os.getcwd(), '', data_folder_name)

    #make article object to read in files
    article = CategorizedPlaintextCorpusReader(data_path,
                                               r'.*\.*\.txt',
                                               cat_pattern=r'(\w+).*\.txt')

    #make list of all articles with labels based on what folder the file is in
    all_articles = []
    for category in article.categories():
        for fileid in article.fileids(category):
            #lowercases words and takes out stopwords
            process = list(
                w.lower() for w in list(article.words(fileid))
                if w.isalpha() and w not in stopwords.words('english'))
            entry = [process, category]
            all_articles.append(entry)

    random.shuffle(all_articles)
Example 16
                 i,
                 cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')


if __name__ == '__main__':
    print("\nStarting the Classifier. First, let's set everything up.")
    traincorpus_root = raw_input(
        "Please specify the location of the training data: ")
    # traincorpus_root = '/Users/taniamaldonado/PycharmProjects/corpora/humanin/train4'
    traincorpus = CategorizedPlaintextCorpusReader(
        traincorpus_root, r".*_.*\.txt", cat_pattern=r'(\w+)_.*\.txt')

    testcorpus_root = raw_input(
        "Please specify the location of the test data: ")
    # testcorpus_root = '/Users/taniamaldonado/PycharmProjects/corpora/humanin/test'
    testcorpus = CategorizedPlaintextCorpusReader(testcorpus_root,
                                                  r".*_.*\.txt",
                                                  cat_pattern=r'(\w+)_.*\.txt')

    try:
        traindata, testdata = datainput(traincorpus, testcorpus)
    except NameError:
        print "The training/test corpus is not defined, please check if the location is correct."

    print("\nPlease choose a classification algorithm:")
    print("1. Multinomial Naive Bayes")
Example 17
########## CATEGORIZED CORPUS READER ###############

from nltk.corpus import brown
print brown.categories()
print brown.tagged_sents(categories=['news'])


from nltk.corpus.reader import CategorizedPlaintextCorpusReader as CPCR
root="C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"

#Using cat_pattern
reader=CPCR(root
            ,r'movie_.*\.txt'
            ,cat_pattern=r'movie_(\w+)\.txt')
print reader.categories()
print reader.fileids(categories=['neg'])
print reader.fileids(categories=['pos'])

#Using cat_map: a dictionary mapping a ""fileid arg"" to a ""list of  category labels""
reader=CPCR(root
            ,r'movie_.*\.txt'
            ,cat_map={'movie_pos.txt':['pos'],'movie_neg.txt':['neg']})
print reader.categories()

#Using cat file: it is a file containing mapping of fileid to category i.e. cats.txt
#for more details refer brown corpus folder.
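A short sketch of the cat_file variant described above; the cats.txt name and its one-fileid-per-line contents are assumptions modelled on the brown corpus layout:

#Using cat_file (assumed contents of cats.txt):
#   movie_pos.txt pos
#   movie_neg.txt neg
reader=CPCR(root
            ,r'movie_.*\.txt'
            ,cat_file='cats.txt')
print reader.categories()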
from textblob.classifiers import NaiveBayesClassifier
from nltk.corpus.reader import PlaintextCorpusReader, CategorizedPlaintextCorpusReader
from nltk.corpus import movie_reviews
import nltk
import random
from BeautifulSoup import BeautifulSoup

p = nltk.data.find('corpora/SecurityThreat-MaxEnt')
reader = CategorizedPlaintextCorpusReader(p,
                                          r'.*\.txt',
                                          cat_pattern=r'(\w+)/*')
from nltk import WordNetLemmatizer

#Using Wordnet Lemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
all_words = nltk.FreqDist(word for word in reader.words())
top_words = list(all_words)[:100]
print(top_words)


def word_feats(words):
    return {word: True for word in words if word in top_words}


#def word_feats(words):
#return dict([(wordnet_lemmatizer.lemmatize(word), True) for word in words])

# Generate all the files based on ThreatType.
IdentityThreat = reader.fileids('IdentityThreat')
InsiderThreat = reader.fileids('InsiderThreat')
Malware = reader.fileids('Malware')
Example 19
def fetch_news(dir):
    base = 'http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/{}/rss.xml'

    for category in ['world', 'technology']:
        rss = fp.parse(base.format(category))

        for i, entry in enumerate(rss.entries):
            fname = '{0}_bbc_{1}.txt'.format(i, category)
            fname = os.path.join(dir, fname)

            if not dl.conf.file_exists(fname):
                store_txt(entry.link, fname, entry.title)


if __name__ == "__main__":
    dir = os.path.join(dl.data.get_data_dir(), 'bbc_news_corpus')

    if not os.path.exists(dir):
        os.mkdir(dir)

    fetch_news(dir)
    reader = CategorizedPlaintextCorpusReader(dir,
                                              r'.*bbc.*\.txt',
                                              cat_pattern=r'.*bbc_(\w+)\.txt')
    printer = dl.log_api.Printer(nelems=3)
    printer.print('Categories', reader.categories())
    printer.print('World fileids', reader.fileids(categories=['world']))
    printer.print('Technology fileids',
                  reader.fileids(categories=['technology']))
Example 20
 def __init__(self, *args, **kwargs):        
     CategorizedPlaintextCorpusReader.__init__(self, *args, **kwargs)
Example 21
import sys

if (len(sys.argv) != 5):
    print(
        'Usage: Pass Arguments for Input PDF path, Category-File Mapping Path, Input Links File and Output Sub-Category Update File'
    )
    sys.exit(1)

print("Input PDFs File Path " + sys.argv[1])
print("Category File Name and Path " + sys.argv[2])
print("Input Links File Name & Path is " + sys.argv[3])
print("SubCategory Update File is " + sys.argv[4])

reader = CategorizedPlaintextCorpusReader(sys.argv[1],
                                          r'.*\.txt',
                                          cat_file=sys.argv[2],
                                          cat_delimiter='|')

# Access each file in the corpus.
#for infile in sorted(reader.fileids()):
#    print (infile) # The fileids of each file.
#    #file = reader.open(infile)
#    #print (file.read().strip()) # Prints the content of the file

#print(reader.fileids())

#print(reader.fileids(categories=['General']))
#print(reader.categories())

#print(reader.categories())
Example 22
    return BOW


"""
# CANADA emotion lexicon resource
emotions_dict = pd.read_csv("emolex.csv")
emotions_dict = emotions_dict.set_index('Spanish (es)')

# SEL emotion lexicon resource
sel_emotions_dict = pd.read_csv("SEL_full.txt", sep='\t', encoding = "ISO-8859-1")
sel_emotions_dict = sel_emotions_dict.set_index('Palabra')
"""

# Read the tweet corpus
reader = CategorizedPlaintextCorpusReader('./',
                                          r'mex.*\.txt',
                                          cat_pattern=r'(\w+)/*')

tweets_train = reader.raw('mex_train.txt').split('\n')[:-1]
labels_train = reader.raw('mex_train_labels.txt').split('\n')[:-1]
labels_train = list(map(int, labels_train))

tweets_val = reader.raw('mex_val.txt').split('\n')[:-1]
labels_val = reader.raw('mex_val_labels.txt').split('\n')[:-1]
labels_val = list(map(int, labels_val))

tweets_test = reader.raw('mex_test.txt').split('\n')[:-1]
"""
corpus_palabras = []
for doc in tweets_train:
    corpus_palabras += doc.split()
Example 23
###############################################
def getDirnames( path ) :
  dirList = []
  for f in os.listdir( path ) :
    if not os.path.isfile( path ) :
      if not f == ".DS_Store" :
        dirList.append(f)
  return dirList

###############################################
###############################################

#################
# TRAINING DATA #
#################
train_reader = CategorizedPlaintextCorpusReader('./training_data', r'.*\_.*\.txt', cat_pattern=r'.*\_(\w+)\.txt')
train_documents = [(list(train_reader.words(fileid)), category)
                   for category in train_reader.categories()
                   for fileid in train_reader.fileids(category)]
random.shuffle(train_documents)
#print train_documents

train_documents_clean = []
for i in train_documents :
  cat = i[1]
  #print cat
  newList = []
  for word in i[0] :
    #print j
    clean_word = word.encode('ascii', 'ignore').decode('ascii').encode('ascii', 'ignore')
    newList.append(clean_word)
Example 24
__author__ = 'Piotr'
from random import shuffle
from pickle import dump
import os

from nltk import word_tokenize
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

from text_processing.replacers import RegexpReplacer

training = CategorizedPlaintextCorpusReader("Articles", r'.*\.txt', cat_pattern=r'(\w+)', encoding="utf-8")


def print_corpus_info():
    print("Training Corpus INFO")

    for category in training.categories():
        print("Number of documents in {0:8} category: {1}".format(category, len(training.fileids(category))))

    print("\n")


def save_documents(documents, name):
    with open(os.path.join("Classifiers", name + ".pickle"), 'wb') as file_handler:
        dump(documents, file_handler)


def get_training_documents(cut_off=0.75, save=False):
    train_set = []
    test_set = []
                
                corpusfile=open(corpusfolder+'/'+fname,'a')
                corpusfile.write(str(body))
                corpusfile.close()
                
            except Exception as e:
                print('Error on :'+id_)
                corpusfile.close()
                os.remove(mydir+'\\'+fname)
                pass
        else:
            print('Empty File:'+id_)

CreateCorpusFromDataFrame(mydir,data_sample)

my_corpus=CategorizedPlaintextCorpusReader(mydir,r'.*', cat_pattern=r'.*_(.*).txt') 

def preprocess(words, to_lowercase = True, remove_punctuation = True, remove_digits = True, remove_odd_chars = True, remove_stopwords=True, stem = True):
    if to_lowercase:
        words = [w.lower() for w in words]
    
    if remove_punctuation:
        words = [w for w in words if not (re.match(r'^\W+$', w) != None)]
    
    if remove_digits:
        words = [w for w in words if not w.replace('.','',1).isdigit()]

    if remove_odd_chars:
        words = [re.sub(r'[^a-zA-Z0-9_]','_', w) for w in words]
    
    if remove_stopwords:
import os
import glob
from nltk import NaiveBayesClassifier
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
import nltk
from nltk.corpus import wordnet as wn
import sys

Feature_Set={}


training_directory= "reviews"
Training_Corpus = CategorizedPlaintextCorpusReader(training_directory, r'pos|neg.*\.txt$', cat_pattern='(\w+)/*')

testing_directory= "reviews"
Testing_Corpus  = CategorizedPlaintextCorpusReader(testing_directory, r'pos|neg.*\.txt$', cat_pattern='(\w+)/*')



Training_Corpus_Text=nltk.RegexpTokenizer('\w+').tokenize(Training_Corpus.raw())
Positive_Corpus_Text=nltk.RegexpTokenizer('\w+').tokenize(Training_Corpus.raw(categories="pos"))
Negative_Corpus_Text=nltk.RegexpTokenizer('\w+').tokenize(Training_Corpus.raw(categories="neg"))

Training_Vocabulary = nltk.FreqDist(w.lower() for w in Training_Corpus_Text)
Positive_Vocabulary = nltk.FreqDist(w.lower() for w in Positive_Corpus_Text)
Negative_Vocabulary = nltk.FreqDist(w.lower() for w in Negative_Corpus_Text)


pos_den=float(len(Positive_Corpus_Text))+float(len(Positive_Vocabulary.keys()))
neg_den=float(len(Negative_Corpus_Text))+float(len(Negative_Vocabulary.keys()))
Example 27
import time
import nltk
import pickle
import re
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords

reader = CategorizedPlaintextCorpusReader('/media/storage/dpla-data/words/colls/', r'.*\.txt', cat_pattern=r'(\w+)\.txt')

# Removing oversized collections: hathi, nypl; Also, chunking them out:
# First batch represents what was completed on 4/10-4/11. 
colls = ["searches"]
#colls = ["artstor","biodiv","rumsey","commonwealth","georgia","harvard",
#        "ia","getty","kentucky","minnesota","missouri","mwdl","nara","nocar",
#        "smiths","socar","texas","gpo","illinois","usc","virginia","nocoll"]
#colls = ["ia","getty","kentucky","minnesota","missouri","mwdl"]
#colls = ["nara","nocar","smiths","socar","texas","gpo","illinois","usc","virginia","nocoll"]

#data = {}
stats = {}
common = {}

for coll in colls:
    print(reader.categories(coll+".txt"))
    stats[coll] = {}
    # 'kay. Can't pickle words. It's a stream reader.
    # But maybe you can if you tokenize with a regex
    # Which also pulls out punctuation
    print("prep & pickle words")
    words = re.split(r'\W+', reader.raw(coll+'.txt'))
    pickle.dump( words, open( "/media/storage/dpla-data/pickles/new/"+coll+"_words.p", "wb"))
Example 28
                             r'brown.pos',
                             word_tokenizer=SpaceTokenizer())

print(reader.words())
print(reader.sents())
print(reader.tagged_words())
print(reader.tagged_sents())
print(
    reader.tagged_words(tagset='universal')
)  ## Mapping tags to universal format, if tagset is not correct every TAG will have UNK

## Reading chunk corpora #######
reader = ChunkedCorpusReader('/Users/atul/nltk_data',
                             r'treebank.chunk',
                             tagset='en-brown')
print(reader.chunked_words())  ## Word level structure
print(reader.chunked_sents())  ## Sentence level structure
print(reader.chunked_paras())  ## Paragraph level structure

## Reading classifed corpora ##################
## classification extracted using cat_pattern (from file name), cat_map, or cat_file ######
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

reader = CategorizedPlaintextCorpusReader(
    '/Users/atul/nltk_data', r'movie_.*\.txt', cat_pattern=r'movie_(\w+)\.txt'
)  ## Easiest is to read files for different category
reader.categories()
reader.fileids(categories=['neg'])
reader.fileids(categories=['pos'])
reader.fileids()
Example 29
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.tokenize.casual import TweetTokenizer

from normalization import normalizeTwitterWordsWithExtraFeatures, normalizeTwitterWordsWithNegationHandle
import pickle, nltk

tweetTokenizer = TweetTokenizer(reduce_len=True, preserve_case=True, strip_handles=False)
corpus = CategorizedPlaintextCorpusReader('corpus/2-step/polar', r'(\w+)-tweet[0-9]+\.txt', cat_pattern=r'(\w+)-tweet[0-9]+\.txt', word_tokenizer=tweetTokenizer)

normalizationFunction = normalizeTwitterWordsWithNegationHandle

wordsTaggedToCategory = []

i = 1
for category in corpus.categories():
    for fileid in corpus.fileids(category):
        words = corpus.words(fileids=[fileid])
        normalizedWords = normalizationFunction(words)
        extraNormalizedWords = normalizeTwitterWordsWithExtraFeatures(words)
        wordsTagged = nltk.pos_tag(normalizedWords)
        wordsTaggedToCategory += [(wordsTagged, category)]
        print(i)
        i += 1

with open("wordsTaggedToCategory-polar", 'wb') as fileout:
    pickle.dump(wordsTaggedToCategory, fileout)
Example 30
import nltk, random, string
from nltk.corpus.reader import CategorizedPlaintextCorpusReader 
from nltk.corpus import stopwords

reader = CategorizedPlaintextCorpusReader('./', r'.*\.txt', cat_pattern=r'(\w+)/*')
print reader.categories()
print reader.fileids()

documents = [(list(reader.words(fileid)), category)
	for category in reader.categories()
	for fileid in reader.fileids(category)]
random.shuffle(documents)

# Remove stopwords & punc from content
table = string.maketrans("","")
stopwords = nltk.corpus.stopwords.words('english')
filtered_words = [w for w in reader.words() if not w in stopwords]
filtered_words_nopunc = [w for w in filtered_words if not w in string.punctuation]
all_words = nltk.FreqDist(w.lower() for w in filtered_words_nopunc)

print all_words

word_features = all_words.keys()[:2000]




def document_features(document):
	document_words = set(document)
	features = {}
Example 31
 def __init__(self, *args, **kwargs):
     self.annotation_word_tokenizer = RegexpTokenizer(r'(Agree|Disagree) Strongly|(Agree|Disagree) Somewhat|Never Addressed|No Opinion|[AC]-\w+|\d+-\d+|\w+|[^\w\s]+')
     CategorizedPlaintextCorpusReader.__init__(self, *args, **kwargs)
import nltk
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
import random
from BeautifulSoup import BeautifulSoup

#Reading from  custom created categorized corpora
#categorized corpora will be categorized for topic, genre, polarity, etc.
#In addition to the standard corpus interface, these corpora provide access to the list of categories
#and the mapping between the documents and their categories (in both directions)
# Access the categories using the categories() method

d = nltk.data.find('corpora/SecurityThreat')
reader = CategorizedPlaintextCorpusReader(d,
                                          r'.*\.txt',
                                          cat_pattern=r'(\w+)/*')
from textblob.classifiers import NaiveBayesClassifier
random.seed(1)
train = [
    ('Identity', 'IdentityThreat'),
    ('identity', 'IdentityThreat'),
    ('identities', 'IdentityThreat'),
    ('identity loss', 'IdentityThreat'),
    ('insider', 'InsiderThreat'),
    ('Malware', 'Malware'),
]

# Categorized corpora Reader collect the respective words based on ThreatType
ThreatTypes = [(list(reader.words(fileid)), category)
               for category in reader.categories()
               for fileid in reader.fileids(category)]
random.shuffle(ThreatTypes)
Example 33
class PolarityDataReader(object):
    """
    PolarityDataReader:
        Reader for POS/NEG Categorized Sentiword data

    uses:
        nltk.corpus.reader.CategorizedPlaintextCorpusReader

    usage:
        
        dataReader = PolarityDataReader([rootLocation],[readerObject])
        dataReader.getDocuments()
        dataReader.setTerms([No:ofTerms])

        featuresets = dataReader.getTermDocMatrix()

    """
    def __init__(self, rootLocation=config.POLARITY_DATASET, reader=None):
        super(PolarityDataReader, self).__init__()
        if reader == None:
            self.reader = Reader(rootLocation,
                                 r'.*/.*',
                                 cat_pattern=r'(.*)/.*')
        else:
            self.reader = reader
        self.setStopWords()
        self.documents = None
        self.terms = None

    def getDocuments(self):
        if not self.documents:
            self.documents = [(list(self.reader.words(fileid)), category)
                              for category in self.reader.categories()
                              for fileid in self.reader.fileids(category)]
        return self.documents

    def setStopWords(self, fileLocation=config.STOP_WORDS_FILE):
        stopfile = open(fileLocation, 'r')
        self.stopwords = stopfile.read().split()

    def removeStopWords(self, wordList):
        """ Remove common words which have no search value """
        return [word for word in wordList if word not in self.stopwords]

    def setTerms(self, size=2000, featureSelection='PD', removeStopWords=True):
        if featureSelection == 'PD':
            self.__setTermsPD__(size)
            print "Feature Selection : PD :done "

        elif featureSelection == 'CHI_SQUARE':
            self.__setTermsCHISQUARE__(size)
            print "Feature Selection : CHI_SQUARE :done "

        else:
            """
            getting the most frequent words
            """
            all_words = [w.lower() for w in self.reader.words()]
            if removeStopWords:
                all_words = self.removeStopWords(all_words)
            all_words = FreqDist(w for w in all_words)
            self.terms = all_words.keys()[:size]
            print "Feature Selection: frequent Words :done "

    def documentFeatures(self, document, sentiwordnet=False):
        document_words = set(document)
        features = {}
        if sentiwordnet:
            pass
            #TODO
        else:
            for word in self.terms:
                features[word] = (word in document_words)
        return features

    def getTermDocMatrix(self):
        return [(self.documentFeatures(document), category)
                for (document, category) in self.documents]

    def __setTermsPD__(self, size):
        """
        score=|(posDF-negDF)|/(posDF+negDF)
        """
        posWord = {}
        negWord = {}

        for word in self.reader.words(categories=['pos']):
            inc(posWord, word.lower())
        for word in self.reader.words(categories=['neg']):
            inc(negWord, word.lower())

        wordScores = {}
        for word in self.reader.words():
            try:
                posScore = posWord[word]
            except KeyError, e:
                posScore = 0
            try:
                negScore = negWord[word]
            except KeyError, e:
                negScore = 0
            totalScore = posScore + negScore
            if totalScore <= 10:  # min total count
                wordScores[word] = 0.1
            else:
                wordScores[word] = abs(posScore - negScore) / totalScore
Example 34
        {
            "category": cat, "doc": doc,
            "tot_words": tot_words, "avg_char": avg_char,
            "sentences": sentences, "avg_word": avg_word,
            "most_common": common[0][0], "most_common_freq": common[0][1]
        }
    )

def clean_string(text):
    clean_text = text.lower()
    clean_text = re.sub('[^0-9a-zA-Z //]+', '', clean_text)
    return clean_text.strip()

#create document properties
corpus = CategorizedPlaintextCorpusReader(
    'C:/Users/gavin_000/Python/texts',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*'
)

stop = stopwords.words('english')

results = pd.DataFrame()

for category in corpus.categories():
    for document in corpus.fileids(category):
        doc_properties = create_document_properties(category, document)
        results = results.append(doc_properties, ignore_index=True)

print results
#From 1491 of cookbook
#This is how we would load in the customized corpus
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
#reader = CategorizedPlaintextCorpusReader('.',r'movie_.*\.txt', cat_pattern=r'movie_(\w+)\.txt')
#reader = CategorizedPlaintextCorpusReader('.',r'movie_.*\.txt', cat_map={'movie_pos.txt':['pos'],'movie_next.txt':['neg']})
reader = CategorizedPlaintextCorpusReader('./nltk_data/custom_corpora/',r'content_.*\.txt', cat_map={'content_good.txt':['good'],'content_bad.txt':['bad']})

reader.categories()
#['bad', 'good']
reader.fileids(categories=['good'])
#['content_good.txt']
reader.fileids(categories=['bad'])
#['content_bad.txt']

#location 3442
#extract features from the corpus

def bag_of_words(words):
  return dict([(word, True) for word in words])

def bag_of_words_not_in_set(words, badwords):
  return bag_of_words(set(words) - set(badwords))

from nltk.corpus import stopwords
def bag_of_non_stopwords(words, stopfile='english'):
  badwords = stopwords.words(stopfile)
  return bag_of_words_not_in_set(words, badwords)

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
Example 36
]
for topic in topics:
    statuses = Cursor(api.search,
                      q=f"{topic} -filter:retweets",
                      tweet_mode="extended").items(200)
    for status in statuses:
        if status.lang == "en":
            file = open(
                f"C:/Users/olgur/natural_language_toolkit_data/twitter_corpus/tweets_{topic}.txt",
                "a",
                encoding="utf-8")
            file.write(status.full_text)
            file.close()

reader = CategorizedPlaintextCorpusReader(
    "C:/Users/olgur/natural_language_toolkit_data/twitter_corpus",
    r'tweets_.*\.txt',
    cat_pattern=r'tweets_(\w+)\.txt')

# setting up stopwords
stopword_reader = PlaintextCorpusReader(
    "C:/Users/olgur/natural_language_toolkit_data/twitter_corpus/twitterstopwords/",
    r'.*\.txt',
    encoding='latin-1')
stop_words = set(['“', '”', '’', ",", "#", "—", "__", "_", "___"])

for file in stopword_reader.fileids():
    stops = stopword_reader.raw(file).replace("\n", ",").split(",")
    for word in stops:
        stop_words.add(word)

# text wrangling functions:
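The excerpt is cut off after this comment; a minimal sketch of one such helper, assuming simple regex-based URL stripping (the function name is an assumption):

import re

def remove_links(text):
    # Sketch (assumption): drop anything that looks like an http(s) link.
    return re.sub(r"https?://\S+", "", text)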
Example 37
#!/usr/bin/env python
# coding: utf-8

import nltk
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

corpus_root = '/Users/athessen/nltk_data/corpora/eco'
reader = CategorizedPlaintextCorpusReader(corpus_root,r'lion|shark\d*\.txt',cat_file='cats.txt')
print reader.fileids()

print reader.categories()


"""
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = all_words.keys()[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features
"""
Example 38
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = API(auth,
          wait_on_rate_limit=True)  # setting limit to avoid upsetting Twitter
'''accounts = [("NASA", 11348282), ("BarackObama", 813286)]
for account in accounts:
    statuses = Cursor(api.user_timeline, user_id=account[1], include_rts=False, exclude_replies=True, count=10000, tweet_mode="extended").items()
    for status in statuses:
        if status.lang == "en":
            file = open(f"C:/Users/olgur/nltk_data/twitter_corpus/tweets_{account[0]}.txt", "a",
                        encoding="utf-8")
            file.write(status.full_text.replace("\n", " ") + "\n")
            file.close()'''

reader = CategorizedPlaintextCorpusReader(
    "C:/Users/olgur/nltk_data/twitter_corpus",
    r'tweets_.*\.txt',
    cat_pattern=r'tweets_(\w+)\.txt')

# setting up stopwords
stop_words = set([
    '“', '”', '’', ",", "#", "—", "__", "_", "___", ".", ":", '"', "?", "!",
    "-", ")", "(", "...", "$"
]).union(set(stopwords.words("english")))


def remove_links(text):
    http_regex = re.compile(r"(https|http)://.*")
    return http_regex.sub(r"", text)


def remove_users(text):
Example 39
import time
import nltk
import pickle
import re
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords

reader = CategorizedPlaintextCorpusReader('/media/storage/dpla-data/words/colls.oct/', r'.*\.txt', cat_pattern=r'(\w+)\.txt')

# Removing oversized collections: hathi, nypl; Also, chunking them out:
# First batch represents what was completed on 4/10-4/11. 
#colls = ["searches"]
colls = ["artstor","biodiv","rumsey","commonwealth","georgia","harvard",
        "ia","getty","kentucky","minnesota","missouri","mwdl","nara","nocar",
        "smiths","socar","texas","gpo","illinois","usc","virginia","nocoll",
        "hathi","nypl"]
#colls = ["ia","getty","kentucky","minnesota","missouri","mwdl"]
#colls = ["nara","nocar","smiths","socar","texas","gpo","illinois","usc","virginia","nocoll"]

#data = {}
stats = {}
common = {}

for coll in colls:
    print(reader.categories(coll+".txt"))
    stats[coll] = {}
    # 'kay. Can't pickle words. It's a stream reader.
    # But maybe you can if you tokenize with a regex
    # Which also pulls out punctuation
    print("prep & pickle words")
    words = re.split(r'\W+', reader.raw(coll+'.txt'))
Example 40
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
import nltk

d = nltk.data.find('corpora/cookbook')
reader = CategorizedPlaintextCorpusReader(d, r'movie_.*\.txt', cat_pattern=r'movie_(\w+)\.txt')
print(reader.categories())
print(reader.fileids(categories='neg'))
print(reader.fileids(categories='pos'))

# from nltk.corpus import brown
# print(brown.categories())
Example 41
from nltk.corpus import stopwords
from nltk.tokenize import LineTokenizer, RegexpTokenizer
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from nltk.stem import SnowballStemmer
from processor import Processor as Proc

data_folder = './data'
encoding = 'UTF8'
language = 'italian'

wordTok = RegexpTokenizer(r'(\w+|@\w+|<3|(\:\-?\))|(\:\-?\()|(\;\-?\))|((\:|(X|x))\-?(D|d)))')
sentTok = LineTokenizer()
reader = CategorizedPlaintextCorpusReader(data_folder, r'SENTIPOLC-.*\.txt',
                                          cat_pattern=r'SENTIPOLC-(\w+)\.txt',
                                          encoding=encoding,
                                          word_tokenizer=wordTok,
                                          sent_tokenizer=sentTok)

pos_tweets = reader.sents(reader.fileids('pos'))
neg_tweets = reader.sents(reader.fileids('neg'))

# Inspection
rndP = random.randrange(len(pos_tweets))
rndN = random.randrange(len(neg_tweets))
print 'Pos:\n', pos_tweets[rndP:rndP+3], '\nNeg:\n', neg_tweets[rndN:rndN+3], '\n'

# All lowercase
pos_tweets = Proc.lowerize(pos_tweets)
neg_tweets = Proc.lowerize(neg_tweets)
Example 42
loc = '/Users/rmoura/nltk_data/corpora/rai/textoSimples/'
corpus1 = PlaintextCorpusReader(loc, '.*\.txt')
print(corpus1.fileids())
print(corpus1.sents())
print(corpus1.words())

# Tagged text corpus
from nltk.corpus.reader.tagged import TaggedCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoEtiquetas/'
corpus2 = TaggedCorpusReader(loc, '.*\.txt')
print(corpus2.fileids())
print(corpus2.words())
print("Palavras etiquetadas: ", corpus2.tagged_words())
print(corpus2.tagged_words('003.txt'))
print("Sentencas diretas:")
for s in corpus2.sents():
    print(' '.join(s))

from nltk.corpus.reader import CategorizedPlaintextCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoCategorias/'
corpus3 = CategorizedPlaintextCorpusReader(loc, '.*\.txt', cat_file="categorias.txt")
print(corpus3.fileids())
print(corpus3.categories())
print(corpus3.words(categories='brasnam'))

# Stopword definitions
stopwords = nltk.corpus.stopwords.words('portuguese')
fd = nltk.FreqDist(w.lower() for w in corpus3.words())
fd1 = nltk.FreqDist(w.lower() for w in corpus3.words()
                    if w.isalpha() and w not in stopwords)
Example 43
#!pip install wordcloud


# In[2]:


from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords
stopwordlist=stopwords.words('german')
from wordcloud import WordCloud

rootDir="../01access/GERMAN"
filepattern=r"(?!\.)[\w_]+(/RSS/FeedText/)[\w-]+/[\w-]+\.txt"
#filepattern=r"(?!\.)[\w_]+(/RSS/FullText/)[\w-]+/[\w-]+\.txt"
catpattern=r"([\w_]+)/.*"
rssreader=CategorizedPlaintextCorpusReader(rootDir,filepattern,cat_pattern=catpattern)


# In[3]:


singleDoc=rssreader.paras(categories="TECH")[0]
print("The first paragraph:\n",singleDoc)
print("Number of paragraphs in the corpus: ",len(rssreader.paras(categories="TECH")))


# In[4]:


techdocs=[[w.lower() for sent in singleDoc for w in sent if (len(w)>1 and w.lower() not in stopwordlist)] for singleDoc in rssreader.paras(categories="TECH")]
print("Number of documents in category Tech: ",len(techdocs))
@author: jagpr
"""

import collections, itertools
import nltk.classify.util, nltk.metrics
from nltk.metrics import *
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

#Creating Corpus using CategorizedPlaintextCorpusReader
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
imdb_reviews = CategorizedPlaintextCorpusReader(
    'D://USF//Independent Research Project//Dataset//Movie Review Dataset Pos Neg//aclImdb//train//negpos',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*')

len(imdb_reviews.fileids())


def evaluate_classifier(featx):
    negids = imdb_reviews.fileids('neg')
    posids = imdb_reviews.fileids('pos')

    negfeats = [(featx(imdb_reviews.words(fileids=[f])), 'neg')
                for f in negids]
    posfeats = [(featx(imdb_reviews.words(fileids=[f])), 'pos')
                for f in posids]

    negcutoff = len(negfeats) * 3 / 4
class PolarityDataReader(object):
    """
    PolarityDataReader:
        Reader for POS/NEG Categorized Sentiword data

    uses:
        nltk.corpus.reader.CategorizedPlaintextCorpusReader

    usage:
        
        dataReader = PolarityDataReader([rootLocation],[readerObject])
        dataReader.getDocuments()
        dataReader.setTerms([No:ofTerms])

        featuresets = dataReader.getTermDocMatrix()

    """
    
    def __init__(self, rootLocation = config.POLARITY_DATASET,reader=None):
        super(PolarityDataReader, self).__init__()
        if reader == None:
            self.reader = Reader(rootLocation,r'.*/.*', cat_pattern=r'(.*)/.*')
        else:
            self.reader = reader
        self.setStopWords()
        self.documents = None;
        self.terms = None;


    def getDocuments(self):
        if not self.documents:
            self.documents = [(list(self.reader.words(fileid)), category) 
                              for category in self.reader.categories()
                              for fileid in self.reader.fileids(category)]
        return self.documents;

    def setStopWords(self,fileLocation = config.STOP_WORDS_FILE):
        stopfile = open(fileLocation, 'r')
        self.stopwords = stopfile.read().split()

    def removeStopWords(self,wordList):
        """ Remove common words which have no search value """
        return [word for word in wordList if word not in self.stopwords]

    def setTerms(self,size=2000,featureSelection='PD',removeStopWords=True):
        if featureSelection == 'PD':
            self.__setTermsPD__(size)
            print "Feature Selection : PD :done "
            
        elif featureSelection == 'CHI_SQUARE':
            self.__setTermsCHISQUARE__(size)
            print "Feature Selection : CHI_SQUARE :done "
        elif featureSelection == 'SWNSS':
            self.__setTermsSWNSS__(size)
            print "Feature Selection : SWNPD :done "
        else:
            """
            getting the most frequent words
            """
            all_words = [w.lower() for w in self.reader.words()];
            if removeStopWords:
                all_words = self.removeStopWords(all_words);
            all_words = FreqDist(w for w  in all_words)
            self.terms = all_words.keys()[:size]
            print "Feature Selection: frequent Words :done "


    def documentFeatures(self,document,sentiwordnet=False):
        document_words = set(document)
        features = {}
        if sentiwordnet:
            pass
            #TODO
        else:
            for word in self.terms:
                features[word] = (word in document_words)
        return features
                

    def getTermDocMatrix(self):
        return [(self.documentFeatures(document), category) 
                for (document,category) in self.documents]

    def __setTermsPD__(self,size):
        """
        score=|(posDF-negDF)|/(posDF+negDF)
        """
        posWord = {};
        negWord = {};
        
        for word in self.reader.words(categories = ['pos']):
            inc(posWord,word.lower());
        for word in self.reader.words(categories = ['neg']):
            inc(negWord,word.lower());
                
        wordScores = {}
        for word in self.reader.words():
            try:
                posScore = posWord[word]
            except KeyError, e:
                posScore = 0
            try:
                negScore = negWord[word]
            except KeyError, e:
                negScore = 0
            totalScore = posScore + negScore
            if totalScore <= 10:  # min total count
                wordScores[word] = 0.1
            else:
                wordScores[word] = abs(posScore-negScore)/totalScore
                #removeStopWords does not affect accuracy
        termScore = sorted(wordScores.items(), key=lambda (w, s): s, reverse=True)[:size]
        self.terms = [w for (w, s) in termScore]
Example 46
# NLTK - train nb_classifier


import random
import nltk as nltk
#nltk.download()
from nltk.corpus import stopwords
import os, os.path
path = os.path.expanduser('~/nltk_data')
if not os.path.exists(path):
    os.mkdir(path)
os.path.exists(path)
import nltk.data
path in nltk.data.path
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
reader = CategorizedPlaintextCorpusReader('.', r'.*_news_.*\.csv', cat_pattern=r'.*_news_(\w+)\.csv')
reader.categories()

def bag_of_words(words):
    return dict([(word, True) for word in words if word[0].isalpha()])
import collections
def bag_of_words_not_in_set(words, badwords):
    return bag_of_words(set(words)-set(badwords))

def bag_of_non_stopwords(words, stopfile='english'):
    badwords = stopwords.words(stopfile)
    return bag_of_words_not_in_set(words, badwords)

from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
import nltk as nltk
import os, os.path
path = os.path.expanduser('~/nltk_data')
if not os.path.exists(path):
    os.mkdir(path)
os.path.exists(path)
import nltk.data
path in nltk.data.path
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
reader2 = CategorizedPlaintextCorpusReader('.', r'news_.*\.csv', cat_pattern=r'news_(\w+)\.csv')
reader.categories()
reader.fileids(categories=['UP'])
def bag_of_words(words):
    return dict([(word, True) for word in words])
import collections

def label_feats_from_corpus(corp, feature_detector=bag_of_words):
    label_feats = collections.defaultdict(list)
    for label in corp.categories():
        for fileid in corp.fileids(categories=[label]):
            feats = feature_detector(corp.words(fileids=[fileid]))
            label_feats[label].append(feats)
    return label_feats

def split_label_feats(lfeats, split=0.75):
    train_feats = []
    test_feats = []
    for label, feats in lfeats.iteritems():
        cutoff = int(len(feats) * split)
        train_feats.extend([(feat, label) for feat in feats[:cutoff]])
        test_feats.extend([(feat, label) for feat in feats[cutoff:]])