def extract_related_terms(self):
    env = ReportEnviroments()
    new_corpus_clusters_fileids_list = PlaintextCorpusReader(env.cluster_corpus_path, '.*')
    raw_text_list = []
    for fileid in new_corpus_clusters_fileids_list.fileids():
        raw_text_list.append([new_corpus_clusters_fileids_list.raw(fileids=fileid)])
    return raw_text_list
Example No. 2
	def fileids(self, years='*'):
		"""
			Returns a list of all files, or only the files under the given year folder(s).
			
			>>> len(hr.fileids())
			3206
			>>> len(hr.fileids(years=1996))
			157
			>>> len(hr.fileids(years=[1996,2007]))
			246
			>>> hr.fileids()[0]
			'1996/HAM2-960622.xml'
		"""
		if type(years) is int:
			years = [str(years)]
		
		if years=='*':
			wordlists = PlaintextCorpusReader(self.hamshahri_root, '.*\.xml')
			fids = wordlists.fileids()
			return fids
		else:
			fids = []
			for year in years:
				wordlists = PlaintextCorpusReader(self.hamshahri_root, str(year) + '/.*\.xml')
				fids = fids + wordlists.fileids()
			return fids
Example No. 3
def main():
	current_directory = os.path.dirname(__file__)
	corpus_root = os.path.abspath(current_directory)
	wordlists = PlaintextCorpusReader(corpus_root, 'Islip13Rain/.*\.txt')
	wordlists.fileids()
	ClassEvent = nltk.Text(wordlists.words())
	CEWords = ["Long Island", "Weather Service", "flooding", "August", 
		"heavy rains", "Wednesday", "Suffolk County", "New York", "rainfall",
		"record"]

	# ClassEvent Statistics
	print "--------- CLASS EVENT STATISTICS -------------"
	print "ClassEvent non stopwords", non_stopword_fraction(ClassEvent)	
	print "ClassEvent WORD LENGTH DISTRIBUTIONS:"
	print_word_length_distributions(ClassEvent)
	print "ClassEvent PERCENTAGE OF WORD OCCURRENCES:"
	print_percentage_of_word_in_collection(ClassEvent, CEWords)
	
	ClassEventLettersPerWord = average_letters_per_word(ClassEvent)
	ClassEventWordsPerSent = len(wordlists.words()) / len(wordlists.sents())
	ClassEventARI = (4.71 * ClassEventLettersPerWord) + (0.5 * \
		ClassEventWordsPerSent) - 21.43
	
	print "Average number of letters per word", ClassEventLettersPerWord
	print "Average number of words per sentence:", ClassEventWordsPerSent
	print "Automated Readability Index:", ClassEventARI


	print 

	wordlists_event = PlaintextCorpusReader(corpus_root, "Texas_Wild_Fire/.*\.txt")
	wordlists_event.fileids()
	YourSmall = nltk.Text(wordlists_event.words())
	SmallEventWords = ["Fire", "Wildfire", "Water", "Damage", "Ground", "Burn", 
		"Town", "Heat", "Wind", "Speed", "Size", "City", "People", "Home",
		"Weather", "Debris", "Death", "Smoke", "State", "Ash"]
	

	# YourSmall statistics
	print "--------- YOUR SMALL STATISTICS --------------"
	print "Texas_Wild_Fire", non_stopword_fraction(YourSmall)
	print "YourSmall WORD LENGTH DISTRIBUTIONS:"
	print_word_length_distributions(YourSmall)
	print "YourSmall PERCENTAGE OF WORD OCCURRENCES:"
	print_percentage_of_word_in_collection(YourSmall, SmallEventWords)
	
	YourSmallLettersPerWord = average_letters_per_word(YourSmall)
	YourSmallWordsPerSent = len(wordlists_event.words()) / \
		len(wordlists_event.sents())
	YourSmallARI = (4.71 * YourSmallLettersPerWord) + (0.5 * \
		YourSmallWordsPerSent) - 21.43

	print "Average number of letters per word", YourSmallLettersPerWord
	print "Average number of words per sentence:", YourSmallWordsPerSent
	print "Automated Readability Index", YourSmallARI
Example No. 4
def carga():
    client = pymongo.MongoClient(MONGODB_URI)
    db = client.docs
    docs=db.SIMILITUD

    completo=[]
    newcorpus = PlaintextCorpusReader(corpus_root, '.*')
    result={}
    for fileid in newcorpus.fileids():
        for file2 in newcorpus.fileids():
            result= {"f1": fileid, "f2":file2, "value": compare_texts(newcorpus.raw(fileid), newcorpus.raw(file2))}
            docs.insert_one(result)
Example No. 5
def loadCorpora():

    corpus_root = '/usr/share/dict'
    wordlists = PlaintextCorpusReader(corpus_root, '.*')
    wordlists.fileids()
    wordlists.words('connectives')

    corpus_root = r"C:\corpora\penntreebank\parsed\mrg\wsj"
    file_pattern = r".*/wsj_.*\.mrg" 
    ptb = BracketParseCorpusReader(corpus_root, file_pattern)
    ptb.fileids()
    len(ptb.sents())
    ptb.sents(fileids='20/wsj_2013.mrg')[19]
Example No. 6
class Documents:
    def __init__(self, root):
        self.files = PlaintextCorpusReader(root, '.*\.txt')
        self.posting = {}
        self.idf = {}
        self.file_length = {}
        self.file_id_names = {}
        self.N = len(self.files.fileids())

    def process(self):
        for idx, file in enumerate(self.files.fileids()):
            print idx
            filename = file[:-len('.txt')] if file.endswith('.txt') else file
            self.file_id_names[idx] = filename
            text = self.files.raw(file)
            words = process(text)
            if settings['phrase_query']:
                raw_words = raw_process(text)
            if words.values():
                self.file_length[idx] = normalization(words.values())
            for word, freq in words.iteritems():
                if self.idf.has_key(word):
                    self.idf[word] += 1
                else:
                    self.idf[word]  = 1

                if not self.posting.has_key(word):
                    self.posting[word] = {}
                if settings['phrase_query']:
                    self.posting[word][idx] = indices(raw_words, word)
                else:
                    self.posting[word][idx] = freq
        for word, idf in self.idf.iteritems():
            self.posting[word]['idf'] = idf

    def dump(self):
        posting_pickle = open('posting.pkl', 'wb')
        for term, value in self.posting.iteritems():
          self.posting[term] = str(value)
        pickle.dump(self.posting, posting_pickle, 2)
        posting_pickle.close()

        length_pickle = open('file_length.pkl', 'wb')
        pickle.dump(self.file_length, length_pickle, 2)
        length_pickle.close()

        file_ids_pickle = open('file_ids.pkl', 'wb')
        pickle.dump(self.file_id_names, file_ids_pickle, 2)
        file_ids_pickle.close()
Example No. 7
def represent_docs(corpus,cat,dictio_classes,categories):
    
    docs_train = []
    for dirs in os.walk(corpus):
        corpus_root = dirs[0] # walk down the directory tree of the corpus path
        
        if corpus_root != corpus:
            if os.path.basename(corpus_root) == cat:
                dictio = dictio_classes[cat]
                textlist = PlaintextCorpusReader(corpus_root,'.*')
                for files in textlist.fileids():
                    test= corpus_root + '/' + files
                    x = open(test,'r')
                    lignes = x.readlines()
                    x.close()
                    l=dictio.items()
                    l.sort(key=itemgetter(1),reverse=True)
                    l=l[:2000]
                    l=dict(l)
                    for mot,fval in l.items():
                        if any(ligne.find(mot) >= 0 for ligne in lignes):
                            l[mot]=fval
                        else:
                            l[mot]=0.0
                    docs_train.append((l,'Yes'))
            else:
                if os.path.basename(corpus_root) in categories:
                    cat_else = os.path.basename(corpus_root)
                    dictio = dictio_classes[cat_else]
                    textlist = PlaintextCorpusReader(corpus_root,'.*')
                    for files in textlist.fileids():
                        test= corpus_root + '/' + files
                        x = open(test,'r')
                        lignes = x.readlines()
                        x.close()
                        l=dictio.items()
                        l.sort(key=itemgetter(1),reverse=True)
                        l=l[:2000]
                        l=dict(l)
                        for mot,fval in l.items():
                            if any(ligne.find(mot) >= 0 for ligne in lignes):
                                l[mot]=fval
                            else:
                                l[mot]=0.0
                        docs_train.append((l,'No'))        
    return docs_train
Example No. 8
def preprocTrain(corpus, tf_file, vocab_file):
    global MIN_FREQ
    stopwds = stopwords.words('english')

    TF = {} #gets the freq for each token
    filter_TF = {} #get the freq for each token having freq > minFreq
    feature_train = {} #final features for training class. Passed on to write ARFF files
    vocabulary = []
    ctDocs = {}
    totalDocs = 0
    minFreq = MIN_FREQ
    TrainingFiles = {}

    #loading our corpus
    corpus_root=corpus
    wordlists = PlaintextCorpusReader(corpus_root, '.*')
    ctDocs = len(wordlists.fileids()) #total no of files in each class
    totalDocs = ctDocs + totalDocs #total no of files
    TrainingFiles = wordlists.fileids() #contains files for each class

    sys.stderr.write("Reading corpus")
    for fileid in wordlists.fileids():
        sys.stderr.write(".")

        raw = wordlists.raw(fileid)
        tokens = nltk.word_tokenize(raw)
        text = nltk.Text(tokens)

        words = [w.lower() for w in text if w.isalnum() and w.lower() not in stopwds and len(w) > 3]
        vocab = set(words)
        words = nltk.Text(words)

        #calculate TF
        TF[fileid] = {fileid:fileid}
        filter_TF[fileid] = {fileid:fileid}
        for token in vocab:
            TF[fileid][token] = freq(token, words)

            if TF[fileid][token] > minFreq:  #min feature freq.
                vocabulary.append(token)
                filter_TF[fileid][token] = tf(TF[fileid][token],words)

    pickle.dump(filter_TF, open(tf_file, "wb"));
    sys.stderr.write("done\nCalculating TF*IDF scores")
    all_vocabulary = list(set(vocabulary))
    pickle.dump(all_vocabulary, open(vocab_file, "wb"));
    #featureIDF = idf(totalDocs,filter_TF,all_vocabulary)
    pprint(TF, stream=sys.stderr)
Example No. 9
 def plot_cfreq(self,corpus,patt,n):
     wordlists = PlaintextCorpusReader(corpus,patt)
     fileids = wordlists.fileids()
     fre = FreqDist()
     for id in fileids:
         words = wordlists.words(id)
         fre.update(word.lower() for word in words if word.isalpha())
     return fre.plot(n,cumulative=True)
Example No. 10
def tokenisation (path):
    tokens = []
    min_length = 3
    for dirs in os.walk(path):
        corpus_root = dirs[0] # walk down the directory tree of the path
        if corpus_root != path:
            textlist = PlaintextCorpusReader(corpus_root,'.*')
            for files in textlist.fileids():
                test= corpus_root + '/' + files
                fs = open(test,'r')
                texte = fs.read()
                words = map(lambda word: word.lower(), wordpunct_tokenize(texte))
                for word in words:
                    if word not in cachedStopWords:
                        tokens.append(word)
                fs.close() 
    p = re.compile('[a-zA-Z]+')
    tokens_filtered = filter(lambda token: p.match(token) and len(token)>= min_length, tokens)

#    vocab = []
#    for words in tokens_filtered:
#        vocab.append(SnowballStemmer("english").stem(words))
    
#    tokens_filtered_sans = set(vocab)
    tokens_filtered_sans = set(tokens_filtered)
    tokens_filtered_sans = list(tokens_filtered_sans)
    
    return tokens_filtered_sans
def get_coarse_level_features(dataset, output_file):
	# Import the corpus reader
	corpus_root = '/home1/c/cis530/data-hw2/'+dataset
	# Define the folder where the files are situated
	files_dataset = PlaintextCorpusReader(corpus_root, '.*')
	# Open the output_file
	output = open('/home1/c/cis530/data-hw2/'+output_file,'w')
	# Read the stopwlist
	stop_list = open('/home1/c/cis530/data-hw2/'+'stopwlist.txt').read()
	types_stop_list=stop_list.split()
	for fileid in files_dataset.fileids():
		# Output the docid
		output.write(dataset+'/'+fileid+' ')
		# Output the topic_name
		topic_name=fileid.split('/')[0]	
		output.write(topic_name+' ')
		# Output the num_tokens	
		tokens=files_dataset.words(fileid)
		output.write('tok:'+str(len(tokens))+' ')
		# Output the num_types
		types=set(tokens)
		output.write('typ:'+str(len(types))+' ')
		# Output the num_contents
		output.write('con:'+str(len([w for w in tokens if w not in types_stop_list]))+' ')
		# Output the num_sents
		sents = files_dataset.sents(fileid)
		output.write('sen:'+str(len(sents))+' ')
		# Output the avg_slen
		avg_slen=round(float(len(tokens))/float(len(sents)),2)
		output.write('len:'+str(avg_slen)+' ')
		# Output the num_caps
		output.write('cap:'+str(len([w for w in tokens if w[0]>='A' and w[0]<='Z'])))
		output.write('\n')
	output.close()
def get_lm_features(dataset,output_file):
	# Import the corpus reader
	corpus_root = '/home1/c/cis530/data-hw2/'+dataset
	# Define the folder where the files are situated
	files_dataset = PlaintextCorpusReader(corpus_root, '.*')
	fin_model = BigramModel('Finance',corpus_root)
	hel_model = BigramModel('Health',corpus_root)
	res_model = BigramModel('Computers_and_the_Internet',corpus_root)
	co_model = BigramModel('Research',corpus_root)
	output = open('/home1/c/cis530/data-hw2/'+output_file,'w')
	for fileid in files_dataset.fileids():
		# Output the docid
		output.write(dataset+'/'+fileid+' ')
		# Output the topic_name
		topic_name=fileid.split('/')[0]
		output.write(topic_name+' ')		
		word_list = files_dataset.words(fileid)
		finprob,finper = fin_model.get_prob_and_per(word_list)		
		hlprob,hlper = hel_model.get_prob_and_per(word_list)	
		resprob,resper = res_model.get_prob_and_per(word_list)
		coprob,coper = co_model.get_prob_and_per(word_list)
		output.write('finprob:'+str(round(finprob,1))+' ')
		output.write('hlprob:'+str(round(hlprob,1))+' ')
		output.write('resprob:'+str(round(resprob,1))+' ')
		output.write('coprob:'+str(round(coprob,1))+' ')
		output.write('finper:'+str(round(finper,1))+' ')
		output.write('hlper:'+str(round(hlper,1))+' ')
		output.write('resper:'+str(round(resper,1))+' ')
		output.write('coper:'+str(round(coper,1))+' ')
		output.write('\n')
	output.close()
Example No. 13
 def save_my_count(self,corpus,patt,n,filename):
     wordlists = PlaintextCorpusReader(corpus,patt)
     fileids = wordlists.fileids()
     res = []
     for id in fileids:    
         leng = len(wordlists.words(id))
         wordc = len(set(wordlists.words(id)))
         wor = "=> corpus tokens: " + `leng` + "\n"
         dis = "=> corpus token types: " + `wordc` + "\n"
         ric = "=> ind lex richness: " + `leng / wordc` + "\n"
         res.append(dis)
         res.append(ric)
         res.append(wor)
         for word in sorted(set(wordlists.words(id))):
             freq = (wordlists.words(id)).count(word)
             f = "(" + word.lower() + "," + `round(100 * (freq / leng),1)` + ")\n"
             t = "(" + word.lower() + "," + `freq` + "/" + `leng` + ")"
             res.append(f)
             res.append(t)
     out = open("../data/"+filename,"w")
     try:
         for t in res[:n]:
             out.write(t + "\n")
     finally:
         out.close()
Example No. 14
class Document(object):
    """
    A container object for a set of chapters.

    This allows us to keep track of document frequencies when computing them the
    first time so we don't repeat computations for common words. It also handles
    the PlaintextCorpusReader functions for us.
    """

    def __init__(self, chapter_paths):
        """
        Create a new Document.

        chapter_paths - A list of the paths for chapters in the document.
        """
        self.corpus = PlaintextCorpusReader("", chapter_paths)
        self.chapter_lists = self._sanitize_chapters()
        self.chapter_dists = [(FreqDist(chapter), chapter) for chapter in
                self.chapter_lists]
        self.words = {}

    def get_chapters(self):
        return self.chapter_lists

    def average_chapter_frequency(self, word):
        freqs = []
        if word in self.words:
            return self.words[word]
        else:
            for (dist, wordlist) in self.chapter_dists:
                freqs.append(dist[word]/float(len(wordlist)))

            # Store and return the average frequency
            avg_frq = mean(freqs)
            self.words[word] = avg_frq
            return avg_frq

    def _sanitize_chapters(self):
        # Sanitize the wordlists and return them
        lists = [self.corpus.words(file_id) for file_id in
                self.corpus.fileids()]

        new_lists = []

        for word_list in lists:
            # Convert everything to lowercase (e.g. so "the" and "The" match)
            word_list = [word.lower() for word in word_list]
            # Remove any punctuation
            word_list = [re.sub(r'[^\w\s]', '', word) for word in word_list]
            # Remove stopwords, punctuation, and any empty word
            stops = stopwords.words('english')
            stops.append('')
            stops.append('said')
            word_list = [word for word in word_list if (word not in stops and
                word.isalpha())]

            new_lists.append(word_list)

        return new_lists
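A minimal usage sketch of the container above (the chapter paths are hypothetical, and it assumes the imports the class itself relies on, e.g. nltk's FreqDist and stopwords plus a mean() such as numpy's):

# Hypothetical chapter files; any plain-text paths will do.
doc = Document(["/tmp/novel/ch01.txt", "/tmp/novel/ch02.txt"])
print(len(doc.get_chapters()))                 # number of sanitized chapter word lists
print(doc.average_chapter_frequency("whale"))  # average per-chapter relative frequency, cached after the first lookup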
Example No. 15
def get_sub_directories(directory):
    files = PlaintextCorpusReader(directory, ".*")
    dirs = list()
    for f in files.fileids():
        if "/" in f:
            if (f[:f.index("/")] not in dirs):
                dirs.append(f[:f.index("/")])
    return dirs
Example No. 16
    def _compute_unigram_frequency(self):
        wordlists = PlaintextCorpusReader(self.prepared_training_data_root, '.*')
        tokenizer = TreebankWordTokenizer()
        total = len(wordlists.fileids())
        count = 0
        fdist = nltk.FreqDist()
        for fl in wordlists.fileids():
            count += 1
            fl_abs_path = os.path.join(self.prepared_training_data_root, fl)
            with open(fl_abs_path, 'r') as f:
                words = tokenizer.tokenize(f.read())
                fdist.update(words)
            print 'freqdist: %s of %s' % (count, total)

        with open(os.path.join(self.corpus_root, 'unigram_frequency.txt'), 'w') as f:
            f.writelines(['%s %s\n' % (word, freq) for (word, freq) in fdist.items()])
        return None
Example No. 17
 def _compute_biagram_frequency(self):
     if not os.path.exists(self.bigram_frequency_dir):
         os.mkdir(self.bigram_frequency_dir)
     wordlists = PlaintextCorpusReader(self.prepared_training_data_root, '.*')
     tokenizer = TreebankWordTokenizer()
     total = len(wordlists.fileids())
     count = 0
     for fl in wordlists.fileids():
         count += 1
         print 'freqdist: %s of %s' % (count, total)
         fl_abs_path = os.path.join(self.prepared_training_data_root, fl)
         with open(fl_abs_path, 'r') as f:
             words = tokenizer.tokenize(f.read())
             bi_words = nltk.bigrams(words)
             fdist = nltk.FreqDist(bi_words)
         with open(os.path.join(self.bigram_frequency_dir, fl), 'w') as f:
             f.writelines(['%s %s %s\n' % (word[0], word[1], freq) for (word, freq) in fdist.items()])
     return None
Example No. 18
def get_corpus(corpusdir):
	newcorpus = PlaintextCorpusReader(corpusdir, '.*')
	titles = newcorpus.fileids() # returns all the files in the dir
	words = []
	for title in titles:
		newcorpus_txt = newcorpus.words(title)
		words.extend([ e for e in newcorpus_txt if re.match(r"[a-zA-Z]",e)])
	
	return words
def get_coarse_level_features(dataset, output_file):
# accessing the corpus
    corpus_root = '/home1/c/cis530/data-hw2/' 
    dataset_path = corpus_root + dataset

# Reading the files from the directories
    files = PlaintextCorpusReader(dataset_path, '.*')
    ids = files.fileids()
    stopFile = PlaintextCorpusReader(corpus_root, 'stopwlist.txt')
    stops = stopFile.words()

#Opening a file that has to be written to
    out = open(output_file, 'w')

    for i in range(0,len(ids)):
#Initializing certain variables
        tokens_count=0
        types = 0
        non_stops_count=0
        sents_count = 0
        avg_sent_len=0
        cap_count = 0

        tokens=files.words(ids[i])
#Computing Number of Tokens
        tokens_count = len(tokens)

#Computing Number of types
        types = len(set(tokens))
        non_stops=[]

#Computing Number of Content Words
        for t in tokens:
            if t not in stops:
                non_stops.append(t)
        non_stops_count = len(non_stops)

#Finding Average Sentence Length
        sent = []
        sent = files.sents(ids[i])
        sents_count = len(sent)
        sent_len=0
        for s in sent:
            sent_len = sent_len + len(s)
        avg_sent_len = sent_len/float(sents_count)

#Computing Number of Captilized Words
        for c in non_stops:
            if c.istitle():
                cap_count = cap_count+1
        current_file = dataset + '/' + ids[i]
        e = current_file.split('/')
        out.write(current_file +' '+ e[-2] + ' tok:' + str(tokens_count) + ' typ:' + \
str(types) + ' con:' + str(non_stops_count) + ' sen:' + str(sents_count) + ' len:' + str(avg_sent_len) + ' cap:' + str(cap_count)+ '\n')
        out.flush()
Example No. 20
    def calculate_errors(self, classifier, word_features, test_path, log=0):
        # load the pickle file with the classifier progress

        corpus_news = PlaintextCorpusReader(test_path, '.*\.txt')

        Gold = []
        Test = []

        TP = 0
        TN = 0
        FP = 0
        FN = 0

        Errors = []

        for file in corpus_news.fileids():
            category = file.split(self.DELIMITER)[-1].split('--')[0]
            Gold.append(category)
            words = corpus_news.words(file)
            testing_set = get_features(set(words), word_features)
            result = classifier.classify(testing_set)
            Test.append(result)
            if category == result == 'attack':
                TP += 1
            elif category == result == 'nonattack':
                TN += 1
            elif category != 'attack' and result == 'attack':
                Errors.append('{} : false positive {}'.format(file, result))
                FP += 1
            elif category == 'attack' and result != 'attack':
                Errors.append('{} : false negative {}'.format(file, result))
                FN += 1

        Accuracy = (TP + TN) / len(corpus_news.fileids())
        Precision = TP / (TP + FP)
        Recall = TP / (TP + FN)
        F1 = (2 * Precision * Recall) / (Precision + Recall)

        if log == 1:
            self.PrintResult(Accuracy, Precision, Recall, F1, Gold, Test)

        return Errors
Example No. 21
def get_coarse_level_features(dataset,output_file):

    output = open(output_file,'w')
    root = ROOT+dataset

    files = PlaintextCorpusReader(root,'.*')
    for fileid in files.fileids():
        __output_header(output,dataset,fileid)
        __write_coarse(output,files,fileid)
        output.write('\n')
    output.close()
def create_LM_on_dataset(dataset):
    
     corpus_root = '/home1/c/cis530/data-hw2/Language_model_set/'
     dataset_path = corpus_root + dataset
     files = PlaintextCorpusReader(dataset_path, '.*')
     ids = files.fileids()
     words = []
     for i in range(len(ids)):
         words = words + list(files.words(ids[i]))
     lang_model = NGramModel2(words,2)
     
     return lang_model
Example No. 23
def get_pos_features(dataset,feature_set_file,output_file):
    root = ROOT+dataset
    files = PlaintextCorpusReader(root,'.*')
    feature_list = open(feature_set_file).read().split()

    output = open(output_file,'w')
    for fileid in files.fileids():
        __output_header(output,dataset,fileid)
        __write_pos(output,files,fileid,feature_list)
        output.write('\n')
    output.close()
Example No. 24
def taille_corpus(corpus):
    taille = 0
    for dirs in os.walk(corpus):
        corpus_root = dirs[0] # walk down the directory tree of the corpus path
        if corpus_root != corpus:
            textlist = PlaintextCorpusReader(corpus_root,'.*')
            for files in textlist.fileids():
                test= corpus_root + '/' + files
                taille += os.path.getsize(test)
                
    return taille 
Example No. 25
    def stemming_files(self, source_folder, destination_folder):
        if not os.path.exists(destination_folder):
            os.makedirs(destination_folder)

        corpus_news = PlaintextCorpusReader(source_folder, '.*\.txt')

        for file in corpus_news.fileids():
            file_name = os.path.basename(os.path.normpath(file))
            words = corpus_news.words(file)
            stemmed_content = self.stemming_text(words)
            with open(destination_folder + "/" + file_name, 'w', encoding='utf8') as modified:
                modified.write(' '.join(stemmed_content))
Example No. 26
def doc_test(corpus,dictio_feature,categories):
    docs_test = []
    for dirs in os.walk(corpus):
        corpus_root = dirs[0]  # walk down the directory tree of the path
        if corpus_root != corpus and os.path.basename(corpus_root) in categories:
            classe_doc = os.path.basename(corpus_root)
            textlist = PlaintextCorpusReader(corpus_root,'.*')
            for files in textlist.fileids():
                test= corpus_root + '/' + files
                doc_cat = represent_doc_test(test, classe_doc, all_dictio)
                docs_test.append(doc_cat)

    return docs_test
def get_lm_features(dataset, output_file):      
    corpus_root = '/home1/c/cis530/data-hw2/'
    bigram_root = corpus_root + 'Language_model_set/'

    fin_files = PlaintextCorpusReader(bigram_root+'Finance/','.*')
    fin_words = list(fin_files.words())
    fin_model = NGramModel(fin_words, 2)

    health_files = PlaintextCorpusReader(bigram_root+'Health/','.*')
    health_words = list(health_files.words())
    health_model = NGramModel(health_words, 2)

    res_files = PlaintextCorpusReader(bigram_root+'Research/','.*')
    res_words = list(res_files.words())
    res_model = NGramModel(res_words, 2)

    com_files = PlaintextCorpusReader(bigram_root+'Computers_and_the_Internet/','.*')
    com_words = list(com_files.words())
    com_model = NGramModel(com_words, 2)

    test_files = PlaintextCorpusReader(corpus_root+dataset, '.*')
    ids = test_files.fileids()

    out_file = open(output_file,'w')

    for j in range(0,len(ids)):
        file_words = test_files.words(ids[j])
        out_str = ''
        current_file = dataset + '/'+ids[j]
        e = current_file.split('/')
        out_str = out_str + current_file+ ' '+e[-2]
        sum_fin=0
        sum_health=0
        sum_res=0
        sum_com=0                                                                         
        text_len = len(file_words)
        for i in range(1,len(file_words)):
            sum_fin = sum_fin + math.log(fin_model.prob((file_words[i-1],),file_words[i]))
            comp_fin = float((-sum_fin)*(1/float(text_len)))
            sum_health = sum_health + math.log(health_model.prob((file_words[i-1],),file_words[i]))

            comp_health = (float(-sum_health))*(1/float(text_len))
            sum_res = sum_res + math.log(res_model.prob((file_words[i-1],),file_words[i]))
            comp_res = (float(-sum_res))*(1/float(text_len))
            sum_com = sum_com + math.log(com_model.prob((file_words[i-1],),file_words[i])) 
            comp_com = (float(-sum_com))*(1/float(text_len))
            out_str = out_str + ' finprob:'+str(round(sum_fin,2))+' hlprob:'+str(round(sum_health,2))+' resprob:'\
+str(round(sum_res,2))+ ' coprob:' + str(round(sum_com,2)) + ' finper:' + str(round(comp_fin,2)) + ' hlper:'+\
str(round(comp_health,2))+ ' resper:' + str(round(comp_res,2)) + ' coper:' + str(round(comp_com,2)) 
        out_file.write(out_str + '\n')
        out_file.flush()
Example No. 28
 def occStats(self,path,format,list,plotting):
     wordlists = PlaintextCorpusReader(path,format)
     fileids = wordlists.fileids()
     k = len(fileids)
     
     # computing frequencies
     self.fileStats(path,fileids)
     
     # save stats
     statsname = "Base GQs (disjoint patterns)"
     savpath = plotting +'/'+ statsname.replace(' ', '-')
     
     # generating report
     SaveStats(self.classstats,self.stats,"",savpath,plotting) # all
Example No. 29
def occurrence_mot_i_corpus(mot,corpus):
    compteur = 0
    for dirs in os.walk(corpus):
        corpus_root = dirs[0] # walk down the directory tree of the corpus path
        if corpus_root != corpus:
            textlist = PlaintextCorpusReader(corpus_root,'.*')
            for files in textlist.fileids():
                test= corpus_root + '/' + files
                x = open(test,'r')
                for ligne in x:
                    if ligne.find(mot)>=0:
                        compteur+=1
                x.close()
    return compteur
Example No. 30
def carga_mongodb():

    client = pymongo.MongoClient(MONGODB_URI)
    db = client.docs
    docs=db.DOCS
    spanish_stops = set(stopwords.words('spanish'))
    newcorpus = PlaintextCorpusReader(corpus_root, '.*')
    newcorpus.fileids()

    for fileid in newcorpus.fileids():

        try:
            num_words = len(newcorpus.words(fileid))
            words = newcorpus.words(fileid)
            # num_sents = len(newcorpus.sents(fileid))
            # print(newcorpus.raw(fileid))
            #bcf = BigramCollocationFinder.from_words(words)
            #filter_stops = lambda w: len(w) < 3 or w in spanish_stops
            #bcf.apply_word_filter(filter_stops)
            tags_array=vocab_words(newcorpus.raw(fileid))
            tags=tags_array[0]
            tags_vocab=tags_array[1]
            cloud=tags_array[2]
            total_cloud=[]

            for c in cloud:
                reg={}
                reg['word']=c[0]
                reg['total']=c[1]
                total_cloud.append(reg)

            # insert the document
            post = {"nombre":  fileid, "fecha": datetime.datetime.utcnow(), "texto":preparar_texto(newcorpus.raw(fileid)), "tags_vocab":tags_vocab, "tags":tags, "enc":random.randint(1, 50), "pos":random.randint(1, 10), "neg":random.randint(1, 5), "num_words":num_words, "cloud":total_cloud}
            post_id = docs.insert_one(post).inserted_id

        except:
            print("Importacion Fallida:" + fileid)
    dl_feature_types[i]=dl_line_words[1]
    dl_feature_values[i]=dl_line_words[2]  # read the feature values from the decision-list text file and keep them in a separate list for comparison
    dl_tag_values[i]=dl_line_words[3]      # read the tag values from the decision-list text file and keep them in a separate list for comparison


fp_train_sorted_decision_list.close()

# Reading data from the test text file
#############################################

from nltk.corpus import PlaintextCorpusReader
corpus_root = '/'
corpus_root = 'WSD/shir_4 folds_texts/shir_test/' # the WSD folder, which contains the listed subfolders, must be placed inside the python34 directory (or that of whichever other Python version is installed on the machine)
peykare = PlaintextCorpusReader(corpus_root, '.*') 

f=peykare.fileids()
l=len(f)

fp=peykare.open(f[3])

peykare_lines=fp.read().split('\n')
peykare_lines_count=len(peykare_lines) # number of lines in the corpus

fp.seek(0)


test_given_lines_tags=['' for i in range(peykare_lines_count)]
for i in range(peykare_lines_count):
    line_str=fp.readline()       
    line_words=line_str.split()       
    test_given_lines_tags[i]=line_words[-1]     # store each line's tag from the file in the test_given_lines_tags vector
Example No. 32
from nltk.corpus import PlaintextCorpusReader

corpus_root = './twitter_data'
wordlists = PlaintextCorpusReader(corpus_root, '.*\.txt')
id = wordlists.fileids()[0]

print(wordlists.words(id))
Example No. 33
def load_corpus():
    print(gutenberg.root)
    from nltk.corpus import PlaintextCorpusReader
    corpus_root = '/path/of/corpus'
    wordlists = PlaintextCorpusReader(corpus_root, '.*')
    wordlists.fileids()
Example No. 34
    corpus = PlaintextCorpusReader(corpus_root, exp_archivos)

if dataset == 4:
    #tweets
    corpus_root = '/home/mguevara/datasets/tweet/'
    export_indices = '/home/mguevara/datasets/info/indices/tweet/'
    export_matrices = '/home/mguevara/datasets/info/matrices/tweet/'
    export_vocabularios = '/home/mguevara/datasets/info/vocabularios/tweet/'
    exp_archivos = '.*'
    termino_ejemplo = 'jaja'  #'articulo857.txt'
    documento_ejemplo = '12Jul-3.json.txt'
    print_titulo("CREAR CORPUS")
    from nltk.corpus import PlaintextCorpusReader
    corpus = PlaintextCorpusReader(corpus_root, exp_archivos)

if len(corpus.words()) < 100: print corpus.fileids()
print corpus.fileids()
print "\n\t\t\t\t...Corpus creado"

a_stopwords = 0
a_porter = 0
a_lema = 0
a_alpha = 1
a_lower = 0
verbose = 0  # to show messages in the MiraVocab function

#corpus_root = '/home/mguevara/datasets'
#corpus = PlaintextCorpusReader(corpus_root, 'reuters/.*')
#print inverted_index['SELZ']
#termino_ejemplo = 'machine'
Example No. 35
    #1 Overview of using collocations
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    finder = nltk.collocations.BigramCollocationFinder.from_words(pairs)
    f2 = nltk.collocations.BigramCollocationFinder.from_words(pairs)
    for i in range(2, 6):
        f2.apply_freq_filter(i)
    scored = finder.score_ngrams(bigram_measures.raw_freq)
    word_fd = nltk.FreqDist(tokens)
    bigram_fd = nltk.FreqDist(nltk.bigrams(tokens))
    finder = BigramCollocationFinder(word_fd, bigram_fd)
    #need to be examined which one is better
    #print sorted(finder.nbest(trigram_measures.raw_freq, 2))
    print sorted(finder.nbest(trigram_measures.raw_freq, 12))
    return sorted(finder.nbest(trigram_measures.raw_freq, 15))


#Texas folder collocation start
corpus_root = "Islip13Rain"
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
ClassEvent = nltk.Text(wordlists.words())
artcle, nonArtcl = 0, 0
art, nonArt = [], []
extractTxt = ' '
topWords = extractTopWrds(ClassEvent)
phrases = extractPhrases(ClassEvent)

print topWords
print phrases
Example No. 36
    table3['fecha']=table3['fecha'].str.upper()
    table3['fecha_format']=table3['fecha'].apply(format_date)
    table3=table3[['key','clase_doc','Description','fecha','fecha_format']]

    if table1.empty:
        new_row = {'Description':'Not found', 'clase_doc':'Not Found'}
        table1 = table1.append(new_row, ignore_index=True)
        table1=table1[['Description','clase_doc']]
        
    return(table3)

# Main execution

# load the txt corpus
corpus = PlaintextCorpusReader(path_input_txt, file_ids_text)
ids = corpus.fileids()

# load the ocr corpus
corpus_ocr = PlaintextCorpusReader(path_input_ocr, file_ids_ocr)
ids_ocr = corpus_ocr.fileids()

# expand the black_lists with more possibilities
black_list_magistrado=list_concordances_ltc(black_list_magistrado)
black_list_opositor=list_concordances_ltc(black_list_opositor)
black_list_area=list_concordances_ltc(black_list_area)

########################
###  RDS connection  ###
########################
engine = create_engine(t96.sqlConnString)
engine.execute("insert into tt_log_transaccion (operacion, comentario) values ('Extracion opositor','Inicia proceso')")
from __future__ import division
import nltk, re, pprint
from urllib import urlopen
from nltk.corpus import PlaintextCorpusReader, stopwords
from HTMLParser import HTMLParser
from nltk.tokenize import *
import shutil, os
from bs4 import BeautifulSoup

corpus_root = 'C:\Users\Brent\Documents\My Research\Supreme Court Justices\Output Documentation\\1959\\yo\\'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
fileIds = wordlists.fileids()

for i in range(1959, 2014):
    for name in fileIds:
        print name
        if str(i) in name:
            os.rename(
                'C:\Users\Brent\Documents\My Research\Supreme Court Justices\Output Documentation\\1959\\yo\\'
                + name,
                'C:\Users\Brent\Documents\My Research\Supreme Court Justices\Output Documentation\\1959'
            )
# Reading the e960401 file for basic manipulation

import nltk
from nltk.corpus import PlaintextCorpusReader

corpus_root = '../../../Corpus'

excelsior = PlaintextCorpusReader(corpus_root, '.*\.txt')
print("Available articles ", excelsior.fileids())

article_name = 'e960401.txt'
article = excelsior.words(fileids=article_name)
article_lower = [w.lower() for w in article]

print(article_name, " has ", len(article_lower), " tokens.")

vocabulary = sorted(set(article_lower))
print(vocabulary)
print(article_name, " has a vocabulary length of ", len(vocabulary), ".")
text = nltk.Text(article_lower)
# text.concordance('empresa')

bag = []
for cl in text.concordance_list('empresa'):
    left = list(cl[0][-4:])
    right = list(cl[2][:4])
    bag += left
    bag += right

print("The bag of words of 'empresa' is: ", bag)
Example No. 39
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords

import string

corpus_root = "abstracts"
wordlists = PlaintextCorpusReader(corpus_root, '.*')

all_words_list = []
for fid in wordlists.fileids():
    try:
        all_words_list += list(wordlists.words(fid))
    except Exception as e:
        print e

fd = FreqDist(Text([w.lower() for w in all_words_list]))

vocabulary = fd.keys()
clean_vocabulary = [
    v for v in vocabulary
    if v not in stopwords.words("english") and v not in string.punctuation
]

print clean_vocabulary[:50]

# TODOs:
# 1. Take care of non-meaningful words, like "1", ").", etc.
# 2. In the whole vocabulary, there are some words like "\x00", why?
Example No. 40
    'to', 'for', 'it', 'in', 'on'
]


def create_word_features(clist):
    useful_words = [word for word in clist if word not in long_stop_list]
    my_dict = dict([(word, True) for word in useful_words])
    return my_dict


corpus_root = 'C:/Users/Bindu/Desktop/samp1/neg'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
neg_re = []
mylist = []
x = []
for fileids in wordlists.fileids():
    words = wordlists.words(fileids)
    neg_re.append((create_word_features(words), "Depressed"))

corpus_root = 'C:/Users/Bindu/Desktop/samp1/pos'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
pos_re = []
mylist = []
x = []
for fileids in wordlists.fileids():
    words = wordlists.words(fileids)
    pos_re.append((create_word_features(words), "Not depressed"))

train_set = neg_re[:45] + pos_re[:45]
test_set = neg_re[45:] + pos_re[45:]
print(len(train_set), len(test_set))
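The train/test pairs above are already in the (feature_dict, label) shape that nltk's trainable classifiers expect; a hedged sketch of the next step (not part of the original snippet) could be:

import nltk

# train_set / test_set as built above: lists of (feature_dict, label) pairs.
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(10)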
Example No. 41
for i in range(20): #loop 20 times
	fd.tabulate(counter, counter+10)
	raw_input("Hit Enter")
	counter += 10
	
#############################################################
#or we might want to create an nltk corpus
#############################################################
from nltk.corpus import PlaintextCorpusReader
#http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.plaintext.PlaintextCorpusReader-class.html
summaries_root = "/Volumes/Optibay-1TB/Python/scrapingCode/billSummaries/" #say where your text file is
billsCorpora = PlaintextCorpusReader(summaries_root, r'h[1-9].*\.txt') #read it into the corpora
dir(billsCorpora) # see what methods can be used on the corpora

rawBills = billsCorpora.raw() #all the rawtext
billsCorpora.fileids() #show the files in the corpora
billsCorpora.fileids()[0] #show the first fileid

billsCorpora.raw()[1:30]
len(billsCorpora.fileids()) #show how many files
len(billsCorpora.sents())

#billsCorpora.fileids('h1.txt')
billsCorpora.raw() #the whole corpora
billsCorpora.sents() #all of the sentences in the corpora
billsCorpora.sents()[1]
billsCorpora.words() #all of the words in all of the corpora
billsCorpora.words('h1.txt')  #all of the words in one corposa
twoBillsWords = billsCorpora.words(['h1.txt', 'h1447.txt'])
twoBillsWords[500:525] #note that punctuation is included and bills have not been lemmatized
billWords = billsCorpora.words()
Example No. 42
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, TruncatedSVD
#from sklearn.pipeline import make_pipeline
import numpy as np
import matplotlib.pyplot as plt
import re
import pandas as pd

print("Creating corpus")

corpus_root = './docs/'  #directory path for txt files
#ls -l ./docs | wc -l #to get number of files in dir
newCorpus = PlaintextCorpusReader(corpus_root, '[a-zA-Z0-9_\-]+.txt')

files = newCorpus.fileids()

print('Extracting text from all docs..')
docs = []  #empty list, append from corpus
row = []  # row index used to grab hand-coded labels and file names from an external csv file
for f in files:
    docs.append(newCorpus.raw(fileids=f))
    row.append(int(re.sub('(file)|(-[1-9].txt)', '', f)))  #document numbers

## vectorize the words (i.e. count and transform in one step): unigrams and bigrams

print('Creating term document matrix with TfIdf vectorizer..')
vectorizer = TfidfVectorizer(min_df=0.15,
                             max_df=.5,
                             stop_words='english',
Example No. 43
                            categories_10)
 #                print corpus_test_OVA
 fin = time.clock()
 print '---Duree :', fin - deb, 'secondes'
 print 'le nombre de documents a tester est le suivant:', len(
     corpus_test_OVA)
 cmpt_correct = 0
 print '---debut du processus de test---'
 deb = time.clock()
 for dirs in os.walk(folder_path_test):
     corpus_root = dirs[0]  # walk down the directory tree of the path
     if corpus_root != folder_path_test and os.path.basename(
             corpus_root) in categories_10:
         classe_doc = os.path.basename(corpus_root)
         textlist = PlaintextCorpusReader(corpus_root, '.*')
         for files in textlist.fileids()[:1]:
             test = corpus_root + '/' + files
             #        for (fs,l) in corpus_test_OVA:
             #            print '---classification---'
             dict_result_classif = classify_one_doc(
                 test, dict_models, categories_10)
             classe_predite = OVA(dict_result_classif,
                                  categories_10)
             print 'la classe predite pour le document est la suivante:', classe_predite
             print 'label attendu:', classe_doc
             if classe_predite == classe_doc:
                 correct = 'true'
             else:
                 correct = 'false'
             print 'verification du label predit, true sil correspond et false si non:', correct
             if correct == 'true':
Example No. 44
        if a + 1 < len(sys.argv) and not sys.argv[a + 1].startswith("-"):
            corpus_root = sys.argv[a + 1]
            i = i + 1

    a = a + i

print "Corpus is", corpus_root
print 'Prob. of n-grams depend on history:', CG_REPR
print 'Good-Turing:', GT_SMOOTH
print 'Kneser-Ney:', KN_SMOOTH

start = time.time()

# Load corpus
corpus = PlaintextCorpusReader(corpus_root, '.*txt', encoding='UTF-8')
n_texts = len(corpus.fileids())

# Matrix containing features; a list for each text
feature_matrix = [[] for i in range(n_texts)]
#print feature_matrix

# List of classes, one for each text in corpus
text_classes = fextract_helper.find_classes(corpus.fileids())

if char_ngrams:
    s1 = time.time()
    a, t = fextract_helper.char_ngram_stats(corpus.fileids(), corpus, \
                                            char_ngram_size, CG_REPR or KN_SMOOTH)
    e1 = time.time()
    print 'Char: Finding took', e1 - s1, 'seconds'
Example No. 45
import nltk, re, pprint, string
from gensim import corpora, models, similarities
import codecs
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords, gutenberg, PlaintextCorpusReader

gutenberg_dir = nltk.data.find('corpora/gutenberg.zip').join('gutenberg/')
root = '/usr/local/share/nltk_data/corpora/gutenberg/'
reader = PlaintextCorpusReader(
    gutenberg_dir,
    '.*\emma.txt')  # doctest: +SKIP  #actual regexp should read '.*\.txt'

documents = []

for f in reader.fileids():
    for sent in reader.sents(f):
        documents.append(sent)
        #print(sent)

print(documents[1:100])

stoplist = stopwords.words('english')
punctuation = string.punctuation + "''"
print(punctuation)

texts = [[
    word.lower() for word in document
    if (word not in stoplist and not re.search("\W.*", word))
] for document in documents]
print(texts[1:100])
Example No. 46
import nltk
from nltk.corpus import PlaintextCorpusReader
corpus_root = 'E:\\NLP\\MyFiles'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
list = wordlists.fileids()
print(list)
list2 = wordlists.words('kc.txt')
print(list2)
Example No. 47
from textrank.keyword_extraction import extract_keywords
from nltk.corpus import PlaintextCorpusReader
import os.path
import random

path = os.path.join(os.path.abspath(os.path.dirname(__file__)), "citeulike180",
                    "documents")

print("Parsing corpus from path {0}".format(path))

corpus = PlaintextCorpusReader(path, ".*", encoding="latin-1")

files = corpus.fileids()

print("Found {0} files within corpus.".format(len(files)))

fileid = random.randint(0, len(files) - 1)

file = files[fileid]

print("Randomly selected file {0} for processing.".format(file))

print("Extracting keywords...")

print(extract_keywords(corpus.raw(file)))
Example No. 48
"""

# Creating a Corpus with Python

import matplotlib.pyplot as plt 
import nltk # Already bundled with Anaconda

# nltk.download() 
    # opens a download window for all the additional packages

# Creating the Corpus:
from nltk.corpus import PlaintextCorpusReader 

corpus = PlaintextCorpusReader('Dados', '.*') # .* = all file extensions

arquivos = corpus.fileids() # Builds a list of every file in the folder

# Inspecting the files:
arquivos[0]
arquivos[0:100]
for a in arquivos:
    print(a)
    
# Viewing the text of a single file:
texto = corpus.raw('1.txt')

# Accessing the text of the whole corpus:
todo_texto = corpus.raw()

# Accessing every word that exists in the corpus:
palavras = corpus.words()
Example No. 49
import random
import os
import shutil
from nltk.corpus import PlaintextCorpusReader

corpus_root = "/Users/epb/Documents/uni/kandidat/speciale/data/dw/almedad/all_known"
#corpus_root = "/Users/epb/Documents/uni/kandidat/speciale/data/dw/ansar1/all"
output_dir = "/Users/epb/Documents/uni/kandidat/speciale/data/dw/almedad/al3"
#output_dir = "/Users/epb/Documents/uni/kandidat/speciale/data/dw/ansar1/an9"

N_TEXTS = 3000

B = 50

corpus = PlaintextCorpusReader(corpus_root, '.*txt', encoding="UTF-8")
n = len(corpus.fileids())

#start = random.randint(0,n)
#print 'start', start
count = 0
#next = start
picked = []

b = 0
next = random.randint(0, n - B)

while count < N_TEXTS:

    b = 0

    while b < B:
Example No. 50
from nltk.corpus import PlaintextCorpusReader
corpus_root = "./SH"
my_corpus = PlaintextCorpusReader(corpus_root, '[^__].*txt')
print(my_corpus.fileids())
#my_corpus.words('hound_of_baskerville.txt')[10:20]
#sentOut = my_corpus.sents('hound_of_baskerville.txt')[10]
#print(sentOut)
from nltk.text import Text
hound = Text(my_corpus.words('hound_of_baskerville.txt'))
#hound.concordance("Watson")
#hound.similar("hound")
#hound.collocations()
#from nltk.probability import FreqDist
#my_fdist = FreqDist(hound)
#top_100 = my_fdist.most_common(100)
#print(top_100[50:99])
#my_fdist.hapaxes()
#hound.dispersion_plot(["Holmes","Watson"])
#hound.dispersion_plot(["Stapleton", "Henry", "Barrymore"])

#ne_chunk: named-entity chunking ; pos_tag: part-of-speech tagging ; word_tokenize: tokenization
import nltk
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
from collections import Counter


def get_characters(words_param):
    chunked = ne_chunk(pos_tag(words_param))
    prev = None
    continuous_chunk = []
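The get_characters function above is cut off here; as a separate, self-contained sketch of the same ne_chunk/pos_tag pipeline (the helper name person_names is illustrative, not from the original), PERSON entities can be collected like this:

from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

def person_names(text):
    # Tokenize, POS-tag, then chunk named entities; keep only PERSON subtrees.
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    names = []
    for subtree in chunked:
        if isinstance(subtree, Tree) and subtree.label() == 'PERSON':
            names.append(" ".join(token for token, pos in subtree.leaves()))
    return names

# person_names("Sherlock Holmes spoke with Dr. Watson.")  # requires the usual nltk data packages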
Example No. 51



from nltk import word_tokenize
from nltk.tag import pos_tag
V = ['VB', 'VBZ', 'VBP', 'VBD', 'VBG']
N = ['NN', 'NNS', 'NNP', 'NNPS']
ADV = ['RB', 'RBR', 'RBS']
ADJ = ['JJ', 'JJR', 'JJS']
wLen = []       # number of words
vLen = []       # number of verbs
advLen = []     # number of adverbs
adjLen = []     # number of adjectives
vLen, nLen, advLen, adjLen, wLen = ([] for i in range(5))
for fileid in newcorpus.fileids():
 tokens = word_tokenize(newcorpus.raw(fileid))
 words = [t for t in tokens if t.isalpha()]
 taggedW = pos_tag(words)
 verbs, nouns, advs, adjs = ([] for i in range(4))
 for (w,tag) in taggedW:
     if tag in V: verbs.append(w)
     elif tag in N: nouns.append(w)
     elif tag in ADV: advs.append(w)
     elif tag in ADJ: adjs.append(w)
 wLen.append(len(words))
 vLen.append(len(verbs))
 nLen.append(len(nouns))
 advLen.append(len(advs))
 adjLen.append(len(adjs))
Example No. 52
        return True
    else:
        return False


############# Loading, processing, and bigrams for Democrat speeches ###########

dem_root = "E:/Documents/IST 664/Final Project/Democrats"  # change this line as appropriate
dems = PlaintextCorpusReader(dem_root, [
    "Arizona.txt", "Connecticut.txt", "Illinois.txt", "Kentucky.txt",
    "Louisiana.txt", "Maine.txt", "Massachusetts.txt", "Michigan.txt",
    "New Hampshire.txt", "Vermont.txt"
])
dem_sentences = []
for i in range(10):
    temp = dems.fileids()[i]
    temptext = dems.raw(temp)
    tempsent = sent_tokenize(temptext)
    dem_sentences = dem_sentences + tempsent

Dems = []
for i in range(10):
    temp = dems.fileids()[i]
    temptext = dems.raw(temp)
    tempTokens = nltk.word_tokenize(temptext)
    Dems = Dems + tempTokens

Demwords = [w.lower() for w in Dems]
DemAlphaWords = [w for w in Demwords if not alpha_filter(w)]
DemStoppedWords = [w for w in DemAlphaWords if not w in stopwords]
print("Democrats used",
Example No. 53
	print(calculate_lines_II(volume))	

# In order to calculate the number of words and sentences in each volume,
# we made a new corpus of the volumes using the nltk PlaintextCorpusReader,
# which has some easy tools that can split a text into a list of words or sentences.

corpus_root= 'data'
volumes = PlaintextCorpusReader(corpus_root, 'arabian.*')

list_of_sentences = volumes.sents()
print('The ten volumes consist of ' + str(len(list_of_sentences)) + ' sentences')

list_of_words = volumes.words()
print('The ten volumes consist of ' + str(len(list_of_words)) + ' words')

for item in volumes.fileids(): #calculate the amount of words in each volume
	print(item,':', len(volumes.words(item)), 'words')

for item in volumes.fileids(): #calculate the amount of sentences in each volume
	print(item,':', len(volumes.sents(item)), 'sentences')	


##################################################################
#visualisation of the statistics with basic plotting techniques
##################################################################

# Visualise the characters per volume
# and make a list of the total characters per volume. 

characters_per_volume = (calculate_characters(corpus))[0]
x = [1,2,3,4,5,6,7,8,9,10]
Example No. 54
def edit_nltk():
    corpus_root = r'/data'
    books = PlaintextCorpusReader(corpus_root, '.*')
    print(books.fileids())
Example No. 55
# Get the most frequent word in the corpus file
palabra_mas_frecuente = fdist.max()
print(
    "\n--------------------------------------------------------------------------------\n "
)
print("\n\n1.9) Palabra mas frecuente del fichero del corpus: \n" +
      str(palabra_mas_frecuente))

# Load the PoliformaT files ("spam.txt", "quijote.txt" and "tirantloblanc.txt") as a custom corpus.
from nltk.corpus import PlaintextCorpusReader

corpus_root = './ficheros'

new_corpus = PlaintextCorpusReader(corpus_root, '.*')
lista_ficheros = new_corpus.fileids()
print(
    "\n--------------------------------------------------------------------------------\n "
)
print("\n\n1.10) Ficheros que componen el corpus: \n" + str(lista_ficheros) +
      "\n")
print(
    "\n-------------IMPORTANTE CAMBIAR LA RUTA DE LOS ARCHIVOS EN EL CÓDIGO------------\n "
)
print("Ruta actual : " + corpus_root)

# Compute the number of words, the number of distinct words, and the number of sentences in the three documents
print(
    "\n--------------------------------------------------------------------------------\n1.11) "
)
print("\n" + "Palabras".rjust(35, " ") + "Vocabulario".rjust(12, " ") +
Example No. 56
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import matplotlib.pyplot as plt
import nltk  #natural language toolkit

# opens a window where you can download all the packages
#nltk.download()

from nltk.corpus import PlaintextCorpusReader

corpus = PlaintextCorpusReader('Arquivos', '.*')  #.*: all extensions

arquivos = corpus.fileids()
arquivos[0]
arquivos[0:100]
for a in arquivos:
    print(a)

texto = corpus.raw('1.txt')  # contents of one file
todo_texto = corpus.raw()  # all the text from every file
palavras = corpus.words()  # not shown in the variable explorer and printed only as '...' in the console, since it is a very large vector
palavras[1]  # view the second word
len(palavras)  # word count
from nltk.corpus import PlaintextCorpusReader

# Link to download plays in txt format of Shakespeare - http://www.textfiles.com/etext/AUTHORS/SHAKESPEARE/

import os

corpus_root = os.getcwd() + '/'
file_ids = '.*.txt'
wordlists = PlaintextCorpusReader(corpus_root, file_ids)
print(wordlists.fileids())
print(
    wordlists.words(
        r'C:\Users\nathani_n\Desktop\Nlp_Udemy\shakespeare-taming-of-the-shrew.txt'
    ))
Example No. 58
import nltk
from nltk.corpus import PlaintextCorpusReader
import pickle

root = '../Charlotte/'
wordlist = PlaintextCorpusReader(root, '.*')
wordlist.fileids()
charlotte = wordlist.words('Charlotte.txt')
# other = wordlist.words('other.txt')
# join = charlotte + other

with open('nltk_german_classifier_data.pickle', 'rb') as f:
    tagger = pickle.load(f)

charlotte_tagged = tagger.tag(charlotte)

with open('tagged_charlotte_data.pickle', 'wb') as f:
    pickle.dump(charlotte_tagged, f)

charlotte_nouns = []

for word in charlotte_tagged:
    if word[1] == 'NN':
        charlotte_nouns.append(word[0])
Example No. 59
def extract_network(file):
    #POS tag text
    text = my_corpus.words(file)
    chunked = ne_chunk(pos_tag(text))

    #Extract list of people
    current_chunk = []
    for i in chunked:
        if type(i) == Tree:
            if (i.label() == 'PERSON'):
                current_chunk.append(" ".join(
                    [token for token, pos in i.leaves()]))

    #Create ordered list of how many mentions per name
    agg_list = Counter(current_chunk).most_common()
    agg_list.insert(0, ('Thinker', 'Frequency'))  #Insert headings for csv file
    #return agg_list

    #Write the list to a csv file with name in one column and frequency in the next
    myfile = open(
        file[:-4] + '_nechunk.csv', 'w', newline=''
    )  #Name csv after the text file, removing .txt and adding .csv
    with myfile:
        writer = csv.writer(myfile)
        writer.writerows(agg_list)
    print('Writing complete')


for book in my_corpus.fileids():
    extract_network(book)
Example No. 60
    'service_holiday_inn_london.txt.data',
    'service_swissotel_hotel_chicago.txt.data',
    'staff_bestwestern_hotel_sfo.txt.data', 'staff_swissotel_chicago.txt.data'
])
wsj = PlaintextCorpusReader(corpus_root, [
    'accuracy_garmin_nuvi_255W_gps.txt.data',
    'directions_garmin_nuvi_255W_gps.txt.data',
    'display_garmin_nuvi_255W_gps.txt.data',
    'satellite_garmin_nuvi_255W_gps.txt.data',
    'screen_garmin_nuvi_255W_gps.txt.data',
    'speed_garmin_nuvi_255W_gps.txt.data',
    'updates_garmin_nuvi_255W_gps.txt.data',
    'voice_garmin_nuvi_255W_gps.txt.data'
])

print wordlists.fileids()
print wsj.fileids()

print(len(wordlists.sents()))
senLengths1 = [len(s) for s in wordlists.sents()]
freqDist1 = nltk.FreqDist(senLengths1)

print(len(wsj.sents()))
senLengths2 = [len(s) for s in wsj.sents()]
freqDist2 = nltk.FreqDist(senLengths2)

propDist1 = nltk.DictionaryProbDist(freqDist1, normalize=True)
propDist2 = nltk.DictionaryProbDist(freqDist2, normalize=True)

myfile = open('../Thesis/wsjdist2.dat', 'wb')
wr = csv.writer(myfile, quoting=csv.QUOTE_NONE)