Example #1
def read_and_train_doc2vec(root_dir, fileids, output_file='', options={}):
    fileids = fileids if isinstance(fileids, list) else [fileids]
    fileids = [unicode(f, 'utf8') for f in fileids]
    output_file = output_file or '-'.join(fileids)
    output_file = u"{0}{1}-{2}".format(MODELS_DIR, output_file,
                                       options_to_string(options))
    reader = PlaintextCorpusReader(root=root_dir, fileids=fileids)
    try:
        docs = [
            TaggedDocument(reader.words(fileid), [fileid])
            for fileid in fileids
        ]
        train_and_save_doc2vec(docs, output_file, options)
    except UnicodeDecodeError:
        # Fall back to detecting each file's encoding with chardet.
        file_encodings = {}
        for fileid in fileids:
            with open(root_dir + fileid, 'rb') as f:
                file_content = f.read()
            file_encoding = chardet.detect(file_content)
            file_encodings[fileid] = file_encoding['encoding']
        reader._encoding = file_encodings
        docs = [
            TaggedDocument(reader.words(fileid), [fileid])
            for fileid in fileids
        ]
        train_and_save_doc2vec(docs, output_file, options)
Example #2
def compare(request):
    errors = []
    stats = []
    for x in range(1, 3):
        statistics = []
        cantoname = "canto" + str(x) + ".txt"
        w = PlaintextCorpusReader("./", cantoname)
        t = nltk.text.Text(w.words())
        l_lines = len(line_tokenize(w.raw()))
        l_uwords = len(set(w.words()))
        l_words = len(w.words())
        l_sents = len(w.sents())
        l_paras = len(w.paras())
        l_linperpara = l_lines / l_paras
        statistics.append(x)
        statistics.append("Number of Words - " + str(l_words))
        statistics.append("Number of Unique Words - " + str(l_uwords))
        statistics.append("Number of Sentences - " + str(l_sents))
        statistics.append("Number of Lines - " + str(l_lines))
        statistics.append("Number of Paras - " + str(l_paras))
        statistics.append("Number of Lines/Paras - " + str(l_linperpara))
        lexical_density = l_words / l_uwords
        l_wordpersent = l_words / l_sents
        statistics.append("Lexical Density (Total/Uniq) words - " + str(lexical_density))
        statistics.append("Words per sentence - " + str(l_wordpersent))
        stats.append(statistics)

    return render_to_response('compare.html', {'stats': stats})
 def my_bar(self, corpus, patt, n):
     wordlists = PlaintextCorpusReader(corpus, patt)
     fileids = wordlists.fileids()
     k = len(fileids)
     figA = pylab.figure(1)
     figB = pylab.figure(2)
     li = ['Brown corpus']
     for id in fileids:
         if k > 1:
             i = fileids.index(id) + 1
             words = wordlists.words(id)
             fre = FreqDist(word.lower() for word in words
                            if word.isalpha())
             self.bar_count(fre, n, figA, 2 * k, 2 * i, id, li)
             self.bar_freq(fre, n, figB, 2 * k, 2 * i, id, li)
             figA.savefig('/home/camilo/Desktop/complex-freq.pdf')
             figB.savefig('/home/camilo/Desktop/complex-relfreq.pdf')
         else:
             words = wordlists.words(id)
             fre = FreqDist(word.lower() for word in words
                            if word.isalpha())
             self.bar_count(fre, n, figA, k, 1, id, li)
             self.bar_freq(fre, n, figB, k, 1, id, li)
             figA.savefig('/home/camilo/Desktop/simple-freq.pdf')
             figB.savefig('/home/camilo/Desktop/simple-relfreq.pdf')
     pylab.show()
Example #4
def get_stopwords(wdir, inpath, outfile, mfw):
	"""
	Arguments:
	
	wdir (str): path to the working directory
	inpath (str): relative path to the input directory
	outfile (str): relative path to the output file
	mfw (int): number of most frequent words to include in the stop word list
	"""

	print("starting: get_stopwords...")
	
	corpus = PlaintextCorpusReader(os.path.join(wdir, inpath), ".*")
	
	#print(corpus.fileids())
	print("words in the corpus: " + str(len(corpus.words())))
	
	fdist_corpus = FreqDist(corpus.words())
	
	with open(os.path.join(wdir, outfile), "w", encoding="utf-8") as stopwords_out:
		
		# from the list of tuples, create a list with the X MFW
		top_words = [w[0] for w in fdist_corpus.most_common(mfw)]

		# store list, one word per line
		stopwords_out.write("\n".join(top_words))

	print("Done!")
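A minimal usage sketch for get_stopwords; the working directory, subfolder and file name below are placeholders, not paths from the original project:

# Write the 100 most frequent words of all files under /home/user/project/corpus
# to /home/user/project/stopwords.txt, one word per line.
get_stopwords("/home/user/project", "corpus", "stopwords.txt", 100)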
Example #5
def stats(request):
    errors = []
    statistics=[]
    if 'q' in request.GET:
        q = request.GET['q']
        if not q:
            errors.append('Enter a Canto Number')
        else:
            cantoname = "canto" + q + ".txt"
            w = PlaintextCorpusReader("./", cantoname)
            t = nltk.text.Text(w.words())
            l_lines = len(line_tokenize(w.raw()))
            l_uwords = len(set(w.words()))
            l_words = len(w.words())
            l_sents = len(w.sents())
            l_paras = len(w.paras())
            l_linperpara = l_lines / l_paras
            statistics.append("Number of Words - " + str(l_words))
            statistics.append("Number of Unique Words - " + str(l_uwords))
            statistics.append("Number of Sentences - " + str(l_sents))
            statistics.append("Number of Lines - " + str(l_lines))
            statistics.append("Number of Paras - " + str(l_paras))
            statistics.append("Number of Lines/Paras - " + str(l_linperpara))
            lexical_density = l_words / l_uwords
            l_wordpersent = l_words / l_sents
            statistics.append("Lexical Density (Total/Uniq) words - " + str(lexical_density))
            statistics.append("Words per sentence - " + str(l_wordpersent))
            return render_to_response('stats.html', {'statistics': statistics})
    return render_to_response('stats.html', {'errors': errors})
Example #6
 def my_count(self, corpus, patt, n, filename):
     wordlists = PlaintextCorpusReader(corpus, patt)
     fileids = wordlists.fileids()
     res = []
     for id in fileids:
         leng = len(wordlists.words(id))
         wordc = len(set(wordlists.words(id)))
         wor = "=> num corpus words: " + str(leng)
         dis = "=> num distinct words: " + str(wordc)
         # use float division so lexical richness is not truncated to an integer
         ric = "=> ind lex richness: " + str(float(leng) / wordc)
         res.append(dis)
         res.append(ric)
         res.append(wor)
         for word in sorted(set(wordlists.words(id))):
             freq = wordlists.words(id).count(word)
             # relative frequency as a percentage, rounded to one decimal place
             f = "(" + word.lower() + "," + str(round(100.0 * freq / leng, 1)) + ")"
             t = "(" + word.lower() + "," + str(freq) + "/" + str(leng) + ")"
             res.append(f)
             res.append(t)
     out = open("/home/camilo/" + filename, "w")
     try:
         for t in res[:n]:
             out.write(t + "\n")
     finally:
         out.close()
 def save_my_count(self, corpus, patt, n, filename):
     wordlists = PlaintextCorpusReader(corpus, patt)
     fileids = wordlists.fileids()
     res = []
     for id in fileids:
         leng = len(wordlists.words(id))
         wordc = len(set(wordlists.words(id)))
         wor = "=> corpus tokens: " + str(leng) + "\n"
         dis = "=> corpus token types: " + str(wordc) + "\n"
         ric = "=> ind lex richness: " + str(float(leng) / wordc) + "\n"
         res.append(dis)
         res.append(ric)
         res.append(wor)
         for word in sorted(set(wordlists.words(id))):
             freq = wordlists.words(id).count(word)
             f = "(" + word.lower() + "," + str(round(100.0 * freq / leng, 1)) + ")\n"
             t = "(" + word.lower() + "," + str(freq) + "/" + str(leng) + ")"
             res.append(f)
             res.append(t)
     out = open("../data/" + filename, "w")
     try:
         for t in res[:n]:
             out.write(t + "\n")
     finally:
         out.close()
Example #8
def hybrid_cfdist():
    sherlock_corpus = PlaintextCorpusReader(CORPUS_ROOT_SHERLOCK, '.*', encoding='utf-8')
    sherlock_bigrams = nltk.bigrams(sherlock_corpus.words())

    pokemon_corpus = PlaintextCorpusReader(CORPUS_ROOT_POKEMON, '.*', encoding='utf-8')
    pokemon_bigrams = nltk.bigrams(pokemon_corpus.words())

    # nltk.bigrams returns an iterator in NLTK 3, so materialize both before concatenating
    return nltk.ConditionalFreqDist(list(sherlock_bigrams) + list(pokemon_bigrams))
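A short sketch of how the returned conditional frequency distribution might be queried, assuming the corpus roots above exist and that the seed word actually occurs in the corpora:

cfd = hybrid_cfdist()
print(cfd['the'].max())   # most frequent word observed after 'the' in the combined bigrams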
Example #9
def main():
    current_directory = os.path.dirname(__file__)
    corpus_root = os.path.abspath(current_directory)
    wordlists = PlaintextCorpusReader(corpus_root, 'Islip13Rain/.*\.txt')
    wordlists.fileids()
    ClassEvent = nltk.Text(wordlists.words())
    CEWords = [
        "Long Island", "Weather Service", "flooding", "August", "heavy rains",
        "Wednesday", "Suffolk County", "New York", "rainfall", "record"
    ]

    # ClassEvent Statistics
    print "--------- CLASS EVENT STATISTICS -------------"
    print "ClassEvent non stopwords", non_stopword_fraction(ClassEvent)
    print "ClassEvent WORD LENGTH DISTRIBUTIONS:"
    print_word_length_distributions(ClassEvent)
    print "ClassEvent PERCENTAGE OF WORD OCCURRENCES:"
    print_percentage_of_word_in_collection(ClassEvent, CEWords)

    ClassEventLettersPerWord = average_letters_per_word(ClassEvent)
    ClassEventWordsPerSent = float(len(wordlists.words())) / len(wordlists.sents())
    ClassEventARI = (4.71 * ClassEventLettersPerWord) + (0.5 * \
     ClassEventWordsPerSent) - 21.43

    print "Average number of letters per word", ClassEventLettersPerWord
    print "Average number of words per sentence:", ClassEventWordsPerSent
    print "Automated Readability Index:", ClassEventARI

    print

    wordlists_event = PlaintextCorpusReader(corpus_root,
                                            "Texas_Wild_Fire/.*\.txt")
    wordlists_event.fileids()
    YourSmall = nltk.Text(wordlists_event.words())
    SmallEventWords = [
        "Fire", "Wildfire", "Water", "Damage", "Ground", "Burn", "Town",
        "Heat", "Wind", "Speed", "Size", "City", "People", "Home", "Weather",
        "Debris", "Death", "Smoke", "State", "Ash"
    ]

    # YourSmall statistics
    print "--------- YOUR SMALL STATISTICS --------------"
    print "Texas_Wild_Fire", non_stopword_fraction(YourSmall)
    print "YourSmall WORD LENGTH DISTRIBUTIONS:"
    print_word_length_distributions(YourSmall)
    print "YourSmall PERCENTAGE OF WORD OCCURRENCES:"
    print_percentage_of_word_in_collection(YourSmall, SmallEventWords)

    YourSmallLettersPerWord = average_letters_per_word(YourSmall)
    YourSmallWordsPerSent = float(len(wordlists_event.words())) / \
     len(wordlists_event.sents())
    YourSmallARI = (4.71 * YourSmallLettersPerWord) + (0.5 * \
     YourSmallWordsPerSent) - 21.43

    print "Average number of letters per word", YourSmallLettersPerWord
    print "Average number of words per sentence:", YourSmallWordsPerSent
    print "Automated Readability Index", YourSmallARI
Example #10
def get_coarse_level_features(dataset, output_file):
# accessing the corpus
    corpus_root = '/home1/c/cis530/data-hw2/' 
    dataset_path = corpus_root + dataset

# Reading the files from the directories
    files = PlaintextCorpusReader(dataset_path, '.*')
    ids = files.fileids()
    stopFile = PlaintextCorpusReader(corpus_root, 'stopwlist.txt')
    stops = stopFile.words()

#Opening a file that has to be written to
    out = open(output_file, 'w')

    for i in range(0, len(ids)):
#Initializing certain variables
        tokens_count=0
        types = 0
        non_stops_count=0
        sents_count = 0
        avg_sent_len=0
        cap_count = 0

        tokens=files.words(ids[i])
#Computing Number of Tokens
        tokens_count = len(tokens)

#Computing Number of types
        types = len(set(tokens))
        non_stops=[]

#Computing Number of Content Words
        for t in tokens:
            if t not in stops:
                non_stops.append(t)
        non_stops_count = len(non_stops)

#Finding Average Sentence Length
        sent = []
        sent = files.sents(ids[i])
        sents_count = len(sent)
        sent_len=0
        for s in sent:
            sent_len = sent_len + len(s)
        avg_sent_len = sent_len/float(sents_count)

#Computing Number of Capitalized Words
        for c in non_stops:
            if c.istitle():
                cap_count = cap_count+1
        current_file = dataset + '/' + ids[i]
        e = current_file.split('/')
        out.write(current_file +' '+ e[-2] + ' tok:' + str(tokens_count) + ' typ:' + \
str(types) + ' con:' + str(non_stops_count) + ' sen:' + str(sents_count) + ' len:' + str(avg_sent_len) + ' cap:' + str(cap_count)+ '\n')
        out.flush()
Example #11
    def corpus_metrics(self, corpus_path):
        corpus_news = PlaintextCorpusReader(corpus_path, '.*\.txt')

        print('Corpus documents',  len(corpus_news.fileids()))
        print('Train documents', len([c for c in corpus_news.fileids() if c.startswith('train')]))
        print('Dev documents', len([c for c in corpus_news.fileids() if c.startswith('dev')]))
        print('Test documents', len([c for c in corpus_news.fileids() if c.startswith('test')]))

        words = set(corpus_news.words())
        words = sorted(words)
        print('Corpus different words', len(words))


        longwords = [w for w in corpus_news.words() if len(w) > 2]

        fdist = nltk.FreqDist(longwords)

        bigramController = BigramController()

        bigrams = bigramController.BuildBrigramFeatures(longwords)

        bigramController.BigramStatistics(bigrams)



        trigramdist = nltk.FreqDist(nltk.trigrams(longwords))

        #fdist.plot(50, cumulative=False)

        print(fdist.most_common(20))
        print("Trigram distribution")
        print(trigramdist.most_common(20))

        words_attack = []
        files_attack = [f for f in corpus_news.fileids()
                        if os.path.basename(os.path.normpath(f)).startswith('attack--')]
        for file in files_attack:
            for w in corpus_news.words(file):
                words_attack.append(w)
        words_nonattack = []
        files_nonattack = [f for f in corpus_news.fileids()
                           if os.path.basename(os.path.normpath(f)).startswith('nonattack--')]
        for file in files_nonattack:
            for w in corpus_news.words(file):
                words_nonattack.append(w)


        words_bag = { }
        words_bag['attack'] = words_attack
        words_bag['nonattack'] = words_nonattack
        #print(words_bag['attack'])
        cfd = nltk.ConditionalFreqDist((category, word)
                                       for category in ['attack', 'nonattack']
                                       for word in words_bag[category]
                                       )
Example #12
def main():
	current_directory = os.path.dirname(__file__)
	corpus_root = os.path.abspath(current_directory)
	wordlists = PlaintextCorpusReader(corpus_root, 'Islip13Rain/.*\.txt')
	wordlists.fileids()
	ClassEvent = nltk.Text(wordlists.words())
	CEWords = ["Long Island", "Weather Service", "flooding", "August", 
		"heavy rains", "Wednesday", "Suffolk County", "New York", "rainfall",
		"record"]

	# ClassEvent Statistics
	print "--------- CLASS EVENT STATISTICS -------------"
	print "ClassEvent non stopwords", non_stopword_fraction(ClassEvent)	
	print "ClassEvent WORD LENGTH DISTRIBUTIONS:"
	print_word_length_distributions(ClassEvent)
	print "ClassEvent PERCENTAGE OF WORD OCCURRENCES:"
	print_percentage_of_word_in_collection(ClassEvent, CEWords)
	
	ClassEventLettersPerWord = average_letters_per_word(ClassEvent)
	ClassEventWordsPerSent = float(len(wordlists.words())) / len(wordlists.sents())
	ClassEventARI = (4.71 * ClassEventLettersPerWord) + (0.5 * \
		ClassEventWordsPerSent) - 21.43
	
	print "Average number of letters per word", ClassEventLettersPerWord
	print "Average number of words per sentence:", ClassEventWordsPerSent
	print "Automated Readability Index:", ClassEventARI


	print 

	wordlists_event = PlaintextCorpusReader(corpus_root, "Texas_Wild_Fire/.*\.txt")
	wordlists_event.fileids()
	YourSmall = nltk.Text(wordlists_event.words())
	SmallEventWords = ["Fire", "Wildfire", "Water", "Damage", "Ground", "Burn", 
		"Town", "Heat", "Wind", "Speed", "Size", "City", "People", "Home",
		"Weather", "Debris", "Death", "Smoke", "State", "Ash"]
	

	# YourSmall statistics
	print "--------- YOUR SMALL STATISTICS --------------"
	print "Texas_Wild_Fire", non_stopword_fraction(YourSmall)
	print "YourSmall WORD LENGTH DISTRIBUTIONS:"
	print_word_length_distributions(YourSmall)
	print "YourSmall PERCENTAGE OF WORD OCCURRENCES:"
	print_percentage_of_word_in_collection(YourSmall, SmallEventWords)
	
	YourSmallLettersPerWord = average_letters_per_word(YourSmall)
	YourSmallWordsPerSent = float(len(wordlists_event.words())) / \
		len(wordlists_event.sents())
	YourSmallARI = (4.71 * YourSmallLettersPerWord) + (0.5 * \
		YourSmallWordsPerSent) - 21.43

	print "Average number of letters per word", YourSmallLettersPerWord
	print "Average number of words per sentence:", YourSmallWordsPerSent
	print "Automated Readability Index", YourSmallARI
def generate_scoring_dictionary():
    # create lists of positive and negative words using Hu and Liu (2004) lists
    positive_list = PlaintextCorpusReader(directory, 'Hu_Liu_positive_word_list.txt', encoding = 'latin-1')
    negative_list = PlaintextCorpusReader(directory, 'Hu_Liu_negative_word_list.txt', encoding = 'latin-1')
    positive_words = positive_list.words()
    negative_words = negative_list.words()

    # define bag-of-words dictionaries
    positive_scoring = dict([(word, 1) for word in positive_words])
    negative_scoring = dict([(word, -1) for word in negative_words])
    scoring_dictionary = dict(list(positive_scoring.items()) + list(negative_scoring.items()))

    return scoring_dictionary
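A sketch of how such a scoring dictionary might be applied to a token list; the tokens are invented, and the global directory must already point at the folder holding the Hu and Liu word list files:

scores = generate_scoring_dictionary()
tokens = ['great', 'plot', 'but', 'terrible', 'acting']
sentiment = sum(scores.get(t, 0) for t in tokens)   # positive words +1, negative words -1
print(sentiment)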
Example #14
def loadCorpora():

    corpus_root = '/usr/share/dict'
    wordlists = PlaintextCorpusReader(corpus_root, '.*')
    wordlists.fileids()
    wordlists.words('connectives')

    corpus_root = r"C:\corpora\penntreebank\parsed\mrg\wsj"
    file_pattern = r".*/wsj_.*\.mrg" 
    ptb = BracketParseCorpusReader(corpus_root, file_pattern)
    ptb.fileids()
    len(ptb.sents())
    ptb.sents(fileids='20/wsj_2013.mrg')[19]
Example #15
def get_lm_features(dataset, output_file):      
    corpus_root = '/home1/c/cis530/data-hw2/'
    bigram_root = corpus_root + 'Language_model_set/'

    fin_files = PlaintextCorpusReader(bigram_root+'Finance/','.*')
    fin_words = list(fin_files.words())
    fin_model = NGramModel(fin_words, 2)

    health_files = PlaintextCorpusReader(bigram_root+'Health/','.*')
    health_words = list(health_files.words())
    health_model = NGramModel(health_words, 2)

    res_files = PlaintextCorpusReader(bigram_root+'Research/','.*')
    res_words = list(res_files.words())
    res_model = NGramModel(res_words, 2)

    com_files = PlaintextCorpusReader(bigram_root+'Computers_and_the_Internet/','.*')
    com_words = list(com_files.words())
    com_model = NGramModel(com_words, 2)

    test_files = PlaintextCorpusReader(corpus_root+dataset, '.*')
    ids = test_files.fileids()

    out_file = open(output_file,'w')

    for j in range(0,len(ids)):
        file_words = test_files.words(ids[j])
        out_str = ''
        current_file = dataset + '/'+ids[j]
        e = current_file.split('/')
        out_str = out_str + current_file+ ' '+e[-2]
        sum_fin=0
        sum_health=0
        sum_res=0
        sum_com=0                                                                         
        text_len = len(file_words)
        for i in range(1,len(file_words)):
            sum_fin = sum_fin + math.log(fin_model.prob((file_words[i-1],),file_words[i]))
            comp_fin = float((-sum_fin)*(1/float(text_len)))
            sum_health = sum_health + math.log(health_model.prob((file_words[i-1],),file_words[i]))

            comp_health = (float(-sum_health))*(1/float(text_len))
            sum_res = sum_res + math.log(res_model.prob((file_words[i-1],),file_words[i]))
            comp_res = (float(-sum_res))*(1/float(text_len))
            sum_com = sum_com + math.log(com_model.prob((file_words[i-1],),file_words[i])) 
            comp_com = (float(-sum_com))*(1/float(text_len))
            out_str = out_str + ' finprob:'+str(round(sum_fin,2))+' hlprob:'+str(round(sum_health,2))+' resprob:'\
+str(round(sum_res,2))+ ' coprob:' + str(round(sum_com,2)) + ' finper:' + str(round(comp_fin,2)) + ' hlper:'+\
str(round(comp_health,2))+ ' resper:' + str(round(comp_res,2)) + ' coper:' + str(round(comp_com,2)) 
        out_file.write(out_str + '\n')
        out_file.flush()
Example #16
def get_sent():
    corpus_root = 'dict'
    # corpus_root = '/Users/abirqasem/nlp/dict'
    sentence = PlaintextCorpusReader(corpus_root, 'test3.txt')
    words = sentence.words()
    result = {"sentence": sentence, "words": words}
    return result
Example #17
def prepare_pos_features(Language_model_set, output_file):
    corpus_root = '/home1/c/cis530/data-hw2/' + Language_model_set
    texts = PlaintextCorpusReader(corpus_root, '.*')
    text = texts.words()
    tagged_text = nltk.pos_tag(text)
    merged_tag_text = mergeTags(tagged_text)
    lists = seperate_pos(merged_tag_text)
    nouns_dist = FreqDist(lists[0])
    top_nouns = nouns_dist.keys()[:200]
    verbs_dist = FreqDist(lists[1])
    top_verbs =verbs_dist.keys()[:200]
    advs_dist = FreqDist(lists[2])
    top_advs =advs_dist.keys()[:100]
    prep_dist = FreqDist(lists[3])
    top_preps =prep_dist.keys()[:100]
    adjs_dist = FreqDist(lists[4])
    top_adjs =adjs_dist.keys()[:200]


    out = open(output_file, 'w')

    for n in top_nouns:
        out.write('NN'+ n + '\n')
    for v in top_verbs:
        out.write('VV'+ v + '\n')
    for av in top_advs:
        out.write('ADV'+ av + '\n')
    for p in top_preps:
        out.write('PREP'+ p + '\n')
    for aj in top_adjs:
        out.write('ADJ'+ aj + '\n')
Example #18
def extract_keys(deal_dirs, save_path, numbers, save_file, flag):
    """
    Batch-extract the most frequent keywords from a set of text files.
    :param deal_dirs: parent directory of the texts to process
    :param save_path: directory where the results are saved
    :param numbers: number of words to extract
    :param save_file: file the extraction results are saved to
    :param flag: if true, write each word together with its count
    :return:
    """

    news_corpus = PlaintextCorpusReader(deal_dirs, '.*')
    files = news_corpus.fileids()
    print(files)

    # create the output directory if it does not exist yet
    if not os.path.exists(save_path):
        os.makedirs(save_path)
        print(save_path, ' created successfully!')

    savepath = os.path.join(save_path, save_file)

    for file in files:
        deal_f = PlaintextCorpusReader(deal_dirs, ['{}'.format(file)])
        word_list = deal_f.words()

        fdist1 = nltk.FreqDist(word_list)
        result = fdist1.most_common(numbers)
        print(result)

        ss = ''
        for word, num in result:
            if flag:
                ss += ("%s: %s " % (word, num))
            else:
                ss += word + " "
        save_analy_result(savepath, ss + '\n')
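A hypothetical call to extract_keys; the directories and the count of 20 are illustrative only, and save_analy_result is the project's own helper referenced above:

# Extract the 20 most frequent words of every file under ./news_texts and
# append one line per file (with counts) to ./keyword_results/keywords.txt.
extract_keys('./news_texts', './keyword_results', 20, 'keywords.txt', True)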
def corpus_from_directory(path, filetype='.*'):
	'''
	Make a corpus of all files in a given directory. Can limit type by passing
	the desired extension, proper format is, e.g., '.*\.txt'
	'''
	corpus_reader = PlaintextCorpusReader(path, filetype)
	return nltk.Text( corpus_reader.words() )
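For example (the path is a placeholder), limiting the corpus to .txt files:

letters = corpus_from_directory('/data/letters', '.*\.txt')
print(len(letters))            # token count of the combined nltk.Text
letters.concordance('fire')    # nltk.Text then offers concordance, collocations, etc.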
Example #20
def setData(domain):
	
	# domain variable can take one of the following values
	#
	# "chicago_crime_data",
	# "economics",
	# "software_vulnerability",
	# "cyber_threat",
	# "articles",
	# "msds"

	
	corpus_root = getRoot(domain)					# based on the selected domain corpus root will hold the relative address of the corpus
	wordlists = PlaintextCorpusReader(corpus_root, '.*')		# NLTK's PlaintextCorpusReader loads the text files in the root
	words = wordlists.words()					# and extract all the words in each file 

	my_stopwords = nltk.corpus.stopwords.words('english')		# my_stopwords holds a list of non-relevant (stop) words in english
	content = [w for w in words if w.lower() not in my_stopwords]	# stop words are removed
	content = [w for w in content if len(w) > 2]			# words of two (2) characters or fewer are removed
	content = [w for w in content if not w.isdigit()]		# digit only words (e.g. "10", "30", "450") are removed

	result = {}							
	
	# a list of related words is created for each word in the content variable
	
	for word in content:						
		result[word] = []
		for sset in wn.synsets(word):				# the first synonym of a set is selected; this can be expanded to the rest of the words in the set for more accuracy, but at the cost of performance
			for synset in sset.hyponyms():			# a set of hyponyms is added for the main synonym
				result[word].append(synset.name[0:synset.name.find('.')])

	return result,content # both the synonyms and the original word corpus are returned
Example #21
 def plot_cfreq(self,corpus,patt,n):
     wordlists = PlaintextCorpusReader(corpus,patt)
     fileids = wordlists.fileids()
     for id in fileids:
         words = wordlists.words(id)
         fre = FreqDist(word.lower() for word in words if word.isalpha()) 
     return fre.plot(n,cumulative=True)
def get_coarse_level_features(dataset, output_file):
	# Import the corpus reader
	corpus_root = '/home1/c/cis530/data-hw2/'+dataset
	# Define the folder where the files are situated
	files_dataset = PlaintextCorpusReader(corpus_root, '.*')
	# Open the output_file
	output = open('/home1/c/cis530/data-hw2/'+output_file,'w')
	# Read the stopwlist
	stop_list = open('/home1/c/cis530/data-hw2/'+'stopwlist.txt').read()
	types_stop_list=stop_list.split()
	for fileid in files_dataset.fileids():
		# Output the docid
		output.write(dataset+'/'+fileid+' ')
		# Output the topic_name
		topic_name=fileid.split('/')[0]	
		output.write(topic_name+' ')
		# Output the num_tokens	
		tokens=files_dataset.words(fileid)
		output.write('tok:'+str(len(tokens))+' ')
		# Output the num_types
		types=set(tokens)
		output.write('typ:'+str(len(types))+' ')
		# Output the num_contents
		output.write('con:'+str(len([w for w in tokens if w not in types_stop_list]))+' ')
		# Output the num_sents
		sents = files_dataset.sents(fileid)
		output.write('sen:'+str(len(sents))+' ')
		# Output the avg_slen
		avg_slen=round(float(len(tokens))/float(len(sents)),2)
		output.write('len:'+str(avg_slen)+' ')
		# Output the num_caps
		output.write('cap:'+str(len([w for w in tokens if w[0]>='A' and w[0]<='Z'])))
		output.write('\n')
	output.close()
Example #23
 def plot_cfreq(self, corpus, patt, n):
     wordlists = PlaintextCorpusReader(corpus, patt)
     fileids = wordlists.fileids()
     for id in fileids:
         words = wordlists.words(id)
         fre = FreqDist(word.lower() for word in words if word.isalpha())
     return fre.plot(n, cumulative=True)
def get_all_words():

    #direc = "../nouns_steps/whole/"
    direc = "../filtered_letters/"

    words = set()

    corpus = PCR(direc, '.*')
    all_files = corpus.fileids()
    txt_files = []
    # first build a list of all files to be searched:
    for file in all_files:
        if ".txt" in file:
            txt_files.append(file)

    # then add every word from each of these files to the set (duplicates are skipped)
    for file in txt_files:
        text = corpus.words(file)
        for word in text:
            words.add(word.casefold())

    words = sorted(words)
    #print(words)
    #print(len(words))
    # json.dump(words, open("../all_nouns.json", 'w'))
    # with open('../all_nouns_set.pickle', 'wb') as f:
    #     pickle.dump(words, f)

    return words
class BigramModel:
        category_root=[]
        files_dataset_category=[]
        word_list=[]
        bigram=[]
        fd = []
        cfd = []
        def __init__(self,category,corpus_root):
                self.category_root=[]
                self.files_dataset_category=[]
                self.word_list=[]
                self.bigram=[]
                self.fd = []
                self.cfd = []
                self.category_root=corpus_root+'/'+category
                self.files_dataset_category=PlaintextCorpusReader(self.category_root,'.*')
                self.word_list = self.files_dataset_category.words()
                self.bigram = nltk.bigrams(self.word_list)
                self.fd = FreqDist(self.word_list)
                self.cfd = nltk.ConditionalFreqDist(self.bigram)
        def get_prob_and_per(self,word_list):
                # The function takes a word_list and return both the log probability and log perplexity under the language model 
                n_types = len(set(word_list))
                n_tokens=len(word_list)
                # Calculate Log Prob with Laplace smoothing.
                log_prob = math.log(self.fd[word_list[0]]+1)-math.log(n_tokens+n_types)  #initializing prob for the first word
                for (w1,w2) in nltk.bigrams(word_list):
                    log_prob = log_prob+math.log(self.cfd[w1][w2]+1)-math.log(len(self.cfd[w1].keys())+n_types)
                # Calculate Log Perplexity
                log_per=float(1)/float(-n_tokens)*log_prob
                return log_prob, log_per
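A brief usage sketch for the BigramModel class above; the corpus root and category are placeholders and must point at directories of plain text files:

# Build a bigram language model over <root>/Finance and score an arbitrary token list.
fin_model = BigramModel('Finance', '/home1/c/cis530/data-hw2/Language_model_set')
log_prob, log_per = fin_model.get_prob_and_per(['the', 'market', 'fell', 'today'])
print(log_prob, log_per)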
def get_lm_features(dataset,output_file):
        # Import the corpus reader
        corpus_root = '/home1/c/cis530/data-hw2/'+dataset
        # Define the folder where the files are situated
        files_dataset = PlaintextCorpusReader(corpus_root, '.*')
        # Build one bigram language model per category
        fin_model = BigramModel('Finance',corpus_root)
        hel_model = BigramModel('Health',corpus_root)
        res_model = BigramModel('Research',corpus_root)
        co_model = BigramModel('Computers_and_the_Internet',corpus_root)
        output = open('/home1/c/cis530/data-hw2/'+output_file,'w')
        for fileid in files_dataset.fileids():
                # Output the docid
                output.write(dataset+'/'+fileid+' ')
                # Output the topic_name
                topic_name=fileid.split('/')[0]
                output.write(topic_name+' ')
                word_list = files_dataset.words(fileid)
                finprob,finper = fin_model.get_prob_and_per(word_list)
                hlprob,hlper = hel_model.get_prob_and_per(word_list)
                resprob,resper = res_model.get_prob_and_per(word_list)
                coprob,coper = co_model.get_prob_and_per(word_list)
                output.write('finprob:'+str(round(finprob,1))+' ')
                output.write('hlprob:'+str(round(hlprob,1))+' ')
                output.write('resprob:'+str(round(resprob,1))+' ')
                output.write('coprob:'+str(round(coprob,1))+' ')
                output.write('finper:'+str(round(finper,1))+' ')
                output.write('hlper:'+str(round(hlper,1))+' ')
                output.write('resper:'+str(round(resper,1))+' ')
                output.write('coper:'+str(round(coper,1))+' ')
                output.write('\n')
        output.close()
def w_find(path_c, fname_c):

    work_dir = os.curdir
    os.chdir(work_dir)

    corp = PlaintextCorpusReader(path_c, fname_c, encoding="utf")
    text = nltk.Text(corp.words())

    ##        for i in text:
    ##                j = str(i)
    ##                k = re.finditer("Meraki",j)
    ##                for count, l in enumerate(k):
    ##                        t_count = count
    ##                print("Total matches found:",t_count)

    with open("words_list.txt", "r") as f:
        words = f.read()
        word = words.split('\n')
        for x in word:
            print("Fetching match for word :", str(x), "in file : ", fname_c)
            text.concordance(str(x))
            print(x)

    print()
    print("----------------------")
Example #28
def getHighFreqWords():
    maxlen = 15
    maxlen1 = 5

    corpath = ''
    wordlist = PlaintextCorpusReader(corpath, '.*')
    allwords = nltk.Text(wordlist.words('temp.txt'))
    stop = []
    swords = [i for i in allwords if i not in stop]
    fdist = nltk.FreqDist(swords)

    with open('highFreqWords.txt', 'w', encoding='utf-8') as file:
        for item in fdist.most_common(nWords):
            # print(item,item[0])
            word0 = item[0]
            q = None  # avoid a NameError below if getTranslation raises
            try:
                q = getTranslation(item[0])
            except Exception as e:
                print(e)

            if not q:
                continue
            while len(word0) < maxlen:
                word0 += ' '
            num = str(item[1])
            while len(num) < maxlen1:
                num = ' ' + num
            file.write(word0 + ' ' + num + '  ')
            for translate in q:
                file.write(translate + ' ')
            file.write("\n")
Example #29
    def extractWordsOnly(self, article):
        templist = []
        listtextstring = []
        articlename = article + '.txt'
        #corpus_root = '/home/jesal/onedump/'
        wl = PlaintextCorpusReader(corpus_root, '.*')
        allwords = wl.words(fileids = articlename)
        exturllist = self.extractexternalURL(article)
        textstring = wl.raw(articlename)
        for item in exturllist:
            textstring = textstring.replace(item, ' ')

        #templist = re.sub(r'[.!,;?]', ' ', textstring).split()
        templist = nltk.word_tokenize(textstring)
        listtemp = []
        for i in templist:
            j = re.sub('[^A-Za-z]+', '', i)
            listtemp.append(str(j))

        templistfinal = []
        templistfinal = self.removeEmpty(listtemp)
        return templistfinal
Example #30
def similar (text, word):
    if re.match ("^[a-zA-Z0-9_\(\),\.]+$", text) and re.match ("^[a-zA-Z0-9_]+$", word):
        text = '%s.txt' % text
        
        f = open(os.path.join(CORPUS_ROOT, text), 'r')
        source = f.read()
        f.close()
        
        corpus = PlaintextCorpusReader(CORPUS_ROOT, [text])
        n_text = nltk.text.Text(corpus.words(text))
        context_index = nltk.text.ContextIndex(n_text.tokens, filter=lambda x:x.isalpha(), key=lambda s:s.lower())
        word = word.lower()
        wci = context_index._word_to_contexts
        result = []
        
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = nltk.probability.FreqDist(w for w in wci.conditions() for c in wci[w] if c in contexts and not w == word)
            words = nltk.util.tokenwrap(fd.keys()[:20])
            
            for middle_word in words.split(' '):
                for context in contexts:
                    if re.search (context[0] + "(\W|\s)+" + middle_word + "(\W|\s)+" + context[1], source, re.IGNORECASE) is not None:
                        print (context[0], middle_word, context[1])
                        result.append ({'word': word, 'context_left': context[0], 'context_right': context[1]})
            
        return dumps ({'name': text, 'word': word, 'result': result})    
Example #31
def noun_filter():
    '''filter letters so only the nouns come through'''

    direc = "../Text/missing_letters/"
    #direc = "../../Letters/"
    stopWords = set(stopwords.words('german'))
    with open('../POS_tagger/nltk_german_classifier_data.pickle', 'rb') as t:
        tagger = pickle.load(t)

    wordlist = PlaintextCorpusReader(direc, ".*")
    for file in os.listdir(direc):
        text = ""
        filtered = ""
        filename = os.path.join(direc, file)
        to_tag = wordlist.words(file)
        tagged = tagger.tag(to_tag)
        path = "../Text/tagged_letters/"
        with open('%stagged_%s.pickle' % (path, file.replace(".txt", "")),
                  'wb') as f:
            pickle.dump(tagged, f)

        nouns = []
        for word in tagged:
            if word[1] == 'NN':
                nouns.append(word[0])

        for w in nouns:
            word = w.casefold().strip()
            if word not in stopWords:
                filtered += w + " "

        name = filename.replace(direc, "")
        #name = name.replace("html", "txt")

        intoTxt(filtered, file.replace("html", "txt"))
Example #32
def create_stopword_list(mfw, corpus_dir, stopwords_out):
    """
    Creates a stop word list for a collection of text files.
    The most frequent words of the collection are used as stop words.
    How many of the MFW should be used, can be indicated with the mfw parameter.

    author: uhk

    Arguments:
    mfw (int): number of MFW to consider as stop words
    corpus_dir (str): path to the corpus directory
    stopwords_out (str): path to the output stopwords file 
    
    """

    print("\nLaunched create_stopword_list.")

    from nltk.corpus import PlaintextCorpusReader
    from nltk.probability import FreqDist

    corpus = PlaintextCorpusReader(corpus_dir, '.*')
    fdist_corpus = FreqDist(corpus.words())

    with open(stopwords_out, "w", encoding="utf-8") as stopwords_out_file:

        # get the most frequent words
        mfw_list = [w[0] for w in fdist_corpus.most_common(mfw)]

        # write stop word list to file
        stopwords_out_file.write("\n".join(mfw_list))

    print("Done.")
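A minimal usage sketch; both paths are placeholders:

# Take the 50 most frequent words of all plain text files in ./corpus
# and write them to ./stopwords.txt, one per line.
create_stopword_list(50, "./corpus", "./stopwords.txt")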
Example #33
def trigramModel(corpus):
    newcorpus = PlaintextCorpusReader(corpus, "nlp_project2_corpus.txt")

    newcorpus.raw("nlp_project2_corpus.txt")
    newcorpus.sents("nlp_project2_corpus.txt")
    enwords = newcorpus.words("nlp_project2_corpus.txt")
    entext = newcorpus.raw("nlp_project2_corpus.txt")
    entokens = nltk.word_tokenize(entext)
    # Applying trigram to sentence
    trigram = nltk.trigrams(entokens)

    trigrams_freq = nltk.FreqDist(trigram)
    ourTextArr2 = []
    counter = 0
    prob = 0
    trigramCounter = 0
    probBiGram = 0

    bigrams = nltk.bigrams(entokens)

    bigrams_freq = nltk.FreqDist(bigrams)

    ourTextArr = []
    bigramCounter = 0
    for i in bigrams_freq.most_common():
        bigramCounter += 1

    for i in trigrams_freq.most_common():
        trigramCounter += 1

    for i, j in trigrams_freq.most_common():

        if prob > 0.50:
            print("********PROBB****: ", prob)
        if (j > 0):

            for k, l in bigrams_freq.most_common():
                if (j > 2):
                    probBiGram += l / (bigramCounter / 10)

            prob += j / (trigramCounter / 10)
        prob = ((prob + probBiGram) - (prob * probBiGram)) / trigramCounter

        if prob > 0.45:
            str1 = re.sub(r'[^a-zA-Z0-9_\s]+', '', i[0])
            str2 = re.sub(r'[^a-zA-Z0-9_\s]+', '', i[1])
            str3 = re.sub(r'[^a-zA-Z0-9_\s]+', '', i[2])
            ourTextArr2.append(str1 + " " + str2 + " " + str3)
            if (len(ourTextArr2) > 200):
                break
    ourTextArr2 = list(set(ourTextArr2))
    finalText2 = ""
    counter3 = 0
    ourTextArr2.reverse()

    for i in range(len(ourTextArr2)):
        counter3 += 1
        finalText2 += " " + ourTextArr2[i]
    print(finalText2)
Example #34
class Document(object):
    """
    A container object for a set of chapters.

    This allows us to keep track of document frequencies when computing them the
    first time so we don't repeat computations for common words. It also handles
    the PlaintextCorpusReader functions for us.
    """

    def __init__(self, chapter_paths):
        """
        Create a new Document.

        chapter_paths - A list of the paths for chapters in the document.
        """
        self.corpus = PlaintextCorpusReader("", chapter_paths)
        self.chapter_lists = self._sanitize_chapters()
        self.chapter_dists = [(FreqDist(chapter), chapter) for chapter in
                self.chapter_lists]
        self.words = {}

    def get_chapters(self):
        return self.chapter_lists

    def average_chapter_frequency(self, word):
        freqs = []
        if word in self.words:
            return self.words[word]
        else:
            for (dist, wordlist) in self.chapter_dists:
                freqs.append(dist[word]/float(len(wordlist)))

            # Store and return the average frequency
            avg_frq = mean(freqs)
            self.words[word] = avg_frq
            return avg_frq

    def _sanitize_chapters(self):
        # Sanitize the wordlists and return them
        lists = [self.corpus.words(file_id) for file_id in
                self.corpus.fileids()]

        new_lists = []

        for word_list in lists:
            # Convert everything to lowercase (e.g. so "the" and "The" match)
            word_list = [word.lower() for word in word_list]
            # Remove any punctuation (the re module has no \p{P}, so strip non-word characters instead)
            word_list = [re.sub(r'[^\w\s]', '', word) for word in word_list]
            # Remove stopwords, punctuation, and any empty word
            stops = stopwords.words('english')
            stops.append('')
            stops.append('said')
            word_list = [word for word in word_list if (word not in stops and
                word.isalpha())]

            new_lists.append(word_list)

        return new_lists
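A brief sketch of how the Document container might be used; the chapter file paths are invented, and the class assumes FreqDist, stopwords, re and a mean function are imported in its module:

# Build a Document from two chapter files and query an average per-chapter frequency.
doc = Document(["chapters/ch01.txt", "chapters/ch02.txt"])
print(len(doc.get_chapters()))               # number of sanitized chapter word lists
print(doc.average_chapter_frequency("ring"))  # cached in doc.words after the first lookup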
Example #35
def main():
    corpus_root = '../posts/'
    newcorpus = PlaintextCorpusReader(corpus_root, '.*',
                                      para_block_reader=read_block_no_metadata)
    corpus_words = [w.lower() for w in newcorpus.words() if w.isalpha()]
    corpus_sentences = newcorpus.sents()
    analyst = TextAnalyst(corpus_words, corpus_sentences, 'french')
    analyst.print_analyze()
Example #36
def loading_corpus():
    from nltk.corpus import PlaintextCorpusReader
    corpus_root = "wiki_corpus"
    wordlists = PlaintextCorpusReader(corpus_root, '.*')
    print(wordlists.fileids())
    print(wordlists.words())
    tokens = nltk.word_tokenize(wordlists.raw())
    print(tokens)
Example #37
def read_BNC_baby_stem(root_local):
	global fdist
	BNC_baby = []
	stemmer = SnowballStemmer("english")
	wordlists = PlaintextCorpusReader(root_local, '.*', encoding='latin-1')
	for word in wordlists.words():
		BNC_baby.append(stemmer.stem(word))
	fdist = FreqDist(word.lower() for word in BNC_baby)
	return(fdist)
Example #38
def get_corpus(corpusdir):
	newcorpus = PlaintextCorpusReader(corpusdir, '.*')
	titles = newcorpus.fileids() # returns all the .txt files in the dir
	words = []
	for title in titles:
		newcorpus_txt = newcorpus.words(title)
		words.extend([ e for e in newcorpus_txt if re.match(r"[a-zA-Z]",e)])
	
	return words
def extractingFromFolders():
    folder2 = os.path.expanduser('~\\My Documents\\Tara\\Ongoing\\CharacterCorpus\\Reference')
    fileresult = os.path.expanduser('~\\My Documents\\Tara\\Ongoing\\CharacterCorpus\\results.txt')
    refer = PlaintextCorpusReader(folder2, 'harrygrepster.txt')
    grepster = refer.words()
    results = open(fileresult, 'a')
    completeWords = wordlist.words()
    stoppers = stopwords.words()
    return grepster, results, completeWords, stoppers
Example #40
 def plot_freq(self, corpus, patt, n):
     wordlists = PlaintextCorpusReader(corpus, patt)
     fileids = wordlists.fileids()
     words = []
     for id in fileids:
         words = append(words, wordlists.words(id))
     fre = FreqDist(word.lower() for word in words if word.isalpha())
     fre.tabulate(n)
     return fre.plot(n)
def read_BNC_baby_stem(root_local):
    global fdist
    BNC_baby = []
    stemmer = SnowballStemmer("english")
    wordlists = PlaintextCorpusReader(root_local, '.*', encoding='latin-1')
    for word in wordlists.words():
        BNC_baby.append(stemmer.stem(word))
    fdist = FreqDist(word.lower() for word in BNC_baby)
    return (fdist)
Example #42
def get_corpus(corpusdir):
    newcorpus = PlaintextCorpusReader(corpusdir, '.*')
    titles = newcorpus.fileids()  # returns all the .txt files in the dir
    words = []
    for title in titles:
        newcorpus_txt = newcorpus.words(title)
        words.extend([e for e in newcorpus_txt if re.match(r"[a-zA-Z]", e)])

    return words
Example #43
def train():
    # Reads in data, preprocesses and trains it in a Naive Bayes Classifier and returns the classifier object
    neg = PlaintextCorpusReader('C:\\Users\\Darren\\Downloads\\aclImdb\\train\\neg', '.+\.txt')
    pos = PlaintextCorpusReader('C:\\Users\\Darren\\Downloads\\aclImdb\\train\\pos', '.+\.txt')

    neg_docs1 = [neg.words(fid) for fid in neg.fileids()]
    pos_docs1 = [pos.words(fid) for fid in pos.fileids()]

    # Combine the categories of the corpus
    all_docs1 = neg_docs1 + pos_docs1
    num_neg_docs = len(neg_docs1)

    # Processing for stopwords, alphabetic words, stemming
    all_docs2 = [[w.lower() for w in doc] for doc in all_docs1]
    print("lowering done")
    import re
    all_docs3 = [[w for w in doc if re.search('^[a-z]+$',w)] for doc in all_docs2]
    print("regex done")
    from nltk.corpus import stopwords
    stop_list = stopwords.words('english')
    all_docs4 = [[w for w in doc if w not in stop_list] for doc in all_docs3]
    print("stopword done")
    stemmer = PorterStemmer()
    all_docs5 = [[stemmer.stem(w) for w in doc] for doc in all_docs4]

    #Create dictionary
    dictionary = corpora.Dictionary(all_docs5)
    # print(dictionary)
    # Export as a text file to use with the pickled classifier
    dictionary.save_as_text("dictionary.txt")

    # Convert all documents to TF Vectors
    all_tf_vectors = [dictionary.doc2bow(doc) for doc in all_docs5]

    #Label the training data. Since the folder name is the label, I use the same labels.
    all_data_as_dict = [{id:1 for (id, tf_value) in vec} for vec in all_tf_vectors]
    neg_data = [(d, 'negative') for d in all_data_as_dict[0:num_neg_docs]]
    pos_data = [(d, 'positive') for d in all_data_as_dict[num_neg_docs:]]
    all_labeled_data = neg_data + pos_data

    #Generate the trained classifier
    #Can use max entropy as well
    classifier = nltk.NaiveBayesClassifier.train(all_labeled_data)
    return classifier, dictionary
Example #44
    def load_vocab(self, root='.', files='.*'):
        """
        Load new vocabulary.

        :param root: the root directory for the corpus.
        :param files: A list or regexp specifying the files in this corpus.
        """
        voc = PlaintextCorpusReader(root, files)
        for word in voc.words():
            self.vocab[word.lower()] += 1
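A small usage sketch, assuming an instance of the surrounding class is available as reader and that self.vocab is a dict-like counter (as the += 1 above implies); the directory is a placeholder:

# Count word occurrences from every .txt file under ./vocab_corpus.
reader.load_vocab(root='./vocab_corpus', files=r'.*\.txt')
print(sorted(reader.vocab.items(), key=lambda kv: -kv[1])[:10])  # ten most frequent words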
Example #45
def read_text(path):
    if os.path.isfile(path) == True:
        raw = open(path, 'r').read()
        tokens = nltk.word_tokenize(raw)
        text = [token.lower() for token in tokens]
    elif os.path.isdir(path) == True:
        filelists = PlaintextCorpusReader(path, '.*')
        tokens = filelists.words()
        text = [token.lower() for token in tokens]
    return nltk.Text(text)
Example #46
def spimi_corpus_process(path_corpus,file_names,block_size):
	from nltk.corpus import PlaintextCorpusReader
	wordlists = PlaintextCorpusReader(path_corpus,file_names,encoding='latin-1')
	block     = []
	for fileid in wordlists.fileids(): 
		docid = fileid[:fileid.rfind(".")][-1:]
		block += [(word,docid) for word in wordlists.words(fileid)]
	while len(block)!=0:
		try: count = spimi_invert([block.pop() for x in xrange(block_size)])
		except IndexError as ie: pass	
def create_LM_on_dataset(dataset):

    corpus_root = '/home1/c/cis530/data-hw2/Language_model_set/'
    dataset_path = corpus_root + dataset
    files = PlaintextCorpusReader(dataset_path, '.*')
    ids = files.fileids()
    words = []
    for i in range(len(ids)):
        # accumulate the words of every file; the original overwrote them on each pass
        words += files.words(ids[i])
    lang_model = NGramModel2(words, 2)

    return lang_model
Example #48
def concordance(text):
    """Returns an alphabetical list of words for the given text."""
    corpus = PlaintextCorpusReader(CORPUS_ROOT, [text])
    n_text = nltk.text.Text(corpus.words(text))
    interesting = [
            'data',
            'grey',
            'literature',
            'relation',
            'user',
            'information',
            'error',
            'value',
            'other',
            ]
    # TODO: use NLTK built-in functions for this!
    word_list = sorted(set(w.lower() for w in corpus.words()))
    #word_list = "<br />".join(word_list)
    return template('templates/split', word_list=word_list, text=text)
    def stemming_files(self, source_folder, destination_folder):
        if not os.path.exists(destination_folder):
            os.makedirs(destination_folder)

        corpus_news = PlaintextCorpusReader(source_folder, '.*\.txt')

        for file in corpus_news.fileids():
            file_name = os.path.basename(os.path.normpath(file))
            words = corpus_news.words(file)
            stemmed_content = self.stemming_text(words)
            with open(destination_folder + "/" + file_name, 'w', encoding='utf8') as modified:
                modified.write(' '.join(stemmed_content))
Example #50
File: q1.py Project: cmstewart/galv
def textinfo(path):
    """
    Takes a file path and returns figures about the text file contained therein.
    """
    
    from nltk.corpus import PlaintextCorpusReader
    from nltk import FreqDist
    corpusReader = PlaintextCorpusReader(path, '.*')

    print "Total word count:", len([word for sentence in corpusReader.sents() for word in sentence])
    print "Unique words:", len(set(corpusReader.words()))
    print "Sentences:", len(corpusReader.sents())
    print "Average sentence length in words:", (len([word for sentence in corpusReader.sents() for word in sentence]) / len(corpusReader.sents()))
def main():
    
    # Corpus Location
    #for training data
    posTrainCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/pos_train'
    negTrainCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/neg_train'

    #for test data
    posTestCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/pos_test'
    negTestCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/neg_test'

    # Create Plain Text Corpus for training data
    posCorpus = PlaintextCorpusReader(posTrainCorpus, '.*')
    negCorpus = PlaintextCorpusReader(negTrainCorpus, '.*')


    # Create Plain Text Corpus for test data
    posTstCorpus = PlaintextCorpusReader(posTestCorpus, '.*')
    negTstCorpus = PlaintextCorpusReader(negTestCorpus, '.*')
    
    #GetBigrams
    posBigrams = nltk.bigrams(posCorpus.words())
    negBigrams = nltk.bigrams(negCorpus.words())

    #Get no. of words per corpus
    posWordLen = len(posCorpus.words())
    negWordLen = len(negCorpus.words())
    
    # Creating object of Lang_Model_classifier
    obj1 = Lang_Model_Classifier()
    obj1.freq_dst(posCorpus, negCorpus)
    
    #For negative test data
    for filename in os.listdir(negTestCorpus):
        wordSet =  negTstCorpus.words(filename)
    
        print '**Unigram**'
        unigr = obj1.perp(wordSet)
    
        print unigr
    
        print '**Bigram**'
        bigr = obj1.perpBi(nltk.bigrams(wordSet))
    
        print bigr
        
    #For positive test data    
    for filename in os.listdir(posTestCorpus):
        wordSet2 =  posTstCorpus.words(filename)
    
        print '**Unigram**'
        posunigr = obj1.perp(wordSet2)
    
        print posunigr
    
        print '**Bigram**'
        posbigr = obj1.perpBi(nltk.bigrams(wordSet2))
    
        print posbigr
Example #52
def repl():
    '''
    This is the Read-Eval-Print loop for the dialogue.
    '''

    # Setup the dictionary, preprocessing
    print "You'll have to pardon me, at my age, it takes several moments to memorize all of Shakespeare..."
    #shake = gutenberg.words('shakespeare-caesar.txt')
    #shake = gutenberg.words('shakespeare-complete.txt')
    #print "Done with getwords"
    pcr = PlaintextCorpusReader(".", 'shakespeare.*')
    shake = pcr.words("shakespeare-complete.txt")
    imps = getNimportant(shake,500)
    print imps
    #print "Done with get imps"
    
    # divide the text into blocks of 3000 words (split on periods?)
    # store blocks? Hmm. or just read from shake by line, based on block number
    # can actually just index each word.

    # need a way to index the text
    kps = []
    for word in imps:
        #kps.append(KeyPhrase(word, getPhrases(word, shake)))
        kps.append(KeyPhrase(word))

    #print "Done with kps stuff"

    #print imps

    # Define words that will exit the program
    goodbyeWords = ["quit", "bye", "goodbye", "q", "exit", "leave"]

    # Greetings
    print "Ah, finally someone who will speak Shakespeare with me! How do you do, sir?"
    print

    # Main loop
    while True:
        # Prompt
        text = raw_input('> ').lower()
        print

        # Exit strategy
        if text in goodbyeWords:
            print "Goodbye!"
            break

        # Answer
        provideAnswer(text, kps, shake)
        print
Example #53
def carga_mongodb():

    client = pymongo.MongoClient(MONGODB_URI)
    db = client.docs
    docs=db.DOCS
    spanish_stops = set(stopwords.words('spanish'))
    newcorpus = PlaintextCorpusReader(corpus_root, '.*')
    newcorpus.fileids()

    for fileid in newcorpus.fileids():

        try:
            num_words = len(newcorpus.words(fileid))
            words = newcorpus.words(fileid)
            # num_sents = len(newcorpus.sents(fileid))
            # print(newcorpus.raw(fileid))
            #bcf = BigramCollocationFinder.from_words(words)
            #filter_stops = lambda w: len(w) < 3 or w in spanish_stops
            #bcf.apply_word_filter(filter_stops)
            tags_array=vocab_words(newcorpus.raw(fileid))
            tags=tags_array[0]
            tags_vocab=tags_array[1]
            cloud=tags_array[2]
            total_cloud=[]

            for c in cloud:
                reg={}
                reg['word']=c[0]
                reg['total']=c[1]
                total_cloud.append(reg)

            # insert the document into MongoDB
            post = {"nombre":  fileid, "fecha": datetime.datetime.utcnow(), "texto":preparar_texto(newcorpus.raw(fileid)), "tags_vocab":tags_vocab, "tags":tags, "enc":random.randint(1, 50), "pos":random.randint(1, 10), "neg":random.randint(1, 5), "num_words":num_words, "cloud":total_cloud}
            post_id = docs.insert_one(post).inserted_id

        except:
            print("Import failed: " + fileid)