Example #1
def main():
    # Load tokenized sentences from the NLTK webtext corpus.
    from nltk.corpus import webtext
    overheard_sents = webtext.sents('overheard.txt')

    # conversation is a project-local module providing the Conversation class.
    convo1 = conversation.Conversation(overheard_sents[0:3], "white_and_asian")
    print("Participants:", convo1.participants)
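The conversation module is project-local and not shown here; a hypothetical stand-in for the Conversation class, consistent only with how it is used above:

class Conversation:
    # Hypothetical minimal class: keeps the sentences and a participants label.
    def __init__(self, sents, participants):
        self.sents = sents
        self.participants = participants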
Example #2
import string

from nltk.corpus import brown, reuters, webtext


def create_corpus():
    """Remove punctuation tokens from the sentences of the corpus."""
    # Combine sentences from the Reuters, webtext and Brown corpora.
    rr_corpus = reuters.sents() + webtext.sents() + brown.sents()
    punctuations = set(string.punctuation)
    cleaned_corpus = []

    for sent in rr_corpus:
        cleaned_corpus.append([w for w in sent if w not in punctuations])

    return cleaned_corpus
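A hypothetical usage example (not part of the original snippet):

if __name__ == '__main__':
    cleaned = create_corpus()
    print(len(cleaned), 'sentences kept')
    print(cleaned[0])   # first sentence, punctuation tokens removed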
Example #3
def process_webtext():
    print('webtext')
    from nltk.corpus import webtext
    count = 0
    word = 'bank'
    sen1 = 'depository_financial_institution.n.01'  # WordNet sense 1
    sen2 = 'bank.n.01'                              # WordNet sense 2
    file_name = 'data/bank_webtext_tmp.txt'
    for f in webtext.fileids():
        for sent in webtext.sents(f):
            # Collect every sentence that mentions the target word.
            if word in sent:
                # appendToFile and sentToStr are helpers defined elsewhere in the project.
                appendToFile(file_name, sentToStr(sent, '0'))
                count += 1
                print(count)
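appendToFile and sentToStr are project helpers that are not shown; a minimal sketch of what they might do, assuming sentToStr joins the tokens and attaches the label:

def sentToStr(sent, label):
    # Hypothetical helper: join the tokenized sentence and append the label.
    return ' '.join(sent) + '\t' + label + '\n'


def appendToFile(file_name, line):
    # Hypothetical helper: append one line to the output file.
    with open(file_name, 'a', encoding='utf-8') as f:
        f.write(line)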
Example #4
import nltk
from nltk.corpus import brown, gutenberg, inaugural, reuters, webtext


def get_default_sentences() -> list:
    # detok_sentences (defined elsewhere in the module, see Example #12)
    # re-joins each tokenized sentence into a plain string.
    nltk.download('brown')
    brown_tokenized_sentences = brown.sents()
    brown_sentences = detok_sentences(brown_tokenized_sentences)
    nltk.download('gutenberg')
    nltk.download('punkt')
    gutenberg_tokenized_sentences = gutenberg.sents()
    gutenberg_sentences = detok_sentences(gutenberg_tokenized_sentences)
    nltk.download('reuters')
    reuters_tokenized_sentences = reuters.sents()
    reuters_sentences = detok_sentences(reuters_tokenized_sentences)
    nltk.download('webtext')
    webtext_tokenized_sentences = webtext.sents()
    webtext_sentences = detok_sentences(webtext_tokenized_sentences)
    nltk.download('inaugural')
    inaugural_tokenized_sentences = inaugural.sents()
    inaugural_sentences = detok_sentences(inaugural_tokenized_sentences)
    return (brown_sentences + gutenberg_sentences + reuters_sentences
            + webtext_sentences + inaugural_sentences)
Example #5
def webtext():
    # Flask view: render_template is presumably flask.render_template;
    # lexical_div is a helper defined elsewhere (see Example #15).
    from nltk.corpus import webtext
    from nltk.corpus import nps_chat

    # list comprehension version
    file_ids = [fileid for fileid in webtext.fileids()]
    chat_file_ids = [fileid for fileid in nps_chat.fileids()]

    pirates = webtext.raw('pirates.txt')
    pirates_words = len(webtext.words('pirates.txt'))
    pirates_sents = len(webtext.sents('pirates.txt'))
    uniqs = len(set([w.lower() for w in webtext.words('pirates.txt')]))

    lexical_diversity = lexical_div(uniqs, pirates_words)

    # import nltk.book as book
    # text1 = book.text1
    # pirates = webtext.raw('pirates.txt')

    return render_template('webtext.html',
                           file_ids=file_ids,
                           chat_file_ids=chat_file_ids,
                           pirates=pirates)
Example #6
import matplotlib.pyplot as plt
'''
Train Tagger
'''
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.corpus import treebank
from nltk.corpus import webtext as web

train = treebank.tagged_sents()[:10000]
t0 = DefaultTagger('NN')
t1 = UnigramTagger(train, backoff=t0)
t2 = BigramTagger(train, backoff=t1)
'''
Initialize
'''
# POS-tag counters for the firefox.txt discussion corpus
my_corp = web.sents(fileids='firefox.txt')
sent_count = 0
ques_count = 0
All_count = 1
NN_count = 0
NNS_count = 0
NNP_count = 0
VB_count = 0
VBN_count = 0
VBG_count = 0
VBD_count = 0
VBZ_count = 0
JJ_count = 0
WP_count = 0
NN_Num = 0
NNS_Num = 0
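The tagging loop is cut off in this excerpt; a minimal sketch, assuming the trained t2 tagger is applied to my_corp and the counters above are updated per tag (the exact update logic is an assumption):

for sent in my_corp:
    sent_count += 1
    if sent[-1] == '?':
        ques_count += 1
    for word, tag in t2.tag(sent):
        All_count += 1
        if tag == 'NN':
            NN_count += 1
        elif tag == 'NNS':
            NNS_count += 1
        elif tag == 'NNP':
            NNP_count += 1
        # ... likewise for the remaining VB*, JJ and WP counters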
Example #7
import logging
import time
# interface is assumed to be the DocumentFeatureSelection package's interface module.
from DocumentFeatureSelection import interface


def pmi_with_cython(input_corpus):
    logging.debug(msg='With cython is True')
    start = time.time()
    scored_matrix_obj = interface.run_feature_selection(
        input_dict=input_corpus, method='pmi', n_jobs=-1, use_cython=True)
    elapsed_time = time.time() - start
    print("elapsed_time with cython:{} [sec]".format(elapsed_time))


from nltk.corpus import gutenberg
from nltk.corpus import webtext
from nltk.corpus import genesis
from nltk.corpus import abc

abs_corpus = abc.sents()
genesis_corpus = genesis.sents()
web_corpus = webtext.sents()
gutenberg_corpus = gutenberg.sents()

input_corpus = {
    'abs': list(abs_corpus),
    'genesis': list(genesis_corpus),
    'web': list(web_corpus),
    'gutenberg': list(gutenberg_corpus)
}

pmi_with_cython(input_corpus)
pmi_with_parallel(input_corpus)
#pmi_with_threading(input_corpus)
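pmi_with_parallel (and the commented-out pmi_with_threading) are defined elsewhere in the original script; a minimal sketch of pmi_with_parallel, assuming it mirrors the function above with use_cython=False:

def pmi_with_parallel(input_corpus):
    # Same feature-selection call, but without the Cython backend.
    start = time.time()
    scored_matrix_obj = interface.run_feature_selection(
        input_dict=input_corpus, method='pmi', n_jobs=-1, use_cython=False)
    print("elapsed_time with parallel:{} [sec]".format(time.time() - start))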
Example #8
# Importing data
import nltk
from nltk.corpus import webtext
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import string
nltk.download('webtext')
wt_sentences = webtext.sents('firefox.txt')
wt_words = webtext.words('firefox.txt')
print(len(wt_sentences))
print(len(wt_words))
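FreqDist, stopwords and string are imported above but unused in this excerpt; a sketch of the usual continuation, filtering stop words and punctuation before building a frequency distribution (assumed, not part of the original):

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
filtered = [w.lower() for w in wt_words
            if w.lower() not in stop_words and w not in string.punctuation]
fdist = FreqDist(filtered)
print(fdist.most_common(10))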
Example #9
import heapq
'''
Train Tagger
'''
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.corpus import treebank
from nltk.corpus import webtext as web

train = treebank.tagged_sents()[:10000]
t0 = DefaultTagger('NN')
t1 = UnigramTagger(train, backoff=t0)
t2 = BigramTagger(train, backoff=t1)
'''
Initialize
'''
# POS-tag counters for the overheard.txt corpus
my_corp = web.sents(fileids='overheard.txt')
sent_count = 0
ques_count = 0
All_count = 1
NN_count = 0
NNS_count = 0
NNP_count = 0
VB_count = 0
VBN_count = 0
VBG_count = 0
VBD_count = 0
VBZ_count = 0
JJ_count = 0
WP_count = 0
NN_Num = 0
NNS_Num = 0
Example #10
# Using a Windows terminal? Run "CHCP 65001" before this script so that Unicode characters can be printed to the terminal.

from nltk.corpus import webtext, stopwords
#from nltk.stem.porter import PorterStemmer
import pickle

stopWords = set(stopwords.words('english'))
corpus = webtext
#stemmer = PorterStemmer()

# Lower-case alphabetic tokens, with English stop words removed.
sentences = [[
    token.lower() for token in s
    if token.isalpha() and token.lower() not in stopWords
] for s in webtext.sents()]
#sentences = [[stemmer.stem(token.lower()) for token in s if token.isalpha()] for s in webtext.sents()]

words = set([word for sen in sentences for word in sen])

coocMap = {}


def addToMap(coocMap, w1, w2, val):
    # Accumulate a co-occurrence count for the ordered pair (w1, w2).
    if w1 in coocMap:
        if w2 in coocMap[w1]:
            coocMap[w1][w2] += val
        else:
            coocMap[w1][w2] = val
    else:
        coocMap[w1] = {w2: val}
    return coocMap
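The loop that actually fills coocMap is missing from this excerpt; a minimal sketch, assuming co-occurrences are counted within each sentence over a small window and that pickle (imported above) is used to save the result (the file name is hypothetical):

window = 4  # assumed window size
for sen in sentences:
    for i, w1 in enumerate(sen):
        for w2 in sen[i + 1:i + 1 + window]:
            coocMap = addToMap(coocMap, w1, w2, 1)
            coocMap = addToMap(coocMap, w2, w1, 1)

with open('coocMap.pkl', 'wb') as f:
    pickle.dump(coocMap, f)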

Example #11
import logging
from string import punctuation

from nltk.corpus import (brown, conll2000, conll2007, gutenberg,
                         movie_reviews, reuters, state_union, subjectivity,
                         treebank, webtext)

# Reconstructed from the truncated portion of this excerpt:
conll2000_corp_sents = conll2000.sents()
conll2007_corp_sents = conll2007.sents()

# (An earlier framenet-loading step is also truncated from this excerpt.)
print("Frame_net to sents")
state_union_corp_sents = state_union.sents()
print('state union to sents')
subject_corp_sents = subjectivity.sents()
print('Subjectivity to sents')
brown_corp_sents = brown.sents()
print("Brown corpus to sents")
movie_reviews_corp_sents = movie_reviews.sents()
print("Movie reviews to sents")
gutenberg_corp_sents = gutenberg.sents()
print("Gutenberg to sents")
treebank_corp_sents = treebank.sents()
print("Treebank to sents")
reuters_corp_sents = reuters.sents()
print("Reuters to sents")
webtext_corp_sents = webtext.sents()
print("Webtext to sents")

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

print("Cleaning data ...")

discard_punctuation_and_lowercased_sents_condll2007 = [[
    word.lower() for word in sent if word not in punctuation
] for sent in conll2007_corp_sents]

discard_punctuation_and_lowercased_sents_condll2000 = [[
    word.lower() for word in sent if word not in punctuation
] for sent in conll2000_corp_sents]
Example #12
import codecs

import nltk
from nltk.corpus import brown, gutenberg, inaugural, reuters, webtext


def detok_sentences(tokenized_sentences):
    # Re-join each list of tokens into a plain string. The head of this
    # function is truncated in the excerpt and reconstructed here.
    sentences = []
    for tok_sent in tokenized_sentences:
        sentences.append(' '.join(tok_sent).strip())
    return sentences


print("Loading sentences.")
nltk.download('brown')
brown_tokenized_sentences = brown.sents()
brown_sentences = detok_sentences(brown_tokenized_sentences)
nltk.download('gutenberg')
nltk.download('punkt')
gutenberg_tokenized_sentences = gutenberg.sents()
gutenberg_sentences = detok_sentences(gutenberg_tokenized_sentences)
nltk.download('reuters')
reuters_tokenized_sentences = reuters.sents()
reuters_sentences = detok_sentences(reuters_tokenized_sentences)
nltk.download('webtext')
webtext_tokenized_sentences = webtext.sents()
webtext_sentences = detok_sentences(webtext_tokenized_sentences)
nltk.download('inaugural')
inaugural_tokenized_sentences = inaugural.sents()
inaugural_sentences = detok_sentences(inaugural_tokenized_sentences)
all_sentences = (brown_sentences + gutenberg_sentences + reuters_sentences
                 + webtext_sentences + inaugural_sentences)

# Undo some artifacts of token re-joining before writing out.
outfile = codecs.open('output.txt', 'w', encoding='utf-8')
for sentence in all_sentences:
    cleaned_sentence = sentence.replace(" ' s ", "'s ")
    cleaned_sentence = cleaned_sentence.replace("n ' t ", "n't ")
    cleaned_sentence = cleaned_sentence.replace(" ,", ",")
    cleaned_sentence = cleaned_sentence.replace(" .", ".")
    outfile.write('{}\n'.format(cleaned_sentence))
outfile.close()
Example #13
import logging
import re
from gensim.parsing import PorterStemmer
from gensim.models import Word2Vec
from nltk.corpus import stopwords, webtext
from StemmingHelper import StemmingHelper  # project-local stemming helper

global_stemmer = PorterStemmer()
ignored_words = stopwords.words('english')
word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words
my_corpus = []

regex = re.compile('[^a-zA-Z]+')

# The corpus sentences are unicode token lists: strip non-letters,
# stem and lower-case each token, and drop empty results.
for file in webtext.fileids():
    my_sentences = []
    for sentences in webtext.sents(file):
        words = [
            StemmingHelper.stem(str(regex.sub('', word))).lower()
            for word in sentences
            if StemmingHelper.stem(str(regex.sub('', word))).lower() != ''
        ]
        print(words)
        my_sentences.append(words)
    my_corpus = my_corpus + my_sentences
print(my_corpus)

min_count = 2
size = 50
window = 4

# Note: gensim >= 4.0 renames the "size" argument to "vector_size".
model = Word2Vec(my_corpus, min_count=min_count, size=size, window=window)
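A brief hypothetical usage example for the trained model (not part of the original; the query word is illustrative):

query = StemmingHelper.stem('pirate').lower()
if query in model.wv.vocab:   # gensim >= 4.0: model.wv.key_to_index
    print(model.wv.most_similar(query, topn=5))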
Example #15
import nltk
from nltk.corpus import webtext
from nltk.corpus import nps_chat
from nltk.corpus import brown

# for fileid in webtext.fileids():
# 	print(fileid, webtext.raw(fileid)[:65])

# for fileId in nps_chat.fileids():
# 	print(fileId)

pirates = webtext.raw('pirates.txt')
pirates_char = len(webtext.raw('pirates.txt'))
pirates_words = len(webtext.words('pirates.txt'))
pirates_sents = len(webtext.sents('pirates.txt'))
print('pirates_char: ', pirates_char, 'pirates_words: ', pirates_words,
      'pirates_sents: ', pirates_sents,
      'avg char per word: ', int(pirates_char / pirates_words),
      'avg words per sentence: ', int(pirates_words / pirates_sents))

uniqs = len(set([w.lower() for w in webtext.words('pirates.txt')]))


def lexical_div(un, total):
    # Tokens per unique (lower-cased) token type.
    return total / un


print('lexical diversity: ', lexical_div(uniqs, pirates_words))

# brown_categories = brown.categories()
# for genre in brown_categories:
# 	print(genre)

news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
# modal verbs
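The excerpt stops at the "# modal verbs" comment; the usual continuation (as in the NLTK book) tabulates modal-verb frequencies from fdist, sketched here as an assumption:

modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m + ':', fdist[m], end='  ')
print()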
Example #16
from nltk.corpus import webtext

textId = webtext.fileids()
print(textId)

# Load the movie-script text of "Pirates of the Caribbean".
# http://www.actorpoint.com/movie-scripts/scripts/pirates-of-the-caribbean-dead-man%27s-chest.html
text = webtext.raw('pirates.txt')
print(text[:1000])
print("number of characters = ", len(text))

# Read the document word by word.
word = webtext.words('pirates.txt')
print(word)
print("number of words = ", len(word))

# Read the document sentence by sentence.
sentence = webtext.sents('pirates.txt')
for i in range(5):
    print(sentence[i])
print("number of sentences = ", len(sentence))

# Load the Firefox discussion-forum text.
text = webtext.raw('firefox.txt')
print(text[:1000])
print("number of characters = ", len(text))

# Read the document word by word.
word = webtext.words('firefox.txt')
print(word)
print("number of words = ", len(word))

# Read the document sentence by sentence.