Example #1
0
def test_austen():
    from nltk.data import load
    from nltk.corpus import gutenberg as g
    stok = load('tokenizers/punkt/english.pickle')
    train = [[w for w in tokenize(preprocess(sent))]
             for sent in stok.tokenize(g.raw('austen-emma.txt'))]
    test1 = [[w for w in tokenize(preprocess(sent))]
             for sent in stok.tokenize(g.raw('austen-sense.txt'))]
    test2 = [[w for w in tokenize(preprocess(sent))]
             for sent in stok.tokenize(g.raw('austen-persuasion.txt'))]

    model1 = AdditiveSmoothing(n=2)
    model1.generate_model(train)
    print 'cross entropy additive smoothing:'
    print 'emma to sense&sensibility: %0.8f' % cross_entropy(model1, test1)
    print 'emma to persuasion: %0.8f' % cross_entropy(model1, test2)

    model2 = KnesserNey(n=2)
    model2.generate_model(train)
    print 'cross entropy knesser-ney smoothing:'
    print 'emma to sense&sensibility: %0.8f' % cross_entropy(model2, test1)
    print 'emma to persuasion: %0.8f' % cross_entropy(model2, test2)

    model3 = SimpleGoodTuring(n=2)
    model3.generate_model(train)
    print 'cross entropy simple good-turing smoothing:'
    print 'emma to sense&sensibility: %0.8f' % cross_entropy(model3, test1)
    print 'emma to persuasion: %0.8f' % cross_entropy(model3, test2)

    model4 = KatzSmoothing(n=2)
    model4.generate_model(train)
    print 'cross entropy katz smoothing:'
    print 'emma to sense&sensibility: %0.8f' % cross_entropy(model4, test1)
    print 'emma to persuasion: %0.8f' % cross_entropy(model4, test2)
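This example depends on helpers from its own project (preprocess, tokenize, cross_entropy and the smoothing classes) that are not shown here. As a rough sketch only, a bigram cross-entropy helper compatible with the calls above might look like the following, assuming each model exposes a hypothetical prob(word, context) method:

import math

def cross_entropy(model, sents):
    # average negative log2 probability per bigram (sketch; prob() is an assumption)
    log_prob, count = 0.0, 0
    for sent in sents:
        for w1, w2 in zip(sent, sent[1:]):
            log_prob += math.log(model.prob(w2, w1), 2)
            count += 1
    return -log_prob / count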
Example #3
0
def exercise_gutenberg():
    # Print the file list of the Project Gutenberg corpus
    print gutenberg.fileids()

    # Pick one text: Jane Austen's "Emma"
    emma = gutenberg.words("austen-emma.txt")

    # Check the length of the book
    print len(emma)

    # Load the text
    emma_text = nltk.Text(emma)
    emma_text.concordance("surprize")

    for file_id in gutenberg.fileids():
        chars_list = gutenberg.raw(file_id)
        words_list = gutenberg.words(file_id)
        sents_list = gutenberg.sents(file_id)

        # Total number of characters in the file
        num_chars = len(chars_list)
        # Total number of words in the file
        num_words = len(words_list)
        # Total number of sentences in the file
        num_sents = len(sents_list)
        # Number of distinct words in the file
        num_vocab = len(set([w.lower() for w in words_list]))
        # Print average word length, average sentence length,
        # average number of occurrences per word, and the file name
        print num_chars / num_words, num_words / num_sents, num_words / num_vocab, file_id
Example #4
0
def gutenberg():
    from nltk.corpus import gutenberg
    file_ids = get_fileids(gutenberg)

    # average characters in a word: raw/words
    # average word in a sentence: words/sents
    # lexical diversity - num_words/num_vocab

    for fileid in file_ids:
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars / num_words), int(num_words / num_sents), int(
            num_words / num_vocab), fileid

    emma = gutenberg.words('austen-emma.txt')
    emma_len = len(emma)
    # print 'percentage', percentage(text1.count('monstrous'), len(text1))

    macbeth_sents = gutenberg.sents('shakespeare-macbeth.txt')
    macbeth_longest_len = max([len(s) for s in macbeth_sents])
    macbeth_longest_sent = [
        s for s in macbeth_sents if len(s) == macbeth_longest_len
    ]

    return render_template('gutenberg.html',
                           file_ids=file_ids,
                           emma=emma,
                           emma_len=emma_len,
                           macbeth_longest_sent=macbeth_longest_sent)
Example #5
0
def get_text_chars(file):
    _text = ''
    for txt in file:
        if 'shakespeare' in txt:
            _text += gutenberg.raw(txt).lower()
    _chars = sorted(list(set(_text)))
    return _chars, _text
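A small usage sketch for get_text_chars(), assuming gutenberg has been imported from nltk.corpus as the function body itself requires:

from nltk.corpus import gutenberg

# character vocabulary of the Shakespeare texts in the Gutenberg corpus
chars, text = get_text_chars(gutenberg.fileids())
print(len(chars))  # number of distinct characters
print(chars[:20])  # first few characters in sorted order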
Example #6
0
def test():

    from nltk.corpus import gutenberg
    emma = gutenberg.raw('austen-emma.txt')
    print len(emma)
    ex = createexercise(emma, pos='v', last_index=False, fast=True)
    print len(ex)
Example #7
0
def exercise_gutenberg():
    # Print the file list of the Project Gutenberg corpus
    print(gutenberg.fileids())

    # Pick one text: Jane Austen's "Emma"
    emma = gutenberg.words("austen-emma.txt")

    # Check the length of the book
    print(len(emma))

    # Load the text
    emma_text = nltk.Text(emma)
    emma_text.concordance("surprize")

    for file_id in gutenberg.fileids():
        chars_list = gutenberg.raw(file_id)
        words_list = gutenberg.words(file_id)
        sents_list = gutenberg.sents(file_id)

        # Total number of characters in the file
        num_chars = len(chars_list)
        # Total number of words in the file
        num_words = len(words_list)
        # Total number of sentences in the file
        num_sents = len(sents_list)
        # Number of distinct words in the file
        num_vocab = len(set([w.lower() for w in words_list]))
        # Print average word length, average sentence length,
        # average number of occurrences per word, and the file name
        print(num_chars / num_words, num_words / num_sents,
              num_words / num_vocab, file_id)
Example #8
0
def load_moby_dick_analysis():
    
    tokens = get_moby_dick_tokens()
    text = gutenberg.raw('melville-moby_dick.txt')
    try:
        moby_dick_doc = Document(
            url='gutenberg',
            name='moby dick',
            text=text,
            month='Jan',
            year='1851'
            )
        odm_session.flush()
    except DuplicateKeyError:
        moby_dick_doc = Document.query.get(name='moby dick')

    for sum_threshold in sum_thresholds:
        log.info("Trying analysis for threshold = %s" % sum_threshold)
        analysis = get_optimal_window_size(tokens, window_sizes, 20, sum_threshold=sum_threshold)[1]
        anal_dict = analysis.encode()
        window_size = anal_dict['window_size']

        log.debug("Best result = %s" % window_size)
        InformationValueResult(
            window_size = window_size,
            threshold = sum_threshold,
            document = moby_dick_doc,
            iv_words = anal_dict['top_words'],
            max_iv = anal_dict['max_iv'],
            sum_iv = anal_dict['sum_iv']
        )
        odm_session.flush()
Example #9
0
def Asst2(text):
    raw_txt = gutenberg.raw(text)

    #deleting all newlines and spaces in the text
    split_txt = re.sub('(\n)+', '', raw_txt)
    split_txt = re.sub(' ', '', split_txt)
    #leaving only letters
    split_txt = "".join(re.findall("[a-zA-Z]+", split_txt))
    #making all letters to lower case
    split_txt = split_txt.lower()
    #counting all the letters
    counter = Counter(split_txt)

    #calculating the frequency of each letter and putting it into a Counter called prob_counter
    prob_counter = probability(counter)
    #making the prob_counter into an ordered list
    prob_counter_sorted = prob_counter.most_common()

    #making a bar plot of the frequency of each letter
    letter = []
    frequency = []
    letter, frequency = zip(*prob_counter_sorted)
    indices = np.arange(len(prob_counter_sorted))
    plt.bar(indices, frequency, color='b')
    plt.xticks(indices, letter, rotation='horizontal')
    plt.tight_layout()
    plt.show()
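The probability() helper used above is project-specific and not shown; presumably it turns raw letter counts into relative frequencies, along the lines of this hypothetical sketch:

from collections import Counter

def probability(counter):
    # hypothetical version of the helper assumed by the example above:
    # convert raw letter counts into relative frequencies
    total = float(sum(counter.values()))
    return Counter({letter: count / total for letter, count in counter.items()})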
Example #10
0
def convert_to_json_split(filename):
    try:
        input_txt = gut.raw(filename).split('\n')
        input_txt = [line for line in input_txt if line != ""]
        output_txt = input_txt[1:]
        raw_data = {'Input': input_txt[:-1], 'Output': output_txt}
        df = pd.DataFrame(raw_data, columns=['Input', 'Output'])

        train, test = train_test_split(df, test_size=0.25)
        valid, test = train_test_split(test, test_size=0.4)

        train.to_json(os.path.join(TRAIN_PATH,
                                   'train-{}.json'.format(filename)),
                      orient='records',
                      lines=True)
        test.to_json(os.path.join(TEST_PATH, 'test-{}.json'.format(filename)),
                     orient='records',
                     lines=True)
        valid.to_json(os.path.join(VALIDATION_PATH,
                                   'validation-{}.json'.format(filename)),
                      orient='records',
                      lines=True)

        print("Processed {}".format(filename))
        return df
    except Exception as e:
        print('Error {} occurred'.format(e))
        print('Failed to process {}'.format(filename))
Example #11
0
def gutenFreqListNoStop():
    # Obtain the list of words
    gutenberg_words = gutenberg.raw().split(' ')
    englishstop = stopwords.words('english')
    filtered_gutenberg_words = [
        w for w in gutenberg_words if not w in englishstop
    ]

    num_gutenberg_words = len(filtered_gutenberg_words)
    print "We have " + str(num_gutenberg_words) + " gutenberg filtered words"
    counter = 0

    gutenberg_frequ = defaultdict(int)

    sleep(2)
    for word in filtered_gutenberg_words:
        counter += 1
        gutenberg_frequ[word] += 1
        if counter % 1000 == 0:
            print "Progress : " + str(
                (counter / float(num_gutenberg_words)) * 100) + " %"

    gutenberg_frequ = sorted(gutenberg_frequ.values(), reverse=True)
    gutenberg_rank = np.array(xrange(1, len(gutenberg_frequ) + 1))

    c, alpha = powerLaw(gutenberg_frequ, gutenberg_rank)
    plotPowerLaws(
        gutenberg_rank,
        gutenberg_frequ, [c, c], [-1, -alpha],
        title=
        "Relation between word rank and frequency for gutenberg, no stop words",
        xlabel="Word Rank",
        ylabel="Word Frequency")

    return 0
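powerLaw() and plotPowerLaws() are project helpers that are not shown. A sketch of a power-law fit consistent with how powerLaw() is called above (frequency ≈ c · rank^(-alpha), fitted by linear regression in log-log space) could be:

import numpy as np

def powerLaw(frequencies, ranks):
    # hypothetical helper: fit frequency ~ c * rank**(-alpha) in log-log space
    slope, intercept = np.polyfit(np.log(ranks), np.log(frequencies), 1)
    return np.exp(intercept), -slope  # c, alpha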
Example #12
0
def get_austen_emma_sample():
    nlp = en_core_web_sm.load()
    emma = gutenberg.raw('austen-emma.txt')
    parsed_emma = nlp(emma)
    seed(181520)
    sample_size = 100
    my_sample = random.sample(list(parsed_emma.sents), sample_size)
    sample = []
    for sent in my_sample:
        sent = re.sub("\s+", " ", sent.text)
        sample.append(sent)

    entities = []
    type_entity = []
    sentences = []
    for sent in sample:
        parsed_sentence = nlp(sent)
        for ent in parsed_sentence.ents:
            if ent.text not in entities:
                entities.append(ent.text)
                sentences.append(sent)
                type_entity.append(ent.label_)
    Entities = pd.DataFrame({
        'Sentence': sentences,
        'Entity': entities,
        'Entity_type': type_entity
    })
    return Entities
Example #13
0
def demo():
    """LOAD DATA: load the data set."""
    # Sense and Sensibility by Jane Austen 1811
    text = gutenberg.raw('austen-sense.txt')
    print "Manual CLeaning : \n", cleaning(text)
    print "\nNLTK: Cleaning & Stemming : \n", cleaning_and_stemming(text)
Example #14
0
    def extractCorpus(self):
        # .raw() returns raw text in a strign format
        raw_text = gutenberg.raw(self.pos_ex_fn)
        # print(raw_text[:500])

        #removing text inside []
        text = re.sub("^\[.*\]", " ", raw_text)
        #print("text after removing brackets ....")
        #         print(text[:200])
        #removing VOLUME and Chapter nos.
        text = re.sub("\sVOLUME\s[A-Z]", " ", text)
        #         print("removing volume....")
        #         print(text[:500])
        text = re.sub("\sCHAPTER\s[A-Z]", " ", text)
        text = re.sub(r"--", " ", text)
        text = re.sub(r'\"', " ", text)
        #text = re.sub(r'[\"|\?\"|\.\"]'," ", text)
        text = re.sub(r'(?<=[MmSDsdr]){2}\.\s', ' ', text)
        text = re.sub(r'(?<=[MmSDsdr]){3}\.\s', ' ', text)
        text = re.sub(r'_.*_', ' ', text)

        # removing  multiple spaces
        text = re.sub(r"\s+", " ", text)

        sents = re.split(r'\.|\?', text)
        # sents = text.lower().split(".")
        #         print("sentences generated : ")
        #        print(sents[1:10])
        return sents
Example #15
0
def Main():
    db = Database()
    index = InvertedIndex(db)
    brown_list = brown.fileids()
    gutenberg_list = gutenberg.fileids()
    # document1 = {
    #     'id': '1',
    #     'text': 'The big sharks of Belgium drink beer.'
    # }
    # document2 = {
    #     'id': '2',
    #     'text': 'Belgium has great beer. They drink beer all the time.'
    # }
    i = 0
    for item in brown_list:
        documentTemp = {'id': str(i), 'text': brown.raw(item)}
        index.index_document(documentTemp)
        i += 1

    for item in gutenberg_list:
        documentTemp = {'id': str(i), 'text': gutenberg.raw(item)}
        index.index_document(documentTemp)
        i += 1

    while True:
        search_term = input("Enter term(s) to search: ")
        result = index.lookup_query(search_term.lower())
        for term in result.keys():
            for appearance in result[term]:
                # Belgium: { docId: 1, frequency: 1}
                document = db.get(appearance.docId)
                print(highlight_term(appearance.docId, term, document['text']))
            print("-----------------------------")
Example #16
0
def getNgramFreqDict(n, retrain=False):
    if not retrain:
        try:
            with open('data/%dgram_freq.json' % n) as fin:
                print('Trained frequency for n=%d found; Reading data...' % n)
                ngram_freq = json.load(fin)
            return ngram_freq
        except FileNotFoundError:
            pass

    print('Training frequency for n=%d...' % n)

    # using whole gutenberg corpus
    corpus = gutenberg.raw()
    corpus = re.sub('[^a-z. ]', ' ', corpus.lower())
    corpus = ' '.join(corpus.split())
    corpus_ngram = ngrams(corpus, n)

    ngram_freq = {}

    for gram in corpus_ngram:
        key = ''.join(gram)
        if key in ngram_freq:
            ngram_freq[key] += 1
        else:
            ngram_freq[key] = 1

    sum_count = sum([tup[1] for tup in ngram_freq.items()])
    for k in ngram_freq.keys():
        ngram_freq[k] = ngram_freq[k] / sum_count

    with open('data/%dgram_freq.json' % n, 'w') as fout:
        json.dump(ngram_freq, fout)

    return ngram_freq
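A possible way to use getNgramFreqDict(), assuming a data/ directory exists for the JSON cache:

# usage sketch: character trigram frequencies over the whole Gutenberg corpus
trigram_freq = getNgramFreqDict(3)
print(trigram_freq.get('the', 0.0))  # relative frequency of the trigram "the"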
Example #17
0
def getMLE(word):

    text = gutenberg.raw()

    words = getWords(text)

    unigramFreq = getFreqUnigram(words)

    bigramFreq = getFreqBigram(getBigram(words))

    tempDict = {}

    for key in bigramFreq:
        if key[0] == word:
            tempDict[key] = bigramFreq[key]

    mle = {}

    if bool(tempDict):
        sortedList = sorted(tempDict, key=tempDict.get, reverse=True)

        for i in range(0, 3):
            count = tempDict[sortedList[i]]

            prob = count / float(unigramFreq[word])

            mle[sortedList[i][1]] = prob

    return mle
Example #18
0
def extract_word_vectors(corpus):
    # Read in text
    text = gutenberg.raw(corpus)[:10000]

    # Extract one word and the following one
    tokenizer = Tokenizer()
    # Extracts sequences of text
    tokenizer.fit_on_texts([text])
    # Convert sequences of text to sequences of ints
    int_enc = tokenizer.texts_to_sequences([text])[0]

    # Store vocabulary length for embedding layer (+ 1 to encode longest word)
    vocab_len = len(tokenizer.word_index) + 1

    # Create word-word sequences
    sequences = list()
    for i in range(1, len(int_enc)):
        tmp = int_enc[i - 1:i + 1]
        sequences.append(tmp)

    # Split into first and second element of sequence
    sequences = array(sequences)
    X, y = sequences[:, :-1], sequences[:, -1]

    # Use Keras to_categorical() function to one-hot encode the output / second word
    y = to_categorical(y, num_classes=vocab_len)

    return [X, y, vocab_len, tokenizer]
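extract_word_vectors() returns training data for a next-word model, but no model is shown. A rough sketch of a network that could consume X and y, with illustrative layer sizes that are not taken from the original project, might be:

from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

X, y, vocab_len, tokenizer = extract_word_vectors('austen-emma.txt')

model = Sequential()
model.add(Embedding(vocab_len, 10, input_length=1))  # one input word per sample
model.add(Flatten())
model.add(Dense(vocab_len, activation='softmax'))    # predict the following word
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, epochs=1, verbose=0)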
Example #19
0
def getNgramFreqTree(n, retrain=False):
    filename = 'data/%dgram_tree.pickle' % n
    if not retrain:
        try:
            with open(filename, 'rb') as fin:
                print('Trained frequency for n=%d found; Reading data...' % n)
                ngramtree = pickle.load(fin)
            return ngramtree
        except FileNotFoundError:
            pass

    print('Training frequency tree for n=%d...' % n)

    ngramtree = NgramTree()

    corpus = gutenberg.raw()
    corpus = re.sub('[^a-z. ]', ' ', corpus.lower())
    corpus = ' '.join(corpus.split())
    corpus_ngram = ngrams(corpus, n)

    for gram in corpus_ngram:
        ngramtree.addGram(gram)

    ngramtree.addUp()
    ngramtree.normalize()

    with open(filename, 'wb') as fout:
        pickle.dump(ngramtree, fout)

    return ngramtree
Example #20
0
def load_text(filename):
    if filename is None or filename == '':
        text = gutenberg.raw(fileids='carroll-alice.txt')
    else:
        # read the user-supplied file
        with open(filename, 'r') as f:
            text = f.read()
    return text
Example #21
0
def load_sents():
    global sents
    default_st = nltk.sent_tokenize

    alice = gutenberg.raw(fileids='carroll-alice.txt')
    mobyd = gutenberg.raw(fileids='melville-moby_dick.txt')
    shak1 = gutenberg.raw(fileids='shakespeare-hamlet.txt')
    shak2 = gutenberg.raw(fileids='shakespeare-macbeth.txt')
    bbkjv = gutenberg.raw(fileids='bible-kjv.txt')

    alice_sentences = default_st(text=alice)
    mobyd_sentences = default_st(text=mobyd)
    shak1_sentences = default_st(text=shak1)
    shak2_sentences = default_st(text=shak2)
    bbkjv_sentences = default_st(text=bbkjv)

    sents = alice_sentences + mobyd_sentences + shak1_sentences + shak2_sentences + bbkjv_sentences
Example #22
0
File: main.py Project: kwdhd/nlp
def gutenberg():
    from nltk.corpus import gutenberg
    for t in gutenberg.fileids():
        num_chars = len(gutenberg.raw(t))
        num_words = len(gutenberg.words(t))
        num_sents = len(gutenberg.sents(t))
        num_vocab = len(set([w.lower() for w in gutenberg.words(t)]))
        print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), t
Example #23
0
    def clear_libs(self, MainWindow):
        self.textBrowser.clear()
        comboText = self.comboBox.currentText()

        for i in text.textDict:
            if comboText == i:
                rawText = gb.raw(text.textDict[i])
                self.textBrowser.append(rawText)
Example #25
0
def get_gutenberg_statistics():
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
        print(round(num_chars / num_words), round(num_words / num_sents),
              round(num_words / num_vocab), fileid)
Example #26
0
 def show_text(self, MainWindow):
     text.libd_storage = []
     self.textBrowser.clear()
     comboText = self.comboBox.currentText()
     for i in text.textDict:
         if comboText == i:
             rawText = gb.raw(text.textDict[i])
             self.textBrowser.append(rawText)
Example #27
0
def data_builder(file_id):
    d = gutenberg.raw(fileids=file_id)
    d_sentences = default_st(text=d)
    d_tuples = [nltk.pos_tag(default_wt(sentence)) for sentence in d_sentences]
    d_words = [[word[0] for word in sentence] for sentence in d_tuples]
    d_tags = [[word[1] for word in sentence] for sentence in d_tuples]
    d_len = len(d_sentences)
    return d_sentences, d_words, d_tags, d_len
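data_builder() relies on module-level default_st and default_wt tokenizers that are not shown; based on the pattern in Example #21, they are presumably:

import nltk

# tokenizers assumed by data_builder() above (default_st is defined the same
# way in Example #21; default_wt is presumed to be NLTK's word tokenizer)
default_st = nltk.sent_tokenize
default_wt = nltk.word_tokenize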
Example #28
0
def gutenberg_file_info():
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)
Example #29
0
def demo():
    """ LOAD DATA , veri setini yukleyelim.
        """
    # Sense and Sensibility by Jane Austen 1811
    text = gutenberg.raw('austen-sense.txt')
    sentences = sent_tokenize(text[:1000])
    modal = word_2_vec_with_gensim(sentences)
    print "Modal : ", modal
Example #30
0
 def handle(self, *args, **options):
     for fileid in gutenberg.fileids():
         out_dir = CORPUS_DIR + os.sep + fileid.replace(".txt", "")
         if not os.path.isdir(out_dir):
             os.makedirs(out_dir)
         f = open(out_dir + os.sep + "sentences.txt", 'w')
         f.write(gutenberg.raw(fileid))
         f.close()
Example #31
0
def get_training_text():
    text = ""

    nltk.download('gutenberg')

    for file_id in gutenberg.fileids():
        text += gutenberg.raw(file_id)

    return text
Example #32
0
   def getSentences(self):
      if self.category == "novel":
         sentences = gutenberg.raw(gutenberg.fileids()[0])
         sentences = sentences.split('\n')

      elif self.category == "news":
         sentences = brown.sents(categories='news')

      return sentences
Example #33
0
def structure():

    raw = gutenberg.raw("burgess-busterbrown.txt")
    raw[1:20]

    words = gutenberg.words("burgess-busterbrown.txt")
    words[1:20]

    sents = gutenberg.sents("burgess-busterbrown.txt")
    sents[1:20]
Example #34
0
def similarity_gutenberg():
    for x in range(2,6):
        a = []
        b = 0
        c = 0
        d = 1

        for fid in gutenberg.fileids():
            a.append([])
            for ffid in gutenberg.fileids():
                a[b].append(Jaccard(n_window(gutenberg.raw(fid), x), n_window(gutenberg.raw(ffid), x)))
            b += 1

        for i in range(len(a)):
            for j in range(len(a)):
                c += a[i][j] / (len(a) * len(a))
                d = min(d, a[i][j])
        print("Media: "+ str(c))
        print("Minimo: "+ str(d))
Example #35
0
def generate_tokens(titles):
    corpus = []
    for title in titles:
        novel: str = gutenberg.raw(title)
        novel = novel.strip()
        novel = novel.lower()
        novel = re.sub('\W+', ' ', novel)
        words = novel.split(' ')
        corpus.extend(words)
    return corpus
Example #36
0
File: ch02.py Project: gree2/hobby
def fun02():
    """fun02"""
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        # average word length average sentence length
        print int(num_chars/num_words), int(num_words/num_sents),
        # number of times each vocabulary item appers in the text
        print int(num_words/num_vocab), fileid
Example #37
0
def page57():
    """Statistics from the Gutenberg corpora"""
    from nltk.corpus import gutenberg

    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars / num_words), int(num_words / num_sents),
        print int(num_words / num_vocab), fileid
Example #38
0
def for_print():
    '''
    Display three statistics for each text
    :return:
    '''
    for fileid in gutenberg.fileids():
        num_chars=len(gutenberg.raw(fileid))
        num_words=len(gutenberg.words(fileid))
        num_sents=len(gutenberg.sents(fileid))
        num_vocab=len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars/num_words),int(num_words/num_sents),int(num_words/num_vocab),fileid
Example #39
0
def preprocessing_text_file(input_file):


    train_text = gutenberg.raw(input_file)
    sample_text = gutenberg.raw(input_file)
    #### understand input type and content
    #print(train_text)
    #print("=====================")
    #print(type(train_text))
    #### unicode text 

    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    #### show chunked result
    #print("\n\n".join(tokenized))
    #print("=================")
    #print(type(tokenized))
    #### list of sentences (separated by ".")

    return tokenized
Example #40
0
def solve_p2_greedy(file):
  lines = [l.lower().split("|")[1:-1] for l in open(file)]
  slices = slice(lines)

  n = 3
  corpus = NgramLetterCorpus(n)
  for fileid in gutenberg.fileids()[:3]:
    corpus.update(gutenberg.raw(fileid))

  slices = unshred3(slices, corpus)
  print "FINAL: "
  for l in linearize(slices):
    print "".join(l)
Example #41
0
 def test_moby_dick_window(self):
     #just make sure we
     window_sizes = xrange(100, 6000, 100)
     text = gutenberg.raw('melville-moby_dick.txt')
     tokens = tokenize(text, only_alphanum=True, clean_punctuation=True)
     total_number_of_tokens = len(tokens)
     for window_size in window_sizes:
         count = 0
         number_of_windows = int(math.ceil( total_number_of_tokens / window_size))
         for current_window in range(0, number_of_windows+1):
             word_window = Window(tokens, window_size, current_window)
             for word in word_window:
                 count += 1
         self.assertEquals(count, total_number_of_tokens)
Example #42
0
    def benchmark_sbd():
        ps = []
        rs = []
        f1s = []
        c = 0
        for fileid in gutenberg.fileids():
            c += 1
            copy_sents_gold = gutenberg.sents(fileid)
            sents_gold = [s for s in copy_sents_gold]
            for sent_i in range(len(sents_gold)):
                new_sent = [w for w in sents_gold[sent_i] if w.isalpha()]
                sents_gold[sent_i] = new_sent
            text = gutenberg.raw(fileid)
            sents_obtained = split_text(text)
            copy_sents_obtained = sents_obtained.copy()
            for sent_i in range(len(sents_obtained)):
                new_sent = [w.group()
                            for w in re.finditer(r'\w+', sents_obtained[sent_i])
                            if w.group().isalpha()]
                sents_obtained[sent_i] = new_sent
            c_common = 0
            for sent in sents_obtained:
                if sent in  sents_gold:
                    c_common += 1
            p, r, f1 = get_prf(c_common, len(sents_obtained), len(sents_gold))
            print('\n\n', fileid)
            print('Precision: {:0.2f}, Recall: {:0.2f}, F1: {:0.2f}'.format(p, r, f1))
            ps.append(p)
            rs.append(r)
            f1s.append(f1)

        print('\n\nPrecision stats: {:0.3f} +- {:0.4f}'.format(np.mean(ps),
                                                           np.std(ps)))
        print('Recall stats: {:0.3f} +- {:0.4f}'.format(np.mean(rs),
                                                        np.std(rs)))
        print('F1 stats: {:0.3f} +- {:0.4f}'.format(np.mean(f1s),
                                                    np.std(f1s)))
        print(len(f1s))

        good_ps = [p for p in ps if p >= 0.8]
        good_rs = [r for r in rs if r >= 0.8]
        good_f1s = [f1 for f1 in f1s if f1 >= 0.8]
        print('\n Good precision stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_ps),
                                                           np.std(good_ps)))
        print('Good Recall stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_rs),
                                                        np.std(good_rs)))
        print('Good F1 stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_f1s),
                                                    np.std(good_f1s)))
        print(len(good_f1s))
Example #43
0
def access():

    monty[0]
    monty[3]
    monty[5]
    monty[-1]

    sent = 'colorless green ideas sleep furiously'
    for char in sent:
        print char,

    from nltk.corpus import gutenberg
    raw = gutenberg.raw('melville-moby_dick.txt')
    fdist = nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())
    fdist.keys()
Example #44
0
def load_hamlet():
    """
    Loads the contents of the play Hamlet into a string.

    Returns
    -------
    str
        The one big, raw, unprocessed string.

    Example
    -------
    >>> document = load_hamlet()
    >>> document[:80]
    '[The Tragedie of Hamlet by William Shakespeare 1599]\n\n\nActus Primus. Scoena Prim'
    """
    return gutenberg.raw("shakespeare-hamlet.txt")
Example #45
0
def mean_len():
    a = []
    d = 1

    for fid in gutenberg.fileids():
        b = 0
        c = 0
        st = gutenberg.raw(fid)
        stl = re.split("\n|\.|\!|\?", st)
        stw = re.split("\n|\.|\!|\?| |,| - ", st)
        for el in stl:
            b += len(el)*(1.0)/len(stl)
        for el in stw:
            c += len(el)*(1.0)/len(stw)
        print(fid)
        print("Media Frases: "+ str(b))
        print("Media Palavras: "+ str(c))
Example #46
0
def get_moby_dick_document():
    moby_dick = gutenberg.raw('melville-moby_dick.txt')
    document = Document(
        url = 'melville-moby_dick.txt',
        name = 'Moby dick',
        text = moby_dick,
        month = 'Oct',
        year = 1851
    )
    # document uses tokenizer func for create tokens, since we need to enforce
    # only_alphanum and clean_punct we need a wrapper
    def tokenizer_wrapper(raw_text):
        return map(str.lower, map(str, tokenize(raw_text, only_alphanum=True, clean_punctuation=True)))
    document.tokenizer = tokenizer_wrapper

    odm_session.flush()

    return document
Example #47
0
def gutenberg():
    from nltk.corpus import gutenberg

    emma = nltk.corpus.gutenberg.words('austen-emma.txt')
    print len(emma)

    print gutenberg.fileids()
    emma = gutenberg.words('austen-emma.txt')

    macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
    macbeth_sentences[1037]
    longest_len = max([len(s) for s in macbeth_sentences])
    [s for s in macbeth_sentences if len(s) == longest_len]

    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid
Example #48
0
def sentenceTokenization():

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    mySentenceTokenizer = nltk.sent_tokenize

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    sample_text = 'We will discuss briefly about the basic syntax, structure and design philosophies. There is a defined hierarchical syntax for Python code which you should remember when writing code! Python is a really powerful programming language!'

    sentences_sample = mySentenceTokenizer(text = sample_text)

    print( '\nTotal number of sentences in sample_text: ' + str(len(sentences_sample)) )
    print( '\nSample sentences:' )
    print( sentences_sample )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    alice = gutenberg.raw(fileids = 'carroll-alice.txt')
    print( "\n### len(alice), total number of characters: " + str(len(alice)) )
    print( "\n### First 1000 characters of carroll-alice.txt:\n" )
    print( alice[0:1000] )

    sentences_alice  = mySentenceTokenizer(text = alice)
    print( '\nTotal number of sentences in Alice: ' + str(len(sentences_alice)) )
    print( '\nFirst 5 sentences in Alice:' )
    for temp_sentence in sentences_alice[0:5]:
        print( "\n### ~~~~~~~~~~ ###\n" + temp_sentence )
    print( "\n### ~~~~~~~~~~ ###" )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    text_german = europarl_raw.german.raw(fileids = "ep-00-01-17.de")
    print( "\n### len(German text), total number of characters: " + str(len(text_german)) )
    print( "\n### First 1000 characters of ep-00-01-17.de (German text):\n" )
    print( text_german[0:1000] )

    sentences_german = mySentenceTokenizer(text = text_german, language = "german")
    print( '\nTotal number of sentences in German text: ' + str(len(sentences_german)) )
    print( '\nFirst 5 sentences in German text:' )
    for temp_sentence in sentences_german[0:5]:
        print( "\n### ~~~~~~~~~~ ###\n" + temp_sentence )
    print( "\n### ~~~~~~~~~~ ###" )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return( None )
Example #49
0
def create_random_statements(count=50):
    """
    This function scans the ``nltk`` Project Gutenberg dataset, extracts random
    sentences containing some form of "it is" and tags them with a random tag.
    NB: This thing can take a while.
    """
    created_count = 0
    tags = Tag.objects.order_by("?")
    gutenberg_files = gutenberg.fileids()
    random.shuffle(gutenberg_files)
    for file_name in gutenberg_files:
        exists, not_exists = extract.from_text(gutenberg.raw(file_name))
        for sentence in [_linebreak.sub(" ", s) for s in exists]:
            if created_count == count:
                break
            statement = Statement(text=sentence, tag=random.choice(tags))
            try:
                statement.save()
                created_count += 1
                transaction.commit()
            except IntegrityError:
                transaction.rollback()
Example #50
0
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize

sample_text = gutenberg.raw('bible-kjv.txt')
tok = sent_tokenize(sample_text)
print(tok[5:15])
Example #51
0
import nltk
from nltk import FreqDist
from nltk.corpus import gutenberg
import json
import csv


print "* Loading corpus"
#raw = gutenberg.raw('melville-moby_dick.txt')
#raw = gutenberg.raw('bible-kjv.txt')
raw = gutenberg.raw('blake-poems.txt')
print "* Tokenizing"
tokens = nltk.word_tokenize(raw)

print "* Tagging parts of speech"
# Save this to strip articles later
parts_of_speech = nltk.pos_tag(tokens)

print "* Converting POS list into a dict for lookup"
# TODO -- fix this.  this is going to f**k up on homonyms
parts_of_speech = dict(parts_of_speech)

# You can ban other parts of speech by adding their tags to this list.
# You can find out what the part-of-speech tags mean by using code like
# this:
# >>> print nltk.help.upenn_tagset('DT')
# DT: determiner
#     all an another any both del each either every half la many much nary
#     neither no some such that the them these this those
banned_parts_of_speech = [
    'DT',
Example #52
0
sorted([w for w in set(text1) if w.endswith('ableness')])
[w.upper() for w in text1]

for word in ['Call', 'me', 'Ishmael', '.']:
    print word

# Access the corpus
nltk.corpus.gutenberg.fileids()
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
emma.concordance("surprize")
from nltk.corpus import gutenberg
gutenberg.fileids()

for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid

# Sentence segmentation
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
longest_len = max([len(s) for s in macbeth_sentences])
# Web chat corpora
from nltk.corpus import webtext
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]
from nltk.corpus import brown
brown.categories()
Example #53
0
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize
from pprint import pprint as pp

sample = gutenberg.raw('bible-kjv.txt')

sentences = sent_tokenize(sample)

pp(sentences[0:10])
Example #54
0
import nltk
from nltk.corpus import gutenberg
from nltk.probability import FreqDist
from nltk.util import ngrams
from nltk import sent_tokenize, word_tokenize
from operator import itemgetter

text = gutenberg.raw('chesterton-thursday.txt')
nltk_sents = sent_tokenize(text)						# contains the list of sentences detected from the tool
nltk_words = word_tokenize(text)
tokens = nltk_words
uni_freq = FreqDist(tokens)
x = FreqDist()
y = FreqDist()
Bigram_count = 0
for line in nltk_sents:
	w = word_tokenize(line)
	for window in ngrams(w,3,pad_right=True):
		p = window[0]
		if p is None:
			continue
		for p1 in window[1:]:
			if p1 is not None:
				Bigram_count = Bigram_count +1
				x[p,p1] = x[p,p1]+1
				y[p] = y[p]+1
				y[p1] = y[p1]+1 

ct = 0
coll = []
for k,v in x.items():
Example #55
0
import nltk
import math
from nltk.corpus import gutenberg
from pattern.en import *


text = gutenberg.raw('austen-emma.txt')
#pprint(parse(text,chunks = False, tags = False).split())
pattern_words = parse(text,chunks = False, tags = False).split()
pattern_sent = tokenize(text)
#print pattern_words
tokens = pattern_words
l = []
for token in tokens:
	for i in token:
		for j in i:
			l.append(j.lower())
tokens = l
tokens = [token.lower() for token in tokens if len(token) > 1]
#dictn=list(set(tokens))
r = ' '.join(tokens)
dictn=list(set(tokens))

uni_tokens = ngrams(r,n = 1)
bi_tokens =  ngrams(r, n = 2)
tri_tokens =  ngrams(r, n = 3)

uni_fdist = nltk.FreqDist(uni_tokens)


uni_freq = 0
Example #56
0
#!/usr/bin/python

"""Just a testing program for NLTK library. It is a NLP library for Python. Some kick-ass library this is. :)
	Pre-Requisites: NLTK Library installed, And Download additional data for the library using it's command.
	You can use "Natural Language Processing with Python" book from O'Reilley Publications for further details.
	This program prints some statistics for the Corpus(a large compiled collection of text files). """

import nltk
from nltk.corpus import gutenberg

for fid in gutenberg.fileids():
	nchars = len(gutenberg.raw(fid))
	nwords = len(gutenberg.words(fid))
	nsents = len(gutenberg.sents(fid))
	nvocab = len(set(w.lower() for w in gutenberg.words(fid)))
	print "%s %s %s %s" % (int(nchars / nwords), int(nwords / nsents), int(nwords / nvocab), fid)
Example #57
0
File: NLP.py Project: Toma-L/NLP
import nltk
nltk.corpus.gutenberg.fileids()
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
len(emma)
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
emma.concordance('surprize')

#another way to do this

from nltk.corpus import gutenberg
gutenberg.fileids()
emma = gutenberg.words('austen-emma.txt')

for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print (int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab)) #avg word & sentence length and the diversity of words

macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
macbeth_sentences #load sentences of Macbeth
macbeth_sentences[1037]
longest_len = max([len(s) for s in macbeth_sentences])
[s for s in macbeth_sentences if len(s) == longest_len] #find longest sentence

from nltk.corpus import webtext
for fileid in webtext.fileids():
    print (fileid, webtext.raw(fileid)[:65], '...')
Example #58
0
#Lemmatizing

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("cats")) #cat
print(lemmatizer.lemmatize("cacti")) #cactus
print(lemmatizer.lemmatize("geese")) #goose
print(lemmatizer.lemmatize("python")) #python
print(lemmatizer.lemmatize("better",pos="a")) #good
print(lemmatizer.lemmatize("run",'v')) #run
#Importing any file from nltk.data

from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize
sample = gutenberg.raw("bible-kjv.txt")
tok = sent_tokenize(sample)

#using wordnet to get synonyms,meanings,examples and antonyms of words

from nltk.corpus import wordnet
syns = wordnet.synsets("program")

print(syns) #will give all the matching synsets
print(syns[0].lemmas()[0].name()) #will give the first synonym.
print(syns[0].definition()) #will give the dictionary meaning of the synonym.
print(syns[0].examples()) #will give some examples of sentences using that synonyms.

synonyms = []
antonyms = []