Example #1
0
def test_austen():
    from nltk.data import load
    from nltk.corpus import gutenberg as g
    stok = load('tokenizers/punkt/english.pickle')
    train = [[w for w in tokenize(preprocess(sent))]
             for sent in stok.tokenize(g.raw('austen-emma.txt'))]
    test1 = [[w for w in tokenize(preprocess(sent))]
             for sent in stok.tokenize(g.raw('austen-sense.txt'))]
    test2 = [[w for w in tokenize(preprocess(sent))]
             for sent in stok.tokenize(g.raw('austen-persuasion.txt'))]

    model1 = AdditiveSmoothing(n=2)
    model1.generate_model(train)
    print 'cross entropy additive smoothing:'
    print 'emma to sense&sensibility: %0.8f' % cross_entropy(model1, test1)
    print 'emma to persuasion: %0.8f' % cross_entropy(model1, test2)

    model2 = KnesserNey(n=2)
    model2.generate_model(train)
    print 'cross entropy knesser-ney smoothing:'
    print 'emma to sense&sensibility: %0.8f' % cross_entropy(model2, test1)
    print 'emma to persuasion: %0.8f' % cross_entropy(model2, test2)

    model3 = SimpleGoodTuring(n=2)
    model3.generate_model(train)
    print 'cross entropy simple good-turing smoothing:'
    print 'emma to sense&sensibility: %0.8f' % cross_entropy(model3, test1)
    print 'emma to persuasion: %0.8f' % cross_entropy(model3, test2)

    model4 = KatzSmoothing(n=2)
    model4.generate_model(train)
    print 'cross entropy katz smoothing:'
    print 'emma to sense&sensibility: %0.8f' % cross_entropy(model4, test1)
    print 'emma to persuasion: %0.8f' % cross_entropy(model4, test2)
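This example depends on helpers from its own project (preprocess, tokenize, cross_entropy and the smoothing classes) that are not shown here. As a rough sketch only, a bigram cross-entropy helper compatible with the calls above might look like the following, assuming each model exposes a hypothetical prob(word, context) method:

import math

def cross_entropy(model, sents):
    # average negative log2 probability per bigram (sketch; prob() is an assumption)
    log_prob, count = 0.0, 0
    for sent in sents:
        for w1, w2 in zip(sent, sent[1:]):
            log_prob += math.log(model.prob(w2, w1), 2)
            count += 1
    return -log_prob / count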
Example #3
0
def exercise_gutenberg():
    # Print the file list of the Project Gutenberg corpus
    print gutenberg.fileids()

    # Pick one text: Jane Austen's "Emma"
    emma = gutenberg.words("austen-emma.txt")

    # Check the length of the book
    print len(emma)

    # Load the text
    emma_text = nltk.Text(emma)
    emma_text.concordance("surprize")

    for file_id in gutenberg.fileids():
        chars_list = gutenberg.raw(file_id)
        words_list = gutenberg.words(file_id)
        sents_list = gutenberg.sents(file_id)

        # Total number of characters in the file
        num_chars = len(chars_list)
        # Total number of words in the file
        num_words = len(words_list)
        # Total number of sentences in the file
        num_sents = len(sents_list)
        # Number of distinct words in the file
        num_vocab = len(set([w.lower() for w in words_list]))
        # Print average word length, average sentence length,
        # average number of occurrences per word, and the file name
        print num_chars / num_words, num_words / num_sents, num_words / num_vocab, file_id
Example #4
0
def gutenberg():
    from nltk.corpus import gutenberg
    file_ids = get_fileids(gutenberg)

    # average characters in a word: raw/words
    # average word in a sentence: words/sents
    # lexical diversity - num_words/num_vocab

    for fileid in file_ids:
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars / num_words), int(num_words / num_sents), int(
            num_words / num_vocab), fileid

    emma = gutenberg.words('austen-emma.txt')
    emma_len = len(emma)
    # print 'percentage', percentage(text1.count('monstrous'), len(text1))

    macbeth_sents = gutenberg.sents('shakespeare-macbeth.txt')
    macbeth_longest_len = max([len(s) for s in macbeth_sents])
    macbeth_longest_sent = [
        s for s in macbeth_sents if len(s) == macbeth_longest_len
    ]

    return render_template('gutenberg.html',
                           file_ids=file_ids,
                           emma=emma,
                           emma_len=emma_len,
                           macbeth_longest_sent=macbeth_longest_sent)
Example #5
0
def get_text_chars(file):
    _text = ''
    for txt in file:
        if 'shakespeare' in txt:
            _text += gutenberg.raw(txt).lower()
    _chars = sorted(list(set(_text)))
    return _chars, _text
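A small usage sketch for get_text_chars(), assuming gutenberg has been imported from nltk.corpus as the function body itself requires:

from nltk.corpus import gutenberg

# character vocabulary of the Shakespeare texts in the Gutenberg corpus
chars, text = get_text_chars(gutenberg.fileids())
print(len(chars))  # number of distinct characters
print(chars[:20])  # first few characters in sorted order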
Example #6
0
def test():

    from nltk.corpus import gutenberg
    emma = gutenberg.raw('austen-emma.txt')
    print len(emma)
    ex = createexercise(emma, pos='v', last_index=False, fast=True)
    print len(ex)
Example #7
0
def exercise_gutenberg():
    # Print the file list of the Project Gutenberg corpus
    print(gutenberg.fileids())

    # Pick one text: Jane Austen's "Emma"
    emma = gutenberg.words("austen-emma.txt")

    # Check the length of the book
    print(len(emma))

    # Load the text
    emma_text = nltk.Text(emma)
    emma_text.concordance("surprize")

    for file_id in gutenberg.fileids():
        chars_list = gutenberg.raw(file_id)
        words_list = gutenberg.words(file_id)
        sents_list = gutenberg.sents(file_id)

        # Total number of characters in the file
        num_chars = len(chars_list)
        # Total number of words in the file
        num_words = len(words_list)
        # Total number of sentences in the file
        num_sents = len(sents_list)
        # Number of distinct words in the file
        num_vocab = len(set([w.lower() for w in words_list]))
        # Print average word length, average sentence length,
        # average number of occurrences per word, and the file name
        print(num_chars / num_words, num_words / num_sents,
              num_words / num_vocab, file_id)
Example #8
0
def load_moby_dick_analysis():
    
    tokens = get_moby_dick_tokens()
    text = gutenberg.raw('melville-moby_dick.txt')
    try:
        moby_dick_doc = Document(
            url='gutenberg',
            name='moby dick',
            text=text,
            month='Jan',
            year='1851'
            )
        odm_session.flush()
    except DuplicateKeyError:
        moby_dick_doc = Document.query.get(name='moby dick')

    for sum_threshold in sum_thresholds:
        log.info("Trying analysis for threshold = %s" % sum_threshold)
        analysis = get_optimal_window_size(tokens, window_sizes, 20, sum_threshold=sum_threshold)[1]
        anal_dict = analysis.encode()
        window_size = anal_dict['window_size']

        log.debug("Best result = %s" % window_size)
        InformationValueResult(
            window_size = window_size,
            threshold = sum_threshold,
            document = moby_dick_doc,
            iv_words = anal_dict['top_words'],
            max_iv = anal_dict['max_iv'],
            sum_iv = anal_dict['sum_iv']
        )
        odm_session.flush()
Example #9
0
def Asst2(text):
    raw_txt = gutenberg.raw(text)

    #deleting all newlines and spaces in the text
    split_txt = re.sub('(\n)+', '', raw_txt)
    split_txt = re.sub(' ', '', split_txt)
    #leaving only letters
    split_txt = "".join(re.findall("[a-zA-Z]+", split_txt))
    #making all letters to lower case
    split_txt = split_txt.lower()
    #counting all the letters
    counter = Counter(split_txt)

    #calculating the frequency of each letter and putting it into a Counter called prob_counter
    prob_counter = probability(counter)
    #making the prob_counter into an ordered list
    prob_counter_sorted = prob_counter.most_common()

    #making a bar plot of the frequency of each letter
    letter = []
    frequency = []
    letter, frequency = zip(*prob_counter_sorted)
    indices = np.arange(len(prob_counter_sorted))
    plt.bar(indices, frequency, color='b')
    plt.xticks(indices, letter, rotation='horizontal')
    plt.tight_layout()
    plt.show()
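The probability() helper used above is project-specific and not shown; presumably it turns raw letter counts into relative frequencies, along the lines of this hypothetical sketch:

from collections import Counter

def probability(counter):
    # hypothetical version of the helper assumed by the example above:
    # convert raw letter counts into relative frequencies
    total = float(sum(counter.values()))
    return Counter({letter: count / total for letter, count in counter.items()})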
Example #10
0
def convert_to_json_split(filename):
    try:
        input_txt = gut.raw(filename).split('\n')
        input_txt = [line for line in input_txt if line != ""]
        output_txt = input_txt[1:]
        raw_data = {'Input': input_txt[:-1], 'Output': output_txt}
        df = pd.DataFrame(raw_data, columns=['Input', 'Output'])

        train, test = train_test_split(df, test_size=0.25)
        valid, test = train_test_split(test, test_size=0.4)

        train.to_json(os.path.join(TRAIN_PATH,
                                   'train-{}.json'.format(filename)),
                      orient='records',
                      lines=True)
        test.to_json(os.path.join(TEST_PATH, 'test-{}.json'.format(filename)),
                     orient='records',
                     lines=True)
        valid.to_json(os.path.join(VALIDATION_PATH,
                                   'validation-{}.json'.format(filename)),
                      orient='records',
                      lines=True)

        print("Processed {}".format(filename))
        return df
    except Exception as e:
        print('Error {} occurred'.format(e))
        print('Failed to process {}'.format(filename))
Example #11
0
def gutenFreqListNoStop():
    # Obtain the list of words
    gutenberg_words = gutenberg.raw().split(' ')
    englishstop = stopwords.words('english')
    filtered_gutenberg_words = [
        w for w in gutenberg_words if not w in englishstop
    ]

    num_gutenberg_words = len(filtered_gutenberg_words)
    print "We have " + str(num_gutenberg_words) + " gutenberg filtered words"
    counter = 0

    gutenberg_frequ = defaultdict(int)

    sleep(2)
    for word in filtered_gutenberg_words:
        counter += 1
        gutenberg_frequ[word] += 1
        if counter % 1000 == 0:
            print "Progress : " + str(
                (counter / float(num_gutenberg_words)) * 100) + " %"

    gutenberg_frequ = sorted(gutenberg_frequ.values(), reverse=True)
    gutenberg_rank = np.array(xrange(1, len(gutenberg_frequ) + 1))

    c, alpha = powerLaw(gutenberg_frequ, gutenberg_rank)
    plotPowerLaws(
        gutenberg_rank,
        gutenberg_frequ, [c, c], [-1, -alpha],
        title=
        "Relation between word rank and frequency for gutenberg, no stop words",
        xlabel="Word Rank",
        ylabel="Word Frequency")

    return 0
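powerLaw() and plotPowerLaws() are project helpers that are not shown. A sketch of a power-law fit consistent with how powerLaw() is called above (frequency ≈ c · rank^(-alpha), fitted by linear regression in log-log space) could be:

import numpy as np

def powerLaw(frequencies, ranks):
    # hypothetical helper: fit frequency ~ c * rank**(-alpha) in log-log space
    slope, intercept = np.polyfit(np.log(ranks), np.log(frequencies), 1)
    return np.exp(intercept), -slope  # c, alpha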
Example #12
0
def get_austen_emma_sample():
    nlp = en_core_web_sm.load()
    emma = gutenberg.raw('austen-emma.txt')
    parsed_emma = nlp(emma)
    seed(181520)
    sample_size = 100
    my_sample = random.sample(list(parsed_emma.sents), sample_size)
    sample = []
    for sent in my_sample:
        sent = re.sub("\s+", " ", sent.text)
        sample.append(sent)

    entities = []
    type_entity = []
    sentences = []
    for sent in sample:
        parsed_sentence = nlp(sent)
        for ent in parsed_sentence.ents:
            if ent.text not in entities:
                entities.append(ent.text)
                sentences.append(sent)
                type_entity.append(ent.label_)
    Entities = pd.DataFrame({
        'Sentence': sentences,
        'Entity': entities,
        'Entity_type': type_entity
    })
    return Entities
Example #13
0
def demo():
    """LOAD DATA: load the data set."""
    # Sense and Sensibility by Jane Austen 1811
    text = gutenberg.raw('austen-sense.txt')
    print "Manual CLeaning : \n", cleaning(text)
    print "\nNLTK: Cleaning & Stemming : \n", cleaning_and_stemming(text)
Example #14
0
    def extractCorpus(self):
        # .raw() returns raw text in a strign format
        raw_text = gutenberg.raw(self.pos_ex_fn)
        # print(raw_text[:500])

        #removing text inside []
        text = re.sub("^\[.*\]", " ", raw_text)
        #print("text after removing brackets ....")
        #         print(text[:200])
        #removing VOLUME and Chapter nos.
        text = re.sub("\sVOLUME\s[A-Z]", " ", text)
        #         print("removing volume....")
        #         print(text[:500])
        text = re.sub("\sCHAPTER\s[A-Z]", " ", text)
        text = re.sub(r"--", " ", text)
        text = re.sub(r'\"', " ", text)
        #text = re.sub(r'[\"|\?\"|\.\"]'," ", text)
        text = re.sub(r'(?<=[MmSDsdr]){2}\.\s', ' ', text)
        text = re.sub(r'(?<=[MmSDsdr]){3}\.\s', ' ', text)
        text = re.sub(r'_.*_', ' ', text)

        # removing  multiple spaces
        text = re.sub(r"\s+", " ", text)

        sents = re.split(r'\.|\?', text)
        # sents = text.lower().split(".")
        #         print("sentences generated : ")
        #        print(sents[1:10])
        return sents
Example #15
0
def Main():
    db = Database()
    index = InvertedIndex(db)
    brown_list = brown.fileids()
    gutenberg_list = gutenberg.fileids()
    # document1 = {
    #     'id': '1',
    #     'text': 'The big sharks of Belgium drink beer.'
    # }
    # document2 = {
    #     'id': '2',
    #     'text': 'Belgium has great beer. They drink beer all the time.'
    # }
    i = 0
    for item in brown_list:
        documentTemp = {'id': str(i), 'text': brown.raw(item)}
        index.index_document(documentTemp)
        i += 1

    for item in gutenberg_list:
        documentTemp = {'id': str(i), 'text': gutenberg.raw(item)}
        index.index_document(documentTemp)
        i += 1

    while True:
        search_term = input("Enter term(s) to search: ")
        result = index.lookup_query(search_term.lower())
        for term in result.keys():
            for appearance in result[term]:
                # Belgium: { docId: 1, frequency: 1}
                document = db.get(appearance.docId)
                print(highlight_term(appearance.docId, term, document['text']))
            print("-----------------------------")
Example #16
0
def getNgramFreqDict(n, retrain=False):
    if not retrain:
        try:
            with open('data/%dgram_freq.json' % n) as fin:
                print('Trained frequency for n=%d found; Reading data...' % n)
                ngram_freq = json.load(fin)
            return ngram_freq
        except FileNotFoundError:
            pass

    print('Training frequency for n=%d...' % n)

    # using whole gutenberg corpus
    corpus = gutenberg.raw()
    corpus = re.sub('[^a-z. ]', ' ', corpus.lower())
    corpus = ' '.join(corpus.split())
    corpus_ngram = ngrams(corpus, n)

    ngram_freq = {}

    for gram in corpus_ngram:
        key = ''.join(gram)
        if key in ngram_freq:
            ngram_freq[key] += 1
        else:
            ngram_freq[key] = 1

    sum_count = sum([tup[1] for tup in ngram_freq.items()])
    for k in ngram_freq.keys():
        ngram_freq[k] = ngram_freq[k] / sum_count

    with open('data/%dgram_freq.json' % n, 'w') as fout:
        json.dump(ngram_freq, fout)

    return ngram_freq
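A possible way to use getNgramFreqDict(), assuming a data/ directory exists for the JSON cache:

# usage sketch: character trigram frequencies over the whole Gutenberg corpus
trigram_freq = getNgramFreqDict(3)
print(trigram_freq.get('the', 0.0))  # relative frequency of the trigram "the"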
Example #17
0
def getMLE(word):

    text = gutenberg.raw()

    words = getWords(text)

    unigramFreq = getFreqUnigram(words)

    bigramFreq = getFreqBigram(getBigram(words))

    tempDict = {}

    for key in bigramFreq:
        if key[0] == word:
            tempDict[key] = bigramFreq[key]

    mle = {}

    if bool(tempDict):
        sortedList = sorted(tempDict, key=tempDict.get, reverse=True)

        for i in range(0, 3):
            count = tempDict[sortedList[i]]

            prob = count / float(unigramFreq[word])

            mle[sortedList[i][1]] = prob

    return mle
Example #18
0
def extract_word_vectors(corpus):
    # Read in text
    text = gutenberg.raw(corpus)[:10000]

    # Extract one word and the following one
    tokenizer = Tokenizer()
    # Extracts sequences of text
    tokenizer.fit_on_texts([text])
    # Convert sequences of text to sequences of ints
    int_enc = tokenizer.texts_to_sequences([text])[0]

    # Store vocabulary length for embedding layer (+ 1 to encode longest word)
    vocab_len = len(tokenizer.word_index) + 1

    # Create word-word sequences
    sequences = list()
    for i in range(1, len(int_enc)):
        tmp = int_enc[i - 1:i + 1]
        sequences.append(tmp)

    # Split into first and second element of sequence
    sequences = array(sequences)
    X, y = sequences[:, :-1], sequences[:, -1]

    # Use Keras to_categorical() function to one-hot encode the output / second word
    y = to_categorical(y, num_classes=vocab_len)

    return [X, y, vocab_len, tokenizer]
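extract_word_vectors() returns training data for a next-word model, but no model is shown. A rough sketch of a network that could consume X and y, with illustrative layer sizes that are not taken from the original project, might be:

from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

X, y, vocab_len, tokenizer = extract_word_vectors('austen-emma.txt')

model = Sequential()
model.add(Embedding(vocab_len, 10, input_length=1))  # one input word per sample
model.add(Flatten())
model.add(Dense(vocab_len, activation='softmax'))    # predict the following word
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, epochs=1, verbose=0)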
Example #19
0
def getNgramFreqTree(n, retrain=False):
    filename = 'data/%dgram_tree.pickle' % n
    if not retrain:
        try:
            with open(filename, 'rb') as fin:
                print('Trained frequency for n=%d found; Reading data...' % n)
                ngramtree = pickle.load(fin)
            return ngramtree
        except FileNotFoundError:
            pass

    print('Training frequency tree for n=%d...' % n)

    ngramtree = NgramTree()

    corpus = gutenberg.raw()
    corpus = re.sub('[^a-z. ]', ' ', corpus.lower())
    corpus = ' '.join(corpus.split())
    corpus_ngram = ngrams(corpus, n)

    for gram in corpus_ngram:
        ngramtree.addGram(gram)

    ngramtree.addUp()
    ngramtree.normalize()

    with open(filename, 'wb') as fout:
        pickle.dump(ngramtree, fout)

    return ngramtree
Example #20
0
def load_text(filename):
    if filename is None or filename == '':
        text = gutenberg.raw(fileids='carroll-alice.txt')
    else:
        # read the user-supplied file
        with open(filename, 'r') as f:
            text = f.read()
    return text
Example #21
0
def load_sents():
    global sents
    default_st = nltk.sent_tokenize

    alice = gutenberg.raw(fileids='carroll-alice.txt')
    mobyd = gutenberg.raw(fileids='melville-moby_dick.txt')
    shak1 = gutenberg.raw(fileids='shakespeare-hamlet.txt')
    shak2 = gutenberg.raw(fileids='shakespeare-macbeth.txt')
    bbkjv = gutenberg.raw(fileids='bible-kjv.txt')

    alice_sentences = default_st(text=alice)
    mobyd_sentences = default_st(text=mobyd)
    shak1_sentences = default_st(text=shak1)
    shak2_sentences = default_st(text=shak2)
    bbkjv_sentences = default_st(text=bbkjv)

    sents = alice_sentences + mobyd_sentences + shak1_sentences + shak2_sentences + bbkjv_sentences
Example #22
0
File: main.py Project: kwdhd/nlp
def gutenberg():
    from nltk.corpus import gutenberg
    for t in gutenberg.fileids():
        num_chars = len(gutenberg.raw(t))
        num_words = len(gutenberg.words(t))
        num_sents = len(gutenberg.sents(t))
        num_vocab = len(set([w.lower() for w in gutenberg.words(t)]))
        print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), t
Example #23
0
    def clear_libs(self, MainWindow):
        self.textBrowser.clear()
        comboText = self.comboBox.currentText()

        for i in text.textDict:
            if comboText == i:
                rawText = gb.raw(text.textDict[i])
                self.textBrowser.append(rawText)
Example #25
0
def get_gutenberg_statistics():
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
        print(round(num_chars / num_words), round(num_words / num_sents),
              round(num_words / num_vocab), fileid)
Example #26
0
 def show_text(self, MainWindow):
     text.libd_storage = []
     self.textBrowser.clear()
     comboText = self.comboBox.currentText()
     for i in text.textDict:
         if comboText == i:
             rawText = gb.raw(text.textDict[i])
             self.textBrowser.append(rawText)
Example #27
0
def data_builder(file_id):
    d = gutenberg.raw(fileids=file_id)
    d_sentences = default_st(text=d)
    d_tuples = [nltk.pos_tag(default_wt(sentence)) for sentence in d_sentences]
    d_words = [[word[0] for word in sentence] for sentence in d_tuples]
    d_tags = [[word[1] for word in sentence] for sentence in d_tuples]
    d_len = len(d_sentences)
    return d_sentences, d_words, d_tags, d_len
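data_builder() relies on module-level default_st and default_wt tokenizers that are not shown; based on the pattern in Example #21, they are presumably:

import nltk

# tokenizers assumed by data_builder() above (default_st is defined the same
# way in Example #21; default_wt is presumed to be NLTK's word tokenizer)
default_st = nltk.sent_tokenize
default_wt = nltk.word_tokenize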
Example #28
0
def gutenberg_file_info():
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)
Example #29
0
def demo():
    """ LOAD DATA , veri setini yukleyelim.
        """
    # Sense and Sensibility by Jane Austen 1811
    text = gutenberg.raw('austen-sense.txt')
    sentences = sent_tokenize(text[:1000])
    modal = word_2_vec_with_gensim(sentences)
    print "Modal : ", modal
Example #30
0
 def handle(self, *args, **options):
     for fileid in gutenberg.fileids():
         out_dir = CORPUS_DIR + os.sep + fileid.replace(".txt", "")
         if not os.path.isdir(out_dir):
             os.makedirs(out_dir)
         f = open(out_dir + os.sep + "sentences.txt", 'w')
         f.write(gutenberg.raw(fileid))
         f.close()
Example #31
0
def get_training_text():
    text = ""

    nltk.download('gutenberg')

    for file_id in gutenberg.fileids():
        text += gutenberg.raw(file_id)

    return text
Example #32
0
   def getSentences(self):
      if self.category == "novel":
         sentences = gutenberg.raw(gutenberg.fileids()[0])
         sentences = sentences.split('\n')

      elif self.category == "news":
         sentences = brown.sents(categories='news')

      return sentences
Example #33
0
def structure():

    raw = gutenberg.raw("burgess-busterbrown.txt")
    raw[1:20]

    words = gutenberg.words("burgess-busterbrown.txt")
    words[1:20]

    sents = gutenberg.sents("burgess-busterbrown.txt")
    sents[1:20]
Example #34
0
def similarity_gutenberg():
    for x in range(2,6):
        a = []
        b = 0
        c = 0
        d = 1

        for fid in gutenberg.fileids():
            a.append([])
            for ffid in gutenberg.fileids():
                a[b].append(Jaccard(n_window(gutenberg.raw(fid), x), n_window(gutenberg.raw(ffid), x)))
            b += 1

        for i in range(len(a)):
            for j in range(len(a)):
                c += a[i][j] / (len(a) * len(a))
                d = min(d, a[i][j])
        print("Media: "+ str(c))
        print("Minimo: "+ str(d))
Example #35
0
def generate_tokens(titles):
    corpus = []
    for title in titles:
        novel: str = gutenberg.raw(title)
        novel = novel.strip()
        novel = novel.lower()
        novel = re.sub('\W+', ' ', novel)
        words = novel.split(' ')
        corpus.extend(words)
    return corpus
Example #36
0
File: ch02.py Project: gree2/hobby
def fun02():
    """fun02"""
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        # average word length average sentence length
        print int(num_chars/num_words), int(num_words/num_sents),
        # number of times each vocabulary item appers in the text
        print int(num_words/num_vocab), fileid
Example #37
0
def page57():
    """Statistics from the Gutenberg corpora"""
    from nltk.corpus import gutenberg

    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars / num_words), int(num_words / num_sents),
        print int(num_words / num_vocab), fileid
Example #38
0
def for_print():
    '''
    Display three statistics for each text
    :return:
    '''
    for fileid in gutenberg.fileids():
        num_chars=len(gutenberg.raw(fileid))
        num_words=len(gutenberg.words(fileid))
        num_sents=len(gutenberg.sents(fileid))
        num_vocab=len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars/num_words),int(num_words/num_sents),int(num_words/num_vocab),fileid
Example #39
0
def preprocessing_text_file(input_file):


    train_text = gutenberg.raw(input_file)
    sample_text = gutenberg.raw(input_file)
    #### understand input type and content
    #print(train_text)
    #print("=====================")
    #print(type(train_text))
    #### unicode text 

    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    #### show chunked result
    #print("\n\n".join(tokenized))
    #print("=================")
    #print(type(tokenized))
    #### list of sentences (separated by ".")

    return tokenized
Example #40
0
def solve_p2_greedy(file):
  lines = [l.lower().split("|")[1:-1] for l in open(file)]
  slices = slice(lines)

  n = 3
  corpus = NgramLetterCorpus(n)
  for fileid in gutenberg.fileids()[:3]:
    corpus.update(gutenberg.raw(fileid))

  slices = unshred3(slices, corpus)
  print "FINAL: "
  for l in linearize(slices):
    print "".join(l)
Example #41
0
 def test_moby_dick_window(self):
     #just make sure we
     window_sizes = xrange(100, 6000, 100)
     text = gutenberg.raw('melville-moby_dick.txt')
     tokens = tokenize(text, only_alphanum=True, clean_punctuation=True)
     total_number_of_tokens = len(tokens)
     for window_size in window_sizes:
         count = 0
         number_of_windows = int(math.ceil( total_number_of_tokens / window_size))
         for current_window in range(0, number_of_windows+1):
             word_window = Window(tokens, window_size, current_window)
             for word in word_window:
                 count += 1
         self.assertEquals(count, total_number_of_tokens)
Example #42
0
    def benchmark_sbd():
        ps = []
        rs = []
        f1s = []
        c = 0
        for fileid in gutenberg.fileids():
            c += 1
            copy_sents_gold = gutenberg.sents(fileid)
            sents_gold = [s for s in copy_sents_gold]
            for sent_i in range(len(sents_gold)):
                new_sent = [w for w in sents_gold[sent_i] if w.isalpha()]
                sents_gold[sent_i] = new_sent
            text = gutenberg.raw(fileid)
            sents_obtained = split_text(text)
            copy_sents_obtained = sents_obtained.copy()
            for sent_i in range(len(sents_obtained)):
                new_sent = [w.group()
                            for w in re.finditer(r'\w+', sents_obtained[sent_i])
                            if w.group().isalpha()]
                sents_obtained[sent_i] = new_sent
            c_common = 0
            for sent in sents_obtained:
                if sent in  sents_gold:
                    c_common += 1
            p, r, f1 = get_prf(c_common, len(sents_obtained), len(sents_gold))
            print('\n\n', fileid)
            print('Precision: {:0.2f}, Recall: {:0.2f}, F1: {:0.2f}'.format(p, r, f1))
            ps.append(p)
            rs.append(r)
            f1s.append(f1)

        print('\n\nPrecision stats: {:0.3f} +- {:0.4f}'.format(np.mean(ps),
                                                           np.std(ps)))
        print('Recall stats: {:0.3f} +- {:0.4f}'.format(np.mean(rs),
                                                        np.std(rs)))
        print('F1 stats: {:0.3f} +- {:0.4f}'.format(np.mean(f1s),
                                                    np.std(f1s)))
        print(len(f1s))

        good_ps = [p for p in ps if p >= 0.8]
        good_rs = [r for r in rs if r >= 0.8]
        good_f1s = [f1 for f1 in f1s if f1 >= 0.8]
        print('\n Good precision stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_ps),
                                                           np.std(good_ps)))
        print('Good Recall stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_rs),
                                                        np.std(good_rs)))
        print('Good F1 stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_f1s),
                                                    np.std(good_f1s)))
        print(len(good_f1s))
Example #43
0
def access():

    monty[0]
    monty[3]
    monty[5]
    monty[-1]

    sent = 'colorless green ideas sleep furiously'
    for char in sent:
        print char,

    from nltk.corpus import gutenberg
    raw = gutenberg.raw('melville-moby_dick.txt')
    fdist = nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())
    fdist.keys()
Example #44
0
def load_hamlet():
    """
    Loads the contents of the play Hamlet into a string.

    Returns
    -------
    str
        The one big, raw, unprocessed string.

    Example
    -------
    >>> document = load_hamlet()
    >>> document[:80]
    '[The Tragedie of Hamlet by William Shakespeare 1599]\n\n\nActus Primus. Scoena Prim'
    """
    return gutenberg.raw("shakespeare-hamlet.txt")
Example #45
0
def mean_len():
    a = []
    d = 1

    for fid in gutenberg.fileids():
        b = 0
        c = 0
        st = gutenberg.raw(fid)
        stl = re.split("\n|\.|\!|\?", st)
        stw = re.split("\n|\.|\!|\?| |,| - ", st)
        for el in stl:
            b += len(el)*(1.0)/len(stl)
        for el in stw:
            c += len(el)*(1.0)/len(stw)
        print(fid)
        print("Media Frases: "+ str(b))
        print("Media Palavras: "+ str(c))
Example #46
0
def get_moby_dick_document():
    moby_dick = gutenberg.raw('melville-moby_dick.txt')
    document = Document(
        url = 'melville-moby_dick.txt',
        name = 'Moby dick',
        text = moby_dick,
        month = 'Oct',
        year = 1851
    )
    # document uses tokenizer func for create tokens, since we need to enforce
    # only_alphanum and clean_punct we need a wrapper
    def tokenizer_wrapper(raw_text):
        return map(str.lower, map(str, tokenize(raw_text, only_alphanum=True, clean_punctuation=True)))
    document.tokenizer = tokenizer_wrapper

    odm_session.flush()

    return document
Example #47
0
def gutenberg():
    from nltk.corpus import gutenberg

    emma = nltk.corpus.gutenberg.words('austen-emma.txt')
    print len(emma)

    print gutenberg.fileids()
    emma = gutenberg.words('austen-emma.txt')

    macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
    macbeth_sentences[1037]
    longest_len = max([len(s) for s in macbeth_sentences])
    [s for s in macbeth_sentences if len(s) == longest_len]

    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid
Example #48
0
def sentenceTokenization():

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    mySentenceTokenizer = nltk.sent_tokenize

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    sample_text = 'We will discuss briefly about the basic syntax, structure and design philosophies. There is a defined hierarchical syntax for Python code which you should remember when writing code! Python is a really powerful programming language!'

    sentences_sample = mySentenceTokenizer(text = sample_text)

    print( '\nTotal number of sentences in sample_text: ' + str(len(sentences_sample)) )
    print( '\nSample sentences:' )
    print( sentences_sample )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    alice = gutenberg.raw(fileids = 'carroll-alice.txt')
    print( "\n### len(alice), total number of characters: " + str(len(alice)) )
    print( "\n### First 1000 characters of carroll-alice.txt:\n" )
    print( alice[0:1000] )

    sentences_alice  = mySentenceTokenizer(text = alice)
    print( '\nTotal number of sentences in Alice: ' + str(len(sentences_alice)) )
    print( '\nFirst 5 sentences in Alice:' )
    for temp_sentence in sentences_alice[0:5]:
        print( "\n### ~~~~~~~~~~ ###\n" + temp_sentence )
    print( "\n### ~~~~~~~~~~ ###" )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    text_german = europarl_raw.german.raw(fileids = "ep-00-01-17.de")
    print( "\n### len(German text), total number of characters: " + str(len(text_german)) )
    print( "\n### First 1000 characters of ep-00-01-17.de (German text):\n" )
    print( text_german[0:1000] )

    sentences_german = mySentenceTokenizer(text = text_german, language = "german")
    print( '\nTotal number of sentences in German text: ' + str(len(sentences_german)) )
    print( '\nFirst 5 sentences in German text:' )
    for temp_sentence in sentences_german[0:5]:
        print( "\n### ~~~~~~~~~~ ###\n" + temp_sentence )
    print( "\n### ~~~~~~~~~~ ###" )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return( None )
Example #49
0
def create_random_statements(count=50):
    """
    This function scans the ``nltk`` Project Gutenberg dataset, extracts random
    sentences containing some form of "it is" and tags them with a random tag.
    NB: This thing can take a while.
    """
    created_count = 0
    tags = Tag.objects.order_by("?")
    gutenberg_files = gutenberg.fileids()
    random.shuffle(gutenberg_files)
    for file_name in gutenberg_files:
        exists, not_exists = extract.from_text(gutenberg.raw(file_name))
        for sentence in [_linebreak.sub(" ", s) for s in exists]:
            if created_count == count:
                break
            statement = Statement(text=sentence, tag=random.choice(tags))
            try:
                statement.save()
                created_count += 1
                transaction.commit()
            except IntegrityError:
                transaction.rollback()
Example #50
0
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize

sample_text = gutenberg.raw('bible-kjv.txt')
tok = sent_tokenize(sample_text)
print(tok[5:15])
Example #51
0
import nltk
from nltk import FreqDist
from nltk.corpus import gutenberg
import json
import csv


print "* Loading corpus"
#raw = gutenberg.raw('melville-moby_dick.txt')
#raw = gutenberg.raw('bible-kjv.txt')
raw = gutenberg.raw('blake-poems.txt')
print "* Tokenizing"
tokens = nltk.word_tokenize(raw)

print "* Tagging parts of speech"
# Save this to strip articles later
parts_of_speech = nltk.pos_tag(tokens)

print "* Converting POS list into a dict for lookup"
# TODO -- fix this.  this is going to f**k up on homonyms
parts_of_speech = dict(parts_of_speech)

# You can ban other parts of speech by adding their tags to this list.
# You can find out what the part-of-speech tags mean by using code like
# this:
# >>> print nltk.help.upenn_tagset('DT')
# DT: determiner
#     all an another any both del each either every half la many much nary
#     neither no some such that the them these this those
banned_parts_of_speech = [
    'DT',
Example #52
0
sorted([w for w in set(text1) if w.endswith('ableness')])
[w.upper() for w in text1]

for word in ['Call', 'me', 'Ishmael', '.']:
    print word

# Access the corpus
nltk.corpus.gutenberg.fileids()
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
emma.concordance("surprize")
from nltk.corpus import gutenberg
gutenberg.fileids()

for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid

# Sentence segmentation
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
longest_len = max([len(s) for s in macbeth_sentences])
# Web chat corpora
from nltk.corpus import webtext
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]
from nltk.corpus import brown
brown.categories()
Example #53
0
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize
from pprint import pprint as pp

sample = gutenberg.raw('bible-kjv.txt')

sentences = sent_tokenize(sample)

pp(sentences[0:10])
Example #54
0
import nltk
from nltk.corpus import gutenberg
from nltk.probability import FreqDist
from nltk.util import ngrams
from nltk import sent_tokenize, word_tokenize
from operator import itemgetter

text = gutenberg.raw('chesterton-thursday.txt')
nltk_sents = sent_tokenize(text)						# contains the list of sentences detected from the tool
nltk_words = word_tokenize(text)
tokens = nltk_words
uni_freq = FreqDist(tokens)
x = FreqDist()
y = FreqDist()
Bigram_count = 0
for line in nltk_sents:
	w = word_tokenize(line)
	for window in ngrams(w,3,pad_right=True):
		p = window[0]
		if p is None:
			continue
		for p1 in window[1:]:
			if p1 is not None:
				Bigram_count = Bigram_count +1
				x[p,p1] = x[p,p1]+1
				y[p] = y[p]+1
				y[p1] = y[p1]+1 

ct = 0
coll = []
for k,v in x.items():
Example #55
0
import nltk
import math
from nltk.corpus import gutenberg
from pattern.en import *


text = gutenberg.raw('austen-emma.txt')
#pprint(parse(text,chunks = False, tags = False).split())
pattern_words = parse(text,chunks = False, tags = False).split()
pattern_sent = tokenize(text)
#print pattern_words
tokens = pattern_words
l = []
for token in tokens:
	for i in token:
		for j in i:
			l.append(j.lower())
tokens = l
tokens = [token.lower() for token in tokens if len(token) > 1]
#dictn=list(set(tokens))
r = ' '.join(tokens)
dictn=list(set(tokens))

uni_tokens = ngrams(r,n = 1)
bi_tokens =  ngrams(r, n = 2)
tri_tokens =  ngrams(r, n = 3)

uni_fdist = nltk.FreqDist(uni_tokens)


uni_freq = 0
Example #56
0
#!/usr/bin/python

"""Just a testing program for NLTK library. It is a NLP library for Python. Some kick-ass library this is. :)
	Pre-Requisites: NLTK Library installed, And Download additional data for the library using it's command.
	You can use "Natural Language Processing with Python" book from O'Reilley Publications for further details.
	This program prints some statistics for the Corpus(a large compiled collection of text files). """

import nltk
from nltk.corpus import gutenberg

for fid in gutenberg.fileids():
	nchars = len(gutenberg.raw(fid))
	nwords = len(gutenberg.words(fid))
	nsents = len(gutenberg.sents(fid))
	nvocab = len(set(w.lower() for w in gutenberg.words(fid)))
	print "%s %s %s %s" % (int(nchars / nwords), int(nwords / nsents), int(nwords / nvocab), fid)
Example #57
0
File: NLP.py Project: Toma-L/NLP
import nltk
nltk.corpus.gutenberg.fileids()
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
len(emma)
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
emma.concordance('surprize')

#another way to do this

from nltk.corpus import gutenberg
gutenberg.fileids()
emma = gutenberg.words('austen-emma.txt')

for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print (int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab)) #avg word & sentence length and the diversity of words

macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
macbeth_sentences #load sentences of Macbeth
macbeth_sentences[1037]
longest_len = max([len(s) for s in macbeth_sentences])
[s for s in macbeth_sentences if len(s) == longest_len] #find longest sentence

from nltk.corpus import webtext
for fileid in webtext.fileids():
    print (fileid, webtext.raw(fileid)[:65], '...')
Example #58
0
#Lemmatizing

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("cats")) #cat
print(lemmatizer.lemmatize("cacti")) #cactus
print(lemmatizer.lemmatize("geese")) #goose
print(lemmatizer.lemmatize("python")) #python
print(lemmatizer.lemmatize("better",pos="a")) #good
print(lemmatizer.lemmatize("run",'v')) #run
#Importing any file from nltk.data

from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize
sample = gutenberg.raw("bible-kjv.txt")
tok = sent_tokenize(sample)

#using wordnet to get synonyms,meanings,examples and antonyms of words

from nltk.corpus import wordnet
syns = wordnet.synsets("program")

print(syns) #will give all the matching synsets
print(syns[0].lemmas()[0].name()) #will give the first synonym.
print(syns[0].definition()) #will give the dictionary meaning of the synonym.
print(syns[0].examples()) #will give some examples of sentences using that synonyms.

synonyms = []
antonyms = []