Example #1
def exercise_gutenberg():
    # print the list of files in the Gutenberg corpus
    print(gutenberg.fileids())

    # pick one text: Jane Austen's Emma
    emma = gutenberg.words("austen-emma.txt")

    # check the length of the book
    print(len(emma))

    # load the text into an nltk.Text object
    emma_text = nltk.Text(emma)
    emma_text.concordance("surprize")

    for file_id in gutenberg.fileids():
        chars_list = gutenberg.raw(file_id)
        words_list = gutenberg.words(file_id)
        sents_list = gutenberg.sents(file_id)

        # total number of characters in the file
        num_chars = len(chars_list)
        # total number of words in the file
        num_words = len(words_list)
        # total number of sentences in the file
        num_sents = len(sents_list)
        # number of distinct words in the file
        num_vocab = len(set([w.lower() for w in words_list]))
        # print average word length, average sentence length, average occurrences per word, and the file name
        print(num_chars / num_words, num_words / num_sents,
              num_words / num_vocab, file_id)
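None of the snippets in this listing include their imports. A minimal setup sketch for Example #1, assuming only that NLTK is installed (the downloads are one-time; on recent NLTK releases the sentence tokenizer resource may be named punkt_tab instead of punkt):

# Setup sketch, not part of the original example.
import nltk
from nltk.corpus import gutenberg

nltk.download("gutenberg")   # corpus data used by raw()/words()/sents()
nltk.download("punkt")       # tokenizer models needed by gutenberg.sents()
exercise_gutenberg()         # the function above then runs as written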
Example #2
def exercise_gutenberg():
    # print the list of files in the Gutenberg corpus
    print gutenberg.fileids()

    # pick one text: Jane Austen's Emma
    emma = gutenberg.words("austen-emma.txt")

    # check the length of the book
    print len(emma)

    # load the text into an nltk.Text object
    emma_text = nltk.Text(emma)
    emma_text.concordance("surprize")

    for file_id in gutenberg.fileids():
        chars_list = gutenberg.raw(file_id)
        words_list = gutenberg.words(file_id)
        sents_list = gutenberg.sents(file_id)

        # total number of characters in the file
        num_chars = len(chars_list)
        # total number of words in the file
        num_words = len(words_list)
        # total number of sentences in the file
        num_sents = len(sents_list)
        # number of distinct words in the file
        num_vocab = len(set([w.lower() for w in words_list]))
        # print average word length, average sentence length, average occurrences per word, and the file name
        print num_chars / num_words, num_words / num_sents, num_words / num_vocab, file_id
Example #4
File: ch02.py Project: gree2/hobby
def fun01():
    """fun01"""
    print gutenberg.fileids()
    # emma by jane austen
    emma = gutenberg.words('austen-emma.txt')
    # how many words it contains
    print len(emma)
    print Text(emma).concordance("surprize")
Example #5
def show():
    print gutenberg.fileids()
    # instantiate a frequency distribution
    fd = FreqDist()
    for word in gutenberg.words('austen-persuasion.txt'):
        fd[word] += 1

    print fd.N()
    print fd.B()
    # get the top 10 words sorted by frequency
    for word, value in sorted(fd.items(), key=lambda item: -item[1])[:10]:
        print word, value
def get_gutenberg_data(useN=100):
    try:
        fileids = gutenberg.fileids()
    except LookupError:
        import nltk
        nltk.download('gutenberg')
        fileids = gutenberg.fileids()
        
    fileids = fileids[:useN]
    texts = [gutenberg.raw(fid) for fid in fileids]
    
    fileids = [os.path.splitext(fid)[0] for fid in fileids]
    
    return texts, fileids
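A hedged usage sketch for get_gutenberg_data(); it assumes os and the gutenberg corpus reader are imported at module level, as they presumably are in the original project:

# Illustrative call, not from the original source.
texts, fileids = get_gutenberg_data(useN=3)
print(fileids)        # e.g. ['austen-emma', 'austen-persuasion', 'austen-sense']
print(len(texts[0]))  # character count of the first raw text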
 def __init__(self, settings):
     self.window_size = settings['window_size']
     self.dim = settings['embedding_dim']
     # read from project gutenberg
     sents = []
     list(map(sents.extend, list(map(gutenberg.sents,
                                     gutenberg.fileids()))))
     print('\n{} sentences fetched.'.format(len(sents)))
     # load vocabulary file
     with open('vocab.json', 'r') as f:
         vocab = json.load(f)
     print('\n{} unique words found in corpus'.format(len(vocab)))
     self.word2id = dict((vocab[i], i) for i in range(len(vocab)))
     self.data = []
     for sent in sents:
         for i in range(len(sent)):
             try:
                 context = [
                     self.word2id[word]
                     for word in sent[max(0, i - self.window_size):i] +
                     sent[i + 1:min(len(sent), i + 1 + self.window_size)]
                 ]
                 target = self.word2id[sent[i]]
                 while len(context) < 2 * self.window_size:
                     context.append(0)
                 self.data.append((target, context))
             except KeyError:
                 print(sent[max(0, i - self.window_size
                                ):min(len(sent), i + 1 + self.window_size)])
     print('{} pairs found for training'.format(self.__len__()))
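The loop above turns each position in each sentence into a (target, context) pair, where the context is a window of word ids padded with 0 up to 2 * window_size entries. A small standalone illustration of that windowing on toy data (none of these names come from the original project):

# Toy illustration of the context-window construction used above.
sent = ["the", "cat", "sat", "on", "the", "mat"]
word2id = {w: i for i, w in enumerate(dict.fromkeys(sent))}
window_size = 2
i = 2  # target word "sat"
context = [word2id[w]
           for w in sent[max(0, i - window_size):i] + sent[i + 1:i + 1 + window_size]]
while len(context) < 2 * window_size:   # pad short windows with 0
    context.append(0)
print(word2id[sent[i]], context)        # -> 2 [0, 1, 3, 0]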
Example #8
def getFreq(n):
    freq = {}
    length = {}

    for category in brown.categories():
        sentences = brown.sents(categories = category)
        length[category] =  len(sentences)
        for sentence in sentences[:int(length[category]*0.9)]:
            text = " <s> " + ' '.join(re.compile(r'\w+').findall(' '.join(sentence))).lower() +" </s> "
            model = myNGrams(text, n)
            for x in model:
                line = ' '.join(x)
                count = len(re.findall(" "+line+" ", text))
                if (x not in freq) and (count != 0):
                    freq[x] = 0
                if count != 0:
                    freq[x] += count
    length2 = {}
    for category in gutenberg.fileids():
        sentences = gutenberg.sents(category)
        length2[category] =  len(sentences)
        for sentence in sentences[:int(length2[category]*0.9)]:
            text = " <s> " + ' '.join(re.compile(r'\w+').findall(' '.join(sentence))).lower() +" </s> "
            model = myNGrams(text, n)
            for x in model:
                line = ' '.join(x)
                count = len(re.findall(" "+line+" ", text))
                if (x not in freq) and (count != 0):
                    freq[x] = 0
                if count != 0:
                    freq[x] += count


    return [freq, length]
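getFreq() depends on a helper myNGrams that is not shown anywhere in this listing. A plausible minimal stand-in (an assumption about its behaviour, not the original helper) that yields word n-grams as tuples:

def myNGrams(text, n):
    # Hypothetical replacement for the undefined helper: every run of n
    # consecutive whitespace-separated tokens, returned as a tuple.
    tokens = text.split()
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]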
Example #9
 def setUp(self):
     nltk.download("gutenberg")
     self.docs = []
     for fid in gutenberg.fileids():
         f = gutenberg.open(fid)
         self.docs.append(f.read())
         f.close()
Example #10
def load():
    train=[]
    test=[]    
    for fileid in gutenberg.fileids():
        sent=gutenberg.sents(fileid)
        s=[]
        for str1 in sent:
            s.append(str1)
            
        
        str2=[]
        for i in s:
            str2.append(' '.join(i).translate(str.maketrans('','',string.punctuation)))
        
        str3=''
        for i in str2:
            str3= str3+ ' <s> '+ i
                
        punctuation={'`','\''}
        for c in punctuation:
            str3= str3.replace(c,"")
 
        str3=' '.join(str3.split())
    #    str3=str3.translate(str.maketrans('','',string.punctuation))
    #    str3 = '<s> The Fulton County Grand Jury said Friday an investigation of Atlantas recent primary election produced no evidence that any irregularities took place . <s> The jury further said in term-end presentments that the City Executive Committee , which had over-all charge of the election , deserves the praise and thanks of the City of Atlanta for the manner in which the election was conducted . <s> The September-October term jury had been charged by Fulton Superior Court Judge Durwood Pye to investigate reports of possible irregularities in the hard-fought primary which was won by Mayor-nominate Ivan Allen Jr. .'
        words = str3.split(' ')
        train.append(words[:round(len(words)*0.8)])
        test.append(words[-round(len(words)*0.2):])

    train = [item for sublist in train for item in sublist]
    test = [item for sublist in test for item in sublist]
    return train,test
Example #11
def load_data():
    global N, words

    raw = list(word for fileid in corpus.fileids()
               for word in corpus.words(fileid))
    words = list(
        token
        for token in RegexpTokenizer('\w+').tokenize(' '.join(raw)))[100:1000]
    tokens = set(words)
    tokens_l = list(tokens)
    N = len(tokens)
    print 'Corpus size: {} words'.format(N)

    step = 4
    data = []
    for gram in ngrams(words, step):
        w1, w2, w3, pred = gram
        V = Vol(1, 1, N, 0.0)
        V.w[tokens_l.index(w1)] = 1
        V.w[tokens_l.index(w2)] = 1
        V.w[tokens_l.index(w3)] = 1
        label = tokens_l.index(pred)
        data.append((V, label))

    return data
Example #12
def fetchGutData():
    names = []
    gut_names = gut.fileids()
    while not names:
        print("================================================\n")
        print("List of available text to train textGenerator:\n")
        print(gut_names)
        print("\n\n" + "Enter filenames separated by whitespace: ")
        user_input = [str(x) for x in input().split()]
        for user_in in user_input:
            if user_in not in gut_names:
                print("\n Error not found : " + user_in + "\n")

        names = list(set(user_input) & set(gut_names))
        if not names:
            print("Error no text selected ===> Try again Please \n\n")

    print("==============================")
    print("OK thanks training started\n\n")
    word_data: List[List[str]] = [[w.lower() for w in gut.words(name)]
                                  for name in names]
    data = list(zip(names, word_data))
    target_vocab = list(set(reduce(operator.concat, word_data)))
    t_vocab = {k: v for v, k in enumerate(target_vocab)}

    return data, target_vocab, t_vocab
Example #13
def get_data(sub_task):
    """
    returns train data and test data according to sub_task

    :param sub_task:
    :return:
    """
    sentences_brown = list(brown.sents(brown.fileids()))
    sentences_gutenberg = list(gutenberg.sents(gutenberg.fileids()))

    # adding stop symbols
    add_stop_symbol(sentences_brown)
    add_stop_symbol(sentences_gutenberg)

    # get training and test data
    sentences_brown_train, sentences_brown_test = split(sentences_brown, 0.9)
    sentences_gutenberg_train, sentences_gutenberg_test = split(
        sentences_gutenberg, 0.9)

    if sub_task == S1:
        return sentences_brown_train, sentences_brown_test
    elif sub_task == S2:
        return sentences_gutenberg_train, sentences_gutenberg_test
    elif sub_task == S3:
        sentences_brown_train.extend(sentences_gutenberg_train)
        return sentences_brown_train, sentences_brown_test
    elif sub_task == S4:
        sentences_brown_train.extend(sentences_gutenberg_train)
        return sentences_brown_train, sentences_gutenberg_test
    else:
        print("Provide proper sub_task")
        exit(0)
Example #14
def Main():
    db = Database()
    index = InvertedIndex(db)
    brown_list = brown.fileids()
    gutenberg_list = gutenberg.fileids()
    # document1 = {
    #     'id': '1',
    #     'text': 'The big sharks of Belgium drink beer.'
    # }
    # document2 = {
    #     'id': '2',
    #     'text': 'Belgium has great beer. They drink beer all the time.'
    # }
    i = 0
    for item in brown_list:
        documentTemp = {'id': str(i), 'text': brown.raw(item)}
        index.index_document(documentTemp)
        i += 1

    for item in gutenberg_list:
        documentTemp = {'id': str(i), 'text': gutenberg.raw(item)}
        index.index_document(documentTemp)
        i += 1

    while True:
        search_term = input("Enter term(s) to search: ")
        result = index.lookup_query(search_term.lower())
        for term in result.keys():
            for appearance in result[term]:
                # Belgium: { docId: 1, frequency: 1}
                document = db.get(appearance.docId)
                print(highlight_term(appearance.docId, term, document['text']))
            print("-----------------------------")
Example #15
def build_word_count():
    if os.path.isfile('pickled/wcount.pickle'):
        return read_pickle('pickled/wcount.pickle')
    wcount = Counter()
    for fid in words.fileids():
        for word in words.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    for fid in gutenberg.fileids():
        for word in gutenberg.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    for fid in brown.fileids():
        for word in brown.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    for fid in reuters.fileids():
        for word in reuters.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    for fid in inaugural.fileids():
        for word in inaugural.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    dump_pickle(wcount, 'pickled/wcount.pickle')
    return wcount
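build_word_count() filters tokens through an only_words pattern that is defined elsewhere in that project. A plausible definition (an assumption, not the original) that accepts purely alphabetic, already-lowercased tokens:

import re

# Hypothetical definition of the pattern used in build_word_count().
only_words = re.compile(r"^[a-z]+$")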
Example #16
	def calcTFIDF(self, num_books):
		#randomly select books to use
		books = gutenberg.fileids()
		sel_books = random.choices(books, k=num_books)	


		#make 2 tables of word frequencies
		self.tf_hash = {}
		self.idf_hash = {}

		#iterate through each
		for b in sel_books:
			#get the unique words from the book
			words = gutenberg.words(b)
			num_words = len(words)
			u, c = np.unique(words, return_counts=True)

			#get tf = (# times word w appears / # of words total)
			tf = {}
			for i in range(len(u)):
				tf[u[i]] = (c[i]/num_words)
			self.tf_hash[b] = tf


			#get pre-idf = (# documents with word w)
			for w in u:
				if w in self.idf_hash:
					self.idf_hash[w] += 1
				else:
					self.idf_hash[w] = 1

		#calculate final idf
		for w in self.idf_hash.keys():
			self.idf_hash[w] = np.log(num_books/self.idf_hash[w])
Example #17
def fetch_train_test(corpora, test_corpus):
    train = []
    test = []
    unknown.clear()
    vocab.clear()
    pred_count_dict.clear()
    succ_count_dict.clear()
    for corpus in corpora:
        if corpus == 'brown':
            files = brown.fileids()
        elif corpus == 'gutenberg':
            files = gutenberg.fileids()
        else:
            print("config Error")
        for file in files:
            if corpus == 'brown':
                sentences = brown.sents(file)
            elif corpus == 'gutenberg':
                sentences = gutenberg.sents(file)
            else:
                print("config Error")
            permute = np.ones(len(sentences))
            if corpus == test_corpus:
                permute[:int(len(sentences) * 0.2)] = 0
            np.random.shuffle(permute)
            for index in range(len(sentences)):
                if permute[index] == 0:
                    test.append(sentences[index])
                else:
                    train.append(sentences[index])
    return [train, test]
Example #19
def get_gutenberg_statistics():
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
        print(round(num_chars / num_words), round(num_words / num_sents),
              round(num_words / num_vocab), fileid)
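The three rounded ratios printed above are average word length (characters per word), average sentence length (words per sentence), and lexical diversity (total words per distinct word form). A hedged variant, not from the original source, that returns the same figures as a dictionary instead of printing them:

def get_gutenberg_statistics_dict():
    # Illustrative variant: {fileid: (avg_word_len, avg_sent_len, lexical_diversity)}.
    stats = {}
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
        stats[fileid] = (num_chars / num_words, num_words / num_sents, num_words / num_vocab)
    return stats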
Example #20
 def handle(self, *args, **options):
     for fileid in gutenberg.fileids():
         out_dir = CORPUS_DIR + os.sep + fileid.replace(".txt", "")
         if not os.path.isdir(out_dir):
             os.makedirs(out_dir)
         f = open(out_dir + os.sep + "sentences.txt", 'w')
         f.write(gutenberg.raw(fileid))
         f.close()
Example #21
File: main.py Project: kwdhd/nlp
def gutenberg():
    from nltk.corpus import gutenberg
    for t in gutenberg.fileids():
        num_chars = len(gutenberg.raw(t))
        num_words = len(gutenberg.words(t))
        num_sents = len(gutenberg.sents(t))
        num_vocab = len(set([w.lower() for w in gutenberg.words(t)]))
        print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), t
Example #22
def gutenberg_file_info():
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)
Example #23
def find_suitable_text():
    """Find suitable text for background by checking the length of Gutenberg texts,
    Brown categories and Reuters categories. Texts or categories over 50k words are marked green.
    The total length of each corpus is printed at the top as a header."""
    print('\033[95m')
    print("--------------- Gutenberg ---------------")
    print("Total Length: ", len(gutenberg.words()))
    print('\033[0m')
    for fid in gutenberg.fileids():
        words = gutenberg.words(fid)
        length = len(words)
        if length > 50000:
            print('\033[92m')
        print("Text: ", fid)
        print("Length: ", length)
        print("Content preview: ", words[:20])
        if length > 50000:
            print('\033[0m')
        else:
            print("")

    # brown texts are too short, therefore check categories
    print('\033[95m')
    print("--------------- Brown ---------------")
    print("Total Length: ", len(brown.words()))
    print('\033[0m')
    for cat in brown.categories():
        words = brown.words(categories=cat)
        length = len(words)
        if length > 50000:
            print('\033[92m')
        print("Text category: ", cat)
        print("Length: ", length)
        print("Content preview: ", words[:20])
        if length > 50000:
            print('\033[0m')
        else:
            print("")

    # reuters texts are too short, therefore check categories
    # reuters actually has some funny categories
    # reuters categories are rather small; however, the total corpus is quite large
    print('\033[95m')
    print("--------------- Reuters ---------------")
    print("Total Length: ", len(reuters.words()))
    print('\033[0m')
    for cat in reuters.categories():
        words = reuters.words(categories=[cat])
        length = len(words)
        if length > 50000:
            print('\033[92m')
        print("Text category: ", cat)
        print("Length: ", length)
        print("Content preview: ", words[:20])
        if length > 50000:
            print('\033[0m')
        else:
            print("")
   def getSentences(self):
      if self.category == "novel":
         sentences = gutenberg.raw(gutenberg.fileids()[0])
         sentences = sentences.split('\n')

      elif self.category == "news":
         sentences = brown.sents(categories='news')

      return sentences
Example #25
def get_training_text():
    text = ""

    nltk.download('gutenberg')

    for file_id in gutenberg.fileids():
        text += gutenberg.raw(file_id)

    return text
Example #26
def gutenberg():

    emma = nltk.corpus.gutenberg.words('austen-emma.txt')
    print len(emma)

    print gutenberg.fileids()
    emma = gutenberg.words('austen-emma.txt')

    macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
    macbeth_sentences[1037]
    longest_len = max([len(s) for s in macbeth_sentences])
    [s for s in macbeth_sentences if len(s) == longest_len]

    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid
Example #27
def similarity_gutenberg():
    for x in range(2,6):
        a = []
        b = 0
        c = 0
        d = 1

        for fid in gutenberg.fileids():
            a.append([])
            for ffid in gutenberg.fileids():
               a[b].append(Jaccard(n_window(gutenberg.raw(fid),x),n_window(gutenberg.raw(ffid),x)))
            b += 1

        for i in range(len(a)):
            for j in range(len(a)):
               c += a[i][j]/(len(a)*len(a))
               d = min(d,a[i][j])
        print("Mean: "+ str(c))
        print("Minimum: "+ str(d))
Example #28
def create_model_from_NLTK():
    filepath = "nltkcorpus.txt"
    if isfile(filepath):
        return create_model(filepath= filepath, save=False)
    else:
        from nltk.corpus import reuters, brown, gutenberg
        sents = reuters.sents() + brown.sents()
        for gsents in [gutenberg.sents(fid) for fid in gutenberg.fileids()]:
            sents += gsents

        return create_model(sentences=sents, savename=filepath)
Example #29
def for_print():
    '''
    Show three statistics for each text
    :return:
    '''
    for fileid in gutenberg.fileids():
        num_chars=len(gutenberg.raw(fileid))
        num_words=len(gutenberg.words(fileid))
        num_sents=len(gutenberg.sents(fileid))
        num_vocab=len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars/num_words),int(num_words/num_sents),int(num_words/num_vocab),fileid
Example #30
 def get_words(self):
     # Create Vocabulary for all the words in Project Gutenberg
     if not self.sents:
         self.get_sents()
     for file in gutenberg.fileids():
         for word in gutenberg.words(file):
             self.words.append(word)
     # words = list(set(words))
     print(
         "Total number of words appeared: {}, including {} unique words.\n".
         format(len(self.words), len(list(set(self.words)))))
Example #31
def page57():
    """Statistics from the Gutenberg corpora"""
    from nltk.corpus import gutenberg

    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars / num_words), int(num_words / num_sents),
        print int(num_words / num_vocab), fileid
Example #32
def fun02():
    """fun02"""
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        # average word length average sentence length
        print int(num_chars / num_words), int(num_words / num_sents),
        # number of times each vocabulary item appears in the text
        print int(num_words / num_vocab), fileid
def get_nltk_freq_words():
    """Use Brown corpus frequent words
    More corpora: https://www.nltk.org/book/ch02.html
    """
    freq_dict = nltk.FreqDist(brown.words())

    for fileid in gutenberg.fileids():
        freq_dict.update(nltk.FreqDist(gutenberg.words(fileid)))

    freq_words = [k for k, v in freq_dict.items() if v > 10]
    return freq_words, freq_dict
Example #34
File: ch02.py Project: gree2/hobby
def fun02():
    """fun02"""
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        # average word length average sentence length
        print int(num_chars/num_words), int(num_words/num_sents),
        # number of times each vocabulary item appears in the text
        print int(num_words/num_vocab), fileid
Example #35
    def trainSentenceTokenizer(self):
        text = ""
        for file_id in gutenberg.fileids():
            text += gutenberg.raw(file_id)

        trainer = PunktTrainer()
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.train(text)
        tokenizer = PunktSentenceTokenizer(trainer.get_params())
        tokenizer._params.abbrev_types.add('dr')
        tokenizer._params.abbrev_types.add('fig')
        return tokenizer
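A hedged usage sketch for the tokenizer returned above; the instance name segmenter and the sample text are illustrative, not part of the original class:

# Illustrative use: 'fig.' and 'Dr.' are treated as abbreviations rather
# than sentence boundaries, thanks to the abbrev_types added above.
tokenizer = segmenter.trainSentenceTokenizer()
print(tokenizer.tokenize("See fig. 2 for details. Dr. Brown disagreed."))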
Example #36
def tf_idf():
    nltk.download('gutenberg')
    titles = gutenberg.fileids()
    corpus = []
    for title in titles:
        corpus.append(gutenberg.raw(title))
    vectorizer = TfidfVectorizer(min_df=1, stop_words="english")
    X = vectorizer.fit_transform(corpus)
    global XA
    XA = X.toarray()
    global d1
    d1 = vectorizer.vocabulary_
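A hedged follow-up for tf_idf(): once it has run, the global XA is a documents-by-terms array and d1 maps each lowercased term to its column index. The term queried here is only an illustration:

tf_idf()
col = d1.get("whale")      # column index of the term, if it survived the stop-word filter
if col is not None:
    print(XA[:, col])      # tf-idf weight of "whale" in each Gutenberg text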
Example #37
def iter_gutenberg():
    for fileid in gutenberg.fileids():
        # the raw contents of the file, with no linguistic processing applied
        num_chars = len(gutenberg.raw(fileid))

        num_words = len(gutenberg.words(fileid))
        # the text split into sentences, where each sentence is a list of words
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        """average word length, average sentence length, average number of times each word appears in the text (our lexical diversity score)"""
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)
Example #38
def solve_p2_greedy(file):
  lines = [l.lower().split("|")[1:-1] for l in open(file)]
  slices = slice(lines)

  n = 3
  corpus = NgramLetterCorpus(n)
  for fileid in gutenberg.fileids()[:3]:
    corpus.update(gutenberg.raw(fileid))

  slices = unshred3(slices, corpus)
  print "FINAL: "
  for l in linearize(slices):
    print "".join(l)
Example #39
    def write_sentence(tag_dict=tagged_words_dict):
        """Step 2: Choose a work, identify the author, and choose a sentence."""
        work = gutenberg.fileids()[np.random.randint(len(gutenberg.fileids()))]
        author = re.findall('(\w+)-', work)[0].title()

        sentences = gutenberg.sents(work)
        vocab = set(gutenberg.words(work))

        rndm_sentence = sentences[np.random.randint(len(sentences))]

        tagged_rndm_sentence = pos_tag(rndm_sentence)
        """Step 3: Replace every word in the sentence with another word that can have the same POS."""

        tag_dict = {
            k: [word for word in v if word in vocab]
            for k, v in tag_dict.items()
        }
        tag_dict = {k: v for k, v in tag_dict.items() if v}

        new_sentence = [
            tup[0] if tup[1] in ['DT', 'NNP', '.', ',', "''", ':']
            or tup[1] not in tag_dict else tag_dict[tup[1]][np.random.randint(
                len(tag_dict[tup[1]]))] for tup in tagged_rndm_sentence
        ]

        new_detokenized_sentence = str(
            TreebankWordDetokenizer().detokenize(new_sentence))
        new_detokenized_sentence = new_detokenized_sentence[0].upper(
        ) + new_detokenized_sentence[1:]
        if new_detokenized_sentence[-1].isalnum():
            new_detokenized_sentence = new_detokenized_sentence + '.'

        if len(new_sentence) <= 3:
            return write_sentence()
        if len(author) + len(new_detokenized_sentence) > 278:
            return write_sentence()
        else:
            return (author, re.sub(r'[\)\\"]', '', new_detokenized_sentence))
Example #40
 def train(self):
     self.vocabulary=set()
     
     this_bigrams=[]
     self.unigrams = FreqDist([])
     
     for fileid in gutenberg.fileids():
         for sentence in gutenberg.sents(fileid):
             words=["<s>",] + [x.lower() for x in sentence if wordRE.search(x)] + ["</s>",]
             this_bigrams += bigrams(words)
             self.vocabulary.update(words)
             self.unigrams.update(words)
     self.bigrams=ConditionalFreqDist(this_bigrams)
     self.V = len(self.vocabulary)
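The vocabulary size stored in self.V above is exactly what add-one (Laplace) smoothing needs. A hedged sketch of the probability lookup such a class might pair with train(); the method name and the smoothing choice are assumptions, not the original code:

 def prob(self, w1, w2):
     # Illustrative add-one smoothed bigram probability built from the
     # counts collected in train(): (count(w1, w2) + 1) / (count(w1) + |V|).
     return (self.bigrams[w1][w2] + 1.0) / (self.unigrams[w1] + self.V)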
def main():
    fileids = gutenberg.fileids()[:10]

    vocab = get_vocab(fileids)
    print (f'len of vocab: {len(vocab)}')

    token_to_id_map, id_to_token_map = utils.get_token_id_maps(vocab)
    word_freq = get_word_freq_map(fileids)

    print (word_freq)

    for f in fileids:
        print (get_target_context_pairs_sg(f, token_to_id_map, win_size=3))
        input()
Example #42
    def benchmark_sbd():
        ps = []
        rs = []
        f1s = []
        c = 0
        for fileid in gutenberg.fileids():
            c += 1
            copy_sents_gold = gutenberg.sents(fileid)
            sents_gold = [s for s in copy_sents_gold]
            for sent_i in range(len(sents_gold)):
                new_sent = [w for w in sents_gold[sent_i] if w.isalpha()]
                sents_gold[sent_i] = new_sent
            text = gutenberg.raw(fileid)
            sents_obtained = split_text(text)
            copy_sents_obtained = sents_obtained.copy()
            for sent_i in range(len(sents_obtained)):
                new_sent = [w.group()
                            for w in re.finditer(r'\w+', sents_obtained[sent_i])
                            if w.group().isalpha()]
                sents_obtained[sent_i] = new_sent
            c_common = 0
            for sent in sents_obtained:
                if sent in  sents_gold:
                    c_common += 1
            p, r, f1 = get_prf(c_common, len(sents_obtained), len(sents_gold))
            print('\n\n', fileid)
            print('Precision: {:0.2f}, Recall: {:0.2f}, F1: {:0.2f}'.format(p, r, f1))
            ps.append(p)
            rs.append(r)
            f1s.append(f1)

        print('\n\nPrecision stats: {:0.3f} +- {:0.4f}'.format(np.mean(ps),
                                                           np.std(ps)))
        print('Recall stats: {:0.3f} +- {:0.4f}'.format(np.mean(rs),
                                                        np.std(rs)))
        print('F1 stats: {:0.3f} +- {:0.4f}'.format(np.mean(f1s),
                                                    np.std(f1s)))
        print(len(f1s))

        good_ps = [p for p in ps if p >= 0.8]
        good_rs = [r for r in rs if r >= 0.8]
        good_f1s = [f1 for f1 in f1s if f1 >= 0.8]
        print('\n Good precision stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_ps),
                                                           np.std(good_ps)))
        print('Good Recall stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_rs),
                                                        np.std(good_rs)))
        print('Good F1 stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_f1s),
                                                    np.std(good_f1s)))
        print(len(good_f1s))
Example #43
def find_phrases(regexp):
	fids = gutenberg.fileids()
	rs = []
	for fid in fids:
		txt = nltk.Text(gutenberg.words(fid))
		ts = nltk.text.TokenSearcher(txt)
		r = ts.findall(regexp)
		for x in r:
			if x[0].lower() in wrong_vbs:
				x[0] = 'looking at'
			if x[-1].lower() in wrong_vbs:
				x[-1] = 'me'
		rs.extend(r)

	return rs
Example #44
    def __init__(self):
        self.num_passages = 10
        self.passagesize = 1000
        self.maxpeople = 10
        self.maxnouns = 5
        self.total_passages = 10*len(gutenberg.fileids())

        self.skeletons = []
        self.index_dicts = []
        #Load all of the things into memory
        #j = 0
        for fileid in gutenberg.fileids():
            for k in range(self.num_passages):
                filename = fileid+'_'+str(k) +'_skeleton.txt' 
                f = open(filename, 'r')
                self.skeletons.append(f.read().split(" ")) 
                f.close()
                filename = fileid+'_'+str(k) +'_indices.txt'
                f = open(filename, 'r')
                self.index_dicts.append({}) 
                for line in f.readlines():
                    splitted = line.split()
                    self.index_dicts[-1][splitted[0]] = splitted[1:]
                f.close()
Example #45
def load_data():
    global N, words

    freqs = [ FreqDist(corpus.words(fileid)) for fileid in corpus.fileids() ]
    words = list(set(word 
                    for dist in freqs 
                    for word in dist.keys()
                    if word not in ENGLISH_STOP_WORDS and
                    word not in punctuation))

    data = []
    N = len(words)
    for dist in freqs:
        x = volumize(dist)
        data.append((x, x.w))

    return data
Example #46
def mean_len():
    a = []
    d = 1

    for fid in gutenberg.fileids():
        b = 0
        c = 0
        st = gutenberg.raw(fid)
        stl = re.split("\n|\.|\!|\?", st)
        stw = re.split("\n|\.|\!|\?| |,| - ", st)
        for el in stl:
            b += len(el)*(1.0)/len(stl)
        for el in stw:
            c += len(el)*(1.0)/len(stw)
        print(fid)
        print("Mean sentence length: "+ str(b))
        print("Mean word length: "+ str(c))
Example #47
def load_data():
    global N, words

    freqs = [ FreqDist(corpus.words(fileid)) for fileid in corpus.fileids() ]
    words = list(set(word 
                    for dist in freqs 
                    for word in dist.keys()
                    if word not in ENGLISH_STOP_WORDS and
                    word not in punctuation))

    data = []
    N = len(words)
    for dist in freqs:
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, V.w))

    return data
Example #48
def nltk_test_2():
	# Count each token in each text of the Gutenberg collection
	fd = FreqDist()
	for text in gutenberg.fileids():
		for word in gutenberg.words(text):
			fd[word.lower()] += 1
    # Initialize two empty lists which will hold our ranks and frequencies
	ranks = []
	freqs = []
	# Generate a (rank, frequency) point for each counted token and append to the respective lists
	for rank, word in enumerate(fd):
		ranks.append(rank + 1)
		freqs.append(fd[word])
	freqs.sort(reverse=True)

	# Plot rank vs frequency on a log-log plot and show the plot
	plt.loglog(ranks, freqs)
	plt.xlabel('rank(r)', fontsize = 14, fontweight = 'bold')
	plt.ylabel('frequency(f)', fontsize = 14, fontweight = 'bold')
	plt.grid(True)
	plt.show()
Example #49
def main():
  # store word lengths
  brown_word_lens = []
  web_word_lens = []
  inaugural_word_lens = []
  gutenberg_word_lens = []
  genesis_word_lens = []

  for file in gutenberg.fileids():
    for word in gutenberg.words(file):
      gutenberg_word_lens.append(len(word))

  for file in brown.fileids():
    for word in brown.words(file):
      brown_word_lens.append(len(word))

  for file in webtext.fileids():
    for word in webtext.words(file):
      web_word_lens.append(len(word))

  for file in inaugural.fileids():
    for word in inaugural.words(file):
      inaugural_word_lens.append(len(word))

  for file in genesis.fileids():
    for word in genesis.words(file):
      genesis_word_lens.append(len(word))
  with open("wordlens.txt", 'w') as f:
    sys.stdout = f
    f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
    for i in xrange(max(len(genesis_word_lens), len(inaugural_word_lens), 
        len(web_word_lens), len(brown_word_lens),
        len(gutenberg_word_lens))):
      for corpus in [genesis_word_lens, inaugural_word_lens,
          web_word_lens, brown_word_lens, gutenberg_word_lens]:
        if(i >= len(corpus)):
          f.write(",")
        else:
          f.write(str(corpus[i]) + ",")
      f.write("\n")
Example #50
def create_random_statements(count=50):
    """
    This function scans the ``nltk`` Project Gutenberg dataset, extracts random
    sentences containing some form of "it is" and tags them with a random tag.
    NB: This thing can take a while.
    """
    created_count = 0
    tags = Tag.objects.order_by("?")
    gutenberg_files = gutenberg.fileids()
    random.shuffle(gutenberg_files)
    for file_name in gutenberg_files:
        exists, not_exists = extract.from_text(gutenberg.raw(file_name))
        for sentence in [_linebreak.sub(" ", s) for s in exists]:
            if created_count == count:
                break
            statement = Statement(text=sentence, tag=random.choice(tags))
            try:
                statement.save()
                created_count += 1
                transaction.commit()
            except IntegrityError:
                transaction.rollback()
Example #51
def load_data():
    global N, words

    raw = list(word 
            for fileid in corpus.fileids()
            for word in corpus.words(fileid))
    words = list(token for token in RegexpTokenizer('\w+').tokenize(' '.join(raw)))[100:1000]
    tokens = set(words)
    tokens_l = list(tokens)
    N = len(tokens)
    print 'Corpus size: {} words'.format(N)

    step = 4
    data = []
    for gram in ngrams(words, step):
        w1, w2, w3, pred = gram
        V = Vol(1, 1, N, 0.0)
        V.w[tokens_l.index(w1)] = 1
        V.w[tokens_l.index(w2)] = 1
        V.w[tokens_l.index(w3)] = 1
        label = tokens_l.index(pred)
        data.append((V, label))

    return data
Example #52
def get_poem():
    """
    This function should extract hexametric sentences from Gutenberg texts, but it doesn't.
    Either hexametric sentences are too rare, or the absence of basic function words from CMUdict results in problems
    with the matching of the whole sentence.
    """
    outtext = []
    for corpus in gutenberg.fileids():
        text = gutenberg.sents(corpus)
        for sentence in text:
            transcription = ""
            discard = False
            for word in sentence:
                if word.lower() in words:
                    transcription += words[word.lower()]
                elif re.match(one_syllable, word.lower()):
                    # consider this word a "small", unstressed word
                    transcription += "A0A"
                else:
                    discard = True
            if re.match(verse, transcription) and not discard:
                print(sentence, transcription)
                outtext.append(" ".join(sentence))
    return "\n".join(outtext)
Example #53
def ex7():
  from nltk.corpus import gutenberg
  for fileid in gutenberg.fileids():
    text = nltk.Text(gutenberg.words(fileid))
    print ("file: " + fileid)
    print text.concordance("however")
Example #54
# coding:utf8
import nltk

from nltk.corpus import gutenberg

gutenberg.fileids()

def for_print():
    '''
    Show three statistics for each text
    :return:
    '''
    for fileid in gutenberg.fileids():
        num_chars=len(gutenberg.raw(fileid))
        num_words=len(gutenberg.words(fileid))
        num_sents=len(gutenberg.sents(fileid))
        num_vocab=len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars/num_words),int(num_words/num_sents),int(num_words/num_vocab),fileid

def print_longest():
    macbeth_sentences=gutenberg.sents('shakespeare-macbeth.txt')
    # print macbeth_sentences
    # print macbeth_sentences[1037]
    longest_len=max([len(s) for s in macbeth_sentences])
    print [s for s in macbeth_sentences if len(s)==longest_len]

def print_private():
    from nltk.corpus import webtext
    for fileid in webtext.fileids():
        print fileid, webtext.raw(fileid)[:65]
from nltk.corpus import gutenberg
from nltk.probability import *

# **********************************************************************
#   Resource u'corpora/gutenberg' not found.  Please use the NLTK
#   Downloader to obtain the resource:  >>> nltk.download()
#   Searched in:
#     - '/Users/muzilan/nltk_data'
#     - '/usr/share/nltk_data'
#     - '/usr/local/share/nltk_data'
#     - '/usr/lib/nltk_data'
#     - '/usr/local/lib/nltk_data'
# **********************************************************************
# nltk.download()

print gutenberg.fileids()
allwords = gutenberg.words('shakespeare-hamlet.txt')
fd2 = FreqDist([sx.lower() for sx in allwords if sx.isalpha()])
print fd2.B()
print fd2.N()
fd2.tabulate(20)
fd2.plot(20)
fd2.plot(20, cumulative=True)
Example #56
#!/usr/bin/env python


# import the gutenberg collection
from nltk.corpus import gutenberg

# import FreqDist class
from nltk import FreqDist

# what corpora are in the collection ?
print(gutenberg.fileids())

# create frequency distribution object
fd = FreqDist()

# for each token in the relevant text, increment its counter
for word in gutenberg.words('austen-persuasion.txt'):
    fd[word] += 1

print(fd.N()) # total number of samples
print(fd.B()) # number of bins or unique samples

# Get a list of the top 10 words sorted by frequency
for word, count in fd.most_common(10):
    print(word, count)

# matplotlib.use('TkAgg')
import matplotlib.pyplot as plt

# Count each token in each text of the Gutenberg collection
fd = FreqDist()
Example #57
fdist.freq(3)
fdist.max()

sorted([w for w in set(text1) if w.endswith('ableness')])
[w.upper() for w in text1]

for word in ['Call', 'me', 'Ishmael', '.']:
    print word

# access the corpus
nltk.corpus.gutenberg.fileids()
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
emma.concordance("surprize")
from nltk.corpus import gutenberg
gutenberg.fileids()

for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid

# sentence segmentation
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
longest_len = max([len(s) for s in macbeth_sentences])
# web chat corpus
from nltk.corpus import webtext
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
Example #58
import nltk
from nltk.corpus import gutenberg

fileids = gutenberg.fileids()
# print 'fileids: ', fileids

emma = gutenberg.words('austen-emma.txt')

# average characters in a word: raw/words
# average word in a sentence: words/sents
# lexical diversity - num_words/num_vocab

# for fileid in fileids:
# 	num_chars = len(gutenberg.raw(fileid))
# 	num_words = len(gutenberg.words(fileid))
# 	num_sents = len(gutenberg.sents(fileid))
# 	num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
# 	print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid

macbeth_sents = gutenberg.sents('shakespeare-macbeth.txt')

longest_len = max([len(s) for s in macbeth_sents])
longest_sent = [s for s in macbeth_sents if len(s) == longest_len]

print 'longest_sent: ', longest_sent
# HW12 NLTK Solution 01 code
import nltk 
from nltk.corpus import gutenberg
nltk.corpus.gutenberg.fileids()

#check for corpus files 
gutenberg.fileids()

#find total number of words in a text 

for fileid in gutenberg.fileids():
    text_words=len(gutenberg.words(fileid))
    print ((text_words),fileid)

# total modal frequencies in the corpus

from nltk.corpus import gutenberg
news_text = gutenberg.words()
fdist = nltk.FreqDist([w.lower() for w in news_text])
modals = ['can', 'could', 'may', 'might', 'must', 'will']

for m in modals:
    print (m + ':', fdist[m])

#relative frequencies 

from nltk.corpus import gutenberg
from nltk.probability import ConditionalFreqDist
cfd = nltk.ConditionalFreqDist(
  (id, word)
  for id in gutenberg.fileids()