Example #1
def select_genres(n):
    '''
    Selects genres with more than n files. Returns raw data and the genre of each file
    in the selected genres as two 1d numpy arrays.
    
    Parameters
    ----------
    n: An integer.
    
    Returns
    -------
    A tuple of (raw, genres)
    raw: A 1d numpy array.
    genres: A 1d numpy array.
    '''
    genres = []
    raw = []
    #Creates arrays of the genres and raw data for genres with more than n files
    for file in brown.fileids():

        for k in brown.categories(file):

            if len(brown.fileids(k)) > n:
                genres.append(k)
                raw.append(brown.raw(file))

    # return 1-d numpy arrays, as the docstring states
    return np.array(raw), np.array(genres)
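A minimal usage sketch for the function above (the numpy and nltk.corpus.brown imports are assumptions, since the snippet omits them):

import numpy as np
from nltk.corpus import brown

raw, genres = select_genres(70)   # keep only genres with more than 70 files
print(raw.shape, genres.shape)    # two aligned 1-d arrays
print(np.unique(genres))          # the genre labels that passed the threshold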
Example #2
    def test_clusterer(self):
        """Here we take 10 documents categorized as 'government' and
        'mystery' from the brown corpus, and perform k-means clustering on
        these. Optimally we would like the clusterer to divide them in two
        clusters.
        The clusterer generates clusters depending on random initial
        conditions, so the result can be different in different test runs.
        In order to account for that, we run a lot of iterations
        (50) which hopefully will generate a good result. The success
        condition is that a max of 2 out of 10 documents will fall in the
        wrong cluster.
        """

        clusterer = KMeans()
        government_ids = brown.fileids(categories='government')[:10]
        mystery_ids = brown.fileids(categories='mystery')[:10]
        government_uids = []
        mystery_uids = []

        for articleid in government_ids:
            text = " ".join(brown.words(articleid))
            self.folder.invokeFactory('Document', articleid, text=text)
            government_uids.append(self.folder[articleid].UID())

        for articleid in mystery_ids:
            text = " ".join(brown.words(articleid))
            self.folder.invokeFactory('Document', articleid, text=text)
            mystery_uids.append(self.folder[articleid].UID())

        result = clusterer.clusterize(2, 50, repeats=50)
        cluster1 = set(result[0])
        missed = min(len(cluster1-set(government_uids)),
                     len(cluster1-set(mystery_uids)))
        self.failUnless(missed<=2)
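The success check above relies on a small set trick: whichever seed set cluster1 mostly overlaps with is taken as its "true" class, and the leftover ids are the misclassified documents. A standalone illustration with hypothetical ids (not real Brown fileids or UIDs):

gov = {'g1', 'g2', 'g3', 'g4'}
mys = {'m1', 'm2', 'm3', 'm4'}

# Suppose k-means puts these ids together in the first cluster:
cluster1 = {'g1', 'g2', 'g3', 'm4'}

# Removing the government ids leaves 1 stray mystery id; removing the
# mystery ids would leave 3, so cluster1 is "the government cluster"
# and exactly one document fell into the wrong cluster.
missed = min(len(cluster1 - gov), len(cluster1 - mys))
assert missed == 1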
Example #3
def load_corpus(range):
    m = re.match(r'(\d+):(\d+)$', range)
    print('m=', m)
    if m:
        start = int(m.group(1))
        end = int(m.group(2))
        from nltk.corpus import brown as corpus
        print(corpus.fileids())
        return [corpus.words(fileid) for fileid in corpus.fileids()[start:end]]
Example #4
 def _hasNextSearchBrown(self):
     sizeOfBrownCorpus = len(brown.fileids())
     if self._countText < sizeOfBrownCorpus:
         self._isReady = True
         self._nameOfNextFile = brown.fileids()[self._countText]
         return True
     else:
         self._isReady = False
         return False
Example #5
def third_lexicon(positive_seeds, negative_seeds):
    positive_list = []
    negative_list = []

    all_dic = {}
    seed_total_dic = {}
    for fileid in brown.fileids():
        bow = get_BOW(brown.words(fileid))
        for aword in bow:
            all_dic[aword] = all_dic.get(aword, {})
            all_dic[aword]['word_count'] = all_dic[aword].get('word_count',
                                                              0) + 1
            for pseed in positive_seeds:
                if pseed in bow:
                    all_dic[aword][pseed] = all_dic[aword].get(pseed, 0) + 1
            for nseed in negative_seeds:
                if nseed in bow:
                    all_dic[aword][nseed] = all_dic[aword].get(nseed, 0) + 1

        for pseed in positive_seeds:
            if pseed in bow:
                seed_total_dic[pseed] = seed_total_dic.get(pseed, 0) + 1
        for nseed in negative_seeds:
            if nseed in bow:
                seed_total_dic[nseed] = seed_total_dic.get(nseed, 0) + 1

    total_count = float(len(brown.fileids()))

    for aword in all_dic:
        score = 0
        for pseed in positive_seeds:
            if all_dic[aword].get(pseed) != None:
                a_score = math.log(
                    (all_dic[aword][pseed] / total_count) /
                    ((all_dic[aword]['word_count'] / total_count) *
                     (seed_total_dic[pseed] / total_count)), 2)
                if a_score > 0:
                    score += a_score

        for nseed in negative_seeds:
            if all_dic[aword].get(nseed) != None:
                a_score = math.log(
                    (all_dic[aword][nseed] / total_count) /
                    ((all_dic[aword]['word_count'] / total_count) *
                     (seed_total_dic[nseed] / total_count)), 2)
                if a_score > 0:
                    score -= a_score

        score = score / 16.0

        if score > 0.3:
            positive_list.append(aword)
        elif score < -0.3:
            negative_list.append(aword)

    return positive_list, negative_list
Example #6
def load_corpus_each_sentence(range):
    m = re.match(r'(\d+):(\d+)$', range)
    if m:
        start = int(m.group(1))
        end = int(m.group(2))

        #        from nltk.corpus import brown as corpus
        from nltk.corpus import movie_reviews as corpus
        print([corpus.sents(fileid) for fileid in corpus.fileids()[start:end]])
        return [corpus.sents(fileid) for fileid in corpus.fileids()[start:end]]
Example #7
def pre_processor(grams=3):

    vocabulary = set()

    t = 0

    for di in brown.fileids():
        vocabulary = vocabulary.union(set(brown.words(di)))
        t += 1
        if t == 2:
            break

    vocabulary = list(vocabulary)

    for i, word in enumerate(vocabulary):
        wordDic[word] = i
        posiDic[i] = word

    t = 0

    x1 = np.zeros(shape=(0, grams-1), dtype=int)
    x2 = np.zeros(shape=(0, grams-1), dtype=int)
    y1 = np.zeros(shape=(0, 1), dtype=int)
    y2 = np.zeros(shape=(0, 1), dtype=int)

    for _id in brown.fileids():
        if t == 0:
            t += 1

            text = brown.words(_id)

            size_ant = x1.shape[0]
            x1.resize((x1.shape[0] + len(text) - grams - 1, grams-1))
            y1.resize((y1.shape[0] + len(text) - grams - 1, 1))

            for i in range(size_ant, size_ant + len(text) - grams-1):
                x1[i] = [wordDic[text[index]] for index in range(i, i+grams-1)]
                y1[i] = [wordDic[text[i + grams-1]]]

            continue

        text = brown.words(_id)

        size_ant = x2.shape[0]
        x2.resize((x2.shape[0] + len(text) - grams - 1, grams-1))
        y2.resize((y2.shape[0] + len(text) - grams - 1, 1))

        for i in range(size_ant, size_ant + len(text) - grams-1):
            x2[i] = [wordDic[text[index]] for index in range(i, i+grams-1)]
            y2[i] = [wordDic[text[i + grams-1]]]

        break

    return vocabulary, x1, y1, x2, y2
Example #8
def generate():
    global CDICT, DDICT
    for category in CATAGORIES:
        # skip the two categories with too little information
        CDICT[category] = generate_corpus(brown.fileids(categories=category), "cavnar")
        DDICT[category] = generate_corpus(brown.fileids(categories=category), "damashek")
    dict_file = open('cdict.pk1', 'wb')
    pickle.dump(CDICT, dict_file)
    dict_file.close()
    dict_file = open('ddict.pk1', 'wb')
    pickle.dump(DDICT, dict_file)
    dict_file.close()
Example #9
def main():
    print(str.format(
        'Number of words in the brown corpus (collection of texts): {}',
        len(brown.words())
    ))
    print(str.format(
        'List of files that make up the brown collection: {}',
        brown.fileids()
    ))
    print(str.format(
        'Number of files in the brown collection: {}',
        len(brown.fileids())
    ))
Example #10
def third_lexicon(positive_seeds,negative_seeds):
    positive_list=[]
    negative_list=[]

    all_dic={}
    seed_total_dic={}
    for fileid in brown.fileids():
        bow=get_BOW(brown.words(fileid))
        for aword in bow:
            all_dic[aword]=all_dic.get(aword,{})
            all_dic[aword]['word_count']=all_dic[aword].get('word_count',0)+1
            for pseed in positive_seeds:
                if pseed in bow:
                    all_dic[aword][pseed]=all_dic[aword].get(pseed,0)+1
            for nseed in negative_seeds:
                if nseed in bow:
                    all_dic[aword][nseed]=all_dic[aword].get(nseed,0)+1

        for pseed in positive_seeds:
            if pseed in bow:
                seed_total_dic[pseed]=seed_total_dic.get(pseed,0)+1
        for nseed in negative_seeds:
            if nseed in bow:
                seed_total_dic[nseed]=seed_total_dic.get(nseed,0)+1

    total_count=float(len(brown.fileids()))

    for aword in all_dic:
        score=0
        for pseed in positive_seeds:
            if all_dic[aword].get(pseed) != None:
                a_score=math.log((all_dic[aword][pseed]/total_count)/((all_dic[aword]['word_count']/total_count)*(seed_total_dic[pseed]/total_count)), 2)
                if a_score>0:
                    score+=a_score

        for nseed in negative_seeds:
            if all_dic[aword].get(nseed) != None:
                a_score=math.log((all_dic[aword][nseed]/total_count)/((all_dic[aword]['word_count']/total_count)*(seed_total_dic[nseed]/total_count)), 2)
                if a_score>0:
                    score-=a_score


        score=score/16.0


        if score>0.3:
            positive_list.append(aword)
        elif score<-0.3:
            negative_list.append(aword)

    return positive_list,negative_list
Example #11
def get_brown_data(useN=100):
    try:
        fileids = brown.fileids()
    except LookupError:
        import nltk
        nltk.download('brown')
        fileids = brown.fileids()
        
    fileids = fileids[:useN]
    texts = [brown.raw(fid) for fid in fileids]
    
    fileids = [os.path.splitext(fid)[0] for fid in fileids]
    
    return texts, fileids
Example #12
def TrainMultinomialNB(train):
    # V: extract the vocabulary from the whole training set
    # (brown.words accepts a list of fileids, so no per-file loop is needed)
    v = brown.words(fileids=train)
    vsort = sorted(set(v))

    # Count docs
    count = len(train)

    # Count docs in each class
    countd = []
    dicti = {}
    prior = {}
    condprob = collections.defaultdict(dict)
    for i in brown.categories():
        gho = 0
        for j in brown.fileids(categories=i):
            if j in train:
                gho = gho + 1  # Nc = number of training documents in class i
        countd.append(gho)

        prior[i] = float(gho / (1.0 * count))

        # concatenate the text of every training document in class i
        textc = []
        for t in brown.fileids(categories=i):
            if t in train:
                textc.extend(brown.words(fileids=t))

        # Tct = frequency of each vocabulary term in the class text
        for j in vsort:
            freq = 0
            for k in textc:
                if j == k:
                    freq = freq + 1  # Tct
            dicti[j] = freq

        # normaliser: total term count over the whole vocabulary
        tot = sum(dicti.values())
        for t in vsort:
            condprob[t][i] = float((dicti[t] + 1) / (1.0 * tot))

    return v, prior, condprob
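The comments above follow the textbook TrainMultinomialNB procedure (document counts Nc, term counts Tct, then prior and condprob). A more compact sketch of essentially the same computation, using collections.Counter and the standard vocabulary-smoothed denominator; this is an illustration under those assumptions, not the original author's code:

from collections import Counter, defaultdict
from nltk.corpus import brown

def train_multinomial_nb(train_fileids):
    vocab = sorted(set(brown.words(fileids=train_fileids)))
    prior, condprob = {}, defaultdict(dict)
    n_docs = float(len(train_fileids))
    for cat in brown.categories():
        docs = [fid for fid in brown.fileids(categories=cat) if fid in train_fileids]
        prior[cat] = len(docs) / n_docs                       # Nc / N
        counts = Counter(w for fid in docs for w in brown.words(fileids=fid))
        denom = sum(counts.values()) + len(vocab)             # Laplace smoothing
        for term in vocab:
            condprob[term][cat] = (counts[term] + 1.0) / denom   # (Tct + 1) / denom
    return vocab, prior, condprob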
Example #13
 def get_corpus(self):
     for fileid in brown.fileids():
         sentences = brown.sents(fileids=[fileid])
         for s in sentences:
             clean_sentence = [w.lower() for w in s if w.isalpha()]
             self.uni_corpus.extend(clean_sentence)
             self.bigram_corpus.extend(list(nltk.bigrams(clean_sentence)))
Example #14
File: brown.py Project: tan506/AI
def load_brown_corpus():
    import nltk
    nltk.download('brown')
    from nltk.corpus import brown

    corpus = []
    for cat in brown.categories():
        for text_id in brown.fileids(cat):
            sentences = []
            for sent in brown.sents(text_id):
                text = ' '.join(sent)
                text = text.lower()
                for punct in string.punctuation:
                    if punct not in '.,':   # the regex below keeps '.' and ','
                        text = text.replace(punct, ' ')
                text = re.sub('[^a-z.,0-9 ]+', '', text)

                tokens = [w for w in text.split() if w != '']
                if len(tokens) == 0:
                    continue
                if tokens[-1] == '.':
                    del tokens[-1]
                tokens.append('<eos>')

                sentences.append(tokens)
            corpus.append(sentences)

    # list of sentences (which are lists of words)
    corpus = list(itertools.chain.from_iterable(corpus))

    return corpus
Example #15
def Main():
    db = Database()
    index = InvertedIndex(db)
    brown_list = brown.fileids()
    gutenberg_list = gutenberg.fileids()
    # document1 = {
    #     'id': '1',
    #     'text': 'The big sharks of Belgium drink beer.'
    # }
    # document2 = {
    #     'id': '2',
    #     'text': 'Belgium has great beer. They drink beer all the time.'
    # }
    i = 0
    for item in brown_list:
        documentTemp = {'id': str(i), 'text': brown.raw(item)}
        index.index_document(documentTemp)
        i += 1

    for item in gutenberg_list:
        documentTemp = {'id': str(i), 'text': gutenberg.raw(item)}
        index.index_document(documentTemp)
        i += 1

    while True:
        search_term = input("Enter term(s) to search: ")
        result = index.lookup_query(search_term.lower())
        for term in result.keys():
            for appearance in result[term]:
                # Belgium: { docId: 1, frequency: 1}
                document = db.get(appearance.docId)
                print(highlight_term(appearance.docId, term, document['text']))
            print("-----------------------------")
Example #16
def fetch_train_test(corpora, test_corpus):
    train = []
    test = []
    unknown.clear()
    vocab.clear()
    pred_count_dict.clear()
    succ_count_dict.clear()
    for corpus in corpora:
        if corpus == 'brown':
            files = brown.fileids()
        elif corpus == 'gutenberg':
            files = gutenberg.fileids()
        else:
            print("config Error")
        for file in files:
            if corpus == 'brown':
                sentences = brown.sents(file)
            elif corpus == 'gutenberg':
                sentences = gutenberg.sents(file)
            else:
                print("config Error")
            permute = np.ones(len(sentences))
            if corpus == test_corpus:
                permute[:int(len(sentences) * 0.2)] = 0
            np.random.shuffle(permute)
            for index in range(len(sentences)):
                if permute[index] == 0:
                    test.append(sentences[index])
                else:
                    train.append(sentences[index])
    # return only after every corpus has been processed
    return [train, test]
Example #17
def get_data():
    data = []

    for fileid in brown.fileids():
        document = ' '.join(brown.words(fileid))
        data.append(document)
    return data
Example #18
def update_elements(elements, dataset, common_adverbs):
    '''
    Update elements:
    freq                        : counted by scanning the corpus
    adjective                   : counted from WordNet synsets
    used_common / used_uncommon : counted from the dataset
    '''
    # freq
    for fileid in corpus.fileids():
        for word in corpus.words(fileid):
            if word in elements:
                elements[word]['freq'] += 1
    # adjective
    for element in elements:
        for synset in wn.synsets(element):
            if synset.pos() in ['a', 's', 'r']:
                elements[element]['adjective'] += 1
    # used_common
    for common in common_adverbs:
        data_dict = dataset[common[0]]['used']
        for element in data_dict:
            num = data_dict[element]
            elements[element]['used_common'][common[0]] = num
    # used_uncommon
    uncommon_adverbs = [
        word for word in dataset
        if word not in [common for common, score in common_adverbs]
    ]
    for uncommon in uncommon_adverbs:
        data_dict = dataset[uncommon]['used']
        for element in data_dict:
            num = data_dict[element]
            elements[element]['used_uncommon'][uncommon] = num
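The docstring above leaves the shape of elements implicit; a minimal sketch of the per-word record this function appears to expect (field names taken from the code, the example word is hypothetical):

elements = {
    'quickly': {
        'freq': 0,            # incremented for every occurrence in the corpus
        'adjective': 0,       # number of adjective/adverb synsets found in WordNet
        'used_common': {},    # counts keyed by common adverb
        'used_uncommon': {},  # counts keyed by uncommon adverb
    },
    # ... one such record per candidate word
}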
Example #19
def build_word_count():
    if os.path.isfile('pickled/wcount.pickle'):
        return read_pickle('pickled/wcount.pickle')
    wcount = Counter()
    # same counting pass over each corpus
    for corpus in (words, gutenberg, brown, reuters, inaugural):
        for fid in corpus.fileids():
            for word in corpus.words(fid):
                word = word.lower()
                if only_words.match(word) is not None:
                    wcount[word] += 1
    dump_pickle(wcount, 'pickled/wcount.pickle')
    return wcount
Example #20
def cal_idf():
    # brown.sents()
    total_wordlists = []
    doc_sents = []
    for f in brown.fileids():
        print(f)
        doc_wordlist = []
        doc_sentlist = brown.sents(fileids=[f])
        d_sents = ''
        for sent in doc_sentlist:
            s = ''
            # sent = stem_tokens(sent)
            for w in sent:
                w = w.lower()
                s += w + ' '
            d_sents += s + '\n'
            doc_wordlist.extend(sent)
        total_wordlists.append(doc_wordlist)
        doc_sents.append(d_sents)
    print('start calculating tf-idf')

    from sklearn.feature_extraction.text import TfidfVectorizer
    corpus = doc_sents
    vectorizer = TfidfVectorizer(min_df=1)
    X = vectorizer.fit_transform(corpus)
    idf = vectorizer.idf_
    # print dict(zip(vectorizer.get_feature_names(), idf))
    pickle.dump(vectorizer, open('idf_vectorizer', 'wb'))
    dictionary = corpora.Dictionary(total_wordlists)
    dic, corps = get_corpus_by_lists(total_wordlists)
    tfidf = models.TfidfModel(corps, id2word=dic)
    pickle.dump(tfidf, open('brown_tfidf', 'wb'))
Example #21
def evaluation():
    print("=============== The Test Set ===============")
    import random
    from nltk.corpus import brown
    tagged_sents = list(brown.tagged_sents(categories='news'))
    random.shuffle(tagged_sents)
    size = int(len(tagged_sents) * 0.1)
    train_set, test_set = tagged_sents[size:], tagged_sents[:size]

    file_ids = brown.fileids(categories='news')
    size = int(len(file_ids) * 0.1)
    train_set = brown.tagged_sents(file_ids[size:])
    test_set = brown.tagged_sents(file_ids[:size])

    train_set = brown.tagged_sents(categories='news')
    test_set = brown.tagged_sents(categories='fiction')

    print("=============== Accuracy ===============")
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print('Accuracy: %4.2f' % nltk.classify.accuracy(classifier, test_set))

    print("=============== Confusion Matrices ===============")
    def tag_list(tagged_sents):
        return [tag for sent in tagged_sents for (word, tag) in sent]

    def apply_tagger(tagger, corpus):
        return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]

    gold = tag_list(brown.tagged_sents(categories='editorial'))
    test = tag_list(apply_tagger(t2, brown.tagged_sents(categories='editorial')))
    cm = nltk.ConfusionMatrix(gold, test)
Example #22
def ex_constitution_corpus():
    themes = {
        "news": ["news", "reviews", "editorial"],
        "literature": ["science_fiction", "romance", "fiction", "mystery"],
        "sciences": ["learned"]
    }
    nb_instances = 0
    corpus = {}
    for category in themes:
        print(category, ":")
        nb_doc = len(brown.fileids(categories=themes[category]))
        print("  ", nb_doc, "documents")
        nb_instances += nb_doc
        corpus[category] = brown.fileids(categories=themes[category])
    print("NB instances :", nb_instances)
    return corpus
Example #23
def load_movie_corpus_each_sentence(range):
    m = re.match(r'(\d+):(\d+)$', range)
    if m:
        start = int(m.group(1))
        end = int(m.group(2))
        from nltk.corpus import movie_reviews as corpus
        return [corpus.sents(fileid) for fileid in corpus.fileids()[start:end]]
Example #24
def load_corpus(range):  # use the brown corpus from nltk for testing; the slice values select where the corpus starts and ends
    m = re.match(r'(\d+):(\d+)$', range)  # regular expression of the form 1:5
    if m:
        start = int(m.group(1))
        end = int(m.group(2))
        from nltk.corpus import brown as corpus
        return [corpus.words(fileid) for fileid in corpus.fileids()[start:end]]
Example #25
def CreateFeatures():
    features = []
    sw = stopwords.words('english')
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()  # yielded an extra 0.1% so I kept it in.

    # get the categories in brown
    for c in brown.categories():
        # get the files in a category
        for d in brown.fileids(categories=c):
            # get words from a file
            words = brown.words(fileids=d)

            # extracted words of a document appended together.
            extracted_words = ""

            # filter each word
            for w in words:
                w = lemmatizer.lemmatize(w)
                w = stemmer.stem(w)
                if (w not in sw) and (w.isalnum()):
                    # append until we have the filtered document recreated
                    extracted_words += w.lower() + " "

            # create one feature per document and add it to the feature set
            feature = ({"words": extracted_words}, c)
            features.append(feature)

    return features
Example #26
def get_data(sub_task):
    """
    returns train data and test data according to sub_task

    :param sub_task:
    :return:
    """
    sentences_brown = list(brown.sents(brown.fileids()))
    sentences_gutenberg = list(gutenberg.sents(gutenberg.fileids()))

    # adding stop symbols
    add_stop_symbol(sentences_brown)
    add_stop_symbol(sentences_gutenberg)

    # get training and test data
    sentences_brown_train, sentences_brown_test = split(sentences_brown, 0.9)
    sentences_gutenberg_train, sentences_gutenberg_test = split(
        sentences_gutenberg, 0.9)

    if sub_task == S1:
        return sentences_brown_train, sentences_brown_test
    elif sub_task == S2:
        return sentences_gutenberg_train, sentences_gutenberg_test
    elif sub_task == S3:
        sentences_brown_train.extend(sentences_gutenberg_train)
        return sentences_brown_train, sentences_brown_test
    elif sub_task == S4:
        sentences_brown_train.extend(sentences_gutenberg_train)
        return sentences_brown_train, sentences_gutenberg_test
    else:
        print("Provide proper sub_task")
        exit(0)
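get_data() leans on a few helpers and constants the snippet does not show (S1-S4, add_stop_symbol, split); a minimal stand-in consistent with how they are called, offered as an assumption rather than the original code:

S1, S2, S3, S4 = 'S1', 'S2', 'S3', 'S4'   # sub-task identifiers (assumed)

def add_stop_symbol(sentences):
    # Append an end-of-sentence marker to each sentence in place,
    # which is all the call sites above appear to need.
    for sentence in sentences:
        sentence.append('</s>')

def split(sentences, ratio):
    # Plain head/tail split into training and test portions.
    cut = int(len(sentences) * ratio)
    return sentences[:cut], sentences[cut:]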
Example #27
def load_corpus(range):
    m = re.match(r'(\d+):(\d+)$', range)
    if m:
        start = int(m.group(1))
        end = int(m.group(2))
        from nltk.corpus import brown as corpus
        return [corpus.words(fileid) for fileid in corpus.fileids()[start:end]]
Example #28
def update_wordset(adverbs):
    for fileid in corpus.fileids():
        source = corpus.words(fileid)
        for i in range(len(source) - 1):  # stop one short so source[i + 1] stays in range
            if source[i] in adverbs:  # if we find the adverb, update the neighbor word
                cand = source[i + 1]
                update_word(cand, source[i])
Example #29
 def get_texts(self):
     i = 0
     for fileid in brown.fileids():
         self.texts[fileid] = brown.words(fileid)
         if self.max_docs is not None:
             i += 1
             if i >= self.max_docs:
                 break
Example #30
def calcDF():
    
    doc_dic = defaultdict(set)
    for fileid in brown.fileids():
        for token in brown.words(fileids=[fileid]):
            doc_dic[token].add(fileid)
            
    return doc_dic
Example #31
 def __init__(self, max_ocd_size=7, do_count=True):
     self.file_ids = brown.fileids()
     self.max_id = len(self.file_ids)
     self.max_ocd_size = max_ocd_size
     self.all_files = []
     self.fgram_files = []
     if do_count is True:
         self.ocds = self.count_occurrences()
Example #32
def create_news_or_not():
    news_or_not = []
    for category in brown.categories():
        for fileid in brown.fileids(category):
            if category == 'news':
                news_or_not.append((brown.words(fileid), category))
            else:
                news_or_not.append((brown.words(fileid), 'non-news'))
    return news_or_not
Example #33
def brownWordOccurences():
    wordOccurences = defaultdict(lambda: 0)

    for fileid in brown.fileids():
        wordList = brown.words(fileid)
        for word in wordList:
            wordOccurences[word.lower()] += 1

    return wordOccurences
Example #34
def fun5():
    from nltk.corpus import brown
    # nltk.download('brown')
    for i in brown.fileids():
        print(i)
    print(brown.categories())
    print(brown.words(categories='news'))
    print(brown.words(fileids='cg22'))
    print(brown.sents(categories=['news', 'editorial', 'reviews']))
Example #35
def main():
    data = []


    for fileid in brown.fileids():
        document = ' '.join( brown.words( fileid ) )
        data.append( document )

    NO_DOCUMENTS = len( data )
    print( NO_DOCUMENTS )
    print( data[:5] )
# For gensim we need to tokenize the data and filter out stopwords
    tokenized_data = []
    for text in data:
        tokenized_data.append( clean_text( text ) )

# Build a Dictionary - association word to numeric id
    dictionary = corpora.Dictionary( tokenized_data )

# Transform the collection of texts to a numerical form
    corpus = [dictionary.doc2bow( text ) for text in tokenized_data]

# Have a look at how the 20th document looks like: [(word_id, count), ...]
    print( corpus[20] )
# [(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2),  ...

# Build the LDA model
    lda_model = models.LdaModel( corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary )

# Build the LSI model
    lsi_model = models.LsiModel( corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary )


    print( "LDA Model:" )

    for idx in range( NUM_TOPICS ):
    # Print the first 10 most representative topics
        print( "Topic #%s:" % idx, lda_model.print_topic( idx, 10 ) )

    print( "=" * 20 )

    print( "LSI Model:" )

    for idx in range( NUM_TOPICS ):
    # Print the first 10 most representative topics
        print( "Topic #%s:" % idx, lsi_model.print_topic( idx, 10 ) )

    print( "=" * 20 )

    text = open('17170-0.txt', 'r').read()
    bow = dictionary.doc2bow( clean_text( text ) )

    print( lsi_model[bow] )
    # [(0, 0.091615426138426506), (1, -0.0085557463300508351), (2, 0.016744863677828108), (3, 0.040508186718598529), (4, 0.014201267714185898), (5, -0.012208538275305329), (6, 0.031254053085582149), (7, 0.017529584659403553), (8, 0.056957633371540077),
    #(9, 0.025989149894888153)]

    print( lda_model[bow] )
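main() above calls clean_text() and reads NUM_TOPICS, neither of which the snippet defines; a plausible minimal stand-in (an assumption, not necessarily the original helpers) that lower-cases, tokenises and drops stopwords, as the comment before the tokenisation loop describes:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

NUM_TOPICS = 10                              # assumed module-level constant
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    # Lower-case, tokenise, and keep alphabetic non-stopword tokens.
    tokens = word_tokenize(text.lower())
    return [t for t in tokens if t.isalpha() and t not in STOPWORDS]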
Example #36
def main():

    #create our training set and testing set from the news category in the
    #brown corpus
    fileids = brown.fileids(categories='news')
    training_ids = fileids[:33]
    test_ids = fileids[33:]
    training = brown.tagged_words(fileids=training_ids)
    test = brown.tagged_words(fileids=test_ids)

    trainingTokens = reviseTags(training)  #revise tags in our training set
    hist = createHistogram(
        trainingTokens)  #used revised tags to make histogram
    fdist = mostCommonTags(trainingTokens)
    mostCommonTag = fdist.most_common(1)[0][0]  #retrieve most common tag

    print("\nQuestion 8\n")
    tagGuesses, unknownTokens = assignTags(hist, test, [mostCommonTag])
    tagTruth = reviseTags(test)
    incorrectTags = analyzeAssignedTags(tagGuesses, tagTruth)

    print("\nQuestion 9\n")
    analyzeIncorrectTags(incorrectTags)

    print("\nQuestion 10\n")
    incorrectTags = analyzeAssignedTagsUnknownTokens(unknownTokens, tagTruth)
    analyzeIncorrectTags(incorrectTags)

    print("\nQuestion 10 part 2\n")
    # repeat test, but randomly assign the 10 most frequently found tags
    top10Tags = fdist.most_common(10)
    defaultTags = [tag for tag, count in top10Tags]
    tagGuesses, unknownTokens = assignTags(hist, test, defaultTags)
    incorrectTags = analyzeAssignedTagsUnknownTokens(unknownTokens, tagTruth)
    analyzeIncorrectTags(incorrectTags)

    print("\nQuestion 11\n")
    testids = brown.fileids(categories='romance')
    test = brown.tagged_words(fileids=testids)
    tagGuesses, unknownTokens = assignTags(hist, test, [mostCommonTag])
    tagTruth = reviseTags(test)
    incorrectTags = analyzeAssignedTags(tagGuesses, tagTruth)
    analyzeIncorrectTags(incorrectTags)
Example #37
def create_train_test_ids():
    train_id = []
    test_id = []
    brown_categories = brown.categories()
    for category in brown_categories:
        ttid = tts(brown.fileids(category))
        train_id.append(ttid[0])
        test_id.append(ttid[1])
    pickle.dump((train_id, test_id), open("./data/train_test_ids.pkl", 'wb'))
    return train_id, test_id
Example #38
def load_corpus(ranges):
    """
    load data from corpus
    """
    tmp = re.match(r'(\d+):(\d+)$', ranges)
    if tmp:
        start = int(tmp.group(1))
        end = int(tmp.group(2))
        from nltk.corpus import brown as corpus
        return [corpus.words(fileid) for fileid in corpus.fileids()[start:end]]
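The ranges argument is a 'start:end' string over Brown fileids; a quick usage example (the slice values are arbitrary):

docs = load_corpus('0:3')   # word lists for the first three Brown files
print(len(docs))            # 3
print(docs[0][:5])          # e.g. ['The', 'Fulton', 'County', 'Grand', 'Jury']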
Example #39
def generate_dataset():

    print("Processing tokens from the Brown corpus documents")
    brown_tokens = get_brown_documents()
    print("Loading the etymology tree")
    etymwn = load_etymology()
    print("Extracting the etymological signature of each document")
    fingerprints = generate_sig_dataset(brown_tokens, etymwn)
    # Index by document name
    fingerprints.index = brown.fileids()
    fingerprints.to_csv('brown_fingerprints_2.csv')
Example #40
def ex_constitution_corpus():
    themes = {
        "news": ["news", "reviews", "editorial"],
        "literature": ["science_fiction", "romance", "fiction", "mystery"],
        "sciences": ["learned"]
    }
    nb_instances = 0
    corpus = {}
    for category in themes:
        print("category : ", category)
        nb_doc = len(brown.fileids(categories=themes[category]))
        print("There are", nb_doc,
              "%s documents in the Brown corpus." % category)
        nb_instances += nb_doc
        corpus[category] = brown.fileids(categories=themes[category])
    print(
        "There are", nb_instances,
        "documents in the 3 categories (news, literature, sciences) for Brown."
    )
    return corpus
Example #41
def populate_texts(session):
    if dl.db.not_empty(session, Text):
        # Cowardly refusing to continue
        return

    fids = brown.fileids(categories='news')

    for fid in fids:
        session.add(Text(file=fid))

    session.commit()
Example #42
def get_data():
    data = []

    for fileid in brown.fileids():
        document = ' '.join(brown.words(fileid))
        data.append(document)

    NO_DOCUMENTS = len(data)
    print(NO_DOCUMENTS)
    print(data[:5])
    return data
Example #43
 def test_clusterer(self):
     """Here we take 10 documents categorized as 'government' and
     'mystery' from the brown corpus, and perform k-means clustering on
     these. Optimally we would like the clusterer to divide them in two
     clusters.
     The clusterer generates clusters depending on random initial
     conditions, so the result can be different in different test runs.
     In order to account for that, we run a lot of iterations
     (50) which hopefully will generate a good result. The success
     condition is that a max of 1 out of 10 documents will fall in the
     wrong cluster.
     """
     
     tagged_sents =  brown.tagged_sents(
         categories=['government','mystery'])
     tagger = getUtility(IPOSTagger,
         name="collective.classification.taggers.NgramTagger")
     tagger.train(tagged_sents)
     extractor = getUtility(ITermExtractor)
     extractor.setTagger(tagger)
     storage = getUtility(INounPhraseStorage)
     
     clusterer = KMeans()
     government_ids = brown.fileids(categories='government')[:10]        
     mystery_ids = brown.fileids(categories='mystery')[:10]
     
     for articleid in government_ids:
         text = " ".join(brown.words(articleid))
         storage.addDocument(articleid,text)
     
     for articleid in mystery_ids:
         text = " ".join(brown.words(articleid))
         storage.addDocument(articleid,text)
     result = clusterer.clusterize(2,20,repeats=50)
     cluster1 = set(result[0])
     missed = min(len(cluster1-set(government_ids)),
                  len(cluster1-set(mystery_ids)))
     self.failUnless(missed<2)
Example #44
def iterate_brown_corpus():
    print("parsing brown corpus")
    for fileid in brown.fileids():
        print("parse - brown: " + fileid)
        sentence = ["^"]
        for sent in brown.sents(fileid):
            for word in sent:
                legal_form = legal_word.search(word.lower())
                if (legal_form):
                    sentence.append(legal_form.group())

            sentence.append("$")
            parseSentence(sentence)
            sentence = ["^"]
Example #45
def testSet():

    tagged_sents = list(brown.tagged_sents(categories='news'))
    random.shuffle(tagged_sents)
    size = int(len(tagged_sents) * 0.1)
    train_set, test_set = tagged_sents[size:], tagged_sents[:size]

    file_ids = brown.fileids(categories='news')
    size = int(len(file_ids) * 0.1)
    train_set = brown.tagged_sents(file_ids[size:])
    test_set = brown.tagged_sents(file_ids[:size])

    train_set = brown.tagged_sents(categories='news')
    test_set = brown.tagged_sents(categories='fiction')
Example #46
def build_all_brown(subset_size=None):
    documents = []
    categories = []

    all_categories = set()

    try:
        fileids = brown.fileids()

        for fileid in fileids:
            if subset_size:
                if len(all_categories) > subset_size:
                    break
            category = brown.categories(fileid)[0]
            words = [x.lower() for x in brown.words(fileid)]

            documents.append(words)
            categories.append(category)

            all_categories.add(category)

        if subset_size != len(brown.categories()):
            # exclude the final item, since it's the sole member of the next group
            documents = documents[:-1]
            categories = categories[:-1]

        documents = [" ".join(d) for d in documents]

    except LookupError:
        """ we don't have the Brown corpus via nltk on this machine """
        try:
            with open("brown_docs_cats.pickle", "rb") as f:
                documents, categories = pickle.load(f)
        except IOError:
            raise Exception("can't load Brown Corpus via NLTK or file")

    # documents = [' '.join(d) for d in documents]

    """
    # let's NOT get tempted to hide away the encoding
    # we'll probably need to access, e.g., the vectorizer, to do reverse
    # transformations once we want to interpret/evaluate the model

    doc_vectorizer = CountVectorizer()
    doc_vec = doc_vectorizer.fit_transform(documents)
    """

    return documents, categories
Example #47
def main():
  # store word lengths
  brown_word_lens = []
  web_word_lens = []
  inaugural_word_lens = []
  gutenberg_word_lens = []
  genesis_word_lens = []

  for file in gutenberg.fileids():
    for word in gutenberg.words(file):
      gutenberg_word_lens.append(len(word))

  for file in brown.fileids():
    for word in brown.words(file):
      brown_word_lens.append(len(word))

  for file in webtext.fileids():
    for word in webtext.words(file):
      web_word_lens.append(len(word))

  for file in inaugural.fileids():
    for word in inaugural.words(file):
      inaugural_word_lens.append(len(word))

  for file in genesis.fileids():
    for word in genesis.words(file):
      genesis_word_lens.append(len(word))
  with open("wordlens.txt", 'w') as f:
    sys.stdout = f
    f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
    for i in range(max(len(genesis_word_lens), len(inaugural_word_lens),
        len(web_word_lens), len(brown_word_lens),
        len(gutenberg_word_lens))):
      for corpus in [genesis_word_lens, inaugural_word_lens,
          web_word_lens, brown_word_lens, gutenberg_word_lens]:
        if(i >= len(corpus)):
          f.write(",")
        else:
          f.write(str(corpus[i]) + ",")
      f.write("\n")
Example #48
def categorize_cat(category):
    print("categorizing: " + category)
    total = 0
    count = 0
    cos_correct = 0
    oop_correct = 0
    shared = 0
    cos_false_negative = []
    oop_false_negative = []
    for text in brown.fileids(category)[10:]:
        # only compare the first five files after the training set
        if count >= COMPARISON_SET:
            count = 0
            break
        
        # indicate that we have compared one more file
        count += 1
        total += 1

        #categorize this text
        cos, oop = categorize(text)
        # check to see if the cos distance or out of place distance categorized
        # it correctly
        if cos == category:
            # cos distance correctly classified
            cos_correct += 1
        else:
            # cos distance incorrectly classified
            cos_false_negative.append(cos)
        if oop == category:
            # out of place distance correctly classified
            oop_correct += 1
            if cos == category:
                # both classified correctly
                shared += 1
        else:
            # out of place distance incorrectly classified
            oop_false_negative.append(oop)
    return (category, cos_correct, oop_correct, total, cos_false_negative, oop_false_negative)
Example #49
from nltk.corpus import brown
from nltk.text import TextCollection

# The default initialization from a corpus object failed in older NLTK
# versions (files() was renamed to fileids()), so
# brown_collection = TextCollection(brown) did not work there.
# This workaround builds the collection from the per-file word lists instead.
words = [brown.words(f) for f in brown.fileids()]
brown_collection = TextCollection(words)
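With the collection built this way, the usual TextCollection scoring methods are available; a brief usage sketch:

# tf, idf and tf-idf of a term with respect to the first Brown document.
doc = words[0]
print(brown_collection.tf('jury', doc))
print(brown_collection.idf('jury'))
print(brown_collection.tf_idf('jury', doc))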
Example #50
from pprint import pprint

train_dict = nltk.defaultdict(list)
test_dict = nltk.defaultdict(list)

def FDtoDIC(fd):
    out_dict = nltk.defaultdict(float)
    for key in fd.keys():
        out_dict[key] = fd[key]
    out_dict['N'] = fd.N()
    return out_dict


for category in set(brown.categories()).\
    difference(set(['humor', 'science_fiction'])):
    cat_files = brown.fileids(categories=category)
    random.shuffle(cat_files)
    size = int(len(cat_files) * 0.85)
    train, test = cat_files[:size], cat_files[size:]
    key_list = []
    for f in train:
        temp = brown.open(f).read().split()
        temp = [entry.split('/')[0] for entry in temp]
        temp = [entry for entry in temp if entry \
                not in stopwords.words('english')]
        train_dict[category].append(FDtoDIC(nltk.FreqDist(temp)))
        key_list.extend(train_dict[category][-1].keys())
    # compute the average sample for the given category
    key_list = set(key_list)
    cat_avg_dict = {}
    for word in key_list:
Example #51
        if (isMatch):
            table[word][matchCount] += 1
        else:
            table[word][nonmatchCount] += 1
    else:
        table.append(word, category, matchCount, nonMatchCount)

import nltk.classify
import nltk.cluster
import nltk.corpus
import random

from nltk.corpus import brown
documents = [(list(brown.words(fileid)), category)
             for category in brown.categories()
             for fileid in brown.fileids(category)]
random.shuffle(documents)

all_words = nltk.FreqDist(w.lower() for w in brown.words())
word_features = [w for (w, _) in all_words.most_common(2000)]  # 2000 most frequent words

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

def classify(documents, words):
    featuresets = [(document_features(d), c) for (d,c) in documents]
    train_set, test_set = featuresets[100:], featuresets[:100]
Example #52
import TdMat
from importlib import reload  # reload is not a builtin in Python 3
reload(TdMat)
from nltk.corpus import brown

for f in brown.fileids():
    docs = TdMat.process_sample(f)
    print(f)
    for doc in docs:
        TdMat.tdm.add_doc(doc)

import re
class_num = []
for f in brown.fileids():
    docs = TdMat.process_sample(f)
    ch = re.findall(r'c([a-r])\d\d',f)[0]
    for doc in  docs:
        class_num.append(ord(ch) - 96)

Example #53
print("""
----------------------------------------------------------------------
3  Evaluation
3.1  The Test Set
----------------------------------------------------------------------
""")

import random
from nltk.corpus import brown
tagged_sents = list(brown.tagged_sents(categories='news'))
random.shuffle(tagged_sents)
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]

file_ids = brown.fileids(categories='news')
size = int(len(file_ids) * 0.1)
train_sents = brown.tagged_sents(file_ids[size:])
test_sents = brown.tagged_sents(file_ids[:size])

train_sents = brown.tagged_sents(categories='news')
test_sents = brown.tagged_sents(categories='fiction')
print("-" * 40)

print("""
----------------------------------------------------------------------
3.2  Accuracy
----------------------------------------------------------------------
""")
train_set = [({'word': w}, t) for sent in train_sents for (w, t) in sent]
test_set = [({'word': w}, t) for sent in test_sents for (w, t) in sent]
Example #54
#!/usr/bin/python3
# coding: utf-8
# Brown Corpus: the Brown Corpus of Standard American English is considered the first general-purpose English corpus usable in computational linguistics.
#     It contains one million words of American English text published in 1961, sampling general English from fiction, news and religious texts; later, after extensive manual annotation, a part-of-speech-tagged version was produced.
from nltk.corpus import brown
print(len(brown.fileids()))  # 500 documents
print(brown.fileids()[:5])  # ['ca01', 'ca02', 'ca03', 'ca04', 'ca05']
print(len(brown.words()))  # 1161192 words in total
print(brown.words()[:5])  # ['The', 'Fulton', 'County', 'Grand', 'Jury']; the first 5 words
print(len(brown.words('ca01')))  # 2242; a single document is fairly small
##################################################################
## Tagged data
print(brown.tagged_words()[:3])  # [('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL')]; tags of the first 3 words
##################################################################
## categories
print(len(brown.categories()))  # 15 categories
print(brown.categories())  # ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
print(len(brown.words(categories='news')))  # 100554; words in a single category
print(len(brown.sents(categories=['news', 'editorial', 'reviews'])))  # 9371
# brown contains both tagged and untagged data
print(len(brown.words()))  # 1161192
print(len(brown.words(categories=brown.categories())))  # 1161192; every word belongs to some category
##################################################################
## Paths
print(brown.abspath('ca01'))  # /home/coder352/nltk_data/corpora/brown/ca01
print(brown.abspaths())  # paths of all documents
##################################################################
## Types
print(type(brown))  # <class 'nltk.corpus.reader.tagged.CategorizedTaggedCorpusReader'>
print(type(brown.words()))  # <class 'nltk.corpus.reader.util.ConcatenatedCorpusView'>
print(type(brown.words('ca01')))  # <class 'nltk.corpus.reader.tagged.TaggedCorpusView'>
Example #55
from nltk.corpus import brown
import numpy

feature_dictionary = []
feature_map = {}
for word in brown.words():
    word = word.lower()
    if word not in feature_map:
        feature_map[word] = len(feature_dictionary)
        feature_dictionary.append(word)

frequency_matrix = numpy.zeros((len(feature_dictionary), len(brown.fileids())), dtype=numpy.uint32)
for document_index, document in enumerate(brown.fileids()):
    for word in brown.words(document):
        word = word.lower()
        frequency_matrix[feature_map[word], document_index] += 1

with open("/tmp/feature-dictionary.scidb", "w") as file:
    file.write("{0}[\n")
    for feature in feature_dictionary[:-1]:
        file.write('("%s"),\n' % feature)
    for feature in feature_dictionary[-1:]:
        file.write('("%s")\n' % feature)
    file.write("]\n")

with open("/tmp/frequency-matrix.csv", "w") as file:
    for i in range(frequency_matrix.shape[0]):
        for j in range(frequency_matrix.shape[1]):
            file.write("%s,%s,%s\n" % (i, j, frequency_matrix[i, j]))
Example #56
 def init_kwargs(cls, root=None, fileids=None):
     return dict(
         root=brown.root if root is None else root,
         paths=brown.fileids() if fileids is None else fileids,
     )
Example #57
def main():
  # store the fraction of common words per file
  brown_common_freq = []
  web_common_freq = []
  inaugural_common_freq = []
  gutenberg_common_freq = []
  genesis_common_freq = []

  common = ["the", "be", "to", "of", "and", "a", "in", "that", "have",
            "i", "it", "for", "not", "on", "with", "he", "as", "you",
            "do", "at", "this", "but", "his", "by", "from", "they",
            "we", "say", "her", "she", "or", "an", "will", "my", "one",
            "all", "would", "there", "their", "what", "so", "up", "out",
            "if", "about", "who", "get", "which", "go", "me", "when",
            "make", "can", "like", "time", "no", "just", "him", "know",
            "take", "people", "into", "year", "your", "good", "some",
            "could", "them", "see", "other", "than", "then", "now", "look",
            "only", "come", "its", "over", "think", "also", "back", "after",
            "use", "two", "how", "our", "work", "first", "well", "way",
            "even", "new", "want", "because", "any", "these", "give", "day",
            "most", "us"]
  common.sort()

  for file in gutenberg.fileids():
    total_words = len(gutenberg.words(file))
    total_common = 0
    for word in gutenberg.words(file):
      if word.lower() in common:
        total_common += 1
    gutenberg_common_freq.append(float(total_common)/total_words)

  for file in brown.fileids():
    total_words = len(brown.words(file))
    total_common = 0
    for word in brown.words(file):
      if word.lower() in common:
        total_common += 1
    brown_common_freq.append(float(total_common)/total_words)

  for file in webtext.fileids():
    total_words = len(webtext.words(file))
    total_common = 0
    for word in webtext.words(file):
      if word.lower() in common:
        total_common += 1
    web_common_freq.append(float(total_common)/total_words)

  for file in inaugural.fileids():
    total_words = len(inaugural.words(file))
    total_common = 0
    for word in inaugural.words(file):
      if word.lower() in common:
        total_common += 1
    inaugural_common_freq.append(float(total_common)/total_words)

  for file in genesis.fileids():
    total_words = len(genesis.words(file))
    total_common = 0
    for word in genesis.words(file):
      if word.lower() in common:
        total_common += 1
    genesis_common_freq.append(float(total_common)/total_words)

  with open("common-words.txt", 'w') as f:
    sys.stdout = f
    f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
    for i in range(max(len(genesis_common_freq), len(inaugural_common_freq),
                        len(web_common_freq), len(brown_common_freq),
                        len(gutenberg_common_freq))):
      for corpus in [genesis_common_freq, inaugural_common_freq,
                     web_common_freq, brown_common_freq, gutenberg_common_freq]:
        if i >= len(corpus):
          f.write(",")
        else:
          f.write(str(round(corpus[i], 5)) + ",")
      f.write("\n")
Example #58
def brownPart(part):
    """Return one section of the Brown corpus as a list of sentences."""
    return brown.sents([f for f in brown.fileids() if f.startswith('c'+part)])
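Brown fileids encode their section in the second letter ('ca01' ... 'cr09'), which is what the startswith('c' + part) filter exploits; a quick usage example:

news_sents = brownPart('a')   # section 'a' is the press/reportage ("news") part
print(len(news_sents))        # number of sentences in that section
print(news_sents[0][:6])      # first tokens of the first news sentence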
Example #59
#select words above a rising frequency threshold until the feature list is small enough
def getFeatureWords(maxWordCount, words):
    featurewords = words
    freqThreshold = 5
    while len(featurewords) > maxWordCount:
        featurewords = [word for word in featurewords if words[word] > freqThreshold]
        freqThreshold += 1
    return featurewords

#create dict with boolean values for existence of words in a document
def getDocFeatures(doc, words):
    features = {}
    for word in words:
        features[word] = (word in doc)
    return features

#import data into words, category pairs
docs = [(list(brown.words(fileid)), category) for category in brown.categories() for fileid in brown.fileids(category)]

#identify list of words to be used as features
allwords = nltk.FreqDist([word.lower() for word in brown.words()])
featurewords = getFeatureWords(5000, allwords)

#filter for stopwords
#featurewords = [word for word in featurewords if word not in set(stopwords.words('english'))]

#create category, featureset pairs
docfeatures = [(getDocFeatures(doc, featurewords), category) for (doc, category) in docs]

#Break into training and test sets
random.shuffle(docfeatures)
train, test = docfeatures[:400], docfeatures[400:]