Example 1
def make_unlabeled_set(labeled_set_files, storage_file):
    samples_to_exclude = []
    # From the list of labeled set files, get the items to exclude
    for next_file in labeled_set_files:
        with open(root + "corpora/" + next_file) as next_file_handler:
            for line in next_file_handler:
                ptrn = '"\\[.*\\]"'
                match = re.findall(ptrn, line)
                if match:
                    samples_to_exclude.append(match[0])
    # Go through the Brown corpus, and if we're not looking at something in the
    # list of things to exclude, write it out to a .csv.
    # This is very inefficient, but it is the most foolproof approach and we
    # shouldn't have to run it many times.
    brown_paras = brown.paras()
    para_index, sent_index = 0, 1
    SOURCE_NAME = "BRWN"
    with open(root + "test_extractions/" + storage_file,
              'w') as storage_file_handler:
        for para in brown_paras:
            for sent in para:
                id_tag = '"[' + SOURCE_NAME + ", PARA#" + str(
                    para_index) + ", SENT#" + str(sent_index) + ']"'
                if id_tag not in samples_to_exclude:
                    print(id_tag + "," + " ".join(sent),
                          file=storage_file_handler)
                sent_index += 1
            para_index += 1
            sent_index = 1
Example 2
def gen_corpus(authors, corpus):
    if corpus == 'all':
        all_texts = sum([x.known for x in authors], [])

        return ''.join(sorted(set(all_texts)))
    elif corpus == 'brown':
        paragraphs = brown.paras()

        paragraph_txt = ''
        for paragraph in paragraphs:

            sentence_txt = ''
            for sentence in paragraph:

                word_txt = ''
                for word in sentence:
                    if word == '.' or word == ',' or word == '!'\
                            or word == '?':
                        word_txt = word_txt[:-1] + word + ' '
                    else:
                        word_txt += word + ' '

                sentence_txt += word_txt

            paragraph_txt += sentence_txt + '\n\n'

        return paragraph_txt
    else:
        raise Exception('UNKNOWN CORPUS')
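A minimal usage sketch for gen_corpus above, exercising only the 'brown' branch (the 'all' branch additionally assumes author objects exposing a .known list of texts):
# Hypothetical call: the authors argument is ignored on the 'brown' branch,
# so an empty list is enough. Requires the Brown corpus (nltk.download('brown')).
brown_text = gen_corpus([], 'brown')
print(brown_text[:300])   # first few detokenized sentences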
Example 3
def create_documents():
    brown_paras = brown.paras(categories='news')
    documents = []
    for p in brown_paras:
        for doc in p:
            documents.append(" ".join(doc))

    data = []
    for id, d in enumerate(documents):
        data.append((id, d))
    return data
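A short usage sketch for create_documents above (assuming the Brown corpus has been downloaded); note that each "document" it returns is a single joined sentence from the news paragraphs:
# Hypothetical usage: build the (id, text) pairs and peek at the first one.
data = create_documents()
print(len(data))    # number of sentences across the news paragraphs
print(data[0])      # e.g. (0, 'The Fulton County Grand Jury said Friday ...')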
Example 5
def train():
    paras = brown.paras() + gutenberg.paras() + reuters.paras()
    total = len(paras)
    texts = []
    for i, para in enumerate(paras):
        if i % 1000 == 0:
            print(i, total)
        content = ' '.join(map(lambda x: ' '.join(x), para))
        texts.append(normalize_tokens(content))

    w2v = Word2Vec(texts, size=100, window=5, min_count=5, workers=4)
    w2v.save(model_path)
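A sketch of loading and querying the model saved by train above; model_path and normalize_tokens come from the surrounding module, and note that gensim >= 4.0 renamed the size parameter to vector_size:
from gensim.models import Word2Vec

# Reload the saved model and inspect a few neighbours. Whether a given
# surface form is in the vocabulary depends on what normalize_tokens does
# (stemming/lowercasing), so these lookups may need adjusting.
w2v = Word2Vec.load(model_path)
print(w2v.wv.most_similar('money', topn=5))
print(w2v.wv.similarity('money', 'bank'))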
Example 6
def do(file):
    # Read data from train
    X_train = pd.DataFrame(columns=('review', 'genre'))
    for genre in brown.categories():
        article = brown.paras(categories=genre)
        for review in article:
            X_train = X_train.append(
                {'review': review, 'genre': genre}, ignore_index=True)
    # Read data from test
    X_test = pd.read_csv(file, header=0, delimiter=",")
    X_train = data_clean.convert_to_para(X_train)
    return X_train, X_test
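DataFrame.append, used in the example above, was deprecated in pandas 1.4 and removed in 2.0. A hedged sketch of the same data collection that builds the frame in one call (do_v2 is a hypothetical name; data_clean.convert_to_para is the project-specific helper from the example):
import pandas as pd
from nltk.corpus import brown

def do_v2(file):
    # Collect all rows first, then build the DataFrame once; appending
    # row by row is quadratic and DataFrame.append no longer exists in
    # pandas >= 2.0.
    rows = [{'review': review, 'genre': genre}
            for genre in brown.categories()
            for review in brown.paras(categories=genre)]
    X_train = pd.DataFrame(rows, columns=['review', 'genre'])
    X_test = pd.read_csv(file, header=0, delimiter=",")
    X_train = data_clean.convert_to_para(X_train)  # project-specific helper
    return X_train, X_test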
Example 7
def train():
    paras = brown.paras() + gutenberg.paras() + reuters.paras()
    total = len(paras)
    texts = []
    for i, para in enumerate(paras):
        if i % 1000 == 0:
            print(i, total)
        content = ' '.join(map(lambda x: ' '.join(x), para))
        texts.append(' '.join(normalize_tokens(content)))

    transformer = CountVectorizer()
    tf = transformer.fit_transform(texts)
    svd = TruncatedSVD(n_components=100)
    lsa = svd.fit_transform(tf.T)

    lsa.dump(open(model_path, 'wb'))
    pickle.dump(transformer.vocabulary_, open(vocab_path, 'wb'))
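A sketch of reading the dumped LSA matrix back and comparing two terms; model_path and vocab_path come from the surrounding module, and which surface forms are actually in the vocabulary depends on normalize_tokens:
import pickle
from scipy.spatial.distance import cosine as cos_distance

# ndarray.dump writes a plain pickle, so pickle.load reads it back.
# Rows of lsa are indexed by the CountVectorizer vocabulary ids.
lsa = pickle.load(open(model_path, 'rb'))        # shape: (n_terms, 100)
vocab = pickle.load(open(vocab_path, 'rb'))      # term -> row index

def term_similarity(w1, w2):
    # cosine similarity between two term vectors in the LSA space
    return 1.0 - cos_distance(lsa[vocab[w1]], lsa[vocab[w2]])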
Example 8
def train():
    paras = brown.paras() + gutenberg.paras() + reuters.paras()
    total = len(paras)
    texts = []
    for i, para in enumerate(paras):
        if i % 1000 == 0:
            print(i, total)
        content = ' '.join(map(lambda x: ' '.join(x), para))
        texts.append(' '.join(normalize_tokens(content)))

    transformer = CountVectorizer()
    tf = transformer.fit_transform(texts)

    test_vocab = set()

    reader = csv.reader(open(global_truth_path))
    for line in reader:
        w1, w2, score = line
        test_vocab.add(stemmer.stem(w1))
        test_vocab.add(stemmer.stem(w2))
    test_vocab = {k: v for v, k in enumerate(test_vocab)}

    model = np.zeros((len(test_vocab), len(transformer.vocabulary_)))

    for text in texts:
        text = text.split()
        for i in range(len(text)):
            if text[i] not in test_vocab:
                continue
            # scan the +/- window_size context window around position i
            for j in range(i - window_size, i + window_size + 1):
                if j < 0 or j >= len(text):
                    continue
                if text[j] not in transformer.vocabulary_:
                    continue
                model[test_vocab[text[i]]][transformer.vocabulary_[
                    text[j]]] += 1
    model.dump(model_path)
    pickle.dump(transformer.vocabulary_, open(vocab_path, 'wb'))
    pickle.dump(test_vocab, open(test_vocab_path, 'wb'))
Example 10
def make_testdata(in_dim=10, out_dim=3, num_samples=100):
    """Make sample data from brown corpus"""

    X = []
    y = []

    tp = tops.TextProcessor()

    for idx, para in enumerate(brown.paras()):

        intlist = tp.string_to_ints(' '.join(para[0]), pad_len=in_dim)
        X.append(np.array(intlist))

        _tmpy = np.zeros((out_dim, ))
        _tmpy[idx % out_dim] = 1.0
        y.append(_tmpy)

        # stop once num_samples paragraphs have been collected
        if idx + 1 >= num_samples:
            break

    X = np.vstack(X)
    y = np.vstack(y)
    return X, y
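A brief usage sketch for make_testdata above, assuming the project-specific tops.TextProcessor is importable and the Brown corpus is downloaded:
# Only the shapes are checked here; labels simply cycle over out_dim classes.
X, y = make_testdata(in_dim=10, out_dim=3, num_samples=100)
print(X.shape, y.shape)   # roughly (num_samples, in_dim) and (num_samples, out_dim)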
Example 11
import nltk
# from nltk.book import *
# nltk.download()
from nltk.corpus import brown
from boyer_moore import find_boyer_moore_all
from boyer_moore import find_boyer_moore_all_paras

# A list of the sentences, each of which is a list of the words in the sentence
# brown_sents = brown.sents(categories=['news', 'editorial', 'reviews'])
# A list of the words
#brown_words = brown.words(categories=['news', 'editorial', 'reviews'])
brown_words = brown.words()
brown_paras = brown.paras()
brown_sents = brown.sents()
# The text, but marked up with POS information
# brown_raw = brown.raw(categories=['news', 'editorial', 'reviews'])
#print(brown_sents[0:10])
#print(brown_words[0:100])
#print(brown_raw[0:100])

brown_text = nltk.Text(brown_words)
'''
print(brown_words[:1000])
print("----------")
print(brown_text[:1000])

print(brown_paras[:3])
print("---------")
print(brown_sents[:3])
'''
para_indices = find_boyer_moore_all_paras(brown_paras, ['is', 'like', 'a'])
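A hedged follow-up, assuming find_boyer_moore_all_paras returns the indices of paragraphs containing the pattern (the local boyer_moore module is not shown here):
# Print the first matching paragraph as plain text, if any match was found.
if para_indices:
    first_para = brown_paras[para_indices[0]]
    print(' '.join(word for sent in first_para for word in sent))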
Example 12
brown_files.remove('cf35')
brown_files.remove('cj19')
brown_files.remove('cn16')

brown_files.remove('ch09')
brown_files.remove('ch12')
#'ca11','ca39','ce01','ce14','ce24','ce27','cf06','cf10','cf16','cf34','cg48','cg64','cj08','cj56','cj77','ck14','cl20','cl22','cm04','cn15','cd02','cf35','cj19','cn16','ch09','ch12'

f_out = open("coref_brown_temp2.txt", 'w')

for f in brown_files[brown_files.index('ch10'):]:
    inp = ''
    c = 0
    docs_parse = []
    print "parsing:" + str(f)
    for para in brown.paras(f):
        for sent in para:
            #print len(sent)
            #print sent
            #if len(sent)>=40:
            #continue
            for word in sent:
                c += len(word)
                c += 1
            if c >= 4094:
                # print c
                c = 0
                for word in sent:
                    c += len(word)
                    c += 1
                # print inp
Example 13
import nltk
from nltk.corpus import brown

ficbooks = brown.fileids(categories = ['fiction', 'science_fiction'])

nonficbooks = brown.fileids(categories = ['news', 'history', 'government', 'editorial', 'learned'])

for book in nonficbooks:
    outfile = open(book + '.txt', 'w')
    for para in brown.paras(book):
        sents = []
        for sent in para:
            sents.append(' '.join(sent).replace(',',''))
        p = ' '.join(sents)
        outfile.write(p + '\n')
    outfile.close()
Example 14
File: test.py Project: Misak233/NLP
# lemmatizer
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()


def lemmatize(word):
    lemma = lemmatizer.lemmatize(word, 'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word, 'n')
    return lemma


###
# Your answer BEGINS HERE
###
N = brown.paras()
brown_corpus = []
for i in N:
    x = []
    for j in i:
        for k in j:
            if k.isalpha():
                k = k.lower()
                k = lemmatize(k)
                x.append(k)
    brown_corpus.append(x)

BOW = {}
for i in brown_corpus:
    set1 = []
    for j in i:
Example 15
from nltk import FreqDist
"""Trainiert 2 verschiedene Naive Bayes classifier auf für einen User interessante
Texte.
Der erste classifier lernt Texte in verschiedene Kategorien einzuordnen
Der zweite lernt welche Kategorien der User interessant findet"""

# 1500 random text passages from the Reuters corpus
print 'Load corpus'
#corp = reuters.raw()
print 'Loaded corpus'
#rnd = np.random.randint(0,len(corp)/2,1500)
#raw_documents = [corp[i:i+300] for i in rnd]
print 'Created docs'

pdb.set_trace()
corp = brown.paras(categories='hobbies')
rnd = np.random.randint(0, len(corp) - 3, 300)
raw_documents = [flatten(corp[i:i + 3]) for i in rnd]
pdb.set_trace()
raw_doc2 = list()
for doc in raw_documents:
    raw_doc2.append(''.join(str(word) + " " for word in doc))
raw_documents = raw_doc2

pdb.set_trace()
#posts_j = json.load(open('cogsci.json'))
#posts = posts_j.values()
#raw_documents = list()
#for post in posts:
#    if post.has_key('message'):
#        raw_documents.append(post['message'])
Example 16
        # write weights to file
        word_counts = test_graphs[1]
        with open(__save_dir__+"weight_"+str(class_ind)+"_"+str(i)+".csv","w") as weights_file:
            for j in xrange(0,len(word_counts)):
                if word_counts[j]:
                    weights_file.write(str(j)+","+str(word_counts[j])+"\n")
        i = i + 1
    return	


# print reuters.categories()

print brown.categories()

# print brown.sents(categories=['editorial'])[2]
print len(brown.paras(categories=['romance']))
print len(brown.paras(categories=['news']))
print len(brown.paras(categories=['government']))

# number from each class the global graphs is computed for
numTrain = 100
# number of testing article graphs computed
numTest = 150
# number of classes to compute over (this is hard coded in)
numClasses = 3


# classes chosen
cat1_para = brown.paras(categories=['romance'])
cat2_para = brown.paras(categories=['news'])
cat3_para = brown.paras(categories=['government'])
Example 17
from sklearn.decomposition import TruncatedSVD
from scipy.spatial.distance import cosine as cos_distance
from gensim.models import Word2Vec
from scipy.stats import pearsonr
from nltk.corpus import brown
from nltk.stem import WordNetLemmatizer

# Load 'combined.tab' file in dictionary
with open('../combined.tab') as tabFile:
    next(tabFile)
    tabSepWords = (line.split('\t') for line in tabFile)
    wordSimDict = {(words[0], words[1]): float(words[2])
                   for words in tabSepWords}

# for each paragraph in brown corpus, store a list of lower-cased, lemmatized word types
lemmatizer = WordNetLemmatizer()
brownParas = []
for paragraphs in brown.paras():
    wordTypes = set()
    wordTypes.update([
        lemmatizer.lemmatize(words.lower()) for sentences in paragraphs
        for words in sentences
    ])
    brownParas.append(wordTypes)

# create a dictionary of document frequency for word types in brown corpus
wordTypeDocFreqDict = {}
for paragraphs in brownParas:
    for word in paragraphs:
        wordTypeDocFreqDict[word] = wordTypeDocFreqDict.get(word, 0) + 1

# filter word pairs where frequency of either one of them is less than 10
for word1, word2 in list(wordSimDict):
Example 18
# nltk.download('abc')
print(nltk.corpus.abc.words())
print(nltk.corpus.genesis.words())
# nltk.download('gutenberg')
print(nltk.corpus.gutenberg.words(fileids='austen-emma.txt'))
print(nltk.corpus.inaugural.words())
# nltk.download('state_union')
print(nltk.corpus.state_union.words())
# nltk.download('webtext')
print(nltk.corpus.webtext.words())
# tagged corpora
print(brown.words())
print(brown.tagged_words())
print(brown.sents())  # doctest: +ELLIPSIS
print(brown.tagged_sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(brown.paras(
    categories='reviews'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(brown.tagged_paras(
    categories='reviews'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# nltk.download('indian')
print(indian.words())  # doctest: +SKIP
print(indian.tagged_words())  # doctest: +SKIP
# nltk.download('universal_tagset')
print(brown.tagged_sents(
    tagset='universal'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(conll2000.tagged_words(
    tagset='universal'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# chunked corpora
print(conll2000.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2000.chunked_sents()[:2]:
    print(tree)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# nltk.download('conll2002')