import re

from nltk.corpus import brown


def make_unlabeled_set(labeled_set_files, storage_file):
    # `root` is assumed to be a module-level path prefix defined elsewhere.
    samples_to_exclude = []
    # From the list of labeled set files, get the items to exclude.
    for next_file in labeled_set_files:
        with open(root + "corpora/" + next_file) as next_file_handler:
            for line in next_file_handler:
                ptrn = '"\\[.*\\]"'
                match = re.findall(ptrn, line)
                if match:
                    samples_to_exclude.append(match[0])
    # Go through the Brown corpus and, if the current sentence is not in the
    # exclusion list, write it to a .csv. This is very inefficient, but it is
    # the most foolproof approach and we should not have to run it many times.
    brown_paras = brown.paras()
    para_index, sent_index = 0, 1
    SOURCE_NAME = "BRWN"
    with open(root + "test_extractions/" + storage_file, 'w') as storage_file_handler:
        for para in brown_paras:
            for sent in para:
                id_tag = '"[' + SOURCE_NAME + ", PARA#" + str(para_index) + ", SENT#" + str(sent_index) + ']"'
                if id_tag not in samples_to_exclude:
                    print(id_tag + "," + " ".join(sent), file=storage_file_handler)
                sent_index += 1
            para_index += 1
            sent_index = 1
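A hedged usage sketch for make_unlabeled_set; the `root` prefix and the labeled-set file names below are hypothetical and only illustrate the expected call shape (the corpora/ and test_extractions/ directories must already exist under root).

root = "/path/to/project/"                                    # hypothetical module-level prefix
labeled_files = ["labeled_set_01.csv", "labeled_set_02.csv"]  # hypothetical files under root + "corpora/"
make_unlabeled_set(labeled_files, "brown_unlabeled.csv")
# Writes root + "test_extractions/brown_unlabeled.csv" with one
# '"[BRWN, PARA#i, SENT#j]",<sentence text>' row per sentence that was not excluded.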
def gen_corpus(authors, corpus):
    if corpus == 'all':
        all_texts = sum([x.known for x in authors], [])
        return ''.join(sorted(set(all_texts)))
    elif corpus == 'brown':
        paragraphs = brown.paras()
        paragraph_txt = ''
        for paragraph in paragraphs:
            sentence_txt = ''
            for sentence in paragraph:
                word_txt = ''
                for word in sentence:
                    # Attach sentence punctuation to the preceding word instead
                    # of leaving a space before it.
                    if word in ('.', ',', '!', '?'):
                        word_txt = word_txt[:-1] + word + ' '
                    else:
                        word_txt += word + ' '
                sentence_txt += word_txt
            paragraph_txt += sentence_txt + '\n\n'
        return paragraph_txt
    else:
        raise Exception('UNKNOWN CORPUS')
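A brief usage sketch; the `authors` argument is only used by the 'all' branch, so an empty list is enough to render the Brown corpus as plain text.

brown_text = gen_corpus([], 'brown')
print(brown_text[:200])   # paragraphs separated by blank lines, punctuation attached to the preceding word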
from nltk.corpus import brown


def create_documents():
    brown_paras = brown.paras(categories='news')
    documents = []
    for p in brown_paras:
        # Each `doc` here is one sentence (a list of word tokens) of a news paragraph.
        for doc in p:
            documents.append(" ".join(doc))
    # Pair each document with a numeric id.
    data = []
    for doc_id, d in enumerate(documents):
        data.append((doc_id, d))
    return data
from gensim.models import Word2Vec
from nltk.corpus import brown, gutenberg, reuters


def train():
    # `normalize_tokens` and `model_path` are defined elsewhere in the module.
    paras = brown.paras() + gutenberg.paras() + reuters.paras()
    total = len(paras)
    texts = []
    for i, para in enumerate(paras):
        if i % 1000 == 0:
            print(i, total)
        content = ' '.join(map(lambda x: ' '.join(x), para))
        texts.append(normalize_tokens(content))
    # Note: gensim >= 4.0 renamed the `size` argument to `vector_size`.
    w2v = Word2Vec(texts, size=100, window=5, min_count=5, workers=4)
    w2v.save(model_path)
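A minimal usage sketch, assuming the model was saved to model_path as above; the query term 'money' is only an illustration, since what is in the vocabulary depends on normalize_tokens and the min_count cutoff.

from gensim.models import Word2Vec

w2v = Word2Vec.load(model_path)                  # reload the model saved by train()
print(w2v.wv.most_similar('money', topn=5))      # nearest neighbours by cosine similarity via the keyed-vector API (model.wv)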
import pandas as pd
from nltk.corpus import brown


def do(file):
    # Read training data from the Brown corpus.
    X_train = pd.DataFrame(columns=('review', 'genre'))
    for genre in brown.categories():
        article = brown.paras(categories=genre)
        for review in article:
            # Note: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
            X_train = X_train.append({
                'review': review,
                'genre': genre
            }, ignore_index=True)
    # Read test data from the given CSV file.
    X_test = pd.read_csv(file, header=0, delimiter=",")
    # `data_clean` is a project module defined elsewhere.
    X_train = data_clean.convert_to_para(X_train)
    return X_train, X_test
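Since DataFrame.append was removed in pandas 2.0, here is a sketch of the same training-frame construction that collects the rows first and builds the frame once (names mirror the function above).

import pandas as pd
from nltk.corpus import brown

rows = [{'review': review, 'genre': genre}
        for genre in brown.categories()
        for review in brown.paras(categories=genre)]
X_train = pd.DataFrame(rows, columns=['review', 'genre'])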
import pickle

from nltk.corpus import brown, gutenberg, reuters
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer


def train():
    # `normalize_tokens`, `model_path`, and `vocab_path` are defined elsewhere in the module.
    paras = brown.paras() + gutenberg.paras() + reuters.paras()
    total = len(paras)
    texts = []
    for i, para in enumerate(paras):
        if i % 1000 == 0:
            print(i, total)
        content = ' '.join(map(lambda x: ' '.join(x), para))
        texts.append(' '.join(normalize_tokens(content)))
    transformer = CountVectorizer()
    tf = transformer.fit_transform(texts)
    # LSA over the transposed term-document matrix: one 100-dimensional vector per term.
    svd = TruncatedSVD(n_components=100)
    lsa = svd.fit_transform(tf.T)
    lsa.dump(open(model_path, 'wb'))
    pickle.dump(transformer.vocabulary_, open(vocab_path, 'wb'))
import csv
import pickle

import numpy as np
from nltk.corpus import brown, gutenberg, reuters
from sklearn.feature_extraction.text import CountVectorizer


def train():
    # `normalize_tokens`, `stemmer`, `window_size`, and the various paths are
    # defined elsewhere in the module.
    paras = brown.paras() + gutenberg.paras() + reuters.paras()
    total = len(paras)
    texts = []
    for i, para in enumerate(paras):
        if i % 1000 == 0:
            print(i, total)
        content = ' '.join(map(lambda x: ' '.join(x), para))
        texts.append(' '.join(normalize_tokens(content)))
    transformer = CountVectorizer()
    tf = transformer.fit_transform(texts)
    # Collect the stemmed test vocabulary from the gold-standard similarity file.
    test_vocab = set()
    reader = csv.reader(open(global_truth_path))
    for line in reader:
        w1, w2, score = line
        test_vocab.add(stemmer.stem(w1))
        test_vocab.add(stemmer.stem(w2))
    test_vocab = {k: v for v, k in enumerate(test_vocab)}
    # Count co-occurrences of each test word with every corpus word inside a
    # symmetric window of `window_size` tokens.
    model = np.zeros((len(test_vocab), len(transformer.vocabulary_)))
    for text in texts:
        text = text.split()
        for i in range(len(text)):
            if text[i] not in test_vocab:
                continue
            for j in range(i - window_size, i + window_size + 1):
                if j < 0 or j >= len(text):
                    continue
                if text[j] not in transformer.vocabulary_:
                    continue
                model[test_vocab[text[i]]][transformer.vocabulary_[text[j]]] += 1
    model.dump(model_path)
    pickle.dump(transformer.vocabulary_, open(vocab_path, 'wb'))
    pickle.dump(test_vocab, open(test_vocab_path, 'wb'))
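A usage sketch for the artefacts dumped above, assuming the same model_path and test_vocab_path; ndarray.dump writes a pickle, so numpy can reload it with allow_pickle=True, and two (already stemmed) test words can then be compared by cosine similarity over their co-occurrence rows.

import pickle

import numpy as np

cooc = np.load(model_path, allow_pickle=True)             # co-occurrence counts, shape (len(test_vocab), corpus vocab size)
test_vocab = pickle.load(open(test_vocab_path, 'rb'))     # stemmed test word -> row index

def cooc_similarity(w1, w2):
    # Cosine similarity between the co-occurrence rows of two stemmed test words.
    v1, v2 = cooc[test_vocab[w1]], cooc[test_vocab[w2]]
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return float(v1 @ v2) / denom if denom else 0.0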
import numpy as np
from nltk.corpus import brown


def make_testdata(in_dim=10, out_dim=3, num_samples=100):
    """Make sample data from the Brown corpus."""
    # `tops` is a project module providing TextProcessor.
    X = []
    y = []
    tp = tops.TextProcessor()
    for idx, para in enumerate(brown.paras()):
        # Encode the first sentence of each paragraph as a fixed-length int vector.
        intlist = tp.string_to_ints(' '.join(para[0]), pad_len=in_dim)
        X.append(np.array(intlist))
        # One-hot label, cycling over the out_dim classes.
        _tmpy = np.zeros((out_dim, ))
        _tmpy[idx % out_dim] = 1.0
        y.append(_tmpy)
        if idx > num_samples:
            break
    X = np.vstack(X)
    y = np.vstack(y)
    return X, y
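A hedged usage sketch; `tops.TextProcessor` is project-specific, and the shapes below assume its string_to_ints pads or truncates each encoded sentence to pad_len.

X, y = make_testdata(in_dim=10, out_dim=3, num_samples=100)
print(X.shape)   # roughly (num_samples + 2, in_dim): one row per paragraph seen before the break
print(y.shape)   # same number of rows, out_dim columns: one-hot labels cycling over the classes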
import nltk
# from nltk.book import *
# nltk.download()
from nltk.corpus import brown

from boyer_moore import find_boyer_moore_all
from boyer_moore import find_boyer_moore_all_paras

# A list of the sentences, each of which is a list of the words in the sentence
# brown_sents = brown.sents(categories=['news', 'editorial', 'reviews'])
# A list of the words
# brown_words = brown.words(categories=['news', 'editorial', 'reviews'])
brown_words = brown.words()
brown_paras = brown.paras()
brown_sents = brown.sents()
# The text, but marked up with POS information
# brown_raw = brown.raw(categories=['news', 'editorial', 'reviews'])

# print(brown_sents[0:10])
# print(brown_words[0:100])
# print(brown_raw[0:100])

brown_text = nltk.Text(brown_words)

'''
print(brown_words[:1000])
print("----------")
print(brown_text[:1000])
print(brown_paras[:3])
print("---------")
print(brown_sents[:3])
'''

para_indices = find_boyer_moore_all_paras(brown_paras, ['is', 'like', 'a'])
brown_files.remove('cf35')
brown_files.remove('cj19')
brown_files.remove('cn16')
brown_files.remove('ch09')
brown_files.remove('ch12')
# 'ca11','ca39','ce01','ce14','ce24','ce27','cf06','cf10','cf16','cf34','cg48','cg64','cj08','cj56','cj77','ck14','cl20','cl22','cm04','cn15','cd02','cf35','cj19','cn16','ch09','ch12'

f_out = open("coref_brown_temp2.txt", 'w')
for f in brown_files[brown_files.index('ch10'):]:
    inp = ''
    c = 0
    docs_parse = []
    print "parsing:" + str(f)
    for para in brown.paras(f):
        for sent in para:
            # print len(sent)
            # print sent
            # if len(sent) >= 40:
            #     continue
            for word in sent:
                c += len(word)
                c += 1
            if c >= 4094:
                # print c
                c = 0
                for word in sent:
                    c += len(word)
                    c += 1
                # print inp
import nltk
from nltk.corpus import brown

ficbooks = brown.fileids(categories=['fiction', 'science_fiction'])
# Note: 'history' is not one of the Brown corpus categories.
nonficbooks = brown.fileids(categories=['news', 'history', 'government', 'editorial', 'learned'])

# Write each non-fiction file out as plain text, one paragraph per line, commas stripped.
for book in nonficbooks:
    outfile = open(book + '.txt', 'w')
    for para in brown.paras(book):
        sents = []
        for sent in para:
            sents.append(' '.join(sent).replace(',', ''))
        p = ' '.join(sents)
        outfile.write(p + '\n')
    outfile.close()
# lemmatizer
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word, 'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word, 'n')
    return lemma

###
# Your answer BEGINS HERE
###

N = brown.paras()
brown_corpus = []
for i in N:
    x = []
    for j in i:
        for k in j:
            if k.isalpha():
                k = k.lower()
                k = lemmatize(k)
                x.append(k)
    brown_corpus.append(x)

BOW = {}
for i in brown_corpus:
    set1 = []
    for j in i:
import pdb

import numpy as np
from nltk import FreqDist
from nltk.corpus import brown

"""Trains two different Naive Bayes classifiers on texts that a user finds interesting.
The first classifier learns to sort texts into different categories.
The second learns which categories the user finds interesting."""

# 1500 random text passages from the Reuters corpus
print 'Load corpus'
# corp = reuters.raw()
print 'Loaded corpus'
# rnd = np.random.randint(0, len(corp) / 2, 1500)
# raw_documents = [corp[i:i+300] for i in rnd]
print 'Created docs'
pdb.set_trace()

# `flatten` is a helper defined elsewhere in the project.
corp = brown.paras(categories='hobbies')
rnd = np.random.randint(0, len(corp) - 3, 300)
raw_documents = [flatten(corp[i:i + 3]) for i in rnd]
pdb.set_trace()

raw_doc2 = list()
for doc in raw_documents:
    raw_doc2.append(''.join(str(word) + " " for word in doc))
raw_documents = raw_doc2
pdb.set_trace()

# posts_j = json.load(open('cogsci.json'))
# posts = posts_j.values()
# raw_documents = list()
# for post in posts:
#     if post.has_key('message'):
#         raw_documents.append(post['message'])
        # write weights to file
        word_counts = test_graphs[1]
        with open(__save_dir__ + "weight_" + str(class_ind) + "_" + str(i) + ".csv", "w") as weights_file:
            for j in xrange(0, len(word_counts)):
                if word_counts[j]:
                    weights_file.write(str(j) + "," + str(word_counts[j]) + "\n")
        i = i + 1
    return

# print reuters.categories()
print brown.categories()
# print brown.sents(categories=['editorial'])[2]
print len(brown.paras(categories=['romance']))
print len(brown.paras(categories=['news']))
print len(brown.paras(categories=['government']))

# number from each class the global graphs are computed for
numTrain = 100
# number of testing article graphs computed
numTest = 150
# number of classes to compute over (this is hard-coded in)
numClasses = 3

# classes chosen
cat1_para = brown.paras(categories=['romance'])
cat2_para = brown.paras(categories=['news'])
cat3_para = brown.paras(categories=['government'])
from nltk.corpus import brown
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD
from scipy.spatial.distance import cosine as cos_distance
from gensim.models import Word2Vec
from scipy.stats.stats import pearsonr

# Load the 'combined.tab' file into a dictionary of (word1, word2) -> gold similarity score
with open('../combined.tab') as tabFile:
    next(tabFile)
    tabSepWords = (line.split('\t') for line in tabFile)
    wordSimDict = {(words[0], words[1]): float(words[2]) for words in tabSepWords}

# For each paragraph in the Brown corpus, store a set of lower-cased, lemmatized word types
lemmatizer = WordNetLemmatizer()
brownParas = []
for paragraphs in brown.paras():
    wordTypes = set()
    wordTypes.update([
        lemmatizer.lemmatize(words.lower())
        for sentences in paragraphs
        for words in sentences
    ])
    brownParas.append(wordTypes)

# Create a dictionary of document frequency for word types in the Brown corpus
wordTypeDocFreqDict = {}
for paragraphs in brownParas:
    for word in paragraphs:
        wordTypeDocFreqDict[word] = wordTypeDocFreqDict.get(word, 0) + 1

# Filter word pairs where the frequency of either one of them is less than 10
for word1, word2 in list(wordSimDict):
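The fragment above breaks off inside the filtering loop; below is a hypothetical sketch (not from the original) of that filter, using the document-frequency dictionary just built and the threshold of 10 named in the comment.

# Hypothetical sketch: keep only pairs where both lemmatized word types occur in at least 10 Brown paragraphs.
filteredSimDict = {
    (w1, w2): score
    for (w1, w2), score in wordSimDict.items()
    if wordTypeDocFreqDict.get(lemmatizer.lemmatize(w1.lower()), 0) >= 10
    and wordTypeDocFreqDict.get(lemmatizer.lemmatize(w2.lower()), 0) >= 10
}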
# nltk.download('abc')
print(nltk.corpus.abc.words())
print(nltk.corpus.genesis.words())
# nltk.download('gutenberg')
print(nltk.corpus.gutenberg.words(fileids='austen-emma.txt'))
print(nltk.corpus.inaugural.words())
# nltk.download('state_union')
print(nltk.corpus.state_union.words())
# nltk.download('webtext')
print(nltk.corpus.webtext.words())

# tagged corpora
print(brown.words())
print(brown.tagged_words())
print(brown.sents())  # doctest: +ELLIPSIS
print(brown.tagged_sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(brown.paras(categories='reviews'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(brown.tagged_paras(categories='reviews'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

# nltk.download('indian')
print(indian.words())  # doctest: +SKIP
print(indian.tagged_words())  # doctest: +SKIP

# nltk.download('universal_tagset')
print(brown.tagged_sents(tagset='universal'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(conll2000.tagged_words(tagset='universal'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

# chunked corpora
print(conll2000.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2000.chunked_sents()[:2]:
    print(tree)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

# nltk.download('conll2002')