def describe(corpus):
    print "\t".join(["c/w", "w/s", "w/v", "id"])
    for fileid in corpus.fileids():
        nchars = len(corpus.raw(fileid))
        nwords = len(corpus.words(fileid))
        nsents = len(corpus.sents(fileid))
        nvocab = len(set([w.lower() for w in corpus.words(fileid)]))
        print "\t".join([str(nchars/nwords), str(nwords/nsents), str(nwords/nvocab), fileid])
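# Usage sketch for describe() (Python 2, to match the print statements above);
# assumes NLTK and its corpus data are installed. Prints per-file averages
# (chars/word, words/sentence, words/vocab item) for the Gutenberg corpus.
from nltk.corpus import gutenberg
describe(gutenberg)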
def SearchCorpus(self, event):
    count = 0
    userword = self.text.GetValue()
    wordsenses = self.ShowSenses(self)
    corpus = LazyCorpusLoader('hamshahricorpus', XMLCorpusReader, r'(?!\.).*\.xml')
    for file in corpus.fileids():
        #if num==1000: break
        for doc in corpus.xml(file).getchildren():
            cat = doc.getchildren()[3].text
            text = doc.getchildren()[5].text
            newtext = correctPersianString(text)
            allwords = newtext.split()
            # sents = newtext.split('.')
            if userword in allwords:
                overlap = {}
                bestsenses = []
                wordindx = allwords.index(userword)
                context = allwords[wordindx - 8:wordindx + 8]
                purecontextwords = set(context) - set(stopwords)
                # Simplified Lesk: score each sense by the overlap between its
                # gloss words and the context window around the query word.
                for i in range(len(wordsenses[userword])):
                    senseid = wordsenses[userword][i][1]
                    glosswords = wordsenses[userword][i][0].split()
                    pureglosswords = set(glosswords) - set(stopwords)
                    common = set(pureglosswords) & set(purecontextwords)
                    if userword in common:
                        common.remove(userword)
                    overlap[senseid] = len(common)
                bestoverlap = max(overlap.values())
                if bestoverlap > 0:
                    for item in overlap.keys():
                        if overlap[item] == bestoverlap:
                            bestsenses.append(item)
                    if len(bestsenses) == 1:
                        # if 0 in bestsenses:
                        print ' '.join(context), '\t', bestsenses
                        frame.printarea.Clear()
                        frame.printarea.write(' '.join(context) + '\t' + str(bestsenses))
def create_dfs(corpus):
    print("Gathering data..")
    hold_files = corpus.fileids()
    rowlist = []
    for each in hold_files:
        each_row = {}
        each_row['Year'], each_row['Last_name'], _ = each.replace('-', '.').split('.')
        each_row['Text'] = pre_process(corpus.raw(each))  # Preprocessed text file
        rowlist.append(each_row)
    print("Creating dataframe..")
    df = pd.DataFrame(rowlist)
    df['Year'] = df['Year'].astype(int)
    tf_idf_df = get_tfidf(df)
    return tf_idf_df, df
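# Usage sketch for create_dfs(): assumes pandas is imported as pd and that
# pre_process() and get_tfidf() are defined elsewhere. NLTK's inaugural corpus
# fits the expected "Year-Lastname.txt" fileid pattern.
from nltk.corpus import inaugural
tf_idf_df, df = create_dfs(inaugural)
print(df[['Year', 'Last_name']].head())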
def build_tfidf(corpus_dir, model_filename):
    stemmer = nltk.stem.PorterStemmer()
    corpus = PlaintextCorpusReader(corpus_dir, '.*\.txt$')  # a memory-friendly iterator
    dictionary = corpora.Dictionary()
    bigram_transformer = Phrases(TextCorpus(corpus))
    for myfile in corpus.fileids():
        try:
            chunks = bigram_transformer[[word.lower() for word in corpus.words(myfile)]]
            dictionary.add_documents([[stemmer.stem(chunk) for chunk in chunks]])
        except Exception as e:
            print 'Warning error in file:', myfile
    model = TfidfModel(BowCorpus(corpus, dictionary, bigram_transformer), id2word=dictionary)
    model.save(model_filename)
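# Usage sketch for build_tfidf(): assumes gensim (Phrases, TfidfModel, corpora)
# is installed and the project-specific TextCorpus/BowCorpus wrappers used above
# are importable; the directory and model paths are placeholders. Builds a
# stemmed, bigram-aware TF-IDF model over every .txt file in the directory.
build_tfidf('./my_corpus', 'my_corpus.tfidf.model')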
def hamshahri_targetword_corpus_maker(match, outpath):
    print 'loading hamshahri corpus'
    print
    corpus = LazyCorpusLoader('hamshahricorpus', XMLCorpusReader, r'(?!\.).*\.xml')
    outfile = codecs.open(outpath, 'w', 'utf-8')
    punclist = [u'،', u'؛', u':', u'؟', u'#']
    matchnum = 0
    count = 0
    print 'creating target corpus'
    for file in corpus.fileids():
        #print file
        for doc in corpus.xml(file).getchildren():
            # cat = doc.getchildren()[3].text
            text = doc.getchildren()[5].text
            newtext = correctPersianString(text)
            newtext = newtext.replace('\n', ' ')
            for item in punclist:
                if item in newtext:
                    newtext = newtext.replace(item, '')
            if match in newtext.split():
                # matchnum += 1
                print newtext
                print '#'
                count += 1
                # outfile.write(newtext)
                outfile.write('ALI')
    outfile.close()
    print count
def ham_corpus_maker(outpath, word):
    corpus = LazyCorpusLoader('hamshahricorpus', XMLCorpusReader, r'(?!\.).*\.xml')
    outfile = codecs.open(outpath, 'w', 'utf-8')
    count = 0
    instancenum = 0
    targetwordnum = 0
    for file in corpus.fileids():
        #print file
        for doc in corpus.xml(file).getchildren():
            cat = doc.getchildren()[3].text
            text = doc.getchildren()[5].text
            newtext = correctPersianString(text)
            newtext = newtext.replace('\n', ' ')
            textlines = newtext.split('.')
            if word in newtext.split():
                print newtext
                outfile.write(newtext)
                outfile.write('\n')
    print
    print
    print str(instancenum) + " seeds found "
    print str(targetwordnum) + " target word found "
    outfile.close()
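# Usage sketch for ham_corpus_maker(): assumes the Hamshahri XML corpus is
# installed under nltk_data and correctPersianString() is available; the output
# path and target word below are placeholders. Writes every document containing
# the target word to the output file, one document per line.
ham_corpus_maker('shir_documents.txt', u'شیر')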
for item in c:
    for word in words:
        item[1] = correctPersianString(item[1])
        item[0] = correctPersianString(item[0])
        if word == item[0]:
            mixed = item[1] + ' ' + item[2]
            wordsense.setdefault(item[0], []).append((mixed, item[3]))

anothercount = 0
total = 0
nooverlap = 0
num = 0
commonwords = {}

for file in corpus.fileids():
    #if num==1000: break
    for doc in corpus.xml(file).getchildren():
        cat = doc.getchildren()[3].text
        # print cat
        text = doc.getchildren()[5].text
        newtext = correctPersianString(text)
        allwords = newtext.split()
        sents = newtext.split('.')
        for word in words:
            # print word
            for sent in sents:
                if word in sent.split():
                    # print sent
corpusname = "inaugural" if len(sys.argv) >= 2: corpusname = sys.argv[1] filelim = 4 if len(sys.argv) >= 3: filelim = int(sys.argv[2]) corpus = getattr(nltk.corpus, corpusname) def mkdir_p(path): try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise path = "./%s" % corpusname mkdir_p(path) for i in range(0, filelim): fid = corpus.fileids()[i] with open("%s/%s" % (path, fid), 'w') as out: # need to remove new lines here so MR interprets each file # as a single input out.write(corpus.raw(fid).replace('\n', ' '))
# print ''.join(item[1])
print "real final"
# words = [u'شیر']
total = 0
nooverlap = 0
num = 0
number = 0
commonwords = {}
# for i in realfinal:
#     print i, ' '.join(set(realfinal[i]))
for file in corpus.fileids():
    #if num==1000: break
    for doc in corpus.xml(file).getchildren():
        cat = doc.getchildren()[3].text
        text = doc.getchildren()[5].text
        newtext = correctPersianString(text)
        allwords = text.split()
        sents = newtext.split('.')
        for word in words:
            # print word
            for sent in sents:
                if word in sent.split():
                    total += 1
                    overlap = {}
def contents(corpus):
    return corpus.fileids()
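# Usage sketch for contents(): works with any NLTK corpus reader; assumes the
# corresponding corpus data has been downloaded.
from nltk.corpus import gutenberg
print(contents(gutenberg))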
os.remove(os.path.join(root, fileid))
os.rmdir(root)

# plaintext corpus reader
root = make_testcorpus(
    ext='.txt',
    a="""
    This is the first sentence. Here is another
    sentence! And here's a third sentence.

    This is the second paragraph.

    Tokenization is currently fairly simple, so
    the period in Mr. gets tokenized.
    """,
    b="""This is the second file.""")

corpus = PlaintextCorpusReader(root, ['a.txt', 'b.txt'])
print(corpus.fileids())

corpus = PlaintextCorpusReader(root, '.*\.txt')
print(corpus.fileids())
print(str(corpus.root) == str(root))
print(corpus.words())
print(corpus.raw()[:40])
print(len(corpus.words()), [len(corpus.words(d)) for d in corpus.fileids()])
print(corpus.words('a.txt'))
print(corpus.words('b.txt'))
print(corpus.words()[:4], corpus.words()[-4:])
# del_testcorpus(root)

for corpus in (abc, genesis, inaugural, state_union, webtext):
    print(str(corpus).replace('\\\\', '/'))
    print(' ', repr(corpus.fileids())[:60])
    print(' ', repr(corpus.words()[:10])[:60])

root = make_testcorpus(a="""