def preproc_dataset(username, fid):
    '''preprocess the dataset'''
    # the username is required so that the correct pubsub channel is used by preproc
    folder = Folder.query.get(fid)
    vocab_min_doc = 2   # minimum number of documents a word must appear in
    max_percent = .8    # maximum percentage of documents a word may appear in
    folder.preprocTM(username, vocab_min_doc, max_percent)
    pubsub_msg = 'proc,' + str(folder.dataset_id) + "," + str(fid) + ",clean"
    msgServer.publish(username + 'Xmenus', "%s" % pubsub_msg)
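# Sketch (assumption): msgServer appears to be a Redis pubsub client, and menu
# messages are plain comma-delimited strings like "proc,<dataset_id>,<fid>,clean".
# A minimal consumer for the '<username>Xmenus' channel might look like the
# helper below; refresh_folder_status is a hypothetical handler, not part of
# this codebase.
def example_menu_listener(username, refresh_folder_status):
    import redis
    pubsub = redis.StrictRedis().pubsub()
    pubsub.subscribe(username + 'Xmenus')
    for message in pubsub.listen():
        if message['type'] != 'message':
            continue
        parts = message['data'].split(',')
        if parts[0] == 'proc':
            # dataset id, folder id, and the new dirty/clean state
            refresh_folder_status(parts[1], parts[2], parts[3])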
def upload_drop(username=None):
    name = "My New Dataset"
    summary = "A summary of the dataset"
    user_id = g.user.id

    dset = Dataset(user_id, name, summary)
    db.session.add(dset)
    db.session.commit()
    did = dset.id

    main_folder = Folder(dset.id, "Main Folder", dict())
    db.session.add(main_folder)
    db.session.commit()
    main_folder.initialize()
    db.session.commit()

    print "DROP"

    dset = Dataset.query.get(did)
    ufilename = request.form['filename']
    fid = request.files.getlist('file')[0]  # grab only a single file
    fn, ext = os.path.splitext(ufilename)
    userpath = "refinery/static/users/" + username + "/documents"
    channel = username + "Xmenus"

    if ext == ".zip":
        zip_file = zipfile.ZipFile(fid)
        files = zip_file.namelist()
        nFiles = len(files)
        lastProg = 0
        count = 0.0
        for member in files:
            filename = os.path.basename(member)
            if filename:
                fn, ext = os.path.splitext(filename)
                if ext == ".txt" or ext == ".pdf":
                    add_txt(os.path.join(userpath, filename),
                            zip_file.open(member), filename, dset)
            count += 1.0
            update = str(int(count / float(nFiles) * 100))
            if update != lastProg:
                lastProg = update
                msgServer.publish(channel, 'uprog,' + update)
    elif ext == ".txt" or ext == ".pdf":
        # a single plain file: use the uploaded filename directly
        # (the original referenced an undefined 'filename' here)
        add_txt(os.path.join(userpath, ufilename), fid, ufilename, dset)
    elif ext == ".tar" or ext == ".gz" or ext == ".bz2":
        import tarfile
        tar_file = tarfile.open(fileobj=fid)
        valid_names = [x for x in tar_file.getnames()
                       if os.path.splitext(x)[1] in ('.txt', '.pdf')]
        nFiles = len(valid_names)
        print nFiles
        lastProg = 0
        count = 0.0
        for member in valid_names:
            filename = os.path.basename(member)
            if filename:
                add_txt(os.path.join(userpath, filename),
                        tar_file.extractfile(member), filename, dset)
            count += 1.0
            update = str(int(count / float(nFiles) * 100))
            if update != lastProg:
                lastProg = update
                msgServer.publish(channel, 'uprog,' + update)
    else:
        print "unknown file format", ext, ufilename

    dset.dirty = "dirty"
    db.session.commit()
    print "GOT", len(dset.folders[0].docIDs), "Documents"
    msgServer.publish(channel, "ucomplete," + ufilename)
    return Response(status="200")
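# Both archive branches above use the same throttling idiom: publish a 'uprog'
# update only when the integer percentage changes, so a large archive does not
# flood the channel with one message per file. The same pattern, factored out
# as a standalone sketch (the 'publish' argument stands in for msgServer.publish):
def with_progress(items, publish, channel):
    '''Yield each item, publishing "uprog,<pct>" when the integer percent changes.'''
    n = len(items)
    last = None
    for i, item in enumerate(items):
        yield item
        pct = int((i + 1) / float(n) * 100)
        if pct != last:
            last = pct
            publish(channel, 'uprog,' + str(pct))
# usage: for member in with_progress(files, msgServer.publish, channel): ...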
def preprocTM(self, username, min_doc, max_doc_percent):
    # we need to add options, like to get rid of xml tags!
    STOPWORDFILEPATH = 'refinery/static/assets/misc/stopwords.txt'
    stopwords = set([x.strip() for x in open(STOPWORDFILEPATH)])

    allD = self.all_docs()
    nDocs = len(allD)

    WC = defaultdict(int)                        # global word counts
    DWC = defaultdict(lambda: defaultdict(int))  # per-document word counts

    def addWord(f, w):
        WC[w] += 1
        DWC[f][w] += 1

    # Step 1: count words in every document, skipping stopwords
    c = 0.0
    prev = 0
    for d in allD:
        filE = d.path
        c += 1.0
        pc = int(c / float(nDocs) * 100)
        if pc > prev:
            prev = pc
            s = 'pprog,Step 1,' + str(self.id) + "," + str(pc)
            msgServer.publish(username + 'Xmenus', "%s" % s)
        for line in open(filE):
            for word in tokenize_sentence(line):
                if word.lower() not in stopwords:
                    addWord(filE, word)

    # Step 2: remove words with bad appearance stats
    to_remove = []
    c = 0.0
    oldpc = -1
    for w in WC:
        c += 1.0
        pc = int(c / float(len(WC)) * 100)
        if oldpc != pc:
            s = 'pprog,Step 2,' + str(self.id) + "," + str(pc)
            msgServer.publish(username + 'Xmenus', "%s" % s)
            oldpc = pc
        has_w = [d for d, m in DWC.items() if w in m]
        n_has_w = len(has_w)
        doc_percent = float(n_has_w) / float(nDocs)
        if n_has_w < min_doc or doc_percent > max_doc_percent:
            for d in has_w:
                DWC[d].pop(w, None)
            to_remove.append(w)
    for w in to_remove:
        WC.pop(w, None)

    vocab = [w for w in WC]
    print "N VOCAB", len(vocab)

    # assign integer ids to words and documents
    v_enum = defaultdict(int)
    for w in vocab:
        v_enum[w] = len(v_enum)
    d_enum = defaultdict(int)
    for f in allD:
        d_enum[f.path] = len(d_enum)

    # write the sparse document-word count matrix, one triplet per line
    outfile = open(self.wordcount_path(), 'w')
    for d in allD:
        f = d.path
        m = DWC[f]
        fID = d_enum[f]
        for w, cnt in m.items():
            wID = v_enum[w]
            outfile.write(str(fID) + ',' + str(wID) + ',' + str(cnt) + '\n')
    outfile.close()

    self.vocabSize = len(vocab)

    outfile = open(self.vocab_path(), 'w')
    for x in vocab:
        outfile.write(x + "\n")
    outfile.close()

    self.dirty = "clean"
    db.session.commit()
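# preprocTM writes wordcount_path() in a sparse triplet format: one
# "docID,wordID,count" row per nonzero cell, with ids assigned by d_enum and
# v_enum. As a sketch of reading it back, assuming SciPy is available (it is
# not imported elsewhere in this file), the triplets map directly onto a
# coo_matrix:
def load_wordcounts(path, n_docs, vocab_size):
    '''Rebuild the document-term count matrix from docID,wordID,count rows.'''
    from scipy.sparse import coo_matrix
    rows, cols, vals = [], [], []
    for line in open(path):
        d, w, c = line.strip().split(',')
        rows.append(int(d))
        cols.append(int(w))
        vals.append(int(c))
    return coo_matrix((vals, (rows, cols)), shape=(n_docs, vocab_size))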
def learn_summarize_model(username, folder_id):
    '''
    The learning step for the summarize model. Each document in the folder
    is sentence-segmented, and each sentence is classified as being a fact
    or not.
    '''
    # SETUP
    STOPWORDFILEPATH = 'refinery/static/assets/misc/stopwords.txt'
    [folder, sum_ex, ex_info] = get_data(folder_id)
    set_sum_status(username, folder_id, sum_ex, 'inprogress')
    db.session.commit()

    # map vocab words to their index
    vocab = {}
    for word in open(folder.vocab_path()):
        vocab[word.strip()] = len(vocab)

    stopwords = set([x.strip() for x in open(STOPWORDFILEPATH)])

    # sbd setup
    sbd_model_path = os.path.abspath("") + '/lib/model_svm/'
    sbd_model = sbd.load_sbd_model(sbd_model_path, True)

    '''
    #fact svm setup
    fid = open("fact_classifier/factsvm")
    fact_svm = pickle.load(fid)
    fid.close()
    fid = open("fact_classifier/factfeat")
    feat_extractor = pickle.load(fid)
    fid.close()
    '''

    # START WORKING
    all_documents = folder.all_docs()
    allsents = dict()
    total_docs = float(len(all_documents))
    last_prog = 0
    count = 0.0
    for doc in all_documents:

        # send progress info
        count += 1.0
        update = int(count / total_docs * 100)
        if update != last_prog:
            last_prog = update
            msg = 'sum_prog,' + str(folder_id) + "," + str(update)
            msgServer.publish(username + "Xmenus", msg)

        # get the raw file text
        filE = doc.path
        raw_text = ""
        fid = codecs.open(filE, "r", "utf-8")
        for line in fid:
            raw_text += " " + line.strip()
        fid.close()

        # sentence boundary detection
        sents = sbd.sbd_text(sbd_model, raw_text, do_tok=False)

        def filter_sentences():
            '''this generator would use fact classification to filter
            sentences; the classifier is disabled for now, so it only
            drops sentences longer than 200 characters'''
            for sent in sents:
                if len(sent) > 200:
                    continue
                words = tokenize_sentence(sent)
                yield [sent, words]
                '''
                #actual classifier, commented out for now...
                ws = defaultdict(int)
                for w in words:
                    ws[w] += 1
                pred = fact_svm.predict(feat_ex.transform(ws))
                if pred == 1:
                    yield [sent, words]
                '''

        def get_sentences():
            '''tokenize and drop stopwords to get word count dicts'''
            for sent, words in filter_sentences():
                word_counts = defaultdict(int)
                good_words = [word for word in words
                              if word in vocab and word.lower() not in stopwords]
                for word in good_words:
                    word_counts[vocab[word]] += 1
                if len(word_counts) > 0:
                    yield [sent, word_counts]

        allsents[doc.id] = [x for x in get_sentences()]

    # cleanup
    ex_info.sents = allsents
    set_sum_status(username, folder_id, sum_ex, 'finish')
    db.session.commit()
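# The fact classifier in learn_summarize_model is stubbed out, so
# filter_sentences currently keeps every sentence of 200 characters or fewer.
# If it were re-enabled, the commented block suggests a flow like the sketch
# below. Assumptions: fact_svm is a scikit-learn classifier and feat_extractor
# a DictVectorizer (the commented code calls it feat_ex, which looks like a
# naming slip); neither is confirmed by the source.
def is_fact(words, fact_svm, feat_extractor):
    '''Sketch: classify one tokenized sentence as fact (1) or not.'''
    from collections import defaultdict
    ws = defaultdict(int)
    for w in words:
        ws[w] += 1
    # DictVectorizer.transform accepts a mapping of feature name -> value
    return fact_svm.predict(feat_extractor.transform(ws))[0] == 1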
def set_sum_status(username, folder_id, sum_ex, status):
    '''set summarization status for a folder and publish it to the main menu'''
    sum_ex.status = status
    channel = username + "Xmenus"
    msgServer.publish(channel, "sumstatus," + str(folder_id) + "," + status)