Example #1
def preproc_dataset(username, fid):
    '''preprocess the dataset'''

    #the username is required so that preproc publishes to the correct pubsub channel

    folder = Folder.query.get(fid)
    vocab_min_doc = 2  # minimum number of documents a word must be in
    max_percent = .8  # maximum percentage of documents a word can be in
    folder.preprocTM(username, vocab_min_doc, max_percent)
    pubsub_msg = 'proc,' + str(folder.dataset_id) + "," + str(fid) + ",clean"
    msgServer.publish(username + 'Xmenus', "%s" % pubsub_msg)
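
The message published above follows a simple comma-separated convention on the user's 'Xmenus' channel: 'proc,<dataset_id>,<folder_id>,<status>'. As a minimal sketch (not project code; the function name and dict keys here are made up), a subscriber could split such a message back into its fields like this:

def parse_proc_message(raw):
    '''split a 'proc,<dataset_id>,<folder_id>,<status>' menus message'''
    parts = raw.split(',')
    if parts[0] != 'proc':
        return None  # some other menus message, e.g. 'uprog' or 'sumstatus'
    return {'dataset_id': int(parts[1]),
            'folder_id': int(parts[2]),
            'status': parts[3]}
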
Example #2
def preproc_dataset(username, fid):
    '''preprocess the dataset'''
    
    #the username is required so that preproc publishes to the correct pubsub channel

    folder = Folder.query.get(fid)
    vocab_min_doc = 2  # minimum number of documents a word must be in
    max_percent = .8  # maximum percentage of documents a word can be in
    folder.preprocTM(username, vocab_min_doc, max_percent)
    pubsub_msg = 'proc,' + str(folder.dataset_id) + "," + str(fid) + ",clean"
    msgServer.publish(username + 'Xmenus', "%s" % pubsub_msg)
Example #3
def upload_drop(username=None):

    name = "My New Dataset"
    summary = "A summary of the dataset"
    user_id = g.user.id
    
    dset = Dataset(user_id,name,summary)
    db.session.add(dset)
    db.session.commit()
    did = dset.id

    main_folder = Folder(dset.id,"Main Folder",dict())
    db.session.add(main_folder)
    db.session.commit()

    main_folder.initialize()
    db.session.commit()
    
    print "DROP"

    dset = Dataset.query.get(did)

    ufilename = request.form['filename']
    fid = request.files.getlist('file')[0]  #grab only a single file

    fn,ext = os.path.splitext(ufilename)

    userpath = "refinery/static/users/" + username + "/documents"
    channel = username + "Xmenus"

    if ext == ".zip":
        zip_file = zipfile.ZipFile(fid)
        files = zip_file.namelist()
        nFiles = len(files)
        lastProg = 0
        count = 0.0
        for member in files:
            filename = os.path.basename(member)
            if filename:
                fn,ext = os.path.splitext(filename)
                if ext == ".txt" or ext == ".pdf":
                    add_txt(os.path.join(userpath,filename),zip_file.open(member),filename,dset)
            count += 1.0
            update = str(int(count / float(nFiles) * 100))
            if update != lastProg:
                lastProg = update
                s = 'uprog,' + update
                msgServer.publish(channel, "%s" % s)
            
    elif ext == ".txt" or ext == ".pdf":
        add_txt(os.path.join(userpath,ufilename),fid,ufilename,dset)

    elif ext == ".tar" or ext == ".gz" or ext == ".bz2":
        import tarfile
        tar_file = tarfile.open(fileobj=fid)
        tar_filename = os.path.join(userpath,ufilename)
        valid_names = [x for x in tar_file.getnames() if (os.path.splitext(x)[1] == '.txt') or (os.path.splitext(x)[1] == '.pdf')]
        nFiles = len(valid_names)
        print nFiles
        lastProg = 0
        count = 0.0
        for member in valid_names:
            filename = os.path.basename(member)
            if filename:
                add_txt(os.path.join(userpath,filename), tar_file.extractfile(member),filename,dset)
            count += 1.0
            update = str(int(count / float(nFiles) * 100))
            if update != lastProg:
                lastProg = update
                s = 'uprog,' + update
                msgServer.publish(channel, "%s" % s)        

    else:
        print "unknown file format",ext,filename

    dset.dirty = "dirty"

    db.session.commit()

    print "GOT ",len(dset.folders[0].docIDs), "Documents"

    msgServer.publish(channel, "ucomplete," + ufilename) 

    return Response(status="200")
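
Both archive branches above repeat the same throttled progress pattern: compute an upload percentage and publish 'uprog,<percent>' only when it changes. A hedged sketch of that pattern as a standalone helper (illustrative only; publish_progress is not part of the project, and the publish argument stands in for msgServer.publish):

def publish_progress(publish, channel, done, total, last_percent):
    '''publish 'uprog,<percent>' whenever the integer percentage advances'''
    percent = int(done / float(total) * 100)
    if percent != last_percent:
        publish(channel, 'uprog,' + str(percent))
    return percent

# usage inside the loops above would look roughly like:
#     lastProg = publish_progress(msgServer.publish, channel, count, nFiles, lastProg)
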
Example #4
def upload_drop(username=None):

    name = "My New Dataset"
    summary = "A summary of the dataset"
    user_id = g.user.id

    dset = Dataset(user_id, name, summary)
    db.session.add(dset)
    db.session.commit()
    did = dset.id

    main_folder = Folder(dset.id, "Main Folder", dict())
    db.session.add(main_folder)
    db.session.commit()

    main_folder.initialize()
    db.session.commit()

    print "DROP"

    dset = Dataset.query.get(did)

    ufilename = request.form['filename']
    fid = request.files.getlist('file')[0]  #grab only a single file

    fn, ext = os.path.splitext(ufilename)

    userpath = "refinery/static/users/" + username + "/documents"
    channel = username + "Xmenus"

    if ext == ".zip":
        zip_file = zipfile.ZipFile(fid)
        files = zip_file.namelist()
        nFiles = len(files)
        lastProg = 0
        count = 0.0
        for member in files:
            filename = os.path.basename(member)
            if filename:
                fn, ext = os.path.splitext(filename)
                if ext == ".txt" or ext == ".pdf":
                    add_txt(os.path.join(userpath, filename),
                            zip_file.open(member), filename, dset)
            count += 1.0
            update = str(int(count / float(nFiles) * 100))
            if update != lastProg:
                lastProg = update
                s = 'uprog,' + update
                msgServer.publish(channel, "%s" % s)

    elif ext == ".txt" or ext == ".pdf":
        add_txt(os.path.join(userpath, ufilename), fid, ufilename, dset)

    elif ext == ".tar" or ext == ".gz" or ext == ".bz2":
        import tarfile
        tar_file = tarfile.open(fileobj=fid)
        tar_filename = os.path.join(userpath, ufilename)
        valid_names = [
            x for x in tar_file.getnames()
            if (os.path.splitext(x)[1] == '.txt') or (
                os.path.splitext(x)[1] == '.pdf')
        ]
        nFiles = len(valid_names)
        print nFiles
        lastProg = 0
        count = 0.0
        for member in valid_names:
            filename = os.path.basename(member)
            if filename:
                add_txt(os.path.join(userpath, filename),
                        tar_file.extractfile(member), filename, dset)
            count += 1.0
            update = str(int(count / float(nFiles) * 100))
            if update != lastProg:
                lastProg = update
                s = 'uprog,' + update
                msgServer.publish(channel, "%s" % s)

    else:
        print "unknown file format", ext, filename

    dset.dirty = "dirty"

    db.session.commit()

    print "GOT ", len(dset.folders[0].docIDs), "Documents"

    msgServer.publish(channel, "ucomplete," + ufilename)

    return Response(status="200")
Example #5
    def preprocTM(self, username, min_doc, max_doc_percent):

        #we need to add options, like to get rid of xml tags!
        
        STOPWORDFILEPATH = 'refinery/static/assets/misc/stopwords.txt'
        stopwords = set([x.strip() for x in open(STOPWORDFILEPATH)])
        
        allD = self.all_docs()
        
        nDocs = len(allD)
        
        WC = defaultdict(int)
        DWC = defaultdict( lambda: defaultdict(int) )

        def addWord(f,w):
            WC[w] += 1
            DWC[f][w] += 1

        c = 0.0
        prev = 0
        for d in allD:
            filE = d.path
            
            c += 1.0
            pc = int(c / float(nDocs) * 100)
            if pc > prev:
                prev = pc
                s = 'pprog,Step 1,' + str(self.id) + "," + str(pc)
                msgServer.publish(username + 'Xmenus', "%s" % s)
            
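            # tokenize every line of the file and count every token that is not a stopword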
            [[addWord(filE,word) for word in tokenize_sentence(line) if word.lower() not in stopwords] for line in open(filE)] 

        # now remove words with bad appearance stats
        to_remove = []
        c = 0.0
        oldpc = -1
        for w in WC:
            c += 1.0
            pc = int(c/float(len(WC)) * 100)
            if not oldpc == pc:
                s = 'pprog,Step 2,' + str(self.id) + "," + str(pc)
                #print s
                msgServer.publish(username + 'Xmenus', "%s" % s)
                oldpc = pc
            has_w = [d for d,m in DWC.items() if w in m]
            n_has_w = len(has_w)
            doc_percent = float(n_has_w)/float(nDocs)
            #print w,doc_percent,n_has_w
            if n_has_w < min_doc or doc_percent > max_doc_percent:
                [DWC[d].pop(w,None) for d in has_w]
                to_remove.append(w)
        [WC.pop(w,None) for w in to_remove]

        vocab = [w for w in WC]

        print "N VOCAB",len(vocab)
        
        v_enum = defaultdict(int)
        for w in vocab:
            v_enum[w] = len(v_enum) 
        d_enum = defaultdict(int)
        for f in allD:
            d_enum[f.path] = len(d_enum)
    
        outfile = open(self.wordcount_path(),'w')
        for d in allD:
            f = d.path
            m = DWC[f]
            fID = d_enum[f]
            for w, c in m.items():
                wID = v_enum[w]
                outfile.write(str(fID) + ',' + str(wID) + ',' + str(c) + '\n')
        outfile.close()

        self.vocabSize = len(vocab)
  
        outfile = open(self.vocab_path(),'w')
        [outfile.write(x + "\n") for x in vocab]
        outfile.close()


        self.dirty = "clean"
        db.session.commit()
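
preprocTM writes two plain-text artifacts: a vocab file with one word per line, and a sparse wordcount file with one '<doc_id>,<word_id>,<count>' triple per line. A minimal reader for that layout (a hypothetical helper, assuming only the format written above):

from collections import defaultdict

def load_wordcounts(wordcount_path, vocab_path):
    '''rebuild {doc_id: {word: count}} from the files written by preprocTM'''
    vocab = [line.strip() for line in open(vocab_path)]
    counts = defaultdict(dict)
    for line in open(wordcount_path):
        doc_id, word_id, count = line.strip().split(',')
        counts[int(doc_id)][vocab[int(word_id)]] = int(count)
    return vocab, counts
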
Example #6
    def preprocTM(self, username, min_doc, max_doc_percent):

        #we need to add options, like to get rid of xml tags!

        STOPWORDFILEPATH = 'refinery/static/assets/misc/stopwords.txt'
        stopwords = set([x.strip() for x in open(STOPWORDFILEPATH)])

        allD = self.all_docs()

        nDocs = len(allD)

        WC = defaultdict(int)
        DWC = defaultdict(lambda: defaultdict(int))

        def addWord(f, w):
            WC[w] += 1
            DWC[f][w] += 1

        c = 0.0
        prev = 0
        for d in allD:
            filE = d.path

            c += 1.0
            pc = int(c / float(nDocs) * 100)
            if pc > prev:
                prev = pc
                s = 'pprog,Step 1,' + str(self.id) + "," + str(pc)
                msgServer.publish(username + 'Xmenus', "%s" % s)

            [[
                addWord(filE, word) for word in tokenize_sentence(line)
                if word.lower() not in stopwords
            ] for line in open(filE)]

        # now remove words with bad appearance stats
        to_remove = []
        c = 0.0
        oldpc = -1
        for w in WC:
            c += 1.0
            pc = int(c / float(len(WC)) * 100)
            if not oldpc == pc:
                s = 'pprog,Step 2,' + str(self.id) + "," + str(pc)
                #print s
                msgServer.publish(username + 'Xmenus', "%s" % s)
                oldpc = pc
            has_w = [d for d, m in DWC.items() if w in m]
            n_has_w = len(has_w)
            doc_percent = float(n_has_w) / float(nDocs)
            #print w,doc_percent,n_has_w
            if n_has_w < min_doc or doc_percent > max_doc_percent:
                [DWC[d].pop(w, None) for d in has_w]
                to_remove.append(w)
        [WC.pop(w, None) for w in to_remove]

        vocab = [w for w in WC]

        print "N VOCAB", len(vocab)

        v_enum = defaultdict(int)
        for w in vocab:
            v_enum[w] = len(v_enum)
        d_enum = defaultdict(int)
        for f in allD:
            d_enum[f.path] = len(d_enum)

        outfile = open(self.wordcount_path(), 'w')
        for d in allD:
            f = d.path
            m = DWC[f]
            fID = d_enum[f]
            for w, c in m.items():
                wID = v_enum[w]
                outfile.write(str(fID) + ',' + str(wID) + ',' + str(c) + '\n')
        outfile.close()

        self.vocabSize = len(vocab)

        outfile = open(self.vocab_path(), 'w')
        [outfile.write(x + "\n") for x in vocab]
        outfile.close()

        self.dirty = "clean"
        db.session.commit()
Example #7
def learn_summarize_model(username, folder_id):
    '''
    The learning step for the summarize model.  Each document in the folder is
    sentence-segmented, and each sentence is classified as fact or not.
    '''

    #SETUP

    STOPWORDFILEPATH = 'refinery/static/assets/misc/stopwords.txt'
    [folder, sum_ex, ex_info] = get_data(folder_id)
    set_sum_status(username, folder_id, sum_ex, 'inprogress')
    db.session.commit()


    #map vocab words to their index
    vocab = {}
    for word in open(folder.vocab_path()):
        vocab[word.strip()] = len(vocab)

    stopwords = set([x.strip() for x in open(STOPWORDFILEPATH)])

    #sbd setup
    sbd_model_path = os.path.abspath("") + '/lib/model_svm/'
    sbd_model = sbd.load_sbd_model(sbd_model_path, True)
    '''
    #fact svm setup
    fid = open("fact_classifier/factsvm")
    fact_svm = pickle.load(fid)
    fid.close()
    fid = open("fact_classifier/factfeat")
    feat_extractor = pickle.load(fid)
    fid.close()
    '''
    # START WORKING

    all_documents = folder.all_docs()
    allsents = dict()
    total_docs = float(len(all_documents))
    last_prog = 0
    count = 0.0

    for doc in all_documents:

        #send progress info
        count += 1.0
        update = int(count / float(total_docs) * 100)
        if update != last_prog:
            last_prog = update
            msg = 'sum_prog,' + str(folder_id) + "," + str(update)
            msgServer.publish(username + "Xmenus", msg)

        #get the raw file text
        filE = doc.path
        raw_text = ""
        fid = codecs.open(filE, "r", "utf-8")
        for line in fid:
            tline = line.strip()
            raw_text += " " + tline
        fid.close()

        #sentence boundary detection
        sents = sbd.sbd_text(sbd_model, raw_text, do_tok=False)

        def filter_sentences():
            '''filter sentences by length; the fact classifier below is currently disabled'''
            for sent in sents:
                if len(sent) > 200:
                    continue
                words = tokenize_sentence(sent)
                yield [sent, words]

                '''
                #actual classifier, commented out for now...
                ws = defaultdict(int)
                for w in words:
                    ws[w] += 1
                pred = fact_svm.predict(feat_extractor.transform(ws))
                if(pred == 1):
                    yield [sent,words]
                '''


        def get_sentences():
            '''tokenize and drop stopwords to get word count dicts'''
            for sent, words in filter_sentences():
                word_counts = defaultdict(int)
                good_words = [word for word in words if word in vocab and
                              word.lower() not in stopwords]
                for word in good_words:
                    word_counts[vocab[word]] += 1
                if len(word_counts) > 0:
                    yield [sent, word_counts]

        allsents[doc.id] = [x for x in get_sentences()]

    #cleanup
    ex_info.sents = allsents
    set_sum_status(username, folder_id, sum_ex, 'finish')
    db.session.commit()
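
learn_summarize_model leaves ex_info.sents as {doc_id: [[sentence, word_counts], ...]}, where word_counts maps vocab indices to in-sentence counts. The summarizer that consumes this structure is not shown here; as an assumption-labeled sketch, one simple way to rank such sentences is by the summed corpus frequency of their words:

def score_sentences(sent_entries, corpus_freq):
    '''rank [sentence, word_counts] pairs by total corpus frequency (sketch only)'''
    scored = []
    for sent, word_counts in sent_entries:
        score = sum(corpus_freq.get(w, 0) * c for w, c in word_counts.items())
        scored.append((score, sent))
    scored.sort(reverse=True)
    return [sent for score, sent in scored]
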
Example #8
def set_sum_status(username, folder_id, sum_ex, status):
    '''set summarization status for a folder and publish to the main menu'''
    sum_ex.status = status
    channel = username + "Xmenus"
    msgServer.publish(channel, "sumstatus," + str(folder_id) + "," + status)
Example #9
def learn_summarize_model(username, folder_id):
    '''
    The learning step for the summarize model.  Each document in the folder is
    sentence-segmented, and each sentence is classified as fact or not.
    '''

    #SETUP

    STOPWORDFILEPATH = 'refinery/static/assets/misc/stopwords.txt'
    [folder, sum_ex, ex_info] = get_data(folder_id)
    set_sum_status(username, folder_id, sum_ex, 'inprogress')
    db.session.commit()

    #map vocab words to their index
    vocab = {}
    for word in open(folder.vocab_path()):
        vocab[word.strip()] = len(vocab)

    stopwords = set([x.strip() for x in open(STOPWORDFILEPATH)])

    #sbd setup
    sbd_model_path = os.path.abspath("") + '/lib/model_svm/'
    sbd_model = sbd.load_sbd_model(sbd_model_path, True)
    '''
    #fact svm setup
    fid = open("fact_classifier/factsvm")
    fact_svm = pickle.load(fid)
    fid.close()
    fid = open("fact_classifier/factfeat")
    feat_extractor = pickle.load(fid)
    fid.close()
    '''
    # START WORKING

    all_documents = folder.all_docs()
    allsents = dict()
    total_docs = float(len(all_documents))
    last_prog = 0
    count = 0.0

    for doc in all_documents:

        #send progress info
        count += 1.0
        update = int(count / float(total_docs) * 100)
        if update != last_prog:
            last_prog = update
            msg = 'sum_prog,' + str(folder_id) + "," + str(update)
            msgServer.publish(username + "Xmenus", msg)

        #get the raw file text
        filE = doc.path
        raw_text = ""
        fid = codecs.open(filE, "r", "utf-8")
        for line in fid:
            tline = line.strip()
            raw_text += " " + tline
        fid.close()

        #sentence boundary detection
        sents = sbd.sbd_text(sbd_model, raw_text, do_tok=False)

        def filter_sentences():
            '''filter sentences by length; the fact classifier below is currently disabled'''
            for sent in sents:
                if len(sent) > 200:
                    continue
                words = tokenize_sentence(sent)
                yield [sent, words]
                '''
                #actual classifier, commented out for now...
                ws = defaultdict(int)
                for w in words:
                    ws[w] += 1
                pred = fact_svm.predict(feat_extractor.transform(ws))
                if(pred == 1):
                    yield [sent,words]
                '''

        def get_sentences():
            '''tokenize and drop stopwords to get word count dicts'''
            for sent, words in filter_sentences():
                word_counts = defaultdict(int)
                good_words = [
                    word for word in words
                    if word in vocab and word.lower() not in stopwords
                ]
                for word in good_words:
                    word_counts[vocab[word]] += 1
                if len(word_counts) > 0:
                    yield [sent, word_counts]

        allsents[doc.id] = [x for x in get_sentences()]

    #cleanup
    ex_info.sents = allsents
    set_sum_status(username, folder_id, sum_ex, 'finish')
    db.session.commit()
Example #10
def set_sum_status(username, folder_id, sum_ex, status):
    '''set summarization status for a folder and publish to the main menu'''
    sum_ex.status = status
    channel = username + "Xmenus"
    msgServer.publish(channel, "sumstatus," + str(folder_id) + "," + status)