Example #1
def get_wiki_pmi_coherence(
        topics,
        numterms=NUM_TERMS):  # TODO make sure the terms are already stemmed
    """
    Coherence score from (Newman, 2010 Automatic Evaluation of Topic Models)
    """
    dbase = db(WIKI_COCC_DB)
    if not dbase.check_table_existence('co_occ'):
        return {}
    scores = {}
    rtime = time()
    tid_dict = {}  # keep terms and co-occurrence counts in memory for caching
    cocc_dict = {}
    for i in xrange(len(topics)):
        scores[topics[i].id] = []
        print 'Determining Wikipedia PMI coherence for topic %i of %i; last topic took %0.1fs' % (
            i, len(topics), time() - rtime)
        rtime = time()

        # prep the top numterms terms
        titles = []
        topics[i].get_terms(numterms)
        for j in xrange(numterms):
            # TODO make sure stemming is handled consistently
            titles.append(stem(topics[i].get_term(j).title))
            if not tid_dict.has_key(titles[-1]):
                res = dbase.get_wiki_occ(titles[-1])
                if res == []:  # don't include terms that are not in the database TODO better way to handle this?
                    del (titles[-1])
                    numterms -= 1
                    continue
                tid_dict[titles[-1]] = [
                    res[0], res[1]
                ]  # res[0] is the term_id, res[1] is the occurrence count

        for m in xrange(1, numterms):
            tid1 = tid_dict[titles[m]][0]
            t1_occ = tid_dict[titles[m]][1]
            for l in xrange(0, m):  # [x]range goes to m-1
                tid2 = tid_dict[titles[l]][0]
                t2_occ = tid_dict[titles[l]][1]
                min_tid = min(tid1, tid2)
                max_tid = max(tid1, tid2)
                # see if we already found the given co-occurrence
                db_cocc_lookup = True
                if cocc_dict.has_key(min_tid):
                    if cocc_dict[min_tid].has_key(max_tid):
                        db_cocc_lookup = False
                else:
                    cocc_dict[min_tid] = {}

                if db_cocc_lookup:
                    cocc_dict[min_tid][max_tid] = dbase.get_wiki_cocc(
                        tid1, tid2, min(t1_occ, t2_occ))
                co_occs = cocc_dict[min_tid][max_tid]

                numer = (co_occs + 1) * WIKI_NUM_ABST  # +1 is for smoothing
                denom = t1_occ * t2_occ
                scores[topics[i].id].append(log((float(numer)) / denom))
    return scores
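The value appended in the inner loop is the smoothed pointwise mutual information from Newman (2010): the log of (co-occurrence count + 1) times the total number of Wikipedia abstracts, divided by the product of the two terms' occurrence counts. A minimal, self-contained sketch of that per-pair score (pmi_score and num_abstracts are illustrative names; db, stem, and WIKI_NUM_ABST come from the surrounding project):

from math import log

def pmi_score(cooccurrences, occ_a, occ_b, num_abstracts):
    # smoothed PMI for one term pair, mirroring the inner loop above;
    # num_abstracts stands in for WIKI_NUM_ABST
    numer = (cooccurrences + 1) * num_abstracts  # +1 for smoothing
    denom = occ_a * occ_b
    return log(float(numer) / denom)

# toy numbers: terms seen in 1200 and 800 abstracts, together in 50, out of 3.5M abstracts
print(pmi_score(50, 1200, 800, 3500000))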
Example #2
 def init_rel_db(self):
     """
     Initialize the relationship (TMA) database by creating the appropriate tables
     """
     self.dbase = db(self.params['outdir'] + '/tma.sqlite')
     self.dbase.add_table(
         "doc_doc (id INTEGER PRIMARY KEY, doc_a INTEGER, doc_b INTEGER, score FLOAT)"
     )
     self.dbase.add_table(
         "doc_topic (id INTEGER PRIMARY KEY, doc INTEGER, topic INTEGER, score FLOAT)"
     )
     self.dbase.add_table(
         "topics (id INTEGER PRIMARY KEY, title VARCHAR(100), score FLOAT)")
     self.dbase.add_table(
         "topic_term (id INTEGER PRIMARY KEY, topic INTEGER, term INTEGER, score FLOAT)"
     )
     self.dbase.add_table(
         "topic_topic (id INTEGER PRIMARY KEY, topic_a INTEGER, topic_b INTEGER, score FLOAT)"
     )
     self.dbase.add_table(
         "doc_term (id INTEGER PRIMARY KEY, doc INTEGER, term INTEGER, score FLOAT)"
     )
     self.dbase.add_table(
         "terms (id INTEGER PRIMARY KEY, title VARCHAR(100), count INTEGER)"
     )
     self.dbase.add_table(
         "docs (id INTEGER PRIMARY KEY, title VARCHAR(100))")
Example #3
def get_bing_coherence_dict(terms_dict, corpus_dbloc, numtitles=50):
    """
    Coherence from (Newman, 2011 Automatic...) (search index with Bing)
    """
    dbase = db(corpus_dbloc)

    # do we have a de-stemming table?                        
    destem = dbase.check_table_existence("termid_to_prestem")
    bing = SBing(BING_API_KEY)
    scores = {}
    # TODO store more metadata so we can click through and see more of the analysis (e.g. which terms appeared in titles, frequency, co-occurrence, which titles we were working with, etc.)

    print 'Querying Bing...'
    for i, key in enumerate(terms_dict):
        terms = terms_dict[key]
        topic_terms = []
        for trm in terms:
            if destem:
                trm_title = (dbase.get_prestem(trm[1])[0][0]) # TODO what if the stemmed term doesn't exist for some reason?
            else:
                trm_title = trm[0]
            topic_terms.append(trm_title)
        topic_terms.sort() # sort for linear overlapping scans with search titles 
        search_qry = ' '.join(topic_terms)
        topic_terms = map(stem, topic_terms) # TODO make stemming optional on match?
        print '-topic %i of %i: %s' % (i, len(terms_dict), search_qry),

        tmatches = 0
        for j in xrange(0,numtitles, 50):
            try:
                json_response = bing.search(qry=search_qry, top=50, skip=j)
            except HTTPError:
                print 'Error accessing Bing -- make sure your API key is correct' # TODO propagate this message to the display
                return {}
            responses = json_response['d']['results']
            title_terms = map(lambda resp: resp['Title'].strip().lower().split(), responses)  # TODO make case sensitive if desired; TODO make stemming optional
            title_terms = [item for sublist in title_terms for item in sublist]  # flatten list of lists
            title_terms = map(stem, title_terms)
            title_terms.sort()
            tle_n = 0
            top_n = 0
            # presorting the lists allows linear scans 
            while tle_n < len(title_terms) and top_n < len(topic_terms):
                cval = cmp(title_terms[tle_n], topic_terms[top_n])
                if cval == 0: # match 
                    tmatches += 1
                    tle_n += 1
                elif cval == -1: # title term < topic term, advance title pointer
                    tle_n += 1
                else: # title term > topic term, advance topic pointer
                    top_n += 1
        print ': %i' % tmatches
        scores[key] = tmatches
    return scores
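The core of the score is the overlap count between the sorted list of search-result title terms and the sorted list of topic terms; presorting allows a single linear pass with two pointers. A standalone sketch of that matching step (illustrative names; cmp is replaced with explicit comparisons so the sketch is not tied to Python 2):

def count_sorted_overlap(title_terms, topic_terms):
    # count matches between two pre-sorted term lists with a linear scan,
    # mirroring the while-loop above
    matches = 0
    tle_n = top_n = 0
    while tle_n < len(title_terms) and top_n < len(topic_terms):
        if title_terms[tle_n] == topic_terms[top_n]:
            matches += 1
            tle_n += 1
        elif title_terms[tle_n] < topic_terms[top_n]:
            tle_n += 1
        else:
            top_n += 1
    return matches

# prints 3: 'model' matches once, 'topic' matches two title terms
print(count_sorted_overlap(['model', 'topic', 'topic', 'word'], ['model', 'topic']))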
Example #4
def get_topic_coherence_scores(topics, corpus_dbloc, numterms=NUM_TERMS):
    """
    Coherence from (Mimno, 2011 Topic Coherence...)
    """
    dbase = db(corpus_dbloc)
    scores = {}  # [[] for i in range(len(topics))]
    for i in xrange(len(topics)):
        scores[topics[i].id] = []
        topics[i].get_terms(numterms) # prep the top numterms terms
        for m in xrange(1,numterms):
            for l in xrange(m): # [x]range goes from 0 to m-1
                dl_set = set(dbase.get_doc_occ(topics[i].get_term(l).id)) # TODO: could optimize the intersection by sorting the sqlite query
                dm_set = set(dbase.get_doc_occ(topics[i].get_term(m).id))
                dl = len(dl_set)
                dml = len(dl_set.intersection(dm_set))
                scores[topics[i].id].append(log(float(dml + 1) / dl))
    del(dbase)
    return scores
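Each appended value is the per-pair term from Mimno (2011): the log of the co-document frequency of the two terms (plus one for smoothing) over the document frequency of the earlier-ranked term. A small self-contained sketch of that computation on toy document-id sets (illustrative names; db and get_doc_occ belong to the surrounding project):

from math import log

def mimno_pair_score(docs_with_l, docs_with_m):
    # per-pair coherence term, mirroring the inner loop above:
    # log((D(w_m, w_l) + 1) / D(w_l)), with D() counting documents
    dl = len(docs_with_l)
    dml = len(docs_with_l & docs_with_m)
    return log(float(dml + 1) / dl)

# toy example: term l appears in 4 docs, term m in 3, and they share 2 docs
print(mimno_pair_score({1, 2, 3, 4}, {2, 3, 7}))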
Example #5
    def parse_folder(self, folder):
        """
        parses the various datatypes in the folder and writes the lda-c format to file
        """
        
        # obtain list of all pdfs (TODO add heterogeneous file types)
        pdflist = os.popen("find %s -name '*.pdf' -type f" % folder) 
        pdflist = pdflist.readlines()
        pdflist = map(lambda x: x.strip(), pdflist)
        self.pdf_list.extend(pdflist)
        toparsetexts = []
        if len(pdflist):
            print '--- beginning pdf to text conversion ---'
            for pdf in pdflist:
                doctitle = self._obtain_clean_title(pdf)
                txtname = self.textdir + '/%s.txt' % doctitle
                cmd = 'pdftotext %s %s' % (pdf, txtname) # TODO: figure out and print which documents did not convert
                os.system(cmd)
                toparsetexts.append(txtname)
                self.rawtextfiles.append(txtname)
            print '--- finished pdf to text conversion ---'
                           
        print '--- adding text to corpus ---'
        # add textual data
        txtlist = os.popen("find %s -name '*.txt' -type f" % folder)  # add text files included in folder 
        txtlist = map(lambda x: x.strip(), txtlist) 
        for txtf in txtlist:
            doctitle = self._obtain_clean_title(txtf)
            txtname = self.textdir + '/%s.txt' % doctitle 
            try:
                os.system('ln -s %s %s' % (txtf, txtname))
            except IOError:
                print 'Warning: will not include %s, could not parse text file' % txtf 
                continue
                
            toparsetexts.append(txtname)
            self.rawtextfiles.append(txtname) # TODO: fix code repetition with parsing pdfs
            
        # now add all of the new texts to the corpus
        
        cfile = self.open_corpus()
        if self.usepara: # make a directory for each of the individual paragraphs
            if not os.path.exists(self.paradir): 
                os.makedirs(self.paradir)
        else:     # make a link to the textdir with the same name as the individual paragraph directory
            if not os.path.exists(self.paradir):
                os.system('ln -s %s %s' % (self.textdir, self.paradir))

        # initialize the database to keep track of term-doc occurrences
        dbase = db(self.corpus_db)
        if not self.parsed_data:
            dbase.add_table('term_doc_pair(id INTEGER PRIMARY KEY, term INTEGER, doc INTEGER)')
            if self.make_stem_db:
                dbase.add_table('termid_to_prestem(id INTEGER PRIMARY KEY, prestem VARCHAR)')
            
        # add the data to the corpus
        for tfile in toparsetexts:
            title = tfile.split('/')[-1].split('.')[0].replace('-',' ')
            wordcounts = dict() 
            prestem_dic = dict() 
            try:
                infile = open(tfile,'r')
            except IOError:
                print 'WARNING: could not find %s, will not include' % tfile
                continue
            useparanum = 1
            totparanum = 1
            for paraline in infile:
                totparanum += 1
                words = paraline.split()
                for wrd in words:
                    wrd = self.parse_word(wrd)
                    if wrd=='':
                        continue
                    else:
                        prestem = wrd 
                        if self.dostem:
                            wrd = stem(wrd)
                        if wordcounts.has_key(wrd):
                            wordcounts[wrd] += 1
                        else:
                            wordcounts[wrd] = 1     
                            # keep track of the unstemmed forms of new words for later reference. TODO this currently keeps the unstemmed form of the first encounter of a stemmed word: perhaps make more general?
                            if self.make_stem_db and not self.vocab.has_key(wrd): 
                                prestem_dic[wrd] = prestem
                                 
                if self.usepara:
                    if sum(wordcounts.values()) > self.minwords:
                        self.write_doc_line(cfile, wordcounts, dbase, prestem_dic)
                        usetitle = title + ' [P%d]' % useparanum
                        self.titles.append(usetitle)    
                        if not isinstance(usetitle, unicode):
                            usetitle = unicode(usetitle)                               
                        self.write_document(os.path.join(self.paradir, slugify(usetitle)),paraline)
                        useparanum += 1  
                    wordcounts = dict()
                    prestem_dic = dict() 
            infile.close()
            if not self.usepara:
                if sum(wordcounts.values()) > self.minwords: 
                    self.write_doc_line(cfile, wordcounts, dbase, prestem_dic)
                    self.titles.append(title)
        cfile.close()
        dbase.commit()
        if not self.parsed_data:
            dbase.add_index('term_doc_pair_idx1 ON term_doc_pair(term)')
            dbase.add_index('term_doc_pair_idx2 ON term_doc_pair(doc)')
            dbase.commit()
        print '--- finished adding text to corpus ---'
        print
        self.parsed_data = True
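One caveat with the conversion step above: os.system('pdftotext %s %s' ...) interpolates raw paths into a shell command, so file names containing spaces or shell metacharacters will break the call. A sketch of the same conversion using subprocess with list arguments; this is a suggested alternative, not the project's own code:

import subprocess

def pdf_to_text(pdf_path, txt_path):
    # convert one PDF with pdftotext, passing paths as separate arguments
    # so spaces and shell metacharacters in file names are handled safely
    ret = subprocess.call(['pdftotext', pdf_path, txt_path])
    if ret != 0:
        print('WARNING: pdftotext failed on %s' % pdf_path)
    return ret == 0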
Example #6
    def tfidf_clean(self, top_k_terms=5000, min_df=5):
        """
        Use tf-idf to clean the corpus.
        Takes the top tf-idf score of each term and retains the top top_k_terms terms
        Warning: by default tfidf_clean changes the corpus's corpusfile to the cleaned version
        and moves the original version to {{original_name}}-pre_tfidf
        @param top_k_terms: keep the top_k_terms terms by tf-idf rank
        @param min_df: minimum document frequency for the terms
        """
        if not self.corpus_used:
            print "WARNING: You must first parse some data before calling tfidf_clean"
            return False
        orig_corpusfile = self.corpusfile + '-pre_tfidf'
        shutil.move(self.corpusfile, orig_corpusfile)

        # first obtain tf-idf scores for all terms
        tf_list = [0]*self.vocabct
        df_list = [0]*self.vocabct
        tfidf_list = [0]*self.vocabct
        for doc in open(orig_corpusfile,'r'):
            cts = doc.strip().split()[1:] #remove the term count
            term_ct_pairs = map(lambda x: x.split(':'), cts)
            doc_len = sum(map(lambda x: int(x[1]), term_ct_pairs))

            for pair in term_ct_pairs:
                trm = int(pair[0])
                tf = float(pair[1])/doc_len
                df_list[trm] += 1
                if tf > tf_list[trm]:
                    tf_list[trm] = tf

        # calculate tf-idf scores
        for i in xrange(self.vocabct):
            tfidf_list[i] = tf_list[i]*log10(float(self.docct)/df_list[i])

        # determine the minimum tf-idf score
        srt_tfidf = sorted(tfidf_list, reverse=True)
        min_score = srt_tfidf[top_k_terms]

        # rewrite the corpus to file, only allowing terms whose max(tf-idf) score exceeds the minimum
        old_to_new_dict = dict()
        self.vocabct = 0
        self.wordct = 0
        writefile = open(self.corpusfile, 'w')
        for doc in open(orig_corpusfile,'r'):
            writeline = ''
            cts = doc.strip().split()[1:]
            term_ct_pairs = map(lambda x: x.split(':'), cts)
            doc_term_ct = 0
            for tc_pair in term_ct_pairs:
                tid = int(tc_pair[0])
                if tfidf_list[tid] < min_score or df_list[tid] < min_df:
                    continue
                if not old_to_new_dict.has_key(tid):
                    old_to_new_dict[tid] = self.vocabct
                    self.vocabct += 1
                self.wordct += int(tc_pair[1])
                writeline += str(old_to_new_dict[tid]) + ':' + tc_pair[1] + ' '
                doc_term_ct += 1
            writeline = str(doc_term_ct) + " " + writeline
            writefile.write(writeline + '\n')
        writefile.close()
        remove_ct = len(tfidf_list)-len(old_to_new_dict)
        print 'Processing removed %i of %i terms, keeping %i terms. Min TF-IDF score is: %0.4f' % (remove_ct, len(tfidf_list), len(old_to_new_dict), min_score)

        # update the appropriate databases TODO: perhaps wait to form the databases for efficiency
        dbase = db(self.corpus_db)
        if self.make_stem_db:
            dbase = db(self.corpus_db)
            oldid_to_prestem = dbase.fetch('SELECT * FROM termid_to_prestem')
            dbase.execute('DROP TABLE termid_to_prestem')
            dbase.add_table('termid_to_prestem(id INTEGER PRIMARY KEY, prestem VARCHAR)')
            id_prestem_list = []
            for op_item in map(list, oldid_to_prestem):
                if old_to_new_dict.has_key(op_item[0]):
                    op_item[0] = old_to_new_dict[op_item[0]]
                    id_prestem_list.append(op_item)
            dbase.executemany('INSERT INTO termid_to_prestem(id, prestem) VALUES(?,?)',id_prestem_list)


        dbase.execute('SELECT * FROM term_doc_pair')
        term_doc_items = []
        for item in dbase.cur:
            if old_to_new_dict.has_key(item[1]):
                item = list(item)
                item[1] = old_to_new_dict[item[1]]
                term_doc_items.append(item[1:])
        dbase.execute('DROP TABLE term_doc_pair')
        dbase.add_table('term_doc_pair(id INTEGER PRIMARY KEY, term INTEGER, doc INTEGER)')
        dbase.executemany('INSERT INTO term_doc_pair(term, doc) VALUES(?,?)', term_doc_items)
        dbase.add_index('term_doc_pair_idx1 ON term_doc_pair(term)')
        dbase.add_index('term_doc_pair_idx2 ON term_doc_pair(doc)') 
        del(dbase)
        del(term_doc_items)


        # update corpus vocab
        oldid_to_term = dict((v,k) for k, v in self.vocab.iteritems())
        self.vocab = {}
        for k,v in old_to_new_dict.iteritems():
            self.vocab[oldid_to_term[k]] = v
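The ranking above scores each term by its maximum per-document term frequency times the inverse document frequency (log base 10 of total documents over document frequency), then retains terms whose score reaches the top_k_terms-th highest value and whose document frequency is at least min_df. A tiny sketch of that scoring rule with toy numbers (max_tfidf is an illustrative name):

from math import log10

def max_tfidf(max_tf, df, num_docs):
    # score used above for ranking terms: highest per-document tf times idf
    return max_tf * log10(float(num_docs) / df)

# toy numbers: a term whose best tf is 0.02, appearing in 15 of 1000 documents
print(max_tfidf(0.02, 15, 1000))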