Example #1
def stem(*args):

    """
    .. function:: stem(text1, text2,...) -> text

    Stems the input text according to the Porter algorithm.

    Examples:

    >>> sql("stem 'cutting and creating'")
    stem('cutting and creating')
    ----------------------------
    cut and creat

    >>> sql("stem ceci est en français cutting")
    stem('ceci est en français cutting')
    -------------------------------------
    ceci est en françai cut

    """

    out = []
    for i in args:
        o = i.lower()
        o = o.strip()
        o = o.split(" ")

        for k in o:
            if len(k) > 0:
                out.append(porter.stem(k))

    return " ".join(out)
Example #2
	def _import_dfr(self, dfr_dir):
		citations = self._import_dfr_metadata(dfr_dir)

		wordcounts_dir = os.path.join(dfr_dir, "wordcounts")
		for doi in citations.keys():
			try:
				this_text = ''		
				for rowdict in self.parse_csv(os.path.join(wordcounts_dir, "wordcounts_" + doi.replace('/','_') + ".CSV")):
					word = rowdict["WORDCOUNTS"]
					if word in self.stopwords:
						continue
					if self.stemming:
						prestem = word
						if word not in self.stemmed:
							self.stemmed[prestem] = stem(prestem)
						word = self.stemmed[prestem]
					count = int(rowdict["WEIGHT"])

					this_text += (word + u' ') * count
				if len(this_text) < 20:
					continue
				yield doi, this_text
			except:
				logging.error(doi)
				logging.error(traceback.format_exc())
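
The loop above reads JSTOR Data-for-Research word-count CSVs, expecting a WORDCOUNTS column (the word) and a WEIGHT column (its count) in each row. A minimal sketch of a compatible `parse_csv` helper, assuming a plain comma-separated file with that header row (the project's actual helper is not shown in the excerpt):

# Hypothetical parse_csv compatible with the loop above; assumes a
# comma-separated file whose header row contains WORDCOUNTS and WEIGHT.
import csv

def parse_csv(path):
    with open(path, newline='', encoding='utf-8') as f:
        for rowdict in csv.DictReader(f):
            yield rowdict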
Example #3
def get_wiki_pmi_coherence(
        topics,
        numterms=NUM_TERMS):  # TODO make sure the terms are already stemmed
    """
    Coherence score from (Newman, 2010 Automatic Evaluation of Topic Models)
    """
    dbase = db(WIKI_COCC_DB)
    if not dbase.check_table_existence('co_occ'):
        return {}
    scores = {}
    rtime = time()
    tid_dict = {}  # keep terms and co-occurrence counts in memory for caching
    cocc_dict = {}
    for i in xrange(len(topics)):
        scores[topics[i].id] = []
        print 'Determining wikipedia PMI coherence for topic %i of %i; last topic took %0.1fs' % (
            i, len(topics), time() - rtime)
        rtime = time()

        # prep the top numterms terms
        titles = []
        topics[i].get_terms(numterms)
        for j in xrange(numterms):
            # TODO make sure stemming is handled consistently
            titles.append(stem(topics[i].get_term(j).title))
            if not tid_dict.has_key(titles[-1]):
                res = dbase.get_wiki_occ(titles[-1])
                if res == []:  # don't include terms that are not in the database TODO better way to handle this?
                    del (titles[-1])
                    numterms -= 1
                    continue
                tid_dict[titles[-1]] = [
                    res[0], res[1]
                ]  # res[0] is the term_id, res[1] is the occurrence

        for m in xrange(1, numterms):
            tid1 = tid_dict[titles[m]][0]
            t1_occ = tid_dict[titles[m]][1]
            for l in xrange(0, m):  # [x]range goes to m-1
                tid2 = tid_dict[titles[l]][0]
                t2_occ = tid_dict[titles[l]][1]
                min_tid = min(tid1, tid2)
                max_tid = max(tid1, tid2)
                # see if we already found the given co-occurrence
                db_cocc_lookup = True
                if cocc_dict.has_key(min_tid):
                    if cocc_dict[min_tid].has_key(max_tid):
                        db_cocc_lookup = False
                else:
                    cocc_dict[min_tid] = {}

                if db_cocc_lookup:
                    cocc_dict[min_tid][max_tid] = dbase.get_wiki_cocc(
                        tid1, tid2, min(t1_occ, t2_occ))
                co_occs = cocc_dict[min_tid][max_tid]

                numer = (co_occs + 1) * WIKI_NUM_ABST  # +1 is for smoothing
                denom = t1_occ * t2_occ
                scores[topics[i].id].append(log((float(numer)) / denom))
    return scores
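
Each term pair contributes a smoothed PMI-style score: with co_occs the number of Wikipedia abstracts containing both terms, t1_occ and t2_occ the single-term abstract counts, and WIKI_NUM_ABST the total number of abstracts, the appended value is log((co_occs + 1) * WIKI_NUM_ABST / (t1_occ * t2_occ)). A hypothetical helper restating that arithmetic:

from math import log

def pair_score(co_occs, t1_occ, t2_occ, num_abstracts):
    # Smoothed PMI as used in the inner loop above; the +1 keeps pairs
    # that never co-occur from producing log(0).
    return log((co_occs + 1) * num_abstracts / float(t1_occ * t2_occ))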
Example #4
  def groupNounsWithCounts(self, nounsWithCounts):
    ret = []
    stemToNouns = {} # { <stem> : { <noun> : True } }

    # populate stemToNouns
    for noun in nounsWithCounts.keys():
      currStem = stem(noun)
      if currStem not in stemToNouns:
        stemToNouns[currStem] = {}
      if noun not in stemToNouns[currStem]:
        stemToNouns[currStem][noun] = True

    # populate ret
    for currStem in stemToNouns.keys():
      nounToTrue = stemToNouns[currStem]
      currObj = {
        'values': [],
        'positive': {},
        'negative': {},
      }

      for noun in sorted(nounToTrue.keys()):
        currObj['values'].append(noun)
        positiveCountDict = nounsWithCounts[noun]['positive']
        negativeCountDict = nounsWithCounts[noun]['negative']
        for neighbor in positiveCountDict.keys():
          if neighbor not in currObj['positive']:
            currObj['positive'][neighbor] = {
              'count' : 0,
              'sentences' : []
            }
          currObj['positive'][neighbor]['count'] += positiveCountDict[neighbor]['count']
          currObj['positive'][neighbor]['sentences'].extend(positiveCountDict[neighbor]['sentences'])
        for neighbor in negativeCountDict.keys():
          if neighbor not in currObj['negative']:
            currObj['negative'][neighbor] = {
              'count' : 0,
              'sentences' : []
            }
          currObj['negative'][neighbor]['count'] += negativeCountDict[neighbor]['count']
          currObj['negative'][neighbor]['sentences'].extend(negativeCountDict[neighbor]['sentences'])
      ret.append(currObj)
    return ret
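
The method expects `nounsWithCounts` to map each noun to 'positive' and 'negative' neighbour dictionaries, each neighbour carrying a count and its supporting sentences; nouns that share a stem are merged into one grouped entry. A hypothetical input illustrating that structure (assuming a Porter-style `stem()`, under which both keys below reduce to "batteri"):

nouns_with_counts = {
    "battery": {
        "positive": {"life": {"count": 2, "sentences": ["Battery life is great."]}},
        "negative": {},
    },
    "batteries": {
        "positive": {"life": {"count": 1, "sentences": ["The batteries last long."]}},
        "negative": {},
    },
}
# groupNounsWithCounts(nouns_with_counts) would merge the two nouns into one
# entry with values ["batteries", "battery"] and a combined "life" count of 3.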
Example #5
def stem_en(*args):

    """
    .. function:: stem_en(text1, text2,...) -> text

    Detects whether the input is in English and only then applies Porter stemming;
    otherwise it returns the input arguments concatenated.

    Examples:

    >>> sql("stem_en 'cutting and creating'")
    stem_en('cutting and creating')
    -------------------------------
    cut and creat

    >>> sql("stem_en ceci est en français cutting")
    stem_en('ceci est en français cutting')
    ----------------------------------------
    ceci est en français cutting

    """

    jargs = "".join(args)

    if detectlang(*args) != "english":
        return jargs

    out = []
    for i in args:
        o = i.lower()
        o = o.strip()
        o = o.split(" ")

        for k in o:
            if len(k) > 0:
                out.append(porter.stem(k))

    return " ".join(out)
Example #6
	def _import_files(self):
		if self.stemming:
			self.stemmed = {}
		self.docs = []
		with codecs.open(self.texts_file, 'w', encoding='utf-8') as f:
			for filename in self.files:
				with codecs.open(filename, 'r', encoding='utf-8') as input_file:
					text = input_file.read()
					text = re.sub(r"[^\w ]+", u'', text.lower(), flags=re.UNICODE)
					if self.stemming:
						newtext = u''
						for word in text.split():
							if word not in self.stemmed:
								self.stemmed[word] = stem(word)
							newtext += self.stemmed[word] + u' '
						text = newtext
					f.write(u'\t'.join([filename, self.metadata[filename]["label"], text]) + u'\n')
					self.docs.append(filename)
			if self.dfr:
				for doi, text in self._import_dfr(self.dfr_dir):
					f.write(u'\t'.join([doi, self.metadata[doi]["label"], text]) + u'\n')
					self.docs.append(doi)
		self.doc_count = len(self.docs)
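
The `self.stemmed` dictionary memoises the stemmer so repeated words are only stemmed once. On Python 3 the same caching can be written with `functools.lru_cache`; a sketch, assuming `stem()` is a pure function of its argument:

from functools import lru_cache

@lru_cache(maxsize=None)
def cached_stem(word):
    return stem(word)

def stem_text(text):
    # Equivalent to the inner loop above, minus the trailing space per word.
    return ' '.join(cached_stem(word) for word in text.split())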
Example #7
File: corpus.py Project: corajr/TMA
    def parse_folder(self, folder):
        """
        parses the various datatypes in the folder and writes the lda-c format to file
        """
        
        # obtain list of all pdfs (TODO add heterogeneous file types)
        pdflist = os.popen("find %s -name '*.pdf' -type f" % folder) 
        pdflist = pdflist.readlines()
        pdflist = map(lambda x: x.strip(), pdflist)
        self.pdf_list.extend(pdflist)
        toparsetexts = []
        if len(pdflist):
            print '--- beginning pdf to text conversion ---'
            for pdf in pdflist:
                doctitle = self._obtain_clean_title(pdf)
                txtname = self.textdir + '/%s.txt' % doctitle
                cmd = 'pdftotext %s %s' % (pdf, txtname) # TODO: figure out and print which documents did not convert
                os.system(cmd)
                toparsetexts.append(txtname)
                self.rawtextfiles.append(txtname)
            print '--- finished pdf to text conversion ---'
                           
        print '---adding text to corpus---'    
        # add textual data
        txtlist = os.popen("find %s -name '*.txt' -type f" % folder)  # add text files included in folder 
        txtlist = map(lambda x: x.strip(), txtlist) 
        for txtf in txtlist:
            doctitle = self._obtain_clean_title(txtf)
            txtname = self.textdir + '/%s.txt' % doctitle 
            try:
                os.system('ln -s %s %s' % (txtf, txtname))
            except IOError:
                print 'Warning: will not include %s, could not parse text file' % txtf 
                continue
                
            toparsetexts.append(txtname)
            self.rawtextfiles.append(txtname) # TODO: fix code repetition with parsing pdfs
            
        # now add all of the new texts to the corpus
        
        cfile = self.open_corpus()
        if self.usepara: # make a directory for each of the individual paragraphs
            if not os.path.exists(self.paradir): 
                os.makedirs(self.paradir)
        else:     # make a link to the textdir with the same name as the individual paragraph directory
            if not os.path.exists(self.paradir):
                os.system('ln -s %s %s' % (self.textdir, self.paradir))

        # initialize the database to keep track of term-doc occurrences
        dbase = db(self.corpus_db)
        if not self.parsed_data:
            dbase.add_table('term_doc_pair(id INTEGER PRIMARY KEY, term INTEGER, doc INTEGER)')
            if self.make_stem_db:
                dbase.add_table('termid_to_prestem(id INTEGER PRIMARY KEY, prestem VARCHAR)')
            
        # add the data to the corpus
        for tfile in toparsetexts:
            title = tfile.split('/')[-1].split('.')[0].replace('-',' ')
            wordcounts = dict() 
            prestem_dic = dict() 
            try:
                infile = open(tfile,'r')
            except IOError:
                print 'WARNING: could not find %s, will not include' % tfile
                continue
            useparanum = 1
            totparanum = 1
            for paraline in infile:
                totparanum += 1
                words = paraline.split()
                for wrd in words:
                    wrd = self.parse_word(wrd)
                    if wrd=='':
                        continue
                    else:
                        prestem = wrd 
                        if self.dostem:
                            wrd = stem(wrd)
                        if wordcounts.has_key(wrd):
                            wordcounts[wrd] += 1
                        else:
                            wordcounts[wrd] = 1     
                            # keep track of the unstemmed forms of new words for later reference. TODO this currently keeps the unstemmed form of the  first encounter of a stemmed word: perhaps make more general?
                            if self.make_stem_db and not self.vocab.has_key(wrd): 
                                prestem_dic[wrd] = prestem
                                 
                if self.usepara:
                    if sum(wordcounts.values()) > self.minwords:
                        self.write_doc_line(cfile, wordcounts, dbase, prestem_dic)
                        usetitle = title + ' [P%d]' % useparanum
                        self.titles.append(usetitle)    
                        if not isinstance(usetitle, unicode):
                            usetitle = unicode(usetitle)                               
                        self.write_document(os.path.join(self.paradir, slugify(usetitle)),paraline)
                        useparanum += 1  
                    wordcounts = dict()
                    prestem_dic = dict() 
            infile.close()
            if not self.usepara:
                if sum(wordcounts.values()) > self.minwords: 
                    self.write_doc_line(cfile, wordcounts, dbase, prestem_dic)
                    self.titles.append(title)
        cfile.close()
        dbase.commit()
        if not self.parsed_data:
            dbase.add_index('term_doc_pair_idx1 ON term_doc_pair(term)')
            dbase.add_index('term_doc_pair_idx2 ON term_doc_pair(doc)')
            dbase.commit()
        print '--- finished adding text to corpus ---'
        print
        self.parsed_data = True
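
The shell commands built with `os.system` ('pdftotext %s %s', 'ln -s %s %s') interpolate raw paths, so file names containing spaces or shell metacharacters would break or mis-run the command. A safer alternative sketch (not what the project does) passes the arguments as a list, bypassing the shell:

import subprocess

def pdf_to_text(pdf_path, txt_path):
    # Argument lists avoid the quoting problems of interpolated shell strings.
    subprocess.check_call(["pdftotext", pdf_path, txt_path])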
Example #8
def textacknowledgmentsstem(txt,span = 10,maxlen = 3,pattern = r'(?:support)|(?:thank)|(?:research)|(?:\bwork\b)|(?:\bgrant\b)|(?:project)|(?:scienc)|(?:\bfund\b)|(?:nation)|(?:author)|(?:foundat)|(?:\bprogram\b)|(?:\bhelp\b)|(?:univers)|(?:paper)|(?:technolog)|(?:partial)|(?:comment)|(?:develop)|(?:acknowledg)|(?:review)|(?:provid)|(?:grate)|(?:\bcenter\b)|(?:studi)|(?:discuss)|(?:particip)|(?:ministri)|(?:contribut)|(?:european)|(?:system)|(?:comput)|(?:number)|(?:valuabl)|(?:educ)|(?:council)|(?:award)|(?:contract)|(?:inform)|(?:institut)' ):
    """
    .. function:: textacknowledgmentsstem(text, span = 10, maxlen = 3, pattern = <stemmed acknowledgment keywords>)

    Returns the "Acknowledgments" section of documents. To find it, it searches for parts of the document
    that have a high density of pattern matches.

    .. parameters:: txt,span,maxlen,pattern
       txt: input text.
       span: the size, in words, of the chunks that the text is split into.
       maxlen: the size, in chunks, of the scrolling window over which the match density is calculated.
       pattern: regular expression that is matched against the stemmed chunks of the text. By default the
                pattern matches stemmed acknowledgment-related keywords (e.g. "support", "thank", "grant")
                so as to extract sections that look like acknowledgments.


    Examples:

    >>> sql("select textacknowledgmentsstem('')")
    textacknowledgmentsstem('')
    ---------------------------
    <BLANKLINE>
    """

    exp = re.sub('\r\n','\n',txt)
    exp = reduce_spaces.sub(' ', strip_remove_newlines.sub('', exp))

    if exp.count(' ') < span * 10:
        return exp

    acknowledgments = []
    origwords = exp.split(' ')
    words = exp.lower()
    words = words.split(' ')
    stemed = []
    for k in words:
        if len(k) > 0:
            try:
                stemed.append(porter.stem(k))
            except Exception:
                stemed.append(k)
    spanedorigtext = [' '.join(origwords[i:i+span]) for i in range(0, len(origwords), span)]
    spanedstemtext = [' '.join(stemed[i:i+span]) for i in range(0, len(stemed), span)]
    reversedtext = iter(spanedstemtext)
    results = []
    densities = []

    for i in xrange(maxlen/2):
        results.append(0)
    for i in reversedtext:
        count = sum(1 for m in re.finditer(pattern, i))
        if count:
                results.append(count)
        else:
                results.append(0)

    for i in xrange(maxlen/2):
        results.append(0)

    for i in xrange(maxlen/2,len(results)-maxlen/2):
        densities.append(sum(results[i-maxlen/2:i-maxlen/2+maxlen])*1.0/maxlen)

    threshold = 1

    current = 0
    for i in spanedorigtext:
        if len(i)>10:
            if densities[current] > threshold:
                acknowledgments.append(i)
            current+=1
    return '\n'.join(acknowledgments)
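
The function splits the stemmed text into `span`-word chunks, counts pattern matches per chunk, then smooths those counts with a centred moving average of width `maxlen`; chunks whose smoothed density exceeds the threshold are returned. A small sketch of just the smoothing step, equivalent to the padding-and-windowing above for the default odd `maxlen`:

def densities(match_counts, maxlen=3):
    # Centred moving average over per-chunk match counts (zero-padded at the ends).
    padded = [0] * (maxlen // 2) + list(match_counts) + [0] * (maxlen // 2)
    return [sum(padded[i:i + maxlen]) / float(maxlen)
            for i in range(len(padded) - maxlen + 1)]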