def doSpellcheck(self):
    """Recompute the WordMisspelt flag for every entry of IMC.wordCensus.

    Each census word is looked up, in this order, in the good-words list,
    the bad-words list and finally (only if the spell checker reports it
    is up) in the spelling dictionary.  A census word may carry a
    "/dict-tag" suffix; the tag is split off and handed to the checker as
    the word's alternate dictionary.

    Afterwards the progress bar is closed, IMC.needSpellCheck is reset,
    IMC.needMetadataSave absorbs IMC.wordlistsChanged and, when spelling
    highlights are switched on, the highlighting is refreshed.
    """
    speller_ready = IMC.spellCheck.isUp()
    nwords = IMC.wordCensus.size()
    if nwords <= 0:
        # Nothing to check; the census can be empty for a null document.
        return
    pqMsgs.startBar(nwords, "Checking spelling...")
    # AND-mask that switches the misspelt bit off and keeps the other
    # bits of the low byte as they are.
    clear_misspelt = 0xff - IMC.WordMisspelt
    for i in range(IMC.wordCensus.size()):
        qword, _count, flags = IMC.wordCensus.get(i)
        flags &= clear_misspelt
        # Some words have a /dict-tag; split it out as a string or "".
        word, _slash, dict_tag = unicode(qword).partition("/")
        if IMC.goodWordList.check(word):
            misspelt = False
        elif IMC.badWordList.check(word):
            misspelt = True
        else:
            # Consult the checker only when it is available, using the
            # word's optional dictionary tag.
            misspelt = speller_ready and not IMC.spellCheck.check(word, dict_tag)
        if misspelt:
            flags |= IMC.WordMisspelt
        IMC.wordCensus.setflags(i, flags)
        if (i & 0x1f) == 0:
            # Advance the progress bar on every 32nd word.
            pqMsgs.rollBar(i)
    pqMsgs.endBar()
    IMC.needMetadataSave |= IMC.wordlistsChanged
    IMC.needSpellCheck = False
    if IMC.spellingHiliteSwitch:
        # Force a refresh of the spelling underlines.
        self.setHighlight(True)
def doSpellcheck(self):
    """Recompute the WordMisspelt flag for every entry of IMC.wordCensus.

    Each census word is looked up, in this order, in the good-words list,
    the bad-words list and finally (only if the spell checker reports it
    is up) in the spelling dictionary.  A census word may carry a
    "/dict-tag" suffix; the tag is split off and handed to the checker as
    the word's alternate dictionary.

    Afterwards the progress bar is closed, IMC.needSpellCheck is reset,
    IMC.needMetadataSave absorbs IMC.wordlistsChanged and, when spelling
    highlights are switched on, the highlighting is refreshed.
    """
    speller_ready = IMC.spellCheck.isUp()
    nwords = IMC.wordCensus.size()
    if nwords <= 0:
        # Nothing to check; the census can be empty for a null document.
        return
    pqMsgs.startBar(nwords, "Checking spelling...")
    # AND-mask that switches the misspelt bit off and keeps the other
    # bits of the low byte as they are.
    clear_misspelt = 0xff - IMC.WordMisspelt
    for i in range(IMC.wordCensus.size()):
        qword, _count, flags = IMC.wordCensus.get(i)
        flags &= clear_misspelt
        # Some words have a /dict-tag; split it out as a string or "".
        word, _slash, dict_tag = unicode(qword).partition("/")
        if IMC.goodWordList.check(word):
            misspelt = False
        elif IMC.badWordList.check(word):
            misspelt = True
        else:
            # Consult the checker only when it is available, using the
            # word's optional dictionary tag.
            misspelt = speller_ready and not IMC.spellCheck.check(word, dict_tag)
        if misspelt:
            flags |= IMC.WordMisspelt
        IMC.wordCensus.setflags(i, flags)
        if (i & 0x1f) == 0:
            # Advance the progress bar on every 32nd word.
            pqMsgs.rollBar(i)
    pqMsgs.endBar()
    IMC.needMetadataSave |= IMC.wordlistsChanged
    IMC.needSpellCheck = False
    if IMC.spellingHiliteSwitch:
        # Force a refresh of the spelling underlines.
        self.setHighlight(True)
def doCensus(self, page=False):
    """Rebuild the word and character censuses from the document text.

    Clears IMC.wordCensus and IMC.charCensus, then walks every text block
    of the document.  For an HTML book (IMC.bookType starts with "htm")
    whose first line starts with "<!DOCTYPE", the scan begins at the first
    line that starts with "<body".

    page -- when true, every page-separator line (a line fully matched by
            reLineSep) is also recorded in IMC.pageTable together with a
            QTextCursor positioned at the start of that line.  When false
            such lines are simply ignored.

    Lines that are not page separators have each character counted and
    are tokenized with reTokens.  Tokens starting with "<" are treated as
    HTML: an open tag carrying a lang= attribute selects an alternate
    dictionary, and a close tag with the same tag name cancels it.  Any
    other token is counted as a word, with "/dict" appended while an
    alternate dictionary is in effect.

    On exit IMC.needSpellCheck is set, IMC.staleCensus is zeroed and
    IMC.needMetadataSave absorbs IMC.wordlistsChanged.
    """
    # Clear the current census values.
    IMC.wordCensus.clear()
    IMC.charCensus.clear()
    # Count chars in a local dict for speed; copied to IMC.charCensus below.
    local_char_census = defaultdict(int)
    # Name of the current alternate dictionary ("/en_GB"); isEmpty when none.
    alt_dict = QString()
    # Name of the tag whose lang= attribute selected alt_dict.
    alt_dict_tag = QString()
    doc = self.document()
    # Start the progress bar based on the number of lines in the document.
    pqMsgs.startBar(doc.blockCount(), "Counting words and chars...")
    # Find the first text block of interest, skipping an HTML header.
    qtb = doc.begin()
    if IMC.bookType.startsWith(QString(u"htm")) \
            and qtb.text().startsWith(QString(u"<!DOCTYPE")):
        while (qtb != doc.end()) \
                and (not qtb.text().startsWith(QString(u"<body"))):
            qtb = qtb.next()
    # Scan all lines of the document to the end.
    while qtb != doc.end():
        qsLine = qtb.text()  # text of the line as a QString
        if reLineSep.exactMatch(qsLine):
            # This is a page separator line.
            if page:
                # Doing page seps: this is for Open with no .meta seen and
                # the page table has been cleared.  Store the page sep
                # data in the page table with a cursor to its start.
                qsfilenum = reLineSep.cap(1)  # xxx from "File: xxx.png"
                qsproofers = reLineSep.cap(2)  # \who\x\blah\etc
                # Proofer names can contain spaces; use en-space instead.
                qsproofers.replace(QChar(" "), QChar(0x2002))
                tcursor = QTextCursor(doc)
                tcursor.setPosition(qtb.position())
                IMC.pageTable.loadPsep(tcursor, qsfilenum, qsproofers)
            # else not doing pages, just ignore this psep line
        else:
            # Ordinary text line: count its characters in Python space.
            for c in unicode(qsLine):
                local_char_census[c] += 1
            # Then pull out the word-like tokens one after another.
            j = 0
            while True:
                j = reTokens.indexIn(qsLine, j)
                if j < 0:
                    break  # no more word-like units on this line
                qsWord = reTokens.cap(0)
                # Step past the token now, before anything is appended
                # to qsWord further down.
                j += qsWord.size()
                if qsWord.startsWith(qcLess):
                    # A captured HTML production.
                    if not reTokens.cap(2).isEmpty():
                        # HTML open tag, look for lang='dict'.
                        if 0 <= reLang.indexIn(reTokens.cap(3)):
                            # Found it: save tag and dictionary name.
                            alt_dict_tag = QString(reTokens.cap(2))
                            alt_dict = QString(reLang.cap(1))
                            alt_dict.prepend(u'/')  # make "/en_GB"
                        # else no lang= attribute
                    elif reTokens.cap(5) == alt_dict_tag:
                        # HTML close tag matching the open tag that set
                        # the alternate dictionary: stop using it.
                        alt_dict_tag = QString()
                        alt_dict = QString()
                    # else no alt dict in use, or tag didn't match
                else:
                    # Did not start with "<", process as a word.  The
                    # property flags are set with QString facilities
                    # rather than Python ones because python .isalnum
                    # fails for a hyphenated number like "1850-1910".
                    flag = 0
                    if 0 != qsWord.compare(qsWord.toLower()):
                        flag |= IMC.WordHasUpper
                    if 0 != qsWord.compare(qsWord.toUpper()):
                        flag |= IMC.WordHasLower
                    if qsWord.contains(qcHyphen):
                        flag |= IMC.WordHasHyphen
                    if qsWord.contains(qcApostrophe) \
                            or qsWord.contains(qcCurlyApostrophe):
                        flag |= IMC.WordHasApostrophe
                    if qsWord.contains(reDigit):
                        flag |= IMC.WordHasDigit
                    IMC.wordCensus.count(qsWord.append(alt_dict), flag)
        qtb = qtb.next()  # move on to the next block
        if 0 == (qtb.blockNumber() & 255):
            # Every 256th block: roll the bar and let the GUI breathe.
            pqMsgs.rollBar(qtb.blockNumber())
            QApplication.processEvents()
    pqMsgs.endBar()
    # The char counts were accumulated in local_char_census.  Read them
    # out in sorted order and store them in the IMC.charCensus list.
    for one_char in sorted(local_char_census):
        qc = QChar(ord(one_char))  # QChar gives us the category() method
        IMC.charCensus.append(QString(qc), local_char_census[one_char],
                              qc.category())
    IMC.needSpellCheck = True  # after a census this is true
    IMC.staleCensus = 0  # but this is no longer true
    IMC.needMetadataSave |= IMC.wordlistsChanged
def loadStarts(self):
    """Start the pqMsgs progress bar labelled "Loading HTML".

    The bar is started with the value 100 (presumably its maximum, as in
    the other startBar calls -- confirm against pqMsgs).
    """
    bar_size = 100
    bar_label = "Loading HTML"
    pqMsgs.startBar(bar_size, bar_label)
def loadStarts(self):
    """Start the pqMsgs progress bar labelled "Loading HTML".

    The bar is started with the value 100 (presumably its maximum, as in
    the other startBar calls -- confirm against pqMsgs).
    """
    bar_size = 100
    bar_label = "Loading HTML"
    pqMsgs.startBar(bar_size, bar_label)
def doCensus(self, page=False):
    """Rebuild the word and character censuses from the document text.

    Clears IMC.wordCensus and IMC.charCensus, then walks every text block
    of the document.  For an HTML book (IMC.bookType starts with "htm")
    whose first line starts with "<!DOCTYPE", the scan begins at the first
    line that starts with "<body".

    page -- when true, every page-separator line (a line fully matched by
            reLineSep) is also recorded in IMC.pageTable together with a
            QTextCursor positioned at the start of that line.  When false
            such lines are simply ignored.

    Lines that are not page separators have each character counted and
    are tokenized with reTokens.  Tokens starting with "<" are treated as
    HTML: an open tag carrying a lang= attribute selects an alternate
    dictionary, and a close tag with the same tag name cancels it.  Any
    other token is counted as a word, with "/dict" appended while an
    alternate dictionary is in effect.

    On exit IMC.needSpellCheck is set, IMC.staleCensus is zeroed and
    IMC.needMetadataSave absorbs IMC.wordlistsChanged.
    """
    # Clear the current census values.
    IMC.wordCensus.clear()
    IMC.charCensus.clear()
    # Count chars in a local dict for speed; copied to IMC.charCensus below.
    local_char_census = defaultdict(int)
    # Name of the current alternate dictionary ("/en_GB"); isEmpty when none.
    alt_dict = QString()
    # Name of the tag whose lang= attribute selected alt_dict.
    alt_dict_tag = QString()
    doc = self.document()
    # Start the progress bar based on the number of lines in the document.
    pqMsgs.startBar(doc.blockCount(), "Counting words and chars...")
    # Find the first text block of interest, skipping an HTML header.
    qtb = doc.begin()
    if IMC.bookType.startsWith(QString(u"htm")) \
            and qtb.text().startsWith(QString(u"<!DOCTYPE")):
        while (qtb != doc.end()) \
                and (not qtb.text().startsWith(QString(u"<body"))):
            qtb = qtb.next()
    # Scan all lines of the document to the end.
    while qtb != doc.end():
        qsLine = qtb.text()  # text of the line as a QString
        if reLineSep.exactMatch(qsLine):
            # This is a page separator line.
            if page:
                # Doing page seps: this is for Open with no .meta seen and
                # the page table has been cleared.  Store the page sep
                # data in the page table with a cursor to its start.
                qsfilenum = reLineSep.cap(1)  # xxx from "File: xxx.png"
                qsproofers = reLineSep.cap(2)  # \who\x\blah\etc
                # Proofer names can contain spaces; use en-space instead.
                qsproofers.replace(QChar(" "), QChar(0x2002))
                tcursor = QTextCursor(doc)
                tcursor.setPosition(qtb.position())
                IMC.pageTable.loadPsep(tcursor, qsfilenum, qsproofers)
            # else not doing pages, just ignore this psep line
        else:
            # Ordinary text line: count its characters in Python space.
            for c in unicode(qsLine):
                local_char_census[c] += 1
            # Then pull out the word-like tokens one after another.
            j = 0
            while True:
                j = reTokens.indexIn(qsLine, j)
                if j < 0:
                    break  # no more word-like units on this line
                qsWord = reTokens.cap(0)
                # Step past the token now, before anything is appended
                # to qsWord further down.
                j += qsWord.size()
                if qsWord.startsWith(qcLess):
                    # A captured HTML production.
                    if not reTokens.cap(2).isEmpty():
                        # HTML open tag, look for lang='dict'.
                        if 0 <= reLang.indexIn(reTokens.cap(3)):
                            # Found it: save tag and dictionary name.
                            alt_dict_tag = QString(reTokens.cap(2))
                            alt_dict = QString(reLang.cap(1))
                            alt_dict.prepend(u'/')  # make "/en_GB"
                        # else no lang= attribute
                    elif reTokens.cap(5) == alt_dict_tag:
                        # HTML close tag matching the open tag that set
                        # the alternate dictionary: stop using it.
                        alt_dict_tag = QString()
                        alt_dict = QString()
                    # else no alt dict in use, or tag didn't match
                else:
                    # Did not start with "<", process as a word.  The
                    # property flags are set with QString facilities
                    # rather than Python ones because python .isalnum
                    # fails for a hyphenated number like "1850-1910".
                    flag = 0
                    if 0 != qsWord.compare(qsWord.toLower()):
                        flag |= IMC.WordHasUpper
                    if 0 != qsWord.compare(qsWord.toUpper()):
                        flag |= IMC.WordHasLower
                    if qsWord.contains(qcHyphen):
                        flag |= IMC.WordHasHyphen
                    if qsWord.contains(qcApostrophe) \
                            or qsWord.contains(qcCurlyApostrophe):
                        flag |= IMC.WordHasApostrophe
                    if qsWord.contains(reDigit):
                        flag |= IMC.WordHasDigit
                    IMC.wordCensus.count(qsWord.append(alt_dict), flag)
        qtb = qtb.next()  # move on to the next block
        if 0 == (qtb.blockNumber() & 255):
            # Every 256th block: roll the bar and let the GUI breathe.
            pqMsgs.rollBar(qtb.blockNumber())
            QApplication.processEvents()
    pqMsgs.endBar()
    # The char counts were accumulated in local_char_census.  Read them
    # out in sorted order and store them in the IMC.charCensus list.
    for one_char in sorted(local_char_census):
        qc = QChar(ord(one_char))  # QChar gives us the category() method
        IMC.charCensus.append(QString(qc), local_char_census[one_char],
                              qc.category())
    IMC.needSpellCheck = True  # after a census this is true
    IMC.staleCensus = 0  # but this is no longer true
    IMC.needMetadataSave |= IMC.wordlistsChanged