def _getRecentColumnIdx(siteText, tableBeginIdx):
    # type: (str, int) -> int
    """Locate the "recent" column in a table's header row.

    The header cells are fetched with
    ``getNextRowCells(siteText, tableBeginIdx, "th")``; a cell counts as the
    recent column when its text contains ``~1000`` or ``~1,000``.

    :param siteText: text that contains the table.
    :param tableBeginIdx: offset in ``siteText`` handed to ``getNextRowCells``.
    :return: index of the leftmost matching header cell, or
        ``COLUMN_ID_NOT_FOUND`` when no header cell matches.
    """
    headerCells = getNextRowCells(siteText, tableBeginIdx, "th")
    # Every cell is inspected; only the lowest matching index is reported.
    matchingIdxs = [
        idx
        for idx, cell in enumerate(headerCells)
        if "~1,000" in cell or "~1000" in cell
    ]
    if not matchingIdxs:
        return COLUMN_ID_NOT_FOUND
    return matchingIdxs[0]
def _getOverallAndRecentColumnIdx(siteText, tableBeginIdx):
    """Locate the "overall" and "recent" columns in a table's header row.

    The header cells are fetched with
    ``getNextRowCells(siteText, tableBeginIdx, "th")``. A cell is the overall
    column when it contains ``Общий`` or ``Overall``; it is the recent column
    when it contains ``~1000`` or ``~1,000``.

    :param siteText: text that contains the table.
    :param tableBeginIdx: offset in ``siteText`` handed to ``getNextRowCells``.
    :return: ``(overallColumnIdx, recentColumnIdx)``. Each is the index of the
        leftmost matching header cell; ``recentColumnIdx`` is
        ``COLUMN_ID_NOT_FOUND`` when no recent column exists.
    :raises AssertionError: when no overall column is found.
    """
    ths = getNextRowCells(siteText, tableBeginIdx, "th")
    overallColumnIdx = COLUMN_ID_NOT_FOUND
    recentColumnIdx = COLUMN_ID_NOT_FOUND
    # Walk right-to-left so that, when several headers match, the leftmost
    # one is the last to be assigned and therefore wins.
    for i, th in reversed(tuple(enumerate(ths))):
        if "Общий" in th or "Overall" in th:
            overallColumnIdx = i
        if "~1000" in th or "~1,000" in th:
            recentColumnIdx = i
    if overallColumnIdx == COLUMN_ID_NOT_FOUND:
        # Raised explicitly (not via ``assert``) so the check survives
        # ``python -O``; the exception type is unchanged for callers.
        # ``(ths,)`` keeps the formatting valid even if ``ths`` is a tuple.
        raise AssertionError("No overall column found in %s" % (ths,))
    return overallColumnIdx, recentColumnIdx
def _getTrsWithData(siteText, tableBeginIdx):
    """Collect the cells of every row that follows a table's header row.

    Starting at the first ``</tr>`` found at or after ``tableBeginIdx``,
    ``getNextRowCells(siteText, offset)`` is called once for that offset and
    then once for each subsequent ``<tr`` occurrence that lies before the next
    ``</table>``.

    :param siteText: text that contains the table.
    :param tableBeginIdx: offset in ``siteText`` where the search starts.
    :return: list with one ``getNextRowCells`` result per visited offset, in
        document order; empty when no ``</tr>`` or no ``</table>`` is found.
    :raises AssertionError: when the loop body runs more than
        ``MAX_ITERATIONS`` times (only while assertions are enabled).
    """
    headerCloseIdx = siteText.find("</tr>", tableBeginIdx)
    tableCloseIdx = siteText.find("</table>", headerCloseIdx)

    rows = []
    passes = 0
    # NOTE(review): the first call is made at the header's "</tr>" offset,
    # not at a "<tr" -- confirm getNextRowCells expects that.
    rowIdx = headerCloseIdx
    # str.find yields -1 on a miss, so a non-negative offset means "found".
    while 0 <= rowIdx < tableCloseIdx:
        rows.append(getNextRowCells(siteText, rowIdx))
        rowIdx = siteText.find("<tr", rowIdx + 1)
        # Sanity cap on the number of passes through the table.
        assert passes < MAX_ITERATIONS, "Too many iterations: %s" % passes
        passes += 1
    return rows