def save_path(count, xpathString, found):
    """Persist per-xpath hit/miss statistics to the stats file.

    count: when True, update the counters; when False, do nothing.
    xpathString: the xpath expression the statistics belong to.
    found: True increments the "found" counter, False the "not found" one.

    The previous counters are read from rss_config.PATH_FILENAME_STAT
    (line format: "<xpath>;<found>;<notfound>;"), incremented, and written
    back via replace_line_in_file.
    """
    if count is True:
        countFound = 0
        countNotfound = 0
        try:
            with open(rss_config.PATH_FILENAME_STAT, "r") as file:
                for line in file:
                    if line.startswith(xpathString):
                        splittedLine = line.split(";")
                        # fix: int(...) instead of the C-style "(int)(...)" casts
                        countFound = int(splittedLine[1])
                        countNotfound = int(splittedLine[2])
        except Exception:
            # stats file missing/unreadable - create an empty one so the
            # replace_line_in_file call below has something to work with
            with open(rss_config.PATH_FILENAME_STAT, 'a') as file:
                file.write("")

        # fix: "elif not found" was redundant - a plain else covers it
        if found:
            countFound += 1
        else:
            countNotfound += 1

        xpathStringWithStats = xpathString + ";" + str(countFound) + ";" + str(
            countNotfound) + ";"
        replace_line_in_file(rss_config.PATH_FILENAME_STAT,
                             xpathString + ";", xpathStringWithStats)
        rss_print.print_debug(__file__, xpathStringWithStats, 4)
def read_file_string_from_disk(osCacheFolderDomainArticle):
    """Return the decoded contents of a cached article file.

    Tries to read the path as a gzip archive first and falls back to a
    plain binary read. Returns "" when the file is missing, unreadable,
    or cannot be decoded with rss_config.CACHE_FILE_ENCODING.
    """
    if not os.path.isfile(osCacheFolderDomainArticle):
        rss_print.print_debug(
            __file__,
            "kettal pole lugemiseks faili: " + osCacheFolderDomainArticle, 2)
        return ""

    # gzip first; if that fails (e.g. not a gzip file), try the plain file
    rawBytes = None
    for opener in (gzip.open, open):
        try:
            with opener(osCacheFolderDomainArticle, 'rb') as fh:
                rawBytes = fh.read()
            break
        except Exception as e:
            rss_print.print_debug(__file__, "exception = '" + str(e) + "'", 1)
    if rawBytes is None:
        return ""

    try:
        decodedString = rawBytes.decode(rss_config.CACHE_FILE_ENCODING)
    except Exception as e:
        rss_print.print_debug(
            __file__,
            "kettalt loetud faili dekodeerimine utf-8 vorminguga EBAõnnestus",
            0)
        rss_print.print_debug(__file__, "exception = '" + str(e) + "'", 1)
        return ""
    return decodedString
def html_tree_from_document_string(htmlString, caller):
    """Build a root html document tree from a string.

    htmlString: raw html text (possibly carrying an xml declaration).
    caller: name of the calling site, used only for debug output.
    Returns an lxml document tree. On a parse failure it now falls back
    to an empty document instead of crashing (see bug-fix note below).
    """
    if caller:
        rss_print.print_debug(__file__, "asume looma html objekti kutsujale: " + caller, 4)

    htmlString = htmlString.strip()
    if not htmlString:
        rss_print.print_debug(
            __file__, "puudub html stringi sisu kutsujal: '" + caller + "'", 0)
        htmlString = "<html><head></head></html>"

    if htmlString.startswith('<?xml version="1.0" encoding="utf-8"?>'):
        # lxml rejects unicode input that carries an encoding declaration:
        # "Unicode strings with encoding declaration are not supported.
        #  Please use bytes input or XML fragments without declaration."
        htmlStringUtf = htmlString.encode('utf-8')
        htmlTree = html.document_fromstring(htmlStringUtf)
    else:
        try:
            htmlTree = html.document_fromstring(htmlString)
        except Exception as e:
            rss_print.print_debug(
                __file__,
                "ei õnnestunud luua mitteutf-8 html objekti kutsujal: '" + caller + "'", 0)
            rss_print.print_debug(__file__, "exception = '" + str(e) + "'", 1)
            rss_print.print_debug(
                __file__,
                "ei õnnestunud luua mitteutf-8 html objekti stringist: '" + htmlString + "'", 3)
            # bug fix: htmlTree was left unbound here, so the return below
            # raised UnboundLocalError - fall back to an empty document
            htmlTree = html.document_fromstring("<html><head></head></html>")
    return htmlTree
def html_string_children(htmlString):
    """Strip the outermost tag pair from an html string and return the inner html.

    Returns "" for non-string input, and the input unchanged when it does
    not look like a single tag-wrapped element (no leading/trailing tag,
    no child element, or too short to contain one).
    """
    if not isinstance(htmlString, str):
        rss_print.print_debug(__file__, "sisend pole string, tagastame tühjuse", 0)
        return ""
    # bug fix: an empty string passed the isinstance guard and then
    # crashed on htmlString[0] below
    if not htmlString:
        rss_print.print_debug(__file__, "sisend on tühi string, tagastame sisendi", 0)
        return htmlString
    if htmlString[0] != "<":
        rss_print.print_debug(
            __file__, "katkestame, algus pole tag: '" + htmlString + "'", 4)
        return htmlString
    if htmlString[-1] != ">":
        rss_print.print_debug(
            __file__, "katkestame, lõpp pole tag: '" + htmlString + "'", 4)
        return htmlString
    if "</" not in htmlString:
        rss_print.print_debug(
            __file__, "sisendis pole child elementi, tagastame sisendi", 0)
        return htmlString
    if len(htmlString) <= 7:  # <b></b>
        rss_print.print_debug(
            __file__, "liiga lühike, tagastame sisendi: '" + htmlString + "'", 0)
        return htmlString

    # cut out the part between the first ">" and the last "</"
    tagOpening = htmlString.find(">") + 1
    tagClosing = htmlString.rfind("</")
    htmlString = htmlString[tagOpening:tagClosing]
    htmlString = htmlString.strip()
    return htmlString
def html_tree_from_string(htmlString, caller):
    """Build an html fragment tree (no root wrapping) from a string.

    htmlString: raw html text.
    caller: name of the calling site, used only for debug output.
    Returns an lxml element. On a parse failure it now falls back to an
    empty <html> element instead of crashing (see bug-fix note below).
    """
    if caller:
        rss_print.print_debug(__file__, "asume looma html objekti kutsujale: " + caller, 4)
    htmlString = htmlString.strip()
    if not htmlString:
        rss_print.print_debug(
            __file__, "puudub html stringi sisu kutsujal: '" + caller + "'", 0)
    try:
        htmlTree = html.fromstring(htmlString)
    except Exception as e:
        rss_print.print_debug(
            __file__,
            "ei õnnestunud luua mitteutf-8 html objekti kutsujal: '" + caller + "'", 0)
        rss_print.print_debug(__file__, "exception = '" + str(e) + "'", 1)
        rss_print.print_debug(
            __file__,
            "ei õnnestunud luua mitteutf-8 html objekti stringist: '" + htmlString + "'", 3)
        # bug fix: htmlTree was left unbound here, so the return below raised
        # UnboundLocalError (note that html.fromstring("") also raises, so
        # the empty-input case always ended up in this branch)
        htmlTree = html.fromstring("<html></html>")
    return htmlTree
def raw_to_float(rawDateTimeText, rawDateTimeSyntax):
    """Parse a textual timestamp into a float epoch time.

    rawDateTimeText = time as text, e.g.: "23. 11 2007 /"
    rawDateTimeSyntax = its format, e.g. "%d. %m %Y /"
    Syntax reference:
    https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior
    Returns 0 on empty input or on a parse failure.
    """
    curDateTimeText = rawDateTimeText.strip()
    if not curDateTimeText:
        rss_print.print_debug(__file__, "curDateTimeText = '" + curDateTimeText + "' tühi, tagastame nulli", 0)
        return 0
    try:
        parsedFields = list(time.strptime(curDateTimeText, rawDateTimeSyntax))
        if parsedFields[0] == 1900:
            # year was missing from the input: strptime defaults it to 1900;
            # pick this year, or last year when the month lies in the future
            currentYear = int(time.strftime('%Y'))
            if parsedFields[1] > int(time.strftime('%m')):
                rss_print.print_debug(__file__, "curDateTimeText = '" + curDateTimeText + "', muudame puuduva aasta eelmiseks aastaks", 0)
                parsedFields[0] = currentYear - 1
            else:
                rss_print.print_debug(__file__, "curDateTimeText = '" + curDateTimeText + "', muudame puuduva aasta praeguseks aastaks", 0)
                parsedFields[0] = currentYear
        return time.mktime(tuple(parsedFields))
    except Exception as e:
        rss_print.print_debug(__file__, "curDateTimeText = '" + curDateTimeText + "' dekodeerimine rawDateTimeSyntax = '" + rawDateTimeSyntax + "' EBAõnnestus, tagastame nulli", 0)
        rss_print.print_debug(__file__, "exception = '" + str(e) + "'", 1)
        return 0
def dict_add_dict(articleDataDictMain, articleDataDictNew):
    """Concatenate, key by key, the values of articleDataDictNew onto
    articleDataDictMain and return the (mutated) main dict."""
    rss_print.print_debug(__file__, "ühendame dictCur ja dictNew", 4)
    for curKey in articleDataDictMain:
        # rebind with a fresh concatenation (do not extend in place)
        articleDataDictMain[curKey] = (articleDataDictMain[curKey]
                                       + articleDataDictNew[curKey])
    return articleDataDictMain
def str_domain_url(domain, articleUrl):
    """Join a domain and a (possibly relative) article URL.

    Bug fix: the original used articleUrl.lstrip('./'), but str.lstrip
    takes a character SET, not a prefix - it stripped any leading run of
    '.' and '/' characters and therefore also ate meaningful leading dots
    (e.g. ".well-known/..." or "..foo"). Now only literal "./" prefixes
    and leading slashes are removed.
    """
    while articleUrl.startswith('./'):
        articleUrl = articleUrl[2:]
    articleUrl = domain.rstrip('/') + '/' + articleUrl.lstrip('/')
    rss_print.print_debug(
        __file__, "pärast domeeni lisamist lingile: " + str(articleUrl), 4)
    return articleUrl
def float_to_datetime_rfc2822(floatDateTime):
    """Convert a float epoch timestamp into an RFC 2822 date string.

    Syntax reference:
    https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior
    """
    rss_print.print_debug(__file__, "floatDateTime = '" + str(floatDateTime) + "'", 5)
    # keyword arguments for clarity; same values as the original positional call
    datetimeRFC2822 = formatdate(floatDateTime, localtime=True, usegmt=True)
    rss_print.print_debug(__file__, "datetimeRFC2822 = '" + str(datetimeRFC2822) + "'", 4)
    return datetimeRFC2822
def html_page_cleanup(htmlString):
    """Strip styles, scripts, comments, tracker URL parameters and excess
    whitespace from an html page string; normalize <br> tags so xpath
    paths stay stable between fetches."""
    if not htmlString:
        rss_print.print_debug(__file__, "katkestame, tühi sisend: '" + htmlString + "'", 0)
        return htmlString
    rss_print.print_debug(__file__, "puhastame html stringi üleliigsest jamast", 3)

    # remove styles
    htmlString = re.sub(r"<style[\s\S]*?<\/style>", "", htmlString)
    # remove comments
    htmlString = re.sub(r"<!--[\s\S]*?-->", "", htmlString)
    # remove scripts from links
    htmlString = re.sub(r' onclick=(\")[\s\S]*?(\")', "", htmlString)
    htmlString = re.sub(r" onclick=(')[\s\S]*?(')", "", htmlString)
    # remove scripts
    htmlString = re.sub(r"<script[\s\S]*?<\/script>", "", htmlString)

    # remove trackers from links
    # bug fix: the original replace("&", "&") was a no-op; the intent is to
    # decode the entity so the tracker regexes below can match a plain "&"
    htmlString = htmlString.replace("&amp;", "&")
    htmlString = re.sub(r'(&|\?)_[0-9A-Za-z_-]*', "", htmlString)  # delfi
    htmlString = re.sub(r'=2\.[0-9.-]*', "", htmlString)
    htmlString = re.sub(
        r'_ga=[0-9.-]*', "",
        htmlString)  # _ga=2.22935807.513285745.1595741966-250801514.1594127878
    htmlString = re.sub(r'fbclid=[0-9A-Za-z-_]*', "", htmlString)
    htmlString = re.sub(r'gclid=[0-9A-Za-z-_]*', "", htmlString)
    htmlString = re.sub(r'refid=[0-9A-Za-z=.%_-]*', "", htmlString)
    htmlString = re.sub(r'utm_source=[0-9A-Za-z-_&=.]*', "", htmlString)
    # fix link without trackers
    htmlString = htmlString.replace("?&", "?")

    # drop whitespace runs that directly precede a tag
    htmlString = re.sub(r"\s\s+(?=<)", "", htmlString)
    # drop literal "\n"/"\r"/"\t" escape sequences left over from scraping
    htmlString = htmlString.replace('\\n', " ")
    htmlString = htmlString.replace('\\r', " ")
    htmlString = htmlString.replace('\\t', " ")

    # br - must be normalized, otherwise xpath cannot resolve its paths
    htmlString = htmlString.replace("<br/>", "<br>")
    htmlString = htmlString.replace(" <br>", "<br>")
    htmlString = htmlString.replace("<br> ", "<br>")
    # NOTE: collapses only one nesting level of doubled <br> per call
    htmlString = htmlString.replace("<br><br>", "<br>")

    # collapse all remaining whitespace runs into single spaces
    htmlString = " ".join(htmlString.split())
    return htmlString
def get_service_log_path(articleUrl):
    """Pick a webdriver log destination based on the configured debug level."""
    if rss_config.PRINT_MESSAGE_LEVEL > 0:
        # debugging: keep a per-article log file under /tmp
        serviceLogPath = "/tmp/webdriver_" + articleUrl.replace("/", "|") + ".log"
        rss_print.print_debug(__file__, "logime asukohta: " + serviceLogPath, 0)
        return serviceLogPath
    # quiet mode: force headless firefox and discard the log
    os.environ['MOZ_HEADLESS'] = '1'
    return os.devnull
def article_posts_range(articlePosts, maxArticlePosts):
    """Return the index range covering at most the last maxArticlePosts
    entries of articlePosts (counting back from the newest)."""
    totalPosts = len(articlePosts)
    rss_print.print_debug(
        __file__,
        "xpath parsimisel leitud artikli poste: " + str(totalPosts), 2)
    firstIndex = max(0, totalPosts - maxArticlePosts)
    return range(firstIndex, totalPosts)
def get_url_string_from_disk(articleUrl):
    """Look up a cached copy of articleUrl under article_cache/<domain>/."""
    rss_print.print_debug(__file__, "kettalt proovitav leht: " + articleUrl, 3)
    scriptFolder = os.path.dirname(os.path.abspath(__file__))
    # cache layout: <script dir>/article_cache/<domain>/<url with '|' for '/'>
    domainFolder = articleUrl.split('/')[2]
    cacheFileName = articleUrl.replace('/', '|')
    cacheFilePath = (scriptFolder + '/' + 'article_cache' + '/'
                     + domainFolder + '/' + cacheFileName)
    return read_file_string_from_disk(cacheFilePath)
def get_url_from_internet(curDomainLong, stamp, seleniumPath="", seleniumProfile=False):
    """Fetch a page - via selenium for known javascript-heavy domains,
    plain requests otherwise - clean it up and cache the result on disk.

    curDomainLong: the full URL to fetch.
    stamp: optional cache-key suffix ("<url>#<stamp>").
    seleniumPath: caller-supplied wait-xpath; overridden for known domains.
    seleniumProfile: passed through to rss_selenium when selenium is used.
    """
    rss_print.print_debug(__file__, "algatame internetipäringu: " + curDomainLong, 2)

    # domains that need a real browser: (substring, pre-clicks, wait-xpath);
    # first match wins, mirroring the original elif chain
    seleniumDomains = (
        ("auto24.ee", [], '//div[@class="section messages"]'),
        ("err.ee/uudised", [], '//div[@class="ng-scope"]'),
        ("kultuuriaken.tartu.ee/et/syndmused",
         ['//input[@name="starting_time" and @value="2"]',
          '//a[@data-view="list-view"]'],
         '//div[@class="col-12"]/h1[@class="py-3"]'),
        ("levila.ee", [], '//a[@class="post-item-meta__link"]'),
        ("mixcloud.com", [], '//main/div[@class="content"]/div/div/div'),
        ("sky.ee", [], '//div[@class="box-news-block-title "]'),
        ("treraadio.ee", [], '//a[@id="scrollBtn"]'),
        ("tv3.ee", [], '//a[@class="sc-1kym84g-0 dxESGf c950ig-0 eUNpOJ"]'),
        ("twitter.com", [], '//article[@role="article"]'),
    )
    seleniumClicks = []
    for domainPart, clicks, waitPath in seleniumDomains:
        if domainPart in curDomainLong:
            seleniumClicks = clicks
            seleniumPath = waitPath
            break

    # perform the request
    if seleniumPath:
        htmlPageString = rss_selenium.get_article_string(
            curDomainLong, seleniumClicks, seleniumPath, seleniumProfile)
    else:
        htmlPageString = rss_requests.get_article_string(
            curDomainLong, rss_config.HEADERS)

    # clean the page of excess junk
    htmlPageString = parsers_html.html_page_cleanup(htmlPageString)

    # always cache every network request result on disk
    cacheKey = curDomainLong + "#" + stamp if stamp else curDomainLong
    rss_disk.write_file_string_to_cache(cacheKey, htmlPageString)
    return htmlPageString
def add_value_to_time_string(curArtPubDate, curDateFormat, offsetDays=0):
    """Prepend a formatted (optionally day-offset) date to a time string.

    @curArtPubDate = e.g.: 03.01
    @curDateFormat = part prepended to the front, e.g.: 2019.
    @offsetDays = 0 today, -1 yesterday
    """
    offsetDelta = datetime_offset_from_format(offsetDays)
    stampMoment = datetime.now() + offsetDelta
    curArtPubDate = stampMoment.strftime(curDateFormat) + curArtPubDate
    rss_print.print_debug(__file__, "lisasime tänasele kellaajale kuupäeva: " + curArtPubDate, 3)
    return curArtPubDate
def raw_to_datetime(rawDateTimeText, rawDateTimeSyntax):
    """Parse a textual timestamp into an RFC 2822 date string.

    rawDateTimeText = time as text, e.g.: "23. 11 2007 /"
    rawDateTimeSyntax = its format, e.g. "%d. %m %Y /"
    Syntax reference:
    https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior
    """
    curDateTimeText = rawDateTimeText.strip()
    # scraped text may still carry literal "\t" / "\r\n" escape sequences
    curDateTimeText = parsers_common.str_lchop(curDateTimeText, "\\t")
    curDateTimeText = parsers_common.str_rchop(curDateTimeText, "\\r\\n")

    if curDateTimeText:
        rss_print.print_debug(__file__, "curDateTimeText = '" + curDateTimeText + "'", 5)
    else:
        rss_print.print_debug(__file__, "tühi ajasisend: curDateTimeText = '" + curDateTimeText + "'", 0)

    if rawDateTimeSyntax:
        rss_print.print_debug(__file__, "rawDateTimeSyntax = '" + rawDateTimeSyntax + "'", 5)
    else:
        rss_print.print_debug(__file__, "tühi ajasisend: rawDateTimeSyntax = '" + rawDateTimeSyntax + "'", 0)

    datetimeFloat = raw_to_float(curDateTimeText, rawDateTimeSyntax)
    return float_to_datetime_rfc2822(datetimeFloat)
def list_del_elem_if_set(inpList, inpIndex):
    """Delete inpList[inpIndex] when that index exists; log either way
    and return the (possibly shortened) list."""
    listLen = len(inpList)
    elemNr = inpIndex + 1  # 1-based, for human-readable log output
    if listLen < elemNr:
        rss_print.print_debug(
            __file__, "listi pikkus on: " + str(listLen) +
            ", ei eemaldand listi elementi nr: " + str(elemNr), 4)
    else:
        rss_print.print_debug(
            __file__, "listi pikkus on: " + str(listLen) +
            ", eemaldasime listi elemendi nr: " + str(elemNr), 4)
        del inpList[inpIndex]
    return inpList
def replace_line_in_file(inpfile, searchExp, replaceExp):
    """Replace every line of inpfile starting with searchExp by replaceExp.

    Uses fileinput's inplace mode: while iterating, sys.stdout is
    redirected into the file, so every write below rewrites the file
    line by line. When no line matched, replaceExp is appended instead.
    """
    found = False
    for line in fileinput.input(inpfile, inplace=1):
        if line.startswith(searchExp):
            found = True
            line = replaceExp + "\n"
        # goes into inpfile, not the console (stdout is redirected here)
        sys.stdout.write(line)
    if not found:
        rss_print.print_debug(__file__, "lisame lõppu: " + replaceExp, 1)
        with open(inpfile, 'a') as file:
            file.write(replaceExp + "\n")
def replace_string_with_timeformat(inpString, stringToReplace, dateTimeformat, offsetDays=0):
    """Replace a marker word in a string with a formatted (day-offset) date.

    Example: inpString="eile, 23:34", stringToReplace="eile",
    dateTimeformat="%d %m %Y", offsetDays=-1 -> "24 05 2020, 23:34"
    """
    if stringToReplace not in inpString:
        return inpString
    offsetDelta = datetime_offset_from_format(offsetDays)
    formattedDate = (datetime.now() + offsetDelta).strftime(dateTimeformat)
    inpString = inpString.replace(stringToReplace, str(formattedDate))
    rss_print.print_debug(__file__, "asendasime stringis sõna ajaga: '" + stringToReplace + "' -> " + inpString, 3)
    return inpString
def xpath_to_single(elementStrings, elementsLen, xpathString, parent):
    """Collapse the xpath match list into one "<br>"-joined string.

    elementStrings: raw xpath results (strings or lxml elements).
    elementsLen: number of entries to consume from elementStrings.
    xpathString: the xpath expression, used only for debug output.
    parent: True when the xpath was declared to select a parent node;
            controls the severity of the parent-count warnings below.
    Returns the concatenated, stripped values separated by "<br>".
    """
    element = ""
    # has to be built this way: on an unexpected multiple match the values
    # would otherwise run together without a separator
    for i in range(elementsLen):
        elem = elementStrings[i]
        if not isinstance(elem, str):
            # lxml element -> serialize to html text first
            elem = parsers_html.html_to_string(elem, prettyPrint=False)
            rss_print.print_debug(
                __file__, "'" + xpathString + "' väärtus[" + str(i) +
                "] polnud string, stringimise järel: " + elem, 4)
        countParentNodes = parsers_html.html_string_count_parent_nodes(
            elem, "xpath_to_single")
        if not countParentNodes:
            # no wrapping parent found, yet the xpath claimed to select one
            if parent is True:
                rss_print.print_debug(
                    __file__, "'" + xpathString + "' väärtus[" + str(i) +
                    "] on valestimääratud parent?: " + elem, 0)
        elif countParentNodes == 1:
            # exactly one wrapper: unwrap it
            elem = parsers_html.html_remove_single_parents(elem)
        elif countParentNodes > 1:
            # multiple wrappers: warn, severity depends on the parent flag
            if parent is True:
                rss_print.print_debug(
                    __file__, "'" + xpathString + "' väärtus[" + str(i) +
                    "] on valestimääratud parent? " + elem, 0)
            else:
                rss_print.print_debug(
                    __file__, "'" + xpathString + "' väärtus[" + str(i) +
                    "] on hoiatatud parent: " + elem, 3)
        elem = elem.strip()
        if elem:
            # join non-empty values with "<br>"
            if element:
                element += "<br>"
            element += elem
    rss_print.print_debug(__file__, "'" + xpathString + "' väljund: '" + element + "'", 4)
    return element
def html_change_short_urls(htmlPageString, curDomainShort):
    """Rewrite protocol-relative and site-relative src/href urls to absolute ones."""
    # order matters within each attribute: "//" before "./" before "/"
    for attrName in ("src", "href"):
        htmlPageString = htmlPageString.replace(attrName + '="//', attrName + '="http://')
        htmlPageString = htmlPageString.replace(attrName + '="./', attrName + '="' + curDomainShort + '/')
        htmlPageString = htmlPageString.replace(attrName + '="/', attrName + '="' + curDomainShort + '/')
    rss_print.print_debug(__file__, "html string: " + htmlPageString, 5)
    return htmlPageString
def article_urls_range(articleUrls):
    """Return the full index range of articleUrls, logging loudly when empty."""
    articleUrlsLen = len(articleUrls)
    # an empty result usually means a broken xpath -> log at level 1
    debugLevel = 1 if articleUrlsLen == 0 else 3
    rss_print.print_debug(
        __file__,
        "xpath parsimisel leitud artikleid: " + str(articleUrlsLen), debugLevel)
    return range(articleUrlsLen)
def str_lchop(curString, stripString):
    """Remove every leading occurrence of stripString from curString."""
    if not curString:
        rss_print.print_debug(
            __file__,
            "sisend tühi, katkestame: curString = '" + str(curString) + "'", 3)
        return curString
    if not stripString:
        rss_print.print_debug(
            __file__,
            "sisend tühi, katkestame: stripString = '" + str(stripString) + "'", 0)
        return curString

    # peel the prefix off repeatedly until it no longer matches
    while curString.startswith(stripString):
        curString = curString[len(stripString):]
    return curString
def raw_to_datetime_guess_missing(inpArtPubDate, lastArtPubDate, dateStringPrefix, dateStringMain, daysToOffset):
    """Parse a date whose prefix part is missing, retrying with a day
    offset when a time jump against the previous article is detected.

    The missing prefix (e.g. the date) is synthesized from "today"; when
    the result does not increase relative to lastArtPubDate, the prefix
    is rebuilt daysToOffset days back and the value parsed again.
    """

    def buildDatetime(offsetDays):
        # synthesize the missing prefix, then parse the completed string
        withPrefix = add_value_to_time_string(inpArtPubDate, dateStringPrefix, offsetDays)
        return raw_to_datetime(withPrefix, dateStringPrefix + dateStringMain)

    curArtPubDate = buildDatetime(0)
    if lastArtPubDate and not increasing_datetime_rfc2822(curArtPubDate, lastArtPubDate):
        rss_print.print_debug(__file__, "uudise päev: täna " + str(curArtPubDate) + " ja eile " + str(lastArtPubDate), 3)
        rss_print.print_debug(__file__, "esineb ajahüpe, peame muutma tambovi lisamise offsetti", 3)
        curArtPubDate = buildDatetime(daysToOffset)
        rss_print.print_debug(__file__, "uudise eelmine päev: " + str(lastArtPubDate), 3)
        rss_print.print_debug(__file__, "uudise praegune päev muutus: " + inpArtPubDate + " -> " + str(curArtPubDate), 2)
    else:
        rss_print.print_debug(__file__, "uudise päev: täna " + str(curArtPubDate) + " ja eile " + str(lastArtPubDate), 4)
    return curArtPubDate
def xpath_to_list(elementStrings, elementsLen, xpathString, parent):
    """Normalize every xpath match in place and return the list.

    elementStrings: raw xpath results (strings or lxml elements); mutated
                    in place so every entry becomes a stripped string.
    elementsLen: unused here - kept for signature parity with
                 xpath_to_single (TODO confirm with callers).
    xpathString: the xpath expression, used only for debug output.
    parent: True when the xpath was declared to select a parent node;
            controls the severity of the parent-count warnings below.
    """
    for i, elem in enumerate(elementStrings):
        if not isinstance(elem, str):
            # lxml element -> serialize to html text first
            elem = parsers_html.html_to_string(elem, prettyPrint=False)
            rss_print.print_debug(
                __file__, "'" + xpathString + "' väärtus[" + str(i) +
                "] polnud string, stringimise järel: " + elem, 4)
        countParentNodes = parsers_html.html_string_count_parent_nodes(
            elem, "xpath_to_list")
        if not countParentNodes:
            # no wrapping parent found, yet the xpath claimed to select one
            if parent is True:
                rss_print.print_debug(
                    __file__, "'" + xpathString + "' väärtus[" + str(i) +
                    "] on valestimääratud parent?: " + elem, 0)
        elif countParentNodes == 1:
            # exactly one wrapper: unwrap it
            elem = parsers_html.html_remove_single_parents(elem)
        elif countParentNodes > 1:
            # multiple wrappers: warn, severity depends on the parent flag
            if parent is True:
                rss_print.print_debug(
                    __file__, "'" + xpathString + "' väärtus[" + str(i) +
                    "] on valestimääratud parent? " + elem, 0)
            else:
                rss_print.print_debug(
                    __file__, "'" + xpathString + "' väärtus[" + str(i) +
                    "] on hoiatatud parent: " + elem, 3)
        # and then the shared strip
        elementStrings[i] = elem.strip()
    rss_print.print_debug(
        __file__,
        "'" + xpathString + "' väärtused: elementStrings = " + str(elementStrings), 4)
    return elementStrings
def html_first_node(htmlString):
    """Cut htmlString down to its first top-level node - from the first
    tag through the matching close tag, re-joining nested same-name tags."""
    # tag name of the very first element, e.g. "<div"
    startTag = htmlString.split(" ")[0].split(">")[0]
    endTag = startTag.replace("<", "</") + ">"
    # split on the closing tag; nesting of the same tag needs re-joining below
    splitParts = htmlString.split(endTag)
    nestedCount = splitParts[0].count(startTag)
    if nestedCount == 1:
        rss_print.print_debug(
            __file__, "esimesest splitist leiti " + str(nestedCount) +
            " esimest tagi '" + startTag + "'", 2)
        return splitParts[0] + endTag
    rss_print.print_debug(
        __file__, "esimesest splitist leiti " + str(nestedCount) +
        " esimest tagi '" + startTag + "': " + str(splitParts[0]), 1)
    return endTag.join(splitParts[0:nestedCount]) + endTag
def dict_reverse_order(articleDataDict):
    """Reverse every value list in place (newest events last, for feedly
    ordering). Aborts unchanged when the non-empty lists differ in length."""
    maxLen = -1
    # first pass: verify all non-empty value lists share one length
    for curValues in articleDataDict.values():
        curLen = len(curValues)
        if curLen > 0:
            if maxLen == -1:
                maxLen = curLen
            elif maxLen != curLen:
                rss_print.print_debug(
                    __file__, "mittekonsistentse pikkusega dict, katkestame", 0)
                return articleDataDict

    rss_print.print_debug(__file__, "pöörame suuna", 2)
    for curValues in articleDataDict.values():
        curValues.reverse()
    return articleDataDict
def str_rchop(curString, stripString):
    """Remove every trailing occurrence of stripString from curString."""
    if not curString:
        rss_print.print_debug(
            __file__,
            "sisend tühi, katkestame: curString = '" + str(curString) + "'", 3)
        return curString
    if not stripString:
        rss_print.print_debug(
            __file__,
            "sisend tühi, katkestame: stripString = '" + str(stripString) + "'", 0)
        return curString

    # peel the suffix off repeatedly until it no longer matches
    # (stripString is non-empty here, so the negative slice is safe)
    while curString.endswith(stripString):
        curString = curString[:-len(stripString)]
    return curString
def write_file_string_to_cache(articleUrl, htmlPageString):
    """Store a fetched page as gzip under article_cache/<domain>/<url with '|' for '/'>."""
    scriptFolder = os.path.dirname(os.path.abspath(__file__))
    cacheFolder = scriptFolder + '/' + 'article_cache'
    domainFolder = cacheFolder + '/' + articleUrl.split('/')[2]
    cacheFileName = articleUrl.replace('/', '|')

    # create missing cache folders (parent first) and hand ownership over
    for folder in (cacheFolder, domainFolder):
        if not os.path.exists(folder):
            rss_print.print_debug(__file__, "loome puuduva kausta: " + folder, 0)
            os.makedirs(folder)
            set_user_as_file_owner(folder)

    write_file(domainFolder, cacheFileName, htmlPageString, fileType="gzip")
def article_data_dict_clean(articleDataDict, dictList, dictCond, dictField):
    """Remove every article entry whose dictField value matches a condition.

    articleDataDict: dict of parallel value lists; entries are deleted
                     across all lists via dict_del_article_index.
    dictList: patterns to test against (casefolded before comparison).
    dictCond: one of "not in", "in", "==" - how each pattern is tested.
    dictField: which value list of articleDataDict drives the matching.
    Returns the cleaned dict.
    """
    if not articleDataDict[dictField]:
        rss_print.print_debug(__file__, "tühi sisend: articleDataDict", 0)
        return articleDataDict
    if not dictList:
        rss_print.print_debug(__file__, "tühi sisend: dictList", 0)
        return articleDataDict

    # while-loop with manual index: the list shrinks on deletion, so the
    # index only advances when the current entry is kept
    i = 0
    while i < len(articleDataDict[dictField]):
        curArticleDictElem = articleDataDict[dictField][i]
        # casefold for case-insensitive comparison
        curArticleDictElem = curArticleDictElem.casefold()
        rss_print.print_debug(
            __file__,
            "kande(" + str(i + 1) + "/" + str(len(articleDataDict[dictField])) +
            ") kontrollime: " + curArticleDictElem[0:800], 3)
        # an entry matches when ANY pattern satisfies dictCond
        found = False
        for dictListElem in dictList:
            dictListElem = dictListElem.casefold()
            if dictCond == "not in" and dictListElem not in curArticleDictElem:
                found = True
                break
            if dictCond == "in" and dictListElem in curArticleDictElem:
                found = True
                break
            if dictCond == "==" and dictListElem == curArticleDictElem:
                found = True
                break
        # check whether the removal condition was met
        if found is True:
            rss_print.print_debug(
                __file__,
                "kande(" + str(i + 1) + "/" + str(len(articleDataDict[dictField])) +
                ") tingimus täidetud: '" + dictListElem + "' " + dictCond +
                " '" + curArticleDictElem + "'", 2)
            # deletion shifts the next entry into slot i - do not advance
            articleDataDict = dict_del_article_index(articleDataDict, i)
        else:
            i += 1
    return articleDataDict