def saveCsv(sqlDF, csv):
    #sqlDF.show(10, truncate=False)
    #util.logMessage("count: %d" % sqlDF.count())

    # output to csv file
    util.logMessage("save to csv: %s" % csv)
    csvTmp = csv + "." + time.strftime("%Y%m%d%H%M%S") + ".tmp"  # temp folder
    sqlDF.coalesce(1).write.csv(csvTmp,
                                header=True,
                                mode='overwrite',
                                sep=',',
                                dateFormat='yyyy-MM-dd',
                                timestampFormat='yyyy-MM-dd HH:mm:ss')
    #timestampFormat='yyyy-MM-dd HH:mm:ss.SSS'

    # rename
    # check result file exist
    outputCsvList = glob.glob(csvTmp + "/*.csv")
    if len(outputCsvList) <= 0:  # no file
        util.logMessage("no file to output: %s" % csv)
        return None

    # supposed to only have 1 because of coalesce(1), but in case of more than
    # one, it will just keep overwriting
    for curr_file in sorted(outputCsvList):
        os.system("rm -rf '%s'" % csv)  # remove prev output
        shutil.move(curr_file, csv)

    os.system("rm -rf '%s'" % csvTmp)  # remove temp output folder
def csvToDF(spark, csvFile, schema=None):
    try:
        if schema is None:
            df = spark.read.csv(csvFile,
                                ignoreLeadingWhiteSpace=True,
                                ignoreTrailingWhiteSpace=True,
                                header=True,
                                timestampFormat='yyyy-MM-dd HH:mm')
        else:
            df = spark.read.csv(csvFile,
                                ignoreLeadingWhiteSpace=True,
                                ignoreTrailingWhiteSpace=True,
                                header=True,
                                timestampFormat='yyyy-MM-dd HH:mm',
                                schema=schema)
        return df
    except Exception as e:
        util.logMessage("Job: %s: Exception Error: %s!" % (APP_NAME, e))
        return None
    except:
        util.logMessage("Job: %s: Other Unknown Error!" % APP_NAME)
        return None
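# A minimal usage sketch, not part of the original module: it assumes a local
# SparkSession can be built here and uses placeholder /tmp paths. It only
# illustrates how csvToDF and saveCsv above are meant to compose; the helper
# name exampleCsvRoundTrip is hypothetical.
def exampleCsvRoundTrip(inputCsv='/tmp/example_in.csv',
                        outputCsv='/tmp/example_out.csv'):
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.appName('exampleCsvRoundTrip').getOrCreate()
    df = csvToDF(spark, inputCsv)  # no schema given: all columns read as strings
    if df is not None:
        saveCsv(df, outputCsv)  # coalesced to a single csv file, then renamed
    spark.stop()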
def updateMasterInfo():
    global optionJSON

    if optionJSON[u'master'] != '':  # if master defined, not using zookeeper
        optionJSON[u'zkStr'] = ''
        util.logMessage("Master default at %s:%d" %
                        (optionJSON[u'master'], optionJSON[u'masterPort']))
    else:  # if master not defined, use zookeeper
        if optionJSON[u'zkStr'] != '':
            util.logMessage("Try to determine master using zookeeper string: %s" %
                            optionJSON[u'zkStr'])
            master, masterPort = util.getMesosMaster(optionJSON[u'zkStr'])
        else:
            util.logMessage("Try to determine master using default zookeeper string: %s" %
                            "zk://mesos_master_01:2181,mesos_master_02:2181,mesos_master_03:2181/mesos")
            master, masterPort = util.getMesosMaster()

        if master == '':  # master not found through zookeeper
            optionJSON[u'master'] = "mesos_master_01"
            util.logMessage("Cannot get master from zookeeper; master default at %s:%d" %
                            (optionJSON[u'master'], optionJSON[u'masterPort']))
        else:  # master found through zookeeper
            optionJSON[u'master'] = master
            optionJSON[u'masterPort'] = masterPort
            util.logMessage("Master detected at %s:%d" %
                            (optionJSON[u'master'], optionJSON[u'masterPort']))
def logQuery(kind: str, table: str, sql: str, data: Iterable[Any]) -> None:
    if not logQueries:
        return
    # normalize whitespace: tabs/newlines to spaces, then collapse runs of spaces
    sql = sql.replace('\t', ' ').replace('\n', ' ')
    while sql.find('  ') >= 0:
        sql = sql.replace('  ', ' ')
    data = transformQueryData(data)
    util.logMessage(f'{kind}: sql={sql} data={data}')
def _makeRequest(self, apiUrl: str,
                 params: Optional[Dict[str, str]]) -> Optional[ScrapeMeta]:
    import requests
    r = None
    ts0 = time.time()
    # while we have time to retry
    while (time.time() - ts0) < self.timeout:
        try:
            # attempt a request
            timePassed = time.time() - ts0
            timeLeft = self.timeout - timePassed
            r = requests.get(
                apiUrl,
                headers=self.headers,
                params=params,
                data=self.extraData,
                auth=self.auth,
                timeout=max(1, timeLeft),
            )
        except:
            util.logMessage(
                'SkitterClient._makeRequest|exception|{}'.format(apiUrl),
                'scrape.log')
            raise

        # if we got rate limited
        if r.status_code == 429:
            retryAfter = int(r.headers['Retry-After'])
            # and there's enough left in our timeout to retry, then retry
            if (time.time() + retryAfter - ts0) >= self.timeout:
                # otherwise break with the 429 response
                break
            time.sleep(retryAfter)
        else:
            # otherwise break with the non-429 response
            break

    if r is None:
        # this _should_ be unreachable
        raise Exception(
            f'SkitterClient._makeRequest: failed to make request: {apiUrl}')

    if r.status_code == 404:
        return None
    if r.status_code != 200:
        raise Exception(
            'SkitterClient._makeRequest: failed to download url {}: {}'.format(
                r.status_code, apiUrl))

    ts = int(r.headers['X-Weaver-Created'])
    url = str(r.headers['X-Weaver-Url'])
    raw = r.content
    text = decodeRequest(raw, url)
    delaySecs(self.delay)
    return buildScrapeMeta(url, ts, text, r.status_code)
def f_map(filetuple):
    [fn, bw] = filetuple
    try:
        xml = XMLParser(fn, 'bytes', 'file', bw, SparkFiles.get('config.ini'))
        #util.logMessage("inside map func, after XMLParser(), before xml.ProcessXML()")
        return xml.ProcessXML()
    except Exception as e:
        util.logMessage("err in file: %s" % fn)
        return ""
def handleFandom(self, fic: Fic, fandom: str) -> List[Fandom]:
    # save raw/messy fandom
    fandoms = [Fandom.define(fandom, sourceId=self.ftype)]
    # ensure messy is in our map
    if fandom not in ffNetFandomMap:
        util.logMessage('unknown fandom: {} (from {})'.format(fandom, fic.url))
    else:
        fandoms.append(Fandom.define(ffNetFandomMap[fandom]))
    return fandoms
def deepSoftScrape(self, fic: Fic) -> None:
    # try to grab reader pages first to be sure we have them
    try:
        self.readerSoftScrape(fic)
    except:
        pass

    urls = self.getDeepPageUrls(fic)
    util.logMessage('deepSoftScrape|{}|{}'.format(fic.id, len(urls)))
    for url in urls:
        self.scrapeLike(url, 5)
def scrape(self, url: str) -> ScrapeMeta:
    url = canonizeUrl(url)
    # TODO staleOnly?
    if self.staleOnly:
        util.logMessage('staleScrape|{}'.format(url), 'scrape.log')
        #r = getMostRecentScrapeWithMeta(url, beforeId = _staleBefore)
        #if r is None or 'raw' not in r:
        #    raise Exception('failed to stale scrape url: {}'.format(url))
        #return { 'url': url, 'fetched': ts, 'raw': r['raw'] }
    res = self.crawl(url)
    saveWebRequest(res['fetched'], res['url'], res['status'], res['raw'])
    return res
def getId(cls, name: str, sourceId: int) -> int:
    assert (isinstance(sourceId, int))
    es = Author.select({'name': name}, f'''
        case when exists (
            select 1 from author_source s
            where s.authorId = authorId and s.sourceId = {sourceId}
        ) then 1 else 0 end desc''')
    if len(es) > 1:
        util.logMessage(f'many authors: name={name} sourceId={sourceId}')
    if len(es) >= 1:
        return es[0].id
    if len(es) > 0:
        raise Exception('FIXME')
    e = Author.new()
    e.name = name
    e.urlId = util.randomString(8, charset=util.urlIdCharset)
    e.insert()
    return Author.getId(name, sourceId)
def decodeRequest(data: Optional[bytes], url: str) -> Optional[str]:
    global decodeFailureDumpFile
    if data is None:
        return None
    try:
        return data.decode('utf-8')
    except:
        pass

    setupCP1252()

    # handle Mórrigan and façade in
    # http://www.fictionalley.org/authors/irina/galatea05.html
    # looks aggressively misencoded
    data = data.replace(b'M\xc3\x83\xc2\xb3rr\xc3\x83\xc2\xadgan', b'M\xf3rrigan')
    data = data.replace(b'fa\xc3\x83\xc2\xa7ade', b'fa\xe7ade')
    data = data.replace(b'#8211;—–\xb5–\xbb–\xb8', b'#8211;—–––')
    data = data.replace(b'#8211;––\xb9 – —\x83', b'#8211;–– – —')
    data = data.replace(b'–\xb9 – —\x83', b'#8211;–– – —')

    # replace misencoded utf-8 bits (likely from a header or footer) with their
    # cp1252 counterparts
    for utoc in utf8_to_cp1252:
        data = data.replace(utoc[0], utoc[1])

    # do some cleanup on the remaining cp1252 to normalize smart quotes and
    # delete a few invalid chars that may have leaked through
    for ctom in cp1252_munge:
        data = data.replace(ctom[0], ctom[1])

    try:
        return data.decode('cp1252')
    except Exception as e:
        util.logMessage('error decoding {}: {}\n{}'.format(
            url, e, traceback.format_exc()))
        with open(decodeFailureDumpFile, 'wb') as f:
            f.write(data)
        raise
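# Illustrative only: the real utf8_to_cp1252 and cp1252_munge tables live
# elsewhere in this module; decodeRequest merely iterates them as
# (old_bytes, new_bytes) pairs. The entries below are hypothetical examples of
# that shape (em dash is b'\xe2\x80\x94' in utf-8 and b'\x97' in cp1252).
example_utf8_to_cp1252 = [
    (b'\xe2\x80\x94', b'\x97'),  # em dash: utf-8 bytes -> cp1252 byte
    (b'\xe2\x80\x99', b'\x92'),  # right single quote: utf-8 bytes -> cp1252 byte
]
example_cp1252_munge = [
    (b'\x92', b"'"),  # normalize smart apostrophe to a plain ASCII quote
]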
def addPkAndSaveParquet(origDF, writemode, outputDir, numPartition=None):
    util.logMessage("adding partition columns...")

    '''
    # add key col from HL_Area
    origDF = origDF.withColumn("HL_Area", lit('unassigned'))
    # add key col from HL_Cluster
    origDF = origDF.withColumn("HL_Cluster", lit('unassigned'))
    # add key col from HL_SectorLayer
    origDF = origDF.withColumn("HL_SectorLayer", lit(None).cast(StringType()))
    '''

    # remove HL_Market column; it will be recovered later
    origDF = origDF.drop("HL_Market")
    origDF.createOrReplaceTempView('kpi')

    # recover from lookup parquet
    util.logMessage("start market-cluster-area recovery process...")

    # example join sql
    #sqlDF = spark.sql("SELECT l.TECH,l.VENDOR,l.MARKET,l.CLUSTER,l.AREA,k.UtranCell from kpi k left join lookup l on k.UtranCell = l.CELL")
    # create join dataframe
    #df = spark.sql("SELECT k.*, IFNULL(l.MARKET,'unassigned') as HL_Market, IFNULL(l.CLUSTER,'unassigned') AS HL_Cluster, IFNULL(l.AREA,'unassigned') AS HL_Area from kpi k left join lookup l on UPPER(k.OSSName) = UPPER(l.OSS) AND k.UtranCell = l.CELL")
    df = spark.sql("SELECT k.*, IFNULL(l.MARKET,'unassigned') as HL_Market, IFNULL(l.CLUSTER,'unassigned') AS HL_Cluster, IFNULL(l.AREA,'unassigned') AS HL_Area, IFNULL(l.SITE,'unassigned') AS HL_Site from kpi k left join lookup l on UPPER(k.OSSName) = UPPER(l.OSS) AND k.UtranCell = l.CELL")

    # add key col from HL_MARKET - need to add HL_MARKET because that column will be gone if we go into sub dir
    df = df.withColumn("pk_market", df['HL_MARKET'])
    # add key col from HL_DATE
    df = df.withColumn("pk_date", date_format(df['HL_DATE'], 'yyyy-MM-dd'))
    # add key col from PERIOD_START_TIME
    df = df.withColumn("pk_hr", date_format(df['PERIOD_START_TIME'], 'HH'))

    # show dtypes
    #util.logMessage("dtypes: %s" % df.dtypes)
    # show schema
    #df.printSchema()
    #df.show(1,truncate=False)

    util.logMessage("start writing parquet file (%s): %s" % (writemode, outputDir))
    if numPartition is None:
        df.write.parquet(outputDir,
                         compression='gzip',
                         mode=writemode,
                         partitionBy=('pk_date', 'pk_market', 'pk_hr'))
    else:
        # coalesce - num of partition - should match number of executor we run
        df.coalesce(numPartition).write.parquet(outputDir,
                                                compression='gzip',
                                                mode=writemode,
                                                partitionBy=('pk_date', 'pk_market', 'pk_hr'))
    util.logMessage("finish writing parquet file (%s): %s" % (writemode, outputDir))
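# A read-back sketch (an assumption, not from the original source): because the
# parquet above is partitioned by pk_date/pk_market/pk_hr, a reader can filter
# on those columns and Spark will prune to the matching sub-directories. The
# helper name readOneMarketHour is hypothetical.
def readOneMarketHour(spark, parquetDir, day, market, hr):
    df = spark.read.parquet(parquetDir)
    # e.g. day='2016-11-21', market='NY', hr='00'
    return df.filter((df['pk_date'] == day) &
                     (df['pk_market'] == market) &
                     (df['pk_hr'] == hr))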
def extractPostThreadmarkTitle(self, postSoup: Any) -> Optional[str]:
    title = ''
    # try to grab the title from the threadmark label
    try:
        labelSpans = postSoup.find_all('span', {'class': 'threadmarkLabel'})
        if len(labelSpans) < 1:
            return None
        if len(labelSpans) > 1:
            util.logMessage(
                f'XenForoAdapter: too many threadmark labels: len: {len(labelSpans)}'
            )
        return str(labelSpans[0].get_text()).strip()
    except Exception as e:
        util.logMessage('\n'.join([
            f'XenForoAdapter.extractPostThreadmarkTitle: exception FIXME: {e}',
            traceback.format_exc()
        ]))
    return None
def getDeepAuthorPostUrls(self, fic: Fic) -> List[str]:
    urls = self.getDeepPageUrls(fic)
    util.logMessage(f'XenForo.getDeepAuthorPostUrls|deep page urls: {urls}')

    # TODO this should probably be more comprehensive...
    author = fic.getAuthorName()
    altAuthor = author.replace("'", '&#039;')  # html-escaped apostrophe

    postUrls: List[str] = []
    seenIdStubs = set()
    for url in urls:
        pageContent = self.scrapeLike(url)
        # See getReaderPostUrls for a fully parsed version
        for b in pageContent.split('<'):
            e = b.find('>')
            if e == -1:
                continue
            s = b[:e]
            # TODO FIXME this is bad :(
            # looking for li or article (the post container)
            if not (b.startswith('li id=') or b.startswith('article class=')):
                continue
            # check for 'message' -- simulates checking for message class
            if not 'message' in s:
                continue
            # to check the data-author we simply look for the author and hope
            # there aren't collisions
            if s.find(author) < 0 and s.find(altAuthor) < 0:
                continue
            # loop over spaced tokens looking for an unspaced id attribute
            for sb in s.split():
                if not sb.startswith('id="') or not sb.endswith('"'):
                    continue
                idStub = sb[len('id="'):-1]
                if idStub.startswith('js-'):
                    idStub = idStub[len('js-'):]
                postUrl = url + '#' + idStub
                if idStub not in seenIdStubs:
                    postUrls += [postUrl]
                seenIdStubs |= {idStub}
    util.logMessage(f'XenForo.getDeepAuthorPostUrls|postUrls: {postUrls}')
    return postUrls
def insert(self) -> None:
    table = type(self).getTableName()
    cols = type(self).getNonGeneratedColumns()
    sql = 'INSERT INTO {}({}) VALUES({})'.format(
        table,
        ', '.join([c.name for c in cols]),
        ', '.join(['%s'] * len(cols))
    )
    data = self.toInsertTuple()
    conn = type(self).getConnection()
    with conn.cursor() as curs:
        try:
            logQuery('insert', table, sql, data)
            curs.execute(sql, data)
        except:
            util.logMessage(f'failed to insert: {sql}: {data}', 'lite.log')
            raise
    global autocommit
    if autocommit == True:
        conn.commit()
def scrape(url: str,
           cookies: 'requests.cookies.RequestsCookieJar' = None,
           delay: float = 3,
           timeout: int = 15) -> ScrapeMeta:
    url = canonizeUrl(url)
    headers = {'User-Agent': __userAgent}
    ts = int(time.time())

    if _staleOnly:
        pass  # util.logMessage('staleScrape|{}'.format(url), 'scrape.log')
        last = getMostRecentScrapeWithMeta(url, beforeId=_staleBefore)
        if last is None or 'raw' not in last:
            raise Exception('failed to stale scrape url: {}'.format(url))
        return {'url': url, 'fetched': ts, 'raw': last['raw']}

    import requests
    if cookies is None:
        import priv
        cookies = priv.getDefaultCookies()

    r = None
    try:
        r = requests.get(url, headers=headers, cookies=cookies, timeout=timeout)
    except:
        util.logMessage('scrape|exception|{}'.format(url), 'scrape.log')
        raise

    if r.status_code != 200:
        saveWebRequest(ts, url, r.status_code, None)
        delaySecs(delay)
        raise Exception('failed to download url {}: {}'.format(
            r.status_code, url))

    raw = r.content
    text = decodeRequest(raw, url)
    saveWebRequest(ts, url, r.status_code, text)
    delaySecs(delay)
    return {'url': url, 'fetched': ts, 'raw': text}
def jsonToSchema(jsonFile):
    schema = None
    try:
        with open(jsonFile) as json_data:
            schemaJson = json.load(json_data)
            #print [item for item in schemaJson]
    except Exception as e:
        util.logMessage("Job: %s: Exception Error: %s!" % (APP_NAME, e))
        return None
    except:
        util.logMessage("Job: %s: Other Unknown Error!" % APP_NAME)
        return None

    schema = StructType([StructField.fromJson(item) for item in schemaJson])
    return schema
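# Hedged illustration (the file path is a placeholder; the field names are
# borrowed from the join SQL above): jsonToSchema expects the file to contain a
# JSON list of StructField descriptors in the form understood by pyspark's
# StructField.fromJson. Writing such a file and feeding the result to csvToDF
# would look roughly like this.
def exampleWriteSchemaJson(path='/tmp/example_schema.json'):
    exampleSchema = [
        {"name": "UtranCell", "type": "string", "nullable": True, "metadata": {}},
        {"name": "PERIOD_START_TIME", "type": "timestamp", "nullable": True, "metadata": {}},
    ]
    with open(path, 'w') as f:
        json.dump(exampleSchema, f)
    # later: schema = jsonToSchema(path); df = csvToDF(spark, someCsv, schema)
    return path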
def f_map(filetuple):
    [fn, bw] = filetuple
    try:
        xml = nokiaXmlParser(bw, SparkFiles.get('config.ini'), fn)
        #util.logMessage("inside map func, after nokiaXmlParser(), before xml.Main()")
        data, type, err = xml.Main()
        #print fn, type, data
        obj = dict()
        obj['err'] = err
        obj['data'] = data
        xmlDict = {type: obj}
        # list of detail [0] and nb summary [1] (if exists)
        return xmlDict
    except Exception as e:
        util.logMessage("err in file: %s\n%s" % (fn, e))
        return {}
def loadCellLookup(lookupPQ):
    global dfCellLookup
    global dictMarketLookup

    # read lookup parquet
    newlookupPQ = lookupPQ + "/TECH=%s/VENDOR=%s" % ('UMTS', 'Ericsson')
    util.logMessage("reading lookup parquet: %s" % newlookupPQ)
    dfLookup = spark.read.parquet(newlookupPQ)
    dfCellLookup = dfLookup  # save to global
    dfLookup.createOrReplaceTempView('lookup')

    sqlDF = spark.sql("SELECT DISTINCT MARKET,MARKET_SUFFIX FROM lookup")
    arrMarketLookup = sqlDF.collect()
    dictMarketLookup = dict()
    for row in arrMarketLookup:
        dictMarketLookup[row['MARKET']] = row['MARKET_SUFFIX']

    util.logMessage("finish reading lookup parquet: %s" % newlookupPQ)
def scrape(url: str, staleOnly: bool = False, fallback: bool = False) -> sc.ScrapeMeta:
    if sc._staleOnly:
        util.logMessage(f'skitter.scrape: HERMES_STALE only {url}')
        return sc.scrape(url)

    if staleOnly:
        util.logMessage(f'skitter.scrape: staleOnly {url}')
        for c in reversed(priv.skitterClients):
            ce = c.cache(url)
            if ce is not None:
                return ce
        raise Exception(f'skitter.scrape: unable to staleOnly scrape: {url}')

    for c in priv.skitterClients:
        try:
            #util.logMessage(f'skitter.scrape: calling {c.ident}.scrape({url})')
            r = c.scrape(url)
            return r
        except Exception as e:
            util.logMessage(f'skitter.scrape: {c.ident}.scrape failed: {e}')
            pass

    if fallback:
        return sc.scrape(url)
    raise Exception(f'skitter.scrape: unable to scrape: {url}')
def getId(cls, authorId: int, sourceId: int, name: str, url: str, localId: str) -> int:
    es = AuthorSource.select({'authorId': authorId, 'sourceId': sourceId})
    if len(es) > 1:
        util.logMessage(
            f'many author source: authorId={authorId} sourceId={sourceId}')
    if len(es) >= 1:
        es[0].name = name
        es[0].url = url
        es[0].localId = localId
        es[0].update()
        return es[0].id
    if len(es) > 0:
        raise Exception('FIXME')
    e = AuthorSource.new()
    e.authorId = authorId
    e.sourceId = sourceId
    e.name = name
    e.url = url
    e.localId = localId
    e.insert()
    return AuthorSource.getId(authorId, sourceId, name, url, localId)
def getReaderPosts(self, fic: Fic) -> Tuple[Dict[str, Any], Dict[int, str]]:
    from bs4 import BeautifulSoup
    urls = self.getReaderUrls(fic)
    soups = {}
    titles = {}
    for url in urls:
        pageContent = self.scrapeLike(url)
        pageSoup = BeautifulSoup(pageContent, 'html5lib')
        posts = pageSoup.find_all(self.postContainer, {'class': 'message'})
        if len(posts) < self.postsPerPage and url != urls[-1]:
            util.logMessage(
                f'XenForoAdapter.getReaderPosts: {url} is not the last page but is incomplete with {len(posts)} posts; attempting to refetch'
            )
            pageContent = scrape.scrape(url, timeout=30)['raw']
            time.sleep(self.defaultDelay)
            pageSoup = BeautifulSoup(pageContent, 'html5lib')
            posts = pageSoup.find_all(self.postContainer, {'class': 'message'})
        if len(posts) < self.postsPerPage and url != urls[-1]:
            raise Exception(
                f'XenForoAdapter.getReaderPosts: {url} is not the last page but is incomplete with {len(posts)} posts'
            )
        for post in posts:
            pid = post.get('id')
            if pid.startswith('js-'):
                pid = pid[len('js-'):]
            soups[pid] = post
            title = self.extractPostThreadmarkTitle(post)
            if title is None:
                title = ''
            titles[len(soups)] = title
    util.logMessage(f'XenForoAdapter.getReaderPostUrls|{fic.id}|{len(soups)}')
    return (soups, titles)
def handleCrossoverFandom(
    self, fic: Fic, fandom: str, fIds: List[int], href: str
) -> List[Fandom]:
    # save raw/messy fandom
    fandoms = [Fandom.define(fandom, sourceId=self.ftype)]

    # ensure fandom ids are in our map
    # check for missing id maps
    missingIds = [fId for fId in fIds if fId not in ffNetFandomIdMap]
    if len(missingIds) > 0:
        util.logMessage(
            'unknown fandom ids: {} from {} in {}'.format(
                missingIds, href, fic.url
            )
        )
        return fandoms

    # translate to messy
    messys = [ffNetFandomIdMap[fId] for fId in fIds]
    # check for missing messy
    missingMessy = [m for m in messys if m not in ffNetFandomMap]
    if len(missingMessy) > 0:
        util.logMessage(
            'unknown messy fandom: {} from {}'.format(missingMessy, href)
        )
        return fandoms

    # check crossover value
    expected = '{}_and_{}_Crossovers'.format(messys[0], messys[1])
    if expected != fandom:
        util.logMessage(
            'crossover got "{}" expected "{}"'.format(fandom, expected)
        )
        return fandoms

    # map messy to clean
    cleans = [ffNetFandomMap[m] for m in messys]
    for clean in cleans:
        if len(clean) > 0:
            fandoms.append(Fandom.define(clean))

    return fandoms
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(wwwHtml, 'html5lib')

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    if fic.ficStatus is None or fic.ficStatus == FicStatus.broken:
        fic.ficStatus = FicStatus.ongoing

    # default optional fields
    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0
    fic.ageRating = 'M'  # TODO?

    # grab title from <title> element
    titles = soup.find('head').find_all('title')
    if len(titles) != 1:
        raise Exception(f'error: cannot find title: {len(titles)}')
    ntitle = ''
    try:
        ntitle = titles[0].get_text()
    except:
        pass  # TODO FIXME
    if fic.title is None or len(ntitle.strip()) > 0:
        fic.title = ntitle
    if len(self.titleSuffix) > 0 and fic.title.endswith(self.titleSuffix):
        fic.title = fic.title[:-len(self.titleSuffix)]
    fic.title = fic.title.strip()

    # determine author
    authorPost = self.getRealAuthorPost(fic)
    authorPostUsernames = authorPost.find_all('a', {'class': 'username'})
    if len(authorPostUsernames) < 1:
        raise Exception('error: unable to find author username')
    author = authorPostUsernames[0].get_text()
    auth_href = authorPostUsernames[0].get('href')
    authorUrl = urllib.parse.urljoin(self.baseUrl, auth_href)
    if not authorUrl.startswith(self.baseUrl):
        raise Exception('error: unknown username href format')
    authorId = authorUrl[len(self.baseUrl):]
    if not authorId.startswith('members/'):
        raise Exception(f'error: unknown author id format: {authorId}')
    authorId = authorId.split('/')[1]
    self.setAuthor(fic, author, authorUrl, authorId)

    if fic.description is None:
        # TODO?
        fic.description = htmlEscape(fic.title + ' by ' + fic.getAuthorName())

    # try grabbing reader version, fallback to full pages
    threadmarksHtml = None
    try:
        sep = '?' if self.baseUrl.find('?') < 0 else '&'
        url = f'{self.baseUrl}threads/{fic.localId}/threadmarks{sep}category_id=1'
        threadmarksHtml = self.scrapeLike(url)
        self.readerSoftScrape(fic)
    except:
        # note: we do this before the threadmarks check for old-style fics
        # soft scrape all thread pages to ensure we have everything
        self.deepSoftScrape(fic)

    postSoups: Dict[str, Any] = {}
    postUrls: List[str] = []
    chapterTitles = {}
    try:
        # scrape the threadmarks page, assuming there is one
        threadmarksSoup = BeautifulSoup(threadmarksHtml, 'html5lib')

        # attempt to extract a fic description
        threadmarkExtraInfo = threadmarksSoup.find(
            'div', {'class': 'threadmarkListingHeader-extraInfo'})
        if threadmarkExtraInfo is not None:
            bbWrapper = threadmarkExtraInfo.find('div', {'class': 'bbWrapper'})
            if bbWrapper is not None:
                desc = bbWrapper.decode_contents()
                descView = HtmlView(desc, markdown=False)
                fic.description = ''.join(
                    [f'<p>{l}</p>' for l in descView.text])

        # determine chapter count based on threadmarks
        threadmarkList = threadmarksSoup.find('div', {'class': 'threadmarkList'})
        threadmarks = None
        if threadmarkList is not None:
            threadmarks = threadmarkList.find_all(
                'li', {'class': 'threadmarkListItem'})
        else:
            threadmarkList = threadmarksSoup.find(
                'div', {'class': 'block-body--threadmarkBody'})
            if threadmarkList is None:
                raise Exception('error: unable to find threadmark menu')
            if threadmarkList.find(class_='fa-ellipsis-h') is not None:
                raise Exception('unable to handle elided threadmarks')
            threadmarks = threadmarkList.find_all('li')
            if len(threadmarks) == 0:
                threadmarks = threadmarkList.find_all('tr')
            util.logMessage(
                f'XenForo|new threadmarks count|{len(threadmarks)}')

        for threadmark in threadmarks:
            if threadmark.find(
                    'span', {'class': 'message-newIndicator'}) is not None:
                continue
            a = threadmark.find('a')
            purl = a.get('href')
            if purl.startswith('threads/'):
                purl = '{}{}'.format(self.baseUrl, purl)
            elif purl.startswith('/threads/'):
                purl = '{}{}'.format(self.baseUrl, purl[1:])
            postUrls += [purl]
            chapterTitles[len(postUrls)] = a.getText().strip()

        try:
            postSoups, _ = self.getReaderPosts(fic)
        except Exception as ie:
            # FIXME oh boy:
            # https://forum.questionablequesting.com/threads/worm-cyoa-things-to-do-in-brockton-bay-when-youre-a-bored-demigod.1247/reader
            # Reader page says 36 threadmarks, but actual threadmark list says 33
            # First reader page abruptly stops at 27 threadmarks
            util.logMessage(
                'XenForoAdapter: unable to getReaderPosts: {}\n{}'.format(
                    ie, traceback.format_exc()))
    except Exception as e:
        util.logMessage(
            'XenForoAdapter: unable to parse threadmarks: {}\n{}'.format(
                e, traceback.format_exc()))
        try:
            postUrls = self.getReaderPostUrls(fic)
            postSoups, chapterTitles = self.getReaderPosts(fic)
        except Exception as ie:
            util.logMessage(
                'XenForoAdapter: unable to parse reader posts: {}\n{}'.format(
                    ie, traceback.format_exc()))
            postUrls = self.getDeepAuthorPostUrls(fic)
            # if we fallback to here, don't immediately setup postSoups at all;
            # they'll be fetched as needed later

    fic.chapterCount = len(postUrls)

    chapterPosts: List[Optional[str]] = []
    chapterUrls: List[str] = []
    chapterPostIds: List[str] = []

    lastSoupUrl: Optional[str] = None
    lastSoup: Optional[Any] = None

    for purl in postUrls:
        parts = purl.split('#')
        burl = parts[0]
        postId = authorPost.get('id') if len(parts) < 2 else parts[1]
        rawPost = None
        # first try getting the post from the reader pages
        if postId in postSoups and postSoups[postId] is not None:
            rawPost = str(postSoups[postId])
        else:
            # if needed, fallback to grabbing that page from the entire thread
            pageSoup = None
            if lastSoupUrl is not None and lastSoupUrl == burl:
                pageSoup = lastSoup
            else:
                pageContent = self.scrapeLike(burl)
                pageSoup = BeautifulSoup(pageContent, 'html5lib')
                lastSoupUrl = burl
                lastSoup = pageSoup
            assert (pageSoup is not None)
            if postId is not None:
                poss = pageSoup.find_all(self.postContainer, {'id': postId})
                if len(poss) != 1:
                    # XenForo2 often has js- prefixed on the actual id attr
                    poss = pageSoup.find_all(self.postContainer,
                                             {'id': 'js-' + postId})
                if len(poss) != 1:
                    raise Exception(
                        f'error: cannot find post for chapter {postId}')
                rawPost = str(poss[0])
            else:
                rawPost = str(
                    pageSoup.find_all(self.postContainer,
                                      {'class': 'message'})[0])

        chapterPosts += [rawPost]
        chapterUrls += [burl]
        chapterPostIds += [postId]

    fic.wordCount = 0
    fic.published = None
    fic.updated = None

    chapterContents: List[str] = []
    for rawPost in chapterPosts:
        post = BeautifulSoup(rawPost, 'html5lib')
        content = post.find_all(
            'div', {'class': ['messageContent', 'message-content']})
        if len(content) != 1:
            raise Exception('error: cannot find content for chapter post')
        content = content[0]

        lastEditedDivs = content.find_all('div', {'class': 'message-lastEdit'})
        for lastEditedDiv in lastEditedDivs:
            br = soup.new_tag("br")
            lastEditedDiv.insert_before(br)

        chapterContents += [str(content)]
        fic.wordCount += len(str(content).split())
        uts = self.getPostUpdatedOrPublished(post)
        if fic.published is None:
            fic.published = OilTimestamp(uts)
        fic.updated = OilTimestamp(uts)

    if fic.updated is None:
        raise Exception(
            f'unable to determine updated date: {len(chapterPosts)} {len(postUrls)}'
        )

    fic.upsert()
    for cid in range(fic.chapterCount):
        chapter = fic.chapter(cid + 1)
        chapter.url = chapterUrls[cid]
        chapter.localChapterId = chapterPostIds[cid]
        if (cid + 1) in chapterTitles:
            chapter.title = chapterTitles[(cid + 1)]
        chapter.upsert()
        chapter.setHtml(str(chapterContents[cid]))

    # TODO: word count, published, updated can only be found once all chapters
    # each post is inside an li id="post-{number}" class="message"
    # each post has data-author="{author}"

    self.updateTitle(fic)

    return fic
# "closeToLimitDelay" : 6, # "exec_core_per_job" : 4, # "drvr_mem" : "512m", # "exec_mem" : "2g", # "logfile" : "", - empty = no log file # "uiStartPort" : "", - empty = default start port range for random func # "uiEndPort" : "", - empty = default end port range for random func # "numFileTypePerTask" : 1, # "exportType" : "" - empty = export all filetype # }' ## "":null --> None in python (no coalesce) ## "":false/true --> False/True in python # argv[6] - (optional) "cluster" or "client" mode if len(sys.argv) < 6: util.logMessage("Error: param incorrect.") sys.exit(2) # argv[5] - option json - get first to get all options optionJSON = "" if len(sys.argv) > 5: optionJSON = sys.argv[5] if optionJSON == "": optionJSON = '{"master":"", "masterPort":5050}' try: optionJSON = json.loads(optionJSON) except Exception as e: # error parsing json optionJSON = '{"master":"", "masterPort":5050}' optionJSON = json.loads(optionJSON) # default val if not exist
def canStartNewJob(statusJSON):
    bHaveResource = True
    delay_sec = general_retry_delay_sec  # general retry delay
    global prev_jobname
    global check_ctr

    # get status
    statusJSON = getStatusJSON_mesos()
    # get cores used
    cores_max, cores_used = getCoresUsed_mesos(statusJSON)
    util.logMessage("Current cores used: %d/%d" % (cores_used, cores_max))
    # get current job status
    numJobs, numWaitingJobs, bFoundLastSubmit = getCurrJobs_mesos(statusJSON)
    # get current worker resource status
    bHaveWorkersResource = haveWorkersResource_mesos(statusJSON)

    # re-calc max num jobs
    max_num_job = int(cores_max / core_per_job)
    if max_num_job > max_num_job_hardlimit:  # check against hard limit
        max_num_job = max_num_job_hardlimit

    # case 1: cannot get job info
    if numJobs == -1 or numWaitingJobs == -1:
        bHaveResource = False
        check_ctr = 0  # reset retry counter
        util.logMessage("cannot get jobs info, retry again in %d sec" % delay_sec)
        '''
        # turned off to relax the check so we do not necessarily wait for the job submit to finish
        # case 2: last submitted job not show up yet
        elif prev_jobname != "" and not bFoundLastSubmit:
            bHaveResource = False
            delay_sec = prev_job_wait_delay_sec  # only wait a little before update
            util.logMessage("last job submit: %s not completed, retry again in %d sec" % (prev_jobname, delay_sec))
        '''

    # case 3: allowed cores exceed
    elif cores_used > (cores_max - core_per_job):
        bHaveResource = False
        check_ctr = 0  # reset retry counter
        util.logMessage("cores exceeding limit, retry again in %d sec" % delay_sec)

    # case 4: do last n # of check before adding last available job slot
    #         check_ctr == max_check_ctr means already check n # of times, pass test
    elif cores_used == (cores_max - core_per_job):
        if check_ctr < max_check_ctr:
            check_ctr += 1
            bHaveResource = False
            delay_sec = core_close_to_limit_delay_sec
            util.logMessage("cores close to limit, retry again in %d sec" % (delay_sec))
        else:
            check_ctr = 0  # condition met, reset retry counter

    # case 5: more than 1 waiting job
    elif numWaitingJobs > 1:
        bHaveResource = False
        check_ctr = 0  # reset retry counter
        util.logMessage("number of waiting job = %d, retry again in %d sec" % (numWaitingJobs, delay_sec))
        '''
        # cannot check this as now there are other different jobs in the pool
        # case 6: max job allowed reached
        elif numJobs >= max_num_job:
            bHaveResource = False
            check_ctr = 0  # reset retry counter
            util.logMessage("reached max num of job (%d/%d), retry again in %d sec" % (numJobs, max_num_job, delay_sec))
        '''

    # case 7: all worker occupied - either no avail core or no avail mem on all the workers
    elif bHaveWorkersResource == False:
        bHaveResource = False
        check_ctr = 0  # reset retry counter
        util.logMessage("all workers are occupied, retry again in %d sec" % delay_sec)

    return bHaveResource, delay_sec
def readerSoftScrape(self, fic: Fic) -> None:
    urls = self.getReaderUrls(fic)
    util.logMessage('readerSoftScrape|fic.id: {}|len(urls): {}'.format(
        fic.id, len(urls)))
    for url in urls:
        self.scrapeLike(url)
def worker(seqfile):
    global prev_jobname

    seqfile_dir, seqfile_file = os.path.split(seqfile)
    if exportMode == 2:  # pq only
        seqfile_dir, seqfile_file = os.path.split(
            seqfile_dir)  # parse again for the main folder (2nd lvl)

    if optionJSON[u'oss'] == "":
        job_oss = ''
    else:
        job_oss = '_' + optionJSON[u'oss']

    if exportMode == 2:  # pq only
        jobname_expMode = 'a'
    elif exportMode == 3:  # csv only
        jobname_expMode = 'b'
    else:  # combine
        jobname_expMode = 'c'
    jobname = "stg3%s_%s%s" % (jobname_expMode, seqfile_file, job_oss)
    jobname = jobname.replace(
        ' ', '-')  # for cluster mode, job name should not contain space - spark bug

    util.logMessage("Task %s start..." % jobname)

    # get random port for web UI
    port = util.getAvailablePortRand(
        optionJSON[u'uiStartPort'], optionJSON[u'uiEndPort'])  # get random port

    # create master string
    if proc_mode == 'cluster':
        # assume the leading master that zk return is the one to be use for dispatcher
        exec_str_master = "mesos://%s:%d" % (optionJSON[u'master'],
                                             optionJSON[u'dispatcherPort'])
    else:  # client
        if optionJSON[u'zkStr'] != '':
            exec_str_master = "mesos://%s" % (optionJSON[u'zkStr'])
        else:
            exec_str_master = "mesos://%s:%d" % (optionJSON[u'master'],
                                                 optionJSON[u'masterPort'])

    # create spark string
    exec_str_spark = "/opt/spark/bin/spark-submit \
--conf spark.ui.port=%d \
--conf spark.network.timeout=900s \
--conf spark.rpc.askTimeout=900s \
--conf spark.executor.heartbeatInterval=900s \
--conf 'spark.driver.extraJavaOptions=-XX:ParallelGCThreads=2' \
--conf 'spark.executor.extraJavaOptions=-XX:ParallelGCThreads=2' \
--master %s \
--deploy-mode %s \
--driver-memory %s \
--executor-memory %s \
--total-executor-cores %d" % (port, exec_str_master, proc_mode,
                              optionJSON[u'drvr_mem'], optionJSON[u'exec_mem'],
                              optionJSON[u'exec_core_per_job'])

    if proc_mode == 'cluster':  # cluster have more options to be set
        exec_str_spark += " --py-files \"%s,%s,%s\"" % (
            "file://%s/../util.py" % curr_py_dir,
            "file://%s/../schema/%s_%s_cell_avail_schema.json" % (
                curr_py_dir, optionJSON[u'tech'], optionJSON[u'vendor']),
            "file://%s/../sql/%s_%s_sql.json" % (
                curr_py_dir, optionJSON[u'tech'], optionJSON[u'vendor']))

    # create python string
    exec_str_py = "%s/../%s_%s_aggregator.py" % (
        curr_py_dir, optionJSON[u'tech'], optionJSON[u'vendor'])

    if exportMode == 3:  # mode 3 - export csv only
        exec_str_app = "%s \
3 \
%s \
%s \
TMO \
\"%s\" \
\"%s\" \
\"%s\" \
'%s'" % (exec_str_py, optionJSON[u'vendorUp'], optionJSON[u'techUp'],
         output_parq, output_dir, input_celllookup_parq, json.dumps(optionJSON))
    elif exportMode == 2:  # mode 2 - create parquet only
        exec_str_app = "%s \
2 \
%s \
%s \
TMO \
\"%s\" \
\"%s/*.txt\" \
\"%s\" \
\"%s\" \
'%s'" % (exec_str_py, optionJSON[u'vendorUp'], optionJSON[u'techUp'],
         input_dir, seqfile, input_celllookup_parq, output_parq,
         json.dumps(optionJSON))
    else:  # mode 1 - create parquet and export csv - not supported anymore, should not run to here
        exec_str_app = "%s \
1 \
%s \
%s \
TMO \
\"%s\" \
\"%s/*.txt\" \
\"%s\" \
\"%s\" \
\"%s\" \
'%s'" % (exec_str_py, optionJSON[u'vendorUp'], optionJSON[u'techUp'],
         input_dir, seqfile, input_celllookup_parq, output_parq, output_dir,
         json.dumps(optionJSON))

    if proc_mode != 'cluster':  # client - support multi master (zookeeper)
        exec_str_app += " &"
    else:  # cluster - currently not support multi master (zookeeper)
        pass

    exec_str = exec_str_spark + " " + exec_str_app

    '''
    # old samples
    # submit new job - xml parser
    #exec_str = "spark-submit --master spark://master:7077 --executor-memory 512m --driver-memory 512m --total-executor-cores 2 %s/kpi_parser_eric.py \"%s\" %s \"%s\" &" % (curr_py_dir, jobname, seqfile, output_dir)
    if proc_mode != 'cluster':  # client - support multi master (zookeeper)
        # exec_str = "/opt/spark/bin/spark-submit --master mesos://mesos_master_01:5050 --driver-memory 512m --executor-memory 966m --total-executor-cores 2 %s/kpi_parser_lte_eric.py \"%s\" %s \"tts@mesos_fs_01|%s\" \"client\" &" % (curr_py_dir, jobname, seqfile, output_dir)
        exec_str = "/opt/spark/bin/spark-submit --master mesos://zk://mesos_master_01:2181,mesos_master_02:2181,mesos_master_03:2181/mesos --driver-memory 512m --executor-memory 966m --total-executor-cores 2 %s/kpi_parser_lte_eric.py \"%s\" %s \"imnosrf@mesos_fs_01|%s\" \"client\" &" % (curr_py_dir, jobname, seqfile, output_dir)
    else:  # cluster - currently not support multi master (zookeeper)
        # exec_str = "/opt/spark/bin/spark-submit --master mesos://mesos_master_01:7077 --deploy-mode cluster --driver-memory 512m --executor-memory 966m --total-executor-cores 2 --py-files \"file:///home/tts/ttskpiraw/code/lte-eric/util.py,file:///home/tts/ttskpiraw/code/lte-eric/xmlparser_lte_eric.py,file:///home/tts/ttskpiraw/code/lte-eric/config.ini\" %s/kpi_parser_lte_eric.py \"%s\" %s \"tts@mesos_fs_01\|%s\" \"cluster\"" % (curr_py_dir, jobname, seqfile, output_dir)
        exec_str = "/opt/spark/bin/spark-submit --master mesos://mesos_master_01:7077 --deploy-mode cluster --driver-memory 512m --executor-memory 966m --total-executor-cores 2 --py-files \"file:///home/imnosrf/ttskpiraw/code/lte-eric/util.py,file:///home/imnosrf/ttskpiraw/code/lte-eric/xmlparser_lte_eric.py,file:///home/imnosrf/ttskpiraw/code/lte-eric/config.ini\" %s/kpi_parser_lte_eric.py \"%s\" %s \"imnosrf@mesos_fs_01\|%s\" \"cluster\"" % (curr_py_dir, jobname, seqfile, output_dir)
    '''

    util.logMessage("%s" % exec_str)

    # update prev jobname
    prev_jobname = jobname

    os.system(exec_str)
def extractContent(self, fic: Fic, html: str) -> str:
    from bs4 import BeautifulSoup
    contentId = util.randomString(8)
    while html.find(contentId) >= 0:
        contentId = util.randomString(len(contentId) + 1)
    soup = BeautifulSoup(f'<div id="{contentId}">{html}</div>', 'html5lib')

    # SB
    for spoiler in soup.find_all('div', {'class': 'bbCodeSpoiler'}):
        button = spoiler.find('button')
        title = spoiler.find('span', {'class': 'bbCodeSpoiler-button-title'})
        if title is not None and button is not None:
            t = soup.new_tag('span')
            t.append(title.get_text())
            button.insert_after(t)
        if button is not None:
            button.extract()
    for spoiler in soup.find_all('div', {'class': 'bbCodeSpoiler-content'}):
        spoiler.attrs['class'] = 'spoiler'

    # QQ
    for spoiler in soup.find_all('div', {'class': 'bbCodeSpoilerContainer'}):
        spoiler.attrs.pop('class')
        spoiler.name = 'span'
    for spoiler in soup.find_all('div', {'class': 'bbCodeSpoilerText'}):
        spoiler.attrs['class'] = 'spoiler'

    # for the proxy js based img tags, fiddle with their attributes so the
    # html cleanup code gets the proxy url out of .data-url and the original
    # upstream url from .src (or the proxy url if we don't have a real
    # upstream)
    for img in soup.find_all('img'):
        # proxy img tags have data-src but no actual src
        if 'data-src' not in img.attrs:
            continue
        if 'src' in img.attrs:
            continue
        src = img.attrs['data-src']
        if not src.startswith('http'):
            src = self.baseUrl + src
        altSrc = None
        if 'data-url' in img.attrs:
            altSrc = img.attrs['data-url']
        img.attrs['data-url'] = src
        img.attrs['src'] = src
        if altSrc:
            img.attrs['src'] = altSrc

    # general 'click to expand' nonsense
    for div in soup.find_all('div', {'class': 'quoteExpand'}):
        if div.get_text().strip() in {
            'Click to expand...', 'Click to expand…'
        }:
            div.extract()

    # CloudFlare protected "emails"
    for e in soup.find_all('a', {'class': '__cf_email__'}):
        if 'data-cfemail' not in e.attrs:
            continue
        t = e.get_text()
        if not t.startswith('[email') or not t.endswith('protected]'):
            continue
        cfemail = e.attrs['data-cfemail']
        email = util.decodeCloudFlareEmail(cfemail)
        util.logMessage(f'decoding email|{cfemail}|{email}')
        e.name = 'span'
        e.attrs.clear()
        e.string = email

    content = soup.find('div', {'id': contentId})
    content = content.contents
    if isinstance(content, list):
        content = content[0]
    return str(content)
def main(input_dir, optionJSON):
    '''
    # sample code
    # get status
    statusJSON = getStatusJSON_mesos()
    cores_max, cores_used = getCoresUsed_mesos(statusJSON)
    print 'max:%s, used:%s' % (cores_max, cores_used)
    print 'have resource: %s' % haveWorkersResource_mesos(statusJSON)
    numJobs, numWaitingJobs, bFoundLastSubmit = getCurrJobs_mesos(statusJSON, '1x2c_client')
    print 'numJobs: %s; numWaitingJobs: %s; bFoundLastSubmit: %s' % (numJobs, numWaitingJobs, bFoundLastSubmit)
    exit(0)
    '''

    global exportMode

    if exportMode != 3:  # not only export csv
        # go thru all seq file/folder
        inputSeqPath = input_dir + "/ttskpiraw_%s_%s_*_TMO*.tgz" % (
            optionJSON[u'vendorFULL'], optionJSON[u'techUp'])
        inputSeqList = glob.glob(inputSeqPath)
        if len(inputSeqList) <= 0:  # no file
            util.logMessage("No parser output to process: %s" % inputSeqPath)
            os.system(
                "rm -rf '%s'" % staging_dir_sub
            )  # remove staging sub folder (since will not be removed by proc)
            if exportMode == 2:
                # if save pq only (no output), and also no input, end process
                util.endProcess(lockpath, 0)
            else:
                # if no input, but have output, only do export
                exportMode = 3  # export only mode

    if exportMode == 3:
        util.getInfoFromPQNokia(output_parq)

        # from parquet dir get main info: filetypelist->datelist->marketlist->hrlist
        # e.g. {"lte_cell_avail": {"2016-11-21": {"NY": {"00": "path"}}}}
        infoPq = util.getInfoFromPQNokia(output_parq)
        if infoPq is None or len(infoPq.items()) <= 0:  # safeguard
            util.logMessage("Error! No data found from parquet file: %s" % output_parq)
            return 0

        filetypeExportArr = []
        filetypeCtr = 0
        filetypeStr = ''
        for filetype, filetypeItem in sorted(infoPq.items()):
            # on each file type, accum into file types list based on # filetype per task
            if filetypeCtr < int(optionJSON[u'numFileTypePerTask']):
                filetypeCtr += 1
            else:
                filetypeCtr = 1
                filetypeExportArr.append(filetypeStr)
            if filetypeCtr == 1:
                filetypeStr = filetype
            else:
                filetypeStr += '|' + filetype
        # leftover filetype
        filetypeExportArr.append(filetypeStr)

        for filetypeStr in filetypeExportArr:
            # on each file types list, spawn new task
            # submit one process to work on the whole folder (of multiple txt file)
            try:
                # get status
                statusJSON = getStatusJSON_mesos()
                bStartNewJob, delay_sec = canStartNewJob(statusJSON)
                while (bStartNewJob == False):
                    time.sleep(delay_sec)
                    bStartNewJob, delay_sec = canStartNewJob(
                        statusJSON)  # retest after the sleep

                # process file
                optionJSON[u'exportType'] = filetypeStr  # set new filetypes (| delimited list)
                worker(staging_dir_sub)

                # wait some sec before next task
                time.sleep(new_job_delay_sec)

            except Exception as e:
                util.logMessage("Error: failed to export file %s\n%s" % (staging_dir_sub, e))
            except:
                util.logMessage("Unexpected error")

        return 0

    # move seq file into staging_sub first to prevent other proc from touching them
    inputSeqStageList = []
    for curr_file in sorted(inputSeqList):
        util.logMessage("Moving %s to staging dir %s" % (curr_file, staging_dir_sub))
        try:
            shutil.move(curr_file, staging_dir_sub)
            curr_filedir, curr_filename = os.path.split(curr_file)
            inputSeqStageList.append(
                os.path.join(staging_dir_sub, curr_filename))
        except shutil.Error as e:
            util.logMessage("Error: failed to move file %s\n%s" % (curr_file, e))
        except:
            util.logMessage("Unexpected error")

    # going to each file in the staging area and unzip into one folder
    for curr_file in inputSeqStageList:
        try:
            exec_str = ''
            if optionJSON[u'vendor'] == 'eric':
                exec_str = "tar -xvzf %s -C %s *%s_%s*TMO.txt" % (
                    curr_file, staging_dir_sub, optionJSON[u'vendorFULL'],
                    optionJSON[u'techUp'])
            else:  # nokia
                exec_str = "tar -xvzf %s -C %s *%s_%s*TMO*.txt" % (
                    curr_file, staging_dir_sub, optionJSON[u'vendorFULL'],
                    optionJSON[u'techUp'])
            util.logMessage('unzipping files: %s' % exec_str)
            os.system(exec_str)

        except Exception as e:
            util.logMessage("Error: failed to process file %s\n%s" % (curr_file, e))
            # try to move it back to input dir for re-processing next round
            try:
                shutil.move(curr_file, input_dir)
            except shutil.Error as e:
                util.logMessage("Error: failed to move file %s\n%s" % (curr_file, e))
            except:
                util.logMessage("Unexpected error")
        except:
            util.logMessage("Unexpected error")
            # try to move it back to input dir for re-processing next round
            try:
                shutil.move(curr_file, input_dir)
            except shutil.Error as e:
                util.logMessage("Error: failed to move file %s\n%s" % (curr_file, e))
            except:
                util.logMessage("Unexpected error")

    # move files into sub folders by file type
    filetypeArr = {}
    filetypeSetArr = {}
    filetypeDirArr = []
    stagingFileList = glob.glob(staging_dir_sub + "/*.txt")
    if len(stagingFileList) > 0:  # safeguard
        for curr_file in stagingFileList:
            curr_stg_dir, curr_data_filename = os.path.split(curr_file)
            filenameArr = curr_data_filename.split('.')[0].split('_')
            filetype = '_'.join(filenameArr[6:])

            '''
            ##### old code - create subfolder by filetype #####
            filetypeDir = staging_dir_sub + '/' + filetype
            if filetype not in filetypeArr:  # create new dir
                filetypeArr.append(filetype)
                if not os.path.isdir(filetypeDir):  # create if not exist
                    try:
                        os.mkdir(filetypeDir)
                        filetypeDirArr.append(filetypeDir)
                    except:
                        util.logMessage("Failed to create folder \"%s\"!" % filetypeDir)
                        util.logMessage("Process terminated.")
                        util.endProcess(lockpath, 2)

            # move file by filetype
            try:
                shutil.move(curr_file, filetypeDir)
            except shutil.Error as e:
                util.logMessage("Error: failed to move file %s\n%s" % (curr_file, e))
            except:
                util.logMessage("Unexpected error")
            ##### old code - create subfolder by filetype #####
            '''

            if filetype not in filetypeArr:  # create new list
                filetypeArr[filetype] = []
            filetypeArr[filetype].append(curr_file)

    numSet = int(
        math.ceil(
            len(filetypeArr) / float(optionJSON[u'numFileTypePerTask'])))
    setCntr = 1  # init
    filetypeCntr = 0  # init
    # reorganize set by grouping together multiple filetypes
    for filetype, filetypeItem in sorted(filetypeArr.items()):
        if filetypeCntr < optionJSON[u'numFileTypePerTask']:
            filetypeCntr += 1
        else:
            filetypeCntr = 1  # reset
            setCntr += 1
        # create set index and new array if not exist
        setIdx = "%d_%d" % (setCntr, numSet)
        if setIdx not in filetypeSetArr:
            filetypeSetArr[setIdx] = []
        # insert filename into set array
        for file in filetypeItem:
            filetypeSetArr[setIdx].append(file)

    # move file to final set dir
    for file_set, fileArr in sorted(filetypeSetArr.items()):
        filetypeDir = staging_dir_sub + '/' + file_set
        if not os.path.isdir(filetypeDir):  # create if not exist
            try:
                os.mkdir(filetypeDir)
                filetypeDirArr.append(filetypeDir)
            except:
                util.logMessage("Failed to create folder \"%s\"!" % filetypeDir)
                util.logMessage("Process terminated.")
                util.endProcess(lockpath, 2)

        for curr_file in fileArr:
            # move file by filetype
            try:
                shutil.move(curr_file, filetypeDir)
            except shutil.Error as e:
                util.logMessage("Error: failed to move file %s\n%s" % (curr_file, e))
            except:
                util.logMessage("Unexpected error")

    # going to each file type folder in the staging area and submit process
    for curr_dir in filetypeDirArr:
        try:
            # get status
            statusJSON = getStatusJSON_mesos()
            bStartNewJob, delay_sec = canStartNewJob(statusJSON)
            while (bStartNewJob == False):
                time.sleep(delay_sec)
                bStartNewJob, delay_sec = canStartNewJob(
                    statusJSON)  # retest after the sleep

            # process file
            worker(curr_dir)

            # wait some sec before next task
            time.sleep(new_job_delay_sec)

        except Exception as e:
            util.logMessage("Error: failed to process file %s\n%s" % (curr_file, e))
            # WES_TEST: doesn't work like that
            # try to move it back to input dir for re-processing next round
            try:
                shutil.move(curr_file, input_dir)
            except shutil.Error as e:
                util.logMessage("Error: failed to move file %s\n%s" % (curr_file, e))
            except:
                util.logMessage("Unexpected error")
        except:
            util.logMessage("Unexpected error")
            # try to move it back to input dir for re-processing next round
            try:
                shutil.move(curr_file, input_dir)
            except shutil.Error as e:
                util.logMessage("Error: failed to move file %s\n%s" % (curr_file, e))
            except:
                util.logMessage("Unexpected error")

    return 0