Example #1
def saveCsv(sqlDF, csv):

   #sqlDF.show(10, truncate=False)
   #util.logMessage("count: %d" % sqlDF.count())

   # output to csv file
   util.logMessage("save to csv: %s" % csv)

   csvTmp = csv+"."+time.strftime("%Y%m%d%H%M%S")+".tmp" # temp folder
   sqlDF.coalesce(1).write.csv(csvTmp,
                               header=True,
                               mode='overwrite',
                               sep=',',
                               dateFormat='yyyy-MM-dd',
                               timestampFormat='yyyy-MM-dd HH:mm:ss')
                               #timestampFormat='yyyy-MM-dd HH:mm:ss.SSS')

   # rename
   # check result file exist
   outputCsvList = glob.glob(csvTmp+"/*.csv")
   if len(outputCsvList) <= 0:  # no file
      util.logMessage("no file to output: %s" % csv)
      return None
   # should only be 1 file because of coalesce(1); if there are more, each move overwrites the previous one
   for curr_file in sorted(outputCsvList):
      os.system("rm -rf '%s'" % csv) # remove prev output
      shutil.move(curr_file, csv)
   os.system("rm -rf '%s'" % csvTmp) # remove temp output folder
Example #2
def csvToDF(spark, csvFile, schema=None):

   try:
      if schema is None:
         df = spark.read.csv(csvFile,
                             ignoreLeadingWhiteSpace=True,
                             ignoreTrailingWhiteSpace=True,
                             header=True,
                             timestampFormat='yyyy-MM-dd HH:mm')
      else:
         df = spark.read.csv(csvFile,
                             ignoreLeadingWhiteSpace=True,
                             ignoreTrailingWhiteSpace=True,
                             header=True,
                             timestampFormat='yyyy-MM-dd HH:mm',
                             schema=schema)

      return df

   except Exception as e:
      util.logMessage("Job: %s: Exception Error: %s!" % (APP_NAME, e))
      return None

   except:
      util.logMessage("Job: %s: Other Unknown Error!" % APP_NAME)
      return None
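When a schema is passed, csvToDF skips inference and applies the given column types. Below is a sketch of a call with an explicit schema, assuming the same spark session and util module as above; the column names and path are hypothetical.

# Hypothetical schema for a KPI csv; for illustration only.
from pyspark.sql.types import (StructType, StructField, StringType,
                               TimestampType, DoubleType)

kpiSchema = StructType([
   StructField("UtranCell", StringType(), True),
   StructField("PERIOD_START_TIME", TimestampType(), True),
   StructField("cell_avail_pct", DoubleType(), True),
])

kpiDF = csvToDF(spark, "/tmp/kpi/*.csv", schema=kpiSchema)
if kpiDF is not None:
   kpiDF.printSchema()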
Example #3
def updateMasterInfo():
    global optionJSON

    if optionJSON[u'master'] != '':  # if master defined, not using zookeeper
        optionJSON[u'zkStr'] = ''
        util.logMessage("Master default at %s:%d" %
                        (optionJSON[u'master'], optionJSON[u'masterPort']))
    else:  # if master not defined, use zookeeper
        if optionJSON[u'zkStr'] != '':
            util.logMessage(
                "Try to determine master using zookeeper string: %s" %
                optionJSON[u'zkStr'])
            master, masterPort = util.getMesosMaster(optionJSON[u'zkStr'])
        else:
            util.logMessage(
                "Try to determine master using default zookeeper string: %s" %
                "zk://mesos_master_01:2181,mesos_master_02:2181,mesos_master_03:2181/mesos"
            )
            master, masterPort = util.getMesosMaster()
        if master == '':  # master not found through zookeeper
            optionJSON[u'master'] = "mesos_master_01"
            util.logMessage(
                "Cannot get master from zookeeper; master default at %s:%d" %
                (optionJSON[u'master'], optionJSON[u'masterPort']))
        else:  # master found through zookeeper
            optionJSON[u'master'] = master
            optionJSON[u'masterPort'] = masterPort
            util.logMessage("Master detected at %s:%d" %
                            (optionJSON[u'master'], optionJSON[u'masterPort']))
Example #4
def logQuery(kind: str, table: str, sql: str, data: Iterable[Any]) -> None:
	if not logQueries:
		return
	sql = sql.replace('\t', ' ').replace('\n', ' ')
	while sql.find('  ') >= 0:
		sql = sql.replace('  ', ' ')
	data = transformQueryData(data)
	util.logMessage(f'{kind}: sql={sql} data={data}')
Example #5
    def _makeRequest(self, apiUrl: str,
                     params: Optional[Dict[str, str]]) -> Optional[ScrapeMeta]:
        import requests
        r = None
        ts0 = time.time()
        # while we have time to retry
        while (time.time() - ts0) < self.timeout:
            try:
                # attempt a request
                timePassed = time.time() - ts0
                timeLeft = self.timeout - timePassed
                r = requests.get(
                    apiUrl,
                    headers=self.headers,
                    params=params,
                    data=self.extraData,
                    auth=self.auth,
                    timeout=max(1, timeLeft),
                )
            except:
                util.logMessage(
                    'SkitterClient._makeRequest|exception|{}'.format(apiUrl),
                    'scrape.log')
                raise
            # if we got rate limited
            if r.status_code == 429:
                retryAfter = int(r.headers['Retry-After'])
                # if waiting out Retry-After would run past our timeout,
                # break with the 429 response
                if (time.time() + retryAfter - ts0) >= self.timeout:
                    break
                # otherwise sleep through Retry-After and retry
                time.sleep(retryAfter)
            else:
                # otherwise break with the non-429 response
                break

        if r is None:
            # this _should_ be unreachable
            raise Exception(
                f'SkitterClient._makeRequest: failed to make request: {apiUrl}'
            )

        if r.status_code == 404:
            return None

        if r.status_code != 200:
            raise Exception(
                'SkitterClient._makeRequest: failed to download url {}: {}'.
                format(r.status_code, apiUrl))

        ts = int(r.headers['X-Weaver-Created'])
        url = str(r.headers['X-Weaver-Url'])

        raw = r.content
        text = decodeRequest(raw, url)

        delaySecs(self.delay)
        return buildScrapeMeta(url, ts, text, r.status_code)
Example #6
def f_map(filetuple):
   [fn,bw] = filetuple

   try:
      xml = XMLParser(fn,'bytes','file',bw,SparkFiles.get('config.ini'))
      #util.logMessage("inside map func, after XMLParser(), before xml.ProcessXML()")
      return xml.ProcessXML()
   except Exception as e:
      util.logMessage("err in file: %s" % fn)
      return ""
Example #7
	def handleFandom(self, fic: Fic, fandom: str) -> List[Fandom]:
		# save raw/messy fandom
		fandoms = [Fandom.define(fandom, sourceId=self.ftype)]

		# ensure messy is in our map
		if fandom not in ffNetFandomMap:
			util.logMessage('unknown fandom: {} (from {})'.format(fandom, fic.url))
		else:
			fandoms.append(Fandom.define(ffNetFandomMap[fandom]))

		return fandoms
Example #8
    def deepSoftScrape(self, fic: Fic) -> None:
        # try to grab reader pages first to be sure we have them
        try:
            self.readerSoftScrape(fic)
        except:
            pass

        urls = self.getDeepPageUrls(fic)
        util.logMessage('deepSoftScrape|{}|{}'.format(fic.id, len(urls)))
        for url in urls:
            self.scrapeLike(url, 5)
Example #9
    def scrape(self, url: str) -> ScrapeMeta:
        url = canonizeUrl(url)
        # TODO staleOnly?
        if self.staleOnly:
            util.logMessage('staleScrape|{}'.format(url), 'scrape.log')

            #r = getMostRecentScrapeWithMeta(url, beforeId = _staleBefore)
            #if r is None or 'raw' not in r:
            #	raise Exception('failed to stale scrape url: {}'.format(url))
            #return { 'url': url, 'fetched': ts, 'raw': r['raw'] }

        res = self.crawl(url)
        saveWebRequest(res['fetched'], res['url'], res['status'], res['raw'])
        return res
Example #10
    def getId(cls, name: str, sourceId: int) -> int:
        assert (isinstance(sourceId, int))
        es = Author.select({'name': name}, f'''
			case when exists (
				select 1 from author_source s
				where s.authorId = authorId and s.sourceId = {sourceId}
			) then 1 else 0 end desc''')
        if len(es) > 1:
            util.logMessage(f'many authors: name={name} sourceId={sourceId}')
        if len(es) >= 1:
            return es[0].id
        if len(es) > 0:
            raise Exception('FIXME')
        e = Author.new()
        e.name = name
        e.urlId = util.randomString(8, charset=util.urlIdCharset)
        e.insert()
        return Author.getId(name, sourceId)
Example #11
def decodeRequest(data: Optional[bytes], url: str) -> Optional[str]:
    global decodeFailureDumpFile
    if data is None:
        return None

    try:
        return data.decode('utf-8')
    except:
        pass

    setupCP1252()

    # handle Mórrigan and façade in
    # http://www.fictionalley.org/authors/irina/galatea05.html
    # looks aggressively misencoded
    data = data.replace(b'M\xc3\x83\xc2\xb3rr\xc3\x83\xc2\xadgan',
                        b'M\xf3rrigan')
    data = data.replace(b'fa\xc3\x83\xc2\xa7ade', b'fa\xe7ade')

    data = data.replace(b'#8211;&#8212;&#8211;\xb5&#8211;\xbb&#8211;\xb8',
                        b'#8211;&#8212;&#8211;&#8211;&#8211;')
    data = data.replace(b'#8211;&#8211;&#8211;\xb9 &#8211; &#8212;\x83',
                        b'#8211;&#8211;&#8211; &#8211; &#8212;')
    data = data.replace(b'&#8211;\xb9 &#8211; &#8212;\x83',
                        b'#8211;&#8211;&#8211; &#8211; &#8212;&#8')

    # replace misencoded utf-8 bits (likely from a header or footer) with their
    # cp1252 counterparts
    for utoc in utf8_to_cp1252:
        data = data.replace(utoc[0], utoc[1])

    # do some cleanup on the remaining cp1252 to normalize smart quotes and
    # delete a few invalid chars that may have leaked through
    for ctom in cp1252_munge:
        data = data.replace(ctom[0], ctom[1])

    try:
        return data.decode('cp1252')
    except Exception as e:
        util.logMessage('error decoding {}: {}\n{}'.format(
            url, e, traceback.format_exc()))
        with open(decodeFailureDumpFile, 'wb') as f:
            f.write(data)
        raise
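The utf8_to_cp1252 and cp1252_munge tables used above are defined elsewhere in the module and are not shown here. As a hypothetical illustration of what one utf8_to_cp1252 entry could look like, the pair below is derived by round-tripping a right single quote (U+2019) through the usual mojibake path, so that the final data.decode('cp1252') recovers the intended character.

# Hypothetical derivation of one (misencoded-utf8, cp1252) replacement pair;
# the real table contents in the module may differ.
good = '\u2019'                           # right single quotation mark
mojibake = good.encode('utf-8').decode('cp1252').encode('utf-8')
print(mojibake)                           # b'\xc3\xa2\xe2\x82\xac\xe2\x84\xa2'
print(good.encode('cp1252'))              # b'\x92'
example_pair = (mojibake, good.encode('cp1252'))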
Example #12
def addPkAndSaveParquet(origDF, writemode, outputDir, numPartition=None):

   util.logMessage("adding partition columns...")

   '''
   # add key col from HL_Area
   origDF = origDF.withColumn("HL_Area", lit('unassigned'))
   # add key col from HL_Cluster
   origDF = origDF.withColumn("HL_Cluster", lit('unassigned'))
   # add key col from HL_SectorLayer
   origDF = origDF.withColumn("HL_SectorLayer", lit(None).cast(StringType()))
   '''

   # remove HL_Market column; it will be recovered later
   origDF = origDF.drop("HL_Market")
   origDF.createOrReplaceTempView('kpi')

   # recover from lookup parquet
   util.logMessage("start market-cluster-area recovery process...")


   # example join sql
   #sqlDF = spark.sql("SELECT l.TECH,l.VENDOR,l.MARKET,l.CLUSTER,l.AREA,k.UtranCell from kpi k left join lookup l on k.UtranCell = l.CELL")
   # create join dataframe
   #df = spark.sql("SELECT k.*, IFNULL(l.MARKET,'unassigned') as HL_Market, IFNULL(l.CLUSTER,'unassigned') AS HL_Cluster, IFNULL(l.AREA,'unassigned') AS HL_Area from kpi k left join lookup l on UPPER(k.OSSName) = UPPER(l.OSS) AND k.UtranCell = l.CELL")
   df = spark.sql("SELECT k.*, IFNULL(l.MARKET,'unassigned') as HL_Market, IFNULL(l.CLUSTER,'unassigned') AS HL_Cluster, IFNULL(l.AREA,'unassigned') AS HL_Area, IFNULL(l.SITE,'unassigned') AS HL_Site from kpi k left join lookup l on UPPER(k.OSSName) = UPPER(l.OSS) AND k.UtranCell = l.CELL")

   # add key col from HL_MARKET - need to add HL_MARKET because that column will be gone if we go into sub dir
   df = df.withColumn("pk_market", df['HL_MARKET'])
   # add key col from HL_DATE
   df = df.withColumn("pk_date", date_format(df['HL_DATE'], 'yyyy-MM-dd'))
   # add key col from PERIOD_START_TIME
   df = df.withColumn("pk_hr", date_format(df['PERIOD_START_TIME'], 'HH'))


   # show dtypes
   #util.logMessage("dtypes: %s" % df.dtypes)
   # show schema
   #df.printSchema()
   #df.show(1,truncate=False)

   util.logMessage("start writing parquet file (%s): %s" % (writemode, outputDir))
   if numPartition is None:
      df.write.parquet(outputDir,
                       compression='gzip',
                       mode=writemode,
                       partitionBy=('pk_date','pk_market','pk_hr'))
   else:
      # coalesce - num of partition - should match number of executor we run
      df.coalesce(numPartition).write.parquet(outputDir,
                       compression='gzip',
                       mode=writemode,
                       partitionBy=('pk_date','pk_market','pk_hr'))
   util.logMessage("finish writing parquet file (%s): %s" % (writemode, outputDir))     
Example #13
 def extractPostThreadmarkTitle(self, postSoup: Any) -> Optional[str]:
     title = ''
     # try to grab the title from the threadmark label
     try:
         labelSpans = postSoup.find_all('span',
                                        {'class': 'threadmarkLabel'})
         if len(labelSpans) < 1:
             return None
         if len(labelSpans) > 1:
             util.logMessage(
                 f'XenForoAdapter: too many threadmark labels: len: {len(labelSpans)}'
             )
         return str(labelSpans[0].get_text()).strip()
     except Exception as e:
         util.logMessage('\n'.join([
             f'XenForoAdapter.extractPostThreadmarkTitle: exception FIXME: {e}',
             traceback.format_exc()
         ]))
     return None
Example #14
    def getDeepAuthorPostUrls(self, fic: Fic) -> List[str]:
        urls = self.getDeepPageUrls(fic)
        util.logMessage(
            f'XenForo.getDeepAuthorPostUrls|deep page urls: {urls}')
        # TODO this should probably be more comprehensive...
        author = fic.getAuthorName()
        altAuthor = author.replace("'", '&#039;')
        postUrls: List[str] = []
        seenIdStubs = set()
        for url in urls:
            pageContent = self.scrapeLike(url)

            # See getReaderPostUrls for a fully parsed version
            for b in pageContent.split('<'):
                e = b.find('>')
                if e == -1:
                    continue
                s = b[:e]
                # TODO FIXME this is bad :(
                # looking for li or article (the post container)
                if not (b.startswith('li id=')
                        or b.startswith('article class=')):
                    continue
                # check for 'message' -- simulates checking for message class
                if not 'message' in s:
                    continue
                # to check the data-author we simply look for the author and hope
                # there aren't collisions
                if s.find(author) < 0 and s.find(altAuthor) < 0:
                    continue
                # loop over spaced tokens looking for an unspaced id attribute
                for sb in s.split():
                    if not sb.startswith('id="') or not sb.endswith('"'):
                        continue
                    idStub = sb[len('id="'):-1]
                    if idStub.startswith('js-'):
                        idStub = idStub[len('js-'):]
                    postUrl = url + '#' + idStub
                    if idStub not in seenIdStubs:
                        postUrls += [postUrl]
                    seenIdStubs |= {idStub}
        util.logMessage(f'XenForo.getDeepAuthorPostUrls|postUrls: {postUrls}')
        return postUrls
Example #15
	def insert(self) -> None:
		table = type(self).getTableName()
		cols = type(self).getNonGeneratedColumns()
		sql = 'INSERT INTO {}({}) VALUES({})'.format(
			table, ', '.join([c.name for c in cols]), ', '.join(['%s'] * len(cols))
		)
		data = self.toInsertTuple()
		conn = type(self).getConnection()
		with conn.cursor() as curs:
			try:
				logQuery('insert', table, sql, data)
				curs.execute(sql, data)
			except:
				util.logMessage(f'failed to insert: {sql}: {data}', 'lite.log')
				raise

		global autocommit
		if autocommit == True:
			conn.commit()
Example #16
def scrape(url: str,
           cookies: 'requests.cookies.RequestsCookieJar' = None,
           delay: float = 3,
           timeout: int = 15) -> ScrapeMeta:
    url = canonizeUrl(url)
    headers = {'User-Agent': __userAgent}
    ts = int(time.time())

    if _staleOnly:
        pass  # util.logMessage('staleScrape|{}'.format(url), 'scrape.log')

        last = getMostRecentScrapeWithMeta(url, beforeId=_staleBefore)
        if last is None or 'raw' not in last:
            raise Exception('failed to stale scrape url: {}'.format(url))
        return {'url': url, 'fetched': ts, 'raw': last['raw']}

    import requests
    if cookies is None:
        import priv
        cookies = priv.getDefaultCookies()
    r = None
    try:
        r = requests.get(url,
                         headers=headers,
                         cookies=cookies,
                         timeout=timeout)
    except:
        util.logMessage('scrape|exception|{}'.format(url), 'scrape.log')
        raise

    if r.status_code != 200:
        saveWebRequest(ts, url, r.status_code, None)
        delaySecs(delay)
        raise Exception('failed to download url {}: {}'.format(
            r.status_code, url))

    raw = r.content
    text = decodeRequest(raw, url)

    saveWebRequest(ts, url, r.status_code, text)
    delaySecs(delay)
    return {'url': url, 'fetched': ts, 'raw': text}
Example #17
def jsonToSchema(jsonFile):

   schema = None   

   try:
      with open(jsonFile) as json_data:
         schemaJson = json.load(json_data)
         #print [item for item in schemaJson]

   except Exception as e:
      util.logMessage("Job: %s: Exception Error: %s!" % (APP_NAME, e))
      return None

   except:
      util.logMessage("Job: %s: Other Unknown Error!" % APP_NAME)
      return None


   schema = StructType([StructField.fromJson(item) for item in schemaJson])
   return schema
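jsonToSchema expects the file to contain a JSON array of field descriptors in the shape produced by StructField.jsonValue(). Below is a sketch of writing such a file and loading it back; the file name and field names are illustrative only.

# Sketch: produce a schema json that jsonToSchema can read (hypothetical fields).
import json
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

fields = StructType([
   StructField("UtranCell", StringType(), True),
   StructField("PERIOD_START_TIME", TimestampType(), True),
])
with open("/tmp/umts_eric_cell_avail_schema.json", "w") as jf:
   json.dump([fld.jsonValue() for fld in fields.fields], jf, indent=2)

schema = jsonToSchema("/tmp/umts_eric_cell_avail_schema.json")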
Example #18
def f_map(filetuple):
    [fn, bw] = filetuple

    try:
        xml = nokiaXmlParser(bw, SparkFiles.get('config.ini'), fn)
        #util.logMessage("inside map func, after nokiaXmlParser(), before xml.Main()")
        data, type, err = xml.Main()
        #print fn, type, data

        obj = dict()
        obj['err'] = err
        obj['data'] = data
        xmlDict = {
            type: obj
        }  # list of detail [0] and nb summary [1] (if exists)
        return xmlDict

    except Exception as e:
        util.logMessage("err in file: %s\n%s" % (fn, e))
        return {}
Example #19
def loadCellLookup(lookupPQ):
   global dfCellLookup
   global dictMarketLookup

   # read lookup parquet
   newlookupPQ = lookupPQ + "/TECH=%s/VENDOR=%s" % ('UMTS','Ericsson')
   util.logMessage("reading lookup parquet: %s" % newlookupPQ)

   dfLookup = spark.read.parquet(newlookupPQ)
   dfCellLookup = dfLookup # save to global
   dfLookup.createOrReplaceTempView('lookup')

   sqlDF = spark.sql("SELECT DISTINCT MARKET,MARKET_SUFFIX FROM lookup")
   arrMarketLookup = sqlDF.collect()

   dictMarketLookup = dict()
   for row in arrMarketLookup:
      dictMarketLookup[row['MARKET']] = row['MARKET_SUFFIX']

   util.logMessage("finish reading lookup parquet: %s" % newlookupPQ)
Example #20
def scrape(url: str,
           staleOnly: bool = False,
           fallback: bool = False) -> sc.ScrapeMeta:
    if sc._staleOnly:
        util.logMessage(f'skitter.scrape: HERMES_STALE only {url}')
        return sc.scrape(url)

    if staleOnly:
        util.logMessage(f'skitter.scrape: staleOnly {url}')
        for c in reversed(priv.skitterClients):
            ce = c.cache(url)
            if ce is not None:
                return ce
        raise Exception(f'skitter.scrape: unable to staleOnly scrape: {url}')

    for c in priv.skitterClients:
        try:
            #util.logMessage(f'skitter.scrape: calling {c.ident}.scrape({url})')
            r = c.scrape(url)
            return r
        except Exception as e:
            util.logMessage(f'skitter.scrape: {c.ident}.scrape failed: {e}')
            pass

    if fallback:
        return sc.scrape(url)
    raise Exception(f'skitter.scrape: unable to scrape: {url}')
Example #21
    def getId(cls, authorId: int, sourceId: int, name: str, url: str,
              localId: str) -> int:
        es = AuthorSource.select({'authorId': authorId, 'sourceId': sourceId})
        if len(es) > 1:
            util.logMessage(
                f'many author source: authorId={authorId} sourceId={sourceId}')
        if len(es) >= 1:
            es[0].name = name
            es[0].url = url
            es[0].localId = localId
            es[0].update()
            return es[0].id
        if len(es) > 0:
            raise Exception('FIXME')

        e = AuthorSource.new()
        e.authorId = authorId
        e.sourceId = sourceId
        e.name = name
        e.url = url
        e.localId = localId
        e.insert()
        return AuthorSource.getId(authorId, sourceId, name, url, localId)
Example #22
    def getReaderPosts(self,
                       fic: Fic) -> Tuple[Dict[str, Any], Dict[int, str]]:
        from bs4 import BeautifulSoup
        urls = self.getReaderUrls(fic)
        soups = {}
        titles = {}
        for url in urls:
            pageContent = self.scrapeLike(url)
            pageSoup = BeautifulSoup(pageContent, 'html5lib')
            posts = pageSoup.find_all(self.postContainer, {'class': 'message'})
            if len(posts) < self.postsPerPage and url != urls[-1]:
                util.logMessage(
                    f'XenForoAdapter.getReaderPosts: {url} is not the last page but is incomplete with {len(posts)} posts; attempting to refetch'
                )
                pageContent = scrape.scrape(url, timeout=30)['raw']
                time.sleep(self.defaultDelay)
                pageSoup = BeautifulSoup(pageContent, 'html5lib')
                posts = pageSoup.find_all(self.postContainer,
                                          {'class': 'message'})
            if len(posts) < self.postsPerPage and url != urls[-1]:
                raise Exception(
                    f'XenForoAdapter.getReaderPosts: {url} is not the last page but is incomplete with {len(posts)} posts'
                )
            for post in posts:
                pid = post.get('id')
                if pid.startswith('js-'):
                    pid = pid[len('js-'):]
                soups[pid] = post
                title = self.extractPostThreadmarkTitle(post)
                if title is None:
                    title = ''
                titles[len(soups)] = title

        util.logMessage(
            f'XenForoAdapter.getReaderPostUrls|{fic.id}|{len(soups)}')
        return (soups, titles)
Example #23
	def handleCrossoverFandom(
		self, fic: Fic, fandom: str, fIds: List[int], href: str
	) -> List[Fandom]:
		# save raw/messy fandom
		fandoms = [Fandom.define(fandom, sourceId=self.ftype)]

		# ensure fandom ids are in our map

		# check for missing id maps
		missingIds = [fId for fId in fIds if fId not in ffNetFandomIdMap]
		if len(missingIds) > 0:
			util.logMessage(
				'unknown fandom ids: {} from {} in {}'.format(
					missingIds, href, fic.url
				)
			)
			return fandoms

		# translate to messy
		messys = [ffNetFandomIdMap[fId] for fId in fIds]
		# check for missing messy
		missingMessy = [m for m in messys if m not in ffNetFandomMap]
		if len(missingMessy) > 0:
			util.logMessage(
				'unknown messy fandom: {} from {}'.format(missingMessy, href)
			)
			return fandoms

		# check crossover value
		expected = '{}_and_{}_Crossovers'.format(messys[0], messys[1])
		if expected != fandom:
			util.logMessage(
				'crossover got "{}" expected "{}"'.format(fandom, expected)
			)
			return fandoms

		# map messy to clean
		cleans = [ffNetFandomMap[m] for m in messys]
		for clean in cleans:
			if len(clean) > 0:
				fandoms.append(Fandom.define(clean))
		return fandoms
Example #24
    def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(wwwHtml, 'html5lib')

        fic.fetched = OilTimestamp.now()
        fic.languageId = Language.getId("English")  # TODO: don't hard code?
        if fic.ficStatus is None or fic.ficStatus == FicStatus.broken:
            fic.ficStatus = FicStatus.ongoing

        # default optional fields
        fic.reviewCount = 0
        fic.favoriteCount = 0
        fic.followCount = 0
        fic.ageRating = 'M'  # TODO?

        # grab title from <title> element
        titles = soup.find('head').find_all('title')
        if len(titles) != 1:
            raise Exception(f'error: cannot find title: {len(titles)}')
        ntitle = ''
        try:
            ntitle = titles[0].get_text()
        except:
            pass  # TODO FIXME
        if fic.title is None or len(ntitle.strip()) > 0:
            fic.title = ntitle
        if len(self.titleSuffix) > 0 and fic.title.endswith(self.titleSuffix):
            fic.title = fic.title[:-len(self.titleSuffix)]
        fic.title = fic.title.strip()

        # determine author
        authorPost = self.getRealAuthorPost(fic)
        authorPostUsernames = authorPost.find_all('a', {'class': 'username'})
        if len(authorPostUsernames) < 1:
            raise Exception('error: unable to find author username')
        author = authorPostUsernames[0].get_text()
        auth_href = authorPostUsernames[0].get('href')
        authorUrl = urllib.parse.urljoin(self.baseUrl, auth_href)
        if not authorUrl.startswith(self.baseUrl):
            raise Exception('error: unknown username href format')
        authorId = authorUrl[len(self.baseUrl):]
        if not authorId.startswith('members/'):
            raise Exception(f'error: unknown author id format: {authorId}')
        authorId = authorId.split('/')[1]
        self.setAuthor(fic, author, authorUrl, authorId)

        if fic.description is None:
            # TODO?
            fic.description = htmlEscape(fic.title + ' by ' +
                                         fic.getAuthorName())

        # try grabbing reader version, fallback to full pages
        threadmarksHtml = None
        try:
            sep = '?' if self.baseUrl.find('?') < 0 else '&'
            url = f'{self.baseUrl}threads/{fic.localId}/threadmarks{sep}category_id=1'
            threadmarksHtml = self.scrapeLike(url)
            self.readerSoftScrape(fic)
        except:
            # note: we do this before the threadmarks check for old-style fics
            # soft scrape all thread pages to ensure we have everything
            self.deepSoftScrape(fic)

        postSoups: Dict[str, Any] = {}

        postUrls: List[str] = []
        chapterTitles = {}
        try:
            # scrape the threadmarks page, assuming there is one
            threadmarksSoup = BeautifulSoup(threadmarksHtml, 'html5lib')

            # attempt to extract a fic description
            threadmarkExtraInfo = threadmarksSoup.find(
                'div', {'class': 'threadmarkListingHeader-extraInfo'})
            if threadmarkExtraInfo is not None:
                bbWrapper = threadmarkExtraInfo.find('div',
                                                     {'class': 'bbWrapper'})
                if bbWrapper is not None:
                    desc = bbWrapper.decode_contents()
                    descView = HtmlView(desc, markdown=False)
                    fic.description = ''.join(
                        [f'<p>{l}</p>' for l in descView.text])

            # determine chapter count based on threadmarks
            threadmarkList = threadmarksSoup.find('div',
                                                  {'class': 'threadmarkList'})
            threadmarks = None
            if threadmarkList is not None:
                threadmarks = threadmarkList.find_all(
                    'li', {'class': 'threadmarkListItem'})
            else:
                threadmarkList = threadmarksSoup.find(
                    'div', {'class': 'block-body--threadmarkBody'})
                if threadmarkList is None:
                    raise Exception('error: unable to find threadmark menu')
                if threadmarkList.find(class_='fa-ellipsis-h') is not None:
                    raise Exception('unable to handle elided threadmarks')
                threadmarks = threadmarkList.find_all('li')
                if len(threadmarks) == 0:
                    threadmarks = threadmarkList.find_all('tr')
                util.logMessage(
                    f'XenForo|new threadmarks count|{len(threadmarks)}')

            for threadmark in threadmarks:
                if threadmark.find(
                        'span', {'class': 'message-newIndicator'}) is not None:
                    continue
                a = threadmark.find('a')
                purl = a.get('href')
                if purl.startswith('threads/'):
                    purl = '{}{}'.format(self.baseUrl, purl)
                elif purl.startswith('/threads/'):
                    purl = '{}{}'.format(self.baseUrl, purl[1:])
                postUrls += [purl]

                chapterTitles[len(postUrls)] = a.getText().strip()

            try:
                postSoups, _ = self.getReaderPosts(fic)
            except Exception as ie:
                # FIXME oh boy:
                # https://forum.questionablequesting.com/threads/worm-cyoa-things-to-do-in-brockton-bay-when-youre-a-bored-demigod.1247/reader
                # Reader page says 36 threadmarks, but actual threadmark list says 33
                # First reader page abruptly stops at 27 threadmarks
                util.logMessage(
                    'XenForoAdapter: unable to getReaderPosts: {}\n{}'.format(
                        ie, traceback.format_exc()))
        except Exception as e:
            util.logMessage(
                'XenForoAdapter: unable to parse threadmarks: {}\n{}'.format(
                    e, traceback.format_exc()))
            try:
                postUrls = self.getReaderPostUrls(fic)
                postSoups, chapterTitles = self.getReaderPosts(fic)
            except Exception as ie:
                util.logMessage(
                    'XenForoAdapter: unable to parse reader posts: {}\n{}'.
                    format(ie, traceback.format_exc()))
                postUrls = self.getDeepAuthorPostUrls(fic)
                # if we fallback to here, don't immediately setup postSoups at all;
                # they'll be fetched as needed later

        fic.chapterCount = len(postUrls)

        chapterPosts: List[Optional[str]] = []
        chapterUrls: List[str] = []
        chapterPostIds: List[str] = []

        lastSoupUrl: Optional[str] = None
        lastSoup: Optional[Any] = None

        for purl in postUrls:
            parts = purl.split('#')
            burl = parts[0]
            postId = authorPost.get('id') if len(parts) < 2 else parts[1]

            rawPost = None
            # first try getting the post from the reader pages
            if postId in postSoups and postSoups[postId] is not None:
                rawPost = str(postSoups[postId])
            else:
                # if needed, fallback to grabbing that page from the entire thread
                pageSoup = None
                if lastSoupUrl is not None and lastSoupUrl == burl:
                    pageSoup = lastSoup
                else:
                    pageContent = self.scrapeLike(burl)
                    pageSoup = BeautifulSoup(pageContent, 'html5lib')
                    lastSoupUrl = burl
                    lastSoup = pageSoup
                assert (pageSoup is not None)
                if postId is not None:
                    poss = pageSoup.find_all(self.postContainer,
                                             {'id': postId})
                    if len(poss) != 1:
                        # XenForo2 often has js- prefixed on the actual id attr
                        poss = pageSoup.find_all(self.postContainer,
                                                 {'id': 'js-' + postId})
                    if len(poss) != 1:
                        raise Exception(
                            f'error: cannot find post for chapter {postId}')
                    rawPost = str(poss[0])
                else:
                    rawPost = str(
                        pageSoup.find_all(self.postContainer,
                                          {'class': 'message'})[0])

            chapterPosts += [rawPost]
            chapterUrls += [burl]
            chapterPostIds += [postId]

        fic.wordCount = 0
        fic.published = None
        fic.updated = None

        chapterContents: List[str] = []
        for rawPost in chapterPosts:
            post = BeautifulSoup(rawPost, 'html5lib')
            content = post.find_all(
                'div', {'class': ['messageContent', 'message-content']})
            if len(content) != 1:
                raise Exception('error: cannot find content for chapter post')
            content = content[0]

            lastEditedDivs = content.find_all('div',
                                              {'class': 'message-lastEdit'})
            for lastEditedDiv in lastEditedDivs:
                br = soup.new_tag("br")
                lastEditedDiv.insert_before(br)

            chapterContents += [str(content)]
            fic.wordCount += len(str(content).split())

            uts = self.getPostUpdatedOrPublished(post)

            if fic.published is None:
                fic.published = OilTimestamp(uts)
            fic.updated = OilTimestamp(uts)

        if fic.updated is None:
            raise Exception(
                f'unable to determine updated date: {len(chapterPosts)} {len(postUrls)}'
            )

        fic.upsert()
        for cid in range(fic.chapterCount):
            chapter = fic.chapter(cid + 1)
            chapter.url = chapterUrls[cid]
            chapter.localChapterId = chapterPostIds[cid]
            if (cid + 1) in chapterTitles:
                chapter.title = chapterTitles[(cid + 1)]
            chapter.upsert()

            chapter.setHtml(str(chapterContents[cid]))

        # TODO: word count, published, updated can only be found once all chapters

        # each post is inside an li id="post-{number}" class="message"
        # each post has data-author="{author}"

        self.updateTitle(fic)

        return fic
Example #25
#              "closeToLimitDelay" : 6,
#              "exec_core_per_job" : 4,
#              "drvr_mem" : "512m",
#              "exec_mem" : "2g",
#              "logfile" : "", - empty = no log file
#              "uiStartPort" : "", - empty = default start port range for random func
#              "uiEndPort" : "", - empty = default end port range for random func
#              "numFileTypePerTask" : 1,
#              "exportType" : "" - empty = export all filetype
#             }'
##           "":null --> None in python (no coalesce)
##           "":false/true --> False/True in python
# argv[6] - (optional) "cluster" or "client" mode

if len(sys.argv) < 6:
    util.logMessage("Error: param incorrect.")
    sys.exit(2)

# argv[5] - option json - get first to get all options
optionJSON = ""
if len(sys.argv) > 5:
    optionJSON = sys.argv[5]
if optionJSON == "":
    optionJSON = '{"master":"", "masterPort":5050}'
try:
    optionJSON = json.loads(optionJSON)
except Exception as e:  # error parsing json
    optionJSON = '{"master":"", "masterPort":5050}'
    optionJSON = json.loads(optionJSON)

# default val if not exist
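One way the "default val if not exist" step could look is sketched below; the keys come from the commented option list above and from the other examples, but these particular default values are assumptions for illustration only.

# Hypothetical defaults for missing options (values are assumptions).
optionJSON.setdefault(u'masterPort', 5050)
optionJSON.setdefault(u'zkStr', '')
optionJSON.setdefault(u'exec_core_per_job', 4)
optionJSON.setdefault(u'drvr_mem', '512m')
optionJSON.setdefault(u'exec_mem', '2g')
optionJSON.setdefault(u'numFileTypePerTask', 1)
optionJSON.setdefault(u'exportType', '')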
Example #26
def canStartNewJob(statusJSON):

    bHaveResource = True
    delay_sec = general_retry_delay_sec  # general retry delay
    global prev_jobname
    global check_ctr

    # get status
    statusJSON = getStatusJSON_mesos()

    # get cores used
    cores_max, cores_used = getCoresUsed_mesos(statusJSON)
    util.logMessage("Current cores used: %d/%d" % (cores_used, cores_max))

    # get current job status
    numJobs, numWaitingJobs, bFoundLastSubmit = getCurrJobs_mesos(statusJSON)

    # get current worker resource status
    bHaveWorkersResource = haveWorkersResource_mesos(statusJSON)

    # re-calc max num jobs
    max_num_job = int(cores_max / core_per_job)
    if max_num_job > max_num_job_hardlimit:  # check against hard limit
        max_num_job = max_num_job_hardlimit

    # case 1: cannot get job info
    if numJobs == -1 or numWaitingJobs == -1:
        bHaveResource = False
        check_ctr = 0  # reset retry counter
        util.logMessage("cannot get jobs info, retry again in %d sec" %
                        delay_sec)
        '''
		# turned off to relax the check so we do not necessarily wait for the job submit to finish
	# case 2: last submitted job has not shown up yet
	elif prev_jobname != "" and not bFoundLastSubmit:
		bHaveResource = False
		delay_sec = prev_job_wait_delay_sec # only wait for little before update
		util.logMessage("last job submit: %s not completed, retry again in %d sec" % (prev_jobname, delay_sec))
		'''

    # case 3: allowed cores exceed
    elif cores_used > (cores_max - core_per_job):
        bHaveResource = False
        check_ctr = 0  # reset retry counter
        util.logMessage("cores exceeding limit, retry again in %d sec" %
                        delay_sec)

    # case 4: do last n # of check before adding last available job slot
    # check_ctr == max_check_ctr means already check n # of times, pass test
    elif cores_used == (cores_max - core_per_job):
        if check_ctr < max_check_ctr:
            check_ctr += 1
            bHaveResource = False
            delay_sec = core_close_to_limit_delay_sec
            util.logMessage("cores close to limit, retry again in %d sec" %
                            (delay_sec))
        else:
            check_ctr = 0  # condition met, reset retry counter

    # case 5: more than 1 waiting job
    elif numWaitingJobs > 1:
        bHaveResource = False
        check_ctr = 0  # reset retry counter
        util.logMessage("number of waiting job = %d, retry again in %d sec" %
                        (numWaitingJobs, delay_sec))
        '''
		# cannot check this as now there are other different jobs in the pool
	# case 6: max job allowed reached
	elif numJobs >= max_num_job:
		bHaveResource = False
		check_ctr = 0 # reset retry counter
		util.logMessage("reached max num of job (%d/%d), retry again in %d sec" % (numJobs, max_num_job, delay_sec))
		'''

    # case 7: all worker occupied - either no avail core or no avail mem on all the workers
    elif bHaveWorkersResource == False:
        bHaveResource = False
        check_ctr = 0  # reset retry counter
        util.logMessage("all workers are occupied, retry again in %d sec" %
                        delay_sec)

    return bHaveResource, delay_sec
Example #27
 def readerSoftScrape(self, fic: Fic) -> None:
     urls = self.getReaderUrls(fic)
     util.logMessage('readerSoftScrape|fic.id: {}|len(urls): {}'.format(
         fic.id, len(urls)))
     for url in urls:
         self.scrapeLike(url)
Example #28
def worker(seqfile):

    global prev_jobname
    seqfile_dir, seqfile_file = os.path.split(seqfile)
    if exportMode == 2:  # pq only
        seqfile_dir, seqfile_file = os.path.split(
            seqfile_dir)  # parse again for the main folder (2nd lvl)
    if optionJSON[u'oss'] == "":
        job_oss = ''
    else:
        job_oss = '_' + optionJSON[u'oss']
    if exportMode == 2:  # pq only
        jobname_expMode = 'a'
    elif exportMode == 3:  # csv only
        jobname_expMode = 'b'
    else:  # combine
        jobname_expMode = 'c'
    jobname = "stg3%s_%s%s" % (jobname_expMode, seqfile_file, job_oss)
    jobname = jobname.replace(
        ' ',
        '-')  # for cluster mode, job name should not contain space - spark bug

    util.logMessage("Task %s start..." % jobname)

    # get random port for web UI
    port = util.getAvailablePortRand(
        optionJSON[u'uiStartPort'],
        optionJSON[u'uiEndPort'])  # get random port

    # create master string
    if proc_mode == 'cluster':  # assume the leading master that zk return is the one to be use for dispatcher
        exec_str_master = "mesos://%s:%d" % (optionJSON[u'master'],
                                             optionJSON[u'dispatcherPort'])
    else:  # client
        if optionJSON[u'zkStr'] != '':
            exec_str_master = "mesos://%s" % (optionJSON[u'zkStr'])
        else:
            exec_str_master = "mesos://%s:%d" % (optionJSON[u'master'],
                                                 optionJSON[u'masterPort'])

    # create spark string
    exec_str_spark = "/opt/spark/bin/spark-submit \
--conf spark.ui.port=%d \
--conf spark.network.timeout=900s \
--conf spark.rpc.askTimeout=900s \
--conf spark.executor.heartbeatInterval=900s \
--conf 'spark.driver.extraJavaOptions=-XX:ParallelGCThreads=2' \
--conf 'spark.executor.extraJavaOptions=-XX:ParallelGCThreads=2' \
--master %s \
--deploy-mode %s \
--driver-memory %s \
--executor-memory %s \
--total-executor-cores %d" % (port, exec_str_master, proc_mode,
                              optionJSON[u'drvr_mem'], optionJSON[u'exec_mem'],
                              optionJSON[u'exec_core_per_job'])
    if proc_mode == 'cluster':  # cluster have more options to be set
        exec_str_spark += " --py-files \"%s,%s,%s\"" % (
            "file://%s/../util.py" % curr_py_dir,
            "file://%s/../schema/%s_%s_cell_avail_schema.json" %
            (curr_py_dir, optionJSON[u'tech'], optionJSON[u'vendor']),
            "file://%s/../sql/%s_%s_sql.json" %
            (curr_py_dir, optionJSON[u'tech'], optionJSON[u'vendor']))

    # create python string
    exec_str_py = "%s/../%s_%s_aggregator.py" % (
        curr_py_dir, optionJSON[u'tech'], optionJSON[u'vendor'])
    if exportMode == 3:  # mode 3 - export csv only
        exec_str_app = "%s \
3 \
%s \
%s \
TMO \
\"%s\" \
\"%s\" \
\"%s\" \
'%s'" % (exec_str_py, optionJSON[u'vendorUp'],
         optionJSON[u'techUp'], output_parq, output_dir, input_celllookup_parq,
         json.dumps(optionJSON))
    elif exportMode == 2:  # mode 2 - create parquet only
        exec_str_app = "%s \
2 \
%s \
%s \
TMO \
\"%s\" \
\"%s/*.txt\" \
\"%s\" \
\"%s\" \
'%s'" % (exec_str_py, optionJSON[u'vendorUp'], optionJSON[u'techUp'],
         input_dir, seqfile, input_celllookup_parq, output_parq,
         json.dumps(optionJSON))
    else:  # mode 1 - create parquet and export csv - not support anymore, should not run to here
        exec_str_app = "%s \
1 \
%s \
%s \
TMO \
\"%s\" \
\"%s/*.txt\" \
\"%s\" \
\"%s\" \
\"%s\" \
'%s'" % (exec_str_py, optionJSON[u'vendorUp'], optionJSON[u'techUp'],
         input_dir, seqfile, input_celllookup_parq, output_parq, output_dir,
         json.dumps(optionJSON))
    if proc_mode != 'cluster':  # client - support multi master (zookeeper)
        exec_str_app += " &"
    else:  # cluster - currently not support multi master (zookeeper)
        pass

    exec_str = exec_str_spark + " " + exec_str_app
    '''
	# old samples
	# submit new job - xml parser
	#exec_str = "spark-submit --master spark://master:7077 --executor-memory 512m --driver-memory 512m --total-executor-cores 2 %s/kpi_parser_eric.py \"%s\" %s \"%s\" &" % (curr_py_dir, jobname, seqfile, output_dir)
	if proc_mode != 'cluster': # client - support multi master (zookeeper)
	#	exec_str = "/opt/spark/bin/spark-submit --master mesos://mesos_master_01:5050 --driver-memory 512m --executor-memory 966m --total-executor-cores 2 %s/kpi_parser_lte_eric.py \"%s\" %s \"tts@mesos_fs_01|%s\" \"client\" &" % (curr_py_dir, jobname, seqfile, output_dir)
		exec_str = "/opt/spark/bin/spark-submit --master mesos://zk://mesos_master_01:2181,mesos_master_02:2181,mesos_master_03:2181/mesos --driver-memory 512m --executor-memory 966m --total-executor-cores 2 %s/kpi_parser_lte_eric.py \"%s\" %s \"imnosrf@mesos_fs_01|%s\" \"client\" &" % (curr_py_dir, jobname, seqfile, output_dir)
	else: # cluster - currently not support multi master (zookeeper)
	#	exec_str = "/opt/spark/bin/spark-submit --master mesos://mesos_master_01:7077 --deploy-mode cluster --driver-memory 512m --executor-memory 966m --total-executor-cores 2 --py-files \"file:///home/tts/ttskpiraw/code/lte-eric/util.py,file:///home/tts/ttskpiraw/code/lte-eric/xmlparser_lte_eric.py,file:///home/tts/ttskpiraw/code/lte-eric/config.ini\" %s/kpi_parser_lte_eric.py \"%s\" %s \"tts@mesos_fs_01\|%s\" \"cluster\"" % (curr_py_dir, jobname, seqfile, output_dir)
		exec_str = "/opt/spark/bin/spark-submit --master mesos://mesos_master_01:7077 --deploy-mode cluster --driver-memory 512m --executor-memory 966m --total-executor-cores 2 --py-files \"file:///home/imnosrf/ttskpiraw/code/lte-eric/util.py,file:///home/imnosrf/ttskpiraw/code/lte-eric/xmlparser_lte_eric.py,file:///home/imnosrf/ttskpiraw/code/lte-eric/config.ini\" %s/kpi_parser_lte_eric.py \"%s\" %s \"imnosrf@mesos_fs_01\|%s\" \"cluster\"" % (curr_py_dir, jobname, seqfile, output_dir)
	'''

    util.logMessage("%s" % exec_str)

    # update prev jobname
    prev_jobname = jobname

    os.system(exec_str)
Example #29
    def extractContent(self, fic: Fic, html: str) -> str:
        from bs4 import BeautifulSoup
        contentId = util.randomString(8)
        while html.find(contentId) >= 0:
            contentId = util.randomString(len(contentId) + 1)
        soup = BeautifulSoup(f'<div id="{contentId}">{html}</div>', 'html5lib')

        # SB
        for spoiler in soup.find_all('div', {'class': 'bbCodeSpoiler'}):
            button = spoiler.find('button')
            title = spoiler.find('span',
                                 {'class': 'bbCodeSpoiler-button-title'})
            if title is not None and button is not None:
                t = soup.new_tag('span')
                t.append(title.get_text())
                button.insert_after(t)
            if button is not None:
                button.extract()
        for spoiler in soup.find_all('div',
                                     {'class': 'bbCodeSpoiler-content'}):
            spoiler.attrs['class'] = 'spoiler'

        # QQ
        for spoiler in soup.find_all('div',
                                     {'class': 'bbCodeSpoilerContainer'}):
            spoiler.attrs.pop('class')
            spoiler.name = 'span'
        for spoiler in soup.find_all('div', {'class': 'bbCodeSpoilerText'}):
            spoiler.attrs['class'] = 'spoiler'

        # for the proxy js based img tags, fiddle with their attributes so the
        # html cleanup code gets the proxy url out of .data-url and the original
        # upstream url from .src (or the proxy url if we don't have a real
        # upstream)
        for img in soup.find_all('img'):
            # proxy img tags have data-src but no actual src
            if 'data-src' not in img.attrs:
                continue
            if 'src' in img.attrs:
                continue

            src = img.attrs['data-src']
            if not src.startswith('http'):
                src = self.baseUrl + src
            altSrc = None
            if 'data-url' in img.attrs:
                altSrc = img.attrs['data-url']
            img.attrs['data-url'] = src
            img.attrs['src'] = src
            if altSrc:
                img.attrs['src'] = altSrc

        # general 'click to expand' nonsense
        for div in soup.find_all('div', {'class': 'quoteExpand'}):
            if div.get_text().strip() in {
                    'Click to expand...', 'Click to expand…'
            }:
                div.extract()

        # CloudFlare protected "emails"
        for e in soup.find_all('a', {'class': '__cf_email__'}):
            if 'data-cfemail' not in e.attrs:
                continue
            t = e.get_text()
            if not t.startswith('[email') or not t.endswith('protected]'):
                continue
            cfemail = e.attrs['data-cfemail']
            email = util.decodeCloudFlareEmail(cfemail)
            util.logMessage(f'decoding email|{cfemail}|{email}')

            e.name = 'span'
            e.attrs.clear()
            e.string = email

        content = soup.find('div', {'id': contentId})
        content = content.contents
        if isinstance(content, list):
            content = content[0]
        return str(content)
Example #30
def main(input_dir, optionJSON):
    '''
   # sample code
   # get status
   statusJSON = getStatusJSON_mesos()
   cores_max, cores_used = getCoresUsed_mesos(statusJSON)
   print 'max:%s, used:%s' % (cores_max, cores_used)
   print 'have resource: %s' % haveWorkersResource_mesos(statusJSON)
   numJobs, numWaitingJobs, bFoundLastSubmit = getCurrJobs_mesos(statusJSON, '1x2c_client')
   print 'numJobs: %s; numWaitingJobs: %s; bFoundLastSubmit: %s' % (numJobs, numWaitingJobs, bFoundLastSubmit)
   exit(0)
   '''

    global exportMode

    if exportMode != 3:  # not only export csv
        # go thru all seq file/folder
        inputSeqPath = input_dir + "/ttskpiraw_%s_%s_*_TMO*.tgz" % (
            optionJSON[u'vendorFULL'], optionJSON[u'techUp'])
        inputSeqList = glob.glob(inputSeqPath)
        if len(inputSeqList) <= 0:  # no file
            util.logMessage("No parser output to process: %s" % inputSeqPath)
            os.system(
                "rm -rf '%s'" % staging_dir_sub
            )  # remove staging sub folder (since will not be removed by proc)
            if exportMode == 2:  # if save pq only (no output), and also no input, end process
                util.endProcess(lockpath, 0)
            else:  # if no input, but have output, only do export
                exportMode = 3

    # export only mode
    if exportMode == 3:
        # from parquet dir get main info: filetypelist->datelist->marketlist->hrlist e.g. {"lte_cell_avail": {"2016-11-21": {"NY": {"00": "path"}}}}
        infoPq = util.getInfoFromPQNokia(output_parq)
        if infoPq is None or len(infoPq.items()) <= 0:  # safeguard
            util.logMessage("Error! No data found from parquet file: %s" %
                            output_parq)
            return 0

        filetypeExportArr = []
        filetypeCtr = 0
        filetypeStr = ''
        for filetype, filetypeItem in sorted(
                infoPq.items()
        ):  # on each file type, accum into file types list based on # filetype per task
            if filetypeCtr < int(optionJSON[u'numFileTypePerTask']):
                filetypeCtr += 1
            else:
                filetypeCtr = 1
                filetypeExportArr.append(filetypeStr)

            if filetypeCtr == 1:
                filetypeStr = filetype
            else:
                filetypeStr += '|' + filetype

        # leftover filetype
        filetypeExportArr.append(filetypeStr)

        for filetypeStr in filetypeExportArr:  # on each file types list, spawn new task

            # submit one process to work on the whole folder (of multiple txt file)
            try:
                # get status
                statusJSON = getStatusJSON_mesos()
                bStartNewJob, delay_sec = canStartNewJob(statusJSON)
                while (bStartNewJob == False):
                    time.sleep(delay_sec)
                    bStartNewJob, delay_sec = canStartNewJob(
                        statusJSON)  # retest after the sleep

                # process file
                optionJSON[
                    u'exportType'] = filetypeStr  # set new filetypes (| delimited list)
                worker(staging_dir_sub)

                # wait some sec before next task
                time.sleep(new_job_delay_sec)

            except Exception as e:
                util.logMessage("Error: failed to export file %s\n%s" %
                                (staging_dir_sub, e))
            except:
                util.logMessage("Unexpected error")

        return 0

    # move seq file into staging_sub first to prevent other proc from touching them
    inputSeqStageList = []
    for curr_file in sorted(inputSeqList):
        util.logMessage("Moving %s to staging dir %s" %
                        (curr_file, staging_dir_sub))
        try:
            shutil.move(curr_file, staging_dir_sub)
            curr_filedir, curr_filename = os.path.split(curr_file)
            inputSeqStageList.append(
                os.path.join(staging_dir_sub, curr_filename))
        except shutil.Error as e:
            util.logMessage("Error: failed to move file %s\n%s" %
                            (curr_file, e))
        except:
            util.logMessage("Unexpected error")

    # going to each file in the staging area and unzip into one folder
    for curr_file in inputSeqStageList:
        try:

            exec_str = ''
            if optionJSON[u'vendor'] == 'eric':
                exec_str = "tar -xvzf %s -C %s *%s_%s*TMO.txt" % (
                    curr_file, staging_dir_sub, optionJSON[u'vendorFULL'],
                    optionJSON[u'techUp'])
            else:  # nokia
                exec_str = "tar -xvzf %s -C %s *%s_%s*TMO*.txt" % (
                    curr_file, staging_dir_sub, optionJSON[u'vendorFULL'],
                    optionJSON[u'techUp'])
            util.logMessage('unzipping files: %s' % exec_str)
            os.system(exec_str)

        except Exception as e:
            util.logMessage("Error: failed to process file %s\n%s" %
                            (curr_file, e))
            # try to move it back to input dir for re-processing next round
            try:
                shutil.move(curr_file, input_dir)
            except shutil.Error as e:
                util.logMessage("Error: failed to move file %s\n%s" %
                                (curr_file, e))
            except:
                util.logMessage("Unexpected error")
        except:
            util.logMessage("Unexpected error")
            # try to move it back to input dir for re-processing next round
            try:
                shutil.move(curr_file, input_dir)
            except shutil.Error as e:
                util.logMessage("Error: failed to move file %s\n%s" %
                                (curr_file, e))
            except:
                util.logMessage("Unexpected error")

    # move files into sub folders by file type
    filetypeArr = {}
    filetypeSetArr = {}
    filetypeDirArr = []
    stagingFileList = glob.glob(staging_dir_sub + "/*.txt")
    if len(stagingFileList) > 0:  # safeguard
        for curr_file in stagingFileList:
            curr_stg_dir, curr_data_filename = os.path.split(curr_file)
            filenameArr = curr_data_filename.split('.')[0].split('_')
            filetype = '_'.join(filenameArr[6:])
            '''
         ##### old code - create subfolder by filetype #####
         filetypeDir = staging_dir_sub + '/' + filetype

         if filetype not in filetypeArr: # create new dir

            filetypeArr.append(filetype)        

            if not os.path.isdir(filetypeDir): # create if not exist
               try:
                  os.mkdir(filetypeDir)
                  filetypeDirArr.append(filetypeDir)
               except:
                  util.logMessage("Failed to create folder \"%s\"!" % filetypeDir)
                  util.logMessage("Process terminated.")
                  util.endProcess(lockpath, 2)            
        
         # move file by filetype
         try:
            shutil.move(curr_file, filetypeDir)
         except shutil.Error as e:
            util.logMessage("Error: failed to move file %s\n%s" % (curr_file, e))
         except:
            util.logMessage("Unexpected error")
         ##### old code - create subfolder by filetype #####
         '''

            if filetype not in filetypeArr:  # create new list
                filetypeArr[filetype] = []
            filetypeArr[filetype].append(curr_file)

        numSet = int(
            math.ceil(
                len(filetypeArr) / float(optionJSON[u'numFileTypePerTask'])))
        setCntr = 1  # init
        filetypeCntr = 0  # init
        # reorganize set by grouping together multiple filetypes
        for filetype, filetypeItem in sorted(filetypeArr.items()):

            if filetypeCntr < optionJSON[u'numFileTypePerTask']:
                filetypeCntr += 1
            else:
                filetypeCntr = 1  # reset
                setCntr += 1

            # create set index and new array if not exist
            setIdx = "%d_%d" % (setCntr, numSet)
            if setIdx not in filetypeSetArr:
                filetypeSetArr[setIdx] = []

            # insert filename into set array
            for file in filetypeItem:
                filetypeSetArr[setIdx].append(file)

        # move file to final set dir
        for file_set, fileArr in sorted(filetypeSetArr.items()):

            filetypeDir = staging_dir_sub + '/' + file_set
            if not os.path.isdir(filetypeDir):  # create if not exist
                try:
                    os.mkdir(filetypeDir)
                    filetypeDirArr.append(filetypeDir)
                except:
                    util.logMessage("Failed to create folder \"%s\"!" %
                                    filetypeDir)
                    util.logMessage("Process terminated.")
                    util.endProcess(lockpath, 2)

            for curr_file in fileArr:
                # move file by filetype
                try:
                    shutil.move(curr_file, filetypeDir)
                except shutil.Error as e:
                    util.logMessage("Error: failed to move file %s\n%s" %
                                    (curr_file, e))
                except:
                    util.logMessage("Unexpected error")

    # going to each file type folder in the staging area and submit process
    for curr_dir in filetypeDirArr:
        try:

            # get status
            statusJSON = getStatusJSON_mesos()
            bStartNewJob, delay_sec = canStartNewJob(statusJSON)
            while (bStartNewJob == False):
                time.sleep(delay_sec)
                bStartNewJob, delay_sec = canStartNewJob(
                    statusJSON)  # retest after the sleep

            # process file
            worker(curr_dir)

            # wait some sec before next task
            time.sleep(new_job_delay_sec)

        except Exception as e:
            util.logMessage("Error: failed to process file %s\n%s" %
                            (curr_file, e))
            # WES_TEST: doesn't work like that
            # try to move it back to input dir for re-processing next round
            try:
                shutil.move(curr_file, input_dir)
            except shutil.Error as e:
                util.logMessage("Error: failed to move file %s\n%s" %
                                (curr_file, e))
            except:
                util.logMessage("Unexpected error")
        except:
            util.logMessage("Unexpected error")
            # try to move it back to input dir for re-processing next round
            try:
                shutil.move(curr_file, input_dir)
            except shutil.Error as e:
                util.logMessage("Error: failed to move file %s\n%s" %
                                (curr_file, e))
            except:
                util.logMessage("Unexpected error")

    return 0