def classify(htmlSeries, sm):
    """Classify every crawled page under BASEFILESTORAGEDIR against the
    reference html series; pages whose average dftDistance score falls
    below the similarity threshold are moved to the 'useless' folder.

    Parameters:
        htmlSeries -- list of tag-signal series for the sample URLs
                      (as produced by KastParsersLib.html2TagSignal).
        sm         -- similarity-measure threshold; pages scoring below
                      it are considered useless.
    """
    import shutil  # local import: used only for the file move below

    global BASEFILESTORAGEDIR

    # Make the useless pages folder.
    uselessPagesFolder = KastGenericFunctionsLib.chkmkFolderStructure(BASEFILESTORAGEDIR + '/useless/')

    # List all the files in the storage folder.
    listOfFiles = [BASEFILESTORAGEDIR + p for p in os.listdir(BASEFILESTORAGEDIR)]

    # Process every file exactly once.
    # BUG FIX: the original drew `random.choice(listOfFiles)` on each of
    # len(listOfFiles) iterations, which can classify the same page several
    # times and never look at others.
    for page in listOfFiles:
        # Extract the gzipped content of the file.
        c = gzip.open(page, 'rb')
        try:
            contents = c.read()
        finally:
            c.close()

        # Write to a tmp file so it can be referenced via a file:// URL.
        # 'wb' because the gzip payload is raw bytes.
        tmpFilename = '/tmp/' + page.split('/')[-1]
        f = open(tmpFilename, 'wb')
        try:
            f.write(contents)
        finally:
            f.close()

        # Generate html series of this file, tphs --> testPageHtmlSeries
        tphsUrl = 'file://' + tmpFilename
        tphs = KastParsersLib.html2TagSignal(tphsUrl)

        # Score this page's series against every reference series.
        dftDistanceScoreboard = [KastParsersLib.dftDistance(tphs, d) for d in htmlSeries]

        # Now calculate the average score.
        s = KastGenericFunctionsLib.calcAvg(dftDistanceScoreboard)

        # Score is less than mean similarity measure: move it to the
        # useless folder.
        # BUG FIX: os.system(page, uselessPagesFolder) raised TypeError —
        # os.system accepts a single command string. shutil.move performs
        # the intended relocation portably.
        if s < sm:
            shutil.move(page, uselessPagesFolder)
def main(targetWebsite, configFile):
    """Drive one full crawl of *targetWebsite*.

    Pipeline: prepare folder structure and per-site filenames, acquire a
    lock file, parse the config, build reference html series from the
    sample URLs, crawl, classify pages against the similarity threshold,
    extract content via CSS rules, convert the content log to RDF
    N-Triples, and store the triples in AllegroGraph.

    Parameters:
        targetWebsite -- seed URL of the site to crawl.
        configFile    -- path to the kast config file for this site.

    Exits with status -1 if a crawl is already in progress, the config
    cannot be parsed, or the seed URL list is malformed.
    """
    global unseenUrlList
    global BASELOGDIR
    global BASELOCKFILEDIR
    global BASEFILESTORAGEDIR
    global BASEERRORLOGDIR
    global BASECONTENTDIR
    global contentLogFile
    global mode

    # Extract website name
    sitename = KastGenericFunctionsLib.extractWebSiteName(targetWebsite)

    # First generate the folder structure if it does not exist.
    BASELOGDIR = KastGenericFunctionsLib.chkmkFolderStructure(BASELOGDIR)
    BASELOCKFILEDIR = KastGenericFunctionsLib.chkmkFolderStructure(BASELOCKFILEDIR)
    BASEFILESTORAGEDIR = KastGenericFunctionsLib.chkmkFolderStructure(BASEFILESTORAGEDIR + sitename + '/')
    BASEERRORLOGDIR = KastGenericFunctionsLib.chkmkFolderStructure(BASEERRORLOGDIR)
    BASECONTENTDIR = KastGenericFunctionsLib.chkmkFolderStructure(BASECONTENTDIR)

    # Now generate the task/target specific filenames.
    lockFile = BASELOCKFILEDIR + sitename + '.lock'
    errorLog = BASEERRORLOGDIR + sitename + '.error'
    contentLogFile = BASECONTENTDIR + sitename + '-' + str(round(time.time(), 2))

    # Now check if the lock file exists and proceed with crawling.
    if os.path.exists(lockFile):
        KastGenericFunctionsLib.logException(sitename + ' crawl in progress - Exiting - ' + str(time.time()), BASELOGDIR + sitename + '.exit.log')
        sys.exit(-1)

    # Make a lock file (production mode only).
    # NOTE(review): the lock file is never removed in this function —
    # presumably cleanup happens elsewhere; confirm.
    if mode == 'p':
        lf = open(lockFile, 'w')
        lf.close()

    # Read the config file into a Dictionary/Hash structure.
    targetWebsiteConfigs = KastParsersLib.kastConfigFileParser(configFile)
    if targetWebsiteConfigs == {}:
        KastGenericFunctionsLib.logException('Target website configs could not extracted - ' + str(time.time()), errorLog)
        sys.exit(-1)

    # Obtain the list of URLs from the above data structure and generate
    # time domain perfect series representation of html content.
    htmlSeries = [KastParsersLib.html2TagSignal(url) for url in targetWebsiteConfigs['SampleURLS']]

    # Calculate the average similarity measure.
    similarityMeasure = KastParsersLib.calculateThresholdDftDistanceScore(htmlSeries)

    # Populate the unseenUrlList
    unseenUrlList = KastParsersLib.populateUnseenUrlList(targetWebsite, unseenUrlList)
    if unseenUrlList == []:
        # BUG FIX: logException was called without its module prefix
        # (NameError) — every other call site uses
        # KastGenericFunctionsLib.logException.
        KastGenericFunctionsLib.logException('Seed URL List is malformed. Crawl engine is exiting - ' + str(time.time()), errorLog)
        sys.exit(-1)

    # Start crawling
    crawl(targetWebsite)

    # Now apply the Page classification algorithm to preserve only the
    # pages of interest.
    classify(htmlSeries, similarityMeasure)

    # Apply the CSS rules for scrapping content; this will serve as a
    # simple rule engine template.
    contentExtractionRules = targetWebsiteConfigs['ContentExtractionRules']
    extractContent(contentExtractionRules)

    # Convert the log file into an RDF N-Triples file.
    predicateList = targetWebsiteConfigs['PredicateList']
    nTriplesFile = table2RDFNTriplesConverter(contentLogFile, predicateList)

    # Now log all the information to AllegroGraphDB.
    store2db(nTriplesFile)