Example #1
def extractContent(rules):

  global contentLogFile

  # Now obtain a list of all the files from the content folder.

  listOfFiles = os.listdir(BASEFILESTORAGEDIR)
  listOfFiles = [BASEFILESTORAGEDIR + l for l in listOfFiles]

  records = []

  # Now loop through the files and apply the rules

  for f in listOfFiles:

    # Read the gzipped file

    g = gzip.open(f, 'rb')
    c = g.read()
    g.close()

    record = []

    # Restore '/' in place of '[kastSlash]' before the data is stored and processed.

    f = f.replace('[kastSlash]', '/')

    # Append the name of the file, since it serves as the value for the product location field.

    record.append(f.split('/')[-1])

    # Get a jQuery-style $ object for this HTML page (parse it once per file).

    d = pq(c)

    # Now apply the rules serially and extract content.

    for r in rules:

      # Apply the CSS selector.

      ele = d(r)

      # Store the obtained text in an array.

      record.append(ele.text())

    # Now append the record to records.

    records.append(record)

  # Now write all the records to a designated content log file.

  KastGenericFunctionsLib.writeToDisk(contentLogFile, records)
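
A minimal usage sketch for extractContent, assuming BASEFILESTORAGEDIR, contentLogFile and the Kast helper libraries have already been initialised as in the other examples. The selectors below are hypothetical; in practice the rules come from the 'ContentExtractionRules' entry of the target website's config (see Example #3).

# Hypothetical CSS selectors; each one contributes one column per record,
# after the leading file-name / product-location column.
sampleRules = ['h1.product-title', 'span.price', 'div.description']

extractContent(sampleRules)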
Example #2
def classify(htmlSeries, sm):

  global BASEFILESTORAGEDIR

  # Make the folder for useless pages.

  uselessPagesFolder = chkmkFolderStructure(BASEFILESTORAGEDIR + '/useless/')

  # List all the files in the folder.

  listOfFiles = os.listdir(BASEFILESTORAGEDIR)
  listOfFiles = [BASEFILESTORAGEDIR + p for p in listOfFiles]

  # Now process every file; shuffle first so files are visited in random order,
  # but each file is classified exactly once.

  random.shuffle(listOfFiles)

  for page in listOfFiles:

    # Extract the content of the file

    c = gzip.open(page, 'rb')
    contents = c.read()
    c.close()

    # Write to a tmp file.

    tmpFilename = '/tmp/' + page.split('/')[-1]
    f = open(tmpFilename, 'w')
    f.write(contents)
    f.close()

    # Generate html series of this file, tphs --> testPageHtmlSeries

    tphsUrl = 'file://' + tmpFilename
    tphs = KastParsersLib.html2TagSignal(tphsUrl)

    # dftDistance scoreboard

    dftDistanceScoreboard = []

    for d in htmlSeries:

      # Now calculate the score and append it to the array.

      dftDistanceScoreboard.append(KastParsersLib.dftDistance(tphs, d))

    # Now calculate average.

    s = KastGenericFunctionsLib.calcAvg(dftDistanceScoreboard)

    # If the score is less than the mean similarity measure, move the page to the useless folder.

    if s < sm:
      os.system('mv ' + page + ' ' + uselessPagesFolder)
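
The scoring above relies on KastParsersLib.dftDistance, whose implementation is not shown here. The following is only a sketch of one plausible interpretation, assuming each tag signal is a numeric sequence: compare the magnitude spectra of the two signals with NumPy's FFT and return the Euclidean distance between them.

import numpy as np

def dftDistanceSketch(seriesA, seriesB):
  # Hypothetical illustration only, not the actual KastParsersLib.dftDistance.
  specA = np.abs(np.fft.fft(np.asarray(seriesA, dtype=float)))
  specB = np.abs(np.fft.fft(np.asarray(seriesB, dtype=float)))
  # Truncate to a common length so the two spectra are comparable.
  n = min(len(specA), len(specB))
  return float(np.linalg.norm(specA[:n] - specB[:n]))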
Example #3
def main(targetWebsite, configFile):

  global unseenUrlList
  global BASELOGDIR
  global BASELOCKFILEDIR
  global BASEFILESTORAGEDIR
  global BASEERRORLOGDIR
  global BASECONTENTDIR
  global contentLogFile
  global mode

  # Extract website name

  sitename = KastGenericFunctionsLib.extractWebSiteName(targetWebsite)

  # First generate the folder structure if it does not exist.

  BASELOGDIR = KastGenericFunctionsLib.chkmkFolderStructure(BASELOGDIR)
  BASELOCKFILEDIR = KastGenericFunctionsLib.chkmkFolderStructure(BASELOCKFILEDIR)
  BASEFILESTORAGEDIR = KastGenericFunctionsLib.chkmkFolderStructure(BASEFILESTORAGEDIR + sitename + '/')
  BASEERRORLOGDIR = KastGenericFunctionsLib.chkmkFolderStructure(BASEERRORLOGDIR)
  BASECONTENTDIR = KastGenericFunctionsLib.chkmkFolderStructure(BASECONTENTDIR)

  # Now generate the task/target specific filenames.

  lockFile = BASELOCKFILEDIR + sitename + '.lock'
  errorLog = BASEERRORLOGDIR + sitename + '.error'
  contentLogFile = BASECONTENTDIR + sitename + '-' + str(round(time.time(), 2))

  # Now check whether a lock file already exists; if it does, another crawl is in progress, so exit.

  if os.path.exists(lockFile):
    KastGenericFunctionsLib.logException(sitename + ' crawl in progress - Exiting - ' + str(time.time()), BASELOGDIR + sitename + '.exit.log')
    sys.exit(-1)

  # Make a lock file.

  if mode == 'p':

    lf = open(lockFile, 'w')
    lf.close()

  # Read the config file into a Dictionary/Hash structure.

  targetWebsiteConfigs = KastParsersLib.kastConfigFileParser(configFile)

  if targetWebsiteConfigs == {}:

    KastGenericFunctionsLib.logException('Target website configs could not be extracted - ' + str(time.time()), errorLog)
    sys.exit(-1)

  # Obtain the list of sample URLs from the above data structure and generate a
  # time-domain tag-signal series representation of each page's HTML content.

  htmlSeries = [KastParsersLib.html2TagSignal(url) for url in targetWebsiteConfigs['SampleURLS']]

  # Calculate the average similarity measure.

  similarityMeasure = KastParsersLib.calculateThresholdDftDistanceScore(htmlSeries)

  # Populate the unseenUrlList

  unseenUrlList = KastParsersLib.populateUnseenUrlList(targetWebsite, unseenUrlList)
  if unseenUrlList == []:
    KastGenericFunctionsLib.logException('Seed URL List is malformed. Crawl engine is exiting - ' + str(time.time()), errorLog)
    sys.exit(-1)

  # Start crawling

  crawl(targetWebsite)

  # Now apply the Page classification algorithm to preserve only the pages of interest.

  classify(htmlSeries, similarityMeasure)

  # Apply the CSS rules for scraping content; this serves as a simple rule-engine template.

  contentExtractionRules = targetWebsiteConfigs['ContentExtractionRules']

  extractContent(contentExtractionRules)

  # Convert the content log file into an RDF N-Triples file.

  predicateList = targetWebsiteConfigs['PredicateList']

  nTriplesFile = table2RDFNTriplesConverter(contentLogFile, predicateList)

  # Now store all the information in AllegroGraph DB.

  store2db(nTriplesFile)
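
KastParsersLib.calculateThresholdDftDistanceScore is likewise not shown. A minimal sketch, under the assumption that the threshold is simply the mean pairwise DFT distance between the sample-page signals (reusing the dftDistanceSketch above, or KastParsersLib.dftDistance itself):

import itertools

def calculateThresholdSketch(htmlSeries):
  # Hypothetical illustration only: average the DFT distance over every pair
  # of sample-page tag signals and use that as the similarity threshold.
  scores = [dftDistanceSketch(a, b)
            for a, b in itertools.combinations(htmlSeries, 2)]
  if not scores:
    return 0.0
  return sum(scores) / float(len(scores))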
Example #4
def crawl(targetWebsite):

  global sitename
  global errorLog
  global unseenUrlList
  global visitedUrlList
  global BASEFILESTORAGEDIR

  # Now start the crawling routine.

  while True:

    if unseenUrlList != []:

      # Choose a page randomly

      page = random.choice(unseenUrlList)

      # Fetch the content.

      r = KastParsersLib.fetchURL(page)

      # Clean the content.

      r = KastParsersLib.cleanHtml(r)

      # Write the content to a file, in the designated folder.

      filename = KastGenericFunctionsLib.extractWebSiteName(page) + '-' + str(round(time.time(), 2))
      # Replace all '/' with [kastSlash]
      filename = filename.replace('/', '[kastSlash]')
      f = gzip.open(BASEFILESTORAGEDIR + filename + '.gz', 'wb')
      f.write(r)
      f.close()

      # Convert to DOM and apply the CSS rule engine

      d = pq(r)
      ele_a = d('a')

      # Extract the hyperlinks

      links_a = KastParsersLib.extractHyperlinks(ele_a)

      # Convert to absolute links.

      unseenUrlListTmp = KastParsersLib.convert2AbsoluteHyperlinks(links_a, targetWebsite)

      # Add any links that have not been visited yet to the unseen URL list.

      for link in unseenUrlListTmp:
        if link not in visitedUrlList:
          unseenUrlList.append(link)

      # Now append this page processed to visited URLs list.

      visitedUrlList.append(page)

      # Now remove the same link from unseenUrlList.

      unseenUrlList.remove(page)

      # Debug ON, turn off in production.

      pdb.set_trace()

      # Condition to end the crawl.

      if unseenUrlList == []:
        return
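
KastParsersLib.extractHyperlinks and convert2AbsoluteHyperlinks are library calls whose bodies are not shown either. A rough sketch of what the link handling might look like, assuming the pyquery selection d('a') on the input side and urljoin-based resolution that keeps only links on the target website's host:

import urlparse  # Python 2; use urllib.parse under Python 3

def extractHyperlinksSketch(ele_a):
  # Hypothetical illustration only: pull the href attribute from each <a> element.
  return [a.get('href') for a in ele_a if a.get('href')]

def convert2AbsoluteHyperlinksSketch(links, targetWebsite):
  # Hypothetical illustration only: resolve relative links against the target
  # website and keep only URLs that stay on the same host.
  baseHost = urlparse.urlparse(targetWebsite).netloc
  absoluteLinks = []
  for href in links:
    fullUrl = urlparse.urljoin(targetWebsite, href)
    if urlparse.urlparse(fullUrl).netloc == baseHost:
      absoluteLinks.append(fullUrl)
  return absoluteLinks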