def classify(htmlSeries, sm):
    """Classify every crawled page under BASEFILESTORAGEDIR against the
    reference html series; pages whose average dftDistance score falls
    below the similarity threshold are moved to the 'useless' folder.

    Parameters:
        htmlSeries -- list of tag-signal series for the sample URLs
                      (as produced by KastParsersLib.html2TagSignal).
        sm         -- similarity-measure threshold; pages scoring below
                      it are considered useless.
    """
    import shutil  # local import: used only for the file move below

    global BASEFILESTORAGEDIR

    # Make the useless pages folder.
    uselessPagesFolder = KastGenericFunctionsLib.chkmkFolderStructure(BASEFILESTORAGEDIR + '/useless/')

    # List all the files in the storage folder.
    listOfFiles = [BASEFILESTORAGEDIR + p for p in os.listdir(BASEFILESTORAGEDIR)]

    # Process every file exactly once.
    # BUG FIX: the original drew `random.choice(listOfFiles)` on each of
    # len(listOfFiles) iterations, which can classify the same page several
    # times and never look at others.
    for page in listOfFiles:
        # Extract the gzipped content of the file.
        c = gzip.open(page, 'rb')
        try:
            contents = c.read()
        finally:
            c.close()

        # Write to a tmp file so it can be referenced via a file:// URL.
        # 'wb' because the gzip payload is raw bytes.
        tmpFilename = '/tmp/' + page.split('/')[-1]
        f = open(tmpFilename, 'wb')
        try:
            f.write(contents)
        finally:
            f.close()

        # Generate html series of this file, tphs --> testPageHtmlSeries
        tphsUrl = 'file://' + tmpFilename
        tphs = KastParsersLib.html2TagSignal(tphsUrl)

        # Score this page's series against every reference series.
        dftDistanceScoreboard = [KastParsersLib.dftDistance(tphs, d) for d in htmlSeries]

        # Now calculate the average score.
        s = KastGenericFunctionsLib.calcAvg(dftDistanceScoreboard)

        # Score is less than mean similarity measure: move it to the
        # useless folder.
        # BUG FIX: os.system(page, uselessPagesFolder) raised TypeError —
        # os.system accepts a single command string. shutil.move performs
        # the intended relocation portably.
        if s < sm:
            shutil.move(page, uselessPagesFolder)
def main(targetWebsite, configFile):
    """Drive one full crawl of *targetWebsite*.

    Pipeline: prepare folder structure and per-site filenames, acquire a
    lock file, parse the config, build reference html series from the
    sample URLs, crawl, classify pages against the similarity threshold,
    extract content via CSS rules, convert the content log to RDF
    N-Triples, and store the triples in AllegroGraph.

    Parameters:
        targetWebsite -- seed URL of the site to crawl.
        configFile    -- path to the kast config file for this site.

    Exits with status -1 if a crawl is already in progress, the config
    cannot be parsed, or the seed URL list is malformed.
    """
    global unseenUrlList
    global BASELOGDIR
    global BASELOCKFILEDIR
    global BASEFILESTORAGEDIR
    global BASEERRORLOGDIR
    global BASECONTENTDIR
    global contentLogFile
    global mode

    # Extract website name
    sitename = KastGenericFunctionsLib.extractWebSiteName(targetWebsite)

    # First generate the folder structure if it does not exist.
    BASELOGDIR = KastGenericFunctionsLib.chkmkFolderStructure(BASELOGDIR)
    BASELOCKFILEDIR = KastGenericFunctionsLib.chkmkFolderStructure(BASELOCKFILEDIR)
    BASEFILESTORAGEDIR = KastGenericFunctionsLib.chkmkFolderStructure(BASEFILESTORAGEDIR + sitename + '/')
    BASEERRORLOGDIR = KastGenericFunctionsLib.chkmkFolderStructure(BASEERRORLOGDIR)
    BASECONTENTDIR = KastGenericFunctionsLib.chkmkFolderStructure(BASECONTENTDIR)

    # Now generate the task/target specific filenames.
    lockFile = BASELOCKFILEDIR + sitename + '.lock'
    errorLog = BASEERRORLOGDIR + sitename + '.error'
    contentLogFile = BASECONTENTDIR + sitename + '-' + str(round(time.time(), 2))

    # Now check if the lock file exists and proceed with crawling.
    if os.path.exists(lockFile):
        KastGenericFunctionsLib.logException(sitename + ' crawl in progress - Exiting - ' + str(time.time()), BASELOGDIR + sitename + '.exit.log')
        sys.exit(-1)

    # Make a lock file (production mode only).
    # NOTE(review): the lock file is never removed in this function —
    # presumably cleanup happens elsewhere; confirm.
    if mode == 'p':
        lf = open(lockFile, 'w')
        lf.close()

    # Read the config file into a Dictionary/Hash structure.
    targetWebsiteConfigs = KastParsersLib.kastConfigFileParser(configFile)
    if targetWebsiteConfigs == {}:
        KastGenericFunctionsLib.logException('Target website configs could not extracted - ' + str(time.time()), errorLog)
        sys.exit(-1)

    # Obtain the list of URLs from the above data structure and generate
    # time domain perfect series representation of html content.
    htmlSeries = [KastParsersLib.html2TagSignal(url) for url in targetWebsiteConfigs['SampleURLS']]

    # Calculate the average similarity measure.
    similarityMeasure = KastParsersLib.calculateThresholdDftDistanceScore(htmlSeries)

    # Populate the unseenUrlList
    unseenUrlList = KastParsersLib.populateUnseenUrlList(targetWebsite, unseenUrlList)
    if unseenUrlList == []:
        # BUG FIX: logException was called without its module prefix
        # (NameError) — every other call site uses
        # KastGenericFunctionsLib.logException.
        KastGenericFunctionsLib.logException('Seed URL List is malformed. Crawl engine is exiting - ' + str(time.time()), errorLog)
        sys.exit(-1)

    # Start crawling
    crawl(targetWebsite)

    # Now apply the Page classification algorithm to preserve only the
    # pages of interest.
    classify(htmlSeries, similarityMeasure)

    # Apply the CSS rules for scrapping content; this will serve as a
    # simple rule engine template.
    contentExtractionRules = targetWebsiteConfigs['ContentExtractionRules']
    extractContent(contentExtractionRules)

    # Convert the log file into an RDF N-Triples file.
    predicateList = targetWebsiteConfigs['PredicateList']
    nTriplesFile = table2RDFNTriplesConverter(contentLogFile, predicateList)

    # Now log all the information to AllegroGraphDB.
    store2db(nTriplesFile)