def extractContent(rules):
    global contentLogFile

    # Obtain a list of all the files from the content folder.
    listOfFiles = os.listdir(BASEFILESTORAGEDIR)
    listOfFiles = [BASEFILESTORAGEDIR + l for l in listOfFiles]

    records = []

    # Loop through the files and apply the rules.
    for f in listOfFiles:
        # Read the gzipped file.
        g = gzip.open(f, 'rb')
        c = g.read()
        g.close()

        record = []

        # Replace [kastSlash] with '/' before storing and processing the data.
        f = f.replace('[kastSlash]', '/')

        # Append the name of the file, because it serves as the value for the product location.
        record.append(f.split('/')[-1])

        # Build a jQuery-style $ object for this html page once per file.
        d = pq(c)

        # Apply the rules serially and extract content.
        for r in rules:
            # Apply the CSS selector.
            ele = d(r)
            # Store the extracted text in the record.
            record.append(ele.text())

        # Append the record to records.
        records.append(record)

    # Write all the records to the designated content log file.
    KastGenericFunctionsLib.writeToDisk(contentLogFile, records)
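# Illustrative sketch, not part of the crawler: how a single CSS rule from the config
# maps onto one field of a record in extractContent() above. The sample HTML and the
# two selector strings are hypothetical; only the pq(html)(rule).text() pattern is
# taken from the function itself.
def _exampleRuleApplication():
    sampleHtml = ('<html><body><h1 class="title">Acme Widget</h1>'
                  '<span id="price">9.99</span></body></html>')
    sampleRules = ['h1.title', '#price']
    d = pq(sampleHtml)
    # One extracted text value per rule, in rule order, mirroring record.append(...).
    return [d(r).text() for r in sampleRules]  # ['Acme Widget', '9.99']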
def classify(htmlSeries, sm):
    global BASEFILESTORAGEDIR

    # Make the useless-pages folder.
    uselessPagesFolder = KastGenericFunctionsLib.chkmkFolderStructure(BASEFILESTORAGEDIR + 'useless/')

    # List all the files in the folder.
    listOfFiles = os.listdir(BASEFILESTORAGEDIR)
    listOfFiles = [BASEFILESTORAGEDIR + p for p in listOfFiles]

    # Loop over and process every stored page.
    for page in listOfFiles:
        # Skip anything that is not a regular file (e.g. the useless folder itself).
        if not os.path.isfile(page):
            continue

        # Extract the content of the file.
        c = gzip.open(page, 'rb')
        contents = c.read()
        c.close()

        # Write the content to a tmp file.
        tmpFilename = '/tmp/' + page.split('/')[-1]
        f = open(tmpFilename, 'wb')
        f.write(contents)
        f.close()

        # Generate the html series of this file, tphs --> testPageHtmlSeries.
        tphsUrl = 'file://' + tmpFilename
        tphs = KastParsersLib.html2TagSignal(tphsUrl)

        # dftDistance scoreboard.
        dftDistanceScoreboard = []
        for d in htmlSeries:
            # Calculate the score against each sample series and record it.
            dftDistanceScoreboard.append(KastParsersLib.dftDistance(tphs, d))

        # Calculate the average score for this page.
        s = KastGenericFunctionsLib.calcAvg(dftDistanceScoreboard)

        # If the score is below the mean similarity measure, move the page to the useless folder.
        if s < sm:
            os.rename(page, uselessPagesFolder + page.split('/')[-1])
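# Illustrative sketch, under stated assumptions: KastParsersLib.dftDistance is not defined
# in this file. Because classify() discards pages whose score falls below the mean
# similarity measure, this sketch returns a similarity-style score (cosine similarity of
# the DFT magnitude spectra of two tag signals, higher = more alike). The numpy usage,
# the truncation to a common length, and the similarity interpretation are all assumptions;
# the real helper may instead return a true distance.
def _exampleDftDistance(signalA, signalB):
    import numpy as np
    n = min(len(signalA), len(signalB))
    specA = np.abs(np.fft.fft(signalA[:n]))
    specB = np.abs(np.fft.fft(signalB[:n]))
    denom = np.linalg.norm(specA) * np.linalg.norm(specB)
    return float(np.dot(specA, specB) / denom) if denom else 0.0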
def main(targetWebsite, configFile):
    global unseenUrlList
    global BASELOGDIR
    global BASELOCKFILEDIR
    global BASEFILESTORAGEDIR
    global BASEERRORLOGDIR
    global BASECONTENTDIR
    global contentLogFile
    global mode

    # Extract the website name.
    sitename = KastGenericFunctionsLib.extractWebSiteName(targetWebsite)

    # First generate the folder structure if it does not exist.
    BASELOGDIR = KastGenericFunctionsLib.chkmkFolderStructure(BASELOGDIR)
    BASELOCKFILEDIR = KastGenericFunctionsLib.chkmkFolderStructure(BASELOCKFILEDIR)
    BASEFILESTORAGEDIR = KastGenericFunctionsLib.chkmkFolderStructure(BASEFILESTORAGEDIR + sitename + '/')
    BASEERRORLOGDIR = KastGenericFunctionsLib.chkmkFolderStructure(BASEERRORLOGDIR)
    BASECONTENTDIR = KastGenericFunctionsLib.chkmkFolderStructure(BASECONTENTDIR)

    # Generate the task/target specific filenames.
    lockFile = BASELOCKFILEDIR + sitename + '.lock'
    errorLog = BASEERRORLOGDIR + sitename + '.error'
    contentLogFile = BASECONTENTDIR + sitename + '-' + str(round(time.time(), 2))

    # Check whether the lock file exists before proceeding with the crawl.
    if os.path.exists(lockFile):
        KastGenericFunctionsLib.logException(sitename + ' crawl in progress - Exiting - ' + str(time.time()), BASELOGDIR + sitename + '.exit.log')
        sys.exit(-1)

    # Make a lock file.
    if mode == 'p':
        lf = open(lockFile, 'w')
        lf.close()

    # Read the config file into a Dictionary/Hash structure.
    targetWebsiteConfigs = KastParsersLib.kastConfigFileParser(configFile)
    if targetWebsiteConfigs == {}:
        KastGenericFunctionsLib.logException('Target website configs could not be extracted - ' + str(time.time()), errorLog)
        sys.exit(-1)

    # Obtain the list of sample URLs from the above data structure and generate a time-domain
    # series representation of the html content of each sample page.
    htmlSeries = [KastParsersLib.html2TagSignal(url) for url in targetWebsiteConfigs['SampleURLS']]

    # Calculate the average similarity measure.
    similarityMeasure = KastParsersLib.calculateThresholdDftDistanceScore(htmlSeries)

    # Populate the unseenUrlList.
    unseenUrlList = KastParsersLib.populateUnseenUrlList(targetWebsite, unseenUrlList)
    if unseenUrlList == []:
        KastGenericFunctionsLib.logException('Seed URL List is malformed. Crawl engine is exiting - ' + str(time.time()), errorLog)
        sys.exit(-1)

    # Start crawling.
    crawl(targetWebsite)

    # Apply the page classification algorithm to preserve only the pages of interest.
    classify(htmlSeries, similarityMeasure)

    # Apply the CSS rules for scraping content; this serves as a simple rule engine template.
    contentExtractionRules = targetWebsiteConfigs['ContentExtractionRules']
    extractContent(contentExtractionRules)

    # Convert the content log file into an RDF N-Triples file.
    predicateList = targetWebsiteConfigs['PredicateList']
    nTriplesFile = table2RDFNTriplesConverter(contentLogFile, predicateList)

    # Log all the information to the AllegroGraph DB store.
    store2db(nTriplesFile)
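# Illustrative sketch, under stated assumptions: KastParsersLib.calculateThresholdDftDistanceScore
# is not shown in this file. Since main() passes its result to classify() as the mean similarity
# measure, one plausible implementation averages the pairwise dftDistance scores over all distinct
# pairs of sample-page series. The pairing scheme is an assumption; only the call to
# KastParsersLib.dftDistance mirrors the library usage above.
def _exampleThresholdScore(htmlSeries):
    scores = []
    for i in range(len(htmlSeries)):
        for j in range(i + 1, len(htmlSeries)):
            scores.append(KastParsersLib.dftDistance(htmlSeries[i], htmlSeries[j]))
    return sum(scores) / len(scores) if scores else 0.0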
def crawl(targetWebsite):
    global sitename
    global errorLog
    global unseenUrlList
    global visitedUrlList
    global BASEFILESTORAGEDIR

    # Start the crawling routine.
    while True:
        if unseenUrlList != []:
            # Choose a page randomly.
            page = random.choice(unseenUrlList)

            # Fetch the content.
            r = KastParsersLib.fetchURL(page)

            # Clean the content.
            r = KastParsersLib.cleanHtml(r)

            # Write the content to a file in the designated folder.
            filename = KastGenericFunctionsLib.extractWebSiteName(page) + '-' + str(round(time.time(), 2))

            # Replace all '/' with [kastSlash] so the URL can be used as a filename.
            filename = filename.replace('/', '[kastSlash]')

            f = gzip.open(BASEFILESTORAGEDIR + filename + '.gz', 'wb')
            f.write(r)
            f.close()

            # Convert to a DOM and apply the CSS rule engine.
            d = pq(r)
            ele_a = d('a')

            # Extract the hyperlinks.
            links_a = KastParsersLib.extractHyperlinks(ele_a)

            # Convert to absolute links.
            unseenUrlListTmp = KastParsersLib.convert2AbsoluteHyperlinks(links_a, targetWebsite)

            # Add only those links that do not already exist in the visited URL list.
            for link in unseenUrlListTmp:
                if link not in visitedUrlList:
                    unseenUrlList.append(link)

            # Append the processed page to the visited URLs list.
            visitedUrlList.append(page)

            # Remove the same link from unseenUrlList.
            unseenUrlList.remove(page)

        # Condition to end the crawl.
        # Debug ON, turn off in production.
        pdb.set_trace()
        if unseenUrlList == []:
            return
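# Illustrative sketch, under stated assumptions: a command-line entry point matching the
# main(targetWebsite, configFile) signature above. Whether the project actually invokes this
# module from the command line, and in this argument order, are assumptions.
if __name__ == '__main__':
    if len(sys.argv) != 3:
        sys.stderr.write('Usage: %s <targetWebsite> <configFile>\n' % sys.argv[0])
        sys.exit(1)
    main(sys.argv[1], sys.argv[2])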