Python htmldiff Examples, htmlDiff.htmldiff Python Examples

Example #1

0

Show file

File: htmlDiffDriver.py Project: relyt0925/InternetContentAnalysisSystem

def diffDriver(startPath, endPath):
    """Diff each HTML file in startPath against the same-named file in endPath.

    Every '.html' entry listed in startPath is passed, together with the
    file of the same name under endPath, to htmlDiff.htmldiff. The result is
    encoded with jsonpickle and written with json.dump into an output
    directory named '<startDate>_<endDate>/' (relative to the current
    working directory), where each date is the part of the corresponding
    path starting at the first occurrence of '2015'.

    To read a result back:
        jsonpickle.decode(json.load(open(diffOutputDirectory + name)))

    Args:
        startPath: directory holding the HTML files of the start date.
        endPath: directory holding the HTML files of the end date.

    Returns:
        None. Validation failures are reported on stdout and end the call
        early; per-file failures are printed and that file is skipped.
    """
    #Make sure paths exist or else exit
    if not os.path.exists(startPath):
        print("Start Path Does Not Exist")
        return
    if not os.path.exists(endPath):
        print("End Path Does Not Exist")
        return
    startPathFileList = os.listdir(startPath)
    endPathFileList = os.listdir(endPath)
    #Simple check to see if path contains html content, NOTE: CAN STILL FAIL
    #Only checks first in list. An empty directory is rejected here as well
    #instead of raising IndexError on the [0] lookup.
    if not startPathFileList or not startPathFileList[0].endswith('.html'):
        print("Need directory of all historical html files for start date")
        return
    if not endPathFileList or not endPathFileList[0].endswith('.html'):
        print("Need directory of all historical html files for end date")
        return
    #extract date: everything from the first '2015' onwards, without the
    #trailing '/'. Stripping the slash explicitly (rather than always cutting
    #the last character) keeps the date intact when the caller omits it.
    comparedDates = []
    for path in (startPath, endPath):
        trimmedPath = path.rstrip('/')
        dateIndex = trimmedPath.find('2015')
        comparedDates.append(trimmedPath[dateIndex:] if dateIndex != -1 else '')
    #create directory based on the compared dates
    diffOutputDirectory = '_'.join(comparedDates) + '/'
    if not os.path.exists(diffOutputDirectory):
        os.makedirs(diffOutputDirectory)

    #iterate through all files and do diff
    #ONLY does diff if webpage is fetched in both time periods
    for fileName in startPathFileList:
        if not fileName.endswith('.html'):
            continue
        #os.path.join yields the same path as plain concatenation when the
        #directory ends with '/', and a valid one when it does not
        startFile = os.path.join(startPath, fileName)
        endFile = os.path.join(endPath, fileName)
        try:
            #Run files through HTML differ
            diffOutput = htmlDiff.htmldiff(startFile, endFile)
            #Serialize to file; the with-block closes the handle even when
            #json.dump raises
            diffOutputEncoded = jsonpickle.encode(diffOutput)
            with open(os.path.join(diffOutputDirectory, fileName), 'w') as outfile:
                json.dump(diffOutputEncoded, outfile)
        except Exception as e:
            #most likely due to webpage not fetched in time, or beautiful soup erroring because
            #the webpage is not properly formatted; report it and move on to the next file
            print(e)
            continue
    return

Example #2

0

Show file

File: htmlDiffDriver.py Project: relyt0925/InternetContentAnalysisSystem

def diffDriver(startPath, endPath):
    """Diff each HTML file in startPath against the same-named file in endPath.

    Every '.html' entry listed in startPath is passed, together with the
    file of the same name under endPath, to htmlDiff.htmldiff. The result is
    encoded with jsonpickle and written with json.dump into an output
    directory named '<startDate>_<endDate>/' (relative to the current
    working directory), where each date is the part of the corresponding
    path starting at the first occurrence of '2015'.

    To read a result back:
        jsonpickle.decode(json.load(open(diffOutputDirectory + name)))

    Args:
        startPath: directory holding the HTML files of the start date.
        endPath: directory holding the HTML files of the end date.

    Returns:
        None. Validation failures are reported on stdout and end the call
        early; per-file failures are printed and that file is skipped.
    """
    #Make sure paths exist or else exit
    if not os.path.exists(startPath):
        print("Start Path Does Not Exist")
        return
    if not os.path.exists(endPath):
        print("End Path Does Not Exist")
        return
    startPathFileList = os.listdir(startPath)
    endPathFileList = os.listdir(endPath)
    #Simple check to see if path contains html content, NOTE: CAN STILL FAIL
    #Only checks first in list. An empty directory is rejected here as well
    #instead of raising IndexError on the [0] lookup.
    if not startPathFileList or not startPathFileList[0].endswith('.html'):
        print("Need directory of all historical html files for start date")
        return
    if not endPathFileList or not endPathFileList[0].endswith('.html'):
        print("Need directory of all historical html files for end date")
        return
    #extract date: everything from the first '2015' onwards, without the
    #trailing '/'. Stripping the slash explicitly (rather than always cutting
    #the last character) keeps the date intact when the caller omits it.
    comparedDates = []
    for path in (startPath, endPath):
        trimmedPath = path.rstrip('/')
        dateIndex = trimmedPath.find('2015')
        comparedDates.append(trimmedPath[dateIndex:] if dateIndex != -1 else '')
    #create directory based on the compared dates
    diffOutputDirectory = '_'.join(comparedDates) + '/'
    if not os.path.exists(diffOutputDirectory):
        os.makedirs(diffOutputDirectory)

    #iterate through all files and do diff
    #ONLY does diff if webpage is fetched in both time periods
    for fileName in startPathFileList:
        if not fileName.endswith('.html'):
            continue
        #os.path.join yields the same path as plain concatenation when the
        #directory ends with '/', and a valid one when it does not
        startFile = os.path.join(startPath, fileName)
        endFile = os.path.join(endPath, fileName)
        try:
            #Run files through HTML differ
            diffOutput = htmlDiff.htmldiff(startFile, endFile)
            #Serialize to file; the with-block closes the handle even when
            #json.dump raises
            diffOutputEncoded = jsonpickle.encode(diffOutput)
            with open(os.path.join(diffOutputDirectory, fileName), 'w') as outfile:
                json.dump(diffOutputEncoded, outfile)
        except Exception as e:
            #most likely due to webpage not fetched in time, or beautiful soup erroring because
            #the webpage is not properly formatted; report it and move on to the next file
            print(e)
            continue
    return

Example #3

0

Show file

File: features.py Project: relyt0925/InternetContentAnalysisSystem

        f4 = specialCharRatio(change)
        f5 = getGSB(change)
        f6 = jsEval(change)
        featureRow = []
        featureRow.append(f1)
        featureRow.append(f2)
        featureRow.append(f3)
        featureRow.append(f4)
        featureRow.append(f5)
        featureRow.append(f6)
        features.append(featureRow)

    return features

# Script entry point: diff two hard-coded ground-truth snapshots and extract
# features from the result.
if __name__ == "__main__":
    #file1 = sys.argv[1]
    #file2 = sys.argv[2]
    file1 = 'ground_truth/malware_traffic_analysis/2015_10_20_before'
    file2 = 'ground_truth/malware_traffic_analysis/2015_10_20_after'
    diff = htmlDiff.htmldiff(file1, file2)
    # NOTE(review): rstrip() returns a new string and the result is discarded,
    # and outputDict is not defined anywhere in the visible code, so this step
    # currently has no effect on `diff` -- confirm what was intended.
    # Failures are still ignored (best effort), but only ordinary exceptions:
    # Ctrl-C and SystemExit are no longer swallowed as with a bare except.
    for key in ('afterText', 'rawChange'):
        try:
            outputDict[key].rstrip()
        except Exception:
            pass
    features = getFeatures(diff)

Example #4

0

Show file

        f4 = specialCharRatio(change)
        f5 = getGSB(change)
        f6 = jsEval(change)
        featureRow = []
        featureRow.append(f1)
        featureRow.append(f2)
        featureRow.append(f3)
        featureRow.append(f4)
        featureRow.append(f5)
        featureRow.append(f6)
        features.append(featureRow)

    return features


# Script entry point: diff two hard-coded ground-truth snapshots and extract
# features from the result.
if __name__ == "__main__":
    #file1 = sys.argv[1]
    #file2 = sys.argv[2]
    file1 = 'ground_truth/malware_traffic_analysis/2015_10_20_before'
    file2 = 'ground_truth/malware_traffic_analysis/2015_10_20_after'
    diff = htmlDiff.htmldiff(file1, file2)
    # NOTE(review): rstrip() returns a new string and the result is discarded,
    # and outputDict is not defined anywhere in the visible code, so this step
    # currently has no effect on `diff` -- confirm what was intended.
    # Failures are still ignored (best effort), but only ordinary exceptions:
    # Ctrl-C and SystemExit are no longer swallowed as with a bare except.
    for key in ('afterText', 'rawChange'):
        try:
            outputDict[key].rstrip()
        except Exception:
            pass
    features = getFeatures(diff)