Ejemplo n.º 1
0
def processDayAnonymous(handler,
                        day,
                        month,
                        monthsFolder,
                        startIdx=0,
                        endIdx=sys.maxint,
                        notifications=True):
    """Feed the anonymous query log of a single day to ``handler``.

    The gzipped, tab-separated file for ``day`` is looked up below
    ``monthsFolder``/``month``. Every row whose zero-based index lies in
    ``[startIdx, endIdx]`` gets its ``Valid`` field set to ``'VALID'`` and
    is handed to ``handler.handle`` together with the URL-decoded content
    of its ``#anonymizedQuery`` column. Reading stops at the first row past
    ``endIdx``.

    If ``notifications`` is true, the file name is printed before reading.
    """
    monthDir = utility.addMissingSlash(monthsFolder) \
        + utility.addMissingSlash(month)
    dayFile = monthDir + anonymousDataFolder + anonymousFilePrefix \
        + "%02d" % day + anonymousFileSuffix

    if notifications:
        print("Working on: " + dayFile)

    with gzip.open(dayFile) as dayLog:
        rows = csv.DictReader(dayLog, delimiter="\t")
        for rowIdx, row in enumerate(rows):
            if rowIdx > endIdx:
                # Nothing of interest can follow anymore.
                break
            if rowIdx < startIdx:
                continue

            decodedQuery = urllib.unquote_plus(row['#anonymizedQuery'])
            row['Valid'] = 'VALID'
            handler.handle(decodedQuery, row)
Ejemplo n.º 2
0
def processMonth(handler,
                 month,
                 monthsFolder,
                 anonymous=False,
                 notifications=True):
    """Run ``handler`` over every day file found for ``month``.

    Day files are discovered by globbing for
    ``<monthsFolder>/<month>/<folder><prefix>*<suffix>``; the part matched by
    ``*`` is interpreted as the day number. With ``anonymous`` set, the
    anonymous data files are read through ``processDayAnonymous``, otherwise
    the processed files are read through ``processDay``.

    Files are visited in sorted file-name order, because ``glob.glob``
    itself makes no promise about ordering and handlers would otherwise see
    the days in a filesystem-dependent sequence.
    """
    if anonymous:
        folderToSearch = anonymousDataFolder
        prefixToSearch = anonymousFilePrefix
        suffixToSearch = anonymousFileSuffix
        processSingleDay = processDayAnonymous
    else:
        folderToSearch = processedFolder
        prefixToSearch = processedPrefix
        suffixToSearch = processedSuffix
        processSingleDay = processDay

    pattern = utility.addMissingSlash(monthsFolder) \
        + utility.addMissingSlash(month) \
        + folderToSearch + prefixToSearch + "*" + suffixToSearch

    for filename in sorted(glob.glob(pattern)):
        basename = os.path.basename(filename)
        # Explicit end index instead of [:-len(suffix)], which would yield
        # an empty string for an empty suffix.
        day = basename[len(prefixToSearch):len(basename) - len(suffixToSearch)]
        processSingleDay(handler,
                         int(day),
                         month,
                         monthsFolder,
                         notifications=notifications)
Ejemplo n.º 3
0
def processDay(handler,
               day,
               month,
               monthsFolder,
               startIdx=0,
               endIdx=sys.maxint,
               notifications=True):
    """Feed the processed log of a single day to ``handler``.

    The processed file and the matching raw log file (``rawLogData/``) of
    ``day`` are read in lockstep. For every row pair whose zero-based index
    lies in ``[startIdx, endIdx]`` the raw row's ``hour``, ``user_agent``,
    ``http_status`` and ``ts`` (stored as both ``timestamp`` and ``ts``) are
    copied onto the processed row, ``day`` is added, and the row is passed
    to ``handler.handle`` along with the ``query`` request parameter taken
    from the raw ``uri_query`` (``None`` if there is no such parameter).
    Reading stops at the first row past ``endIdx``.

    If ``notifications`` is true, the processed file name is printed first.
    """
    monthDir = utility.addMissingSlash(monthsFolder) \
        + utility.addMissingSlash(month)
    processedFile = monthDir + processedFolder + processedPrefix \
        + "%02d" % day + processedSuffix

    if notifications:
        print("Working on: " + processedFile)

    rawFile = monthDir + "rawLogData/" + sourcePrefix + "%02d" % day \
        + ".tsv.gz"

    with gzip.open(processedFile) as processedLog, \
            gzip.open(rawFile) as rawLog:
        rowPairs = izip(csv.DictReader(processedLog, delimiter="\t"),
                        csv.DictReader(rawLog, delimiter="\t"))

        for rowIdx, (processed, source) in enumerate(rowPairs):
            if rowIdx > endIdx:
                break
            if rowIdx < startIdx:
                continue

            # ';' is escaped before parsing - presumably so that it is not
            # taken as a parameter separator inside the query string.
            queryString = urlparse.urlsplit(source['uri_query']).query
            requestParameters = dict(
                urlparse.parse_qsl(queryString.replace(';', "%3B")))
            sparqlQuery = requestParameters.get('query')

            processed['hour'] = source['hour']
            processed['day'] = day
            processed['user_agent'] = source['user_agent']
            processed['http_status'] = source['http_status']
            processed['timestamp'] = source['ts']
            processed['ts'] = source['ts']
            handler.handle(sparqlQuery, processed)
Ejemplo n.º 4
0
def joinMonth(month,
              monthsFolder=config.monthsFolder,
              ignoreLock=False,
              outputPath=None,
              outputFilename=None):
    """Concatenate the anonymous day files of ``month`` into one gzip file.

    The day files ``anonymousRawData/AnonymousQueryCnt01.tsv.gz`` to
    ``...31.tsv.gz`` below ``monthsFolder``/``month`` are appended in day
    order; missing days are skipped. Only the header line of the first
    non-empty day file is kept.

    month          -- month folder name, relative to ``monthsFolder``.
    monthsFolder   -- folder containing the month folders.
    ignoreLock     -- continue even if a ``locked`` file exists for the month.
    outputPath     -- target directory (default: the month's
                      ``anonymousRawData/`` folder); created if missing.
    outputFilename -- target file name (default: ``<month>_Joined.tsv.gz``).

    Exits the interpreter via ``sys.exit()`` if the month is locked and
    ``ignoreLock`` is false.
    """
    pathBase = utility.addMissingSlash(monthsFolder) \
        + utility.addMissingSlash(month)

    if os.path.isfile(pathBase + "locked") and not ignoreLock:
        print("ERROR: The month " + month + " is being edited at the moment. Use -i or ignoreLock = True if you want to force the execution of this script.")
        sys.exit()

    anonymizedFolder = "anonymousRawData/"
    anonymizedPrefix = anonymizedFolder + "AnonymousQueryCnt"

    outputFile = month.strip("/").replace("/", "_") + "_Joined.tsv.gz"
    if outputFilename is not None:
        outputFile = outputFilename

    targetDir = pathBase + anonymizedFolder
    if outputPath is not None:
        # Without the trailing slash the file name would be glued onto the
        # last path component instead of being placed inside the directory.
        targetDir = utility.addMissingSlash(outputPath)
    if not os.path.exists(targetDir):
        os.makedirs(targetDir)
    targetFile = targetDir + outputFile

    with gzip.open(targetFile, "w") as target:
        headerWritten = False
        for i in xrange(1, 32):
            print("Working on %02d" % i)
            sourceFile = pathBase + anonymizedPrefix + "%02d" % i + ".tsv.gz"
            if not os.path.exists(sourceFile):
                continue
            with gzip.open(sourceFile) as source:
                # A completely empty day file has no header to keep or skip.
                header = next(source, None)
                if header is None:
                    continue
                if not headerWritten:
                    target.write(header)
                    headerWritten = True
                for line in source:
                    target.write(line)
Ejemplo n.º 5
0
def processRankedQueryType(handler,
                           month,
                           monthsFolder,
                           startIdx=0,
                           endIdx=sys.maxint,
                           notifications=True):
    """Feed the ranked query type file of ``month`` to ``handler``.

    The tab-separated file is read row by row; every row whose zero-based
    index lies in ``[startIdx, endIdx]`` is passed to ``handler.handle``
    together with its ``ExampleQuery`` column. Reading stops at the first
    row past ``endIdx``.

    If ``notifications`` is true, the file name is printed before reading.
    """
    monthDir = utility.addMissingSlash(monthsFolder) \
        + utility.addMissingSlash(month)
    rankedFile = monthDir + rankedQueryTypeFolder + rankedQueryTypeFile

    if notifications:
        print("Working on: " + rankedFile)

    with open(rankedFile) as rankedLog:
        rows = csv.DictReader(rankedLog, delimiter="\t")
        for rowIdx, row in enumerate(rows):
            if rowIdx > endIdx:
                break
            if rowIdx < startIdx:
                continue
            handler.handle(row["ExampleQuery"], row)
Ejemplo n.º 6
0
def fieldRanking(month,
                 metric,
                 monthsFolder=config.monthsFolder,
                 ignoreLock=False,
                 outputPath=None,
                 outputFilename=None,
                 filterParams="",
                 nosplitting=False,
                 writeOut=False,
                 notifications=True,
                 anonymous=False):
    """Count how often each value of ``metric`` occurs in ``month``.

    Every line of the month that passes the filter built from
    ``filterParams`` contributes one count for each entry that
    ``utility.fetchEntries`` returns for ``metric``.

    month          -- month folder name, relative to ``monthsFolder``.
    metric         -- metric name; normalised through ``utility.argMetric``.
    ignoreLock     -- continue even if a ``locked`` file exists for the month.
    outputPath     -- directory for the ranking file (default:
                      ``<monthsFolder>/<month>/<metric>/``).
    outputFilename -- ranking file name (default is derived from month and
                      metric).
    nosplitting    -- passed through to ``utility.fetchEntries``.
    writeOut       -- write the ranking, sorted by descending count, as a
                      tab-separated file.
    notifications  -- passed through to ``processdata.processMonth``.
    anonymous      -- read the anonymous instead of the processed data.

    Returns the mapping from metric value to count. Exits the interpreter
    via ``sys.exit()`` if the month is locked and ``ignoreLock`` is false.
    """
    if os.path.isfile(utility.addMissingSlash(monthsFolder)
                      + utility.addMissingSlash(month) + "locked") \
       and not ignoreLock:
        # The message has to be a single expression: a continuation line
        # starting with '+' is a statement of its own and fails with a
        # TypeError (unary plus on a string) before sys.exit() is reached.
        print("ERROR: The month " + month + " is being edited at the moment."
              + " Use -i or ignoreLock = True if you want to force the execution of this script.")
        sys.exit()

    metric = utility.argMetric(metric)

    pathBase = utility.addMissingSlash(monthsFolder) \
        + utility.addMissingSlash(month) \
        + utility.addMissingSlash(metric)

    if outputPath is not None:
        pathBase = utility.addMissingSlash(outputPath)

    addString = ""
    if anonymous:
        addString = "_anonymous_"

    outputFile = month.strip("/").replace("/", "_") + "_" + metric \
        + addString + "_Ranking.tsv"

    if outputFilename is not None:
        outputFile = outputFilename

    header = metric + "\t" + metric + "_count\n"

    lineFilter = utility.filter()
    lineFilter.setup(filterParams)

    class FieldRankingHandler:
        # The class is created anew on every fieldRanking call, so this
        # mapping is not shared between calls.
        totalMetricCounts = defaultdict(int)

        def handle(self, sparqlQuery, processed):
            if not lineFilter.checkLine(processed):
                return

            for key in utility.fetchEntries(processed,
                                            metric,
                                            nosplitting=nosplitting):
                self.totalMetricCounts[key] += 1

        def writeOut(self):
            # Highest count first; ties are ordered by descending key.
            ranking = sorted(self.totalMetricCounts.iteritems(),
                             key=lambda kv: (kv[1], kv[0]),
                             reverse=True)
            with open(pathBase + outputFile, "w") as rankingFile:
                rankingFile.write(header)
                for k, v in ranking:
                    rankingFile.write(str(k) + "\t" + str(v) + "\n")

    handler = FieldRankingHandler()

    processdata.processMonth(handler,
                             month,
                             monthsFolder,
                             anonymous=bool(anonymous),
                             notifications=notifications)

    if writeOut:
        if not os.path.exists(pathBase):
            os.makedirs(pathBase)
        handler.writeOut()
    return handler.totalMetricCounts
Ejemplo n.º 7
0
                    action="store_true")
parser.add_argument(
    "--monthsFolder", "-m",
    default=config.monthsFolder,
    type=str,
    help="The folder in which the months directory are " "residing.")
parser.add_argument("months", type=str, help="The months to be processed")

# Called without any argument: show the usage and quit.
if not sys.argv[1:]:
    parser.print_help()
    parser.exit()

args = parser.parse_args()

# All statistics are collected in one folder below the months folder.
monthsFolder = utility.addMissingSlash(args.monthsFolder)
statisticsSubfolder = monthsFolder + "statistics/"
if not os.path.exists(statisticsSubfolder):
    os.makedirs(statisticsSubfolder)


def fieldRankingOn(monthFolder, metric, filename):
    """Write the ranking of ``metric`` for ``monthFolder`` to ``filename``.

    The file is placed in the ``<metric>_Ranking`` folder below the
    statistics folder; progress notifications of the ranking run itself are
    switched off.
    """
    print("Working with fieldRanking " + metric + " on " + filename)
    rankingFolder = statisticsSubfolder + metric + "_Ranking"
    fieldRanking.fieldRanking(
        monthFolder,
        metric,
        monthsFolder=args.monthsFolder,
        outputPath=rankingFolder,
        outputFilename=filename,
        writeOut=True,
        notifications=False)
Ejemplo n.º 8
0
                    "-o",
                    action='store_true',
                    help="If set " + "only valid lines are being looked at")
parser.add_argument(
    "month",
    type=str,
    help="The month from which lines " "should be displayed.")

# Called without any argument: show the usage and quit.
if not sys.argv[1:]:
    parser.print_help()
    parser.exit()

args = parser.parse_args()

# A "locked" file in the month folder marks a month that is being edited.
lockFile = utility.addMissingSlash(args.monthsFolder) \
    + utility.addMissingSlash(args.month) + "locked"
if os.path.isfile(lockFile) and not args.ignoreLock:
    print("ERROR: The month " + args.month + " is being edited at the moment. Use -i if you want to force the execution of this script.")
    sys.exit()


class CountRdfPropertiesHandler:
    queryCount = 0
    propQueryCounts = {}

    def handle(self, sparqlQuery, processed):
        self.queryCount += 1

        if args.onlyValid:
            if processed['#Valid'] is not 'VALID':
Ejemplo n.º 9
0
                    + " NOTE: The day setting is ignored if query type ranking is enabled.")

# Called without any argument: show the usage and quit.
if not sys.argv[1:]:
    parser.print_help()
    parser.exit()

args = parser.parse_args()

startLine = args.startline
endLine = args.endline

# A single line number overrides the start/end range.
if args.line is not None:
    startLine = args.line
    endLine = args.line

# A "locked" file in the month folder marks a month that is being edited.
if os.path.isfile(utility.addMissingSlash(args.monthsFolder)
                  + utility.addMissingSlash(args.month) + "locked") \
   and not args.ignoreLock:
    # The message has to be a single expression: a continuation line
    # starting with '+' is a statement of its own and fails with a
    # TypeError (unary plus on a string) before sys.exit() is reached.
    print("ERROR: The month " + args.month + " is being edited at the "
          + "moment. Use -i if you want to force the execution of this script.")
    sys.exit()

metrics = list()
metricsNotNull = list()

# Compare by value: identity ("is not") on strings only works by accident
# of interning.
if args.metricsToBeViewed != "":
    for metric in args.metricsToBeViewed.split(","):
        metrics.append(utility.addMissingDoubleCross(metric))

if args.metricsNotNull is not "":
    for metric in args.metricsNotNull.split(","):
Ejemplo n.º 10
0
                    type=str,
                    help="the month which we're interested in")
parser.add_argument("lines",
                    type=int,
                    help="number of lines the testfiles should have")

# Called without any argument: show the usage and quit.
if not sys.argv[1:]:
    parser.print_help()
    parser.exit()

args = parser.parse_args()
monthsFolder = args.monthsFolder
month = args.month


# A "locked" file in the month folder marks a month that is being edited.
if os.path.isfile(utility.addMissingSlash(monthsFolder) +
                  utility.addMissingSlash(month) + "locked") \
   and not args.ignoreLock:
    # The message has to be a single expression: a continuation line
    # starting with '+' is a statement of its own and fails with a
    # TypeError (unary plus on a string) before sys.exit() is reached.
    print("ERROR: The month " + args.month + " is being edited at the moment."
          + " Use -i if you want to force the execution of this script.")
    sys.exit()

# create new folder for the test data
os.makedirs("testData/processedLogData")
os.makedirs("testData/rawLogData")

for filename in glob.glob(monthsFolder + "/" + month + "/processedLogData/" +
                          processdata.processedPrefix + "*" +
                          processdata.processedSuffix):
    day = int(
        os.path.basename(filename)[len(processdata.processedPrefix):]
Ejemplo n.º 11
0
def xyMapping(month,
              metricOne,
              metricTwo,
              monthsFolder=config.monthsFolder,
              ignoreLock=False,
              outputPath=None,
              outputFilename=None,
              filterParams="",
              nosplittingOne=False,
              nosplittingTwo=False,
              writeOut=False,
              notifications=True):
    """Count how often values of two metrics occur together in ``month``.

    For every line that passes the filter built from ``filterParams``, each
    combination of an entry of ``metricTwo`` with an entry of ``metricOne``
    (both obtained through ``utility.fetchEntries``) is counted once.

    month          -- month folder name, relative to ``monthsFolder``.
    metricOne/Two  -- metric names; normalised through ``utility.argMetric``.
    ignoreLock     -- continue even if a ``locked`` file exists for the month.
    outputPath     -- directory for the result file (default:
                      ``<monthsFolder>/<month>/<metricOne>_<metricTwo>/``).
    outputFilename -- result file name (default is derived from month and
                      metrics).
    nosplittingOne/Two -- passed through to ``utility.fetchEntries``.
    writeOut       -- write the result through ``writeOutMethod``.
    notifications  -- passed through to ``processdata.processMonth``.

    Returns a tuple ``(values, data)``: the set of all ``metricOne`` values
    seen, and a dict mapping each ``metricTwo`` value to the counts of the
    ``metricOne`` values it occurred with. Exits the interpreter via
    ``sys.exit()`` if the month is locked and ``ignoreLock`` is false.
    """
    if os.path.isfile(utility.addMissingSlash(monthsFolder)
                      + utility.addMissingSlash(month) + "locked") \
       and not ignoreLock:
        # The message has to be a single expression: a continuation line
        # starting with '+' is a statement of its own and fails with a
        # TypeError (unary plus on a string) before sys.exit() is reached.
        print("ERROR: The month " + month + " is being edited at the "
              + "moment. Use -i if you want to force the execution of this script.")
        sys.exit()

    metricOne = utility.argMetric(metricOne)
    metricTwo = utility.argMetric(metricTwo)

    folderName = metricOne + "_" + metricTwo

    pathBase = utility.addMissingSlash(monthsFolder) \
        + utility.addMissingSlash(month) \
        + utility.addMissingSlash(folderName)

    if outputPath is not None:
        pathBase = utility.addMissingSlash(outputPath)

    outputFile = month.strip("/").replace("/", "_") + "_" + folderName + ".tsv"

    if outputFilename is not None:
        outputFile = outputFilename

    lineFilter = utility.filter()
    lineFilter.setup(filterParams)

    class hourlyFieldValueHandler:
        # The class is created anew on every xyMapping call, so these
        # containers are not shared between calls.
        monthlyFieldValues = set()

        monthlyData = dict()

        def handle(self, sparqlQuery, processed):
            if not lineFilter.checkLine(processed):
                return

            entriesOne = utility.fetchEntries(processed, metricOne,
                                              nosplittingOne)

            for keyTwo in utility.fetchEntries(processed, metricTwo,
                                               nosplittingTwo):
                if keyTwo not in self.monthlyData:
                    self.monthlyData[keyTwo] = defaultdict(int)

                for keyOne in entriesOne:
                    self.monthlyFieldValues.add(keyOne)
                    self.monthlyData[keyTwo][keyOne] += 1

        def writeHourlyValues(self):
            writeOutMethod(pathBase + outputFile, self.monthlyFieldValues,
                           self.monthlyData, metricTwo + "\\" + metricOne)

    handler = hourlyFieldValueHandler()

    processdata.processMonth(handler, month, monthsFolder,
                             notifications=notifications)

    if writeOut:
        if not os.path.exists(pathBase):
            os.makedirs(pathBase)
        handler.writeHourlyValues()
    return (handler.monthlyFieldValues, handler.monthlyData)
args = parser.parse_args()

# Pools of sample values for the generated log rows; one element of each
# set is drawn at random per row further down.
uri_path = {"/sparql", "/bigdata/namespace/wdq/sparql"}
user_agent = {
    "Mozilla/5.0 (Android 4.4; Mobile; rv:41.0) Gecko/41.0 Firefox/41.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0"
}
timestamp = {
    "2017-07-01 01:31:14", "2017-07-01 01:32:54", "2017-07-01 01:34:10"
}
agent_type = {"spider", "user"}
# Every generated row uses the same HTTP status.
http_status = "200"

# Build a gzipped, tab-separated log file with one row per example query.
with gzip.open(
        utility.addMissingSlash(args.outputDirectory) + "QueryCnt01.tsv.gz",
        "w") as target:
    # Header row naming the seven columns assembled below.
    print("uri_query\turi_path\tuser_agent\tts\tagent_type\thour\thttp_status",
          file=target)

    exampleQueryFolder = utility.addMissingSlash(args.exampleQueryFolder)

    for filename in glob.glob(exampleQueryFolder + "*.exampleQuery"):
        with open(filename) as exampleFile:
            # The file content becomes the URL-encoded "query" parameter.
            line = "?query=" + urllib.quote_plus(exampleFile.read()) + "\t"
            # The next four columns are picked at random from the pools.
            line += random.sample(uri_path, 1)[0] + "\t"
            line += random.sample(user_agent, 1)[0] + "\t"
            line += random.sample(timestamp, 1)[0] + "\t"
            line += random.sample(agent_type, 1)[0] + "\t"
            # Random hour of the day (randint includes both bounds).
            line += str(random.randint(0, 23)) + "\t"
            line += http_status
                    + " Default filter is Valid=^VALID$."
                    + " Enter as <metric>=<regex>,<othermetric>/<regex> (e.g."
                    + " QueryType=wikidataLastModified,ToolName=^USER$)"
                    + " NOTE: If you use this option you should probably also"
                    + " set the --outputPath to some value other than the "
                    + "default.")
parser.add_argument("month",
                    type=str,
                    help="The month for which the ranking should be generated.")
parser.add_argument("--threshold",
                    "-t",
                    default=2000,
                    type=int,
                    help="The threshold above which the combinations should be listed. Default is 2000.")

# Called without any argument: show the usage and quit.
if not sys.argv[1:]:
    parser.print_help()
    parser.exit()

args = parser.parse_args()

monthsFolder = utility.addMissingSlash(args.monthsFolder)
month = utility.addMissingSlash(args.month)

# A "locked" file in the month folder marks a month that is being edited.
# The override has to be read from the parsed arguments; a bare
# ``ignoreLock`` name raises a NameError here unless it is defined elsewhere
# in the script.
# NOTE(review): assumes the parser defines the -i option as ``ignoreLock``,
# as the error message suggests - confirm against the argument definitions.
if os.path.isfile(monthsFolder + month + "locked") \
   and not args.ignoreLock:
    print("ERROR: The month " + args.month + " is being edited at the moment."
          + " Use -i or ignoreLock = True if you want to force the execution of this script.")
    sys.exit()

subfolder = "automatedBotClassification/"

pathBase = monthsFolder + month + subfolder

if not os.path.exists(pathBase):
    os.makedirs(pathBase)
Ejemplo n.º 14
0
def fieldEntriesDaysApart(months,
                          metric,
                          days,
                          monthsFolder=config.monthsFolder,
                          ignoreLock=False,
                          outputPath=None,
                          outputFilename=None,
                          filterParams="",
                          nosplitting=False,
                          writeOut=False,
                          notifications=True,
                          anonymous=False):
    """Find the values of ``metric`` that were seen at least ``days`` apart.

    All lines of the given months that pass the filter built from
    ``filterParams`` are scanned; for every entry that
    ``utility.fetchEntries`` returns for ``metric`` the earliest and the
    latest timestamp are tracked. An entry is reported if these two lie at
    least ``days`` full days apart.

    months         -- comma-separated list of month folder names.
    metric         -- metric name; normalised through ``utility.argMetric``.
    days           -- minimum distance in days between first and last
                      occurrence.
    ignoreLock     -- continue even if a ``locked`` file exists for a month.
    outputPath     -- directory for the result file.
    outputFilename -- result file name.
    nosplitting    -- passed through to ``utility.fetchEntries``.
    writeOut       -- write the entries, one per line, to the result file.
    notifications  -- passed through to ``processdata.processMonth``.
    anonymous      -- read the anonymous instead of the processed data.

    Timestamps for which parsing raises ``ValueError`` are skipped and
    summarised on stdout after the run. Returns the set of matching
    entries. Exits the interpreter via ``sys.exit()`` if one of the months
    is locked and ``ignoreLock`` is false.
    """
    monthList = months.split(",")

    for month in monthList:
        if os.path.isfile(
                utility.addMissingSlash(monthsFolder) +
                utility.addMissingSlash(month) + "locked") and not ignoreLock:
            print("ERROR: The month " + month + " is being edited at the moment." + " Use -i or ignoreLock = True if you want to force the execution of this script.")
            sys.exit()

    metric = utility.argMetric(metric)

    pathBase = utility.addMissingSlash(monthsFolder) \
        + utility.addMissingSlash(months.replace("/", "_")) \
        + utility.addMissingSlash(metric)

    if outputPath is not None:
        pathBase = utility.addMissingSlash(outputPath)

    addString = ""
    if anonymous:
        addString = "_anonymous_"

    # The default file name is derived from the last month of the list only
    # (previously implicit through the leaked loop variable).
    # NOTE(review): possibly all months were meant to appear in the name -
    # kept as is so that existing output locations do not change.
    lastMonth = monthList[-1]
    outputFile = lastMonth.strip("/").replace("/", "_") + "_" + metric \
        + addString + "_" + str(days) + "_days_apart.tsv"

    if outputFilename is not None:
        outputFile = outputFilename

    header = metric + "\n"

    lineFilter = utility.filter()
    lineFilter.setup(filterParams)

    faultyTimestamps = defaultdict(int)

    class FieldEntriesDaysApartHandler:
        # The class is created anew on every call of the enclosing function,
        # so these containers are not shared between calls.
        firstSeen = dict()
        lastSeen = dict()

        fieldEntries = set()

        def handle(self, sparqlQuery, processed):
            if not lineFilter.checkLine(processed):
                return

            for key in utility.fetchEntries(processed,
                                            metric,
                                            nosplitting=nosplitting):
                timestamp = processed["timestamp"]
                try:
                    parsedTime = dateparser.parse(timestamp)
                except ValueError:
                    print("ERROR: Faulty timestamp " + str(timestamp))
                    faultyTimestamps[timestamp] += 1
                    continue

                if key not in self.firstSeen:
                    self.firstSeen[key] = parsedTime
                    self.lastSeen[key] = parsedTime
                elif parsedTime < self.firstSeen[key]:
                    # Lines are not guaranteed to arrive in chronological
                    # order, so the earliest timestamp has to be tracked
                    # just like the latest one.
                    self.firstSeen[key] = parsedTime
                elif parsedTime > self.lastSeen[key]:
                    self.lastSeen[key] = parsedTime

        def compute(self):
            for key, firstTS in self.firstSeen.iteritems():
                lastTS = self.lastSeen[key]
                if (lastTS - firstTS).days >= days:
                    self.fieldEntries.add(key)

        def writeOut(self):
            with open(pathBase + outputFile, "w") as resultFile:
                resultFile.write(header)
                for key in self.fieldEntries:
                    resultFile.write(str(key) + "\n")

    handler = FieldEntriesDaysApartHandler()

    for month in monthList:
        processdata.processMonth(handler,
                                 month,
                                 monthsFolder,
                                 anonymous=bool(anonymous),
                                 notifications=notifications)

    handler.compute()

    if len(faultyTimestamps) > 0:
        print("Faulty timestamp\tcount")
        # Most frequent faulty timestamp first.
        for k, v in sorted(faultyTimestamps.iteritems(),
                           key=lambda kv: (kv[1], kv[0]),
                           reverse=True):
            print(str(k) + "\t" + str(v))

    if writeOut:
        if not os.path.exists(pathBase):
            os.makedirs(pathBase)
        handler.writeOut()
    return handler.fieldEntries