# Tail of the pool-export script.  ``f``, ``unlabeledPool``, ``pathToEpic``,
# ``os``, ``sys`` and ``makeConll`` are defined/imported earlier in the file
# (outside the part visible here).
f.close()
print("******Written labeled*****")

tmpUnlabeledPool = unlabeledPool.find({"random": {"$gt": 0, "$lt": 1}})

# Dump the unlabeled pool, one document per line.  The ``with`` block
# guarantees the file is flushed and closed even if a cursor read or a write
# raises part-way through.
with open(os.path.expanduser(pathToEpic + "/data/PoolData/unlabeledPool.txt"), 'w') as f:
    # NOTE(review): the range starts at 1, so the document at cursor index 0
    # is never written -- confirm whether that is intentional.
    # NOTE(review): indexing the cursor once per document presumably issues a
    # separate query per index; iterating the cursor would avoid that --
    # verify against the pymongo version in use before changing.
    for i in range(1, tmpUnlabeledPool.count()):
        f.write(str(tmpUnlabeledPool[i]))
        f.write("\n")
        print(str(i))
        if i % 100 == 0:
            print(i)
print("*****Written unlabeled*****")

# Opens labeledPool.conll in the default read mode and closes it again right
# away; this fails if the file does not exist.
# NOTE(review): if the intent was to create an empty file, the mode should be
# 'w' -- confirm.
tmp_file = open(os.path.expanduser(pathToEpic + '/data/PoolData/labeledPool.conll'))
tmp_file.close()

# Optional third command-line argument: the noise value handed to makeConll.
noise = 0.0
if len(sys.argv) > 3:
    noise = float(sys.argv[3])

print("*****Time to make conll*****")
makeConll(pathToEpic + '/data/PoolData/labeledPool.txt',
          pathToEpic + '/data/PoolData/labeledPool.conll',
          noise)
print("poop")
def relabelBatch(randomIds, noise):
    """Re-extract the labeled-pool lines matching ``randomIds`` and append them again.

    The epic root directory is taken as the current working directory cut off
    right after its last occurrence of ``"epic"``.  Under
    ``<root>/data/PoolData/`` the function:

    1. reads ``labeledPool.txt``;
    2. writes every line that contains one of the id needles to ``batch.txt``;
    3. runs ``makeConll`` on ``batch.txt`` to produce ``batchConll.conll``;
    4. appends ``batch.txt`` to ``labeledPool.txt`` and ``batchConll.conll``
       to ``labeledPool.conll``.

    Args:
        randomIds: iterable of values identifying the lines to pick.  Each
            value is converted with ``str()`` and its last two characters are
            dropped before the substring search.
        noise: passed unchanged as the third argument of ``makeConll``.

    Returns:
        The constant string ``"Tmp file: "``.
    """
    pathToEpic = os.getcwd()
    pathToEpic = pathToEpic[0:pathToEpic.rfind("epic") + 4]
    returnString = "Tmp file: "
    # NOTE(review): the message says "moveBatch" although this is
    # relabelBatch; left untouched in case the output is consumed elsewhere.
    print("Inside moveBatch")

    # The MongoDB-backed variant of this step is disabled; the pools are
    # maintained purely through the text files below.
    poolDir = pathToEpic + "/data/PoolData/"
    batchPath = poolDir + "batch.txt"
    batchConllPath = poolDir + "batchConll.conll"
    labeledPath = poolDir + "labeledPool.txt"
    labeledConllPath = poolDir + "labeledPool.conll"

    with open(os.path.expanduser(labeledPath), 'r') as readlabeled:
        lines = readlabeled.readlines()
    print("Labeled openened for rewriting")

    # Compute each search needle once instead of once per (line, id) pair.
    # NOTE(review): dropping the last two characters presumably tolerates
    # differences in the trailing digits of the stored value -- confirm.  An
    # id whose text is two characters or shorter yields an empty needle,
    # which matches every line.
    idNeedles = [str(oneID)[:-2] for oneID in randomIds]

    with open(os.path.expanduser(batchPath), 'w') as batch:
        for line in lines:
            if any(needle in line for needle in idNeedles):
                batch.write(line)

    # Get Conll of the batch and add it to the conll of the labeled pool.
    makeConll(batchPath, batchConllPath, noise)

    with open(os.path.expanduser(labeledPath), 'a') as labeledOrig:
        with open(os.path.expanduser(batchPath), 'r') as batch:
            labeledOrig.write(batch.read())
    with open(os.path.expanduser(labeledConllPath), 'a') as labeledOrigConll:
        with open(os.path.expanduser(batchConllPath), 'r') as batchConll:
            labeledOrigConll.write(batchConll.read())

    return returnString
import os
import unicodedata
from makeConllFromDBOutput import makeConll

# Builds the evaluation test set: converts the positive, negative and
# fake-positive text files to CoNLL with makeConll (noise 0.0) and
# concatenates the three results into one .conll file.

# Epic root: the current working directory cut off right after its last
# occurrence of "epic".  The result has no trailing separator, so every path
# below is assembled with os.path.join; plain string concatenation produced
# ".../epicdata/..." instead of ".../epic/data/...".
pathToEpic = os.getcwd()
pathToEpic = pathToEpic[0:pathToEpic.rfind("epic") + 4]

testSetDir = os.path.join(pathToEpic, "data", "epicEvalutationTestSet")

positiveFile = os.path.join(testSetDir, "positives.txt")
positiveConll = os.path.join(testSetDir, "positives.conll")
negativeFile = os.path.join(testSetDir, "negatives.txt")
negativeConll = os.path.join(testSetDir, "negatives.conll")
positiveFakeFile = os.path.join(testSetDir, "fakePositives.txt")
positiveFakeConll = os.path.join(testSetDir, "fakePositives.conll")

makeConll(positiveFile, positiveConll, 0.0)
makeConll(negativeFile, negativeConll, 0.0)
makeConll(positiveFakeFile, positiveFakeConll, 0.0)

# Concatenation order: positives, fake positives, negatives.
filenames = [positiveConll, positiveFakeConll, negativeConll]
combinedConll = os.path.join(pathToEpic, "data", "epicEvalutationTestSet.conll")
with open(os.path.expanduser(combinedConll), 'w') as outfile:
    for fname in filenames:
        with open(os.path.expanduser(fname)) as infile:
            outfile.write(infile.read())
#import pymongo
import sys
import os
# MongoClient is used below, so its import has to be active; with the import
# commented out the script stopped with a NameError.
from pymongo import MongoClient
from makeConllFromDBOutput import makeConll
#from getJustSentences import getJustSentences

# Dumps every document of the malware_negatives collection to
# <epic root>/data/APInegatives.txt (one document per line) and converts that
# file to CoNLL with makeConll (noise 0.0).

# Epic root: the current working directory cut off right after its last
# occurrence of "epic".
pathToEpic = os.getcwd()
pathToEpic = pathToEpic[0:pathToEpic.rfind("epic") + 4]

client = MongoClient('mon-entity-event-r13-6.recfut.com:27019')
db = client.rf_entity_curation
allMalware = db.malware_negatives
negatives = allMalware.find()

# The file must be closed (and therefore flushed) before makeConll reads it;
# previously it was never closed, so the tail of the dump could still be
# sitting in the write buffer when the conversion started.
with open(os.path.expanduser(pathToEpic + "/data/APInegatives.txt"), 'w') as negativeFile:
    counter = 0
    for i in negatives:
        negativeFile.write(str(i) + "\n")
        print("counter " + str(counter))
        counter += 1

makeConll(pathToEpic + "/data/APInegatives.txt",
          pathToEpic + "/data/APInegatives.conll",
          0.0)
# Tail of the pool-export script.  ``f``, ``unlabeledPool``, ``pathToEpic``,
# ``os``, ``sys`` and ``makeConll`` come from earlier in the file (outside
# the part visible here).
f.close()
print("******Written labeled*****")

tmpUnlabeledPool = unlabeledPool.find({"random": {"$gt": 0, "$lt": 1}})

# Write the unlabeled pool, one document per line.  Using ``with`` ensures
# the file is flushed and closed even when a cursor read or a write fails
# half-way.
unlabeledPath = os.path.expanduser(pathToEpic + "/data/PoolData/unlabeledPool.txt")
with open(unlabeledPath, 'w') as f:
    # NOTE(review): starting at 1 means the document at cursor index 0 is
    # never written -- confirm whether that is intentional.
    # NOTE(review): per-index cursor access presumably runs one query per
    # document; plain iteration over the cursor would avoid that -- verify
    # against the pymongo version in use before changing.
    for i in range(1, tmpUnlabeledPool.count()):
        f.write(str(tmpUnlabeledPool[i]))
        f.write("\n")
        print(str(i))
        if i % 100 == 0:
            print(i)
print("*****Written unlabeled*****")

# labeledPool.conll is opened in the default read mode and closed again
# immediately; this fails if the file does not exist.
# NOTE(review): if the goal was to create an empty file, mode 'w' is needed
# -- confirm.
tmp_file = open(
    os.path.expanduser(pathToEpic + '/data/PoolData/labeledPool.conll'))
tmp_file.close()

# Optional third command-line argument: the noise value given to makeConll.
noise = 0.0
if len(sys.argv) > 3:
    noise = float(sys.argv[3])

print("*****Time to make conll*****")
makeConll(pathToEpic + '/data/PoolData/labeledPool.txt',
          pathToEpic + '/data/PoolData/labeledPool.conll', noise)
print("poop")
#import pymongo
import sys
import os
# The script instantiates MongoClient, so this import must be active;
# leaving it commented out made the script fail with a NameError.
from pymongo import MongoClient
from makeConllFromDBOutput import makeConll
#from getJustSentences import getJustSentences

# Writes every document of the malware_negatives collection to
# <epic root>/data/APInegatives.txt, one per line, then converts the file to
# CoNLL via makeConll with noise 0.0.

# Epic root: the working directory truncated right after the last "epic".
pathToEpic = os.getcwd()
pathToEpic = pathToEpic[0:pathToEpic.rfind("epic") + 4]

negativesTxt = pathToEpic + "/data/APInegatives.txt"
negativesConll = pathToEpic + "/data/APInegatives.conll"

client = MongoClient('mon-entity-event-r13-6.recfut.com:27019')
db = client.rf_entity_curation
allMalware = db.malware_negatives
negatives = allMalware.find()

# Close the dump before converting it: the file used to stay open, so
# makeConll could read it while the last writes were still buffered.
with open(os.path.expanduser(negativesTxt), 'w') as negativeFile:
    counter = 0
    for i in negatives:
        negativeFile.write(str(i) + "\n")
        print("counter " + str(counter))
        counter += 1

makeConll(negativesTxt, negativesConll, 0.0)
def moveBatch(randomIds, noise):
    """Move the pool lines matching ``randomIds`` from the unlabeled to the labeled pool.

    The epic root directory is taken as the current working directory cut off
    right after its last occurrence of ``"epic"``.  Under
    ``<root>/data/PoolData/`` the function:

    1. reads ``unlabeledPool.txt``;
    2. rewrites ``unlabeledPool.txt`` with the lines that match no id and
       writes the matching lines to ``batch.txt``;
    3. runs ``makeConll`` on ``batch.txt`` to produce ``batchConll.conll``;
    4. appends ``batch.txt`` to ``labeledPool.txt`` and ``batchConll.conll``
       to ``labeledPool.conll``.

    Args:
        randomIds: iterable of values identifying the lines to move.  Each
            value is converted with ``str()`` and its last two characters are
            dropped before the substring search.
        noise: passed unchanged as the third argument of ``makeConll``.

    Returns:
        The constant string ``"Tmp file: "``.
    """
    pathToEpic = os.getcwd()
    pathToEpic = pathToEpic[0:pathToEpic.rfind("epic") + 4]
    returnString = "Tmp file: "
    print("Inside moveBatch")

    # The MongoDB-backed variant of this step is disabled; the pools are
    # maintained purely through the text files below.
    poolDir = pathToEpic + "/data/PoolData/"
    batchPath = poolDir + "batch.txt"
    batchConllPath = poolDir + "batchConll.conll"
    unlabeledPath = poolDir + "unlabeledPool.txt"
    labeledPath = poolDir + "labeledPool.txt"
    labeledConllPath = poolDir + "labeledPool.conll"

    with open(os.path.expanduser(unlabeledPath), 'r') as readUnlabeled:
        lines = readUnlabeled.readlines()

    # Compute each search needle once instead of once per (line, id) pair.
    # NOTE(review): dropping the last two characters presumably tolerates
    # differences in the trailing digits of the stored value -- confirm.  An
    # id whose text is two characters or shorter yields an empty needle,
    # which matches every line.
    idNeedles = [str(oneID)[:-2] for oneID in randomIds]

    # Split the pool in memory before touching the file, so that a failure
    # while matching cannot leave unlabeledPool.txt truncated.
    batchLines = []
    remainingLines = []
    for line in lines:
        if any(needle in line for needle in idNeedles):
            batchLines.append(line)
        else:
            remainingLines.append(line)

    with open(os.path.expanduser(unlabeledPath), 'w') as writeUnlabeled:
        print("Unlabeled openened for writing")
        writeUnlabeled.writelines(remainingLines)
    with open(os.path.expanduser(batchPath), 'w') as batch:
        batch.writelines(batchLines)

    # Get Conll of the batch and add it to the conll of the labeled pool.
    makeConll(batchPath, batchConllPath, noise)

    with open(os.path.expanduser(labeledPath), 'a') as labeledOrig:
        with open(os.path.expanduser(batchPath), 'r') as batch:
            labeledOrig.write(batch.read())
    with open(os.path.expanduser(labeledConllPath), 'a') as labeledOrigConll:
        with open(os.path.expanduser(batchConllPath), 'r') as batchConll:
            labeledOrigConll.write(batchConll.read())

    return returnString
import os
import unicodedata
from makeConllFromDBOutput import makeConll

# Assembles the evaluation test set: the positive, negative and fake-positive
# text files are each converted to CoNLL by makeConll (noise 0.0), and the
# three outputs are concatenated into a single .conll file.

# Epic root: the working directory truncated right after the last "epic".
# It carries no trailing separator, so paths are built with os.path.join;
# concatenating "data/..." directly produced ".../epicdata/..." rather than
# ".../epic/data/...".
pathToEpic = os.getcwd()
pathToEpic = pathToEpic[0:pathToEpic.rfind("epic") + 4]

dataDir = os.path.join(pathToEpic, "data")
evalDir = os.path.join(dataDir, "epicEvalutationTestSet")

positiveFile = os.path.join(evalDir, "positives.txt")
positiveConll = os.path.join(evalDir, "positives.conll")
negativeFile = os.path.join(evalDir, "negatives.txt")
negativeConll = os.path.join(evalDir, "negatives.conll")
positiveFakeFile = os.path.join(evalDir, "fakePositives.txt")
positiveFakeConll = os.path.join(evalDir, "fakePositives.conll")

makeConll(positiveFile, positiveConll, 0.0)
makeConll(negativeFile, negativeConll, 0.0)
makeConll(positiveFakeFile, positiveFakeConll, 0.0)

# Output order: positives first, then fake positives, then negatives.
filenames = [positiveConll, positiveFakeConll, negativeConll]
with open(os.path.expanduser(os.path.join(dataDir, "epicEvalutationTestSet.conll")), "w") as outfile:
    for fname in filenames:
        with open(os.path.expanduser(fname)) as infile:
            outfile.write(infile.read())