Example #1
0
 def __init__(self, blackOakDataSet=None):
     """Initialise the parent class for the tool named "Trifacta".

     :param blackOakDataSet: dataset forwarded to the parent constructor.
         When omitted (``None``), a fresh ``BlackOakDataSet()`` is built
         for this call.
     """
     # A call placed in the default-argument position is evaluated only
     # once, when the method is defined, and the resulting object is then
     # shared by every instance. Building it here avoids both the shared
     # state and the definition-time load.
     if blackOakDataSet is None:
         blackOakDataSet = BlackOakDataSet()

     # Hard-coded locations of the two CSV files passed to the parent.
     path_to_tool_detected = "/home/felix/BlackOak/List_A/ToolOutputDetectedCells/Trifacta_detectedCells.csv"
     path_to_tool_correct_detected = "/home/felix/BlackOak/List_A/ToolOutputCorrectCells/Trifacta_CorrectCells.csv"
     super(TrifactaOnBlackOak,
           self).__init__("Trifacta", blackOakDataSet,
                          path_to_tool_detected,
                          path_to_tool_correct_detected)
 def __init__(self, blackOakDataSet=None):
     """Initialise the parent class for the tool named "dBoost_Histogram".

     :param blackOakDataSet: dataset forwarded to the parent constructor.
         When omitted (``None``), a fresh ``BlackOakDataSet()`` is built
         for this call.
     """
     # A call placed in the default-argument position is evaluated only
     # once, when the method is defined, and the resulting object is then
     # shared by every instance. Building it here avoids both the shared
     # state and the definition-time load.
     if blackOakDataSet is None:
         blackOakDataSet = BlackOakDataSet()

     # Hard-coded locations of the two CSV files passed to the parent.
     path_to_tool_detected = "/home/felix/BlackOak/List_A/ToolOutputDetectedCells/histograms.csv"
     path_to_tool_correct_detected = "/home/felix/BlackOak/List_A/ToolOutputCorrectCells/histograms_CorrectCells.csv"
     super(DBoostHistogramOnBlackOak,
           self).__init__("dBoost_Histogram", blackOakDataSet,
                          path_to_tool_detected,
                          path_to_tool_correct_detected)
 def __init__(self, blackOakDataSet=None):
     """Initialise the parent class for the tool named "OpenRefine".

     :param blackOakDataSet: dataset forwarded to the parent constructor.
         When omitted (``None``), a fresh ``BlackOakDataSet()`` is built
         for this call.
     """
     # A call placed in the default-argument position is evaluated only
     # once, when the method is defined, and the resulting object is then
     # shared by every instance. Building it here avoids both the shared
     # state and the definition-time load.
     if blackOakDataSet is None:
         blackOakDataSet = BlackOakDataSet()

     # Hard-coded locations of the two CSV files passed to the parent.
     path_to_tool_detected = "/home/felix/BlackOak/List_A/ToolOutputDetectedCells/GRefine_detectedCells.csv"
     path_to_tool_correct_detected = "/home/felix/BlackOak/List_A/ToolOutputCorrectCells/GRefine_CorrectCells.csv"
     super(OpenRefineOnBlackOak,
           self).__init__("OpenRefine", blackOakDataSet,
                          path_to_tool_detected,
                          path_to_tool_correct_detected)
 def __init__(self, blackOakDataSet=None):
     """Initialise the parent class for the tool named "dBoost_Gaussian".

     :param blackOakDataSet: dataset forwarded to the parent constructor.
         When omitted (``None``), a fresh ``BlackOakDataSet()`` is built
         for this call.
     """
     # A call placed in the default-argument position is evaluated only
     # once, when the method is defined, and the resulting object is then
     # shared by every instance. Building it here avoids both the shared
     # state and the definition-time load.
     if blackOakDataSet is None:
         blackOakDataSet = BlackOakDataSet()

     # Hard-coded locations of the two CSV files passed to the parent.
     path_to_tool_detected = "/home/felix/BlackOak/List_A/ToolOutputDetectedCells/gaussian.csv"
     path_to_tool_correct_detected = "/home/felix/BlackOak/List_A/ToolOutputCorrectCells/gaussian_CorrectCells.csv"
     super(DBoostGaussianOnBlackOak,
           self).__init__("dBoost_Gaussian", blackOakDataSet,
                          path_to_tool_detected,
                          path_to_tool_correct_detected)
 def __init__(self, blackOakDataSet=None):
     """Initialise the parent class for the tool named "DC-Clean".

     :param blackOakDataSet: dataset forwarded to the parent constructor.
         When omitted (``None``), a fresh ``BlackOakDataSet()`` is built
         for this call.
     """
     # A call placed in the default-argument position is evaluated only
     # once, when the method is defined, and the resulting object is then
     # shared by every instance. Building it here avoids both the shared
     # state and the definition-time load.
     if blackOakDataSet is None:
         blackOakDataSet = BlackOakDataSet()

     # Hard-coded locations of the two files passed to the parent.
     path_to_tool_detected = "/home/felix/BlackOak/List_A/ToolOutputDetectedCells/rulebased.txt"
     path_to_tool_correct_detected = "/home/felix/BlackOak/List_A/ToolOutputCorrectCells/rulebased.txt_CorrectCells.csv"
     super(DCCleanOnBlackOak, self).__init__("DC-Clean", blackOakDataSet,
                                             path_to_tool_detected,
                                             path_to_tool_correct_detected)
Example #6
0
import matplotlib.pyplot as plt
import numpy as np

from ml.datasets.blackOak import BlackOakDataSet

# Dataset instance used by this script.
data = BlackOakDataSet()

# Experiment constants.
# NOTE(review): their exact meaning is not visible in this part of the
# file - presumably labelling/evaluation parameters; confirm against the
# code that consumes them.
columns_eval = 10
init_labels = 4
std_labels = 10
sec_pro_label = 3.0

# Collector for the score lists appended further down (one list per append).
fscore_all = list()


'''
#std active learning, no stop criterion
fscore_all.append([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.20012207943518531, 0.25386856758621601, 0.41466433595704261, 0.46892432523806837, 0.57893657155151279, 0.70256618928410297, 0.70749683905387484, 0.70750253315124023, 0.70750253315124023, 0.74519324856697966, 0.7491604672232115, 0.75374344923565373, 0.754499197387558, 0.81992328257584568, 0.84040818069254086, 0.8553099479493852, 0.90187938499685372, 0.90187938499685372, 0.90187938499685372, 0.90733775816160611, 0.90846046226507471, 0.91237386578151902, 0.91239429476850498, 0.91248123045048268, 0.91818444173667668, 0.92671278690037029, 0.93019831617325843, 0.93046475437020559, 0.93091040665768787, 0.93091040665768787, 0.93008550827989345, 0.93236071269309873, 0.93236355504321533, 0.9320129220419805, 0.93603961499574073, 0.93890562906677144, 0.94091704901395579, 0.9565499598597258, 0.96939385589585492, 0.96939385589585492, 0.96942323314378542, 0.97165316414511149, 0.97165316414511149, 0.97191976629747079, 0.97536979246201572, 0.97489147121451458, 0.97730611093774689, 0.97730611093774689, 0.98019022075242013, 0.98019022075242013])
fscore_all.append([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.20005885030413656, 0.23967151801689551, 0.35574865517630433, 0.46227290675972571, 0.55832618052640481, 0.66719866866972055, 0.668526806295348, 0.66853247239591018, 0.66853247239591018, 0.65420151065551668, 0.66145348394998904, 0.66624960085498486, 0.71274491692113762, 0.7437250997411704, 0.76976471147124892, 0.79157605892149652, 0.84602589692410057, 0.84963281650963418, 0.84963718987527848, 0.88397370494259264, 0.88397658764393794, 0.89041097653419621, 0.89048889369319273, 0.89035910983400157, 0.89603548019778012, 0.90624218536783507, 0.89966720826394908, 0.9124004379224796, 0.91240809931300471, 0.94536960446157781, 0.94517342172119578, 0.94507483756868094, 0.94504000322995307, 0.94526955058842532, 0.95186331529482504, 0.9569550026748741, 0.958271038017941, 0.95849818468781833, 0.97480412040499531, 0.97480412040499531, 0.974903827577007, 0.97586005616264027, 0.97591256324874742, 0.97583461597609888, 0.975941855651134, 0.97348441926345619, 0.98271118765431775, 0.98322511529467094, 0.98322511529467094, 0.98322511529467094])
fscore_all.append([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.19306548457129363, 0.23970296856095805, 0.40067542542642198, 0.50667034717951576, 0.59673617948029478, 0.67696875613162355, 0.67973152662572023, 0.67973722648745505, 0.67973722648745505, 0.67283055687242976, 0.68798461599014882, 0.69677145892778691, 0.69755072614195168, 0.72844262021750572, 0.76045570615938229, 0.80429937924723771, 0.84912140554091309, 0.85530966801915698, 0.85530966801915698, 0.90470487370573105, 0.90498218481898662, 0.90728959305228574, 0.90747125149687191, 0.90097797583700756, 0.91396157624598762, 0.91894482019828594, 0.92079960888695744, 0.93086370713018074, 0.93086644704049248, 0.93082808701814101, 0.93100479425202232, 0.92983143261092605, 0.92971164376790272, 0.93611619637620147, 0.93286023744785329, 0.9421035037584401, 0.94538787755328801, 0.93499801771021163, 0.94900167546426217, 0.94903992502927725, 0.94734466383987703, 0.94912376872107673, 0.94926277697707451, 0.94929805279638335, 0.95694205031546209, 0.95775518579510321, 0.96439497510769467, 0.97348705999059659, 0.97380120734738673, 0.97380120734738673])
fscore_all.append([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.20075733975756821, 0.26177292388407175, 0.41738881068887596, 0.52070181091593015, 0.62168806865844994, 0.7487646879215365, 0.7570367079342567, 0.75703369967334122, 0.75703369967334122, 0.7704225540748677, 0.77343236091539014, 0.76999916711383098, 0.77362756501581309, 0.8097905745803412, 0.82355855124248822, 0.80468440357464477, 0.85484463932547794, 0.85486422410033958, 0.85590863047544519, 0.88543044897239132, 0.88781249914044691, 0.88796376555041456, 0.88806633146472036, 0.8879389777055845, 0.89764445432609352, 0.92670603275095875, 0.91177236147597951, 0.92814475715222111, 0.94203749670474801, 0.94203749670474801, 0.94331034428821514, 0.95014370891100286, 0.95013957348025091, 0.95038226058830333, 0.95805508629409364, 0.96229310054952555, 0.95910684417931469, 0.95910684417931469, 0.95991716705118058, 0.95991716705118058, 0.95994144384301894, 0.96000278825164997, 0.96000413073785307, 0.95993691096030087, 0.95953429927709344, 0.9598709667413674, 0.97239386357906166, 0.97309624468953837, 0.97408928352012214, 0.97408928352012214])
fscore_all.append([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.20077698067710956, 0.2412915723145487, 0.40255219000541548, 0.54163493791069395, 0.60158582760926926, 0.68639875261476069, 0.69341255835771876, 0.69339396115033303, 0.69339396115033303, 0.67617291836023685, 0.68253914751119271, 0.6869161062650081, 0.69172346526620399, 0.69194128084418671, 0.7387257196266438, 0.79531751175133925, 0.84657666849802482, 0.84734464442671142, 0.8473918550145586, 0.91428382887285897, 0.91428382887285897, 0.90124461368874642, 0.9012681026293321, 0.90112411777591372, 0.91740602136069294, 0.91786033696370195, 0.91517181717065943, 0.93112704465784224, 0.94407866793959427, 0.94407866793959427, 0.94308411901306977, 0.95784541816236701, 0.95784541816236701, 0.9579694787565648, 0.96368250634462249, 0.96519117480748862, 0.96952810223565988, 0.96954546279233111, 0.97058469409816672, 0.97058469409816672, 0.97175292101654243, 0.96741589963978603, 0.96741589963978603, 0.96750810462011638, 0.97260792075104141, 0.97451961131039599, 0.97576714970479095, 0.97576714970479095, 0.97783209003804006, 0.97783209003804006])
'''


# Distinct active learning, no stop criterion: each append below adds one list of scores to fscore_all.
fscore_all.append([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.20019403096996738, 0.25515300387948225, 0.41289471412894713, 0.52015754712125062, 0.61942677773436339, 0.73661296462207126, 0.73748154532006671, 0.7327364275995466, 0.75332485507216751, 0.79220835814825041, 0.79437689246817489, 0.79672485701761986, 0.80070917242672568, 0.82785163874346979, 0.84959825456323546, 0.86694880264244434, 0.90884523358815206, 0.93648518396302094, 0.93648518396302094, 0.93649495562120522, 0.93652808011300137, 0.9351802605095475, 0.93590989502864641, 0.94229941672067408, 0.94609729891021965, 0.95349021012919677, 0.96231356296376602, 0.9618072306483475, 0.9618072306483475, 0.9618072306483475, 0.96041117148111255, 0.96542813269501904, 0.96545251045614311, 0.96580044545528299, 0.97562558152511725, 0.96682307352321428, 0.96514175939464319, 0.96584006516609566, 0.96584006516609566, 0.96584006516609566, 0.96694573485119195, 0.9676177729987625, 0.96761909140365798, 0.96773666313817663, 0.97132472609143417, 0.97835911597214775, 0.97723047186400569, 0.97756865216880284, 0.97756865216880284, 0.97756865216880284])
fscore_all.append([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.20039484358660534, 0.23960625733568075, 0.4062745651132354, 0.54428208899432917, 0.64454769931521172, 0.77095238595327864, 0.774274789459789, 0.79016487646599587, 0.75601770621117248, 0.74117069588380102, 0.74266597200233631, 0.74593757350402123, 0.74592154288837376, 0.74901175123319674, 0.76763352964640752, 0.77548328553531165, 0.80841796463437754, 0.8124400049589996, 0.86208182969937164, 0.91940901570644351, 0.91993277984114163, 0.92461240160369562, 0.92607252973889664, 0.92607252973889664, 0.93643998748095625, 0.94439476846794934, 0.95461995924598686, 0.95515282910572818, 0.95515282910572818, 0.95515282910572818, 0.95515282910572818, 0.95736748050649323, 0.95767726074071802, 0.95767848193917204, 0.96669770217231388, 0.97125181057128118, 0.97736500203004473, 0.97737564804597221, 0.97737564804597221, 0.97737564804597221, 0.97737830951532012, 0.97891986221180971, 0.97900211654923219, 0.97900211654923219, 0.98519875214898955, 0.99029490672924969, 0.9928718650012105, 0.9928718650012105, 0.9928718650012105, 0.9928718650012105])
fscore_all.append([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.19934897294870357, 0.24292873792498088, 0.40613569514748721, 0.53758471738794489, 0.62024900180613252, 0.74799841887490626, 0.75256271405608244, 0.77203531572628703, 0.79028848062269696, 0.82952305489525546, 0.83315255001942445, 0.84654209218956111, 0.84814903749394488, 0.85762355120253697, 0.88258985372760235, 0.88791060489656881, 0.90955643707176348, 0.91070067554129397, 0.91352122429935279, 0.91406869741913133, 0.91406869741913133, 0.91122043575210754, 0.91134542568140142, 0.90742274232270559, 0.90899672883013516, 0.92333131084545461, 0.93984466889704843, 0.93997190010633469, 0.93997190010633469, 0.93997190010633469, 0.94010679751767934, 0.94282001900756207, 0.94282141639195183, 0.94643569636755087, 0.96027370176929394, 0.96165262188256972, 0.96398482820759102, 0.96461253314931006, 0.96461253314931006, 0.96461253314931006, 0.96461928350182236, 0.96469386570102877, 0.96469386570102877, 0.96484607536057432, 0.97230949304830727, 0.97853974564261137, 0.98714429289679906, 0.98714429289679906, 0.98714429289679906, 0.98714429289679906])
fscore_all.append([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.19334436749299577, 0.23311130756754689, 0.39654885266411838, 0.5049680586710843, 0.60799479141106305, 0.727623922126859, 0.72879815724921726, 0.74770291018955493, 0.76847135708043179, 0.80756085502164221, 0.81754754772449445, 0.81960855965551938, 0.82280405751562569, 0.85670437987606962, 0.85960391583211437, 0.86786263824776, 0.92092098452626325, 0.92235210665551481, 0.92235495896806785, 0.92247826029409818, 0.92247780505422794, 0.92273671230670307, 0.92260301362956676, 0.92266656102891176, 0.93150966601082852, 0.94164442473147569, 0.93572350715831798, 0.93641732763567131, 0.93641732763567131, 0.93641732763567131, 0.93712073333025037, 0.93712892526737923, 0.93734413124005045, 0.93753580925709445, 0.94834220977126504, 0.94816880379976576, 0.95104972205698346, 0.94800053685441654, 0.94800053685441654, 0.94800053685441654, 0.94821991577106313, 0.9519236404312299, 0.95193432247116294, 0.95194052059764767, 0.95696911571150178, 0.9679383839191521, 0.97308946650773986, 0.97586545470192687, 0.97586545470192687, 0.97586545470192687])
fscore_all.append([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.18989951225245341, 0.2531154381397524, 0.3673822992805425, 0.47637115841229777, 0.59212327031273759, 0.70764693490625663, 0.71358926824522639, 0.72561176465881339, 0.74555173690332111, 0.73240773880537657, 0.72401105746768424, 0.71970783366081625, 0.74124848514827002, 0.76826963206117971, 0.78427351790277033, 0.79046301530161944, 0.83746396113229737, 0.84396030881503836, 0.84396030881503836, 0.89527047197801868, 0.91387605599491906, 0.92116250188260906, 0.95150217496498479, 0.95132272564124498, 0.95304795720009017, 0.96725397032279981, 0.96725397032279981, 0.96737276825921925, 0.96737276825921925, 0.96737276825921925, 0.96737276825921925, 0.96787558228793158, 0.96828648197662581, 0.96853855540858103, 0.97675870536297382, 0.98370129257477112, 0.97817003163197558, 0.97875798903579336, 0.97875798903579336, 0.97875798903579336, 0.97875798903579336, 0.97867823831092893, 0.97868222859204257, 0.97870833148848946, 0.98038746586919778, 0.98427340245721562, 0.99217081664042084, 0.99163064760911224, 0.99163064760911224, 0.99163064760911224])
from ml.datasets.blackOak import BlackOakDataSet
from ml.tools.nadeef_repair.NadeefMe import NadeefMe

# Build the NadeefMe tool from a fresh BlackOak dataset and a hard-coded CSV path.
tool = NadeefMe(BlackOakDataSet(), "/home/felix/SequentialPatternErrorDetection/nadeef_repair/blackoak_audit/blackoak_nadeef_new.csv")

# Report the tool's overall scores. A parenthesised single-argument print
# behaves identically under the Python 2 print statement and the Python 3
# print function, so these lines run on both.
print("Fscore: " + str(tool.calculate_total_fscore()))
print("Precision: " + str(tool.calculate_total_precision()))
print("Recall: " + str(tool.calculate_total_recall()))
 def __init__(self, blackOakDataSet=None):
     """Initialise the parent class for the tool named "Katara".

     :param blackOakDataSet: dataset forwarded to the parent constructor.
         When omitted (``None``), a fresh ``BlackOakDataSet()`` is built
         for this call.
     """
     # A call placed in the default-argument position is evaluated only
     # once, when the method is defined, and the resulting object is then
     # shared by every instance. Building it here avoids both the shared
     # state and the definition-time load.
     if blackOakDataSet is None:
         blackOakDataSet = BlackOakDataSet()

     # Hard-coded locations of the two files passed to the parent.
     path_to_tool_detected = "/home/felix/BlackOak/List_A/ToolOutputDetectedCells/katara.txt"
     path_to_tool_correct_detected = "/home/felix/BlackOak/List_A/ToolOutputCorrectCells/katara.txt_CorrectCells.csv"
     super(KataraOnBlackOak,
           self).__init__("Katara", blackOakDataSet, path_to_tool_detected,
                          path_to_tool_correct_detected)
        for key, value in min_certainties.iteritems():
            if min_certainty > value:
                min_certainty = value
                min_certainty_index = key

        print min_certainty_index

        return min_certainty_index


# ---- input ----

# Start the clock before the dataset is constructed.
start_time = time.time()

# Dataset under test. Alternatives that were swapped in here before:
#   FlightLarysa()
#   FlightHoloClean()   (needs: from ml.flights.FlightHoloClean import FlightHoloClean)
#   HospitalHoloClean()
#   IQContest()
dataSet = BlackOakDataSet()

# Report how long constructing the dataset took.
print("read: %s seconds ---" % (time.time() - start_time,))

# Restart the clock for the code that follows.
start_time = time.time()

# Run settings.
# NOTE(review): their consumers are outside this part of the file; confirm
# the meaning of each flag there.
train_fraction, ngrams = 1.0, 2
runSVD, replace = True, True
svd_dimensions = 100
# The table names used below are built from the upper-cased name.
name = name.upper()

# Create table temp_result_<name> holding, for table 'TB_<name>', the audit
# rows whose time equals the minimum time of their
# (tupleid, attribute, tablename) group.
# NOTE(review): `name` is concatenated straight into the SQL text. Identifiers
# cannot be bound as query parameters, so make sure `name` never comes from
# untrusted input.
query = ("CREATE TABLE temp_result_" + name + " AS (Select tbl.* From audit tbl\n" +
         "Inner Join\n" +
         "(\n" +
         "  Select tupleid,attribute,tablename,MIN(time) MinPoint From audit Group By tupleid,attribute," +
         "tablename\n" +
         ")tbl1\n" +
         "On tbl1.tupleid=tbl.tupleid and tbl1.attribute=tbl.attribute and tbl1.tablename=tbl.tablename\n" +
         "Where tbl.tablename = 'TB_" + name + "' and tbl1.MinPoint=tbl.time);")

cursor.execute(query)
# The cursor stays open on purpose: copy_to below still needs it.
#cursor.close()
connection.commit()

result_path = "/tmp/nadeef_result.csv"

# Export the temporary table as comma-separated text. The context manager
# closes the file even when copy_to raises, so the handle cannot leak.
with open(result_path, 'w') as result_file:
    cursor.copy_to(result_file, "temp_result_" + name, sep=",")

from ml.tools.nadeef_repair.NadeefMe import NadeefMe
from ml.datasets.blackOak import BlackOakDataSet

# Evaluate the exported result against a fresh BlackOak dataset.
tool = NadeefMe(BlackOakDataSet(), result_path)

# A parenthesised single-argument print works on both Python 2 and Python 3.
print("Fscore: " + str(tool.calculate_total_fscore()))
print("Precision: " + str(tool.calculate_total_precision()))
print("Recall: " + str(tool.calculate_total_recall()))
Example #11
0
                with open(log_file, "a") as myfile:
                    myfile.write(
                        str(rule) + ", " + str(runtime) + ", " +
                        str(cur_precision) + ", " + str(cur_recall) + ", " +
                        str(cur_fscore) + "\n")

            #clean up
            self.clean_up(connection, name, cursor, result_path)

        #final clean up
        self.clean_up_end(connection, name, cursor, csv_path)

        print "time: " + str(time_list)
        print "fscore: " + str(fscore)
        print "precision: " + str(precision)
        print "recall: " + str(recall)


if __name__ == '__main__':

    # Load the dataset once and pass that same instance on, instead of
    # constructing a second BlackOakDataSet() and leaving this one unused.
    data = BlackOakDataSet()

    # Two FD rules, both keyed on the "ZIP" column
    # (presumably functional dependencies ZIP -> City and ZIP -> State).
    rules = [
        FD(Set(["ZIP"]), "City"),
        FD(Set(["ZIP"]), "State"),
    ]

    nadeef = NadeefAll(data, rules)

    #data = HospitalHoloClean()
    #nadeef_repair = NadeefAll(BlackOakDataSet())
Example #12
0
 def __init__(self, blackOakDataSet=None):
     """Initialise the parent class for the tool named "AddressCleaner".

     :param blackOakDataSet: dataset forwarded to the parent constructor.
         When omitted (``None``), a fresh ``BlackOakDataSet()`` is built
         for this call.
     """
     # A call placed in the default-argument position is evaluated only
     # once, when the method is defined, and the resulting object is then
     # shared by every instance. Building it here avoids both the shared
     # state and the definition-time load.
     if blackOakDataSet is None:
         blackOakDataSet = BlackOakDataSet()

     # Hard-coded locations of the two CSV files passed to the parent.
     path_to_tool_detected = "/home/felix/BlackOak/List_A/ToolOutputDetectedCells/AddressCleaner_detectedCells.csv"
     path_to_tool_correct_detected = "/home/felix/BlackOak/List_A/ToolOutputCorrectCells/AddressCleaner_CorrectCells.csv"
     super(AddressCleanerOnBlackOak, self).__init__("AddressCleaner", blackOakDataSet,
                                                    path_to_tool_detected,
                                                    path_to_tool_correct_detected)