def test_extract_with_seed(self):
    """Extract records from ``fragment0`` using a seed record built from ``htmlpage0``.

    Checks the size of the returned seed record, the number of mapped
    records, and the first/last ``datePublished`` values found.
    """
    mdr = MDR()
    page = get_page('htmlpage0')
    candidates, doc = mdr.list_candidates(page, 'utf8')

    # we know the first element can be used as the seed
    seed_record = Record(candidates[0][0])

    fragment = fragment_fromstring(get_page('fragment0'))
    seed_record_copy, mappings = mdr.extract(fragment, seed_record)

    # the record only has 1 <li> element
    self.assertEqual(1, len(seed_record_copy))
    # 40 items (records)
    self.assertEqual(40, len(mappings))

    extracted_dates = [
        v.attrib.get('content')
        for mapping in mappings.values()
        for k, v in mapping.items()
        if k.attrib.get('itemprop') == 'datePublished'
    ]

    # the first/last checks rely on the mappings iterating in a stable order
    self.assertEqual(extracted_dates[0], '2014-07-02')
    self.assertEqual(extracted_dates[-1], '2014-05-18')
def test_extract_with_seed(self):
    """Extract records from ``fragment0`` using a seed record built from ``htmlpage0``.

    Checks the size of the returned seed record, the number of mapped
    records, and the first/last ``datePublished`` values found.
    """
    mdr = MDR()
    page = get_page('htmlpage0')
    candidates, doc = mdr.list_candidates(page, 'utf8')

    # we know the first element can be used as the seed
    seed_record = Record(candidates[0][0])

    fragment = fragment_fromstring(get_page('fragment0'))
    seed_record_copy, mappings = mdr.extract(fragment, seed_record)

    # the record only has 1 <li> element
    self.assertEqual(1, len(seed_record_copy))
    # 40 items (records)
    self.assertEqual(40, len(mappings))

    extracted_dates = []
    # .values()/.items() exist on both Python 2 and 3, unlike .iteritems()
    for mapping in mappings.values():
        for k, v in mapping.items():
            if k.attrib.get('itemprop') == 'datePublished':
                extracted_dates.append(v.attrib.get('content'))

    # the first/last checks rely on the mappings iterating in a stable order
    self.assertEqual(extracted_dates[0], '2014-07-02')
    self.assertEqual(extracted_dates[-1], '2014-05-18')
def test_extract_with_seed2(self):
    """Extract records from ``fragment1`` using a two-element seed record from ``htmlpage1``.

    Checks the seed record shape, the number of mapped records, and the
    first/last review dates and description texts.
    """
    mdr = MDR()
    page1 = get_page('htmlpage1')
    candidates, doc = mdr.list_candidates(page1, 'utf8')

    # the seed is built from the 2nd and 3rd children of the first candidate
    seed_record = Record(candidates[0][1], candidates[0][2])

    fragment1 = fragment_fromstring(get_page('fragment1'))
    seed_record_copy, mappings = mdr.extract(fragment1, seed_record)

    self.assertEqual(2, len(seed_record_copy))
    self.assertEqual('hreview', seed_record_copy[1].attrib.get('class'))
    # 27 items (records)
    self.assertEqual(27, len(mappings))

    extracted_dates = []
    extracted_texts = []
    for mapping in mappings.values():
        for k, v in mapping.items():
            css_class = k.attrib.get('class')
            if css_class == 'dtreviewed':
                extracted_dates.append(v.text)
            elif css_class == 'description':
                extracted_texts.append(v.text)

    # extracted items are sorted in original order
    self.assertEqual(extracted_dates[0], '27-05-2014')
    self.assertEqual(extracted_dates[-1], '07-07-2013')
    self.assertEqual(extracted_texts[0], 'Kwaliteit van het eten matig')
    self.assertEqual(
        extracted_texts[-1],
        'Paviljoen Strand 90 te Domburg is een uiterst sfeervol restaurant. De inrichting is smaakvol met mooie kleuren. De bediening is vriendelijk en behulpzaam. Het eten was lekker. Kortom, we zullen er zeker terug komen.'
    )
def test_extract_with_seed2(self):
    """Extract records from ``fragment1`` using a two-element seed record from ``htmlpage1``.

    Checks the seed record shape, the number of mapped records, and the
    first/last review dates and description texts.
    """
    mdr = MDR()
    page1 = get_page('htmlpage1')
    candidates, doc = mdr.list_candidates(page1, 'utf8')

    # the seed is built from the 2nd and 3rd children of the first candidate
    seed_record = Record(candidates[0][1], candidates[0][2])

    fragment1 = fragment_fromstring(get_page('fragment1'))
    seed_record_copy, mappings = mdr.extract(fragment1, seed_record)

    self.assertEqual(2, len(seed_record_copy))
    self.assertEqual('hreview', seed_record_copy[1].attrib.get('class'))
    # 27 items (records)
    self.assertEqual(27, len(mappings))

    extracted_dates = []
    extracted_texts = []
    # .values()/.items() exist on both Python 2 and 3, unlike .iteritems()
    for mapping in mappings.values():
        for k, v in mapping.items():
            if k.attrib.get('class') == 'dtreviewed':
                extracted_dates.append(v.text)
            elif k.attrib.get('class') == 'description':
                extracted_texts.append(v.text)

    # extracted items are sorted in original order
    self.assertEqual(extracted_dates[0], '27-05-2014')
    self.assertEqual(extracted_dates[-1], '07-07-2013')
    self.assertEqual(extracted_texts[0], 'Kwaliteit van het eten matig')
    self.assertEqual(
        extracted_texts[-1],
        'Paviljoen Strand 90 te Domburg is een uiterst sfeervol restaurant. De inrichting is smaakvol met mooie kleuren. De bediening is vriendelijk en behulpzaam. Het eten was lekker. Kortom, we zullen er zeker terug komen.'
    )
def test_mdr_fit():
    """Check that ``MDR.fit`` builds the expected per-instance class counts
    and the expected feature-instance -> label map."""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    classes = np.array([1] * 10 + [0] * 5)

    mdr = MDR()
    mdr.fit(features, classes)

    # four distinct feature instances were seen during fitting
    assert len(mdr.class_count_matrix) == 4
    assert len(mdr.feature_map) == 4

    # (feature instance, class label, expected count)
    expected_counts = [
        ((2, 0), 1, 1),
        ((0, 0), 0, 3),
        ((0, 0), 1, 6),
        ((1, 1), 0, 2),
        ((0, 1), 1, 3),
    ]
    for instance, label, count in expected_counts:
        assert mdr.class_count_matrix[instance][label] == count

    # (feature instance, expected assigned label)
    expected_labels = [
        ((2, 0), 1),
        ((0, 0), 1),
        ((1, 1), 0),
        ((0, 1), 1),
    ]
    for instance, label in expected_labels:
        assert mdr.feature_map[instance] == label
def test_detect(self):
    """Check the first candidate reported by ``list_candidates`` for both sample pages."""
    mdr = MDR()

    # (page fixture, first three arguments handed to assert_element)
    expectations = [
        ('htmlpage0', ('ul', "ylist ylist-bordered reviews", '')),
        ('htmlpage1', ('div', "tab-pane fade in active", 'reviews')),
    ]
    for page_name, (first, second, third) in expectations:
        html = get_page(page_name)
        candidates, doc = mdr.list_candidates(html, 'utf8')
        assert_element(first, second, third, candidates[0])
def test_mdr_fit_transform():
    """Check that ``MDR.fit_transform`` fits on the data and returns the
    expected single-column array of predicted labels."""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    classes = np.array([1] * 10 + [0] * 5)

    expected = [[1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [0], [0]]

    new_features = MDR().fit_transform(features, classes)
    assert np.array_equal(new_features, expected)
def test_mdr_init():
    """Ensure that the MDR constructor stores its parameters and starts unfitted.

    Both the default-constructed and the explicitly-parameterised instance
    must expose ``tie_break``/``default_label`` as given and have no
    ``class_count_matrix``/``feature_map`` before ``fit`` is called.
    """
    mdr_obj = MDR()
    assert mdr_obj.tie_break == 1
    assert mdr_obj.default_label == 0
    assert mdr_obj.class_count_matrix is None
    assert mdr_obj.feature_map is None

    mdr_obj2 = MDR(tie_break=1, default_label=2)
    assert mdr_obj2.tie_break == 1
    assert mdr_obj2.default_label == 2
    # check the second instance itself (previously mdr_obj was re-checked here)
    assert mdr_obj2.class_count_matrix is None
    assert mdr_obj2.feature_map is None
def test_extract(self):
    """Run seedless ``MDR.extract`` on two detected candidates and one raw fragment.

    For each input, checks the size (and, where relevant, the classes) of
    the discovered seed record and the number of mapped records.
    """
    mdr = MDR()

    page = get_page('htmlpage0')
    candidates, doc = mdr.list_candidates(page, 'utf8')
    seed_record, mappings = mdr.extract(candidates[0])
    # the record only has 1 <li> element
    self.assertEqual(1, len(seed_record))
    # div is the top element of <li>, and there are 40 items in total
    self.assertEqual(40, len(mappings))

    page1 = get_page('htmlpage1')
    candidates, doc = mdr.list_candidates(page1, 'utf8')
    seed_record, mappings = mdr.extract(candidates[0])
    # the record has 2 elements: <div class='divider-horizontal'> and <div class='hreview'>
    self.assertEqual(2, len(seed_record))
    self.assertEqual('divider-horizontal', seed_record[0].attrib.get('class'))
    self.assertEqual('hreview', seed_record[1].attrib.get('class'))
    self.assertEqual(30, len(mappings))

    fragment2 = fragment_fromstring(get_page('fragment2'))
    seed_record, mappings = mdr.extract(fragment2)
    # the record has 2 elements: <div class='row'> and <div class='row'>
    self.assertEqual(2, len(seed_record))
    self.assertEqual('row', seed_record[0].attrib.get('class'))
    self.assertEqual(7, len(mappings))
def test_fit():
    """Check that ``MDR.fit`` records the label set, the class fraction, the
    per-instance class counts and the feature-instance -> label map."""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    classes = np.array([1] * 10 + [0] * 5)

    mdr = MDR()
    mdr.fit(features, classes)

    assert len(mdr.unique_labels) == 2
    assert mdr.class_fraction == 1. / 3.
    # sizes are checked before any lookup below touches the count matrix
    assert len(mdr.class_count_matrix) == 4
    assert len(mdr.feature_map) == 4

    # feature instance -> (count for class 0, count for class 1);
    # (2, 2) never occurs in the training data and is expected to read as 0/0
    expected_counts = [
        ((2, 0), (0, 1)),
        ((0, 0), (3, 6)),
        ((1, 1), (2, 0)),
        ((0, 1), (0, 3)),
        ((2, 2), (0, 0)),
    ]
    for instance, per_class in expected_counts:
        for label, count in enumerate(per_class):
            assert mdr.class_count_matrix[instance][label] == count

    expected_labels = [
        ((2, 0), 1),
        ((0, 0), 0),
        ((1, 1), 0),
        ((0, 1), 1),
    ]
    for instance, label in expected_labels:
        assert mdr.feature_map[instance] == label
def test_mdr_transform():
    """Check that ``MDR.transform`` maps unseen feature rows to the expected
    single-column array of labels after fitting."""
    train_rows = [
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ]
    features = np.array(train_rows)
    classes = np.array([1] * 10 + [0] * 5)

    mdr = MDR()
    mdr.fit(features, classes)

    test_features = np.array([
        [2, 2], [1, 1], [0, 0], [0, 0], [0, 0],
        [0, 0], [1, 1], [0, 0], [0, 0], [0, 0],
        [0, 1], [1, 0], [0, 0], [1, 0], [0, 0],
    ])
    expected = [[0], [0], [1], [1], [1], [1], [0], [1], [1], [1], [1], [0], [1], [0], [1]]

    new_features = mdr.transform(test_features)
    assert np.array_equal(new_features, expected)
def test_cluster(self):
    """Check how many distinct cluster labels ``hcluster`` yields for each sample page.

    The similarity matrix is computed from the first candidate of each page.
    """
    mdr = MDR()

    page = get_page('htmlpage0')
    candidates, doc = mdr.list_candidates(page, 'utf8')
    m = mdr.calculate_similarity_matrix(candidates[0])
    self.assertEqual(1, len(set(mdr.hcluster(m))))

    page1 = get_page('htmlpage1')
    candidates, doc = mdr.list_candidates(page1, 'utf8')
    m = mdr.calculate_similarity_matrix(candidates[0])
    # the first element is different from the rest
    self.assertEqual(3, len(set(mdr.hcluster(m))))
def test_custom_score():
    """Check that ``MDR.score`` applies a user-supplied scoring function and
    forwards extra keyword arguments to it."""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    classes = np.array([1] * 10 + [0] * 5)

    mdr = MDR()
    mdr.fit(features, classes)

    # the same data is scored each time; only the metric changes
    data = dict(features=features, classes=classes)

    accuracy = mdr.score(scoring_function=accuracy_score, **data)
    assert accuracy == 9./15

    loss_fraction = mdr.score(scoring_function=zero_one_loss, **data)
    assert loss_fraction == 1 - 9./15

    loss_count = mdr.score(scoring_function=zero_one_loss, normalize=False, **data)
    assert loss_count == 15 - 9
def test_mdr_fit_raise_ValueError():
    """Check that ``MDR.fit`` raises ``ValueError`` when the labels are not
    binary: once with three distinct labels, once with a single label."""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    mdr = MDR()

    def fit_raises_value_error(labels):
        # True only if fitting fails with ValueError; other errors propagate
        try:
            mdr.fit(features, labels)
        except ValueError:
            return True
        return False

    three_labels = np.array([1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    assert fit_raises_value_error(three_labels)

    single_label = np.array([1] * 15)
    assert fit_raises_value_error(single_label)
def test_transform():
    """Check that ``MDR.transform`` maps unseen feature rows to the expected
    flat array of labels after fitting."""
    train_rows = [
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ]
    features = np.array(train_rows)
    classes = np.array([1] * 10 + [0] * 5)

    mdr = MDR()
    mdr.fit(features, classes)

    test_features = np.array([
        [2, 2], [1, 1], [0, 0], [0, 0], [0, 0],
        [0, 0], [1, 1], [0, 0], [0, 0], [0, 0],
        [0, 1], [1, 0], [0, 0], [1, 0], [0, 0],
    ])
    expected = [0,0,0,0,0,0,0,0,0,0,1,0,0,0,0]

    new_features = mdr.transform(test_features)
    assert np.array_equal(new_features, expected)
def test_mdr_sklearn_pipeline():
    """Check that MDR works as the transformer step of a scikit-learn pipeline
    by cross-validating an MDR + logistic-regression pipeline."""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    classes = np.array([1] * 10 + [0] * 5)

    pipeline = make_pipeline(MDR(), LogisticRegression())
    # shuffled, unseeded folds: only a loose "> 0" check is made on the result
    folds = StratifiedKFold(n_splits=5, shuffle=True)
    cv_scores = cross_val_score(pipeline, features, classes, cv=folds)

    assert np.mean(cv_scores) > 0.
def test_fit_transform():
    """Check that ``MDR.fit_transform`` fits on the data and returns the
    expected flat array of predicted labels."""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    classes = np.array([1] * 10 + [0] * 5)

    expected = [1,0,1,0,0,0,1,0,0,1,0,0,0,0,0]

    new_features = MDR().fit_transform(features, classes)
    assert np.array_equal(new_features, expected)
def test_score():
    """Check the value ``MDR.score`` returns when no scoring function is given."""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    classes = np.array([1] * 10 + [0] * 5)

    mdr = MDR()
    mdr.fit(features, classes)

    default_score = mdr.score(features, classes)
    assert default_score == 9./15
def __init__(self, name, loc=None, value=None, otype=None):
    """Define the properties of a node from its operator name.

    Sets ``name``, ``arity`` (number of float ``'f'`` and boolean ``'b'``
    arguments, 0 when the operator is not listed), ``in_type``, ``out_type``
    (``otype`` if given, otherwise looked up by name), ``loc`` and ``value``.
    Names containing ``'mdr'`` additionally get an ``MDR`` model and
    ``run_MDR`` as their ``evaluate`` attribute.

    Raises ``KeyError`` if ``name`` has no entry in the input-type table
    (or in the output-type table when ``otype`` is None).
    """
    # number of float arguments per operator
    float_arity = {
        'sin': 1, 'cos': 1, 'exp': 1, 'log': 1, '^2': 1, '^3': 1,
        'sqrt': 1, 'if': 1,
        'ife': 2, '+': 2, '-': 2, '*': 2, '/': 2, '>_f': 2, '<_f': 2,
        '>=_f': 2, '<=_f': 2, 'xor_f': 2, 'mdr2': 2,
    }
    # number of boolean arguments per operator
    bool_arity = {
        '!': 1, 'if': 1, 'ife': 1,
        '&': 2, '|': 2, '==': 2, '>_b': 2, '<_b': 2, '>=_b': 2,
        '<=_b': 2, 'xor_b': 2,
    }
    input_types = {
        # float operations
        '+': 'f', '-': 'f', '*': 'f', '/': 'f',
        'sin': 'f', 'cos': 'f', 'exp': 'f', 'log': 'f',
        'x': None, 'k': None,
        '^2': 'f', '^3': 'f', 'sqrt': 'f',
        # bool operations
        '!': 'b', '&': 'b', '|': 'b', '==': 'b',
        '>_f': 'f', '<_f': 'f', '>=_f': 'f', '<=_f': 'f',
        '>_b': 'b', '<_b': 'b', '>=_b': 'b', '<=_b': 'b',
        'xor_b': 'b', 'xor_f': 'f',
        # mixed
        'mdr2': 'f', 'if': ('f', 'b'), 'ife': ('f', 'b'),
    }

    self.name = name
    self.arity = {
        None: 0,
        'f': float_arity.get(name, 0),
        'b': bool_arity.get(name, 0),
    }
    self.in_type = input_types[name]

    if otype is not None:
        self.out_type = otype
    else:
        default_output_types = {
            # float operations
            '+': 'f', '-': 'f', '*': 'f', '/': 'f',
            'sin': 'f', 'cos': 'f', 'exp': 'f', 'log': 'f',
            'x': 'f', 'k': 'f',
            '^2': 'f', '^3': 'f', 'sqrt': 'f',
            # bool operations
            '!': 'b', '&': 'b', '|': 'b', '==': 'b',
            '>_f': 'b', '<_f': 'b', '>=_f': 'b', '<=_f': 'b',
            '>_b': 'b', '<_b': 'b', '>=_b': 'b', '<=_b': 'b',
            'xor_f': 'b', 'xor_b': 'b',
            # mixed
            'mdr2': 'b', 'if': 'f', 'ife': 'f',
        }
        self.out_type = default_output_types[name]

    if 'mdr' in self.name:
        self.model = MDR()
        self.evaluate = run_MDR

    self.loc = loc
    self.value = value
a5000_01h, a5000_02h, a5000_04h ] dataset_names = [ 'a10_005h', 'a10_01h', 'a10_02h', 'a10_04h', 'a100_005h', 'a100_01h', 'a100_02h', 'a100_04h', 'a1000_005h', 'a1000_01h', 'a1000_02h', 'a1000_04h', 'a5000_005h', 'a5000_01h', 'a5000_02h', 'a5000_04h' ] output_txt = '/home/ansohn/Python/venvs/mdr/gametes_logs/target_scores.txt' with open(output_txt, 'w') as t1: for i in range(16): # print(dataset) dataset = gametes_all[i] dataset_name = dataset_names[i] load_dataset = pd.read_csv(dataset, sep='\t') phenotype = load_dataset['Class'].values individuals = load_dataset.drop('Class', axis=1) individuals = individuals[['M0P0', 'M0P1']].values for i in range(30): X_train, X_test, y_train, y_test = train_test_split( individuals, phenotype, train_size=0.75, test_size=0.25) target_pipeline = MDR() target_pipeline.fit(X_train, y_train) t1.write('{}\t{}\tmdr-perfect\n'.format( dataset_name, target_pipeline.score(X_test, y_test)))
# NOTE(review): this loop contains a `yield`, so it is the tail of a generator
# whose `def` line lies above this excerpt; its real nesting level cannot be
# recovered from here.
# For every cur_n-sized combination of column indices of X: fit a deep copy of
# mdr_instance on those columns, score it on the same data, and yield
# (fitted model, score, names of the selected features).
for features in itertools.combinations(range(X.shape[1]), cur_n):
    mdr_model = copy.deepcopy(mdr_instance)
    mdr_model.fit(X[:, features], y)
    mdr_model_score = mdr_model.score(X[:, features], y)
    model_features = [feature_names[feature] for feature in features]
    yield mdr_model, mdr_model_score, model_features

# Pre-process the training and testing features with _ekf (defined elsewhere).
xtr = _ekf(training_features, ekf_index=0)
xte = _ekf(testing_features, ekf_index=0)
#my_mdr_tr = MDR(tie_break_choice, default_label_choice)
#my_mdr_te = MDR(tie_break_choice, default_label_choice)
mymdr = MDR()
clf = GaussianNB()

# Flat lists collecting element [1] and element [2] of every result produced
# by n_way_models -- presumably the (model, score, feature names) tuples
# yielded by the generator above; confirm against its definition.
n_way_results = []
n_way_features = []
for nw in range(2,4):  # nw takes the values 2 and 3
    # subset_features = np.random.choice(training_features.columns, nw, replace=False)
    # training_features = training_features[subset_features]
    m1 = n_way_models(mymdr, xtr.values, training_classes, n=[nw], feature_names=list(xtr.columns))
    # materialise the results so they can be indexed below
    m2 = list(m1)
    for i in range(0, len(m2)):
        n_way_results.append( (m2[i])[1] )
        # n_way_results = tuple(n_way_results)
        n_way_features.append( (m2[i])[2] )
def _absolute_url(domain, link):
    """Return *link* unchanged if it starts with ``http``, else prefixed with *domain*."""
    if link[:4] != 'http':
        return domain + link
    return link


def _header_cells(template):
    """Return one ``<th>`` string per descendant of *template*.

    A column is labelled with the node's ``class`` attribute, or with
    ``_<tag>`` when the node has no class.
    """
    cells = []
    for node in template.iterdescendants():
        label = node.get('class')
        if label is None:
            label = "_%s" % (node.tag,)
        cells.append("<th>" + label + "</th>")
    return cells


def _cell_content(domain, template_node, matched):
    """Return the inner HTML of the table cell for one matched node.

    ``<a>`` columns become links (link text falls back to the URL),
    ``<img>`` columns become 100px-high images, anything else becomes the
    node's text. Missing ``href``/``src``/text yields no link/image/text
    instead of an error.
    """
    if template_node.tag == 'a':
        href = matched.get('href')
        if href is None:
            return matched.text or ""
        link_text = matched.text or href
        return '<a href="' + _absolute_url(domain, href) + '" >' + link_text + '</a>'
    if template_node.tag == 'img':
        src = matched.get('src')
        if src is None:
            return ""
        return '<img height="100" src="' + _absolute_url(domain, src) + '" >'
    return matched.text or ""


def extract(request):
    """Fetch ``?url=``, extract its repeated records with MDR and render them as a table.

    The table is written to ``templates/autoscrapper/output.html`` (one
    column per descendant of the seed record's first tree, one row per
    extracted record) and the client is redirected to ``/output/``.
    Redirects to ``index`` when no URL is given, when fetching/parsing the
    URL fails, or when MDR finds no candidate region.
    """
    import io

    url = request.GET.get('url')
    if not url:
        return redirect(index)

    mdr = MDR()
    # NOTE(review): `url` is user-supplied and fetched server-side without a
    # timeout or host restriction -- consider validating it.
    try:
        r = requests.get(url)
        parsed_uri = urlparse(url)
    except Exception:
        # any failure to fetch or parse the URL sends the user back to the form
        return redirect(index)

    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
    candidates, doc = mdr.list_candidates(r.text)
    if not candidates:
        return redirect(index)
    seed, mappings = mdr.extract(candidates[0])

    # NOTE(review): this path is resolved against the process working
    # directory, not against this module's directory -- confirm that is intended.
    rel_path = "templates/autoscrapper/output.html"

    template = seed.trees[0]

    # NOTE(review): scraped text and attribute values are inserted unescaped
    # into a file that is later rendered as a template; escaping should be
    # added before this is exposed to untrusted pages.
    parts = [
        "{% load static %}",
        "<html><h1>Extracted Data</h1>",
        '<link href="bootstrap.min.css" rel="stylesheet" >',
        """<link href="{% static 'bootstrap.min.css' %} " rel="stylesheet" >""",
        """<link href="{% static 'cover.css' %} " rel="stylesheet">""",
        '<table class="table table-bordered ">',
    ]
    parts.extend(_header_cells(template))

    for value in mappings.values():
        parts.append("<tr>")
        for node in template.iterdescendants():
            try:
                matched = value[node]
            except KeyError:
                # this record has no counterpart for the column: empty cell
                parts.append("<td></td>")
                continue
            parts.append("<td>" + _cell_content(domain, node, matched) + "</td>")
        parts.append("</tr>")

    parts.append("</table>")
    parts.append("</html>")

    # Build the whole page first and write it once, as UTF-8, so a failure
    # while rendering never leaves a half-written template behind.
    with io.open(rel_path, 'w', encoding='utf-8') as f:
        f.write(u"".join(parts))

    return redirect('/output/')