def main(): page_file_dir = '/Users/bamana/Documents/InferLink/workspace/memex/memexpython/input/_template_test' pageManager = PageManager() files = [f for f in os.listdir(page_file_dir) if os.path.isfile(os.path.join(page_file_dir, f))] for the_file in files: if the_file.startswith('.'): continue with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile: page_str = myfile.read().encode('utf-8') pageManager.addPage(the_file, page_str) pageManager.learnStripes() (list_markup, list_names) = pageManager.learnListMarkups() rule_set = pageManager.learnAllRules() (markup, names) = pageManager.rulesToMarkup(rule_set) for key in markup.keys(): if key in list_markup: markup[key].update(list_markup[key]) # print json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')) rule_set = pageManager.learnRulesFromMarkup(list_markup) print json.dumps(json.loads(rule_set.toJson()), sort_keys=True, indent=2, separators=(',', ': '))
def induce_template(self, cluster_members):
    """Induce a template from one cluster of pages.

    cluster_members: iterable of page identifiers known to this object's
    page manager. Builds a fresh PageManager containing only those pages,
    learns its stripes, and returns them.
    """
    sub_page_mgr = PageManager()
    # renamed loop variable: the original used 'id', shadowing the builtin
    for member_id in cluster_members:
        curr_page = self.__page_manager.getPage(member_id)
        sub_page_mgr.addPage(member_id, curr_page.string)
    sub_page_mgr.learnStripes()
    return sub_page_mgr.getStripes()
def do_learning():
    """Flask endpoint: learn extraction rules from a project's saved markup.

    Reads learning/markup.json from the posted project folder, loads every
    page listed under __URLS__, learns stripes and rules, writes the rules
    to learning/rules.json and returns them as JSON. Non-POST requests 404.
    """
    if request.method == 'POST':
        data = request.get_json(force=True)
        project_folder = data['project_folder']
        directory = os.path.join(app.static_folder, 'project_folders', project_folder)
        markup_file = os.path.join(directory, 'learning', 'markup.json')
        with codecs.open(markup_file, "r", "utf-8") as fp:
            json_str = fp.read().encode('utf-8')
        markup = json.loads(json_str)
        page_manager = PageManager()
        for key in markup['__URLS__']:
            page_file = os.path.join(directory, key)
            with codecs.open(page_file, "r", "utf-8") as fp:
                page_str = fp.read().encode('utf-8')
            page_manager.addPage(key, page_str)
        # strip the bookkeeping keys before learning
        markup.pop("__SCHEMA__", None)
        markup.pop("__URLS__", None)
        page_manager.learnStripes(markup)
        rule_set = page_manager.learnRulesFromMarkup(markup)
        rules_file = os.path.join(directory, 'learning', 'rules.json')
        with codecs.open(rules_file, "w", "utf-8") as outfile:
            outfile.write(json.dumps(json.loads(rule_set.toJson()),
                                     sort_keys=True, indent=2, separators=(',', ': ')))
            outfile.close()
        return jsonify(rules=json.loads(rule_set.toJson()))
    abort(404)
def main(argv=None): if argv is None: argv = sys.argv try: try: opts, args = getopt.getopt(argv[1:], "dh", ["debug", "help"]) write_debug_files = False for opt in opts: if opt in [('-d', ''), ('--debug', '')]: write_debug_files = True if opt in [('-h', ''), ('--help', '')]: raise Usage('python -m learning.RuleLearner [OPTIONAL_PARAMS] [TEST_FILES_FOLDER] [MARKUP_FILE]\n\t[OPTIONAL_PARAMS]: -d to get debug stripe html files') except getopt.error, msg: raise Usage(msg) logger.info('Running RuleLearner with file at %s for rules %s', args[0], args[1]) #read the directory location from arg0 page_file_dir = args[0] pageManager = PageManager(write_debug_files) start_time = time.time() for subdir, dirs, files in os.walk(page_file_dir): for the_file in files: if the_file.startswith('.'): continue with codecs.open(os.path.join(subdir, the_file), "r", "utf-8") as myfile: page_str = myfile.read().encode('utf-8') pageManager.addPage(the_file, page_str) logger.info("--- LOAD PAGES: %s seconds ---" % (time.time() - start_time)) #Read the markups from a file... start_time = time.time() markups_file = args[1] with codecs.open(markups_file, "r", "utf-8") as myfile: markup_str = myfile.read().encode('utf-8') markups = json.loads(markup_str) markups.pop("__SCHEMA__", None) markups.pop("__URLS__", None) logger.info("--- LOAD MARKUPS: %s seconds ---" % (time.time() - start_time)) pageManager.learnStripes(markups) start_time = time.time() rule_set = pageManager.learnRulesFromMarkup(markups) logger.info("--- LEARN RULES FROM MARKUP: %s seconds ---" % (time.time() - start_time)) if(len(args) > 2): output_file = args[2] with codecs.open(output_file, "w", "utf-8") as myfile: myfile.write(rule_set.toJson()) myfile.close() else: print rule_set.toJson()
def run(page_file_dir, ignore_files=None):
    """Learn extraction rules from a set of pages.

    page_file_dir: either a file containing one URL per line (pages are
    fetched over HTTP) or a directory of saved page files.
    ignore_files: optional collection of file names to skip (in addition to
    hidden files, markup.json and rules.json).
    Returns a rule set with bad rules removed against the loaded pages.
    """
    # Fixed mutable default argument: a shared [] default persists across calls.
    if ignore_files is None:
        ignore_files = []
    test_pages = []
    # NOTE(review): write_debug_files is not defined here -- presumably a
    # module-level flag; confirm it exists at this scope.
    pageManager = PageManager(write_debug_files)
    if os.path.isfile(page_file_dir):
        # Input is a file of URLs: fetch each page.
        with open(page_file_dir) as f:
            urls = f.readlines()
        for url in urls:
            page_url = url.strip()
            req = urllib2.urlopen(page_url)
            page_contents = req.read()
            charset = chardet.detect(page_contents)
            page_encoding = charset['encoding']
            page_str = page_contents.decode(page_encoding).encode('utf-8')
            # NOTE(review): the raw page_contents is added, not the utf-8
            # re-encoded page_str computed above -- confirm this is intended.
            pageManager.addPage(page_url, page_contents)
            test_pages.append(page_url)
    else:
        # Input is a directory of page files.
        files = [f for f in os.listdir(page_file_dir)
                 if os.path.isfile(os.path.join(page_file_dir, f))]
        for the_file in files:
            if (the_file.startswith('.') or the_file == 'markup.json'
                    or the_file == 'rules.json' or the_file in ignore_files):
                continue
            with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile:
                page_str = myfile.read().encode('utf-8')
                pageManager.addPage(the_file, page_str)
                test_pages.append(the_file)
    pageManager.learnStripes()
    rule_set = pageManager.learnAllRules()
    rule_set.removeBadRules(test_pages)
    return rule_set
def main(argv=None): if argv is None: argv = sys.argv try: try: opts, args = getopt.getopt(argv[1:], "h", ["help"]) except getopt.error, msg: raise Usage(msg) #read the directory location from arg0 page_file_dir = args[0] pageManager = PageManager() page_str_array = [] for subdir, dirs, files in os.walk(page_file_dir): for the_file in files: if the_file.startswith('.'): continue with codecs.open(os.path.join(subdir, the_file), "r", "utf-8") as myfile: page_str = myfile.read().encode('utf-8') page_str_array.append(page_str) pageManager.addPage(the_file, page_str) pageManager.learnStripes() #Read the markups from a file... markups_file = args[1] with codecs.open(os.path.join('', markups_file), "r", "utf-8") as myfile: markup_str = myfile.read().encode('utf-8') markups = json.loads(markup_str) markups.pop("__SCHEMA__", None) #Before we learn the stripes let's make sure we can open the output file pageManager.learnStripes(markups) rule_set = pageManager.learnRulesFromMarkup(markups) if(len(args) > 2): output_file = args[2] with codecs.open(output_file, "w", "utf-8") as myfile: myfile.write(rule_set.toJson()) myfile.close() #testing flatten = False extraction_list = rule_set.extract(page_str_array[0]) if rule_set.validate(extraction_list): if flatten: print json.dumps(Landmark.flattenResult(extraction_list), sort_keys=True, indent=2, separators=(',', ': ')) else: print json.dumps(extraction_list, sort_keys=True, indent=2, separators=(',', ': '))
def learn_list_extractors(self, pages):
    """Learn list extractors across a set of pages.

    pages: dict mapping page id -> page content string.
    Returns (rules, markup): the learned rule set (each list rule renamed to
    a unique '_div_listNNNN' and given sub-rules learned from its rows) and
    the per-page markup keyed by those names.
    """
    page_mgr = PageManager()  # write_debug_files=True for stripe debug output
    markup = {}
    for page in pages:
        page_content = pages[page]
        page_mgr.addPage(page, page_content)
        markup[page] = self.lists_on_single_page(page_content)

    page_mgr.learnStripes(markups=markup)
    rules = page_mgr.learnRulesFromMarkup(markup)

    # For each list rule, build a small page manager out of its row extracts
    # so we can learn sub-rules for the fields inside each row.
    sublist_page_managers = {}
    for page in markup:
        for rule_name in markup[page]:
            if rule_name not in sublist_page_managers:
                sublist_page_managers[rule_name] = PageManager()
            for rid in range(len(markup[page][rule_name]['sequence'])):
                row = markup[page][rule_name]['sequence'][rid]
                sublist_page_managers[rule_name].addPage(page + "html%d" % rid, row['extract'])

    sublist_sub_rules = {}
    for sublist in sublist_page_managers:
        sublist_page_managers[sublist].learnStripes()
        sub_rules = sublist_page_managers[sublist].learnAllRules(in_list=True)
        sublist_sub_rules[sublist] = sub_rules  # keyed by the rule name in rules

    count = 1
    for rule in rules.rules:
        rule.set_sub_rules(sublist_sub_rules[rule.name])
        list_name = '_div_list' + format(count, '04')
        for page_id in markup:
            if rule.name in markup[page_id]:
                markup[page_id][list_name] = markup[page_id].pop(rule.name)
        rule.name = list_name
        # Fixed: count was never incremented, so every rule was renamed
        # '_div_list0001' and the markup entries collided.
        count += 1

    return rules, markup
def main(argv=None): if argv is None: argv = sys.argv try: try: opts, args = getopt.getopt(argv[1:], "dh", ["debug", "help"]) write_debug_files = False for opt in opts: if opt in [('-d', ''), ('--debug', '')]: write_debug_files = True if opt in [('-h', ''), ('--help', '')]: raise Usage('python -m learning.RuleLearnerAllSlots [OPTIONAL_PARAMS] [TEST_FILES_FOLDER] \n\t[OPTIONAL_PARAMS]: -d to get debug stripe html files') except getopt.error, msg: raise Usage(msg) logger.info('Running RuleLearnerAllSlots All Slots with files at %s', args[0]) #read the directory location from arg0 page_file_dir = args[0] pageManager = PageManager(write_debug_files) if os.path.isfile(page_file_dir): with open(page_file_dir) as f: urls = f.readlines() for url in urls: page_url = url.strip() req = urllib2.urlopen(page_url) page_contents = req.read() charset = chardet.detect(page_contents) page_encoding = charset['encoding'] page_str = page_contents.decode(page_encoding).encode('utf-8') pageManager.addPage(page_url, page_contents) else: files = [f for f in os.listdir(page_file_dir) if os.path.isfile(os.path.join(page_file_dir, f))] for the_file in files: if the_file.startswith('.'): continue with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile: page_str = myfile.read().encode('utf-8') pageManager.addPage(the_file, page_str) pageManager.learnStripes() rule_set = pageManager.learnAllRules() print json.dumps(json.loads(rule_set.toJson()), sort_keys=True, indent=2, separators=(',', ': '))
def run(page_file_dir, ignore_files=None):
    """Learn extraction rules from a set of pages.

    page_file_dir: either a file containing one URL per line (pages are
    fetched over HTTP) or a directory of saved page files.
    ignore_files: optional collection of file names to skip (in addition to
    hidden files, markup.json and rules.json).
    Returns a rule set with bad rules removed against the loaded pages.
    """
    # Fixed mutable default argument: a shared [] default persists across calls.
    if ignore_files is None:
        ignore_files = []
    test_pages = []
    # NOTE(review): write_debug_files is not defined here -- presumably a
    # module-level flag; confirm it exists at this scope.
    pageManager = PageManager(write_debug_files)
    if os.path.isfile(page_file_dir):
        # Input is a file of URLs: fetch each page.
        with open(page_file_dir) as f:
            urls = f.readlines()
        for url in urls:
            page_url = url.strip()
            req = urllib2.urlopen(page_url)
            page_contents = req.read()
            charset = chardet.detect(page_contents)
            page_encoding = charset['encoding']
            page_str = page_contents.decode(page_encoding).encode('utf-8')
            # NOTE(review): the raw page_contents is added, not the utf-8
            # re-encoded page_str computed above -- confirm this is intended.
            pageManager.addPage(page_url, page_contents)
            test_pages.append(page_url)
    else:
        # Input is a directory of page files.
        files = [f for f in os.listdir(page_file_dir)
                 if os.path.isfile(os.path.join(page_file_dir, f))]
        for the_file in files:
            if (the_file.startswith('.') or the_file == 'markup.json'
                    or the_file == 'rules.json' or the_file in ignore_files):
                continue
            with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile:
                page_str = myfile.read().encode('utf-8')
                pageManager.addPage(the_file, page_str)
                test_pages.append(the_file)
    pageManager.learnStripes()
    rule_set = pageManager.learnAllRules()
    rule_set.removeBadRules(test_pages)
    return rule_set
def learn_list_extractors(self, pages):
    """Learn list extractors across a set of pages.

    pages: dict mapping page id -> page content string.
    Returns (rules, markup): the learned rule set (each list rule renamed to
    a unique '_div_listNNNN' and given sub-rules learned from its rows) and
    the per-page markup keyed by those names.
    """
    page_mgr = PageManager()  # write_debug_files=True for stripe debug output
    markup = {}
    for page in pages:
        page_content = pages[page]
        page_mgr.addPage(page, page_content)
        markup[page] = self.lists_on_single_page(page_content)

    page_mgr.learnStripes(markups=markup)
    rules = page_mgr.learnRulesFromMarkup(markup)

    # For each list rule, build a small page manager out of its row extracts
    # so we can learn sub-rules for the fields inside each row.
    sublist_page_managers = {}
    for page in markup:
        for rule_name in markup[page]:
            if rule_name not in sublist_page_managers:
                sublist_page_managers[rule_name] = PageManager()
            for rid in range(len(markup[page][rule_name]['sequence'])):
                row = markup[page][rule_name]['sequence'][rid]
                sublist_page_managers[rule_name].addPage(page + "html%d" % rid, row['extract'])

    sublist_sub_rules = {}
    for sublist in sublist_page_managers:
        sublist_page_managers[sublist].learnStripes()
        sub_rules = sublist_page_managers[sublist].learnAllRules(in_list=True)
        sublist_sub_rules[sublist] = sub_rules  # keyed by the rule name in rules

    count = 1
    for rule in rules.rules:
        rule.set_sub_rules(sublist_sub_rules[rule.name])
        list_name = '_div_list' + format(count, '04')
        for page_id in markup:
            if rule.name in markup[page_id]:
                markup[page_id][list_name] = markup[page_id].pop(rule.name)
        rule.name = list_name
        # Fixed: count was never incremented, so every rule was renamed
        # '_div_list0001' and the markup entries collided.
        count += 1

    return rules, markup
def autolearn_grid():
    """Flask endpoint: fetch the posted URLs, auto-learn extraction rules
    across them, and return the rules as JSON. Non-POST requests 404."""
    if request.method != 'POST':
        abort(404)
    data = request.get_json(force=True)
    page_urls = data['urls']
    page_manager = PageManager()
    results = {}
    for page_url in page_urls:
        page_contents = urllib2.urlopen(page_url).read()
        page_manager.addPage(page_url, page_contents)
    page_manager.learnStripes()
    rule_set = page_manager.learnAllRules()
    results['rules'] = json.loads(rule_set.toJson())
    return jsonify(results)
def save_markup():
    """Flask endpoint: persist the posted markup for a project.

    If the posted schema has no children yet, auto-learn list and item slots
    from the project's pages, build schema entries for them, and save the
    auto-generated markup; otherwise save the markup as posted. Returns the
    saved markup as JSON; non-POST requests 404.
    """
    if request.method != 'POST':
        abort(404)
    data = request.get_json(force=True)
    project_folder = data['project_folder']
    markup = data['markup']
    directory = os.path.join(app.static_folder, 'project_folders', project_folder)
    markup_file = os.path.join(directory, 'learning', 'markup.json')

    if not markup['__SCHEMA__'][0]['children']:
        # Templates for auto-generated schema nodes (jstree node format).
        markup_slot = {
            "id": "j1_2",
            "text": "slot",
            "icon": "glyphicon glyphicon-stop",
            "li_attr": {"id": "j1_2"},
            "a_attr": {"href": "#", "id": "j1_2_anchor"},
            "state": {"loaded": True, "opened": False, "selected": False, "disabled": False},
            "data": {},
            "children": [],
            "type": "item"
        }
        list_slot = {
            "a_attr": {"href": "#", "id": "j1_3_anchor"},
            "children": [],
            "data": {},
            "icon": "glyphicon glyphicon-th-list",
            "id": "j1_3",
            "li_attr": {"id": "j1_3"},
            "state": {"disabled": False, "loaded": True, "opened": False, "selected": False},
            "text": "category",
            "type": "list"
        }

        # Load every page listed in the markup.
        page_manager = PageManager()
        for key in markup['__URLS__']:
            page_file = os.path.join(directory, key)
            with codecs.open(page_file, "r", "utf-8") as fp:
                page_str = fp.read().encode('utf-8')
            page_manager.addPage(key, page_str)

        schema = markup.pop("__SCHEMA__", None)
        urls = markup.pop("__URLS__", None)

        page_manager.learnStripes()
        (list_markup, list_names) = page_manager.learnListMarkups()
        rule_set = page_manager.learnAllRules()
        (markup, names) = page_manager.rulesToMarkup(rule_set)
        for key in markup.keys():
            if key in list_markup:
                markup[key].update(list_markup[key])

        count = 1
        # Schema entries for the learned list slots (with their children).
        for list_name in list_names.keys():
            count += 1
            list_node = copy.deepcopy(list_slot)
            list_node['text'] = list_name
            list_node['id'] = 'j1_' + str(count)
            list_node['li_attr']['id'] = 'j1_' + str(count)
            list_node['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
            children = []
            for name in list_names[list_name]:
                count += 1
                child_node = copy.deepcopy(markup_slot)
                child_node['text'] = name
                child_node['id'] = 'j1_' + str(count)
                child_node['li_attr']['id'] = 'j1_' + str(count)
                child_node['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
                children.append(child_node)
            list_node['children'] = children
            schema[0]['children'].append(list_node)

        # Schema entries for the learned item slots.
        for name in names:
            count += 1
            item_node = copy.deepcopy(markup_slot)
            item_node['text'] = name
            item_node['id'] = 'j1_' + str(count)
            item_node['li_attr']['id'] = 'j1_' + str(count)
            item_node['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
            schema[0]['children'].append(item_node)

        markup['__SCHEMA__'] = schema
        markup['__URLS__'] = urls
        with codecs.open(markup_file, "w", "utf-8") as fp:
            fp.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))
            fp.close()
    else:
        with codecs.open(markup_file, "w", "utf-8") as fp:
            fp.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))
            fp.close()
    return jsonify(markup)
def main(argv=None): if argv is None: argv = sys.argv try: try: opts, args = getopt.getopt(argv[1:], "dh", ["debug", "help"]) write_debug_files = False for opt in opts: if opt in [('-d', ''), ('--debug', '')]: write_debug_files = True if opt in [('-h', ''), ('--help', '')]: raise Usage( 'python -m learning.RuleLearner [OPTIONAL_PARAMS] [TEST_FILES_FOLDER] [MARKUP_FILE]\n\t[OPTIONAL_PARAMS]: -d to get debug stripe html files' ) except getopt.error, msg: raise Usage(msg) logger.info('Running RuleLearner with file at %s for rules %s', args[0], args[1]) #read the directory location from arg0 page_file_dir = args[0] pageManager = PageManager(write_debug_files) start_time = time.time() for subdir, dirs, files in os.walk(page_file_dir): for the_file in files: if the_file.startswith('.'): continue with codecs.open(os.path.join(subdir, the_file), "r", "utf-8") as myfile: page_str = myfile.read().encode('utf-8') pageManager.addPage(the_file, page_str) logger.info("--- LOAD PAGES: %s seconds ---" % (time.time() - start_time)) #Read the markups from a file... start_time = time.time() markups_file = args[1] with codecs.open(markups_file, "r", "utf-8") as myfile: markup_str = myfile.read().encode('utf-8') markups = json.loads(markup_str) markups.pop("__SCHEMA__", None) markups.pop("__URLS__", None) logger.info("--- LOAD MARKUPS: %s seconds ---" % (time.time() - start_time)) pageManager.learnStripes(markups) start_time = time.time() rule_set = pageManager.learnRulesFromMarkup(markups) logger.info("--- LEARN RULES FROM MARKUP: %s seconds ---" % (time.time() - start_time)) if (len(args) > 2): output_file = args[2] with codecs.open(output_file, "w", "utf-8") as myfile: myfile.write(rule_set.toJson()) myfile.close() else: print rule_set.toJson()
def save_markup():
    """Flask endpoint: persist the posted markup for a project.

    If the posted schema has no children yet, auto-learn slots: optionally
    learn table/ul list markup and div lists (when LEARN_LISTS is set), learn
    all item rules, prune bad rules against the loaded pages, and build
    schema entries for every learned slot before saving. Otherwise the
    markup is saved exactly as posted. Returns the saved markup as JSON;
    non-POST requests 404.
    """
    if request.method != 'POST':
        abort(404)
    data = request.get_json(force=True)
    project_folder = data['project_folder']
    markup = data['markup']
    directory = os.path.join(app.static_folder, 'project_folders', project_folder)
    markup_file = os.path.join(directory, 'learning', 'markup.json')

    if not markup['__SCHEMA__'][0]['children']:
        # Templates for auto-generated schema nodes (jstree node format).
        markup_slot = {
            "id": "j1_2",
            "text": "slot",
            "icon": "glyphicon glyphicon-stop",
            "li_attr": {"id": "j1_2"},
            "a_attr": {"href": "#", "id": "j1_2_anchor"},
            "state": {"loaded": True, "opened": False, "selected": False, "disabled": False},
            "data": {},
            "children": [],
            "type": "item"
        }
        list_slot = {
            "a_attr": {"href": "#", "id": "j1_3_anchor"},
            "children": [],
            "data": {},
            "icon": "glyphicon glyphicon-th-list",
            "id": "j1_3",
            "li_attr": {"id": "j1_3"},
            "state": {"disabled": False, "loaded": True, "opened": False, "selected": False},
            "text": "category",
            "type": "list"
        }

        # Load every page listed in the markup; keep contents for rule pruning.
        page_manager = PageManager()
        test_pages = []
        for key in markup['__URLS__']:
            page_file = os.path.join(directory, key)
            with codecs.open(page_file, "r", "utf-8") as fp:
                page_str = fp.read().encode('utf-8')
            page_manager.addPage(key, page_str)
            test_pages.append(page_str)

        schema = markup.pop("__SCHEMA__", None)
        urls = markup.pop("__URLS__", None)

        page_manager.learnStripes()
        list_markup = {}
        list_names = {}
        if LEARN_LISTS:
            (list_markup, list_names) = page_manager.learnListMarkups()

            # Div-based list learning.
            train_pages = {}
            for page_id in page_manager._pages:
                train_pages[page_id] = page_manager.getPage(page_id).getString()
            d = DivListLearner()
            div_rules, div_markup = d.run(train_pages)
            (div_list_markup, div_list_names) = page_manager.listRulesToMarkup(div_rules)

            # Copy token locations from the learned div markup onto the
            # rule-derived markup, row by row.
            for page_id in div_markup:
                for item in div_markup[page_id]:
                    if item in div_list_markup[page_id]:
                        if 'starting_token_location' in div_markup[page_id][item]:
                            div_list_markup[page_id][item]['starting_token_location'] = div_markup[page_id][item]['starting_token_location']
                        if 'ending_token_location' in div_markup[page_id][item]:
                            div_list_markup[page_id][item]['ending_token_location'] = div_markup[page_id][item]['ending_token_location']
                        if div_markup[page_id][item]['sequence']:
                            for idx, val in enumerate(div_markup[page_id][item]['sequence']):
                                if len(div_list_markup[page_id][item]['sequence']) <= idx:
                                    div_list_markup[page_id][item]['sequence'].insert(idx, val)
                                else:
                                    div_list_markup[page_id][item]['sequence'][idx]['starting_token_location'] = val['starting_token_location']
                                    div_list_markup[page_id][item]['sequence'][idx]['ending_token_location'] = val['ending_token_location']

            # Merge the div lists into the table/ul list markup and names.
            if len(div_rules.rules) > 0:
                for page_id in div_list_markup:
                    if page_id not in list_markup:
                        list_markup[page_id] = {}
                    list_markup[page_id].update(div_list_markup[page_id])
                list_names.update(div_list_names)

        rule_set = page_manager.learnAllRules()
        rule_set.removeBadRules(test_pages)
        (markup, names) = page_manager.rulesToMarkup(rule_set)
        for key in markup.keys():
            if key in list_markup:
                markup[key].update(list_markup[key])

        count = 1
        # Schema entries for the learned list slots (with their children).
        for list_name in list_names.keys():
            count += 1
            list_node = copy.deepcopy(list_slot)
            list_node['text'] = list_name
            list_node['id'] = 'j1_' + str(count)
            list_node['li_attr']['id'] = 'j1_' + str(count)
            list_node['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
            children = []
            for name in list_names[list_name]:
                count += 1
                child_node = copy.deepcopy(markup_slot)
                child_node['text'] = name
                child_node['id'] = 'j1_' + str(count)
                child_node['li_attr']['id'] = 'j1_' + str(count)
                child_node['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
                children.append(child_node)
            list_node['children'] = children
            schema[0]['children'].append(list_node)

        # Schema entries for the learned item slots.
        for name in names:
            count += 1
            item_node = copy.deepcopy(markup_slot)
            item_node['text'] = name
            item_node['id'] = 'j1_' + str(count)
            item_node['li_attr']['id'] = 'j1_' + str(count)
            item_node['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
            schema[0]['children'].append(item_node)

        markup['__SCHEMA__'] = schema
        markup['__URLS__'] = urls
        with codecs.open(markup_file, "w", "utf-8") as fp:
            fp.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))
            fp.close()
    else:
        with codecs.open(markup_file, "w", "utf-8") as fp:
            fp.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))
            fp.close()
    return jsonify(markup)
def main(argv=None): if argv is None: argv = sys.argv try: try: opts, args = getopt.getopt(argv[1:], "h", ["help"]) except getopt.error, msg: raise Usage(msg) #read the directory location from arg0 page_file_dir = args[0] pageManager = PageManager() page_str_array = [] for subdir, dirs, files in os.walk(page_file_dir): for the_file in files: if the_file.startswith('.'): continue with codecs.open(os.path.join(subdir, the_file), "r", "utf-8") as myfile: page_str = myfile.read().encode('utf-8') page_str_array.append(page_str) pageManager.addPage(the_file, page_str) pageManager.learnStripes() #Read the markups from a file... markups_file = args[1] with codecs.open(os.path.join('', markups_file), "r", "utf-8") as myfile: markup_str = myfile.read().encode('utf-8') markups = json.loads(markup_str) markups.pop("__SCHEMA__", None) #Before we learn the stripes let's make sure we can open the output file pageManager.learnStripes(markups) rule_set = pageManager.learnRulesFromMarkup(markups) if (len(args) > 2): output_file = args[2] with codecs.open(output_file, "w", "utf-8") as myfile: myfile.write(rule_set.toJson()) myfile.close() #testing flatten = False extraction_list = rule_set.extract(page_str_array[0]) if rule_set.validate(extraction_list): if flatten: print json.dumps(Landmark.flattenResult(extraction_list), sort_keys=True, indent=2, separators=(',', ': ')) else: print json.dumps(extraction_list, sort_keys=True, indent=2, separators=(',', ': '))