# Shared imports for the snippets below. The stdlib/third-party imports are
# what the code actually uses; the project-local module paths (learning.*) are
# assumptions inferred from the CLI usage string. Usage, logger, app, and
# LEARN_LISTS are assumed to be defined elsewhere in these modules.
import codecs
import copy
import getopt
import json
import os
import sys
import urllib2

import chardet
from flask import request, jsonify, abort

from learning.PageManager import PageManager          # module path assumed
from learning.DivListLearner import DivListLearner    # module path assumed

write_debug_files = False  # module-level default; main() below enables it via -d


def main():
    page_file_dir = '/Users/bamana/Documents/InferLink/workspace/memex/memexpython/input/_template_test'
    pageManager = PageManager()

    # Add every regular, non-hidden file in the directory as a training page
    files = [f for f in os.listdir(page_file_dir) if os.path.isfile(os.path.join(page_file_dir, f))]
    for the_file in files:
        if the_file.startswith('.'):
            continue
        with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile:
            page_str = myfile.read().encode('utf-8')
            pageManager.addPage(the_file, page_str)

    pageManager.learnStripes()
    (list_markup, list_names) = pageManager.learnListMarkups()
    rule_set = pageManager.learnAllRules()
    (markup, names) = pageManager.rulesToMarkup(rule_set)

    # Merge the list markup into the per-page item markup
    for key in markup.keys():
        if key in list_markup:
            markup[key].update(list_markup[key])
    # print json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': '))

    rule_set = pageManager.learnRulesFromMarkup(list_markup)
    print json.dumps(json.loads(rule_set.toJson()), sort_keys=True, indent=2, separators=(',', ': '))
def run(page_file_dir, ignore_files=[]):
    test_pages = []
    pageManager = PageManager(write_debug_files)

    if os.path.isfile(page_file_dir):
        # The argument is a file listing URLs: fetch each page over HTTP
        with open(page_file_dir) as f:
            urls = f.readlines()
        for url in urls:
            page_url = url.strip()
            req = urllib2.urlopen(page_url)
            page_contents = req.read()
            charset = chardet.detect(page_contents)
            page_encoding = charset['encoding']
            # Re-encode to UTF-8 before adding; the flattened original computed
            # page_str but then passed the raw page_contents, leaving it unused
            page_str = page_contents.decode(page_encoding).encode('utf-8')
            pageManager.addPage(page_url, page_str)
            test_pages.append(page_url)
    else:
        # The argument is a directory of saved pages
        files = [f for f in os.listdir(page_file_dir) if os.path.isfile(os.path.join(page_file_dir, f))]
        for the_file in files:
            if the_file.startswith('.') or the_file == 'markup.json' \
                    or the_file == 'rules.json' or the_file in ignore_files:
                continue
            with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile:
                page_str = myfile.read().encode('utf-8')
                pageManager.addPage(the_file, page_str)
                test_pages.append(the_file)

    pageManager.learnStripes()

    ## table, ul, etc. list learning
    # (list_markup, list_names) = pageManager.learnListMarkups()
    # list_rules = pageManager.learnRulesFromMarkup(list_markup)

    ## div learning
    # train_pages = {}
    # for page_id in pageManager._pages:
    #     train_pages[page_id] = pageManager.getPage(page_id).getString()
    # d = DivListLearner()
    # div_rules, div_markup = d.run(train_pages)

    rule_set = pageManager.learnAllRules()
    rule_set.removeBadRules(test_pages)

    # for rule in list_rules.rules:
    #     rule_set.add_rule(rule)
    #
    # for rule in div_rules.rules:
    #     rule_set.add_rule(rule)

    return rule_set
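# A minimal usage sketch for run(), under the assumptions in the header above
# (the folder path and the ignore list are placeholders):
#
#   learned = run('/path/to/pages', ignore_files=['old_page.html'])
#   print json.dumps(json.loads(learned.toJson()), sort_keys=True, indent=2,
#                    separators=(',', ': '))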
def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "dh", ["debug", "help"])
            write_debug_files = False
            for opt, arg in opts:
                if opt in ('-d', '--debug'):
                    write_debug_files = True
                if opt in ('-h', '--help'):
                    raise Usage('python -m learning.RuleLearnerAllSlots [OPTIONAL_PARAMS] [TEST_FILES_FOLDER] '
                                '\n\t[OPTIONAL_PARAMS]: -d to get debug stripe html files')
        except getopt.error, msg:
            raise Usage(msg)

        logger.info('Running RuleLearnerAllSlots All Slots with files at %s', args[0])

        # read the directory location from arg0
        page_file_dir = args[0]
        pageManager = PageManager(write_debug_files)

        if os.path.isfile(page_file_dir):
            # A file listing URLs: fetch each page over HTTP
            with open(page_file_dir) as f:
                urls = f.readlines()
            for url in urls:
                page_url = url.strip()
                req = urllib2.urlopen(page_url)
                page_contents = req.read()
                charset = chardet.detect(page_contents)
                page_encoding = charset['encoding']
                # Re-encode to UTF-8 before adding (page_str was computed but
                # unused in the flattened original)
                page_str = page_contents.decode(page_encoding).encode('utf-8')
                pageManager.addPage(page_url, page_str)
        else:
            files = [f for f in os.listdir(page_file_dir) if os.path.isfile(os.path.join(page_file_dir, f))]
            for the_file in files:
                if the_file.startswith('.'):
                    continue
                with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')
                    pageManager.addPage(the_file, page_str)

        pageManager.learnStripes()
        rule_set = pageManager.learnAllRules()
        print json.dumps(json.loads(rule_set.toJson()), sort_keys=True, indent=2, separators=(',', ': '))
    except Usage, err:
        # Assumed outer handler (the flattened source omitted it); this is the
        # conventional counterpart to the inner "raise Usage(msg)" pattern.
        print >> sys.stderr, err.msg
        return 2
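# Example invocation, taken from the Usage string above (the folder path is a
# placeholder):
#
#   python -m learning.RuleLearnerAllSlots -d /path/to/test_files_folder
#
# Assumed entry-point wiring for the main(argv)/Usage pattern above:
if __name__ == "__main__":
    sys.exit(main())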
def autolearn_grid():
    if request.method == 'POST':
        data = request.get_json(force=True)
        page_urls = data['urls']

        page_manager = PageManager()
        results = {}
        for page_url in page_urls:
            page_contents = urllib2.urlopen(page_url).read()
            page_manager.addPage(page_url, page_contents)

        page_manager.learnStripes()
        rule_set = page_manager.learnAllRules()
        results['rules'] = json.loads(rule_set.toJson())

        return jsonify(results)
    abort(404)
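# Hypothetical client for the handler above: the route path and host are
# assumptions (the @app.route decorator is not shown in this section); the
# payload shape {"urls": [...]} comes from the handler itself.
def post_autolearn(urls, endpoint='http://localhost:5000/autolearn_grid'):
    req = urllib2.Request(endpoint,
                          data=json.dumps({'urls': urls}),
                          headers={'Content-Type': 'application/json'})
    # Returns the learned rules as parsed JSON: {"rules": ...}
    return json.loads(urllib2.urlopen(req).read())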
def save_markup():
    if request.method == 'POST':
        data = request.get_json(force=True)
        project_folder = data['project_folder']
        markup = data['markup']
        directory = os.path.join(app.static_folder, 'project_folders', project_folder)
        markup_file = os.path.join(directory, 'learning', 'markup.json')

        if not markup['__SCHEMA__'][0]['children']:
            # Template jsTree nodes that get cloned for each auto-learned slot
            markup_slot = {
                "id": "j1_2",
                "text": "slot",
                "icon": "glyphicon glyphicon-stop",
                "li_attr": {"id": "j1_2"},
                "a_attr": {"href": "#", "id": "j1_2_anchor"},
                "state": {"loaded": True, "opened": False, "selected": False, "disabled": False},
                "data": {},
                "children": [],
                "type": "item"
            }
            list_slot = {
                "a_attr": {"href": "#", "id": "j1_3_anchor"},
                "children": [],
                "data": {},
                "icon": "glyphicon glyphicon-th-list",
                "id": "j1_3",
                "li_attr": {"id": "j1_3"},
                "state": {"disabled": False, "loaded": True, "opened": False, "selected": False},
                "text": "category",
                "type": "list"
            }

            pageManager = PageManager()
            for key in markup['__URLS__']:
                page_file = os.path.join(directory, key)
                with codecs.open(page_file, "r", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')
                    pageManager.addPage(key, page_str)

            schema = markup.pop("__SCHEMA__", None)
            urls = markup.pop("__URLS__", None)

            pageManager.learnStripes()
            (list_markup, list_names) = pageManager.learnListMarkups()
            rule_set = pageManager.learnAllRules()
            (markup, names) = pageManager.rulesToMarkup(rule_set)
            for key in markup.keys():
                if key in list_markup:
                    markup[key].update(list_markup[key])

            count = 1
            # Generate the schema from the list slots
            for list_name in list_names.keys():
                count += 1
                auto_markup_slot = copy.deepcopy(list_slot)
                auto_markup_slot['text'] = list_name
                auto_markup_slot['id'] = 'j1_' + str(count)
                auto_markup_slot['li_attr']['id'] = 'j1_' + str(count)
                auto_markup_slot['a_attr']['id'] = 'j1_' + str(count) + '_anchor'

                ## now add the children to the auto learned list slot
                children = []
                for name in list_names[list_name]:
                    count += 1
                    auto_markup_slot_sub = copy.deepcopy(markup_slot)
                    auto_markup_slot_sub['text'] = name
                    auto_markup_slot_sub['id'] = 'j1_' + str(count)
                    auto_markup_slot_sub['li_attr']['id'] = 'j1_' + str(count)
                    auto_markup_slot_sub['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
                    children.append(auto_markup_slot_sub)
                auto_markup_slot['children'] = children
                schema[0]['children'].append(auto_markup_slot)

            # Generate the schema from the item slots
            for name in names:
                count += 1
                auto_markup_slot = copy.deepcopy(markup_slot)
                auto_markup_slot['text'] = name
                auto_markup_slot['id'] = 'j1_' + str(count)
                auto_markup_slot['li_attr']['id'] = 'j1_' + str(count)
                auto_markup_slot['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
                schema[0]['children'].append(auto_markup_slot)

            markup['__SCHEMA__'] = schema
            markup['__URLS__'] = urls

            with codecs.open(markup_file, "w", "utf-8") as myfile:
                myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))
        else:
            with codecs.open(markup_file, "w", "utf-8") as myfile:
                myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))

        return jsonify(markup)
    abort(404)
def save_markup():
    if request.method == 'POST':
        data = request.get_json(force=True)
        project_folder = data['project_folder']
        markup = data['markup']
        directory = os.path.join(app.static_folder, 'project_folders', project_folder)
        markup_file = os.path.join(directory, 'learning', 'markup.json')

        if not markup['__SCHEMA__'][0]['children']:
            # Template jsTree nodes that get cloned for each auto-learned slot
            markup_slot = {
                "id": "j1_2",
                "text": "slot",
                "icon": "glyphicon glyphicon-stop",
                "li_attr": {"id": "j1_2"},
                "a_attr": {"href": "#", "id": "j1_2_anchor"},
                "state": {"loaded": True, "opened": False, "selected": False, "disabled": False},
                "data": {},
                "children": [],
                "type": "item"
            }
            list_slot = {
                "a_attr": {"href": "#", "id": "j1_3_anchor"},
                "children": [],
                "data": {},
                "icon": "glyphicon glyphicon-th-list",
                "id": "j1_3",
                "li_attr": {"id": "j1_3"},
                "state": {"disabled": False, "loaded": True, "opened": False, "selected": False},
                "text": "category",
                "type": "list"
            }

            pageManager = PageManager()
            test_pages = []
            for key in markup['__URLS__']:
                page_file = os.path.join(directory, key)
                with codecs.open(page_file, "r", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')
                    pageManager.addPage(key, page_str)
                    # Track page ids for removeBadRules(), matching run() above
                    # (the flattened original appended page_str instead)
                    test_pages.append(key)

            schema = markup.pop("__SCHEMA__", None)
            urls = markup.pop("__URLS__", None)

            pageManager.learnStripes()

            list_markup = {}
            list_names = {}
            if LEARN_LISTS:
                (list_markup, list_names) = pageManager.learnListMarkups()

                # This is the div learning
                train_pages = {}
                for page_id in pageManager._pages:
                    train_pages[page_id] = pageManager.getPage(page_id).getString()
                d = DivListLearner()
                div_rules, div_markup = d.run(train_pages)

                (div_list_markup, div_list_names) = pageManager.listRulesToMarkup(div_rules)
                # Copy token locations from the div markup onto the div list markup
                for page_id in div_markup:
                    for item in div_markup[page_id]:
                        if item in div_list_markup[page_id]:
                            if 'starting_token_location' in div_markup[page_id][item]:
                                div_list_markup[page_id][item]['starting_token_location'] = \
                                    div_markup[page_id][item]['starting_token_location']
                            if 'ending_token_location' in div_markup[page_id][item]:
                                div_list_markup[page_id][item]['ending_token_location'] = \
                                    div_markup[page_id][item]['ending_token_location']
                            if div_markup[page_id][item]['sequence']:
                                for idx, val in enumerate(div_markup[page_id][item]['sequence']):
                                    if len(div_list_markup[page_id][item]['sequence']) <= idx:
                                        div_list_markup[page_id][item]['sequence'].insert(idx, val)
                                    else:
                                        div_list_markup[page_id][item]['sequence'][idx]['starting_token_location'] = \
                                            val['starting_token_location']
                                        div_list_markup[page_id][item]['sequence'][idx]['ending_token_location'] = \
                                            val['ending_token_location']

                # Now add these to the list_markup and list_names
                if len(div_rules.rules) > 0:
                    for page_id in div_list_markup:
                        if page_id not in list_markup:
                            list_markup[page_id] = {}
                        list_markup[page_id].update(div_list_markup[page_id])
                    list_names.update(div_list_names)

            rule_set = pageManager.learnAllRules()
            rule_set.removeBadRules(test_pages)

            (markup, names) = pageManager.rulesToMarkup(rule_set)
            for key in markup.keys():
                if key in list_markup:
                    markup[key].update(list_markup[key])

            count = 1
            # Generate the schema from the list slots
            for list_name in list_names.keys():
                count += 1
                auto_markup_slot = copy.deepcopy(list_slot)
                auto_markup_slot['text'] = list_name
                auto_markup_slot['id'] = 'j1_' + str(count)
                auto_markup_slot['li_attr']['id'] = 'j1_' + str(count)
                auto_markup_slot['a_attr']['id'] = 'j1_' + str(count) + '_anchor'

                ## now add the children to the auto learned list slot
                children = []
                for name in list_names[list_name]:
                    count += 1
                    auto_markup_slot_sub = copy.deepcopy(markup_slot)
                    auto_markup_slot_sub['text'] = name
                    auto_markup_slot_sub['id'] = 'j1_' + str(count)
                    auto_markup_slot_sub['li_attr']['id'] = 'j1_' + str(count)
                    auto_markup_slot_sub['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
                    children.append(auto_markup_slot_sub)
                auto_markup_slot['children'] = children
                schema[0]['children'].append(auto_markup_slot)

            # Generate the schema from the item slots
            for name in names:
                count += 1
                auto_markup_slot = copy.deepcopy(markup_slot)
                auto_markup_slot['text'] = name
                auto_markup_slot['id'] = 'j1_' + str(count)
                auto_markup_slot['li_attr']['id'] = 'j1_' + str(count)
                auto_markup_slot['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
                schema[0]['children'].append(auto_markup_slot)

            markup['__SCHEMA__'] = schema
            markup['__URLS__'] = urls

            with codecs.open(markup_file, "w", "utf-8") as myfile:
                myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))
        else:
            with codecs.open(markup_file, "w", "utf-8") as myfile:
                myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))

        return jsonify(markup)
    abort(404)
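# Shape of the JSON payload that save_markup() expects, inferred from the
# handler above (field values are illustrative placeholders; __URLS__ is an
# iterable of saved page file names, since the handler only iterates it):
#
#   {
#     "project_folder": "my_project",
#     "markup": {
#       "__SCHEMA__": [{"children": []}],
#       "__URLS__": ["page1.html", "page2.html"],
#       ...
#     }
#   }
#
# When __SCHEMA__[0]["children"] is empty, the handler learns the markup and
# rebuilds the schema automatically before writing learning/markup.json;
# otherwise it writes the posted markup as-is.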