def main(page_file_dir=None):
    """Learn list rules from a directory of sample pages and print them as JSON.

    Every regular, non-hidden file in ``page_file_dir`` is read as UTF-8 and
    added to a fresh ``PageManager``.  Rules are then learned from the list
    markup and printed as pretty-printed, key-sorted JSON.

    Args:
        page_file_dir: Directory holding the sample pages.  When omitted, the
            original hard-coded developer path is used, so existing
            ``main()`` callers behave as before.
    """
    if page_file_dir is None:
        page_file_dir = '/Users/bamana/Documents/InferLink/workspace/memex/memexpython/input/_template_test'

    pageManager = PageManager()
    for the_file in os.listdir(page_file_dir):
        page_path = os.path.join(page_file_dir, the_file)
        # Skip sub-directories and dot-files (e.g. .DS_Store).
        if the_file.startswith('.') or not os.path.isfile(page_path):
            continue
        with codecs.open(page_path, "r", "utf-8") as myfile:
            # Pages are handed to PageManager as UTF-8 encoded bytes.
            page_str = myfile.read().encode('utf-8')
        pageManager.addPage(the_file, page_str)

    pageManager.learnStripes()
    (list_markup, _list_names) = pageManager.learnListMarkups()

    # NOTE(review): the merged item/list markup below is never emitted; the
    # calls are kept because they may affect pageManager state - confirm
    # before removing.
    rule_set = pageManager.learnAllRules()
    (markup, _names) = pageManager.rulesToMarkup(rule_set)
    for key in markup.keys():
        if key in list_markup:
            markup[key].update(list_markup[key])

    rule_set = pageManager.learnRulesFromMarkup(list_markup)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(json.dumps(json.loads(rule_set.toJson()), sort_keys=True, indent=2, separators=(',', ': ')))
def save_markup():
    """Save the posted markup to the project's ``learning/markup.json``.

    Expects a POST whose JSON body has ``project_folder`` and ``markup``
    keys.  When the first schema entry has no children, the markup is
    auto-learned from the pages listed under ``markup['__URLS__']`` and a
    schema is generated from the learned list and item slots; otherwise the
    posted markup is stored unchanged.

    Returns:
        The JSON response built by ``jsonify`` from the markup that was
        written.

    Aborts with 404 for non-POST requests and with 400 when a requested path
    would resolve outside its base directory.
    """
    if request.method != 'POST':
        abort(404)

    data = request.get_json(force=True)
    project_folder = data['project_folder']
    markup = data['markup']

    # project_folder comes from the request body, so keep it inside the
    # project_folders tree.
    directory = _safe_join(os.path.join(app.static_folder, 'project_folders'), project_folder)
    markup_file = os.path.join(directory, 'learning', 'markup.json')

    if not markup['__SCHEMA__'][0]['children']:
        pageManager = PageManager()
        for key in markup['__URLS__']:
            # Page names are request-supplied as well; confine them to the project.
            page_file = _safe_join(directory, key)
            with codecs.open(page_file, "r", "utf-8") as myfile:
                page_str = myfile.read().encode('utf-8')
            pageManager.addPage(key, page_str)

        # Set aside the bookkeeping entries; they are re-attached to the
        # learned markup below.
        schema = markup.pop("__SCHEMA__", None)
        urls = markup.pop("__URLS__", None)

        pageManager.learnStripes()
        (list_markup, list_names) = pageManager.learnListMarkups()
        rule_set = pageManager.learnAllRules()
        (markup, names) = pageManager.rulesToMarkup(rule_set)

        # Fold the learned list markup into the per-page item markup.
        for key in markup.keys():
            if key in list_markup:
                markup[key].update(list_markup[key])

        _append_learned_slots(schema, list_names, names)
        markup['__SCHEMA__'] = schema
        markup['__URLS__'] = urls

    # Both the learned and the pass-through markup are persisted the same
    # way; the with-block closes the file.
    with codecs.open(markup_file, "w", "utf-8") as myfile:
        myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))

    return jsonify(markup)


def _safe_join(base_dir, *parts):
    """Join ``parts`` onto ``base_dir``; abort with 400 if the result escapes it."""
    base = os.path.realpath(base_dir)
    target = os.path.realpath(os.path.join(base, *parts))
    # os.path.join(base, '') yields base with exactly one trailing separator.
    if target != base and not target.startswith(os.path.join(base, '')):
        abort(400)
    return target


def _new_schema_node(text, number, icon, node_type):
    """Build one schema tree node with id ``j1_<number>`` and default state."""
    node_id = 'j1_' + str(number)
    return {
        "id": node_id,
        "text": text,
        "icon": icon,
        "li_attr": {"id": node_id},
        "a_attr": {"href": "#", "id": node_id + '_anchor'},
        "state": {"loaded": True, "opened": False, "selected": False, "disabled": False},
        "data": {},
        "children": [],
        "type": node_type,
    }


def _append_learned_slots(schema, list_names, names):
    """Append list nodes (with their item children) and item nodes to ``schema[0]``."""
    # Numbering starts at 2; presumably j1_1 is the existing root node - confirm.
    count = 1

    # Generate the schema from the list slots
    for list_name in list_names.keys():
        count += 1
        list_node = _new_schema_node(list_name, count, "glyphicon glyphicon-th-list", "list")
        # now add the children to the auto learned list slot
        for name in list_names[list_name]:
            count += 1
            list_node['children'].append(
                _new_schema_node(name, count, "glyphicon glyphicon-stop", "item"))
        schema[0]['children'].append(list_node)

    # Generate the schema from the item slots
    for name in names:
        count += 1
        schema[0]['children'].append(
            _new_schema_node(name, count, "glyphicon glyphicon-stop", "item"))
def save_markup():
    """Save the posted markup to the project's ``learning/markup.json``.

    Expects a POST whose JSON body has ``project_folder`` and ``markup``
    keys.  When the first schema entry has no children, the markup is
    auto-learned from the pages listed under ``markup['__URLS__']``
    (including list and div-list learning when ``LEARN_LISTS`` is set), rules
    are filtered with ``removeBadRules`` against those same pages, and a
    schema is generated from the learned list and item slots.  Otherwise the
    posted markup is stored unchanged.

    Returns:
        The JSON response built by ``jsonify`` from the markup that was
        written.

    Aborts with 404 for non-POST requests and with 400 when a requested path
    would resolve outside its base directory.
    """
    if request.method != 'POST':
        abort(404)

    data = request.get_json(force=True)
    project_folder = data['project_folder']
    markup = data['markup']

    # project_folder comes from the request body, so keep it inside the
    # project_folders tree.
    directory = _safe_join(os.path.join(app.static_folder, 'project_folders'), project_folder)
    markup_file = os.path.join(directory, 'learning', 'markup.json')

    if not markup['__SCHEMA__'][0]['children']:
        pageManager = PageManager()
        test_pages = []
        for key in markup['__URLS__']:
            # Page names are request-supplied as well; confine them to the project.
            page_file = _safe_join(directory, key)
            with codecs.open(page_file, "r", "utf-8") as myfile:
                page_str = myfile.read().encode('utf-8')
            pageManager.addPage(key, page_str)
            test_pages.append(page_str)

        # Set aside the bookkeeping entries; they are re-attached to the
        # learned markup below.
        schema = markup.pop("__SCHEMA__", None)
        urls = markup.pop("__URLS__", None)

        pageManager.learnStripes()
        list_markup = {}
        list_names = {}
        if LEARN_LISTS:
            (list_markup, list_names) = pageManager.learnListMarkups()

            # This is the div learning
            # NOTE(review): div-list learning is treated as gated by
            # LEARN_LISTS together with learnListMarkups - confirm nesting.
            train_pages = {}
            for page_id in pageManager._pages:
                train_pages[page_id] = pageManager.getPage(page_id).getString()
            div_rules, div_markup = DivListLearner().run(train_pages)
            (div_list_markup, div_list_names) = pageManager.listRulesToMarkup(div_rules)
            _overlay_div_locations(div_markup, div_list_markup)

            # Now add these to the list_markup and list_names
            if len(div_rules.rules) > 0:
                for page_id in div_list_markup:
                    list_markup.setdefault(page_id, {}).update(div_list_markup[page_id])
                list_names.update(div_list_names)

        rule_set = pageManager.learnAllRules()
        rule_set.removeBadRules(test_pages)
        (markup, names) = pageManager.rulesToMarkup(rule_set)

        # Fold the learned list markup into the per-page item markup.
        for key in markup.keys():
            if key in list_markup:
                markup[key].update(list_markup[key])

        _append_learned_slots(schema, list_names, names)
        markup['__SCHEMA__'] = schema
        markup['__URLS__'] = urls

    # Both the learned and the pass-through markup are persisted the same
    # way; the with-block closes the file.
    with codecs.open(markup_file, "w", "utf-8") as myfile:
        myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))

    return jsonify(markup)


def _safe_join(base_dir, *parts):
    """Join ``parts`` onto ``base_dir``; abort with 400 if the result escapes it."""
    base = os.path.realpath(base_dir)
    target = os.path.realpath(os.path.join(base, *parts))
    # os.path.join(base, '') yields base with exactly one trailing separator.
    if target != base and not target.startswith(os.path.join(base, '')):
        abort(400)
    return target


def _overlay_div_locations(div_markup, div_list_markup):
    """Copy token locations from ``div_markup`` onto matching ``div_list_markup`` items."""
    location_keys = ('starting_token_location', 'ending_token_location')
    for page_id in div_markup:
        for item in div_markup[page_id]:
            if item not in div_list_markup[page_id]:
                continue
            learned = div_markup[page_id][item]
            target = div_list_markup[page_id][item]

            for location_key in location_keys:
                if location_key in learned:
                    target[location_key] = learned[location_key]

            if learned['sequence']:
                for idx, val in enumerate(learned['sequence']):
                    if len(target['sequence']) <= idx:
                        # Target has no entry at this position yet: extend it.
                        target['sequence'].append(val)
                    else:
                        target['sequence'][idx]['starting_token_location'] = val['starting_token_location']
                        target['sequence'][idx]['ending_token_location'] = val['ending_token_location']


def _new_schema_node(text, number, icon, node_type):
    """Build one schema tree node with id ``j1_<number>`` and default state."""
    node_id = 'j1_' + str(number)
    return {
        "id": node_id,
        "text": text,
        "icon": icon,
        "li_attr": {"id": node_id},
        "a_attr": {"href": "#", "id": node_id + '_anchor'},
        "state": {"loaded": True, "opened": False, "selected": False, "disabled": False},
        "data": {},
        "children": [],
        "type": node_type,
    }


def _append_learned_slots(schema, list_names, names):
    """Append list nodes (with their item children) and item nodes to ``schema[0]``."""
    # Numbering starts at 2; presumably j1_1 is the existing root node - confirm.
    count = 1

    # Generate the schema from the list slots
    for list_name in list_names.keys():
        count += 1
        list_node = _new_schema_node(list_name, count, "glyphicon glyphicon-th-list", "list")
        # now add the children to the auto learned list slot
        for name in list_names[list_name]:
            count += 1
            list_node['children'].append(
                _new_schema_node(name, count, "glyphicon glyphicon-stop", "item"))
        schema[0]['children'].append(list_node)

    # Generate the schema from the item slots
    for name in names:
        count += 1
        schema[0]['children'].append(
            _new_schema_node(name, count, "glyphicon glyphicon-stop", "item"))