def main():
    page_file_dir = '/Users/bamana/Documents/InferLink/workspace/memex/memexpython/input/_template_test'

    pageManager = PageManager()
    files = [f for f in os.listdir(page_file_dir) if os.path.isfile(os.path.join(page_file_dir, f))]
    for the_file in files:
        if the_file.startswith('.'):
            continue
        with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile:
            page_str = myfile.read().encode('utf-8')
            pageManager.addPage(the_file, page_str)

    pageManager.learnStripes()
    (list_markup, list_names) = pageManager.learnListMarkups()
    rule_set = pageManager.learnAllRules()
    (markup, names) = pageManager.rulesToMarkup(rule_set)
    for key in markup.keys():
        if key in list_markup:
            markup[key].update(list_markup[key])

    # print json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': '))
    rule_set = pageManager.learnRulesFromMarkup(list_markup)
    print json.dumps(json.loads(rule_set.toJson()), sort_keys=True, indent=2, separators=(',', ': '))
def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "dh", ["debug", "help"])
            write_debug_files = False
            for opt in opts:
                if opt in [('-d', ''), ('--debug', '')]:
                    write_debug_files = True
                if opt in [('-h', ''), ('--help', '')]:
                    raise Usage('python -m learning.RuleLearner [OPTIONAL_PARAMS] [TEST_FILES_FOLDER] [MARKUP_FILE]\n\t[OPTIONAL_PARAMS]: -d to get debug stripe html files')
        except getopt.error, msg:
            raise Usage(msg)

        logger.info('Running RuleLearner with file at %s for rules %s', args[0], args[1])

        # read the directory location from arg0
        page_file_dir = args[0]
        pageManager = PageManager(write_debug_files)

        start_time = time.time()
        for subdir, dirs, files in os.walk(page_file_dir):
            for the_file in files:
                if the_file.startswith('.'):
                    continue
                with codecs.open(os.path.join(subdir, the_file), "r", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')
                    pageManager.addPage(the_file, page_str)
        logger.info("--- LOAD PAGES: %s seconds ---" % (time.time() - start_time))

        # Read the markups from a file...
        start_time = time.time()
        markups_file = args[1]
        with codecs.open(markups_file, "r", "utf-8") as myfile:
            markup_str = myfile.read().encode('utf-8')
        markups = json.loads(markup_str)
        markups.pop("__SCHEMA__", None)
        markups.pop("__URLS__", None)
        logger.info("--- LOAD MARKUPS: %s seconds ---" % (time.time() - start_time))

        pageManager.learnStripes(markups)

        start_time = time.time()
        rule_set = pageManager.learnRulesFromMarkup(markups)
        logger.info("--- LEARN RULES FROM MARKUP: %s seconds ---" % (time.time() - start_time))

        if len(args) > 2:
            output_file = args[2]
            with codecs.open(output_file, "w", "utf-8") as myfile:
                myfile.write(rule_set.toJson())
                myfile.close()
        else:
            print rule_set.toJson()

    except Usage, err:
        # error handling assumed (the outer except clause is missing from the flattened snippet)
        print >> sys.stderr, err.msg
        return 2
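# Example of invoking main() above directly (a sketch, not in the original source);
# the page folder and markup file paths are placeholders that mirror the Usage string.
if __name__ == '__main__':
    main(['RuleLearner', '-d', '/path/to/pages', '/path/to/markup.json'])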
def do_learning():
    if request.method == 'POST':
        data = request.get_json(force=True)
        project_folder = data['project_folder']
        directory = os.path.join(app.static_folder, 'project_folders', project_folder)

        markup_file = os.path.join(directory, 'learning', 'markup.json')
        with codecs.open(markup_file, "r", "utf-8") as myfile:
            json_str = myfile.read().encode('utf-8')
        markup = json.loads(json_str)

        pageManager = PageManager()
        for key in markup['__URLS__']:
            page_file = os.path.join(directory, key)
            with codecs.open(page_file, "r", "utf-8") as myfile:
                page_str = myfile.read().encode('utf-8')
            pageManager.addPage(key, page_str)

        markup.pop("__SCHEMA__", None)
        markup.pop("__URLS__", None)

        pageManager.learnStripes(markup)
        rule_set = pageManager.learnRulesFromMarkup(markup)

        rules_file = os.path.join(directory, 'learning', 'rules.json')
        with codecs.open(rules_file, "w", "utf-8") as myfile:
            myfile.write(json.dumps(json.loads(rule_set.toJson()), sort_keys=True, indent=2, separators=(',', ': ')))
            myfile.close()

        return jsonify(rules=json.loads(rule_set.toJson()))
    abort(404)
def run(page_file_dir, ignore_files=[]):
    test_pages = []
    pageManager = PageManager(write_debug_files)

    if os.path.isfile(page_file_dir):
        with open(page_file_dir) as f:
            urls = f.readlines()
        for url in urls:
            page_url = url.strip()
            req = urllib2.urlopen(page_url)
            page_contents = req.read()
            charset = chardet.detect(page_contents)
            page_encoding = charset['encoding']
            page_str = page_contents.decode(page_encoding).encode('utf-8')
            pageManager.addPage(page_url, page_contents)
            test_pages.append(page_url)
    else:
        files = [f for f in os.listdir(page_file_dir) if os.path.isfile(os.path.join(page_file_dir, f))]
        for the_file in files:
            if the_file.startswith('.') or the_file == 'markup.json' or the_file == 'rules.json' or the_file in ignore_files:
                continue
            with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile:
                page_str = myfile.read().encode('utf-8')
                pageManager.addPage(the_file, page_str)
                test_pages.append(the_file)

    pageManager.learnStripes()

    ## table, ul, etc. list learning
    # (list_markup, list_names) = pageManager.learnListMarkups()
    # list_rules = pageManager.learnRulesFromMarkup(list_markup)

    ## div learning
    # train_pages = {}
    # for page_id in pageManager._pages:
    #     train_pages[page_id] = pageManager.getPage(page_id).getString()
    # d = DivListLearner()
    # div_rules, div_markup = d.run(train_pages)

    rule_set = pageManager.learnAllRules()
    rule_set.removeBadRules(test_pages)

    # for rule in list_rules.rules:
    #     rule_set.add_rule(rule)
    #
    # for rule in div_rules.rules:
    #     rule_set.add_rule(rule)

    return rule_set
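# Minimal sketch of driving run() above from a script. It assumes the module-level
# write_debug_files flag that run() references is defined; paths and file names are
# placeholders, not taken from the original code.
if __name__ == '__main__':
    write_debug_files = False
    learned_rules = run('/path/to/pages', ignore_files=['skip_this_page.html'])
    print learned_rules.toJson()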
def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "h", ["help"])
        except getopt.error, msg:
            raise Usage(msg)

        # read the directory location from arg0
        page_file_dir = args[0]
        pageManager = PageManager()
        page_str_array = []

        for subdir, dirs, files in os.walk(page_file_dir):
            for the_file in files:
                if the_file.startswith('.'):
                    continue
                with codecs.open(os.path.join(subdir, the_file), "r", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')
                    page_str_array.append(page_str)
                    pageManager.addPage(the_file, page_str)

        pageManager.learnStripes()

        # Read the markups from a file...
        markups_file = args[1]
        with codecs.open(os.path.join('', markups_file), "r", "utf-8") as myfile:
            markup_str = myfile.read().encode('utf-8')
        markups = json.loads(markup_str)
        markups.pop("__SCHEMA__", None)

        # Before we learn the stripes let's make sure we can open the output file
        pageManager.learnStripes(markups)
        rule_set = pageManager.learnRulesFromMarkup(markups)

        if len(args) > 2:
            output_file = args[2]
            with codecs.open(output_file, "w", "utf-8") as myfile:
                myfile.write(rule_set.toJson())
                myfile.close()

        # testing
        flatten = False
        extraction_list = rule_set.extract(page_str_array[0])
        if rule_set.validate(extraction_list):
            if flatten:
                print json.dumps(Landmark.flattenResult(extraction_list), sort_keys=True, indent=2, separators=(',', ': '))
            else:
                print json.dumps(extraction_list, sort_keys=True, indent=2, separators=(',', ': '))

    except Usage, err:
        # error handling assumed (the outer except clause is missing from the flattened snippet)
        print >> sys.stderr, err.msg
        return 2
def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "dh", ["debug", "help"])
            write_debug_files = False
            for opt in opts:
                if opt in [('-d', ''), ('--debug', '')]:
                    write_debug_files = True
                if opt in [('-h', ''), ('--help', '')]:
                    raise Usage('python -m learning.RuleLearnerAllSlots [OPTIONAL_PARAMS] [TEST_FILES_FOLDER] \n\t[OPTIONAL_PARAMS]: -d to get debug stripe html files')
        except getopt.error, msg:
            raise Usage(msg)

        logger.info('Running RuleLearnerAllSlots All Slots with files at %s', args[0])

        # read the directory location from arg0
        page_file_dir = args[0]
        pageManager = PageManager(write_debug_files)

        if os.path.isfile(page_file_dir):
            with open(page_file_dir) as f:
                urls = f.readlines()
            for url in urls:
                page_url = url.strip()
                req = urllib2.urlopen(page_url)
                page_contents = req.read()
                charset = chardet.detect(page_contents)
                page_encoding = charset['encoding']
                page_str = page_contents.decode(page_encoding).encode('utf-8')
                pageManager.addPage(page_url, page_contents)
        else:
            files = [f for f in os.listdir(page_file_dir) if os.path.isfile(os.path.join(page_file_dir, f))]
            for the_file in files:
                if the_file.startswith('.'):
                    continue
                with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')
                    pageManager.addPage(the_file, page_str)

        pageManager.learnStripes()
        rule_set = pageManager.learnAllRules()
        print json.dumps(json.loads(rule_set.toJson()), sort_keys=True, indent=2, separators=(',', ': '))

    except Usage, err:
        # error handling assumed (the outer except clause is missing from the flattened snippet)
        print >> sys.stderr, err.msg
        return 2
def learn_list_extractors(self, pages):
    page_mgr = PageManager()  # write_debug_files=True)
    markup = {}
    for page in pages:
        page_content = pages[page]
        page_mgr.addPage(page, page_content)
        content_list_markup = self.lists_on_single_page(page_content)
        markup[page] = content_list_markup

    # print '--- MARKUP ---'
    # print json.dumps(markup)

    page_mgr.learnStripes(markups=markup)
    rules = page_mgr.learnRulesFromMarkup(markup)

    # now, for each markup rule, learn a little page manager
    sublist_page_managers = {}
    for page in markup:
        for rule_name in markup[page]:
            if rule_name not in sublist_page_managers:
                sublist_page_managers[rule_name] = PageManager()
            for rid in range(len(markup[page][rule_name]['sequence'])):
                row = markup[page][rule_name]['sequence'][rid]
                sublist_page_managers[rule_name].addPage(page + "html%d" % rid, row['extract'])

    sublist_sub_rules = {}
    for sublist in sublist_page_managers:
        sublist_page_managers[sublist].learnStripes()
        sub_rules = sublist_page_managers[sublist].learnAllRules(in_list=True)
        sublist_sub_rules[sublist] = sub_rules  # This should match a rule name in the rules...

    count = 1
    for rule in rules.rules:
        # print "== RULE INFO =="
        # print str(rule.name)
        rule.set_sub_rules(sublist_sub_rules[rule.name])
        list_name = '_div_list' + format(count, '04')
        for page_id in markup:
            if rule.name in markup[page_id]:
                markup[page_id][list_name] = markup[page_id].pop(rule.name)
        rule.name = list_name
        # print str(json.dumps(rule.toJson()))
        # print "==============="
    #
    # print rules.toJson()
    return rules, markup
def markup_on_page():
    if request.method == 'POST':
        data = request.get_json(force=True)
        file_name = data['file_name']
        project_folder = data['project_folder']
        markup = data['markup']

        sample_file = os.path.join(app.static_folder, 'project_folders', project_folder, file_name)
        with codecs.open(sample_file, "r", "utf-8") as myfile:
            page_str = myfile.read().encode('utf-8')

        page_manager = PageManager()
        page_manager.addPage(file_name, page_str)
        shortest_pairs = page_manager.getPossibleLocations(file_name, markup)
        return jsonify(shortest_pairs=shortest_pairs)
def autolearn_grid():
    if request.method == 'POST':
        data = request.get_json(force=True)
        page_urls = data['urls']

        page_manager = PageManager()
        results = {}
        for page_url in page_urls:
            page_contents = urllib2.urlopen(page_url).read()
            page_manager.addPage(page_url, page_contents)

        page_manager.learnStripes()
        rule_set = page_manager.learnAllRules()
        results['rules'] = json.loads(rule_set.toJson())

        return jsonify(results)
    abort(404)
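# Hypothetical client call for the endpoint above, assuming it is exposed at a route
# such as '/autolearn_grid' on a local Flask server (route, host, and URLs are guesses
# for illustration only).
import requests

resp = requests.post('http://localhost:5000/autolearn_grid',
                     json={'urls': ['http://example.com/page1.html',
                                    'http://example.com/page2.html']})
print resp.json()['rules']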
def visible_token_viewer():
    if request.method == 'POST':
        data = request.get_json(force=True)
        test_string = data['test_string']
        test_string = ' '.join(test_string.split())

        pageManager = PageManager()
        page_file_dir = os.path.join(app.static_folder, 'visible_tokens_test')
        files = [f for f in os.listdir(page_file_dir) if os.path.isfile(os.path.join(page_file_dir, f))]
        for the_file in files:
            if the_file.startswith('.'):
                continue
            with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile:
                page_str = myfile.read().encode('utf-8')
                pageManager.addPage(the_file, page_str)

        triples = []
        for triple in pageManager.getVisibleTokenStructure():
            if triple['invisible_token_buffer_before'].endswith(test_string):
                triples.append(triple)

        return jsonify(triples=triples)
def lists_on_single_page(self, content):
    pg = PageManager()
    pg.addPage("zzz", content)
    triples = pg.getVisibleTokenStructure()

    (ptree, paths_to_vis_text, path_to_invis_toks) = self.prefix_tree(triples, only_consider_tag='div')
    potential_lists = self.prefix_tree_to_paths(ptree)

    if self.__DEBUG:
        print '.... POTENTIAL LISTS ARE ....'
        print '\n'.join([''.join(p) for p in potential_lists])
        print '.... OK!....'

    all_tokens_list = pg.getPage("zzz").tokens

    # Now, let's get our lists
    lists = {}
    for i in range(len(potential_lists)):
        pot_list = potential_lists[i]
        as_path = ''.join(pot_list)
        if self.__DEBUG:
            print "PATH: %s" % as_path
        lists[as_path] = {'rows': []}

        # if as_path in paths_to_vis_text:
        for path_to_vis in paths_to_vis_text:
            if path_to_vis.find(as_path) > -1:
                vis_texts = [a for a in paths_to_vis_text[path_to_vis]]
                invis_toks = [t for t in path_to_invis_toks[path_to_vis]]

                for idx in range(len(vis_texts)):
                    if self.__DEBUG:
                        print "%s ==> %s" % (vis_texts[idx], str(invis_toks[idx].token_location))
                    html_between_row = ''
                    if (idx + 1) < len(vis_texts):
                        begin = invis_toks[idx].token_location
                        end = invis_toks[idx + 1].token_location - 1
                        html_between_row = all_tokens_list.getTokensAsString(begin, end, whitespace=True)
                    lists[as_path]['rows'].append({
                        'visible_text': vis_texts[idx],
                        'starting_token_location': invis_toks[idx].token_location,
                        'html_between_row': html_between_row
                    })

    as_json_str = json.dumps(lists)
    if self.__DEBUG:
        print "--------"
        print as_json_str
        print "--------"

    # # do it as an extraction instead?
    # item_rule_begin = Landmark.escape_regex_string('<html')
    # item_rule_end = Landmark.escape_regex_string('/html>')
    #
    # begin_iter_rule = '.+?'.join([Landmark.escape_regex_string(a) for a in pot_list])
    #
    # # figure out: for each tag in the rule, add it's end tag (keep track of tag type)
    # # NOTE: for now, this assumes that the HTML is well formed
    # end_it = '.+?'.join(['</div>' for i in range(len(pot_list))])
    #
    # end_iter_rule = end_it
    #
    # # include end-regex: included in the stuff that's extracted.
    # # Solve for the case where you only see part of the stuff
    # rule = IterationRule(str(i) + "_pathListRule", item_rule_begin, item_rule_end,
    #                      begin_iter_rule, end_iter_rule, removehtml=True)
    # extraction = rule.apply(content)
    #
    # print "**PATH: " + ''.join(pot_list)
    # as_json_str = json.dumps(extraction)
    #
    # for seq in extraction['sequence']:
    #     print "\t" + seq['extract']

    # TODO: do this here????
    # TODO: big drop down the path should be considered... not just if hte path occurs twice
    # TODO: fix bugs
    markup = self.creat_row_markup(lists, all_tokens_list, pg)
    if self.__DEBUG:
        print "list markup"
        json.dumps(markup)

    return markup
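# For reference, the intermediate `lists` structure assembled above maps a div path to
# its candidate rows; a sketch of its shape with made-up values (only the keys come
# from the code above, the path and row contents are invented):
example_lists = {
    '<div class="listing"><div class="row">': {
        'rows': [
            {'visible_text': 'row text here',
             'starting_token_location': 1042,
             'html_between_row': '</div><div class="row">'}
        ]
    }
}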
def save_markup():
    if request.method == 'POST':
        data = request.get_json(force=True)
        project_folder = data['project_folder']
        markup = data['markup']

        directory = os.path.join(app.static_folder, 'project_folders', project_folder)
        markup_file = os.path.join(directory, 'learning', 'markup.json')

        if not markup['__SCHEMA__'][0]['children']:
            markup_slot = {
                "id": "j1_2",
                "text": "slot",
                "icon": "glyphicon glyphicon-stop",
                "li_attr": {"id": "j1_2"},
                "a_attr": {"href": "#", "id": "j1_2_anchor"},
                "state": {"loaded": True, "opened": False, "selected": False, "disabled": False},
                "data": {},
                "children": [],
                "type": "item"
            }
            list_slot = {
                "a_attr": {"href": "#", "id": "j1_3_anchor"},
                "children": [],
                "data": {},
                "icon": "glyphicon glyphicon-th-list",
                "id": "j1_3",
                "li_attr": {"id": "j1_3"},
                "state": {"disabled": False, "loaded": True, "opened": False, "selected": False},
                "text": "category",
                "type": "list"
            }

            pageManager = PageManager()
            for key in markup['__URLS__']:
                page_file = os.path.join(directory, key)
                with codecs.open(page_file, "r", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')
                pageManager.addPage(key, page_str)

            schema = markup.pop("__SCHEMA__", None)
            urls = markup.pop("__URLS__", None)

            pageManager.learnStripes()
            (list_markup, list_names) = pageManager.learnListMarkups()
            rule_set = pageManager.learnAllRules()
            (markup, names) = pageManager.rulesToMarkup(rule_set)
            for key in markup.keys():
                if key in list_markup:
                    markup[key].update(list_markup[key])

            count = 1
            # Generate the schema from the list slots
            for list_name in list_names.keys():
                count += 1
                auto_markup_slot = copy.deepcopy(list_slot)
                auto_markup_slot['text'] = list_name
                auto_markup_slot['id'] = 'j1_' + str(count)
                auto_markup_slot['li_attr']['id'] = 'j1_' + str(count)
                auto_markup_slot['a_attr']['id'] = 'j1_' + str(count) + '_anchor'

                ## now add the children to the auto learned list slot
                children = []
                for name in list_names[list_name]:
                    count += 1
                    auto_markup_slot_sub = copy.deepcopy(markup_slot)
                    auto_markup_slot_sub['text'] = name
                    auto_markup_slot_sub['id'] = 'j1_' + str(count)
                    auto_markup_slot_sub['li_attr']['id'] = 'j1_' + str(count)
                    auto_markup_slot_sub['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
                    children.append(auto_markup_slot_sub)
                auto_markup_slot['children'] = children
                schema[0]['children'].append(auto_markup_slot)

            # Generate the schema from the item slots
            for name in names:
                count += 1
                auto_markup_slot = copy.deepcopy(markup_slot)
                auto_markup_slot['text'] = name
                auto_markup_slot['id'] = 'j1_' + str(count)
                auto_markup_slot['li_attr']['id'] = 'j1_' + str(count)
                auto_markup_slot['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
                schema[0]['children'].append(auto_markup_slot)

            markup['__SCHEMA__'] = schema
            markup['__URLS__'] = urls

            with codecs.open(markup_file, "w", "utf-8") as myfile:
                myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))
                myfile.close()
        else:
            with codecs.open(markup_file, "w", "utf-8") as myfile:
                myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))
                myfile.close()

        return jsonify(markup)
    abort(404)
def save_markup():
    if request.method == 'POST':
        data = request.get_json(force=True)
        project_folder = data['project_folder']
        markup = data['markup']

        directory = os.path.join(app.static_folder, 'project_folders', project_folder)
        markup_file = os.path.join(directory, 'learning', 'markup.json')

        if not markup['__SCHEMA__'][0]['children']:
            markup_slot = {
                "id": "j1_2",
                "text": "slot",
                "icon": "glyphicon glyphicon-stop",
                "li_attr": {"id": "j1_2"},
                "a_attr": {"href": "#", "id": "j1_2_anchor"},
                "state": {"loaded": True, "opened": False, "selected": False, "disabled": False},
                "data": {},
                "children": [],
                "type": "item"
            }
            list_slot = {
                "a_attr": {"href": "#", "id": "j1_3_anchor"},
                "children": [],
                "data": {},
                "icon": "glyphicon glyphicon-th-list",
                "id": "j1_3",
                "li_attr": {"id": "j1_3"},
                "state": {"disabled": False, "loaded": True, "opened": False, "selected": False},
                "text": "category",
                "type": "list"
            }

            pageManager = PageManager()
            test_pages = []
            for key in markup['__URLS__']:
                page_file = os.path.join(directory, key)
                with codecs.open(page_file, "r", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')
                pageManager.addPage(key, page_str)
                test_pages.append(page_str)

            schema = markup.pop("__SCHEMA__", None)
            urls = markup.pop("__URLS__", None)

            pageManager.learnStripes()

            list_markup = {}
            list_names = {}
            if LEARN_LISTS:
                (list_markup, list_names) = pageManager.learnListMarkups()

                # This is the div learning
                train_pages = {}
                for page_id in pageManager._pages:
                    train_pages[page_id] = pageManager.getPage(page_id).getString()
                d = DivListLearner()
                div_rules, div_markup = d.run(train_pages)

                (div_list_markup, div_list_names) = pageManager.listRulesToMarkup(div_rules)
                for page_id in div_markup:
                    for item in div_markup[page_id]:
                        if item in div_list_markup[page_id]:
                            if 'starting_token_location' in div_markup[page_id][item]:
                                div_list_markup[page_id][item]['starting_token_location'] = div_markup[page_id][item]['starting_token_location']
                            if 'ending_token_location' in div_markup[page_id][item]:
                                div_list_markup[page_id][item]['ending_token_location'] = div_markup[page_id][item]['ending_token_location']
                            if div_markup[page_id][item]['sequence']:
                                for idx, val in enumerate(div_markup[page_id][item]['sequence']):
                                    if len(div_list_markup[page_id][item]['sequence']) <= idx:
                                        div_list_markup[page_id][item]['sequence'].insert(idx, val)
                                    else:
                                        div_list_markup[page_id][item]['sequence'][idx]['starting_token_location'] = val['starting_token_location']
                                        div_list_markup[page_id][item]['sequence'][idx]['ending_token_location'] = val['ending_token_location']

                # Now add these to the list_markup and list_names
                if len(div_rules.rules) > 0:
                    for page_id in div_list_markup:
                        if page_id not in list_markup:
                            list_markup[page_id] = {}
                        list_markup[page_id].update(div_list_markup[page_id])
                    list_names.update(div_list_names)

            rule_set = pageManager.learnAllRules()
            rule_set.removeBadRules(test_pages)
            (markup, names) = pageManager.rulesToMarkup(rule_set)
            for key in markup.keys():
                if key in list_markup:
                    markup[key].update(list_markup[key])

            count = 1
            # Generate the schema from the list slots
            for list_name in list_names.keys():
                count += 1
                auto_markup_slot = copy.deepcopy(list_slot)
                auto_markup_slot['text'] = list_name
                auto_markup_slot['id'] = 'j1_' + str(count)
                auto_markup_slot['li_attr']['id'] = 'j1_' + str(count)
                auto_markup_slot['a_attr']['id'] = 'j1_' + str(count) + '_anchor'

                ## now add the children to the auto learned list slot
                children = []
                for name in list_names[list_name]:
                    count += 1
                    auto_markup_slot_sub = copy.deepcopy(markup_slot)
                    auto_markup_slot_sub['text'] = name
                    auto_markup_slot_sub['id'] = 'j1_' + str(count)
                    auto_markup_slot_sub['li_attr']['id'] = 'j1_' + str(count)
                    auto_markup_slot_sub['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
                    children.append(auto_markup_slot_sub)
                auto_markup_slot['children'] = children
                schema[0]['children'].append(auto_markup_slot)

            # Generate the schema from the item slots
            for name in names:
                count += 1
                auto_markup_slot = copy.deepcopy(markup_slot)
                auto_markup_slot['text'] = name
                auto_markup_slot['id'] = 'j1_' + str(count)
                auto_markup_slot['li_attr']['id'] = 'j1_' + str(count)
                auto_markup_slot['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
                schema[0]['children'].append(auto_markup_slot)

            markup['__SCHEMA__'] = schema
            markup['__URLS__'] = urls

            with codecs.open(markup_file, "w", "utf-8") as myfile:
                myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))
                myfile.close()
        else:
            with codecs.open(markup_file, "w", "utf-8") as myfile:
                myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))
                myfile.close()

        return jsonify(markup)
    abort(404)
class TruffleShuffle(object):
    # json lines file is of the CDR format
    def __init__(self, page_file_dir='/path/to/dir/', json_lines_file=None):
        self.__page_file_dir = page_file_dir
        self.__chunkBreakSeparator = '<BRK>'
        self.__page_manager = PageManager()

        if json_lines_file:
            count = 0
            myfile = codecs.open(json_lines_file, "r", "utf-8")
            for line in myfile:
                count += 1
                try:
                    json_object = json.loads(line)
                    the_file = json_object['doc_id']
                    page_str = json_object['raw_content']
                    self.__page_manager.addPage(the_file, page_str)
                except:
                    print 'Unable to process line %d' % count
        else:
            files = [f for f in os.listdir(self.__page_file_dir) if os.path.isfile(os.path.join(self.__page_file_dir, f))]
            for the_file in files:
                if the_file.startswith('.'):
                    continue
                with codecs.open(os.path.join(self.__page_file_dir, the_file), "rU", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')
                    self.__page_manager.addPage(the_file, page_str)

    def get_chunk_separator(self):
        return self.__chunkBreakSeparator

    def get_page_manager(self):
        return self.__page_manager

    # so maybe you table randome samples of 3 pages, and induce a template
    # if you find a template that is similar (or matches) most, then that is the template for this cluster?
    # or you could do a greedy build or something (e.g., add another page and if it doesn't change, you are good)
    def sample_and_learn_template(self, cluster_members, sub_sample_size=5, iterations=10):
        stripes = []
        for itr in range(iterations):
            shuffle(cluster_members)  # randomly orders them
            random_members = cluster_members[0:sub_sample_size]  # get the sub-sample
            template = self.induce_template(random_members)

            stripe_texts = []
            for stripe in template:
                stripe_text = stripe['stripe']
                stripe_texts.append(stripe_text)

            # now, only keep the top X longest stripes and see what it does...
            top_x = 10  # for now...
            stripes_by_size = {}
            for stpe in stripe_texts:
                stsz = len(stpe)
                if stsz not in stripes_by_size:
                    stripes_by_size[stsz] = []
                stripes_by_size[stsz].append(stpe)

            top_sizes = sorted(stripes_by_size.keys(), reverse=True)
            kept_big_stripes = []
            for tsz in top_sizes:
                kept_big_stripes.extend(stripes_by_size[tsz])
                if len(kept_big_stripes) > top_x:
                    break

            # stripes_string = self.__chunkBreakSeparator.join(stripe_texts)
            stripes_string = self.__chunkBreakSeparator.join(kept_big_stripes[:top_x])
            stripes.append(stripes_string)

        template_occurrences = {}
        for tstr in stripes:
            template_occurrences[tstr] = stripes.count(tstr)

        for sstring in template_occurrences:
            if template_occurrences[sstring] > 1:
                print "Template: %s" % sstring[:250]  # just a little bit
                print "Induced template occurs %d out of %d" % (template_occurrences[sstring], iterations)

    def induce_template(self, cluster_members):
        sub_page_mgr = PageManager()
        for id in cluster_members:
            curr_page = self.__page_manager.getPage(id)
            sub_page_mgr.addPage(id, curr_page.string)
        sub_page_mgr.learnStripes()
        return sub_page_mgr.getStripes()

    def prep_truffles_to_shuffle(self):
        all_chunks = set()
        page_chunks_map = {}
        for page_id in self.__page_manager.getPageIds():
            page_chunks = self.__page_manager.getPageChunks(page_id)
            all_chunks.update(page_chunks)
            page_chunks_map[page_id] = page_chunks

        chunks_to_remove = set()
        all_pages_sz = len(self.__page_manager.getPageIds())
        for chunk in all_chunks:
            num_pages_with_chunk = 0
            for page_id in self.__page_manager.getPageIds():
                if chunk in page_chunks_map[page_id]:
                    num_pages_with_chunk += 1
            if num_pages_with_chunk < 10 or num_pages_with_chunk == all_pages_sz:
                chunks_to_remove.add(chunk)

        # print str(len(all_chunks)) + " chunks before filtering"
        all_chunks.difference_update(chunks_to_remove)
        for page_id in self.__page_manager.getPageIds():
            page_chunks_map[page_id].difference_update(chunks_to_remove)
        # print str(len(all_chunks)) + " chunks left after filtering"
        # print str(all_pages_sz) + " pages total"

        return all_chunks, page_chunks_map

    ##############################
    #
    # Clusters pages according to "rules". A "rule" is a list of chunks, and a "chunk" is a section of a Web page
    # that is visible to a user.
    #
    # Inputs:
    #    algorithm: 'rule_size': cluster by the size of rule from long rules to short rules
    #               'coverage' : cluster by the number of pages covered by a rule, small to big (more specific to less)
    #
    # Outputs:
    #    dict[rule] = {
    #        'MEMBERS': list of page ids (Pids from the PageManager),
    #        'ANCHOR': the anchoring chunk for this cluster
    #    }
    #    That is, each entry is a rule and its value is a dict. Note that an anchor is unique
    #    Each rule is a string of chunk_1<BRK>chunk_2<BRK>...<BRK>chunk_N
    #    it's a string to make it an index, but to use it you could break on <BRK>
    #    which you can get from the method get_chunk_separator()
    #
    ##############################
    def do_truffle_shuffle(self, algorithm='coverage'):
        all_chunks, page_chunks_map = self.prep_truffles_to_shuffle()

        chunk_counts = {}
        seen_rules = []
        rule_anchors = {}
        for chunk in all_chunks:
            pages_with_chunk = []
            for page_id in self.__page_manager.getPageIds():
                if chunk in page_chunks_map[page_id]:
                    pages_with_chunk.append(page_id)

            other_chunks = set()
            other_chunks.update(page_chunks_map[pages_with_chunk[0]])
            for page_id in pages_with_chunk:
                other_chunks.intersection_update(page_chunks_map[page_id])

            # now, find all the guys that have all of those chunks...
            if len(other_chunks) > 1:  # one token is not enough, enforce that there are at least 2...
                rule = self.__chunkBreakSeparator.join(other_chunks)
                if rule not in seen_rules:
                    chunk_counts[rule] = pages_with_chunk
                    rule_anchors[rule] = chunk

        if algorithm == 'coverage':
            counts = dict([(rule, len(chunk_counts[rule])) for rule in chunk_counts])
        else:
            # count by the size of the rule, but prefer longer,
            # so make it negative so we don't need to change sorted() call below (e.g., make rules negative
            # so that sorted small to large actually gives us longer rules (more negative) to shorter (less neg)
            counts = dict([(rule, -len(rule.split(self.__chunkBreakSeparator))) for rule in chunk_counts])

        inverted = {}
        for rl in counts:
            sz = counts[rl]
            if sz not in inverted:
                inverted[sz] = []
            inverted[sz].append(rl)

        final_clusters = {}
        already_clustered = []
        for size in sorted(inverted.keys()):
            rules = inverted[size]
            for rule in rules:
                pids = [p for p in chunk_counts[rule] if p not in already_clustered]
                already_clustered.extend(pids)
                if len(pids) > 1:
                    final_clusters[rule] = {
                        'MEMBERS': pids,
                        'ANCHOR': rule_anchors[rule]
                    }

        return final_clusters
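# Usage sketch for the TruffleShuffle class above; the page directory is a placeholder
# and this driver is not part of the original code.
if __name__ == '__main__':
    ts = TruffleShuffle(page_file_dir='/path/to/dir/')
    clusters = ts.do_truffle_shuffle(algorithm='coverage')
    separator = ts.get_chunk_separator()
    for rule in clusters:
        print 'anchor chunk  : %s' % clusters[rule]['ANCHOR']
        print 'cluster size  : %d' % len(clusters[rule]['MEMBERS'])
        print 'chunks in rule: %d' % len(rule.split(separator))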