Python PageManager.learnListMarkups Examples

Programming Language: Python

Namespace/Package Name: learning.PageManager

Class/Type: PageManager

Method/Function: learnListMarkups

Examples at hotexamples.com: 3

Python PageManager.learnListMarkups - 3 examples found. These are the top rated real world Python examples of learning.PageManager.PageManager.learnListMarkups extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

PageManager(13)

addPage(13)

learnStripes(9)

learnRulesFromMarkup(5)

learnAllRules(4)

getPage(3)

getVisibleTokenStructure(2)

learnListMarkups(2)

rulesToMarkup(2)

getPageChunks(1)

getPageIds(1)

getPossibleLocations(1)

getStripes(1)

listRulesToMarkup(1)

Example #1

Show file

File: ListLearner.py Project: thezedwards/extraction

def main():
    page_file_dir = '/Users/bamana/Documents/InferLink/workspace/memex/memexpython/input/_template_test'
    
    pageManager = PageManager()
    files = [f for f in os.listdir(page_file_dir) if os.path.isfile(os.path.join(page_file_dir, f))]
    for the_file in files:
        if the_file.startswith('.'):
            continue
        
        with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile:
            page_str = myfile.read().encode('utf-8')
        
        pageManager.addPage(the_file, page_str)
    
    pageManager.learnStripes()
    (list_markup, list_names) = pageManager.learnListMarkups()
    
    rule_set = pageManager.learnAllRules()
    (markup, names) = pageManager.rulesToMarkup(rule_set)
    
    for key in markup.keys():
        if key in list_markup:
            markup[key].update(list_markup[key])

#     print json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': '))
    
    rule_set = pageManager.learnRulesFromMarkup(list_markup)
    print json.dumps(json.loads(rule_set.toJson()), sort_keys=True, indent=2, separators=(',', ': '))

Example #2

Show file

File: controllers.py Project: saggu/extraction

def save_markup():
    if request.method == 'POST':
        data = request.get_json(force=True)
        
        project_folder = data['project_folder']
        markup = data['markup']

        directory = os.path.join(app.static_folder, 'project_folders', project_folder)
        markup_file = os.path.join(directory, 'learning', 'markup.json')

        if not markup['__SCHEMA__'][0]['children']:
            markup_slot = {
              "id": "j1_2",
              "text": "slot",
              "icon": "glyphicon glyphicon-stop",
              "li_attr": {
                "id": "j1_2"
              },
              "a_attr": {
                "href": "#",
                "id": "j1_2_anchor"
              },
              "state": {
                "loaded": True,
                "opened": False,
                "selected": False,
                "disabled": False
              },
              "data": {},
              "children": [],
              "type": "item"
            };
            
            list_slot = {
             "a_attr": {
                "href": "#",
                "id": "j1_3_anchor"
              },
              "children": [],
              "data": {},
              "icon": "glyphicon glyphicon-th-list",
              "id": "j1_3",
              "li_attr": {
                "id": "j1_3"
              },
              "state": {
                "disabled": False,
                "loaded": True,
                "opened": False,
                "selected": False
              },
              "text": "category",
              "type": "list"
            };

            pageManager = PageManager()
            for key in markup['__URLS__']:
                page_file = os.path.join(directory, key)
                with codecs.open(page_file, "r", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')
                pageManager.addPage(key, page_str)

            schema = markup.pop("__SCHEMA__", None)
            urls = markup.pop("__URLS__", None)

            pageManager.learnStripes()
            (list_markup, list_names) = pageManager.learnListMarkups()
            rule_set = pageManager.learnAllRules()
            (markup, names) = pageManager.rulesToMarkup(rule_set)

            for key in markup.keys():
                if key in list_markup:
                    markup[key].update(list_markup[key])

            count = 1
            # Generate the schema from the list slots
            for list_name in list_names.keys():
                count += 1
                auto_markup_slot = copy.deepcopy(list_slot)
                auto_markup_slot['text'] = list_name
                auto_markup_slot['id'] = 'j1_'+str(count)
                auto_markup_slot['li_attr']['id'] = 'j1_'+str(count)
                auto_markup_slot['a_attr']['id'] = 'j1_'+str(count)+'_anchor'
                ## now add the children to the auto learned list slot
                children = []
                for name in list_names[list_name]:
                    count += 1
                    auto_markup_slot_sub = copy.deepcopy(markup_slot)
                    auto_markup_slot_sub['text'] = name
                    auto_markup_slot_sub['id'] = 'j1_'+str(count)
                    auto_markup_slot_sub['li_attr']['id'] = 'j1_'+str(count)
                    auto_markup_slot_sub['a_attr']['id'] = 'j1_'+str(count)+'_anchor'
                    children.append(auto_markup_slot_sub)
                auto_markup_slot['children'] = children
                schema[0]['children'].append(auto_markup_slot)

            # Generate the schema from the item slots
            for name in names:
                count += 1
                auto_markup_slot = copy.deepcopy(markup_slot)
                auto_markup_slot['text'] = name
                auto_markup_slot['id'] = 'j1_'+str(count)
                auto_markup_slot['li_attr']['id'] = 'j1_'+str(count)
                auto_markup_slot['a_attr']['id'] = 'j1_'+str(count)+'_anchor'
                schema[0]['children'].append(auto_markup_slot)
            markup['__SCHEMA__'] = schema
            markup['__URLS__'] = urls

            with codecs.open(markup_file, "w", "utf-8") as myfile:
                myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))
                myfile.close()

        else:
            with codecs.open(markup_file, "w", "utf-8") as myfile:
                myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))
                myfile.close()

        return jsonify(markup)
    abort(404)

Example #3

Show file

File: controllers.py Project: thezedwards/extraction

def save_markup():
    if request.method == 'POST':
        data = request.get_json(force=True)
        
        project_folder = data['project_folder']
        markup = data['markup']

        directory = os.path.join(app.static_folder, 'project_folders', project_folder)
        markup_file = os.path.join(directory, 'learning', 'markup.json')

        if not markup['__SCHEMA__'][0]['children']:
            markup_slot = {
              "id": "j1_2",
              "text": "slot",
              "icon": "glyphicon glyphicon-stop",
              "li_attr": {
                "id": "j1_2"
              },
              "a_attr": {
                "href": "#",
                "id": "j1_2_anchor"
              },
              "state": {
                "loaded": True,
                "opened": False,
                "selected": False,
                "disabled": False
              },
              "data": {},
              "children": [],
              "type": "item"
            };
            
            list_slot = {
             "a_attr": {
                "href": "#",
                "id": "j1_3_anchor"
              },
              "children": [],
              "data": {},
              "icon": "glyphicon glyphicon-th-list",
              "id": "j1_3",
              "li_attr": {
                "id": "j1_3"
              },
              "state": {
                "disabled": False,
                "loaded": True,
                "opened": False,
                "selected": False
              },
              "text": "category",
              "type": "list"
            };

            pageManager = PageManager()
            test_pages = []
            for key in markup['__URLS__']:
                page_file = os.path.join(directory, key)
                with codecs.open(page_file, "r", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')
                pageManager.addPage(key, page_str)
                test_pages.append(page_str)

            schema = markup.pop("__SCHEMA__", None)
            urls = markup.pop("__URLS__", None)

            pageManager.learnStripes()
            list_markup = {}
            list_names = {}
            if LEARN_LISTS:
                (list_markup, list_names) = pageManager.learnListMarkups()
                
                #This is the div learning
                train_pages = {}
                for page_id in pageManager._pages:
                    train_pages[page_id] = pageManager.getPage(page_id).getString()
                d = DivListLearner()
                div_rules, div_markup = d.run(train_pages)
                 
                (div_list_markup, div_list_names) = pageManager.listRulesToMarkup(div_rules)
                
                for page_id in div_markup:
                    for item in div_markup[page_id]:
                        if item in div_list_markup[page_id]:
                            if 'starting_token_location' in div_markup[page_id][item]:
                                div_list_markup[page_id][item]['starting_token_location'] = div_markup[page_id][item]['starting_token_location']
                            if 'ending_token_location' in div_markup[page_id][item]:
                                div_list_markup[page_id][item]['ending_token_location'] = div_markup[page_id][item]['ending_token_location']
                            if div_markup[page_id][item]['sequence']:
                                for idx, val in enumerate(div_markup[page_id][item]['sequence']):
                                    if len(div_list_markup[page_id][item]['sequence']) <= idx:
                                        div_list_markup[page_id][item]['sequence'].insert(idx, val);
                                    else:
                                        div_list_markup[page_id][item]['sequence'][idx]['starting_token_location'] = val['starting_token_location']
                                        div_list_markup[page_id][item]['sequence'][idx]['ending_token_location'] = val['ending_token_location']
                
                #Now add these to the list_markup and list_names
                if len(div_rules.rules) > 0:
                    for page_id in div_list_markup:
                        if page_id not in list_markup:
                            list_markup[page_id] = {}
                        list_markup[page_id].update(div_list_markup[page_id])
                    list_names.update(div_list_names)
            
            rule_set = pageManager.learnAllRules()
            rule_set.removeBadRules(test_pages)
            
            (markup, names) = pageManager.rulesToMarkup(rule_set)

            for key in markup.keys():
                if key in list_markup:
                    markup[key].update(list_markup[key])

            count = 1
            # Generate the schema from the list slots
            for list_name in list_names.keys():
                count += 1
                auto_markup_slot = copy.deepcopy(list_slot)
                auto_markup_slot['text'] = list_name
                auto_markup_slot['id'] = 'j1_'+str(count)
                auto_markup_slot['li_attr']['id'] = 'j1_'+str(count)
                auto_markup_slot['a_attr']['id'] = 'j1_'+str(count)+'_anchor'
                ## now add the children to the auto learned list slot
                children = []
                for name in list_names[list_name]:
                    count += 1
                    auto_markup_slot_sub = copy.deepcopy(markup_slot)
                    auto_markup_slot_sub['text'] = name
                    auto_markup_slot_sub['id'] = 'j1_'+str(count)
                    auto_markup_slot_sub['li_attr']['id'] = 'j1_'+str(count)
                    auto_markup_slot_sub['a_attr']['id'] = 'j1_'+str(count)+'_anchor'
                    children.append(auto_markup_slot_sub)
                auto_markup_slot['children'] = children
                schema[0]['children'].append(auto_markup_slot)

            # Generate the schema from the item slots
            for name in names:
                count += 1
                auto_markup_slot = copy.deepcopy(markup_slot)
                auto_markup_slot['text'] = name
                auto_markup_slot['id'] = 'j1_'+str(count)
                auto_markup_slot['li_attr']['id'] = 'j1_'+str(count)
                auto_markup_slot['a_attr']['id'] = 'j1_'+str(count)+'_anchor'
                schema[0]['children'].append(auto_markup_slot)
            markup['__SCHEMA__'] = schema
            markup['__URLS__'] = urls

            with codecs.open(markup_file, "w", "utf-8") as myfile:
                myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))
                myfile.close()

        else:
            with codecs.open(markup_file, "w", "utf-8") as myfile:
                myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))
                myfile.close()

        return jsonify(markup)
    abort(404)