# Shared imports for the snippets below. The stdlib/third-party imports are
# what the code actually uses; the project-local module paths (learning.*) are
# assumptions inferred from the CLI usage string. Usage, logger, app, and
# LEARN_LISTS are assumed to be defined elsewhere in these modules.
import codecs
import copy
import getopt
import json
import os
import sys
import urllib2

import chardet
from flask import request, jsonify, abort

from learning.PageManager import PageManager          # module path assumed
from learning.DivListLearner import DivListLearner    # module path assumed

write_debug_files = False  # module-level default; main() below enables it via -d


def main():
    page_file_dir = '/Users/bamana/Documents/InferLink/workspace/memex/memexpython/input/_template_test'
    pageManager = PageManager()

    # Add every regular, non-hidden file in the directory as a training page
    files = [f for f in os.listdir(page_file_dir) if os.path.isfile(os.path.join(page_file_dir, f))]
    for the_file in files:
        if the_file.startswith('.'):
            continue
        with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile:
            page_str = myfile.read().encode('utf-8')
            pageManager.addPage(the_file, page_str)

    pageManager.learnStripes()
    (list_markup, list_names) = pageManager.learnListMarkups()
    rule_set = pageManager.learnAllRules()
    (markup, names) = pageManager.rulesToMarkup(rule_set)

    # Merge the list markup into the per-page item markup
    for key in markup.keys():
        if key in list_markup:
            markup[key].update(list_markup[key])
    # print json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': '))

    rule_set = pageManager.learnRulesFromMarkup(list_markup)
    print json.dumps(json.loads(rule_set.toJson()), sort_keys=True, indent=2, separators=(',', ': '))
def run(page_file_dir, ignore_files=[]):
    test_pages = []
    pageManager = PageManager(write_debug_files)

    if os.path.isfile(page_file_dir):
        # The argument is a file listing URLs: fetch each page over HTTP
        with open(page_file_dir) as f:
            urls = f.readlines()
        for url in urls:
            page_url = url.strip()
            req = urllib2.urlopen(page_url)
            page_contents = req.read()
            charset = chardet.detect(page_contents)
            page_encoding = charset['encoding']
            # Re-encode to UTF-8 before adding; the flattened original computed
            # page_str but then passed the raw page_contents, leaving it unused
            page_str = page_contents.decode(page_encoding).encode('utf-8')
            pageManager.addPage(page_url, page_str)
            test_pages.append(page_url)
    else:
        # The argument is a directory of saved pages
        files = [f for f in os.listdir(page_file_dir) if os.path.isfile(os.path.join(page_file_dir, f))]
        for the_file in files:
            if the_file.startswith('.') or the_file == 'markup.json' \
                    or the_file == 'rules.json' or the_file in ignore_files:
                continue
            with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile:
                page_str = myfile.read().encode('utf-8')
                pageManager.addPage(the_file, page_str)
                test_pages.append(the_file)

    pageManager.learnStripes()

    ## table, ul, etc. list learning
    # (list_markup, list_names) = pageManager.learnListMarkups()
    # list_rules = pageManager.learnRulesFromMarkup(list_markup)

    ## div learning
    # train_pages = {}
    # for page_id in pageManager._pages:
    #     train_pages[page_id] = pageManager.getPage(page_id).getString()
    # d = DivListLearner()
    # div_rules, div_markup = d.run(train_pages)

    rule_set = pageManager.learnAllRules()
    rule_set.removeBadRules(test_pages)

    # for rule in list_rules.rules:
    #     rule_set.add_rule(rule)
    #
    # for rule in div_rules.rules:
    #     rule_set.add_rule(rule)

    return rule_set
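# A minimal usage sketch for run(), under the assumptions in the header above
# (the folder path and the ignore list are placeholders):
#
#   learned = run('/path/to/pages', ignore_files=['old_page.html'])
#   print json.dumps(json.loads(learned.toJson()), sort_keys=True, indent=2,
#                    separators=(',', ': '))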
def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "dh", ["debug", "help"])
            write_debug_files = False
            for opt, arg in opts:
                if opt in ('-d', '--debug'):
                    write_debug_files = True
                if opt in ('-h', '--help'):
                    raise Usage('python -m learning.RuleLearnerAllSlots [OPTIONAL_PARAMS] [TEST_FILES_FOLDER] '
                                '\n\t[OPTIONAL_PARAMS]: -d to get debug stripe html files')
        except getopt.error, msg:
            raise Usage(msg)

        logger.info('Running RuleLearnerAllSlots All Slots with files at %s', args[0])

        # read the directory location from arg0
        page_file_dir = args[0]
        pageManager = PageManager(write_debug_files)

        if os.path.isfile(page_file_dir):
            # A file listing URLs: fetch each page over HTTP
            with open(page_file_dir) as f:
                urls = f.readlines()
            for url in urls:
                page_url = url.strip()
                req = urllib2.urlopen(page_url)
                page_contents = req.read()
                charset = chardet.detect(page_contents)
                page_encoding = charset['encoding']
                # Re-encode to UTF-8 before adding (page_str was computed but
                # unused in the flattened original)
                page_str = page_contents.decode(page_encoding).encode('utf-8')
                pageManager.addPage(page_url, page_str)
        else:
            files = [f for f in os.listdir(page_file_dir) if os.path.isfile(os.path.join(page_file_dir, f))]
            for the_file in files:
                if the_file.startswith('.'):
                    continue
                with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')
                    pageManager.addPage(the_file, page_str)

        pageManager.learnStripes()
        rule_set = pageManager.learnAllRules()
        print json.dumps(json.loads(rule_set.toJson()), sort_keys=True, indent=2, separators=(',', ': '))
    except Usage, err:
        # Assumed outer handler (the flattened source omitted it); this is the
        # conventional counterpart to the inner "raise Usage(msg)" pattern.
        print >> sys.stderr, err.msg
        return 2
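# Example invocation, taken from the Usage string above (the folder path is a
# placeholder):
#
#   python -m learning.RuleLearnerAllSlots -d /path/to/test_files_folder
#
# Assumed entry-point wiring for the main(argv)/Usage pattern above:
if __name__ == "__main__":
    sys.exit(main())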
def autolearn_grid():
    if request.method == 'POST':
        data = request.get_json(force=True)
        page_urls = data['urls']

        page_manager = PageManager()
        results = {}
        for page_url in page_urls:
            page_contents = urllib2.urlopen(page_url).read()
            page_manager.addPage(page_url, page_contents)

        page_manager.learnStripes()
        rule_set = page_manager.learnAllRules()
        results['rules'] = json.loads(rule_set.toJson())

        return jsonify(results)
    abort(404)
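# Hypothetical client for the handler above: the route path and host are
# assumptions (the @app.route decorator is not shown in this section); the
# payload shape {"urls": [...]} comes from the handler itself.
def post_autolearn(urls, endpoint='http://localhost:5000/autolearn_grid'):
    req = urllib2.Request(endpoint,
                          data=json.dumps({'urls': urls}),
                          headers={'Content-Type': 'application/json'})
    # Returns the learned rules as parsed JSON: {"rules": ...}
    return json.loads(urllib2.urlopen(req).read())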
def save_markup():
    if request.method == 'POST':
        data = request.get_json(force=True)
        project_folder = data['project_folder']
        markup = data['markup']
        directory = os.path.join(app.static_folder, 'project_folders', project_folder)
        markup_file = os.path.join(directory, 'learning', 'markup.json')

        if not markup['__SCHEMA__'][0]['children']:
            # Template jsTree nodes that get cloned for each auto-learned slot
            markup_slot = {
                "id": "j1_2",
                "text": "slot",
                "icon": "glyphicon glyphicon-stop",
                "li_attr": {"id": "j1_2"},
                "a_attr": {"href": "#", "id": "j1_2_anchor"},
                "state": {"loaded": True, "opened": False, "selected": False, "disabled": False},
                "data": {},
                "children": [],
                "type": "item"
            }
            list_slot = {
                "a_attr": {"href": "#", "id": "j1_3_anchor"},
                "children": [],
                "data": {},
                "icon": "glyphicon glyphicon-th-list",
                "id": "j1_3",
                "li_attr": {"id": "j1_3"},
                "state": {"disabled": False, "loaded": True, "opened": False, "selected": False},
                "text": "category",
                "type": "list"
            }

            pageManager = PageManager()
            for key in markup['__URLS__']:
                page_file = os.path.join(directory, key)
                with codecs.open(page_file, "r", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')
                    pageManager.addPage(key, page_str)

            schema = markup.pop("__SCHEMA__", None)
            urls = markup.pop("__URLS__", None)

            pageManager.learnStripes()
            (list_markup, list_names) = pageManager.learnListMarkups()
            rule_set = pageManager.learnAllRules()
            (markup, names) = pageManager.rulesToMarkup(rule_set)
            for key in markup.keys():
                if key in list_markup:
                    markup[key].update(list_markup[key])

            count = 1
            # Generate the schema from the list slots
            for list_name in list_names.keys():
                count += 1
                auto_markup_slot = copy.deepcopy(list_slot)
                auto_markup_slot['text'] = list_name
                auto_markup_slot['id'] = 'j1_' + str(count)
                auto_markup_slot['li_attr']['id'] = 'j1_' + str(count)
                auto_markup_slot['a_attr']['id'] = 'j1_' + str(count) + '_anchor'

                ## now add the children to the auto learned list slot
                children = []
                for name in list_names[list_name]:
                    count += 1
                    auto_markup_slot_sub = copy.deepcopy(markup_slot)
                    auto_markup_slot_sub['text'] = name
                    auto_markup_slot_sub['id'] = 'j1_' + str(count)
                    auto_markup_slot_sub['li_attr']['id'] = 'j1_' + str(count)
                    auto_markup_slot_sub['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
                    children.append(auto_markup_slot_sub)
                auto_markup_slot['children'] = children
                schema[0]['children'].append(auto_markup_slot)

            # Generate the schema from the item slots
            for name in names:
                count += 1
                auto_markup_slot = copy.deepcopy(markup_slot)
                auto_markup_slot['text'] = name
                auto_markup_slot['id'] = 'j1_' + str(count)
                auto_markup_slot['li_attr']['id'] = 'j1_' + str(count)
                auto_markup_slot['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
                schema[0]['children'].append(auto_markup_slot)

            markup['__SCHEMA__'] = schema
            markup['__URLS__'] = urls

            with codecs.open(markup_file, "w", "utf-8") as myfile:
                myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))
        else:
            with codecs.open(markup_file, "w", "utf-8") as myfile:
                myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))

        return jsonify(markup)
    abort(404)
def save_markup():
    if request.method == 'POST':
        data = request.get_json(force=True)
        project_folder = data['project_folder']
        markup = data['markup']
        directory = os.path.join(app.static_folder, 'project_folders', project_folder)
        markup_file = os.path.join(directory, 'learning', 'markup.json')

        if not markup['__SCHEMA__'][0]['children']:
            # Template jsTree nodes that get cloned for each auto-learned slot
            markup_slot = {
                "id": "j1_2",
                "text": "slot",
                "icon": "glyphicon glyphicon-stop",
                "li_attr": {"id": "j1_2"},
                "a_attr": {"href": "#", "id": "j1_2_anchor"},
                "state": {"loaded": True, "opened": False, "selected": False, "disabled": False},
                "data": {},
                "children": [],
                "type": "item"
            }
            list_slot = {
                "a_attr": {"href": "#", "id": "j1_3_anchor"},
                "children": [],
                "data": {},
                "icon": "glyphicon glyphicon-th-list",
                "id": "j1_3",
                "li_attr": {"id": "j1_3"},
                "state": {"disabled": False, "loaded": True, "opened": False, "selected": False},
                "text": "category",
                "type": "list"
            }

            pageManager = PageManager()
            test_pages = []
            for key in markup['__URLS__']:
                page_file = os.path.join(directory, key)
                with codecs.open(page_file, "r", "utf-8") as myfile:
                    page_str = myfile.read().encode('utf-8')
                    pageManager.addPage(key, page_str)
                    # Track page ids for removeBadRules(), matching run() above
                    # (the flattened original appended page_str instead)
                    test_pages.append(key)

            schema = markup.pop("__SCHEMA__", None)
            urls = markup.pop("__URLS__", None)

            pageManager.learnStripes()

            list_markup = {}
            list_names = {}
            if LEARN_LISTS:
                (list_markup, list_names) = pageManager.learnListMarkups()

                # This is the div learning
                train_pages = {}
                for page_id in pageManager._pages:
                    train_pages[page_id] = pageManager.getPage(page_id).getString()
                d = DivListLearner()
                div_rules, div_markup = d.run(train_pages)

                (div_list_markup, div_list_names) = pageManager.listRulesToMarkup(div_rules)
                # Copy token locations from the div markup onto the div list markup
                for page_id in div_markup:
                    for item in div_markup[page_id]:
                        if item in div_list_markup[page_id]:
                            if 'starting_token_location' in div_markup[page_id][item]:
                                div_list_markup[page_id][item]['starting_token_location'] = \
                                    div_markup[page_id][item]['starting_token_location']
                            if 'ending_token_location' in div_markup[page_id][item]:
                                div_list_markup[page_id][item]['ending_token_location'] = \
                                    div_markup[page_id][item]['ending_token_location']
                            if div_markup[page_id][item]['sequence']:
                                for idx, val in enumerate(div_markup[page_id][item]['sequence']):
                                    if len(div_list_markup[page_id][item]['sequence']) <= idx:
                                        div_list_markup[page_id][item]['sequence'].insert(idx, val)
                                    else:
                                        div_list_markup[page_id][item]['sequence'][idx]['starting_token_location'] = \
                                            val['starting_token_location']
                                        div_list_markup[page_id][item]['sequence'][idx]['ending_token_location'] = \
                                            val['ending_token_location']

                # Now add these to the list_markup and list_names
                if len(div_rules.rules) > 0:
                    for page_id in div_list_markup:
                        if page_id not in list_markup:
                            list_markup[page_id] = {}
                        list_markup[page_id].update(div_list_markup[page_id])
                    list_names.update(div_list_names)

            rule_set = pageManager.learnAllRules()
            rule_set.removeBadRules(test_pages)

            (markup, names) = pageManager.rulesToMarkup(rule_set)
            for key in markup.keys():
                if key in list_markup:
                    markup[key].update(list_markup[key])

            count = 1
            # Generate the schema from the list slots
            for list_name in list_names.keys():
                count += 1
                auto_markup_slot = copy.deepcopy(list_slot)
                auto_markup_slot['text'] = list_name
                auto_markup_slot['id'] = 'j1_' + str(count)
                auto_markup_slot['li_attr']['id'] = 'j1_' + str(count)
                auto_markup_slot['a_attr']['id'] = 'j1_' + str(count) + '_anchor'

                ## now add the children to the auto learned list slot
                children = []
                for name in list_names[list_name]:
                    count += 1
                    auto_markup_slot_sub = copy.deepcopy(markup_slot)
                    auto_markup_slot_sub['text'] = name
                    auto_markup_slot_sub['id'] = 'j1_' + str(count)
                    auto_markup_slot_sub['li_attr']['id'] = 'j1_' + str(count)
                    auto_markup_slot_sub['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
                    children.append(auto_markup_slot_sub)
                auto_markup_slot['children'] = children
                schema[0]['children'].append(auto_markup_slot)

            # Generate the schema from the item slots
            for name in names:
                count += 1
                auto_markup_slot = copy.deepcopy(markup_slot)
                auto_markup_slot['text'] = name
                auto_markup_slot['id'] = 'j1_' + str(count)
                auto_markup_slot['li_attr']['id'] = 'j1_' + str(count)
                auto_markup_slot['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
                schema[0]['children'].append(auto_markup_slot)

            markup['__SCHEMA__'] = schema
            markup['__URLS__'] = urls

            with codecs.open(markup_file, "w", "utf-8") as myfile:
                myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))
        else:
            with codecs.open(markup_file, "w", "utf-8") as myfile:
                myfile.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))

        return jsonify(markup)
    abort(404)
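# Shape of the JSON payload that save_markup() expects, inferred from the
# handler above (field values are illustrative placeholders; __URLS__ is an
# iterable of saved page file names, since the handler only iterates it):
#
#   {
#     "project_folder": "my_project",
#     "markup": {
#       "__SCHEMA__": [{"children": []}],
#       "__URLS__": ["page1.html", "page2.html"],
#       ...
#     }
#   }
#
# When __SCHEMA__[0]["children"] is empty, the handler learns the markup and
# rebuilds the schema automatically before writing learning/markup.json;
# otherwise it writes the posted markup as-is.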