def main(): page_file_dir = '/Users/bamana/Documents/InferLink/workspace/memex/memexpython/input/_template_test' pageManager = PageManager() files = [f for f in os.listdir(page_file_dir) if os.path.isfile(os.path.join(page_file_dir, f))] for the_file in files: if the_file.startswith('.'): continue with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile: page_str = myfile.read().encode('utf-8') pageManager.addPage(the_file, page_str) pageManager.learnStripes() (list_markup, list_names) = pageManager.learnListMarkups() rule_set = pageManager.learnAllRules() (markup, names) = pageManager.rulesToMarkup(rule_set) for key in markup.keys(): if key in list_markup: markup[key].update(list_markup[key]) # print json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')) rule_set = pageManager.learnRulesFromMarkup(list_markup) print json.dumps(json.loads(rule_set.toJson()), sort_keys=True, indent=2, separators=(',', ': '))
def induce_template(self, cluster_members):
    """Induce a template from one cluster of pages.

    cluster_members: iterable of page identifiers known to this object's
    page manager. Builds a fresh PageManager containing only those pages,
    learns its stripes, and returns them.
    """
    sub_page_mgr = PageManager()
    # renamed loop variable: the original used 'id', shadowing the builtin
    for member_id in cluster_members:
        curr_page = self.__page_manager.getPage(member_id)
        sub_page_mgr.addPage(member_id, curr_page.string)
    sub_page_mgr.learnStripes()
    return sub_page_mgr.getStripes()
def do_learning():
    """Flask endpoint: learn extraction rules from a project's saved markup.

    Reads learning/markup.json from the posted project folder, loads every
    page listed under __URLS__, learns stripes and rules, writes the rules
    to learning/rules.json and returns them as JSON. Non-POST requests 404.
    """
    if request.method == 'POST':
        data = request.get_json(force=True)
        project_folder = data['project_folder']
        directory = os.path.join(app.static_folder, 'project_folders', project_folder)
        markup_file = os.path.join(directory, 'learning', 'markup.json')
        with codecs.open(markup_file, "r", "utf-8") as fp:
            json_str = fp.read().encode('utf-8')
        markup = json.loads(json_str)
        page_manager = PageManager()
        for key in markup['__URLS__']:
            page_file = os.path.join(directory, key)
            with codecs.open(page_file, "r", "utf-8") as fp:
                page_str = fp.read().encode('utf-8')
            page_manager.addPage(key, page_str)
        # strip the bookkeeping keys before learning
        markup.pop("__SCHEMA__", None)
        markup.pop("__URLS__", None)
        page_manager.learnStripes(markup)
        rule_set = page_manager.learnRulesFromMarkup(markup)
        rules_file = os.path.join(directory, 'learning', 'rules.json')
        with codecs.open(rules_file, "w", "utf-8") as outfile:
            outfile.write(json.dumps(json.loads(rule_set.toJson()),
                                     sort_keys=True, indent=2, separators=(',', ': ')))
            outfile.close()
        return jsonify(rules=json.loads(rule_set.toJson()))
    abort(404)
def main(argv=None): if argv is None: argv = sys.argv try: try: opts, args = getopt.getopt(argv[1:], "dh", ["debug", "help"]) write_debug_files = False for opt in opts: if opt in [('-d', ''), ('--debug', '')]: write_debug_files = True if opt in [('-h', ''), ('--help', '')]: raise Usage('python -m learning.RuleLearner [OPTIONAL_PARAMS] [TEST_FILES_FOLDER] [MARKUP_FILE]\n\t[OPTIONAL_PARAMS]: -d to get debug stripe html files') except getopt.error, msg: raise Usage(msg) logger.info('Running RuleLearner with file at %s for rules %s', args[0], args[1]) #read the directory location from arg0 page_file_dir = args[0] pageManager = PageManager(write_debug_files) start_time = time.time() for subdir, dirs, files in os.walk(page_file_dir): for the_file in files: if the_file.startswith('.'): continue with codecs.open(os.path.join(subdir, the_file), "r", "utf-8") as myfile: page_str = myfile.read().encode('utf-8') pageManager.addPage(the_file, page_str) logger.info("--- LOAD PAGES: %s seconds ---" % (time.time() - start_time)) #Read the markups from a file... start_time = time.time() markups_file = args[1] with codecs.open(markups_file, "r", "utf-8") as myfile: markup_str = myfile.read().encode('utf-8') markups = json.loads(markup_str) markups.pop("__SCHEMA__", None) markups.pop("__URLS__", None) logger.info("--- LOAD MARKUPS: %s seconds ---" % (time.time() - start_time)) pageManager.learnStripes(markups) start_time = time.time() rule_set = pageManager.learnRulesFromMarkup(markups) logger.info("--- LEARN RULES FROM MARKUP: %s seconds ---" % (time.time() - start_time)) if(len(args) > 2): output_file = args[2] with codecs.open(output_file, "w", "utf-8") as myfile: myfile.write(rule_set.toJson()) myfile.close() else: print rule_set.toJson()
def run(page_file_dir, ignore_files=None):
    """Learn extraction rules from a set of pages.

    page_file_dir: either a file containing one URL per line (pages are
    fetched over HTTP) or a directory of saved page files.
    ignore_files: optional collection of file names to skip (in addition to
    hidden files, markup.json and rules.json).
    Returns a rule set with bad rules removed against the loaded pages.
    """
    # Fixed mutable default argument: a shared [] default persists across calls.
    if ignore_files is None:
        ignore_files = []
    test_pages = []
    # NOTE(review): write_debug_files is not defined here -- presumably a
    # module-level flag; confirm it exists at this scope.
    pageManager = PageManager(write_debug_files)
    if os.path.isfile(page_file_dir):
        # Input is a file of URLs: fetch each page.
        with open(page_file_dir) as f:
            urls = f.readlines()
        for url in urls:
            page_url = url.strip()
            req = urllib2.urlopen(page_url)
            page_contents = req.read()
            charset = chardet.detect(page_contents)
            page_encoding = charset['encoding']
            page_str = page_contents.decode(page_encoding).encode('utf-8')
            # NOTE(review): the raw page_contents is added, not the utf-8
            # re-encoded page_str computed above -- confirm this is intended.
            pageManager.addPage(page_url, page_contents)
            test_pages.append(page_url)
    else:
        # Input is a directory of page files.
        files = [f for f in os.listdir(page_file_dir)
                 if os.path.isfile(os.path.join(page_file_dir, f))]
        for the_file in files:
            if (the_file.startswith('.') or the_file == 'markup.json'
                    or the_file == 'rules.json' or the_file in ignore_files):
                continue
            with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile:
                page_str = myfile.read().encode('utf-8')
                pageManager.addPage(the_file, page_str)
                test_pages.append(the_file)
    pageManager.learnStripes()
    rule_set = pageManager.learnAllRules()
    rule_set.removeBadRules(test_pages)
    return rule_set
def main(argv=None): if argv is None: argv = sys.argv try: try: opts, args = getopt.getopt(argv[1:], "h", ["help"]) except getopt.error, msg: raise Usage(msg) #read the directory location from arg0 page_file_dir = args[0] pageManager = PageManager() page_str_array = [] for subdir, dirs, files in os.walk(page_file_dir): for the_file in files: if the_file.startswith('.'): continue with codecs.open(os.path.join(subdir, the_file), "r", "utf-8") as myfile: page_str = myfile.read().encode('utf-8') page_str_array.append(page_str) pageManager.addPage(the_file, page_str) pageManager.learnStripes() #Read the markups from a file... markups_file = args[1] with codecs.open(os.path.join('', markups_file), "r", "utf-8") as myfile: markup_str = myfile.read().encode('utf-8') markups = json.loads(markup_str) markups.pop("__SCHEMA__", None) #Before we learn the stripes let's make sure we can open the output file pageManager.learnStripes(markups) rule_set = pageManager.learnRulesFromMarkup(markups) if(len(args) > 2): output_file = args[2] with codecs.open(output_file, "w", "utf-8") as myfile: myfile.write(rule_set.toJson()) myfile.close() #testing flatten = False extraction_list = rule_set.extract(page_str_array[0]) if rule_set.validate(extraction_list): if flatten: print json.dumps(Landmark.flattenResult(extraction_list), sort_keys=True, indent=2, separators=(',', ': ')) else: print json.dumps(extraction_list, sort_keys=True, indent=2, separators=(',', ': '))
def learn_list_extractors(self, pages):
    """Learn list extractors across a set of pages.

    pages: dict mapping page id -> page content string.
    Returns (rules, markup): the learned rule set (each list rule renamed to
    a unique '_div_listNNNN' and given sub-rules learned from its rows) and
    the per-page markup keyed by those names.
    """
    page_mgr = PageManager()  # write_debug_files=True for stripe debug output
    markup = {}
    for page in pages:
        page_content = pages[page]
        page_mgr.addPage(page, page_content)
        markup[page] = self.lists_on_single_page(page_content)

    page_mgr.learnStripes(markups=markup)
    rules = page_mgr.learnRulesFromMarkup(markup)

    # For each list rule, build a small page manager out of its row extracts
    # so we can learn sub-rules for the fields inside each row.
    sublist_page_managers = {}
    for page in markup:
        for rule_name in markup[page]:
            if rule_name not in sublist_page_managers:
                sublist_page_managers[rule_name] = PageManager()
            for rid in range(len(markup[page][rule_name]['sequence'])):
                row = markup[page][rule_name]['sequence'][rid]
                sublist_page_managers[rule_name].addPage(page + "html%d" % rid, row['extract'])

    sublist_sub_rules = {}
    for sublist in sublist_page_managers:
        sublist_page_managers[sublist].learnStripes()
        sub_rules = sublist_page_managers[sublist].learnAllRules(in_list=True)
        sublist_sub_rules[sublist] = sub_rules  # keyed by the rule name in rules

    count = 1
    for rule in rules.rules:
        rule.set_sub_rules(sublist_sub_rules[rule.name])
        list_name = '_div_list' + format(count, '04')
        for page_id in markup:
            if rule.name in markup[page_id]:
                markup[page_id][list_name] = markup[page_id].pop(rule.name)
        rule.name = list_name
        # Fixed: count was never incremented, so every rule was renamed
        # '_div_list0001' and the markup entries collided.
        count += 1

    return rules, markup
def main(argv=None): if argv is None: argv = sys.argv try: try: opts, args = getopt.getopt(argv[1:], "dh", ["debug", "help"]) write_debug_files = False for opt in opts: if opt in [('-d', ''), ('--debug', '')]: write_debug_files = True if opt in [('-h', ''), ('--help', '')]: raise Usage('python -m learning.RuleLearnerAllSlots [OPTIONAL_PARAMS] [TEST_FILES_FOLDER] \n\t[OPTIONAL_PARAMS]: -d to get debug stripe html files') except getopt.error, msg: raise Usage(msg) logger.info('Running RuleLearnerAllSlots All Slots with files at %s', args[0]) #read the directory location from arg0 page_file_dir = args[0] pageManager = PageManager(write_debug_files) if os.path.isfile(page_file_dir): with open(page_file_dir) as f: urls = f.readlines() for url in urls: page_url = url.strip() req = urllib2.urlopen(page_url) page_contents = req.read() charset = chardet.detect(page_contents) page_encoding = charset['encoding'] page_str = page_contents.decode(page_encoding).encode('utf-8') pageManager.addPage(page_url, page_contents) else: files = [f for f in os.listdir(page_file_dir) if os.path.isfile(os.path.join(page_file_dir, f))] for the_file in files: if the_file.startswith('.'): continue with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile: page_str = myfile.read().encode('utf-8') pageManager.addPage(the_file, page_str) pageManager.learnStripes() rule_set = pageManager.learnAllRules() print json.dumps(json.loads(rule_set.toJson()), sort_keys=True, indent=2, separators=(',', ': '))
def run(page_file_dir, ignore_files=None):
    """Learn extraction rules from a set of pages.

    page_file_dir: either a file containing one URL per line (pages are
    fetched over HTTP) or a directory of saved page files.
    ignore_files: optional collection of file names to skip (in addition to
    hidden files, markup.json and rules.json).
    Returns a rule set with bad rules removed against the loaded pages.
    """
    # Fixed mutable default argument: a shared [] default persists across calls.
    if ignore_files is None:
        ignore_files = []
    test_pages = []
    # NOTE(review): write_debug_files is not defined here -- presumably a
    # module-level flag; confirm it exists at this scope.
    pageManager = PageManager(write_debug_files)
    if os.path.isfile(page_file_dir):
        # Input is a file of URLs: fetch each page.
        with open(page_file_dir) as f:
            urls = f.readlines()
        for url in urls:
            page_url = url.strip()
            req = urllib2.urlopen(page_url)
            page_contents = req.read()
            charset = chardet.detect(page_contents)
            page_encoding = charset['encoding']
            page_str = page_contents.decode(page_encoding).encode('utf-8')
            # NOTE(review): the raw page_contents is added, not the utf-8
            # re-encoded page_str computed above -- confirm this is intended.
            pageManager.addPage(page_url, page_contents)
            test_pages.append(page_url)
    else:
        # Input is a directory of page files.
        files = [f for f in os.listdir(page_file_dir)
                 if os.path.isfile(os.path.join(page_file_dir, f))]
        for the_file in files:
            if (the_file.startswith('.') or the_file == 'markup.json'
                    or the_file == 'rules.json' or the_file in ignore_files):
                continue
            with codecs.open(os.path.join(page_file_dir, the_file), "r", "utf-8") as myfile:
                page_str = myfile.read().encode('utf-8')
                pageManager.addPage(the_file, page_str)
                test_pages.append(the_file)
    pageManager.learnStripes()
    rule_set = pageManager.learnAllRules()
    rule_set.removeBadRules(test_pages)
    return rule_set
def learn_list_extractors(self, pages):
    """Learn list extractors across a set of pages.

    pages: dict mapping page id -> page content string.
    Returns (rules, markup): the learned rule set (each list rule renamed to
    a unique '_div_listNNNN' and given sub-rules learned from its rows) and
    the per-page markup keyed by those names.
    """
    page_mgr = PageManager()  # write_debug_files=True for stripe debug output
    markup = {}
    for page in pages:
        page_content = pages[page]
        page_mgr.addPage(page, page_content)
        markup[page] = self.lists_on_single_page(page_content)

    page_mgr.learnStripes(markups=markup)
    rules = page_mgr.learnRulesFromMarkup(markup)

    # For each list rule, build a small page manager out of its row extracts
    # so we can learn sub-rules for the fields inside each row.
    sublist_page_managers = {}
    for page in markup:
        for rule_name in markup[page]:
            if rule_name not in sublist_page_managers:
                sublist_page_managers[rule_name] = PageManager()
            for rid in range(len(markup[page][rule_name]['sequence'])):
                row = markup[page][rule_name]['sequence'][rid]
                sublist_page_managers[rule_name].addPage(page + "html%d" % rid, row['extract'])

    sublist_sub_rules = {}
    for sublist in sublist_page_managers:
        sublist_page_managers[sublist].learnStripes()
        sub_rules = sublist_page_managers[sublist].learnAllRules(in_list=True)
        sublist_sub_rules[sublist] = sub_rules  # keyed by the rule name in rules

    count = 1
    for rule in rules.rules:
        rule.set_sub_rules(sublist_sub_rules[rule.name])
        list_name = '_div_list' + format(count, '04')
        for page_id in markup:
            if rule.name in markup[page_id]:
                markup[page_id][list_name] = markup[page_id].pop(rule.name)
        rule.name = list_name
        # Fixed: count was never incremented, so every rule was renamed
        # '_div_list0001' and the markup entries collided.
        count += 1

    return rules, markup
def autolearn_grid():
    """Flask endpoint: fetch the posted URLs, auto-learn extraction rules
    across them, and return the rules as JSON. Non-POST requests 404."""
    if request.method != 'POST':
        abort(404)
    data = request.get_json(force=True)
    page_urls = data['urls']
    page_manager = PageManager()
    results = {}
    for page_url in page_urls:
        page_contents = urllib2.urlopen(page_url).read()
        page_manager.addPage(page_url, page_contents)
    page_manager.learnStripes()
    rule_set = page_manager.learnAllRules()
    results['rules'] = json.loads(rule_set.toJson())
    return jsonify(results)
def save_markup():
    """Flask endpoint: persist the posted markup for a project.

    If the posted schema has no children yet, auto-learn list and item slots
    from the project's pages, build schema entries for them, and save the
    auto-generated markup; otherwise save the markup as posted. Returns the
    saved markup as JSON; non-POST requests 404.
    """
    if request.method != 'POST':
        abort(404)
    data = request.get_json(force=True)
    project_folder = data['project_folder']
    markup = data['markup']
    directory = os.path.join(app.static_folder, 'project_folders', project_folder)
    markup_file = os.path.join(directory, 'learning', 'markup.json')

    if not markup['__SCHEMA__'][0]['children']:
        # Templates for auto-generated schema nodes (jstree node format).
        markup_slot = {
            "id": "j1_2",
            "text": "slot",
            "icon": "glyphicon glyphicon-stop",
            "li_attr": {"id": "j1_2"},
            "a_attr": {"href": "#", "id": "j1_2_anchor"},
            "state": {"loaded": True, "opened": False, "selected": False, "disabled": False},
            "data": {},
            "children": [],
            "type": "item"
        }
        list_slot = {
            "a_attr": {"href": "#", "id": "j1_3_anchor"},
            "children": [],
            "data": {},
            "icon": "glyphicon glyphicon-th-list",
            "id": "j1_3",
            "li_attr": {"id": "j1_3"},
            "state": {"disabled": False, "loaded": True, "opened": False, "selected": False},
            "text": "category",
            "type": "list"
        }

        # Load every page listed in the markup.
        page_manager = PageManager()
        for key in markup['__URLS__']:
            page_file = os.path.join(directory, key)
            with codecs.open(page_file, "r", "utf-8") as fp:
                page_str = fp.read().encode('utf-8')
            page_manager.addPage(key, page_str)

        schema = markup.pop("__SCHEMA__", None)
        urls = markup.pop("__URLS__", None)

        page_manager.learnStripes()
        (list_markup, list_names) = page_manager.learnListMarkups()
        rule_set = page_manager.learnAllRules()
        (markup, names) = page_manager.rulesToMarkup(rule_set)
        for key in markup.keys():
            if key in list_markup:
                markup[key].update(list_markup[key])

        count = 1
        # Schema entries for the learned list slots (with their children).
        for list_name in list_names.keys():
            count += 1
            list_node = copy.deepcopy(list_slot)
            list_node['text'] = list_name
            list_node['id'] = 'j1_' + str(count)
            list_node['li_attr']['id'] = 'j1_' + str(count)
            list_node['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
            children = []
            for name in list_names[list_name]:
                count += 1
                child_node = copy.deepcopy(markup_slot)
                child_node['text'] = name
                child_node['id'] = 'j1_' + str(count)
                child_node['li_attr']['id'] = 'j1_' + str(count)
                child_node['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
                children.append(child_node)
            list_node['children'] = children
            schema[0]['children'].append(list_node)

        # Schema entries for the learned item slots.
        for name in names:
            count += 1
            item_node = copy.deepcopy(markup_slot)
            item_node['text'] = name
            item_node['id'] = 'j1_' + str(count)
            item_node['li_attr']['id'] = 'j1_' + str(count)
            item_node['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
            schema[0]['children'].append(item_node)

        markup['__SCHEMA__'] = schema
        markup['__URLS__'] = urls
        with codecs.open(markup_file, "w", "utf-8") as fp:
            fp.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))
            fp.close()
    else:
        with codecs.open(markup_file, "w", "utf-8") as fp:
            fp.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))
            fp.close()
    return jsonify(markup)
def main(argv=None): if argv is None: argv = sys.argv try: try: opts, args = getopt.getopt(argv[1:], "dh", ["debug", "help"]) write_debug_files = False for opt in opts: if opt in [('-d', ''), ('--debug', '')]: write_debug_files = True if opt in [('-h', ''), ('--help', '')]: raise Usage( 'python -m learning.RuleLearner [OPTIONAL_PARAMS] [TEST_FILES_FOLDER] [MARKUP_FILE]\n\t[OPTIONAL_PARAMS]: -d to get debug stripe html files' ) except getopt.error, msg: raise Usage(msg) logger.info('Running RuleLearner with file at %s for rules %s', args[0], args[1]) #read the directory location from arg0 page_file_dir = args[0] pageManager = PageManager(write_debug_files) start_time = time.time() for subdir, dirs, files in os.walk(page_file_dir): for the_file in files: if the_file.startswith('.'): continue with codecs.open(os.path.join(subdir, the_file), "r", "utf-8") as myfile: page_str = myfile.read().encode('utf-8') pageManager.addPage(the_file, page_str) logger.info("--- LOAD PAGES: %s seconds ---" % (time.time() - start_time)) #Read the markups from a file... start_time = time.time() markups_file = args[1] with codecs.open(markups_file, "r", "utf-8") as myfile: markup_str = myfile.read().encode('utf-8') markups = json.loads(markup_str) markups.pop("__SCHEMA__", None) markups.pop("__URLS__", None) logger.info("--- LOAD MARKUPS: %s seconds ---" % (time.time() - start_time)) pageManager.learnStripes(markups) start_time = time.time() rule_set = pageManager.learnRulesFromMarkup(markups) logger.info("--- LEARN RULES FROM MARKUP: %s seconds ---" % (time.time() - start_time)) if (len(args) > 2): output_file = args[2] with codecs.open(output_file, "w", "utf-8") as myfile: myfile.write(rule_set.toJson()) myfile.close() else: print rule_set.toJson()
def save_markup():
    """Flask endpoint: persist the posted markup for a project.

    If the posted schema has no children yet, auto-learn slots: optionally
    learn table/ul list markup and div lists (when LEARN_LISTS is set), learn
    all item rules, prune bad rules against the loaded pages, and build
    schema entries for every learned slot before saving. Otherwise the
    markup is saved exactly as posted. Returns the saved markup as JSON;
    non-POST requests 404.
    """
    if request.method != 'POST':
        abort(404)
    data = request.get_json(force=True)
    project_folder = data['project_folder']
    markup = data['markup']
    directory = os.path.join(app.static_folder, 'project_folders', project_folder)
    markup_file = os.path.join(directory, 'learning', 'markup.json')

    if not markup['__SCHEMA__'][0]['children']:
        # Templates for auto-generated schema nodes (jstree node format).
        markup_slot = {
            "id": "j1_2",
            "text": "slot",
            "icon": "glyphicon glyphicon-stop",
            "li_attr": {"id": "j1_2"},
            "a_attr": {"href": "#", "id": "j1_2_anchor"},
            "state": {"loaded": True, "opened": False, "selected": False, "disabled": False},
            "data": {},
            "children": [],
            "type": "item"
        }
        list_slot = {
            "a_attr": {"href": "#", "id": "j1_3_anchor"},
            "children": [],
            "data": {},
            "icon": "glyphicon glyphicon-th-list",
            "id": "j1_3",
            "li_attr": {"id": "j1_3"},
            "state": {"disabled": False, "loaded": True, "opened": False, "selected": False},
            "text": "category",
            "type": "list"
        }

        # Load every page listed in the markup; keep contents for rule pruning.
        page_manager = PageManager()
        test_pages = []
        for key in markup['__URLS__']:
            page_file = os.path.join(directory, key)
            with codecs.open(page_file, "r", "utf-8") as fp:
                page_str = fp.read().encode('utf-8')
            page_manager.addPage(key, page_str)
            test_pages.append(page_str)

        schema = markup.pop("__SCHEMA__", None)
        urls = markup.pop("__URLS__", None)

        page_manager.learnStripes()
        list_markup = {}
        list_names = {}
        if LEARN_LISTS:
            (list_markup, list_names) = page_manager.learnListMarkups()

            # Div-based list learning.
            train_pages = {}
            for page_id in page_manager._pages:
                train_pages[page_id] = page_manager.getPage(page_id).getString()
            d = DivListLearner()
            div_rules, div_markup = d.run(train_pages)
            (div_list_markup, div_list_names) = page_manager.listRulesToMarkup(div_rules)

            # Copy token locations from the learned div markup onto the
            # rule-derived markup, row by row.
            for page_id in div_markup:
                for item in div_markup[page_id]:
                    if item in div_list_markup[page_id]:
                        if 'starting_token_location' in div_markup[page_id][item]:
                            div_list_markup[page_id][item]['starting_token_location'] = div_markup[page_id][item]['starting_token_location']
                        if 'ending_token_location' in div_markup[page_id][item]:
                            div_list_markup[page_id][item]['ending_token_location'] = div_markup[page_id][item]['ending_token_location']
                        if div_markup[page_id][item]['sequence']:
                            for idx, val in enumerate(div_markup[page_id][item]['sequence']):
                                if len(div_list_markup[page_id][item]['sequence']) <= idx:
                                    div_list_markup[page_id][item]['sequence'].insert(idx, val)
                                else:
                                    div_list_markup[page_id][item]['sequence'][idx]['starting_token_location'] = val['starting_token_location']
                                    div_list_markup[page_id][item]['sequence'][idx]['ending_token_location'] = val['ending_token_location']

            # Merge the div lists into the table/ul list markup and names.
            if len(div_rules.rules) > 0:
                for page_id in div_list_markup:
                    if page_id not in list_markup:
                        list_markup[page_id] = {}
                    list_markup[page_id].update(div_list_markup[page_id])
                list_names.update(div_list_names)

        rule_set = page_manager.learnAllRules()
        rule_set.removeBadRules(test_pages)
        (markup, names) = page_manager.rulesToMarkup(rule_set)
        for key in markup.keys():
            if key in list_markup:
                markup[key].update(list_markup[key])

        count = 1
        # Schema entries for the learned list slots (with their children).
        for list_name in list_names.keys():
            count += 1
            list_node = copy.deepcopy(list_slot)
            list_node['text'] = list_name
            list_node['id'] = 'j1_' + str(count)
            list_node['li_attr']['id'] = 'j1_' + str(count)
            list_node['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
            children = []
            for name in list_names[list_name]:
                count += 1
                child_node = copy.deepcopy(markup_slot)
                child_node['text'] = name
                child_node['id'] = 'j1_' + str(count)
                child_node['li_attr']['id'] = 'j1_' + str(count)
                child_node['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
                children.append(child_node)
            list_node['children'] = children
            schema[0]['children'].append(list_node)

        # Schema entries for the learned item slots.
        for name in names:
            count += 1
            item_node = copy.deepcopy(markup_slot)
            item_node['text'] = name
            item_node['id'] = 'j1_' + str(count)
            item_node['li_attr']['id'] = 'j1_' + str(count)
            item_node['a_attr']['id'] = 'j1_' + str(count) + '_anchor'
            schema[0]['children'].append(item_node)

        markup['__SCHEMA__'] = schema
        markup['__URLS__'] = urls
        with codecs.open(markup_file, "w", "utf-8") as fp:
            fp.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))
            fp.close()
    else:
        with codecs.open(markup_file, "w", "utf-8") as fp:
            fp.write(json.dumps(markup, sort_keys=True, indent=2, separators=(',', ': ')))
            fp.close()
    return jsonify(markup)
def main(argv=None): if argv is None: argv = sys.argv try: try: opts, args = getopt.getopt(argv[1:], "h", ["help"]) except getopt.error, msg: raise Usage(msg) #read the directory location from arg0 page_file_dir = args[0] pageManager = PageManager() page_str_array = [] for subdir, dirs, files in os.walk(page_file_dir): for the_file in files: if the_file.startswith('.'): continue with codecs.open(os.path.join(subdir, the_file), "r", "utf-8") as myfile: page_str = myfile.read().encode('utf-8') page_str_array.append(page_str) pageManager.addPage(the_file, page_str) pageManager.learnStripes() #Read the markups from a file... markups_file = args[1] with codecs.open(os.path.join('', markups_file), "r", "utf-8") as myfile: markup_str = myfile.read().encode('utf-8') markups = json.loads(markup_str) markups.pop("__SCHEMA__", None) #Before we learn the stripes let's make sure we can open the output file pageManager.learnStripes(markups) rule_set = pageManager.learnRulesFromMarkup(markups) if (len(args) > 2): output_file = args[2] with codecs.open(output_file, "w", "utf-8") as myfile: myfile.write(rule_set.toJson()) myfile.close() #testing flatten = False extraction_list = rule_set.extract(page_str_array[0]) if rule_set.validate(extraction_list): if flatten: print json.dumps(Landmark.flattenResult(extraction_list), sort_keys=True, indent=2, separators=(',', ': ')) else: print json.dumps(extraction_list, sort_keys=True, indent=2, separators=(',', ': '))