Esempio n. 1
0
def extract_infoboxes(text):
    """
    Return the infobox names for every infobox template opener in text.

    Each substring matching ``{{infobox...`` or ``{{Infobox...`` (followed by
    word characters, '/', '.' or spaces) is stripped of its leading ``{{`` and
    passed to infobox_parser.get_class; any 'wikipedia-' in the result is
    removed.  Names are returned in order of appearance, duplicates included.
    An empty list is returned when text contains no such template.
    """
    # Raw string: '\w' and '\.' are invalid escapes in a plain string literal.
    pattern = re.compile(r'{{[iI]nfobox[/\w\. ]+')

    return [
        # match[2:] drops the leading '{{' captured by the pattern
        infobox_parser.get_class(match[2:]).replace('wikipedia-', '')
        for match in pattern.findall(text)
    ]
Esempio n. 2
0
def _read_cached_or_fetch(url, cache_path, from_cache):
    """Return the page text read from cache_path, or from get_page_and_store."""
    if from_cache:
        # Context manager so the file handle is closed deterministically.
        with open(cache_path, 'r') as cached_file:
            return cached_file.read()
    # presumably downloads url and writes it to cache_path -- confirm in helper
    return get_page_and_store(url, cache_path)


def get_infobox_class_pairs(from_cache=True):
    """
    Return pairs of (infobox, class)

    infobox format is lower case with hyphen (e.g. 'afl-player-2')
    class format is as returned by get_class.

    from_cache -- when true, every page is read from its file under
                  HTML_CACHE_PATH_PREFIX; otherwise each page is obtained
                  through get_page_and_store.
    """
    infobox_urls = []
    infobox_class_pairs = []

    # Step 1: collect infobox URLs from every main mapping page.
    for i, mapping_url in enumerate(MAPPINGS_URLS):
        # Cache files are numbered from 1.
        cache_path = HTML_CACHE_PATH_PREFIX + 'main_mapping_en_' + str(i+1) + '.html'
        mapping_page = _read_cached_or_fetch(mapping_url, cache_path, from_cache)
        infobox_urls += get_infobox_urls(mapping_page)

    # Step 2: load each infobox page and derive its class.
    for infobox_url in infobox_urls:
        full_url = URL_PREFIX + infobox_url
        # The infobox name is taken from the part after the first ':' in the URL.
        infobox = infobox_parser.get_class(infobox_url.split(':')[1]).replace('wikipedia-', '')
        cache_path = HTML_CACHE_PATH_PREFIX + 'infobox-' + infobox + '.html'

        # Loaded even for the special case below, so that a non-cached run
        # still goes through get_page_and_store for every infobox.
        infobox_page = _read_cached_or_fetch(full_url, cache_path, from_cache)

        if infobox == 'football-biography':     # temporary solution
            infobox_class_pairs.append((infobox, 'SoccerPlayer'))
        else:
            infobox_class_pairs.append((infobox, get_class(infobox_page)))

    return infobox_class_pairs