def extract_infoboxes(text):
    """Return the infobox names referenced by the templates found in `text`.

    Every occurrence of ``{{Infobox <name>`` or ``{{infobox <name>`` is
    located, the leading ``{{`` is dropped, and the remainder is passed
    through ``infobox_parser.get_class``. Any 'wikipedia-' substring is then
    removed from the result.

    Returns a list with one entry per match, in order of appearance
    (duplicates are kept). The list is empty when nothing matches.
    """
    # Raw string so that the backslash sequences reach the regex engine
    # untouched instead of being treated as (invalid) string escapes.
    pattern = re.compile(r'{{[iI]nfobox[/\w\. ]+')
    return [
        # match[2:] strips the opening '{{' of the template.
        infobox_parser.get_class(match[2:]).replace('wikipedia-', '')
        for match in pattern.findall(text)
    ]
def _load_page(url, cache_path, from_cache):
    """Return page content, read from `cache_path` or fetched from `url`.

    When `from_cache` is true the file at `cache_path` is read and closed
    deterministically. Otherwise the work is delegated to
    ``get_page_and_store(url, cache_path)`` and its result is returned.
    """
    if from_cache:
        with open(cache_path, 'r') as cached:
            return cached.read()
    return get_page_and_store(url, cache_path)


def get_infobox_class_pairs(from_cache=True):
    """
    Return pairs of (infobox, class)

    infobox format is lower case with hyphen (e.g. 'afl-player-2')
    class format is as returned by get_class.

    from_cache: when true, every page is read from the HTML cache files and
    a missing file raises the usual error from open(). When false, pages
    are obtained through get_page_and_store.
    """
    # Step 1: collect the infobox URLs listed on every main mapping page.
    infobox_urls = []
    for page_number, mapping_url in enumerate(MAPPINGS_URLS, 1):
        cache_path = (HTML_CACHE_PATH_PREFIX + 'main_mapping_en_'
                      + str(page_number) + '.html')
        mapping_page = _load_page(mapping_url, cache_path, from_cache)
        infobox_urls += get_infobox_urls(mapping_page)

    # Step 2: load each infobox page and pair the infobox with its class.
    infobox_class_pairs = []
    for infobox_url in infobox_urls:
        # The infobox name is the part after the first ':' of the URL.
        infobox = infobox_parser.get_class(
            infobox_url.split(':')[1]).replace('wikipedia-', '')
        cache_path = HTML_CACHE_PATH_PREFIX + 'infobox-' + infobox + '.html'
        # The page is loaded even for the special case below, so the cache
        # is read or populated for every infobox exactly as before.
        infobox_page = _load_page(
            URL_PREFIX + infobox_url, cache_path, from_cache)
        if infobox == 'football-biography':
            # temporary solution
            infobox_class_pairs.append((infobox, 'SoccerPlayer'))
        else:
            infobox_class_pairs.append((infobox, get_class(infobox_page)))
    return infobox_class_pairs