def __init__(self):
    """Store the result of ``collect_entities()`` on ``self.entities``."""
    collected = collect_entities()
    self.entities = collected
def main():
    """Collect the entities and hand them to ``parse``.

    The first argument given to ``parse`` is ``wikidump_path`` joined with
    the last '/'-separated component of ``articles_url``; the remaining
    arguments are the collected entities and ``raw_articles_path``.
    """
    # Entities are collected first, before any path handling takes place.
    known_entities = collect_entities()

    # Keep only the text after the final '/' of the URL.
    dump_name = articles_url.split('/')[-1]
    # NOTE(review): `join` is presumably os.path.join -- confirm at the import site.
    dump_file = join(wikidump_path, dump_name)

    parse(dump_file, known_entities, raw_articles_path)
import bz2
import os.path
import urllib  # provides urllib.quote, used by get_wiki_document_url below
from urllib2 import unquote

### PARAMS ####################################################################

prefix = 'http://it.wikipedia.org/wiki/'

### SUPPORT CLASSES ###########################################################

link_dictionary = {}

from config import entities_path
from pickler import Pickler
from collect_entities import collect_entities

# Evaluated once, at import time.
entities = collect_entities()


class WikiDocument:
    """Plain record holding the id, url and text of one document."""

    def __init__(self):
        # All fields start unset; callers are expected to assign them.
        self.id = None
        self.url = None
        self.text = None

    def __str__(self):
        """Return the document wrapped in a ``<doc id=... url=...>`` element.

        ``id`` is formatted with ``%d``, so it must hold a number by the time
        the document is rendered; the initial ``None`` raises ``TypeError``.
        """
        return '<doc id="%d" url="%s">\n%s\n</doc>\n' % (self.id, self.url, self.text)


def get_wiki_document_url(wiki_document_title, prefix):
    """Build the URL for a document title.

    Spaces in the title become underscores, the result is encoded as UTF-8
    and percent-quoted, and parentheses are restored to their literal form.
    The first character of the quoted title is upper-cased before ``prefix``
    is prepended.

    :param wiki_document_title: title of the document (text that supports
        ``.encode('utf-8')``).
    :param prefix: string prepended to the quoted title.
    :return: ``prefix`` followed by the quoted, capitalised title; just
        ``prefix`` when the title is empty.
    """
    quoted_title = urllib.quote(wiki_document_title.replace(' ', '_').encode('utf-8'))
    # urllib.quote escapes parentheses; put them back as literals.
    quoted_title = quoted_title.replace('%28', '(').replace('%29', ')')
    # Slicing with [:1] yields '' for an empty title, so no guard is needed.
    return prefix + quoted_title[:1].upper() + quoted_title[1:]
import re import bz2 import os.path from urllib2 import unquote ### PARAMS #################################################################### prefix = 'http://it.wikipedia.org/wiki/' ### SUPPORT CLASSES ########################################################### link_dictionary = {} from config import entities_path from pickler import Pickler from collect_entities import collect_entities entities = collect_entities() class WikiDocument: def __init__(self): self.id = None self.url = None self.text = None def __str__(self): return '<doc id="%d" url="%s">\n%s\n</doc>\n' % (self.id, self.url, self.text) def get_wiki_document_url(wiki_document_title, prefix): quoted_title = urllib.quote(