def __init__(self):
    """Store the result of ``collect_entities()`` on ``self.entities``."""
    collected = collect_entities()
    self.entities = collected
Ejemplo n.º 2
0
def main():
    """Collect the entities and run ``parse`` on the downloaded dump.

    The dump file name is the last ``/``-separated component of
    ``articles_url``; it is joined onto ``wikidump_path`` to form the first
    argument to ``parse``.
    """
    collected = collect_entities()
    dump_name = articles_url.split('/')[-1]
    dump_file = join(wikidump_path, dump_name)
    parse(dump_file, collected, raw_articles_path)
Ejemplo n.º 3
0
def main():
    """Parse the dump named by ``articles_url`` using the collected entities."""
    known_entities = collect_entities()
    # Only the final path component of the URL is used as the file name
    # inside ``wikidump_path``.
    url_parts = articles_url.split('/')
    source_path = join(wikidump_path, url_parts[-1])
    parse(source_path, known_entities, raw_articles_path)
Ejemplo n.º 4
0
import bz2
import os.path
from urllib2 import unquote

### PARAMS ####################################################################

# URL prefix pointing at the Italian-language Wikipedia article namespace.
prefix = 'http://it.wikipedia.org/wiki/'

### SUPPORT CLASSES ###########################################################


# Module-level dictionary; empty when the module is imported.
# NOTE(review): presumably populated with link data elsewhere -- confirm.
link_dictionary = {}
from config import entities_path
from pickler import Pickler
from collect_entities import collect_entities
# NOTE: executed at import time; the result is kept in a module-level global.
entities = collect_entities()

class WikiDocument:
    """Container holding a document's ``id``, ``url`` and ``text``."""

    def __init__(self):
        # Every field starts out as None.
        self.id = self.url = self.text = None

    def __str__(self):
        """Render the document as a ``<doc ...>`` element ending in a newline.

        ``id`` is formatted with ``%d``, so it has to be numeric by the time
        the document is converted to a string.
        """
        fields = (self.id, self.url, self.text)
        return '<doc id="%d" url="%s">\n%s\n</doc>\n' % fields

def get_wiki_document_url(wiki_document_title, prefix):
    """Build the URL of a wiki page from its title.

    Spaces are turned into underscores, the title is UTF-8 encoded and
    percent-quoted (parentheses are kept literal), and the first character
    of the quoted title is upper-cased.

    :param wiki_document_title: page title as a text string.
    :param prefix: URL prefix the quoted title is appended to.
    :return: ``prefix`` followed by the quoted title, or just ``prefix``
        when the title is empty.
    """
    # Imported locally: the file's top-level imports only bring in
    # ``unquote``, so a bare ``urllib.quote`` reference raises NameError.
    try:
        from urllib import quote  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3

    encoded_title = wiki_document_title.replace(' ', '_').encode('utf-8')
    quoted_title = quote(encoded_title)
    # Keep parentheses readable instead of their percent-escapes.
    quoted_title = quoted_title.replace('%28', '(').replace('%29', ')')
    # Slicing with [:1] is safe on an empty string, unlike indexing with [0].
    return prefix + quoted_title[:1].upper() + quoted_title[1:]
Ejemplo n.º 5
0
import re
import bz2
import os.path
from urllib2 import unquote

### PARAMS ####################################################################

# URL prefix pointing at the Italian-language Wikipedia article namespace.
prefix = 'http://it.wikipedia.org/wiki/'

### SUPPORT CLASSES ###########################################################

# Module-level dictionary; empty when the module is imported.
# NOTE(review): presumably populated with link data elsewhere -- confirm.
link_dictionary = {}
from config import entities_path
from pickler import Pickler
from collect_entities import collect_entities
# NOTE: executed at import time; the result is kept in a module-level global.
entities = collect_entities()


class WikiDocument:
    """Holds the ``id``, ``url`` and ``text`` of a single document."""

    def __init__(self):
        # Fields are created up front and left as None.
        for field_name in ('id', 'url', 'text'):
            setattr(self, field_name, None)

    def __str__(self):
        """Return the ``<doc ...>`` markup for this document.

        The ``id`` slot uses ``%d``, so ``id`` must be numeric when the
        document is converted to a string.
        """
        template = '<doc id="%d" url="%s">\n%s\n</doc>\n'
        return template % (self.id, self.url, self.text)


def get_wiki_document_url(wiki_document_title, prefix):
    quoted_title = urllib.quote(
Ejemplo n.º 6
0
 def __init__(self):
     """Keep the output of ``collect_entities()`` as ``self.entities``."""
     found = collect_entities()
     self.entities = found