Example 1
def check():
    #SPARQL server responds
    try:
        from sparql_access import count_entities_of_type
        count_entities_of_type([u'http://dbpedia.org/ontology/Settlement'])
    except ValueError:
        raise RuntimeError('Cannot access SPARQL endpoint: %s.' % sparql_endpoint)
    #raw articles are available
    create_dir(data_path)
    create_dir(raw_articles_path)
    if not os.listdir(raw_articles_path):
        #Wikipedia data archive is available
        try:
            from bz2 import BZ2File
            create_dir(wikidump_path)
            BZ2File(os.path.join(wikidump_path, articles_url.split('/')[-1]))
        except IOError:
            import urllib2
            print 'Downloading Wikipedia pages-articles archive.'
            try:
                u = urllib2.urlopen(articles_url)
            except urllib2.HTTPError:
                raise RuntimeError('Cannot download file: %s.' % articles_url)
            f = open(os.path.join(wikidump_path, articles_url.split('/')[-1]), 'wb')
            f.write(u.read())
            f.close()
            print 'Finished downloading.'
        import extract_dumps
        print 'Extracting raw articles from the archive.'
        extract_dumps.main()
        print 'Finished extracting.'
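check() relies on several names defined elsewhere in the source module and not shown in this excerpt: os, sparql_endpoint, data_path, raw_articles_path, wikidump_path, articles_url, and the helper create_dir. A minimal sketch of what create_dir presumably does, assuming it simply creates a directory and tolerates one that already exists (an assumption, not the project's actual implementation):

import errno
import os

def create_dir(path):
    # Create the directory tree if it is missing; ignore the error when it
    # already exists (assumed behaviour of the project's helper).
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise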
Example 2
def check():
    #SPARQL server responds
    try:
        from sparql_access import count_entities_of_type
        count_entities_of_type([u'http://dbpedia.org/ontology/Settlement'])
    except ValueError:
        raise RuntimeError('Cannot access SPARQL endpoint: %s.' %
                           sparql_endpoint)
    #raw articles are available
    create_dir(data_path)
    create_dir(raw_articles_path)
    if not os.listdir(raw_articles_path):
        #Wikipedia data archive is available
        try:
            from bz2 import BZ2File
            create_dir(wikidump_path)
            BZ2File(os.path.join(wikidump_path, articles_url.split('/')[-1]))
        except IOError:
            import urllib2
            print 'Downloading Wikipedia pages-articles archive.'
            try:
                u = urllib2.urlopen(articles_url)
            except urllib2.HTTPError:
                raise RuntimeError('Cannot download file: %s.' % articles_url)
            f = open(os.path.join(wikidump_path,
                                  articles_url.split('/')[-1]), 'wb')
            f.write(u.read())
            f.close()
            print 'Finished downloading.'
        import extract_dumps
        print 'Extracting raw articles from the archive.'
        extract_dumps.main()
        print 'Finished extracting.'
Example 3
def get_most_specific_types(predominant_types):
    threshold = 100
    return [
        type for type in predominant_types
        if count_entities_of_type(type) > threshold
    ]
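get_most_specific_types keeps only the candidate DBpedia types whose entity count, as reported by count_entities_of_type, exceeds the threshold of 100. A hypothetical call, assuming the function and its count_entities_of_type dependency (imported from sparql_access, as in the first example) are available in the current module; the type URIs below are placeholders for illustration:

# Candidate DBpedia type URIs (placeholders, not from the original project).
candidate_types = [
    u'http://dbpedia.org/ontology/Settlement',
    u'http://dbpedia.org/ontology/Village',
]
# Keep only the types with more than 100 entities at the SPARQL endpoint.
specific_types = get_most_specific_types(candidate_types)
print(specific_types)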