def matching_france(region=None):
    """Render the France matching admin pages.

    Without ``region`` the view groups the internal documents located under
    France by their ``ADMIN_LEVEL_2`` name, counts them, and renders the
    region list.  With ``region`` it renders the list page for that region.

    The ``mode`` query-string argument (default ``'none'``) is forwarded to
    the template unchanged in both cases.
    """
    mode = request.args.get('mode', 'none')

    # A concrete region needs no database access here.
    if region is not None:
        return render_template('admin/matching-france/list.html',
                               region=region, mode=mode)

    settings = Config('./config/config.yml')
    storage = DocFactory(settings.get('mongodb'))
    internal = storage.internal_collection()

    # Stage 1: keep documents with a non-empty name that sit under France.
    named_french_documents = {
        '$match': {
            'name': {'$exists': True, '$not': {'$size': 0}},
            '$and': [{'admin_hierarchy.ADMIN_LEVEL_1.name': 'France'}],
        }
    }
    # Stage 2: one output row per ADMIN_LEVEL_2 name with its document count.
    count_per_region = {
        '$group': {
            '_id': '$admin_hierarchy.ADMIN_LEVEL_2.name',
            'count': {'$sum': 1},
        }
    }
    objects = internal.aggregate([named_french_documents, count_per_region])

    return render_template('admin/matching-france/region-list.html',
                           data=objects, mode=mode)
from lib.factory.StorageLocation import StorageLocation as DocFactory
from lib.factory.Loader import Loader as LoaderFactory
from lib.config.Yaml import Yaml as Config
from lib.parser.wiki.France import France as WikiParser


def _report(passed):
    """Print '.' for a passing check and 'E' for a failing one, no newline."""
    if passed:
        print('.', end='')
    else:
        print('E', end='')


# Smoke check: load the French Wikipedia page for Paris, parse it and store
# the parsed data in the wiki document for that URL.
config = Config('./config/config.yml')
document_factory = DocFactory(config.get('mongodb'))

url = 'https://fr.wikipedia.org/wiki/Paris'
headers = {'User-Agent': 'Mozilla/5.0'}

loader = LoaderFactory.loader_with_mongodb(config.get('mongodb'))
content, code = loader.load(url, headers=headers)
parser = WikiParser(content)

doc = document_factory.wiki(url)

# Check 1: the document is reported as new.
_report(doc.is_new())

# Check 2: the document already carries a 'code' key.
document = doc.get_document()
_report('code' in document)

# Check 3: after storing the parsed page the name is 'Paris'.
doc.update(parser.as_dictionary())
dic = doc.get_document()
_report(dic.get('name') == 'Paris')
# Command-line set-up for a wiki download script: parses the options, builds
# the loader / document factory and opens a dated log file.
config = Config('./config/config.yml')
arg_parser = ArgumentParser(
    description='Download data from wiki by link or search request')
# NOTE(review): '-f' is declared without an action, so argparse expects a
# value after it; any non-empty value makes force_update truthy.
arg_parser.add_argument('-f', help='turn on the force mode')
arg_parser.add_argument('-l', help='custom link to page with result(s)')
opts = arg_parser.parse_args()
# Column positions; presumably indexes into rows of an input table that is
# not visible in this excerpt -- verify against the code that uses them.
insee_index = 0
name_index = 1
population_index = 2
force_update = opts.f
headers = {'User-Agent': 'Mozilla/5.0'}
loader = Loader.loader_with_mongodb(config.get('mongodb'))
document_factory = DocFactory(config.get('mongodb'))
# One log file per calendar day (date taken from the local clock).
log = FileLog('./log/wiki_page_italy_{date}.log'.format(
    date=datetime.datetime.now().strftime('%Y-%m-%d')))
log.add('Start', log.INFO)
# NOTE(review): the options are formatted as encoded bytes; on Python 3 this
# puts a b'...' literal into the message -- confirm that is intended.
log.add('Params: [{0}]'.format(repr(opts).encode('utf-8')), log.INFO)
message_format = 'Parsing request:[{0}]'
# A custom link is used only when '-l' was given with a non-empty value.
use_link = bool(opts.l)
custom_link = opts.l if use_link else ''


def update_meta(url, request, document):
    # Records `url` on the stored document and collects the requests already
    # saved on it as tuples.
    # NOTE(review): the rest of this function is not visible in this excerpt.
    actual_doc = document.get_document()
    actual_doc.update(url=url)
    added_requests = [tuple(x) for x in actual_doc.get('requests', ())]
# Set-up for an Italian address script: column positions, configuration and
# the Spider instance used for lookups.
lst_address = []
# Column positions; presumably indexes into rows of an input table that is
# not visible in this excerpt -- verify against the code that uses them.
region_index = 1
provincia_index = 3
comune_index = 5
localita_index = 9
altitude_index = 13
codloc_index = 8
loc2011_index = 7
procom_index = 6
codcom_index = 4
codpro_index = 2
codreg_index = 0
config = Config('./config/config.yml')
doc_factory = DocFactory(config.get('mongodb'))
language='it'
# Spider wired with the Italian map parser and wiki parser, caching enabled.
spider = Spider(
    loader_factory=LoaderFactory,
    gmap_parser=MapFactory.italy,
    wiki_parser=ParserItalyWiki,
    doc_factory=doc_factory,
    language=language,
    config=config,
    use_cache=True
)


def gmap_by_address(address):
    # Asks the spider for the gmap result(s) of `address`.
    # NOTE(review): the rest of this function is not visible in this excerpt.
    objects = spider.get_gmap_address(address)
from pymongo import MongoClient
from lib.config.Yaml import Yaml as Config
from lib.factory.StorageLocation import StorageLocation as DocFactory

# Fields that get a single-field ascending index, in creation order.
_INDEXED_FIELDS = ('_id', 'code', 'name', 'admin_hierarchy')


def _rebuild_indexes(collection):
    """Drop the indexes of ``collection`` and recreate the standard set."""
    collection.drop_indexes()
    for field in _INDEXED_FIELDS:
        collection.create_index([(field, 1)])


config = Config('./config/config.yml').get('mongodb')
connection = MongoClient(config['host'], config['port'])
factory = DocFactory(config)

# The wiki collection is handled completely before the gmaps collection is
# requested from the factory.
wiki = factory.wiki_collection()
_rebuild_indexes(wiki)

gmaps = factory.gmaps_collection()
_rebuild_indexes(gmaps)
# Command-line set-up for a gmaps download script: parses the options, builds
# the cached gmaps loader / document factory and opens a dated log file.
from lib.config.Yaml import Yaml as Config
from lib.logger.File import File as FileLog
from lib.factory.StorageLocation import StorageLocation as DocFactory
from argparse import ArgumentParser
arg_parser = ArgumentParser(description='Download data from gmaps by address')
# NOTE(review): '-f' is declared without an action, so argparse expects a
# value after it; any non-empty value makes force_update truthy.
arg_parser.add_argument('-f', help='turn on the force mode')
arg_parser.add_argument('-a', help='address')
opts = arg_parser.parse_args()
config = Config('./config/config.yml')
# NOTE(review): LoaderFactory and datetime are used below but their imports
# are not visible in this excerpt.
loader = LoaderFactory.loader_gmaps_with_cache(
    gmaps_config=config.get('googlemaps'),
    storage_config=config.get('mongodb'))
document_factory = DocFactory(config.get('mongodb'))
# One log file per calendar day (date taken from the local clock).
log = FileLog('./log/gmaps_address_france_{date}.log'.format(
    date=datetime.datetime.now().strftime('%Y-%m-%d')))
log.add('Start', log.INFO)
# NOTE(review): the options are formatted as encoded bytes; on Python 3 this
# puts a b'...' literal into the message -- confirm that is intended.
log.add('Params: [{0}]'.format(repr(opts).encode('utf-8')), log.INFO)
# An explicit address is used only when '-a' was given with a non-empty value.
use_address = bool(opts.a)
address = opts.a if use_address else ''
force_update = opts.f


def update_meta(request, document):
    # Collects the requests already saved on the document (list entries are
    # converted to tuples, other entries kept as they are) and appends the
    # current request.
    # NOTE(review): the rest of this function is not visible in this excerpt.
    actual_doc = document.get_document()
    added_requests = [(tuple(x) if isinstance(x, list) else x)
                      for x in actual_doc.get('requests', ())]
    added_requests.append(request)
# Walks the wiki documents that belong to Italy; a job storage named after
# the PositionTask for that country is prepared alongside.
from lib.job.storage.MongoDB import MongoDB as Storage
from lib.job.map.google.PositionTask import PositionTask
from lib.config.Yaml import Yaml as Config
from lib.factory.StorageLocation import StorageLocation as DocFactory
country = 'Italia'
config = Config('./config/config.yml').get('mongodb')
job_list = Storage(PositionTask.get_name(country), config)
factory = DocFactory(config)
wiki = factory.wiki_collection()
# Documents whose 'name' exists and is not an empty array, and whose
# 'admin_hierarchy' array has an element named after the country.
# NOTE(review): this name shadows the built-in filter().
filter = {
    'name': {
        '$exists': True,
        '$not': {
            '$size': 0
        }
    },
    'admin_hierarchy': {
        '$elemMatch': {
            'name': country
        }
    }
}
objects = wiki.find(filter)
# NOTE(review): the loop body is not visible in this excerpt.
for obj in objects:
    try:
# Set-up for a script that reads French administrative-division CSV files
# using the French spider and the internal collection.
from lib.parser.wiki.France import France as ParserFranceWiki
from lib.factory.Loader import Loader as LoaderFactory
import csv
from lib.parser.map.google.GMapFactory import GMapFactory as MapFactory
from lib.spider.Spider import Spider
# Input files, read in this order; the first two entries are disabled.
files = [
    #'data/france/Departements_28_08_17_cards.csv',
    #'data/france/arrondissements_25_08_17_cards.csv',
    'data/france/20_08_17_canton_google_3.csv',
    'data/france/communes_17_09_17.csv'
]
# NOTE(review): Config and DocFactory are used below but their imports are
# not visible in this excerpt.
config = Config('./config/config.yml')
doc_factory = DocFactory(config.get('mongodb'))
language = 'fr'
# Spider wired with the French map parser and wiki parser, caching enabled.
spider = Spider(loader_factory=LoaderFactory,
                gmap_parser=MapFactory.france,
                wiki_parser=ParserFranceWiki,
                doc_factory=doc_factory,
                language=language,
                config=config,
                use_cache=True)
internal_collection = doc_factory.internal_collection()
i = 0
# NOTE(review): the built-in hash() requires exactly one argument, so this
# call raises TypeError unless `hash` is rebound to something else in a part
# of the file that is not visible here -- confirm.
hash_lib = hash()
# NOTE(review): the body of the `with` block is not visible in this excerpt.
for csv_file in files:
    with open(csv_file, encoding='utf-8') as admin_div_CSV:
# Set-up for a Spain script: configuration, document factory, the input CSV
# as a DataFrame and a MongoDB-backed loader.
from lib.spider.Spider import Spider
from lib.parser.map.google.GMapFactory import GMapFactory as MapFactory
from lib.factory.Loader import Loader
import math
import wikipedia
import datetime
import sys
from lib.parser.wiki.Spain import Spain as WikiES
from lib.logger.File import File as FileLog
from argparse import ArgumentParser
# from lib.parser.wiki.Spain import Spain as ParserSpain
country = 'Spain'
# NOTE(review): Config, DocFactory and pd are used below but their imports
# are not visible in this excerpt.
config = Config('./config/config.yml')
document_factory = DocFactory(config.get('mongodb'))
# skiprows=[1] skips the file line with 0-based index 1, i.e. the first line
# after the header row.
df = pd.read_csv('./data/spain/Spain_notDublicate.csv', skiprows=[1])
# print(config)
language = 'es'
# Disabled Spider set-up, kept for reference:
# spider = Spider(
#     loader_factory=LoaderFactory,
#     gmap_parser=MapFactory.spain,
#     wiki_parser=True,
#     doc_factory=doc_factory,
#     language=language,
#     config=config,
#     use_cache=True
# )
loader = Loader.loader_with_mongodb(config.get('mongodb'))
def insee_code_unit(id):
    """Render the admin unit page for the INSEE document with code ``id``.

    The first document of the insee collection whose ``code`` equals ``id``
    is handed to the template as ``data``; the lookup result is passed on
    as-is, without checking whether a document was found.
    """
    mongodb_settings = Config('./config/config.yml').get('mongodb')
    insee = DocFactory(mongodb_settings).insee_collection()
    record = insee.find_one({'code': id})
    return render_template('admin/other/unit.html', data=record)
def matching_france_js(region):
    """Render the JS data file that compares the sources of one French region.

    ``region`` arrives URL-encoded and is decoded with ``unquote_plus``.
    Every internal document with a non-empty ``name`` located under
    France / ``region`` is paired with the wiki, gmap and insee documents
    referenced by the codes in its ``source`` sub-document, plus a
    ``compare`` dict of 0/1 mismatch flags.  The resulting list is passed to
    the ``admin/matching-france/list.js`` template as ``items``.

    A source that is not referenced, or whose referenced document does not
    exist, is represented by an empty dict, so a dangling reference shows up
    as mismatches instead of failing the whole request.
    """
    region = unquote_plus(region)
    config = Config('./config/config.yml')
    factory = DocFactory(config.get('mongodb'))
    internal = factory.internal_collection()
    # (key under item['source'] and in the result entry, collection) in the
    # order the entry keys are filled.
    sources = (
        ('wiki', factory.wiki_collection()),
        ('gmap', factory.gmaps_collection()),
        ('insee', factory.insee_collection()),
    )
    objects = internal.find({
        'name': {
            '$exists': True,
            '$not': {
                '$size': 0
            }
        },
        '$and': [{
            'admin_hierarchy.ADMIN_LEVEL_1.name': 'France'
        }, {
            'admin_hierarchy.ADMIN_LEVEL_2.name': region
        }],
    })
    result = []
    for item in objects:
        dic = {'internal': item}
        for source_name, collection in sources:
            dic[source_name] = _linked_document(item, source_name, collection)
        dic.update(compare=_mismatch_flags(
            dic['insee'], dic['wiki'], dic['gmap']))
        # NOTE: filtering by the `mode` query argument is not applied; every
        # matched item is returned.
        result.append(dic)
    return render_template('admin/matching-france/list.js',
                           e=escape,
                           items=result)


def _linked_document(item, source_name, collection):
    """Return the document ``item`` references in ``collection``, or ``{}``."""
    code = (item.get('source') or {}).get(source_name)
    if not code:
        return {}
    # find_one() returns None when nothing matches; the comparison code
    # calls .get() on the result, so normalise a miss to an empty dict.
    return collection.find_one({'code': code}) or {}


def _mismatch_flags(insee_res, wiki_res, gmap_res):
    """Build the 0/1 mismatch flags between the three source documents."""

    def flag(differs):
        return 1 if differs else 0

    compare_res = {
        'insee_code!=wiki_code': flag(
            insee_res.get('InseeXls_CodeCommune')
            != wiki_res.get('commune_codes')),
        'insee_name!=wiki_name': flag(
            insee_res.get('InseeXls_NameCommune') != wiki_res.get('name')),
        # Names are compared on 'true_name' when present, else on 'name'.
        'wiki_name!=gmaps_name': flag(
            wiki_res.get('true_name', wiki_res.get('name'))
            != gmap_res.get('true_name', gmap_res.get('name'))),
        # Postal codes and hierarchies are compared by string representation.
        'wiki_post!=gmaps_post': flag(
            str(wiki_res.get('postal_codes'))
            != str(gmap_res.get('postal_code'))),
        'wiki_admin!=gmaps_admin': flag(
            str(wiki_res.get('admin_hierarchy'))
            != str(gmap_res.get('admin_hierarchy'))),
    }

    max_meters_in_distance = 5000
    try:
        too_far = Comparison.by_distance(
            wiki_res.get('center'),
            gmap_res.get('center')) > max_meters_in_distance
    except Exception:
        # Best effort: a distance that cannot be computed (e.g. a missing
        # center) counts as a mismatch.
        too_far = True
    # The key spelling ('posinion') is kept unchanged because the template
    # reads it at runtime.
    compare_res['wiki_posinion>gmaps_position'] = flag(too_far)
    return compare_res
# Set-up for a script that works on the French administrative-division CSV
# files with the French spider.
from lib.parser.wiki.France import France as ParserFranceWiki
from lib.factory.Loader import Loader as LoaderFactory
import csv
from lib.parser.map.google.GMapFactory import GMapFactory as MapFactory
from lib.spider.Spider import Spider
# Input files, listed in this order.
files = [
    'data/france/Departements_28_08_17_cards.csv',
    'data/france/arrondissements_25_08_17_cards.csv',
    'data/france/20_08_17_canton_google_3.csv',
    'data/france/communes_17_09_17.csv'
]
# NOTE(review): Config and DocFactory are used below but their imports are
# not visible in this excerpt.
config = Config('./config/config.yml')
doc_factory = DocFactory(config.get('mongodb'))
language = 'fr'
# Spider wired with the French map parser and wiki parser, caching enabled.
spider = Spider(loader_factory=LoaderFactory,
                gmap_parser=MapFactory.france,
                wiki_parser=ParserFranceWiki,
                doc_factory=doc_factory,
                language=language,
                config=config,
                use_cache=True)


def gmap_by_address(wiki):
    # Iterates the entries of the wiki document's 'admin_hierarchy' mapping
    # (an empty mapping when the key is absent).
    # NOTE(review): the loop body and the rest of this function are not
    # visible in this excerpt.
    address = []
    for name, value in wiki.get('admin_hierarchy', {}).items():
from lib.factory.StorageLocation import StorageLocation as DocFactory
from lib.config.Yaml import Yaml as Config
import re

# Backfills the 'true_name' field on every gmaps and wiki document that has
# a non-empty 'name'.
#
# Changes against the previous version of this script:
#   * the pattern is a raw string, so '\s' and '\(' are no longer invalid
#     string escapes, and it is compiled once instead of per document;
#   * only 'true_name' is sent in '$set' instead of the whole document
#     (which included '_id');
#   * each update targets the document being iterated through its '_id';
#     filtering on 'code' could hit another document when 'code' is missing
#     or shared by several documents;
#   * a leftover no-op re.sub() demo call was removed.

# Trailing qualifier in parentheses preceded by whitespace, e.g. the
# ' (Bar)' in 'Foo (Bar)'.
_TRAILING_PARENTHESES = re.compile(r'\s+\([^\)]+\)$')

config = Config('./config/config.yml')
doc_factory = DocFactory(config.get('mongodb'))
gmap_docs = doc_factory.gmaps_collection()
wiki_docs = doc_factory.wiki_collection()

# gmaps documents: true_name is the name itself.
for gmap_doc in gmap_docs.find():
    print(gmap_doc.get('name'), "\n")
    if gmap_doc.get('name'):
        gmap_docs.update_one({'_id': gmap_doc['_id']},
                             {'$set': {'true_name': gmap_doc.get('name')}})

# wiki documents: true_name is the name without a trailing '(...)' part.
for wiki_doc in wiki_docs.find():
    print(wiki_doc.get('name'), "\n")
    if wiki_doc.get('name'):
        true_name = _TRAILING_PARENTHESES.sub('', wiki_doc.get('name'))
        wiki_docs.update_one({'_id': wiki_doc['_id']},
                             {'$set': {'true_name': true_name}})
# Set-up for a script working on the internal, gmaps and wiki collections
# with a cached, French-language gmaps loader.
from lib.factory.StorageLocation import StorageLocation as DocFactory
# NOTE(review): the same Loader class is imported twice, once as Loader and
# once as LoaderFactory.
from lib.factory.Loader import Loader
from lib.config.Yaml import Yaml as Config
from lib.parser.map.google.GMapFactory import GMapFactory as MapFactory
from lib.factory.Loader import Loader as LoaderFactory
from lib.parser.wiki.France import France as ParserFranceWiki
from lib.spider.Spider import Spider
config = Config('./config/config.yml')
doc_factory = DocFactory(config.get('mongodb'))
internal_docs = doc_factory.internal_collection()
gmap_docs = doc_factory.gmaps_collection()
wiki_docs = doc_factory.wiki_collection()
language = 'fr'
# The googlemaps settings are extended in place with the language.
# NOTE(review): if config.get() returns the config's own dict rather than a
# copy, this also changes what later config.get('googlemaps') calls see --
# confirm.
gmap_config = config.get('googlemaps')
gmap_config.update(language=language)
gmap_loader = Loader.loader_gmaps_with_cache(
    gmaps_config=gmap_config,
    storage_config=config.get('mongodb'))
# Filter on documents whose 'name' exists and is not an empty array.
# NOTE(review): the rest of this dict literal is not visible in this excerpt.
document_filter = {
    'name': {
        '$exists': True,
        '$not': {
            '$size': 0
        }
    },