# Run the queued PositionTask jobs for `country` through the Google Maps
# loader, using ExecutorWithLimit together with a MongoDB-backed counter.
# `config`, `options`, `force` and `country` are defined outside this fragment.

# Google Maps settings, with the response language forced to Italian.
gmaps_config = config.get('googlemaps')
gmaps_config.update(language='it')

loader = LoaderFactory.loader_gmaps_with_cache(
    gmaps_config=gmaps_config,
    storage_config=config.get('mongodb'),
)
doc_factory = DocFactory(config.get('mongodb'))
options.update(loader=loader, doc_factory=doc_factory)

# Counter named 'gmap' running from 1 up to the configured geocoding limit.
# ttl=86400 is presumably seconds (one day) -- confirm against CounterMongoDB.
mongo_config = config.get('mongodb')
connection = MongoClient(mongo_config['host'], mongo_config['port'])
geocoding_limit = gmaps_config.get('geocoding').get('limit')
counter = CounterMongoDB(
    counter_name='gmap',
    start=1,
    end=geocoding_limit,
    step=1,
    ttl=86400,
    connection=connection,
)

options.update(force_update=force, parser=Italy)

# Job storage, log and task list all share the PositionTask name for `country`.
storage = Storage(
    job_name=PositionTask.get_name(country),
    storage_config=config.get('mongodb'),
)
log = Log(
    log_name=PositionTask.get_name(country),
    config=config.get('mongodb'),
)
task_list = TaskListMongoDB(
    task_type=PositionTask.get_name(country),
    options=options,
    storage=storage,
    log=log,
)

executor = ExecutorWithLimit(task_list, counter)
executor.run()
# Seeds the AddressTask job storage for country 'Italy' from the CSV
# './data/italy/indicatori_2011_localita.csv' (';'-delimited, ISO-8859-1,
# file line index 1 skipped). For each row the index is printed and an address
# string 'Italia, ' + row[region_index] is built; addresses not yet present in
# lst_address are appended to it and added to job_list.
# NOTE(review): whitespace is collapsed here, so the nesting of the two `if`
# statements cannot be determined from this fragment; it also ends inside an
# open `try:` with no except/finally visible.
# NOTE(review): provincia_index, comune_index and localita_index are not used
# in the visible part -- presumably used further down; verify.
from lib.job.storage.MongoDB import MongoDB as Storage from lib.job.map.google.AddressTask import AddressTask from lib.config.Yaml import Yaml as Config import pandas as pd country = 'Italy' lst_address = [] region_index = 1 provincia_index = 3 comune_index = 5 localita_index = 9 config = Config('./config/config.yml').get('mongodb') job_list = Storage(AddressTask.get_name(country), config) df = pd.read_csv('./data/italy/indicatori_2011_localita.csv', delimiter=";", skiprows=[1], encoding='ISO-8859-1') for index, row in df.iterrows(): print(index) try: new_address = 'Italia, ' if row[region_index]: new_address += row[region_index] if new_address not in lst_address: lst_address.append(new_address) job_list.add(new_address)
# Run the queued AddressTask jobs for `country` with ExecutorWithLimit and a
# MongoDB-backed counter. `config`, `gmaps_config`, `options`, `doc_factory`,
# `force` and `country` are defined outside this fragment.

# Counter named 'gmap' running from 1 up to the configured geocoding limit.
# ttl=86400 is presumably seconds (one day) -- confirm against CounterMongoDB.
mongo_config = config.get('mongodb')
connection = MongoClient(mongo_config['host'], mongo_config['port'])
geocoding_limit = gmaps_config.get('geocoding').get('limit')
counter = CounterMongoDB(
    counter_name='gmap',
    start=1,
    end=geocoding_limit,
    step=1,
    ttl=86400,
    connection=connection,
)

options.update(
    doc_factory=doc_factory,
    force_update=force,
    parser=Italy,
)

# Job storage, log and task list all share the AddressTask name for `country`.
storage = Storage(
    job_name=AddressTask.get_name(country),
    storage_config=config.get('mongodb'),
)
log = Log(
    log_name=AddressTask.get_name(country),
    config=config.get('mongodb'),
)
task_list = TaskListMongoDB(
    task_type=AddressTask.get_name(country),
    options=options,
    storage=storage,
    log=log,
)

executor = ExecutorWithLimit(task_list, counter)
executor.run()
# Prepares PositionTask job seeding for country 'Italia': opens the job storage
# and the wiki collection, then queries the collection for documents whose
# 'name' field exists and is not an empty array and whose 'admin_hierarchy'
# array contains an element with name equal to `country`. The matching
# documents are iterated one by one.
# NOTE(review): this fragment ends on a bare `try:`; the per-document handling
# is not visible here.
# NOTE(review): the variable `filter` shadows the Python builtin of that name.
from lib.job.storage.MongoDB import MongoDB as Storage from lib.job.map.google.PositionTask import PositionTask from lib.config.Yaml import Yaml as Config from lib.factory.StorageLocation import StorageLocation as DocFactory country = 'Italia' config = Config('./config/config.yml').get('mongodb') job_list = Storage(PositionTask.get_name(country), config) factory = DocFactory(config) wiki = factory.wiki_collection() filter = { 'name': { '$exists': True, '$not': { '$size': 0 } }, 'admin_hierarchy': { '$elemMatch': { 'name': country } } } objects = wiki.find(filter) for obj in objects: try:
# Seed the RequestTask job storage for France: one "insee+<value>" job is
# added for each row of the tab-separated file
# './WorkBaseFile/BaseCommuneInInseeFR', using the row's first column.
from lib.job.storage.MongoDB import MongoDB as Storage
from lib.job.wiki.RequestTask import RequestTask
from lib.config.Yaml import Yaml as Config
import pandas as pd

config = Config('./config/config.yml')
country = 'France'
job_list = Storage(RequestTask.get_name(country), config.get('mongodb'))

df = pd.read_csv('./WorkBaseFile/BaseCommuneInInseeFR', delimiter="\t")
for _, row in df.iterrows():
    # Take the first column explicitly by position. `row[0]` relied on the
    # positional fallback of Series.__getitem__, which pandas has deprecated
    # for label-indexed Series (the row index is the CSV header here).
    insee = row.iloc[0]
    job_list.add("insee+{insee}".format(insee=insee))
# Seeds the RequestTask job storage for country 'Italy' from the CSV
# './data/italy/indicatori_2011_localita.csv' (';'-delimited, ISO-8859-1,
# file line index 1 skipped). For each row the index is printed and an address
# string 'Italia,' + row[region_index] is built; addresses not yet present in
# lst_address are appended to it and added to job_list after URL-quoting with
# urllib.parse.quote.
# NOTE(review): `Storage` is used but its import is not visible in this
# fragment -- presumably imported above; verify.
# NOTE(review): the truthiness check uses the literal `row[1]` while the
# concatenation uses `row[region_index]`; both refer to index 1 here.
# NOTE(review): whitespace is collapsed, so the nesting of the two `if`
# statements cannot be determined, and the fragment ends inside an open `try:`.
from lib.job.wiki.RequestTask import RequestTask from lib.config.Yaml import Yaml as Config import pandas as pd import urllib.parse country = 'Italy' lst_address = [] region_index = 1 provincia_index = 3 comune_index = 5 localita_index = 9 config = Config('./config/config.yml').get('mongodb') job_list = Storage(RequestTask.get_name(country), config) df = pd.read_csv('./data/italy/indicatori_2011_localita.csv', delimiter=";", skiprows=[1], encoding='ISO-8859-1') for index, row in df.iterrows(): print(index) try: new_address = 'Italia,' if row[1]: new_address += row[region_index] if new_address not in lst_address: lst_address.append(new_address) job_list.add(urllib.parse.quote(new_address))
# Process the queued RequestTask jobs for `country` against the French
# Wikipedia search page, re-running the executor forever with a sleep(10)
# pause after each pass. `config`, `options`, `force` and `country` are defined
# outside this fragment.

loader = LoaderFactory.loader_with_mongodb(
    storage_config=config.get('mongodb'),
)
doc_factory = DocFactory(config.get('mongodb'))

# `url_format` carries a `{0}` placeholder for the search term.
options.update(
    loader=loader,
    doc_factory=doc_factory,
    force_update=force,
    parser=France,
    headers={'User-Agent': 'Mozilla/5.0'},
    url_format="https://fr.wikipedia.org/w/index.php?search={0}&title=Sp%C3%A9cial:Recherche&profile=default&fulltext=1&searchengineselect=mediawiki&searchToken=ac9zaxa1lggzxpdhc5ukg06t6",
)

# Job storage, log and task list all share the RequestTask name for `country`.
storage = Storage(
    job_name=RequestTask.get_name(country),
    storage_config=config.get('mongodb'),
)
log = Log(
    log_name=RequestTask.get_name(country),
    config=config.get('mongodb'),
)
task_list = TaskListMongoDB(
    task_type=RequestTask.get_name(country),
    options=options,
    storage=storage,
    log=log,
)

executor = Executor(task_list)
# Endless polling loop: there is no exit condition in this fragment.
while True:
    executor.run()
    sleep(10)
# Prepares PageTask job seeding for country 'Italia': opens the job storage and
# the wiki collection, then starts building a query filter for documents whose
# 'name' field exists and is not an empty array and whose 'admin_hierarchy'
# array contains an element with name equal to `country`.
# NOTE(review): `Storage` and `PageTask` are used but their imports are not
# visible in this fragment -- presumably imported above; verify.
# NOTE(review): the fragment is cut off inside the `filter` dict literal, so
# how the filter is used is not visible. `filter` also shadows the builtin.
# NOTE(review): lst_address and the *_index constants are not used in the
# visible part.
from lib.config.Yaml import Yaml as Config from lib.factory.StorageLocation import StorageLocation as DocFactory import pandas as pd import urllib.parse country = 'Italia' lst_address = [] region_index = 1 provincia_index = 3 comune_index = 5 localita_index = 9 config = Config('./config/config.yml').get('mongodb') job_list = Storage(PageTask.get_name(country), config) factory = DocFactory(config) wiki = factory.wiki_collection() filter = { 'name': { '$exists': True, '$not': { '$size': 0 } }, 'admin_hierarchy': { '$elemMatch': { 'name': country }
# Build the options, storage, log and task list for PageRecursiveTask against
# it.wikipedia.org and wrap the task list in an Executor. `config`, `force` and
# `title` are defined outside this fragment; the executor is only constructed
# here, not run.

# Collaborators are created in the same order as before, since their
# constructors may perform I/O.
loader = LoaderFactory.loader_with_mongodb(
    storage_config=config.get('mongodb'),
)
doc_factory = DocFactory(config.get('mongodb'))
storage = Storage(
    job_name=PageRecursiveTask.TYPE,
    storage_config=config.get('mongodb'),
)

# One dict literal replaces the chain of single-key `options.update(...)`
# calls; the stray argument-less `options.update()` (a no-op) is dropped.
options = {
    'loader': loader,
    'doc_factory': doc_factory,
    'force_update': force,
    'parser': Italy,
    'host': 'it.wikipedia.org',
    'headers': {'User-Agent': 'Mozilla/5.0'},
    'log_history': LogHistory('log/{}.log'.format(title)),
    'recursive_storage': RecursiveParser(title, config.get('mongodb')),
}

log = Log(log_name=PageRecursiveTask.TYPE, config=config.get('mongodb'))
task_list = TaskListMongoDB(
    task_type=PageRecursiveTask.TYPE,
    options=options,
    storage=storage,
    log=log,
)
executor = Executor(task_list)
# Seed the PageRecursiveTask job storage for Italy: one job dict
# {'link': <first column>, 'level': max_dig_level} is added for each row of
# the tab-separated file './WorkBaseFile/ItalyUrlMainList'.
from lib.job.storage.MongoDB import MongoDB as Storage
from lib.job.wiki.RequestTask import RequestTask
from lib.config.Yaml import Yaml as Config
import pandas as pd
from lib.job.wiki.PageRecursiveTask import PageRecursiveTask

config = Config('./config/config.yml')
country = 'Italy'
# Stored as 'level' on every job; presumably the recursion depth budget --
# confirm against PageRecursiveTask.
max_dig_level = 4
job_list = Storage(PageRecursiveTask.get_name(country), config.get('mongodb'))

df = pd.read_csv('./WorkBaseFile/ItalyUrlMainList', delimiter="\t")
for _, row in df.iterrows():
    # Take the first column explicitly by position. `row[0]` relied on the
    # positional fallback of Series.__getitem__, which pandas has deprecated
    # for label-indexed Series (the row index is the CSV header here).
    link = row.iloc[0]
    job_list.add({'link': link, 'level': max_dig_level})