Ejemplo n.º 1
0
# Load the Google Maps settings and set their 'language' entry to 'it'
# before the loader is built from them.
gmaps_config = config.get('googlemaps')
gmaps_config.update(language='it')

# Loader built from the Google Maps settings plus the MongoDB settings.
loader = LoaderFactory.loader_gmaps_with_cache(
    gmaps_config=gmaps_config,
    storage_config=config.get('mongodb'),
)
options.update(loader=loader)

doc_factory = DocFactory(config.get('mongodb'))
options.update(doc_factory=doc_factory)

# Counter named 'gmap' counting from 1 to the configured geocoding limit.
# NOTE(review): ttl=86400 looks like a one-day window in seconds, i.e. a
# daily request quota -- confirm against CounterMongoDB.
mongo_config = config.get('mongodb')
connection = MongoClient(mongo_config['host'], mongo_config['port'])
geocoding_limit = gmaps_config.get('geocoding').get('limit')
counter = CounterMongoDB(
    counter_name='gmap',
    start=1,
    end=geocoding_limit,
    step=1,
    ttl=86400,
    connection=connection,
)

options.update(force_update=force, parser=Italy)

# Storage, log and task list all share the PositionTask name for `country`.
storage = Storage(
    job_name=PositionTask.get_name(country),
    storage_config=config.get('mongodb'),
)
log = Log(
    log_name=PositionTask.get_name(country),
    config=config.get('mongodb'),
)
task_list = TaskListMongoDB(
    task_type=PositionTask.get_name(country),
    options=options,
    storage=storage,
    log=log,
)

# Run the task list once, with the executor given the counter built above.
executor = ExecutorWithLimit(task_list, counter)
executor.run()
Ejemplo n.º 2
0
from lib.job.storage.MongoDB import MongoDB as Storage
from lib.job.map.google.AddressTask import AddressTask
from lib.config.Yaml import Yaml as Config
import pandas as pd

# Country used to derive the AddressTask storage name.
country = 'Italy'
# Addresses already handed to job_list.add(); the loop that follows checks
# this list so the same address is not added twice.
lst_address = []

# Positional column indexes into each CSV row (only region_index is read in
# the visible part of the loop).
region_index, provincia_index, comune_index, localita_index = 1, 3, 5, 9

# Only the 'mongodb' section of the YAML configuration is kept.
config = Config('./config/config.yml').get('mongodb')
job_list = Storage(AddressTask.get_name(country), config)

# Semicolon-delimited, ISO-8859-1 encoded file; skiprows=[1] drops the
# file's second line (the first line after the header row).
df = pd.read_csv(
    './data/italy/indicatori_2011_localita.csv',
    delimiter=";",
    skiprows=[1],
    encoding='ISO-8859-1',
)

for index, row in df.iterrows():
    print(index)
    try:
        new_address = 'Italia, '
        if row[region_index]:
            new_address += row[region_index]
            if new_address not in lst_address:
                lst_address.append(new_address)
                job_list.add(new_address)
Ejemplo n.º 3
0
# MongoDB connection used only by the request counter below.
mongo_config = config.get('mongodb')
connection = MongoClient(mongo_config['host'], mongo_config['port'])

# Counter named 'gmap' counting from 1 to the configured geocoding limit.
# NOTE(review): ttl=86400 looks like a one-day window in seconds -- confirm
# against CounterMongoDB.
geocoding_limit = gmaps_config.get('geocoding').get('limit')
counter = CounterMongoDB(
    counter_name='gmap',
    start=1,
    end=geocoding_limit,
    step=1,
    ttl=86400,
    connection=connection,
)

options.update(doc_factory=doc_factory, force_update=force, parser=Italy)

# Storage, log and task list all share the AddressTask name for `country`.
storage = Storage(
    job_name=AddressTask.get_name(country),
    storage_config=config.get('mongodb'),
)
log = Log(
    log_name=AddressTask.get_name(country),
    config=config.get('mongodb'),
)
task_list = TaskListMongoDB(
    task_type=AddressTask.get_name(country),
    options=options,
    storage=storage,
    log=log,
)

# Run the task list once, with the executor given the counter built above.
executor = ExecutorWithLimit(task_list, counter)
executor.run()
from lib.job.storage.MongoDB import MongoDB as Storage
from lib.job.map.google.PositionTask import PositionTask
from lib.config.Yaml import Yaml as Config
from lib.factory.StorageLocation import StorageLocation as DocFactory

# Country name: used to derive the PositionTask storage name and matched
# against the 'admin_hierarchy' entries in the query below.
country = 'Italia'

# Only the 'mongodb' section of the YAML configuration is kept.
config = Config('./config/config.yml').get('mongodb')
job_list = Storage(PositionTask.get_name(country), config)

factory = DocFactory(config)
wiki = factory.wiki_collection()

# MongoDB-style query: documents whose 'name' field exists and is not an
# empty array, and whose 'admin_hierarchy' has an element named `country`.
# NOTE(review): `filter` shadows the builtin of the same name; the name is
# kept because code past this fragment may still refer to it.
filter = {
    'name': {'$exists': True, '$not': {'$size': 0}},
    'admin_hierarchy': {'$elemMatch': {'name': country}},
}

objects = wiki.find(filter)
for obj in objects:
    try:
Ejemplo n.º 5
0
from lib.job.storage.MongoDB import MongoDB as Storage
from lib.job.wiki.RequestTask import RequestTask
from lib.config.Yaml import Yaml as Config
import pandas as pd

# Add one "insee+<code>" entry to the France RequestTask storage for every
# row of the tab-separated base file.
config = Config('./config/config.yml')

country = 'France'

job_list = Storage(RequestTask.get_name(country), config.get('mongodb'))

df = pd.read_csv('./WorkBaseFile/BaseCommuneInInseeFR', delimiter="\t")
for index, row in df.iterrows():
    # The code is the first column of the row. `.iloc[0]` selects it by
    # position explicitly: `row[0]` on a Series indexed by column labels
    # relies on the positional fallback that pandas deprecated in 2.1.
    insee = row.iloc[0]
    job_list.add("insee+{insee}".format(insee=insee))
from lib.job.wiki.RequestTask import RequestTask
from lib.config.Yaml import Yaml as Config
import pandas as pd
import urllib.parse

# Country used to derive the RequestTask storage name.
country = 'Italy'
# Addresses already handed to job_list.add(); the loop that follows checks
# this list so the same address is not added twice.
lst_address = []

# Positional column indexes into each CSV row (only region_index is read in
# the visible part of the loop).
region_index, provincia_index, comune_index, localita_index = 1, 3, 5, 9

# Only the 'mongodb' section of the YAML configuration is kept.
config = Config('./config/config.yml').get('mongodb')
job_list = Storage(RequestTask.get_name(country), config)

# Semicolon-delimited, ISO-8859-1 encoded file; skiprows=[1] drops the
# file's second line (the first line after the header row).
df = pd.read_csv(
    './data/italy/indicatori_2011_localita.csv',
    delimiter=";",
    skiprows=[1],
    encoding='ISO-8859-1',
)

for index, row in df.iterrows():
    print(index)
    try:
        new_address = 'Italia,'
        if row[1]:
            new_address += row[region_index]
            if new_address not in lst_address:
                lst_address.append(new_address)
                job_list.add(urllib.parse.quote(new_address))
Ejemplo n.º 7
0
# Loader built from the MongoDB settings.
loader = LoaderFactory.loader_with_mongodb(storage_config=config.get('mongodb'))
options.update(loader=loader)

doc_factory = DocFactory(config.get('mongodb'))
options.update(doc_factory=doc_factory)

# Remaining task options: parser, HTTP headers and the French Wikipedia
# search URL template ({0} is the placeholder for the search term).
# NOTE(review): the searchToken query parameter is hard-coded -- confirm it
# does not expire.
options.update(
    force_update=force,
    parser=France,
    headers={'User-Agent': 'Mozilla/5.0'},
    url_format="https://fr.wikipedia.org/w/index.php?search={0}&title=Sp%C3%A9cial:Recherche&profile=default&fulltext=1&searchengineselect=mediawiki&searchToken=ac9zaxa1lggzxpdhc5ukg06t6",
)

# Storage, log and task list all share the RequestTask name for `country`.
storage = Storage(
    job_name=RequestTask.get_name(country),
    storage_config=config.get('mongodb'),
)
log = Log(
    log_name=RequestTask.get_name(country),
    config=config.get('mongodb'),
)
task_list = TaskListMongoDB(
    task_type=RequestTask.get_name(country),
    options=options,
    storage=storage,
    log=log,
)

executor = Executor(task_list)

# Poll forever: run the executor, pause, repeat.
# NOTE(review): `sleep` is presumably time.sleep (10 seconds) -- confirm
# against the imports, which are outside this fragment.
while True:
    executor.run()
    sleep(10)
Ejemplo n.º 8
0
from lib.config.Yaml import Yaml as Config
from lib.factory.StorageLocation import StorageLocation as DocFactory
import pandas as pd
import urllib.parse

# Country name used to derive the PageTask storage name.
country = 'Italia'
lst_address = []

# Positional column indexes; not read anywhere in the visible fragment.
region_index, provincia_index, comune_index, localita_index = 1, 3, 5, 9

# Only the 'mongodb' section of the YAML configuration is kept.
config = Config('./config/config.yml').get('mongodb')
job_list = Storage(PageTask.get_name(country), config)

factory = DocFactory(config)
wiki = factory.wiki_collection()

filter = {
    'name': {
        '$exists': True,
        '$not': {
            '$size': 0
        }
    },
    'admin_hierarchy': {
        '$elemMatch': {
            'name': country
        }
Ejemplo n.º 9
0
# Options dict passed to TaskListMongoDB below.
options = {}

loader = LoaderFactory.loader_with_mongodb(storage_config=config.get('mongodb'))
doc_factory = DocFactory(config.get('mongodb'))

# One update call instead of six; keys are inserted in the same order as
# before (loader, doc_factory, force_update, parser, host, headers).
options.update(
    loader=loader,
    doc_factory=doc_factory,
    force_update=force,
    parser=Italy,
    host='it.wikipedia.org',
    headers={'User-Agent': 'Mozilla/5.0'},
)

storage = Storage(
    job_name=PageRecursiveTask.TYPE,
    storage_config=config.get('mongodb'),
)

# The bare `options.update()` that used to sit here was a no-op on a plain
# dict and has been dropped.

# Per-title helpers: a log file at log/<title>.log and a RecursiveParser
# built from the same title and the MongoDB settings.
options.update(
    log_history=LogHistory('log/{}.log'.format(title)),
    recursive_storage=RecursiveParser(title, config.get('mongodb')),
)

log = Log(log_name=PageRecursiveTask.TYPE, config=config.get('mongodb'))

task_list = TaskListMongoDB(
    task_type=PageRecursiveTask.TYPE,
    options=options,
    storage=storage,
    log=log,
)

executor = Executor(task_list)
Ejemplo n.º 10
0
from lib.job.storage.MongoDB import MongoDB as Storage
from lib.job.wiki.RequestTask import RequestTask
from lib.config.Yaml import Yaml as Config
import pandas as pd
from lib.job.wiki.PageRecursiveTask import PageRecursiveTask

# Add one {'link', 'level'} entry to the Italy PageRecursiveTask storage for
# every row of the tab-separated URL list.
config = Config('./config/config.yml')

country = 'Italy'

# Value stored under 'level' for every queued link.
max_dig_level = 4

job_list = Storage(PageRecursiveTask.get_name(country), config.get('mongodb'))

df = pd.read_csv('./WorkBaseFile/ItalyUrlMainList', delimiter="\t")
for index, row in df.iterrows():
    # The link is the first column of the row. `.iloc[0]` selects it by
    # position explicitly: `row[0]` on a Series indexed by column labels
    # relies on the positional fallback that pandas deprecated in 2.1.
    link = row.iloc[0]
    job_list.add({'link': link, 'level': max_dig_level})