Example #1
0
from argparse import ArgumentParser

# Script setup: read the YAML config, parse the command line, then create
# the loader, the document factory and the file log used further down.
config = Config('./config/config.yml')

arg_parser = ArgumentParser(
    description='Download data from wiki by link or search request')
# '-f' is meant as a switch ("turn on the force mode").  Declared without
# nargs it demanded a value, so a bare `-f` aborted with "expected one
# argument".  nargs='?' + const=True makes a bare `-f` yield True while the
# older `-f VALUE` form keeps working and the default stays None.
arg_parser.add_argument('-f', nargs='?', const=True,
                        help='turn on the force mode')
arg_parser.add_argument('-l', help='custom link to page with result(s)')
opts = arg_parser.parse_args()

# NOTE(review): presumably positions of the corresponding fields in a parsed
# row -- confirm against the code that consumes these indexes.
insee_index = 0
name_index = 1
population_index = 2
# Truthy when -f was given (True for a bare -f, the raw string otherwise),
# None when it was omitted.
force_update = opts.f
headers = {'User-Agent': 'Mozilla/5.0'}
# The loader and the document factory are both built from the same
# 'mongodb' section of the config.
loader = Loader.loader_with_mongodb(config.get('mongodb'))
document_factory = DocFactory(config.get('mongodb'))
# One log file per calendar day; the date is taken when the script starts.
log = FileLog('./log/wiki_page_italy_{date}.log'.format(
    date=datetime.datetime.now().strftime('%Y-%m-%d')))
log.add('Start', log.INFO)
# NOTE(review): on Python 3 the encoded repr is formatted as b'...' in the
# log line -- confirm which interpreter this script targets before changing.
log.add('Params: [{0}]'.format(repr(opts).encode('utf-8')), log.INFO)

message_format = 'Parsing request:[{0}]'

# A non-empty -l value switches the script to "custom link" mode;
# custom_link is '' whenever -l is missing or empty.
use_link = bool(opts.l)
custom_link = opts.l if use_link else ''


def update_meta(url, request, document):
    """Record ``url`` on the object returned by ``document.get_document()``.

    That object's ``update`` method is called with ``url`` passed as a
    keyword argument.  ``request`` is accepted but not used in this body,
    and nothing is returned.
    """
    document.get_document().update(url=url)
from lib.factory.StorageLocation import StorageLocation as DocFactory
from lib.factory.Loader import Loader as LoaderFactory
from lib.config.Yaml import Yaml as Config
from lib.parser.wiki.France import France as WikiParser


def _report(passed):
    """Print '.' when ``passed`` is truthy, otherwise 'E'; no newline."""
    print('.' if passed else 'E', end='')


# Smoke check: load the French Wikipedia page for Paris, parse it, store the
# parsed fields in a wiki document and print one marker per check.
config = Config('./config/config.yml')
document_factory = DocFactory(config.get('mongodb'))

url = 'https://fr.wikipedia.org/wiki/Paris'
headers = {'User-Agent': 'Mozilla/5.0'}

loader = LoaderFactory.loader_with_mongodb(config.get('mongodb'))

# load() returns a pair; only the first element is used below.
content, code = loader.load(url, headers=headers)
parser = WikiParser(content)

# Check 1: the document obtained for this URL reports itself as new.
doc = document_factory.wiki(url)
_report(doc.is_new())

# Check 2: the document exposes a 'code' entry.
document = doc.get_document()
_report('code' in document)

# Check 3: after merging the parsed fields, the stored name is 'Paris'.
doc.update(parser.as_dictionary())
dic = doc.get_document()
_report(dic.get('name') == 'Paris')
Example #3
0
from lib.config.Yaml import Yaml as Config
from lib.factory.Loader import Loader as LoaderFactory
from lib.factory.StorageLocation import StorageLocation as DocFactory
from lib.parser.wiki.Italy import Italy
from lib.logger.MongoDB import MongoDB as Log
from time import sleep

# Job settings for the Italy wiki run.
force = True
country = 'Italia'

config = Config('./config/config.yml')

# Collaborators are created first, in the same order as before (loader, then
# document factory), and then gathered into ``options`` in one literal.
loader = LoaderFactory.loader_with_mongodb(
    storage_config=config.get('mongodb'))
doc_factory = DocFactory(config.get('mongodb'))

options = {
    'loader': loader,
    'doc_factory': doc_factory,
    'force_update': force,
    'parser': Italy,
    'headers': {'User-Agent': 'Mozilla/5.0'},
}

# Storage and log are both named after the page task for this country and
# both read the 'mongodb' section of the config.
storage = Storage(
    job_name=PageTask.get_name(country),
    storage_config=config.get('mongodb'),
)
log = Log(
    log_name=PageTask.get_name(country),
    config=config.get('mongodb'),
)

task_list = TaskListMongoDB(task_type=PageTask.get_name(country),