Example #1
import os

# `Dbinterface`, `Publicacao_Original`, `_scrap_dates` and `_get_publicacoes`
# are project helpers imported or defined elsewhere in the module.


def routine(*dates):
    # scrape every requested date; bail out early when nothing was found
    results = _scrap_dates(dates)
    if len(results) == 0:
        return

    # persist all publications within a single session and commit once
    dbi = Dbinterface(os.environ['DIARIOBOT_DATABASE_CONNECTIONSTRING'])
    with dbi.opensession() as session:
        for publicacao in _get_publicacoes(results):
            entry = Publicacao_Original(**publicacao)
            session.add(entry)

        session.commit()
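
# A minimal usage sketch (the concrete dates are illustrative assumptions;
# `_scrap_dates` presumably expects datetime.date objects):
#
#     from datetime import date
#     routine(date(2021, 5, 3), date(2021, 5, 4))
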
# utils


def split(splitter, data, labels):
    # yield only the held-out portion of each cross-validation fold
    for train, test in splitter.split(data, labels):
        yield data[test], labels[test]
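
# A minimal sketch driving `split` with scikit-learn (an assumption -- any
# splitter exposing a compatible .split(data, labels) method works):
#
#     from sklearn.model_selection import StratifiedKFold
#     folds = split(StratifiedKFold(n_splits=5), data, labels)
#     for fold_data, fold_labels in folds:
#         ...  # evaluate on one held-out fold per iteration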


##
# getting data

# `inout` is a project helper for reading config files
appconfig = inout.read_yaml('./appconfig')
stopwords = inout.read_json('./stopwords') + inout.read_json(
    './stopwords.domain')

dbi = Dbinterface(appconfig['db']['connectionstring'])

print('retrieving data')
with dbi.opensession() as session:
    blacklist = list(session.query(Diario_Backlisted.palavra))

    # materialize the results while the session is still open, so the
    # lazy-loaded `predicao` relationship can be accessed safely
    contratos = [{
        '_id': contrato.id,
        'corpo': contrato.objeto,
        'classe': contrato.predicao.classe
    } for contrato in session.query(Contrato).join(Contrato.predicao)]

# domain blacklist entries count as extra stopwords
stopwords += [entry[0] for entry in blacklist]

random_state = appconfig['random_state']
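
##
# A hedged sketch of the typical next step (the TF-IDF vectorizer and the
# train/test split below are illustrative assumptions, not necessarily the
# project's actual model):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

corpus = [contrato['corpo'] for contrato in contratos]
classes = [contrato['classe'] for contrato in contratos]

features = TfidfVectorizer(stop_words=stopwords).fit_transform(corpus)
train_X, test_X, train_y, test_y = train_test_split(
    features, classes, random_state=random_state)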
Example #3
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument('year', type=int, help='Year to scrape')
parser.add_argument('month', type=int, help='Month to scrape')

# parse once instead of calling parse_args() per attribute
args = parser.parse_args()
year, month = args.year, args.month
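
# Invocation sketch (the script name here is hypothetical):
#
#     $ python scrap_month.py 2021 5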

##
# Scrap routine

print('starting scraping routine')

# `scraper.scrap` is the project's scraping entry point, imported elsewhere
publicacoes = []
for date in get_dates(year, month):
    print('scraping {}'.format(date))
    publicacoes += scraper.scrap(date)
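
# `get_dates` is defined elsewhere in the project; a plausible stdlib-only
# implementation (an assumption, for illustration only) would be:
#
#     import calendar
#     from datetime import date
#
#     def get_dates(year, month):
#         _, last_day = calendar.monthrange(year, month)
#         return [date(year, month, day) for day in range(1, last_day + 1)]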

##
# Persist results

print('persisting to database')

dbi = Dbinterface(os.environ['DIARIOBOT_DATABASE_CONNECTIONSTRING'])
with dbi.opensession() as session:
    # stage every publication, then commit once at the end
    for publicacao in publicacoes:
        entry = Publicacao_Original(**publicacao)
        session.add(entry)

    session.commit()
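
##
# For reference, a minimal sketch of what `Dbinterface.opensession` could look
# like with SQLAlchemy (an assumption -- the real class lives elsewhere in the
# project):
#
#     from contextlib import contextmanager
#     from sqlalchemy import create_engine
#     from sqlalchemy.orm import sessionmaker
#
#     class Dbinterface:
#         def __init__(self, connectionstring):
#             self.sessionmaker = sessionmaker(
#                 bind=create_engine(connectionstring))
#
#         @contextmanager
#         def opensession(self):
#             session = self.sessionmaker()
#             try:
#                 yield session
#             finally:
#                 session.close()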