def routine(*dates):
    """Scrap the given dates and persist the resulting publications.

    Each positional argument is a date to scrap. When scraping yields no
    results, the function returns early without opening a database session.
    """
    results = _scrap_dates(dates)
    # Idiomatic emptiness check (was `len(results) == 0`); also avoids
    # opening a DB session when there is nothing to persist.
    if not results:
        return
    connectionstring = os.environ['DIARIOBOT_DATABASE_CONNECTIONSTRING']
    with Dbinterface(connectionstring).opensession() as session:
        for publicacao in _get_publicacoes(results):
            # each publicacao is a dict of Publicacao_Original column values
            session.add(Publicacao_Original(**publicacao))
        # single commit after all rows are staged
        session.commit()
# utils
def split(spliter, data, labels):
    """Yield the (data, labels) test fold of every split produced by *spliter*.

    The train indices of each fold are intentionally discarded; only the
    test partition is yielded.
    """
    for _train, test in spliter.split(data, labels):
        yield data[test], labels[test]


##
# getting data
appconfig = inout.read_yaml('./appconfig')
stopwords = inout.read_json('./stopwords') + inout.read_json('./stopwords.domain')
dbi = Dbinterface(appconfig['db']['connectionstring'])

print('retrieving data')
with dbi.opensession() as session:
    blacklist = list(session.query(Diario_Backlisted.palavra))
    contratos = session.query(Contrato).join(Contrato.predicao)
    # blacklisted words act as additional stopwords
    stopwords += [entry[0] for entry in blacklist]
    # materialise the query into plain dicts while the session is open,
    # since contrato.predicao is a lazily-loaded relationship
    contratos = [{
        '_id': contrato.id,
        'corpo': contrato.objeto,
        'classe': contrato.predicao.classe,
    } for contrato in contratos]

random_state = appconfig['random_state']
parser.add_argument('year', type=int, help='Year to scrap')
parser.add_argument('month', type=int, help='Month to scrap')
# parse the command line once instead of calling parse_args() twice
args = parser.parse_args()
year = args.year
month = args.month

##
# Scrap routine
print('starting scraping routine')

publicacoes = []
dates = get_dates(year, month)
for date in dates:
    print('scraping {}'.format(date))
    publicacoes += scraper.scrap(date)

##
# Persist results
print('persisting on database')

dbi = Dbinterface(os.environ['DIARIOBOT_DATABASE_CONNECTIONSTRING'])
with dbi.opensession() as session:
    for publicacao in publicacoes:
        # each publicacao is a dict of Publicacao_Original column values
        entry = Publicacao_Original(**publicacao)
        session.add(entry)
    # single commit after all rows are staged
    session.commit()