Example 1
    def _check_url_presence(self, urls):
        # Pull only the rows whose Url matches one of the requested URLs.
        # The query targets Scrapes, so the table argument is aligned with it
        # (the original passed 'BaseScrapes' here, which looks like a slip).
        # Note: naive string quoting; fine for trusted URLs, unsafe otherwise.
        scrapes_table = read_sql_df(
            columns=['url'],
            table='Scrapes',
            query='SELECT Url FROM Scrapes WHERE Url IN ({})'.format(
                ','.join("'{}'".format(url) for url in urls)))

        # All URLs are present when the result is non-empty and none is missing.
        found = set(scrapes_table['url'].values)
        return len(scrapes_table) > 0 and all(url in found for url in urls)
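
For reference, the same presence check in isolation, with made-up values and no database involved:

# Pure-Python restatement of the check above; the sample URLs are invented.
urls = ['https://a.example', 'https://b.example']
found = {'https://a.example', 'https://b.example', 'https://c.example'}
# Non-empty result set and every requested URL present:
assert len(found) > 0 and all(u in found for u in urls)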
Example 2
    def sql(self):
        # One SQL statement per tracked coin symbol.
        df = read_sql_df(['coin_id', 'previous_rank', 'rank', 'symbol'])
        for coin in df['symbol'].values:
            yield self.sql_template.format(
                coin=coin,
                daily_trends_table=_daily_trends_table,
                values_table=_values_table)
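
A hypothetical driver for this generator pattern; the Demo class, its template, and the coin list below are invented for illustration only:

class Demo:
    sql_template = ("INSERT INTO {daily_trends_table} "
                    "SELECT * FROM {values_table} WHERE symbol = '{coin}'")

    def sql(self):
        # Stand-in for the real method: fixed symbols instead of a DB read.
        for coin in ['BTC', 'ETH']:
            yield self.sql_template.format(
                coin=coin,
                daily_trends_table='DailyTrends',
                values_table='Values')


for statement in Demo().sql():
    print(statement)  # each statement could be handed to a DB-API cursor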
Example 3
    def transform(self, df):
        # Attach each row's coin_id by joining on the coin symbol.
        coins = read_sql_df(['coin_id', 'rank', 'previous_rank', 'symbol', 'name'], table=_coins_table)
        complete_dataset = df.merge(coins[['symbol', 'coin_id']], on='symbol', how='inner')
        # Drop index artifacts and redundant columns; errors='ignore' keeps the
        # drop from raising when a column is absent from a given scrape.
        complete_dataset = complete_dataset.drop(
            ['level_0', 'Unnamed: 0', 'index', 'name', 'id'], axis=1, errors='ignore')

        complete_dataset['Date'] = '{:%Y-%m-%d}'.format(self.date_hour.date())
        return complete_dataset
Example 4
    def _get_not_scraped(self, urls):
        cols = get_table_columns('Scrapes')

        scraped = read_sql_df(
            columns=cols,
            table='Scrapes')

        # Build the lookup set once rather than on every iteration, and keep
        # the input order of `urls` in the result.
        scraped_urls = set(scraped['url'])
        return [u for u in urls if u not in scraped_urls]
Example 5
    def run(self):
        # '{{table}}' survives this .format() call as '{table}', leaving the
        # table-name substitution to read_sql_df itself.
        base_scrape = read_sql_df(
            columns=get_table_columns('BaseScrapes'),
            table='BaseScrapes',
            query="SELECT * FROM {{table}} WHERE ScrapeSource='{source}'".format(source=self.source))

        # Exactly one base-scrape row is expected per source.
        source, url, subsections = base_scrape.values[0]

        df = self.scrape_top_level(url) if subsections is None else self._scrape_multiple(url, subsections)

        s3_write(df, 'parquet', self.output().path)
Example 6
    def run(self):
        # Dump the configured table straight to the task's output location.
        df = read_sql_df(columns=self.columns, table=self.table)
        s3_write(df, self.output_format, self.output_path)
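
None of the snippets define the shared helpers read_sql_df, get_table_columns, and s3_write. A minimal sketch of what they might look like, with signatures inferred from the call sites above; the bodies, the sqlite connection, and the format dispatch are assumptions, not the project's actual implementation:

import sqlite3

import pandas as pd

# Assumed module-level connection; the real project presumably manages its own.
_conn = sqlite3.connect(':memory:')


def read_sql_df(columns, table=None, query=None):
    # Signature inferred from the call sites above; the body is an assumption.
    if query is None:
        query = 'SELECT {} FROM {{table}}'.format(', '.join(columns))
    # Example 5 passes a '{table}' placeholder through, so substitute it here.
    df = pd.read_sql(query.format(table=table), _conn)
    df.columns = columns  # normalize to the requested lowercase names
    return df


def get_table_columns(table):
    # Assumption: read the column names from an empty result set.
    return list(pd.read_sql('SELECT * FROM {} WHERE 1=0'.format(table), _conn).columns)


def s3_write(df, fmt, path):
    # Assumption: dispatch on the requested output format; pandas can write
    # directly to s3:// paths when s3fs is installed.
    if fmt == 'parquet':
        df.to_parquet(path)
    else:
        df.to_csv(path, index=False)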