def shuffle_table(table=None, engine=None, shuffle=''):
    if table is None:
        table = config['TABLES']['QUEUE']
    print()
    if shuffle is True:
        x = 'y'
    elif shuffle is False:
        x = 'n'
    else:
        x = input(f'Shuffle {table}? Recommended to prevent IP being banned (y/n): ')
    print()
    if x == 'y':
        if engine is None:
            engine = mysql_engine()
        # Rebuild the table in random order via a dated backup table.
        temp = f'{table}_backup_' + datetime.now().strftime('%d_%m')
        tprint(f'[·] Shuffling table {table} (can take up to 5 mins).')
        engine.execute(f'create table {temp} like {table}')
        engine.execute(f'insert into {temp} (original_link) select original_link from {table} order by rand()')
        engine.execute(f'drop table {table}')
        engine.execute(f'rename table {temp} to {table}')
        tprint('[+] Done shuffling.')

def get_full_df(n_pools=15, n=150, queue_table=None, processed_table=None,
                delete=False, engine=None, con=None, rand=False):
    if engine is None:
        engine = mysql_engine()
    if queue_table is None:
        queue_table = config['TABLES']['QUEUE']
    if processed_table is None:
        processed_table = config['TABLES']['PROCESSED']
    # Fetch a chunk of links from the queue and scrape them in parallel.
    tprint('[·] Getting chunk...')
    chunk = get_chunk_from_db(n=n, queue_table=queue_table, processed_table=processed_table,
                              delete=delete, engine=engine, con=con, rand=rand)
    tprint('[·] Populating chunk...')
    df = populate_df(chunk, n_pools=n_pools)
    return df

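def example_get_full_df():
    # Usage sketch (illustrative only, not part of the pipeline): fetch and
    # scrape a small batch without removing links from the queue. Table names
    # and credentials are taken from `config`, as above.
    df = get_full_df(n_pools=4, n=20, delete=False)
    print(df.head())
    return df
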
def populate_df(df, n_pools=15):
    pd.options.mode.chained_assignment = None
    row_list = list(df.T.to_dict().values())
    # Process rows in a worker pool; chunks of 15 rows are sent to each worker.
    p = Pool(n_pools, init_worker)
    try:
        news_dict = p.map_async(process_row, row_list, 15)
        news_dict.wait()
        out = pd.DataFrame(news_dict.get())
        p.close()
        p.join()
        # Set dummy variables to 0 instead of None.
        try:
            out.borrar[out.borrar.isnull()] = 0
        except Exception:
            pass
        return out
    except KeyboardInterrupt:
        print()
        tprint('Interrupted')
        p.terminate()
        p.join()
        sys.exit()

def process_Biobiochile(page):
    d = process_inner(page)
    soup = bs(page.content, 'lxml')
    try:
        d['authors'] = soup.find('div', {'class': 'nota-autor'}).find('a').text
    except Exception as exc:
        tprint('[-] Error parsing authors (Biobiochile) - ', exc, important=False)
    try:
        d['section'] = ' '.join(soup.find('div', {'class': 'categoria-titulo-nota'}).text.split())
    except Exception as exc:
        tprint('[-] Error parsing section (Biobiochile) - ', exc, important=False)
    try:
        d['body'] = soup.find('div', {'class': 'nota-body'}).text
        d['body'] = d['body'].replace('Etiquetas de esta nota:', '')
    except Exception as exc:
        tprint('[-] Error parsing body (Biobiochile) - ', exc, important=False)
    # Biobiochile pages do not expose a usable description.
    d['description'] = None
    return d

def reinsert_from_error_to_queue(engine=None, con=None, where=''):
    queue_table = config['TABLES']['QUEUE']
    error_table = config['TABLES']['ERROR']
    close = False
    close_ = False
    if engine is None:
        engine = mysql_engine()
        con = engine.connect()
        close = True
    if con is None:
        con = engine.connect()
        close_ = True

    # Where clause
    if where == '':
        where = input('Where clause for mysql query:\n\t- ')
        print()

    # Count and confirm
    tprint('[·] Counting links...')
    count = engine.execute(f'select count(*) from {error_table} where {where}').scalar()
    y = input(f'\nAre you sure you want to reinsert {count} links? (y/n): ')
    print()
    if y == 'y':
        # Get links to be reinserted
        tprint('[·] Getting Links...')
        to_be_reinserted = mysql_query_as_set(f'select original_link from {error_table} where {where};', con=con)

        # Reinserting into queue
        tprint('[·] Reinserting into queue table...')
        insert_set(to_be_reinserted, queue_table, 'original_link', engine=engine, con=con)

        # Delete from error
        tprint('[·] Deleting from error table...')
        engine.execute(f'delete from {error_table} where {where}')
        count_error = engine.execute(f'select count(*) from {error_table}').scalar()
        tprint(f'[+] Done! {count_error} links left in {error_table} table')

    if close:
        con.close()
        engine.dispose()
    if close_:
        con.close()

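def example_reinsert_connection_errors():
    # Usage sketch (illustrative only): requeue links whose scrape failed with a
    # connection error. The where clause is interpolated verbatim into the MySQL
    # query against the error table.
    reinsert_from_error_to_queue(where="info like '%ConnectionError%'")
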
def update_db(df, backup=None, queue=None, engine=None, con=None, shuffle=False):
    if backup is None:
        backup = config['TABLES']['BACKUP']
    if queue is None:
        queue = config['TABLES']['QUEUE']
    if engine is None:
        engine = mysql_engine()
    if con is None:
        con = engine.connect()
    print()
    # Stage the links in a temporary table, then copy them into backup and queue.
    df.to_sql('erase', con=con, index=False, if_exists='append', chunksize=50000)
    engine.execute(f'insert ignore into {backup} (original_link) select original_link from erase')
    engine.execute(f'insert ignore into {queue} (original_link) select original_link from erase')
    engine.execute('drop table if exists erase')
    tprint('Successfully added urls to database.')
    shuffle_table(engine=engine, shuffle=shuffle)

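def example_update_db():
    # Usage sketch (illustrative only): queue a handful of URLs. `update_db`
    # expects a DataFrame with an `original_link` column; the URL below is a
    # made-up placeholder.
    links = pd.DataFrame({'original_link': ['https://www.emol.com/noticias/ejemplo.html']})
    update_db(links, shuffle=False)
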
def process_Emol(page):
    d = process_inner(page)
    soup = bs(page.content, 'lxml')
    try:
        d['section'] = d['link'].split('/')[4].capitalize()
    except Exception as exc:
        tprint('[-] Error parsing section (Emol) - ', exc, important=False)
    try:
        d['authors'] = soup.find('div', {'class': 'info-notaemol-porfecha'}).text.split('|')[-1].strip().replace('Por ', '').replace('Redactado por ', '')
    except Exception as exc:
        tprint('[-] Error parsing authors (Emol) - ', exc, important=False)
    return d

def get_chunk_from_db(n=150, queue_table=None, processed_table=None, delete=False,
                      engine=None, con=None, rand=False):
    if engine is None:
        engine = mysql_engine()
    if queue_table is None:
        queue_table = config['TABLES']['QUEUE']
    if processed_table is None:
        processed_table = config['TABLES']['PROCESSED']
    if con is None:
        con = engine.connect()
    order = 'order by id'
    if rand:
        order = 'order by rand()'
    query = f'select original_link from {queue_table} {order} limit {n}'
    try:
        # Reading rows
        df = pd.read_sql(query, con)
        # Backup and delete rows
        if delete:
            temp_table = temp_name(engine)
            df.to_sql(temp_table, con=con, if_exists='append', index=False)
            insert_query = (f'INSERT IGNORE INTO {processed_table} (original_link) '
                            f'SELECT original_link FROM {temp_table}')
            engine.execute(insert_query)
            # Note: rows are deleted in table order, which only matches the
            # selected chunk when rand is False.
            engine.execute(f'delete from {queue_table} limit {n}')
            engine.execute(f'drop table if exists {temp_table}')
    except Exception as exc:
        tprint('[-] Error in get_chunk_from_db()', exc)
        df = None
    return df

def process_Df(page):
    cookies = read_cookies()
    page = requests.get(page.url, cookies=cookies['df'])
    if '¡Página no encontrada!' in page.text:
        try:
            tprint('[·] df.cl page not found. Searching for title...', important=False)
            title_ = page.url.split('/')[3].replace('-', '+')
            search_ = (f'https://www.df.cl/cgi-bin/prontus_search.cgi?search_texto="{title_}"'
                       '&search_prontus=noticias&search_tmp=search.html&search_idx=ALL'
                       '&search_modo=and&search_form=yes')
            page = requests.get(search_)
            soup = bs(page.content, 'lxml')
            box = soup.find('div', {'id': 'wrap-noticias'})
            new_url = 'https://www.df.cl' + box.find('article').h2.a['href']
            tprint('[+] df.cl page found!', important=False)
            page = requests.get(new_url, cookies=cookies['df'])
        except Exception as exc:
            tprint('[-] df.cl page not found', important=False)
    d = process_inner(page)
    soup = bs(page.content, 'lxml')
    try:
        d['section'] = soup.find('meta', {'name': 'keywords'})['content'].strip()
    except Exception as exc:
        tprint('[-] Error parsing section (Df) - ', exc, important=False)
    try:
        d['body'] = '\n'.join([
            p for p in d['body'].split('\n')
            if len(p.split()) > 4 and p != d['description']
        ])
    except Exception as exc:
        tprint('[-] Error parsing body (Df) - ', exc, important=False)
    return d

def process_link(link):
    link = get_direct_link(link)
    if link != '':
        try:
            d = process_outer(link)
        except requests.exceptions.ConnectionError:
            error = '[-] Connection Error ' + link
            tprint(error, important=False)
            d = {'error': 1, 'info': 'ConnectionError'}
        except Exception as exc:
            error = '[-] General Error ' + link + ' :' + str(exc)
            error = error[:275]
            tprint(error, important=False)
            d = {'error': 1, 'info': error}
    else:
        # Mark for deletion if the tweet does not contain any links.
        error = '[-] Empty link in process_link'
        tprint(error, important=False)
        d = {'error': 1, 'borrar': 1, 'info': error}
    return d

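def example_process_link():
    # Usage sketch (illustrative only): process a single URL outside the worker
    # pool. On failure the returned dict carries `error`/`info` flags instead of
    # article fields; the URL below is a made-up placeholder.
    d = process_link('https://www.latercera.com/noticia/ejemplo/')
    print(d.get('title'), d.get('error'), d.get('info'))
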
def update(self):
    # TODO: fix (borrar, info, missing columns).
    try:
        self.create()
        self.insert_table()
        try:
            self.error.to_sql(self.error_table, con=self.con, if_exists='append', index=False)
        except Exception as exc:
            tprint('[-] Error updating error TempTable - ', exc)
    except DatabaseError as db_error:
        error_msg = db_error._message()
        if 'Incorrect string value' in error_msg:
            # A badly encoded row broke the insert: flag it and retry.
            self.destroy()
            bad_row = int(error_msg.split()[-1]) - 1
            poison = error_msg.split("value: '")[1].split("...' for")[0]
            tprint(f'[-] Encoding Error on row {bad_row} ({poison}). Retrying...')
            i = self.press.reset_index()['index'][bad_row]
            self.df['error'][i] = 1
            self.df['info'][i] = 'Encoding Error'
            self.divide_df()
            self.update()
        else:
            raise Exception('Unknown DatabaseError')
    except Exception as exc:
        error = f'[-] Error updating {self.result_table} table TempTable - ' + str(exc)
        error = error[:275]
        tprint(error)
        try:
            save = self.df
            save['info'] = save['info'].fillna(error[:255])
            save[['original_link', 'borrar', 'info']].to_sql(self.error_table, con=self.con, if_exists='append', index=False)
        except Exception as exc:
            tprint('[-] Error trying to save extracted rows TempTable - ', exc)
        self.destroy()

def process_Elmostrador(page):
    d = process_inner(page)
    soup = bs(page.content, 'lxml')
    d['description'] = None
    try:
        d['description'] = soup.find('figcaption').text
    except Exception as exc:
        tprint('[-] Error parsing description (Elmostrador) - ', exc, important=False)
    try:
        d['authors'] = soup.find('p', {'class': 'autor-y-fecha'}).find('a').text
    except Exception as exc:
        tprint('[-] Error parsing authors (Elmostrador) - ', exc, important=False)
    try:
        if d['image'] and 'www.elmostrador.cl' not in d['image']:
            d['image'] = 'https://www.elmostrador.cl' + d['image']
    except Exception as exc:
        tprint('[-] Error fixing image (Elmostrador) - ', exc, important=False)
    if not d['date']:
        try:
            date = [s for s in d['link'].split('/') if s.isdigit()][:3]
            d['date'] = datetime(*map(int, date))
        except Exception as exc:
            tprint('[-] Error parsing date (Elmostrador) - ', exc, important=False)
    try:
        d['section'] = ' '.join([x for x in soup.find_all('h2') if x.find('i') is not None][0].text.split())
    except Exception as exc:
        tprint('[-] Error parsing section (Elmostrador) - ', exc, important=False)
    try:
        d['body'] = d['body'].split('__________________')[0]
    except Exception as exc:
        tprint('[-] Error fixing body (Elmostrador) - ', exc, important=False)
    return d

def process_Cooperativa(page):
    d = process_inner(page)
    try:
        if 'al aire libre' in d['title'].lower():
            # Radio show pages are only marked for deletion.
            return {'borrar': 1, 'info': 'Borrar, Al aire libre'}
    except Exception:
        pass
    soup = bs(page.content, 'lxml')
    try:
        d['authors'] = soup.find('div', {'class': 'fecha-publicacion'}).find('span').text
    except Exception as exc:
        tprint('[-] Error parsing authors (Cooperativa) - ', exc, important=False)
    try:
        d['section'] = soup.find('a', {'id': 'linkactivo'}).text
    except Exception as exc:
        tprint('[-] Error parsing section (Cooperativa) - ', exc, important=False)
    try:
        d['tags'] = soup.find('meta', {'name': 'keywords'})['content'].strip()
    except Exception as exc:
        tprint('[-] Error parsing tags (Cooperativa) - ', exc, important=False)
    try:
        d['link'] = soup.find('meta', property='og:url')['content']
    except Exception as exc:
        tprint('[-] Error parsing link (Cooperativa) - ', exc, important=False)
    if not d['date']:
        try:
            date = [x for x in d['link'].split('/') if '-' in x][-1].split('-')
            d['date'] = datetime(*map(int, date))
        except Exception as exc:
            tprint('[-] Error parsing date (Cooperativa) - ', exc, important=False)
    try:
        if d['image'] and 'www.cooperativa.cl' not in d['image']:
            d['image'] = 'https://www.cooperativa.cl' + d['image']
    except Exception as exc:
        tprint('[-] Error fixing image (Cooperativa) - ', exc, important=False)
    return d

def process_Latercera(page):
    d = {}
    if 'Lo sentimos, estamos actualizando el sitio' not in page.text:
        d = process_inner(page)
    else:
        # Search for the article on Google if necessary.
        scraped_link = page.url.strip('/')
        tprint('[-] Latercera link not found', page.url, important=False)
        new_link = 'https://www.latercera.com/noticia/' + '-'.join([
            p for p in scraped_link.split('/')[-1].split('.')[0].split('-')
            if not p.isdigit()
        ])
        page = requests.get(new_link)
        if 'Lo sentimos, estamos actualizando el sitio' not in page.text:
            d = process_inner(page)
            tprint('[+] Latercera link found (attempt 1): ', new_link, important=False)
        else:
            try:
                tprint('[·] Google Searching...', important=False)
                buscar = ' '.join([
                    p for p in scraped_link.split('/')[-1].split('.')[0].split('-')
                    if not p.isdigit()
                ]) + ' site:latercera.com'
                results = search(buscar, stop=5)
                rs = []
                for r in results:
                    rs.append(r)
                result = [r for r in rs if 'sitemap' not in r][0]
                if 'sitemap' not in result:
                    tprint('[+] Google result (attempt 2):', result, important=False)
                    page = requests.get(result)
                    d = process_inner(page)
                else:
                    d['error'] = 1
                    d['info'] = 'Latercera link not found on Google'
            except Exception as exc:
                tprint('[-] Latercera link not found', important=False)
                d['error'] = 1
                d['info'] = 'Latercera link not found on Google'
    soup = bs(page.content, 'lxml')
    # Recover image.
    try:
        d['image'] = soup.find('figure').find('img')['src']
    except Exception as exc:
        tprint('[-] Error parsing image (Latercera) - ', exc, important=False)
    # Recover author.
    try:
        d['authors'] = [h.text for h in soup.find_all('h4') if 'Autor' in h.text][0].replace('Autor: ', '')
    except Exception as exc:
        tprint('[-] Error parsing authors (Latercera) - ', exc, important=False)
    try:
        if d['description'] is None:
            d['description'] = soup.find('div', {'class': 'bajada-art'}).text
    except Exception as exc:
        tprint('[-] Error parsing description (Latercera) - ', exc, important=False)
    try:
        if d['date'] is None:
            date = ' '.join(soup.find('span', {'class': 'time-ago'}).text.replace('|', '').split())
            d['date'] = datetime.strptime(date, '%d/%m/%Y %I:%M %p')
    except Exception as exc:
        tprint('[-] Error parsing date (Latercera) - ', exc, important=False)
    try:
        d['section'] = soup.find('meta', property='article:section')['content']
    except Exception:
        try:
            d['section'] = [
                x.find('a').text for x in soup.find_all('h4')
                if x.find('a') is not None and 'canal' in x.find('a')['href']
            ][0]
        except Exception as exc:
            tprint('[-] Error parsing section (Latercera) - ', exc, important=False)
    d['tags'] = ', '.join([x['content'] for x in soup.find_all('meta', property='article:tag')])
    if not d['tags']:
        try:
            d['tags'] = ', '.join([x.text for x in soup.find('div', {'class': 'tags-interior'}).find_all('a')])
        except Exception as exc:
            tprint('[-] Error parsing tags (Latercera) - ', exc, important=False)
    return d

def delete_error_where(engine=None, con=None, where=''):
    processed_table = config['TABLES']['PROCESSED']
    error_table = config['TABLES']['ERROR']
    close = False
    close_ = False
    if engine is None:
        engine = mysql_engine()
        con = engine.connect()
        close = True
    if con is None:
        con = engine.connect()
        close_ = True

    # Where clause
    if where == '':
        where = input('Where clause for mysql query:\n\t- ')
        print()

    # Count and confirm
    tprint('[·] Counting links...')
    count = engine.execute(f'select count(*) from {error_table} where {where}').scalar()
    y = input(f'\nAre you sure you want to remove {count} links? (y/n): ')
    print()
    if y == 'y':
        # Get links to be removed
        tprint('[·] Getting Links...')
        to_be_removed = mysql_query_as_set(f'select original_link from {error_table} where {where};', con=con)

        # Filtering Processed
        tprint('[·] Filtering processed table...')
        processed = mysql_query_as_set(f'select original_link from {processed_table};', con=con)
        processed = processed - to_be_removed

        # Reinserting into processed
        tprint('[·] Reinserting into processed table...')
        temp = f'{processed_table}_backup_' + datetime.now().strftime('%d_%m')
        engine.execute(f'create table {temp} like {processed_table}')
        insert_set(processed, temp, 'original_link', engine=engine, con=con)
        engine.execute(f'drop table {processed_table}')
        engine.execute(f'rename table {temp} to {processed_table}')

        # Delete from error
        tprint('[·] Deleting from processed and error table...')
        engine.execute(f'delete from {error_table} where {where}')

        # Done.
        count_error = engine.execute(f'select count(*) from {error_table}').scalar()
        tprint(f'[+] Done! {count_error} links left in {error_table} table')

    if close:
        con.close()
        engine.dispose()
    if close_:
        con.close()

def work(result_table=None, df=None, debug=True, n_pools=15, n=150, queue_table=None,
         processed_table=None, error_table=None, delete=False, engine=None, con=None, rand=False):
    if df is None:
        df = pd.DataFrame()
    if result_table is None:
        result_table = config['TABLES']['RESULT']
    if queue_table is None:
        queue_table = config['TABLES']['QUEUE']
    if processed_table is None:
        processed_table = config['TABLES']['PROCESSED']
    if error_table is None:
        error_table = config['TABLES']['ERROR']
    s = time.time()
    tprint('[·] Downloading and processing data from table...')
    tt = TempTable(result_table=result_table, df=df, debug=debug, n_pools=n_pools, n=n,
                   queue_table=queue_table, processed_table=processed_table,
                   error_table=error_table, delete=delete, engine=engine, con=con, rand=rand)
    if not tt.df.empty:
        t1 = time.time()
        tprint(f'[+] Done ({round(t1 - s, 2)} seconds)')
        tprint('[·] Inserting into main table...')
        tt.update()
        f = time.time()
        tprint(f'[+] Done ({round(f - t1, 2)} seconds)')
        tprint(f'[+] {len(tt.press)}/{len(tt.df)} news scraped in {round(f - s, 2)} seconds. '
               f'({round((f - s) / n, 2)} s/article)')
        status = 'working'
    else:
        # Terminate the job when there are no links left.
        tprint('[+] DONE, updated every article.')
        status = 'done'
    tt.close_mysql()
    return tt.df, status

def program(result_table=None, df=None, debug=True, n_pools=15, n=150, queue_table=None,
            processed_table=None, error_table=None, delete=False, engine=None, con=None, rand=False):
    if df is None:
        df = pd.DataFrame()
    if result_table is None:
        result_table = config['TABLES']['RESULT']
    if queue_table is None:
        queue_table = config['TABLES']['QUEUE']
    if processed_table is None:
        processed_table = config['TABLES']['PROCESSED']
    if error_table is None:
        error_table = config['TABLES']['ERROR']

    # Initializing...
    if engine is None:
        engine = mysql_engine()
        con = engine.connect()
    if con is None:
        con = engine.connect()
    print()
    tprint('[·] Initializing...')
    status = 'working'
    init_mysql_db(engine=engine)
    recover_discarded(con=con)
    queue = len_tables('queue')['queue']
    tprint('[+] Done.')
    print()
    con.close()
    engine.dispose()

    i = 1
    while queue != 0:
        engine = mysql_engine()
        con = engine.connect()
        try:
            result, status = work(result_table=result_table, df=df, debug=debug, n_pools=n_pools,
                                  n=n, queue_table=queue_table, processed_table=processed_table,
                                  error_table=error_table, delete=delete, engine=engine, con=con,
                                  rand=rand)
            if status == 'done':
                tprint('[+] DONE!')
                input('\n(ENTER)')
                sys.exit()
        except KeyboardInterrupt:
            sys.exit()
        except Exception as exc:
            tprint('[-] General Error - ', exc)
            print()
        con.close()
        engine.dispose()
        # Refresh the queue count every 100 iterations.
        if i % 100 == 0:
            queue = len_tables('queue')['queue']
            tprint('[+] {} left in queue.'.format(queue))
            print()
        i += 1
    tprint('[+] DONE!')

def scrape_tweets(user, days=0, months=0, years=0, monthly=False, yearly=False, since='', until=''):
    path = f"{os.environ['HOME']}/presscontrol/twitter_tempfiles"
    if not os.path.exists(path):
        os.makedirs(path)
    tweets = {}
    counter = 0
    if until == '':
        until = datetime.today() + timedelta(1)
    else:
        until = datetime.strptime(until, '%Y-%m-%d')
    if since == '':
        d = int(days + months * 31 + years * 365)
        since = until - timedelta(d)
    else:
        since = datetime.strptime(since, '%Y-%m-%d')
    since_ = since
    while since_ < until:
        if since_.year not in tweets.keys():
            tweets[since_.year] = {}
        until_ = next_date(since_) if next_date(since_) < until else until
        if since_.day == until_.day:
            pr = f'{calendar.month_name[since_.month]} {since_.year}'
        else:
            pr = f'{dt2str(since_)} to {dt2str(until_)}'
        filename = f"{path}/{user} {pr}.pkl"
        tprint(f'[·] Getting tweets from {pr}')
        if os.path.exists(filename):
            # Reuse previously downloaded tweets when available.
            tprint('[+] Found in twitter tempfiles')
            tweets_ = pd.read_pickle(filename)
        else:
            try:
                tweetCriteria = got.manager.TweetCriteria().setUsername(user).setSince(dt2str(since_)).setUntil(dt2str(until_))
                tweets_ = got.manager.TweetManager.getTweets(tweetCriteria)
                if len(tweets_) > 0:
                    with open(filename, 'wb') as f:
                        pickle.dump(tweets_, f)
            except Exception as exc:
                tweets_ = []
                print('\nError\n', exc)
        tprint(f'[+] Done ({len(tweets_)} tweets).')
        counter += len(tweets_)
        tweets[since_.year][since_.month] = tweets_
        since_ = until_
    print()
    tprint(f'[+] DONE ({counter} tweets)')
    return tweets

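def example_scrape_and_queue():
    # Usage sketch (illustrative only): download a month of tweets for one
    # account and queue their links. Assumes GetOldTweets3 tweet objects expose
    # a `permalink` attribute; the real pipeline may instead extract the linked
    # article URLs from each tweet before calling update_db.
    tweets = scrape_tweets('latercera', months=1)
    links = [t.permalink for year in tweets.values() for month in year.values() for t in month]
    update_db(pd.DataFrame({'original_link': links}), shuffle=False)
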