# NOTE(review): this chunk begins mid-way through a function body — presumably
# `downloader`, judging by the executor.submit(downloader, link) call below.
# Its `def` line is not visible here, so the leading indentation of the first
# three statements is reconstructed; confirm against the full file.

    download_logger.info(f'Attachment ID (unknown) Downloaded.')
    # Randomized delay between downloads — presumably to avoid hammering the
    # server; TODO confirm intent.
    time.sleep(random.randint(1, 8))
    return filename


def main(url_list):
    # Download every link concurrently with at most 5 worker threads.
    with TPE(max_workers=5) as executor:
        futures = [executor.submit(downloader, link) for link in url_list]
        # Log each download as its future completes (completion order,
        # not submission order).
        for future in as_completed(futures):
            download_logger.info(
                f'Thread Closed for Attachment ID {future.result()}')


if __name__ == '__main__':
    download_logger = log.get_logger(__name__)
    # Attachment-id range to fetch: start inclusive, end exclusive
    # (range() semantics below).
    attachment_id_start = 2708
    attachment_id_end = 3748
    attachment_id_list = list(range(attachment_id_start, attachment_id_end))
    uri = 'http://www.stockmarketpilipinas.com/attachment.php?aid='
    # uri_creator is defined elsewhere in this file (not visible in this chunk).
    download_links = uri_creator(uri, attachment_id_start, attachment_id_end,
                                 attachment_id_list)
    main(download_links)
    download_logger.info(
        f'Downloaded {attachment_id_end - attachment_id_start} Attachments.')
import csv
import os
import re

import Log.log as log

### Validation steps:
# 1. Load row
# 2. if pattern is not found: log to file
# 3. if pattern is found: append filename to pre_processed_list
# 4. return pre_processed_list

# Initialize Logger
file_cleaner_log = log.get_logger(__name__)

# Present Data Dump Path — resolved relative to this module's directory.
data_path = os.path.abspath(
    os.path.join(os.path.dirname(__file__), '..', 'Downloader/dump/'))

# Deletion Process for files that DIDN'T PASS the utf-8 encoding check
invalid_file_log = os.path.dirname(__file__) + '/log/invalid.txt'
# files_for_deletion = get_invalid_file()
# delete_invalid_file(files_for_deletion)
valid_file_log = os.path.dirname(__file__) + '/log/valid.txt'

# NOTE(review): should be a raw string literal (r"\A\^FINANCIAL") — "\A" is an
# invalid escape sequence in a plain string and raises a DeprecationWarning on
# modern Python; the regex itself anchors on a literal "^FINANCIAL" prefix.
string_pattern = "\A\^FINANCIAL"


def delete_invalid_file(invalid_files_list):
    """
    Iterates through invalid_files_list to remove invalid files from data_path
    If a file is not found, log the exception and continue iteration
from Log import log
from Parser import utf8_validator, file_cleaner, file_parser

# Pre-processing pipeline for the data dump. This module calls multiple
# modules from the Parser directory: validate encodings, delete rejects,
# filter by content pattern, then hand the survivors to the parser.

# Initialize Logger
pre_processor_log = log.get_logger(__name__)

# Split the dump into UTF-8-decodable files and the rest.
valid_file_list, invalid_file_list = utf8_validator.csv_validator()

# Files that failed the encoding check are removed from the dump.
file_cleaner.delete_invalid_file(invalid_file_list)

# Run the content pre-processing pass over the files that decoded cleanly.
pre_processed_list = file_cleaner.preproc_valid_files(valid_file_list,
                                                      len(valid_file_list))

# Valid-but-unmatched files (those that don't start with ^FINANCIAL) are
# the set difference between the two lists — delete them as well.
for_removal = list(set(valid_file_list) - set(pre_processed_list))
file_cleaner.delete_invalid_file(for_removal)

# Sort and write the surviving rows to the appropriate table in the DB.
file_parser.data_sorter(pre_processed_list)
#!/usr/bin/env python3.7 import re import Log.log as log from datetime import datetime as dt from sqlalchemy import create_engine, Column, Integer, String, Date from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import sessionmaker from Extraction import db_config db_logger = log.get_logger(__name__) date_now = dt.now().date() Base = declarative_base() engine = create_engine( f'postgresql://{db_config.username}:{db_config.password}@{db_config.host}:{db_config.port}/{db_config.database}', echo=False) class Combination(Base): __tablename__ = 'lotto' id = Column('id', Integer, primary_key=True) datedrawn = Column('datedrawn', Date) game = Column('game', String) game_result = Column('result', String) jackpot = Column('jackpot', Integer)
# NOTE(review): this chunk begins deep inside an unseen function — inside a
# per-row loop nested in a per-file loop, judging by the `row`/`file` names and
# the trailing per-file summary lines. All indentation below is reconstructed
# and must be checked against the full file.

            if regexer(row) == 'sector':
                # Sector rows are currently log-only: the type-conversion and
                # DB-commit steps are deliberately left commented out below.
                # Perform type conversions
                # row = to_date(to_centavo(to_integer(row[:-1])))
                # Populate models.Sector Instance Variables
                # db_commit_sector(row)
                # sector_name = row[0]
                #sector_counter += 1
                parser_log.info(
                    f'Sector {row[0][1:]} data found on {file}')
            else:
                # Perform type conversions
                row = to_date(to_centavo(to_integer(row[:-1])))
                # Populate models.Stock Instance Variables
                if db_commit_stock(row) == 'duplicate':
                    # Row already exists in the DB — log the duplicate and
                    # do not count it.
                    parser_log.info(
                        f'Data for {row[0]} already in DB. Duplicate on {file}'
                    )
                else:
                    stock_counter += 1
                    parser_log.info(f'Stock {row[0]} data found on {file}')

        # Per-file summary once every row has been processed.
        parser_log.info(f'{file} parsed successfully.')
        parser_log.info(
            f'STATISTICS for {file} : {sector_counter} sector data. {stock_counter} stock data.'
        )


# Module-level logger for this parser.
parser_log = log.get_logger(__name__)
#!/usr/bin/env python3.7 import requests from lxml import html from Extraction import db_writer_postgres import calendar import Log.log as log scraper_logger = log.get_logger(__name__) def get_page(): """ retrieves raw html response using GET on uri and html_headers converts response string to an htmlElement object :return: """ pcso_response = web_session.get(uri, headers=html_headers) pcso_page = html.fromstring(pcso_response.content) return pcso_page, pcso_response def extract_state(raw_html): """ locates @id="__VIEWSTATE" and returns the __VIEWSTATE value locates @id="__VIEWSTATEGENERATOR" and returns the __VIEWSTATEGENERATOR value locates @id="__EVENT..." and returns the __EVENT... value ISSUE: Using xpath search, __VIEWSTATE, __VIEWSTATEGENERATOR and __EVENTVALIDATION results in a string that is enclosed with ['string']. Method below is to strip the full string using slice[2:-2] to strip the first and last 2 elements of the string. LOOK FOR A BETTER METHOD...