Example #1
# The top of this file is truncated in the excerpt; the imports and the
# downloader head below are a hedged reconstruction, and a sketch of
# uri_creator (defined elsewhere in the project) follows main() below.
import random
import time
from concurrent.futures import ThreadPoolExecutor as TPE, as_completed

import Log.log as log


def downloader(link):
    # Assumption: the elided body fetches `link` and saves the attachment;
    # `filename` is taken to be the attachment ID at the end of the URL.
    filename = link.rsplit('=', 1)[-1]
    download_logger.info(f'Attachment ID {filename} Downloaded.')
    time.sleep(random.randint(1, 8))
    return filename


def main(url_list):
    with TPE(max_workers=5) as executor:
        futures = [executor.submit(downloader, link) for link in url_list]
        for future in as_completed(futures):
            download_logger.info(
                f'Thread Closed for Attachment ID {future.result()}')
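

# A hedged sketch (not in the original excerpt) of uri_creator, inferred from
# the call below: it builds one download link per attachment ID.
def uri_creator(uri, id_start, id_end, id_list):
    return [f'{uri}{attachment_id}' for attachment_id in id_list]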


if __name__ == '__main__':
    download_logger = log.get_logger(__name__)

    attachment_id_start = 2708
    attachment_id_end = 3748

    attachment_id_list = list(range(attachment_id_start, attachment_id_end))

    uri = 'http://www.stockmarketpilipinas.com/attachment.php?aid='

    download_links = uri_creator(uri, attachment_id_start, attachment_id_end,
                                 attachment_id_list)

    main(download_links)

    download_logger.info(
        f'Downloaded {attachment_id_end - attachment_id_start} Attachments.')
Example #2
import csv
import os
import re
import Log.log as log

### Validation steps (a hedged sketch of these follows the constants below):
#   1. Load row
#   2. If pattern is not found:     log to file
#   3. If pattern is found:         append filename to pre_processed_list
#   4. Return pre_processed_list

# Initialize Logger
file_cleaner_log = log.get_logger(__name__)

# Present Data Dump Path
data_path = os.path.abspath(
    os.path.join(os.path.dirname(__file__), '..', 'Downloader/dump/'))

# Deletion Process for files that DIDN'T PASS the utf-8 encoding check
invalid_file_log = os.path.join(os.path.dirname(__file__), 'log', 'invalid.txt')
# files_for_deletion = get_invalid_file()
# delete_invalid_file(files_for_deletion)

valid_file_log = os.path.join(os.path.dirname(__file__), 'log', 'valid.txt')
string_pattern = "\A\^FINANCIAL"
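

# A hedged sketch (not in the original excerpt) of the validation steps listed
# above; the exact csv handling in preproc_valid_files is an assumption based
# on how the pre_processor module calls it.
def preproc_valid_files(valid_files_list, file_count):
    pre_processed_list = []
    for filename in valid_files_list:
        # 1. Load the first row of the file
        with open(os.path.join(data_path, filename), newline='') as csv_file:
            first_row = next(csv.reader(csv_file), [])
        if first_row and re.match(string_pattern, first_row[0]):
            # 3. Pattern found: keep the file
            pre_processed_list.append(filename)
        else:
            # 2. Pattern not found: log it
            file_cleaner_log.info(f'{filename} does not start with ^FINANCIAL')
    file_cleaner_log.info(
        f'Pre-processed {len(pre_processed_list)} of {file_count} files.')
    # 4. Return pre_processed_list
    return pre_processed_list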


def delete_invalid_file(invalid_files_list):
    """
    Iterates through invalid_files_list to remove invalid files from data_path.
    If a file is not found, log the exception and continue iterating.
    """
    # The original body is truncated in this excerpt; a minimal reconstruction
    # of the behavior the docstring describes:
    for filename in invalid_files_list:
        try:
            os.remove(os.path.join(data_path, filename))
            file_cleaner_log.info(f'Deleted {filename}.')
        except FileNotFoundError as error:
            file_cleaner_log.info(f'{filename} not found: {error}')
Example #3
from Log import log
from Parser import utf8_validator, file_cleaner, file_parser
# This module calls multiple modules from the Parser directory.

#   Initialize Logger
pre_processor_log = log.get_logger(__name__)

#   Get Valid and Invalid file lists
valid_file_list, invalid_file_list = utf8_validator.csv_validator()

#   Pass invalid file list to deletion function in file_cleaner module
file_cleaner.delete_invalid_file(invalid_file_list)

#   Pass valid file list to the pre-processing function in the file_cleaner module
pre_processed_list = file_cleaner.preproc_valid_files(valid_file_list,
                                                      len(valid_file_list))

#   Delete files from dump that don't start with ^FINANCIAL
for_removal = list(set(valid_file_list).difference(set(pre_processed_list)))
file_cleaner.delete_invalid_file(for_removal)

#   Sort and write rows to appropriate table in DB
file_parser.data_sorter(pre_processed_list)
Example #4
#!/usr/bin/env python3.7

import re
import Log.log as log
from datetime import datetime as dt

from sqlalchemy import create_engine, Column, Integer, String, Date
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

from Extraction import db_config

db_logger = log.get_logger(__name__)

date_now = dt.now().date()

Base = declarative_base()
engine = create_engine(
    f'postgresql://{db_config.username}:{db_config.password}@{db_config.host}:{db_config.port}/{db_config.database}',
    echo=False)


class Combination(Base):
    __tablename__ = 'lotto'

    id = Column('id', Integer, primary_key=True)
    datedrawn = Column('datedrawn', Date)
    game = Column('game', String)
    game_result = Column('result', String)
    jackpot = Column('jackpot', Integer)
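

# A hedged sketch (not in the original excerpt) of how a scraped draw could be
# written through this model; the helper name and its signature are
# assumptions, not the project's actual API.
Session = sessionmaker(bind=engine)


def commit_draw(datedrawn, game, result, jackpot):
    session = Session()
    try:
        session.add(Combination(datedrawn=datedrawn, game=game,
                                game_result=result, jackpot=jackpot))
        session.commit()
        db_logger.info(f'Committed {game} draw for {datedrawn}.')
    except SQLAlchemyError as error:
        session.rollback()
        db_logger.info(f'Commit failed for {game} on {datedrawn}: {error}')
    finally:
        session.close()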
Example #5
                if regexer(row) == 'sector':
                    #   Perform type conversions
                    # row = to_date(to_centavo(to_integer(row[:-1])))

                    #   Populate models.Sector Instance Variables
                    # db_commit_sector(row)
                    # sector_name = row[0]
                    sector_counter += 1
                    parser_log.info(
                        f'Sector {row[0][1:]} data found on {file}')

                else:
                    #   Perform type conversions
                    row = to_date(to_centavo(to_integer(row[:-1])))
                    #   Populate models.Stock Instance Variables
                    if db_commit_stock(row) == 'duplicate':
                        parser_log.info(
                            f'Data for {row[0]} already in DB. Duplicate on {file}'
                        )
                    else:
                        stock_counter += 1
                        parser_log.info(f'Stock {row[0]} data found on {file}')

            parser_log.info(f'{file} parsed successfully.')
            parser_log.info(
                f'STATISTICS for {file} : {sector_counter} sector data. {stock_counter} stock data.'
            )


parser_log = log.get_logger(__name__)
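

# A hedged sketch (not in the original excerpt) of the regexer helper used in
# the loop above; the caret rule is an assumption based on sector symbols such
# as ^FINANCIAL.
def regexer(row):
    # Assumes `import re` at the top of the module (not shown in this excerpt).
    if re.match(r'\A\^', row[0]):
        return 'sector'
    return 'stock'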
Example #6
#!/usr/bin/env python3.7

import requests
from lxml import html
from Extraction import db_writer_postgres
import calendar
import Log.log as log

scraper_logger = log.get_logger(__name__)
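
# The module-level session and request parameters used below are defined
# further down in the original file (not shown in this excerpt); hedged
# placeholders, with the PCSO search URL as an assumption:
web_session = requests.Session()
uri = 'https://www.pcso.gov.ph/SearchLottoResult.aspx'
html_headers = {'User-Agent': 'Mozilla/5.0'}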

def get_page():
    """
    Retrieves the raw HTML response with GET using uri and html_headers,
    then converts the response content to an lxml HtmlElement.
    :return: tuple of (HtmlElement page, requests Response)
    """
    pcso_response = web_session.get(uri, headers=html_headers)
    pcso_page = html.fromstring(pcso_response.content)

    return pcso_page, pcso_response


def extract_state(raw_html):
    """
    locates @id="__VIEWSTATE" and returns the __VIEWSTATE value
    locates @id="__VIEWSTATEGENERATOR" and returns the __VIEWSTATEGENERATOR value
    locates @id="__EVENT..." and returns the __EVENT... value

    ISSUE: Using xpath search, __VIEWSTATE, __VIEWSTATEGENERATOR and __EVENTVALIDATION results
    in a string that is enclosed with ['string']. Method below is to strip the full string using
    slice[2:-2] to strip the first and last 2 elements of the string. LOOK FOR A BETTER METHOD...
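    # A hedged sketch of the body: instead of str() + slice[2:-2], index the
    # first element of each xpath result list.
    view_state = raw_html.xpath('//input[@id="__VIEWSTATE"]/@value')[0]
    view_state_generator = raw_html.xpath(
        '//input[@id="__VIEWSTATEGENERATOR"]/@value')[0]
    event_validation = raw_html.xpath(
        '//input[@id="__EVENTVALIDATION"]/@value')[0]
    return view_state, view_state_generator, event_validation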