Example #1
def main():
    Cust_Count = len(CCs)
    proc = 16
    iterator = xrange(proc)
    remainder = Cust_Count % proc
    ccount = Cust_Count / proc
    lenI = len(iterator)
    chk1Time = pyTimer.startTimer()
    func = partial(createTransData, ccount, remainder, lenI)
    pool = Pool(processes=proc)
    results = pool.map(func, iterator)
    # Flatten the 16 per-process result lists into one master list
    liMaster = [row for result in results for row in result]
    endLoopTime = pyTimer.startTimer()
    avgLoopTime = round((endLoopTime - chk1Time) / Cust_Count, 2)
    avgLoopTime = "{0:.2f}".format(avgLoopTime)
    pyTimer.writeRuntimeLog(
        "The average time to create 1 customer's transactions is: " +
        str(avgLoopTime) + ' seconds\n')
    #Open CSV file for writing
    chk2Time = pyTimer.startTimer()
    ##    lines=sc.parallelize(liMaster)
    ##    lines.saveAsTextFile("Transactions")
    with open('cc_trans.csv', 'w') as f1:
        writer = csv.writer(
            f1,
            delimiter='|',
            lineterminator='\n',
        )
        #File header
        writer.writerow([
            'ROWNUM', 'ACCOUNTID', 'MERCHANT_NAME', 'MERCHANT_CATEGORY_CODE',
            'MERCHANT_CATEGORY_DESC', 'MERCHANT_COUNTRY', 'POST_DATE',
            'TRANSACTION_DATE', 'TRANSACTION_TYPE', 'CREDIT_DEBIT',
            'CREDIT_LIMIT', 'AMOUNT', 'BALANCE', 'CREDITCARDNUMBER',
            'CC_TYPE', 'USE_CASE', 'CUST_NAME', 'NUM_CCS', 'CUST_CITY',
            'CUST_STATE', 'CUST_ZIP', 'CUST_COUNTRY', 'TRANS_DETAIL'
        ])
        for row in liMaster:
            writer.writerow(row)
    endCSVTime = pyTimer.startTimer()
    endCSVTime = round(endCSVTime - chk2Time, 2)
    endCSVTime = "{0:.2f}".format(endCSVTime)
    pyTimer.writeRuntimeLog("It took: " + str(endCSVTime) +
                            ' seconds to write to file\n')
    pyTimer.endTimer(startTime, str(Cust_Count) + ' Transactions creation')
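# createTransData is defined elsewhere in this project and not shown here.
# A minimal sketch of the partitioning contract the partial/pool.map calls
# above imply: worker i builds ccount rows, and the last worker
# (i == lenI - 1) also picks up the remainder rows. This is a guess at the
# real implementation, not the original code.
def createTransData(ccount, remainder, lenI, i):
    count = ccount + remainder if i == lenI - 1 else ccount
    rows = []
    for n in xrange(count):
        rownum = i * ccount + n  # non-overlapping row numbers across workers
        rows.append([rownum])    # placeholder; the real rows carry 23 columns
    return rows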
Example #2
        nameLi.extend(extractNames(htmlLi))
        writeToLog("Removing Duplicates\n")
        nameLi = removeDuplicates(nameLi)
        writeToLog("Creating CSV\n")
        createCSV(nameLi, scrapeFile)

##*********************END MAIN FUNCTION*********************##

##*********************END FUNCTIONS*********************##

##*********************PROGRAM*********************##
##  This if statement makes the program standalone
##  It is not needed if another program imports and calls the functions above
if __name__ == "__main__":
##  Create start time
    startTime = pyTimer.startTimer()
##  Try to download NLTK packages
    try:
        punktDL = nltk.download('punkt')
        aptDL = nltk.download('averaged_perceptron_tagger')
    except Exception:
        writeToLog('NLTK punkt and averaged_perceptron_tagger need to be installed\n')
    currDate = datetime.now()
    fileDate = currDate.strftime('%m%d%Y')
    writeToLog('*****************************' + fileDate + '*****************************\n')
    fileName = '/var/www/html/' + fileDate + '_CRA_Scrape.csv'
    mainURL = 'http://www.cra-arc.gc.ca/convictions/'
    mainXPath = '//*[@class="module-menu-section span-3"]'
    linkXPath = '//*[@class="col-md-9 col-md-push-3"]'
    paraXPath = '//p'
## If the NLTK packages are downloaded, run the main program
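##  nltk.download() returns True on success, so the guarded call described by
##  the comment above would plausibly look like this (a sketch; the original
##  call and its arguments are cut off in this excerpt)
    if punktDL and aptDL:
        main()  # hypothetical entry point; the real call is not shown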
Example #3
def main(mainURLList):
    currDate = datetime.now()
##  Make currDate Yesterday's date
    currDate = currDate - timedelta(days=1)
    fileDate = currDate.strftime('%m%d%Y')
    currDate = currDate.strftime('%Y-%m-%d')
    writeToLog("*************************** " + currDate + " ***************************\n")
##  Open a file and overwrite the existing file or create a new file if needed
    fileName = '/var/www/html/' + fileDate + '_ScreenScrape.csv'
    with open(fileName,'w') as scrapeFile:
        writer = csv.writer(scrapeFile, delimiter=',', quoting=csv.QUOTE_NONE, escapechar=' ')
##  Add a header row
        writer.writerow(["PhoneNumber","Email_Address","Website","BackPage_Link"])
        try:
##  Loop through all urls in the mainURLList
            for mainURL in mainURLList:
                liData = []
                writeToLog("\nMain scrape of: " + mainURL + "\n")
                startT = pyTimer.startTimer()
                startPage = 0
                endPage = 0
                increment = 1
##  Increment through 999 possible pages
                while increment < 1000:
##  If increment > 1 then add the page string to the URL
##  Http request the mainURL
                    if increment == 1:
                        mainRequest = requests.get(mainURL + "adult/")
                    else:
                        mainRequest = requests.get(mainURL + "adult/?page=" + str(increment))
##  Translate the request content to HTML
                    mainContent = html.fromstring(mainRequest.content)
##  Use xpath to only grab HTML tags with the CSS class "date"
                    date = mainContent.xpath('//*[@class="date"]')
                    dateStr = ''
##  Loop through dates on the page to make sure that the current date is on the page
                    for dateStr in date:
                        dateStr = tostring(dateStr)
                        dateStr = re.search(r"\w{3}\. \w{3}\. \d{1,2}", dateStr)
                        dateStr = dateStr.group() + " - " + str(datetime.now().year)
                        dateStr = datetime.strptime(dateStr, '%a. %b. %d - %Y').date()
                        dateStr = dateStr.strftime('%Y-%m-%d')
                        if dateStr == currDate:
                            break
##  Compare current date to date on webpage
                    if dateStr == currDate:
                        if startPage == 0:
                            startPage = increment
##  Extend liData to include anything from the main body of the postings
                        liData.extend(scrapeInfo(mainURL, mainContent, '/html/body/div//*[@href]'))
##  Extend liData to include anything from the sponsorBoxContent
                        liData.extend(scrapeInfo(mainURL, mainContent, '//*[@class="sponsorBoxContent"]/a'))
##  If the date on the page is later than currDate (and currDate is set), go to the next page
                    elif currDate < dateStr and currDate != '':
                        increment = increment + 1
                        continue
                    else:
                        endPage = increment
                        writeToLog("Scraped pages: " + str(startPage) + " to " + str(endPage) + "\n")
                        writeToLog("Remove dups from scrape of: " + mainURL + "\n")
                        beforeDedup = len(liData)
##  Call function removeDuplicates
                        liData = removeDuplicates(liData)
                        writeToLog(str(len(liData)) + " records of " + str(beforeDedup) + " left after deduplication\n")
                        break
                    increment = increment + 1
                writeToLog(pyTimer.endTimer(startT) + mainURL + "\n")
                writeToLog("Write to scrape to CSV\n")
##  Call createCSV function to write the list data to the scrapeFile
##  createCSV needs a list and a writer from the open file to run
                writeToCSV(liData, writer)
##  Sleep for 30 seconds and then request a different page to make it seem like a human is doing the surfing
                time.sleep(30)
                requests.get("http://www.google.com")
        except Exception:
            e = traceback.format_exc()
            writeToLog("Unexpected error:" + str(e) + "\n")
Example #4
# History  | mmddyyyy  |  User     |                Changes
#          | 01192016  | Ivana D.  | Credit Card model, code, ref lists, etc.
#          | 01202016  | Jeff K.   | Comments, ref lists, etc.
#          | 01202016  | Justin S. | SSN distinct list
#------------------------------------------------------------------------------
#Reference data is located on the test-bmohb console gs://newccdatav3
from pyspark import SparkConf, SparkContext
from random import randrange, choice, randint
from datetime import datetime
from barnum import gen_data
from faker import Faker
import csv, NAICS, zips, re, geo_data, pyTimer

conf = SparkConf().setAppName("Customers")
sc = SparkContext(conf=conf)

startTime = pyTimer.startTimer()
#####Customer Count wanted for the end file######
cust_count = 10000
fake = Faker()
#List of related-account types
Related_Type = ['Primary', 'Secondary', 'Joint']
#List of party types (person or non-person)
Party_Type = ['Person', 'Non-Person']
#List for whether the party is a BMO customer
Party_Relation = ['Customer', 'Non-Customer']
#Weighted list for random flags (1 Yes : 2 No : 392 blank)
Yes_No_Cust_Flag = ['Yes'] + ['No'] * 2 + [''] * 392
#Weighted closed-account flag (1 Yes : 98 No)
Clsd_flag = ['Yes'] + ['No'] * 98
#Weighted flag for clients whose net worth is over $500K (1 Yes : 30 No)
HighNetWorth = ['Yes'] + ['No'] * 30
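#The repeated values above are a cheap weighted-sampling trick: drawing with
#choice() from a repeated list yields each value in proportion to its count.
#A sketch of the likely usage (the drawing code is elsewhere and not shown):
from random import choice

closed = choice(Clsd_flag)       # 'Yes' about 1 time in 99
high_net = choice(HighNetWorth)  # 'Yes' about 1 time in 31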
Example #5
def main():
    #####Customer Count wanted for the end file######
    cust_count = 50000
    liSSNMaster = []
    liSSNMaster = createSSNs(liSSNMaster, cust_count)
    pyTimer.endTimer(
        startTime, 'Creating ' + str(len(liSSNMaster)) + ' SSNs for customers')
    chk1Time = pyTimer.startTimer()
    cust_list = []
    proc = 16
    iterator = xrange(proc)
    remainder = cust_count % proc
    ccount = cust_count / proc
    lenI = len(iterator)
    func = partial(createCustData, liSSNMaster, ccount, remainder, lenI)
    pool = Pool(processes=proc)
    results = pool.map(func, iterator)
    # Flatten the 16 per-process result lists into one master list
    cust_list = [row for result in results for row in result]
    endLoopTime = pyTimer.startTimer()
    avgLoopTime = round(((endLoopTime - chk1Time) / cust_count), 6)
    avgLoopTime = ("{0:.6f}".format(avgLoopTime))
    pyTimer.writeRuntimeLog('The average time to create a customer is: ' +
                            str(avgLoopTime) + ' seconds\n')
    ##        cust_list.append(['ROWNUM']+['ACCOUNTID']+['ACCT_TYPE']+['NUM_CCS']+['NAME']+['M_NAME']+['SSN']+['AUTHORIZED_NAME2']+['M_NAME2']+['SSN2']+\
    ##        ['AUTHORIZED_NAME3']+['M_NAME3']+['SSN3']+['AUTHORIZED_NAME4']+['M_NAME4']+['SSN4']+['CREDITCARDNUMBER']+['CREDITCARDTYPE']+['EMPLOYER']+['CUSTEMAIL']+\
    ##        ['OCCUPATION']+['CITY']+['STATE']+['ZIP']+['COUNTRY']+['PREVIOUS_CITY']+['PREVIOUS_STATE']+\
    ##        ['PREVIOUS_ZIP']+['PREVIOUS_COUNTRY']+['DOB']+['PEP']+['SAR']+['CLOSEDACCOUNT']+['RELATED_ACCT']+['RELATED_TYPE']+['PARTY_TYPE']+['PARTY_RELATION']+['PARTY_STARTDATE']+['PARTY_ENDDATE']+\
    ##        ['LARGE_CASH_EXEMPT']+['DEMARKET_FLAG']+['DEMARKET_DATE']+['PROB_DEFAULT_RISKR']+['OFFICIAL_LANG_PREF']+['CONSENT_SHARING']+\
    ##        ['PREFERRED_CHANNEL']+['PRIMARY_BRANCH_NO']+['DEPENDANTS_COUNT']+['SEG_MODEL_ID']+['SEG_MODEL_TYPE']+\
    ##        ['SEG_MODEL_NAME']+['SEG_MODEL_GROUP']+['SEG_M_GRP_DESC']+['SEG_MODEL_SCORE']+['ARMS_MANUFACTURER']+['AUCTION']+\
    ##        ['CASHINTENSIVE_BUSINESS']+['CASINO_GAMBLING']+['CHANNEL_ONBOARDING']+['CHANNEL_ONGOING_TRANSACTIONS']+['CLIENT_NET_WORTH']+\
    ##        ['COMPLEX_HI_VEHICLE']+['DEALER_PRECIOUS_METAL']+['DIGITAL_PM_OPERATOR']+['EMBASSY_CONSULATE']+['EXCHANGE_CURRENCY']+\
    ##        ['FOREIGN_FINANCIAL_INSTITUTION']+['FOREIGN_GOVERNMENT']+['FOREIGN_NONBANK_FINANCIAL_INSTITUTION']+['INTERNET_GAMBLING']+\
    ##        ['MEDICAL_MARIJUANA_DISPENSARY']+['MONEY_SERVICE_BUSINESS']+['NAICS_CODE']+['NONREGULATED_FINANCIAL_INSTITUTION']+\
    ##        ['NOT_PROFIT']+['PRIVATELY_ATM_OPERATOR']+['PRODUCTS']+['SALES_USED_VEHICLES']+['SERVICES']+\
    ##        ['SIC_CODE']+['STOCK_MARKET_LISTING']+['THIRD_PARTY_PAYMENT_PROCESSOR']+['TRANSACTING_PROVIDER']+['HIGH_NET_WORTH']+['HIGH_RISK']+['RISK_RATING']+['USE_CASE_SCENARIO'])
    ##        cust_list=createCustData(cust_count)
    ##        lines=sc.parallelize(cust_list)
    ##        lines.saveAsTextFile("Customers")
    #Creates CSV
    with open('uber_custv3.csv', 'w') as f1:
        #Writer for CSV...Pipe delimited...Return for a new line
        writer = csv.writer(
            f1,
            delimiter='|',
            lineterminator='\n',
        )
        #Header Row
        writer.writerow([
            'ROWNUM', 'ACCOUNTID', 'ACCT_TYPE', 'NUM_CCS', 'NAME', 'M_NAME',
            'SSN', 'AUTHORIZED_NAME2', 'M_NAME2', 'SSN2', 'AUTHORIZED_NAME3',
            'M_NAME3', 'SSN3', 'AUTHORIZED_NAME4', 'M_NAME4', 'SSN4',
            'CREDITCARDNUMBER', 'CREDITCARDTYPE', 'EMPLOYER', 'CUSTEMAIL',
            'OCCUPATION', 'CITY', 'STATE', 'ZIP', 'COUNTRY', 'PREVIOUS_CITY',
            'PREVIOUS_STATE', 'PREVIOUS_ZIP', 'PREVIOUS_COUNTRY', 'DOB',
            'PEP', 'SAR', 'CLOSEDACCOUNT', 'RELATED_ACCT', 'RELATED_TYPE',
            'PARTY_TYPE', 'PARTY_RELATION', 'PARTY_STARTDATE',
            'PARTY_ENDDATE', 'LARGE_CASH_EXEMPT', 'DEMARKET_FLAG',
            'DEMARKET_DATE', 'PROB_DEFAULT_RISKR', 'OFFICIAL_LANG_PREF',
            'CONSENT_SHARING', 'PREFERRED_CHANNEL', 'PRIMARY_BRANCH_NO',
            'DEPENDANTS_COUNT', 'SEG_MODEL_ID', 'SEG_MODEL_TYPE',
            'SEG_MODEL_NAME', 'SEG_MODEL_GROUP', 'SEG_M_GRP_DESC',
            'SEG_MODEL_SCORE', 'ARMS_MANUFACTURER', 'AUCTION',
            'CASHINTENSIVE_BUSINESS', 'CASINO_GAMBLING',
            'CHANNEL_ONBOARDING', 'CHANNEL_ONGOING_TRANSACTIONS',
            'CLIENT_NET_WORTH', 'COMPLEX_HI_VEHICLE',
            'DEALER_PRECIOUS_METAL', 'DIGITAL_PM_OPERATOR',
            'EMBASSY_CONSULATE', 'EXCHANGE_CURRENCY',
            'FOREIGN_FINANCIAL_INSTITUTION', 'FOREIGN_GOVERNMENT',
            'FOREIGN_NONBANK_FINANCIAL_INSTITUTION', 'INTERNET_GAMBLING',
            'MEDICAL_MARIJUANA_DISPENSARY', 'MONEY_SERVICE_BUSINESS',
            'NAICS_CODE', 'NONREGULATED_FINANCIAL_INSTITUTION', 'NOT_PROFIT',
            'PRIVATELY_ATM_OPERATOR', 'PRODUCTS', 'SALES_USED_VEHICLES',
            'SERVICES', 'SIC_CODE', 'STOCK_MARKET_LISTING',
            'THIRD_PARTY_PAYMENT_PROCESSOR', 'TRANSACTING_PROVIDER',
            'HIGH_NET_WORTH', 'HIGH_RISK', 'RISK_RATING', 'USE_CASE_SCENARIO'
        ])
        for row in cust_list:
            writer.writerow(row)
    pyTimer.endTimer(startTime, str(cust_count) + ' Customer creation')
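# pyTimer is a local helper module used throughout these examples and never
# shown. A minimal sketch of the interface the calls above assume: a float
# timestamp from startTimer, logging plus a returned message from endTimer.
# The log path and message format are assumptions, not the original module.
import time

def startTimer():
    return time.time()

def writeRuntimeLog(msg):
    with open('runtime.log', 'a') as log:  # assumed log location
        log.write(msg)

def endTimer(start, label=''):
    msg = label + ' took %.2f seconds ' % (time.time() - start)
    writeRuntimeLog(msg + '\n')
    return msg  # Example #3 concatenates this return value with a URL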