def main():
    """Generate transactions for every customer in CCs using a 16-way
    process pool, then write them pipe-delimited to cc_trans.csv.

    Relies on module-level names defined elsewhere in this file:
    CCs, createTransData, pyTimer, startTime (plus Pool, partial, csv
    from the file's imports). Python 2 code (xrange, integer division).
    """
    Cust_Count = len(CCs)
    # Split the customer workload across 16 worker processes.
    proc = 16
    iterator = xrange(proc)
    remainder = Cust_Count % proc
    ccount = Cust_Count / proc  # Python 2 integer division: customers per worker
    lenI = len(iterator)
    chk1Time = pyTimer.startTimer()
    func = partial(createTransData, ccount, remainder, lenI)
    pool = Pool(processes=proc)
    results = pool.map(func, iterator)
    # FIX: release the worker processes (the original never closed the pool).
    pool.close()
    pool.join()
    # FIX: flatten the per-worker result lists generically; the original
    # hard-coded results[0] + ... + results[15], which silently breaks
    # whenever `proc` changes.
    liMaster = sum(results, [])
    endLoopTime = pyTimer.startTimer()
    avgLoopTime = round(((endLoopTime - chk1Time) / Cust_Count), 2)
    avgLoopTime = ("{0:.1f}".format(avgLoopTime))
    pyTimer.writeRuntimeLog(
        "The average time to create 1 customer's transactions is: " +
        str(avgLoopTime) + ' seconds\n')
    # Open CSV file for writing
    chk2Time = pyTimer.startTimer()
    with open('cc_trans.csv', 'w') as f1:
        writer = csv.writer(f1, delimiter='|', lineterminator='\n')
        # File header (single list literal instead of concatenating
        # 23 one-element lists).
        writer.writerow([
            'ROWNUM', 'ACCOUNTID', 'MERCHANT_NAME', 'MERCHANT_CATEGORY_CODE',
            'MERCHANT_CATEGORY_DESC', 'MERCHANT_COUNTRY', 'POST_DATE',
            'TRANSACTION_DATE', 'TRANSACTION_TYPE', 'CREDIT_DEBIT',
            'CREDIT_LIMIT', 'AMOUNT', 'BALANCE', 'CREDITCARDNUMBER',
            'CC_TYPE', 'USE_CASE', 'CUST_NAME', 'NUM_CCS', 'CUST_CITY',
            'CUST_STATE', 'CUST_ZIP', 'CUST_COUNTRY', 'TRANS_DETAIL'])
        for row in liMaster:
            writer.writerow(row)
    endCSVTime = pyTimer.startTimer()
    endCSVTime = round((endCSVTime - chk2Time), 2)
    endCSVTime = ("{0:.1f}".format(endCSVTime))
    pyTimer.writeRuntimeLog("It took: " + str(endCSVTime) +
                            ' seconds to write to file\n')
    pyTimer.endTimer(startTime, str(Cust_Count) + ' Transactions creation')
## If statement makes this program standalone
## Do not need this if statement if another program will be calling above functions
if __name__ == "__main__":
    ## Create start time
    startTime = pyTimer.startTimer()
    ## Try to download NLTK packages.
    ## FIX: pre-initialise the download flags so the `if punktDL and aptDL`
    ## check below cannot raise NameError when nltk.download() itself throws
    ## (in the original both names were only bound inside the try body).
    punktDL = False
    aptDL = False
    try:
        punktDL = nltk.download('punkt')
        aptDL = nltk.download('averaged_perceptron_tagger')
    except Exception:
        # Narrowed from a bare except: still best-effort, but no longer
        # swallows SystemExit/KeyboardInterrupt.
        writeToLog('NLTK Punkt and Averaged_Perceptron_tagger need to be installed')
    currDate = datetime.now()
    fileDate = currDate.strftime('%m%d%Y')
    writeToLog('*****************************' + fileDate +
               '*****************************\n')
    fileName = '/var/www/html/' + fileDate + '_CRA_Scrape.csv'
    mainURL = 'http://www.cra-arc.gc.ca/convictions/'
    mainXPath = '//*[@class="module-menu-section span-3"]'
    linkXPath = '//*[@class="col-md-9 col-md-push-3"]'
    paraXPath = '//p'
    ## If the NLTK packages are downloaded, run the main program
    if punktDL and aptDL:
        main(mainURL, mainXPath, linkXPath, paraXPath, fileName)
    else:
        writeToLog('NLTK Punkt and Averaged_Perceptron_tagger need to be downloaded first.')
        writeToLog('Please sudo python and run nltk.download("punkt") and nltk.download("averaged_perceptron_tagger")')
    ## Find total time in seconds of program run
    pName = os.path.basename(__file__)
    endTime = pyTimer.endTimer(startTime, pName)
    writeToLog("Program took " + endTime + " to complete.\n")
##*********************END PROGRAM*********************##
## scrapeInfo needs the url, content and 2 xpath variables to call the function ## scrapeInfo returns a list when completed liData.extend(scrapeInfo(mainURL, mainContent, mainXPath, linkXPath)) ## Call function removeDuplicates beforeDedup = len(liData) liData = removeDuplicates(liData) writeToLog(str(len(liData)) + " records of " + str(beforeDedup) + " left after deduplication\n") ## Call createCSV function to write the list data to the scrapeFile ## createCSV needs a list and an open file to run createCSV(liData, scrapeFile) ##*********************END MAIN FUNCTION*********************## ##*********************END FUNCTIONS*********************## ##*********************PROGRAM*********************## ## If statement makes this program standalone ## Do not need this if statement if another program will be calling above functions if __name__ == "__main__": ## Create start time startTime = pyTimer.startTimer() currDate = datetime.now() fileDate = currDate.strftime('%m%d%Y') currDate = currDate.strftime('%Y-%m-%d') fileName = '/var/www/html/' + fileDate + '_Leafly_MMJScrape.csv' main('https://www.leafly.com/finder', '//*[@class="col-xs-6 col-md-4 spacer-bottom-xs"]', './/script', fileName) ## Find total time in seconds of program run endTime = pyTimer.endTimer(startTime) writeToLog("Program took " + endTime + " to complete.\n") ##*********************END PROGRAM*********************##
def main(mainURLList):
    """Scrape yesterday's Backpage "adult" postings for every site in
    mainURLList and write contact data to a dated CSV.

    For each site it pages through up to 999 listing pages, scraping only
    pages whose visible post dates match yesterday's date, de-duplicates
    the results, and appends them to the CSV via writeToCSV.

    Relies on module-level helpers: writeToLog, scrapeInfo,
    removeDuplicates, writeToCSV, pyTimer (plus requests, lxml html/tostring,
    re, csv, time, traceback from the file's imports). Python 2 code
    (uses the `<>` operator).
    """
    currDate = datetime.now()
    ## Make currDate Yesterday's date
    currDate = currDate - timedelta(days=1)
    fileDate = currDate.strftime('%m%d%Y')
    currDate = currDate.strftime('%Y-%m-%d')  # currDate is reused as an ISO date string
    writeToLog("*************************** " + currDate +
               " ***************************\n")
    ## Open a file and overwrite the existing file or create a new file if needed
    fileName = '/var/www/html/' + fileDate + '_ScreenScrape.csv'
    with open(fileName, 'w') as scrapeFile:
        writer = csv.writer(scrapeFile, delimiter=',', quoting=csv.QUOTE_NONE,
                            escapechar=' ')
        ## Add a header row
        writer.writerow(["PhoneNumber", "Email_Address", "Website",
                         "BackPage_Link"])
        try:
            ## Loop through all urls in the mainURLList
            for mainURL in mainURLList:
                liData = []
                writeToLog("\nMain scrape of: " + mainURL + "\n")
                startT = pyTimer.startTimer()
                startPage = 0
                endPage = 0
                increment = 1
                ## Increment through 999 possible pages
                while increment < 1000:
                    ## If increment > 1 then add the page string to the URL
                    ## Http request the mainURL
                    if increment == 1:
                        mainRequest = requests.get(mainURL + "adult/")
                    else:
                        mainRequest = requests.get(mainURL + "adult/?page=" +
                                                   str(increment))
                    ## Translate the request content to HTML
                    mainContent = html.fromstring(mainRequest.content)
                    ## Use xpath to only grab HTML tags with the CSS class "date"
                    date = mainContent.xpath('//*[@class="date"]')
                    dateStr = ''
                    ## Loop through dates on the page to make sure that the
                    ## current date is on the page
                    for dateStr in date:
                        dateStr = tostring(dateStr)
                        dateStr = re.search("\w{3}. \w{3}. \d{1,2}", dateStr)
                        dateStr = dateStr.group() + " - " + str(datetime.now().year)
                        # NOTE(review): this format literal was split across a
                        # line break in the extracted source; reconstructed as a
                        # single space ('%a. %b. %d - %Y') to match the regex
                        # above — confirm against the original file.
                        dateStr = datetime.strptime(dateStr, '%a. %b. %d - %Y').date()
                        dateStr = dateStr.strftime('%Y-%m-%d')
                        if dateStr == currDate:
                            break
                    ## Compare current date to date on webpage
                    if dateStr == currDate:
                        if startPage == 0:
                            startPage = increment
                        ## Extend liData to include anything from the main body
                        ## of the postings
                        liData.extend(scrapeInfo(mainURL, mainContent,
                                                 '/html/body/div//*[@href]'))
                        ## Extend liData to include anything from the
                        ## sponsorBoxContent
                        liData.extend(scrapeInfo(mainURL, mainContent,
                                                 '//*[@class="sponsorBoxContent"]/a'))
                    ## If the date on the page is greater than the currDate
                    ## variable or the currDate variable is blank, go to next page
                    elif currDate < dateStr and currDate <> '':
                        increment = increment + 1
                        continue
                    else:
                        # Past yesterday's postings: log the scraped page range,
                        # de-duplicate, and stop paging this site.
                        endPage = increment
                        writeToLog("Scraped pages: " + str(startPage) + " to " +
                                   str(endPage) + "\n")
                        writeToLog("Remove dups from scrape of: " + mainURL + "\n")
                        beforeDedup = len(liData)
                        ## Call function removeDuplicates
                        liData = removeDuplicates(liData)
                        writeToLog(str(len(liData)) + " records of " +
                                   str(beforeDedup) +
                                   " left after deduplication\n")
                        break
                    increment = increment + 1
                writeToLog(pyTimer.endTimer(startT) + mainURL + "\n")
                writeToLog("Write to scrape to CSV\n")
                ## Call createCSV function to write the list data to the scrapeFile
                ## createCSV needs a list and a writer from the open file to run
                writeToCSV(liData, writer)
                ## Sleep for 30 seconds and then request a different page to
                ## make it seem like a human is doing the surfing
                time.sleep(30)
                requests.get("http://www.google.com")
        except:
            # Bare except kept as-is: logs the traceback and lets the run end.
            e = traceback.format_exc()
            writeToLog("Unexpected error:" + str(e) + "\n")
                        ## NOTE(review): the lines down to the END MAIN FUNCTION
                        ## banner duplicate the tail of the scraper main() above
                        ## (its `def` line is outside this chunk); indentation
                        ## reconstructed — confirm against the full file.
                        break
                    increment = increment + 1
                writeToLog(pyTimer.endTimer(startT) + mainURL + "\n")
                writeToLog("Write to scrape to CSV\n")
                ## Call createCSV function to write the list data to the scrapeFile
                ## createCSV needs a list and a writer from the open file to run
                writeToCSV(liData, writer)
                ## Sleep for 30 seconds and then request a different page to
                ## make it seem like a human is doing the surfing
                time.sleep(30)
                requests.get("http://www.google.com")
        except:
            e = traceback.format_exc()
            writeToLog("Unexpected error:" + str(e) + "\n")
##*********************END MAIN FUNCTION*********************##
##*********************END FUNCTIONS*********************##

##*********************PROGRAM*********************##
## If statement makes this program standalone
## Do not need this if statement if another program will be calling above functions
if __name__ == "__main__":
    ## Create start time
    startTime = pyTimer.startTimer()
    ## Create list of all Canadian Backpage links and US Backpage links to be
    ## used for main function
    mainURLList = ["http://alberta.backpage.com/",
                   "http://britishcolumbia.backpage.com/",
                   "http://manitoba.backpage.com/",
                   "http://newbrunswick.backpage.com/",
                   "http://stjohns.backpage.com/",
                   "http://yellowknife.backpage.com/",
                   "http://halifax.backpage.com/",
                   "http://ontario.backpage.com/",
                   "http://quebec.backpage.com/",
                   "http://saskatchewan.backpage.com/",
                   "http://whitehorse.backpage.com/",
                   "http://alabama.backpage.com/",
                   "http://alaska.backpage.com/",
                   "http://arizona.backpage.com/",
                   "http://arkansas.backpage.com/",
                   "http://california.backpage.com/",
                   "http://colorado.backpage.com/",
                   "http://connecticut.backpage.com/",
                   "http://delaware.backpage.com/",
                   "http://florida.backpage.com/",
                   "http://georgia.backpage.com/",
                   "http://hawaii.backpage.com/",
                   "http://idaho.backpage.com/",
                   "http://illinois.backpage.com/",
                   "http://indiana.backpage.com/",
                   "http://iowa.backpage.com/",
                   "http://kansas.backpage.com/",
                   "http://kentucky.backpage.com/",
                   "http://louisiana.backpage.com/",
                   "http://maine.backpage.com/",
                   "http://maryland.backpage.com/",
                   "http://massachusetts.backpage.com/",
                   "http://michigan.backpage.com/",
                   "http://minnesota.backpage.com/",
                   "http://mississippi.backpage.com/",
                   "http://missouri.backpage.com/",
                   "http://montana.backpage.com/",
                   "http://nebraska.backpage.com/",
                   "http://nevada.backpage.com/",
                   "http://newhampshire.backpage.com/",
                   "http://newjersey.backpage.com/",
                   "http://newmexico.backpage.com/",
                   "http://newyork.backpage.com/",
                   "http://northcarolina.backpage.com/",
                   "http://northdakota.backpage.com/",
                   "http://ohio.backpage.com/",
                   "http://oklahoma.backpage.com/",
                   "http://oregon.backpage.com/",
                   "http://pennsylvania.backpage.com/",
                   "http://rhodeisland.backpage.com/",
                   "http://southcarolina.backpage.com/",
                   "http://southdakota.backpage.com/",
                   "http://tennessee.backpage.com/",
                   "http://texas.backpage.com/",
                   "http://utah.backpage.com/",
                   "http://vermont.backpage.com/",
                   "http://virginia.backpage.com/",
                   "http://washington.backpage.com/",
                   "http://washingtondc.backpage.com/",
                   "http://westvirginia.backpage.com/",
                   "http://wisconsin.backpage.com/",
                   "http://wyoming.backpage.com/"]
    main(mainURLList)
    ## Find total time in seconds of program run
    endTime = pyTimer.endTimer(startTime)
    writeToLog("Program took " + endTime + " to complete.\n")
##*********************END PROGRAM*********************##
## NOTE(review): this chunk starts mid-script — the open()/csv.DictReader and
## `columns` (a dict of lists keyed by column name) are set up before this
## view; indentation reconstructed — confirm against the full file.
for row in reader:
    # read a row as {column1: value1, column2: value2,...}
    for (k, v) in row.items():
        # go over each column name and value
        columns[k].append(v)  # append the value into the appropriate list
                              # based on column name k
# Pull out each customer attribute as a parallel list, indexed per customer.
CCs = columns['CREDITCARDNUMBER']
ACCTs = columns['ACCOUNTID']
CCTypes = columns['CREDITCARDTYPE']
Holders = columns['NAME']
CCsCount = columns['NUM_CCS']
Cities = columns['CITY']
States = columns['STATE']
ZIPs = columns['ZIP']
Countries = columns['COUNTRY']
UseCase = columns['USE_CASE_SCENARIO']
ClsdFlags = columns['CLOSEDACCOUNT']
pyTimer.endTimer(startTime, '\n Reading in customer file')
# Running transaction counter and date high-water marks shared by the
# transaction-detail generator below.
trans_no = 0
maxCheckin = date(2000, 1, 1)
maxBook = date(2000, 1, 1)


def pop_transDetail(cat_desc, maxDate, j, maxBook, maxCheckin, randomrange,
                    randomchoice):
    """Build the TRANS_DETAIL text for one transaction.

    NOTE(review): truncated in this chunk — the body below ends mid-condition;
    the remainder of the function is outside this view.
    """
    # Sentinel dates (year 2000) used before real dates are generated.
    checkin = date(2000, 1, 1)
    checkout = date(2000, 1, 1)
    booking = date(2000, 1, 1)
    transDetail = ''
    tmp2 = gen_data.create_name()
    addr = gen_data.create_city_state_zip()
    #Add details or Hotel Transactions
    if (cat_desc == 'Hotels/Motels/Inns/Resorts'
##['LARGE_CASH_EXEMPT']+['DEMARKET_FLAG']+['DEMARKET_DATE']+['PROB_DEFAULT_RISKR']+['OFFICIAL_LANG_PREF']+['CONSENT_SHARING']+\ ##['PREFERRED_CHANNEL']+['PRIMARY_BRANCH_NO']+['DEPENDANTS_COUNT']+['SEG_MODEL_ID']+['SEG_MODEL_TYPE']+\ ##['SEG_MODEL_NAME']+['SEG_MODEL_GROUP']+['SEG_M_GRP_DESC']+['SEG_MODEL_SCORE']+['ARMS_MANUFACTURER']+['AUCTION']+\ ##['CASHINTENSIVE_BUSINESS']+['CASINO_GAMBLING']+['CHANNEL_ONBOARDING']+['CHANNEL_ONGOING_TRANSACTIONS']+['CLIENT_NET_WORTH']+\ ##['COMPLEX_HI_VEHICLE']+['DEALER_PRECIOUS_METAL']+['DIGITAL_PM_OPERATOR']+['EMBASSY_CONSULATE']+['EXCHANGE_CURRENCY']+\ ##['FOREIGN_FINANCIAL_INSTITUTION']+['FOREIGN_GOVERNMENT']+['FOREIGN_NONBANK_FINANCIAL_INSTITUTION']+['INTERNET_GAMBLING']+\ ##['MEDICAL_MARIJUANA_DISPENSARY']+['MONEY_SERVICE_BUSINESS']+['NAICS_CODE']+['NONREGULATED_FINANCIAL_INSTITUTION']+\ ##['NOT_PROFIT']+['PRIVATELY_ATM_OPERATOR']+['PRODUCTS']+['SALES_USED_VEHICLES']+['SERVICES']+\ ##['SIC_CODE']+['STOCK_MARKET_LISTING']+['THIRD_PARTY_PAYMENT_PROCESSOR']+['TRANSACTING_PROVIDER']+['HIGH_NET_WORTH']+['HIGH_RISK']+['RISK_RATING']+['USE_CASE_SCENARIO']) #Loop for number of accounts to generate liMaster = [] start = 10786147 acct_list = [] liSSNMaster = [] liSSNMaster = createSSNs(liSSNMaster, cust_count) pyTimer.endTimer(startTime, 'Creating ' + str(len(liSSNMaster)) + ' SSNs for customers') chk1Time = pyTimer.startTimer() for i in xrange(cust_count): #Initiate High Risk Flags #Politically Exposed Person PEP = 'No' #Customer with a Suspicous Activity Report SAR = 'No' #Customer with a closed account #generate closed acct flag Clsd = choice(Clsd_flag) #High risk customer flag high_risk = 'No' #High Risk Rating hr_rating = '' #Customer that was demarketed by the bank
def main():
    """Generate cust_count synthetic customers in parallel and write them
    pipe-delimited to uber_custv3.csv.

    Relies on module-level names defined elsewhere in this file:
    createSSNs, createCustData, pyTimer, startTime (plus Pool, partial,
    csv from the file's imports). Python 2 code (xrange, integer division).
    """
    #####Customer Count wanted for the end file######
    cust_count = 50000
    liSSNMaster = []
    liSSNMaster = createSSNs(liSSNMaster, cust_count)
    pyTimer.endTimer(
        startTime,
        'Creating ' + str(len(liSSNMaster)) + ' SSNs for customers')
    chk1Time = pyTimer.startTimer()
    # Split the customer workload across 16 worker processes.
    proc = 16
    iterator = xrange(proc)
    remainder = cust_count % proc
    ccount = cust_count / proc  # Python 2 integer division: customers per worker
    lenI = len(iterator)
    func = partial(createCustData, liSSNMaster, ccount, remainder, lenI)
    pool = Pool(processes=proc)
    results = pool.map(func, iterator)
    # FIX: release the worker processes (the original never closed the pool).
    pool.close()
    pool.join()
    # FIX: flatten the per-worker result lists generically; the original
    # hard-coded results[0] + ... + results[15], which silently breaks
    # whenever `proc` changes. (A large block of commented-out duplicate
    # header / Spark code was removed here.)
    cust_list = sum(results, [])
    endLoopTime = pyTimer.startTimer()
    avgLoopTime = round(((endLoopTime - chk1Time) / cust_count), 6)
    avgLoopTime = ("{0:.6f}".format(avgLoopTime))
    pyTimer.writeRuntimeLog('The average time to create a customer is: ' +
                            str(avgLoopTime) + ' seconds\n')
    #Creates CSV
    with open('uber_custv3.csv', 'w') as f1:
        #Writer for CSV...Pipe delimited...Return for a new line
        writer = csv.writer(f1, delimiter='|', lineterminator='\n')
        #Header Row (single list literal instead of concatenating
        #one-element lists)
        writer.writerow([
            'ROWNUM', 'ACCOUNTID', 'ACCT_TYPE', 'NUM_CCS', 'NAME', 'M_NAME',
            'SSN', 'AUTHORIZED_NAME2', 'M_NAME2', 'SSN2', 'AUTHORIZED_NAME3',
            'M_NAME3', 'SSN3', 'AUTHORIZED_NAME4', 'M_NAME4', 'SSN4',
            'CREDITCARDNUMBER', 'CREDITCARDTYPE', 'EMPLOYER', 'CUSTEMAIL',
            'OCCUPATION', 'CITY', 'STATE', 'ZIP', 'COUNTRY', 'PREVIOUS_CITY',
            'PREVIOUS_STATE', 'PREVIOUS_ZIP', 'PREVIOUS_COUNTRY', 'DOB',
            'PEP', 'SAR', 'CLOSEDACCOUNT', 'RELATED_ACCT', 'RELATED_TYPE',
            'PARTY_TYPE', 'PARTY_RELATION', 'PARTY_STARTDATE',
            'PARTY_ENDDATE', 'LARGE_CASH_EXEMPT', 'DEMARKET_FLAG',
            'DEMARKET_DATE', 'PROB_DEFAULT_RISKR', 'OFFICIAL_LANG_PREF',
            'CONSENT_SHARING', 'PREFERRED_CHANNEL', 'PRIMARY_BRANCH_NO',
            'DEPENDANTS_COUNT', 'SEG_MODEL_ID', 'SEG_MODEL_TYPE',
            'SEG_MODEL_NAME', 'SEG_MODEL_GROUP', 'SEG_M_GRP_DESC',
            'SEG_MODEL_SCORE', 'ARMS_MANUFACTURER', 'AUCTION',
            'CASHINTENSIVE_BUSINESS', 'CASINO_GAMBLING', 'CHANNEL_ONBOARDING',
            'CHANNEL_ONGOING_TRANSACTIONS', 'CLIENT_NET_WORTH',
            'COMPLEX_HI_VEHICLE', 'DEALER_PRECIOUS_METAL',
            'DIGITAL_PM_OPERATOR', 'EMBASSY_CONSULATE', 'EXCHANGE_CURRENCY',
            'FOREIGN_FINANCIAL_INSTITUTION', 'FOREIGN_GOVERNMENT',
            'FOREIGN_NONBANK_FINANCIAL_INSTITUTION', 'INTERNET_GAMBLING',
            'MEDICAL_MARIJUANA_DISPENSARY', 'MONEY_SERVICE_BUSINESS',
            'NAICS_CODE', 'NONREGULATED_FINANCIAL_INSTITUTION', 'NOT_PROFIT',
            'PRIVATELY_ATM_OPERATOR', 'PRODUCTS', 'SALES_USED_VEHICLES',
            'SERVICES', 'SIC_CODE', 'STOCK_MARKET_LISTING',
            'THIRD_PARTY_PAYMENT_PROCESSOR', 'TRANSACTING_PROVIDER',
            'HIGH_NET_WORTH', 'HIGH_RISK', 'RISK_RATING',
            'USE_CASE_SCENARIO'])
        for row in cust_list:
            writer.writerow(row)
    pyTimer.endTimer(startTime, str(cust_count) + ' Customer creation')