def generate(): "----------Start up the course times------------" autotable = AutoTable() scraper = Scraper(autotable) autotable = scraper.build_table() #builder = Builder(autotable) #autotable = builder.build_table() start_time = time.time() print("Year Space") year_solutions = generate_semester(autotable.year.courses) if len(year_solutions) == 0: optimal_solution = fall_winter_merge(autotable.fall.courses, autotable.winter.courses) else: optimal_solution = year_fall_winter_merge(year_solutions, autotable.fall.courses, autotable.winter.courses) print("Fall") for day in optimal_solution[0][0]: for timeslot in day: print(timeslot) print("Winter") for day in optimal_solution[0][1]: for timeslot in day: print(timeslot) print("Fall Distance: " + str(optimal_solution[1]) + " Winter Distance: " + str(optimal_solution[2])) print("--- Full algorithm %s seconds ---" % (time.time() - start_time)) root = Tk() gui1 = MyFirstGUI(optimal_solution[0][0], "Fall", root) root.mainloop() root = Tk() gui2 = MyFirstGUI(optimal_solution[0][1], "Winter", root) root.mainloop()
def generate(): "----------Start up the course times------------" autotable = AutoTable() scraper = Scraper(autotable) autotable = scraper.build_table() #builder = Builder(autotable) #autotable = builder.build_table() start_time = time.time() "----------Get all Fall Timetables------------" courses = autotable.fall.courses courses.extend(autotable.year.courses) space1 = autotable.solution_space(courses) "----------Get all Winter Timetables------------" courses = autotable.winter.courses courses.extend(autotable.year.courses) space2 = autotable.solution_space(courses) "-----------Combine fall and winter-------------" listed = autotable.index_year_courses(autotable.year.courses) compatible = autotable.construct_year(space1,space2,listed) print("Fall:") for section in compatible[0][0][0]: print(section.name) print(section) print("Winter:") for section in compatible[0][1][0]: print(section.name) print(section) print("Distance: "+str(compatible[0][2])) print("--- %s seconds ---" % (time.time() - start_time))
def main():
    db = connect_to_db()
    if not companies_coll_exists(db):
        logger.info("The 'companies' collection does not exist. Start scraping company list.")
        dow_30_companies = Scraper.dow_30_companies_func()
        insert_companies(db, dow_30_companies)

    logger.info('Current working directory: ' + os.getcwd())
    with open('ECT/cookies.txt', 'r') as f:
        cookie = f.readline()
    logger.info(f'Using cookie: {cookie}')

    # get new links based on the tracking company list
    links_df = get_company_links(db, cookie)
    # get archived links
    link_set = get_archived_links(db)

    raw_df = Scraper.data_companies(links_df, cookie, link_set)
    count = 0
    while (not Scraper.is_raw_df_all_good(raw_df)) and count <= 3:  # try no more than 3 times.
        raw_df = Scraper.rerun_companies(raw_df, cookie)
        count += 1
        logger.info(f'rerun count: {count}')

    # insert raw
    insert_raw_data(db, raw_df, coll_name="raw")
    # process raw transcripts
    process_raw(db)
def add_new_vacancy(vacancy_for_db, site):
    collect = MongoClient('localhost', 27017)[vacancy_for_db][site]
    lst_vacancy = []
    if site == 'hh':
        lst_vacancy = Sc.parse_hh(vacancy_for_db)[0]
    elif site == 'sj':
        lst_vacancy = Sc.parse_sj(vacancy_for_db)[0]
    for itm in lst_vacancy:
        itm['_id'] = itm.pop('Ссылка')
        # use a separate name for documents returned by the cursor so the outer `itm` is not shadowed
        if not any([doc for doc in collect.find({'_id': itm['_id']})]):
            collect.insert_one(itm)
        elif itm not in [doc for doc in collect.find({'_id': itm['_id']})]:
            collect.update_one({'_id': itm['_id']},
                               {'$set': {'Зарплата мин': itm['Зарплата мин'],
                                         'Зарплата мкс': itm['Зарплата мкс'],
                                         'Валюта': itm['Валюта']}})
    return True
def assert_stock_prices():
    '''Make sure that all market dates after the first data date for any stock
    have data for that stock. The scraper often leaves holes in the data. If a
    hole is found, look for manually downloaded data in the old Yahoo database
    and in ./manualdata/<ticker>.csv. If none exists, fill in data as Yahoo
    does and print an error.'''
    stocks = Database.get_stocks()
    last_assertion_day = StockData.createSDate("2017-07-25")
    market_dates = Database.get_market_dates()
    for stock in stocks:
        for date in market_dates:
            if last_assertion_day >= date:
                continue
            if date.day_number >= stock.first_data_date.day_number:
                if Database.get_dailydata(stock, date=date) is None:
                    dd = Scraper.scrape_dailydata(stock, date, date)  # try scraper again
                    if dd is not None:
                        if len(dd) != 0:
                            dd = dd[0]  # take it out of array
                        else:
                            dd = None  # no data found
                    if dd is None:  # try old Yahoo database
                        dd = Database.get_Yahoo_dailydata(stock, date)
                    if dd is None:  # try manual csv's
                        dd = Scraper.get_manual_dailydata(stock, date)
                    if dd is None:  # nothing left to try, throw error
                        # add the previous day's close to all values and volume to 0. This is what Yahoo does.
                        prev = Database.get_dailydata(stock, date.getPrevious())
                        dd = StockData.SDailyData(stock, date, prev.close, prev.close, prev.close, prev.close, 0)
                        Log.log_error(
                            "No data found for {} on {}. Added pseudo values copied from previous day. "
                            "Check manually to make sure daily data doesn't exist.".format(stock, date))
                    Database.add_dailydata(dd)
def update_index_stocks():
    '''Update the list of stocks in the database to include all stocks in the
    DJI, S&P500, NASDAQ, NYSE, etc... and update those stocks to have the
    correct list of indices associated with them.'''
    indices = []
    sp500 = Scraper.scrape_SP500()
    dji = Scraper.scrape_DJI()
    # nasdaq = Scraper.scrape_NASDAQ()
    misc = Scraper.scrape_misc()
    indices.append(sp500)
    indices.append(dji)
    # indices.append(nasdaq)
    indices.append(misc)
    if MiscInfo.SHOULD_SCRAPE_NYSE:
        nyse = Scraper.scrape_NYSE()
        indices.append(nyse)

    # add to list of stocks
    for index in indices:
        for stock in index:
            Database.add_stock(stock)

    # update indices listing for stock
    stocks = Database.get_stocks()
    for stock in stocks:
        # TODO check stock in ""
        Database.set_indices(stock, "S&P500", stock in sp500)
        Database.set_indices(stock, "DJI", stock in dji)
def semresult():
    url = "http://www.fastvturesults.com/check_new_results/"
    collegecode = raw_input("Enter the college\nex:\tRnsit:1rn\t")
    year = raw_input("Enter the year")
    branch = raw_input("Enter the branch code\n")
    fp = FileOps.createnew("Cse6sem.csv")
    BeautifulUsn.gencollege(collegecode, year, branch, url)
    for i in range(120):
        studenturl = BeautifulUsn.gennexturl()
        page = EstablishConnection.openwebpage(studenturl)
        soup = Scraper.page(page)
        resulturl, name = Scraper.semresultlink('6', soup)
        if resulturl != 'none':
            page = EstablishConnection.openwebpage(resulturl)
            soup = Scraper.page(page)
            result = Scraper.getresult(soup, name)
            print result
            FileOps.writestudentresult(fp, result)
def test2():
    soup = sc.get_soup('http://terraria.gamepedia.com/Chests')
    tables = soup.find('table', class_='inner')
    rows = tables.find_all('tr')
    for row in rows:
        content = sc.get_element_content(row, stripped=True)
    return content
def keyword_search(query, amount):
    pmc_ids = [pmc for pmc in Scraper.esearch(query, amount)]
    alltext = [i for i in Scraper.text_grab_multiple(pmc_ids)]
    keywords = [
        i.lower()
        for i in Scraper.get_continuous_chunks(" ".join(alltext), query)
    ]
    return keywords[:7]
def writeToCSV(self):
    scraper = Scraper()
    finishedUltimaList = scraper.getUltimaList()
    f = open("{}.csv".format(self.animeName.get()), 'w', newline='')
    writer = csv.writer(f)
    for row in finishedUltimaList:
        self.progressBar.step(10)
        writer.writerow(row)
    self.progressBar.stop()
    f.close()
def main_program():
    root = tk.Tk()
    app = interface(root)

    # TODO: Set up file dialog to handle selecting filepath
    msg = Scraper.open_email(r'C:\Users\Couch\Desktop\TimesheetReader\test.msg')

    # Load Excel workbook
    path = app.browse_file_dialog()
    wb = openpyxl.load_workbook(path)
    sheet = wb.active

    # Amount of cells (Start - Break - Finish = 3) for each day (7); 3*7days = 21
    MAX_CELL_COUNT = len(sheet['D5':'F11'] * 3)

    # Get list of times from email
    # TODO: Fix disgusting regex
    regex = r'\d?\d?\:?\d?\d?\s\w\.\w\.|-'
    times = Scraper.scrape_msg(msg, regex)

    # Create new list to copy times to
    # Append all elements as 0 to prefill data in Excel
    days = []
    for i in range(0, MAX_CELL_COUNT):
        days.append(0)

    times_index = 0
    for i in range(0, MAX_CELL_COUNT):
        if times_index < len(times):
            days[times_index] = str(times[times_index])
            times_index += 1

    # Format times
    days = Scraper.format_times(days)

    Interface.print_status('Copying times to spreadsheet: {0} at path: {1}'.format(str(sheet), path))

    # write days data to cells
    i = 0
    for rowOfCells in sheet['D5':'F11']:
        for cell in rowOfCells:
            cell.value = days[i]
            i += 1
        print('\tRow: {0} copied!'.format(str(rowOfCells)))

    wb.save(path)
    Interface.print_status("Completed\n{0}".format('=' * 100))
    root.mainloop()
def init():
    global crawler
    global domain
    global bot
    global timeout
    global lag
    global depth
    global emailscan
    global tprint

    # Remove previous files
    try:
        shutil.rmtree(".cache")
    except OSError:
        pass
    try:
        os.remove("emails.txt")
    except OSError:
        pass

    # Process cmd line arguments
    cmdArgs()

    # Initialize scraper object
    crawler = Scraper(domain, bot, timeout, lag, depth, emailscan)

    # Pretty print thread
    tprint = threading.Thread(target=ThreadPrettyPrint)
def parse_and_store(html_file_path):
    conn = sqlite3.connect('reellog.db')
    c = conn.cursor()
    c.execute("SELECT COUNT(*) from reellog")
    (old_entry_count,) = c.fetchone()

    to_write = Scraper.scrape(html_file_path)
    for row in to_write:
        command = "INSERT INTO reellog VALUES (%s)" % row
        try:
            c.execute(command)
            print('+ %s' % row)
        except sqlite3.IntegrityError:
            print('= %s' % row)
    conn.commit()

    c.execute("SELECT COUNT(*) from reellog")
    (new_entry_count,) = c.fetchone()
    conn.close()
    print("%i new entries added" % (int(new_entry_count) - int(old_entry_count)))
def call_scrape_singlethreaded(source, cities):
    if len(cities) == 0 or cities[0] == '':
        return True
    for destination in cities:
        if source != destination or destination == '':
            results = []
            tries = 0
            for day in get_range_dates(date.today(), DATE_RANGES):
                scraped_data = Scraper.parse(source, destination, day,
                                             date.strftime(date.today(), '%m/%d/%y'))
                if scraped_data:
                    results += scraped_data
                    tries = 0
                else:
                    tries += 1
                    if tries >= MAX_ATTEMPTS_SRAPE:
                        print "ERROR: Exceeded Maximum Attempts"
                        return False
            file_name = SCRAPE_DIRECTORY + directory + '/%s-%s-%s-flight-results.json' % (
                date.strftime(date.today(), '%m%d%y'), source, destination)
            # Did not get the results, so return false
            if not results:
                return False
            with open(file_name, 'w') as fp:
                json.dump(results, fp, indent=4)
        # Case where the source = destination, we want to change cities
        else:
            return False
    return True
def makeDashboards(symbol, sampleAmount):
    companyList = pd.read_csv("companylist.csv")
    companyRow = companyList[companyList["Symbol"] == symbol]
    retrievedName = companyRow.iat[0, 1]
    retrievedSector = companyRow.iat[0, 7]
    origStock = Scraper.Stock(symbol, retrievedName, retrievedSector)

    # get the comparisons
    compare.experiment(sampleAmount, origStock)
    compareData = pd.read_csv("output.csv")
    compareData = compareData.fillna(0)

    # output to static HTML file
    # get all of the compare stocks
    stockFile = open("stocks.p", "rb")
    stockList = pickle.load(stockFile)

    polarityScript, polarityHtml = polarityDashboard(
        origStock, stockList,
        compareData.sort_values(by=['WIKI_SIMILARITY'], ascending=False))
    biasScript, biasHtml = biasDashboard(
        origStock, stockList,
        compareData.sort_values(by=['WIKI_SIMILARITY'], ascending=False))
    relScript, relHtml = relevanceDashboard(compareData)

    return polarityScript, polarityHtml, biasScript, biasHtml, relScript, relHtml
def crawler_main(self, indice_link):
    source = re.findall(REGEX_SOURCE, self.__url)
    url = self.__url
    parsed = Crawler.crawlear_web(self, url)
    ultima_pagina = (parsed.xpath(XPATH_ULTIMA_PAGINA)[0])
    # convert the string that uses "," as a thousands separator into an integer
    locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
    last_page = int(locale.atof((re.findall(REGEX_ULTIMA_PAGINA, ultima_pagina))[0]))
    cantidad_paginas = math.ceil(last_page / 24)
    indice_pagina = 1  # index used to switch pages
    while cantidad_paginas >= indice_pagina:
        contador = 1
        sigiente_pagina = self.__nuevo_link.format(indice_pagina)
        parsed3 = Crawler.crawlear_web(self, sigiente_pagina)
        unidades_href = parsed3.xpath(XPATH_HREF_UNIDADES)
        for elemento in unidades_href:
            parsed2 = Crawler.crawlear_web(self, elemento)
            instancia_scraper = Scraper.Scraper()
            instancia_scraper.crear_dicc(parsed2, elemento, source, indice_link)
            print("New listing", contador)
            contador += 1
        indice_pagina += 1
        print("Page change")
        print("New URL")
def runit(keyword):
    me = getBySearchid(keyword)
    print me.keyword
    rawresults = Scraper.scrape(me.keyword)
    results = [s for s in rawresults if s.keys() != []]
    # print results
    for rows in results:
        try:
            new_result = Results(search_id=keyword,
                                 fundingNumber=rows["fundingNumber"].strip(),
                                 opportunityTitle=rows["opportunityTitle"].strip(),
                                 Agency=rows["Agency"].strip(),
                                 openDate=rows["openDate"].strip(),
                                 CloseDate=rows["closeDate"].strip(),
                                 attachment=rows["attachment"].strip(),
                                 link="http://www.grants.gov/" + rows["link"].strip())
            db.session.add(new_result)
            db.session.commit()
        except Exception:
            print 'goddamnit'
            raise
    # results = Scraper.scrape(keyword)
    # for rows in results:
    #     try:
    #         cells = rows.find_all("td")
    #         print cells[1]
    #     except Exception:
    #         print 'well well'
    return redirect(url_for('view'))
def addEntry(db):
    # add new data
    # request data
    [name, usdPrice, eurPrice, percentChange7d, percentChange24h,
     percentChange1h, lastUpdated] = Scraper.getNewData()
    # insert on DB
    db.test.insert({
        "name": name,
        "usdPrice": usdPrice,
        "eurPrice": eurPrice,
        "percentChange7d": percentChange7d,
        "percentChange24h": percentChange24h,
        "percentChange1h": percentChange1h,
        "date": datetime.datetime.fromtimestamp(int(lastUpdated)).strftime('%Y-%m-%d %H:%M:%S')
    })
def load_stocks(stocks, start, end):
    start_time = time.time()
    session = DatabaseService.setup_db()
    # in_db = session.query(Stock).filter(Stock.ticker.in_(stocks)).filter(Stock.timestamp >= start).filter(Stock.timestamp <= end).all()
    # removed = []
    # for row in in_db:
    #     if row.ticker not in removed:
    #         stocks.remove(row.ticker)
    #         removed.append(row.ticker)
    if len(stocks) > 0:
        panel = Scraper.lookup(stocks, start, end)
        for date in panel.major_axis:
            for company in panel.minor_axis:
                frame = panel.loc[:, date, company]
                high = frame["High"]
                low = frame["Low"]
                open = frame["Open"]
                close = frame["Close"]
                vol = frame["Volume"]
                adj_close = frame["Adj Close"]
                s = Stock(company, high, low, open, close, vol, adj_close, date)
                if not s.is_nan():
                    session.add(s)
                    # print("{}: high[{}] low[{}] date[{}]".format(stock, hi, low, date))
        session.commit()
    end_time = time.time()
    print("Time taken to load stocks: %s seconds" % (end_time - start_time))
def addStarting(playerMap, projStarters):
    # print(playerMap)
    for playerid, gameMap in playerMap.items():
        for gameid, gameList in gameMap.items():
            isStarting = 1 if Scraper.playerid_to_playerName(playerid) in projStarters else 0
            gameList.append(isStarting)
    return playerMap
def get_update_logs(building: Building, filter: Filter):
    apts_now = Scraper.get_apartments(building)
    histos = list(Historizer.load_building(building))
    histos.append(HistoEntry(datetime.datetime.utcnow(), apts_now))
    for i in range(1, len(histos)):
        yield UpdateLog(
            histos[i - 1].date, histos[i].date,
            compare(histos[i - 1].apartments, histos[i].apartments))
def yield_building_changes(building: Building, filter: Filter):
    # no filter on api because we prefer filtering later (to compare with before)
    apts_now = Scraper.get_apartments(building)
    last_histo = get_last_snapshot(building)
    if last_histo:
        apts_before = last_histo.apartments
        return yield_changes(apts_before, apts_now, filter)
def item_image(item_name):
    url = 'http://terraria.gamepedia.com/File:' + item_name.replace(' ', '_') + '.png'
    soup = sc.get_soup(url)
    tag = soup.find('a', title=item_name + '.png')
    if tag is not None:
        return tag['href']
    return 'no image'
def test3(url):
    elements = sc.get_soup(url).find_all()
    query = SoupQuery('table')
    query.has_content('Result')
    query.has_content('Ingredients')
    query.has_content('Crafting Station')
    query.has_attribute('class', 'inner')
    # parent = SoupQuery('table')
    # query.has_parent(parent)
    found = []
    for element in elements:
        if sc.get_query(element, query):
            if sc.multibox_test(element):
                for item in get_row_data(element):
                    found.append(item)
    return found
def on_click_standard(self):
    Scraper.Scrape(self.Eventname.text(), self.Keywords.text(),
                   self.getLatitude.text(), self.getLongitude.text(),
                   self.getRadius.text(), self.calenderButtonStart.text(),
                   self.startTime.currentText(), self.calenderButtonEnd.text(),
                   self.endTime.currentText(),
                   self.eventNums[self.eventLookup.currentText()])
def load_training_stocks(start_sample_time, end_sample_time, start_train_time, end_train_time):
    stocks = []
    with open(get_data_file()) as csv_file:
        reader = csv.DictReader(csv_file)
        tickers = [row['Symbol'] for row in reader]

    # tickers = random.sample(tickers, MAX_NUM_TICKERS)
    tickers = tickers[0:MAX_NUM_TICKERS]

    # Cols are Symbol, Name, Sector
    try:
        training_data = Scraper.lookup(tickers, start_sample_time, end_sample_time)
        validation_data = Scraper.lookup(tickers, start_train_time, end_train_time)['Close']
        stocks = [TrainingStock.TrainingStock(company, training_data[company], validation_data[company])
                  for company in tickers]
    except RemoteDataError:
        print("Error reading stock data, skipping")
    return stocks
def check_coins(coins):
    if debug:
        print('*** DEBUG *** check_coins() with parameter ' + coins)
    if debug:
        print('*** DEBUG *** You want to check {0}'.format(coins))
    j = json.loads(Scraper.scrape(coins, 'BTC,USD'))
    if 'Response' in j:
        print("Could not find coin! - " + j['Message'])
        return False
    else:
        if debug:
            print("*** DEBUG *** Your coin {0} exists, currently costs {1} BTC, "
                  "which is the equivalent of ${2}".format(coins, j['BTC'], j['USD']))
        return True
def get_company_links(db, cookies):
    """
    Get links based on the tracking company list
    :param db: db client
    :param cookies: String, http header cookie. Read from local txt file.
    :return: dataframe, links_df from "Scraper.get_links_2(company_list, cookies)"
    """
    company_list = get_company_symbol_list(db)
    links_df = Scraper.get_links_2(company_list, cookies)
    return links_df
def update_navigation() -> None:
    scraper = Scraper.Scraper()
    nav_data = scraper.get_all_nav()
    # Subject Names
    json.dump(nav_data["subject_names"],
              open(make_path(navigation_dir, 'subject_names.json'), 'w'))
    # Subject -> Colleges
    json.dump(nav_data["subject_colleges"],
              open(make_path(navigation_dir, 'subject_colleges.json'), 'w'))
    # College -> Subjects
    json.dump(nav_data["college_subjects"],
              open(make_path(navigation_dir, 'college_subjects.json'), 'w'))
    # Errors
    json.dump(nav_data["errors"],
              open(make_path(error_dir, 'update_nav.json'), 'w'))
def influencer_bin_data(influencers, user, pw):
    for i in range(len(influencers)):
        print('Extracting info from ' + influencers[i].decode('utf-8') + '... using ' + user)
        scraper = Scraper(influencers[i], user, pw)
        user_data = scraper.run()
        print('\n' + 'No. of followers scraped for ' + influencers[i].decode('utf-8') +
              ' : ' + str(len(user_data)))
        scraper.close()

        # save data for each user
        file_name = 'data/followers_' + influencers[i].decode('utf-8') + '.pickle'
        with open(file_name, 'wb') as file:
            pickle.dump(user_data, file)

        # track done list of users
        with open('done_list.txt', 'a') as file:
            file.write(influencers[i].decode('utf-8') + '\n')
def update_stock_dates():
    '''Update the list of dates that the stock market was open to include all
    dates up to today.'''
    last_update = Database.get_last_market_date()
    dates = Scraper.scrape_market_dates(start_date=last_update)
    if dates is None:
        return
    if last_update is not None:
        dates = dates[1:]  # first date is already in database
    for date in dates:
        Database.add_market_date(date)
def grab_data(url):
    elements = sc.get_soup(url).find_all()
    query = SoupQuery('table')
    query.has_content('Result')
    query.has_content('Ingredients')
    found = []
    for element in elements:
        if sc.get_query(element, query):
            for item in row_format(element):
                if len(item) < 15:
                    found.append(item)
    cleaned = []
    for row in found:
        clean = cleaners.clean_extra_vals(row)
        clean = cleaners.list_filter(clean, [')', '('])
        if 'v' not in clean:
            cleaned.append(clean)
    return cleaned
def getMDAfromText(filename, text):
    try:
        soup = BeautifulSoup(text, "lxml")
        fullText = scraper.scrapeByAnchorTag(soup)
        if fullText is not None:
            print("{0}\tScraped By Anchor".format(filename))
            return fullText
        fullText = scraper.scrapeByRegex(soup)
        if fullText is not None:
            print("{0}\tScraped By Regex".format(filename))
            return fullText
        if fullText is None:
            print("{0}\tUnable to scrape".format(filename))
            text = ''.join(soup.findAll(text=True))
            text = text.replace("’", "'")  # str.replace returns a new string, so assign it back
            helper.writeToDirectoryFile("debug", filename, text)
        return None
    except UnicodeEncodeError:
        print("{0}\tUnicodeEncodeError".format(filename))
        helper.writeToDirectoryFile("debug", filename, text)
        return None
def __init__(self):
    self.scraper = Scraper.Scraper()
    self.materias = {}  # Dictionary: course_code -> Materia object
    self.nomes_materias = {}  # Dictionary: course_code -> course name
    self.curso_materias = {}  # Dictionary: program_code -> list of that program's courses (alphabetical order)
    self.lista_cursos = []  # List with the names of all programs, stored as (code, name) tuples

    if os.path.exists(DICT_FILE):
        with open(DICT_FILE, "rb") as f:
            self.materias, self.nomes_materias, self.curso_materias, self.lista_cursos = cPickle.load(f)
def get_HTML_article(url_opener, article_file, article_url):
    # Get URL HTML
    print("Getting HTML article from URL: " + article_url)
    html_response = url_opener.open(article_url)

    # Build HTML parser
    soup = BeautifulSoup(html_response)

    # Get the Author
    article_author_obj = soup.find('a', attrs={"rel": "author"})
    if article_author_obj is not None:
        article_author = article_author_obj.contents
        author = str(article_author[0])
        author_stripped = Scraper.string_cleaner(author)
    else:
        author_stripped = "Unknown"
    article_file.write("<author>" + author_stripped + '</author>\n\n')

    # Get the Article body
    article_body = soup.findAll('article')

    # Get all paragraphs + clean redundant chars
    article_file.write("<content>" + "\n")
    try:
        for article in article_body:
            for paragraph in article.findAll('p'):
                stripped_p = Scraper.string_cleaner(paragraph)
                article_file.write(stripped_p + "\n")
    except:
        return False
    article_file.write("</content>" + "\n")
    return True
    # Get next page - Currently disabled
def get_HTML_article(url_opener, article_file, article_url):
    # Get URL HTML
    print("Getting HTML article from URL: " + article_url)
    html_response = url_opener.open(article_url)

    # Build HTML parser
    soup = BeautifulSoup(html_response)

    # Get the Author
    article_author_obj = soup.find('span', attrs={"itemprop": "name"})
    if article_author_obj is not None:
        article_author = article_author_obj.contents
        author_to_parse = article_author[0].split(",", 1)
        author = re.sub(r'\\n', '', str(author_to_parse[0])).strip()
        author_stripped = Scraper.string_cleaner(author)
    else:
        author_stripped = "Unknown"
    article_file.write("<author>" + author_stripped + '</author>\n\n')

    # Get the Article body
    article_body = soup.find(attrs={"itemprop": "articleBody"})

    # Get all paragraphs + clean redundant chars
    article_file.write("<content>" + "\n")
    try:
        for paragraph in article_body.findAll('p'):
            stripped_p = Scraper.string_cleaner(paragraph)
            article_file.write(stripped_p + "\n")
    except:
        return False
    article_file.write("</content>" + "\n")
    return True
    # Get next page - Currently disabled
def executeScript(self):
    scraper = Scraper()
    tempName = self.animeName.get()
    tempName = tempName.split()
    tempName.append('episode')
    animeName = ""
    for i in tempName:
        animeName = animeName + i + "-"
    permAnimeName = animeName
    self.count = 1
    a = True
    while a:
        animeName = animeName + str(self.count)
        a = scraper.getHTMLTags("dummy string", animeName)
        animeName = permAnimeName
        self.count += 1
    self.writeToCSV()
def check_yesterday_fanduel(playerMap):
    yesterdayDate = datetime.date.today() - datetime.timedelta(days=1)
    with open("final_preds.txt", "r") as f:
        resultList = json.loads(f.readline())
    with open("final_predList.txt", "r") as f:
        predList = json.loads(f.readline())
    with open("yesterday_results.txt", "w") as f:
        totalPred = 0
        totalActual = 0
        totalCost = 0
        for i in range(0, len(resultList[0])):
            name = resultList[3][i]
            points = resultList[1][i]
            position = resultList[0][i]
            cost = resultList[2][i]
            totalPred += points
            totalCost += cost
            # print(name)
            playeridStr = Scraper.playername_to_id(str(name))
            # print(playeridStr)
            # print(type(playeridStr))
            gameOrderedDict = playerMap[playeridStr]
            lastGameStats = gameOrderedDict[next(reversed(gameOrderedDict))]
            predictedStatsList = predList[name]
            if (lastGameStats[0] != yesterdayDate.month or
                    lastGameStats[1] != yesterdayDate.day or
                    lastGameStats[2] != yesterdayDate.year):
                f.write(name + " might have been injured or did not play\n")
                f.write(name + " (" + position + ") was projected for " + str(points) +
                        " points at " + str(cost) + " cost and actually got " + str(0) + "\n")
            else:
                f.write(json.dumps([float("{0:.2f}".format(x)) for x in predictedStatsList]) + "\n")
                statsList = lastGameStats[12:]
                f.write(json.dumps(statsList) + "\n")
                actual_fanduel = Util.calc_fanduel_points(statsList)
                totalActual += actual_fanduel
                f.write(name + " (" + position + ") was projected for " + str(points) +
                        " points at " + str(cost) + " cost and actually got " +
                        str(actual_fanduel) + "\n")
            f.write("\n")
        f.write("Total Predicted points is " + str(totalPred) + " at " + str(totalCost) +
                " cost, and total actual points is " + "{0:.2f}".format(totalActual))
def find_courses():
    username = "******"
    password = get_password()
    driver = Driver('firefox')
    driver.home()
    assert (driver.login(username, password))
    driver.goto_menu("Onderwijs")
    driver.fillout()
    scraper = Scraper(driver)
    scraper.find_courses()
    scraper.print_courses()
    scraper.find_courses_names()
    scraper.scrape_course_elements()
    driver.shutdown()
    with open("courses.dat", 'wb+') as courses_file:
        pickle.dump(scraper.courses, courses_file)
    for course in scraper.courses:
        print(course)
    return scraper.courses
def gen_description_and_fanduel_map(dict, csvFileName):
    playerList = []
    pred_statList = {}
    with open("final.txt", "w") as f:
        fanduel_data_arr = Util.fanduel_scrape(csvFileName)
        for playerid, statList in dict.items():
            name = Scraper.playerid_to_playerName(str(int(playerid)))
            # print(name)
            if name in fanduel_data_arr["Name"].as_matrix():
                [row] = fanduel_data_arr.loc[fanduel_data_arr['Name'] == name].as_matrix()
                position = row[1]
                fanduelAvg = row[4]
                cost = row[6]
                injured = row[10]
                predicted = Util.calc_fanduel_points(statList)
                # print(type(statList))
                pred_statList[name] = statList.tolist()
                # print(row)
                f.write(name + ": [" +
                        "{0:.2f}".format(statList[0]) + " mins, " +
                        "{0:.2f}".format(statList[1]) + "/" + "{0:.2f}".format(statList[2]) + " fg, " +
                        "{0:.2f}".format(statList[3]) + "/" + "{0:.2f}".format(statList[4]) + " 3p, " +
                        "{0:.2f}".format(statList[5]) + "/" + "{0:.2f}".format(statList[6]) + " ft, " +
                        "{0:.2f}".format(statList[7]) + " dreb, " +
                        "{0:.2f}".format(statList[8]) + " oreb, " +
                        "{0:.2f}".format(statList[9]) + " reb, " +
                        "{0:.2f}".format(statList[10]) + " ast, " +
                        "{0:.2f}".format(statList[11]) + " stl, " +
                        "{0:.2f}".format(statList[12]) + " blk, " +
                        "{0:.2f}".format(statList[13]) + " TO, " +
                        "{0:.2f}".format(statList[14]) + " PF, " +
                        "{0:.2f}".format(statList[15]) + " +/-, " +
                        "{0:.2f}".format(statList[16]) + " pts] FANDUEL: " +
                        "{0:.2f}".format(predicted) + ", " + position + ", " +
                        str(cost) + ", " + "{0:.2f}".format(fanduelAvg) + "\n")
                if injured != "GTD" and injured != "O":
                    playerList.append([position, predicted, cost, name])
    writeFinal_predList(pred_statList)
    # writePlayerIDDict(playerIDDict)
    return playerList
def listen():
    # (recognizer / microphone audio capture set up above this point; truncated in the source)
    try:
        return recognizer.recognize_sphinx(audio)
    except speech_recognition.UnknownValueError:
        print("Could not understand audio")
    except speech_recognition.RequestError as e:
        print("Recog Error; {0}".format(e))
    return ""


print 'Say an emotion'
command = listen()
print command
i = True
while i == True:
    if command == "smile":
        i = False  # added: every other branch ends the loop; without this "smile" repeats forever
        print Scraper.getimage("smiling-face-with-open-mouth-and-smiling-eyes")
        print "\n:)"
    elif command == "sad":
        i = False
        print Scraper.getimage("crying-face")
        print "\n:("
    elif command == "excited":
        i = False
        print Scraper.getimage("jack-o-lantern")
        print "\n:O"
    elif command == "upset":
        i = False
        print Scraper.getimage("angry-face")
        print "\n>:("
    else:
        print command
import csv
import datetime  # needed for datetime.date below
import json  # needed for json.dumps below

import Optimize
import ML
import Util
import ReadWriteFiles
import Scraper

# print("Reading previously stored player-stats map")
(lastModifiedDate, currentMap) = ReadWriteFiles.readPlayerStatsFile()
isUpdated = (lastModifiedDate == datetime.date.today())

print("Getting data about players playing today")
today_playerMap = Scraper.create_todays_playerMap()
projStarters = Scraper.getProjStarters()
today_playerMap = Util.addStarting(today_playerMap, projStarters)
print(json.dumps(today_playerMap))

(lastModifiedDate, currentMap) = ReadWriteFiles.readPlayerStatsFile()
injuredTodayMap = Scraper.getInjuredPlayers()
injuredIDMap = ReadWriteFiles.readInjuredIDMap()

if not isUpdated:
    print("Creating Player Map")
def generate_features(currentMap, today_stats, injuredIDMap, injuredTodayMap):
    # print(type(currentMap))
    # print(today_stats)
    # featureList = OrderedDict(list)
    trainingFeatureList = deque([])
    testingFeatureList = deque([])
    todayFeatureList = deque([])
    completeFeatureMap = defaultdict(OrderedDict)
    allGameIDs = set()

    for playerid, orderedDict in currentMap.items():
        # prevGameIds = deque([])
        # 19 stats for each game
        seasonGameStatsTotals = [0] * 21
        prevGameStats = [0] * 21
        # use deques with max size to keep track of most recent n games
        prev2GamesStats = deque([], 2)
        prev3GamesStats = deque([], 3)
        prev5GamesStats = deque([], 5)
        prev10GamesStats = deque([], 10)
        prev20GamesStats = deque([], 20)
        prev2GamesStats.append([0] * 21)
        prev3GamesStats.append([0] * 21)
        prev5GamesStats.append([0] * 21)
        prev10GamesStats.append([0] * 21)
        prev20GamesStats.append([0] * 21)
        count = 0
        # need how many games stats each player has
        # split 80%-20% train-test
        # best to split by time
        # first 4/5 of (games-first) = train, rest for test
        gamesForPlayer = len(orderedDict)
        for gameid, statList in orderedDict.items():
            allGameIDs.add(gameid)
            # count represents how many games the player has previously played
            gameFeature = [int(playerid)] + [int(gameid)] + statList[:8] + [count]
            gameFeature += prevGameStats
            gameFeature += Util.avgStats(prev2GamesStats)
            gameFeature += Util.avgStats(prev3GamesStats)
            gameFeature += Util.avgStats(prev5GamesStats)
            gameFeature += Util.avgStats(prev10GamesStats)
            gameFeature += Util.avgStats(prev20GamesStats)
            gameFeature += (np.array(seasonGameStatsTotals) / max(count, 1)).tolist()
            if count <= 0.8 * (gamesForPlayer - 1):
                trainingFeatureList.append(gameFeature)
            else:
                testingFeatureList.append(gameFeature)
            # print("HERE gameid " + str(gameid))
            completeFeatureMap[playerid][gameid] = gameFeature
            # print(len(gameFeature))
            # if(len(gameFeature) != 158):
            #     print(gameFeature)
            count += 1
            # prevGameIds += [gameid]
            prevGameStats = statList[8:]
            prev2GamesStats.append(statList[8:])
            prev3GamesStats.append(statList[8:])
            prev5GamesStats.append(statList[8:])
            prev10GamesStats.append(statList[8:])
            prev20GamesStats.append(statList[8:])
            seasonGameStatsTotals = [x + y for x, y in zip(seasonGameStatsTotals, statList[8:])]
        if playerid in today_stats:
            (todayGameid, statsList) = today_stats[playerid].popitem()
            feature = ([int(playerid)] + [int(todayGameid)] + statsList[:8] + [count] +
                       prevGameStats + Util.avgStats(prev2GamesStats) +
                       Util.avgStats(prev3GamesStats) + Util.avgStats(prev5GamesStats) +
                       Util.avgStats(prev10GamesStats) + Util.avgStats(prev20GamesStats) +
                       (np.array(seasonGameStatsTotals) / count).tolist())
            todayFeatureList.append(feature)

    for feature in todayFeatureList:
        todayGameid = str(feature[1])
        ownTeamNum = feature[6]
        injuredList = injuredTodayMap[ownTeamNum]
        injuredListFeatures = []
        if len(injuredList) == 0:
            injuredListFeatures = awayInjuredListFeatures = np.zeros((1, 148))
        else:
            for injuredName in injuredList:
                injuredID = Scraper.playername_to_id(injuredName)
                for gameid in reversed(list(completeFeatureMap[injuredID].keys())):
                    # get the last features that the injured player had
                    if gameid <= todayGameid:
                        gameStatsList = completeFeatureMap[injuredID][gameid]
                        # weight = gameStatsList[10]
                        injuredListFeatures.append(gameStatsList[10:])
                        # print(len(gameStatsList[10:]))
                        break
            injuredListFeatures = np.array(injuredListFeatures)
        # print(injuredListFeatures.shape)
        meanInjuredStats = np.mean(injuredListFeatures, 0)
        stdInjuredStats = np.std(injuredListFeatures, 0)
        feature += (meanInjuredStats.tolist() + stdInjuredStats.tolist())
    # print(list(todayFeatureList))

    injuredMap = {}
    for currentGameID in allGameIDs:
        # create injury features
        # print(currentGameID)
        # print(type(currentGameID))
        # for both the away team and home team
        awayInjuredIDList = injuredIDMap[currentGameID][0]
        awayInjuredListFeatures = []
        for awayInjuredID in awayInjuredIDList:
            # print(type(completeFeatureMap[injuredID].keys()))
            # print("new awayInjuredID " + str(awayInjuredID))
            for gameid in reversed(list(completeFeatureMap[awayInjuredID].keys())):
                # print(gameid)
                # print(type(gameid))
                # get the last features that the injured player had
                if gameid <= currentGameID:
                    gameStatsList = completeFeatureMap[awayInjuredID][gameid]
                    # weight = gameStatsList[10]
                    awayInjuredListFeatures.append(gameStatsList[10:])
                    # print(len(gameStatsList[10:]))
                    # print(awayInjuredID + " " + currentGameID)
                    # print(gameStatsList)
                    break
        if len(awayInjuredListFeatures) == 0:
            awayInjuredListFeatures = np.zeros((1, 148))
        else:
            awayInjuredListFeatures = np.array(awayInjuredListFeatures)
        # print(injuredListFeatures.shape)
        awayMeanInjuredStats = np.mean(awayInjuredListFeatures, 0)
        awayStdInjuredStats = np.std(awayInjuredListFeatures, 0)
        # print(awayMeanInjuredStats.shape)
        # print(awayStdInjuredStats.shape)

        homeInjuredIDList = injuredIDMap[currentGameID][1]
        homeInjuredListFeatures = []
        for homeInjuredID in homeInjuredIDList:
            # print(type(completeFeatureMap[injuredID].keys()))
            # print(reversed(list(completeFeatureMap[homeInjuredID].keys())))
            for gameid in reversed(list(completeFeatureMap[homeInjuredID].keys())):
                # get the last features that the injured player had
                if gameid <= currentGameID:
                    gameStatsList = completeFeatureMap[homeInjuredID][gameid]
                    # weight = gameStatsList[10]
                    homeInjuredListFeatures.append(gameStatsList[10:])
                    # print(len(gameStatsList[10:]))
                    # print(homeInjuredID + " " + currentGameID)
                    # print(gameStatsList)
                    break
        if len(homeInjuredListFeatures) == 0:
            homeInjuredListFeatures = np.zeros((1, 148))
        else:
            homeInjuredListFeatures = np.array(homeInjuredListFeatures)
        # print(injuredListFeatures.shape)
        homeMeanInjuredStats = np.mean(homeInjuredListFeatures, 0)
        homeStdInjuredStats = np.std(homeInjuredListFeatures, 0)
        # print(homeMeanInjuredStats.shape)
        # print(homeStdInjuredStats.shape)

        injuredMap[currentGameID] = (awayMeanInjuredStats.tolist() + awayStdInjuredStats.tolist(),
                                     homeMeanInjuredStats.tolist() + homeStdInjuredStats.tolist())
    # print(injuredMap)

    # add injury features to previously computed features
    for gameFeature in list(trainingFeatureList):
        gameid = gameFeature[1]
        isAway = gameFeature[8]
        # print("HERE: " + str(gameid))
        gameFeature += injuredMap[str(gameid)][isAway]
    for gameFeature in list(testingFeatureList):
        gameid = gameFeature[1]
        isAway = gameFeature[8]
        # print("HERE: " + str(gameid))
        gameFeature += injuredMap[str(gameid)][isAway]

    return (np.array(list(trainingFeatureList)), np.array(list(testingFeatureList)),
            np.array(list(todayFeatureList)))
# See LICENSE.

from sys import stdout

from Stock import Stock
import Scraper
import Rankings
import Fixer
import Writer

# Scrape data from FINVIZ. Certain presets have been established (see direct
# link for more details)
url = 'http://finviz.com/screener.ashx?v=152&f=cap_smallover&' + \
    'ft=4&c=0,1,2,6,7,10,11,13,14,45,65'
html = Scraper.importHtml(url)

# Parse the HTML for the number of pages from which we'll pull data
nPages = -1
for line in html:
    if line[0:40] == '<option selected="selected" value=1>Page':
        # Find indices
        b1 = line.index('/') + 1
        b2 = b1 + line[b1:].index('<')
        # Number of pages containing stock data
        nPages = int(line[b1:b2])
        break

# Create a database containing all stocks
stocks = []
#!/bin/python2
import csv

import Scraper


def findDII(margin):
    dii = (-11.0 / 24) * margin + (2411.0 / 24)
    return dii


# Get all the vote data from the Scraper script
rawActions = Scraper.getActions()

# First we find the data for all votes
actions = Scraper.filterActions(actions=rawActions, billsOnly=False, passedOnly=False)
margins = Scraper.getMargins(actions)
leg, votes = Scraper.getVoteDict(actions)

mccarthyID = "M001165"
mccarthyVotes = votes[mccarthyID]

repubs = {}
for l in leg:
    if leg[l]['Party'] == 'D':
        pass
    else:
        repubs[l] = leg[l]

diisum = 0
totalVotes = 0
votesAgainst = 0
for b in votes[l]:
    if votes[l][b] == 0:
        pass
    elif votes[l][b] != mccarthyVotes[b]:
        diisum += findDII(margins[b])
        votesAgainst += 1
# Iterate the submissions
for sub in submissions:
    try:
        # Accept only posts with enough votes
        if sub.score < VOTE_TRESHHOLD:
            continue

        # Open file with article id as the name
        article_file = open(str(sub_reddit) + "\\" + str(article_id), 'w+', newline="\n")
        article_file.write("<article>\n")
        article_file.write("<sub-reddit>" + sub_reddit + "</sub-reddit>\n")
        article_file.write("<news-paper>" + sub.domain + "</news-paper>\n")
        article_file.write("\n")
        stripped_title = Scraper.string_cleaner(sub.title)
        article_file.write("<title>" + stripped_title + "</title>\n")

        # Get the article content
        if SUPPORTED_NEWS_SITES[0] in sub.domain:
            success = Scraper.ny_times.get_HTML_article(url_opener, article_file, sub.url)
        elif SUPPORTED_NEWS_SITES[1] in sub.domain:
            success = Scraper.usa_today.get_HTML_article(url_opener, article_file, sub.url)
        elif SUPPORTED_NEWS_SITES[2] in sub.domain:
            success = Scraper.washington_post.get_HTML_article(url_opener, article_file, sub.url)
        else:
            success = False

        # Close the XML file
        article_file.write("</article>\n")

        # Found articles counter
for result in results_obj:
    try:
        # Get ID
        try:
            res_id = result['data-fullname']
        except:
            continue

        # Get Entry
        entry_obj = result.find('div', attrs={"class": "entry unvoted"})

        # Get title
        title_obj = (entry_obj.find('p', attrs={"class": "title"})).find('a', attrs={"class": "title "})
        title_parsed = title_obj.contents
        url = title_obj['href']
        title = Scraper.string_cleaner(str(title_parsed[0]))

        # Get Domain
        domain_obj = entry_obj.find('p', attrs={"class": "title"})
        span_obj = domain_obj.find('span', attrs={"class": "domain"})
        domain_parsed = span_obj.find('a').contents
        domain = Scraper.string_cleaner(str(domain_parsed[0]))

        # Subreddit
        tagline = entry_obj.find('p', attrs={"class": "tagline"})
        hover = tagline.find('a', attrs={"class": "subreddit hover"})
        subredd_parsed = hover.contents
        subredd = Scraper.string_cleaner(str(subredd_parsed[0]))
        subredd = subredd[:-1]

        # Score
#!/bin/python2
import csv
import json
import urllib2

import Scraper

mccarthyID = "M001165"
pelosiID = "P000197"


def findDII(margin):
    dii = (-11.0 / 24) * margin + (2411.0 / 24)
    return dii


# Get all the vote data from the Scraper script
rawActions = Scraper.getActions()

actionsAll = Scraper.filterActions(actions=rawActions, billsOnly=False, passedOnly=False)
marginsAll = Scraper.getMargins(actionsAll)
legAll, votesAll = Scraper.getVoteDict(actionsAll)
mccarthyVotes = votesAll[mccarthyID]
pelosiVotes = votesAll[pelosiID]

actionsBill = Scraper.filterActions(actions=rawActions, billsOnly=True, passedOnly=False)
marginsBill = Scraper.getMargins(actionsBill)
legBill, votesBill = Scraper.getVoteDict(actionsBill)

congress = {}
for l in legAll:
    if legAll[l]['Party'] == 'D':
        congress[l] = legAll[l]

diiAllSum = 0
totalAllVotes = 0