def generate():
    "----------Start up the course times------------"
    autotable = AutoTable()
    scraper = Scraper(autotable)
    autotable = scraper.build_table()
    #builder = Builder(autotable)
    #autotable = builder.build_table()
    start_time = time.time()
    print("Year Space")
    year_solutions = generate_semester(autotable.year.courses)
    if len(year_solutions) == 0:
        optimal_solution = fall_winter_merge(autotable.fall.courses,
                                             autotable.winter.courses)
    else:
        optimal_solution = year_fall_winter_merge(year_solutions,
                                                  autotable.fall.courses,
                                                  autotable.winter.courses)

    print("Fall")
    for day in optimal_solution[0][0]:
        for timeslot in day:
            print(timeslot)
    print("Winter")
    for day in optimal_solution[0][1]:
        for timeslot in day:
            print(timeslot)
    print("Fall Distance: " + str(optimal_solution[1]) +
          "    Winter Distance: " + str(optimal_solution[2]))
    print("--- Full algorithm %s seconds ---" % (time.time() - start_time))
    root = Tk()
    gui1 = MyFirstGUI(optimal_solution[0][0], "Fall", root)
    root.mainloop()
    root = Tk()
    gui2 = MyFirstGUI(optimal_solution[0][1], "Winter", root)
    root.mainloop()
Example #2
def generate():
    "----------Start up the course times------------"
    
    autotable = AutoTable()
    scraper = Scraper(autotable)
    autotable = scraper.build_table()
    #builder = Builder(autotable)
    #autotable = builder.build_table()
    start_time = time.time()
    "----------Get all Fall Timetables------------"
    courses = list(autotable.fall.courses)  # copy so extend() below does not mutate autotable.fall.courses
    courses.extend(autotable.year.courses)
    space1 = autotable.solution_space(courses)
    "----------Get all Winter Timetables------------"
    courses = list(autotable.winter.courses)  # copy so extend() below does not mutate autotable.winter.courses
    courses.extend(autotable.year.courses)
    space2 = autotable.solution_space(courses)
    "-----------Combine fall and winter-------------"
    
    listed = autotable.index_year_courses(autotable.year.courses)
    compatible = autotable.construct_year(space1,space2,listed)
    print("Fall:")
    for section in compatible[0][0][0]:
        print(section.name)
        print(section)
    print("Winter:")
    for section in compatible[0][1][0]:
        print(section.name)
        print(section)
    print("Distance: "+str(compatible[0][2]))
    
    print("--- %s seconds ---" % (time.time() - start_time))    
def main():
    db = connect_to_db()

    if not companies_coll_exists(db):
        logger.info(
            "The 'companies' collection does not exist. Starting to scrape the company list."
        )
        dow_30_companies = Scraper.dow_30_companies_func()
        insert_companies(db, dow_30_companies)

    logger.info('Current working directory: ' + os.getcwd())

    with open('ECT/cookies.txt', 'r') as f:
        cookie = f.readline()
        logger.info(f'Using cookie: {cookie}')
    # get new links based on the tracking company list
    links_df = get_company_links(db, cookie)

    # get archived links
    link_set = get_archived_links(db)

    raw_df = Scraper.data_companies(links_df, cookie, link_set)

    count = 0
    while not Scraper.is_raw_df_all_good(raw_df) and count <= 3:  # retry no more than 3 times
        raw_df = Scraper.rerun_companies(raw_df, cookie)
        count += 1
        logger.info(f'rerun count: {count}')

    # insert raw
    insert_raw_data(db, raw_df, coll_name="raw")

    # process raw transcripts
    process_raw(db)
Example #4
def add_new_vacancy(vacancy_for_db, site):

    collect = MongoClient('localhost', 27017)[vacancy_for_db][site]
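    # vacancy_for_db names the Mongo database; each job board ('hh' / 'sj') gets its own collection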
    lst_vacancy = []

    if site == 'hh':

        lst_vacancy = Sc.parse_hh(vacancy_for_db)[0]

    elif site == 'sj':

        lst_vacancy = Sc.parse_sj(vacancy_for_db)[0]

    for itm in lst_vacancy:

        # 'Ссылка' (link) becomes the document _id; 'Зарплата мин'/'Зарплата мкс' are min/max salary, 'Валюта' is the currency
        itm['_id'] = itm.pop('Ссылка')
        existing = collect.find_one({'_id': itm['_id']})

        if existing is None:

            collect.insert_one(itm)

        elif existing != itm:

            collect.update_one({'_id': itm['_id']},
                               {'$set': {'Зарплата мин': itm['Зарплата мин'],
                                         'Зарплата мкс': itm['Зарплата мкс'],
                                         'Валюта': itm['Валюта']}})

    return True
def assert_stock_prices():
    '''Make sure that every market date after a stock's first data date has daily data for that stock. The scraper
        often leaves holes in the data. If a hole is found, look for manually downloaded data in the old Yahoo database
        and in ./manualdata/<ticker>.csv. If none exists, fill in the data the way Yahoo does and log an error.'''
    stocks = Database.get_stocks()
    last_assertion_day = StockData.createSDate("2017-07-25")
    market_dates = Database.get_market_dates()
    for stock in stocks:
        for date in market_dates:
            if last_assertion_day >= date:
                continue
            if date.day_number >= stock.first_data_date.day_number:
                if Database.get_dailydata(stock, date=date) is None:
                    dd = Scraper.scrape_dailydata(stock, date,
                                                  date)  # try scraper again
                    if dd is not None:
                        if len(dd) != 0:
                            dd = dd[0]  # take it out of array
                        else:
                            dd = None  # no data found
                    if dd is None:  # try old Yahoo database
                        dd = Database.get_Yahoo_dailydata(stock, date)
                    if dd is None:  # try manual csv's
                        dd = Scraper.get_manual_dailydata(stock, date)
                    if dd is None:  # nothing left to try, throw error
                        # add the previous day's close to all values and volume to 0. This is what Yahoo does.
                        prev = Database.get_dailydata(stock,
                                                      date.getPrevious())
                        dd = StockData.SDailyData(stock, date, prev.close,
                                                  prev.close, prev.close,
                                                  prev.close, 0)
                        Log.log_error(
                            "No data found for {} on {}. Added pseudo values copied from previous day. Check manually to make sure daily data doesn't exist."
                            .format(stock, date))
                    Database.add_dailydata(dd)
def update_index_stocks():
    '''Update the list of stocks in the database to include all stocks in the
        DJI, S&P500, NASDAQ, NYSE, etc... and update those stocks to have the
        correct list of indices associated with them.'''
    indices = []
    sp500 = Scraper.scrape_SP500()
    dji = Scraper.scrape_DJI()
    #nasdaq = Scraper.scrape_NASDAQ()
    misc = Scraper.scrape_misc()
    indices.append(sp500)
    indices.append(dji)
    #indices.append(nasdaq)
    indices.append(misc)
    if MiscInfo.SHOULD_SCRAPE_NYSE:
        nyse = Scraper.scrape_NYSE()
        indices.append(nyse)
    # add to list of stocks
    for index in indices:
        for stock in index:
            Database.add_stock(stock)
    # update indices listing for stock
    stocks = Database.get_stocks()
    for stock in stocks:  # TODO check stock in ""
        Database.set_indices(stock, "S&P500", stock in sp500)
        Database.set_indices(stock, "DJI", stock in dji)
Example #7
def semresult():
    url = "http://www.fastvturesults.com/check_new_results/"

    collegecode = raw_input("Enter the college\nex:\tRnsit:1rn\t")
    year = raw_input("Enter the year")
    branch = raw_input("Enter the branch code\n")
    fp = FileOps.createnew("Cse6sem.csv")

    BeautifulUsn.gencollege(collegecode, year, branch, url)
    for i in range(120):
        studenturl = BeautifulUsn.gennexturl()
        page = EstablishConnection.openwebpage(studenturl)

        soup = Scraper.page(page)

        resulturl, name = Scraper.semresultlink('6', soup)

        if resulturl != 'none':
            page = EstablishConnection.openwebpage(resulturl)

            soup = Scraper.page(page)

            result = Scraper.getresult(soup, name)

            print result

            FileOps.writestudentresult(fp, result)
Example #8
def test2():
    soup = sc.get_soup('http://terraria.gamepedia.com/Chests')
    table = soup.find('table', class_='inner')
    rows = table.find_all('tr')
    for row in rows:
        content = sc.get_element_content(row, stripped=True)
        return content  # note: returns inside the loop, so only the first row's content is returned
Example #9
def keyword_search(query, amount):
    pmc_ids = [pmc for pmc in Scraper.esearch(query, amount)]
    alltext = [i for i in Scraper.text_grab_multiple(pmc_ids)]
    keywords = [
        i.lower()
        for i in Scraper.get_continuous_chunks(" ".join(alltext), query)
    ]
    return keywords[:7]
Example #10
 def writeToCSV(self):
     scraper = Scraper()
     finishedUltimaList = scraper.getUltimaList()
     f = open("{}.csv".format(self.animeName.get()), 'w', newline='')
     writer = csv.writer(f)
     for row in finishedUltimaList:
         self.progressBar.step(10)
         writer.writerow(row)
     self.progressBar.stop()
     f.close()
Example #12
def main_program():
    root = tk.Tk()
    app = interface(root)

    # TODO: Set up file dialog to handle selecting filepath
    msg = Scraper.open_email(
        r'C:\Users\Couch\Desktop\TimesheetReader\test.msg')

    # Load Excel workbook
    path = app.browse_file_dialog()
    wb = openpyxl.load_workbook(path)
    sheet = wb.active

    # 3 cells per day (Start, Break, Finish) x 7 days (rows D5:F11) = 21 cells
    MAX_CELL_COUNT = len(sheet['D5':'F11'] * 3)

    # Get list of times from email
    # TODO: Fix disgusting regex
    regex = r'\d?\d?\:?\d?\d?\s\w\.\w\.|-'

    times = Scraper.scrape_msg(msg, regex)

    # Create new list to copy times to
    # Append all elements as 0 to prefill data in Excel
    days = []
    for i in range(0, MAX_CELL_COUNT):
        days.append(0)

    times_index = 0
    for i in range(0, MAX_CELL_COUNT):
        if times_index < len(times):
            days[times_index] = str(times[times_index])
            times_index += 1

    # Format times
    days = Scraper.format_times(days)

    Interface.print_status(
        'Copying times to spreadsheet: {0} at path: {1}'.format(
            str(sheet), path))

    # write days data to cells
    i = 0
    for rowOfCells in sheet['D5':'F11']:
        for cell in rowOfCells:
            cell.value = days[i]
            i += 1
        print('\tRow: {0} copied!'.format(str(rowOfCells)))

    wb.save(path)

    Interface.print_status("Completed\n{0}".format('=' * 100))

    root.mainloop()
Example #13
def init():
	global crawler
	global domain
	global bot
	global timeout
	global lag
	global depth
	global emailscan
	global tprint
	
	# Remove previous files
	try:
		shutil.rmtree(".cache")
	except OSError:
		pass
		
	try:
		os.remove("emails.txt")
	except OSError:
		pass
	
	# Process cmd line arguments
	cmdArgs()
	
	# Initialize scraper object
	crawler = Scraper(domain, bot, timeout, lag, depth, emailscan)

	# Pretty print thread
	tprint = threading.Thread(target=ThreadPrettyPrint)
Example #14
def parse_and_store(html_file_path):
    conn = sqlite3.connect('reellog.db')
    c = conn.cursor()

    c.execute("SELECT COUNT(*) from reellog")
    (old_entry_count, ) = c.fetchone()

    to_write = Scraper.scrape(html_file_path)

    for row in to_write:
        command = "INSERT INTO reellog VALUES (%s)" % row
        try:
            c.execute(command)
            print('+ %s' % row)
        except sqlite3.IntegrityError:
            print('= %s' % row)

    conn.commit()

    c.execute("SELECT COUNT(*) from reellog")
    (new_entry_count,) = c.fetchone()

    conn.close()

    print("%i new entries added" % (int(new_entry_count) - int(old_entry_count)))
Example #15
def call_scrape_singlethreaded(source, cities):
    if len(cities) == 0 or cities[0] == '':
        return True
    for destination in cities:
        if source != destination or destination == '':
            results = []
            tries = 0
            for day in get_range_dates(date.today(), DATE_RANGES):
                scraped_data = Scraper.parse(
                    source, destination, day,
                    date.strftime(date.today(), '%m/%d/%y'))
                if scraped_data:
                    results += scraped_data
                    tries = 0
                else:
                    tries += 1
                    if tries >= MAX_ATTEMPTS_SRAPE:
                        print "ERROR: Exceeded Maximum Attempts"
                        return False
            file_name = SCRAPE_DIRECTORY + directory + '/%s-%s-%s-flight-results.json' % (
                date.strftime(date.today(), '%m%d%y'), source, destination)
            # Did not get the results, so return false
            if not results:
                return False
            with open(file_name, 'w') as fp:
                json.dump(results, fp, indent=4)
        # Case where the source = destination, we want to change cities
        else:
            return False
    return True
Example #16
def makeDashboards(symbol, sampleAmount):
    companyList = pd.read_csv("companylist.csv")
    companyRow = companyList[companyList["Symbol"] == symbol]
    retrievedName = companyRow.iat[0, 1]
    retrievedSector = companyRow.iat[0, 7]
    origStock = Scraper.Stock(symbol, retrievedName, retrievedSector)

    #get the comparisons
    compare.experiment(sampleAmount, origStock)
    compareData = pd.read_csv("output.csv")
    compareData = compareData.fillna(0)
    # output to static HTML file

    #get all of the compare stocks
    stockFile = open("stocks.p", "rb")
    stockList = pickle.load(stockFile)

    polarityScript, polarityHtml = polarityDashboard(
        origStock, stockList,
        compareData.sort_values(by=['WIKI_SIMILARITY'], ascending=False))
    biasScript, biasHtml = biasDashboard(
        origStock, stockList,
        compareData.sort_values(by=['WIKI_SIMILARITY'], ascending=False))
    relScript, relHtml = relevanceDashboard(compareData)

    return polarityScript, polarityHtml, biasScript, biasHtml, relScript, relHtml
Example #17
    def crawler_main(self, indice_link):

        source = re.findall(REGEX_SOURCE, self.__url)
        url = self.__url
        parsed = Crawler.crawlear_web(self, url)
        ultima_pagina = (parsed.xpath(XPATH_ULTIMA_PAGINA)[0])

        # convert the string (which uses ',' as a thousands separator) into an integer
        locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
        last_page = int(locale.atof((re.findall(REGEX_ULTIMA_PAGINA, ultima_pagina))[0]))
        cantidad_paginas = math.ceil(last_page / 24)

        indice_pagina = 1  # index used to advance through the result pages

        while cantidad_paginas >= indice_pagina:
            contador = 1
            sigiente_pagina = self.__nuevo_link.format(indice_pagina)
            parsed3 = Crawler.crawlear_web(self, sigiente_pagina)
            unidades_href = parsed3.xpath(XPATH_HREF_UNIDADES)

            for elemento in unidades_href:

                parsed2 = Crawler.crawlear_web(self, elemento)
                instancia_scraper = Scraper.Scraper()
                instancia_scraper.crear_dicc(parsed2, elemento, source, indice_link)

                print("Nuevo Aviso", contador)
                contador += 1

            indice_pagina += 1
            print("Cambio de pagina")
        print("Nueva Url")
Example #18
def runit(keyword):
	me = getBySearchid(keyword)
	print me.keyword
	rawresults = Scraper.scrape(me.keyword)
	results = [s for s in rawresults if s.keys()!=[]] 
	# print results
	for rows in results:
		try:		
			new_result = Results(search_id = keyword,
						fundingNumber = rows["fundingNumber"].strip(), 
						opportunityTitle = rows["opportunityTitle"].strip(),
						Agency = rows["Agency"].strip(),
						openDate = rows["openDate"].strip(),
						CloseDate = rows["closeDate"].strip(),
						attachment = rows["attachment"].strip(),
						link = "http://www.grants.gov/"+rows["link"].strip()
						)
			db.session.add(new_result)
			db.session.commit()
		except Exception:
			print 'goddamnit'
			raise

	# results = Scraper.scrape(keyword)
	# for rows in results:
		# try:
			# cells = rows.find_all("td")

			# print cells[1]
		# except Exception:
			# print 'well  well'
	return redirect(url_for('view'))
Example #19
def addEntry(db):  # add new data
    # request data
    [
        name, usdPrice, eurPrice, percentChange7d, percentChange24h,
        percentChange1h, lastUpdated
    ] = Scraper.getNewData()

    # insert on DB
    db.test.insert({
        "name": name,
        "usdPrice": usdPrice,
        "eurPrice": eurPrice,
        "percentChange7d": percentChange7d,
        "percentChange24h": percentChange24h,
        "percentChange1h": percentChange1h,
        "date": datetime.datetime.fromtimestamp(
            int(lastUpdated)).strftime('%Y-%m-%d %H:%M:%S')
    })
Example #20
def load_stocks(stocks, start, end):
    start_time = time.time()
    session = DatabaseService.setup_db()
    # in_db = session.query(Stock).filter(Stock.ticker.in_(stocks)).filter(Stock.timestamp >= start).filter(Stock.timestamp <= end).all()

    # removed = []
    # for row in in_db:
    #     if row.ticker not in removed:
    #         stocks.remove(row.ticker)
    #         removed.append(row.ticker)

    if len(stocks) > 0:
        panel = Scraper.lookup(stocks, start, end)
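        # Assumes Scraper.lookup returns a pandas Panel: items are the OHLCV fields, major_axis the dates, minor_axis the tickers.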
        for date in panel.major_axis:
            for company in panel.minor_axis:
                frame = panel.loc[:, date, company]

                high = frame["High"]
                low = frame["Low"]
                open = frame["Open"]
                close = frame["Close"]
                vol = frame["Volume"]
                adj_close = frame["Adj Close"]
                s = Stock(company, high, low, open, close, vol, adj_close, date)
                if not s.is_nan():
                    session.add(s)
                    # print("{}: high[{}] low[{}] date[{}]".format(stock, hi, low, date))

        session.commit()
    end_time = time.time()
    print("Time taken to load stocks: %s seconds" % (end_time - start_time))
Example #21
def addStarting(playerMap,projStarters):
    # print(playerMap)
    for playerid,gameMap in playerMap.items():
        for gameid,gameList in gameMap.items():
            isStarting = 1 if Scraper.playerid_to_playerName(playerid) in projStarters else 0
            gameList.append(isStarting)
    return playerMap
Example #22
def get_update_logs(building: Building, filter: Filter):
    apts_now = Scraper.get_apartments(building)
    histos = list(Historizer.load_building(building))
    histos.append(HistoEntry(datetime.datetime.utcnow(), apts_now))
    for i in range(1, len(histos)):
        yield UpdateLog(
            histos[i - 1].date, histos[i].date,
            compare(histos[i - 1].apartments, histos[i].apartments))
Example #23
def yield_building_changes(building: Building, filter: Filter):
    apts_now = Scraper.get_apartments(
        building
    )  # no filter on api because we prefer filtering later (to compare with before)
    last_histo = get_last_snapshot(building)
    if last_histo:
        apts_before = last_histo.apartments
        return yield_changes(apts_before, apts_now, filter)
def item_image(item_name):
    url = 'http://terraria.gamepedia.com/File:' + item_name.replace(
        ' ', '_') + '.png'
    soup = sc.get_soup(url)
    tag = soup.find('a', title=item_name + '.png')
    if tag is not None:
        return tag['href']
    return 'no image'
Example #25
def test3(url):
    elements = sc.get_soup(url).find_all()
    query = SoupQuery('table')
    query.has_content('Result')
    query.has_content('Ingredients')
    query.has_content('Crafting Station')
    query.has_attribute('class', 'inner')
    #parent = SoupQuery('table')
    #query.has_parent(parent)

    found = []
    for element in elements:
        if sc.get_query(element, query):
            if sc.multibox_test(element):
                for item in get_row_data(element):
                    found.append(item)
    return found
Example #26
 def on_click_standard(self):
     Scraper.Scrape(self.Eventname.text(), self.Keywords.text(),
                    self.getLatitude.text(), self.getLongitude.text(),
                    self.getRadius.text(), self.calenderButtonStart.text(),
                    self.startTime.currentText(),
                    self.calenderButtonEnd.text(),
                    self.endTime.currentText(),
                    self.eventNums[self.eventLookup.currentText()])
Example #27
def load_training_stocks(start_sample_time, end_sample_time, start_train_time, end_train_time):
    stocks = []
    with open(get_data_file()) as csv_file:
        reader = csv.DictReader(csv_file)
        tickers = [row['Symbol'] for row in reader]
        #        tickers = random.sample(tickers, MAX_NUM_TICKERS)
        tickers = tickers[0:MAX_NUM_TICKERS]

        # Cols are Symbol, Name, Sector
        try:
            training_data = Scraper.lookup(tickers, start_sample_time, end_sample_time)
            validation_data = Scraper.lookup(tickers, start_train_time, end_train_time)['Close']
            stocks = [TrainingStock.TrainingStock(company, training_data[company], validation_data[company]) for company
                      in tickers]

        except RemoteDataError:
            print("Error reading stock data, skipping")
    return stocks
Example #28
def check_coins(coins):
    if debug: print('*** DEBUG *** check_coins() with parameter '+coins)
    if debug: print('*** DEBUG *** You want to check {0}'.format(coins))
    j = json.loads(Scraper.scrape(coins,'BTC,USD'))
    if 'Response' in j:
        print ("Could not find coin! - "+j['Message'])
        return False
    else:
        if debug: print ("*** DEBUG *** Your coin {0} exists, currently costs {1} BTC, which is the equivalent of ${2}".format(coins,j['BTC'],j['USD']))
        return True
def get_company_links(db, cookies):
    """
    Get links based on the tracking company list
    :param db: db client
    :param cookies: String, http header cookie. Read from local txt file.
    :return: dataframe, links_df from "Scraper.get_links_2(company_list, cookies)"
    """
    company_list = get_company_symbol_list(db)
    links_df = Scraper.get_links_2(company_list, cookies)
    return links_df
Example #30
def update_navigation() -> None:
    scraper = Scraper.Scraper()
    nav_data = scraper.get_all_nav()
    # Subject Names
    json.dump(nav_data["subject_names"], open(make_path(navigation_dir, 'subject_names.json'),'w'))
    # Subject -> Colleges
    json.dump(nav_data["subject_colleges"], open(make_path(navigation_dir, 'subject_colleges.json'),'w'))
    # College -> Subjects
    json.dump(nav_data["college_subjects"], open(make_path(navigation_dir, 'college_subjects.json'),'w'))
    # Errors
    json.dump(nav_data["errors"], open(make_path(error_dir, 'update_nav.json'),'w'))
Example #31
def influencer_bin_data(influencers, user, pw):
    for i in range(len(influencers)):

        print('Extracting info from ' + influencers[i].decode('utf-8') +
              '... using ' + user)
        scraper = Scraper(influencers[i], user, pw)
        user_data = scraper.run()
        print('\n' + 'No. of followers scraped for ' +
              influencers[i].decode('utf-8') + ' : ' + str(len(user_data)))
        scraper.close()

        # save data for each user
        file_name = 'data/followers_' + influencers[i].decode(
            'utf-8') + '.pickle'
        with open(file_name, 'wb') as file:
            pickle.dump(user_data, file)

        # track done list of users
        with open('done_list.txt', 'a') as file:
            file.write(influencers[i].decode('utf-8') + '\n')
def update_stock_dates():
    '''Update the list of dates that the stock market was open to include all dates
        up to today.'''
    last_update = Database.get_last_market_date()
    dates = Scraper.scrape_market_dates(start_date=last_update)
    if dates is None:
        return
    if last_update is not None:
        dates = dates[1:]  # first date is already in database
    for date in dates:
        Database.add_market_date(date)
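
A minimal sketch of a combined refresh job, assuming update_index_stocks(), update_stock_dates() and assert_stock_prices() above belong to the same module (that layout is an assumption, not stated in the code):

# Hypothetical driver; the run order is chosen so dates and tickers exist before price holes are checked.
def daily_refresh():
    update_index_stocks()   # refresh the stock universe and its index membership
    update_stock_dates()    # extend the list of market dates up to today
    assert_stock_prices()   # back-fill any holes in the daily price data

if __name__ == "__main__":
    daily_refresh()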
def grab_data(url):
    elements = sc.get_soup(url).find_all()
    query = SoupQuery('table')
    query.has_content('Result')
    query.has_content('Ingredients')

    found = []
    for element in elements:
        if sc.get_query(element, query):
            for item in row_format(element):
                if len(item) < 15:
                    found.append(item)

    cleaned = []
    for row in found:
        clean = cleaners.clean_extra_vals(row)
        clean = cleaners.list_filter(clean, [')', '('])
        if 'v' not in clean:
            cleaned.append(clean)

    return cleaned
def getMDAfromText(filename,text):
    try:
        soup = BeautifulSoup(text, "lxml")    
        fullText = scraper.scrapeByAnchorTag(soup)
        if fullText is not None:
            print("{0}\tScraped By Anchor".format(filename))
            return fullText  
        fullText = scraper.scrapeByRegex(soup)
        if fullText is not None:
            print("{0}\tScraped By Regex".format(filename))
            return fullText
        if fullText is None:    
            print("{0}\tUnable to scrape".format(filename))
            text = ''.join(soup.findAll(text=True))
            text = text.replace("&#146;", "'")  # reassign: str.replace() returns a new string
            helper.writeToDirectoryFile("debug", filename, text)
        return None
    except UnicodeEncodeError:
        print("{0}\tUnicodeEncodeError".format(filename))
        helper.writeToDirectoryFile("debug",filename,text)   
        return None
Example #35
 def __init__(self):
     self.scraper = Scraper.Scraper()
     self.materias = {}  # Dictionary: course code -> Materia object
     self.nomes_materias = {}  # Dictionary: course code -> course name
     self.curso_materias = {}  # Dictionary: program code -> list of that program's courses (alphabetical order)
     self.lista_cursos = []  # List of every program name, stored as (code, name) tuples
     if os.path.exists(DICT_FILE):
         with open(DICT_FILE, "rb") as f:
             self.materias, self.nomes_materias, self.curso_materias, self.lista_cursos = cPickle.load(f)
def get_HTML_article(url_opener, article_file, article_url):    
    
    'Get URL HTML'
    print ("Getting HTML article from URL:   " + article_url)
    html_response=url_opener.open(article_url)
    
    'Build HTML parser'     
    soup = BeautifulSoup(html_response)
    
    'Get the Author'
    article_author_obj=soup.find('a', attrs={"rel": "author"})
    if article_author_obj is not None:
        article_author= article_author_obj.contents
        author = str(article_author[0])
        author_stripped = Scraper.string_cleaner(author)
    else :
        author_stripped = "Unknown"
        
    article_file.write("<author>" + author_stripped  + '</author>\n\n')
    
    'Get the Article body'
    article_body=soup.findAll('article')
    
    'Get all paragraphs + clean redundant chars'
    article_file.write("<content>" + "\n")
    
    try:
        for article in article_body:
            for paragraph in article.findAll('p'):
                stripped_p = Scraper.string_cleaner(paragraph)
                article_file.write(stripped_p + "\n")
    except:
        return False

    article_file.write("</content>" + "\n")
    
    return True
    'Get next page - Currently disabled '
def get_HTML_article(url_opener, article_file, article_url):    
    
    'Get URL HTML'
    print ("Getting HTML article from URL:   " + article_url)
    html_response=url_opener.open(article_url)
    
    'Build HTML parser'     
    soup = BeautifulSoup(html_response)
    
    'Get the Author'
    article_author_obj=soup.find('span', attrs={"itemprop": "name"})
    if article_author_obj is not None:
        article_author= article_author_obj.contents
        author_to_parse = article_author[0].split(",", 1)
        author = re.sub(r'\\n', '', str(author_to_parse[0])).strip()
        author_stripped = Scraper.string_cleaner(author)
    else :
        author_stripped = "Unknown"
  
    article_file.write("<author>" + author_stripped +'</author>\n\n')
         
    'Get The Article body'
    article_body=soup.find(attrs={"itemprop": "articleBody"})
    
    'Get all paragraphs + clean redundant chars'
    article_file.write("<content>" + "\n")
    try:
        for paragraph in article_body.findAll('p'):
            stripped_p = Scraper.string_cleaner(paragraph)
            article_file.write(stripped_p + "\n")
    except:
        return False
                  
    article_file.write("</content>" + "\n")
    
    return True

    'Get next page - Currently disabled '
Example #38
    def executeScript(self):
        scraper = Scraper()

        tempName = self.animeName.get()
        tempName = tempName.split()
        tempName.append('episode')

        animeName = ""
        for i in tempName:
            animeName = animeName + i + "-"
        permAnimeName = animeName

        self.count = 1
        a = True

        while a:
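            # Build "<name>-episode-N" and probe it; a falsy return from getHTMLTags() ends the loop (presumably no such episode page).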
            animeName = animeName + str(self.count)
            a = scraper.getHTMLTags("dummy string", animeName)
            animeName = permAnimeName

            self.count += 1

        self.writeToCSV()
def check_yesterday_fanduel(playerMap):
    yesterdayDate = datetime.date.today()-datetime.timedelta(days=1)
    with open("final_preds.txt","r") as f:
        resultList = json.loads(f.readline())
    with open("final_predList.txt","r") as f:
        predList = json.loads(f.readline())
    with open("yesterday_results.txt","w") as f:
        totalPred = 0
        totalActual = 0
        totalCost = 0
        for i in range(0,len(resultList[0])):
            name = resultList[3][i]
            points = resultList[1][i]
            position = resultList[0][i]
            cost= resultList[2][i]

            totalPred += points
            totalCost += cost

            #print(name)
            playeridStr = Scraper.playername_to_id(str(name))
            #print(playeridStr)
           # print(type(playeridStr))

            gameOrderedDict = playerMap[playeridStr]


            lastGameStats = gameOrderedDict[next(reversed(gameOrderedDict))]

            predictedStatsList = predList[name]
            

            if(lastGameStats[0] != yesterdayDate.month or lastGameStats[1] != yesterdayDate.day or lastGameStats[2] != yesterdayDate.year):
                f.write(name + " might have been injured or did not play\n")
                f.write(name + " (" + position + ") was projected for " + str(points) + " points at " + str(cost) + " cost and actually got " + str(0) + "\n")
            else:
                f.write(json.dumps([float("{0:.2f}".format(x)) for x in predictedStatsList])+"\n")
                statsList = lastGameStats[12:]
                f.write(json.dumps(statsList)+"\n")
                actual_fanduel = Util.calc_fanduel_points(statsList)
                totalActual += actual_fanduel
                f.write(name + " (" + position + ") was projected for " + str(points) + " points at " + str(cost) + " cost and actually got " + str(actual_fanduel) + "\n")
            f.write("\n")
        f.write("Total Predicted points is " + str(totalPred) + " at " + str(totalCost) + " cost, and total actual points is " + "{0:.2f}".format(totalActual)) 
Example #40
def find_courses():
    username = "******"
    password = get_password()

    driver = Driver('firefox')
    driver.home()
    assert (driver.login(username, password))
    driver.goto_menu("Onderwijs")
    driver.fillout()

    scraper = Scraper(driver)
    scraper.find_courses()
    scraper.print_courses()
    scraper.find_courses_names()
    scraper.scrape_course_elements()
    driver.shutdown()

    with open("courses.dat", 'wb+') as courses_file:
        pickle.dump(scraper.courses, courses_file)

    for course in scraper.courses:
        print(course)

    return scraper.courses
def gen_description_and_fanduel_map(dict,csvFileName):
    playerList = []
    pred_statList = {}

    with open("final.txt","w") as f:
        fanduel_data_arr = Util.fanduel_scrape(csvFileName)
    
        for playerid, statList in dict.items():
            name = Scraper.playerid_to_playerName(str(int(playerid)))
            #print(name)
        
            if(name in fanduel_data_arr["Name"].as_matrix()):
                [row] = fanduel_data_arr.loc[fanduel_data_arr['Name'] == name].as_matrix()
                position = row[1]
                fanduelAvg = row[4]
                cost = row[6]
                injured = row[10]

                predicted = Util.calc_fanduel_points(statList)

                #print(type(statList))

                pred_statList[name] = statList.tolist()
            
            
                #print(row)
                f.write( name + ": [" + "{0:.2f}".format(statList[0]) + " mins, " + "{0:.2f}".format(statList[1]) + "/" + 
                        "{0:.2f}".format(statList[2]) + " fg, " + "{0:.2f}".format(statList[3]) + "/" +  "{0:.2f}".format(statList[4]) + " 3p, "
                        + "{0:.2f}".format(statList[5]) + "/" +  "{0:.2f}".format(statList[6]) + " ft, " + "{0:.2f}".format(statList[7]) + " dreb, " + 
                        "{0:.2f}".format(statList[8]) + " oreb, " + "{0:.2f}".format(statList[9]) + " reb, " + "{0:.2f}".format(statList[10]) + " ast, " +
                        "{0:.2f}".format(statList[11]) + " stl, " + "{0:.2f}".format(statList[12]) + " blk, " + "{0:.2f}".format(statList[13]) + " TO, " + 
                        "{0:.2f}".format(statList[14]) + " PF, " + "{0:.2f}".format(statList[15]) + " +/-, " + "{0:.2f}".format(statList[16]) + " pts] FANDUEL: " 
                        + "{0:.2f}".format(predicted) + ", " + position + ", " + str(cost) + ", " + "{0:.2f}".format(fanduelAvg) + "\n")

                if(injured != "GTD" and injured != "O"):
                    playerList.append([position, predicted, cost,name])


    writeFinal_predList(pred_statList)

    #writePlayerIDDict(playerIDDict)

    return playerList
		return recognizer.recognize_sphinx(audio)
	except speech_recognition.UnknownValueError:
		print("Could not understand audio")
	except speech_recognition.RequestError as e:
		print("Recog Error; {0}".format(e))

	return ""

print 'Say an emotion'
command = listen()
print command
i = True

while i == True:
    if command == "smile":
        print Scraper.getimage("smiling-face-with-open-mouth-and-smiling-eyes")
        print "\n:)"
    elif command == "sad":
        i = False
        print Scraper.getimage("crying-face")
        print "\n:("
    elif command == "excited":
        i = False
        print Scraper.getimage("jack-o-lantern")
        print "\n:O"
    elif command == "upset":
        i = False
        print Scraper.getimage("angry-face")
        print "\n>:("
    else:
        print command
import csv
import datetime
import json

import Optimize
import ML
import Util
import ReadWriteFiles
import Scraper


# print("Reading previously stored player-stats map")
(lastModifiedDate,currentMap) = ReadWriteFiles.readPlayerStatsFile()
isUpdated = (lastModifiedDate == datetime.date.today())

print("Getting data about players playing today")
today_playerMap = Scraper.create_todays_playerMap()
projStarters = Scraper.getProjStarters()

today_playerMap = Util.addStarting(today_playerMap,projStarters)
print(json.dumps(today_playerMap))


(lastModifiedDate,currentMap) = ReadWriteFiles.readPlayerStatsFile()

injuredTodayMap = Scraper.getInjuredPlayers()
injuredIDMap = ReadWriteFiles.readInjuredIDMap()



if(not isUpdated):
    print("Creating Player Map")
Example #44
def generate_features(currentMap,today_stats,injuredIDMap, injuredTodayMap):
     # print(type(currentMap))

     #print(today_stats)
     #featureList = OrderedDict(list)
     trainingFeatureList = deque([])
     testingFeatureList = deque([])

     todayFeatureList = deque([])

     completeFeatureMap = defaultdict(OrderedDict)
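     # completeFeatureMap[playerid][gameid] -> full feature vector; used below to look up an injured player's most recent features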

     allGameIDs = set()
     
     for playerid,orderedDict in currentMap.items():
         
         #prevGameIds = deque([])
         # 21 zero-filled per-game stat slots
         seasonGameStatsTotals = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
         prevGameStats = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
         #use deques with max size to keep track of most recent n games
         prev2GamesStats = deque([],2)
         prev3GamesStats = deque([],3)
         prev5GamesStats = deque([],5)
         prev10GamesStats = deque([],10)
         prev20GamesStats = deque([],20)


         prev2GamesStats.append([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])
         prev3GamesStats.append([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])
         prev5GamesStats.append([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])
         prev10GamesStats.append([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])
         prev20GamesStats.append([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])


         count = 0

         #need how many games stats each player has
         #  split 80%-20% train-test
         #  best to split by time
         #first 4/5 of (games-first) = train, rest for test
         gamesForPlayer = len(orderedDict)


         for gameid,statList in orderedDict.items():

             allGameIDs.add(gameid)

             #count represents how many games the player has previously played
             gameFeature = [int(playerid)] + [int(gameid)] + statList[:8] + [count]
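             # Then append rolling context: previous game's stats, averages over the last 2/3/5/10/20 games, and season-to-date per-game averages.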


             gameFeature += prevGameStats
             gameFeature += Util.avgStats(prev2GamesStats)
             gameFeature += Util.avgStats(prev3GamesStats)
             gameFeature += Util.avgStats(prev5GamesStats)
             gameFeature += Util.avgStats(prev10GamesStats)
             gameFeature += Util.avgStats(prev20GamesStats)
             gameFeature += (np.array(seasonGameStatsTotals) / max(count,1)).tolist()
                 
             if(count <= 0.8 * (gamesForPlayer-1)):
                 trainingFeatureList.append(gameFeature)
             else:
                 testingFeatureList.append(gameFeature)
             # print("HERE gameid " + str(gameid))
             completeFeatureMap[playerid][gameid] = gameFeature
             # print(len(gameFeature))
             # if(len(gameFeature) != 158):
             #     print(gameFeature)



             count+=1
             #prevGameIds += [gameid]     
             prevGameStats = statList[8:]    
             prev2GamesStats.append(statList[8:])
             prev3GamesStats.append(statList[8:])
             prev5GamesStats.append(statList[8:])
             prev10GamesStats.append(statList[8:])
             prev20GamesStats.append(statList[8:])
             seasonGameStatsTotals = [x + y for x, y in zip(seasonGameStatsTotals,statList[8:])]


        
         if(playerid in today_stats):
             (todayGameid,statsList) = today_stats[playerid].popitem()
             feature = [int(playerid)] + [int(todayGameid)] + statsList[:8] + [count] + prevGameStats + Util.avgStats(prev2GamesStats) + Util.avgStats(prev3GamesStats) + Util.avgStats(prev5GamesStats) + Util.avgStats(prev10GamesStats) + Util.avgStats(prev20GamesStats) + (np.array(seasonGameStatsTotals) / count).tolist()
             todayFeatureList.append(feature)



     for feature in todayFeatureList:
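        # Append injury context for the player's own team: mean and std of each injured teammate's last known feature slice.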
        todayGameid = str(feature[1])
        ownTeamNum = feature[6]
        injuredList = injuredTodayMap[ownTeamNum]

        injuredListFeatures = []


        if(len(injuredList) == 0):
            injuredListFeatures = np.zeros((1,148))

        else:

            for injuredName in injuredList:
                injuredID = Scraper.playername_to_id(injuredName)

                for (gameid) in reversed(list(completeFeatureMap[injuredID].keys())):

                #get the last features that the injured player had
                    if(gameid <= todayGameid):
                        gameStatsList = completeFeatureMap[injuredID][gameid]

                        # weight = gameStatsList[10]
                        injuredListFeatures.append(gameStatsList[10:])
                        # print(len(gameStatsList[10:]))

                        break
            injuredListFeatures = np.array(injuredListFeatures)
        # print(injuredListFeatures.shape)


        meanInjuredStats = np.mean(injuredListFeatures,0)
        stdInjuredStats = np.std(injuredListFeatures,0)

        feature += (meanInjuredStats.tolist() + stdInjuredStats.tolist())


         #print(list(todayFeatureList))


     injuredMap = {}
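     # injuredMap[gameid] = (away-team injury features, home-team injury features), each the mean and std of the injured players' last known stats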

     for currentGameID in allGameIDs:
        #create injury features
        # print(currentGameID)
        # print(type(currentGameID))

        #for both the away team and home team
        awayInjuredIDList = injuredIDMap[currentGameID][0]
        awayInjuredListFeatures = []
        for awayInjuredID in awayInjuredIDList:
            # print(type(completeFeatureMap[injuredID].keys()))
            # print("new awayInjuredID " + str(awayInjuredID))
            for (gameid) in reversed(list(completeFeatureMap[awayInjuredID].keys())):
                # print(gameid)
                # print(type(gameid))
                #get the last features that the injured player had
                if(gameid <= currentGameID):
                    gameStatsList = completeFeatureMap[awayInjuredID][gameid]

                    # weight = gameStatsList[10]
                    awayInjuredListFeatures.append(gameStatsList[10:])
                    # print(len(gameStatsList[10:]))

                    # print(awayInjuredID + " " + currentGameID)
                    # print(gameStatsList)
                    break
        if(len(awayInjuredListFeatures) == 0):
            awayInjuredListFeatures = np.zeros((1,148))
        else:
            awayInjuredListFeatures = np.array(awayInjuredListFeatures)
        # print(injuredListFeatures.shape)
        awayMeanInjuredStats = np.mean(awayInjuredListFeatures,0)
        awayStdInjuredStats = np.std(awayInjuredListFeatures,0)
        # print(awayMeanInjuredStats.shape)
        # print(awayStdInjuredStats.shape)




        homeInjuredIDList = injuredIDMap[currentGameID][1]
        homeInjuredListFeatures = []
        for homeInjuredID in homeInjuredIDList:
            # print(type(completeFeatureMap[injuredID].keys()))
            # print(reversed(list(completeFeatureMap[homeInjuredID].keys())))
            for (gameid) in reversed(list(completeFeatureMap[homeInjuredID].keys())):
                #get the last features that the injured player had
                if(gameid <= currentGameID):
                    gameStatsList = completeFeatureMap[homeInjuredID][gameid]

                    # weight = gameStatsList[10]
                    homeInjuredListFeatures.append(gameStatsList[10:])
                    # print(len(gameStatsList[10:]))

                    # print(homeInjuredID + " " + currentGameID)
                    # print(gameStatsList)
                    break
        if(len(homeInjuredListFeatures) == 0):
            homeInjuredListFeatures = np.zeros((1,148))
        else:
            homeInjuredListFeatures = np.array(homeInjuredListFeatures)
        # print(injuredListFeatures.shape)
        homeMeanInjuredStats = np.mean(homeInjuredListFeatures,0)
        homeStdInjuredStats = np.std(homeInjuredListFeatures,0)
        # print(homeMeanInjuredStats.shape)
        # print(homeStdInjuredStats.shape)




        injuredMap[currentGameID] = (awayMeanInjuredStats.tolist() + awayStdInjuredStats.tolist(), homeMeanInjuredStats.tolist() + homeStdInjuredStats.tolist())
     # print(injuredMap)



     #add injuryfeatures to previously computed features
     for gameFeature in list(trainingFeatureList):

         gameid = gameFeature[1]
         isAway = gameFeature[8]
         # print("HERE: " + str(gameid))
         gameFeature += injuredMap[str(gameid)][isAway]

     for gameFeature in list(testingFeatureList):
         gameid = gameFeature[1]
         isAway = gameFeature[8]
         # print("HERE: " + str(gameid))
         gameFeature += injuredMap[str(gameid)][isAway]


     return (np.array(list(trainingFeatureList)),np.array(list(testingFeatureList)), np.array(list(todayFeatureList)))
Example #45
# See LICENSE.

from sys import stdout

from Stock import Stock
import Scraper
import Rankings
import Fixer
import Writer


# Scrape data from FINVIZ. Certain presets have been established (see direct
# link for more details)
url = 'http://finviz.com/screener.ashx?v=152&f=cap_smallover&' + \
    'ft=4&c=0,1,2,6,7,10,11,13,14,45,65'
html = Scraper.importHtml(url)

# Parse the HTML for the number of pages from which we'll pull data
nPages = -1
for line in html:
    if line[0:40] == '<option selected="selected" value=1>Page':
        # Find indices
        b1 = line.index('/') + 1
        b2 = b1 + line[b1:].index('<')
        # Number of pages containing stock data
        nPages = int(line[b1:b2])
        break

# Create a database containing all stocks
stocks = []
#!/bin/python2
import csv
import Scraper

def findDII(margin):
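    # Linear weighting of a vote by its margin: margin 1 -> 2400/24 = 100, margin 219 -> 2/24 ≈ 0.083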
    dii = (-11.0/24)*margin + (2411.0/24)
    return dii

#Get all the vote data from the Scraper script
rawActions = Scraper.getActions()
#First we find the data for all votes
actions = Scraper.filterActions(actions=rawActions, billsOnly=False, passedOnly=False)
margins = Scraper.getMargins(actions)
leg, votes = Scraper.getVoteDict(actions)
mccarthyID = "M001165"
mccarthyVotes = votes[mccarthyID]
repubs = {}
for l in leg:
    if leg[l]['Party'] == 'D':
        pass
    else:
        repubs[l] = leg[l]
        diisum = 0
        totalVotes = 0
        votesAgainst = 0
        for b in votes[l]:
            if votes[l][b] == 0:
                pass
            elif votes[l][b] != mccarthyVotes[b]:
                diisum += findDII(margins[b])
                votesAgainst += 1
 
     ' Iterate the submissions '
     for sub in submissions:
         try:
             
             'Accept only posts with comments'                  
             if (sub.score < VOTE_TRESHHOLD):
                 continue
             
             'Open File with article id as the name'
             article_file = open(str(sub_reddit) + "\\" + str(article_id), 'w+', newline="\n")
             article_file.write("<article>\n")
             article_file.write("<sub-reddit>" + sub_reddit + "</sub-reddit>\n")
             article_file.write("<news-paper>" + sub.domain + "</news-paper>\n")
             article_file.write("\n")         
             stripped_title = Scraper.string_cleaner(sub.title)
             article_file.write("<title>" + stripped_title  + "</title>\n")
             
             'Get the article content'
             if (SUPPORTED_NEWS_SITES[0] in sub.domain):
                 success = Scraper.ny_times.get_HTML_article(url_opener, article_file, sub.url)
             elif (SUPPORTED_NEWS_SITES[1] in sub.domain):
                 success = Scraper.usa_today.get_HTML_article(url_opener, article_file, sub.url)
             elif (SUPPORTED_NEWS_SITES[2] in sub.domain):
                 success = Scraper.washington_post.get_HTML_article(url_opener, article_file, sub.url)
             else:
                 success = False 
             'Close the XML file'
             article_file.write("</article>\n")
             
             'Found articles counter'
 for result in results_obj:
     try:
         'Get ID'
         try:
             res_id = result['data-fullname']
         except:
             continue
         
         'Get Entry'
         entry_obj = result.find('div', attrs={"class": "entry unvoted"})
         
         'Get title'
         title_obj = (entry_obj.find('p', attrs={"class": "title"})).find('a', attrs={"class": "title "})
         title_parsed = title_obj.contents
         url= title_obj['href']
         title = Scraper.string_cleaner(str(title_parsed[0]))
                           
         'Get Domain'
         domain_obj = entry_obj.find('p', attrs={"class": "title"})
         span_obj = domain_obj.find('span', attrs={"class": "domain"})
         domain_parsed = span_obj.find('a').contents
         domain = Scraper.string_cleaner(str(domain_parsed[0]))
         
         'Subreddit'
         tagline = entry_obj.find('p', attrs={"class": "tagline"})
         hover = tagline.find('a', attrs={"class": "subreddit hover"})
         subredd_parsed = hover.contents
         subredd = Scraper.string_cleaner(str(subredd_parsed[0]))
         subredd = subredd[:-1]    
         
         'Score'
Example #49
#!/bin/python2
import csv
import Scraper
import json
import urllib2

mccarthyID = "M001165"
pelosiID = "P000197"

def findDII(margin):
    dii = (-11.0/24)*margin + (2411.0/24)
    return dii

#Get all the vote data from the Scraper script
rawActions = Scraper.getActions()
actionsAll = Scraper.filterActions(actions=rawActions, billsOnly=False, passedOnly=False)
marginsAll = Scraper.getMargins(actionsAll)
legAll, votesAll = Scraper.getVoteDict(actionsAll)
mccarthyVotes = votesAll[mccarthyID]
pelosiVotes = votesAll[pelosiID]

actionsBill = Scraper.filterActions(actions = rawActions, billsOnly = True, passedOnly = False)
marginsBill = Scraper.getMargins(actionsBill)
legBill, votesBill = Scraper.getVoteDict(actionsBill)

congress = {}
for l in legAll:
    if legAll[l]['Party'] == 'D':
        congress[l] = legAll[l]
        diiAllSum = 0
        totalAllVotes = 0