def tryScrape(date, chatlist, id):
    try:
        return scraper.Scraper(
            "http://www.osu.ru/pages/schedule/?who=1&what=1&filial=1&group="
            + str(chatlist[id][0]) + "&mode=full",
            date)
    except Exception as e:
        print('error', e)

def updateTrailers(self):
    ms = scraper.Scraper()
    if self.recentIsDue():
        util.DEBUG_LOG(' - Fetching recent trailers')
        return [Trailer(t) for t in ms.get_most_recent_movies(None)]
    return []

def __init__(self, scraper_definition_file, logger=logging.getLogger()):
    """Read the scraper definition XML and build the wrapped Scraper."""
    with open(scraper_definition_file, 'r') as f:
        xml = f.read()
    self.scraper = scraper.Scraper(xml, logger)

def run(self):
    # parse command line arguments
    parser, arguments = self.parse_arguments()
    if len(sys.argv) == 2 and (sys.argv[1] == '-h' or sys.argv[1] == '--help'):
        parser.print_help()
        return
    elif len(sys.argv) < 3:
        raise RuntimeError("ERROR: URL and HTML_TAG are required")
    config = self.config(arguments)
    url, tag = sys.argv[1], sys.argv[2]
    print("Web Scraping with url={} tag={} next={} max={}".format(
        url, tag, config.next, config.max if config.max > 0 else 'infinite'))
    # tokenize HTML_TAG
    t = tokenizer.Tokenizer()
    tag_info = t.tokenize(tag)
    # scrape HTML_TAG in URL
    s = scraper.Scraper()
    crawled_data = s.crawl(url, tag_info, config.next, config.max)
    # print out data
    for data in crawled_data:
        print(data)

def main():
    params = parse_args(sys.argv)
    print("##", sys.argv, params)  # debug output
    mode = params.get("path", "menu")
    print("$$", mode)  # debug output
    sc = scraper.Scraper(folders, play, record, notify)
    getattr(sc, mode)(params)

def main():
    sc = scraper.Scraper()
    print("Subreddit to WordCloud Generator")
    while True:
        subreddit_name = input("What subreddit would you like a WordCloud of? (Do not include /r/): ")
        number_of_posts = input("How many top posts do you want to use? Max of 1000: ")
        try:
            number_of_posts_num = int(number_of_posts)
        except ValueError:
            print("Not a valid number")
            continue
        if number_of_posts_num <= 0:
            print('Number must be greater than zero')
            continue
        try:
            sc.check_if_subreddit_exist(subreddit_name)
        except Exception:
            print('Not a valid sub!')
            continue
        sc.top_submission_to_word_cloud(subreddit_name, number_of_posts_num)
        run_again = input('Would you like to run the generator again? Type 1 to run again: ')
        if run_again == '1':
            print('----------------------------------------')
        else:
            break

def getPlayableURL(ID, res=None, url=None):
    res = ItunesTrailerScraper.RES.get(res, 'hd720p')
    ts = scraper.Scraper()
    id_location = ID.split('.', 1)
    all_ = [t for t in ts.get_trailers(id_location[-1], id_location[0]) if t]
    if not all_:
        return None
    trailers = [t for t in all_ if t['title'] == 'Trailer']
    url = None
    if trailers:
        try:
            version = [v for v in trailers if any(res in u for u in v['streams'])][0]
            if version:
                url = [u for u in version['streams'] if res in u][0]
        except Exception:
            import traceback
            traceback.print_exc()
    if not url:
        try:
            streams = all_[0]['streams']
            url = streams.get(res, streams.get('hd720', streams.get('sd')))
        except Exception:
            import traceback
            traceback.print_exc()
    if not url:  # guard: re.sub() would raise TypeError on None
        return None
    return re.sub(r'(480p|720p|1080p)', r'h\1', url)

def scrape_with_threads():
    print("Scraping with threads...")
    products_df = scraper.Filemanager.get_products_data()
    # Create instances of class "Scraper"
    products = [
        scraper.Scraper(category, url)
        for category, url in zip(products_df["category"], products_df["url"])
    ]
    # Create threads
    threads = [
        threading.Thread(target=product.scrape_info) for product in products
    ]
    # Start scraping on all threads
    for thread in threads:
        time.sleep(scraper.REQUEST_DELAY)
        thread.start()
    # Wait for all threads to finish
    for thread in threads:
        thread.join()
    # Save scraped data for each product (sequentially)
    for product in products:
        product.save_info()

def getTrailers(self):
    ms = scraper.Scraper()
    if self.allIsDue():
        util.DEBUG_LOG(' - Fetching all trailers')
        return [Trailer(t) for t in ms.get_all_movies(None)]
    return []

def get(self):
    sehir = self.request.get('sehir')  # 'sehir' is Turkish for "city"
    if memcache.get(sehir) is None:
        s = scraper.Scraper()
        res = s.hava_sehir(sehir)  # scrape the weather for that city
        logging.debug(res)
        memcache.set(key=sehir, value=res, time=3600)  # cache for one hour
    self.response.out.write(simplejson.dumps(memcache.get(sehir)))

def run(option):
    qdspath = "N:/Common/Joe Hardy/Quality/Quality Documents"
    outputpath = "C:/Users/User/SyncedFolder/Quality Share/ASTM Mapping/Data/links.csv"
    s = scraper.Scraper(qdspath, option)
    s.walk()
    s.scrape()
    s.organize_results()
    s.output_results(outputpath)

def __init__(self, *args, **kwargs):
    super(MainFrame, self).__init__(*args, **kwargs)
    self.webpage = scraper.Scraper('http://feeds.huffingtonpost.com/huffingtonpost/LatestNews')
    self.webpage.scrape()  # Scrape the feed for titles, links, and paragraphs
    self.InitUI()
    self.Centre()
    self.Show()

def find_links(urls: List[str]) -> List[Dict[str, str]]:
    """Find all links recursively."""
    scr = scraper.Scraper()
    links = []
    for url in urls:
        links.extend(scr.find_urls(url))
    scr.close()
    return links

def test_get_unsaved_reviews():
    scrape = scraper.Scraper(datetime(2018, 1, 3))
    with open('correct/scraper1.txt') as file:
        correct_results = file.read()
    test_results = ''
    for review in scrape.get_unsaved_reviews():
        test_results += (str(review.artists) + '\n')
        test_results += (review.album_title + '\n')
        test_results += (review.score + '\n')
    assert test_results == correct_results

def readPlayers(fileName):
    file_players = []
    s = scraper.Scraper()
    with open(fileName, 'r') as f:
        for line in f:
            playerID = line.rstrip('\n')  # safer than line[:-1] for a final line without a newline
            print(playerID)
            p = player.Player(playerID)
            s.scrape(p)
            file_players.append(p)
    return file_players

def get_result(keywords: List[str], search: str, webdriver: str, qty: int = 0) -> List[Dict[str, str]]:
    """Get the main search result."""
    scr = scraper.Scraper(webdriver)
    qty = qty or 0
    result = scr.get_urls(' '.join(keywords), qty=qty, search=search)
    scr.close()
    return result

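# Hedged usage sketch, not part of the original module: chain get_result() and
# find_links() above. The "google" search name, the chromedriver path, and the
# "url" key on each result dict are all assumptions for illustration.
if __name__ == "__main__":
    hits = get_result(["python", "scraping"], search="google",
                      webdriver="/usr/local/bin/chromedriver", qty=10)
    links = find_links([hit["url"] for hit in hits])  # assumes a "url" key (hypothetical)
    print(links)
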
def run_scraper(debug=False):
    date = util.get_current_date()
    # create the csv file
    csv_name = '{}_data.csv'.format(date)
    log_file = '{}_log.txt'.format(date)
    util.create_csv(csv_name)
    for date_pair in util.get_date_pairs():
        worker = scraper.Scraper(date_pair[0], date_pair[1], debug)
        data = worker.crawl(log_file)
        if not data:
            data_row = [date_pair[0], date_pair[1], None, None, None, None]
            util.append_row(csv_name, data_row)
        else:
            for room_type in sorted(data):
                room_data = data[room_type]
                # Merge the accommodation data for each price table
                # (bar, summer, and advance rates)
                for price_key in ('price_bar', 'price_summer', 'price_adv'):
                    for accom_type in sorted(room_data[price_key]):
                        m_price = merge(room_data, accom_type)
                        data_row = [date_pair[0], date_pair[1],
                                    room_type, accom_type,
                                    m_price['p_bar'],
                                    m_price['p_sum'],
                                    m_price['p_adv']]
                        util.append_row(csv_name, data_row)
        worker.clean()

def OnRefresh(self, e):
    # Reloads the scraper module and mimics the previous processes, replacing
    # the old articles in the list with the new articles
    gauge = LoadingGauge(None, title='', size=(300, 200))  # Create an instance of the loading screen
    # Show the loading screen while the main program is refreshing (But it doesn't...)
    threading.Thread(target=gauge.InitUI).start()
    reload(scraper)
    self.webpage = scraper.Scraper('http://feeds.huffingtonpost.com/huffingtonpost/LatestNews')
    self.statusBar.SetStatusText('Refreshing feed...')
    self.webpage.scrape()
    self.statusBar.SetStatusText('')
    self.listCtrl.DeleteAllItems()
    for i in range(0, 15):
        self.listCtrl.InsertStringItem(i, self.webpage.titles[i])
    self.browser.LoadURL(self.webpage.links[0])
    self.Layout()

def __init__(self, company):
    Thread.__init__(self)
    self._scraper = scraper.Scraper((company[0], company[1]))
    self._parser = article_parser.Parser(self._scraper.company_name)
    self._company = company
    self._scraper_results = ()
    self.report = {
        'articles': {},
        'good-count': 0,
        'bad-count': 0,
        'op-count': 0,
        'pe-count': 0,
    }
    self.driver_done = False

def scrape(sku, prod_name, source):
    """Run the scraper until the queue is empty."""
    thread_count = config.get("scraper").get("thread_count")
    url_objs = list(q_db.find({"sku": sku}).sort('timestamp', pymongo.ASCENDING))
    if url_objs:
        random.shuffle(url_objs)
        with ThreadPoolExecutor(max_workers=thread_count) as executor:
            sc = scraper.Scraper(sku, prod_name, source)
            for url_obj in url_objs:
                url = url_obj.get("url")
                # sc.get_request(url, init=False)  # Enable when debugging
                executor.submit(sc.get_request, url, init=False)  # Disable when debugging

def leaderboard(state, key):
    s = scraper.Scraper()
    tid = 0  # Scrape all teams for now, add individual teams later if needed
    year = datetime.date.today().strftime('%Y')
    url = ('http://www.fangraphs.com/leaders.aspx?pos=all&stats={0}'
           '&lg=all&qual=0&type=8&season={1}'
           '&month=0&season1={1}'
           '&ind=0&team={2}&page=1_1000').format(state, year, tid)
    s.scrape(url, key, 'fangraphs', 'fangraphs', 24 * 60 * 60)

def post_tip(self):
    """Post the daily tips (pulled from the Dev.to or Medium API) to Slack."""
    print("Posting tip to slack now...")
    message = "***TIPS FOR TODAY*** \n\n\n\n"
    scraper_obj = scraper.Scraper()
    for article in scraper_obj.get_articles():
        message += f"{article['title']} \n {article['link']} \n\n"
    message += ("\n\n\n\n\n\n\n <!channel> :heart::slightly_smiling_face: *I am Dele, a simple rule"
                " based bot, my code lives here: * https://github.com/olayemii/algorismbot "
                ":heart::slightly_smiling_face: ")
    self._post_message(message)

def setUp(self):
    self.scraper_config = {
        'scraper': {
            'frequency': 1,
            'run_once': True
        },
        'request': {
            'timeout': 10
        },
        'http://127.0.0.1:81': {
            'pattern': "foo.*"
        }
    }
    self.s = scraper.Scraper(tests.mocks.MockOutput(), self.scraper_config)

def automation():
    d = str(date.today())
    t = datetime.now()
    # Create a massive dataframe for processing
    x = processor.Processor()
    df = x.mergeLabels()
    y = scraper.Scraper()
    # Separate dataframes for pre-today and today
    df_test = y.getNewHeadlines().drop_duplicates(keep='first')
    df1 = df[df['date'] == d].drop_duplicates(ignore_index=True)
    df = df[~df['date'].str.contains(d)]
    # Vectorize the phrases and create the classifier
    counter = CountVectorizer(ngram_range=(2, 3))
    classifier = MultinomialNB()
    counter.fit(df['title'] + df['description'])
    training_counts = counter.transform(df['title'])
    labels = df['label']
    # The vectorized counts of the headlines in df_test
    headline_counts = counter.transform(df_test['title'])
    headline_counts_ticker = counter.transform(df1['title'])
    # Train the model
    classifier.fit(training_counts, labels)
    prediction = classifier.predict(headline_counts)
    prediction1 = classifier.predict(headline_counts_ticker)
    chance = 100 * sum(prediction) / len(prediction)
    chanceticker = 100 * sum(prediction1) / len(prediction1)
    totalChance = (chance + chanceticker) / 2
    print('Chances of market going up tomorrow: {0:.2f}%'.format(totalChance))
    print('New Headline Chances: {0:.2f}%'.format(chance))
    print('Ticker Headline Chances: {0:.2f}%'.format(chanceticker))
    print('Prediction New Headline Length: {}'.format(
        np.size(classifier.predict_proba(headline_counts), 0)))
    print('Prediction Ticker Headline Length: {}'.format(
        np.size(classifier.predict_proba(headline_counts_ticker), 0)))
    with open('predictions/predictionsForTomorrow.csv', 'a', newline='') as currentCSV:
        writer = csv.writer(currentCSV)
        writer.writerow([d, t, totalChance])

def run():
    # Scraper
    titles_path = "N:/Common/ASTMs & Standards/ASTM/2015 Standards"
    output_path = "C:/Users/User/SyncedFolder/Quality Share/ASTM Mapping/Data/titles.csv"
    scraper_ = scraper.Scraper(titles_path)
    scraper_.scrape()
    scraper_.output_results(output_path)
    # Clusterer
    cat_output_path = "C:/Users/User/SyncedFolder/Quality Share/ASTM Mapping/Data/ASTMCategorization.csv"
    uncat_output_path = "C:/Users/User/SyncedFolder/Quality Share/ASTM Mapping/Data/ASTMUncategorized.csv"
    clusterer_ = clusterer.Clusterer(output_path)
    clusterer_.categorize_data()
    clusterer_.output_categorization(cat_output_path, uncat_output_path)

def scrape():
    print("Scraping...")
    products_df = scraper.Filemanager.get_products_data()
    # Create instances of class "Scraper"
    products = [
        scraper.Scraper(category, url)
        for category, url in zip(products_df["category"], products_df["url"])
    ]
    # Scrape and save scraped data for each product (sequentially)
    for product in products:
        print(product.url)
        time.sleep(scraper.REQUEST_DELAY)
        product.scrape_info()
        product.save_info()

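# Hedged usage sketch (an assumption, not from the original script): a minimal
# entry point that picks scrape_with_threads() above or the sequential
# scrape(), assuming both functions live in the same module.
if __name__ == "__main__":
    import sys

    if "--threads" in sys.argv:
        scrape_with_threads()
    else:
        scrape()
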
def modifyUI(self):
    """
    Modify the tab UIs after scraping.

    First, the requested web page is loaded in the webpage tab.
    Second, the Python script is generated and stored in the script member variable.
    Third, a scraper instance is created and scraping starts on a separate thread;
    as soon as scraping finishes, addScriptAndData() is called.
    """
    url = self.urlInput.text()
    selectors = self.selectorInput.text()
    self.web.load(QUrl(url))
    print("Webpage Loaded \n")
    self.script = ScriptGenerator(url, selectors).generate()
    self.scraper_ = scraper.Scraper(str(url), str(selectors))
    self.scraper_.threadChange.connect(self.addScriptAndData)
    self.scraper_.start()

def readPlayers(new_players_file, known_players_file):
    new_players = openFile(new_players_file)
    known_players = openFile(known_players_file)
    player_list = []
    s = scraper.Scraper()
    for playerID in new_players:
        if len(playerID) > 1:
            p = player.Player(playerID)
            player_info = indexIntoLine(p.getID(), known_players)
            if len(player_info) > 1:
                print("Loading " + p.getID() + " from known players")
                p.setSR(int(player_info.split(',')[1]))
                p.setRole(player_info.split(',')[2])
            else:
                print(p.getID() + " not known. Fetching info")
                s.scrape(p)
            player_list.append(p)
    return player_list

def main(argv):
    try:
        with open(CONFIG, "r") as f:
            config = json.loads(f.read())
        config = config["scraper"]
    except (OSError, ValueError, KeyError):
        print("Config syntax error!")
        return 1
    app = scraper.Scraper(config, argv)
    return app.run()

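# Hedged sketch of the config file shape main() above expects: CONFIG must be a
# JSON document with a top-level "scraper" object. The inner keys are unknown
# from this snippet, so the body is left as a placeholder:
#
# {
#     "scraper": {
#         ...
#     }
# }
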
def __init__(self, access_token=None, db=None):
    self.access_token = access_token
    self.db = db
    self.scraper = scraper.Scraper(self, self.db)
    self.scraper.start()
    # Define desired replacements here; used in set reminder to get a
    # standard format to work with
    self.rep = {
        "/": "-",
        "\\": "-",
        ":": "-",
        ";": "-",
        ",": "-",
        ".": "-",
    }
    self.rep = dict((re.escape(k), v) for k, v in self.rep.items())
    self.pattern = re.compile("|".join(self.rep.keys()))
    self.delete_conf = {}
    self.user_reminders = {}