Example #1
import datetime
import json

from Scrapper import Scrapper


def main():

    date = datetime.datetime(2020, 2, 10).strftime('%Y-%m-%d')
    # NOTE: this scoreboard URL is built first but immediately overwritten by
    # the boxscore URL below.
    url = 'https://stats.nba.com/stats/scoreboardV2?DayOffset=0&LeagueID=00&gameDate=' + date

    url = 'https://stats.nba.com/stats/boxscoretraditionalv2?EndPeriod=10&EndRange=0&GameID=0012000047&RangeType=0&Season=2019-20&SeasonType=Regular+Season&StartPeriod=1&StartRange=0'

    datasets_name = [
        'GameHeader', 'LineScore', 'EastConfStandingsByDay',
        'WestConfStandingsByDay'
    ]

    HEADERS = {
        # 'Host': 'i.cdn.turner.com',
        'User-Agent':
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0',
        'Referer': 'https://www.nba.com/stats/',
        'Origin': 'https://www.nba.com',
        'Accept': '*/*',
        'Accept-Language': 'en-GB,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive'
    }

    scrapper = Scrapper(headers=HEADERS, max_call_errors=5)
    json_returned = scrapper.retrieve_json_api_from_url(url=url)

    if json_returned is None:
        return

    print(json.dumps(json_returned, indent=4, sort_keys=True))
Example #2
    def populate(self):

        scrapper = Scrapper()
        scrapper.character(self)

        if self.title is None:
            print("Could not retrieve data for this character")
            return

        print("populating " + self.title)

        client = MongoClient(
            "mongodb://*****:*****@127.0.0.1:27017/naruto?authSource=admin")
        db = client.naruto

        characters = db.characters

        character_data = {
            'title': self.title,
            'content': self.body,
            'image': self.image,
        }

        if characters.count_documents({"title": self.title}) > 0:
            characters.update_one({"title": self.title},
                                  {"$set": character_data})
        else:
            characters.insert_one(character_data)
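
The count-then-branch round trip above can be collapsed into a single atomic upsert; a sketch against the same pymongo collection:

characters.update_one({"title": self.title},
                      {"$set": character_data},
                      upsert=True)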
Example #3
import pandas as pd

from Scrapper import Scrapper


def get_data(url, datasets_name):
    HEADERS = {
        'Referer':
        'https://stats.nba.com',
        'Origin':
        'https://stats.nba.com',
        'x-nba-stats-token':
        'true',
        'x-nba-stats-origin':
        'stats',
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }

    scrapper = Scrapper(headers=HEADERS, max_call_errors=5)
    json_data = scrapper.retrieve_json_api_from_url(url=url)

    if json_data is None:
        return None

    dfs = {}
    for elem in json_data['resultSets']:
        if elem['name'] not in datasets_name:
            continue

        df = pd.DataFrame(elem['rowSet'], columns=elem['headers'])
        dfs[elem['name']] = df

    return dfs
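A minimal usage sketch for get_data above, reusing the scoreboardV2 URL and dataset names from Example #1 (the date is arbitrary):

import datetime

date = datetime.datetime(2020, 2, 10).strftime('%Y-%m-%d')
url = ('https://stats.nba.com/stats/scoreboardV2'
       '?DayOffset=0&LeagueID=00&gameDate=' + date)

dfs = get_data(url, ['GameHeader', 'LineScore'])
if dfs is not None:
    print(dfs['LineScore'].head())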
Example #4

        def runUnitTest():
            os.system('cls')
            print('Running DesignComponent.UnitTest...\n')
            Scrapper.printLine('-')

            DesignComponent.UnitTest.testDownloadDesignFiles()

            Scrapper.printLine('-')
            print('Finished running DesignComponent.UnitTest.')
Example #5
        def runUnitTest():
            os.system('cls')
            print('Running DesignComponent.UnitTest...\n')
            Scrapper.printLine('-')

            DesignComponent.UnitTest.testDownloadDesignFiles()

            Scrapper.printLine('-')
            print('Finished running DesignComponent.UnitTest.')
Example #6
def main():

    emptyImagesDirectory()

    url = "https://www.cars.co.za/searchVehicle.php?new_or_used=&make_model=Hyundai%5Bi10%5D&vfs_area=&agent_locality=&price_range=&os=%27&P="

    num_img = 1000

    newScrapper = Scrapper(url, num_img)
    try:
        newScrapper.startCrawling()
    except urllib.error.HTTPError as e:
        print("Error: ", e, "\n")
Example #7

    def downloadDesignFiles(self):
        directory = self.directory
        Scrapper.checkPathOrCreate(directory)
        Scrapper.downloadFile(self.datasheetURL, directory)

        metadata = self.metadata
        for key in metadata:
            if key.endswith(' URL'):
                self.downloadDesignFile(metadata[key])
                self.downloadDocument(metadata[key])

        for source in self.sources:
            for key, value in source.metadata.items():
                if ' URL' in key:
                    self.downloadDesignFile(value)
                    self.downloadDocument(value)
Example #8
def api(prov):
    """
    Endpoint for fetching Covid19 case data for several provinces in Indonesia
    ---
    tags:
        - Rest Controller
    parameters:
        - name: body
          in: body
          required: true
          schema:
            id: Provinsi
            required:
                - provinsi
            properties:
                provinsi:
                    type: string
                    description: Enter the acronym of the province whose Covid19 case data you want to fetch. Currently only available for aceh, bali, diy, sumut, babel, jatim, kalsel, and sulses
                    default: ""
    responses:
        200:
            description: Success
        400:
            description: Sorry, there was a problem processing your request
    """

    #post = request.get_json()
    #prov = post['provinsi']

    return jsonify(scr.scrapper(prov))
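
The snippet does not show how api() is registered; a minimal wiring sketch, assuming a Flask app and that scr is the scraper module imported elsewhere (the route path here is hypothetical):

from flask import Flask

app = Flask(__name__)

# Hypothetical route; the real path is not part of the snippet.
app.add_url_rule('/api/covid/<prov>', view_func=api)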
Example #9
class Testing(unittest.TestCase):
    scrapper = Scrapper()
    character = None

    # check if the page doesn't exist
    def test_404(self):
        print("Testing source 404")
        url = "https://naruto.fandom.com/wiki/Category:Characters"
        links = self.scrapper.links(url)
        assert isinstance(links, ResultSet), "Should have a result set"

    def test_blank_page(self):
        print("Testing source blank page")
        url = ""
        links = self.scrapper.links(url)
        assert isinstance(links, ResultSet), "Should have a result set"

    def test_character_404(self):
        print("Testing character 404 page")
        url = "https://naruto.fandom.com/wiki/A_(First_Raikasdfasdf)"
        character = Character(url)
        character.populate()
        assert character.title is None, "Character should be empty"

    def test_character_blank(self):
        print("Testing character blank page")
        url = ""
        character = Character(url)
        character.populate()
        assert character.title is None, "Character should be empty"
Example #10

    def isDesignDocument(url):
        filetype = Scrapper.getFiletype(url)
        if filetype == '':
            return False
        return filetype in DesignComponent.documentTypes
Example #11
    def isDesignDocument(url):
        filetype = Scrapper.getFiletype(url)
        if filetype == '':
            return False
        return filetype in DesignComponent.documentTypes
Example #12
    def downloadDesignFiles(self):
        directory = self.directory
        Scrapper.checkPathOrCreate(directory)
        Scrapper.downloadFile(self.datasheetURL, directory)

        metadata = self.metadata
        for key in metadata:
            if key.endswith(' URL'):
                self.downloadDesignFile(metadata[key])
                self.downloadDocument(metadata[key])

        for source in self.sources:
            for key, value in source.metadata.items():
                if ' URL' in key:
                    self.downloadDesignFile(value)
                    self.downloadDocument(value)
Example #13
import pandas as pd

from Scrapper import Scrapper


def get_data(url, datasets_name, headers):

    scrapper = Scrapper(headers=headers, max_call_errors=5)
    json_data = scrapper.retrieve_json_api_from_url(url=url)

    if json_data is None:
        return None

    dfs = {}
    for elem in json_data['resultSets']:
        if elem['name'] not in datasets_name:
            continue

        df = pd.DataFrame(elem['rowSet'], columns=elem['headers'])
        dfs[elem['name']] = df

    return dfs
Example #14
    def run_Scrapper_0(self, value, companyname, date):
        obj_local = None
        if value == 1:
            obj_local = self.obj
        elif value == 2:
            obj_local = self.obj1
        elif value == 3:
            obj_local = self.obj2
        elif value == 4:
            obj_local = self.obj3
        elif value == 5:
            obj_local = self.obj4

        obj_local.Post_Factiva_Home_Request()
        obj1 = Searcher(company_name=companyname,
                        date=date,
                        driver=obj_local.driver)
        obj1.Select_CustomDateRange()
        obj1.Enter_FromDate()
        obj1.Enter_ToDate()
        obj1.Enter_Company()
        obj1.SubmitSearch()

        obj2 = Scrapper(driver=obj_local.driver, session=obj_local.session)
        obj2.MigrateSeleniumtoRequests()
        count_ = 1
        while True:
            obj2.GetAllArticles()
            obj2.Parse_Articles()
            check = obj2.Navigate_NextPage(count=count_)
            if check:
                break
            count_ += 1

        obj2.Save_ToDataFrame(companyname=companyname, date=date)
Example #15
def MainUpdater(subreddit_name="wallpapers"):
    dir_path = "%s/WallpaperUpdater/" % os.environ["APPDATA"]
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    settings = config_loader(dir_path + "settings.toml")
    if settings["currentimg"] == 6:
        Scrapper(dir_path, subreddit_name)
        settings["currentimg"] = 0

    files = os.listdir(dir_path + "Temp_images")
    Changer(
        os.path.abspath(dir_path + "Temp_images/" +
                        files[settings["currentimg"]]).replace("\\", "/"))
    settings["currentimg"] += 1
    config_saver(settings, dir_path + "settings.toml")
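
The config_loader and config_saver helpers are not shown; a minimal sketch, assuming the third-party toml package and a settings.toml that tracks the currentimg counter:

import toml

def config_loader(path):
    # Returns a dict such as {'currentimg': 0}
    with open(path) as fh:
        return toml.load(fh)

def config_saver(settings, path):
    with open(path, 'w') as fh:
        toml.dump(settings, fh)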
Example #16

    def addMetadata(self, key, value):
        key = key.strip()
        if key == '':  # We can't add an entry with no key.
            return

        if key.endswith(' URL'):
            value = Scrapper.fixShortURL(value)

        #print('Adding metadata: ', key, ', ', value)
        print('Adding metadata entry: [" ', key, ' "], [" ', value, ' "]')

        if len(self.metadata) == 0:  # We have to create a dictionary
            self.metadata = {key: value}
            return

        self.metadata[key] = value
Example #17
    def addMetadata(self, key, value):
        key = key.strip()
        if key == '':  # We can't add an entry with no key.
            return

        if key.endswith(' URL'):
            value = Scrapper.fixShortURL(value)

        #print('Adding metadata: ', key, ', ', value)
        print('Adding metadata entry: [" ', key, ' "], [" ', value, ' "]')

        if len(self.metadata) == 0:  # We have to create a dictionary
            self.metadata = {key: value}
            return

        self.metadata[key] = value
Example #18

    def generateTestComponent(manufacturer, name, directory):
        print('Generating test DesignComponent...("', name, ', ', manufacturer, ', ', directory, ')\n')

        component = DesignComponent(manufacturer, name, directory)
        component.datasheetURL = Scrapper.fixShortURL('www.genericmanufacturer.com/test/1/2/3/datasheet.pdf')

        component.addMetadata('3D Model URL', 'http://www.molex.com/pdm_docs/stp/87918-0001_stp.zip')
        component.addMetadata('Datasheet 1 URL', 'http://www.molex.com/pdm_docs/sd/879180001_sd.pdf')
        component.addMetadata('3D Model 2 URL', 'http://www.te.com/commerce/DocumentDelivery/DDEController?Action=srchrtrv&DocNm=640454-4&DocType=Customer+View+Model&DocLang=English')
        component.addMetadata('Datasheets 2 URL', 'http://www.te.com/commerce/DocumentDelivery/DDEController?Action=srchrtrv&DocNm=640454&DocType=Customer+Drawing&DocLang=English')
        component.addMetadata('Online Catalog URL', 'http://www.digikey.com/catalog/en/partgroup/mta-100-series/9332?mpart=640454-4&vendor=17&WT.z_ref_page_type=PS&WT.z_ref_page_sub_type=PD&WT.z_ref_page_id=PD&WT.z_ref_page_event=DC_Link_Table')
        component.addMetadata('Product Photos URL', 'http://media.digikey.com/photos/Tyco%20Amp%20Photos/640456-4,%20640454-4.jpg')
        component.addMetadata('Featured Product URL', 'http://www.digikey.com/product-highlights/us/en/te-connectivity-mta-connectors/2307')
        component.addMetadata('Series URL', 'http://www.digikey.com/product-search/en?FV=ffec1142')

        return component
Example #19

        def testDownloadDesignFiles():
            print('\nRunning DesignComponent diagnostics...')
            Scrapper.printLine('-')

            testDirectoryName = r'D:\Workspace\eda-sourcerer\source\Test'  # already existing folder on D: drive
            component = DesignComponent.UnitTest.generateTestComponent('TestComponent', 'Generic', testDirectoryName)
            print(component.toString())

            Scrapper.printLine('-')
            print('\nDownloading test files...')
            component.downloadDesignFiles()

            Scrapper.printLine('-')
            print('Finished running DesignComponent diagnostics.\n')
Example #20
        def testDownloadDesignFiles():
            print('\nRunning DesignComponent diagnostics...')
            Scrapper.printLine('-')

            testDirectoryName = r'D:\Workspace\eda-sourcerer\source\Test'  # already existing folder on D: drive
            component = DesignComponent.UnitTest.generateTestComponent(
                'TestComponent', 'Generic', testDirectoryName)
            print(component.toString())

            Scrapper.printLine('-')
            print('\nDownloading test files...')
            component.downloadDesignFiles()

            Scrapper.printLine('-')
            print('Finished running DesignComponent diagnostics.\n')
Example #21
        def generateTestComponent(manufacturer, name, directory):
            print('Generating test DesignComponent...("', name, ', ', manufacturer, ', ', directory, ')\n')

            component = DesignComponent(manufacturer, name, directory)
            component.datasheetURL = Scrapper.fixShortURL(
                'www.genericmanufacturer.com/test/1/2/3/datasheet.pdf')

            component.addMetadata(
                '3D Model URL',
                'http://www.molex.com/pdm_docs/stp/87918-0001_stp.zip')
            component.addMetadata(
                'Datasheet 1 URL',
                'http://www.molex.com/pdm_docs/sd/879180001_sd.pdf')
            component.addMetadata(
                '3D Model 2 URL',
                'http://www.te.com/commerce/DocumentDelivery/DDEController?Action=srchrtrv&DocNm=640454-4&DocType=Customer+View+Model&DocLang=English'
            )
            component.addMetadata(
                'Datasheets 2 URL',
                'http://www.te.com/commerce/DocumentDelivery/DDEController?Action=srchrtrv&DocNm=640454&DocType=Customer+Drawing&DocLang=English'
            )
            component.addMetadata(
                'Online Catalog URL',
                'http://www.digikey.com/catalog/en/partgroup/mta-100-series/9332?mpart=640454-4&vendor=17&WT.z_ref_page_type=PS&WT.z_ref_page_sub_type=PD&WT.z_ref_page_id=PD&WT.z_ref_page_event=DC_Link_Table'
            )
            component.addMetadata(
                'Product Photos URL',
                'http://media.digikey.com/photos/Tyco%20Amp%20Photos/640456-4,%20640454-4.jpg'
            )
            component.addMetadata(
                'Featured Product URL',
                'http://www.digikey.com/product-highlights/us/en/te-connectivity-mta-connectors/2307'
            )
            component.addMetadata(
                'Series URL',
                'http://www.digikey.com/product-search/en?FV=ffec1142')

            return component
Example #22
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from Scrapper import Scrapper
import os
from multiprocessing.dummy import Pool as ThreadPool

if __name__ == "__main__":

    tag = input("Please enter the search tag: ")
    path = r"./{tag}/".format(tag=tag)
    if not os.path.exists(path):
        os.makedirs(path)
    scrapper = Scrapper(path)
    url = "https://www.pexels.com/search/{tag}/?format=js".format(tag=tag)
    imgLinks = scrapper.readhtml(url)
    pool = ThreadPool(20)
    pool.map(scrapper.downLoad, imgLinks)
Example #23

    def downloadDesignFile(self, url):
        if not DesignComponent.isDesignDocument(url):
            return
        Scrapper.downloadFile(url, DesignComponent.designFileTypes)
Example #24
def store_data(data, filename):
  """
  Stores an object in a file
  @param data: the object to store
  @param filename: the name of the file
  """
  file = open(filename, "wb")
  pickle.dump(data, file)
  file.close()

def load_data(filename):
  """
  Returns the object stored in the file
  @param filename: the name of the file
  @return the object stored in the file
  """
  file = open(filename, "rb")
  data = pickle.load(file)
  file.close()
  return data


"""
Example use of Advise.py

scrapper.query(QUERY='university', OBJECT_TYPE='page')
university_pages = scrapper.get_pages(QUERY='university', OBJECT_TYPE='page')
scrapper.dump("query_data")
store_data(university_pages, "university_pages")
clusters = Cluster.kmeans(data=university_pages, k=20)
"""
if __name__ == "__main__":
  scrapper = Scrapper()
  scrapper.load("query_data")
  university_pages = load_data("university_pages")
  clusters = Cluster.kmeans(data=university_pages)
  visualizer = Visualizer(data=university_pages, clusters=clusters)

Example #25
import pandas as pd

from Scrapper import Scrapper
from datetime import date

countries_to_scrape = [
    'GB', 'US', 'BR', 'MX', 'CO', 'FR', 'BE', 'ZA', 'PE', 'AR', 'CA'
]

sc = Scrapper()
sc.start_clean()
countries = sc.get_county_list()
sc.store_output(countries, 'countries-list.csv')
sc.log('Output saved - countries-list.csv')
all_countries_data = None
today = date.today()

for idx, row in countries.iterrows():
    country = row['country']
    if country in countries_to_scrape:
        sc.log(f'scraping country {country}')
        national_data = sc.get_national_data(row['url'])

        file = country + f'_national_data_{today}.csv'
        sc.store_output(national_data, file)
        sc.log(f'National data saved - {file}')

        sub_national_data = sc.get_sub_national_data(row['url'])
        file = country + f'_sub_national_data_{today}.csv'
        sc.store_output(sub_national_data, file)
        sc.log(f'Sub National data saved - {file}')
Example #26

    def setManufacturer(self, manufacturer):
        properName = Scrapper.fixFilename(manufacturer)
        if properName == '':  # The name was not valid.
            return
        self.manufacturer = manufacturer
Example #27
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from multiprocessing.dummy import Pool as ThreadPool
import os
from Scrapper import Scrapper

if __name__ == "__main__":

    urls = []
    numPics = int(input("How many wallpapers do you want to download: "))
    for i in range(1, (numPics // 30) + 2):
        url = "https://www.pexels.com/?format=js&page={page}".format(page=i)
        urls.append(url)

    path = r"./Wallpapers/"
    if not os.path.exists(path):
        os.makedirs(path)
    scrapper = Scrapper(path, numPics)
    imgLinks = scrapper.readHtml(2)
    pool = ThreadPool(20)
    pool.map(scrapper.downLoad, imgLinks)
Example #28

    def __init__(self, projectDirectory):
        HTMLParser.__init__(self)
        self.projectDirectory = Scrapper.cleanDirecoryPath(projectDirectory)
Example #29

    def loadFromURL(self, supplyCode, delineator, url):
        htmlDoc = Scrapper.loadContentFromURL(url)
        self.loadFromHTML(supplyCode, delineator, htmlDoc, url)
Example #30

    def setName(self, name):
        properName = Scrapper.fixFilename(name)
        if properName == '':  # The name was not valid.
            return
        self.name = name
Example #31
    def setName(self, name):
        properName = Scrapper.fixFilename(name)
        if properName == '':  # The name was not valid.
            return
        self.name = name
Example #32

    def testGenerateDesignComponent():

        print('\nRunning testGenerateDesignComponent diagnostics...')
        Scrapper.printLine('-')

        testDirectoryName = r'D:\Workspace\eda-sourcerer\source\Test'  # already existing folder on D: drive

        component = DesignComponent.UnitTest.generateTestComponent('TestComponent', 'Generic', testDirectoryName)

        Scrapper.printLine('-')
        print('\n', component.toString())
        Scrapper.printLine('-')

        badFolderName = r'B:\ad\direcotry'  # Folder that does not exist
        component = DesignComponent.UnitTest.generateTestComponent('TestComponent', 'Generic', badFolderName)

        Scrapper.printLine('-')
        print('\n', component.toString())
        Scrapper.printLine('-')

        component.downloadDesignFiles()

        Scrapper.printLine('-')
        print('Finished running testGenerateDesignComponent diagnostics.\n')
Example #33
from Character import Character
from Scrapper import Scrapper
import time

# choose the page to scrape
character_page = "https://naruto.fandom.com/wiki/Category:Characters"
characters = []
character_objects = []

scrapper = Scrapper()

# pull the links for all the characters
while character_page is not None:

    name_box = scrapper.links(character_page)
    character_page = scrapper.nextPage
    characters = []
    character_objects = []

    for name in name_box:
        characters.append(name.get('href'))
        character_objects.append(Character(name.get('href')))

    # process each link
    for character in character_objects:
        character.populate()
        time.sleep(3)


Example #34

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 27 18:27:46 2019

@author: ubuntu
"""

from Scrapper import Scrapper, WikiURLGetter
from KnowledgeCreation import KnowledgeCreation

a = None
try:
    URL = WikiURLGetter("Machine learning").getURL()
    scrap = Scrapper(URL)
    a = scrap.createDriver()
    scrap.wikipediaTable(a)
    me = scrap.wikipediaContent()
    KnowledgeCreation(me).textPreprocessing()

finally:
    if a is not None:
        a.close()
Example #35
        def testGenerateDesignComponent():

            print('\nRunning testGenerateDesignComponent diagnostics...')
            Scrapper.printLine('-')

            testDirectoryName = r'D:\Workspace\eda-sourcerer\source\Test'  # already existing folder on D: drive

            component = DesignComponent.UnitTest.generateTestComponent(
                'TestComponent', 'Generic', testDirectoryName)

            Scrapper.printLine('-')
            print('\n', component.toString())
            Scrapper.printLine('-')

            badFolderName = r'B:\ad\direcotry'  # Folder that does not exist
            component = DesignComponent.UnitTest.generateTestComponent(
                'TestComponent', 'Generic', badFolderName)

            Scrapper.printLine('-')
            print('\n', component.toString())
            Scrapper.printLine('-')

            component.downloadDesignFiles()

            Scrapper.printLine('-')
            print('Finished running testGenerateDesignComponent diagnostics.\n')
Example #36
    def downloadDesignFile(self, url):
        if not DesignComponent.isDesignDocument(url):
            return
        Scrapper.downloadFile(url, DesignComponent.designFileTypes)
Example #37

from Scrapper import Scrapper
from pyquery import PyQuery as pq

scrapper = Scrapper()


def collection_of_womens_name():
    list_of_name = []
    rsp_data = scrapper.requestData(
        url='https://www.babble.com/pregnancy/1000-most-popular-girl-names')

    if rsp_data is None:
        return list_of_name

    html_query = pq(rsp_data)('li.p1')

    for each_name in html_query:
        name = pq(each_name).text().split(" ")[0]
        if len(name) < 20:
            list_of_name.append(name)

    return list_of_name


if __name__ == "__main__":
    list_of_womens_name = collection_of_womens_name()
Example #38
    async def open_page(self, url):
        page = await self.fetch(url)
        return Scrapper(page, url)
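
A usage sketch for open_page, assuming it lives on a client class whose fetch coroutine returns the raw page content:

import asyncio

async def demo(client):
    # open_page wraps the fetched page in a Scrapper instance
    scrapper = await client.open_page('https://example.com')
    return scrapper

# Given a concrete client, asyncio.run(demo(client)) would drive it.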
Example #39
from cleaning import Cleaner
from Scrapper import Scrapper
import pandas as pd
if __name__ == "__main__":
    cleaner = Cleaner("../cache")
    print("Enter a year between 1800 and 1900 (inclusive)")

    year = int(input())
    if 1800 <= year <= 1900:
        print("=======> " + str(year))
        arks = Scrapper.get_arks(year)
        for ark in arks:
            print("=======>" + ark)
            print(f"- download {ark}")
            file = Scrapper.get_document(ark)
            print(f"- Extraction {ark}")
            df = cleaner.extract(file)
            print(f"{df.shape[0]} rows detected")
            print(f"- Post processing {ark}")
            df = cleaner.postProcess(df)
            print("- Spell checking")
            df = cleaner.spell_check(df)
            print("- saving ")
            cleaner.save(df, ark)
            print(" finished " + ark)
            print("\n")
            del file
            del df
Example #40
    def setManufacturer(self, manufacturer):
        properName = Scrapper.fixFilename(manufacturer)
        if properName == '':  # The name was not valid.
            return
        self.manufacturer = manufacturer