import datetime
import json

from Scrapper import Scrapper


def main():
    date = datetime.datetime(2020, 2, 10).strftime('%Y-%m-%d')
    url = 'https://stats.nba.com/stats/scoreboardV2?DayOffset=0&LeagueID=00&gameDate=' + date
    # NOTE: the assignment below overrides the scoreboard URL above with a box-score URL.
    url = 'https://stats.nba.com/stats/boxscoretraditionalv2?EndPeriod=10&EndRange=0&GameID=0012000047&RangeType=0&Season=2019-20&SeasonType=Regular+Season&StartPeriod=1&StartRange=0'
    datasets_name = [
        'GameHeader', 'LineScore', 'EastConfStandingsByDay',
        'WestConfStandingsByDay'
    ]
    HEADERS = {
        # 'Host': 'i.cdn.turner.com',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0',
        'Referer': 'https://www.nba.com/stats/',
        'Origin': 'https://www.nba.com',
        'Accept': '*/*',
        'Accept-Language': 'en-GB,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive'
    }
    scrapper = Scrapper(headers=HEADERS, max_call_errors=5)
    json_returned = scrapper.retrieve_json_api_from_url(url=url)
    if json_returned is None:
        return
    print(json.dumps(json_returned, indent=4, sort_keys=True))
def populate(self):
    scrapper = Scrapper()
    scrapper.character(self)
    if self.title is None:
        print "Could not retrieve data for this character"
        return
    print("populating " + self.title)
    client = MongoClient(
        "mongodb://*****:*****@127.0.0.1:27017/naruto?authSource=admin")
    db = client.naruto
    characters = db.characters
    character_data = {
        'title': self.title,
        'content': self.body,
        'image': self.image,
    }
    query = characters.find({"title": self.title})
    if query.count() > 0:
        characters.update_one({"title": self.title},
                              {"$set": character_data})
    else:
        characters.insert_one(character_data)
def get_data(url, datasets_name):
    HEADERS = {
        'Referer': 'https://stats.nba.com',
        'Origin': 'https://stats.nba.com',
        'x-nba-stats-token': 'true',
        'x-nba-stats-origin': 'stats',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    scrapper = Scrapper(headers=HEADERS, max_call_errors=5)
    # Renamed from 'json' to avoid shadowing the json module.
    json_data = scrapper.retrieve_json_api_from_url(url=url)
    if json_data is None:
        return None
    dfs = {}
    for elem in json_data['resultSets']:
        if elem['name'] not in datasets_name:
            continue
        df = pd.DataFrame(elem['rowSet'], columns=elem['headers'])
        dfs[elem['name']] = df
    return dfs
def runUnitTest():
    os.system('cls')
    print 'Running DesignComponent.UnitTest...\n'
    Scrapper.printLine('-')
    DesignComponent.UnitTest.testDownloadDesignFiles()
    Scrapper.printLine('-')
    print 'Finished running DesignComponent.UnitTest.'
def main():
    emptyImagesDirectory()
    url = "https://www.cars.co.za/searchVehicle.php?new_or_used=&make_model=Hyundai%5Bi10%5D&vfs_area=&agent_locality=&price_range=&os=%27&P="
    num_img = 1000
    newScrapper = Scrapper(url, num_img)
    try:
        newScrapper.startCrawling()
    except urllib.error.HTTPError as e:
        print("Error: ", e, "\n")
def downloadDesignFiles(self):
    directory = self.directory
    Scrapper.checkPathOrCreate(directory)  # fixed typo: was 'direcotry'
    Scrapper.downloadFile(self.datasheetURL, directory)
    metadata = self.metadata
    for key in metadata:
        if key.endswith(' URL'):
            self.downloadDesignFile(metadata[key])
            self.downloadDocument(metadata[key])
    for source in self.sources:
        # Iterate key/value pairs; iterating the dict directly yields keys only.
        for key, value in source.metadata.items():
            if ' URL' in key:
                self.downloadDesignFile(value)
                self.downloadDocument(value)
def api(prov):
    """
    Endpoint for retrieving Covid-19 case data for several provinces in Indonesia
    ---
    tags:
      - Rest Controller
    parameters:
      - name: body
        in: body
        required: true
        schema:
          id: Provinsi
          required:
            - provinsi
          properties:
            provinsi:
              type: string
              description: Please provide the acronym of the province whose Covid-19 case data you want to retrieve. Currently only available for aceh, bali, diy, sumut, babel, jatim, kalsel, and sulses.
              default: ""
    responses:
      200:
        description: Success
      400:
        description: Sorry, there was a problem processing your request
    """
    # post = request.get_json()
    # prov = post['provinsi']
    return jsonify(scr.scrapper(prov))
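# Hedged sketch (not in the original source): one way the api() view above
# could be wired into a Flask app, assuming flasgger renders the Swagger-style
# docstring and that the scraping module is imported elsewhere as
# `import scrapper as scr`. The URL rule and HTTP method are assumptions.
from flask import Flask, jsonify  # jsonify is used by api() above
from flasgger import Swagger

app = Flask(__name__)
Swagger(app)

# Register the existing view function under an assumed route.
app.add_url_rule('/api/<prov>', view_func=api, methods=['POST'])

if __name__ == '__main__':
    app.run(debug=True)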
class Testing(unittest.TestCase):
    scrapper = Scrapper()
    character = None

    # check that the page does not exist
    def test_404(self):
        print "Testing source 404"
        url = "https://naruto.fandom.com/wiki/Category:Characters"
        links = self.scrapper.links(url)
        assert links.__class__ == ResultSet, "Should have a result set"

    def test_blank_page(self):
        print "Testing source blank page"
        url = ""
        links = self.scrapper.links(url)
        assert links.__class__ == ResultSet, "Should have a result set"

    def test_character_404(self):
        print "Testing character 404 page"
        url = "https://naruto.fandom.com/wiki/A_(First_Raikasdfasdf)"
        character = Character(url)
        character.populate()
        assert character.title is None, "Character should be empty"

    def test_character_blank(self):
        print "Testing character blank page"
        url = ""
        character = Character(url)
        character.populate()
        assert character.title is None, "Character should be empty"
def isDesignDocument(url):
    filetype = Scrapper.getFiletype(url)
    if filetype == '':
        return
    if filetype in DesignComponent.documentTypes:
        return True
    return False
def get_data(url, datasets_name, headers):
    scrapper = Scrapper(headers=headers, max_call_errors=5)
    # Renamed from 'json' to avoid shadowing the json module.
    json_data = scrapper.retrieve_json_api_from_url(url=url)
    if json_data is None:
        return None
    dfs = {}
    for elem in json_data['resultSets']:
        if elem['name'] not in datasets_name:
            continue
        df = pd.DataFrame(elem['rowSet'], columns=elem['headers'])
        dfs[elem['name']] = df
    return dfs
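# Hedged usage sketch (not in the original source): it reuses the scoreboardV2
# URL, datasets_name list, and stats.nba.com headers that appear in the other
# snippets in this file and simply feeds them into get_data().
import datetime

if __name__ == '__main__':
    game_date = datetime.datetime(2020, 2, 10).strftime('%Y-%m-%d')
    scoreboard_url = ('https://stats.nba.com/stats/scoreboardV2'
                      '?DayOffset=0&LeagueID=00&gameDate=' + game_date)
    datasets_name = [
        'GameHeader', 'LineScore', 'EastConfStandingsByDay',
        'WestConfStandingsByDay'
    ]
    headers = {
        'Referer': 'https://stats.nba.com',
        'Origin': 'https://stats.nba.com',
        'x-nba-stats-token': 'true',
        'x-nba-stats-origin': 'stats',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    dfs = get_data(scoreboard_url, datasets_name, headers)
    if dfs is not None:
        for name, df in dfs.items():
            print(name, df.shape)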
def run_Scrapper_0(self, value, companyname, date):
    obj_local = None
    if value == 1:
        obj_local = self.obj
    elif value == 2:
        obj_local = self.obj1
    elif value == 3:
        obj_local = self.obj2
    elif value == 4:
        obj_local = self.obj3
    elif value == 5:
        obj_local = self.obj4
    obj_local.Post_Factiva_Home_Request()
    obj1 = Searcher(company_name=companyname, date=date, driver=obj_local.driver)
    obj1.Select_CustomDateRange()
    obj1.Enter_FromDate()
    obj1.Enter_ToDate()
    obj1.Enter_Company()
    obj1.SubmitSearch()
    obj2 = Scrapper(driver=obj_local.driver, session=obj_local.session)
    obj2.MigrateSeleniumtoRequests()
    count_ = 1
    while True:
        obj2.GetAllArticles()
        obj2.Parse_Articles()
        check = obj2.Navigate_NextPage(count=count_)
        if check:
            break
        count_ += 1
    obj2.Save_ToDataFrame(companyname=companyname, date=date)
def MainUpdater(subreddit_name="wallpapers"):
    dir_path = "%s/WallpaperUpdater/" % os.environ["APPDATA"]
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    settings = config_loader(dir_path + "settings.toml")
    if settings["currentimg"] == 6:
        Scrapper(dir_path, subreddit_name)
        settings["currentimg"] = 0
    files = os.listdir(dir_path + "Temp_images")
    Changer(
        os.path.abspath(dir_path + "Temp_images/" +
                        files[settings["currentimg"]]).replace("\\", "/"))
    settings["currentimg"] += 1
    config_saver(settings, dir_path + "settings.toml")
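# Hedged sketch (not in the original source) of what the config_loader() and
# config_saver() helpers used by MainUpdater() might look like, assuming the
# third-party `toml` package and a settings file that only tracks the
# "currentimg" index. The first-run default is an assumption.
import os
import toml


def config_loader(path):
    # Fall back to a fresh settings dict the first time the updater runs;
    # starting at 6 makes MainUpdater() trigger a scrape immediately.
    if not os.path.exists(path):
        return {"currentimg": 6}
    return toml.load(path)


def config_saver(settings, path):
    with open(path, "w") as fh:
        toml.dump(settings, fh)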
def addMetadata(self, key, value):
    key = key.strip()
    if key == '':
        # We can't add an entry with no key.
        return
    if key.endswith(' URL'):
        value = Scrapper.fixShortURL(value)
    #print 'Adding metadata: ', key, ', ', value
    print 'Adding metadata entry: [" ', key, ' "], [" ', value, ' "]'
    if len(self.metadata) == 0:
        # We have to create a dictionary
        self.metadata = {key: value}
        return
    self.metadata[key] = value
def testDownloadDesignFiles():
    print '\nRunning DesignComponent diagnostics...'
    Scrapper.printLine('-')
    # already existing folder on the D: drive
    testDirectoryName = r'D:\Workspace\eda-sourcerer\source\Test'
    component = DesignComponent.UnitTest.generateTestComponent(
        'TestComponent', 'Generic', testDirectoryName)
    print component.toString()
    Scrapper.printLine('-')
    print '\nDownloading test files...'
    component.downloadDesignFiles()
    Scrapper.printLine('-')
    print 'Finished running DesignComponent diagnostics.\n'
def generateTestComponent(manufacturer, name, directory):
    print 'Generating test DesignComponent...("', name, ', ', manufacturer, ', ', directory, ')\n'
    component = DesignComponent(manufacturer, name, directory)
    component.datasheetURL = Scrapper.fixShortURL(
        'www.genericmanufacturer.com/test/1/2/3/datasheet.pdf')
    component.addMetadata(
        '3D Model URL', 'http://www.molex.com/pdm_docs/stp/87918-0001_stp.zip')
    component.addMetadata(
        'Datasheet 1 URL', 'http://www.molex.com/pdm_docs/sd/879180001_sd.pdf')
    component.addMetadata(
        '3D Model 2 URL',
        'http://www.te.com/commerce/DocumentDelivery/DDEController?Action=srchrtrv&DocNm=640454-4&DocType=Customer+View+Model&DocLang=English')
    component.addMetadata(
        'Datasheets 2 URL',
        'http://www.te.com/commerce/DocumentDelivery/DDEController?Action=srchrtrv&DocNm=640454&DocType=Customer+Drawing&DocLang=English')
    component.addMetadata(
        'Online Catalog URL',
        'http://www.digikey.com/catalog/en/partgroup/mta-100-series/9332?mpart=640454-4&vendor=17&WT.z_ref_page_type=PS&WT.z_ref_page_sub_type=PD&WT.z_ref_page_id=PD&WT.z_ref_page_event=DC_Link_Table')
    component.addMetadata(
        'Product Photos URL',
        'http://media.digikey.com/photos/Tyco%20Amp%20Photos/640456-4,%20640454-4.jpg')
    component.addMetadata(
        'Featured Product URL',
        'http://www.digikey.com/product-highlights/us/en/te-connectivity-mta-connectors/2307')
    component.addMetadata(
        'Series URL', 'http://www.digikey.com/product-search/en?FV=ffec1142')
    return component
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from Scrapper import Scrapper
import os
from multiprocessing.dummy import Pool as ThreadPool

if __name__ == "__main__":
    tag = input("Please enter the search tag: ")
    path = r"./{tag}/".format(tag=tag)
    if not os.path.exists(path):
        os.makedirs(path)
    scrapper = Scrapper(path)
    url = "https://www.pexels.com/search/{tag}/?format=js".format(tag=tag)
    imgLinks = scrapper.readhtml(url)
    pool = ThreadPool(20)
    pool.map(scrapper.downLoad, imgLinks)
def downloadDesignFile(self, url):
    if not DesignComponent.isDesignDocument(url):
        return
    Scrapper.downloadFile(url, DesignComponent.designFileTypes)
def load_data(filename):
    """
    Returns the object stored in the file
    @param filename: the name of the file
    @return the object stored in the file
    """
    # Pickle files must be opened in binary mode.
    with open(filename, "rb") as file:
        data = pickle.load(file)
    return data


"""
Example use of Advise.py

scrapper.query(QUERY='university', OBJECT_TYPE='page')
university_pages = scrapper.get_pages(QUERY='university', OBJECT_TYPE='page')
scrapper.dump("query_data")
store_data(university_pages, "university_pages")
clusters = Cluster.kmeans(data=university_pages, k=20)
"""

if __name__ == "__main__":
    scrapper = Scrapper()
    scrapper.load("query_data")
    university_pages = load_data("university_pages")
    clusters = Cluster.kmeans(data=university_pages)
    visualizer = Visualizer(data=university_pages, clusters=clusters)
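# Hedged sketch (not in the original source): store_data() is referenced in the
# example above but its body was cut off; this is a minimal pickle-based
# counterpart to load_data(), assuming plain pickle serialisation.
import pickle


def store_data(data, filename):
    """
    Stores an object in a file
    @param data: the object to store
    @param filename: the name of the file
    """
    with open(filename, "wb") as file:
        pickle.dump(data, file)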
import pandas as pd
from Scrapper import Scrapper
from datetime import date

countries_to_scrape = [
    'GB', 'US', 'BR', 'MX', 'CO', 'FR', 'BE', 'ZA', 'PE', 'AR', 'CA'
]

sc = Scrapper()
sc.start_clean()
countries = sc.get_county_list()
sc.store_output(countries, 'countries-list.csv')
sc.log('Output saved - countries-list.csv')

all_countries_data = None
today = date.today()

for idx, row in countries.iterrows():
    country = row['country']
    if country in countries_to_scrape:
        sc.log(f'scrapping country {country}')
        national_data = sc.get_national_data(row['url'])
        file = country + f'_national_data_{today}.csv'
        sc.store_output(national_data, file)
        sc.log(f'National data saved - {file}')
        sub_national_data = sc.get_sub_national_data(row['url'])
        file = country + f'_sub_national_data_{today}.csv'
        sc.store_output(sub_national_data, file)
        sc.log(f'Sub National data saved - {file}')
def setManufacturer(self, manufacturer):
    properName = Scrapper.fixFilename(manufacturer)
    if properName == '':
        # The name was not valid.
        return
    self.manufacturer = manufacturer
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from multiprocessing.dummy import Pool as ThreadPool
import os
from Scrapper import Scrapper

if __name__ == "__main__":
    urls = []
    numPics = int(input("How many wallpapers do you want to download: "))
    for i in range(1, (numPics // 30) + 2):
        url = "https://www.pexels.com/?format=js&page={page}".format(page=i)
        urls.append(url)
    path = r"./Wallpapers/"
    if not os.path.exists(path):
        os.makedirs(path)
    scrapper = Scrapper(path, numPics)
    imgLinks = scrapper.readHtml(2)
    pool = ThreadPool(20)
    pool.map(scrapper.downLoad, imgLinks)
def __init__(self, projectDirectory):
    HTMLParser.__init__(self)
    self.projectDirectory = Scrapper.cleanDirecoryPath(projectDirectory)
def loadFromURL(self, supplyCode, delineator, url):
    htmlDoc = Scrapper.loadContentFromURL(url)
    self.loadFromHTML(supplyCode, delineator, htmlDoc, url)
def setName(self, name):
    properName = Scrapper.fixFilename(name)
    if properName == '':
        # The name was not valid.
        return
    self.name = name
from Character import Character
from Scrapper import Scrapper
import time

# choose the page to scrape
character_page = "https://naruto.fandom.com/wiki/Category:Characters"
characters = []
character_objects = []
scrapper = Scrapper()

# collect the links of all the characters, accumulating across pages
# (the lists are initialised once, before the loop)
while character_page is not None:
    name_box = scrapper.links(character_page)
    character_page = scrapper.nextPage
    for name in name_box:
        characters.append(name.get('href'))
        character_objects.append(Character(name.get('href')))

# process each link
for character in character_objects:
    character.populate()
    time.sleep(3)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 27 18:27:46 2019

@author: ubuntu
"""
from Scrapper import Scrapper, WikiURLGetter
from KnowledgeCreation import KnowledgeCreation

a = None
try:
    URL = WikiURLGetter("Machine learning").getURL()
    scrap = Scrapper(URL)
    a = scrap.createDriver()
    scrap.wikipediaTable(a)
    me = scrap.wikipediaContent()
    KnowledgeCreation(me).textPreprocessing()
finally:
    # Only close the driver if it was actually created.
    if a is not None:
        a.close()
def testGenerateDesignComponent():
    print '\nRunning testGenerateDesignComponent diagnostics...'
    Scrapper.printLine('-')
    # already existing folder on the D: drive
    testDirectoryName = r'D:\Workspace\eda-sourcerer\source\Test'
    component = DesignComponent.UnitTest.generateTestComponent(
        'TestComponent', 'Generic', testDirectoryName)
    Scrapper.printLine('-')
    print '\n', component.toString()
    Scrapper.printLine('-')
    # Folder that does not exist (raw string so '\a' stays a literal backslash)
    badFolderName = r'B:\ad\direcotry'
    component = DesignComponent.UnitTest.generateTestComponent(
        'TestComponent', 'Generic', badFolderName)
    Scrapper.printLine('-')
    print '\n', component.toString()
    Scrapper.printLine('-')
    component.downloadDesignFiles()
    Scrapper.printLine('-')
    print 'Finished running testGenerateDesignComponent diagnostics.\n'
from Scrapper import Scrapper
from pyquery import PyQuery as pq

scrapper = Scrapper()


def collection_of_womens_name():
    list_of_name = []
    rsp_data = scrapper.requestData(
        url='https://www.babble.com/pregnancy/1000-most-popular-girl-names')
    if rsp_data is None:
        return list_of_name
    html_query = pq(rsp_data)('li.p1')
    for each_name in html_query:
        name = pq(each_name).text().split(" ")[0]
        if len(name) < 20:
            list_of_name.append(name)
    return list_of_name


if __name__ == "__main__":
    list_of_womens_name = collection_of_womens_name()
async def open_page(self, url):
    page = await self.fetch(url)
    return Scrapper(page, url)
from cleaning import Cleaner
from Scrapper import Scrapper
import pandas as pd

if __name__ == "__main__":
    cleaner = Cleaner("../cache")
    print("Enter 1800<year<1900")
    year = int(input())
    if year >= 1800 and year <= 1900:
        print("=======> " + str(year))
        arks = Scrapper.get_arks(year)
        for ark in arks:
            print("=======>" + ark)
            print(f"- download {ark}")
            file = Scrapper.get_document(ark)
            print(f"- Extraction {ark}")
            df = cleaner.extract(file)
            print(f"{df.shape[0]} rows detected")
            print(f"- Post processing {ark}")
            df = cleaner.postProcess(df)
            print("- Spell checking")
            df = cleaner.spell_check(df)
            print("- saving ")
            cleaner.save(df, ark)
            print(" finished " + ark)
            print("\n")
            del file
            del df