def saveImage(url, path):
    global iCount  # counter maintained at module level; decremented on failure
    try:
        image = urllib.URLopener()
        image.retrieve(url, path)
    except IOError:  # URLopener.retrieve signals failures as IOError
        print("Error saving image")
        iCount -= 1
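`URLopener` has been deprecated since Python 3.3, so a minimal sketch of the same helper on the supported `urllib.request.urlretrieve` interface may be useful; the function name below is illustrative, not from the original:

import urllib.request

def save_image_modern(url, path):
    # urlretrieve is the legacy-but-supported replacement for URLopener.retrieve
    try:
        urllib.request.urlretrieve(url, path)
        return True
    except OSError as err:  # urllib.error.URLError subclasses OSError
        print("Error saving image:", err)
        return False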
def download_one(filename, expected_bytes, debug=0, gz=0):
    """
    Download a file if not present, and make sure it's the right size.
    Files are stored in the 'data' folder.
    """
    filename = filename + ".gz"
    filepath = mnist_download_folder + filename
    if not os.path.exists(mnist_download_folder):
        os.makedirs(mnist_download_folder)
    if not os.path.exists(filepath):
        print("Downloading ", filename, " ...")
        file_download = ur.URLopener()
        file_download.retrieve(mnist_url + filename, filepath)
        statinfo = os.stat(filepath)
        if statinfo.st_size == expected_bytes:
            if debug:
                print("Found and verified", filepath)
        else:
            raise Exception(
                "Failed to verify " + filename +
                ". Can you get to it with a browser?\n"
                "Download .gz files from http://yann.lecun.com/exdb/mnist/ "
                "and store them in the mnist_download folder")
    else:
        print("Found and verified", filepath)
    return filepath
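A minimal usage sketch, assuming `mnist_url` and `mnist_download_folder` are set as module globals; the byte counts below are the commonly published sizes of the MNIST archives (verify against the source if they matter), and note the function appends `.gz` itself:

train_images_path = download_one("train-images-idx3-ubyte", 9912422, debug=1)
train_labels_path = download_one("train-labels-idx1-ubyte", 28881, debug=1)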
def download_od_model():
    """
    Downloads a mobile model from the TensorFlow model zoo and prepares it
    for usage in TensorFlow Serving.
    """
    model_name = 'ssd_mobilenet_v2_coco_2018_03_29'
    fname = '{}.tar.gz'.format(model_name)
    url = "http://download.tensorflow.org/models/object_detection/{}".format(fname)
    mobile_dir = os.path.join(model_dir, model_name)
    if not os.path.exists(mobile_dir):
        os.mkdir(mobile_dir)
        file = urllib.URLopener()
        file.retrieve(url, fname)
        tar = tarfile.open(fname, "r:gz")
        tar.extractall('models')
        tar.close()
        os.remove(fname)
        # TensorFlow Serving expects each model version in a numeric subfolder
        checkpoint_dir = os.path.join(mobile_dir, '1')
        os.rename(os.path.join(mobile_dir, 'saved_model'), checkpoint_dir)
        for name in ('checkpoint', 'frozen_inference_graph.pb',
                     'model.ckpt.data-00000-of-00001', 'model.ckpt.index',
                     'model.ckpt.meta', 'pipeline.config'):
            shutil.move(os.path.join(mobile_dir, name),
                        os.path.join(checkpoint_dir, name))
def retrieve_pics(photos: list) -> list:
    """
    Downloads each photo from the absolute URL retrieved by the
    `get_all_info()` function. Also fills in the file-name details on the
    PhotoObjects and returns the same list with that field updated.

    :param photos: List of all PhotoObjects
    :return: Same list with the `file_name` field updated
    """
    file_counter = 1  # Counter for files
    if not os.path.exists(photo_store_folder):  # Make folder if it does not exist
        os.makedirs(photo_store_folder)
    if not os.path.exists(winner_email_folder):  # Make folder if it does not exist
        os.makedirs(winner_email_folder)
    for p2 in photos:
        # Create the file name: besides the prefix, add a little bit of the
        # title, with all special characters removed, plus a sequence number.
        filename = photo_prefix + "%d_%s" % (
            file_counter, re.sub('[^A-Za-z0-9]+', '', p2.title))
        # If the name is more than 20 characters, strip it, then add the extension.
        filename = filename[:20] + ".jpg"
        p2.file_name = filename  # Update file_name field in the PhotoObject
        testfile = request.URLopener()  # start downloading
        testfile.retrieve(p2.photo_url, photo_store_folder + filename)  # Save
        file_counter += 1
    return photos
def main():
    while True:
        # Read the stream: fetch the latest frame, then load it with OpenCV
        urllib.URLopener().retrieve('https://s3.amazonaws.com/hctn/after.jpg',
                                    'after.jpg')
        img = cv.imread('after.jpg')
def stop(self):
    """Stops the server."""
    self.stop_serving = True
    try:
        # This is to force stop the server loop
        urllib_request.URLopener().open("http://%s:%d" % (self.host, self.port))
    except IOError:
        pass
def load(url, file_name, folder):
    # Download the file from the url
    testfile = request.URLopener()
    testfile.retrieve(url, file_name)
    # Un-zip the file and put its contents in folder
    a = py7z_extractall.un7zip(file_name)
    a.extractall(folder)
def stop(self):
    self.stop_serving = True
    try:
        # This is to force stop the server loop
        urllib_request.URLopener().open('http://{}:{}'.format(
            self.host, self.port))
    except IOError:
        pass
    logging.info('Shutting down the webserver')
    self.thread.join()
def download_story(media_url, save_path):
    # Returns True on success, False if the file already exists,
    # and the string "Error" if the download fails.
    if not os.path.exists(save_path):
        try:
            urllib.URLopener().retrieve(media_url, save_path)
            return True
        except Exception as e:
            log_warn("The story could not be downloaded: {:s}".format(str(e)))
            return "Error"
    else:
        return False
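Because the function signals three distinct outcomes (downloaded, already present, failed), callers have to distinguish them explicitly; a short sketch with placeholder arguments:

result = download_story("https://example.com/story.mp4", "stories/story.mp4")
if result is True:
    print("downloaded")
elif result is False:
    print("already on disk")
else:  # the string "Error"
    print("download failed")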
def stop(self):
    """Stops the server."""
    self.stop_serving = True
    try:
        # This is to force stop the server loop
        urllib_request.URLopener().open("http://%s:%d" % (self.host, self.port))
    except IOError:
        pass
    LOGGER.info("Shutting down the webserver")
    self.thread.join()
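The dummy request in these `stop()` methods exists because a blocking `handle_request()` loop only re-checks `stop_serving` after it finishes serving a connection; a self-contained sketch of the serving side this pattern assumes (class and attribute names here are illustrative, not from the original):

import http.server
import threading
import urllib.request


class StoppableServer:
    def __init__(self, host="127.0.0.1", port=8000):
        self.host, self.port = host, port
        self.stop_serving = False
        self._httpd = http.server.HTTPServer(
            (host, port), http.server.SimpleHTTPRequestHandler)
        self.thread = threading.Thread(target=self._serve)
        self.thread.start()

    def _serve(self):
        # handle_request() blocks until a connection arrives, so the flag is
        # only re-read once per request -- hence the wake-up request in stop()
        while not self.stop_serving:
            self._httpd.handle_request()

    def stop(self):
        self.stop_serving = True
        try:
            urllib.request.urlopen("http://%s:%d" % (self.host, self.port))
        except IOError:
            pass
        self.thread.join()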
def load_file(tpl: Tuple[Any, Any, Any]) -> None:
    branch, build_type, build_number = tpl
    file_path = os.path.join(tempdir(), 'argus', branch, build_type,
                             '{}.json'.format(build_number))
    try:
        if not os.path.exists(file_path):
            request.URLopener().retrieve(
                '{}/job/{}-{}-{}/{}/testReport/api/json'.format(
                    Config.JENKINS_URL, Config.JENKINS_PROJECT,
                    branch, build_type, build_number),
                file_path)
    except IOError as e:
        print('Cannot download {}'.format(build_number))
        print(e)
def img_download(object_id, obj_class, ra, dec):
    """
    Given right ascension and declination values, downloads an image from a
    SDSS mirror for Data Release 8. The image is then saved within a folder
    for its class, named by its object id.

    :param object_id: The Galaxy Zoo Object ID for this image.
    :param obj_class: The Galaxy Class.
    :param ra: Right Ascension value from the Galaxy Zoo .csv.
    :param dec: The Declination value from the Galaxy Zoo .csv.
    :return: None
    """
    url = 'http://skyservice.pha.jhu.edu/DR8/ImgCutout/getjpeg.aspx' \
          '?ra={}&dec={}&scale=0.2&width=240&height=240&opt='.format(ra, dec)
    outfile = gal_data_path + '{}/{}.jpg'.format(obj_class, object_id)
    image = request.URLopener()
    image.retrieve(url, outfile)
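A sketch of driving this from a catalogue file; the CSV file name and column labels below are assumptions, and `gal_data_path` is taken to be the same module global used above:

import csv

with open('galaxy_zoo.csv') as catalogue:  # hypothetical file name
    for row in csv.DictReader(catalogue):  # hypothetical column names
        img_download(row['OBJID'], row['CLASS'], row['RA'], row['DEC'])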
def _download_one(filename):
    """
    Download a file if not present.
    The default save path is the "data/" folder.
    """
    filepath = adult_download_folder + filename
    if not os.path.exists(adult_download_folder):
        os.makedirs(adult_download_folder)
    if not os.path.exists(filepath):
        print("Downloading ", filename, " ...")
        file_download = ur.URLopener()
        file_download.retrieve(adult_url + filename, filepath)
def _download_one(filename):
    """
    Download a file if not present.
    The default save path is the "data/" folder.
    """
    filepath = health_download_folder + filename
    if not os.path.exists(health_download_folder):
        os.makedirs(health_download_folder)
    if not os.path.exists(filepath):
        print("Downloading ", filename, " ...")
        file_download = ur.URLopener()
        file_download.retrieve(health_url + filename, filepath)
    else:
        print("Found and verified ", filepath)
def pushtodatabase(bookdetails, name):
    pushothermodels('language', bookdetails['language'])
    pushothermodels('genre', bookdetails['genre'])
    pushothermodels('authors', bookdetails['authors'])
    bookdic = {}
    bookdic['title'] = bookdetails['title']
    bookdic['pageCount'] = bookdetails['pageCount']
    bookdic['isbn'] = bookdetails['isbn']
    bookdic['pages'] = bookdetails['pageCount']
    languageids = Language.objects.filter(
        name=bookdetails['language']).values('id')[0]['id']
    bookdic['language'] = str(languageids)
    genreid = Genre.objects.filter(
        name=bookdetails['genre']).values('id')[0]['id']
    bookdic['genre'] = [str(genreid)]
    authorlist = []
    for author in bookdetails['authors']:
        authorid = Author.objects.filter(
            name=author.capitalize()).values('id')[0]['id']
        authorlist.append(str(authorid))
    bookdic['author'] = authorlist
    bookfiles = {}
    # bookfiles['pdf'] = File(open(name, 'rb'), os.path.basename(name))
    bookfiles['pdf'] = File(open(name, 'rb'),
                            bookdic['title'] + os.path.splitext(name)[1])
    bookfiles['epub'] = File(open(name, 'rb'),
                             bookdic['title'] + os.path.splitext(name)[1])
    # Fetch the cover image and attach it to the form files
    testfile = request.URLopener()
    coverimagename = bookdetails['title'] + '.jpg'
    testfile.retrieve(bookdetails['image'], coverimagename)
    bookfiles['cover'] = File(open(coverimagename, 'rb'),
                              coverimagename.replace('pdfbooks/', ''))
    form = BookForm(bookdic, bookfiles)
    if form.is_valid():
        form.save()
    else:
        print(form.errors)
    os.remove(coverimagename)
def download_oid_gt():
    """
    Download groundtruth data from the Open Images Dataset.
    """
    url = 'https://storage.googleapis.com/openimages/2018_04/'
    url_to_train_gt = os.path.join('train', train_gt_file)
    url_to_test_gt = os.path.join('test', test_gt_file)
    url_to_val_gt = os.path.join('validation', val_gt_file)
    file = urllib.URLopener()
    if not os.path.exists(path_to_test_gt_file):
        file.retrieve(os.path.join(url, url_to_test_gt), path_to_test_gt_file)
    if not os.path.exists(path_to_val_gt_file):
        file.retrieve(os.path.join(url, url_to_val_gt), path_to_val_gt_file)
    if not os.path.exists(path_to_train_gt_file):
        file.retrieve(os.path.join(url, url_to_train_gt), path_to_train_gt_file)
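`os.path.join` builds backslash-separated paths on Windows, which produces malformed URLs; `urllib.parse.urljoin` is a platform-independent way to compose the same links, sketched here with the base URL and the `train_gt_file`/`test_gt_file` globals from the function above:

from urllib.parse import urljoin

base = 'https://storage.googleapis.com/openimages/2018_04/'
# urljoin always uses forward slashes, regardless of platform
train_url = urljoin(base, 'train/' + train_gt_file)
test_url = urljoin(base, 'test/' + test_gt_file)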
def find_and_convert(root, srch_str):
    """
    Searches Google Images for the srch_str provided, selects one image at
    random, converts it into .ico format, saves it to the appropriate
    directory and returns the file name.
    """
    query_str = '+'.join(srch_str.split())
    url = "https://www.google.co.in/search?q=" + query_str + \
          "&source=lnt&tbm=isch&tbs=isz:ex,iszw:256,iszh:256"
    req = urllib2.Request(url, headers=REQUEST_HEADER)
    soup = BeautifulSoup(
        urllib2.urlopen(req, timeout=200).read(), 'html.parser')
    img_arr = []
    for a_tag in soup.find_all("div", {"class": "rg_meta"}):
        img_link = json.loads(a_tag.text)["ou"]
        try:
            opener = urllib2.URLopener()
            opener.addheaders = [('User-Agent', REQUEST_HEADER['User-Agent']),
                                 ('Accept', REQUEST_HEADER['Accept']),
                                 ('Accept-Language',
                                  REQUEST_HEADER['Accept-Language']),
                                 ('Connection', REQUEST_HEADER['Connection'])]
            opener.retrieve(img_link)
        except urllib2.HTTPError:
            # e.g. urllib2.HTTPError: HTTP Error 403: Forbidden
            continue
        img_arr.append(img_link)
        if len(img_arr) == IMAGE_DOWNLOAD_LIMIT:
            break
    img_choice = random.choice(img_arr) if img_arr else None
    if not img_choice:
        return ""
    req = urllib2.Request(img_choice, headers=REQUEST_HEADER)
    img = Image.open(io.BytesIO(urllib2.urlopen(req, timeout=200).read()))
    img = img.convert("RGB")
    ico_file_name = os.path.join(os.path.join(root, srch_str),
                                 srch_str + ".ico")
    img.save(ico_file_name)
    return ico_file_name
def download_data_external(URL, data_format):
    """
    This function is used to download external data onto the local machine.

    Args:
        URL (str): the url that points to the external data.
        data_format (str): the format the user specified at boot time,
            needed for uncompressing.

    Returns:
        None; the file will be downloaded to the local disk.
    """
    connector = urllib.URLopener()
    if data_format == 'gz':
        connector.retrieve(URL, './cvdata.gz')
    elif data_format == 'zip':
        connector.retrieve(URL, './cvdata.zip')
    elif data_format == 'tar':
        connector.retrieve(URL, './cvdata.tar')
    elif data_format == 'uncompressed':
        connector.retrieve(URL, './cvdata/')
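The four branches differ only in the destination path, so the dispatch collapses naturally into a lookup table; a behavior-equivalent sketch (the `_v2` name is hypothetical, and `urllib` is assumed to be aliased to `urllib.request` as in the snippet above; note that the 'uncompressed' branch retrieves into a directory path, which will fail unless `./cvdata/` is handled specially):

def download_data_external_v2(URL, data_format):
    # hypothetical rewrite: same destinations as the if/elif chain above
    targets = {'gz': './cvdata.gz', 'zip': './cvdata.zip',
               'tar': './cvdata.tar', 'uncompressed': './cvdata/'}
    if data_format in targets:
        urllib.URLopener().retrieve(URL, targets[data_format])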
def scraping(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    for table in soup.find_all('table', attrs={'class': 'detail-text'}):
        for tr in table.find_all('tr'):
            try:
                if tr.find_all('td')[1].find('a'):
                    # if tr.find_all('td')[1].find('a', attrs={'href': re.compile('MIDIFiles')}):
                    link = tr.find_all('td')[1].find('a').get('href')
                    year = url[-8:-4]
                    filename = link.split('/')[-1]
                    title = tr.find_all('td')[1].text
                    composer = tr.find_all('td')[0].text
                    print(filename, composer, year, title)
                    downloadfile = ur.URLopener()
                    downloadfile.retrieve(
                        'http://www.piano-e-competition.com' + link, filename)
                    # f is assumed to be a csv.writer opened at module level
                    f.writerow([filename, composer, year, title])
            except Exception:
                continue
def lambda_handler(event, context):
    file = url.URLopener()
    try:
        file.retrieve("http://www.bogc.dnrc.mt.gov/production/historical.zip",
                      TMP_FILE)
        with ZipFile(TMP_FILE) as zip:
            file_leaseProd = zip.read('histLeaseProd.tab')
            file_wellProd = zip.read('histprodwell.tab')
            file_wellData = zip.read('histWellData.tab')
        s3 = boto3.resource('s3')
        s3.Bucket(BUCKET_NAME).put_object(Key='MT_leaseProd.tab',
                                          Body=file_leaseProd)
        s3.Bucket(BUCKET_NAME).put_object(Key='MT_wellProd.tab',
                                          Body=file_wellProd)
        s3.Bucket(BUCKET_NAME).put_object(Key='MT_wellData.tab',
                                          Body=file_wellData)
    except Exception as e:
        print(e)
        raise e
def load(url, file_name, folder):
    """
    Download the archive for a StackExchange site and unzip it, skipping
    either or both steps if the necessary tables are already available.
    """
    # Need special case for Stack Overflow (more than one 7z file)
    if not os.path.isfile(file_name):
        # Download the file from the url; two url patterns are attempted
        testfile = request.URLopener()
        try:
            testfile.retrieve(url[0], file_name)
        except error.HTTPError as e:
            try:
                testfile.retrieve(url[1], file_name)
            except Exception:
                print("Error: URL retrieval of " + url[0] + " and " + url[1] +
                      " failed for reason: " + e.reason)
                quit()
    # Un-zip the file and put its contents in folder
    a = py7z_extractall.un7zip(file_name)
    if not (os.path.isfile(os.path.join(folder, "PostLinks.xml")) and
            os.path.isfile(os.path.join(folder, "Posts.xml"))):
        a.extractall(folder)
def retrieveLidar(x1, y1, x2, y2):
    xmin = min(x1, x2)
    ymin = min(y1, y2)
    xmax = max(x1, x2)
    ymax = max(y1, y2)
    lastblok = 10
    for x in range(xmin, xmax + 1):
        for y in range(ymin, ymax + 1):
            print(x, y)
            b = lastblok
            # e.g. http://gis.arso.gov.si/lidar/otr/laz/b_22/D48GK/GKR_504_107.laz
            urladdr = ("http://gis.arso.gov.si/lidar/otr/laz/"
                       "b_{2}/D48GK/GKR_{0}_{1}.laz").format(x, y, b)
            if urlExists(urladdr):
                urlE = True
                print("Found!")
            else:
                # Scan block numbers 10..100 until the tile is found
                b = 9
                urlE = False
                while b < 100 and not urlE:
                    b = b + 1
                    urladdr = ("http://gis.arso.gov.si/lidar/otr/laz/"
                               "b_{2}/D48GK/GKR_{0}_{1}.laz").format(x, y, b)
                    urlE = urlExists(urladdr)
                if urlE:
                    lastblok = b
                    print("Found!")
                else:
                    print("\n\n\nFile NOT FOUND!\n\n\n")
            # download
            if urlE:
                print("Retrieving file ...")
                downfile = url.URLopener()
                downfile.retrieve(urladdr, "raw/GK_{0}_{1}.laz".format(x, y))
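`urlExists` is referenced above but not shown; a plausible sketch using a HEAD request via `urllib.request` (the behavior is inferred from the call sites, so treat this as an assumption rather than the original helper):

import urllib.request

def urlExists(urladdr):
    # A HEAD request avoids downloading the (large) .laz tile just to probe it
    req = urllib.request.Request(urladdr, method='HEAD')
    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            return resp.status == 200
    except OSError:  # URLError and socket timeouts both subclass OSError
        return False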
def retrieve_pics(photos: list) -> list:
    """
    Downloads each photo from the absolute URL retrieved by the
    `get_all_info()` function. Also fills in the file-name details on the
    PhotoObjects and returns the same list with that field updated.

    :param photos: List of all PhotoObjects
    :return: Same list with the `file_name` field updated
    """
    file_counter = 1  # Counter for files
    if not os.path.exists(photo_store_folder):  # Make folder if it does not exist
        os.makedirs(photo_store_folder)
    if not os.path.exists(winner_email_folder):  # Make folder if it does not exist
        os.makedirs(winner_email_folder)
    for p2 in photos:
        # Create the file name: besides the prefix, add a little bit of the
        # title, with all special characters removed, plus a zero-padded
        # sequence number.
        count_text = str(file_counter)
        if len(count_text) == 1:
            count_text = "0" + count_text
        filename = photo_prefix + "%s_%s" % (
            count_text, re.sub('[^A-Za-z0-9]+', '', p2.title))
        # If the name is more than 20 characters, strip it, then add the extension.
        filename = filename[:20] + ".jpg"
        # If for some reason a file does not get downloaded, warn and continue.
        # Required for Hippo integration.
        try:
            p2.file_name = filename  # Update file_name field in the PhotoObject
            testfile = request.URLopener()  # start downloading
            testfile.retrieve(p2.photo_url, photo_store_folder + filename)  # Save
            file_counter += 1
        except Exception as e:
            logging.warning("Failed to download %s due to %s" % (p2.photo_url, e))
    return photos
import urllib.request as urllib
import os
from PIL import Image

number = 0
with open("imagenet.synset.txt") as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
content = [x.strip() for x in content]
os.chdir('./file')
for i in content:
    number = number + 1
    name = str(number) + ".jpg"
    try:
        url = str(i)
        image = urllib.URLopener()
        image.retrieve(url, name)
        # Re-save the downloaded image with a 600 dpi tag
        im = Image.open(name)
        im.save(name, dpi=(600, 600))
    except IOError:
        continue
def _download(self, fpath, link):
    print("Downloading from '{}' to '{}'".format(link, fpath))
    urllib.URLopener().retrieve(link, fpath)
def extractincidents(List):
    data = ur.urlopen(List[0])
    testfile = ur.URLopener()
    testfile.retrieve(List[0], "file.pdf")
    # Create the df for allArrests. We just want the structure of whatever is
    # in there, so make a df with that form and then delete all the data in it.
    df = read_pdf("file.pdf", flavor='stream',
                  columns=['112,162,241,342,425,465,525,570,599,634,685'],
                  split_text=True, pages='1')
    allArrests = df[0].df
    # header = allArrests.iloc[2, :]
    # header[5] = 'Arestee Birthday'
    header = ['arrest_time', 'case_number', 'arrest_location', 'offense',
              'arrestee_name', 'arrestee_birthday', 'arrestee_address',
              'City', 'State', 'Zip', 'status', 'officer']
    allArrests.drop(allArrests.index, inplace=True)
    # Loop through all URLs found in the list
    for urlNum in range(0, len(List), 1):
        # Open a URL in the list of URLs
        data = ur.urlopen(List[urlNum])
        testfile = ur.URLopener()
        testfile.retrieve(List[urlNum], "file.pdf")
        # Create a temporary file for pdfReader
        fp = tempfile.TemporaryFile()
        fp.write(data.read())
        fp.seek(0)
        # Extract the number of pages in the PDF
        pdfReader = PdfFileReader(fp)
        pages = PdfFileReader(fp).getNumPages()
        # An alternative method of parsing the PDF:
        # page1 = pdfReader.getPage(0).extractText()
        # content = ""
        # for i in range(0, pdfReader.getNumPages()):
        #     extractedText = pdfReader.getPage(i).extractText()
        #     content += extractedText + "\n"
        # content = " ".join(content.replace("\xa0", " ").strip().split())
        for pageNum in range(1, pages + 1, 1):
            # Use CAMELOT to parse the table. The parsing isn't perfect,
            # however: there are no lines in the table for the lattice method
            # to look for, and text spans multiple rows. And it gets worse!
            # CAMELOT attempts to define rows by looking for consistent edges
            # in text, and the tables given run text so close together that
            # CAMELOT thinks they're one column. So visual debugging was done
            # by extracting column pixel positions from a plot.
            df = read_pdf("file.pdf", flavor='stream',
                          columns=['112,162,241,342,425,465,525,570,599,634,685'],
                          split_text=True, pages=str(pageNum))
            print('Now parsing page: ' + str(pageNum) + ' on PDF number: ' +
                  str(urlNum + 1) + ' out of ' + str(len(List)))
            # CAMELOT does a good job, but returns spanning text above and
            # below the record. The following code:
            #   * finds the blanks in a record
            #   * fills the blanks with the record above and below
            #   * continues until there are no more inappropriate NAs
            nadf = df[0].df.replace('', np.nan)
            tocat = np.where(nadf.notna().iloc[:, 0])[0]
            for i in tocat[::-1]:
                if (i + 1 in df[0].df.index and i - 1 in df[0].df.index and
                        nadf.notna().iloc[i + 1, 0] == False and
                        nadf.notna().iloc[i - 1, 0] == False):
                    # Concatenate above and below
                    df[0].df.iloc[i, :] = (df[0].df.iloc[i - 1, :] + ' ' +
                                           df[0].df.iloc[i, :] +
                                           df[0].df.iloc[i + 1, :])
                    # Strip off unnecessary spaces
                    df[0].df.iloc[i, :] = df[0].df.iloc[i, :].map(
                        lambda x: x.strip())
                    # Remove the concatenated rows
                    df[0].df = df[0].df.drop([i - 1, i + 1], axis=0)
            df[0].df = df[0].df.reset_index(drop=True)
            # Second loop for more blank lines
            nadf = df[0].df.replace('', np.nan)
            tocat = np.where(nadf.notna().iloc[:, 0])[0]
            for i in tocat[::-1]:
                if (i + 1 in df[0].df.index and i - 1 in df[0].df.index and
                        nadf.notna().iloc[i + 1, 0] == False and
                        nadf.notna().iloc[i - 1, 0] == False):
                    df[0].df.iloc[i, :] = (df[0].df.iloc[i - 1, :] + ' ' +
                                           df[0].df.iloc[i, :] + ' ' +
                                           df[0].df.iloc[i + 1, :])
                    df[0].df.iloc[i, :] = df[0].df.iloc[i, :].map(
                        lambda x: x.strip())
                    df[0].df = df[0].df.drop([i - 1, i + 1], axis=0)
            df[0].df = df[0].df.reset_index(drop=True)
            # Drop the header and any rows that don't contain data
            nadf = df[0].df.replace('', np.nan)
            tocat = np.where(nadf.isna().iloc[:, 0])[0]
            for i in tocat[::-1]:
                df[0].df = df[0].df.drop([i], axis=0)
            df[0].df = df[0].df[~df[0].df[0].str.contains('Arrest')]
            df[0].df = df[0].df.reset_index(drop=True)
            # Append the current page to the growing dataframe of all arrests
            # held at the Norman splash page
            allArrests = allArrests.append(df[0].df)
    # Format the columns to match what's required in the key for the SQLite database
    allArrests.columns = header
    allArrests['arrestee_address'] = (allArrests['arrestee_address'] + ' ' +
                                      allArrests['City'] + ' ' +
                                      allArrests['State'] + ' ' +
                                      allArrests['Zip'])
    allArrests['arrestee_address'] = allArrests['arrestee_address'].map(
        lambda x: x.strip())
    allArrests.drop(['City', 'State', 'Zip'], axis=1, inplace=True)
    # Prepend the header to the dataframe of all arrests at the Norman splash page:
    # allArrests = allArrests.reset_index(drop=True)
    # allArrests.loc[-1] = header
    # allArrests.index = allArrests.index + 1
    # allArrests = allArrests.sort_index()
    # Output every arrest on the Norman police page as a CSV; used for
    # testing and debugging
    allArrests.to_csv("file.csv")
    return allArrests
import urllib.request as ur
import os

testfile = ur.URLopener()
# import httplib

read_file = open('wanted.txt', 'r')
raw_lists = read_file.read()
urls = []
read_splited = raw_lists.split('<')
# print(read_splited)
for i in range(1, len(read_splited)):
    urls.append("http://222.236.46.45" + read_splited[i].split('>')[0])

# headers = {'User-agent': 'Python'}
# conn = httplib.HTTPConnection('222.236.46.45')
drout = 'Downloads'
if not os.path.exists(drout):
    os.makedirs(drout)

k = 0
while k < len(urls):
    try:
        # This enables us to try downloading again if a temporary network
        # error occurs.
        temp = urls[k].split('/')
        filename = urls[k].split('/')[len(temp) - 1]
        print('Downloading ' + filename + '...')
        # conn.request('GET', urls[k], '', headers)
        # resp = conn.getresponse()
        # image = resp.read()
        # f = open('Downloads/' + filename, 'wb')
        # f.write(image)
        # The original snippet is truncated here; the retrieve/retry below is
        # a reconstruction that matches the comment above.
        testfile.retrieve(urls[k], os.path.join(drout, filename))
        k += 1
    except IOError:
        continue
def get_media_story(user_to_check, user_id, ig_client):
    try:
        try:
            feed = ig_client.user_story_feed(user_id)
        except Exception as e:
            print("[W] An error occurred: " + str(e))
            exit(1)
        try:
            feed_json = feed['reel']['items']
        except TypeError:
            print("[I] There are no recent stories to process for this user.")
            return

        list_video = []
        list_image = []
        list_video_new = []
        list_image_new = []
        for media in feed_json:
            if 'video_versions' in media:
                list_video.append(media['video_versions'][0]['url'])
            if 'image_versions2' in media:
                list_image.append(
                    media['image_versions2']['candidates'][0]['url'])

        for video in list_video:
            filename = video.split('/')[-1]
            final_filename = filename.split('.')[0] + ".mp4"
            save_path = os.getcwd() + "/stories/{}/".format(
                user_to_check) + final_filename
            if not os.path.exists(save_path):
                print("[I] Downloading video: {:s}".format(final_filename))
                try:
                    urllib.URLopener().retrieve(video, save_path)
                    list_video_new.append(save_path)
                except Exception as e:
                    print("[W] An error occurred: " + str(e))
                    exit(1)
            else:
                print("[I] Story already exists: {:s}".format(final_filename))

        for image in list_image:
            filename = (image.split('/')[-1]).split('?', 1)[0]
            final_filename = filename.split('.')[0] + ".jpg"
            save_path = os.getcwd() + "/stories/{}/".format(
                user_to_check) + final_filename
            if not os.path.exists(save_path):
                print("[I] Downloading image: {:s}".format(final_filename))
                try:
                    urllib.URLopener().retrieve(image, save_path)
                    list_image_new.append(save_path)
                except Exception as e:
                    print("[W] An error occurred: " + str(e))
                    exit(1)
            else:
                print("[I] Story already exists: {:s}".format(final_filename))

        if list_image_new or list_video_new:
            print('-' * 70)
            print("[I] Story downloading ended with " +
                  str(len(list_image_new)) + " new images and " +
                  str(len(list_video_new)) + " new videos downloaded.")
        else:
            print('-' * 70)
            print("[I] No new stories were downloaded.")
    except Exception as e:
        print("[E] An error occurred: " + str(e))
        exit(1)
    except KeyboardInterrupt:
        print("[I] User aborted download.")
        exit(1)
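A usage sketch: the function writes into a per-user stories folder it does not create itself, and `ig_client` is assumed to be an already-authenticated API object exposing `user_story_feed` (the user name and id below are placeholders):

import os

os.makedirs(os.getcwd() + "/stories/some_user", exist_ok=True)
get_media_story("some_user", "1234567890", ig_client)  # ig_client: authenticated client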
def scrape(self, limit=-1, start=0):
    self.offset = start
    t0 = time.time()
    # url = self.url + "&max_results=" + str(limit) + "&start=" + str(start)
    url = self.url
    sys.stdout.flush()
    ds = []
    k = 0
    while True:
        sys.stdout.flush()
        try:
            if time.time() - t0 > 60:
                print("socket timed out")
                raise TimeoutError("socket timed out")
            print("fetching: ", start, "/", limit, url,
                  "proxy:", self.proxy, self.proxy_protocol)
            # req = urlrequest.Request(url)
            # if self.proxy is not None and self.proxy != "":
            #     req.set_proxy(self.proxy, self.proxy_protocol)
            # response = urlrequest.urlopen(req)
            response = urlrequest.URLopener(proxies=self.proxies).open(url)
        except socket.error:
            print("socket error, retrying...")
            time.sleep(2)
            continue
        except HTTPError as e:
            if e.code == 503:
                to = int(e.hdrs.get('retry-after', 30))
                print('Got 503. Retrying after {0:d} seconds.'.format(to))
                time.sleep(to)
                continue
            else:
                raise
        xml = response.read()
        root = ET.fromstring(xml)
        hasError = root.findall("error")
        if len(hasError) > 0:
            print("has error: " + xml.decode("utf-8"))
            raise RuntimeError("error element in XML response")
        records = root.findall(OAI + 'ListRecords/' + OAI + 'record')
        print("records: ", len(records), "k", k)
        sys.stdout.flush()
        if k <= start + len(records):
            for record in records:
                meta = record.find(OAI + 'metadata').find(ARXIV + 'arXiv')
                record = Record(meta).output()
                if k >= start and (limit == -1 or k < start + limit):
                    if self.append_all:
                        ds.append(record)
                    else:
                        save_record = False
                        for key in self.keys:
                            for word in self.filters[key]:
                                if word.lower() in record[key]:
                                    save_record = True
                        if save_record:
                            ds.append(record)
                k += 1
                if limit >= 0 and k >= start + limit:
                    break  # skip after max reached
            listRecords = root.find(OAI + 'ListRecords')
            if listRecords is None:
                print("ListRecords not found", xml.decode("utf-8"))
                sys.stdout.flush()
                return ds
        else:
            print("skipping", len(records))
            k += len(records)  # skipped
        if limit >= 0 and k + 1 > start + limit:
            print("reached limit", k + 1, start + limit)
            sys.stdout.flush()
            break
        token = listRecords.find(OAI + 'resumptionToken')
        if token is None or token.text is None:
            self.nextUrl = ""
            break
        else:
            url = BASE + 'resumptionToken=%s' % token.text
            self.nextUrl = url
        if k >= start:
            break  # use next() to continue
    # end while
    self.offset += k
    t1 = time.time()
    print('fetching completed in {0:.1f} seconds.'.format(t1 - t0),
          "offset:", self.offset)
    sys.stdout.flush()
    return ds
def next(self, limit=-1):
    t0 = time.time()
    sys.stdout.flush()
    ds = []
    k = 0
    # Retry loop: fetch the continuation page, backing off on 503 responses
    while True:
        print("continue fetch: ", self.offset, " for ", limit, self.nextUrl,
              "proxy:", self.proxy, self.proxy_protocol)
        sys.stdout.flush()
        try:
            if time.time() - t0 > 60:
                print("socket timed out")
                raise TimeoutError("socket timed out")
            # req = urlrequest.Request(self.nextUrl)
            # if self.proxy is not None and self.proxy != "":
            #     req.set_proxy(self.proxy, self.proxy_protocol)
            # response = urlrequest.urlopen(req)
            # response = requests.get(self.nextUrl, proxies=self.proxies)
            response = urlrequest.URLopener(proxies=self.proxies).open(self.nextUrl)
            break
        except HTTPError as e:
            if e.code == 503:
                to = int(e.hdrs.get('retry-after', 30))
                print('Got 503. Retrying after {0:d} seconds.'.format(to))
                time.sleep(to)
                continue
            else:
                raise
    # end while
    xml = response.read()
    root = ET.fromstring(xml)
    hasError = root.findall("error")
    if len(hasError) > 0:
        print("has error: " + xml.decode("utf-8"))
        raise RuntimeError("error element in XML response")
    records = root.findall(OAI + 'ListRecords/' + OAI + 'record')
    print("records: ", len(records), "k", k)
    sys.stdout.flush()
    for record in records:
        meta = record.find(OAI + 'metadata').find(ARXIV + 'arXiv')
        record = Record(meta).output()
        if limit == -1 or k < limit:
            if self.append_all:
                ds.append(record)
            else:
                save_record = False
                for key in self.keys:
                    for word in self.filters[key]:
                        if word.lower() in record[key]:
                            save_record = True
                if save_record:
                    ds.append(record)
        k += 1
        if limit >= 0 and k >= limit:
            break  # skip after max reached
    listRecords = root.find(OAI + 'ListRecords')
    if listRecords is None:
        print("ListRecords not found", xml.decode("utf-8"))
        sys.stdout.flush()
        return ds
    if limit >= 0 and k + 1 > limit:
        print("reached limit", k + 1, limit)
        sys.stdout.flush()
        self.nextUrl = ""
    else:
        print("getting next token")
        token = listRecords.find(OAI + 'resumptionToken')
        if token is None or token.text is None:
            self.nextUrl = ""
        else:
            url = BASE + 'resumptionToken=%s' % token.text
            self.nextUrl = url
    print("next size: ", len(ds))
    self.offset += k
    t1 = time.time()
    print('next completes in {0:.1f} seconds, offset: '.format(t1 - t0),
          self.offset)
    sys.stdout.flush()
    return ds
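A sketch of driving the two methods as a pager: `scrape()` fetches the first window and leaves the OAI resumption URL in `self.nextUrl`, and `next()` continues from it until the token runs out (the `Scraper` constructor below is hypothetical; only the two methods above are from the source):

scraper = Scraper()  # hypothetical constructor for the class holding scrape()/next()
records = scraper.scrape(limit=100, start=0)
while scraper.nextUrl:
    records.extend(scraper.next(limit=100))
print(len(records), "records fetched")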