def __init__(self, phenny):
    Download.__init__(self)
    self.re_title = re.compile('ustream\.vars\.channelTitle\=\"(.*?)\"\;ustream\.vars\.')
    self.re_channel = re.compile('ustream\.vars\.channelId\=(\d*?)\;ustream\.vars\.')
    self.apikey = phenny.config.ustreamdevapikey
    self.urltype = None
    self.h = {}
def __init__(self):
    Download.__init__(self)
    self.re_idblob = re.compile("si:\"(.+?)\",")
    self.re_title = re.compile("<title>(.*?) \| Video on TED\.com<\/title>")
    self.urltype = VidType.TED
    self.h = {}
def doRequest(self):
    d = Download(self.Url)
    if d.doRequest():
        return 1
    self.recs = d.getSOURCE()
    return 0
def __init__(self): """ Classe per gestire la copia dei dati da una forma /dataset/class1/ /dataset/class1/test To /train/class1 test/class2 """ #variabile dove si trova il dataset self.data_dir = "/Users/Eric/Desktop/eric/Programmazione/python/DeepLearning/data/knifey-spoony" #Url dove posso scaricare un dataset self.data_url = "https://github.com/Hvass-Labs/knifey-spoony/raw/master/knifey-spoony.tar.gz" #path della cartella di train self.train_dir = os.path.join(self.data_dir, "train/") #path della cartella di test self.test_dir = os.path.join(self.data_dir, "test/") #dimensione immagine self.image_size = 200 #canali immagine self.num_channels = 3 self.img_shape = [self.image_size, self.image_size, self.num_channels] self.img_size_flat = self.image_size * self.image_size * self.num_channels #numero di classi del dataset self.num_classes = 3 self.download = Download()
def dl(self):
    """
    Downloads the highest quality picture available.
    Returns False if something goes wrong.
    """
    if(self.orig_url == ""):
        if(self.hq_url == ""):
            down = Download(self.lq_url, self.config.get_image_folder())
            if(down.perform()):
                return True
        else:
            down = Download(self.hq_url, self.config.get_image_folder())
            if(down.perform()):
                return True
    else:
        down = Download(self.orig_url, as_var=True)
        if(down.perform()):
            result = down.get_result()
            soup = BeautifulSoup(result.getvalue())
            download_link = soup.find("a", text="this link")
            orig_url = self.dl_url_base + download_link["href"]
            time.sleep(120)
            down = Download(orig_url, self.config.get_image_folder())
            if(down.perform()):
                self.file_name = down.get_output_name()
                return True
    return False
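# Minimal usage sketch (assumption: dl() belongs to an image/wallpaper object that
# also defines find_urls(), config, lq_url/hq_url/orig_url and file_name; the
# object name below is illustrative, not from the original source).
#
# image.find_urls()          # populate lq_url / hq_url / orig_url first
# if image.dl():
#     print("saved as", image.file_name)
# else:
#     print("download failed")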
def __init__(self):
    Download.__init__(self)
    self.re_id = re.compile("swfobject\.embedSWF\(\"(.*?)\"")
    self.re_title = re.compile("\<title\>MIT TechTV \&ndash\; (.*?)\<\/title\>")
    self.urltype = VidType.MITTECHTV
    self.h = {}
def __init__(self): Download.__init__(self) self.re_id = re.compile( "<meta\s+name\=\"item\-id\"\s+content\=\"(.*?)\"") self.re_title = re.compile( "<meta\s+name\=\"title\"\s+content\=\"(.*?)\"") self.urltype = VidType.VEOH self.h = {}
def __init__(self):
    Download.__init__(self)
    self.re_vid = re.compile('<meta property="og:video"\s+content="http://blip\.tv/play/(.*?)"/>')
    self.re_title = re.compile('<title>(.*?)</title>')
    self.urltype = VidType.BLIPTV
    self.h = {}
def __init__(self):
    Download.__init__(self)
    self.re_live_id = re.compile('stream_id \= (\d+?)\;')
    self.re_recorded_id = re.compile('full_program_clipid \= (\d+?)\;')
    self.re_live_title = re.compile('stream_title \= "(.+?)"\;')
    self.re_recorded_title = re.compile('full_program_title \= "(.+?)"\;')
    self.type = None
    self.urltype = None
    self.h = {}
def isGoogleSearch(schema, ip):
    d = Download(schema + '://' + ip)
    if d.doRequest():
        return False
    if Utility.containsGoogle(d.getSOURCE()):
        return True
    return False
def __init__(self, phenny):
    Download.__init__(self)
    self.re_id = re.compile("^[a-zA-Z0-9\_\-]{11}$")
    self.re_fragment = re.compile("^t\=((\d+)h)?((\d+)m)?((\d+)s)?$")
    self.urltype = VidType.YOUTUBE
    self.h = {}
    self.gdatahost = 'gdata.youtube.com'
    self.developer_key = phenny.config.youtubedevapikey
    self.gdataxml = None
def requestHtml(self):
    url = self.BaseUrl + self.ISBN
    # print url, self.User_Agent
    d = Download(url, self.User_Agent)
    if d.doRequest():
        return 1
    self.HTML = d.getSOURCE()
    return 0
def run(self):
    url = self.BASE_URL + self.SeasonId + self.BASE_URL_PART_3 + str(self.PageNumber) + self.BASE_URL_PART_5
    d = Download(url)
    if d.doRequest():
        # fail
        print 'ERROR: ' + self.SeasonId + '-' + str(self.PageNumber)
    else:
        utfstr2file(d.getSOURCE(), './data/' + self.SeasonId + '-' + str(self.PageNumber) + '.raw')
    return url
def request(self): baseUrl = "http://shaishufang.com/index.php/site/main/uid/" postFix = "/friend/false/category//status//type//page/" url = baseUrl + self.UID + postFix + str(self.Page) d = Download(url, self.Cookie, self.Proxy) if d.doRequest(): return False self.HTML = d.getSOURCE() return True
def doRequest(self):
    playerId = str(self.PlayerId)
    seasonType = self.SeasonType.replace(" ", "+")
    url = self.Url + "PlayerId=" + playerId + "&SeasonType=" + seasonType + "&League=" + self.LeagueId
    d = Download(url)
    if d.doRequest() == 1:
        return 1
    self.recs = dumps(loads(d.getSOURCE()))
    return 0
def request(self):
    baseUrl = 'http://shaishufang.com/index.php/site/detail/uid/'
    postFix = '/status//category/none/friend/false'
    url = baseUrl + self.UID + '/ubid/' + self.BID + postFix
    d = Download(url, self.Cookie, self.Proxy)
    if d.doRequest():
        return False
    self.HTML = d.getSOURCE()
    return True
def file_exists(self, file_path):
    hash_local = self.hash_file(file_path)
    download = Download(
        ("https://commons.wikimedia.org/w/api.php?action=query&list"
         "=allimages&format=json&aisha1=") + hash_local, as_var=True)
    if(download.perform()):
        content = download.get_result().getvalue()
        json_data = json.loads(content)
        if(len(json_data["query"]["allimages"]) > 0):
            return True
        else:
            return False
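# Usage sketch (assumption: file_exists() is a method of an uploader/sync object
# that also provides hash_file(); the object and path below are illustrative).
#
# if not uploader.file_exists("/tmp/photo.jpg"):
#     # no Commons entry with this SHA1, so the file is safe to upload
#     pass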
def run(self):
    url = self.BASE_URL + self.SeasonId + self.BASE_URL_PART_3 + str(self.PageNumber) + self.BASE_URL_PART_5
    d = Download(url)
    if d.doRequest():
        # fail
        print 'ERROR: ' + self.SeasonId + '-' + str(self.PageNumber)
    else:
        utfstr2file(d.getSOURCE(), './data/' + self.SeasonId + '-' + str(self.PageNumber) + '.raw')
    return url
def getStats(self):
    d = Download(self.API)
    if d.doRequest():
        return False
    res = []
    j = loads(d.getSOURCE())
    for item in j['resultSets'][1]['rowSet']:
        res.append(item[1:])
    if len(res) == 0:
        return False
    else:
        return res
def run(self):
    while True:
        print 'INFO: ........................................ START'
        stats = self.dbm.getStats()
        print 'INFO: deadLinks-', stats[0], ' unvisitedLinks-', stats[1], ' visitedLinks-', stats[2]

        # get an url from unvisitedLinks
        url = self.dbm.retrieveUnvisitedLink()
        if url == False:
            print 'DEBUG: DONE -- retrieveUnvisitedLink return False'
            break

        print 'DEBUG: Processing ', url
        if not self.urlFilter.isPlainText(url):
            print 'DEBUG: NotPlainTextURL ', url
            continue
        if not self.domainFilter.isInDomain(url):
            print 'DEBUG: NOT IN DOMAIN ', url
            continue

        # request the url
        d = Download(url)
        if d.doRequest() == 1:
            if not self.dbm.createDeadLink(url):
                print 'DEBUG: deadLinks already contain ', url
            else:
                print 'DEBUG: Add To deadLinks ', url
        else:
            if self.dbm.createVisitedLink(url):
                print 'DEBUG: Add To visitedLinks ', url
            else:
                print 'DEBUG: Failed Add To visitedLinks ', url

            # extract urls from the source
            u = URLExtractor(d.getSOURCE(), url)
            tmpUrls = u.getUrls()
            if tmpUrls:
                for url in tmpUrls:
                    if self.dbm.isInDeadLink(url):
                        continue
                    elif self.dbm.isInVisitedLink(url):
                        continue
                    elif self.dbm.isInUnvisitedLink(url):
                        continue
                    else:
                        print 'DEBUG: Add To unvisitedLink ', url
                        self.dbm.createUnvisitedLink(url)

        print 'INFO: ........................................ END'
class DatasetManagement:

    def __init__(self):
        """
        Class that manages copying the data from a layout such as
            /dataset/class1/
            /dataset/class1/test
        to
            /train/class1
            /test/class2
        """
        # directory where the dataset is stored
        self.data_dir = "/Users/Eric/Desktop/eric/Programmazione/python/DeepLearning/data/knifey-spoony"
        # URL from which the dataset can be downloaded
        self.data_url = "https://github.com/Hvass-Labs/knifey-spoony/raw/master/knifey-spoony.tar.gz"
        # path of the train folder
        self.train_dir = os.path.join(self.data_dir, "train/")
        # path of the test folder
        self.test_dir = os.path.join(self.data_dir, "test/")
        # image size
        self.image_size = 200
        # image channels
        self.num_channels = 3
        self.img_shape = [self.image_size, self.image_size, self.num_channels]
        self.img_size_flat = self.image_size * self.image_size * self.num_channels
        # number of classes in the dataset
        self.num_classes = 3
        self.download = Download()

    def load(self):
        pass

    def execute(self):
        # handle loading the dataset from the internet or from disk:
        # download the dataset from the internet if it is not already present
        self.download.maybe_downlaod_and_extract(url=self.data_url, download_dir=self.data_dir)
        # create the dataset instance
        cache_path = os.path.join(self.data_dir, "knifey-spoony.pkl")
        self.dataset = load_cached(cache_path=cache_path, in_dir=self.data_dir)
        # split the data into train and test per class, ready to be processed
        self.dataset.copy_files(train_dir=self.train_dir, test_dir=self.test_dir)
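# Usage sketch for the class above; it assumes the module-level helpers it relies
# on (os, Download, load_cached) are imported in this file.
dm = DatasetManagement()
dm.execute()  # download/extract the archive if missing, then copy files into train/ and test/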
def main():
    # print(sys.argv[0])
    # print(type(sys.argv[1]))
    print('\t\t###########\n\t\t# WELCOME #\n\t\t###########\n')
    lat1 = float(sys.argv[1])
    lon1 = float(sys.argv[2])
    lat2 = float(sys.argv[3])
    lon2 = float(sys.argv[4])
    print('\tStart coordinate (%f, %f)' % (lat1, lon1))
    print('\tEnd coordinate (%f, %f)' % (lat2, lon2))

    print('\tStart searching ...\n')
    sc = Search(lat1, lat2, lon1, lon2)
    sc.searchLevels()
    picl = sc.qkll[-1]
    lod = len(sc.qkll)
    print('\tSearching complete ... \n')

    dl = Download()
    tl = list()
    print('\tDownloading images ...\n')
    if not os.path.exists('./temp/'):
        os.makedirs('./temp/')
    for qk in picl:
        dl.getUrlImage(qk)
        tl.append(Tile(qk))
    print('\tDownloading complete ...\n')

    ts = TileSystem()
    pX1, pY1 = ts.latLongToPixelXY(sc.MinLatitude, sc.MinLongitude, lod)
    pX2, pY2 = ts.latLongToPixelXY(sc.MaxLatitude, sc.MaxLongitude, lod)

    print('\tStart merging ...\n')
    mg = Imerge(pX1, pX2, pY1, pY2, lod)
    for t in tl:
        mg.fillIm(t)
    print('\tMerging complete ...\n')

    fname = input('\tPlease give a name to the Image.\n\t\t')
    mg.saveFig(fname)
    f = open(fname, 'w')
    f.write('Start coordinate\n \t(%f, %f)\nEnd coordinate\n \t(%f, %f)' % (lat1, lon1, lat2, lon2))

    if 'y' == input('\tRemove caches? y?\n\t\t'):
        filelist = [f for f in os.listdir('./temp/')]
        for f in filelist:
            os.remove(os.path.join('./temp/', f))

    print('\t\t##########\n\t\t# DONE #\n\t\t##########\n')
def worker(appids, isbns, appidsCycle):
    # appidsCycle = cycle(appids)
    for isbn in isbns:
        url = 'http://' + appidsCycle.next() + '.appspot.com/url?url=' + 'http://book.douban.com/isbn/' + str(isbn)
        # print 'DEBUG: ', url
        d = Download(url)
        if d.doRequest():
            print isbn, 'network error'
            continue
        j = json.loads(d.getSOURCE())
        print isbn, j['status_code']
    return
def main():
    ipList = getIps(config.IP_FILE)
    top = {}
    for ip in ipList:
        print ip
        start = int(round(time.time()))
        obj = Download("http://" + ip)
        if not obj.doRequest():
            end = int(round(time.time()))
            top[ip] = end - start

    tmp = [(v, k) for k, v in top.iteritems()]
    topList = []
    for item in sorted(tmp):
        topList.append(item[1])

    lst2File(topList, config.TOP_IP_FILE)
def run(self, processName='MainProcess'):
    for isbn in self.ISBNS:
        url = 'http://www.amazon.cn/s/ref=nb_sb_noss?field-keywords=' + isbn
        d = Download(url)
        if d.doRequest():
            print 'ERROR[' + processName + ']: ', isbn, 'NERR'
            appendstr2file(isbn, './NERR.txt')
            continue
        asin = ASINParser(d.getSOURCE())
        if asin.getAsin():
            print 'INFO[' + processName + ']: ', isbn, asin.getAsin()
            appendstr2file(isbn + ',' + asin.getAsin(), './OK.txt')
        else:
            print 'WARN[' + processName + ']: ', isbn, 'NOER'
            appendstr2file(isbn, './NOER.txt')
def find_urls(self):
    """
    Finds the download URLs with different qualities and saves them.
    """
    down = Download(self.url, as_var=True)
    if(down.perform()):
        result = down.get_result()
        soup = BeautifulSoup(result.getvalue())
        download_links = soup.find_all("a", {"class": "DownloadLink"})
        if(download_links):
            self.lq_url = download_links[0]["href"]
            self.hq_url = download_links[1]["href"]
        raw_link = soup.find(text="Other options available:").find_next("script").text
        m = re.search(r"href=..(.*\.\b[a-zA-Z0-9]+\b)", raw_link)
        if(m):
            self.orig_url = self.url_base + "/" + m.group(1)
def __init__(self, lat1, lat2, lon1, lon2):
    self.ts = TileSystem()
    self.dl = Download()
    self.qkll = list()
    if lat1 > lat2:
        self.MinLatitude = lat1
        self.MaxLatitude = lat2
    else:
        self.MinLatitude = lat2
        self.MaxLatitude = lat1
    if lon1 < lon2:
        self.MinLongitude = lon1
        self.MaxLongitude = lon2
    else:
        self.MinLongitude = lon2
        self.MaxLongitude = lon1
def Google_Web_Search_Helper(q, hl='en', start=0):
    Google_Web_Search_URL = 'https://www.google.com/search?'
    if not q:
        return {}
    else:
        Google_Web_Search_URL = Google_Web_Search_URL + 'q=' + q
    Google_Web_Search_URL = Google_Web_Search_URL + '&hl=' + hl
    Google_Web_Search_URL = Google_Web_Search_URL + '&start=' + str(start)
    d = Download(Google_Web_Search_URL)
    if d.doRequest():
        return {}
    else:
        g = GoogleSearchResultParser(d.getSOURCE())
        return g.getJson()
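# Usage sketch for the helper above; the query string is illustrative. An empty
# dict means the query was empty or the request/parsing failed.
results = Google_Web_Search_Helper('python download class', hl='en', start=0)
if results:
    print(results)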
def run(self, processName='MainProcess'):
    for asin in self.ASINS:
        url = 'http://www.amazon.cn/dp/' + asin
        d = Download(url)
        if d.doRequest():
            print 'ERROR[' + processName + ']: ', asin, 'NERR'
            appendstr2file(asin, './NERRBasicInfo.txt')
            continue
        b = BasicInfoParser(d.getSOURCE())
        jsonRes = b.basicInfo()
        if json.loads(jsonRes):
            print 'info[' + processName + ']: ', asin
            appendstr2file(jsonRes, './OKBasicInfo.txt')
        else:
            print 'WARN[' + processName + ']: ', asin, 'NOER'
            appendstr2file(asin, './NOERBasicInfo.txt')
def Save(email):
    """ Saves the youtube videos and handles errors """
    try:
        Download(email)
    except Exception as e:
        SendEmail(
            email['from'], email['subject'],
            'Something went wrong while downloading the ' + email['type'] +
            ' file: ' + email['url'] + '\n\nThe error was: ' + str(e))
        return False
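# Usage sketch for Save(); it assumes the `email` dicts produced elsewhere in this
# module carry at least the keys used above. The values here are illustrative.
email = {'from': 'user@example.com', 'subject': 'video request',
         'type': 'youtube', 'url': 'https://www.youtube.com/watch?v=example'}
Save(email)  # downloads the video, or mails the error back to the sender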
def parse_web(self):
    down = Download(self.url, as_var=True)
    if(down.perform()):
        result = down.get_result()
        soup = BeautifulSoup(result.getvalue())
        mission_table = soup.find(text="Missions used in the Database").find_next("table")
        mission_params = mission_table.find("tbody").find_all("tr")
        for m in mission_params:
            mission_as_list = list(m.children)
            if(len(mission_as_list) > 5):
                self.db.insert_mission(mission_as_list[0].text,
                                       mission_as_list[1].text,
                                       mission_as_list[2].text,
                                       self.parse_date(mission_as_list[3].text),
                                       self.parse_date(mission_as_list[4].text),
                                       mission_as_list[5].text)
def walker(self):
    while True:
        urls = self.dbm.retrieveUnvisitedLinks(0, 100)
        urls = self.urlFilter.getFilteredUrls(urls)
        if len(urls) == 0:
            break
        for url in urls:
            print 'INFO: Processing ', url
            d = Download(url)
            if d.doRequest() == 1:
                self.dbm.createDeadLink(url)
            else:
                self.dbm.createVisitedLink(url)
                u = URLExtractor(d.getSOURCE(), url)
                tmpUrls = u.getUrls()
                if tmpUrls:
                    self.dbm.createUnvisitedLinks(list(set(tmpUrls)))
    return True
class Search:

    def __init__(self, lat1, lat2, lon1, lon2):
        self.ts = TileSystem()
        self.dl = Download()
        self.qkll = list()
        if lat1 > lat2:
            self.MinLatitude = lat1
            self.MaxLatitude = lat2
        else:
            self.MinLatitude = lat2
            self.MaxLatitude = lat1
        if lon1 < lon2:
            self.MinLongitude = lon1
            self.MaxLongitude = lon2
        else:
            self.MinLongitude = lon2
            self.MaxLongitude = lon1

    def getTileXY(self, lat, lon, levelOfDetail):
        pX, pY = self.ts.latLongToPixelXY(lat, lon, levelOfDetail)
        tX, tY = self.ts.pixelXYToTileXY(pX, pY)
        return tX, tY

    def search1Level(self, levelOfDetail):
        tX1, tY1 = self.getTileXY(self.MinLatitude, self.MinLongitude, levelOfDetail)
        tX2, tY2 = self.getTileXY(self.MaxLatitude, self.MaxLongitude, levelOfDetail)
        print('\tStart tileXY (%d, %d)' % (tX1, tY1))
        print('\tEnd tileXY (%d, %d)' % (tX2, tY2))
        re = list()
        for i in range(tY1, tY2 + 1):
            for j in range(tX1, tX2 + 1):
                qk = self.ts.tileXYToQuadKey(j, i, levelOfDetail)
                if self.dl.getUrlResponse(qk):
                    re.append(qk)
                else:
                    return None
        return re

    def searchLevels(self):
        lod = 1
        ql = self.search1Level(lod)
        while ql:
            self.qkll.append(ql)
            lod += 1
            ql = self.search1Level(lod)
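# Usage sketch for the Search class above; the bounding-box coordinates are illustrative.
sc = Search(48.85, 48.86, 2.29, 2.30)
sc.searchLevels()      # fills sc.qkll with one quadkey list per level that returned tiles
deepest = sc.qkll[-1]  # quadkeys at the deepest available level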
def getStats(self):
    d = Download(self.API)
    if d.doRequest():
        return False

    res = []
    j = loads(d.getSOURCE())
    for item in j['resultSets'][0]['rowSet']:
        tmp = []
        name = item[3]
        pos = item[5]
        if item[6] == 'null':
            height = 'None'
        else:
            height = item[6]
        if item[7] == " ":
            weight = 'None'
        else:
            weight = item[7]
        age = item[9]
        if item[10] == 'R' or item[10] == 'None' or item[10] == None:
            exp = 0
        else:
            exp = item[10]
        tmp.append(name)
        tmp.append(pos)
        tmp.append(height)
        tmp.append(weight)
        tmp.append(age)
        tmp.append(exp)
        res.append(tmp)

    if len(res) == 0:
        return False
    else:
        return res
def process_download_file(self, file_to_process):
    with open("ProcessingFiles" + self.directory_separator + file_to_process, "r") as download_file:
        try:
            download_data = json.loads(download_file.read())
            for f in sorted(download_data["Files"]):
                self.map_download_directories(f.replace(self.remote_directory_to_sync + "/", ""))
            for f in download_data["Files"]:
                for file_to_download in download_data["Files"][f]:
                    Download(self.ftp_sync, f, file_to_download)
        except Exception as e:
            Logger("Error - Unable to download file: " + str(download_file) + ", " + str(e))
def __init__(self, user, host, no_notify, verbose, interval, workflow_id=None):
    self.host = host
    self.user = user
    self.interval = interval
    self.cromwell = Cromwell(host=host)
    self.messenger = Messenger(self.user)
    self.no_notify = no_notify
    self.verbose = verbose
    self.workflow_id = workflow_id
    if user == "*":
        self.event_subscribers = [EmailNotification(self.cromwell),
                                  SystemTestDownload(),
                                  Download(self.cromwell.host),
                                  GATKDownload()]
        engine = create_engine("sqlite:///" + config.workflow_db)
        Base.metadata.bind = engine
        DBSession = sessionmaker()
        DBSession.bind = engine
        self.session = DBSession()
def parse_web(self):
    down = Download(self.url, as_var=True, post_dict=self.post_dict)
    found_start = False
    can_add = False
    if(down.perform()):
        web_string_etree = etree.fromstring(down.get_result().getvalue())
        for element in web_string_etree.iter("script"):
            redirect_url = element.text
            redirect_url_array = redirect_url.split("\"")
            down = Download(self.base_url + redirect_url_array[1], as_var=True)
            if(down.perform()):
                string_etree = html.fromstring(down.get_result().getvalue())
                table = string_etree.xpath("//table[@id='QueryResults']")
                for element in table[0].iter("tr"):
                    list_of_elements = list(element.iter("td"))
                    if(len(list_of_elements) > 5):
                        a = list(list_of_elements[0].iter("a"))
                        if(found_start or self.no_need):
                            can_add = True
                        if(self.new_start):
                            if(self.new_start == a[0].text and not found_start):
                                found_start = True
                        if(can_add):
                            self.db.insert_image(a[0].attrib["href"], a[0].text,
                                                 self.parse_date(list_of_elements[1].text),
                                                 list_of_elements[2].text,
                                                 list_of_elements[3].text,
                                                 list_of_elements[4].text,
                                                 list_of_elements[5].text,
                                                 list_of_elements[6].text,
                                                 list_of_elements[7].text,
                                                 self.mission_id, False, False)
                            self.db.update_mission_image_id(self.mission_id, a[0].text)
                self.db.update_mission_image_id(self.mission_id, str(0))
def __init__(self):
    Download.__init__(self)
    self.re_url = re.compile("^http\:\/\/(www\.)dailymotion\.com\/video\/([a-z0-9]+?)\_")
    self.urltype = VidType.DAILYMOTION
    self.h = {}
def __init__(self):
    Download.__init__(self)
    self.urltype = VidType.VIDDLER
    self.h = {}
def find_online_category(self, term):
    result = None
    down = Download(self.base_api + urllib.quote(term), as_var=True)
    if(down.perform()):
        result = down.get_result()
    return result
#!/usr/bin/env python
# coding=utf-8
#
# Author: Archer Reilly
# Date: 11/Aug/2014
# File: PlayerInfoParserTest.py
# Description: test the PlayerInfoParser class
# Website: http://csrgxtu.blog.com/
#
# Produced By CSRGXTU
from PlayerInfoParser import PlayerInfoParser
from Download import Download

URL = "http://sports.qq.com/d/f_players/3/2890/"
player = Download(URL)
if player.doRequest() != 0:
    print "Download Can't Do Request"
else:
    print "Successfully Do Request"

playerParser = PlayerInfoParser(player.getSOURCE())
def __init__(self):
    Download.__init__(self)
    self.re_id = re.compile('\:content\:atom\.com\:(.+?)\"')
    self.urltype = VidType.ATOM
    self.h = {}
def __init__(self):
    Download.__init__(self)
    self.re_fragment = re.compile("^(t\=)?((\d+)h)?((\d+)m)?((\d+)s)?$")
    self.urltype = VidType.GOOGLEVIDEO
    self.h = {}
def __init__(self): Download.__init__(self) self.re_id = re.compile("swfobject\.embedSWF\(\"(.*?)\"") self.re_title = re.compile("\<title\>MIT TechTV \&ndash\; (.*?)\<\/title\>") self.urltype = VidType.MITTECHTV self.h = {}
# Website: http://csrgxtu.blog.com/
#
# Produced By CSRGXTU
import requests
from Download import Download
from Parser import Parser
from TeamInfoParser import TeamInfoParser

"""
page = requests.get('http://econpy.pythonanywhere.com/ex/001.html')
print page.text
parser = Parser(page.text)
#print parser.getBuyers()
"""

URL = "http://sports.qq.com/d/f_teams/1/42/"
soccer = Download(URL)
if soccer.doRequest() == 0:
    print "Successfully do request"
else:
    print "Failed do request"

html = soccer.getSOURCE()
parser = TeamInfoParser(html)

name = parser.getTeamName()
print "name:", unicode(name).encode('utf8')

name_cn = parser.getTeamNameCN()
print "name_cn:", unicode(name_cn).encode('utf8')

logo = parser.getTeamLogo()
print "logo:", logo

city = parser.getTeamCity()
print "city:", city
class Spider:
    RE_EXPONEA = re.compile("^https?://[^/]*exponea.com(/.*)?$")
    downloaded = []
    visited = []

    def __init__(self, loop, url):
        self.__url = self.__remove_trailing_slash(url)
        self.__loop = loop
        self.__downloader = Download()
        self.__scraper = Scraper()

    def __gen_file_name(self, link):
        return link.replace("/", "_")

    def __remove_trailing_slash(self, url):
        if url[-1] == "/":
            return url[:-1]
        else:
            return url

    def __is_exponea(self, url):
        return Spider.RE_EXPONEA.match(url)

    # if i used selenium this would be easier (following links),
    # but i wanted to keep it simple, so here we go:
    # (i'd use selenium or some scraping framework next time)
    @asyncio.coroutine
    def _sanitize_url(self, url, filter_exponea=False):
        if len(url) > 500:
            print("Somethings probably wrong (loop?): " + url)  # or base64
            return None
        if url.startswith("http") and (not filter_exponea or self.__is_exponea(url)):
            # refactor this bit to be generic
            return url
        if url.startswith("//") and (not filter_exponea or self.__is_exponea("http:" + url)):
            return "http:" + url
        if url.startswith("/"):
            return self.__url + url
        # todo there are some base64 encoded images rejected
        print("Sanitize: Rejected " + url)
        return None

    @asyncio.coroutine
    def run(self):
        print("Processing " + self.__url)
        Spider.visited.append(self.__url)

        # get images
        page = yield from self.__downloader.download_data_url(self.__url)
        if not page:
            return
        imglinks = yield from self.__scraper.get_image_links(page)
        for link in imglinks:
            sanitized = yield from self._sanitize_url(link)
            if sanitized and (sanitized not in Spider.downloaded):
                Spider.downloaded.append(sanitized)
                yield from self.__downloader.download_image_url(
                    sanitized, self.__gen_file_name(sanitized))

        # SPAWN!
        links = yield from self.__scraper.get_links(page)
        for link in links:
            sanitized = yield from self._sanitize_url(link, filter_exponea=True)
            if sanitized and (sanitized not in Spider.visited):
                next = Spider(self.__loop, sanitized)
                yield from next.run()
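# Usage sketch for the Spider class above; it assumes the Download and Scraper
# helpers imported by this module provide the coroutine methods used in run().
# The start URL is illustrative.
import asyncio

loop = asyncio.get_event_loop()
loop.run_until_complete(Spider(loop, "https://exponea.com").run())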
def __init__(self, loop, url):
    self.__url = self.__remove_trailing_slash(url)
    self.__loop = loop
    self.__downloader = Download()
    self.__scraper = Scraper()
def main():
    parser = argparse.ArgumentParser(
        description='Find all protein database entries of specified taxon IDs and their descendants.'
                    ' One taxID or a taxID input file must be provided. Peptide databases from NCBI or Uniprot can'
                    ' be used. User defined databases, if headers contain taxon IDs (e.g. OX=1111) or ncbi/uniprot'
                    ' accession IDs.')
    parser.add_argument('-i', '--input', dest='input', default=None,
                        help='TaxID input file: tabular file containing a column of NCBI'
                             ' taxon IDs. Columns tab separated.')
    parser.add_argument('-c', '--column', dest='column', type=positive_integer, default=0,
                        help='The column (zero-based) in the tabular '
                             'file that contains Taxon IDs. Default = 0.')
    parser.add_argument('-t', '--taxon', dest='taxon', type=positive_integer, nargs='+', action='append',
                        help='NCBI taxon ID/s for database extraction. Multiple taxon IDs separated by space.')
    parser.add_argument('-d', '--database', dest='database',
                        choices=['ncbi', 'uniprot', 'swissprot', 'trembl'], default='uniprot',
                        help='Database choice for analysis or for download. Choices: ncbi, uniprot, trembl, swissprot. '
                             'No download, if databases with original name are stored in same folder as option --path ')
    parser.add_argument('-p', '--path', dest='path', default=None,
                        help='Path to folder with all needed '
                             'databases: taxdump.tar.gz (for all databases), prot.accession2taxid or prot.accession2taxid.gz and '
                             'pdb.accession2taxid.gz (for ncbi databases). Optional: peptide_database named: nr/nr.gz, '
                             'uniprot_trembl.fasta/uniprot_trembl.fasta.gz or uniprot_sprot.fasta/uniprot_sprot.fasta.gz'
                             ' or uniprot.fasta./uniprot.fasta.gz')
    parser.add_argument('-o', '--out', dest='out', default=None,
                        help="File name and directory of the resulting taxon specified peptide database. "
                             "Default = /taxon_specified_db_DATE/taxon_specific_database.fasta")
    parser.add_argument('-n', '--dbname', dest='dbname', default=None,
                        help="Database name and directory. Use if the database is in another folder than --path or "
                             "its name deviates from the standard names.")
    parser.add_argument('-l', '--level', dest='level',
                        choices=['species', 'section', 'genus', 'tribe', 'subfamily', 'family', 'superfamily',
                                 'order', 'superorder', 'class', 'phylum', 'kingdom', 'superkingdom'],
                        default=None,
                        help='Hierarchy level up in the ancestral tree. Choices: species, section, genus, tribe, '
                             'subfamily, family, superfamily, order, superorder, class, phylum, kingdom, superkingdom')
    parser.add_argument('-z', '--no_descendants', dest='no_descendants', action='store_true', default=False,
                        help='Select peptide database only by given taxon IDs, descendant taxons are excluded.')
    parser.add_argument('-s', '--species', dest='species', action='store_true', default=False,
                        help='Select peptide database only down to taxonomic level "species", descendants of species'
                             ' are excluded.')
    parser.add_argument('-r', '--non_redundant', dest='non_redundant', action='store_true', default=False,
                        help='Makes the final database non redundant in regard to sequences, headers are concatenated.')
    parser.add_argument('-u', '--threads', dest='threads', type=positive_integer, action="store",
                        help='Number of threads for using multiprocessing. Default = number of cores.')
    parser.add_argument('-x', '--reduce_header', dest='reduce_header', action='store_true', default=False,
                        help='Reduce the long headers of NCBI entries to accession IDs. Use only for NCBI databases.')
    parser.add_argument('--version', action='version', version=('version ' + __version__))
    parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', default=False,
                        help='Verbose shows details about program progress and more information.')
    options = parser.parse_args()

    # URL addresses for download:
    url_protaccession2taxID = 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz'
    url_protaccession2taxID_md5 = 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz.md5'
    url_pdbaccession2taxID = 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/pdb.accession2taxid.gz'
    url_pdbaccession2taxID_md5 = 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/pdb.accession2taxid.gz.md5'
    url_taxdump = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'
    url_taxdump_md5 = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz.md5'
    url_database_ncbi = 'ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz'
    url_database_md5_ncbi = 'ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz.md5'
    url_database_swissprot = 'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz'
    url_database_trembl = 'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz'
    url_uniprot_metadata = 'ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/RELEASE.metalink'
    db_dict_name = {
        'ncbi': url_database_ncbi.split('/')[-1],
        'uniprot': 'uniprot.fasta.gz',
        'swissprot': url_database_swissprot.split('/')[-1],
        'trembl': url_database_trembl.split('/')[-1]
    }

    # if option --out is not set, a new folder (taxon_database plus date) is created for the result database and log file
    if options.out:
        output_path = Path.cwd() / options.out
    else:
        output_path = Output.createDir(Path.cwd())
    logger = initialize_logger(output_path, options.verbose)
    for arg, value in sorted(vars(options).items()):
        logger.debug("Argument %s: %r", arg, value)
    logger.debug("Result database and log file are saved in directory %s" % output_path)

    # set path_to_db and database_folder for all user input variants
    # if options.path is specified: folder with all databases (can be without protein DB if options.dbname)
    # if it does not exist, create a folder with the user defined name from option --path
    skip_check = False
    if options.path:
        database_folder = Path.cwd() / options.path
        path_to_db = database_folder / db_dict_name[options.database]
    # if no path option is entered, try to open the config file and read the path to the database folder;
    # without a config file, a new database folder is created
    else:
        try:
            path_to_main = Path(__file__, '..').resolve()
            with open(str(path_to_main) + "/tax2proteome.config", 'r') as config:
                database_folder = Path(config.readline().strip())
                path_to_db = database_folder / db_dict_name[options.database]
        except FileNotFoundError:
            database_folder = Path.cwd() / ('databases_' + str(date.today()))
            path_to_db = database_folder / db_dict_name[options.database]
            try:
                database_folder.mkdir()
                prot_gz_b = prot_b = pdb_b = taxdump_b = db_gz_b = db_b = False
                skip_check = True
                logger.info("Downloaded databases are saved in directory %s" % database_folder)
            except FileExistsError:
                logger.debug("Database folder %s already exists. Checking for content." % database_folder)
            except OSError:
                logger.exception("No permission to create new database folder.", exc_info=True)
                exit(1)
    if not database_folder.exists():
        try:
            database_folder.mkdir()
            logger.info("New folder %s created. All needed database files will be downloaded and stored in this "
                        "directory." % database_folder)
            prot_gz_b = prot_b = pdb_b = taxdump_b = db_gz_b = db_b = False
            skip_check = True
        except OSError:
            logger.exception("Database folder %s does not exist and can not be created." % database_folder,
                             exc_info=True)
            exit(1)

    # user given path to database
    # the given database path is checked; if it does not exist the program quits. Check if DB is in uniprot or ncbi format
    if options.dbname:
        path_to_db = Path.cwd() / options.dbname
        db_b = Output.check_files_exist([path_to_db])[0]
        if not db_b:
            logger.error("Given database %s does not exist. Enter correct path under option --dbname. Program quits."
                         % path_to_db)
            exit(1)
        if not TestFile.test_uniprot(options.dbname):
            options.database = 'ncbi'

    # check database folder for content
    # check if all needed files are in the database folder: bool values _b: True = file exists and is not downloaded again
    if not skip_check:
        taxdump_b, prot_gz_b, prot_b, pdb_b, db_gz_b, db_b = Output.check_files_exist([
            database_folder / url_taxdump.split('/')[-1],
            database_folder / url_protaccession2taxID.split('/')[-1],
            database_folder / 'prot.accession2taxid',
            database_folder / url_pdbaccession2taxID.split('/')[-1],
            path_to_db,
            path_to_db.parents[0] / path_to_db.stem
        ])
        if db_b:
            path_to_db = path_to_db.parents[0] / path_to_db.stem
        if not taxdump_b:
            logger.warning("File taxdump.tar.gz does not exist under the path %s and will be downloaded."
                           % str(database_folder))
        if not pdb_b and options.database == 'ncbi':
            logger.warning("File pdb.accession2taxid.gz does not exist under the path %s and will be downloaded."
                           % str(database_folder))
        if not prot_gz_b and not prot_b and options.database == 'ncbi':
            logger.warning("File prot.accession2taxid.gz does not exist under the path %s and will be downloaded."
                           % str(database_folder))
        if options.dbname is None and not db_b and not db_gz_b:
            logger.warning("Database file %s does not exist under the path %s and will be downloaded."
                           % (db_dict_name[options.database], str(database_folder)))

    # download taxdump file (best at the same day)
    if not taxdump_b:
        taxdump_md5 = read_ncbi_hash(url_taxdump_md5, logger)
        dwl_taxdb = Download(url_taxdump, database_folder / url_taxdump.split('/')[-1], taxdump_md5)
        dwl_taxdb.download()
        logger.debug('End download of taxdump.tar.gz')
    # download prot.accession2taxid.gz (only for ncbi) and check md5 hash
    if not prot_gz_b and not prot_b and options.database == 'ncbi':
        md5_hash = read_ncbi_hash(url_protaccession2taxID_md5, logger)
        dwl_protaccession = Download(url_protaccession2taxID,
                                     database_folder / url_protaccession2taxID.split('/')[-1], md5=md5_hash)
        dwl_protaccession.download()
        logger.debug('End download from %s to location %s.'
                     % (url_protaccession2taxID, str(database_folder / url_protaccession2taxID.split('/')[-1])))
    # download pdb.accession2taxid.gz (only for ncbi) and check md5 hash
    if not pdb_b and options.database == 'ncbi':
        md5_hash = read_ncbi_hash(url_pdbaccession2taxID_md5, logger)
        dwl_pdbaccession = Download(url_pdbaccession2taxID,
                                    database_folder / url_pdbaccession2taxID.split('/')[-1], md5=md5_hash)
        dwl_pdbaccession.download()
        logger.debug('End download from %s to location %s.'
                     % (url_pdbaccession2taxID, str(database_folder / url_pdbaccession2taxID.split('/')[-1])))
    # download peptide database and check md5 hash
    if not db_b and not db_gz_b:
        if options.database == 'ncbi':
            database_version_ncbi = 'ncbi ' + str(date)
            md5_hash = read_ncbi_hash(url_database_md5_ncbi, logger)
            dwl_db = Download(url_database_ncbi, database_folder / db_dict_name['ncbi'], md5=md5_hash)
            dwl_db.download()
            logger.debug("Database version: %s" % database_version_ncbi)
            path_to_db = database_folder / db_dict_name['ncbi']
        else:
            if options.database == 'swissprot' or options.database == 'uniprot':
                database_version_swissprot, hash_swissprot = read_uniprot_metadata(
                    url_uniprot_metadata, db_dict_name['swissprot'], logger)
                logger.debug("Database version swissprot: %s " % database_version_swissprot)
                dwl_db_swiss = Download(url_database_swissprot, database_folder / db_dict_name['swissprot'],
                                        md5=hash_swissprot)
                dwl_db_swiss.download()
                path_to_db = database_folder / db_dict_name['swissprot']
            if options.database == 'trembl' or options.database == 'uniprot':
                database_version_trembl, hash_trembl = read_uniprot_metadata(
                    url_uniprot_metadata, db_dict_name['trembl'], logger)
                logger.debug("Database version trembl: %s." % database_version_trembl)
                dwl_db_trembl = Download(url_database_trembl, database_folder / db_dict_name['trembl'],
                                         md5=hash_trembl)
                dwl_db_trembl.download()
                path_to_db = database_folder / db_dict_name['trembl']
            # concatenate swissprot and trembl to one uniprot file
            if options.database == 'uniprot':
                try:
                    logger.debug("Concatenate swissprot and trembl to uniprot database with name uniprot.fasta")
                    with open(str(database_folder / db_dict_name['trembl']), 'ab') as trembl:
                        with open(str(database_folder / db_dict_name['swissprot']), 'rb') as swissprot:
                            shutil.copyfileobj(swissprot, trembl)
                    # rename trembl to uniprot:
                    Path(database_folder / db_dict_name['trembl']).rename(database_folder / db_dict_name['uniprot'])
                    logger.debug("Uniprot database is now ready.")
                    path_to_db = database_folder / db_dict_name['uniprot']
                except FileNotFoundError:
                    logger.exception("Creation of uniprot database file out of trembl and swissprot file failed.",
                                     exc_info=True)
                    exit(1)

    # create config file
    try:
        path_to_main = Path(__file__, '..').resolve()
        with open(str(path_to_main / "tax2proteome.config"), 'w') as config:
            config.write(str(database_folder) + '\n')
    except OSError:
        logger.debug('Can not create config file')

    # Read taxIDs from option -t and option -i
    if options.taxon:
        taxIDs = set([taxID for taxonlist in options.taxon for taxID in taxonlist])
    else:
        taxIDs = set()
    if options.input:
        try:
            with open(options.input, 'r') as inputFile:
                for i, line in enumerate(inputFile):
                    fields = line.rstrip('\r\n').split('\t')
                    if len(fields) >= abs(options.column):
                        taxID = fields[options.column].strip()
                        if taxID.isdigit():
                            taxIDs.add(int(taxID))
                        else:
                            logger.error('Value %s in line %i of taxon input file is not a number. '
                                         'Right column number specified?' % (taxID, i))
                            continue
                    else:
                        logger.error('Column number is bigger than the number of columns in the taxon ID input file. '
                                     'Program continues without taxon IDs from input file.')
        except FileNotFoundError:
            logger.exception('Taxon ID input file does not exist under specified path.', exc_info=True)
    if not taxIDs:
        logger.error('No taxon ID given. Please check your input. Program quits.')
        raise Exception('No taxon IDs.')
        exit(1)
    logger.debug('Given Tax-IDs: %s' % ' '.join(str(it) for it in taxIDs))

    # Try to load a pre-built taxonomy graph or build the taxonomy graph now
    if not (database_folder / 'taxon_graph').is_file():
        taxon_graph = TaxonGraph()
        logger.debug("Start building taxon graph.")
        taxon_graph.create_graph(database_folder / url_taxdump.split('/')[-1])
        logger.debug("Taxon graph successfully built.")
        # save TaxonGraph to hard drive:
        with open(str(database_folder / 'taxon_graph'), 'wb') as handle:
            pickle.dump(taxon_graph, handle, protocol=pickle.HIGHEST_PROTOCOL)
        logger.debug('Saved taxon graph to location: %s' % str(database_folder / 'taxon_graph'))
    # load Taxon Graph
    else:
        try:
            logger.debug('Load taxon graph.')
            with open(str(database_folder / 'taxon_graph'), 'rb') as handle:
                taxon_graph = pickle.load(handle)
        except (UnicodeDecodeError, EOFError):
            logger.exception("Failed opening path to taxon graph / taxon_graph is corrupted. Delete %s file."
                             % str(database_folder / 'taxon_graph'))
            exit(1)

    # adjust the hierarchy level; if the level does not exist, take the next smaller level
    if options.level:
        logger.debug("Start selection of next ancestor of level %s for all given taxIDs" % options.level)
        taxIDs = {taxon_graph.find_level_up(taxID, options.level) for taxID in taxIDs}
        logger.info("All taxon IDs are set up to level %s in the ancestral tree. Taxon IDs of level %s: %s"
                    % (options.level, options.level, ' '.join(str(it) for it in taxIDs)))

    final_taxIDs = set()
    # find all descendants
    if not options.no_descendants:
        logger.debug("Start searching for all child taxon IDs.")
        for taxID in taxIDs:
            final_taxIDs.update(taxon_graph.find_taxIDs(taxID, options.species))
        logger.debug("End searching for all child taxon IDs.")
        logger.debug('Number of final taxon IDs: %s' % str(len(final_taxIDs)))
    else:
        final_taxIDs = taxIDs
        logger.debug('Number of taxon IDs for database search: %s' % str(len(final_taxIDs)))

    # generate accession_taxID dict for ncbi db search and write custom specified db to --out
    with_taxon_ID = TestFile.test_uniprot(path_to_db)
    if not with_taxon_ID:
        accession = Accession(final_taxIDs)
        logger.debug('Read accession files.')
        if prot_b:
            accession.read_accessions(database_folder / 'prot.accession2taxid',
                                      database_folder / url_pdbaccession2taxID.split('/')[-1], options.threads)
        else:
            accession.read_accessions(database_folder / url_protaccession2taxID.split('/')[-1],
                                      database_folder / url_pdbaccession2taxID.split('/')[-1], options.threads)
        logger.debug('All accession IDs collected.')
        logger.info('Start writing taxon selected peptide database to %s.' % output_path)
        wc = WriteCustomDB(path_to_db, output_path)
        wc.read_database(False, gzipped=TestFile.test_gzipped(path_to_db), accessions=accession.accessionIDs,
                         threads=options.threads)
        logger.debug('End writing taxon selected peptide database.')
        # non redundant database
    # uniprot: write custom specified db to --out
    else:
        logger.info('Start writing taxon selected peptide database to %s.' % output_path)
        wc = WriteCustomDB(path_to_db, output_path, final_taxIDs)
        wc.read_database(True, threads=options.threads, gzipped=TestFile.test_gzipped(path_to_db))
        logger.debug('End writing taxon selected peptide database.')

    # non redundant database
    if options.non_redundant:
        DatabaseCleaner.non_redundant(output_path, with_taxon_ID)
        # remove redundant database:
        output_path.unlink()
    if options.reduce_header and not with_taxon_ID:
        # reduce headers of NCBI database
        DatabaseCleaner.reduce_header(output_path)
        output_path.unlink()
    logger.info('Program finished.')
    exit(0)
def doRequest(self, url):
    d = Download(url)
    if d.doRequest() == None:
        return None
    else:
        return d.getSOURCE()
def __init__(self):
    Download.__init__(self)
    self.h = {}
def __init__(self, phenny):
    Download.__init__(self)
    self.urltype = None  # 97 - TWITCHTV
    self.type = None
    self.consumerkey = phenny.config.justintvkey
    self.h = {}