def fech_pages(mylist, myqueue, tag=False, que1=None):
    # print('running: fech_pages')
    for num_i in mylist:
        # print('pushing the page for this url onto the queue: ' + num_i)
        if tag:
            while myqueue.full():
                time.sleep(2)
            url_page = downloader.downloader(
                num_i,
                proxies=get_random_proxies(),
            )
            if url_page is False:
                pass
            else:
                myqueue.put(url_page)
                que1.put(num_i)
        else:
            while myqueue.full():
                time.sleep(10)
            url_page = downloader.downloader(
                num_i,
                proxies=get_random_proxies(),
            )
            if url_page is False:
                pass
            else:
                myqueue.put(url_page)
    return True
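# --- Minimal driver sketch for fech_pages (not part of the original project) ---
# It assumes only what the snippet above already uses: a bounded queue that the
# producer blocks on, plus the downloader.downloader / get_random_proxies helpers.
# The URL list, queue sizes, and timeout below are illustrative placeholders.
import queue
import threading

page_queue = queue.Queue(maxsize=20)  # downloaded pages, filled by fech_pages
url_queue = queue.Queue(maxsize=20)   # the URL that produced each page (tag=True mode)

urls = ['http://example.com/list/1', 'http://example.com/list/2']  # placeholder URLs

producer = threading.Thread(target=fech_pages,
                            args=(urls, page_queue, True, url_queue))
producer.start()

while producer.is_alive() or not page_queue.empty():
    try:
        page = page_queue.get(timeout=5)  # block briefly, then re-check the producer
    except queue.Empty:
        continue
    source_url = url_queue.get()
    # ... parse `page` / `source_url` here ...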
def fech_anjuke_old():
    for url in generate_mainurl():
        print(url)
        mainpage_txt = downloader.downloader(url, proxies=get_random_proxies())
        houseurls = spider_anjuke.get_houseurls_old(mainpage_txt)
        infos = pd.DataFrame()
        for houseurl in houseurls:
            housepage_txt = downloader.downloader(houseurl, proxies=get_random_proxies())
            if housepage_txt is False:
                pass
            else:
                info = items_anjuke.get_items_old(housepage_txt, houseurl)
                if info is False:
                    pass
                else:
                    info = pd.DataFrame([info])
                    infos = infos.append(info, ignore_index=True)
        piplines.write_csv(
            data=infos,
            city='shijiazhuang',
            web='anjuke',
        )
        infos = pd.DataFrame()
def main():
    print(
        "This is vk messages dumper. It can:\n1) Dump all messages to json\n2) Download all attachments from dumped "
        "json\n3) Parse json to html (or txt)\n4) Real time dumping with longpoll\n0) exit\nChoose what you want to "
        "do:")
    try:
        choice = getch()
    except termios.error:
        print("Your terminal doesn't support RT input!", file=sys.stderr)
        choice = input()
    if choice == '0':
        exit(0)
    if choice == '1':
        from jsondump import main as dumper
        dumper()
    elif choice == '2':
        from downloader import main as downloader
        downloader()
    elif choice == '3':
        from htmlCreator import main as parser
        parser()
    elif choice == '4':
        from longpoll import main as rt_dumper
        rt_dumper()
    else:
        print("There is no such option!", file=sys.stderr)
        exit(1)
def radiosobterurlstream(name, url):
    #GA("None","Radio - " + name)
    mensagemprogresso.create('TV Portuguesa', 'A carregar...')
    mensagemprogresso.update(0)
    if re.search('www.radios.pt', url):
        link = abrir_url(url)
        try:
            endereco = re.compile('<param name="url" value="(.+?)"').findall(link)[0]
        except:
            xbmc.executebuiltin("XBMC.Notification(Fightnight Music,Não é possível ouvir esta rádio.,'500000',)")
            return
        idradio = url.replace('http://www.radios.pt/portalradio/Sintonizador/?radio_id=', '').replace('&scope=0', '')
        thumbnail = 'http://www.radio.com.pt/APR.ROLI.WEB/Images/Logos/' + idradio + '.gif'
    else:
        urlfinal = 'http://www.radioonline.com.pt/ajax/player.php?clear_s_name=' + url
        link = clean(abrir_url(urlfinal))
        try:
            player = re.compile('soundManager.createSound\({(.+?)autoLoad').findall(link)[0]
        except:
            player = False
        try:
            endereco = re.compile('url: "(.+?)"').findall(player)[0].replace(';', '')
            if re.search('serverURL', player):
                rtmp = re.compile('serverURL: "(.+?)"').findall(player)[0]
                #rtmp=rtmp.replace('rtmp://195.23.102.206','rtmp://195.23.102.209')  # temp fix
                rtmp = rtmp.replace(':1936', '')  # temp fix
                endereco = rtmp + ' playPath=' + endereco
        except:
            endereco = False
        if not endereco:
            try:
                endereco = re.compile('<param name="URL" value="(.+?)"').findall(link)[0]
            except:
                try:
                    endereco = re.compile('<object data="(.+?)"').findall(link)[0]
                except:
                    endereco = False
        if not endereco:
            xbmc.executebuiltin("XBMC.Notification(TV Portuguesa,Não é possível ouvir esta rádio.,'500000',)")
            mensagemprogresso.close()
            return
        try:
            thumbnail = re.compile('<img id="station-logo-player" src="(.+?)"').findall(link)[0]
        except:
            thumbnail = ''
    if re.search('.asx', endereco):
        nomeasx = 'stream.asx'
        path = xbmc.translatePath(os.path.join(pastaperfil))
        lib = os.path.join(path, nomeasx)
        downloader(endereco, lib)
        texto = openfile(nomeasx)
        endereco = xbmc.PlayList(1)
        endereco.clear()
        streams = re.compile('<ref.+?"(.+?)"/>').findall(texto)
        for musica in streams:
            listitem = xbmcgui.ListItem(name, iconImage="DefaultVideo.png", thumbnailImage=thumbnail)
            listitem.setInfo("music", {"Title": name})
            endereco.add(musica, listitem)
    else:
        pass
    mensagemprogresso.close()
    listitem = xbmcgui.ListItem(name, iconImage="DefaultVideo.png", thumbnailImage=thumbnail)
    listitem.setInfo("music", {"Title": name})
    xbmc.Player().play(endereco, listitem)
def main(audio_dir, json_dir, download_hours_ahead, audio_hours_retain, json_hours_retain):
    scraper.scraper(json_dir)
    logging.info("Scraper complete")
    sh.cleanup(audio_dir, audio_hours_retain)
    logging.info("Audio cleanup complete")
    sh.cleanup(json_dir, json_hours_retain)
    logging.info("JSON cleanup complete")
    downloader.downloader(download_hours_ahead, audio_dir, json_dir)
    logging.info("Downloader complete")
def download(comic_name, page_url):
    start = 0
    html = htmlthrower.gather_download_links(page_url)
    parsed_html = ImageFinder()
    parsed_html.feed(html)
    download_links = parsed_html.__dict__
    print(download_links['image_links'])
    for pages in download_links['image_links']:
        print('downloading ...' + str(start + 1))
        start = start + 1
        downloader.downloader(pages, comic_name + '\\' + (str(start) + '.jpg'))
def download_test():
    start = time.clock()
    dl = downloader()
    dl.config(max_thread=5, verify=False)
    # dl.add_url('http://xiazai.xiazaiba.com/Soft/W/WeChatSetup_2.6.4.1000_XiaZaiBa.zip')
    # dl.add_url('http://dblt.xiazaiba.com/Soft/W/WeChatSetup_2.6.4.1000_XiaZaiBa.zip')
    dl.add_url(
        'http://xiazai.xiazaiba.com/Soft/M/MarvelousDesigner7_Personal_4_1_100_XiaZaiBa.zip'
    )
    dl.add_url(
        'http://dblt.xiazaiba.com/Soft/M/MarvelousDesigner7_Personal_4_1_100_XiaZaiBa.zip'
    )
    # dl.add_url('https://download.virtualbox.org/virtualbox/5.2.18/VirtualBox-5.2.18-124319-Win.exe')
    opener = dl.open()
    # dl.file.validate_name()
    # opener = dl.load('', dl.file.name)
    # if opener.server_validate() is True:
    opener.start()
    while True:
        print 'online:%d, %d/%d [%f kb/s]' % (
            opener.GlobalProg.getOnlineQuantity(),
            opener.file.size - opener.getLeft(), opener.file.size,
            opener.getinsSpeed() / 1024)
        time.sleep(1)
        if opener.isDone():
            print '-----------------------------'
            print 'Total time: %f s, average speed: %f kb/s' % (
                time.clock() - start, opener.getavgSpeed() / 1024)
            print 'done!'
            break
def __init__(self, jobs, threads, play_url, dry_run):
    self.jobs = jobs
    self.threads = threads
    self.dl = downloader.downloader()
    self.dry_run = dry_run
    self.play_url = play_url
    self.output_filename = None
def fech_anjuke_old(url):
    mainpage_txt = downloader.downloader(url)
    pre_mainurls_list = []
    end_mainurls_list = []
    now_main_url = spider_anjuke.get_mainurls_old(mainpage_txt)
def __init__(self):
    self.page = 0
    self.cur_url = r"https://movie.douban.com/top250?start={page}&filter=&type="
    self.URL_manager = url_manager.url_manager()
    self.Downloader = downloader.downloader()
    self.Outputer = outputer.outputer()
    self.Parser = html_parser.html_parser()
def save(img_urls, dir_path):
    for img_url in img_urls:
        picture = downloader(img_url).content
        picture_name = img_url.split('/')[-1]
        with open(dir_path + '/' + picture_name, 'wb') as f:
            f.write(picture)
        time.sleep(2)
def helper(client, sub_servers, C_lib):
    sql = sq.SQL()
    cursor = sql.getCursor()
    query = client["socket"].recv(1024)
    while query != '4':
        if query == '2':
            uploader(sql, client, sub_servers, C_lib)
        elif query == '1':
            downloader(sql, client, sub_servers, C_lib)
        elif query == '3':
            delete(sql, client, sub_servers, C_lib)
        sleep(1)
        print "Waiting for next command .."
        query = client["socket"].recv(1024)
        print 'Here is query ..', query
    client["socket"].close()
    print "End"
def looper():
    latest = 'go'
    clipboard.copy('go')
    while clipboard.paste() != 'Stop':
        if clipboard.paste() != latest:
            latest = clipboard.paste()
            logger.info('Latest clipboard item')
            logger.info(f'clipped: {latest}')
            if isValidURL(latest):
                logger.info('Valid url')
                dl.downloader(dl.ydl_opts, latest)
            else:
                logger.debug('Not a valid url')
        time.sleep(5)
def process_queue():
    try:
        picture_id = crawl_queue.pop()
    except IndexError:
        print 'No pictures!'
        return  # nothing left to process
    picture_html = downloader(url + picture_id.encode('utf8')).text
    img_urls = html_parse(picture_html)
    save(img_urls, picture_dir)
def getSinglePlayerInfo(id):
    timestamp = int(round(time.time() * 1000))
    html_cont = downloader.downloader(
        'http://ziliaoku.sports.qq.com/cube/index?callback=getSinglePlayerInfo&cubeId=8&dimId=5&params=t1:%s&from=sportsdatabase&_:%d'
        % (id, timestamp))
    con = re.match(r'^getSinglePlayerInfo\((.*)\)$', html_cont)
    loadJson = json.loads(con.group(1))
    return loadJson['data']['playerBaseInfo']
def __init__(self, config):
    if not self.getlock():
        exit()
    self.config = config
    self.headers = {'User-Agent': self.config['Headers']['headers']}
    self.connector = connector(config)
    self.downloader = downloader(self.headers,
                                 self.connector,
                                 dataDir=self.config['Main']['Data'],
                                 numProcs=4)
def beginDownload(self):
    username = self.LoginField.get()
    password = self.PasswordField.get()
    module = self.ModuleField.get()
    folder = self.FolderField.get()
    DLR = downloader.downloader('GUI')
    _thread.start_new_thread(
        DLR.beginDLR,
        (username, password, module, folder, self.updateStatus))
def start(self):
    down = downloader(self._sitename, self._siteurl)
    if down.download():
        parse = parser(down._save_location(), down._data_is(), self._siteurl)
        if parse.parse():
            parse_links = parse.links_are()
            parse_links = self.check_rel(parse_links, "http://")
            self.remove_duplicate(1, parse_links)
            parse_img = parse.images_are()
            self.remove_duplicate(2, parse_img)
def downloadFic():
    inLink = request.forms.get('ficURL')
    fileFormat = int(request.forms.get('ficFormat'))
    if inLink.count('.') < 1:
        abort(400, "")
    # check that the link doesn't point to a local path
    if inLink.startswith('/'):  # better safe than sorry
        abort(400, "")
    # find and create the controller that will extract only the fiction text
    # (without any header, footer and so on) from the site
    c = getControler(inLink)
    # downloader that will download each chapter; it uses the controller from above to clean the text
    d = downloader(c)
    e = eBookCreater(d, fileFormat)  # creates the ebook
    # timer to start the download and conversion; it runs in a thread to allow
    # reporting the status of the download to the browser
    threading.Timer(0.01, getFanFic, (d, e)).start()
    # give the downloader a few seconds to get the details
    while d.firstChapter:  # wait until the first page is ready
        # do not use pass, because the cpu would be hogged and the flag for the first
        # chapter would never be seen to change; with a wait it works
        time.sleep(0.1)
    yield d.storyName + " - " + d.autor + ";;;"
    maxChapter = len(d.tableOfContent)
    yield str(maxChapter) + ";;;"
    while d.done:  # wait until the book is downloaded, reporting how many chapters are done
        yield str(d.chapterNumber) + ";;;"
        time.sleep(1)  # wait 1 second just not to spam the client
    if not d.error:  # in case there is an error
        abort(400, "")
    while e.done:  # wait until the book is created, same as above
        yield "Creating e-book file;;;-1;;;"
        time.sleep(1)
    if not e.error:
        abort(400, "")
    # create a timer to delete the e-book file after 10 minutes (600 seconds)
    threading.Timer(600, cleanUP, (d.folder, 0)).start()
    yield str(d.folder)  # return the id for downloading the file
def getTeamList():
    html_cont = downloader.downloader(
        'http://matchweb.sports.qq.com/team/list?columnId=100000&competitionId=100000&callback=getTeamList')
    jsonFile = open('json/teamList.json', 'w')
    con = re.match(r'^getTeamList\((.*)\)$', html_cont)
    loadJson = json.loads(con.group(1))
    jsonCon = json.dumps(
        loadJson['data'], sort_keys=False, indent=4, ensure_ascii=False)
    jsonFile.write(jsonCon)
    jsonFile.close()
    return loadJson['data']['all']
def getMatchSchedule(start='2018-10-01', end='2018-10-05'):
    timestamp = int(round(time.time() * 1000))
    html_cont = downloader.downloader(
        'http://matchweb.sports.qq.com/kbs/list?from=NBA_PC&columnId=100000&startTime=%s&endTime=%s&callback=schedule&_:%d'
        % (start, end, timestamp))
    con = re.match(r'^schedule\((.*)\)$', html_cont)
    loadJson = json.loads(con.group(1))
    jsonCon = json.dumps(loadJson['data'], sort_keys=False, indent=4, ensure_ascii=False)
    jsonFile = open('json/getMatchSchedule.json', 'w')
    jsonFile.write(jsonCon)
    jsonFile.close()
def getPlayerStatsRankSummary(year=2017, seasonType=2, limit=5):
    timestamp = int(round(time.time() * 1000))
    html_cont = downloader.downloader(
        'http://ziliaoku.sports.qq.com/cube/index?callback=getPlayerStatsRankSummary&cubeId=10&dimId=53,54,55,56,57,58&params=t2:%d|t3:%d&limit=%d&from=sportsdatabase&_:%d'
        % (year, seasonType, limit, timestamp))
    con = re.match(r'^getPlayerStatsRankSummary\((.*)\)$', html_cont)
    loadJson = json.loads(con.group(1))
    jsonCon = json.dumps(loadJson['data'], sort_keys=False, indent=4, ensure_ascii=False)
    jsonFile = open('json/getPlayerStatsRankSummary.json', 'w')
    jsonFile.write(jsonCon)
    jsonFile.close()
def run(self):
    while True:
        if len(_links) != 0:
            _down = downloader(self, self._sitename, _links.pop(0))
            if _down.download():
                _parse = parser(_down._save_location(), _down._data_is(), self._siteurl)
                if _parse.parse():
                    _parse_links = _parse.links_are()
        elif len(_images) != 0:
            pass  # the image-handling branch has no body in the original snippet
        else:
            continue
def get_pictures_id(self):
    painter_url = seed_url + str(self.painter_id)
    id_ = []
    t = 1
    html_source = downloader(painter_url).text
    while t == 1:
        html_soup = BeautifulSoup(html_source, 'lxml')
        span = html_soup.find('span', class_='next')
        try:
            next_page = span.a.get('href')
        except:
            next_page = None  # no "next" link on this page
            t = 0
        pattern = re.compile('(?<=data-id=")\S*(?=">)')
        picture_id = re.findall(pattern, html_source)
        id_.extend(picture_id)
        time.sleep(2)
        if next_page is not None:
            html_source = downloader(base_url + next_page).text
        else:
            break
    return id_
def list_album_in_artist_page(url_artist_menu, url_artist):
    # Lists containing all the albums from this artist
    albums_name_available = ["All the album"]
    albums_url = []
    res = requests.get(url_artist_menu)
    element = bs4.BeautifulSoup(res.text, "html.parser")
    results = element.findAll("a", href=True)
    # Iterate to find only the ones that have a ref to an album
    for element in results:
        if "album" in element["href"] and "help" not in element["href"]:
            albums_name_available.append(element.find("p").getText().strip())
            albums_url.append(element["href"])
    # Ask the user which album they would like to download
    user_response = ""
    # Loop that will only accept a choice from the list
    while user_response != len(albums_name_available):
        print("Which album would you like to download?")
        # Print the list with the available choices
        for index in range(0, len(albums_name_available)):
            print(str(index) + "- %s" % (albums_name_available[index]))
        print(str(len(albums_name_available)) + "- Exit")
        # Ask the user for what they want
        user_response = input("\nWrite the number of the selection: ")
        # Verify that the user input is only decimal
        if user_response.isdecimal():
            # Find what the user wants
            if int(user_response) in range(1, len(albums_name_available)):
                url_album = url_artist + albums_url[int(user_response) - 1]
                print(downloader.downloader(url_album))
                #@TODO send the choice from the user to the downloader
            elif int(user_response) == len(albums_name_available):
                print("No album will be downloaded.\n")
                break
        else:
            print("\nPlease make a choice in the list using the numbers for selection.\n")
    #list_album_in_artist_page(research_url_string)
def download_imagelist(self, imagelistUUID, flags):
    Session = self.SessionFactory()
    query_imagelist_uri = Session.query(model.ImagelistMetadata).\
        filter(model.Imagelist.identifier == imagelistUUID).\
        filter(model.Imagelist.id == model.ImagelistMetadata.fkImageList).\
        filter(model.ImagelistMetadata.key == 'hv:uri')
    if query_imagelist_uri.count() == 0:
        self.log.warning('image list uri not found')
        return True
    uri = None
    for item in query_imagelist_uri:
        uri = item.value
    if uri is None:
        self.log.error('image list uri not found')
        return True
    content = downloader.downloader(uri)
    if content is None:
        self.log.error("Content is None.")
        sys.exit(22)
    anchor = smimeX509validation.LoadDirChainOfTrust("/etc/grid-security/certificates/")
    smimeProcessor = smimeX509validation.smimeX509validation(anchor)
    try:
        downloader_response = content["responce"]
    except KeyError:
        self.log.error("Retrieve uri failed: '%s'" % (uri))
        return False
    try:
        smimeProcessor.Process(downloader_response)
    except smimeX509validation.truststore.TrustStoreError as exp:
        self.log.error("Validate text '%s' produced error '%s'" % (uri, exp))
        self.log.debug("Downloaded=%s" % (content['responce']))
        return False
    except smimeX509validation.smimeX509ValidationError as exp:
        self.log.error("Validate text '%s' produced error '%s'" % (uri, exp))
        self.log.debug("Downloaded=%s" % (uri))
        return False
    if not smimeProcessor.verified:
        self.log.error("Failed to verify text '%s'" % (content))
        return False
    try:
        candidate = json.loads(smimeProcessor.InputDaraStringIO.getvalue())
    except ValueError:
        self.log.error("Failed to parse JSON.")
        sys.exit(20)
    if candidate is None:
        self.log.error("No JSON content.")
        sys.exit(21)
    self.importer(candidate)
def get_photo(grade):
    _grade = grade + '%'
    sql = "select student_no from hnust_student where student_no like '%s'" % _grade
    try:
        cursor.execute(sql)
        results = cursor.fetchall()
    except:
        print("Error: unable to fetch data")
    for item in results:
        url = "http://kdjw.hnust.edu.cn/kdjw/uploadfile/studentphoto/pic/%s.JPG" % item[0]
        r = requests.get(url)
        if r.status_code == 200:
            down.downloader(url)
        else:
            url2 = "http://kdjw.hnust.edu.cn/kdjw/uploadfile/studentphoto/pic/%s.jpg" % item[0]
            r2 = requests.get(url2)
            if r2.status_code == 200:
                down.downloader(url2)
            else:
                pass
def teamRank():
    timestamp = int(round(time.time() * 1000))
    html_cont = downloader.downloader(
        'http://matchweb.sports.qq.com/rank/team?callback=teamRank&competitionId=100000&from=NBA_PC&_:%d'
        % timestamp)
    con = re.match(r'^teamRank\((.*)\)\;$', html_cont)
    loadJson = json.loads(con.group(1))
    jsonCon = json.dumps(loadJson[1], sort_keys=False, indent=4, ensure_ascii=False)
    jsonFile = open('json/teamRank.json', 'w')
    jsonFile.write(jsonCon)
    jsonFile.close()
def getTeamStatsTotal(year=2017, seasonType=2):
    timestamp = int(round(time.time() * 1000))
    html_cont = downloader.downloader(
        'http://ziliaoku.sports.qq.com/cube/index?callback=getTeamStatsTotal&cubeId=12&dimId=43&params=t2:%d|t3:%d|t64:west,east&order=t60&from=sportsdatabase&_:%d'
        % (year, seasonType, timestamp))
    con = re.match(r'^getTeamStatsTotal\((.*)\)$', html_cont)
    loadJson = json.loads(con.group(1))
    jsonCon = json.dumps(loadJson['data']['nbTeamSeasonStatRank'], sort_keys=False, indent=4, ensure_ascii=False)
    jsonFile = open('json/getTeamStatsTotal.json', 'w')
    jsonFile.write(jsonCon)
    jsonFile.close()
def getTeamSchedule(id=1):
    timestamp = int(round(time.time() * 1000))
    html_cont = downloader.downloader(
        'http://mat1.gtimg.com/apps/hpage2/nbateammatchlist_%d.json?callback=getCastData&_:%d'
        % (id, timestamp))
    con = re.match(r'^getCastData\((.*)\)$', html_cont)
    loadJson = json.loads(con.group(1))
    jsonCon = json.dumps(loadJson, sort_keys=False, indent=4, ensure_ascii=False)
    jsonFile = open('json/getTeamSchedule.json', 'w')
    jsonFile.write(jsonCon)
    jsonFile.close()
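# --- Hypothetical driver for the Tencent NBA helpers above (sketch only) ---
# Each helper writes into a json/ directory, so this sketch just makes sure that
# directory exists and then calls a few of them with the default arguments shown
# earlier; it is illustrative, not part of the original project.
import os

if __name__ == '__main__':
    if not os.path.isdir('json'):
        os.mkdir('json')  # the helpers open files under json/ for writing
    getTeamList()                                            # json/teamList.json
    getMatchSchedule(start='2018-10-01', end='2018-10-05')   # json/getMatchSchedule.json
    getTeamSchedule(id=1)                                    # json/getTeamSchedule.json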
def main(argv):
    # Get the information from the arguments
    try:
        target_address = argv[1]
        output_file = argv[2]
        os.chdir(argv[3])
    except IndexError:
        print "my error handler"
        print "Incorrect number of arguments"
        print "Usage downloader <target_address> <output_file> [-p [proxy]]"
        sys.exit(1)  # Exit status 1 for invalid usage
    # Get the information about the proxy
    if len(argv) > 3:
        if argv[3] == "-p":
            try:
                # Check if proxy is passed as an argument
                http_proxy = argv[4]
            except IndexError:
                # If proxy is not passed, use the environment variable
                http_proxy = os.environ['http_proxy']
    # Ensure that http_proxy variable is set
    try:
        http_proxy
    except NameError:
        http_proxy = None
    # Create a downloader object
    handle = downloader(target_address, output_file, http_proxy)
    # Download the files
    handle.download()
    # Concatenate the resulting segments
    handle.concatenate()
    # Delete temporary files
    handle.delete_temp()
def main(argv):
    # Get the information from the arguments
    try:
        target_address = argv[1]
        output_file = argv[2]
    except IndexError:
        print "my error handler"
        print "Incorrect number of arguments"
        print "Usage downloader <target_address> <output_file> [-p [proxy]]"
        sys.exit(1)  # Exit status 1 for invalid usage
    # Get the information about the proxy
    if len(argv) > 3:
        if argv[3] == "-p":
            try:
                # Check if proxy is passed as an argument
                http_proxy = argv[4]
            except IndexError:
                # If proxy is not passed, use the environment variable
                http_proxy = os.environ['http_proxy']
    # Ensure that http_proxy variable is set
    try:
        http_proxy
    except NameError:
        http_proxy = None
    # Create a downloader object
    handle = downloader(target_address, output_file, http_proxy)
    # Download the files
    handle.download()
    # Concatenate the resulting segments
    handle.concatenate()
    # Delete temporary files
    handle.delete_temp()
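# --- Hypothetical entry point for the main(argv) CLI above (sketch only) ---
# main() indexes argv[1] and argv[2] itself, so passing sys.argv directly matches
# its own usage string: downloader <target_address> <output_file> [-p [proxy]].
import sys

if __name__ == "__main__":
    main(sys.argv)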
def download_video(**kwargs):
    ds = []
    url = kwargs['url']
    print('getting video from url {}'.format(url))
    try:
        headers['X-Forwarded-For'] = randip()
        resp = requests.get(url, headers=headers)
        resp.encoding = 'utf-8'
        cont = resp.text
        video = mp4_reg.findall(cont)[0]
        d = downloader(url=video,
                       path=kwargs['downpath'],
                       picture=kwargs['picture'],
                       title=kwargs['title'],
                       id=kwargs['id'])
        return d
    except Exception as e:
        print(e)
        return False
def html_parse(html_source):
    img_urls = []
    html_soup = BeautifulSoup(html_source, 'lxml')
    div = html_soup.find_all('div', class_='works_display')
    a_label = div[0].a
    if a_label:
        img_part_url = div[0].a.get('href')
        pictures_source = downloader('http://www.pixiv.net/' + img_part_url).text
        html_soup2 = BeautifulSoup(pictures_source, 'lxml')
        img_items = html_soup2.find_all('div', class_='item-container')
        for item in img_items:
            img_url = item.img.get('data-src')
            img_urls.append(img_url)
        return img_urls
    else:
        pattern1 = re.compile('(?<=data-src=")\S*(?=" class="original-image")')
        img_url = re.findall(pattern1, html_source)
        return img_url
def doprocess(argv):
    localanalysis = "no"
    if len(sys.argv) < 3:
        usage()
    try:
        opts, args = getopt.getopt(argv, "l:d:f:h:n:t:o:")
    except getopt.GetoptError:
        usage()
    for opt, arg in opts:
        if opt == '-d':
            word = arg
        elif opt == '-t':
            filetypes = []
            if arg.count(",") != 0:
                filetypes = arg.split(",")
            else:
                filetypes.append(arg)
            print filetypes
        elif opt == '-l':
            limit = int(arg)
        elif opt == '-h':
            localanalysis = arg
        elif opt == '-n':
            filelimit = int(arg)
        elif opt == '-o':
            dir = arg
        elif opt == '-f':
            outhtml = arg
    if os.path.exists(dir):
        pass
    else:
        os.mkdir(dir)
    if localanalysis == "no":
        print "[-] Starting online search..."
        for filetype in filetypes:
            print "\n[-] Searching for " + filetype + " files, with a limit of " + str(limit)
            search = googlesearch.search_google(word, limit, start, filetype)
            search.process_files()
            files = search.get_files()
            print "Results: " + str(len(files)) + " files found"
            print "Starting to download " + str(filelimit) + " of them.."
            print "----------------------------------------------------\n"
            counter = 0
            for x in files:
                if counter <= filelimit:
                    print "[" + str(counter + 1) + "/" + str(filelimit) + "] " + x
                    getfile = downloader.downloader(x, dir)
                    getfile.down()
                    filename = getfile.name()
                    try:
                        if filename != "":
                            if filetype == "pdf":
                                test = metadataPDF.metapdf(dir + "/" + filename, password)
                            elif filetype == "doc" or filetype == "ppt" or filetype == "xls":
                                test = metadataMSOffice.metaMs2k(dir + "/" + filename)
                                if os.name == "posix":
                                    testex = metadataExtractor.metaExtractor(dir + "/" + filename)
                            elif filetype == "docx" or filetype == "pptx" or filetype == "xlsx":
                                test = metadataMSOfficeXML.metaInfoMS(dir + "/" + filename)
                            res = test.getData()
                            if res == "ok":
                                raw = test.getRaw()
                                users = test.getUsers()
                                paths = test.getPaths()
                                soft = test.getSoftware()
                                if (filetype == "doc" or filetype == "xls" or filetype == "ppt") and os.name == "posix":
                                    testex.runExtract()
                                    testex.getData()
                                    paths.extend(testex.getPaths())
                                respack = [x, users, paths, soft, raw]
                                all.append(respack)
                            else:
                                print "error"  # An error in the parsing process
                        else:
                            print "pass"
                    except Exception, e:
                        print("ERROR: " + str(e))
                    counter += 1
import sys
import time
from downloader import downloader

print "*************GANPAT, THE CRAWLER**************"
if len(sys.argv) > 1:
    print "SITE: ", sys.argv[1], " started at ", time.ctime(time.time())
    down1 = downloader(sys.argv[1])
else:
    print "USAGE: python ganpat.py <sitename>"
    sys.exit(2)
def doprocess(argv):
    filelimit = 50
    word = "local"
    localanalysis = "no"
    failedfiles = []
    emails = []
    if len(sys.argv) < 3:
        usage()
    try:
        opts, args = getopt.getopt(argv, "l:d:f:h:n:t:o:")
    except getopt.GetoptError:
        usage()
    for opt, arg in opts:
        if opt == '-d':
            word = arg
        elif opt == '-t':
            filetypes = []
            if arg.count(",") != 0:
                filetypes = arg.split(",")
            else:
                filetypes.append(arg)
            print filetypes
        elif opt == '-l':
            limit = int(arg)
        elif opt == '-h':
            localanalysis = arg
        elif opt == '-n':
            filelimit = int(arg)
        elif opt == '-o':
            dir = arg
        elif opt == '-f':
            outhtml = arg
    if os.path.exists(dir):
        pass
    else:
        os.mkdir(dir)
    if localanalysis == "no":
        print "\n[-] Starting online search..."
        for filetype in filetypes:
            print "\n[-] Searching for " + filetype + " files, with a limit of " + str(limit)
            search = googlesearch.search_google(word, limit, start, filetype)
            search.process_files()
            files = search.get_files()
            print "Results: " + str(len(files)) + " files found"
            print "Starting to download " + str(filelimit) + " of them:"
            print "----------------------------------------\n"
            counter = 1
            for x in files:
                if counter <= filelimit:
                    print "[" + str(counter) + "/" + str(filelimit) + "] " + x
                    getfile = downloader.downloader(x, dir)
                    getfile.down()
                    filename = getfile.name()
                    if filename != "":
                        if filetype == "pdf":
                            test = metadataPDF.metapdf(dir + "/" + filename, password)
                        elif filetype == "doc" or filetype == "ppt" or filetype == "xls":
                            test = metadataMSOffice.metaMs2k(dir + "/" + filename)
                            if os.name == "posix":
                                testex = metadataExtractor.metaExtractor(dir + "/" + filename)
                        elif filetype == "docx" or filetype == "pptx" or filetype == "xlsx":
                            test = metadataMSOfficeXML.metaInfoMS(dir + "/" + filename)
                        res = test.getData()
                        if res == "ok":
                            raw = test.getRaw()
                            users = test.getUsers()
                            paths = test.getPaths()
                            soft = test.getSoftware()
                            email = []
                            if filetype == "pdf" or filetype == "docx":
                                res = test.getTexts()
                                if res == "ok":
                                    email = test.getEmails()
                                    for em in email:
                                        emails.append(em)
                                else:
                                    email = []
                                    failedfiles.append(x + ":" + str(res))
                            respack = [x, users, paths, soft, raw, email]
                            all.append(respack)
                        else:
                            failedfiles.append(x + ":" + str(res))
                            print "\t [x] Error in the parsing process"  # An error in the parsing process
                    else:
                        pass
                    counter += 1
    else:
        print "[-] Starting local analysis in directory " + dir
        dirList = os.listdir(dir)
        print dirList
        for filename in dirList:
            if filename != "":
                filetype = str(filename.split(".")[-1])
                if filetype == "pdf":
                    test = metadataPDF.metapdf(dir + "/" + filename, password)
                elif filetype == "doc" or filetype == "ppt" or filetype == "xls":
                    print "doc"
                    test = metadataMSOffice.metaMs2k(dir + "/" + filename)
                    if os.name == "posix":
                        testex = metadataExtractor.metaExtractor(dir + "/" + filename)
                elif filetype == "docx" or filetype == "pptx" or filetype == "xlsx":
                    test = metadataMSOfficeXML.metaInfoMS(dir + "/" + filename)
                res = test.getData()
                if res == "ok":
                    raw = test.getRaw()
                    users = test.getUsers()
                    paths = test.getPaths()
                    soft = test.getSoftware()
                    if (filetype == "doc" or filetype == "xls" or filetype == "ppt") and os.name == "posix":
                        testex.runExtract()
                        testex.getData()
                        paths.extend(testex.getPaths())
                    respack = [filename, users, paths, soft, raw, email]
                    all.append(respack)
                else:
                    failedfiles.append(filename + ":" + str(res))
                    print "[x] Error in the parsing process"  # An error in the parsing process
                if filetype == "docx" or filetype == "pdf":
                    res = test.getTexts()
                    if res == "ok":
                        email = test.getEmails()
                        for x in email:
                            emails.append(x)
                    else:
                        failedfiles.append(filename + ":" + str(res))
                else:
                    print "pass"
            else:
                pass
    print "processing"
    proc = processor.processor(all)
    userlist = proc.sort_users()
    softlist = proc.sort_software()
    pathlist = proc.sort_paths()
    try:
        html = htmlExport.htmlExport(userlist, softlist, pathlist, all, outhtml, dir, failedfiles, word, emails)
        save = html.writehtml()
    except Exception, e:
        print e
        print "Error creating the file"
def run_downloader():
    d = downloader(url, searching_for, file_providers)
    write_info()
    d.run()
def main(podcastname):
    p = podcastparser.factory(podcastname)
    print p.episodeTitle, p.downloadUrl
    if p.downloadUrl:
        d = downloader(p.downloadUrl, p.episodeTitle, podcastname)
with open(prev_file, 'r', encoding='utf-8') as f:
    prev = json.load(f)
# prev = dao.get_to_be_downloaded()
logger.info("Loaded the want-to-download anime list")

home = os.path.expanduser("~")
driver = webdriver.Chrome(os.path.join(home, 'locallib/chromedriver'))
driver.maximize_window()

for i in range(len(prev)):
    # Skip, if already downloaded
    if prev[i][4]:
        continue
    start_time = datetime.datetime.utcnow().isoformat()
    success = downloader(prev[i][1], driver)
    end_time = datetime.datetime.utcnow().isoformat()
    if success:
        # dao.update_downloaded(end_time, prev[i][0])
        prev[i][4] = True
        prev[i].append(end_time)
    with open(prev_file, 'w') as f:
        json.dump(prev, f, indent=2)

# Don't forget this line of code.
driver.quit()

if len(prev) == 0: