Example #1
def fech_pages(mylist, myqueue, tag=False, que1=None):
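    # Download every URL in mylist (via downloader.downloader with a random proxy) and push the
    # fetched pages into myqueue; when tag is True the source URL is also recorded in que1.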
    # print('executing: fech_pages')
    for num_i in mylist:
        # print('push the page for this url onto the queue: ' + num_i)
        if tag:
            while myqueue.full():
                time.sleep(2)
            url_page = downloader.downloader(
                num_i,
                proxies=get_random_proxies(),
            )
            if url_page == False:
                pass
            else:
                myqueue.put(url_page)
                que1.put(num_i)
        else:
            while myqueue.full():
                time.sleep(10)
            url_page = downloader.downloader(
                num_i,
                proxies=get_random_proxies(),
            )
            if url_page == False:
                pass
            else:
                myqueue.put(url_page)
    return True
Example #2
def fech_anjuke_old():
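    # Crawl Anjuke listing pages: download each house page, parse it into a record, collect the
    # records in a DataFrame, and write the collected records to CSV after each listing page.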

    for url in generate_mainurl():
        print(url)
        mainpage_txt = downloader.downloader(url, proxies=get_random_proxies())

        houseurls = spider_anjuke.get_houseurls_old(mainpage_txt)
        infos = pd.DataFrame()
        for houseurl in houseurls:
            housepage_txt = downloader.downloader(houseurl,
                                                  proxies=get_random_proxies())
            if housepage_txt is False:
                pass
            else:
                info = items_anjuke.get_items_old(housepage_txt, houseurl)
                if info is False:
                    pass
                else:
                    info = pd.DataFrame([info])
                    infos = infos.append(info, ignore_index=True)

        piplines.write_csv(
            data=infos,
            city='shijiazhuang',
            web='anjuke',
        )
        infos = pd.DataFrame()
Example #3
def main():
    print(
        "This is vk messages dumper. It can:\n1) Dump all messages to json\n2) Download all attachments from dumped "
        "json\n3) Parse json to html (or txt)\n4) Real time dumping with lonpoll\n0) exit\nChoose what you wnat to "
        "do:")
    try:
        choice = getch()
    except termios.error:
        print("Your terminal doesn't support RT input!", file=sys.stderr)
        choice = input()
    if choice == '0':
        exit(0)
    elif choice == '1':
        from jsondump import main as dumper
        dumper()
    elif choice == '2':
        from downloader import main as downloader
        downloader()
    elif choice == '3':
        from htmlCreator import main as parser
        parser()
    elif choice == '4':
        from longpoll import main as rt_dumper
        rt_dumper()
    else:
        print("There is no such option!", file=sys.stderr)
        exit(1)
def radiosobterurlstream(name,url):
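    # Resolve the stream URL for a Portuguese radio station (radios.pt or radioonline.com.pt pages),
    # handle .asx playlists separately, and start playback in XBMC.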
    #GA("None","Radio - " + name)
    mensagemprogresso.create('TV Portuguesa','A carregar...')
    mensagemprogresso.update(0)
    if re.search('www.radios.pt',url):
        link=abrir_url(url)
        try:
            endereco=re.compile('<param name="url" value="(.+?)"').findall(link)[0]
        except:
            xbmc.executebuiltin("XBMC.Notification(Fightnight Music,Não é possível ouvir esta rádio.,'500000',)")
            return
        idradio=url.replace('http://www.radios.pt/portalradio/Sintonizador/?radio_id=','').replace('&scope=0','')
        thumbnail='http://www.radio.com.pt/APR.ROLI.WEB/Images/Logos/'+ idradio +'.gif'
    else:
        urlfinal='http://www.radioonline.com.pt/ajax/player.php?clear_s_name=' + url
        link= clean(abrir_url(urlfinal))
        try: player=re.compile('soundManager.createSound\({(.+?)autoLoad').findall(link)[0]
        except: player=False
        try:
            endereco=re.compile('url: "(.+?)"').findall(player)[0].replace(';','')
            if re.search('serverURL',player):
                rtmp=re.compile('serverURL: "(.+?)"').findall(player)[0]
                #rtmp=rtmp.replace('rtmp://195.23.102.206','rtmp://195.23.102.209') #tempfix
                rtmp=rtmp.replace(':1936','') #tempfix
                endereco=rtmp + ' playPath=' + endereco

        except:endereco=False
        if not endereco:
            try:endereco=re.compile('<param name="URL" value="(.+?)"').findall(link)[0]
            except:
                try: endereco=re.compile('<object data="(.+?)"').findall(link)[0]
                except: endereco=False

        if not endereco:
            xbmc.executebuiltin("XBMC.Notification(TV Portuguesa,Não é possível ouvir esta rádio.,'500000',)")
            mensagemprogresso.close()
            return

        try:thumbnail=re.compile('<img id="station-logo-player" src="(.+?)"').findall(link)[0]
        except: thumbnail=''
        if re.search('.asx',endereco):
            nomeasx='stream.asx'
            path = xbmc.translatePath(os.path.join(pastaperfil))
            lib=os.path.join(path, nomeasx)
            downloader(endereco,lib)
            texto= openfile(nomeasx)
            endereco = xbmc.PlayList(1)
            endereco.clear()
            streams=re.compile('<ref.+?"(.+?)"/>').findall(texto)
            for musica in streams:
                listitem = xbmcgui.ListItem(name, iconImage="DefaultVideo.png", thumbnailImage=thumbnail)
                listitem.setInfo("music", {"Title":name})
                endereco.add(musica,listitem)
        else: pass
    mensagemprogresso.close()
    listitem = xbmcgui.ListItem(name, iconImage="DefaultVideo.png", thumbnailImage=thumbnail)
    listitem.setInfo("music", {"Title":name})
    xbmc.Player().play(endereco,listitem)
Example #5
def main(audio_dir, json_dir, download_hours_ahead, audio_hours_retain,
         json_hours_retain):
    scraper.scraper(json_dir)
    logging.info("Scraper complete")
    sh.cleanup(audio_dir, audio_hours_retain)
    logging.info("Audio cleanup complete")
    sh.cleanup(json_dir, json_hours_retain)
    logging.info("JSON cleanup complete")
    downloader.downloader(download_hours_ahead, audio_dir, json_dir)
    logging.info("Downloader complete")
Example #6
def download(comic_name, page_url):
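    # Collect the image links found on page_url and download each one as <n>.jpg into the comic_name directory.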
    start = 0
    html = htmlthrower.gather_download_links(page_url)
    parsed_html = ImageFinder()
    parsed_html.feed(html)
    download_links = parsed_html.__dict__
    print(download_links['image_links'])

    for pages in (download_links['image_links']):
        print('downloading ...' + str(start + 1))
        start = start + 1
        downloader.downloader(pages, comic_name + '\\' + (str(start) + '.jpg'))
def download_test():
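    # Exercise the downloader class: queue two mirror URLs, start a multi-threaded transfer,
    # and print progress until the download finishes.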

    start = time.clock()
    dl = downloader()
    dl.config(max_thread=5, verify=False)
    # dl.add_url('http://xiazai.xiazaiba.com/Soft/W/WeChatSetup_2.6.4.1000_XiaZaiBa.zip')
    # dl.add_url('http://dblt.xiazaiba.com/Soft/W/WeChatSetup_2.6.4.1000_XiaZaiBa.zip')
    dl.add_url(
        'http://xiazai.xiazaiba.com/Soft/M/MarvelousDesigner7_Personal_4_1_100_XiaZaiBa.zip'
    )
    dl.add_url(
        'http://dblt.xiazaiba.com/Soft/M/MarvelousDesigner7_Personal_4_1_100_XiaZaiBa.zip'
    )
    # dl.add_url('https://download.virtualbox.org/virtualbox/5.2.18/VirtualBox-5.2.18-124319-Win.exe')
    opener = dl.open()
    # dl.file.validate_name()
    # opener = dl.load('', dl.file.name)

    # if opener.server_validate() is True:
    opener.start()

    while True:

        print 'online:%d, %d/%d [%f kb/s]' % (
            opener.GlobalProg.getOnlineQuantity(), opener.file.size -
            opener.getLeft(), opener.file.size, opener.getinsSpeed() / 1024)
        time.sleep(1)
        if opener.isDone():
            print '-----------------------------'
            print 'Total time: %f s, average speed: %f kb/s' % (
                time.clock() - start, opener.getavgSpeed() / 1024)
            print 'done!'
            break
Example #8
 def __init__(self, jobs, threads, play_url, dry_run):
     self.jobs = jobs
     self.threads = threads
     self.dl = downloader.downloader()
     self.dry_run = dry_run
     self.play_url = play_url
     self.output_filename = None
Example #9
def fech_anjuke_old(url):

    mainpage_txt = downloader.downloader(url, )

    pre_mainurls_list = []
    end_mainurls_list = []
    now_main_url = spider_anjuke.get_mainurls_old(mainpage_txt)
Example #10
 def __init__(self):
     self.page = 0
     self.cur_url = r"https://movie.douban.com/top250?start={page}&filter=&type="
     self.URL_manager = url_manager.url_manager()
     self.Downloader = downloader.downloader()
     self.Outputer = outputer.outputer()
     self.Parser = html_parser.html_parser()
def save(img_urls, dir_path):
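    # Download each image URL and write it into dir_path, sleeping briefly between requests.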
    for img_url in img_urls:

        picture = downloader(img_url).content
        picture_name = img_url.split('/')[-1]
        with open(dir_path + '/' + picture_name, 'wb') as f:
            f.write(picture)
        time.sleep(2)
def helper(client, sub_servers, C_lib):
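    # Handle client requests over a socket: '1' triggers downloader, '2' uploader, '3' delete, and '4' ends the session.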
    sql = sq.SQL()
    cursor = sql.getCursor()
    query = client["socket"].recv(1024)
    while query != '4':
        if query == '2':
            uploader(sql, client, sub_servers, C_lib)
        elif query == '1':
            downloader(sql, client, sub_servers, C_lib)
        elif query == '3':
            delete(sql, client, sub_servers, C_lib)
        sleep(1)
        print "Waiting for next command .."
        query = client["socket"].recv(1024)
        print 'Here is query ..', query
    client["socket"].close()
    print "End"
Example #13
def looper():
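    # Poll the clipboard every few seconds and hand any newly copied valid URL to the downloader;
    # copying 'Stop' ends the loop.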
    latest = 'go'
    clipboard.copy('go')

    while clipboard.paste() != 'Stop':

        if clipboard.paste() != latest:
            latest = clipboard.paste()
            logger.info('Latest clipboard item')
            logger.info(f'clipped: {latest}')

            if isValidURL(latest):
                logger.info('Valid url')
                dl.downloader(dl.ydl_opts, latest)
            else:
                logger.debug('Not a valid url')

        time.sleep(5)
    def process_queue():
        try:
            picture_id = crawl_queue.pop()
        except IndexError:
            print 'No pictures!'
            return
        picture_html = downloader(url + picture_id.encode('utf8')).text
        img_urls = html_parse(picture_html)

        save(img_urls, picture_dir)
Example #15
def getSinglePlayerInfo(id):
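    # Fetch a player's base info from the qq.com sports JSONP endpoint, strip the callback wrapper,
    # and return the parsed data.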
    timestamp = int(round(time.time() * 1000))

    html_cont = downloader.downloader(
        'http://ziliaoku.sports.qq.com/cube/index?callback=getSinglePlayerInfo&cubeId=8&dimId=5&params=t1:%s&from=sportsdatabase&_:%d'
        % (id, timestamp))
    con = re.match(r'^getSinglePlayerInfo\((.*)\)$', html_cont)
    loadJson = json.loads(con.group(1))
    return loadJson['data']['playerBaseInfo']
Example #16
 def __init__(self, config):
     if not self.getlock():
         exit()
     self.config = config
     self.headers = {'User-Agent': self.config['Headers']['headers']}
     self.connector = connector(config)
     self.downloader = downloader(self.headers,
                                  self.connector,
                                  dataDir=self.config['Main']['Data'],
                                  numProcs=4)
    def beginDownload(self):
        username = self.LoginField.get()
        password = self.PasswordField.get()
        module = self.ModuleField.get()
        folder = self.FolderField.get()
        DLR = downloader.downloader('GUI')

        _thread.start_new_thread(
            DLR.beginDLR,
            (username, password, module, folder, self.updateStatus))
Example #18
 def start(self):
     down = downloader(self._sitename, self._siteurl)
     if down.download():
         parse = parser(down._save_location(), down._data_is(), self._siteurl)
         if parse.parse():
             parse_links = parse.links_are()
             parse_links = self.check_rel(parse_links, "http://")
             self.remove_duplicate(1, parse_links)
             parse_img = parse.images_are()
             self.remove_duplicate(2, parse_img)
def downloadFic():
    inLink = request.forms.get('ficURL')
    fileFormat = int(request.forms.get('ficFormat'))

    if inLink.count('.') < 1:  #
        abort(400, "")  # check that the link doesn't point to local path
    if inLink.startswith('/'):  # better safe than sorry
        abort(400, "")  #

    c = getControler(inLink)  # find and create the controller that will extract
    # only the fiction text (without any header, footer, etc.) from the site

    d = downloader(
        c
    )  # downloader that will download each chapter; it uses the controller from above to clean up the text
    e = eBookCreater(d, fileFormat)  # creates the ebook

    threading.Timer(
        0.01, getFanFic,
        (d, e)).start()  # timer to start the download and conversion
    # it is a thread to allow providing a status of the
    # download to the browser

    # give the downloader a few seconds to get the details
    while d.firstChapter:  # Wait until the first page is ready
        time.sleep(
            0.1
        )  # do not use pass because the cpu gets pegged and the flag for the first chapter is never changed
        # with a wait it works

    yield d.storyName + " - " + d.autor + ";;;"
    maxChapter = len(d.tableOfContent)
    yield str(maxChapter) + ";;;"

    while d.done:  # Wait until the book is downloaded and give the status of the downloaded chapters
        yield str(d.chapterNumber) + ";;;"
        time.sleep(1)  # wait 1 second just not to spam the client

    if not d.error:  # In case there is an error
        abort(400, "")

    while e.done:  # Wait until the book is created same as above
        yield "Creating e-book file;;;-1;;;"
        time.sleep(1)

    if not e.error:
        abort(400, "")

    threading.Timer(
        600, cleanUP,
        (d.folder, 0)).start()  # create a timer to delete the e-book file
    # after 10 minutes (600 seconds)

    yield str(d.folder)  # return the id for downloading the file
def getTeamList():
    html_cont = downloader.downloader(
        'http://matchweb.sports.qq.com/team/list?columnId=100000&competitionId=100000&callback=getTeamList')
    jsonFile = open('json/teamList.json', 'w')
    con = re.match(r'^getTeamList\((.*)\)$', html_cont)
    loadJson = json.loads(con.group(1))
    jsonCon = json.dumps(
        loadJson['data'], sort_keys=False, indent=4, ensure_ascii=False)
    jsonFile.write(jsonCon)
    jsonFile.close()
    return loadJson['data']['all']
def getMatchSchedule(start = '2018-10-01', end = '2018-10-05'):
    timestamp = int(round(time.time() * 1000))

    html_cont = downloader.downloader(
        'http://matchweb.sports.qq.com/kbs/list?from=NBA_PC&columnId=100000&startTime=%s&endTime=%s&callback=schedule&_:%d' % (start, end, timestamp))
    con = re.match(r'^schedule\((.*)\)$', html_cont)
    loadJson = json.loads(con.group(1))
    jsonCon = json.dumps(loadJson['data'], sort_keys=False, indent=4, ensure_ascii=False)

    jsonFile = open('json/getMatchSchedule.json', 'w')
    jsonFile.write(jsonCon)
    jsonFile.close()
Example #22
def getPlayerStatsRankSummary(year = 2017, seasonType = 2, limit = 5):
    timestamp = int(round(time.time() * 1000))

    html_cont = downloader.downloader(
        'http://ziliaoku.sports.qq.com/cube/index?callback=getPlayerStatsRankSummary&cubeId=10&dimId=53,54,55,56,57,58&params=t2:%d|t3:%d&limit=%d&from=sportsdatabase&_:%d' % (year, seasonType, limit, timestamp))
    con = re.match(r'^getPlayerStatsRankSummary\((.*)\)$', html_cont)
    loadJson = json.loads(con.group(1))
    jsonCon = json.dumps(loadJson['data'], sort_keys=False, indent=4, ensure_ascii=False)

    jsonFile = open('json/getPlayerStatsRankSummary.json', 'w')
    jsonFile.write(jsonCon)
    jsonFile.close()
Example #23
    def run(self):
        while True:
            if len(_links) != 0:
                _down = downloader(self, self._sitename, _links.pop(0))
                if _down.download():
                    _parse = parser(_down._save_location(), _down._data_is(), self._siteurl)
                    if _parse.parse():
                        _parse_links = _parse.links_are()
            elif len(_images) != 0:
                pass  # the image branch was left empty in the original snippet
            else:
                continue
Example #24
    def get_pictures_id(self):
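        # Walk the painter's paginated gallery, collecting picture ids from the data-id attributes
        # until there is no next page.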
        painter_url = seed_url + str(self.painter_id)
        id_ = []
        t = 1

        html_source = downloader(painter_url).text
        while t == 1:
            html_soup = BeautifulSoup(html_source, 'lxml')
            span = html_soup.find('span', class_='next')
            try:
                next_page = span.a.get('href')
            except:
                next_page = None
                t = 0
            pattern = re.compile('(?<=data-id=")\S*(?=">)')
            picture_id = re.findall(pattern, html_source)
            id_.extend(picture_id)
            time.sleep(2)
            if next_page is not None:
                html_source = downloader(base_url + next_page).text
            else:
                break
        return id_
Example #25
def list_album_in_artist_page(url_artist_menu, url_artist):
    # Lists holding the album names and URLs found on this artist's page
    albums_name_available = ["All the album"]
    albums_url = []
    
    
    res = requests.get(url_artist_menu)
    element = bs4.BeautifulSoup(res.text, "html.parser")
    results = element.findAll("a", href=True)

    # Iterate to keep only the links that refer to an album
    for element in results:
        if "album" in element["href"] and "help" not in element["href"]:
            albums_name_available.append(element.find("p").getText().strip())
            albums_url.append(element["href"])
    
    # Ask the user what album they would like to download
    user_response = ""
    
    # Loop that will only accept choices from the list
    while user_response != len(albums_name_available):
        print("Which album would you like to download?")
        
        # Print the list with the available choice
        for index in range(0,len(albums_name_available)):
            print(str(index) + "- %s" % (albums_name_available[index]))
        print(str(len(albums_name_available)) +"- Exit")  
    
        # Ask the user for what they want
        user_response = input("\nWrite the number of the selection: ")
        
        # Verify that the user input contains only decimal digits
        if user_response.isdecimal():

            # Find out what the user wants
            if int(user_response) in range(1, len(albums_name_available)):
                url_album = url_artist + albums_url[int(user_response) - 1]
                
                print(downloader.downloader(url_album))   
                 
            #@TODO send the choice from the user to the downloader
            
            elif int(user_response) == len(albums_name_available) :
                print("No album will be downloaded.\n")
                break
        
        else:
            print("\nPlease make a choice in the list using the numbers for selection.\n")
        

#list_album_in_artist_page(research_url_string)
Example #26
    def download_imagelist(self, imagelistUUID, flags):
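        # Look up the image list URI stored in the database, download it, verify its S/MIME
        # signature against the local trust store, and import the resulting JSON.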
        Session = self.SessionFactory()
        query_imagelist_uri = Session.query(model.ImagelistMetadata).\
                filter(model.Imagelist.identifier == imagelistUUID).\
                filter(model.Imagelist.id == model.ImagelistMetadata.fkImageList).\
                filter(model.ImagelistMetadata.key == 'hv:uri')

        if query_imagelist_uri.count() == 0:
            self.log.warning('image list uri not found')
            return True
        uri = None
        for item in query_imagelist_uri:
            uri = item.value
        if uri is None:
            self.log.error('image list uri not found')
            return True
        content = downloader.downloader(uri)
        if content is None:
            self.log.error("Content is None.")
            sys.exit(22)
        anchor = smimeX509validation.LoadDirChainOfTrust("/etc/grid-security/certificates/")
        smimeProcessor = smimeX509validation.smimeX509validation(anchor)
        try:
            downloader_response = content["responce"]
        except KeyError:
            self.log.error("Retrieve uri failed: '%s'" % (uri))
            return False
        try:
            smimeProcessor.Process(downloader_response)
        except smimeX509validation.truststore.TrustStoreError as exp:
            self.log.error("Validate text '%s' produced error '%s'" % (uri, exp))
            self.log.debug("Downloaded=%s" % (content['responce']))
            return False
        except smimeX509validation.smimeX509ValidationError as exp:
            self.log.error("Validate text '%s' produced error '%s'" % (uri, exp))
            self.log.debug("Downloaded=%s" % (uri))
            return False
        if not smimeProcessor.verified:
            self.log.error("Failed to  verify text '%s'" % (content))
            return False
        try:
            candidate = json.loads(smimeProcessor.InputDaraStringIO.getvalue())
        except ValueError:
            self.log.error("Failed to parse JSON.")
            sys.exit(20)
        if candidate is None:
            self.log.error("No JSON content.")
            sys.exit(21)
        self.importer(candidate)
Example #27
def get_photo(grade):
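    # For every student number matching the grade prefix, try the uppercase .JPG photo URL first
    # and fall back to the lowercase .jpg variant.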
    _grade = grade + '%'
    sql = "select student_no from hnust_student where student_no like '%s'" % _grade
    try:

        cursor.execute(sql)
        results = cursor.fetchall()

    except:
        print("Error: unable to fetch data")
        return
    for item in results:
        url = "http://kdjw.hnust.edu.cn/kdjw/uploadfile/studentphoto/pic/%s.JPG" % item[
            0]
        r = requests.get(url)
        if r.status_code == 200:
            down.downloader(url)
        else:
            url2 = "http://kdjw.hnust.edu.cn/kdjw/uploadfile/studentphoto/pic/%s.jpg" % item[
                0]
            r2 = requests.get(url2)
            if r2.status_code == 200:
                down.downloader(url2)
            else:
                pass
Example #28
def teamRank():
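    # Fetch the team ranking JSONP feed, strip the callback wrapper, and save the data to json/teamRank.json.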
    timestamp = int(round(time.time() * 1000))

    html_cont = downloader.downloader(
        'http://matchweb.sports.qq.com/rank/team?callback=teamRank&competitionId=100000&from=NBA_PC&_:%d'
        % timestamp)
    con = re.match(r'^teamRank\((.*)\)\;$', html_cont)
    loadJson = json.loads(con.group(1))
    jsonCon = json.dumps(loadJson[1],
                         sort_keys=False,
                         indent=4,
                         ensure_ascii=False)

    jsonFile = open('json/teamRank.json', 'w')
    jsonFile.write(jsonCon)
    jsonFile.close()
def getTeamStatsTotal(year=2017, seasonType=2):
    timestamp = int(round(time.time() * 1000))

    html_cont = downloader.downloader(
        'http://ziliaoku.sports.qq.com/cube/index?callback=getTeamStatsTotal&cubeId=12&dimId=43&params=t2:%d|t3:%d|t64:west,east&order=t60&from=sportsdatabase&_:%d'
        % (year, seasonType, timestamp))
    con = re.match(r'^getTeamStatsTotal\((.*)\)$', html_cont)
    loadJson = json.loads(con.group(1))
    jsonCon = json.dumps(loadJson['data']['nbTeamSeasonStatRank'],
                         sort_keys=False,
                         indent=4,
                         ensure_ascii=False)

    jsonFile = open('json/getTeamStatsTotal.json', 'w')
    jsonFile.write(jsonCon)
    jsonFile.close()
def getTeamSchedule(id=1):
    timestamp = int(round(time.time() * 1000))

    html_cont = downloader.downloader(
        'http://mat1.gtimg.com/apps/hpage2/nbateammatchlist_%d.json?callback=getCastData&_:%d'
        % (id, timestamp))

    con = re.match(r'^getCastData\((.*)\)$', html_cont)
    loadJson = json.loads(con.group(1))
    jsonCon = json.dumps(loadJson,
                         sort_keys=False,
                         indent=4,
                         ensure_ascii=False)

    jsonFile = open('json/getTeamSchedule.json', 'w')
    jsonFile.write(jsonCon)
    jsonFile.close()
Example #31
def main(argv):

	# Get the information from the arguments
	try:
		target_address=argv[1]
		output_file=argv[2]
		os.chdir(argv[3])
	except IndexError:
		print "my error handler"
		print "Incorrect number of arguments"
		print "Usage downloader <target_address> <output_file> [-p [proxy]]"
		sys.exit(1)		# Exit status 1 for invalid usage

	# Get the information about the proxy
	if len(argv)>3:
		if argv[3]=="-p":

			try:
				# Check if proxy is passed as an argument
				http_proxy=argv[4]

			except IndexError:
				# If proxy is not passed, use the environment variable
				http_proxy=os.environ['http_proxy']


	# Ensure that http_proxy variable is set
	try:
		http_proxy
	except NameError:
		http_proxy=None


	# Create a downloader object
	handle=downloader(target_address,output_file,http_proxy)

	# Download the files
	handle.download()

	# Concatenate the resulting segments
	handle.concatenate()

	# Delete temporary files
	handle.delete_temp()
Example #32
def main(argv):

	# Get the information from the arguments
	try:
		target_address=argv[1]
		output_file=argv[2]
	except IndexError:
		print "my error handler"
		print "Incorrect number of arguments"
		print "Usage downloader <target_address> <output_file> [-p [proxy]]"
		sys.exit(1)		# Exit status 1 for invalid usage

	# Get the information about the proxy
	if len(argv)>3:
		if argv[3]=="-p":

			try:
				# Check if proxy is passed as an argument
				http_proxy=argv[4]

			except IndexError:
				# If proxy is not passed, use the environment variable
				http_proxy=os.environ['http_proxy']


	# Ensure that http_proxy variable is set
	try:
		http_proxy
	except NameError:
		http_proxy=None


	# Create a downloader object
	handle=downloader(target_address,output_file,http_proxy)

	# Download the files
	handle.download()

	# Concatenate the resulting segments
	handle.concatenate()

	# Delete temporary files
	handle.delete_temp()
Example #33
def download_video(**kwargs):
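    # Fetch the page with a spoofed X-Forwarded-For header, extract the mp4 URL, and hand it to
    # downloader(); returns False if anything fails.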
    ds = []
    url = kwargs['url']
    print('getting video from url {}'.format(url))
    try:
        headers['X-Forwarded-For'] = randip()
        resp = requests.get(url, headers=headers)
        resp.encoding = 'utf-8'
        cont = resp.text
        video = mp4_reg.findall(cont)[0]
        d = downloader(url=video,
                       path=kwargs['downpath'],
                       picture=kwargs['picture'],
                       title=kwargs['title'],
                       id=kwargs['id'])
        return d
    except Exception as e:
        print(e)
        return False
def html_parse(html_source):
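    # Parse a pixiv work page: if it links to a multi-image viewer, collect every image URL from it;
    # otherwise return the single original-image URL.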
    img_urls = []
    html_soup = BeautifulSoup(html_source, 'lxml')
    div = html_soup.find_all('div', class_='works_display')
    a_label = div[0].a
    if a_label:
        img_part_url = div[0].a.get('href')
        pictures_source = downloader('http://www.pixiv.net/' +
                                     img_part_url).text
        html_soup2 = BeautifulSoup(pictures_source, 'lxml')
        img_items = html_soup2.find_all('div', class_='item-container')
        for item in img_items:
            img_url = item.img.get('data-src')
            img_urls.append(img_url)
        return img_urls
    else:
        pattern1 = re.compile('(?<=data-src=")\S*(?=" class="original-image")')
        img_url = re.findall(pattern1, html_source)
        return img_url
Example #35
def doprocess(argv):
	localanalysis= "no"
	if len(sys.argv) < 3:
		usage()
	try:
		opts,args = getopt.getopt(argv,"l:d:f:h:n:t:o:")
	except getopt.GetoptError:
		usage()
	for opt,arg in opts:
		if opt == '-d':
			word = arg
		elif opt == '-t':
			filetypes=[]
			if arg.count(",") != 0:
				filetypes = arg.split(",")
			else:
				filetypes.append(arg)
				print filetypes
		elif opt == '-l':
			limit = int(arg)
		elif opt == '-h':
			localanalysis=arg
		elif opt == '-n':
			filelimit = int(arg)
		elif opt == '-o':
			dir = arg
		elif opt == '-f':
			outhtml = arg
	if os.path.exists(dir):
		pass
	else:
		os.mkdir(dir)
	if localanalysis == "no":
		print "[-] Starting online search..."
		for filetype in filetypes:
			print "\n[-] Searching for "+filetype+ " files, with a limit of " + str(limit)
			search=googlesearch.search_google(word,limit,start,filetype)
			search.process_files()
			files=search.get_files()
			print "Results: " + str(len(files)) + " files found" 
			print "Starting to download "+ str(filelimit) + " of them.."
			print "----------------------------------------------------\n"
			counter=0
			for x in files:
				if counter <= filelimit:
					print "["+str(counter+1)+"/"+str(filelimit)+"] " + x
					getfile=downloader.downloader(x,dir)
					getfile.down()
					filename=getfile.name()	
					try:
						if filename !="":
							if filetype == "pdf":
								test=metadataPDF.metapdf(dir+"/"+filename,password)
							elif filetype == "doc" or filetype == "ppt" or filetype == "xls":
								test=metadataMSOffice.metaMs2k(dir+"/"+filename)	
								if os.name=="posix":
									testex=metadataExtractor.metaExtractor(dir+"/"+filename)
							elif filetype == "docx" or filetype == "pptx" or filetype == "xlsx":
								test=metadataMSOfficeXML.metaInfoMS(dir+"/"+filename)
							res=test.getData()
							if res=="ok":
								raw=test.getRaw()
								users=test.getUsers()
								paths=test.getPaths()
								soft=test.getSoftware()
								if (filetype == "doc" or filetype == "xls" or filetype == "ppt") and os.name=="posix":
									testex.runExtract()
									testex.getData()
									paths.extend(testex.getPaths())
								respack=[x,users,paths,soft,raw]
								all.append(respack)
							else:
								print "error" #A error in the parsing process
						else:
							print "pass"
					except Exception, e:
						print("ERROR: "+str(e))
					counter+=1
Example #36
import sys
import time
from downloader import downloader
print "*************GANPAT, THE CRAWLER**************";
if len(sys.argv)>1 :
    print "SITE: ",sys.argv[1]," started at ",time.ctime(time.time());
    down1=downloader(sys.argv[1]);
else :
	print "USAGE: python ganpat.py <sitename>";
	sys.exit(2);
Example #37
def doprocess(argv):
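    # Search Google for documents of the requested file types (or analyse a local directory),
    # download them, extract metadata such as users, paths, software and e-mail addresses,
    # and write an HTML report.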
    filelimit = 50
    word = "local"
    localanalysis = "no"
    failedfiles = []
    emails = []
    if len(sys.argv) < 3:
        usage()
    try:
        opts, args = getopt.getopt(argv, "l:d:f:h:n:t:o:")
    except getopt.GetoptError:
        usage()
    for opt, arg in opts:
        if opt == '-d':
            word = arg
        elif opt == '-t':
            filetypes = []
            if arg.count(",") != 0:
                filetypes = arg.split(",")
            else:
                filetypes.append(arg)
                print filetypes
        elif opt == '-l':
            limit = int(arg)
        elif opt == '-h':
            localanalysis = arg
        elif opt == '-n':
            filelimit = int(arg)
        elif opt == '-o':
            dir = arg
        elif opt == '-f':
            outhtml = arg
    if os.path.exists(dir):
        pass
    else:
        os.mkdir(dir)
    if localanalysis == "no":
        print "\n[-] Starting online search..."
        for filetype in filetypes:
            print "\n[-] Searching for "+ filetype + " files, with a limit of " + str(limit)
            search = googlesearch.search_google(word, limit, start, filetype)
            search.process_files()
            files = search.get_files()
            print "Results: " + str(len(files)) + " files found"
            print "Starting to download " + str(filelimit) + " of them:"
            print "----------------------------------------\n"
            counter = 1
            for x in files:
                if counter <= filelimit:
                    print "[" + str(counter) + "/" + str(filelimit) + "] " + x
                    getfile = downloader.downloader(x, dir)
                    getfile.down()
                    filename = getfile.name()
                    if filename != "":
                        if filetype == "pdf":
                            test = metadataPDF.metapdf(dir + "/" + filename, password)
                        elif filetype == "doc" or filetype == "ppt" or filetype == "xls":
                            test = metadataMSOffice.metaMs2k(dir + "/" + filename)
                            if os.name == "posix":
                                testex = metadataExtractor.metaExtractor(dir + "/" + filename)
                        elif filetype == "docx" or filetype == "pptx" or filetype == "xlsx":
                            test = metadataMSOfficeXML.metaInfoMS(dir + "/" + filename)
                        res = test.getData()
                        if res == "ok":
                            raw = test.getRaw()
                            users = test.getUsers()
                            paths = test.getPaths()
                            soft = test.getSoftware()
                            email = []
                            if filetype == "pdf" or filetype == "docx":
                                res = test.getTexts()
                                if res == "ok":
                                    email = test.getEmails()
                                    for em in email:
                                        emails.append(em)
                                else:
                                    email = []
                                    failedfiles.append(x + ":" + str(res))
                            respack=[x, users, paths, soft, raw, email]
                            all.append(respack)
                        else:
                            failedfiles.append(x + ":" + str(res))
                            print "\t [x] Error in the parsing process" #A error in the parsing process
                    else:
                        pass
                counter += 1
    else:
        print "[-] Starting local analysis in directory " + dir
        dirList = os.listdir(dir)
        print dirList
        for filename in dirList:
            if filename != "":
                filetype = str(filename.split(".")[-1])
                if filetype == "pdf":
                    test = metadataPDF.metapdf(dir + "/" + filename, password)
                elif filetype == "doc" or filetype == "ppt" or filetype == "xls":
                    print "doc"
                    test = metadataMSOffice.metaMs2k(dir + "/" + filename)
                    if os.name == "posix":
                        testex = metadataExtractor.metaExtractor(dir + "/" + filename)
                elif filetype == "docx" or filetype == "pptx" or filetype == "xlsx":
                    test = metadataMSOfficeXML.metaInfoMS(dir + "/" + filename)
                res = test.getData()
                if res == "ok":
                    raw = test.getRaw()
                    users = test.getUsers()
                    paths = test.getPaths()
                    soft = test.getSoftware()
                    if (filetype == "doc" or filetype == "xls" or filetype == "ppt") and os.name=="posix":
                        testex.runExtract()
                        testex.getData()
                        paths.extend(testex.getPaths())
                        respack = [filename, users, paths, soft, raw, email]
                        all.append(respack)
                    else:
                        failedfiles.append(filename + ":" + str(res))
                        print "[x] Error in the parsing process"  # A error in the parsing process

                    if filetype == "docx" or filetype == "pdf":
                        res = test.getTexts()
                        if res == "ok":
                            email = test.getEmails()
                            for x in email:
                                emails.append(x)
                        else:
                            failedfiles.append(filename + ":" + str(res))
                    else:
                        print "pass"
            else:
                pass
    print "processing"
    proc = processor.processor(all)
    userlist = proc.sort_users()
    softlist = proc.sort_software()
    pathlist = proc.sort_paths()
    try:
        html = htmlExport.htmlExport(userlist, softlist, pathlist, all, outhtml, dir, failedfiles, word, emails)
        save = html.writehtml()
    except Exception, e:
        print e
        print "Error creating the file"
Example #38
def run_downloader():
	d = downloader(url,searching_for,file_providers)
	write_info()
	d.run()
def main(podcastname):
  p = podcastparser.factory(podcastname)
  print p.episodeTitle, p.downloadUrl
  if p.downloadUrl:
    d = downloader(p.downloadUrl, p.episodeTitle, podcastname)
with open(prev_file, 'r', encoding='utf-8') as f:
    prev = json.load(f)
# prev = dao.get_to_be_downloaded()
logger.info("Loaded the want-to-download anime list")

home = os.path.expanduser("~")
driver = webdriver.Chrome(os.path.join(home, 'locallib/chromedriver'))
driver.maximize_window()

for i in range(len(prev)):
    # Skip entries that have already been downloaded
    if prev[i][4]:
        continue

    start_time = datetime.datetime.utcnow().isoformat()
    success = downloader(prev[i][1], driver)
    end_time = datetime.datetime.utcnow().isoformat()

    if success:
        # dao.update_downloaded(end_time, prev[i][0])

        prev[i][4] = True
        prev[i].append(end_time)

        with open(prev_file, 'w') as f:
            json.dump(prev, f, indent=2)

# Don't forget to close the browser when done.
driver.quit()

if len(prev) == 0: