Ejemplo n.º 1
0
    def __init__(self, url, referer, id, time, p, format, cw=None, isVideo=False, try_n=4, n_thread=1):
        """Represent one downloadable Twitter media item (image or video).

        The filename is built from the user format string: ':' is replaced
        with a full-width colon, and the literal words 'id'/'page' in the
        formatted timestamp are substituted with the tweet id and page index.
        """
        self._url = url
        self.referer = referer
        self.id = int(id)
        self.time = time
        self.p = p
        self.n_thread = n_thread
        # Only images get an alternative-URL fallback; videos never do.
        url_alter = None if isVideo else Url_alter(url)
        # Videos whose URL is not a direct .mp4/.m3u8 must be resolved lazily.
        if isVideo and get_ext(url).lower() not in ('.mp4', '.m3u8'):
            get = self.get
        else:
            get = lambda _: self._url
        self.url = LazyUrl_twitter(referer, get, self, url_alter)
        self.format = format
        self.cw = cw
        self.isVideo = isVideo
        self.try_n = try_n
        date = datetime.fromtimestamp(float(time))
        timeStamp = date.strftime(format).replace(':', '\uff1a')  # local time
        ext = '.mp4' if isVideo else get_ext(url)
        self.filename = timeStamp.replace('id', str(id)).replace('page', str(p)) + ext
Ejemplo n.º 2
0
 def run(self):
     """Run the save dialog and export the playlist to the chosen file.

     If the chosen name already carries a known playlist extension, it is
     exported as that type; otherwise it defaults to m3u.
     """
     response = gtk.FileChooserDialog.run(self)
     self.set_modal(True)
     if response == gtk.RESPONSE_OK:
         filename = self.get_filename()
         if utils.get_ext(filename) in [".m3u", ".pls", ".xspf"]:
             utils.export_playlist(self.songs, filename,
                                   utils.get_ext(filename, False))
         else:
             # Dot-less type, matching get_ext(..., False) in the branch
             # above; the original ".m3u" produced "name..m3u" filenames.
             pl_type = "m3u"
             filename = filename + "." + pl_type
             utils.export_playlist(self.songs, filename, pl_type)
     self.destroy()
Ejemplo n.º 3
0
 def run(self):
     """Run the save dialog and export the playlist, emitting dialog signals.

     If the chosen name already carries a known playlist extension, it is
     exported as that type; otherwise it defaults to m3u.
     """
     Dispatcher.emit("dialog-run")
     response = gtk.FileChooserDialog.run(self)
     self.set_modal(True)
     if response == gtk.RESPONSE_OK:
         filename = self.get_filename()
         if utils.get_ext(filename) in [".m3u", ".pls", ".xspf"]:
             utils.export_playlist(self.songs, filename,
                                   utils.get_ext(filename, False))
         else:
             # Dot-less type, matching get_ext(..., False) in the branch
             # above; the original ".m3u" produced "name..m3u" filenames.
             pl_type = "m3u"
             filename = filename + "." + pl_type
             utils.export_playlist(self.songs, filename, pl_type)
     self.destroy()
     Dispatcher.emit("dialog-close")
    def get(self, _):
        """Resolve and cache the real video URL via ytdl, with retries.

        Returns either a direct URL string or an M3u8_stream wrapper for
        HLS sources. After self.try_n failed attempts, re-raises the last
        error.
        """
        if self._url_cache:
            return self._url_cache
        print_ = get_print(self.cw)
        for try_ in range(self.try_n):
            try:
                d = ytdl.YoutubeDL()
                info = d.extract_info(self._url)

                url = info['url']
                ext = get_ext(url)
                self.ext = ext
                print_('get_video: {} {}'.format(url, ext))
                if ext.lower() == '.m3u8':
                    # HLS playlist: wrap in a multi-threaded stream downloader.
                    url = M3u8_stream(url,
                                      n_thread=self.n_thread,
                                      post_processing=True)
                self._url_cache = url
                return url
            except Exception as e:
                e_ = e
                msg = print_error(e)[(-1)]
                print_('\nTwitter video Error:\n{}'.format(msg))
                if try_ < self.try_n - 1:
                    sleep(10, self.cw)
        else:
            # for/else: reached only when every attempt failed (no return).
            raise e_
Ejemplo n.º 5
0
def fetch_hubble_image(image_id):
    """Download the highest-resolution file of a Hubble image by its id.

    The API lists image files in ascending quality; the last entry is
    taken as the best one.
    """
    hubble_api = f"http://hubblesite.org/api/v3/image/{image_id}"
    response = requests.get(hubble_api)
    response.raise_for_status()  # fail fast instead of KeyError-ing on an error body
    best_image = response.json()['image_files'][-1]
    # Single, consistent lookup (original mixed ['file_url'] and .get('file_url')).
    image_url = best_image['file_url']
    image_ext = get_ext(image_url)
    load_image(image_url, f"{image_id}{image_ext}")
Ejemplo n.º 6
0
def handler(event, context):
    """AWS Lambda entry point: extract text from each S3 object in the event.

    Iterates all 's3' sub-events in the payload, runs the extractor on each
    object, and writes the extracted text back under a .txt key.
    """
    aws_event = aws_events.event_parser.EventParser(event)
    logger.info(aws_event.event_types())
    # `event` doubles as the loop sentinel; next_event returns None when exhausted.
    event = True
    while event:
        event = aws_event.next_event('s3')
        logger.info(event)
        if (event is not None):
            event_klass = factory.instantiate('s3', event)
            new_file = event_klass.change_file_extension('txt')
            logger.info(new_file)
            ext = get_ext(event_klass.key)
            logger.info(ext)

            try:
                status, text = ocr_extractor.extract(event_handler=event_klass,
                                                     context=context,
                                                     ext=ext)
                if (status == 'ok'):
                    event_klass.put_content(new_file, text)
                elif (status == 'ocr'):
                    # 'ocr' status: hand off to the OCR lambda instead.
                    logger.info('>>> Sending to OCR lambda')

            except Exception as e:
                # Best-effort: log the failure and continue with the next event.
                logger.exception('Extraction exception for <{}>'.format(
                    event_klass.key))
 def get(self, referer):
     """Resolve the full-size image URL from the detail page.

     Also sets self.filename from the page index. Returns (url, referer).
     """
     soup = downloader.read_soup(self._url, referer, session=self.session)
     detail = soup.find('div', id='display_image_detail')
     link = detail.find('img').parent['href']
     url = urljoin(self._url, link)
     self.filename = '{:04}{}'.format(self._p, get_ext(url))
     return url, self._url
Ejemplo n.º 8
0
    def read(self):
        """Derive a display filename for a direct-download URL and register it.

        The name comes from a file/filename query parameter when present,
        otherwise from the URL's basename; a short md5 tag keeps names
        unique across URLs.
        """
        qs = query_url(self.url)
        for key in qs:
            if key.lower() in ('file', 'filename'):
                name = qs[key][-1]
                break
        else:
            name = os.path.basename(self.url)
            for esc in ['?', '#']:
                name = name.split(esc)[0]

        ext = get_ext(name)
        if not ext:
            try:
                ext = downloader.get_ext(self.url)
            except Exception:  # was a bare except: don't swallow SystemExit/KeyboardInterrupt
                ext = ''
        name = os.path.splitext(name)[0]

        self.urls.append(self.url)

        # Short content-independent tag so distinct URLs never collide.
        id_ = md5(self.url.encode('utf8')).hexdigest()[:8]
        tail = ' ({}){}'.format(id_, ext)
        filename = clean_title(name, n=-len(tail)) + tail

        self.filenames[self.url] = filename

        self.title = filename
Ejemplo n.º 9
0
def get_output_path(output_dir, input_name, idx, x):
    """Build the output path of a clip cut from *input_name*.

    The name encodes the clip index and its start/end times, with '.'
    replaced by 'p' so the times are filesystem-safe.
    """
    stem = get_main(input_name)
    ext = get_ext(input_name)
    safe = lambda t: str(t).replace('.', 'p')
    output_name = '{}_{}_{}_to_{}{}'.format(
        stem, idx, safe(x['start_time']), safe(x['end_time']), ext)
    return path.join(output_dir, output_name)
Ejemplo n.º 10
0
    def get(self, url):
        """Return the direct media URL of a post, preferring the original file.

        Sets self.filename from the post id as a side effect.
        """
        soup = read_soup(url, self._cw)
        original = soup.find('li', id='post-option-view-original')
        if original:
            img = original.find('a')['href']
        else:
            img = soup.find('li', id='post-info-size').find('a')['href']

        # A .zip link is an ugoira-style archive; use the page's video instead. #4635
        if get_ext(img) == '.zip':
            img = soup.find('section', id='content').find('video')['src']

        img = urljoin(url, img)
        self.filename = '{}{}'.format(self.id, get_ext(img))
        return img
Ejemplo n.º 11
0
    def get(self, url):
        """Resolve and cache the video URL via ytdl.

        Side effects: sets self.title / self.filename, and downloads the
        thumbnail into self.thumb.
        """
        print_ = get_print(self.cw)
        if self._url:
            return self._url
        ydl = ytdl.YoutubeDL(cw=self.cw)
        try:
            info = ydl.extract_info(url)
        except Exception as e:
            # Debug fallback: some extractors expose _download_info for VODs;
            # dump its result for diagnosis, then re-raise the original error.
            ex = type(ytdl.get_extractor(url))(ydl)
            _download_info = getattr(ex, '_download_info', None)
            if _download_info is not None:
                vod_id = ex._match_id(url)
                info = _download_info(vod_id)
                print_(info)
            raise
        # Formats are ordered worst-to-best; the last entry is the best.
        video_best = info['formats'][-1]
        video = video_best['url']

        ext = get_ext(video)
        self.title = info['title']
        id = info['display_id']

        if ext.lower() == '.m3u8':
            # HLS source: wrap in a threaded segment downloader, save as mp4.
            video = M3u8_stream(video, n_thread=4, alter=alter)
            ext = '.mp4'
        self.filename = format_filename(self.title, id, ext)
        self.url_thumb = info['thumbnail']
        self.thumb = BytesIO()
        downloader.download(self.url_thumb, buffer=self.thumb)
        self._url = video
        return self._url
Ejemplo n.º 12
0
def get_ext_(url, session, referer):
    """Determine a file extension, preferring a network probe.

    Falls back to parsing the URL itself if the probe fails.
    """
    try:
        return downloader.get_ext(url, session, referer)
    except Exception as e:
        print(e)
        return get_ext(url)
Ejemplo n.º 13
0
 def __init__(self, url, p, page, cw):
     """Prepare a lazily-resolved image belonging to *page*.

     The filename is the zero-padded page index, nested under the page
     title when one is available.
     """
     self.cw = cw
     name = '{:04}{}'.format(p, get_ext(url))
     if page.title is not None:
         name = '{}/{}'.format(page.title, name)
     self.filename = name
     self._url = url
     self.url = LazyUrl(page.url, self.get, self)
 def __init__(self, src, referer, title, session):
     """Wrap a media source, transparently converting HLS playlists.

     An .m3u8 source is replaced by an M3u8_stream downloader and saved
     with an .mp4 extension.
     """
     ext = get_ext(src)
     if ext == '.m3u8':
         src = M3u8_stream(src, referer=referer, session=session)
         ext = '.mp4'
     self.url = LazyUrl(referer, lambda _: src, self)
     self.filename = '{}{}'.format(clean_title(title), ext)
Ejemplo n.º 15
0
 def __init__(self, url, page, p):
     """Build the image filename under the page title.

     Unrecognized extensions are normalized to .jpg.
     """
     ext = get_ext(url)
     known = ('jpg', 'jpeg', 'bmp', 'png', 'gif', 'webm', 'webp')
     if ext.lower()[1:] not in known:
         ext = '.jpg'
     self.filename = '{}/{:04}{}'.format(page.title, p, ext)
     self.url = LazyUrl(page.url, lambda _: url, self)
Ejemplo n.º 16
0
 def guardar(self, nombre):
     """Save the current editor text as a template named *nombre*.

     Spaces in the name become underscores and '$' is escaped before
     writing the file into TEMPLATES_PATH.
     """
     if not nombre:
         return
     ext = utils.get_fileext()
     if not ext:
         ext = utils.get_ext()
     nombre = nombre.replace(" ", "_")
     text = utils.get_text().replace("$", "\$")
     utils.file_write(TEMPLATES_PATH + nombre + "." + ext, text)
 def __init__(self, id, url, p, lazy=True, img=None):
     """Hold one image of a post.

     When *lazy* is true the URL is resolved on demand via get_single;
     otherwise *img* is used directly and the filename is set immediately.
     """
     self.id = id
     self.p = p
     if lazy:
         self.url = LazyUrl(url, self.get_single, self)
         return
     self.url = LazyUrl(url, lambda _: img, self)
     self.filename = '{}_p{}{}'.format(id, p, get_ext(img))
Ejemplo n.º 18
0
    def get(self, url):
        """Resolve the best allowed video URL via ytdl and cache it.

        Chooses the best format whose height fits the configured
        resolution cap. Side effects: sets self.title / self.filename
        and downloads the thumbnail into self.thumb.
        """
        print_ = get_print(self.cw)
        if self._url:
            return self._url
        ydl = ytdl.YoutubeDL(cw=self.cw)
        try:
            info = ydl.extract_info(url)
        except Exception as e:
            # Debug fallback: dump extractor-internal VOD info if available,
            # then re-raise (mapped to LoginRequired on HTTP 403).
            ex = type(ytdl.get_extractor(url))(ydl)
            _download_info = getattr(ex, '_download_info', None)
            if _download_info is not None:
                vod_id = ex._match_id(url)
                info = _download_info(vod_id)
                print_(info)
            if 'HTTPError 403' in str(e):
                raise errors.LoginRequired()
            raise

        def print_video(video):
            # Log one candidate: format id, height, bitrate, url.
            print_('[{}] [{}] [{}] {}'.format(video['format_id'],
                                              video.get('height'),
                                              video.get('tbr'), video['url']))

        videos = [video for video in info['formats'] if video.get('height')]

        # Best first: sort by (height, bitrate) descending.
        videos = sorted(videos,
                        key=lambda video:
                        (video.get('height', 0), video.get('tbr', 0)),
                        reverse=True)

        for video in videos:
            print_video(video)

        # Pick the best format not exceeding the resolution cap; if every
        # format exceeds it, fall back to the smallest (last after sort). #3723
        for video in videos:
            if video.get('height', 0) <= get_resolution():
                video_best = video
                break
        else:
            video_best = videos[-1]
        # NOTE: `video` equals video_best here in both branches above.
        print_video(video)

        video = video_best['url']

        ext = get_ext(video)
        self.title = info['title']
        id = info['display_id']

        if ext.lower() == '.m3u8':
            # HLS source: wrap in a threaded segment downloader, save as mp4.
            video = M3u8_stream(video, n_thread=4, alter=alter)
            ext = '.mp4'
        self.filename = format_filename(self.title, id, ext)
        self.url_thumb = info['thumbnail']
        self.thumb = BytesIO()
        downloader.download(self.url_thumb, buffer=self.thumb)
        self._url = video
        return self._url
Ejemplo n.º 19
0
 def get(self, referer):
     """Fetch the Nozomi post JSON and return the image URL.

     The post JSON path is sharded by the trailing digits of the id
     (see https://j.nozomi.la/nozomi.js).
     """
     s_id = str(self._id)
     url_post = 'https://j.nozomi.la/post/{}/{}/{}.json'.format(
         s_id[-1], s_id[-3:-1], self._id)
     info = downloader.read_json(url_post, referer)
     url = urljoin(referer, info['imageurl'])
     self.filename = '{}{}'.format(self._id, get_ext(url))
     return url
Ejemplo n.º 20
0
def get_rep_and_ext(path, model_choice):
    """Build the base report for *path* and attach its file extension.

    When the extension cannot be determined, the report is flagged with
    an error instead of failing.
    """
    rep = rep_base(path, model_choice)
    ext = get_ext(path)
    if ext is None:
        rep.update(error=True,
                   error_reason="Cannot determine extension",
                   ext='')
    else:
        rep['ext'] = ext
    return rep
Ejemplo n.º 21
0
def _wrapper(file_path, verbose):
    """Dispatch parsing by file extension; only JSON and NDJSON are supported."""
    parsers = {'.json': parse_json, '.ndjson': parse_ndjson}
    parser = parsers.get(get_ext(file_path))
    if parser is None:
        raise AssertionError('File must be in JSON format and '
                             'could not be parsed.')
    return parser(file_path, verbose)
Ejemplo n.º 22
0
def fetch_hubble_image(image_id, dir_path):
    """Download the best-quality file of a Hubble image into *dir_path*.

    The API lists files in ascending quality; the last one is used.
    """
    os.makedirs(dir_path, exist_ok=True)

    response = requests.request('GET', 'http://hubblesite.org/api/v3/image/{}'.format(image_id))

    file_url = response.json()['image_files'][-1]['file_url']

    file_name = 'hubble_img_{}{}'.format(image_id, get_ext(file_url))
    download_file(file_url, os.path.join(dir_path, file_name))
def upload_photo_to_instagram(img_dir, username, password):
    """Log in to Instagram and upload every image file found in *img_dir*.

    Only files whose (lower-cased) extension is in IMG_EXT_SET are uploaded.
    """
    files = get_files_path_from_dir(img_dir)

    bot = Bot()
    bot.login(username, password)

    for img in files:
        if get_ext(img).lower() in IMG_EXT_SET:
            bot.upload_photo(img)
Ejemplo n.º 24
0
 async def load_audio(self):
     """Trim and cache the audio clip for the currently selected result.

     Cuts [start, end] out of the result's source video with
     ffmpeg_helper.trim and stores the clip (matroska container) in
     self.audio, keyed by the listbox index. Already-loaded entries are
     left untouched.
     """
     idx = self.results_box.index(tk.ACTIVE)
     if not self.audio[idx]:
         r = self.results[idx]
         start, end, video = r.start, r.end, r.video
         self.audio[idx], _ = await ffmpeg_helper.trim(
             start,
             end,
             file_input=video,
             format=get_ext(video),
             format_out='matroska')
         print('loaded ' + video)
 def get(self, _):
     """Resolve the media URL; correct mislabeled .gif extensions via a probe.

     Sets self.filename as a side effect and returns the URL unchanged.
     """
     print_ = get_print(self.cw)
     url = self._url
     extension = get_ext(url)
     if extension.lower() == '.gif':
         # Some .gif links lie about their type; ask the server. #3235
         print_('get_ext: {}, {}'.format(self.id_, url))
         try:
             extension = downloader.get_ext(url)
         except Exception as e:
             print_('Err: {}, {}\n'.format(self.id_, url) + print_error(e)[0])
     self.filename = '{}_p{}{}'.format(self.id_, self.p, extension)
     return url
def extract(event_handler, ext=None, disable_ocr=True):
    """Extract text from the object behind *event_handler*.

    Returns ('ok', text) on success, or ('ocr', None) when the document
    should be handled by the OCR pipeline instead.
    Raises ValueError for unsupported extensions.
    """
    import os

    logger.info("\n\n\nStarting Document parse\n" + event_handler.key +
                "\n\n\n")

    # NOTE: AWS Lambda auto-retries errors up to 3x; see
    # https://stackoverflow.com/questions/32064038 for a dedup approach.

    if ext is None:
        ext = get_ext(event_handler.key)
    logger.info("file ext " + ext)

    extract_func = PARSE_FUNCS.get(ext)
    if extract_func is None:
        raise ValueError('<{}> has unsupported extension "{}".'.format(
            event_handler.key, ext))

    # A False entry in PARSE_FUNCS means "only OCR can handle this type".
    if extract_func is False:
        logger.info('Fallback to OCR for <{}>.'.format(event_handler.key))
        return ('ocr', None)

    # Materialize the object to a temp file; delete=False so the parser can
    # reopen the path by name after the handle is closed.
    with NamedTemporaryFile(mode='wb', suffix=ext, delete=False) as f:
        document_path = f.name
        f.write(event_handler.get_content().read())
        logger.info('Downloaded <{}> to <{}>.'.format(
            event_handler.key, document_path))

    try:
        text = extract_func(document_path, event_handler)
    finally:
        os.remove(document_path)  # the original leaked this temp file

    # A near-empty PDF is likely a scanned image -> hand off to OCR.
    if extract_func is pdf_to_text and len(text) < 512 and not disable_ocr:
        return ('ocr', None)

    if len(text) == 0:
        logger.warning('<{}> does not contain any content.'.format(
            event_handler.key))
    return ('ok', text)
 def get(self, url):
     """Locate the 'Original image' link on the page and return its URL.

     Sets self.filename from self.id_ as a side effect; raises when the
     page has no original-image entry.
     """
     soup = Soup(downloader.read_html(url))
     for li in soup.findAll('li'):
         if li.text.strip() == 'Original image':
             break
     else:
         raise Exception('no Original image')
     url = li.find('a')['href']
     self.filename = u'{}{}'.format(self.id_, get_ext(url))
     return url
Ejemplo n.º 28
0
def fetch_spacex_last_launch(dir_path):
    """Download every Flickr photo from the latest SpaceX launch into *dir_path*."""
    os.makedirs(dir_path, exist_ok=True)

    response = requests.request('GET', 'https://api.spacexdata.com/v3/launches/latest')

    for num, url in enumerate(response.json()['links']['flickr_images']):
        file_name = 'spacex_{}{}'.format(num, get_ext(url))
        download_file(url, os.path.join(dir_path, file_name))
Ejemplo n.º 29
0
    async def post(self):
        """Handle a multipart file upload for a user.

        Creates the user row if it does not exist, streams each uploaded
        part to UPLOAD_FOLDER under a UUID-based name, inserts one `file`
        row per upload, and returns the created rows as JSON.
        """
        user_id = id_validator(self.request.match_info['user_id'], 'User')

        # Reject non-multipart or empty bodies early with an empty list.
        if self.request.content_type != 'multipart/form-data' or self.request.content_length == 0:
            return web.json_response(data=[])

        user_table = get_model_by_name('user')
        file_table = get_model_by_name('file')
        user_exists = await self.request.app['pg'].fetchval(
            select([exists().where(user_table.c.user_id == user_id)]))

        if not user_exists:
            await self.request.app['pg'].fetchrow(
                user_table.insert().values(**{'user_id': user_id}))

        reader = await self.request.multipart()
        upload_folder = self.request.app['config']['UPLOAD_FOLDER']
        data = []
        while not reader.at_eof():
            image = await reader.next()

            if not image:
                break

            # NOTE(review): this get_ext appears to return (name, ext) —
            # unlike the single-value get_ext used elsewhere; confirm helper.
            file_name, ext = get_ext(image.filename)
            generated_file_name = '{}.{}'.format(uuid.uuid4(), ext)
            full_path = os.path.abspath(
                os.path.join(upload_folder, generated_file_name))
            size = 0

            # Stream the part to disk chunk by chunk, counting bytes.
            with open(full_path, 'wb') as f:
                while True:
                    chunk = await image.read_chunk()
                    if not chunk:
                        break
                    size += len(chunk)
                    f.write(chunk)

            body = {
                'user_id': user_id,
                'name': image.filename,
                'path': full_path,
                'size': size
            }

            file = await self.request.app['pg'].fetchrow(
                file_table.insert().values(**body).returning(
                    literal_column('*')))
            file = row_to_dict(file, 'file')
            data.append(dict(file))

        return web.json_response(data=data)
Ejemplo n.º 30
0
    def get(self, url):
        """Resolve and cache the video URL, filling in title and filename."""
        if self._url:
            return self._url  # already resolved
        self.info = get_info(url)

        self.title = self.info['title']
        video_id = self.info['id']

        # Formats are ordered worst-to-best; take the last (best) one.
        best = self.info['formats'][-1]
        self._url = best['url']
        self.filename = format_filename(self.title, video_id, get_ext(self._url))
        return self._url
 def get(self, _):
     """Return the media URL, probing the server when the extension looks wrong.

     Sets self.filename as a side effect.
     """
     print_ = get_print(self.cw)
     url = self._url
     ext = get_ext(url)
     if ext.lower()[1:] not in ('jpg', 'png', 'mp4'):  #4645
         # Unexpected extension: ask the server for the real type. #3235
         print_('get_ext: {}, {}'.format(self.id_, url))
         try:
             ext = downloader.get_ext(url, referer=_)
         except Exception as e:
             print_('Err: {}, {}\n'.format(self.id_, url) + print_error(e)[0])
     self.filename = '{}_p{}{}'.format(self.id_, self.p, ext)
     return url
Ejemplo n.º 32
0
 def get(self, referer):
     """Build self.filename from the user's name format and return the URL.

     Placeholders (id/page/artist/title) are first rewritten to ###...*
     tokens and only then substituted, so a substituted value that itself
     contains a placeholder word (e.g. an artist name containing "title")
     cannot be corrupted by a later replace.
     """
     ext = get_ext(self._url)
     name = self.format_.replace('id', '###id*').replace(
         'page',
         '###page*').replace('artist',
                             '###artist*').replace('title', '###title*')
     name = name.replace('###id*', str(self.id_)).replace(
         '###page*',
         str(self.p)).replace('###artist*',
                              self.artist).replace('###title*', self.title)
     # Trim so that name + ext stays within the length limit.
     self.filename = clean_title(name.strip(), allow_dot=True,
                                 n=-len(ext)) + ext
     return self._url
Ejemplo n.º 33
0
 def get_ext(self, complete=True):
     """Return the file extension of this item's "uri" value.

     Delegates to utils.get_ext; *complete* is passed through
     (presumably toggles keeping the leading dot — confirm in utils).
     """
     return utils.get_ext(self.get("uri"), complete)
Ejemplo n.º 34
0
 def Download_COGCC_Data(self, seqNum, fClass, outputDir):
     """Scrape and download well documents from the COGCC website (Python 2).

     Pages through the result grid for well sequence number *seqNum*,
     filters rows by *fClass* ('all', 'whfs', or 'logs'), and saves each
     matching document into *outputDir*. Returns True on success, False
     on a connection error.

     NOTE(review): the grid is assumed to expose 5 anchors per row, with
     the class text at offset 1 and the file link at offset 3 — confirm
     against the live page layout.
     """
     dlFile = False
     # search list for logs to download
     log_list = ['mud', 'core', 'cores']
     # List variable to count the number of pages on the current COGCC web page
     pageLinks = ['1']
     # Counter - Some of the COGCC files have the same name. This variable allows us to create an 'index' number for every file
     fileCount = 1

     # Download COGCC files
     try:
         # Create a mechanize browser
         br = mechanize.Browser()
         # Include some headers that are recognized by the Microsoft server software
         br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
         # Get Site response
         response = br.open("%s%s" % ("http://ogccweblink.state.co.us/results.aspx?id=", seqNum))
         # Read site page html
         html = response.read()
         # Send html data to BeautifulSoup for collection
         soup = BeautifulSoup(html)

         # Cycle through the anchor elements. Identify javascript links so we can count the number of pages that we need to cycl through
         for a1 in soup.find_all('a'):
             if "javascript:__doPostBack('WQResultGridView','Page$" in (a1.get('href')):
                 # Append java script text to the pageLinks list
                 pageLinks.append(a1.get_text())

         # Cycle through the index numbers starting from 0 up to the number of pages
         for ix in range(1, (len(pageLinks)+1)):
             # Collect all anchor elements
             anchors = soup.find_all('a')
             # cycle from 1 up to the number of anchor elements
             for jx in range(0, len(anchors)):
                 # Determine if the current anchor is a javascript header link
                 if "javascript:__doPostBack" in (anchors[jx].get("href")):
                     # Determine if the current anchor is a javascript page link
                     if "javascript:__doPostBack('WQResultGridView','Page$" in (anchors[jx].get("href")):
                         # Determine if the current anchor is a javascript page link that we want to click on
                         if "javascript:__doPostBack('WQResultGridView','Page$%s" % (ix + 1) in (anchors[jx].get("href")):
                             # Select the javascript form
                             br.select_form(nr=0)
                             # Set the form to modifiable
                             br.set_all_readonly(False)
                             # Pass over necessary arguments to the javascript: __doPostBack function
                             br["__EVENTTARGET"] = 'WQResultGridView'
                             br["__EVENTARGUMENT"] = 'Page$%s' % (ix + 1)
                             # update the response variable with the site response from br.submit
                             response = br.submit()
                             # Update html data
                             html = response.read()
                             # updates the soup objects
                             soup = BeautifulSoup(html)
                     else:
                         pass
                 # We need to document the current class value of the current well. This well tell us if we should download the well based on the provided parameters
                 elif (jx-1) % 5 == 0:
                         dlFile = False
                         if fClass == "all":
                             dlFile = True
                             if anchors[jx].get_text() == "Wells" or anchors[jx].get_text() == "Facilities" or anchors[jx].get_text() == "Operator":
                                 self.__current_file = 'whf'
                             else:
                                 self.__current_file = 'log'
                         elif fClass == "whfs" and (anchors[jx].get_text() == "Wells" or anchors[jx].get_text() == "Facilities" or anchors[jx].get_text() == "Operator"):
                             dlFile = True
                             self.__current_file = 'whf'
                         # This method is really designed for the Mud log and core search search, not really logs specifically. Need to reinsert the following comment into the fClass == conditional statement
                         #"""(anchors[jx].get_text() == "Wells" or anchors[jx].get_text() == "Well Logs" or anchors[jx].get_text() == "Projects") and"""
                         elif fClass == "logs" and utils.find_substring(anchors[jx+2].get_text(), log_list): 
                             dlFile = True
                             self.__current_file = 'log'
                         else:
                             dlFile = False
                             self.__current_file = ''

                 # We want to download from every 4th link. The 4th link contains the name of the file. So download the file if the current (index-3) % 5 == 0
                 elif (jx-3) % 5 == 0 and dlFile == True:
                         # set the file name
                         fileName = "05" + seqNum + "0000_" + str(fileCount) + "_" + anchors[jx].get_text()
                         # Increment file count
                         fileCount = fileCount + 1
                         # Remove shit characters
                         fileName = re.sub('[\\\/:*?\'\"<>\|\\r\\n]','',fileName)
                         fileName = re.sub(' ','_',fileName)
                         # Set the url using the current anchor element href
                         url = "%s" % ("http://ogccweblink.state.co.us/" + anchors[jx].get("href"))
                         # Get it!!!!
                         r = requests.get(url)
                         # Set the output file path, need to extend the extDict to include other mime types, (i.e. xml, word docs...)
                         # Use try block because we might encounter unknown media type
                         filePath = os.path.join(outputDir, fileName)
                         if r.status_code == 200:
                             with open(filePath, "wb") as image:
                                 image.write(r.content)
                         # retrieve the file extension from the downloaded file
                         ext = utils.get_ext(filePath)
                         # set the extension on the current download
                         try:
                             if ext == 'pdf' or self.__current_file == 'log':
                                 utils.set_file_ext(filePath, ext)
                             else:
                                 pass
                                 # removing the tiff2pdf function. Most colorado files seem to pdf files now. (Except logs)
                                 #utils.tiff2pdf(filePath)
                         except:
                             pass
                 else:
                     pass
         # Clear out all variables
         br = None
         soup = None
         html = None
         response = None
         fileName = None
         filePath = None
         url = None
         r = None
         return True

     except requests.ConnectionError, e:
         # Clear out all variables
         br = None
         soup = None
         html = None
         response = None
         fileName = None
         filePath = None
         url = None
         r = None
         return False