Example #1
 def _read_info(self, field):
     if field == 'md5partial':
         dec = mp4.File(str(self.path))
         self._md5partial_offset = dec.audio_offset
         self._md5partial_size = dec.audio_size
         dec.close()
     super(Mp4File, self)._read_info(field)
     if field in TAG_FIELDS:
         dec = mp4.File(str(self.path))
         self.audiosize = dec.audio_size
         self.bitrate = dec.bitrate
         self.duration = dec.duration
         self.samplerate = dec.sample_rate
         self.artist = dec.artist
         self.album = dec.album
         self.title = dec.title
         self.genre = dec.genre
         self.comment = dec.comment
         self.year = dec.year
         self.track = dec.track
         dec.close()
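For reference, a minimal standalone sketch of the same calls outside the Mp4File class above, using a hypothetical local path and only the mp4.File attributes that already appear in this example:

from hsaudiotag import mp4

dec = mp4.File('/tmp/example.m4a')                  # hypothetical path
print dec.duration, dec.bitrate, dec.sample_rate    # technical info
print dec.artist, "-", dec.title                    # tag fields
dec.close()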
Example #2
import requests
from io import BytesIO
from hsaudiotag import mp4

def fetch_info(url):
  """Get the bitrate and the duration from the mp3 header.
  This will only work with tagless mp3 files."""
  # A ranged request for roughly the first 512 bytes is enough to read the header metadata:
  page = requests.get(url, headers={"Range": "bytes=0-512"})

  if page.status_code not in [ 200, 206 ]: # 206 = partial content
    raise requests.exceptions.HTTPError("http code " + str(page.status_code) + ".")

  with BytesIO(page.content) as bs:
    mp3 = mp4.File(bs)
    
    # types.Obj is assumed to be a small attribute container defined elsewhere in this project
    return types.Obj(
      duration = mp3.duration,
      bitrate  = mp3.bitrate
    )
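A hedged usage sketch for fetch_info, assuming a reachable tagless MP3 URL (hypothetical) and that types.Obj exposes the two fields as attributes:

info = fetch_info("http://example.com/podcast.mp3")   # hypothetical URL
print info.duration, info.bitrate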
Example #3
 def get_duration(self, url, buff=500, explain=False):
     """Return the clip duration in seconds, or -1 if the request fails."""
     if explain:
         print "in seconds"
     from StringIO import StringIO
     from hsaudiotag import mp4
     import requests
     r = requests.get(url, stream=True)
     if r.status_code == 200:
         a = r.raw.read(buff)  # read only the first `buff` bytes of the stream
         b = StringIO()
         b.write(a)
         c = mp4.File(b)
         duration = c.duration
         b.close()
         r.close()
         return duration
     else:
         return -1
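A usage sketch (the owning object and URL are hypothetical); note that the other examples read 2000 bytes, since a 500-byte buffer can stop short of the MP4 metadata:

seconds = scraper.get_duration("http://example.com/clip.mp4", buff=2000)
if seconds < 0:
    print "request failed"
else:
    print seconds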
Example #4
    def analyze(self):
        pb = self.pb
        page = pb.get_page_source()
        #page_compress = pb.get_page_source(remove_js = True) # no javascript
        # preconditions
        if page is None or len(page) == 0:
            self.error = "no page"
            return False
        #self.page_compressed = zlib.compress(repr(page_compress))
        soup = BS(page, 'html.parser')

        # precondition
        if soup is None:
            self.error = "no parser"
            return False
        # check 404
        if soup.title.text.find("looking for doesn't exist (404)") > 0:
            self.error = "404"
            return False
        #main page
        if not self.quietly:
            sys.stdout.write(".[%03d:analyze]." % self.my_id)
            sys.stdout.flush()
        #analyze main page ===========
        # number of comments
        frame = soup.select("#comments_count")
        if len(frame) > 0:
            self.comments_count = int(frame[0]['data-comments-count'])
        else:
            self.comments_count = -1  #means error
        # number of updates
        frame = soup.select("#updates_count")
        if len(frame) > 0:
            self.updates_count = int(frame[0]['data-updates-count'])
        else:
            self.updates_count = -1  #means error
        # reward
        frame = soup.select(".NS-projects-reward")
        proj_reward_append = self.projects_reward_result.append
        if len(frame) > 0:
            #
            for card in frame:
                #money
                money_f = card.select(".money")
                if len(money_f) > 0:
                    money = filter_number(money_f[0].text.strip())
                else:
                    money = 0.0
                #backers
                backers_f = card.select(".num-backers")
                if len(backers_f) > 0:
                    num_backers = filter_number(backers_f[0].text.strip())
                else:
                    num_backers = 0.0
                #description
                desc_f = card.select(".desc")
                if len(desc_f) > 0:
                    description = desc_f[0].text.strip()
                else:
                    description = ""
                #delivery
                delivery_f = card.select("time")
                if len(delivery_f) > 0:
                    delivery_estimated = delivery_f[0]['datetime']
                else:
                    delivery_estimated = ""
                #limited
                limited_f = card.select(".limited-number")
                if len(limited_f) > 0:
                    limited_num = int(
                        re.findall(r"of ([0-9]+)", limited_f[0].text)[0])
                else:
                    limited_num = 0
                proj_reward_append([
                    money,
                    num_backers,
                    description,
                    delivery_estimated,
                    limited_num,
                ])
            #for
        # collect images
        frame = soup.select("img.fit")
        image_fnames_append = self.image_fnames.append
        images_append = self.images.append
        if len(frame) > 0:
            for imgf in frame:
                src = imgf['src']
                src = re.sub(r"\?.*$", "", src)
                image_fnames_append(src)
                images_append("")  #basically nothing is apppended
            if self.has_image:
                inque = list_to_queue(self.image_fnames)
                outque = Queue.Queue()
                tasks = []
                # parallel processing
                for i in range(inque.qsize()):
                    imageD = ImageDownloader(inque, outque, self.quietly, True)
                    imageD.setDaemon(True)
                    tasks.append(imageD)
                    imageD.start()
                inque.join()  # wait until all downloads have finished
                for task in tasks:
                    task.stop()
                outlist = queue_to_list(outque)
                self.images = copy.deepcopy(outlist)
                # replace the filename list with the downloaded binary data
        # video (source file name, length)s
        frame = soup.select("video.has_webm")
        self.video_fname = "na"
        v_url = ""
        if len(frame) > 0:
            sources = frame[0].select("source")
            if len(sources) > 0:
                self.video_has = True
                for source in sources:
                    v_url = source['src']
                    if v_url.endswith(".mp4"):
                        if v_url.find('high') > 0:
                            self.video_has_high = 1
                            self.video_fname = v_url
                        else:
                            self.video_has_base = 1
                            self.video_fname = v_url  # if a base-quality file exists, the filename is replaced by it
                if self.video_has_high > 0 or self.video_has_base > 0:
                    url = self.video_fname
                    # video duration
                    try:
                        with closing(
                                requests.get(url,
                                             stream=True,
                                             verify=False,
                                             timeout=600)) as r:
                            a = r.raw.read(2000)  #2kb buffer
                            b = StringIO()
                            b.write(a)
                            c = mp4.File(b)
                            self.video_length = c.duration
                            b.close()
                    except:
                        self.video_length = -1  # if requests got an error
        # collect full description
        frame = soup.select(".full-description")
        rv = ""
        if len(frame) > 0:
            desc = frame[0].text
            rv = re.sub(r"\n\n\n+", "\n", desc)
            try:
                rv = rv.strip()
            except:
                rv = "error"
        self.full_description = rv
        # collect risk
        frame = soup.select("#risks")
        rv = ""
        if len(frame) > 0:
            desc = frame[0].text
            rv = re.sub(r"\n\n\n+", "\n", desc)
            try:
                rv = rv.strip()
            except:
                rv = "error"
        self.risks = rv
        # Facebook
        try:
            frame = soup.select("li.facebook.mr2 .count")
            if len(frame) > 0:
                fb_cnt = frame[0].text.strip()
            self.facebook_like = int(fb_cnt.replace(",", ""))
            if not self.quietly:
                sys.stdout.write(".[%03d:fb]." % self.my_id)
                sys.stdout.flush()
        except:
            if not self.quietly:
                sys.stdout.write(".[%03d:fb!]." % self.my_id)
                sys.stdout.flush()
            self.facebook_like = -1
        #backers =====================
        try:
            backer_url = self.url + "/backers"
            param_backer = {}
            backers_append = self.backers.append
            bc_text = ""
            backer_first_trial = SS.BACKER_CONNECTION_RECOVER
            while 1:
                request_header = {
                    'User-Agent':
                    'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11',
                    'connection': 'close',
                    'charset': 'utf-8'
                }
                backer_first_timeout_trial = SS.BACKER_CONNECTION_RECOVER
                while 1:
                    try:
                        c_backer = requests.get(backer_url,
                                                headers=request_header,
                                                timeout=600)
                        break
                    except requests.Timeout:
                        backer_first_timeout_trial -= 1
                        if backer_first_timeout_trial < 0:
                            self.error = "backer_first_timeout"
                            return False
                        continue
                if c_backer.status_code == 200:
                    bc_text = c_backer.text
                    c_backer.close()
                    break
                elif c_backer.status_code == 429:
                    c_backer.close()
                    time.sleep(SS.PROJECT_PAGE_SERVER_THREAD_POOL)  # sleep (about five seconds) before retrying
                    continue
                else:
                    if backer_first_trial < 0:
                        self.error = "backer_first"
                        return False
                    else:
                        backer_first_trial -= 1
                        time.sleep(SS.PROJECT_PAGE_SERVER_THREAD_POOL)
                        continue
            soup_backer = BS(bc_text, 'html.parser')
            ele_backers = soup_backer.select("div.NS_backers__backing_row")
            self.get_backer_data(ele_backers)
            cursors = [ele['data-cursor'] for ele in ele_backers]
            #time.sleep(0.5)
            while len(cursors) > 0:
                backer_trial = SS.BACKER_CONNECTION_RECOVER
                bc_text = ""
                param_backer['cursor'] = cursors[-1]
                while 1:
                    try:
                        c_backer = requests.get(backer_url,
                                                headers=request_header,
                                                params=param_backer,
                                                timeout=600)
                    except requests.Timeout:
                        backer_trial -= 1
                        if backer_trial < 0:
                            self.error = "backer_timeout"
                            return False
                        continue
                    if c_backer.status_code == 200:
                        bc_text = c_backer.text
                        c_backer.close()
                        break
                    elif c_backer.status_code == 429:
                        c_backer.close()
                        sys.stdout.write(".?.")
                        sys.stdout.flush()
                        time.sleep(SS.PROJECT_PAGE_SERVER_THREAD_POOL)
                        continue
                soup_backer = BS(bc_text, 'html.parser')
                ele_backers = soup_backer.select("div.NS_backers__backing_row")
                self.get_backer_data(ele_backers)
                cursors = [ele['data-cursor'] for ele in ele_backers]
                time.sleep(1)  # politeness
        except:
            self.error = "backer"
            return False
        return True
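The HTTP 429 handling above can be isolated into a small helper. This is a minimal sketch, assuming SS.PROJECT_PAGE_SERVER_THREAD_POOL is simply a sleep interval in seconds and that only status 200 counts as success (get_with_retry and its parameters are hypothetical):

import time
import requests

def get_with_retry(url, headers=None, params=None, retries=3, backoff=5):
    """Fetch url, retrying on timeouts and HTTP 429; return the body text or None."""
    for attempt in range(retries + 1):
        try:
            r = requests.get(url, headers=headers, params=params, timeout=600)
        except requests.Timeout:
            continue                      # retry on timeout, like the inner while-loop above
        if r.status_code == 200:
            text = r.text
            r.close()
            return text
        r.close()
        if r.status_code == 429:
            time.sleep(backoff)           # rate limited: back off before the next attempt
    return None                           # caller treats None as a failed fetch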
Example #5
# test: probe a video stream's duration from a partial download
url = "https://d2pq0u4uni88oo.cloudfront.net/projects/824027/video-390569-h264_base.mp4"
from hsaudiotag import mp4  # $ pip install hsaudiotag
import requests
from StringIO import StringIO
from contextlib import closing
with closing(requests.get(url, stream=True, verify=False)) as r:
    a = r.raw.read(2000)  #2kb buffer
    b = StringIO()
    b.write(a)
    c = mp4.File(b)
    print c.duration
    b.close()
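A minimal Python 3 variant of the same probe, assuming the Python 3 port of hsaudiotag (hsaudiotag3k) keeps the same import path and that the first 2 KB reach the MP4 metadata:

from io import BytesIO
from contextlib import closing
from hsaudiotag import mp4  # $ pip install hsaudiotag3k (assumed to keep this import path)
import requests

with closing(requests.get(url, stream=True, verify=False)) as r:  # url as defined in the example above
    buf = BytesIO(r.raw.read(2000))   # keep only the first 2 KB
    print(mp4.File(buf).duration)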
Example #6
 def analyze(self):
     pb = self.pb
     page = pb.get_page_source()
     # preconditions
     assert page is not None and len(page) > 0, "no page"
     self.page_compressed = zlib.compress(page.encode('utf-8'))
     if not self.quietly:
         sys.stdout.write("Page data compressed: self.page_compressed..\n")
         sys.stdout.flush()
     soup = BS(page,'html.parser')
     # precondition
     assert soup is not None, "No Page!"
     #main page
     if not self.quietly:
         sys.stdout.write("..Main..")
         sys.stdout.flush()
     #analyze main page ===========
     # reward
     frame = soup.select(".NS-projects-reward")
     proj_reward_append = self.projects_reward_result.append
     if len(frame) > 0:
         #
         for card in frame:
             #money
             money_f = card.select(".money")
             if len(money_f) > 0:
                 money = filter_number(money_f[0].text.strip())
             else:
                 money = 0.0
             #backers
             backers_f = card.select(".num-backers")
             if len(backers_f) > 0:
                 num_backers = filter_number(backers_f[0].text.strip())
             else:
                 num_backers = 0.0
             #description
             desc_f = card.select(".desc")
             if len(desc_f) > 0:
                 description = desc_f[0].text.strip()
             else:
                 description = ""
             #delivery
             delivery_f = card.select("time")
             if len(delivery_f) > 0:
                 delivery_estimated = delivery_f[0]['datetime']
             else:
                 delivery_estimated = ""
             #limited
             limited_f = card.select(".limited-number")
             if len(limited_f) > 0:
                 limited_num = int(re.findall(r"of ([0-9]+)",limited_f[0].text)[0])
             else:
                 limited_num = 0
             proj_reward_append([
                 money,
                 num_backers,
                 description,
                 delivery_estimated,
                 limited_num,
             ])
         #for
     # collect images
     frame = soup.select("img.fit")
     image_fnames_append = self.image_fnames.append
     images_append = self.images.append
     if len(frame) > 0:
         for imgf in frame:
             src = imgf['src']
             src = re.sub(r"\?.*$","",src)
             image_fnames_append(src)
             images_append("") #basically nothing is apppended
         if self.has_image:
             inque = list_to_queue(self.image_fnames)
             outque = Queue.Queue()
             tasks = []
             # parallel processing
             for i in range(inque.qsize()):
                 imageD = ImageDownloader(inque,outque,self.quietly,True)
                 imageD.setDaemon(True)
                 tasks.append(imageD)
                 imageD.start()
             inque.join()  # wait until all downloads have finished
             for task in tasks:
                 task.stop()
             outlist = queue_to_list(outque)
             self.images = copy.deepcopy(outlist)
             # replace the filename list with the downloaded binary data
     # video (source file name, length)s
     frame = soup.select("video.has_webm")
     self.video_fname = "na"
     v_url = ""
     if len(frame) > 0:
         sources = frame[0].select("source")
         if len(sources) > 0:
             self.video_has = True
             for source in sources:
                 v_url = source['src']
                 if v_url.endswith(".mp4"):
                     if v_url.find('high') > 0:
                         self.video_has_high = 1
                         self.video_fname = v_url
                     else:
                         self.video_has_base = 1
                         self.video_fname = v_url  # if a base-quality file exists, the filename is replaced by it
             if self.video_has_high > 0 or self.video_has_base > 0:
                 url = self.video_fname
                 # video duration
                 r = requests.get(url,stream = True)
                 a = r.raw.read(2000) #2kb buffer
                 b = StringIO()
                 b.write(a)
                 c = mp4.File(b)
                 self.video_length = c.duration
                 b.close()
                 r.close()
     # collect full description
     frame = soup.select(".full-description")        
     rv = ""
     if len(frame) > 0:
         desc = frame[0].text
         rv = re.sub(r"\n\n\n+","\n",desc)
         try:
             rv = rv.strip()
         except:
             pass
     self.full_description = rv        
     # collect risk
     frame = soup.select("#risks")
     rv = ""
     if len(frame) > 0:
         desc = frame[0].text
         rv = re.sub(r"\n\n\n+","\n",desc)
         try:
             rv = rv.strip()
         except:
             pass
     self.risks = rv        
     # Facebook
     frame = soup.select("li.facebook.mr2 .count")
     waiting = 1
     while 1:
         if len(frame) > 0:
             try:
                 self.facebook_like = int(frame[0].text)  # error prone: the counter may not have rendered yet
                 break
             except:
                 if not self.quietly:
                     sys.stdout.write("[facebook waiting...%d]\n"%waiting)
                     sys.stdout.flush()
                 time.sleep(waiting)
                 waiting += 1
                 temp_soup_facebook = BS(pb.get_page_source(),'html.parser')
                 frame = temp_soup_facebook.select("li.facebook.mr2 .count")
         else:
             self.facebook_like = 0
             break
         if waiting >= 10:
             if not self.quietly:
                 sys.stdout.write(" [facebook error] ")
                 sys.stdout.flush()
             self.facebook_like = -1 #means, error
             break
     
     # ============================
     if not self.quietly:
         sys.stdout.write("OK\n")
         sys.stdout.flush()
     #backers =====================
     #btn = pb.css_selector_element("#backers_nav")
     soup = None
     if not self.quietly:
         sys.stdout.write("..Visiting backers data..")
         sys.stdout.flush()
     try:
         headers = { 'User-Agent' : 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11',
                  'connection' : 'close',
                  'charset' : 'utf-8'}
         params = {'format':'json'}            
         con = requests.get(self.url,headers=headers,params=params)
         j = con.json()
         board = j['running_board']
         s = BS(board,'html.parser')
         eles = s.select('a#backers_nav data')
         if len(eles) > 0:
             t = eles[0].text.replace(",","")
             n = int(t)
             pn = n/40 + 2
         else:
             n = 0
             pn = 0            
         self.pagination = pn
         
         backer_url = self.url+"/backers"
         pb.goto(backer_url)
         page = 1
         # measure current backers
         p = pb.get_page_source()
         s = BS(p,'html.parser')
         frame = s.select("div.NS_backers__backing_row .meta a")
         prevc = len(frame)
         # init
         nowc = 0
         # pagination
         while 1:
             pb.scroll_down()
             # measure current backers
             p = pb.get_page_source()
             s = BS(p,'html.parser')
             frame = s.select("div.NS_backers__backing_row .meta a")
             nowc = len(frame)
             if not self.quietly:
                 sys.stdout.write("b")
                 sys.stdout.flush()
             if nowc > prevc:
                 prevc = nowc
             else:
                 break
         self.get_backers(pb)
         #get backers
         p = pb.get_page_source()
         s = BS(p,'html.parser')
         frame = s.select("div.NS_backers__backing_row .meta a")
         backers_append = self.backers.append
         if len(frame) > 0:
             for backer in frame:
                 profile_url = "%s"%(backer['href'])
                 backer_name = backer.text
                 backers_append((profile_url,backer_name,))
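The scroll loop above ("keep scrolling while the number of matched backer rows grows") can be expressed as a small helper. A sketch, assuming pb exposes get_page_source() and scroll_down() as in the example and that BS is BeautifulSoup:

def scroll_until_stable(pb, selector="div.NS_backers__backing_row .meta a"):
    """Scroll down until the number of rows matching selector stops growing (sketch)."""
    prev = -1
    while 1:
        soup = BS(pb.get_page_source(), 'html.parser')
        now = len(soup.select(selector))
        if now <= prev:
            return now       # no new rows appeared, so the page is fully loaded
        prev = now
        pb.scroll_down()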
Example #7
    def analyze(self):
        pb = self.pb
        page = pb.get_page_source()
        #page_compress = pb.get_page_source(remove_js = True) # no javascript
        # preconditions
        if page is None or len(page) == 0:
            self.error = "no page"
            return False
        #self.page_compressed = zlib.compress(repr(page_compress))
        soup = BS(page, 'html.parser')

        # precondition
        if soup is None:
            self.error = "no parser"
            return False
        # check 404
        if soup.title.text.find("looking for doesn't exist (404)") > 0:
            self.error = "404"
            return False
        #main page
        if not self.quietly:
            sys.stdout.write(".[%03d:analyze]." % self.my_id)
            sys.stdout.flush()
        #analyze main page ===========
        # number of comments
        frame = soup.select("#comments_count")
        if len(frame) > 0:
            self.comments_count = int(frame[0]['data-comments-count'])
        else:
            self.comments_count = -1  #means error
        # reward
        frame = soup.select(".NS-projects-reward")
        proj_reward_append = self.projects_reward_result.append
        if len(frame) > 0:
            #
            for card in frame:
                #money
                money_f = card.select(".money")
                if len(money_f) > 0:
                    money = filter_number(money_f[0].text.strip())
                else:
                    money = 0.0
                #backers
                backers_f = card.select(".num-backers")
                if len(backers_f) > 0:
                    num_backers = filter_number(backers_f[0].text.strip())
                else:
                    num_backers = 0.0
                #description
                desc_f = card.select(".desc")
                if len(desc_f) > 0:
                    description = desc_f[0].text.strip()
                else:
                    description = ""
                #delivery
                delivery_f = card.select("time")
                if len(delivery_f) > 0:
                    delivery_estimated = delivery_f[0]['datetime']
                else:
                    delivery_estimated = ""
                #limited
                limited_f = card.select(".limited-number")
                if len(limited_f) > 0:
                    limited_num = int(
                        re.findall(r"of ([0-9]+)", limited_f[0].text)[0])
                else:
                    limited_num = 0
                proj_reward_append([
                    money,
                    num_backers,
                    description,
                    delivery_estimated,
                    limited_num,
                ])
            #for
        # collect images
        frame = soup.select("img.fit")
        image_fnames_append = self.image_fnames.append
        images_append = self.images.append
        if len(frame) > 0:
            for imgf in frame:
                src = imgf['src']
                src = re.sub(r"\?.*$", "", src)
                image_fnames_append(src)
                images_append("")  #basically nothing is apppended
            if self.has_image:
                inque = list_to_queue(self.image_fnames)
                outque = Queue.Queue()
                tasks = []
                # parallel processing
                for i in range(inque.qsize()):
                    imageD = ImageDownloader(inque, outque, self.quietly, True)
                    imageD.setDaemon(True)
                    tasks.append(imageD)
                    imageD.start()
                inque.join()  # wait until all downloads have finished
                for task in tasks:
                    task.stop()
                outlist = queue_to_list(outque)
                self.images = copy.deepcopy(outlist)
                # replace the filename list with the downloaded binary data
        # video (source file name, length)s
        frame = soup.select("video.has_webm")
        self.video_fname = "na"
        v_url = ""
        if len(frame) > 0:
            sources = frame[0].select("source")
            if len(sources) > 0:
                self.video_has = True
                for source in sources:
                    v_url = source['src']
                    if v_url.endswith(".mp4"):
                        if v_url.find('high') > 0:
                            self.video_has_high = 1
                            self.video_fname = v_url
                        else:
                            self.video_has_base = 1
                            self.video_fname = v_url  # if a base-quality file exists, the filename is replaced by it
                if self.video_has_high > 0 or self.video_has_base > 0:
                    url = self.video_fname
                    # video duration
                    with closing(requests.get(url, stream=True,
                                              verify=False)) as r:
                        a = r.raw.read(2000)  #2kb buffer
                        b = StringIO()
                        b.write(a)
                        c = mp4.File(b)
                        self.video_length = c.duration
                        b.close()
        # collect full description
        frame = soup.select(".full-description")
        rv = ""
        if len(frame) > 0:
            desc = frame[0].text
            rv = re.sub(r"\n\n\n+", "\n", desc)
            try:
                rv = rv.strip()
            except:
                rv = "error"
        self.full_description = rv
        # collect risk
        frame = soup.select("#risks")
        rv = ""
        if len(frame) > 0:
            desc = frame[0].text
            rv = re.sub(r"\n\n\n+", "\n", desc)
            try:
                rv = rv.strip()
            except:
                rv = "error"
        self.risks = rv
        # Facebook
        try:
            frame = soup.select("li.facebook.mr2 .count")
            if len(frame) > 0:
                fb_cnt = frame[0].text.strip()
            self.facebook_like = int(fb_cnt.replace(",", ""))
            if not self.quietly:
                sys.stdout.write(".[%03d:fb]." % self.my_id)
                sys.stdout.flush()
        except:
            if not self.quietly:
                sys.stdout.write(".[%03d:fb!]." % self.my_id)
                sys.stdout.flush()
            self.facebook_like = -1
        #backers =====================
        backer_count_eles = soup.select("div#backers_count")
        if len(backer_count_eles) > 0:
            backer_count = int(backer_count_eles[0]['data-backers-count'])
        else:
            self.error = "backer_nocount"
            print self.url  # log which project page lacked a backer count
            return False
        if backer_count > 10:
            backer_url = self.url + "/backers"
            backer_visit_trial = 2
            while 1:
                try:
                    pb.goto(backer_url,
                            filter_func=backer_exist_expected_condition,
                            filter_time_out=600)
                    break
                except:
                    if backer_visit_trial > 0:
                        backer_visit_trial -= 1
                        continue
                    else:
                        self.error = "backer timeout"
                        return False
            try:
                p = pb.get_page_source()
                s = BS(p, 'html.parser')
                frame = s.select("div.NS_backers__backing_row .meta a")
                pb.temp_backer_number = len(frame)  # temp variable
                if len(frame) >= 50:  # has pagination (update per 10)
                    while 1:  # pagination control
                        if abs(backer_count - pb.temp_backer_number) < 10:
                            break
                        pb.scroll_down(
                            filter_func=backer_scroll_expected_condition,
                            filter_time_out=600)  # wait up to ten minutes
                        p = pb.get_page_source()
                        s = BS(p, 'html.parser')
                        frame = s.select("div.NS_backers__backing_row .meta a")
                        nowc = len(frame)  # get current number
                        if not self.quietly:
                            sys.stdout.write(".[%03d:backer_page = %04d]." %
                                             (self.my_id, nowc))
                            sys.stdout.flush()
                        if nowc == 0 or (nowc % 50) != 0:
                            # the final page holds fewer than 50 rows, so stop paginating
                            break
                #get backers
                s = BS(p, 'html.parser')
                frame = s.select("div.NS_backers__backing_row")
                backers_append = self.backers.append
                if len(frame) > 0:
                    for backer in frame:
                        anchors = backer.select(".meta a")
                        if len(anchors) > 0:
                            profile_url = "%s" % (anchors[0]['href'])
                            backer_name = anchors[0].text
                        else:
                            profile_url = 'na'
                            backer_name = 'na'
                        history = backer.select(".backings")
                        if len(history) > 0:
                            backing_hist_eles = re.findall(r"[0-9,]+", history[0].text)
                            if len(backing_hist_eles) > 0:
                                backing_hist = int(backing_hist_eles[0].replace(",", "").strip())
                            else:
                                backing_hist = 0
                        else:
                            backing_hist = 0
                        backers_append(
                            (profile_url, backer_name, backing_hist))
            except:
                self.error = "backer ajax"
                return False
        return True
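The backer-row parsing in the last block can also be read as a small standalone helper; this sketch only mirrors the selectors used above (parse_backer_row itself is hypothetical):

import re

def parse_backer_row(row):
    """Extract (profile_url, name, backing_count) from one NS_backers__backing_row element."""
    anchors = row.select(".meta a")
    if len(anchors) > 0:
        profile_url, name = anchors[0]['href'], anchors[0].text
    else:
        profile_url, name = 'na', 'na'
    history = row.select(".backings")
    nums = re.findall(r"[0-9,]+", history[0].text) if len(history) > 0 else []
    backings = int(nums[0].replace(",", "")) if len(nums) > 0 else 0
    return profile_url, name, backings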