def _read_info(self, field):
    if field == 'md5partial':
        dec = mp4.File(str(self.path))
        self._md5partial_offset = dec.audio_offset
        self._md5partial_size = dec.audio_size
        dec.close()
    super(Mp4File, self)._read_info(field)
    if field in TAG_FIELDS:
        dec = mp4.File(str(self.path))
        self.audiosize = dec.audio_size
        self.bitrate = dec.bitrate
        self.duration = dec.duration
        self.samplerate = dec.sample_rate
        self.artist = dec.artist
        self.album = dec.album
        self.title = dec.title
        self.genre = dec.genre
        self.comment = dec.comment
        self.year = dec.year
        self.track = dec.track
        dec.close()
import requests
from io import BytesIO
from hsaudiotag import mp4
# `types` is a project-local helper module whose Obj wraps keyword
# arguments; it is not the stdlib types module.


def fetch_info(url):
    """Get the bitrate and the duration from the mp3 header.

    This will only work with tagless mp3 files.
    """
    # The first 512 bytes are sufficient for extracting the metadata:
    page = requests.get(url, headers={"Range": "bytes=0-512"})
    if page.status_code not in [200, 206]:  # 206 = partial content
        raise requests.exceptions.HTTPError(
            "http code " + str(page.status_code) + ".")
    with BytesIO(page.content) as bs:
        mp3 = mp4.File(bs)
        return types.Obj(duration=mp3.duration, bitrate=mp3.bitrate)
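A minimal, hypothetical call sketch for fetch_info above; the URL is a placeholder, and types.Obj is assumed to expose its keyword arguments as attributes:

# hypothetical usage of fetch_info(); the URL is a placeholder
info = fetch_info("http://example.com/tagless.mp3")
print "duration: %s s, bitrate: %s" % (info.duration, info.bitrate)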
def get_duration(self, url, buff=500, explain=False):
    if explain:
        print "in seconds"
    from StringIO import StringIO
    from hsaudiotag import mp4
    r = requests.get(url, stream=True)
    if r.status_code == 200:
        a = r.raw.read(buff)
        b = StringIO()
        b.write(a)
        c = mp4.File(b)
        duration = c.duration
        b.close()
        r.close()
        return duration
    else:
        return -1
def analyze(self):
    pb = self.pb
    page = pb.get_page_source()
    #page_compress = pb.get_page_source(remove_js = True) # no javascript

    # preconditions
    if page is None or len(page) == 0:
        self.error = "no page"
        return False
    #self.page_compressed = zlib.compress(repr(page_compress))

    soup = BS(page, 'html.parser')
    # precondition
    if soup is None:
        self.error = "no parser"
        return False

    # check 404
    if soup.title.text.find("looking for doesn't exist (404)") > 0:
        self.error = "404"
        return False

    #main page
    if not self.quietly:
        sys.stdout.write(".[%03d:analyze]." % self.my_id)
        sys.stdout.flush()

    #analyze main page ===========
    # number of comments
    frame = soup.select("#comments_count")
    if len(frame) > 0:
        self.comments_count = int(frame[0]['data-comments-count'])
    else:
        self.comments_count = -1  #means error

    # number of updates
    frame = soup.select("#updates_count")
    if len(frame) > 0:
        self.updates_count = int(frame[0]['data-updates-count'])
    else:
        self.updates_count = -1  #means error

    # rewards
    frame = soup.select(".NS-projects-reward")
    proj_reward_append = self.projects_reward_result.append
    if len(frame) > 0:
        for card in frame:
            #money
            money_f = card.select(".money")
            if len(money_f) > 0:
                money = filter_number(money_f[0].text.strip())
            else:
                money = 0.0
            #backers
            backers_f = card.select(".num-backers")
            if len(backers_f) > 0:
                num_backers = filter_number(backers_f[0].text.strip())
            else:
                num_backers = 0.0
            #description
            desc_f = card.select(".desc")
            if len(desc_f) > 0:
                description = desc_f[0].text.strip()
            else:
                description = ""
            #delivery
            delivery_f = card.select("time")
            if len(delivery_f) > 0:
                delivery_estimated = delivery_f[0]['datetime']
            else:
                delivery_estimated = ""
            #limited
            limited_f = card.select(".limited-number")
            if len(limited_f) > 0:
                limited_num = int(re.findall(r"of ([0-9]+)", limited_f[0].text)[0])
            else:
                limited_num = 0
            proj_reward_append([
                money,
                num_backers,
                description,
                delivery_estimated,
                limited_num,
            ])  #for

    # collect images
    frame = soup.select("img.fit")
    image_fnames_append = self.image_fnames.append
    images_append = self.images.append
    if len(frame) > 0:
        for imgf in frame:
            src = imgf['src']
            src = re.sub(r"\?.*$", "", src)
            image_fnames_append(src)
            images_append("")  #basically nothing is appended

    if self.has_image:
        inque = list_to_queue(self.image_fnames)
        outque = Queue.Queue()
        tasks = []
        # parallel processing
        for i in range(inque.qsize()):
            imageD = ImageDownloader(inque, outque, self.quietly, True)
            imageD.setDaemon(True)
            tasks.append(imageD)
            imageD.start()
        inque.join()  #wait until finished
        for task in tasks:
            task.stop()
        outlist = queue_to_list(outque)
        self.images = copy.deepcopy(outlist)  # replace the string list with a binary data list

    # video (source file name, length)
    frame = soup.select("video.has_webm")
    self.video_fname = "na"
    v_url = ""
    if len(frame) > 0:
        sources = frame[0].select("source")
        if len(sources) > 0:
            self.video_has = True
            for source in sources:
                v_url = source['src']
                if v_url.endswith(".mp4"):
                    if v_url.find('high') > 0:
                        self.video_has_high = 1
                        self.video_fname = v_url
                    else:
                        self.video_has_base = 1
                        self.video_fname = v_url  # if base exists, replace by it

    if self.video_has_high > 0 or self.video_has_base > 0:
        url = self.video_fname
        # video duration
        try:
            with closing(requests.get(url, stream=True, verify=False,
                                      timeout=600)) as r:
                a = r.raw.read(2000)  #2kb buffer
                b = StringIO()
                b.write(a)
                c = mp4.File(b)
                self.video_length = c.duration
                b.close()
        except:
            self.video_length = -1  # if requests got an error

    # collect full description
    frame = soup.select(".full-description")
    rv = ""
    if len(frame) > 0:
        desc = frame[0].text
        rv = re.sub(r"\n\n\n+", "\n", desc)
        try:
            rv = rv.strip()
        except:
            rv = "error"
    self.full_description = rv

    # collect risks
    frame = soup.select("#risks")
    rv = ""
    if len(frame) > 0:
        desc = frame[0].text
        rv = re.sub(r"\n\n\n+", "\n", desc)
        try:
            rv = rv.strip()
        except:
            rv = "error"
    self.risks = rv

    # Facebook
    try:
        frame = soup.select("li.facebook.mr2 .count")
        if len(frame) > 0:
            fb_cnt = frame[0].text.strip()
            self.facebook_like = int(fb_cnt.replace(",", ""))
        if not self.quietly:
            sys.stdout.write(".[%03d:fb]." % self.my_id)
            sys.stdout.flush()
    except:
        if not self.quietly:
            sys.stdout.write(".[%03d:fb!]." % self.my_id)
            sys.stdout.flush()
        self.facebook_like = -1

    #backers =====================
    try:
        backer_url = self.url + "/backers"
        param_backer = {}
        backers_append = self.backers.append
        bc_text = ""
        backer_first_trial = SS.BACKER_CONNECTION_RECOVER
        while 1:
            request_header = {
                'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11',
                'connection': 'close',
                'charset': 'utf-8'
            }
            backer_first_timeout_trial = SS.BACKER_CONNECTION_RECOVER
            while 1:
                try:
                    c_backer = requests.get(backer_url, headers=request_header,
                                            timeout=600)
                    break
                except requests.Timeout:
                    backer_first_timeout_trial -= 1
                    if backer_first_timeout_trial < 0:
                        self.error = "backer_first_timeout"
                        return False
                    continue
            if c_backer.status_code == 200:
                bc_text = c_backer.text
                c_backer.close()
                break
            elif c_backer.status_code == 429:
                c_backer.close()
                time.sleep(SS.PROJECT_PAGE_SERVER_THREAD_POOL)  # five-second sleep
                continue
            else:
                if backer_first_trial < 0:
                    self.error = "backer_first"
                    return False
                else:
                    backer_first_trial -= 1
                    time.sleep(SS.PROJECT_PAGE_SERVER_THREAD_POOL)
                    continue
        soup_backer = BS(bc_text, 'html.parser')
        ele_backers = soup_backer.select("div.NS_backers__backing_row")
        self.get_backer_data(ele_backers)
        cursors = [ele['data-cursor'] for ele in ele_backers]
        #time.sleep(0.5)
        while len(cursors) > 0:
            backer_trial = SS.BACKER_CONNECTION_RECOVER
            bc_text = ""
            param_backer['cursor'] = cursors[-1]
            while 1:
                try:
                    c_backer = requests.get(backer_url, headers=request_header,
                                            params=param_backer, timeout=600)
                except requests.Timeout:
                    backer_trial -= 1
                    if backer_trial < 0:
                        self.error = "backer_timeout"
                        return False
                    continue
                if c_backer.status_code == 200:
                    bc_text = c_backer.text
                    c_backer.close()
                    break
                elif c_backer.status_code == 429:
                    c_backer.close()
                    sys.stdout.write(".?.")
                    sys.stdout.flush()
                    time.sleep(SS.PROJECT_PAGE_SERVER_THREAD_POOL)
                    continue
            soup_backer = BS(bc_text, 'html.parser')
            ele_backers = soup_backer.select("div.NS_backers__backing_row")
            self.get_backer_data(ele_backers)
            cursors = [ele['data-cursor'] for ele in ele_backers]
            time.sleep(1)  # politeness
    except:
        self.error = "backer"
        return False
    return True
#test video stream
url = "https://d2pq0u4uni88oo.cloudfront.net/projects/824027/video-390569-h264_base.mp4"

from hsaudiotag import mp4  # $ pip install hsaudiotag
import requests
from StringIO import StringIO
from contextlib import closing

with closing(requests.get(url, stream=True, verify=False)) as r:
    a = r.raw.read(2000)  #2kb buffer
    b = StringIO()
    b.write(a)
    c = mp4.File(b)
    print c.duration
    b.close()
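The same probe sketched for Python 3, assuming the hsaudiotag3k port (which installs the same hsaudiotag package name) and io.BytesIO in place of StringIO:

#test video stream, Python 3 sketch
#assumes the hsaudiotag3k port: $ pip install hsaudiotag3k
from contextlib import closing
from io import BytesIO

import requests
from hsaudiotag import mp4

url = "https://d2pq0u4uni88oo.cloudfront.net/projects/824027/video-390569-h264_base.mp4"
with closing(requests.get(url, stream=True, verify=False)) as r:
    b = BytesIO(r.raw.read(2000))  #2kb buffer, as in the Python 2 snippet above
    print(mp4.File(b).duration)
    b.close()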
def analyze(self):
    pb = self.pb
    page = pb.get_page_source()

    # preconditions
    assert page is not None and len(page) > 0, "No page!"
    self.page_compressed = zlib.compress(page.encode('utf-8'))
    if not self.quietly:
        sys.stdout.write("Page data compressed: self.page_compressed..\n")
        sys.stdout.flush()

    soup = BS(page, 'html.parser')
    # precondition
    assert soup is not None, "No parser!"

    #main page
    if not self.quietly:
        sys.stdout.write("..Main..")
        sys.stdout.flush()

    #analyze main page ===========
    # rewards
    frame = soup.select(".NS-projects-reward")
    proj_reward_append = self.projects_reward_result.append
    if len(frame) > 0:
        for card in frame:
            #money
            money_f = card.select(".money")
            if len(money_f) > 0:
                money = filter_number(money_f[0].text.strip())
            else:
                money = 0.0
            #backers
            backers_f = card.select(".num-backers")
            if len(backers_f) > 0:
                num_backers = filter_number(backers_f[0].text.strip())
            else:
                num_backers = 0.0
            #description
            desc_f = card.select(".desc")
            if len(desc_f) > 0:
                description = desc_f[0].text.strip()
            else:
                description = ""
            #delivery
            delivery_f = card.select("time")
            if len(delivery_f) > 0:
                delivery_estimated = delivery_f[0]['datetime']
            else:
                delivery_estimated = ""
            #limited
            limited_f = card.select(".limited-number")
            if len(limited_f) > 0:
                limited_num = int(re.findall(r"of ([0-9]+)", limited_f[0].text)[0])
            else:
                limited_num = 0
            proj_reward_append([
                money,
                num_backers,
                description,
                delivery_estimated,
                limited_num,
            ])  #for

    # collect images
    frame = soup.select("img.fit")
    image_fnames_append = self.image_fnames.append
    images_append = self.images.append
    if len(frame) > 0:
        for imgf in frame:
            src = imgf['src']
            src = re.sub(r"\?.*$", "", src)
            image_fnames_append(src)
            images_append("")  #basically nothing is appended

    if self.has_image:
        inque = list_to_queue(self.image_fnames)
        outque = Queue.Queue()
        tasks = []
        # parallel processing
        for i in range(inque.qsize()):
            imageD = ImageDownloader(inque, outque, self.quietly, True)
            imageD.setDaemon(True)
            tasks.append(imageD)
            imageD.start()
        inque.join()  #wait until finished
        for task in tasks:
            task.stop()
        outlist = queue_to_list(outque)
        self.images = copy.deepcopy(outlist)  # replace the string list with a binary data list

    # video (source file name, length)
    frame = soup.select("video.has_webm")
    self.video_fname = "na"
    v_url = ""
    if len(frame) > 0:
        sources = frame[0].select("source")
        if len(sources) > 0:
            self.video_has = True
            for source in sources:
                v_url = source['src']
                if v_url.endswith(".mp4"):
                    if v_url.find('high') > 0:
                        self.video_has_high = 1
                        self.video_fname = v_url
                    else:
                        self.video_has_base = 1
                        self.video_fname = v_url  # if base exists, replace by it

    if self.video_has_high > 0 or self.video_has_base > 0:
        url = self.video_fname
        # video duration
        r = requests.get(url, stream=True)
        a = r.raw.read(2000)  #2kb buffer
        b = StringIO()
        b.write(a)
        c = mp4.File(b)
        self.video_length = c.duration
        b.close()
        r.close()

    # collect full description
    frame = soup.select(".full-description")
    rv = ""
    if len(frame) > 0:
        desc = frame[0].text
        rv = re.sub(r"\n\n\n+", "\n", desc)
        try:
            rv = rv.strip()
        except:
            pass
    self.full_description = rv

    # collect risks
    frame = soup.select("#risks")
    rv = ""
    if len(frame) > 0:
        desc = frame[0].text
        rv = re.sub(r"\n\n\n+", "\n", desc)
        try:
            rv = rv.strip()
        except:
            pass
    self.risks = rv

    # Facebook
    frame = soup.select("li.facebook.mr2 .count")
    waiting = 1
    while 1:
        if len(frame) > 0:
            try:
                facebook_count = int(frame[0].text)  #error prone
                self.facebook_like = facebook_count  # assumed intent: the original parsed but never stored the count
                break
            except:
                if not self.quietly:
                    sys.stdout.write("[facebook waiting...%d]\n" % waiting)
                    sys.stdout.flush()
                time.sleep(waiting)
                waiting += 1
                temp_soup_facebook = BS(pb.get_page_source(), 'html.parser')
                frame = temp_soup_facebook.select("li.facebook.mr2 .count")
        else:
            self.facebook_like = 0
            break
        if waiting >= 10:
            if not self.quietly:
                sys.stdout.write(" [facebook error] ")
                sys.stdout.flush()
            self.facebook_like = -1  #means error
            break
    # ============================
    if not self.quietly:
        sys.stdout.write("OK\n")
        sys.stdout.flush()

    #backers =====================
    #btn = pb.css_selector_element("#backers_nav")
    soup = None
    if not self.quietly:
        sys.stdout.write("..Visiting backers data..")
        sys.stdout.flush()
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11',
            'connection': 'close',
            'charset': 'utf-8'}
        params = {'format': 'json'}
        con = requests.get(self.url, headers=headers, params=params)
        j = con.json()
        board = j['running_board']
        s = BS(board, 'html.parser')
        eles = s.select('a#backers_nav data')
        if len(eles) > 0:
            t = eles[0].text.replace(",", "")
            n = int(t)
            pn = n / 40 + 2
        else:
            n = 0
            pn = 0
        self.pagination = pn
        backer_url = self.url + "/backers"
        pb.goto(backer_url)
        page = 1
        # measure current backers
        p = pb.get_page_source()
        s = BS(p, 'html.parser')
        frame = s.select("div.NS_backers__backing_row .meta a")
        prevc = len(frame)  # init
        nowc = 0
        # pagination
        while 1:
            pb.scroll_down()
            # measure current backers
            p = pb.get_page_source()
            s = BS(p, 'html.parser')
            frame = s.select("div.NS_backers__backing_row .meta a")
            nowc = len(frame)
            if not self.quietly:
                sys.stdout.write("b")
                sys.stdout.flush()
            if nowc > prevc:
                prevc = nowc
            else:
                break
        self.get_backers(pb)
        #get backers
        p = pb.get_page_source()
        s = BS(p, 'html.parser')
        frame = s.select("div.NS_backers__backing_row .meta a")
        backers_append = self.backers.append
        if len(frame) > 0:
            for backer in frame:
                profile_url = "%s" % (backer['href'])
                backer_name = backer.text
                backers_append((profile_url, backer_name,))
    except:
        # the original excerpt ends inside this try block; minimal assumed handler
        pass
def analyze(self):
    pb = self.pb
    page = pb.get_page_source()
    #page_compress = pb.get_page_source(remove_js = True) # no javascript

    # preconditions
    if page is None or len(page) == 0:
        self.error = "no page"
        return False
    #self.page_compressed = zlib.compress(repr(page_compress))

    soup = BS(page, 'html.parser')
    # precondition
    if soup is None:
        self.error = "no parser"
        return False

    # check 404
    if soup.title.text.find("looking for doesn't exist (404)") > 0:
        self.error = "404"
        return False

    #main page
    if not self.quietly:
        sys.stdout.write(".[%03d:analyze]." % self.my_id)
        sys.stdout.flush()

    #analyze main page ===========
    # number of comments
    frame = soup.select("#comments_count")
    if len(frame) > 0:
        self.comments_count = int(frame[0]['data-comments-count'])
    else:
        self.comments_count = -1  #means error

    # rewards
    frame = soup.select(".NS-projects-reward")
    proj_reward_append = self.projects_reward_result.append
    if len(frame) > 0:
        for card in frame:
            #money
            money_f = card.select(".money")
            if len(money_f) > 0:
                money = filter_number(money_f[0].text.strip())
            else:
                money = 0.0
            #backers
            backers_f = card.select(".num-backers")
            if len(backers_f) > 0:
                num_backers = filter_number(backers_f[0].text.strip())
            else:
                num_backers = 0.0
            #description
            desc_f = card.select(".desc")
            if len(desc_f) > 0:
                description = desc_f[0].text.strip()
            else:
                description = ""
            #delivery
            delivery_f = card.select("time")
            if len(delivery_f) > 0:
                delivery_estimated = delivery_f[0]['datetime']
            else:
                delivery_estimated = ""
            #limited
            limited_f = card.select(".limited-number")
            if len(limited_f) > 0:
                limited_num = int(re.findall(r"of ([0-9]+)", limited_f[0].text)[0])
            else:
                limited_num = 0
            proj_reward_append([
                money,
                num_backers,
                description,
                delivery_estimated,
                limited_num,
            ])  #for

    # collect images
    frame = soup.select("img.fit")
    image_fnames_append = self.image_fnames.append
    images_append = self.images.append
    if len(frame) > 0:
        for imgf in frame:
            src = imgf['src']
            src = re.sub(r"\?.*$", "", src)
            image_fnames_append(src)
            images_append("")  #basically nothing is appended

    if self.has_image:
        inque = list_to_queue(self.image_fnames)
        outque = Queue.Queue()
        tasks = []
        # parallel processing
        for i in range(inque.qsize()):
            imageD = ImageDownloader(inque, outque, self.quietly, True)
            imageD.setDaemon(True)
            tasks.append(imageD)
            imageD.start()
        inque.join()  #wait until finished
        for task in tasks:
            task.stop()
        outlist = queue_to_list(outque)
        self.images = copy.deepcopy(outlist)  # replace the string list with a binary data list

    # video (source file name, length)
    frame = soup.select("video.has_webm")
    self.video_fname = "na"
    v_url = ""
    if len(frame) > 0:
        sources = frame[0].select("source")
        if len(sources) > 0:
            self.video_has = True
            for source in sources:
                v_url = source['src']
                if v_url.endswith(".mp4"):
                    if v_url.find('high') > 0:
                        self.video_has_high = 1
                        self.video_fname = v_url
                    else:
                        self.video_has_base = 1
                        self.video_fname = v_url  # if base exists, replace by it

    if self.video_has_high > 0 or self.video_has_base > 0:
        url = self.video_fname
        # video duration
        with closing(requests.get(url, stream=True, verify=False)) as r:
            a = r.raw.read(2000)  #2kb buffer
            b = StringIO()
            b.write(a)
            c = mp4.File(b)
            self.video_length = c.duration
            b.close()

    # collect full description
    frame = soup.select(".full-description")
    rv = ""
    if len(frame) > 0:
        desc = frame[0].text
        rv = re.sub(r"\n\n\n+", "\n", desc)
        try:
            rv = rv.strip()
        except:
            rv = "error"
    self.full_description = rv

    # collect risks
    frame = soup.select("#risks")
    rv = ""
    if len(frame) > 0:
        desc = frame[0].text
        rv = re.sub(r"\n\n\n+", "\n", desc)
        try:
            rv = rv.strip()
        except:
            rv = "error"
    self.risks = rv

    # Facebook
    try:
        frame = soup.select("li.facebook.mr2 .count")
        if len(frame) > 0:
            fb_cnt = frame[0].text.strip()
            self.facebook_like = int(fb_cnt.replace(",", ""))
        if not self.quietly:
            sys.stdout.write(".[%03d:fb]." % self.my_id)
            sys.stdout.flush()
    except:
        if not self.quietly:
            sys.stdout.write(".[%03d:fb!]." % self.my_id)
            sys.stdout.flush()
        self.facebook_like = -1

    #backers =====================
    backer_url = self.url + "/backers"  # defined up front so the error branch below can report it
    backer_count_eles = soup.select("div#backers_count")
    if len(backer_count_eles) > 0:
        backer_count = int(backer_count_eles[0]['data-backers-count'])
    else:
        self.error = "backer_nocount"
        print backer_url
        return False
    if backer_count > 10:
        backer_visit_trial = 2
        while 1:
            try:
                pb.goto(backer_url, filter_func=backer_exist_expected_condition,
                        filter_time_out=600)
                break
            except:
                if backer_visit_trial > 0:
                    backer_visit_trial -= 1
                    continue
                else:
                    self.error = "backer timeout"
                    return False
        try:
            p = pb.get_page_source()
            s = BS(p, 'html.parser')
            frame = s.select("div.NS_backers__backing_row .meta a")
            pb.temp_backer_number = len(frame)  # temp variable
            if len(frame) >= 50:  # has pagination (counts update in steps of 10)
                while 1:  # pagination control
                    if abs(backer_count - pb.temp_backer_number) < 10:
                        break
                    pb.scroll_down(filter_func=backer_scroll_expected_condition,
                                   filter_time_out=600)  # wait for the newly loaded rows
                    p = pb.get_page_source()
                    s = BS(p, 'html.parser')
                    frame = s.select("div.NS_backers__backing_row .meta a")
                    nowc = len(frame)  # current number of rows
                    if not self.quietly:
                        sys.stdout.write(".[%03d:backer_page = %04d]." %
                                         (self.my_id, nowc))
                        sys.stdout.flush()
                    if nowc == 0 or (nowc % 50) != 0:
                        # precondition, check for the end (from 51 on): the final
                        # page contains fewer than 50 rows
                        break
            #get backers
            s = BS(p, 'html.parser')
            frame = s.select("div.NS_backers__backing_row")
            backers_append = self.backers.append
            if len(frame) > 0:
                for backer in frame:
                    anchors = backer.select(".meta a")
                    if len(anchors) > 0:
                        profile_url = "%s" % (anchors[0]['href'])
                        backer_name = anchors[0].text
                    else:
                        profile_url = 'na'
                        backer_name = 'na'
                    history = backer.select(".backings")
                    if len(history) > 0:
                        backing_hist_eles = re.findall(r"[0-9,]+", history[0].text)
                        if len(backing_hist_eles) > 0:
                            backing_hist = int(backing_hist_eles[0].replace(",", "").strip())
                        else:
                            backing_hist = 0
                    else:
                        backing_hist = 0
                    backers_append((profile_url, backer_name, backing_hist))
        except:
            self.error = "backer ajax"
            return False
    return True