def extract_links_from_swf(file):
    """Return the distinct URL-looking texts found in the tags of a SWF.

    *file* is handed as-is to ``swfparser.SWFParser``. Every text that
    ``analyze_tag`` yields for each parsed tag is kept when
    ``looks_like_an_url`` accepts it. Duplicates are collapsed through a
    set, so the order of the returned list is not meaningful.
    """
    parsed = swfparser.SWFParser(file)
    # Lazily flatten all the texts produced for every tag, in tag order.
    candidates = (
        piece
        for swf_tag in parsed.tags
        for piece in analyze_tag(swf_tag)
    )
    unique_links = {piece for piece in candidates if looks_like_an_url(piece)}
    return list(unique_links)
def scrap(fh, custom_order=None):
    """Get useful info from a program.

    Parse the SWF read from *fh*, collect its JPEG images, and extract the
    texts referenced from the first control tag of the last DefineSprite.
    One Episode is built for every extracted entry whose date survives
    ``_fix_date``.

    Args:
        fh: source handed as-is to ``swfparser.SWFParser``.
        custom_order: optional sequence of already-fixed names. When given,
            the position of an entry's name in it selects the image, and
            entries whose name is not listed are skipped. When None, an
            entry uses the image at its own position among all the
            extracted entries (skipped entries still count).

    Returns:
        A list of Episode objects. The list is empty when no
        ActionConstantPool is found and there are fewer than three images.

    Raises:
        ValueError: if no DefineSprite tag is present, or if no
            ActionConstantPool is found while three or more images exist.
    """
    swf = swfparser.SWFParser(fh)

    # get the images
    base = None
    images = []
    for tag in swf.tags:
        if tag.name == 'JPEGTables':
            base = tag.JPEGData
        elif tag.name == 'DefineBits':
            images.append((tag.CharacterID, tag.JPEGData))
        elif tag.name == 'DefineBitsJPEG2':
            images.append((tag.CharacterID, tag.ImageData))
    # Highest CharacterID first; every image gets the JPEGTables data
    # prepended.
    # NOTE(review): if images exist but no JPEGTables tag was seen, base is
    # still None here and the concatenation fails -- confirm that cannot
    # happen for the files this is fed with.
    images = [base + x[1] for x in sorted(images, reverse=True)]

    # get the last DefineSprite
    defsprite = None
    for tag in swf.tags:
        if tag.name == 'DefineSprite':
            defsprite = tag
    # Check the sprite itself (not the loop variable), and do it with a real
    # exception so the validation is not stripped when running with -O.
    if defsprite is None:
        raise ValueError("DefineSprite not found")

    # get the actions
    doaction = defsprite.ControlTags[0]
    for act in doaction.Actions:
        if act.name == 'ActionConstantPool':
            break
    else:
        if len(images) < 3:
            # not enough images and no constant pool: a non-programs swf!
            return []
        raise ValueError("No ActionConstantPool found!")

    # do some magic to retrieve the texts
    cpe = _ConstantPoolExtractor(act.ConstantPool, doaction.Actions)
    bio = 'htmlText'  # same key for every entry, unlike the numbered ones
    all_vals = []
    i = 0
    while True:
        i += 1
        name = 'titulo%d1' % i
        occup = 'titulo%d2' % i
        date = 'titulo%d3' % i
        vals = cpe.get(name, occup, bio, date)
        if vals is None:
            # the extractor has nothing for this index: no more entries
            break
        all_vals.append((vals[name], vals[occup], vals[bio], vals[date]))

    items = []
    for i, (name, occup, bio, date) in enumerate(all_vals):
        date = _fix_date(date)
        if date is None:
            continue
        occup = _fix_occup(occup)
        bio = _fix_bio(bio)
        name = _fix_name(name)

        # use the corresponding image, or through the custom order
        if custom_order is None:
            idx = i
        else:
            try:
                idx = custom_order.index(name)
            except ValueError:
                # name not present in the custom order: skip this entry
                continue

        image = images[idx]
        ep = Episode(name=name, occup=occup, bio=bio, image=image, date=date)
        items.append(ep)
    return items