Example #1
def kw():
    from BeautifulSoup import BeautifulStoneSoup 
    from urllib2 import urlopen

    tmp_create_lit ()
    disease = request.vars.get ('d')
    result = db.executesql ('''
    SELECT keyword.kw, counts.count 
    FROM (SELECT keycount.kw_id, keycount.count 
          FROM (SELECT id 
                FROM disease 
                WHERE d = '%s') AS diseases
          INNER JOIN keycount
          ON keycount.d_id = diseases.id) AS counts
    INNER JOIN keyword
    ON keyword.id = counts.kw_id;
    ''' % (disease,))
    rList = []
    capabilities = urlopen (deployment_settings.geoserver.wms_capabilities ()).read ()
    soup = BeautifulStoneSoup (capabilities)
    keywords = soup.findAll (name = 'keyword')
    for r in result:
        words = []
        for k in keywords:
            if k.string == r[0]:
                words.append (k)
        mapList = []
        for w in words:
            layer = w.parent.parent
            id = layer.find ('name').string
            name = layer.find ('title').string
            mapList.append ({'filename': id, 'name': name, 'type': 'db'})
        d = dict (kw = r[0], count = r[1], numMaps = len (words), maps = mapList)
        rList.append (d)
    return json.dumps (rList)
Example #2
def sync_geoserver (path):
    file = urlopen (path + '/wms?SERVICE=WMS&REQUEST=GetCapabilities')
    buffer = file.read ()
    soup = BeautifulStoneSoup (buffer)
    layers = soup.findAll (name = 'layer')
    results = []
    for l in layers:
        name = l.find ('title')
        id = l.find ('name')
        if name and id:
            text = name.string
            if not text:
                text = id.string
            m = match ('^([^\:]+)\:(.+)$', id.string)
            if m:
                p = m.group (1)
                f = m.group (2)
            else:
                p = ''
                f = id.string
            if dm.query ('maps', prefix = p, filename = f, name = text, src = path).first ():
                pass
            else:
                id = dm.insert ('maps', prefix = p, filename = f, name = text, src = path, public = True)
                keywords = l.findAll (name = 'keyword')
                kw = []
                for k in keywords:
                    kw.append (k.string)
Example #3
    def Episode(self, stream_name, stream_id, page, totalpage):
        url = self.url_base + stream_id

        data = tools.urlopen(self.app, url, {'cache':3600})

        if data == "":
            mc.ShowDialogNotification("No episode found for " + str(stream_name))
            return []

        rssfeed = re.compile('</a> <a href="(.*?)">RSS</a>').search(data).group(1)

        url = self.url_base + rssfeed
        data = tools.urlopen(self.app, url, {'cache':3600})
        soup = BeautifulStoneSoup(data, convertEntities="xml", smartQuotesTo="xml")

        episodelist = list()
        for info in soup.findAll('item'):
            episode                 =   CreateEpisode()
            episode.name            =   info.title.contents[0]
            episode.id              =   info.link.contents[0]
            episode.description     =   info.description.contents[0]
            episode.thumbnails      =   info.thumbnailimage.contents[0]
            episode.date            =   info.pubdate.contents[0]
            episode.page            =   page
            episode.totalpage       =   totalpage
            episodelist.append(episode)
        return episodelist
Example #4
def geocode(address="", city="", state="CA"):
    address = urllib.quote(address.encode('utf-8'))
    g_url = 'http://local.yahooapis.com/MapsService/V1/geocode?appid='
    g_url += '0MoPk9DV34FH0rumXB_xENjSlf.jdG4woRO9nFqyUcM86nLsFSynUvAwZZo6g--'
    g_url += '&street=%s&city=%s&state=%s' % (address, city, state)

    url = urllib.urlopen(g_url)
    dom = BeautifulStoneSoup(url)
    url.close()

    coords = { 'address': None, 'latitude': None, 'longitude': None, }

    result_attr = dom.find('result')

    if result_attr and result_attr['precision'] == 'address':

        dom_fields = ['address', 'latitude', 'longitude']
        for field in dom_fields:
            i = dom.find(field)
            if i:
                if field == 'address': 
                    coords[field] = i.string
                else:
                    try:
                        coords[field] = float(i.string)
                    except:
                        pass

    return coords
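A minimal usage sketch for the geocode() helper above; the street address below is made up, and the call hits Yahoo's geocoding service with the app id hard-coded in the function.
coords = geocode(address="701 First Ave", city="Sunnyvale", state="CA")
if coords['latitude'] is not None:
    print coords['address'], coords['latitude'], coords['longitude']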
Example #5
    def handle_noargs(self, **options):        
        page = BS(urllib2.urlopen("http://data.openaustralia.org/members/wikipedia-commons.xml"))

        for member in page.findAll("personinfo"):
            m = Member.objects.get(oa_id=member['id'])
            m.wikipedia = member['wikipedia_url']
            m.save()
Example #6
 def search(self, terms):
     torrents = []
     url = self.search_uri % quote_plus(terms)
     try:
         f = requests.get(url)
     except:
         raise Exception("something wrong")
     if f.status_code != requests.codes.ok:
         f.raise_for_status()
     soup = BeautifulStoneSoup(f.text)
     for details in soup.findAll("a", {"href": re.compile("^/torrent/")}):
         div = details.findNext("div")
         seeds = int(div.text)
         div = div.findNext("div")
         try:
             f_link = requests.get(self.uri_prefix + details["href"])
         except:
             raise Exception("something wrong")
         if f_link.status_code != requests.codes.ok:
             f_link.raise_for_status()
         soup_link = BeautifulStoneSoup(f_link.text)
         link = soup_link.find("a", {"href": re.compile("^magnet:")})
         if not link:
             continue
         torrents.append({"url": link["href"], "name": details.text, "seeds": seeds, "leechers": int(div.text)})
     return torrents
Example #7
    def text_to_xml_item(self, filepath):
        """read file and generate xml"""
        pname = os.path.basename(filepath).replace(".txt", "")
        date = os.path.getmtime(filepath)
        (tags, title, content) = self.read_text(filepath)  # TODO: do exception proc.
        categories = self.get_categories(filepath)
        date_str = time.strftime( "%Y-%m-%d %H:%M:%S", time.localtime(date) );
        date_str_gmt = time.strftime( "%Y-%m-%d %H:%M:%S", time.gmtime(date) );
        pubDate_str = time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime(date) );
        tidied = content
        tidied = tidied.replace("\r\n", "\n")
        
        rex = re.compile(r"<pre>.*?</pre>", re.S)
        tidied = rex.sub(self.escape, tidied)

        tidied = BeautifulStoneSoup(tidied).prettify()
        tidied = tidied.replace("\n", "")
        tidied = tidied.replace(",", "&#44;")
        tidied = self.unescape(tidied)
        
        # add entry
        post_item = wordpress.Item(
            title = title,
            pubDate = pubDate_str,
            post_date = date_str,
            post_date_gmt = date_str_gmt,
            content = tidied,
            post_name = pname)
        post_item.tags = tags
        post_item.categories = categories
        self._wxr.channel.items.append(post_item)
Example #8
def parse_complejosxml(data):
	"""regresa un diccionario con 'id': 'nombre complejo'"""
	parser = BeautifulStoneSoup(data)
	ids = [{'id_org': complejo.clave.string,
			'nombre': 'Cinemex ' + complejo.nombre.string}
			for complejo in parser.findAll('cine')]
	return ids
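A small sketch of how parse_complejosxml() might be fed; the XML snippet is invented to mirror the <cine>, <clave> and <nombre> tags the function looks for.
sample = '<cines><cine><clave>12</clave><nombre>Insurgentes</nombre></cine></cines>'
print parse_complejosxml(sample)  # one dict: id_org u'12', nombre u'Cinemex Insurgentes'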
Example #9
def videosRSS(url=common.args.url):
    link = common.getURL(url)
    mrssData = re.compile('mrssData += +"(.+)"').findall(link)[0];
    mrssData = urllib2.unquote(base64.decodestring(mrssData))
    tree=BeautifulStoneSoup(mrssData,convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    print tree.prettify()
    items = tree.findAll('item')
    for item in items:
        title = item.title.contents[0]
        plot  = item.description.contents[0]
        thumb = item.findAll('media:thumbnail')[0]['url']
        duration = item.findAll('media:content')[0]['duration']
        smil = item.findAll('media:text')[5].contents[0]
        smil = smil.replace('smilUrl=','')
        #episode_list.append((title, image, duration, plot, smil))
        u = sys.argv[0]
        u += '?url="'+urllib.quote_plus(smil)+'"'
        u += '&mode="history"'
        u += '&sitemode="play"'
        infoLabels={ "Title":title,
                     #"Season":season,
                     #"Episode":episode,
                     "Plot":plot,
                     #"premiered":airdate,
                     "Duration":duration,
                     #"TVShowTitle":common.args.name
                     }
        common.addVideo(u,title,thumb,infoLabels=infoLabels)
    common.setView('episodes')
Example #10
def ExtractPubPar(xmldata):
    """Yields successive paragraphs from a PubMed XML record."""

    xmltree = BeautifulStoneSoup(xmldata)
    v = xmltree.find("abstracttext")
    if v:
        yield v.string.strip()
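A usage sketch, assuming a PubMed XML record has already been saved locally (the file name is hypothetical).
with open('pubmed_record.xml') as handle:
    for paragraph in ExtractPubPar(handle.read()):
        print paragraph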
Example #11
    def extract_stats(self, page, list_of_pmids):
        if not page:
            return([])     
        (response_header, content) = page
    
        response = []
        soup = BeautifulStoneSoup(content)
        #print soup.prettify()

        for docsum in soup.findAll("docsum"):
            #print(tag.id.text)
            id = docsum.id.text
            author_list = []
            response_dict = {}
            for item in docsum.findAll("item"):
                if item.get("name") == "DOI":
                    doi = item.text
                    response_dict.update(doi=doi)
                if item.get("name") == "pmc":
                    pmcid = item.text
                    share_details_url = "http://www.ncbi.nlm.nih.gov/pmc/articles/%s/citedby/?tool=pubmed" %pmcid
                    response_dict.update(pmcid=pmcid, share_details_url=share_details_url)
            response += [(id, response_dict)]

        return(response)
Example #12
def inlines(value, return_list=False):
    try:
        from BeautifulSoup import BeautifulStoneSoup
    except ImportError:
        from beautifulsoup import BeautifulStoneSoup

    content = BeautifulStoneSoup(value, selfClosingTags=['inline', 'img', 'br',
                                                         'input', 'meta',
                                                         'link', 'hr'])

    # Return a list of inline objects found in the value.
    if return_list:
        inline_list = []
        for inline in content.findAll('inline'):
            rendered_inline = render_inline(inline)
            inline_list.append(rendered_inline['context'])
        return inline_list

    # Replace inline markup in the value with rendered inline templates.
    else:
        for inline in content.findAll('inline'):
            rendered_inline = render_inline(inline)
            if rendered_inline:
                inline_template = render_to_string(rendered_inline['template'],
                                                   rendered_inline['context'])
            else:
                inline_template = ''
            value = value.replace(str(inline), inline_template)
        return mark_safe(unicode(value))
Example #13
    def handle(self, *args, **kwargs):
        resource = urlopen(TRAIL_REPORT_URL)
        soup = BeautifulStoneSoup(resource)
        lift = soup.find("lifts")
        
        cache.set(TRAIL_REPORT_CACHE_KEY, {
            "total": lift.get("total"),
            "open": lift.get("totalopen"),
        }, 7 * 24 * 60 * 60)

        resource = urlopen(WEATHER_REPORT_URL)
        soup = BeautifulStoneSoup(resource)
        report = soup.findAll("report")[1]
        forecast = []

        weather_data = {
            "temperature": report.get("temp"),
            "surface": report.get("surfaceconditions"),
        }

        for i in range(1, 5):
            day = soup.find("day%d" % i)

            if day:
                forecast.append({
                    "day": day.get("day"),
                    "status": WEATHER_TYPES[int(day.get("icon"))],
                })

        weather_data["forecast"] = forecast

        cache.set(WEATHER_REPORT_CACHE_KEY, weather_data, 7 * 24 * 60 * 60)
Example #14
def parse_categories(soup):
    categories_list = []

    """
    <category id="pre-school" genre="true">
        <name>ABC 4 Kids</name>
    </category>
    """

    # This next line is the magic to make recursive=False work (wtf?)
    BeautifulStoneSoup.NESTABLE_TAGS["category"] = []
    xml = BeautifulStoneSoup(soup)

    # Get all the top level categories, except the alphabetical ones
    for cat in xml.find('categories').findAll('category', recursive=False):

        id = cat.get('id')
        if cat.get('index') or id == 'index':
            continue

        item = {}
        item['keyword'] = id
        item['name']    = cat.find('name').string;

        categories_list.append(item);

    return categories_list
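A quick sketch that runs parse_categories() on a snippet shaped like the one shown in its docstring; the category id and name are taken from that snippet.
sample = ('<categories>'
          '<category id="pre-school" genre="true"><name>ABC 4 Kids</name></category>'
          '</categories>')
print parse_categories(sample)  # one entry with 'keyword' and 'name'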
Example #15
 def xml_to_dict (self, data):
     from BeautifulSoup import BeautifulStoneSoup as BS
     soup = BS(data)
     username = soup.find('db:uid').contents[0]
     uid = soup.find('id').contents[0].split('/')[-1]
     title = soup.find('title').contents[0]
     return {'id':uid, 'username':username,'title':title}
Example #16
 def search(self, terms, settings={}):
     torrents = []
     f = None
     for url in self.search_uris:
         try:
             final_url = url + '/usearch/' + terms.replace(' ','%20') + '/?field=seeders&sorder=desc&rss=1'
             request = urlRequest(final_url)
             request.add_header('Accept-encoding','gzip')
             response = urlopen(request)
             if response.info().get('Content-Encoding') == 'gzip':
                 buf = StringIO(response.read())
                 data = gzip.GzipFile(fileobj=buf)
                 f = data.read()
             else:
                 f = response.read()
             break
         except:
             pass
     if not f:
         raise Exception('Out of kickass proxies')
     soup = BeautifulStoneSoup(f)
     for item in soup.findAll('item'):
         isVerified = int(item.find('torrent:verified').text)
         if isVerified == 1 or str(settings['trusted_uploaders']).lower() != 'true':
             torrents.append({
                 'url': item.find('torrent:magneturi').text,
                 'name': item.title.text,
                 'seeds': int(item.find('torrent:seeds').text),
                 'leechers': int(item.find('torrent:peers').text),
             })
     sorted_torrents = sorted(torrents,key = lambda k: k['seeds'], reverse=True)
     return sorted_torrents
Example #17
    def parse(self):
        """Parses the raw XML of the input file."""

        print "Parsing raw XML file using BeautifulStoneSoup..."
        print

        # initial parse
        soup = BeautifulStoneSoup(self.in_file, selfClosingTags=['milestone', 'ref'])

        print "Finished parsing raw XML using BeautifulStoneSoup."
        print

        out_name = self.to_scrub
        self.open_out(out_name)
        
        print "Finding major divisions in the XML file..."
        print

        count = 1
        
        # gets sub-trees for all books
        divisions = soup.findAll(type="book")
        book = raw_input("What number book of Herodotus would you like to scrub? ")
        for division in divisions:
            if count == int(book):
                self.scrub(division, book)
            count += 1
Example #18
    def gettrackinfo(self, track="", artist=""):
        """
        getinfo - Get info from Lastfm about a particular song
        @param track - Track name and other information we may have
        @return song - Return the song information on success
                       else return None on failure
        """
        track = track.replace("&", "and")
        artist = artist.replace("&", "and")
        url = self.trackurl+"&track="+track.encode('ascii')+"&artist="+artist.encode('ascii')
        try:
            file = urllib.urlopen(url)
            page = BeautifulStoneSoup(file.read())
            file.close()
            
            info = page.find("track")
            song = {}
            song["track"] = info.find("name").next
            song["artist"] = info.find("artist").next
            song["albumart"] = info.findAll("image")[0].next
        except Exception as e:
            print e
            return None

        return song
Example #19
def getXmlCursor(xml):
    """Parse the XML and return the query's start index, items per page, and total result count."""
    soup = BeautifulStoneSoup(xml)
    start = int(soup.find('opensearch:startindex').string)
    count = int(soup.find('opensearch:itemsperpage').string)
    totalCount = int(soup.find('opensearch:totalresults').string)
    return (start, count, totalCount)
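A sketch of the OpenSearch header block getXmlCursor() expects; the numbers are invented.
xml = ('<feed>'
       '<opensearch:startindex>0</opensearch:startindex>'
       '<opensearch:itemsperpage>20</opensearch:itemsperpage>'
       '<opensearch:totalresults>57</opensearch:totalresults>'
       '</feed>')
print getXmlCursor(xml)  # (0, 20, 57)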
Example #20
    def handle(self, *args, **options):
        try:
            resource = urlopen(args[0])
        except IndexError:
            raise CommandError("You have to specify a file path or a URL.")

        soup = BeautifulStoneSoup(resource)

        for event in soup.findAll("event"):
            event_info = {
                "description": event.get("description", u""),
            }

            for attribute in event:
                if not isinstance(attribute, NavigableString):
                    event_info[attribute.name] = attribute.string or u""

            identifier = sha_constructor("%(description)s-%(date)s" % {
                "description": slugify(event_info.get("description").encode("utf-8")),
                "date": event_info.get("date", u"").encode("utf-8"),
            }).hexdigest().encode("utf-8")

            obj, created = Event.objects.get_or_create(identifier=identifier, defaults={
                "title": event_info.get("title", u"").encode("utf-8"),
                "content": event_info.get("body", u"").encode("utf-8"),
                "description": event_info.get("description").encode("utf-8"),
                "url": event_info.get("url", u"").encode("utf-8"),
                "order": int(event_info.get("displayorder", 0)),
                "date": date(
                    int(event_info.get("year", 0)),
                    int(event_info.get("month", 0)),
                    int(event_info.get("day", 0)),
                ),
            })
Example #21
def parse_peliculasxml(data):
    parser = BeautifulStoneSoup(data)
    ids = [
        {"titulo": peli.nombre.string, "id_pol": peli.id.string, "id_cineticket": peli.idcineticket.string}
        for peli in parser.findAll("pelicula")
    ]
    return ids
Example #22
 def search(self, terms):
     torrents = []
     url = self.search_uri % quote_plus(terms)
     try:
         f = requests.get(url, headers=self.headers)
     except:
         raise Exception("something wrong")
     if f.status_code != requests.codes.ok:
         f.raise_for_status()
     soup = BeautifulStoneSoup(f.text)
     for (c, item) in enumerate(soup.findAll("a", {"class": "magnet"})):
         if c == 30:
             break
         info = item.findPrevious("a")
         link = self.search_uri % quote_plus(info["href"])
         try:
             item_f = requests.get(link, headers=self.headers)
         except:
             raise Exception("something wrong")
         if item_f.status_code != requests.codes.ok:
             item_f.raise_for_status()
         item_soup = BeautifulStoneSoup(item_f.text)
         sp = item_soup.findAll("span", {"class": re.compile("^stat_")})
         if sp:
             sp = [int(i.text.replace(",", "")) for i in sp]
         else:
             sp = [0, 0]
         torrents.append({"url": item["href"], "name": info.text, "seeds": sp[0], "leechers": sp[1]})
     return torrents
Example #23
def parse(request, doc_id):
    """
        Parse the 'resource_data' xml of a given resource (by id)
    
        NOTE: only works on DC documents at the moment
    """
    
    is_service_avaliable()

    bag = {}
    url = "%sobtain?request_ID=%s&by_doc_ID=true" % (NODE_URL, doc_id)
    req = urllib2.Request(url)
    opener = urllib2.build_opener()
    data = opener.open(req)
    result = simplejson.load(data)
    data = result['documents'][0]['document'][0]['resource_data']
    soup = BeautifulStoneSoup(data)
    
    parsed_data = {}
    fields = ['title','identifier','creator','publisher','date','description','rights']
    for field in fields:
        try:
            parsed_data[field] = soup.find('dc:' + field).text
        except:
            pass
    
    parsed_data['subject'] = ''.join([s.text for s in soup.findAll('dc:subject')])
    
    bag['parsed_data'] = parsed_data
    

    return shortcuts.render_to_response("parse.html", 
                                        bag, 
                                        context_instance=context.RequestContext(request))
Example #24
def splitCell(cell):
    """Read the contents of a table cell and build the lecture dicts.

    Gets a BeautifulSoup element of a Cell and splits it into lectures.
    Then it builds the lecture dicts.

    The return value is a list of lecture dicts (or an empty list if none
    are found).
    """
    st = BeautifulStoneSoup(unicode(cell.renderContents('utf-8').replace('<br />', '\n'), 'utf-8'),
                            convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    elements = unicode(st.renderContents('utf-8'),'utf-8').split('\n\n')
    lectures = []
    for elem in elements:

        lines = elem.split('\n')
        if len(lines) != 3:
            continue

        lines = map(unicode.strip, lines)
        if lines[2] != '-':
            (short, typ) = lines[1].split(" ")
            lectures.append({'short': short,
                             'typ': typ,
                             'name': lines[0],
                             'room': lines[2]})
    return lectures
Example #25
def determine_subtype(in_file):
    """Determines the subtype of a genome."""

    hits = defaultdict(int)
    with open(in_file) as handle:
        soup = BeautifulStoneSoup(handle.read())
    
    for seq in soup.findAll('iteration'):
        try:
            hit = seq.iteration_hits.hit.hit_def.contents[0]
        except:
            hit = None
        if hit:
            hits[hit.split('_')[1]] += 1
    
    count = sum(hits.values())
    if count < 5:
        return None
    elif all([x < count*0.6 for x in hits.values()]):
        #print 'too heterogeneous %s' % ','.join(map(str,hits.items()))
        return None
    else:
        for key, val in hits.items():
            if val > count*0.6:
                return key
Example #26
def determine_subtype_short(in_file):

    hits = defaultdict(int)
    strainer = SoupStrainer(re.compile('iteration'))
    with open(in_file) as handle:
        soup = BeautifulStoneSoup(handle.read(), parseOnlyThese = strainer)
    
    for seq in soup.findAll('iteration'):
        try:
            hit = seq.iteration_hits.hit.hit_def.contents[0]
        except:
            hit = None
        if hit:
            hits[hit.split('_')[1]] += 1
    
    count = sum(hits.values())
    if count < 5:
        return None
    elif all([x < count*0.6 for x in hits.values()]):
        print 'too heterogeneous %s' % ','.join(map(str,hits.items()))
        return None
    else:
        for key, val in hits.items():
            if val > count*0.6:
                return key
Example #27
def sitemap_parse(sitemap_option, astring, google_results, website_url):
    not_indexed = []
    not_sitemap = []
    error = ''
    sitemap_results = []
    website_host = urlparse(website_url).scheme
    if website_host != '':
        website_url = urlparse(website_url).scheme + "://" + urlparse(website_url).netloc
    if website_url[-1] != '/':
        website_url += '/'
    if astring != '':
        if sitemap_option == 'sitemap':

            resp = requests.get(astring)
            soup = Soup(resp.content)

        elif sitemap_option == 'upload_sitemap':

            soup = Soup(astring)
        urls = soup.findAll('url')
        for u in urls:
            loc = u.find('loc').string
            sitemap_results.append(loc)
            if loc not in google_results:
                not_indexed.append(loc)
        for loc in google_results:
            if loc not in sitemap_results:
                not_sitemap.append(loc)
    return not_indexed, not_sitemap, error
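A hedged usage sketch for the 'upload_sitemap' branch; the sitemap string and the list of Google-indexed URLs are invented.
sitemap_xml = '<urlset><url><loc>http://example.com/a</loc></url></urlset>'
indexed = ['http://example.com/a', 'http://example.com/b']
not_indexed, not_sitemap, error = sitemap_parse('upload_sitemap', sitemap_xml, indexed, 'http://example.com')
print not_indexed  # []
print not_sitemap  # ['http://example.com/b']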
Example #28
def get_mirror(type='xml'):
    """Returns a random mirror for a given type 'xml', 'zip', or 'banner'"""
    global _mirrors
    if not _mirrors.get(type):
        # Get the list of mirrors from tvdb
        page = None
        try:
            page = requests.get(server + api_key + '/mirrors.xml').content
        except RequestException:
            pass
        # If there were problems getting the mirror list we'll just fall back to the main site.
        if page:
            data = BeautifulStoneSoup(page, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
            for mirror in data.findAll('mirror'):
                type_mask = int(mirror.typemask.string)
                mirrorpath = mirror.mirrorpath.string
                for t in [(1, 'xml'), (2, 'banner'), (4, 'zip')]:
                    if type_mask & t[0]:
                        _mirrors.setdefault(t[1], set()).add(mirrorpath)
        else:
            log.debug('Unable to get the mirrors list from thetvdb.')
    if _mirrors.get(type):
        return random.sample(_mirrors[type], 1)[0] + ('/banners/' if type == 'banner' else '/api/')
    else:
        # If nothing was populated from the server's mirror list, return the main site as fallback
        return 'http://thetvdb.com' + ('/banners/' if type == 'banner' else '/api/')
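A worked sketch of the typemask decoding used above: a mirror whose typemask is 5 serves xml (bit 1) and zip (bit 4) but not banners (bit 2).
type_mask = 5
served = [kind for bit, kind in [(1, 'xml'), (2, 'banner'), (4, 'zip')] if type_mask & bit]
print served  # ['xml', 'zip']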
Example #29
    def create(cls, conn, user_sn, user_id, buddy_id, file_nm):
        ''' Take a filename, parse the XML, and insert it into the database.
            Stores most of the attributes raw, in order to do other sorts of
            processing later.
        '''
        xml = BeautifulStoneSoup(open(file_nm, 'r'))
        msgs = xml('message')
        if len(msgs) == 0: return

        my_msgs = len(xml.findAll({'message': True}, {'sender': user_sn}))
        their_msgs = len(msgs)-my_msgs
        initiated = (msgs[0]['sender'] == user_sn)

        start_time = parser.parse(msgs[0]['time'].replace('.', ':'), fuzzy=True)
        end_time = parser.parse(msgs[-1]['time'].replace('.', ':'), fuzzy=True)
        stats = stat(file_nm)

        cur = conn.cursor()
        try:
            cur.execute(cls.CREATE_NEW_BUDDY_LOG_ENTRY_QUERY,
                        (user_id, buddy_id, stats.st_size, initiated, my_msgs,
                        their_msgs, time.mktime(start_time.timetuple()),
                        time.mktime(end_time.timetuple()), time.time(), file_nm))
            conn.commit()
        except sqlite3.IntegrityError:
            pass
Example #30
 def search(self, terms):
     torrents = []
     url = self.search_uri % quote_plus(terms)
     try:
         f = requests.get(url)
     except:
         raise Exception("something wrong")
     if f.status_code != requests.codes.ok:
         f.raise_for_status()
     soup = BeautifulStoneSoup(f.text)
     for item in soup.findAll("item"):
         item_quality = item.link.text.rpartition("_")[2]
         try:
             item_f = requests.get(item.link.text)
         except:
             raise Exception("something wrong")
         if item_f.status_code != requests.codes.ok:
             item_f.raise_for_status()
         item_soup = BeautifulStoneSoup(item_f.text)
         qualities = [s.text.strip() for s in item_soup.findAll("span", {"class": re.compile("^tech-quality")})]
         q_index = qualities.index(item_quality)
         span = item_soup.findAll("span", {"title": "Peers and Seeds"})[q_index]
         ps_pos = len(span.parent.contents) - 1
         ps = span.parent.contents[ps_pos].split("/")
         torrents.append(
             {"url": item.enclosure["url"], "name": item.title.text, "seeds": int(ps[1]), "leechers": int(ps[0])}
         )
     return torrents
Example #31
    def postDownloadPageDay(self, host=None, postData={}):
        headers = {
            'User-Agent': 'Googlebot/2.1 (+http://www.googlebot.com/bot.html) '
        }
        s = requests.session()
        s.post(host, headers=headers, data={"agree": "OK"})
        r = s.post(host, headers=headers, data=postData)

        decodedstring = BeautifulStoneSoup(
            r.text, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)

        return decodedstring
Example #32
    def replace_bad_characters(self, str):
        """
        Handles conversion to UTF-8 and converts HTML entities into UTF-8 characters
        """

        str = unicode(
            BeautifulStoneSoup(
                str, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
        str = unicodedata.normalize('NFKD', str).encode('ascii', 'ignore')
        str = unicode(re.sub('[^\w\s-]', '', str).strip().lower())
        str = unicode(str.replace(' ', '-'))
        return str
Example #33
    def get_list_of_addon_author_names(self, addon_name):
        try:
            addon_xml = self.get_xml_for_single_addon(addon_name)
            name_tags = addon_xml.authors.findAll('name')

            return [
                BeautifulStoneSoup(str(name_tags[i])).find('name').string
                for i in range(len(name_tags))
            ]

        except AttributeError:
            self._print_search_error()
Example #34
def retrieveVideoInfo(video_id):
    video_info = VideoInfo()
    video_info.set_video_hosting_info(getVideoHostingInfo())
    video_info.set_video_id(video_id)
    try:
        video_link = 'http://cdn.playwire.com/' + str(video_id) + '.xml'
        soup = BeautifulStoneSoup(HttpUtils.HttpClient().getHtmlContent(url=video_link), convertEntities=BeautifulStoneSoup.XML_ENTITIES)
        cfg = soup.find("config")
        img_link = cfg.findNext("poster").string
        video_link = cfg.findNext("src").string
        
        video_info.set_video_stopped(False)
        video_info.set_video_image(img_link)
        video_info.set_video_name("PLAYWIRE Video")
        if re.search(r'\Artmp',video_link):
            video_info.add_video_link(VIDEO_QUAL_HD_720, video_link, addUserAgent=False)
        else:
            video_info.add_video_link(VIDEO_QUAL_HD_720, video_link, addUserAgent=True)
    except:
        video_info.set_video_stopped(True)
    return video_info
Example #35
def convert_regions():
    f = open('raw/regions.xml', 'r')
    data = f.read()
    f.close()
    soup = BeautifulStoneSoup(data)
    tp = Path('//regions')
    objs = tp.apply(soup)
    f = open('refined/regions.csv', 'w')
    f.write('\t'.join(['reg_id', 'name']) + '\n')
    for o in objs:
        rname = unicode(o.find('regname').string).encode('utf8', 'ignore')
        f.write('\t'.join([str(o.find('reg_id').string), rname]) + '\n')
Example #36
def show_vodcast_videos(rss_file):
    log('get_vodcasts started with rss_file=%s' % rss_file)
    r_media = re.compile('^media')
    url = MAIN_URL + rss_file
    rss = urlopen(url).read()
    e = BeautifulStoneSoup.XML_ENTITIES
    tree = BeautifulStoneSoup(rss, convertEntities=e)
    videos = []
    for item in tree.findAll('item'):
        if item.find(r_media):
            thumbnail = item.find(r_media)['url']
        else:
            thumbnail = 'DefaultVideo.png'
        videos.append({
            'title': item.title.string,
            'thumbnail': thumbnail,
            'url': item.enclosure['url'],
            'description': item.description.string
        })
    log('show_vodcast_videos finished with %d videos' % len(videos))
    return videos
Example #37
    def getDetailsForSerieByID(self, serieName, serieID):
        url = SERIE_DETAILS_URL % (urllib.quote(serieID))

        try:
            # Change the User Agent
            USER_AGENT = 'Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10'

            cj = cookielib.CookieJar()
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))

            req = urllib2.Request(url)
            req.add_header('User-Agent', USER_AGENT)

            resp = opener.open(req)

            soup = BeautifulStoneSoup(resp.read())
            resp.close()

            for banner in soup.banners.findAll('banner'):
                if banner.language.string == 'en':
                    if not 'Fanart' in KNOWN_SHOWS[serieName].keys(
                    ) and banner.bannertype.string == 'fanart':
                        KNOWN_SHOWS[serieName]['Fanart'] = str(
                            BANNER_URL % (banner.bannerpath.string))
                        if banner.thumbnailpath:
                            KNOWN_SHOWS[serieName]['FanartThumb'] = str(
                                BANNER_URL % (banner.thumbnailpath.string))
                    elif not 'Poster' in KNOWN_SHOWS[serieName].keys(
                    ) and banner.bannertype.string == 'poster':
                        KNOWN_SHOWS[serieName]['Poster'] = str(
                            BANNER_URL % (banner.bannerpath.string))
                        if banner.thumbnailpath:
                            KNOWN_SHOWS[serieName]['PosterThumb'] = str(
                                BANNER_URL % (banner.thumbnailpath.string))
                    elif not 'Season' in KNOWN_SHOWS[serieName].keys(
                    ) and banner.bannertype.string == 'season':
                        KNOWN_SHOWS[serieName]['Season'] = str(
                            BANNER_URL % (banner.bannerpath.string))
                        if banner.thumbnailpath:
                            KNOWN_SHOWS[serieName]['SeasonThumb'] = str(
                                BANNER_URL % (banner.thumbnailpath.string))
                    elif not 'Series' in KNOWN_SHOWS[serieName].keys(
                    ) and banner.bannertype.string == 'series':
                        KNOWN_SHOWS[serieName]['Series'] = str(
                            BANNER_URL % (banner.bannerpath.string))
                        if banner.thumbnailpath:
                            KNOWN_SHOWS[serieName]['SeriesThumb'] = str(
                                BANNER_URL % (banner.thumbnailpath.string))

            return KNOWN_SHOWS[serieName]
        except:
            print 'Error: ' + url
            return None
Example #38
def getVideoUrl(vod_sq, vod_key, selAltMovie=False):
    url = "http://v.nate.com/movie_url.php?mov_id=%s&v_key=%s&type=xml" % (
        vod_sq, vod_key)
    xml = urllib2.urlopen(url).read()
    #dom = xml.dom.minidom.parseString(xml)   # encoding error?
    soup = BeautifulStoneSoup(xml, fromEncoding='euc-kr')
    if selAltMovie:
        vid_url = urllib.unquote(soup.movie.mov_url_alt.string)
    else:
        vid_url = urllib.unquote(soup.movie.mov_url.string)
    img_url = soup.movie.master_thumbnail.url.string
    return (vid_url, img_url)
Example #39
def _google_checkout_post(url, params):
    u = urlparse("%s%s" % (url, g.GOOGLE_ID))
    conn = HTTPSConnection(u.hostname, u.port)
    auth = base64.encodestring('%s:%s' % (g.GOOGLE_ID, g.GOOGLE_KEY))[:-1]
    headers = {"Authorization": "Basic %s" % auth,
               "Content-type": "text/xml; charset=\"UTF-8\""}

    conn.request("POST", u.path, params, headers)
    response = conn.getresponse().read()
    conn.close()

    return BeautifulStoneSoup(response)
Example #40
 def calculate_collocations(self,
                            content,
                            collocation_measures=TrigramAssocMeasures,
                            collocation_finder=TrigramCollocationFinder):
     content = re.sub(r'&#8217;', '\'', content)
     content = re.sub(r'&amp;', '&', content)
     try:
         content = unicode(
             BeautifulStoneSoup(
                 content, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
     except ValueError, e:
         print "ValueError, ignoring: %s" % e
Example #41
 def get_routes(self):
     """
     Get all available routes.
     
     Return a dictionary with route ids as keys. Values are dictionaries
     of route attributes.
     """
     url = self._build_api_url('getroutes')
     
     xml = self._grab_url(url)
     soup = BeautifulStoneSoup(xml)
     
     routes = {}
     
     for tag in soup.findAll('route'):
         routes[str(tag.rt.string)] = {
             'id': str(tag.rt.string),
             'name': str(tag.rtnm.string)
             }
             
     return routes
Example #42
 def __init__(self, response_text, status_code):
     self.text = response_text
     self.xml = None
     try:
         self.json = json.loads(response_text, object_pairs_hook=SortedDict)
     except (JSONDecodeError, ValueError):
         if self.text[:5] == "<?xml":
             # perhaps it's XML?
             self.xml = BeautifulStoneSoup(self.text)
         # must be an awful code.
         self.json = None
     self.status_code = status_code
Example #43
def getChannels(url):
    response = open(url, 'rb')
    link = response.read()
    soup = BeautifulStoneSoup(link,
                              convertEntities=BeautifulStoneSoup.XML_ENTITIES)
    channels = soup('channel')
    for channel in channels:
        name = channel('name')[0].string
        thumbnail = channel('thumbnail')[0].string
        addDir(name, '', 2, thumbnail)
    else:
        INDEX()
Example #44
 def _start_job(self, job_id, time_to_live, priority, conversion_parameters):
     print "starting job"
     params = dict(jobId=job_id, timeToLive=time_to_live, priority=priority, conversionParameters=conversion_parameters)
     start_job_result = requests.get(
         "https://%s/%s/%s" % (self._service_host, self._service_base, self._start_job_uri),
         params=params,
         headers=self._http_header
     ).text
     # Although the PHP sample code loops through all startjobresult tags, we only grab the first one; we only need to return one anyway
     tag = BeautifulStoneSoup(start_job_result).find('startjobresult')
     print "starting job done"
     return JobInfo(soup_tag = tag)
Example #45
def get_series(series_name_search):
    """Return all possible matches for series_name_search in the chosen language."""
    url = "%sapi/GetSeries.php?seriesname=%s&language=%s" % (
        BASE_URL, urllib.quote(series_name_search), _LANGUAGE)
    soup = BeautifulStoneSoup(urllib2.urlopen(url).read())
    matches = []
    for series in soup.data.findAll("series"):
        matches.append(_parse_series(series))
    return matches
Example #46
def EPISODE(name, cid):
        showname = name
        xbmcplugin.setContent(pluginhandle, 'episodes')
        xbmcplugin.addSortMethod(pluginhandle, xbmcplugin.SORT_METHOD_NONE)
        url = 'http://www.tnt.tv/processors/services/getCollectionByContentId.do?offset=0&sort=&limit=200&id='+cid
        html=getURL(url)
        tree=BeautifulStoneSoup(html, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        episodes = tree.findAll('episode')
        for episode in episodes:
                episodeId = episode['id']
                name = episode.find('title').string
                thumbnail = episode.find('thumbnailurl').string
                plot = episode.find('description').string
                duration = episode.find('duration').string
                try:
                    seasonNum = int(episode.find('seasonnumber').string)
                    print seasonNum
                except:
                    seasonNum = 0
                try:
                    episodeNum = int(episode.find('episodenumber').string)
                    print episodeNum
                except:
                    episodeNum = 0
                if episodeNum == 0 or seasonNum == 0:
                    print 'bad season or episode value'
                else:
                    name = str(seasonNum)+'x'+str(episodeNum)+' - '+name
                segments = episode.findAll('segment')
                if len(segments) == 0:
                    url = episodeId
                    mode = 4
                    addLink(name,url,mode,thumbnail,plot,seasonNum,episodeNum,showname,duration)
                else:
                    url = ''
                    for segment in segments:
                            url += segment['id']+'<segment>'
                    mode = 5 #PLAYEPISODE
                    addLink(name,url,mode,thumbnail,plot,seasonNum,episodeNum,showname,duration)
        xbmcplugin.endOfDirectory(pluginhandle)
Example #47
def getFileTypes(url):
    #list filetypes
    p = re.compile('/details/(.*)')
    match = p.findall(url)
    for name in match:
        temp = 'http://www.archive.org/download/' + name + '/' + name + '_files.xml'
    link = getLink(temp)
    tree = BeautifulStoneSoup(link)

    shn = tree.findAll('file', attrs={"name": re.compile('(.+?\.shn$)')})
    m3u = tree.findAll('file', attrs={"name": re.compile('(.+?\.m3u$)')})
    flac = tree.findAll('file', attrs={"name": re.compile('(.+?\.flac$)')})
    mp3 = tree.findAll('file', attrs={"name": re.compile('(.+?64kb\.mp3$)')})
    vbr = tree.findAll('file', attrs={"name": re.compile('(.+?vbr\.mp3$)')})

    if len(m3u) > 0:
        addDir('.m3u Playlists', temp, 7)
    if len(flac) > 0:
        addDir('1. Flac Files', temp, 7)
    if len(mp3) > 0:
        addDir('2. VBR mp3', temp, 7)
    if len(vbr) > 0:
        addDir('3. 64kB mp3', temp, 7)
    if len(shn) > 0:
        addDir('1. Shorten Files', temp, 7)
Example #48
def parse_config(file_to_read):
    parsed = BeautifulStoneSoup(open(file_to_read).read())
    adapters = parsed.findAll('adapter')
    if (not adapters):
        adapters = parsed.findAll('interface')
    host_tag = parsed.find('hostname')
    if host_tag:
        host_name = host_tag.string.lower()
    else:
        host_name = None
    domain_tag = parsed.find('domainname')
    if domain_tag:
        domain_name = domain_tag.string
        if domain_name:
            domain_name = domain_name.lower()
    else:
        domain_name = None
    ip_list = []
    for adapter in adapters:
        mac = (adapter.find('address').string if adapter.find('address') else None)
        if mac:
            mac = mac.replace('-', ':').lower()
        adapter_ips = adapter.findAll('adapterip')
        for adapter_ip_node in adapter_ips:
            if (not adapter_ip_node):
                continue
            ip = ''
            for ip_address in adapter_ip_node.find('ip'):
                ip = ip_address.string.strip()
                if (not ip):
                    continue
                info = {'host_name': host_name, 'domain_name': domain_name, 'ip_address': ip, 'mac_address': mac}
                if ((info not in ip_list) and (ip != '127.0.0.1') and (':' not in ip)):
                    ip_list.append(info)
    return ip_list
Example #49
    def response(self):
        """Handle/parse the OnlineNIC API response."""
        soup = BeautifulStoneSoup(self.read())

        response = soup.find('response')
        if response is None:
            raise InvalidResponseError('No <response> container found.')

        contents = {}
        for key in [
                'code', 'msg', 'value', 'category', 'action', 'cltrid',
                'svtrid', 'chksum'
        ]:
            value = response.find(key)
            if value is None:
                raise InvalidResponseError(
                    'No {} found in response.'.format(key))
            contents[key] = value.string.strip()

        if contents['code'] in ONLINENIC_ERRORS:
            raise ONLINENIC_ERRORS[contents['code']]('{} [{}]'.format(
                contents['msg'], contents['value']))

        resdata = response.find('resdata')
        if resdata is not None:
            contents['data'] = {}
            for d in resdata.contents:
                if d is not None and d.string.strip():
                    key = d.get('name')
                    val = d.string.strip()
                    if key in contents['data'].keys():
                        if not isinstance(contents['data'][key], list):
                            existing_val = contents['data'][key]
                            contents['data'][key] = []
                            contents['data'][key].append(existing_val)
                        contents['data'][key].append(val)
                    else:
                        contents['data'][key] = val

        return contents
Example #50
def image_path_with_fgdc_to_world_file(image_path, world_file, srs, units="m"):
    image = Image.open(image_path)
    (width, height) = image.size

    xml_path = "%s.xml" % (os.path.splitext(image_path)[0])
    with open(xml_path, "r") as f:
        xml = BeautifulStoneSoup(f)

    north_bound = float(xml.find("northbc").text)
    west_bound = float(xml.find("westbc").text)
    south_bound = float(xml.find("southbc").text)
    east_bound = float(xml.find("eastbc").text)

    srs = "%s" % (srs)
    if not srs.startswith("EPSG:"):
        srs = "EPSG:%s" % (srs)

    (west_bound, north_bound) = latlng_to_srs(north_bound, west_bound, srs,
                                              units)
    (east_bound, south_bound) = latlng_to_srs(south_bound, east_bound, srs,
                                              units)

    x_pixel_width = (east_bound - west_bound) / width
    y_pixel_width = (south_bound - north_bound) / height

    for l in [x_pixel_width, 0, 0, y_pixel_width, west_bound, north_bound]:
        world_file.write("%s\n" % l)

    return world_file
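A worked sketch of the world-file arithmetic above, with invented bounds already projected to metres: a 1000x800 pixel image spanning 2000 m east-west and 1600 m north-south gives pixel sizes of 2.0 and -2.0.
width, height = 1000, 800
west_bound, east_bound = 500000.0, 502000.0
north_bound, south_bound = 4001600.0, 4000000.0
print (east_bound - west_bound) / width    # 2.0
print (south_bound - north_bound) / height  # -2.0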
Example #51
 def search(self, terms):
     torrents = []
     data = {'SearchString': '', 'SearchString1': terms, 'search': 'Search'}
     req = Request(self.search_uri, urlencode(data))
     req.add_header('User-Agent', self.user_agent)
     f = urlopen(req)
     soup = BeautifulStoneSoup(f.read())
     for (c, item) in enumerate(soup.findAll('a', {'class': 'magnet'})):
         if c == 30: break
         info = item.findPrevious('a')
         link = self.uri_prefix + info['href']
         item_req = Request(link)
         item_req.add_header('User-Agent', self.user_agent)
         item_f = urlopen(item_req)
         item_soup = BeautifulStoneSoup(item_f.read())
         sp = item_soup.findAll('span', {'class': re.compile('^stat_')})
         if sp:
             sp = [int(i.text.replace(',', '')) for i in sp]
         else:
             sp = [0, 0]
         torrents.append({
             'url': item['href'],
             'name': info.text,
             'seeds': sp[0],
             'leechers': sp[1]
         })
     return torrents
Example #52
def ask_whatizit(search_sent_list, client = None, pipeline = 'whatizitSwissprot'):
    """A function which queries the Whatizit tool using the SOAP client.

    Care is taken to ensure that identical sentences are not queried
    multiple times.

    Arguments:
    search_sent_list -- A LIST of sentences to search.
    client = None -- A SOAP client ... If None then one is created on the fly.
    pipeline = 'whatizitSwissprot' -- The pipeline to search.
    """

    if client is None:
        client = generate_whatizit_client()

    resdict = {}
    for sent in search_sent_list:
        if sent in resdict:
            yield resdict[sent]
            continue
        resp = client.service.contact(pipelineName = pipeline, 
                                        text = sent, 
                                        convertToHtml = False)
        soup = BeautifulStoneSoup(de_safe_xml(resp))
        if pipeline == 'whatizitSwissprot':
            groups = soup.findAll('z:uniprot')
            if groups:
                res = [(p.contents[0], p['ids'].split(',')) for p in groups]
            else:
                res = None
        elif pipeline == 'whatizitMeshUp':
            groups = soup.findAll('concepts')
            if groups:
                tmp = [x.contents[0].strip() for x in groups]
                ntmp = [x.split(';') for x in tmp]
                meshids = set(x.split(':')[0] for x in chain.from_iterable(ntmp))
                res = [(None, x) for x in sorted(meshids)]
        else:
            raise KeyError, 'Unknown pipeline: %s' % pipeline
        resdict[sent] = res
        yield res
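A hedged usage sketch: annotate a couple of made-up sentences with the default whatizitSwissprot pipeline; a SOAP client is created on the fly as described in the docstring.
sentences = ['BRCA1 interacts with BARD1.', 'TP53 is a tumour suppressor gene.']
for annotations in ask_whatizit(sentences):
    print annotations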
Example #53
def GET_RTMP(vid):
    url = 'http://www.adultswim.com/astv/mvpd/services/cvpXML.do?id=' + vid
    html = common.getURL(url)
    tree = BeautifulStoneSoup(html,
                              convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    print tree.prettify()
    sbitrate = int(common.settings['quality'])
    hbitrate = -1
    files = tree.findAll('file')
    for filenames in files:
        try:
            bitrate = int(filenames['bitrate'])
        except:
            bitrate = 1
        if bitrate > hbitrate and bitrate <= sbitrate:
            hbitrate = bitrate
            filename = filenames.string
    if 'http://' in filename:
        filename = filename
        return filename
    else:
        filename = filename[1:len(filename) - 4]
        serverDetails = tree.find('akamai')
        server = serverDetails.find('src').string.split('://')[1]
        #get auth
        tokentype = serverDetails.find('authtokentype').string
        window = serverDetails.find('window').string
        aifp = serverDetails.find('aifp').string
        auth = getAUTH(aifp, window, tokentype, vid,
                       filename.replace('mp4:', ''))
        #swfUrl = 'http://www.tbs.com/cvp/tbs_video.swf swfvfy=true'
        rtmp = 'rtmpe://' + server + '?' + auth + ' playpath=' + filename  #+" swfurl="+swfUrl
        return rtmp
Example #54
def message_cb(word, word_eol, userdata):   
    message = word[1]
    # Check whether the message equals !news
    if(message == "!news"):

        # List used to index the feed server names
        rss_servers_names = ['lifehacker', 'linux-journal', 'revista-info', 'gizmodo', 'lol-cats']

        # Dictionary mapping servers to their feed links
        rss_servers = {
            'lifehacker': 'http://feeds.gawker.com/lifehacker/full.xml',
            'linux-journal': 'http://feeds.feedburner.com/LinuxJournal-BreakingNews',
            'revista-info': 'http://feeds.feedburner.com/Plantao-INFO',
            'gizmodo': 'http://feeds.gawker.com/gizmodo/full',
            'lol-cats': 'http://feeds.feedburner.com/lolcats/rss',
        }
       

        # Initialize the HTTP library
        http = httplib2.Http()

        # Issue the request against a server chosen at random from the list
        # - status -> response headers
        # - response -> body of the XML file
        status, response = http.request(rss_servers[rss_servers_names[randint(0, len(rss_servers_names)-1)]])

        # Initialize the soup with the XML content
        soup = BeautifulStoneSoup(response)

        # Find all items in the XML; each item represents a news story
        all_news = soup.findAll("item");

        # Recover the start of the message up to the first blank space
        message = word[1]
   
        # Randomly select one of the news stories in the list
        selected = randint(0, len(all_news)-1)

        # Send the message on IRC
        xchat.command("ME "+all_news[selected].title.string + " - " + all_news[selected].link.string)       
Example #55
 def playitems(self, params):
     print params
     print "@1"
     soup = BeautifulSoup(geturl(params['url']))
     id = dict(
         it.split('=', 1) for it in urllib.unquote(
             soup.find("embed")['flashvars']).split('&'))['vid']
     if 0:
         soup = BeautifulStoneSoup(
             geturl(
                 "http://cosmos.bcst.yahoo.com/rest/v2/pops;id=%s;lmsoverride=1"
                 % id))
         val = {
             "title": soup.channel.item.title,
             "descr": soup.channel.item.description,
             "date": soup.channel.item.find("media:pubStart"),
         }
     print "@@"
     soup = BeautifulStoneSoup(
         geturl(
             "http://cosmos.bcst.yahoo.com/rest/v2/pops;id=%s;lmsoverride=1;element=stream;bw=1200"
             % id))
     print soup
     item = soup.channel.item.find('media:content')
     val = {
         "url":
         "%s playpath=%s swfurl=%s swfvfy=true" %
         (item['url'], item['path'],
          "http://d.yimg.com/m/up/ypp/au/player.swf"),
         "duration":
         item['duration'],
         "name":
         re.sub(r'<!\[CDATA\[([^\]+])\]\]', '',
                soup.channel.item.title.contents[0])
     }
     print("@2", val)
     if "record" in params:
         self.record(val)
     else:
         self.play(val)
Example #56
def inlines(value, return_list=False):
    try:
        from BeautifulSoup import BeautifulStoneSoup
    except ImportError:
        from beautifulsoup import BeautifulStoneSoup

    content = BeautifulStoneSoup(
        value,
        selfClosingTags=['inline', 'img', 'br', 'input', 'meta', 'link', 'hr'])
    inline_list = []

    if return_list:
        for inline in content.findAll('inline'):
            rendered_inline = render_inline(inline)
            inline_list.append(rendered_inline['context'])
        return inline_list
    else:
        for inline in content.findAll('inline'):
            rendered_inline = render_inline(inline)
            if rendered_inline:
                inline.replaceWith(
                    BeautifulStoneSoup(
                        render_to_string(rendered_inline['template'],
                                         rendered_inline['context'])))
            else:
                inline.replaceWith(BeautifulStoneSoup(''))
        return mark_safe(content)
Example #57
    def popular(self, terms):
        torrents = []
        url = "http://1337x.to/popular-" + terms
        xbmc.log("url: %s" % (url), xbmc.LOGSEVERE)

        f = urlopen(url)
        soup = BeautifulStoneSoup(f.read())
        for table in soup.findAll(
                'table',
            {'class': 'table-list table table-responsive table-striped'}):
            for row in table.find('tbody').findAll('tr'):
                xbmc.log("row: %s" % (row), xbmc.LOGSEVERE)
                details = row.find('td', {"class": "coll-1 name"})
                size = row.find('td', {"class": re.compile("coll-4.*")})
                #            details=row.find('td[class*="coll-1 name"]');
                #            size=row.find('td[class*="coll-4"]');
                #            details=row.find('td', class_='coll-1 name');
                #            size=row.find('td',class_=re.compile('coll-4 .*'));
                name = details.text
                test = 'http://1337x.to' + details.find(
                    'a', {"class": None})['href']

                magnet = ""
                seeds = 0
                leechers = 0
                torrents.append({
                    'url':
                    test,
                    'name':
                    name.encode('ascii', 'ignore').decode('ascii'),
                    'size':
                    size.text,
                    'seeds':
                    seeds,
                    'leechers':
                    leechers,
                    'magnet':
                    magnet,
                })
        return torrents
Example #58
def initServerInfoBase(fileName):
    """
    @description: Initializes soup for the BeautifulSoup parser. Reads the existing data from the file given by fileName.
    @todo: None
    @param fileName: String, name of the file to be loaded into soup.
    @return: Tuple (soup, Boolean); the Boolean is True on success, else False.
    """
    if os.path.exists(fileName):
        try:
            f = open(fileName, "r")
        except:
            return None, False
        xml = f.read()
        f.close()
        soup = BeautifulStoneSoup(xml)
        serverinfolist = soup.findAll("serverinfo")
    else:
        serverinfolist = []
        soup = BeautifulSoup()
        xml = "null"

    if len(serverinfolist) == 0:
        serverinfo = Tag(soup, "serverinfo")
        soup.insert(0, serverinfo)

    return soup, True
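A minimal usage sketch; the file name is hypothetical.
soup, ok = initServerInfoBase('serverinfo.xml')
if ok:
    print soup.prettify()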
Example #59
def getEpsLegendados(url):
    link = openURL(url)
    soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
    eps = soup.findAll("div", {"class": "well well-sm"})

    plotE = re.findall('<span itemprop="description">\s*(.*?)</span>', link,
                       re.DOTALL | re.MULTILINE)[0]
    plotE = unicode(
        BeautifulStoneSoup(
            plotE,
            convertEntities=BeautifulStoneSoup.HTML_ENTITIES)).encode('utf-8')
    plotE = BeautifulSoup(plotE.replace("<br>", " ")).text

    totE = len(eps)

    try:
        anterior = re.findall('href="(.*?)">Voltar</a></li>', link)[0]
        primeira = re.findall('href="(.*?)">Primeiro</a></li>', link)[0]
        proxima = re.findall('href="(.*?)">Avançar</a></li>', link)[0]
        pa = re.findall('([0-9]+?)$', anterior)[0]
        pd = re.findall('([0-9]+?)$', primeira)[0]
        pp = re.findall('([0-9]+?)$', proxima)[0]
        if (pp != '2'):
            addDir('. Primeira Página', base + primeira, 31,
                   artfolder + 'pagantr.jpg')
        if (pp != '2'):
            addDir('<< Página Anterior ' + pa, base + anterior, 31,
                   artfolder + 'pagantr.jpg')
    except:
        pass

    for ep in eps:
        try:
            titE = ep.img["title"].encode('ascii', 'ignore')
            urlE = base + ep.a["href"]
            if ep.a.img.has_key("src"): imgE = ep.a.img["src"]
            else: imgE = ep.a.img["data-cfsrc"]
            addDir(titE, urlE, 100, imgE, False, totE, plotE)
        except:
            pass

    try:
        ultima = re.findall('href="(.*?)">Último</a></li>', link)[0]
        pu = re.findall('([0-9]+?)$', ultima)[0]
        if (pu != '1'):
            addDir('Página Seguinte ' + pp + ' >>', base + proxima, 31,
                   artfolder + 'proxpag.jpg')
        if (pu != '1'):
            addDir('Última Página ' + pu + ' >>', base + ultima, 31,
                   artfolder + 'proxpag.jpg')
    except:
        pass
Example #60
def main():
    """Generate a list of all the morphological tags in an XML document."""

    in_file = codecs.open("herodotus.xml", "rU", "utf-8")

    print "Parsing the input file with BeautifulStoneSoup..."
    print

    soup = BeautifulStoneSoup(in_file)

    print "Finding all the tokens..."
    print

    tokens = soup.findAll('w')

    out_file = codecs.open("HDT-morph-list.txt", "w", "utf-8")

    out_file2 = codecs.open("HDT-pos-list.txt", "w", "utf-8")

    unique_tags = Set([])

    short_tags = Set([])

    for token in tokens:
        try:
            tag = token['pos']
            if tag != "":
                unique_tags.add(tag)
                short_tag = tag[:2]
                short_tags.add(short_tag)
            
        except KeyError:
            pass
    
    for tag in unique_tags:
        print >> out_file, tag

    for tag in short_tags:
        print >> out_file2, tag