Example #1
    def parse(self, url, data, fromEncoding=None):
        soup = BeautifulSoup(data, fromEncoding=(fromEncoding or "windows-1252"))

        title = soup.fetch("title")[0].string
        fnr = soup.fetch("h1")[0].string.split(".", 1)[0]

        m = DATE_RE.search(title)
        year = int(m.group(1))
        mon = int(m.group(2))
        day = int(m.group(3))
        hour = int(m.group(4))
        minu = int(m.group(5))

        existing = Fundur.objects.filter(fnr=fnr)
        if existing:
            fn = existing[0]
        else:
            fn = Fundur()

        fn.titill = title
        fn.lth, fn.fnr = url_to_lth_fnr(url)
        fn.dags = "%4.4d-%2.2d-%2.2d %2.2d:%2.2d" % (year, mon, day, hour, minu)
        fn.save()

        return ScraperParserHTML.parse(self, url, data, soup=soup)
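Example #1 references a module-level DATE_RE that is not shown here. A minimal sketch of a compatible pattern, assuming the page title carries a year-first timestamp (the real title format may differ), is:

import re

# Hypothetical pattern, not the project's actual DATE_RE: it only needs to expose
# five integer groups in the order consumed above (year, month, day, hour, minute).
DATE_RE = re.compile(r"(\d{4})-(\d{1,2})-(\d{1,2})\D+(\d{1,2}):(\d{2})")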
Example #2
def parseMultiselect(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)

    aList = soup.fetch("a", {"href": "/weather/local/%"})
    aList += soup.fetch("a", {"href": "/outlook/travel/local/%"})
    aList += soup.fetch("a",
                        {"href": "/outlook/travel/businesstraveler/local/%"})

    lastCode = ""
    resultsCount = 0
    for aItem in aList:
        afterLocal = aItem['href'].split("local/")
        if 2 == len(afterLocal):
            textAfterLocal = afterLocal[1]
            if 8 < len(textAfterLocal):
                code = textAfterLocal[:8]
                textAfterLocal = textAfterLocal[8:]
                if textAfterLocal.startswith("?from=search_"):
                    if -1 == lastCode.find(code):
                        lastCode += code
                        text = getAllTextFromTag(aItem)
                        resultsCount += 1
                        returned.append((text, code))
    if 0 == resultsCount:
        return (LOCATION_UNKNOWN, None)
    return (LOCATION_MULTISELECT, universalDataFormatReplaceEntities(returned))
Example #3
def parseMultiselect(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)

    aList = soup.fetch("a", {"href":"/weather/local/%"})
    aList += soup.fetch("a", {"href":"/outlook/travel/local/%"})
    aList += soup.fetch("a", {"href":"/outlook/travel/businesstraveler/local/%"})


    lastCode = ""
    resultsCount = 0
    for aItem in aList:
        afterLocal = aItem['href'].split("local/")
        if 2 == len(afterLocal):
            textAfterLocal = afterLocal[1]
            if 8 < len(textAfterLocal):
                code = textAfterLocal[:8]
                textAfterLocal = textAfterLocal[8:]
                if textAfterLocal.startswith("?from=search_"):
                    if -1 == lastCode.find(code):
                        lastCode += code
                        text = getAllTextFromTag(aItem)
                        resultsCount += 1
                        returned.append((text,code))
    if 0 == resultsCount:
        return (LOCATION_UNKNOWN,None)
    return (LOCATION_MULTISELECT,universalDataFormatReplaceEntities(returned))
Example #4
def _spider_book_info(url, letter):
    try:
        html = getHttp(url, handleException = False)
        soup = BeautifulSoup()
        soup.feed(html)
        h1 = soup.first("h1")
        if h1 is None:
            return None

        assert h1 is not None
        title = retrieveContents(h1).decode("iso-8859-1")

        subtitle = None
        author = None
        code = None

        labels = [retrieveContents(x) for x in soup.fetch("span", {"class": "title-label"})]
        data = soup.fetch("span", {"class": "title-data"})
        try:
            index = labels.index("Subtitle")
            subtitle = retrieveContents(data[index]).decode("iso-8859-1")
        except ValueError:
            pass

        try:
            index = labels.index("Author")
            author = retrieveContents(data[index].first("a")).decode("iso-8859-1")
        except ValueError:
            pass

        try:
            index = labels.index("Language")
            href = str(data[index].first("a", {"href": "/language.php?code=%"})["href"])
            code = href[19:href.find("&", 19)].decode("iso-8859-1")
        except ValueError:
            pass

        tid = soup.first("input", {"type": "hidden", "name": "tid"})
        assert tid is not None
        book_id = tid["value"].decode("iso-8859-1")

        print (u"%s: \"%s\"" % (author, title)).encode("iso-8859-1", "ignore")

        sel = soup.first("select", {"name": "book"})
        assert sel is not None
        opts = sel.fetch("option")
        formats = []
        for opt in opts:
            try:
                format = retrieveContents(opt).split()[0]
                if format not in ebooks.FORMATS:
                    continue

                val = opt["value"]
                formats.append((format, val))

            except Exception, ex:
                log(SEV_EXC, exceptionAsStr(ex))
        formats.sort()
        return (url, title, subtitle, author, book_id, code, formats)
Example #5
def parseFirstDayHtml(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)

    # sky, now, feelsLike, wind, hum, pressure, dew point, visibility
    returned = [None, None, None, None, None, None, None, "N/A"]

    bItems = soup.fetch("b", {"class": "obsTextA"})
    if len(bItems) == 2:
        returned[0] = getAllTextFromTag(bItems[0]).strip()
        temp = getAllTextFromTag(bItems[1]).strip().split("Like ")
        if len(temp) > 1:
            returned[2] = temp[1].replace("&deg;F", "").strip()

    bItem = soup.first("b", {"class": "obsTempTextA"})
    if bItem:
        returned[1] = getAllTextFromTag(bItem).replace("&deg;F", "").strip()

    tdList = soup.fetch("td", {"class": "obsTextA"})
    if len(tdList) == 8:
        tdList = tdList[1::2]
        assert (len(tdList) == 4)
        returned[3] = getAllTextFromTag(tdList[0]).strip()
        returned[4] = getAllTextFromTag(tdList[1]).replace("%", "").strip()
        returned[5] = getAllTextFromTag(tdList[2]).replace(
            "in.", "inches").strip()  ##todo: down, up, ...
        returned[6] = getAllTextFromTag(tdList[3]).replace("&deg;F",
                                                           "").strip()

    for r in returned:
        if r == None or r == "":
            return None
    return returned
Example #6
def parseName(htmlTxt):
    # this is funny
    htmlTxt = htmlTxt.replace("<! -- ", "<!---")
    soup = BeautifulSoup()
    soup.feed(htmlTxt)

    # no results
    fontList = soup.fetch("font", {"face": "arial"})
    for fontItem in fontList:
        iItem = fontItem.first("i")
        if iItem:
            if str(iItem.contents[0]).startswith("Your search for"):
                return (NO_RESULTS, sNoResultsText)

    # get table data
    trList = soup.fetch("tr", {"bgcolor": "#ffffff"})
    resultsCount = 0
    outerList = []
    for trItem in trList:
        tdList = trItem.fetch("td")
        if 5 == len(tdList):
            symbol = getAllTextFromTag(tdList[0]).strip()
            url = tdList[0].first("a")['href']
            name = getAllTextFromTag(tdList[1]).strip()
            market = getAllTextFromTag(tdList[2]).strip()
            industry = getAllTextFromTag(tdList[3]).strip()
            outerList.append((url, symbol, name, market, industry))
            resultsCount += 1

    # no results?
    if 0 == resultsCount:
        return (NO_RESULTS, sNoResultsText)

    return (STOCKS_LIST, universalDataFormatReplaceEntities(outerList))
Example #7
def parseFirstDayHtml(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)

    # sky, now, feelsLike, wind, hum, pressure, dew point, visibility
    returned = [None, None, None, None, None, None, None, "N/A"]

    bItems = soup.fetch("b", {"class":"obsTextA"})
    if len(bItems) == 2:
        returned[0] = getAllTextFromTag(bItems[0]).strip()
        temp = getAllTextFromTag(bItems[1]).strip().split("Like ")
        if len(temp) > 1:
            returned[2] = temp[1].replace("&deg;F","").strip()

    bItem = soup.first("b", {"class":"obsTempTextA"})
    if bItem:
        returned[1] = getAllTextFromTag(bItem).replace("&deg;F","").strip()

    tdList = soup.fetch("td", {"class":"obsTextA"})
    if len(tdList) == 8:
        tdList = tdList[1::2]
        assert (len(tdList) == 4)
        returned[3] = getAllTextFromTag(tdList[0]).strip()
        returned[4] = getAllTextFromTag(tdList[1]).replace("%","").strip()
        returned[5] = getAllTextFromTag(tdList[2]).replace("in.","inches").strip() ##todo: down, up, ...
        returned[6] = getAllTextFromTag(tdList[3]).replace("&deg;F","").strip()

    for r in returned:
        if r == None or r == "":
            return None
    return returned
Example #8
def parse(wordtosearch):

    url = 'http://dictionary.reference.com/search?q=' + wordtosearch
    # Read the URL and pass it to BeautifulSoup.
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup()
    soup.feed(html)

    # Read the main table, extracting the words from the table cells.
    maintable = soup.fetch('li')

    # There are 6 lines containing <li> at the bottom that we don't want to print,
    # so we remove them from the list by adjusting the count.
    removeli = len(maintable) - 6

    counter = 0
    # if removeli is 0 then we need to look for dl tags
    if removeli == 0:
        # fetch dl tags
        maintable = soup.fetch('dl')
        for defs in maintable:
            converttostring = str(defs)
            splitstring = converttostring.split('<dd>')
            removetrash = re.sub(
                '^ |</dd.*dl>|<li.*">|<ol.*">|<cite> </li>|<cite>|</cite>|</ol></li>|</li>|<a.*/a>',
                '', splitstring[1])
            addunderscores = re.sub('<u><i>|</i></u>', '_', removetrash)
            convertampersands = re.sub('&amp;', '&', addunderscores)
            definition = convertampersands
            print definition
    else:
        for counter in range(removeli):
            defs = maintable[counter]
            converttostring = str(defs)
            splitstring = converttostring.split('<li>')
            if len(splitstring) != 1:
                removetrash = re.sub(
                    '^ |(<li.*">|<ol.*">|<cite> </li>|<cite>|</cite>|</ol></li>|</li>|<a.*/a>)',
                    '', splitstring[1])
                addunderscores = re.sub('(<u><i>|</i></u>)', '_', removetrash)
                convertampersands = re.sub('&amp;', '&', addunderscores)
                definition = convertampersands
                print definition

            else:
                removetrash = re.sub(
                    '^ |<li.*">|<ol.*">|<cite> </li>|<cite>|</cite>|</ol></li>|</li>|<a.*/a>',
                    '', splitstring[0])
                addunderscores = re.sub('<u><i>|</u></i>', '_', removetrash)
                convertampersands = re.sub('&amp;', '&', addunderscores)
                definition = convertampersands
                print definition

            counter += 1
Example #9
 def parse_posts(self, posts):
   '''Parses posts from table/ajax_getposts.'''
   rposts = []
   for post in posts:
     soup = BeautifulSoup(post)
     rposts.append({
         'date': soup.fetch('tr')[1].fetch('font')[0].text,
         'link': soup.fetch('tr')[1].fetch('font')[1].a['href'],
         'html': soup.fetch('tr')[2].fetch('td')[1],
         'plaintext': soup.fetch('tr')[2].fetch('td')[1].text,
     })
   return rposts
Example #10
    def action(self):
        resp = self.browser.open(self.BASE_URL)
        html = resp.read()
        soup = BeautifulSoup(html)

        news_chunk1 = soup.fetch('div',{'class':'item'})
        news_chunk2 = soup.fetch('div',{'class':'headline'})
        news_chunk3 = soup.fetch('div',{'class':'body-copy'})

        for new in news_chunk1:
            head = new.findAll('a')
            for hd in head:
                print_ = str(hd).split('=')
                print print_[3].replace('''"thumb" src''','').replace('onclick', '').replace('"','').replace('&nbsp;', '')
Example #11
 def test_render_outline (self):
     c = template.Context({"package" : self.package})
     
     t = template.Template('''
     {% load mainpage_extras %}
     {% render_outline package %}
     ''')
     output = t.render(c)
     
     soup = BeautifulSoup(output)
     root = soup.find(attrs={'nodeid' : '1'})
     self.assertTrue('Root' in root.contents[0])
     self.assertEquals(len(soup.fetch('li')), 5)
     self.assertEquals(len(soup.fetch('a')), 5)
     self.assertEquals(len(soup.fetch('ul')), 3)
Example #12
    def test_render_outline(self):
        c = template.Context({"package": self.package})

        t = template.Template('''
        {% load mainpage_extras %}
        {% render_outline package %}
        ''')
        output = t.render(c)

        soup = BeautifulSoup(output)
        root = soup.find(attrs={'nodeid': '1'})
        self.assertTrue('Root' in root.contents[0])
        self.assertEquals(len(soup.fetch('li')), 5)
        self.assertEquals(len(soup.fetch('a')), 5)
        self.assertEquals(len(soup.fetch('ul')), 3)
Example #13
File: d.py Project: kg-bot/SupyBot
def parse(wordtosearch):

    url = 'http://dictionary.reference.com/search?q=' + wordtosearch
    # Read the URL and pass it to BeautifulSoup.
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup()
    soup.feed(html)

    # Read the main table, extracting the words from the table cells.
    maintable = soup.fetch('li')

    # There are 6 lines containing <li> at the bottom that we don't want to print,
    # so we remove them from the list by adjusting the count.
    removeli = len(maintable) - 6

    counter = 0
    # if removeli is 0 then we need to look for dl tags
    if removeli == 0:
        # fetch dl tags
        maintable = soup.fetch('dl')
        for defs in maintable:
            converttostring = str(defs)
            splitstring = converttostring.split('<dd>') 
            removetrash = re.sub('^ |</dd.*dl>|<li.*">|<ol.*">|<cite> </li>|<cite>|</cite>|</ol></li>|</li>|<a.*/a>', '', splitstring[1])
            addunderscores = re.sub('<u><i>|</i></u>', '_', removetrash)
            convertampersands = re.sub('&amp;', '&', addunderscores)
            definition = convertampersands
            print definition
    else:    
        for counter in range(removeli):
            defs = maintable[counter]
            converttostring = str(defs)
            splitstring = converttostring.split('<li>')
            if len(splitstring) != 1:
                removetrash = re.sub('^ |(<li.*">|<ol.*">|<cite> </li>|<cite>|</cite>|</ol></li>|</li>|<a.*/a>)', '', splitstring[1])
                addunderscores = re.sub('(<u><i>|</i></u>)', '_', removetrash)
                convertampersands = re.sub('&amp;', '&', addunderscores)
                definition = convertampersands
                print definition
                
            else:
                removetrash = re.sub('^ |<li.*">|<ol.*">|<cite> </li>|<cite>|</cite>|</ol></li>|</li>|<a.*/a>', '', splitstring[0])
                addunderscores = re.sub('<u><i>|</u></i>', '_', removetrash)
                convertampersands = re.sub('&amp;', '&', addunderscores)
                definition = convertampersands
                print definition

            counter += 1
Example #14
def reverseAreaCodeLookup(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = m411testNoResults(soup)
    if NO_RESULTS == noResults:
        return (noResults,m411NoResultsText)
    # results
    tableList = soup.fetch("table", {"id":"listings"})
    if len(tableList) != 1:
        return UNKNOWN_FORMAT, None

    trList = tableList[0].fetch("tr")
    if len(trList) == 0:
        return UNKNOWN_FORMAT, None
    # ignore headers ([1:])
    for trItem in trList[1:]:
        if 0 == len(trItem.fetch("tr")):  # they have something screwed up with nested <tr><tr> ... </tr></tr>
            tdList = trItem.fetch("td", {"id":"subtextid"})
            if 3 == len(tdList):
                city     = getAllTextFromTag(tdList[0])
                country  = getAllTextFromTag(tdList[1])
                timezone = getAllTextFromTag(tdList[2])
                smallList = (city,country,timezone)
                returned.append(smallList)
            elif 2 == len(tdList):
                city     = getAllTextFromTag(tdList[0])
                country  = ""
                timezone = getAllTextFromTag(tdList[1])
                smallList = (city,country,timezone)
                returned.append(smallList)

    if 0 == len(returned):
        return (UNKNOWN_FORMAT,m411UnknownFormatText)
    return (RESULTS_DATA,universalDataFormatReplaceEntities(returned))
Example #15
def internationalCodeSearch(htmlTxt):
    result = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = m411testNoResults(soup)
    if NO_RESULTS == noResults:
        return (noResults,m411NoResultsText)
    # results
    tableList = soup.fetch("table",{"id":"listings"})
    if len(tableList) != 1:
        return UNKNOWN_FORMAT, None
    trList = tableList[0].fetch("tr")
    cityCodeList = []
    for trItem in trList:
        if 0 == len(trItem.fetch("tr")):
            tdList = trItem.fetch("td", {"id":"subtextid"})
            if 2 == len(tdList):
                if 0 == len(result):
                    result.append([getAllTextFromTag(tdList[1])])
                else:
                    city = getAllTextFromTag(tdList[0])
                    code = getAllTextFromTag(tdList[1])
                    cityCodeList.append((city,code))
    # sort the (city,code) list by city
    cityCodeList.sort(sortByCityFunc)
    for el in cityCodeList:
        result.append(el)
    if 0 == len(result):
        return UNKNOWN_FORMAT, None
    return (RESULTS_DATA,universalDataFormatReplaceEntities(result))
Example #16
 def __init__(self, url='http://www.synthetic.org/play.html'):
     self.playlists = []
     page = BeautifulSoup(urlopen(url))
     for link in page.fetch('a', {'href':re.compile(r'play/rsa\d+\.htm')}):
         iterMonthLists = iterMonthPlayLists(urljoin(url, link['href']))
         try: self.playlists.extend(iterMonthLists)
         except ValueError: warn('unparsed link: %s' % link)
Example #17
def reverseZIPCodeLookup(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    res = testNoResults(soup)
    if RESULTS_DATA != res:
        return (res, None)
    # results (one? we handle more than one)
    tables = soup.fetch("table", {"summary":"Codes Results"})
    if 0 == len(tables):
        return (UNKNOWN_FORMAT,m411UnknownFormatText)
    trList = []
    for tab in tables:
        trList += tab.fetch("tr")
    for tr in trList:
        tdList = tr.fetch("td")
        if len(tdList) == 3:
            city     = getAllTextFromTag(tdList[0])
            country  = getAllTextFromTag(tdList[1])
            timezone = getAllTextFromTag(tdList[2])
            if city != "New Search":
                smallList = (city,country,timezone)
                returned.append(smallList)
        elif len(tdList) == 2: #special case (911)
            city     = getAllTextFromTag(tdList[0])
            country  = getAllTextFromTag(tdList[1])
            if city != "New Search":
                smallList = (city,country,"")
                returned.append(smallList)
    if len(returned) == 0:
        return (UNKNOWN_FORMAT,m411UnknownFormatText)
    return (RESULTS_DATA,universalDataFormatReplaceEntities(returned))
Example #18
def parseFirstDayHtmlYahoo(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)

    # sky, now, feelsLike, wind, hum, pressure, dew point, visibility
    returned = [None, None, None, None, None, None, None, None]

    fontList = soup.fetch("font",{"face":"Arial", "size":"2"})
    list = []
    wasFeelsLike = False
    for f in fontList:
        text = getAllTextFromTag(f).strip()
        if wasFeelsLike:
            list.append(text)
        else:
            if text == "Feels Like:":
                list.append(text)
                wasFeelsLike = True
    if len(list) >= 16:
        smallList = list[1::2]
        returned[0] = ""
        returned[1] = smallList[0].replace("&deg;","")
        returned[2] = smallList[0].replace("&deg;","")
        returned[3] = smallList[3]
        returned[4] = smallList[4].replace("%","")
        returned[5] = smallList[2]
        returned[6] = smallList[1].replace("&deg;","")
        returned[7] = smallList[6]

    for r in returned:
        if r == None:
            return None
    return returned
Example #19
def internationalCodeSearch(htmlTxt):
    result = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    res = testNoResults(soup)
    if RESULTS_DATA != res:
        return (res, None)
    # Country code(s)
    tableList = soup.fetch("table", {"summary":"Codes Results"})
    if len(tableList) != 1:
        return (UNKNOWN_FORMAT,m411UnknownFormatText)
    # found country code
    tdListA = tableList[0].fetch("td",{"style":"%padding-left:5px;"})
    tdListB = tableList[0].fetch("td",{"style":"%line-height:14pt;"})
    if len(tdListA) != len(tdListB):
        return (UNKNOWN_FORMAT,m411UnknownFormatText)
    cityCodeList = []
    for i in range(len(tdListA)):
        if 0 == i:
            result.append([getAllTextFromTag(tdListB[i])])
        else:
            city = getAllTextFromTag(tdListA[i])
            code = getAllTextFromTag(tdListB[i])
            cityCodeList.append((city,code))
    # sort the (city,code) list by city
    cityCodeList.sort(sortByCityFunc)
    for el in cityCodeList:
        result.append(el)
    return (RESULTS_DATA,universalDataFormatReplaceEntities(result))
Example #20
def parseCurrency(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    #TABLE WIDTH="100%" BORDER="0" CELLPADDING="0" CELLSPACING="0" BGCOLOR="#009900"
    #<TABLE WIDTH="100%" BORDER="0" CELLPADDING="1" CELLSPACING="1" BGCOLOR="#000000">
    findTable = soup.fetch(
        "table", {
            "width": "100%",
            "border": "0",
            "cellpadding": "1",
            "cellspacing": "1",
            "bgcolor": "#000000"
        })
    #print findTable
    if not findTable:
        return (UNKNOWN_FORMAT, currencyNoResultsText)
    itemTable = findTable[0]
    findTableTR = itemTable.fetch("tr")
    #Parse page and create dictionary
    for itemTR in findTableTR:
        findTD = itemTR.fetch("td")
        if 0 == len(findTD):
            continue
        if 4 != len(findTD):
            return (UNKNOWN_FORMAT, currencyNoResultsText)
        #print str(findTD[1].contents[0].contents[0].contents[0])
        #print str(findTD[2].contents[0].contents[0]).replace(",","").strip()
        abbrev = str(findTD[1].contents[0].contents[0].contents[0])
        g_AbbrevToRatesDict[abbrev] = float(
            str(findTD[2].contents[0].contents[0]).replace(",", "").strip())
    g_AbbrevToRatesDict["USD"] = 1.0
    return (RESULTS_DATA, g_AbbrevToRatesDict)
Example #21
def parseGasOld(htmlTxt, url=None, dbgLevel=0):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)

    testTitle = soup.first("title")
    if testTitle:
        if getAllTextFromTag(testTitle).startswith("GasBuddy.com - Find cheap gas prices in your city"):
            return (LOCATION_UNKNOWN, gLocationUnknownText)

    outerList = []
    trList = soup.fetch("tr")
    for trItem in trList:
        tdList = trItem.fetch("td")
        if 8 == len(tdList):
            if tdList[1].first("table"):
                price = getAllTextFromTag(tdList[0]).strip()
                name = getAllTextFromTag(tdList[2]).strip()
                address = getAllTextFromTag(tdList[4]).strip()
                area = getAllTextFromTag(tdList[5]).strip()
                time = getAllTextFromTag(tdList[6]).strip()
                smallList = [price, name, address, area, time]
                outerList.append(smallList)
        else:
            if 0 != len(tdList):
                firstB = tdList[0].first("b")
                if firstB:
                    if getAllTextFromTag(firstB).startswith("No gas prices found."):
                        return (NO_RESULTS, gNoResultsText)

    if 0 == len(outerList):
        if dbgLevel > 0:
            print "len(outerList)==0"
        return parsingFailed(url, htmlTxt)

    return (GAS_DATA, universalDataFormatReplaceEntities(outerList))
Example #22
    def _search_serie_http(self, searchterms):
        """Search for a serie and return its episode list page URL"""

        # google power!
        url = "http://www.google.com/search?hl=en&q=site:epguides.com%%20%s"
        search = "%s %s"
        search = urllib.quote(search % (searchterms, '"(a Titles and Air Dates Guide)"'))

        f = urllib.urlopen(url % search)
        bs = BeautifulSoup(f)
        print bs
        if not bs: return False

        results = []
    
        # tidy up the search results
        for url in bs.fetch("a", {"href":re.compile("http://epguides.com/")}):
            url = url['href']
        
            # only add serie summary pages (don't end with .html)
            if url.endswith("/"):
                results.append(url)

        if not results: return False

        # The first result is (usually) the correct one
        return results[0]
Example #23
    def email(self):
        url = 'http://pgp.mit.edu/pks/lookup?op=index&search=%s' % self.artifact[
            'name']

        try:
            status, response = get(url, headers=self.headers)

            if status:
                if 'No results found' in response.text:
                    pass
                else:
                    data = BeautifulSoup(response.text)
                    hrefs = data.fetch('a')

                    for href in hrefs:
                        content = href.contents

                        if self.artifact['name'] in content[0]:
                            try:
                                name = content[0].split('&lt;')[0]
                                if isinstance(self.artifact['data']['pgp'],
                                              list):
                                    self.artifact['data']['pgp'].append(name)
                                else:
                                    self.artifact['data']['pgp'] = []
                                    self.artifact['data']['pgp'].append(name)
                            except IndexError:
                                pass

        except:
            pass
Example #24
def internationalCodeSearch(htmlTxt):
    result = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    res = testNoResults(soup)
    if RESULTS_DATA != res:
        return (res, None)
    # Country code(s)
    tableList = soup.fetch("table", {"summary": "Codes Results"})
    if len(tableList) != 1:
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    # found country code
    tdListA = tableList[0].fetch("td", {"style": "%padding-left:5px;"})
    tdListB = tableList[0].fetch("td", {"style": "%line-height:14pt;"})
    if len(tdListA) != len(tdListB):
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    cityCodeList = []
    for i in range(len(tdListA)):
        if 0 == i:
            result.append([getAllTextFromTag(tdListB[i])])
        else:
            city = getAllTextFromTag(tdListA[i])
            code = getAllTextFromTag(tdListB[i])
            cityCodeList.append((city, code))
    # sort the (city,code) list by city
    cityCodeList.sort(sortByCityFunc)
    for el in cityCodeList:
        result.append(el)
    return (RESULTS_DATA, universalDataFormatReplaceEntities(result))
Example #25
def parseList(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)

    # find table with results
    tableList = soup.fetch("table", {"cellpadding": "0", "cellspacing": "0", "border": "0", "width": "100%"})
    if 0 == len(tableList):
        return (UNKNOWN_FORMAT, jUnknownFormatText)

    outerList = []
    for table in tableList:
        trList = table.fetch("tr")
        if 2 <= len(trList):
            tdCount = len(trList[0].fetch("td"))
            if 3 > tdCount:
                return (UNKNOWN_FORMAT, jUnknownFormatText)
            for tr in trList[1:]:
                tdList = tr.fetch("td")
                rank = ""
                if 4 == tdCount:
                    rank = getAllTextFromTag(tdList[0])
                title = getAllTextFromTag(tdList[-3])
                rating = getAllTextFromTag(tdList[-2])
                explicitness = getAllTextFromTag(tdList[-1])
                url = tdList[-3].first("a")["href"]
                if not url:
                    return (UNKNOWN_FORMAT, jUnknownFormatText)
                outerList.append((rank, title, rating, explicitness, url))

    if 0 == len(outerList):
        return (NO_RESULTS, jNoResultsText)

    return (JOKES_LIST, universalDataFormatReplaceEntities(outerList))
Example #26
def reverseZIPCodeLookup(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    res = testNoResults(soup)
    if RESULTS_DATA != res:
        return (res, None)
    # results (one? we handle more than one)
    tables = soup.fetch("table", {"summary": "Codes Results"})
    if 0 == len(tables):
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    trList = []
    for tab in tables:
        trList += tab.fetch("tr")
    for tr in trList:
        tdList = tr.fetch("td")
        if len(tdList) == 3:
            city = getAllTextFromTag(tdList[0])
            country = getAllTextFromTag(tdList[1])
            timezone = getAllTextFromTag(tdList[2])
            if city != "New Search":
                smallList = (city, country, timezone)
                returned.append(smallList)
        elif len(tdList) == 2:  # special case (911)
            city = getAllTextFromTag(tdList[0])
            country = getAllTextFromTag(tdList[1])
            if city != "New Search":
                smallList = (city, country, "")
                returned.append(smallList)
    if len(returned) == 0:
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    return (RESULTS_DATA, universalDataFormatReplaceEntities(returned))
Example #27
def parseFirstDayHtmlYahoo(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)

    # sky, now, feelsLike, wind, hum, pressure, dew point, visibility
    returned = [None, None, None, None, None, None, None, None]

    fontList = soup.fetch("font", {"face": "Arial", "size": "2"})
    list = []
    wasFeelsLike = False
    for f in fontList:
        text = getAllTextFromTag(f).strip()
        if wasFeelsLike:
            list.append(text)
        else:
            if text == "Feels Like:":
                list.append(text)
                wasFeelsLike = True
    if len(list) >= 16:
        smallList = list[1::2]
        returned[0] = ""
        returned[1] = smallList[0].replace("&deg;", "")
        returned[2] = smallList[0].replace("&deg;", "")
        returned[3] = smallList[3]
        returned[4] = smallList[4].replace("%", "")
        returned[5] = smallList[2]
        returned[6] = smallList[1].replace("&deg;", "")
        returned[7] = smallList[6]

    for r in returned:
        if r == None:
            return None
    return returned
Example #28
    def fqdn(self):
        url = 'http://pgp.mit.edu/pks/lookup?op=index&search=%s' % self.artifact[
            'name']

        try:
            status, response = get(url, headers=self.headers)

            if status:
                if 'No results found' in response.text:
                    pass

                else:
                    data = BeautifulSoup(response.text)
                    items = data.fetch('a')
                    for item in items:
                        matches = re.findall(re_email, str(item))  # regex needs a string, not a Tag
                        for m in matches:
                            if isinstance(self.artifact['data']['pgp'], list):
                                self.artifact['data']['pgp'].append(m)
                            else:
                                self.artifact['data']['pgp'] = []
                                self.artifact['data']['pgp'].append(m)

                            self.artifact['children'].append({
                                'name': m,
                                'type': 'email',
                                'source': 'PGP',
                                'subtype': None
                            })

        except:
            pass
Example #29
def bot_w(mess, nick, botCmd):
    """Weather forecast"""
    if (len(botCmd) == 1):
        message = u"Usage: !weather + City Name or zip code"
    else:
        cityname = botCmd[1]
        url = 'http://search.weather.com.cn/static/url.php'
        values = {'cityinfo': cityname.encode('utf8')}
        data = urllib.urlencode(values)
        req = urllib2.Request(url, data)
        response = urllib2.urlopen(req)
        the_page = response.read()
        url = the_page[the_page.find('URL=')+4:len(the_page)-3]
        request = urllib2.Request(url)
        page = urllib2.urlopen(request).read()
        message = 'Cannot find such a city'
        if page.find('<div class="box_contenttodayinwea" id="c_1_1">') != -1:
            soup = BeautifulSoup(page)
            message = str(soup.head.title)
            message = message[7:message.find('-')] + ', '
            page = page[page.find('<div class="box_contenttodayinwea" id="c_1_1">'):]
            page = page[:page.find('</div>') + 6]
            soup = BeautifulSoup(page)
            ems = soup.fetch('em')

            for i in range(0, 3):
                message = message + re.sub('<(.|\n)+?>', '', str(ems[i]))
                if i < 2:
                    message = message + ', '
    return message
Example #30
def parseDream2(htmlTxt):
    soup = BeautifulSoup()
    # TODO: this is temporary:
    htmlTxt = htmlTxt.replace(
        "/*<![CDATA[*/ @import \"/knowledge/stylesheets/monobook/main.css\"; /*]]>*/",
        "")

    soup.feed(htmlTxt)

    tableMain = soup.fetch("table", {
        "width": "768",
        "align": "center",
        "cellspacing": "0",
        "cellpadding": "0"
    })
    if not tableMain:
        return (UNKNOWN_FORMAT, dUnknownFormatText)
    td = None
    for table in tableMain:
        tr = table.first("tr")
        if tr:
            tdTest = tr.first("td", {"width": "100%", "valign": "top"})
            if tdTest:
                td = tdTest
    if not td:
        return (UNKNOWN_FORMAT, dUnknownFormatText)
    # why doesn't this work without re-parsing?
    soup2 = BeautifulSoup()
    soup2.feed(str(td).replace("<br />>", ""))
    td = soup2.first("td")
    # no results?
    if td.first("center"):
        return (NO_RESULTS, dNoResultsText)

    # results
    bTable = td.fetch("b")
    if not bTable:
        return (UNKNOWN_FORMAT, dUnknownFormatText)

    outerList = []
    for bItem in bTable:
        title = getAllTextFromTag(bItem)
        next = getLastElementFromTag(bItem)
        pItem = None
        while next and not pItem:
            if isinstance(next, Tag):
                if next.name == "p":
                    pItem = next
            next = next.next
        if pItem:
            text = getAllTextFromTagWithA(pItem.first("font"))
            if text.startswith("Interpretation: "):
                text = text[len("Interpretation: "):]
            outerList.append((title, text))

    if 0 == len(outerList):
        return (NO_RESULTS, dNoResultsText)
    return (DREAM_DATA, universalDataFormatReplaceEntities(outerList))
Example #31
def parseRandomQuotes(htmlTxt, modulesInfo):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    quotes = []
    dtList = soup.fetch("dt", {"class": "quote"})
    ddList = soup.fetch("dd", {"class": "author"})
    if len(dtList) == len(ddList) and len(dtList) > 0:
        for i in range(len(ddList)):
            quote = getAllTextFromTag(dtList[i])
            next = ddList[i]
            bItem = None
            author = ""  # fallback if no <b> author tag is found
            while next and None == bItem:
                next = next.next
                if isinstance(next, Tag):
                    if next.name == "b":
                        bItem = next
                    elif next.name == "dt":
                        next = None
                    elif next.name == "select":
                        next = None
            if bItem:
                aItem = bItem.first("a")
                if aItem:
                    author = getAllTextFromTag(aItem)
                else:
                    author = getAllTextFromTag(bItem)
            quotes.append([author, "\"" + quote.strip() + "\""])

    if 0 == len(quotes):
        return UNKNOWN_FORMAT, None
    # build definition
    df = Definition()

    te = df.TextElement("Random Quotes", style=styleNamePageTitle)
    te.setJustification(justCenter)
    df.LineBreakElement()
    addQuotesToDefinition(df, quotes, modulesInfo)
    df.LineBreakElement()
    par = df.ParagraphElement(False)
    par.setJustification(justCenter)
    df.TextElement("Daily", link="s+quotes:daily")
    df.TextElement(" \x95 ", style=styleNameGray)
    df.TextElement("Random", link="s+quotes:random")
    df.PopParentElement()
    return QUOTES_DATA, universalDataFormatWithDefinition(df, [])
Example #32
def parseRandomQuotes(htmlTxt, modulesInfo):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    quotes = []
    dtList = soup.fetch("dt", {"class":"quote"})
    ddList = soup.fetch("dd", {"class":"author"})
    if len(dtList) == len(ddList) and len(dtList) > 0:
        for i in range(len(ddList)):
            quote = getAllTextFromTag(dtList[i])
            next = ddList[i]
            bItem = None
            author = ""  # fallback if no <b> author tag is found
            while next and None == bItem:
                next = next.next
                if isinstance(next, Tag):
                    if next.name == "b":
                        bItem = next
                    elif next.name == "dt":
                        next = None
                    elif next.name == "select":
                        next = None
            if bItem:
                aItem = bItem.first("a")
                if aItem:
                    author = getAllTextFromTag(aItem)
                else:
                    author = getAllTextFromTag(bItem)
            quotes.append([author, "\""+quote.strip()+"\""])

    if 0 == len(quotes):
        return UNKNOWN_FORMAT, None
    # build definition
    df = Definition()

    te = df.TextElement("Random Quotes", style=styleNamePageTitle)
    te.setJustification(justCenter)
    df.LineBreakElement()
    addQuotesToDefinition(df, quotes, modulesInfo)
    df.LineBreakElement()
    par = df.ParagraphElement(False)
    par.setJustification(justCenter)
    df.TextElement("Daily", link="s+quotes:daily")
    df.TextElement(" \x95 ", style=styleNameGray)
    df.TextElement("Random", link="s+quotes:random")
    df.PopParentElement()
    return QUOTES_DATA, universalDataFormatWithDefinition(df, [])
Example #33
 def test_basic(self):
     self.open("http://localhost:8000/resources/Patient")
     self.update_text("#inputEmail", self.localhost_email)
     self.update_text("#inputPassword", self.localhost_password)
     self.click('button[type="Submit"]')
     self.wait_for_text_visible("reportforgenetics", "body")
     self.wait_for_text_visible("Condition", "body")
     self.click('button[name="authorize"]')
     self.wait_for_text_visible("Patient", "table")
     base_url = self.driver.current_url.split('/r')[0]
     source = self.driver.page_source
     soup = BeautifulSoup(source)
     num_rows = len(soup.fetch("a")) - 3  # Skip header, etc
     for i in xrange(num_rows):
         href = soup.fetch("a")[i+3].attrs[0][1]
         self.open(base_url + href)
         self.wait_for_text_visible("Genetics Report for", "h3")
         self.wait_for_text_visible("Clinical Context", "h4")
         self.wait_for_text_visible("Genetics Information", "body")
Example #34
 def test_basic(self):
     self.open("http://localhost:8000/resources/Patient")
     self.update_text("#inputEmail", self.localhost_email)
     self.update_text("#inputPassword", self.localhost_password)
     self.click('button[type="Submit"]')
     self.wait_for_text_visible("reportforgenetics", "body")
     self.wait_for_text_visible("Condition", "body")
     self.click('button[name="authorize"]')
     self.wait_for_text_visible("Patient", "table")
     base_url = self.driver.current_url.split('/r')[0]
     source = self.driver.page_source
     soup = BeautifulSoup(source)
     num_rows = len(soup.fetch("a")) - 3  # Skip header, etc
     for i in xrange(num_rows):
         href = soup.fetch("a")[i + 3].attrs[0][1]
         self.open(base_url + href)
         self.wait_for_text_visible("Genetics Report for", "h3")
         self.wait_for_text_visible("Clinical Context", "h4")
         self.wait_for_text_visible("Genetics Information", "body")
Example #35
 def cve(self,irc,msg,args):
     word= self._prepare_term(args[0],"-")
     if re.search('cve', word, re.IGNORECASE) == None:
         url = 'http://cve.mitre.org/cgi-bin/cvekey.cgi?keyword=' + word
         category = 'keyword'
     else:
         url = 'http://cve.mitre.org/cgi-bin/cvename.cgi?name=' +word
         category = 'name'
     # Read the URL and pass it to BeautifulSoup.
     html = urllib2.urlopen(url).read()
     soup = BeautifulSoup()
     soup.feed(html)
     cveroot = "http://cve.mitre.org"
     # Read the main table, extracting the words from the table cells.
     hreftable = soup.fetch('a', {'href':re.compile('cvename')}, limit=4)
     h1table = soup.fetch('h1')
     h1string = str(h1table)
     if category == 'keyword':
         fonttable = soup.fetch('font', limit=11)
     else:
         fonttable = soup.fetch('font', limit=17)
     if (len(fonttable) == 3) or (re.search('error', h1string, re.IGNORECASE) != None):
         irc.reply("No data found regarding " + word)
     else:
         cve = []
         href = []
         ret = ''
         for line in fonttable:
             string = str(line)
             cve.append(re.sub('^.*">|</font>|\\n', '', string))
         for line in hreftable:
             string = str(line)
             splitstring = string.split('>')
             #print splitstring
             href.append(re.sub('^.*="|"', '', splitstring[0]))
         ret =  "%s %s" % (cve[3], cve[4])
         if category == 'keyword':
             for link in href:
                 ret += cveroot + link + " "
         else:
             ret +=cve[8]
         irc.reply(ret)
Example #36
def obtener_titulo(url):
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page)
    ##    print (soup.getText())
    entrada = None
    for division in soup.fetch('div'):
        idntifica = division.get('id')
        if idntifica == "resultado":
            entrada = soup.find(id="resultado")
            entrada = entrada.getText()
    return entrada
Example #37
    def _get_seriedata(self, url):
        """Get serie name and all episodes from the given url
        
        @return serie name, episode data, last update day"""
        
        epdata = []

        f = urllib2.urlopen(url)
        bs = BeautifulSoup(f)

        # this will fail if the serie page has been redirected
        # epguides uses a dimwitted meta refresh instead of a proper one...
        try:
            seriename = bs.fetch("h1")[0].renderContents()
            seriename = re.sub("<.*?>", "", seriename) # remove HTML
        except IndexError:
            return

        # parse just the relevant parts with regexes
        filedata = bs.fetch("pre")[0].renderContents().split("\n")

        # regex-match the relevant parts
        for line in filedata:
            m = self.epRe.search(line)
            if m:
                # convert datestring to gmtime format
                t = time.strptime(m.group('date'), '%d %b %y')
            
                # put episode data into a nifty dict
                data = {'epno'    : m.group('no'),
                        'season'  : int(m.group('season')),
                        'episode' : int(m.group('episode')),
                        'prodno'  : m.group('prodno'),
                        'airdate' : m.group('date'),
                        'airdate2': datetime.date(t[0], t[1], t[2]),
                        'epname'  : m.group('name')}
                
                epdata.append(data)

        # name of serie, episode data and date of last check
        return seriename, epdata, datetime.date.today()
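Example #37 depends on self.epRe, which is defined elsewhere in the class. Judging from the named groups consumed above and the '%d %b %y' date format, a compatible pattern could look like the following sketch; the exact layout of an epguides episode line is an assumption:

import re

# Hypothetical pattern, not the project's actual epRe: the group names match the
# lookups in _get_seriedata and the date group fits time.strptime(..., '%d %b %y').
epRe = re.compile(
    r"\s*(?P<no>\d+)\.\s+"                     # running episode number, e.g. "23."
    r"(?P<season>\d+)-\s*(?P<episode>\d+)\s+"  # season-episode, e.g. "2- 3"
    r"(?P<prodno>\S*)\s+"                      # production code (may be empty)
    r"(?P<date>\d{1,2} \w{3} \d{2})\s+"        # air date, e.g. "29 Sep 04"
    r"(?P<name>.+)"                            # episode title
)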
Example #38
def tryParseSearchDefinition(htmlTxt, fArtistSearch, modulesInfo, keywords):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    # no results
    input = soup.first("input", {"name": "albumName"})
    if input:
        return NO_RESULTS, None

    # get td's
    headerList = soup.fetch("td", {"class": "tb_header"})
    tdList = soup.fetch("td", {"class": "tb_row_r2"})
    if len(headerList) == 0 or len(tdList) == 0:
        return UNKNOWN_FORMAT, None
    # test modulo offset
    headersCount = len(headerList)
    if (len(tdList) % headersCount) != 0:
        return UNKNOWN_FORMAT, None

    searchResults = []
    # get results
    for index in range(len(tdList) - 1):
        artist = getAllTextFromTag(tdList[index]).strip()
        title = getAllTextFromTag(tdList[index + 1]).strip()
        urlStart = "show.php?id="
        aItem = tdList[index + 1].first("a", {"href": urlStart + "%"})
        if aItem:
            lyricsId = aItem['href'][len(urlStart):]
            searchResults.append([artist, title, lyricsId])

    if 0 == len(searchResults):
        return (UNKNOWN_FORMAT, None)

    if fArtistSearch:
        df = searchResultsToDefinitionThree(searchResults, modulesInfo)
    else:
        #df = searchResultsToDefinition(searchResults, modulesInfo)
        df = searchResultsToDefinitionTwo(searchResults, modulesInfo)

    return LYRICS_SEARCH, universalDataFormatWithDefinition(
        df, [["H", "Search: " + keywords]])
Example #39
def tryParseSearchDefinition(htmlTxt, fArtistSearch, modulesInfo, keywords):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    # no results
    input = soup.first("input", {"name":"albumName"})
    if input:
        return NO_RESULTS, None

    # get td's
    headerList = soup.fetch("td", {"class":"tb_header"})
    tdList = soup.fetch("td", {"class":"tb_row_r2"})
    if len(headerList) == 0 or len(tdList) == 0:
        return UNKNOWN_FORMAT, None
    # test modulo offset
    headersCount = len(headerList)
    if (len(tdList) % headersCount) != 0:
        return UNKNOWN_FORMAT, None

    searchResults = []    
    # get results
    for index in range(len(tdList)-1):
        artist = getAllTextFromTag(tdList[index]).strip()
        title = getAllTextFromTag(tdList[index+1]).strip()
        urlStart = "show.php?id="
        aItem = tdList[index+1].first("a", {"href":urlStart+"%"})
        if aItem:
            lyricsId = aItem['href'][len(urlStart):]
            searchResults.append([artist, title, lyricsId])

    if 0 == len(searchResults):
        return (UNKNOWN_FORMAT, None)

    if fArtistSearch:
        df = searchResultsToDefinitionThree(searchResults, modulesInfo)
    else:
        #df = searchResultsToDefinition(searchResults, modulesInfo)
        df = searchResultsToDefinitionTwo(searchResults, modulesInfo)

    return LYRICS_SEARCH, universalDataFormatWithDefinition(df, 
        [["H", "Search: "+keywords]])
Example #40
def iterMonthPlayLists(url):
    page = BeautifulSoup(urlopen(url))
    wordRE = re.compile(r'\w+')
    for table in page.fetch('table'):
        rows = iter(table.fetch('tr'))
        # first row: the date
        month,day,year = map(int,''.join(rows.next().fetchText(wordRE)).split('/'))
        # second row: title,artist labels; ignore it
        rows.next()
        # remaining rows : title,artist data
        selections = [[''.join(col.fetchText(wordRE)).strip()
                       for col in row.fetch('td')] for row in rows]
        yield PlayList(selections, Date(year,month,day))
Example #41
def getSearchLinks(text):
	"Returns all the links from scraping a google custom search page, given a html document as input"
	soup = BeautifulSoup(text)
	allLinks = soup.findAll('h2', {"class":"r"}) #gets all the h2 tags search urls
	allLinks = map(str, allLinks)

	fsoup = BeautifulSoup(''.join(allLinks))
	links = [] #start with an empty list
	for item in fsoup.fetch('a'):
		  links.append(item['href']) #iterate over all the href elements and append to the links list

	links =  map(lambda x: x.encode('ascii'),links) #BeautifulSoup returns unicode strings, hence converting to ascii
	return links
Example #42
def getAllLinks(htmlpage, reg = False):
	links = []

	if reg == False:
		soup = BeautifulSoup(htmlpage)
		for item in soup.fetch('a'):
			links.append(item['href'])
		return encodeToAscii(links)
	else:
	
		linksList = re.findall('<a href=(.*?)>.*?</a>',str(htmlpage))
		for link in linksList:
			links.append(link)
		return links
Example #43
def getallsubs(content, allowed_languages, filename="", search_string=""):

    #parser = HTMLParser()
#    parser = et.XMLParser(html=1)
#    html = et.fromstring(content, parser).getroot()
#    html = ElementSoup.parse(StringIO(content))

    soup = BeautifulSoup(content)
    #elements = html.findall(".//tr[./td/a/img[@title='Download Thai Subtitle']]")

    subtitles = []
    sub_list = soup.fetch("div", dict(id="subtitle_list"))
    if not sub_list:
        return []
    table = sub_list[0].fetch("table")[0]
    if table is None:
        return []

    for element in table.findAll("tr")[1:]:
        num, title, rating, translate, upload, download = element.findAll("td")
        subtitle_name = title.find('br').previousSibling.strip().strip(" [En]")
        rating = int(round(float(rating.getText().strip('%'))/100.0*5))
        sync = False
        if filename != "" and string.lower(filename) == string.lower(subtitle_name):
            sync = True

        for lang_name, _, let2, let3, _, _  in [
            ("Thai", "0", "th", "tha", "41", 30243),
            ("English", "2", "en", "eng", "11", 30212)
        ]:
            if let3 not in allowed_languages:
                continue
            # rating is really the completeness. 0 means no Thai, so no point in showing it
            if let3 == 'tha' and rating < 1:
                continue

            link = download.fetch("img",{'title':'Download %s Subtitle'%lang_name})[0].parent['href']
            link = urljoin(MAIN_URL + "/manage/", link)
            lang = {'name': lang_name, '2let': let2, '3let': let3}

            subtitles.append({'rating': str(rating),
                              'filename': subtitle_name,
                              'sync': sync,
                              'link': link,
                              'lang': lang,
                              'hearing_imp': False})

    log(__name__, "got %s results" % len(subtitles))
#    subtitles.sort(key=lambda x: [not x['sync']])
    return subtitles
Example #44
def findVideos(domain, sourceCode, videos, verbose): 
    gotSeason = False
    gotURL = False
    
    printInfo1("\nSearching for links in code with the word '%s' in the url..." % videoText)
        
    soup = BeautifulSoup(sourceCode)
    
    for item in soup.fetch(['h2', 'a']):
        if verbose:
            printInfo1("\nParsing line: %s\n..." % item)
            
        if item.contents:
            if item.name == "h2" and seasonText in item.contents[0]:
                season = HTMLParser().unescape(item.contents[0])
                if verbose:
                    printInfo2("Found season text")
                    printInfo1("Season: %s" % season)
                gotSeason = True
            
        if item.name == "a" and videoText in item['href']:
            episodeTitle = HTMLParser().unescape(item['title'])
            url = item['href']
            if verbose:
                printInfo2("Found link to video")
                printInfo1("Episode title: %s" % episodeTitle)
                printInfo1("URL: %s" % url)
            gotURL = True
        
        if not gotSeason and not gotURL:
            if verbose:
                printInfo2("No valuable info in this item")
                
        if gotURL:
            if not gotSeason:
                season = "None"
            url = urljoin(domain, url)
            if verbose:
                printInfo1("Adding...")
                printInfo1("URL: %s" % url)
                printInfo1("Season: %s" % season)
                printInfo1("Episode title: %s" % episodeTitle)               
            videos.append({'url': url, 
                           'season': season, 
                           'episodeTitle': episodeTitle})
            gotSeason = False
            gotURL = False

    printInfo1("Found %s videos" % len(videos))    
    return videos
Example #45
    def councilMembers(self, follow_links=True):
        br = self._get_new_browser()
        response = br.open(self._people_uri)

        # Loop through the pages, yielding each of the results
        all_results = False
        while all_results is False:
            soup = BeautifulSoup(response.read())
            table = soup.find('table',
                              id='ctl00_ContentPlaceHolder1_gridPeople_ctl00')

            for councilman, headers, row in self.parseDataTable(table):

                if follow_links and type(councilman['Person Name']) == dict:
                    detail_url = self.host + councilman['Person Name']['url']
                    response = br.open(detail_url)
                    soup = BeautifulSoup(response.read())
                    img = soup.find(
                        'img', {'id': 'ctl00_ContentPlaceHolder1_imgPhoto'})
                    if img:
                        councilman['Photo'] = self.host + img['src']

                yield councilman

            current_page = soup.fetch('a', {'class': 'rgCurrentPage'})
            if current_page:
                current_page = current_page[0]
                next_page = current_page.findNextSibling('a')
            else:
                next_page = None

            if next_page:
                print 'reading page', next_page.text
                print
                event_target = next_page['href'].split("'")[1]
                br.select_form('aspnetForm')
                data = self._data(br.form, event_target)

                del data[
                    'ctl00$ContentPlaceHolder1$gridPeople$ctl00$ctl02$ctl01$ctl01']
                # print data
                data = urllib.urlencode(data)
                response = _try_connect(br, self._people_uri, data)

            else:
                all_results = True

        raise StopIteration
Example #46
def parseStock(htmlTxt):
    # this is funny
    htmlTxt = htmlTxt.replace("<! -- ", "<!---")
    soup = BeautifulSoup()
    soup.feed(htmlTxt)

    noResults = testNoResults(soup)
    if NO_RESULTS == noResults:
        return (NO_RESULTS, sNoResultsText)

    # get name
    nameTag = soup.first("td", {"height": "30", "class": "ygtb"})
    if not nameTag:
        return (UNKNOWN_FORMAT, sUnknownFormatText)
    name = getAllTextFromTag(nameTag).strip()

    # get all data from table
    bigTable = soup.fetch("table", {"width": "580", "id": "yfncsumtab"})
    if 1 != len(bigTable):
        return (UNKNOWN_FORMAT, sUnknownFormatText)
    tdDataList = bigTable[0].fetch("td", {"class": "yfnc_tabledata1"})
    innerList = [name]
    counter = 0
    for tdItem in tdDataList:
        if 2 == counter:
            # 3rd element carries the up/down icon
            imgItem = tdDataList[2].first("img")
            upDown = ""
            if imgItem:
                upDown = imgItem['alt']
            innerList.append(upDown)
            bItem = tdDataList[2].first("b")
            itemText = ""
            if bItem:
                itemText = getAllTextFromTag(bItem).strip()
            innerList.append(itemText)
        else:
            itemText = getAllTextFromTag(tdItem).strip()
            innerList.append(itemText)
        counter += 1

    # any results?
    if 0 == counter:
        return (UNKNOWN_FORMAT, sUnknownFormatText)

    # one-item UDF
    outerList = [innerList]
    return (STOCKS_DATA, universalDataFormatReplaceEntities(outerList))
Example #47
def GetLinks(root, patterns, isnum = True):
    u = urllib.urlopen(root)
    soup = BeautifulSoup(u.read())
    refs = []
    for p in patterns:
        refs += soup.fetch('a', {'href': re.compile(p)})
    links = []
    numbers = []
    for ref in refs:
        if isnum:
            if ref.text[:-1].isdigit():
                links.append(root + str(ref.text))
                numbers.append(int(str(ref.text)[:-1]))
        else:
            links.append(root + str(ref.text))
    if isnum:
        return zip(links, numbers)
    return links
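A hedged usage sketch for GetLinks (not from the original source); it assumes an Apache-style directory index whose anchor texts look like "001/", "002/", and so on:

# hypothetical index URL and href pattern
for link, number in GetLinks('http://example.com/archive/', [r'^\d+/$']):
    print number, link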
Example #48
0
def check_class_open(crns):
    open_crns = []

    for crn in crns:
        url = CRN_URL_TEMPLATE.format(crn)

        r = requests.get(url)

        soup = BeautifulSoup(r.text)
        # locate the seat-availability cells by their attributes; the first
        # match is the number of available seats, the second is the maximum
        available = soup.fetch(attrs={'class': 'dddefault', 'width': 30})[0]

        if available.text != u'0':
            open_crns.append(crn)

    return open_crns
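A possible caller for the helper above (illustrative only; the CRN values are made up and CRN_URL_TEMPLATE is assumed to be defined elsewhere in the module):

# hypothetical CRNs to poll for open seats
watched_crns = ['30553', '30557']
print check_class_open(watched_crns)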
Example #49
0
def scrape_somafm_info(url):
    station_list = []
    page = urllib2.urlopen(url)
    html = page.read()
    soup = BeautifulSoup(html)
    for station in soup.fetch('li'):
        station_name = station.h3.contents[0]
        station_desc = ' '.join(station('p', {'class': 'descr'})[0].contents)
        station_desc = format_station_description(station_desc)
        station_url = station.a['href']
        pls_name_info = station_url.rsplit('/')
        pls_url = '%s%s%s' % (url, pls_name_info[2], '.pls')
        irs = InternetRadioStation(name=station_name,
                                   pls_url=pls_url,
                                   desc=station_desc)
        station_list.append(irs)
    return station_list
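A hedged usage sketch (not in the original source), assuming InternetRadioStation exposes its constructor arguments as attributes:

# hypothetical caller against the SomaFM channel listing page
for station in scrape_somafm_info('http://somafm.com/'):
    print station.name, station.pls_url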
Example #51
0
def areaCodeByCity(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    res = testNoResults(soup)
    if RESULTS_DATA != res:
        return (res, None)

    resultsTable = soup.first("table", {"summary":"Results Content"})
    if resultsTable:
        strong = resultsTable.first("strong")
        if strong:
            if getAllTextFromTag(strong).startswith("Multiple cities with"):
                aList = resultsTable.fetch("a")
                for aItem in aList:
                    city = getAllTextFromTag(aItem)
                    returned.append(city)
                if len(returned) == 0:
                    return (UNKNOWN_FORMAT,m411UnknownFormatText)
                return (MULTIPLE_SELECT,string.join(returned,"\n"))
    # single city - delegate to the reverse ZIP code parser for the result page
    return reverseZIPCodeLookup(htmlTxt)

    # NOTE: the "Search Results" table parsing below is unreachable; it appears
    # to be left over from an earlier version of this parser.
    tables = soup.fetch("table", {"summary":"Search Results"})
    if 0 == len(tables):
        return (UNKNOWN_FORMAT,m411UnknownFormatText)
    trList = []
    for tab in tables:
        trList += tab.fetch("tr")
    for tr in trList:
        tdList = tr.fetch("td")
        if len(tdList) == 3:
            code     = getAllTextFromTag(tdList[0]).strip()
            country  = getAllTextFromTag(tdList[1]).strip()
            timezone = getAllTextFromTag(tdList[2]).strip()
            if code != "New Search":
                smallList = (code,country,timezone)
                returned.append(smallList)
    if len(returned) == 0:
        return (UNKNOWN_FORMAT,m411UnknownFormatText)
    return (RESULTS_DATA,universalDataFormatReplaceEntities(returned))
Example #52
0
 def click_link_text(self, link_text, timeout=settings.SMALL_TIMEOUT):
     """ This method clicks link text on a page """
     # If using phantomjs, might need to extract and open the link directly
     if self.browser == 'phantomjs':
         if self.is_link_text_visible(link_text):
             element = self.wait_for_link_text_visible(link_text)
             element.click()
             return
         source = self.driver.page_source
         soup = BeautifulSoup(source)
         html_links = soup.fetch('a')
         for html_link in html_links:
             if html_link.text == link_text:
                 for html_attribute in html_link.attrs:
                     if html_attribute[0] == 'href':
                         href = html_attribute[1]
                         if href.startswith('//'):
                             link = "http:" + href
                         elif href.startswith('/'):
                             url = self.driver.current_url
                             domain_url = self.get_domain_url(url)
                             link = domain_url + href
                         else:
                             link = href
                         self.open(link)
                         return
                 raise Exception(
                     'Could not parse link from link_text [%s]' % link_text)
         raise Exception("Link text [%s] was not found!" % link_text)
     # Not using phantomjs
     element = self.wait_for_link_text_visible(link_text, timeout=timeout)
     self._demo_mode_highlight_if_active(link_text, by=By.LINK_TEXT)
     pre_action_url = self.driver.current_url
     element.click()
     if settings.WAIT_FOR_RSC_ON_CLICKS:
         self.wait_for_ready_state_complete()
     if self.demo_mode:
         if self.driver.current_url != pre_action_url:
             self._demo_mode_pause_if_active()
         else:
             self._demo_mode_pause_if_active(tiny=True)
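A hedged sketch of how the method above is typically driven from a SeleniumBase-style test case (the class name, URL, and link text are illustrative assumptions):

# hypothetical test case exercising click_link_text()
from seleniumbase import BaseCase

class ExampleLinkTest(BaseCase):
    def test_click_more_information(self):
        self.open("http://example.com/")
        self.click_link_text("More information...")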
Example #53
0
 def define(self, irc, msg, args):
     """[word]
     look up the word in wordnet"""
     if len(args) != 1:
         irc.reply("you gotta give me a word to define")
         return
     word = self._prepare_term(args[0], "")
     url = 'http://wordnet.princeton.edu/perl/webwn?s=' + word
     html = urllib2.urlopen(url).read()
     soup = BeautifulSoup()
     soup.feed(html)
     maintable = soup.fetch('li')
     retdef = []
     checkfordefs = len(maintable)
     if checkfordefs != 0:
         for lines in maintable:
             converttostring = str(lines)
             definition = re.sub('^.*\(|\).*$', '', converttostring)
             retdef.append(definition)
     else:
         retdef.append("not found.  Is %s spelled corectly?" % word)
     irc.reply(word + ": " + "; ".join(retdef))
Example #54
0
def _parseList(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    trList = soup.fetch("tr")
    outerList = []
    for tr in trList:
        if len(tr.fetch("tr")) == 0:
            tdList = tr.fetch("td")
            if len(tdList) == 4:
                if tdList[0].first("span", {"class": "title"}):
                    rank = getAllTextFromTag(tdList[0])
                    title = getAllTextFromTag(tdList[1])
                    rating = getAllTextFromTag(tdList[2])
                    explicitness = getAllTextFromTag(tdList[3])
                    aItem = tdList[1].first("a")
                    if aItem:
                        url = aItem['href']
                        outerList.append(
                            (rank, title, rating, explicitness, url))
    if 0 == len(outerList):
        return (NO_RESULTS, jNoResultsText)
    return (JOKES_LIST, universalDataFormatReplaceEntities(outerList))
Example #55
0
def parseList(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)

    # find table with results
    tableList = soup.fetch("table", {
        "cellpadding": "0",
        "cellspacing": "0",
        "border": "0",
        "width": "100%"
    })
    if 0 == len(tableList):
        return (UNKNOWN_FORMAT, jUnknownFormatText)

    outerList = []
    for table in tableList:
        trList = table.fetch("tr")
        if 2 <= len(trList):
            tdCount = len(trList[0].fetch("td"))
            if 3 > tdCount:
                return (UNKNOWN_FORMAT, jUnknownFormatText)
            for tr in trList[1:]:
                tdList = tr.fetch("td")
                rank = ""
                if 4 == tdCount:
                    rank = getAllTextFromTag(tdList[0])
                title = getAllTextFromTag(tdList[-3])
                rating = getAllTextFromTag(tdList[-2])
                explicitness = getAllTextFromTag(tdList[-1])
                # guard against a missing anchor before reading its href
                aItem = tdList[-3].first("a")
                if not aItem:
                    return (UNKNOWN_FORMAT, jUnknownFormatText)
                url = aItem['href']
                outerList.append((rank, title, rating, explicitness, url))

    if 0 == len(outerList):
        return (NO_RESULTS, jNoResultsText)

    return (JOKES_LIST, universalDataFormatReplaceEntities(outerList))
Example #56
0
def get_all_images_on_page(page_url):
    prefix = page_url.split('://')[0]
    simple_url = page_url.split('://')[1]
    base_url = simple_url.split('/')[0]
    full_base_url = prefix + "://" + base_url + "/"
    html = requests.get(page_url)
    completed_source = web_core.rebuild_source(html.text, full_base_url)
    soup = BeautifulSoup(completed_source)
    imgs = soup.fetch('img', src=True, onload=None)
    image_url_list = []
    for img in imgs:
        link = img["src"].split("src=")[-1]
        compact_link = link.split('?')[0]
        if (compact_link.endswith('.png') or compact_link.endswith('.jpg')
                or compact_link.endswith('.jpeg')):
            if not link.startswith("http"):
                if ":" not in link:
                    link = full_base_url + link
                else:
                    # The link is weird. Skip it.
                    continue
            image_url_list.append(link)
    return image_url_list
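A short, hypothetical caller for the helper above:

# illustrative page URL only
for image_url in get_all_images_on_page('http://example.com/gallery.html'):
    print image_url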