Example #1
def tryParseDetails(htmlTxt, updateString):
    htmlTxt = removeCloseTagAttr(htmlTxt)
    soup = BeautifulSoup()
    soup.feed(htmlTxt)

    flightInfo = []
    table = soup.first("table", {"name": "flight_info"})
    trList = []
    if table:
        for tr in table.fetch("tr"):
            if len(tr.fetch("td")) == 4:
                trList.append(tr)
            elif len(tr.fetch("td")) == 1:
                img = tr.first("img", {"alt": "Continuing on To"})
                if img:
                    trList.append(tr)
    for tr in trList:
        tdList = tr.fetch("td")
        if len(tdList) == 4:
            info = getAllTextFromTag(tdList[0]).replace("&nbsp;", " ").strip()
            infoFrom = getAllTextFromTag(tdList[1]).replace("&nbsp;",
                                                            " ").strip()
            infoTo = getAllTextFromTag(tdList[3]).replace("&nbsp;",
                                                          " ").strip()
            if info != "":
                flightInfo.append([info, infoFrom, infoTo])
        else:
            flightInfo.append([""])

    flight = ""
    table = soup.first("table", {"name": "headbar2"})
    if table:
        bItem = table.first("b")
        if bItem:
            flight = getAllTextFromTag(bItem)

    if 0 == len(flightInfo) or "" == flight:
        return UNKNOWN_FORMAT, None
    # definition
    df = Definition()
    df.TextElement(flight, style=styleNameBold)
    df.LineBreakElement(1, 2)
    index = 0
    for item in flightInfo:
        # info, from, to
        if len(item) == 3:
            df.TextElement(item[0], style=styleNameHeader)
            if item[1] != "":
                df.LineBreakElement()
                df.TextElement(item[1])
            if item[2] != "":
                gtxt = df.TextElement(item[2])
                gtxt.setJustification(justRight)
            else:
                df.LineBreakElement()
        else:
            df.HorizontalLineElement()

    return RESULTS_DATA, universalDataFormatWithDefinition(
        df, [["U", updateString]])
Example #2
class Julienne:
    def __init__(self, table):
        self.soup = BeautifulSoup(table.strip())
        self.row_list = self.soup.first("tbody").findAll("tr")

    def validate(self):
        valid_toplevel = len(self.soup.contents) == 1 and self.soup.contents[0].name == "table"
        num_columns = len(self.soup.first("thead").contents)
        rows = self.row_list
        valid_body = all([len(row) == num_columns for row in rows])
        return valid_toplevel and valid_body

    def columns(self):
        return [tag.string for tag in self.soup.findAll("th")]

    def rows(self):
        rows_sans_whitespace = [[unicode(field.string) for field in row if field != '\n'] for row in self.row_list]
        return [OrderedDict(zip(self.columns(), row)) for row in rows_sans_whitespace]

    def select(self, **kwargs):
        # TODO: Implement selecting rows, possibly by some index
        desired_cols = kwargs['columns']
        rows = self.rows()
        return map(lambda row: { key: row[key] for key in desired_cols }, rows)

    def to_csv(self):
        csv_str = ",".join(self.columns()) + "\n"
        csv_str += "\n".join([",".join(row.viewvalues()) for row in self.rows()])
        return csv_str
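# Hypothetical usage sketch for the Julienne helper above (not from the original
# source). Assumes Python 2 with the old BeautifulSoup 3 API used throughout these
# examples and "from collections import OrderedDict" in scope; the sample table
# markup is made up for illustration.
if __name__ == '__main__':
    sample_table = """<table>
      <thead><th>name</th><th>colour</th></thead>
      <tbody>
        <tr><td>carrot</td><td>orange</td></tr>
        <tr><td>leek</td><td>green</td></tr>
      </tbody>
    </table>"""
    j = Julienne(sample_table)
    print j.columns()   # expected roughly: [u'name', u'colour']
    print j.to_csv()    # name,colour / carrot,orange / leek,green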
def _spider_book_info(url, letter):
    try:
        html = getHttp(url, handleException = False)
        soup = BeautifulSoup()
        soup.feed(html)
        h1 = soup.first("h1")
        if h1 is None:
            return None

        assert h1 is not None
        title = retrieveContents(h1).decode("iso-8859-1")

        subtitle = None
        author = None
        code = None

        labels = [retrieveContents(x) for x in soup.fetch("span", {"class": "title-label"})]
        data = soup.fetch("span", {"class": "title-data"})
        try:
            index = labels.index("Subtitle")
            subtitle = retrieveContents(data[index]).decode("iso-8859-1")
        except ValueError:
            pass

        try:
            index = labels.index("Author")
            author = retrieveContents(data[index].first("a")).decode("iso-8859-1")
        except ValueError:
            pass

        try:
            index = labels.index("Language")
            href = str(data[index].first("a", {"href": "/language.php?code=%"})["href"])
            code = href[19:href.find("&", 19)].decode("iso-8859-1")
        except ValueError:
            pass

        tid = soup.first("input", {"type": "hidden", "name": "tid"})
        assert tid is not None
        book_id = tid["value"].decode("iso-8859-1")

        print (u"%s: \"%s\"" % (author, title)).encode("iso-8859-1", "ignore")

        sel = soup.first("select", {"name": "book"})
        assert sel is not None
        opts = sel.fetch("option")
        formats = []
        for opt in opts:
            try:
                format = retrieveContents(opt).split()[0]
                if format not in ebooks.FORMATS:
                    continue

                val = opt["value"]
                formats.append((format, val))

            except Exception, ex:
                log(SEV_EXC, exceptionAsStr(ex))
        formats.sort()
        return (url, title, subtitle, author, book_id, code, formats)
    except Exception, ex:
        # parsing failed for this page: log the exception and skip it
        log(SEV_EXC, exceptionAsStr(ex))
        return None
Example #4
    def __call__(self, url):
        try:
            doc = urllib2.urlopen(url).read()
        except urllib2.URLError:
            return None
        doc = BeautifulSoup(doc)  # parse the response already fetched above instead of fetching the URL again
        title = u''
        description = u''

        # title
        if doc.title:
            title = doc.title.string
        if not title:
            title = doc.first('meta', attrs={'name': 'title'})
            if title:
                title = title.get('content')

        # description
        description = doc.first('meta', attrs={'name': 'description'})
        if description:
            description = description.get('content')

        # Find favicon
        favicon_url = doc.first('link', rel='shortcut icon')
        if favicon_url:
            favicon_url = favicon_url.get('href')
        else:
            host_url = urlparse(url)
            favicon_url = host_url[0] + u'://' + host_url[1] + u'/favicon.ico'

        return json.dumps({
            'title': title,
            'description': description,
            'favicon_url': favicon_url})
Example #5
def tryParseDetails(htmlTxt, updateString):
    htmlTxt = removeCloseTagAttr(htmlTxt)
    soup = BeautifulSoup()
    soup.feed(htmlTxt)

    flightInfo = []
    table = soup.first("table", {"name":"flight_info"})
    trList = []
    if table:
        for tr in table.fetch("tr"):
            if len(tr.fetch("td")) == 4:
                trList.append(tr)
            elif len(tr.fetch("td")) == 1:
                img = tr.first("img", {"alt":"Continuing on To"})
                if img:
                    trList.append(tr)
    for tr in trList:
        tdList = tr.fetch("td")
        if len(tdList)==4:
            info = getAllTextFromTag(tdList[0]).replace("&nbsp;"," ").strip()
            infoFrom = getAllTextFromTag(tdList[1]).replace("&nbsp;"," ").strip()
            infoTo = getAllTextFromTag(tdList[3]).replace("&nbsp;"," ").strip()
            if info != "":
                flightInfo.append([info, infoFrom, infoTo])
        else:
            flightInfo.append([""])

    flight = ""
    table = soup.first("table", {"name":"headbar2"})
    if table:
        bItem = table.first("b")
        if bItem:
            flight = getAllTextFromTag(bItem)

    if 0==len(flightInfo) or ""==flight:
        return UNKNOWN_FORMAT, None
    # definition
    df = Definition()
    df.TextElement(flight, style=styleNameBold)
    df.LineBreakElement(1,2)
    index = 0
    for item in flightInfo:
        # info, from, to
        if len(item) == 3:
            df.TextElement(item[0], style=styleNameHeader)
            if item[1] != "":
                df.LineBreakElement()
                df.TextElement(item[1])
            if item[2] != "":
                gtxt = df.TextElement(item[2])
                gtxt.setJustification(justRight)
            else:
                df.LineBreakElement()
        else:
            df.HorizontalLineElement()

    return RESULTS_DATA, universalDataFormatWithDefinition(df, [["U",updateString]])
    def scrapeList(self):
        """
        Scrapes the pages for a list.
        Saves the list index page locally.
        Sends for scraping of each month's pages (and then on to the individual messages).
        """
        
        # Get the page that list the months of archive pages.
        source = self.fetchPage(self.list_url)
        
        # The copy of the page we save is filtered for email addresses, links, etc.
        filtered_source = self.filterPage(source)

        # Save our local copy.
        # eg /Users/phil/Sites/examplesite/html/list-name/index.html
        local_index = open(self.publish_dir + '/index.html', 'w')
        local_index.write(filtered_source)
        local_index.close()
        
        soup = BeautifulSoup(source)

        if not soup.first('table'):
            return

        # Go through each row in the table except the first (which is column headers).
        for row in soup.first('table')('tr')[1:]:
            # Get the text in the first column: "February 2009:"
            archive_date = row('td')[0].string
            if 'quarter' in archive_date:
                (ordinal, nothing , year) = archive_date.split()
                quarter_no = str(['First','Second','Third','Fourth'].index(ordinal)+1)
                # Strip the colon off.
                year = year[:-1]
                formatted_date = year+'q'+quarter_no
            else:
                if ' ' in archive_date:
                    (month, year) = archive_date.split()
                    # Strip the colon off.
                    year = year[:-1]
                    formatted_date = year+'-'+month
                else:
                    formatted_date=archive_date[:-1]




            # Scrape the date page for this month and get all its messages.
            # keep_fetching will be True or False, depending on whether we need to keep getting older months.
            try:
                keep_fetching = self.scrapeMonth(formatted_date)
            except urllib2.HTTPError: 
                print "Skipping ",formatted_date
                keep_fetching = True

            if not keep_fetching:
                break;
Example #7
def personSearch(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = m411testNoResults(soup)
    if NO_RESULTS == noResults:
        return (noResults,m411NoResultsText)
    ttlPref = soup.first("td",{"class":"TTLPREF"})
    if not ttlPref:
        ttlPref = soup.first("span",{"class":"TTLPREF"})
    if not ttlPref:
        return (UNKNOWN_FORMAT,m411UnknownFormatText)
    # too many results:
    font = ttlPref.first("font",{"color":"#FF0000"})
    if font:
        if "No results." == font.contents[0]:
            return (NO_RESULTS,m411NoResultsText)
        if "Results found in multiple cities." == font.contents[0]:
            brList = ttlPref.fetch("br")
            brList = brList[4:] ## skip text about select
            for br in brList:
                text = str(br.next).replace("<br />","").replace("\n","").strip()
                if len(text) > 0:
                    returned.append(text)
            return (MULTIPLE_SELECT, string.join(returned,"\n"))
        return (TOO_MANY_RESULTS,m411TooManyResults)
    # results:
    brList = ttlPref.fetch("br")
    resultsCount = len(brList) - 2
    if 0 == resultsCount:
        # no city?
        if "NO CITY FOUND" == str(brList[1].next).replace("\n","").strip():
            return (NO_CITY,m411NoCity)
    results = resultsCount/5
    if results*5 != resultsCount:    ## test if number of <br> is 5*n+2
        return (UNKNOWN_FORMAT,m411UnknownFormatText)
    # get them
    brList = brList[1:]  ## skip first br
    counter = 0
    smallList = []    
    for br in brList:
        text = str(br.next).replace("<br />","").replace("\n","").strip()
        if results > 0:
            if 0 == counter:
                smallList = [text]
            if 1 == counter or 2 == counter:
                smallList.append(text)
            if 3 == counter:
                smallList.append(text)
                returned.append(smallList)                
                results -= 1
        counter += 1
        if 5 == counter:
            counter = 0
    return (RESULTS_DATA,universalDataFormatReplaceEntities(returned))
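# Stand-alone sketch of the grouping done by the loop above (names invented here, not
# from the original module): the text found after each <br> is consumed in groups of
# five, and only the first four fields of every complete group are kept.
def groupListingFields(texts, groupSize=5, keep=4):
    grouped = []
    limit = len(texts) - len(texts) % groupSize
    for i in range(0, limit, groupSize):
        grouped.append(texts[i:i + keep])
    return grouped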
Example #8
def _parseRandomJoke(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)

    table = soup.first("table", {"id": "jokeIframeTable2"})
    if not table:
        return UNKNOWN_FORMAT, None
    # title
    titleSpan = table.first("span", {"class": "jokeTitle_v2"})
    if not titleSpan:
        return UNKNOWN_FORMAT, None
    title = getAllTextFromTag(titleSpan)
    # text
    trList = table.fetch("tr")
    text = ""
    if len(trList) > 6:
        tdList = trList[5].fetch("td")
        if len(tdList) == 3:
            text = getAllTextFromToInBrFormat(tdList[1], tdList[2])
            if len(text.replace("&nbsp;", " ").strip()) < 2:
                text = ""
    if "" == text:
        return UNKNOWN_FORMAT, None
    smallList = [title, text]
    # rating
    table = soup.first("table", {"id": "Table5"})
    if table:
        td = table.first("td")
        if td:
            imgList = td.fetch("img", {"src": "%"})
            rating = "not rated"
            translator = {
                "iconrate_one": "1",
                "iconrate_two": "2",
                "iconrate_three": "3",
                "iconrate_four": "4",
                "iconrate_five": "5",
                "iconrate_one_half": "1.5",
                "iconrate_two_half": "2.5",
                "iconrate_three_half": "3.5",
                "iconrate_four_half": "4.5",
                "iconrate_zero_half": "0.5",
            }
            for img in imgList:
                src = img["src"]
                src = src.split("/")[-1]
                src = src.replace(".gif", "")
                try:
                    rat = translator[src]
                    rating = rat
                except:
                    pass
            smallList.append(rating)
    outerList = [smallList]
    return (JOKE_DATA, universalDataFormatReplaceEntities(outerList))
Example #9
def _parseRandomJoke(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)

    table = soup.first("table", {"id": "jokeIframeTable2"})
    if not table:
        return UNKNOWN_FORMAT, None
    # title
    titleSpan = table.first("span", {"class": "jokeTitle_v2"})
    if not titleSpan:
        return UNKNOWN_FORMAT, None
    title = getAllTextFromTag(titleSpan)
    # text
    trList = table.fetch("tr")
    text = ""
    if len(trList) > 6:
        tdList = trList[5].fetch("td")
        if len(tdList) == 3:
            text = getAllTextFromToInBrFormat(tdList[1], tdList[2])
            if len(text.replace("&nbsp;", " ").strip()) < 2:
                text = ""
    if "" == text:
        return UNKNOWN_FORMAT, None
    smallList = [title, text]
    # rating
    table = soup.first("table", {"id": "Table5"})
    if table:
        td = table.first("td")
        if td:
            imgList = td.fetch("img", {"src": "%"})
            rating = "not rated"
            translator = {
                "iconrate_one": "1",
                "iconrate_two": "2",
                "iconrate_three": "3",
                "iconrate_four": "4",
                "iconrate_five": "5",
                "iconrate_one_half": "1.5",
                "iconrate_two_half": "2.5",
                "iconrate_three_half": "3.5",
                "iconrate_four_half": "4.5",
                "iconrate_zero_half": "0.5",
            }
            for img in imgList:
                src = img['src']
                src = src.split("/")[-1]
                src = src.replace(".gif", "")
                try:
                    rat = translator[src]
                    rating = rat
                except:
                    pass
            smallList.append(rating)
    outerList = [smallList]
    return (JOKE_DATA, universalDataFormatReplaceEntities(outerList))
    def _parse_letter_page(self, letter, html, index):
        self._check_finish()
        soup = BeautifulSoup()
        soup.feed(html)
        div = soup.first("div", {"class": "sidebar-module"})
        assert div is not None
        count = int(retrieveContents(div.contents[2]).split()[2])
        offset = 0
        self._lock.acquire()
        try:
            if count <= self._data[letter][0]:
                print 'Letter "%s" is up to date (%d records).' % (letter, self._data[letter][0])
                return True, count, 0
            offset = self._offsets[letter]
        finally:
            self._lock.release()

        spidered = 0
        div = soup.first("div", {"class": "titleList"})
        assert div is not None
        anchors = div.fetch("a")  # "as" is a reserved word in Python, so use another name
        urls = []
        for a in anchors:
            url = _g_manybooks_url + urllib.quote(a["href"])
            urls.append(url)

        for url in urls:
            self._check_finish()
            i = -1
            self._lock.acquire()
            try:
                books = self._data[letter][1]
                i = _find_book_index(books, url, index)
            finally:
                self._lock.release()

            if -1 != i:
                index = i + 1
            else:
                book = _spider_book_info(url, letter)
                if book is not None:
                    spidered += 1
                    self._lock.acquire()
                    try:
                        self._fresh_books.append((letter, index + offset, book))
                        if len(self._fresh_books) == self.flush_after:
                            self._flush_books()
                        offset += 1
                        self._offsets[letter] = offset
                        if self._data[letter][0] + offset  == count:
                            return True, count, spidered
                    finally:
                        self._lock.release()
        return (index + offset == count), index, spidered
Example #11
    def scrapeList(self):
        """
        Scrapes the pages for a list.
        Saves the list index page locally.
        Sends for scraping of each month's pages (and then on to the individual messages).
        """

        # Get the page that list the months of archive pages.
        source = self.fetchPage(self.list_url)

        # The copy of the page we save is filtered for email addresses, links, etc.
        filtered_source = self.filterPage(source)

        # Save our local copy.
        # eg /Users/phil/Sites/examplesite/html/list-name/index.html
        local_index = open(self.publish_dir + '/index.html', 'w')
        local_index.write(filtered_source)
        local_index.close()

        soup = BeautifulSoup(source)

        if not soup.first('table'):
            return

        # Go through each row in the table except the first (which is column headers).
        for row in soup.first('table')('tr')[1:]:
            # Get the text in the first column: "February 2009:"
            archive_date = row('td')[0].string
            if 'quarter' in archive_date:
                (ordinal, nothing, year) = archive_date.split()
                quarter_no = str(
                    ['First', 'Second', 'Third', 'Fourth'].index(ordinal) + 1)
                # Strip the colon off.
                year = year[:-1]
                formatted_date = year + 'q' + quarter_no
            else:
                if ' ' in archive_date:
                    (month, year) = archive_date.split()
                    # Strip the colon off.
                    year = year[:-1]
                    formatted_date = year + '-' + month
                else:
                    formatted_date = archive_date[:-1]

            # Scrape the date page for this month and get all its messages.
            # keep_fetching will be True or False, depending on whether we need to keep getting older months.
            try:
                keep_fetching = self.scrapeMonth(formatted_date)
            except urllib2.HTTPError:
                print "Skipping ", formatted_date
                keep_fetching = True

            if not keep_fetching:
                break
Example #12
def get_slides(args):
    contents = get_file_contents(args.file)
    soup = BeautifulSoup(markdown(contents))

    hsoup = BeautifulSoup()
    html = Tag(hsoup, 'html')
    hsoup.append(html)

    head = Tag(hsoup, 'head')
    title = Tag(hsoup, 'title')
    title.setString(args.file)
    head.append(title)

    link = Tag(hsoup, 'link')
    link['rel'] = 'stylesheet'
    link['type'] = 'text/css'
    if args.offline:
        link['href'] = 'default.css'
    else:
        link[
            'href'] = 'http://gdg-xian.github.io/html5slides-markdown/themes/default.css'
    head.append(link)

    script = Tag(hsoup, 'script')
    if args.offline:
        script['src'] = 'html5slides.js'
    else:
        script[
            'src'] = 'http://gdg-xian.github.io/html5slides-markdown/javascripts/html5slides.js'
    head.append(script)
    html.append(head)

    body = Tag(hsoup, 'body')
    body['style'] = 'display:none'
    section = Tag(hsoup, 'section')
    section['class'] = 'slides layout-regular template-default'
    body.append(section)
    elements = []
    elements.append(soup.first())
    elements.extend(soup.first().findNextSiblings())
    article = Tag(hsoup, 'article')
    section.append(article)
    for element in elements:
        if element.name == 'hr':
            article = Tag(hsoup, 'article')
            section.append(article)
        else:
            article.append(element)

    html.append(body)

    return prettify(html)
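# Hypothetical command-line driver for get_slides above; the argparse flags mirror the
# attributes the function reads (args.file, args.offline), and the output file name is
# an assumption.
import argparse

def main():
    parser = argparse.ArgumentParser(description='Render a Markdown file as html5slides')
    parser.add_argument('file', help='path to the Markdown source')
    parser.add_argument('--offline', action='store_true',
                        help='link local default.css/html5slides.js instead of the hosted copies')
    args = parser.parse_args()
    out = open(args.file + '.html', 'w')
    out.write(get_slides(args))
    out.close()

if __name__ == '__main__':
    main()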
Example #13
def parseJoke(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)

    table = soup.first("table", {"width": "328", "id": "Table2"})
    if not table:
        return (UNKNOWN_FORMAT, jUnknownFormatText)
    tdList = table.fetch("td", {
        "colspan": "3",
        "valign": "top",
        "class": "body"
    })
    if 3 != len(tdList):
        return (UNKNOWN_FORMAT, jUnknownFormatText)

    # simple format - simple parser
    title = getAllTextFromTag(tdList[0]).strip()
    text = getAllTextFromToInBrFormat(tdList[1], tdList[2].previous)
    smallList = [title, text]
    # add rating information
    if len(title) + len(
            text
    ) > 16:  # in random joke sometimes it returns small nothing... so to be sure
        span = soup.first("span", {"class": "body"})
        if span:
            text = getAllTextFromTag(span).replace("\n", "").strip()
            img = span.first("img", {"src": "%"})
            if text.startswith("CURRENT RATING") and img:
                src = img['src']
                src = src.split("/")[-1]
                src = src.replace(".gif", "")
                translator = {
                    "iconrate_one": "1",
                    "iconrate_two": "2",
                    "iconrate_three": "3",
                    "iconrate_four": "4",
                    "iconrate_five": "5",
                    "iconrate_one_half": "1.5",
                    "iconrate_two_half": "2.5",
                    "iconrate_three_half": "3.5",
                    "iconrate_four_half": "4.5",
                    "iconrate_zero_half": "0.5",
                }
                rating = "not rated"
                try:
                    rating = translator[src]
                except:
                    pass
                smallList.append(rating)
    outerList = [smallList]
    return (JOKE_DATA, universalDataFormatReplaceEntities(outerList))
def get_slides(args):
    contents = get_file_contents(args.file)
    soup = BeautifulSoup(markdown(contents))

    hsoup = BeautifulSoup()
    html = Tag(hsoup, 'html')
    hsoup.append(html)

    head = Tag(hsoup, 'head')
    title = Tag(hsoup, 'title')
    title.setString(args.file)
    head.append(title)

    link = Tag(hsoup, 'link')
    link['rel'] = 'stylesheet'
    link['type'] = 'text/css'
    if args.offline:
        link['href'] = 'default.css'
    else:
        link['href'] = 'http://gdg-xian.github.io/html5slides-markdown/themes/default.css'
    head.append(link)

    script = Tag(hsoup, 'script')
    if args.offline:
        script['src'] = 'html5slides.js'
    else:
        script['src'] = 'http://gdg-xian.github.io/html5slides-markdown/javascripts/html5slides.js'
    head.append(script)
    html.append(head)

    body = Tag(hsoup, 'body')
    body['style'] = 'display:none'
    section = Tag(hsoup, 'section')
    section['class'] = 'slides layout-regular template-default'
    body.append(section)
    elements = []
    elements.append(soup.first())
    elements.extend(soup.first().findNextSiblings())
    article = Tag(hsoup, 'article')
    section.append(article)
    for element in elements:
        if element.name == 'hr':
            article = Tag(hsoup, 'article')
            section.append(article)
        else:
            article.append(element)

    html.append(body)

    return prettify(html)
Example #15
 def onData(self, info, data):
     try:
         self._logger.RTC_INFO('got input: ' + pformat(info) + ', ' + pformat(data))
         t = data.tm.sec + data.tm.nsec * 1e-9
         portid = (info['component'], info['port'])
         if portid not in self._basewme:
             self._logger.RTC_INFO('First input from this port > create basic structure on WME')
             iid = self._agent.GetInputLink()
             wme = self._agent.CreateIdWME(iid, 'data')
             self._basewme[portid] = wme
             ot = type(data.data)
             if ot in types.StringTypes:
                 if data.data[:5] == '<?xml':
                     self._logger.RTC_INFO('Parsing XML type input')
                     doc = BeautifulSoup(data.data)
                     wme2 = self._agent.CreateIdWME(wme, 'data')
                     usedwords = {}
                     self.docRecur(doc.first(), wme2, usedwords)
                     self._datawme[portid] = wme2
                 else:
                     self._datawme[portid] = self._agent.CreateStringWME(wme, 'data', data.data)
             elif ot == types.IntType:
                 self._datawme[portid] = self._agent.CreateIntWME(wme, 'data', data.data)
             elif ot == types.FloatType:
                 self._datawme[portid] = self._agent.CreateFloatWME(wme, 'data', data.data)
             else:
                 self._logger.RTC_ERROR('unsupported data type: ' + str(ot))
             self._timewme[portid] = self._agent.CreateFloatWME(wme, 'time', t)
             self._dataidwme[portid] = self._agent.CreateIntWME(wme, 'id', self._dataid)
             for k, v in info.iteritems():
                 self._agent.CreateStringWME(wme, k, v)
         else:
             self._agent.Update(self._timewme[portid], t)
             self._agent.Update(self._dataidwme[portid], self._dataid)
             if type(data.data) in types.StringTypes and data.data[:5] == '<?xml':
                 self._logger.RTC_INFO('Parsing XML type input')
                 doc = BeautifulSoup(data.data)
                 self._agent.DestroyWME(self._datawme[portid])
                 wme2 = self._agent.CreateIdWME(self._basewme[portid], 'data')
                 usedwords = {}
                 self.docRecur(doc.first(), wme2, usedwords)
                 self._datawme[portid] = wme2
             else:
                 self._agent.Update(self._datawme[portid], data.data)
         self._agent.Commit()
         self._dataid += 1
     except:
         self._logger.RTC_ERROR(traceback.format_exc())
Example #16
def getTextFromDirtyText(dirtyText):
    soup = BeautifulSoup()
    soup.feed("<xxx>" + dirtyText + "</xxx><yyy>test</yyy>")
    dirtySoup = soup.first("xxx")
    textWithBr = getAllTextFromToInBrFormat(dirtySoup, getLastElementFromTag(dirtySoup).next)
    text = textWithBr.replace("<br>", "\n").replace("<b>", "").replace("</b>", "")
    return text
Example #17
def parseGasOld(htmlTxt, url=None, dbgLevel=0):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)

    testTitle = soup.first("title")
    if testTitle:
        if getAllTextFromTag(testTitle).startswith("GasBuddy.com - Find cheap gas prices in your city"):
            return (LOCATION_UNKNOWN, gLocationUnknownText)

    outerList = []
    trList = soup.fetch("tr")
    for trItem in trList:
        tdList = trItem.fetch("td")
        if 8 == len(tdList):
            if tdList[1].first("table"):
                price = getAllTextFromTag(tdList[0]).strip()
                name = getAllTextFromTag(tdList[2]).strip()
                address = getAllTextFromTag(tdList[4]).strip()
                area = getAllTextFromTag(tdList[5]).strip()
                time = getAllTextFromTag(tdList[6]).strip()
                smallList = [price, name, address, area, time]
                outerList.append(smallList)
        else:
            if 0 != len(tdList):
                firstB = tdList[0].first("b")
                if firstB:
                    if getAllTextFromTag(firstB).startswith("No gas prices found."):
                        return (NO_RESULTS, gNoResultsText)

    if 0 == len(outerList):
        if dbgLevel > 0:
            print "len(outerList)==0"
        return parsingFailed(url, htmlTxt)

    return (GAS_DATA, universalDataFormatReplaceEntities(outerList))
Example #18
def reversePhoneLookupWhitepages(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = m411testNoResultsReversePhoneLookup(soup)
    if NO_RESULTS == noResults:
        return (noResults,m411NoResultsText)
    div = soup.first("div", {"class":"listings"})
    if div:
        for table in div.fetch("table"):
            for tr in table.fetch("tr"):
                text1 = tr.first("div",{"class":"textb"})
                text2 = tr.first("div",{"class":"text"})
                if text1 and text2:
                    name = getAllTextFromTag(text1)
                    cont = getAllTextFromToInBrFormat(text2, getLastElementFromTag(text2).next)
                    parts = cont.split("<br>")
                    (address,city,phone) = ("","","")
                    if len(parts) == 3:
                        (address,city,phone) = parts
                    if len(parts) == 2:
                        (city,phone) = parts
                    if len(parts) == 4:
                        (prefix,address,city,phone) = parts
                    returned.append((name,address.strip(),city.strip(),phone.strip()))
    if len(returned) == 0:
        return UNKNOWN_FORMAT, None
    return (RESULTS_DATA,universalDataFormatReplaceEntities(returned))
Example #19
def parseFirstDayHtml(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)

    # sky, now, feelsLike, wind, hum, pressure, dew point, visibility
    returned = [None, None, None, None, None, None, None, "N/A"]

    bItems = soup.fetch("b", {"class":"obsTextA"})
    if len(bItems) == 2:
        returned[0] = getAllTextFromTag(bItems[0]).strip()
        temp = getAllTextFromTag(bItems[1]).strip().split("Like ")
        if len(temp) > 1:
            returned[2] = temp[1].replace("&deg;F","").strip()

    bItem = soup.first("b", {"class":"obsTempTextA"})
    if bItem:
        returned[1] = getAllTextFromTag(bItem).replace("&deg;F","").strip()

    tdList = soup.fetch("td", {"class":"obsTextA"})
    if len(tdList) == 8:
        tdList = tdList[1::2]
        assert (len(tdList) == 4)
        returned[3] = getAllTextFromTag(tdList[0]).strip()
        returned[4] = getAllTextFromTag(tdList[1]).replace("%","").strip()
        returned[5] = getAllTextFromTag(tdList[2]).replace("in.","inches").strip() ##todo: down, up, ...
        returned[6] = getAllTextFromTag(tdList[3]).replace("&deg;F","").strip()

    for r in returned:
        if r == None or r == "":
            return None
    return returned
Example #20
 def retrievePostText(self, url):
     mUrl = url.replace("http://ca.indeed.com/", "http://ca.indeed.com/m/")
     html = self.doHttpRequest(mUrl)
     # description is located within <div id='desc'>
     soup = BeautifulSoup(html)
     div = soup.first('div', {'id': 'desc'})
     return self.fetch_body(div)
Example #21
def get_title(url):
    """Fetches the contents of url and extracts (and utf-8 encodes)
       the contents of <title>"""
    if not url or not url.startswith("http://"):
        return None

    try:
        # if we don't find it in the first kb of the resource, we
        # probably won't find it
        opener = urlopen(url, timeout=15)
        text = opener.read(1024)
        opener.close()
        bs = BeautifulSoup(text)
        if not bs:
            return

        title_bs = bs.first("title")

        if not title_bs or title_bs.children:
            return

        return title_bs.text.encode("utf-8")

    except:
        return None
Example #22
def cleanupResults(results):
    '''
        Extract and clean up the returned HTML.
        Return a pipe-separated list.
    '''
    pipeSeparatedData = []
    status = results.status
    if status == 200:
        data = results.read()
        soup = BeautifulSoup(data)
        target_HTML = soup.first('pre')  # grab text inside <pre html tags
        # break it up into lines
        targetLines = []
        for line in target_HTML:
            targetLines.append(line)
        # we're interested in line 2
        # break it down to a list
        sunriseDataList = targetLines[2].split('\n')
        # strip leading/trailing space
        sunriseDataList = map(stripit, sunriseDataList)
        # remove empty members
        sunriseDataList = filter(None, sunriseDataList)
        # parse line to produce pipe separated name/value string
        for en in sunriseDataList:
            icu = re.sub(r'  +', '|', en)
            pipeSeparatedData.append(icu)
        # sample output
        # [u'Begin civil twilight|6:56 a.m.', u'Sunrise|7:21 a.m.', u'Sun transit|1:32 p.m.',
        # u'Sunset|7:44 p.m.', u'End civil twilight|8:08 p.m.']
    else:
        print 'Error from Sunrise/Sunset website - no data available'
    return pipeSeparatedData
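# Follow-on sketch (not part of the original): turn the pipe separated entries shown in
# the sample output above into a simple name -> time dictionary.
def sunriseDataToDict(pipeSeparatedData):
    events = {}
    for entry in pipeSeparatedData:
        name, sep, value = entry.partition('|')
        if sep:
            events[name] = value
    return events
# e.g. sunriseDataToDict([u'Sunrise|7:21 a.m.'])['Sunrise'] == u'7:21 a.m.'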
Example #23
def getTorrentDetails(url):
    from BeautifulSoup import BeautifulSoup, SoupStrainer

    html = getHTML(url)
    spanResults = BeautifulSoup(html)
    ltrSpan = spanResults.find('span', {'dir': 'ltr'})

    if ltrSpan == None:
        raise Exception
        return

    if 'artist.php' not in ltrSpan.next.attrs[0][1] or (
            'Album' not in ltrSpan.nextSibling
            and 'Anthology' not in ltrSpan.nextSibling and 'Compilation'
            not in ltrSpan.nextSibling and 'Single' not in ltrSpan.nextSibling
            and 'Soundtrack' not in ltrSpan.nextSibling
            and 'EP' not in ltrSpan.nextSibling):
        raise Exception
        return

    tagStrainer = SoupStrainer('a', href=re.compile('torrents.php\?taglist\='))
    tagResults = BeautifulSoup(html, tagStrainer)

    artist = ''

    for element in ltrSpan.contents[:-1]:
        try:
            artist += element.string
        except:
            artist += element

    album = ltrSpan.contents[-1][3:]
    genre = tagResults.first().string

    return (unescape(artist), unescape(album), genre)
Example #24
 def test_link_other_proj_no_html2text(self):
     # without html2text, the dash in other-project doesn't get escaped right
     html = BeautifulSoup('''<pre>Foo: <a href="/p/other-project/issues/detail?id=1">issue other-project:1</a></pre>''')
     assert_equal(
         _as_markdown(html.first(), 'myproj'),
         'Foo: [issue other\\-project:1](https://code.google.com/p/other-project/issues/detail?id=1)'
     )
Example #25
def get_title(url):
    """Fetches the contents of url and extracts (and utf-8 encodes)
       the contents of <title>"""
    if not url or not url.startswith('http://'):
        return None

    try:
        # if we don't find it in the first kb of the resource, we
        # probably won't find it
        opener = urlopen(url, timeout=15)
        text = opener.read(1024)
        opener.close()
        bs = BeautifulSoup(text)
        if not bs:
            return

        title_bs = bs.first('title')

        if not title_bs or title_bs.children:
            return

        return title_bs.text.encode('utf-8')

    except:
        return None
Example #26
	def retrievePostText(self, url):
		mUrl = url.replace("http://ca.indeed.com/", "http://ca.indeed.com/m/")
		html = self.doHttpRequest(mUrl)
		# description is located within <div id='desc'>
		soup = BeautifulSoup(html)		
		div = soup.first('div', {'id' : 'desc'})
		return self.fetch_body(div)
Example #27
def parseCurrencyData(htmlText):
    global _g_imgRe
    soup = BeautifulSoup()
    soup.feed(htmlText)
    # <table width="60%" border="0" cellpadding="3" summary="Displays latest tourist currency rates">
    table = soup.first(
        "table", {
            "border": "0",
            "width": "60%",
            "cellpadding": "3",
            "summary": "Displays latest tourist currency rates"
        })
    assert table is not None
    tbody = table.first("tbody")
    assert tbody is not None
    rows = tbody.fetch("tr")
    currencies = dict()
    for row in rows:
        cells = row.fetch("td")
        img = cells[0].fetch("img")[0]
        match = _g_imgRe.match(img["src"])
        if match is None:
            continue
        abbrev = match.group(1)
        rate = float(str(cells[2].contents[0]).strip().split()[0])
        currencies[abbrev] = rate
    usdRate = currencies["USD"]
    for key in currencies.iterkeys():
        currencies[key] = currencies[key] / usdRate
    assert 1 == currencies["USD"]
    return (RESULTS_DATA, currencies)
def cleanupResults(results):
    '''
        Extract and clean up the returned HTML.
        Return a pipe-separated list.
    '''
    pipeSeparatedData = []
    status = results.status
    if status == 200:
        data = results.read()
        soup = BeautifulSoup(data)
        target_HTML = soup.first('pre') # grab text inside <pre html tags
        # break it up into lines
        targetLines = []
        for line in target_HTML:
            targetLines.append(line)
        # we're interested in line 2
        # break it down to a list
        sunriseDataList = targetLines[2].split('\n')
        # strip leading/trailing space
        sunriseDataList = map(stripit, sunriseDataList)
        # remove empty members
        sunriseDataList = filter(None, sunriseDataList)
        # parse line to produce pipe separated name/value string
        for en in sunriseDataList:
            icu = re.sub(r'  +', '|', en)
            pipeSeparatedData.append(icu)
        # sample output
        # [u'Begin civil twilight|6:56 a.m.', u'Sunrise|7:21 a.m.', u'Sun transit|1:32 p.m.',
        # u'Sunset|7:44 p.m.', u'End civil twilight|8:08 p.m.']
    else:
        print 'Error from Sunrise/Sunset website - no data available'
    return pipeSeparatedData
Example #29
def getTorrentDetails(url):
   from BeautifulSoup import BeautifulSoup, SoupStrainer

   html = getHTML(url)
   spanResults = BeautifulSoup(html)
   ltrSpan = spanResults.find('span', {'dir' : 'ltr'})

   if ltrSpan == None:
      raise Exception
      return

   if 'artist.php' not in ltrSpan.next.attrs[0][1] or ('Album' not in ltrSpan.nextSibling and 'Anthology' not in ltrSpan.nextSibling and 'Compilation' not in ltrSpan.nextSibling and 'Single' not in ltrSpan.nextSibling and 'Soundtrack' not in ltrSpan.nextSibling and 'EP' not in ltrSpan.nextSibling):
      raise Exception
      return

   tagStrainer = SoupStrainer('a', href=re.compile('torrents.php\?taglist\='))
   tagResults = BeautifulSoup(html, tagStrainer)

   artist = ''

   for element in ltrSpan.contents[:-1]:
      try:
         artist += element.string
      except:
         artist += element

   album = ltrSpan.contents[-1][3:]
   genre = tagResults.first().string

   return (unescape(artist), unescape(album), genre)
Example #30
 def save(self):
     # Pull the first image's html out of the article body (original note: using a regex)
     soup = BeautifulSoup(self.content_html)
     self.content_pic = str(soup.first("img"))  # soup.first("img") returns only the first pic; convert it to str, otherwise it is a Tag object
     if not self.content_pic:
         self.content_pic = ''
     super(News, self).save()
    def scrapeList(self):
        """
        Scrapes the pages for a list.
        Saves the list index page locally.
        Sends for scraping of each month's pages (and then on to the individual messages).
        """

        # Get the page that list the months of archive pages.
        source = self.fetchPage(self.list_url)

        # The copy of the page we save is filtered for email addresses, links, etc.
        filtered_source = self.filterPage(source)

        # Save our local copy.
        # eg /Users/phil/Sites/examplesite/html/list-name/index.html
        local_index = open(self.publish_dir + "/index.html", "w")
        local_index.write(filtered_source)
        local_index.close()

        soup = BeautifulSoup(source)

        # Go through each row in the table except the first (which is column headers).
        for row in soup.first("table")("tr")[1:]:
            # Get the text in the first column: "February 2009:"
            (month, year) = row("td")[0].string.split()
            # Strip the colon off.
            year = year[:-1]

            # Scrape the date page for this month and get all its messages.
            # keep_fetching will be True or False, depending on whether we need to keep getting older months.
            keep_fetching = self.scrapeMonth(year + "-" + month)

            if not keep_fetching:
                break
Example #32
def parseCurrencyData(htmlText):
    global _g_imgRe
    soup = BeautifulSoup()
    soup.feed(htmlText)
    # <table width="60%" border="0" cellpadding="3" summary="Displays latest tourist currency rates">
    table = soup.first("table", {"border": "0", "width": "60%", "cellpadding": "3", "summary": "Displays latest tourist currency rates"})
    assert table is not None
    tbody = table.first("tbody")
    assert tbody is not None
    rows = tbody.fetch("tr")
    currencies = dict()
    for row in rows:
        cells = row.fetch("td")
        img = cells[0].fetch("img")[0]
        match = _g_imgRe.match(img["src"])
        if match is None:
            continue
        abbrev = match.group(1)
        rate = float(str(cells[2].contents[0]).strip().split()[0])
        currencies[abbrev] = rate
    usdRate = currencies["USD"]
    for key in currencies.iterkeys():
        currencies[key] = currencies[key] / usdRate
    assert 1 == currencies["USD"]
    return (RESULTS_DATA, currencies)
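# Hedged helper sketch built on the dict parseCurrencyData returns: after the
# normalization above currencies["USD"] == 1, and each value is read here as units of
# that currency per one US dollar (an assumption about the site's rate convention).
def convertAmount(amount, fromCode, toCode, currencies):
    amountInUsd = amount / currencies[fromCode]
    return amountInUsd * currencies[toCode]
# e.g. convertAmount(100.0, "USD", "EUR", currencies) -- the currency codes are illustrative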
Example #33
def parseFirstDayHtml(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)

    # sky, now, feelsLike, wind, hum, pressure, dew point, visibility
    returned = [None, None, None, None, None, None, None, "N/A"]

    bItems = soup.fetch("b", {"class": "obsTextA"})
    if len(bItems) == 2:
        returned[0] = getAllTextFromTag(bItems[0]).strip()
        temp = getAllTextFromTag(bItems[1]).strip().split("Like ")
        if len(temp) > 1:
            returned[2] = temp[1].replace("&deg;F", "").strip()

    bItem = soup.first("b", {"class": "obsTempTextA"})
    if bItem:
        returned[1] = getAllTextFromTag(bItem).replace("&deg;F", "").strip()

    tdList = soup.fetch("td", {"class": "obsTextA"})
    if len(tdList) == 8:
        tdList = tdList[1::2]
        assert (len(tdList) == 4)
        returned[3] = getAllTextFromTag(tdList[0]).strip()
        returned[4] = getAllTextFromTag(tdList[1]).replace("%", "").strip()
        returned[5] = getAllTextFromTag(tdList[2]).replace(
            "in.", "inches").strip()  ##todo: down, up, ...
        returned[6] = getAllTextFromTag(tdList[3]).replace("&deg;F",
                                                           "").strip()

    for r in returned:
        if r == None or r == "":
            return None
    return returned
    def _menu_item_exists(self, menu_item_text):
        response = self.app.get(get_route("home"))

        soup = BeautifulSoup(response.body)

        menu = soup.first("ul", {"class": "nav"})

        return bool(menu.first("a", text=menu_item_text))
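    # Hypothetical sibling test using the helper above; the menu label is made up, and
    # get_route/self.app are assumed to come from the surrounding test fixture.
    def test_home_page_has_login_menu_item(self):
        assert self._menu_item_exists("Login")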
Example #35
def parseDream2(htmlTxt):
    soup = BeautifulSoup()
    # TODO: this is temporary:
    htmlTxt = htmlTxt.replace(
        "/*<![CDATA[*/ @import \"/knowledge/stylesheets/monobook/main.css\"; /*]]>*/",
        "")

    soup.feed(htmlTxt)

    tableMain = soup.fetch("table", {
        "width": "768",
        "align": "center",
        "cellspacing": "0",
        "cellpadding": "0"
    })
    if not tableMain:
        return (UNKNOWN_FORMAT, dUnknownFormatText)
    td = None
    for table in tableMain:
        tr = table.first("tr")
        if tr:
            tdTest = tr.first("td", {"width": "100%", "valign": "top"})
            if tdTest:
                td = tdTest
    if not td:
        return (UNKNOWN_FORMAT, dUnknownFormatText)
    # unclear why this does not work without re-parsing the cleaned-up <td>
    soup2 = BeautifulSoup()
    soup2.feed(str(td).replace("<br />>", ""))
    td = soup2.first("td")
    # no results?
    if td.first("center"):
        return (NO_RESULTS, dNoResultsText)

    # results
    bTable = td.fetch("b")
    if not bTable:
        return (UNKNOWN_FORMAT, dUnknownFormatText)

    outerList = []
    for bItem in bTable:
        title = getAllTextFromTag(bItem)
        next = getLastElementFromTag(bItem)
        pItem = None
        while next and not pItem:
            if isinstance(next, Tag):
                if next.name == "p":
                    pItem = next
            next = next.next
        if pItem:
            text = getAllTextFromTagWithA(pItem.first("font"))
            if text.startswith("Interpretation: "):
                text = text[len("Interpretation: "):]
            outerList.append((title, text))

    if 0 == len(outerList):
        return (NO_RESULTS, dNoResultsText)
    return (DREAM_DATA, universalDataFormatReplaceEntities(outerList))
Example #36
def parseJoke(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)

    table = soup.first("table", {"width": "328", "id": "Table2"})
    if not table:
        return (UNKNOWN_FORMAT, jUnknownFormatText)
    tdList = table.fetch("td", {"colspan": "3", "valign": "top", "class": "body"})
    if 3 != len(tdList):
        return (UNKNOWN_FORMAT, jUnknownFormatText)

    # simple format - simple parser
    title = getAllTextFromTag(tdList[0]).strip()
    text = getAllTextFromToInBrFormat(tdList[1], tdList[2].previous)
    smallList = [title, text]
    # add rating information
    if len(title) + len(text) > 16:  # in random joke sometimes it returns small nothing... so to be sure
        span = soup.first("span", {"class": "body"})
        if span:
            text = getAllTextFromTag(span).replace("\n", "").strip()
            img = span.first("img", {"src": "%"})
            if text.startswith("CURRENT RATING") and img:
                src = img["src"]
                src = src.split("/")[-1]
                src = src.replace(".gif", "")
                translator = {
                    "iconrate_one": "1",
                    "iconrate_two": "2",
                    "iconrate_three": "3",
                    "iconrate_four": "4",
                    "iconrate_five": "5",
                    "iconrate_one_half": "1.5",
                    "iconrate_two_half": "2.5",
                    "iconrate_three_half": "3.5",
                    "iconrate_four_half": "4.5",
                    "iconrate_zero_half": "0.5",
                }
                rating = "not rated"
                try:
                    rating = translator[src]
                except:
                    pass
                smallList.append(rating)
    outerList = [smallList]
    return (JOKE_DATA, universalDataFormatReplaceEntities(outerList))
Example #37
def reversePhoneLookup(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = m411testNoResultsReversePhoneLookup(soup)
    if NO_RESULTS == noResults:
        return (noResults,m411NoResultsText)
    tdWithResults = soup.first("td",{"class":"TTLPREF"})
    if not tdWithResults:
        tdWithResults = soup.first("span",{"class":"TTLPREF"})
    if not tdWithResults:
        return (UNKNOWN_FORMAT,m411UnknownFormatText)
    # results are inside <td>
    fontColor = tdWithResults.first("font")
    if fontColor:
        # "No details available."
        counter = 0
        for br in tdWithResults.fetch("br"):
            # we believe the text after the 6th <br> is the city
            if counter == 5:
                city =  "%s" % str(br.next).replace("\n","").strip()
                returned.append(["","",city,""])
            counter += 1
    else:
        # all data, or city & phone
        counter = 0
        person = ""
        address = ""
        city = ""
        phone = ""
        for br in tdWithResults.fetch("br"):
            # 7 <br> in <td> 
            if 1 == counter:
                if not isinstance(br.next,Tag):
                    person = "%s" % str(br.next).replace("\n","").strip()
            if 2 == counter:
                if not isinstance(br.next,Tag):
                    address = "%s" % str(br.next).replace("\n","").strip()
            if 3 == counter:
                city = "%s" % str(br.next).replace("\n","").strip()
            if 4 == counter:
                phone = "%s" % str(br.next).replace("\n","").strip()
            counter += 1
        returned.append((person,address,city,phone))    
    return (RESULTS_DATA,universalDataFormatReplaceEntities(returned))
Example #38
def create_html_digest_for_label(email_id, threads, label, soup):

    threadtable = soup.find('table', {'id': label + 'table'})
    threadentrytemplate = open("html_templates/thread.html").read()
    count = 0
    thread_ids = []
    for thread in threads:
        # print 'thread '+label, count
        threadsoup = BeautifulSoup(threadentrytemplate)
        threadtag = threadsoup.first()

        subject = thread['subject']
        outline = thread['snippet']
        thread_ids.append(thread['id'])

        sender = get_sender_string(email_id, thread['participants'])

        sendertag = threadtag.find('span', {'id': 'thsender'})
        if sendertag is not None:
            sendertag.contents[0].replaceWith(sender)

        subjecttag = threadtag.find('span', {'id': 'thsubject'})
        if subjecttag is not None:
            subjecttag.contents[0].replaceWith(subject)

        outlinetag = threadtag.find('div', {'id': 'thoutline'})
        if outlinetag is not None:
            outlinetag.contents[0].replaceWith(outline)

        inboxonce = threadtag.find('a', {'class': 'inboxonce'})
        if inboxonce is not None:
            inboxonce[
                'href'] = prioritizer_url + '/daily_digest/inbox_once?email=' + email_id + "&id=" + thread[
                    'id']

        inboxalways = threadtag.find('a', {'class': 'inboxalways'})
        if inboxalways is not None:
            inboxalways[
                'href'] = prioritizer_url + '/daily_digest/inbox_always?email=' + email_id + "&id=" + thread[
                    'id']

        unsubscribe = threadtag.find('a', {'class': 'unsubscribe'})
        if unsubscribe is not None:
            unsubscribe[
                'href'] = prioritizer_url + '/daily_digest/unsubscribe?email=' + email_id + "&id=" + thread[
                    'id']

        threadtable.append(threadtag)
        count += 1

    if count > 0:
        labelcounttag = soup.find('span', {'id': label + 'number'})
        if labelcounttag is not None:
            labelcounttag.contents[0].replaceWith(" (" + str(count) + ")")

    return soup, count, thread_ids
Example #39
    def parse_job_post(self, html):
        features = dict()
        soup = BeautifulSoup(html)
        form = soup.first('form', {'name': 'applyjob'})
        features[COMPANY_NAME] = form.first('input', {'name': 'company_name'})
        features[POSITION_TITLE] = form.first('input', {'name': 'position'})
        features[POST_DATE] = form.first('input', {'name': 'insert_date'})

        print html
        return (None, features)
Example #40
def getTextFromDirtyText(dirtyText):
    soup = BeautifulSoup()
    soup.feed("<xxx>" + dirtyText + "</xxx><yyy>test</yyy>")
    dirtySoup = soup.first("xxx")
    textWithBr = getAllTextFromToInBrFormat(
        dirtySoup,
        getLastElementFromTag(dirtySoup).next)
    text = textWithBr.replace("<br>", "\n").replace("<b>",
                                                    "").replace("</b>", "")
    return text
Example #41
	def parse_job_post(self, html):
		features = dict()
		soup = BeautifulSoup(html)
		form = soup.first('form', {'name' : 'applyjob'})
		features[COMPANY_NAME] = form.first('input', {'name' : 'company_name'})
		features[POSITION_TITLE] = form.first('input', {'name' : 'position'})
		features[POST_DATE] = form.first('input', {'name' : 'insert_date'})
			
			
		print html
		return (None, features)
Example #42
    def children(self):
        # this is nasty, but the children are not encoded in the OBO
        if len(self._children) == 0:
            file = utils.download('%smini' % (url_go_lookup[:-3] % self.id))
            soup = BeautifulSoup(file)
            tab = soup.first('table')
            for entry in tab.contents:
                if isinstance(entry, Tag):
                    self._children.append(entry.findAll('a')[1].contents[0])

        return self._children
Example #43
 def fetchPastie(self):
     downloaded_page, headers = downloadUrl(self.url)
     if downloaded_page:
         htmlDom = BeautifulSoup(downloaded_page)
         # search for <textarea class="raw">
         textarea = htmlDom.first('textarea', {'class': 'raw'})
         if textarea:
             # replace html entities like &gt;
             decoded = BeautifulSoup(textarea.contents[0], convertEntities=BeautifulSoup.HTML_ENTITIES)
             self.pastie_content = decoded.contents[0]
     return self.pastie_content
Example #44
def getURLData(url):

    try:
        doc = urllib2.urlopen(url, timeout=5).read()
    except urllib2.URLError:
        return None
    try:
        doc = BeautifulSoup(doc)  # parse the page fetched above instead of requesting it a second time
    except UnicodeEncodeError: # This is for links to files/images.
        doc = BeautifulSoup('')

    title = url
    description = u''

    # title
    if doc.title:
        title = doc.title.string
    if not title:
        title = doc.first('meta', attrs={'name': 'title'})
        if title:
            title = title.get('content')

    # description
    description = doc.first('meta', attrs={'name': 'description'})
    if description:
        description = description.get('content')

    # Find favicon
    host_url = urlparse(url)
    favicon_url = doc.first('link', rel='shortcut icon')
    if favicon_url:
        favicon_url = favicon_url.get('href')
        if not favicon_url.startswith('http'):
            favicon_url = host_url[0] + u'://' + host_url[1] + favicon_url
    else:
        favicon_url = host_url[0] + u'://' + host_url[1] + u'/favicon.ico'

    return json.dumps({
        'title': title,
        'description': description,
        'favicon_url': favicon_url})
Example #45
def unwrap_minecraftforum(url, resp, body):
    urls = {
        'forum': url,  # This *is* the forum page!
    }

    # It might also have clues as to where the downloads are.
    soup = BeautifulSoup(body)
    post_elt = soup.first('div', 'entry-content')

    best_href = None
    best_score = 0
    for a_elt in post_elt.findAll(
            'a', 'bbc_url'
    ):  # This class distinguishes URLS inserted by the author of the post.
        try:
            href = a_elt['href']

            # Many entries contain self-links.
            if href == url:
                continue

            # Check for licence link.
            licence_score = sum(pat_score
                                for (pat, pat_score) in LICENCE_SCORES
                                if pat.search(href))
            if licence_score:
                urls['licence'] = href
                continue

            # Otherwise, this is a candidate for download or home link.
            score = sum(pat_score for (pat, pat_score) in URL_SCORES
                        if pat.search(href))
            labels = []
            try:
                label = ''.join(a_elt.findAll(text=True))
                if label:
                    labels.append(label)
            except AttributeError:
                pass
            # Look for a label immediately preceding the link:
            label = a_elt.findPreviousSibling(text=True)
            if label:
                labels.append(label)

            for label in labels:
                score += sum(pat_score for (pat, pat_score) in LABEL_SCORES
                             if pat.search(label))
            if a_elt.img:
                score += 10
            if score > best_score:
                best_href, best_score = href, score
        except KeyError, e:
            print >> sys.stderr, a_elt, 'did not have', e
    def scrapeList(self):
        source = self.fetchPage(self.list_url)
        filtered_source = self.filterPage(source)
        soup = BeautifulSoup(source)

        for row in soup.first('table')('tr')[1:]:
            rel_url = row('td')[2]('a')[0].get('href')
            source = self.fetchPage(self.list_url + '/' + rel_url)

            local_month = open(self.local_dir + '/' + rel_url, 'w')
            local_month.write(source)
            local_month.close()
Example #47
def getURLData(url):

    try:
        doc = urllib2.urlopen(url, timeout=5).read()
    except urllib2.URLError:
        return None
    try:
        doc = BeautifulSoup(doc)  # parse the page fetched above instead of requesting it a second time
    except UnicodeEncodeError:  # This is for links to files/images.
        doc = BeautifulSoup('')

    title = url
    description = u''

    # title
    if doc.title:
        title = doc.title.string
    if not title:
        title = doc.first('meta', attrs={'name': 'title'})
        if title:
            title = title.get('content')

    # description
    description = doc.first('meta', attrs={'name': 'description'})
    if description:
        description = description.get('content')

    # Find favicon
    host_url = urlparse(url)
    favicon_url = doc.first('link', rel='shortcut icon')
    if favicon_url:
        favicon_url = favicon_url.get('href')
        if not favicon_url.startswith('http'):
            favicon_url = host_url[0] + u'://' + host_url[1] + favicon_url
    else:
        favicon_url = host_url[0] + u'://' + host_url[1] + u'/favicon.ico'

    return {'title': title,
            'description': description,
            'favicon_url': favicon_url}
Example #48
    def scrapeList(self):
        source = self.fetchPage(self.list_url)
        filtered_source = self.filterPage(source)
        soup = BeautifulSoup(source)


        for row in soup.first('table')('tr')[1:]:
            rel_url = row('td')[2]('a')[0].get('href')
            source = self.fetchPage(self.list_url + '/' + rel_url)

            local_month = open(self.local_dir + '/' + rel_url, 'w')
            local_month.write(source)
            local_month.close()
Example #49
 def fetch_pastie(self):
     downloaded_page, headers = download_url(self.url)
     if downloaded_page:
         htmlDom = BeautifulSoup(downloaded_page)
         # search for <textarea class="raw">
         textarea = htmlDom.first('textarea', {'class': 'raw'})
         if textarea:
             # replace html entities like &gt;
             decoded = BeautifulSoup(
                 textarea.contents[0],
                 convertEntities=BeautifulSoup.HTML_ENTITIES)
             self.pastie_content = decoded.contents[0]
     return self.pastie_content
Example #50
def download_script(script_id, save_to):
    vimhome = "http://www.vim.org/scripts/"
    data = urllib2.urlopen(vimhome + "/script.php?script_id=" + script_id)
    soup = BeautifulSoup(data)
    #the first row of download link table
    a_tag = soup.first('td', {'class': 'rowodd'}).find('a')

    download_link = a_tag["href"]
    download_filename = a_tag.text

    src_data = urllib2.urlopen(vimhome + download_link).read()
    dst_path = os.path.join(save_to, download_filename)
    dst_fileobj = open(dst_path, "wb")
    dst_fileobj.write(src_data)
    dst_fileobj.close()
Example #51
def parseStock(htmlTxt):
    # this is funny
    htmlTxt = htmlTxt.replace("<! -- ", "<!---")
    soup = BeautifulSoup()
    soup.feed(htmlTxt)

    noResults = testNoResults(soup)
    if NO_RESULTS == noResults:
        return (NO_RESULTS, sNoResultsText)

    # get name
    nameTag = soup.first("td", {"height": "30", "class": "ygtb"})
    if not nameTag:
        return (UNKNOWN_FORMAT, sUnknownFormatText)
    name = getAllTextFromTag(nameTag).strip()

    # get all data from table
    bigTable = soup.fetch("table", {"width": "580", "id": "yfncsumtab"})
    if 1 != len(bigTable):
        return (UNKNOWN_FORMAT, sUnknownFormatText)
    tdDataList = bigTable[0].fetch("td", {"class": "yfnc_tabledata1"})
    innerList = [name]
    counter = 0
    for tdItem in tdDataList:
        if 2 == counter:
            # the 3rd element holds the up/down icon
            imgItem = tdDataList[2].first("img")
            upDown = ""
            if imgItem:
                upDown = imgItem['alt']
            innerList.append(upDown)
            bItem = tdDataList[2].first("b")
            itemText = ""
            if bItem:
                itemText = getAllTextFromTag(bItem).strip()
            innerList.append(itemText)
        else:
            itemText = getAllTextFromTag(tdItem).strip()
            innerList.append(itemText)
        counter += 1

    # any results?
    if 0 == counter:
        return (UNKNOWN_FORMAT, sUnknownFormatText)

    # one-item UDF
    outerList = [innerList]
    return (STOCKS_DATA, universalDataFormatReplaceEntities(outerList))
Example #52
class AtomRecentActivity(RecentActivitySource):
    def __init__(self, lastDay, feedUrl):
        RecentActivitySource.__init__(self, lastDay)
        self.feedUrl = feedUrl

    def collectData(self):
        opener = urllib2.build_opener()
        self.feedXml = BeautifulSoup(opener.open(self.feedUrl))

    def interpretData(self):
        feed = self.feedXml.first()
        for entry in feed.findAll('entry'):
            self.logEntryAsActivity(entry)

    def logEntryAsActivity(self, entry):
        self.recentActivity[entry.updated.text] += 1