Example #1
def GetBlogHistory(url, direction):
    global totalnum
    global currentnum

    # Pause after roughly every 16 page fetches so the site does not block
    # us for requesting too frequently.
    if currentnum > 15:
        print "sleep 30 seconds before continuing retrieving new instances"
        time.sleep(30)
        currentnum = 0
    currentnum += 1

    try:
        driver.get(url)
    except Exception:
        if printtimeoutexcept == 1:
            print "retry for timeout(20sec) exception, url: " + url
        # Retry the same URL, then return so the stale page source below
        # is not processed after a failed load.
        GetBlogHistory(url, direction)
        return
    content = driver.page_source
    soup = BeautifulSoup.BeautifulSOAP(content)
    if direction == 0 or direction == -1:
        # Follow the link in the left-hand ("pleft") navigation block.
        llist = soup.find("div", attrs={"class": "pleft thide"})
        if llist is not None:
            lurl = dict(llist.contents[0].attrs)['href']
            ltext = html_parser.unescape(dict(llist.attrs)['a'])
            if printblognum == 1:
                totalnum += 1
                print "%d. %s %s" % (totalnum, lurl, ltext)
            else:
                print lurl + '  ' + ltext
            GetBlogHistory(lurl, -1)
    if direction == 0 or direction == 1:
        # Follow the link in the right-hand ("pright") navigation block.
        rlist = soup.find("div", attrs={"class": "pright thide"})
        if rlist is not None:
            rurl = dict(rlist.contents[0].attrs)['href']
            rtext = html_parser.unescape(dict(rlist.attrs)['a'])
            if printblognum == 1:
                totalnum += 1
                print "%d. %s %s" % (totalnum, rurl, rtext)
            else:
                print rurl + '  ' + rtext
            GetBlogHistory(rurl, 1)
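
GetBlogHistory relies on module-level state that is not part of this excerpt. A minimal sketch of that setup might look like the following; the names are taken from the function body, while the Selenium Firefox driver and the flag values are assumptions:

import time

import BeautifulSoup
import HTMLParser
from selenium import webdriver

totalnum = 0            # running count of printed posts
currentnum = 0          # pages fetched since the last pause
printtimeoutexcept = 1  # 1: report page-load timeouts
printblognum = 1        # 1: prefix each printed URL with a sequence number

html_parser = HTMLParser.HTMLParser()  # used to unescape link text
driver = webdriver.Firefox()           # any Selenium WebDriver would do
driver.set_page_load_timeout(20)       # matches the "timeout(20sec)" message
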
Example #2
import urllib2
import BeautifulSoup as bs


def rpm_list(urllink):
    """List the first attribute value of every <a> tag, dropping the first."""
    download_list = []
    html_doc = urllib2.urlopen(urllink)
    html_bs = bs.BeautifulSOAP(html_doc)
    for i in html_bs.findAll('a'):
        # In BeautifulSoup 3, attrs is a list of (name, value) pairs; the
        # first attribute of each anchor is assumed here to be the href.
        download_list.append(i.attrs[0][1])
    return download_list[1:]
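
A quick way to exercise rpm_list is to point it at a directory-style index and print what comes back; the URL below is a placeholder, and the call needs network access:

# Hypothetical repository index; substitute a real directory listing URL.
repo_url = 'http://example.com/repo/x86_64/'

for href in rpm_list(repo_url):
    print href
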
Example #3
    def loadConfig(self, configPath):
        "Initialize the state of the fairy from XML config files"
        self.configPath = os.path.join(
            os.path.dirname(os.path.abspath(sys.argv[0])), configPath)
        printAndLog("Merge Fairy loading configuration file " +
                    self.configPath)

        soup = BeautifulSoup.BeautifulSOAP(open(self.configPath))
        dom = soup.first("fairy")

        for attr in ["urlbase", "pathbase", "buildcmd"]:
            self.__dict__[attr] = dom[attr]
        self.pollinterval = int(float(dom["pollinterval"]))

        self.branches = []
        self.branchMap = {}  # map between URLs and branch objects
        for branchNode in dom.fetch("branch"):
            branch = SvnBranch(self.buildcmd,
                               urlbase=self.urlbase,
                               pathbase=self.pathbase).initFromXml(branchNode)
            self.branches.append(branch)
            self.branchMap[branch.url] = branch
        self.branches.sort(key=lambda b: b.url)

        self.dependencyMap = DependencyMap()
        for dependencyNode in dom.fetch("dependency"):
            dependency = Dependency().initFromXml(dependencyNode, self)
            self.dependencyMap.addDependency(dependency)
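
For reference, loadConfig expects a root <fairy> element whose urlbase, pathbase, buildcmd and pollinterval attributes are copied onto the object, plus nested <branch> and <dependency> elements that are handed to initFromXml. Those initFromXml methods are not part of this excerpt, so the attributes on <branch> and <dependency> below are placeholders; this is only a sketch of the document shape:

EXAMPLE_FAIRY_CONFIG = """\
<fairy urlbase="http://svn.example.com/repos"
       pathbase="/var/fairy/checkouts"
       buildcmd="make test"
       pollinterval="60">
    <branch url="branches/feature-x" />
    <branch url="trunk" />
    <dependency source="branches/feature-x" target="trunk" />
</fairy>
"""
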
Example #4
    def _reddit(self, args):
        """Usage: `{cmd_prefix}reddit [*subreddits]`"""
        output = []
        args = args if args else ['']
        for arg in args:
            if arg:
                site = 'http://www.reddit.com/r/{}'.format(arg)
                logger.log((site, ), (None, ))
            else:
                site = 'http://www.reddit.com/'
            # Grab the page (size argument is 20 MiB) and keep every anchor
            # whose class attribute is "title".
            html = urlgrabber.urlread(site, size=2097152 * 10)
            bs = BeautifulSoup.BeautifulSOAP(html)
            output.extend(bs.findAll('a', 'title'))
        return '\n'.join('{}: {} {}'.format(i + 1, o.string, o.get('href'))
                         for i, o in enumerate(output[:5]))
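
The extraction step can be tried offline by handing BeautifulSOAP a hand-written fragment instead of a downloaded page; findAll('a', 'title') matches anchors whose class attribute is "title", so the fragment below mimics that. The URLs and titles are invented:

import BeautifulSoup

# Stand-in for the HTML that urlgrabber.urlread() would download above.
sample_html = '''
<div class="entry">
  <a class="title" href="http://example.com/story-1">First story</a>
  <a class="title" href="http://example.com/story-2">Second story</a>
</div>
'''

bs = BeautifulSoup.BeautifulSOAP(sample_html)
links = bs.findAll('a', 'title')  # anchors whose class attribute is "title"
print '\n'.join('{}: {} {}'.format(i + 1, o.string, o.get('href'))
                for i, o in enumerate(links[:5]))
# 1: First story http://example.com/story-1
# 2: Second story http://example.com/story-2
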
Example #5
        def parse_resp(content):
            """Extract the form's action URL, HTTP method, and each input's
            name/value pair from an HTML response."""
            params = []

            soup = BeautifulSoup.BeautifulSOAP(content)
            form = soup.find('form')

            form_attr_dict = dict(form.attrs)
            for e in form.findAll('input'):
                e_attr_dict = dict(e.attrs)
                params.append((e_attr_dict['name'], e_attr_dict['value']))

            return (form_attr_dict['action'],
                    form_attr_dict['method'].upper(), dict(params))
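
parse_resp is a nested helper in the original project; treated as a module-level function, with the BeautifulSoup module importable, a hand-written form shows what it returns. Every <input> needs both a name and a value, since both keys are indexed:

sample_form = '''
<html><body>
  <form action="/login" method="post">
    <input type="hidden" name="token" value="abc123" />
    <input type="hidden" name="next" value="/home" />
  </form>
</body></html>
'''

action, method, params = parse_resp(sample_form)
print action  # /login
print method  # POST
print params  # {'token': 'abc123', 'next': '/home'} (key order may vary)
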
Example #6
def parse_full(xml):
    '''Given the XML returned from a comp search, try to interpret
    the data that the search query returned.
    '''
    soup = BeautifulSoup.BeautifulSOAP(xml)
    data = soup.find('data')
    if not data:
        raise ParseException('unable to find data element')

    totals = data.find('totals')
    results = data.find('results')
    if not totals:
        raise ParseException('unable to find totals element')
    if not results:
        raise ParseException('unable to find results element')

    # totals
    #
    by_category = {}
    for tab in totals.findAll('tab'):
        table_name = tab.find('table').text
        count = int(tab.find('total').text)
        by_category[table_name] = count

    # results: since we're not exactly sure what's coming back (other than by
    # inspecting the category the query was made with), handle whatever rows
    # appear generically.
    #
    result_rows = []
    for child in results.childGenerator():
        if not isinstance(child, BeautifulSoup.Tag):
            continue
        result_rows.append(_soupdict(child))

    return {
        'totals': by_category,
        'rows': result_rows,
    }
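
parse_full depends on a ParseException class and a _soupdict helper defined elsewhere in the same module. Assuming those are in scope, a small hand-written response illustrates the shape of the result; the field names inside <results> are invented:

sample_xml = '''
<data>
  <totals>
    <tab><table>people</table><total>2</total></tab>
    <tab><table>companies</table><total>1</total></tab>
  </totals>
  <results>
    <row><id>1</id><name>Alice</name></row>
    <row><id>2</id><name>Bob</name></row>
  </results>
</data>
'''

parsed = parse_full(sample_xml)
print parsed['totals']     # {'people': 2, 'companies': 1} (key order may vary)
print len(parsed['rows'])  # 2 -- one entry per child element of <results>
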
Example #7
def GetBlogHistory(url, direction):
    global totalnum
    try:
        driver.get(url)
    except Exception:
        if printtimeoutexcept == 1:
            print "Retry for timeout(20sec) exception, url: " + url
        # Retry the same URL, then return so the stale page source below
        # is not processed after a failed load.
        GetBlogHistory(url, direction)
        return
    content = driver.page_source
    soup = BeautifulSoup.BeautifulSOAP(content)
    if direction == 0 or direction == -1:
        # Follow the link in the left-hand ("pleft") navigation block.
        llist = soup.find("div", attrs={"class": "pleft thide"})
        if llist is not None:
            lurl = dict(llist.contents[0].attrs)['href']
            ltext = html_parser.unescape(dict(llist.attrs)['a'])
            if printblognum == 1:
                totalnum += 1
                print "%d. %s %s" % (totalnum, lurl, ltext)
            else:
                print lurl + '  ' + ltext
            GetBlogHistory(lurl, -1)
    if direction == 0 or direction == 1:
        # Follow the link in the right-hand ("pright") navigation block.
        rlist = soup.find("div", attrs={"class": "pright thide"})
        if rlist is not None:
            rurl = dict(rlist.contents[0].attrs)['href']
            rtext = html_parser.unescape(dict(rlist.attrs)['a'])
            if printblognum == 1:
                totalnum += 1
                print "%d. %s %s" % (totalnum, rurl, rtext)
            else:
                print rurl + '  ' + rtext
            GetBlogHistory(rurl, 1)