Example #1
def get_page(page):
    content = ''
    imgurl = ''
    title = ''
    img_binary = ''
    tags = []
    explain = ''
    try:
        content = urllib2.urlopen(page, timeout=2).read()
        try:
            soup = bsoup(content, 'lxml')
        except:
            soup = bsoup(content, 'html.parser')
        tmp1 = soup.findAll('div', {'class': 'detail_image'})[0]
        tmp2 = soup.find('div', {'id': 'miaoshu'})
        tmp3 = soup.find('div', {'id': 'detail_additional'}).contents[-2]

        tags = re.findall("<a href=.*?>([^<]*)?</a>", unicode(tmp3))
        imgurl = tmp1.img.get('src', '')
        title = tmp1.img.get('title', '')
        explain = tmp2.p.text
        if explain == '' or explain == None:
            explain = title
        explain = explain.replace('\r\n', '').replace('\n', '')
        title = simplify(title)
        req = urllib2.Request(url=imgurl, headers={"Referer": page})
        img_binary = urllib2.urlopen(req).read()
    except Exception, e:
        print e
        return False, '', '', '', '', ''
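The snippet above is cut off before its success return, so as written it falls through and returns None on success while the except branch returns a six-item tuple. For comparison, a minimal Python 3 / bs4 sketch of the same fetch-with-Referer idea (the function name and return shape are assumptions for illustration, not part of the original):

# Hypothetical Python 3 sketch of the Referer-protected image fetch above.
import urllib.request
from bs4 import BeautifulSoup

def fetch_page_and_image(page, imgurl):
    # Parse the page first; html.parser avoids the lxml dependency.
    with urllib.request.urlopen(page, timeout=2) as resp:
        soup = BeautifulSoup(resp.read(), 'html.parser')
    # Some hosts only serve the image when the page URL is sent as the Referer.
    req = urllib.request.Request(imgurl, headers={'Referer': page})
    with urllib.request.urlopen(req, timeout=2) as resp:
        return soup, resp.read()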
Example #2
def openSoup(x):
    """Opens URL and create soup"""
    try:
        return bsoup(urllib2.urlopen(x).read())
    except urllib2.HTTPError, e:
        print 'Taking a breather...'
        time.sleep(120)
        return bsoup(urllib2.urlopen(x).read())
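The helper above sleeps once and retries exactly one more time. A hedged Python 3 sketch of the same back-off idea with a bounded retry loop (assumes bs4 is installed and the URL is reachable):

# Python 3 sketch of the "sleep, then retry on HTTPError" pattern above.
import time
import urllib.error
import urllib.request
from bs4 import BeautifulSoup

def open_soup(url, retries=2, pause=120):
    """Opens URL and creates soup, retrying on HTTP errors."""
    for attempt in range(retries + 1):
        try:
            return BeautifulSoup(urllib.request.urlopen(url).read(), 'html.parser')
        except urllib.error.HTTPError:
            if attempt == retries:
                raise
            print('Taking a breather...')
            time.sleep(pause)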
Example #4
    def read_file(self, filename):
        """
        Reads a single SemCor file in NLP Annotation Format (NAF).
        """
        with io.open(filename, 'r') as fin:
            print filename
            xml = fin.read()  # TODO: handle IOError [Errno 5] (input/output error) from read()
            text = bsoup(xml).find('text')
            terms = bsoup(xml).find('terms')

            sentences = defaultdict(list)
            paragraphs = defaultdict(list)

            # Gets the term layer.

            termid2sense = {}
            for term in terms.findAll('term'):
                termid = int(term.get('id')[1:])
                term_sense = None
                try:
                    sense = term.findAll('externalref')[-1].get('reference')
                    term_sense = sense[6:] if sense.startswith(
                        'eng30-') else sense
                except:
                    pass

                t = Term(termid, term.get('pos'), term.get('lemma'),
                         term_sense, term.get('type'))
                termid2sense[termid] = t

            # Gets the text layer.
            wordid2meta = {}
            for word in text.findAll('wf'):
                wordid = int(word.get('id')[1:])
                sentid = int(word.get('sent'))
                paraid = int(word.get('para'))
                try:
                    term = termid2sense[wordid]
                except:
                    # TODO: please check that all 'words' without term annotation
                    #       are punctuation.
                    term = Term(id=wordid,
                                pos=u'PUNCT',
                                lemma=None,
                                sense=None,
                                type=u'punct')
                w = Word(wordid, word.text, sentid, paraid, term)
                wordid2meta[wordid] = w
                sentences[sentid].append(wordid)
                paragraphs[paraid].append(sentid)

            return wordid2meta, termid2sense, sentences, paragraphs
Example #5
    def read_file(self, filename):
        """
        Reads a single SemCor file in NLP Annotation Format (NAF).
        """
        with io.open(filename, 'r') as fin:
            print filename
            xml = fin.read() # TODO: handle IOError [Errno 5] (input/output error) from read()
            text = bsoup(xml).find('text')
            terms = bsoup(xml).find('terms')
            
            sentences = defaultdict(list)
            paragraphs = defaultdict(list)
            
            # Gets the term layer.
             
            termid2sense = {}
            for term in terms.findAll('term'):
                termid = int(term.get('id')[1:])
                term_sense = None
                try:
                    sense = term.findAll('externalref')[-1].get('reference')
                    term_sense = sense[6:] if sense.startswith('eng30-') else sense
                except:
                    pass
                
                t = Term(termid, term.get('pos'), term.get('lemma'), 
                         term_sense, term.get('type'))
                termid2sense[termid] = t
            
            # Gets the text layer.
            wordid2meta = {}
            for word in text.findAll('wf'):
                wordid = int(word.get('id')[1:])
                sentid = int(word.get('sent'))
                paraid = int(word.get('para'))
                try:
                    term = termid2sense[wordid]
                except:
                    # TODO: please check that all 'words' without term annotation
                    #       are punctuation.
                    term=Term(id=wordid, pos=u'PUNCT', lemma=None, 
                              sense=None, type=u'punct')                    
                w = Word(wordid, word.text, sentid, paraid, term)
                wordid2meta[wordid] = w
                sentences[sentid].append(wordid)
                paragraphs[paraid].append(sentid)

            return wordid2meta, termid2sense, sentences, paragraphs
Example #6
def parseFiles(annotationsPath):
  objectList = []
  # Retrieves all the files in a directory and checks if they are xml
  annotationsFullPath = os.path.abspath(annotationsPath)
  fileList = os.listdir(annotationsFullPath)
  
  if len(fileList) > 0:
    lastFile = ''
    for file in fileList:
        fileTypeMatch = re.search('.xml',file)
        if fileTypeMatch:
            print "Processing file: " + file
            try:
                filePath = os.path.join(annotationsFullPath, file)
                f = open(filePath)
                soup = bsoup(f)
                f.close()
                # Finds the object of all xml files and places the objects into a list
                # and returns it.
                parsedXML = (soup.findAll('name'))
                for object in parsedXML:
                    match = re.search('(<name>)(\w+)(</name>)', str(object))
                    objectList += match.group(2),
            except IOError:
                sys.stderr.write("There was a problem with file: " + file + '\n')
  else:
    sys.stderr.write("Error - No xml files found.")
    sys.exit(1)
  return objectList
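The regular expression over str(object) above re-parses markup that BeautifulSoup has already parsed. A sketch of the same extraction using the tag's own text instead (assumes bs4 and the same <name> elements):

# Sketch: collect the text of every <name> tag directly, without a regex.
from bs4 import BeautifulSoup

def object_names(xml_text):
    soup = BeautifulSoup(xml_text, 'html.parser')
    return [name.get_text(strip=True) for name in soup.find_all('name')]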
Example #7
def get_salt_version(url, args, prev_branch=False):
    if 'windows' not in url and 'osx' not in url and prev_branch:
        url = url.replace(url.split('/')[-1], args)
    get_url = requests.get(url)
    ret_code = get_url.status_code
    if ret_code != 200:
        print('Attempt to query url failed with http error code: {0}'.format(
            ret_code))
        sys.exit(1)
    html = get_url.content
    parse_html = bsoup(html)
    pkg_name = 'salt-master'
    if 'osx' in url:
        try:
            pkg_name = 'salt-{0}'.format(args.branch)
        except AttributeError:
            pkg_name = 'salt-{0}'.format(args)
    if 'windows' in url:
        try:
            pkg_name = 'Salt-Minion-{0}'.format(args.branch)
        except AttributeError:
            pkg_name = 'Salt-Minion-{0}'.format(args)

    for tag in parse_html.findAll(attrs={'href': re.compile(pkg_name + ".*")}):
        match = re.search("([0-9]{1,4}\.)([0-9]{1,2}\.)([0-9]{1,2})", str(tag))
        salt_ver = (match.group(0))
    return salt_ver
Example #8
def parse_html_method(tab_os, os_v, args):
    '''
    Parse the index.html for install commands
    '''
    # Get and Parse url variables
    if args.branch != LATEST:
        url = 'https://repo.saltstack.com/staging/{0}.html'.format(args.branch)
    else:
        url = 'https://repo.saltstack.com/staging/index.html'.format()

    get_url = requests.get(url)
    html = get_url.content
    parse_html = bsoup(html)

    os_instruction = []

    for tag in parse_html.findAll(attrs={'id' : tab_os}):
        # grab all instructions for a specific os and release
        # for example grab debian7 for latest release
        for tab_os_v in tag.findAll(attrs={'class': re.compile(os_v + ".*")}):
            for cmd in tab_os_v.findAll(attrs={'class': 'language-bash'}):
                if cmd not in os_instruction:
                    os_instruction.append(cmd)
            for cmd_2 in tab_os_v.findAll(attrs={'class': 'language-ini'}):
                if cmd_2 not in os_instruction:
                    os_instruction.append(cmd_2)
        # get all instructions that run on both versions of each os_family
        for cmd_all in tag.findAll('code', attrs={'class': None}):
            if cmd_all not in os_instruction:
                os_instruction.append(cmd_all)
    return os_instruction
Example #9
def parseFiles(annotationsPath,objectType):
    tagList = []
    # Retrieves all the files in a directory and checks if they are xml
    annotationsFullPath = os.path.abspath(annotationsPath)
    fileList = os.listdir(annotationsFullPath)
    for file in fileList:
        fileTypeMatch = re.search('.xml',file)
        if fileTypeMatch:
            print "Processing file: " + file
            try:
                filePath = os.path.join(annotationsFullPath, file)
                f = open(filePath)
                soup = bsoup(f)
                f.close()
                # Finds the object of all xml files and places the objects into a list
                # and returns it.
                parsedXML = (soup.findAll('name'))
                if objectType == 'all':
                    for object in parsedXML:
                        tagList.append(addToTagList(soup))
                elif objectType in ('car','person','bicycle'):
                    for object in parsedXML:
                        match = re.search('(<name>)(\w+)(</name>)', str(object))
                        if match.group(2) == objectType:
                            tagList.append(addToTagList(soup))
            except IOError:
                sys.stderr.write('There was a problem with file: ' + file + '\n')
    return tagList
Example #10
 def test_instances(self):
     """
     Returns the test instances from SemEval2007 Coarse-grain WSD task.
     
     >>> coarse_wsd = SemEval2007_Coarse_WSD()
     >>> inst2ans = coarse_wsd.get_answers()
     >>> for inst in inst2ans:
     ...    print inst, inst2ans[inst]
     ...    break
     d004.s073.t013 answer(sensekey=[u'pointer%1:06:01::', u'pointer%1:06:00::', u'pointer%1:10:00::'], lemma=u'pointer', pos=u'n')
     """
     Instance = namedtuple('instance', 'id, lemma, word')
     test_file = io.open(self.test_file, 'r').read()
     inst2ans = self.get_answers()
     
     for text in bsoup(test_file).findAll('text'):
         textid = text['id']
         document = " ".join([remove_tags(i) for i in str(text).split('\n') 
                              if remove_tags(i)])
         for sent in text.findAll('sentence'):
             sentence =  " ".join([remove_tags(i) for i in 
                                   str(sent).split('\n') if remove_tags(i)])
             for instance in sent.findAll('instance'):
                 instid = instance['id']
                 lemma = instance['lemma']
                 word = instance.text
                 inst = Instance(instid, lemma, word)
                 yield inst, inst2ans[instid], unicode(sentence), unicode(document)
Example #11
def parseWikiContent(text):
    soup = bsoup(text)
    # check exists
    noarticle = soup('div', {"class": "noarticletext"})
    if len(noarticle) != 0:
        print 'not exist!!!'
        return None
    pSet = soup('div',
                {'id': 'mw-content-text'})[0].findChildren('p',
                                                           recursive=False)
    loops = 3
    contents = ''
    for p in pSet:
        if loops == 0:
            break
        #print p
        content = p.getText()
        #print content
        if len(content) >= 4 and content[0:6].find(u'坐标') == -1:
            content = filterInvalidChar(pattern, content)
            contents += content.encode('utf-8') + '\n'
        loops -= 1
    if len(contents) > 0:
        return contents
    else:
        return None
Example #12
def parseFiles(annotationsPath,objectType):
    orientationDict = {'car':[0,0,0,0,0],'person':[0,0,0,0,0],'bicycle':[0,0,0,0,0]}
    tempType=''
    # Creates two parallel lists: the object at each position in one list corresponds to the orientation at the same position in the other.
    parsedObjectXMLList = []
    parsedOrientationXMLList = []
    # Retrieves all the files in a directory and checks if they are xml
    fileList = os.listdir(annotationsPath)
    annotationsFullPath = os.path.abspath(annotationsPath)
    for file in fileList:
        fileTypeMatch = re.search('.xml',file)
        if fileTypeMatch:
            print "Processing file: " + file
            try:
                filePath = os.path.join(annotationsFullPath, file)
                f = open(filePath)
                soup = bsoup(f)
                f.close()
                # Finds the object of all xml files and places the objects into a list
                # and returns it.
                parsedObjectXML = (soup.findAll('name'))
                parsedOrientationXML = soup.pose.string

                for object in parsedObjectXML:
                    match = re.search('(<name>)(\w+)(</name>)', str(object))
                    object = match.group(2)
                    if objectType == 'all':
                        parsedObjectXMLList += object,
                        parsedOrientationXMLList += str(parsedOrientationXML),
                    elif objectType == 'car' and object == 'car':
                        parsedObjectXMLList += object,
                        parsedOrientationXMLList += str(parsedOrientationXML),
                    elif objectType == 'person' and object == 'person':
                        parsedObjectXMLList += object,
                        parsedOrientationXMLList += str(parsedOrientationXML),
                    elif objectType == 'bicycle' and object == 'bicycle':
                        parsedObjectXMLList += object,
                        parsedOrientationXMLList += str(parsedOrientationXML),
            except IOError:
                sys.stderr.write('There was a problem with file: ' + file + '\n')

    for x in range (0,len(parsedObjectXMLList)):
        if objectType == 'all':
            tempType = objectType
            objectType = parsedObjectXMLList[x]
        if parsedObjectXMLList[x] == objectType:
            if parsedOrientationXMLList[x] == 'Left':
                (orientationDict[objectType])[0]+=1
            elif parsedOrientationXMLList[x] == 'Right':
                (orientationDict[objectType])[1]+=1
            elif parsedOrientationXMLList[x] == 'Frontal':
                (orientationDict[objectType])[2]+=1
            elif parsedOrientationXMLList[x] == 'Rear':
                (orientationDict[objectType])[3]+=1
            elif parsedOrientationXMLList[x] == 'Unspecified':
                (orientationDict[objectType])[4]+=1

        if tempType == 'all':
            objectType = tempType
    return orientationDict
Example #13
    def FakeResponse(a):
        test = file("backend/test/data/inventory/test_20120310_055847_infibeam.html", "r").read()
        test_data = str(bsoup(test).fetch('ul', 'srch_result portrait')[0].fetch('li')[0])

        #monkey patching test-data to get the correct minimal test-data 
        test_data = str("<ul class='srch_result portrait'>" + test_data + "</ul>")
        return '200 OK', test_data        
Example #14
def remove_images(data_html):
    """
    Remove occurrences of images from data_html.
    Links that do not contain any text are also removed.
    """
    soup = bsoup(data_html)

    # remove all images
    for image in soup.findAll("img"):
        image.replaceWith('')
        try:
            logger.debug('removed img: %s' % (image["src"]))
        except KeyError:
            logger.debug('removed img: %s' % ("image link was not available"))

    # remove links to images or to anything, without any text
    # eg: <a href='http://link/to/some-page'></a>
    # following will be left as it is:
    # <a href='http://link/to/some-page'>some text</a>
    for a in soup.findAll('a'):
        if not a.renderContents().strip():
            a.replaceWith('')
            logger.debug('removed a tag containing: %s' % (a))

    return smart_unicode(soup.renderContents())
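In BeautifulSoup 4 the usual way to drop a tag entirely is decompose() rather than replaceWith(''). A hedged sketch of the same image and empty-link cleanup (logging omitted; assumes bs4):

# Sketch of the cleanup above using bs4's decompose().
from bs4 import BeautifulSoup

def remove_images_bs4(data_html):
    soup = BeautifulSoup(data_html, 'html.parser')
    for image in soup.find_all('img'):
        image.decompose()                 # drop <img> tags entirely
    for a in soup.find_all('a'):
        if not a.get_text(strip=True):
            a.decompose()                 # drop links that carry no text
    return str(soup)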
Example #15
def organizeImageInfo(annotationsFileList,photoFileList,annotationsFullPath,photoFullPath, classes, orientation, tags):
    size = determinePhotoSize(classes,orientation)
    imageDict = {}
    root = Tk()

    annotationsSet = Set(annotationsFileList)
    for photo in photoFileList:
        photoMatch = re.search('(2014_)(\w+)(.png)',photo)
        if photoMatch:
            xml = str(photoMatch.group(1)) + str(photoMatch.group(2)) + '.xml'
            if xml in annotationsSet:
                xmlPath = os.path.join(annotationsFullPath, xml)
                f = open(xmlPath)
                soup = bsoup(f)
                f.close()
                nameTagList,truncationTagList,occludedTagList,poseTagList,xminTagList,yminTagList,xmaxTagList,ymaxTagList = parseXML(soup)
                for name,truncation,occluded,pose,xmin,ymin,xmax,ymax in zip(nameTagList,truncationTagList,occludedTagList,poseTagList,xminTagList,yminTagList,xmaxTagList,ymaxTagList):
                    print "Processing file: " + xml
                    if classes.lower() == name[1]:
                        if orientation.lower() in (pose[1].lower(),'all'):
                            if tags.lower() == 'all' or tags.lower() == 'none' and int(truncation[1]) == 0 and int(occluded[1]) == 0 or tags.lower() == 'occluded' and int(occluded[1]) or tags.lower() == 'truncated' and int(truncation[1]) or tags.lower() == 'occluded and truncated' and int(occluded[1]) and int(truncation[1]):
                                print "Match found in: " + photo
                                photoPath = os.path.join(photoFullPath, photo)
                                image = Image.open(photoPath)
                                image = image.crop((int(xmin[1]),int(ymin[1]),int(xmax[1]),int(ymax[1])))
                                image = image.resize(size)
                                image = ImageTk.PhotoImage(image)
                                imageDict[image] = xml
    return imageDict,root,size
Example #16
    def read_file(self, filename):
        """
        Reads a single SemCor file in NLP Annotation Format (NAF).
        """
        with io.open(filename, 'r') as fin:
            xml = fin.read()
            text = bsoup(xml).find('text')
            terms = bsoup(xml).find('terms')
            
            sentences = defaultdict(list)
            paragraphs = defaultdict(list)
            
            # Gets the term layer.
            Term = namedtuple('term', 'id, pos, lemma, sense, type') 
            termid2sense = {}
            for term in terms.findAll('term'):
                termid = int(term.get('id')[1:])
                term_sense = None
                try:
                    sense = term.findAll('externalref')[-1].get('reference')
                    term_sense = sense[6:] if sense.startswith('eng30-') else sense
                except:
                    pass
                
                t = Term(termid, term.get('pos'), term.get('lemma'), 
                         term_sense, term.get('type'))
                termid2sense[termid] = t
            
            # Gets the text layer.
            Word = namedtuple('word', 'id, text, offset, sentid, paraid, term')
            wordid2meta = {}
            for word in text.findAll('wf'):
                wordid = int(word.get('id')[1:])
                sentid = int(word.get('sent'))
                paraid = int(word.get('para'))
                try:
                    term = termid2sense[wordid]
                except:
                    term = None
                w = Word(wordid, word.text, word.get('offset'), 
                         sentid, paraid, term)
                wordid2meta[wordid] = w
                sentences[sentid].append(wordid)
                paragraphs[paraid].append(sentid)

            return wordid2meta, termid2sense, sentences, paragraphs
Example #17
File: dbo.py Project: ks07/bukget
def _get_page(url, delay=2):
    conf = config.Configuration()
    global _timer
    while (datetime.datetime.now() - _timer).seconds < delay:
        time.sleep(.25)
    _timer = datetime.datetime.now()
    if conf.debug:
        print 'Fetching: %s' % url
    return bsoup(urlopen(url).read())
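A standalone sketch of the same polite-delay idea without the module-level globals, keeping the time of the last fetch on an object instead (assumes bs4; urllib here stands in for the urlopen used above):

# Sketch: throttle fetches so at least `delay` seconds pass between requests.
import time
import urllib.request
from bs4 import BeautifulSoup

class PageFetcher(object):
    def __init__(self, delay=2):
        self.delay = delay
        self.last_fetch = 0.0

    def get(self, url):
        wait = self.delay - (time.time() - self.last_fetch)
        if wait > 0:
            time.sleep(wait)
        self.last_fetch = time.time()
        return BeautifulSoup(urllib.request.urlopen(url).read(), 'html.parser')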
Example #18
def get_salt_version(url):
    get_url = requests.get(url)
    html = get_url.content
    parse_html = bsoup(html)

    for tag in parse_html.findAll(
            attrs={'href': re.compile('salt-master' + ".*")}):
        match = re.search("([0-9]{1,4}\.)([0-9]{1,2}\.)([0-9]{1,2})", str(tag))
        salt_ver = (match.group(0))
    return salt_ver
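Note that salt_ver above is only assigned inside the loop, so the function raises a NameError when no link matches, and a later match silently overwrites an earlier one. A hedged sketch with an explicit guard (returns the first match instead of the last; assumes requests and bs4):

# Sketch: same version scrape with a guard for "no matching link found".
import re
import requests
from bs4 import BeautifulSoup

def get_salt_version(url, pkg_name='salt-master'):
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    version_re = re.compile(r'\d{1,4}\.\d{1,2}\.\d{1,2}')
    for tag in soup.find_all(href=re.compile(pkg_name + '.*')):
        match = version_re.search(str(tag))
        if match:
            return match.group(0)   # first matching href wins
    return None                     # nothing matched, unlike the original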
Example #19
def collectObjectArea(annotationsPath,objectClass,noTruncated,noOcclusion):
    areaList = []
    truncationArea = []
    occlusionArea = []
    # Retrieves all the files in a directory and checks if they are xml
    annotationsFullPath = os.path.abspath(annotationsPath)
    fileList = os.listdir(annotationsFullPath)
    for file in fileList:
        fileTypeMatch = re.search('.xml',file)
        if fileTypeMatch:
            print "Processing file: " + file
            try:
                filePath = os.path.join(annotationsFullPath, file)
                f = open(filePath)
                soup = bsoup(f)
                f.close()
                parsedXML = (soup.findAll('name'))
                
                # Finds the object of all xml files and checks if it is a part of objectClass.
                if objectClass == 'all':
                    for photo in parsedXML:
                        for object in photo:
                            truncatedMatch = int(soup.truncated.string)
                            occlusionMatch = int(soup.occluded.string)
                            if not truncatedMatch and not occlusionMatch:
                                calculateArea(soup,areaList)
                            if truncatedMatch and not occlusionMatch:
                                calculateArea(soup,truncationArea)
                            if occlusionMatch and not truncatedMatch:
                                calculateArea(soup,occlusionArea)
                            if occlusionMatch and truncatedMatch:
                                calculateArea(soup,occlusionArea)
                else:
                    for photo in parsedXML:
                        for object in photo:
                            truncatedMatch = int(soup.truncated.string)
                            occlusionMatch = int(soup.occluded.string)
                            # For all objects of the type that the user specifies, area is
                            # calculated and added to a list.
                            if object == objectClass:
                                if not truncatedMatch and not occlusionMatch:
                                    calculateArea(soup,areaList)
                                if truncatedMatch and not occlusionMatch:
                                    calculateArea(soup,truncationArea)
                                if occlusionMatch and not truncatedMatch:
                                    calculateArea(soup,occlusionArea)
                                if occlusionMatch and truncatedMatch:
                                    calculateArea(soup,occlusionArea)
            except IOError:
                sys.stderr.write('There was a problem with file: ' + file +'\n')
    if noTruncated is False:
        areaList += truncationArea
    if noOcclusion is False:
        areaList += occlusionArea
    return areaList
Example #20
 def order(self,data1):
     s = requests.Session()
     url = "http://desktop.nju.edu.cn:8080/jiaowu/login.do"
     r1 = s.post(url,data=data1,headers=headers)
     soup = bsoup(r1.text)
     name = soup.find(id='UserInfo')
     print(str(data1['password']))
     if name is not None:
         file1.write("学号:"+str(data1['userName'])+" 密码:"+str(data1['password'])+"\n")
         print("学号:"+str(data1['userName'])+" 密码:"+str(data1['password'])+"\n")
         exit(1)
Example #21
 def yield_sentences(self):
     test_file = io.open(self.test_file, 'r').read()
     inst2ans = self.get_answers()        
     for text in bsoup(test_file).findAll('text'):
         if not text:
             continue
         textid = text['id']
         context_doc = " ".join([remove_tags(i) for i in 
                                 str(text).split('\n') if remove_tags(i)])
         for sent in text.findAll('sentence'):
             context_sent =  " ".join([remove_tags(i) for i in 
                                   str(sent).split('\n') if remove_tags(i)])
             yield sent, context_sent, context_doc, inst2ans, textid
Example #23
 def _get(self, url, cookie_update=False):
     """
 General HTTP Get function.  Requires a URL.
 """
     if cookie_update:
         self.cookie = ""
     headers = {"Cookie": self.cookie}
     http = self.con(self.host)
     http.request("GET", url, headers=headers)
     resp = http.getresponse()
     page = bsoup(resp.read())
     if cookie_update:
         self._set_cookie(resp)
     return page
Example #24
def get_yahoo_xml_photo_news(data_html, headline):
    """scans the data for img tags and replaces them with  yahoo nsml standard
    newsitem photo tags 
    if the img does not exist at the src, it will be removed"""
    soup = bsoup(data_html)
    image_count = 0
    return_tag_string = ""
    for image in soup.findAll("img"):
        new_img_tag = ''
        src = ''
        try:
            # extract the elements from img tag
            if image.has_key("src"):
                src = image["src"]
                filename = src.split("/")[-1]

                alt = ""  # default to blank
                if image.has_key("alt"):
                    alt = image["alt"]
                elif image.has_key("title"):
                    alt = image["title"]

                if image.has_key("width") and image.has_key("height"):
                    width = image["width"]
                    height = image["height"]

                type, mime = get_media_mime_type(filename)

                new_img_tag = """<NewsComponent Duid="photo%s">""" % image_count + \
                """<Role FormalName="Photo"/>""" + \
                """<NewsLines><HeadLine>%s</HeadLine></NewsLines>""" % headline + \
                """<DescriptiveMetadata><Language FormalName="en"/>""" + \
                """</DescriptiveMetadata><NewsComponent>""" + \
                """<Role FormalName="Caption"/><ContentItem>""" + \
                """<MediaType FormalName="Text"/>""" + \
                """<Format FormalName="NITF"/><DataContent><nitf><body>""" + \
                """<body.content><p>%s</p></body.content>""" % alt + \
                """</body></nitf></DataContent></ContentItem></NewsComponent>""" + \
                """<NewsComponent Duid="base%s">""" %image_count + \
                """<Role FormalName="BaseImage"/>""" + \
                """<ContentItem Href="%s">""" % src + \
                """<MediaType FormalName="Photo"/>""" + \
                """<Format FormalName="JPEG Baseline"/>""" + \
                """<MimeType FormalName="%s"/>""" % mime + \
                """</ContentItem></NewsComponent></NewsComponent>"""
                image_count += 1
        except ContifyValidationError, e:
            # move on to the next image, catch the exception - log it and move on
            logger.error(e)
        finally:
Example #25
def convert_html_to_inline(html):
    """
    Parses an html document and substitutes img tags with inlined
    base64 encoded images

    Arguments:
    - `html`: An html, represented as a str object
    """
    soup = bsoup(html)

    for tag in soup.findAll('img'):
        new_tag = convert_imgtag_to_base64(unicode(tag))
        tag.replaceWith(new_tag)

    return soup
Example #26
def convert_imgtag_to_base64(tag):
    """
    Returns an image tag with the URI substituted by a base64
    representation of the resource

    Arguments:
    - `tag`: A BeautifulSoup object representing an img tag
    """
    assert isinstance(tag, basestring)
    soup = bsoup(tag)
    tag = soup.contents[0]
    img_uri = tag['src']
    mime_type, data = get_image(img_uri)
    tag['src'] = "data:%s;base64,%s" % (mime_type, data)

    return tag
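get_image() is not shown in this snippet. A hedged stand-in using only the standard library, assuming it is meant to return a (mime_type, base64_payload) pair as the call site suggests:

# Hypothetical helper in the spirit of the get_image() used above.
import base64
import mimetypes
import urllib.request

def get_image(uri):
    mime_type, _ = mimetypes.guess_type(uri)
    with urllib.request.urlopen(uri) as resp:
        data = base64.b64encode(resp.read()).decode('ascii')
    return mime_type or 'application/octet-stream', data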
Example #27
def download_users_and_parse(url):

    web_content = urllib.urlopen(url)
    html_data = web_content.read()
    web_content.close()

    data = bsoup(html_data).findAll('h2', attrs={'class':['user-leaderboard-list-name']})

    user_list = []
    for item in data:
        user_list.append(item.findAll('a')[0]['href'].replace('/', ''))

    with open('../output/trending-users.json', 'w') as output:
        json.dump(user_list, output)

    return user_list
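The same scrape written against requests and bs4 CSS selectors (a sketch; the class name comes from the snippet above and GitHub's trending-page markup may have changed since it was written):

# Sketch: trending users via requests + a CSS selector.
import json
import requests
from bs4 import BeautifulSoup

def download_users_and_parse(url, out_path='../output/trending-users.json'):
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    user_list = [a['href'].strip('/')
                 for h2 in soup.select('h2.user-leaderboard-list-name')
                 for a in h2.find_all('a', href=True)]
    with open(out_path, 'w') as output:
        json.dump(user_list, output)
    return user_list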
Example #28
 def _post(self, url, payload, cookie_update=False):
     """
 General HTTP post function.  Requires a url and a payload.
 """
     body = urllib.urlencode(payload)
     headers = {
         "Cookie": self.cookie,
         "Content-Length": len(body),
         "Content-Type": "application/x-www-form-urlencoded",
     }
     http = self.con(self.host)
     http.request("POST", url, body, headers)
     resp = http.getresponse()
     page = bsoup(resp.read())
     if cookie_update:
         self._set_cookie(resp)
     return page
Example #29
def img_to_yahoo_media(
    data_html,
    producer,
    prefix=None,
):
    """
    TODO:
    * Height and width being ignored
    * All a href to images being ignored
    * Image Caption - if image within divs is not being considered
    
    scans the data for img tags and replaces them with  yahoo nsml standard media tags
    if the img does not exist at the src, it will be removed
    <img src="filename.jpg" alt="#alt_text#" align="#alignment#" width="#w#" height="#h#"/>
    <media media-type="image" style="align:#alignment#">
        <media-reference mime-type="" source="#photo_number"/>
    </media>    
    @param data_html data in html format
    @param source source of the image, should be the publication or the account
    @param check_exists if True, will check if file exists and update the tags, else remove the instance
    @return data with img tags replace by media tags
    """
    # use this with the following template tag if required
    # {{ entry.get_body_with_yahoo_media_tags|xmlsafe|hreftext|safe }}
    # remember TODO, remove nesting of  img tags inside para tags
    soup = bsoup(data_html)
    # remove a href to images!!
    image_count = 0
    for image in soup.findAll("img"):
        new_img_tag = ''
        src = ''
        try:
            # extract the elements from img tag
            if image.has_key("src"):
                src = image["src"]
                filename = src.split("/")[-1]
                type, mime = get_media_mime_type(filename)
                new_img_tag = """<media style="rightSide" media-type="image">""" + \
                            """<media-reference source="#photo%s" mime-type=""/></media>""" % \
                            image_count
                image_count += 1
        except ContifyValidationError, e:
            # move on to the next image, catch the exception - log it and move on
            logger.error(e)
        finally:
Example #30
    def order(self,data1,data2):
        s = requests.Session()
        url1 = 'http://login.qyer.com/qcross/login/auth.php?action=login'
        r1 = s.post(url1,data=data1,headers=headers)
        url2 = r1.json()['data']['arr_synlogin'][2]
        r2 = s.post(url2,headers=headers)
    #     cookies = r1.cookies+r2.cookies
        print r2.cookies
        url3 = 'http://z.qyer.com/orderformconfirm'
        r3 = s.post(url3, data2,cookies=r2.cookies,headers=headers)
        print "================"
        soup = bsoup(r3.text)
        token = soup.findAll("input")[3]["value"]
        data_order['form_token'] = token
#         ordercookie = self.dealcookies(r1.cookies,r2.cookies, r3.cookies)
#         print ordercookie
        url4 = 'http://z.qyer.com/orderform'
        headers['Referer'] = 'http://z.qyer.com/orderformconfirm'
        submit = s.post(url4,data = data_order,cookies=r3.cookies,headers=headers )
Example #31
    def setUp(self):
        self.test = file("backend/test/data/inventory/test_20120310_055847_saholic.html", "r").read()
        self.test_data = str(bsoup(self.test).fetch('div', 'productItem')[0])
        self.inventory = SaholicInventory(self.test_data)

        self.attr = attribute()
        self.item = {
            self.attr.name  : u'Alcatel  OT-230D',
            self.attr.id    : md5( 'SHOLIC_Alcatel  OT-230D' ).hexdigest(),
            self.attr.url   : u'http://saholic.com/mobile-phones/alcatel-ot-230d-1001720',
            self.attr.specs : None,
            self.attr.color : None,
            self.attr.brand : None,
            self.attr.stock : None,
            self.attr.source: u'SHOLIC',
            self.attr.price : u'949',
            self.attr.image : u"http://static2.saholic.com/images/media/1001720/alcatel-ot-230d-icon-1313564847734.jpg",
            self.attr.delivery : None
            }
Example #32
 def setUp(self):        
     self.test = file("backend/test/data/inventory/test_20120310_055847_flipkart.html", "r").read()
     self.test_data = str(bsoup(self.test).fetch('div', 'fk-srch-item')[0])
     self.inventory = FlipkartInventory(self.test_data)
     
     self.attr = attribute()
     
     self.item  = {
         self.attr.name  : u'Samsung Galaxy Y S5360',
         self.attr.color : u'Grey',
         self.attr.specs : u'Android v2.3 OS, 2 MP Primary Camera, 3-inch Touchscreen, FM Radio',
         self.attr.stock : u'In Stock.',
         self.attr.price : u'7650',
         self.attr.image : u'http://img1.flixcart.com//image/mobile/4/4/4/samsung-galaxy-y-s5360-125x125-imad2pzjx3uq8paz.jpeg',
         self.attr.brand : None,
         self.attr.delivery : u'2-4 business days. Free Home Delivery.',
         self.attr.source : u'FKART',
         self.attr.url    : u'http://flipkart.com//samsung-galaxy-y-s5360-mobile-phone/p/itmd2pz2rpcg5smz/search-mobile-/1?pid=mobd2pyzfanvw444&ref=c337db2d-b97a-4b4b-9061-bf3705435edd&_l=HmmZvbFeU9Oo4NUBP6Fi6Q--&_r=t2xsnCM8eE1pqUPoLth04Q--',
         self.attr.id     : md5( 'FKART_Samsung Galaxy Y S5360' ).hexdigest()
      }
Example #33
    def setUp(self):        
        self.test = file("backend/test/data/inventory/test_20120310_055847_infibeam.html", "r").read()
        self.test_data = str(bsoup(self.test).fetch('ul', 'srch_result portrait')[0].fetch('li')[0])

        #monkey patching test-data to get the correct minimal test-data 
        self.test_data = str("<ul class='srch_result portrait'>" +  self.test_data + "</ul>")

        #inventory object.
        self.inventory = InfibeamInventory(self.test_data)

        #inventory item to be tested against.
        self.attr = attribute()
        self.item = {
            self.attr.id    : md5( 'IBEAM_Sony Ericsson XPERIA X2 (Black)' ).hexdigest(),
            self.attr.url   : "http://infibeam.com/Mobiles/i-Sony-Ericsson-XPERIA-X2-Slider/P-E-M-Sony-Ericsson-XPERIAX2.html?id=Black",
            self.attr.name  : u'Sony Ericsson XPERIA X2 (Black)',
            self.attr.color : None,
            self.attr.specs : None,
            self.attr.stock : None,
            self.attr.brand : None,
            self.attr.price : u'25767',
            self.attr.source: u'IBEAM',
            self.attr.delivery : None,
            self.attr.image : u'http://cdn-img-a.infibeam.net/img/2ffd0b46/80/22/p-e-m-sony-ericsson-xperiax2-front-1.wm.jpg?op_sharpen=1&wid=120&hei=140'}
Example #34
def parseWikiContent(text):
    soup = bsoup(text)
    # check exists
    noarticle = soup('div', {"class" : "noarticletext"})
    if len(noarticle) != 0:
        print 'not exist!!!'
        return None
    pSet = soup('div', {'id' : 'mw-content-text'})[0].findChildren('p', recursive=False)
    loops = 3
    contents = ''
    for p in pSet:
        if loops == 0:
            break
        #print p
        content = p.getText()
        #print content
        if len(content) >= 4 and content[0:6].find(u'坐标') == -1:
            content = filterInvalidChar(pattern, content)
            contents += content.encode('utf-8') + '\n'
        loops -= 1
    if len(contents) > 0:
        return contents
    else:
        return None
Example #35
def get_salt_version(url, args, prev_branch=False):
    if 'osx' not in url and prev_branch:
        url = url.replace(url.split('/')[-1], args)
    get_url = requests.get(url)
    ret_code = get_url.status_code
    if ret_code != 200:
        print('Attempt to query url failed with http error code: {0}'.format(ret_code))
        sys.exit(1)
    html = get_url.content
    parse_html = bsoup(html)
    pkg_name = 'salt-master'
    if 'osx' in url:
        try:
            pkg_name = 'salt-{0}'.format(args.branch)
        except AttributeError:
            pkg_name = 'salt-{0}'.format(args)
    if 'windows' in url:
        pkg_name = 'Salt-Minion-{0}'.format(args.branch)

    for tag in parse_html.findAll(attrs={'href': re.compile(pkg_name +
                                                           ".*")}):
        match = re.search("([0-9]{1,4}\.)([0-9]{1,2}\.)([0-9]{1,2})", str(tag))
        salt_ver = (match.group(0))
    return salt_ver
Example #36
def img_to_media(data_html, producer, prefix=None, download_to=None):
    """
    TODO:
    * Height and width being ignored
    * All a href to images being ignored
    * Image Caption - if image within divs is not being considered
    
    scans the data for img tags and replaces them with nsml standard media tags
    if the img does not exist at the src, it will be removed
    files will be downloaded if the location is specified
    <img src="filename.jpg" alt="#alt_text#" align="#alignment#" width="#w#" height="#h#"/>
    <media media-type="image" style="align:#alignment#">
        <media-reference mime-type="image/jpeg" source="filename.jpg" alternate-text="#alt-text#" height="#h#" width="#w#"></media-reference>
        <media-caption>
            #caption#
        </media-caption>
        <media-producer>
            source
        </media-producer>
    </media>    
    @param data_html data in html format
    @param source source of the image, should be the publication or the account
    @param check_exists if True, will check if file exists and update the tags, else remove the instance
    @param download_to location where the files needs to be downloaded, optional
    @return data with img tags replace by media tags
    """
    soup = bsoup(data_html)
    src_list = []
    # remove a href to images!!
    for image in soup.findAll("img"):
        new_img_tag = ''
        src = ''
        try:
            # extract the elements from img tag
            if image.has_key("src"):
                src = image["src"]

                filename = src.split("/")[-1]
                if prefix:
                    filename = prefix + "_" + filename

                alt = ""  # default to blank
                if image.has_key("alt"):
                    alt = image["alt"]

                align = 'align:right'  # default to right
                if image.has_key("align"):
                    align = image["align"]

                if image.has_key("width") and image.has_key("height"):
                    width = image["width"]
                    height = image["height"]

                    new_img_tag = get_media_tags(filename,
                                                 producer,
                                                 alt,
                                                 height=height,
                                                 width=width,
                                                 align=align)
                else:
                    new_img_tag = get_media_tags(filename, producer, alt,
                                                 align)

                # ignore height and width for now
                new_img_tag = get_media_tags(filename, producer, alt, align)
            else:
                # error - src has to be there!!
                raise ContifyValidationError("Image src missing, img tag: %s" %
                                             (image))
        except ContifyValidationError, e:
            # move on to the next image, catch the exception - log it and move on
            logger.error(e)
        finally:
Example #37
def rd_parse_post(entry):
    blogger_id = entry.id
    created = entry.published.split('.')[:1][0].replace('T', ' ')
    updated = entry.updated.split('.')[:1][0].replace('T', ' ')

    link = entry.link  #[-1]
    url = link.replace('http://rugbydump.blogspot.com', '')
    title = entry.title.encode('ASCII', 'ignore')

    content = entry.summary
    content = renode.sub(node, content).encode('ASCII', 'ignore')

    # Fix up content a bit
    xcontent = bsoup(content)
    img = xcontent.img
    src = img['src'].split('/')[-1]
    img['src'] = '/media/posts/' + src
    img['alt'] = title

    del (img['border'])
    del (img['style'])
    del (img['id'])

    # Put a centered paragraph around the image
    np = Tag(xcontent, 'p', [('style', 'text-align: center;')])
    np.insert(0, img)

    try:
        xcontent.a.replaceWith(
            np)  # Takes away the link around the first image
    except:
        xcontent.insert(
            0, np
        )  # Lol that was pretty important (just inserts it and the blank link will remain unfortunately)

    # Remove the last div
    xcontent.findAll('div', attrs={'class':
                                   'blogger-post-footer'})[0].extract()

    try:
        blurb = xcontent.span.contents[0]
    except:
        blurb = ''

    content = xcontent.prettify()

    try:
        numcomments = entry.thr_total
    except AttributeError:
        numcomments = 0

    try:
        return {
            'src': src,
            'created': created,
            'updated': updated,
            'url': url,
            'numcomments': numcomments,
            'blogger_id': blogger_id,
            'title': title,
            'blurb': blurb,
            'content': content,
        }
    except UnicodeDecodeError:
        print "Skipping post \"%s\".." % title
        return
Example #38
        'medium': common.sesame_base_url + video['thumbnailLarge'],
        'large': common.sesame_base_url + video['poster']
      },
      'cast': str(video['character']).split(';'),
      'width': float(video['width']),
      'height': float(video['height']),
    }
    menu.addVideoItem(item, video)
    items.append(item)
  settings.setobj('temp video list', items)

ok = True
page = common.args.get('page', None)
if page == 'topics':
  html = utils.getHTML('videos')
  lis = bsoup(html).find('select', {'class':re.compile("filter-topic")}).findAll('option')
  for item in lis:
    if item['value'] == '':
      continue
    menu.addFolderItem(item.string, {'page':'list_vids','reset':1,'topic':int(item['value'])})

elif page == 'recent':
  videos = fetch_vids(reset=True)
  list_vids(videos)
  menu.moreVideosBtn()

elif page == 'muppets':
  # get JSON-formatted names
  html = utils.getHTML('ump-portlet/js/sw/sw.ump.js')
  match = re.findall("muppets\s+:\s+\[([\s\"a-zA-Z\|\,]+)\]", html)
  match = re.findall("\"([a-zA-Z\s\|]+)\"", match[0])
Example #39
def ssdut_news_parse(raw):
    ''' parse the raw page src,

    store all result in a Storage object.
    all strings are unicode

    result.soup
        BeautifulSoup object
    result.raw
        raw page src
    result.hash
        sha1 hash of the page
    result.title
        title
    result.source
        来源
    result.date_str - date in string
    result.date - date object
    result.body
        html src of the news body
    result.clean_body
        unescaped src of the news body,
    result.publisher
        发表人
    '''
    soup = bsoup(raw)
    result = Storage()

    # raw page / hash
    result.raw = raw
    result.soup = soup

    # title
    s = soup.find(attrs={'class': re_compile('title')})
    result.title = s.text

    # source
    text = soup.find(text=re_compile(r"^http://ssdut.dlut.edu.cn"))
    r = re_compile(ur"(\d+-\d+-\d+)\u3000\u3000\u6765\u6e90:(.+)\u5173\u6ce8:")
    res = r.findall(text)[0]
    result.source = res[1].rstrip()

    # date
    result.date_str = res[0]
    result.date = datetime.date(*[int(n) for n in result.date_str.split('-')])

    # content (body)
    c = soup.find(attrs={'class': re_compile('content')})
    result.body = unicode(c)

    # content (body)  unescaped
    texts = c.findAll(text=True)
    all_texts = '\n'.join(texts)
    result.clean_body = html_parser.unescape(all_texts)

    # publisher (can be found at the bottom of the page)
    s = soup.find(
        attrs={
            "style": "font-size:14px;float:left;text-align:right;width:80%"
        })
    r = re_compile(ur"\u53d1\u8868\u4eba\uff1a(.+)")
    #logging.debug("publisher string = %r " % s)

    try:
        name = r.findall(s.text)[0]
    except:
        logging.warn(" %s has no publisher " % result.title)
        name = ""  # no publisher: like this: index.php/News/8692.html
    result.publisher = name.rstrip().lstrip()

    # use utf-8 encoding
    for k in ['title', 'source', 'body', 'clean_body', 'publisher']:
        result[k] = result[k].encode('utf-8')

    hash_src = result.body + result.title + result.publisher
    if isinstance(hash_src, str):
        hash_src = unicode(hash_src, "utf-8", "ignore")
    elif isinstance(hash_src, unicode):
        pass
    else:
        pass
    result.sha1 = sha1(hash_src.encode("utf-8")).hexdigest()
    result.search_text = ''.join([
        result.title, result.source, result.clean_body, result.publisher,
        result.sha1
    ])
    return result
Example #40
def ssdut_news_list(page_raw):
    ''' parse the news_list page,
    get a list of news, in the same sequence as the page,

    result.soup
          .page_no
          .news_list
          .total_records
    '''
    result = Storage()
    soup = bsoup(page_raw)
    result.soup = soup

    # get current page number
    r = soup.find(text=ur"\u4e0b\u4e00\u9875")  # text=u"下一页"
    if r:
        '''not the last page'''
        next_page_link = r.parent.attrs[0][1]
        #logging.debug("r.parent.attrs = %r" % r.parent.attrs)
        r = re_compile(r'/p/(\d+)')
        page_no = r.search(next_page_link).group(1)
        page_no = int(page_no)  # - 1
    else:
        ''' the last page'''
        r = soup.find(text=ur'\u4e0a\u4e00\u9875')
        prev_page_link = r.parent.attrs[0][1]
        #logging.debug("r.parent.attrs = %r" % r.parent.attrs)
        r = re_compile(r'/p/(\d+)')
        page_no = r.search(prev_page_link).group(1)
        page_no = int(page_no)  # + 1
    result.page_no = page_no

    # get the news list
    res = soup.findAll(attrs={"bgcolor": "#EEEEEE"})
    news_list = []
    counter = 1
    for r in res:
        a = r.findChildren("a")
        date_str = r.find(text=re_compile("\d{4}-\d{2}-\d{2}")).encode("utf-8")
        news_list.append({
            "link":
            a[0].get("href").encode("utf-8"),
            "title":
            a[0].text.encode("utf-8"),
            "source":
            a[1].text.encode("utf-8"),
            "source_link":
            a[1].get("href").encode("utf-8"),
            "date_str":
            date_str,
            "date":
            datetime.date(*[int(n) for n in date_str.split("-")]),
            "no":
            counter,
        })
        counter += 1
        #logging.debug("source = %s, source_link = %s" %
        #              (news_list[-1]['source'], news_list[-1]['source_link']))
    result.news_list = news_list

    # total news count
    # 共\d+ 条记录
    s = soup.find(text=re_compile(ur"\u5171\d+ \u6761\u8bb0\u5f55"))
    r = re_compile(ur"\u5171(\d+)")
    result.total_records = int(r.search(s).group(1))

    return result
Example #41
    "HEALTH_AND_FITNESS", "HOUSE_AND_HOME", "LIFESTYLE", "MAPS_AND_NAVIGATION",
    "MEDICAL", "MUSIC_AND_AUDIO", "NEWS_AND_MAGAZINES", "PARENTING",
    "PERSONALIZATION", "PHOTOGRAPHY", "PRODUCTIVITY", "SHOPPING", "SOCIAL",
    "SPORTS", "TOOLS", "TRAVEL_AND_LOCAL", "VIDEO_PLAYERS", "WEATHER",
    "LIBRARIES_AND_DEMO", "GAME_ARCADE", "GAME_PUZZLE", "GAME_CARD",
    "GAME_CASUAL", "GAME_RACING", "GAME_SPORTS", "GAME_ACTION",
    "GAME_ADVENTURE", "GAME_BOARD", "GAME_CASINO", "GAME_EDUCATIONAL",
    "GAME_MUSIC", "GAME_ROLE_PLAYING", "GAME_SIMULATION", "GAME_STRATEGY",
    "GAME_TRIVIA", "GAME_WORD", "ANDROID_WEAR"
]
for i in range(0, len(category)):
    url = "https://play.google.com/store/apps/category/" + category[
        i] + "/collection/topselling_free?hl=ja"

    html = requests.get(url).text
    soup = bsoup(html)

    urlslist = soup.findAll("a", {"class": "card-click-target"})
    urls = []
    #open the file to keep the list, as required
    filename = url[44:-33] + ".txt"
    fo = open(filename, 'w')

    #Url list
    for a in urlslist:
        link = "https://play.google.com" + a['href']
        #print(link)
        urls.append(link)
    url = urls[::4]
    for item in url:
        item = item[46:]  #list as package name
Example #42
def openSoup(x):
    """Opens URL and create soup"""
    return bsoup(urllib2.urlopen(x).read())
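For reference, the Python 3 / BeautifulSoup 4 equivalent of this one-liner (bs4 wants an explicit parser name, and urllib2 became urllib.request):

# Python 3 / bs4 equivalent of openSoup above.
import urllib.request
from bs4 import BeautifulSoup

def open_soup(url):
    """Opens URL and creates soup."""
    with urllib.request.urlopen(url) as resp:
        return BeautifulSoup(resp.read(), 'html.parser')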
Example #43
 def FakeResponse(a):
     test = file("backend/test/data/inventory/test_20120310_055847_flipkart.html", "r").read()
     test_data = str(bsoup(test).fetch('div', 'fk-srch-item'))
     return '200 OK', test_data
Example #44
#coding: utf-8
import urllib2  # needed: urllib2.urlopen is called below
#import xml.etree.ElementTree as etree
from BeautifulSoup import BeautifulSoup as bsoup

f = open('links.txt','w')
links = []
#url = "http://gyokai-search.com/2nd-genre.htm"
url = "http://cooksonia.6.ql.bz/test/gyokai.html"
html = urllib2.urlopen(url).read()
soup = bsoup(html)
links = soup.findAll('a')
for link in links:
	str_ = dict(link.attrs)['href']
	f.write(str_+"\n")

f.close()

"""
for url in links:
	html = urllib2.urlopen(url).read()
	soup = bsoup(html)
	h1 = soup.find('h1', attrs={"class":"yjXL"}).contents
	text = soup.find('p', attrs={"class":"ynDetailText"}).contents
	f = open('./rss/'+str(h1[0])+'.txt','w')
	for t in text:
		f.write(str(t))
	f.close()
"""
Example #45
def ssdut_news_list(page_raw):
    ''' parse the news_list page,
    get a list of news, in the same sequence as the page,

    result.soup
          .page_no
          .news_list
          .total_records
    '''
    result = Storage()
    soup = bsoup(page_raw)
    result.soup = soup

    # get current page number
    r = soup.find(text=ur"\u4e0b\u4e00\u9875")  # text=u"下一页"
    if r:
        '''not the last page'''
        next_page_link = r.parent.attrs[0][1]
        #logging.debug("r.parent.attrs = %r" % r.parent.attrs)
        r = re_compile(r'/p/(\d+)')
        page_no = r.search(next_page_link).group(1)
        page_no = int(page_no)  # - 1
    else:
        ''' the last page'''
        r = soup.find(text=ur'\u4e0a\u4e00\u9875')
        prev_page_link = r.parent.attrs[0][1]
        #logging.debug("r.parent.attrs = %r" % r.parent.attrs)
        r = re_compile(r'/p/(\d+)')
        page_no = r.search(prev_page_link).group(1)
        page_no = int(page_no)  # + 1
    result.page_no = page_no

    # get the news list
    res = soup.findAll(attrs={"bgcolor": "#EEEEEE"})
    news_list = []
    counter = 1
    for r in res:
        a = r.findChildren("a")
        date_str = r.find(text=re_compile("\d{4}-\d{2}-\d{2}")).encode("utf-8")
        news_list.append(
            {
                "link": a[0].get("href").encode("utf-8"),
                "title": a[0].text.encode("utf-8"),
                "source": a[1].text.encode("utf-8"),
                "source_link": a[1].get("href").encode("utf-8"),
                "date_str": date_str,
                "date": datetime.date(
                    *[int(n) for n in date_str.split("-")]),
                "no": counter,
            })
        counter += 1
        #logging.debug("source = %s, source_link = %s" %
        #              (news_list[-1]['source'], news_list[-1]['source_link']))
    result.news_list = news_list

    # total news count
    # 共\d+ 条记录
    s = soup.find(text=re_compile(ur"\u5171\d+ \u6761\u8bb0\u5f55"))
    r = re_compile(ur"\u5171(\d+)")
    result.total_records = int(r.search(s).group(1))

    return result
Example #46
    u = urllib.urlopen(link)
    torrent = u.read()
    f = file(path, 'wb')
    f.write(torrent)
    f.close()
    return True


def list_shows():
    result = []
    try:
        inputFP = urllib.urlopen(BASE_URL)
    except Exception, e:
        print e
        return result
    for option in bsoup(inputFP)('option'):
        optContents = ''.join(option.contents)
        optValue = option['value']
        if optValue.isnumeric():
            result.append({optContents: optValue})
    inputFP.close()
    return result


def is_in_cache(cache, key, element):
    if cache.has_key(key):
        return element in cache[key]
    return False


def store_in_cache(cache, key, element):
Example #47
def ssdut_news_parse(raw):
    ''' parse the raw page src,

    store all result in a Storage object.
    all strings are unicode

    result.soup
        BeautifulSoup object
    result.raw
        raw page src
    result.hash
        sha1 hash of the page
    result.title
        title
    result.source
        来源
    result.date_str - date in string
    result.date - date object
    result.body
        html src of the news body
    result.clean_body
        unescaped src of the news body,
    result.publisher
        发表人
    '''
    soup = bsoup(raw)
    result = Storage()

    # raw page / hash
    result.raw = raw
    result.soup = soup

    # title
    s = soup.find(attrs={'class': re_compile('title')})
    result.title = s.text

    # source
    text = soup.find(text=re_compile(r"^http://ssdut.dlut.edu.cn"))
    r = re_compile(
        ur"(\d+-\d+-\d+)\u3000\u3000\u6765\u6e90:(.+)\u5173\u6ce8:")
    res = r.findall(text)[0]
    result.source = res[1].rstrip()

    # date
    result.date_str = res[0]
    result.date = datetime.date(*[int(n) for n in result.date_str.split('-')])

    # content (body)
    c = soup.find(attrs={'class': re_compile('content')})
    result.body = unicode(c)

    # content (body)  unescaped
    texts = c.findAll(text=True)
    all_texts = '\n'.join(texts)
    result.clean_body = html_parser.unescape(all_texts)

    # publisher (can be found at the bottom of the page)
    s = soup.find(
        attrs={
            "style": "font-size:14px;float:left;text-align:right;width:80%"
        })
    r = re_compile(ur"\u53d1\u8868\u4eba\uff1a(.+)")
    #logging.debug("publisher string = %r " % s)

    try:
        name = r.findall(s.text)[0]
    except:
        logging.warn(" %s has no publisher " % result.title)
        name = ""  # no publisher: like this: index.php/News/8692.html
    result.publisher = name.rstrip().lstrip()

    # use utf-8 encoding
    for k in ['title', 'source', 'body', 'clean_body', 'publisher']:
        result[k] = result[k].encode('utf-8')


    hash_src = result.body + result.title + result.publisher
    if isinstance(hash_src, str):
        hash_src = unicode(hash_src, "utf-8", "ignore")
    elif isinstance(hash_src, unicode):
        pass
    else:
        pass
    result.sha1 = sha1(hash_src.encode("utf-8")).hexdigest()
    result.search_text = ''.join([result.title, result.source,
                                  result.clean_body, result.publisher,
                                  result.sha1])
    return result